In [564]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from hazm import Lemmatizer


## Reading Dataset

In [565]:
df_test = pd.read_csv("./data/test.csv")
df_train = pd.read_csv("./data/train.csv")


In [566]:
df_merged = pd.concat([df_train,df_test], ignore_index=True,sort=False)


In [567]:

def convDescription(df):
    vectorizer = TfidfVectorizer(max_features=2)  
    tfidf_matrix = vectorizer.fit_transform(df['description'])

    svd = TruncatedSVD(n_components=2)  
    reduced_features = svd.fit_transform(tfidf_matrix)


    df['description'] = reduced_features
    return df

In [568]:
def convDescription2(df):
    df['description'].fillna("null",inplace=True)
    df['description'].fillna("null",inplace=True)

    lemmatizer = Lemmatizer()


    def lemmatize_text(text):
        return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    df['description'] = df['description'].apply(lemmatize_text)

    vectorizer = TfidfVectorizer(min_df=0.005,max_df=0.025)
    tfidf_matrix = vectorizer.fit_transform(list(df['description']))

    # svd = TruncatedSVD(n_components=1)
    # reduced_features = svd.fit_transform(tfidf_matrix)
    # print(reduced_features.shape)
    # df['description'] = reduced_features

    feature_names = vectorizer.get_feature_names_out()

    df2 = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

    df = pd.concat([df,df2],axis=1)
    return df , tfidf_matrix

In [569]:
df_merged,a = convDescription2(df_merged)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['description'].fillna("null",inplace=True)


In [570]:
list(df_merged.columns)

['title',
 'year',
 'mileage',
 'transmission',
 'fuel',
 'body_color',
 'inside_color',
 'body_status',
 'description',
 'body_type',
 'volume',
 'engine',
 'acceleration',
 'price',
 '10',
 '11',
 '12',
 'esp',
 'آخر',
 'آفتاب',
 'آلومینیوم',
 'آماده',
 'آمریکایی',
 'آپشن',
 'آینه',
 'ابتدا',
 'اتاق',
 'اتو',
 'ارتفاع',
 'اردیبهشت',
 'اس',
 'اساسی',
 'استارت',
 'استثنایی',
 'اسفند',
 'اسپرت',
 'اصلا',
 'اصلی',
 'اضافه',
 'اطلاعات',
 'افتاب',
 'اقساط',
 'الی',
 'ام',
 'اماده',
 'انتقال',
 'انداخت',
 'انداز',
 'اندازه',
 'اندروید',
 'اپشن',
 'اگه',
 'ایران',
 'ایربگ',
 'این',
 'اینه',
 'اینچ',
 'باتری',
 'باد',
 'بار',
 'باران',
 'باربند',
 'باز',
 'باسلام',
 'باشد',
 'بالا',
 'بالای',
 'باند',
 'بخاری',
 'بخاطر',
 'بدلیل',
 'بدونه',
 'بر',
 'برد',
 'برگه',
 'بزرگ',
 'بست',
 'بشرط',
 'بعد',
 'بلوتوث',
 'بند',
 'بنزین',
 'بهترین',
 'بهمن',
 'بود',
 'بگیره',
 'بگیرید',
 'بین',
 'تاشو',
 'تایر',
 'تحویل',
 'تر',
 'ترمز',
 'ترک',
 'ترین',
 'تشکر',
 'تصویر',
 'تنظیم',
 'تو',
 'تودوزی',
 'تو

In [571]:
df_train = df_merged[df_merged['price'].notna()].reset_index(drop=True)
df_test = df_merged[df_merged['price'].isna()].reset_index(drop=True)
df_test.drop('price',axis=1,inplace=True)
# df_test.drop('description',axis=1,inplace=True)
# df_train.drop('description',axis=1,inplace=True)

# convert

## convert years to same scale

In [572]:
def ConvYear(df):
    df.year = df.year.map(lambda x: x if x<1500 else x-621)
    return df

In [573]:
df_train = ConvYear(df_train)
df_test = ConvYear(df_test)


## Mapping mileage columns to int

In [574]:
def mapMil(x):
    if (isinstance(x,str) and 'km' in x):
        lenOfSentence = len(x)
        km = lenOfSentence-3
        return int(x[:km].replace(",",""))
    else:
        return None
def convMil(df):
    df['mileage'] = df['mileage'].map(mapMil)
    return df

In [575]:
df_train = convMil(df_train)
df_test = convMil(df_test)


In [576]:
df_test['mileage'].describe()

count      3629.000000
mean     112184.834390
std       91541.341537
min          13.000000
25%       37784.000000
50%       91000.000000
75%      168000.000000
max      425000.000000
Name: mileage, dtype: float64

## map price to int

In [577]:
def convPrice(df):
    df['price'] = df['price'].map(lambda x : int(x.replace(",","")))
    return df

In [578]:
df_train = convPrice(df_train)


## Mapping acceleration columns to float

In [579]:
def mapAcc(x):
    if (isinstance(x,str) and 'ثانیه' in x):
        lenOfSentence = len(x)
        km = lenOfSentence-5
        return float(x[:km].replace("/","."))
    else:
        return None

def convAcc(df):
    df['acceleration'] = df['acceleration'].map(mapAcc)
    return df

In [580]:
df_train = convAcc(df_train)
df_test = convAcc(df_test)


## Mapping Volume columns to float

In [581]:
def mapVol(x):
    if (isinstance(x,str) and 'لیتر' in x):
        lenOfSentence = len(x)
        km = lenOfSentence-5
        return float(x[:km].replace("/","."))
    else:
        return None
def convVol(df):
    df['volume'] = df['volume'].map(mapVol)
    return df

In [582]:
df_train = convVol(df_train)
df_test = convVol(df_test)


## Mapping Fuel columns to float

In [583]:
def mapToFloat(x):
    if(pd.isna(x)):
        return None
    elif (len(x) > 5):
        lenOfSentence = len(x)
        liter = lenOfSentence-19
        return float(x[:liter])

def convFuel(df):
    df['fuel'] = df['fuel'].map(mapToFloat)
    return df

In [584]:
df_train = convFuel(df_train)
df_test = convFuel(df_test)


## filling mil

In [585]:
def fillingMil(df):
    df['mileage'] = df.apply(lambda x :(1403.3-x.year)*15000 if pd.isna(x['mileage']) else x.mileage,axis=1)
    return df


In [586]:
df_train = fillingMil(df_train)
df_test = fillingMil(df_test)


## filling cars

In [587]:
def fillCars(df):
    ## پراید صندوق دار
    target = df['title'] == "پراید، صندوق دار"
    df.loc[target,'fuel'] = df.loc[target,'fuel'].fillna(7.5)
    df.loc[target,'volume'] = df.loc[target,'volume'].fillna(1.3)
    df.loc[target,'engine'] = df.loc[target,'engine'].fillna("4 سیلندر یورو2 بهینه (M13)")
    df.loc[target,'acceleration'] = df.loc[target,'acceleration'].fillna(15)

    ## پژو ۴۰۵

    target = (df['title'] == 'پژو، 405')&(df['year']<=1390)
    df.loc[target,'fuel'] = df.loc[target,'fuel'].fillna(9.0)
    df.loc[target,'volume'] = df.loc[target,'volume'].fillna(1.8)
    df.loc[target,'engine'] = df.loc[target,'engine'].fillna("4 سیلندر L3")
    df.loc[target,'acceleration'] = df.loc[target,'acceleration'].fillna(11.0)

    df.loc[df['title'] == "پژو، 405",'engine'] = df.loc[df['title'] == "پژو، 405",'engine'].fillna("test")
    df.loc[df['engine']=='test','engine'] = '4 سیلندر L3'
    df.loc[df['engine']=='4 سیلندر L3','acceleration'] = 13.2
    df.loc[df['engine']=='4 سیلندر TU5','acceleration'] = 11

    target = (df['title'] ==  'پژو، 405')
    df.loc[target,'fuel'] = df.loc[target,'fuel'].fillna(9)
    df.loc[target,'volume'] = df.loc[target,'volume'].fillna(1.8)

    # 'سمند، X7'
    target = df['title'] == 'سمند، X7'
    df.loc[target,'fuel'] = df.loc[target,'fuel'].fillna(8.5)
    df.loc[target,'volume'] = df.loc[target,'volume'].fillna(1.8)
    df.loc[target,'engine'] = df.loc[target,'engine'].fillna("4 سیلندر")
    df.loc[target,'acceleration'] = df.loc[target,'acceleration'].fillna(14.9)

    # 'ام وی ام، X22'
    target = (df['title'] == 'ام وی ام، X22')&(df['transmission'] == 'دنده ای')
    df.loc[target,'fuel'] = df.loc[target,'fuel'].fillna(5.9)
    df.loc[target,'volume'] = df.loc[target,'volume'].fillna(1.5)
    df.loc[target,'engine'] = df.loc[target,'engine'].fillna("4 سیلندر")
    df.loc[target,'acceleration'] = df.loc[target,'acceleration'].fillna(12.0)

    target = (df['title'] == 'ام وی ام، X22')&(df['transmission'] == 'اتوماتیک')
    df.loc[target,'fuel'] = df.loc[target,'fuel'].fillna(6.1)
    df.loc[target,'volume'] = df.loc[target,'volume'].fillna(1.5)
    df.loc[target,'engine'] = df.loc[target,'engine'].fillna("4 سیلندر")
    df.loc[target,'acceleration'] = df.loc[target,'acceleration'].fillna(12.7)

    # پژو ۲۰۷
    target = (df['title'] == 'پژو، 207')&(df['transmission'] == 'اتوماتیک')
    df.loc[target,'fuel'] = df.loc[target,'fuel'].fillna(7.4)
    df.loc[target,'volume'] = df.loc[target,'volume'].fillna(1.6)
    df.loc[target,'engine'] = df.loc[target,'engine'].fillna("4 سیلندر TU5P")
    df.loc[target,'acceleration'] = df.loc[target,'acceleration'].fillna(12.6)

    # کی ام سی، T8
    target = (df['title'] == 'کی ام سی، T8')
    df.loc[target,'fuel'] = df.loc[target,'fuel'].fillna(9.8)
    df.loc[target,'acceleration'] = df.loc[target,'acceleration'].fillna(12.0)

    # جک، S5
    # دو لیتر اتوماتیک
    # یکونیم لیتر اتوماتیک نال است
    target = (df['title']=='جک، S5')&(df['volume']==1.5)
    df.loc[target,'fuel'] = df.loc[target,'fuel'].fillna(11.0)
    df.loc[target,'acceleration'] = df.loc[target,'acceleration'].fillna(7.5)

    # ام وی ام، X22 PRO
    # اتوماتیک همه یک لیتری هستندو  لیتری سوخت و شتاب خالی است
    # دنده ای همه یکونیم لیتری و فقط شتاب خالی است.
    target = (df['title']=='ام وی ام، X22 PRO')&(df['transmission']=='دنده ای')
    df.loc[target,'acceleration'] = df.loc[target,'acceleration'].fillna(14.0)

    target = (df['title']=='ام وی ام، X22 PRO')&(df['transmission']=='اتوماتیک')
    df.loc[target,'fuel'] = df.loc[target,'fuel'].fillna(4.9)
    df.loc[target,'acceleration'] = df.loc[target,'acceleration'].fillna(12.0)

    # کی ام سی، J7
    target = (df['title']=='کی ام سی، J7')
    df.loc[target,'fuel'] = df.loc[target,'fuel'].fillna(6.7)
    df.loc[target,'acceleration'] = df.loc[target,'acceleration'].fillna(6.7)

    # ام وی ام، X55 PRO
    target = (df['title']=='ام وی ام، X55 PRO')
    df.loc[target,'fuel'] = df.loc[target,'fuel'].fillna(6.5)
    df.loc[target,'acceleration'] = df.loc[target,'acceleration'].fillna(10.5)

    # فونیکس، تیگو 7 پرو
    target = (df['title']=='فونیکس، تیگو 7 پرو')
    df.loc[target,'acceleration'] = df.loc[target,'acceleration'].fillna(9.8)


    return df

In [588]:
df_test.isnull().sum()

title             0
year              0
mileage           0
transmission      0
fuel            357
               ... 
۲۰۵               0
۲۰۷               0
۷۰                0
۸۰                0
۹۰                0
Length: 349, dtype: int64

In [589]:
def fillNullWithMean(df):
    df['fuel'].fillna(df['fuel'].median(),inplace=True)
    df['volume'].fillna(df['volume'].median(),inplace=True)
    df['acceleration'].fillna(df['acceleration'].median(),inplace=True)
    df['engine'].fillna('null',inplace=True)
    return df

In [590]:
df_train[df_train['price']>6000000000]

Unnamed: 0,title,year,mileage,transmission,fuel,body_color,inside_color,body_status,description,body_type,...,۱۰۰,۱۱,۱۲,۱۴۰۴,۲۰,۲۰۵,۲۰۷,۷۰,۸۰,۹۰
42,ب ام و، سری 7,1396,7000.0,اتوماتیک,6.2,سفید,موکا,بدون رنگ,همراه با کارشناسی,passenger_car,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48,ب ام و، X3,1395,55000.0,اتوماتیک,7.3,سفید,مشکی,بدون رنگ,فول کامل با کیت ام و در حد صفر سیستم هارمن فول...,crossover,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55,ب ام و، سری 5 سدان,1395,30000.0,اتوماتیک,6.5,کربن بلک,موکا,بدون رنگ,فول ترین و کامل ترین نسخه وارد شده شرکت ۵ کلید...,passenger_car,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68,لکسوس، NX,1396,35000.0,اتوماتیک,7.9,مشکی,مارون,بدون رنگ,۷ کلید عمان به شرط کارشناسی لطفا فقط تماس گرفت...,crossover,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106,تویوتا، اف جی کروز,1390,150000.0,اتوماتیک,12.4,سفید,مشکی,بدون رنگ,,suv,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15423,بنز، کلاس S,1387,90000.0,اتوماتیک,,سفید,کرم,چند لکه رنگ,یخچال- سقف جیر- سیستم صوتی هارمن- نایت-تنظیم ا...,passenger_car,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15430,تویوتا، اف جی کروز,1391,220000.0,اتوماتیک,12.4,مشکی,مشکی,چند لکه رنگ,کد آگهی :2082 آپشن :فول اکستریم 6ایربگ -برای ه...,suv,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15531,ب ام و، X4,1394,127000.0,اتوماتیک,7.4,قهوه ای,موکا,درب تعویض,بسیار سالم و بدون هیچ گونه خط و خش,crossover,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15537,تویوتا، پرادو چهار در,1389,170000.0,اتوماتیک,13.0,مشکی,مشکی,یک لکه رنگ,تنظیم ارتفاع دوربین عقب گرمکن کرال صندلی برقی ...,suv,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [591]:
def dropOutLierPrice(df):
    df.drop(index=df[df['price']<40000000].index,inplace=True)
    df.drop(index=df[df['price']>7800000000].index,inplace=True)
    return df

In [592]:
df_train = fillCars(df_train)
df_test = fillCars(df_test)

In [593]:

df_train = fillNullWithMean(df_train)
df_test = fillNullWithMean(df_test)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['fuel'].fillna(df['fuel'].median(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['volume'].fillna(df['volume'].median(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are 

In [594]:
df_train = dropOutLierPrice(df_train)

In [595]:
from scipy import stats

def remove_outliers_by_zscore(df, column_name, threshold=3):

    df['zscore'] = df.groupby('title')[column_name].transform(lambda x: stats.zscore(x, nan_policy='omit'))
    df_clean = df[(df['zscore'].abs() < threshold)]
    df_clean = df_clean.drop(columns=['zscore'])

    return df_clean

df_train = remove_outliers_by_zscore(df_train, 'price', threshold=2.6)


## Train Model

In [596]:
df_train.columns

Index(['title', 'year', 'mileage', 'transmission', 'fuel', 'body_color',
       'inside_color', 'body_status', 'description', 'body_type',
       ...
       '۱۰۰', '۱۱', '۱۲', '۱۴۰۴', '۲۰', '۲۰۵', '۲۰۷', '۷۰', '۸۰', '۹۰'],
      dtype='object', length=350)

In [608]:
X_train = df_train.drop(columns=['price'])
y_train = df_train['price']
X_test = df_test

categorical_cols = ['title', 'transmission', 'body_color', 'inside_color', 'body_status', 'body_type', 'engine']
numerical_cols = ['year', 'mileage', 'volume', 'acceleration','fuel']

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

model = RandomForestRegressor()

pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

param_grid = {
    'model__n_estimators': [100], 
    'model__criterion': ['friedman_mse'],  
    'model__max_depth': [None],     
    'model__min_samples_split': [2],     
    'model__min_samples_leaf': [1],  
    'model__bootstrap': [True],    
    'model__warm_start': [True],     
    'model__random_state': [42],      

    'preprocessor__num__with_mean': [True], 
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=8, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)


Fitting 8 folds for each of 1 candidates, totalling 8 fits
[CV] END model__bootstrap=True, model__criterion=friedman_mse, model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, model__random_state=42, model__warm_start=True, preprocessor__num__with_mean=True; total time=  25.5s
[CV] END model__bootstrap=True, model__criterion=friedman_mse, model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, model__random_state=42, model__warm_start=True, preprocessor__num__with_mean=True; total time=  25.5s
[CV] END model__bootstrap=True, model__criterion=friedman_mse, model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, model__random_state=42, model__warm_start=True, preprocessor__num__with_mean=True; total time=  25.8s
[CV] END model__bootstrap=True, model__criterion=friedman_mse, model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split

In [610]:
grid_search.cv_results_

{'mean_fit_time': array([25.58544976]),
 'std_fit_time': array([0.202314]),
 'mean_score_time': array([0.15616718]),
 'std_score_time': array([0.03028871]),
 'param_model__bootstrap': masked_array(data=[True],
              mask=[False],
        fill_value=True),
 'param_model__criterion': masked_array(data=['friedman_mse'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_model__max_depth': masked_array(data=[nan],
              mask=[False],
        fill_value=1e+20),
 'param_model__min_samples_leaf': masked_array(data=[1],
              mask=[False],
        fill_value=999999),
 'param_model__min_samples_split': masked_array(data=[2],
              mask=[False],
        fill_value=999999),
 'param_model__n_estimators': masked_array(data=[100],
              mask=[False],
        fill_value=999999),
 'param_model__random_state': masked_array(data=[42],
              mask=[False],
        fill_value=999999),
 'param_model__warm_start': masked_arr

In [352]:
y_pred.shape

(3629,)

In [609]:
pd.DataFrame({'price' : y_pred}).to_csv('sheikh.csv')

In [None]:
X_train.columns

Index(['title', 'year', 'mileage', 'transmission', 'fuel', 'body_color',
       'inside_color', 'body_status', 'description', 'body_type', 'volume',
       'engine', 'acceleration', 'null', 'انجام شده', 'بدنه', 'بدون',
       'بدون خط', 'برقی', 'برگ', 'بسیار', 'به شرط', 'تازه', 'تخفیف', 'تسمه',
       'تعویض', 'تعویض شده', 'تمامی', 'تمیز', 'جلو', 'حد', 'خش', 'خط', 'خط خش',
       'دارای', 'در حد', 'درب', 'دودی', 'دوربین', 'روغن', 'روکش', 'روکش صندلی',
       'سالم', 'سرویس', 'سرویس ها', 'سمت', 'سند', 'شاسی', 'شیشه', 'صفر',
       'صندلی', 'عالی', 'فابریک', 'فنی سالم', 'فول', 'مانیتور', 'نو', 'واقعی',
       'پلمپ', 'کاملا سالم', 'گارانتی', 'گیربکس'],
      dtype='object')

In [550]:
from sklearn.ensemble import HistGradientBoostingRegressor

X_train = df_train.drop(columns=['price'])
y_train = df_train['price']
X_test = df_test

categorical_cols = ['title', 'transmission', 'body_color', 'inside_color', 'body_status', 'body_type', 'engine']
numerical_cols = ['year', 'mileage', 'volume', 'acceleration','fuel']

categorical_transformer = OneHotEncoder(handle_unknown='ignore',sparse_output=False)

numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

model = HistGradientBoostingRegressor()

pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

param_grid = {
    'model__learning_rate': [0.1],  
    'model__max_iter': [600], 
    'model__max_depth': [None], 
    'model__min_samples_leaf': [1],  
    'model__max_bins': [255],  
    'model__loss': ["squared_error"],
    'model__early_stopping': [False],
    'model__l2_regularization': [0.3],
    'preprocessor__num__with_mean': [True], 
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=[(slice(None), slice(None))], n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)


Fitting 1 folds for each of 1 candidates, totalling 1 fits
[CV] END model__learning_rate=0.1, preprocessor__num__with_mean=True; total time=   5.6s
Best parameters found:  {'model__learning_rate': 0.1, 'preprocessor__num__with_mean': True}


In [238]:
y_pred.shape

(3629,)

In [551]:
pd.DataFrame({'price' : y_pred}).to_csv('sheikh.csv')

In [263]:
import tensorflow as tf

X_train = df_train.drop(columns=['price'])
y_train = df_train['price']
X_test = df_test


categorical_cols = ['title', 'transmission', 'body_color', 'inside_color', 'body_status', 'body_type', 'engine']
numerical_cols = ['year', 'mileage', 'volume', 'acceleration','fuel',
 'esp',
 'آخر',
 'آفتاب',
 'آلومینیوم',
 'آمریکایی',
 'آپشن',
 'آینه',
 'اتاق',
 'ارتفاع',
 'اساسی',
 'استارت',
 'استثنایی',
 'اسفند',
 'اسپرت',
 'اصلی',
 'اضافه',
 'افتاب',
 'اماده',
 'انتقال',
 'اندروید',
 'اپشن',
 'ایربگ',
 'اینه',
 'باتری',
 'باربند',
 'باند',
 'بخاری',
 'بدونه',
 'برگه',
 'بشرط',
 'بلوتوث',
 'بهترین',
 'بگیره',
 'تاشو',
 'تایر',
 'ترمز',
 'ترک',
 'تشکر',
 'تصویر',
 'تودوزی',
 'تیکه',
 'جانبی',
 'جدید',
 'جزیی',
 'جفت',
 'جوش',
 'حالته',
 'حساسیت',
 'خارجی',
 'خشک',
 'خواب',
 'خوابیده',
 'خوب',
 'خورده',
 'خوردگی',
 'خوش',
 'خونگی',
 'داشبورد',
 'درحد',
 'دریچه',
 'دوخت',
 'دوگانه',
 'دینام',
 'رادار',
 'رادیاتور',
 'رخ',
 'رکاب',
 'ریال',
 'زنون',
 'زیبایی',
 'ساب',
 'سالمه',
 'ستون',
 'سرامیک',
 'سرحال',
 'سرقت',
 'سفارش',
 'سلامت',
 'سواری',
 'سوختگی',
 'سوز',
 'سینی',
 'شتاب',
 'شمع',
 'صافکاری',
 'صدا',
 'صوتی',
 'ضبط',
 'عیب',
 'فرمون',
 'قفل',
 'مالیدگی',
 'مسافرت',
 'مقطوع',
 'مموری',
 'هدلایت',
 'هیدرولیک',
 'وارداتی',
 'وایر',
 'وخش',
 'وسواس',
 'ویندوز',
 'پاور',
 'پایونیر',
 'پخش',
 'چرمی',
 'کارت',
 'کلاچ',
 'کیت',
 'کیلس',
 'گرمکن']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])


X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
af = 'hard_silu'

model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(X_train_processed.shape[1],)),
    tf.keras.layers.Dense(256, activation=af),  # First hidden layer

    tf.keras.layers.Dense(128, activation=af),  # First hidden layer
    tf.keras.layers.Dense(64, activation=af),   # Second hidden layer
    tf.keras.layers.Dense(32, activation=af),   # Third hidden layer
    tf.keras.layers.Dense(1)  # Output layer (regression)
])


model.compile(optimizer='adam', loss='mse', metrics=['mae'])

history = model.fit(X_train_processed, y_train, epochs=200, batch_size=16, validation_split=0.1)


y_pred = model.predict(X_test_processed)


Epoch 1/200




[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 2719055696728424448.0000 - mae: 1083662848.0000 - val_loss: 1263512445712859136.0000 - val_mae: 673063744.0000
Epoch 2/200
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 1065539329817837568.0000 - mae: 673730304.0000 - val_loss: 868367483021557760.0000 - val_mae: 591945984.0000
Epoch 3/200
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 783867265647902720.0000 - mae: 572503552.0000 - val_loss: 729023213337575424.0000 - val_mae: 518283584.0000
Epoch 4/200
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 639329865106980864.0000 - mae: 502162112.0000 - val_loss: 657144358658637824.0000 - val_mae: 483361472.0000
Epoch 5/200
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 576625266730729472.0000 - mae: 468507648.0000 - val_loss: 603114700867108864.0000 - val_mae: 46032028

KeyboardInterrupt: 

In [253]:
pd.DataFrame({'price' : y_pred[:,0]}).to_csv('sheikh.csv')