In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import os
# List every file available under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [1]:
# Load the cleaned Tayara cars dataset into a DataFrame.
csv_path = "/kaggle/input/cleaned-tayara-cars/cleaned_tayara_cars2.0.csv"
df = pd.read_csv(csv_path)

In [1]:
# Report how many distinct values each categorical column holds.
for label, column in (
    ("nb of brands ", "brand"),
    ("nb of fuel types ", "Fuel_Type"),
    ("nb of cities ", "city"),
    ("nb of car models ", "model"),
):
    print(label + str(df[column].nunique()))

In [1]:
# Remove the four columns that the rest of the notebook does not use,
# then preview the remaining frame.
unused_columns = ["price", "Unnamed: 0", "Kms_Driven", "year"]
df.drop(columns=unused_columns, inplace=True)
df.head()

In [1]:
# Frequency-encode `model`: each value is replaced by the share of rows
# (out of all rows in df) that carry the same model.
model_frequency = df["model"].value_counts() / len(df)
df["model_freq_encode"] = df["model"].map(model_frequency)

# The raw categorical column is no longer needed once encoded.
df.drop(columns=["model"], inplace=True)

df.head()

In [1]:
# Frequency-encode `brand`: each value is replaced by the share of rows
# (out of all rows in df) that carry the same brand.
brand_frequency = df["brand"].value_counts() / len(df)
df["brand_freq_encode"] = df["brand"].map(brand_frequency)

# The raw categorical column is no longer needed once encoded.
df.drop(columns=["brand"], inplace=True)

df.head()

In [1]:
# Frequency-encode `city`: each value is replaced by the share of rows
# (out of all rows in df) that carry the same city.
city_frequency = df["city"].value_counts() / len(df)
df["city_freq_encode"] = df["city"].map(city_frequency)

# The raw categorical column is no longer needed once encoded.
df.drop(columns=["city"], inplace=True)

df.head()

In [1]:
# Frequency-encode `Fuel_Type`: each value is replaced by the share of rows
# (out of all rows in df) that carry the same fuel type.
fuel_frequency = df["Fuel_Type"].value_counts() / len(df)
df["fuel_freq_encode"] = df["Fuel_Type"].map(fuel_frequency)

# The raw categorical column is no longer needed once encoded.
df.drop(columns=["Fuel_Type"], inplace=True)
df.head()


In [1]:
# Frequency-encode `Horse_pow`: each value is replaced by the share of rows
# (out of all rows in df) that carry the same horsepower value.
# The mapping gets its own name so it no longer overwrites `fuel_frequency`,
# which holds the fuel-type frequencies computed earlier in the notebook.
horse_frequency = df.groupby('Horse_pow').size() / len(df)
df.loc[:, 'horse_freq_encode'] = df['Horse_pow'].map(horse_frequency)

# The raw column is no longer needed once encoded.
df.drop(columns=["Horse_pow"], inplace=True)
df.head()

In [1]:
# Show the column names that remain after encoding.
list(df.columns)

In [1]:
# Feature columns: every column except the regression target `log_price`
# (the column the notebook uses as y).
# The target is excluded by name instead of by position (`del L[1]`), so a
# change in column order can neither leak the target into the features nor
# silently drop a real feature.
# NOTE(review): this assumes the column previously removed at index 1 was
# `log_price` -- confirm against df.columns.
L = [column for column in df.columns if column != 'log_price']

In [1]:
# Feature matrix: the columns named in L, in that order.
X = df.loc[:, L]
X

In [1]:
# Target kept as a single-column DataFrame (not a Series).
y = df.loc[:, ['log_price']]
y

In [1]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed seed makes the split,
# and therefore every metric computed from it, reproducible across kernel
# restarts; without it each run evaluates on a different test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [1]:
# Standardization scaler
from sklearn.preprocessing import StandardScaler

s_scaler = StandardScaler()
# `np.float` was only an alias for the builtin float; it was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# `np.float64` is the same dtype and works on every NumPy version.
# The scaler is fitted on the training split only and then applied to the
# test split, so the test rows do not influence the scaling statistics.
X_train = s_scaler.fit_transform(X_train.astype(np.float64))
X_test = s_scaler.transform(X_test.astype(np.float64))

#  Linear Regression


In [1]:
from sklearn import metrics
from sklearn.linear_model import LinearRegression

# Fit a linear model on the scaled training split.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = regressor.predict(X_test)

# Report the error metrics; RMSE is derived from the MSE computed once.
lr_mae = metrics.mean_absolute_error(y_test, y_pred)
lr_mse = metrics.mean_squared_error(y_test, y_pred)
for label, value in (('MAE:', lr_mae), ('MSE:', lr_mse), ('RMSE:', np.sqrt(lr_mse))):
    print(label, value)


In [1]:
# Distribution of the linear-regression residuals on the held-out split.
lr_residuals = y_test - y_pred
sns.distplot(lr_residuals)

# Decision Tree Regressor

In [1]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so repeated runs build the same model: the estimator's
# internal randomness is otherwise re-drawn on every fit.
dtr = DecisionTreeRegressor(random_state=42)

dtr.fit(X_train, y_train)
dtr_y_pred = dtr.predict(X_test)

# Report the error metrics; RMSE is derived from the MSE computed once.
dtr_mse = metrics.mean_squared_error(y_test, dtr_y_pred)
print('MAE:', metrics.mean_absolute_error(y_test, dtr_y_pred))
print('MSE:', dtr_mse)
print('RMSE:', np.sqrt(dtr_mse))

In [1]:
# Flatten the target to a 1-D array so it subtracts element-wise from the
# 1-D tree predictions. np.asarray accepts both a DataFrame and an ndarray,
# so this cell can be re-run safely; `.to_numpy()` would raise AttributeError
# on the second run, once y_test is already an ndarray.
y_test = np.asarray(y_test).ravel()
sns.distplot(y_test - dtr_y_pred)
plt.xlabel("Prediction Error")
_ = plt.ylabel("Count")

# Random Forest Regressor

In [1]:
from sklearn.ensemble import RandomForestRegressor

# Base (untuned) estimator handed to the randomized hyperparameter search.
# The forest is seeded so that its bootstrap samples and feature subsets,
# and therefore the search results, are reproducible across runs.
rf = RandomForestRegressor(random_state=42)

In [1]:
# Candidate hyperparameter values for RandomizedSearchCV.
# Note for self: RandomizedSearchCV is much faster than GridSearchCV.

# Number of trees in the random forest: 100, 200, ..., 1200.
n_estimators = list(range(100, 1300, 100))
# Number of features to consider at every split.
# 'auto' was removed for RandomForestRegressor in scikit-learn 1.3; for
# regressors it meant "all features", which is spelled 1.0.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 5, 10, ..., 30.
max_depth = list(range(5, 35, 5))
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 5, 10]

In [1]:
# Search space for the randomized search, keyed by the estimator's
# parameter names.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)



In [1]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyperparameter search: draws 10 candidate settings from
# random_grid and scores each with 5-fold cross-validation on negative MSE.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [1]:
# Run the search; the single-column target is flattened to 1-D first.
rf_random.fit(X_train, y_train.to_numpy().ravel())

In [1]:
# Show the hyperparameter combination the randomized search selected.
rf_random.best_params_

In [1]:
# Predict the held-out rows with the fitted search object.
predictions = rf_random.predict(X_test)

In [1]:
# Flatten the target to a 1-D array before subtracting the 1-D predictions.
# The decision-tree residual cell has already turned y_test into an ndarray,
# so calling `.to_numpy()` here raised AttributeError on a top-to-bottom run.
# np.asarray works whether y_test is still a DataFrame or already an array.
y_test = np.asarray(y_test).ravel()

sns.distplot(y_test - predictions)
plt.xlabel("Prediction Error")
_ = plt.ylabel("Count")

In [1]:
# Error metrics of the tuned random forest; RMSE is derived from the MSE
# computed once.
rf_mae = metrics.mean_absolute_error(y_test, predictions)
rf_mse = metrics.mean_squared_error(y_test, predictions)
for label, value in (('MAE:', rf_mae), ('MSE:', rf_mse), ('RMSE:', np.sqrt(rf_mse))):
    print(label, value)

# XGBoost

In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split


In [1]:
# Gradient-boosted tree regressor with explicit hyperparameters.
# The removed/deprecated constructor aliases `silent` (superseded by
# `verbosity`), `nthread` (superseded by `n_jobs`) and `seed` (superseded by
# `random_state`) are dropped, as is `missing=None`. All four were passed as
# None, so the configured model is unchanged.
net = xgb.XGBRegressor(
    objective='reg:squarederror',
    booster='gbtree',
    n_estimators=100,
    learning_rate=0.08,
    max_depth=7,
    min_child_weight=1,
    gamma=0,
    max_delta_step=0,
    subsample=0.75,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    importance_type='gain',
    n_jobs=1,
    random_state=0,
    verbosity=1,
)

In [1]:
# Train the booster on the training split, then predict the held-out rows
# (fit returns the estimator itself, so the calls can be chained).
XGB_y_pred = net.fit(X_train, y_train).predict(X_test)

In [1]:
# Error metrics of the XGBoost model; RMSE is derived from the MSE
# computed once.
xgb_mae = metrics.mean_absolute_error(y_test, XGB_y_pred)
xgb_mse = metrics.mean_squared_error(y_test, XGB_y_pred)
for label, value in (('MAE:', xgb_mae), ('MSE:', xgb_mse), ('RMSE:', np.sqrt(xgb_mse))):
    print(label, value)

In [1]:
# Distribution of the XGBoost residuals on the held-out split.
xgb_residuals = y_test - XGB_y_pred
ax = sns.distplot(xgb_residuals)
ax.set_xlabel("Prediction Error")
ax.set_ylabel("Count");