In [83]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# import the regressors
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

# import the metrics
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_log_error as MSLE
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import median_absolute_error as MAD

from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.metrics import mean_squared_error as MSE

# import standard scaler 
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import seaborn as sns

In [107]:
data = 'Data/clean_try.csv'

In [108]:
df = pd.read_csv(data)

In [109]:
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color
0,15000,2013.0,ford,others,excellent,6 cylinders,gas,128000.0,clean,automatic,rwd,truck,black
1,27990,2012.0,gmc,others,good,8 cylinders,gas,68696.0,clean,other,4wd,pickup,black
2,34590,2016.0,chevrolet,others,good,6 cylinders,gas,29499.0,clean,other,4wd,pickup,silver
3,35000,2019.0,toyota,tacoma,excellent,6 cylinders,gas,43000.0,clean,automatic,4wd,truck,grey
4,29990,2016.0,chevrolet,others,good,6 cylinders,gas,17302.0,clean,other,4wd,pickup,red


In [110]:
df['type'] = df['type'].replace('pickup', 'truck')

In [111]:
df.to_csv('Data/final_modeling.csv', index=False)

########################################################################################################################

In [34]:
# plt.figure(figsize=(10,6))
# sns.distplot(df['price'], bins=50)
# plt.title('Histogram of Price')
# plt.xlabel('Price')

In [89]:
df['price'] = np.log(df['price'])

In [90]:
df['odometer'] = np.log(df['odometer'])

In [105]:
df['age'] = 2020 - df['year']
df['age'] = df['age'].apply(lambda x: x if x > 0 else 0)

# drop year
df = df.drop('year', axis=1)

df.head()

Unnamed: 0,price,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,age
0,15000,ford,others,excellent,6 cylinders,gas,128000.0,clean,automatic,rwd,truck,black,7.0
1,27990,gmc,others,good,8 cylinders,gas,68696.0,clean,other,4wd,truck,black,8.0
2,34590,chevrolet,others,good,6 cylinders,gas,29499.0,clean,other,4wd,truck,silver,4.0
3,35000,toyota,tacoma,excellent,6 cylinders,gas,43000.0,clean,automatic,4wd,truck,grey,1.0
4,29990,chevrolet,others,good,6 cylinders,gas,17302.0,clean,other,4wd,truck,red,4.0


In [106]:
df.head()

Unnamed: 0,price,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,age
0,15000,ford,others,excellent,6 cylinders,gas,128000.0,clean,automatic,rwd,truck,black,7.0
1,27990,gmc,others,good,8 cylinders,gas,68696.0,clean,other,4wd,truck,black,8.0
2,34590,chevrolet,others,good,6 cylinders,gas,29499.0,clean,other,4wd,truck,silver,4.0
3,35000,toyota,tacoma,excellent,6 cylinders,gas,43000.0,clean,automatic,4wd,truck,grey,1.0
4,29990,chevrolet,others,good,6 cylinders,gas,17302.0,clean,other,4wd,truck,red,4.0


In [92]:
from sklearn import preprocessing
import pandas as pd
le = preprocessing.LabelEncoder()

In [93]:
df.columns

Index(['price', 'manufacturer', 'model', 'condition', 'cylinders', 'fuel',
       'odometer', 'title_status', 'transmission', 'drive', 'type',
       'paint_color', 'age'],
      dtype='object')

In [94]:
df[['manufacturer', 'model', 'condition', 'cylinders', 'fuel',
        'title_status', 'transmission', 'drive', 'type',
       'paint_color']] = df[['manufacturer', 'model', 'condition', 'cylinders', 'fuel',
       'title_status', 'transmission', 'drive', 'type',
       'paint_color']].apply(le.fit_transform)

In [95]:
df.shape

(106657, 13)

In [96]:
df = df.loc[0:50000, :]

In [97]:
# import train_test_split
from sklearn.model_selection import train_test_split

In [98]:
X = df.drop('price', axis=1)
y = df['price']

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [100]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

print('Mean Absolute Error:', round(metrics.mean_absolute_error(y_test, y_pred),2))
print('Mean Squared Error:', round(metrics.mean_squared_error(y_test, y_pred),2))
print('Root Mean Squared Error:', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),2))
print(regressor.score(X_test,y_test)*100)

Mean Absolute Error: 0.19
Mean Squared Error: 0.1
Root Mean Squared Error: 0.32
84.14964709474442


In [None]:
feature_imp = pd.Series(regressor.feature_importances_, index=X.columns).sort_values(ascending=False)
feature_imp

In [76]:
import xgboost as xgb
from xgboost import XGBRegressor

model = XGBRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print (model.score(X_test, y_test)*100)

83.64440143255898


In [78]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10)
scoring = 'r2'

In [79]:
models = []
models.append(('RandomForestRegressor', RandomForestRegressor()))
models.append(('XGBRegressor', XGBRegressor()))
models.append(('GB', GradientBoostingRegressor()))
models.append(('LASSO', linear_model.Lasso()))
models.append(('Ridge', linear_model.Ridge()))

results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, round(cv_results.mean(),2), cv_results.std())
    print(msg)

RandomForestRegressor: 0.840000 (0.009436)
XGBRegressor: 0.840000 (0.007983)
GB: 0.790000 (0.009210)
LASSO: 0.390000 (0.012000)
Ridge: 0.570000 (0.011986)


In [80]:
# hyper parameter tuning for random forest regressor

from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 30, 40, 50],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                            cv = 3, n_jobs = -1, verbose = 2)
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
# Summarize the results
print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.best_estimator_)


Fitting 3 folds for each of 360 candidates, totalling 1080 fits
[CV] END bootstrap=True, max_depth=10, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   1.0s
[CV] END bootstrap=True, max_depth=10, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   1.0s
[CV] END bootstrap=True, max_depth=10, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   1.0s
[CV] END bootstrap=True, max_depth=10, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   1.9s
[CV] END bootstrap=True, max_depth=10, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   1.9s
[CV] END bootstrap=True, max_depth=10, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   2.0s
[CV] END bootstrap=True, max_depth=10, max_features=2, min_samples_leaf=3, min_samples_split=10, n_estimators=100; total time=   0.9s
[CV]

In [82]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=1000, min_samples_split=8, min_samples_leaf=3, max_features=3, max_depth=50, bootstrap=True)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print(regressor.score(X_test,y_test)*100)


83.57978746397899
