# End-to-end Vehicle Sales Price Recommendation Project

[Front-end](https://recommend-vehicle-price.herokuapp.com/)

[GitHub repo](https://github.com/MichaelBryantDS/vehicle-price-rec)

**Import libraries and data**

In [None]:
#import libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#suppress warnings: every warning category is ignored for the rest of the session
import warnings
warnings.filterwarnings("ignore")

In [None]:
#import data: read the vehicle dataset CSV into a DataFrame
#(the path is relative to the notebook's working directory)
car_data = pd.read_csv('../input/vehicle-dataset-from-cardekho/car data.csv')

# EDA

**Defining variables and cleaning data**

In [None]:
#look at formatting of entries (first rows of the raw data)
car_data.head()

In [None]:
#look at null count and dtype of every column
car_data.info()

In [None]:
#column groups used by the rest of the notebook

#numerical features
numerical = ['Year', 'Present_Price', 'Kms_Driven', 'Selling_Price']

#categorical features
categorical = ['Car_Name', 'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']

**Data distribution and outliers**


In [None]:
#look at distribution of data (summary statistics per column)
car_data.describe()

In [None]:
#share of rows whose Selling_Price lies 3 or more standard deviations from the mean
selling_price_z = np.abs(stats.zscore(car_data.Selling_Price))
outlier_prices = car_data.Selling_Price[selling_price_z >= 3]
percentage = (len(outlier_prices)/len(car_data))*100
print('Percentage of Selling_Price outliers >= 3 std from the mean: {}%'.format(percentage))

In [None]:
#look at outliers greater than or equal to 3 std from mean
#the mask is built from the absolute z-scores of the numerical columns
#NOTE(review): this displays the selected data, not a count as "number of" suggested - confirm intent
car_data[numerical][np.abs(stats.zscore(car_data[numerical])) >= 3]

In [None]:
#look at outliers greater than or equal to 4 std from mean
#the mask is built from the absolute z-scores of the numerical columns
#NOTE(review): this displays the selected data, not a count as "number of" suggested - confirm intent
car_data[numerical][np.abs(stats.zscore(car_data[numerical])) >= 4]

In [None]:
#look at outliers greater than or equal to 5 std from mean
#the mask is built from the absolute z-scores of the numerical columns
#NOTE(review): this displays the selected data, not a count as "number of" suggested - confirm intent
car_data[numerical][np.abs(stats.zscore(car_data[numerical])) >= 5]

In [None]:
#look at outliers greater than or equal to 6 std from mean
#the mask is built from the absolute z-scores of the numerical columns
#NOTE(review): this displays the selected data, not a count as "number of" suggested - confirm intent
car_data[numerical][np.abs(stats.zscore(car_data[numerical])) >= 6]

In [None]:
#box plot of Selling_Price to visualize its outliers
selling_price_ax = sns.boxplot(x=car_data['Selling_Price'])
selling_price_ax.set_xlabel('Selling_Price')

In [None]:
#box plot of Present_Price to visualize its outliers
present_price_ax = sns.boxplot(x=car_data['Present_Price'])
present_price_ax.set_xlabel('Present_Price')

In [None]:
#box plot of Kms_Driven to visualize its outliers
kms_driven_ax = sns.boxplot(x=car_data['Kms_Driven'])
kms_driven_ax.set_xlabel('Kms_Driven')

**Data cleaning**

In [None]:
#capitalize all car names
#vectorized title-casing restricted to the Car_Name column: one pass over the column
#instead of one whole-DataFrame replace per row, and no other column can be touched
car_data['Car_Name'] = car_data['Car_Name'].str.title()

In [None]:
#look for anything that needs to be fixed
#np.unique returns the distinct car names (sorted) together with how often each occurs
values,counts=np.unique(car_data['Car_Name'],return_counts=True)
#lookup table of names and counts; later cells use the row position of a name as its code
unique_cars_counts = pd.DataFrame({'car names':values, 'counts':counts})
#display the distinct names for visual inspection
values

In [None]:
#reassign categorical names to numbers
#each mapping is applied only to the column it belongs to, so an identical string in
#another column (e.g. a car name) can never be recoded by accident; values that are
#not in a mapping are left unchanged, which keeps the cell safe to re-run
car_data['Fuel_Type'] = car_data['Fuel_Type'].replace({'Petrol': 0, 'Diesel': 1, 'CNG': 2})

car_data['Seller_Type'] = car_data['Seller_Type'].replace({'Dealer': 0, 'Individual': 1})

car_data['Transmission'] = car_data['Transmission'].replace({'Manual': 0, 'Automatic': 1})

In [None]:
#replace car names with numbers
#a name's code is its row position in unique_cars_counts; the mapping is built once
#and applied to the Car_Name column only, in a single replace call
name_to_code = {name: code for code, name in enumerate(unique_cars_counts['car names'])}
car_data['Car_Name'] = car_data['Car_Name'].replace(name_to_code)

In [None]:
#assign categorical variables to int dtype
#astype returns a new frame, so the result must be written back for the cast to take effect
car_data[categorical] = car_data[categorical].astype('int64')

**Data distributions**

In [None]:
#one histogram per numerical feature
for feature in numerical:
    plt.hist(car_data[feature], edgecolor='black')
    plt.xticks()
    plt.xlabel(feature)
    plt.ylabel('number of cars')
    plt.show()

In [None]:
#one histogram per categorical feature
for feature in categorical:
    plt.hist(car_data[feature], edgecolor='black')
    plt.xticks()
    plt.xlabel(feature)
    plt.ylabel('number of cars')
    plt.show()

**Finding correlations with a heat map and visualizations**

In [None]:
#heat map to find extreme positive and negative correlations in numerical data
#16x6 inch figure for the heat map
plt.figure(figsize=(16, 6))
#annot=True writes each pairwise correlation value inside its cell
sns.heatmap(car_data[numerical].corr(), annot=True)
#the trailing semicolon keeps the notebook from echoing the returned title object
plt.title('Correlation Heatmap for Numerical Variables', fontdict={'fontsize':12}, pad=12);

In [None]:
#look at how target is distributed among variables
#no plt.legend() here: the pairplot is drawn without a hue, so there are no labelled
#artists and the call could only produce a "no artists with labels" message
sns.pairplot(car_data)
plt.show()

In [None]:
#regression plot of Kms_Driven against Year (-0.52 corr)
sns.lmplot(data=car_data, x='Year', y='Kms_Driven')

#widen the x-axis so all markers are displayed, then re-apply the tick positions
#that were in place before the limits changed
xticks = plt.xticks()[0]
plt.xlim(2002, 2019)
plt.xticks(xticks)

plt.show()

In [None]:
#distribution of Selling_Price for each Seller_Type as a violin plot
sns.violinplot(data=car_data, x='Seller_Type', y='Selling_Price')
plt.show()

In [None]:
#individual Selling_Price observations for each Fuel_Type as a strip plot
fuel_type = car_data['Fuel_Type']
selling_price = car_data['Selling_Price']
sns.stripplot(x=fuel_type, y=selling_price)
plt.show()

In [None]:
#regression plot of Present_Price against Selling_Price (0.88 corr)
sns.lmplot(data=car_data, x='Selling_Price', y='Present_Price')

#widen the x-axis so all markers are displayed
plt.xlim(-2, 37)

plt.show()

In [None]:
#distribution of Present_Price for each Seller_Type as a violin plot
sns.violinplot(data=car_data, x='Seller_Type', y='Present_Price')
plt.show()

**Applying linear model to better understand feature relationship with selling price**

In [None]:
#change dtype of categorical features to object
car_data[categorical] = car_data[categorical].astype('object')

#features and target are derived without mutating car_data, so this cell can be re-run:
#drop() already returns a new frame, and the target is copied instead of popped
X = car_data.drop('Selling_Price', axis=1)
y = car_data['Selling_Price'].copy()

#remove Selling_Price from numerical variables
#(rebuilt by filtering, so a second run does not raise like list.remove would)
numerical = [column for column in numerical if column != 'Selling_Price']

In [None]:
#check column dtypes and non-null counts of the feature frame
X.info()

In [None]:
#create dummy variables for categorical variables
#dtype=float keeps the design matrix purely numeric: recent pandas versions emit
#boolean dummies by default, which statsmodels cannot combine with the numeric columns
car_data_dum = pd.get_dummies(X, drop_first=True, dtype=float)

In [None]:
#fit an ordinary least squares model on the dummy-encoded features and show its summary
import statsmodels.api as sm

#add the intercept column to the dummy-encoded design matrix
X_sm = sm.add_constant(car_data_dum)
model = sm.OLS(y, X_sm)
ols_results = model.fit()
ols_results.summary()

**Mutual information**

In [None]:
#work on a copy so the label encoding done for mutual information does not alter X
X_mi = X.copy()

In [None]:
#label-encode every object-typed column with integer codes
object_columns = X_mi.select_dtypes("object").columns
for column in object_columns:
    X_mi[column] = X_mi[column].factorize()[0]

#flag the columns that now have an int dtype as discrete features
discrete_features = X_mi.dtypes == int

In [None]:
#some continuous variables also have int dtypes
#Year and Kms_Driven are therefore explicitly marked as not discrete
discrete_features[['Year','Kms_Driven']] = False

In [None]:
#use regression since the target variable is continuous
from sklearn.feature_selection import mutual_info_regression

#define a function to produce mutual information scores
def make_mi_scores(X_mi, y, discrete_features):
    """Return mutual information scores between each feature and the target.

    X_mi: feature frame; its column names label the scores.
    y: target values passed to mutual_info_regression.
    discrete_features: forwarded unchanged to mutual_info_regression.
    Returns a Series named "MI Scores", sorted from highest to lowest score.
    """
    raw_scores = mutual_info_regression(X_mi, y, discrete_features=discrete_features)
    labelled_scores = pd.Series(raw_scores, name="MI Scores", index=X_mi.columns)
    return labelled_scores.sort_values(ascending=False)

#compute mutual information scores for the label-encoded features and display them
mi_scores = make_mi_scores(X_mi, y, discrete_features)
mi_scores

In [None]:
#define a function to plot mutual information scores
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the scores on the current figure.

    scores: Series of scores indexed by feature name. Bars are drawn in
    ascending score order and labelled with the feature names.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

#plot the scores on a new 8x5 inch figure at 100 dpi
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores)

In [None]:
#scatter plot of selling price against the encoded car name
fig, ax = plt.subplots(figsize=(12,4))
sns.scatterplot(x=X_mi.Car_Name, y=y, ax=ax)

#add names and arrows to highest values
#each entry: (car code, point the arrow targets, position of the label text)
highlighted_cars = [
    (15, (15,33), (30,15)),
    (24, (24,35), (39,30)),
]
for car_code, point, label_position in highlighted_cars:
    ax.annotate("{}".format(values[car_code]), xy=point, xytext=label_position,
                arrowprops=dict(facecolor='black',shrink=0.05))

plt.show()

# ML Modeling

**Preparing data for ML**

In [None]:
#import ML preprocessing packages
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [None]:
#one hot encoding for categorical variables
#NOTE: this encoder object is never applied below; the encoding is done by pd.get_dummies
encoder=OneHotEncoder(handle_unknown='error', drop='first')
X = pd.concat([X[numerical],pd.get_dummies(X[categorical], drop_first=True)],axis=1)
feature_names = X.columns

#train/test split: 75% of the rows go to training; no stratification is applied
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=1)

#take explicit copies so the in-place scaling below only modifies the split frames
X_train = X_train.copy()
X_test = X_test.copy()

#numerical pipeline
scaler=MinMaxScaler()

#fit the scaler on the training rows only, then apply the same scaling to the test rows
X_train[numerical] = scaler.fit_transform(X_train[numerical])
X_test[numerical] = scaler.transform(X_test[numerical])

**Untuned model performance**

In [None]:
#import ML packages
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std

In [None]:
#5-fold cross-validated negative MAE of an untuned LinearRegression
lm = LinearRegression()
lm.fit(X_train, y_train)
cv = cross_val_score(lm, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
cv_mean, cv_std = mean(cv), std(cv)
print('LinearRegression')
print(cv_mean, '+/-', cv_std)

In [None]:
#5-fold cross-validated negative MAE of an untuned Lasso
lm_l = Lasso(random_state = 1)
cv = cross_val_score(lm_l, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
cv_mean, cv_std = mean(cv), std(cv)
print('Lasso')
print(cv_mean, '+/-', cv_std)

In [None]:
#5-fold cross-validated negative MAE of an untuned Ridge
rid = Ridge(random_state = 1)
cv = cross_val_score(rid, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
cv_mean, cv_std = mean(cv), std(cv)
print('Ridge')
print(cv_mean, '+/-', cv_std)

In [None]:
#5-fold cross-validated negative MAE of an untuned ElasticNet
enr = ElasticNet(random_state = 1)
cv = cross_val_score(enr, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
cv_mean, cv_std = mean(cv), std(cv)
print('ElasticNet')
print(cv_mean, '+/-', cv_std)

In [None]:
#5-fold cross-validated negative MAE of an untuned RandomForestRegressor
rf = RandomForestRegressor(random_state = 1)
cv = cross_val_score(rf, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
cv_mean, cv_std = mean(cv), std(cv)
print('RandomForestRegressor')
print(cv_mean, '+/-', cv_std)

In [None]:
#5-fold cross-validated negative MAE of an untuned GradientBoostingRegressor
gbr = GradientBoostingRegressor(random_state = 1)
cv = cross_val_score(gbr, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
cv_mean, cv_std = mean(cv), std(cv)
print('GradientBoostingRegressor')
print(cv_mean, '+/-', cv_std)

In [None]:
#5-fold cross-validated negative MAE of an untuned SVR
svr = SVR()
cv = cross_val_score(svr, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
cv_mean, cv_std = mean(cv), std(cv)
print('SVR')
print(cv_mean, '+/-', cv_std)

**Tuning model performance**

In [None]:
#ml algorithm tuner
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV 

#performance reporting function
def clf_performance(regressor, model_name):
    """Print the best score, its spread, and the best parameters of a fitted search.

    regressor: object exposing best_score_, best_index_, best_params_ and a
        cv_results_ mapping with a 'std_test_score' sequence.
    model_name: label printed on the first line.
    """
    print(model_name)
    #standard deviation of the test score at the best parameter combination
    best_std = regressor.cv_results_['std_test_score'][regressor.best_index_]
    score_line = 'Best Score: {} +/- {}'.format(str(regressor.best_score_), str(best_std))
    print(score_line)
    params_line = 'Best Parameters: ' + str(regressor.best_params_)
    print(params_line)

In [None]:
#LinearRegression GridSearchCV
lm = LinearRegression()
#'normalize' is not searched: the parameter was deprecated in scikit-learn 1.0 and
#removed in 1.2, where listing it makes the search fail with an invalid-parameter error
param_grid = {
                'fit_intercept':[True,False],
                'copy_X':[True, False]
}
#5-fold grid search scored by negative mean absolute error, using all CPU cores
clf_lm = GridSearchCV(lm, param_grid = param_grid, cv = 5, scoring='neg_mean_absolute_error', n_jobs = -1)
best_clf_lm = clf_lm.fit(X_train,y_train)
clf_performance(best_clf_lm,'LinearRegressor')

In [None]:
#sweep Lasso alpha over 0.0002 ... 0.0198 and record the mean CV score for each value
alpha = [step/5000 for step in range(1,100)]
error = []

for candidate in alpha:
    lm_l = Lasso(random_state = 1, alpha=candidate)
    fold_scores = cross_val_score(lm_l, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
    error.append(np.mean(fold_scores))

#mean negative MAE as a function of alpha
plt.plot(alpha,error)

plt.show()

In [None]:
#show the alpha whose mean CV score is highest (scores are negative MAE)
err = list(zip(alpha, error))
df_err = pd.DataFrame(err, columns=['alpha','error'])
best_error = max(df_err.error)
df_err[df_err.error == best_error]

In [None]:
#Lasso GridSearchCV
lm_l = Lasso(random_state = 1)
#'normalize' is not searched: the parameter was deprecated in scikit-learn 1.0 and
#removed in 1.2, where listing it makes the search fail with an invalid-parameter error
param_grid = {
                'alpha':[0.0038],
                'fit_intercept':[True,False],
                'copy_X':[True, False]
}
#5-fold grid search scored by negative mean absolute error, using all CPU cores
clf_lm_l = GridSearchCV(lm_l, param_grid = param_grid, cv = 5, scoring='neg_mean_absolute_error', n_jobs = -1)
best_clf_lm_l = clf_lm_l.fit(X_train,y_train)
clf_performance(best_clf_lm_l,'Lasso')

In [None]:
#Ridge GridSearchCV
rid = Ridge(random_state = 1)
#'normalize' is not searched: the parameter was deprecated in scikit-learn 1.0 and
#removed in 1.2, where listing it makes the search fail with an invalid-parameter error
param_grid = {
                'fit_intercept':[True,False],
                'copy_X':[True, False],
                'solver': ['auto','svd','cholesky','lsqr','sparse_cg','sag','saga']
}
#5-fold grid search scored by negative mean absolute error, using all CPU cores
clf_rid = GridSearchCV(rid, param_grid = param_grid, cv = 5, scoring='neg_mean_absolute_error', n_jobs = -1)
best_clf_rid = clf_rid.fit(X_train,y_train)
clf_performance(best_clf_rid,'Ridge')

In [None]:
#sweep ElasticNet alpha over 0.0001 ... 0.0099 and record the mean CV score for each value
alpha = [step/10000 for step in range(1,100)]
error = []

for candidate in alpha:
    enr = ElasticNet(random_state = 1, alpha=candidate)
    fold_scores = cross_val_score(enr, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
    error.append(np.mean(fold_scores))

#mean negative MAE as a function of alpha
plt.plot(alpha,error)

plt.show()

In [None]:
#show the alpha whose mean CV score is highest (scores are negative MAE)
err = list(zip(alpha, error))
df_err = pd.DataFrame(err, columns=['alpha','error'])
best_error = max(df_err.error)
df_err[df_err.error == best_error]

In [None]:
#ElasticNet GridSearchCV
enr = ElasticNet(random_state = 1)
#'normalize' is not searched: the parameter was deprecated in scikit-learn 1.0 and
#removed in 1.2, where listing it makes the search fail with an invalid-parameter error
param_grid = {
                'alpha':[0.0018],
                'fit_intercept':[True,False],
                'copy_X':[True, False],
}
#5-fold grid search scored by negative mean absolute error, using all CPU cores
clf_enr = GridSearchCV(enr, param_grid = param_grid, cv = 5, scoring='neg_mean_absolute_error', n_jobs = -1)
best_clf_enr = clf_enr.fit(X_train,y_train)
clf_performance(best_clf_enr,'ElasticNet')

In [None]:
#RandomForestRegressor GridSearchCV
rf = RandomForestRegressor(random_state = 1)
#max_features=None considers every feature at each split, which is what the former
#'auto' setting meant for regressors; 'auto' itself was removed in scikit-learn 1.3
param_grid = {
                'n_estimators': [385] ,
                'bootstrap': [True],
                'max_depth': [9],
                'max_features': [None],
                'min_samples_leaf': [1,],
                'min_samples_split': [2]
              }
#5-fold grid search scored by negative mean absolute error, using all CPU cores
clf_rf = GridSearchCV(rf, param_grid = param_grid, cv = 5, scoring='neg_mean_absolute_error', n_jobs = -1)
best_clf_rf = clf_rf.fit(X_train,y_train)
clf_performance(best_clf_rf,'RandomForestRegressor')

In [None]:
#sweep GradientBoostingRegressor alpha over 0.0001 ... 0.0009 and record the mean CV score
#NOTE(review): scikit-learn documents alpha as used only by the huber and quantile
#losses, so with the default loss this curve is expected to be flat - confirm
alpha = [step/10000 for step in range(1,10)]
error = []

for candidate in alpha:
    gbr = GradientBoostingRegressor(random_state = 1, alpha=candidate)
    fold_scores = cross_val_score(gbr, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
    error.append(np.mean(fold_scores))

#mean negative MAE as a function of alpha
plt.plot(alpha,error)

plt.show()

In [None]:
#show the alpha whose mean CV score is highest (scores are negative MAE)
err = list(zip(alpha, error))
df_err = pd.DataFrame(err, columns=['alpha','error'])
best_error = max(df_err.error)
df_err[df_err.error == best_error]

In [None]:
#GradientBoostingRegressor GridSearchCV
gbr = GradientBoostingRegressor(random_state = 1)
#max_features=None considers every feature at each split, which is what the former
#'auto' setting meant; 'auto' itself was removed in scikit-learn 1.3
#alpha is deliberately left out of the grid
param_grid = {
                'n_estimators': [20],
                'max_depth': [7],
                'max_features': [None],
                'learning_rate': [0.2],
                'min_samples_leaf': [3],
                'min_samples_split': [2]
              }
#5-fold grid search scored by negative mean absolute error, using all CPU cores
clf_gbr = GridSearchCV(gbr, param_grid = param_grid, cv = 5, scoring='neg_mean_absolute_error', n_jobs = -1)
best_clf_gbr = clf_gbr.fit(X_train,y_train)
clf_performance(best_clf_gbr,'GradientBoostingRegressor')

In [None]:
#grid search over the SVR settings; only gamma has more than one candidate
svr = SVR()
param_grid = dict(
    kernel=['poly'],
    C=[24],
    coef0=[0.9],
    gamma=['scale','auto'],
)
#5-fold search scored by negative mean absolute error, using all CPU cores
clf_svr = GridSearchCV(svr, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
best_clf_svr = clf_svr.fit(X_train, y_train)
clf_performance(best_clf_svr, 'SVR')

**StackingRegressor**

In [None]:
#import ensemble packages and numpy functions
from sklearn.ensemble import StackingRegressor, VotingRegressor, BaggingRegressor, AdaBoostRegressor

Baseline

In [None]:
#StackingRegressor mean cross-validation
def _baseline_stacking_estimators():
    #fresh, untuned instances of every base regressor as (name, estimator) pairs
    return [
        ('lm', LinearRegression()),
        ('lm_l', Lasso(random_state = 1)),
        ('rid', Ridge(random_state = 1)),
        ('enr', ElasticNet(random_state = 1)),
        ('rf', RandomForestRegressor(random_state = 1)),
        ('gbr', GradientBoostingRegressor(random_state = 1)),
        ('svr', SVR()),
    ]

def get_stacking():
    """Return a StackingRegressor over the untuned base models.

    The meta learner is a LinearRegression and the stack uses 5-fold CV.
    """
    return StackingRegressor(estimators=_baseline_stacking_estimators(),
                             final_estimator=LinearRegression(), cv=5)

def get_models():
    """Return a dict of the untuned base models followed by their stacking ensemble."""
    models = dict(_baseline_stacking_estimators())
    models['stacking'] = get_stacking()
    return models

#cross-validate every model and print its mean and std negative MAE
models = get_models()
results, names = [], []
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
    results.append(scores)
    names.append(name)
    print('>%s %.3f +/- %.3f' % (name, mean(scores), std(scores)))

Hyperparameter tuning

In [None]:
#StackingRegressor mean cross-validation
def _tuned_stacking_estimators():
    #fresh instances of the tuned base regressors as (name, estimator) pairs;
    #LinearRegression ('lm') and ElasticNet ('enr') are not part of the tuned ensemble
    #- no normalize= argument: it was removed in scikit-learn 1.2; the values used before
    #  were the default (Lasso) or combined with fit_intercept=False (Ridge)
    #- max_features=None replaces 'auto' (removed in scikit-learn 1.3); both mean all features
    return [
        ('lm_l', Lasso(random_state = 1, alpha=0.0038, copy_X=True, fit_intercept=True)),
        ('rid', Ridge(random_state = 1, copy_X=True, fit_intercept=False, solver='cholesky')),
        ('rf', RandomForestRegressor(random_state = 1, bootstrap=True, max_depth=9, max_features=None,
                                     min_samples_leaf=1, min_samples_split=2, n_estimators=385)),
        ('gbr', GradientBoostingRegressor(random_state = 1, learning_rate=0.2, max_depth=7, max_features=None,
                                          min_samples_leaf=3, min_samples_split=2, n_estimators=20)),
        ('svr', SVR(C=24, coef0=0.9, gamma='scale', kernel='poly')),
    ]

def get_stacking():
    """Return a StackingRegressor over the tuned base models.

    The meta learner is a LinearRegression and the stack uses 5-fold CV.
    """
    stacking_model = StackingRegressor(estimators=_tuned_stacking_estimators(),
                                       final_estimator=LinearRegression(), cv=5)
    return stacking_model

def get_models():
    """Return a dict of the tuned base models followed by their stacking ensemble."""
    models = dict(_tuned_stacking_estimators())
    models['stacking'] = get_stacking()
    return models

#cross-validate every model and print its mean and std negative MAE
models = get_models()
results, names = list(),list()
for name, model in models.items():
    scores = cross_val_score(model,X_train,y_train, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
    results.append(scores)
    names.append(name)
    print('>%s %.3f +/- %.3f' % (name, mean(scores), std(scores)))

**VotingRegressor**

Baseline

In [None]:
#VotingRegressor mean cross-validation
def _baseline_voting_estimators():
    #fresh, untuned instances of every base regressor as (name, estimator) pairs
    return [
        ('lm', LinearRegression()),
        ('lm_l', Lasso(random_state = 1)),
        ('rid', Ridge(random_state = 1)),
        ('enr', ElasticNet(random_state = 1)),
        ('rf', RandomForestRegressor(random_state = 1)),
        ('gbr', GradientBoostingRegressor(random_state = 1)),
        ('svr', SVR()),
    ]

def get_voting():
    """Return a VotingRegressor over the untuned base models."""
    return VotingRegressor(estimators=_baseline_voting_estimators())

def get_models():
    """Return a dict of the untuned base models followed by their voting ensemble."""
    models = dict(_baseline_voting_estimators())
    models['voting'] = get_voting()
    return models

#cross-validate every model and print its mean and std negative MAE
models = get_models()
results, names = [], []
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
    results.append(scores)
    names.append(name)
    print('>%s %.3f +/- %.3f' % (name, mean(scores), std(scores)))

Hyperparameter tuning

In [None]:
#VotingRegressor mean cross-validation
def _tuned_voting_estimators():
    #fresh instances of the tuned base regressors as (name, estimator) pairs;
    #the linear models (lm, lm_l, rid, enr) are not part of the tuned voting ensemble
    #max_features=None replaces 'auto' (removed in scikit-learn 1.3); both mean all features
    return [
        ('rf', RandomForestRegressor(random_state = 1, bootstrap=True, max_depth=9, max_features=None,
                                     min_samples_leaf=1, min_samples_split=2, n_estimators=385)),
        ('gbr', GradientBoostingRegressor(random_state = 1, learning_rate=0.2, max_depth=7, max_features=None,
                                          min_samples_leaf=3, min_samples_split=2, n_estimators=20)),
        ('svr', SVR(C=24, coef0=0.9, gamma='scale', kernel='poly')),
    ]

def get_voting():
    """Return a VotingRegressor over the tuned base models."""
    voting_model = VotingRegressor(estimators=_tuned_voting_estimators())
    return voting_model

def get_models():
    """Return a dict of the tuned base models followed by their voting ensemble."""
    models = dict(_tuned_voting_estimators())
    models['voting'] = get_voting()
    return models

#cross-validate every model and print its mean and std negative MAE
models = get_models()
results, names = list(),list()
for name, model in models.items():
    scores = cross_val_score(model,X_train,y_train, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
    results.append(scores)
    names.append(name)
    print('>%s %.3f +/- %.3f' % (name, mean(scores), std(scores)))

**BaggingRegressor: Bagging and Pasting**

Bagging baseline

In [None]:
#bagging baseline: default base learner, sampling with replacement
bagging_model = BaggingRegressor(bootstrap=True, random_state=1, n_jobs=-1)
bagging_model.fit(X_train, y_train)

#5-fold cross-validated negative MAE
cv = cross_val_score(bagging_model, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
cv_mean, cv_std = mean(cv), std(cv)
print(cv_mean, '+/-', cv_std)

Hyperparameter tuning bagging

In [None]:
#tuned bagging: default base learner, 20 estimators, sampling with replacement
bagging_model = BaggingRegressor(n_estimators=20, bootstrap=True, random_state=1, n_jobs=-1)
bagging_model.fit(X_train, y_train)

#5-fold cross-validated negative MAE
cv = cross_val_score(bagging_model, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
cv_mean, cv_std = mean(cv), std(cv)
print(cv_mean, '+/-', cv_std)

Pasting baseline

In [None]:
#pasting baseline: default base learner, sampling without replacement
pasting_model = BaggingRegressor(bootstrap=False, random_state=1, n_jobs=-1)
pasting_model.fit(X_train, y_train)

#5-fold cross-validated negative MAE
cv = cross_val_score(pasting_model, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
cv_mean, cv_std = mean(cv), std(cv)
print(cv_mean, '+/-', cv_std)

Hyperparameter tuning pasting

In [None]:
#BaggingRegressor (pasting) mean cross-validation
#the base learner is passed positionally: its keyword was renamed from base_estimator
#to estimator in scikit-learn 1.2 (old name removed in 1.4), while the first positional
#slot is the base learner in both old and new releases
pasting_model = BaggingRegressor(
                                     RandomForestRegressor(),
                                     bootstrap=False,
                                     random_state=1,
                                     n_estimators=40,
                                     n_jobs=-1
                                     )

pasting_model.fit(X_train, y_train)

#5-fold cross-validated negative MAE
cv = cross_val_score(pasting_model, X_train, y_train,scoring='neg_mean_absolute_error', cv=5)
print(mean(cv), '+/-', std(cv))

**AdaBoostRegressor**

Baseline

In [None]:
#AdaBoost baseline with the default base learner
adaboost_model = AdaBoostRegressor(random_state=1)
adaboost_model.fit(X_train, y_train)

#5-fold cross-validated negative MAE
cv = cross_val_score(adaboost_model, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
cv_mean, cv_std = mean(cv), std(cv)
print(cv_mean, '+/-', cv_std)

Hyperparameter tuning

In [None]:
#AdaBoostRegressor mean cross-validation
#the base learner is passed positionally: its keyword was renamed from base_estimator
#to estimator in scikit-learn 1.2 (old name removed in 1.4), while the first positional
#slot is the base learner in both old and new releases
adaboost_model = AdaBoostRegressor(
                                       RandomForestRegressor(),
                                       learning_rate=0.01,
                                       random_state=1)

adaboost_model.fit(X_train , y_train)

#5-fold cross-validated negative MAE
cv = cross_val_score(adaboost_model, X_train, y_train,scoring='neg_mean_absolute_error', cv=5)
print(mean(cv), '+/-', std(cv))

**MSE, RMSE, MAE, and R-squared values for best models using test set**

In [None]:
#import metrics packages
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
#fit the voting ensemble on the training set and report its test-set metrics
voting_model = get_voting()
voting_model.fit(X_train, y_train)
tpred_voting = voting_model.predict(X_test)

#MSE is computed once and reused for the RMSE
voting_mse = mean_squared_error(y_test, tpred_voting)
print('VotingRegressor')
print('MSE: {}'.format(voting_mse))
print('RMSE: {}'.format(np.sqrt(voting_mse)))
print('MAE: {}'.format(mean_absolute_error(y_test, tpred_voting)))
print('R-squared: {}'.format(r2_score(y_test, tpred_voting)))

In [None]:
#fit the stacking ensemble on the training set and report its test-set metrics
stacking_model = get_stacking()
stacking_model.fit(X_train, y_train)
tpred_stack = stacking_model.predict(X_test)

#MSE is computed once and reused for the RMSE
stacking_mse = mean_squared_error(y_test, tpred_stack)
print('StackingRegressor')
print('MSE: {}'.format(stacking_mse))
print('RMSE: {}'.format(np.sqrt(stacking_mse)))
print('MAE: {}'.format(mean_absolute_error(y_test, tpred_stack)))
print('R-squared: {}'.format(r2_score(y_test, tpred_stack)))

In [None]:
#refit the pasting model on the training set and report its test-set metrics
pasting_model.fit(X_train, y_train)
tpred_pasting = pasting_model.predict(X_test)

#MSE is computed once and reused for the RMSE
pasting_mse = mean_squared_error(y_test, tpred_pasting)
print('BaggingRegressor (Pasting)')
print('MSE: {}'.format(pasting_mse))
print('RMSE: {}'.format(np.sqrt(pasting_mse)))
print('MAE: {}'.format(mean_absolute_error(y_test, tpred_pasting)))
print('R-squared: {}'.format(r2_score(y_test, tpred_pasting)))

In [None]:
#RandomForestRegressor metrics
#max_features=None replaces 'auto' (removed in scikit-learn 1.3); both mean all features
rf = RandomForestRegressor(random_state = 1,bootstrap=True, max_depth=9, max_features=None, min_samples_leaf=1, min_samples_split= 2, n_estimators=385)
rf.fit(X_train,y_train)
#predictions on the held-out test set
tpred_rf=rf.predict(X_test)
print('RandomForestRegressor')
print('MSE: {}'.format(mean_squared_error(y_test,tpred_rf)))
print('RMSE: {}'.format(np.sqrt(mean_squared_error(y_test,tpred_rf))))
print('MAE: {}'.format(mean_absolute_error(y_test,tpred_rf)))
print('R-squared: {}'.format(r2_score(y_test,tpred_rf)))

In [None]:
#refit the bagging model on the training set and report its test-set metrics
bagging_model.fit(X_train, y_train)
tpred_bagging = bagging_model.predict(X_test)

#MSE is computed once and reused for the RMSE
bagging_mse = mean_squared_error(y_test, tpred_bagging)
print('BaggingRegressor')
print('MSE: {}'.format(bagging_mse))
print('RMSE: {}'.format(np.sqrt(bagging_mse)))
print('MAE: {}'.format(mean_absolute_error(y_test, tpred_bagging)))
print('R-squared: {}'.format(r2_score(y_test, tpred_bagging)))

In [None]:
#refit the AdaBoost model on the training set and report its test-set metrics
adaboost_model.fit(X_train, y_train)
tpred_adaboost = adaboost_model.predict(X_test)

#MSE is computed once and reused for the RMSE
adaboost_mse = mean_squared_error(y_test, tpred_adaboost)
print('AdaBoostRegressor')
print('MSE: {}'.format(adaboost_mse))
print('RMSE: {}'.format(np.sqrt(adaboost_mse)))
print('MAE: {}'.format(mean_absolute_error(y_test, tpred_adaboost)))
print('R-squared: {}'.format(r2_score(y_test, tpred_adaboost)))

In [None]:
#GradientBoostingRegressor metrics
#max_features=None replaces 'auto' (removed in scikit-learn 1.3); both mean all features
gbr = GradientBoostingRegressor(random_state = 1,learning_rate= 0.2, max_depth= 7, max_features= None, min_samples_leaf= 3, min_samples_split= 2, n_estimators= 20)
gbr.fit(X_train,y_train)
#predictions on the held-out test set
tpred_gbr=gbr.predict(X_test)
print('GradientBoostingRegressor')
print('MSE: {}'.format(mean_squared_error(y_test,tpred_gbr)))
print('RMSE: {}'.format(np.sqrt(mean_squared_error(y_test,tpred_gbr))))
print('MAE: {}'.format(mean_absolute_error(y_test,tpred_gbr)))
print('R-squared: {}'.format(r2_score(y_test,tpred_gbr)))

In [None]:
#fit the tuned SVR on the training set and report its test-set metrics
svr = SVR(C=24, coef0=0.9, gamma='scale', kernel='poly')
svr.fit(X_train, y_train)
tpred_svr = svr.predict(X_test)

#MSE is computed once and reused for the RMSE
svr_mse = mean_squared_error(y_test, tpred_svr)
print('SVR')
print('MSE: {}'.format(svr_mse))
print('RMSE: {}'.format(np.sqrt(svr_mse)))
print('MAE: {}'.format(mean_absolute_error(y_test, tpred_svr)))
print('R-squared: {}'.format(r2_score(y_test, tpred_svr)))

# Feature Importance

In [None]:
#import packages for explaining feature importance
from pdpbox import pdp, get_dataset, info_plots
import eli5
from eli5.sklearn import PermutationImportance
import shap

In [None]:
#preparing data for shap
#background data: the training features, labelled with the model's feature names
X_shap = pd.DataFrame(X_train)
X_shap.columns = feature_names

#data to explain: the test features, labelled the same way
pred_data = pd.DataFrame(X_test)
pred_data.columns = feature_names

In [None]:
#create object that can calculate shap values
#the explainer wraps the fitted SVR's predict function with X_shap as background data
explainer = shap.Explainer(svr.predict, X_shap)
#shap values for every row of the test features
shap_values = explainer(pred_data)

In [None]:
#summary_plot using SVR
#initjs loads shap's JavaScript support into the notebook
shap.initjs()
#summary of the shap values computed for the test features
shap.summary_plot(shap_values, pred_data)

In [None]:
#car that has the most impact on SVR model: car_name_37
#look up the car name stored at position 37 of the unique-name array
values[37]

In [None]:
#permutation importance from Voting Regressor
#computed on the test features and test targets using the fitted voting model
perm = PermutationImportance(voting_model).fit(pred_data, y_test)
#show the weight of every feature (top is set to the total number of features)
eli5.show_weights(perm, feature_names = list(feature_names), top=len(feature_names))

In [None]:
#cars that have the most impact on Voting Regressor model: car_name_27
#NOTE(review): the names at positions 27 and 28 are both printed although only
#car_name_27 is mentioned above - confirm which codes are meant
print(values[27])
print(values[28])

# Conclusions

**Best model**
- SVR
- MSE: 0.61430
- RMSE: 783.77 USD
- MAE: 502.69 USD
- R-squared: 0.97032

**Most important features**
- Present_Price
- Year
- Seller_Type


# Productionization

I created a [front-end](https://recommend-vehicle-price.herokuapp.com/) for this model with Flask, deployed on Heroku, to recommend vehicle sales prices.

See the [GitHub repo](https://github.com/MichaelBryantDS/vehicle-price-rec) for more information.