# Used Car Price Prediction 

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Installing Necessary Libraries and Packages

In [5]:
import pandas as pd
import numpy as np
import datetime
import pickle
from sklearn import preprocessing
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from spark_sklearn import GridSearchCV
from spark_sklearn.util import createLocalSparkSession
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

## Importing & Reading CSV File 


In [6]:
'''
Partial data preprocessing for the dataset being imported (cars_outliers_removed.csv)
has already been done (i.e. the dataset has dropped the missing & duplicate 
values, been standardized and outliers have been removed by model).

Also, from the data analysis that has already been done on the dataset, it has 
been found that the features Id, State, Vin and City can be discarded since they 
have very weak to no correlation with the Price attribute (i.e. the dependent variable)
and hence these attributes need to be dropped.
'''

# Importing the csv file i.e. dataset from Local System into Google Colaboratory
from google.colab import files
uploaded=files.upload()

Saving cars_outliers_removed.csv to cars_outliers_removed.csv


In [7]:
# Reading the dataset
cleaned_cars = pd.read_csv('cars_outliers_removed.csv', encoding='latin1', error_bad_lines=False,warn_bad_lines=False)

In [8]:
cleaned_cars.sample(5)

Unnamed: 0.1,Unnamed: 0,Id,Price,Year,Mileage,City,State,Vin,Make,Model
688659,688659,830574,4850,2006,188401,Auburn,WA,KM8JN12DX6U393613,Hyundai,Tucson4dr
217990,217990,644961,16590,2013,92683,Englewood,CO,2FMDK4JC8DBA44551,Ford,EdgeSEL
420102,420102,948979,12500,2010,92788,Southern Pines,NC,2LMDJ6JC5ABJ31726,Lincoln,MKXFWD
454591,454591,994700,52995,2014,49125,Burlington,NJ,WDDUG8CB6EA006322,Mercedes-Benz,S-ClassS550
150695,150695,574241,15988,2014,42241,Chesapeake,VA,2C4RDGCG0ER429486,Dodge,Grand


## Preprocessing

### Outlier Management (to remove the remaining outliers)

In [9]:
print("Too new: %d" % cleaned_cars.loc[cleaned_cars.Year >= 2017].count()['Id'])
print("Too few km: " , cleaned_cars.loc[cleaned_cars.Mileage < 5000].count()['Id'])
print("Too many km: " , cleaned_cars.loc[cleaned_cars.Mileage > 250000].count()['Id'])

Too new: 71946
Too few km:  14055
Too many km:  374


In [10]:
#Dropping the unuseful attributes 
cleaned_cars = cleaned_cars.drop(["Id", "State", "Vin", "City"], axis=1)

# Replacing the NaN values for categoric attributes
cleaned_cars['Make'].fillna(value='blank', inplace=True)
cleaned_cars['Model'].fillna(value='blank', inplace=True)

# Dropping the duplicates
cleaned_cars = cleaned_cars.drop_duplicates(["Year", "Mileage", "Price", "Make", "Model"])

# Removing the outliers
cleaned_cars = cleaned_cars[
        (cleaned_cars.Year <= 2017) 
      & (cleaned_cars.Year >= 2008)  
      & (cleaned_cars.Mileage >= 5000) 
      & (cleaned_cars.Mileage <= 250000)]

# Removing the extra column
cleaned_cars = cleaned_cars.drop(["Unnamed: 0"], axis=1)

In [11]:
# Sample of the final dataset to be used 
cleaned_cars.sample(5)

Unnamed: 0,Price,Year,Mileage,Make,Model
169494,14999,2014,49068,Dodge,JourneySXT
372218,12289,2017,41718,Kia,ForteLX
129325,16986,2015,26408,Chrysler,200S
60299,12500,2015,22032,Chevrolet,Cruze1LT
158665,25800,2015,47844,Jeep,Grand


### Normalizing price distribution

In [12]:
# Applying log transformation 
cleaned_cars['Price'] = np.log(cleaned_cars['Price'])

### Label Encoding

In [13]:
features = ['Make', 'Model']
les = {}

for f in features:
  les[f] = preprocessing.LabelEncoder()
  les[f] = les[f].fit(cleaned_cars[f])
  cleaned_cars[f] = les[f].transform(cleaned_cars[f])

### Train/Test Split

In [14]:
# Splitting the dataset into training set(66% of total) and test set(33% of total)
train_set, test_set = train_test_split(cleaned_cars, test_size = 0.33, random_state = 42)

# Separating target labels from the rest
cars_train = train_set.drop("Price", axis=1) #train without target
cars_price_train = train_set["Price"].copy() #target

cars_test  = test_set.drop("Price", axis=1) #test without target
cars_price_test = test_set["Price"].copy() #target


## Training and Evaluating Model

### Function Definitions

#### Best Score Function

In [15]:
# This function returns the best score achieved by the model over all the cv splits.

def best_score(forest, cv):
  best_score = 0
  for i in range(0, cv):
    items = list(map(lambda x: abs(x), forest.cv_results_['split'+str(i)+'_test_score']))
    arr = np.append(best_score, items)
    best_score = max(arr)
  return best_score

#### Best Parameters Function

In [16]:
# This functions returns the best combination of parameters, which allows us to
# get the best score.

def best_params(forest):
  return forest.cv_results_['params'][forest.cv_results_['rank_test_score'][0]-1]

#### Performance Metric Function

In [17]:
# Calculates and returns the performance score between true (y_true) and 
# predicted (y_predict) values based on the metric chosen.

from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
   score = r2_score(y_true, y_predict)
   return score

### Linear Regression

In [18]:
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

import os

# Class DFSelector created to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames.

class DFSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names):
        self.feature_names = feature_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.feature_names].values

In [19]:
# Setting categorical and numerical attributes
cat_features = ["Make", "Model"]
num_features = list(cars_train.drop(cat_features, axis=1))

# Building the Pipelines for categorical and numerical dataframes

numerical_pipeline = Pipeline([
    ("selector", DFSelector(num_features)),
    ("std_scaler", StandardScaler())
])

categorical_pipeline = Pipeline([
    ("selector", DFSelector(cat_features)),
    ("encoder", OneHotEncoder(sparse=True))
])

# Full Pipeline
full_pipeline = FeatureUnion(transformer_list =[
    ("num_pipeline", numerical_pipeline),
    ("cat_pipeline", categorical_pipeline)
])

In [20]:
# Applying the full pipeline on the training set
ohe_cars_train = full_pipeline.fit_transform(cars_train) 

In [21]:
from sklearn.linear_model import LinearRegression

sc = createLocalSparkSession().sparkContext

model = LinearRegression()
parameters = {'fit_intercept':[False], 'normalize':[True,False], 'copy_X':[True, False]}

# Spark parallelized GridSearchCV for hyperparameter tuning
gs = GridSearchCV(sc, estimator=model, param_grid=parameters, cv=3, n_jobs=-1, verbose=1, return_train_score=True)
lin_reg = gs.fit(ohe_cars_train, cars_price_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [22]:
# Best CV parameters
bp = best_params(lin_reg)
best_params(lin_reg)

lin_reg_model = LinearRegression(
                              fit_intercept=bp["fit_intercept"],
                              normalize=bp["normalize"],
                              copy_X=bp["copy_X"])
%time lin_reg_model.fit(ohe_cars_train, cars_price_train)

CPU times: user 23 s, sys: 19.3 s, total: 42.3 s
Wall time: 21.6 s


LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=True)

In [23]:
# Linear Regression score for price prediction
ohe_cars_test = full_pipeline.transform(cars_test)

print("Best Linear Regression parameters:")
print(bp)
print("\nLinear Regressor score without CV on train set: %.3f" % lin_reg_model.score(ohe_cars_train, cars_price_train)) #score on train set
print("Linear Regression score without CV on test set: %.3f" % lin_reg_model.score(ohe_cars_test, cars_price_test)) # score on test set
print("Linear Regression Best score with CV=3: %.3f" % best_score(lin_reg, 3)) # -> best score on test set is high

Best Linear Regression parameters:
{'copy_X': True, 'fit_intercept': False, 'normalize': True}

Linear Regressor score without CV on train set: 0.948
Linear Regression score without CV on test set: 0.947
Linear Regression Best score with CV=3: 0.944


In [24]:
# Prediction on the whole training set
from sklearn.metrics import mean_squared_error

price_predictions_train = lin_reg_model.predict(ohe_cars_train) 

# Reversing np.log operation (was done when the price attribute was normalized)
price_predictions_train_normal = np.exp(price_predictions_train)
cars_price_train_normal = np.exp(cars_price_train)

# MSE between target values (i.e price) and predicted values
lin_mse = mean_squared_error(cars_price_train_normal, price_predictions_train_normal)
lin_rmse = np.sqrt(lin_mse)
lin_rmse 

2361.212103512848

In [25]:
print(price_predictions_train_normal[580:590])
print('\n')
print(list(cars_price_train_normal[580:590]))

[16456.0012509  29733.54045406 13460.80264209 26743.80331393
 27811.42135111 14289.70104336 21386.21090102 21667.23741894
 24194.97259724 31464.62567763]


[17900.000000000007, 27785.99999999998, 13998.999999999996, 29995.000000000015, 28699.999999999996, 12947.999999999993, 21992.00000000002, 19995.00000000002, 25204.99999999998, 36000.00000000002]


In [26]:
# Prediction on test set
price_predictions_test = lin_reg_model.predict(ohe_cars_test)

# Reversing np.log operation 
price_predictions_test_normal = np.exp(price_predictions_test)
cars_price_test_normal = np.exp(cars_price_test)

final_mse = mean_squared_error(cars_price_test_normal, price_predictions_test_normal)
final_rmse = np.sqrt(final_mse)

final_rmse

2450.7668485224413

In [27]:
print(price_predictions_test_normal[7650:7660]) #predictions on test set
print('\n')
print(list(cars_price_test_normal[7650:7660])) #known values in test set

[19376.86848468 30242.44523216 40573.57759926 30227.9027336
 24661.87767341 21181.83979224 37891.16627097 15743.47274119
 16088.42071314 17888.54186287]


[18300.000000000015, 33879.0, 42499.99999999999, 29000.00000000001, 25991.0, 21987.000000000015, 22496.999999999985, 16796.99999999999, 15999.00000000001, 18200.000000000004]


In [28]:
# R^2 regression score between hold out prices and predicted prices 
from sklearn.metrics import r2_score
r2_score(cars_price_test_normal, price_predictions_test_normal, multioutput='variance_weighted') 

0.9438673984537591

### Decision Tree Regression

In [29]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer

# This function performs grid search over the 'max_depth' parameter for a 
# decision tree regressor trained on the input data [X, y]. 

def DT_SparkizedGridSearchCV(X, y):

    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits = 10, test_size = 0.20, random_state = 42)

    # Create a decision tree regressor object
    regressor = DecisionTreeRegressor()

    # Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth':[1, 5, 10, 15, 16, 17]}

    # Transform 'performance_metric' into a scoring function using 'make_scorer' 
    scoring_fnc = make_scorer(performance_metric)

    # Create the grid search cv object --> GridSearchCV()
    sc = createLocalSparkSession().sparkContext
    grid = GridSearchCV(sc, estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    tree_reg = grid.fit(X, y)
    
    # Return grid search output after fitting the data
    return tree_reg

In [30]:
from sklearn.model_selection import ShuffleSplit

# Fit the training data to the model using spark parallelized grid search CV
tree_reg = DT_SparkizedGridSearchCV(cars_train, cars_price_train)

# Taking best parameters
bp = best_params(tree_reg)

# Produce the optimal value for 'max_depth'
print("Parameter 'max_depth' is {} for the optimal model.".format(bp['max_depth']))

Parameter 'max_depth' is 17 for the optimal model.


In [31]:
# Due to the limitation of the spark-sklearn library's implementation of
# GridSearchCV, best_estimator_ parameter it's not available, so we need to
# fit a DecisionTreeRegressor on the best parameters given to us by gridSearchCV

tree_reg_model = DecisionTreeRegressor(
                              max_depth=bp['max_depth'])
%time tree_reg_model.fit(cars_train, cars_price_train)

CPU times: user 1.38 s, sys: 0 ns, total: 1.38 s
Wall time: 1.37 s


DecisionTreeRegressor(criterion='mse', max_depth=17, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [32]:
# DecisionTreeRegressor score for price prediction

print("Best Decision Tree Regressor parameters:")
print(bp)
print("\nDecision Tree Regressor score without CV on train set: %.3f" % tree_reg_model.score(cars_train, cars_price_train)) #score on train set
print("Decision Tree Regressor score without CV on test set: %.3f" % tree_reg_model.score(cars_test, cars_price_test)) # score on test set
print("Decision Tree Regressor Best score with CV=10: %.3f" % best_score(tree_reg, 10)) # -> best score on test set is high

Best Decision Tree Regressor parameters:
{'max_depth': 17}

Decision Tree Regressor score without CV on train set: 0.949
Decision Tree Regressor score without CV on test set: 0.930
Decision Tree Regressor Best score with CV=10: 0.931


In [33]:
# Prediction on whole training set with the final model given by the best CV parameters
price_predictions_train = tree_reg_model.predict(cars_train) 

# Reversing np.log operation
price_predictions_train_normal = np.exp(price_predictions_train)
cars_price_train_normal = np.exp(cars_price_train)

# MSE between target values (i.e known) and predicted values
lin_mse = mean_squared_error(cars_price_train_normal, price_predictions_train_normal)
lin_rmse = np.sqrt(lin_mse)
lin_rmse 

2438.805253296204

In [34]:
print(price_predictions_train_normal[1670:1680])
print('\n')
print(list(cars_price_train_normal[1670:1680]))

[46410.9161647  15140.92013303 23604.93919515 26968.62968701
 25458.8409949  13832.74746199 62580.15299687 12876.00829602
 22918.37949149 16002.9971716 ]


[40997.0, 15500.00000000001, 23994.99999999999, 28575.999999999978, 28902.999999999993, 12399.999999999996, 64993.99999999994, 11989.99999999999, 22994.999999999996, 14998.000000000002]


In [35]:
# Prediction on test set
price_predictions_test = tree_reg_model.predict(cars_test)

# Reversing np.log operation
price_predictions_test_normal = np.exp(price_predictions_test)
cars_price_test_normal = np.exp(cars_price_test)

final_mse = mean_squared_error(cars_price_test_normal, price_predictions_test_normal)
final_rmse = np.sqrt(final_mse)

final_rmse

3160.7621493842207

In [36]:
print(price_predictions_test_normal[1939:1949]) #predictions on test set
print('\n')
print(list(cars_price_test_normal[1939:1949])) #known values in test set

[15816.97837917 26968.62968701 21798.24986954 14677.83327566
 17703.06484765  8700.         31594.53691537 15816.97837917
 22043.31240188 12551.5251707 ]


[15960.000000000011, 26975.000000000004, 21794.99999999998, 13221.000000000005, 21932.0, 9499.999999999995, 31494.99999999998, 14899.999999999995, 20887.000000000015, 11613.999999999995]


In [37]:
# R^2 regression score between hold out prices and predicted prices
from sklearn.metrics import r2_score
r2_score(cars_price_test_normal, price_predictions_test_normal, multioutput='variance_weighted') 

0.9066326981600589

### Random Forest Regression

In [38]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer

# This function performs grid search over the 'max_depth' parameter for a 
# random forest regressor trained on the input data [X, y]. 

def RF_SparkizedGridSearchCV(X, y):
  
    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits = 10, test_size = 0.20, random_state = 42)

    # Create a random forest regressor object
    regressor = RandomForestRegressor()

    # Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth':[16, 17, 18]}

    # Transform 'performance_metric' into a scoring function using 'make_scorer' 
    scoring_fnc = make_scorer(performance_metric)

    # Create the grid search cv object --> GridSearchCV()
    sc = createLocalSparkSession().sparkContext
    grid = GridSearchCV(sc, estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    tree_reg = grid.fit(X, y)
    
    # Return the best parameters after fitting the data
    return tree_reg

  from numpy.core.umath_tests import inner1d


In [39]:
from sklearn.model_selection import ShuffleSplit

# Fit the training data to the model using spark parallelized grid search CV
forest_reg = RF_SparkizedGridSearchCV(cars_train, cars_price_train)

# Taking best parameters
bp = best_params(forest_reg)

# Produce the optimal value for 'max_depth'
print("Parameter 'max_depth' is {} for the optimal model.".format(bp['max_depth']))

Parameter 'max_depth' is 18 for the optimal model.


In [40]:
# Fitting the forest

forest_reg_model = RandomForestRegressor(
                              max_depth=bp['max_depth']
                                 
)

%time forest_reg_model.fit(cars_train, cars_price_train)

CPU times: user 9.85 s, sys: 0 ns, total: 9.85 s
Wall time: 9.79 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=18,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [41]:
# RandomForestRegressor score for price prediction

print(bp)
print("\nRandom Forest Regressor score without CV on train set: %.3f" % forest_reg_model.score(cars_train, cars_price_train)) #score on train set
print("Random Forest Regressor score without CV on test set: %.3f" % forest_reg_model.score(cars_test, cars_price_test)) #score on test set
print("Random Forest Regressor Best score with CV=4: %.3f" % best_score(forest_reg, 4)) # -> best score on test set is high

{'max_depth': 18}

Random Forest Regressor score without CV on train set: 0.962
Random Forest Regressor score without CV on test set: 0.946
Random Forest Regressor Best score with CV=4: 0.946


In [42]:
# Prediction on whole training set
price_predictions_train = forest_reg_model.predict(cars_train) #using the whole training set for making prediction with the final model given by the best CV parameters

# Reversing np.log operation
price_predictions_train_normal = np.exp(price_predictions_train)
cars_price_train_normal = np.exp(cars_price_train)

# MSE between target values (i.e known) and predicted values
lin_mse = mean_squared_error(cars_price_train_normal, price_predictions_train_normal)
lin_rmse = np.sqrt(lin_mse)
lin_rmse 

2211.9047553644486

In [43]:
print(price_predictions_train_normal[25670:25680])
print('\n')
print(list(cars_price_train_normal[25670:25680]))

[14061.76521865 19683.06657528 16013.37336972 14871.34843627
 39137.87336305 13185.70223213 16824.92418112 13903.47707999
 19431.53122588 27099.26618374]


[15275.00000000001, 20495.99999999998, 15190.00000000001, 16995.00000000001, 37844.99999999997, 13499.999999999996, 17994.999999999993, 12500.0, 17499.00000000001, 24491.000000000007]


In [44]:
# Prediction on test set
price_predictions_test = forest_reg_model.predict(cars_test)

# Reversing np.log operation
price_predictions_test_normal = np.exp(price_predictions_test)
cars_price_test_normal = np.exp(cars_price_test)

final_mse = mean_squared_error(cars_price_test_normal, price_predictions_test_normal)
final_rmse = np.sqrt(final_mse)

final_rmse

2765.368980973027

In [45]:
print(price_predictions_test_normal[1870:1880]) #predictions on test set
print('\n')
print(list(cars_price_test_normal[1870:1880])) #known values in test set

[65023.49841155 15408.42232228 12410.79521011 24387.10014802
 23735.18718594 19797.47738623 28375.66386148 16670.57015332
 16356.11536583 15367.89181753]


[58950.00000000004, 13997.999999999995, 10775.00000000001, 22999.999999999993, 28854.99999999999, 17585.000000000007, 28921.00000000002, 17294.0, 18991.000000000015, 16162.000000000004]


In [46]:
# R^2 regression score between hold out prices and predicted prices
r2_score(cars_price_test_normal, price_predictions_test_normal, multioutput='variance_weighted') 

0.9285310583604977

## Cross Validation 

In [47]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

### Linear Regression

In [48]:
# Cross val score on training set

train_scores = cross_val_score(lin_reg_model, ohe_cars_train, np.exp(cars_price_train),
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-train_scores)

display_scores(tree_rmse_scores)

Scores: [2511.16403533 2507.64383075 2503.00190225 2570.6978191  2837.54811306
 2508.55138873 2665.38489349 2564.91373215 2658.98436036 2585.36184036]
Mean: 2591.3251915586393
Standard deviation: 99.96390942170886


### Decision Tree Regression

In [49]:
# Cross val score on training set

train_scores = cross_val_score(tree_reg_model, cars_train, np.exp(cars_price_train),
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-train_scores)

display_scores(tree_rmse_scores)

Scores: [2696.87595687 3037.32820464 2897.5985634  3427.60381124 3341.84638881
 2854.09935897 3167.30626662 2929.34425399 3533.94756845 2920.27513081]
Mean: 3080.6225503790783
Standard deviation: 261.57108597970387


### Random Forest Regression

In [50]:
from sklearn.model_selection import KFold

# Cross val score on training set, although we already used grid search CV

train_scores = cross_val_score(forest_reg_model, cars_train, np.exp(cars_price_train),
                         scoring="neg_mean_squared_error", cv=KFold(10, shuffle=True))
forest_rmse_scores = np.sqrt(-train_scores)

display_scores(forest_rmse_scores)

Scores: [2566.89817077 2553.6659989  2566.35606121 2884.85336817 2480.34922272
 2854.26509875 2664.94947824 2534.50524374 2555.71775399 2739.76393727]
Mean: 2640.132433377814
Standard deviation: 133.55887349211537
