# Used Car Price Prediction

- Predict used car price by various regression models

- Regression Models:
  - Linear Regression
  - Multivariate Adaptive Regression Splines
  - Decision Tree Regressor
  - XGBoost Regressor
  - Deep Neural Network

- Performance Metrics:
  - R-Squared
  - Mean Absolute Error

# 1. Import Libraries and Load Dataset

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
# Apply seaborn's plot styling to the figures created below
sns.set(color_codes=True)

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from pyearth import Earth
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.simplefilter('ignore')

In [None]:
# Load the CarDekho used-car listing into a dataframe
df = pd.read_csv("/kaggle/input/vehicle-dataset-from-cardekho/car data.csv")
# Report (rows, columns) and preview the first rows
print("Shape of Dataset:", df.shape)
df.head()

# 2. Data Exploration and Preprocessing

In [None]:
# Summarise the columns (dtypes and non-null counts) to check for missing values
df.info()

There are no missing values in the dataset.

In [None]:
# Descriptive statistics of the dataframe's columns
df.describe()

## 2.1. Remove Outliers in Target Variable

In [None]:
# Plot the distribution of the selling price, the target variable of this notebook
sns.distplot(df['Selling_Price'],color="blue")

The distribution is right-skewed. Let's remove outliers using the IQR as the criterion.

In [None]:
# Interquartile range (IQR) of the selling price: third quartile minus first quartile
selling_price = df['Selling_Price']
Q1, Q3 = selling_price.quantile(0.25), selling_price.quantile(0.75)
IQR = Q3 - Q1
print(IQR)

In [None]:
# Drop rows whose selling price lies more than 1.5 x IQR outside the quartiles
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
is_outlier = (df['Selling_Price'] < lower_bound) | (df['Selling_Price'] > upper_bound)
df = df[~is_outlier]
df.shape

In [None]:
# Plot the selling-price distribution again, now that the outliers have been removed
sns.distplot(df['Selling_Price'], color="blue")

## 2.2. Exploration of Categorical Variables

In [None]:
# Print every distinct car model name found in the data
print(list(df['Car_Name'].unique()))

In [None]:
# Show the frequency of each car model.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for index, value in df['Car_Name'].value_counts().items():
    print(index, ': ', value)

Since the dataset contains a great many car models, let's ignore the car model in prediction.

In [None]:
# Show the value counts of fuel type.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for index, value in df['Fuel_Type'].value_counts().items():
    print(index, ': ', value)

In [None]:
# Show the value counts of seller type.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for index, value in df['Seller_Type'].value_counts().items():
    print(index, ': ', value)

In [None]:
# Show the value counts of transmission.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for index, value in df['Transmission'].value_counts().items():
    print(index, ': ', value)

In [None]:
# Show the value counts of the number of previous owners.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for index, value in df['Owner'].value_counts().items():
    print(index, ': ', value)

In [None]:
# Swarm plots of selling price against each categorical feature, laid out on a 2x2 grid
fig = plt.figure(figsize=(10,7))
categorical_features = ['Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']
for position, feature in enumerate(categorical_features, start=1):
    plt.subplot(2, 2, position)
    sns.swarmplot(x=feature, y='Selling_Price', data=df, palette="winter")
plt.tight_layout()
plt.show()

## 2.3. Exploration of Continuous Variables

In [None]:
# Names of the continuous columns (the target, Selling_Price, is included)
cont = ["Selling_Price", "Present_Price", "Year", "Kms_Driven"]

# Sub-frame restricted to those columns
df_cont = df.loc[:, cont]

In [None]:
# Correlation heatmap of the continuous variables

# Pairwise correlations
corr = df_cont.corr()

# Mask out the diagonal and everything above it so only the lower triangle is drawn
mask = np.triu(np.ones_like(corr, dtype=bool))

# Figure and axes that the heatmap is drawn on
f, ax = plt.subplots(figsize=(4, 3))

# Lower-triangle heatmap with square cells and a shrunken colour bar
sns.heatmap(
    corr,
    mask=mask,
    cmap="winter",
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=False,
)

In [None]:
# Visualize the regression between each continuous feature and the output variable

# Number of variables to take from the correlation ranking (Selling_Price included)
k = 4
fig = plt.figure(figsize=(16,6))
# Correlations between the continuous variables
corrmat = df_cont.corr()
# The k variables most correlated with Selling_Price, in descending order of coefficient
cols = corrmat.nlargest(k, "Selling_Price")["Selling_Price"].index
# Position 0 is expected to be Selling_Price itself (self-correlation), so it is
# skipped; the remaining k - 1 variables each get one panel.
for i in range(1, k):
    regline = df_cont[cols[i]]
    # The grid width follows k instead of being hard-coded, and the axes is passed
    # explicitly so each plot lands on the panel created for it.
    ax = fig.add_subplot(1, k - 1, i)
    sns.regplot(x=regline, y=df['Selling_Price'], ax=ax,
                scatter_kws={"color": "royalblue", "s": 3},
                line_kws={"color": "turquoise"})
plt.tight_layout()
plt.show()

## 2.4. Data Preparation for Modeling

In [None]:
# Features: every column except the car name and the target; target: the selling price
X = df.drop(columns=['Car_Name', 'Selling_Price'])
y = df['Selling_Price']

In [None]:
# One-hot encode the categorical features

# Columns stored with object dtype are treated as categorical
cars_categorical = X.select_dtypes(include=['object'])
# Dummy columns, with the first level of each variable dropped
cars_dummies = pd.get_dummies(cars_categorical, drop_first=True)
# Swap the original categorical columns for their dummy encoding
X = pd.concat([X.drop(columns=list(cars_categorical.columns)), cars_dummies], axis=1)

In [None]:
# Scale the features

# scale() returns a bare NumPy array, so the column names are kept here and the
# dataframe is rebuilt with them afterwards.
cols = X.columns

# Rebuild the dataframe with the original column names AND the original row index.
# Keeping the index keeps X label-aligned with y, whose index still carries the row
# labels left over after the outlier filter.
# NOTE(review): the scaling statistics are computed over all rows before the
# train/test split, so test rows influence the scaling of the training data.
X = pd.DataFrame(scale(X), columns=cols, index=X.index)
X.columns

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)

# 3. Regression

## 3.1. Linear Regression

In [None]:
# Fit a linear regression on the training split
lm = LinearRegression().fit(X_train, y_train)

# Predict the held-out prices
y_pred = lm.predict(X_test)

# Score the predictions: R-squared and mean absolute error
lr_r2, lr_mae = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred)

# Report both metrics
print("Linear Regression R2: ", lr_r2)
print("Linear Regression MAE: ", lr_mae)

In [None]:
# Evaluate the model against the assumptions of linear regression:

# Assumption 1. The error terms are normally distributed with mean approximately 0.

# Residuals of the linear model on the test split
residuals = y_test - y_pred
fig = plt.figure()
sns.distplot(residuals, bins=50, color="blue")
fig.suptitle('Error Terms', fontsize=14)
plt.xlabel('y_test-y_pred', fontsize=12)
# The vertical axis of a distribution plot is a density, not an observation index
plt.ylabel('Density', fontsize=12)
plt.show()

The first assumption seems to be met.

In [None]:
# Assumption 2: Homoscedasticity, i.e. the error term (y_test - y_pred) has constant variance.

# Plot the residuals against their position in the test split
c = list(range(len(y_pred)))
fig = plt.figure()
plt.plot(
    c,
    y_test-y_pred,
    color="blue",
    linewidth=2.5,
    linestyle="-",
    alpha=0.4,
)
fig.suptitle('Error Terms', fontsize=14)
plt.xlabel('Index', fontsize=12)
plt.ylabel('ytest-ypred', fontsize=12)
plt.show()

The second assumption seems to be met.

In [None]:
# Assumption 3: Little correlation between the predictors, i.e. no multicollinearity.

predictors = [
    'Year', 'Present_Price', 'Kms_Driven', 'Owner',
    'Fuel_Type_Diesel', 'Fuel_Type_Petrol',
    'Seller_Type_Individual', 'Transmission_Manual',
]

# Pairwise correlations between the predictors
cors = X[predictors].corr()

# Mask out the diagonal and everything above it so only the lower triangle is drawn
mask_2 = np.triu(np.ones_like(cors, dtype=bool))

# Figure and axes that the heatmap is drawn on
f, ax = plt.subplots(figsize=(9, 6))

# Lower-triangle heatmap with square cells and a shrunken colour bar
sns.heatmap(
    cors,
    mask=mask_2,
    cmap="winter",
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=False,
)
plt.show()

Some features are highly correlated, so let's check the multicollinearity using the VIF (variance inflation factor).

In [None]:
# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Tabulate the variance inflation factor (VIF) of every feature, largest first
train_matrix = X_train.values
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(train_matrix, i)
            for i in range(train_matrix.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

The fuel-type dummy variables show the highest VIF, so let's drop one of them (`Fuel_Type_Diesel`).

In [None]:
# Remove the Fuel_Type_Diesel dummy column from the feature matrix
X = X.drop(columns=['Fuel_Type_Diesel'])

In [None]:
# Redo the 70/30 split on the reduced feature set, using the same seed as before
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)

In [None]:
# Refit the linear regression on the current training split
lm = LinearRegression().fit(X_train, y_train)

# Predict the held-out prices
y_pred = lm.predict(X_test)

# Score the predictions: R-squared and mean absolute error
lr_r2, lr_mae = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred)

# Report both metrics
print("Linear Regression R2: ", lr_r2)
print("Linear Regression MAE: ", lr_mae)

In [None]:
# Recompute the VIF table for the current feature set, largest first
train_matrix = X_train.values
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(train_matrix, i)
            for i in range(train_matrix.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

## 3.2. Multivariate Adaptive Regression Splines (MARS)

In [None]:
# MARS model with its default settings: no hyperparameters are set here,
# the algorithm chooses its own basis functions.
mars_model = Earth()

# Train on the training split
mars_model.fit(X_train, y_train)

# Predict the held-out prices
mars_y_pred = mars_model.predict(X_test)

# Score the predictions: R-squared and mean absolute error
mars_r2, mars_mae = (
    r2_score(y_test, mars_y_pred),
    mean_absolute_error(y_test, mars_y_pred),
)

# Report both metrics
print("MARS R2: ", mars_r2)
print("MARS MAE: ", mars_mae)

## 3.3. Decision Tree Regression

In [None]:
# Initiate the model
dt_model = DecisionTreeRegressor()

# Grid search over tree depth and the minimum number of samples needed to split a node.
# min_samples_split starts at 2: an integer value of 1 is not a valid setting.
dt_gs = GridSearchCV(dt_model,
                     param_grid = {'max_depth': range(1, 11),
                                   'min_samples_split': range(2, 10)},
                     cv=5,
                     n_jobs=1,
                     scoring='neg_mean_squared_error')

dt_gs.fit(X_train, y_train)

# The scorer is the negated MSE, so the sign is flipped to print the MSE itself
print(dt_gs.best_params_)
print(-dt_gs.best_score_)

In [None]:
# Initiate the best model from the hyperparameters selected by the grid search,
# rather than hard-coded values that can drift from the search result
dt_model_best = DecisionTreeRegressor(**dt_gs.best_params_)

# Fit the best model
dt_model_best.fit(X_train, y_train)

In [None]:
# Predict the held-out prices with the tuned tree
dt_y_pred = dt_model_best.predict(X_test)

# Score the predictions: R-squared and mean absolute error
dt_r2, dt_mae = r2_score(y_test, dt_y_pred), mean_absolute_error(y_test, dt_y_pred)

# Report both metrics
print("DT R2: ", dt_r2)
print("DT MAE: ", dt_mae)

## 3.4. XGB Regression

In [None]:
# Initiate the model
xgb_model = xgb.XGBRegressor()

# Grid search over tree depth and minimum child weight.
# min_child_weight replaces min_samples_split, which is a scikit-learn tree
# parameter and not an XGBoost one, so searching over it only repeated the
# same fits.
xgb_gs = GridSearchCV(xgb_model,
                      param_grid = {'max_depth': range(8, 15),
                                    'min_child_weight': range(1, 8, 3)},
                      cv=5,
                      n_jobs=1,
                      scoring='neg_mean_squared_error')

xgb_gs.fit(X_train, y_train)

# The scorer is the negated MSE, so the sign is flipped to print the MSE itself
print(xgb_gs.best_params_)
print(-xgb_gs.best_score_)

In [None]:
# Initiate the best model from the hyperparameters selected by the grid search,
# rather than hard-coded values that can drift from the search result
xgb_model_best = xgb.XGBRegressor(**xgb_gs.best_params_)

# Fit the best model
xgb_bst = xgb_model_best.fit(X_train, y_train)

In [None]:
# Predict the held-out prices with the tuned booster
xgb_y_pred = xgb_bst.predict(X_test)

# Score the predictions: R-squared and mean absolute error
xgb_r2, xgb_mae = r2_score(y_test, xgb_y_pred), mean_absolute_error(y_test, xgb_y_pred)

# Report both metrics
print("XGB R2: ", xgb_r2)
print("XGB MAE: ", xgb_mae)

## 3.5. Deep Neural Network

In [None]:
# Define a DNN
def create_model(optimizer='adam'):
    """Build and compile the feed-forward network used for price regression.

    The first hidden layer has one unit per column of the global ``X_train``,
    followed by a 16-unit hidden layer and a single linear output unit.

    Args:
        optimizer: Optimizer passed to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model, with mean squared error as its loss.
    """
    n_features = X_train.shape[1]
    model = Sequential([
        Dense(n_features, input_dim=n_features, kernel_initializer='normal', activation='relu'),
        Dense(16, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    ])
    model.compile(loss='mean_squared_error', optimizer=optimizer)
    return model

In [None]:
# Wrap the network builder in a scikit-learn style regressor
# (10000 epochs, mini-batches of 20 rows)
dnn = KerasRegressor(build_fn=create_model, epochs=10000, batch_size=20, verbose=1)

# Train on the training split and keep the returned training history
dnn_history = dnn.fit(X_train, y_train)

In [None]:
# Visualize the DNN learning
loss_train = dnn_history.history['loss']
# Derive the epoch axis from the recorded history so the plot stays valid
# if the number of training epochs is changed
epochs = range(1, len(loss_train) + 1)
plt.figure(figsize=(8,6))
plt.plot(epochs, loss_train, 'royalblue', label='Training loss', linewidth=3)
plt.title('Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Predict the held-out prices with the trained network
dnn_y_pred = dnn.predict(X_test)

# Score the predictions: R-squared and mean absolute error
dnn_r2, dnn_mae = r2_score(y_test, dnn_y_pred), mean_absolute_error(y_test, dnn_y_pred)

# Report both metrics
print("DNN R2: ", dnn_r2)
print("DNN MAE: ", dnn_mae)

# 4. Summary of Results

In [None]:
# Collect the test-set R2 and MAE of every model in one table.
# Each metric is already a single number, so it is used directly
# instead of being wrapped in np.mean.
results_table = pd.DataFrame([[lr_r2, lr_mae],
                              [mars_r2, mars_mae],
                              [dt_r2, dt_mae],
                              [xgb_r2, xgb_mae],
                              [dnn_r2, dnn_mae]],
                             columns=['R2', 'MAE'],
                             index=["Linear Regression","MARS","Decision Tree","XGBoost","DNN"])
# Display the table with three decimal places
pd.options.display.precision = 3
results_table

In [None]:
# Test-set predictions of every model next to the actual selling price
predicted_prices = {
    "Linear Regression: Predicted Price": y_pred,
    "MARS: Predicted Price": mars_y_pred,
    "Decision Tree: Predicted Price": dt_y_pred,
    "XGBoost: Predicted Price": xgb_y_pred,
    "DNN: Predicted Price": dnn_y_pred,
    "Actual Price": y_test,
}
pred_table = pd.DataFrame(predicted_prices)

In [None]:
# Plot predicted against actual price, one panel per model, on a 3x2 grid
fig = plt.figure(figsize=(10,10))
prediction_columns = [
    'Linear Regression: Predicted Price',
    'MARS: Predicted Price',
    'Decision Tree: Predicted Price',
    'XGBoost: Predicted Price',
    'DNN: Predicted Price',
]
for position, column in enumerate(prediction_columns, start=1):
    plt.subplot(3, 2, position)
    sns.regplot(x=column, y='Actual Price', data=pred_table,
                color='royalblue', scatter_kws={"s": 5},
                line_kws={"color": "turquoise"})
plt.tight_layout()
plt.show()

# 5. Feature Importance

Since DNN achieved the highest performance, show the feature importance of DNN.

In [None]:
# Import libraries
import eli5
from eli5.sklearn import PermutationImportance

# Show permutation importance of the fitted DNN
# NOTE(review): the importance is computed on the full X, y, which includes the rows
# the DNN was trained on; consider X_test, y_test for a held-out estimate.
perm = PermutationImportance(dnn, random_state=1).fit(X,y)
# Label the weights with the feature names of X
eli5.show_weights(perm, feature_names = X.columns.tolist())