# Car Price Prediction Case Study

###### Kaggle dataset link - https://www.kaggle.com/nehalbirla/vehicle-dataset-from-cardekho

In [None]:
#Import important libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV
from sklearn.ensemble import ExtraTreesRegressor,RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

sns.set_style('whitegrid')

In [None]:
# Read the vehicle listing CSV into a DataFrame
data = pd.read_csv('../input/vehicle-dataset-from-cardekho/car data.csv')
# Report (rows, columns), then preview the first two records
print(data.shape)
data.head(n=2)

In [None]:
# Count the rows that exactly repeat an earlier row, then remove them in place
duplicate_mask = data.duplicated()
print(duplicate_mask.sum())
data.drop_duplicates(inplace=True)

    There were 2 duplicate records which are now removed from the dataset

In [None]:
# Draw the boolean null mask as a heatmap; a missing cell would show up
# in a contrasting colour.
plt.figure(figsize=(8, 3))
missing_mask = data.isnull()
sns.heatmap(missing_mask, yticklabels=False, cmap='viridis', cbar=False)

In [None]:
# Group the column names by the kind of variable they hold
Nominal_cat_var = [
    'Car_Name',
    'Fuel_Type',
    'Seller_Type',
    'Transmission',
]
Ordinal_cat_var = ['Owner']
Num_var = [
    'Selling_Price',
    'Present_Price',
    'Kms_Driven',
]
Date_colum = ['Year']

## Data Exploration

In [None]:
# Histogram of each numerical column, one figure per column
for numeric_col in Num_var:
    column_values = data[numeric_col]
    plt.hist(column_values)
    plt.xlabel(numeric_col)
    plt.ylabel('Count')
    plt.show()

    All the numerical variables are right skewed

In [None]:
# Distribution of all the categorical variables: one bar chart of category
# counts per nominal column.
for column in Nominal_cat_var:
    # Pass the Series through the `x` keyword. Recent seaborn releases accept
    # only `data` positionally, so a positional Series is no longer read as
    # the variable to count; the keyword form works on old and new versions.
    sns.countplot(x=data[column])
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.show()

    Note:
    
    1) We will drop the column car name, as it has many options.
    2) we will convert columns - 'Fuel_Type','Seller_Type','Transmission' using One hot encoding

In [None]:
# Analysing date column: number of listings per manufacturing year
plt.figure(figsize=(12, 3))
# Use the `x` keyword: recent seaborn releases accept only `data` positionally,
# so a positional Series is no longer read as the variable to count.
sns.countplot(x=data['Year'])

      We will convert the Year column into a numerical age feature (2020 - Year), since a car's value depreciates as it gets older

## Feature Engineering

In [None]:
# Derive each car's age in years relative to the fixed reference year 2020
# (Series.rsub(2020) computes 2020 - Year)
data['No_of_year'] = data['Year'].rsub(2020)

In [None]:
# One hot encoding of the nominal columns
Nominal = ['Fuel_Type', 'Seller_Type', 'Transmission']
data = pd.get_dummies(
    data=data,
    columns=Nominal,
    drop_first=True,  # drop the first level of each column
)

In [None]:
# Remove the columns that are not used as model inputs
drop_var = ['Car_Name', 'Year']
data.drop(columns=drop_var, inplace=True)

In [None]:
# Preview the first two rows after feature engineering
data.head(n=2)

In [None]:
# Pairwise correlation matrix of the columns, drawn as an annotated heatmap
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='viridis')

## Feature Importance

Feature importance scores play an important role in a predictive modeling project, including providing insight into the data,
insight into the model, and the basis for dimensionality reduction and feature selection that can improve the efficiency and 
effectiveness of a predictive model on the problem.

Feature importance scores can provide insight into the dataset. The relative scores can highlight which features may be most 
relevant to the target, and the converse, which features are the least relevant. This may be interpreted by a domain expert 
and could be used as the basis for gathering more or different data.

In [None]:
# Splitting data into dependent and independent variables.
# The target is selected by name rather than by position, so the split stays
# correct even if the column order of `data` changes.
X = data.drop(columns=['Selling_Price'])
y = data['Selling_Price']

In [None]:
# Fit an ExtraTreesRegressor on the full feature matrix so that its
# feature importances can be inspected
Imp_reg = ExtraTreesRegressor()
Imp_reg.fit(X=X, y=y)

In [None]:
# Importance of every feature in the fitted extra-trees model, highest first
Importnt_feat = pd.Series(data=Imp_reg.feature_importances_, index=X.columns)
Importnt_feat = Importnt_feat.sort_values(ascending=False)
print(Importnt_feat)

In [None]:
# Horizontal bar chart of the feature importances
Importnt_feat.plot.barh()

## Cross Validation

In Machine Learning there are several ways to split your data into training and test sets in order to determine how a
model performs on them, making sure that our model performs well no matter how the data is partitioned.
Suppose we have a model with one or more unknown parameters, and a data set to which the model can be fit 
(the training data set). The fitting process optimizes the model parameters to make the model fit the training data 
as well as possible. If we then take an independent sample of validation data from the same population as the training 
data, it will generally turn out that the model does not fit the validation data as well as it fits the training data. 
The size of this difference is likely to be large especially when the size of the training data set is small, or when the
number of parameters in the model is large. Cross-validation is a way to estimate the size of this effect.


Please note that cross-validation is a validation technique. It is not a technique for splitting the data into train and test
sets to build the model. It is suggested to run this validation across multiple models to finally choose one. Generally, a train
and test split divides the data based on a random state, which may lead to a different score for each split. Cross-validation helps us
understand the variation in the score. The model with the smallest deviation will be selected, as it is more stable.

In [None]:
# Compare the candidate regressors with 5-fold cross-validation, scored by
# negated mean squared error (closer to zero is better).
models = [LinearRegression(), RandomForestRegressor()]
# NOTE: despite its name, `mean` keeps the full array of per-fold scores for
# each model, because the plotting cell further down draws the distribution
# of `mean[i]`.
mean = []
# Standard deviation of the per-fold scores of each model (spread across folds).
std = []

for model in models:
    fold_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
    mean.append(fold_scores)
    # Previously the raw scores were appended here a second time, so `std`
    # was just a copy of `mean`.
    std.append(fold_scores.std())

In [None]:
classifiers = ['Linear Regression', 'Random Forest']

# One filled density curve of the per-fold scores for each model.
# `sns.kdeplot` replaces the deprecated `sns.distplot(hist=False)`; `fill`
# needs seaborn >= 0.11. Passing `label=` ties every legend entry to its own
# curve instead of relying on the order of the plotting calls.
for model_name, fold_scores in zip(classifiers, mean):
    sns.kdeplot(x=fold_scores, fill=True, label=model_name)
# The models are regressors scored with negated MSE, not classifiers scored
# with accuracy, so the title names what is actually plotted.
plt.title("Distribution of each model's cross-validation score", fontsize=15)
plt.legend()
plt.xlabel("Neg_mean_squared_error", labelpad=20)
plt.yticks([])
    
    

## Model Building

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Linear Regression

In [None]:
# Linear regression fitted on the training split
Lr_reg = LinearRegression()
Lr_reg.fit(X=X_train, y=y_train)

In [None]:
# Metrics on the training split
pred_lr_train = Lr_reg.predict(X_train)
train_r2 = r2_score(y_train, pred_lr_train)
train_mae = mean_absolute_error(y_train, pred_lr_train)
train_mse = mean_squared_error(y_train, pred_lr_train)
train_rmse = np.sqrt(train_mse)  # RMSE is the square root of the MSE
print('The r2_score for linear regression is {}'.format(train_r2))
print('The MAE for linear regression is {}'.format(train_mae))
print('The MSE for linear regression is {}'.format(train_mse))
print('The RMSE for linear regression is {}'.format(train_rmse))

In [None]:
# Metrics on the held-out test split
pred_lr = Lr_reg.predict(X_test)
test_r2 = r2_score(y_test, pred_lr)
test_mae = mean_absolute_error(y_test, pred_lr)
test_mse = mean_squared_error(y_test, pred_lr)
test_rmse = np.sqrt(test_mse)  # RMSE is the square root of the MSE
print('The r2_score for linear regression is {}'.format(test_r2))
print('The MAE for linear regression is {}'.format(test_mae))
print('The MSE for linear regression is {}'.format(test_mse))
print('The RMSE for linear regression is {}'.format(test_rmse))

In [None]:
# Table of the fitted linear-regression coefficients, one row per feature,
# shown with the largest coefficient first
Coeff_table = pd.DataFrame({'Coeff': Lr_reg.coef_}, index=X.columns)
Coeff_table.sort_values(by='Coeff', ascending=False)

### Ridge Regression (L2) (Just for learning purpose)

In [None]:
from sklearn.linear_model import Ridge

rid = Ridge()
# Candidate regularisation strengths 0.01, 0.02, ..., 0.99.
# NOTE(review): the previous list [0, 1, 0.01] held only three values, fewer
# than n_iter=10, and reads like (start, stop, step) arguments that were meant
# to generate a range - confirm the intended grid. alpha=0 is left out because
# it removes the penalty entirely (plain least squares).
params = {'alpha': [step / 100 for step in range(1, 100)]}
# Sample 10 candidates and score each with 5-fold cross-validation on negated
# MSE; random_state makes the sampled candidates repeatable.
result = RandomizedSearchCV(
    rid,
    params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=1,
    n_iter=10,
    random_state=42,
)
result.fit(X, y)

In [None]:
# Best parameter setting found by the search, then its cross-validated score
print(result.best_params_, result.best_score_, sep='\n')

### Random Forest Regressor

In [None]:
 #Randomized Search CV

# Number of trees in random forest: 100, 200, ..., 1200
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
# Number of features to consider at every split.
# 1.0 means "use all features", which is what the removed 'auto' option
# meant for regressors; 'auto' is rejected by current scikit-learn releases,
# while a float fraction is accepted by old and new versions alike.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree: 5, 10, ..., 30
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Collect the candidate values into the search space, keyed by the
# RandomForestRegressor parameter each list belongs to
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune: a RandomForestRegressor with default
# settings, which is handed to RandomizedSearchCV as its estimator
rf = RandomForestRegressor()

In [None]:
# Randomised search over `random_grid`: 10 sampled parameter settings, each
# scored with 5-fold cross-validation on negated MSE

rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [None]:
# Tune on the training split only. Searching on the full X, y would let the
# rows of X_test influence the chosen hyperparameters, and the test-set
# metrics reported afterwards would then be optimistically biased.
grid_result = rf_random.fit(X_train, y_train)

In [None]:
# Show the best cross-validated score together with the parameters that produced it
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

In [None]:
# Build the final regressor from the hyperparameters the randomised search
# selected, instead of hard-coding values copied from one earlier run (those
# values also included max_features='auto', which current scikit-learn rejects).
rf_best = RandomForestRegressor(**grid_result.best_params_)

In [None]:
# Fit the tuned forest on the training split and score it on the held-out test split
rf_best.fit(X_train, y_train)
pred_rf = rf_best.predict(X_test)
rf_r2 = r2_score(y_test, pred_rf)
rf_mae = mean_absolute_error(y_test, pred_rf)
rf_mse = mean_squared_error(y_test, pred_rf)
rf_rmse = np.sqrt(rf_mse)  # RMSE is the square root of the MSE
print('The r2_score for random forest regression is {}'.format(rf_r2))
print('The MAE for random forest regression is {}'.format(rf_mae))
print('The MSE for random forest regression is {}'.format(rf_mse))
print('The RMSE for random forest regression is {}'.format(rf_rmse))