# Car Price Prediction

## Part 2 - Model Buliding

### Imports

In [23]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns 
sns.set()

import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import confusion_matrix , accuracy_score , roc_curve,classification_report , auc , recall_score

from sklearn.model_selection import cross_val_score as cvs
from sklearn.model_selection import GridSearchCV

In [2]:
dataset = pd.read_csv('cars_with_cmp.csv')
# 0name,1manufacturer,2model,3year,4km_driven,5fuel,6transmission,7owner,8mileage_int,9cmp,10selling_price
# X = dataset.iloc[:, [1,2,3,4,5,6,7,8,9]]
X = dataset.iloc[:, [1,3,4,5,6,7,8,9]]
y = dataset.iloc[:, -1]

In [3]:
X.head(1)

Unnamed: 0,manufacturer,year,km_driven,fuel,transmission,owner,mileage_int,cmp
0,Maruti,2014,145500,Diesel,Manual,First Owner,23,690000


### Train Test Split & Feature Scaling

In [4]:
le = LabelEncoder()
for column in X.drop(['year','km_driven', 'mileage_int', 'cmp'],axis=1).columns:
    X[column]=le.fit_transform(X[column])
X 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column]=le.fit_transform(X[column])


Unnamed: 0,manufacturer,year,km_driven,fuel,transmission,owner,mileage_int,cmp
0,11,2014,145500,1,1,0,23,690000
1,15,2014,120000,1,1,2,21,156000
2,7,2010,127000,1,1,0,23,780000
3,7,2017,45000,2,1,0,20,685000
4,17,2011,90000,1,1,0,24,869000
...,...,...,...,...,...,...,...,...
6801,7,2014,80000,1,1,2,23,698000
6802,7,2013,110000,2,1,0,19,698000
6803,11,2009,120000,1,1,0,19,690000
6804,16,2013,25000,1,1,0,24,566000


In [5]:
# X=power_transform(X,method='yeo-johnson')
# scale = StandardScaler()
# X=scale.fit_transform(X)

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Model Building

### Linear Regression

In [7]:
lr= LinearRegression()
lr.fit(X_train,y_train)
lr.coef_
pred_train=lr.predict(X_train)
pred_test=lr.predict(X_test)
print('Linear Regression Score:',lr.score(X_train,y_train))
print('Linear Regression r2_score:',r2_score(y_test,pred_test))
print("Mean squared error of Linear Regression:",mean_squared_error(y_test,pred_test))
print("Root Mean Square error of Linear Regression:",np.sqrt(mean_squared_error(y_test,pred_test)))

Linear Regression Score: 0.5545828469002703
Linear Regression r2_score: 0.5409897940662145
Mean squared error of Linear Regression: 227918075025.99808
Root Mean Square error of Linear Regression: 477407.66125607793


<b>The accuracy of Linear Regression is only 54% which is too less to use this model </b>

### KNN

In [8]:
knr = KNeighborsRegressor()
knr.fit(X_train,y_train)
pred_train_knr=knr.predict(X_train)
pred_test_knr=knr.predict(X_test)
print('K Neighbors Regressor Score:',knr.score(X_train,y_train))
print('K Neighbors Regressor r2_score:',r2_score(y_test,pred_test_knr))
print("Mean squared error of K Neighbors Regressor:",mean_squared_error(y_test,pred_test_knr))
print("Root Mean Square error of K Neighbors Regressor:",np.sqrt(mean_squared_error(y_test,pred_test_knr)))

K Neighbors Regressor Score: 0.8725937174784384
K Neighbors Regressor r2_score: 0.8037958462868734
Mean squared error of K Neighbors Regressor: 97423700929.32468
Root Mean Square error of K Neighbors Regressor: 312127.6997149158


<b>The accuracy of K Neighbors Regressor is 80% which is good.</b>

### Decision Tree

In [20]:
dtr=DecisionTreeRegressor(criterion='mse')
dtr.fit(X_train,y_train)
pred_train_dtr=dtr.predict(X_train)
pred_test_dtr=dtr.predict(X_test)
print('Decision Tree Regressor Score:',dtr.score(X_train,y_train))
print('Decision Tree Regressor r2_score:',r2_score(y_test,pred_test_dtr))
print("Mean squared error of Decision Tree Regressor:",mean_squared_error(y_test,pred_test_dtr))
print("Root Mean Square error of Decision Tree Regressor:",np.sqrt(mean_squared_error(y_test,pred_test_dtr)))


Decision Tree Regressor Score: 0.9994652794959573
Decision Tree Regressor r2_score: 0.9624081097716884
Mean squared error of Decision Tree Regressor: 18665971141.0891
Root Mean Square error of Decision Tree Regressor: 136623.4648261019




<b>The accuracy of Decision Tree Regressor is 96% which is in acceptable range but we will try random forest regressor </b>

### Random Forest Regression

In [10]:
rf=RandomForestRegressor()
rf.fit(X_train,y_train)
pred_train_rf=rf.predict(X_train)
pred_test_rf=rf.predict(X_test)
print('Random Forest Regressor Score:',rf.score(X_train,y_train))
print('Random Forest Regressor r2_score:',r2_score(y_test,pred_test_rf))
print("Mean squared error of Random Forest Regressor:",mean_squared_error(y_test,pred_test_rf))
print("Root Mean Square error of Random Forest Regressor:",np.sqrt(mean_squared_error(y_test,pred_test_rf)))

Random Forest Regressor Score: 0.9944236142257203
Random Forest Regressor r2_score: 0.9742054326883964
Mean squared error of Random Forest Regressor: 12808098930.674566
Root Mean Square error of Random Forest Regressor: 113172.87188489371


<b>The accuracy of Random Forest Regressor is 97.39% which is best among all.

## Cross Validation

In [25]:

print('Cross Validation Score of Linear Regression is',(cvs(lr,X,y,cv=5).mean())*100)
print('Cross Validation Score of KNeighbors Regressor is',(cvs(knr,X,y,cv=5).mean())*100)
print('Cross Validation Score of Decision Tree Regressor is',(cvs(dtr,X,y,cv=5).mean())*100)
print('Cross Validation Score of Random Forest Regressor is',(cvs(rf,X,y,cv=5).mean())*100)

Cross Validation Score of Linear Regression is 54.239029647672844
Cross Validation Score of KNeighbors Regressor is 78.64082931416318




Cross Validation Score of Decision Tree Regressor is 94.14247613695996
Cross Validation Score of Random Forest Regressor is 96.02956169097553


In [12]:
parameter = { 'bootstrap': [True, False],
              'max_features': ['auto', 'sqrt'],
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10],}

gvc = GridSearchCV(RandomForestRegressor(),parameter,cv=5)
gvc.fit(X_train,y_train)
gvc.best_params_

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


{'bootstrap': False,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2}

### Training hyperparamerter tuned model

In [13]:
rf_hpt = RandomForestRegressor(bootstrap=False,min_samples_leaf=1,max_features='sqrt',min_samples_split=2)
rf_hpt.fit(X_train,y_train)
pred=rf_hpt.predict(X_test)
acc=r2_score(y_test,pred)
print('Score of Hyper Parameter Tuned Ranfom Forest Regressor is:',rf_hpt.score(X_train,y_train))
print('Accuracy for predicting price of car is', (acc*100),'%')
print("Mean squared error of Hyper Parameter Tuned Random Forest Regressor:",mean_squared_error(y_test,pred))
print("Root Mean Square error of Hyper Parameter Tuned Random Forest Regressor:",np.sqrt(mean_squared_error(y_test,pred)))

Score of Hyper Parameter Tuned Ranfom Forest Regressor is: 0.9994651541275414
Accuracy for predicting price of car is 97.21279783084901 %
Mean squared error of Hyper Parameter Tuned Random Forest Regressor: 13839643321.412754
Root Mean Square error of Hyper Parameter Tuned Random Forest Regressor: 117642.01341958049


## Predicting 

In [14]:
a= np.array(y_test)
predicted = np.array(rf.predict(X_test))
Price=pd.DataFrame({"Original":a,"Predicted":predicted},index=range(len(a)))
Price

Unnamed: 0,Original,Predicted
0,325000,4.815400e+05
1,700000,5.822667e+05
2,750000,7.500000e+05
3,340000,2.706500e+05
4,450000,4.266924e+05
...,...,...
1357,425000,3.465700e+05
1358,132000,1.598300e+05
1359,391000,4.725900e+05
1360,3800000,3.800000e+06


## Model Saving

In [15]:

# import pickle
# filename = 'car_price_prediction.pkl'
# pickle.dump(rf_hpt,open(filename,'wb'))