# MODEL BUILDING for CarDekho Data

In [1]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

Loading from EDA pickle file

In [2]:
EDAFile=pickle.load(open('EDA_CarDekho.pkl','rb'))


In [3]:
EDAFile

Unnamed: 0,selling_price,km_driven,owner,mileage,engine,max_power,seats,model,year_old,transmission,Seller_Dealer,Seller_Individual,Seller_Trustmark Dealer,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol
0,450000,145500,1,23.40,1248,74.00,5.0,Swift,8,1,0,1,0,0,1,0,0
1,370000,120000,2,21.14,1498,103.52,5.0,Rapid,8,1,0,1,0,0,1,0,0
2,158000,140000,3,17.70,1497,78.00,5.0,City,16,1,0,1,0,0,0,0,1
3,225000,127000,1,23.00,1396,90.00,5.0,i20,12,1,0,1,0,0,1,0,0
4,130000,120000,1,16.10,1298,88.20,5.0,Swift,15,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8121,260000,50000,2,18.90,998,67.10,5.0,Wagon,9,1,0,1,0,0,0,0,1
8122,475000,80000,2,22.54,1396,88.73,5.0,i20,8,1,0,1,0,0,1,0,0
8123,320000,110000,1,18.50,1197,82.85,5.0,i20,9,1,0,1,0,0,0,0,1
8124,135000,119000,4,16.80,1493,110.00,5.0,Verna,15,1,0,1,0,0,1,0,0


Seperating Dependant and Independent variables

In [4]:
x =  EDAFile[['km_driven', 'owner', 'mileage', 'engine', 'max_power','seats', 'year_old', 
            'transmission', 'Seller_Dealer', 'Seller_Individual', 'Seller_Trustmark Dealer', 
            'fuel_CNG', 'fuel_Diesel', 'fuel_LPG', 'fuel_Petrol']]


In [5]:
y=EDAFile['selling_price']

In [6]:
x

Unnamed: 0,km_driven,owner,mileage,engine,max_power,seats,year_old,transmission,Seller_Dealer,Seller_Individual,Seller_Trustmark Dealer,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol
0,145500,1,23.40,1248,74.00,5.0,8,1,0,1,0,0,1,0,0
1,120000,2,21.14,1498,103.52,5.0,8,1,0,1,0,0,1,0,0
2,140000,3,17.70,1497,78.00,5.0,16,1,0,1,0,0,0,0,1
3,127000,1,23.00,1396,90.00,5.0,12,1,0,1,0,0,1,0,0
4,120000,1,16.10,1298,88.20,5.0,15,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8121,50000,2,18.90,998,67.10,5.0,9,1,0,1,0,0,0,0,1
8122,80000,2,22.54,1396,88.73,5.0,8,1,0,1,0,0,1,0,0
8123,110000,1,18.50,1197,82.85,5.0,9,1,0,1,0,0,0,0,1
8124,119000,4,16.80,1493,110.00,5.0,15,1,0,1,0,0,1,0,0


In [7]:
y

0       450000
1       370000
2       158000
3       225000
4       130000
         ...  
8121    260000
8122    475000
8123    320000
8124    135000
8125    382000
Name: selling_price, Length: 6683, dtype: int64

# Buliding Regression Model

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.2,random_state=40)

In [9]:
X_train.shape,X_test.shape, y_train.shape, y_test.shape

((5346, 15), (1337, 15), (5346,), (1337,))

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [11]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Define two functions to test which regressor will give accurate score

In [12]:
def do_prediction(classifier):
    
    # training the classifier on the dataset
    classifier.fit(X_train, y_train)
    
    #Do prediction and evaluting the prediction
    prediction = classifier.predict(X_test)
    cross_validation_score = cross_val(X_train, y_train, classifier)
    error = mean_absolute_error(y_test, prediction)
    return error, cross_validation_score

def cross_val(xtrain, ytrain, classifier):
    
    # Applying k-Fold Cross Validation
    accuracies = cross_val_score(estimator = classifier, X = xtrain, y = ytrain, cv = 5)
    return accuracies.mean()


# LinearRegression Model

In [13]:
model_1 = LinearRegression()
error, score = do_prediction(model_1)

print('Linear Regression MAE: {}'.format(round(error,2)))
print('Cross validation score: {}'.format(round(score,2)))

Linear Regression MAE: 185424.25
Cross validation score: 0.61


In [14]:
model_1.fit(X_train, y_train)

LinearRegression()

In [15]:
mean_absolute_error(y_train, model_1.predict(X_train))

172791.67840752038

In [16]:
mean_absolute_error(y_test, model_1.predict(X_test))

185424.25059476495

In [17]:
model_1.coef_

array([ -35408.27303509,   -6014.09079822,   30188.55678239,
         37209.25380234,  275928.24099273,    3135.4254743 ,
       -131052.87012794,  -87389.92352889,   19304.16986267,
        -17316.59636817,   -7561.43921667,    -634.5333342 ,
          9006.09618671,    9144.45924412,  -10270.86527251])

# DecisionTreeRegressor Model

In [19]:
model_2 = DecisionTreeRegressor()
error, score = do_prediction(model_2)

print('Decision Tree Regressor MAE: {}'.format(round(error,2)))
print('Cross validation score: {}'.format(round(score,2)))

TypeError: 'DecisionTreeRegressor' object is not callable

# RandomForestRegressor Model

In [20]:
model_3 = RandomForestRegressor()
error, score = do_prediction(model_3)

print('Random Forest Regressor MAE: {}'.format(round(error,2)))
print('Cross validation score: {}'.format(round(score,2)))

Random Forest Regressor MAE: 82151.77
Cross validation score: 0.88


So from the above observations RandomForestRegressor model is the best model with good score.
So using RandomForestRegressor is best for prediction of our car data.



# BestModel: RandomForestRegressor

In [21]:
best_model = RandomForestRegressor(n_estimators=300,
                                  min_samples_split=10,
                                  min_samples_leaf=1,
                                  max_features='sqrt',
                                  max_depth=60,
                                  bootstrap=False)

error, score = do_prediction(best_model)
print('Random Forest with hyperparameter tuning MAE: {}'.format(round(error,2)))
print('Cross validation score: {}'.format(round(score,2)))

Random Forest with hyperparameter tuning MAE: 86945.62
Cross validation score: 0.85


In [22]:
best_model

RandomForestRegressor(bootstrap=False, max_depth=60, max_features='sqrt',
                      min_samples_split=10, n_estimators=300)

In [23]:
pickle.dump(best_model, open('rfr_model.pkl', 'wb'))

In [33]:
y_pred=best_model.predict(X_test)

In [34]:
# compare the actual output values for X_test with the predicted values

car = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
car.reset_index(inplace=True,drop=True)
car

Unnamed: 0,Actual,Predicted
0,540000,578011.069331
1,800000,687886.054955
2,631000,627431.870298
3,100000,123143.099708
4,390000,460852.690708
...,...,...
1332,350000,384892.460286
1333,180000,185438.726787
1334,155000,116039.592841
1335,390000,447884.043324
