In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler,MinMaxScaler
from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score,f1_score,recall_score,confusion_matrix,mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA, KernelPCA
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold,train_test_split,RandomizedSearchCV
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from sklearn.utils import resample
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
import joblib

In [2]:
# Load the raw vehicle dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='cardata.csv')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [4]:
# Count missing values per column.
df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [5]:
# (rows, columns) of the loaded frame.
df.shape

(301, 9)

In [6]:
# Distinct values of the Car_Name column.
df['Car_Name'].unique()

array(['ritz', 'sx4', 'ciaz', 'wagon r', 'swift', 'vitara brezza',
       's cross', 'alto 800', 'ertiga', 'dzire', 'alto k10', 'ignis',
       '800', 'baleno', 'omni', 'fortuner', 'innova', 'corolla altis',
       'etios cross', 'etios g', 'etios liva', 'corolla', 'etios gd',
       'camry', 'land cruiser', 'Royal Enfield Thunder 500',
       'UM Renegade Mojave', 'KTM RC200', 'Bajaj Dominar 400',
       'Royal Enfield Classic 350', 'KTM RC390', 'Hyosung GT250R',
       'Royal Enfield Thunder 350', 'KTM 390 Duke ',
       'Mahindra Mojo XT300', 'Bajaj Pulsar RS200',
       'Royal Enfield Bullet 350', 'Royal Enfield Classic 500',
       'Bajaj Avenger 220', 'Bajaj Avenger 150', 'Honda CB Hornet 160R',
       'Yamaha FZ S V 2.0', 'Yamaha FZ 16', 'TVS Apache RTR 160',
       'Bajaj Pulsar 150', 'Honda CBR 150', 'Hero Extreme',
       'Bajaj Avenger 220 dtsi', 'Bajaj Avenger 150 street',
       'Yamaha FZ  v 2.0', 'Bajaj Pulsar  NS 200', 'Bajaj Pulsar 220 F',
       'TVS Apache RTR 180', 

In [7]:
# Remove the model-name column from the frame in place.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df.drop(columns=['Car_Name'], inplace=True, errors='ignore')
# Latest model year present in the data.
df['Year'].max()

2018

In [8]:
# Vehicle age in years, measured against a fixed reference year.
REFERENCE_YEAR = 2020
df['age'] = REFERENCE_YEAR - df['Year']

In [9]:
# Frequency of each raw Owner value.
df['Owner'].value_counts()

0    290
1     10
3      1
Name: Owner, dtype: int64

In [10]:
# Re-code the raw Owner values (0, 1 and 3 in this data) to 1, 2 and 3.
owner_codes = {0: 1, 1: 2, 3: 3}
# Series.map turns every value that is missing from the dict into NaN, so fail
# loudly instead of corrupting the column. This also catches an accidental
# second run of this cell, where the already re-coded value 2 has no mapping.
unmapped_owners = set(df['Owner'].unique()) - set(owner_codes)
assert not unmapped_owners, "Owner values without a mapping: %s" % sorted(unmapped_owners)
df['Owner'] = df['Owner'].map(owner_codes)

In [11]:
# Re-check the Owner frequencies after the re-coding.
df['Owner'].value_counts()

1    290
2     10
3      1
Name: Owner, dtype: int64

In [12]:
# Store Owner as text so it is treated as a categorical column downstream.
owner_as_text = df['Owner'].astype(str)
df['Owner'] = owner_as_text
# Frequencies should be unchanged by the cast.
df['Owner'].value_counts()

1    290
2     10
3      1
Name: Owner, dtype: int64

In [13]:
# Frequency of each Owner value (the column now holds strings).
df['Owner'].value_counts()

1    290
2     10
3      1
Name: Owner, dtype: int64

In [14]:
# Confirm the dtype of Owner after the string cast.
df['Owner'].dtype

dtype('O')

In [15]:
# Weight each row by (oldest age - own age + 1): the newest vehicles get the
# largest weight and the oldest vehicle gets a weight of 1.
age_weight = df['age'].max() - df['age'] + 1
# Engineered feature: present price scaled by that age weight.
df['AgeAdjustedPrice'] = age_weight * df['Present_Price']

In [16]:
# Correlation of every numeric column with the target, strongest first.
# The frame still holds object columns (Fuel_Type, Seller_Type, Transmission,
# Owner); pandas >= 2.0 raises on those instead of silently skipping them, so
# restrict to numeric columns explicitly. select_dtypes works on old and new
# pandas alike and gives the same result older versions produced implicitly.
numeric_df = df.select_dtypes(include='number')
numeric_df.corr()['Selling_Price'].sort_values(ascending=False)

Selling_Price       1.000000
AgeAdjustedPrice    0.975848
Present_Price       0.878983
Year                0.236141
Kms_Driven          0.029187
age                -0.236141
Name: Selling_Price, dtype: float64

In [17]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,age,AgeAdjustedPrice
count,301.0,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,6.372093,87.517748
std,2.891554,5.082812,8.644115,38886.883882,2.891554,95.669908
min,2003.0,0.1,0.32,500.0,2.0,1.71
25%,2012.0,0.9,1.2,15000.0,4.0,13.5
50%,2014.0,3.6,6.4,32000.0,6.0,70.0
75%,2016.0,6.0,9.9,48767.0,8.0,117.6
max,2018.0,35.0,92.6,500000.0,17.0,740.8


In [18]:
# Per-column dtypes; object columns are the categoricals still to be encoded.
df.dtypes

Year                  int64
Selling_Price       float64
Present_Price       float64
Kms_Driven            int64
Fuel_Type            object
Seller_Type          object
Transmission         object
Owner                object
age                   int64
AgeAdjustedPrice    float64
dtype: object

In [19]:
# Skewness of Selling_Price before any transform is applied.
df['Selling_Price'].skew()

2.493422417797524

In [20]:
# Current column names.
df.columns

Index(['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Fuel_Type',
       'Seller_Type', 'Transmission', 'Owner', 'age', 'AgeAdjustedPrice'],
      dtype='object')

In [21]:
# describe() summarises only the numeric columns of this mixed-dtype frame,
# so its column index is the list of numeric column names.
numeric_summary = df.describe()
cols = numeric_summary.columns

In [22]:
# Pick the numeric columns whose skewness exceeds 1, then replace each of them
# with log(1 + x). Columns are independent, so selecting first and
# transforming afterwards gives the same result as doing both in one pass.
skewed_columns = [name for name in cols if df[name].skew() > 1]
for name in skewed_columns:
    df[name] = np.log1p(df[name])

In [23]:
# Skewness of Selling_Price after the log1p transform.
df['Selling_Price'].skew()

0.115141322922288

In [24]:
# Column names before one-hot encoding.
df.columns

Index(['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Fuel_Type',
       'Seller_Type', 'Transmission', 'Owner', 'age', 'AgeAdjustedPrice'],
      dtype='object')

In [25]:
# One-hot encode the object columns; drop_first removes the first level of
# each categorical so the remaining indicators are not redundant.
df = pd.get_dummies(data=df, drop_first=True)

In [26]:
# Column names after one-hot encoding.
df.columns

Index(['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'age',
       'AgeAdjustedPrice', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol',
       'Seller_Type_Individual', 'Transmission_Manual', 'Owner_2', 'Owner_3'],
      dtype='object')

In [27]:
# Separate the (log1p-transformed) target from the features. `pop` returns the
# column and removes it from `df` in place, which is what the original
# select-then-drop pair did in two steps.
target = df.pop('Selling_Price')
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and every metric computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df, target, test_size=0.2, random_state=42
)


In [28]:
# Preview the training features.
X_train.head()

Unnamed: 0,Year,Present_Price,Kms_Driven,age,AgeAdjustedPrice,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual,Owner_2,Owner_3
130,2017,0.625938,9.305741,1.386294,2.642622,0,1,1,1,0,0
210,2012,1.722767,10.485033,2.197225,3.850148,0,1,0,1,0,0
25,2011,1.790091,9.903538,2.302585,3.826683,0,1,0,1,0,0
36,2015,2.104134,10.79483,1.791759,4.549657,0,1,0,1,0,0
205,2016,1.902108,8.158802,1.609438,4.391977,0,1,0,1,1,0


# Model Training

In [29]:
# Candidate values for the random-forest hyperparameter search.

# Number of trees in the forest: 100, 200, ..., 1200.
n_estimators = list(range(100, 1300, 100))
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for a regressor;
# 'auto' itself is deprecated in scikit-learn 1.1 and removed in 1.3.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 5, 10, ..., 30.
max_depth = list(range(5, 35, 5))
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 5, 10]

In [30]:
# Bundle the candidate lists into the mapping that the randomized search
# samples from; keys are RandomForestRegressor parameter names.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
              


In [31]:
# Base estimator for the hyperparameter search. Bootstrapping and feature
# sub-sampling are random, so pin the seed to make fitted forests repeatable.
model = RandomForestRegressor(random_state=42)

In [32]:
# Randomized search: 10 sampled parameter combinations, each scored with
# 5-fold cross-validation on negative MSE. random_state fixes which
# combinations are sampled so the search is reproducible.
tuned_model = RandomizedSearchCV(
    estimator=model,
    param_distributions=random_grid,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
)


In [33]:
# Run the randomized hyperparameter search on the training split.
tuned_model.fit(X_train,y_train)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [34]:
# Predict on the held-out split. The target was log1p-transformed before
# training, so these values are on the log1p scale of Selling_Price.
prediction = tuned_model.predict(X_test)

In [35]:
# Predictions and y_test are both on the log1p scale; map each back to the
# original price scale once and reuse the result for every metric.
y_test_price = np.expm1(np.asarray(y_test))
predicted_price = np.expm1(prediction)
# sklearn metrics take (y_true, y_pred). MSE and MAE are symmetric, so the
# numbers are unchanged, but the correct order keeps this cell safe if an
# asymmetric metric (e.g. r2_score) is added later.
mse = mean_squared_error(y_test_price, predicted_price)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_price, predicted_price)

In [36]:
# Raw model output (log1p scale).
print(prediction)

[0.40725979 0.20566906 1.75245785 0.41501017 1.78153767 1.99531435
 3.04752463 2.21972776 1.82158963 0.21817123 0.20824716 2.10147884
 0.38507392 0.73080456 1.6364549  0.20584178 0.33553896 1.82665766
 0.23829852 1.95473    0.52668607 1.69411592 2.28858188 0.34726136
 0.4916842  2.31076404 1.76313174 2.033467   1.34085848 0.43272033
 0.54663112 1.25723292 0.38165379 0.38294805 0.44415614 2.1914588
 2.25670548 0.87115931 0.56098034 0.20140556 0.34058739 1.95127087
 2.09540438 2.32010056 3.04820775 1.37151005 1.95237193 0.74902043
 0.73181482 0.89926516 2.3898649  0.37012266 1.8303743  2.32150883
 0.77671975 3.04705789 2.14384623 2.95485988 1.38061632 1.66676425
 1.64620497]


In [37]:
# Predictions mapped back to the original Selling_Price scale via expm1.
print(np.expm1(prediction))

[ 0.50269444  0.22834663  4.768764    0.51438615  4.93898161  6.35451455
 20.06314079  8.20482464  5.18167721  0.24380003  0.23151752  7.17825527
  0.46972297  1.0767508   4.1369263   0.2285588   0.39869403  5.2130857
  0.26908799  6.062012    0.69331149  4.44183281  8.8609438   0.41518656
  0.63506769  9.08212487  4.83066898  6.64053023  2.8223235   0.54144506
  0.72742372  2.51567985  0.46470491  0.46660184  0.55917392  7.94825727
  8.55156942  1.38967964  0.7523896   0.22312072  0.40577309  6.0376258
  7.1287274   9.17669765 20.0775344   2.94129777  6.04537896  1.11492728
  1.07884993  1.45779636  9.9120196   0.44791221  5.23622044  9.19103927
  1.17432823 20.05331201  7.5321914  18.19903239  2.97735219  4.29500673
  4.18725661]


In [38]:
# Report the hold-out errors, one metric per line, to four decimals.
metrics = (mae, mse, rmse)
print("mae %.4f\nmse %.4f\nrmse %.4f" % metrics)

mae 0.7730
mse 3.6146
rmse 1.9012


In [39]:
import pickle

# Persist the fitted search object. The context manager guarantees the file is
# flushed and closed; the original left the handle open, which can leave a
# truncated pickle on disk.
# NOTE: only unpickle this file from a trusted source, since pickle.load can
# execute arbitrary code.
with open('model3.pkl', 'wb') as file:
    pickle.dump(tuned_model, file)


In [40]:
# Rebuild the engineered numeric columns from the raw file and print the name
# of every column whose skewness exceeds 1 just before log1p-transforming it.
# NOTE(review): unlike the earlier preprocessing cells, this pass neither drops
# Car_Name nor re-codes Owner before the string cast. Confirm that is intended.
df = pd.read_csv('cardata.csv')
df['age'] = 2020 - df['Year']
age_weight = df['age'].max() - df['age'] + 1
df['AgeAdjustedPrice'] = age_weight * df['Present_Price']
df['Owner'] = df['Owner'].astype(str)
cols = df.describe().columns
for name in cols:
    if not df[name].skew() > 1:
        continue
    print(name)
    df[name] = np.log1p(df[name])

Selling_Price
Present_Price
Kms_Driven
age
AgeAdjustedPrice


In [41]:
# Preview the held-out features.
X_test.head(5)

Unnamed: 0,Year,Present_Price,Kms_Driven,age,AgeAdjustedPrice,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual,Owner_2,Owner_3
167,2014,0.593327,10.645449,1.94591,2.372111,0,1,1,1,0,0
199,2007,0.457425,10.878066,2.639057,1.360977,0,1,1,1,0,0
294,2014,2.054124,10.404869,1.94591,4.41401,0,1,0,1,0,0
152,2012,0.688135,9.472782,2.197225,2.388763,0,1,1,1,0,0
287,2015,2.140066,10.691968,1.791759,4.590057,0,1,0,0,0,0


In [42]:
# Inspect the Year column of the reloaded frame.
df['Year']

0      2014
1      2013
2      2017
3      2011
4      2014
       ... 
296    2016
297    2015
298    2009
299    2017
300    2016
Name: Year, Length: 301, dtype: int64

In [43]:
# Skewness of Year; a value that is not above 1 means the skew check leaves
# this column untransformed.
df['Year'].skew()

-1.2461891109538386

In [46]:
# Earliest model year present in the data.
df['Year'].min()

2003