In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

warnings .filterwarnings('ignore')
%matplotlib inline

In [31]:
df = pd.read_csv('cardekho_imputated.csv',index_col=[0])

In [32]:
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [33]:
df.drop(columns=['car_name','brand'],inplace=True)

In [34]:
df.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [35]:
len(df['model'].unique())

120

In [36]:
cat_features = df.select_dtypes(include='object').columns
num_feature = df.select_dtypes(exclude='object').columns
discrete_fea = [ feature for feature in num_feature if len(df[feature].unique())<=25]
continueous_fea = [ feature for feature in num_feature if len(df[feature].unique())>25]

In [37]:
print('object feature :' , len(cat_features))
print('numerical feature :' , len(num_feature))
print('discrete feature :' , len(discrete_fea))
print('contineous feature :' , len(continueous_fea))

object feature : 4
numerical feature : 7
discrete feature : 2
contineous feature : 5


In [38]:
from sklearn.model_selection import train_test_split
x = df.drop('selling_price',axis=1)
y= df['selling_price']

In [39]:
# doning label encoding for the model column as it has many unique elements
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
x['model']=le.fit_transform(x['model'])

In [40]:
x.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [41]:
(len(df['seller_type'].unique()), len(df['transmission_type'].unique()), len(df['fuel_type'].unique()))

(3, 2, 5)

##### here we can do one hot encoding as there are limited no of features

In [42]:
cat_features = ['seller_type','transmission_type','fuel_type']
num_feature = x.select_dtypes(exclude='object').columns

In [43]:
num_feature


Index(['model', 'vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power',
       'seats'],
      dtype='object')

In [44]:
cat_features

['seller_type', 'transmission_type', 'fuel_type']

In [45]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

scaler  = StandardScaler()
encoder = OneHotEncoder(drop='first')

transformer = ColumnTransformer(
    [
       ("OneHotEncoder",encoder,cat_features),
       ("StandardScaler", scaler,num_feature)
    ],remainder="passthrough"
)

x = transformer.fit_transform(x)

In [46]:
x

array([[ 1.        ,  0.        ,  1.        , ..., -1.32425883,
        -1.26335238, -0.40302241],
       [ 1.        ,  0.        ,  1.        , ..., -0.55471774,
        -0.43257082, -0.40302241],
       [ 1.        ,  0.        ,  1.        , ..., -0.55471774,
        -0.47911321, -0.40302241],
       ...,
       [ 0.        ,  0.        ,  1.        , ...,  0.02291783,
         0.06822523, -0.40302241],
       [ 0.        ,  0.        ,  1.        , ...,  1.32979434,
         0.91715831,  2.07344426],
       [ 0.        ,  0.        ,  0.        , ...,  0.02099878,
         0.39588361, -0.40302241]])

In [47]:
pd.DataFrame(x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.519714,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.225693,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022
2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.536377,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.519714,0.983562,-0.360667,0.292211,-0.936610,-0.779312,-0.403022
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,-0.666211,-0.012060,-0.496281,0.735736,0.022918,-0.046502,-0.403022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15406,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.508844,0.983562,-0.869744,0.026096,-0.767733,-0.757204,-0.403022
15407,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.556082,-1.339555,-0.728763,-0.527711,-0.216964,-0.220803,2.073444
15408,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.407551,-0.012060,0.220539,0.344954,0.022918,0.068225,-0.403022
15409,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.426247,-0.343933,72.541850,-0.887326,1.329794,0.917158,2.073444


In [48]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)

## model training and model selection

In [49]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [50]:
def model_evaluate(test,predicted):
    mae = mean_absolute_error(test,predicted)
    mse = mean_squared_error(test,predicted)
    rmse = np.sqrt(mse)
    r2score = r2_score(test,predicted)
    return mae,rmse,r2score

In [51]:
models = {
    "liniear Reagression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "RandomForestRegressor":RandomForestRegressor(),
    "Decision Tree Regressor":DecisionTreeRegressor(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "AdaboostRegressor":AdaBoostRegressor()
}

In [52]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train,y_train)

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    train_mae,train_rmse,train_r2score = model_evaluate(y_train,y_train_pred)
    test_mae,test_rmse,test_r2score = model_evaluate(y_test,y_test_pred)

    print(list(models.keys())[i])
    print('Model Performence for the traing data')
    print('- root_mean_squared_error:', train_rmse)
    print('- mean_absolute_error:', train_mae)
    print('-r2_score:', train_r2score)
    print('-'*35)
    
    print('Model Performence for the test data')
    print('- root_mean_squared_error:', test_rmse)
    print('- mean_absolute_error:', test_mae)
    print('-r2_score:', test_r2score)
    print('='*35)
    print('\n')


liniear Reagression
Model Performence for the traing data
- root_mean_squared_error: 569128.0258930413
- mean_absolute_error: 272483.12399847724
-r2_score: 0.610421650521705
-----------------------------------
Model Performence for the test data
- root_mean_squared_error: 479921.4392154529
- mean_absolute_error: 276977.9839051148
-r2_score: 0.6820715203889245


Lasso
Model Performence for the traing data
- root_mean_squared_error: 569128.0360233758
- mean_absolute_error: 272482.0436312296
-r2_score: 0.6104216366529133
-----------------------------------
Model Performence for the test data
- root_mean_squared_error: 479920.9054549655
- mean_absolute_error: 276978.2286584658
-r2_score: 0.68207222757786


Ridge
Model Performence for the traing data
- root_mean_squared_error: 569128.842364236
- mean_absolute_error: 272442.46354632336
-r2_score: 0.6104205327425266
-----------------------------------
Model Performence for the test data
- root_mean_squared_error: 479903.5936056117
- mean_abso

#### for the models which have high accuracy we will perform hyperparameter tunning

In [55]:
knn_param = {
    "n_neighbors": [2,3,5,8,10,15]
    }
rf_param = {
   'max_depth':[1,2,3,4,5,8,15,None,10],
   'max_features':['auto','sqrt','log2'],
   'n_estimators':[100,200,500],
   'min_samples_split':[2,8,15,20]
}
AdaBoostRegressor()
ada_boost_param={
    "n_estimators":[50,60,70,80,90],
    "loss":['linear', 'square', 'exponential']
}

In [57]:
## tunning for knn ans random forest and Adaboost

tunnedModel = [
    ("Knn",KNeighborsRegressor(),knn_param),
    ("RandomForestRegressor",RandomForestRegressor(),rf_param),
    ("AddaboostRegressor",AdaBoostRegressor(),ada_boost_param)
]

from sklearn.model_selection import RandomizedSearchCV
model_params={}
for name , model ,param in tunnedModel:
    random = RandomizedSearchCV(estimator=model,param_distributions=param,cv=3,verbose=3,n_jobs=-1)
    random.fit(x_train,y_train)
    model_params[name]=random.best_params_
for model_name in model_params:
    print(f'---------------Best Params for {model_name}-------------------')
    print(model_params[model_name])

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
---------------Best Params for Knn-------------------
{'n_neighbors': 3}
---------------Best Params for RandomForestRegressor-------------------
{'n_estimators': 200, 'min_samples_split': 2, 'max_features': 'log2', 'max_depth': None}
---------------Best Params for AddaboostRegressor-------------------
{'n_estimators': 50, 'loss': 'square'}


In [58]:
tunned_models={
    "randomForsest":RandomForestRegressor(n_estimators=200,min_samples_split=2, max_features= 'log2', max_depth= None),
    "KNeighborsRegressor":KNeighborsRegressor(n_neighbors=3),
    "AdaBoostRegressor":AdaBoostRegressor(n_estimators=50,loss='square')
}

In [59]:
for i in range(len(list(tunned_models))):
    model = list(tunned_models.values())[i]
    model.fit(x_train,y_train)

    y_train_pred1 = model.predict(x_train)
    y_test_pred1 = model.predict(x_test)

    train_mae1,train_rmse1,train_r2score1 = model_evaluate(y_train,y_train_pred1)
    test_mae1,test_rmse1,test_r2score1 = model_evaluate(y_test,y_test_pred1)

    print(list(tunned_models.keys())[i])
    print('Model Performence for the traing data')
    print('- root_mean_squared_error:', train_rmse1)
    print('- mean_absolute_error:', train_mae1)
    print('-r2_score:', train_r2score1)
    print('-'*35)
    
    print('Model Performence for the test data')
    print('- root_mean_squared_error:', test_rmse1)
    print('- mean_absolute_error:', test_mae1)
    print('-r2_score:', test_r2score1)
    print('='*35)
    print('\n')


randomForsest
Model Performence for the traing data
- root_mean_squared_error: 132849.97720569378
- mean_absolute_error: 39141.07263415154
-r2_score: 0.9787725494885019
-----------------------------------
Model Performence for the test data
- root_mean_squared_error: 238640.3569331309
- mean_absolute_error: 103163.93743797053
-r2_score: 0.9213901603493684


KNeighborsRegressor
Model Performence for the traing data
- root_mean_squared_error: 266278.82402021135
- mean_absolute_error: 76960.22990636878
-r2_score: 0.9147198159085397
-----------------------------------
Model Performence for the test data
- root_mean_squared_error: 435741.6316685509
- mean_absolute_error: 124024.60892445216
-r2_score: 0.7379119369606951


AdaBoostRegressor
Model Performence for the traing data
- root_mean_squared_error: 422440.41001149063
- mean_absolute_error: 328240.7373234088
-r2_score: 0.7853624118761375
-----------------------------------
Model Performence for the test data
- root_mean_squared_error: 72