In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


# READ DATASET

In [2]:
dataset3=pd.read_csv("cardekho_imputated.csv")

dataset3.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


# CLEAN DATASET

In [3]:
dataset3["car_name"].value_counts()

car_name
Hyundai i20              906
Maruti Swift Dzire       890
Maruti Swift             781
Maruti Alto              778
Honda City               757
                        ... 
Mercedes-AMG C             1
Rolls-Royce Ghost          1
Maserati Quattroporte      1
Isuzu MUX                  1
Force Gurkha               1
Name: count, Length: 121, dtype: int64

In [4]:
dataset3.drop(["brand","car_name"],axis=1,inplace=True)

In [5]:
dataset3["seller_type"].value_counts()

seller_type
Dealer              9539
Individual          5699
Trustmark Dealer     173
Name: count, dtype: int64

In [6]:
dataset3["fuel_type"].value_counts()

fuel_type
Petrol      7643
Diesel      7419
CNG          301
LPG           44
Electric       4
Name: count, dtype: int64

In [7]:
dataset3["transmission_type"].value_counts()

transmission_type
Manual       12225
Automatic     3186
Name: count, dtype: int64

In [8]:
dataset3["seats"].value_counts()

seats
5    12910
7     1922
8      311
6      127
4       77
9       55
2        7
0        2
Name: count, dtype: int64

In [9]:
dataset3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15411 entries, 0 to 15410
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         15411 non-null  int64  
 1   model              15411 non-null  object 
 2   vehicle_age        15411 non-null  int64  
 3   km_driven          15411 non-null  int64  
 4   seller_type        15411 non-null  object 
 5   fuel_type          15411 non-null  object 
 6   transmission_type  15411 non-null  object 
 7   mileage            15411 non-null  float64
 8   engine             15411 non-null  int64  
 9   max_power          15411 non-null  float64
 10  seats              15411 non-null  int64  
 11  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(6), object(4)
memory usage: 1.4+ MB


# CHECK FOR NULL VALUE

In [10]:
dataset3.isnull().sum()

Unnamed: 0           0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [11]:
dataset3.head()

Unnamed: 0.1,Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


# TRAIN TEST SPLIT

In [12]:

from sklearn.model_selection import train_test_split
X=dataset3.drop(["selling_price"],axis=1)
Y=dataset3["selling_price"]

# ENCODING 


In [13]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

X["model"]=le.fit_transform(X["model"])

In [14]:
X.head()

Unnamed: 0.1,Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,0,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,1,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,2,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,3,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,4,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


# convert seller,fuel typpe and tramission type encoding

In [15]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

num_data=X.select_dtypes(exclude="object").columns
cat_data=["seller_type","fuel_type","transmission_type"]


num_transformer=StandardScaler()
cat_transform=OneHotEncoder(drop="first")

preprocessor=ColumnTransformer(

    [
      ("OneHotEncoder" , cat_transform,cat_data),
      ("StandardScaler",num_transformer,num_data)  
    ] , remainder="passthrough"


)

In [16]:
X

Unnamed: 0.1,Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,0,7,9,120000,Individual,Petrol,Manual,19.70,796,46.30,5
1,1,54,5,20000,Individual,Petrol,Manual,18.90,1197,82.00,5
2,2,118,11,60000,Individual,Petrol,Manual,17.00,1197,80.00,5
3,3,7,9,37000,Individual,Petrol,Manual,20.92,998,67.10,5
4,4,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5
...,...,...,...,...,...,...,...,...,...,...,...
15406,19537,117,9,10723,Dealer,Petrol,Manual,19.81,1086,68.05,5
15407,19540,42,2,18000,Dealer,Petrol,Manual,17.50,1373,91.10,7
15408,19541,77,6,67000,Dealer,Diesel,Manual,21.14,1498,103.52,5
15409,19542,114,5,3800000,Dealer,Diesel,Manual,16.00,2179,140.00,7


In [17]:
X=preprocessor.fit_transform(X)

In [18]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738694,-1.519714,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738516,-0.225693,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738339,1.536377,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738162,-1.519714,0.983562,-0.360667,0.292211,-0.936610,-0.779312,-0.403022
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.737985,-0.666211,-0.012060,-0.496281,0.735736,0.022918,-0.046502,-0.403022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15406,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.723327,1.508844,0.983562,-0.869744,0.026096,-0.767733,-0.757204,-0.403022
15407,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.723859,-0.556082,-1.339555,-0.728763,-0.527711,-0.216964,-0.220803,2.073444
15408,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.724036,0.407551,-0.012060,0.220539,0.344954,0.022918,0.068225,-0.403022
15409,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.724213,1.426247,-0.343933,72.541850,-0.887326,1.329794,0.917158,2.073444


In [19]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=40)

# MODEL TRAINING

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error,root_mean_squared_error,r2_score


def model_evaluation1(true,predicted):
    mse=mean_squared_error(true,predicted)
    mae=mean_absolute_error(true,predicted)
    rmse=root_mean_squared_error(true,predicted)
    score=r2_score(true,predicted)
    return mse,mae,rmse,score


models={
       "Lasso":Lasso(),
       "Ridge":Ridge(),
       "random forest":RandomForestRegressor(),
       "Adaboost":AdaBoostRegressor(),
       "DecisionTREE":DecisionTreeRegressor(),
       "Kneighbours":KNeighborsRegressor(),
       "Gredientboosting":GradientBoostingRegressor(),
       "LinearRegression":LinearRegression()
}

for name,model in models.items():
    model.fit(x_train,y_train)

    x_train_predict=model.predict(x_train)
    x_test_predict=model.predict(x_test)


    mse_train,mae_train,rmse_train,r2score_train=model_evaluation1(y_train,x_train_predict)
    mse_test,mae_test,rmse_test,r2_score_test=model_evaluation1(y_test,x_test_predict)


    print(f"\n{name}")
    print("-------------------------------------------------")
    print("THE PREDICTION OF TRAIN MODEL IS :")
    print("MEAN SQUARED ERROR IS : {:.4f}".format(mse_train))
    print("MEAN ABSOLUTE ERROR: {:.4F}".format(mae_train))
    print("ROOT MEAN SQUARED ERROR :{:.4f}".format(rmse_train))
    print("R2 score :{:.4f}".format(r2score_train))

    print("-------------------------------------------------")
    print("-------------------------------------------------")
    print("THE PREDICTION OF TEST MODEL IS :")
    print("MEAN SQUARED ERROR IS : {:.4f}".format(mse_test))
    print("MEAN ABSOLUTE ERROR: {:.4F}".format(mae_test))
    print("ROOT MEAN SQUARED ERROR :{:.4f}".format(rmse_test))
    print("R2 score :{:.4f}".format(r2_score_test))


Lasso
-------------------------------------------------
THE PREDICTION OF TRAIN MODEL IS :
MEAN SQUARED ERROR IS : 335612387984.9074
MEAN ABSOLUTE ERROR: 277641.9030
ROOT MEAN SQUARED ERROR :579320.6262
R2 score :0.6168
-------------------------------------------------
-------------------------------------------------
THE PREDICTION OF TEST MODEL IS :
MEAN SQUARED ERROR IS : 202718328336.3163
MEAN ABSOLUTE ERROR: 262719.8594
ROOT MEAN SQUARED ERROR :450242.5217
R2 score :0.6736

Ridge
-------------------------------------------------
THE PREDICTION OF TRAIN MODEL IS :
MEAN SQUARED ERROR IS : 335612981076.1039
MEAN ABSOLUTE ERROR: 277599.7710
ROOT MEAN SQUARED ERROR :579321.1381
R2 score :0.6168
-------------------------------------------------
-------------------------------------------------
THE PREDICTION OF TEST MODEL IS :
MEAN SQUARED ERROR IS : 202707346804.8712
MEAN ABSOLUTE ERROR: 262694.8638
ROOT MEAN SQUARED ERROR :450230.3264
R2 score :0.6736

random forest
-----------------

# we got random forest and knn best model so hyperparamater tuning on both this models

In [21]:
knn_params={"n_neighbors":[2,3,5,7,8,9,10,12]}
rf_params={"max_depth":[5,8,10,None,10],
           "max_features":[5,7,"auto",8],
           "min_samples_split":[2,8,15,20],
           "n_estimators":[100,200,500,1000]
           }
          

In [22]:
randomcv_models=[("KNN",KNeighborsRegressor(),knn_params),
          ("RF",RandomForestRegressor(),rf_params)
        ]


In [23]:
from sklearn.model_selection import RandomizedSearchCV
import warnings

warnings.filterwarnings("ignore")

model_params={}

for name,model,params in randomcv_models:
    random=RandomizedSearchCV(estimator=model,
                              param_distributions=params,
                              n_iter=100,
                              n_jobs=-1,
                              verbose=2,
                              cv=3)
    
    random.fit(x_train,y_train)
    model_params[name]=random.best_params_

for model_name in model_params:
    print(f"------------------------{model_name}-------------------")
    print(model_params[model_name])


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
------------------------KNN-------------------
{'n_neighbors': 5}
------------------------RF-------------------
{'n_estimators': 1000, 'min_samples_split': 2, 'max_features': 7, 'max_depth': None}


# NOW CHECKING AGAIN THE ACCURACY FOR BOTH THIS MODELS

In [24]:
def model_evaluation1(true,predicted):
    mse=mean_squared_error(true,predicted)
    mae=mean_absolute_error(true,predicted)
    rmse=root_mean_squared_error(true,predicted)
    score=r2_score(true,predicted)
    return mse,mae,rmse,score


models={
       
       "Random forest":RandomForestRegressor(n_estimators=100,min_samples_split=2,max_features=8,max_depth=10),
       
       "Kneighbours":KNeighborsRegressor(n_neighbors=5),
       
}

for name,model in models.items():
    model.fit(x_train,y_train)

    x_train_predict=model.predict(x_train)
    x_test_predict=model.predict(x_test)


    mse_train,mae_train,rmse_train,r2score_train=model_evaluation1(y_train,x_train_predict)
    mse_test,mae_test,rmse_test,r2_score_test=model_evaluation1(y_test,x_test_predict)


    print(f"\n{name}")
    print("-------------------------------------------------")
    print("THE PREDICTION OF TRAIN MODEL IS :")
    print("MEAN SQUARED ERROR IS : {:.4f}".format(mse_train))
    print("MEAN ABSOLUTE ERROR: {:.4F}".format(mae_train))
    print("ROOT MEAN SQUARED ERROR :{:.4f}".format(rmse_train))
    print("R2 score :{:.4f}".format(r2score_train))

    print("-------------------------------------------------")
    print("-------------------------------------------------")
    print("THE PREDICTION OF TEST MODEL IS :")
    print("MEAN SQUARED ERROR IS : {:.4f}".format(mse_test))
    print("MEAN ABSOLUTE ERROR: {:.4F}".format(mae_test))
    print("ROOT MEAN SQUARED ERROR :{:.4f}".format(rmse_test))
    print("R2 score :{:.4f}".format(r2_score_test))


Random forest
-------------------------------------------------
THE PREDICTION OF TRAIN MODEL IS :
MEAN SQUARED ERROR IS : 38710326108.0409
MEAN ABSOLUTE ERROR: 83168.3712
ROOT MEAN SQUARED ERROR :196749.3993
R2 score :0.9558
-------------------------------------------------
-------------------------------------------------
THE PREDICTION OF TEST MODEL IS :
MEAN SQUARED ERROR IS : 43655752713.5224
MEAN ABSOLUTE ERROR: 102703.0141
ROOT MEAN SQUARED ERROR :208939.5911
R2 score :0.9297

Kneighbours
-------------------------------------------------
THE PREDICTION OF TRAIN MODEL IS :
MEAN SQUARED ERROR IS : 112827784413.9937
MEAN ABSOLUTE ERROR: 98867.2059
ROOT MEAN SQUARED ERROR :335898.4734
R2 score :0.8712
-------------------------------------------------
-------------------------------------------------
THE PREDICTION OF TEST MODEL IS :
MEAN SQUARED ERROR IS : 71103748985.9970
MEAN ABSOLUTE ERROR: 118144.5826
ROOT MEAN SQUARED ERROR :266652.8623
R2 score :0.8855


# PROJECT COMPLETED