In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('cleaned_car_dataset.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,0,Hyundai Santro Xing,Hyundai,2007,80000.0,45000,Petrol
1,1,Mahindra Jeep CL550,Mahindra,2006,425000.0,40,Diesel
2,2,Hyundai Grand i10,Hyundai,2014,325000.0,28000,Petrol
3,3,Ford EcoSport Titanium,Ford,2014,575000.0,36000,Diesel
4,4,Ford Figo,Ford,2012,175000.0,41000,Diesel


In [4]:
del df['Unnamed: 0']

In [5]:
df.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000.0,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000.0,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000.0,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000.0,36000,Diesel
4,Ford Figo,Ford,2012,175000.0,41000,Diesel


In [6]:
df.corr()

Unnamed: 0,year,Price,kms_driven
year,1.0,0.318706,-0.233695
Price,0.318706,1.0,-0.13514
kms_driven,-0.233695,-0.13514,1.0


# Model Understanding:

In [7]:
df['name'].value_counts()

Maruti Suzuki Swift            51
Maruti Suzuki Alto             42
Maruti Suzuki Wagon            28
Maruti Suzuki Ertiga           16
Hyundai Santro Xing            15
                               ..
Mercedes Benz A                 1
Tata Manza ELAN                 1
Volkswagen Polo Comfortline     1
Nissan Sunny                    1
Tata Zest XM                    1
Name: name, Length: 254, dtype: int64

In [8]:
df['company'].value_counts()

Maruti        221
Hyundai       139
Mahindra       98
Tata           65
Honda          60
Toyota         36
Chevrolet      34
Renault        33
Ford           30
Volkswagen     19
Skoda          13
Audi           11
Mini            8
BMW             8
Datsun          7
Mitsubishi      6
Nissan          6
Mercedes        6
Fiat            4
Force           4
Hindustan       3
Jaguar          2
Land            1
Jeep            1
Volvo           1
Name: company, dtype: int64

# Model Preparation :

In [9]:
from sklearn.preprocessing import OneHotEncoder

In [10]:
from sklearn.compose import ColumnTransformer

In [11]:
from sklearn.pipeline import Pipeline

# Variable Formation

In [12]:
X = df.drop('Price',axis=1)

In [13]:
y = df['Price']

In [14]:
X.head()

Unnamed: 0,name,company,year,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,36000,Diesel
4,Ford Figo,Ford,2012,41000,Diesel


In [15]:
y.head()

0     80000.0
1    425000.0
2    325000.0
3    575000.0
4    175000.0
Name: Price, dtype: float64

# Train Test Split

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [18]:
X_train.shape

(652, 5)

In [19]:
X_test.shape

(164, 5)

Training Our Model

# Model Formation

In [20]:
from sklearn.linear_model import LinearRegression

In [21]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [22]:
X.head(1)

Unnamed: 0,name,company,year,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,45000,Petrol


In [23]:
ohe=OneHotEncoder()
ohe.fit(X[['name','company','fuel_type']])

OneHotEncoder()

In [24]:
step1=make_column_transformer((OneHotEncoder(categories=ohe.categories_),['name','company','fuel_type']),
                                    remainder='passthrough')

In [25]:
step2= LinearRegression()

In [26]:
pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

In [27]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('step1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories=[array(['Audi A3 Cabriolet', 'Audi A4 1.8', 'Audi A4 2.0', 'Audi A6 2.0',
       'Audi A8', 'Audi Q3 2.0', 'Audi Q5 2.0', 'Audi Q7', 'BMW 3 Series',
       'BMW 5 Series', 'BMW 7 Series', 'BMW X1', 'BMW X1 sDrive20d',
       'BMW X1 xDrive20d', 'Chevrolet Beat', 'Chevrolet Beat Diesel',
       'Che...
                                                                            array(['Audi', 'BMW', 'Chevrolet', 'Datsun', 'Fiat', 'Force', 'Ford',
       'Hindustan', 'Honda', 'Hyundai', 'Jaguar', 'Jeep', 'Land',
       'Mahindra', 'Maruti', 'Mercedes', 'Mini', 'Mitsubishi', 'Nissan',
       'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen', 'Volvo'],
      dtype=object),
                                                                            array(['Diesel', '

# Model Predictions:

In [28]:
y_train_pred = pipe.predict(X_train)

In [29]:
y_train_pred

array([ 6.86069356e+05,  1.00032486e+06,  1.93779983e+05,  1.65286311e+05,
        3.80012370e+05,  4.62055818e+05,  8.21292876e+04,  3.39867079e+05,
        2.85013056e+05,  1.18325842e+05,  4.71195330e+05,  3.18218890e+05,
        3.55191056e+05,  6.06761233e+05,  2.10527687e+05,  2.54972566e+05,
        3.69048361e+05,  1.88509529e+06,  6.83355400e+05,  2.29172307e+05,
        1.07695556e+05,  5.30713632e+05,  8.75171241e+04,  5.29459155e+05,
        1.85859712e+05,  7.78740640e+05,  1.00714554e+05,  1.19121517e+05,
        2.98062152e+05,  4.20981655e+05,  2.13377181e+05,  2.26323044e+05,
        5.42322756e+05,  3.58443443e+05,  1.56202653e+06,  1.57277603e+06,
        8.44645358e+05,  3.77426950e+05,  2.78585541e+05,  2.54926645e+05,
        3.47135821e+05,  1.51995132e+05,  5.95890570e+05,  3.22168140e+05,
        8.44645358e+05,  5.37596128e+05,  5.12453068e+05,  2.79733518e+05,
        5.14998695e+05,  1.01061295e+05,  5.02170431e+05,  5.19246293e+05,
        8.44260361e+04,  

In [30]:
r2_score(y_train,y_train_pred)

0.8326140559989371

In [31]:
# Test data prediction

In [32]:
y_test_pred = pipe.predict(X_test)

In [33]:
y_test_pred

array([ 4.80518334e+05,  3.51179093e+05,  1.25283461e+05,  1.23150184e+05,
        2.52895472e+05,  1.88484061e+05,  2.37228726e+05,  6.22716526e+05,
        1.54166777e+05,  5.09221574e+05,  1.86888332e+05,  4.04037824e+05,
        1.37226775e+05,  1.48201663e+05,  1.39412824e+05,  5.29592377e+05,
        1.04485967e+06,  3.27453819e+05,  3.09883976e+06,  3.47597616e+05,
        1.31933649e+06,  9.69619958e+05,  1.55301070e+05,  7.04914898e+02,
        1.36525921e+05,  5.92068878e+05,  5.46102547e+05,  8.37888702e+04,
        2.06195202e+05,  2.45308763e+05,  2.08084315e+05,  2.83771443e+05,
        1.94160431e+05,  2.16465552e+05,  1.78153398e+05,  2.29036090e+05,
        3.51464785e+05, -1.79560713e+04,  2.34210566e+05,  2.23816705e+05,
        2.29198731e+05,  5.65208694e+05,  2.21040273e+05,  3.59267207e+04,
        4.67680119e+04,  4.96677569e+05,  4.49985179e+05,  1.88509529e+06,
        3.16243627e+05,  3.28332539e+05,  4.02201937e+05,  1.56116078e+06,
        3.87791962e+05,  

Finding the model with a random state of TrainTestSplit where the model was found to give almost 0.92 as r2_score

In [34]:
scores=[]
for i in range (1000):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=i)
    pipe=make_pipeline(step1,step2)
    pipe.fit(X_train,y_train)
    y_test_pred=pipe.predict(X_test)
    scores.append(r2_score(y_test,y_test_pred))
    

In [35]:
np.argmax(scores)

144

In [36]:
scores[np.argmax(scores)]

0.9242249001816017

In [37]:
pipe.predict(
pd.DataFrame(columns=X_test.columns,data=np.array(['Maruti Suzuki Swift','Maruti',2019,100,'Petrol']).reshape(1,5))
)

array([413822.60707328])

## The best model is found at a certain random state

In [38]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=np.argmax(scores))
pipe=make_pipeline(step1,step2)
pipe.fit(X_train,y_train)
y_test_pred=pipe.predict(X_test)
r2_score(y_test,y_test_pred)

0.9242249001816017

In [39]:
import pickle as pkl

In [40]:
pkl.dump(pipe,open('pipe.pkl','wb'))
pkl.dump(df,open('df.pkl','wb'))

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        816 non-null    object 
 1   company     816 non-null    object 
 2   year        816 non-null    int64  
 3   Price       816 non-null    float64
 4   kms_driven  816 non-null    int64  
 5   fuel_type   816 non-null    object 
dtypes: float64(1), int64(2), object(3)
memory usage: 38.4+ KB
