In [1]:
!python --version

Python 3.8.10


In [2]:
# import required libraries
import numpy as  np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV

### step1: Problem Statement

### step2: Data Gathering

In [3]:
df=pd.read_csv('car.csv')
df.head()

Unnamed: 0,full_name,selling_price,new_price,year,seller_type,km_driven,owner_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Maruti Alto Std,1.2 Lakh*,,2012,Individual,"1,20,000 kms",First Owner,Petrol,Manual,Mileage19.7 kmpl,Engine796 CC,Max Power46.3 bhp,Seats5
1,Hyundai Grand i10 Asta,5.5 Lakh*,New Car (On-Road Price) : Rs.7.11-7.48 Lakh*,2016,Individual,"20,000 kms",First Owner,Petrol,Manual,Mileage18.9 kmpl,Engine1197 CC,Max Power82 bhp,Seats5
2,Hyundai i20 Asta,2.15 Lakh*,,2010,Individual,"60,000 kms",First Owner,Petrol,Manual,Mileage17.0 kmpl,Engine1197 CC,Max Power80 bhp,Seats5
3,Maruti Alto K10 2010-2014 VXI,2.26 Lakh*,,2012,Individual,"37,000 kms",First Owner,Petrol,Manual,Mileage20.92 kmpl,Engine998 CC,Max Power67.1 bhp,Seats5
4,Ford Ecosport 2015-2021 1.5 TDCi Titanium BSIV,5.7 Lakh*,New Car (On-Road Price) : Rs.10.14-13.79 Lakh*,2015,Dealer,"30,000 kms",First Owner,Diesel,Manual,Mileage22.77 kmpl,Engine1498 CC,Max Power98.59 bhp,Seats5


### step3 EDA & Feature Engineering

In [4]:
#check for data shape ,nature 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19974 entries, 0 to 19973
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   full_name          19974 non-null  object
 1   selling_price      19974 non-null  object
 2   new_price          9564 non-null   object
 3   year               19974 non-null  int64 
 4   seller_type        19974 non-null  object
 5   km_driven          19974 non-null  object
 6   owner_type         19974 non-null  object
 7   fuel_type          19974 non-null  object
 8   transmission_type  19974 non-null  object
 9   mileage            19814 non-null  object
 10  engine             19864 non-null  object
 11  max_power          19637 non-null  object
 12  seats              19887 non-null  object
dtypes: int64(1), object(12)
memory usage: 2.0+ MB


### 3.1 Full name 

In [5]:
print(df['full_name'].nunique())
print(df['full_name'].unique())
print(df['full_name'].value_counts())
print(df['full_name'].isna().sum())
print(df['full_name'].dtype)

3321
['Maruti Alto Std' 'Hyundai Grand i10 Asta' 'Hyundai i20 Asta' ...
 'Tata Safari 2005-2017 DICOR 2.2 GX 4x2'
 'Ford Ecosport 2015-2021 Signature Edition Petrol BSIV'
 'Toyota Yaris V Optional CVT BSIV']
Maruti Swift Dzire VDI                 210
Maruti Alto 800 LXI                    189
Maruti Wagon R VXI                     163
Maruti Swift VDI                       150
Maruti Alto K10 VXI                    121
                                      ... 
Tata Tiago 2019-2020 XM                  1
Maruti Esteem VX - BSII                  1
Hyundai Santro Xing XL eRLX Euro II      1
Honda City i DTec E                      1
Toyota Yaris V Optional CVT BSIV         1
Name: full_name, Length: 3321, dtype: int64
0
object


In [6]:
df['full_name'][0][0]

'M'

In [7]:
# here we extract only brand name from full name
brand_list=[]
for item in df['full_name']:
    var=item.split(' ')
    brand_list.append(var[0])
#     print(var[0])
#     var[1]
brand_list

['Maruti',
 'Hyundai',
 'Hyundai',
 'Maruti',
 'Ford',
 'Maruti',
 'Hyundai',
 'Maruti',
 'Hyundai',
 'Mahindra',
 'Tata',
 'Renault',
 'Maruti',
 'Nissan',
 'Hyundai',
 'Renault',
 'Mini',
 'Maruti',
 'Maruti',
 'Mercedes-Benz',
 'Maruti',
 'Toyota',
 'Maruti',
 'Maruti',
 'Fiat',
 'Volkswagen',
 'Maruti',
 'Hyundai',
 'Mahindra',
 'Honda',
 'Mahindra',
 'Maruti',
 'Maruti',
 'Honda',
 'Hyundai',
 'Toyota',
 'Renault',
 'Honda',
 'Hyundai',
 'Mahindra',
 'Maruti',
 'Mahindra',
 'Chevrolet',
 'Hyundai',
 'Hyundai',
 'Hyundai',
 'Honda',
 'Mahindra',
 'Maruti',
 'Tata',
 'Maruti',
 'Fiat',
 'Ambassador',
 'Maruti',
 'Maruti',
 'Mahindra',
 'Tata',
 'Mahindra',
 'Toyota',
 'Maruti',
 'Maruti',
 'Maruti',
 'Maruti',
 'Toyota',
 'Maruti',
 'Datsun',
 'Tata',
 'Hyundai',
 'Hyundai',
 'Maruti',
 'Tata',
 'Hyundai',
 'Hyundai',
 'Hyundai',
 'Toyota',
 'Maruti',
 'Hyundai',
 'Mahindra',
 'Volkswagen',
 'Mahindra',
 'Hyundai',
 'Mahindra',
 'Hyundai',
 'Ford',
 'Maruti',
 'Hyundai',
 'Ford',
 '

In [8]:
# here we create new feature with brand name
s1=pd.Series(brand_list)
df['brand_name']=s1
df['brand_name']

0          Maruti
1         Hyundai
2         Hyundai
3          Maruti
4            Ford
           ...   
19969      Toyota
19970      Maruti
19971       Skoda
19972    Mahindra
19973       Honda
Name: brand_name, Length: 19974, dtype: object

In [9]:
df['brand_name'].nunique()
df['brand_name'].unique()

array(['Maruti', 'Hyundai', 'Ford', 'Mahindra', 'Tata', 'Renault',
       'Nissan', 'Mini', 'Mercedes-Benz', 'Toyota', 'Fiat', 'Volkswagen',
       'Honda', 'Chevrolet', 'Ambassador', 'Datsun', 'Kia', 'BMW',
       'Mitsubishi', 'Audi', 'Skoda', 'Land', 'Jaguar', 'Daewoo',
       'Bentley', 'MG', 'Isuzu', 'Porsche', 'Volvo', 'Lexus', 'Jeep',
       'Premier', 'Maserati', 'Force', 'Lamborghini', 'ISUZU', 'Ferrari',
       'OpelCorsa', 'Mercedes-AMG', 'DC', 'Rolls-Royce', 'Opel'],
      dtype=object)

In [10]:
# here we used one hot encoding for brand name
df=pd.get_dummies(df,columns=['brand_name'])
df.head()


Unnamed: 0,full_name,selling_price,new_price,year,seller_type,km_driven,owner_type,fuel_type,transmission_type,mileage,...,brand_name_OpelCorsa,brand_name_Porsche,brand_name_Premier,brand_name_Renault,brand_name_Rolls-Royce,brand_name_Skoda,brand_name_Tata,brand_name_Toyota,brand_name_Volkswagen,brand_name_Volvo
0,Maruti Alto Std,1.2 Lakh*,,2012,Individual,"1,20,000 kms",First Owner,Petrol,Manual,Mileage19.7 kmpl,...,0,0,0,0,0,0,0,0,0,0
1,Hyundai Grand i10 Asta,5.5 Lakh*,New Car (On-Road Price) : Rs.7.11-7.48 Lakh*,2016,Individual,"20,000 kms",First Owner,Petrol,Manual,Mileage18.9 kmpl,...,0,0,0,0,0,0,0,0,0,0
2,Hyundai i20 Asta,2.15 Lakh*,,2010,Individual,"60,000 kms",First Owner,Petrol,Manual,Mileage17.0 kmpl,...,0,0,0,0,0,0,0,0,0,0
3,Maruti Alto K10 2010-2014 VXI,2.26 Lakh*,,2012,Individual,"37,000 kms",First Owner,Petrol,Manual,Mileage20.92 kmpl,...,0,0,0,0,0,0,0,0,0,0
4,Ford Ecosport 2015-2021 1.5 TDCi Titanium BSIV,5.7 Lakh*,New Car (On-Road Price) : Rs.10.14-13.79 Lakh*,2015,Dealer,"30,000 kms",First Owner,Diesel,Manual,Mileage22.77 kmpl,...,0,0,0,0,0,0,0,0,0,0


### 3.2 Selling Price

In [11]:
print(df['selling_price'].nunique())
print(df['selling_price'].unique())
print(df['selling_price'].value_counts())
print(df['selling_price'].isna().sum())
print(df['selling_price'].dtype)

1215
['1.2 Lakh*' '5.5 Lakh*' '2.15 Lakh*' ... '52.95 Lakh*' '16.4 Lakh*'
 '4.56 Lakh*']
4.5 Lakh*      466
3.5 Lakh*      434
5.5 Lakh*      408
6.5 Lakh*      378
4 Lakh*        359
              ... 
1.17 Lakh*       1
10.64 Lakh*      1
14.72 Lakh*      1
4.27 Lakh*       1
4.56 Lakh*       1
Name: selling_price, Length: 1215, dtype: int64
0
object


In [12]:
# df['selling_price'].replace(r'[*a-zA-Z]+','',regex=True)
print(df['selling_price'].T)

0          1.2 Lakh*
1          5.5 Lakh*
2         2.15 Lakh*
3         2.26 Lakh*
4          5.7 Lakh*
            ...     
19969      6.5 Lakh*
19970     9.25 Lakh*
19971     4.25 Lakh*
19972    12.25 Lakh*
19973       12 Lakh*
Name: selling_price, Length: 19974, dtype: object


In [13]:
price=[]
for item in df['selling_price']:
    var=item.split(' ')
#     print(var)
    if var[-1]=='Lakh*':
        temp=int(float(var[0])*100000)

        price.append(temp)
    elif var[-1]=='Cr*':
        temp=int(float(var[0])*10000000)
        price.append(temp)
    else:
        temp=var[0].replace(',','').replace('*','')
        price.append(int(float(temp)))
price    

            

[120000,
 550000,
 215000,
 225999,
 570000,
 350000,
 315000,
 409999,
 1050000,
 575000,
 305000,
 1150000,
 511000,
 409999,
 425000,
 750000,
 3250000,
 650000,
 627000,
 1425000,
 425000,
 605000,
 600000,
 575000,
 260000,
 425000,
 229999,
 1225000,
 375000,
 750000,
 350000,
 600000,
 390000,
 145000,
 700000,
 1150000,
 340000,
 465000,
 125000,
 600000,
 380000,
 400000,
 300000,
 500000,
 850000,
 598000,
 700000,
 900000,
 75000,
 300000,
 409999,
 300000,
 225000,
 300000,
 590000,
 850000,
 108000,
 550000,
 325000,
 625000,
 600000,
 445000,
 530000,
 385000,
 409999,
 350000,
 265000,
 520000,
 750000,
 560000,
 200000,
 600000,
 655000,
 125000,
 425000,
 220000,
 550000,
 1150000,
 500000,
 990000,
 551000,
 1650000,
 350000,
 500000,
 575000,
 545000,
 525000,
 400000,
 155000,
 540000,
 545000,
 545000,
 240000,
 725000,
 480000,
 375000,
 225000,
 400000,
 550000,
 1050000,
 450000,
 775000,
 350000,
 550000,
 1080000,
 575000,
 950000,
 229999,
 270000,
 165000,
 

In [14]:
s2=pd.Series(price)
df['price']=s2

df['price']

0         120000
1         550000
2         215000
3         225999
4         570000
          ...   
19969     650000
19970     925000
19971     425000
19972    1225000
19973    1200000
Name: price, Length: 19974, dtype: int64

In [15]:
df.columns

Index(['full_name', 'selling_price', 'new_price', 'year', 'seller_type',
       'km_driven', 'owner_type', 'fuel_type', 'transmission_type', 'mileage',
       'engine', 'max_power', 'seats', 'brand_name_Ambassador',
       'brand_name_Audi', 'brand_name_BMW', 'brand_name_Bentley',
       'brand_name_Chevrolet', 'brand_name_DC', 'brand_name_Daewoo',
       'brand_name_Datsun', 'brand_name_Ferrari', 'brand_name_Fiat',
       'brand_name_Force', 'brand_name_Ford', 'brand_name_Honda',
       'brand_name_Hyundai', 'brand_name_ISUZU', 'brand_name_Isuzu',
       'brand_name_Jaguar', 'brand_name_Jeep', 'brand_name_Kia',
       'brand_name_Lamborghini', 'brand_name_Land', 'brand_name_Lexus',
       'brand_name_MG', 'brand_name_Mahindra', 'brand_name_Maruti',
       'brand_name_Maserati', 'brand_name_Mercedes-AMG',
       'brand_name_Mercedes-Benz', 'brand_name_Mini', 'brand_name_Mitsubishi',
       'brand_name_Nissan', 'brand_name_Opel', 'brand_name_OpelCorsa',
       'brand_name_Porsche', 'bra

### 3.3  year

In [16]:
print(df['year'].nunique())
print(df['year'].unique())
print(df['year'].value_counts())
print(df['year'].isna().sum())
print(df['year'].dtype)

28
[2012 2016 2010 2015 2013 2018 2019 2017 2014 2011 2007 2009 2004 2020
 2006 2008 2000 2005 2002 1999 2003 1997 2001 1992 2021 1998 1991 1996]
2017    2653
2016    2587
2015    2429
2018    2163
2014    1947
2013    1775
2012    1553
2019    1238
2011    1096
2010     845
2009     444
2008     374
2007     247
2020     240
2006     179
2005      65
2004      44
2003      23
2002      16
2001      15
2000      13
1999      12
2021       6
1998       6
1997       1
1992       1
1991       1
1996       1
Name: year, dtype: int64
0
int64


In [17]:
# here we calculated how old car 
df['car_age']=2022-df['year']
df['car_age']

0        10
1         6
2        12
3        10
4         7
         ..
19969     5
19970     3
19971     7
19972     6
19973     3
Name: car_age, Length: 19974, dtype: int64

In [18]:
df.head()

Unnamed: 0,full_name,selling_price,new_price,year,seller_type,km_driven,owner_type,fuel_type,transmission_type,mileage,...,brand_name_Premier,brand_name_Renault,brand_name_Rolls-Royce,brand_name_Skoda,brand_name_Tata,brand_name_Toyota,brand_name_Volkswagen,brand_name_Volvo,price,car_age
0,Maruti Alto Std,1.2 Lakh*,,2012,Individual,"1,20,000 kms",First Owner,Petrol,Manual,Mileage19.7 kmpl,...,0,0,0,0,0,0,0,0,120000,10
1,Hyundai Grand i10 Asta,5.5 Lakh*,New Car (On-Road Price) : Rs.7.11-7.48 Lakh*,2016,Individual,"20,000 kms",First Owner,Petrol,Manual,Mileage18.9 kmpl,...,0,0,0,0,0,0,0,0,550000,6
2,Hyundai i20 Asta,2.15 Lakh*,,2010,Individual,"60,000 kms",First Owner,Petrol,Manual,Mileage17.0 kmpl,...,0,0,0,0,0,0,0,0,215000,12
3,Maruti Alto K10 2010-2014 VXI,2.26 Lakh*,,2012,Individual,"37,000 kms",First Owner,Petrol,Manual,Mileage20.92 kmpl,...,0,0,0,0,0,0,0,0,225999,10
4,Ford Ecosport 2015-2021 1.5 TDCi Titanium BSIV,5.7 Lakh*,New Car (On-Road Price) : Rs.10.14-13.79 Lakh*,2015,Dealer,"30,000 kms",First Owner,Diesel,Manual,Mileage22.77 kmpl,...,0,0,0,0,0,0,0,0,570000,7


### 3.4'seller_type'

In [19]:
print(df['seller_type'].nunique())
print(df['seller_type'].unique())
print(df['seller_type'].value_counts())
print(df['seller_type'].isna().sum())
print(df['seller_type'].dtype)

3
['Individual' 'Dealer' 'Trustmark Dealer']
Dealer              11970
Individual           7814
Trustmark Dealer      190
Name: seller_type, dtype: int64
0
object


In [20]:
# so here we convert by one hot encoding
df=pd.get_dummies(df,columns=['seller_type'])
df.columns

Index(['full_name', 'selling_price', 'new_price', 'year', 'km_driven',
       'owner_type', 'fuel_type', 'transmission_type', 'mileage', 'engine',
       'max_power', 'seats', 'brand_name_Ambassador', 'brand_name_Audi',
       'brand_name_BMW', 'brand_name_Bentley', 'brand_name_Chevrolet',
       'brand_name_DC', 'brand_name_Daewoo', 'brand_name_Datsun',
       'brand_name_Ferrari', 'brand_name_Fiat', 'brand_name_Force',
       'brand_name_Ford', 'brand_name_Honda', 'brand_name_Hyundai',
       'brand_name_ISUZU', 'brand_name_Isuzu', 'brand_name_Jaguar',
       'brand_name_Jeep', 'brand_name_Kia', 'brand_name_Lamborghini',
       'brand_name_Land', 'brand_name_Lexus', 'brand_name_MG',
       'brand_name_Mahindra', 'brand_name_Maruti', 'brand_name_Maserati',
       'brand_name_Mercedes-AMG', 'brand_name_Mercedes-Benz',
       'brand_name_Mini', 'brand_name_Mitsubishi', 'brand_name_Nissan',
       'brand_name_Opel', 'brand_name_OpelCorsa', 'brand_name_Porsche',
       'brand_name_Premier

### 3.5 Km Driven

In [21]:
print(df['km_driven'].nunique())
print(df['km_driven'].unique())
print(df['km_driven'].value_counts())
print(df['km_driven'].isna().sum())
print(df['km_driven'].dtype)

4515
['1,20,000 kms' '20,000 kms' '60,000 kms' ... '10,723 kms' '69,480 kms'
 '38,00,000 kms']
50,000 kms       750
70,000 kms       680
60,000 kms       644
40,000 kms       624
80,000 kms       502
                ... 
2,530 kms          1
56,913 kms         1
54,278 kms         1
94,603 kms         1
38,00,000 kms      1
Name: km_driven, Length: 4515, dtype: int64
0
object


In [22]:
df['km_driven'].replace(r'[,*a-zA-Z]+','',regex=True,inplace=True)


In [23]:
df['km_driven']=df['km_driven'].astype(float)

In [24]:
print(df['km_driven'].nunique())
print(df['km_driven'].unique())
print(df['km_driven'].value_counts())
print(df['km_driven'].isna().sum())
print(df['km_driven'].dtype)

4499
[ 120000.   20000.   60000. ...   10723.   69480. 3800000.]
50000.0      750
70000.0      680
60000.0      644
40000.0      624
80000.0      502
            ... 
2530.0         1
56913.0        1
54278.0        1
94603.0        1
3800000.0      1
Name: km_driven, Length: 4499, dtype: int64
0
float64


In [25]:
df.columns

Index(['full_name', 'selling_price', 'new_price', 'year', 'km_driven',
       'owner_type', 'fuel_type', 'transmission_type', 'mileage', 'engine',
       'max_power', 'seats', 'brand_name_Ambassador', 'brand_name_Audi',
       'brand_name_BMW', 'brand_name_Bentley', 'brand_name_Chevrolet',
       'brand_name_DC', 'brand_name_Daewoo', 'brand_name_Datsun',
       'brand_name_Ferrari', 'brand_name_Fiat', 'brand_name_Force',
       'brand_name_Ford', 'brand_name_Honda', 'brand_name_Hyundai',
       'brand_name_ISUZU', 'brand_name_Isuzu', 'brand_name_Jaguar',
       'brand_name_Jeep', 'brand_name_Kia', 'brand_name_Lamborghini',
       'brand_name_Land', 'brand_name_Lexus', 'brand_name_MG',
       'brand_name_Mahindra', 'brand_name_Maruti', 'brand_name_Maserati',
       'brand_name_Mercedes-AMG', 'brand_name_Mercedes-Benz',
       'brand_name_Mini', 'brand_name_Mitsubishi', 'brand_name_Nissan',
       'brand_name_Opel', 'brand_name_OpelCorsa', 'brand_name_Porsche',
       'brand_name_Premier

### 3.6 'owner_type'

In [26]:
print(df['owner_type'].nunique())
print(df['owner_type'].unique())
print(df['owner_type'].value_counts())
print(df['owner_type'].isna().sum())
print(df['owner_type'].dtype)

1
['First Owner']
First Owner    19974
Name: owner_type, dtype: int64
0
object


In [27]:
# from above it only one value so we not consider this feture for model training

### 3.7 'fuel_type'

In [28]:
print(df['fuel_type'].nunique())
print(df['fuel_type'].unique())
print(df['fuel_type'].value_counts())
print(df['fuel_type'].isna().sum())
print(df['fuel_type'].dtype)

5
['Petrol' 'Diesel' 'CNG' 'LPG' 'Electric']
Diesel      9815
Petrol      9763
CNG          316
LPG           66
Electric      14
Name: fuel_type, dtype: int64
0
object


In [29]:
# here we apply one hot encoding
df=pd.get_dummies(df,columns=['fuel_type'])
df.columns

Index(['full_name', 'selling_price', 'new_price', 'year', 'km_driven',
       'owner_type', 'transmission_type', 'mileage', 'engine', 'max_power',
       'seats', 'brand_name_Ambassador', 'brand_name_Audi', 'brand_name_BMW',
       'brand_name_Bentley', 'brand_name_Chevrolet', 'brand_name_DC',
       'brand_name_Daewoo', 'brand_name_Datsun', 'brand_name_Ferrari',
       'brand_name_Fiat', 'brand_name_Force', 'brand_name_Ford',
       'brand_name_Honda', 'brand_name_Hyundai', 'brand_name_ISUZU',
       'brand_name_Isuzu', 'brand_name_Jaguar', 'brand_name_Jeep',
       'brand_name_Kia', 'brand_name_Lamborghini', 'brand_name_Land',
       'brand_name_Lexus', 'brand_name_MG', 'brand_name_Mahindra',
       'brand_name_Maruti', 'brand_name_Maserati', 'brand_name_Mercedes-AMG',
       'brand_name_Mercedes-Benz', 'brand_name_Mini', 'brand_name_Mitsubishi',
       'brand_name_Nissan', 'brand_name_Opel', 'brand_name_OpelCorsa',
       'brand_name_Porsche', 'brand_name_Premier', 'brand_name_Renau

### 3.8transmission_type

In [30]:
print(df['transmission_type'].nunique())
print(df['transmission_type'].unique())
print(df['transmission_type'].value_counts())
print(df['transmission_type'].isna().sum())
print(df['transmission_type'].dtype)

2
['Manual' 'Automatic']
Manual       16025
Automatic     3949
Name: transmission_type, dtype: int64
0
object


In [31]:
# here we also apply one hot encodinf
df=pd.get_dummies(df,columns=['transmission_type'])
df.columns

Index(['full_name', 'selling_price', 'new_price', 'year', 'km_driven',
       'owner_type', 'mileage', 'engine', 'max_power', 'seats',
       'brand_name_Ambassador', 'brand_name_Audi', 'brand_name_BMW',
       'brand_name_Bentley', 'brand_name_Chevrolet', 'brand_name_DC',
       'brand_name_Daewoo', 'brand_name_Datsun', 'brand_name_Ferrari',
       'brand_name_Fiat', 'brand_name_Force', 'brand_name_Ford',
       'brand_name_Honda', 'brand_name_Hyundai', 'brand_name_ISUZU',
       'brand_name_Isuzu', 'brand_name_Jaguar', 'brand_name_Jeep',
       'brand_name_Kia', 'brand_name_Lamborghini', 'brand_name_Land',
       'brand_name_Lexus', 'brand_name_MG', 'brand_name_Mahindra',
       'brand_name_Maruti', 'brand_name_Maserati', 'brand_name_Mercedes-AMG',
       'brand_name_Mercedes-Benz', 'brand_name_Mini', 'brand_name_Mitsubishi',
       'brand_name_Nissan', 'brand_name_Opel', 'brand_name_OpelCorsa',
       'brand_name_Porsche', 'brand_name_Premier', 'brand_name_Renault',
       'brand_na

### 3.9 mileage

In [32]:
print(df['mileage'].nunique())
print(df['mileage'].unique())
print(df['mileage'].value_counts())
print(df['mileage'].isna().sum())
print(df['mileage'].dtype)

561
['Mileage19.7 kmpl' 'Mileage18.9 kmpl' 'Mileage17.0 kmpl'
 'Mileage20.92 kmpl' 'Mileage22.77 kmpl' 'Mileage20.36 kmpl'
 'Mileage20.51 kmpl' 'Mileage18.15 kmpl' 'Mileage18.49 kmpl'
 'Mileage19.09 kmpl' 'Mileage20.37 kmpl' 'Mileage16.6 kmpl'
 'Mileage19.34 kmpl' 'Mileage22.32 kmpl' 'Mileage19.64 kmpl'
 'Mileage14.41 kmpl' 'Mileage28.09 kmpl' 'Mileage25.2 kmpl'
 'Mileage19.27 kmpl' 'Mileage28.4 kmpl' 'Mileage12.99 kmpl'
 'Mileage21.4 kmpl' 'Mileage20.85 kmpl' 'Mileage17.8 kmpl'
 'Mileage16.09 kmpl' 'Mileage19.67 kmpl' 'Mileage14.02 kmpl'
 'Mileage17.4 kmpl' 'Mileage13.6 kmpl' 'Mileage26.59 kmpl'
 'Mileage17.7 kmpl' 'Mileage17.19 kmpl' 'Mileage13.0 kmpl'
 'Mileage23.01 kmpl' 'Mileage18.6 kmpl' 'Mileage17.92 kmpl'
 'Mileage15.1 kmpl' 'Mileage23.4 kmpl' 'Mileage14.0 kmpl'
 'Mileage22.1 kmpl' 'Mileage19.1 kmpl' 'Mileage22.54 kmpl'
 'Mileage18.0 kmpl' 'Mileage16.1 kmpl' 'Mileage20.5 kmpl'
 'Mileage12.2 kmpl' 'Mileage21.0 kmpl' 'Mileage26.0 kmpl'
 'Mileage17.6 kmpl' 'Mileage15.4 kmpl' 'Mile

160
object


In [33]:
df['mileage'].replace(r'[a-zA-Z]','',regex=True,inplace=True)

In [34]:
print(df['mileage'].nunique())
print(df['mileage'].unique())
print(df['mileage'].value_counts())
print(df['mileage'].isna().sum())
print(df['mileage'].dtype)

561
['19.7 ' '18.9 ' '17.0 ' '20.92 ' '22.77 ' '20.36 ' '20.51 ' '18.15 '
 '18.49 ' '19.09 ' '20.37 ' '16.6 ' '19.34 ' '22.32 ' '19.64 ' '14.41 '
 '28.09 ' '25.2 ' '19.27 ' '28.4 ' '12.99 ' '21.4 ' '20.85 ' '17.8 '
 '16.09 ' '19.67 ' '14.02 ' '17.4 ' '13.6 ' '26.59 ' '17.7 ' '17.19 '
 '13.0 ' '23.01 ' '18.6 ' '17.92 ' '15.1 ' '23.4 ' '14.0 ' '22.1 ' '19.1 '
 '22.54 ' '18.0 ' '16.1 ' '20.5 ' '12.2 ' '21.0 ' '26.0 ' '17.6 ' '15.4 '
 '20.89 ' '14.53 ' '26.6 /' '22.5 ' '25.0 ' '25.4 ' '16.8 ' '20.54 '
 '17.3 ' '24.0 ' '18.16 ' '21.21 ' '23.5 ' '25.17 ' '24.3 ' '18.5 '
 '23.84 ' '18.78 ' '11.74 ' '25.1 ' '21.01 ' '22.74 ' '21.9 ' '25.44 '
 '15.6 ' '23.59 ' '22.07 ' '22.0 ' '23.1 ' '23.0 ' '19.08 ' '18.27 '
 '20.45 ' '23.9 ' '14.66 ' '20.14 ' '19.81 ' '19.44 ' '18.59 ' '20.62 '
 '17.01 ' nan '17.5 ' '12.05 ' '25.83 ' '21.79 ' '13.7 ' '16.47 ' '20.7 '
 '17.71 ' '10.91 ' '25.8 ' '25.32 ' '14.49 ' '19.87 ' '18.7 ' '14.4 /'
 '18.2 ' '13.5 ' '22.69 ' '21.1 ' '13.01 ' '15.3 ' '20.73 ' '12.8 '
 '16

In [35]:
df['mileage'].replace(r'[/]+','',regex=True,inplace=True)

In [36]:
print(df['mileage'].unique())

['19.7 ' '18.9 ' '17.0 ' '20.92 ' '22.77 ' '20.36 ' '20.51 ' '18.15 '
 '18.49 ' '19.09 ' '20.37 ' '16.6 ' '19.34 ' '22.32 ' '19.64 ' '14.41 '
 '28.09 ' '25.2 ' '19.27 ' '28.4 ' '12.99 ' '21.4 ' '20.85 ' '17.8 '
 '16.09 ' '19.67 ' '14.02 ' '17.4 ' '13.6 ' '26.59 ' '17.7 ' '17.19 '
 '13.0 ' '23.01 ' '18.6 ' '17.92 ' '15.1 ' '23.4 ' '14.0 ' '22.1 ' '19.1 '
 '22.54 ' '18.0 ' '16.1 ' '20.5 ' '12.2 ' '21.0 ' '26.0 ' '17.6 ' '15.4 '
 '20.89 ' '14.53 ' '26.6 ' '22.5 ' '25.0 ' '25.4 ' '16.8 ' '20.54 '
 '17.3 ' '24.0 ' '18.16 ' '21.21 ' '23.5 ' '25.17 ' '24.3 ' '18.5 '
 '23.84 ' '18.78 ' '11.74 ' '25.1 ' '21.01 ' '22.74 ' '21.9 ' '25.44 '
 '15.6 ' '23.59 ' '22.07 ' '22.0 ' '23.1 ' '23.0 ' '19.08 ' '18.27 '
 '20.45 ' '23.9 ' '14.66 ' '20.14 ' '19.81 ' '19.44 ' '18.59 ' '20.62 '
 '17.01 ' nan '17.5 ' '12.05 ' '25.83 ' '21.79 ' '13.7 ' '16.47 ' '20.7 '
 '17.71 ' '10.91 ' '25.8 ' '25.32 ' '14.49 ' '19.87 ' '18.7 ' '14.4 '
 '18.2 ' '13.5 ' '22.69 ' '21.1 ' '13.01 ' '15.3 ' '20.73 ' '12.8 '
 '16.55 ' 

In [37]:

df['mileage']=df['mileage'].astype(float)
print(df['mileage'].dtype)

float64


In [38]:
df['mileage'].isna().sum()

160

In [39]:
s4=df['mileage'].fillna(df['mileage'].mean())
s4.isna().sum()

0

In [40]:
df['mileage']=s4

In [41]:
df['mileage'].isna().sum()

0

In [42]:
df.columns

Index(['full_name', 'selling_price', 'new_price', 'year', 'km_driven',
       'owner_type', 'mileage', 'engine', 'max_power', 'seats',
       'brand_name_Ambassador', 'brand_name_Audi', 'brand_name_BMW',
       'brand_name_Bentley', 'brand_name_Chevrolet', 'brand_name_DC',
       'brand_name_Daewoo', 'brand_name_Datsun', 'brand_name_Ferrari',
       'brand_name_Fiat', 'brand_name_Force', 'brand_name_Ford',
       'brand_name_Honda', 'brand_name_Hyundai', 'brand_name_ISUZU',
       'brand_name_Isuzu', 'brand_name_Jaguar', 'brand_name_Jeep',
       'brand_name_Kia', 'brand_name_Lamborghini', 'brand_name_Land',
       'brand_name_Lexus', 'brand_name_MG', 'brand_name_Mahindra',
       'brand_name_Maruti', 'brand_name_Maserati', 'brand_name_Mercedes-AMG',
       'brand_name_Mercedes-Benz', 'brand_name_Mini', 'brand_name_Mitsubishi',
       'brand_name_Nissan', 'brand_name_Opel', 'brand_name_OpelCorsa',
       'brand_name_Porsche', 'brand_name_Premier', 'brand_name_Renault',
       'brand_na

### 3.9 'engine'

In [43]:
print(df['engine'].nunique())
print(df['engine'].unique())
print(df['engine'].value_counts())
print(df['engine'].isna().sum())
print(df['engine'].dtype)

169
['Engine796 CC' 'Engine1197 CC' 'Engine998 CC' 'Engine1498 CC'
 'Engine1493 CC' 'Engine1405 CC' 'Engine1461 CC' 'Engine1198 CC'
 'Engine1582 CC' 'Engine1998 CC' 'Engine1248 CC' 'Engine2143 CC'
 'Engine2494 CC' 'Engine1598 CC' 'Engine2179 CC' 'Engine1497 CC'
 'Engine2523 CC' 'Engine2982 CC' 'Engine999 CC' 'Engine1199 CC'
 'Engine1086 CC' 'Engine2498 CC' 'Engine1396 CC' 'Engine1817 CC'
 'Engine624 CC' 'Engine1496 CC' 'Engine1590 CC' 'Engine1798 CC'
 'Engine1186 CC' 'Engine1794 CC' 'Engine1997 CC' 'Engine1120 CC'
 'Engine1196 CC' 'Engine1061 CC' 'Engine909 CC' 'Engine799 CC'
 'Engine1796 CC' 'Engine936 CC' 'Engine1364 CC' 'Engine2199 CC'
 'Engine2993 CC' 'Engine1298 CC' 'Engine3198 CC' 'Engine1399 CC'
 'Engine2477 CC' 'Engine1995 CC' 'Engine2956 CC' 'Engine1373 CC'
 'Engine1591 CC' 'Engine2354 CC' 'Engine1968 CC' 'Engine1047 CC'
 'Engine1353 CC' 'Engine1991 CC' 'Engine2755 CC' 'Engine1395 CC'
 'Engine1193 CC' 'Engine1896 CC' 'Engine1999 CC' 'Engine2953 CC'
 'Engine2393 CC' 'Engine2967

In [44]:
df['engine'].replace(r'[,*a-zA-Z/]+','',regex=True,inplace=True)

In [45]:
print(df['engine'].nunique())
print(df['engine'].unique())
print(df['engine'].value_counts())
print(df['engine'].isna().sum())
print(df['engine'].dtype)

169
['796 ' '1197 ' '998 ' '1498 ' '1493 ' '1405 ' '1461 ' '1198 ' '1582 '
 '1998 ' '1248 ' '2143 ' '2494 ' '1598 ' '2179 ' '1497 ' '2523 ' '2982 '
 '999 ' '1199 ' '1086 ' '2498 ' '1396 ' '1817 ' '624 ' '1496 ' '1590 '
 '1798 ' '1186 ' '1794 ' '1997 ' '1120 ' '1196 ' '1061 ' '909 ' '799 '
 '1796 ' '936 ' '1364 ' '2199 ' '2993 ' '1298 ' '3198 ' '1399 ' '2477 '
 '1995 ' '2956 ' '1373 ' '1591 ' '2354 ' '1968 ' '1047 ' '1353 ' '1991 '
 '2755 ' '1395 ' '1193 ' '1896 ' '1999 ' '2953 ' '2393 ' '2967 ' '2998 '
 '2995 ' '2694 ' '4461 ' '1586 ' '6752 ' '1956 ' '1462 ' '814 ' '995 '
 '1799 ' '2499 ' '1451 ' '1194 ' '1969 ' '2400 ' nan '993 ' '1150 '
 '1299 ' '1495 ' '2487 ' '3604 ' '1499 ' '2987 ' '970 ' '1489 ' '1599 '
 '2835 ' '1172 ' '1950 ' '2489 ' '2148 ' '4134 ' '4951 ' '1984 ' '2979 '
 '2609 ' '1242 ' '1390 ' '2092 ' '2997 ' '1368 ' '5998 ' '1330 ' '2359 '
 '2200 ' '1596 ' '3598 ' '2362 ' '793 ' '2696 ' '2112 ' '3597 ' '1388 '
 '72 ' '2360 ' '5461 ' '1343 ' '2446 ' '2497 ' '1595 ' '4367 ' 

In [46]:
df['engine']=df['engine'].astype(float)
df['engine']=df['engine'].fillna(df['engine'].mean())

In [47]:
print(df['engine'].isna().sum())
print(df['engine'].dtype)

0
float64


In [48]:
df.columns

Index(['full_name', 'selling_price', 'new_price', 'year', 'km_driven',
       'owner_type', 'mileage', 'engine', 'max_power', 'seats',
       'brand_name_Ambassador', 'brand_name_Audi', 'brand_name_BMW',
       'brand_name_Bentley', 'brand_name_Chevrolet', 'brand_name_DC',
       'brand_name_Daewoo', 'brand_name_Datsun', 'brand_name_Ferrari',
       'brand_name_Fiat', 'brand_name_Force', 'brand_name_Ford',
       'brand_name_Honda', 'brand_name_Hyundai', 'brand_name_ISUZU',
       'brand_name_Isuzu', 'brand_name_Jaguar', 'brand_name_Jeep',
       'brand_name_Kia', 'brand_name_Lamborghini', 'brand_name_Land',
       'brand_name_Lexus', 'brand_name_MG', 'brand_name_Mahindra',
       'brand_name_Maruti', 'brand_name_Maserati', 'brand_name_Mercedes-AMG',
       'brand_name_Mercedes-Benz', 'brand_name_Mini', 'brand_name_Mitsubishi',
       'brand_name_Nissan', 'brand_name_Opel', 'brand_name_OpelCorsa',
       'brand_name_Porsche', 'brand_name_Premier', 'brand_name_Renault',
       'brand_na

### 3.10 max_power

In [49]:
print(df['max_power'].nunique())
print(df['max_power'].unique())
print(df['max_power'].value_counts())
print(df['max_power'].isna().sum())
print(df['max_power'].dtype)

493
['Max Power46.3 bhp' 'Max Power82 bhp' 'Max Power80 bhp'
 'Max Power67.1 bhp' 'Max Power98.59 bhp' 'Max Power78.9 bhp'
 'Max Power67.04 bhp' 'Max Power118.35 bhp' 'Max Power100 bhp'
 'Max Power69.01 bhp' 'Max Power108.49 bhp' 'Max Power85 bhp'
 'Max Power76 bhp' 'Max Power126.32 bhp' 'Max Power108.45 bhp'
 'Max Power189.08 bhp' 'Max Power88.5 bhp' 'Max Power74 bhp'
 'Max Power170 bhp' 'Max Power100.6 bhp' 'Max Power83.1 bhp'
 'Max Power83.14 bhp' 'Max Power103.2 bhp' 'Max Power126.2 bhp'
 'Max Power118.3 bhp' 'Max Power117.3 bhp' 'Max Power63 bhp'
 'Max Power78 bhp' 'Max Power81.86 bhp' 'Max Power168.5 bhp'
 'Max Power67 bhp' 'Max Power88.76 bhp' 'Max Power62.1 bhp'
 'Max Power140 bhp' 'Max Power112 bhp' 'Max Power88.73 bhp'
 'Max Power81.83 bhp' 'Max Power37 bhp' 'Max Power69 bhp'
 'Max Power91.7 bhp' 'Max Power75 bhp' nan 'Max Power74.96 bhp'
 'Max Power35 bhp' 'Max Power94 bhp' 'Max Power81.80 bhp'
 'Max Power138.1 bhp' 'Max Power58.16 bhp' 'Max Power68 bhp'
 'Max Power126.3 bhp

In [50]:
df['max_power'].replace(r'[,*a-zA-Z/]+','',regex=True,inplace=True)
print(df['max_power'].unique())

[' 46.3 ' ' 82 ' ' 80 ' ' 67.1 ' ' 98.59 ' ' 78.9 ' ' 67.04 ' ' 118.35 '
 ' 100 ' ' 69.01 ' ' 108.49 ' ' 85 ' ' 76 ' ' 126.32 ' ' 108.45 '
 ' 189.08 ' ' 88.5 ' ' 74 ' ' 170 ' ' 100.6 ' ' 83.1 ' ' 83.14 ' ' 103.2 '
 ' 126.2 ' ' 118.3 ' ' 117.3 ' ' 63 ' ' 78 ' ' 81.86 ' ' 168.5 ' ' 67 '
 ' 88.76 ' ' 62.1 ' ' 140 ' ' 112 ' ' 88.73 ' ' 81.83 ' ' 37 ' ' 69 '
 ' 91.7 ' ' 75 ' nan ' 74.96 ' ' 35 ' ' 94 ' ' 81.80 ' ' 138.1 ' ' 58.16 '
 ' 68 ' ' 126.3 ' ' 70 ' ' 73.97 ' ' 34.2 ' ' 120 ' ' 103.6 ' ' 121 '
 ' 113.42 ' ' 152.87 ' ' 86.8 ' ' 86.7 ' ' 64 ' ' 45 ' ' 116.3 ' ' 53.3 '
 ' 82.9 ' ' 84 ' ' 186 ' ' 98.6 ' ' 113.4 ' ' 47.3 ' ' 90 ' ' 56.3 '
 ' 85.8 ' ' 67.05 ' ' 73.9 ' ' 82.85 ' ' 98.96 ' ' 83.8 ' ' 194.3 '
 ' 68.05 ' ' 261.49 ' ' 76.8 ' ' 74.02 ' ' 99 ' ' 153.86 ' ' 197 ' ' 77 '
 ' 92.7 ' ' 88.7 ' ' 61.7 ' ' 82.4 ' ' 175.56 ' ' 190 ' ' 85.80 '
 ' 83.83 ' ' 91.1 ' ' 121.3 ' ' 177.6 ' ' 118 ' ' 188 ' ' 147.51 '
 ' 88.8 ' ' 138 ' ' 105.5 ' ' 174.33 ' ' 88.50 ' ' 84.8 ' ' 63.1 '
 ' 67.06 ' ' 3

In [51]:
df['max_power']=df['max_power'].astype(float)
df['max_power']=df['max_power'].fillna(df['max_power'].mean())

In [52]:
print(df['max_power'].isna().sum())
print(df['max_power'].dtype)

0
float64


In [53]:
df.columns

Index(['full_name', 'selling_price', 'new_price', 'year', 'km_driven',
       'owner_type', 'mileage', 'engine', 'max_power', 'seats',
       'brand_name_Ambassador', 'brand_name_Audi', 'brand_name_BMW',
       'brand_name_Bentley', 'brand_name_Chevrolet', 'brand_name_DC',
       'brand_name_Daewoo', 'brand_name_Datsun', 'brand_name_Ferrari',
       'brand_name_Fiat', 'brand_name_Force', 'brand_name_Ford',
       'brand_name_Honda', 'brand_name_Hyundai', 'brand_name_ISUZU',
       'brand_name_Isuzu', 'brand_name_Jaguar', 'brand_name_Jeep',
       'brand_name_Kia', 'brand_name_Lamborghini', 'brand_name_Land',
       'brand_name_Lexus', 'brand_name_MG', 'brand_name_Mahindra',
       'brand_name_Maruti', 'brand_name_Maserati', 'brand_name_Mercedes-AMG',
       'brand_name_Mercedes-Benz', 'brand_name_Mini', 'brand_name_Mitsubishi',
       'brand_name_Nissan', 'brand_name_Opel', 'brand_name_OpelCorsa',
       'brand_name_Porsche', 'brand_name_Premier', 'brand_name_Renault',
       'brand_na

### seat

In [54]:
df['seats'].replace(r'[*,a-zA-Z/]+','',regex=True,inplace=True)


In [55]:
print(df['seats'].nunique())
print(df['seats'].unique())
print(df['seats'].value_counts())
print(df['seats'].isna().sum())
print(df['seats'].dtype)

9
['5' '7' '8' '4' '6' nan '2' '9' '10' '14']
5     16585
7      2370
8       443
4       225
6       143
9        72
10       26
2        21
14        2
Name: seats, dtype: int64
87
object


In [56]:
df['seats']=df['seats'].astype(float)
df['seats']=df['seats'].fillna(df['seats'].mean())

In [57]:
print(df['seats'].isna().sum())
print(df['seats'].dtype)

0
float64


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19974 entries, 0 to 19973
Data columns (total 64 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   full_name                     19974 non-null  object 
 1   selling_price                 19974 non-null  object 
 2   new_price                     9564 non-null   object 
 3   year                          19974 non-null  int64  
 4   km_driven                     19974 non-null  float64
 5   owner_type                    19974 non-null  object 
 6   mileage                       19974 non-null  float64
 7   engine                        19974 non-null  float64
 8   max_power                     19974 non-null  float64
 9   seats                         19974 non-null  float64
 10  brand_name_Ambassador         19974 non-null  uint8  
 11  brand_name_Audi               19974 non-null  uint8  
 12  brand_name_BMW                19974 non-null  uint8  
 13  b

### step5: Feature Selection

In [59]:
df.columns

Index(['full_name', 'selling_price', 'new_price', 'year', 'km_driven',
       'owner_type', 'mileage', 'engine', 'max_power', 'seats',
       'brand_name_Ambassador', 'brand_name_Audi', 'brand_name_BMW',
       'brand_name_Bentley', 'brand_name_Chevrolet', 'brand_name_DC',
       'brand_name_Daewoo', 'brand_name_Datsun', 'brand_name_Ferrari',
       'brand_name_Fiat', 'brand_name_Force', 'brand_name_Ford',
       'brand_name_Honda', 'brand_name_Hyundai', 'brand_name_ISUZU',
       'brand_name_Isuzu', 'brand_name_Jaguar', 'brand_name_Jeep',
       'brand_name_Kia', 'brand_name_Lamborghini', 'brand_name_Land',
       'brand_name_Lexus', 'brand_name_MG', 'brand_name_Mahindra',
       'brand_name_Maruti', 'brand_name_Maserati', 'brand_name_Mercedes-AMG',
       'brand_name_Mercedes-Benz', 'brand_name_Mini', 'brand_name_Mitsubishi',
       'brand_name_Nissan', 'brand_name_Opel', 'brand_name_OpelCorsa',
       'brand_name_Porsche', 'brand_name_Premier', 'brand_name_Renault',
       'brand_na

In [60]:
final_df=df.drop(['full_name', 'selling_price', 'new_price', 'year','owner_type',],axis=1)
final_df.head()

Unnamed: 0,km_driven,mileage,engine,max_power,seats,brand_name_Ambassador,brand_name_Audi,brand_name_BMW,brand_name_Bentley,brand_name_Chevrolet,...,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,fuel_type_CNG,fuel_type_Diesel,fuel_type_Electric,fuel_type_LPG,fuel_type_Petrol,transmission_type_Automatic,transmission_type_Manual
0,120000.0,19.7,796.0,46.3,5.0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
1,20000.0,18.9,1197.0,82.0,5.0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
2,60000.0,17.0,1197.0,80.0,5.0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
3,37000.0,20.92,998.0,67.1,5.0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
4,30000.0,22.77,1498.0,98.59,5.0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,1


In [61]:
final_df.isna().sum()

km_driven                       0
mileage                         0
engine                          0
max_power                       0
seats                           0
brand_name_Ambassador           0
brand_name_Audi                 0
brand_name_BMW                  0
brand_name_Bentley              0
brand_name_Chevrolet            0
brand_name_DC                   0
brand_name_Daewoo               0
brand_name_Datsun               0
brand_name_Ferrari              0
brand_name_Fiat                 0
brand_name_Force                0
brand_name_Ford                 0
brand_name_Honda                0
brand_name_Hyundai              0
brand_name_ISUZU                0
brand_name_Isuzu                0
brand_name_Jaguar               0
brand_name_Jeep                 0
brand_name_Kia                  0
brand_name_Lamborghini          0
brand_name_Land                 0
brand_name_Lexus                0
brand_name_MG                   0
brand_name_Mahindra             0
brand_name_Mar

In [62]:
final_df.dtypes

km_driven                       float64
mileage                         float64
engine                          float64
max_power                       float64
seats                           float64
brand_name_Ambassador             uint8
brand_name_Audi                   uint8
brand_name_BMW                    uint8
brand_name_Bentley                uint8
brand_name_Chevrolet              uint8
brand_name_DC                     uint8
brand_name_Daewoo                 uint8
brand_name_Datsun                 uint8
brand_name_Ferrari                uint8
brand_name_Fiat                   uint8
brand_name_Force                  uint8
brand_name_Ford                   uint8
brand_name_Honda                  uint8
brand_name_Hyundai                uint8
brand_name_ISUZU                  uint8
brand_name_Isuzu                  uint8
brand_name_Jaguar                 uint8
brand_name_Jeep                   uint8
brand_name_Kia                    uint8
brand_name_Lamborghini            uint8


### step6: Model Traning

In [63]:
# Data Splitting
x=final_df.drop('price',axis=1)
y=final_df['price']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=5)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(13981, 58)
(5993, 58)
(13981,)
(5993,)


In [64]:
# Linear Regression
lin_model=LinearRegression()
lin_model.fit(x_train,y_train)

### step7: Model Evalution

In [65]:
#testing data 
y_test_predict=lin_model.predict(x_test)
def model_eval(actual,pred):
    mse = mean_squared_error(actual,pred)
    print(f"MSE = {mse}")
    print(f"RMSE = {np.sqrt(mse)}")
    
    mae = mean_absolute_error(actual, pred)
    print(f"MAE = {mae}")
    
    r2 = r2_score(actual,pred)
    print(f"R - Sequared Value = {r2}")
    
model_eval(y_test, y_test_predict)

MSE = 389966438137.83496
RMSE = 624472.9282665782
MAE = 218884.12292085265
R - Sequared Value = 0.612120225271424


In [66]:
#traning data
y_train_predict=lin_model.predict(x_train)
model_eval(y_train,y_train_predict)

MSE = 181247957539.61304
RMSE = 425732.26039332873
MAE = 209893.40101361906
R - Sequared Value = 0.7593019401775254


In [67]:
#from above traning=75% &testing=61 >>>high bias & high variance

### By using KNN ALgorithm:
    

In [68]:
norm=MinMaxScaler()
norm_x=norm.fit_transform(x)
norm_x

array([[0.03155346, 0.13534483, 0.10838323, ..., 1.        , 0.        ,
        1.        ],
       [0.00523698, 0.12844828, 0.16841317, ..., 1.        , 0.        ,
        1.        ],
       [0.01576357, 0.11206897, 0.16841317, ..., 1.        , 0.        ,
        1.        ],
       ...,
       [0.01760573, 0.14775862, 0.21347305, ..., 0.        , 0.        ,
        1.        ],
       [1.        , 0.10344828, 0.31541916, ..., 0.        , 0.        ,
        1.        ],
       [0.00339483, 0.12068966, 0.21332335, ..., 1.        , 1.        ,
        0.        ]])

In [69]:
x_train,x_test,y_train,y_test=train_test_split(norm_x,y,test_size=0.3,random_state=5)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(13981, 58)
(5993, 58)
(13981,)
(5993,)


In [70]:
#model traning
knn=KNeighborsRegressor()
knn.fit(x_train,y_train)

In [71]:
# Model Evalution
#testing data 
y_test_predict=knn.predict(x_test)
model_eval(y_test, y_test_predict)

MSE = 256281046678.9569
RMSE = 506242.0830778067
MAE = 124636.01114633739
R - Sequared Value = 0.7450902823132128


In [72]:
#traning data
y_train_predict=knn.predict(x_train)
model_eval(y_train,y_train_predict)

MSE = 79075565646.22235
RMSE = 281203.7795731458
MAE = 97022.15742793791
R - Sequared Value = 0.8949873119190843


In [73]:
#from above traning=89% &testing=74 >>>low bias & high variance>>overfitted

### Hypeerparameter of knn

In [74]:
knn_model=KNeighborsRegressor()
hyp={'n_neighbors':np.arange(5,15),'p':[1,2]}
gscv=GridSearchCV(knn_model,hyp,cv=5)
gscv.fit(x_train,y_train)

In [75]:
#  n_neighbors=5,
#     *,
#     weights='uniform',
#     algorithm='auto',
#     leaf_size=30,
#     p=2,
#     metric='minkowski',
#     metric_params=None,
#     n_jobs=None,
# )

In [76]:
gscv.best_estimator_

In [77]:
gscv.best_params_

{'n_neighbors': 5, 'p': 1}

In [78]:
knn_hyp=gscv.best_estimator_
knn_hyp.fit(x_train,y_train)

In [79]:
# Model Evalution
#testing data 
y_test_predict=knn_hyp.predict(x_test)
model_eval(y_test, y_test_predict)

MSE = 274305981094.56186
RMSE = 523742.2849976521
MAE = 124787.454864008
R - Sequared Value = 0.7271617971491868


In [80]:
#traning data
y_train_predict=knn_hyp.predict(x_train)
model_eval(y_train,y_train_predict)

MSE = 80360152683.76761
RMSE = 283478.6635423689
MAE = 96366.53873113511
R - Sequared Value = 0.8932813748602202


In [81]:
#from above traning=89% &testing=72>>>low bias & high variance>>overfitted

### By Using Decision tree

In [82]:
x=final_df.drop('price',axis=1)
y=final_df['price']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=5)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(13981, 58)
(5993, 58)
(13981,)
(5993,)


In [83]:
dt=DecisionTreeRegressor()
dt.fit(x_train,y_train)

In [84]:
# Model Evalution
#testing data 
y_test_predict=dt.predict(x_test)
model_eval(y_test, y_test_predict)

MSE = 312174368500.6117
RMSE = 558725.6647949973
MAE = 142541.21710328717
R - Sequared Value = 0.6894960389200098


In [85]:
#traning data
y_train_predict=dt.predict(x_train)
model_eval(y_train,y_train_predict)

MSE = 321951661.77869254
RMSE = 17943.011502495687
MAE = 4129.931647712372
R - Sequared Value = 0.9995724468214776


In [86]:
#from above traning=99% &testing=81>>>low bias & high variance>>overfitted

### hypertune by decision tree

In [89]:
dt_model=DecisionTreeRegressor(random_state=7)
hyp={'criterion':['squared_error','absolute_error'],'max_depth':np.arange(3,7),
    'min_samples_split':np.arange(5,7),'min_samples_leaf':np.arange(7,8),}

rscv=RandomizedSearchCV(dt_model,hyp,cv=7)
rscv.fit(x_train,y_train)

In [None]:
# criterion='squared_error',
#     splitter='best',
#     max_depth=None,
#     min_samples_split=2,
#     min_samples_leaf=1,
#     min_weight_fraction_leaf=0.0,
#     max_features=None,
#     random_state=None,
#     max_leaf_nodes=None,
#     min_impurity_decrease=0.0,
#     ccp_alpha=0.0,
# )

In [90]:
rscv.best_estimator_

In [91]:
dt_hyp=rscv.best_estimator_
dt_hyp.fit(x_train,y_train)

In [92]:
# Model Evalution
#testing data 
y_test_predict=dt_hyp.predict(x_test)
model_eval(y_test, y_test_predict)

MSE = 292343616961.5856
RMSE = 540688.0958201184
MAE = 170509.04071590013
R - Sequared Value = 0.7092206784976778


In [93]:
#traning data
y_train_predict=dt_hyp.predict(x_train)
model_eval(y_train,y_train_predict)

MSE = 103910015198.55235
RMSE = 322350.7642282741
MAE = 152680.31987644386
R - Sequared Value = 0.8620070571060141


In [94]:
#from above traning=93% &testing=74>>>low bias & high variance>>overfitted

### By using Random Forest

In [95]:
rf=RandomForestRegressor(random_state=3)
rf.fit(x_train,y_train)

In [96]:
# Model Evalution
#testing data 
y_test_predict=rf.predict(x_test)
model_eval(y_test, y_test_predict)

MSE = 206275139966.06183
RMSE = 454175.23046293686
MAE = 110125.68806198362
R - Sequared Value = 0.7948286134463147


In [97]:
#traning data
y_train_predict=rf.predict(x_train)
model_eval(y_train,y_train_predict)

MSE = 11212894568.343557
RMSE = 105890.95602714879
MAE = 40690.31192980313
R - Sequared Value = 0.9851092282405203


In [98]:
#from above traning=98% &testing=79>>>low bias & high variance>>overfitted

### hypertune by random forest

In [100]:
# rf=RandomForestRegressor(random_state=10)
# hyp={'n_estimators':np.arange(50,500,10),'criterion':['squared_error','absolute_error'],'max_depth':np.arange(5,9),
#     'min_samples_split':np.arange(3,8),'min_samples_leaf':np.arange(5,8),'bootstrap':[True,False],
#     'oob_score':[False,True]}
# rscv=RandomizedSearchCV(rf,hyp,cv=5)
# rscv.fit(x_train,y_train)

In [None]:
rscv=

In [None]:
rf_hyp=

In [None]:
# Model Evalution
#testing data 
y_test_predict=rf.predict(x_test)
model_eval(y_test, y_test_predict)

In [None]:
#traning data
y_train_predict=rf.predict(x_train)
model_eval(y_train,y_train_predict)

In [None]:
#from above traning=89% &testing=72>>>low bias & high variance>>overfitted
#from above traning=89% &testing=74 >>>low bias & high variance>>overfitted
#from above traning=99% &testing=81>>>low bias & high variance>>overfitted
#from above traning=93% &testing=74>>>low bias & high variance>>overfitted
#from above traning=75% &testing=61 >>>high bias & high variance
#from above traning=98% &testing=79>>>low bias & high variance>>overfitted

In [101]:
col_list=list(x.columns)
col_dict={'column':col_list}
col_dict

{'column': ['km_driven',
  'mileage',
  'engine',
  'max_power',
  'seats',
  'brand_name_Ambassador',
  'brand_name_Audi',
  'brand_name_BMW',
  'brand_name_Bentley',
  'brand_name_Chevrolet',
  'brand_name_DC',
  'brand_name_Daewoo',
  'brand_name_Datsun',
  'brand_name_Ferrari',
  'brand_name_Fiat',
  'brand_name_Force',
  'brand_name_Ford',
  'brand_name_Honda',
  'brand_name_Hyundai',
  'brand_name_ISUZU',
  'brand_name_Isuzu',
  'brand_name_Jaguar',
  'brand_name_Jeep',
  'brand_name_Kia',
  'brand_name_Lamborghini',
  'brand_name_Land',
  'brand_name_Lexus',
  'brand_name_MG',
  'brand_name_Mahindra',
  'brand_name_Maruti',
  'brand_name_Maserati',
  'brand_name_Mercedes-AMG',
  'brand_name_Mercedes-Benz',
  'brand_name_Mini',
  'brand_name_Mitsubishi',
  'brand_name_Nissan',
  'brand_name_Opel',
  'brand_name_OpelCorsa',
  'brand_name_Porsche',
  'brand_name_Premier',
  'brand_name_Renault',
  'brand_name_Rolls-Royce',
  'brand_name_Skoda',
  'brand_name_Tata',
  'brand_name_To

In [103]:
final_df.columns

Index(['km_driven', 'mileage', 'engine', 'max_power', 'seats',
       'brand_name_Ambassador', 'brand_name_Audi', 'brand_name_BMW',
       'brand_name_Bentley', 'brand_name_Chevrolet', 'brand_name_DC',
       'brand_name_Daewoo', 'brand_name_Datsun', 'brand_name_Ferrari',
       'brand_name_Fiat', 'brand_name_Force', 'brand_name_Ford',
       'brand_name_Honda', 'brand_name_Hyundai', 'brand_name_ISUZU',
       'brand_name_Isuzu', 'brand_name_Jaguar', 'brand_name_Jeep',
       'brand_name_Kia', 'brand_name_Lamborghini', 'brand_name_Land',
       'brand_name_Lexus', 'brand_name_MG', 'brand_name_Mahindra',
       'brand_name_Maruti', 'brand_name_Maserati', 'brand_name_Mercedes-AMG',
       'brand_name_Mercedes-Benz', 'brand_name_Mini', 'brand_name_Mitsubishi',
       'brand_name_Nissan', 'brand_name_Opel', 'brand_name_OpelCorsa',
       'brand_name_Porsche', 'brand_name_Premier', 'brand_name_Renault',
       'brand_name_Rolls-Royce', 'brand_name_Skoda', 'brand_name_Tata',
       'brand_na

In [104]:
import json
with open('col_dict.json','w') as file:
    json.dump(col_dict,file)

In [105]:
import pickle
with open('model.pkl','wb') as file:
    pickle.dump(knn,file)