In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report


In [2]:
# Load dataset
df = pd.read_excel("cars_data.xlsx")
df.head()

Unnamed: 0,Name,Price,Rating,city,Kilometers,Year_of_Purchase,Owner,Fuel_Type,Transmission,RTO,Insurance,Insurance_Type
0,2010 Maruti Alto LXI,"₹ 174,699",4out of 5,3686,"Kilometers34,854 km",Year of PurchaseMay 2010,OwnerFirst Owner,FuelPetrol,TransmissionMANUAL,RTOAP09,,Insurance TypeInsurance Expired
1,2013 Maruti Wagon R 1.0 VXI,"₹ 333,999",4.2out of 5,3686,"Kilometers39,541 km",Year of PurchaseJuly 2013,OwnerSecond Owner,FuelPetrol,TransmissionMANUAL,RTOAP28,Insurance6/8/2021,Insurance TypeComp
2,2014 Maruti Wagon R 1.0 VXI,"₹ 353,199",4.3out of 5,3686,"Kilometers23,233 km",Year of PurchaseMarch 2014,OwnerSecond Owner,FuelPetrol,TransmissionMANUAL,RTOAP28,Insurance28/4/2021,Insurance TypeComp
3,2013 Hyundai Eon D LITE PLUS,"₹ 237,899",4.4out of 5,3686,"Kilometers27,748 km",Year of PurchaseNovember 2013,OwnerFirst Owner,FuelPetrol,TransmissionMANUAL,RTOAP13,Insurance1/1/2022,Insurance TypeComp
4,2017 Hyundai Eon ERA PLUS,"₹ 300,699",4.4out of 5,3686,"Kilometers12,238 km",Year of PurchaseAugust 2017,OwnerSecond Owner,FuelPetrol,TransmissionMANUAL,RTOTS10,,Insurance TypeExpired


In [3]:
df.isna().sum()

Name                   0
Price                  0
Rating                 0
city                   0
Kilometers             0
Year_of_Purchase       0
Owner                  0
Fuel_Type              0
Transmission        2816
RTO                    0
Insurance           9193
Insurance_Type         0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32158 entries, 0 to 32157
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Name              32158 non-null  object
 1   Price             32158 non-null  object
 2   Rating            32158 non-null  object
 3   city              32158 non-null  int64 
 4   Kilometers        32158 non-null  object
 5   Year_of_Purchase  32158 non-null  object
 6   Owner             32158 non-null  object
 7   Fuel_Type         32158 non-null  object
 8   Transmission      29342 non-null  object
 9   RTO               32158 non-null  object
 10  Insurance         22965 non-null  object
 11  Insurance_Type    32158 non-null  object
dtypes: int64(1), object(11)
memory usage: 2.9+ MB


In [5]:
df['Price'] = df['Price'].apply(lambda x: x.replace('₹', ''))
df['Price'] = df['Price'].apply(lambda x: x.replace(',', ''))
df['Rating'] = df['Rating'].apply(lambda x: x.replace('out of 5', ''))
df['Kilometers'] = df['Kilometers'].apply(lambda x: x.replace('Kilometers', ''))
df['Kilometers'] = df['Kilometers'].apply(lambda x: x.replace('km', ''))
df['Kilometers'] = df['Kilometers'].apply(lambda x: x.replace(',', ''))
df['Year_of_Purchase'] = df['Year_of_Purchase'].apply(lambda x: x.replace('Year of Purchase', ''))
df['Fuel_Type'] = df['Fuel_Type'].apply(lambda x: x.replace('Fuel', ''))
df['Owner']= df['Owner'].apply(lambda x: x.replace('Owner', ''))
# df['Transmission'] = df['Transmission'].apply(lambda x: x.replace('Transmission', ''))
df['RTO']= df['RTO'].apply(lambda x: x.replace('RTO', ''))
df['Insurance_Type'] = df['Insurance_Type'].apply(lambda x: x.replace('Insurance Type', ''))


In [6]:
df

Unnamed: 0,Name,Price,Rating,city,Kilometers,Year_of_Purchase,Owner,Fuel_Type,Transmission,RTO,Insurance,Insurance_Type
0,2010 Maruti Alto LXI,174699,4,3686,34854,May 2010,First,Petrol,TransmissionMANUAL,AP09,,Insurance Expired
1,2013 Maruti Wagon R 1.0 VXI,333999,4.2,3686,39541,July 2013,Second,Petrol,TransmissionMANUAL,AP28,Insurance6/8/2021,Comp
2,2014 Maruti Wagon R 1.0 VXI,353199,4.3,3686,23233,March 2014,Second,Petrol,TransmissionMANUAL,AP28,Insurance28/4/2021,Comp
3,2013 Hyundai Eon D LITE PLUS,237899,4.4,3686,27748,November 2013,First,Petrol,TransmissionMANUAL,AP13,Insurance1/1/2022,Comp
4,2017 Hyundai Eon ERA PLUS,300699,4.4,3686,12238,August 2017,Second,Petrol,TransmissionMANUAL,TS10,,Expired
...,...,...,...,...,...,...,...,...,...,...,...,...
32153,2008 Ford Fiesta 1.6 ZXI,150000,3.4,769,95792,February 2008,Third,Petrol,TransmissionMANUAL,CH04,,Insurance Expired
32154,2015 Chevrolet Beat LS DIESEL,250000,3.5,769,54505,November 2015,Second,Diesel,TransmissionMANUAL,PB13,,Insurance Expired
32155,2008 Ford Endeavour XLT TDCI 4X2,500000,3.6,769,161322,January 2008,Second,Diesel,TransmissionMANUAL,HR03,,Insurance Expired
32156,2016 Maruti Swift Dzire VDI ABS,600000,4.7,769,52006,October 2016,First,Diesel,,PB65,Insurance27/10/2019,Zero Depreciation


In [7]:
df.tail

<bound method NDFrame.tail of                                         Name    Price Rating  city Kilometers  \
0                       2010 Maruti Alto LXI   174699      4  3686     34854    
1                2013 Maruti Wagon R 1.0 VXI   333999    4.2  3686     39541    
2                2014 Maruti Wagon R 1.0 VXI   353199    4.3  3686     23233    
3               2013 Hyundai Eon D LITE PLUS   237899    4.4  3686     27748    
4                  2017 Hyundai Eon ERA PLUS   300699    4.4  3686     12238    
...                                      ...      ...    ...   ...        ...   
32153              2008 Ford Fiesta 1.6 ZXI    150000    3.4   769     95792    
32154          2015 Chevrolet Beat LS DIESEL   250000    3.5   769     54505    
32155       2008 Ford Endeavour XLT TDCI 4X2   500000    3.6   769    161322    
32156        2016 Maruti Swift Dzire VDI ABS   600000    4.7   769     52006    
32157  2007 Mercedes Benz C Class 220 CDI AT   484000    3.8   769     88818   

In [8]:
df.drop('Transmission',inplace=True,axis=1)

In [9]:
df.drop('Insurance',inplace=True,axis=1)

In [10]:
df['Car_Name'] = df['Name'].apply(lambda x: ''.join(x.split()[1]))
df['Year'] = df['Name'].str.extract(r'(^\d{4})')
df['Model'] = df['Name'].apply(lambda x: ''.join(x.split()[2:]))

In [11]:
df.drop('Name',inplace=True,axis=1)

In [12]:
df

Unnamed: 0,Price,Rating,city,Kilometers,Year_of_Purchase,Owner,Fuel_Type,RTO,Insurance_Type,Car_Name,Year,Model
0,174699,4,3686,34854,May 2010,First,Petrol,AP09,Insurance Expired,Maruti,2010,AltoLXI
1,333999,4.2,3686,39541,July 2013,Second,Petrol,AP28,Comp,Maruti,2013,WagonR1.0VXI
2,353199,4.3,3686,23233,March 2014,Second,Petrol,AP28,Comp,Maruti,2014,WagonR1.0VXI
3,237899,4.4,3686,27748,November 2013,First,Petrol,AP13,Comp,Hyundai,2013,EonDLITEPLUS
4,300699,4.4,3686,12238,August 2017,Second,Petrol,TS10,Expired,Hyundai,2017,EonERAPLUS
...,...,...,...,...,...,...,...,...,...,...,...,...
32153,150000,3.4,769,95792,February 2008,Third,Petrol,CH04,Insurance Expired,Ford,2008,Fiesta1.6ZXI
32154,250000,3.5,769,54505,November 2015,Second,Diesel,PB13,Insurance Expired,Chevrolet,2015,BeatLSDIESEL
32155,500000,3.6,769,161322,January 2008,Second,Diesel,HR03,Insurance Expired,Ford,2008,EndeavourXLTTDCI4X2
32156,600000,4.7,769,52006,October 2016,First,Diesel,PB65,Zero Depreciation,Maruti,2016,SwiftDzireVDIABS


In [13]:
df['Price']= df['Price'].astype(int)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32158 entries, 0 to 32157
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Price             32158 non-null  int32 
 1   Rating            32158 non-null  object
 2   city              32158 non-null  int64 
 3   Kilometers        32158 non-null  object
 4   Year_of_Purchase  32158 non-null  object
 5   Owner             32158 non-null  object
 6   Fuel_Type         32158 non-null  object
 7   RTO               32158 non-null  object
 8   Insurance_Type    32158 non-null  object
 9   Car_Name          32158 non-null  object
 10  Year              32158 non-null  object
 11  Model             32158 non-null  object
dtypes: int32(1), int64(1), object(10)
memory usage: 2.8+ MB


In [15]:
df['Rating']= df['Rating'].astype(float)

In [16]:
df['city']= df['city'].astype(int)

In [17]:
df['Kilometers']= df['Kilometers'].astype(int)

In [18]:
df['Year_of_Purchase']= df['Year_of_Purchase'].astype(str)

In [19]:
df['Owner']= df['Owner'].astype(str)

In [20]:
df['Fuel_Type']= df['Fuel_Type'].astype(str)

In [21]:
df['RTO']= df['RTO'].astype(str)

In [22]:
df['Insurance_Type']= df['Insurance_Type'].astype(str)

In [23]:
df['Car_Name']= df['Car_Name'].astype(str)

In [24]:
df['Year']= df['Year'].astype(int)

In [25]:
df['Model']= df['Model'].astype(str)

In [26]:
df.isna().sum()

Price               0
Rating              0
city                0
Kilometers          0
Year_of_Purchase    0
Owner               0
Fuel_Type           0
RTO                 0
Insurance_Type      0
Car_Name            0
Year                0
Model               0
dtype: int64

In [27]:
x=df.drop('Fuel_Type',axis=1)
y=df['Fuel_Type']

In [28]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=22)

In [29]:
# from sklearn.linear_model import LogisticRegression
# #save the model
# model=LogisticRegression()
# model.fit(x_train,y_train)

In [30]:
df


Unnamed: 0,Price,Rating,city,Kilometers,Year_of_Purchase,Owner,Fuel_Type,RTO,Insurance_Type,Car_Name,Year,Model
0,174699,4.0,3686,34854,May 2010,First,Petrol,AP09,Insurance Expired,Maruti,2010,AltoLXI
1,333999,4.2,3686,39541,July 2013,Second,Petrol,AP28,Comp,Maruti,2013,WagonR1.0VXI
2,353199,4.3,3686,23233,March 2014,Second,Petrol,AP28,Comp,Maruti,2014,WagonR1.0VXI
3,237899,4.4,3686,27748,November 2013,First,Petrol,AP13,Comp,Hyundai,2013,EonDLITEPLUS
4,300699,4.4,3686,12238,August 2017,Second,Petrol,TS10,Expired,Hyundai,2017,EonERAPLUS
...,...,...,...,...,...,...,...,...,...,...,...,...
32153,150000,3.4,769,95792,February 2008,Third,Petrol,CH04,Insurance Expired,Ford,2008,Fiesta1.6ZXI
32154,250000,3.5,769,54505,November 2015,Second,Diesel,PB13,Insurance Expired,Chevrolet,2015,BeatLSDIESEL
32155,500000,3.6,769,161322,January 2008,Second,Diesel,HR03,Insurance Expired,Ford,2008,EndeavourXLTTDCI4X2
32156,600000,4.7,769,52006,October 2016,First,Diesel,PB65,Zero Depreciation,Maruti,2016,SwiftDzireVDIABS


In [31]:
df['Month'] = df['Year_of_Purchase'].apply(lambda x: ''.join(x.split()[0]))

In [32]:
df

Unnamed: 0,Price,Rating,city,Kilometers,Year_of_Purchase,Owner,Fuel_Type,RTO,Insurance_Type,Car_Name,Year,Model,Month
0,174699,4.0,3686,34854,May 2010,First,Petrol,AP09,Insurance Expired,Maruti,2010,AltoLXI,May
1,333999,4.2,3686,39541,July 2013,Second,Petrol,AP28,Comp,Maruti,2013,WagonR1.0VXI,July
2,353199,4.3,3686,23233,March 2014,Second,Petrol,AP28,Comp,Maruti,2014,WagonR1.0VXI,March
3,237899,4.4,3686,27748,November 2013,First,Petrol,AP13,Comp,Hyundai,2013,EonDLITEPLUS,November
4,300699,4.4,3686,12238,August 2017,Second,Petrol,TS10,Expired,Hyundai,2017,EonERAPLUS,August
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32153,150000,3.4,769,95792,February 2008,Third,Petrol,CH04,Insurance Expired,Ford,2008,Fiesta1.6ZXI,February
32154,250000,3.5,769,54505,November 2015,Second,Diesel,PB13,Insurance Expired,Chevrolet,2015,BeatLSDIESEL,November
32155,500000,3.6,769,161322,January 2008,Second,Diesel,HR03,Insurance Expired,Ford,2008,EndeavourXLTTDCI4X2,January
32156,600000,4.7,769,52006,October 2016,First,Diesel,PB65,Zero Depreciation,Maruti,2016,SwiftDzireVDIABS,October


In [33]:
df['Year_of_Purchase'] = df['Year_of_Purchase'].astype(str).str.extract(r'(\d{4})')

In [34]:
df

Unnamed: 0,Price,Rating,city,Kilometers,Year_of_Purchase,Owner,Fuel_Type,RTO,Insurance_Type,Car_Name,Year,Model,Month
0,174699,4.0,3686,34854,2010,First,Petrol,AP09,Insurance Expired,Maruti,2010,AltoLXI,May
1,333999,4.2,3686,39541,2013,Second,Petrol,AP28,Comp,Maruti,2013,WagonR1.0VXI,July
2,353199,4.3,3686,23233,2014,Second,Petrol,AP28,Comp,Maruti,2014,WagonR1.0VXI,March
3,237899,4.4,3686,27748,2013,First,Petrol,AP13,Comp,Hyundai,2013,EonDLITEPLUS,November
4,300699,4.4,3686,12238,2017,Second,Petrol,TS10,Expired,Hyundai,2017,EonERAPLUS,August
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32153,150000,3.4,769,95792,2008,Third,Petrol,CH04,Insurance Expired,Ford,2008,Fiesta1.6ZXI,February
32154,250000,3.5,769,54505,2015,Second,Diesel,PB13,Insurance Expired,Chevrolet,2015,BeatLSDIESEL,November
32155,500000,3.6,769,161322,2008,Second,Diesel,HR03,Insurance Expired,Ford,2008,EndeavourXLTTDCI4X2,January
32156,600000,4.7,769,52006,2016,First,Diesel,PB65,Zero Depreciation,Maruti,2016,SwiftDzireVDIABS,October


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32158 entries, 0 to 32157
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             32158 non-null  int32  
 1   Rating            32158 non-null  float64
 2   city              32158 non-null  int32  
 3   Kilometers        32158 non-null  int32  
 4   Year_of_Purchase  32158 non-null  object 
 5   Owner             32158 non-null  object 
 6   Fuel_Type         32158 non-null  object 
 7   RTO               32158 non-null  object 
 8   Insurance_Type    32158 non-null  object 
 9   Car_Name          32158 non-null  object 
 10  Year              32158 non-null  int32  
 11  Model             32158 non-null  object 
 12  Month             32158 non-null  object 
dtypes: float64(1), int32(4), object(8)
memory usage: 2.7+ MB


In [36]:
df['city']= df['city'].astype(int)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32158 entries, 0 to 32157
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             32158 non-null  int32  
 1   Rating            32158 non-null  float64
 2   city              32158 non-null  int32  
 3   Kilometers        32158 non-null  int32  
 4   Year_of_Purchase  32158 non-null  object 
 5   Owner             32158 non-null  object 
 6   Fuel_Type         32158 non-null  object 
 7   RTO               32158 non-null  object 
 8   Insurance_Type    32158 non-null  object 
 9   Car_Name          32158 non-null  object 
 10  Year              32158 non-null  int32  
 11  Model             32158 non-null  object 
 12  Month             32158 non-null  object 
dtypes: float64(1), int32(4), object(8)
memory usage: 2.7+ MB


In [38]:
df['Year_of_Purchase']= df['Year_of_Purchase'].astype(str)

In [39]:
df['Year_of_Purchase']= df['Year_of_Purchase'].astype(int)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32158 entries, 0 to 32157
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             32158 non-null  int32  
 1   Rating            32158 non-null  float64
 2   city              32158 non-null  int32  
 3   Kilometers        32158 non-null  int32  
 4   Year_of_Purchase  32158 non-null  int32  
 5   Owner             32158 non-null  object 
 6   Fuel_Type         32158 non-null  object 
 7   RTO               32158 non-null  object 
 8   Insurance_Type    32158 non-null  object 
 9   Car_Name          32158 non-null  object 
 10  Year              32158 non-null  int32  
 11  Model             32158 non-null  object 
 12  Month             32158 non-null  object 
dtypes: float64(1), int32(5), object(7)
memory usage: 2.6+ MB


In [41]:
df['Owner']= df['Owner'].astype(str)

In [42]:
df['Fuel_Type']= df['Fuel_Type'].astype(str)

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32158 entries, 0 to 32157
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             32158 non-null  int32  
 1   Rating            32158 non-null  float64
 2   city              32158 non-null  int32  
 3   Kilometers        32158 non-null  int32  
 4   Year_of_Purchase  32158 non-null  int32  
 5   Owner             32158 non-null  object 
 6   Fuel_Type         32158 non-null  object 
 7   RTO               32158 non-null  object 
 8   Insurance_Type    32158 non-null  object 
 9   Car_Name          32158 non-null  object 
 10  Year              32158 non-null  int32  
 11  Model             32158 non-null  object 
 12  Month             32158 non-null  object 
dtypes: float64(1), int32(5), object(7)
memory usage: 2.6+ MB


In [44]:
df.head()

Unnamed: 0,Price,Rating,city,Kilometers,Year_of_Purchase,Owner,Fuel_Type,RTO,Insurance_Type,Car_Name,Year,Model,Month
0,174699,4.0,3686,34854,2010,First,Petrol,AP09,Insurance Expired,Maruti,2010,AltoLXI,May
1,333999,4.2,3686,39541,2013,Second,Petrol,AP28,Comp,Maruti,2013,WagonR1.0VXI,July
2,353199,4.3,3686,23233,2014,Second,Petrol,AP28,Comp,Maruti,2014,WagonR1.0VXI,March
3,237899,4.4,3686,27748,2013,First,Petrol,AP13,Comp,Hyundai,2013,EonDLITEPLUS,November
4,300699,4.4,3686,12238,2017,Second,Petrol,TS10,Expired,Hyundai,2017,EonERAPLUS,August


In [45]:
df.dtypes

Price                 int32
Rating              float64
city                  int32
Kilometers            int32
Year_of_Purchase      int32
Owner                object
Fuel_Type            object
RTO                  object
Insurance_Type       object
Car_Name             object
Year                  int32
Model                object
Month                object
dtype: object

In [46]:
df

Unnamed: 0,Price,Rating,city,Kilometers,Year_of_Purchase,Owner,Fuel_Type,RTO,Insurance_Type,Car_Name,Year,Model,Month
0,174699,4.0,3686,34854,2010,First,Petrol,AP09,Insurance Expired,Maruti,2010,AltoLXI,May
1,333999,4.2,3686,39541,2013,Second,Petrol,AP28,Comp,Maruti,2013,WagonR1.0VXI,July
2,353199,4.3,3686,23233,2014,Second,Petrol,AP28,Comp,Maruti,2014,WagonR1.0VXI,March
3,237899,4.4,3686,27748,2013,First,Petrol,AP13,Comp,Hyundai,2013,EonDLITEPLUS,November
4,300699,4.4,3686,12238,2017,Second,Petrol,TS10,Expired,Hyundai,2017,EonERAPLUS,August
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32153,150000,3.4,769,95792,2008,Third,Petrol,CH04,Insurance Expired,Ford,2008,Fiesta1.6ZXI,February
32154,250000,3.5,769,54505,2015,Second,Diesel,PB13,Insurance Expired,Chevrolet,2015,BeatLSDIESEL,November
32155,500000,3.6,769,161322,2008,Second,Diesel,HR03,Insurance Expired,Ford,2008,EndeavourXLTTDCI4X2,January
32156,600000,4.7,769,52006,2016,First,Diesel,PB65,Zero Depreciation,Maruti,2016,SwiftDzireVDIABS,October


In [47]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Owner'] = le.fit_transform(df['Owner'])

In [48]:
le = LabelEncoder()
df['Fuel_Type'] = le.fit_transform(df['Fuel_Type'])

In [49]:
le = LabelEncoder()
df['RTO'] = le.fit_transform(df['RTO'])

In [50]:
le = LabelEncoder()
df['Insurance_Type'] = le.fit_transform(df['Insurance_Type'])

In [51]:
le = LabelEncoder()
df['Car_Name'] = le.fit_transform(df['Car_Name'])

In [52]:
le = LabelEncoder()
df['Model'] = le.fit_transform(df['Model'])

In [53]:
le = LabelEncoder()
df['Month'] = le.fit_transform(df['Month'])

In [54]:
x=df.drop('Fuel_Type',axis=1)
y=df['Fuel_Type']

In [55]:
x

Unnamed: 0,Price,Rating,city,Kilometers,Year_of_Purchase,Owner,RTO,Insurance_Type,Car_Name,Year,Model,Month
0,174699,4.0,3686,34854,2010,2,10,4,20,2010,149,8
1,333999,4.2,3686,39541,2013,5,26,1,20,2013,2421,5
2,353199,4.3,3686,23233,2014,5,26,1,20,2014,2421,7
3,237899,4.4,3686,27748,2013,2,14,1,10,2013,772,9
4,300699,4.4,3686,12238,2017,5,642,3,10,2017,777,1
...,...,...,...,...,...,...,...,...,...,...,...,...
32153,150000,3.4,769,95792,2008,9,72,4,7,2008,901,3
32154,250000,3.5,769,54505,2015,5,467,4,2,2015,246,9
32155,500000,3.6,769,161322,2008,5,191,4,7,2008,758,4
32156,600000,4.7,769,52006,2016,2,494,9,20,2016,2111,10


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32158 entries, 0 to 32157
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             32158 non-null  int32  
 1   Rating            32158 non-null  float64
 2   city              32158 non-null  int32  
 3   Kilometers        32158 non-null  int32  
 4   Year_of_Purchase  32158 non-null  int32  
 5   Owner             32158 non-null  int32  
 6   Fuel_Type         32158 non-null  int32  
 7   RTO               32158 non-null  int32  
 8   Insurance_Type    32158 non-null  int32  
 9   Car_Name          32158 non-null  int32  
 10  Year              32158 non-null  int32  
 11  Model             32158 non-null  int32  
 12  Month             32158 non-null  int32  
dtypes: float64(1), int32(12)
memory usage: 1.7 MB


In [57]:
df.isna().sum()

Price               0
Rating              0
city                0
Kilometers          0
Year_of_Purchase    0
Owner               0
Fuel_Type           0
RTO                 0
Insurance_Type      0
Car_Name            0
Year                0
Model               0
Month               0
dtype: int64

In [58]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=22)

In [59]:
from sklearn.linear_model import LogisticRegression
#save the model65
model=LogisticRegression() 
model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [60]:
y_train_pred = model.predict(x_train)

In [61]:
y_train_pred

array([0, 0, 0, ..., 0, 3, 0])

In [62]:
y_test_pred = model.predict(x_test)

In [63]:
y_test_pred

array([0, 3, 3, ..., 0, 0, 0])

In [64]:
test_data = pd.DataFrame({'Actual':y_test,'Prediction':y_test_pred})
test_data

Unnamed: 0,Actual,Prediction
17731,0,0
9496,0,3
1091,3,3
28214,3,3
14513,0,0
...,...,...
21987,3,0
28168,0,3
12644,0,0
7494,0,0


In [65]:
from sklearn.metrics import accuracy_score,classification_report

In [66]:
print('Train accuracy:',accuracy_score(y_train,y_train_pred))
print('Test accuracy:',accuracy_score(y_test,y_test_pred))

Train accuracy: 0.7267355982274741
Test accuracy: 0.7397388059701493


In [67]:
from sklearn.model_selection import cross_val_score
print('Cross Validation Score:',cross_val_score(model,x,y,cv=5).mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross Validation Score: 0.7123900111478063


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [68]:
from sklearn.tree import DecisionTreeClassifier
Decision_tree_model = DecisionTreeClassifier()
Decision_tree_model.fit(x_train,y_train)

In [69]:
y_train_pred=Decision_tree_model.predict(x_train)
y_test_pred=Decision_tree_model.predict(x_test)

In [70]:
print('Train accuracy :',accuracy_score(y_train,y_train_pred))
print('Test accuracy :',accuracy_score(y_test,y_test_pred))

Train accuracy : 1.0
Test accuracy : 0.777363184079602


In [71]:
print('Cross Validation Score:',cross_val_score(model,x,y,cv=5).mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross Validation Score: 0.7123900111478063


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [74]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88      3028
           1       1.00      0.83      0.91         6
           3       0.83      0.87      0.85      3012
           4       0.72      0.32      0.44       290
           5       0.80      0.25      0.38        96

    accuracy                           0.85      6432
   macro avg       0.84      0.63      0.69      6432
weighted avg       0.85      0.85      0.84      6432



In [76]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5]
}

grid_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5)
grid_rf.fit(x_train, y_train)
print("Best Random Forest Params:", grid_rf.best_params_)
print("Accuracy:", grid_rf.score(x_test, y_test))



Best Random Forest Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 0.849502487562189
