In [137]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
import warnings
warnings.simplefilter("ignore")

In [59]:
df=pd.read_csv('train-data.csv',index_col=0)
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  New_Price          824 non-null    object 
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 658.3+ KB


In [61]:
df.isnull().sum()/df.shape[0]*100

Name                  0.000000
Location              0.000000
Year                  0.000000
Kilometers_Driven     0.000000
Fuel_Type             0.000000
Transmission          0.000000
Owner_Type            0.000000
Mileage               0.033228
Engine                0.598106
Power                 0.598106
Seats                 0.697790
New_Price            86.310018
Price                 0.000000
dtype: float64

In [62]:
# as 'New_Price' column contains more than 50% null values so drop that column

In [63]:
df=df.drop('New_Price',axis=1)

In [64]:
df.Mileage=df.Mileage.str.replace(' km/kg','')
df.Mileage=df.Mileage.str.replace(' kmpl','')
df.Power=df.Power.str.replace(' bhp','')
df.Engine=df.Engine.str.replace(' CC','')

In [65]:
df['Mileage'] = df['Mileage'].astype(float)
df['Engine'] = df['Engine'].astype(float)

In [66]:
df['Power'] = df['Power'].replace('null',np.nan)

In [67]:
df['Power'] = df['Power'].astype(float)

In [68]:
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74


In [69]:
df.describe(include=[np.number])

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price
count,6019.0,6019.0,6017.0,5983.0,5876.0,5977.0,6019.0
mean,2013.358199,58738.38,18.134961,1621.27645,113.25305,5.278735,9.479468
std,3.269742,91268.84,4.582289,601.355233,53.874957,0.80884,11.187917
min,1998.0,171.0,0.0,72.0,34.2,0.0,0.44
25%,2011.0,34000.0,15.17,1198.0,75.0,5.0,3.5
50%,2014.0,53000.0,18.15,1493.0,97.7,5.0,5.64
75%,2016.0,73000.0,21.1,1984.0,138.1,5.0,9.95
max,2019.0,6500000.0,33.54,5998.0,560.0,10.0,160.0


In [70]:
# Seats and Mileage can not be zero so replace o with nan values
df.Mileage=df.Mileage.replace(0,np.nan)
df.Seats=df.Seats.replace(0,np.nan)

In [71]:
df['Company Name']=df['Name'].str.split().apply(lambda x:x[0])

In [72]:
df=df.drop('Name',axis=1)

In [73]:
df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Company Name
0,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75,Maruti
1,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5,Hyundai
2,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5,Honda
3,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0,Maruti
4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74,Audi


In [16]:
# null values imputation

In [74]:
df.Seats.unique()

array([ 5.,  7.,  8.,  4.,  6.,  2., nan, 10.,  9.])

In [76]:
df.Seats=df.Seats.fillna(df.Seats.mode()[0])

In [77]:
df.Engine=df.Engine.fillna(df.Engine.median())

In [78]:
df.Mileage=df.Mileage.fillna(df.Mileage.median())

In [79]:
df.Power=df.Power.fillna(df.Power.median())

In [80]:
df.isnull().sum()

Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
Company Name         0
dtype: int64

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6019 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Location           6019 non-null   object 
 1   Year               6019 non-null   int64  
 2   Kilometers_Driven  6019 non-null   int64  
 3   Fuel_Type          6019 non-null   object 
 4   Transmission       6019 non-null   object 
 5   Owner_Type         6019 non-null   object 
 6   Mileage            6019 non-null   float64
 7   Engine             6019 non-null   float64
 8   Power              6019 non-null   float64
 9   Seats              6019 non-null   float64
 10  Price              6019 non-null   float64
 11  Company Name       6019 non-null   object 
dtypes: float64(5), int64(2), object(5)
memory usage: 611.3+ KB


In [82]:
df=df[['Company Name','Location', 'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission',
       'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats', 'Price']]

In [83]:
df.head()

Unnamed: 0,Company Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75
1,Hyundai,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Honda,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Maruti,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Audi,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74


In [84]:
# frequency encoding of company name column

In [85]:
fe=df.groupby('Company Name').size()/len(df)

In [86]:
df['Company Name']=df['Company Name'].map(fe)

In [87]:
df.head()

Unnamed: 0,Company Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,0.201196,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75
1,0.183918,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,0.101013,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,0.201196,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,0.039209,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74


In [88]:
df.Location

0           Mumbai
1             Pune
2          Chennai
3          Chennai
4       Coimbatore
           ...    
6014         Delhi
6015        Jaipur
6016        Jaipur
6017       Kolkata
6018     Hyderabad
Name: Location, Length: 6019, dtype: object

In [89]:
fe1=df.groupby('Location').size()/len(df)

In [90]:
df['Location']=df['Location'].map(fe1)

In [91]:
df.head()

Unnamed: 0,Company Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,0.201196,0.131251,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75
1,0.183918,0.103339,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,0.101013,0.082073,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,0.201196,0.082073,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,0.039209,0.105665,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74


In [92]:
# encode other categorical variables

In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6019 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Company Name       6019 non-null   float64
 1   Location           6019 non-null   float64
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6019 non-null   float64
 8   Engine             6019 non-null   float64
 9   Power              6019 non-null   float64
 10  Seats              6019 non-null   float64
 11  Price              6019 non-null   float64
dtypes: float64(7), int64(2), object(3)
memory usage: 611.3+ KB


In [94]:
df=pd.get_dummies(df)

In [95]:
df.head()

Unnamed: 0,Company Name,Location,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Electric,Fuel_Type_LPG,Fuel_Type_Petrol,Transmission_Automatic,Transmission_Manual,Owner_Type_First,Owner_Type_Fourth & Above,Owner_Type_Second,Owner_Type_Third
0,0.201196,0.131251,2010,72000,26.6,998.0,58.16,5.0,1.75,1,0,0,0,0,0,1,1,0,0,0
1,0.183918,0.103339,2015,41000,19.67,1582.0,126.2,5.0,12.5,0,1,0,0,0,0,1,1,0,0,0
2,0.101013,0.082073,2011,46000,18.2,1199.0,88.7,5.0,4.5,0,0,0,0,1,0,1,1,0,0,0
3,0.201196,0.082073,2012,87000,20.77,1248.0,88.76,7.0,6.0,0,1,0,0,0,0,1,1,0,0,0
4,0.039209,0.105665,2013,40670,15.2,1968.0,140.8,5.0,17.74,0,1,0,0,0,1,0,0,0,1,0


In [96]:
X=df.drop('Price',axis=1)
y=df.Price

In [97]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.3,random_state=10)

In [159]:
pipe1=Pipeline((
('pt',PowerTransformer()),
('gb',GradientBoostingRegressor(random_state=2))
))

In [169]:
params={'gb__n_estimators':[50,100,150,200,250],'gb__max_depth':[5,10,15,20,25]}
gd=GridSearchCV(pipe1,params,cv=5,scoring='r2')
gd.fit(xtrain,ytrain)
print('best_parameter',gd.best_params_)
print('best Score',gd.best_score_)

best_parameter {'gb__max_depth': 5, 'gb__n_estimators': 250}
best Score 0.8996909962904789


In [173]:
pipe=Pipeline((
('pt',PowerTransformer()),
('gb',GradientBoostingRegressor(n_estimators=250,n_iter_no_change=1,learning_rate=0.1,random_state=2))
))

In [174]:
pipe.fit(xtrain,ytrain)
pipe.score(xtrain,ytrain)

0.9072216260435599

In [175]:
pipe.score(xtest,ytest)

0.8569372850985786

In [176]:
from sklearn.metrics import mean_squared_error
pred2=pipe.predict(xtest)
pred1=pipe.predict(xtrain)
mse1=mean_squared_error(ytrain,pred1)
mse2=mean_squared_error(ytest,pred2)
print('train_rmse',np.mean(mse1))
print('test_rmse',np.mean(mse2))

train_rmse 12.33130011719429
test_rmse 15.311925991608472


In [177]:
# mape
def mape(actual, predicted):
    return (np.mean(np.abs((actual - predicted) / actual)) * 100)

In [167]:
print('train_mape',mape(ytrain,pred1))
print('test_mape',mape(ytest,pred2))


train_mape 27.279028344258897
test_mape 30.17286514226667


In [55]:
la=Lasso()
from sklearn.model_selection import cross_val_score
r=cross_val_score(dt,sx,y,cv=5,scoring='neg_mean_squared_error')
rm=np.mean(abs(r))
rm

16.73942657833185