In [127]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor

In [128]:
# read the hosted CSV export into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='https://drive.google.com/uc?export=download&id=1tKqddJuIiRqrm7ml2qyVGJlAW3JQjaZX'
)

In [129]:
# preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,"$36,945","$33,337",3.5,6.0,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,"$23,820","$21,761",2.0,4.0,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,"$26,990","$24,647",2.4,4.0,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,"$33,195","$30,299",3.2,6.0,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,"$43,755","$39,014",3.5,6.0,225,18,24,3880,115,197


In [130]:
# number of missing values in each column
df.isna().sum()

Make           0
Model          0
Type           0
Origin         0
DriveTrain     0
MSRP           0
Invoice        0
EngineSize     0
Cylinders      2
Horsepower     0
MPG_City       0
MPG_Highway    0
Weight         0
Wheelbase      0
Length         0
dtype: int64

In [131]:
# discard every row that contains at least one missing value,
# then recount the missing values per column to confirm none remain
df = df.dropna(axis=0, how='any')
df.isna().sum()

Make           0
Model          0
Type           0
Origin         0
DriveTrain     0
MSRP           0
Invoice        0
EngineSize     0
Cylinders      0
Horsepower     0
MPG_City       0
MPG_Highway    0
Weight         0
Wheelbase      0
Length         0
dtype: int64

The target variable is 'MSRP'.

In [132]:
# take the target column out of df; pop() also removes 'MSRP' from df in place
y = df.pop(item='MSRP')

In [133]:
# remove the 'Invoice' column from the features
# (presumably because it is a near-proxy for the MSRP target -- TODO confirm)
df = df.drop(columns=['Invoice'])
df.head(n=5)

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,3.5,6.0,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,2.0,4.0,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,2.4,4.0,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,3.2,6.0,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,3.5,6.0,225,18,24,3880,115,197


In [134]:
# scratch reference to the raw target so the string clean-up can be tried out
# without rebinding y (this is an alias, not a copy)
a = y

In [135]:
# remove the currency formatting from the price strings:
# first the '$' sign at the ends, then the ',' thousands separators
a = a.str.strip('$')
a = a.str.replace(',', '', regex=False)
a

0      36945
1      23820
2      26990
3      33195
4      43755
       ...  
423    40565
424    42565
425    45210
426    26135
427    35145
Name: MSRP, Length: 426, dtype: object

In [136]:
# remove '$' and ',' from the price strings in one expression, then cast to integer
# (regex=False so that '$' is treated as a literal character, not an anchor)
y = (
    y.str.replace('$', '', regex=False)
     .str.replace(',', '', regex=False)
     .astype(int)
)

In [137]:
# Confirm the element type after the cast.
# Use positional access: the index was not reset after dropna(), so y[0] is a
# label lookup that only works while a row labelled 0 happens to exist.
print(type(y.iloc[0]))

<class 'numpy.int64'>


In [138]:
# first five rows of the feature frame before encoding
df.head(n=5)

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,3.5,6.0,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,2.0,4.0,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,2.4,4.0,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,3.2,6.0,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,3.5,6.0,225,18,24,3880,115,197


Apply `pandas.get_dummies()` to the categorical features: 'Make', 'Model', 'Type', 'Origin', 'DriveTrain'.

In [139]:
# trial run: one-hot encode only the 'Make' column and inspect the result
test = pd.get_dummies(data=df, columns=['Make'])
test

Unnamed: 0,Model,Type,Origin,DriveTrain,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,...,Make_Pontiac,Make_Porsche,Make_Saab,Make_Saturn,Make_Scion,Make_Subaru,Make_Suzuki,Make_Toyota,Make_Volkswagen,Make_Volvo
0,MDX,SUV,Asia,All,3.5,6.0,265,17,23,4451,...,0,0,0,0,0,0,0,0,0,0
1,RSX Type S 2dr,Sedan,Asia,Front,2.0,4.0,200,24,31,2778,...,0,0,0,0,0,0,0,0,0,0
2,TSX 4dr,Sedan,Asia,Front,2.4,4.0,200,22,29,3230,...,0,0,0,0,0,0,0,0,0,0
3,TL 4dr,Sedan,Asia,Front,3.2,6.0,270,20,28,3575,...,0,0,0,0,0,0,0,0,0,0
4,3.5 RL 4dr,Sedan,Asia,Front,3.5,6.0,225,18,24,3880,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,C70 LPT convertible 2dr,Sedan,Europe,Front,2.4,5.0,197,21,28,3450,...,0,0,0,0,0,0,0,0,0,1
424,C70 HPT convertible 2dr,Sedan,Europe,Front,2.3,5.0,242,20,26,3450,...,0,0,0,0,0,0,0,0,0,1
425,S80 T6 4dr,Sedan,Europe,Front,2.9,6.0,268,19,26,3653,...,0,0,0,0,0,0,0,0,0,1
426,V40,Wagon,Europe,Front,1.9,4.0,170,22,29,2822,...,0,0,0,0,0,0,0,0,0,1


In [140]:
# categorical columns that will be one-hot encoded with get_dummies
cat_features = [
    'Make',
    'Model',
    'Type',
    'Origin',
    'DriveTrain',
]

In [141]:
# feature matrix: every column listed in cat_features is replaced by indicator columns
X = pd.get_dummies(data=df, columns=cat_features)
X

Unnamed: 0,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length,Make_Acura,Make_Audi,...,Type_Sedan,Type_Sports,Type_Truck,Type_Wagon,Origin_Asia,Origin_Europe,Origin_USA,DriveTrain_All,DriveTrain_Front,DriveTrain_Rear
0,3.5,6.0,265,17,23,4451,106,189,1,0,...,0,0,0,0,1,0,0,1,0,0
1,2.0,4.0,200,24,31,2778,101,172,1,0,...,1,0,0,0,1,0,0,0,1,0
2,2.4,4.0,200,22,29,3230,105,183,1,0,...,1,0,0,0,1,0,0,0,1,0
3,3.2,6.0,270,20,28,3575,108,186,1,0,...,1,0,0,0,1,0,0,0,1,0
4,3.5,6.0,225,18,24,3880,115,197,1,0,...,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,2.4,5.0,197,21,28,3450,105,186,0,0,...,1,0,0,0,0,1,0,0,1,0
424,2.3,5.0,242,20,26,3450,105,186,0,0,...,1,0,0,0,0,1,0,0,1,0
425,2.9,6.0,268,19,26,3653,110,190,0,0,...,1,0,0,0,0,1,0,0,1,0
426,1.9,4.0,170,22,29,2822,101,180,0,0,...,0,0,0,1,0,1,0,0,1,0


In [142]:
# (rows, columns) of the one-hot encoded feature matrix
X.shape

(426, 481)

In [143]:
# hold out 30% of the rows for evaluation; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [151]:
def training_regressor(estimator, X, y, name, test_size=0.3, random_state=1):
  """Fit ``estimator`` on a train split of ``X``/``y`` and print its test score.

  The data passed in is split here (instead of relying on notebook-level
  ``X_train``/``X_test`` globals), so the function actually uses its ``X`` and
  ``y`` arguments. The default ``test_size`` and ``random_state`` are the same
  values the notebook uses for its own split.

  Parameters
  ----------
  estimator : object exposing ``fit(X, y)`` and ``score(X, y)``
  X : feature matrix accepted by ``train_test_split``
  y : target values aligned with ``X``
  name : str
      Label printed in front of the score.
  test_size : float, default 0.3
      Passed through to ``train_test_split``.
  random_state : int, default 1
      Seed for the split, passed through to ``train_test_split``.

  Returns
  -------
  None
      The score is printed as ``"<name>: <score>"``.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state
  )
  estimator.fit(X_train, y_train)
  score = estimator.score(X_test, y_test)
  # interpolate the label; the previous f-string printed the literal text "name"
  print(f'{name}: {score}')

In [152]:
# boosted ensemble (AdaBoost) with a fixed seed
training_regressor(
    estimator=AdaBoostRegressor(random_state=1),
    X=X,
    y=y,
    name='AdaBoost Regressor',
)

name: 0.711903287964231


In [153]:
# bagging ensemble with a fixed seed
training_regressor(
    estimator=BaggingRegressor(random_state=1),
    X=X,
    y=y,
    name='Bagging Regressor',
)

name: 0.7949163237522345


In [154]:
# gradient boosting ensemble with a fixed seed
training_regressor(
    estimator=GradientBoostingRegressor(random_state=1),
    X=X,
    y=y,
    name='GB Regressor',
)

name: 0.8270485242007507


In [155]:
# random forest ensemble with a fixed seed
training_regressor(
    estimator=RandomForestRegressor(random_state=1),
    X=X,
    y=y,
    name='RF Regressor',
)

name: 0.8369440882741959
