In [1]:
#importing library
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import learning_curve, GridSearchCV
import xgboost as xgb

# Basic preprocessing steps

In [2]:
#Fill na values of horsehorsepower and acceleration with average of the respective columns
car_df=pd.read_csv("C:/Users/Mainak Kundu/Desktop/Samiran/Rakuten/data.csv")
car_df["horsepower"].fillna((car_df["horsepower"].mean()), inplace=True)
car_df["acceleration"].fillna((car_df["acceleration"].mean()), inplace=True)

#removing extra large values of cylinders and model year
car_df = car_df.drop(car_df[car_df["cylinders"] >8 ].index)
car_df = car_df.drop(car_df[car_df["model year"] > 82].index)

# Linear regression

Linear regression is applied on the data after removing the "car name" feature and removing anomalies for cylinder and model year

In [3]:
#We are removing car name column from the data set for our initial analysis
input_df=car_df.drop("car name",axis=1)

#building a linear regression model 
linear_model = LinearRegression()
X =input_df.loc[:, input_df.columns != "mpg"]
y =input_df["mpg"]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)
lm_fit=linear_model.fit(x_train,y_train)
r2_test = r2_score(y_test,lm_fit.predict(x_test))
r2_train = r2_score(y_train,lm_fit.predict(x_train))

#final r2 scores
print("Train accuracy: {} and Test accuracy: {}".format(r2_train,r2_test))

Train accuracy: 0.8250051486909883 and Test accuracy: 0.8112807219286456


# Decision tree algorithm

Regression tree regression is applied on the data after removing the "car name" feature and removing anomalies for cylinder and model year

In [4]:
#We are removing car name column from the data set for our initial analysis
input_df=car_df.drop("car name",axis=1)

#list of all variables for which one-hot-encoding has to be done
discrete=["cylinders","model year","origin"]

#One-hot encoding of all the categorical features
input_df=pd.get_dummies(data=input_df, columns=discrete)

#building a tree based model
tree_model = DecisionTreeRegressor(random_state=0,criterion="mae")
X =input_df.loc[:, input_df.columns != "mpg"]
y =input_df["mpg"]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)
dt_fit=tree_model.fit(x_train, y_train)
r2_test = r2_score(y_test,dt_fit.predict(x_test))
r2_train = r2_score(y_train,dt_fit.predict(x_train))

#final r2 scores
print("Train accuracy: {} and Test accuracy: {}".format(r2_train,r2_test))

Train accuracy: 1.0 and Test accuracy: 0.6669754519741429


Remark: Decision tree is overfitting the data too much.

# XgBoost model with hyperparameter tuning

In [5]:
#We are removing car name column from the data set for our initial analysis
input_df=car_df.drop("car name",axis=1)

#list of all variables for which one-hot-encoding has to be done
discrete=["cylinders","model year","origin"]

#One-hot encoding of all the categorical features
input_df=pd.get_dummies(data=input_df, columns=discrete)


Before going into full hyperparameter turing for xgboost we will try to fit a basic xg boost tree on the data 
and then apply cross validation to get better estimates of the hyparameters.

In [6]:
#xg boost model with default values
xgb = XGBRegressor(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'reg:linear',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

#dividing the data in train test
X =input_df.loc[:, input_df.columns != "mpg"]
y =input_df["mpg"]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)

#default xgb model
xgb_model=xgb.fit(x_train, y_train)

r2_test = r2_score(y_test,xgb_model.predict(x_test))
r2_train = r2_score(y_train,xgb_model.predict(x_train))

#final r2 scores
print("Train accuracy: {} and Test accuracy: {}".format(r2_train,r2_test))

  if getattr(data, 'base', None) is not None and \


Train accuracy: 0.9999999863569797 and Test accuracy: 0.8369761540733889


Still we found out that the model is overfitting the data too much. We will try to improve this one by cross-validation.

The first parameter of the xgboost, that is learning rate will be fixed which
we search of other sets of parameter. Onces all the set of paramets are already 
obtained we will go into finding the most optimum value of the learning parameter.
 
If we consider all the parametrs at onces then it will be stuck as the combinations
of all the parameters will be huge to handle with. So we will apply a greedy approach
where we will try to optimize the hyperparameter taking two at a time.

# 1. Max depth and min child weight

In [7]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBRegressor( 

learning_rate =0.1,
n_estimators=140,
max_depth=5,
min_child_weight=1,    
gamma=0,
subsample=0.8,
colsample_bytree=0.8,
objective= 'reg:linear',
nthread=4,
scale_pos_weight=1,
seed=27), 
 param_grid = param_test1,
n_jobs=4,
iid=False,
cv=5)
gsearch1.fit(x_train,y_train)
gsearch1.best_params_, gsearch1.best_score_

  if getattr(data, 'base', None) is not None and \


({'max_depth': 3, 'min_child_weight': 5}, 0.8486143303431414)

In [8]:
param_test2 = {
 'max_depth':range(2,5,1),
 'min_child_weight':range(3,7,1)
}
gsearch1 = GridSearchCV(estimator = XGBRegressor( 

learning_rate =0.1,
n_estimators=140,
max_depth=5,
min_child_weight=1,    
gamma=0,
subsample=0.8,
colsample_bytree=0.8,
objective= 'reg:linear',
nthread=4,
scale_pos_weight=1,
seed=27), 
 param_grid = param_test2,
n_jobs=4,
iid=False,
cv=5)
gsearch1.fit(x_train,y_train)
gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 3, 'min_child_weight': 4}, 0.8533383222065772)

So the best parameteres for max depth is=3 and min_child_weight =4

# 2.Tune gamma

In [9]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBRegressor( 

learning_rate =0.1,
n_estimators=140,
max_depth=3,
min_child_weight=4,    
gamma=0,
subsample=0.8,
colsample_bytree=0.8,
objective= 'reg:linear',
nthread=4,
scale_pos_weight=1,
seed=27), 
 param_grid = param_test3,
n_jobs=4,
iid=False,
cv=5)
gsearch3.fit(x_train,y_train)
gsearch3.best_params_, gsearch1.best_score_

({'gamma': 0.0}, 0.8533383222065772)

In [10]:
param_test4 = {
 'gamma':[i/100.0 for i in range(0,30,2)]
}
gsearch4 = GridSearchCV(estimator = XGBRegressor( 

learning_rate =0.1,
n_estimators=140,
max_depth=3,
min_child_weight=4,    
gamma=0,
subsample=0.8,
colsample_bytree=0.8,
objective= 'reg:linear',
nthread=4,
scale_pos_weight=1,
seed=27), 
 param_grid = param_test4,
n_jobs=4,
iid=False,
cv=5)
gsearch4.fit(x_train,y_train)
gsearch4.best_params_, gsearch1.best_score_

({'gamma': 0.0}, 0.8533383222065772)

So the optimum value of gamma is 0.00

# 3. subsample and colsample_bytree

In [11]:
param_test5 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch5 = GridSearchCV(estimator = XGBRegressor( 

learning_rate =0.1,
n_estimators=140,
max_depth=3,
min_child_weight=4,    
gamma=0.00,
subsample=0.8,
colsample_bytree=0.8,
objective= 'reg:linear',
nthread=4,
scale_pos_weight=1,
seed=27), 
 param_grid = param_test5,
n_jobs=4,
iid=False,
cv=5)
gsearch5.fit(x_train,y_train)
gsearch5.best_params_, gsearch4.best_score_

({'colsample_bytree': 0.8, 'subsample': 0.9}, 0.8533383222065772)

In [13]:
param_test6 = {
 'subsample':[i/100.0 for i in range(70,100,5)],
 'colsample_bytree':[i/100.0 for i in range(70,100,5)]
}
gsearch6 = GridSearchCV(estimator = XGBRegressor( 

learning_rate =0.1,
n_estimators=140,
max_depth=3,
min_child_weight=4,    
gamma=0.0,
subsample=0.8,
colsample_bytree=0.8,
objective= 'reg:linear',
nthread=4,
scale_pos_weight=1,
seed=27), 
 param_grid = param_test6,
n_jobs=4,
iid=False,
cv=5)
gsearch6.fit(x_train,y_train)
gsearch6.best_params_, gsearch4.best_score_

({'colsample_bytree': 0.95, 'subsample': 0.85}, 0.8533383222065772)

So the best values of colsample_bytree is 0.85 and subsample is 0.55

# 4. Tuning reg_alpha

In [14]:
param_test7 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch7 =GridSearchCV(estimator = XGBRegressor( 

learning_rate =0.1,
n_estimators=140,
max_depth=3,
min_child_weight=4,    
gamma=0.00,
subsample=0.85,
colsample_bytree=0.95,
objective= 'reg:linear',
reg_alpha=0.1,
nthread=4,
scale_pos_weight=1,
seed=27), 
 param_grid = param_test7,
n_jobs=4,
iid=False,
cv=5)
gsearch7.fit(x_train,y_train)
gsearch7.best_params_, gsearch7.best_score_

({'reg_alpha': 0.01}, 0.8589803166740173)

In [16]:
param_test8= {
 'reg_alpha':[0.005,0.01,0.015,0.02]
}
gsearch8 =GridSearchCV(estimator = XGBRegressor( 

learning_rate =0.1,
n_estimators=140,
max_depth=4,
min_child_weight=1,    
gamma=0.0,
subsample=0.85,
colsample_bytree=0.95,
objective= 'reg:linear',
reg_alpha=0.1,
nthread=4,
scale_pos_weight=1,
seed=27), 
 param_grid = param_test7,
n_jobs=4,
iid=False,
cv=5)
gsearch8.fit(x_train,y_train)
gsearch8.best_params_, gsearch8.best_score_

({'reg_alpha': 0.1}, 0.8294566807384797)

# 5. Tuning learning rate: Now we will reduce the learning rate and find the optimum model

In [17]:
xgb=XGBRegressor(
 learning_rate =0.01,
 n_estimators=1000,
 max_depth=3,
 min_child_weight=4,
 gamma=0.00,
 subsample=0.85,
 reg_alpha=0.1,
 colsample_bytree=0.95,
 objective= 'reg:linear',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

#dividing the data in train test
X =input_df.loc[:, input_df.columns != "mpg"]
y =input_df["mpg"]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)

#default xgb model
xgb_model=xgb.fit(x_train, y_train)

r2_test = r2_score(y_test,xgb_model.predict(x_test))
r2_train = r2_score(y_train,xgb_model.predict(x_train))

#final r2 scores
print("Train accuracy: {} and Test accuracy: {}".format(r2_train,r2_test))

Train accuracy: 0.957182934176924 and Test accuracy: 0.8572520306269685


So we found out the model which is better than the original model proposed by 0.02.