### In this notebook, we are using XGBoost Regressor with and without Hyperparameter tuning.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [3]:
data = pd.read_csv('Data/Real-Data/Real_Combine.csv')
data.head()

Unnamed: 0,T,TM,Tm,SLP,H,VV,V,VM,PM 2.5
0,23.4,30.3,19.0,59.0,0.0,6.3,4.3,5.4,284.795833
1,22.4,30.3,16.9,57.0,0.0,6.9,3.3,7.6,219.720833
2,24.0,31.8,16.9,51.0,0.0,6.9,2.8,5.4,182.1875
3,22.8,30.2,17.6,55.0,0.0,7.7,3.7,7.6,154.0375
4,23.1,30.6,17.4,46.0,0.0,6.9,3.3,5.4,223.208333


In [4]:
data=data.dropna()

In [5]:
X = data.iloc[:,:-1]
Y=data.iloc[:,-1]

In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2)

In [7]:
#!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.2.1-py3-none-macosx_10_13_x86_64.macosx_10_14_x86_64.macosx_10_15_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 2.7 MB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.2.1


In [8]:
import xgboost as xgb

In [9]:
regressor = xgb.XGBRegressor()
regressor.fit(X_train,Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [10]:
print('Score on Train Dataset',regressor.score(X_train,Y_train))

Score on Train Dataset 0.998664792887286


In [11]:
print('Score on Test Dataset',regressor.score(X_test,Y_test))

Score on Test Dataset 0.2862652158761383


#### The above demonstrates that this is a perfect Overfit problem!!!!!

#### Let's do an Hyper parameter tuning to see, if we can prune the DT incolved in XGBoost 

In [12]:
xgb.XGBRegressor()

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)

In [13]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(100,1200,12)]

#Randomized Search CV
learning_rate = ['0.05','0.1','0.2','0.3','0.5','0.6']

max_depth = [int(x) for x in np.linspace(5,30,6)]

subsample =[0.7,0.6,0.8]

min_child_weight = [3,4,5,6,7]

In [14]:
random_grid = {'n_estimators': n_estimators,
               'learning_rate':learning_rate,
               'max_depth':max_depth,
               'subsample':subsample,
               'min_child_weight':min_child_weight}

In [27]:
%%time
regressor = xgb.XGBRegressor()

# Use the XGBRegressor and find the best fit
xg_random = RandomizedSearchCV(regressor,param_distributions=random_grid,scoring='neg_mean_squared_error')

xg_random.fit(X_train,Y_train)

CPU times: user 1min 27s, sys: 1.46 s, total: 1min 28s
Wall time: 12.5 s


RandomizedSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n_jobs...
                                          scale_pos_weight=None, subsample=None,
                                          tree_method=None,
                                          validate_parameters=None,
 

In [16]:
xg_random.best_params_

{'subsample': 0.8,
 'n_estimators': 100,
 'min_child_weight': 5,
 'max_depth': 20,
 'learning_rate': '0.1'}

In [17]:
xg_random.best_score_

-4640.528380990496

In [19]:
xg_random.score(X_train,Y_train)

-59.487871490170285

In [21]:
regressor_hyp_tuned = xgb.XGBRegressor(subsample=0.8,n_estimators=100,min_child_weight=5,max_depth=20,learning_rate=0.1)

In [23]:
regressor_hyp_tuned.fit(X_train,Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=20,
             min_child_weight=5, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
regressor_hyp_tuned.score(X_train,Y_train)

0.9916080573585102

In [26]:
regressor_hyp_tuned.score(X_test,Y_test)

0.9778611833090686

### As seen above now, we could see that the XGBoostRegressor have performed the best after Hyper Prameter tuning. My other observation was the computation time taken in XGBoost was much less compared to that of DT and RF.

In [29]:
#Model evaluation for Decision Tree Regressor

from sklearn.metrics import mean_absolute_error

from sklearn.metrics import mean_squared_error

from sklearn.metrics import r2_score

y_pred_test = regressor_hyp_tuned.predict(X_test)

print("Mean Absoute Error = ",mean_absolute_error(Y_test,y_pred_test))

print("Mean squared Error = ",mean_squared_error(Y_test,y_pred_test))

print("R^2 = ",r2_score(Y_test,y_pred_test))

print("RMSE = ",np.sqrt(mean_squared_error(Y_test,y_pred_test)))

Mean Absoute Error =  6.3511089261823805
Mean squared Error =  154.75014258525428
R^2 =  0.9778611833090686
RMSE =  12.439861035608649


In [30]:
!pip freeze > requirements.txt

In [31]:
import pickle
file = 'XGBoostRegressor.sav'

pickle.dump(regressor_hyp_tuned,open(file,'wb'))