# 12 - XGBoost

**XGBoost is a popular and efficient open-source implementation of the gradient boosted trees algorithm. Gradient boosting is a supervised learning algorithm, which attempts to accurately predict a target variable by combining the estimates of a set of simpler, weaker models.**

# Get Dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("Hitters.csv")
data = df.copy()
# Clear null values
data = data.dropna()
print(data.isnull().values.any())
data.head()

False


Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


# Get Dataset

In [3]:
dummies = pd.get_dummies(data[["League","Division","NewLeague"]])
dummies.head()

Unnamed: 0,League_A,League_N,Division_E,Division_W,NewLeague_A,NewLeague_N
1,0,1,0,1,0,1
2,1,0,0,1,1,0
3,0,1,1,0,0,1
4,0,1,1,0,0,1
5,1,0,0,1,1,0


In [4]:
y = data["Salary"]

X_pre = data.drop(["Salary","League","Division","NewLeague"],axis=1).astype("float64")
X = pd.concat([X_pre,dummies[["League_N","Division_W","NewLeague_N"]]],axis=1)
X.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,League_N,Division_W,NewLeague_N
1,315.0,81.0,7.0,24.0,38.0,39.0,14.0,3449.0,835.0,69.0,321.0,414.0,375.0,632.0,43.0,10.0,1,1,1
2,479.0,130.0,18.0,66.0,72.0,76.0,3.0,1624.0,457.0,63.0,224.0,266.0,263.0,880.0,82.0,14.0,0,1,0
3,496.0,141.0,20.0,65.0,78.0,37.0,11.0,5628.0,1575.0,225.0,828.0,838.0,354.0,200.0,11.0,3.0,1,0,1
4,321.0,87.0,10.0,39.0,42.0,30.0,2.0,396.0,101.0,12.0,48.0,46.0,33.0,805.0,40.0,4.0,1,0,1
5,594.0,169.0,4.0,74.0,51.0,35.0,11.0,4408.0,1133.0,19.0,501.0,336.0,194.0,282.0,421.0,25.0,0,1,0


# Split train and test

In [5]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=33)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(210, 19)
(210,)
(53, 19)
(53,)


# Use XGB Matrix(Better performance)

In [9]:
import xgboost as xgb
dm_train = xgb.DMatrix(data = X_train,label = y_train)
dm_test = xgb.DMatrix(data = X_test,label = y_test)

from xgboost import XGBRegressor

xgb_model = XGBRegressor().fit(X_train,y_train)

In [10]:
xgb_model

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

# Model Tuning

In [12]:
from sklearn.model_selection import GridSearchCV

xgb = XGBRegressor()

xgb_params = {"colsample_bytree":[0.4,0.5,0.7],
             "n_estimators":[100,200,300],
             "max_depth":[1,2,3,4],
             "learning_rate":[0.1,0.01,0.001]}
xgb_cv_model = GridSearchCV(xgb,xgb_params,cv=10,verbose=2)

xgb_cv_model.fit(X_train,y_train)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, lear

[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=4, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=4, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.1, max_depth=4, n_estimators=100; total time=   0.0s


[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=2, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=2, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=2, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=2, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=2, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=2, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=2, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.01, max_depth=2, n_estimators=300; total tim

[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=1, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=1, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=1, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=1, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=1, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=1, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=1, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=1, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=1, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=1, n_estimators=300;

[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=4, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=4, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=4, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=4, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=4, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=4, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=4, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=4, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=4, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.4, learning_rate=0.001, max_depth=4, n_estimators=200;

[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=2, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.0s


[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=1, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=1, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=1, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=1, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=1, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=1, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=100; total tim

[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=4, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=4, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=4, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=4, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=4, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=4, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=4, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=4, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=4, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=4, n_estimators=300; total tim

[CV] END colsample_bytree=0.5, learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.001, max_depth=3, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.001, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.001, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.001, max_depth=3, n_estimators=300;

[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=2, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=2, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=2, n_estimators=200; total time=   0.0s


[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=4, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=4, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=1, n_estimators=100; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=1, n_estimators=100; total time=

[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimators=300; total tim

[CV] END colsample_bytree=0.7, learning_rate=0.001, max_depth=2, n_estimators=200; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.001, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.001, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.001, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.001, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.001, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.001, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.001, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.001, max_depth=2, n_estimators=200; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.001, max_depth=2, n_estimators=300;

GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    gamma=None, gpu_id=None, grow_policy=None,
                                    importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_bin=None,
                                    max_ca...
                                    max_depth=None, max_leaves=None,
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None, n_estimators=100,
                                  

In [13]:
xgb_cv_model.best_params_

{'colsample_bytree': 0.4,
 'learning_rate': 0.01,
 'max_depth': 3,
 'n_estimators': 200}

# Tuned Model

In [14]:
xgb_tuned_model = XGBRegressor(colsample_bytree = 0.4,learning_rate = 0.01,max_depth = 3,n_estimators = 200)
xgb_tuned_model.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.4,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=200, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [16]:
from sklearn.metrics import mean_squared_error

y_pred_tuned = xgb_tuned_model.predict(X_test)

rmse_tuned = np.sqrt(mean_squared_error(y_test,y_pred_tuned))
print(f"RMSE Tuned Loss Value = {rmse_tuned}")

RMSE Tuned Loss Value = 214.4322939564181
