### Libraries

In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### Data ingestion

In [2]:
# Heart-disease dataset hosted in the Packt book repository (Chapter06).
DATA_URL = 'https://github.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/raw/master/Chapter06/heart_disease.csv'
df_raw = pd.read_csv(DATA_URL)

### Data description

In [3]:
# Preview the first rows of the raw frame.
df_raw.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# (rows, columns) of the raw frame.
df_raw.shape

(303, 14)

In [5]:
# Column dtypes and non-null counts.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


### Feature-target split

In [6]:
# Features are every column except the last one; the last column is the label.
n_features = df_raw.shape[1] - 1
X = df_raw.iloc[:, :n_features]
y = df_raw.iloc[:, n_features]

### Model training

In [7]:
# Baseline classifier: tree booster, logistic objective, fixed seed.
model = XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    random_state=2,
)

In [8]:
# 5-fold cross-validation of the baseline model: per-fold scores, then their mean.
scores = cross_val_score(model, X, y, cv=5)
print(f'Accuracy: {np.round(scores, 2)}')
print(f'Accuracy mean: {np.round(np.mean(scores), 4):.4f}')

Accuracy: [0.84 0.85 0.82 0.8  0.77]
Accuracy mean: 0.8150


### Stratified $k$-fold

In [9]:
# Shuffled, stratified 5-fold splitter with a fixed seed; reused by the
# cross-validation and hyperparameter searches that follow.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=2,
)

In [10]:
# Same baseline model, now scored with the shuffled stratified splitter.
scores = cross_val_score(model, X, y, cv=kfold)
print(f'Accuracy: {np.round(scores, 2)}')
print(f'Accuracy mean: {np.round(np.mean(scores), 4):.4f}')

Accuracy: [0.72 0.82 0.75 0.8  0.82]
Accuracy mean: 0.7823


### Hyperparameter tuning

In [43]:
def grid_search(params, random=False, n_iter=1000, random_state=2):
    """Search XGBClassifier hyperparameters and print the best candidate.

    The search is fitted on the notebook-level ``X`` and ``y`` and
    cross-validated with the notebook-level ``kfold`` splitter.

    Parameters
    ----------
    params : dict or list of dict
        Hyperparameter grid handed unchanged to the search object.
    random : bool, default False
        If True, sample candidates with ``RandomizedSearchCV``; otherwise
        evaluate every combination with ``GridSearchCV``.
    n_iter : int, default 1000
        Number of candidates sampled when ``random`` is True (ignored for the
        exhaustive search).
    random_state : int, default 2
        Seed for the classifier and for the randomized candidate sampling, so
        that repeated runs select the same candidates.

    Returns
    -------
    None
        The best parameters and the best score are printed.  Note that the
        value labelled "Training score" is ``best_score_``, i.e. the mean
        cross-validated score of the best candidate, not a score on the
        training data.
    """
    xgb = XGBClassifier(booster='gbtree',
                        objective='binary:logistic',
                        random_state=random_state)

    if random:
        # Seed the sampler: without it every run draws different candidates
        # and the reported best parameters are not reproducible.
        grid = RandomizedSearchCV(xgb, params, cv=kfold, n_iter=n_iter,
                                  n_jobs=-1, random_state=random_state)
    else:
        grid = GridSearchCV(xgb, params, cv=kfold, n_jobs=-1)

    grid.fit(X, y)
    best_params = grid.best_params_
    print("Best params:", best_params)

    # Mean cross-validated score of the best candidate (label kept as-is so
    # it matches the outputs already recorded in the notebook).
    best_score = grid.best_score_
    print("Training score: {:.4f}".format(best_score))

#### n_estimators

The number of trees (boosting rounds) in the ensemble.

In [14]:
# Try progressively larger ensembles.
grid_search(params=dict(n_estimators=[100, 200, 400, 800]))

Best params: {'n_estimators': 100}
Training score: 0.7823


Because our dataset is small, adding more trees did not produce better results.

#### learning_rate

Shrinks the weights of the trees in each round of boosting.
A lower `learning_rate` helps prevent overfitting but requires more trees.

In [15]:
# Sweep the shrinkage factor from very small to fairly aggressive values.
grid_search(params=dict(learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]))

Best params: {'learning_rate': 0.05}
Training score: 0.7958


Slightly better than the baseline.

#### max_depth

The depth of each tree (the number of rounds of splitting). Limiting `max_depth` helps prevent overfitting.

In [16]:
# Compare shallow and deep trees.
grid_search(params=dict(max_depth=[2, 3, 5, 6, 8]))

Best params: {'max_depth': 2}
Training score: 0.7990


Slightly better than the baseline.

#### gamma

Also known as the Lagrange multiplier (`min_split_loss`): the minimum loss reduction a node must achieve before it is split. The default is 0; anything over 10 is very high.

In [17]:
# Sweep the split threshold from the default (0) upwards.
grid_search(params=dict(gamma=[0, 0.1, 0.5, 1, 2, 5]))

Best params: {'gamma': 0.5}
Training score: 0.7957


Slightly better than the baseline.

#### min_child_weight

The minimum sum of instance weights required in a child node for a split to be made. Higher values reduce overfitting.

In [18]:
# Sweep the minimum child weight.
grid_search(params=dict(min_child_weight=[1, 2, 3, 4, 5]))

Best params: {'min_child_weight': 5}
Training score: 0.8122


The best result so far.

#### subsample

Limits the percentage of training instances used in each boosting round. Lower values of `subsample` reduce overfitting.

In [19]:
# Sweep the per-round row sampling fraction.
grid_search(params=dict(subsample=[0.5, 0.7, 0.8, 0.9, 1]))

Best params: {'subsample': 0.8}
Training score: 0.7958


Slightly better than the baseline.

#### colsample_bytree

`colsample_bytree` randomly selects a subset of the columns for each tree,
according to the given percentage.

In [20]:
# Sweep the column sampling fraction applied per tree.
grid_search(params=dict(colsample_bytree=[0.5, 0.7, 0.8, 0.9, 1]))

Best params: {'colsample_bytree': 0.9}
Training score: 0.7989


In [21]:
# Sweep the column sampling fraction applied per tree level.
grid_search(params=dict(colsample_bylevel=[0.5, 0.7, 0.8, 0.9, 1]))

Best params: {'colsample_bylevel': 0.8}
Training score: 0.8022


In [22]:
# Sweep the column sampling fraction applied per node.
grid_search(params=dict(colsample_bynode=[0.5, 0.7, 0.8, 0.9, 1]))

Best params: {'colsample_bynode': 0.8}
Training score: 0.8022


Slightly better than the baseline.

#### Random-search

In [51]:
# Randomized search over the joint space of the hyperparameters explored above.
grid_search(
    params=dict(
        n_estimators=[2, 50, 100],
        learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
        max_depth=[2, 3, 5, 6, 8],
        gamma=[0, 0.1, 0.5, 1, 2, 5],
        min_child_weight=[1, 2, 3, 4, 5],
        subsample=[0.5, 0.7, 0.8, 0.9, 1],
        colsample_bytree=[0.5, 0.7, 0.8, 0.9, 1],
        colsample_bylevel=[0.5, 0.7, 0.8, 0.9, 1],
        colsample_bynode=[0.5, 0.6, 0.7, 0.8, 0.9, 1],
    ),
    random=True,
)

Best params: {'subsample': 0.5, 'n_estimators': 100, 'min_child_weight': 2, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 2, 'colsample_bytree': 0.5, 'colsample_bynode': 0.6, 'colsample_bylevel': 0.5}
Training score: 0.8484


### Early stopping

A general method to limit the number of training rounds in iterative machine learning algorithms. Early stopping allows training to continue until $n$ consecutive rounds fail to produce gains.

In [24]:
# Hold out a test split (default split size) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2,
)

In [25]:
# Fresh classifier with the baseline configuration for the evaluation-set demo.
model = XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    random_state=2,
)

#### eval_set

In [26]:
# Evaluation pairs monitored during fitting (here: the held-out test split).
eval_set = [(X_test, y_test)]

In [27]:
# Metric reported for the evaluation set (shown as `validation_0-error`).
eval_metric = 'error'

In [28]:
# `eval_metric` is an estimator setting: passing it to fit() is deprecated
# since XGBoost 1.6 and rejected by newer releases, so configure it on the
# model and fit with only the evaluation set.
model.set_params(eval_metric=eval_metric)
model.fit(X_train, y_train, eval_set=eval_set)

[0]	validation_0-error:0.15789
[1]	validation_0-error:0.10526
[2]	validation_0-error:0.11842
[3]	validation_0-error:0.13158
[4]	validation_0-error:0.11842
[5]	validation_0-error:0.14474
[6]	validation_0-error:0.14474
[7]	validation_0-error:0.14474
[8]	validation_0-error:0.14474
[9]	validation_0-error:0.14474
[10]	validation_0-error:0.14474
[11]	validation_0-error:0.15789
[12]	validation_0-error:0.15789
[13]	validation_0-error:0.17105
[14]	validation_0-error:0.17105
[15]	validation_0-error:0.17105
[16]	validation_0-error:0.15789
[17]	validation_0-error:0.17105
[18]	validation_0-error:0.15789
[19]	validation_0-error:0.17105
[20]	validation_0-error:0.17105
[21]	validation_0-error:0.17105
[22]	validation_0-error:0.18421
[23]	validation_0-error:0.18421
[24]	validation_0-error:0.17105
[25]	validation_0-error:0.18421
[26]	validation_0-error:0.18421
[27]	validation_0-error:0.18421
[28]	validation_0-error:0.18421
[29]	validation_0-error:0.18421
[30]	validation_0-error:0.18421
[31]	validation_0-



XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=2,
              reg_alpha=0, reg_lambda=1, ...)

In [29]:
# Score the fitted model on the held-out split and report it as a percentage.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100.0:.2f}%")

Accuracy: 82.89%


#### early_stopping_rounds

In [30]:
eval_metric = 'error'

# `eval_metric` and `early_stopping_rounds` are estimator settings: passing
# them to fit() is deprecated since XGBoost 1.6 and rejected by newer
# releases, so they are configured on the constructor instead.
model = XGBClassifier(booster='gbtree',
                      objective='binary:logistic',
                      eval_metric=eval_metric,
                      early_stopping_rounds=10,
                      random_state=2)

# NOTE(review): the test split doubles as the early-stopping validation set,
# so the accuracy printed below is measured on data that chose the stopping
# point and is likely optimistic.
eval_set = [(X_test, y_test)]

model.fit(X_train,
          y_train,
          eval_set=eval_set,
          verbose=True)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

[0]	validation_0-error:0.15789
[1]	validation_0-error:0.10526
[2]	validation_0-error:0.11842




[3]	validation_0-error:0.13158
[4]	validation_0-error:0.11842
[5]	validation_0-error:0.14474
[6]	validation_0-error:0.14474
[7]	validation_0-error:0.14474
[8]	validation_0-error:0.14474
[9]	validation_0-error:0.14474
[10]	validation_0-error:0.14474
[11]	validation_0-error:0.15789
Accuracy: 89.47%


In [48]:
eval_metric = 'error'

# Linear booster with a large round budget; early stopping decides how many
# rounds are kept. `eval_metric` and `early_stopping_rounds` are configured
# on the constructor because passing them to fit() is deprecated since
# XGBoost 1.6 and rejected by newer releases.
model = XGBClassifier(n_estimators=5000,
                      booster='gblinear',
                      objective='binary:logistic',
                      eval_metric=eval_metric,
                      early_stopping_rounds=100,
                      random_state=2)

# NOTE(review): the test split doubles as the early-stopping validation set,
# so the accuracy printed below is measured on data that chose the stopping
# point and is likely optimistic.
eval_set = [(X_test, y_test)]

model.fit(X_train,
          y_train,
          eval_set=eval_set,
          verbose=True)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

[0]	validation_0-error:0.15789
[1]	validation_0-error:0.13158
[2]	validation_0-error:0.13158
[3]	validation_0-error:0.13158
[4]	validation_0-error:0.13158
[5]	validation_0-error:0.13158
[6]	validation_0-error:0.13158
[7]	validation_0-error:0.10526
[8]	validation_0-error:0.10526
[9]	validation_0-error:0.10526
[10]	validation_0-error:0.10526
[11]	validation_0-error:0.10526
[12]	validation_0-error:0.10526
[13]	validation_0-error:0.10526
[14]	validation_0-error:0.10526
[15]	validation_0-error:0.11842
[16]	validation_0-error:0.10526
[17]	validation_0-error:0.10526
[18]	validation_0-error:0.11842
[19]	validation_0-error:0.10526
[20]	validation_0-error:0.11842
[21]	validation_0-error:0.11842
[22]	validation_0-error:0.11842
[23]	validation_0-error:0.11842
[24]	validation_0-error:0.11842
[25]	validation_0-error:0.11842
[26]	validation_0-error:0.11842
[27]	validation_0-error:0.09211
[28]	validation_0-error:0.09211
[29]	validation_0-error:0.09211
[30]	validation_0-error:0.09211
[31]	validation_0-



In [45]:
# Exhaustive search over booster type and the three column-sampling
# fractions, with stump-depth trees and 50 boosting rounds held fixed.
grid_search(
    params=dict(
        booster=['gbtree', 'gblinear', 'dart'],
        colsample_bynode=[0.5, 0.6, 0.7, 0.8, 0.9, 1],
        colsample_bylevel=[0.5, 0.6, 0.7, 0.8, 0.9, 1],
        colsample_bytree=[0.5, 0.6, 0.7, 0.8, 0.9, 1],
        max_depth=[1],
        n_estimators=[50],
    )
)

Best params: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bynode': 0.8, 'colsample_bytree': 0.7, 'max_depth': 1, 'n_estimators': 50}
Training score: 0.8518


In [37]:
# Sweep gamma and the number of rounds with the column-sampling fractions
# and tree depth pinned to single values.
grid_search(
    params=dict(
        gamma=[0, 0.01, 0.05, 0.1, 0.5, 1, 2, 3],
        colsample_bylevel=[0.9],
        colsample_bytree=[0.8],
        colsample_bynode=[0.5],
        max_depth=[1],
        n_estimators=[2, 50],
    )
)

Best params: {'colsample_bylevel': 0.9, 'colsample_bynode': 0.5, 'colsample_bytree': 0.8, 'gamma': 0, 'max_depth': 1, 'n_estimators': 50}
Training score: 0.8386
