In this notebook we focus on the XGBoost classifier and investigate how quickly it learns the data, as a function of the number of estimators and the maximum depth of the individual trees.

In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, roc_curve
# fix NumPy's global random state so that random draws are repeatable
np.random.seed(seed=42)

Define parameters:

In [None]:
# fraction of the rows held out for validation
VAL_SIZE = 0.9

# settings passed unchanged to every XGBoost model
LEARNING_RATE = 0.1
TREE_METHOD = 'hist'
N_JOBS = 16
VERBOSITY = 1

# grid of values explored: maximal tree depth x number of estimators
MAX_DEPTH = [4, 9, 14]
N_ESTIMATORS = [10, 25, 50, 75, 100, 125, 150, 200]

Load the data

In [None]:
# read the competition training table into a DataFrame
train = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-nov-2021/train.csv'
)

In [None]:
# preview the first five rows
train.head(n=5)

Separate train and validation sets

In [None]:
# identifiers of all rows in the data set
bids = train['id'].values

# Number of rows used for training. Rounding (rather than truncating) matters:
# 1 - 0.9 evaluates to 0.0999... in floating point, so a bare int() would
# silently drop one row from the training set.
n_train = int(round((1 - VAL_SIZE) * len(train)))

# draw the training ids at random, without repetition
train_bids = np.random.choice(bids, replace=False, size=n_train)

# Every id that was not drawn for training goes to the validation set.
# np.isin builds the membership mask in one vectorised pass; a Python-level
# `x not in train_bids` test would rescan the whole array for every single id.
valid_bids = bids[~np.isin(bids, train_bids)]

In [None]:
# boolean row masks, computed once and reused for both features and labels
train_mask = train['id'].isin(train_bids)
valid_mask = train['id'].isin(valid_bids)

# columns that must not be fed to the model as features
non_feature_cols = ['id', 'target']

# training part: feature frame and label array
X_train = train.loc[train_mask].drop(columns=non_feature_cols)
y_train = train.loc[train_mask, 'target'].values

# validation part: feature frame and label array
X_valid = train.loc[valid_mask].drop(columns=non_feature_cols)
y_valid = train.loc[valid_mask, 'target'].values

Train various XGBoost classifiers

In [None]:
# settings that are the same for every model in the grid
common_params = dict(
    learning_rate = LEARNING_RATE,
    n_jobs = N_JOBS,
    tree_method = TREE_METHOD,
    verbosity = VERBOSITY,
    eval_metric = 'logloss',
    use_label_encoder = False,
)

# auc[i][j] / roc[i][j] hold the validation AUC / ROC curve of the model
# trained with MAX_DEPTH[i] and N_ESTIMATORS[j]
auc = []
roc = []

for md in MAX_DEPTH:

    print(f'Max depth {md}')

    # results for the current depth, one entry per number of estimators
    depth_auc, depth_roc = [], []
    auc.append(depth_auc)
    roc.append(depth_roc)

    for nest in N_ESTIMATORS:

        print(f'Running {nest}')

        # a fresh classifier is built and fitted from scratch for each grid point
        xgb = XGBClassifier(n_estimators = nest, max_depth = md, **common_params)
        model_xgb = xgb.fit(X_train, y_train)

        # probability of the positive class for each validation row
        y_valid_pred = model_xgb.predict_proba(X_valid)[:, 1]

        # keep both the scalar score and the full curve for later plotting
        depth_auc.append(roc_auc_score(y_valid, y_valid_pred))
        depth_roc.append(roc_curve(y_valid, y_valid_pred))

Check the AUC as a function of the number of estimators. We get pretty good results already with a small number of estimators. Somewhat interestingly, the shallowest trees do the best job.

In [None]:
# one point cloud per tree depth: validation AUC against ensemble size
for md, depth_auc in zip(MAX_DEPTH, auc):
    plt.scatter(N_ESTIMATORS, depth_auc, label = f'Max depth {md}')
plt.xlabel('Number of estimators')
plt.ylabel('AUC')
plt.legend();

Plot the ROC curves obtained with the shallowest trees to see how they converge as the number of estimators grows.

In [None]:
# index into MAX_DEPTH selecting the tree depth whose ROC curves are drawn
which = 0

# One curve per ensemble size. Each stored entry is the tuple returned by
# roc_curve: (false positive rates, true positive rates, thresholds).
for nest, (fpr, tpr, thresholds) in zip(N_ESTIMATORS, roc[which]):
    plt.plot(fpr, tpr, label = nest)

# label the axes and the legend so the figure can be read on its own
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(title = 'Number of estimators')
plt.title(f'ROC curve for max_depth = {MAX_DEPTH[which]} and various number of estimators');