## Advanced early stopping strategy

### Imports

In [1]:
import os
# Select the GPU for this run: enumerate devices in PCI bus order and expose only device "1".
# NOTE(review): presumably this must stay ahead of the GPU library imports below — confirm.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

# Directory that receives the downloaded dataset
os.makedirs('data', exist_ok=True)

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from py_boost import GradientBoosting 

# advanced pruning strategy
from py_boost.callbacks.advanced_es import AdvancedES

### Download data from openml

In [2]:
!wget https://www.openml.org/data/get_csv/19335689/file1c556e3db171.csv -O data/volkert.csv

--2021-11-11 01:42:08--  https://www.openml.org/data/get_csv/19335689/file1c556e3db171.csv
Resolving www.openml.org (www.openml.org)... 131.155.11.11
Connecting to www.openml.org (www.openml.org)|131.155.11.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: ‘data/volkert.csv’

data/volkert.csv        [    <=>             ]  65.06M  6.87MB/s    in 7.6s    

2021-11-11 01:42:15 (8.52 MB/s) - ‘data/volkert.csv’ saved [68219213]



In [3]:
# Parse the downloaded CSV; cells containing '?' are read as missing values.
data = pd.read_csv(
    filepath_or_buffer='data/volkert.csv',
    na_values=['?'],
)

In [4]:
# Preview the loaded frame (the 'class' target column followed by the V* feature columns)
data

Unnamed: 0,class,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180
0,0,0.438660,0,0,0,0,0,0,0,0,...,0.013889,0.012153,0.012442,0.013021,0.014757,0.012442,0.013600,0.010417,0.009549,0.013600
1,2,0.000000,0,0,0,0,0,0,0,0,...,0.013889,0.014178,0.013310,0.014468,0.013021,0.015625,0.016493,0.015046,0.016782,0.010995
2,9,0.000000,0,0,0,0,0,0,0,0,...,0.009259,0.012442,0.011574,0.011574,0.010706,0.007812,0.012153,0.009549,0.007812,0.017361
3,3,0.000000,0,0,0,0,0,0,0,0,...,0.005498,0.006655,0.007812,0.006366,0.007234,0.007523,0.006076,0.006944,0.006076,0.006655
4,0,0.000000,0,0,0,0,0,0,0,0,...,0.006655,0.007234,0.008970,0.004919,0.009549,0.008391,0.006944,0.005498,0.002025,0.019965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58305,0,0.000868,0,0,0,0,0,0,0,0,...,0.004340,0.010417,0.012442,0.005498,0.008102,0.006366,0.005498,0.006366,0.003472,0.018229
58306,7,0.000000,0,0,0,0,0,0,0,0,...,0.004946,0.010765,0.019494,0.004655,0.015711,0.010474,0.009601,0.009019,0.000291,0.051207
58307,0,0.414640,0,0,0,0,0,0,0,0,...,0.017940,0.016493,0.021123,0.016493,0.018519,0.020255,0.019676,0.018519,0.017650,0.023438
58308,5,1.000000,0,0,0,0,0,0,0,0,...,0.013889,0.015914,0.017361,0.018229,0.020255,0.019676,0.020833,0.026331,0.019965,0.020255


In [5]:
# Hold out 20% of the rows as the test set. Features are every column except
# 'class' (cast to float32); the target is the 'class' column (cast to int32).
X, X_test, y, y_test = train_test_split(
    data.drop(columns='class').values.astype('float32'),
    data['class'].values.astype('int32'),
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Split 30% of the remaining rows off as the validation set; X and y keep the
# rest and are the training data from here on.
X, X_val, y, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=True
)


In [6]:
# Mean of the integer training labels — a quick sanity check only, since the
# values are class ids rather than a numeric quantity.
np.mean(y)

4.467032125685235

### Train simple model

Let's train a simple model. We will use the validation set for early stopping and evaluate log loss on the test set to avoid an overly optimistic estimate.

In [7]:
%%time
model = GradientBoosting('crossentropy', 
                         ntrees=10000, lr=0.05, verbose=1000, es=300, lambda_l2=10,
                         subsample=1, colsample=1, min_data_in_leaf=10,
                         max_bin=256, max_depth=6)

model.fit(X, y, eval_sets = [{'X': X_val, 'y': y_val}])

[01:42:19] Stdout logging level is INFO.
[01:42:19] GDBT train starts. Max iter 10000, early stopping rounds 300
[01:42:20] Iter 0; Sample 0, Crossentropy = 2.181068391156711; 
[01:42:42] Iter 1000; Sample 0, Crossentropy = 0.9296358989596356; 
[01:43:04] Iter 2000; Sample 0, Crossentropy = 0.8972518262973849; 
[01:43:27] Iter 3000; Sample 0, Crossentropy = 0.8830401193199362; 
[01:43:49] Iter 4000; Sample 0, Crossentropy = 0.8753117101808281; 
[01:44:11] Iter 5000; Sample 0, Crossentropy = 0.8727872564239998; 
[01:44:30] Early stopping at iter 5828, best iter 5528, best_score 0.8720492694887033
CPU times: user 1min 53s, sys: 20.1 s, total: 2min 13s
Wall time: 2min 12s


<py_boost.gpu.boosting.GradientBoosting at 0x7fa5dd269280>

In [8]:
# %debug

In [9]:
%%time
# Predict on the held-out test set with the baseline model
pred = model.predict(X_test)

# Check the dimensions of the prediction array
pred.shape

CPU times: user 3.8 s, sys: 120 ms, total: 3.92 s
Wall time: 3.92 s


(11662, 10)

In [10]:
# Log loss of the baseline predictions on the test split
log_loss(y_true=y_test, y_pred=pred)

0.8560588106838752

### Train model with advanced early stopping

Let's now train the model by selecting separate pruning points for different clusters of the validation data. We will split the validation data using decision trees of different depths. Note: it is important to disable the common early stopping strategy by passing es=0.

In [11]:
%%time
adv_es = AdvancedES(num_rounds=300, freq=10, max_depths=(1, 2, 3, 4, 5, 6), min_data_in_leaf=100)

model = GradientBoosting('crossentropy',  
                         ntrees=10000, lr=0.05, verbose=1000, es=0, lambda_l2=10,
                         subsample=1, colsample=1, min_data_in_leaf=10,
                         max_bin=256, max_depth=6,
                        callbacks=[adv_es])

model.fit(X, y, eval_sets = [{'X': X_val, 'y': y_val}])

[01:44:34] Stdout logging level is INFO.
[01:44:34] GDBT train starts. Max iter 10000, early stopping rounds 0
[01:44:34] Iter 0; Sample 0, Crossentropy = 2.181068378903851; 
[01:44:57] Iter 1000; Sample 0, Crossentropy = 0.9296106331675427; 
[01:44:57] Advanced ES: Best strategy 4, best LVO metric 0.9295895158018498
[01:45:21] Iter 2000; Sample 0, Crossentropy = 0.8960243708298485; 
[01:45:21] Advanced ES: Best strategy 2, best LVO metric 0.8960158703378166
[01:45:44] Iter 3000; Sample 0, Crossentropy = 0.8808397727133226; 
[01:45:44] Advanced ES: Best strategy 2, best LVO metric 0.8808188849303289
[01:46:07] Iter 4000; Sample 0, Crossentropy = 0.873921449608148; 
[01:46:07] Advanced ES: Best strategy 0, best LVO metric 0.873921449608148
[01:46:30] Iter 5000; Sample 0, Crossentropy = 0.8702889987945498; 
[01:46:30] Advanced ES: Best strategy 1, best LVO metric 0.8702155596421789
[01:46:54] Iter 6000; Sample 0, Crossentropy = 0.8699057896840645; 
[01:46:54] Advanced ES: Best strategy 2

<py_boost.gpu.boosting.GradientBoosting at 0x7fa5dd4cabb0>

In [12]:
# Best LVO metric values recorded by the callback during training
# NOTE(review): presumably one entry per splitting strategy — confirm in AdvancedES
adv_es.best_lvo_metrics

[0.8697107124027974,
 0.869700977292128,
 0.8694962291117104,
 0.8698414828270951,
 0.8699477059493821,
 0.8686998170994192,
 0.8685052573864539]

In [13]:
# Predict through the callback object instead of model.predict
# NOTE(review): presumably this applies the per-cluster stopping points — confirm in AdvancedES
pred = adv_es.predict(X_test)

# Check the dimensions of the prediction array
pred.shape

(11662, 10)

In [14]:
# Log loss of the AdvancedES predictions on the test split
log_loss(y_true=y_test, y_pred=pred)

0.856550771555217

#### This strategy may slightly improve the score on the test set for some datasets (in this run it did not: test log loss is 0.8566 versus 0.8561 for the simple model)