In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np

In [2]:
SEED=25

In [3]:
df = pd.read_pickle('Prelim data v2.pkl')

In [4]:
X = np.vstack(np.array(df['em_per_protein']))
y = np.vstack(np.array(df['enantiomer binary']))

dmatrix = xgb.DMatrix(X, label=y)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=SEED)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [6]:
from sklearn.metrics import f1_score

In [7]:
params = {
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective':'reg:linear'}

In [9]:
num_boost_round=999

params['eval_metric'] = "logloss"

model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10)

print("Best logloss: {:.2f} after {} rounds of boosting".format(
                 model.best_score,
                 model.best_iteration+1))

[0]	Test-logloss:0.58055
[1]	Test-logloss:0.50739
[2]	Test-logloss:0.48437
[3]	Test-logloss:0.45878
[4]	Test-logloss:0.44100
[5]	Test-logloss:0.44944
[6]	Test-logloss:0.45505
[7]	Test-logloss:0.44452
[8]	Test-logloss:0.46083
[9]	Test-logloss:0.45685
[10]	Test-logloss:0.45938
[11]	Test-logloss:0.45046
[12]	Test-logloss:0.45532
[13]	Test-logloss:0.45068
[14]	Test-logloss:0.45893
Best logloss: 0.44 after 5 rounds of boosting


Only 12 rounds of boosting (of a possible 999) were required to improve the AUC score, which has not shown much variance across the different ML models and embedders - demonstrating the potential of XGBoosting. Beyond 12 rounds, no improvement in AUC was achieved.

In [11]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=SEED,
    nfold=10,
    metrics={'logloss'},
    early_stopping_rounds=10
)
cv_results



Unnamed: 0,train-logloss-mean,train-logloss-std,test-logloss-mean,test-logloss-std
0,0.450749,0.004733,0.591934,0.049016
1,0.308706,0.006879,0.561583,0.09228
2,0.214157,0.007593,0.545787,0.123769


## Tuning max_depth and min_child_weight

In [12]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)]

In [13]:
min_logloss = float('inf')
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=SEED,
        nfold=5,
        metrics={'logloss'},
        early_stopping_rounds=10
    )
    # Update best AUC
    mean_logloss = cv_results['test-logloss-mean'].min()
    boost_rounds = cv_results['test-logloss-mean'].argmin()
    print("\tlogloss {} for {} rounds".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, logloss: {}".format(best_params[0], best_params[1], min_logloss))

CV with max_depth=9, min_child_weight=5
	logloss 0.473522 for 7 rounds
CV with max_depth=9, min_child_weight=6
	logloss 0.47910819999999993 for 3 rounds
CV with max_depth=9, min_child_weight=7
	logloss 0.4733252 for 3 rounds
CV with max_depth=10, min_child_weight=5
	logloss 0.47506360000000003 for 4 rounds
CV with max_depth=10, min_child_weight=6
	logloss 0.47910819999999993 for 3 rounds
CV with max_depth=10, min_child_weight=7
	logloss 0.4733252 for 3 rounds
CV with max_depth=11, min_child_weight=5
	logloss 0.47506360000000003 for 4 rounds
CV with max_depth=11, min_child_weight=6
	logloss 0.47910819999999993 for 3 rounds
CV with max_depth=11, min_child_weight=7


	logloss 0.4733252 for 3 rounds
Best params: 9, 7, logloss: 0.4733252


In [14]:
params['max_depth'] = 9
params['min_child_weight'] = 7

## Tuning subsample and colsample

In [15]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]]

In [16]:
min_logloss = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=SEED,
        nfold=5,
        metrics={'logloss'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_auc = cv_results['test-logloss-mean'].min()
    boost_rounds = cv_results['test-logloss-mean'].argmin()
    print("\tlogloss {} for {} rounds".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = (subsample,colsample)
print("Best params: {}, {}, logloss: {}".format(best_params[0], best_params[1], min_logloss))


CV with subsample=1.0, colsample=1.0
	logloss 0.4733252 for 3 rounds
CV with subsample=1.0, colsample=0.9
	logloss 0.4733252 for 3 rounds
CV with subsample=1.0, colsample=0.8
	logloss 0.4733252 for 4 rounds
CV with subsample=1.0, colsample=0.7
	logloss 0.4733252 for 3 rounds
CV with subsample=0.9, colsample=1.0
	logloss 0.4733252 for 5 rounds
CV with subsample=0.9, colsample=0.9
	logloss 0.4733252 for 6 rounds
CV with subsample=0.9, colsample=0.8
	logloss 0.4733252 for 5 rounds
CV with subsample=0.9, colsample=0.7
	logloss 0.4733252 for 5 rounds
CV with subsample=0.8, colsample=1.0


	logloss 0.4733252 for 2 rounds
CV with subsample=0.8, colsample=0.9
	logloss 0.4733252 for 2 rounds
CV with subsample=0.8, colsample=0.8
	logloss 0.4733252 for 4 rounds
CV with subsample=0.8, colsample=0.7
	logloss 0.4733252 for 3 rounds
CV with subsample=0.7, colsample=1.0
	logloss 0.4733252 for 3 rounds
CV with subsample=0.7, colsample=0.9
	logloss 0.4733252 for 3 rounds
CV with subsample=0.7, colsample=0.8
	logloss 0.4733252 for 4 rounds
CV with subsample=0.7, colsample=0.7
	logloss 0.4733252 for 2 rounds
Best params: 1.0, 1.0, logloss: 0.4733252


In [17]:
params['subsample'] = 1.0
params['colsample_bytree'] = 1.0

## Tuning ETA

In [23]:
%time
# This can take some time…
min_logloss = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time cv_results = xgb.cv(params, dtrain, num_boost_round=num_boost_round, seed=SEED, nfold=5, metrics=['logloss'], early_stopping_rounds=10)
    # Update best score
    mean_logloss = cv_results['test-logloss-mean'].min()
    boost_rounds = cv_results['test-logloss-mean'].argmin()
    print("\tlogloss {} for {} rounds\n".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = eta
print("Best params: {}, logloss: {}".format(best_params, min_logloss))

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 10 µs
CV with eta=0.3
CPU times: user 6.07 s, sys: 61.6 ms, total: 6.13 s
Wall time: 1.72 s
	logloss 0.4733252 for 3 rounds

CV with eta=0.2
CPU times: user 7.45 s, sys: 90.4 ms, total: 7.54 s
Wall time: 2.15 s
	logloss 0.46483559999999996 for 6 rounds

CV with eta=0.1
CPU times: user 11.5 s, sys: 135 ms, total: 11.7 s
Wall time: 3.36 s
	logloss 0.4612868 for 17 rounds

CV with eta=0.05
CPU times: user 19.9 s, sys: 291 ms, total: 20.2 s
Wall time: 6.09 s
	logloss 0.47797119999999993 for 38 rounds

CV with eta=0.01
CPU times: user 1min 26s, sys: 762 ms, total: 1min 26s
Wall time: 24.2 s
	logloss 0.4662476 for 211 rounds

CV with eta=0.005
CPU times: user 2min 47s, sys: 1.78 s, total: 2min 49s
Wall time: 50.2 s
	logloss 0.4714342 for 424 rounds

Best params: 0.1, logloss: 0.4612868


In [24]:
params['eta'] = 0.1

In [25]:
params

{'max_depth': 9,
 'min_child_weight': 7,
 'eta': 0.1,
 'subsample': 1.0,
 'colsample_bytree': 1.0,
 'objective': 'reg:linear',
 'eval_metric': 'logloss'}

In [27]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

print("Best logloss: {:.2f} after {} rounds of boosting".format(model.best_score, model.best_iteration+1))

[0]	Test-logloss:0.65351
[1]	Test-logloss:0.61130
[2]	Test-logloss:0.57620
[3]	Test-logloss:0.55307
[4]	Test-logloss:0.52478
[5]	Test-logloss:0.50328
[6]	Test-logloss:0.48595
[7]	Test-logloss:0.46346
[8]	Test-logloss:0.45578
[9]	Test-logloss:0.44502
[10]	Test-logloss:0.43727
[11]	Test-logloss:0.43068
[12]	Test-logloss:0.42242
[13]	Test-logloss:0.41796
[14]	Test-logloss:0.40991
[15]	Test-logloss:0.40780
[16]	Test-logloss:0.40009
[17]	Test-logloss:0.39389
[18]	Test-logloss:0.39417
[19]	Test-logloss:0.39273
[20]	Test-logloss:0.38544
[21]	Test-logloss:0.38036
[22]	Test-logloss:0.38288
[23]	Test-logloss:0.37624
[24]	Test-logloss:0.37327
[25]	Test-logloss:0.37414
[26]	Test-logloss:0.36735
[27]	Test-logloss:0.37045
[28]	Test-logloss:0.37133
[29]	Test-logloss:0.37254
[30]	Test-logloss:0.37412
[31]	Test-logloss:0.37682
[32]	Test-logloss:0.37362
[33]	Test-logloss:0.37552
[34]	Test-logloss:0.37531
[35]	Test-logloss:0.37337
[36]	Test-logloss:0.37333
Best logloss: 0.37 after 27 rounds of boosting
