# XGBoost

In [1]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization
from IPython.display import display
from ipywidgets import IntProgress
from sklearn import metrics
import xgboost as xgb
import pandas as pd
import os

In [2]:
if '__file__' in locals():
    current_folder = os.path.dirname(os.path.abspath(__file__))
else:
    current_folder = os.getcwd()

set_de_entrenamiento_testing_y_prediccion = '"{}"'.format(os.path.join(
    current_folder,
    '..',
    'Set de entrenamiento, testing y predicción.ipynb'
))
merge_features = '"{}"'.format(os.path.join(current_folder, '..', 'Features', 'Merge features.ipynb'))
calcular_auc = '"{}"'.format(os.path.join(current_folder, '..', 'Calcular AUC.ipynb'))
predicciones_csv = os.path.join(current_folder, '..', 'predictions.csv')
hiperparametros_csv = os.path.join(current_folder, 'hiperparametros', 'xgboost.csv')

Cargo el df con los features.

In [3]:
pd.options.mode.chained_assignment = None
%run $merge_features

KeyboardInterrupt: La limpieza ya corrió en este Kernel

KeyboardInterrupt: La limpieza ya corrió en este Kernel

KeyboardInterrupt: La limpieza ya corrió en este Kernel

In [4]:
assert(df_features.shape[0] == df['person'].unique().shape[0])

Cargo los sets de entrenamiento, testing y predicción.

In [5]:
%run $set_de_entrenamiento_testing_y_prediccion

labels_with_features = labels.merge(df_features, how='inner', on='person')
data = labels_with_features.drop('label', axis=1)
target = labels_with_features['label']

## Entrenamiento rápido

Con cross validation de xgboost. Lo bueno de esto es que al final me da el *num_boost_round* óptimo.

In [21]:
param = {
    'silent': 1,
    'objective': 'reg:logistic',
    'max_depth': 3,
    'eta': 0.1,
}
cv = 10 # cantidad de splits en el cross validation
num_round = 100 # cantidad de veces que se boostea

In [22]:
%%time
dtrain = xgb.DMatrix(data, label=target)
result = xgb.cv(param, dtrain, nfold=cv, metrics='auc', verbose_eval=False, shuffle=False, stratified=False, num_boost_round=num_round)

CPU times: user 53.8 s, sys: 88 ms, total: 53.8 s
Wall time: 13.8 s


El índice + 1 es el *num_boost_round* óptimo. 

In [8]:
result.loc[[result['test-auc-mean'].idxmax()]]

Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
56,0.87477,0.01197,0.897444,0.001117


## Búsqueda de hiperparámetros con Grid Search

Vamos a hacer un kfold con sklearn.

**Nota**: está busqueda ya no es óptima, es mejor realizarla usando GridSearchCV de sklearn o Bayesian Optimization que está más abajo.

In [9]:
splits = 2
max_depth_values = 2
eta_values = 2
gamma_values = 2
num_round_values = 2

In [10]:
param = {
    'max_depth': 30,
    'eta': 0,
    'gamma': 0,
    'silent': 1,
    'objective': 'binary:logistic',
    'nthread': 10,
    'eval_metric': 'auc'
}

In [11]:
def calculate_auc():
    return metrics.roc_auc_score(labels_test['label'], labels_test['label_predicted'])

In [12]:
labels_with_features = labels.merge(df_features, how='inner', on='person')

In [13]:
columns = list(labels_with_features.columns)
columns.remove('label')

In [14]:
f = IntProgress(min=0, max=splits*max_depth_values*eta_values*gamma_values*num_round_values)
display(f) # display the bar

kf = KFold(n_splits=splits, shuffle=False)
results = pd.DataFrame(columns=['k', 'max_depth', 'eta', 'gamma', 'num_round', 'auc'])
index = 0
k = 0
for train_index, test_index in kf.split(labels):
    
    labels_training = labels_with_features.iloc[train_index]
    labels_test = labels_with_features.iloc[test_index]
    assert(labels_training.merge(labels_test, how='inner', on='person').shape[0] == 0)
    train_matrix = xgb.DMatrix(labels_training.loc[:, columns], label=labels_training['label'])
    test_matrix = xgb.DMatrix(labels_test.loc[:, columns])
    
    for max_depth, eta, gamma, num_round in np.ndindex((max_depth_values, eta_values, gamma_values, num_round_values)):
        eta=eta/eta_values
        param['max_depth'] = max_depth
        param['eta'] = eta
        param['gamma'] = gamma
        
        bst = xgb.train(param, train_matrix, num_round)
        labels_test['label_predicted'] = bst.predict(test_matrix)
        
        results.loc[index] = k, max_depth, eta, gamma, num_round, calculate_auc()
        
        index+=1
        f.value += 1
    
    k += 1

IntProgress(value=0, max=32)

Promedio los hiperparámetros.

In [15]:
results_mean = results.groupby(['max_depth', 'eta', 'gamma', 'num_round'])[['auc']].mean()
mejor_resultado = results_mean.loc[results_mean.idxmax()]
mejor_resultado

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auc
max_depth,eta,gamma,num_round,Unnamed: 4_level_1
1.0,0.5,0.0,1.0,0.776814


Escribo los nuevos resultados en un archivo.

In [16]:
params = mejor_resultado.reset_index().to_dict('records')[0]
auc = params.pop('auc')

In [17]:
hyperparameter_data = {
    'algorithm': 'xgboost',
    'hyperparameters': params,
    'cv_splits': splits,
    'auc': auc,
    'features': data.columns
}

In [18]:
%run -i write_hyperparameters.py

## Hiperparámetros con Bayesian Optimization

In [27]:
pbounds = {
    'max_depth': (2, 20),
    'eta': (0, 0.3),
    'gamma': (0, 10),
    'min_child_weight': (1, 5),
    'max_delta_step': (1, 5),
    'subsample': (0, 1),
    'colsample_bytree': (0, 1),
    'colsample_bylevel': (0, 1),
    'lambda': (1, 3),
    'alpha': (0, 2)
}

discrete = ['max_depth'] # parámetros discretos
cv_splits = 10 # cantidad de splits en el cv
num_round = 100 # cantidad máxima de boosts

Falta optimizar otros parámetros discretos:
 - booster 
 - min_child_weight 
 - max_delta_step 
 - etc...

In [28]:
dtrain = xgb.DMatrix(data, label=target)
def cv_score_xgb(**param):
    param['silent'] = 1
    param['objective'] = 'reg:logistic'
    param['scale_pos_weight'] = 19
    
    # transformo los valores que deben ser discretos
    for d in discrete:
        param[d] = int(param[d])
    
    # hago el cv
    scores = xgb.cv(param, dtrain, nfold=cv_splits, metrics='auc', verbose_eval=False, shuffle=False, stratified=False, num_boost_round=num_round)
    return scores['test-auc-mean'].max()

In [29]:
%%time
optimizer = BayesianOptimization(f=cv_score_xgb, pbounds=pbounds)
optimizer.maximize(
    init_points=2,
    n_iter=10,
)

|   iter    |  target   |   alpha   | colsam... | colsam... |    eta    |   gamma   |  lambda   | max_de... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8503  [0m | [0m 0.04493 [0m | [0m 0.8174  [0m | [0m 0.3518  [0m | [0m 0.1638  [0m | [0m 8.968   [0m | [0m 2.659   [0m | [0m 2.676   [0m | [0m 9.837   [0m | [0m 1.941   [0m | [0m 0.3557  [0m |
| [95m 2       [0m | [95m 0.8513  [0m | [95m 1.466   [0m | [95m 0.4259  [0m | [95m 0.821   [0m | [95m 0.1035  [0m | [95m 1.049   [0m | [95m 1.115   [0m | [95m 1.796   [0m | [95m 7.793   [0m | [95m 3.204   [0m | [95m 0.1238  [0m |
| [95m 3       [0m | [95m 0.8663  [0m | [95m 1.049   [0m | [95m 0.9827  [0m | [95m 0.6402  [0m | [95m 0.03232 [0m | [95m 0.1039  [0m | [95m 1.042   [0m | [95m 4.908   [0m | [95m 19.97   [0m | 

In [30]:
optimizer.max

{'params': {'alpha': 1.0487680119954488,
  'colsample_bylevel': 0.9826513226038831,
  'colsample_bytree': 0.6401709761369432,
  'eta': 0.032324032761254506,
  'gamma': 0.10386735718731055,
  'lambda': 1.0420854557106385,
  'max_delta_step': 4.908012056400395,
  'max_depth': 19.96547220092579,
  'min_child_weight': 4.180239202888753,
  'subsample': 0.11383309326316748},
 'target': 0.8662746}

Guardo el resultado en un archivo.

In [31]:
params = optimizer.max['params'].copy()
params['max_depth'] = int(params['max_depth'])
params['silent'] = 1
result = xgb.cv(params, dtrain, nfold=cv_splits, metrics='auc', verbose_eval=False, shuffle=False, stratified=False, num_boost_round=num_round)
params['num_round'] = result['test-auc-mean'].idxmax() + 1
del params['silent']

In [32]:
hyperparameter_data = {
    'algorithm': 'xgboost',
    'hyperparameters': params,
    'cv_splits': cv_splits,
    'auc': optimizer.max['target'],
    'features': data.columns
} 

In [33]:
%run -i write_hyperparameters.py

## Predecir labels desconocidos

In [34]:
dtrain = xgb.DMatrix(data, label=target)

In [27]:
param = {
    'eta': 0.09396,
    'gamma': 6.476,
    'max_depth': 10,
    'silent': 1,
    'objective': 'reg:logistic'
}
num_round = 10

In [28]:
bst = xgb.train(param, dtrain, num_boost_round=num_round)

Predigo:

In [29]:
labels_to_predict_with_features = labels_to_predict.merge(df_features, how='inner', on='person')

In [30]:
assert(labels_to_predict.shape[0] == labels_to_predict_with_features.shape[0])

In [31]:
matrix = xgb.DMatrix(labels_to_predict_with_features.loc[:, columns])

In [32]:
labels_to_predict['label'] = bst.predict(matrix)

In [33]:
# labels_to_predict.to_csv(predicciones_csv)