# XGBoost

In [1]:
from sklearn.model_selection import cross_val_score, KFold
from bayes_opt import BayesianOptimization
from IPython.display import display
from ipywidgets import IntProgress
import xgboost as xgb
import pandas as pd
import os

In [2]:
if '__file__' in locals():
    current_folder = os.path.dirname(os.path.abspath(__file__))
else:
    current_folder = os.getcwd()

In [3]:
merge_features = '"{}"'.format(os.path.join(current_folder, '..', 'Features', 'Merge features.ipynb'))

In [4]:
calcular_auc = '"{}"'.format(os.path.join(current_folder, '..', 'Calcular AUC.ipynb'))

In [5]:
set_de_entrenamiento_testing_y_prediccion = '"{}"'.format(os.path.join(
    current_folder,
    '..',
    'Set de entrenamiento, testing y predicción.ipynb'
))

In [6]:
predicciones_csv = os.path.join(current_folder, '..', 'predictions.csv')

In [7]:
hiperparametros_csv = os.path.join(current_folder, 'hiperparametros', 'xgboost.csv')

Cargo el df con los features.

In [8]:
pd.options.mode.chained_assignment = None

In [9]:
%run $merge_features

KeyboardInterrupt: La limpieza ya corrió en este Kernel

KeyboardInterrupt: La limpieza ya corrió en este Kernel

KeyboardInterrupt: La limpieza ya corrió en este Kernel

In [10]:
assert(df_features.shape[0] == df['person'].unique().shape[0])

Cargo los sets de entrenamiento, testing y predicción.

In [11]:
%run $set_de_entrenamiento_testing_y_prediccion

In [12]:
labels_with_features = labels.merge(df_features, how='inner', on='person')
data = labels_with_features.drop('label', axis=1)
target = labels_with_features['label']

## Entrenamiento rápido

Con cross validation de sklearn.

In [13]:
regr = xgb.XGBRegressor(objective='reg:logistic')
cv = 10 # cantidad de splits en el cross validation

In [14]:
%%time
scores = cross_val_score(regr, data, target, cv=cv, scoring='roc_auc')
print("Accuracy: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.860962 (+/- 0.025609)
Wall time: 1min 48s


Con cross validation de xgboost.

In [33]:
param = {
    'silent': 1,
}

In [34]:
%%time
dtrain = xgb.DMatrix(data, label=target)
result = xgb.cv(param, dtrain, nfold=10, metrics='auc', verbose_eval=False, shuffle=False, stratified=True) # , callbacks=[xgb.callback.print_evaluation(show_stdv=True)]

Wall time: 8.8 s


In [35]:
result.tail(1)

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
9,0.91862,0.003305,0.85039,0.015567


In [38]:
result['test-auc-mean'].max()

0.8517725

## Búsqueda de hiperparámetros con k-fold

Vamos a hacer un kfold con sklearn.

In [18]:
splits = 10
max_depth_values = 10
eta_values = 5
gamma_values = 5
num_round_values = 10

In [19]:
param = {
    'max_depth': 30,
    'eta': 0,
    'gamma': 0,
    'silent': 1,
    'objective': 'binary:logistic',
    'nthread': 10,
    'eval_metric': 'auc'
}

In [20]:
labels_with_features = labels.merge(df_features, how='inner', on='person')

In [21]:
columns = list(labels_with_features.columns)
columns.remove('label')

In [22]:
f = IntProgress(min=0, max=splits*max_depth_values*eta_values*gamma_values*num_round_values)
display(f) # display the bar

kf = KFold(n_splits=splits, shuffle=False)
results = pd.DataFrame(columns=['k', 'max_depth', 'eta', 'gamma', 'num_round', 'auc'])
index = 0
k = 0
for train_index, test_index in kf.split(labels):
    
    labels_training = labels_with_features.iloc[train_index]
    labels_test = labels_with_features.iloc[test_index]
    assert(labels_training.merge(labels_test, how='inner', on='person').shape[0] == 0)
    train_matrix = xgb.DMatrix(labels_training.loc[:, columns], label=labels_training['label'])
    test_matrix = xgb.DMatrix(labels_test.loc[:, columns])
    
    for max_depth, eta, gamma, num_round in np.ndindex((max_depth_values, eta_values, gamma_values, num_round_values)):
        eta=eta/eta_values
        param['max_depth'] = max_depth
        param['eta'] = eta
        param['gamma'] = gamma
        
        bst = xgb.train(param, train_matrix, num_round)
        labels_test['label_predicted'] = bst.predict(test_matrix)
        
        results.loc[index] = k, max_depth, eta, gamma, num_round, calculate_auc()
        
        index+=1
        f.value += 1
    
    k += 1

IntProgress(value=0, max=25000)

NameError: name 'calculate_auc' is not defined

Promedio los hiperparámetros.

In [None]:
results_mean = results.groupby(['max_depth', 'eta', 'gamma', 'num_round'])[['auc']].mean()

In [None]:
mejor_resultado = results_mean.loc[results_mean.idxmax()]
mejor_resultado

Escribo los nuevos resultados en un archivo.

In [None]:
mejores_resultados = pd.read_csv(hiperparametros_csv, index_col='fecha')

In [None]:
mejores_resultados

In [None]:
mejor_resultado['k'] = splits
mejor_resultado['features'] = ','.join(columns)
mejor_resultado['fecha'] = pd.datetime.now().strftime("%Y-%m-%d %H:%M")
mejor_resultado.reset_index(inplace=True)
mejor_resultado = mejor_resultado[['fecha', 'auc', 'max_depth', 'eta', 'gamma', 'k', 'num_round', 'features']]
mejor_resultado.set_index('fecha', inplace=True)

In [None]:
mejores_resultados = mejores_resultados.append(mejor_resultado, sort=False)

In [None]:
mejores_resultados.to_csv(hiperparametros_csv)

## Hiperparámetros con Bayesian Optimization

Optimizo con **sklearn** para calcular el cv.

In [24]:
regr = xgb.XGBRegressor()

In [25]:
pbounds = {
    'max_depth': (2, 20),
    'learning_rate': (0, 1),
    'n_estimators': (2, 1000),
    'gamma': (0, 20),
}

discrete = ['max_depth', 'n_estimators ']

Falta optimizar otros parámetros discretos:
 - booster 
 - min_child_weight 
 - max_delta_step 

In [56]:
def cv_score_sklearn(**param):
    # transformo los valores que deben ser discretos
    for d in discrete:
        param[d] = int(param[d])
    return cross_val_score(xgb.XGBRegressor(**param), data, target, cv=10, scoring='roc_auc', n_jobs=-2).mean()

In [57]:
%%time
optimizer = BayesianOptimization(f=cv_score_sklearn, pbounds=pbounds)
optimizer.maximize(
    init_points=5,
    n_iter=10,
)

|   iter    |  target   |    eta    |   gamma   | max_depth |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.7912  [0m | [0m 0.8778  [0m | [0m 10.56   [0m | [0m 13.85   [0m |
| [0m 2       [0m | [0m 0.7145  [0m | [0m 0.3739  [0m | [0m 17.55   [0m | [0m 4.285   [0m |


KeyboardInterrupt: 

Ahora con **xgboost** como cross validator.

In [46]:
pbounds = {
    'max_depth': (2, 20),
    'eta': (0, 1),
    'gamma': (0, 20),
}

discrete = ['max_depth']

In [47]:
dtrain = xgb.DMatrix(data, label=target)
def cv_score_xgb(**param):
    param['silent'] = 1
    
    # transformo los valores que deben ser discretos
    for d in discrete:
        param[d] = int(param[d])
    return xgb.cv(param, dtrain, nfold=10, metrics='auc', verbose_eval=False, shuffle=False, stratified=True)['test-auc-mean'].max() # , callbacks=[xgb.callback.print_evaluation(show_stdv=True)]

In [53]:
%%time
optimizer = BayesianOptimization(f=cv_score_xgb, pbounds=pbounds)
optimizer.maximize(
    init_points=10,
    n_iter=20,
)

|   iter    |  target   |    eta    |   gamma   | max_depth |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.705   [0m | [0m 0.2294  [0m | [0m 19.13   [0m | [0m 16.1    [0m |
| [95m 2       [0m | [95m 0.7726  [0m | [95m 0.8427  [0m | [95m 6.972   [0m | [95m 13.2    [0m |
| [95m 3       [0m | [95m 0.8184  [0m | [95m 0.7898  [0m | [95m 1.153   [0m | [95m 16.08   [0m |
| [0m 4       [0m | [0m 0.7079  [0m | [0m 0.04712 [0m | [0m 13.65   [0m | [0m 2.746   [0m |
| [0m 5       [0m | [0m 0.7272  [0m | [0m 0.06254 [0m | [0m 15.52   [0m | [0m 9.569   [0m |
| [0m 6       [0m | [0m 0.705   [0m | [0m 0.05221 [0m | [0m 19.85   [0m | [0m 9.046   [0m |
| [0m 7       [0m | [0m 0.776   [0m | [0m 0.6117  [0m | [0m 8.988   [0m | [0m 7.255   [0m |
| [95m 8       [0m | [95m 0.8255  [0m | [95m 0.8505  [0m | [95m 0.3448  [0m | [95m 3.511   [0m |
| [0m 9       [0m | [0m 0.7051  [0m | [0

In [55]:
%%time
optimizer = BayesianOptimization(f=cv_score_xgb, pbounds=pbounds)
optimizer.maximize(
    init_points=20,
    n_iter=30,
)

|   iter    |  target   |    eta    |   gamma   | max_depth |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.7548  [0m | [0m 0.7977  [0m | [0m 9.675   [0m | [0m 8.439   [0m |
| [95m 2       [0m | [95m 0.8005  [0m | [95m 0.134   [0m | [95m 7.713   [0m | [95m 9.604   [0m |
| [0m 3       [0m | [0m 0.7532  [0m | [0m 0.877   [0m | [0m 10.39   [0m | [0m 7.216   [0m |
| [95m 4       [0m | [95m 0.8168  [0m | [95m 0.8005  [0m | [95m 3.215   [0m | [95m 14.26   [0m |
| [0m 5       [0m | [0m 0.7733  [0m | [0m 0.5753  [0m | [0m 7.882   [0m | [0m 18.19   [0m |
| [0m 6       [0m | [0m 0.811   [0m | [0m 0.2463  [0m | [0m 5.811   [0m | [0m 19.33   [0m |
| [0m 7       [0m | [0m 0.7663  [0m | [0m 0.6384  [0m | [0m 9.161   [0m | [0m 18.11   [0m |
| [0m 8       [0m | [0m 0.7138  [0m | [0m 0.9417  [0m | [0m 18.74   [0m | [0m 17.29   [0m |
| [0m 9       [0m | [0m 0.7141  [0m | [0m 0.4

In [49]:
optimizer.max

{'target': 0.8143018,
 'params': {'eta': 1.0, 'gamma': 0.0, 'max_depth': 10.723377917898326}}

### Random sear

In [15]:
dtrain = xgb.DMatrix(data, label=target)
xgb.cv({}, dtrain, nfold=10, metrics='auc', verbose_eval=False, shuffle=False, stratified=True, maximize=True, early_stopping_rounds=15)

[18:20:45] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 102 extra nodes, 0 pruned nodes, max_depth=6
[18:20:45] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 100 extra nodes, 0 pruned nodes, max_depth=6
[18:20:45] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 100 extra nodes, 0 pruned nodes, max_depth=6
[18:20:45] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 106 extra nodes, 0 pruned nodes, max_depth=6
[18:20:45] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 100 extra nodes, 0 pruned nodes, max_depth=6
[18:20:46] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 106 extra nodes, 0 pruned nodes, max_depth=6
[18:20:46] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pru

[18:20:50] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 96 extra nodes, 0 pruned nodes, max_depth=6
[18:20:50] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 86 extra nodes, 0 pruned nodes, max_depth=6
[18:20:50] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 78 extra nodes, 0 pruned nodes, max_depth=6
[18:20:50] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 72 extra nodes, 0 pruned nodes, max_depth=6
[18:20:50] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 70 extra nodes, 0 pruned nodes, max_depth=6
[18:20:50] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 76 extra nodes, 0 pruned nodes, max_depth=6
[18:20:50] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning e

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.845141,0.00895,0.812798,0.019389
1,0.869106,0.004513,0.836404,0.015672
2,0.881391,0.002728,0.843909,0.016373
3,0.889001,0.003023,0.846955,0.013251
4,0.895995,0.00402,0.848374,0.013421
5,0.901501,0.003494,0.848626,0.013324
6,0.906799,0.00315,0.849891,0.014503
7,0.911311,0.002869,0.851104,0.014785
8,0.915058,0.004215,0.851773,0.014699
9,0.91862,0.003305,0.85039,0.015567


## Predecir labels desconocidos

In [None]:
training = labels.merge(df_features, how='inner', on='person')

In [None]:
columns = list(training.columns)
columns.remove('label')

In [None]:
dtrain = xgb.DMatrix(training.loc[:, columns], label=training['label'])

In [None]:
param['max_depth'] = 4
param['eta'] = 0.6
param['gamma'] = 6

In [None]:
num_round = 10
bst = xgb.train(param, dtrain, num_round)

Predigo:

In [None]:
labels_to_predict_with_features = labels_to_predict.merge(df_features, how='inner', on='person')

In [None]:
assert(labels_to_predict.shape[0] == labels_to_predict_with_features.shape[0])

In [None]:
matrix = xgb.DMatrix(labels_to_predict_with_features.loc[:, columns])

In [None]:
labels_to_predict['label'] = bst.predict(matrix)

In [None]:
# labels_to_predict.to_csv(predicciones_csv)