# Optimización de un modelo de XGBoost

Este notebook recoge los resultados de la búsqueda del mejor modelo de clasificación mediante XGBoost (= eXtreme Gradient Boosting). Se trata de un método de boosting, por tanto, la idea es generar un modelo robusto a partir de varios modelos "débiles". Sin embargo, se le considera extreme gradient boosting ya que es generalmente bastante más rápido que otras implementaciones de gradient boosting y suele tener un buen rendimiento sobre datos estructurados.

Para buscar el mejor modelo posible, se tratará de buscar los mejores hiperparámetros para:

* El tipo de booster que se va a utilizar.
* El paso del método de boosting.
* La mínima reducción de loss exigida para hacer una nueva partición de una rama cuando el booster sea de tal tipo.
* La profundidad máxima de los árboles cuando el booster sea de tal tipo.

### Preparación de los datos

In [1]:
# Estructuras de datos
import pandas as pd
import numpy as np
# Data partition
from sklearn.model_selection import train_test_split
# Parameter tunning libraries
import optuna
from sklearn.model_selection import GridSearchCV
# Accuracy function
from sklearn.metrics import accuracy_score
# Model
import xgboost as xgb
from xgboost import XGBClassifier

  from pandas import MultiIndex, Int64Index


In [2]:
# Datos de entrenamiento
trainFNC = pd.read_csv("../data/train_FNC.csv")
trainSBM = pd.read_csv("../data/train_SBM.csv")
train_labels = pd.read_csv("../data/train_labels.csv")

# DataFrame con ambas fuentes de datos
train = pd.merge(left=trainFNC, right=trainSBM, left_on="Id", right_on="Id")
data = pd.merge(left=train_labels, right=train, left_on="Id", right_on="Id")
data.drop("Id", inplace=True, axis=1)

# Shuffle de los datos de train
data = data.sample(frac=1, random_state=0)
data.head(5)

Unnamed: 0,Class,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
2,0,0.24585,0.21662,-0.12468,-0.3538,0.1615,-0.002032,-0.13302,-0.035222,0.25904,...,-0.257114,0.597229,1.220756,-0.059213,-0.435494,-0.092971,1.09091,-0.448562,-0.508497,0.350434
13,1,0.41073,-0.031925,0.2107,0.24226,0.3201,-0.41929,-0.18714,0.16845,0.59979,...,-0.050862,0.870602,0.609465,1.181878,-2.279469,-0.013484,-0.012693,-1.244346,-1.080442,-0.788502
53,1,0.070919,0.034179,-0.011755,0.019158,0.024645,-0.032022,0.00462,0.31817,0.21255,...,-1.539922,-1.495822,1.643866,1.68778,1.521086,-1.988432,-0.267471,0.510576,1.104566,-1.067206
41,0,0.087377,-0.052462,-0.007835,-0.11283,0.38938,0.21608,0.063572,-0.25123,-0.080568,...,-0.077353,-0.459463,-0.204328,-0.619508,-1.410523,-0.304622,-1.521928,0.593691,0.073638,-0.26092
74,0,0.20275,0.19142,-0.056662,-0.15778,0.24404,0.03978,-0.001503,0.001056,-0.048222,...,0.044457,0.593326,1.063052,0.434726,1.604964,-0.359736,0.210107,0.355922,0.730287,-0.323557


Vamos a usar la siguiente partición de los datos:

* 60% train $\sim$ 50 datos
* 20% validation $\sim$ 18 datos (se define al aplicar cross-validación en el ajuste)
* 20% test $\sim$ 18 datos

In [3]:
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print("Tamaño del dataset de train:", X_train.shape)
print("Tamaño del dataset de test:", X_test.shape)

Tamaño del dataset de train: (68, 410)
Tamaño del dataset de test: (18, 410)


In [4]:
# Datos de test
testFNC = pd.read_csv("../data/test_FNC.csv")
testSBM = pd.read_csv("../data/test_SBM.csv")

# DataFrame con ambas fuentes de datos
test_kaggle = pd.merge(left=testFNC, right=testSBM, left_on="Id", right_on="Id")
test_kaggle.drop("Id", inplace=True, axis=1)
test_kaggle.head(5)

Unnamed: 0,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,FNC10,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
0,0.476127,0.064466,0.053238,-0.608133,0.073988,-0.637038,0.113556,-0.192434,-0.004025,-0.060474,...,-0.451994,1.12377,2.083006,1.14544,-0.067608,1.202529,0.851587,0.451583,-0.159739,0.192076
1,0.013833,0.267183,0.232178,-0.167151,-0.261327,0.191869,0.406493,0.088761,0.177048,0.036718,...,0.696987,1.397832,1.046136,-0.191733,-2.192023,-0.369276,0.822225,-0.109342,-0.580476,0.17416
2,-0.435452,0.04678,0.243742,0.39703,-0.147821,0.17362,-0.461963,-0.610736,0.419753,0.400985,...,0.160145,1.906989,-2.661633,-0.193911,0.440873,0.641739,0.918397,-0.758046,0.154701,-0.476647
3,-0.20451,-0.036735,-0.760705,-0.740495,0.064668,0.349926,-0.273826,-0.174384,-0.120248,0.175618,...,0.974828,-1.997087,-2.083782,1.154107,-0.643947,2.332424,0.659124,-0.809445,0.55896,2.790871
4,0.599435,-0.166441,0.122431,0.011539,0.346906,-0.01743,-0.274734,0.21151,0.151012,-0.033434,...,-0.789153,1.578984,1.402592,-1.23044,0.296686,2.806314,0.427184,-0.240682,-0.196948,-1.544345


### Modelo

In [5]:
def train_model(model, param_grid):
    '''Función para realizar el entrenamiento y la búsqueda de hiperparámetros'''
    np.random.seed()
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=4)
    # cv = 4 porque así: el conjunto de validation tiene un 0.25 del tamaño de train y: 0.25 * 0.8 = 0.2 ~ 20% datos
    #                    el conjunto de train tiene un 0.75 del tamaño de train y: 0.75 * 0.8 = 0.6 ~60% datos
    grid_search.fit(X_train, y_train)
    
    print("Parámetros óptimos:", grid_search.best_params_)
    print("Modelo óptimo:", grid_search.best_estimator_)
    
    return grid_search.best_estimator_

Búsqueda de hiperparámetros mediante ``GridSearchCV`` de ``sklearn``:

In [6]:
import warnings
warnings.filterwarnings("ignore") # Suprimir warning de versiones
xgb.set_config(verbosity=0)

# Definir y entrenar el modelo
model_XGB = XGBClassifier(eval_metric="logloss", random_state=0, use_label_encoder=False)
param_grid_XGB = {
    "booster": ["gbtree", "gblinear", "dart"],
    "learning_rate": [0.001, 0.01, 0.1, 0.3, 0.5, 1],
    "gamma": [0, 0.001, 0.005, 0.01, 0.05, 0.1],
    "max_depth": range(0, 21) # 0 = ninguna restricción
}
model_XGB_opt = train_model(model_XGB, param_grid_XGB)

# Predicción en partición de test
y_pred_XGB = model_XGB_opt.predict(X_test)

# Precisión en partición de test
accuracy = accuracy_score(y_test, y_pred_XGB)
print("Accuracy: {:0.2f}%".format(accuracy * 100))

Parámetros óptimos: {'booster': 'gblinear', 'gamma': 0.001, 'learning_rate': 1, 'max_depth': 1}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=0.001,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=1, max_delta_step=None, max_depth=1,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              use_label_encoder=False, validate_parameters=1, verbosity=None)
Accuracy: 66.67%


Búsqueda mediante la librería ``optuna`` probando 2 métodos de búsqueda de hiperparámetros:

* **GridSampler:** equivalente a la anterior búsqueda de grid de sklearn. Lo usaremos para que los resultados sean comparables.
* **TPE:** algoritmo para hacer una "búsqueda inteligente" de hiperparámetros. Debería ahorrar intentos de combinaciones haciendo una selección inteligente de las pruebas. En nuestro caso le permitiremos probar un 10% del número de combinaciones posibles. 

In [7]:
def objectiveXGBoost_Grid(trial):
    '''
    Define la función a optimizar por medio de un sampler de tipo GridSampler.
    En este caso se trata de maximizar el accuracy
    '''
    booster = trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"])
    learning_rate = trial.suggest_float("learning_rate", 0, 1)
    gamma = trial.suggest_float("gamma", 0, 1)
    max_depth = trial.suggest_int("max_depth", 0, 20)
    
    modelXGBoost_optuna = XGBClassifier(eval_metric="logloss", booster=booster, learning_rate=learning_rate, gamma=gamma,
                                        max_depth=max_depth, random_state=0, use_label_encoder=False)
    
    modelXGBoost_optuna.fit(X_train, y_train)

    y_pred_XGBoost_optuna = modelXGBoost_optuna.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_XGBoost_optuna)
    return accuracy

In [8]:
# Prueba con GridSampler
optuna.logging.set_verbosity(optuna.logging.WARNING)

search_space = {"booster": ["gbtree", "gblinear", "dart"], 
                "learning_rate": np.arange(0.001, 1, 0.1665),
                "gamma": np.arange(0, 0.1, 0.025),
                "max_depth": range(0, 20, 2)
               }
sampler = optuna.samplers.GridSampler(search_space)
study_Grid = optuna.create_study(direction="maximize", sampler=sampler)
study_Grid.optimize(objectiveXGBoost_Grid)

In [9]:
study_Grid.best_trial

FrozenTrial(number=0, values=[0.8333333333333334], datetime_start=datetime.datetime(2022, 6, 11, 23, 12, 52, 659897), datetime_complete=datetime.datetime(2022, 6, 11, 23, 12, 53, 550274), params={'booster': 'gblinear', 'learning_rate': 0.001, 'gamma': 0.05, 'max_depth': 18}, distributions={'booster': CategoricalDistribution(choices=('gbtree', 'gblinear', 'dart')), 'learning_rate': UniformDistribution(high=1.0, low=0.0), 'gamma': UniformDistribution(high=1.0, low=0.0), 'max_depth': IntUniformDistribution(high=20, low=0, step=1)}, user_attrs={}, system_attrs={'search_space': OrderedDict([('booster', ['dart', 'gblinear', 'gbtree']), ('gamma', [0.0, 0.025, 0.05, 0.07500000000000001]), ('learning_rate', [0.001, 0.1675, 0.334, 0.5005000000000001, 0.667, 0.8335]), ('max_depth', [0, 2, 4, 6, 8, 10, 12, 14, 16, 18])]), 'grid_id': 369}, intermediate_values={}, trial_id=0, state=TrialState.COMPLETE, value=None)

In [10]:
# Definir y entrenar el modelo
modelXGBoost_optuna_Grid = XGBClassifier(eval_metric="logloss", booster="gblinear", learning_rate=0.001, gamma=0.05,
                                         max_depth=18, random_state=0, use_label_encoder=False)  
modelXGBoost_optuna_Grid.fit(X_train, y_train)

# Predicción en partición de test
y_pred_XGBoost_optuna_Grid = modelXGBoost_optuna_Grid.predict(X_test)

# Precisión en partición de test
accuracy = accuracy_score(y_test, y_pred_XGBoost_optuna_Grid)
print("Accuracy: {:0.2f}%".format(accuracy * 100))

Accuracy: 83.33%


In [11]:
def objectiveXGBoost_TPE(trial):
    '''
    Define la función a optimizar por medio de un sampler de tipo TPE.
    En este caso se trata de maximizar el accuracy
    '''
    booster = trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"])
    learning_rate = trial.suggest_float("learning_rate", 0.001, 1, step=0.1665)
    gamma = trial.suggest_float("gamma", 0, 0.1, step=0.025)
    max_depth = trial.suggest_int("max_depth", 0, 20, 2)
    
    modelXGBoost_optuna = XGBClassifier(eval_metric="logloss", booster=booster, learning_rate=learning_rate, gamma=gamma,
                                        max_depth=max_depth, random_state=0, use_label_encoder=False)
    
    modelXGBoost_optuna.fit(X_train, y_train)

    y_pred_XGBoost_optuna = modelXGBoost_optuna.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_XGBoost_optuna)
    return accuracy

In [12]:
# Prueba con TPE
optuna.logging.set_verbosity(optuna.logging.WARNING)

sampler = optuna.samplers.TPESampler(seed=0)  # Asegurar los reproducibilidad de los resultados
study_TPE = optuna.create_study(direction="maximize", sampler=sampler)
study_TPE.optimize(objectiveXGBoost_TPE, n_trials=80)
# n_trials = (3 x 6 x 4 x 11) * 0.1 = 79.2 ~ 80

In [13]:
study_TPE.best_trial

FrozenTrial(number=2, values=[0.8333333333333334], datetime_start=datetime.datetime(2022, 6, 10, 19, 5, 30, 272856), datetime_complete=datetime.datetime(2022, 6, 10, 19, 5, 30, 499449), params={'booster': 'gblinear', 'learning_rate': 0.001, 'gamma': 0.0, 'max_depth': 18}, distributions={'booster': CategoricalDistribution(choices=('gbtree', 'gblinear', 'dart')), 'learning_rate': DiscreteUniformDistribution(high=1.0, low=0.001, q=0.1665), 'gamma': DiscreteUniformDistribution(high=0.1, low=0.0, q=0.025), 'max_depth': IntUniformDistribution(high=20, low=0, step=2)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=2, state=TrialState.COMPLETE, value=None)

In [13]:
# Definir y entrenar el modelo
modelXGBoost_optuna_TPE = XGBClassifier(eval_metric="logloss", booster="gblinear", learning_rate=0.001, gamma=0,
                                        max_depth=18, random_state=0, use_label_encoder=False) 
modelXGBoost_optuna_TPE.fit(X_train, y_train)

# Predicción en partición de test
y_pred_XGBoost_optuna_TPE = modelXGBoost_optuna_TPE.predict(X_test)

# Precisión en partición de test
accuracy = accuracy_score(y_test, y_pred_XGBoost_optuna_TPE)
print("Accuracy: {:0.2f}%".format(accuracy * 100))

Accuracy: 83.33%


Búsqueda mediante ``optuna`` con ``OptunaSearchCV``:

In [14]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Definir y entrenar el modelo
model_XGB = XGBClassifier(eval_metric="logloss", random_state=0, use_label_encoder=False)
param_grid_XGB = {
    "booster": optuna.distributions.CategoricalDistribution(["gbtree", "gblinear", "dart"]),
    "learning_rate": optuna.distributions.DiscreteUniformDistribution(0.001, 1, 0.1665),
    "gamma": optuna.distributions.DiscreteUniformDistribution(0, 0.1, 0.025),
    "max_depth": optuna.distributions.IntUniformDistribution(0, 20, 2) # 0 = ninguna restricción
}
# Probamos también 6 valores de learning_rate, aunque ahora el paso entre uno y otro es necesariamente el mismo

optuna_search = optuna.integration.OptunaSearchCV(model_XGB, param_grid_XGB, cv=4, n_trials=792, refit=True, random_state=0)
# n_trials = 3 x 6 x 4 x 11 = 792
optuna_search.fit(X_train, y_train)

OptunaSearchCV(cv=4,
               estimator=XGBClassifier(base_score=None, booster=None,
                                       colsample_bylevel=None,
                                       colsample_bynode=None,
                                       colsample_bytree=None,
                                       enable_categorical=False,
                                       eval_metric='logloss', gamma=None,
                                       gpu_id=None, importance_type=None,
                                       interaction_constraints=None,
                                       learning_rate=None, max_delta_step=None,
                                       max_depth=None, min_child_weight=None,
                                       missing=nan, mo...
                                       validate_parameters=None,
                                       verbosity=None),
               n_trials=792,
               param_distributions={'booster': CategoricalDistribution(cho

In [15]:
optunaCV_opt = optuna_search.best_estimator_
optunaCV_opt

XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=0.025,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=1.0, max_delta_step=None, max_depth=10,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [16]:
optunaCV_opt.fit(X_train, y_train)

# Predicción en partición de test
y_pred_XGB_optuna = optunaCV_opt.predict(X_test)

# Precisión en partición de test
accuracy = accuracy_score(y_test, y_pred_XGB_optuna)
print("Accuracy: {:0.2f}%".format(accuracy * 100))

Accuracy: 61.11%


# Create submissions

In [None]:
import pathlib
from datetime import datetime

def create_submission(pred, test_id=testFNC["Id"]):
    '''
    Función para generar un csv con las predicciones de un modelo para participar en la competición de Kaggle
    '''
    submissionDF = pd.DataFrame(list(zip(test_id, pred)), columns=["Id", "Probability"])
    print(submissionDF.shape) # Comprobación del tamaño, debe ser: (119748, 2)
    current_time = datetime.now().strftime("%d-%m-%Y_%Hh%Mmin")
    current_path = pathlib.Path().resolve()
    parent_path = current_path.parent
    submissionDF.to_csv(f"{parent_path}\submissions\MLSP_submission_XGBoost_{current_time}.csv", header=True, index=False)