# Optimización de un modelo de XGBoost

Este notebook recoge los resultados de la búsqueda del mejor modelo de clasificación mediante XGBoost (= eXtreme Gradient Boosting). Se trata de un método de boosting, por tanto, la idea es generar un modelo robusto a partir de varios modelos "débiles". Sin embargo, se le considera extreme gradient boosting ya que es generalmente bastante más rápido que otras implementaciones de gradient boosting y suele tener un buen rendimiento sobre datos estructurados.

Para buscar el mejor modelo posible, se tratará de buscar los mejores hiperparámetros para:

* El tipo de booster que se va a utilizar.
* El paso del método de boosting.
* La mínima reducción de loss exigida para hacer una nueva partición de una rama cuando el booster sea de tal tipo.
* La profundidad máxima de los árboles cuando el booster sea de tal tipo.

### Preparación de los datos

In [6]:
import pandas as pd
import numpy as np
# Data partition
from sklearn.model_selection import train_test_split
# Parameter tunning libraries
import optuna
from sklearn.model_selection import GridSearchCV
# Accuracy function
from sklearn.metrics import accuracy_score
# Model
import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
# Datos de entrenamiento
trainFNC = pd.read_csv("../data/train_FNC.csv")
trainSBM = pd.read_csv("../data/train_SBM.csv")
train_labels = pd.read_csv("../data/train_labels.csv")

# DataFrame con ambas fuentes de datos
train = pd.merge(left=trainFNC, right=trainSBM, left_on='Id', right_on='Id')
data = pd.merge(left=train_labels, right=train, left_on='Id', right_on='Id')
data.drop("Id", inplace=True, axis=1)

# Shuffle de los datos de train
data = data.sample(frac=1, random_state=0)
data.head(5)

Unnamed: 0,Class,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
2,0,0.24585,0.21662,-0.12468,-0.3538,0.1615,-0.002032,-0.13302,-0.035222,0.25904,...,-0.257114,0.597229,1.220756,-0.059213,-0.435494,-0.092971,1.09091,-0.448562,-0.508497,0.350434
13,1,0.41073,-0.031925,0.2107,0.24226,0.3201,-0.41929,-0.18714,0.16845,0.59979,...,-0.050862,0.870602,0.609465,1.181878,-2.279469,-0.013484,-0.012693,-1.244346,-1.080442,-0.788502
53,1,0.070919,0.034179,-0.011755,0.019158,0.024645,-0.032022,0.00462,0.31817,0.21255,...,-1.539922,-1.495822,1.643866,1.68778,1.521086,-1.988432,-0.267471,0.510576,1.104566,-1.067206
41,0,0.087377,-0.052462,-0.007835,-0.11283,0.38938,0.21608,0.063572,-0.25123,-0.080568,...,-0.077353,-0.459463,-0.204328,-0.619508,-1.410523,-0.304622,-1.521928,0.593691,0.073638,-0.26092
74,0,0.20275,0.19142,-0.056662,-0.15778,0.24404,0.03978,-0.001503,0.001056,-0.048222,...,0.044457,0.593326,1.063052,0.434726,1.604964,-0.359736,0.210107,0.355922,0.730287,-0.323557


Vamos a usar la siguiente partición de los datos:

* 60% train $\sim$ 50 datos
* 20% validation $\sim$ 18 datos (se define al aplicar cross-validación en el ajuste)
* 20% test $\sim$ 18 datos

In [3]:
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print("Tamaño del dataset de train:", X_train.shape)
print("Tamaño del dataset de test:", X_test.shape)

Tamaño del dataset de train: (68, 410)
Tamaño del dataset de test: (18, 410)


In [4]:
# Datos de test
testFNC = pd.read_csv("../data/test_FNC.csv")
testSBM = pd.read_csv("../data/test_SBM.csv")

# DataFrame con ambas fuentes de datos
test = pd.merge(left=testFNC, right=testSBM, left_on='Id', right_on='Id')
test.drop("Id", inplace=True, axis=1)
test.head(5)

Unnamed: 0,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,FNC10,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
0,0.476127,0.064466,0.053238,-0.608133,0.073988,-0.637038,0.113556,-0.192434,-0.004025,-0.060474,...,-0.451994,1.12377,2.083006,1.14544,-0.067608,1.202529,0.851587,0.451583,-0.159739,0.192076
1,0.013833,0.267183,0.232178,-0.167151,-0.261327,0.191869,0.406493,0.088761,0.177048,0.036718,...,0.696987,1.397832,1.046136,-0.191733,-2.192023,-0.369276,0.822225,-0.109342,-0.580476,0.17416
2,-0.435452,0.04678,0.243742,0.39703,-0.147821,0.17362,-0.461963,-0.610736,0.419753,0.400985,...,0.160145,1.906989,-2.661633,-0.193911,0.440873,0.641739,0.918397,-0.758046,0.154701,-0.476647
3,-0.20451,-0.036735,-0.760705,-0.740495,0.064668,0.349926,-0.273826,-0.174384,-0.120248,0.175618,...,0.974828,-1.997087,-2.083782,1.154107,-0.643947,2.332424,0.659124,-0.809445,0.55896,2.790871
4,0.599435,-0.166441,0.122431,0.011539,0.346906,-0.01743,-0.274734,0.21151,0.151012,-0.033434,...,-0.789153,1.578984,1.402592,-1.23044,0.296686,2.806314,0.427184,-0.240682,-0.196948,-1.544345


### Modelo

In [5]:
def train_model(model, param_grid):
    '''Función para realizar el entrenamiento y la búsqueda de hiperparámetros'''
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=4)
    # cv = 4 porque así: el conjunto de validation tiene un 0.25 del tamaño de train y: 0.25 * 0.8 = 0.2 ~ 20% datos
    #                    el conjunto de train tiene un 0.75 del tamaño de train y: 0.75 * 0.8 = 0.6 ~60% datos
    grid_search.fit(X_train, y_train)
    
    print("Parámetros óptimos:", grid_search.best_params_)
    print("Modelo óptimo:", grid_search.best_estimator_)
    
    return grid_search.best_estimator_

Búsqueda de hiperparámetros mediante ``GridSearchCV`` de ``sklearn``:

In [10]:
import warnings
warnings.filterwarnings("ignore")
xgb.set_config(verbosity=0)

# Definir y entrenar el modelo
model_XGB = XGBClassifier(eval_metric="logloss", random_state=0, use_label_encoder=False)
param_grid_XGB = {
    "booster": ["gbtree", "gblinear", "dart"],
    "learning_rate": [0.001, 0.05, 0.1, 0.5],
    "gamma": [0, 0.001, 0.005, 0.01, 0.05, 0.1],
    "max_depth": [5, 6, 10, 15, 20, 0] # 0 = ninguna restricción
#     "gamma": [0],
#     "max_depth": [6, 10] # 0 = ninguna restricción
}
model_XGB_opt = train_model(model_XGB, param_grid_XGB)

# Predicción en partición de test
y_pred_XGB = model_XGB_opt.predict(X_test)

# Precisión en partición de test
accuracy = accuracy_score(y_test, y_pred_XGB)
print("Accuracy: {:0.2f}%".format(accuracy * 100))

# Predicción en test para kaggle
y_pred_kaggle_XGB = model_XGB_opt.predict(test)

Parámetros óptimos: {'booster': 'gblinear', 'gamma': 0.005, 'learning_rate': 0.5, 'max_depth': 6}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=0.005,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=0.5, max_delta_step=None, max_depth=6,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              use_label_encoder=False, validate_parameters=1, verbosity=None)
Accuracy: 72.22%


Búsqueda mediante ``optuna`` con ``OptunaSearchCV``:

In [18]:
# Definir y entrenar el modelo
model_XGB = XGBClassifier(eval_metric="logloss", random_state=0, use_label_encoder=False)
param_grid_XGB = {
    "booster": optuna.distributions.CategoricalDistribution(["gbtree", "gblinear", "dart"]),
    "learning_rate": optuna.distributions.DiscreteUniformDistribution(0.001, 0.1, 0.033),
    "gamma": optuna.distributions.DiscreteUniformDistribution(0, 0.1, 0.025),
    "max_depth": optuna.distributions.IntUniformDistribution(0, 20, 2) # 0 = ninguna restricción
#     "gamma": [0],
#     "max_depth": [6, 10] # 0 = ninguna restricción
}

optuna_search = optuna.integration.OptunaSearchCV(model_XGB, param_grid_XGB, cv=4, n_trials=396, random_state=0)
# n_trials = 3 x 3 x 4 x 11 = 396

optuna_search.fit(X_train, y_train)

[32m[I 2022-05-23 18:45:33,478][0m A new study created in memory with name: no-name-b2dd370a-73a9-4c91-b146-f6aaa1748e9e[0m
[32m[I 2022-05-23 18:45:38,591][0m Trial 0 finished with value: 0.5 and parameters: {'booster': 'dart', 'learning_rate': 0.001, 'gamma': 0.05, 'max_depth': 20}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-05-23 18:45:43,059][0m Trial 1 finished with value: 0.5441176470588235 and parameters: {'booster': 'dart', 'learning_rate': 0.1, 'gamma': 0.1, 'max_depth': 10}. Best is trial 1 with value: 0.5441176470588235.[0m
[32m[I 2022-05-23 18:45:43,989][0m Trial 2 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.001, 'gamma': 0.1, 'max_depth': 14}. Best is trial 2 with value: 0.6323529411764707.[0m
[32m[I 2022-05-23 18:45:44,961][0m Trial 3 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.1, 'gamma': 0.07500000000000001, 'max_depth': 2}. Best is trial 2 wit

[32m[I 2022-05-23 18:46:29,683][0m Trial 34 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.1, 'gamma': 0.1, 'max_depth': 18}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:46:30,663][0m Trial 35 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.05, 'max_depth': 18}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:46:35,246][0m Trial 36 finished with value: 0.5294117647058824 and parameters: {'booster': 'dart', 'learning_rate': 0.067, 'gamma': 0.025, 'max_depth': 18}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:46:36,028][0m Trial 37 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.034, 'gamma': 0.07500000000000001, 'max_depth': 16}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:46:36,797][0m Trial 38 f

[32m[I 2022-05-23 18:47:13,507][0m Trial 69 finished with value: 0.5294117647058824 and parameters: {'booster': 'gbtree', 'learning_rate': 0.1, 'gamma': 0.05, 'max_depth': 14}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:47:14,501][0m Trial 70 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.034, 'gamma': 0.1, 'max_depth': 10}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:47:15,249][0m Trial 71 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.05, 'max_depth': 18}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:47:16,016][0m Trial 72 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.05, 'max_depth': 20}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:47:16,827][0m Trial 73 finished with v

[32m[I 2022-05-23 18:47:52,062][0m Trial 104 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.025, 'max_depth': 16}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:47:52,832][0m Trial 105 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.05, 'max_depth': 12}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:47:53,595][0m Trial 106 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.05, 'max_depth': 12}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:47:55,130][0m Trial 107 finished with value: 0.5294117647058824 and parameters: {'booster': 'gbtree', 'learning_rate': 0.067, 'gamma': 0.025, 'max_depth': 16}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:47:56,137][0m Trial 108 finis

[32m[I 2022-05-23 18:48:31,228][0m Trial 139 finished with value: 0.5294117647058824 and parameters: {'booster': 'dart', 'learning_rate': 0.067, 'gamma': 0.05, 'max_depth': 10}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:48:32,172][0m Trial 140 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.001, 'gamma': 0.05, 'max_depth': 20}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:48:32,933][0m Trial 141 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.025, 'max_depth': 12}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:48:33,718][0m Trial 142 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.025, 'max_depth': 14}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:48:34,578][0m Trial 143 finishe

[32m[I 2022-05-23 18:49:06,247][0m Trial 174 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.025, 'max_depth': 20}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:49:07,250][0m Trial 175 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.0, 'max_depth': 10}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:49:08,167][0m Trial 176 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.025, 'max_depth': 20}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:49:08,931][0m Trial 177 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.05, 'max_depth': 10}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:49:09,715][0m Trial 178 fini

[32m[I 2022-05-23 18:49:41,199][0m Trial 209 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.1, 'max_depth': 20}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:49:42,011][0m Trial 210 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.05, 'max_depth': 12}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:49:42,969][0m Trial 211 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.025, 'max_depth': 20}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:49:43,947][0m Trial 212 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.0, 'max_depth': 18}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:49:44,731][0m Trial 213 finish

[32m[I 2022-05-23 18:50:12,813][0m Trial 244 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.0, 'max_depth': 10}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:50:13,668][0m Trial 245 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.07500000000000001, 'max_depth': 10}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:50:14,667][0m Trial 246 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.025, 'max_depth': 16}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:50:15,648][0m Trial 247 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.07500000000000001, 'max_depth': 10}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:

[32m[I 2022-05-23 18:50:55,595][0m Trial 279 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.1, 'max_depth': 20}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:50:56,549][0m Trial 280 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.1, 'max_depth': 20}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:50:57,657][0m Trial 281 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.05, 'max_depth': 18}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:50:58,977][0m Trial 282 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.0, 'max_depth': 12}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:50:59,880][0m Trial 283 finished

[32m[I 2022-05-23 18:51:34,979][0m Trial 314 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.05, 'max_depth': 16}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:51:36,200][0m Trial 315 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.05, 'max_depth': 16}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:51:37,110][0m Trial 316 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.07500000000000001, 'max_depth': 16}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:51:41,866][0m Trial 317 finished with value: 0.5294117647058824 and parameters: {'booster': 'dart', 'learning_rate': 0.067, 'gamma': 0.025, 'max_depth': 16}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:51:42,684][0m Tri

[32m[I 2022-05-23 18:52:16,123][0m Trial 349 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.1, 'max_depth': 20}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:52:17,138][0m Trial 350 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.1, 'max_depth': 20}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:52:18,132][0m Trial 351 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.0, 'max_depth': 10}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:52:18,959][0m Trial 352 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.0, 'max_depth': 8}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:52:19,739][0m Trial 353 finished w

[32m[I 2022-05-23 18:52:53,493][0m Trial 384 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.025, 'max_depth': 16}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:52:54,401][0m Trial 385 finished with value: 0.6323529411764707 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.025, 'max_depth': 16}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:52:55,451][0m Trial 386 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.0, 'max_depth': 0}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:52:56,408][0m Trial 387 finished with value: 0.6470588235294118 and parameters: {'booster': 'gblinear', 'learning_rate': 0.067, 'gamma': 0.025, 'max_depth': 16}. Best is trial 12 with value: 0.6470588235294118.[0m
[32m[I 2022-05-23 18:52:57,234][0m Trial 388 fini

OptunaSearchCV(cv=4,
               estimator=XGBClassifier(base_score=None, booster=None,
                                       colsample_bylevel=None,
                                       colsample_bynode=None,
                                       colsample_bytree=None,
                                       enable_categorical=False,
                                       eval_metric='logloss', gamma=None,
                                       gpu_id=None, importance_type=None,
                                       interaction_constraints=None,
                                       learning_rate=None, max_delta_step=None,
                                       max_depth=None, min_child_weight=None,
                                       missing=nan, mo...
                                       validate_parameters=None,
                                       verbosity=None),
               n_trials=396,
               param_distributions={'booster': CategoricalDistribution(cho

In [20]:
# Predicción en partición de test
y_pred_XGB = optuna_search.predict(X_test)

# Precisión en partición de test
accuracy = accuracy_score(y_test, y_pred_XGB)
print("Accuracy: {:0.2f}%".format(accuracy * 100))

Accuracy: 72.22%


En este caso, ambos métodos nos proporcionan los mismos resultados.

# Create submissions

In [None]:
import pathlib
from datetime import datetime

def create_submission(pred, method, test_id=testFNC["Id"]):
    submissionDF = pd.DataFrame(list(zip(test_id, pred)), columns=["Id", "Probability"])
    print(submissionDF.shape) # Comprobación del tamaño, debe ser: (119748, 2)
    current_time = datetime.now().strftime("%d-%m-%Y_%Hh%Mmin")
    current_path = pathlib.Path().resolve()
    parent_path = current_path.parent
    submissionDF.to_csv(f"{parent_path}\submissions\MLSP_submission_{method}_{current_time}.csv", header=True, index=False)