# Nested Cross-Validation (NestedCV) com XGBoost e Optuna

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier
import numpy as np
import optuna
import pandas as pd

# Load the pre-processed dataset (path is relative to the working directory).
df = pd.read_csv('./datasets/original_treated.csv')

# Columns excluded from the feature matrix: the target itself plus
# identifier, grade and status columns.
# NOTE(review): presumably the grade/status columns are removed to avoid
# target leakage -- confirm against the dataset description.
cols_to_drop = [
    'status',
    'g2',
    'final_grade',
    'canceled_discipline',
    'skipped_discipline',
    'class_skips',
    'id',
    'student_id',
    'is_approved',
]

# Target vector.
y = df['is_approved']

# Feature matrix: every column whose name is not in `cols_to_drop`.
# Names in the list that do not exist in the CSV are simply not matched,
# so a missing column never raises here.
X = df.loc[:, ~df.columns.isin(cols_to_drop)]

# Settings for the validation and optimisation loops.
N_OUTER_FOLDS = 5  # number of splits of the outer (evaluation) loop
N_INNER_FOLDS = 3  # number of splits of the inner (tuning) loop
N_TRIALS = 20      # Optuna trials run per outer fold

# Outer cross-validation splitter: stratified, shuffled, fixed seed.
outer_cv = StratifiedKFold(
    n_splits=N_OUTER_FOLDS,
    shuffle=True,
    random_state=42,
)

# Accumulators filled once per outer fold: test accuracy and tuned params.
outer_results, best_params_per_fold = [], []

print(f"Iniciando Nested CV com {N_OUTER_FOLDS} loops externos...")

# XGBoost settings that are fixed rather than tuned. Keeping them in one place
# guarantees the inner-loop candidates and the final per-fold model are built
# with exactly the same non-tuned configuration.
FIXED_XGB_PARAMS = {
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': 0,
    'objective': 'binary:logistic',
}

# Inner splitter used to score each hyperparameter candidate. It is built once
# because it does not depend on the fold or the trial; with an integer
# random_state it yields the same shuffling every time it is asked to split
# the same data, so all trials of a fold are compared on identical splits.
inner_cv = StratifiedKFold(n_splits=N_INNER_FOLDS, shuffle=True, random_state=42)

# Outer loop: each iteration tunes on the outer-train part only and then
# evaluates the tuned model once on the held-out outer-test part.
for i, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
    print(f"\n[Outer Fold {i+1}/{N_OUTER_FOLDS}] Iniciando otimização...")

    # Materialise this fold's outer-train / outer-test partitions.
    X_train_outer, X_test_outer = X.iloc[train_idx], X.iloc[test_idx]
    y_train_outer, y_test_outer = y.iloc[train_idx], y.iloc[test_idx]

    def objective(trial: optuna.trial.Trial) -> float:
        """Score one hyperparameter candidate on the current outer-train set.

        Samples a configuration from the search space and returns its mean
        inner cross-validation accuracy, which the study maximises. Only
        ``X_train_outer`` / ``y_train_outer`` are used, so the outer test
        partition never influences the search.
        """
        # Search space for the tuned hyperparameters, plus the fixed settings.
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            **FIXED_XGB_PARAMS,
        }
        clf = XGBClassifier(**params)

        scores = cross_val_score(
            clf, X_train_outer, y_train_outer, cv=inner_cv, scoring='accuracy'
        )
        return scores.mean()

    # Seed the sampler so the search (and therefore the reported accuracies)
    # is repeatable from run to run; the seed is offset by the fold index so
    # each fold gets its own, still deterministic, sequence of candidates.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42 + i),
    )
    study.optimize(objective, n_trials=N_TRIALS)

    # Record only the tuned hyperparameters for this fold.
    tuned_params = study.best_params
    print(f"\t-> Melhores params deste fold: {tuned_params}")
    best_params_per_fold.append(tuned_params)

    # Full configuration of the fold's final model: tuned values plus the
    # same fixed settings that were used while tuning.
    best_params = {**tuned_params, **FIXED_XGB_PARAMS}

    # Refit on the whole outer-train partition with the selected configuration.
    final_model_fold = XGBClassifier(**best_params)
    final_model_fold.fit(X_train_outer, y_train_outer)

    # Single evaluation on the outer-test partition, which took no part in
    # the hyperparameter search above.
    preds = final_model_fold.predict(X_test_outer)
    acc = accuracy_score(y_test_outer, preds)

    outer_results.append(acc)
    print(f"\t-> Acurácia: {acc:.4f}")

# Final performance report: per-fold accuracies, their mean, and their
# standard deviation (NumPy's default, i.e. the population form, ddof=0).
fold_accuracies = np.asarray(outer_results)
mean_acc = fold_accuracies.mean()
std_acc = fold_accuracies.std()

print("\n--- RESUMO DO NESTED CV ---")
print(f"Acurácias por fold: {outer_results}")
print(f"Média Geral de Acurácia: {mean_acc:.4f}")
print(f"Desvio Padrão: +/- {std_acc:.4f}")

[I 2025-12-01 00:19:44,858] A new study created in memory with name: no-name-2562c25c-22e5-4f16-80d1-baec07283e54


Iniciando Nested CV com 5 loops externos...

[Outer Fold 1/5] Iniciando otimização...


[I 2025-12-01 00:19:45,281] Trial 0 finished with value: 0.8391448827087603 and parameters: {'n_estimators': 62, 'max_depth': 4, 'learning_rate': 0.019593435714656455, 'subsample': 0.5414177889219085, 'colsample_bytree': 0.5165339005731644}. Best is trial 0 with value: 0.8391448827087603.
[I 2025-12-01 00:19:47,585] Trial 1 finished with value: 0.9685415431102804 and parameters: {'n_estimators': 289, 'max_depth': 10, 'learning_rate': 0.29500172586495904, 'subsample': 0.6056659161500859, 'colsample_bytree': 0.7624475937525563}. Best is trial 1 with value: 0.9685415431102804.
[I 2025-12-01 00:19:48,151] Trial 2 finished with value: 0.9706260651384744 and parameters: {'n_estimators': 121, 'max_depth': 3, 'learning_rate': 0.13976225411249454, 'subsample': 0.9208070635421871, 'colsample_bytree': 0.9326243288443845}. Best is trial 2 with value: 0.9706260651384744.
[I 2025-12-01 00:19:49,312] Trial 3 finished with value: 0.9711945915197319 and parameters: {'n_estimators': 229, 'max_depth': 5,

  -> Melhores params deste fold: {'n_estimators': 152, 'max_depth': 10, 'learning_rate': 0.06087701990026521, 'subsample': 0.6375523287700924, 'colsample_bytree': 0.6974551595625749}


[I 2025-12-01 00:20:17,272] A new study created in memory with name: no-name-30ca757c-8507-4298-a989-d201ebdf7d5a


  -> Acurácia no Teste Externo: 0.9686

[Outer Fold 2/5] Iniciando otimização...


[I 2025-12-01 00:20:17,703] Trial 0 finished with value: 0.9711199545196134 and parameters: {'n_estimators': 78, 'max_depth': 4, 'learning_rate': 0.140570924919093, 'subsample': 0.6190376488469422, 'colsample_bytree': 0.8526695253117527}. Best is trial 0 with value: 0.9711199545196134.
[I 2025-12-01 00:20:18,225] Trial 1 finished with value: 0.965965510706841 and parameters: {'n_estimators': 77, 'max_depth': 6, 'learning_rate': 0.029854500552442622, 'subsample': 0.6076420212957563, 'colsample_bytree': 0.5285412136957468}. Best is trial 0 with value: 0.9711199545196134.
[I 2025-12-01 00:20:19,291] Trial 2 finished with value: 0.9708167519423916 and parameters: {'n_estimators': 199, 'max_depth': 5, 'learning_rate': 0.023788787572211072, 'subsample': 0.7110262782222005, 'colsample_bytree': 0.809448273389164}. Best is trial 0 with value: 0.9711199545196134.
[I 2025-12-01 00:20:20,249] Trial 3 finished with value: 0.9704377487208641 and parameters: {'n_estimators': 143, 'max_depth': 7, 'lea

  -> Melhores params deste fold: {'n_estimators': 273, 'max_depth': 8, 'learning_rate': 0.021814638098423256, 'subsample': 0.7746908583668178, 'colsample_bytree': 0.8263435119075749}


[I 2025-12-01 00:20:40,910] A new study created in memory with name: no-name-9d4c4f8b-7b3d-4e95-969e-fad5d0a0b70c


  -> Acurácia no Teste Externo: 0.9720

[Outer Fold 3/5] Iniciando otimização...


[I 2025-12-01 00:20:42,550] Trial 0 finished with value: 0.9664961152169793 and parameters: {'n_estimators': 175, 'max_depth': 9, 'learning_rate': 0.010881326387918312, 'subsample': 0.8241045671602907, 'colsample_bytree': 0.5214632841018221}. Best is trial 0 with value: 0.9664961152169793.
[I 2025-12-01 00:20:43,507] Trial 1 finished with value: 0.9705514496873224 and parameters: {'n_estimators': 209, 'max_depth': 4, 'learning_rate': 0.061423235216163025, 'subsample': 0.6992010584959579, 'colsample_bytree': 0.5043067222687448}. Best is trial 1 with value: 0.9705514496873224.
[I 2025-12-01 00:20:45,903] Trial 2 finished with value: 0.9709304529088497 and parameters: {'n_estimators': 259, 'max_depth': 10, 'learning_rate': 0.040849530149946994, 'subsample': 0.8796982831015281, 'colsample_bytree': 0.5674361936936168}. Best is trial 2 with value: 0.9709304529088497.
[I 2025-12-01 00:20:46,539] Trial 3 finished with value: 0.968125829069547 and parameters: {'n_estimators': 73, 'max_depth': 8

  -> Melhores params deste fold: {'n_estimators': 245, 'max_depth': 8, 'learning_rate': 0.035038949989737146, 'subsample': 0.7635531539985577, 'colsample_bytree': 0.7974929437172691}


[I 2025-12-01 00:21:10,470] A new study created in memory with name: no-name-ebe028c7-d3d3-4c27-9339-15e0d95658d9


  -> Acurácia no Teste Externo: 0.9744

[Outer Fold 4/5] Iniciando otimização...


[I 2025-12-01 00:21:11,203] Trial 0 finished with value: 0.970020845177184 and parameters: {'n_estimators': 140, 'max_depth': 4, 'learning_rate': 0.023329195137639556, 'subsample': 0.8281510057687826, 'colsample_bytree': 0.9172877877966467}. Best is trial 0 with value: 0.970020845177184.
[I 2025-12-01 00:21:12,323] Trial 1 finished with value: 0.9713094561303771 and parameters: {'n_estimators': 114, 'max_depth': 10, 'learning_rate': 0.05576023158884255, 'subsample': 0.5071811037825481, 'colsample_bytree': 0.9382178242865908}. Best is trial 1 with value: 0.9713094561303771.
[I 2025-12-01 00:21:14,774] Trial 2 finished with value: 0.9705135493651696 and parameters: {'n_estimators': 272, 'max_depth': 9, 'learning_rate': 0.02059144039692514, 'subsample': 0.6219611778899679, 'colsample_bytree': 0.7201583254744601}. Best is trial 1 with value: 0.9713094561303771.
[I 2025-12-01 00:21:15,921] Trial 3 finished with value: 0.96581390941823 and parameters: {'n_estimators': 104, 'max_depth': 10, '

  -> Melhores params deste fold: {'n_estimators': 114, 'max_depth': 10, 'learning_rate': 0.05576023158884255, 'subsample': 0.5071811037825481, 'colsample_bytree': 0.9382178242865908}


[I 2025-12-01 00:21:38,374] A new study created in memory with name: no-name-feeea945-cf7a-419c-9e62-07c4ca68fc7d


  -> Acurácia no Teste Externo: 0.9747

[Outer Fold 5/5] Iniciando otimização...


[I 2025-12-01 00:21:38,990] Trial 0 finished with value: 0.9696039416335038 and parameters: {'n_estimators': 119, 'max_depth': 4, 'learning_rate': 0.04107235256773821, 'subsample': 0.7023545721266817, 'colsample_bytree': 0.8290169660707911}. Best is trial 0 with value: 0.9696039416335038.
[I 2025-12-01 00:21:39,475] Trial 1 finished with value: 0.9700587454993368 and parameters: {'n_estimators': 70, 'max_depth': 5, 'learning_rate': 0.06452792037188135, 'subsample': 0.9022243820321001, 'colsample_bytree': 0.555523020391206}. Best is trial 1 with value: 0.9700587454993368.
[I 2025-12-01 00:21:40,999] Trial 2 finished with value: 0.9686185332575327 and parameters: {'n_estimators': 169, 'max_depth': 10, 'learning_rate': 0.253602708163175, 'subsample': 0.6208062823963798, 'colsample_bytree': 0.7689518523573489}. Best is trial 1 with value: 0.9700587454993368.
[I 2025-12-01 00:21:42,740] Trial 3 finished with value: 0.9709304529088497 and parameters: {'n_estimators': 177, 'max_depth': 10, 'l

  -> Melhores params deste fold: {'n_estimators': 247, 'max_depth': 8, 'learning_rate': 0.02695036659431838, 'subsample': 0.7683084465017948, 'colsample_bytree': 0.6618024949829341}
  -> Acurácia no Teste Externo: 0.9712

=== RESUMO DO NESTED CV ===
Acurácias por fold: [0.9686221009549796, 0.9719526986052153, 0.9743784111582777, 0.9746816252274105, 0.9711946634323833]
Média Geral de Acurácia: 0.9722
Desvio Padrão: +/- 0.0022
