# Carga de datos

In [504]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, cohen_kappa_score
from sklearn.preprocessing import StandardScaler
import optuna
from optuna.artifacts import FileSystemArtifactStore, upload_artifact
from lightgbm import LGBMClassifier
import os
import shutil
import time
import copy
import datetime
from tqdm import tqdm
import torch
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import torch.nn.functional as F
from joblib import load, dump
from utils import plot_confusion_matrix
from scipy.stats import mode

In [505]:
resnet_pred = pd.read_csv('resnet_preds_final.csv')

In [506]:
text = pd.read_csv('predictions_desc.csv')

In [507]:
df = pd.read_csv('df_refinado.csv').drop(columns=['resnet_pred', 'Prob_text', 'Pred_text'])

In [508]:
df = pd.merge(df, resnet_pred, left_on="PetID", right_on="PetID", how="left")

In [509]:
df = pd.merge(df, text, left_on="PetID", right_on="ID", how="left")

In [510]:
df.drop(columns=['ID'], inplace=True)

In [511]:
df_original = pd.read_csv('train.csv')['PetID']
df_train = pd.merge(df_original, df, left_on="PetID", right_on="PetID", how="left")
df_train.shape

(14993, 52)

In [512]:
df = df_train

In [513]:
df.head(3)

Unnamed: 0,PetID,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,...,Sentiment_Negativo,Sentiment_Neutro,Sentiment_Positivo,Sentiment_Negativo.1,Sentiment_Neutro.1,Sentiment_Positivo.1,RescuerGroup,resnet_pred,Probabilities,Prediction
0,86e1089a3,2,Nibble,3,299,0,1,1,7,0,...,False,False,True,False,False,True,2,,[0.00075255 0.15268189 0.80045265 0.04555201 0...,2.0
1,6296e909a,2,No name yet,1,265,0,1,1,2,0,...,False,False,True,False,False,True,1,,[0.02892482 0.31997138 0.54787356 0.09699186 0...,2.0
2,3422e4906,1,Brisco,1,307,0,1,2,7,0,...,False,False,True,False,False,True,5,,[0.00089478 0.00274030 0.12768795 0.86501288 0...,3.0


# Modelos

## Data Leakage

In [514]:
X = df.select_dtypes(exclude='object').drop(columns=['AdoptionSpeed'])
y = df['AdoptionSpeed']

In [515]:
df['AdoptionSpeed'].value_counts(dropna=False)

AdoptionSpeed
4    4197
2    4037
3    3259
1    3090
0     410
Name: count, dtype: int64

In [516]:
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=7)

In [517]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [518]:
def kappa_metric(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred)

In [519]:
storage = "sqlite:////Users/paul/Downloads/Documentos Locales/labo-ii/optuna_studies.db"

In [520]:
def objective_rf(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    }
    model = RandomForestClassifier(**params, random_state=7, n_jobs=-1)
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_val_scaled)
    return kappa_metric(y_val, preds)

study_rf = optuna.create_study(
    direction="maximize",
    study_name="rf_dl",
    storage=storage,
    load_if_exists=True
)
study_rf.optimize(objective_rf, n_trials=5)

model_rf = RandomForestClassifier(**study_rf.best_params, random_state=7, n_jobs=-1)
model_rf.fit(X_train_scaled, y_train)
preds_rf = model_rf.predict(X_val_scaled)


[I 2025-05-08 17:42:39,940] Using an existing study with name 'rf_dl' instead of creating a new one.
[I 2025-05-08 17:42:40,142] Trial 5 finished with value: 0.5277498616573633 and parameters: {'n_estimators': 64, 'max_depth': 8, 'min_samples_split': 6}. Best is trial 1 with value: 0.5534352032007082.
[I 2025-05-08 17:42:40,391] Trial 6 finished with value: 0.5575132327828793 and parameters: {'n_estimators': 90, 'max_depth': 10, 'min_samples_split': 7}. Best is trial 6 with value: 0.5575132327828793.
[I 2025-05-08 17:42:40,593] Trial 7 finished with value: 0.34114341004944815 and parameters: {'n_estimators': 96, 'max_depth': 3, 'min_samples_split': 5}. Best is trial 6 with value: 0.5575132327828793.
[I 2025-05-08 17:42:40,762] Trial 8 finished with value: 0.5398774409368182 and parameters: {'n_estimators': 60, 'max_depth': 8, 'min_samples_split': 10}. Best is trial 6 with value: 0.5575132327828793.
[I 2025-05-08 17:42:40,941] Trial 9 finished with value: 0.5129043664430892 and paramete

In [521]:
def objective_xgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "use_label_encoder": False,
        "eval_metric": "mlogloss"
    }
    model = XGBClassifier(**params, random_state=7, verbosity=0)
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_val_scaled)
    return kappa_metric(y_val, preds)

study_xgb = optuna.create_study(
    direction="maximize",
    study_name="xgb_dl",
    storage=storage,
    load_if_exists=True
)
study_xgb.optimize(objective_xgb, n_trials=5)

model_xgb = XGBClassifier(**study_xgb.best_params, random_state=7, use_label_encoder=False, eval_metric="mlogloss")
model_xgb.fit(X_train_scaled, y_train)
preds_xgb = model_xgb.predict(X_val_scaled)

[I 2025-05-08 17:42:41,196] Using an existing study with name 'xgb_dl' instead of creating a new one.
[I 2025-05-08 17:42:42,676] Trial 5 finished with value: 0.5664337987836014 and parameters: {'n_estimators': 96, 'max_depth': 8, 'learning_rate': 0.07570903714619329}. Best is trial 0 with value: 0.5678326623991661.
[I 2025-05-08 17:42:44,276] Trial 6 finished with value: 0.5660352145482788 and parameters: {'n_estimators': 94, 'max_depth': 8, 'learning_rate': 0.21827239648516497}. Best is trial 0 with value: 0.5678326623991661.
[I 2025-05-08 17:42:44,790] Trial 7 finished with value: 0.5648481073289754 and parameters: {'n_estimators': 76, 'max_depth': 3, 'learning_rate': 0.14394513089282773}. Best is trial 0 with value: 0.5678326623991661.
[I 2025-05-08 17:42:46,881] Trial 8 finished with value: 0.554593088900857 and parameters: {'n_estimators': 79, 'max_depth': 10, 'learning_rate': 0.2651880164590136}. Best is trial 0 with value: 0.5678326623991661.
[I 2025-05-08 17:42:47,527] Trial 9

In [522]:
def objective_lgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
    }
    model = LGBMClassifier(**params, random_state=7, verbosity=-1)
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_val_scaled)
    return kappa_metric(y_val, preds)

study_lgb = optuna.create_study(
    direction="maximize",
    study_name="lgb_dl",
    storage=storage,
    load_if_exists=True
)
study_lgb.optimize(objective_lgb, n_trials=5)

model_lgb = LGBMClassifier(**study_lgb.best_params, random_state=7)
model_lgb.fit(X_train_scaled, y_train)
preds_lgb = model_lgb.predict(X_val_scaled)

[I 2025-05-08 17:42:48,479] Using an existing study with name 'lgb_dl' instead of creating a new one.
[I 2025-05-08 17:42:48,969] Trial 5 finished with value: 0.5670033966284094 and parameters: {'n_estimators': 65, 'max_depth': 3, 'learning_rate': 0.19435088897212674}. Best is trial 3 with value: 0.5674007901296104.
[I 2025-05-08 17:42:49,593] Trial 6 finished with value: 0.5665376229660188 and parameters: {'n_estimators': 90, 'max_depth': 3, 'learning_rate': 0.28703544518842034}. Best is trial 3 with value: 0.5674007901296104.
[I 2025-05-08 17:42:50,024] Trial 7 finished with value: 0.5661726815822146 and parameters: {'n_estimators': 57, 'max_depth': 3, 'learning_rate': 0.25083644765243274}. Best is trial 3 with value: 0.5674007901296104.
[I 2025-05-08 17:42:52,089] Trial 8 finished with value: 0.5682831058251014 and parameters: {'n_estimators': 87, 'max_depth': 7, 'learning_rate': 0.11747655802549196}. Best is trial 8 with value: 0.5682831058251014.
[I 2025-05-08 17:42:54,001] Trial 

In [523]:
all_preds = np.vstack([preds_rf, preds_xgb, preds_lgb])
ensemble_preds = mode(all_preds, axis=0).mode

In [524]:
final_kappa = cohen_kappa_score(y_val, ensemble_preds)
print(f"Cohen's Kappa del Ensemble: {final_kappa:.4f}")

Cohen's Kappa del Ensemble: 0.5678


In [525]:
proba_rf = model_rf.predict_proba(X_val_scaled)
proba_xgb = model_xgb.predict_proba(X_val_scaled)
proba_lgb = model_lgb.predict_proba(X_val_scaled)



In [526]:
prob_resnet = pd.read_csv('prob_resnet.csv')
prob_resnet

Unnamed: 0,PetID,prob_class_0,prob_class_1,prob_class_2,prob_class_3,prob_class_4
0,015da9e87,-0.879113,0.443262,0.689422,-0.139108,-0.356720
1,022606901,-4.020277,-0.577126,1.333538,2.054347,1.475107
2,02f89bdcb,-2.336137,-0.366367,0.259281,1.189482,1.085584
3,0cf7fae9d,-0.899533,0.899696,0.859313,-0.421518,-0.684542
4,0e922caab,-1.021429,0.817223,0.393542,0.584349,-0.658304
...,...,...,...,...,...,...
2926,ff2cf88a0,-3.103668,-0.347119,0.866403,1.245120,1.353035
2927,ff498c903,-2.143353,-0.180348,0.465958,0.826810,0.799603
2928,ff50c6171,-1.933526,0.621045,-0.066495,0.907700,0.555991
2929,ff5e30380,-2.088701,0.234211,-0.059611,0.294012,1.263344


In [527]:
text_prob = text[['ID','Probabilities']]
proba_text = text_prob['Probabilities'].str.split(' ', expand=True)
proba_text[0] = proba_text[0].str.replace('[', '')
proba_text[4] = proba_text[4].str.replace(']', '')
text_proba = pd.merge(text_prob, proba_text, left_index=True, right_index=True).drop(columns=['Probabilities'])
text_proba

Unnamed: 0,ID,0,1,2,3,4
0,86e1089a3,0.00075255,0.15268189,0.80045265,0.04555201,0.00056093
1,6296e909a,0.02892482,0.31997138,0.54787356,0.09699186,0.00623844
2,3422e4906,0.00089478,0.00274030,0.12768795,0.86501288,0.00366400
3,5842f1ff5,0.00050428,0.06155059,0.84609097,0.09076333,0.00109088
4,850a43f90,0.00148470,0.01733202,0.69236803,0.28250989,0.00630536
...,...,...,...,...,...,...
14975,dc0935a84,0.00818448,0.67607671,0.29578659,0.01762347,0.00232873
14976,a01ab5b30,0.00261964,0.03075699,0.13270104,0.23656160,0.59736073
14977,d981b6395,0.00106464,0.00335503,0.13507260,0.74570012,0.11480758
14978,e4da1c9e4,0.00143031,0.00105063,0.00398096,0.00495054,0.98858750


In [528]:
_1,ids,_2,_3 = train_test_split(df, y, stratify=y, test_size=0.2, random_state=7)
ids = ids['PetID']

In [529]:
ids_ensemble_aux = pd.merge(ids, text_proba, left_on='PetID', right_on='ID', how='left')
ids_ensemble_aux = ids_ensemble_aux.fillna(0)
ids_ensemble_aux_sum = ids_ensemble_aux.drop(columns=['PetID','ID']).values

In [530]:
len(ids_ensemble_aux_sum)

2999

In [531]:
ids_ensemble_img_aux = pd.merge(ids_ensemble_aux, prob_resnet, left_on='PetID', right_on='PetID', how='inner')
ids_ensemble_img_aux

Unnamed: 0,PetID,ID,0,1,2,3,4,prob_class_0,prob_class_1,prob_class_2,prob_class_3,prob_class_4
0,cf78e66fd,cf78e66fd,0.00098114,0.03140692,0.69946831,0.26659068,0.00155298,-1.467817,1.738951,0.965889,0.190470,-1.377797
1,5de1d0c34,5de1d0c34,0.00274695,0.00269661,0.03190725,0.02141715,0.94123203,-2.546853,-0.141721,0.247514,0.282480,2.148874
2,43e4cb788,43e4cb788,0.01521666,0.82707489,0.09218681,0.04710428,0.01841737,-1.025800,0.385830,-0.194456,-0.471743,1.251837
3,feec7a438,feec7a438,0.00154261,0.00175889,0.00110020,0.00156655,0.99403173,-2.425889,-0.880943,0.483216,0.542247,2.417926
4,95855babf,95855babf,0.00041192,0.02822931,0.23739995,0.73264903,0.00130975,-1.878236,0.202715,0.447229,0.176778,1.032748
...,...,...,...,...,...,...,...,...,...,...,...,...
594,af8138fea,af8138fea,0.00159788,0.00190135,0.00363870,0.00970647,0.98315549,-3.089685,-0.041475,-0.301495,0.114695,3.036229
595,edcc13ccb,edcc13ccb,0.00236725,0.84133399,0.07660707,0.06778520,0.01190652,-1.433197,0.141028,0.838616,0.807360,-0.285771
596,5ada2f8b6,5ada2f8b6,0.01152135,0.53567052,0.43494299,0.01606540,0.00179972,-1.491605,1.056869,1.096664,0.170903,-0.929501
597,162c1d98a,162c1d98a,0.00483711,0.16835481,0.73992068,0.08577160,0.00111579,-0.929815,0.085813,0.725370,0.231037,-0.416319


In [532]:
ids_finales = ids_ensemble_img_aux['PetID'].unique()

In [533]:
text_ensemble_proba_arr = ids_ensemble_img_aux[[0,1,2,3,4]].values
text_ensemble_proba_arr

array([['0.00098114', '0.03140692', '0.69946831', '0.26659068',
        '0.00155298'],
       ['0.00274695', '0.00269661', '0.03190725', '0.02141715',
        '0.94123203'],
       ['0.01521666', '0.82707489', '0.09218681', '0.04710428',
        '0.01841737'],
       ...,
       ['0.01152135', '0.53567052', '0.43494299', '0.01606540',
        '0.00179972'],
       ['0.00483711', '0.16835481', '0.73992068', '0.08577160',
        '0.00111579'],
       ['0.02655957', '0.80220807', '0.14889178', '0.01586577',
        '0.00647473']], dtype=object)

In [534]:
img_ensemble_proba_arr = ids_ensemble_img_aux[['prob_class_0',	'prob_class_1',	'prob_class_2',	'prob_class_3', 'prob_class_4']].values
img_ensemble_proba_arr

array([[-1.4678171 ,  1.7389505 ,  0.9658895 ,  0.1904704 , -1.3777965 ],
       [-2.5468533 , -0.1417214 ,  0.24751389,  0.2824798 ,  2.1488738 ],
       [-1.0257998 ,  0.38582987, -0.19445631, -0.47174302,  1.2518374 ],
       ...,
       [-1.4916052 ,  1.0568687 ,  1.0966635 ,  0.17090292, -0.9295012 ],
       [-0.92981493,  0.08581277,  0.72536993,  0.23103733, -0.41631928],
       [-1.6618752 , -0.09671431,  0.12918828,  0.4928713 ,  0.90801454]])

In [535]:
ids.size

2999

In [536]:
pd.DataFrame(proba_rf).sort_index()

Unnamed: 0,0,1,2,3,4
0,0.014016,0.155579,0.534144,0.224232,0.072029
1,0.018045,0.166274,0.347164,0.366142,0.102375
2,0.017478,0.176646,0.493676,0.237252,0.074947
3,0.053205,0.429961,0.246157,0.154588,0.116088
4,0.001452,0.068796,0.061175,0.070586,0.797991
...,...,...,...,...,...
2994,0.015678,0.154492,0.340266,0.396688,0.092877
2995,0.071573,0.253203,0.213071,0.232490,0.229663
2996,0.005161,0.117384,0.526175,0.326230,0.025051
2997,0.006405,0.043898,0.050248,0.038568,0.860881


In [537]:
ids = ids.reset_index(drop=True)

In [538]:
ids_proba_rf = pd.DataFrame(proba_rf).merge(ids, left_index=True, right_index=True, how='inner')
ids_proba_rf_sum = ids_proba_rf.drop(columns=['PetID']).values

In [539]:
ids_proba_xgb = pd.DataFrame(proba_xgb).merge(ids, left_index=True, right_index=True, how='inner')
ids_proba_xgb_sum = ids_proba_xgb.drop(columns=['PetID']).values

In [540]:
ids_proba_lgb = pd.DataFrame(proba_lgb).merge(ids, left_index=True, right_index=True, how='inner')
ids_proba_lgb_sum = ids_proba_lgb.drop(columns=['PetID']).values

In [541]:
ids_ensemble_aux_sum = ids_ensemble_aux_sum.astype('float')

In [542]:
ensemble_probas_con_texto = (ids_proba_rf_sum + ids_proba_xgb_sum + ids_proba_lgb_sum + ids_ensemble_aux_sum) / 4
ensemble_preds_con_texto = np.argmax(ensemble_probas_con_texto, axis=1)
final_kappa_ensemble_con_texto = cohen_kappa_score(y_val, ensemble_preds_con_texto)
print(f"Cohen's Kappa del Ensemble: {final_kappa_ensemble_con_texto:.4f}")

Cohen's Kappa del Ensemble: 0.5674


## Modelo sin predicción de imágenes y texto

In [543]:
X.columns

Index(['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'VideoAmt',
       'PhotoAmt', 'HasName', 'MixedRace', 'Free', 'HasDescription',
       'HasPhoto', 'MultiColor', 'Black', 'AgeYear', 'Vaccinated_Bin',
       'Dewormed_Bin', 'Sterilized_Bin', 'HealthScore', 'Sentiment_Negativo',
       'Sentiment_Neutro', 'Sentiment_Positivo', 'Sentiment_Negativo.1',
       'Sentiment_Neutro.1', 'Sentiment_Positivo.1', 'RescuerGroup',
       'resnet_pred', 'Prediction'],
      dtype='object')

In [544]:
X = X.drop(columns=['resnet_pred', 'Prediction'])
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=7)

In [545]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [546]:
def objective_rf_base(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    }
    model = RandomForestClassifier(**params, random_state=7, n_jobs=-1)
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_val_scaled)
    return kappa_metric(y_val, preds)

study_rf = optuna.create_study(
    direction="maximize",
    study_name="rf_base",
    storage=storage,
    load_if_exists=True
)
study_rf.optimize(objective_rf_base, n_trials=5)

model_rf = RandomForestClassifier(**study_rf.best_params, random_state=7, n_jobs=-1)
model_rf.fit(X_train_scaled, y_train)
preds_rf_base = model_rf.predict(X_val_scaled)


[I 2025-05-08 17:42:56,775] Using an existing study with name 'rf_base' instead of creating a new one.
[I 2025-05-08 17:42:56,984] Trial 5 finished with value: 0.2109674170193463 and parameters: {'n_estimators': 78, 'max_depth': 8, 'min_samples_split': 2}. Best is trial 2 with value: 0.2143697722594018.
[I 2025-05-08 17:42:57,139] Trial 6 finished with value: 0.20748177187187256 and parameters: {'n_estimators': 57, 'max_depth': 8, 'min_samples_split': 5}. Best is trial 2 with value: 0.2143697722594018.
[I 2025-05-08 17:42:57,295] Trial 7 finished with value: 0.13080710771853277 and parameters: {'n_estimators': 71, 'max_depth': 3, 'min_samples_split': 4}. Best is trial 2 with value: 0.2143697722594018.
[I 2025-05-08 17:42:57,528] Trial 8 finished with value: 0.21300900937730327 and parameters: {'n_estimators': 96, 'max_depth': 9, 'min_samples_split': 7}. Best is trial 2 with value: 0.2143697722594018.
[I 2025-05-08 17:42:57,693] Trial 9 finished with value: 0.19659306689248923 and param

In [547]:
def objective_xgb_base(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "use_label_encoder": False,
        "eval_metric": "mlogloss"
    }
    model = XGBClassifier(**params, random_state=7, verbosity=0)
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_val_scaled)
    return kappa_metric(y_val, preds)

study_xgb = optuna.create_study(
    direction="maximize",
    study_name="xgb_base",
    storage=storage,
    load_if_exists=True
)
study_xgb.optimize(objective_xgb_base, n_trials=5)

model_xgb = XGBClassifier(**study_xgb.best_params, random_state=7, use_label_encoder=False, eval_metric="mlogloss")
model_xgb.fit(X_train_scaled, y_train)
preds_xgb_base = model_xgb.predict(X_val_scaled)

[I 2025-05-08 17:42:57,925] Using an existing study with name 'xgb_base' instead of creating a new one.
[I 2025-05-08 17:42:58,503] Trial 5 finished with value: 0.21726861735658587 and parameters: {'n_estimators': 86, 'max_depth': 4, 'learning_rate': 0.053714697812177165}. Best is trial 3 with value: 0.23919363961037665.
[I 2025-05-08 17:42:59,486] Trial 6 finished with value: 0.22375086761289065 and parameters: {'n_estimators': 66, 'max_depth': 9, 'learning_rate': 0.28231681223885297}. Best is trial 3 with value: 0.23919363961037665.
[I 2025-05-08 17:43:00,521] Trial 7 finished with value: 0.23180275199110145 and parameters: {'n_estimators': 93, 'max_depth': 7, 'learning_rate': 0.10077358856580124}. Best is trial 3 with value: 0.23919363961037665.
[I 2025-05-08 17:43:01,254] Trial 8 finished with value: 0.22864087723606574 and parameters: {'n_estimators': 75, 'max_depth': 6, 'learning_rate': 0.21631432256904337}. Best is trial 3 with value: 0.23919363961037665.
[I 2025-05-08 17:43:01,

In [548]:
def objective_lgb_base(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
    }
    model = LGBMClassifier(**params, random_state=7, verbosity=-1)
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_val_scaled)
    return kappa_metric(y_val, preds)

study_lgb = optuna.create_study(
    direction="maximize",
    study_name="lgb_base",
    storage=storage,
    load_if_exists=True
)
study_lgb.optimize(objective_lgb_base, n_trials=5)

model_lgb = LGBMClassifier(**study_lgb.best_params, random_state=7)
model_lgb.fit(X_train_scaled, y_train)
preds_lgb_base = model_lgb.predict(X_val_scaled)

[I 2025-05-08 17:43:03,312] Using an existing study with name 'lgb_base' instead of creating a new one.
[I 2025-05-08 17:43:04,358] Trial 5 finished with value: 0.23310633098556732 and parameters: {'n_estimators': 65, 'max_depth': 5, 'learning_rate': 0.28073162711332517}. Best is trial 4 with value: 0.23990595113839797.
[I 2025-05-08 17:43:04,962] Trial 6 finished with value: 0.21663077766802696 and parameters: {'n_estimators': 98, 'max_depth': 3, 'learning_rate': 0.2082999364800157}. Best is trial 4 with value: 0.23990595113839797.
[I 2025-05-08 17:43:06,574] Trial 7 finished with value: 0.23816423020293798 and parameters: {'n_estimators': 78, 'max_depth': 6, 'learning_rate': 0.20573814718259953}. Best is trial 4 with value: 0.23990595113839797.
[I 2025-05-08 17:43:07,627] Trial 8 finished with value: 0.2294378745542418 and parameters: {'n_estimators': 59, 'max_depth': 5, 'learning_rate': 0.08975444655233622}. Best is trial 4 with value: 0.23990595113839797.
[I 2025-05-08 17:43:09,481

In [549]:
all_preds_base = np.vstack([preds_rf_base, preds_xgb_base, preds_lgb_base])
ensemble_preds_base = mode(all_preds_base, axis=0).mode

In [550]:
proba_rf_base = model_rf.predict_proba(X_val_scaled)
proba_xgb_base = model_xgb.predict_proba(X_val_scaled)
proba_lgb_base = model_lgb.predict_proba(X_val_scaled)



In [551]:
ensemble_probas = (proba_rf_base + proba_xgb_base + proba_lgb_base) / 3
ensemble_preds_base = np.argmax(ensemble_probas, axis=1)

In [552]:
ensemble_preds_base

array([2, 1, 2, ..., 2, 4, 1])

In [553]:
final_kappa_base = cohen_kappa_score(y_val, ensemble_preds_base)
print(f"Cohen's Kappa del Ensemble: {final_kappa_base:.4f}")

Cohen's Kappa del Ensemble: 0.2399


In [554]:
ids_proba_rf_base = pd.DataFrame(proba_rf_base).merge(ids, left_index=True, right_index=True, how='inner')
ids_proba_rf_sum_base = ids_proba_rf_base.drop(columns=['PetID']).values
ids_proba_xgb_base = pd.DataFrame(proba_xgb_base).merge(ids, left_index=True, right_index=True, how='inner')
ids_proba_xgb_sum_base = ids_proba_xgb_base.drop(columns=['PetID']).values
ids_proba_lgb_base = pd.DataFrame(proba_lgb_base).merge(ids, left_index=True, right_index=True, how='inner')
ids_proba_lgb_sum_base = ids_proba_lgb_base.drop(columns=['PetID']).values
ids_ensemble_aux_sum_base = ids_ensemble_aux_sum.astype('float')

In [555]:
ensemble_probas_con_texto = (ids_proba_rf_sum_base + ids_proba_xgb_sum_base + ids_proba_lgb_sum_base + ids_ensemble_aux_sum_base) / 4
ensemble_preds_base_con_texto = np.argmax(ensemble_probas_con_texto, axis=1)

In [556]:
final_kappa_base_con_texto = cohen_kappa_score(y_val, ensemble_preds_base_con_texto)
print(f"Cohen's Kappa del Ensemble: {final_kappa_base_con_texto:.4f}")

Cohen's Kappa del Ensemble: 0.4995
