# Carga de datos

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, cohen_kappa_score
from sklearn.preprocessing import StandardScaler
import optuna
from optuna.artifacts import FileSystemArtifactStore, upload_artifact
from lightgbm import LGBMClassifier
import os
import shutil
import time
import copy
import datetime
from tqdm import tqdm
import torch
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import torch.nn.functional as F
from joblib import load, dump
from utils import plot_confusion_matrix
from scipy.stats import mode

  from .autonotebook import tqdm as notebook_tqdm


In [194]:
resnet_train = pd.read_csv('resnet_datos_entrenamiento.csv')
resnet_pred = pd.read_csv('predicciones_val.csv')

In [195]:
resnet_pred.shape

(2935, 6)

In [196]:
text_train = pd.read_csv('predictions_desc_train.csv')
text_pred = pd.read_csv('predictions_desc_test.csv')

In [197]:
text_pred.shape

(2999, 3)

In [198]:
#resnet_pred = pd.read_csv('resnet_preds_final.csv')

In [199]:
#text = pd.read_csv('predictions_desc.csv')

In [200]:
df = pd.read_csv('df_refinado.csv').drop(columns=['resnet_pred', 'Prob_text', 'Pred_text'])

In [201]:
df_train, df_test = train_test_split(df, random_state=7, test_size=0.2, stratify=df['AdoptionSpeed'])

In [202]:
df_gen = pd.merge(df_train, text_train, left_on='PetID', right_on='ID', how='inner')
df_gen.head(3)

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Dewormed_Bin,Sterilized_Bin,HealthScore,Sentiment_Negativo,Sentiment_Neutro,Sentiment_Positivo,RescuerGroup,ID,Probabilities,Prediction
0,2,Brownie,1,265,0,1,1,2,7,2,...,0,1,1,False,False,True,1,5c110b9d2,[0.00469014 0.05030612 0.50046724 0.41639575 0...,2
1,1,Molly & dolly,48,307,307,2,2,7,0,2,...,0,1,1,False,False,True,1,8cd1d06ab,[0.00131765 0.00614847 0.01377026 0.01018812 0...,4
2,1,Sasha,2,307,0,2,1,2,0,2,...,0,0,0,False,False,True,4,a700a780b,[0.00236614 0.07325836 0.82948875 0.09341414 0...,2


In [205]:
df_gen = df_gen.rename(columns={'Probabilities': 'Text_prob', 'Prediction': 'Text_pred'})
df_gen.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed',
       'HasName', 'MixedRace', 'QuantityGroups', 'Free', 'FeeRange',
       'HasDescription', 'HasPhoto', 'StateName', 'StateName_clean', 'income',
       'population', 'population_percentage', 'Breed1_Grouped',
       'Breed2_Grouped', 'MultiColor', 'Black', 'AgeYear', 'Vaccinated_Bin',
       'Dewormed_Bin', 'Sterilized_Bin', 'HealthScore', 'Sentiment_Negativo',
       'Sentiment_Neutro', 'Sentiment_Positivo', 'RescuerGroup', 'ID',
       'Text_prob', 'Text_pred'],
      dtype='object')

In [206]:
df_test_gen = pd.merge(df_test, text_pred, left_on='PetID', right_on='ID', how='inner')
df_test_gen.head(3)

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Dewormed_Bin,Sterilized_Bin,HealthScore,Sentiment_Negativo,Sentiment_Neutro,Sentiment_Positivo,RescuerGroup,ID,Probabilities,Prediction
0,2,Oreo,3,266,0,1,1,7,0,2,...,1,0,1,False,False,True,3,e46744bb9,[0.00975405 0.58259273 0.31584060 0.08191197 0...,1
1,2,Husky,2,266,0,1,1,7,0,1,...,0,0,0,False,False,True,2,de12dc945,[0.00119368 0.04356433 0.53542107 0.41577843 0...,2
2,1,Phoebe,48,205,0,2,2,7,0,1,...,1,1,3,False,False,True,3,cf78e66fd,[0.00103862 0.01274959 0.21784711 0.75383699 0...,3


In [207]:
df_test_gen = df_test_gen.rename(columns={'Probabilities': 'Text_prob', 'Prediction': 'Text_pred'})
df_test_gen.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed',
       'HasName', 'MixedRace', 'QuantityGroups', 'Free', 'FeeRange',
       'HasDescription', 'HasPhoto', 'StateName', 'StateName_clean', 'income',
       'population', 'population_percentage', 'Breed1_Grouped',
       'Breed2_Grouped', 'MultiColor', 'Black', 'AgeYear', 'Vaccinated_Bin',
       'Dewormed_Bin', 'Sterilized_Bin', 'HealthScore', 'Sentiment_Negativo',
       'Sentiment_Neutro', 'Sentiment_Positivo', 'RescuerGroup', 'ID',
       'Text_prob', 'Text_pred'],
      dtype='object')

In [None]:
if 1 in 'Color1':
    'Color2' = 0, 'Color3' = 0
elif 

In [245]:
df_one_hot_dim['Color1'].value_counts()

Color1
1    5893
2    2997
3     767
5     720
6     573
7     536
4     508
Name: count, dtype: int64

In [246]:
df_one_hot_dim['Color2'].value_counts()

Color2
0    3623
7    2725
2    2624
5     895
6     851
4     700
3     576
Name: count, dtype: int64

In [247]:
df_one_hot_dim['Color3'].value_counts()

Color3
0    8499
7    2562
5     337
6     304
4     154
3     138
Name: count, dtype: int64

In [227]:
# Testeo one hot encoding
df_one_hot = df_gen.drop(columns=['Name', 'Description', 'RescuerID', 'PetID', 'ID', 'StateName', 'Breed1_Grouped', 'Breed2_Grouped', 'Text_prob'])
df_one_hot_dim = pd.get_dummies(df_one_hot, columns=['StateName_clean', 'FeeRange', 'QuantityGroups','Color1', 'Color2', 'Color3'])
df_one_hot_dim.shape

(11994, 62)

In [215]:
# df_gen = df_gen.drop(columns=['Name', 'Description', 'RescuerID', 'Color1', 'Color2', 'Color3', 'StateName', 'StateName_clean', 'FeeRange', 'QuantityGroups', 'Breed1_Grouped', 'Breed2_Grouped', 'Text_prob', 'AgeYear', 'PetID', 'ID'])

In [58]:
df_gen = df_gen.drop(columns=['ID'])

In [59]:
df_gen.columns

Index(['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'MaturitySize',
       'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health',
       'Quantity', 'Fee', 'State', 'VideoAmt', 'PhotoAmt', 'AdoptionSpeed',
       'HasName', 'MixedRace', 'Free', 'HasDescription', 'HasPhoto', 'income',
       'population', 'population_percentage', 'MultiColor', 'Black',
       'Vaccinated_Bin', 'Dewormed_Bin', 'Sterilized_Bin', 'HealthScore',
       'Sentiment_Negativo', 'Sentiment_Neutro', 'Sentiment_Positivo',
       'RescuerGroup', 'Text_pred'],
      dtype='object')

In [225]:
df_test_one_hot.columns

Index(['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'VideoAmt',
       'PhotoAmt', 'AdoptionSpeed', 'HasName', 'MixedRace', 'QuantityGroups',
       'Free', 'FeeRange', 'HasDescription', 'HasPhoto', 'StateName_clean',
       'income', 'population', 'population_percentage', 'MultiColor', 'Black',
       'AgeYear', 'Vaccinated_Bin', 'Dewormed_Bin', 'Sterilized_Bin',
       'HealthScore', 'Sentiment_Negativo', 'Sentiment_Neutro',
       'Sentiment_Positivo', 'RescuerGroup', 'Text_prob', 'Text_pred'],
      dtype='object')

In [226]:
df_test_one_hot = df_test_gen.drop(columns=['Name', 'Description', 'RescuerID', 'PetID', 'ID', 'StateName', 'Breed1_Grouped', 'Breed2_Grouped', 'Text_prob'])
df_test_one_hot_dim = pd.get_dummies(df_test_one_hot, columns=['StateName_clean', 'FeeRange', 'QuantityGroups','Color1', 'Color2', 'Color3'])
df_test_one_hot_dim.shape

# df_test_gen = df_test_gen.drop(columns=['Name', 'Description', 'RescuerID', 'Color1', 'Color2', 'Color3', 'StateName', 'StateName_clean', 'FeeRange', 'QuantityGroups', 'Breed1_Grouped', 'Breed2_Grouped', 'Text_prob', 'AgeYear', 'PetID', 'ID'])
# df_test_gen.head(3)

(2999, 62)

In [228]:
X_train = df_one_hot_dim.drop(columns='AdoptionSpeed')
X_val = df_test_one_hot_dim.drop(columns='AdoptionSpeed')

In [88]:
# X_train = df_gen.drop(columns='AdoptionSpeed')
# X_val = df_test_gen.drop(columns='AdoptionSpeed')

In [238]:
X_train_base = X_train.copy()
X_val_base = X_val.copy()

In [91]:
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_val_scaled = scaler.transform(X_val)

In [212]:
y_train = df_gen['AdoptionSpeed']
y_val = df_test_gen['AdoptionSpeed']

# Modelos

## Predicciones de Resnet y Text como features

In [92]:
def kappa_metric(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

In [65]:
storage = "sqlite:////Users/paul/Downloads/Documentos Locales/labo-ii/optuna_studies.db"

In [220]:
# for col in X_train:
#     print(col, X_train[col].unique())

Type [2 1]
Age [  1  48   2  24  17   3   4  84   7   5  12  14   6  36  15  11  10  27
   9   8  60  31  65   0  18 108  16  19  30  38  72  21  13  20  29 120
  26  96  88  37  22  50  86  23  54  32  28  87  47  56  46  42  25 122
  81  35  68 123 132  89 156  53  62  76  61  41  34  80  74  57  78  39
  82  49  51  43  55 212  45  64  33  77  95 112  63  52  75  73 180  67
  91  40  92 102 100 168  85]
Breed1 [265 307 242 266 243 152 292 299 141 205 247 203  20 293  78 109  44 249
 254 264 306 285 276 195 240 304 295  60 207 179 187  75 218  69 119  56
 231 128 227 262 213 102  50 303 251  26 147 105 289 305 108 178 270 284
 103 189  17 111 239 271 283  49 280 185 277 282 288 150 155 173 200  97
  31 206 268   7 169 241 300  72  82 117  76 267 250  81  39  18 287 246
 212  70 188 269  10 252 274  42  83 272   0 279 296 302 182 129 244 273
 122 139 197  23  64 143 201  24  88 217  85 154  15 146 114 190 237  21
  93 257  11 199  32 224  65 145 202 297 281  19 263 248  25 132 294 214

In [229]:
def objective_rf(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    }
    model = RandomForestClassifier(**params, random_state=7, n_jobs=-1)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return kappa_metric(y_val, preds)

study_rf = optuna.create_study(
    direction="maximize",
    study_name="rf_dl_one_hot",
    storage=storage,
    load_if_exists=True
)
study_rf.optimize(objective_rf, n_trials=50)

model_rf = RandomForestClassifier(**study_rf.best_params, random_state=7, n_jobs=-1)
model_rf.fit(X_train, y_train)
preds_rf = model_rf.predict(X_val)

[I 2025-05-24 19:41:23,393] Using an existing study with name 'rf_dl_one_hot' instead of creating a new one.
[I 2025-05-24 19:41:23,616] Trial 2 finished with value: 0.26194741187043424 and parameters: {'n_estimators': 80, 'max_depth': 5, 'min_samples_split': 9}. Best is trial 2 with value: 0.26194741187043424.
[I 2025-05-24 19:41:23,828] Trial 3 finished with value: 0.2612236825923021 and parameters: {'n_estimators': 94, 'max_depth': 10, 'min_samples_split': 2}. Best is trial 2 with value: 0.26194741187043424.
[I 2025-05-24 19:41:24,040] Trial 4 finished with value: 0.27099538234180864 and parameters: {'n_estimators': 97, 'max_depth': 8, 'min_samples_split': 5}. Best is trial 4 with value: 0.27099538234180864.
[I 2025-05-24 19:41:24,197] Trial 5 finished with value: 0.25415287482051296 and parameters: {'n_estimators': 97, 'max_depth': 3, 'min_samples_split': 8}. Best is trial 4 with value: 0.27099538234180864.
[I 2025-05-24 19:41:24,370] Trial 6 finished with value: 0.2680871022097312

In [231]:
def objective_xgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "eval_metric": "mlogloss"
    }
    model = XGBClassifier(**params, random_state=7, verbosity=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return kappa_metric(y_val, preds)

study_xgb = optuna.create_study(
    direction="maximize",
    study_name="xgb_dl_one_hot_",
    storage=storage,
    load_if_exists=True
)
study_xgb.optimize(objective_xgb, n_trials=50)

model_xgb = XGBClassifier(**study_xgb.best_params, random_state=7, eval_metric="mlogloss")
model_xgb.fit(X_train, y_train)
preds_xgb = model_xgb.predict(X_val)

[I 2025-05-24 19:44:48,019] A new study created in RDB with name: xgb_dl_one_hot_
[I 2025-05-24 19:44:49,394] Trial 0 finished with value: 0.25570695039277624 and parameters: {'n_estimators': 91, 'max_depth': 10, 'learning_rate': 0.13214519679795628}. Best is trial 0 with value: 0.25570695039277624.
[I 2025-05-24 19:44:50,171] Trial 1 finished with value: 0.24695609923748674 and parameters: {'n_estimators': 90, 'max_depth': 5, 'learning_rate': 0.08555239607509793}. Best is trial 0 with value: 0.25570695039277624.
[I 2025-05-24 19:44:51,084] Trial 2 finished with value: 0.24564156569847906 and parameters: {'n_estimators': 81, 'max_depth': 8, 'learning_rate': 0.016673833039693213}. Best is trial 0 with value: 0.25570695039277624.
[I 2025-05-24 19:44:51,627] Trial 3 finished with value: 0.24733954527350188 and parameters: {'n_estimators': 94, 'max_depth': 4, 'learning_rate': 0.1881715223221348}. Best is trial 0 with value: 0.25570695039277624.
[I 2025-05-24 19:44:52,522] Trial 4 finished 

In [232]:
def objective_lgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
    }
    model = LGBMClassifier(**params, random_state=7, verbosity=-1)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return kappa_metric(y_val, preds)

study_lgb = optuna.create_study(
    direction="maximize",
    study_name="lgb_dl_one_hot",
    storage=storage,
    load_if_exists=True
)
study_lgb.optimize(objective_lgb, n_trials=50)

model_lgb = LGBMClassifier(**study_lgb.best_params, random_state=7)
model_lgb.fit(X_train, y_train)
preds_lgb = model_lgb.predict(X_val)

[I 2025-05-24 19:45:49,834] A new study created in RDB with name: lgb_dl_one_hot
[I 2025-05-24 19:45:50,803] Trial 0 finished with value: 0.25116846690103156 and parameters: {'n_estimators': 65, 'max_depth': 9, 'learning_rate': 0.1486408416329573}. Best is trial 0 with value: 0.25116846690103156.
[I 2025-05-24 19:45:52,176] Trial 1 finished with value: 0.24607424110343434 and parameters: {'n_estimators': 96, 'max_depth': 6, 'learning_rate': 0.024788550485894283}. Best is trial 0 with value: 0.25116846690103156.
[I 2025-05-24 19:45:53,627] Trial 2 finished with value: 0.25728144045056967 and parameters: {'n_estimators': 99, 'max_depth': 8, 'learning_rate': 0.2604065973841895}. Best is trial 2 with value: 0.25728144045056967.
[I 2025-05-24 19:45:54,549] Trial 3 finished with value: 0.25489747219724956 and parameters: {'n_estimators': 64, 'max_depth': 7, 'learning_rate': 0.2652046350726376}. Best is trial 2 with value: 0.25728144045056967.
[I 2025-05-24 19:45:55,370] Trial 4 finished with

In [233]:
all_preds = np.vstack([preds_rf, preds_xgb, preds_lgb])
ensemble_preds = mode(all_preds, axis=0).mode

In [234]:
final_kappa = cohen_kappa_score(y_val, ensemble_preds)
print(f"Cohen's Kappa del Ensemble: {final_kappa:.4f}")

Cohen's Kappa del Ensemble: 0.1772


In [235]:
proba_rf = model_rf.predict_proba(X_val)
proba_xgb = model_xgb.predict_proba(X_val)
proba_lgb = model_lgb.predict_proba(X_val)

In [236]:
rf_pred = proba_rf.argmax(axis=1)
xgb_pred = proba_xgb.argmax(axis=1)
lgb_pred = proba_lgb.argmax(axis=1)

In [237]:
print(cohen_kappa_score(y_val, rf_pred))
print(cohen_kappa_score(y_val, xgb_pred))
print(cohen_kappa_score(y_val, lgb_pred))

0.17516918882646604
0.17803454231688143
0.17803011269605096


# Modelos Base

In [239]:
X_train_base = X_train_base.drop(columns='Text_pred')
X_val_base = X_val_base.drop(columns='Text_pred')

In [240]:
def objective_rf_base(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    }
    model = RandomForestClassifier(**params, random_state=7, n_jobs=-1)
    model.fit(X_train_base, y_train)
    preds = model.predict(X_val_base)
    return kappa_metric(y_val, preds)

study_rf_base = optuna.create_study(
    direction="maximize",
    study_name="rf_dl_base_one_hot",
    storage=storage,
    load_if_exists=True
)
study_rf_base.optimize(objective_rf_base, n_trials=50)

model_rf_base = RandomForestClassifier(**study_rf_base.best_params, random_state=7, n_jobs=-1)
model_rf_base.fit(X_train_base, y_train)
preds_rf_base = model_rf_base.predict(X_val_base)

[I 2025-05-24 19:48:25,884] A new study created in RDB with name: rf_dl_base_one_hot
[I 2025-05-24 19:48:26,158] Trial 0 finished with value: 0.31445550382374887 and parameters: {'n_estimators': 99, 'max_depth': 7, 'min_samples_split': 9}. Best is trial 0 with value: 0.31445550382374887.
[I 2025-05-24 19:48:26,350] Trial 1 finished with value: 0.27533169339084285 and parameters: {'n_estimators': 91, 'max_depth': 6, 'min_samples_split': 9}. Best is trial 0 with value: 0.31445550382374887.
[I 2025-05-24 19:48:26,482] Trial 2 finished with value: 0.21817081109430603 and parameters: {'n_estimators': 65, 'max_depth': 3, 'min_samples_split': 10}. Best is trial 0 with value: 0.31445550382374887.
[I 2025-05-24 19:48:26,613] Trial 3 finished with value: 0.26005769555991043 and parameters: {'n_estimators': 53, 'max_depth': 5, 'min_samples_split': 4}. Best is trial 0 with value: 0.31445550382374887.
[I 2025-05-24 19:48:26,771] Trial 4 finished with value: 0.321990804221012 and parameters: {'n_est

In [241]:
def objective_xgb_base(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "eval_metric": "mlogloss"
    }
    model = XGBClassifier(**params, random_state=7, verbosity=0)
    model.fit(X_train_base, y_train)
    preds = model.predict(X_val_base)
    return kappa_metric(y_val, preds)

study_xgb = optuna.create_study(
    direction="maximize",
    study_name="xgb_dl_base_one_hot",
    storage=storage,
    load_if_exists=True
)
study_xgb.optimize(objective_xgb_base, n_trials=50)

model_xgb = XGBClassifier(**study_xgb.best_params, random_state=7, eval_metric="mlogloss")
model_xgb.fit(X_train_base, y_train)
preds_xgb = model_xgb.predict(X_val_base)

[I 2025-05-24 19:48:46,811] A new study created in RDB with name: xgb_dl_base_one_hot
[I 2025-05-24 19:48:47,934] Trial 0 finished with value: 0.3662775836212153 and parameters: {'n_estimators': 100, 'max_depth': 8, 'learning_rate': 0.1001023176206022}. Best is trial 0 with value: 0.3662775836212153.
[I 2025-05-24 19:48:48,413] Trial 1 finished with value: 0.369761062528266 and parameters: {'n_estimators': 98, 'max_depth': 3, 'learning_rate': 0.1628436578706646}. Best is trial 1 with value: 0.369761062528266.
[I 2025-05-24 19:48:48,934] Trial 2 finished with value: 0.377745205677418 and parameters: {'n_estimators': 54, 'max_depth': 7, 'learning_rate': 0.18012219317421432}. Best is trial 2 with value: 0.377745205677418.
[I 2025-05-24 19:48:49,968] Trial 3 finished with value: 0.35742352387811827 and parameters: {'n_estimators': 78, 'max_depth': 9, 'learning_rate': 0.12234284250340767}. Best is trial 2 with value: 0.377745205677418.
[I 2025-05-24 19:48:50,948] Trial 4 finished with value

In [242]:
def objective_lgb_base(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
    }
    model = LGBMClassifier(**params, random_state=7, verbosity=-1)
    model.fit(X_train_base, y_train)
    preds = model.predict(X_val_base)
    return kappa_metric(y_val, preds)

study_lgb = optuna.create_study(
    direction="maximize",
    study_name="lgb_dl_base_one_hot",
    storage=storage,
    load_if_exists=True
)
study_lgb.optimize(objective_lgb_base, n_trials=50)

model_lgb = LGBMClassifier(**study_lgb.best_params, random_state=7)
model_lgb.fit(X_train_base, y_train)
preds_lgb = model_lgb.predict(X_val_base)

[I 2025-05-24 19:49:25,897] A new study created in RDB with name: lgb_dl_base_one_hot
[I 2025-05-24 19:49:27,371] Trial 0 finished with value: 0.3836218349959394 and parameters: {'n_estimators': 94, 'max_depth': 9, 'learning_rate': 0.05299287003149758}. Best is trial 0 with value: 0.3836218349959394.
[I 2025-05-24 19:49:28,010] Trial 1 finished with value: 0.3728478254642391 and parameters: {'n_estimators': 97, 'max_depth': 4, 'learning_rate': 0.1705223658959736}. Best is trial 0 with value: 0.3836218349959394.
[I 2025-05-24 19:49:29,119] Trial 2 finished with value: 0.3369725611453024 and parameters: {'n_estimators': 88, 'max_depth': 5, 'learning_rate': 0.01809120420106244}. Best is trial 0 with value: 0.3836218349959394.
[I 2025-05-24 19:49:30,114] Trial 3 finished with value: 0.3505715024608129 and parameters: {'n_estimators': 63, 'max_depth': 7, 'learning_rate': 0.02238869037241677}. Best is trial 0 with value: 0.3836218349959394.
[I 2025-05-24 19:49:31,273] Trial 4 finished with v

In [138]:
preds_rf_base

array([2, 1, 2, ..., 2, 4, 1])

In [243]:
all_preds_base = np.vstack([preds_rf_base, preds_xgb, preds_lgb])
ensemble_preds_base = mode(all_preds_base, axis=0).mode
final_kappa_base = cohen_kappa_score(y_val, ensemble_preds_base)
print(f"Cohen's Kappa del Ensemble: {final_kappa_base:.4f}")

Cohen's Kappa del Ensemble: 0.2424


In [244]:
proba_rf_base = model_rf_base.predict_proba(X_val_base)
proba_xgb_base = model_xgb.predict_proba(X_val_base)
proba_lgb_base = model_lgb.predict_proba(X_val_base)

rf_pred_base = proba_rf_base.argmax(axis=1)
xgb_pred_base = proba_xgb.argmax(axis=1)
lgb_pred_base = proba_lgb.argmax(axis=1)

print(cohen_kappa_score(y_val, rf_pred_base))
print(cohen_kappa_score(y_val, xgb_pred_base))
print(cohen_kappa_score(y_val, lgb_pred_base))

0.21963627705069355
0.17803454231688143
0.17803011269605096


# Ensemble Text

In [140]:
X_val_text = pd.merge(df_test, text_pred, left_on='PetID', right_on='ID', how='inner')
X_val_text.head(3)

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Dewormed_Bin,Sterilized_Bin,HealthScore,Sentiment_Negativo,Sentiment_Neutro,Sentiment_Positivo,RescuerGroup,ID,Probabilities,Prediction
0,2,Oreo,3,266,0,1,1,7,0,2,...,1,0,1,False,False,True,3,e46744bb9,[0.00975405 0.58259273 0.31584060 0.08191197 0...,1
1,2,Husky,2,266,0,1,1,7,0,1,...,0,0,0,False,False,True,2,de12dc945,[0.00119368 0.04356433 0.53542107 0.41577843 0...,2
2,1,Phoebe,48,205,0,2,2,7,0,1,...,1,1,3,False,False,True,3,cf78e66fd,[0.00103862 0.01274959 0.21784711 0.75383699 0...,3


In [141]:
preds_text = X_val_text['Prediction']

In [142]:
all_preds_text = np.vstack([preds_rf_base, preds_xgb, preds_lgb, preds_text])
ensemble_preds_text = mode(all_preds_text, axis=0).mode
final_kappa_text = cohen_kappa_score(y_val, ensemble_preds_text)
print(f"Cohen's Kappa del Ensemble: {final_kappa_text:.4f}")

Cohen's Kappa del Ensemble: 0.2484


# Ensemble Resnet

In [148]:
df['AdoptionSpeed'].value_counts()

AdoptionSpeed
4    4197
2    4037
3    3259
1    3090
0     410
Name: count, dtype: int64

In [None]:
resnet_pred.

In [172]:
resnet_pred_pred = resnet_pred_pred.drop(columns=['PetID']).values.argmax(axis=1)
resnet_pred_pred = pd.Series(resnet_pred_pred, name='pred')
resnet_pred_pred

AxisError: axis 1 is out of bounds for array of dimension 1

In [171]:
resnet_pred_pred = pd.Series(resnet_pred
resnet_pred_pred

SyntaxError: '(' was never closed (3523336772.py, line 1)

In [169]:
resnet_pred_merged = pd.merge(resnet_pred, resnet_pred_pred, left_index=True, right_index=True)
resnet_pred_merged

ValueError: Cannot merge a Series without a name

In [143]:
X_val_res = pd.merge(df_test, resnet_pred, left_on='PetID', right_on='PetID', how='left')
X_val_res.head(3)

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,HealthScore,Sentiment_Negativo,Sentiment_Neutro,Sentiment_Positivo,RescuerGroup,prob_class_0,prob_class_1,prob_class_2,prob_class_3,prob_class_4
0,2,Oreo,3,266,0,1,1,7,0,2,...,1,False,False,True,3,0.058776,0.133278,0.241755,0.181376,0.384815
1,2,Husky,2,266,0,1,1,7,0,1,...,0,False,False,True,2,0.015547,0.11074,0.255631,0.117561,0.500521
2,1,Phoebe,48,205,0,2,2,7,0,1,...,3,False,False,True,3,0.029485,0.349746,0.393155,0.215309,0.012305


In [160]:
X_val_res['prob_class_0'].dtype

dtype('float64')

float

In [165]:
X_val_res[X_val_res['prob_class_0'].isna()]

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,HealthScore,Sentiment_Negativo,Sentiment_Neutro,Sentiment_Positivo,RescuerGroup,prob_class_0,prob_class_1,prob_class_2,prob_class_3,prob_class_4
11,1,Dusty,3,307,0,2,2,7,0,1,...,0,False,False,True,1,,,,,
16,1,Mitang,10,65,65,2,5,0,0,3,...,3,False,False,True,1,,,,,
81,2,Golden short tail,3,266,0,1,3,0,0,2,...,2,False,False,True,3,,,,,
128,2,Rain,11,266,0,1,1,6,7,2,...,3,False,False,True,1,,,,,
134,1,No name,135,195,0,1,6,7,0,1,...,1,False,False,True,1,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2821,2,Looking for a kitten,4,251,292,1,3,4,6,2,...,2,False,False,True,1,,,,,
2856,2,Anderson,24,266,0,1,6,7,0,2,...,2,False,False,True,3,,,,,
2907,1,Unnamed,42,150,150,3,2,0,0,2,...,2,False,False,True,1,,,,,
2963,1,Cream poodle,36,179,0,2,5,0,0,1,...,0,False,False,True,2,,,,,


In [156]:
X_val_res[['prob_class_0', 'prob_class_1', 'prob_class_2', 'prob_class_3', 'prob_class_4']].idxmax(axis=1).str.extract("(\d+)").astype(int)[0].values

  X_val_res[['prob_class_0', 'prob_class_1', 'prob_class_2', 'prob_class_3', 'prob_class_4']].idxmax(axis=1).str.extract("(\d+)").astype(int)[0].values
  X_val_res[['prob_class_0', 'prob_class_1', 'prob_class_2', 'prob_class_3', 'prob_class_4']].idxmax(axis=1).str.extract("(\d+)").astype(int)[0].values
  X_val_res[['prob_class_0', 'prob_class_1', 'prob_class_2', 'prob_class_3', 'prob_class_4']].idxmax(axis=1).str.extract("(\d+)").astype(int)[0].values


ValueError: cannot convert float NaN to integer

In [130]:
resnet_pred[['prob_class_0', 'prob_class_1', 'prob_class_2', 'prob_class_3', 'prob_class_4']].argmax(axis=0)

AttributeError: 'DataFrame' object has no attribute 'argmax'

In [None]:
all_preds_resnet = np.vstack([preds_rf_base, preds_xgb, preds_lgb, ])


In [24]:
text_prob = text[['ID','Probabilities']]
proba_text = text_prob['Probabilities'].str.split(' ', expand=True)
proba_text[0] = proba_text[0].str.replace('[', '')
proba_text[4] = proba_text[4].str.replace(']', '')
text_proba = pd.merge(text_prob, proba_text, left_index=True, right_index=True).drop(columns=['Probabilities'])
text_proba

Unnamed: 0,ID,0,1,2,3,4
0,86e1089a3,0.00075255,0.15268189,0.80045265,0.04555201,0.00056093
1,6296e909a,0.02892482,0.31997138,0.54787356,0.09699186,0.00623844
2,3422e4906,0.00089478,0.00274030,0.12768795,0.86501288,0.00366400
3,5842f1ff5,0.00050428,0.06155059,0.84609097,0.09076333,0.00109088
4,850a43f90,0.00148470,0.01733202,0.69236803,0.28250989,0.00630536
...,...,...,...,...,...,...
14975,dc0935a84,0.00818448,0.67607671,0.29578659,0.01762347,0.00232873
14976,a01ab5b30,0.00261964,0.03075699,0.13270104,0.23656160,0.59736073
14977,d981b6395,0.00106464,0.00335503,0.13507260,0.74570012,0.11480758
14978,e4da1c9e4,0.00143031,0.00105063,0.00398096,0.00495054,0.98858750


In [25]:
_1,ids,_2,_3 = train_test_split(df, y, stratify=y, test_size=0.2, random_state=7)
ids = ids['PetID']

In [26]:
ids_ensemble_aux = pd.merge(ids, text_proba, left_on='PetID', right_on='ID', how='left')
ids_ensemble_aux = ids_ensemble_aux.fillna(0)
ids_ensemble_aux_sum = ids_ensemble_aux.drop(columns=['PetID','ID']).values

In [27]:
len(ids_ensemble_aux_sum)

2999

In [28]:
ids_ensemble_img_aux = pd.merge(ids_ensemble_aux, prob_resnet, left_on='PetID', right_on='PetID', how='inner')
ids_ensemble_img_aux

Unnamed: 0,PetID,ID,0,1,2,3,4,prob_class_0,prob_class_1,prob_class_2,prob_class_3,prob_class_4
0,cf78e66fd,cf78e66fd,0.00098114,0.03140692,0.69946831,0.26659068,0.00155298,-1.467817,1.738951,0.965889,0.190470,-1.377797
1,5de1d0c34,5de1d0c34,0.00274695,0.00269661,0.03190725,0.02141715,0.94123203,-2.546853,-0.141721,0.247514,0.282480,2.148874
2,43e4cb788,43e4cb788,0.01521666,0.82707489,0.09218681,0.04710428,0.01841737,-1.025800,0.385830,-0.194456,-0.471743,1.251837
3,feec7a438,feec7a438,0.00154261,0.00175889,0.00110020,0.00156655,0.99403173,-2.425889,-0.880943,0.483216,0.542247,2.417926
4,95855babf,95855babf,0.00041192,0.02822931,0.23739995,0.73264903,0.00130975,-1.878236,0.202715,0.447229,0.176778,1.032748
...,...,...,...,...,...,...,...,...,...,...,...,...
594,af8138fea,af8138fea,0.00159788,0.00190135,0.00363870,0.00970647,0.98315549,-3.089685,-0.041475,-0.301495,0.114695,3.036229
595,edcc13ccb,edcc13ccb,0.00236725,0.84133399,0.07660707,0.06778520,0.01190652,-1.433197,0.141028,0.838616,0.807360,-0.285771
596,5ada2f8b6,5ada2f8b6,0.01152135,0.53567052,0.43494299,0.01606540,0.00179972,-1.491605,1.056869,1.096664,0.170903,-0.929501
597,162c1d98a,162c1d98a,0.00483711,0.16835481,0.73992068,0.08577160,0.00111579,-0.929815,0.085813,0.725370,0.231037,-0.416319


In [29]:
ids_finales = ids_ensemble_img_aux['PetID'].unique()

In [30]:
text_ensemble_proba_arr = ids_ensemble_img_aux[[0,1,2,3,4]].values
text_ensemble_proba_arr

array([['0.00098114', '0.03140692', '0.69946831', '0.26659068',
        '0.00155298'],
       ['0.00274695', '0.00269661', '0.03190725', '0.02141715',
        '0.94123203'],
       ['0.01521666', '0.82707489', '0.09218681', '0.04710428',
        '0.01841737'],
       ...,
       ['0.01152135', '0.53567052', '0.43494299', '0.01606540',
        '0.00179972'],
       ['0.00483711', '0.16835481', '0.73992068', '0.08577160',
        '0.00111579'],
       ['0.02655957', '0.80220807', '0.14889178', '0.01586577',
        '0.00647473']], dtype=object)

In [31]:
img_ensemble_proba_arr = ids_ensemble_img_aux[['prob_class_0',	'prob_class_1',	'prob_class_2',	'prob_class_3', 'prob_class_4']].values
img_ensemble_proba_arr

array([[-1.4678171 ,  1.7389505 ,  0.9658895 ,  0.1904704 , -1.3777965 ],
       [-2.5468533 , -0.1417214 ,  0.24751389,  0.2824798 ,  2.1488738 ],
       [-1.0257998 ,  0.38582987, -0.19445631, -0.47174302,  1.2518374 ],
       ...,
       [-1.4916052 ,  1.0568687 ,  1.0966635 ,  0.17090292, -0.9295012 ],
       [-0.92981493,  0.08581277,  0.72536993,  0.23103733, -0.41631928],
       [-1.6618752 , -0.09671431,  0.12918828,  0.4928713 ,  0.90801454]])

In [32]:
ids.size

2999

In [33]:
pd.DataFrame(proba_rf).sort_index()

Unnamed: 0,0,1,2,3,4
0,0.011805,0.150503,0.531519,0.235201,0.070971
1,0.018411,0.177822,0.370418,0.327513,0.105837
2,0.017600,0.157004,0.458867,0.317993,0.048536
3,0.060099,0.427062,0.232383,0.158467,0.121989
4,0.000261,0.053809,0.045198,0.116705,0.784028
...,...,...,...,...,...
2994,0.010170,0.156651,0.318519,0.415787,0.098873
2995,0.061247,0.282346,0.252323,0.242235,0.161849
2996,0.010059,0.116948,0.513494,0.306164,0.053335
2997,0.007060,0.048003,0.063053,0.053892,0.827991


In [34]:
ids = ids.reset_index(drop=True)

In [35]:
ids_proba_rf = pd.DataFrame(proba_rf).merge(ids, left_index=True, right_index=True, how='inner')
ids_proba_rf_sum = ids_proba_rf.drop(columns=['PetID']).values

In [36]:
ids_proba_xgb = pd.DataFrame(proba_xgb).merge(ids, left_index=True, right_index=True, how='inner')
ids_proba_xgb_sum = ids_proba_xgb.drop(columns=['PetID']).values

In [37]:
ids_proba_lgb = pd.DataFrame(proba_lgb).merge(ids, left_index=True, right_index=True, how='inner')
ids_proba_lgb_sum = ids_proba_lgb.drop(columns=['PetID']).values

In [38]:
ids_ensemble_aux_sum = ids_ensemble_aux_sum.astype('float')

In [39]:
ensemble_probas_con_texto = (ids_proba_rf_sum + ids_proba_xgb_sum + ids_proba_lgb_sum + ids_ensemble_aux_sum) / 4
ensemble_preds_con_texto = np.argmax(ensemble_probas_con_texto, axis=1)
final_kappa_ensemble_con_texto = cohen_kappa_score(y_val, ensemble_preds_con_texto)
print(f"Cohen's Kappa del Ensemble: {final_kappa_ensemble_con_texto:.4f}")

Cohen's Kappa del Ensemble: 0.5678


## Modelo sin predicción de imágenes y texto

In [40]:
X.columns

Index(['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'VideoAmt',
       'PhotoAmt', 'HasName', 'MixedRace', 'Free', 'HasDescription',
       'HasPhoto', 'income', 'population', 'population_percentage',
       'MultiColor', 'Black', 'AgeYear', 'Vaccinated_Bin', 'Dewormed_Bin',
       'Sterilized_Bin', 'HealthScore', 'Sentiment_Negativo',
       'Sentiment_Neutro', 'Sentiment_Positivo', 'RescuerGroup', 'resnet_pred',
       'Prediction'],
      dtype='object')

In [41]:
X = X.drop(columns=['resnet_pred', 'Prediction'])
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=7)

In [42]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [43]:
def objective_rf_base(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    }
    model = RandomForestClassifier(**params, random_state=7, n_jobs=-1)
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_val_scaled)
    return kappa_metric(y_val, preds)

study_rf = optuna.create_study(
    direction="maximize",
    study_name="rf_base",
    storage=storage,
    load_if_exists=True
)
study_rf.optimize(objective_rf_base, n_trials=5)

model_rf = RandomForestClassifier(**study_rf.best_params, random_state=7, n_jobs=-1)
model_rf.fit(X_train_scaled, y_train)
preds_rf_base = model_rf.predict(X_val_scaled)


[I 2025-05-22 21:33:13,926] Using an existing study with name 'rf_base' instead of creating a new one.
[I 2025-05-22 21:33:14,243] Trial 10 finished with value: 0.20136347898334506 and parameters: {'n_estimators': 82, 'max_depth': 6, 'min_samples_split': 10}. Best is trial 2 with value: 0.2143697722594018.
[I 2025-05-22 21:33:14,635] Trial 11 finished with value: 0.22436969719166933 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 8}. Best is trial 11 with value: 0.22436969719166933.
[I 2025-05-22 21:33:14,969] Trial 12 finished with value: 0.22398756044919277 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 9}. Best is trial 11 with value: 0.22436969719166933.
[I 2025-05-22 21:33:15,341] Trial 13 finished with value: 0.2234915176999399 and parameters: {'n_estimators': 99, 'max_depth': 10, 'min_samples_split': 8}. Best is trial 11 with value: 0.22436969719166933.
[I 2025-05-22 21:33:15,699] Trial 14 finished with value: 0.2220831525

In [44]:
def objective_xgb_base(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "use_label_encoder": False,
        "eval_metric": "mlogloss"
    }
    model = XGBClassifier(**params, random_state=7, verbosity=0)
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_val_scaled)
    return kappa_metric(y_val, preds)

study_xgb = optuna.create_study(
    direction="maximize",
    study_name="xgb_base",
    storage=storage,
    load_if_exists=True
)
study_xgb.optimize(objective_xgb_base, n_trials=5)

model_xgb = XGBClassifier(**study_xgb.best_params, random_state=7, use_label_encoder=False, eval_metric="mlogloss")
model_xgb.fit(X_train_scaled, y_train)
preds_xgb_base = model_xgb.predict(X_val_scaled)

[I 2025-05-22 21:33:20,351] Using an existing study with name 'xgb_base' instead of creating a new one.
[I 2025-05-22 21:33:21,855] Trial 10 finished with value: 0.22383146039360813 and parameters: {'n_estimators': 77, 'max_depth': 7, 'learning_rate': 0.0164303613849536}. Best is trial 3 with value: 0.23919363961037665.
[I 2025-05-22 21:33:23,329] Trial 11 finished with value: 0.22483528942001918 and parameters: {'n_estimators': 80, 'max_depth': 9, 'learning_rate': 0.29348124725434827}. Best is trial 3 with value: 0.23919363961037665.
[I 2025-05-22 21:33:24,832] Trial 12 finished with value: 0.24008558728092944 and parameters: {'n_estimators': 65, 'max_depth': 10, 'learning_rate': 0.15398230821952474}. Best is trial 12 with value: 0.24008558728092944.
[I 2025-05-22 21:33:25,810] Trial 13 finished with value: 0.23202248479844934 and parameters: {'n_estimators': 64, 'max_depth': 8, 'learning_rate': 0.149672634841922}. Best is trial 12 with value: 0.24008558728092944.
[I 2025-05-22 21:33:

In [45]:
def objective_lgb_base(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
    }
    model = LGBMClassifier(**params, random_state=7, verbosity=-1)
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_val_scaled)
    return kappa_metric(y_val, preds)

study_lgb = optuna.create_study(
    direction="maximize",
    study_name="lgb_base",
    storage=storage,
    load_if_exists=True
)
study_lgb.optimize(objective_lgb_base, n_trials=5)

model_lgb = LGBMClassifier(**study_lgb.best_params, random_state=7)
model_lgb.fit(X_train_scaled, y_train)
preds_lgb_base = model_lgb.predict(X_val_scaled)

[I 2025-05-22 21:33:29,259] Using an existing study with name 'lgb_base' instead of creating a new one.
[I 2025-05-22 21:33:31,747] Trial 10 finished with value: 0.22281974464994592 and parameters: {'n_estimators': 88, 'max_depth': 9, 'learning_rate': 0.01759711808610917}. Best is trial 4 with value: 0.23990595113839797.
[I 2025-05-22 21:33:33,851] Trial 11 finished with value: 0.2338878251936164 and parameters: {'n_estimators': 85, 'max_depth': 8, 'learning_rate': 0.10315014676351017}. Best is trial 4 with value: 0.23990595113839797.
[I 2025-05-22 21:33:35,698] Trial 12 finished with value: 0.23034926986598026 and parameters: {'n_estimators': 80, 'max_depth': 7, 'learning_rate': 0.26032693433517934}. Best is trial 4 with value: 0.23990595113839797.
[I 2025-05-22 21:33:37,555] Trial 13 finished with value: 0.2324290088026565 and parameters: {'n_estimators': 66, 'max_depth': 10, 'learning_rate': 0.11960504873319736}. Best is trial 4 with value: 0.23990595113839797.
[I 2025-05-22 21:33:4

In [46]:
all_preds_base = np.vstack([preds_rf_base, preds_xgb_base, preds_lgb_base])
ensemble_preds_base = mode(all_preds_base, axis=0).mode

In [47]:
proba_rf_base = model_rf.predict_proba(X_val_scaled)
proba_xgb_base = model_xgb.predict_proba(X_val_scaled)
proba_lgb_base = model_lgb.predict_proba(X_val_scaled)



In [144]:
ensemble_probas = (proba_rf_base + proba_xgb_base + proba_lgb_base) / 3
ensemble_preds_base = np.argmax(ensemble_probas, axis=1)

In [145]:
ensemble_preds_base

array([2, 1, 2, ..., 2, 4, 1])

In [147]:
final_kappa_base = cohen_kappa_score(y_val, ensemble_preds_base)
print(f"Cohen's Kappa del Ensemble: {final_kappa_base:.4f}")

Cohen's Kappa del Ensemble: 0.2417


In [51]:
ids_proba_rf_base = pd.DataFrame(proba_rf_base).merge(ids, left_index=True, right_index=True, how='inner')
ids_proba_rf_sum_base = ids_proba_rf_base.drop(columns=['PetID']).values
ids_proba_xgb_base = pd.DataFrame(proba_xgb_base).merge(ids, left_index=True, right_index=True, how='inner')
ids_proba_xgb_sum_base = ids_proba_xgb_base.drop(columns=['PetID']).values
ids_proba_lgb_base = pd.DataFrame(proba_lgb_base).merge(ids, left_index=True, right_index=True, how='inner')
ids_proba_lgb_sum_base = ids_proba_lgb_base.drop(columns=['PetID']).values
ids_ensemble_aux_sum_base = ids_ensemble_aux_sum.astype('float')

In [52]:
ensemble_probas_con_texto = (ids_proba_rf_sum_base + ids_proba_xgb_sum_base + ids_proba_lgb_sum_base + ids_ensemble_aux_sum_base) / 4
ensemble_preds_base_con_texto = np.argmax(ensemble_probas_con_texto, axis=1)

In [53]:
final_kappa_base_con_texto = cohen_kappa_score(y_val, ensemble_preds_base_con_texto)
print(f"Cohen's Kappa del Ensemble: {final_kappa_base_con_texto:.4f}")

Cohen's Kappa del Ensemble: 0.4970
