# Scrip para la competencia 01

Training the best M models

Incluye:

- Tuning de hyperparámetros (con meses históricos)
- Análisis de la incertidumbre (respecto a la proyección histórica)
- Predicción para entrega (competancia de Kaggle)

## Directorios

In [1]:
import os
import pandas as pd

datos = 'datos'

optimizacion = 'optimizacion'

modelos = 'modelos'

resultados = 'resultados'


### Condiciones de entrenamiento

Entrenamos con la mejor optimización posible, sobre el último més disponible.

Ya tenemos en nuestras variables, algunas variables históricas según el feature engineering aplicado.

In [2]:
ganancia_acierto = 273000
costo_estimulo = 7000

mes_train = 202104

threshold = 0.025

semillas = [437809, 327347, 392879, 455783, 217163]

In [3]:
max_jobs = os.cpu_count() - 6

## Pipeline

Con distintos modelos

In [4]:
# Importar las librerías necesarias
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
from joblib import Parallel, delayed
import seaborn as sns
import matplotlib.pyplot as plt
import optuna

# Definir la clase ModelPipeline
class ModelPipeline:
    def __init__(self, data, seeds, model_type='decision_tree', 
                 ganancia_acierto=273000, costo_estimulo=7000, 
                 threshold=0.025, seed=0, n_jobs=-1, reg=False):
        self.data = data
        self.seeds = seeds
        self.s = seed
        self.n_jobs = int(n_jobs)
        self.model_type = model_type
        self.ganancia_acierto = ganancia_acierto
        self.costo_estimulo = costo_estimulo
        self.threshold = threshold
        self.reg = reg
        self.models = {}
        self.base_params = {'random_state': self.seeds[self.s]}
        self.best_params = None
        self.base_model = None
        self.best_model = None

        # Mapear model_type al clasificador correspondiente
        self.classifier_map = {
            'decision_tree': DecisionTreeClassifier,
            'random_forest': RandomForestClassifier,
            'xgboost': XGBClassifier,
            'lightgbm': LGBMClassifier
        }

    def def_xy(self, mes, target='clase_ternaria', to_pred=False):
        X = self.data[self.data['foto_mes'] == mes]
        y = X[target]
        X = X.drop(columns=[target])

        numero_de_cliente = X['numero_de_cliente']

        if to_pred:
            return X, numero_de_cliente
        else:
            return X, y

    def ganancia(self, model, X, y, prop=1):
        # Obtener las probabilidades predichas
        y_hat = model.predict_proba(X)

        # Obtener las clases del modelo
        model_classes = model.classes_

        # Identificar la clase objetivo (puede ser 'BAJA+2' o 2)
        if 'BAJA+2' in model_classes:
            target_class = 'BAJA+2'
        elif 2 in model_classes:
            target_class = 2
        else:
            raise ValueError("La clase objetivo 'BAJA+2' o 2 no está en las clases del modelo.")

        # Obtener el índice de la clase objetivo
        class_index = np.where(model_classes == target_class)[0][0]

        # Obtener las probabilidades predichas para la clase objetivo
        probs = y_hat[:, class_index]

        # Calcular la ganancia para cada fila
        gains = np.where(
            probs >= self.threshold,
            np.where(y == target_class, self.ganancia_acierto, -self.costo_estimulo),
            0
        )

        # Sumar las ganancias
        total_gain = gains.sum()/prop

        return total_gain

    def train_and_evaluate(self, train_index, test_index, X, y, params):
        # Instanciar el clasificador basado en model_type
        classifier_class = self.classifier_map[self.model_type]
        model = classifier_class(**params)
        model.fit(X.iloc[train_index], y.iloc[train_index])
        ganancia_value = self.ganancia(model, X.iloc[test_index], y.iloc[test_index], prop=0.3)
        return model, ganancia_value

    def optimize_model(self, X, y, storage_name, study_name, optimize=True, n_trials=200):
        sss_opt = ShuffleSplit(n_splits=5, test_size=0.3, random_state=self.seeds[self.s])

        def objective_xgboost(trial):
            # Hiperparámetros para XGBClassifier

            # Parámetros a optimizar
            # n_estimators = trial.suggest_int('n_estimators', 100, 500)
            max_leaves = trial.suggest_int('max_leaves', 10, 256)
            learning_rate = trial.suggest_float('eta', 0.01, 0.3, log=True)  # 'eta' es equivalente a 'learning_rate'
            gamma = trial.suggest_float('gamma', 0, 5)
            min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
            subsample = trial.suggest_float('subsample', 0.5, 1.0)
            colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
            if self.reg:
                reg_lambda = trial.suggest_float('lambda', 0.0, 10.0)
                reg_alpha = trial.suggest_float('alpha', 0.0, 10.0)
            # scale_pos_weight = trial.suggest_float('scale_pos_weight', 1.0, 10.0)

            params = {
                'booster': 'gbtree',
                'n_estimators': 100,
                'max_leaves': max_leaves,
                'learning_rate': learning_rate,
                'gamma': gamma,
                'min_child_weight': min_child_weight,
                'subsample': subsample,
                'colsample_bytree': colsample_bytree,
                # 'scale_pos_weight': scale_pos_weight, # default = 1, ya que la ganancia ya contempla desbalance
                'random_state': self.seeds[self.s],
                'enable_categorical': True,
                'use_label_encoder': False,
                'objective': 'multi:softprob',
                'num_class': 3,
                'eval_metric': 'mlogloss',
                'tree_method': 'hist',      # Usar 'hist' para grandes conjuntos de datos
                'grow_policy': 'lossguide', # Necesario cuando se usa 'max_leaves'
            }

            if self.reg:
                params.update({
                    'reg_lambda': reg_lambda,  # 'lambda' es palabra reservada en Python, usamos 'reg_lambda'
                    'reg_alpha': reg_alpha,
                })

            # Ejecutar validación cruzada paralela
            results = Parallel(n_jobs=self.n_jobs)(
                delayed(self.train_and_evaluate)(train_index, test_index, X, y, params)
                for train_index, test_index in sss_opt.split(X, y)
            )

            # Retornar la ganancia media
            return np.mean([result[1] for result in results])

        def objective_lightgbm(trial):
            # Hiperparámetros para LGBMClassifier
            # n_estimators = trial.suggest_int('n_estimators', 100, 1000)
            num_leaves = trial.suggest_int('num_leaves', 31, 256)
            # max_depth = trial.suggest_int('max_depth', -1, 30) # conflict with num_leaves
            learning_rate = trial.suggest_float('learning_rate', 0.001, 0.3, log=True)
            min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 20, 100)
            if self.reg:
                lambda_l1 = trial.suggest_float('lambda_l1', 0.0, 10.0)
                lambda_l2 = trial.suggest_float('lambda_l2', 0.0, 10.0)
            min_gain_to_split = trial.suggest_float('min_gain_to_split', 0.0, 1.0)
            feature_fraction = trial.suggest_float('feature_fraction', 0.5, 1.0)
            bagging_fraction = trial.suggest_float('bagging_fraction', 0.5, 1.0)
            bagging_freq = trial.suggest_int('bagging_freq', 1, 7)
            max_bin = trial.suggest_int('max_bin', 64, 255)

            params = {
                'n_estimators': 100, # to reduce overfitting and training time
                'num_leaves': num_leaves,
                # 'max_depth': max_depth,
                'learning_rate': learning_rate,
                'min_data_in_leaf': min_data_in_leaf,
                'min_gain_to_split': min_gain_to_split,
                'feature_fraction': feature_fraction,
                'bagging_fraction': bagging_fraction,
                'bagging_freq': bagging_freq,
                'max_bin': max_bin,
                # 'objective': 'binary',  # Specify the objective function
                # 'metric': 'binary_logloss',  # Specify the evaluation metric
                'random_state': self.seeds[self.s],
                'n_jobs': self.n_jobs
            }

            if self.reg:
                params.update({
                    'lambda_l1': lambda_l1,
                    'lambda_l2': lambda_l2,
                })

            results = Parallel(n_jobs=self.n_jobs)(
                delayed(self.train_and_evaluate)(train_index, test_index, X, y, params)
                for train_index, test_index in sss_opt.split(X)
            )

            return np.mean([result[1] for result in results])

        # Mapear model_type a la función objetivo correspondiente
        objective_map = {
            'xgboost': objective_xgboost,
            'lightgbm': objective_lightgbm
        }

        objective = objective_map[self.model_type]

        study = optuna.create_study(
            direction="maximize",
            study_name=study_name,
            storage=storage_name,
            load_if_exists=True
        )

        if optimize:
            print(f"Optimizando {self.model_type} con {n_trials} pruebas")
            study.optimize(objective, n_trials=n_trials)

        best_trial = study.best_trial
        self.best_params = best_trial.params  # Guardar los mejores parámetros

        print(f"Mejores parámetros para {self.model_type}: {best_trial.params}")
        return best_trial.params

    def train_base_model(self, X_train, y_train):
        classifier_class = self.classifier_map[self.model_type]
        self.base_model = classifier_class(**self.base_params)
        self.base_model.fit(X_train, y_train)

    def train_best_model(self, X_train, y_train):
        if self.best_params is None:
            print("No se encontraron mejores parámetros. Por favor, ejecuta optimize_model primero.")
            return
        classifier_class = self.classifier_map[self.model_type]
        self.best_model = classifier_class(**self.best_params)
        self.best_model.fit(X_train, y_train)

    def compare_models(self, X, y):
        sss = StratifiedShuffleSplit(n_splits=30, test_size=0.3, random_state=self.seeds[self.s])

        results_base = Parallel(n_jobs=self.n_jobs)(
            delayed(self.train_and_evaluate)(train_index, test_index, X, y, self.base_params)
            for train_index, test_index in sss.split(X, y)
        )
        results_best = Parallel(n_jobs=self.n_jobs)(
            delayed(self.train_and_evaluate)(train_index, test_index, X, y, self.best_params)
            for train_index, test_index in sss.split(X, y)
        )

        return results_base, results_best

    def plot_comparisons(self, results_base, results_best):
        df_pred = pd.DataFrame({'Ganancia': [result[1] for result in results_base], 'Modelo': 'Base'})
        df_pred_best = pd.DataFrame({'Ganancia': [result[1] for result in results_best], 'Modelo': 'Best'})
        df_combined = pd.concat([df_pred, df_pred_best])

        g = sns.FacetGrid(df_combined, row="Modelo", aspect=2)
        g.map(sns.histplot, "Ganancia", kde=True)
        plt.show()

        mean_base = df_combined[df_combined['Modelo'] == 'Base']['Ganancia'].mean()
        mean_best = df_combined[df_combined['Modelo'] == 'Best']['Ganancia'].mean()

        print(f"Ganancia media del modelo base: {mean_base}")
        print(f"Ganancia media del modelo Best: {mean_best}")

    def test_model(self, model, X, y):
        ganancia_test = self.ganancia(model, X, y)
        print(f"Ganancia del modelo en el conjunto de test: {ganancia_test}")
        return ganancia_test

    def test_base_model(self, X, y):
        return self.test_model(self.base_model, X, y)

    def test_best_model(self, X, y):
        return self.test_model(self.best_model, X, y)

    def simulate_kaggle_split(self, mes_futuro, imputer=None):
        """
        Simula el split público/privado como en una competencia de Kaggle.
        """
        # Obtener los datos futuros
        X_futuro, y_futuro = self.def_xy(mes_futuro, target='clase_ternaria', to_pred=False)
        if imputer is not None:
            X_futuro = pd.DataFrame(imputer.fit_transform(X_futuro), columns=X_futuro.columns)

        # Simular el split público/privado
        sss_futuro = StratifiedShuffleSplit(n_splits=50, test_size=0.3, random_state=self.seeds[self.s])

        ganancias_futuro_privada_best = []
        ganancias_futuro_privada_base = []
        ganancias_futuro_publica_best = []
        ganancias_futuro_publica_base = []

        for train_index, test_index in sss_futuro.split(X_futuro, y_futuro):
            # Privado (70% de los datos)
            ganancias_futuro_privada_best.append(
                self.ganancia(self.best_model, X_futuro.iloc[train_index], y_futuro.iloc[train_index], prop=0.7)
            )
            ganancias_futuro_privada_base.append(
                self.ganancia(self.base_model, X_futuro.iloc[train_index], y_futuro.iloc[train_index], prop=0.7)
            )
            # Público (30% de los datos)
            ganancias_futuro_publica_best.append(
                self.ganancia(self.best_model, X_futuro.iloc[test_index], y_futuro.iloc[test_index], prop=0.3)
            )
            ganancias_futuro_publica_base.append(
                self.ganancia(self.base_model, X_futuro.iloc[test_index], y_futuro.iloc[test_index], prop=0.3)
            )

        # Crear DataFrames para visualización
        df_pred_1_best = pd.DataFrame({
            'Ganancia': ganancias_futuro_privada_best,
            'Modelo': 'Best',
            'Grupo': 'Privado'
        })
        df_pred_2_best = pd.DataFrame({
            'Ganancia': ganancias_futuro_publica_best,
            'Modelo': 'Best',
            'Grupo': 'Publico'
        })
        df_pred_1_base = pd.DataFrame({
            'Ganancia': ganancias_futuro_privada_base,
            'Modelo': 'Base',
            'Grupo': 'Privado'
        })
        df_pred_2_base = pd.DataFrame({
            'Ganancia': ganancias_futuro_publica_base,
            'Modelo': 'Base',
            'Grupo': 'Publico'
        })

        df_combined = pd.concat([df_pred_1_base, df_pred_2_base, df_pred_1_best, df_pred_2_best])

        # Visualización
        g = sns.FacetGrid(df_combined, col="Grupo", row="Modelo", aspect=2)
        g.map(sns.histplot, "Ganancia", kde=True)
        plt.show()

        # Cálculo de ganancias medias
        mean_base_privado = df_combined[
            (df_combined['Modelo'] == 'Base') & (df_combined['Grupo'] == 'Privado')
        ]['Ganancia'].mean()
        mean_base_publico = df_combined[
            (df_combined['Modelo'] == 'Base') & (df_combined['Grupo'] == 'Publico')
        ]['Ganancia'].mean()
        mean_best_privado = df_combined[
            (df_combined['Modelo'] == 'Best') & (df_combined['Grupo'] == 'Privado')
        ]['Ganancia'].mean()
        mean_best_publico = df_combined[
            (df_combined['Modelo'] == 'Best') & (df_combined['Grupo'] == 'Publico')
        ]['Ganancia'].mean()

        print(f"Ganancia media del modelo base en privado: {mean_base_privado}")
        print(f"Ganancia media del modelo base en público: {mean_base_publico}")
        print(f"Ganancia media del modelo Best en privado: {mean_best_privado}")
        print(f"Ganancia media del modelo Best en público: {mean_best_publico}")

_____

____

## XGBoost 

**Aggressive FE 3 + data cleansing**

La creación del target y el análisis exploratorio, está avanzado en el Notebook_comp01.

Luego, un proceso de **feature engineering conceptual y extendido a 3 meses previos**, fue llevado a cabo en el Notebook_comp01_fe_agr_3.

Agrego, la eliminación de features conflictivas y regularización.

### Datos

In [5]:
dataset_file = 'competencia_01_brandoni_fe_agr_3.csv'

dataset_file = os.path.join(datos, dataset_file)

data = pd.read_csv(dataset_file)

# data cleansing + data drifting
to_drop = ['Master_fultimo_cierre', 'Visa_fultimo_cierre'] + ['cprestamos_personales', 'mprestamos_personales']

  data = pd.read_csv(dataset_file)


Uso del pipeline con XGBoost

En abril

Sin mes_test

In [6]:
# Mapear etiquetas de clase a números
label_mapping = {'CONTINUA': 0, 'BAJA+1': 1, 'BAJA+2': 2}

data['clase_ternaria'] = data['clase_ternaria'].map(label_mapping)

In [7]:
mes_train

202104

In [8]:
from sklearn.impute import SimpleImputer

# Semilla a usar
s = 1

print("### Corriendo pipeline con XGBoost ###")
# Inicializar el pipeline con 'xgboost'
pipeline_xgb_reg = ModelPipeline(data, semillas, model_type='xgboost', 
                             seed=s, n_jobs=2, reg=True)

X_train, y_train = pipeline_xgb_reg.def_xy(mes_train)

# Identificar y eliminar columnas con todos los valores NaN
cols_with_all_nan = X_train.columns[X_train.isna().all()]
print("Columnas con todos los valores NaN:", cols_with_all_nan.tolist())

# Drop these columns + bad columns (DQ + DD)
to_drop = list(cols_with_all_nan) + to_drop
X_train = X_train.drop(columns=to_drop)

# Imputación de valores faltantes
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X_train_imp = pd.DataFrame(imp_median.fit_transform(X_train), columns=X_train.columns)

# Identificar variables categóricas
categorical_features = [col for col in X_train_imp.columns if X_train_imp[col].dtype == 'object']

# Identificar variables categóricas
categorical_features = X_train_imp.select_dtypes(include=['object']).columns.tolist()

# Convertir columnas categóricas al tipo 'category'
for col in categorical_features:
    X_train_imp[col] = X_train_imp[col].astype('category')

# print("\n# Entrenando el modelo base XGBoost")
# pipeline_xgb_reg.train_base_model(X_train_imp, y_train)

# Definir el almacenamiento para Optuna
storage_xgb = "sqlite:///optimizacion/optimization_tree.db"
study_xgb = "comp01_v02_pipeline_training_xgboost_opt_2_fe_agr_3_reg"

print("\n# Optimizando el modelo")
pipeline_xgb_reg.optimize_model(
    X_train_imp, y_train,
    storage_name=storage_xgb,
    study_name=study_xgb,
    optimize=False,  # Establecer en True para realizar la optimización
    n_trials=100  # Ajusta el número de pruebas según sea necesario
)

# Entrenar el mejor modelo con parámetros optimizados
print("\n# Entrenando el mejor modelo con parámetros optimizados")
pipeline_xgb_reg.train_best_model(X_train_imp, y_train)

# # Comparar modelos
# print("\n# Comparando modelos")
# results_base_xgb_reg, results_best_xgb_reg = pipeline_xgb_reg.compare_models(X_train_imp, y_train)
# pipeline_xgb_reg.plot_comparisons(results_base_xgb_reg, results_best_xgb_reg)

### Corriendo pipeline con XGBoost ###
Columnas con todos los valores NaN: ['payroll_slope_1_cliente_antiguedad', 'cuenta_corriente_slope_1_cliente_antiguedad', 'visa_consumo_slope_1_cliente_antiguedad', 'comisiones_mantenimiento_slope_1_cliente_antiguedad', 'comisiones_otras_slope_1_cliente_antiguedad', 'payroll_slope_1_foto_mes', 'cuenta_corriente_slope_1_foto_mes', 'visa_consumo_slope_1_foto_mes', 'comisiones_mantenimiento_slope_1_foto_mes', 'comisiones_otras_slope_1_foto_mes']

# Optimizando el modelo


[I 2024-10-28 11:55:05,061] Using an existing study with name 'comp01_v02_pipeline_training_xgboost_opt_2_fe_agr_3_reg' instead of creating a new one.


Mejores parámetros para xgboost: {'max_leaves': 19, 'eta': 0.11537876667197919, 'gamma': 2.9208841946871176, 'min_child_weight': 7, 'subsample': 0.9416959858943643, 'colsample_bytree': 0.5429812768188057, 'lambda': 7.226815448281549, 'alpha': 0.5929685712726126, 'scale_pos_weight': 8.669078758370219}

# Entrenando el mejor modelo con parámetros optimizados


Parameters: { "scale_pos_weight" } are not used.



In [9]:
pipeline_xgb_reg.best_model.get_params()

{'objective': 'multi:softprob',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 0.5429812768188057,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': 2.9208841946871176,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': 19,
 'min_child_weight': 7,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': 8.669078758370219,
 'subsample': 0.9416959858943643,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None,
 'eta': 0.115

___

## Ensamble

XGBoost + Redes neuronales + Atención

Usando el XGBoost pre-entrenado, vamos a desarrollar un modelo de ensamble con redes neuronales y mecanismos de atención.

### 1. XGBoost re-entrenado

Datos

In [10]:
dataset_file = 'competencia_01_brandoni_fe_agr_3.csv'

dataset_file = os.path.join(datos, dataset_file)

data = pd.read_csv(dataset_file)

# data cleansing + data drifting
to_drop = ['Master_fultimo_cierre', 'Visa_fultimo_cierre'] + ['cprestamos_personales', 'mprestamos_personales']

  data = pd.read_csv(dataset_file)


Target ternaria

In [11]:
target_multi = 'clase_ternaria'

In [12]:
X = data[data['foto_mes'] == mes_train]
y = X[target_multi]
X = X.drop(columns=[target_multi])

In [13]:
# Mapear etiquetas de clase a números
label_mapping = {'CONTINUA': 0, 'BAJA+1': 1, 'BAJA+2': 2}

y = y.map(label_mapping)

In [14]:
y.value_counts()

clase_ternaria
0    161919
2      1189
1       982
Name: count, dtype: int64

Target binaria

In [15]:
y_bin = pd.Series(
                np.where(y == 2, 1, 0),
                name='clase_binaria',
                index=y.index
                )

In [16]:
y_bin.value_counts()

clase_binaria
0    162901
1      1189
Name: count, dtype: int64

Split del train para tomar una porción para validación y test

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    # y, # ternaria
                                                    y_bin, # binaria
                                                    test_size=0.1,
                                                    # stratify=y,
                                                    stratify=y_bin, 
                                                    random_state=semillas[1])

y_train = y_train.values
y_test = y_test.values

In [18]:
# tomo los parámetros óptimos del modelo a trabajar
opt_params = pipeline_xgb_reg.best_model.get_params()

opt_params.update({'objective': 'binary:logistic'}) # para binario

# instancia con mejores parámetros
xgb_clf = XGBClassifier(**opt_params, seed=semillas[1])

# entreno
xgb_clf.fit(X_train, y_train)

In [19]:
# Get number of trees in the model
num_trees = xgb_clf.get_booster().num_boosted_rounds()
print("Number of trees:", num_trees)

# Get the model parameters
params = xgb_clf.get_params()
print("Model Parameters:", params)

Number of trees: 100
Model Parameters: {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.5429812768188057, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': 2.9208841946871176, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': 19, 'min_child_weight': 7, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 8.669078758370219, 'subsample': 0.9416959858943643, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'eta': 0.11

In [20]:
# evaluation Libraries
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc, roc_auc_score

def find_best_threshold(y_pred_prob, y_test, best_thresh = None):
    """
    This function is to find the best threshold to determine "to inspect" or "not".
    We assume that we inspect only the imports where predicted probability of fraud is above the threshold.

    dtype model: xgboost.sklearn.XGBClassifier
    dtype x_list: list or array
    dtype y_test: np.array
    dtype best_thresh: float
    rtype best_f1: float
    """
    
    # Set threshold range as [0.1, 0.2, ..., 0.5]. 
    threshold_list = np.arange(0.1,0.6,0.1)
    # Set an initial value of best threshold.
    best_f1 = 0
    
    # If best_thresh is set as "None", this function is to find the best_thresh as well as best_f1 
    if best_thresh ==None:
        for th in threshold_list:
            y_pred_label = (y_pred_prob > th)*1 
            f_score = f1_score(y_test,y_pred_label)
            if f_score > best_f1:
                best_f1 = f_score
                best_thresh = th 
        return best_thresh, best_f1, roc_auc_score(y_test, y_pred_prob)
    
    # If best_thresh is set as a certain number, this function is to calculate its f1 score.
    else:
        y_pred_label = (y_pred_prob > best_thresh)*1 
        best_f1 = f1_score(y_test,y_pred_label)
        
    return best_f1, roc_auc_score(y_test, y_pred_prob)

Testeo de compromiso

In [21]:
# Evaluate xgboost model
print("------Evaluating xgboost model------")
# Predict
test_pred = xgb_clf.predict_proba(X_test)[:,1]
# Calculate auc
xgb_auc = roc_auc_score(y_test, test_pred)

# Find the best threshold
xgb_threshold,_,_ = find_best_threshold(xgb_clf.predict_proba(X_test)[:,1], y_test)
# Calculate the best f1 score
xgb_f1,_ = find_best_threshold(xgb_clf.predict_proba(X_test)[:,1], y_test, best_thresh = xgb_threshold)
print("AUC = %.4f, F1-score = %.4f" % (xgb_auc, xgb_f1))

------Evaluating xgboost model------
AUC = 0.9220, F1-score = 0.2483


In [22]:
xgb_threshold

0.5

Guardado del modelo XGBoost

In [47]:
import joblib

xgb_clf_path = 'modelos/xgb_classifier_model.pkl'

# Save the model
joblib.dump(xgb_clf, xgb_clf_path)

['modelos/xgb_classifier_model.pkl']

### 2. Funciones de ensamble

**process_lead_idx** function

In [23]:
def process_leaf_idx(X_leaves): 
    """
    This function is to convert the output of XGBoost model to the input of DATE model.
    For an individual import, the output of XGBoost model is a list of leaf index of multiple trees.
    eg. [1, 1, 10, 9, 30, 30, 32, ... ]
    How to distinguish "node 1" of the first tree from "node 1" of the second tree?
    How to distinguish "node 30" of the fifth tree from "node 30" of the sixth tree?
    This function is to assign unique index to every leaf node in all the trees. 
    
    This function returns;
    - lists of unique leaf index;
    - total number of unique leaf nodes; and
    - a reference table (dictionary) composed of "unique leaf index", "tree id", "(previous) leaf index".
    
    dtype X_leaves: np.array
    rtype leaves: list
    rtype total_leaves: int
    rtype new_leaf_index: dict 
    """
    leaves = X_leaves.copy()
    new_leaf_index = dict() # Dictionary to store leaf index
    total_leaves = 0
    for c in range(X_leaves.shape[1]): # Iterate for each column (i.e. 100 trees)
        column = X_leaves[:,c]
        unique_vals = list(sorted(set(column)))
        new_idx = {v:(i+total_leaves) for i,v in enumerate(unique_vals)}
        for i,v in enumerate(unique_vals):
            leaf_id = i+total_leaves
            new_leaf_index[leaf_id] = {c:v}
        leaves[:,c] = [new_idx[v] for v in column]
        total_leaves += len(unique_vals)
        
    assert leaves.ravel().max() == total_leaves - 1
    return leaves,total_leaves,new_leaf_index

**fgsm_attack** function

This function is to generate adversarial samples.

In [24]:
def fgsm_attack(model, loss, images, labels, eps):
    """
    Generate adversarial examples
    
    dtype model: torch.nn.parallel.data_parallel.DataParallel
    dtype loss: torch.nn.modules.loss.BCELoss
    dtype images: torch.Tensor
    dtype labels: torch.Tensor
    dtype eps: float
    rtype attack_images: torch.Tensor
    """
    
    images = Variable(images, requires_grad=True)
    outputs = model.module.pred_from_hidden(images)
    
    model.zero_grad()
    cost = loss(outputs, labels)
    cost.backward()
    attack_images = images + eps * images.grad.sign()
    
    return attack_images

**metrics** function

This is the function which generates metrics to evaluate the performance of the model.  

For date model, we will calculate the metrics of evaluation for each group of 1%, 2%, 5%, and 10% suspicious transactions suggested by the model.

In [25]:
def metrics(y_prob,xgb_testy,best_thresh=None):
    """
    Generate metrics for evaluation
    
    dtype y_prob: np.array
    dtype xgb_testy: np.array
    dtype revenue_test: np.array
    dtype best_thresh: float64
    rtype overall_f1: float64
    rtype auc: float64
    rtype pr: list
    rtype re: list
    rtype f: list
    rtype rev: list
    """
    
    if best_thresh ==None:
        _,overall_f1,auc = find_best_threshold(y_prob,xgb_testy,best_thresh)
    else:
        overall_f1,auc = find_best_threshold(y_prob,xgb_testy,best_thresh)
    # Seized revenue 
    # Precision and Recall
    pr, re, f = [], [], []
    for i in [99,98,95,90]:
        threshold = np.percentile(y_prob, i)

        precision = np.mean(xgb_testy[y_prob > threshold])
        recall = sum(xgb_testy[y_prob > threshold])/sum(xgb_testy)
        f1 = 2*precision*recall/(precision+recall)

        # Save results
        pr.append(precision)
        re.append(recall)
        f.append(f1)
    
    return overall_f1, auc, pr, re, f

### 3. Leaf nodes of each import from XGB model**

Get output values from the XGBoost model and convert them to the input for NN

In [26]:
# Get leaf index from xgboost model 
X_train_leaves = xgb_clf.apply(X_train) #apply: Return the predicted leaf every tree for each sample.
# X_valid_leaves = xgb_clf.apply(X_val)
X_test_leaves = xgb_clf.apply(X_test)
train_rows = X_train_leaves.shape[0]

In [27]:
# Preprocess
train_rows = X_train.shape[0]
test_rows = X_test.shape[0] + train_rows

# Convert output values of the XGBoost model to the input form of the DATE model.
X_leaves = np.concatenate((X_train_leaves, X_test_leaves), axis=0) # Make sure the dimensionality

transformed_leaves, leaf_num, new_leaf_index = process_leaf_idx(X_leaves)

train_leaves, test_leaves = transformed_leaves[:train_rows],\
                                          transformed_leaves[train_rows:]

In [28]:
# Number 
len(new_leaf_index)

1888

In [29]:
transformed_leaves

array([[   5.,   22.,   41., ..., 1842., 1858., 1872.],
       [  10.,   27.,   48., ..., 1833., 1860., 1886.],
       [  10.,   27.,   48., ..., 1842., 1859., 1870.],
       ...,
       [   9.,   28.,   45., ..., 1843., 1859., 1872.],
       [  11.,   28.,   48., ..., 1842., 1862., 1873.],
       [  11.,   28.,   48., ..., 1842., 1859., 1873.]], dtype=float32)

In [30]:
train_leaves

array([[5.000e+00, 2.200e+01, 4.100e+01, ..., 1.842e+03, 1.858e+03,
        1.872e+03],
       [1.000e+01, 2.700e+01, 4.800e+01, ..., 1.833e+03, 1.860e+03,
        1.886e+03],
       [1.000e+01, 2.700e+01, 4.800e+01, ..., 1.842e+03, 1.859e+03,
        1.870e+03],
       ...,
       [1.000e+01, 2.600e+01, 5.000e+01, ..., 1.842e+03, 1.859e+03,
        1.872e+03],
       [1.000e+00, 2.000e+01, 3.900e+01, ..., 1.843e+03, 1.859e+03,
        1.872e+03],
       [1.000e+01, 2.700e+01, 4.500e+01, ..., 1.842e+03, 1.859e+03,
        1.872e+03]], dtype=float32)

In [31]:
test_leaves

array([[   9.,   26.,   48., ..., 1835., 1860., 1873.],
       [  10.,   25.,   43., ..., 1842., 1859., 1872.],
       [  11.,   28.,   48., ..., 1833., 1858., 1875.],
       ...,
       [   9.,   28.,   45., ..., 1843., 1859., 1872.],
       [  11.,   28.,   48., ..., 1842., 1862., 1873.],
       [  11.,   28.,   48., ..., 1842., 1859., 1873.]], dtype=float32)

Seteo de semilla

In [32]:
import torch
import torch.utils.data as Data
import random

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # Si usas múltiples GPUs
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(semillas[1])

### 4. Convert data to tensor

Tensor is a collection of numbers with specific shape (dimension)

In [33]:
# Convert to torch type
train_leaves = torch.tensor(train_leaves).long()
test_leaves = torch.tensor(test_leaves).long()

# cls data
train_label_cls = torch.tensor(y_train).float()
test_label_cls = torch.tensor(y_test).float()

# Create dataset 
train_dataset = Data.TensorDataset(train_leaves, train_label_cls)
test_dataset = Data.TensorDataset(test_leaves, test_label_cls)

Data loader

In [34]:
import time

# Create dataloader
batch_size = 128
train_loader = Data.DataLoader(
    dataset=train_dataset,     
    batch_size=batch_size,      
    shuffle=True,               
)

test_loader = Data.DataLoader(
    dataset=test_dataset,     
    batch_size=batch_size,      
    shuffle=False,               
)

# Model information
curr_time = str(time.time())
model_name = "DMEyF_ensamble"
model_path = "modelos/%s%s.pkl" % (model_name, curr_time)

### 5. Construcción del modelo

In [35]:
# !pip install torch-multi-head-attention

In [36]:
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch_multi_head_attention import MultiHeadAttention
from torch.autograd import Variable
import argparse

Attención

In [37]:
class Attention(nn.Module):
    """
    Attention sends all outputs of the encoder to the decoder, and at each time step, 
    the memory cell of the decoder calculates the sum of the weights from all encoder outputs 
    to determine which data to focus more on.
    """
    def __init__(self,dim,hidden,aggregate="sum"):
        super(Attention, self).__init__()
        self.attention_matrix = nn.Linear(dim, hidden)
        self.project_weight = nn.Linear(hidden*2, hidden)
        self.h = nn.Parameter(torch.rand(hidden,1))
        self.agg_type = aggregate
        
    def forward(self, query, key): # Query: 256 X 16, # key: 256 X 100 X 16, # Assume key==value
        dim = query.size(-1) # 16 (n_embedding_dimension)
        batch = key.size(0) # 256 (batch_size = n_observation in a batch)
        time_step = key.size(1) # 100 (n_trees from xgboot model)
        
        # Concate input query and key 
        query = query.view(batch, 1, dim) # View = reshape: (256X16) -> (256X1X16)
        query = query.expand(batch, time_step, -1) # Expand to the same dimension: (256X1X16) -> (256X100X16)
        cat_vector = torch.cat((query, key), dim=-1) # (256X100X32)
        
        # Project to single value
        project_vector = self.project_weight(cat_vector) 
        project_vector = torch.relu(project_vector)
        attention_alpha = torch.matmul(project_vector, self.h)
        attention_weight = torch.softmax(attention_alpha, dim=1) # Normalize and calculate weights (b,t,1)
        attention_vec = key * attention_weight
        
        # Aggregate leaves
        if self.agg_type == "max":
            attention_vec, _ = torch.max(attention_vec, dim=1)
        elif self.agg_type =="mean":
            attention_vec = torch.mean(attention_vec, dim=1)
        elif self.agg_type =="sum":
            attention_vec = torch.sum(attention_vec, dim=1)
        return attention_vec, attention_weight

Ensamble DMEyF

Con normalización extra

In [38]:
class DMEyF_norm(nn.Module):
    """
    Esta es la clase para crear la estructura completa del modelo DATE
    """
    def __init__(self, max_leaf, dim, head_num=4, act="relu", 
                 device="cuda", use_self=True, use_att=True, agg_type="sum"):
        super(DMEyF_norm, self).__init__()
        self.d = dim
        self.device = device
        if act == "relu":
            self.act = nn.LeakyReLU()
        self.use_self = use_self
        self.use_att = use_att
        self.agg_type = agg_type
        # Capas de embedding
        self.leaf_embedding = nn.Embedding(max_leaf, dim)

        # Capas de atención
        self.attention_block = Attention(dim, dim, self.agg_type).to(device)
        self.self_att = MultiHeadAttention(dim, head_num).to(device)

        # Capas de normalización para las salidas de atención
        self.layer_norm_self_att = nn.LayerNorm(dim)
        self.layer_norm_att_block = nn.LayerNorm(dim)

        # Capas ocultas y de salida con normalización
        self.hidden1 = nn.Linear(dim, 128)
        self.hidden2 = nn.Linear(128, 256)
        self.hidden3 = nn.Linear(256, dim)
        self.output_cls_layer = nn.Linear(dim, 1)
    
    def forward(self, feature):
        # Embedding de leaf_id
        leaf_vectors = self.leaf_embedding(feature)
        
        # Primera atención: Multi-Head Self-Attention
        if self.use_self:
            leaf_vectors = self.self_att(leaf_vectors, leaf_vectors, leaf_vectors)
            # Normalización después de self_att
            leaf_vectors = self.layer_norm_self_att(leaf_vectors)
        
        if self.use_att:
            # Computar la media de leaf_vectors para obtener query_vector
            query_vector = torch.mean(leaf_vectors, dim=1)  # (batch_size, dim)
            # Segunda atención: Atención con un vector de consulta propio
            set_vector, self.attention_w = self.attention_block(query_vector, leaf_vectors)
            # Normalización después de attention_block
            set_vector = self.layer_norm_att_block(set_vector)
        else:
            # Agregar leaves
            if self.agg_type == "max":
                set_vector, _ = torch.max(leaf_vectors, dim=1)
            elif self.agg_type =="mean":
                set_vector = torch.mean(leaf_vectors, dim=1)
            elif self.agg_type =="sum":
                set_vector = torch.sum(leaf_vectors, dim=1)

            # Normalización del set_vector si no se usa atención
            set_vector = self.layer_norm_att_block(set_vector)
    
        # Pasar por las capas ocultas con activaciones y normalización
        hidden = self.act(self.hidden1(set_vector))
        hidden = self.act(self.hidden2(hidden))
        hidden = self.act(self.hidden3(hidden))

        # Salida de clasificación
        classification_output = torch.sigmoid(self.output_cls_layer(hidden))
        return classification_output, hidden

    def pred_from_hidden(self, hidden):
        classification_output = torch.sigmoid(self.output_cls_layer(hidden))
        return classification_output 

    def eval_on_batch(self, test_loader):  # Predecir datos de prueba usando batch
        final_output = []
        cls_loss = []
        for batch in test_loader:
            batch_feature, batch_cls = batch
            batch_feature, batch_cls = batch_feature.to(self.device), batch_cls.to(self.device)
            batch_cls = batch_cls.view(-1,1)
            y_pred_prob, _ = self.forward(batch_feature)

            # Calcular pérdida de clasificación
            cls_losses = nn.BCELoss()(y_pred_prob, batch_cls)
            cls_loss.append(cls_losses.item())

            # Almacenar probabilidad predicha
            y_pred = y_pred_prob.detach().cpu().numpy().tolist()
            final_output.extend(y_pred)

        print("Pérdida CLS: %.4f" % np.mean(cls_loss))
        return np.array(final_output).ravel(), np.mean(cls_loss)



### 6. Hiperparámetros

In [39]:
# Parse argument
parser = argparse.ArgumentParser()
parser.add_argument('--model_name', 
                        type=str, 
                        default="DMEyF_SST", 
                        help="Name of model",
                        )
parser.add_argument('--epoch', 
                        type=int, 
                        default=10, 
                        help="Number of epochs",
                        )
parser.add_argument('--dim', 
                        type=int, 
                        default=16, 
                        help="Hidden layer dimension",
                        )
parser.add_argument('--lr', 
                        type=float, 
                        default=0.1, 
                        help="learning rate",
                        )
parser.add_argument('--l2',
                        type=float,
                        default=0.01, #it was 0.001
                        help="l2 reg",
                        )
# parser.add_argument('--alpha',
#                         type=float,
#                         default=10,
#                         help="Regression loss weight",
#                         )

parser.add_argument('--beta', type=float, default=0.001, help="Adversarial loss weight")
parser.add_argument('--head_num', type=int, default=8, help="Number of heads for self attention")
parser.add_argument('--use_self', type=int, default=True, help="Wheter to use self attention")
parser.add_argument('--use_att', type=int, default=True, help="Wheter to use attention")
# parser.add_argument('--fusion', type=str, choices=["concat","attention"], default="concat", help="Fusion method for final embedding")
parser.add_argument('--agg', type=str, choices=["sum","max","mean"], default="sum", help="Aggreate type for leaf embedding")
parser.add_argument('--act', type=str, choices=["relu"], default="relu", help="Activation function")
parser.add_argument('--device', type=str, choices=["cuda","cpu"], default="cuda", help="device name for training")
parser.add_argument('--output', type=str, default="full.csv", help="Name of output file")
parser.add_argument('--save', type=int, default=0, help="save model or not")

_StoreAction(option_strings=['--save'], dest='save', nargs=None, const=None, default=0, type=<class 'int'>, choices=None, required=False, help='save model or not', metavar=None)

In [40]:
torch.cuda.is_available()

True

In [41]:
args = parser.parse_args([])

In [42]:
args

Namespace(model_name='DMEyF_SST', epoch=10, dim=16, lr=0.1, l2=0.01, beta=0.001, head_num=8, use_self=True, use_att=True, agg='sum', act='relu', device='cuda', output='full.csv', save=0)

### 7. Entrenamiento del modelo

#### C. Entrenamiento de prueba

Con self_att & att y con extra norm

Agg sum

Con Balance de clases, penalizando la pérdida

In [43]:
clases_freq = y_bin.value_counts(normalize=True)

class_weights = 1.0 / clases_freq

class_weights = class_weights / class_weights.sum()

print("Pesos de clase:")
print(class_weights)

Pesos de clase:
clase_binaria
0    0.007246
1    0.992754
Name: proportion, dtype: float64


In [68]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

def train(args):
    """
    Train the DATE model with predefined hyperparameters
    
    dtype *args: strings
    """
    # Get configs
    epochs = args.epoch
    dim = args.dim
    lr = args.lr
    weight_decay = args.l2
    head_num = args.head_num
    device = args.device
    act = args.act
    beta = args.beta
    use_self = True
    use_att = True
    agg = "sum"
    
    model = DMEyF_norm(leaf_num, dim, head_num, act=act, device=device,
                  use_self=use_self, use_att=use_att, agg_type=agg).to(device)
    
    model = nn.DataParallel(model, device_ids=[0])

    # Params measurement
    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    # print(model.name)
    print(f'Params to be trained: {params}')
    # Initialize parameters
    # Fills the input Tensor with values according to the method described in 
    # Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010), 
    # Using a uniform distribution.
    
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    # Balanceo de clases por penalizacion de loss
    class_weights_tensor = torch.tensor(class_weights.values, dtype=torch.float).to(device)

    # Optimizer & loss 
    optimizer = Adam(model.parameters(), weight_decay=weight_decay, lr=lr)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

    cls_loss_func = nn.BCELoss(weight=class_weights_tensor[1])
    # cls_loss_func = nn.BCEWithLogitsLoss(pos_weight=class_weights_tensor[1])

    for epoch in range(epochs):
        for step, (batch_feature,batch_cls) in enumerate(train_loader):
            model.train() # prep to train model
            batch_feature, batch_cls = batch_feature.to(device), batch_cls.to(device)
            batch_cls = batch_cls.view(-1,1)

            # Model output
            classification_output, hidden_vector = model(batch_feature)

            # FGSM attack
            adv_vector = fgsm_attack(model, cls_loss_func, hidden_vector, batch_cls, 0.01)
            adv_output = model.module.pred_from_hidden(adv_vector) 

            # Calculate loss
            adv_loss_func = nn.BCELoss(weight=batch_cls)
            # adv_loss_func = nn.BCEWithLogitsLoss(pos_weight=batch_cls)
            adv_loss = beta * adv_loss_func(adv_output, batch_cls) 
            cls_loss = cls_loss_func(classification_output, batch_cls)
            loss = cls_loss + adv_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # if (step+1) % 1000 == 0:  
            #     print("CLS loss:%.4f, ADV loss:%.4f, Loss:%.4f"%(cls_loss.item(), adv_loss.item(),loss.item()))
                
        # Evaluate 
        model.eval()

        print("Train at epoch %s"%(epoch+1))
        _, train_loss = model.module.eval_on_batch(train_loader)

        print("Test at epoch %s"%(epoch+1))
        _, test_loss = model.module.eval_on_batch(test_loader)

        # best_threshold, test_score, roc = find_best_threshold(y_prob, y_test)
        # overall_f1, auc, precisions, recalls, f1s = metrics(y_prob, y_test, best_threshold)
        # select_best = np.mean(f1s)
        # print("Over-all F1:%.4f, AUC:%.4f, F1-top:%.4f, Loss:%.4f"% (overall_f1, auc, select_best, test_loss))

        scheduler.step(test_loss)   
        

    return model

In [None]:
# increasing the numbers of epochs
args.epoch = 50

model = train(args)

torch.save(model, model_path)

Params to be trained: 71505
Train at epoch 1
Pérdida CLS: 0.0430
Test at epoch 1
Pérdida CLS: 0.0428
Train at epoch 2
Pérdida CLS: 0.0470
Test at epoch 2
Pérdida CLS: 0.0469
Train at epoch 3
Pérdida CLS: 0.0534
Test at epoch 3
Pérdida CLS: 0.0532
Train at epoch 4
Pérdida CLS: 0.0438
Test at epoch 4
Pérdida CLS: 0.0436
Train at epoch 5
Pérdida CLS: 0.0542
Test at epoch 5
Pérdida CLS: 0.0540


### 8. Armado del modelo predictivo hídrido

Usando el bosque y combinado con el modelo dl_att

In [63]:
class HybridModel:
    def __init__(self, xgb_model_path, dl_model_path):
        # Load pre-trained XGBoost model for preprocessing
        self.xgb_clf = joblib.load(xgb_model_path)
        
        # Load pre-trained deep learning model for cross-feature embeddings
        self.dl_model = torch.load(dl_model_path)
        self.dl_model.eval()
        
    def process_leaf_idx(self, X_leaves):
        """
        This function is to convert the output of XGBoost model to the input of the deep learning model.
        """
        leaves = X_leaves.copy()
        new_leaf_index = dict()  # Dictionary to store leaf index
        total_leaves = 0
        for c in range(X_leaves.shape[1]):  # Iterate for each column (i.e. 100 trees)
            column = X_leaves[:, c]
            unique_vals = list(sorted(set(column)))
            new_idx = {v: (i + total_leaves) for i, v in enumerate(unique_vals)}
            for i, v in enumerate(unique_vals):
                leaf_id = i + total_leaves
                new_leaf_index[leaf_id] = {c: v}
            leaves[:, c] = [new_idx[v] for v in column]
            total_leaves += len(unique_vals)

        assert leaves.ravel().max() == total_leaves - 1
        return leaves, total_leaves, new_leaf_index

    def get_dataloader(self, X, batch_size=128):
        # Get leaf index from XGBoost model
        X_leaves = self.xgb_clf.apply(X)
        
        # Process leaf indexes
        transformed_leaves, _, _ = self.process_leaf_idx(X_leaves)
        
        # Convert to torch tensor
        transformed_leaves = torch.tensor(transformed_leaves).long()
        
        # Create dataset and dataloader
        x_dataset = Data.TensorDataset(transformed_leaves)
        x_loader = Data.DataLoader(dataset=x_dataset, batch_size=batch_size, shuffle=False)
        
        return x_loader

    def predict_proba(self, X):
        # Get dataloader for input data
        dataloader = self.get_dataloader(X)
        
        # Predict probabilities
        probabilities = []
        with torch.no_grad():
            for batch in dataloader:
                batch_input = batch[0]
                batch_output = self.dl_model(batch_input)[0]  # Extract tensor from tuple
                probs = torch.sigmoid(batch_output).squeeze().cpu().numpy()
                probabilities.extend(probs)
        
        return np.array(probabilities)

    def predict(self, X, threshold=0.5):
        # Predict probabilities
        probas = self.predict_proba(X)
        
        # Convert probabilities to binary predictions based on threshold
        predictions = (probas >= threshold).astype(int)

        return predictions
    
    def count_parameters(self):
        return sum(p.numel() for p in self.dl_model.parameters() if p.requires_grad)
        


Prueba del modelo híbrido

In [64]:
xgb_model_path = 'modelos/xgb_classifier_model.pkl'
dl_model_path = 'modelos/DMEyF_ensamble1730127357.2834532.pkl'

# Instantiate the HybridModel
hybrid_model = HybridModel(xgb_model_path, dl_model_path)

# junio como mes de test
mes_test = 202106

X_ = data[data['foto_mes'] == mes_test]

target_multi = 'clase_ternaria'
X_ = X_.drop(columns=[target_multi])

# Predict probabilities
probas = hybrid_model.predict_proba(X_)
print(probas)

# Predict classes
predictions = hybrid_model.predict(X_)
print(predictions)

  self.dl_model = torch.load(dl_model_path)


[0.502487 0.502487 0.502487 ... 0.502487 0.502487 0.502487]
[1 1 1 ... 1 1 1]


In [62]:
predictions

array([1, 1, 1, ..., 1, 1, 1])

In [65]:
hybrid_model.count_parameters()

71505