# Back-testing

In [None]:
# Librerias
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#######
# rutas
# datasets
from config import dataset_file_fe6_6pqt, dataset_file_fe6_6xpqt
# optimizacion
from config import db_path
# modelos
from config import modelos_path
# predicciones
from config import pred_path

##########
# pipeline
from processing import ModelPipeline, plot_comparisons_on_kaggle_split
from processing import analyze_study
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import optuna
import warnings

# Ignorar advertencias de tipo UserWarning
warnings.filterwarnings('ignore', category=UserWarning, module='pandas')
warnings.filterwarnings('ignore', category=Warning, module='lightgbm')
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')
warnings.filterwarnings('ignore', category=Warning, module='xgboost')
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')

Variables de train y test

In [None]:
# Payoff constants consumed as defaults by the gain function: amount earned per
# stimulated row whose class equals the target, and amount lost per any other
# stimulated row.
ganancia_acierto = 273000
costo_estimulo = 7000

# Every monthly period (YYYYMM) from 201901 through 202106.
mes_train_all = [201901, 201902, 201903, 201904, 201905, 201906, 201907, 201908,
                 201909, 201910, 201911, 201912, 202001, 202002, 202003, 202004,
                 202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012,
                 202101, 202102, 202103, 202104, 202105, 202106]

# Trailing training windows, all ending at 202106.
mes_train_ult_3_meses = [202104, 202105, 202106]

mes_train_ult_6_meses = [202101, 202102, 202103, 202104, 202105, 202106]

# NOTE(review): the name says 9 months but the list holds 10 periods
# (202009..202106) — confirm which one is intended.
mes_train_ult_9_meses = [202009, 202010, 202011, 202012, 202101, 202102, 202103, 202104, 202105, 202106]

# NOTE(review): the name says one year but the list holds 13 periods
# (202006..202106) — confirm which one is intended.
mes_train_ult_anio = [202006, 202007, 202008, 202009, 202010, 202011, 202012,
                 202101, 202102, 202103, 202104, 202105, 202106]

mes_train = [202106]
mes_test = 202108

# Single probability cut-off; the back-testing cells visible in this notebook
# sweep a range of thresholds instead of reading this value.
threshold = 0.025

# Seed list; its smallest and largest values (217163, 455783) are the bounds of
# the seed ranges used by the back-testing loops.
semillas = [437809, 327347, 392879, 455783, 217163]

## Loading data

In [None]:
# Integer encoding of the ternary target.
label_mapping = {'CONTINUA': 0, 'BAJA+1': 1, 'BAJA+2': 2}

data = pd.read_parquet(dataset_file_fe6_6pqt)
data['clase_ternaria'] = data['clase_ternaria'].map(label_mapping)

# Kaggle simulation: train on 202101-202104 and hold out a later month as test.
mes_futuro = 202106  # month used as test
filas_train = data['foto_mes'].isin([202101, 202102, 202103, 202104])
filas_test = data['foto_mes'] == mes_futuro

y_train = data.loc[filas_train, 'clase_ternaria']
X_train = data.loc[filas_train].drop(columns=['clase_ternaria'])

y_test = data.loc[filas_test, 'clase_ternaria']
X_test = data.loc[filas_test].drop(columns=['clase_ternaria'])

# Release the full dataset (and the temporary row masks) once the splits exist.
del data, filas_train, filas_test

Preprocesando data

In [None]:
# Drop the columns that are entirely NaN in the training window: SimpleImputer
# discards such features by default, which would make its output narrower than
# the column labels applied below.
cols_with_all_nan = X_train.columns[X_train.isna().all()].tolist()
print("Columns with all NaN values:", cols_with_all_nan)
X_train = X_train.drop(columns=cols_with_all_nan)
X_test = X_test.drop(columns=cols_with_all_nan)

# Median imputation fitted on train only and re-applied to test.
# The original row index is carried over so the imputed frames stay aligned
# with y_train / y_test (which keep the index of the source data) instead of
# silently switching to a fresh 0..n-1 index.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X_train_imp = pd.DataFrame(imp_median.fit_transform(X_train),
                           columns=X_train.columns, index=X_train.index)
X_test_imp = pd.DataFrame(imp_median.transform(X_test),
                          columns=X_train.columns, index=X_test.index)

del X_train
del X_test

# Any column left with object dtype is handed to LightGBM as 'category'.
categorical_features = [col for col in X_train_imp.columns if X_train_imp[col].dtype == 'object']

for col in categorical_features:
    X_train_imp[col] = X_train_imp[col].astype('category')
    X_test_imp[col] = X_test_imp[col].astype('category')

Función de ganancia

In [None]:
def ganancia(y, y_hat, thr,
             ganancia_acierto = ganancia_acierto,
             costo_estimulo = costo_estimulo,
             target = 2, prop=1):
    """Total gain obtained by stimulating every row scored at or above ``thr``.

    Parameters
    ----------
    y : array-like
        True class of each row.
    y_hat : array-like
        Score of each row, compared against ``thr``.
    thr : float
        Rows with ``y_hat >= thr`` are stimulated.
    ganancia_acierto : number
        Amount added for each stimulated row whose class equals ``target``.
    costo_estimulo : number
        Amount subtracted for each stimulated row of any other class.
    target : int
        Class value counted as a hit.
    prop : number
        Divisor applied to the summed gain.

    Returns
    -------
    The summed per-row gain divided by ``prop``.
    """
    estimulado = y_hat >= thr
    acierto = y == target

    # Pay-off each row would yield if stimulated; rows below the threshold yield 0.
    pago_si_estimulado = np.where(acierto, ganancia_acierto, -costo_estimulo)
    gains = np.where(estimulado, pago_si_estimulado, 0)

    return gains.sum()/prop

## Modelos candidatos

Luego de una comparación de modelos candidatos en comp02_pipeline_comp

### Modelo **semillero de Denicolay** (modificado)

semillerio_params = {'n_estimators': 23,
                  'num_leaves': 32,
                  'learning_rate': 0.34,
                  'min_data_in_leaf': 711,
                  'feature_fraction': 2*0.2, # x2 para tratar de compensar la falta de variables
                  'extra_trees': False,
}

### Modelo **semillero de Denicolay** sobre datos x

semillerio_params = {'n_estimators': 23,
                  'num_leaves': 32,
                  'learning_rate': 0.34,
                  'min_data_in_leaf': 711,
                  'feature_fraction': 0.2,
                  'extra_trees': False,
}

### Modelo **semillero de Denicolay** sobre datos x (modificado)

semillerio_params = {'n_estimators': 23,
                  'num_leaves': 32,
                  'learning_rate': 0.34,
                  'min_data_in_leaf': 711,
                  'feature_fraction': 2*0.2, # x2 para tratar de compensar la falta de variables
                  'extra_trees': False,
}

Visualización

In [None]:
def plot_ganancias_estimulos(df_sem):
    """Plot gain versus threshold for every semillerio plus their average.

    Parameters
    ----------
    df_sem : pandas.DataFrame
        Must contain a ``threshold`` column, one ``ganancias_*`` column per
        semillerio and, for the annotation, the matching ``estimulos_*``
        columns.

    Raises
    ------
    ValueError
        If ``df_sem`` has no ``ganancias_*`` column.

    Each ``ganancias_*`` column is drawn in gray, the row-wise mean is drawn as
    a dashed black line, and the maximum of that mean is marked in red and
    annotated with its gain, threshold and mean number of stimuli.
    """
    ganancias_cols = [col for col in df_sem.columns if col.startswith('ganancias_')]
    estimulos_cols = [col for col in df_sem.columns if col.startswith('estimulos_')]
    if not ganancias_cols:
        raise ValueError("df_sem has no 'ganancias_*' columns to plot")

    # The threshold axis is shared by every curve, so it is read once up front
    # rather than being bound as a side effect of the plotting loop.
    threshold = df_sem['threshold']

    fig, ax1 = plt.subplots()

    # Individual semillerios in gray.
    for col in ganancias_cols:
        ax1.plot(threshold, df_sem[col], color='gray')

    # Row-wise averages across semillerios.
    ganancias_avg = df_sem[ganancias_cols].mean(axis=1)
    estimulos_avg = df_sem[estimulos_cols].mean(axis=1)

    ax1.plot(threshold, ganancias_avg, label='Ganancias Promedio', color='black', linestyle='--')

    # Highlight the maximum average gain; idxmax returns an index label, so
    # every lookup below is label-based.
    max_gain_idx = ganancias_avg.idxmax()
    max_gain_threshold = threshold.loc[max_gain_idx]
    max_gain = ganancias_avg.loc[max_gain_idx]
    max_estimulos = estimulos_avg.loc[max_gain_idx]

    ax1.scatter(max_gain_threshold, max_gain, color='red', zorder=5)
    ax1.annotate(f"Max Gain: {max_gain}\nThresh: {max_gain_threshold:.3f}\nEstim: {max_estimulos}",
                 (max_gain_threshold, max_gain), textcoords="offset points", xytext=(0,-20), ha='center')

    ax1.set_xlabel('Threshold')
    ax1.set_ylabel('Ganancias')
    ax1.legend()
    plt.title('Ganancias vs Threshold para cada Semillerio')
    plt.show()

## Evaluación de semillerios 

**Modelo semillero**

Con las distintas semillas, según Denicolay modificado

In [None]:
from lightgbm import LGBMClassifier

# LightGBM hyper-parameters of the modified "Denicolay" semillerio.
semillero_params = {'n_estimators': 23,
                    'num_leaves': 32,
                    'learning_rate': 0.34,
                    'min_data_in_leaf': 711,
                    'feature_fraction': 2*0.2,  # doubled to try to compensate for the missing variables
                    'extra_trees': False,
                    'n_jobs': -1,
}

print("Running back-testing for Semillerio Denicolay")
print(semillero_params)

label_antimapping = {0:'CONTINUA', 1:'BAJA+1', 2:'BAJA+2'}

# Base frame describing the test rows; every semillerio starts from a copy of
# it, so this frame itself never receives probability columns.
df_s_proba = pd.DataFrame({
                            'client': y_test.index,
                            'baja': y_test.values,
                        })
df_s_proba['clase_ternaria'] = df_s_proba['baja'].map(label_antimapping)

# Per-threshold gains and stimuli, one pair of columns per semillerio.
df_semillerios = pd.DataFrame()

# Best operating point of each semillerio, collected for the summary table.
semillerio = []
l_max_gan = []
l_max_gan_thr = []
l_max_gan_esti = []

# Outer range: one semillerio per value. Inner range: base seeds of the models
# averaged inside each semillerio.
r_sem = range(217163, 455783, 7*7*7*7*10)
sem_total = len(r_sem)
r_s = range(217163, 455783, 7*7*7*7*4)
s_total = len(r_s)

for i, sem in enumerate(r_sem):
    print(f"# Semillerio: {sem}, {i+1} de {sem_total}")

    # Work on a fresh copy: aliasing df_s_proba made the probability columns
    # (and the previous mean) of earlier semillerios leak into this average.
    df_s = df_s_proba.copy()

    for j, s in enumerate(r_s):
        # The seed depends on both the model position and the semillerio.
        # NOTE(review): (7+j)**i grows quickly (up to 31**9 in the last
        # semillerio) — confirm LightGBM accepts seeds of that size.
        seed = s+(7+j)**i
        model = LGBMClassifier(**semillero_params, random_state=seed)
        print(f"Entrenando modelo con semilla: {seed}, {j+1} de {s_total}")
        model.fit(X_train_imp, y_train)
        # Keep only the third probability column (class 2, i.e. BAJA+2).
        y_pred_proba = model.predict_proba(X_test_imp)
        df_s[f'proba_s{seed}'] = y_pred_proba[:, 2]

    # Average the per-seed probabilities only: the pattern requires digits
    # after 'proba_s', so 'proba_sem_mean' can never be picked up.
    proba_s_columns = df_s.filter(regex=r'^proba_s\d+$')
    df_s['proba_sem_mean'] = proba_s_columns.mean(axis=1)

    # Test rows with the averaged probability of this semillerio.
    test_results = pd.DataFrame({
                            'client': y_test.index,
                            'baja': y_test.values,
                            'proba_sem_baja2' : df_s['proba_sem_mean'].values
                        })
    test_results['clase_ternaria'] = test_results['baja'].map(label_antimapping)

    # Gain and number of stimulated rows for every candidate threshold.
    thrs = list(np.linspace(0.01, 0.05, 100))
    ganancias = [ganancia(test_results.baja, test_results.proba_sem_baja2, thr) for thr in thrs]
    estimulos = [(test_results.proba_sem_baja2 >= thr).sum() for thr in thrs]

    df_semillerios['threshold'] = thrs  # identical on every iteration
    df_semillerios[f'ganancias_{sem}'] = ganancias
    df_semillerios[f'estimulos_{sem}'] = estimulos

    # Threshold, gain and stimuli at the maximum gain of this semillerio.
    max_gan_idx = df_semillerios[f'ganancias_{sem}'].idxmax()
    max_gan_thr = df_semillerios['threshold'][max_gan_idx]
    max_gan = df_semillerios[f'ganancias_{sem}'][max_gan_idx]
    max_estimulos = df_semillerios[f'estimulos_{sem}'][max_gan_idx]

    print(f"sem: {sem}, ganancia max: {max_gan}, thr: {max_gan_thr}, estimulos: {max_estimulos}")

    semillerio.append(sem)
    l_max_gan_thr.append(max_gan_thr)
    l_max_gan.append(max_gan)
    l_max_gan_esti.append(max_estimulos)

# One row per semillerio with its best operating point.
df_semillerios_summary = pd.DataFrame({
                            'semillerio': semillerio,
                            'thr_max_gan': l_max_gan_thr,
                            'max_gan': l_max_gan,
                            'est_max_gan': l_max_gan_esti,
                        })

In [None]:
# Preview the per-threshold gains/stimuli table.
df_semillerios.head()

In [None]:
# Preview the best operating point recorded for each semillerio.
df_semillerios_summary.head()

In [None]:
# Write the per-threshold gains/stimuli table next to the other predictions.
sem_name = 'df_semillerios_mod.csv'
sem_file = f"{pred_path}{sem_name}"

df_semillerios.to_csv(sem_file)
print(f"Semillerio guardado en {sem_file}")

In [None]:
# Write the summary table to its own file. The path must be built from
# sem_summ_name: using sem_name here would overwrite the gains CSV.
sem_summ_name = 'df_semillerios_summ_mod.csv'
sem_summ_file = pred_path + sem_summ_name

df_semillerios_summary.to_csv(sem_summ_file)
print(f"Semillerio summary guardado en {sem_summ_file}")

In [None]:
# Gain-vs-threshold curves of every semillerio in df_semillerios and their average.
plot_ganancias_estimulos(df_semillerios)

In [None]:
# Drop the imputed frames of the base dataset before the X dataset is loaded.
del X_train_imp
del X_test_imp

### Datos X

In [None]:
# Integer encoding of the ternary target.
label_mapping = {'CONTINUA': 0, 'BAJA+1': 1, 'BAJA+2': 2}

data = pd.read_parquet(dataset_file_fe6_6xpqt)
data['clase_ternaria'] = data['clase_ternaria'].map(label_mapping)

# Kaggle simulation: train on 202101-202104 and hold out a later month as test.
mes_futuro = 202106  # month used as test
filas_train = data['foto_mes'].isin([202101, 202102, 202103, 202104])
filas_test = data['foto_mes'] == mes_futuro

y_train = data.loc[filas_train, 'clase_ternaria']
X_train = data.loc[filas_train].drop(columns=['clase_ternaria'])

y_test = data.loc[filas_test, 'clase_ternaria']
X_test = data.loc[filas_test].drop(columns=['clase_ternaria'])

# Release the full dataset (and the temporary row masks) once the splits exist.
del data, filas_train, filas_test

Preprocesando data

In [None]:
# Drop the columns that are entirely NaN in the training window: SimpleImputer
# discards such features by default, which would make its output narrower than
# the column labels applied below.
cols_with_all_nan = X_train.columns[X_train.isna().all()].tolist()
print("Columns with all NaN values:", cols_with_all_nan)
X_train = X_train.drop(columns=cols_with_all_nan)
X_test = X_test.drop(columns=cols_with_all_nan)

# Median imputation fitted on train only and re-applied to test.
# The original row index is carried over so the imputed frames stay aligned
# with y_train / y_test (which keep the index of the source data) instead of
# silently switching to a fresh 0..n-1 index.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X_train_imp = pd.DataFrame(imp_median.fit_transform(X_train),
                           columns=X_train.columns, index=X_train.index)
X_test_imp = pd.DataFrame(imp_median.transform(X_test),
                          columns=X_train.columns, index=X_test.index)

del X_train
del X_test

# Any column left with object dtype is handed to LightGBM as 'category'.
categorical_features = [col for col in X_train_imp.columns if X_train_imp[col].dtype == 'object']

for col in categorical_features:
    X_train_imp[col] = X_train_imp[col].astype('category')
    X_test_imp[col] = X_test_imp[col].astype('category')

**Modelo semillero**

Con datos X, según Denicolay modificado

In [None]:
from lightgbm import LGBMClassifier

# LightGBM hyper-parameters of the modified "Denicolay" semillerio.
semillero_params = {'n_estimators': 23,
                    'num_leaves': 32,
                    'learning_rate': 0.34,
                    'min_data_in_leaf': 711,
                    'feature_fraction': 2*0.2,  # doubled to try to compensate for the missing variables
                    'extra_trees': False,
                    'n_jobs': -1,
}

print("Running back-testing for Semillerio Denicolay")
print(semillero_params)

label_antimapping = {0:'CONTINUA', 1:'BAJA+1', 2:'BAJA+2'}

# Base frame describing the test rows; every semillerio starts from a copy of
# it, so this frame itself never receives probability columns.
df_s_proba = pd.DataFrame({
                            'client': y_test.index,
                            'baja': y_test.values,
                        })
df_s_proba['clase_ternaria'] = df_s_proba['baja'].map(label_antimapping)

# Per-threshold gains and stimuli, one pair of columns per semillerio.
df_semillerios_modx = pd.DataFrame()

# Best operating point of each semillerio, collected for the summary table.
semillerio = []
l_max_gan = []
l_max_gan_thr = []
l_max_gan_esti = []

# Outer range: one semillerio per value. Inner range: base seeds of the models
# averaged inside each semillerio.
r_sem = range(217163, 455783, 7*7*7*7*10)
sem_total = len(r_sem)
r_s = range(217163, 455783, 7*7*7*7*4)
s_total = len(r_s)

for i, sem in enumerate(r_sem):
    print(f"# Semillerio: {sem}, {i+1} de {sem_total}")

    # Work on a fresh copy: aliasing df_s_proba made the probability columns
    # (and the previous mean) of earlier semillerios leak into this average.
    df_s = df_s_proba.copy()

    for j, s in enumerate(r_s):
        # The seed depends on both the model position and the semillerio.
        # NOTE(review): (7+j)**i grows quickly (up to 31**9 in the last
        # semillerio) — confirm LightGBM accepts seeds of that size.
        seed = s+(7+j)**i
        model = LGBMClassifier(**semillero_params, random_state=seed)
        print(f"Entrenando modelo con semilla: {seed}, {j+1} de {s_total}")
        model.fit(X_train_imp, y_train)
        # Keep only the third probability column (class 2, i.e. BAJA+2).
        y_pred_proba = model.predict_proba(X_test_imp)
        df_s[f'proba_s{seed}'] = y_pred_proba[:, 2]

    # Average the per-seed probabilities only: the pattern requires digits
    # after 'proba_s', so 'proba_sem_mean' can never be picked up.
    proba_s_columns = df_s.filter(regex=r'^proba_s\d+$')
    df_s['proba_sem_mean'] = proba_s_columns.mean(axis=1)

    # Test rows with the averaged probability of this semillerio.
    test_results = pd.DataFrame({
                            'client': y_test.index,
                            'baja': y_test.values,
                            'proba_sem_baja2' : df_s['proba_sem_mean'].values
                        })
    test_results['clase_ternaria'] = test_results['baja'].map(label_antimapping)

    # Gain and number of stimulated rows for every candidate threshold.
    thrs = list(np.linspace(0.01, 0.05, 100))
    ganancias = [ganancia(test_results.baja, test_results.proba_sem_baja2, thr) for thr in thrs]
    estimulos = [(test_results.proba_sem_baja2 >= thr).sum() for thr in thrs]

    df_semillerios_modx['threshold'] = thrs  # identical on every iteration
    df_semillerios_modx[f'ganancias_{sem}'] = ganancias
    df_semillerios_modx[f'estimulos_{sem}'] = estimulos

    # Threshold, gain and stimuli at the maximum gain of this semillerio.
    max_gan_idx = df_semillerios_modx[f'ganancias_{sem}'].idxmax()
    max_gan_thr = df_semillerios_modx['threshold'][max_gan_idx]
    max_gan = df_semillerios_modx[f'ganancias_{sem}'][max_gan_idx]
    max_estimulos = df_semillerios_modx[f'estimulos_{sem}'][max_gan_idx]

    print(f"sem: {sem}, ganancia max: {max_gan}, thr: {max_gan_thr}, estimulos: {max_estimulos}")

    semillerio.append(sem)
    l_max_gan_thr.append(max_gan_thr)
    l_max_gan.append(max_gan)
    l_max_gan_esti.append(max_estimulos)

# One row per semillerio with its best operating point.
df_semillerios_summary = pd.DataFrame({
                            'semillerio': semillerio,
                            'thr_max_gan': l_max_gan_thr,
                            'max_gan': l_max_gan,
                            'est_max_gan': l_max_gan_esti,
                        })

In [None]:
# Preview the per-threshold gains/stimuli table obtained on the X data.
df_semillerios_modx.head()

In [None]:
# Preview the best operating point recorded for each semillerio.
df_semillerios_summary.head()

In [None]:
# Write the per-threshold gains/stimuli table next to the other predictions.
sem_name = 'df_semillerios_modx.csv'
sem_file = f"{pred_path}{sem_name}"

df_semillerios_modx.to_csv(sem_file)
print(f"Semillerio guardado en {sem_file}")

In [None]:
# Write the summary table to its own file. The path must be built from
# sem_summ_name: using sem_name here would overwrite the gains CSV.
sem_summ_name = 'df_semillerios_summ_modx.csv'
sem_summ_file = pred_path + sem_summ_name

df_semillerios_summary.to_csv(sem_summ_file)
print(f"Semillerio summary guardado en {sem_summ_file}")

In [None]:
# Gain-vs-threshold curves of every semillerio in df_semillerios_modx and their average.
plot_ganancias_estimulos(df_semillerios_modx)

**Modelo semillero**

Con datos X y las distintas semillas, según Denicolay std (`feature_fraction = 0.2`)

In [None]:
from lightgbm import LGBMClassifier

# LightGBM hyper-parameters of the standard "Denicolay" semillerio.
semillero_params = {'n_estimators': 23,
                    'num_leaves': 32,
                    'learning_rate': 0.34,
                    'min_data_in_leaf': 711,
                    'feature_fraction': 0.2,  # exactly as shown by the instructor
                    'extra_trees': False,
                    'n_jobs': -1,
}

print("Running back-testing for Semillerio Denicolay")
print(semillero_params)

label_antimapping = {0:'CONTINUA', 1:'BAJA+1', 2:'BAJA+2'}

# Base frame describing the test rows; every semillerio starts from a copy of
# it, so this frame itself never receives probability columns.
df_s_proba = pd.DataFrame({
                            'client': y_test.index,
                            'baja': y_test.values,
                        })
df_s_proba['clase_ternaria'] = df_s_proba['baja'].map(label_antimapping)

# Per-threshold gains and stimuli, one pair of columns per semillerio.
df_semillerios_x = pd.DataFrame()

# Best operating point of each semillerio, collected for the summary table.
semillerio = []
l_max_gan = []
l_max_gan_thr = []
l_max_gan_esti = []

# Outer range: one semillerio per value. Inner range: base seeds of the models
# averaged inside each semillerio.
r_sem = range(217163, 455783, 7*7*7*7*10)
sem_total = len(r_sem)
r_s = range(217163, 455783, 7*7*7*7*4)
s_total = len(r_s)

for i, sem in enumerate(r_sem):
    print(f"# Semillerio: {sem}, {i+1} de {sem_total}")

    # Work on a fresh copy: aliasing df_s_proba made the probability columns
    # (and the previous mean) of earlier semillerios leak into this average.
    df_s = df_s_proba.copy()

    for j, s in enumerate(r_s):
        # The seed depends on both the model position and the semillerio.
        # NOTE(review): (7+j)**i grows quickly (up to 31**9 in the last
        # semillerio) — confirm LightGBM accepts seeds of that size.
        seed = s+(7+j)**i
        model = LGBMClassifier(**semillero_params, random_state=seed)
        print(f"Entrenando modelo con semilla: {seed}, {j+1} de {s_total}")
        model.fit(X_train_imp, y_train)
        # Keep only the third probability column (class 2, i.e. BAJA+2).
        y_pred_proba = model.predict_proba(X_test_imp)
        df_s[f'proba_s{seed}'] = y_pred_proba[:, 2]

    # Average the per-seed probabilities only: the pattern requires digits
    # after 'proba_s', so 'proba_sem_mean' can never be picked up.
    proba_s_columns = df_s.filter(regex=r'^proba_s\d+$')
    df_s['proba_sem_mean'] = proba_s_columns.mean(axis=1)

    # Test rows with the averaged probability of this semillerio.
    test_results = pd.DataFrame({
                            'client': y_test.index,
                            'baja': y_test.values,
                            'proba_sem_baja2' : df_s['proba_sem_mean'].values
                        })
    test_results['clase_ternaria'] = test_results['baja'].map(label_antimapping)

    # Gain and number of stimulated rows for every candidate threshold.
    thrs = list(np.linspace(0.01, 0.05, 100))
    ganancias = [ganancia(test_results.baja, test_results.proba_sem_baja2, thr) for thr in thrs]
    estimulos = [(test_results.proba_sem_baja2 >= thr).sum() for thr in thrs]

    df_semillerios_x['threshold'] = thrs  # identical on every iteration
    df_semillerios_x[f'ganancias_{sem}'] = ganancias
    df_semillerios_x[f'estimulos_{sem}'] = estimulos

    # Threshold, gain and stimuli at the maximum gain of this semillerio.
    max_gan_idx = df_semillerios_x[f'ganancias_{sem}'].idxmax()
    max_gan_thr = df_semillerios_x['threshold'][max_gan_idx]
    max_gan = df_semillerios_x[f'ganancias_{sem}'][max_gan_idx]
    max_estimulos = df_semillerios_x[f'estimulos_{sem}'][max_gan_idx]

    print(f"sem: {sem}, ganancia max: {max_gan}, thr: {max_gan_thr}, estimulos: {max_estimulos}")

    semillerio.append(sem)
    l_max_gan_thr.append(max_gan_thr)
    l_max_gan.append(max_gan)
    l_max_gan_esti.append(max_estimulos)

# One row per semillerio with its best operating point.
df_semillerios_summary = pd.DataFrame({
                            'semillerio': semillerio,
                            'thr_max_gan': l_max_gan_thr,
                            'max_gan': l_max_gan,
                            'est_max_gan': l_max_gan_esti,
                        })

In [None]:
# Preview the per-threshold gains/stimuli table of the feature_fraction=0.2 run.
df_semillerios_x.head()

In [None]:
# Preview the best operating point recorded for each semillerio.
df_semillerios_summary.head()

In [None]:
# Write the per-threshold gains/stimuli table next to the other predictions.
sem_name = 'df_semillerios_x.csv'
sem_file = f"{pred_path}{sem_name}"

df_semillerios_x.to_csv(sem_file)
print(f"Semillerio guardado en {sem_file}")

In [None]:
# Write the summary table to its own file. The path must be built from
# sem_summ_name: using sem_name here would overwrite the gains CSV.
sem_summ_name = 'df_semillerios_summ_x.csv'
sem_summ_file = pred_path + sem_summ_name

df_semillerios_summary.to_csv(sem_summ_file)
print(f"Semillerio summary guardado en {sem_summ_file}")

In [None]:
# Gain-vs-threshold curves of every semillerio in df_semillerios_x and their average.
plot_ganancias_estimulos(df_semillerios_x)

In [None]:
# Drop the imputed frames of the X dataset; only the result tables are needed from here on.
del X_train_imp
del X_test_imp

### Comparación de semillerios

Con métodos visuales

In [None]:
def plot_ganancias_todas_superpuestas(df_sem1, df_sem2, nombres_modelos=('xgb sem', 'denicolay sem')):
    """Overlay the gain-vs-threshold curves of two semillerio result tables.

    Parameters
    ----------
    df_sem1, df_sem2 : pandas.DataFrame
        Each must contain a ``threshold`` column and one or more
        ``ganancias_*`` columns.
    nombres_modelos : sequence of two str, optional
        Legend/annotation label for ``df_sem1`` and ``df_sem2`` respectively.
        The default keeps the labels this function has always used; pass the
        real names when comparing other models.

    Raises
    ------
    ValueError
        If ``nombres_modelos`` does not contain exactly two labels.

    For each table the individual ``ganancias_*`` curves are drawn faintly,
    their row-wise mean is drawn as a thick line, and the maximum of that mean
    is marked and annotated. The first table is blue, the second green.
    """
    if len(nombres_modelos) != 2:
        raise ValueError("nombres_modelos must contain exactly two labels")

    fig, ax1 = plt.subplots()

    colores = ['blue', 'green']
    modelos = [df_sem1, df_sem2]

    for i, (df_sem, color, nombre_modelo) in enumerate(zip(modelos, colores, nombres_modelos)):
        threshold = df_sem['threshold']
        ganancias_cols = [col for col in df_sem.columns if col.startswith('ganancias_')]

        # Individual semillerios, nearly transparent.
        for col in ganancias_cols:
            ax1.plot(threshold, df_sem[col], color=color, alpha=0.1)

        # Average across semillerios.
        ganancias_avg = df_sem[ganancias_cols].mean(axis=1)
        ax1.plot(threshold, ganancias_avg, label=f'Ganancias Promedio {nombre_modelo}', color=color, linewidth=2)

        # idxmax returns an index label, so the lookups are label-based (.loc);
        # positional .iloc would only be right for a default 0..n-1 index.
        max_gain_idx = ganancias_avg.idxmax()
        max_gain_threshold = threshold.loc[max_gain_idx]
        max_gain = ganancias_avg.loc[max_gain_idx]

        ax1.scatter(max_gain_threshold, max_gain, color=color, zorder=5)
        # Stagger the annotations vertically so the two labels do not overlap.
        ax1.annotate(f"Max Gain {nombre_modelo}: {max_gain:.2f}\nThresh: {max_gain_threshold:.3f}",
                     (max_gain_threshold, max_gain), textcoords="offset points", xytext=(0,-20*(i+1)), ha='center', color=color)

    ax1.set_xlabel('Threshold')
    ax1.set_ylabel('Ganancias')
    ax1.legend()
    plt.title('Comparación de Ganancias vs Threshold entre Modelos')
    plt.show()

In [None]:
# Base data vs X data, both with the modified parameters.
# NOTE(review): the legend labels used by the function ('xgb sem', 'denicolay sem') do not describe these frames.
plot_ganancias_todas_superpuestas(df_semillerios, df_semillerios_modx)

In [None]:
# X data: modified parameters vs feature_fraction=0.2.
# NOTE(review): the legend labels used by the function ('xgb sem', 'denicolay sem') do not describe these frames.
plot_ganancias_todas_superpuestas(df_semillerios_modx, df_semillerios_x)

In [None]:
# Base data with modified parameters vs X data with feature_fraction=0.2.
# NOTE(review): the legend labels used by the function ('xgb sem', 'denicolay sem') do not describe these frames.
plot_ganancias_todas_superpuestas(df_semillerios, df_semillerios_x)

In [None]:
def plot_ganancias_promedio_superpuestas(df_sem1, df_sem2, nombres_modelos=('xgb sem', 'denicolay sem')):
    """Overlay the average gain-vs-threshold curve of two semillerio result tables.

    Parameters
    ----------
    df_sem1, df_sem2 : pandas.DataFrame
        Each must contain a ``threshold`` column and one or more
        ``ganancias_*`` columns.
    nombres_modelos : sequence of two str, optional
        Legend/annotation label for ``df_sem1`` and ``df_sem2`` respectively.
        The default keeps the labels this function has always used; pass the
        real names when comparing other models.

    Raises
    ------
    ValueError
        If ``nombres_modelos`` does not contain exactly two labels.

    For each table only the row-wise mean of the ``ganancias_*`` columns is
    drawn, and its maximum is marked and annotated. The first table is blue,
    the second green.
    """
    if len(nombres_modelos) != 2:
        raise ValueError("nombres_modelos must contain exactly two labels")

    fig, ax1 = plt.subplots()

    colores = ['blue', 'green']
    modelos = [df_sem1, df_sem2]

    for i, (df_sem, color, nombre_modelo) in enumerate(zip(modelos, colores, nombres_modelos)):
        threshold = df_sem['threshold']

        # Average across semillerios.
        ganancias_cols = [col for col in df_sem.columns if col.startswith('ganancias_')]
        ganancias_avg = df_sem[ganancias_cols].mean(axis=1)

        ax1.plot(threshold, ganancias_avg, label=f'Ganancias Promedio {nombre_modelo}', color=color)

        # idxmax returns an index label, so the lookups are label-based (.loc);
        # positional .iloc would only be right for a default 0..n-1 index.
        max_gain_idx = ganancias_avg.idxmax()
        max_gain_threshold = threshold.loc[max_gain_idx]
        max_gain = ganancias_avg.loc[max_gain_idx]

        ax1.scatter(max_gain_threshold, max_gain, color=color, zorder=5)
        # Stagger the annotations vertically so the two labels do not overlap.
        ax1.annotate(f"Max Gain {nombre_modelo}: {max_gain:.2f}\nThresh: {max_gain_threshold:.3f}",
                     (max_gain_threshold, max_gain), textcoords="offset points", xytext=(0,-20*(i+1)), ha='center', color=color)

    ax1.set_xlabel('Threshold')
    ax1.set_ylabel('Ganancias Promedio')
    ax1.legend()
    plt.title('Comparación de Ganancias Promedio vs Threshold entre Modelos')
    plt.show()

In [None]:
# Average curves only: base data vs X data, both with the modified parameters.
# NOTE(review): the legend labels used by the function ('xgb sem', 'denicolay sem') do not describe these frames.
plot_ganancias_promedio_superpuestas(df_semillerios, df_semillerios_modx)

In [None]:
# Average curves only, X data: modified parameters vs feature_fraction=0.2.
# NOTE(review): the legend labels used by the function ('xgb sem', 'denicolay sem') do not describe these frames.
plot_ganancias_promedio_superpuestas(df_semillerios_modx, df_semillerios_x)

In [None]:
# Average curves only: base data with modified parameters vs X data with feature_fraction=0.2.
# NOTE(review): the legend labels used by the function ('xgb sem', 'denicolay sem') do not describe these frames.
plot_ganancias_promedio_superpuestas(df_semillerios, df_semillerios_x)

______