# Back-testing

In [1]:
# Librerias
import os
import pandas as pd
import numpy as np

#######
# rutas
# datasets
from config import dataset_file_fe6_6pqt
# optimizacion
from config import db_path
# modelos
from config import modelos_path
# predicciones
from config import pred_path

##########
# pipeline
from processing import ModelPipeline, plot_comparisons_on_kaggle_split
from processing import analyze_study
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import optuna
import warnings

# Ignorar advertencias de tipo UserWarning
warnings.filterwarnings('ignore', category=UserWarning, module='pandas')
warnings.filterwarnings('ignore', category=Warning, module='lightgbm')
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')
warnings.filterwarnings('ignore', category=Warning, module='xgboost')
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')

Variables de train y test

In [2]:
# Payoff per stimulated client: gain when the client belongs to the target
# class, cost otherwise. Both are the default arguments of `ganancia`.
ganancia_acierto = 273000
costo_estimulo = 7000

# Every month available for training, 201901 through 202106 (30 months).
mes_train_all = [201901, 201902, 201903, 201904, 201905, 201906, 201907, 201908,
                 201909, 201910, 201911, 201912, 202001, 202002, 202003, 202004,
                 202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012,
                 202101, 202102, 202103, 202104, 202105, 202106]

# Trailing training windows, all ending at 202106. They are sliced from
# `mes_train_all` so each one contains exactly the number of months its name
# states (3, 6, 9 and 12).
mes_train_ult_3_meses = mes_train_all[-3:]

mes_train_ult_6_meses = mes_train_all[-6:]

mes_train_ult_9_meses = mes_train_all[-9:]

mes_train_ult_anio = mes_train_all[-12:]

mes_train = [202106]
mes_test = 202108

# Reference probability cutoff.
threshold = 0.025

# Base seeds.
semillas = [437809, 327347, 392879, 455783, 217163]

## Loading data

In [3]:
# Read the dataset from the local copy. To use the path declared in the
# project configuration instead, switch to:
# data = pd.read_parquet(dataset_file_fe6_6pqt)
data = pd.read_parquet("datos/datasets_competencia_02_fe6_6_6m_train.parquet")

# Encode the ternary class labels as integers.
label_mapping = {'CONTINUA': 0, 'BAJA+1': 1, 'BAJA+2': 2}
data['clase_ternaria'] = data['clase_ternaria'].map(label_mapping)

# Kaggle-style simulation: fit on four months, hold out a later month.
en_train = data['foto_mes'].isin([202101, 202102, 202103, 202104])
y_train = data.loc[en_train, 'clase_ternaria']
X_train = data.loc[en_train].drop(columns=['clase_ternaria'])

mes_futuro = 202106  # used as the test month
en_test = data['foto_mes'] == mes_futuro
y_test = data.loc[en_test, 'clase_ternaria']
X_test = data.loc[en_test].drop(columns=['clase_ternaria'])

# Only the splits are needed from here on; release the full frame and masks.
del data, en_train, en_test

Preprocesando data

In [4]:
# Drop the features that are entirely NaN in the training window. The same
# columns are removed from the test split so both frames keep identical
# columns. (SimpleImputer discards all-NaN columns on fit by default, which
# would otherwise break the `columns=X_train.columns` relabelling below.)
cols_with_all_nan = X_train.columns[X_train.isna().all()].tolist()
print("Columns with all NaN values:", cols_with_all_nan)
X_train = X_train.drop(columns=cols_with_all_nan)
X_test = X_test.drop(columns=cols_with_all_nan)

# Median imputation fitted on train only and applied to both splits.
# The original row index is passed back explicitly: without it the imputed
# frames get a fresh RangeIndex and no longer line up by label with
# y_train / y_test, which still carry the dataset's index.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X_train_imp = pd.DataFrame(imp_median.fit_transform(X_train),
                           columns=X_train.columns, index=X_train.index)
X_test_imp = pd.DataFrame(imp_median.transform(X_test),
                          columns=X_train.columns, index=X_test.index)

# The un-imputed frames are no longer needed.
del X_train
del X_test

# Columns still typed as `object` are treated as categorical.
# NOTE(review): after a median imputation the frames are expected to be fully
# numeric, so this list is probably empty — confirm.
categorical_features = [col for col in X_train_imp.columns if X_train_imp[col].dtype == 'object']

# Cast categorical columns to the 'category' dtype on both splits.
for col in categorical_features:
    X_train_imp[col] = X_train_imp[col].astype('category')
    X_test_imp[col] = X_test_imp[col].astype('category')

Columns with all NaN values: ['payroll_slope_1_foto_mes', 'cuenta_corriente_slope_1_foto_mes', 'visa_consumo_slope_1_foto_mes', 'comisiones_mantenimiento_slope_1_foto_mes', 'comisiones_otras_slope_1_foto_mes']


## Modelos candidatos

Luego de comparar los modelos candidatos en `comp02_pipeline_comp`, se opta como **modelo regular** por:

> **xgb prepro6 fe6 y 3 opt (local opt parcial)**

Mientras que, para calcular una predicción con semillerio, se usa el modelo **semillerio de Denicolay** (modificado):

semillerio_params = {'n_estimators': 23,
                  'num_leaves': 32,
                  'learning_rate': 0.34,
                  'min_data_in_leaf': 711,
                  'feature_fraction': 2*0.2, # x2 para tratar de compensar la falta de variables
                  'extra_trees': False,
                  'random_state': semillas[s],
}

## Evaluación del punto de corte

Función de ganancia

In [5]:
def ganancia(y, y_hat, thr,
             ganancia_acierto = ganancia_acierto,
             costo_estimulo = costo_estimulo,
             target = 2, prop=1):
    """Estimated gain of stimulating every row scored at or above `thr`.

    Parameters
    ----------
    y : array-like
        True class codes.
    y_hat : array-like
        Predicted score per row, compared against `thr`.
    thr : float
        Cutoff; rows with ``y_hat >= thr`` are stimulated.
    ganancia_acierto : number
        Gain added for a stimulated row whose class equals `target`.
    costo_estimulo : number
        Cost subtracted for any other stimulated row.
    target : int
        Class code that counts as a hit.
    prop : number
        Divisor applied to the total.

    Returns
    -------
    number
        Sum of the per-row gains divided by `prop`.
    """
    stimulated = y_hat >= thr
    # Payoff each row would get if it were stimulated.
    payoff_if_stimulated = np.where(y == target, ganancia_acierto, -costo_estimulo)
    # Rows below the cutoff contribute nothing.
    per_row = np.where(stimulated, payoff_if_stimulated, 0)
    return per_row.sum()/prop

### Entrenamiento con óptimos parámetros

**Modelo regular**

Con las distintas semillas

In [None]:
from xgboost import XGBClassifier

# Identify the optimisation study of the selected model.
prepro = 6 # data quality + data drifting
fe = 6 # conceptual feature engineering, 6 months
training = 3 # one month of optimisation

# storage_name = "sqlite:///" + db_path + "optimization_tree.db"

# local load
storage_name = "sqlite:///optimizacion/optimization_tree.db"
study_name = f"exp_xgb_pr{prepro}_fe{fe}_tr{training}_x"

study = optuna.load_study(study_name=study_name, storage=storage_name)

# Best hyper-parameters found by the study, plus full parallelism.
opt_params = study.best_trial.params
opt_params.update({'n_jobs': -1})

# Frame that accumulates one probability column per seed.
# NOTE(review): 'client' holds the row index of y_test, not necessarily a
# customer identifier — confirm.
df_s_proba = pd.DataFrame({
                            'client': y_test.index,
                            'baja': y_test.values,
                        })

label_antimapping = {0:'CONTINUA', 1:'BAJA+1', 2:'BAJA+2'}
df_s_proba['clase_ternaria'] = df_s_proba['baja'].map(label_antimapping)

# Per-seed optimum: seed, best gain, its threshold and number of stimuli.
seeds = []
max_ganancia = []
max_ganancia_thr = []
max_ganancia_esti = []

# Candidate cutoffs; identical for every seed, so built once.
thr_grid = np.linspace(0.01, 0.05, 100)

# A step of 4913 over this range yields exactly 49 seeds
# (217163, 222076, ..., 452987). The previous step of 7*7 = 49 confused the
# step with the intended count and produced about 4870 seeds instead.
for s in range(217163, 455783, 4913): # 49 seeds
    # fresh model instance for this seed
    model = XGBClassifier(**opt_params, seed=s)
    model.fit(X_train_imp, y_train)
    y_pred_proba = model.predict_proba(X_test_imp)
    # probability of class 2 (BAJA+2)
    proba_baja2 = y_pred_proba[:,2]
    df_s_proba[f'proba_{s}'] = proba_baja2

    # test-set frame for this seed
    test_results = pd.DataFrame({
                            'client': y_test.index,
                            'baja': y_test.values,
                            'proba_baja2' : proba_baja2
                        })
    test_results['clase_ternaria'] = test_results['baja'].map(label_antimapping)

    # gain and number of stimuli at every candidate cutoff
    ganancias = [ganancia(test_results.baja, test_results.proba_baja2, thr)
                 for thr in thr_grid]
    estimulos = [(test_results.proba_baja2 >= thr).sum() for thr in thr_grid]

    df_ganancias = pd.DataFrame({
                                'threshold': thr_grid,
                                'ganancias': ganancias,
                                'estimulos': estimulos
                            })

    # conditions at the maximum gain (first maximum on ties)
    max_gain_idx = df_ganancias['ganancias'].idxmax()
    max_gain = df_ganancias.loc[max_gain_idx, 'ganancias']
    max_gain_thr = df_ganancias.loc[max_gain_idx, 'threshold']
    max_estimulos = df_ganancias.loc[max_gain_idx, 'estimulos']

    seeds.append(s)
    max_ganancia.append(max_gain)
    max_ganancia_thr.append(max_gain_thr)
    max_ganancia_esti.append(max_estimulos)

    print(f"s: {s}, ganancia max: {max_gain}, thr: {max_gain_thr}, estimulos: {max_estimulos}")

# One row per seed with its optimal operating point.
df_ganancias_semillas = pd.DataFrame({
                            'semillas': seeds,
                            'max_ganancias': max_ganancia,
                            'threshold': max_ganancia_thr,
                            'estimulos': max_ganancia_esti,
                        })

Análisis del threshold

In [None]:
df_ganancias_semillas.head(5)

Unnamed: 0,semillas,max_ganancias,threshold,estimulos
0,217163,141239000.0,0.018485,10423
1,222076,140609000.0,0.020505,9673
2,226989,142751000.0,0.022121,9167
3,231902,142275000.0,0.019697,9795
4,236815,144242000.0,0.017273,10674


In [None]:
df_ganancias_semillas.describe()

Unnamed: 0,semillas,max_ganancias,threshold,estimulos
count,49.0,49.0,49.0,49.0
mean,335075.0,142448700.0,0.019367,10016.714286
std,70200.334787,1257070.0,0.002068,894.506475
min,217163.0,139993000.0,0.015253,8272.0
25%,276119.0,141659000.0,0.018081,9400.0
50%,335075.0,142408000.0,0.018889,10142.0
75%,394031.0,143234000.0,0.020909,10564.0
max,452987.0,146349000.0,0.024545,12410.0


!REVISAR

El threshold que maximiza la ganancia en backtesting tiene una media de 0.0194 y una mediana de 0.0189 (en probabilidad).

Esto implica una cantidad de estímulos de 10 mil +/- 895.

### Entrenamiento de modelo semillerio

Para evaluar corte en este tipo de ensamble

In [None]:
# Fixed hyper-parameters of the semillerio model. The seed is NOT part of
# this dict: it is set per model inside the loop. (The former
# 'random_state': semillas[s] entry indexed a 5-element list with a leftover
# loop variable and raised IndexError.)
# NOTE(review): num_leaves, min_data_in_leaf, feature_fraction and
# extra_trees are LightGBM-style names, yet the model built below is an
# XGBClassifier — confirm which estimator is intended.
semillero_params = {'n_estimators': 23,
                  'num_leaves': 32,
                  'learning_rate': 0.34,
                  'min_data_in_leaf': 711,
                  'feature_fraction': 2*0.2, # doubled to try to compensate for the missing variables
                  'extra_trees': False,
}
semillero_params.update({'n_jobs': -1})

# Frame that accumulates one averaged-probability column per semillerio.
df_sem_proba = pd.DataFrame({
                            'client': y_test.index,
                            'baja': y_test.values,
                        })

label_antimapping = {0:'CONTINUA', 1:'BAJA+1', 2:'BAJA+2'}
df_sem_proba['clase_ternaria'] = df_sem_proba['baja'].map(label_antimapping)

# Seed range and experiment size.
semilla_min, semilla_max = 217163, 455783
n_semillerios = 27
n_semillas = 49
# Ceiling division, so the outer range yields exactly `n_semillerios` values
# (a literal step of 27 would yield 8838 of them).
paso_semillerio = -(-(semilla_max - semilla_min) // n_semillerios)

# Candidate cutoffs; identical for every model, so built once.
thr_grid = np.linspace(0.01, 0.05, 100)

semillerio = []
mean_max_gan = []
mean_max_gan_thr = []
mean_max_gan_esti = []

for sem in range(semilla_min, semilla_max, paso_semillerio): # 27 semillerios

    # Draw this semillerio's seeds reproducibly from `sem`, so that each
    # semillerio trains a different set of models.
    rng = np.random.default_rng(sem)
    semillas_sem = rng.choice(np.arange(semilla_min, semilla_max),
                              size=n_semillas, replace=False)

    max_gan = []
    max_gan_thr = []
    max_gan_esti = []
    proba_acum = np.zeros(len(y_test))

    for s in semillas_sem: # 49 seeds per semillerio
        s = int(s)

        # fresh model instance for this seed
        model = XGBClassifier(**semillero_params, seed=s)
        model.fit(X_train_imp, y_train)
        y_pred_proba = model.predict_proba(X_test_imp)
        # probability of class 2 (BAJA+2)
        proba_baja2 = y_pred_proba[:,2]
        proba_acum += proba_baja2

        # test-set frame for this seed
        test_results = pd.DataFrame({
                                'client': y_test.index,
                                'baja': y_test.values,
                                'proba_baja2' : proba_baja2
                            })
        test_results['clase_ternaria'] = test_results['baja'].map(label_antimapping)

        # gain and number of stimuli at every candidate cutoff
        ganancias = [ganancia(test_results.baja, test_results.proba_baja2, thr)
                     for thr in thr_grid]
        estimulos = [(test_results.proba_baja2 >= thr).sum() for thr in thr_grid]

        df_ganancias = pd.DataFrame({
                                    'threshold': thr_grid,
                                    'ganancias': ganancias,
                                    'estimulos': estimulos
                                })

        # conditions at the maximum gain (first maximum on ties)
        max_gain_idx = df_ganancias['ganancias'].idxmax()
        max_gain = df_ganancias.loc[max_gain_idx, 'ganancias']
        max_gain_thr = df_ganancias.loc[max_gain_idx, 'threshold']
        max_estimulos = df_ganancias.loc[max_gain_idx, 'estimulos']

        max_gan.append(max_gain)
        max_gan_thr.append(max_gain_thr)
        max_gan_esti.append(max_estimulos)

        print(f"sem: {sem}, s: {s}, ganancia max: {max_gain}, thr: {max_gain_thr}, estimulos: {max_estimulos}")

    # Averaged probability of this semillerio, stored in its own frame so the
    # regular model's `df_s_proba` is left untouched.
    df_sem_proba[f'proba_{sem}'] = proba_acum / n_semillas

    # Means of the per-seed optima within this semillerio.
    # NOTE(review): this averages each seed's individual optimum; the optimal
    # cutoff of the averaged probability may differ — confirm which one the
    # analysis needs.
    gan_media = sum(max_gan) / len(max_gan)
    thr_medio = sum(max_gan_thr) / len(max_gan_thr)
    esti_media = sum(max_gan_esti) / len(max_gan_esti)

    semillerio.append(sem)
    mean_max_gan.append(gan_media)
    mean_max_gan_thr.append(thr_medio)
    mean_max_gan_esti.append(esti_media)

    # Adjacent literals instead of backslash continuation, which embedded the
    # next line's indentation in the printed text.
    print(f"sem: {sem}, ganancia media: {gan_media}, "
          f"thr medio: {thr_medio}, "
          f"estimulos: {esti_media}")

# One row per semillerio with the mean optimal operating point.
df_ganancias_semillerio = pd.DataFrame({
                            'semillerio': semillerio,
                            'mean_max_gan': mean_max_gan,
                            'mean_max_gan_thr': mean_max_gan_thr,
                            'mean_max_gan_esti': mean_max_gan_esti,
                        })
Análisis del threshold

In [None]:
df_ganancias_semillerio.head(5)

In [None]:
df_ganancias_semillerio.describe()

!REVISAR

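(Pendiente de actualizar: las celdas del semillerio no tienen salida registrada y estas cifras coinciden con las del modelo regular.) El threshold que maximiza la ganancia en backtesting para el semillerio tiene una media de 0.0194 y una mediana de 0.0189 (en probabilidad).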

Esto implica una cantidad de estímulos de 10 mil +/- 895.

______