# Back-testing

> Modelo optimizado, evaluado con semillas para explorar el umbral óptimo

In [1]:
# Librerias
import os
import pandas as pd
import numpy as np

#######
# rutas
# datasets
from config import dataset_file_fe6_6xpqt
from config import db_path
# modelos
from config import modelos_path
# predicciones
from config import pred_path

##########
# pipeline
from processing import ModelPipeline, plot_comparisons_on_kaggle_split
from processing import analyze_study
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import optuna
import warnings

# Silence noisy warnings emitted from pandas and lightgbm modules.
# `module` is a regex matched against the start of the emitting module's name.
warnings.filterwarnings('ignore', category=UserWarning, module='pandas')
# NOTE(review): UserWarning is a subclass of Warning, so the filter on the next line
# already covers the one after it; the second lightgbm filter looks redundant.
warnings.filterwarnings('ignore', category=Warning, module='lightgbm')
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')

  from .autonotebook import tqdm as notebook_tqdm


Variables de train y test

In [2]:
# Business constants used by the gain function:
# payoff for a correctly targeted client and cost of every stimulus sent.
ganancia_acierto = 273000
costo_estimulo = 7000

# Every monthly snapshot (YYYYMM) from 201901 through 202107, in chronological order.
mes_train_all = [
    anio * 100 + mes
    for anio in (2019, 2020, 2021)
    for mes in range(1, 13)
    if anio * 100 + mes <= 202107
]

# Trailing training windows, all ending at 202107.
mes_train_ult_3_meses = mes_train_all[-3:]
mes_train_ult_6_meses = mes_train_all[-6:]

# NOTE(review): despite its name this window holds 10 months (202010-202107);
# the length is kept as-is to preserve the existing values — confirm intent.
mes_train_ult_9_meses = mes_train_all[-10:]

# NOTE(review): this "last year" window holds 13 months (202007-202107);
# the length is kept as-is to preserve the existing values — confirm intent.
mes_train_ult_anio = mes_train_all[-13:]

mes_train = [202107]
mes_test = 202109

# Default probability cut-off.
threshold = 0.025

# Fixed random seeds.
semillas = [437809, 327347, 392879, 455783, 217163]

## Loading data

In [9]:
# Load the dataset from the path provided by the project config.
data = pd.read_parquet(dataset_file_fe6_6xpqt)

# running local
# data = pd.read_parquet("datos/datasets_competencia_03_fe6x_opt_under.parquet")

# Encode the ternary class labels as integers.
label_mapping = {'CONTINUA': 0, 'BAJA+1': 1, 'BAJA+2': 2}
data['clase_ternaria'] = data['clase_ternaria'].map(label_mapping)


def _separar_xy(filas):
    """Split a row subset into (features, target) on the 'clase_ternaria' column."""
    return filas.drop(columns=['clase_ternaria']), filas['clase_ternaria']


# Kaggle simulation: train on the ten months 202008-202105 ...
mes_bt_ult_anio = [
    202008, 202009, 202010, 202011, 202012,
    202101, 202102, 202103, 202104, 202105,
]
X_train, y_train = _separar_xy(data[data['foto_mes'].isin(mes_bt_ult_anio)])

# ... and evaluate on 202107, which plays the role of the unseen month.
mes_futuro = 202107
X_test, y_test = _separar_xy(data[data['foto_mes'] == mes_futuro])

# Release the full frame; only the train/test splits are needed from here on.
del data

# THIS SEEMS TO BREAK THE MODELS
# data_futuro = pd.read_parquet(dataset_202107)

# running local
# data_futuro = pd.read_parquet("datos/datasets_competencia_03_202107.parquet")

# mes_futuro = 202107 # used as test
# X_test = data_futuro[data_futuro['foto_mes'] == mes_futuro]
# y_test = X_test['clase_ternaria']
# X_test = X_test.drop(columns=['clase_ternaria'])

# del data_futuro

Preprocesando data

In [10]:
# Drop features that have no observed value in the training window.
# SimpleImputer discards such features by default, which would leave the imputed
# array with fewer columns than the labels passed to pd.DataFrame below.
cols_with_all_nan = X_train.columns[X_train.isna().all()].tolist()
print("Columns with all NaN values:", cols_with_all_nan)
X_train = X_train.drop(columns=cols_with_all_nan)
X_test = X_test.drop(columns=cols_with_all_nan)

# Median imputation: statistics are learned on train only and reused on test.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# Keep the original row index so the imputed frames stay label-aligned with
# y_train / y_test (rebuilding the frame from the bare array would reset it).
X_train_imp = pd.DataFrame(imp_median.fit_transform(X_train),
                           columns=X_train.columns, index=X_train.index)
X_test_imp = pd.DataFrame(imp_median.transform(X_test),
                          columns=X_train.columns, index=X_test.index)

# Free the un-imputed frames.
del X_train
del X_test

# Object-typed columns are treated as categorical.
# NOTE(review): after median imputation every column is presumably numeric, so this
# list is likely empty and the loop below a no-op — confirm.
categorical_features = [col for col in X_train_imp.columns if X_train_imp[col].dtype == 'object']

# Cast categorical columns to the 'category' dtype for LightGBM.
for col in categorical_features:
    X_train_imp[col] = X_train_imp[col].astype('category')
    X_test_imp[col] = X_test_imp[col].astype('category')

Columns with all NaN values: ['payroll_slope_1_foto_mes', 'cuenta_corriente_slope_1_foto_mes', 'visa_consumo_slope_1_foto_mes', 'comisiones_mantenimiento_slope_1_foto_mes', 'comisiones_otras_slope_1_foto_mes']


## Modelos candidatos

Tras comparar los modelos candidatos en `comp03_pipeline_comp`, se elige como **modelo optimizado**:

> **lgbm prepro6 fe6 y 12 opt (local opt 10 % de CONTINUA)**

Además, se trabaja con un modelo semillero, en comp03_back-testing_sem

## Evaluación del punto de corte

Función de ganancia

In [5]:
def ganancia(y, y_hat, thr,
             ganancia_acierto=ganancia_acierto,
             costo_estimulo=costo_estimulo,
             target=2, prop=1):
    """Total gain obtained by stimulating every row scored at or above `thr`.

    Args:
        y: true class per row.
        y_hat: predicted score per row, compared element-wise against `thr`.
        thr: cut-off; rows with `y_hat >= thr` are stimulated.
        ganancia_acierto: amount earned for a stimulated row whose class equals `target`.
        costo_estimulo: amount lost for a stimulated row of any other class.
        target: class value that counts as a hit.
        prop: divisor applied to the summed gain.

    Returns:
        The summed per-row gain divided by `prop`.

    Note:
        The defaults of `ganancia_acierto` and `costo_estimulo` are captured from the
        module-level constants when this function is defined, not when it is called.
    """
    estimulado = y_hat >= thr
    es_acierto = y == target

    # Payoff each row would produce if it were stimulated.
    pago_si_estimulado = np.where(es_acierto, ganancia_acierto, -costo_estimulo)

    # Rows below the cut-off contribute nothing.
    ganancia_total = np.where(estimulado, pago_si_estimulado, 0).sum()

    return ganancia_total / prop

### Entrenamiento con parámetros óptimos

**Modelo regular**

Con las distintas semillas

In [6]:
# len(list(range(217163, 455783, 7*7*7*7))) # 100 semillas
# len(list(range(217163, 455783, 7*7*7*7*2))) # 50 semillas
# len(list(range(217163, 455783, 7*7*7*7*4))) # 25 semillas

In [11]:
from lightgbm import LGBMClassifier

# Back-test the best trial of the Optuna study: retrain it once per seed on the
# training window, score the held-out month and find the probability cut-off that
# maximises the gain for that seed.

# prepro = 6 # data quality + data drifting
# fe = 6 # conceptual feature engineering, 6 months
# training = 3 # one month of optimisation

storage_name = "sqlite:///" + db_path + "optimization_tree.db"

# local run
# storage_name = "sqlite:///optimizacion/optimization_tree.db"
study_name = "exp_lgbm_comp03_local_v00"

study = optuna.load_study(study_name=study_name, storage=storage_name)

# Best hyper-parameters found by the study, plus n_jobs=-1.
opt_params = study.best_trial.params
opt_params.update({'n_jobs': -1})

print("Running back-testing for LGBMClassifier")
print(opt_params)

# One row per test client; a 'proba_<seed>' column is appended for every seed.
df_s_proba = pd.DataFrame({
                            'client': y_test.index,
                            'baja': y_test.values,
                        })

label_antimapping = {0:'CONTINUA', 1:'BAJA+1', 2:'BAJA+2'}
df_s_proba['clase_ternaria'] = df_s_proba['baja'].map(label_antimapping)

# Per-seed record of the best operating point (also read by the following cell).
seeds = []
max_ganancia = []
max_ganancia_thr = []
max_ganancia_esti = []

s_r = range(217163, 455783, 7*7*7*7*5) # 20 seeds
total_s = len(s_r)

# Candidate cut-offs: identical for every seed, so built once outside the loop.
thr_grid = np.linspace(0.01, 0.03, 1000)

for s in s_r:
    # Fresh model instance with this seed
    model = LGBMClassifier(**opt_params, random_state=s)
    model.fit(X_train_imp, y_train)

    y_pred_proba = model.predict_proba(X_test_imp)
    # assumes model.classes_ is [0, 1, 2] so column 2 is BAJA+2 — TODO confirm
    proba_baja2 = y_pred_proba[:,2]
    df_s_proba[f'proba_{s}'] = proba_baja2

    # Test-set frame for this seed
    test_results = pd.DataFrame({
                            'client': y_test.index,
                            'baja': y_test.values,
                            'proba_baja2' : proba_baja2
                        })
    test_results['clase_ternaria'] = test_results['baja'].map(label_antimapping)

    # Gain and number of stimulated clients at every candidate cut-off
    thrs = []
    ganancias = []
    estimulos = []
    for thr in thr_grid:
        gain = ganancia(test_results.baja, test_results.proba_baja2, thr)
        esti = (test_results.proba_baja2 >= thr).sum()
        thrs.append(thr)
        ganancias.append(gain)
        estimulos.append(esti)

    df_ganancias = pd.DataFrame({
                                'threshold': thrs,
                                'ganancias': ganancias,
                                'estimulos': estimulos
                            })

    # Operating point with the highest gain
    max_gain_idx = df_ganancias['ganancias'].idxmax()
    max_gain = df_ganancias['ganancias'][max_gain_idx]
    max_gain_thr = df_ganancias['threshold'][max_gain_idx]
    max_estimulos = df_ganancias['estimulos'][max_gain_idx]

    seeds.append(s)
    max_ganancia.append(max_gain)
    max_ganancia_thr.append(max_gain_thr)
    max_ganancia_esti.append(max_estimulos)

    print(f"\ns: {s}, ganancia max: {max_gain}, thr: {max_gain_thr}, estimulos: {max_estimulos}\n\n")

    # Checkpoint the summary after every seed: each fit is slow, and writing only
    # after the loop loses every finished seed if the run is interrupted.
    df_ganancias_semillas = pd.DataFrame({
                                'semillas': seeds,
                                'max_ganancias': max_ganancia,
                                'threshold': max_ganancia_thr,
                                'estimulos': max_ganancia_esti,
                            })
    df_ganancias_semillas.to_csv(db_path+'df_ganancias_semillas.csv', sep=';')

Running back-testing for LGBMClassifier
{'n_estimators': 532, 'num_leaves': 74, 'learning_rate': 0.02597037622291732, 'min_data_in_leaf': 160, 'feature_fraction': 0.5306395912844186, 'n_jobs': -1}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 5.895606 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 86140
[LightGBM] [Info] Number of data points in the train set: 1614498, number of used features: 659
[LightGBM] [Info] Start training from score -0.009347
[LightGBM] [Info] Start training from score -5.394395
[LightGBM] [Info] Start training from score -5.347249

s: 217163, ganancia max: 163492000.0, thr: 0.014324324324324324, estimulos: 12604


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 7.818549 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 86210
[LightGBM] [Info] Number of data points in the train set: 1614498,

KeyboardInterrupt: 

Análisis del threshold

In [12]:
# Rebuild the per-seed summary from the lists filled by the back-testing loop
# (useful when that loop was interrupted before it reached its own save step).
resumen_por_semilla = {
    'semillas': seeds,
    'max_ganancias': max_ganancia,
    'threshold': max_ganancia_thr,
    'estimulos': max_ganancia_esti,
}
df_ganancias_semillas = pd.DataFrame(resumen_por_semilla)

# Persist next to the optimisation database, using ';' as field separator.
ruta_csv_semillas = db_path+'df_ganancias_semillas.csv'
df_ganancias_semillas.to_csv(ruta_csv_semillas, sep=';')

In [None]:
# df_ganancias_semillas = pd.read_csv(db_path+'df_ganancias_semillas.csv', sep=';', index_col=0)

In [13]:
# Preview the first rows of the per-seed summary (head() shows 5 rows by default).
df_ganancias_semillas.head()

Unnamed: 0,semillas,max_ganancias,threshold,estimulos
0,217163,163492000.0,0.014324,12604
1,229168,164038000.0,0.014845,12406
2,241173,163464000.0,0.015566,11768
3,253178,162715000.0,0.015025,12155
4,265183,163744000.0,0.012983,13608


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-seed results.
df_ganancias_semillas.describe()

Unnamed: 0,semillas,max_ganancias,threshold,estimulos
count,12.0,12.0,12.0,12.0
mean,283190.5,163283200.0,0.014273,12750.5
std,43284.643062,870939.9,0.001694,1090.673562
min,217163.0,161553000.0,0.012322,10248.0
25%,250176.75,162919800.0,0.012983,12343.25
50%,283190.5,163404500.0,0.014364,12614.5
75%,316204.25,163555000.0,0.01489,13609.0
max,349218.0,165046000.0,0.018448,14076.0


En el back-testing (12 semillas), el threshold de probabilidad que maximiza la ganancia tiene una media de 0.01427 y una mediana de 0.01436.

______