# Back-testing

Incluye:

- Uso de datos undersampling para train, calculando threshold opt red
- Uso de datos totales para train, para calcular threshold opt

In [None]:
# Librerias
import os
import warnings

import numpy as np
import optuna
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

#######
# paths
# datasets
from config import dataset_file_fe6_6xpqt,\
                   dataset_file_fe6_6xpqt_opt_under,\
                   dataset_202107
# optimization
from config import db_path
# models
from config import modelos_path
# predictions
from config import pred_path

##########
# pipeline
from processing import ModelPipeline, plot_comparisons_on_kaggle_split
from processing import analyze_study

# Ignorar advertencias de tipo UserWarning
warnings.filterwarnings('ignore', category=UserWarning, module='pandas')
warnings.filterwarnings('ignore', category=Warning, module='lightgbm')
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')

Variables de train y test

In [5]:
# Business constants used by the gain function
ganancia_acierto = 273000   # gain credited for each correctly targeted row
costo_estimulo = 7000       # cost charged for each wrongly targeted row

# Every month (YYYYMM) from 201901 up to and including 202107
mes_train_all = [anio * 100 + mes
                 for anio in (2019, 2020, 2021)
                 for mes in range(1, 13)
                 if anio * 100 + mes <= 202107]

# Trailing training windows, all ending in 202107
mes_train_ult_3_meses = mes_train_all[-3:]

mes_train_ult_6_meses = mes_train_all[-6:]

# NOTE(review): despite its name this window holds 10 months (202010-202107),
# exactly as in the original literal list -- confirm whether 9 was intended.
mes_train_ult_9_meses = mes_train_all[-10:]

# NOTE(review): this window holds 13 months (202007-202107), exactly as in the
# original literal list -- confirm whether 12 was intended.
mes_train_ult_anio = mes_train_all[-13:]

mes_train = [202107]
mes_test = 202109

# Default probability cut-off
threshold = 0.025

# Seeds available for repeated training runs
semillas = [437809, 327347, 392879, 455783, 217163]

## Loading data

In [None]:
# Load the training dataset
data = pd.read_parquet(dataset_file_fe6_6xpqt)

# running local
# data = pd.read_parquet("datos/datasets_competencia_03_fe6x_opt_under.parquet")

# Map class labels to integers
label_mapping = {'CONTINUA': 0, 'BAJA+1': 1, 'BAJA+2': 2}

data['clase_ternaria'] = data['clase_ternaria'].map(label_mapping)

# Kaggle simulation: months used to train the back-test.
# NOTE(review): the list spans 10 months (202008-202105) although the name
# says "ult_anio" -- confirm the intended window.
mes_bt_ult_anio = [202008, 202009, 202010, 202011, 202012,
                 202101, 202102, 202103, 202104, 202105]

X_train = data[data['foto_mes'].isin(mes_bt_ult_anio)]
y_train = X_train['clase_ternaria']
X_train = X_train.drop(columns=['clase_ternaria'])

# Free the full training frame; only the filtered split is needed from here on
del data

# Load the dataset that holds the month used as test
data_futuro = pd.read_parquet(dataset_202107)

# running local
# data_futuro = pd.read_parquet("datos/datasets_competencia_03_202107.parquet")

# Encode the test labels with the same mapping as the training labels.
# Downstream code compares y_test against the integer target (2) and maps it
# back with integer keys, so string labels would never match. The guard keeps
# the column untouched when the file already stores numeric classes.
if not pd.api.types.is_numeric_dtype(data_futuro['clase_ternaria']):
    data_futuro['clase_ternaria'] = data_futuro['clase_ternaria'].map(label_mapping)

mes_futuro = 202107 # used as test
X_test = data_futuro[data_futuro['foto_mes'] == mes_futuro]
y_test = X_test['clase_ternaria']
X_test = X_test.drop(columns=['clase_ternaria'])

del data_futuro

Preprocesando data

In [7]:
# Feature imputation: first discard the features that are entirely missing in
# the training split (the same columns are removed from the test split)
all_nan_mask = X_train.isna().all()
cols_with_all_nan = X_train.columns[all_nan_mask].tolist()
print("Columns with all NaN values:", cols_with_all_nan)
X_train, X_test = [frame.drop(columns=cols_with_all_nan) for frame in (X_train, X_test)]

# Median imputation of nulls: medians are learned on train and reused on test.
# NOTE(review): the rebuilt frames get a fresh RangeIndex, so they no longer
# share the row index of y_train / y_test -- rows only line up by position.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
feature_names = X_train.columns
train_values = imp_median.fit_transform(X_train)
X_train_imp = pd.DataFrame(train_values, columns=feature_names)
test_values = imp_median.transform(X_test)
X_test_imp = pd.DataFrame(test_values, columns=feature_names)

# The un-imputed splits are no longer needed
del X_train, X_test

# Categorical features are the columns still stored with object dtype
categorical_features = X_train_imp.columns[X_train_imp.dtypes == 'object'].tolist()

# Cast categorical features to 'category' dtype for LightGBM
category_cast = {name: 'category' for name in categorical_features}
X_train_imp = X_train_imp.astype(category_cast)
X_test_imp = X_test_imp.astype(category_cast)

Columns with all NaN values: ['payroll_slope_1_foto_mes', 'cuenta_corriente_slope_1_foto_mes', 'visa_consumo_slope_1_foto_mes', 'comisiones_mantenimiento_slope_1_foto_mes', 'comisiones_otras_slope_1_foto_mes']


## Modelos candidatos

Luego de una comparación de modelos candidatos en `comp02_pipeline_comp`, se decide optar como **modelo regular** por:

> **xgb prepro6 fe6 y 3 opt (local opt parcial)**

Mientras que, para calcular una predicción con semillerío:

Modelo **semillero de Denicolay** (modificado)

semillerio_params = {'n_estimators': 23,
                  'num_leaves': 32,
                  'learning_rate': 0.34,
                  'min_data_in_leaf': 711,
                  'feature_fraction': 2*0.2, # x2 para tratar de compensar la falta de variables
                  'extra_trees': False,
                  'random_state': semillas[s],
}

## Evaluación del punto de corte

Función de ganancia

In [8]:
def ganancia(y, y_hat, thr,
             ganancia_acierto = ganancia_acierto,
             costo_estimulo = costo_estimulo,
             target = 2, prop=1):
    """Total gain obtained when targeting every row scored at or above `thr`.

    Parameters
    ----------
    y : array-like
        True class of each row.
    y_hat : array-like
        Score of each row, compared element-wise against `thr`.
    thr : float
        Cut-off; rows with `y_hat >= thr` are targeted.
    ganancia_acierto : number
        Amount added for each targeted row whose class equals `target`.
    costo_estimulo : number
        Amount subtracted for each targeted row whose class differs from
        `target`.
    target : int
        Class value counted as a hit.
    prop : number
        Divisor applied to the summed gain.

    Returns
    -------
    number
        Sum of the per-row gains divided by `prop`; rows below the cut-off
        contribute 0.
    """
    targeted = y_hat >= thr
    is_hit = y == target

    # Payoff of a targeted row: reward on a hit, cost otherwise
    payoff_if_targeted = np.where(is_hit, ganancia_acierto, -costo_estimulo)
    per_row_gain = np.where(targeted, payoff_if_targeted, 0)

    return per_row_gain.sum()/prop

### Entrenamiento con óptimos parámetros

**Modelo regular**

Con las distintas semillas

In [9]:
# len(list(range(217163, 455783, 7*7*7*7))) # 100 semillas
# len(list(range(217163, 455783, 7*7*7*7*2))) # 50 semillas
# len(list(range(217163, 455783, 7*7*7*7*4))) # 25 semillas

In [None]:
# Back-testing of the best optimized LightGBM model over several seeds.
# For every seed the model is retrained, the class-2 (BAJA+2) probability of
# each test row is stored, and the probability cut-off that maximizes the
# gain on the test month is recorded.

# Taking the best optimized model

# prepro = 6 # data quality + data drifting
# fe = 6 # conceptual feature engineering, 6 months
# training = 3 # one month of optimization

# storage_name = "sqlite:///" + db_path + "optimization_tree.db"

# local load
storage_name = "sqlite:///optimizacion/optimization_tree.db"
study_name = "exp_lgbm_comp03_local_v00"

study = optuna.load_study(study_name=study_name, storage=storage_name)

# Best hyper-parameters of the study, with n_jobs=-1 added on top of them
opt_params = study.best_trial.params
opt_params.update({'n_jobs': -1})

print("Running back-testing for LGBMClassifier")
print(opt_params)

# Integer class -> original label
label_antimapping = {0:'CONTINUA', 1:'BAJA+1', 2:'BAJA+2'}

# One row per test observation; a proba_<seed> column is appended per seed.
# 'client' holds the row index of y_test.
df_s_proba = pd.DataFrame({
    'client': y_test.index,
    'baja': y_test.values,
})
df_s_proba['clase_ternaria'] = df_s_proba['baja'].map(label_antimapping)

# The candidate cut-offs are identical for every seed, so build them once
thr_grid = np.linspace(0.01, 0.03, 1000)

# Per-seed record of the gain-maximizing conditions
seeds = []
max_ganancia = []
max_ganancia_thr = []
max_ganancia_esti = []

s_r = range(217163, 455783, 7*7*7*7*5) # 20 seeds
total_s = len(s_r)
for s in s_r:
    # fresh model instance with this seed
    model = LGBMClassifier(**opt_params, random_state=s)
    # train
    model.fit(X_train_imp, y_train)
    # predict class probabilities
    y_pred_proba = model.predict_proba(X_test_imp)
    # column 2 is read as the BAJA+2 probability
    proba_baja2 = y_pred_proba[:,2]
    df_s_proba[f'proba_{s}'] = proba_baja2

    # test-set frame for this seed
    test_results = pd.DataFrame({
        'client': y_test.index,
        'baja': y_test.values,
        'proba_baja2' : proba_baja2
    })
    test_results['clase_ternaria'] = test_results['baja'].map(label_antimapping)

    # gain and number of targeted rows for each threshold
    thrs = list(thr_grid)
    ganancias = [ganancia(test_results.baja, test_results.proba_baja2, thr)
                 for thr in thrs]
    estimulos = [(test_results.proba_baja2 >= thr).sum() for thr in thrs]

    df_ganancias = pd.DataFrame({
        'threshold': thrs,
        'ganancias': ganancias,
        'estimulos': estimulos
    })

    # maximum gain (first threshold reaching it) and its conditions
    max_gain_idx = df_ganancias['ganancias'].idxmax()
    max_gain = df_ganancias['ganancias'][max_gain_idx]
    max_gain_thr = df_ganancias['threshold'][max_gain_idx]
    max_estimulos = df_ganancias['estimulos'][max_gain_idx]

    # record the conditions of the maximum
    seeds.append(s)
    max_ganancia.append(max_gain)
    max_ganancia_thr.append(max_gain_thr)
    max_ganancia_esti.append(max_estimulos)

    print(f"\ns: {s}, ganancia max: {max_gain}, thr: {max_gain_thr}, estimulos: {max_estimulos}\n\n")

df_ganancias_semillas = pd.DataFrame({
    'semillas': seeds,
    'max_ganancias': max_ganancia,
    'threshold': max_ganancia_thr,
    'estimulos': max_ganancia_esti,
})

# os.path.join gives the same path whether or not db_path ends with a separator
df_ganancias_semillas.to_csv(os.path.join(db_path, 'df_ganancias_semillas.csv'), sep=';')

Running back-testing for LGBMClassifier
{'n_estimators': 532, 'num_leaves': 74, 'learning_rate': 0.02597037622291732, 'min_data_in_leaf': 160, 'feature_fraction': 0.5306395912844186, 'n_jobs': -5}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.291446 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 85183
[LightGBM] [Info] Number of data points in the train set: 175211, number of used features: 649
[LightGBM] [Info] Start training from score -0.089624
[LightGBM] [Info] Start training from score -3.173606
[LightGBM] [Info] Start training from score -3.126460

s: 217163, ganancia max: 0.0, thr: 0.5623832383238324, estimulos: 0


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.323846 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 85183
[LightGBM] [Info] Number of data points in the train set: 175211, number of used 

KeyboardInterrupt: 

Análisis del threshold

In [None]:
# df_ganancias_semillas = pd.read_csv('df_ganancias_semillas.csv', sep=';', decimal=',')

In [None]:
# Preview the first rows (head() returns 5 rows by default)
df_ganancias_semillas.head()

Unnamed: 0,semillas,max_ganancias,threshold,estimulos
0,217163,95900000,0.013636,13420
1,217212,95473000,0.020505,9401
2,217261,96747000,0.017273,10899
3,217310,96971000,0.014444,12947
4,217359,96369000,0.014848,12633


In [None]:
# Summary statistics of the per-seed results (seed, max gain, threshold, estimulos)
df_ganancias_semillas.describe()

Unnamed: 0,semillas,max_ganancias,threshold,estimulos
count,54.0,54.0,54.0,54.0
mean,218461.5,97045410.0,0.015627,12174.148148
std,770.874503,729396.0,0.002141,1361.594451
min,217163.0,95158000.0,0.012828,9048.0
25%,217812.25,96633250.0,0.01404,11299.75
50%,218461.5,97079500.0,0.014848,12580.0
75%,219110.75,97560750.0,0.016768,13111.0
max,219760.0,99190000.0,0.021313,14440.0


En el backtesting, el threshold (probabilidad de corte) que maximiza la ganancia tiene una media de 0.0156 y una mediana de 0.0148.

______