# Training Pipeline

Training the best M models

Incluye:

- Tuning de hiperparámetros (con meses históricos)

In [1]:
# Librerias
import os
import pandas as pd
import numpy as np

#######
# rutas
# datasets
from config import dataset_file_fe6_6pqt
# optimizacion
from config import db_path
# modelos
from config import modelos_path
# predicciones
from config import pred_path

##########
# pipeline
from processing import ModelPipeline
from sklearn.impute import SimpleImputer


Variables de train y test

In [2]:
# Gain-metric constants. Presumably the payoff of a correctly targeted client and
# the cost of each stimulus sent; they are not consumed in the visible cells —
# TODO confirm where they are used.
ganancia_acierto = 273000
costo_estimulo = 7000

# Every month (YYYYMM) available for training: 201901 through 202106.
mes_train_all = [201901, 201902, 201903, 201904, 201905, 201906, 201907, 201908,
                 201909, 201910, 201911, 201912, 202001, 202002, 202003, 202004,
                 202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012,
                 202101, 202102, 202103, 202104, 202105, 202106]

# Trailing training windows, sliced from mes_train_all so that the length of each
# window always matches its name. The earlier hand-written lists for the 9-month
# and 1-year windows contained 10 and 13 months respectively (they started at
# 202009 and 202006).
mes_train_ult_3_meses = mes_train_all[-3:]

mes_train_ult_6_meses = mes_train_all[-6:]

mes_train_ult_9_meses = mes_train_all[-9:]

mes_train_ult_anio = mes_train_all[-12:]

# Single-month training set and the month to predict.
mes_train = [202106]
mes_test = 202108

# Presumably a probability cut-off; not used in the visible cells — verify against usage.
threshold = 0.025

# Seeds handed to ModelPipeline(seeds=...).
semillas = [437809, 327347, 392879, 455783, 217163]

## LightGBM 

**Preprocessing over 6 months and conceptual feature engineering (FE) over 6 months**

> comp02_prepro_6.ipynb

> comp02_fe6_6.ipynb

**Usando los últimos 3 meses para optimizar**

In [None]:
# Local run: read the training parquet from the working tree.
# (The config path `dataset_file_fe6_6pqt` is the alternative source.)
data = pd.read_parquet("datos/datasets_competencia_02_fe6_6_3m_train.parquet")

# Encode the ternary class labels as integers.
label_mapping = {'CONTINUA': 0, 'BAJA+1': 1, 'BAJA+2': 2}
data['clase_ternaria'] = data['clase_ternaria'].map(label_mapping)

# Keep only the rows whose month falls in the 3-month optimisation window,
# then split them into target and features.
train_rows = data.loc[data['foto_mes'].isin(mes_train_ult_3_meses)]
y_train = train_rows['clase_ternaria']
X_train = train_rows.drop(columns=['clase_ternaria'])

# Release the full frame and the intermediate slice to keep memory low.
del data, train_rows

In [None]:
# Optimisation settings. prepro / fe / training are only used below to build
# the Optuna study name.
s = 1
prepro = 6  # data quality + data drifting
fe = 6  # conceptual feature engineering, 6 months
training = 3  # optimise on the last three months (mes_train_ult_3_meses)

print("### Corriendo pipeline con LightGBM ###")
# Initialise the pipeline for 'lightgbm'. data=None because X_train / y_train
# are prepared by hand in the previous cell to reduce memory, instead of
# calling pipeline_lgbm.def_xy(mes_train_ult_3_meses).
pipeline_lgbm = ModelPipeline(data=None, seeds=semillas,
                              model_type='lightgbm', seed=s, n_jobs=-1)

# Identify columns where every value is NaN.
cols_with_all_nan = X_train.columns[X_train.isna().all()]
print("Columns with all NaN values:", cols_with_all_nan.tolist())

# Drop them before imputing: SimpleImputer discards all-missing columns by
# default, which would leave the output narrower than X_train.columns.
X_train = X_train.drop(columns=cols_with_all_nan)

# Median imputation of the remaining missing values.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X_train_imp = pd.DataFrame(imp_median.fit_transform(X_train), columns=X_train.columns)

del X_train

# The imputed frame has a fresh 0..n-1 index, while y_train still carries the
# index of the filtered source frame. Reset it so features and labels stay
# aligned for any index-based operation inside the pipeline.
y_train = y_train.reset_index(drop=True)
assert len(X_train_imp) == len(y_train), "features and labels must have the same number of rows"

# Optional: LightGBM can handle categorical features directly when they use the
# 'category' dtype. After median imputation the frame is expected to be fully
# numeric, so this is normally a no-op.
categorical_features = [col for col in X_train_imp.columns if X_train_imp[col].dtype == 'object']

for col in categorical_features:
    X_train_imp[col] = X_train_imp[col].astype('category')

# Optuna storage: a local SQLite file (the `db_path` from config is not used here).
storage_name = "sqlite:///optimizacion/optimization_tree.db"
study_name = f"exp_lgbm_pr{prepro}_fe{fe}_tr{training}_x"

print("\n# Optimizando el modelo")
# NOTE(review): with optimize=False this presumably reuses the stored study
# instead of running new trials — confirm in ModelPipeline.optimize_model.
pipeline_lgbm.optimize_model(
    X_train_imp, y_train,
    storage_name=storage_name,
    study_name=study_name,
    optimize=False,
    n_trials=200
)

# Free the large objects once the optimisation step is finished.
del X_train_imp
del y_train
del pipeline_lgbm

### Corriendo pipeline con LightGBM ###
Columns with all NaN values: ['payroll_slope_1_foto_mes', 'cuenta_corriente_slope_1_foto_mes', 'visa_consumo_slope_1_foto_mes', 'comisiones_mantenimiento_slope_1_foto_mes', 'comisiones_otras_slope_1_foto_mes']

# Optimizando el modelo


[I 2024-11-20 07:56:19,158] Using an existing study with name 'exp_lgbm_pr6_fe6_tr3_x' instead of creating a new one.


Optimizando lightgbm con 25 pruebas


[I 2024-11-20 07:59:34,886] Trial 176 finished with value: 347522000.0 and parameters: {'n_estimators': 457, 'num_leaves': 54, 'learning_rate': 0.0270460231088841, 'min_data_in_leaf': 97, 'min_gain_to_split': 0.02583464978259349, 'feature_fraction': 0.5069688756627543, 'bagging_fraction': 0.8477537070962985, 'bagging_freq': 1, 'max_bin': 101}. Best is trial 169 with value: 352865333.3333334.
[I 2024-11-20 08:02:50,430] Trial 177 finished with value: 349365333.3333334 and parameters: {'n_estimators': 465, 'num_leaves': 50, 'learning_rate': 0.022038529551349276, 'min_data_in_leaf': 96, 'min_gain_to_split': 0.046378583546193494, 'feature_fraction': 0.5181987875516293, 'bagging_fraction': 0.8495833943361905, 'bagging_freq': 2, 'max_bin': 92}. Best is trial 169 with value: 352865333.3333334.
[I 2024-11-20 08:06:08,391] Trial 178 finished with value: 351409333.3333334 and parameters: {'n_estimators': 452, 'num_leaves': 56, 'learning_rate': 0.02694228790144789, 'min_data_in_leaf': 98, 'min_ga

Mejores parámetros para lightgbm: {'n_estimators': 466, 'num_leaves': 57, 'learning_rate': 0.024563319625208798, 'min_data_in_leaf': 100, 'min_gain_to_split': 0.011030284732349089, 'feature_fraction': 0.5157056981996073, 'bagging_fraction': 0.8424259086760796, 'bagging_freq': 2, 'max_bin': 97}


Exploring optimization

In [5]:
###

## XGBoost 

**Preprocessing over 6 months and conceptual feature engineering (FE) over 6 months**

> comp02_prepro_6.ipynb

> comp02_fe6_6.ipynb

**Usando los últimos 3 meses para optimizar**

In [7]:
# Local run: read the training parquet from the working tree.
# (The config path `dataset_file_fe6_6pqt` is the alternative source.)
data = pd.read_parquet("datos/datasets_competencia_02_fe6_6_3m_train.parquet")

# Encode the ternary class labels as integers.
label_mapping = {'CONTINUA': 0, 'BAJA+1': 1, 'BAJA+2': 2}
data['clase_ternaria'] = data['clase_ternaria'].map(label_mapping)

# Keep only the rows whose month falls in the 3-month optimisation window,
# then split them into target and features.
train_rows = data.loc[data['foto_mes'].isin(mes_train_ult_3_meses)]
y_train = train_rows['clase_ternaria']
X_train = train_rows.drop(columns=['clase_ternaria'])

# Release the full frame and the intermediate slice to keep memory low.
del data, train_rows

In [8]:
# Optimisation settings. prepro / fe / training are only used below to build
# the Optuna study name.
s = 1
prepro = 6  # data quality + data drifting
fe = 6  # conceptual feature engineering, 6 months
training = 3  # optimise on the last three months (mes_train_ult_3_meses)

print("### Corriendo pipeline con XGBoost ###")
# Initialise the pipeline for 'xgboost'. data=None because X_train / y_train
# are prepared by hand in the previous cell to reduce memory, instead of
# calling pipeline_xgb.def_xy(mes_train_ult_3_meses).
pipeline_xgb = ModelPipeline(data=None, seeds=semillas,
                             model_type='xgboost', seed=s, n_jobs=-1)

# Identify columns where every value is NaN.
cols_with_all_nan = X_train.columns[X_train.isna().all()]
print("Columns with all NaN values:", cols_with_all_nan.tolist())

# Drop them before imputing: SimpleImputer discards all-missing columns by
# default, which would leave the output narrower than X_train.columns.
X_train = X_train.drop(columns=cols_with_all_nan)

# Median imputation of the remaining missing values.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X_train_imp = pd.DataFrame(imp_median.fit_transform(X_train), columns=X_train.columns)

del X_train

# The imputed frame has a fresh 0..n-1 index, while y_train still carries the
# index of the filtered source frame. Reset it so features and labels stay
# aligned for any index-based operation inside the pipeline.
y_train = y_train.reset_index(drop=True)
assert len(X_train_imp) == len(y_train), "features and labels must have the same number of rows"

# Convert object columns to the 'category' dtype for XGBoost. After median
# imputation the frame is expected to be fully numeric, so this is normally
# a no-op.
categorical_features = [col for col in X_train_imp.columns if X_train_imp[col].dtype == 'object']

for col in categorical_features:
    X_train_imp[col] = X_train_imp[col].astype('category')

# Optuna storage: a local SQLite file (the `db_path` from config is not used here).
storage_name = "sqlite:///optimizacion/optimization_tree.db"
study_name = f"exp_xgb_pr{prepro}_fe{fe}_tr{training}_x"

print("\n# Optimizando el modelo")
# NOTE(review): with optimize=False this presumably reuses the stored study
# instead of running new trials — confirm in ModelPipeline.optimize_model.
pipeline_xgb.optimize_model(
    X_train_imp, y_train,
    storage_name=storage_name,
    study_name=study_name,
    optimize=False,
    n_trials=200
)

# Free the large objects once the optimisation step is finished.
del X_train_imp
del y_train
del pipeline_xgb

### Corriendo pipeline con XGBoost ###
Columns with all NaN values: ['payroll_slope_1_foto_mes', 'cuenta_corriente_slope_1_foto_mes', 'visa_consumo_slope_1_foto_mes', 'comisiones_mantenimiento_slope_1_foto_mes', 'comisiones_otras_slope_1_foto_mes']


[I 2024-11-21 07:56:35,272] Using an existing study with name 'exp_xgb_pr6_fe6_tr3_x' instead of creating a new one.



# Optimizando el modelo
Mejores parámetros para xgboost: {'n_estimators': 464, 'max_leaves': 228, 'eta': 0.026841741174110256, 'gamma': 0.6065611085207565, 'min_child_weight': 10, 'subsample': 0.8649413237261332, 'colsample_bytree': 0.5013152719066779}


___