# Training Pipeline

Training the best M models

Incluye:

- Tuning de hiperparámetros (con meses históricos)

In [1]:
!pip install dask[dataframe]

Collecting dask[dataframe]
  Downloading dask-2024.11.2-py3-none-any.whl.metadata (3.7 kB)
Collecting partd>=1.4.0 (from dask[dataframe])
  Downloading partd-1.4.2-py3-none-any.whl.metadata (4.6 kB)
Collecting toolz>=0.10.0 (from dask[dataframe])
  Downloading toolz-1.0.0-py3-none-any.whl.metadata (5.1 kB)
Collecting importlib-metadata>=4.13.0 (from dask[dataframe])
  Downloading importlib_metadata-8.5.0-py3-none-any.whl.metadata (4.8 kB)
Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.19-py3-none-any.whl.metadata (2.6 kB)
Collecting pyarrow>=14.0.1 (from dask-expr<1.2,>=1.1->dask[dataframe])
  Downloading pyarrow-18.0.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting zipp>=3.20 (from importlib-metadata>=4.13.0->dask[dataframe])
  Downloading zipp-3.21.0-py3-none-any.whl.metadata (3.7 kB)
Collecting locket (from partd>=1.4.0->dask[dataframe])
  Downloading locket-1.0.0-py2.py3-none-any.whl.metadata (2.8 kB)
Downloading dask_expr-1.1.19-py3-none-

In [2]:
# Librerias
import os
import pandas as pd
import numpy as np

#######
# rutas
# datasets
from config import dataset_file_fe6_6pqt
# optimizacion
from config import db_path
# modelos
from config import modelos_path
# predicciones
from config import pred_path

##########
# pipeline
from processing import ModelPipeline
from sklearn.impute import SimpleImputer


Variables de train y test

In [3]:
# Payoff constants. The names suggest "gain per correct hit" and "cost per
# stimulus sent" — NOTE(review): confirm meaning/units against the pipeline code.
ganancia_acierto = 273_000
costo_estimulo = 7_000

# All candidate training periods as YYYYMM integers: 201901 through 202106.
mes_train_all = [
    year * 100 + month
    for year in (2019, 2020, 2021)
    for month in range(1, 13)
    if year * 100 + month <= 202106
]

# Trailing windows, taken as independent copies of the tail of mes_train_all.
mes_train_ult_3_meses = mes_train_all[-3:]   # 202104 .. 202106

mes_train_ult_6_meses = mes_train_all[-6:]   # 202101 .. 202106

# NOTE(review): despite the name, this window holds 10 periods (202009 .. 202106).
mes_train_ult_9_meses = mes_train_all[-10:]

# NOTE(review): this window holds 13 periods (202006 .. 202106), not 12.
mes_train_ult_anio = mes_train_all[-13:]

# Periods actually used for training and for testing.
mes_train = [202106]
mes_test = 202108

# Cut-off value; how it is applied is not shown in this section.
threshold = 0.025

# Random seeds handed to the model pipeline.
semillas = [437809, 327347, 392879, 455783, 217163]

## Loading data

In [10]:
# Alternative source, using the dataset path imported from config:
#   data = pd.read_parquet(dataset_file_fe6_6pqt)

# Local run: load the training dataset from a parquet file located relative
# to the current working directory.
data = pd.read_parquet(
    path="datos/datasets_competencia_02_fe6_6_6m_train.parquet"
)


## LightGBM 

**Prepro in 6 months and Conceptual FE 6 months**

> comp02_prepro_6.ipynb

> comp02_fe6_6.ipynb

**Usando los últimos 6 meses para optimizar**

In [None]:

# Experiment settings: `s` is passed to the pipeline as its seed; the other
# three labels are only used to build the Optuna study name further down.
s = 1
prepro = 6  # preprocessing variant: data quality + data drifting
fe = 6  # feature-engineering variant: conceptual FE over 6 months
# NOTE(review): this label says 6, but the cell trains on `mes_train` and the
# result variables at the bottom end in `_tr1` — confirm which one is intended
# before renaming anything (the study name depends on this value).
training = 6

print("### Corriendo pipeline con LightGBM ###")
# Build the pipeline for the 'lightgbm' model type.
pipeline_lgbm = ModelPipeline(data, semillas, model_type='lightgbm', seed=s, n_jobs=-1)

# Remove the notebook-level name. After this, the cell cannot be re-run
# without executing the data-loading cell again.
del data

X_train, y_train = pipeline_lgbm.def_xy(mes_train)

# Find the columns that contain nothing but NaN.
cols_with_all_nan = X_train.columns[X_train.isna().all()]
print("Columns with all NaN values:", cols_with_all_nan.tolist())

# Drop them before imputing: a median cannot be computed for an empty column.
X_train = X_train.drop(columns=cols_with_all_nan)

# Median imputation of the remaining missing values.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# fit_transform returns a bare array, so both the column labels and the row
# index are re-attached explicitly. Without `index=` the frame would silently
# get a fresh RangeIndex and stop sharing row labels with X_train.
X_train_imp = pd.DataFrame(
    imp_median.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

del X_train

# Optional: mark categorical columns so LightGBM can handle them natively.
# NOTE(review): the frame was just rebuilt from the imputer's numeric output,
# so this selection is expected to be empty and the loop a no-op — confirm
# whether categorical handling should happen before imputation instead.
categorical_features = [col for col in X_train_imp.columns if X_train_imp[col].dtype == 'object']

# Convert any detected categorical column to the 'category' dtype.
for col in categorical_features:
    X_train_imp[col] = X_train_imp[col].astype('category')

print("\n# Entrenando el modelo base LightGBM")
pipeline_lgbm.train_base_model(X_train_imp, y_train)

# Optuna storage (SQLite file, relative to the working directory).
# Alternative using the configured directory:
#   storage_name = "sqlite:///" + db_path + "optimization_lgbm.db"
storage_dir = "optimizacion"
# SQLite cannot create the database file inside a missing directory.
os.makedirs(storage_dir, exist_ok=True)
storage_name = f"sqlite:///{storage_dir}/optimization_tree.db"
study_name = f"exp_lgbm_pr{prepro}_fe{fe}_tr{training}"

print("\n# Optimizando el modelo")
# NOTE(review): optimize=False is passed together with n_trials=100; what the
# flag toggles is defined inside ModelPipeline — confirm the intended mode.
pipeline_lgbm.optimize_model(
    X_train_imp, y_train,
    storage_name=storage_name,
    study_name=study_name,
    optimize=False,
    n_trials=100
)

# Train the model again using the optimized parameters.
print("\n# Entrenando el mejor modelo con parámetros optimizados")
pipeline_lgbm.train_best_model(X_train_imp, y_train)

# Compare the base model against the optimized one and plot the comparison.
print("\n# Comparando modelos")
results_base_lgbm_pr6_fe6_tr1, results_best_lgbm_pr6_fe6_tr1 = pipeline_lgbm.compare_models(X_train_imp, y_train)
pipeline_lgbm.plot_comparisons(results_base_lgbm_pr6_fe6_tr1, results_best_lgbm_pr6_fe6_tr1)

### Corriendo pipeline con LightGBM ###
Columns with all NaN values: ['payroll_slope_1_foto_mes', 'cuenta_corriente_slope_1_foto_mes', 'visa_consumo_slope_1_foto_mes', 'comisiones_mantenimiento_slope_1_foto_mes', 'comisiones_otras_slope_1_foto_mes']

# Entrenando el modelo base LightGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.165531 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 49336
[LightGBM] [Info] Number of data points in the train set: 164876, number of used features: 402
[LightGBM] [Info] Start training from score -5.201705
[LightGBM] [Info] Start training from score -5.033804
[LightGBM] [Info] Start training from score -0.012094

# Optimizando el modelo


[I 2024-11-19 15:43:47,489] A new study created in RDB with name: exp_lgbm_pr6_fe6_tr6


Optimizando lightgbm con 100 pruebas


[I 2024-11-19 15:45:27,462] Trial 0 finished with value: 92045333.33333334 and parameters: {'n_estimators': 89, 'num_leaves': 143, 'learning_rate': 0.004749786099207218, 'min_data_in_leaf': 71, 'min_gain_to_split': 0.7323179472983463, 'feature_fraction': 0.5418692410684371, 'bagging_fraction': 0.5455362627643998, 'bagging_freq': 1, 'max_bin': 209}. Best is trial 0 with value: 92045333.33333334.
[I 2024-11-19 15:47:16,583] Trial 1 finished with value: 47310666.66666667 and parameters: {'n_estimators': 61, 'num_leaves': 191, 'learning_rate': 0.029298329292614073, 'min_data_in_leaf': 93, 'min_gain_to_split': 0.1435341926694329, 'feature_fraction': 0.8063704723076408, 'bagging_fraction': 0.7830765435330829, 'bagging_freq': 7, 'max_bin': 224}. Best is trial 0 with value: 92045333.33333334.
[I 2024-11-19 15:48:16,118] Trial 2 finished with value: 42886666.66666667 and parameters: {'n_estimators': 402, 'num_leaves': 96, 'learning_rate': 0.11635288914285075, 'min_data_in_leaf': 30, 'min_gain_t

Mejores parámetros para lightgbm: {'n_estimators': 256, 'num_leaves': 37, 'learning_rate': 0.019705417599590425, 'min_data_in_leaf': 60, 'min_gain_to_split': 0.7542063172070039, 'feature_fraction': 0.5020429576814027, 'bagging_fraction': 0.8623805856452469, 'bagging_freq': 2, 'max_bin': 123}

# Entrenando el mejor modelo con parámetros optimizados
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.192328 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26687
[LightGBM] [Info] Number of data points in the train set: 164876, number of used features: 402
[LightGBM] [Info] Start training from score -5.201705
[LightGBM] [Info] Start training from score -5.033804
[LightGBM] [Info] Start training from score -0.012094

# Comparando modelos


Exploring optimization

In [None]:
### Placeholder: no optimization-exploration code has been written in this cell yet.

___