# Optimización Pipeline

Incluye:

- Tuning de hiperparámetros (con meses históricos)

In [1]:
# Librerias
import os
import pandas as pd
import numpy as np

#######
# rutas
# datasets
from config import dataset_file_fe6_6xpqt,\
                   dataset_file_fe6_6xpqt_opt_under
# optimizacion
from config import db_path
# modelos
from config import modelos_path
# predicciones
from config import pred_path


##########
# pipeline
from processing import ModelPipeline
from processing import analyze_study
from sklearn.impute import SimpleImputer
import optuna


Variables de train y test

In [2]:
# Monetary constants (gain per hit / cost per stimulus, judging by the names).
# NOTE(review): not referenced in the cells shown here — presumably consumed by
# the pipeline's gain metric; confirm.
ganancia_acierto = 273000
costo_estimulo = 7000

# Every month available for training, as YYYYMM integers in chronological
# order (201901 .. 202107).
mes_train_all = [201901, 201902, 201903, 201904, 201905, 201906, 201907, 201908,
                 201909, 201910, 201911, 201912, 202001, 202002, 202003, 202004,
                 202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012,
                 202101, 202102, 202103, 202104, 202105, 202106, 202107]

# Trailing training windows, all ending at 202107. They are sliced from
# mes_train_all so that each window holds exactly the number of months its
# name states (a hand-written list can easily be off by one month).
mes_train_ult_3_meses = mes_train_all[-3:]    # 202105 .. 202107

mes_train_ult_6_meses = mes_train_all[-6:]    # 202102 .. 202107

# Nine months means starting at 202011; starting at 202010 would give ten.
mes_train_ult_9_meses = mes_train_all[-9:]    # 202011 .. 202107

mes_train_ult_anio = mes_train_all[-12:]      # 202008 .. 202107

# Single-month training set and the month used for testing.
mes_train = [202107]
mes_test = 202109

# Default threshold value.
# NOTE(review): presumably a probability cut-off — confirm against the pipeline.
threshold = 0.025

# Random seeds handed to the pipeline.
semillas = [437809, 327347, 392879, 455783, 217163]

## LightGBM 

**Prepro in 6 months Conceptual FE 6 months + Lag1 + Delta1**

> comp03_prepro_6x.ipynb

> comp03_fe6_6x.ipynb

**Usando los últimos 12 meses para optimizar** con 10 % de CONTINUA

In [3]:
# Local run: read the under-sampled FE6 dataset from a relative path.
# (Config-driven alternative: pd.read_parquet(dataset_file_fe6_6xpqt_opt_under).)
data = pd.read_parquet("datos/datasets_competencia_03_fe6x_opt_under.parquet")

# Encode the three class labels as integers.
label_mapping = {
    'CONTINUA': 0,
    'BAJA+1': 1,
    'BAJA+2': 2,
}
data['clase_ternaria'] = data['clase_ternaria'].map(label_mapping)

# Keep only the rows whose month falls inside the last-12-months window, then
# separate the target column from the feature columns.
X_train = data.loc[data['foto_mes'].isin(mes_train_ult_anio)]
y_train = X_train['clase_ternaria']
X_train = X_train.drop(columns=['clase_ternaria'])

# The full frame is no longer needed; release it to free kernel memory.
del data

In [None]:
# Settings describing this optimisation run.
s = 1
prepro = 6  # data quality + reduced data-drift handling
fe = 6  # conceptual feature engineering over 6 months
training = 12  # optimise on 12 months with 10 % of CONTINUA

print("### Corriendo pipeline con LightGBM ###")
# Build the pipeline for LightGBM.
# NOTE(review): threshold is hard-coded to 0.015 here, while the configuration
# cell defines threshold = 0.025 — confirm the difference is intentional.
pipeline_lgbm = ModelPipeline(data=None, seeds=semillas,
                              model_type='lightgbm', seed=s,
                              meses_opt=training, meses_test=1,
                              threshold=0.015,  # per back-testing in comp02
                              n_jobs=-1)

# Columns that are entirely NaN must go before imputing: by default
# SimpleImputer discards features with no observed values, which would leave
# the imputed array narrower than X_train.columns.
cols_with_all_nan = X_train.columns[X_train.isna().all()]
print("Columns with all NaN values:", cols_with_all_nan.tolist())

X_train = X_train.drop(columns=cols_with_all_nan)  # extra clean-up

# Median-impute the remaining missing values. The imputer returns a bare
# array, so the row index is restored explicitly: y_train still carries the
# original row labels, and without index= the features would get a fresh
# RangeIndex and any index-aligned operation downstream would silently pair
# features with the wrong targets.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X_train_imp = pd.DataFrame(imp_median.fit_transform(X_train),
                           columns=X_train.columns,
                           index=X_train.index)

# Release the un-imputed frame to free kernel memory.
del X_train

# Collect object-typed columns and cast them to 'category' for LightGBM.
# NOTE(review): the frame is rebuilt from the imputer's single output array,
# so no object columns are expected at this point and this list is likely
# empty — confirm whether categorical handling is still needed.
categorical_features = [col for col in X_train_imp.columns if X_train_imp[col].dtype == 'object']

for col in categorical_features:
    X_train_imp[col] = X_train_imp[col].astype('category')

# Optuna storage (local SQLite file) and study name.
# Config-driven alternative: "sqlite:///" + db_path + "optimization_lgbm.db"
# TODO: upload the database to the cloud for the competition pipeline.
storage_name = "sqlite:///optimizacion/optimization_tree.db"
study_name = "exp_lgbm_comp03_local_v00"

print("\n# Optimizando el modelo")
# Run the hyperparameter search; trials are recorded under study_name in the
# storage above.
pipeline_lgbm.optimize_model(
    X_train_imp, y_train,
    storage_name=storage_name,
    study_name=study_name,
    optimize=True,
    n_trials=185
)

### Corriendo pipeline con LightGBM ###
Columns with all NaN values: ['payroll_slope_1_foto_mes', 'cuenta_corriente_slope_1_foto_mes', 'visa_consumo_slope_1_foto_mes', 'comisiones_mantenimiento_slope_1_foto_mes', 'comisiones_otras_slope_1_foto_mes']

# Optimizando el modelo


[I 2024-11-29 08:01:25,530] Using an existing study with name 'exp_lgbm_comp03_local_v00' instead of creating a new one.


Optimizando lightgbm con 185 pruebas


[I 2024-11-29 08:06:29,373] Trial 42 finished with value: 175618333.33333337 and parameters: {'n_estimators': 512, 'num_leaves': 71, 'learning_rate': 0.02037664360465537, 'min_data_in_leaf': 125, 'feature_fraction': 0.49270352148874796}. Best is trial 30 with value: 176340888.8888889.
[I 2024-11-29 08:09:11,364] Trial 43 finished with value: 175445666.6666667 and parameters: {'n_estimators': 430, 'num_leaves': 52, 'learning_rate': 0.03856911907004415, 'min_data_in_leaf': 179, 'feature_fraction': 0.38876128102191915}. Best is trial 30 with value: 176340888.8888889.


Exploring optimization

In [None]:
# Fetch the study through the project helper, using the same storage URL and
# study name as the optimisation cell; the result feeds the Optuna plots below.
# NOTE(review): presumably returns an optuna Study — confirm in processing.analyze_study.
study = analyze_study(storage_name, study_name)

In [None]:
# Objective value of each trial over the course of the study; the returned
# figure is the cell's output.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Relative importance of each tuned hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)

In [None]:
# One panel per hyperparameter: sampled value against objective value.
optuna.visualization.plot_slice(study)