# Predicción para Kaggle

Incluye:

- Modelo optimizado, semilla única
- Semillerío
- Semillerío con modelo simplificado


In [None]:
# Librerias
import os
import pandas as pd
import numpy as np
import datetime
import joblib

####################
# funciones y clases
from utils import psi, drift_deflacion

#######
# rutas
# datasets
from config import dataset_file_fe6_6pqt
# optimizacion
from config import db_path
# modelos
from config import modelos_path
# predicciones
from config import pred_path

##########
# pipeline
from processing import ModelPipeline
from sklearn.impute import SimpleImputer

##############
# optimización
import optuna

#########
# modelos
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings

# Ignorar advertencias de tipo UserWarning
warnings.filterwarnings('ignore', category=UserWarning, module='pandas')
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')


Variables de train y test

In [None]:
# Business constants (presumably gain per correctly targeted customer and cost
# per stimulus sent -- confirm against the competition rules).
ganancia_acierto = 273000
costo_estimulo = 7000

# Every monthly period (YYYYMM) available for training: 201901 .. 202106.
mes_train_all = [
    year * 100 + month
    for year, last_month in ((2019, 12), (2020, 12), (2021, 6))
    for month in range(1, last_month + 1)
]

# Trailing windows, all ending at 202106.
mes_train_ult_3_meses = mes_train_all[-3:]

mes_train_ult_6_meses = mes_train_all[-6:]

# NOTE(review): despite the name, this window holds 10 periods
# (202009 .. 202106), not 9 -- confirm which one is intended.
mes_train_ult_9_meses = mes_train_all[-10:]

# NOTE(review): this window holds 13 periods (202006 .. 202106) -- confirm
# whether a 12-month window was intended.
mes_train_ult_anio = mes_train_all[-13:]

# Single most recent training period and the period to score.
mes_train = mes_train_all[-1:]
mes_test = 202108

# Probability cut-off; not referenced by the cells visible in this section.
threshold = 0.025

# Random seeds (presumably for the multi-seed "semillerío" runs).
semillas = [437809, 327347, 392879, 455783, 217163]

## Loading data

In [None]:
# Read the full dataset from the parquet file configured in `config`.
data = pd.read_parquet(dataset_file_fe6_6pqt)

# Training rows: periods in the last-six-months window.
train_data = data.loc[data['foto_mes'].isin(mes_train_ult_6_meses)]
# Scoring rows: the single period to predict.
score_data = data.loc[data['foto_mes'].eq(mes_test)]

# Drop the reference to the full frame; only the two subsets are used below.
del data

Preprocesando data

In [None]:
# Columns that are NaN on every training row. They are removed up-front so the
# column labels passed to pd.DataFrame below stay aligned with the imputer
# output (see the SimpleImputer documentation on all-missing features).
cols_with_all_nan = train_data.columns[train_data.isna().all()]
print("Columns with all NaN values:", cols_with_all_nan.tolist())

y_train = train_data['clase_ternaria']
# `cols_with_all_nan` is a pandas Index. `list + Index` is element-wise
# arithmetic, not concatenation, so convert it to a plain list first.
# Without this the target column is not reliably dropped from the features.
cols_to_drop = ['clase_ternaria'] + list(cols_with_all_nan)
X_train = train_data.drop(columns=cols_to_drop)

# Impute missing values with the per-column median learned on the training set.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X_train_imp = pd.DataFrame(imp_median.fit_transform(X_train), columns=X_train.columns)

del X_train

# Optional: flag categorical variables.
# LightGBM can handle categorical features natively when they carry the
# 'category' dtype, so object-typed columns are detected and converted.
# NOTE(review): the frame is rebuilt from the imputer's array output, so
# presumably no column is object-typed here and this list is empty -- confirm.
categorical_features = [col for col in X_train_imp.columns if X_train_imp[col].dtype == 'object']

# Convert the detected categorical columns to 'category' dtype for LightGBM.
for col in categorical_features:
    X_train_imp[col] = X_train_imp[col].astype('category')

## Modelo

**Prepro in 6 months and Conceptual FE 6 months**

> comp02_prepro_6.ipynb

> comp02_fe6_6.ipynb

**Usando el último mes para optimizar**

Cargando estudio de optimización

In [None]:
# Settings that identify the optimization experiment.
s = 1
prepro = 6 # data quality + data drifting
fe = 6 # conceptual feature engineering, 6 months
training = 1 # one month used for optimization

# Optuna storage (SQLite file under db_path) and the study name built from
# the settings above.
storage_name = "sqlite:///" + db_path + "optimization_lgbm.db"
study_name = f"exp_lgbm_pr{prepro}_fe{fe}_tr{training}"

loaded_study = optuna.load_study(study_name = study_name,
                                 storage = storage_name)

# Best hyperparameters as a plain dict. `best_trial` is a FrozenTrial object
# and cannot be unpacked with ** into the estimator constructor.
best_params = loaded_study.best_params

# Training
# NOTE(review): no random_state is set here unless the study parameters
# include one -- confirm if a fixed seed is expected for this model.
model = LGBMClassifier(**best_params)
print("Entrenando modelo con:")
print(best_params)
model.fit(X_train_imp, y_train)

Guardando modelo

In [None]:
# Timestamp suffix for the artifact name.
# NOTE(review): this format renders as day + literal 'T', then month, year and
# time (e.g. "05T-11-202413-45-10"); the 'T' placement looks unintended --
# confirm before changing, since it alters file names.
ft = "%dT-%m-%Y%H-%M-%S"
t_now = datetime.datetime.now().strftime(ft)

# Model name = experiment identifier immediately followed by the timestamp.
model_name = f"lgbm_pr{prepro}_fe{fe}_tr{training}"+t_now

# Destination file inside the configured models directory.
model_path = "".join([modelos_path, model_name, ".pkl"])

# Persist the fitted model to disk.
joblib.dump(model, model_path)

## Predicción simple

In [None]:
X_kaggle = score_data

del score_data

# Take the customer ids from the raw scoring frame, before imputation.
# The imputed frame is rebuilt from the imputer's array output, which
# presumably turns the ids into floats and would write them as e.g.
# "123.0" in the submission file.
numero_de_cliente = X_kaggle['numero_de_cliente']

# Drop the same columns that were dropped from the training features.
# `cols_with_all_nan` may be a pandas Index, and `list + Index` is element-wise
# arithmetic rather than concatenation, so convert it to a list first.
X_kaggle = X_kaggle.drop(columns=['clase_ternaria'] + list(cols_with_all_nan))

# Impute with the imputer already fitted on X_train (no refit on scoring data).
X_kaggle_imp = pd.DataFrame(imp_median.transform(X_kaggle), columns=X_kaggle.columns)

del X_kaggle

# Apply the same categorical dtype conversion used for the training features.
for col in categorical_features:
    X_kaggle_imp[col] = X_kaggle_imp[col].astype('category')

# Prediction
y_pred_proba = model.predict_proba(X_kaggle_imp)
# NOTE(review): assumes column index 2 is the positive ("BAJA+2") class --
# verify against model.classes_, since the column order follows the labels.
proba_baja2 = y_pred_proba[:,2]

# Decision threshold
thr_opt_mean = 0.019367

# `proba_baja2` is a 1-D numpy array, so it is compared directly against the
# threshold (indexing it with a string key raises an IndexError).
submission = pd.DataFrame({
    'numero_de_cliente': numero_de_cliente.values,
    'Predicted': (proba_baja2 >= thr_opt_mean).astype(int)
})


Guardando entrega simple

In [None]:
# Timestamp suffix for the submission file name (same format as the one used
# when saving the model).
ft = "%dT-%m-%Y%H-%M-%S"
t_now = datetime.datetime.now().strftime(ft)

# File name = experiment identifier + timestamp + extension, placed in the
# configured predictions directory.
pred_name = f"lgbm_pr{prepro}_fe{fe}_tr{training}"+t_now+".csv"
pred_file = "".join([pred_path, pred_name])

# Write the submission DataFrame to CSV without the index column.
submission.to_csv(pred_file, index=False)
print(f"Predicciones guardadas en {pred_file}")

# Report absolute and relative (percentage) counts of each predicted label.
value_counts = submission['Predicted'].value_counts()
total_count = len(submission)
print("\nValue Counts:")
print(value_counts)
print("\nFrecuencia Relativa:")
print(value_counts.div(total_count).mul(100))

## Semillerío

In [None]:
###