# Predicción para Kaggle

Incluye:

- Modelo optimizado semillerio
- Semillerío con modelo Denicolay, mod y original

In [1]:
# Librerias
import os
import pandas as pd
import numpy as np
import datetime
import joblib

####################
# funciones y clases
from utils import psi, drift_deflacion

#######
# rutas
# datasets
from config import dataset_file_fe6_6pqt, dataset_file_fe6_6xpqt
# optimizacion
from config import db_path
# modelos
from config import modelos_path
# predicciones
from config import pred_path

##########
# pipeline
from processing import ModelPipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

##############
# optimización
import optuna

#########
# modelos
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings

# Ignorar advertencias de tipo UserWarning
warnings.filterwarnings('ignore', category=UserWarning, module='pandas')
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

Matplotlib is building the font cache; this may take a moment.
  from .autonotebook import tqdm as notebook_tqdm


Variables de train y test

In [2]:
ganancia_acierto = 273000
costo_estimulo = 7000

mes_train_all = [201901, 201902, 201903, 201904, 201905, 201906, 201907, 201908,
                 201909, 201910, 201911, 201912, 202001, 202002, 202003, 202004,
                 202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012,
                 202101, 202102, 202103, 202104, 202105, 202106]

mes_train_ult_3_meses = [202104, 202105, 202106]

mes_train_ult_6_meses = [202101, 202102, 202103, 202104, 202105, 202106]

mes_train_ult_9_meses = [202009, 202010, 202011, 202012, 202101, 202102, 202103, 202104, 202105, 202106]

mes_train_ult_anio = [202006, 202007, 202008, 202009, 202010, 202011, 202012,
                 202101, 202102, 202103, 202104, 202105, 202106]

mes_train = [202106]
mes_test = 202108

threshold = 0.025

semillas = [437809, 327347, 392879, 455783, 217163]

## Loading data

In [None]:
# dataset_file_fe6_6xpqt = 'gs://sst001/datasets/competencia_02_fe6_6x.parquet'

data = pd.read_parquet(dataset_file_fe6_6xpqt)

label_mapping = {'CONTINUA': 0, 'BAJA+1': 1, 'BAJA+2': 2}

data['clase_ternaria'] = data['clase_ternaria'].map(label_mapping)

# train data con 9 meses
meses_train = 9
X_train = data[data['foto_mes'].isin(mes_train_ult_9_meses)]
y_train = X_train['clase_ternaria']
X_train = X_train.drop(columns=['clase_ternaria'])

X_kaggle = data[data['foto_mes'] == mes_test]
X_kaggle = X_kaggle.drop(columns=['clase_ternaria']) # nulls

del data

Preprocesando data

In [None]:
# Imputacion de Xs
cols_with_all_nan = X_train.columns[X_train.isna().all()].tolist()
print("Columns with all NaN values:", cols_with_all_nan)
X_train = X_train.drop(columns=cols_with_all_nan)
X_kaggle = X_kaggle.drop(columns=cols_with_all_nan)

# Imputación de nulls
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X_train_imp = pd.DataFrame(imp_median.fit_transform(X_train), columns=X_train.columns)
X_kaggle_imp = pd.DataFrame(imp_median.transform(X_kaggle), columns=X_train.columns)

del X_train
del X_kaggle

# Codificar variables categóricas
categorical_features = [col for col in X_train_imp.columns if X_train_imp[col].dtype == 'object']

# Convertir variables categóricas a 'category' dtype para LightGBM
for col in categorical_features:
    X_train_imp[col] = X_train_imp[col].astype('category')
    X_kaggle_imp[col] = X_kaggle_imp[col].astype('category')

numero_de_cliente = X_kaggle_imp['numero_de_cliente'].astype(int)

Columns with all NaN values: ['payroll_slope_1_foto_mes', 'cuenta_corriente_slope_1_foto_mes', 'visa_consumo_slope_1_foto_mes', 'comisiones_mantenimiento_slope_1_foto_mes', 'comisiones_otras_slope_1_foto_mes']


## Modelo optimo con semillerio

**Prepro in 6 months and Conceptual FE 6 months**

> comp02_prepro_6.ipynb

> comp02_fe6_6.ipynb

**Usando los últimos 3 meses para optimizar de forma local**

Seleccionado según:

> comp02_pipeline_comp.ipynb

In [2]:
s_r = range(217163, 455783, 7*7*7*7*5) # 25 semillas
len(list(s_r))

20

In [None]:
# Condiciones de la optimización
s = 1
prepro = 6 # data quality + data drifting
fe = 6 # feature engineering conceptual 6 meses
opt = 3 # 3 meses de optimización

# Definir el almacenamiento de Optuna
storage_name = "sqlite:///" + db_path + "optimization_tree.db"

# carga local
# storage_name = "sqlite:///optimizacion/optimization_tree.db"
study_name = f"exp_xgb_pr{prepro}_fe{fe}_tr{opt}_x"

study = optuna.load_study(study_name = study_name,
                                 storage = storage_name)

# Mejores parámetros
opt_params = study.best_trial.params
opt_params.update({'n_jobs': -1})

# Entrenamiento 
print("Running semillerío para XGBClassifier Opt")
print(opt_params)

# para registrar las probabilidades
df_sem_proba = pd.DataFrame({
                            'client': numero_de_cliente.values,
                        })

j = 0
s_r = range(217163, 455783, 7*7*7*7*4) # 25 semillas
s_total = len(list(s_r))
for s in s_r:
    # nueva instancia del modelos con semilla
    seed = s + (7+j)
    model = XGBClassifier(**opt_params, seed=seed)
    # entreno
    print(f"Entrenando modelo con semilla: {seed}, {j+1} de {s_total}")
    model.fit(X_train_imp, y_train)
    # predigo proba
    y_pred_proba = model.predict_proba(X_kaggle_imp)
    # proba baja+2
    proba_baja2 = y_pred_proba[:,2]
    df_sem_proba[f'proba_s{seed}'] = proba_baja2
    j += 1

# Promediando proba de cada semilla
proba_s_columns = df_sem_proba.filter(regex='^proba_s')
proba_s_mean = proba_s_columns.mean(axis=1)

df_sem_proba['proba_sem_mean'] = proba_s_mean

# Umbral
thr_opt_sem = 0.0138 # segun comp02_back-testing con sem

# Prediccion
df_sem_proba['pred'] = np.where(df_sem_proba.proba_sem_mean >= thr_opt_sem, 1, 0)

df_sem_proba.head()

OperationalError: (sqlite3.OperationalError) unable to open database file
(Background on this error at: https://sqlalche.me/e/20/e3q8)

Preparando entrega

In [None]:
submission = pd.DataFrame({
    'numero_de_cliente': numero_de_cliente.values,
    'Predicted': df_sem_proba['pred'].values
})

# Imprimir value counts de las predicciones
value_counts = submission['Predicted'].value_counts()
total_count = len(submission)
print("\nValue Counts:")
print(value_counts)
print("\nFrecuencia Relativa:")
print((value_counts / total_count) * 100)

submission.info()

Guardando semillerio opt datos x

In [None]:
ft = "%dT-%m-%Y%H-%M-%S"
t_now = datetime.datetime.now().strftime(ft)

pred_name = f"pred_xgb_pr{prepro}_fe{fe}x_op{opt}_tr{meses_train}_sem{s_total}_"+t_now+".csv"

pred_file = pred_path + pred_name
proba_file = pred_path + "probas/" + pred_name

# Guardar el DataFrame en un archivo CSV
submission.to_csv(pred_file, index=False)
print(f"Predicciones guardadas en {pred_file}")
# Guardar el DataFrame en un archivo CSV
submission.to_csv('predicciones/'+pred_name, index=False)
print(f"Y en {'predicciones/'+pred_name}")

# Guardamos las probas
df_sem_proba.to_csv(proba_file, index=False)
print(f"Probas guardadas en {proba_file}")
df_sem_proba.to_csv('predicciones/probas/'+pred_name, index=False)
print(f"Y en {'predicciones/probas/'+pred_name}")

## Semillerío con modelo Denicolay

Datos X, modificado para compenzar la falta de variables

In [10]:
numero_de_cliente = X_kaggle_imp['numero_de_cliente'].astype(int)

In [None]:
semillero_params = {'n_estimators': 23,
                  'num_leaves': 32,
                  'learning_rate': 0.34,
                  'min_data_in_leaf': 711,
                  'feature_fraction': 2*0.2, # x2 para tratar de compenzar la falta de variables
                  'extra_trees': False,
}
semillero_params.update({'n_jobs': -1})

print("Running semillerío for Semillerio Denicolay")
print(semillero_params)

# para registrar las probabilidades
df_sem_proba = pd.DataFrame({
                            'client': numero_de_cliente.values,
                        })

j = 0
s_r = range(217163, 455783, 7*7*7*7) # 100 semillas
s_total = len(list(s_r))
for s in s_r: # con 100 semillas
    # nueva instancia del modelos con semilla
    seed = s+(7+j)
    model = LGBMClassifier(**semillero_params, random_state=seed)
    # entreno
    print(f"Entrenando modelo con semilla: {seed}, {j+1} de {s_total}")
    model.fit(X_train_imp, y_train)
    # predigo proba
    y_pred_proba = model.predict_proba(X_kaggle_imp)
    # proba baja+2
    proba_baja2 = y_pred_proba[:,2]
    df_sem_proba[f'proba_s{seed}'] = proba_baja2
    j += 1

# Promediando proba de cada semilla
proba_s_columns = df_sem_proba.filter(regex='^proba_s')
proba_s_mean = proba_s_columns.mean(axis=1)

df_sem_proba['proba_sem_mean'] = proba_s_mean

# Umbral
thr_opt_sem = 0.016 # # segun comp02_back-testing con semx

# Prediccion
df_sem_proba['pred'] = np.where(df_sem_proba.proba_sem_mean >= thr_opt_sem, 1, 0)

df_sem_proba.head()

Running semillerío for Semillerio Denicolay
{'n_estimators': 23, 'num_leaves': 32, 'learning_rate': 0.34, 'min_data_in_leaf': 711, 'feature_fraction': 0.4, 'extra_trees': False, 'n_jobs': -1}
Entrenando modelo con semilla: 217170, 1 de 100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 5.379063 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 88721
[LightGBM] [Info] Number of data points in the train set: 1622316, number of used features: 675
[LightGBM] [Info] Start training from score -0.009903
[LightGBM] [Info] Start training from score -5.350779
[LightGBM] [Info] Start training from score -5.276681
Entrenando modelo con semilla: 219572, 2 de 100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.433988 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88

  df_sem_proba[f'proba_s{seed}'] = proba_baja2
  df_sem_proba['proba_sem_mean'] = proba_s_mean
  df_sem_proba['pred'] = np.where(df_sem_proba.proba_sem_mean >= thr_opt_sem, 1, 0)


Unnamed: 0,client,proba_s217170,proba_s219572,proba_s221974,proba_s224376,proba_s226778,proba_s229180,proba_s231582,proba_s233984,proba_s236386,...,proba_s438154,proba_s440556,proba_s442958,proba_s445360,proba_s447762,proba_s450164,proba_s452566,proba_s454968,proba_sem_mean,pred
0,725464666,0.017956,0.016593,0.022882,0.070633,0.030207,0.04579,0.015359,0.036982,0.016308,...,0.008267,0.014588,0.018021,0.029319,0.036695,0.047458,0.02106,0.020221,0.032141,1
1,468172440,0.055022,0.021651,0.037573,0.022239,0.032428,0.028908,0.02814,0.029936,0.049641,...,0.017445,0.035208,0.076392,0.033161,0.022649,0.026039,0.028279,0.042274,0.042765,1
2,1168083010,0.040582,0.067715,0.242897,0.198219,0.068213,0.039407,0.030925,0.031466,0.031424,...,0.574881,0.040907,0.077673,0.099376,0.050301,0.062497,0.546794,0.209313,0.150169,1
3,352814875,0.004396,0.005066,0.003791,0.004786,0.007153,0.002884,0.002623,0.006897,0.003534,...,0.00233,0.009305,0.003114,0.003117,0.003377,0.003707,0.004243,0.005777,0.004797,0
4,1229717603,0.010715,0.019277,0.00994,0.005081,0.010911,0.013234,0.013114,0.028189,0.010068,...,0.015951,0.012115,0.013046,0.0106,0.014546,0.018684,0.014982,0.011884,0.015079,0


Preparando entrega

In [12]:
submission = pd.DataFrame({
    'numero_de_cliente': numero_de_cliente.values,
    'Predicted': df_sem_proba['pred'].values
})

# Imprimir value counts de las predicciones
value_counts = submission['Predicted'].value_counts()
total_count = len(submission)
print("\nValue Counts:")
print(value_counts)
print("\nFrecuencia Relativa:")
print((value_counts / total_count) * 100)

submission.info()


Value Counts:
Predicted
0    153400
1     12042
Name: count, dtype: int64

Frecuencia Relativa:
Predicted
0    92.721316
1     7.278684
Name: count, dtype: float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165442 entries, 0 to 165441
Data columns (total 2 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   numero_de_cliente  165442 non-null  int64
 1   Predicted          165442 non-null  int64
dtypes: int64(2)
memory usage: 2.5 MB


Guardando pred semillerio denicolay

In [None]:
ft = "%dT-%m-%Y%H-%M-%S"
t_now = datetime.datetime.now().strftime(ft)

pred_name = f"pred_lgbm_mod_pr{prepro}_fe{fe}x_tr{meses_train}_sem{s_total}_"+t_now+".csv"

pred_file = pred_path + pred_name
proba_file = pred_path + "probas/" + pred_name

# Guardar el DataFrame en un archivo CSV
submission.to_csv(pred_file, index=False)
print(f"Predicciones guardadas en {pred_file}")
# Guardar el DataFrame en un archivo CSV
submission.to_csv('predicciones/'+pred_name, index=False)
print(f"Y en {'predicciones/'+pred_name}")

# Guardamos las probas
df_sem_proba.to_csv(proba_file, index=False)
print(f"Probas guardadas en {proba_file}")
df_sem_proba.to_csv('predicciones/probas/'+pred_name, index=False)
print(f"Y en {'predicciones/probas/'+pred_name}")

Y en predicciones/lgbm_mod_pr6_fe6x_tr9_sem100_23T-11-202414-37-33.csv
Probas guardadas en /home/santtedo/buckets/b1/predicciones/probas/lgbm_mod_pr6_fe6x_tr9_sem100_23T-11-202414-37-33.csv
Y en predicciones/probas/lgbm_mod_pr6_fe6x_tr9_sem100_23T-11-202414-37-33.csv


## Semillerío con modelo Denicolay

Datos X

In [None]:
numero_de_cliente = X_kaggle_imp['numero_de_cliente'].astype(int)

In [None]:
semillero_params = {'n_estimators': 23,
                  'num_leaves': 32,
                  'learning_rate': 0.34,
                  'min_data_in_leaf': 711,
                  'feature_fraction': 0.2, # x2 para tratar de compenzar la falta de variables
                  'extra_trees': False,
}
semillero_params.update({'n_jobs': -1})

print("Running semillerío for Semillerio Denicolay")
print(semillero_params)

# para registrar las probabilidades
df_sem_proba = pd.DataFrame({
                            'client': numero_de_cliente.values,
                        })

j = 0
s_r = range(217163, 455783, 7*7*7*7) # 100 semillas
s_total = len(list(s_r))
for s in s_r: # con 100 semillas
    # nueva instancia del modelos con semilla
    seed = s+(7+j)
    model = LGBMClassifier(**semillero_params, random_state=seed)
    # entreno
    print(f"Entrenando modelo con semilla: {seed}, {j+1} de {s_total}")
    model.fit(X_train_imp, y_train)
    # predigo proba
    y_pred_proba = model.predict_proba(X_kaggle_imp)
    # proba baja+2
    proba_baja2 = y_pred_proba[:,2]
    df_sem_proba[f'proba_s{seed}'] = proba_baja2
    j += 1

# Promediando proba de cada semilla
proba_s_columns = df_sem_proba.filter(regex='^proba_s')
proba_s_mean = proba_s_columns.mean(axis=1)

df_sem_proba['proba_sem_mean'] = proba_s_mean

# Umbral
thr_opt_sem = 0.013 # # segun comp02_back-testing con semx

# Prediccion
df_sem_proba['pred'] = np.where(df_sem_proba.proba_sem_mean >= thr_opt_sem, 1, 0)

df_sem_proba.head()

Preparando entrega

In [None]:
submission = pd.DataFrame({
    'numero_de_cliente': numero_de_cliente.values,
    'Predicted': df_sem_proba['pred'].values
})

# Imprimir value counts de las predicciones
value_counts = submission['Predicted'].value_counts()
total_count = len(submission)
print("\nValue Counts:")
print(value_counts)
print("\nFrecuencia Relativa:")
print((value_counts / total_count) * 100)

submission.info()

Guardando pred semillerio denicolay

In [None]:
ft = "%dT-%m-%Y%H-%M-%S"
t_now = datetime.datetime.now().strftime(ft)

pred_name = f"pred_lgbm_pr{prepro}_fe{fe}x_tr{meses_train}_sem{s_total}_"+t_now+".csv"

pred_file = pred_path + pred_name
proba_file = pred_path + "probas/" + pred_name

# Guardar el DataFrame en un archivo CSV
submission.to_csv(pred_file, index=False)
print(f"Predicciones guardadas en {pred_file}")
# Guardar el DataFrame en un archivo CSV
submission.to_csv('predicciones/'+pred_name, index=False)
print(f"Y en {'predicciones/'+pred_name}")

# Guardamos las probas
df_sem_proba.to_csv(proba_file, index=False)
print(f"Probas guardadas en {proba_file}")
df_sem_proba.to_csv('predicciones/probas/'+pred_name, index=False)
print(f"Y en {'predicciones/probas/'+pred_name}")