# Predicción para Kaggle

> Modelo optimizado semillerio

> Semillerío con modelo Denicolay

In [1]:
!pip install dask[dataframe]


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Librerias
import os
import pandas as pd
import numpy as np
import datetime
import joblib

####################
# funciones y clases
from utils import psi, drift_deflacion

#######
# rutas
# datasets
from config import dataset_file_fe6_6xxpqt # con lag1&2 + delta1&2
# optimizacion
from config import db_path
# modelos
from config import modelos_path
# predicciones
from config import pred_path

##########
# pipeline
from processing import ModelPipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

##############
# optimización
import optuna

#########
# modelos
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

import warnings

# Ignorar advertencias de tipo UserWarning
warnings.filterwarnings('ignore', category=UserWarning, module='pandas')
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')

  from .autonotebook import tqdm as notebook_tqdm


Variables de train y test

In [3]:
ganancia_acierto = 273000
costo_estimulo = 7000

mes_train_all = [201901, 201902, 201903, 201904, 201905, 201906, 201907, 201908,
                 201909, 201910, 201911, 201912, 202001, 202002, 202003, 202004,
                 202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012,
                 202101, 202102, 202103, 202104, 202105, 202106, 202107]

mes_train = [202107]
mes_test = 202109

threshold = 0.0195

semillas = [437809, 327347, 392879, 455783, 217163]

## Loading data

In [4]:
data = pd.read_parquet(dataset_file_fe6_6xxpqt)

label_mapping = {'CONTINUA': 0, 'BAJA+1': 1, 'BAJA+2': 2}

data['clase_ternaria'] = data['clase_ternaria'].map(label_mapping)

mes_bt_train_all = [
                    # 201901, 201902, 201903, 201904, 201906, 201907, # no los agregó por rotura de variables creadas
                    201908, 201909, 201911, 201912, 202001, 202002, 
                    202003, 202008, 202009, 202010, 202011, 202012,
                    202101, 202102, 202103, 202104, 202105, 202103,
                    202107] # igualmente habra variables perturbadas en los meses que no son tenidos en cuenta

meses_train = 19
X_train = data[data['foto_mes'].isin(mes_bt_train_all)]
y_train = X_train['clase_ternaria']
X_train = X_train.drop(columns=['clase_ternaria'])

X_kaggle = data[data['foto_mes'] == mes_test]
X_kaggle = X_kaggle.drop(columns=['clase_ternaria']) # nulls

del data

Preprocesando data

sin normalizar

In [5]:
# Imputacion de Xs
cols_with_all_nan = X_train.columns[X_train.isna().all()].tolist()
print("Columns with all NaN values:", cols_with_all_nan)
X_train = X_train.drop(columns=cols_with_all_nan)
X_kaggle = X_kaggle.drop(columns=cols_with_all_nan)

# Imputación de nulls
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X_train_imp = pd.DataFrame(imp_median.fit_transform(X_train), columns=X_train.columns)
X_kaggle_imp = pd.DataFrame(imp_median.transform(X_kaggle), columns=X_train.columns)

del X_train
del X_kaggle

# Codificar variables categóricas
categorical_features = [col for col in X_train_imp.columns if X_train_imp[col].dtype == 'object']

# Convertir variables categóricas a 'category' dtype para LightGBM
for col in categorical_features:
    X_train_imp[col] = X_train_imp[col].astype('category')
    X_kaggle_imp[col] = X_kaggle_imp[col].astype('category')

numero_de_cliente = X_kaggle_imp['numero_de_cliente'].astype(int)

Columns with all NaN values: ['payroll_slope_1_foto_mes', 'cuenta_corriente_slope_1_foto_mes', 'visa_consumo_slope_1_foto_mes', 'comisiones_mantenimiento_slope_1_foto_mes', 'comisiones_otras_slope_1_foto_mes']


## Modelo semillerio

**Enfoque conceptual en 6 meses + Lag1&2 + Delta1&2**

> comp03_prepro_6x

> comp03_fe6_6xx

Seleccionado según:

> comp03_back-testing_semxx

Umbral según:

> comp03_kaggle_api_sub

In [6]:
list(range(3))

[0, 1, 2]

In [10]:
# # Definir el almacenamiento de Optuna
# storage_name = "sqlite:///" + db_path + "optimization_tree.db"

# # carga local
# # storage_name = "sqlite:///optimizacion/optimization_tree.db"
# study_name = f"exp_lgbm_comp03_local_v00"

# study = optuna.load_study(study_name = study_name,
#                                  storage = storage_name)

# # Mejores parámetros lgbm opt
# opt_params = study.best_trial.params
# opt_params.update({'n_jobs': -1})

semillero_params = {'n_estimators': 23,
                  'num_leaves': 32,
                  'learning_rate': 0.34,
                  'min_data_in_leaf': 711,
                  'feature_fraction': 0.25,
                  'extra_trees': False,
}

semillero_params.update({'n_jobs': -1})

# Entrenamiento 
print("Running semillerío para entrega")
print(semillero_params)

# para registrar las probabilidades
df_sem_proba = pd.DataFrame({
                            'client': numero_de_cliente.values,
                        })

j = 0
s_r = range(217163, 455783, 7*7*7*6) # 116 semillas
#s_r = list(range(3)) # 3 semillas de prueba

s_total = len(list(s_r))
for s in s_r:
    # nueva instancia del modelos con semilla
    seed = s + (7+j)
    model = LGBMClassifier(**semillero_params, random_state=seed)
    # entreno
    print(f"Entrenando modelo con semilla: {seed}, {j+1} de {s_total}")
    model.fit(X_train_imp, y_train)
    # predigo proba
    y_pred_proba = model.predict_proba(X_kaggle_imp)
    # proba baja+2
    proba_baja2 = y_pred_proba[:,2]
    df_sem_proba[f'proba_s{seed}'] = proba_baja2
    j += 1

# Promediando proba de cada semilla
proba_s_columns = df_sem_proba.filter(regex='^proba_s')
proba_s_mean = proba_s_columns.mean(axis=1)

df_sem_proba['proba_sem_mean'] = proba_s_mean

# Umbral
thr_opt_sem = 0.0195 # segun comp03_kaggle_api_sub

# Segun Denicolay, el óptimo ronda los 11 mil estímulos

# Prediccion
df_sem_proba['pred'] = np.where(df_sem_proba.proba_sem_mean >= thr_opt_sem, 1, 0)

df_sem_proba.head()

Running semillerío para entrega
{'n_estimators': 23, 'num_leaves': 32, 'learning_rate': 0.34, 'min_data_in_leaf': 711, 'feature_fraction': 0.25, 'extra_trees': False, 'n_jobs': -1}
Entrenando modelo con semilla: 217170, 1 de 116
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.040423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124449
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 927
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 219229, 2 de 116
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.039319 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set 

  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.063222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124425
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 928
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 425129, 102 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.075261 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124449
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 928
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 427188, 103 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.065107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124202
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 927
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 429247, 104 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.147487 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124353
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 928
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 431306, 105 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.073305 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124163
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 926
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 433365, 106 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.065368 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124415
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 926
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 435424, 107 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.046836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124441
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 927
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 437483, 108 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.079071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124359
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 927
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 439542, 109 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.061836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124335
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 927
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 441601, 110 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.064753 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124407
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 927
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 443660, 111 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.593392 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124380
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 928
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 445719, 112 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.054115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124352
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 927
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 447778, 113 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.053578 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124372
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 928
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 449837, 114 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.044297 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124337
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 927
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 451896, 115 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.058934 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124361
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 927
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009
Entrenando modelo con semilla: 453955, 116 de 116


  df_sem_proba[f'proba_s{seed}'] = proba_baja2


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.066153 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124284
[LightGBM] [Info] Number of data points in the train set: 2766387, number of used features: 927
[LightGBM] [Info] Start training from score -0.009008
[LightGBM] [Info] Start training from score -5.415698
[LightGBM] [Info] Start training from score -5.399009


  df_sem_proba[f'proba_s{seed}'] = proba_baja2
  df_sem_proba['proba_sem_mean'] = proba_s_mean
  df_sem_proba['pred'] = np.where(df_sem_proba.proba_sem_mean >= thr_opt_sem, 1, 0)


Unnamed: 0,client,proba_s217170,proba_s219229,proba_s221288,proba_s223347,proba_s225406,proba_s227465,proba_s229524,proba_s231583,proba_s233642,...,proba_s439542,proba_s441601,proba_s443660,proba_s445719,proba_s447778,proba_s449837,proba_s451896,proba_s453955,proba_sem_mean,pred
0,278710263,0.002078,0.000727,0.000444,0.00063,0.000794,0.000804,0.000489,0.000721,0.000868,...,0.001174,0.000711,0.000399,0.000625,0.001051,0.000722,0.000943,0.000546,0.001599,0
1,472404689,0.000485,0.000389,0.000432,0.00028,0.000319,0.000663,0.000525,0.000346,0.000499,...,0.000383,0.000403,0.000514,0.000553,0.000393,0.000382,0.000726,0.000264,0.000421,0
2,900822233,0.005777,0.004374,0.003333,0.004014,0.007472,0.005119,0.005925,0.006795,0.004196,...,0.011466,0.007499,0.004737,0.01709,0.004361,0.009139,0.003613,0.009475,0.006848,0
3,305557480,0.000721,0.000641,0.000542,0.000492,0.000745,0.000862,0.000818,0.000971,0.00107,...,0.000429,0.000589,0.000545,0.001291,0.000799,0.000856,0.000762,0.000406,0.000815,0
4,946226702,0.001711,0.001976,0.005455,0.004259,0.003372,0.00487,0.00879,0.002024,0.003018,...,0.003368,0.002482,0.001268,0.004944,0.003297,0.002775,0.003044,0.005235,0.004374,0


Preparando entrega

In [11]:
submission = pd.DataFrame({
    'numero_de_cliente': numero_de_cliente.values,
    'Predicted': df_sem_proba['pred'].values
})

# Imprimir value counts de las predicciones
value_counts = submission['Predicted'].value_counts()
total_count = len(submission)
print("\nValue Counts:")
print(value_counts)
print("\nFrecuencia Relativa:")
print((value_counts / total_count) * 100)

submission.info()


Value Counts:
Predicted
0    154447
1     11197
Name: count, dtype: int64

Frecuencia Relativa:
Predicted
0    93.240323
1     6.759677
Name: count, dtype: float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165644 entries, 0 to 165643
Data columns (total 2 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   numero_de_cliente  165644 non-null  int64
 1   Predicted          165644 non-null  int64
dtypes: int64(2)
memory usage: 2.5 MB


Guardando semillerio opt datos x

In [None]:
ft = "%dT-%m-%Y%H-%M-%S"
t_now = datetime.datetime.now().strftime(ft)

pred_name = f"pred_sem_03_tr{meses_train}_sem{s_total}_"+t_now+".csv"

proba_file = pred_path + "probas/" + pred_name
pred_file = pred_path + pred_name

# Guardamos las probas
df_sem_proba.to_csv(proba_file, index=False)
print(f"Probas guardadas en {proba_file}")

# Guardar el DataFrame en un archivo CSV
submission.to_csv(pred_file, index=False)
print(f"Predicciones guardadas en {pred_file}")


Probas guardadas en /home/santtedo/buckets/b1/predicciones/probas/pred_sem_03_tr19_sem116_03T-12-202420-37-40.csv
Predicciones guardadas en /home/santtedo/buckets/b1/predicciones/pred_sem_03_tr19_sem116_03T-12-202420-37-40.csv
