In [1]:
import pickle

import numpy as np
import pandas as pd
import random
from tqdm import tqdm

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split


from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score


In [2]:
## helpers to load and dump pickles
def load_pickle(filename):
    """Read the pickle stored at ``filename`` and return the deserialized object.

    NOTE(review): unpickling can execute arbitrary code, so this should only be
    pointed at files produced by a trusted source.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

def dump_pickle(file, filename):
    """Serialize the object ``file`` with pickle into the path ``filename``.

    The destination is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(filename, "wb") as sink:
        pickle.dump(file, sink)


In [3]:
# Folder prefix (with trailing slash) that is string-concatenated with file names
# when reading inputs and writing artifacts.
data_dir = 'data/'
# Main table, read from parquet.
df = pd.read_parquet(data_dir + 'application_train.parquet')
# Two pickled feature tables that are later left-joined onto df by SK_ID_CURR.
# NOTE(review): the file names suggest pre-aggregated credit-card-balance and
# installments-payments features — confirm against the notebook that writes them.
agg_ccb = pd.read_pickle(data_dir + 'ccb_agg_features.pkl')
agg_ip = pd.read_pickle(data_dir + 'ip_agg_features.pkl')

In [4]:
# Left joins on SK_ID_CURR: rows of df without a match get NaN in the joined columns.
# NOTE(review): assumes each aggregate table has at most one row per SK_ID_CURR;
# otherwise the join would duplicate df rows — TODO confirm.
df = df.merge(agg_ccb, on = 'SK_ID_CURR', how='left')
df = df.merge(agg_ip, on = 'SK_ID_CURR', how='left')

# Drop the names of the aggregate tables now that their columns live in df.
del agg_ccb, agg_ip

In [6]:
target_col_name = 'TARGET'
id_col_name = 'SK_ID_CURR'

# pop() removes the column from df in place, so after these two lines df holds
# only feature columns. This also makes the cell non-idempotent: running it a
# second time raises KeyError because the columns are already gone.
y = df.pop(target_col_name)
ids = df.pop(id_col_name)

# Column lists used to route features through the preprocessing transformers.
# Columns that are neither object nor numeric dtype end up in neither list.
categorical_features = df.select_dtypes('object').columns.to_list()
num_features = df.select_dtypes('number').columns.to_list()

In [7]:
## Preprocessing pipeline definition

# Categorical branch: fill missing values with the literal "missing", then
# one-hot encode (categories not seen during fit are ignored at transform time).
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: the constant fill value is NaN, so missing values stay NaN.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=np.nan)),
])

# Output column order is: numeric features first, then the one-hot columns.
preprocess = ColumnTransformer([
    ("num", num_transformer, num_features),
    ("cat", categorical_transformer, categorical_features),
])

In [8]:
# Run the preprocessing
X = preprocess.fit_transform(df)
# ColumnTransformer can return a scipy sparse matrix when the stacked output is
# sparse enough (see its `sparse_threshold`); densify so the DataFrame below is
# built from a plain 2-D array either way.
if hasattr(X, "toarray"):
    X = X.toarray()

# Fetch the fitted encoder by name instead of by position, so the lookup keeps
# working if transformers or pipeline steps are added or reordered.
onehot_encoder = preprocess.named_transformers_["cat"].named_steps["onehot"]

# Same column order as the ColumnTransformer output: numeric, then one-hot.
prepro_columns = num_features + list(onehot_encoder.get_feature_names_out(categorical_features))
prepro_columns = [x.replace(" ", "_") for x in prepro_columns]

# Reuse df's index so X stays label-aligned with y and ids (both popped from df).
X = pd.DataFrame(X, columns=prepro_columns, index=df.index)

In [15]:
# Preview the first rows of the preprocessed feature matrix.
X.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone,_brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_None,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_None
0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461.0,-637.0,-3648.0,-2120.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765.0,-1188.0,-1186.0,-291.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046.0,-225.0,-4260.0,-2531.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005.0,-3039.0,-9833.0,-2437.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932.0,-3038.0,-4311.0,-3458.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


### Train, Validation and Test Split

In [16]:
# Two-stage split with a fixed random_state: first 67% train / 33% hold-out,
# then the hold-out is halved into validation and test.
# ids, X and y are split together so the three stay row-aligned.
# NOTE(review): no `stratify=` is passed, so the TARGET rate may differ between
# the three sets — confirm this is intended.
ids_train, ids_val_test, X_train, X_val_test, y_train, y_val_test = train_test_split(ids, X, y, test_size=0.33, random_state=42)
ids_val, ids_test, X_val, X_test, y_val, y_test = train_test_split(ids_val_test, X_val_test, y_val_test, test_size=0.5, random_state=42)
print(f'Train Set n rows: {X_train.shape[0]}')
print(f'Validation Set n rows: {X_val.shape[0]}')
print(f'Test Set n rows: {X_test.shape[0]}')

Train Set n rows: 206032
Validation Set n rows: 50739
Test Set n rows: 50740


In [20]:
# Drop the intermediate hold-out pieces and the full-dataset objects; this cell
# fails with NameError if it is run a second time without re-running the cells above.
del ids_val_test, X_val_test, y_val_test, df, y, ids

### Feature Selection by the Better-Than-Random-Feature approach

In [21]:
# Fraction of the dataset's column count that determines how many random
# "canarito" columns are added (passed to agregar_canaritos).
canaritos_pct =  0.05

In [22]:
# Build a copy of a dataset extended with random ("canarito") columns
def agregar_canaritos(pdataset, pcanaritos_idx, seed=10217):
    """Return ``pdataset`` with extra standard-normal noise columns appended.

    Parameters
    ----------
    pdataset : pandas.DataFrame
        Dataset to extend; it is not modified.
    pcanaritos_idx : float
        Fraction of ``pdataset``'s column count used as the number of noise
        columns, truncated with ``int()`` (so it can be 0 for narrow frames).
    seed : int, default 10217
        Seed of the private random generator. The default reproduces the values
        obtained by seeding the global NumPy generator with 10217.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by ``canarito_0 .. canarito_{k-1}``,
        sharing ``pdataset``'s index.

    Notes
    -----
    A local ``RandomState`` is used so calling this function does not reset the
    global NumPy random state. Calls with the same seed and the same number of
    noise columns draw from the same stream, so a shorter dataset receives the
    same values as the first rows of a longer one.
    """
    nrows, ncols = pdataset.shape
    canaritos_cantidad = int(ncols * pcanaritos_idx)
    vcanaritos = [f'canarito_{k}' for k in range(canaritos_cantidad)]

    rng = np.random.RandomState(seed)

    # Build the noise frame directly on the dataset's index so that the
    # column-wise concat aligns row by row.
    df_canaritos = pd.DataFrame(
        rng.randn(nrows, canaritos_cantidad),
        columns=vcanaritos,
        index=pdataset.index,
    )

    return pd.concat([pdataset, df_canaritos], axis=1)

In [23]:
# Append the random columns to train and validation; both calls use the same
# fraction, so the two frames end up with the same set of column names.
X_train_con_canaritos = agregar_canaritos(X_train, canaritos_pct)
X_val_con_canaritos = agregar_canaritos(X_val, canaritos_pct)

In [24]:
# One row per column of the canarito-augmented training set; the search loop
# appends per-iteration importance / ranking columns to this frame.
feature_importance = pd.Series(X_train_con_canaritos.columns).to_frame(name='feature' )

# Random-search space: each iteration picks one value per key with random.choice.
params = { 'max_depth': [3, 5, 6, 10, 15, 20],
           'learning_rate': [0.01, 0.1, 0.2, 0.3],
           'subsample': np.arange(0.5, 1.0, 0.1),
           'colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'n_estimators': [100, 500, 1000],
            "reg_alpha"   : [0.5,0.2,1],
            "reg_lambda"  : [2,3,5],
            "gamma"       : [1,2,3]}

# Accumulator for the per-iteration result dicts appended by the search loop.
metrics = []

In [25]:
### Run several iterations to collect different feature importances

n_iter = 10
random.seed(42)
for i in tqdm(range(n_iter)):
    # One random draw per hyper-parameter, following the key order of `params`.
    parms_iter = {k: random.choice(v) for k, v in params.items()}
    model = XGBClassifier(
        seed=10,
        verbosity=0,
        objective='binary:logistic',
        use_label_encoder=False,
        **parms_iter,
    )
    model.fit(X=X_train_con_canaritos, y=y_train)

    # Positive-class probability on validation; accuracy uses a 0.5 cut-off.
    y_pred = model.predict_proba(X=X_val_con_canaritos)[:, 1]

    accuracy = accuracy_score(y_val, np.where(y_pred > 0.5, 1, 0))
    roc_auc = roc_auc_score(y_val, y_pred)

    importance_col = f'importance_{i}'
    ranking_col = f'ranking_{i}'

    # Attach this iteration's importances positionally (both frames use a default
    # integer index) and rank them, 1 = most important, ties broken by order.
    feature_importance = pd.concat(
        [feature_importance, pd.DataFrame({importance_col: model.feature_importances_})],
        axis=1,
    )
    feature_importance[ranking_col] = feature_importance[importance_col].rank(method='first', ascending=False)

    # Best (lowest) rank reached by any of the random columns.
    is_canarito = feature_importance.feature.str.startswith('canarito')
    best_canarito = feature_importance.loc[is_canarito, ranking_col].min()

    # Flag the features ranked strictly above every random column.
    feature_importance[f'best_than_canarito_{i}'] = feature_importance[ranking_col] < best_canarito

    metrics_iter = {
        'iter': i,
        'accuracy': accuracy,
        'roc_auc': roc_auc,
        'params': parms_iter,
        'best_canarito': best_canarito,
    }
    metrics.append(metrics_iter)

100%|██████████| 10/10 [2:08:55<00:00, 773.57s/it] 


In [None]:
# Rebind `metrics` from the list of per-iteration dicts to a DataFrame with one
# row per iteration. After this cell the name no longer refers to a list.
metrics = pd.DataFrame( metrics)


In [30]:
# Hyper-parameters of the iteration with the highest validation ROC AUC
# (the first one when several iterations tie).
best_params = metrics.loc[metrics.roc_auc == metrics.roc_auc.max(), 'params' ].values[0]
# Written through data_dir, like every other artifact of this notebook, so that
# changing the data folder moves this file too.
dump_pickle(best_params, data_dir + 'best_first_params.pkl')

#### Realizo el promedio de las 10 iteraciones

In [34]:
# Average the per-iteration importances and repeat the canarito comparison on the mean.
filter_col = [col for col in feature_importance.columns if col.startswith('importance_')]
feature_importance['mean_importance'] = feature_importance[filter_col].mean(axis=1)
feature_importance['ranking_mean_importance'] = feature_importance['mean_importance'].rank(method='first', ascending=False)

# Best (lowest) mean-importance rank reached by any random column.
best_mean_canarito = feature_importance.loc[
    feature_importance.feature.str.startswith('canarito'), 'ranking_mean_importance'
].min()

# True only for features ranked strictly above every random column.
feature_importance['best_than_canarito_mean'] = feature_importance['ranking_mean_importance'] < best_mean_canarito


In [40]:
### Keep the features whose mean importance ranked above the best random variable
feature_selected = feature_importance.loc[feature_importance['best_than_canarito_mean'], 'feature'].to_list()

In [47]:
# Persist targets, ids and the reduced feature matrices of each split under data_dir.
y_train.to_pickle(data_dir + 'y_train.pkl')
y_val.to_pickle(data_dir +  'y_val.pkl')
y_test.to_pickle(data_dir + 'y_test.pkl')

ids_train.to_pickle(data_dir + 'ids_train.pkl')
ids_val.to_pickle(data_dir + 'ids_val.pkl')
ids_test.to_pickle(data_dir + 'ids_test.pkl')

# Only the selected columns are saved. feature_selected cannot contain a
# canarito column (selection requires a rank strictly better than the best
# canarito), so indexing the canarito-free X_* frames is safe.
X_train[feature_selected].to_pickle(data_dir + 'X_train.pkl')
X_val[feature_selected].to_pickle(data_dir + 'X_val.pkl')
X_test[feature_selected].to_pickle(data_dir + 'X_test.pkl')
