In [1]:
import pickle

import numpy as np
import pandas as pd
import random
from tqdm import tqdm

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split


from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import roc_auc_score, accuracy_score
from matplotlib import pyplot as plt
import seaborn as sns # for correlation heatmap
from sklearn.preprocessing import OrdinalEncoder

In [32]:
def load_pickle(filename):
    with open(filename,'rb') as file:
        object_file = pickle.load(file)
    return object_file

def dump_pickle(file, filename):
    with open(filename, "wb") as output_file:
        pickle.dump(file, output_file)


In [2]:
df = pd.read_parquet('data/application_train.parquet')
agg_ccb = pd.read_pickle('data/ccb_agg_features.pkl')
agg_ip = pd.read_pickle('data/ip_agg_features.pkl')

In [3]:
df = df.merge(agg_ccb, on = 'SK_ID_CURR', how='left')
df = df.merge(agg_ip, on = 'SK_ID_CURR', how='left')

del agg_ccb, agg_ip

In [4]:
df.shape

(307511, 263)

In [5]:
target_col_name = 'TARGET'
id_col_name = 'SK_ID_CURR'

y = df.pop(target_col_name)
ids = df.pop(id_col_name)

categorical_features = df.select_dtypes('object').columns.to_list()
num_features = df.select_dtypes('number').columns.to_list()

In [6]:

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

num_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=np.nan))
    ]
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [7]:
X = preprocess.fit_transform(df)

In [8]:
X.shape

(307511, 391)

In [9]:
X = pd.DataFrame(X)

In [10]:
prepro_columns  = num_features + list(preprocess.transformers_[1][1][1].get_feature_names_out(categorical_features)) 

In [11]:
prepro_columns =  [x.replace(" ", "_") for x in prepro_columns]

In [12]:
X.columns = prepro_columns

In [13]:
X.shape

(307511, 391)

In [14]:
X.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone,_brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_None,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_None
0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461.0,-637.0,-3648.0,-2120.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765.0,-1188.0,-1186.0,-291.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046.0,-225.0,-4260.0,-2531.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005.0,-3039.0,-9833.0,-2437.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932.0,-3038.0,-4311.0,-3458.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


### Train, Validation and Test Split

In [15]:
ids_train, ids_val_test, X_train, X_val_test, y_train, y_val_test = train_test_split(ids, X, y, test_size=0.33, random_state=42)
ids_val, ids_test, X_val, X_test, y_val, y_test = train_test_split(ids_val_test, X_val_test, y_val_test, test_size=0.5, random_state=42)
print(f'Train Set n rows: {X_train.shape[0]}')
print(f'Validation Set n rows: {X_val.shape[0]}')
print(f'Test Set n rows: {X_test.shape[0]}')

Train Set n rows: 206032
Validation Set n rows: 50739
Test Set n rows: 50740


In [52]:
y_train.mean() * 206032

16708.0

In [50]:
y_val.mean()

0.0801750133033761

In [51]:
y_test.mean()

0.0797989751675207

In [16]:
del ids_val_test, X_val_test, y_val_test, df, y, ids

### Feature Selection by Best Than Random Featuare approach

In [17]:
canaritos_pct =  0.05

In [18]:
def agregar_canaritos( pdataset, pcanaritos_idx ):
    nrows, ncols= pdataset.shape
    
    
    canaritos_cantidad = int(ncols * pcanaritos_idx)  
    vcanaritos =  ['canarito_' + sub for sub in map(str, list(range(canaritos_cantidad)))]

  #uso esta semilla para los canaritos
    np.random.seed(10217)

  #genero dataset con canaritos haber hecho un loop for
    df_canaritos = pd.DataFrame(np.random.randn(nrows, canaritos_cantidad, ), columns=vcanaritos)
    df_canaritos = df_canaritos.set_index(pdataset.index)

    #ahora hago que los canaritos sean las primeras variables del dataset
    
    result = pd.concat([pdataset, df_canaritos], axis=1)
    
    return result

In [19]:
X_train_con_canaritos = agregar_canaritos(X_train, canaritos_pct)
X_val_con_canaritos = agregar_canaritos(X_val, canaritos_pct)

In [27]:
feature_importance = pd.Series(X_train_con_canaritos.columns).to_frame(name='feature' )

params = { 'max_depth': [3, 5, 6, 10, 15, 20],
           'learning_rate': [0.01, 0.1, 0.2, 0.3],
           'subsample': np.arange(0.5, 1.0, 0.1),
           'colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'n_estimators': [100, 500, 1000],
            "reg_alpha"   : [0.5,0.2,1],
            "reg_lambda"  : [2,3,5],
            "gamma"       : [1,2,3]}

metrics = []

In [28]:
n_iter = 10
random.seed(42)
for i in tqdm(range(n_iter)):
    parms_iter = {k:random.choice(v) for k,v in params.items()}
    model = XGBClassifier(seed=10,
                          verbosity =0, 
                          objective='binary:logistic',
                          use_label_encoder=False,
                          **parms_iter)
    model.fit(X=X_train.values, y=y_train.values)
    # make predictions for test data
    y_pred = model.predict_proba(X=X_val)[:,1]
    # evaluate predictions
    accuracy = accuracy_score(y_val,np.where(y_pred >0.5, 1,0))
    roc_auc = roc_auc_score(y_val, y_pred)
    
  
    
    #feature importance data frame
    
    feature_importance = pd.concat([feature_importance,  
                                    pd.Series(model.feature_importances_).to_frame(name=f'importance_{i}' )],
                                   axis=1)
    feature_importance[f'ranking_{i}'] = feature_importance[f'importance_{i}'].rank(method='first', ascending = False)
    
    best_canarito = feature_importance.loc[feature_importance.feature.str.startswith('canarito')][f'ranking_{i}'].min() 
    
    feature_importance[f'best_than_canarito_{i}'] = np.where(feature_importance[f'ranking_{i}'] < best_canarito, True, False) 
    
    
    metrics_iter = { 'iter': i,
                    'accuracy':accuracy,
                    'roc_auc':roc_auc,
                    'params': parms_iter,
                    'best_canarito' : best_canarito
                   }
    metrics.append(metrics_iter)

100%|██████████| 10/10 [1:06:51<00:00, 401.14s/it]


In [29]:
metrics = pd.DataFrame( metrics)

In [30]:
metrics

Unnamed: 0,iter,accuracy,roc_auc,params,best_canarito
0,0,0.920022,0.742459,"{'max_depth': 20, 'learning_rate': 0.01, 'subs...",1.0
1,1,0.919825,0.730708,"{'max_depth': 3, 'learning_rate': 0.01, 'subsa...",1.0
2,2,0.92018,0.770892,"{'max_depth': 15, 'learning_rate': 0.01, 'subs...",1.0
3,3,0.919904,0.762332,"{'max_depth': 5, 'learning_rate': 0.3, 'subsam...",4.0
4,4,0.919825,0.764763,"{'max_depth': 6, 'learning_rate': 0.1, 'subsam...",1.0
5,5,0.91545,0.725034,"{'max_depth': 6, 'learning_rate': 0.2, 'subsam...",6.0
6,6,0.918997,0.742473,"{'max_depth': 15, 'learning_rate': 0.2, 'subsa...",2.0
7,7,0.919588,0.75213,"{'max_depth': 20, 'learning_rate': 0.1, 'subsa...",2.0
8,8,0.917756,0.736873,"{'max_depth': 20, 'learning_rate': 0.2, 'subsa...",1.0
9,9,0.920259,0.769202,"{'max_depth': 20, 'learning_rate': 0.01, 'subs...",2.0


In [31]:
best_params = metrics.loc[metrics.roc_auc == metrics.roc_auc.max(), 'params' ].values[0]

In [35]:
dump_pickle(best_params, 'data/best_first_params.pkl')

In [36]:
feature_importance.loc[(feature_importance.best_than_canarito_0) &
                      (feature_importance.best_than_canarito_1) &
                      (feature_importance.best_than_canarito_2) &
                      (feature_importance.best_than_canarito_3) &
                      (feature_importance.best_than_canarito_4) &
                      (feature_importance.best_than_canarito_5) &
                      (feature_importance.best_than_canarito_6) &
                      (feature_importance.best_than_canarito_7) &
                      (feature_importance.best_than_canarito_8) &
                      (feature_importance.best_than_canarito_9)]

Unnamed: 0,feature,importance_0,ranking_0,best_than_canarito_0,importance_1,ranking_1,best_than_canarito_1,importance_2,ranking_2,best_than_canarito_2,...,best_than_canarito_6,importance_7,ranking_7,best_than_canarito_7,importance_8,ranking_8,best_than_canarito_8,importance_9,ranking_9,best_than_canarito_9


In [37]:
### Realizo el promedio

In [38]:
filter_col = [col for col in feature_importance if col.startswith('importance_')]

In [39]:
feature_importance['mean_importance'] = feature_importance[filter_col].mean(axis=1)

In [40]:
feature_importance['ranking_mean_importance'] = feature_importance['mean_importance'].rank(method='first', ascending = False)

In [41]:
best_mean_canarito = feature_importance.loc[feature_importance.feature.str.startswith('canarito')]['ranking_mean_importance'].min() 

In [42]:
feature_importance['best_than_canarito_mean'] = np.where(feature_importance['ranking_mean_importance'] < best_mean_canarito, True, False) 


In [43]:
feature_importance.head()

Unnamed: 0,feature,importance_0,ranking_0,best_than_canarito_0,importance_1,ranking_1,best_than_canarito_1,importance_2,ranking_2,best_than_canarito_2,...,best_than_canarito_7,importance_8,ranking_8,best_than_canarito_8,importance_9,ranking_9,best_than_canarito_9,mean_importance,ranking_mean_importance,best_than_canarito_mean
0,canarito_0,0.002623,215.0,False,0.0,73.0,False,0.00254,230.0,False,...,False,0.002866,216.0,False,0.002528,262.0,False,0.0024,198.0,False
1,canarito_1,0.002636,210.0,False,0.0,74.0,False,0.002628,176.0,False,...,False,0.003027,144.0,False,0.002632,203.0,False,0.002474,165.0,False
2,canarito_2,0.003067,76.0,False,0.0,75.0,False,0.003299,34.0,False,...,False,0.003281,56.0,False,0.003028,46.0,False,0.003142,68.0,False
3,canarito_3,0.003142,58.0,False,0.0,76.0,False,0.003129,41.0,False,...,False,0.003321,49.0,False,0.002994,54.0,False,0.003395,57.0,False
4,canarito_4,0.003503,27.0,False,0.029786,6.0,False,0.003747,18.0,False,...,False,0.0037,20.0,False,0.003319,22.0,False,0.006945,12.0,False


In [44]:
feature_selected =feature_importance.loc[feature_importance['best_than_canarito_mean']]['feature'].to_list()

In [45]:
feature_selected

[]

In [48]:
feature_importance.head(50)

Unnamed: 0,feature,importance_0,ranking_0,best_than_canarito_0,importance_1,ranking_1,best_than_canarito_1,importance_2,ranking_2,best_than_canarito_2,...,best_than_canarito_7,importance_8,ranking_8,best_than_canarito_8,importance_9,ranking_9,best_than_canarito_9,mean_importance,ranking_mean_importance,best_than_canarito_mean
0,canarito_0,0.002623,215.0,False,0.0,73.0,False,0.00254,230.0,False,...,False,0.002866,216.0,False,0.002528,262.0,False,0.0024,198.0,False
1,canarito_1,0.002636,210.0,False,0.0,74.0,False,0.002628,176.0,False,...,False,0.003027,144.0,False,0.002632,203.0,False,0.002474,165.0,False
2,canarito_2,0.003067,76.0,False,0.0,75.0,False,0.003299,34.0,False,...,False,0.003281,56.0,False,0.003028,46.0,False,0.003142,68.0,False
3,canarito_3,0.003142,58.0,False,0.0,76.0,False,0.003129,41.0,False,...,False,0.003321,49.0,False,0.002994,54.0,False,0.003395,57.0,False
4,canarito_4,0.003503,27.0,False,0.029786,6.0,False,0.003747,18.0,False,...,False,0.0037,20.0,False,0.003319,22.0,False,0.006945,12.0,False
5,canarito_5,0.002784,152.0,False,0.0,77.0,False,0.002675,153.0,False,...,False,0.003045,130.0,False,0.002644,189.0,False,0.002436,179.0,False
6,canarito_6,0.003767,21.0,False,0.027522,9.0,False,0.0035,26.0,False,...,False,0.003557,25.0,False,0.003185,30.0,False,0.006547,13.0,False
7,canarito_7,0.003132,60.0,False,0.016809,24.0,False,0.003354,31.0,False,...,False,0.003249,61.0,False,0.003039,43.0,False,0.005023,23.0,False
8,canarito_8,0.002839,136.0,False,0.0,78.0,False,0.002734,122.0,False,...,False,0.003078,118.0,False,0.002718,146.0,False,0.002551,139.0,False
9,canarito_9,0.002917,115.0,False,0.0,79.0,False,0.002842,85.0,False,...,False,0.003136,98.0,False,0.002755,126.0,False,0.002783,99.0,False
