# Model Overview

In [1]:
#Base
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#Model Settings
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import early_stopping
from lightgbm import log_evaluation

#Model Selection
from sklearn.ensemble import RandomForestClassifier
import optuna.integration.lightgbm as lgbm
import lightgbm as lgb
from xgboost import XGBClassifier #Please enable GPU Acceleration

#Hyperparameter Tuning
import optuna
import optuna.visualization as optvis

#Misc
import gc #Trash Collection
import re #Regular Expression
import joblib #Export Pre-Processor/Model 

## Load Training Data

In [2]:
#Train
train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')

print(train.shape)
train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


## Preprocessing 

In [3]:
###############################################
# One-Hot Encoding
###############################################   
def onehot_encoder(df):
    cat_columns = [col for col in df.columns if df[col].dtype == 'object']
    num_columns = [col for col in df.columns if df[col].dtype != 'object']
    df = pd.get_dummies(df)    
    enc_columns = [c for c in df.columns if c not in num_columns]

    return df, enc_columns


In [4]:
def process_application(data_path):
    print('Processing application data.')
    
    train = pd.read_csv(f'{data_path}/application_train.csv')
    
    ###################################
    # One-Hot Encoding *Train/Test*
    ###################################
    
    train = pd.get_dummies(train)
    
    ###################################
    # Fill Missing Values for Numerical Columns w/ mean
    ###################################
    
    num_columns = [col for col in train.columns[2:] if train[col].dtype != 'object']
    for col in num_columns:
        if train[col].isna().values.sum() > 0:
            train[col].fillna(train[col].mean(), inplace=True)

    ###################################
    # Fill Missing Values for Categorical Columns w/ mode
    ###################################
    
    train = train.fillna(train.mode().iloc[0])
    
    #####################################################
    # Format Non-Regular Column Names with Underscores
    #####################################################
    train = train.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '_', x))
    
    ###################################
    # End - process_application Function
    ###################################
    
    print("")
    print('Data Preprocessed')
    print(train.shape)
    
    return train

#Calling Function -------------------------

train = process_application('../input/home-credit-default-risk')

Processing application data.

Data Preprocessed
(307511, 246)


In [5]:
#Defining X_train and y_train
X_train = train.drop(columns=['TARGET','SK_ID_CURR'])
y_train = train.TARGET

print(X_train.shape)
print(y_train.shape)

(307511, 244)
(307511,)


In [6]:
#Split Train into train/validation sets
X_sample, X_valid, y_sample, y_valid = train_test_split(X_train, y_train, test_size=0.8, stratify=y_train, random_state=1)

print(X_sample.shape)
print(X_valid.shape)

(61502, 244)
(246009, 244)


In [7]:
#Creating indexes from train/test
train_idx, valid_idx = train_test_split(range(len(train)), test_size=0.8, random_state=1, stratify=train.TARGET)
print(len(train_idx))
print(len(valid_idx))

61502
246009


## Model Selection 

| Model | Score | Time | Notes|
| ---   | ---   | ---  | ---  |
|Random Forest| 0.73914 | 2 min 6 sec | No Tuning/Base|
|LightGBM | 0.74388 | 10.5 sec | No Tuning/Base|
|XGBoost | 0.71189 | 1min 33 sec | No Tuning/Base|
|Boosted Tree| 0.74897| 6min 8 sec | No Tuning/Base| 

In [8]:
%%time

#Random Forest

def rf_objective(trial):
    
    n  = trial.suggest_int('n_estimators', 20, 200)
    md = trial.suggest_int('max_depth', 2, 256)
    msl = trial.suggest_int('min_samples_leaf', 1, 64)
    ms = trial.suggest_float('max_samples', 0.5, 1.0)
    cr = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    
    clf =  RandomForestClassifier(
        n_estimators=n, max_depth=md, min_samples_leaf=msl, max_samples=ms
    )
    
    cv_scores = cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=[(train_idx, valid_idx)], scoring ='roc_auc')
    score = cv_scores.mean()
    return score
    
rf_study = optuna.create_study(direction='maximize')
rf_study.optimize(rf_objective, n_trials=5)

[32m[I 2022-12-08 02:53:47,395][0m A new study created in memory with name: no-name-17733580-1908-4f14-ade8-cfdb709c25bf[0m
[32m[I 2022-12-08 02:53:56,311][0m Trial 0 finished with value: 0.724201066682605 and parameters: {'n_estimators': 22, 'max_depth': 163, 'min_samples_leaf': 34, 'max_samples': 0.9969299005834324, 'criterion': 'gini'}. Best is trial 0 with value: 0.724201066682605.[0m
[32m[I 2022-12-08 02:54:19,225][0m Trial 1 finished with value: 0.7301676106231899 and parameters: {'n_estimators': 88, 'max_depth': 134, 'min_samples_leaf': 10, 'max_samples': 0.6765043014260719, 'criterion': 'gini'}. Best is trial 1 with value: 0.7301676106231899.[0m
[32m[I 2022-12-08 02:54:31,108][0m Trial 2 finished with value: 0.7327637837377105 and parameters: {'n_estimators': 41, 'max_depth': 215, 'min_samples_leaf': 28, 'max_samples': 0.9675956747691709, 'criterion': 'entropy'}. Best is trial 2 with value: 0.7327637837377105.[0m
[32m[I 2022-12-08 02:54:39,457][0m Trial 3 finished

CPU times: user 3.24 s, sys: 686 ms, total: 3.93 s
Wall time: 1min 21s


In [9]:
%%time

#LightGBM

dtrain = lgb.Dataset(X_sample, label=y_sample)
dval = lgb.Dataset(X_valid, label=y_valid)

params = {
    'objective': 'binary',
    'metric': 'auc', 
    'verbosity' : 0,
    'boosting_type': 'gbdt',
    'force_col_wise':'true'
}

gbm = lgb.train (
    params,
    train_set=dtrain, 
    valid_sets=[dtrain, dval],
    callbacks=[early_stopping(100), log_evaluation(100)]
)

prediction = np.rint(gbm.predict(X_valid, num_iteration=gbm.best_iteration))
roc_score = roc_auc_score(y_valid, prediction)
    
best_params = gbm.params
print('Best params:', best_params)
print('roc_score = {}'.format(roc_score))

print(' Params: ')
for key, value in best_params.items():
      print('    {}:  {}'.format(key, value))

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.878679	valid_1's auc: 0.743884
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.878679	valid_1's auc: 0.743884
Best params: {'objective': 'binary', 'metric': 'auc', 'verbosity': 0, 'boosting_type': 'gbdt', 'force_col_wise': 'true', 'num_iterations': 100, 'early_stopping_round': None}
roc_score = 0.5077347429379067
 Params: 
    objective:  binary
    metric:  auc
    verbosity:  0
    boosting_type:  gbdt
    force_col_wise:  true
    num_iterations:  100
    early_stopping_round:  None
CPU times: user 18.5 s, sys: 601 ms, total: 19.1 s
Wall time: 10.7 s


In [10]:
%%time

#XGBoost - GPU Acceleration must be turned on from Settings for results to show.

def gb_objective(trial):
    n  = trial.suggest_int('n_estimators', 150, 300, log=False)
    lr = trial.suggest_float('learning_rate', 0, 0.7, log=False)
    ga = trial.suggest_float('gamma', 0, 5, log=False)
    ss = trial.suggest_float('subsample', 0.6, 1, log=False)
    md = trial.suggest_int('max_depth', 10, 24, log=False)
    
    clf =  XGBClassifier(random_state=1, max_depth=md, n_estimators=n, booster='gbtree',
                         learning_rate=lr, gamma=ga, subsample=ss, tree_method='gpu_hist')
    
    scores = cross_val_score(clf, X_train, y_train, n_jobs=1, cv=[(train_idx, valid_idx)], scoring='roc_auc')
    return scores.mean()
    
gb_study = optuna.create_study(direction='maximize')
gb_study.optimize(gb_objective, n_trials=5)

print(gb_study.best_value)
print(gb_study.best_params)

[32m[I 2022-12-08 02:55:20,155][0m A new study created in memory with name: no-name-cbd4a02b-5eaa-4bbd-83e3-9847362cf34b[0m
[32m[I 2022-12-08 02:55:34,621][0m Trial 0 finished with value: 0.6847753958539673 and parameters: {'n_estimators': 171, 'learning_rate': 0.5445915968936368, 'gamma': 4.620376119241899, 'subsample': 0.7893157396921522, 'max_depth': 22}. Best is trial 0 with value: 0.6847753958539673.[0m
[32m[I 2022-12-08 02:55:41,363][0m Trial 1 finished with value: 0.679963523478316 and parameters: {'n_estimators': 295, 'learning_rate': 0.6649129870387618, 'gamma': 3.65811061423004, 'subsample': 0.9412461873933499, 'max_depth': 19}. Best is trial 0 with value: 0.6847753958539673.[0m
[32m[I 2022-12-08 02:55:47,162][0m Trial 2 finished with value: 0.6986509041528499 and parameters: {'n_estimators': 254, 'learning_rate': 0.4361799079110012, 'gamma': 3.518761223628136, 'subsample': 0.8226005862448154, 'max_depth': 17}. Best is trial 2 with value: 0.6986509041528499.[0m
[

0.7299487168039456
{'n_estimators': 151, 'learning_rate': 0.18475046712441306, 'gamma': 3.6823979529432354, 'subsample': 0.8311095977085419, 'max_depth': 12}
CPU times: user 1min 23s, sys: 2.1 s, total: 1min 25s
Wall time: 1min 20s


In [11]:
%%time

#Boosted-Tree

def lgbm_objective(trial):
    n  = trial.suggest_int('n_estimators', 20, 150)
    md = trial.suggest_int('max_depth', 2, 40)
    lr = trial.suggest_float('learning_rate', 0, 1, log=False)
    ss = trial.suggest_float('subsample', 0.6, 1, log=False)
    bt = trial.suggest_categorical('boosting_type', ['gbdt', 'dart'])
    
    clf =  lgb.LGBMClassifier(n_estimators=n, max_depth=md, learning_rate=lr, 
                              boosting_type=bt,subsample=ss, random_state=1)
    
    scores = cross_val_score(clf, X_train, y_train, n_jobs=1, cv=5, scoring='roc_auc')
    return scores.mean()
    
lgbm_study = optuna.create_study(direction='maximize')
lgbm_study.optimize(lgbm_objective, n_trials=5)

print(lgbm_study.best_value)
print(lgbm_study.best_params)

[32m[I 2022-12-08 02:56:40,451][0m A new study created in memory with name: no-name-fab77b10-f8c8-4cf5-8683-4dd4a4979427[0m
[32m[I 2022-12-08 02:57:26,711][0m Trial 0 finished with value: 0.7343884390760872 and parameters: {'n_estimators': 62, 'max_depth': 13, 'learning_rate': 0.5240668125305448, 'subsample': 0.7281498271293056, 'boosting_type': 'gbdt'}. Best is trial 0 with value: 0.7343884390760872.[0m
[32m[I 2022-12-08 03:00:04,049][0m Trial 1 finished with value: 0.7468666567054124 and parameters: {'n_estimators': 136, 'max_depth': 39, 'learning_rate': 0.08473951710506933, 'subsample': 0.9705552987722222, 'boosting_type': 'dart'}. Best is trial 1 with value: 0.7468666567054124.[0m
[32m[I 2022-12-08 03:01:02,637][0m Trial 2 finished with value: 0.7454126148540161 and parameters: {'n_estimators': 50, 'max_depth': 17, 'learning_rate': 0.12823182009064038, 'subsample': 0.7350541165169018, 'boosting_type': 'dart'}. Best is trial 1 with value: 0.7468666567054124.[0m
[32m[I 2

0.7486646427443687
{'n_estimators': 53, 'max_depth': 4, 'learning_rate': 0.27710440685323245, 'subsample': 0.9390837837986028, 'boosting_type': 'dart'}
CPU times: user 13min 25s, sys: 10.7 s, total: 13min 36s
Wall time: 7min 34s
