In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score


In [None]:
working_dir = "/kaggle/input/home-credit-default-risk/"


def _read_table(file_name):
    """Load one competition CSV located under ``working_dir`` into a DataFrame."""
    return pd.read_csv(working_dir + file_name)


print('Importing data...')
# Main application tables (train carries the TARGET column, test does not).
data = _read_table('application_train.csv')
test = _read_table('application_test.csv')
# Auxiliary history tables that are aggregated per SK_ID_CURR further down.
prev = _read_table('previous_application.csv')
bureau = _read_table('bureau.csv')
bureau_balance = _read_table('bureau_balance.csv')
credit_card = _read_table('credit_card_balance.csv')
POS_CASH = _read_table('POS_CASH_balance.csv')
payments = _read_table('installments_payments.csv')
lgbm_submission = _read_table('sample_submission.csv')

In [None]:
# Split the label off the training frame: `target` keeps the TARGET values,
# `data` keeps only the feature columns.
target = data.pop('TARGET')

# **1. Data Preprocessing**

The provided dataset contains a lot of detail about each customer, and the data are split across multiple tables. We will go through each table separately.

The dataset comes with a relation diagram that shows which tables are related to each other and through which column. For example, the previous_application table is related to POS_CASH_balance via SK_ID_PREV. We will group these tables by their id variables, such as SK_ID_PREV. We have three id variables:  
1. SK_ID_CURR   -> id for current loan; 
2. SK_ID_BUREAU -> ids for the bureau loans
3. SK_ID_PREV   -> id for previous loans

After preprocessing each table separately, we will combine all the data into one final dataframe.

The tables contain categorical data, and there are several options for dealing with it, such as replacing each category with a meaningful value, label encoding, and one-hot encoding. In this dataset the categorical data fall into two groups. The first group contains binary features such as female/male or yes/no, while the second group contains features such as Cash loans/Revolving loans. We will use label encoding for the first group and one-hot encoding for the second group.

The null values will be handled after the tables have been combined.

In [None]:
# Preprocess application_train.csv and application_test.csv
def preprocess_application_files(data, test):
    """One-hot encode the categorical columns of the application tables.

    The train and test frames are stacked (train rows first, original row
    labels kept) so that both receive the same set of dummy columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Training applications; its ``object``-dtype columns decide which
        columns are encoded.
    test : pandas.DataFrame
        Test applications.

    Returns
    -------
    pandas.DataFrame
        The stacked frame with every selected column replaced by dummies.
        The first ``len(data)`` rows belong to ``data``, the rest to ``test``.
    """
    # Only the training frame is inspected to find the categorical columns.
    object_columns = []
    for column in data.columns:
        if data[column].dtype == 'object':
            object_columns.append(column)

    stacked = pd.concat([data, test])
    return pd.get_dummies(stacked, columns=object_columns)

In [None]:
# Preprocess bureau.csv and bureau_balance.csv
def preprocess_bureau_files(bureau, bureau_balance):
    """Aggregate the bureau tables to one row per ``SK_ID_CURR``.

    Steps:
      1. ``bureau_balance`` (one row per bureau loan and month) is summarised
         per ``SK_ID_BUREAU``: how often each ``STATUS`` value occurs, plus the
         row count, maximum and minimum of ``MONTHS_BALANCE``.
      2. That summary is left-joined onto ``bureau``.
      3. ``object`` columns of ``bureau`` are one-hot encoded.
      4. Everything is averaged per ``SK_ID_CURR`` and the number of bureau
         loans per client is added as ``buro_count``.

    Parameters
    ----------
    bureau : pandas.DataFrame
        Needs ``SK_ID_CURR`` and ``SK_ID_BUREAU``. Not modified.
    bureau_balance : pandas.DataFrame
        Needs ``SK_ID_BUREAU``, ``MONTHS_BALANCE`` and ``STATUS``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR``; ``SK_ID_BUREAU`` itself is removed.
    """
    # One row is one month of history, so group the rows by bureau loan id.
    by_loan = bureau_balance.groupby('SK_ID_BUREAU')

    # Status counts per loan. The column names are derived from the statuses
    # actually present (instead of a hard-coded list of eight names), and a
    # status that never occurs for a loan counts as 0 rather than NaN.
    balance_summary = (
        by_loan['STATUS']
        .value_counts(normalize=False)
        .unstack('STATUS', fill_value=0)
        .add_prefix('STATUS_')
        .rename_axis(columns=None)
    )

    months = by_loan['MONTHS_BALANCE'].agg(['size', 'max', 'min'])
    balance_summary['MONTHS_COUNT'] = months['size']
    balance_summary['MONTHS_MAX'] = months['max']
    balance_summary['MONTHS_MIN'] = months['min']

    # Loans without any balance history keep NaN in the summary columns.
    bureau = bureau.join(balance_summary, how='left', on='SK_ID_BUREAU')

    buro_cat_features = [bcol for bcol in bureau.columns if bureau[bcol].dtype == 'object']
    bureau = pd.get_dummies(bureau, columns=buro_cat_features)

    by_client = bureau.groupby('SK_ID_CURR')
    avg_bureau = by_client.mean()
    avg_bureau['buro_count'] = by_client['SK_ID_BUREAU'].count()
    # The mean of an identifier carries no meaning as a feature.
    del avg_bureau['SK_ID_BUREAU']
    return avg_bureau

In [None]:
# Preprocessing previous_application
def preprocess_previous_application(prev):
    """Average the previous applications of every client.

    ``object`` columns are one-hot encoded first, then all columns are
    averaged per ``SK_ID_CURR``. The number of previous applications of each
    client is stored in ``nb_app`` and the averaged ``SK_ID_PREV`` is removed.

    Returns a DataFrame indexed by ``SK_ID_CURR``.
    """
    object_columns = [name for name in prev.columns if prev[name].dtype == 'object']
    encoded = pd.get_dummies(prev, columns=object_columns)

    per_client = encoded.groupby('SK_ID_CURR')
    avg_prev = per_client.mean()
    # Count of previous loans per client.
    avg_prev['nb_app'] = per_client['SK_ID_PREV'].count()
    del avg_prev['SK_ID_PREV']
    return avg_prev


In [None]:
# Preprocess POS_CASH_balance.csv
def preprocess_pos_cash(POS_CASH):
    """Add per-client contract-status features to the POS/cash balance rows.

    ``NAME_CONTRACT_STATUS`` is converted to text and label-encoded (codes
    follow the sorted order of the distinct status strings). For each
    ``SK_ID_CURR`` two values are computed and written to *every* row of that
    client:

    * ``NUNIQUE_STATUS``  - number of distinct status codes of the client
    * ``NUNIQUE_STATUS2`` - largest status code of the client

    The aggregates are broadcast back with ``groupby().transform`` so they are
    matched by client id. Assigning a Series indexed by ``SK_ID_CURR`` directly
    would be aligned against the row labels of the table and put the values on
    unrelated rows.

    Parameters
    ----------
    POS_CASH : pandas.DataFrame
        Needs ``SK_ID_CURR``, ``SK_ID_PREV`` and ``NAME_CONTRACT_STATUS``.
        The frame is not modified, so the call can be repeated safely.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``SK_ID_PREV`` and ``NAME_CONTRACT_STATUS`` and
        with the two feature columns appended.
    """
    # sort=True yields the same codes a label encoder fitted on the strings gives.
    status_codes, _ = pd.factorize(POS_CASH['NAME_CONTRACT_STATUS'].astype(str), sort=True)
    status = pd.Series(status_codes, index=POS_CASH.index)
    status_per_client = status.groupby(POS_CASH['SK_ID_CURR'])

    features = POS_CASH.drop(columns=['SK_ID_PREV', 'NAME_CONTRACT_STATUS'])
    features['NUNIQUE_STATUS'] = status_per_client.transform('nunique')
    features['NUNIQUE_STATUS2'] = status_per_client.transform('max')
    return features

In [None]:
# Preprocess credit_card_balance.csv
def preprocess_credit_card_balance(credit_card):
    """Add per-client contract-status features to the credit-card balance rows.

    ``NAME_CONTRACT_STATUS`` is converted to text and label-encoded (codes
    follow the sorted order of the distinct status strings). For each
    ``SK_ID_CURR`` two values are computed and written to *every* row of that
    client:

    * ``NUNIQUE_STATUS``  - number of distinct status codes of the client
    * ``NUNIQUE_STATUS2`` - largest status code of the client

    The aggregates are broadcast back with ``groupby().transform`` so they are
    matched by client id. Assigning a Series indexed by ``SK_ID_CURR`` directly
    would be aligned against the row labels of the table and put the values on
    unrelated rows.

    Parameters
    ----------
    credit_card : pandas.DataFrame
        Needs ``SK_ID_CURR``, ``SK_ID_PREV`` and ``NAME_CONTRACT_STATUS``.
        The frame is not modified, so the call can be repeated safely.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``SK_ID_PREV`` and ``NAME_CONTRACT_STATUS`` and
        with the two feature columns appended.
    """
    # sort=True yields the same codes a label encoder fitted on the strings gives.
    status_codes, _ = pd.factorize(credit_card['NAME_CONTRACT_STATUS'].astype(str), sort=True)
    status = pd.Series(status_codes, index=credit_card.index)
    status_per_client = status.groupby(credit_card['SK_ID_CURR'])

    features = credit_card.drop(columns=['SK_ID_PREV', 'NAME_CONTRACT_STATUS'])
    features['NUNIQUE_STATUS'] = status_per_client.transform('nunique')
    features['NUNIQUE_STATUS2'] = status_per_client.transform('max')

    return features

In [None]:
# Preprocessing payments
def preprocess_payments(payments):
    """Summarise the instalment payments of every client.

    All columns except the identifiers are aggregated per ``SK_ID_CURR`` with
    mean, max and min. ``SK_ID_PREV`` is left out of all three results: it is
    an identifier, so its mean, maximum or minimum is not a meaningful feature.

    Parameters
    ----------
    payments : pandas.DataFrame
        Needs ``SK_ID_CURR``. Not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(payments_mean, payments_max, payments_min)``, each indexed by
        ``SK_ID_CURR`` and sharing the same column names.
    """
    value_columns = [col for col in payments.columns
                     if col not in ('SK_ID_CURR', 'SK_ID_PREV')]
    # Group once and reuse the grouping for the three aggregations.
    per_client = payments.groupby('SK_ID_CURR')[value_columns]
    return per_client.mean(), per_client.max(), per_client.min()

In [None]:
def combine_all(data, test, bureau, bureau_balance, prev, POS_CASH, credit_card, payments):
    """Left-join every per-client aggregate onto the train and test frames.

    Each auxiliary table is preprocessed once, reduced to one row per
    ``SK_ID_CURR`` and merged onto both ``data`` and ``test`` in the same
    order, so the two frames end up with identical columns.

    Column names that occur in more than one table are not renamed here;
    pandas disambiguates them with its default ``_x`` / ``_y`` merge suffixes,
    which is why the merge order below must stay fixed.

    Parameters
    ----------
    data, test : pandas.DataFrame
        Application frames containing ``SK_ID_CURR``.
    bureau, bureau_balance, prev, POS_CASH, credit_card, payments : pandas.DataFrame
        Raw auxiliary tables, passed to the matching ``preprocess_*`` function.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data, test)`` with the aggregated columns appended.
    """
    bureau = preprocess_bureau_files(bureau, bureau_balance)
    prev = preprocess_previous_application(prev)
    POS_CASH = preprocess_pos_cash(POS_CASH)
    credit_card = preprocess_credit_card_balance(credit_card)
    payments_mean, payments_max, payments_min = preprocess_payments(payments)

    # Per-client aggregates in merge order. The row-level POS/cash and
    # credit-card tables are averaged here once instead of once per target frame.
    per_client_tables = [
        prev,
        bureau,
        POS_CASH.groupby('SK_ID_CURR').mean(),
        credit_card.groupby('SK_ID_CURR').mean(),
        payments_mean,
        payments_max,
        payments_min,
    ]

    for table in per_client_tables:
        right = table.reset_index()
        data = data.merge(right=right, how='left', on='SK_ID_CURR')
        test = test.merge(right=right, how='left', on='SK_ID_CURR')
    return data, test

In [None]:
# Encode train and test together, then cut the stacked frame back into its
# two parts by position: the training rows come first.
n_train_rows = data.shape[0]
application_data = preprocess_application_files(data, test)
data = application_data.iloc[:n_train_rows, :]
test = application_data.iloc[n_train_rows:, ]

print('Combining all tables...')
data, test = combine_all(
    data, test, bureau, bureau_balance, prev, POS_CASH, credit_card, payments
)

In [None]:
# Bar chart of the class balance; labelled so the figure can be read on its own.
ax = target.value_counts().plot(kind='bar')
ax.set(title='TARGET class distribution', xlabel='TARGET', ylabel='Number of applications');

# 2. Model Operations

We have a classification problem and we will start with LightGBM. Splitting the data into train, validation and test parts is helpful for evaluation, but the result may then depend on one particular split, which can hide overfitting. We can use cross-validation to address this problem. We will consider K-Folds and Stratified K-Folds as cross-validators.   

When selecting the right model validation method we have to consider the dataset. As we can see in the bar chart above, the dataset is imbalanced. In this chart, 0 stands for a customer who will repay on time and 1 for the opposite. Stratified K-Folds enforces the class distribution in each split of the data to match the distribution in the complete training dataset ([source](https://machinelearningmastery.com/cross-validation-for-imbalanced-classification/)). This means that we should prefer StratifiedKFold over KFold when dealing with classification tasks with imbalanced class distributions.

In [None]:
import re

# Strip every character that is not a letter, digit or underscore from the
# column names of both frames (presumably because LightGBM rejects feature
# names containing special characters - confirm).
_unsafe_chars = re.compile('[^A-Za-z0-9_]+')
data = data.rename(columns=lambda name: _unsafe_chars.sub('', name))
test = test.rename(columns=lambda name: _unsafe_chars.sub('', name))

LightGBM parameters based on this study -> https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code

In [None]:
def get_model():
    """Build the baseline LightGBM classifier with fixed hyperparameters.

    The deprecated ``silent`` argument is no longer passed (``verbose=-1``
    already controls the log level) and the thread count is given through the
    scikit-learn style ``n_jobs`` parameter instead of the ``nthread`` alias.

    Returns
    -------
    lightgbm.LGBMClassifier
        An unfitted classifier.
    """
    clf = LGBMClassifier(
            n_jobs=4,
            n_estimators=10000,
            learning_rate=0.03,
            num_leaves=34,
            colsample_bytree=0.9497036,
            # NOTE(review): LightGBM documents that row subsampling only takes
            # effect when `subsample_freq` > 0; it is left unset here to keep
            # the tuned configuration unchanged - confirm this is intended.
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            verbose=-1, )

    return clf

In [None]:
import gc
from lightgbm import early_stopping, log_evaluation

# 5-fold stratified cross-validation: out-of-fold predictions on the training
# set, fold-averaged predictions on the test set.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1001)
oof_preds = np.zeros(data.shape[0])
sub_preds = np.zeros(test.shape[0])

# The row identifier is not a feature. TARGET is excluded as well so that the
# cell stays correct if the label has been attached to `data` by another cell.
feats = [f for f in data.columns if f not in ['SK_ID_CURR', 'TARGET']]

# Collected per fold and concatenated once after the loop.
fold_importances = []

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(data, target)):
    trn_x, trn_y = data[feats].iloc[trn_idx], target.iloc[trn_idx]
    val_x, val_y = data[feats].iloc[val_idx], target.iloc[val_idx]

    clf = get_model()

    # Early stopping and periodic logging are passed as callbacks; the
    # `early_stopping_rounds` / `verbose` arguments of fit() were removed in
    # LightGBM 4.0.
    clf.fit(trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric='auc',
            callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=100)]
           )

    oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(test[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    fold_importances.append(pd.DataFrame({
        "feature": feats,
        "importance": clf.feature_importances_,
        "fold": n_fold + 1,
    }))

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    del clf, trn_x, trn_y, val_x, val_y
    gc.collect()

feature_importance_df = pd.concat(fold_importances, axis=0)

print('Full AUC score %.6f' % roc_auc_score(target, oof_preds))

test['TARGET'] = sub_preds

test[['SK_ID_CURR', 'TARGET']].to_csv('1submission.csv', index=False)

In [None]:
! pip install lofo-importance

# Feature Selection
We have 504 features after all these preprocessing operations. More features might seem like a useful thing, but they can actually be harmful in some cases. Irrelevant features, highly correlated features and null values can all decrease the generalization of the model. Once all features have been extracted, we must perform feature selection to keep only the most useful variables. Feature selection here means calculating the importance of each feature based on a given metric and a given model. Our model is LightGBM and the selected metric is ROC AUC. We will use [LOFO-importance](https://github.com/aerdem4/lofo-importance) for this purpose.


In [None]:
from lofo import LOFOImportance, Dataset, plot_importance

# Candidate features: every column except the row identifier and the label.
# Listing TARGET itself as a feature would leak the answer into the model.
lofo_features = [col for col in data.columns if col not in ('SK_ID_CURR', 'TARGET')]

# Attach the label to a separate frame instead of writing it into `data`,
# so cells that derive their feature list from `data.columns` are unaffected.
# Labels are matched by position, as in the cross-validation split.
lofo_frame = data.assign(TARGET=target.to_numpy())

dataset = Dataset(df=lofo_frame, target="TARGET", features=lofo_features)
lofo_imp = LOFOImportance(dataset, cv=folds, model=get_model(), scoring='roc_auc')
importance_df = lofo_imp.get_importance()
importance_df


In [None]:
%matplotlib inline
# Draw the LOFO importance table (`importance_df`) on a 12x12 figure.
plot_importance(importance_df, figsize=(12, 12))

# Optimisation
Hyperparameter optimization (also known as tuning) is the process of choosing a set of optimal hyperparameters for the chosen learning algorithm, LightGBM in our case. [Optuna](https://optuna.readthedocs.io/en/stable/#) is an automated hyperparameter optimization framework for this purpose. 

In [None]:
# Let's see how to adapt our study to use Optuna
# Change the model so that it receives its parameters from outside
def get_model(nthread=4, n_estimators=10000, learning_rate=0.03, num_leaves=34, max_depth=8,
              min_split_gain=0.0222415, min_child_samples=20, reg_alpha=0.041545473, reg_lambda=0.0735294):
    """Build a LightGBM classifier from externally supplied hyperparameters.

    This definition replaces the zero-argument ``get_model`` defined earlier
    in the notebook. Every parameter has a default so that existing
    ``get_model()`` calls keep working after this cell has run. Note that the
    defaults do not set ``colsample_bytree``, ``subsample`` or
    ``min_child_weight``, so a no-argument call is not the same model as the
    earlier baseline.

    Parameters
    ----------
    nthread : int
        Number of threads, forwarded as ``n_jobs``.
    n_estimators, learning_rate, num_leaves, max_depth, min_split_gain,
    min_child_samples, reg_alpha, reg_lambda
        Forwarded unchanged to ``LGBMClassifier``.

    Returns
    -------
    lightgbm.LGBMClassifier
        An unfitted classifier.
    """
    # `silent` is deprecated and no longer passed; `verbose=-1` sets the log level.
    clf = LGBMClassifier(
            n_jobs=nthread,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            max_depth=max_depth,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
            min_split_gain=min_split_gain,
            min_child_samples=min_child_samples,
            verbose=-1, )

    return clf

In [None]:
# Create an objective function for Optuna
# 
import gc
def objective(data, test, trial):
    """Optuna objective: out-of-fold ROC AUC of LightGBM for one trial.

    One set of hyperparameters is sampled from ``trial`` and evaluated with
    5-fold stratified cross-validation against the module-level ``target``.
    The score is computed only after *all* folds have been trained, so every
    training row has a real out-of-fold prediction.

    Parameters
    ----------
    data : pandas.DataFrame
        Training features. ``SK_ID_CURR`` and, if present, ``TARGET`` are
        excluded from the feature list.
    test : pandas.DataFrame
        Not used and not modified. Kept in the signature so existing
        ``objective(data, test, trial)`` calls continue to work; predicting
        the test set on every trial is not needed to score a trial.
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC of the out-of-fold predictions (to be maximised).
    """
    from lightgbm import early_stopping, log_evaluation

    # Sample the hyperparameters once per trial, outside the fold loop.
    # NOTE(review): the thread count does not influence model quality;
    # consider fixing it instead of searching over it.
    nthread = trial.suggest_int('nthread', 1, 10)
    n_estimators = trial.suggest_int('n_estimators', 100, 20000)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1, log=True)
    num_leaves = trial.suggest_int('num_leaves', 2, 70)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    min_split_gain = trial.suggest_float('min_split_gain', 0.1, 5.0, step=0.01)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 100)
    reg_alpha = trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True)
    reg_lambda = trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1001)
    oof_preds = np.zeros(data.shape[0])

    # Never train on the identifier or on the label itself.
    feats = [f for f in data.columns if f not in ['SK_ID_CURR', 'TARGET']]

    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(data, target)):
        trn_x, trn_y = data[feats].iloc[trn_idx], target.iloc[trn_idx]
        val_x, val_y = data[feats].iloc[val_idx], target.iloc[val_idx]

        clf = get_model(nthread, n_estimators, learning_rate, num_leaves, max_depth,
                        min_split_gain, min_child_samples, reg_alpha, reg_lambda)

        # Callbacks replace the `early_stopping_rounds` / `verbose` arguments
        # of fit(), which were removed in LightGBM 4.0.
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric='auc',
                callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=100)]
               )

        oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()

    # Score only once every fold has contributed its out-of-fold predictions.
    full_auc = roc_auc_score(target, oof_preds)
    print('Full AUC score %.6f' % full_auc)
    return full_auc

In [None]:
import optuna

# Seed the sampler so that the hyperparameter suggestions can be reproduced
# from run to run (same seed as the cross-validation split).
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=1001))
study.optimize(lambda trial: objective(data, test, trial), n_trials=100)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best AUC:', study.best_value)