In [None]:
import numpy as np
import pandas as pd

In [None]:
from tqdm.notebook import tqdm
import random
import gc
import time

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
import lightgbm as lgb

In [None]:
# Make sure automatic garbage collection is switched on for this session.
gc.enable()

One of the main problems that I faced is the large amount of data that needs to be handled. To keep the memory usage under control at all times, I process the data one portion at a time, applying similar transformations multiple times.

Application data

For all the csv files I consider XNA and XAP as nan (along with the default nan).

In [None]:
# Both application tables are parsed with the same options: the 'XNA' and 'XAP'
# placeholders are read as missing values, on top of the default NaN markers.
application_csv_options = dict(na_values=['XNA', 'XAP'], na_filter=True)
train_data = pd.read_csv('/kaggle/input/home-credit-default-risk/application_train.csv',
                         **application_csv_options)
test_data = pd.read_csv('/kaggle/input/home-credit-default-risk/application_test.csv',
                        **application_csv_options)

Cleaning the application data

In [None]:
train_counts = train_data.count().sort_values()/len(train_data)
test_counts = test_data.count().sort_values()/len(test_data)

In [None]:
cols = set(train_counts[(train_counts < 1) & (train_counts > 0.99)].index) - set(test_counts[(test_counts < 1) & (test_counts > 0.9)].index)

In [None]:
cols

I drop the few rows of the train data that have missing values in columns where less than 1% of the train values are missing and where the test data has no missing values. Then I join the two datasets, adding a column IS_TRAIN that identifies which dataset each record belongs to.

In [None]:
train_data.dropna(subset=cols, inplace=True)

In [None]:
train_target = train_data[['SK_ID_CURR', 'TARGET']]

In [None]:
submit = test_data[['SK_ID_CURR']]

In [None]:
train_data.drop(columns=['TARGET'], inplace=True)

In [None]:
test_data['IS_TRAIN'] = 0
train_data['IS_TRAIN'] = 1

In [None]:
application_data = train_data.append(test_data)

In [None]:
del(train_data)
del(test_data)

Analysis of columns with more than 40% of missing values (i.e. less than 60% of the values are present)

In [None]:
appl_counts = application_data.count().sort_values()/len(application_data)

In [None]:
appl_counts[(appl_counts < 0.6)]

OWN_CAR_AGE can be handled with FLAG_OWN_CAR when using the tree based gradient boosting.

EXT_SOURCE_1 will be imputed when the external bureau data is added.
The rest is data relative to the housing and it is mostly missing, so I drop all those columns.

In [None]:
appl_counts = application_data.count().sort_values()/len(application_data)

In [None]:
cols = list(set(appl_counts[(appl_counts < 0.6)].index) - set(['EXT_SOURCE_1', 'OWN_CAR_AGE']))

In the final submission I decided to also leave the housing features which slightly improve the score.

In [None]:
#application_data.drop(columns=cols, inplace=True)

Label encoding for binary categorical features

In [None]:
# Integer-encode every text column that holds at most two distinct values
# (a missing value counts as a distinct value here).
le = LabelEncoder()
two_valued_text_columns = [
    name
    for name in application_data.select_dtypes('object')
    if application_data[name].nunique(dropna=False) <= 2
]
for name in two_valued_text_columns:
    application_data[name] = le.fit_transform(application_data[name])

Next I handle the three categorical features with missing values: ORGANIZATION_TYPE, NAME_TYPE_SUITE and OCCUPATION_TYPE. The approach that I prefer is to create a new categorical value, Nan, for each of them, so that the existing data is left untouched; this is done by the get_dummies function used for the one-hot encoding. I will use the same approach for all the following data.

One hot encoding of all the other categorical features

In [None]:
# One-hot encode the remaining text columns; dummy_na=True adds a separate indicator column for missing values.
application_data = pd.get_dummies(data=application_data, dummy_na=True)

I check the correlation of the features with the target to see if I can drop the remaining columns with missing values

In [None]:
appl_counts = application_data.count().sort_values()/len(application_data)

In [None]:
appl_counts[(appl_counts < 1)]

In [None]:
train_data = application_data[application_data.IS_TRAIN == 1].merge(train_target, how='left', on='SK_ID_CURR')

In [None]:
corrs = train_data.corr()

In [None]:
del(train_data)

In [None]:
corrs['TARGET'].abs().sort_values().tail(40)

EXT_SOURCE_1, EXT_SOURCE_2 and EXT_SOURCE_3, which are scores obtained from external sources, are the best features. I tried to impute them using the rest of the data, but that only lowered their correlation with the target, so I prefer to leave them as they are, since the LightGBM algorithm can handle missing data.

The 6 AMT_REQ_CREDIT_BUREAU features are missing because these clients are not present in the bureau credit data. So I add a new feature IS_IN_BUREAU.

In [None]:
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv', 
                               na_values=['XNA', 'XAP'], na_filter=True)

In [None]:
application_data[application_data.AMT_REQ_CREDIT_BUREAU_WEEK.isnull() & application_data.SK_ID_CURR.isin(bureau.SK_ID_CURR.unique())]

In [None]:
application_data['IS_IN_BUREAU'] = 0

In [None]:
application_data.loc[application_data.SK_ID_CURR.isin(bureau.SK_ID_CURR.unique()), 'IS_IN_BUREAU'] = 1

As for the CNT_SOCIAL_CIRCLE features, not knowing how this data was gathered, I decided to add a feature HAS_SOCIAL_CIRCLE.

In [None]:
appl_counts = application_data.count().sort_values()/len(application_data)
appl_counts[(appl_counts < 1)]

In [None]:
application_data['HAS_SOCIAL_CIRCLE'] = 0

In [None]:
application_data.loc[~application_data.OBS_30_CNT_SOCIAL_CIRCLE.isnull(), 'HAS_SOCIAL_CIRCLE'] = 1

Hand crafted features. The few application data features that actually regard the entity of the loan are only slightly correlated to the target. So I tried to build a few new features

In [None]:
application_data

In [None]:
application_data['AMT_CREDIT_FRAC'] = application_data.AMT_CREDIT / application_data.AMT_INCOME_TOTAL

In [None]:
application_data['AMT_CREDIT_FRAC'] = application_data.AMT_ANNUITY / application_data.AMT_CREDIT

In [None]:
application_data['AMT_GOODS_FRAC'] = application_data.AMT_GOODS_PRICE / application_data.AMT_CREDIT

In [None]:
application_data['AMT_ANNUITY_FRAC'] = application_data.AMT_ANNUITY / application_data.AMT_INCOME_TOTAL

In [None]:
application_data['AMT_DPD_DEF'] = application_data.DEF_30_CNT_SOCIAL_CIRCLE + application_data.OBS_30_CNT_SOCIAL_CIRCLE

Only AMT_GOODS_FRAC is actually a very good feature. I kept all the others leaving the machine learning algorithm to decide how to use them.

Bureau Balance data

In [None]:
# Same absolute input directory as the other reads in this notebook. The relative
# '../input/...' form only resolves when the working directory sits next to the input directory.
# 'XNA' and 'XAP' are read as missing values.
bureau_balance = pd.read_csv('/kaggle/input/home-credit-default-risk/bureau_balance.csv', 
                               na_values=['XNA', 'XAP'], na_filter=True)

I drop the data of loans that are not related to clients in the application data.

In [None]:
bureau = bureau[bureau.SK_ID_CURR.isin(application_data.SK_ID_CURR.unique())]

In [None]:
bureau_balance = bureau_balance[bureau_balance.SK_ID_BUREAU.isin(bureau.SK_ID_BUREAU.unique())]

One-hot encoding

In [None]:
# Turn the text columns of the monthly balances into indicator columns.
bureau_balance = pd.get_dummies(data=bureau_balance)

Aggregation

In [None]:
bureau_balance = bureau_balance.sort_values(['SK_ID_BUREAU', 'MONTHS_BALANCE'])

In [None]:
temp = bureau_balance.groupby('SK_ID_BUREAU').size().to_frame()
temp = temp.rename(columns={0: 'COUNT'})
temp.reset_index(inplace=True)

In [None]:
bureau_balance = bureau_balance.groupby('SK_ID_BUREAU').agg({'last', 'sum', 'mean'})

In [None]:
bureau_balance.columns = bureau_balance.columns.map('_'.join)

In [None]:
bureau_balance.reset_index(inplace=True)

In [None]:
bureau_balance = bureau_balance.merge(temp, how='left', on='SK_ID_BUREAU')

In [None]:
bureau_balance.columns = bureau_balance.columns.map(lambda x : 'BLN_' + x if x != 'SK_ID_BUREAU' else x)

Bureau data

In [None]:
bureau = bureau.merge(bureau_balance, how='left', on='SK_ID_BUREAU')

In [None]:
bureau.drop(columns='SK_ID_BUREAU', inplace=True)

In [None]:
del(bureau_balance)

In [None]:
bureau = bureau.sort_values(['SK_ID_CURR', 'DAYS_CREDIT'])

In [None]:
bureau = pd.get_dummies(bureau, dummy_na=True)

In [None]:
temp = bureau.groupby('SK_ID_CURR').size().to_frame()
temp = temp.rename(columns={0: 'COUNT'})
temp.reset_index(inplace=True)

In [None]:
bureau = bureau.groupby('SK_ID_CURR').agg({'sum', 'mean', 'max'})

In [None]:
bureau.columns = bureau.columns.map('_'.join)

In [None]:
bureau.reset_index(inplace=True)

In [None]:
bureau = bureau.merge(temp, how='left', on='SK_ID_CURR')

In [None]:
bureau.columns = bureau.columns.map(lambda x : 'BRU_' + x if x != 'SK_ID_CURR' else x)

In [None]:
application_data = application_data.merge(bureau, how='left', on='SK_ID_CURR')

In [None]:
del(bureau)

Previous applications

In [None]:
prev_application = pd.read_csv('/kaggle/input/home-credit-default-risk/previous_application.csv', 
                               na_values=['XNA', 'XAP'], na_filter=True)

In [None]:
prev_application = prev_application[prev_application.SK_ID_CURR.isin(application_data.SK_ID_CURR.unique())]

In [None]:
prev_application = pd.get_dummies(prev_application, dummy_na=True)

In [None]:
prev_application.drop(columns='SK_ID_PREV', inplace=True)

In [None]:
prev_application = prev_application.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])

In [None]:
temp = prev_application.groupby('SK_ID_CURR').size().to_frame()
temp = temp.rename(columns={0: 'COUNT'})
temp.reset_index(inplace=True)

In [None]:
prev_application = prev_application.groupby('SK_ID_CURR').agg(['max', 'sum', 'mean']) # last

In [None]:
prev_application.columns = prev_application.columns.map('_'.join)

In [None]:
prev_application.reset_index(inplace=True)

In [None]:
prev_application = prev_application.merge(temp, how='left', on='SK_ID_CURR')

In [None]:
prev_application.columns = prev_application.columns.map(lambda x : 'PREV_' + x if x != 'SK_ID_CURR' else x)

In [None]:
application_data = application_data.merge(prev_application, how='left', on='SK_ID_CURR')

In [None]:
del(prev_application)

POS Cash Balance

In [None]:
pos_cash_balance = pd.read_csv('/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv', 
                               na_values=['XNA', 'XAP'], na_filter=True)

In [None]:
pos_cash_balance = pos_cash_balance.sort_values(['SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE'])

In [None]:
temp = pos_cash_balance.groupby(['SK_ID_CURR', 'SK_ID_PREV']).size().to_frame()
temp = temp.rename(columns={0: 'BLN_COUNT'})
temp.reset_index(inplace=True)

In [None]:
pos_cash_balance = pd.get_dummies(pos_cash_balance, dummy_na=True)

In [None]:
pos_cash_balance = pos_cash_balance.groupby(['SK_ID_PREV', 'SK_ID_CURR']).agg(['sum', 'mean', 'max']) # last

In [None]:
pos_cash_balance.columns = pos_cash_balance.columns.map('_'.join)

In [None]:
pos_cash_balance.reset_index(inplace=True)

In [None]:
pos_cash_balance = pos_cash_balance.merge(temp, how='left', on=['SK_ID_CURR', 'SK_ID_PREV'])

In [None]:
pos_cash_balance.drop(columns='SK_ID_PREV', inplace=True)

In [None]:
temp = pos_cash_balance.groupby('SK_ID_CURR').size().to_frame()
temp = temp.rename(columns={0: 'COUNT'})
temp.reset_index(inplace=True)

In [None]:
pos_cash_balance = pos_cash_balance.groupby(['SK_ID_CURR']).agg(['sum', 'mean', 'max']) 

In [None]:
pos_cash_balance.columns = pos_cash_balance.columns.map('_'.join)

In [None]:
pos_cash_balance.reset_index(inplace=True)

In [None]:
pos_cash_balance = pos_cash_balance.merge(temp, how='left', on='SK_ID_CURR')

In [None]:
pos_cash_balance.columns = pos_cash_balance.columns.map(lambda x : 'CSH_' + x if x != 'SK_ID_CURR' else x)

In [None]:
application_data = application_data.merge(pos_cash_balance, how='left', on='SK_ID_CURR')

In [None]:
del(pos_cash_balance)

Credit Card Balance

In [None]:
credit_card_balance = pd.read_csv('/kaggle/input/home-credit-default-risk/credit_card_balance.csv', 
                               na_values=['XNA', 'XAP'], na_filter=True)

In [None]:
credit_card_balance = pd.get_dummies(credit_card_balance, dummy_na=True)

In [None]:
credit_card_balance = credit_card_balance.sort_values(['SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE'])

In [None]:
temp = credit_card_balance.groupby(['SK_ID_CURR', 'SK_ID_PREV']).size().to_frame()
temp = temp.rename(columns={0: 'COUNT'})
temp.reset_index(inplace=True)

In [None]:
credit_card_balance = credit_card_balance.groupby(['SK_ID_PREV', 'SK_ID_CURR']).agg(['sum', 'mean', 'max']) # last

In [None]:
credit_card_balance.columns = credit_card_balance.columns.map('_'.join)

In [None]:
credit_card_balance.reset_index(inplace=True)

In [None]:
credit_card_balance = credit_card_balance.merge(temp, how='left', on=['SK_ID_CURR', 'SK_ID_PREV'])

In [None]:
credit_card_balance.drop(columns='SK_ID_PREV', inplace=True)

In [None]:
credit_card_balance = credit_card_balance.groupby(['SK_ID_CURR']).agg(['sum', 'mean'])

In [None]:
credit_card_balance.columns = credit_card_balance.columns.map('_'.join)

In [None]:
credit_card_balance.reset_index(inplace=True)

In [None]:
credit_card_balance.columns = credit_card_balance.columns.map(lambda x : 'CRD_' + x if x != 'SK_ID_CURR' else x)

In [None]:
application_data = application_data.merge(credit_card_balance, how='left', on='SK_ID_CURR')

In [None]:
del(credit_card_balance)

Installment payments

In [None]:
installments_payments = pd.read_csv('/kaggle/input/home-credit-default-risk/installments_payments.csv', 
                               na_values=['XNA', 'XAP'], na_filter=True)

In [None]:
installments_payments = installments_payments.sort_values(['SK_ID_CURR', 'SK_ID_PREV', 'DAYS_ENTRY_PAYMENT'])

In [None]:
temp = installments_payments.groupby(['SK_ID_CURR', 'SK_ID_PREV']).size().to_frame()
temp = temp.rename(columns={0: 'COUNT'})
temp.reset_index(inplace=True)

In [None]:
installments_payments.fillna(0, inplace=True)

In [None]:
installments_payments = installments_payments.groupby(['SK_ID_PREV', 'SK_ID_CURR']).agg(['sum', 'mean', 'max', 'min'])

In [None]:
installments_payments.columns = installments_payments.columns.map('_'.join)

In [None]:
installments_payments.reset_index(inplace=True)

In [None]:
installments_payments = installments_payments.merge(temp, how='left', on=['SK_ID_CURR', 'SK_ID_PREV'])

In [None]:
installments_payments.drop(columns='SK_ID_PREV', inplace=True)

In [None]:
installments_payments = installments_payments.groupby(['SK_ID_CURR']).agg(['sum', 'mean', 'max', 'min'])

In [None]:
installments_payments.columns = installments_payments.columns.map('_'.join)

In [None]:
installments_payments.reset_index(inplace=True)

In [None]:
installments_payments.columns = installments_payments.columns.map(lambda x : 'INS_' + x if x != 'SK_ID_CURR' else x)

In [None]:
application_data = application_data.merge(installments_payments, how='left', on='SK_ID_CURR')

In [None]:
del(installments_payments)

In [None]:
for col in application_data.columns:
    if len(application_data[col].unique()) <= 1:
        application_data.drop(columns=col,inplace=True)

In [None]:
application_data.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in application_data.columns]

Feature selection

In order to reduce the number of features before starting the training and evaluation, I use the lgbm algorithm to select the most important features based on the number of times the feature is used in a model. 

In [None]:
model = lgb.LGBMClassifier()

In [None]:
train_data = application_data[application_data.IS_TRAIN == 1.0]

In [None]:
test_data = application_data[application_data.IS_TRAIN == 0.0]

In [None]:
train_data.drop(columns='IS_TRAIN', inplace=True)
test_data.drop(columns='IS_TRAIN', inplace=True)

In [None]:
del(application_data)

In [None]:
params = model.get_params()

In [None]:
params['objective'] = 'binary'
params['metric'] = 'auc'

In [None]:
# Accumulate LightGBM feature importances over 3 stratified folds.
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
final_importance = np.zeros(len(train_data.columns))
fold_indices = skf.split(train_data, train_target.TARGET)
for n_fold, (train_index, valid_index) in tqdm(enumerate(fold_indices)):
    # Rows are selected by position in both the features and the labels.
    X_train, X_valid = train_data.iloc[train_index], train_data.iloc[valid_index]
    y_train = train_target.TARGET.iloc[train_index]
    y_valid = train_target.TARGET.iloc[valid_index]
    lgb_train = lgb.Dataset(data=X_train, label=y_train)
    lgb_eval = lgb.Dataset(data=X_valid, label=y_valid)
    model = lgb.train(params, lgb_train, valid_sets=lgb_eval, early_stopping_rounds=150, verbose_eval=100)
    final_importance = final_importance + model.feature_importance()

In [None]:
fi = pd.DataFrame()
fi['FEAT'] = train_data.columns

In [None]:
fi['importance'] = final_importance

In [None]:
fi = fi.sort_values(by='importance', ascending=False)

In [None]:
fi = fi[fi.importance != 0]

In [None]:
fi.head(30)

In [None]:
cols = list(set(fi.FEAT.values).union(set(['SK_ID_CURR'])))

In [None]:
len(cols)

In [None]:
train_data = train_data[cols]

In [None]:
test_data = test_data[cols]

Hyperparameter Tuning

The approach that I follow for the hyperparameter tuning is a random search over intervals around the main default parameters of the LightGBM classifier.

In [None]:
def get_random_params():
    """Draw one random LightGBM parameter set for the random search.

    The fixed entries are 'boosting_type', 'metric' and 'objective'; every other
    entry is sampled with the global ``random`` module, either uniformly from an
    integer range (both ends included) or from a short list of candidate values.

    Returns:
        dict: parameter name -> value.
    """
    regularisation_grid = [0.001, 0.005, 0.01, 0.05, 0.1]

    # Entries are filled in one by one; the draws happen in this order.
    params = {'boosting_type': 'gbdt', 'metric': 'auc'}
    params['num_leaves'] = random.randint(10, 60)
    params['max_depth'] = random.randint(10, 30)
    params['learning_rate'] = random.choice([0.0001, 0.0005, 0.001, 0.005, 0.01])
    params['n_estimators'] = random.randint(1000, 20000)
    params['objective'] = 'binary'
    params['reg_alpha'] = random.choice(regularisation_grid)
    params['reg_lambda'] = random.choice(regularisation_grid)
    params['colsample_bytree'] = random.choice([0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
    params['min_child_samples'] = random.randint(10, 100)
    params['subsample_for_bin'] = random.randint(50000, 300000)
    return params

In [None]:
# Parameter set used for the final training, with the AUC recorded for it.
best_params = dict(
    boosting_type='gbdt',
    metric='auc',
    num_leaves=46,
    max_depth=18,
    learning_rate=0.01,
    n_estimators=6289,
    objective='binary',
    reg_alpha=0.05,
    reg_lambda=0.05,
    colsample_bytree=0.4,
    min_child_samples=79,
    subsample_for_bin=113092,
)
best_auc = 0.787228

In [None]:
def get_best_params(hyper_rounds, n_folds, best_params=None, best_auc=0):
    """Random search over LightGBM parameters with cross-validation.

    Each round draws a candidate with get_random_params(), cross-validates it with
    lgb.cv on the module-level train_data / train_target, and keeps the candidate
    when the last entry of its 'auc-mean' history beats the best AUC seen so far.

    Args:
        hyper_rounds: number of candidates to try.
        n_folds: number of folds passed to lgb.cv.
        best_params: parameter set to start from; returned unchanged when no
            candidate scores higher than best_auc.
        best_auc: AUC that a candidate has to exceed.

    Returns:
        tuple: (best parameter set, its AUC).
    """
    full_train_set = lgb.Dataset(data=train_data, label=train_target.TARGET)
    for _ in tqdm(range(hyper_rounds)):
        candidate = get_random_params()
        started_at = time.time()
        print(candidate)
        cv_history = lgb.cv(candidate, full_train_set, early_stopping_rounds = 200, nfold = n_folds, seed = 42, verbose_eval = 100)
        finished_at = time.time()
        print('TIME:', finished_at-started_at)
        candidate_auc = cv_history['auc-mean'][-1]
        if candidate_auc > best_auc:
            best_params, best_auc = candidate, candidate_auc
    return best_params, best_auc

Uncomment the last line of the next cell to run the hyperparameter tuning.

In [None]:
# Settings of the random search: number of candidates and number of cross-validation folds.
HYPER_ROUNDS, FOLDS = 1, 5
#best_params, best_auc = get_best_params(HYPER_ROUNDS, FOLDS, best_params, best_auc)

Training and evaluation

In [None]:
N_FOLDS = 10

In [None]:
skf = StratifiedKFold(n_splits=N_FOLDS, random_state=42, shuffle=True)
sub_preds = np.zeros(len(test_data))
avg_valid_auc = 0
for n_fold, (train_index, valid_index) in tqdm(enumerate(skf.split(train_data, train_target.TARGET))):
    print("FOLD N:", n_fold)
    X_train = train_data.iloc[train_index]
    y_train = train_target.iloc[train_index].TARGET
    X_valid = train_data.iloc[valid_index]
    y_valid = train_target.iloc[valid_index].TARGET
    lgb_train = lgb.Dataset(data=X_train, label=y_train)
    lgb_eval = lgb.Dataset(data=X_valid, label=y_valid)
    model = lgb.train(best_params, lgb_train, valid_sets=lgb_eval, early_stopping_rounds=150, verbose_eval=100)
    y_pred = model.predict(X_valid)
    sub_preds += model.predict(test_data) / skf.n_splits
    avg_valid_auc += roc_auc_score(y_valid, y_pred) / N_FOLDS

In [None]:
avg_valid_auc

Submission

In [None]:
submit['TARGET'] = sub_preds

In [None]:
submit.to_csv('submission.csv', index = False)

References

1. Guolin Ke, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, Tie-Yan Liu. 2017. [LightGBM: A Highly Efficient Gradient Boosting Decision Tree](https://papers.nips.cc/paper/6907-lightgbm-a-highly-efficient-gradient-boosting-decision-tree.pdf)