In [2]:
import numpy as np
import pandas as pd

In [3]:
from tqdm.notebook import tqdm
import random
import gc
import time

In [4]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier

In [5]:
import lightgbm as lgb

In [6]:
gc.enable()

One of the main problems that I faced is the large amount of data that needs to be handled, so I process the data one portion at a time in order to always keep under control the memory usage, applying similar transformations multiple times.

Application data

For all the csv files I consider XNA and XAP as nan (along with the default nan).

In [7]:
train_data = pd.read_csv('/kaggle/input/home-credit-default-risk/application_train.csv', 
                               na_values=['XNA', 'XAP'], na_filter=True)
test_data = pd.read_csv('/kaggle/input/home-credit-default-risk/application_test.csv', 
                               na_values=['XNA', 'XAP'], na_filter=True)

Cleaning the application data

In [8]:
train_counts = train_data.count().sort_values()/len(train_data)
test_counts = test_data.count().sort_values()/len(test_data)

In [9]:
cols = set(train_counts[(train_counts < 1) & (train_counts > 0.99)].index) - set(test_counts[(test_counts < 1) & (test_counts > 0.9)].index)

In [10]:
cols

{'AMT_GOODS_PRICE', 'CNT_FAM_MEMBERS', 'CODE_GENDER', 'DAYS_LAST_PHONE_CHANGE'}

I drop a few rows in the train data where there are less than .01% missing values in columns where the test data has got no missing values and then I join the two datasets adding one column IS_TRAIN to identify where each record belongs.

In [11]:
train_data.dropna(subset=cols, inplace=True)

In [12]:
train_target = train_data[['SK_ID_CURR', 'TARGET']]

In [13]:
submit = test_data[['SK_ID_CURR']]

In [14]:
train_data.drop(columns=['TARGET'], inplace=True)

In [15]:
test_data['IS_TRAIN'] = 0
train_data['IS_TRAIN'] = 1

In [16]:
application_data = train_data.append(test_data)

In [17]:
del(train_data)
del(test_data)

Analysis of columns with more than 60% of missing values

In [18]:
appl_counts = application_data.count().sort_values()/len(application_data)

In [19]:
appl_counts[(appl_counts < 0.6)]

COMMONAREA_MEDI                 0.302917
COMMONAREA_MODE                 0.302917
COMMONAREA_AVG                  0.302917
NONLIVINGAPARTMENTS_MEDI        0.307106
NONLIVINGAPARTMENTS_MODE        0.307106
NONLIVINGAPARTMENTS_AVG         0.307106
FONDKAPREMONT_MODE              0.317696
LIVINGAPARTMENTS_MEDI           0.318005
LIVINGAPARTMENTS_MODE           0.318005
LIVINGAPARTMENTS_AVG            0.318005
FLOORSMIN_MEDI                  0.323253
FLOORSMIN_MODE                  0.323253
FLOORSMIN_AVG                   0.323253
YEARS_BUILD_MEDI                0.336743
YEARS_BUILD_AVG                 0.336743
YEARS_BUILD_MODE                0.336743
OWN_CAR_AGE                     0.339945
LANDAREA_AVG                    0.408206
LANDAREA_MEDI                   0.408206
LANDAREA_MODE                   0.408206
BASEMENTAREA_MEDI               0.417339
BASEMENTAREA_MODE               0.417339
BASEMENTAREA_AVG                0.417339
NONLIVINGAREA_MEDI              0.450533
NONLIVINGAREA_MO

OWN_CAR_AGE can be handled with FLAG_OWN_CAR when using the tree based gradient boosting.

EXT_SOURCE_1 will be imputed when the external bureau data is added.
The rest is data relative to the housing and it is mostly missing, so I drop all those columns.

In [20]:
appl_counts = application_data.count().sort_values()/len(application_data)

In [21]:
cols = list(set(appl_counts[(appl_counts < 0.6)].index) - set(['EXT_SOURCE_1', 'OWN_CAR_AGE']))

In the final submission I decided to also leave the housing features which slightly improve the score.

In [22]:
#application_data.drop(columns=cols, inplace=True)

Label encoding for binary categorical features

In [23]:
le = LabelEncoder()
for col in application_data.select_dtypes('object'):
    if len(application_data[col].unique()) <= 2:
        le.fit(application_data[col])
        application_data[col] = le.transform(application_data[col])

Next I handle the three categorical features with missing values: ORGANIZATION_TYPE, NAME_TYPE_SUITE, OCCUPATION_TYPE. The approach that I prefer to follow is to create a new categorical value for all of them called Nan in order to avoid messing up the existing data, which will be handled by the get_dummies function used for the one hot encoding. I will use the same approch for all the following data.

One hot encoding of all the other categorical features

In [24]:
application_data = pd.get_dummies(application_data, dummy_na=True)

I check the correlation of the features with the target to see if I can drop the remaining columns with missing values

In [25]:
appl_counts = application_data.count().sort_values()/len(application_data)

In [26]:
appl_counts[(appl_counts < 1)]

COMMONAREA_MODE                 0.302917
COMMONAREA_MEDI                 0.302917
COMMONAREA_AVG                  0.302917
NONLIVINGAPARTMENTS_MODE        0.307106
NONLIVINGAPARTMENTS_MEDI        0.307106
NONLIVINGAPARTMENTS_AVG         0.307106
LIVINGAPARTMENTS_MODE           0.318005
LIVINGAPARTMENTS_AVG            0.318005
LIVINGAPARTMENTS_MEDI           0.318005
FLOORSMIN_AVG                   0.323253
FLOORSMIN_MEDI                  0.323253
FLOORSMIN_MODE                  0.323253
YEARS_BUILD_AVG                 0.336743
YEARS_BUILD_MODE                0.336743
YEARS_BUILD_MEDI                0.336743
OWN_CAR_AGE                     0.339945
LANDAREA_MEDI                   0.408206
LANDAREA_AVG                    0.408206
LANDAREA_MODE                   0.408206
BASEMENTAREA_MODE               0.417339
BASEMENTAREA_MEDI               0.417339
BASEMENTAREA_AVG                0.417339
NONLIVINGAREA_MODE              0.450533
NONLIVINGAREA_AVG               0.450533
NONLIVINGAREA_ME

In [27]:
train_data = application_data[application_data.IS_TRAIN == 1].merge(train_target, how='left', on='SK_ID_CURR')

In [28]:
corrs = train_data.corr()

In [29]:
del(train_data)

In [30]:
corrs['TARGET'].abs().sort_values().tail(40)

OWN_CAR_AGE                                          0.037613
OCCUPATION_TYPE_nan                                  0.038714
WALLSMATERIAL_MODE_nan                               0.039382
AMT_GOODS_PRICE                                      0.039647
HOUSETYPE_MODE_nan                                   0.039714
HOUSETYPE_MODE_block of flats                        0.040594
EMERGENCYSTATE_MODE_nan                              0.041378
DAYS_REGISTRATION                                    0.042008
EMERGENCYSTATE_MODE_No                               0.042193
OCCUPATION_TYPE_Laborers                             0.042991
FLOORSMAX_MODE                                       0.043157
FLOORSMAX_MEDI                                       0.043697
FLOORSMAX_AVG                                        0.043932
FLAG_DOCUMENT_3                                      0.044401
REG_CITY_NOT_LIVE_CITY                               0.044476
DAYS_EMPLOYED                                        0.044980
FLAG_EMP

EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3 are the best features, which are scores obtained from external sources. I tried to impute them using the rest of the data, but I only end up lowering the correlation with the target feature so I prefer to let them as they are since the the lgbm algorithm can handle missing data.

The 6 AMT_REQ_CREDIT_BUREAU features are missing because these clients are not present in the bureau credit data. So I add a new feature IS_IN_BUREAU.

In [31]:
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv', 
                               na_values=['XNA', 'XAP'], na_filter=True)

In [32]:
application_data[application_data.AMT_REQ_CREDIT_BUREAU_WEEK.isnull() & application_data.SK_ID_CURR.isin(bureau.SK_ID_CURR.unique())]

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan


In [33]:
application_data['IS_IN_BUREAU'] = 0

In [34]:
application_data.loc[application_data.SK_ID_CURR.isin(bureau.SK_ID_CURR.unique()), 'IS_IN_BUREAU'] = 1

As for the CNT_SOCIAL_CIRCLE features, not knowing how this data was gathered, I decided to add a feature HAS_SOCIAL_CIRCLE.

In [35]:
appl_counts = application_data.count().sort_values()/len(application_data)
appl_counts[(appl_counts < 1)]

COMMONAREA_MODE                 0.302917
COMMONAREA_MEDI                 0.302917
COMMONAREA_AVG                  0.302917
NONLIVINGAPARTMENTS_MODE        0.307106
NONLIVINGAPARTMENTS_MEDI        0.307106
NONLIVINGAPARTMENTS_AVG         0.307106
LIVINGAPARTMENTS_MODE           0.318005
LIVINGAPARTMENTS_AVG            0.318005
LIVINGAPARTMENTS_MEDI           0.318005
FLOORSMIN_AVG                   0.323253
FLOORSMIN_MEDI                  0.323253
FLOORSMIN_MODE                  0.323253
YEARS_BUILD_AVG                 0.336743
YEARS_BUILD_MODE                0.336743
YEARS_BUILD_MEDI                0.336743
OWN_CAR_AGE                     0.339945
LANDAREA_MEDI                   0.408206
LANDAREA_AVG                    0.408206
LANDAREA_MODE                   0.408206
BASEMENTAREA_MODE               0.417339
BASEMENTAREA_MEDI               0.417339
BASEMENTAREA_AVG                0.417339
NONLIVINGAREA_MODE              0.450533
NONLIVINGAREA_AVG               0.450533
NONLIVINGAREA_ME

In [36]:
application_data['HAS_SOCIAL_CIRCLE'] = 0

In [37]:
application_data.loc[~application_data.OBS_30_CNT_SOCIAL_CIRCLE.isnull(), 'HAS_SOCIAL_CIRCLE'] = 1

Hand crafted features. The few application data features that actually regard the entity of the loan are only slightly correlated to the target. So I tried to build a few new features

In [38]:
application_data

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan,IS_IN_BUREAU,HAS_SOCIAL_CIRCLE
0,100002,0,1,0,1,0,202500.0,406597.5,24700.5,351000.0,...,0,0,1,0,0,1,0,0,1,1
1,100003,0,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0,1,0,0,1,1
2,100004,1,1,1,1,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,1,0,0,1,1,1
3,100006,0,0,0,1,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,1,0,0,1,0,1
4,100007,0,1,0,1,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,1,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,0,0,0,1,0,121500.0,412560.0,17473.5,270000.0,...,0,0,0,0,1,0,0,1,1,1
48740,456222,0,0,0,0,2,157500.0,622413.0,31909.5,495000.0,...,0,0,0,0,1,0,0,1,0,1
48741,456223,0,0,1,1,1,202500.0,315000.0,33205.5,315000.0,...,0,0,1,0,0,1,0,0,1,1
48742,456224,0,1,0,0,0,225000.0,450000.0,25128.0,450000.0,...,0,1,0,0,0,1,0,0,1,1


In [39]:
application_data['AMT_CREDIT_FRAC'] = application_data.AMT_CREDIT / application_data.AMT_INCOME_TOTAL

In [40]:
application_data['AMT_CREDIT_FRAC'] = application_data.AMT_ANNUITY / application_data.AMT_CREDIT

In [41]:
application_data['AMT_GOODS_FRAC'] = application_data.AMT_GOODS_PRICE / application_data.AMT_CREDIT

In [42]:
application_data['AMT_ANNUITY_FRAC'] = application_data.AMT_ANNUITY / application_data.AMT_INCOME_TOTAL

In [43]:
application_data['AMT_DPD_DEF'] = application_data.DEF_30_CNT_SOCIAL_CIRCLE + application_data.OBS_30_CNT_SOCIAL_CIRCLE

Only AMT_GOODS_FRAC is actually a very good feature. I kept all the others leaving the machine learning algorithm to decide how to use them.

Bureau Balance data

In [44]:
bureau_balance = pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv', 
                               na_values=['XNA', 'XAP'], na_filter=True)

I drop the data of loans that are not related to clients in the application data.

In [45]:
bureau = bureau[bureau.SK_ID_CURR.isin(application_data.SK_ID_CURR.unique())]

In [46]:
bureau_balance = bureau_balance[bureau_balance.SK_ID_BUREAU.isin(bureau.SK_ID_BUREAU.unique())]

One-hot encoding

In [47]:
bureau_balance = pd.get_dummies(bureau_balance)

Aggregation

In [48]:
bureau_balance = bureau_balance.sort_values(['SK_ID_BUREAU', 'MONTHS_BALANCE'])

In [49]:
temp = bureau_balance.groupby('SK_ID_BUREAU').size().to_frame()
temp = temp.rename(columns={0: 'COUNT'})
temp.reset_index(inplace=True)

In [50]:
bureau_balance = bureau_balance.groupby('SK_ID_BUREAU').agg({'last', 'sum', 'mean'})

In [51]:
bureau_balance.columns = bureau_balance.columns.map('_'.join)

In [52]:
bureau_balance.reset_index(inplace=True)

In [53]:
bureau_balance = bureau_balance.merge(temp, how='left', on='SK_ID_BUREAU')

In [54]:
bureau_balance.columns = bureau_balance.columns.map(lambda x : 'BLN_' + x if x != 'SK_ID_BUREAU' else x)

Bureau data

In [55]:
bureau = bureau.merge(bureau_balance, how='left', on='SK_ID_BUREAU')

In [56]:
bureau.drop(columns='SK_ID_BUREAU', inplace=True)

In [57]:
del(bureau_balance)

In [58]:
bureau = bureau.sort_values(['SK_ID_CURR', 'DAYS_CREDIT'])

In [59]:
bureau = pd.get_dummies(bureau, dummy_na=True)

In [60]:
temp = bureau.groupby('SK_ID_CURR').size().to_frame()
temp = temp.rename(columns={0: 'COUNT'})
temp.reset_index(inplace=True)

In [61]:
bureau = bureau.groupby('SK_ID_CURR').agg({'sum', 'mean', 'max'})

In [62]:
bureau.columns = bureau.columns.map('_'.join)

In [63]:
bureau.reset_index(inplace=True)

In [64]:
bureau = bureau.merge(temp, how='left', on='SK_ID_CURR')

In [65]:
bureau.columns = bureau.columns.map(lambda x : 'BRU_' + x if x != 'SK_ID_CURR' else x)

In [66]:
application_data = application_data.merge(bureau, how='left', on='SK_ID_CURR')

In [67]:
del(bureau)

Previous applications

In [68]:
prev_application = pd.read_csv('/kaggle/input/home-credit-default-risk/previous_application.csv', 
                               na_values=['XNA', 'XAP'], na_filter=True)

In [69]:
prev_application = prev_application[prev_application.SK_ID_CURR.isin(application_data.SK_ID_CURR.unique())]

In [70]:
prev_application = pd.get_dummies(prev_application, dummy_na=True)

In [71]:
prev_application.drop(columns='SK_ID_PREV', inplace=True)

In [72]:
prev_application = prev_application.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])

In [73]:
temp = prev_application.groupby('SK_ID_CURR').size().to_frame()
temp = temp.rename(columns={0: 'COUNT'})
temp.reset_index(inplace=True)

In [74]:
prev_application = prev_application.groupby('SK_ID_CURR').agg(['max', 'sum', 'mean']) # last

In [75]:
prev_application.columns = prev_application.columns.map('_'.join)

In [76]:
prev_application.reset_index(inplace=True)

In [77]:
prev_application = prev_application.merge(temp, how='left', on='SK_ID_CURR')

In [78]:
prev_application.columns = prev_application.columns.map(lambda x : 'PREV_' + x if x != 'SK_ID_CURR' else x)

In [79]:
application_data = application_data.merge(prev_application, how='left', on='SK_ID_CURR')

In [80]:
del(prev_application)

POS Cash Balance

In [81]:
pos_cash_balance = pd.read_csv('/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv', 
                               na_values=['XNA', 'XAP'], na_filter=True)

In [82]:
pos_cash_balance = pos_cash_balance.sort_values(['SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE'])

In [83]:
temp = pos_cash_balance.groupby(['SK_ID_CURR', 'SK_ID_PREV']).size().to_frame()
temp = temp.rename(columns={0: 'BLN_COUNT'})
temp.reset_index(inplace=True)

In [84]:
pos_cash_balance = pd.get_dummies(pos_cash_balance, dummy_na=True)

In [85]:
pos_cash_balance = pos_cash_balance.groupby(['SK_ID_PREV', 'SK_ID_CURR']).agg(['sum', 'mean', 'max']) # last

In [86]:
pos_cash_balance.columns = pos_cash_balance.columns.map('_'.join)

In [87]:
pos_cash_balance.reset_index(inplace=True)

In [88]:
pos_cash_balance = pos_cash_balance.merge(temp, how='left', on=['SK_ID_CURR', 'SK_ID_PREV'])

In [89]:
pos_cash_balance.drop(columns='SK_ID_PREV', inplace=True)

In [90]:
temp = pos_cash_balance.groupby('SK_ID_CURR').size().to_frame()
temp = temp.rename(columns={0: 'COUNT'})
temp.reset_index(inplace=True)

In [91]:
pos_cash_balance = pos_cash_balance.groupby(['SK_ID_CURR']).agg(['sum', 'mean', 'max']) 

In [92]:
pos_cash_balance.columns = pos_cash_balance.columns.map('_'.join)

In [93]:
pos_cash_balance.reset_index(inplace=True)

In [94]:
pos_cash_balance = pos_cash_balance.merge(temp, how='left', on='SK_ID_CURR')

In [95]:
pos_cash_balance.columns = pos_cash_balance.columns.map(lambda x : 'CSH_' + x if x != 'SK_ID_CURR' else x)

In [96]:
application_data = application_data.merge(pos_cash_balance, how='left', on='SK_ID_CURR')

In [97]:
del(pos_cash_balance)

Credit Card Balance

In [98]:
credit_card_balance = pd.read_csv('/kaggle/input/home-credit-default-risk/credit_card_balance.csv', 
                               na_values=['XNA', 'XAP'], na_filter=True)

In [99]:
credit_card_balance = pd.get_dummies(credit_card_balance, dummy_na=True)

In [100]:
credit_card_balance = credit_card_balance.sort_values(['SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE'])

In [101]:
temp = credit_card_balance.groupby(['SK_ID_CURR', 'SK_ID_PREV']).size().to_frame()
temp = temp.rename(columns={0: 'COUNT'})
temp.reset_index(inplace=True)

In [102]:
credit_card_balance = credit_card_balance.groupby(['SK_ID_PREV', 'SK_ID_CURR']).agg(['sum', 'mean', 'max']) # last

In [103]:
credit_card_balance.columns = credit_card_balance.columns.map('_'.join)

In [104]:
credit_card_balance.reset_index(inplace=True)

In [105]:
credit_card_balance = credit_card_balance.merge(temp, how='left', on=['SK_ID_CURR', 'SK_ID_PREV'])

In [106]:
credit_card_balance.drop(columns='SK_ID_PREV', inplace=True)

In [107]:
credit_card_balance = credit_card_balance.groupby(['SK_ID_CURR']).agg(['sum', 'mean'])

In [108]:
credit_card_balance.columns = credit_card_balance.columns.map('_'.join)

In [109]:
credit_card_balance.reset_index(inplace=True)

In [110]:
credit_card_balance.columns = credit_card_balance.columns.map(lambda x : 'CRD_' + x if x != 'SK_ID_CURR' else x)

In [111]:
application_data = application_data.merge(credit_card_balance, how='left', on='SK_ID_CURR')

In [112]:
del(credit_card_balance)

Installment payments

In [113]:
installments_payments = pd.read_csv('/kaggle/input/home-credit-default-risk/installments_payments.csv', 
                               na_values=['XNA', 'XAP'], na_filter=True)

In [114]:
installments_payments = installments_payments.sort_values(['SK_ID_CURR', 'SK_ID_PREV', 'DAYS_ENTRY_PAYMENT'])

In [115]:
temp = installments_payments.groupby(['SK_ID_CURR', 'SK_ID_PREV']).size().to_frame()
temp = temp.rename(columns={0: 'COUNT'})
temp.reset_index(inplace=True)

In [116]:
installments_payments.fillna(0, inplace=True)

In [117]:
installments_payments = installments_payments.groupby(['SK_ID_PREV', 'SK_ID_CURR']).agg(['sum', 'mean', 'max', 'min'])

In [118]:
installments_payments.columns = installments_payments.columns.map('_'.join)

In [119]:
installments_payments.reset_index(inplace=True)

In [120]:
installments_payments = installments_payments.merge(temp, how='left', on=['SK_ID_CURR', 'SK_ID_PREV'])

In [121]:
installments_payments.drop(columns='SK_ID_PREV', inplace=True)

In [122]:
installments_payments = installments_payments.groupby(['SK_ID_CURR']).agg(['sum', 'mean', 'max', 'min'])

In [123]:
installments_payments.columns = installments_payments.columns.map('_'.join)

In [124]:
installments_payments.reset_index(inplace=True)

In [125]:
installments_payments.columns = installments_payments.columns.map(lambda x : 'INS_' + x if x != 'SK_ID_CURR' else x)

In [126]:
application_data = application_data.merge(installments_payments, how='left', on='SK_ID_CURR')

In [127]:
del(installments_payments)

In [128]:
for col in application_data.columns:
    if len(application_data[col].unique()) <= 1:
        application_data.drop(columns=col,inplace=True)

In [129]:
application_data.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in application_data.columns]

Feature selection

In order to reduce the number of features before starting the training and evaluation, I use the lgbm algorithm to select the most important features based on the number of times the feature is used in a model. 

In [130]:
model = lgb.LGBMClassifier()

In [131]:
train_data = application_data[application_data.IS_TRAIN == 1.0]

In [132]:
test_data = application_data[application_data.IS_TRAIN == 0.0]

In [133]:
train_data.drop(columns='IS_TRAIN', inplace=True)
test_data.drop(columns='IS_TRAIN', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [134]:
del(application_data)

In [135]:
params = model.get_params()

In [136]:
params['objective'] = 'binary'
params['metric'] = 'auc'

In [137]:
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
final_importance = np.zeros(len(train_data.columns))
for n_fold, (train_index, valid_index) in tqdm(enumerate(skf.split(train_data, train_target.TARGET))):
    X_train = train_data.iloc[train_index]
    y_train = train_target.iloc[train_index].TARGET
    X_valid = train_data.iloc[valid_index]
    y_valid = train_target.iloc[valid_index].TARGET
    lgb_train = lgb.Dataset(data=X_train, label=y_train)
    lgb_eval = lgb.Dataset(data=X_valid, label=y_valid)
    model = lgb.train(params, lgb_train, valid_sets=lgb_eval, early_stopping_rounds=150, verbose_eval=100)
    final_importance += model.feature_importance()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.781982
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.781982
Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.78022
Did not meet early stopping. Best iteration is:
[99]	valid_0's auc: 0.78025
Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.782415
Did not meet early stopping. Best iteration is:
[97]	valid_0's auc: 0.782445



In [138]:
fi = pd.DataFrame()
fi['FEAT'] = train_data.columns

In [139]:
fi['importance'] = final_importance

In [140]:
fi = fi.sort_values(by='importance', ascending=False)

In [141]:
fi = fi[fi.importance != 0]

In [142]:
fi.head(30)

Unnamed: 0,FEAT,importance
32,EXT_SOURCE_1,349.0
33,EXT_SOURCE_2,288.0
34,EXT_SOURCE_3,277.0
247,AMT_CREDIT_FRAC,259.0
11,DAYS_BIRTH,196.0
8,AMT_ANNUITY,161.0
248,AMT_GOODS_FRAC,156.0
971,CSH_CNT_INSTALMENT_FUTURE_mean_mean,123.0
1341,INS_AMT_PAYMENT_min_sum,110.0
252,BRU_DAYS_CREDIT_max,110.0


In [143]:
cols = list(set(fi.FEAT.values).union(set(['SK_ID_CURR'])))

In [144]:
len(cols)

744

In [145]:
train_data = train_data[cols]

In [146]:
test_data = test_data[cols]

Hyperparameter Tuning

The approach that I follow for the hyperparameter tuning is the random search on the interval around the main default parameters of the lgbt classifier.

In [147]:
def get_random_params():
    params = {
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'num_leaves': random.randint(10, 60),
        'max_depth': random.randint(10, 30),
        'learning_rate': random.choice([0.0001, 0.0005, 0.001, 0.005, 0.01]),
        'n_estimators': random.randint(1000, 20000),
        'objective': 'binary',
        'reg_alpha': random.choice([0.001, 0.005, 0.01, 0.05, 0.1]),
        'reg_lambda': random.choice([0.001, 0.005, 0.01, 0.05, 0.1]),       
        'colsample_bytree': random.choice([0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
        'min_child_samples': random.randint(10, 100),
        'subsample_for_bin': random.randint(50000, 300000)
    }
    return params

In [148]:
best_params = {'boosting_type': 'gbdt', 
               'metric': 'auc', 
               'num_leaves': 46, 
               'max_depth': 18, 
               'learning_rate': 0.01, 
               'n_estimators': 6289, 
               'objective': 'binary', 
               'reg_alpha': 0.05, 
               'reg_lambda': 0.05, 
               'colsample_bytree': 0.4, 
               'min_child_samples': 79, 
               'subsample_for_bin': 113092}
best_auc = 0.787228

In [149]:
def get_best_params(hyper_rounds, n_folds, best_params=None, best_auc=0):
    best_params = best_params
    best_auc = best_auc
    lgb_train = lgb.Dataset(data=train_data, label=train_target.TARGET)
    for i in tqdm(range(hyper_rounds)):
        curr_params = get_random_params()
        start = time.time()
        print(curr_params)
        eval_hist = lgb.cv(curr_params, lgb_train, early_stopping_rounds = 200, nfold = n_folds, seed = 42, verbose_eval = 100)
        end = time.time()
        print('TIME:', end-start)
        curr_auc = eval_hist['auc-mean'][-1]
        if curr_auc > best_auc:
            best_params = curr_params
            best_auc = curr_auc
    return best_params, best_auc

Remove the comment to do hyperparameter tuning

In [150]:
HYPER_ROUNDS = 1
FOLDS = 5
#best_params, best_auc = get_best_params(HYPER_ROUNDS, FOLDS, best_params, best_auc)

Training and evaluation

In [151]:
N_FOLDS = 10

In [152]:
skf = StratifiedKFold(n_splits=N_FOLDS, random_state=42, shuffle=True)
sub_preds = np.zeros(len(test_data))
avg_valid_auc = 0
for n_fold, (train_index, valid_index) in tqdm(enumerate(skf.split(train_data, train_target.TARGET))):
    print("FOLD N:", n_fold)
    X_train = train_data.iloc[train_index]
    y_train = train_target.iloc[train_index].TARGET
    X_valid = train_data.iloc[valid_index]
    y_valid = train_target.iloc[valid_index].TARGET
    lgb_train = lgb.Dataset(data=X_train, label=y_train)
    lgb_eval = lgb.Dataset(data=X_valid, label=y_valid)
    model = lgb.train(best_params, lgb_train, valid_sets=lgb_eval, early_stopping_rounds=150, verbose_eval=100)
    y_pred = model.predict(X_valid)
    sub_preds += model.predict(test_data) / skf.n_splits
    avg_valid_auc += roc_auc_score(y_valid, y_pred) / N_FOLDS

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

FOLD N: 0




Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.761829
[200]	valid_0's auc: 0.768209
[300]	valid_0's auc: 0.772523
[400]	valid_0's auc: 0.775917
[500]	valid_0's auc: 0.778546
[600]	valid_0's auc: 0.780771
[700]	valid_0's auc: 0.782347
[800]	valid_0's auc: 0.783437
[900]	valid_0's auc: 0.784407
[1000]	valid_0's auc: 0.784956
[1100]	valid_0's auc: 0.785465
[1200]	valid_0's auc: 0.785735
[1300]	valid_0's auc: 0.785984
[1400]	valid_0's auc: 0.786211
[1500]	valid_0's auc: 0.786327
[1600]	valid_0's auc: 0.786583
[1700]	valid_0's auc: 0.786628
[1800]	valid_0's auc: 0.786699
[1900]	valid_0's auc: 0.786747
[2000]	valid_0's auc: 0.786857
[2100]	valid_0's auc: 0.786967
[2200]	valid_0's auc: 0.786977
[2300]	valid_0's auc: 0.787074
[2400]	valid_0's auc: 0.787162
[2500]	valid_0's auc: 0.787185
Early stopping, best iteration is:
[2429]	valid_0's auc: 0.787238
FOLD N: 1




Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.761394
[200]	valid_0's auc: 0.769481
[300]	valid_0's auc: 0.774118
[400]	valid_0's auc: 0.777994
[500]	valid_0's auc: 0.780839
[600]	valid_0's auc: 0.783064
[700]	valid_0's auc: 0.78479
[800]	valid_0's auc: 0.78615
[900]	valid_0's auc: 0.787076
[1000]	valid_0's auc: 0.787657
[1100]	valid_0's auc: 0.788172
[1200]	valid_0's auc: 0.788371
[1300]	valid_0's auc: 0.78871
[1400]	valid_0's auc: 0.789071
[1500]	valid_0's auc: 0.789269
[1600]	valid_0's auc: 0.789597
[1700]	valid_0's auc: 0.789766
[1800]	valid_0's auc: 0.789879
[1900]	valid_0's auc: 0.790062
[2000]	valid_0's auc: 0.790206
[2100]	valid_0's auc: 0.790305
[2200]	valid_0's auc: 0.790393
[2300]	valid_0's auc: 0.790386
[2400]	valid_0's auc: 0.790468
[2500]	valid_0's auc: 0.790636
[2600]	valid_0's auc: 0.790745
[2700]	valid_0's auc: 0.79074
[2800]	valid_0's auc: 0.79076
Early stopping, best iteration is:
[2737]	valid_0's auc: 0.790856
FOLD N: 2




Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.764645
[200]	valid_0's auc: 0.773925
[300]	valid_0's auc: 0.779497
[400]	valid_0's auc: 0.783596
[500]	valid_0's auc: 0.786593
[600]	valid_0's auc: 0.788995
[700]	valid_0's auc: 0.790431
[800]	valid_0's auc: 0.791673
[900]	valid_0's auc: 0.792473
[1000]	valid_0's auc: 0.793107
[1100]	valid_0's auc: 0.793587
[1200]	valid_0's auc: 0.79394
[1300]	valid_0's auc: 0.794262
[1400]	valid_0's auc: 0.794603
[1500]	valid_0's auc: 0.794764
[1600]	valid_0's auc: 0.795003
[1700]	valid_0's auc: 0.795158
[1800]	valid_0's auc: 0.795168
Early stopping, best iteration is:
[1680]	valid_0's auc: 0.795216
FOLD N: 3




Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.762517
[200]	valid_0's auc: 0.770778
[300]	valid_0's auc: 0.775965
[400]	valid_0's auc: 0.780335
[500]	valid_0's auc: 0.783652
[600]	valid_0's auc: 0.786507
[700]	valid_0's auc: 0.788841
[800]	valid_0's auc: 0.790347
[900]	valid_0's auc: 0.79169
[1000]	valid_0's auc: 0.792455
[1100]	valid_0's auc: 0.792936
[1200]	valid_0's auc: 0.793399
[1300]	valid_0's auc: 0.793671
[1400]	valid_0's auc: 0.793869
[1500]	valid_0's auc: 0.794013
[1600]	valid_0's auc: 0.794171
[1700]	valid_0's auc: 0.794218
[1800]	valid_0's auc: 0.794422
[1900]	valid_0's auc: 0.794577
[2000]	valid_0's auc: 0.794596
[2100]	valid_0's auc: 0.794835
[2200]	valid_0's auc: 0.794799
[2300]	valid_0's auc: 0.794886
[2400]	valid_0's auc: 0.794962
[2500]	valid_0's auc: 0.794995
[2600]	valid_0's auc: 0.795093
[2700]	valid_0's auc: 0.795231
[2800]	valid_0's auc: 0.795307
[2900]	valid_0's auc: 0.795365
[3000]	valid_0's auc: 0.795337
Early stopping, be



Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.754585
[200]	valid_0's auc: 0.762615
[300]	valid_0's auc: 0.767936
[400]	valid_0's auc: 0.772119
[500]	valid_0's auc: 0.775975
[600]	valid_0's auc: 0.778994
[700]	valid_0's auc: 0.781574
[800]	valid_0's auc: 0.783561
[900]	valid_0's auc: 0.785163
[1000]	valid_0's auc: 0.786163
[1100]	valid_0's auc: 0.78696
[1200]	valid_0's auc: 0.787503
[1300]	valid_0's auc: 0.787944
[1400]	valid_0's auc: 0.788322
[1500]	valid_0's auc: 0.788695
[1600]	valid_0's auc: 0.788945
[1700]	valid_0's auc: 0.789198
[1800]	valid_0's auc: 0.789387
[1900]	valid_0's auc: 0.789542
[2000]	valid_0's auc: 0.789713
[2100]	valid_0's auc: 0.789871
[2200]	valid_0's auc: 0.789935
[2300]	valid_0's auc: 0.790045
[2400]	valid_0's auc: 0.790161
[2500]	valid_0's auc: 0.790195
[2600]	valid_0's auc: 0.790365
[2700]	valid_0's auc: 0.79045
[2800]	valid_0's auc: 0.790529
[2900]	valid_0's auc: 0.79055
[3000]	valid_0's auc: 0.790646
[3100]	valid_0's auc



Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.756268
[200]	valid_0's auc: 0.763942
[300]	valid_0's auc: 0.768608
[400]	valid_0's auc: 0.772464
[500]	valid_0's auc: 0.775413
[600]	valid_0's auc: 0.7775
[700]	valid_0's auc: 0.779362
[800]	valid_0's auc: 0.780917
[900]	valid_0's auc: 0.782043
[1000]	valid_0's auc: 0.782801
[1100]	valid_0's auc: 0.783314
[1200]	valid_0's auc: 0.783725
[1300]	valid_0's auc: 0.783966
[1400]	valid_0's auc: 0.784191
[1500]	valid_0's auc: 0.784346
[1600]	valid_0's auc: 0.784581
[1700]	valid_0's auc: 0.784634
[1800]	valid_0's auc: 0.78463
[1900]	valid_0's auc: 0.784774
[2000]	valid_0's auc: 0.784881
[2100]	valid_0's auc: 0.784955
[2200]	valid_0's auc: 0.784943
Early stopping, best iteration is:
[2125]	valid_0's auc: 0.784998
FOLD N: 6




Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.76787
[200]	valid_0's auc: 0.776179
[300]	valid_0's auc: 0.780865
[400]	valid_0's auc: 0.78425
[500]	valid_0's auc: 0.786846
[600]	valid_0's auc: 0.78877
[700]	valid_0's auc: 0.790268
[800]	valid_0's auc: 0.791374
[900]	valid_0's auc: 0.792174
[1000]	valid_0's auc: 0.792935
[1100]	valid_0's auc: 0.793502
[1200]	valid_0's auc: 0.793843
[1300]	valid_0's auc: 0.794118
[1400]	valid_0's auc: 0.794283
[1500]	valid_0's auc: 0.794543
[1600]	valid_0's auc: 0.794737
[1700]	valid_0's auc: 0.794939
[1800]	valid_0's auc: 0.794978
[1900]	valid_0's auc: 0.795147
[2000]	valid_0's auc: 0.79529
[2100]	valid_0's auc: 0.795339
[2200]	valid_0's auc: 0.795418
[2300]	valid_0's auc: 0.79534
Early stopping, best iteration is:
[2238]	valid_0's auc: 0.795448
FOLD N: 7




Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.758567
[200]	valid_0's auc: 0.765634
[300]	valid_0's auc: 0.770099
[400]	valid_0's auc: 0.77393
[500]	valid_0's auc: 0.776851
[600]	valid_0's auc: 0.779306
[700]	valid_0's auc: 0.781295
[800]	valid_0's auc: 0.782666
[900]	valid_0's auc: 0.78395
[1000]	valid_0's auc: 0.784836
[1100]	valid_0's auc: 0.785721
[1200]	valid_0's auc: 0.786013
[1300]	valid_0's auc: 0.786137
[1400]	valid_0's auc: 0.786375
[1500]	valid_0's auc: 0.786437
[1600]	valid_0's auc: 0.786607
[1700]	valid_0's auc: 0.78669
[1800]	valid_0's auc: 0.786834
[1900]	valid_0's auc: 0.786927
[2000]	valid_0's auc: 0.787018
[2100]	valid_0's auc: 0.786962
[2200]	valid_0's auc: 0.787124
[2300]	valid_0's auc: 0.787272
[2400]	valid_0's auc: 0.787387
[2500]	valid_0's auc: 0.787492
[2600]	valid_0's auc: 0.787469
[2700]	valid_0's auc: 0.787444
Early stopping, best iteration is:
[2562]	valid_0's auc: 0.787542
FOLD N: 8




Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.760998
[200]	valid_0's auc: 0.769081
[300]	valid_0's auc: 0.774227
[400]	valid_0's auc: 0.778512
[500]	valid_0's auc: 0.782008
[600]	valid_0's auc: 0.784818
[700]	valid_0's auc: 0.786564
[800]	valid_0's auc: 0.787832
[900]	valid_0's auc: 0.788686
[1000]	valid_0's auc: 0.789327
[1100]	valid_0's auc: 0.789823
[1200]	valid_0's auc: 0.790139
[1300]	valid_0's auc: 0.790314
[1400]	valid_0's auc: 0.790653
[1500]	valid_0's auc: 0.790729
[1600]	valid_0's auc: 0.790931
[1700]	valid_0's auc: 0.791105
[1800]	valid_0's auc: 0.791099
[1900]	valid_0's auc: 0.791055
Early stopping, best iteration is:
[1757]	valid_0's auc: 0.791144
FOLD N: 9




Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.765841
[200]	valid_0's auc: 0.773786
[300]	valid_0's auc: 0.778506
[400]	valid_0's auc: 0.782362
[500]	valid_0's auc: 0.785329
[600]	valid_0's auc: 0.787717
[700]	valid_0's auc: 0.789464
[800]	valid_0's auc: 0.790894
[900]	valid_0's auc: 0.791919
[1000]	valid_0's auc: 0.792624
[1100]	valid_0's auc: 0.793082
[1200]	valid_0's auc: 0.793384
[1300]	valid_0's auc: 0.793516
[1400]	valid_0's auc: 0.793699
[1500]	valid_0's auc: 0.793798
[1600]	valid_0's auc: 0.793963
[1700]	valid_0's auc: 0.794102
[1800]	valid_0's auc: 0.794108
[1900]	valid_0's auc: 0.794094
[2000]	valid_0's auc: 0.794164
[2100]	valid_0's auc: 0.794119
[2200]	valid_0's auc: 0.794284
[2300]	valid_0's auc: 0.794359
[2400]	valid_0's auc: 0.79436
[2500]	valid_0's auc: 0.794276
[2600]	valid_0's auc: 0.79438
[2700]	valid_0's auc: 0.794388
[2800]	valid_0's auc: 0.794407
[2900]	valid_0's auc: 0.794387
Early stopping, best iteration is:
[2758]	valid_0'

In [153]:
avg_valid_auc

0.7913659982227721