In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

Summary of this notebook:

In the V2 notebook we used all features from the train file to build the model. We then tried averaging the k-fold cross-validation predictions on the test set, and also ran a small grid search over the LightGBM parameters. The best score we got was ~0.7. In this notebook we will:

- Train XGBoost without the `scale_pos_weight` parameter.
- Bring the other data files into the model.



# 1. Load the main data (train) file

In [2]:
train_df = pd.read_csv('../input/application_train.csv')

In [3]:
train_df.shape

In [4]:
#there are 307K rows and 122 columns; the number of cols is very high

In [5]:
train_df.head()

In [6]:
# we see that TARGET is the output variable and also that we have a number of demographic variables

In [7]:
#let's explore the output variable

In [8]:
train_df.TARGET.value_counts()

In [9]:
# low event rate (as we would expect)
# Share of rows whose TARGET equals 1, shown as a whole-number percentage.
n_events = train_df.TARGET.value_counts()[1]
event_rate_pct = round((n_events / train_df.shape[0]) * 100)
print("event rate is : {} %".format(event_rate_pct))

In [10]:
#the file homecredit_columns_description has details 

In [11]:
#load the test set

In [12]:
test_df =  pd.read_csv('../input/application_test.csv')

In [13]:
#create flag in train and test df to identify them
train_df['is_train'] = 1
test_df['is_train'] = 0

In [14]:
# Pull the label out of the train frame so that the remaining feature columns
# can be stacked with the test frame for joint preprocessing.
Y_train = train_df['TARGET']
train_X = train_df.drop(columns=['TARGET'])

In [15]:
# test ID
test_id = test_df['SK_ID_CURR']
test_X = test_df

# merge train and test datasets for preprocessing
# ignore_index=True gives the stacked frame a fresh, unique 0..N-1 index.
# Without it the row labels of the train and test frames are both kept, so
# labels that occur in both frames are duplicated, which makes any later
# index-aligned operation on `data` ambiguous.
data = pd.concat([train_X, test_X], axis=0, ignore_index=True)

In [16]:
#write functions to get the categorical features in the overall dataset

In [17]:
# function to obtain Categorical Features
def get_categorical_features(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column dtypes are inspected.

    Returns
    -------
    list
        Column names, in the frame's column order, with dtype ``object``.
    """
    object_columns = []
    for name, col_dtype in df.dtypes.items():
        if col_dtype == 'object':
            object_columns.append(name)
    return object_columns

In [18]:
def get_dummies(df, cat_feats):
    """Append one-hot indicator columns for each column named in ``cat_feats``.

    The original categorical columns are kept; the indicator columns are added
    to the right of the existing columns, grouped per categorical column in the
    order given by ``cat_feats`` and prefixed with the source column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cat_feats : list
        Names of the columns of ``df`` to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``cat_feats`` is empty, otherwise a new frame holding
        the original columns followed by the indicator columns.
    """
    # Build every indicator frame first and concatenate once: concatenating
    # inside the loop would copy the whole (growing) frame for every column.
    dummy_frames = [pd.get_dummies(df[cat_col], prefix=cat_col) for cat_col in cat_feats]
    if not dummy_frames:
        return df
    return pd.concat([df] + dummy_frames, axis=1)

In [19]:
# get categorical features
data_cat_feats = get_categorical_features(data)

In [20]:
# create additional dummy features - 
data = get_dummies(data,data_cat_feats)

In [21]:
data.head()

In [22]:
#get numeric cols
numeric_cols = [col_name for col_name in list(data.columns) if data[col_name].dtype != 'object']

In [23]:
len(numeric_cols)

In [24]:
numeric_cols = [col for col in numeric_cols if col !='is_train']

In [25]:
# remove the ID from list
numeric_cols = [col for col in numeric_cols if col !='SK_ID_CURR']

In [26]:
#split the data back in to train and test
# Row masks come from the is_train flag; only the numeric feature columns are kept.
from_train = data['is_train'] == 1
from_test = data['is_train'] == 0
train_X = data.loc[from_train, numeric_cols]
test_X = data.loc[from_test, numeric_cols]

In [27]:
from sklearn.model_selection import train_test_split 

In [28]:
random_seed = 144

In [29]:
#lets get a baseline score with only one file's data

In [31]:
import xgboost as xgb

In [33]:
dtest = xgb.DMatrix(data=test_X)

In [35]:
#create a validation set in the train data
X_train, X_val, y_train, y_val = train_test_split(train_X, Y_train, test_size=0.2, random_state=random_seed)

In [37]:
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)

In [34]:
# Booster parameters passed to xgb.train.
# The number of boosting rounds is NOT a booster parameter in the Python API:
# it is given to xgb.train as its own argument. The former 'nrounds' entry was
# therefore not used and contradicted the round count passed to xgb.train, so
# it has been removed.
params = {
            'objective': "binary:logistic",
            'booster' : "gbtree",
            'eval_metric' : "auc",
            'nthread' : -1,
            'eta' : 0.05,
            'max_depth' : 6,
            'min_child_weight' : 5,
            'gamma' : 0,
            'subsample' : 0.9,
            'colsample_bytree' : 0.7,
            'alpha' : 0,
            'lambda' : 0
} 

In [44]:
evallist = [(dval, 'eval'), (dtrain, 'train')]

In [46]:
#num_round = 100
#bst = xgb.train(params, dtrain, num_round, evallist)

In [47]:
from sklearn.metrics import roc_auc_score

In [48]:
#print(roc_auc_score(y_true=y_train, y_score=bst.predict(dtrain)))

In [49]:
#print(roc_auc_score(y_true=y_val, y_score=bst.predict(dval)))

In [50]:
#these are the same scores we see in the result of the train function

In [51]:
#test_preds = bst.predict(dtest)

In [52]:
#np.sum(test_preds)

In [53]:
'''
#submission - first the output of core xgb
sub_lgb = pd.DataFrame()
sub_lgb['SK_ID_CURR'] = test_id
sub_lgb['TARGET'] = test_preds
sub_lgb.to_csv("xgb_noGridSearch_noCrossVal_withWatchList.csv", index=False)
sub_lgb.head()
'''

In [54]:
#this scored 0.736; not an improvement over best of 0.743

In [55]:
#next is to use other data files
#we start with the prev appl data

In [56]:
prev = pd.read_csv('../input/previous_application.csv')

In [57]:
prev.head()

In [61]:
prev.NAME_CONTRACT_STATUS.value_counts()

In [62]:
#we will get  a few summary stats such as #total prev appl, #approved, #canceled, etc

In [63]:
#create a df at the SK_ID_CURR level and append the above metrics

In [85]:
prev_grouped = pd.DataFrame()

In [86]:
prev.shape

In [87]:
prev_grouped['SK_ID_CURR'] = prev['SK_ID_CURR'].unique()

In [88]:
prev_grouped.shape

In [89]:
prev_grouped.index

In [91]:
prev_grouped.set_index(keys=['SK_ID_CURR'], inplace=True)

In [92]:
prev_grouped.shape

In [93]:
prev_grouped.index

In [94]:
prev_grouped['total_prev_appl'] = prev.groupby('SK_ID_CURR')['SK_ID_PREV'].count()

In [95]:
prev_grouped.shape

In [96]:
#approved
# Number of approved previous applications per SK_ID_CURR.
approved_counts = (
    prev[prev['NAME_CONTRACT_STATUS'] == 'Approved']
    .groupby('SK_ID_CURR')['SK_ID_PREV']
    .count()
)
# Clients that appear in `prev` but have no approved application are absent
# from the grouped result; a plain assignment would give them NaN. Their count
# is really 0, so reindex against every client id and fill with 0.
prev_grouped['approved_prev_appl'] = approved_counts.reindex(prev_grouped.index, fill_value=0)

In [97]:
prev_grouped.head()

In [98]:
prev_grouped.describe()

In [99]:
# Number of previous applications per SK_ID_CURR for each remaining contract status.
# (status value in NAME_CONTRACT_STATUS, name of the new count column)
status_count_columns = [
    ('Canceled', 'canceled_prev_appl'),
    ('Refused', 'refused_prev_appl'),
    ('Unused offer', 'unusedOffer_prev_appl'),
]
for status, count_col in status_count_columns:
    status_counts = (
        prev[prev['NAME_CONTRACT_STATUS'] == status]
        .groupby('SK_ID_CURR')['SK_ID_PREV']
        .count()
    )
    # Clients with no application in this status are missing from the grouped
    # result; their count is 0, not NaN.
    prev_grouped[count_col] = status_counts.reindex(prev_grouped.index, fill_value=0)

In [101]:
#get the credit amounts of the approved / canceled / refused / unused-offer applications
#we will use the AMT_CREDIT column; I couldn't find details about each column

In [102]:
# Sum of AMT_CREDIT per SK_ID_CURR for each contract status.
# (status value in NAME_CONTRACT_STATUS, name of the new amount column)
status_amount_columns = [
    ('Approved', 'approved_prev_appl_amnt'),
    ('Canceled', 'canceled_prev_appl_amnt'),
    ('Refused', 'refused_prev_appl_amnt'),
    ('Unused offer', 'unusedOffer_prev_appl_amnt'),
]
for status, amount_col in status_amount_columns:
    status_amounts = (
        prev[prev['NAME_CONTRACT_STATUS'] == status]
        .groupby('SK_ID_CURR')['AMT_CREDIT']
        .sum()
    )
    # Clients with no application in this status are missing from the grouped
    # result; the summed amount for them is 0, not NaN.
    prev_grouped[amount_col] = status_amounts.reindex(prev_grouped.index, fill_value=0)

In [104]:
prev_grouped.head()

In [105]:
#lets join this to the master data table (data)

In [106]:
data.shape

In [109]:
data.columns

In [110]:
'is_train' in data.columns

In [111]:
data['is_train'].value_counts()

In [112]:
#lets create a backup of the master data table and add the new cols to the old df
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
# The pre-merge frame stays reachable through this name because the merge that
# follows rebinds `data` to the new frame returned by pd.merge instead of
# modifying the existing frame in place.
data_orig_file = data

In [113]:
# Left-join the per-client previous-application features onto the master table.
# The left side is the saved pre-merge frame (data_orig_file) rather than `data`
# itself, so re-running this cell produces the same result instead of merging
# the new columns a second time (which would create _x/_y suffixed duplicates).
# Clients without any row in prev_grouped get NaN in the new columns.
data = pd.merge(left=data_orig_file,
                 right=prev_grouped,
                 left_on='SK_ID_CURR',
                 right_on='SK_ID_CURR',
                 how='left')

In [114]:
data.shape

In [115]:
data_orig_file.shape

In [118]:
#lets build a model with the original and the prev added features
# Keep every non-object column except the train/test flag and the client id.
non_feature_cols = ('is_train', 'SK_ID_CURR')
numeric_cols = [
    col_name
    for col_name in data.columns
    if data[col_name].dtype != 'object' and col_name not in non_feature_cols
]

In [119]:
#split the data back in to train and test
from_train = data['is_train'] == 1
from_test = data['is_train'] == 0
train_X = data.loc[from_train, numeric_cols]
test_X = data.loc[from_test, numeric_cols]

# hold out 20% of the training rows as a validation set
X_train, X_val, y_train, y_val = train_test_split(
    train_X,
    Y_train,
    test_size=0.2,
    random_state=random_seed,
)

# wrap each split in an xgboost DMatrix (the test matrix has no label)
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)
dtest = xgb.DMatrix(data=test_X)

In [120]:
# When early stopping is enabled, xgb.train monitors the LAST entry of the
# evaluation list. The validation set must therefore come last; with the
# training set last, early stopping and best_score would track the training
# AUC instead of the held-out AUC.
evallist = [(dtrain, 'train'), (dval, 'eval')]

In [122]:
# Upper bound on boosting rounds; training may stop earlier via early stopping.
num_round = 1000
# Early stopping watches the last entry of `evallist`; per-round logging is silenced.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=evallist,
    early_stopping_rounds=100,
    verbose_eval=False,
)

In [123]:
bst.best_score

In [124]:
# AUC of the trained booster on the held-out validation split.
# NOTE(review): depending on the xgboost version, predict() may use all boosted
# rounds rather than only the best early-stopping iteration - confirm before
# comparing this number with bst.best_score.
val_preds = bst.predict(dval)
val_auc = roc_auc_score(y_true=y_val, y_score=val_preds)
print(val_auc)

In [125]:
#submission - first the output of core xgb
# One row per test client: its id and the predicted TARGET probability.
sub_lgb = pd.DataFrame({
    'SK_ID_CURR': test_id,
    'TARGET': bst.predict(dtest),
})
sub_lgb.to_csv("xgb_noGS_noCV_withWL_prevApplDataIncluded.csv", index=False)
sub_lgb.head()