In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) prints the names of the files found in that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Objective of the competition is to "predict their clients' repayment abilities." A number of files are provided; this notebook starts with the main application file and explores its variables.
Summary of this notebook:

1. In the V1 file (EDA 23 May 2018), we had done a baseline model with only a few features
2. In this notebook, we will bring in all features
3. Build a model with all features - convert categorical to numeric, include all numeric
4. do cross validation and get output as average of multiple models
5. check whether a feature is not useful - i.e., all of its input values have the same output value; such features can be removed
6. we will still use only one file for this notebook
7. 


# 1. Load the main data (train) file

In [2]:
# read the main application (train) table from the input directory
train_df = pd.read_csv('../input/application_train.csv')

In [3]:
# show (number of rows, number of columns) of the train table
train_df.shape

In [4]:
#there are 307K rows and 122 columns; the number of cols is very high

In [5]:
# preview the first rows of the train table
train_df.head()

In [6]:
# we see that TARGET is the output variable and also that we have a number of demographic variables

In [7]:
#let's explore the output variable

In [8]:
# number of train rows for each distinct TARGET value
train_df.TARGET.value_counts()

In [9]:
# low event rate (as we would expect): share of train rows whose TARGET is 1, as a rounded percentage
positive_count = train_df.TARGET.value_counts()[1]
event_rate_pct = round((positive_count / train_df.shape[0]) * 100)
print("event rate is : {} %".format(event_rate_pct))

In [10]:
#the file homecredit_columns_description has details 

In [11]:
#load the test set

In [12]:
# read the application (test) table from the input directory
test_df =  pd.read_csv('../input/application_test.csv')

In [13]:
#tag every row with its origin (1 = train, 0 = test) so the two frames can be told apart after merging
train_df['is_train'], test_df['is_train'] = 1, 0

In [14]:
#separate the label from the train features so that train and test can be merged for processing
Y_train = train_df.TARGET
train_X = train_df.drop('TARGET', axis='columns')

In [15]:
# test ID: kept aside for building the submission file later
test_id = test_df['SK_ID_CURR']
test_X = test_df

# merge train and test datasets for preprocessing
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; both inputs come straight from
# read_csv with their own 0-based index, so plain stacking would leave duplicate row labels
data = pd.concat([train_X, test_X], axis=0, ignore_index=True)

In [16]:
#write functions to get the categorical features in the overall dataset

In [17]:
# helper: find the categorical (object-dtype) columns of a frame
def get_categorical_features(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``.

    The names are returned as a list, in the same order as ``df.columns``.
    """
    cat_feats = []
    for col in df.columns:
        if df[col].dtype == 'object':
            cat_feats.append(col)
    return cat_feats

In [18]:
#function to encode categorical values; we use pd.get_dummies.
#The reference notebook https://www.kaggle.com/shivamb/homecreditrisk-extensive-eda-baseline-0-772 uses both factorize and get_dummies,
#but I feel that get_dummies alone should do

In [19]:
def get_dummies(df, cat_feats):
    """Append one-hot (dummy) columns for each column named in ``cat_feats``.

    Parameters:
        df: input DataFrame; it is not modified.
        cat_feats: iterable of column names to encode.

    Returns:
        A DataFrame holding every original column of ``df`` (the encoded columns
        included) followed by the dummy columns of each feature, named
        ``<feature>_<value>``. When ``cat_feats`` is empty, ``df`` itself is returned.
    """
    # Encode every feature first and concatenate once: concatenating inside the
    # loop would copy the whole, ever-growing frame once per feature.
    dummy_frames = [pd.get_dummies(df[cat_col], prefix=cat_col) for cat_col in cat_feats]
    if not dummy_frames:
        return df
    return pd.concat([df] + dummy_frames, axis=1)

In [20]:
# get categorical features: names of the object-dtype columns in the merged train+test frame
data_cat_feats = get_categorical_features(data)

In [21]:
# create additional dummy features - indicator columns for every categorical feature are appended to data;
# the original categorical columns stay in the frame
# NOTE(review): this rebinds data, so re-running the cell appends the dummy columns a second time
data = get_dummies(data,data_cat_feats)

In [22]:
# preview the merged frame with the dummy columns appended
data.head()

In [27]:
#get numeric cols: every column of data whose dtype is not 'object'
numeric_cols = list(filter(lambda col_name: data[col_name].dtype != 'object', data.columns))

In [28]:
# how many non-object columns were found
len(numeric_cols)

In [29]:
# check whether the train/test flag column ended up in the list of candidate features
'is_train' in numeric_cols

In [31]:
# the train/test flag is bookkeeping, not a feature: take it out of the list
numeric_cols = list(filter(lambda col: col != 'is_train', numeric_cols))

In [32]:
# column count after removing the is_train flag
len(numeric_cols)

In [33]:
# remove the ID from list: SK_ID_CURR is a row identifier, not a model feature
numeric_cols = list(filter(lambda col: col != 'SK_ID_CURR', numeric_cols))

In [36]:
#split the data back in to train and test via the is_train flag, keeping only the feature columns
train_X = data.loc[data['is_train'] == 1, numeric_cols]
test_X = data.loc[data['is_train'] == 0, numeric_cols]

In [37]:
from sklearn.model_selection import train_test_split 

In [38]:
# seed passed to train_test_split below so the train/validation split is repeatable
random_seed = 144

In [40]:
#create validation sets to be used while training the model:
#hold out 20% of the train rows (test_size=0.2); random_state fixes the shuffle so the split is repeatable
X_train, X_val, y_train, y_val = train_test_split(train_X, Y_train, test_size=0.2, random_state=random_seed)

In [41]:
#build a simple light gbm model

In [42]:
import lightgbm as lgb

In [43]:
#prepare the train and eval data to fit to model: wrap each split (features + labels) in a LightGBM Dataset
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_val, label=y_val)

In [44]:
#define the params for the model: gradient-boosted trees ('gbdt') with a binary objective, scored by AUC
# NOTE(review): 'subsample' presumably has no effect unless a bagging frequency is also set -
# confirm against the LightGBM parameter docs
params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 
          'learning_rate': 0.01, 'num_leaves': 48, 'num_iteration': 5000, 'verbose': 0 ,
          'colsample_bytree':.8, 'subsample':.9, 'max_depth':7, 'reg_alpha':.1, 'reg_lambda':.1, 
          'min_split_gain':.01, 'min_child_weight':1}
#used same params as here: https://www.kaggle.com/shivamb/homecreditrisk-extensive-eda-baseline-0-772

In [45]:
# train on lgb_train while evaluating on lgb_eval
# presumably: stop once the validation metric has not improved for 150 rounds and log every 200 rounds -
# confirm that the installed LightGBM version accepts these keyword arguments
model = lgb.train(params, lgb_train, valid_sets=lgb_eval, early_stopping_rounds=150, verbose_eval=200)

In [46]:
#preds: score the test rows with the iteration that early stopping recorded as best, stated explicitly
# so the result does not depend on what predict() defaults to in the installed LightGBM version
preds = model.predict(test_X, num_iteration=model.best_iteration)

# submission frame: one row per test application with its ID and predicted probability
sub_lgb = pd.DataFrame({'SK_ID_CURR': test_id, 'TARGET': preds})
sub_lgb.to_csv("lgb_baseline.csv", index=False)
sub_lgb.head()

In [37]:
#there are a number of numeric and categorical features; for the baseline model in this notebook, let's take only a few
#also, for the categorical features, we will try to do mean encoding instead of the regular one hot encoding / label encoding

#numeric features we are interested in:
AMT_INCOME_TOTAL
AMT_CREDIT
AMT_ANNUITY
AMT_GOODS_PRICE
DAYS_EMPLOYED


In [38]:
# numeric columns selected for the baseline model
num_features_list = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'DAYS_EMPLOYED']

# categorical features:
NAME_CONTRACT_TYPE
CODE_GENDER
FLAG_OWN_CAR
FLAG_OWN_REALTY
NAME_INCOME_TYPE
NAME_EDUCATION_TYPE
NAME_FAMILY_STATUS
OCCUPATION_TYPE



In [39]:
# categorical columns selected for the baseline model; each one gets an encoded '<name>_mean_enc' column
cat_features_list = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 
                     'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'OCCUPATION_TYPE']

In [40]:
#"mean encoding" for cat features (as implemented here, each category value is replaced by its relative frequency in train)

In [41]:
num_rows = train_df.shape[0] #get number of records in train

# NOTE: despite the "_mean_enc" suffix this is a frequency encoding - each category value is replaced by
# the fraction of train rows that carry it; TARGET is not used anywhere in the computation.
# encoder_series is rebound on every pass, so after the loop it holds only the last feature's encoder.
for cat_feature in cat_features_list: #iterate over all the cat features
        encoder_series = train_df[cat_feature].value_counts() / num_rows #share of train rows for each value of the cat feature (value count / number of rows)
        train_df[cat_feature+'_mean_enc'] = train_df[cat_feature].map(encoder_series) #map each row's category value to that share and store it in a new col

In [43]:
# we have created a set of cols that are mean encoded from categorical cols
#lets move on to create a baseline model with the selected numeric features and the mean encoded cols

In [44]:
#create a list with numeric cols

In [45]:
# list the columns of train_df, including the newly added *_mean_enc columns
train_df.columns

In [47]:
# model inputs: the selected numeric columns plus the encoded column of every categorical feature
# the encoded names are derived from cat_features_list (same '_mean_enc' suffix the encoding loop uses)
# instead of being typed out again, so the two lists cannot drift apart
features = num_features_list + [cat_feature + '_mean_enc' for cat_feature in cat_features_list]

In [48]:
# feature matrix for the baseline: every train row, selected feature columns only
X_train = train_df[features]

In [50]:
# labels for the same train_df rows as X_train
y_train = train_df.TARGET

In [51]:
from xgboost import XGBClassifier

In [52]:
# random seed for the XGBoost baseline
# NOTE(review): not referenced by any cell visible here - confirm whether it was meant to be passed to the model
seed = 111

In [69]:
#without scale_pos_weight, we had no 1 preds; with scale_pos_weight as 12, we had 127K 1s with an accuracy of 60%

In [70]:
# XGBoost classifier with library defaults except scale_pos_weight=6
# (presumably to up-weight the rare positive class - confirm against the XGBoost parameter docs)
model_xgb = XGBClassifier(scale_pos_weight=6)

In [71]:
# fit on all train rows (X_train / y_train were rebuilt from the full train_df for this baseline)
model_xgb.fit(X=X_train, y=y_train)

In [72]:
# sum of the predicted labels on the train rows (i.e. how many rows are predicted as 1, given 0/1 labels)
np.sum(model_xgb.predict(X_train))

In [55]:
from sklearn.metrics import accuracy_score

In [73]:
# accuracy on the training rows themselves (not a hold-out estimate)
# the features are passed positionally, as in the other predict calls, so the call does not depend on
# the name of predict()'s first parameter, which differs between xgboost releases
accuracy_score(y_true=y_train, y_pred=model_xgb.predict(X_train))

In [74]:
from sklearn.metrics import confusion_matrix

In [75]:
# confusion matrix on the training rows (true labels vs. predicted labels)
# the features are passed positionally, as in the other predict calls, so the call does not depend on
# the name of predict()'s first parameter, which differs between xgboost releases
confusion_matrix(y_true=y_train, y_pred=model_xgb.predict(X_train))

In [57]:
#now to make predictions on the test set
#first, we need to do the mean encoding for the test data as well

In [58]:
# reload the test table from disk; this replaces the earlier test_df, including its added is_train column
test_df = pd.read_csv('../input/application_test.csv')

In [59]:
for cat_feature in cat_features_list: #iterate over all the cat features
        # rebuild this feature's encoder from the train data (share of train rows per category value, the
        # same formula used when encoding train); reusing the encoder_series left over from the train loop
        # would apply the last feature's encoder to every column and turn the others into NaN
        encoder_series = train_df[cat_feature].value_counts() / train_df.shape[0]
        test_df[cat_feature+'_mean_enc'] = test_df[cat_feature].map(encoder_series) #map each test value to its train share; values not seen in train become NaN

In [60]:
# test feature matrix with the same columns, in the same order, as X_train
X_test = test_df[features]

In [76]:
# predicted labels for the test rows
y_pred_test = model_xgb.predict(X_test)

In [77]:
# sum of the predicted test labels (i.e. how many test rows are predicted as 1, given 0/1 labels)
np.sum(y_pred_test)

In [78]:
#no 1s are predicted; FAIL

In [79]:
#do submission

In [80]:
# take the second column of predict_proba as the submitted score
# (presumably the probability of class 1 - confirm against model_xgb.classes_)
y_pred_test_prob = model_xgb.predict_proba(X_test)[:, 1]


# one row per test application: its ID and its predicted score, written without the index column
Submission = pd.DataFrame({ 'SK_ID_CURR': test_df.SK_ID_CURR,'TARGET': y_pred_test_prob })
Submission.to_csv("sample_submission_baseline_23May18.csv", index=False)