# Home Credit Default Risk - Competition

### This is a work in progress. Comments and critical feedback are always welcome.

Note:


# 1. Load Data and Modules

In [49]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# visualization
import seaborn as sns
# Default seaborn colour palette, kept in a module-level name.
color = sns.color_palette()

import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): `py` (here) and `offline` (below) are both aliases of plotly.offline, and
# `init_notebook_mode` is imported from that same module, so the three init calls in this
# cell all invoke the same function. The first two pass connected=True; the last one
# passes no arguments. Confirm which mode is intended before removing any of them.
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()

# cufflinks is what the later `.iplot(...)` calls on pandas objects rely on;
# go_offline() is called once here so those calls presumably plot offline — TODO confirm.
import cufflinks as cf
cf.go_offline()

# 2. Load input data / Initial Exploration

In [51]:
# Show which files are available in the competition input directory.
input_files = os.listdir("../input")
print(input_files)

### **application_{train|test}.csv**

* This is the main table, broken into two files for Train (with TARGET) and Test (without TARGET).
* Static data for all applications. One row represents one loan in our data sample.

In [53]:
# Load the main application tables (train first, then test) in one pass.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/application_train.csv', '../input/application_test.csv')
)

In [55]:
# Display the training column names as a plain array.
np.asarray(df_train.columns)

In [57]:
# Print (rows, columns) and preview the first five rows of the training table.
train_shape = df_train.shape
print(train_shape)
df_train.head(n=5)

**Checking missing data in application_train**

In [59]:
# Missing-value summary for application_train.
# The null counts are computed and sorted once; the percentage is derived from the
# already-sorted counts, so both columns are row-aligned by construction instead of
# relying on two independent sorts agreeing with each other.
total = df_train.isnull().sum().sort_values(ascending=False)
# Every column has len(df_train) rows, so count / rows * 100 is the missing share in %.
percent = total / len(df_train) * 100
missing_application_train_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_application_train_data.head(20)

### **bureau.csv**
* All client's previous credits provided by other financial institutions that were reported to Credit Bureau (for clients who have a loan in our sample).
* For every loan in our sample, there are as many rows as number of credits the client had in Credit Bureau before the application date.

In [60]:
# Load the bureau table from the input directory.
df_bureau = pd.read_csv(filepath_or_buffer='../input/bureau.csv')

In [61]:
# Print the dimensions and the column index, then preview the first five rows.
for summary in (df_bureau.shape, df_bureau.columns):
    print(summary)
df_bureau.head(n=5)

**Checking missing data in bureau**

In [62]:
# Missing-value summary for bureau.
# The null counts are computed and sorted once; the percentage is derived from the
# already-sorted counts, so both columns are row-aligned by construction instead of
# relying on two independent sorts agreeing with each other.
total = df_bureau.isnull().sum().sort_values(ascending=False)
# Every column has len(df_bureau) rows, so count / rows * 100 is the missing share in %.
percent = total / len(df_bureau) * 100
missing_bureau_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_bureau_data.head(8)

### **bureau_balance.csv**

* Monthly balances of previous credits in Credit Bureau.
* This table has one row for each month of history of every previous credit reported to Credit Bureau – i.e. the table has (#loans in sample * # of relative previous credits * # of months where we have some history observable for the previous credits) rows.

In [63]:
# Load the bureau_balance table from the input directory.
df_bureau_balance = pd.read_csv(filepath_or_buffer='../input/bureau_balance.csv')

In [64]:
# Print the dimensions and the column index, then preview the first five rows.
for summary in (df_bureau_balance.shape, df_bureau_balance.columns):
    print(summary)
df_bureau_balance.head(n=5)

**Checking missing data in bureau_balance**

In [65]:
# Missing-value summary for bureau_balance.
# The null counts are computed and sorted once; the percentage is derived from the
# already-sorted counts, so both columns are row-aligned by construction instead of
# relying on two independent sorts agreeing with each other.
total = df_bureau_balance.isnull().sum().sort_values(ascending=False)
# Every column has len(df_bureau_balance) rows, so count / rows * 100 is the missing share in %.
percent = total / len(df_bureau_balance) * 100
missing_bureau_balance_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_bureau_balance_data.head(3)

### **POS_CASH_balance.csv**

* Monthly balance snapshots of previous POS (point of sales) and cash loans that the applicant had with Home Credit.
* This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credits * # of months in which we have some history observable for the previous credits) rows.

In [66]:
# Load the POS_CASH_balance table from the input directory.
df_cash_balance = pd.read_csv(filepath_or_buffer='../input/POS_CASH_balance.csv')

In [67]:
# Print the dimensions and the column index, then preview the first five rows.
for summary in (df_cash_balance.shape, df_cash_balance.columns):
    print(summary)
df_cash_balance.head(n=5)

**Checking missing data in POS_CASH_balance**

In [68]:
# Missing-value summary for POS_CASH_balance.
# The null counts are computed and sorted once; the percentage is derived from the
# already-sorted counts, so both columns are row-aligned by construction instead of
# relying on two independent sorts agreeing with each other.
total = df_cash_balance.isnull().sum().sort_values(ascending=False)
# Every column has len(df_cash_balance) rows, so count / rows * 100 is the missing share in %.
percent = total / len(df_cash_balance) * 100
missing_POS_CASH_balance_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_POS_CASH_balance_data.head(3)

### **credit_card_balance.csv**

* Monthly balance snapshots of previous credit cards that the applicant has with Home Credit.
* This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credit cards * # of months where we have some history observable for the previous credit card) rows.

In [69]:
# Load the credit_card_balance table from the input directory.
df_card_balance = pd.read_csv(filepath_or_buffer='../input/credit_card_balance.csv')

In [70]:
# Print the dimensions and the column index, then preview the first five rows.
for summary in (df_card_balance.shape, df_card_balance.columns):
    print(summary)
df_card_balance.head(n=5)

**Checking missing data in credit_card_balance**

In [71]:
# Missing-value summary for credit_card_balance.
# The null counts are computed and sorted once; the percentage is derived from the
# already-sorted counts, so both columns are row-aligned by construction instead of
# relying on two independent sorts agreeing with each other.
total = df_card_balance.isnull().sum().sort_values(ascending=False)
# Every column has len(df_card_balance) rows, so count / rows * 100 is the missing share in %.
percent = total / len(df_card_balance) * 100
missing_credit_card_balance_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_credit_card_balance_data.head(10)

### **previous_application.csv**

* All previous applications for Home Credit loans of clients who have loans in our sample.
* There is one row for each previous application related to loans in our data sample.

In [72]:
# Load the previous_application table from the input directory.
df_previous = pd.read_csv(filepath_or_buffer='../input/previous_application.csv')

In [73]:
# Print the dimensions and the column index, then preview the first five rows.
for summary in (df_previous.shape, df_previous.columns):
    print(summary)
df_previous.head(n=5)

**Checking missing data in previous_application**

In [74]:
# Missing-value summary for previous_application.
# The null counts are computed and sorted once; the percentage is derived from the
# already-sorted counts, so both columns are row-aligned by construction instead of
# relying on two independent sorts agreeing with each other.
total = df_previous.isnull().sum().sort_values(ascending=False)
# Every column has len(df_previous) rows, so count / rows * 100 is the missing share in %.
percent = total / len(df_previous) * 100
missing_previous_application_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_previous_application_data.head(15)

### **installments_payments.csv**

* Repayment history for the previously disbursed credits in Home Credit related to the loans in our sample.
* There is a) one row for every payment that was made plus b) one row each for missed payment.
* One row is equivalent to one payment of one installment OR one installment corresponding to one payment of one previous Home Credit credit related to loans in our sample.

In [75]:
# Load the installments_payments table from the input directory.
df_installments_payments = pd.read_csv(filepath_or_buffer='../input/installments_payments.csv')

In [76]:
# Print the dimensions and the column index, then preview the first five rows.
for summary in (df_installments_payments.shape, df_installments_payments.columns):
    print(summary)
df_installments_payments.head(n=5)

**Checking missing data in installments_payments**

In [77]:
# Missing-value summary for installments_payments.
# The null counts are computed and sorted once; the percentage is derived from the
# already-sorted counts, so both columns are row-aligned by construction instead of
# relying on two independent sorts agreeing with each other.
total = df_installments_payments.isnull().sum().sort_values(ascending=False)
# Every column has len(df_installments_payments) rows, so count / rows * 100 is the missing share in %.
percent = total / len(df_installments_payments) * 100
missing_installments_payments_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_installments_payments_data.head(3)

# 3. Data Exploration

The plots in this section are adapted from https://www.kaggle.com/codename007/home-credit-complete-eda-feature-importance/notebook

### 3.1 Distribution of AMT_CREDIT

In [78]:
# Histogram/density of the credit amount on a 12x5 figure.
ax = plt.figure(figsize=(12, 5)).add_subplot(111)
ax.set_title("Distribution of AMT_CREDIT")
ax = sns.distplot(df_train["AMT_CREDIT"], ax=ax)

### 3.2 Distribution of AMT_INCOME_TOTAL

In [79]:
# Histogram/density of the applicants' total income.
# The section heading and the figure title both name AMT_INCOME_TOTAL, so plot that
# column (the previous code plotted AMT_ANNUITY under this title).
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_INCOME_TOTAL")
# dropna() keeps the plot working if the column contains missing values.
ax = sns.distplot(df_train["AMT_INCOME_TOTAL"].dropna())

### 3.3 Distribution of AMT_GOODS_PRICE

In [80]:
# Histogram/density of the goods price on a 12x5 figure; missing prices are dropped first.
ax = plt.figure(figsize=(12, 5)).add_subplot(111)
ax.set_title("Distribution of AMT_GOODS_PRICE")
goods_price = df_train["AMT_GOODS_PRICE"].dropna()
ax = sns.distplot(goods_price, ax=ax)

### 3.4 Distribution of Name of type of the Suite

In [81]:
# Bar chart of each NAME_TYPE_SUITE category as a percentage of all counted rows.
temp = df_train["NAME_TYPE_SUITE"].value_counts()
trace = go.Bar(x=temp.index, y=(temp / temp.sum())*100)
data = [trace]

# Axis styling: both axes use the same grey; only the y-axis carries a styled title font.
axis_grey = 'rgb(107, 107, 107)'
xaxis_style = dict(
    title='Name of type of the Suite',
    tickfont=dict(size=14, color=axis_grey),
)
yaxis_style = dict(
    title='Count of Name of type of the Suite in %',
    titlefont=dict(size=16, color=axis_grey),
    tickfont=dict(size=14, color=axis_grey),
)
layout = go.Layout(
    title="Distribution of Name of type of the Suite in % ",
    xaxis=xaxis_style,
    yaxis=yaxis_style,
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='schoolStateNames')

### 3.5 Is the data balanced or imbalanced?

In [82]:
# Pie chart of how many rows fall into each TARGET class.
temp = df_train["TARGET"].value_counts()
df = pd.DataFrame(dict(labels=temp.index, values=temp.values))
df.iplot(
    kind='pie',
    labels='labels',
    values='values',
    title='Loan Repayed or not',
)

### 3.6 Types of loan

**Revolving loans**: An arrangement that allows the loan amount to be withdrawn, repaid, and redrawn in any manner and any number of times until the arrangement expires. Credit card loans and overdrafts are revolving loans. Also called an evergreen loan.

In [83]:
# Donut chart of the contract types, with a label placed in the hole.
temp = df_train["NAME_CONTRACT_TYPE"].value_counts()

# Single pie trace occupying the left part of the figure.
loan_type_pie = {
    "values": temp.values,
    "labels": temp.index,
    "domain": {"x": [0, .48]},
    "hole": .7,
    "type": "pie",
}

# Text annotation positioned over the donut's centre.
center_label = {
    "font": {"size": 20},
    "showarrow": False,
    "text": "Loan Types",
    "x": 0.17,
    "y": 0.5,
}

fig = {
    "data": [loan_type_pie],
    "layout": {"title": "Types of loan", "annotations": [center_label]},
}
iplot(fig, filename='donut')

### 3.7 Purpose of loan

In [84]:
# Two side-by-side donut charts: FLAG_OWN_CAR on the left, FLAG_OWN_REALTY on the right.
temp1 = df_train["FLAG_OWN_CAR"].value_counts()
temp2 = df_train["FLAG_OWN_REALTY"].value_counts()

car_pie = {
    "values": temp1.values,
    "labels": temp1.index,
    "domain": {"x": [0, .48]},
    "name": "Own Car",
    "hoverinfo": "label+percent+name",
    "hole": .6,
    "type": "pie",
}
realty_pie = {
    "values": temp2.values,
    "labels": temp2.index,
    "text": "Own Reality",
    "textposition": "inside",
    "domain": {"x": [.52, 1]},
    "name": "Own Reality",
    "hoverinfo": "label+percent+name",
    "hole": .6,
    "type": "pie",
}

# One centred caption per donut; only the text and the x position differ.
donut_captions = [
    {"font": {"size": 20}, "showarrow": False, "text": caption, "x": x_pos, "y": 0.5}
    for caption, x_pos in (("Own Car", 0.20), ("Own Reality", 0.8))
]

fig = {
    "data": [car_pie, realty_pie],
    "layout": {"title": "Purpose of loan", "annotations": donut_captions},
}
iplot(fig, filename='donut')

### 3.8 Income sources of applicants who applied for a loan

In [85]:
# Donut chart of the applicants' income types.
temp = df_train["NAME_INCOME_TYPE"].value_counts()
df = pd.DataFrame(dict(labels=temp.index, values=temp.values))
df.iplot(
    kind='pie',
    labels='labels',
    values='values',
    title="Income sources of Applicant's",
    hole=0.5,
)

### 3.9 Family status of applicants who applied for a loan

In [86]:
# Donut chart of the applicants' family status.
temp = df_train["NAME_FAMILY_STATUS"].value_counts()
df = pd.DataFrame(dict(labels=temp.index, values=temp.values))
df.iplot(
    kind='pie',
    labels='labels',
    values='values',
    title="Family Status of Applicant's",
    hole=0.5,
)

### 3.10 Occupation of applicants who applied for a loan

In [87]:
# Bar chart of how many applicants have each occupation type.
temp = df_train["OCCUPATION_TYPE"].value_counts()
temp.iplot(
    kind='bar',
    xTitle='Occupation',
    yTitle="Count",
    title="Occupation of Applicant's who applied for loan",
    color='green',
)

### 3.11 Education of applicants who applied for a loan

In [88]:
# Donut chart of the applicants' education level.
temp = df_train["NAME_EDUCATION_TYPE"].value_counts()
df = pd.DataFrame(dict(labels=temp.index, values=temp.values))
df.iplot(
    kind='pie',
    labels='labels',
    values='values',
    title="Education of Applicant's",
    hole=0.5,
)

### 3.12 Which housing types are most common among loan applicants?

In [89]:
# Donut chart of the applicants' housing type.
temp = df_train["NAME_HOUSING_TYPE"].value_counts()
df = pd.DataFrame(dict(labels=temp.index, values=temp.values))
df.iplot(
    kind='pie',
    labels='labels',
    values='values',
    title='Type of House',
    hole=0.5,
)

### 3.13 Organization types of applicants who applied for a loan

In [90]:
# Bar chart of how many applicants come from each organization type.
temp = df_train["ORGANIZATION_TYPE"].value_counts()
temp.iplot(
    kind='bar',
    xTitle='Organization Name',
    yTitle="Count",
    title='Types of Organizations who applied for loan ',
    color='red',
)

# 4. Pearson Correlation of features

In [91]:
# Pearson correlation heatmap of the numeric training columns.
# The matrix is computed once and the axis labels are taken from the matrix itself:
# the correlation only covers numeric columns, so labelling the axes with every
# df_train column (including the still-unencoded object columns) shifts the labels
# relative to the cells. Selecting numeric columns explicitly also keeps corr() from
# being handed string columns.
corr_matrix = df_train.select_dtypes(include='number').corr()

data = [
    go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns.values,
        y=corr_matrix.index.values,
        colorscale='Viridis',
        reversescale=False,
        # `text=True` was dropped: it is not an array of cell labels.
        opacity=1.0)
]

layout = go.Layout(
    title='Pearson Correlation of features',
    xaxis = dict(ticks='', nticks=36),
    yaxis = dict(ticks='' ),
    width = 900, height = 700)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='labelled-heatmap')

# 4. Data preparation

The **sklearn.preprocessing** package provides several common utility functions and transformer classes to change raw feature vectors into a representation that is more suitable for the downstream estimators. [More information](http://scikit-learn.org/stable/modules/preprocessing.html)

In [92]:
from sklearn import preprocessing

In [93]:
# Collect, in column order, the names of the training columns whose dtype is 'object'.
categorical_features = [
    column_name
    for column_name, column_dtype in df_train.dtypes.items()
    if column_dtype == 'object'
]

In [94]:
# Label-encode every categorical column in place. Each encoder is fitted on the
# train and test values together so both frames share one label mapping.
for col in categorical_features:
    # Values are cast to str first, so missing entries are encoded like any other label.
    train_labels = df_train[col].values.astype('str')
    test_labels = df_test[col].values.astype('str')

    encoder = preprocessing.LabelEncoder()
    encoder.fit(np.concatenate([train_labels, test_labels]))

    df_train[col] = encoder.transform(train_labels)
    df_test[col] = encoder.transform(test_labels)

**pandas.DataFrame.fillna**

Fill NA/NaN values using the specified method

[More information](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html)

In [95]:
# Replace every remaining NaN with the sentinel -999, modifying both frames in place.
for frame in (df_train, df_test):
    frame.fillna(-999, inplace=True)

# 5. Building models

### 5.1. LightGBM

LightGBM is a gradient boosting framework that uses tree-based learning algorithms.

[More...](https://medium.com/@pushkarmandot/https-medium-com-pushkarmandot-what-is-lightgbm-how-to-implement-it-how-to-fine-tune-the-parameters-60347819b7fc)

In [96]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split 

In [97]:
# Target labels for training and the test identifiers needed for the submission file.
Y, test_id = df_train['TARGET'], df_test['SK_ID_CURR']

# Feature matrices: the identifier (and, for train, the label) is excluded.
train_X = df_train.drop(columns=['TARGET', 'SK_ID_CURR'])
test_X = df_test.drop(columns=['SK_ID_CURR'])

In [98]:
# Hold out part of the training rows for validation; the fixed seed keeps the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(train_X, Y, random_state=18)

# Wrap both parts as LightGBM datasets for lgb.train.
lgb_train = lgb.Dataset(x_train, label=y_train)
lgb_eval = lgb.Dataset(x_val, label=y_val)

In [99]:
# Settings passed to lgb.train: binary objective scored by AUC,
# with the learning rate, leaf count and iteration budget fixed here.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    learning_rate=0.05,
    num_leaves=32,
    num_iteration=500,
    verbose=0,
)

In [100]:
# Train the booster, evaluating on the held-out set; stop early after 100 rounds
# without improvement and log the metric every 10 rounds.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=lgb_eval,
    early_stopping_rounds=100,
    verbose_eval=10,
)

### Feature Importance

In [101]:
# Plot the trained booster's feature importances on a tall figure;
# the trailing semicolon suppresses the returned object's repr.
lgb.plot_importance(booster=model, figsize=(12, 20));

In [102]:
# for competition

# Score the test features with the lgb.train booster and write id/score pairs to CSV.
pred = model.predict(test_X)
sub = pd.DataFrame({'SK_ID_CURR': test_id, 'TARGET': pred})
sub.to_csv("baseline_submission.csv", index=False)
sub.head(n=5)

## 5.2. LGBMClassifier

In [103]:
from lightgbm import LGBMClassifier

In [104]:
# Hyper-parameters for the scikit-learn style LightGBM classifier, gathered in one
# mapping so they can be inspected or adjusted in a single place.
clf_params = dict(
    n_estimators=300,
    num_leaves=15,
    colsample_bytree=.8,
    subsample=.8,
    max_depth=7,
    reg_alpha=.1,
    reg_lambda=.1,
    min_split_gain=.01,
)
clf = LGBMClassifier(**clf_params)

In [105]:
# Fit the classifier while tracking AUC on both the training and the validation
# split; training stops early after 30 rounds without improvement.
eval_pairs = [(x_train, y_train), (x_val, y_val)]
clf.fit(
    X=x_train,
    y=y_train,
    eval_set=eval_pairs,
    eval_metric='auc',
    early_stopping_rounds=30,
    verbose=0,
)

In [106]:
# for competition

# Score the test set with the LGBMClassifier fitted above. The models in this
# notebook are evaluated on AUC, which ranks continuous scores, so take the
# positive-class column of predict_proba rather than the hard 0/1 labels that
# predict() returns.
pred_1 = clf.predict_proba(test_X)[:, 1]
sub = pd.DataFrame()
sub['SK_ID_CURR'] = test_id
# Write this classifier's predictions (pred_1). The previous code wrote `pred`,
# i.e. the output of the earlier lgb.train booster, so submission_clf.csv was a
# copy of the baseline submission.
sub['TARGET'] = pred_1
sub.to_csv("submission_clf.csv", index=False)
sub.head()