In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

# Introduction

Description as per kaggle https://www.kaggle.com/c/home-credit-default-risk/overview :
Many people struggle to get loans due to insufficient or non-existent credit histories. And, unfortunately, this population is often taken advantage of by untrustworthy lenders.

Home Credit strives to broaden financial inclusion for the unbanked population by providing a positive and safe borrowing experience. In order to make sure this underserved population has a positive loan experience, Home Credit makes use of a variety of alternative data--including telco and transactional information--to predict their clients' repayment abilities.

While Home Credit is currently using various statistical and machine learning methods to make these predictions, they're challenging Kagglers to help them unlock the full potential of their data. Doing so will ensure that clients capable of repayment are not rejected and that loans are given with a principal, maturity, and repayment calendar that will empower their clients to be successful.

The page https://www.kaggle.com/c/home-credit-default-risk/data also describes the various datasets provided for this competition.

This is a standard supervised classification task:
Supervised: The labels are included in the training data and the goal is to train a model to learn to predict the labels from the features
Classification: The label is a binary variable, 0 (will repay loan on time), 1 (will have difficulty repaying loan).

Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.

We will start with just the application train and test datasets.

In [None]:
# numpy and pandas for data manipulation
# already imported by default by the Kaggle Kernels.

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the application training table, report its (rows, columns) shape
# and preview the first rows.
train_csv_path = '/kaggle/input/home-credit-default-risk/application_train.csv'
train_data = pd.read_csv(train_csv_path)
print('Training data shape: ', train_data.shape)
train_data.head(5)

Observation : The training data has 307511 observations and 122 features including the TARGET (the label we want to predict).

In [None]:
# Show every row whose application id (SK_ID_CURR) repeats an earlier row.
# An empty result means each application id appears only once.
duplicate_id_mask = train_data.duplicated(subset=['SK_ID_CURR'])
train_data.loc[duplicate_id_mask]

In [None]:
# Show every training row that is an exact copy of an earlier row
# (an empty result means there are no fully duplicated rows).
duplicate_row_mask = train_data.duplicated()
train_data.loc[duplicate_row_mask]

Observation: Good news - there are no duplicate rows and no duplicate application ids ('SK_ID_CURR').

In [None]:
# Load the application test table, report its (rows, columns) shape
# and preview the first rows.
test_csv_path = '/kaggle/input/home-credit-default-risk/application_test.csv'
test_data = pd.read_csv(test_csv_path)
print('Test data shape: ', test_data.shape)
test_data.head(5)

In [None]:
# Show every test row that is an exact copy of an earlier row
# (an empty result means there are no fully duplicated rows).
duplicate_test_row_mask = test_data.duplicated()
test_data.loc[duplicate_test_row_mask]

# Feature and Target Distribution

I read an article by one of the Kaggle winners, who mentioned that we should check the distribution of the TARGET column.
Reason : To check the class balance in our dataset.
We can first examine the number of loans falling into each category.

In [None]:
# Class balance check: print how many rows carry each TARGET value.
print(train_data['TARGET'].value_counts())

In [None]:
# Histogram of the TARGET labels cast to int; the trailing ';' hides the Axes repr.
train_data['TARGET'].astype(int).plot(kind='hist');

Observation : This seems to be a class imbalance problem.
Refer to http://www.chioka.in/class-imbalance-problem/ to get more details.
There are far more loans that were repaid on time than loans that were not repaid.

In [None]:
# Number of each type of column:
# counts how many columns of train_data share each dtype.
train_data.dtypes.value_counts()

In [None]:
# Collect the names of the categorical columns, i.e. those stored with dtype 'object',
# in their original column order.
train_data_cat_var = [
    column_name
    for column_name, column_dtype in train_data.dtypes.items()
    if column_dtype == 'object'
]
train_data_cat_var

In [None]:
# Collect the names of the numerical columns, i.e. every column whose dtype is not 'object'.
non_object_columns = train_data.select_dtypes(exclude=['object']).columns
train_data_num_var = non_object_columns.tolist()
train_data_num_var

In [None]:
# How many entries the numeric-column list holds.
len(train_data_num_var)

In [None]:
# Preview the names at positions 1-9 of the numeric-column list (position 0 is skipped);
# the same slice is used for the box plot below.
train_data_num_var[1:10]

In [None]:
# Distribution of each feature:
# box plots of the numeric columns at positions 1-9 of train_data_num_var,
# drawn together on one wide axis with matplotlib's 'ggplot' style.
import matplotlib
matplotlib.style.use('ggplot')

fig, ax = plt.subplots(figsize=(20, 5))
train_data.boxplot(column=train_data_num_var[1:10], ax=ax)

plt.show(block=True)


In [None]:
# Histograms of the numeric columns at positions 1-4 of train_data_num_var.
# DataFrame.hist creates its own figure, so the size is passed to it directly;
# a separate plt.figure(...) call would only leave an extra, empty figure behind
# while the histograms themselves were drawn at the default size.
train_data.hist(column=train_data_num_var[1:5], figsize=(20, 5))
plt.show()

# Checking for Missing Values

In [None]:
# Build a per-column missing-value summary:
#   'Total' - number of null entries in the column
#   '%'     - share of null entries, in percent, rounded to one decimal place
null_mask = train_data.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent_1 = null_mask.sum() / null_mask.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'], sort=False)
missing_data.head()

In [None]:
# Count on the exact 'Total' column rather than on '%': the percentage is rounded to
# one decimal place, so a column with only a handful of missing rows shows 0.0 there
# and would be skipped by a '% > 0' test.
columns_with_missing = int((missing_data['Total'] > 0).sum())
print("Total columns that have missing values :", str(columns_with_missing))

In [None]:
# Use the exact missing share (null count / number of rows) instead of the rounded '%'
# column, so that sparsely-missing columns are not rounded down to 0 and dropped.
exact_missing_percent = missing_data['Total'] / len(train_data) * 100

# The two buckets together cover every column that has at least one missing value:
# the second one includes a share of exactly 50%, which previously fell in neither bucket.
mostly_missing_count = int((exact_missing_percent > 50).sum())
partly_missing_count = int(((exact_missing_percent > 0) & (exact_missing_percent <= 50)).sum())

print("Total columns that have missing values (More than 50%):", str(mostly_missing_count))
print("Total columns that have missing values (50% or less):", str(partly_missing_count))

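Observation: It seems we have 64 columns with NULL values when counting on the percentage rounded to one decimal place (columns with only a handful of missing rows round to 0.0% and are not included in that figure). That leaves us with the 3 options below.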

* Option 1: fill-in these missing values, as all the models work with non-NaN values.
* Option 2: drop columns having more than 50% of missing values. ie we will drop 41 columns.
* Option 3: Use an XGBoost model, as this is the only model I am currently aware of that can work with NaN or missing values.

# Correlation of features with target variable

In [None]:
# Preview the numeric columns at positions 1-4 of train_data_num_var.
# Only the first rows are shown, like the other preview cells, instead of
# handing the whole 300k-row frame to the display.
train_data[train_data_num_var[1:5]].head()

In [None]:
# Pearson correlation (the DataFrame.corr default) between the numeric columns
# at positions 1-19 of train_data_num_var, shown as an annotated heatmap.
cor = train_data[train_data_num_var[1:20]].corr()

fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)

plt.show()

In [None]:
# Absolute correlation of every column in `cor` with the output variable TARGET.
cor_target = cor["TARGET"].abs()

# Keep only the entries whose absolute correlation exceeds 0.05.
# Note: TARGET itself is part of `cor`, so it is listed too (correlation with itself).
relevant_features = cor_target.loc[cor_target > 0.05]
relevant_features

It seems that none of the numeric features examined above (only the first 19 numeric columns were included in the correlation matrix) has much correlation with our target variable.

Correlation coefficients whose magnitude are between 0.5 and 0.7 indicate variables which can be considered moderately correlated. Correlation coefficients whose magnitude are between 0.3 and 0.5 indicate variables which have a low correlation.

# Uniqueness of data in each column

In [None]:
# Number of distinct (non-null) values in each categorical column
categorical_columns = train_data.select_dtypes('object')
categorical_columns.nunique()

In [None]:
# Summary statistics (count, unique, top, freq) for the object-dtype columns.
# The builtin `object` is used because the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
train_data.describe(include=[object])

In [None]:
# Map every object-dtype column to its number of distinct values.
# Series.unique() keeps NaN as a value, so a column with missing entries
# counts one more than its number of real categories.
cat_dict = {
    column_name: len(train_data[column_name].unique())
    for column_name in train_data.columns
    if train_data[column_name].dtype == 'object'
}

print(cat_dict)
            

From the result above, we have 16 categorical features, out of which:
* NAME_CONTRACT_TYPE, FLAG_OWN_CAR, FLAG_OWN_REALTY: binary categorical features.
* The remaining 13 are multi-valued categorical features.

In [None]:
# Helper to get the distinct values of each categorical column
def get_Unique_Values(list_cat_var, data=None) :
    """Return the distinct values of each requested column.

    Parameters
    ----------
    list_cat_var : iterable of str
        Names of the columns to inspect.
    data : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the notebook-level
        ``train_data`` is used, so existing single-argument calls behave
        exactly as before.

    Returns
    -------
    dict
        Maps each column name to ``list(frame[column].unique())``. Missing
        values (NaN) are kept as one of the distinct values.

    Raises
    ------
    KeyError
        If a requested column is not present in the frame.
    """
    # Fall back to the global frame only when no explicit frame is supplied.
    frame = train_data if data is None else data
    return {column: list(frame[column].unique()) for column in list_cat_var}

In [None]:
# Distinct values of the three two-valued categorical columns.
print(
    get_Unique_Values(
        [
            'NAME_CONTRACT_TYPE',
            'FLAG_OWN_CAR',
            'FLAG_OWN_REALTY',
        ]
    )
)

In [None]:
# Distinct values of a sample of the multi-valued categorical columns.
print(
    get_Unique_Values(
        [
            'CODE_GENDER',
            'EMERGENCYSTATE_MODE',
            'HOUSETYPE_MODE',
            'NAME_EDUCATION_TYPE',
            'FONDKAPREMONT_MODE',
        ]
    )
)

# Encoding techniques
For encoding we have the following approaches: we can go with Label Encoding, One-Hot Encoding, or hard-code the values manually.
However, it seems there are some NaN values in EMERGENCYSTATE_MODE, HOUSETYPE_MODE and FONDKAPREMONT_MODE. We will leave them as is and encode them as well for now.

In [None]:
# Binary encoding
# For each column, the label listed here becomes 0 and every other value becomes 1.
binary_zero_labels = {
    'NAME_CONTRACT_TYPE': 'Cash loans',
    'FLAG_OWN_CAR': 'N',
    'FLAG_OWN_REALTY': 'N',
}

for column_name, zero_label in binary_zero_labels.items():
    # Skip columns that are already numeric: comparing the encoded 0/1 values with the
    # string label again would turn every row into 1, so re-running this cell would
    # silently destroy the feature.
    if pd.api.types.is_numeric_dtype(train_data[column_name]):
        continue
    train_data[column_name] = (train_data[column_name] != zero_label).astype(int)