In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        file_path = os.path.join(folder, name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# [Problem 1] Understanding the content of the competition

* Home Credit is a non-bank financial company that provides installment loans to unbanked people who have little or no credit history.
* Some people struggle to get a loan because their credit history is inadequate. The goal of the competition is to predict each customer's ability to repay, so that applicants who are able to repay can still get a loan.
* Being able to predict repayment helps the company manage risk, understand its clients better, and maintain its services.

# [Problem 2] Understanding the overview of data

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline

# **Load the data**

In [None]:
# Directory that holds the competition CSV files; the loading cell appends "/<table>.csv" to it.
PATH="../input/home-credit-default-risk"

In [None]:
def _read_table(table_name):
    """Read ``<PATH>/<table_name>.csv`` into a DataFrame."""
    return pd.read_csv(f"{PATH}/{table_name}.csv")


# Main application tables (train carries the TARGET column, test does not).
application_train = _read_table("application_train")
application_test = _read_table("application_test")

# Supplementary tables describing credit history and previous loans.
bureau = _read_table("bureau")
bureau_balance = _read_table("bureau_balance")
credit_card_balance = _read_table("credit_card_balance")
installments_payments = _read_table("installments_payments")
previous_application = _read_table("previous_application")
POS_CASH_balance = _read_table("POS_CASH_balance")

# **Glimpse the data**

In [None]:
# view shape
# Report the number of rows and columns of every loaded table.
loaded_tables = {
    "application_train": application_train,
    "application_test": application_test,
    "bureau": bureau,
    "bureau_balance": bureau_balance,
    "credit_card_balance": credit_card_balance,
    "installments_payments": installments_payments,
    "previous_application": previous_application,
    "POS_CASH_balance": POS_CASH_balance,
}
for table_name, table in loaded_tables.items():
    row_count, column_count = table.shape
    print(f"{table_name} -  rows:", row_count, " columns:", column_count)

**application_train**

In [None]:
# view first 5 rows
application_train.head()

In [None]:
application_train.info()

In [None]:
application_train.describe()

In [None]:
# List every column name of application_train (`.columns.values` exposes them as a NumPy array)
application_train.columns.values

**application_test**

In [None]:
# view first 5 rows
application_test.head()

In [None]:
application_test.info()

In [None]:
application_test.describe()

In [None]:
# List every column name of application_test (`.columns.values` exposes them as a NumPy array)
application_test.columns.values

**bureau**

In [None]:
# view first 5 rows
bureau.head()

In [None]:
bureau.info()

In [None]:
bureau.describe()

In [None]:
# List every column name of bureau (`.columns.values` exposes them as a NumPy array)
bureau.columns.values

**bureau_balance**

In [None]:
# view first 5 rows
bureau_balance.head()

In [None]:
bureau_balance.info()

In [None]:
bureau_balance.describe()

In [None]:
# List every column name of bureau_balance (`.columns.values` exposes them as a NumPy array)
bureau_balance.columns.values

**credit_card_balance**

In [None]:
credit_card_balance.head()

In [None]:
credit_card_balance.info()

In [None]:
credit_card_balance.describe()

In [None]:
credit_card_balance.columns.values

**installments_payments**

In [None]:
installments_payments.head()

In [None]:
installments_payments.info()

In [None]:
installments_payments.describe()

In [None]:
installments_payments.columns.values

**previous_application**

In [None]:
previous_application.head()

In [None]:
previous_application.info()

In [None]:
previous_application.describe()

In [None]:
previous_application.columns.values

**POS_CASH_balance**

In [None]:
POS_CASH_balance.head()

In [None]:
POS_CASH_balance.info()

In [None]:
POS_CASH_balance.describe()

In [None]:
POS_CASH_balance.columns.values

# **Check missing data**

In [None]:
def missing_data(data):
    """Summarise the missing values of each column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data``, ordered from most to fewest missing
        values, with two columns:

        * ``Total``   - number of missing (null) entries in that column
        * ``Percent`` - ``Total`` as a percentage of the number of rows
    """
    # Scan the frame for nulls once; the original built the null mask twice and
    # counted it a third time, which is wasteful on multi-million-row tables.
    total = data.isnull().sum().sort_values(ascending=False)
    # Deriving the percentage from the already-sorted counts keeps both columns
    # in the same row order without a second sort.
    percent = total / len(data) * 100
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

**application_train**

In [None]:
print(missing_data(application_train).head(10))

**application_test**

In [None]:
print(missing_data(application_test).head(10))

**bureau**

In [None]:
print(missing_data(bureau))

**bureau_balance**

In [None]:
print(missing_data(bureau_balance))

**credit_card_balance**

In [None]:
print(missing_data(credit_card_balance))

**installments_payments**

In [None]:
print(missing_data(installments_payments))

**previous_application**

In [None]:
print(missing_data(previous_application).head(20))

**POS_CASH_balance**

In [None]:
print(missing_data(POS_CASH_balance))

# [Problem 3] Defining issues

# **Check data unbalance**

*TARGET value 0 means loan is repaid, value 1 means loan is not repaid.*

In [None]:
# Count how many applications fall into each TARGET class.
temp = application_train["TARGET"].value_counts()
df = pd.DataFrame({'class': temp.index,
                   'values': temp.values
                  })

# Draw on an explicit Axes instead of relying on pyplot's implicit "current figure".
fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(x='class', y="values", data=df, ax=ax)
ax.set_title('Application loans repaid - train dataset')
# Tick 0 is labelled "repay" and tick 1 "not repay"; this relies on the bars being
# drawn in the order class 0, class 1.
ax.set_xticks([0, 1])
ax.set_xticklabels(["repay", "not repay"])
plt.show()


As the figure shows, the number of repaid loans in the dataset is much higher than the number of loans that were not repaid.

*=> The dataset is extremely unbalanced.*

In [None]:
# A column counts as "severely" incomplete when more than this percentage of its values is missing.
SEVERE_MISSING_PERCENT = 10

missing_df = missing_data(application_train)
num_of_missing_cols = missing_df[missing_df['Percent'] > SEVERE_MISSING_PERCENT].shape[0]
# The message is built from the threshold itself so the text can never disagree with the
# filter (it previously said 15% while the filter used 10%).
print(f"Number of column attributes have over {SEVERE_MISSING_PERCENT}% missing values:", num_of_missing_cols)
print(f"It's about {num_of_missing_cols / application_train.shape[1] * 100:.1f}% of columns")

Nearly half of the features are severely affected by missing values (here, more than 10% missing is treated as severe).

**Conclusion**: We have to handle the missing values; otherwise they will lead to an inefficient prediction model.

**Some issues/questions:**

* The TARGET classes are unbalanced, perhaps because the data was collected from a similar group of people or from clients with similar attributes.

* Does the imbalance of the dataset tell us something about the features?

* Which strategies should be used to deal with the missing values?

* Does the data follow any type of distribution?

# **[Problem 4] Data exploration**

# **Explore the data**

**Application data**

In [None]:
def plot_stats(feature,label_rotation=False,horizontal_layout=True):
    """Plot contract counts and non-repayment rate per value of one feature.

    Reads the module-level ``application_train`` frame and draws two bar charts:

    * first panel  - number of contracts for each value of ``feature``
    * second panel - percentage of contracts with ``TARGET == 1`` for each value,
      ordered from the highest to the lowest percentage

    Parameters
    ----------
    feature : str
        Column of ``application_train`` to group by.
    label_rotation : bool, default False
        Rotate the x tick labels by 90 degrees (useful for long category names).
    horizontal_layout : bool, default True
        Place the two panels side by side; otherwise stack them vertically.
    """
    counts = application_train[feature].value_counts()
    df1 = pd.DataFrame({feature: counts.index,'Number of contracts': counts.values})

    # The mean of TARGET per category is the fraction of TARGET = 1 rows. It is scaled
    # to a percentage so the plotted values agree with the "[%]" axis label.
    cat_perc = (
        application_train[[feature, 'TARGET']]
        .groupby([feature], as_index=False)
        .mean()
        .sort_values(by='TARGET', ascending=False)
    )
    cat_perc['TARGET'] = cat_perc['TARGET'] * 100

    if horizontal_layout:
        fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))
    else:
        fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(12,14))
    sns.set_color_codes("pastel")
    sns.barplot(ax=ax1, x=feature, y="Number of contracts", data=df1)
    sns.barplot(ax=ax2, x=feature, y='TARGET', order=cat_perc[feature], data=cat_perc)

    if label_rotation:
        # tick_params rotates the labels without re-assigning them, which avoids the
        # fixed-locator warning raised by set_xticklabels(get_xticklabels()).
        for ax in (ax1, ax2):
            ax.tick_params(axis='x', labelrotation=90)

    # Address the second panel explicitly rather than through pyplot's current axes.
    ax2.set_ylabel('Percent of target with value 1 [%]', fontsize=10)
    ax2.tick_params(axis='both', which='major', labelsize=10)

    plt.show()

In [None]:
plot_stats('NAME_CONTRACT_TYPE')

Contract type *Cash loans* accounts for about 90% of the total number of contracts, and about \~8.5% of them are not repaid, while contracts of type *Revolving loans* have a lower non-repayment rate (~5.5%).

In [None]:
plot_stats('CODE_GENDER')

The number of female clients is nearly double that of male clients. Looking more closely at the percentage of defaulted credits, males are more likely not to repay their loans (\~10%) compared to females (~7.5%).

In [None]:
plot_stats('FLAG_OWN_CAR')

Clients who don't own a car are about twice as numerous as clients who do. In addition, those without a car are less likely to repay their loans (about 8% are not repaid).

In [None]:
plot_stats('FLAG_OWN_REALTY')

In [None]:
plot_stats('EMERGENCYSTATE_MODE')

Regarding `EMERGENCYSTATE_MODE`: the majority of customers are not in emergency mode, but they are less likely to repay compared to those who are in emergency mode.

Regarding `FLAG_OWN_REALTY`: clients who own real estate are more than twice as numerous as those who don't. Both categories (owning real estate or not) have non-repayment rates of around 8%.

In [None]:
plot_stats('NAME_FAMILY_STATUS',True, True)

Most of the customers are married.
Apart from *Unknown*, *Civil marriage* has the highest probability of not repaying the loan (\~10%), while *Widow* has the lowest (~6%).

In [None]:
plot_stats('NAME_INCOME_TYPE', True, True)

Most contracts are made for employed clients, and about 10% of them didn't repay their loans. The unemployed and maternity-leave groups contain only a small number of samples, but over 35% and 40% of them, respectively, didn't repay.

In [None]:
plot_stats('CNT_CHILDREN')

Most of the clients taking a loan have no children. 
The number of loans to clients with one child is about 4 times smaller, and the number of loans to clients with two children is about 8 times smaller; clients with 3, 4 or more children are much rarer.

As for repayment, clients with 0, 1, 2, 3, or 5 children have non-repayment rates of around 10%. 

For clients with 9 or 11 children, the percentage of loans not repaid is 100%.

In [None]:
# Keep only the columns of application_train that have at most 5 missing values.
missing_application_train = missing_data(application_train)
has_many_missing = missing_application_train['Total'] > 5
columns_to_drop = missing_application_train.index[has_many_missing]
drop_application_train = application_train.drop(columns=columns_to_drop)
print("Remain features:", drop_application_train.columns.values)

In [None]:
# Correlate only the numeric/boolean columns. Since pandas 2.0, DataFrame.corr() no longer
# skips non-numeric columns silently and raises on them instead, so they are filtered
# out explicitly here (this form also works on older pandas versions).
numeric_application_train = drop_application_train.select_dtypes(include=['number', 'bool'])
application_train_corr = numeric_application_train.corr()
plt.figure(figsize=(20, 12))
sns.heatmap(application_train_corr, square=True);
plt.show()

In [None]:
# The 10 columns with the largest correlation coefficient with TARGET. TARGET itself is
# one of them (its self-correlation is 1), and nlargest ranks by signed value, so
# strongly negative correlations are not selected.
application_train_10_list = application_train_corr.nlargest(10, 'TARGET').index
# Slice the correlation matrix that pandas already computed instead of calling
# np.corrcoef on the raw values: np.corrcoef returns NaN for every pair involving a
# column that contains a missing value, whereas DataFrame.corr ignores missing entries.
top10_corrmat = application_train_corr.loc[application_train_10_list, application_train_10_list].values
sns.set(font_scale=1.25)
plt.figure(figsize=(12, 9))
sns.heatmap(top10_corrmat, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=application_train_10_list.values, xticklabels=application_train_10_list.values)
plt.show()

In [None]:
# Plot distribution of one feature
def plot_distribution(feature,color):
    """Show a 100-bin histogram with a KDE overlay for one ``application_train`` column.

    Missing values of ``feature`` are dropped before plotting, and ``color`` is
    forwarded to seaborn as the bar/curve colour.
    """
    observed = application_train[feature].dropna()
    fig, ax = plt.subplots(figsize=(10,6))
    ax.set_title("Distribution of %s" % feature)
    sns.histplot(observed, color=color, kde=True, bins=100, ax=ax)
    plt.show()

In [None]:
# Plot distribution of multiple features, with TARGET = 1/0 on the same graph
def plot_distribution_comp(var,nrow=2):
    """Compare the density of several features between TARGET = 1 and TARGET = 0.

    For every feature in ``var`` one panel is drawn that overlays two KDE curves
    computed from the module-level ``application_train`` frame: one for the rows
    with ``TARGET != 0`` and one for the rows with ``TARGET == 0``.

    Parameters
    ----------
    var : iterable of str
        Column names of ``application_train`` to plot, one panel each.
    nrow : int, default 2
        Number of rows of the panel grid; the grid always has 2 columns.

    Raises
    ------
    ValueError
        If ``var`` contains more features than the ``nrow`` x 2 grid can hold.
    """
    features = list(var)
    if len(features) > nrow * 2:
        raise ValueError(
            "%d features do not fit into a %d x 2 grid; increase nrow" % (len(features), nrow)
        )

    t1 = application_train.loc[application_train['TARGET'] != 0]
    t0 = application_train.loc[application_train['TARGET'] == 0]

    sns.set_style('whitegrid')
    # A single subplots() call creates the figure; an extra plt.figure() beforehand would
    # only produce an additional empty figure. squeeze=False keeps `axes` 2-D when nrow is 1.
    fig, axes = plt.subplots(nrow,2,figsize=(12,6*nrow), squeeze=False)
    panels = axes.ravel()

    for ax, feature in zip(panels, features):
        sns.kdeplot(t1[feature], bw_adjust=0.5,label="TARGET = 1", ax=ax)
        sns.kdeplot(t0[feature], bw_adjust=0.5,label="TARGET = 0", ax=ax)
        ax.set_ylabel('Density plot', fontsize=12)
        ax.set_xlabel(feature, fontsize=12)
        ax.tick_params(axis='both', which='major', labelsize=12)
        # Without a legend the two curves cannot be told apart.
        ax.legend()

    # Hide grid cells that received no feature.
    for ax in panels[len(features):]:
        ax.set_visible(False)
    plt.show()

In [None]:
plot_distribution('AMT_INCOME_TOTAL','green')

In [None]:
plot_distribution('AMT_CREDIT','blue')

In [None]:
plot_distribution('AMT_ANNUITY','tomato')

In [None]:
plot_distribution('AMT_GOODS_PRICE','brown')

In [None]:
plot_distribution('DAYS_BIRTH','blue')

In [None]:
plot_distribution('DAYS_EMPLOYED','red')

> The data do not follow a normal distribution.

In [None]:
# Features whose TARGET = 1 and TARGET = 0 densities are compared; six features fill the 3 x 2 grid requested with nrow=3.
var = ['AMT_ANNUITY','AMT_GOODS_PRICE','DAYS_EMPLOYED', 'DAYS_REGISTRATION','DAYS_BIRTH','DAYS_ID_PUBLISH']
plot_distribution_comp(var,nrow=3)

It seems that the distributions of these interval-valued features remain similar between the two sets of samples (TARGET=1 and TARGET=0), regardless of the class imbalance shown previously.