In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import required libraries

import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization

import warnings # Warnings
# NOTE(review): this suppresses every warning for the rest of the notebook,
# including deprecation and chained-assignment warnings from pandas/seaborn.
# Consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore') # Ignore warnings

In [None]:
# Embed static images in notebook
%matplotlib inline

# Show up to 150 rows and up to 150 columns when a DataFrame is displayed
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 150)

## Read the Data files

In [None]:
# Detecting the encoding of the files to be imported

import chardet

def find_encoding(fname, sample_size=None):
    """Detect the character encoding of a file with chardet.

    Parameters
    ----------
    fname : str or path-like
        Path of the file to inspect.
    sample_size : int, optional
        Number of leading bytes to hand to the detector. The default (None)
        reads the whole file, which is the original behaviour; pass a size to
        bound memory and run time on large files.

    Returns
    -------
    The value stored under the 'encoding' key of chardet's detection result.
    """
    # The context manager closes the handle even if reading fails.
    with open(fname, 'rb') as raw_file:
        r_file = raw_file.read(-1 if sample_size is None else sample_size)
    result = chardet.detect(r_file)
    return result['encoding']

# Report the detected encoding of each input file. f-strings are used so that a
# detection result of None is printed rather than raising TypeError on `str + None`.
print(f"Encoding of the application_data file: {find_encoding('/kaggle/input/bank-loans-dataset/application_data.csv')}")
print(f"Encoding of the previous_application file: {find_encoding('/kaggle/input/bank-loans-dataset/previous_application.csv')}")

So, both the files are encoded as 'ascii'. Let's import the files now.

Most of the time, you will not need to check the encoding of the data files. However, it is good practice to check it, to avoid errors caused by reading a file with the wrong encoding.

In [None]:
# Time to read the files
# Both CSVs are loaded with the 'ascii' encoding reported by the detection cell above.

application_data = pd.read_csv('/kaggle/input/bank-loans-dataset/application_data.csv', encoding = 'ascii')
prev_application = pd.read_csv('/kaggle/input/bank-loans-dataset/previous_application.csv', encoding = 'ascii')

Application Data contains the information about the loan and applicant at the time of the application of the loan. <br>
Previous Application Data contains the Application Data for the client's previous loan application. It has one row per previous application.

## Data Summary

In [None]:
# (rows, columns) of each dataset

for label, frame in (('Application Data: ', application_data),
                     ('Previous Application Data: ', prev_application)):
    print(label, frame.shape)

In [None]:
# Snapshot of the application dataset (first five rows)

application_data.head()

In [None]:
# Snapshot of the previous application dataset (first five rows)
prev_application.head()

In [None]:
# Statistical summary of application data (numerical columns only)

application_data.describe()

.describe gives us the statistical summary of the numerical variables only. However, if we want to also include the categorical variables, we can set the parameter include = 'all'.

In [None]:
# Statistical summary of application data, including the categorical columns

application_data.describe(include = 'all')

In [None]:
# Statistical summary of Previous Application Data (numerical columns only)

prev_application.describe()

## Treating missing values and removing irrelevant variables

In [None]:
# Percentage of missing values per column, stored as a two-column DataFrame:
# 'index' holds the column name and 0 holds the percentage.

missing_pct = 100*application_data.isnull().sum()/application_data.shape[0]
app_data_missing = missing_pct.to_frame().reset_index()

Since we have a large number of variables, let's first visualize the missing values in a chart.

In [None]:
# Creating a chart for missing values
# Line chart of the percentage of missing values, one x position per column.

plt.figure(figsize = (20,5))
plt.plot(app_data_missing['index'], app_data_missing[0])
plt.xticks(rotation = 90, fontsize = 8)
plt.title('Percentage of missing values in each column of Application Data', fontsize = 14)
plt.xlabel('Columns / Variables', fontsize = 10)
plt.ylabel('Percentage Missing', fontsize = 10)
# Pass the flag positionally: the `b` keyword was renamed to `visible` in newer
# Matplotlib releases, while the positional form works in old and new versions.
plt.grid(True)
plt.show()

We can see that many of the variables have a high percentage of missing values.

Let us create a list of the columns that have more than 45% of their values missing.

In [None]:
# Names of the columns with more than 45% missing values, and how many there are

high_missing_mask = app_data_missing[0] > 45
miss_cols = app_data_missing.loc[high_missing_mask, 'index'].tolist()
len(miss_cols)

We can remove these 49 columns.

In [None]:
# Drop the columns with a high share of missing values (in place)

application_data.drop(columns = miss_cols, inplace = True)

In [None]:
# Checking the shape of application data again, after dropping the high-missing columns

application_data.shape

Let's start by first identifying the unnecessary columns based on our understanding from the columns_description file.<br>

The columns *FLAG_WORK_PHONE* and *FLAG_PHONE* both record whether the client provided a home phone. Since the work phone information is already captured in the *FLAG_EMP_PHONE* variable, we can remove the *FLAG_WORK_PHONE* variable.

In [None]:
# Remove the redundant FLAG_WORK_PHONE column (in place)

application_data.drop(columns = ['FLAG_WORK_PHONE'], inplace = True)

We also have no context on what the variables *EXT_SOURCE_2* and *EXT_SOURCE_3* mean or how they relate to the client's probability of default, so we can remove these columns too.

In [None]:
# Remove the EXT_SOURCE_2 and EXT_SOURCE_3 columns (in place)

ext_source_cols = ['EXT_SOURCE_2', 'EXT_SOURCE_3']
application_data.drop(columns = ext_source_cols, inplace = True)

The column *NAME_TYPE_SUITE*, which indicates who accompanied the client when applying for the loan, is assumed to have no bearing on whether the client will default, so we can remove this column too. <br>
Similarly, the columns *WEEKDAY_APPR_PROCESS_START*, *HOUR_APPR_PROCESS_START*, *REG_REGION_NOT_LIVE_REGION*, *REG_REGION_NOT_WORK_REGION*, *LIVE_REGION_NOT_WORK_REGION*, *REG_CITY_NOT_LIVE_CITY*, *REG_CITY_NOT_WORK_CITY*, *LIVE_CITY_NOT_WORK_CITY* can also be removed.

In [None]:
# Remove the columns judged irrelevant to the analysis (in place)

unnecessary_cols = [
    'NAME_TYPE_SUITE',
    'WEEKDAY_APPR_PROCESS_START',
    'HOUR_APPR_PROCESS_START',
    'REG_REGION_NOT_LIVE_REGION',
    'REG_REGION_NOT_WORK_REGION',
    'LIVE_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY',
    'REG_CITY_NOT_WORK_CITY',
    'LIVE_CITY_NOT_WORK_CITY',
]
application_data.drop(columns = unnecessary_cols, inplace = True)

Now, the columns of document flags are not individually important as we have no information about which document is being referred to. But, they can be a good indicator at an **aggregate** level. So, we can create another column *NUM_DOCS_ADDED* as the number of documents submitted.

In [None]:
# NUM_DOCS_ADDED = row-wise total of the FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21 flags

doc_flag_cols = ['FLAG_DOCUMENT_' + str(doc_no) for doc_no in range(2, 22)]
# skipna=False keeps the behaviour of chained `+`: any missing flag makes the total missing.
application_data['NUM_DOCS_ADDED'] = application_data[doc_flag_cols].sum(axis = 1, skipna = False)

Now, we can **remove the Document flag variables**.

In [None]:
# Drop the individual FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21 columns (in place)

document_flags = ['FLAG_DOCUMENT_' + str(doc_no) for doc_no in range(2, 22)]
application_data.drop(columns = document_flags, inplace = True)

In [None]:
# Updated shape of Application Data after the column removals above

application_data.shape

We only have 42 columns now.

In [None]:
# Percentage of missing values in each remaining column

remaining_missing_pct = 100*application_data.isnull().sum()/application_data.shape[0]
print(remaining_missing_pct)

The percentage of missing values for the columns *AMT_ANNUITY*, *AMT_GOODS_PRICE*, *OBS_30_CNT_SOCIAL_CIRCLE*, *DEF_30_CNT_SOCIAL_CIRCLE*, *OBS_60_CNT_SOCIAL_CIRCLE*, *DEF_60_CNT_SOCIAL_CIRCLE* and *DAYS_LAST_PHONE_CHANGE* is very low. So, we can **remove the rows with missing values in these columns**.

In [None]:
# Remove the rows that have a missing value in any of the sparsely-missing columns.
# A single dropna replaces seven chained filter-and-copy steps, so only one copy
# of the frame is made instead of seven.
sparse_missing_cols = [
    'AMT_ANNUITY',
    'AMT_GOODS_PRICE',
    'OBS_30_CNT_SOCIAL_CIRCLE',
    'DEF_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE',
    'DEF_60_CNT_SOCIAL_CIRCLE',
    'DAYS_LAST_PHONE_CHANGE',
]

# The result keeps the name app_data_7 because the following cells refer to it.
app_data_7 = application_data.dropna(subset = sparse_missing_cols).copy()

In [None]:
# Shape of updated DataFrame after removing the rows with missing values

app_data_7.shape

So, we have 306199 rows in the data now, which is around **99.57% of the original data**. So, we are good to go with this data.

*OCCUPATION_TYPE* column is a categorical variable, let's look at the composition of the variable.

In [None]:
# Percentage share of each OCCUPATION_TYPE value (missing values excluded)

occupation_share = app_data_7['OCCUPATION_TYPE'].value_counts(normalize = True)
100*occupation_share

Although Laborers make up a large percent of our data, it is not so high that we replace our missing values with it. So, let us **impute the missing values in this column with 'Unknown'**. In the machine learning model, this column, combined with *'NAME_INCOME_TYPE'* could indicate towards a person's occupation.

In [None]:
# Imputing missing values in OCCUPATION_TYPE with 'Unknown'
# The filled column is assigned back to the frame: an in-place fillna on a
# selected column is a chained assignment, which pandas does not guarantee to
# write through to the parent DataFrame (it does not under Copy-on-Write).

app_data_7['OCCUPATION_TYPE'] = app_data_7['OCCUPATION_TYPE'].fillna('Unknown')

Now, let's look at *AMT_REQ_CREDIT_BUREAU* columns and how we can impute their missing values. <br>

Since these columns are numeric and represent the number of queries in the Credit Bureau about the client in the specified time period, we can **use the median of each column to impute the values**.

In [None]:
# Imputing missing values in AMT_REQ_CREDIT_BUREAU variables with the median of each column
# Each filled column is assigned back to the frame, because an in-place fillna on
# a selected column is a chained assignment that pandas does not guarantee to
# propagate to the parent DataFrame (it does not under Copy-on-Write).

credit_bureau_cols = [
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
    'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_QRT',
    'AMT_REQ_CREDIT_BUREAU_YEAR',
]

for bureau_col in credit_bureau_cols:
    app_data_7[bureau_col] = app_data_7[bureau_col].fillna(app_data_7[bureau_col].median())

In [None]:
# Percentage of missing values per column after cleaning (expected to be 0 everywhere)

cleaned_missing_pct = 100*app_data_7.isnull().sum()/app_data_7.shape[0]
print(cleaned_missing_pct)

All missing values of the application dataset are cleaned.

## Cleaning the Data types

In [None]:
# Looking at the dataset again (first five rows of the cleaned frame)

app_data_7.head()

We can see that the columns *DAYS_BIRTH*, *DAYS_EMPLOYED*, *DAYS_REGISTRATION*, *DAYS_ID_PUBLISH* are negative. These should be in positive years or months. Let's **convert these to years**.

In [None]:
# Converting Negative Days to Positive Years
# Each DAYS_* column gets a companion <column>_YRS column holding -days / 365.
# The arithmetic is applied to the whole column at once rather than through a
# per-element Python lambda; the expression is the same, so the values are too.
# NOTE(review): confirm DAYS_EMPLOYED contains no large positive placeholder
# values, which would turn into large negative "years" here.

for days_col in ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH']:
    app_data_7[days_col + '_YRS'] = (-1.0)*app_data_7[days_col]/365

Now, we need to drop the older Days columns.

In [None]:
# Remove the original (negative) day-count columns now that the *_YRS versions exist

negative_day_cols = ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH']
app_data_7.drop(columns = negative_day_cols, inplace = True)

Now, there are no columns that are of date type in this DataFrame. We only have the age/time since documents were changed which are recorded as float. We should not convert these to DateTime type.

Now, let's look at some variables where the DataType is not stored correctly. <br>
Let's look at CNT_FAM_MEMBERS which is stored as float. It should be stored as integer.

In [None]:
# Store the family-member count as an integer

fam_members_int = app_data_7['CNT_FAM_MEMBERS'].astype(int)
app_data_7['CNT_FAM_MEMBERS'] = fam_members_int

In [None]:
# Store the four social-circle counts as integers

social_circle_cols = (
    'OBS_30_CNT_SOCIAL_CIRCLE',
    'DEF_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE',
    'DEF_60_CNT_SOCIAL_CIRCLE',
)
for circle_col in social_circle_cols:
    app_data_7[circle_col] = app_data_7[circle_col].astype(int)

## Handling Outliers and Binning

Now, we need to identify the outliers in our continuous variables. <br>
For this, we first need to identify the continuous variables in our dataset. So, let's look at the number of unique values in each variable.

In [None]:
# Number of unique values in each variable, sorted ascending

app_data_7.nunique().sort_values()

We can see here that the variables REGION_POPULATION_RELATIVE, AMT_GOODS_PRICE, AMT_INCOME_TOTAL, DAYS_LAST_PHONE_CHANGE, AMT_CREDIT, DAYS_ID_PUBLISH_YRS, DAYS_EMPLOYED_YRS, AMT_ANNUITY, DAYS_REGISTRATION_YRS and DAYS_BIRTH_YRS are continuous variables since their number of unique values is large. 

Let's try to find outliers in variables AMT_INCOME_TOTAL, AMT_CREDIT, DAYS_BIRTH_YRS.

In [None]:
# Box plot for AMT_INCOME_TOTAL
# The Series is passed as `x=` explicitly: newer seaborn versions treat the first
# positional argument as `data`, which would change the plot's orientation.

plt.figure(figsize = (18,5))
sns.boxplot(x = app_data_7['AMT_INCOME_TOTAL'])
plt.title('Box plot of AMT_INCOME_TOTAL')
plt.show()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of AMT_INCOME_TOTAL

app_data_7['AMT_INCOME_TOTAL'].describe()

We can see that one value is way higher than all other values. We can delete this value.

In [None]:
# Removing the outlier observation
# Keeps only the rows whose income is strictly below the current maximum.
# NOTE: this cell is not idempotent - every re-run removes the next-largest income
# as well, so run it exactly once after a fresh kernel start.

income_max = app_data_7['AMT_INCOME_TOTAL'].max()
# .copy() makes the result an independent frame, so the new columns added in later
# cells are written to it directly instead of to a slice of the old frame.
app_data_7 = app_data_7[app_data_7['AMT_INCOME_TOTAL'] < income_max].copy()

In [None]:
# Box plot for AMT_INCOME_TOTAL after removing the outlier
# The Series is passed as `x=` explicitly: newer seaborn versions treat the first
# positional argument as `data`, which would change the plot's orientation.

plt.figure(figsize = (18,5))
sns.boxplot(x = app_data_7['AMT_INCOME_TOTAL'])
plt.title('Box plot of AMT_INCOME_TOTAL')
plt.show()

Now, we can bin these values. For that, we first need to look at the distribution of the variable.

In [None]:
# Histogram of applicant income, with a log-scaled count axis

income_fig, income_ax = plt.subplots(figsize = (18,6))
income_ax.hist(app_data_7['AMT_INCOME_TOTAL'])
income_ax.set_yscale('log')
income_ax.set_xlabel("Applicant's Income", fontsize=12)
income_ax.set_title('Distribution of AMT_INCOME_TOTAL')
plt.show()

To know more, we can look at the percentile values of the Income variable.

In [None]:
# 25th, 50th, 75th, 90th, 95th and 99th percentiles of the Income variable

app_data_7['AMT_INCOME_TOTAL'].quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

We can see that our median is just around 1.5 lakh. So, our bins have to be more dense below 1.5 lakh and sparse beyond that. Let's create these bins.

In [None]:
# Bucket AMT_INCOME_TOTAL into labelled income bands (APPLICANT_INCOME)

income_edges = [0, 50000, 100000, 150000, 300000, 500000, 1000000, 2000000, 100000000]
income_labels = ['<50k', '50k - 1lac', '1lac - 1.5lac', '1.5lac - 3lac', '3lac - 5lac', '5lac - 10lac', '10lac - 20lac', '>20lac']
app_data_7['APPLICANT_INCOME'] = pd.cut(app_data_7['AMT_INCOME_TOTAL'], bins = income_edges, labels = income_labels)

Let's see the composition of our new variable APPLICANT_INCOME.

In [None]:
# Share of rows in each APPLICANT_INCOME band (sort = False leaves the result unsorted by frequency)

app_data_7['APPLICANT_INCOME'].value_counts(normalize = True, sort = False)

Now, let's see how our DAYS_BIRTH_YRS, which represents the age of the applicant, is distributed.

In [None]:
# Distribution of Applicant's age (DAYS_BIRTH_YRS)

plt.figure(figsize = (18,6))
plt.hist(app_data_7['DAYS_BIRTH_YRS'])
plt.xlabel("Age of Applicant",fontsize=12)
# The title now names the plotted variable; it previously read AMT_INCOME_TOTAL.
plt.title('Distribution of DAYS_BIRTH_YRS')
plt.show()

In [None]:
# Box plot of DAYS_BIRTH_YRS
# The Series is passed as `x=` explicitly: newer seaborn versions treat the first
# positional argument as `data`, which would change the plot's orientation.

plt.figure(figsize = (18,4))
sns.boxplot(x = app_data_7['DAYS_BIRTH_YRS'])
plt.title('Box plot of DAYS_BIRTH_YRS')
plt.show()

So, there are no outliers in the DAYS_BIRTH_YRS variable. But, we can still bin this variable as people of an age group tend to behave in a similar manner.

In [None]:
# Bucket DAYS_BIRTH_YRS into labelled age groups (APPLICANT_AGE)

age_edges = [0, 25, 40, 60, 80]
age_labels = ['<25 yrs', '25-40 yrs', '40-60 yrs', '>60 yrs']
app_data_7['APPLICANT_AGE'] = pd.cut(app_data_7['DAYS_BIRTH_YRS'], bins = age_edges, labels = age_labels)

In [None]:
# Percentage share of each APPLICANT_AGE group

age_group_share = app_data_7['APPLICANT_AGE'].value_counts(normalize = True)
100*age_group_share

In [None]:
# Box plot of AMT_CREDIT
# The Series is passed as `x=` explicitly: newer seaborn versions treat the first
# positional argument as `data`, which would change the plot's orientation.

plt.figure(figsize = (18,5))
sns.boxplot(x = app_data_7['AMT_CREDIT'])
plt.title('Box plot of AMT_CREDIT')
plt.show()

In [None]:
# 25th, 50th, 75th, 90th, 95th and 99th percentiles of the AMT_CREDIT variable

app_data_7['AMT_CREDIT'].quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

In [None]:
# Bucket AMT_CREDIT into labelled loan-size bands (LOAN_AMOUNT)

credit_edges = [0, 250000, 500000, 750000, 1000000, 1500000, 2000000, 5000000]
credit_labels = ['<2.5lac', '2.5lac - 5lac', '5lac - 7.5lac', '7.5lac - 10lac', '10lac - 15lac', '15lac - 20lac', '>20lac']
app_data_7['LOAN_AMOUNT'] = pd.cut(app_data_7['AMT_CREDIT'], bins = credit_edges, labels = credit_labels)

In [None]:
# Share of rows in each LOAN_AMOUNT band (sort = False leaves the result unsorted by frequency)

app_data_7['LOAN_AMOUNT'].value_counts(normalize = True, sort = False)

## Class Imbalance

We start our analysis by checking the imbalance in the data. Imbalance is the ratio of one value of Target variable vs the other.

In [None]:
# TARGET class distribution: first as proportions, then as absolute row counts

for as_proportion in (True, False):
    print(app_data_7['TARGET'].value_counts(normalize = as_proportion))

So, the data is highly imbalanced: TARGET = 0 accounts for 91.9114% of the rows and TARGET = 1 for only 8.0886%.

## Dividing dataset into 1 and 0

Now, we will divide the dataset into two parts - One with TARGET = 0 and one with TARGET = 1.

In [None]:
# Subset of the rows where TARGET equals 1

is_target_1 = app_data_7['TARGET'] == 1
app_data_target_1 = app_data_7.loc[is_target_1]
app_data_target_1

In [None]:
# Subset of the rows where TARGET equals 0

is_target_0 = app_data_7['TARGET'] == 0
app_data_target_0 = app_data_7.loc[is_target_0]
app_data_target_0

In [None]:
# Compare the TARGET value counts with the row counts of the two subsets

target_counts = app_data_7['TARGET'].value_counts()
rows_target_0 = app_data_target_0.shape[0]
rows_target_1 = app_data_target_1.shape[0]
print(target_counts,"\n",rows_target_0,"\n",rows_target_1)

We can confirm that the two new dataframes have correct respective TARGET value and right row count.

## Univariate Analysis

### Categorical Unordered Univariate Analysis

In [None]:
# Data types of every column, used to pick out the categorical variables

app_data_target_0.dtypes

So, our unordered categorical variables are NAME_CONTRACT_TYPE, CODE_GENDER, NAME_INCOME_TYPE, NAME_FAMILY_STATUS, NAME_HOUSING_TYPE, OCCUPATION_TYPE, ORGANIZATION_TYPE.

Let's look at these one by one for both Target = 0 and Target = 1.

In [None]:
# Plotting the CODE_GENDER variable
# Percentage share of each gender value, one panel per TARGET class.

fig = plt.figure(figsize = (18,6))
# The overall title belongs to the figure; plt.title() before any subplot exists
# would attach it to an implicit full-figure axes instead.
fig.suptitle('Applicant Gender')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

# Labels are set after plotting: the pandas bar plot writes the index name as the
# x label when the index has one, which would replace a label set beforehand.
(100*app_data_target_0['CODE_GENDER'].value_counts(normalize = True)).plot.bar(ax = ax1)
ax1.set_title('Percentage of Male/Female for Target = 0', fontsize = 10)
ax1.set_xlabel('Target = 0')
ax1.set_ylabel('Percentage')

(100*app_data_target_1['CODE_GENDER'].value_counts(normalize = True)).plot.bar(ax = ax2)
ax2.set_title('Percentage of Male/Female for Target = 1', fontsize = 10)
ax2.set_xlabel('Target = 1')
ax2.set_ylabel('Percentage')
plt.show()

In [None]:
# Absolute count of each CODE_GENDER value among the Target = 0 rows
app_data_target_0['CODE_GENDER'].value_counts(normalize = False)

In [None]:
# Keep only the Target = 0 rows whose gender is not 'XNA'

gender_known = app_data_target_0['CODE_GENDER'] != 'XNA'
app_data_target_0 = app_data_target_0[gender_known]

In [None]:
# Making the chart again
# Percentage share of each gender, one panel per TARGET class, on a shared y-axis.

fig = plt.figure(figsize = (18,6))
fig.suptitle('Applicant Gender')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, sharey = ax1)

# Labels are set after plotting: the pandas bar plot writes the index name as the
# x label when the index has one, which would replace a label set beforehand.
(100*app_data_target_0['CODE_GENDER'].value_counts(normalize = True)).plot.bar(ax = ax1)
ax1.set_xlabel('Target = 0')
ax1.set_ylabel('Percent')

(100*app_data_target_1['CODE_GENDER'].value_counts(normalize = True)).plot.bar(ax = ax2)
ax2.set_xlabel('Target = 1')
plt.show()

Here, we can see that Males are more likely to default on a loan.

In [None]:
# Gender proportions behind the chart: Target = 0 first, then Target = 1
for target_frame in (app_data_target_0, app_data_target_1):
    print(target_frame['CODE_GENDER'].value_counts(normalize = True))

In [None]:
# Plotting the NAME_CONTRACT_TYPE variable
# Percentage share of each loan type, one panel per TARGET class, on a shared y-axis.

fig = plt.figure(figsize = (18,6))
fig.suptitle('Loan Type')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, sharey = ax1)

# Labels are set after plotting: the pandas bar plot writes the index name as the
# x label when the index has one, which would replace a label set beforehand.
(100*app_data_target_0['NAME_CONTRACT_TYPE'].value_counts(normalize = True)).plot.bar(ax = ax1)
ax1.set_xlabel('Target = 0')
ax1.set_ylabel('Percent')

(100*app_data_target_1['NAME_CONTRACT_TYPE'].value_counts(normalize = True)).plot.bar(ax = ax2)
ax2.set_xlabel('Target = 1')
plt.show()

So, Revolving loans are less likely to default.

In [None]:
# Loan-type proportions behind the chart: Target = 0 first, then Target = 1
for target_frame in (app_data_target_0, app_data_target_1):
    print(target_frame['NAME_CONTRACT_TYPE'].value_counts(normalize = True))

In [None]:
# Plotting NAME_INCOME_TYPE variable
# Percentage share of each income type, one panel per TARGET class, on a shared y-axis.

fig = plt.figure(figsize = (18,6))
fig.suptitle('Income Type')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, sharey = ax1)

# Labels are set after plotting: the pandas bar plot writes the index name as the
# x label when the index has one, which would replace a label set beforehand.
(100*app_data_target_0['NAME_INCOME_TYPE'].value_counts(normalize = True)).plot.bar(ax = ax1)
ax1.set_xlabel('Target = 0')
ax1.set_ylabel('Percent')

(100*app_data_target_1['NAME_INCOME_TYPE'].value_counts(normalize = True)).plot.bar(ax = ax2)
ax2.set_xlabel('Target = 1')
plt.show()

In [None]:
# Income-type proportions behind the chart: Target = 0 first, then Target = 1
for target_frame in (app_data_target_0, app_data_target_1):
    print(target_frame['NAME_INCOME_TYPE'].value_counts(normalize = True))

In [None]:
# Plotting NAME_FAMILY_STATUS variable
# Percentage share of each family status, one panel per TARGET class, on a shared y-axis.

fig = plt.figure(figsize = (18,6))
fig.suptitle('Family Status')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, sharey = ax1)

# Labels are set after plotting: the pandas bar plot writes the index name as the
# x label when the index has one, which would replace a label set beforehand.
(100*app_data_target_0['NAME_FAMILY_STATUS'].value_counts(normalize = True)).plot.bar(ax = ax1)
ax1.set_xlabel('Target = 0')
ax1.set_ylabel('Percent')

(100*app_data_target_1['NAME_FAMILY_STATUS'].value_counts(normalize = True)).plot.bar(ax = ax2)
ax2.set_xlabel('Target = 1')
plt.show()

Married people are less likely to default while Single and Civil married people are more likely to default.

In [None]:
# Family-status proportions behind the chart: Target = 0 first, then Target = 1
for target_frame in (app_data_target_0, app_data_target_1):
    print(target_frame['NAME_FAMILY_STATUS'].value_counts(normalize = True))

In [None]:
# Plotting NAME_HOUSING_TYPE variable
# Percentage share of each housing type, one panel per TARGET class, on a shared y-axis.

fig = plt.figure(figsize = (18,6))
fig.suptitle('Housing Type')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, sharey = ax1)

# Labels are set after plotting: the pandas bar plot writes the index name as the
# x label when the index has one, which would replace a label set beforehand.
(100*app_data_target_0['NAME_HOUSING_TYPE'].value_counts(normalize = True)).plot.bar(ax = ax1)
ax1.set_xlabel('Target = 0')
ax1.set_ylabel('Percent')

(100*app_data_target_1['NAME_HOUSING_TYPE'].value_counts(normalize = True)).plot.bar(ax = ax2)
ax2.set_xlabel('Target = 1')
plt.show()

People living with parents or in a rented apartment are more likely to default.

In [None]:
# Housing-type proportions behind the chart: Target = 0 first, then Target = 1
for target_frame in (app_data_target_0, app_data_target_1):
    print(target_frame['NAME_HOUSING_TYPE'].value_counts(normalize = True))

In [None]:
# Plotting OCCUPATION_TYPE variable
# Percentage share of each occupation, one panel per TARGET class, on a shared y-axis.

# A fresh figure is created (no hard-coded figure number) so the figsize always
# applies and no axes from a still-open figure 1 are carried over.
fig = plt.figure(figsize = (18,6))
fig.suptitle('Occupation Type')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, sharey = ax1)

# Labels are set after plotting: the pandas bar plot writes the index name as the
# x label when the index has one, which would replace a label set beforehand.
(100*app_data_target_0['OCCUPATION_TYPE'].value_counts(normalize = True)).plot.bar(ax = ax1)
ax1.set_xlabel('Target = 0')
ax1.set_ylabel('Percent')

(100*app_data_target_1['OCCUPATION_TYPE'].value_counts(normalize = True)).plot.bar(ax = ax2)
ax2.set_xlabel('Target = 1')
plt.show()

Laborers, Sales staff, Drivers are more likely to default.

In [None]:
# Occupation proportions behind the chart: Target = 0 first, then Target = 1
for target_frame in (app_data_target_0, app_data_target_1):
    print(target_frame['OCCUPATION_TYPE'].value_counts(normalize = True))

In [None]:
# Plotting ORGANIZATION_TYPE variable
# Percentage share of each organization type, one panel per TARGET class, on a shared y-axis.

# A fresh figure is created (no hard-coded figure number) so the figsize always
# applies and no axes from a still-open figure 1 are carried over.
fig = plt.figure(figsize = (18,6))
# The title now names the plotted variable; it previously read 'Income Type'.
fig.suptitle('Organization Type')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, sharey = ax1)

# Labels are set after plotting: the pandas bar plot writes the index name as the
# x label when the index has one, which would replace a label set beforehand.
# The small tick font is passed to the plot call so it applies to the drawn ticks.
(100*app_data_target_0['ORGANIZATION_TYPE'].value_counts(normalize = True)).plot.bar(ax = ax1, fontsize = 7)
ax1.set_xlabel('Target = 0')
ax1.set_ylabel('Percent')

(100*app_data_target_1['ORGANIZATION_TYPE'].value_counts(normalize = True)).plot.bar(ax = ax2, fontsize = 7)
ax2.set_xlabel('Target = 1')
plt.show()

In [None]:
# Organization-type proportions behind the chart: Target = 0 first, then Target = 1
for target_frame in (app_data_target_0, app_data_target_1):
    print(target_frame['ORGANIZATION_TYPE'].value_counts(normalize = True))

### Categorical Ordered Univariate Analysis

Let's first identify the Ordered Categorical variables in our dataset. 

In [None]:
# Data types of every column, used to pick out the ordered categorical variables
app_data_target_0.dtypes

The variables FLAG_OWN_CAR, FLAG_OWN_REALTY, NAME_EDUCATION_TYPE, APPLICANT_INCOME, APPLICANT_AGE, LOAN_AMOUNT are all Ordered Categorical variables. Let's look at these one by one.

In [None]:
# Pie charts of car ownership (FLAG_OWN_CAR), one per TARGET class

fig = plt.figure(figsize = (18,5))
fig.suptitle('Does the Applicant own a Car?')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

car_share_0 = 100*app_data_target_0['FLAG_OWN_CAR'].value_counts(normalize = True)
ax1.set_xlabel('Target = 0')
car_share_0.plot.pie(ax = ax1)

car_share_1 = 100*app_data_target_1['FLAG_OWN_CAR'].value_counts(normalize = True)
ax2.set_xlabel('Target = 1')
car_share_1.plot.pie(ax = ax2)

People who do not own a car are more likely to default.

In [None]:
# Pie charts of property ownership (FLAG_OWN_REALTY), one per TARGET class

fig = plt.figure(figsize = (18,6))
fig.suptitle('Does the Applicant own a House?')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

realty_share_0 = 100*app_data_target_0['FLAG_OWN_REALTY'].value_counts(normalize = True)
ax1.set_xlabel('Target = 0')
realty_share_0.plot.pie(ax = ax1)

realty_share_1 = 100*app_data_target_1['FLAG_OWN_REALTY'].value_counts(normalize = True)
ax2.set_xlabel('Target = 1')
realty_share_1.plot.pie(ax = ax2)

People who do not own a house are more likely to default.

In [None]:
# Plotting the NAME_EDUCATION_TYPE
# Percentage share of each education level, one panel per TARGET class.

# A fresh figure is created (no hard-coded figure number) so the figsize always
# applies and no axes from a still-open figure 1 are carried over.
fig = plt.figure(figsize = (18,6))
fig.suptitle('Education level of Applicant')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

# Labels are set after plotting: the pandas bar plot writes the index name as the
# x label when the index has one, which would replace a label set beforehand.
(100*app_data_target_0['NAME_EDUCATION_TYPE'].value_counts(normalize = True)).plot.bar(ax = ax1)
ax1.set_xlabel('Target = 0')
ax1.set_ylabel('Percent')

(100*app_data_target_1['NAME_EDUCATION_TYPE'].value_counts(normalize = True)).plot.bar(ax = ax2)
ax2.set_xlabel('Target = 1')
ax2.set_ylabel('Percent')
plt.show()

People with a higher level of education are less likely to default.

In [None]:
# Plotting the APPLICANT_INCOME
# Percentage share of each income band, one panel per TARGET class.

# A fresh figure is created (no hard-coded figure number) so the figsize always
# applies and no axes from a still-open figure 1 are carried over.
fig = plt.figure(figsize = (18,6))
fig.suptitle('What is the income of the Applicant?')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

# Labels are set after plotting: the pandas bar plot writes the index name as the
# x label when the index has one, which would replace a label set beforehand.
(100*app_data_target_0['APPLICANT_INCOME'].value_counts(normalize = True, sort = False)).plot.bar(ax = ax1)
ax1.set_xlabel('Target = 0')
ax1.set_ylabel('Percent')

(100*app_data_target_1['APPLICANT_INCOME'].value_counts(normalize = True, sort = False)).plot.bar(ax = ax2)
ax2.set_xlabel('Target = 1')
ax2.set_ylabel('Percent')
plt.show()

In [None]:
# Plotting the APPLICANT_AGE
# Percentage share of each age group, one panel per TARGET class.

# A fresh figure is created (no hard-coded figure number) so the figsize always
# applies and no axes from a still-open figure 1 are carried over.
fig = plt.figure(figsize = (18,6))
fig.suptitle('What is the age of the Applicant?')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

# Labels are set after plotting: the pandas bar plot writes the index name as the
# x label when the index has one, which would replace a label set beforehand.
(100*app_data_target_0['APPLICANT_AGE'].value_counts(normalize = True, sort = False)).plot.bar(ax = ax1)
ax1.set_xlabel('Target = 0')
ax1.set_ylabel('Percent')

(100*app_data_target_1['APPLICANT_AGE'].value_counts(normalize = True, sort = False)).plot.bar(ax = ax2)
ax2.set_xlabel('Target = 1')
ax2.set_ylabel('Percent')
plt.show()

In [None]:
# Plotting the LOAN_AMOUNT
# Percentage share of each loan-size band, one panel per TARGET class.

# A fresh figure is created (no hard-coded figure number) so the figsize always
# applies and no axes from a still-open figure 1 are carried over.
fig = plt.figure(figsize = (18,6))
fig.suptitle('What is the loan amount?')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

# Labels are set after plotting: the pandas bar plot writes the index name as the
# x label when the index has one, which would replace a label set beforehand.
(100*app_data_target_0['LOAN_AMOUNT'].value_counts(normalize = True, sort = False)).plot.bar(ax = ax1)
ax1.set_xlabel('Target = 0')
ax1.set_ylabel('Percent')

(100*app_data_target_1['LOAN_AMOUNT'].value_counts(normalize = True, sort = False)).plot.bar(ax = ax2)
ax2.set_xlabel('Target = 1')
ax2.set_ylabel('Percent')
plt.show()

### Numerical variables

Let's see the statistical summary of our DataFrames. 

In [None]:
# Statistical summary of the numerical columns for Target = 0

app_data_target_0.describe()

In [None]:
# Statistical summary of the numerical columns for Target = 1

app_data_target_1.describe()

Now, let's plot the box plots of the variables CNT_CHILDREN, CNT_FAM_MEMBERS, REGION_RATING_CLIENT, DEF_30_CNT_SOCIAL_CIRCLE, DEF_60_CNT_SOCIAL_CIRCLE, NUM_DOCS_ADDED, DAYS_LAST_PHONE_CHANGE.

In [None]:
# CNT_CHILDREN box plots, one per TARGET class, on a shared y-axis

fig = plt.figure(num = 1, figsize = (18,8))
fig.suptitle('Number of Children')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, sharey = ax1)

ax1.set_xlabel('Target = 0')
app_data_target_0['CNT_CHILDREN'].plot.box(ax = ax1)

ax2.set_xlabel('Target = 1')
app_data_target_1['CNT_CHILDREN'].plot.box(ax = ax2)

In [None]:
# CNT_FAM_MEMBERS box plots, one per TARGET class, on a shared y-axis

fig = plt.figure(num = 1, figsize = (18,8))
fig.suptitle('Number of Family Members')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, sharey = ax1)

ax1.set_xlabel('Target = 0')
app_data_target_0['CNT_FAM_MEMBERS'].plot.box(ax = ax1)

ax2.set_xlabel('Target = 1')
app_data_target_1['CNT_FAM_MEMBERS'].plot.box(ax = ax2)

In [None]:
# REGION_RATING_CLIENT box plots, one per TARGET class, on a shared y-axis

fig = plt.figure(num = 1, figsize = (18,8))
fig.suptitle('Rating of Applicant\'s Region')
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, sharey = ax1)

ax1.set_xlabel('Target = 0')
app_data_target_0['REGION_RATING_CLIENT'].plot.box(ax = ax1)

ax2.set_xlabel('Target = 1')
app_data_target_1['REGION_RATING_CLIENT'].plot.box(ax = ax2)

In [None]:
# Box plots of FLAG_OWN_CAR vs AMT_ANNUITY
# AMT_ANNUITY (log scale) split by car ownership, one panel per TARGET class.

fig = plt.figure(figsize = (18,8))
fig.suptitle("Target=0 vs Target=1")
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, sharey = ax1)

# Each panel gets a title naming its TARGET class; the x label is left to seaborn,
# which sets it to the plotted column name.
sns.boxplot(x='FLAG_OWN_CAR', y='AMT_ANNUITY', data=app_data_target_0, ax=ax1)
ax1.set_yscale('log')
ax1.set_title('Target = 0')

sns.boxplot(x='FLAG_OWN_CAR', y='AMT_ANNUITY', data=app_data_target_1, ax=ax2)
ax2.set_yscale('log')
ax2.set_title('Target = 1')
plt.show()

In [None]:
# Box plots of APPLICANT_INCOME vs AMT_ANNUITY
# AMT_ANNUITY (log scale) split by income band, one panel per TARGET class.

fig = plt.figure(figsize = (18,8))
fig.suptitle("Target=0 vs Target=1")
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, sharey = ax1)

# Each panel gets a title naming its TARGET class; the x label is left to seaborn,
# which sets it to the plotted column name.
sns.boxplot(x="APPLICANT_INCOME", y='AMT_ANNUITY', data=app_data_target_0, ax=ax1)
ax1.set_yscale('log')
ax1.set_title('Target = 0')

sns.boxplot(x="APPLICANT_INCOME", y='AMT_ANNUITY', data=app_data_target_1, ax=ax2)
ax2.set_yscale('log')
ax2.set_title('Target = 1')
plt.show()

In [None]:
# Strip plots of AMT_GOODS_PRICE vs LOAN_AMOUNT, one panel per TARGET value

fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (18,6), sharey = True)
fig.suptitle("Target=0 vs Target=1")

# Draw on explicit axes: passing an Axes object to plt.subplot() is rejected
# by recent matplotlib releases.
sns.stripplot(x='AMT_GOODS_PRICE', y="LOAN_AMOUNT", data=app_data_target_0, ax=ax1)
ax1.set_title('Target = 0')

sns.stripplot(x='AMT_GOODS_PRICE', y="LOAN_AMOUNT", data=app_data_target_1, ax=ax2)
ax2.set_title('Target = 1')

plt.show()

In [None]:
# Strip plots of AMT_GOODS_PRICE vs APPLICANT_INCOME, one panel per TARGET value

fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (18,6), sharey = True)
fig.suptitle("Target=0 vs Target=1")

# Draw on explicit axes: passing an Axes object to plt.subplot() is rejected
# by recent matplotlib releases.
sns.stripplot(x='AMT_GOODS_PRICE', y="APPLICANT_INCOME", data=app_data_target_0, ax=ax1)
ax1.set_title('Target = 0')

sns.stripplot(x="AMT_GOODS_PRICE", y="APPLICANT_INCOME", data=app_data_target_1, ax=ax2)
ax2.set_title('Target = 1')

plt.show()


In [None]:
# Box plots of DAYS_LAST_PHONE_CHANGE, one panel per TARGET value (shared y-axis)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (18,8), sharey = True)
# The previous title was copied from the social-circle plots and did not
# describe this variable.
fig.suptitle('Days Since Last Phone Change')

# Draw on explicit axes: passing an Axes object to plt.subplot() is rejected
# by recent matplotlib releases.
app_data_target_0['DAYS_LAST_PHONE_CHANGE'].plot.box(ax = ax1)
ax1.set_xlabel('Target = 0')

app_data_target_1['DAYS_LAST_PHONE_CHANGE'].plot.box(ax = ax2)
ax2.set_xlabel('Target = 1')

plt.show()

## Bivariate Analysis

### Numeric - Numeric Analysis

Let's first see which numerical columns we have.

In [None]:
# Checking dtypes again: list the dtype of every column in app_data_target_0
# so the numerical columns can be picked out for the numeric-numeric analysis.

app_data_target_0.dtypes

So, the numerical columns are AMT_INCOME_TOTAL, AMT_CREDIT, AMT_ANNUITY, AMT_GOODS_PRICE, REGION_POPULATION_RELATIVE, AMT_REQ_CREDIT_BUREAU_WEEK, AMT_REQ_CREDIT_BUREAU_MON, AMT_REQ_CREDIT_BUREAU_YEAR, DAYS_BIRTH_YRS, DAYS_EMPLOYED_YRS, DAYS_REGISTRATION_YRS.

Let's look at the pairwise relations between income, loan amount, annuity, goods price and yearly credit-bureau enquiries, starting with Target = 0.

In [None]:
# Pairwise scatter plots / distributions of the amount columns for Target = 0
sns.pairplot(
    app_data_target_0,
    vars = [
        'AMT_INCOME_TOTAL',
        'AMT_CREDIT',
        'AMT_ANNUITY',
        'AMT_GOODS_PRICE',
        'AMT_REQ_CREDIT_BUREAU_YEAR',
    ],
)
plt.show()

Let's look at the same variables for Target = 1.

In [None]:
# Pair plot of the same variables for Target = 1.
# This cell previously re-plotted app_data_target_0, contradicting the
# markdown that introduces it.
sns.pairplot(data = app_data_target_1, vars = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'AMT_REQ_CREDIT_BUREAU_YEAR'])
plt.show()

In [None]:
# Pairwise scatter plots / distributions of the amount columns for Target = 1
sns.pairplot(
    app_data_target_1,
    vars = [
        'AMT_INCOME_TOTAL',
        'AMT_CREDIT',
        'AMT_ANNUITY',
        'AMT_GOODS_PRICE',
        'AMT_REQ_CREDIT_BUREAU_YEAR',
    ],
)
plt.show()

Let's see the correlation between AMT_CREDIT, AMT_ANNUITY, AMT_INCOME_TOTAL, CNT_CHILDREN, DAYS_EMPLOYED_YRS, DAYS_BIRTH_YRS

In [None]:
# Correlation matrix of selected numeric columns for Target = 0, shown as a heatmap

curr_0 = app_data_target_0.loc[:, [
    'AMT_CREDIT',
    'AMT_ANNUITY',
    'AMT_INCOME_TOTAL',
    'CNT_CHILDREN',
    'DAYS_EMPLOYED_YRS',
    'DAYS_BIRTH_YRS',
]]
cor_0 = curr_0.corr()

sns.heatmap(data = cor_0, annot = True, cmap = "YlGnBu")
plt.show()

In [None]:
# Correlation matrix of selected numeric columns for Target = 1, shown as a heatmap

curr_1 = app_data_target_1.loc[:, [
    'AMT_CREDIT',
    'AMT_ANNUITY',
    'AMT_INCOME_TOTAL',
    'CNT_CHILDREN',
    'DAYS_EMPLOYED_YRS',
    'DAYS_BIRTH_YRS',
]]
cor_1 = curr_1.corr()

sns.heatmap(data = cor_1, annot = True, cmap = "YlGnBu")
plt.show()

## EDA on Previous Application Data

Let's repeat all the processes done with the Application Data file.

In [None]:
# Percentage of missing values per column of prev_application, stored as a
# two-column DataFrame: 'index' holds the column name, 0 holds the percentage.

prev_app_data_missing = pd.DataFrame(
    100 * prev_application.isnull().sum() / len(prev_application)
).reset_index()

In [None]:
# Line chart of the percentage of missing values per column

fig, ax = plt.subplots(figsize = (18,5))
ax.plot(prev_app_data_missing['index'], prev_app_data_missing[0])
ax.tick_params(axis = 'x', labelrotation = 90, labelsize = 7)
ax.set_title('Percentage of Missing Values in each column of Previous Application Data', fontsize = 14)
ax.set_xlabel('Columns', fontsize = 10)
ax.set_ylabel('Percentage Missing', fontsize = 10)
# No extra plt.figure() call here: it only opened a second, empty figure.
plt.show()

In [None]:
# Displaying the Missing percentage values as a DataFrame
# ('index' = column name, 0 = percentage of rows where that column is null)

prev_app_data_missing

In [None]:
# Names of the columns in which more than 45% of the values are missing

miss_cols_prev_app_data = prev_app_data_missing.loc[
    prev_app_data_missing[0].gt(45), 'index'
].tolist()
print(len(miss_cols_prev_app_data))

In [None]:
# Drop the mostly-missing columns from the Previous Application data.
# NOTE: mutates prev_application in place, so re-running this cell alone
# raises KeyError because the columns are already gone.

prev_application.drop(columns = miss_cols_prev_app_data, inplace = True)

In [None]:
# Percentage of missing values in each remaining column of prev_application

print(prev_application.isnull().sum() * 100 / len(prev_application))

In [None]:
# Keep only the rows where PRODUCT_COMBINATION is present (independent copy)

prev_app_data_1 = prev_application.loc[prev_application['PRODUCT_COMBINATION'].notnull()].copy()

In [None]:
len(prev_app_data_1)

This represents 99.97% of the original data.

In [None]:
# Median of DAYS_LAST_DUE (pandas skips missing values by default)
prev_app_data_1['DAYS_LAST_DUE'].median()

In [None]:
# Imputing missing values with each column's median.
#
# The result is assigned back to the DataFrame instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on
# a temporary Series and does not update the DataFrame under pandas
# Copy-on-Write, and the warning is hidden by the notebook's warning filter.

median_impute_cols = [
    'AMT_ANNUITY',
    'AMT_GOODS_PRICE',
    'CNT_PAYMENT',
    'DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION',
    'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
    'NFLAG_INSURED_ON_APPROVAL',
]

# DataFrame.fillna with a Series fills each column with the value whose
# index label matches that column name (here: the column's own median).
prev_app_data_1[median_impute_cols] = prev_app_data_1[median_impute_cols].fillna(
    prev_app_data_1[median_impute_cols].median()
)

In [None]:
# Percentage of missing values in each column of prev_app_data_1

print(prev_app_data_1.isnull().sum() * 100 / len(prev_app_data_1))

All missing values have been imputed.

In [None]:
# Flip the sign of DAYS_FIRST_DUE and express it in years (days / 365).
# NOTE: the column is overwritten, so re-running this cell applies the
# conversion a second time.

prev_app_data_1['DAYS_FIRST_DUE'] = ((-1.0) * prev_app_data_1['DAYS_FIRST_DUE']) / 365

In [None]:
# Checking DataTypes of Previous Application Data
# (lists the dtype of every column currently in prev_app_data_1)

prev_app_data_1.dtypes

In [None]:
# Drop the columns not used in the rest of the analysis.
# NOTE: mutates prev_app_data_1 in place, so re-running this cell alone
# raises KeyError because the columns are already gone.

prev_app_data_1.drop(
    columns = [
        'WEEKDAY_APPR_PROCESS_START',
        'HOUR_APPR_PROCESS_START',
        'FLAG_LAST_APPL_PER_CONTRACT',
        'NFLAG_LAST_APPL_IN_DAY',
    ],
    inplace = True,
)

The Data Types of all remaining variables are correct.

In [None]:
# Unique values in each variable
# (number of distinct non-null values per column, sorted ascending)

prev_app_data_1.nunique().sort_values()

In [None]:
# Horizontal box plot for AMT_ANNUITY

fig, ax = plt.subplots(figsize = (18, 4))
# Pass the Series as `x=`: newer seaborn releases treat the first positional
# argument as `data`, so the positional form is not stable across versions.
sns.boxplot(x = prev_app_data_1['AMT_ANNUITY'], ax = ax)
plt.show()

In [None]:
# Different percentiles of the AMT_ANNUITY variable
# (25th, 50th, 75th, 90th, 95th and 99th)

prev_app_data_1['AMT_ANNUITY'].quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

In [None]:
# Creating bins for AMT_ANNUITY
#
# pd.cut uses right-closed intervals by default, so the first bin (0, 5000]
# would exclude a value of exactly 0; include_lowest=True keeps it in '<5k'.
# The last bin is open-ended (np.inf) so that any value above 50k is labelled
# '>50k' instead of silently becoming NaN.

prev_app_data_1['LOAN_INSTALMENT'] = pd.cut(x=prev_app_data_1['AMT_ANNUITY'],
                                    bins=[0, 5000, 10000, 20000, 30000, 40000, 50000, np.inf],
                                    labels=['<5k', '5k - 10k', '10k - 20k', '20k - 30k', '30k - 40k', '40k - 50k', '>50k'],
                                    include_lowest=True)

In [None]:
# Column names of prev_app_data_1 at this point in the notebook
prev_app_data_1.columns

### Univariate Analysis

In [None]:
# Strip plot of DAYS_TERMINATION for each contract status
fig, ax = plt.subplots()
fig.suptitle("DAYS_TERMINATION Vs Contract Status")
sns.stripplot(x='DAYS_TERMINATION', y="NAME_CONTRACT_STATUS", data=prev_app_data_1, ax=ax)
# plt.show() renders the figure; the former plt.plot() call drew nothing.
plt.show()

In [None]:
# Strip plot of AMT_APPLICATION for each contract status
fig, ax = plt.subplots()
fig.suptitle("Amount of Loan asked Vs Contract Status")
sns.stripplot(x='AMT_APPLICATION', y="NAME_CONTRACT_STATUS", data=prev_app_data_1, ax=ax)
# plt.show() renders the figure; the former plt.plot() call drew nothing.
plt.show()

In [None]:
# Strip plot of DAYS_DECISION for each contract status
fig, ax = plt.subplots()
fig.suptitle("when was the decision about previous application made Vs Contract Status")
sns.stripplot(x= "DAYS_DECISION", y="NAME_CONTRACT_STATUS", data=prev_app_data_1, ax=ax)
# plt.show() renders the figure; the former plt.plot() call drew nothing.
plt.show()

In [None]:
# Share of each NAME_CONTRACT_STATUS value, as a horizontal bar chart
prev_app_data_1['NAME_CONTRACT_STATUS'].value_counts(normalize=True).plot(kind='barh')
plt.show()

In [None]:
# Share of each NAME_PORTFOLIO value, as a horizontal bar chart
prev_app_data_1['NAME_PORTFOLIO'].value_counts(normalize=True).plot(kind='barh')
plt.show()