In [None]:
# Importing required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Ignore warning messages
# NOTE(review): this blanket filter silences every warning category for the rest of
# the notebook, including deprecation warnings raised by pandas / seaborn calls below.

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Display settings for rows and columns to get a complete view

# Show every column (None removes the limit) and up to 500 rows when a frame is displayed.
# Both options are set through the public `pd.set_option`; `pd.pandas` is not a
# documented part of the pandas namespace.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

In [None]:
# Importing dataset application_data.csv into dataframe 'inp'
# The path is relative to the notebook's working directory; `inp` is the frame
# that the cells below inspect, clean and plot.

inp = pd.read_csv('../input/loanapplicationdata/application_data.csv')

In [None]:
# Displaying the first 5 rows of the dataframe to get a feel for the columns and values

inp.head()

In [None]:
# Inspecting datatype of columns in dataframe
# verbose=True asks for the full per-column summary.

inp.info(verbose=True)

In [None]:
# Checking no. of rows and columns in the dataframe inp -> (rows, columns)

inp.shape

In [None]:
# Describing the numerical columns of the dataframe (summary statistics per column)

inp.describe()

### Find percentage of missing values for all the columns

In [None]:
# Function to compute the null value percent for a column (of the inp dataframe by default)

def nullvalcnt(y, df=None):
    """Return the percentage of missing values in column ``y``.

    Parameters
    ----------
    y : label
        Name of the column to inspect.
    df : pandas.DataFrame, optional
        Frame that holds the column. When omitted, the notebook-level ``inp``
        frame is used, so existing ``nullvalcnt(column)`` calls keep working.

    Returns
    -------
    float
        Share of null entries in the column on a 0-100 scale.
    """
    frame = inp if df is None else df
    column = frame[y]
    # null count / row count, expressed as a percentage
    return float(column.isnull().sum() / column.shape[0] * 100)

In [None]:
# Displaying missing value percentage for all columns in inp dataframe

# One "<column>: <percent>" line per column, followed by the number of columns listed.
all_columns = list(inp.columns)
for column in all_columns:
    missing_pct = nullvalcnt(column)
    print("{column_name}: {percent:.2f}".format(column_name=column, percent=missing_pct))
cnt = len(all_columns)
print('Total no. of Columns:', cnt)

In [None]:
#Dropping the columns with missing value percentage greater than or equal to 50%

# Decide on every column first, then drop them all in a single call instead of
# mutating the frame once per column while looping over its columns.
cols_to_drop = [column for column in inp.columns if nullvalcnt(column) >= 50]
inp = inp.drop(columns=cols_to_drop)
cnt = len(cols_to_drop)
print("No. of columns dropped, with more than or equal to 50% missing values: ",cnt)

In [None]:
#Displaying columns with missing value percentage around 13%

# "Around 13%" is taken as the closed interval [11, 14] percent.
cnt = 0
for column in inp.columns:
    missing_pct = nullvalcnt(column)
    if 11 <= missing_pct <= 14:
        print("{column_name}: {percent:.2f}".format(column_name=column, percent=missing_pct))
        cnt += 1
print("No. of columns with around 13% missing values: ",cnt)

## Finding values which can be imputed in columns having around 13% missing values

In [None]:
# Finding out the description of the data in columns with missing value percentage around 13%

for column in inp.columns:
    # skip everything outside the 11-14 percent band
    if not (11 <= nullvalcnt(column) <= 14):
        continue
    column_summary = inp[column].describe()
    print("{describe}".format(describe=column_summary))

In [None]:
# Plotting graph to get a visual of values distribution to make choice for imputation

# One bar chart per column in the 11-14 percent band: share (in %) of each distinct value.
for column in inp.columns:
    if not (11 <= nullvalcnt(column) <= 14):
        continue
    plt.title(column)
    value_share = inp[column].value_counts(normalize=True) * 100
    value_share.plot.bar()
    plt.show()

## Imputation Metric 
- It can be observed from the graphs above that in the 5 columns AMT_REQ_CREDIT_BUREAU_HOUR, AMT_REQ_CREDIT_BUREAU_DAY, AMT_REQ_CREDIT_BUREAU_WEEK, AMT_REQ_CREDIT_BUREAU_MON and AMT_REQ_CREDIT_BUREAU_QRT the null values can be imputed with zero, since the value is 0 up to and including the 75th percentile.
- In the column AMT_REQ_CREDIT_BUREAU_YEAR we can impute the missing values with median i.e. 1

In [None]:
# Finding the median of the column 'AMT_REQ_CREDIT_BUREAU_YEAR'
# (the value proposed above for filling this column's missing entries)

inp['AMT_REQ_CREDIT_BUREAU_YEAR'].median()

## Changing datatype of few columns

In [None]:
# Changing the values Y N with 1 0 respectively for columns - 'FLAG_OWN_CAR' and 'FLAG_OWN_REALTY'

# 'Y' -> 1 and anything else -> 0, as before. An already-encoded 1 is accepted as well,
# so re-running this cell keeps the flags instead of resetting every row to 0.
for flag_col in ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
    inp[flag_col] = inp[flag_col].isin(['Y', 1]).astype('int64')

In [None]:
# Inspecting the datatype of the columns - 'FLAG_OWN_CAR' and 'FLAG_OWN_REALTY'
# (confirms the result of the Y/N -> 1/0 conversion above)

inp[['FLAG_OWN_CAR','FLAG_OWN_REALTY']].info()

In [None]:
# Changing the negative age and date values to absolute values for columns - DAYS_BIRTH, DAYS_EMPLOYED, DAYS_REGISTRATION, DAYS_ID_PUBLISH
# Only DAYS_BIRTH is additionally converted to whole years; the other three stay in days.

# Converting DAYS_BIRTH to year and storing it in the same column.
# The raw values are presumably negative day counts (see the comment above); once they
# have been converted they are non-negative, so the check below keeps a re-run of this
# cell from dividing the ages by 365 a second time.
if (inp['DAYS_BIRTH'] < 0).any():
    inp['DAYS_BIRTH'] = (inp['DAYS_BIRTH'].abs() / 365).astype('int64')

inp['DAYS_EMPLOYED'] = inp['DAYS_EMPLOYED'].abs()
inp['DAYS_REGISTRATION'] = inp['DAYS_REGISTRATION'].abs().astype('int64')
inp['DAYS_ID_PUBLISH'] = inp['DAYS_ID_PUBLISH'].abs()

In [None]:
# Inspecting the datatype of the above columns after the conversion

inp[['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH']].info()

In [None]:
# Following are the counts which are supposed to be integer values hence converting the datatype of the following columns as well

# 'Int64' (capital I) is pandas' nullable integer dtype.
for count_col in ('OBS_30_CNT_SOCIAL_CIRCLE',
                  'DEF_30_CNT_SOCIAL_CIRCLE',
                  'OBS_60_CNT_SOCIAL_CIRCLE',
                  'DEF_60_CNT_SOCIAL_CIRCLE'):
    inp[count_col] = inp[count_col].astype('Int64')


In [None]:
# Inspecting the datatype of the above columns after the cast

inp[['OBS_30_CNT_SOCIAL_CIRCLE','DEF_30_CNT_SOCIAL_CIRCLE','OBS_60_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE']].info()

### Numerical Columns for Outlier Detection - 

- DAYS_BIRTH: Client's age at the time of application (originally in days; converted to whole years above and stored in the same column)
- AMT_INCOME_TOTAL: Income of the client
- AMT_CREDIT: Credit amount of the loan
- AMT_GOODS_PRICE: For consumer loans it is the price of the goods for which the loan is given
- AMT_ANNUITY: Loan annuity / regular EMI to the bank

## Age

In [None]:
# Plotting the boxplot for DAYS_BIRTH variable to check presence of outliers / anomalies

# Pass the column as `x=` explicitly: in seaborn >= 0.12 the first positional
# argument is `data`, not `x`.
sns.boxplot(x=inp.DAYS_BIRTH)
plt.show()

- For the age variable, we didn't find any outliers in the dataset

In [None]:
# Histogram of DAYS_BIRTH split into 5 equal-width bins
age_values = inp['DAYS_BIRTH']
plt.hist(age_values, bins=5)
plt.show()

- Clients aged between 30 and 50 years apply for loans the most

## Client Income

In [None]:
# Boxplot to check presence of outliers in AMT_INCOME_TOTAL column

# Pass the column as `x=` explicitly: in seaborn >= 0.12 the first positional
# argument is `data`, not `x`.
sns.boxplot(x=inp.AMT_INCOME_TOTAL)
plt.show()

In [None]:
# Trying to get a better visual of value distribution
# Salary below 250000

# Restrict to incomes under 250000 so the box is not squashed by the extreme values.
# `x=` is explicit because seaborn >= 0.12 treats the first positional argument as `data`.
income_below_250k = inp.loc[inp.AMT_INCOME_TOTAL < 250000, 'AMT_INCOME_TOTAL']
sns.boxplot(x=income_below_250k)
plt.show()

- Since there are many outliers in the Salary variable, we'll prefer binning them into 'Low', 'Below Average','Above Average','High' categories which is done further below in the analysis

## Credit Amount

In [None]:
# Plotting boxplot to check outliers in AMT_CREDIT variable

plt.figure(figsize=(8,4))
# `x=` is explicit because seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x=inp.AMT_CREDIT)
plt.show()

## AMT_GOODS

In [None]:
# Boxplot to check outliers in AMT_GOODS_PRICE variable
plt.figure(figsize=(8,4))
# `x=` is explicit because seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x=inp.AMT_GOODS_PRICE)
plt.show()

## AMT_ANNUITY

In [None]:
# Boxplot to check outliers in AMT_ANNUITY variable.
# `x=` is explicit because seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x=inp.AMT_ANNUITY)
plt.show()

## Binning the continuous variables 

In [None]:
# Since there are many outliers in these two columns
# Binning 2 important columns to categories and storing them as new columns - AMT_CREDIT and AMT_INCOME_TOTAL

# Quantile-based bins: 3 for the credit amount, 4 for the income.
credit_labels = ['Low', 'Medium','High']
salary_labels = ['Low', 'Below Average','Above Average','High']

inp['CREDIT_RANGE'] = pd.qcut(inp['AMT_CREDIT'], q=3, labels=credit_labels)
inp['SALARY_RANGE'] = pd.qcut(inp['AMT_INCOME_TOTAL'], q=4, labels=salary_labels)


In [None]:
# Bar plot - Salary Range Vs. Target
# One bar per salary bin, with TARGET on the y-axis.

sns.barplot(data=inp, x='SALARY_RANGE', y='TARGET')
plt.show()

How likely a client is to default can be seen from how close a bar gets to 1; it can be observed that clients in the <b>Below Average Salary</b> range contribute the most to the default list

In [None]:
# Bar plot - Credit Range Vs. Target
# One bar per credit bin, with TARGET on the y-axis.

sns.barplot(data=inp, x='CREDIT_RANGE', y='TARGET')
plt.show()

It can be observed from above graph that <b>Medium Credit Range</b> are defaulting more

In [None]:
# Pairwise scatter / distribution plots for income, credit, age and the target flag
pairplot_columns = ['AMT_INCOME_TOTAL','AMT_CREDIT','DAYS_BIRTH','TARGET']
sns.pairplot(inp[pairplot_columns])
plt.show()

- Not many interesting correlation can be found in above pair plot hence plotting a heatmap

In [None]:
# Annotated correlation heatmap for income, credit, age and the target flag
heatmap_columns = ['AMT_INCOME_TOTAL','AMT_CREDIT','DAYS_BIRTH','TARGET']
correlations = inp[heatmap_columns].corr()
sns.heatmap(correlations, annot=True, cmap='RdYlGn', center=0.16)
plt.show()

- positive correlation between credit amount and total income
- negative correlation between age and total income 

### Application.csv dataset is having a variable TARGET which implies 
<br>1 - The person has defaulted
<br>0 - The person has not defaulted


In [None]:
# Checking the imbalance percentage: share (in %) of each TARGET value

inp.TARGET.value_counts(normalize=True)*100

In [None]:
# Pie chart of the TARGET class shares
target_share = inp['TARGET'].value_counts(normalize=True)
target_share.plot.pie()
plt.show()

The data is imbalanced:
<br>92% of the people are non-defaulters 
<br>only 8% of the people are defaulters

The count is good considering only 8% are defaulting and it is essential to address the pattern hidden in this 8% as well

In [None]:
# Dividing the dataset into two segments on the basis of TARGET variable in application.csv dataset

# Explicit copies: these frames have columns cleaned later on, which should neither
# raise SettingWithCopy warnings nor depend on whether the selection is a view of `inp`.
Non_Defaulters = inp[inp.TARGET == 0].copy()
Defaulters = inp[inp.TARGET == 1].copy()

### Univariate Analysis

In [None]:
#Replacing incorrect values with NaN

# The cleaned column is assigned back explicitly: an in-place `replace` on a selected
# column does not reliably update the parent frame (it is a no-op under pandas
# Copy-on-Write). `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
Non_Defaulters = Non_Defaulters.assign(
    CODE_GENDER=Non_Defaulters['CODE_GENDER'].replace('XNA', np.nan)
)
Defaulters = Defaulters.assign(
    CODE_GENDER=Defaulters['CODE_GENDER'].replace('XNA', np.nan)
)

In [None]:
# Gender of applicant who applied for the loan

# Side-by-side bar charts (shared y-axis): share in % of each category per segment.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
for axis, title, segment in ((ax1, 'Non-Defaulters', Non_Defaulters),
                             (ax2, 'Defaulters', Defaulters)):
    axis.set_title(title)
    category_share = segment['CODE_GENDER'].value_counts(normalize=True) * 100
    category_share.plot.bar(ax=axis)
plt.show()

- Female clients apply for loans more often than male clients, and they also account for a larger share of the defaulters

In [None]:
# Who accompanied client while applying for the loan

# Side-by-side bar charts (shared y-axis): share in % of each category per segment.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
for axis, title, segment in ((ax1, 'Non-Defaulters', Non_Defaulters),
                             (ax2, 'Defaulters', Defaulters)):
    axis.set_title(title)
    category_share = segment['NAME_TYPE_SUITE'].value_counts(normalize=True) * 100
    category_share.plot.bar(ax=axis)
plt.show()

- While applying for the loan about 80% of the clients were unaccompanied in both the cases.

In [None]:
# Income type of the client

# Side-by-side bar charts (shared y-axis): share in % of each category per segment.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
for axis, title, segment in ((ax1, 'Non-Defaulters', Non_Defaulters),
                             (ax2, 'Defaulters', Defaulters)):
    axis.set_title(title)
    category_share = segment['NAME_INCOME_TYPE'].value_counts(normalize=True) * 100
    category_share.plot.bar(ax=axis)
plt.show()

- Customers with working income type are contributing to defaulters more than that of any other income type

In [None]:
# Occupation type of the client

# Side-by-side bar charts (shared y-axis): share in % of each category per segment.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
for axis, title, segment in ((ax1, 'Non-Defaulters', Non_Defaulters),
                             (ax2, 'Defaulters', Defaulters)):
    axis.set_title(title)
    category_share = segment['OCCUPATION_TYPE'].value_counts(normalize=True) * 100
    category_share.plot.bar(ax=axis)
plt.show()

- As Laborers have the highest values for both defaulters and non-defaulters, it can be said that they form the largest group of loan applicants and that they also top the list of defaulters compared to other occupation types

In [None]:
# Level of Highest Education Client has achieved

# Side-by-side bar charts (shared y-axis): share in % of each category per segment.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
for axis, title, segment in ((ax1, 'Non-Defaulters', Non_Defaulters),
                             (ax2, 'Defaulters', Defaulters)):
    axis.set_title(title)
    category_share = segment['NAME_EDUCATION_TYPE'].value_counts(normalize=True) * 100
    category_share.plot.bar(ax=axis)
plt.show()

In [None]:
# Type of Loan - Cash or Revolving (also called credit limit)

# Side-by-side bar charts (shared y-axis): share in % of each category per segment.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
for axis, title, segment in ((ax1, 'Non-Defaulters', Non_Defaulters),
                             (ax2, 'Defaulters', Defaulters)):
    axis.set_title(title)
    category_share = segment['NAME_CONTRACT_TYPE'].value_counts(normalize=True) * 100
    category_share.plot.bar(ax=axis)
plt.show()

In [None]:
# Housing situation of the client who applied for the loan

# Side-by-side bar charts (shared y-axis): share in % of each category per segment.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
for axis, title, segment in ((ax1, 'Non-Defaulters', Non_Defaulters),
                             (ax2, 'Defaulters', Defaulters)):
    axis.set_title(title)
    category_share = segment['NAME_HOUSING_TYPE'].value_counts(normalize=True) * 100
    category_share.plot.bar(ax=axis)
plt.show()

In [None]:
# Family status of the client who applied for the loan

# Side-by-side bar charts (shared y-axis): share in % of each category per segment.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
for axis, title, segment in ((ax1, 'Non-Defaulters', Non_Defaulters),
                             (ax2, 'Defaulters', Defaulters)):
    axis.set_title(title)
    category_share = segment['NAME_FAMILY_STATUS'].value_counts(normalize=True) * 100
    category_share.plot.bar(ax=axis)
plt.show()

### Correlation for numerical columns for both the cases, i.e. Defaulters and Non-Defaulters

### Variables used below are explained as follows:
- AMT_INCOME_TOTAL: Income of the client
- AMT_CREDIT: Credit amount of the loan
- AMT_GOODS_PRICE: For consumer loans it is the price of the goods for which the loan is given
- CNT_CHILDREN: Number of children the client has
- CNT_FAM_MEMBERS: How many family members does client have

In [None]:
# Finding correlation that might affect loan application/payment/defaulting process

# The same numeric columns are correlated for both segments.
corr_columns = ['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_GOODS_PRICE','CNT_CHILDREN', 'CNT_FAM_MEMBERS']

fig, (ax1, ax2) = plt.subplots(1,2,figsize=(12,6))
ax1.set_title('Non-Defaulters')
sns.heatmap(Non_Defaulters[corr_columns].corr(), annot=True, cmap='Reds', ax=ax1)
ax2.set_title('Defaulters')
# ax=ax2 is passed explicitly so the second heatmap lands on the axes titled
# 'Defaulters' instead of on whichever axes happens to be current.
sns.heatmap(Defaulters[corr_columns].corr(), annot=True, cmap='Reds', ax=ax2)
plt.subplots_adjust(right=1.2)
plt.show()

- In both the cases: A high and positive correlation can be observed between AMT_GOODS_PRICE Vs. AMT_CREDIT and CNT_CHILDREN and CNT_FAM_MEMBERS which is quite logical as they are highly related in real life scenario.
- In Non- Defaulters: A positive correlation is observed between the AMT_CREDIT or AMT_GOODS_PRICE Vs. AMT_INCOME_TOTAL which possibly can imply that people with high income applied for higher amount of loan.
- In Defaulters: A positive correlation is observed between the AMT_CREDIT Vs. CNT_FAM_MEMBERS which possibly means no. of family members are affecting the loan amount, (a real life possible guess - to meet the expenses)

In [None]:
# Correlation of the ownership flags, credit amount and income with TARGET
flag_corr_columns = ['FLAG_OWN_CAR','FLAG_OWN_REALTY','AMT_CREDIT','AMT_INCOME_TOTAL','TARGET']
flag_correlations = inp[flag_corr_columns].corr()
sns.heatmap(flag_correlations, cmap='Reds', annot=True)
plt.show()

- TARGET has a negative correlation with most of the variables, which might imply that clients whose spending/investments are higher default less.

### Heatmap with pivot-table


In [None]:
# TARGET aggregated per (income type, occupation type) cell, using pivot_table's default aggregation
plt.figure(figsize=(12,6))
target_by_income_occupation = inp.pivot_table(
    values='TARGET', index='NAME_INCOME_TYPE', columns='OCCUPATION_TYPE'
)
sns.heatmap(target_by_income_occupation, annot=True, cmap='Reds')
plt.show()

In [None]:
# TARGET aggregated per (income type, education type) cell, using pivot_table's default aggregation
target_by_income_education = inp.pivot_table(
    values='TARGET', index='NAME_INCOME_TYPE', columns='NAME_EDUCATION_TYPE'
)
sns.heatmap(target_by_income_education, annot=True, cmap='Reds')
plt.show()

- Unsure if this correlation is correct or incorrect, but good patterns can be observed here
- Clients who are Unemployed and have Secondary level education have a high positive correlation
- Also clients who are unemployed and have incomplete higher education have a positive correlation

#### -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# previous_application.csv

It contains information about the client’s previous loan data. It contains the data whether the previous application had been Approved, Cancelled, Refused or Unused offer.

In [None]:
# Reading the previous application dataset into dataframe inp1
# NOTE(review): this file is read from a different input folder ('risk-analytics')
# than application_data.csv ('loanapplicationdata') — confirm both datasets are attached.

inp1 = pd.read_csv('../input/risk-analytics/previous_application.csv')

In [None]:
# Displaying the first five rows of the previous-application dataset

inp1.head()

In [None]:
# Reading the data-types of the columns
# verbose=True asks for the full per-column summary.

inp1.info(verbose=True)

### Univariate Analysis

In [None]:
# Replacing unknown values with null values

# Assign the cleaned column back: an in-place `replace` on a column selected from the
# frame is a no-op under pandas Copy-on-Write. `np.nan` replaces the `np.NaN` alias,
# which was removed in NumPy 2.0.
inp1['NAME_CONTRACT_TYPE'] = inp1['NAME_CONTRACT_TYPE'].replace('XNA', np.nan)

In [None]:
# Type of loans in previous application
# Horizontal bars: share (in %) of each contract type.

contract_type_share = inp1['NAME_CONTRACT_TYPE'].value_counts(normalize=True) * 100
contract_type_share.plot.barh()
plt.show()

- It can be observed that Clients have mostly opted for Cash and consumer loans, while revolving contributes the lowest in dataset

In [None]:
# For how much credit did client ask on the previous application 

# `x=` is explicit because seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x=inp1.AMT_APPLICATION)
plt.show()

 - There are some really huge application amount in the dataset leading to outliers

In [None]:
# For how much credit did client ask on the previous application - understanding through distribution plot
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the
# installed version still provides it.

application_amounts = inp1['AMT_APPLICATION']
sns.distplot(application_amounts)
plt.show()

- A majority of clients have applied for loan amount below 10Lakhs

In [None]:
# Final credit amount on the previous application (amount approved out of application amount)

# `x=` is explicit because seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x=inp1.AMT_CREDIT)
plt.show()

In [None]:
# Getting a better view through distplot 
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the
# installed version still provides it.

credit_amounts = inp1['AMT_CREDIT'].dropna()
sns.distplot(credit_amounts)
plt.show()

- A similar pattern is observed in the approved credit amount, i.e. the majority of clients got a credit amount below 10 Lakhs. This suggests a possibly good correlation, which is checked further in the analysis.

In [None]:
# Goods price of good that client asked for (Goods Example - Car, Mobile, Furniture,etc.)

# `x=` is explicit because seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x=inp1.AMT_GOODS_PRICE)
plt.show()

- There are continuous outliers in the boxplot for AMT_GOODS_PRICE. For a better understanding of this column, a distplot is plotted below

In [None]:
# Distribution of the goods price on previous applications (missing values removed first)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the
# installed version still provides it.
goods_prices = inp1['AMT_GOODS_PRICE'].dropna()
sns.distplot(goods_prices)
plt.show()

- Again a similar pattern can be seen in AMT_GOODS_PRICE, which is logical since the credit amount depends directly on the price of the good that the client wants to purchase

In [None]:
# Why was the previous application rejected
# Bars: share (in %) of each reject-reason code.

plt.title('Reason behind application rejection')
reject_reason_share = inp1['CODE_REJECT_REASON'].value_counts(normalize=True) * 100
reject_reason_share.plot.bar()
plt.show()

Term XAP means a consumer loan has no cash_loan_purpose. (Source of information: Google, Kaggle)

- Looks like most of the client's application was rejected because they couldn't present an appropriate purpose behind applying for the credit.

In [None]:
# Purpose of the cash loan
# Bars: share (in %) of each stated loan purpose.

plt.xticks(rotation=90)
plt.title('Purpose of the cash loan')
loan_purpose_share = inp1['NAME_CASH_LOAN_PURPOSE'].value_counts(normalize=True) * 100
loan_purpose_share.plot.bar()
plt.show()

- For most of the cases (around 90%) the loan purpose is not known, other than that purpose behind most applications was repairs

In [None]:
# Replacing 'XNA' with null values in NAME_PAYMENT_TYPE

# Assign the cleaned column back: an in-place `replace` on a column selected from the
# frame is a no-op under pandas Copy-on-Write. `np.nan` replaces the `np.NaN` alias,
# which was removed in NumPy 2.0.
inp1['NAME_PAYMENT_TYPE'] = inp1['NAME_PAYMENT_TYPE'].replace('XNA', np.nan)

In [None]:
#Payment method that client chose to pay for the previous application

# `x=` is explicit because seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x=inp1.NAME_PAYMENT_TYPE)
# Rotate after drawing so the rotation applies to the tick labels the plot created.
plt.xticks(rotation=35)
plt.show()

- Loan was mostly paid in cash through bank

In [None]:
# Replacing 'XNA' with null values in NAME_CLIENT_TYPE

# Assign the cleaned column back: an in-place `replace` on a column selected from the
# frame is a no-op under pandas Copy-on-Write. `np.nan` replaces the `np.NaN` alias,
# which was removed in NumPy 2.0.
inp1['NAME_CLIENT_TYPE'] = inp1['NAME_CLIENT_TYPE'].replace('XNA', np.nan)

In [None]:
# Was the client old or new client when applying for the previous application

# `x=` is explicit because seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x=inp1.NAME_CLIENT_TYPE)
plt.show()

 - Most of the clients in the previous application were repeaters, meaning they already had a loan and applied for another one

In [None]:
# Replacing 'XNA' with null values in NAME_GOODS_CATEGORY

# Assign the cleaned column back: an in-place `replace` on a column selected from the
# frame is a no-op under pandas Copy-on-Write. `np.nan` replaces the `np.NaN` alias,
# which was removed in NumPy 2.0.
inp1['NAME_GOODS_CATEGORY'] = inp1['NAME_GOODS_CATEGORY'].replace('XNA', np.nan)

In [None]:
# What kind of goods did the client apply for in the previous application
# `x=` is explicit because seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x=inp1.NAME_GOODS_CATEGORY)
# Rotate after drawing so the rotation applies to the tick labels the plot created.
plt.xticks(rotation=90)
plt.show()

- Clients applied mostly to purchase Mobile Phone and consumer electronics, followed by computers, audio/video devices (on EMI).
- This displays potential areas where bank can promote its offerings through ad-campaign and introducing special credit offerings.

In [None]:
# Replacing 'XNA' with null values in NAME_PORTFOLIO

# Assign the cleaned column back: an in-place `replace` on a column selected from the
# frame is a no-op under pandas Copy-on-Write. `np.nan` replaces the `np.NaN` alias,
# which was removed in NumPy 2.0.
inp1['NAME_PORTFOLIO'] = inp1['NAME_PORTFOLIO'].replace('XNA', np.nan)

In [None]:
# Was the previous application for CASH, POS, CAR, …

# `x=` is explicit because seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x=inp1.NAME_PORTFOLIO)
plt.show()

In [None]:
# Replacing 'XNA' with null values in NAME_PRODUCT_TYPE

# Assign the cleaned column back: an in-place `replace` on a column selected from the
# frame is a no-op under pandas Copy-on-Write. `np.nan` replaces the `np.NaN` alias,
# which was removed in NumPy 2.0.
inp1['NAME_PRODUCT_TYPE'] = inp1['NAME_PRODUCT_TYPE'].replace('XNA', np.nan)

In [None]:
# Was the previous application x-sell or walk-in

# `x=` is explicit because seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x=inp1.NAME_PRODUCT_TYPE)
plt.show()

- Majority of the loans taken in the previous application were via cross-selling

In [None]:
# Through which channel we acquired the client on the previous application

# `x=` is explicit because seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x=inp1.CHANNEL_TYPE)
# Rotate after drawing so the rotation applies to the tick labels the plot created.
plt.xticks(rotation=90)
plt.show()

- Most of the applications were received through Credit Card and Cash Offices.

In [None]:
# Interest rate on previous credit

# Missing rates are dropped and the remaining values multiplied by 100 before plotting.
# `x=` is explicit because seaborn >= 0.12 treats the first positional argument as `data`.
interest_rate_pct = inp1.RATE_INTEREST_PRIMARY.dropna()*100
sns.boxplot(x=interest_rate_pct)
# Added for consistency with the other plotting cells (also hides the Axes repr).
plt.show()

- There are some outliers in the interest column which may be incorrect data, as few values are approaching towards 100% interest

### Bivariate Analysis

 - AMT_APPLICATION: For how much credit did client ask on the previous application
 - AMT_CREDIT: Final credit amount on the previous application (amount approved out of application amount)

In [None]:
# AMT_APPLICATION Vs. AMT_CREDIT
# Requested amount on the x-axis against the final credit amount on the y-axis.

requested_amount = inp1['AMT_APPLICATION']
granted_amount = inp1['AMT_CREDIT']
plt.title('AMT_APPLICATION Vs. AMT_CREDIT')
plt.scatter(requested_amount, granted_amount, alpha=0.5)
plt.show()

- The values are forming linear relationship, so it can be implied that the client has got the loan for the amount he applied for most of the cases

## Bivariate categorical - continuous 

- NAME_CONTRACT_TYPE: Contract product type (Cash loan, consumer loan [POS] ,...) of the previous application
- AMT_APPLICATION: For how much credit did client ask on the previous application
- AMT_CREDIT: Final credit amount on the previous application (amount approved out of application amount)

In [None]:
# NAME_CONTRACT_TYPE Vs. AMT_APPLICATION
# One box per contract type showing the spread of the requested amount.

plt.title('NAME_CONTRACT_TYPE Vs. AMT_APPLICATION')
sns.boxplot(data=inp1, x='NAME_CONTRACT_TYPE', y='AMT_APPLICATION')
plt.show()

In [None]:
# NAME_CONTRACT_TYPE Vs. AMT_CREDIT
# One box per contract type showing the spread of the final credit amount.

sns.boxplot(data=inp1, x='NAME_CONTRACT_TYPE', y='AMT_CREDIT')
plt.show()

In [None]:

# AMT_APPLICATION aggregated per (yield group, reject reason) cell, using pivot_table's default aggregation
application_by_yield_and_reason = inp1.pivot_table(
    values='AMT_APPLICATION', index='NAME_YIELD_GROUP', columns='CODE_REJECT_REASON'
)
sns.heatmap(application_by_yield_and_reason, cmap='Reds')
plt.show()