#  Bank Loan Default Analysis


This notebook aims to identify patterns to understand the driving factors behind loan default


#### First, let's import all the dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
print('Packages imported successfully')

#### Now, let's import the Application Dataset

In [None]:
data_app = pd.read_csv('../input/loan-defaulter/application_data.csv')

In [None]:
# Capture the dataframe's dimensions; the row count is reused later
# when converting missing-value counts into percentages
data_app_rows = data_app.shape[0]
data_app_cols = data_app.shape[1]
print('Shape of Applications Data: Rows',data_app_rows,'Columns:',data_app_cols)

## Handling Missing Values 

#### Let's find columns with missing values

In [None]:
# Percentage of missing values per column, rounded to two decimals
missing_counts = data_app.isnull().sum()
app_data_mis_vals = (missing_counts / data_app_rows * 100).round(2)

# Histogram: how many columns fall into each missing-percentage band
plt.figure(figsize=[10,8])
mis_vals_axes = app_data_mis_vals.plot.hist()
mis_vals_axes.set(
    xlabel='Percentage Missing Values',
    ylabel='Number of Columns',
    title='Distribution of Missing Values across Columns',
)
plt.show()

##### The graph above shows that many columns have more than 40% missing values. Since 40% is a significant share of the data, we can drop these columns.

#### Dropping the Columns with missing values

In [None]:
# Names of the columns whose missing-value percentage exceeds 40
above_40_pct = app_data_mis_vals > 40
drop_cols = app_data_mis_vals.index[above_40_pct].tolist()

print('Number of Columns that have missing values percentage greater than 40%:', len(drop_cols))

In [None]:
# Remove the sparse columns identified above (in place)
data_app.drop(columns=drop_cols, inplace=True)

# Confirm the reduced column count
print('New Shape of Dataframe: ',data_app.shape)

#### Let's see the information about the dataset now

In [None]:
# Let's get information about the dataframe now.
# `show_counts` is the current name of the option formerly called `null_counts`
# (deprecated in pandas 1.2 and removed in pandas 2.0).
data_app.info(verbose=True,show_counts=False)

In [None]:
# Share of missing values per remaining column; anything above 0% still needs imputing
impute_cols = data_app.isnull().sum() / data_app_rows * 100
impute_cols_list = [col for col, pct in impute_cols.items() if pct > 0]
print('List of Columns where we need to impute missing values appropriately:\n\n',impute_cols_list)

#### Let's observe the rows in the dataset having missing values

In [None]:
null_data = data_app[data_app.isnull().any(axis=1)]
null_data

## Missing Values Imputation

In [None]:
# Creating a function to fill missing values appropriately
def fill_null(col_name,df,how):
    """Fill the missing values of ``df[col_name]`` and write the result back into ``df``.

    Parameters
    ----------
    col_name : label of the column to impute.
    df : DataFrame that is modified (the column is reassigned on this object).
    how : the fill value. A zero-argument callable such as ``Series.mean`` or
        ``Series.mode`` is also accepted: it is called first, and if it yields a
        Series (as ``mode`` does) the first entry is used. Without this step the
        callable object itself would be written into the missing cells.
    """
    if callable(how):
        how = how()
        if isinstance(how, pd.Series):
            # e.g. Series.mode() returns every modal value; use the first one
            how = how.iloc[0]
    # Assign back explicitly: an in-place fillna on `df[col_name]` acts on an
    # intermediate object and is not guaranteed to update `df`.
    df[col_name] = df[col_name].fillna(how)
    print('Missing Values have been Immputed for',col_name)

In [None]:
#Let's take OCCUPATION_TYPE, EXT_SOURCE_2 and EXT_SOURCE_3 columns 
data_app[['OCCUPATION_TYPE','EXT_SOURCE_2','EXT_SOURCE_3']].isnull().sum()

In [None]:
data_app['EXT_SOURCE_3'].value_counts(normalize=True)

In [None]:
data_app['EXT_SOURCE_2'].value_counts(normalize=True)

In [None]:
data_app['OCCUPATION_TYPE'].value_counts()

In [None]:
#Lets look at CNT_FAM_MEMBERS column, number of missing values
data_app['CNT_FAM_MEMBERS'].isnull().sum()

In [None]:
# Distribution of family sizes in CNT_FAM_MEMBERS
family_size_counts = data_app['CNT_FAM_MEMBERS'].value_counts()
df = pd.DataFrame({
    'Number of Families': family_size_counts.to_numpy(),
    'Count of Family Members': family_size_counts.index.astype('Int64'),
})
sns.barplot(data=df, x='Count of Family Members', y='Number of Families')
plt.show()

##### It can be seen that most families have 2 members, hence it is safe to impute missing values with the mode of the CNT_FAM_MEMBERS column

In [None]:
#EXT_SOURCE_3,EXT_SOURCE_2 are normalized scores from data source 3 and 2, we can impute missing values with average value
# NOTE: .mean() must be *called* - passing the bound method `.mean` would write the
# method object itself into the missing cells and turn the column into object dtype.
fill_null('EXT_SOURCE_3',data_app,data_app.EXT_SOURCE_3.mean())
fill_null('EXT_SOURCE_2',data_app,data_app.EXT_SOURCE_2.mean())
#OCCUPATION_TYPE column has unordered categorical values, we can impute missing values with a value 'Unavailable'
fill_null('OCCUPATION_TYPE',data_app,'Unavailable')
#CNT_FAM_MEMBERS can be imputed with the mode
# .mode() returns a Series of modal values, so take its first entry as the scalar fill value
fill_null('CNT_FAM_MEMBERS',data_app,data_app.CNT_FAM_MEMBERS.mode().iloc[0])

In [None]:
#Let's verify Null Values for OCCUPATION_TYPE, EXT_SOURCE_2 and EXT_SOURCE_3 columns 
data_app[['OCCUPATION_TYPE','EXT_SOURCE_2','EXT_SOURCE_3']].isnull().sum()

In [None]:
#Function to plot subplot for the AMT_* columns
def bar_plot(col_list):
    """Draw one percentage bar chart per column of ``data_app`` on a 2x3 grid.

    The last underscore-separated token of each column name is used in the
    axis label and title.
    """
    plt.figure(figsize=[18,10])
    for position, col in enumerate(col_list, start=1):
        plt.subplot(2,3,position)
        period = str(col.split('_')[-1])
        pct_counts = data_app[col].value_counts(normalize=True)*100
        axes = pct_counts.plot.bar()
        axes.set(
            xlabel='No of '+period,
            ylabel='Percent count of records',
            title='No of '+period+' vs Percent Count of records',
        )
    plt.tight_layout(pad=2.0)
    plt.show()

In [None]:
list_Cols = ['AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK','AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_QRT','AMT_REQ_CREDIT_BUREAU_YEAR']
bar_plot(list_Cols)

##### As 99% of the values in the `AMT_REQ_CREDIT_BUREAU_HOUR`, `AMT_REQ_CREDIT_BUREAU_DAY`, `AMT_REQ_CREDIT_BUREAU_MON`,
##### `AMT_REQ_CREDIT_BUREAU_WEEK` and `AMT_REQ_CREDIT_BUREAU_QRT` columns are 0, it is safe to impute their missing values with 0
##### For the `AMT_REQ_CREDIT_BUREAU_YEAR` column we can impute missing values with the median of the column

In [None]:
# Zero-fill the five short-window enquiry counters; the yearly counter gets its median
bureau_fill_values = {
    column: 0
    for column in (
        'AMT_REQ_CREDIT_BUREAU_HOUR',
        'AMT_REQ_CREDIT_BUREAU_DAY',
        'AMT_REQ_CREDIT_BUREAU_WEEK',
        'AMT_REQ_CREDIT_BUREAU_MON',
        'AMT_REQ_CREDIT_BUREAU_QRT',
    )
}
bureau_fill_values['AMT_REQ_CREDIT_BUREAU_YEAR'] = data_app.AMT_REQ_CREDIT_BUREAU_YEAR.median()
data_app.fillna(bureau_fill_values, inplace=True)

#### AMT_ANNUITY and AMT_GOODS_PRICE columns

In [None]:
data_app['AMT_GOODS_PRICE'].value_counts()

## Handling Invalid Data

#### First lets see columns with Object data types


In [None]:
data_app.info(verbose=True)

#### Let's find out the columns with Object data type 

In [None]:
# Columns stored with the generic object dtype.
# The dtype is named by string because the `np.object` alias was removed in NumPy 1.24.
obj_cols = data_app.select_dtypes('object').columns.tolist()
obj_cols

In [None]:
data_app.NAME_CONTRACT_TYPE.value_counts()

###### No issues with `NAME_CONTRACT_TYPE` column

In [None]:
data_app.CODE_GENDER.value_counts()

##### There are 4 records that do not belong to Male or Female customers, hence we can drop the records respective to these values

In [None]:
# Remove the rows whose gender is recorded as 'XNA'
is_xna_gender = data_app['CODE_GENDER'] == "XNA"
drop_genders = data_app.index[is_xna_gender]
data_app.drop(index=drop_genders, inplace=True)

In [None]:
data_app.FLAG_OWN_CAR.value_counts()

##### No issues with `FLAG_OWN_CAR` column

In [None]:
data_app.FLAG_OWN_REALTY.value_counts()
#No issues with this column

In [None]:
data_app.NAME_TYPE_SUITE.value_counts()

##### No issues with NAME_TYPE_SUITE column

In [None]:
data_app.ORGANIZATION_TYPE.value_counts()

##### No issues with `ORGANIZATION_TYPE` column

###### There are 55374 records where there is no information about the Organisation Type, we can keep it as-is.

In [None]:
data_app.NAME_HOUSING_TYPE.value_counts()

##### No issues with `NAME_HOUSING_TYPE` column

### Now lets look at invalid data in numeric columns 

In [None]:
int_cols = data_app.select_dtypes(np.int64).columns.tolist()
data_app[int_cols]

##### The DAYS_* columns hold durations counted up to the day the loan application was made. As we do not have the customer's age directly, we can calculate it from DAYS_BIRTH

In [None]:
# New feature Age (in Years): absolute day count divided by 365, rounded down
days_since_birth = data_app['DAYS_BIRTH'].abs()
data_app['Age_Years'] = (days_since_birth // 365).astype('int64')

In [None]:
float_cols = data_app.select_dtypes(np.float64).columns.tolist()
data_app[float_cols]

##### There are two columns, `DAYS_REGISTRATION` and  `DAYS_LAST_PHONE_CHANGE` which should have been of type int64, so lets change their data type

In [None]:
# Both columns were read as floats; convert them to integer types.
data_app['DAYS_REGISTRATION'] = data_app['DAYS_REGISTRATION'].astype('int64')
# Convert the values themselves. The previous `.notna().astype('int64')` replaced every
# entry with a 0/1 "is present" flag and discarded the actual day counts.
# The nullable 'Int64' dtype is used so that a missing entry stays <NA> instead of
# raising or being silently replaced by an invented value.
data_app['DAYS_LAST_PHONE_CHANGE'] = data_app['DAYS_LAST_PHONE_CHANGE'].astype('Int64')

In [None]:
data_app.DAYS_REGISTRATION

## Finding Outliers in the Dataset

#### Let's take numeric columns first

In [None]:
num_cols = data_app.select_dtypes([np.int64,np.float64]).columns.tolist()
num_cols

#### let's find outliers by plotting Boxplot

In [None]:
#Function to draw one box plot per column on a 2x3 grid
def box_plot(df,col_list):
    """Draw a box plot of ``df[col]`` for each column in ``col_list`` (2x3 subplot grid).

    Underscores in the column name are replaced by spaces in each title.
    """
    plt.figure(figsize=[16,12])
    for position, col in enumerate(col_list, start=1):
        plt.subplot(2,3,position)
        heading = 'Box plot for '+col.replace('_',' ')
        axes = sns.boxplot(data=df[col])
        axes.set(title=heading)
    plt.tight_layout(pad=2.0)
    plt.show()

In [None]:
col_list = ['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE','Age_Years', 'CNT_CHILDREN']
box_plot(data_app,col_list)

### Lets remove the outliers by knowing their presence in the quantiles

In [None]:
#let's see the distribution of Income
data_app.AMT_INCOME_TOTAL.describe()

In [None]:
INCOME_90 = data_app.AMT_INCOME_TOTAL.quantile(q=0.9)
#let remove the values above 90th quantile

In [None]:
#let's see the distribution of Loam amount credited
data_app.AMT_CREDIT.describe()

In [None]:
CREDIT_95 = data_app.AMT_CREDIT.quantile(q=0.95)
#let remove the values above 95th quantile

In [None]:
#let's see the distribution of Loan Annuity
data_app.AMT_ANNUITY.describe()

In [None]:
ANNUITY_97 = data_app.AMT_ANNUITY.quantile(q=0.97)
#let remove the values above 97th quantile

In [None]:
#let's see the distribution of Goods price 
data_app.AMT_GOODS_PRICE.describe()

In [None]:
GOODS_PRICE_97 = data_app.AMT_GOODS_PRICE.quantile(q=0.97)
#let remove the values above 97th quantile

In [None]:
#let's see the distribution of count of children
data_app.CNT_CHILDREN.describe()

In [None]:
CNT_CHILDREN_999 = data_app.CNT_CHILDREN.quantile(q=0.999)
# Upper bound for CNT_CHILDREN: the 99.9th percentile (q=0.999), not the 99th

### Removing the Outliers

In [None]:
# Work on a copy so data_app keeps every row, then keep only the rows at or below each
# column's upper limit. Rows where the compared column is missing compare False and
# are dropped as well.
upper_limits = {
    'AMT_INCOME_TOTAL': INCOME_90,
    'AMT_ANNUITY': ANNUITY_97,
    'AMT_GOODS_PRICE': GOODS_PRICE_97,
    'CNT_CHILDREN': CNT_CHILDREN_999,
    'AMT_CREDIT': CREDIT_95,
}
app_data_final = data_app.copy()
for capped_col, upper_limit in upper_limits.items():
    within_limit = app_data_final[capped_col] <= upper_limit
    app_data_final = app_data_final[within_limit]

In [None]:
#let's verify the columns after removing the outliers
box_plot(app_data_final,col_list)

##### From the above graphs we can see that most of the outliers have been removed (some have been kept for analysis purposes)

### Let's see if there are any continuous numerical columns

In [None]:
app_data_final.head(10)

##### The AMT_* columns and Age_Years hold continuous numerical values, hence we can bin them

In [None]:
#let's see the distribution of Ages
app_data_final.Age_Years.describe()

In [None]:
# Ten-year age bands. pd.cut intervals are right-inclusive by default,
# so an age of exactly 30 falls in '21-30'.
cut_bins = list(range(0, 81, 10))
cut_labels = ['0-10', '11-20', '21-30', '31-40','41-50','51-60','61-70','71-80']
age_bands = pd.cut(app_data_final['Age_Years'], bins=cut_bins, labels=cut_labels)
app_data_final['Age_Groups'] = age_bands
app_data_final['Age_Groups'].value_counts()

#### Now lets bin the `AMT_INCOME_TOTAL` column

In [None]:
app_data_final.AMT_INCOME_TOTAL.describe()

In [None]:
# Income bands of width 50,000 from 0 up to 300,000 (right-inclusive intervals)
cut_bns = list(range(0, 300001, 50000))
cut_lbl = ['0-50000', '50001-100000', '100001-150000', '150001-200000','200001-250000','250001-300000']
income_bands = pd.cut(app_data_final['AMT_INCOME_TOTAL'], bins=cut_bns, labels=cut_lbl)
app_data_final['INCOME_GROUP'] = income_bands

In [None]:
app_data_final['INCOME_GROUP']

## Univariate Analysis

#### Lets see how is the distribution of the data in few key columns, we can then see if the data is skewed or balanced, We will be analysing the Gender, Income and Age

In [None]:
# Three-panel overview of the applicants: gender share, income groups, age groups
plt.figure(figsize=[20,6])

# Panel 1: gender share as a pie chart
plt.subplot(1,3,1)
GENDER = app_data_final.CODE_GENDER.value_counts(normalize=True).rename_axis('Gender').reset_index(name='Count')
plt.pie(data=GENDER, x=GENDER.Count, labels=GENDER.Gender, autopct='%1.1f%%')
plt.title('Gender Distribution among applicants')

# Panel 2: number of applicants per income group (horizontal bars)
# NOTE: the 'Income Amount' column holds the applicant count of each group
plt.subplot(1,3,2)
INCOME = app_data_final.INCOME_GROUP.value_counts().rename_axis('Income_Groups').reset_index(name='Income Amount')
sns.barplot(y=INCOME.Income_Groups, x=INCOME['Income Amount']).set(xlabel='No of Applicantions', ylabel='Income Groups', title='No of Applicants vs Income Groups')

# Panel 3: histogram of age groups. histplot puts the groups on the x axis and the
# counts on the y axis, so the labels must follow that orientation (they were swapped).
plt.subplot(1,3,3)
AGE = app_data_final.Age_Groups
sns.histplot(AGE).set(xlabel='Age Groups', ylabel='No of Applicants', title='No of Applicantions vs Age Groups')


plt.tight_layout()
plt.show()

##### The percentage of females is higher compared to males in loan applications

##### Higher loan applications have come from applicants with income group 100001-150000
##### People in age group 31-40 have applied the highest number of loans


## Segmentation

#### Let's divide the dataframe in two, defaulter (Target 1) and non-defaulters (Target 0)

In [None]:
# Split on TARGET; .iloc[:, 1:] leaves out the first column of each segment
is_defaulter = app_data_final['TARGET'] == 1
is_non_defaulter = app_data_final['TARGET'] == 0
target_1 = app_data_final.loc[is_defaulter].iloc[:, 1:]       # Defaulters
target_0 = app_data_final.loc[is_non_defaulter].iloc[:, 1:]   # Non-defaulters

In [None]:
#lets observe Target-1
target_1.head()

In [None]:
#lets observe Target-0
target_0.head()

### Categorical Ordered Variables

In [None]:
# Applicants per income group, defaulters (left) next to non-defaulters (right)
plt.figure(figsize=[20,6])

INCOME1 = target_1.INCOME_GROUP.value_counts().rename_axis('Income_Groups').reset_index(name='No of Applicants')
INCOME0 = target_0.INCOME_GROUP.value_counts().rename_axis('Income_Groups').reset_index(name='No of Applicants')

for panel_no, (group_counts, segment_tag) in enumerate(((INCOME1, 'target_1'), (INCOME0, 'target_0')), start=1):
    plt.subplot(1,2,panel_no)
    income_axes = sns.barplot(y=group_counts.Income_Groups, x=group_counts['No of Applicants'])
    income_axes.set(
        xlabel='No of Applicants',
        ylabel='Income Groups',
        title='No of Applicants vs Income Groups [For ' + segment_tag + ']',
    )

plt.tight_layout()
plt.show()

##### Income group 100001-150000 has the highest number of both defaulted and successfully repaid loans, whereas income group 0-50000 has the lowest number of both defaults and non-defaults


## Bivariate Analysis On Segmented Data

In [None]:
# Applicants per income group split by gender: defaulters (top), non-defaulters (bottom).
# The log scale is set on each plotted axes. Calling plt.yscale('log') before the
# first plt.subplot() only affected a throw-away axes, never the two charts.
fig, (defaulter_ax, non_defaulter_ax) = plt.subplots(2, 1, figsize=[12,8])

sns.countplot(data = target_1, x='INCOME_GROUP' , order=target_1['INCOME_GROUP'].value_counts().index,hue = 'CODE_GENDER',palette='Blues', ax=defaulter_ax).set(xlabel='Income Groups', ylabel='Number of Applicants', title='Distribution of Applicant in various Income Groups for Males and Females [Defaulters]')
defaulter_ax.set_yscale('log')

sns.countplot(data = target_0, x='INCOME_GROUP' , order=target_0['INCOME_GROUP'].value_counts().index,hue = 'CODE_GENDER',palette='gray', ax=non_defaulter_ax).set(xlabel='Income Groups', ylabel='Number of Applicants', title='Distribution of Applicant in various Income Groups for Males and Females [Non-Defaulters]')
non_defaulter_ax.set_yscale('log')

plt.tight_layout()
plt.show()

##### Number of loan applications from females are greater in all the income groups compared to males

## Univariate Analysis on Segmented Data

In [None]:
# Applicants per age group, defaulters (left) next to non-defaulters (right).
# Both panels get the same y-range so the bar heights are directly comparable.
# Previously the first plt.ylim() call ran before any subplot existed and the second
# panel never received the limit.
fig, age_axes = plt.subplots(1, 2, figsize=[20,6])

for age_ax, age_segment, segment_tag in zip(age_axes, (target_1, target_0), ('target_1', 'target_0')):
    AGE = age_segment.Age_Groups.value_counts().rename_axis('AGE_Grps').reset_index(name='No. of Applicants')
    sns.barplot(y=AGE['No. of Applicants'], x=AGE.AGE_Grps, ax=age_ax).set(ylabel='No of Applicants', xlabel='Age Groups', title='No of Applicants vs Age Groups [For ' + segment_tag + ']')
    age_ax.set_ylim((0,70000))

plt.show()

##### Number of loans that were defaulted and not defaulted are highest in age group 31 to 40 years and it decreases as age increases or decreases

In [None]:
week_days = target_1.WEEKDAY_APPR_PROCESS_START.value_counts().rename_axis('Week_Days').reset_index(name='No. of Applicants')
week_days

In [None]:
# Applications per weekday, defaulters (left) next to non-defaulters (right)
plt.figure(figsize=[20,6])

for panel_no, (weekday_segment, segment_tag) in enumerate(((target_1, 'target_1'), (target_0, 'target_0')), start=1):
    plt.subplot(1,2,panel_no)
    weekday_counts = weekday_segment.WEEKDAY_APPR_PROCESS_START.value_counts()
    week_days = weekday_counts.rename_axis('Week_Days').reset_index(name='No. of Applicants')
    weekday_axes = sns.barplot(y=week_days['No. of Applicants'], x=week_days.Week_Days)
    weekday_axes.set(
        xlabel='Week Days',
        ylabel='No. Of Applicants',
        title='No of Applicants vs Week Day Client Apply for Application  [For ' + segment_tag + ']',
    )

plt.show()

##### From above graphs we can see that on Tuesday there are highest number of loan applications registered whereas on Sunday least number of application were registered.

### Categorical Unordered Variables

In [None]:
#Creating a function to plot pie-charts
def pie_plot(col_name ):
    """Draw side-by-side pie charts of the value shares of ``col_name``.

    The left chart uses ``target_1`` and the right chart uses ``target_0``.
    """
    plt.figure(figsize=[20,6])

    segments = ((target_1, ' [For target_1]'), (target_0, ' [For target_0]'))
    for slot, (segment, suffix) in enumerate(segments, start=1):
        plt.subplot(1,2,slot)
        shares = segment[col_name].value_counts(normalize=True)
        col_pie = shares.rename_axis('Type').reset_index(name='Count')
        plt.pie(col_pie.Count, labels=col_pie.Type, autopct='%1.1f%%')
        plt.title('Pie Chart for Column ' + col_name + suffix)
    plt.show()

In [None]:
#Plotting the pie-charts
pie_plot('CODE_GENDER')
pie_plot('FLAG_OWN_CAR')

##### Proportion of females is higher than males in both defaulters and non-defaulters

##### Proportion of both defaulters and non-defaulters not having a car is higher than those who have it.

In [None]:
def barp_plot(col_name):
    """Draw side-by-side horizontal bar charts of the value counts of ``col_name``.

    The left chart uses ``target_1`` and the right chart uses ``target_0``.
    """
    label_font = {'fontsize':20,'fontweight':5}
    plt.figure(figsize=[22,10])

    segments = ((target_1, ' [For target_1]'), (target_0, ' [For target_0]'))
    for slot, (segment, suffix) in enumerate(segments, start=1):
        plt.subplot(1,2,slot)
        segment[col_name].value_counts().plot.barh()
        plt.title('Bar Chart for Column ' + col_name + suffix)
        plt.xlabel('Number of Loan Applicantions',fontdict=label_font)
        plt.ylabel(col_name,fontdict=label_font)
    plt.tight_layout()
    plt.show()

In [None]:
#Let's plot the bar plots
barp_plot('NAME_CONTRACT_TYPE')
barp_plot('NAME_INCOME_TYPE')
barp_plot('OCCUPATION_TYPE')

##### Number of applications with cash loans are higher than revolving loan in both defaulted and non-defaulted applications

##### People from working income background have applied higher number of loan applications than other income type and this category has highest defaulted loans

##### The occupation type with the highest number of defaulted and non-defaulted loans is 'Unavailable', i.e. the applicant's occupation is unknown

## Multivariate Analysis

### Correlation Matrix

#### Let's find out whether there are any strong or weak correlations among the features of the dataset

In [None]:
#Creating correlation matrix for target 1
corr_mat_Day_1 = target_1[['CNT_CHILDREN','REGION_POPULATION_RELATIVE','AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE','DAYS_EMPLOYED','Age_Years']].corr()
corr_mat_Day_1

In [None]:
#Creating correlation matrix for target 0
corr_mat_Day_0= target_0[['CNT_CHILDREN','REGION_POPULATION_RELATIVE','AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE','DAYS_EMPLOYED','Age_Years']].corr()
corr_mat_Day_0

#### Let's visualize the correlation matrix using Heatmap

In [None]:
# Correlation heatmaps: defaulters (left) next to non-defaulters (right)
plt.figure(figsize=[24, 10])

heatmap_panels = (
    (corr_mat_Day_1, 'Heatmap for Defaulters'),
    (corr_mat_Day_0, 'Heatmap for Non-Defaulters'),
)
for slot, (corr_matrix, heading) in enumerate(heatmap_panels, start=1):
    plt.subplot(1,2,slot)
    heatmap_axes = sns.heatmap(data=corr_matrix, annot=True, cmap='YlOrRd')
    heatmap_axes.set(title=heading)

plt.tight_layout()
plt.show()

##### From the above heatmap, it is evident that there is a strong correlation between the Goods Price and the Loan Amount that was credited. Also, it can be seen that the higher the age, the higher the number of days people were employed.

##### On the other hand, There is weak correlation between the age and count of children and between the loan annuity and number of days applicant was employed

In [None]:
plt.figure(figsize=[20, 7])

plt.subplot(1,2,1)
sns.scatterplot(data=target_1, x='AMT_GOODS_PRICE', y='AMT_CREDIT').set(title='Scatter Plot [Target 1] Loan amount credited vs Price of the Goods')
plt.subplot(1,2,2)
sns.scatterplot(data=target_0, x='AMT_GOODS_PRICE', y='AMT_CREDIT').set(title='Scatter Plot [Target 0] Loan amount credited vs Price of the Goods')

plt.show()

##### From the above scatterplot, we can confirm that there is strong correlation between the Goods Price and Loan Amount that was credited.

In [None]:
#Creating a function to plot a boxplot with Education type and Family status
def box_plot_graph(df, y_value, title_graph):
    """Box plot of ``y_value`` per NAME_EDUCATION_TYPE, split by NAME_FAMILY_STATUS.

    ``df`` supplies the data and ``title_graph`` is used as the figure title.
    """
    boxplot_options = dict(
        data=df,
        x='NAME_EDUCATION_TYPE',
        y=y_value,
        hue='NAME_FAMILY_STATUS',
        orient='v',
    )
    plt.figure(figsize=(20,10))
    plt.xticks(rotation=45)
    sns.boxplot(**boxplot_options)
    plt.title(title_graph)
    plt.tight_layout()
    plt.show()

In [None]:
box_plot_graph(target_1, 'AMT_CREDIT', 'Distribution of credited loan amount with Education Status for Loan Defaulters')

##### Married people with higher education background have defaulted loans for higher distribution of credited loan amounts.

In [None]:
box_plot_graph(target_1, 'AMT_INCOME_TOTAL', 'Distribution of Income amount with Education Status for Loan Defaulters')

##### Married people with higher education background have defaulted loans for higher distribution of income amounts.

In [None]:
box_plot_graph(target_0, 'AMT_CREDIT', 'Distribution of credited loan amount with Education Status for Non Defaulters')

##### Married people with Academic degree have successful loans for higher distribution of credited loan amounts.

In [None]:
box_plot_graph(target_0, 'AMT_INCOME_TOTAL', 'Income Total vs Education Status  [For Non-Defaulter]')

##### Married people with Academic degree have successful loans for higher distribution of Income amounts.

##### People with Academic degree have most successful loans

# Merging the Datasets

In [None]:
#Let's import the previous applications dataset
data_prev_app = pd.read_csv('../input/loan-defaulter/previous_application.csv')

In [None]:
data_prv_app_rows, data_prv_app_cols = data_prev_app.shape
print('Shape of Previous Applications Data: Rows',data_prv_app_rows,'Columns:',data_prv_app_cols)

In [None]:
# Percentage of missing values per column of the previous-applications data,
# and the list of columns where it exceeds 30
# (note: a lower threshold than the 40 used for the application data)
prv_app_data_mis_vals = round(data_prev_app.isnull().sum()/data_prv_app_rows*100,2)
drop_cols_prv = prv_app_data_mis_vals[prv_app_data_mis_vals>30].index.tolist()

print('Number of Columns that have missing values percentage greater than 30%:', len(drop_cols_prv))

In [None]:
#Dropping the columns listed in drop_cols_prv (more than 30% missing values)
data_prev_app.drop(labels=drop_cols_prv,axis=1,inplace=True)

In [None]:
# Inner-join the application data with the previous applications on the customer id.
# NOTE(review): this joins `data_app`, not the outlier-filtered `app_data_final` - confirm that is intended.
final_df = data_app.merge(data_prev_app, how='inner', on='SK_ID_CURR')
final_df.columns

In [None]:
# Columns present in both inputs received the merge suffixes _x / _y;
# give them readable suffixes: _x -> _APP, _y -> _PRV
shared_cols = [
    'NAME_CONTRACT_TYPE',
    'AMT_CREDIT',
    'AMT_ANNUITY',
    'AMT_GOODS_PRICE',
    'WEEKDAY_APPR_PROCESS_START',
    'HOUR_APPR_PROCESS_START',
]
suffix_map = {'_x': '_APP', '_y': '_PRV'}
final_df.rename(
    columns={
        base + merge_suffix: base + new_suffix
        for base in shared_cols
        for merge_suffix, new_suffix in suffix_map.items()
    },
    inplace=True,
)

In [None]:
# Distribution of contract status in logarithmic scale

sns.set_style('whitegrid')
plt.figure(figsize=(15,30))
plt.xscale('log')
plt.title('Distribution of Contract Statuses with Loan Purposes')
ax = sns.countplot(data = final_df, y= 'NAME_CASH_LOAN_PURPOSE', 
                   order=final_df['NAME_CASH_LOAN_PURPOSE'].value_counts().index,hue = 'NAME_CONTRACT_STATUS',palette='CMRmap').set(xlabel='Number of Loan Applicantion',ylabel='Loan Purposes')

##### When customers do not communicate their purpose for the loan bank has approved low number of applications.

##### For all the purposes of taking loans, the bank has refused more applications than it approved.

##### The data related to purpose of taking loan is not clearly available, since there are highest number of application that were both approved and rejected for these purposes.

In [None]:
plt.figure(figsize=[12,6])
Payment_Type = final_df.NAME_PAYMENT_TYPE.value_counts(normalize=True).to_frame(name='Percentage of Loan Applications')
sns.barplot(data=Payment_Type, y=Payment_Type.index, x=Payment_Type['Percentage of Loan Applications'],orient='h').set(ylabel = 'Payment Types', title ='Payment Types Vs Percentage of Loan Applications')
plt.tight_layout()
plt.show()

##### Most of the previous loans were repaid by Cash deposits.

##### The second highest mode of  previous loan repayment has undisclosed methods (XNAs)

##### Non-Cash and Cashless contribute to extreme low percent of loan repayment methods

In [None]:
# Box plotting for Income amount

plt.figure(figsize=(18,12))
plt.xticks(rotation=45)
plt.yscale('log')
sns.boxplot(data =final_df, y='AMT_INCOME_TOTAL',x='NAME_CONTRACT_STATUS', hue ='CODE_GENDER',orient='v')
plt.title('Income amount vs Loan Status')
plt.tight_layout()
plt.show()

##### The loans that have been approved, cancelled, refused and unused have same median income amount for male customers.

##### The income of male customers is slightly higher than females in all the loan contract statuses.

##### The distribution of income amount is almost similar in all the loan contract statuses. 

In [None]:
plt.figure(figsize=(12,8))
plt.xscale('log')
plt.title(' Income Type Vs Previous Credit Amount for Loan')
sns.barplot(data=final_df, y=final_df.NAME_INCOME_TYPE, x=final_df.AMT_CREDIT_APP,color='Red',errwidth=0,label='Current')
sns.barplot(data=final_df, y=final_df.NAME_INCOME_TYPE, x=final_df.AMT_CREDIT_PRV,color='Yellow',errwidth=0,label='Previous')
plt.legend()
plt.tight_layout()

##### The loan amount that is credited in the current application is higher than that in previous application for all the income type groups

##### Highest loan amount is credited for people on maternity leave in both current and previous application.

In [None]:
#Creating a function to plot barplot
def bar_plot_graph(df, y_label, x_label, hue_value, title, orien, width, length, scale):
    """Draw a grouped seaborn bar plot of ``df`` on a logarithmic value axis.

    Parameters
    ----------
    df : DataFrame to plot. (Previously ignored: the global ``final_df`` was always used.)
    y_label, x_label : column names for the y and x axes.
    hue_value : column name used to split the bars.
    title : figure title.
    orien : orientation passed to seaborn ('v' or 'h').
    width, length : figure size.
    scale : 'xscale' puts the x axis on a log scale; any other value log-scales the y axis.
    """
    plt.figure(figsize=(width,length))
    plt.xticks(rotation=90)
    if scale == 'xscale':
        plt.xscale('log')
    else:
        plt.yscale('log')
    sns.barplot(data =df, y=y_label, hue=hue_value, x=x_label, errwidth=0, orient= orien)
    plt.title(title)
    plt.tight_layout()
    plt.show()

In [None]:
# The x axis is NAME_INCOME_TYPE, so the title names the income type (it previously said "Housing type")
bar_plot_graph(final_df, 'AMT_CREDIT_PRV', 'NAME_INCOME_TYPE', 'CODE_GENDER', 'Prev Credit amount vs Income type', 'v', 16, 12, 'yscale')

##### Females across all the income types were given higher loan amounts compared to males

##### Unemployed females had highest loan amount in their previous loan application.

In [None]:
bar_plot_graph(final_df,'AMT_APPLICATION','NAME_EDUCATION_TYPE', 'NAME_CONTRACT_STATUS', 'Application amount vs Education Status', 'v', 18,12, 'yscale')

##### For customers with an academic degree as their educational qualification, the loan amounts asked for in accepted and rejected applications are nearly the same. Also, people in this category have cancelled loans with the highest loan amounts.

##### Amount of loan offered by the bank and which were unused by customers are almost same across all education types.

In [None]:
# The y axis is OCCUPATION_TYPE, so the title names the occupation type (it previously said "Education Status")
bar_plot_graph(final_df,'OCCUPATION_TYPE', 'AMT_APPLICATION', 'NAME_CONTRACT_STATUS', 'Application amount vs Occupation Type', 'h', 18,25, 'xscale')

##### For the higher amount of loan asked by customers there is high chance of getting loan application refused, this is highest with customers on Managerial posts.

##### For the people with occupation type Accountants and Real estate agents the number of application that were accepted are high and highest for the customer on Managerial post.

In [None]:
#Let's create a correlation matrix on final dataset
corr_mat_final = final_df[['AMT_INCOME_TOTAL',
       'AMT_CREDIT_APP', 'AMT_ANNUITY_APP', 'AMT_GOODS_PRICE_APP','CNT_CHILDREN','CNT_FAM_MEMBERS','Age_Years','AMT_APPLICATION','AMT_ANNUITY_PRV',
        'AMT_CREDIT_PRV', 'AMT_GOODS_PRICE_PRV']].corr()
corr_mat_final

In [None]:
#Let's now plot the correlation matrix with heatmap
plt.figure(figsize=[24, 10])

# corr_mat_final is computed from the merged dataframe, which is not filtered on TARGET,
# so the title must not claim the data is restricted to defaulters.
sns.heatmap(annot= True, data=corr_mat_final, cmap='YlOrRd').set(title='Heatmap for Merged Current and Previous Applications')
plt.tight_layout()
plt.show()

##### From the above heatmap, it is evident that there is a strong correlation between the Goods Price and the Loan Amount credited in the current application. Also, there is a strong correlation between the Loan Amount asked for by the applicant and the Loan Amount credited in the previous application.

##### On the other hand, there is a weak correlation between the age and the count of children, and also between the goods price, loan amount credited and loan annuity of the current application and those of the previous application

# Conclusion


- Applicants having an academic degree have defaulted less number of loans compared to applicants from other educational backgrounds.
- People with higher secondary and married status have defaulted loans with higher loan amounts.
- Number of defaulters are higher in females than males.
- There is strong relation in loan amount credited by bank and loan amount asked by customer against goods.
- Also, the loan purpose ‘Repair’ has a higher number of both approved and rejected loans.
- Bank is expected to receive higher number of loan applications on Tuesdays than other days

## Thank You !