## 1. Import the libraries

In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 500) 
pd.set_option('display.max_rows', 500) 

# Supress Warnings

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns 

## 2. Importing data files

### 2.1 Backup of application and previous application file

In [None]:
# Pristine snapshots of both raw CSV files (read from the working directory),
# kept so the original data stays available after the working frames are modified below.
df_application_backup = pd.read_csv("application_data.csv")
df_Previous_backup = pd.read_csv("previous_application.csv")

### 2.2  importing Application file

In [None]:
# Working copy of the application data. Copying the backup frame loaded in section 2.1
# yields the same data without parsing the CSV file a second time.
df_application = df_application_backup.copy()
df_application.head()

### 2.3 importing Previous application

In [None]:
# Working copy of the previous-application data. Copying the backup frame loaded in
# section 2.1 yields the same data without parsing the CSV file a second time.
df_previous = df_Previous_backup.copy()
df_previous.head()

## 3. Checking structure of data in Application file

In [None]:
# (rows, columns) of the application frame as loaded.
print('Size of application_data', df_application.shape)

In [None]:
# All column names as a plain array.
df_application.columns.values

In [None]:
# Dtype and non-null count per column; verbose=True asks for the full per-column listing.
df_application.info(verbose= True)

In [None]:
# Summary statistics of the numeric columns.
df_application.describe()

## 4. Missing Values and Data Quality check

### 4.1 Percentage of missing values columnwise

In [None]:
# Percentage of missing values in every column (null count divided by row count, times 100).
100*df_application.isnull().sum()/len(df_application)


### 4.2 Removing columns

###### 4.2.1 Removing columns with a high missing percentage

In [None]:
# Percentage of missing values per column; every column at or above 50 % is dropped in place.
missing_pct = 100*df_application.isnull().sum()/len(df_application)
sparse_columns = df_application.columns[missing_pct >= 50]
df_application.drop(sparse_columns, axis=1, inplace=True)

###### 4.2.2 Removing columns which are not important for analysis

In [None]:
# Columns judged not important for this analysis, grouped by theme.
columns_not_needed = [
    # document flags
    'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6',
    'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11',
    'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16',
    'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21',
    'EXT_SOURCE_3',
    # building / housing descriptors
    'YEARS_BEGINEXPLUATATION_AVG', 'FLOORSMAX_AVG', 'YEARS_BEGINEXPLUATATION_MODE', 'FLOORSMAX_MODE',
    'YEARS_BEGINEXPLUATATION_MEDI', 'FLOORSMAX_MEDI', 'TOTALAREA_MODE', 'EMERGENCYSTATE_MODE',
    # social-circle counts
    'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE',
    'DEF_60_CNT_SOCIAL_CIRCLE',
    # credit-bureau enquiry counts
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
    'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
    # application timing and address-mismatch flags
    'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION',
    'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
    'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
]
df_application = df_application.drop(columns=columns_not_needed)
df_application.head()

In [None]:
# Check the remaining column names and the frame's shape after dropping columns
print(df_application.columns)
print('Size of application_data', df_application.shape)

#### 4.3 Missing Value Imputation

###### 4.3.1 Getting columns having missing values

In [None]:
# Names of the columns that still contain at least one missing value.
remaining_missing_pct = 100*df_application.isnull().sum()/len(df_application)
columns_with_missing = df_application.columns[remaining_missing_pct > 0]
print(columns_with_missing.tolist())

###### 4.3.2 Getting continuous and categorical columns

In [None]:
# Distinct-value count per column, ascending. Columns with at most 8 distinct values, plus
# OCCUPATION_TYPE and ORGANIZATION_TYPE, are treated as categorical; the rest as continuous.
df_application.nunique().sort_values()

In [None]:
# Box plots of three numeric columns to see how outlier-heavy each one is,
# as a basis for choosing the value used to impute their missing entries.

plt.figure(1,figsize=(15,8)) 

# create 1st subplot:
plt.subplot(2,2,1) 
plt.title('Annuity Amount')
sns.boxplot(y=df_application["AMT_ANNUITY"])

# create 2nd subplot:
# NOTE(review): the title reads 'Loan Amount' but the plotted column is AMT_GOODS_PRICE.
plt.subplot(2,2,2) 
plt.title('Loan Amount')
sns.boxplot(y=df_application["AMT_GOODS_PRICE"])

# create 3rd subplot:
plt.subplot(2,2,3)
plt.title('Family Members')
sns.boxplot(y=df_application["CNT_FAM_MEMBERS"])


# Mean and median of the three plotted variables
print (df_application["AMT_ANNUITY"].aggregate(['mean', 'median']))
print(df_application["AMT_GOODS_PRICE"].aggregate(['mean', 'median']))
print(df_application["CNT_FAM_MEMBERS"].aggregate(['mean', 'median']))


##### Imputation Method
- ###### The box plots show many outliers in AMT_ANNUITY and AMT_GOODS_PRICE and comparatively few in CNT_FAM_MEMBERS, hence missing values can be imputed as below:

- ###### AMT_ANNUITY has many outliers, so missing values can be imputed with the median value, which is 24903

- ###### AMT_GOODS_PRICE has many outliers, so missing values can be imputed with the median value, which is 450000

- ###### CNT_FAM_MEMBERS has fewer outliers, so missing values can be imputed with the mean value, which is 2 after rounding off.


###### 4.3.3 Categorical column

In [None]:
# NAME_TYPE_SUITE: category frequencies, followed by the mode as the imputation candidate.
plt.figure(figsize=(10, 5))
sns.countplot(data=df_application, x='NAME_TYPE_SUITE')

suite_mode = df_application['NAME_TYPE_SUITE'].mode()
print(suite_mode)

In [None]:
# OCCUPATION_TYPE: category frequencies (x labels rotated 90 degrees),
# followed by the mode as the imputation candidate.
plt.figure(figsize=(10,5))
plt.xticks(rotation=90)
sns.countplot(x='OCCUPATION_TYPE',data=df_application)

print(df_application.OCCUPATION_TYPE.mode())

##### Imputation Method:
- ###### For categorical columns we can impute missing values with the mode:

- ###### For NAME_TYPE_SUITE, missing values should be imputed with "Unaccompanied"

- ###### For OCCUPATION_TYPE, missing values should be imputed with "Laborers"

### 4.4 Checking Datatype and converting

###### 4.4.1 Checking Datatype

In [None]:
# Column dtypes before any conversion.
df_application.dtypes

###### 4.4.2 Converting datatype

In [None]:
# DAYS_REGISTRATION, CNT_FAM_MEMBERS and DAYS_LAST_PHONE_CHANGE hold whole numbers, so they
# are converted from float64 to int64.
df_application['DAYS_REGISTRATION'] = df_application['DAYS_REGISTRATION'].astype('int64')

# The cast to int64 raises on missing values, so for the two columns that contain them the
# incomplete rows (a very small share of the data, per the original notes) are dropped first.
for int_column in ('CNT_FAM_MEMBERS', 'DAYS_LAST_PHONE_CHANGE'):
    df_application.dropna(subset=[int_column], inplace=True)
    df_application[int_column] = df_application[int_column].astype('int64')

In [None]:
# Column dtypes after the conversion, to confirm the three columns are now int64.
df_application.dtypes

###### 4.4.3 Converting negative values of columns to positive values

In [None]:
# These day-count columns contain negative values; keep only their magnitude.
cols_negative = ['DAYS_BIRTH','DAYS_EMPLOYED','DAYS_REGISTRATION','DAYS_ID_PUBLISH','DAYS_LAST_PHONE_CHANGE']

# Vectorised form of round(abs(x)) applied to every element: one pass over the five columns
# instead of one Python-level call per cell. The final cast keeps the result integer-typed,
# as the element-wise round() did.
df_application[cols_negative] = df_application[cols_negative].abs().round().astype('int64')

In [None]:
# Spot-check the first rows after the sign conversion.
df_application.head()

### 4.5 Check for outliers

###### 4.5.1 AMT_INCOME_TOTAL

In [None]:

# AMT_INCOME_TOTAL: box plot, distribution, and box plot again without the extreme values.
INCOME_OUTLIER_CUTOFF = 800000  # values at or above this are treated as outliers

income = df_application["AMT_INCOME_TOTAL"]

plt.figure(figsize=(15,8))

# The series is passed by keyword: a bare positional argument is interpreted differently
# across seaborn releases (as `x` in older ones, as `data` in newer ones).
plt.subplot(2,2,1)
sns.boxplot(x=income)
plt.title('Total Income')

plt.subplot(2,2,2)
sns.distplot(income)
plt.title('Total Income distribution')

# After removing outliers
plt.subplot(2,2,3)
sns.boxplot(x=income[income < INCOME_OUTLIER_CUTOFF])
plt.title('Total Income (outliers removed)')
plt.show()


###### Outlier observation: 
`As per boxplot and distplot AMT_INCOME_TOTAL>800000 are outliers`

###### 4.5.2 AMT_CREDIT

In [None]:
# AMT_CREDIT: box plot, distribution, and box plot again without the extreme values.
CREDIT_OUTLIER_CUTOFF = 2300000  # values at or above this are treated as outliers

credit = df_application["AMT_CREDIT"]

plt.figure(figsize=(15,8))

# The series is passed by keyword: a bare positional argument is interpreted differently
# across seaborn releases (as `x` in older ones, as `data` in newer ones).
plt.subplot(2,2,1)
sns.boxplot(x=credit)
plt.title('Credit Amount')

plt.subplot(2,2,2)
sns.distplot(credit)
plt.title('Credit Amount distribution')

# After removing outliers
plt.subplot(2,2,3)
sns.boxplot(x=credit[credit < CREDIT_OUTLIER_CUTOFF])
plt.title('Credit Amount (outliers removed)')
plt.show()

###### Outlier observation: 
`As per boxplot and distplot AMT_CREDIT>2300000 are outliers`

###### 4.5.3 AMT_ANNUITY

In [None]:
# AMT_ANNUITY: box plot, distribution, and box plot again without the extreme values.
plt.figure(figsize=(15,8))
plt.subplot(2,2,1)
sns.boxplot(y=df_application["AMT_ANNUITY"])
plt.title('Annuity Amount')

# Impute null values with the median before drawing the distribution. The result is assigned
# back to the column: fillna(inplace=True) on a selected column is chained assignment, which
# newer pandas releases warn about and which leaves the frame unchanged under Copy-on-Write.
plt.subplot(2,2,2)
annuity_median = df_application['AMT_ANNUITY'].median()
df_application['AMT_ANNUITY'] = df_application['AMT_ANNUITY'].fillna(annuity_median)
sns.distplot(df_application["AMT_ANNUITY"])

# After removing outliers (drawn on the y axis like the first panel, for a like-for-like view)
plt.subplot(2,2,3)
sns.boxplot(y=df_application.loc[df_application["AMT_ANNUITY"]<80000, "AMT_ANNUITY"])
plt.title('Annuity Amount')
plt.show()

###### Outlier observation: 
`As per boxplot and distplot AMT_ANNUITY>80000 are outliers`

###### 4.5.4 AMT_GOODS_PRICE

In [None]:
# AMT_GOODS_PRICE: box plot, distribution, and box plot again without the extreme values.
plt.figure(figsize=(15,8))
plt.subplot(2,2,1)
sns.boxplot(y=df_application["AMT_GOODS_PRICE"])
plt.title('Loan Amount')

# Impute null values with the median before drawing the distribution. The result is assigned
# back to the column: fillna(inplace=True) on a selected column is chained assignment, which
# newer pandas releases warn about and which leaves the frame unchanged under Copy-on-Write.
plt.subplot(2,2,2)
goods_price_median = df_application['AMT_GOODS_PRICE'].median()
df_application['AMT_GOODS_PRICE'] = df_application['AMT_GOODS_PRICE'].fillna(goods_price_median)
sns.distplot(df_application["AMT_GOODS_PRICE"])

# After removing outliers (drawn on the y axis like the first panel, for a like-for-like view)
plt.subplot(2,2,3)
sns.boxplot(y=df_application.loc[df_application["AMT_GOODS_PRICE"]<1850000, "AMT_GOODS_PRICE"])
plt.title('Loan Amount')
plt.show()

###### Outlier observation: 
`As per boxplot and distplot AMT_GOODS_PRICE>1850000 are outliers`


###### 4.5.5 DAYS_BIRTH

In [None]:
# Horizontal box plot of DAYS_BIRTH to look for outliers.
sns.boxplot(x=df_application['DAYS_BIRTH'])

###### Outlier observation: 
`As per boxplot there are no outliers for DAYS_BIRTH`

### 4.6 Binning

###### 4.6.1 Binning  'AMT_INCOME_TOTAL' based on quantile

In [None]:
# Summary statistics (including the quartiles) of AMT_INCOME_TOTAL, used to choose the bins.
df_application['AMT_INCOME_TOTAL'].describe()

In [None]:
# Bin AMT_INCOME_TOTAL into its four quartiles. pd.qcut derives the bin edges from the data
# itself, so the bands stay correct if the rows change, instead of relying on edges copied
# by hand from an earlier describe() output.
df_application['Income_lable'] = pd.qcut(df_application['AMT_INCOME_TOTAL'], q=4, labels=['Poor','Low', 'medium', 'High'])

# Show only the source column and its new label for a few rows rather than the whole frame.
df_application[['AMT_INCOME_TOTAL', 'Income_lable']].head()

In [None]:
# Percentage of each TARGET value within every income band, drawn as grouped bars.
x = 'Income_lable'
y = 'TARGET'

# Normalised counts per group, scaled to percent and flattened into a tidy frame.
df1 = (
    df_application.groupby(x)[y]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

g = sns.catplot(data=df1, x=x, y='percent', hue=y, kind='bar', aspect=15/8.27)
g.ax.set_ylim(0, 100)
plt.xticks(rotation=90)

# Write each bar's height (two decimals) at the bar's left edge, level with its top.
for bar in g.ax.patches:
    bar_height = bar.get_height()
    g.ax.text(bar.get_x(), bar_height, str(bar_height.round(2)) + '%')

###### 4.6.2 Binning 'DAYS_BIRTH'

In [None]:
# Convert DAYS_BIRTH from days to whole years. The divisor was 356, a transposition of 365
# that inflated every age by about 2.5 %.
# NOTE: this overwrites DAYS_BIRTH in place, so the cell must be run exactly once per session.
DAYS_PER_YEAR = 365
df_application['DAYS_BIRTH'] = (df_application['DAYS_BIRTH'] / DAYS_PER_YEAR).astype('int64')

# Age bands: (0, 30] -> Young, (30, 50] -> Adult, (50, 70] -> Old.
df_application['Age'] = pd.cut(df_application['DAYS_BIRTH'], [0,30,50,70], labels=['Young','Adult','Old'])
df_application.Age

In [None]:
# Percentage of each TARGET value within every age band, drawn as grouped bars.
x = 'Age'
y = 'TARGET'

# Normalised counts per group, scaled to percent and flattened into a tidy frame.
df1 = (
    df_application.groupby(x)[y]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

g = sns.catplot(data=df1, x=x, y='percent', hue=y, kind='bar', aspect=15/8.27)
g.ax.set_ylim(0, 100)
plt.xticks(rotation=90)

# Write each bar's height (two decimals) at the bar's left edge, level with its top.
for bar in g.ax.patches:
    bar_height = bar.get_height()
    g.ax.text(bar.get_x(), bar_height, str(bar_height.round(2)) + '%')
df1

## 5. Analysis

### 5.1 Imbalance percentage

In [None]:
# Pie chart of the TARGET class balance: one slice per TARGET value, labelled with that
# value and annotated with its percentage share (two decimals).
temp = df_application["TARGET"].value_counts()
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
# Equal axis scaling so the pie is drawn as a circle.
ax.axis('equal')
ax.pie(temp.values, labels= temp.index,autopct='%1.2f%%')
plt.title('Loan Repaid or not')
plt.show()

`The data is highly imbalanced: non-defaulters make up 91.93% of the applications and defaulters only 8.07%`

### 5.2 Dividing dataset in two dataframe 

- ###### With Target Value= 1

- ###### With Target value= 0

In [None]:
# Split the applications by outcome: df_0 holds the TARGET == 0 rows, df_1 the TARGET == 1 rows.
df_0= df_application[df_application['TARGET']==0]
df_1 = df_application[df_application['TARGET']==1]

In [None]:
# First rows of the TARGET == 0 subset.
df_0.head()

In [None]:
# First rows of the TARGET == 1 subset.
df_1.head()

### 5.3 Univariate analysis for Categorical variables with respect to TARGET

###### 5.3.1 Income sources of applicants in terms of whether the loan is repaid or not, in %

In [None]:

# Percentage of each TARGET value within every income type, drawn as grouped bars.
x = 'NAME_INCOME_TYPE'
y = 'TARGET'

# Normalised counts per group, scaled to percent and flattened into a tidy frame.
df1 = (
    df_application.groupby(x)[y]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

g = sns.catplot(data=df1, x=x, y='percent', hue=y, kind='bar', aspect=15/8.27)
g.ax.set_ylim(0, 100)

plt.xticks(rotation=90)
plt.title('Income Type')
plt.show()

# The underlying percentages, shown as a table below the chart.
df1


###### Observation:
`1) 100% of businessmen and students are paying their loan on time, i.e. they do not have any payment difficulties`

`2) 40% of applicants on maternity leave and 36.36% of unemployed applicants are defaulters or have payment difficulties`

`3) Most of the people who are working as a commercial associate, pensioner or state servant, or who belong to the working class, are paying their installments on time`

###### 5.3.2 Family Status of Applicant's in terms of loan is repayed or not in %

In [None]:
# Percentage of each TARGET value within every family status, drawn as grouped bars.
x = 'NAME_FAMILY_STATUS'
y = 'TARGET'

# Normalised counts per group, scaled to percent and flattened into a tidy frame.
df1 = (
    df_application.groupby(x)[y]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

g = sns.catplot(data=df1, x=x, y='percent', hue=y, kind='bar', aspect=15/8.27)
g.ax.set_ylim(0, 100)
plt.xticks(rotation=90)

# Write each bar's height (two decimals) at the bar's left edge, level with its top.
for bar in g.ax.patches:
    bar_height = bar.get_height()
    g.ax.text(bar.get_x(), bar_height, str(bar_height.round(2)) + '%')

###### Observation:
`From the graph we can observe that the results are almost the same for every group, so we cannot get a defaulter pattern from the family status column`

###### 5.3.3 Occupation of Applicant's in terms of loan is repayed or not in %

In [None]:
# Percentage of each TARGET value within every occupation type, drawn as grouped bars.
x = 'OCCUPATION_TYPE'
y = 'TARGET'

# Normalised counts per group, scaled to percent and flattened into a tidy frame.
df1 = (
    df_application.groupby(x)[y]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

g = sns.catplot(data=df1, x=x, y='percent', hue=y, kind='bar', aspect=15/8.27)
g.ax.set_ylim(0, 100)
plt.xticks(rotation=90)

# Write each bar's height (two decimals) at the bar's left edge, level with its top.
for bar in g.ax.patches:
    bar_height = bar.get_height()
    g.ax.text(bar.get_x(), bar_height, str(bar_height.round(2)) + '%')

###### Observation:
-   ##### Let us take 10% as the threshold: below it the bank can accept the risk, above it giving a loan is risky
-   ##### Occupations whose applicants are facing payment issues are Cooking staff, Drivers, Laborers, Low-skill Laborers, Security staff and Waiters/barmen staff
-   ##### All other occupation types are able to pay on time
-   ##### People working as Accountants are repaying their loan amount on time

###### 5.3.4 Gender of applicants in terms of loan paid or not

In [None]:
# Percentage of each TARGET value within every gender code, drawn as grouped bars.
x = 'CODE_GENDER'
y = 'TARGET'

# Normalised counts per group, scaled to percent and flattened into a tidy frame.
df1 = (
    df_application.groupby(x)[y]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

g = sns.catplot(data=df1, x=x, y='percent', hue=y, kind='bar', aspect=15/8.27)
g.ax.set_ylim(0, 100)

plt.xticks(rotation=90)
plt.show()
# The underlying percentages, shown as a table below the chart.
df1

###### Observation:
-   ##### XNA applicants are not facing any issues in payment: 100% of them are paying their loan
-   ##### 93% of Male applicants are not having issues in paying their loan
-   ##### Hence Male and XNA applicants are able to pay the loan amount.

###### 5.3.5 Type of loan applied in terms of loan is paid or not

In [None]:
# Percentage of each TARGET value within every contract type, drawn as grouped bars.
x = 'NAME_CONTRACT_TYPE'
y = 'TARGET'

# Normalised counts per group, scaled to percent and flattened into a tidy frame.
df1 = (
    df_application.groupby(x)[y]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

g = sns.catplot(data=df1, x=x, y='percent', hue=y, kind='bar', aspect=15/8.27)
g.ax.set_ylim(0, 100)
plt.xticks(rotation=90)

# Write each bar's height (two decimals) at the bar's left edge, level with its top.
for bar in g.ax.patches:
    bar_height = bar.get_height()
    g.ax.text(bar.get_x(), bar_height, str(bar_height.round(2)) + '%')

###### Observation:
-   ##### 94.52 % of Revolving loans and 91.65% of Cash loan is getting paid without any difficulties. 
-   ##### Revolving Loan are getting paid more than cash loans 

###### 5.3.6 Education of Applicant's in terms of loan is paid or not

In [None]:
# Percentage of each TARGET value within every education level, drawn as grouped bars.
x = 'NAME_EDUCATION_TYPE'
y = 'TARGET'

# Normalised counts per group, scaled to percent and flattened into a tidy frame.
df1 = (
    df_application.groupby(x)[y]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

g = sns.catplot(data=df1, x=x, y='percent', hue=y, kind='bar', aspect=15/8.27)
g.ax.set_ylim(0, 100)
plt.xticks(rotation=90)

# Write each bar's height (two decimals) at the bar's left edge, level with its top.
for bar in g.ax.patches:
    bar_height = bar.get_height()
    g.ax.text(bar.get_x(), bar_height, str(bar_height.round(2)) + '%')

###### Observation:
-   ##### People having lower secondary education are facing issues in payment
-   ##### People having an academic degree are not facing much issue in payment: only 1.83% are facing issues in paying their loan.

###### 5.3.7 Owning a car or realty in terms of loan paid or not

In [None]:
# The same chart twice: TARGET percentages for car ownership, then for realty ownership.
y = 'TARGET'
for x in ('FLAG_OWN_CAR', 'FLAG_OWN_REALTY'):
    # Normalised counts per group, scaled to percent and flattened into a tidy frame.
    df1 = (
        df_application.groupby(x)[y]
        .value_counts(normalize=True)
        .mul(100)
        .rename('percent')
        .reset_index()
    )

    g = sns.catplot(data=df1, x=x, y='percent', hue=y, kind='bar', aspect=20/8.27)
    g.ax.set_ylim(0, 100)
    plt.xticks(rotation=90)

    # Write each bar's height (two decimals) at the bar's left edge, level with its top.
    for bar in g.ax.patches:
        bar_height = bar.get_height()
        g.ax.text(bar.get_x(), bar_height, str(bar_height.round(2)) + '%')

###### Observation:
-   ##### People owning car or Realty are having around same ratio in case of facing issue and paying back loan amount

### 5.4 Multivariate analysis for Categorical Variables with respect to TARGET

### 5.4.1 The count graphs below are based on the following columns
-    `Age groups`
-    `Family Status`
-    `Income Type`
-    `Target (able to pay loan emi on time)`


In [None]:
# One figure per selected income type: counts of TARGET by family status, one facet per age band.
income_type=['Maternity leave','Unemployed','Working','Commercial associate']

for i in income_type:
    g = sns.catplot(x='NAME_FAMILY_STATUS', hue='TARGET', col= 'Age',col_wrap=3,kind='count',data=df_application[df_application['NAME_INCOME_TYPE']==i],aspect=1)
    # plt.xticks only affects the current (last) facet, so rotate the labels on every facet.
    for facet_ax in g.axes.flat:
        facet_ax.tick_params(axis='x', labelrotation=90)
    # Figure-level title so each figure states which income type it shows.
    g.fig.suptitle(i, y=1.02)
    plt.show()
    

#### The count graphs above are based on the following columns
-    `Age groups`
-    `Family Status`
-    `Income Type`
-    `Target (able to pay loan emi on time)`

#### Observations
1) Married people with income type 'Maternity leave' or 'Unemployed' and in the age group 30-50 (Adult) are risky to give a loan to, as they have a high number of issues in loan repayment

2) Unemployed people with relationship status single, widow, separated or civil marriage and under the age of 50 pay their loan EMI on time

3) Mostly working married couples aged 30-50 apply for loans, and the probability of getting a result is high

### 5.4.2 Multi varient analysis based on OCCUPATION_TYPE ratio based on gender

#### The count graphs below are based on the following 5 columns
-    `Age groups`
-    `Family Status`
-    `OCCUPATION_TYPE`
-    `Gender`
-    `Target (able to pay loan emi on time)`

#### 5.4.2.1 for gender = Male 

In [None]:
# One figure per occupation for gender 'M': counts of TARGET by family status, one facet per
# age band. Despite its name, `income_type` holds the occupation types.
# Missing occupations are removed with dropna() rather than by deleting a fixed position,
# which silently drops a real occupation whenever NaN is not the fifth distinct value.
income_type = df_application['OCCUPATION_TYPE'].dropna().unique()
genders=['M']
for i in income_type:
    for gender in genders:
        # Combine both conditions into one mask instead of chaining two boolean selections.
        row_mask = (df_application['OCCUPATION_TYPE']==i) & (df_application['CODE_GENDER']==gender)
        temp_dataframe = df_application[row_mask]
        if len(temp_dataframe) > 0:
            g = sns.catplot(x='NAME_FAMILY_STATUS',margin_titles=True , hue='TARGET', col= 'Age',col_wrap=3,orient='v' ,kind='count',data=temp_dataframe,aspect=1)
            # plt.title / plt.xticks only touch the last facet: title the whole figure and
            # rotate the labels on every facet instead.
            g.fig.suptitle(i + '_' + gender, y=1.02)
            for facet_ax in g.axes.flat:
                facet_ax.tick_params(axis='x', labelrotation=90)
            plt.show()
    

### Observation:
-    Married males working in `Realty or as Low-skill Laborers` have a high probability of `having issues in returning the loan amount`
-    Males working as `HR professionals` and `aged more than 30` have a higher chance of `paying the loan on time`

#### 5.4.2.2 for gender = Female 

In [None]:
# One figure per occupation for gender 'F': counts of TARGET by family status, one facet per
# age band. Despite its name, `income_type` holds the occupation types.
# Missing occupations are removed with dropna() rather than by deleting a fixed position,
# which silently drops a real occupation whenever NaN is not the fifth distinct value.
income_type = df_application['OCCUPATION_TYPE'].dropna().unique()
genders=['F']
for i in income_type:
    for gender in genders:
        # Combine both conditions into one mask instead of chaining two boolean selections.
        row_mask = (df_application['OCCUPATION_TYPE']==i) & (df_application['CODE_GENDER']==gender)
        temp_dataframe = df_application[row_mask]
        if len(temp_dataframe) > 0:
            g = sns.catplot(x='NAME_FAMILY_STATUS',margin_titles=True , hue='TARGET', col= 'Age',col_wrap=3,orient='v' ,kind='count',data=temp_dataframe,aspect=1)
            # plt.title / plt.xticks only touch the last facet: title the whole figure and
            # rotate the labels on every facet instead.
            g.fig.suptitle(i + '_' + gender, y=1.02)
            for facet_ax in g.axes.flat:
                facet_ax.tick_params(axis='x', labelrotation=90)
            plt.show()
    

#### Observations:
- ###### Married women aged 30-50 (Adult) and working as `Low-skill Laborers, Waiters or Security Staff` have a high chance of `being a risk or having issues with paying on time`

#### 5.4.2.3 for gender = XNA 

In [None]:
# One figure per occupation for gender 'XNA': counts of TARGET by family status, one facet
# per age band. Despite its name, `income_type` holds the occupation types.
# Missing occupations are removed with dropna() rather than by deleting a fixed position,
# which silently drops a real occupation whenever NaN is not the fifth distinct value.
income_type = df_application['OCCUPATION_TYPE'].dropna().unique()
genders=['XNA']
for i in income_type:
    for gender in genders:
        # Combine both conditions into one mask instead of chaining two boolean selections.
        row_mask = (df_application['OCCUPATION_TYPE']==i) & (df_application['CODE_GENDER']==gender)
        temp_dataframe = df_application[row_mask]
        if len(temp_dataframe) > 0:
            g = sns.catplot(x='NAME_FAMILY_STATUS',margin_titles=True , hue='TARGET', col= 'Age',col_wrap=3,orient='v' ,kind='count',data=temp_dataframe,aspect=1)
            # plt.title / plt.xticks only touch the last facet: title the whole figure and
            # rotate the labels on every facet instead.
            g.fig.suptitle(i + '_' + gender, y=1.02)
            for facet_ax in g.axes.flat:
                facet_ax.tick_params(axis='x', labelrotation=90)
            plt.show()

### Observation:
- ###### People who haven't defined their gender are more likely to pay their loan on time

### 5.4.2.4 analysis based in Income_lable

In [None]:
# One figure per income band: counts of TARGET by family status, one facet per age band.
# Rows without a band are excluded up front so no 'nan' label reaches the plotting call.
income_label = df_application.Income_lable.dropna().astype('str').unique()
for i in income_label:
    temp_dataframe = df_application[df_application.Income_lable==i]
    # Same guard as the sibling cells: nothing to draw for an empty subset.
    if len(temp_dataframe) > 0:
        g = sns.catplot(x='NAME_FAMILY_STATUS', hue='TARGET', col= 'Age',col_wrap=3,orient='v' ,kind='count',data=temp_dataframe,aspect=1)
        # plt.title / plt.xticks only touch the last facet: title the whole figure and
        # rotate the labels on every facet instead.
        g.fig.suptitle(i, y=1.02)
        for facet_ax in g.axes.flat:
            facet_ax.tick_params(axis='x', labelrotation=90)
        plt.show()

### 5.4.2.5 The multivariate analysis below is based on the following columns
-    'Age Group'
-    'Gender'
-    'REGION_RATING_CLIENT'
-    'Family Status'

We will try to figure out which customer has issue in payment based on the above values

In [None]:
# Analysis for Male customers: one figure per REGION_RATING_CLIENT value, showing counts of
# TARGET by family status with one facet per age band.
REGION_RATING_CLIENT=df_application['REGION_RATING_CLIENT'].unique()
genders=['M']
for i in REGION_RATING_CLIENT:
    for gender in genders:
        # Combine both conditions into one mask instead of chaining two boolean selections.
        row_mask = (df_application['REGION_RATING_CLIENT']==i) & (df_application['CODE_GENDER']==gender)
        temp_dataframe = df_application[row_mask]
        if len(temp_dataframe) > 0:
            g = sns.catplot(x='NAME_FAMILY_STATUS',margin_titles=True , hue='TARGET', col= 'Age',col_wrap=3,kind='count',data=temp_dataframe,aspect=1)
            # plt.title / plt.xticks only touch the last facet: title the whole figure and
            # rotate the labels on every facet instead.
            g.fig.suptitle(str(i) + '_' + gender, y=1.02)
            for facet_ax in g.axes.flat:
                facet_ax.tick_params(axis='x', labelrotation=90)
            plt.show()

### Observations

-   `Male customers aged 30-50 (Adult) from Region id 1 are more likely to pay their EMI on time`
-   `Married male customers aged 30-50 (Adult) from Regions 2 and 3 are more likely to be defaulters and should be given loans at a higher rate of interest`

In [None]:
# Analysis for Female customers: one figure per REGION_RATING_CLIENT value, showing counts of
# TARGET by family status with one facet per age band.

REGION_RATING_CLIENT=df_application['REGION_RATING_CLIENT'].unique()
genders=['F']
for i in REGION_RATING_CLIENT:
    for gender in genders:
        # Combine both conditions into one mask instead of chaining two boolean selections.
        row_mask = (df_application['REGION_RATING_CLIENT']==i) & (df_application['CODE_GENDER']==gender)
        temp_dataframe = df_application[row_mask]
        if len(temp_dataframe) > 0:
            g = sns.catplot(x='NAME_FAMILY_STATUS',margin_titles=True , hue='TARGET', col= 'Age',col_wrap=3,kind='count',data=temp_dataframe,aspect=1)
            # plt.title / plt.xticks only touch the last facet: title the whole figure and
            # rotate the labels on every facet instead.
            g.fig.suptitle(str(i) + '_' + gender, y=1.02)
            for facet_ax in g.axes.flat:
                facet_ax.tick_params(axis='x', labelrotation=90)
            plt.show()

### Observations

-   `Female customers aged 30-50 (Adult) from Region id 1 are more likely to pay their EMI on time`
-   `Married female customers aged 30-50 (Adult) from Regions 2 and 3 are more likely to be defaulters and should be given loans at a higher rate of interest`

In [None]:
# Analysis for XNA customers: one figure per REGION_RATING_CLIENT value, showing counts of
# TARGET by family status with one facet per age band.

REGION_RATING_CLIENT=df_application['REGION_RATING_CLIENT'].unique()
genders=['XNA']
for i in REGION_RATING_CLIENT:
    for gender in genders:
        # Combine both conditions into one mask instead of chaining two boolean selections.
        row_mask = (df_application['REGION_RATING_CLIENT']==i) & (df_application['CODE_GENDER']==gender)
        temp_dataframe = df_application[row_mask]
        if len(temp_dataframe) > 0:
            g = sns.catplot(x='NAME_FAMILY_STATUS',margin_titles=True , hue='TARGET', col= 'Age',col_wrap=3,kind='count',data=temp_dataframe,aspect=1)
            # plt.title / plt.xticks only touch the last facet: title the whole figure and
            # rotate the labels on every facet instead.
            g.fig.suptitle(str(i) + '_' + gender, y=1.02)
            for facet_ax in g.axes.flat:
                facet_ax.tick_params(axis='x', labelrotation=90)
            plt.show()

### Observations

-   `Customers who haven't defined their gender are more likely to pay their loan on time`


### 5.4.2.6 REGION_RATING_CLIENT_W_CITY Vs REGION_RATING_CLIENT vs Age

In [None]:
# One figure per REGION_RATING_CLIENT_W_CITY value: counts of TARGET by family status, one
# facet per age band.
REGION_RATING_CLIENT_W_CITY=df_application['REGION_RATING_CLIENT_W_CITY'].unique()

for i in REGION_RATING_CLIENT_W_CITY:
    # Filter on the column whose values are being iterated. Filtering on REGION_RATING_CLIENT
    # here only reproduced the plots of the previous section.
    temp_dataframe = df_application[df_application['REGION_RATING_CLIENT_W_CITY']==i]
    if len(temp_dataframe) > 0:
        g = sns.catplot(x='NAME_FAMILY_STATUS',margin_titles=True , hue='TARGET', col= 'Age',col_wrap=3,kind='count',data=temp_dataframe,aspect=1)
        # plt.title / plt.xticks only touch the last facet: title the whole figure and
        # rotate the labels on every facet instead.
        g.fig.suptitle(str(i), y=1.02)
        for facet_ax in g.axes.flat:
            facet_ax.tick_params(axis='x', labelrotation=90)
        plt.show()

###### Observation
- ###### Applicants from region 1 are less likely to have issues while paying Loan

### 5.5 Correlation

###### 5.5.1 For defaulters (TARGET=1)

In [None]:
# Ten largest absolute pairwise correlations among the numeric columns of df_1 (TARGET == 1).
var_numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_defaulters = df_1.select_dtypes(include=var_numerics)

# Flatten the absolute correlation matrix into one row per (feature, feature) pair.
corr_1 = numeric_defaulters.corr().abs().unstack().reset_index()
corr_1.columns = ['FEATURE_1', 'FEATURE_2', 'CORRELATION']

# Discard the mirrored copy of each pair (A,B vs B,A) and every feature paired with itself.
is_mirrored_pair = corr_1[['FEATURE_1', 'FEATURE_2']].apply(frozenset, axis=1).duplicated()
is_self_pair = corr_1['FEATURE_1'] == corr_1['FEATURE_2']
dup = is_mirrored_pair | is_self_pair
corr_1 = corr_1[~dup]
print(corr_1.nlargest(10, ['CORRELATION']))

###### 5.5.2 For non-defaulters (TARGET=0)

In [None]:
# Ten largest absolute pairwise correlations among the numeric columns of df_0 (TARGET == 0).
# NOTE: the result is stored under the name corr_1 again, replacing the defaulter table.
var_numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_non_defaulters = df_0.select_dtypes(include=var_numerics)

# Flatten the absolute correlation matrix into one row per (feature, feature) pair.
corr_1 = numeric_non_defaulters.corr().abs().unstack().reset_index()
corr_1.columns = ['FEATURE_1', 'FEATURE_2', 'CORRELATION']

# Discard the mirrored copy of each pair (A,B vs B,A) and every feature paired with itself.
is_mirrored_pair = corr_1[['FEATURE_1', 'FEATURE_2']].apply(frozenset, axis=1).duplicated()
is_self_pair = corr_1['FEATURE_1'] == corr_1['FEATURE_2']
dup = is_mirrored_pair | is_self_pair
corr_1 = corr_1[~dup]
print(corr_1.nlargest(10, ['CORRELATION']))

###### Observation:
-   ##### The correlations between DAYS_EMPLOYED & FLAG_EMP_PHONE, AMT_CREDIT & AMT_GOODS_PRICE, REGION_RATING_CLIENT & REGION_RATING_CLIENT_W_CITY, CNT_CHILDREN & CNT_FAM_MEMBERS, AMT_ANNUITY & AMT_GOODS_PRICE and AMT_CREDIT & AMT_ANNUITY are around the same for both TARGET values
-   ##### The correlations of the other variable pairs differ between the two TARGET values


###### 5.5.3 Visual INSIGHT for top 10 corelation in terms of non Defaulter

In [None]:
# Scatter plots, for the TARGET == 0 subset, of the ten column pairs examined in this section.
# The pairs are listed once and plotted in a loop instead of ten copy-pasted stanzas.
scatter_pairs = [
    ('DAYS_EMPLOYED', 'FLAG_EMP_PHONE'),
    ('AMT_CREDIT', 'AMT_GOODS_PRICE'),
    ('REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY'),
    ('CNT_CHILDREN', 'CNT_FAM_MEMBERS'),
    ('AMT_ANNUITY', 'AMT_GOODS_PRICE'),
    ('AMT_CREDIT', 'AMT_ANNUITY'),
    ('DAYS_BIRTH', 'DAYS_EMPLOYED'),
    ('DAYS_BIRTH', 'FLAG_EMP_PHONE'),
    ('REGION_POPULATION_RELATIVE', 'REGION_RATING_CLIENT'),
    ('REGION_POPULATION_RELATIVE', 'REGION_RATING_CLIENT_W_CITY'),
]

plt.figure(figsize = (15, 30))
for panel, (x_col, y_col) in enumerate(scatter_pairs, start=1):
    plt.subplot(5, 2, panel)
    sns.scatterplot(x=x_col, y=y_col, data=df_0)
    # Rotate the x tick labels on every panel (the first panel used to be skipped).
    plt.xticks(rotation=90)

# Explicit show() so the cell renders the figure without echoing the last call's return value.
plt.show()


###### Observation:
-   ##### Among the top 10 correlations, the 4 pairs below show the clearest relationship (note that correlation alone does not establish causation)
-   ##### AMT_CREDIT & AMT_GOODS_PRICE are directly proportional
-   ##### CNT_CHILDREN & CNT_FAM_MEMBERS are directly proportional
-   ##### AMT_ANNUITY & AMT_GOODS_PRICE are directly proportional
-   ##### AMT_CREDIT & AMT_ANNUITY are directly proportional

###### 5.5.4 Visual INSIGHT for top 10 corelation in terms of Defaulter

In [None]:
# Scatter plots, for the TARGET == 1 subset, of the ten column pairs examined in this section.
# The pairs are listed once and plotted in a loop instead of ten copy-pasted stanzas.
scatter_pairs = [
    ('DAYS_EMPLOYED', 'FLAG_EMP_PHONE'),
    ('AMT_CREDIT', 'AMT_GOODS_PRICE'),
    ('REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY'),
    ('CNT_CHILDREN', 'CNT_FAM_MEMBERS'),
    ('AMT_ANNUITY', 'AMT_GOODS_PRICE'),
    ('AMT_CREDIT', 'AMT_ANNUITY'),
    ('DAYS_BIRTH', 'DAYS_EMPLOYED'),
    ('DAYS_BIRTH', 'FLAG_EMP_PHONE'),
    ('REGION_POPULATION_RELATIVE', 'REGION_RATING_CLIENT'),
    ('REGION_POPULATION_RELATIVE', 'REGION_RATING_CLIENT_W_CITY'),
]

plt.figure(figsize = (15, 30))
for panel, (x_col, y_col) in enumerate(scatter_pairs, start=1):
    plt.subplot(5, 2, panel)
    sns.scatterplot(x=x_col, y=y_col, data=df_1)
    # Rotate the x tick labels on every panel (the first panel used to be skipped).
    plt.xticks(rotation=90)

# Explicit show() so the cell renders the figure without echoing the last call's return value.
plt.show()


###### Observation:
-   ##### Among the top 10 correlations, the 4 pairs below show the clearest relationship (note that correlation alone does not establish causation)
-   ##### AMT_CREDIT & AMT_GOODS_PRICE are directly proportional
-   ##### CNT_CHILDREN & CNT_FAM_MEMBERS are directly proportional
-   ##### AMT_ANNUITY & AMT_GOODS_PRICE are directly proportional
-   ##### AMT_CREDIT & AMT_ANNUITY are directly proportional

### 5.6 Numerical Univariate analysis

###### 5.6.1 Distribution of AMT_CREDIT

In [None]:
# Density of AMT_CREDIT for defaulters (df_1) and non-defaulters (df_0), overlaid on one axes.
# A single axes is created and passed explicitly; creating a 1x2 grid and then calling
# plt.subplot(2,1,1) left unused axes behind the plot.
f, ax = plt.subplots(figsize=(15,8))
sns.distplot(df_1['AMT_CREDIT'].dropna(), hist = False, label = "Defaulter", color = 'red', ax=ax)
sns.distplot(df_0['AMT_CREDIT'].dropna(), hist = False, label = "Non Defaulter", color = 'green', ax=ax)
ax.set_title('AMT_CREDIT distribution by TARGET')
# Draw the legend explicitly so the two labelled curves can be told apart.
ax.legend()
plt.show()


##### Observation:
- ###### AMT_CREDIT for both defaulter and non defaulter  lies between same interval

###### 5.6.2 Distribution of AMT_ANNUITY

In [None]:
# Density of AMT_ANNUITY for defaulters (df_1) and non-defaulters (df_0), overlaid on one axes.
# A single axes is created and passed explicitly; creating a 1x2 grid and then calling
# plt.subplot(2,1,1) left unused axes behind the plot.
f, ax = plt.subplots(figsize=(15,8))
sns.distplot(df_1['AMT_ANNUITY'].dropna(), hist = False, label = "Defaulter", color = 'red', ax=ax)
sns.distplot(df_0['AMT_ANNUITY'].dropna(), hist = False, label = "Non Defaulter", color = 'green', ax=ax)
ax.set_title('AMT_ANNUITY distribution by TARGET')
# Draw the legend explicitly so the two labelled curves can be told apart.
ax.legend()
plt.show()


##### Observation:
- ###### AMT_ANNUITY for both defaulter and non defaulter  lies between same interval

###### 5.6.3 Distribution of AMT_GOODS_PRICE with respect to TARGET

In [None]:
# Density of AMT_GOODS_PRICE for defaulters (df_1) and non-defaulters (df_0), overlaid on one
# axes. A single axes is created and passed explicitly; creating a 1x2 grid and then calling
# plt.subplot(2,1,1) left unused axes behind the plot.
f, ax = plt.subplots(figsize=(15,8))
sns.distplot(df_1['AMT_GOODS_PRICE'].dropna(), hist = False, label = "Defaulter", color = 'red', ax=ax)
sns.distplot(df_0['AMT_GOODS_PRICE'].dropna(), hist = False, label = "Non Defaulter", color = 'green', ax=ax)
ax.set_title('AMT_GOODS_PRICE distribution by TARGET')
# Draw the legend explicitly so the two labelled curves can be told apart.
ax.legend()
plt.show()

##### Observation:
- ###### AMT_GOODS_PRICE for both defaulter and non defaulter  lies between same interval

###### 5.6.4 Distribution of Age with respect to TARGET

In [None]:
# Density of DAYS_BIRTH (converted to years in section 4.6) for defaulters (df_1) and
# non-defaulters (df_0), overlaid on one axes. A single axes is created and passed
# explicitly; creating a 1x2 grid and then calling plt.subplot(2,1,1) left unused axes
# behind the plot.
f, ax = plt.subplots(figsize=(15,8))
sns.distplot(df_1['DAYS_BIRTH'].dropna(), hist = False, label = "Defaulter", color = 'red', ax=ax)
sns.distplot(df_0['DAYS_BIRTH'].dropna(), hist = False, label = "Non Defaulter", color = 'green', ax=ax)
ax.set_title('Age distribution by TARGET')
# Draw the legend explicitly so the two labelled curves can be told apart.
ax.legend()
plt.show()


##### Observation:
- ###### Age of most of the people for defaulter lies between 21 to 45
- ###### Age of most of the people for non defaulter lies between 21 to 60

### 5.7 Numerical Bivariate analysis

###### TARGET v/s DAYS_BIRTH

In [None]:
# Box plot of DAYS_BIRTH for each TARGET value.
sns.boxplot(x='TARGET', y='DAYS_BIRTH', data=df_application)
plt.show()

###### Observation
- ###### The median age (the centre line of the box plot) is higher for non-defaulters than for defaulters
- ###### Younger applicants face more payment difficulties, compared to older ones

###### EXT_SOURCE_2 v/s TARGET

In [None]:
# Box plot of EXT_SOURCE_2 for each TARGET value.
sns.boxplot(x='TARGET', y = 'EXT_SOURCE_2', data=df_application)
plt.show()

###### Observation
- ######  Applicants with higher score have less payment difficulties

###### REGION_POPULATION_RELATIVE v/s TARGET

In [None]:
# Box plot of REGION_POPULATION_RELATIVE for each TARGET value.
sns.boxplot(x='TARGET', y='REGION_POPULATION_RELATIVE', data=df_application)
plt.show()

###### Observation
- ######  The data for clients with no payment difficulties is spread towards the more highly populated regions.

## 6. Analysing previous application data

 ### 6.1 Checking structure of data in Previous Application

In [None]:
# (rows, columns) of the previous-application frame. The printed label now names the frame
# that is actually measured; it used to say application_data.
print('Size of previous_application', df_previous.shape)

In [None]:
# Dtype and non-null count for every column of the previous-application frame
df_previous.info()

In [None]:
# Summary statistics of the numeric columns of the previous-application frame.
df_previous.describe()

### 6.2 Missing Values for Previous Application data

In [None]:
# Percentage of missing values in every column of the previous-application frame
100*df_previous.isnull().sum()/len(df_previous)

In [None]:
# Drop, in place, every previous-application column with at least 20 % missing values.
previous_missing_pct = 100*df_previous.isnull().sum()/len(df_previous)
previous_sparse_columns = df_previous.columns[previous_missing_pct >= 20]
df_previous.drop(previous_sparse_columns, axis=1, inplace=True)

In [None]:
# Columns remaining after the drop.
df_previous.columns

In [None]:
# (rows, columns) after the drop.
df_previous.shape

In [None]:
# Missing-value percentage per remaining column.
100*df_previous.isnull().sum()/len(df_previous)

In [None]:
# Dtypes and non-null counts of the remaining columns.
df_previous.info()

### 6.3 Missing Value Imputation

In [None]:
# Imputation candidate for PRODUCT_COMBINATION: print its mode, then plot the frequency of
# every category (x labels rotated 90 degrees).
print(df_previous['PRODUCT_COMBINATION'].mode())

plt.figure(figsize=(15,8))
sns.countplot(x='PRODUCT_COMBINATION',data=df_previous)
plt.xticks(rotation=90)
plt.show()

- ###### As Cash is the mode of the column, null values can be imputed with Cash

In [None]:
# Imputation candidate for AMT_CREDIT: box plot to inspect the outliers, then the mean and
# median of the column.
plt.figure(figsize=(15,8))
sns.boxplot(y=df_previous["AMT_CREDIT"])
plt.title('Approved Loan Amount')
plt.show()

df_previous['AMT_CREDIT'].aggregate(['mean', 'median'])

- ###### As AMT_CREDIT has a lot of outliers, missing values can be imputed with the median (80541)

### 6.4 Outliers in Previous Application

###### 6.4.1 AMT_APPLICATION

In [None]:
# AMT_APPLICATION: box plot, distribution, and box plot again without the extreme values.
plt.figure(figsize=(15,8))
plt.subplot(2,2,1)
sns.boxplot(y=df_previous["AMT_APPLICATION"])
plt.title('Applied Loan Amount')

plt.subplot(2,2,2)
sns.distplot(df_previous["AMT_APPLICATION"])


# After removing outliers. The series is passed by keyword (a bare positional argument is
# interpreted differently across seaborn releases) and on the y axis like the first panel.
# The title now matches the plotted column; it used to read 'Net Price'.
plt.subplot(2,2,3)
sns.boxplot(y=df_previous.loc[df_previous["AMT_APPLICATION"]<2200000, "AMT_APPLICATION"])
plt.title('Applied Loan Amount (outliers removed)')
plt.show()

###### Outlier observation: 
`As per boxplot and distplot AMT_APPLICATION>2200000 are outliers`

###### 6.4.2 AMT_CREDIT

In [None]:
# AMT_CREDIT (previous applications): box plot, distribution, and box plot again without the
# extreme values.
plt.figure(figsize=(15,8))
plt.subplot(2,2,1)
sns.boxplot(y=df_previous["AMT_CREDIT"])
plt.title('Approved Loan Amount')

# Impute null values with the median before drawing the distribution.
# The median is taken from df_previous itself; it used to come from df_application, i.e. from
# a different table. The result is assigned back to the column because fillna(inplace=True)
# on a selected column is chained assignment, which newer pandas releases warn about and
# which leaves the frame unchanged under Copy-on-Write.
plt.subplot(2,2,2)
previous_credit_median = df_previous['AMT_CREDIT'].median()
df_previous['AMT_CREDIT'] = df_previous['AMT_CREDIT'].fillna(previous_credit_median)
sns.distplot(df_previous["AMT_CREDIT"])

# After removing outliers (keyword argument, same orientation as the first panel)
plt.subplot(2,2,3)
sns.boxplot(y=df_previous.loc[df_previous["AMT_CREDIT"]<2000000, "AMT_CREDIT"])
plt.title('Approved Price')
plt.show()

###### Outlier observation: 
`As per boxplot and distplot AMT_CREDIT>2000000 are outliers`

## 7. Merging both Application and Previous Application

In [None]:
# Restrict the previous-application data to the columns used in the merged analysis
required_previous_columns = [
    'SELLERPLACE_AREA',
    'NFLAG_LAST_APPL_IN_DAY',
    'SK_ID_CURR',
    'NAME_CONTRACT_STATUS',
    'CODE_REJECT_REASON',
    'NAME_CLIENT_TYPE',
    'CHANNEL_TYPE',
    'NAME_YIELD_GROUP',
    'PRODUCT_COMBINATION',
]
df_previous = df_previous[required_previous_columns]

# Inner join on SK_ID_CURR: keeps only rows whose key appears in both frames
df_final = df_application.merge(df_previous, how='inner', on='SK_ID_CURR')

In [None]:
# Preview the first rows of the merged frame
df_final.head()

In [None]:
# (rows, columns) of the merged frame
print(df_final.shape)

## 8. Univariate Analysis for merged dataframe

### 8.1 TARGET in terms of previous status of loan

In [None]:
# Percentage of each previous-application status within every TARGET class
group_col, status_col = 'TARGET', 'NAME_CONTRACT_STATUS'

status_pct = (
    df_final.groupby(group_col)[status_col]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

grid = sns.catplot(x=group_col, y='percent', hue=status_col, kind='bar', data=status_pct, aspect=20/10)
grid.ax.set_ylim(0, 100)

plt.xticks(rotation=90)
plt.title('Application status in terms of Loan paid or not')

# Label every bar with its height (percentage rounded to two decimals)
for bar in grid.ax.patches:
    bar_height = bar.get_height()
    grid.ax.text(bar.get_x(), bar_height, f"{bar_height.round(2)}%")
    

##### Observation
- ###### 63.41% of previous applications were approved for applicants who have no issue in payment.
- ###### 54.96% of previous applications were approved for applicants who have an issue in payment.

### 8.2 Client type in terms of previous status of loan

In [None]:
# Percentage of each previous-application status within every client type
x,y =  'NAME_CLIENT_TYPE','NAME_CONTRACT_STATUS'

df1 = df_final.groupby(x)[y].value_counts(normalize=True)
df1 = df1.mul(100)
df1 = df1.rename('percent').reset_index()

g = sns.catplot(x=x,y='percent',hue=y,kind='bar',data=df1,aspect=20/10)
g.ax.set_ylim(0,100)

plt.xticks(rotation=90)
# Title fixed: the chart is grouped by client type, not by TARGET
# (the previous title was copied from the TARGET chart).
plt.title('Application status in terms of client type')

# Label every bar with its height (percentage rounded to two decimals)
for p in g.ax.patches:
    txt = str(p.get_height().round(2)) + '%'
    txt_x = p.get_x()
    txt_y = p.get_height()
    g.ax.text(txt_x,txt_y,txt)
    

##### Observation
- ###### Percentage of application approved for New applicants is more and for XNA is less.
- ###### Percentage of application refused for New applicants is less and for Repeater is more.
- ###### Percentage of application canceled for New applicants is very less and for refreshed is more.

### 8.3 Income Type in terms of application status

In [None]:

# Percentage of each previous-application status within every income type
group_col, status_col = 'NAME_INCOME_TYPE', 'NAME_CONTRACT_STATUS'

status_pct = (
    df_final.groupby(group_col)[status_col]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

grid = sns.catplot(x=group_col, y='percent', hue=status_col, kind='bar', data=status_pct, aspect=15/8.27)
grid.ax.set_ylim(0, 100)

plt.xticks(rotation=90)

plt.show()
# Show the underlying percentages as a table below the chart
status_pct

##### Observation
- ###### Percentage of application approved for Students is more and unemployed is less.
- ###### Percentage of application canceled for prisoner is more and is less for Maternity leave and students.
- ###### Percentage of application refused for New applicants is more and is less for student.

### 8.4 CODE_GENDER in terms of application status

In [None]:
# Percentage of each previous-application status within every gender code
group_col, status_col = 'CODE_GENDER', 'NAME_CONTRACT_STATUS'

status_pct = (
    df_final.groupby(group_col)[status_col]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

grid = sns.catplot(x=group_col, y='percent', hue=status_col, kind='bar', data=status_pct, aspect=15/8.27)
grid.ax.set_ylim(0, 100)

plt.xticks(rotation=90)

plt.show()
# Show the underlying percentages as a table below the chart
status_pct

##### Observation
- ###### Percentage of application approved for male and female is around same and less for XNA.
- ###### Percentage of application canceled for Male is more and is less for XNA.
- ###### Percentage of application refused for XNA is more.
- ###### There is no unused offer for XNA

# Final Conclusion from analysis

### Based on all analysis on provided data we conclude there are 3 types of applicants available


#### 1) Good Applicant (Normal interest Rate)

-    Previously had no issues in payment

-    Lives in or from Region 1

-    Male applicants should be Core staff, Accountants or Managers, and more than 50 years old

-    Female applicants more than 50 years old should be `Laborers, Core staff, Accountants, Managers, Drivers, Sales Staff, Private Service staff, Medical Staff, High skill labor, Realty agents, Secretaries, IT staff or HR staff and Not Married`

-    `Male in age from 30-50 working as HR`

-    Applicants who have not defined their gender


#### 2) Risk (High interest Rate)

-    Previously had no issues in payment

-    Can live in any region

-    Married male in age group of 30-50 `Not working as Low Skill Labor, Realty Agent, Security Staff, Laborers, Accountants, Drivers`

-    Married female applicants in age between 30-50 and `Not working in IT, Low skill labor, Waiter/barmen , Security Staff, cleaning or sales`


#### 3) Reject

-    Previously had payment issue

-    Applicant is `Married` and `Maternity leave and in age group of 30-50`.

-    Applicant is `Married` in age group of `20-50 and Unemployed`.

-    Applicant is `unemployed` and in `age group of more than 50`.




