# Importing Libraries

##### Importing the 'Pandas' library. The libraries will be imported as and when required.  

In [None]:
# Importing the necessary Pandas library

import pandas as pd


# Raising the display limit so that up to 150 columns are shown when a dataframe is printed

pd.set_option('display.max_columns', 150)

# Data Sourcing - I

##### Trying to read the 'application_data.csv' file and getting various information from the data.

In [None]:
# Loading 'application_data.csv' into the app_data dataframe

app_data = pd.read_csv("../input/credit-eda-case-study-data/application_data.csv")

##### Trying to run various functions on the dataframe to know the basic info 

In [None]:
# Previewing the first five rows of the application data

app_data.head(n=5)

In [None]:
# Showing the dimensions of the dataframe as (number of rows, number of columns)

app_data.shape

In [None]:
# Listing the column names of the dataframe

app_data.columns

In [None]:
# Printing the summary of the dataframe (column dtypes, non-null counts, memory usage)

app_data.info()

# Data Cleaning - I

##### Checking the quality of Data and missing values 

In [None]:
# Percentage of missing values per column, rounded to two decimals

app_data.isnull().sum().mul(100).div(app_data.shape[0]).round(2)

##### Checking the missing values of the dataframe

In [None]:
# Keeping only the columns whose share of missing values is below 50%.
# .copy() makes new_app_data an independent dataframe rather than a slice of
# app_data, so the columns assigned to it further down are written to a real
# dataframe instead of triggering pandas' SettingWithCopyWarning.

new_app_data = app_data.loc[:, app_data.isna().mean() < .5].copy()
new_app_data.shape

In [None]:
# Getting the dataframe of columns that have some missing values, but at most 13%.
# The per-column missing share is computed once and reused for both bounds.

missing_share = new_app_data.isna().mean()
lesser_missing_app_data = new_app_data.loc[:, (missing_share > 0) & (missing_share <= 0.13)]
lesser_missing_app_data.shape

In [None]:
# Listing the columns that were kept in lesser_missing_app_data

lesser_missing_app_data.columns

In [None]:
# Flagging the rows where 'DAYS_LAST_PHONE_CHANGE' is missing

days_last_phone_change_data = lesser_missing_app_data["DAYS_LAST_PHONE_CHANGE"].isna()

# Showing only the flagged rows; reset_index() keeps the original row labels as a column

lesser_missing_app_data.loc[days_last_phone_change_data].reset_index()

In [None]:
# Flagging the rows where 'AMT_ANNUITY' is missing

amt_annunity_data = lesser_missing_app_data["AMT_ANNUITY"].isna()

# Showing only the flagged rows; reset_index() keeps the original row labels as a column

lesser_missing_app_data.loc[amt_annunity_data].reset_index()

In [None]:
# Flagging the rows where 'AMT_GOODS_PRICE' is missing

amt_goods_price_data = lesser_missing_app_data["AMT_GOODS_PRICE"].isna()

# Showing only the flagged rows; reset_index() keeps the original row labels as a column

lesser_missing_app_data.loc[amt_goods_price_data].reset_index()

In [None]:
# Flagging the rows where 'NAME_TYPE_SUITE' is missing

name_type_suite_data = lesser_missing_app_data["NAME_TYPE_SUITE"].isna()

# Showing only the flagged rows; reset_index() keeps the original row labels as a column

lesser_missing_app_data.loc[name_type_suite_data].reset_index()

In [None]:
# Flagging the rows where 'CNT_FAM_MEMBERS' is missing

cnt_fam_members_data = lesser_missing_app_data["CNT_FAM_MEMBERS"].isna()

# Showing only the flagged rows; reset_index() keeps the original row labels as a column

lesser_missing_app_data.loc[cnt_fam_members_data].reset_index()

##### Checking the data type of all columns

In [None]:
# Printing the dtype and non-null count of every column that was kept
new_app_data.info()

##### Importing the 'Warnings' library.

In [None]:
# Warnings are made silent
# NOTE(review): "ignore" without a category suppresses every warning from this
# point on, including deprecation warnings raised by pandas and seaborn.

import warnings

warnings.filterwarnings("ignore")

##### Creating a new column for each column holding negative values

In [None]:
# Adding 'DAYS_BIRTH_ABS': the values of 'DAYS_BIRTH' with the sign dropped

new_app_data["DAYS_BIRTH_ABS"] = new_app_data["DAYS_BIRTH"].abs()
new_app_data.head(n=5)

In [None]:
# Adding 'DAYS_EMPLOYED_ABS': the values of 'DAYS_EMPLOYED' with the sign dropped

new_app_data["DAYS_EMPLOYED_ABS"] = new_app_data["DAYS_EMPLOYED"].abs()
new_app_data.head(n=5)

In [None]:
# Adding 'DAYS_ID_PUBLISH_ABS': the values of 'DAYS_ID_PUBLISH' with the sign dropped

new_app_data["DAYS_ID_PUBLISH_ABS"] = new_app_data["DAYS_ID_PUBLISH"].abs()
new_app_data.head(n=5)

In [None]:
# Adding 'DAYS_REGISTRATION_ABS': 'DAYS_REGISTRATION' cast to int first, then with the sign dropped

new_app_data["DAYS_REGISTRATION_ABS"] = new_app_data["DAYS_REGISTRATION"].astype(int).abs()
new_app_data.head(n=5)

In [None]:
# Copying 'CNT_FAM_MEMBERS' into a new integer column; missing counts are stored as 0.
# NOTE: despite the '_ABS' suffix, no absolute value is taken here.
# The fill and the cast happen in a single assignment back to the dataframe.
# Calling fillna(..., inplace=True) on an attribute-accessed column acts on a
# temporary object under pandas Copy-on-Write, which would leave the NaNs in
# place and make the following astype(int) fail.

new_app_data["CNT_FAM_MEMBERS_ABS"] = new_app_data["CNT_FAM_MEMBERS"].fillna(0).astype(int)
new_app_data.head()

##### Importing the seaborn and matplotlib libraries

In [None]:
# Seaborn library will be used for graphical study

import seaborn as sns
import matplotlib.pyplot as plt

##### Checking the outliers

In [None]:
# Plotting one horizontal boxplot per derived column to look for outliers.
# Each column is passed as x= explicitly: seaborn >= 0.12 treats the first
# positional argument as `data`, which would change how the Series is drawn.

boxplot_columns = [
    ("Date of Birth - Days", "DAYS_BIRTH_ABS"),
    ("Employment - Days", "DAYS_EMPLOYED_ABS"),
    ("Document Change - Days", "DAYS_ID_PUBLISH_ABS"),
    ("Registration Change - Days", "DAYS_REGISTRATION_ABS"),
    ("Family Members - Count", "CNT_FAM_MEMBERS_ABS"),
]

plt.figure(figsize=(15, 12))

# Panels fill a 3x2 grid in reading order; the sixth slot stays empty
for position, (title, column) in enumerate(boxplot_columns, start=1):
    plt.subplot(3, 2, position)
    plt.title(title)
    sns.boxplot(x=new_app_data[column]).set(xlabel=None)

plt.show()

##### Binning of continuous variables

In [None]:
# Summary statistics of 'DAYS_BIRTH_ABS', used to choose the bin edges for age

new_app_data.DAYS_BIRTH_ABS.describe()

In [None]:
# Summary statistics of 'CNT_FAM_MEMBERS_ABS', used to choose the bin edges for family size

new_app_data.CNT_FAM_MEMBERS_ABS.describe()

In [None]:
# Binning age (in days) and family size into labelled categories.
# Age edges in days: 9125 = 25 years, 14600 = 40 years, 21900 = 60 years (365-day years).
# pd.cut intervals are open on the left and closed on the right by default, so a
# family size of 0 lies outside the first bin (0, 1] and is left as NaN.

new_app_data['DAYS_BIRTH_BINS'] = pd.cut(
    x=new_app_data['DAYS_BIRTH_ABS'],
    bins=[5000, 9125, 14600, 21900, 35000],
    labels=['YOUNG', 'MILLENIALS', 'MIDDLE-AGED', 'OLD'],
)
new_app_data['CNT_FAM_MEMBERS_BINS'] = pd.cut(
    x=new_app_data['CNT_FAM_MEMBERS_ABS'],
    bins=[0, 1, 3, 25],
    labels=['NO_DEPENDENTS', 'NORMAL_DEPENDENTS', 'MANY_DEPENDENTS'],
)
new_app_data.loc[:, ["SK_ID_CURR", "DAYS_BIRTH_BINS", "CNT_FAM_MEMBERS_BINS"]].head()

# Data Analysis - I

##### Bifurcating the data on the basis of TARGET variable

In [None]:
# Share of each TARGET value as a percentage, rounded to two decimals

new_app_data["TARGET"].value_counts(normalize=True).mul(100).round(2)

##### Dividing the dataset

In [None]:
# Selecting the applications with TARGET == 0 (treated here as people who paid their loan)

able_to_payback_loan_data = new_app_data.loc[new_app_data["TARGET"].eq(0)]
able_to_payback_loan_data.reset_index().head(n=5)

In [None]:
# Selecting the applications with TARGET == 1 (treated here as people who could not pay their loan)

not_able_to_payback_loan_data = new_app_data.loc[new_app_data["TARGET"].eq(1)]
not_able_to_payback_loan_data.reset_index().head(n=5)

##### Univariate Analysis

In [None]:
# Comparing the distributions of "CNT_FAM_MEMBERS" & "CNT_CHILDREN" between the two TARGET groups.
# sns.kdeplot draws the density curve directly; it replaces the deprecated
# sns.distplot(..., hist=False) call, which seaborn has scheduled for removal.

plt.figure(figsize=(15,5))

plt.subplot(1, 2, 1)
plt.title("Family Members Count")
sns.kdeplot(x=able_to_payback_loan_data.CNT_FAM_MEMBERS_ABS, label="Could Pay")
sns.kdeplot(x=not_able_to_payback_loan_data.CNT_FAM_MEMBERS_ABS, label="Had Difficulties")
plt.legend()

plt.subplot(1, 2, 2)
plt.title("Children Count")
sns.kdeplot(x=able_to_payback_loan_data.CNT_CHILDREN, label="Could Pay")
sns.kdeplot(x=not_able_to_payback_loan_data.CNT_CHILDREN, label="Had Difficulties")
plt.legend()

plt.show()

In [None]:
# Analyzing categorical variables i.e. "NAME_INCOME_TYPE" & "CODE_GENDER" for each TARGET group.
# Each column is passed as x= explicitly: seaborn >= 0.12 accepts only `data`
# positionally, so a positional Series is no longer read as the x variable.

plt.figure(figsize=(15,10))

plt.subplot(2, 2, 1)
plt.title("Gender - Could Pay")
sns.countplot(x=able_to_payback_loan_data.CODE_GENDER).set(xlabel=None)

plt.subplot(2, 2, 2)
plt.title("Gender - Had Difficulties")
sns.countplot(x=not_able_to_payback_loan_data.CODE_GENDER).set(xlabel=None)

# The income-type labels are long, so the x tick labels are rotated to vertical
plt.subplot(2, 2, 3)
plt.xticks(rotation=90)
plt.title("Income Type - Could Pay")
sns.countplot(x=able_to_payback_loan_data.NAME_INCOME_TYPE).set(xlabel=None)

plt.subplot(2, 2, 4)
plt.xticks(rotation=90)
plt.title("Income Type - Had Difficulties")
sns.countplot(x=not_able_to_payback_loan_data.NAME_INCOME_TYPE).set(xlabel=None)

plt.show()

##### Bivariate Analysis

In [None]:
# Scatter of total income against family size, one panel per TARGET group

plt.figure(figsize=(20, 4))

scatter_panels = (
    ("Able to Pay Loan", able_to_payback_loan_data),
    ("Having Paying Difficulties", not_able_to_payback_loan_data),
)

for panel_no, (panel_title, panel_data) in enumerate(scatter_panels, start=1):
    plt.subplot(1, 2, panel_no)
    # Plain tick labels: income values are shown in full instead of scientific notation
    plt.ticklabel_format(style='plain')
    plt.title(panel_title)
    sns.scatterplot(data=panel_data, x='CNT_FAM_MEMBERS_ABS', y='AMT_INCOME_TOTAL')

plt.show()

##### Importing the numpy library

In [None]:
# Numpy library will be used for correlation analysis

import numpy as np

##### Finding correlation

In [None]:
# Finding correlation of numerical columns for TARGET = 0

able_to_payback_loan_corr = able_to_payback_loan_data[["DAYS_BIRTH_ABS", "DAYS_EMPLOYED_ABS" ,"CNT_FAM_MEMBERS_ABS", "CNT_CHILDREN", "AMT_INCOME_TOTAL"]].corr()

# Showing only the strict upper triangle (k=1 skips the diagonal) so each pair appears once.
# The builtin bool is used for the mask: the np.bool alias is deprecated from
# NumPy 1.20 and raises AttributeError on NumPy 1.24-1.26.
able_to_payback_loan_corr.where(np.triu(np.ones(able_to_payback_loan_corr.shape), k=1).astype(bool))

In [None]:
# Heatmap of the TARGET = 0 correlation matrix
sns.heatmap(data=able_to_payback_loan_corr, cmap="Blues")

plt.show()

In [None]:
# Finding correlation of numerical columns for TARGET = 1

not_able_to_payback_loan_corr = not_able_to_payback_loan_data[["DAYS_BIRTH_ABS", "DAYS_EMPLOYED_ABS" ,"CNT_FAM_MEMBERS_ABS", "CNT_CHILDREN", "AMT_INCOME_TOTAL"]].corr()

# Showing only the strict upper triangle (k=1 skips the diagonal) so each pair appears once.
# The builtin bool is used for the mask: the np.bool alias is deprecated from
# NumPy 1.20 and raises AttributeError on NumPy 1.24-1.26.
not_able_to_payback_loan_corr.where(np.triu(np.ones(not_able_to_payback_loan_corr.shape), k=1).astype(bool))

In [None]:
# Heatmap of the TARGET = 1 correlation matrix
sns.heatmap(data=not_able_to_payback_loan_corr, cmap="Oranges")

plt.show()

# Data Sourcing - II

##### Trying to read the 'previous_application.csv' file and getting various information from the data.

In [None]:
# Loading 'previous_application.csv' into the prev_app_data dataframe

prev_app_data = pd.read_csv("../input/credit-eda-case-study-data/previous_application.csv")

##### Trying to run various functions on the dataframe to know the basic info

In [None]:
# Previewing the first five rows of the previous-application data

prev_app_data.head(n=5)

In [None]:
# Showing the dimensions of the dataframe as (number of rows, number of columns)

prev_app_data.shape

In [None]:
# Listing the column names of the dataframe

prev_app_data.columns

In [None]:
# Printing the summary of the dataframe (column dtypes, non-null counts, memory usage)

prev_app_data.info()

##### Merging the two dataframes

In [None]:
# Summarising previous applications per customer and attaching the summary to the application data.
# NOTE: re-running this cell merges the summary columns into new_app_data a second time.

previous_by_customer = prev_app_data.groupby("SK_ID_CURR")

# Number of previous applications per customer
total_loan_count = previous_by_customer.size().reset_index(name='Total Loan')

# One column per contract type / contract status holding the count for that customer (0 when absent)
contract_type_loan_count = (
    previous_by_customer['NAME_CONTRACT_TYPE'].value_counts().unstack().fillna(0).astype(int)
)
contract_status_loan_count = (
    previous_by_customer['NAME_CONTRACT_STATUS'].value_counts().unstack().fillna(0).astype(int)
)

for breakdown in (contract_type_loan_count, contract_status_loan_count):
    total_loan_count = total_loan_count.merge(breakdown, on="SK_ID_CURR", how="left")

# Left join keeps every current application; customers without previous applications get NaN
new_app_data = new_app_data.merge(total_loan_count, on="SK_ID_CURR", how="left")

new_app_data.head()

##### Univariate Analysis

In [None]:
# Describing Univariate Analysis for continuous variables.
# sns.kdeplot draws the density curve directly; it replaces the deprecated
# sns.distplot(..., hist=False) call, which seaborn has scheduled for removal.

plt.figure(figsize=(15,5))

plt.subplot(1, 2, 1)
plt.ticklabel_format(style='plain')
plt.title("Count of Applications on Amount")
sns.kdeplot(x=prev_app_data.AMT_APPLICATION).set(xlabel=None)

plt.subplot(1, 2, 2)
plt.title("Probability of Start of the Day")
sns.kdeplot(x=prev_app_data.HOUR_APPR_PROCESS_START).set(xlabel=None)

plt.show()

In [None]:
# Describing Univariate Analysis for categorical variables.
# Each column is passed as x= explicitly: seaborn >= 0.12 accepts only `data`
# positionally, so a positional Series is no longer read as the x variable.

plt.figure(figsize = (20,4))

plt.subplot(1,2,1)
# Must stay before the countplot: it is applied while the axes still has its default numeric tick formatter
plt.ticklabel_format(style='plain')
plt.title("Count of Application on Weekdays")
sns.countplot(x=prev_app_data.WEEKDAY_APPR_PROCESS_START).set(xlabel=None)

plt.subplot(1,2,2)
plt.title("Count of Application for Contract Types")
sns.countplot(x=prev_app_data.NAME_CONTRACT_TYPE).set(xlabel=None)

plt.show()

##### Bivariate Analysis

In [None]:
# Boxplots of a continuous variable split by a categorical one (previous applications)

plt.figure(figsize=(20, 5))

# Left panel: 'CNT_PAYMENT' per weekday on which the application was started
plt.subplot(121)
plt.title("Previous Credit vs Weekdays")
plt.ticklabel_format(style='plain')
sns.boxplot(
    data=prev_app_data,
    x='WEEKDAY_APPR_PROCESS_START',
    y='CNT_PAYMENT',
).set(xlabel=None)

# Right panel: application start hour per contract status
plt.subplot(122)
plt.title("Start Hour of the Day vs Contract Staus")
sns.boxplot(
    data=prev_app_data,
    x='NAME_CONTRACT_STATUS',
    y='HOUR_APPR_PROCESS_START',
).set(xlabel=None)

plt.show()

In [None]:
# Describing Bivariate Analysis for two categorical variables.
# The weekday column is passed as x=: seaborn >= 0.12 binds the first positional
# argument to `data`, so a positional column name together with data= raises a
# TypeError there.

plt.figure(figsize = (15,6))
plt.subplot(1,2,1)
plt.title("Weekdays & Contract Status vs Count")
sns.countplot(x = 'WEEKDAY_APPR_PROCESS_START', hue = 'NAME_CONTRACT_STATUS', data = prev_app_data).set(xlabel=None)


plt.subplot(1,2,2)
plt.title("Weekdays & Contract Type vs Count")
sns.countplot(x = 'WEEKDAY_APPR_PROCESS_START', hue = 'NAME_CONTRACT_TYPE', data = prev_app_data).set(xlabel=None)

plt.show()

In [None]:
# Scatter plots of the application amount against two other continuous variables

plt.figure(figsize=(15, 4))

# Left panel: amount against the hour at which the application was started
plt.subplot(121)
plt.title("Amount vs Start Hour of the day")
plt.ticklabel_format(style='plain')
sns.scatterplot(
    data=prev_app_data,
    x='HOUR_APPR_PROCESS_START',
    y='AMT_APPLICATION',
).set(xlabel=None)

# Right panel: amount against 'DAYS_DECISION' with the sign dropped
plt.subplot(122)
plt.title("Amount vs Decisioning Days")
plt.ticklabel_format(style='plain')
sns.scatterplot(
    data=prev_app_data,
    x=prev_app_data['DAYS_DECISION'].abs(),
    y='AMT_APPLICATION',
).set(xlabel=None)

plt.show()

# Conclusion

•	More of the applicants who face difficulties in paying their loans are female than male (by count in the gender plots above).
<br>
•	Businessmen show no difficulties in paying loans in this data, so they can be offered a greater number of loans than the Working class, which shows the most difficulties in paying loans.
<br>
•	Loans of type ‘Revolving Loans’ can be given more focus, as they are comparatively less popular.
<br>
•	The number of loans approved on ‘Saturday’ is significantly higher than on other weekdays.