# Python NoteBook for EDA Case Study Credit Analysis

### Importing Required libraries

In [None]:
# Importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### As the data is large, set_option is used to avoid any shrink in data display

In [None]:
pd.set_option('display.max_columns',150)  ## By Default the pandas allowed max 30 cols, to present all the existing cols, pd.set_option is used
pd.set_option('display.max_info_columns', 150) ## to display all the info 
pd.set_option('display.max_rows',150) # to display all the rows
#?pd.set_option
pd.set_option('display.max_categories',150)

### Loading data set application_data.csv into data frame

In [None]:
# loading given data-set application_data into inp1 dataframe.

inp1 = pd.read_csv('../input/credit-eda-case-study/application_data.csv')


# EDA Analysis:

### 1) Inspecting dataset to understand what it contains!!

In [None]:
#Inspecting Data Framee
inp1.head() # to display the top 5 rows and entire columns in data frame inp1

In [None]:
inp1.shape ## to see no .of rows and cols in data set

In [None]:
inp1.info() ## to see the info of data set

In [None]:
inp1.describe()  ## to see the mean, median statistical data in the dataframe

## 2) Cleaning Data Set :

Cleaning the data set is a very important step in EDA: the data comes from multiple sources, which leads to type errors, row mismatches, alignment issues and inconsistent data types.
We need to fix all of these to improve the efficacy of our analysis, by changing columns to suitable data types, handling null values, etc.

In [None]:
## CLeaning data set
# finding null values in the data set
null_values = inp1.isnull().sum()  ## null values across data set assigning to variable null_values
null_values 
null_values = 100*(null_values)/len(inp1)  ## Converting null values to find null percentage
null_values

### Identifying columns which have more than 50 percent null values and removing those columns for ease of analysis

In [None]:
excess_cols = null_values[null_values>50]
#excess_cols
excess_cols = null_values[null_values>50].index  ## to retrive column names which have 50 percentage of null values 
excess_cols

In [None]:
# Keep every column whose share of missing values is at most 50%, so that only
# the columns strictly above 50% (the ones flagged for removal) are dropped.
# With a strict `< 50` a column at exactly 50% was dropped without being flagged.
req_cols = null_values[null_values<=50].index
req_cols

inp1 = inp1[req_cols] ## removing the columns with a high percentage of null values

In [None]:
## Inspecting data frame
inp1.head()
inp1.shape

In [None]:
100*(inp1.isnull().sum())/len(inp1)  ## Checking again for null values

### Extracting null values which are around 13 and handling these null values

In [None]:
null_values_1 = (100*(inp1.isnull().sum()/len(inp1))) ## Extracting null values 

null_values_1 = null_values_1[null_values_1>0].sort_values(ascending=False) ## Retrieving null values which are greater than 0 and sorted

null_values_1[null_values_1<15]

# Steps to approach for null values the above mentioned Columns and also OCCUPATION_TYPE Categorical column

## Considering below variables including categorical and continuous columns

###### AMT_REQ_CREDIT_BUREAU_QRT     Continous Column
###### AMT_REQ_CREDIT_BUREAU_HOUR    Continous Column
###### AMT_REQ_CREDIT_BUREAU_DAY     Continous Column
###### CNT_FAM_MEMBERS               Continous Column
###### NAME_TYPE_SUITE               Categorical Column
###### OCCUPATION_TYPE               Categorical Column



In [None]:
## Continuous (numeric) columns inspected for outliers before choosing an imputation value
con_list = ['AMT_REQ_CREDIT_BUREAU_QRT','AMT_REQ_CREDIT_BUREAU_YEAR','CNT_FAM_MEMBERS']

plt.figure(figsize=(20,10))

# One box plot per column, side by side on the first row of a 3x3 grid.
for position, column in enumerate(con_list, start=1):
    plt.subplot(3, 3, position)
    # The series is passed as `x=`: recent seaborn versions accept only `data`
    # positionally, so a positional series is no longer read as the x variable.
    sns.boxplot(x=inp1[column])
plt.show()
    



As we can see, outliers are present in the continuous columns 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'AMT_REQ_CREDIT_BUREAU_DAY' and 'CNT_FAM_MEMBERS'. Outliers distort the MEAN, so it is recommended to use the MEDIAN to impute the missing values of these columns.

# Categorical Columns

In [None]:
inp1.NAME_TYPE_SUITE.value_counts() ## Missing values recommedation for categorical variable


In [None]:
100*(inp1.OCCUPATION_TYPE.isnull().sum()/len(inp1.OCCUPATION_TYPE))  ## Checking for null values percentage for Occupation Type column

In [None]:
inp1.OCCUPATION_TYPE.value_counts()

#### ANS: Based on the output above, we can impute the MODE for NAME_TYPE_SUITE and OCCUPATION_TYPE, as they are categorical columns. Also, since NAME_TYPE_SUITE has a very small percentage of null values, those rows could instead be removed without much impact on the analysis.

## Checking DataTypes of all Columns in Data frame

In [None]:
inp1.info()

In [None]:
## From the above info of dataframe we could some datatypes miss match and hence we are converting to correct datatypes

res = inp1.nunique().sort_values() 
res## nunique is used for identifying unique values in column or dataframe
res = res[res <3].index   ## getting index for unique valued columns
for i in res:
    inp1[i]= inp1[i].astype('object')



In [None]:
inp1.info()

In [None]:
inp1.describe()

In [None]:
## By describe the dataframe we could see DAYS_BIRTH 	DAYS_EMPLOYED 	DAYS_REGISTRATION 	DAYS_ID_PUBLISH DAYS_LAST_PHONE_CHANGE are in negative.

## Hence days can't be negative it is required to change the values by using abs

# Replace each DAYS_* column by its absolute value so the day counts are non-negative.
for days_col in ('DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH',
                 'DAYS_LAST_PHONE_CHANGE', 'DAYS_REGISTRATION'):
    inp1[days_col] = inp1[days_col].abs()

In [None]:
inp1.describe()

In [None]:
numerical_dtypes = ['int64', 'float32', 'float64']

inp1.select_dtypes(include=numerical_dtypes).columns.tolist()

## Identifying Outliers for some numeric columns

#### AMT_CREDIT
#### CNT_CHILDREN
#### AMT_ANNUITY
#### AMT_REQ_CREDIT_BUREAU_YEAR
#### CNT_FAM_MEMBERS

In [None]:
# Box plot of the credit amount to look for outliers.
plt.figure(figsize=(10,5))
plt.title("Amount Credit")
# Passed as `x=`: recent seaborn versions accept only `data` positionally.
sns.boxplot(x=inp1.AMT_CREDIT)
plt.show()

#### For the AMT_CREDIT column there are outliers above the upper whisker of the box plot (above roughly 1500000); it is recommended either to remove these values for the analysis or to bin them into categories.

In [None]:
# Box plot of the number of children per client to look for outliers.
plt.figure(figsize=(10,5))
plt.title('Count of Children for Client')
# Passed as `x=`: recent seaborn versions accept only `data` positionally.
sns.boxplot(x=inp1.CNT_CHILDREN)
plt.show()

#### For the CNT_CHILDREN column there are outliers, with a maximum of 19. To handle them, median imputation is the best option, or these entries could be removed, as they won't affect the EDA.

In [None]:
# Box plot of the annuity amount to look for outliers.
plt.figure(figsize=(10,5))
plt.title("Amount paying per annum of Previous Application")
# Passed as `x=`: recent seaborn versions accept only `data` positionally.
sns.boxplot(x=inp1.AMT_ANNUITY)
plt.show()

In [None]:
inp1.AMT_ANNUITY.describe()

####  Approximately above max of whisker box plot these are vast no of outliers. For which these values median computaion it is better to remove the values

In [None]:
# Box plot of AMT_REQ_CREDIT_BUREAU_YEAR to look for outliers.
plt.figure(figsize=(10,5))
# Passed as `x=`: recent seaborn versions accept only `data` positionally.
sns.boxplot(x=inp1.AMT_REQ_CREDIT_BUREAU_YEAR)
plt.show()

In [None]:
# Box plot of the number of family members per client to look for outliers.
plt.figure(figsize=(10,5))
plt.title("Count of family Members for Client")
# Passed as `x=`: recent seaborn versions accept only `data` positionally.
sns.boxplot(x=inp1.CNT_FAM_MEMBERS)
plt.show()

# f) Binning for following continous variables 
    1) DAYS_BIRTH
    2) AMT_CREDIT
    3) AMT_INCOME_TOTAL

In [None]:
inp1['Client_Age'] = inp1.DAYS_BIRTH / 365
inp1['Client_Age'] = inp1.Client_Age.astype(int)
inp1.Client_Age.describe()

In [None]:
inp1['Age_Grp'] = pd.cut(inp1['Client_Age'], bins =[0,30,40,50,60,70] , labels = ['<30 years',  '30-40 years', '40-50 years', '50-60 years', '60+ years'])
inp1[['Client_Age','Age_Grp']].head()

In [None]:
# Box plot of the total income to look for outliers before binning it.
plt.figure(figsize=(5,2))
# Passed as `x=`: recent seaborn versions accept only `data` positionally.
sns.boxplot(x=inp1.AMT_INCOME_TOTAL)
plt.show()

In [None]:
inp1.AMT_CREDIT.describe()
Quantile_Range =list(inp1['AMT_CREDIT'].quantile([0.2,0.4,0.6,0.8,0.99,1]).astype(int))
Quantile_Range
def Credit_Amount_Cat(x, quantile_range=None):
    """Map a credit amount to its quantile-based category label.

    Parameters
    ----------
    x : number
        Credit amount to classify.
    quantile_range : sequence of numbers, optional
        Ascending bin edges. The first five are the inclusive upper bounds of
        'Low', 'Below_Avg', 'Average', 'Above_Avg' and 'High'; anything above
        the fifth edge is 'Very-High'. Defaults to the module-level
        ``Quantile_Range``.

    Returns
    -------
    str or None
        The category label, or None when ``x`` is NaN (every comparison with
        NaN is False, so no bin matches).
    """
    edges = Quantile_Range if quantile_range is None else quantile_range
    labels = ('Low', 'Below_Avg', 'Average', 'Above_Avg', 'High')
    # zip stops after the five labels, so the last edge is not used as a bound.
    for upper, label in zip(edges, labels):
        if x <= upper:
            return label
    # The top bin is open-ended. The module-level edges are cast to int, so the
    # largest amounts may lie slightly above the last edge; bounding this bin
    # by that edge made such values fall through every branch and return None.
    if x > edges[4]:
        return 'Very-High'
    return None


In [None]:
inp1['Credit_Amount_Cat'] = inp1.AMT_CREDIT.apply(lambda x:Credit_Amount_Cat(x))

In [None]:
inp1[['Credit_Amount_Cat','AMT_CREDIT']].head()

In [None]:
inp1.AMT_INCOME_TOTAL.describe()
Quantile_Range_Income = list(inp1['AMT_INCOME_TOTAL'].quantile([0.2,0.4,0.6,0.8,0.99,1]).astype(int)) # Quantile is function in which to compute the qth quantile of the given data
Quantile_Range_Income

In [None]:
def Client_Income(x, quantile_range=None):
    """Map a total income to its quantile-based income class.

    Parameters
    ----------
    x : number
        Income amount to classify.
    quantile_range : sequence of numbers, optional
        Ascending bin edges. The first five are the inclusive upper bounds of
        the first five classes; anything above the fifth edge is
        'Millionaire'. Defaults to the module-level ``Quantile_Range_Income``.

    Returns
    -------
    str or None
        The income class, or None when ``x`` is NaN (every comparison with
        NaN is False, so no bin matches).
    """
    edges = Quantile_Range_Income if quantile_range is None else quantile_range
    labels = (
        'Working Class',       ## Which is relatively poor and works on daily wages
        'Lower_Middle_Class',  ## which have some savings but not more than 2 months
        'Middle_class',
        'Upper_Middle_Class',
        'Rich',
    )
    # zip stops after the five labels, so the last edge is not used as a bound.
    for upper, label in zip(edges, labels):
        if x <= upper:
            return label
    # The top bin is open-ended. The module-level edges are cast to int, so the
    # largest incomes may lie slightly above the last edge; bounding this bin
    # by that edge made such values fall through every branch and return None.
    if x > edges[4]:
        return 'Millionaire'
    return None


In [None]:
inp1['Income_Category'] = inp1.AMT_INCOME_TOTAL.apply(lambda x:Client_Income(x))

In [None]:
inp1[['Income_Category','AMT_INCOME_TOTAL']].head(10)

# 4) Analysis :

In [None]:
inp1.head() ## After data cleaning and fixing types examing the dataframe is good practise. (Note: It is always good practise to see the dataframe)

# 4) a) Checking for Variables for data imbalance percentage

In [None]:
100*inp1.TARGET.value_counts(normalize = True) ## Choosing target varaible for data imbalance percentage

In [None]:
target_balance = 100*inp1.TARGET.value_counts(normalize = True)
sizes = [target_balance[0],target_balance[1]]
labels = ['Target_0','Target_1']
colors = ['Green', 'Red']
plt.pie(sizes, labels=labels,autopct='%1.2f%%',colors=colors)
plt.show()

### The TARGET column is clearly imbalanced across the dataset, so we can conclude that the data has an imbalanced distribution of entries between the two target values

##### Optional, This is just for optional column to check data imbalance percentage. Just incase column having null values how can we handle it

In [None]:
100*inp1.CODE_GENDER.value_counts(normalize=True)
inp1.loc[inp1['CODE_GENDER']=='XNA','CODE_GENDER']=np.nan  ## Replacing XNA values with null values

In [None]:
inp1['CODE_GENDER'].value_counts()
100*inp1.CODE_GENDER.value_counts(normalize=True)

In [None]:
# Percentage share of each gender code (value_counts sorts by frequency, most frequent first).
gender_balance = 100*inp1.CODE_GENDER.value_counts(normalize=True)
# Read the two shares by position with .iloc: the index holds string labels, so
# plain integer keys depend on a positional fallback that pandas has deprecated.
# NOTE(review): the captions assume the most frequent code is the female one - confirm.
sizes = [gender_balance.iloc[0], gender_balance.iloc[1]]
labels = ['Female','Male']
colors = ['Blue', 'Red']
plt.pie(sizes, labels=labels,autopct='%1.2f%%',colors=colors)
plt.show()

### We could see Code Gender column shows imbalance percentage over the dataset

### Before we start analysis taking 25-40 columns as we have privilage for this dataset.

In [None]:
inp1.head()

In [None]:
inp1.shape

### Trying to remove which are considering as not rquired for analysis. It's a subjective step.

In [None]:
inp1.columns

# Every column whose name starts with FLAG_DOCUMENT is treated as not required
not_req_cols = [name for name in inp1.columns if name.startswith('FLAG_DOCUMENT')]
not_req_cols

## Dropping these columns from the data frame in place
inp1.drop(columns=not_req_cols, inplace=True)

In [None]:
inp1.shape
inp1.head()

In [None]:
#inp1[inp1.columns[:25]] 

In [None]:
inp1.columns

#### Even after removing the columns marked as not required, the data frame is still huge. For ease of analysis we select a subset of columns of our choice (listed below). This step is also subjective

In [None]:
## Considering 35 variables for case study analysis   
cols_ana_list = ['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE',
       'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
       'NAME_HOUSING_TYPE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_MOBIL',
       'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE',
        'OCCUPATION_TYPE', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
       'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
       'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
       'Credit_Amount_Cat', 'Income_Category','Age_Grp']
len(cols_ana_list)

In [None]:
# appl_data = inp1[cols_ana_list]
inp1.head()
inp1.shape
appl_data = inp1[cols_ana_list]  ### Creating variable appl_data to load required cols into new data frame. Coming analysis will be doing on appl_data which is again subset of inp1 df

In [None]:
appl_data.head()

# Dividing the data set into two sets based on the TARGET variable: to conduct univariate and bivariate analysis it is recommended to split the data frame on the target variable
###  TARGET is a variable in the data set which has two values: 1 - client with payment difficulties, 0 - all other cases

In [None]:
target_0=appl_data.loc[appl_data['TARGET']==0]   ## loc is a python fucntion which helps to loacte values 
target_1=appl_data.loc[appl_data['TARGET']==1]

target_0.select_dtypes(include=object).columns.tolist()

# 4) C) Univariate Analysis

In [None]:
## Writing function for univariate analysis for graph mode explanation and also writing for loop for analysis at least 10 variables

def uni_cat_graph(target0,target1):
    """Draw side-by-side count plots of one categorical column for both target groups.

    Parameters
    ----------
    target0 : pandas.Series
        The column's values for the rows with TARGET == 0.
    target1 : pandas.Series
        The same column's values for the rows with TARGET == 1.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(12,5))
    panels = ((1, target0, "Target_0"), (2, target1, "Target_1"))
    for position, values, title in panels:
        plt.subplot(2,2,position)
        # The series is passed as `x=`: recent seaborn versions accept only
        # `data` positionally. Bars are ordered from most to least frequent.
        sns.countplot(x=values, order=values.value_counts().index)
        plt.xticks(rotation=90)
        plt.title(title)
    plt.show()
    
## Considering below Category columns for conducting Univariate Analysis    
cat_cols = ['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
'Credit_Amount_Cat','Age_Grp']

for i in cat_cols:
    uni_cat_graph(target_0[i],target_1[i])

### By Conducting Univariate analysis on categorical columns. The inferences are below:
We can see that there are not many significant inferences to draw, because of the data imbalance. As no balancing technique is required, we draw inferences from the existing data.

1) If we consider about age group in target 0 the chances of getting non-defaulter for age_grp 30-40 is reletively high where as for the target 1 the chances of getting defaulted is also high

2) For the Credit amount category the low zone is having higher re-payments in target 0 and in target 1 the difficulties is for average credit amount taken holders.

3) Also for Code Gender category around 125000 we could see the female categgory is getting high in payment difficulties



### Correlation
 #### In EDA, Correlation is essential for conducting analysis to understand how and what variables are correlated with each other

In [None]:
num_cols = target_0.select_dtypes(include=numerical_dtypes).columns.tolist()
num_cols.remove('SK_ID_CURR')    ## Removing SK_ID_CURR column as it is an unique ID and won't be considered for correlation process
num_cols
## Pairwise correlation of the numeric columns; the matrix is symmetric, so no transpose is needed
corr_0 = target_0[num_cols].corr()
## Keep only the cells strictly above the diagonal so each pair appears once and the
## self-correlations are dropped. The builtin `bool` is used because `np.bool` no longer exists in NumPy.
corr_0 = corr_0.where(np.triu(np.ones(corr_0.shape), k=1).astype(bool))
corr_0_df = corr_0.unstack().reset_index()
corr_0_df.columns = ['Num_Col1', 'Num_Col2', 'Correlation']
corr_0_df.dropna(subset = ['Correlation'], inplace  = True)
## Rank by absolute value: a strongly negative correlation is as informative as a strongly positive one
corr_0_df['abs_corr'] = corr_0_df['Correlation'].abs()
corr_0_df.sort_values("abs_corr", ascending = False, inplace = True)
corr_0_df.head(10)

### Insights on Correlation of Numerical columns of 'Target 0'
    #1. Highest correlation exist between AMT_CREDIT and AMT_GOODS_PRICE there is something related to this column
    #2. There is negative correlation existing between CNT_CHILDREN and DAYS_BIRTH
    #3. Least correlation exist between the DAYS_EMPLOYED and DAYS_ID_PUBLISH

In [None]:
## Pairwise correlation of the numeric columns for the Target 1 rows (symmetric matrix, no transpose needed)
corr_1 = target_1[num_cols].corr()
## Keep only the cells strictly above the diagonal; the mask is shaped from corr_1 itself,
## and the builtin `bool` is used because `np.bool` no longer exists in NumPy.
corr_1 = corr_1.where(np.triu(np.ones(corr_1.shape), k=1).astype(bool))
corr_1_df = corr_1.unstack().reset_index()
corr_1_df.columns = ['Num_Col1', 'Num_Col2', 'Correlation']
corr_1_df.dropna(subset = ['Correlation'], inplace  = True)
## Rank by the absolute value of THIS frame's correlations (taking them from
## corr_0_df ranked the Target 1 pairs by the Target 0 correlations)
corr_1_df['abs_corr'] = corr_1_df['Correlation'].abs()
corr_1_df.sort_values("abs_corr", ascending = False, inplace = True)
corr_1_df.head(10)

### Insights on Correlation of Numerical columns of 'Target 1'
        1. Highest correlation exist between AMT_CREDIT and AMT_GOODS_PRICE there is something related to this column
        2. There is negative correlation existing between CNT_CHILDREN and DAYS_BIRTH com[aritively less with 'Target 0'
        3. Least correlation exist between the DAYS_EMPLOYED and DAYS_ID_PUBLISH
        
#### From the correlations above we can see that almost the same pairs of variables are correlated in both the target 0 and target 1 data frames. We can also see that AMT_GOODS_PRICE is highly correlated with AMT_CREDIT, and somewhat less correlated with AMT_ANNUITY

In [None]:
target_0.describe()

In [None]:

num_cols

In [None]:
## Density of every numeric column, Target 1 and Target 0 overlaid in the same panel
plt.figure(figsize = (15, 30))
for position, column in enumerate(num_cols, start=1):
    plt.subplot(8, 2, position)
    # kdeplot draws the same curve as the deprecated distplot(..., hist=False)
    sns.kdeplot(x = target_1[column].dropna(), label = "Target 1")
    sns.kdeplot(x = target_0[column].dropna(), label = "Target 0")
    # Without a legend the two labels are never displayed and the curves cannot be told apart
    plt.legend()
plt.show()
    

### Inferences for Univariate analysis on Numerical columns
     #1. The client reaching credit Bureau for enquiries are more non-defaulters than likely to default.
     #2. The client with no children are high in number of defaulters than not Defaulters.
     #3. If the Client's income is high then the client may not default the bank.
     #4. The applicant For consumer loans is likely to be non-defaulter than being defaulter.
     #5. If the clients days employed is above 350000 than it has more number of Non-Defaulters than likely to Defualt

# Bivariate Analysis:
    1) Categoriacal - Categorical 
    2) Categorical _ Continuous 
    3) Continuous - Continous

In [None]:
target_0.columns

In [None]:
target_0.info()

In [None]:
cat_cols_set = target_0.select_dtypes(include=['object','category']).columns.tolist()
con_cols_set = target_0.select_dtypes(include=numerical_dtypes).columns.tolist()
cat_cols_set,con_cols_set

In [None]:
numerical_dtypes

## Considering Flag_OWN_CAR, CODE_GENDER, NAME_HOUSING_TYPE

In [None]:
bi_cat_cols = ['FLAG_OWN_CAR','CODE_GENDER','NAME_HOUSING_TYPE','OCCUPATION_TYPE']
bi_con_cols = ['CNT_CHILDREN','AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY']
bi_cat_con_cols = ['FLAG_OWN_CAR','CNT_CHILDREN','CODE_GENDER','AMT_INCOME_TOTAL']
from itertools import combinations
def com_funct(cols_list):
    """Return every unordered pair of entries from ``cols_list``.

    Parameters
    ----------
    cols_list : sequence
        Items (column names here) to pair up.

    Returns
    -------
    list of tuple
        All 2-element combinations, in the order ``itertools.combinations``
        yields them; an empty list when fewer than two items are given.
    """
    # Ask for the pairs directly instead of generating the combinations of
    # every size (exponential in the number of items) and filtering for pairs.
    return list(combinations(cols_list, 2))

temp1 = com_funct(bi_cat_cols)
temp2 = com_funct(bi_con_cols)
temp3 = com_funct(bi_cat_con_cols)



In [None]:
com_list = [] 
for i in range(0,len(bi_cat_cols)): 
    com_list.extend(combinations(bi_cat_cols, i + 1)) 
com_list_1=[]
for i in com_list:
    if(len(i) == 2):
        com_list_1.append(i)
com_list_1


In [None]:
for i in temp1:
    plt.figure(figsize = (20,6))
    plt.subplot(1,2,1)
    plt.title("Target_0")
    plt.xticks(rotation=90)
    
    sns.countplot(x = i[0] , hue = i[1] , data = target_0)
    plt.subplot(1,2,2)
    plt.title("Target_1")
    plt.xticks(rotation=90)
    
    sns.countplot(x = i[0], hue = i[1], data = target_1)
    plt.show()

### Bivariate Analysis on Categorical columns of 'Target 0' and 'Target 1'
    #1. The female gender who does't own a car are more non-defaulters and its the same the same for likelt to default.
    #2. The male gender who owns a House / apartment are more likely to default than non-defaulters
    #3. The occupation type core staff who owns of House/apartment are more in number of non-Defaulters whereas the occupation type Laborers who owns of House/apartment are high in number to likely to default
    #4. The Male Gender laborers doesn't have significant difference between Non-Defaluting and likely to default and the female sales staff members are more in number of Non-Defaulters than likely to default.


In [None]:
## Scatter plot of every pair of continuous columns, Target 0 on the left and Target 1 on the right
for x_col, y_col in temp2:
    plt.figure(figsize = (10,5))
    plt.subplot(1,2,1)
    plt.title("Target_0")
    # x and y are passed by keyword: recent seaborn versions accept only `data` positionally
    sns.scatterplot(x=x_col, y=y_col, data=target_0)
    plt.subplot(1,2,2)
    plt.title("Target_1")
    sns.scatterplot(x=x_col, y=y_col, data=target_1)

    plt.show()
    

### Insights Bivariate Analysis of Continous and categorical variables and Continous to continous
    #1. Applicants who have high income and with no childeren are more likely to default
    #2. Providing a loan amount of Range 500000-2500000 to the total income of Less 500000 are more likely to default than non-default
    #3. There are more people who haven't paid back their loans on time with a total income of less than 500000 and are more likely ot default.
    #4. The variables AMT_ANNUITY abd AMT_CREDIT for both non-defaulters and Defaulters has a strong correlation and also has similar pattern between them


In [None]:
plt.figure(figsize = (10,5))
plt.subplot(1,2,1)
plt.title("Target_0")
sns.boxplot(x="FLAG_OWN_CAR",y='CNT_CHILDREN',data=target_0)
plt.subplot(1,2,2)
plt.title("Target_1")
sns.boxplot(x="FLAG_OWN_CAR",y='CNT_CHILDREN',data=target_1)
plt.show()

In [None]:
plt.figure(figsize=(20,5))
plt.subplot(1,2,1)
plt.title("Target_0")
sns.boxplot(x="CODE_GENDER",y='AMT_INCOME_TOTAL',data=target_0)
plt.subplot(1,2,2)
plt.title("Target_1")
sns.boxplot(x="CODE_GENDER",y='AMT_INCOME_TOTAL',data=target_1)
plt.show()

In [None]:
plt.figure(figsize=(20,5))
plt.subplot(1,2,1)
plt.title("Target_0")
sns.boxplot(x="CNT_CHILDREN",y='OCCUPATION_TYPE',data=target_0)
plt.subplot(1,2,2)
plt.title("Target_1")
sns.boxplot(x="CNT_CHILDREN",y='OCCUPATION_TYPE',data=target_1)
plt.show()

In [None]:
 
plt.figure(figsize=(20,5))
plt.subplot(1,2,1)
plt.title("Target_0")
plt.xticks(rotation=90)
sns.boxplot(x="NAME_EDUCATION_TYPE",y='AMT_CREDIT',data=target_0)
plt.subplot(1,2,2)
plt.xticks(rotation=90)
plt.title("Target_1")
sns.boxplot(x="NAME_EDUCATION_TYPE",y='AMT_CREDIT',data=target_1)
plt.show()

In [None]:

plt.figure(figsize=(20,5))
plt.subplot(1,2,1)
plt.title("Target_0")
plt.xticks(rotation=90)
sns.boxplot(x="NAME_EDUCATION_TYPE",y='AMT_ANNUITY',data=target_0)
plt.subplot(1,2,2)
plt.xticks(rotation=90)
plt.title("Target_1")
sns.boxplot(x="NAME_EDUCATION_TYPE",y='AMT_ANNUITY',data=target_1)
plt.show()


In [None]:
inp2 = pd.read_csv('../input/credit-eda-case-study/previous_application.csv') #importing Previous application data

In [None]:
inp2.describe()

In [None]:
comb_data=pd.merge(appl_data,inp2,how='inner',on='SK_ID_CURR') #Merging Previous data with application data by using inner join on SK_ID_CURR - reason for using Inner join is new applicants approval is largely based on the clients previous data so it will be better to have common data

In [None]:
comb_data.head()

In [None]:
100*(comb_data.isnull().sum()/len(comb_data))

In [None]:
null_percentage = 100*(comb_data.isnull().sum()/len(comb_data))
req_cols_1 = null_percentage[null_percentage<50]
req_cols_1 = req_cols_1.index
req_cols_1

In [None]:
comb_data = comb_data[req_cols_1]
comb_data.info()

### Univariate analysis on combined data on categorical varibales 

In [None]:
cat_cols_comb = ['NAME_CONTRACT_STATUS','NAME_CLIENT_TYPE','Income_Category','Credit_Amount_Cat','OCCUPATION_TYPE','CODE_GENDER','NAME_EDUCATION_TYPE']

In [None]:
list(enumerate(cat_cols_comb))

In [None]:
plt.figure(figsize = (20, 30))

for i in enumerate(cat_cols_comb):
    plt.subplot(6, 2, i[0]+1)
    plt.xticks(rotation=40)
    sns.countplot(x = i[1], hue = 'TARGET', data = comb_data)


In [None]:
num_cols_comb = ['AMT_INCOME_TOTAL','AMT_REQ_CREDIT_BUREAU_YEAR','AMT_REQ_CREDIT_BUREAU_QRT','AMT_ANNUITY_x','AMT_ANNUITY_y','AMT_CREDIT_x','AMT_CREDIT_y', 'CNT_CHILDREN']

In [None]:
list(enumerate(num_cols_comb))

In [None]:
## Density of each numeric column of the merged data, split by TARGET
plt.figure(figsize = (15, 30))
# Split once by TARGET: plotting the whole frame twice drew two identical curves
# that were merely labelled "Target 1" and "Target 0".
comb_target_1 = comb_data[comb_data['TARGET'] == 1]
comb_target_0 = comb_data[comb_data['TARGET'] == 0]
for position, column in enumerate(num_cols_comb, start=1):
    plt.subplot(8, 2, position)
    # kdeplot draws the same curve as the deprecated distplot(..., hist=False)
    sns.kdeplot(x = comb_target_1[column].dropna(), label = "Target 1")
    sns.kdeplot(x = comb_target_0[column].dropna(), label = "Target 0")
    # Show the labels so the two curves can be told apart
    plt.legend()
plt.show()
    

## Bivariate Analysis on Numerical columns

In [None]:
# x and y are given by keyword: recent seaborn versions accept only `data` positionally
sns.jointplot(x='AMT_ANNUITY_y', y='AMT_APPLICATION', data=comb_data)
plt.show()

In [None]:
# x and y are given by keyword: recent seaborn versions accept only `data` positionally
sns.jointplot(x='AMT_ANNUITY_y', y='AMT_CREDIT_y', data=comb_data)
plt.show()

In [None]:
# x and y are given by keyword: recent seaborn versions accept only `data` positionally
sns.jointplot(x='AMT_ANNUITY_y', y='AMT_INCOME_TOTAL', data=comb_data)
plt.show()

In [None]:
# x and y are given by keyword: recent seaborn versions accept only `data` positionally
sns.jointplot(x='AMT_ANNUITY_x', y='AMT_INCOME_TOTAL', data=comb_data)
plt.show()

In [None]:
# x and y are given by keyword: recent seaborn versions accept only `data` positionally
sns.jointplot(x='AMT_ANNUITY_x', y='AMT_APPLICATION', data=comb_data)
plt.show()

## Bivariate Analysis on Categorical columns

In [None]:
plt.figure(figsize = (10,5))
plt.subplot(1,2,1)
sns.countplot(x = 'CODE_GENDER', hue = 'NAME_CONTRACT_STATUS', data = comb_data)
plt.title("CODE_GENDER_NAME_CONTRACT_STATUS")
plt.show()


In [None]:
cat_cols_comb = ['NAME_CONTRACT_STATUS','NAME_CLIENT_TYPE','Income_Category','Credit_Amount_Cat','OCCUPATION_TYPE','CODE_GENDER','NAME_EDUCATION_TYPE']

In [None]:
# Count of previous applications per client type, split by contract status
plt.figure(figsize = (10,5))
plt.subplot(1,2,1)
sns.countplot(x = 'NAME_CLIENT_TYPE', hue = 'NAME_CONTRACT_STATUS', data = comb_data)
# The title names the two variables actually plotted (x column, then hue column)
plt.title("NAME_CLIENT_TYPE_NAME_CONTRACT_STATUS")
plt.show()

In [None]:
plt.figure(figsize = (10,5))
plt.subplot(1,2,1)
sns.countplot(x = 'Income_Category', hue = 'NAME_CONTRACT_STATUS', data = comb_data)
plt.xticks(rotation=90)
plt.title("Income_Category_NAME_CONTRACT_STATUS")
plt.show()

In [None]:
plt.figure(figsize = (10,5))
plt.subplot(1,2,1)
sns.countplot(x = 'Credit_Amount_Cat', hue = 'NAME_CONTRACT_STATUS', data = comb_data)
plt.xticks(rotation=90)
plt.title("Credit_Amount_Name_Contract_status")
plt.show()


In [None]:
# Count of previous applications per occupation type, split by contract status
plt.figure(figsize = (20,5))
plt.subplot(1,2,1)
sns.countplot(x = 'OCCUPATION_TYPE', hue = 'NAME_CONTRACT_STATUS', data = comb_data)
plt.xticks(rotation=90)
# The title names the two variables actually plotted (x column, then hue column)
plt.title("OCCUPATION_TYPE_NAME_CONTRACT_STATUS")
plt.show()


### Insights on Univariate Analysis and Bivariate analysis of Combined data frame of Categorical Varibales
    # 1. There are around 70000-80000 whose loans are approved who are likely to default and also over 200000 applicant's loan is refused who are less likely to defualt this would incurr loss to the bank.
    # 2. There is few applicant's loan with secondary / secondary special who face difficulties to pay loan on time than who are likely to pay on time
    # 3. Female Gender are more likely to not face payment difficulties then the male and hence it is recommended to approve more loans of Female Gender than the male gender at the same Female are High in number than who face difficulties than males
    # 4. Laborers are high in number of occupation type list who are likely to default or payment difficulties
    # 5. The Repeater applicant has High chance of non-Defaulting and also has high chance of defaulting when compared to new applicants
    # 6. No millionaire is likely to default so should not refused a application of millionaire's application for loan and Lower Middle class people are high in number to repay the loans