# Credit EDA Case Study 

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 100)

In [None]:
import warnings
warnings.filterwarnings('ignore')

### Reading data set for the current application

In [None]:
df_curr_app=pd.read_csv('../input/credit-card/application_data.csv')

In [None]:
df_curr_app.shape

In [None]:
df_curr_app.info()

In [None]:
df_curr_app.describe()

### Data cleaning for the current application

In [None]:
df_curr_app.isnull().sum().head(50)

In [None]:
# Null counts per column, restricted to columns where more than 35% of the rows are missing
drop_cols = df_curr_app.isnull().sum().loc[
    lambda null_counts: null_counts.values > (0.35 * len(df_curr_app))
]
len(drop_cols)

In [None]:
# Drop every column whose share of missing values exceeds 35%.
# The share is recomputed here as a fraction (isnull().mean()) so the 0.35 cut-off is
# compared against fractions; comparing raw null *counts* against 0.35 would match any
# column with at least one null and only worked by relying on the previous cell's filter.
null_fraction = df_curr_app.isnull().mean()
drop_cols = list(null_fraction[null_fraction > 0.35].index)
df_curr_app.drop(labels=drop_cols, axis=1, inplace=True)

In [None]:
df_curr_app.shape # checking whether the columns are dropped

In [None]:
df_curr_app.isnull().sum().tail(50)

In [None]:
df_curr_app.AMT_ANNUITY.describe()

In [None]:
df_curr_app.AMT_ANNUITY.median() # checking the median to impute the missing values

In [None]:
# Fill the missing AMT_ANNUITY entries with the column median
annuity_median = df_curr_app['AMT_ANNUITY'].median()
df_curr_app['AMT_ANNUITY'] = df_curr_app['AMT_ANNUITY'].fillna(annuity_median)

In [None]:
df_curr_app['AMT_ANNUITY'].describe() #Missing values are imputed

In [None]:
df_curr_app.FLAG_DOCUMENT_2.unique() #Dropping columns starting with 'FLAG_DOC*' as we do not need it for analysis

In [None]:
# Collect the names of all columns whose label begins with 'FLAG_DOC'
flag_cols = df_curr_app.columns[df_curr_app.columns.str.startswith('FLAG_DOC')].tolist()
flag_cols

In [None]:
# dropping all columns starting with 'FLAG_DOC*'
df_curr_app.drop(labels=flag_cols,axis=1,inplace=True)

In [None]:
df_curr_app.isnull().sum()

In [None]:
df_curr_app.drop('NAME_TYPE_SUITE',axis=1,inplace=True)

In [None]:
df_curr_app.AMT_GOODS_PRICE.median()

In [None]:
# Fill the missing AMT_GOODS_PRICE entries with the column median
goods_price_median = df_curr_app['AMT_GOODS_PRICE'].median()
df_curr_app['AMT_GOODS_PRICE'] = df_curr_app['AMT_GOODS_PRICE'].fillna(goods_price_median)

In [None]:
df_curr_app.AMT_GOODS_PRICE.isnull().sum()

In [None]:
df_curr_app.OCCUPATION_TYPE.value_counts()

In [None]:
df_curr_app.OCCUPATION_TYPE.mode()[0]

In [None]:
# OCCUPATION_TYPE is categorical, so its missing values are filled with the most frequent category
most_common_occupation = df_curr_app['OCCUPATION_TYPE'].mode()[0]
df_curr_app['OCCUPATION_TYPE'] = df_curr_app['OCCUPATION_TYPE'].fillna(most_common_occupation)

In [None]:
# Checking whether the nulls are replaced for OCCUPATION_TYPE
df_curr_app.OCCUPATION_TYPE.isnull().sum() 

In [None]:
df_curr_app.isnull().sum()

In [None]:
# Remove the social-circle columns because their data description is not clear
social_circle_cols = [
    'OBS_30_CNT_SOCIAL_CIRCLE',
    'DEF_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE',
    'DEF_60_CNT_SOCIAL_CIRCLE',
]
df_curr_app.drop(columns=social_circle_cols, inplace=True)

In [None]:
df_curr_app.isnull().sum()

In [None]:
df_curr_app.EXT_SOURCE_2.describe()

In [None]:
df_curr_app.AMT_REQ_CREDIT_BUREAU_DAY.describe()

In [None]:
df_curr_app.AMT_REQ_CREDIT_BUREAU_HOUR.describe()

In [None]:
df_curr_app.AMT_REQ_CREDIT_BUREAU_HOUR.median()

In [None]:
# Impute missing credit-bureau enquiry counts with each column's median.
# median() must be *called*: passing the bound method itself to fillna() would write
# the method object into the missing cells instead of a number.
bureau_cols = [
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
    'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_YEAR',
    'AMT_REQ_CREDIT_BUREAU_QRT',
]
for bureau_col in bureau_cols:
    df_curr_app[bureau_col] = df_curr_app[bureau_col].fillna(df_curr_app[bureau_col].median())



In [None]:
df_curr_app.isnull().sum()

In [None]:
# Checking counts of CODE_GENDER
df_curr_app.CODE_GENDER.value_counts() 
# We have 4 values for XNA which means not available. We can replace these values with mode for code_gender column.

In [None]:
# 'XNA' entries in CODE_GENDER are replaced with the mode of the column, i.e. 'F'
df_curr_app['CODE_GENDER'] = df_curr_app['CODE_GENDER'].replace({'XNA': 'F'})

In [None]:
df_curr_app.CODE_GENDER.value_counts()

In [None]:
#Checking Columns starting with 'DAYS_' for any incorrect datatype or outliers or errors
print(df_curr_app.DAYS_EMPLOYED.unique())
print(df_curr_app.DAYS_BIRTH.unique())
print(df_curr_app.DAYS_REGISTRATION.unique())

In [None]:
# These columns contain negative values. Collect the names of all columns starting with 'DAYS'.
days_cols = df_curr_app.columns[df_curr_app.columns.str.startswith('DAYS')].tolist()
days_cols

In [None]:
# Replace the values of every 'DAYS' column with their absolute value
df_curr_app[days_cols] = df_curr_app[days_cols].abs()

In [None]:
# Checking whether the values are converted to absolute for few columns starting with 'DAYS' 
print(df_curr_app.DAYS_EMPLOYED.unique())
print(df_curr_app.DAYS_BIRTH.unique())
print(df_curr_app.DAYS_REGISTRATION.unique())

In [None]:
# Days_birth column has the age of the client in days and the datatype is float
df_curr_app.DAYS_BIRTH.describe() 

In [None]:
# Express the age in whole years (days / 365, truncated to int)
age_in_years = df_curr_app['DAYS_BIRTH'] / 365
df_curr_app['DAYS_BIRTH'] = age_in_years.astype('int')

In [None]:
df_curr_app.DAYS_BIRTH.describe() 

#### We can see that the distribution of age among applicants is as follows :
#### The youngest applicant is 20 years old and the oldest is 69 years old.

#### Bucketing / Binning of certain variables as per analysis

In [None]:
# Bucket the applicant age (in years) into three age groups
age_edges = [19, 40, 60, 100]
age_labels = ['Young_Age', 'Middle_Age', 'Senior_Citizen']
df_curr_app['DAYS_BIRTH_BINS'] = pd.cut(df_curr_app['DAYS_BIRTH'], bins=age_edges, labels=age_labels)

In [None]:
df_curr_app.DAYS_BIRTH_BINS.value_counts()

In [None]:
# checking statistics for AMT_INCOME_TOTAL column
df_curr_app.AMT_INCOME_TOTAL.describe()

In [None]:
# Quantile-based income bands: five equal-frequency buckets (quintiles)
income_quantiles = [0, 0.2, 0.4, 0.6, 0.8, 1]
income_band_labels = ['VERY LOW', 'LOW', 'MEDIUM', 'HIGH', 'VERY HIGH']
df_curr_app['AMT_INCOME_QRANGE'] = pd.qcut(
    df_curr_app['AMT_INCOME_TOTAL'],
    q=income_quantiles,
    labels=income_band_labels,
)

In [None]:
# Value-based income bands for AMT_INCOME_TOTAL.
# The top band is left open-ended (np.inf) so that incomes above 1,000,000 are labelled
# 'VERY HIGH' instead of silently becoming NaN.
df_curr_app['AMT_INCOME_RANGE'] = pd.cut(
    df_curr_app.AMT_INCOME_TOTAL,
    bins=[0, 50000, 150000, 300000, 500000, np.inf],
    labels=['VERY LOW', 'LOW', 'MEDIUM', 'HIGH', 'VERY HIGH'],
)
df_curr_app.AMT_INCOME_RANGE.value_counts()

In [None]:
#Analysing the TARGET variable
df_curr_app.TARGET.value_counts() # 1 - Client with payment difficulties, 0 -- Client with no payment difficulties

In [None]:
# Split the applications into two frames according to the TARGET flag
no_difficulty_mask = df_curr_app['TARGET'] == 0
difficulty_mask = df_curr_app['TARGET'] == 1
df_target0 = df_curr_app[no_difficulty_mask]
df_target1 = df_curr_app[difficulty_mask]

In [None]:
#Checking the count of values in TARGET variable after segmenting
print(df_target0.shape)
print(df_target1.shape)

In [None]:
#Finding the imbalance ratio for the TARGET variable
round(len(df_target0)/len(df_target1),2)

#### The imbalance ratio is 11.39

### Univariate analysis for different variables

In [None]:
# Side-by-side count plots of the income quantile band for each TARGET segment,
# with bars ordered by descending frequency within the segment
fig, ax = plt.subplots(1, 2, figsize=[15, 4])
panels = [
    (df_target0, 'Fig1:Income QRange for Applicants with NO payment difficulties'),
    (df_target1, 'Fig2:Income QRange for Applicants with payment difficulties'),
]
for axis, (segment, panel_title) in zip(ax, panels):
    bar_order = segment['AMT_INCOME_QRANGE'].value_counts().index
    sns.countplot(data=segment, x='AMT_INCOME_QRANGE', order=bar_order, ax=axis)
    axis.set_title(panel_title)
plt.show()

#### Fig1:Income QRange for Applicants with NO payment difficulties :
1. Applicants with medium income range have the lowest count in terms of NO payment difficulties.
2. Applicants with low income range have the highest count in terms of NO payment difficulties.

#### Fig2:Income QRange for Applicants with payment difficulties:
1. Applicants with Very High Income range have the lowest count in terms of payment difficulties.
2. Applicants with low income range have the highest count in terms of payment difficulties.


In [None]:
# Side-by-side pie charts of the age-group share for each TARGET segment.
# plt.subplots returns (figure, axes); unpack both and draw on the returned axes
# rather than binding the tuple to `fig` and re-creating each subplot with plt.subplot().
fig, axes = plt.subplots(1, 2, figsize=[15, 4])
df_target0.DAYS_BIRTH_BINS.value_counts().plot.pie(autopct='%1.0f%%', ax=axes[0])
axes[0].set_title('Age groups of Applicants with NO payment difficulties ')
df_target1.DAYS_BIRTH_BINS.value_counts().plot.pie(autopct='%1.0f%%', ax=axes[1])
axes[1].set_title('Age groups of Applicants with payment difficulties ')
plt.show()

1. The Middle_Age group has the maximum percentage of clients with NO payment difficulties followed by Young_Age and Senior_Citizen age groups.
2. The Young_Age group has the maximum percentage of clients with payment difficulties followed by Middle_Age and Senior_Citizen age groups.
3. By comparing the two graphs, we can infer that Middle_Age group has lower risk on payments as compared to Young_Age group.
4. Hence, there is a possibility that the Young_Age group will be having more payment difficulties.

In [None]:
plt.figure(figsize=[12,6])
sns.countplot(data=df_target0,x = 'NAME_EDUCATION_TYPE',order = df_target0['NAME_EDUCATION_TYPE'].value_counts().index)
plt.title('EDUCATION TYPE of Applicants with NO payment difficulties')
plt.show()

1. We can infer that the Applicants with Secondary Education type have the highest applications with no payment difficulties.

In [None]:
plt.figure(figsize=[12,6])
sns.countplot(data=df_target1,x = 'NAME_EDUCATION_TYPE',order = df_target1['NAME_EDUCATION_TYPE'].value_counts().index)
plt.title('EDUCATION TYPE of Applicants with payment difficulties')
plt.show()

1. We can infer that the Applicants with Secondary Education type have the highest applications with payment difficulties.
2. Both the above Education plots have a similar profile with respect to payment difficulties.
3. All education types have a lower count in payment difficulties as compared to no payment difficulties. 

In [None]:
plt.figure(figsize=[10,4])
df_target0.NAME_HOUSING_TYPE.value_counts().plot.barh()
plt.title('HOUSING TYPE of Applicants with NO payment difficulties')
plt.show()

1. We can infer that maximum applicants own a HOUSE/apartment with NO payment difficulties.

In [None]:
plt.figure(figsize=[10,4])
df_target1.NAME_HOUSING_TYPE.value_counts().plot.barh()
plt.title('HOUSING TYPE of Applicants with payment difficulties')
plt.show()

1. We can infer that maximum applicants own a HOUSE/apartment with payment difficulties.
2. Both the HOUSING TYPE plots have a similar profile with respect to payment difficulties but differ in count.

In [None]:
plt.figure(figsize=[10,4])
df_target0.NAME_FAMILY_STATUS.value_counts().plot.barh(color=['cyan', 'red', 'green', 'blue', 'black'])
plt.title('FAMILY STATUS of Applicants with NO payment difficulties')
plt.show()

Married applicants have the highest count with NO payment difficulties.

In [None]:
plt.figure(figsize=[10,4])
df_target1.NAME_FAMILY_STATUS.value_counts().plot.barh(color=['cyan', 'red', 'green', 'blue', 'black'])
plt.title('FAMILY STATUS of Applicants with payment difficulties')
plt.show()

1. Married applicants have the highest count with payment difficulties.
2. Both the FAMILY STATUS plots have a similar profile with respect to payment difficulties but differ in count.

In [None]:
plt.figure(figsize=[10,4])
df_target0.NAME_INCOME_TYPE.value_counts().plot.barh(color=['pink','yellow','orange','cyan', 'red', 'green', 'blue', 'black'])
plt.title('INCOME TYPE of Applicants with NO payment difficulties')
plt.show()

1. There are 4 incomes types with no payment difficulties ( State servant, Pensioner, Commercial associate and Working) whereas the others are insignificant.
2. Among all, Working professionals have the highest count of applications with no payment difficulties.

In [None]:
# Income-type distribution for applicants WITH payment difficulties (df_target1).
# The title previously said "NO payment difficulties", which contradicted the plotted data.
plt.figure(figsize=[10,4])
df_target1.NAME_INCOME_TYPE.value_counts().plot.barh(color=['pink','yellow','orange','cyan', 'red', 'green', 'blue', 'black'])
plt.title('INCOME TYPE of Applicants with payment difficulties')
plt.show()

1. Among all, Working professionals have the highest count of applications with payment difficulties whereas State servants have the lowest count of applications.

### Bivariate analysis for different variables

In [None]:
plt.figure(figsize=[10,6])
sns.scatterplot(data=df_target0, x='AMT_INCOME_TOTAL', y = 'AMT_CREDIT')
plt.show()

From the above scatter plot for  Target 0 we can infer that :
1. Maximum credit amount applications are for total income range between 0 to 25 lacs and Applicants with higher total income levels have lesser loan applications.
2. High Income totals do not amount to high credit amounts. 
3. Hence we can infer that total income range between 0 to 25 lacs have the most loan applications.


In [None]:
plt.figure(figsize=[10,6])
sns.scatterplot(data=df_target1, x='AMT_INCOME_TOTAL', y = 'AMT_CREDIT')
plt.show()

From the above scatter plot for  Target 1 we can infer that :
1. Maximum loan applications are between the range of approx. 0-25 lakhs for income total. 
2. Amount credit for such loan applications ranges between 0-40 lakhs.
3. We can see 1 outlier for income total at 11.7 crores.

In [None]:
# Pairwise scatter/histogram grid of the main numeric variables for TARGET == 0.
# sns.pairplot creates its own figure, so no plt.figure() call precedes it
# (it would only leave an empty extra figure in the output).
sns.pairplot(df_target0[['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE','DAYS_BIRTH']])
plt.show()

#Inferences for target 0 in the pair plot for numerical variables :
1. We can see a high correlation between AMT_ANNUITY and AMT_GOODS_PRICE.
2. We can see a good correlation between AMT_ANNUITY and AMT_CREDIT.
3. We can see few outliers for DAYS_BIRTH and AMT_INCOME_TOTAL.
4. Also,we can see a high correlation between AMT_CREDIT and AMT_GOODS_PRICE.

In [None]:
# Pairwise scatter/histogram grid of the main numeric variables for TARGET == 1.
# sns.pairplot creates its own figure, so no plt.figure() call precedes it
# (it would only leave an empty extra figure in the output).
sns.pairplot(df_target1[['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE','DAYS_BIRTH']])
plt.show()

#Inferences for target 1 in the pair plot for numerical variables :
1. We can see a good correlation between AMT_ANNUITY and AMT_GOODS_PRICE.
2. We can see a good correlation between AMT_ANNUITY and AMT_CREDIT.
3. We can see 1 outlier for DAYS_BIRTH and AMT_INCOME_TOTAL.
4. Also,we can see a high correlation between AMT_CREDIT and AMT_GOODS_PRICE.

In [None]:
sns.barplot(data=df_curr_app,x='TARGET',y='AMT_INCOME_TOTAL', estimator = np.median)
plt.title('AMT_INCOME_TOTAL v/s TARGET')
plt.show()

From the above plot, we can infer that the applicants with NO payment difficulties have high income compared to applicants with payment difficulties. 

In [None]:
plt.figure(figsize=[15,12])
sns.countplot(data=df_target0, y = 'ORGANIZATION_TYPE', order=df_target0.ORGANIZATION_TYPE.value_counts().index)
plt.title('Organization Type of Applicants with NO payment difficulties')
plt.show()

Inferences :
1. Business Type Entity 3 has the highest amount of applicants with no payment difficulties.
2. Self-employed, Medicine and other organization types are few other groups having no payment difficulties.
3. 'XNA' undefined organization type has the second highest amount of applicants with no payment difficulties.

In [None]:
plt.figure(figsize=[15,12])
sns.countplot(data=df_target1, y = 'ORGANIZATION_TYPE', order=df_target1.ORGANIZATION_TYPE.value_counts().index)
plt.title('Organization Type of Applicants with payment difficulties')
plt.show()

Inferences :
1. Business Type Entity 3 has the highest amount of applicants with payment difficulties.
2. Business Type Entity 2, Construction and other organization types are few other groups having payment difficulties.
3. Self-employed type has the second highest amount of applicants with payment difficulties.
4. The undefined 'XNA' organization type has the third-highest number of applicants with payment difficulties, whereas it ranks second among applicants with no payment difficulties.

In [None]:
plt.figure(figsize=[15,12])
sns.countplot(data=df_target0, y = 'OCCUPATION_TYPE', order=df_target0.OCCUPATION_TYPE.value_counts().index)
plt.title('Occupation Type of Applicants with NO payment difficulties')
plt.show()

#Inferences:
1. Laborers have the highest amount of applications with no payment difficulties.
2. Sales staff has the second highest amount of applications with no payment difficulties.
3. HR and IT staff have the lowest amount of applications with no payment difficulties.

In [None]:
plt.figure(figsize=[15,12])
sns.countplot(data=df_target1, y = 'OCCUPATION_TYPE', order=df_target1.OCCUPATION_TYPE.value_counts().index)
plt.title('Occupation Type of Applicants with payment difficulties')
plt.show()

#Inferences:
1. Laborers have the highest amount of applications with payment difficulties.
2. Drivers have the third highest amount of applications with payment difficulties.

In [None]:
sns.countplot(data=df_target0, x = 'AMT_INCOME_QRANGE', order=df_target0.AMT_INCOME_QRANGE.value_counts().index,hue = 'CODE_GENDER')
plt.title('Income Range for Applicants with NO payment difficulties')
plt.show()

#### From the above graph for NO PAYMENT DIFFICULTIES we can visualize the split in income range basis gender and following are the observations:
1. In terms of counts, the LOW income range has the maximum number of applications while the Medium income group has the fewest applications.
2. Within Low, High and Very Low Income range it is evident that Female applicants are significantly more than male applicants.
3. For Very High Income range both Male and Female have a balanced ratio with Female applicants being slightly higher in count.

In [None]:
sns.countplot(data=df_target1, x = 'AMT_INCOME_QRANGE', order=df_target1.AMT_INCOME_QRANGE.value_counts().index,hue = 'CODE_GENDER')
plt.title('Income Range for Applicants with payment difficulties')
plt.show()

#### From the above graph for  PAYMENT DIFFICULTIES we can visualize the split in income range basis gender and following are the observations:
1. In terms of counts, the LOW income range has the maximum number of applications while the Very High income group has the fewest applications.
2. Within Low, Very Low Income range it is evident that Female applicants are significantly more than Male applicants.
3. For Very High Income range both Male and Female have a balanced ratio with Male applicants being slightly higher in count.

### Top 10 correlation for Target variable

In [None]:
# Pearson correlation matrix of selected numeric columns for the Target=0 segment
target0_numeric_cols = [
    'AMT_ANNUITY', 'AMT_CREDIT', 'AMT_INCOME_TOTAL', 'AMT_GOODS_PRICE',
    'CNT_CHILDREN', 'CNT_FAM_MEMBERS',
    'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH',
]
target0_corr = df_target0[target0_numeric_cols].corr(method='pearson')
target0_corr

In [None]:
plt.figure(figsize=[12,7])
sns.heatmap(data=target0_corr, cmap='YlGnBu', annot=True)
plt.title('Correlation Heatmap for No Payment difficulties')
plt.show()


#### Inferences for Target 0:
We can see a positive correlation for the following variables in the descending order :
1. AMT_CREDIT AND AMT_GOODS_PRICE - Highest correlated
2. CNT_CHILDREN AND CNT_FAM_MEMBERS
3. AMT_GOODS_PRICE AND AMT_ANNUITY
4. AMT_CREDIT AND AMT_ANNUITY
5. DAYS_EMPLOYED AND DAYS_BIRTH

We can see a negative correlation for the following variables in the descending order :
1. DAYS_BIRTH AND AMT_ANNUITY
2. DAYS_BIRTH AND AMT_INCOME_TOTAL
3. DAYS_EMPLOYED AND AMT_GOODS_PRICE

In [None]:
# Finding the correlation matrix for numeric columns for Target=1
target1_numeric_cols = [
    'AMT_ANNUITY', 'AMT_CREDIT', 'AMT_INCOME_TOTAL', 'AMT_GOODS_PRICE',
    'CNT_CHILDREN', 'CNT_FAM_MEMBERS',
    'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH',
]
target1_corr = df_target1[target1_numeric_cols].corr(method='pearson')
target1_corr

In [None]:
plt.figure(figsize=[12,7])
sns.heatmap(data=target1_corr, cmap='YlGnBu', annot=True)
plt.title('Correlation Heatmap for Payment difficulties')
plt.show()

#### Inferences for Target 1:
We can see a positive correlation for the following variables in the descending order :
1. AMT_CREDIT AND AMT_GOODS_PRICE - Highest correlated
2. CNT_CHILDREN AND CNT_FAM_MEMBERS
3. AMT_GOODS_PRICE AND AMT_ANNUITY

We can see a negative correlation for the following variables in the descending order :
1. DAYS_BIRTH AND AMT_INCOME_TOTAL
2. DAYS_EMPLOYED AND AMT_GOODS_PRICE

## Reading data set for the previous application

In [None]:
#Reading the data set
df_prev_app=pd.read_csv('../input/credit-card/previous_data.csv')

In [None]:
#Checking the data
df_prev_app.head()

In [None]:
#Checking the columns
df_prev_app.columns

In [None]:
# Checking the datatypes and info
df_prev_app.info()

In [None]:
#checking the size of dataframe
df_prev_app.shape

### Data Cleaning for previous application

In [None]:
# checking columns with missing values
df_prev_app.isnull().sum()

In [None]:
# checking the percentage of null values
round(df_prev_app.isnull().sum()/len(df_prev_app),2)*100

In [None]:
# dropping unnecessary columns

# Columns of the previous-application data that are not needed for the analysis
unneeded_prev_cols = [
    'DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION',
    'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
    'NFLAG_INSURED_ON_APPROVAL',
    'CNT_PAYMENT',
    'NAME_TYPE_SUITE',
    'RATE_INTEREST_PRIMARY',
    'RATE_INTEREST_PRIVILEGED',
    'RATE_DOWN_PAYMENT',
    'AMT_GOODS_PRICE',
]
df_prev_app.drop(columns=unneeded_prev_cols, inplace=True)

In [None]:
# Checking whether columns are dropped 
round(df_prev_app.isnull().sum()/len(df_prev_app),2)*100

### Handling Missing values for previous application

In [None]:
len(df_prev_app[(df_prev_app.AMT_APPLICATION==0.0) & (df_prev_app.AMT_ANNUITY.isnull())]) 
# Number of rows where AMT_ANNUITY is null and AMT_APPLICATION is 0.0
# (this counts rows, not columns). The following cell replaces the null
# AMT_ANNUITY values with 0.0.

In [None]:
# Replacing missing values for AMT_ANNUITY with 0.0 
df_prev_app['AMT_ANNUITY']=df_prev_app.AMT_ANNUITY.fillna(0.0)

In [None]:
# Finding the mode of AMT_DOWN_PAYMENT to replace the missing values

df_prev_app.AMT_DOWN_PAYMENT.mode()[0]

In [None]:
# Fill the missing AMT_DOWN_PAYMENT entries with the most frequent value of the column
down_payment_mode = df_prev_app['AMT_DOWN_PAYMENT'].mode()[0]
df_prev_app['AMT_DOWN_PAYMENT'] = df_prev_app['AMT_DOWN_PAYMENT'].fillna(down_payment_mode)

In [None]:
df_prev_app.AMT_DOWN_PAYMENT.isnull().sum()

In [None]:
df_prev_app.AMT_ANNUITY.isnull().sum()

### Finding outliers for previous application dataset

In [None]:
# Univariate analysis for AMT_ANNUITY column: vertical box plot to expose outliers.
# The variable is passed as `y` because seaborn ignores orient='v' when only `x` is given
# and draws the box horizontally.
sns.set_style("whitegrid")
plt.figure(figsize=[8,8])
sns.boxplot(data=df_prev_app, y='AMT_ANNUITY')
plt.show()

#### We observe that there are many outliers for AMT_ANNUITY column in previous application above 50000.

In [None]:
# Univariate analysis for AMT_CREDIT column: vertical box plot to expose outliers.
# The variable is passed as `y` because seaborn ignores orient='v' when only `x` is given
# and draws the box horizontally.
sns.set_style("whitegrid")
plt.figure(figsize=[8,8])
sns.boxplot(data=df_prev_app, y='AMT_CREDIT')
plt.show()

#### We observe that there are many outliers for AMT_CREDIT column in previous application as well above 500000.

### Univariate and Bivariate analysis for previous application dataset

In [None]:
# Checking the counts of different contract status
df_prev_app.NAME_CONTRACT_STATUS.value_counts()

In [None]:
plt.figure(figsize=[8,5])
df_prev_app.NAME_CONTRACT_STATUS.value_counts().plot.pie(autopct='%1.0f%%')
plt.title('Status of previous loan applications')
plt.show()

#Inferences :
1. We can conclude that the majority of applications were approved.
2. The counts of applications that were cancelled and of applications that were refused are approximately the same.
3. Only 2% of loan applications were not accepted by the applicants.

In [None]:
df_prev_app['NAME_CONTRACT_TYPE'].value_counts().plot.bar(color=['purple','cyan','green','red'])
plt.title('Types of loans for previous applications')
plt.show()

#Inferences :
1. Cash loans have the highest count of previous applications, followed by consumer loans.
2. Revolving loans have the lowest count of previous applications.

In [None]:
sns.barplot(data=df_prev_app, x = 'NAME_CONTRACT_TYPE', y='AMT_ANNUITY')
plt.title('Types of loans vs Amount Annuity')
plt.show()

#Inferences:
1. Cash loans have the highest annuity amount.
2. Revolving loans have the lowest annuity amount.

In [None]:
sns.barplot(data=df_prev_app, x = 'NAME_CONTRACT_STATUS', y='AMT_CREDIT', hue='NAME_CONTRACT_TYPE')
plt.title('Status of loans vs Amount credit')
plt.show()

#### Status of loans vs Amount credit :
1. Approved loans are maximum for Cash loans contract type and minimum for consumer loans.
2. Revolving loans have the same threshold for approved and refused status.
3. For cash loans contract type, the refused loans threshold is above 420000 credit amount as compared to Approved loans.
4. Cancelled loans are usually below 50000 credit amount threshold. 
5. For Unused offer, Revolving and Consumer loans are the 2 loan type categories.

In [None]:
sns.barplot(data=df_prev_app, x = 'NAME_CONTRACT_STATUS', y='AMT_APPLICATION', hue='NAME_CONTRACT_TYPE')
plt.title('Status of loans vs Application Amount')
plt.show()

#### Status of loans vs Amount Application :
1. Approved and Refused loans are maximum for Cash loans contract type and minimum for consumer loans.
2. Cancelled loans are usually for consumer loans. 
3. For all contract types, Refused loans were more than approved loans.

In [None]:
# checking counts for payment type
df_prev_app.NAME_PAYMENT_TYPE.value_counts()

In [None]:
df_prev_app.NAME_PAYMENT_TYPE.value_counts().plot.barh(color='yellow')
plt.show()

#Inferences:
1. Cash through the bank is the most common type of payment method for loan applications.

In [None]:
df_prev_app.CODE_REJECT_REASON.value_counts().plot.bar(colormap='Accent')
plt.show()

#Inferences:
1. We can infer from the above graph that XAP is the top most reject reason for the applications. 
2. The counts for the other reject reasons are very low, so they can be excluded from the analysis.

### Merging the two dataframes ( current application with previous application ) :

In [None]:
# Merging application dataset with previous application dataset on the client id.
# `suffixes` must be a 2-item sequence (left suffix, right suffix); the bare string '_x'
# would be unpacked character by character into '_' and 'x', producing column names
# such as 'AMT_CREDIT_' and 'AMT_CREDITx' for the overlapping columns.
df_merged = pd.merge(
    left=df_curr_app,
    right=df_prev_app,
    how='inner',
    on='SK_ID_CURR',
    suffixes=('_x', '_y'),
)
df_merged.columns

In [None]:
df_merged.shape

In [None]:
# Pivot table: mean TARGET (share of clients with payment difficulties) per client type
# and previous contract status. The string alias 'mean' is used instead of np.mean,
# which pandas 2.x deprecates as an aggfunc.
plot1 = pd.pivot_table(
    df_merged,
    values='TARGET',
    index=['NAME_CLIENT_TYPE'],
    columns=['NAME_CONTRACT_STATUS'],
    aggfunc='mean',
)
plot1

In [None]:
plot1.plot(kind='bar', figsize=[20,10]).legend()
plt.title('Client Types by Loan Status')
plt.show()

#### Inferences :
1. Most of the loan applications for new clients are cancelled.
2. For repeater clients, most of the loan applications are refused.
3. Across all the different client types, the 'NEW' client type has the maximum approved loan applications.
4. Across all the different client types, the 'XNA' client type has the maximum refused loan applications followed by 'REPEATER' client type.

In [None]:
# Pivot table: mean TARGET (share of clients with payment difficulties) per occupation type
# and previous contract status. The string alias 'mean' is used instead of np.mean,
# which pandas 2.x deprecates as an aggfunc.
plot2 = pd.pivot_table(
    df_merged,
    values='TARGET',
    index=['OCCUPATION_TYPE'],
    columns=['NAME_CONTRACT_STATUS'],
    aggfunc='mean',
)
plot2

In [None]:
plot2.plot(kind='barh', figsize=[20,25]).legend()
plt.title('Occupation Types by Loan Status')
plt.show()

#### Inferences :
1. Low-skill laborers, Laborers and Drivers have the highest amount of applications being refused as compared to being approved.
2. IT staff has the highest amount of applications being unused offer status but it has the lowest refusal rate as well.
3. IT staff, Accountants, Medicine Staff, Private Service Staff have comparatively balanced refusal and approval rates.

In [None]:
# Pivot table: mean TARGET (share of clients with payment difficulties) per income quantile band
# and previous contract status. The string alias 'mean' is used instead of np.mean,
# which pandas 2.x deprecates as an aggfunc.
plot3 = pd.pivot_table(
    df_merged,
    values='TARGET',
    index=['AMT_INCOME_QRANGE'],
    columns=['NAME_CONTRACT_STATUS'],
    aggfunc='mean',
)
plot3

In [None]:
plot3.plot(kind='bar', figsize=[15,5]).legend()
plt.title('Income level by Loan Status')
plt.show()

### Inferences :
1. Medium income range has the highest amount of applicants being refused.
2. The Approved applications are approx. similar for all income groups.

In [None]:
# Pivot table: mean TARGET (share of clients with payment difficulties) per education type
# and previous contract status. The string alias 'mean' is used instead of np.mean,
# which pandas 2.x deprecates as an aggfunc.
plot4 = pd.pivot_table(
    df_merged,
    values='TARGET',
    index=['NAME_EDUCATION_TYPE'],
    columns=['NAME_CONTRACT_STATUS'],
    aggfunc='mean',
)
plot4

In [None]:
plot4.plot(kind='bar', figsize=[10,5]).legend()
plt.title('Education type by Loan Status')
plt.show()

### Inferences :
1. The lower secondary education has the most number of applications being approved.
2. The academic degree has the highest number of applications unused.
3. The approved applications are less in number for Higher education type.

In [None]:
# Pivot table: mean TARGET (share of clients with payment difficulties) per income type
# and previous contract status. The string alias 'mean' is used instead of np.mean,
# which pandas 2.x deprecates as an aggfunc.
plot5 = pd.pivot_table(
    df_merged,
    values='TARGET',
    index=['NAME_INCOME_TYPE'],
    columns=['NAME_CONTRACT_STATUS'],
    aggfunc='mean',
)
plot5

In [None]:
plot5.plot(kind='bar', figsize=[10,5]).legend()
plt.title('Professions by Loan Status')
plt.show()

### Inferences :
1. Maternity leave group has the same count for all loan statuses.
2. Unemployed group has the highest cancellations, refusals and approvals after Maternity leave group.
3. All other groups except student have approx. similar loan statuses.

In [None]:
# Checking the counts of Income type
df_merged.NAME_INCOME_TYPE.value_counts()

## Conclusion :

1. Banks should focus more on new applicants who belong to the IT staff, Accountants, Medicine Staff or Private Service Staff groups, since the loan statuses (approved, refused, cancelled) are stable for such groups.
2. Banks should focus less on the Low-skill laborers, Laborers and Drivers occupation types, as they have a high refusal rate.
3. Income level does not affect the loan application statuses, and hence banks can choose to focus on other attributes.
4. Secondary and lower secondary education are stable education groups to focus on for loan approval, as the number of applicants in these groups is higher.
5. Banks should target the Working, Commercial associate and State servant income types, as the number of applicants is high and the statistics are stable.