In [None]:
## importing warnings library to ignore warnings
import warnings
warnings.simplefilter('ignore')

#import important and relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

#options to display all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
#load the datasets
application_data= pd.read_csv('../input/credit-card/application_data.csv')
pre_application= pd.read_csv('../input/credit-card/previous_application.csv')

In [None]:
#To view top 5 rows of data
application_data.head()

In [None]:
application_data.info()

In [None]:
#To check the data types of all the attributes
application_data.dtypes

In [None]:
#To see the statistical parameters of all numerical columns
application_data.describe()

In [None]:
#To check the shape of dataset
application_data.shape

## Data Pre-processing

In [None]:
# Percentage of missing values per column of application_data, highest first.
missing_counts = application_data.isna().sum().sort_values(ascending=False)
null_percentage = (missing_counts / len(application_data)) * 100
null_percentage

In [None]:
# Drop every column whose missing-value percentage exceeds 50 and keep the
# result in a new dataframe called df (application_data itself is untouched).
mostly_missing = null_percentage > 50.0
df = application_data.drop(columns=null_percentage[mostly_missing].index)
df.head()

In [None]:
#checking the shape of dataset after dropping the columns
df.shape

In [None]:
#looking at the number of null-values present in all attributes 
null= df.isnull().sum().sort_values(ascending=False)
null= null[null>0]
null

### Handling the missing values in each column

In [None]:
df['FLOORSMAX_AVG'].head()

In [None]:
df.FLOORSMAX_AVG.mean()

In [None]:
df['FLOORSMAX_AVG'].describe()

In [None]:
df.FLOORSMAX_AVG.median()

In [None]:
# Visualizing the column 'FLOORSMAX_AVG': distribution (left) and box plot (right).
fig = plt.figure(figsize=[10, 5])

ax1 = plt.subplot(1, 2, 1)
sns.distplot(df['FLOORSMAX_AVG'])

ax2 = plt.subplot(1, 2, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=df['FLOORSMAX_AVG'])

plt.show()

In [None]:
# Impute the missing values with the median, as the data seems to be left-skewed.
# NOTE(review): the FLOORSMAX_MEDI cell below describes its data as right-skewed — confirm the skew direction from the plot.
df['FLOORSMAX_AVG']= df['FLOORSMAX_AVG'].fillna(df['FLOORSMAX_AVG'].median())
# Remaining null count for the column after the fill.
df['FLOORSMAX_AVG'].isnull().sum()

In [None]:
df['FLOORSMAX_MEDI'].describe()

In [None]:
# Distribution (left) and box plot (right) of 'FLOORSMAX_MEDI'.
fig = plt.figure(figsize=[10, 5])
ax1 = plt.subplot(1, 2, 1)
sns.distplot(df['FLOORSMAX_MEDI'])

ax2 = plt.subplot(1, 2, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=df['FLOORSMAX_MEDI'])

plt.show()

In [None]:
#Imputing the median as data is right-skewed.
df['FLOORSMAX_MEDI']= df['FLOORSMAX_MEDI'].fillna(df['FLOORSMAX_MEDI'].median())
df['FLOORSMAX_MEDI'].isnull().sum()

In [None]:
df['FLOORSMAX_MODE'].describe()

In [None]:
df['FLOORSMAX_MODE'].mode()

In [None]:
##Imputing the median as data is right-skewed.
df['FLOORSMAX_MODE']= df['FLOORSMAX_MODE'].fillna(df['FLOORSMAX_MODE'].median())
df['FLOORSMAX_MODE'].isnull().sum()

In [None]:
df['YEARS_BEGINEXPLUATATION_AVG'].describe()

In [None]:
# Distribution, box plot and value-vs-row-index scatter of 'YEARS_BEGINEXPLUATATION_AVG'.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
sns.distplot(df['YEARS_BEGINEXPLUATATION_AVG'])

ax2 = plt.subplot(1, 3, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=df['YEARS_BEGINEXPLUATATION_AVG'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=df['YEARS_BEGINEXPLUATATION_AVG'], y=df.index)

plt.show()

In [None]:
df['YEARS_BEGINEXPLUATATION_AVG'].mode()

In [None]:
#Imputing the median value as the data contains outliers and is left skewed.
df['YEARS_BEGINEXPLUATATION_AVG']= df['YEARS_BEGINEXPLUATATION_AVG'].fillna(df['YEARS_BEGINEXPLUATATION_AVG'].median())

In [None]:
# Fill the gaps in the remaining YEARS_BEGINEXPLUATATION_* columns with each
# column's own median, one column at a time.
for years_col in ('YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BEGINEXPLUATATION_MODE'):
    df[years_col] = df[years_col].fillna(df[years_col].median())

In [None]:
df['TOTALAREA_MODE'].head()

In [None]:
df['TOTALAREA_MODE'].describe()

In [None]:
df['TOTALAREA_MODE'].mode()

In [None]:
# Distribution, box plot and value-vs-row-index scatter of 'TOTALAREA_MODE'.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
sns.distplot(df['TOTALAREA_MODE'])

ax2 = plt.subplot(1, 3, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=df['TOTALAREA_MODE'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=df['TOTALAREA_MODE'], y=df.index)

plt.show()

In [None]:
df['TOTALAREA_MODE']=df['TOTALAREA_MODE'].fillna(df['TOTALAREA_MODE'].median())

In [None]:
df['EMERGENCYSTATE_MODE'].describe()

In [None]:
# Category counts of 'EMERGENCYSTATE_MODE'; `x=` is required because
# seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=df['EMERGENCYSTATE_MODE']);

In [None]:
df['EMERGENCYSTATE_MODE'].value_counts()

In [None]:
df['EMERGENCYSTATE_MODE'].isnull().sum()

As this attribute is categorical and the data is already skewed and biased, we shall not impute the mode value. Hence, the column is left as it is.

In [None]:
df['OCCUPATION_TYPE'].value_counts()

In [None]:
# Category counts of 'OCCUPATION_TYPE'; `x=` is required because
# seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=df['OCCUPATION_TYPE'])
plt.xticks(rotation=90)
plt.show()

Values are not imputed for this column, as imputation could bias the data.

In [None]:
# Peek at five arbitrary value counts; the fixed seed keeps the displayed
# sample identical across kernel restarts.
df['EXT_SOURCE_3'].value_counts().sample(5, random_state=0)

In [None]:
df['EXT_SOURCE_3'].describe()

In [None]:
# Distribution, box plot and value-vs-row-index scatter of 'EXT_SOURCE_3'.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
sns.distplot(df['EXT_SOURCE_3'])

ax2 = plt.subplot(1, 3, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=df['EXT_SOURCE_3'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=df['EXT_SOURCE_3'], y=df.index)

plt.show()

In [None]:
#The distribution seem to be approximately normal and also mean & median values are very close, mean value is imputed.
df['EXT_SOURCE_3']= df['EXT_SOURCE_3'].fillna(df['EXT_SOURCE_3'].mean())

In [None]:
df['AMT_REQ_CREDIT_BUREAU_YEAR'].describe()

In [None]:
df['AMT_REQ_CREDIT_BUREAU_YEAR'].unique()

In [None]:
df['AMT_REQ_CREDIT_BUREAU_YEAR'].isnull().sum()

In [None]:
# Count plot, box plot and value-vs-row-index scatter of
# 'AMT_REQ_CREDIT_BUREAU_YEAR' (before imputation).
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
# `x=` is required: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=df['AMT_REQ_CREDIT_BUREAU_YEAR'])
plt.xticks(rotation=90)

ax2 = plt.subplot(1, 3, 2)
sns.boxplot(x=df['AMT_REQ_CREDIT_BUREAU_YEAR'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=df['AMT_REQ_CREDIT_BUREAU_YEAR'], y=df.index)

plt.show()

In [None]:
df['AMT_REQ_CREDIT_BUREAU_YEAR']= df['AMT_REQ_CREDIT_BUREAU_YEAR'].fillna(df['AMT_REQ_CREDIT_BUREAU_YEAR'].mode()[0])


In [None]:
# Same three panels for 'AMT_REQ_CREDIT_BUREAU_YEAR', re-drawn from the
# current contents of df.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
# `x=` is required: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=df['AMT_REQ_CREDIT_BUREAU_YEAR'])
plt.xticks(rotation=90)

ax2 = plt.subplot(1, 3, 2)
sns.boxplot(x=df['AMT_REQ_CREDIT_BUREAU_YEAR'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=df['AMT_REQ_CREDIT_BUREAU_YEAR'], y=df.index)

plt.show()

In [None]:
# The attributes below are categorical in nature, skewed and contain outliers,
# so each one is filled with its own most frequent value (mode).
bureau_request_cols = (
    'AMT_REQ_CREDIT_BUREAU_QRT',
    'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
)
for bureau_col in bureau_request_cols:
    most_frequent = df[bureau_col].mode()[0]
    df[bureau_col] = df[bureau_col].fillna(most_frequent)

In [None]:
df['AMT_REQ_CREDIT_BUREAU_WEEK'].isnull().sum()

In [None]:
df['NAME_TYPE_SUITE'].value_counts()

In [None]:
# Category counts of 'NAME_TYPE_SUITE'; `x=` is required because
# seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=df['NAME_TYPE_SUITE']);
plt.xticks(rotation=90)
plt.show()

In [None]:
df['NAME_TYPE_SUITE'].isnull().sum()

In [None]:
# This is a categorical attribute and have very low percentage of null values. Hence, imputed with mode values.
df['NAME_TYPE_SUITE']= df['NAME_TYPE_SUITE'].fillna(df['NAME_TYPE_SUITE'].mode()[0])

In [None]:
df['DEF_60_CNT_SOCIAL_CIRCLE'].value_counts()

In [None]:
# Count plot, box plot and value-vs-row-index scatter of 'DEF_60_CNT_SOCIAL_CIRCLE'.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
# `x=` is required: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=df['DEF_60_CNT_SOCIAL_CIRCLE'])
plt.xticks(rotation=90)

ax2 = plt.subplot(1, 3, 2)
sns.boxplot(x=df['DEF_60_CNT_SOCIAL_CIRCLE'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=df['DEF_60_CNT_SOCIAL_CIRCLE'], y=df.index)

plt.show()

In [None]:
df['DEF_60_CNT_SOCIAL_CIRCLE'].isnull().sum()

In [None]:
df['DEF_60_CNT_SOCIAL_CIRCLE'].mode()[0]

In [None]:
df['DEF_60_CNT_SOCIAL_CIRCLE']= df['DEF_60_CNT_SOCIAL_CIRCLE'].fillna(df['DEF_60_CNT_SOCIAL_CIRCLE'].mode()[0])

In [None]:
# Fill the remaining *_CNT_SOCIAL_CIRCLE columns with their most frequent value.
for circle_col in ('OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE'):
    df[circle_col] = df[circle_col].fillna(df[circle_col].mode()[0])

In [None]:
df['EXT_SOURCE_2'].describe()

In [None]:
# Distribution, box plot and value-vs-row-index scatter of 'EXT_SOURCE_2'.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
sns.distplot(df['EXT_SOURCE_2'])
plt.xticks(rotation=90)

ax2 = plt.subplot(1, 3, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=df['EXT_SOURCE_2'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=df['EXT_SOURCE_2'], y=df.index)

plt.show()

In [None]:
#The data is left skewed and have less null values, median value is imputed.
df['EXT_SOURCE_2']= df['EXT_SOURCE_2'].fillna(df['EXT_SOURCE_2'].median())
df['EXT_SOURCE_2'].isnull().sum()

In [None]:
df['AMT_GOODS_PRICE'].describe()

In [None]:
# Distribution, box plot and value-vs-row-index scatter of 'AMT_GOODS_PRICE'.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
sns.distplot(df['AMT_GOODS_PRICE'])
plt.xticks(rotation=90)

ax2 = plt.subplot(1, 3, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=df['AMT_GOODS_PRICE'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=df['AMT_GOODS_PRICE'], y=df.index)

plt.show()

In [None]:
#The data is right skewed and also contains outliers. Therefore, median value is imputed.
df['AMT_GOODS_PRICE']=df['AMT_GOODS_PRICE'].fillna(df['AMT_GOODS_PRICE'].median())

In [None]:
df['AMT_ANNUITY'].describe()

In [None]:
df['AMT_ANNUITY'].isnull().sum()

In [None]:
# Distribution, box plot and value-vs-row-index scatter of 'AMT_ANNUITY'.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
sns.distplot(df['AMT_ANNUITY'])
plt.xticks(rotation=90)

ax2 = plt.subplot(1, 3, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=df['AMT_ANNUITY'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=df['AMT_ANNUITY'], y=df.index)

plt.show()

In [None]:
#The data is right skewed and also contains outliers. Therefore, median value is imputed.
df['AMT_ANNUITY']=df['AMT_ANNUITY'].fillna(df['AMT_ANNUITY'].median())

In [None]:
df['CNT_FAM_MEMBERS'].unique()

In [None]:
df['CNT_FAM_MEMBERS'].isnull().sum()

In [None]:
df['CNT_FAM_MEMBERS'].describe()

In [None]:
# Counts per family size; `x=` is required because seaborn >= 0.12 accepts
# only `data` positionally.
sns.countplot(x=df['CNT_FAM_MEMBERS']);

This is a categorical attribute and may have a strong influence on the TARGET column. Therefore, no values are imputed.

In [None]:
df['DAYS_LAST_PHONE_CHANGE'].describe()

In [None]:
# Distribution, box plot and value-vs-row-index scatter of 'DAYS_LAST_PHONE_CHANGE'.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
sns.distplot(df['DAYS_LAST_PHONE_CHANGE'])
plt.xticks(rotation=45)

ax2 = plt.subplot(1, 3, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=df['DAYS_LAST_PHONE_CHANGE'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=df['DAYS_LAST_PHONE_CHANGE'], y=df.index)

plt.show()

In [None]:
df['DAYS_LAST_PHONE_CHANGE'].isnull().sum()

In [None]:
df['DAYS_LAST_PHONE_CHANGE'].mode()

In [None]:
df['DAYS_LAST_PHONE_CHANGE']= df['DAYS_LAST_PHONE_CHANGE'].fillna(df['DAYS_LAST_PHONE_CHANGE'].mode()[0])

In [None]:
null= df.isnull().sum().sort_values(ascending=False)
null= null[null>0]
null

In [None]:
null_p= null/len(df)
null_p

# pre-processing of application_data is completed

In [None]:
df['TARGET'].value_counts(normalize=True)

In [None]:
imbalance_ratio= len(df[df['TARGET']==0])/ len(df[df['TARGET']==1])
imbalance_ratio

The dataset is highly imbalanced, with an imbalance ratio of 11.38. Therefore, the data is segregated into the following two parts:

In [None]:
# Split df by the TARGET flag: rows with TARGET == 1 and rows with TARGET == 0.
client_difficulties = df.loc[df['TARGET'].eq(1)]
client_no_difficulties = df.loc[df['TARGET'].eq(0)]

Storing the numerical columns and categorical columns separately for further data visualization and analysis

In [None]:
num_cols= df.describe().columns
num_cols

In [None]:
cat_cols= df.select_dtypes(include=['object']).columns
cat_cols

## Univariate Analysis

In [None]:
# Defining a function to plot the categorical attributes in the application_data dataset.
def cat_var_plots(train, train0, train1, column):
    """Draw a three-panel summary figure for one categorical column.

    Panels:
      1. Pie chart of the category shares of ``column`` in ``train``.
      2. Count plot of ``column`` in ``train`` split by the ``TARGET`` column.
      3. Bar chart of each category's share within ``train0`` (bars '0') and
         ``train1`` (bars '1'); a share is the category's value count divided
         by the total number of rows of that subset.

    Parameters
    ----------
    train : pandas.DataFrame
        Full data set; must contain ``column`` and ``TARGET``.
    train0, train1 : pandas.DataFrame
        Subsets shown as the '0' and '1' bars of the third panel.
    column : str
        Name of the column to plot.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig = plt.figure(figsize=(13, 10))

    ax1 = plt.subplot(221)
    train[column].value_counts().plot.pie(autopct="%1.0f%%", ax=ax1)
    plt.title('Plotting data for the column: ' + column)

    ax2 = plt.subplot(222)
    sns.countplot(x=column, hue='TARGET', data=train, ax=ax2)
    plt.xticks(rotation=90)
    plt.title('Plotting data for target in terms of total count')

    ax3 = plt.subplot(223)
    share_0 = train0[column].value_counts() / len(train0)
    share_1 = train1[column].value_counts() / len(train1)
    # Assigning the two Series column-by-column to an empty frame fixes the
    # index to train0's categories and silently drops categories that occur
    # only in train1. Build the index as train0's order followed by any
    # extra train1 categories so nothing is lost.
    categories = share_0.index.append(share_1.index.difference(share_0.index))
    shares = pd.DataFrame({
        '0': share_0.reindex(categories),
        '1': share_1.reindex(categories),
    })
    shares.plot.bar(ax=ax3)
    plt.title('Plotting data for target in terms of percentage')

    fig.tight_layout()

    plt.show()

In [None]:
# One summary figure per categorical column, comparing TARGET=0 and TARGET=1.
for cat_col in cat_cols:
    print("Plotting", cat_col)
    cat_var_plots(
        train=df,
        train0=client_no_difficulties,
        train1=client_difficulties,
        column=cat_col,
    )

### Insights from categorical univariate analysis

INSIGHTS
1. More people avail cash loans than revolving loans. Clients with cash loans seem to have more difficulty repaying than those with revolving loans.
2. The majority of clients availing loans are female. However, males seem more likely to default than females.
3. Clients who don't own a car tend to be at higher risk of becoming defaulters.
4. The majority of people availing loans are from the working category; however, they are also more likely to default.
5. The majority of people availing loans are married, and they are also good at repaying their loans.
6. People who own a house/apartment are more likely to avail loans.
7. People whose occupation type is labourer avail more loans; however, they are also more likely to default.
8. The majority of clients apply for loans on weekdays. However, people who apply for loans on Sunday are less likely to default.

# Analysis for the Outliers

In [None]:
# Plotting box plots for numerical columns.
fig_dims = (12, 5)
for column in num_cols:
    fig, ax = plt.subplots(figsize=fig_dims)
    # Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
    sns.boxplot(x=df[column], ax=ax)
    # Space added so the title reads "Plotting <column>" instead of running the words together.
    plt.title('Plotting ' + column)
    plt.show()

In [None]:
# Scatter plot of every int64/float64 column against the row index.
train_numerical = df.select_dtypes(include=['int64', 'float64']).columns
for numeric_col in train_numerical:
    plt.figure(figsize=(10, 5))
    plt.scatter(x=df.index, y=df[numeric_col])
    plt.title(f"Plot of {numeric_col}")
    plt.show()

## We observed various outliers in dataset. Some are listed below:
- 1. CNT_CHILDREN - There are two cases where the number of children is 19 and three cases where the number of children is
     14, which seems very illogical, so we believe these are invalid outliers and should be removed.
     
     
- 2. AMT_ANNUITY - it is valid outliers as a client might have a requirement of such a huge loan amount.


- 3. DAYS_EMPLOYED : The value 365243 is present here. However, the number of days can't be positive, as we cannot know when the client will be employed, so we think this is an invalid data point and hence an outlier. It might be due to a manual error during data entry, as all other values in this column are negative.


- 4. AMT_INCOME_TOTAL - The maximum income is far above the mean and the 75th percentile. Hence, it is an outlier. As this data point may influence the analysis, it should be treated.


- 5. CNT_FAM_MEMBERS - Nowadays it is practically impossible to have a family of more than 10-12 members. Hence, we think there are many outliers present in this attribute.


- 6. REGION_POPULATION_RELATIVE : There are some outliers in densely populated regions, where the loan amount is also high (above the 75th percentile).


## Finding Top 10 Correlations

In [None]:
# Plotting heatmap on numerical columns where clients pay their EMIs on time.
# Numeric/bool columns are selected explicitly: from pandas 2.0 corr() no
# longer drops object columns by default and raises on them instead.
corr = client_no_difficulties.select_dtypes(include=['number', 'bool']).corr()
title = "Pays On Time"
# Boolean mask hiding the upper triangle (diagonal included) so each pair is drawn once.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(11, 9))
plt.title(title, fontsize = 18, color="Blue")
ttl = ax.title
ttl.set_position([0.5,1.05])
with sns.axes_style("white"):
    # Draw on the axes created above rather than relying on the current-axes state.
    ax = sns.heatmap(corr, mask=mask, vmax=.3, square=True, ax=ax);

In [None]:
# Absolute pairwise correlations for clients who pay on time, as a Series
# indexed by (column, column) pairs and sorted in ascending order.
# Numeric/bool columns are selected explicitly: from pandas 2.0 corr() no
# longer drops object columns by default and raises on them instead.
corr0 = client_no_difficulties.select_dtypes(include=['number', 'bool']).corr()
# Removing negative sign from correlation
corr0  = corr0.abs()
# unstack() turns the matrix into a Series keyed by (column, column)
corr0 = corr0.unstack()
corr0 = corr0.sort_values(kind='quicksort').dropna()
# Drop each column's correlation with itself by comparing the two index
# levels; testing the value against 1 is fragile for floats and would also
# discard genuinely perfectly correlated column pairs.
corr0 = corr0[corr0.index.get_level_values(0) != corr0.index.get_level_values(1)]
corr0

In [None]:
# corr0 holds every ordered pair, so each correlation appears twice: (a, b)
# and (b, a). Keep one entry per unordered pair instead of taking every
# second row of the tail, which only works when the two copies happen to be
# adjacent after the (unstable) sort.
unordered_pairs = corr0.index.map(lambda pair: tuple(sorted(pair)))
top_10_ontime = corr0[~unordered_pairs.duplicated()].tail(10)
print("Top10 Correration who Pays On Time")
top_10_ontime

In [None]:
# Plotting heatmap on numerical columns where clients don't pay their EMIs on time and are likely to become defaulters.
# Numeric/bool columns are selected explicitly: from pandas 2.0 corr() no
# longer drops object columns by default and raises on them instead.
corr = client_difficulties.select_dtypes(include=['number', 'bool']).corr()
title = "Defaulters"
# Boolean mask hiding the upper triangle (diagonal included) so each pair is drawn once.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(11, 9))
plt.title(title, fontsize = 18,color='Blue')
ttl = ax.title
ttl.set_position([0.5,1.05])
with sns.axes_style("white"):
    # Draw on the axes created above rather than relying on the current-axes state.
    ax = sns.heatmap(corr, mask=mask, vmax=.3, square=True, ax=ax);

In [None]:
# Absolute pairwise correlations for Target=1, as a Series indexed by
# (column, column) pairs and sorted in ascending order.
# Numeric/bool columns are selected explicitly: from pandas 2.0 corr() no
# longer drops object columns by default and raises on them instead.
corr1 = (
    client_difficulties
    .select_dtypes(include=['number', 'bool'])
    .corr()
    .abs()
    .unstack()
    .sort_values(kind='quicksort')
    .dropna()
)
# Drop each column's correlation with itself by comparing the two index
# levels; testing the value against 1 is fragile for floats and would also
# discard genuinely perfectly correlated column pairs.
corr1 = corr1[corr1.index.get_level_values(0) != corr1.index.get_level_values(1)]
print("Top 10 Correlation who are Defaulters")
# The 20 largest entries; every pair is still present in both orders here.
top_10defaulter= corr1.tail(20)

In [None]:
corr1.tail(20)

In [None]:
# corr1 holds every ordered pair, so each correlation appears twice: (a, b)
# and (b, a). Keep one entry per unordered pair instead of taking every
# second row of the tail, which only works when the two copies happen to be
# adjacent after the (unstable) sort.
unordered_pairs = corr1.index.map(lambda pair: tuple(sorted(pair)))
top_10_defaulter = corr1[~unordered_pairs.duplicated()].tail(10)
print("Top10 Correlation who don't Pays On Time")
top_10_defaulter

## Previous Application Data

In [None]:
#To view top 5 rows of previous application dataset
pre_application.head()

In [None]:
#To view the shape of dataset.
pre_application.shape

In [None]:
pre_application.info()

In [None]:
#To view the statistical parameters of numerical columns in previous application dataset.
pre_application.describe()

In [None]:
df.shape

In [None]:
pre_application.shape

In [None]:
# Inner-join the cleaned application data with the previous applications on
# the shared SK_ID_CURR column.
new = df.merge(pre_application, how='inner', on='SK_ID_CURR')

In [None]:
new.shape

## Data cleaning of merged dataframe

In [None]:
#Now checking the null values percentage in new dataframe (merged dataframe)
new_null= ((new.isnull().sum()/len(new))*100).sort_values(ascending=False)
new_null

In [None]:
#Dropping the columns having null values percentage above 45.0
pre_df= new.drop(columns= new_null[new_null > 45.0].index)
pre_df.head()

In [None]:
pre_df.isnull().sum().sort_values(ascending=False)

In [None]:
# Number of merged rows per client, most frequent first. Only the top of the
# list is shown: display.max_rows is set to None at the top of the notebook,
# so the full Series would print one line per distinct SK_ID_CURR.
pre_df['SK_ID_CURR'].value_counts().head(10)

There are duplicate entries for SK_ID_CURR. But as client can apply for loan multiple times so it is acceptable.

In [None]:
pre_df['NFLAG_INSURED_ON_APPROVAL'].describe()

In [None]:
# Plotting countplot for NFLAG_INSURED_ON_APPROVAL; `x=` is required because
# seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=pre_df['NFLAG_INSURED_ON_APPROVAL'])
plt.show()

Data can't be imputed as it may lead to biasing.

In [None]:
pre_df['DAYS_LAST_DUE'].describe()

In [None]:
# Distribution, box plot and value-vs-row-index scatter of 'DAYS_LAST_DUE'.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
sns.distplot(pre_df['DAYS_LAST_DUE'])
plt.xticks(rotation=90)

ax2 = plt.subplot(1, 3, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=pre_df['DAYS_LAST_DUE'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=pre_df['DAYS_LAST_DUE'], y=pre_df.index)

plt.show()

Data Seems to be highly biased. No imputations made.

In [None]:
pre_df['DAYS_LAST_DUE_1ST_VERSION'].describe()

Data Seems to be highly biased. No imputations made.

In [None]:
pre_df['DAYS_LAST_DUE_1ST_VERSION'].describe()

Data Seems to be highly biased. No imputations made.

In [None]:
pre_df['DAYS_FIRST_DUE'].describe()

Data Seems to be highly biased. No imputations made.

In [None]:
pre_df['DAYS_FIRST_DRAWING'].describe()

In [None]:
# Distribution, box plot and value-vs-row-index scatter of 'DAYS_FIRST_DRAWING'.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
sns.distplot(pre_df['DAYS_FIRST_DRAWING'])
plt.xticks(rotation=90)

ax2 = plt.subplot(1, 3, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=pre_df['DAYS_FIRST_DRAWING'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=pre_df['DAYS_FIRST_DRAWING'], y=pre_df.index)

plt.show()

In [None]:
pre_df['DAYS_TERMINATION'].describe()

Data Seems to be highly biased. No imputations made.

In [None]:
pre_df['OCCUPATION_TYPE'].unique()

In [None]:
# Category counts of 'OCCUPATION_TYPE' in the merged frame; `x=` is required
# because seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=pre_df['OCCUPATION_TYPE']);
plt.xticks(rotation=90)
plt.show()

As it is categorical data and may highly influence the target column, imputations may lead to biasing of data.

In [None]:
pre_df['AMT_GOODS_PRICE_y'].describe()

In [None]:
# Distribution, box plot and value-vs-row-index scatter of 'AMT_GOODS_PRICE_y'.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
sns.distplot(pre_df['AMT_GOODS_PRICE_y'])
plt.xticks(rotation=90)

ax2 = plt.subplot(1, 3, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=pre_df['AMT_GOODS_PRICE_y'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=pre_df['AMT_GOODS_PRICE_y'], y=pre_df.index)

plt.show()

In [None]:
pre_df['AMT_GOODS_PRICE_y']=pre_df['AMT_GOODS_PRICE_y'].fillna(pre_df['AMT_GOODS_PRICE_y'].median())

In [None]:
pre_df['AMT_ANNUITY_y'].describe()

In [None]:
# Distribution, box plot and value-vs-row-index scatter of 'AMT_ANNUITY_y'.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
sns.distplot(pre_df['AMT_ANNUITY_y'])
plt.xticks(rotation=90)

ax2 = plt.subplot(1, 3, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=pre_df['AMT_ANNUITY_y'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=pre_df['AMT_ANNUITY_y'], y=pre_df.index)

plt.show()

In [None]:
pre_df['AMT_ANNUITY_y']= pre_df['AMT_ANNUITY_y'].fillna(pre_df['AMT_ANNUITY_y'].median())

In [None]:
pre_df['CNT_PAYMENT'].describe()

In [None]:
# Distribution, box plot and value-vs-row-index scatter of 'CNT_PAYMENT'.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
sns.distplot(pre_df['CNT_PAYMENT'])
plt.xticks(rotation=90)

ax2 = plt.subplot(1, 3, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=pre_df['CNT_PAYMENT'])

# Third panel gets its own name instead of rebinding ax2.
ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=pre_df['CNT_PAYMENT'], y=pre_df.index)

plt.show()

In [None]:
pre_df['CNT_PAYMENT']=pre_df['CNT_PAYMENT'].fillna(pre_df['CNT_PAYMENT'].median())

In [None]:
pre_df['AMT_CREDIT_y'].describe()

In [None]:
# Distribution, box plot and value-vs-row-index scatter of 'AMT_CREDIT_y'.
fig = plt.figure(figsize=[15, 5])
ax1 = plt.subplot(1, 3, 1)
sns.distplot(pre_df['AMT_CREDIT_y'])
plt.xticks(rotation=90)

ax2 = plt.subplot(1, 3, 2)
# Pass the column as `x=`: seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x=pre_df['AMT_CREDIT_y'])

ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(x=pre_df['AMT_CREDIT_y'], y=pre_df.index)

plt.show()

In [None]:
pre_df['AMT_CREDIT_y']=pre_df['AMT_CREDIT_y'].fillna(pre_df['AMT_CREDIT_y'].median())

In [None]:
pre_df['PRODUCT_COMBINATION'].value_counts()

In [None]:
# Category counts of 'PRODUCT_COMBINATION'; `x=` is required because
# seaborn >= 0.12 accepts only `data` positionally.
fig = plt.figure(figsize=[12, 12])
sns.countplot(x=pre_df['PRODUCT_COMBINATION'])
plt.xticks(rotation=90)
plt.show()

In [None]:
pre_df['PRODUCT_COMBINATION'].mode()

This column seems to be important attribute, and imputation may lead to biasing of data.

In [None]:
pre_df['TARGET'].value_counts()

In [None]:
pre_imbalance_ratio= len(pre_df[pre_df['TARGET']==0])/ len(pre_df[pre_df['TARGET']==1])
pre_imbalance_ratio

The dataset is highly imbalanced, with an imbalance ratio of 10.55. Therefore, the data is segregated into the following two parts:

In [None]:
# Split the merged frame by the TARGET flag: rows with TARGET == 1 and rows with TARGET == 0.
p_client_difficulties = pre_df.loc[pre_df['TARGET'].eq(1)]
p_client_no_difficulties = pre_df.loc[pre_df['TARGET'].eq(0)]

In [None]:
# Defining the plotting function for bi-variate analysis.
def plotting_bivariate(column, hue):
    """Draw a four-panel figure relating ``column`` to ``hue`` and to TARGET.

    Reads the notebook-level frames ``pre_df``, ``p_client_no_difficulties``
    and ``p_client_difficulties``.

    Panels:
      1. Pie chart of the category shares of ``column`` in ``pre_df``.
      2. Bar chart of each category's share within the TARGET=0 subset
         (bars '0') and the TARGET=1 subset (bars '1'); a share is the
         category's value count divided by the subset's total row count.
      3. Count plot of ``column`` split by ``hue`` for the TARGET=0 subset.
      4. Count plot of ``column`` split by ``hue`` for the TARGET=1 subset.

    Parameters
    ----------
    column : str
        Column plotted on the category axis of every panel.
    hue : str
        Column used to split the count plots in panels 3 and 4.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig = plt.figure(figsize=(13, 10))

    ax1 = plt.subplot(221)
    pre_df[column].value_counts().plot.pie(autopct='%1.0f%%', ax=ax1)
    plt.title('Plotting data for the column: ' + column)

    ax2 = plt.subplot(222)
    share_0 = p_client_no_difficulties[column].value_counts() / len(p_client_no_difficulties)
    share_1 = p_client_difficulties[column].value_counts() / len(p_client_difficulties)
    # Assigning the two Series column-by-column to an empty frame fixes the
    # index to the TARGET=0 categories and silently drops categories that
    # occur only for TARGET=1. Keep the TARGET=0 order and append the rest.
    categories = share_0.index.append(share_1.index.difference(share_0.index))
    shares = pd.DataFrame({
        '0': share_0.reindex(categories),
        '1': share_1.reindex(categories),
    })
    shares.plot.bar(ax=ax2)
    # This panel shows shares, not counts; the count/percentage titles were swapped.
    plt.title('Plotting data for target in terms of percentage')

    ax3 = plt.subplot(223)
    sns.countplot(x=column, hue=hue, data=p_client_no_difficulties, ax=ax3)
    plt.xticks(rotation=90)
    plt.title('Plotting data for Target=0 in terms of total count')

    ax4 = plt.subplot(224)
    sns.countplot(x=column, hue=hue, data=p_client_difficulties, ax=ax4)
    plt.xticks(rotation=90)
    plt.title('Plotting data for Target=1 in terms of total count')

    fig.tight_layout()
    plt.show()

In [None]:
pre_df.columns

In [None]:
numerical_cols= pre_df.describe().columns

In [None]:
numerical_cols

In [None]:
categorical_cols= pre_df.select_dtypes(include=['object']).columns
categorical_cols

## Bi-Variate Analysis

In [None]:
# Categorical columns compared pairwise in the bi-variate analysis.
cat_analysis_columns = [
    'NAME_CONTRACT_TYPE_x',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'OCCUPATION_TYPE',
    'NAME_CONTRACT_STATUS',
    'NAME_SELLER_INDUSTRY',
    'NAME_YIELD_GROUP',
]
# One figure for every ordered pair of distinct columns: the first is the
# plotted column, the second is used as hue.
for col1, col2 in (
    (plotted, split_by)
    for plotted in cat_analysis_columns
    for split_by in cat_analysis_columns
    if plotted != split_by
):
    plotting_bivariate(col1, col2)

### Insights from bi-variate analysis

INSIGHTS
1. More females seem to avail cash loans than revolving loans, and they are less likely to default than males.
2. People who own realty prefer cash loans more than those who don't own realty, and they are also less likely to default.
3. People with secondary / secondary special education prefer cash loans over revolving loans and are likely to have no difficulties in loan repayment.
4. Clients who tend to avail cash loans are more likely to get their loan applications approved.
5. Working professionals who own realty apply for loans more often and are less likely to default.



## Multi-Variate Analysis and Insights

In [None]:
plt.figure(figsize=(16,12))
sns.boxplot(data =p_client_difficulties, y='AMT_ANNUITY_x', x='NAME_EDUCATION_TYPE', hue ='NAME_INCOME_TYPE')
plt.title('Amount_Annuity vs Name_Income_Type vs Name_Education_Type')
plt.show()

Clients who have secondary education and are on maternity leave seem to have a higher annuity amount. However, they are more likely to default.

In [None]:
plt.figure(figsize=(16,12))
sns.boxplot(data =p_client_difficulties, y='AMT_ANNUITY_x', x='NAME_EDUCATION_TYPE', hue ='NAME_CONTRACT_STATUS')
plt.title('Amount_Annuity vs Name_Contract_Status vs Name_Education_Type')
plt.show()

People with lower secondary education tend to have the lowest annuity amount and are more likely not to use the sanctioned loan.

In [None]:
plt.figure(figsize=(16,12))
sns.boxplot(data =p_client_no_difficulties, y='AMT_CREDIT_x', x='NAME_EDUCATION_TYPE', hue ='NAME_CONTRACT_STATUS')
plt.title('Amount_Credit vs Name_Contract_Status vs Name_Education_Type')
plt.show()

The people with academic degree tends to have highest amount credit and are more likely to get their applications approved.

In [None]:
plt.figure(figsize=(16,12))
sns.boxplot(data =p_client_no_difficulties, y='AMT_CREDIT_x', x='NAME_EDUCATION_TYPE', hue ='CODE_GENDER')
plt.title('Amount_Credit vs Code_Gender vs Name_Education_Type')
plt.show()

Females with academic degree have higher amount of credit and are less likely to be defaulter.

In [None]:
plt.figure(figsize=(16,14))
sns.boxplot(data =p_client_no_difficulties, y='AMT_ANNUITY_x', x='OCCUPATION_TYPE', hue ='NAME_EDUCATION_TYPE')
plt.title('Amount_Annuity vs Occupation_Type vs Name_Education_Type')
plt.xticks(rotation=90)
plt.show()

The private staff employees having academic degree seems to have highest amount annuity as compared to people with different occupation type.