In [None]:
import os
print(os.listdir("../input"))

# Credit Case study - EDA

## Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',200)
pd.set_option('display.max_rows',200)

## Loading the data

In [None]:
df = pd.read_csv('../input/loan-defaulter/application_data.csv')
df.head()

## Shape of dataframe

In [None]:
df.shape

## Information of dataframe

In [None]:
# Per-column dtype and non-null count for every column of the frame.
# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0.
df.info(verbose=True, show_counts=True)

## Finding the null value Percentage

In [None]:
100*df.isna().sum()/len(df)

## If a customer does not own a car, the car-age variable should not be null; technically it should be zero.

In [None]:
# Customers flagged as not owning a car get a car age of 0 instead of a
# missing value. One boolean-mask assignment through .loc replaces the
# row-by-row loop: it avoids chained indexing (df[col][i] = ...), which can
# end up writing to a temporary copy, and it does not depend on the index
# being 0..n-1.
no_car_mask = df['FLAG_OWN_CAR'] == 'N'
df.loc[no_car_mask, 'OWN_CAR_AGE'] = 0

In [None]:
# Share (in %) of rows where OWN_CAR_AGE is missing (True) vs. present (False).
# `normalize` is a boolean flag; the original passed 100, which only worked
# because any non-zero number is truthy.
df['OWN_CAR_AGE'].isnull().value_counts(normalize=True) * 100

#### Inference
    -- About 65 % of OWN_CAR_AGE was missing before; after the fix 99.99 % of the rows are populated

## Columns Having missing values greater than 50 %

In [None]:
# Names of the columns in which more than 50 % of the rows are missing.
missing_pct = 100 * df.isna().sum() / len(df)
df.columns[missing_pct > 50]

## Dropping the columns having missing values greater than 50 %

In [None]:
# Remove every column whose missing-value share exceeds 50 %.
sparse_cols = df.columns[(100 * df.isna().sum() / len(df)) > 50]
df = df.drop(columns=sparse_cols)

In [None]:
df.shape ## Recheck the shape of data

In [None]:
df.info(verbose=True)

#### From the dataframe info we can see that there are some categorical variables; to proceed with model building we need to convert them into numerical variables
- we can use the map and lambda functions to achieve this 

## Describe the data - Check for statistics

In [None]:
df.describe()

## Finding the columns having missing values less than 13 %

In [None]:
# Columns that have at least one missing value but fewer than 13 % missing.
null_pct = 100 * df.isna().sum() / len(df)
df.columns[(null_pct > 0) & (null_pct < 13)]

## Missing value treatment for columns having less than 13% missing values
### 1. AMT_ANNUITY - Amount of loan paid by customer

#### AMT_ANNUITY - Annual loan payment by customer

In [None]:
df.AMT_ANNUITY.describe()

In [None]:
# Three views of AMT_ANNUITY, each drawn on an explicitly created axis rather
# than relying on pyplot's "current axes" state.
fig, (ax_box, ax_dist, ax_bins) = plt.subplots(ncols=3, figsize=(16, 4.5))

# Plot 1: box plot of the full column. The series is passed as `x=` because
# seaborn >= 0.12 no longer accepts the data vector positionally.
ax_box.set_title('Plot 1. Annual loan payment\n by customer')
sns.boxplot(x=df['AMT_ANNUITY'], ax=ax_box)

# Plot 2: density of the values below the 99th percentile, which removes the
# long right tail that compresses Plot 1.
annuity_cutoff = df['AMT_ANNUITY'].quantile(0.99)
annuity_trimmed = df.loc[df['AMT_ANNUITY'] < annuity_cutoff, 'AMT_ANNUITY']
ax_dist.set_title('Plot 2. Annual loan payment\n by customer at 0.99 percentile')
# NOTE(review): distplot is deprecated in recent seaborn releases; kept here
# so the cell keeps running on the seaborn version the notebook was written for.
sns.distplot(annuity_trimmed, bins=20, color='green', ax=ax_dist)

# Plot 3: counts in 10 equal-width bins. sort_index() keeps the bars in bin
# order; value_counts() alone orders them by frequency, which scrambles the
# x-axis of a frequency distribution.
ax_bins.set_title('Plot 3. Freequency distrubution \nby AMT_ANNUITY Loan paid\n in 10 percentile bins')
pd.cut(df['AMT_ANNUITY'], bins=10).value_counts().sort_index().plot.bar(ax=ax_bins)
plt.show()

#### Inference
    -- Outliers are observed in Plot 1; because the plot is so compressed, they are not clearly visible
    -- Based on the observations in Plot 1, Plot 2 is plotted to show the density. Here it is observed that customer counts are mostly distributed in the range of 10000 to 30000
    -- Data is continuous
    -- The larger portion of the distribution ranges from 1359.00 to 52897.5
    -- Standard deviation is very high in this case, and the distribution is non-linear.

### 2. AMT_GOODS_PRICE - Cost of goods against which loan is applied for

In [None]:
df.AMT_GOODS_PRICE.describe()

In [None]:
# Three views of AMT_GOODS_PRICE, each drawn on an explicitly created axis.
fig, (ax_box, ax_dist, ax_bins) = plt.subplots(ncols=3, figsize=(16, 4.5))

# Plot 1: box plot of the full column (`x=` keyword: seaborn >= 0.12 rejects
# a positional data vector).
ax_box.set_title('Plot 1. Cost of goods against \nwhich loan is applied for')
sns.boxplot(x=df['AMT_GOODS_PRICE'], ax=ax_box)

# Plot 2: density of the values below the 99.5th percentile.
goods_cutoff = df['AMT_GOODS_PRICE'].quantile(0.995)
goods_trimmed = df.loc[df['AMT_GOODS_PRICE'] < goods_cutoff, 'AMT_GOODS_PRICE']
ax_dist.set_title('Plot 2. Cost of goods against \n which loan is applied for at \n 0.995 percentile')
# NOTE(review): distplot is deprecated in recent seaborn releases; kept for
# compatibility with the seaborn version the notebook was written for.
sns.distplot(goods_trimmed, bins=10, color='green', ax=ax_dist)

# Plot 3: counts in 10 equal-width bins, shown in bin order rather than in
# the frequency order that value_counts() returns.
ax_bins.set_title('Plot3. Cost of goods against \n which loan is applied for in \n 10 percentile bins')
pd.cut(df['AMT_GOODS_PRICE'], bins=10).value_counts().sort_index().plot.bar(ax=ax_bins)
plt.show()

#### Inference
    -- Data is continuous
    -- Outliers are observed in Plot 1; because the plot is so compressed, they are not clearly visible
    -- Based on the observations in Plot 1, Plot 2 is plotted to gain more insight
    -- Most of the distribution lies in the range 36490.5 - 1644350.0

### 3. NAME_TYPE_SUITE - Who was accompanying client when he/she was applying for the loan

In [None]:
df['NAME_TYPE_SUITE'].value_counts()

In [None]:
plt.title('NAME_TYPE_SUITE - Who was accompanying client \n when he/she was applying for the loan')
# The series is passed as `x=`: seaborn >= 0.12 no longer accepts the data
# vector as a positional argument.
sns.countplot(x=df['NAME_TYPE_SUITE'])
plt.xticks(rotation=90);

#### Inference
    -- Data is categorical
    -- Considering the number of records and the counts observed above, we can impute the missing values with "Unaccompanied", since it is the most frequent value

## 4. CNT_FAM_MEMBERS - Number of family members of customer

In [None]:
df['CNT_FAM_MEMBERS'].value_counts()

In [None]:
plt.title('Distrubution of number of family members\n in applicants family')
# The series is passed as `x=`: seaborn >= 0.12 no longer accepts the data
# vector as a positional argument.
sns.countplot(x=df['CNT_FAM_MEMBERS'])
plt.xticks(rotation=90);

#### Inference
    -- Data is categorical
    -- Considering the number of records and the counts observed above, we can impute the missing values with 2, since it is the most frequent value

## Datatype conversion

In [None]:
df.select_dtypes('float').columns

In [None]:
# Columns to store as integers instead of float64.
# NOTE(review): the original list named AMT_REQ_CREDIT_BUREAU_YEAR twice; the
# duplicate is removed here. If a different column was intended in its place
# (e.g. another AMT_REQ_CREDIT_BUREAU_* field), add it.
INT_COLS = ['DAYS_REGISTRATION', 'CNT_FAM_MEMBERS', 'DAYS_LAST_PHONE_CHANGE',
            'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
            'AMT_REQ_CREDIT_BUREAU_YEAR']
# Plain `int` cannot represent NaN, so astype(int, errors='ignore') silently
# returned the data unchanged whenever a column had missing values. The
# nullable 'Int64' dtype converts the values and keeps missing entries as <NA>.
# Unlike errors='ignore', this raises if a column holds non-integral values,
# which surfaces bad data instead of hiding it.
df[INT_COLS] = df[INT_COLS].astype('Int64')

In [None]:
df.describe()

#### Inference
    -- DAYS_BIRTH is negative; it can be converted into years and binned into age groups
    -- DAYS_EMPLOYED is negative; it can be converted into years
    -- DAYS_EMPLOYED has a maximum value of 365243.000000 with a standard deviation of 141275.766519, which is an impossible value. This might be an imputation error
    -- DAYS_REGISTRATION is negative
    -- DAYS_ID_PUBLISH is negative

### Conversion of the above parameters into positive values

In [None]:
# Replace each day-offset column with its absolute value.
for day_col in ('DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION'):
    df[day_col] = df[day_col].abs()

#### Inference 
    -- DAYS_BIRTH -> Since it represents age and cannot be negative, it has to be converted into a positive value
    -- DAYS_EMPLOYED -> Since it is a time period and cannot be negative, it has to be converted into a positive value
    -- DAYS_EMPLOYED -> The highest duration of employment is 365243.000000 days, which is about 1000 years; this is a practically impossible value and is an outlier
    -- DAYS_REGISTRATION -> Since it is a time period and cannot be negative, it has to be converted into a positive value

## Binning

### Age group


In [None]:
# Age in whole years; dividing by 365.25 accounts for leap years.
df['AGE'] = (df['DAYS_BIRTH'] / 365.25).round(0)

In [None]:
df.AGE.describe()

In [None]:
plt.figure(figsize=(16,5))
# `x=` keyword: seaborn >= 0.12 no longer accepts the data vector positionally.
sns.countplot(x=df['AGE'])
plt.xticks(rotation=90);

In [None]:
# Ten equal 5-year age bands: (20, 25], (25, 30], ..., (65, 70].
# linspace needs 11 edges to produce 10 bins; the original num=20 produced 19
# bins with fractional edges (about 2.63 years wide), which did not match the
# 10 bins described in the narrative.
df['AGE_GRP'] = pd.cut(df.AGE, bins=np.linspace(20, 70, num=11))

In [None]:
df['AGE_GRP'].describe()

In [None]:
plt.figure(figsize=(8,6))
# `x=` keyword: seaborn >= 0.12 no longer accepts the data vector positionally.
sns.countplot(x=df['AGE_GRP'])
plt.xticks(rotation=90);

#### Inference
    -- Age is continuous data that has been binned into categories
    -- Age is categorized into 10 bins as above. Applicants aged 25 to 65 are the most likely to apply for loans

### Work Experience

In [None]:
# Creating new column for work experience from days to year

In [None]:
# Work experience in whole years; dividing by 365.25 accounts for leap years.
df['WORK_EXP'] = (df['DAYS_EMPLOYED'] / 365.25).round(0)

In [None]:
df['WORK_EXP'].describe()

In [None]:
# Three views of WORK_EXP, each drawn on an explicitly created axis.
fig, (ax_box, ax_dist, ax_bins) = plt.subplots(ncols=3, figsize=(16, 4.5))

# Plot 1: box plot of the full column (`x=` keyword: seaborn >= 0.12 rejects
# a positional data vector).
ax_box.set_title('Plot 1. Bar plot for WORK_EXP \n of customer')
sns.boxplot(x=df['WORK_EXP'], ax=ax_box)

# Plot 2: density of the values below the 99th percentile.
work_exp_cutoff = df['WORK_EXP'].quantile(0.99)
work_exp_trimmed = df.loc[df['WORK_EXP'] < work_exp_cutoff, 'WORK_EXP']
ax_dist.set_title('Plot 2. Customer distrubution \nbased on WORK_EXP\n of customer at 0.99 percentile')
# NOTE(review): distplot is deprecated in recent seaborn releases; kept for
# compatibility with the seaborn version the notebook was written for.
sns.distplot(work_exp_trimmed, bins=20, color='green', ax=ax_dist)

# Plot 3: counts in 10 equal-width bins, shown in bin order rather than in
# the frequency order that value_counts() returns.
ax_bins.set_title('Plot 3. Freequency distrubution \nby WORK_EXP Loan paid\n in 10 percentile bins')
pd.cut(df['WORK_EXP'], bins=10).value_counts().sort_index().plot.bar(ax=ax_bins)
plt.show()

### Inference
    -- There is an outlier present that represents a practically impossible work experience, i.e. 1000 years
    -- It is better to replace this outlier with null or with the median
    -- The majority of applicants in the application data have 0-15 years of work experience

## Based on Income of customers

In [None]:
df.AMT_INCOME_TOTAL.describe()

In [None]:
# Quantile-based income bands: bottom 20 %, 20-50 %, 50-80 %, 80-90 %, top 10 %.
income_quantiles = [0, 0.2, 0.5, 0.8, 0.9, 1]
income_labels = ['Vey-Low', 'Low', 'Average', 'High', 'Very_high']
df['INCOME_GRP'] = pd.qcut(df['AMT_INCOME_TOTAL'], q=income_quantiles, labels=income_labels)

In [None]:
# Three income views side by side. The figure is created with three columns
# to match the three panels; the original requested ncols=2 and then drew on
# a 1x3 grid, leaving stray overlapping axes behind the plots.
fig, (ax_count, ax_box, ax_dist) = plt.subplots(ncols=3, figsize=(16, 5))

# Plot 1: number of customers per income band (`x=` keyword: seaborn >= 0.12
# rejects a positional data vector).
ax_count.set_title('Plot 1. Income distrubution in bins')
sns.countplot(x=df.INCOME_GRP, ax=ax_count)

# Plot 2: box plot of raw income, outliers included.
ax_box.set_title('Plot 2. Income distrubution')
sns.boxplot(x=df['AMT_INCOME_TOTAL'], ax=ax_box)

# Plot 3: density of incomes below the 99.5th percentile.
income_cutoff = df['AMT_INCOME_TOTAL'].quantile(0.995)
income_trimmed = df.loc[df['AMT_INCOME_TOTAL'] < income_cutoff, 'AMT_INCOME_TOTAL']
ax_dist.set_title('Plot 3. Customers distrubution based on salary')
# NOTE(review): distplot is deprecated in recent seaborn releases; kept for
# compatibility with the seaborn version the notebook was written for.
sns.distplot(income_trimmed, bins=20, color='green', ax=ax_dist)
plt.show()

#### Inference
    -- Customers with an average salary appear most often (Plot 1)
    -- There are outliers present in the salaries (Plot 2)
    -- From Plot 3 we can say that most of the distribution lies between 75000 and 200000 (values below the 0.995 quantile considered)

In [None]:
df.info()

## Data balancing

### Percentage of Defaulters and non-Defaulters

In [None]:
# Percentage of non-defaulters (TARGET == 0) and defaulters (TARGET == 1).
# The original computed this on its own line and discarded the result; it is
# now computed once and reused for the chart.
target_share = df['TARGET'].value_counts(normalize=True) * 100

# Derive each wedge label from its TARGET code rather than assuming that
# value_counts() returns the classes in a particular order.
target_names = {0: 'Non-Default', 1: 'Default'}
plt.pie(target_share, autopct='%1.1f%%',
        labels=[target_names[code] for code in target_share.index]);

#### Inference 
    1. Data is imbalanced between defaulters and non-defaulters
    2. Around 8 % of customers are observed to be defaulters

## Data Cleansing for data analysis
### Dropping the columns not required for analysis

In [None]:
df.columns

In [None]:
# Columns kept for the analysis; every other column is left out of df_master.
master_columns = [
    'SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
    'NAME_TYPE_SUITE', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'INCOME_GRP',
    'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_INCOME_TYPE',
    'AMT_INCOME_TOTAL', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE', 'REGION_POPULATION_RELATIVE', 'OCCUPATION_TYPE',
    'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'ORGANIZATION_TYPE',
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AGE_GRP', 'WORK_EXP',
]
df_master = df[master_columns]

In [None]:
df_master.shape

In [None]:
# Split the master frame by outcome: TARGET 1 -> df_def, TARGET 0 -> df_Ndef.
target_flag = df_master['TARGET']
df_def = df_master[target_flag == 1]
df_Ndef = df_master[target_flag == 0]

## Univariate Analysis

### By nature of loan 

In [None]:
def cnt_plots(var):
    """Draw side-by-side count plots of one column for the two TARGET groups.

    The left panel is drawn from ``df_def`` (rows with TARGET == 1) and the
    right panel from ``df_Ndef`` (rows with TARGET == 0).

    Parameters
    ----------
    var : str
        Name of a column present in both ``df_def`` and ``df_Ndef``.
    """
    fig, axes = plt.subplots(ncols=2, figsize=(12, 4.5))
    panels = (("Defalut", df_def), ("Non-Defalut", df_Ndef))
    for ax, (panel_title, frame) in zip(axes, panels):
        ax.set_title(panel_title)
        # The column is passed as `x=`: seaborn >= 0.12 makes every argument
        # after `data` keyword-only, so a positional column name fails there.
        sns.countplot(x=var, data=frame, ax=ax)
        ax.tick_params(axis='x', labelrotation=90)


cnt_plots('NAME_CONTRACT_TYPE')

#### Inference
    -- Revolving loans default less often than cash loans, as the graphs show.

### Based on gender

In [None]:
cnt_plots('CODE_GENDER')

#### Inference
    -- Female customers are more likely to be defaulters
    -- Female customers are more than male customers

### Based on loan accompanying partner

In [None]:
cnt_plots('NAME_TYPE_SUITE')

#### Inference
    -- Unaccompanied applicants are found likely to default, and the majority of loan applications are made unaccompanied.
    -- Applicants accompanied by children are also likely to be defaulters
    -- The "Group of people" category is insignificant

### Based on number of family members present

In [None]:
cnt_plots('CNT_FAM_MEMBERS')

#### Inference
    -- Customers with 3 or more members are likely to default

### Based on Marital/Family Status

In [None]:
cnt_plots('NAME_FAMILY_STATUS')

#### Inference
    -- Single/Not married are most appeared to be defaulters
    -- Applications with married status are appeared more

### Based on Education

In [None]:
cnt_plots('NAME_EDUCATION_TYPE')

#### Inference
    -- Secondary/Secondary Special educated customers are appearing to default
    -- Higher the education lower the rate of defaulters, since they will earn more and less chance for default

### By occupation type

In [None]:
cnt_plots('OCCUPATION_TYPE')

#### Inference
    -- Labours class, low-skilled labour staff & Drivers are more likely to be defaulters
    -- IT staff, HR Staff, Realty agents and secretaries are less likely to default
    -- Occupation by labourers & sales staff are more likely to apply for loan
    -- IT staff, HR Staff, Realty agents and secretaries are less likely to apply for loans

### By nature of income

In [None]:
cnt_plots('NAME_INCOME_TYPE')

#### Inference
    -- Working professionals appear most often among both loan applications and defaulters
    -- Applicants with an income type of maternity leave, businessman, student or unemployed appear least often among both loan applications and defaulters

### Based on Annual income of customer

In [None]:
# Income views for defaulters (df_def), each drawn on an explicit axis.
fig, (ax_count, ax_box, ax_dist) = plt.subplots(nrows=1, ncols=3, figsize=(16, 5))

# Plot 1: defaulters per income band (`x=` keyword: seaborn >= 0.12 rejects a
# positional data vector).
ax_count.set_title('Plot 1. Income distrubution in bins \n Default')
sns.countplot(x=df_def.INCOME_GRP, ax=ax_count)

# Plot 2: box plot of raw income, outliers included.
ax_box.set_title('Plot 2. Income distrubution \n Default')
sns.boxplot(x=df_def['AMT_INCOME_TOTAL'], ax=ax_box)

# Plot 3: density of incomes below the 99.5th percentile.
def_income_cutoff = df_def['AMT_INCOME_TOTAL'].quantile(0.995)
def_income_trimmed = df_def.loc[df_def['AMT_INCOME_TOTAL'] < def_income_cutoff, 'AMT_INCOME_TOTAL']
ax_dist.set_title('Plot 3. Customers distrubution based \n on salary Default')
# NOTE(review): distplot is deprecated in recent seaborn releases; kept for
# compatibility with the seaborn version the notebook was written for.
sns.distplot(def_income_trimmed, bins=20, color='green', ax=ax_dist);

In [None]:
# Income views for non-defaulters (df_Ndef), each drawn on an explicit axis.
fig, (ax_count, ax_box, ax_dist) = plt.subplots(nrows=1, ncols=3, figsize=(16, 5))

# Plot 4: non-defaulters per income band (`x=` keyword: seaborn >= 0.12
# rejects a positional data vector).
ax_count.set_title('Plot 4. Income distrubution in bins \n Non-Default')
sns.countplot(x=df_Ndef.INCOME_GRP, ax=ax_count)

# Plot 5: box plot of raw income, outliers included.
ax_box.set_title('Plot 5. Income distrubution \n Non-Default')
sns.boxplot(x=df_Ndef['AMT_INCOME_TOTAL'], ax=ax_box)

# Plot 6: density of incomes below the 99.5th percentile.
ndef_income_cutoff = df_Ndef['AMT_INCOME_TOTAL'].quantile(0.995)
ndef_income_trimmed = df_Ndef.loc[df_Ndef['AMT_INCOME_TOTAL'] < ndef_income_cutoff, 'AMT_INCOME_TOTAL']
ax_dist.set_title('Plot 6. Customers distrubution based on  \nsalary for Non-Default')
# NOTE(review): distplot is deprecated in recent seaborn releases; kept for
# compatibility with the seaborn version the notebook was written for.
sns.distplot(ndef_income_trimmed, bins=20, color='green', ax=ax_dist)
# As in the original, only the last panel's tick labels are rotated.
ax_dist.tick_params(axis='x', labelrotation=90);

#### Inference
    -- Defaulters' salaries are mostly distributed across 75000 - 200000
    -- Loan applicants' salaries are mostly distributed across 75000 - 225000
    -- Outliers are present - about 0.5 % by count

#### Based on type of Residence

In [None]:
cnt_plots('NAME_HOUSING_TYPE')

#### Inference
    -- customers staying with parents and rented apartments are more likely to be defaulters.
    -- Customers in rented apartments change their residence to default.

#### Customers who owns car

In [None]:
cnt_plots('FLAG_OWN_CAR')

#### Inference
    -- People who do not own a car are more likely to be defaulters
    -- Customers owning cars tend to be economically stable enough to afford luxuries

### Loan Amount - AMT_CREDIT

In [None]:
def dist_plots(var,x=0):
    """Plot the distribution of ``var`` for ``df_def`` next to ``df_Ndef``.

    Parameters
    ----------
    var : str
        Column name looked up in both ``df_def`` and ``df_Ndef``.
    x : int or float, default 0
        Rotation, in degrees, applied to the x tick labels of each panel.
    """
    sns.set_style('darkgrid')
    plt.subplots(ncols=2, figsize=(15, 5))
    # (panel title, source frame, histogram colour), left panel first.
    panels = (
        ("Defalut", df_def, 'orange'),
        ("Non-Defalut", df_Ndef, 'green'),
    )
    for position, (heading, frame, shade) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.title(heading)
        sns.distplot(frame[var], bins=10, color=shade)
        plt.xticks(rotation=x)


In [None]:
dist_plots('AMT_CREDIT')

#### Inference
    -- Loan amount upto 50000 - 75000 and 125000 -150000 are likely to be more defaulters

#### Loan repayment by customer

In [None]:
dist_plots('AMT_ANNUITY')

#### Inference
    -- People who are repaying loan 20000 to 40000 are more likely to be defaulters

### Bivariate Analysis

#### Income Type V/S Income

In [None]:
df_master.head()

#### Loan amount v/s Goods price

In [None]:
# Loan amount against goods price, one panel per TARGET group.
plt.subplots(ncols=2,figsize=(12,5))
plt.subplot(1,2,1)
plt.title('Default')
sns.scatterplot(y='AMT_GOODS_PRICE',x='AMT_CREDIT',data=df_def)
plt.xlabel('Loan Amount')
plt.ylabel('Goods price')

plt.subplot(1,2,2)
plt.title('Non-Default')
sns.scatterplot(y='AMT_GOODS_PRICE',x='AMT_CREDIT',data=df_Ndef)
plt.xlabel('Loan Amount')
# The output-suppressing semicolon must end a statement; on a line of its own
# (as in the original) it is a SyntaxError and the cell never runs.
plt.ylabel('Goods price');

#### Inference
    -- Most of the customers are availing loan greater than the goods price.

#### Total income v/s Loan amount across gender

In [None]:
# Income against loan amount, coloured by gender: full population in the top
# row, rows below the 0.995 income quantile in the bottom row.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

# Remove the extreme income tail so the bulk of the points becomes readable.
def_trimmed = df_def[df_def['AMT_INCOME_TOTAL'] < df_def['AMT_INCOME_TOTAL'].quantile(0.995)]
ndef_trimmed = df_Ndef[df_Ndef['AMT_INCOME_TOTAL'] < df_Ndef['AMT_INCOME_TOTAL'].quantile(0.995)]

# Titles of Plot 3 and Plot 4 now state the 0.995 cut-off actually applied
# above; the original titles said 0.99.
panels = (
    ('Plot 1. Population - Default', df_def),
    ('Plot 2. Population - Non-Default', df_Ndef),
    ('Plot 3. Default at 0.995 percentile', def_trimmed),
    ('Plot 4. Non-Default at 0.995 percentile', ndef_trimmed),
)
for ax, (panel_title, frame) in zip(axes.ravel(), panels):
    ax.set_title(panel_title)
    sns.scatterplot(y='AMT_INCOME_TOTAL', x='AMT_CREDIT', data=frame, hue='CODE_GENDER', ax=ax)
    ax.set_xlabel('Loan Amount')
    ax.set_ylabel('Total Income')

#### Inference
    -- Plots 1 & 2 for the entire population are tightly packed, making it difficult to draw inferences
    -- Plots 3 & 4 give much more insight: it is evident that customers having an income of 25000 to 275000 and a loan amount ranging from 12500 to 125000 appear to default (females are dominant)
    -- Customers having an income of 12500 to 450000 and an applied loan amount of 12500 to 200000 are more likely to appear.

## Multivariate Analysis

In [None]:
# Pairwise correlations between the numeric columns of the defaulter subset.
# Non-numeric columns are filtered out explicitly: pandas >= 2.0 raises when
# corr() is called on a frame that still contains string/categorical columns.
df_def.select_dtypes(include='number').corr()

In [None]:
plt.figure(figsize=(10,10))
# Correlate numeric columns only; pandas >= 2.0 raises if corr() meets
# string/categorical columns.
sns.heatmap(df_def.select_dtypes(include='number').corr(),annot=True,cmap='twilight_shifted');

In [None]:
# Keep only the strict upper triangle of the correlation matrix so each
# column pair appears once and the diagonal is excluded.
# - The matrix is computed once (the original computed it twice) and from
#   numeric columns only, as pandas >= 2.0 requires.
# - The mask is cast with the builtin `bool`; the `np.bool` alias was removed
#   in NumPy 1.24.
def_corr_matrix = df_def.select_dtypes(include='number').corr()
def_upper_mask = np.triu(np.ones(def_corr_matrix.shape), k=1).astype(bool)
corr_def = def_corr_matrix.where(def_upper_mask)
corr_def

In [None]:
# Flatten the masked matrix into one value per (column, column) pair.
corr_def = corr_def.unstack()
# Masked cells are NaN, so dropna() leaves exactly one entry per pair.
# The original used drop_duplicates(), which compares correlation *values*
# and would also discard distinct pairs that happen to share a value.
corr_def = corr_def.dropna().sort_values(ascending=False)

In [None]:
# corr_def is a Series indexed by column pair at this point, so assigning
# `.columns` on it has no effect. Turn the pair index into regular columns
# first; the three names then apply to a real three-column frame.
corr_def = corr_def.reset_index()
corr_def.columns=['var1','var2','correlation']

In [None]:
corr_def.head(10)

In [None]:
# Pairwise correlations between the numeric columns of the non-defaulter
# subset. Non-numeric columns are filtered out explicitly: pandas >= 2.0
# raises when corr() meets string/categorical columns.
df_Ndef.select_dtypes(include='number').corr()

In [None]:
plt.figure(figsize=(10,10))
# Correlate numeric columns only; pandas >= 2.0 raises if corr() meets
# string/categorical columns.
sns.heatmap(df_Ndef.select_dtypes(include='number').corr(),annot=True,cmap='twilight_shifted');

In [None]:
# Keep only the strict upper triangle of the correlation matrix so each
# column pair appears once and the diagonal is excluded.
# - The matrix is computed once and from numeric columns only, as
#   pandas >= 2.0 requires.
# - The mask is cast with the builtin `bool`; the `np.bool` alias was removed
#   in NumPy 1.24.
ndef_corr_matrix = df_Ndef.select_dtypes(include='number').corr()
ndef_upper_mask = np.triu(np.ones(ndef_corr_matrix.shape), k=1).astype(bool)
corr_Ndef = ndef_corr_matrix.where(ndef_upper_mask)
corr_Ndef

In [None]:
# Flatten the masked matrix into one value per (column, column) pair.
corr_Ndef = corr_Ndef.unstack()
# Masked cells are NaN, so dropna() leaves exactly one entry per pair.
# The original used drop_duplicates(), which compares correlation *values*
# and would also discard distinct pairs that happen to share a value.
corr_Ndef = corr_Ndef.dropna().sort_values(ascending=False)
corr_Ndef.head(10)

#### Inference
- From the heat map we can say that no field is correlated with the TARGET variable, so we need some other information to draw conclusions about the target variable.
- There is a strong correlation between (AMT_GOODS_PRICE and AMT_CREDIT), (CNT_FAM_MEMBERS and CNT_CHILDREN) and (AMT_GOODS_PRICE and AMT_ANNUITY), so only one variable from each pair needs to be used in a regression analysis.


## Previous data

In [None]:
# Previous-application records. Read from the same input directory that the
# first load uses for application_data.csv; the original bare file name only
# resolved if the CSV happened to sit in the working directory.
# NOTE(review): assumes previous_application.csv ships in the same dataset
# folder as application_data.csv - confirm.
df_p = pd.read_csv('../input/loan-defaulter/previous_application.csv')

In [None]:
df_p.head()

In [None]:
df_p.info()

In [None]:
null_val_data = df_p.isnull().sum()/len(df_p)*100
null_val_data

In [None]:
df_p.columns[(100*df_p.isna().sum()/len(df_p) > 30)]

In [None]:
# Remove every previous-application column with more than 30 % missing values.
prev_sparse_cols = df_p.columns[(100 * df_p.isna().sum() / len(df_p)) > 30]
df_p = df_p.drop(columns=prev_sparse_cols)

In [None]:
df_p.shape

In [None]:
# Left join on SK_ID_CURR: every df_master row is kept, and the
# previous-application columns are attached wherever the id matches.
df4 = df_master.merge(df_p, how='left', on='SK_ID_CURR')

In [None]:
df4.head()

### Null value percentage

In [None]:
df4.shape

## Status of loan distrubuted among customers

In [None]:
df4.NAME_CONTRACT_STATUS.value_counts().plot.barh()

In [None]:
# One subset of the merged frame per previous-application contract status.
contract_status = df4['NAME_CONTRACT_STATUS']
A = df4[contract_status == 'Approved']
R = df4[contract_status == 'Refused']
C = df4[contract_status == 'Canceled']
U = df4[contract_status == 'Unused offer']

In [None]:
# Count of rows per TARGET value, one panel per contract-status subset.
plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
status_panels = (
    ('APPROVED CUSTOMERS', A),
    ('REFUSED CUSTOMERS', R),
    ('LOAN CANCELLED CUSTOMERS', C),
    ('UNUSED LOAN BY CUSTOMERS', U),
)
for slot, (heading, subset) in enumerate(status_panels, start=1):
    plt.subplot(2, 2, slot)
    plt.title(heading)
    sns.countplot(data=subset, x='NAME_CONTRACT_STATUS', hue='TARGET')

In [None]:
# Share of TARGET 0 / TARGET 1 rows within each contract-status subset.
plt.subplots(2, 2, figsize=(8, 8))
target_names = {0: 'Non-Default', 1: 'Default'}
status_panels = (
    ('Loan Approved', A),
    ('Loan Refused', R),
    ('Loan Cancelled', C),
    ('Loan Unused', U),
)
for slot, (heading, subset) in enumerate(status_panels, start=1):
    plt.subplot(2, 2, slot)
    plt.title(heading)
    target_share = subset['TARGET'].value_counts(normalize=True) * 100
    # Each wedge is labelled from its own TARGET code. The original passed a
    # fixed two-item list, which mislabels the wedges if the class order
    # differs and fails if a subset contains only one class.
    plt.pie(target_share, autopct='%1.1f%%',
            labels=[target_names[code] for code in target_share.index])

#### Inference
    -- Customers whose previous loans were refused or cancelled appear more often as defaulters
    -- Customers whose previous loans were approved appear less often as defaulters
    

In [None]:
df4.describe()

## Bivariate Analysis

### Loan amount applied v/s Total Income

In [None]:
sns.scatterplot(x='AMT_APPLICATION',y='AMT_INCOME_TOTAL',data=A,hue='TARGET')

#### Inference
    -- No significant Inference drawn from above plot

In [None]:
# Pairwise correlations between the numeric columns of the merged frame.
# Non-numeric columns are filtered out explicitly: pandas >= 2.0 raises when
# corr() meets string/categorical columns.
df4.select_dtypes(include='number').corr()

In [None]:
# Keep only the strict upper triangle of the merged frame's correlation
# matrix so each column pair appears once and the diagonal is excluded.
# - The matrix is computed once and from numeric columns only, as
#   pandas >= 2.0 requires.
# - The mask is cast with the builtin `bool`; the `np.bool` alias was removed
#   in NumPy 1.24.
merged_corr_matrix = df4.select_dtypes(include='number').corr()
merged_upper_mask = np.triu(np.ones(merged_corr_matrix.shape), k=1).astype(bool)
corr_final = merged_corr_matrix.where(merged_upper_mask)
corr_final

In [None]:
# Flatten the masked matrix into one value per (column, column) pair.
corr_final = corr_final.unstack()
# Masked cells are NaN, so dropna() leaves exactly one entry per pair.
# The original used drop_duplicates(), which compares correlation *values*
# and would also discard distinct pairs that happen to share a value.
corr_final = corr_final.dropna().sort_values(ascending=False)
corr_final.head(13)

### Each of the 13 column pairs listed above is very highly correlated, so one column from each pair can be removed.