In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# Show up to 40 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 40)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the application table and keep only the rows whose TARGET is present.
train_data = (
    pd.read_csv("../input/home-credit-default-risk/application_train.csv")
    .dropna(subset=['TARGET'])
)
# Preview the first rows of the filtered table.
train_data.head()

In [None]:
# Table dimensions as a (rows, columns) tuple.
row_count, column_count = train_data.shape
(row_count, column_count)

In [None]:
# Bar chart of how many rows fall into each TARGET value.
sns.countplot(data=train_data, x='TARGET')

In [None]:
# Relative frequency of each TARGET value (fractions sum to 1).
target_labels = train_data['TARGET']
target_labels.value_counts(normalize=True)

In [None]:
# Fraction of missing values per column, showing the 60 worst columns first.
null_counts = train_data.isnull().sum()
null_share = null_counts / len(train_data)
null_share.sort_values(ascending=False).head(60)

In [None]:
# Columns listed here are the ones identified as having more than 30% nulls.
# They are dropped rather than imputed: treating them would take a lot of
# time and they would still have low explanatory power.
high_null_columns = [
    'COMMONAREA_MEDI', 'COMMONAREA_AVG', 'COMMONAREA_MODE',
    'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAPARTMENTS_MEDI',
    'FONDKAPREMONT_MODE',
    'LIVINGAPARTMENTS_MODE', 'LIVINGAPARTMENTS_AVG', 'LIVINGAPARTMENTS_MEDI',
    'FLOORSMIN_AVG', 'FLOORSMIN_MODE', 'FLOORSMIN_MEDI',
    'YEARS_BUILD_MEDI', 'YEARS_BUILD_MODE', 'YEARS_BUILD_AVG',
    'OWN_CAR_AGE',
    'LANDAREA_MEDI', 'LANDAREA_MODE', 'LANDAREA_AVG',
    'BASEMENTAREA_MEDI', 'BASEMENTAREA_AVG', 'BASEMENTAREA_MODE',
    'EXT_SOURCE_1',
    'NONLIVINGAREA_MODE', 'NONLIVINGAREA_AVG', 'NONLIVINGAREA_MEDI',
    'ELEVATORS_MEDI', 'ELEVATORS_AVG', 'ELEVATORS_MODE',
    'WALLSMATERIAL_MODE',
    'APARTMENTS_MEDI', 'APARTMENTS_AVG', 'APARTMENTS_MODE',
    'ENTRANCES_MEDI', 'ENTRANCES_AVG', 'ENTRANCES_MODE',
    'LIVINGAREA_AVG', 'LIVINGAREA_MODE', 'LIVINGAREA_MEDI',
    'HOUSETYPE_MODE',
    'FLOORSMAX_MODE', 'FLOORSMAX_MEDI', 'FLOORSMAX_AVG',
    'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BEGINEXPLUATATION_MEDI',
    'YEARS_BEGINEXPLUATATION_AVG',
    'TOTALAREA_MODE', 'EMERGENCYSTATE_MODE', 'OCCUPATION_TYPE',
]
train_data = train_data.drop(high_null_columns, axis='columns')

In [None]:
# Re-check the per-column missing fraction after the drop (top 20 columns).
missing_fraction = train_data.isna().sum().div(len(train_data))
missing_fraction.sort_values(ascending=False).head(20)

In [None]:
# (rows, columns) remaining after the high-null columns were removed.
len(train_data.index), len(train_data.columns)

In [None]:
#Checking Correlation with TARGET variable
# Restrict to numeric/bool columns before calling corr(): the table is
# expected to still hold string-typed columns at this point, and recent
# pandas versions no longer skip those silently (corr() raises instead).
numeric_features = train_data.select_dtypes(include=['number', 'bool'])
# Absolute correlation of each numeric column with TARGET, weakest first.
numeric_features.corr()['TARGET'].abs().sort_values().head(50)

In [None]:
# Columns listed here are the ones identified as having less than 2%
# (absolute) correlation with the TARGET variable; they are removed.
low_correlation_columns = [
    'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_5', 'FLAG_CONT_MOBILE', 'FLAG_MOBIL',
    'FLAG_DOCUMENT_12', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_HOUR',
    'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_7',
    'FLAG_EMAIL', 'AMT_REQ_CREDIT_BUREAU_QRT', 'SK_ID_CURR', 'FLAG_DOCUMENT_4',
    'AMT_REQ_CREDIT_BUREAU_DAY', 'LIVE_REGION_NOT_WORK_REGION', 'FLAG_DOCUMENT_17',
    'FLAG_DOCUMENT_21', 'AMT_INCOME_TOTAL', 'FLAG_DOCUMENT_11',
    'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_2', 'REG_REGION_NOT_LIVE_REGION',
    'FLAG_DOCUMENT_15', 'REG_REGION_NOT_WORK_REGION',
    'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_8', 'OBS_60_CNT_SOCIAL_CIRCLE',
    'OBS_30_CNT_SOCIAL_CIRCLE', 'CNT_FAM_MEMBERS',
    'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_16',
    'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_ANNUITY', 'CNT_CHILDREN',
    'AMT_REQ_CREDIT_BUREAU_YEAR',
]
train_data = train_data.drop(low_correlation_columns, axis='columns')
# Show the remaining (rows, columns).
train_data.shape

In [None]:
# Restrict to numeric/bool columns before calling corr(): recent pandas
# versions raise on string-typed columns instead of skipping them silently.
numeric_features = train_data.select_dtypes(include=['number', 'bool'])
# Absolute correlation of each remaining numeric column with TARGET,
# strongest first.
numeric_features.corr()['TARGET'].abs().sort_values(ascending=False).head(50)

In [None]:
# Restrict to numeric/bool columns before calling corr(): recent pandas
# versions raise on string-typed columns instead of skipping them silently.
numeric_features = train_data.select_dtypes(include=['number', 'bool'])
# Boolean matrix: True where two columns have |correlation| above 0.8.
high_corr_mask = numeric_features.corr().abs() > 0.8
sns.heatmap(high_corr_mask)

In [None]:
# Restrict to numeric/bool columns before calling corr(): recent pandas
# versions raise on string-typed columns instead of skipping them silently.
numeric_features = train_data.select_dtypes(include=['number', 'bool'])
# For each column, flag whether its |correlation| with REGION_RATING_CLIENT
# exceeds 0.8.
numeric_features.corr().abs()['REGION_RATING_CLIENT'] > 0.8

Multicollinearity exists between the following pairs of variables:
<ol>
    <li>AMT_CREDIT - AMT_GOODS_PRICE</li>
    <li>DEF_30_CNT_SOCIAL_CIRCLE - DEF_60_CNT_SOCIAL_CIRCLE</li>
    <li>DAYS_EMPLOYED - FLAG_EMP_PHONE</li>
    <li>REG_CITY_NOT_WORK_CITY - LIVE_CITY_NOT_WORK_CITY</li>
    <li>REGION_RATING_CLIENT_W_CITY - REGION_RATING_CLIENT</li>
</ol>

<h2>Comments</h2>
<ol>
    <li>It makes sense that AMT_CREDIT (the credit amount of the loan) and AMT_GOODS_PRICE (for consumer loans, the price of the goods for which the loan is given) are correlated. <br>
        As the credit amount is applicable to all loans, we will <b>drop the AMT_GOODS_PRICE</b> column.</li>
    <li>DEF_30_CNT_SOCIAL_CIRCLE (how many observations of the client's social surroundings defaulted on 30 days past due) and
        DEF_60_CNT_SOCIAL_CIRCLE (how many observations of the client's social surroundings defaulted on 60 days past due) are also highly correlated, as they are the same metric with a change in the temporal scope.
        <br> We will <b>drop DEF_60_CNT_SOCIAL_CIRCLE</b>, as the sensitivity of this variable and its correlation with TARGET are lower.</li>
</ol>

### Exploring DAYS_EMPLOYED AND FLAG_EMP_PHONE

In [None]:
# Side-by-side histograms of DAYS_EMPLOYED: left panel for rows where
# FLAG_EMP_PHONE is 1, right panel for rows where it is 0.
figure, ax = plt.subplots(1, 2, figsize=(10, 5))
for panel, flag_value in zip(ax, (1, 0)):
    # Select only the plotted column instead of copying the whole frame.
    days_employed = train_data.loc[train_data['FLAG_EMP_PHONE'] == flag_value, 'DAYS_EMPLOYED']
    sns.histplot(days_employed, ax=panel)
    # Both panels share the same x-label, so the title is the only thing
    # that tells the reader which subset a panel shows.
    panel.set_title(f'FLAG_EMP_PHONE = {flag_value}')

# Remove the loop temporaries so they do not linger in the kernel namespace.
del days_employed, panel, flag_value

## Comments

We can see that for FLAG_EMP_PHONE = 0 we have a very high positive value of 365,243 for DAYS_EMPLOYED, whereas the rows with FLAG_EMP_PHONE = 1 have a good spread of values ranging from -17,500 to 0. <br>
According to the discussion boards, the value 365,243 is used when the information is not provided, i.e. the applicant wasn't employed. <br>
<br>
We will <b>drop FLAG_EMP_PHONE</b>, as this flag represents whether the applicant had an employer or not. The same information is still available in the DAYS_EMPLOYED variable.