In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
# Use the documented top-level pd.set_option; `pd.pandas` is not a documented
# attribute of the package and should not be relied on.
pd.set_option('display.max_columns', None)

In [None]:
# Load the main application table from the Kaggle input mount of the competition data.
application_train = pd.read_csv('/kaggle/input/home-credit-default-risk/application_train.csv')
# Trailing expression: the notebook displays the (rows, columns) tuple.
application_train.shape

In [None]:
application_train.head(10)

In [None]:
application_train.describe()

In [None]:
application_train.columns.values

In [None]:
application_train.isna().sum().sort_values(ascending=False)

In [None]:
application_train.duplicated().sum()

In [None]:
def missing_data(data):
    """Summarise missing values per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` (indexed by column name) with
        ``Total`` (number of null cells) and ``Percent`` (share of rows that
        are null, on a 0-100 scale). Rows are ordered by ``Total`` descending;
        columns with the same count keep their original relative order.
        ``Percent`` is NaN when ``data`` has no rows.
    """
    # Build the null mask once and derive both columns from it; the original
    # evaluated data.isnull() three times over the whole frame.
    total = data.isnull().sum()
    percent = total / len(data) * 100
    summary = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    # One stable sort on the assembled table, instead of sorting the two
    # series independently and relying on concat to re-align them.
    return summary.sort_values('Total', ascending=False, kind='mergesort')

In [None]:
missing_data(application_train).head(20)


## Check data unbalance

TARGET value 0 means the loan was repaid; value 1 means the loan was not repaid.


In [None]:
application_train["TARGET"].value_counts()

In [None]:
# Distribution of Target
# Take the class labels from the value_counts index instead of hard-coding [0, 1]:
# value_counts orders its result by frequency, so fixed x labels would be attached
# to the wrong bars whenever class 1 is the more common one.
target_share = application_train['TARGET'].value_counts(normalize=True).sort_index()
ax = sns.barplot(x=target_share.index, y=target_share.values)
ax.set_title('Distribution of Target')
ax.set_xlabel('TARGET')
ax.set_ylabel('Share of applications');

Roughly nine out of ten loans have TARGET = 0, so the two classes are strongly imbalanced.

# EDA

### **i. categorical feature**

In [None]:
def categorical_plot(data, feature_name,label_rotation=False,horizontal_layout=True):
    '''
    Plot a categorical feature against the TARGET column on one figure.

    Four panels are drawn:
    i) number of rows per category
    ii) number of rows per category, split by TARGET
    iii) share of rows with TARGET == 1 (defaulters) within each category
    iv) share of rows with TARGET == 0 (non defaulters) within each category

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``feature_name`` and a ``TARGET`` column.
    feature_name : str
        Name of the column to analyse.
    label_rotation : bool, default False
        Rotate the x tick labels by 90 degrees on every panel.
    horizontal_layout : bool, default True
        True draws a 2x2 grid (16x8); False stacks the four panels
        vertically (10x12).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    '''
    # Per-category share of each class. A grouped mean of the boolean mask gives
    # 0.0 for a category that has no rows in that class; dividing two
    # value_counts() results gave NaN there because the category was missing
    # from the numerator. Rows with a null feature value are left out, and the
    # categories come back sorted by key.
    category = data[feature_name]
    default_share = data['TARGET'].eq(1).groupby(category).mean()
    repaid_share = data['TARGET'].eq(0).groupby(category).mean()

    if horizontal_layout:
        fig, axes = plt.subplots(2, 2, figsize=(16,8))
    else:
        fig, axes = plt.subplots(4, 1, figsize=(10,12))
    # Flatten (row-major) so both layouts share one set of plotting calls.
    panels = axes.ravel()

    if label_rotation:
        for ax in panels:
            ax.tick_params(axis='x',labelrotation=90)

    sns.countplot(ax=panels[0], data=data, x=feature_name).set_title(f"Distribution of {feature_name}")
    sns.countplot(ax=panels[1], data=data, x=feature_name, hue='TARGET').set_title(f"Distribution of {feature_name} by default/non-default")
    sns.barplot(ax=panels[2], x=default_share.index, y=default_share.values).set_title(f"Defaulter by {feature_name}")
    sns.barplot(ax=panels[3], x=repaid_share.index, y=repaid_share.values).set_title(f"Non Defaulter by {feature_name}")

    fig.tight_layout()
    plt.show()

**Distribution according to NAME_CONTRACT_TYPE**

In [None]:
application_train.NAME_CONTRACT_TYPE.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'NAME_CONTRACT_TYPE')

- Contract type Revolving loans are just a small fraction from the total number of loans
- Relatively there is more chance that customer with revolving loan tends to default more

**Distribution according to CODE_GENDER**

In [None]:
application_train.CODE_GENDER.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'CODE_GENDER')

- The number of female clients is almost double the number of male clients. 
- Looking to the percent, males have a higher chance of defaulting.

**Distribution according to FLAG_OWN_CAR**

In [None]:
application_train.FLAG_OWN_CAR.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'FLAG_OWN_CAR')

- Client owning car is almost half that of no owning car
- from percentage client owing car is more likely to default 

**Distribution according to FLAG_OWN_REALTY**

In [None]:
application_train.FLAG_OWN_REALTY.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'FLAG_OWN_REALTY')

- Clients owning realty are more than twice as many as those not owning realty
- In percentage terms, clients not owning realty are more likely to default



**Distribution according to NAME_TYPE_SUITE**

In [None]:
application_train.NAME_TYPE_SUITE.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'NAME_TYPE_SUITE')

- Most clients applying for a loan are unaccompanied, followed by family and spouse/partner
- There is more chance of default if client is not unaccompanied



**Distribution according to NAME_FAMILY_STATUS**

In [None]:
application_train.NAME_FAMILY_STATUS.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'NAME_FAMILY_STATUS',label_rotation = True)

- Most clients applying for loan are married
- All client except married have higher chances of defaulting excluding unknown

**Distribution according to CNT_FAM_MEMBERS**

In [None]:
application_train.CNT_FAM_MEMBERS.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'CNT_FAM_MEMBERS')

- Most clients have between 1 and 5 family members; a family size of 2 is the most common
- Relative to clients with 2 family members, all other family sizes show a higher chance of default


**Distribution according to CNT_CHILDREN**

In [None]:
application_train.CNT_CHILDREN.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'CNT_CHILDREN')

- Most clients applying for a loan don't have children
- Clients with 1-6 children are more likely to default than clients without children

**Distribution according to OCCUPATION_TYPE**

In [None]:
application_train.OCCUPATION_TYPE.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'OCCUPATION_TYPE',label_rotation=True,horizontal_layout=False)



- Most of the loans are taken by Laborers, followed by Sales staff. IT staff take the lowest amount of loans.

- The category with highest percent of default are Low-skill Laborers, followed by Drivers and Waiters/barmen staff, Security staff, Laborers and Cooking staff.


**Distribution according to ORGANIZATION_TYPE**

In [None]:
application_train.ORGANIZATION_TYPE.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'ORGANIZATION_TYPE',label_rotation = True, horizontal_layout = False)

- Business Entity Type 3,XNA and self employed tends to take loan more.
- Organizations with highest percent of defaults are Transport: type 3, Industry: type 13, Industry: type 8 and Restaurant but those have minorities where as organization which tends to take more loans also have high probabilities for defaulting


**Distribution according to NAME_EDUCATION_TYPE**

In [None]:
application_train.NAME_EDUCATION_TYPE.value_counts()

In [None]:

categorical_plot(application_train,feature_name = 'NAME_EDUCATION_TYPE',label_rotation = True)

- Majority of the clients have Secondary / secondary special education, followed by clients with Higher education. Only a very small number having an academic degree.

- The Lower secondary category, although rare, have the largest rate of not returning the loan.

**Distribution according to NAME_HOUSING_TYPE**

In [None]:
application_train.NAME_HOUSING_TYPE.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'NAME_HOUSING_TYPE',label_rotation = True)

- Most clients live in a House / apartment
- From these categories, Rented apartment and With parents have higher default rate.


**Distribution according to NAME_INCOME_TYPE**

In [None]:
application_train.NAME_INCOME_TYPE.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'NAME_INCOME_TYPE',label_rotation = True)

- Most of applicants for loans are income from Working, followed by Commercial associate, Pensioner and State servant.

- The applicants with income type Working have the highest chance of defaulting.


**Distribution according to WEEKDAY_APPR_PROCESS_START**

In [None]:
application_train.WEEKDAY_APPR_PROCESS_START.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'WEEKDAY_APPR_PROCESS_START')

- We don't see any influence of the application weekday on default

**Distribution according to REG_REGION_NOT_LIVE_REGION and REG_REGION_NOT_WORK_REGION**

In [None]:
application_train.REG_REGION_NOT_LIVE_REGION.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'REG_REGION_NOT_LIVE_REGION')

In [None]:
application_train.REG_REGION_NOT_WORK_REGION.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'REG_REGION_NOT_WORK_REGION')

- Very few people are registered in not live or not work region.
- Generally, the rate of not return is slightly larger for these cases than in the rest

**Distribution according to REG_CITY_NOT_LIVE_CITY and REG_CITY_NOT_WORK_CITY**

In [None]:
application_train.REG_CITY_NOT_LIVE_CITY.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'REG_CITY_NOT_LIVE_CITY')

In [None]:
application_train.REG_CITY_NOT_WORK_CITY.value_counts()

In [None]:
categorical_plot(application_train,feature_name = 'REG_CITY_NOT_WORK_CITY')

- Generally, much more people register in the city they live or work
- The ones that register in different city than the working or living city are more frequently not-repaying the loans than the ones that register same city.


### **ii. numerical plot**

In [None]:
def numerical_plot(data, col, size=[14, 10]):
    '''Overlay the kernel density estimate of ``col`` for each TARGET class.

    Rows with TARGET == 0 are drawn in blue (legend label '0') and rows with
    TARGET == 1 in red (legend label '1'), on a new figure whose ``figsize``
    is ``size``.
    '''
    plt.figure(figsize=size)
    # (target value, line colour, legend label), drawn in this order
    for target_value, colour, legend_label in ((0, 'b', '0'), (1, 'r', '1')):
        class_rows = data['TARGET'] == target_value
        sns.kdeplot(data.loc[class_rows, col], color=colour, label=legend_label)
    plt.xlabel(col)
    plt.ylabel('Probability Density')
    plt.legend()

**Distribution according to AMT_CREDIT**

In [None]:
application_train.AMT_CREDIT.describe()

In [None]:
numerical_plot(application_train, 'AMT_CREDIT')

- The distribution looks right skewed
- Clients with lower credit amounts default more

**Distribution according to AMT_ANNUITY**

In [None]:
numerical_plot(application_train, 'AMT_ANNUITY')

- It also looks right skewed
- Lower annuity have higher chance of default

**Distribution according to DAYS_EMPLOYED**

In [None]:
numerical_plot(application_train, 'DAYS_EMPLOYED')

- We can see an anomaly on the right side, which has to be handled

**Distribution according to AMT_INCOME_TOTAL**

In [None]:
numerical_plot(application_train, 'AMT_INCOME_TOTAL')

- It is right skewed: the mass sits at the low end, with a long tail toward high incomes

**Distribution according to AMT_GOODS_PRICE**

In [None]:
numerical_plot(application_train, 'AMT_GOODS_PRICE')

- It is right skewed, with most clients having a goods price between 0 and 2,000,000


**Distribution according to DAYS_BIRTH**

In [None]:
numerical_plot(application_train, 'DAYS_BIRTH')

In [None]:

# Per-class density of age in years (DAYS_BIRTH divided by -365).
# The x axis is labelled explicitly: without it seaborn uses the series name
# "DAYS_BIRTH", although the plotted values are years.
age_years = application_train['DAYS_BIRTH'] / -365
sns.kdeplot(age_years[application_train['TARGET'] == 0], color='b', label='0')
sns.kdeplot(age_years[application_train['TARGET'] == 1], color='r', label='1')
plt.xlabel('Age (years)')
plt.ylabel('Probability Density')
plt.legend();


- The age range is approximately 20 to 70 years.
- Most defaults occur around 30 years of age

**Distribution according to DAYS_REGISTRATION**

In [None]:
numerical_plot(application_train, 'DAYS_REGISTRATION')

**Distribution according to DAYS_ID_PUBLISH**

In [None]:
numerical_plot(application_train, 'DAYS_ID_PUBLISH')

**Distribution according to EXT_SOURCE_1**

In [None]:
numerical_plot(application_train, 'EXT_SOURCE_1')

- It is approximately normally distributed for non-defaulters, whereas for defaulters the mass is shifted toward lower values, which suggests that a lower EXT_SOURCE_1 goes with a higher probability of defaulting

**Distribution according to EXT_SOURCE_2**

In [None]:
numerical_plot(application_train, 'EXT_SOURCE_2')

- The probability of defaulting is higher for EXT_SOURCE_2 values ranging from 0.4 to 0.8

**Distribution according to EXT_SOURCE_3**

In [None]:
numerical_plot(application_train, 'EXT_SOURCE_3')

- The defaulter distribution for ext_source_3 is normally distributed


**Distribution according to DAYS_LAST_PHONE_CHANGE**

In [None]:
numerical_plot(application_train, 'DAYS_LAST_PHONE_CHANGE')

- The distribution looks right skewed 


**Distribution according to HOUR_APPR_PROCESS_START**

In [None]:
numerical_plot(application_train, 'HOUR_APPR_PROCESS_START')

- the distribution is centered between 10 - 15 with complex curve

**Box plot for AMT_CREDIT and NAME_EDUCATION_TYPE**

In [None]:
sns.catplot(y="AMT_CREDIT", x="NAME_EDUCATION_TYPE", kind="box", data=application_train[application_train.TARGET == 1], height=6,aspect=2)

- For every education type, the credit amount of most defaulters is centered around 500000, except for Academic degree, where it is slightly higher

**Box plot for AMT_CREDIT and NAME_INCOME_TYPE**

In [None]:
sns.catplot(y="AMT_CREDIT", x="NAME_INCOME_TYPE", kind="box", data=application_train[application_train.TARGET == 1], height=6,aspect=2)

- For every income type, the credit amount of most defaulters is centered around 500000, except for Maternity leave, where it is slightly higher

In [None]:
sns.catplot(y="AMT_CREDIT", x="CODE_GENDER", kind="box", data=application_train[application_train.TARGET == 1], height=6,aspect=2)

- For both male and female defaulters, the credit amount is centered around 500000

In [None]:
sns.catplot(y="AMT_CREDIT", x="NAME_CONTRACT_TYPE", kind="box", data=application_train[application_train.TARGET == 1], height=6,aspect=2)

- defaulters of cash loan has higher amt credit than revolving loans

In [None]:
sns.catplot(y="AMT_CREDIT", x="FLAG_OWN_CAR", kind="box", data=application_train[application_train.TARGET == 1], height=6,aspect=2)

- For defaulters both with and without a car, the credit amount is centered around 500000

In [None]:
sns.catplot(y="AMT_CREDIT", x="FLAG_OWN_REALTY", kind="box", data=application_train[application_train.TARGET == 1], height=6,aspect=2)

- For defaulters both with and without realty, the credit amount is centered around 500000