In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
from datetime import timedelta

# Read the application table that every later cell works on.
train = pd.read_csv(filepath_or_buffer=r"../input/application_train.csv")



# Any results you write to the current directory are saved as output.

# Initial EDA of the Home Credit Default Challenge

This is just a quick dive into the main dataset to investigate some of the data points. The dataset is pretty diverse but most columns can already be integrated quite well into an ML algorithm. This EDA is helpful though for spotting categorical data as well as getting a better overall understanding.

This is going to be an ongoing notebook so **please** comment, question and critique this. I am **very** appreciative of any input. 

First we clean up the data a little so that it can be visualized. 

In [None]:
# Data Engineering
# Derive plot-friendly columns from the DAYS_* fields (sign-flipped or absolute).

train["AGE_IN_YEARS"] = train["DAYS_BIRTH"].mul(-1.0).div(365)
train["AGE_IN_DAYS"] = abs(train["DAYS_BIRTH"])
# quantile(1) is the column maximum, so every row equal to that maximum fails
# the strict "<" comparison and becomes NaN in the cleaned column.
# NOTE(review): presumably this is meant to blank out a placeholder value that
# is stored as the maximum -- confirm against the data description.
train["DAYS_EMPLOYED_CLEAN"] = train["DAYS_EMPLOYED"].abs().where(
    train["DAYS_EMPLOYED"] < train["DAYS_EMPLOYED"].quantile(1)
)
train["DAYS_REGISTRATION_CLEAN"] = abs(train["DAYS_REGISTRATION"])
train["DAYS_ID_PUBLISH_CLEAN"] = abs(train["DAYS_ID_PUBLISH"])

## Basic Factors

First up, we are going to look at some of the most obvious measures: how many problematic creditors there actually are, the gender ratio, what kind of loans are taken out, whether the clients possess assets and how many kids they have.

In [None]:
def plot_group_sizes(column, title):
    """Draw a bar chart of how many rows of ``train`` fall into each value of ``column``.

    Parameters
    ----------
    column : str
        Name of the ``train`` column to group by.
    title : str
        Title shown above the chart.
    """
    # Group once and reuse the result for both the bar positions and the heights.
    sizes = train.groupby(column).size()
    plt.bar(x=sizes.index, height=sizes)
    plt.title(title)
    plt.show()


plt.hist(train["TARGET"])
plt.title("Target")
plt.show()

plot_group_sizes("NAME_CONTRACT_TYPE", "Contract Type")
plot_group_sizes("CODE_GENDER", "Gender")
plot_group_sizes("FLAG_OWN_CAR", "Own Car")
plot_group_sizes("FLAG_OWN_REALTY", "Own Real Estate")
plot_group_sizes("CNT_CHILDREN", "Number of Children")

## Credit-Related Distributions 

Now we are going to look at the size of the credits as well as the income size of the creditors (with outliers removed). Both distributions are positively skewed which is what you would expect if the loans correspond to the creditor's income.

In [None]:
# Income histogram restricted to rows strictly below the column's 95th percentile.
plt.figure(figsize=(15, 10))
sns.distplot(
    train["AMT_INCOME_TOTAL"].loc[lambda amounts: amounts < amounts.quantile(.95)],
    kde=False,
).set_title("Income in Total (95% of Data)")
plt.show()

# Credit histogram restricted to rows strictly below the column's 99th percentile.
plt.figure(figsize=(15, 10))
sns.distplot(
    train["AMT_CREDIT"].loc[lambda amounts: amounts < amounts.quantile(.99)],
    kde=False,
).set_title("Credit Amount (99% of Data)")
plt.show()

With the loan annuity and the price of the credit-intended goods we have similar distributions. As one expects, they correspond with one another. I would guess that problematic creditors are found at the intersections where the income or the price of the credit-intended goods does not align with the credit amount or the credit annuity.

In [None]:
# Missing amounts are dropped for plotting only. ``train`` itself is left
# untouched, so no artificial zero amounts are written into the dataset or
# show up as a spike at 0 in the histograms.
plt.figure(figsize=(15,10))
sns.distplot(train["AMT_ANNUITY"].dropna(),kde=False).set_title("Loan Annuity (without outliers)")
# The x-axis is cropped to hide the long right tail; no rows are removed.
plt.xlim(0, 100000)
plt.show()

plt.figure(figsize=(15,10))
sns.distplot(train["AMT_GOODS_PRICE"].dropna(),kde=False).set_title("Price of the credit-intended goods")
plt.xlim(0, 2500000)
plt.show()

## Categorical Values

Looking at the categorical values, we can see that some values are more dominant in certain categories than others. This could potentially make it a lot easier for the ML algorithms if these are related to problematic creditors. 

In [None]:
def type_plot(cat,title):
    """Bar chart of the non-null row count of ``train`` per value of ``cat``, largest first.

    Parameters
    ----------
    cat : str
        Name of the ``train`` column to count.
    title : str
        Title shown above the chart.
    """
    counts = train.groupby(cat)[cat].count().sort_values(ascending=False)
    plt.figure(figsize=(15,10))
    plt.bar(x=counts.index, height=counts.tolist())
    plt.title(title)
    plt.show()

type_plot(cat="NAME_TYPE_SUITE", title="Client Accompaniment")
type_plot(cat="NAME_INCOME_TYPE", title="Income Type")
type_plot(cat="NAME_EDUCATION_TYPE", title="Education Type")
type_plot(cat="NAME_FAMILY_STATUS", title="Family Status")
type_plot(cat="NAME_HOUSING_TYPE", title="Housing Type")


This is a bit more interesting. It is not categorical per se, but it could potentially turn into one, since it is not too varied.

In [None]:
# Count histogram (no KDE curve) of REGION_POPULATION_RELATIVE.
sns.distplot(
    a=train["REGION_POPULATION_RELATIVE"],
    kde=False,
).set_title("Home Population Density")
plt.show()

## Binary Categorical Values

These values are all binary categories. The distributions between the two targets do not vary much. 

In [None]:
def target_comp(cat,title):
    """Show ``cat`` split by ``TARGET`` twice: as plain histograms, then as seaborn distplots.

    Each figure has one panel per ``TARGET`` value, 50 bins per panel, and
    ``title`` as the figure title.

    Parameters
    ----------
    cat : str
        Name of the ``train`` column to plot.
    title : str
        Figure title used for both figures.
    """
    # NOTE(review): newer seaborn releases renamed FacetGrid's ``size`` argument
    # to ``height`` -- confirm which seaborn version this notebook targets.
    for draw in (plt.hist, sns.distplot):
        grid = sns.FacetGrid(train, col="TARGET", size=5)
        grid.map(draw, cat, bins=50)
        # Blank the x label on both panels; the figure title names the variable.
        for panel in (0, 1):
            grid.axes[0, panel].set_xlabel('')
        # Leave head-room above the panels for the figure title.
        grid.fig.subplots_adjust(top=0.9)
        grid.fig.suptitle(title, fontsize=16)
        plt.show()
    
# One comparison per binary flag; each column is plotted exactly once.
# NOTE(review): the three *_NOT_* column names suggest the flag marks a
# mismatch, while the titles say "matches" -- confirm the intended wording.
target_comp("FLAG_MOBIL","Cell Phone")
target_comp("FLAG_EMP_PHONE","Work Phone")
target_comp("FLAG_CONT_MOBILE","Reachable by Phone")
target_comp("FLAG_PHONE","Home Phone")
target_comp("FLAG_EMAIL","E-Mail")
target_comp("REG_REGION_NOT_LIVE_REGION","Address matches contact address")
target_comp("REG_REGION_NOT_WORK_REGION","Address matches work address")
target_comp("LIVE_REGION_NOT_WORK_REGION","Living address matches work address")

## Direct Comparisons

Next we are going to do something we could hypothetically do with other values as well, but these feel more immediately relevant. We are going to compare the distributions of values between problematic and non-problematic creditors with certain values. 

Interestingly enough, age and number of days employed seem to show the greatest discrepancies. 

In [None]:
# One comparison per numeric column; each column is plotted exactly once.
target_comp("AGE_IN_YEARS","Age in years")
target_comp("DAYS_EMPLOYED_CLEAN","Days Employed")
target_comp("DAYS_REGISTRATION_CLEAN","Days since registration has changed")
target_comp("DAYS_ID_PUBLISH_CLEAN","Days since ID has changed")
target_comp("OWN_CAR_AGE","Length of Ownership of Car")
target_comp("CNT_FAM_MEMBERS","Amount of Family Members")
target_comp("HOUR_APPR_PROCESS_START","Starting hour")


Comparing occupation types, the differences between problematic and non-problematic creditors are not too strong, but they will eventually matter once we start training our model. Of course, we have to turn these categorical values into something machine-readable. 

In [None]:
# Non-null SK_ID_CURR count per (occupation, TARGET) pair. Only SK_ID_CURR is
# counted because it is the only count column the plots below read.
occupation=train.groupby(["OCCUPATION_TYPE","TARGET"])["SK_ID_CURR"].count().reset_index()
occupation_no=occupation.loc[occupation["TARGET"]==0,["OCCUPATION_TYPE","TARGET","SK_ID_CURR"]]
occupation_yes=occupation.loc[occupation["TARGET"]==1,["OCCUPATION_TYPE","TARGET","SK_ID_CURR"]]

# Global seaborn setting: it stays in effect for every plot drawn after this point.
sns.set(font_scale=1.8)
plt.figure(figsize=(40,10))
ax=sns.barplot(data=occupation_no.sort_values(by="SK_ID_CURR",ascending=False),x="OCCUPATION_TYPE",y="SK_ID_CURR")
ax.set_xlabel("")
ax.set_ylabel("")
ax.set_title("Occupation - Non-Problematic Creditor")
plt.show()

plt.figure(figsize=(40,10))
ax=sns.barplot(data=occupation_yes.sort_values(by="SK_ID_CURR",ascending=False),x="OCCUPATION_TYPE",y="SK_ID_CURR")
ax.set_xlabel("")
ax.set_ylabel("")
ax.set_title("Occupation - Problematic Creditor")
plt.show()

In contrast, the type of organization the creditors work for seems to show more differences. 

In [None]:
# Non-null SK_ID_CURR count per (organization type, TARGET) pair. Only
# SK_ID_CURR is counted because it is the only count column the plots below read.
organization=train.groupby(["ORGANIZATION_TYPE","TARGET"])["SK_ID_CURR"].count().reset_index()
organization_no=organization.loc[organization["TARGET"]==0,["ORGANIZATION_TYPE","TARGET","SK_ID_CURR"]]
organization_yes=organization.loc[organization["TARGET"]==1,["ORGANIZATION_TYPE","TARGET","SK_ID_CURR"]]


# Each chart shows only the 15 organization types with the highest count.
plt.figure(figsize=(25,10))
ax=sns.barplot(data=organization_no.sort_values(by="SK_ID_CURR",ascending=False).iloc[:15],x="ORGANIZATION_TYPE",y="SK_ID_CURR")
ax.set_xlabel("")
ax.set_ylabel("")
ax.set_title("Organization Type - Non-Problematic Creditor")
plt.show()

plt.figure(figsize=(25,10))
ax=sns.barplot(data=organization_yes.sort_values(by="SK_ID_CURR",ascending=False).iloc[:15],x="ORGANIZATION_TYPE",y="SK_ID_CURR")
ax.set_xlabel("")
ax.set_ylabel("")
ax.set_title("Organization Type - Problematic Creditor")
plt.show()

There are some categorical values which are better compared by just looking at some key metrics. When we look at the Home Credit ratings of the clients' home region, we see that there is no real difference between problematic and non-problematic clients. 

In [None]:
# Split the data by TARGET value so that summary statistics can be compared.
train_no=train.loc[train["TARGET"]==0]
train_yes=train.loc[train["TARGET"]==1]


def describe(cat):
    """Print ``Series.describe()`` of column ``cat`` for ``train_no`` and then for ``train_yes``.

    Each summary is preceded by a heading line, and the two summaries are
    separated by one blank line.

    Parameters
    ----------
    cat : str
        Name of the column to summarise.
    """
    sections = (
        ("Values for non-problematic clients:", train_no),
        ("Values for problematic clients:", train_yes),
    )
    for position, (heading, subset) in enumerate(sections):
        if position:
            # Blank separator line between the two summaries.
            print("")
        print(heading)
        print(subset[cat].describe())

describe(cat="REGION_RATING_CLIENT")
describe(cat="REGION_RATING_CLIENT_W_CITY")

Checking for differences between the targets with regard to the day on which they applied for credit is mainly done out of due diligence here; there is no real difference between the two groups. 

In [None]:
# Non-null SK_ID_CURR count per (weekday, TARGET) pair. Only SK_ID_CURR is
# counted because it is the only count column the plots below read.
weekday=train.groupby(["WEEKDAY_APPR_PROCESS_START","TARGET"])["SK_ID_CURR"].count().reset_index()
weekday_no=weekday.loc[weekday["TARGET"]==0,["WEEKDAY_APPR_PROCESS_START","TARGET","SK_ID_CURR"]]
weekday_yes=weekday.loc[weekday["TARGET"]==1,["WEEKDAY_APPR_PROCESS_START","TARGET","SK_ID_CURR"]]

# Bars are ordered by count, so the weekday order may differ between the two charts.
plt.figure(figsize=(25,10))
ax=sns.barplot(data=weekday_no.sort_values(by="SK_ID_CURR",ascending=False),x="WEEKDAY_APPR_PROCESS_START",y="SK_ID_CURR")
ax.set_xlabel("")
# The y axis shows a row count, so the raw column name is blanked out.
ax.set_ylabel("")
ax.set_title("Weekday - Non-Problematic Creditor")
plt.show()

plt.figure(figsize=(25,10))
ax=sns.barplot(data=weekday_yes.sort_values(by="SK_ID_CURR",ascending=False),x="WEEKDAY_APPR_PROCESS_START",y="SK_ID_CURR")
ax.set_xlabel("")
ax.set_ylabel("")
ax.set_title("Weekday - Problematic Creditor")
plt.show()

# Conclusion (for now)

Looking at some key metrics, no values stand out by a lot, but there are definitely enough differences to work with. Given the amount of data we have about each creditor (and we haven't even included the other data yet), there is a lot to work with. 

Please comment and critique this ongoing notebook, it is my first published EDA notebook. 