# Home Credit Default Risk - Team 3 (Kahsai, Nichols, Pellerito)

In [None]:
# Import Packages
import os
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.offline as py
import gc    

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# List every file available under the Kaggle input directory
for root, _dirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
def graph_objects(frame, hue=None):
    """Draw one count plot per object-dtype (categorical) column of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to plot. It is only read, never modified.
    hue : str, optional
        Column of ``frame`` used to split every bar by colour. ``None`` (the
        default) draws plain counts.

    Returns
    -------
    None
        One 11x5 figure is shown per object column; nothing is drawn when
        ``frame`` has no object columns.
    """
    # Read the dtypes without renaming their index: that index may be shared
    # with ``frame.columns``, and naming it would leak into the caller's frame.
    object_columns = [col for col, dtype in zip(frame.columns, frame.dtypes)
                      if dtype == 'object']

    for col in object_columns:
        plt.figure(figsize=(11, 5))
        # Pass the column as ``x=``: recent seaborn releases treat the first
        # positional argument as ``data``, which clashes with ``data=frame``.
        sns.countplot(x=col, data=frame, hue=hue)
        plt.xticks(rotation=90)  # category labels are long; keep them readable
        plt.show()

# POS_CASH EDA

In [None]:
# Load the POS_CASH balances and discard the ID columns, which this EDA does not use
pos_cash = (
    pd.read_csv('../input/home-credit-default-risk/POS_CASH_balance.csv')
    .drop(columns=['SK_ID_PREV', 'SK_ID_CURR'])
)
print('POS_CASH Shape:', pos_cash.shape)
pos_cash.head(5)

In [None]:
# Work on a reproducible 5,000-row sample of POS_CASH to keep the EDA fast,
# then release the full table.
pos_5K = pos_cash.sample(n=5000, random_state=1).copy()
del pos_cash
gc.collect()

In [None]:
# Missing values per POS_CASH column: absolute count and share of rows (%)
pos_nulls = pos_5K.isna().sum()
count = pos_nulls.sort_values(ascending=False)
percentage = (pos_nulls / len(pos_5K) * 100).sort_values(ascending=False)

missing_pos = pd.concat([count, percentage], axis=1, keys=['Count', 'Percentage'])
print('Count and percentage of missing values for POS_CASH Dataset:')
missing_pos.head(8)

In [None]:
# Count plots: number of observations per category for each categorical POS_CASH column
graph_objects(frame=pos_5K)

In [None]:
# Distribution of CNT_INSTALMENT in the POS_CASH sample (missing values excluded)
ax = plt.figure(figsize=(12, 5)).gca()
ax.set_title("Distribution of CNT_INSTALMENT")
sns.distplot(pos_5K['CNT_INSTALMENT'].dropna(), ax=ax)
plt.show()

* Most CNT_INSTALMENT values fall in the 0-10 range.

In [None]:
# Distribution of CNT_INSTALMENT_FUTURE in the POS_CASH sample (missing values excluded)
ax = plt.figure(figsize=(12, 5)).gca()
ax.set_title("Distribution of CNT_INSTALMENT_FUTURE")
sns.distplot(pos_5K['CNT_INSTALMENT_FUTURE'].dropna(), ax=ax)
plt.show()

* Most CNT_INSTALMENT_FUTURE values fall in the 0-20 range.

In [None]:
# Pairwise relationship between CNT_INSTALMENT and CNT_INSTALMENT_FUTURE, coloured by NAME_CONTRACT_STATUS
# pairplot builds its own figure, so no plt.figure() call precedes it
# (one would only leave an empty extra figure in the output).
# `ax` holds the grid object returned by pairplot.
ax = sns.pairplot(pos_5K, vars=['CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE'], hue='NAME_CONTRACT_STATUS')
plt.show()

* CNT_INSTALMENT and CNT_INSTALMENT_FUTURE are highly correlated, with a positive linear slope.
* Among the NAME_CONTRACT_STATUS values, Active is much more common than any other option.

In [None]:
# Correlation heatmap of the numeric POS_CASH columns.
# Non-numeric columns are dropped explicitly: recent pandas raises when corr()
# meets object columns instead of silently skipping them.
corrs = pos_5K.select_dtypes(include='number').corr()

annot = True   # print the coefficient in each cell, not just the colour
cmap = "BuPu"  # blue/purple colour scheme
alpha = 0.5    # tone down the intensity of the colours
fmt = ".4f"    # format the numbers to four decimal places
cbar = False   # hide the scale bar on the right

plt.figure(figsize=(20, 20))
sns.heatmap(corrs, annot=annot, cmap=cmap, alpha=alpha, fmt=fmt, cbar=cbar)
plt.show()

# The POS_CASH sample is not used after this point; free its memory.
del pos_5K
gc.collect()

# Bureau EDA

In [None]:
# Load the bureau table and discard the ID columns, which this EDA does not use
bureau = (
    pd.read_csv('../input/home-credit-default-risk/bureau.csv')
    .drop(columns=['SK_ID_BUREAU', 'SK_ID_CURR'])
)
print('Bureau Data Shape:', bureau.shape)
bureau.head(5)

In [None]:
# Work on a reproducible 5,000-row sample of the bureau data to keep the EDA fast,
# then release the full table.
bureau_5K = bureau.sample(n=5000, random_state=1).copy()
del bureau
gc.collect()

In [None]:
# Missing values per bureau column: absolute count and share of rows (%)
bureau_nulls = bureau_5K.isna().sum()
count = bureau_nulls.sort_values(ascending=False)
percentage = (bureau_nulls / len(bureau_5K) * 100).sort_values(ascending=False)

missing_bur = pd.concat([count, percentage], axis=1, keys=['Count', 'Percentage'])
print('Count and percentage of missing values for Bureau Dataset:')
missing_bur.head(17)

In [None]:
# Count plots: number of observations per category for each categorical bureau column
graph_objects(frame=bureau_5K)

In [None]:
# Pairwise relationship between AMT_ANNUITY and AMT_CREDIT_SUM_LIMIT, coloured by CREDIT_ACTIVE
# pairplot builds its own figure, so no plt.figure() call precedes it
# (one would only leave an empty extra figure in the output).
# The deprecated `size=2.5` argument is dropped: 2.5 is already the default panel height.
# `ax` holds the grid object returned by pairplot.
ax = sns.pairplot(bureau_5K, vars=['AMT_ANNUITY', 'AMT_CREDIT_SUM_LIMIT'], hue='CREDIT_ACTIVE')
# NOTE(review): plt.xlim acts on the current axes only, not on every panel of
# the grid — confirm this is the intended clipping.
plt.xlim((0,1000000))
plt.show()

* AMT_ANNUITY and AMT_CREDIT_SUM_LIMIT do not have a significant relationship.
* Among the CREDIT_ACTIVE values, Active is much more common than any other option.

In [None]:
# Pairwise relationship between AMT_ANNUITY and AMT_CREDIT_SUM_LIMIT, coloured by CREDIT_CURRENCY
# pairplot builds its own figure, so no plt.figure() call precedes it
# (one would only leave an empty extra figure in the output).
# The deprecated `size=2.5` argument is dropped: 2.5 is already the default panel height.
# `ax` holds the grid object returned by pairplot.
ax = sns.pairplot(bureau_5K, vars=['AMT_ANNUITY', 'AMT_CREDIT_SUM_LIMIT'], hue='CREDIT_CURRENCY')
# NOTE(review): plt.xlim acts on the current axes only, not on every panel of
# the grid — confirm this is the intended clipping.
plt.xlim((0,1000000))
plt.show()

* AMT_ANNUITY and AMT_CREDIT_SUM_LIMIT do not have a significant relationship.
* Among the CREDIT_CURRENCY values, currency 1 is much more common than any other option.

In [None]:
# Pairwise relationship between AMT_ANNUITY and AMT_CREDIT_SUM_LIMIT, coloured by CREDIT_TYPE
# pairplot builds its own figure, so no plt.figure() call precedes it
# (one would only leave an empty extra figure in the output).
# The deprecated `size=2.5` argument is dropped: 2.5 is already the default panel height.
# `ax` holds the grid object returned by pairplot.
ax = sns.pairplot(bureau_5K, vars=['AMT_ANNUITY', 'AMT_CREDIT_SUM_LIMIT'], hue='CREDIT_TYPE')
# NOTE(review): plt.xlim acts on the current axes only, not on every panel of
# the grid — confirm this is the intended clipping.
plt.xlim((0,1000000))
plt.show()

* AMT_ANNUITY and AMT_CREDIT_SUM_LIMIT do not have a significant relationship.
* Among the CREDIT_TYPE values, credit card is much more common than any other option.

In [None]:
# Correlation heatmap of the numeric bureau columns.
# Non-numeric columns are dropped explicitly: recent pandas raises when corr()
# meets object columns instead of silently skipping them.
corrs = bureau_5K.select_dtypes(include='number').corr()

annot = True   # print the coefficient in each cell, not just the colour
cmap = "BuPu"  # blue/purple colour scheme
alpha = 0.5    # tone down the intensity of the colours
fmt = ".4f"    # format the numbers to four decimal places
cbar = False   # hide the scale bar on the right

plt.figure(figsize=(20, 20))
sns.heatmap(corrs, annot=annot, cmap=cmap, alpha=alpha, fmt=fmt, cbar=cbar)
plt.show()

# The bureau sample is not used after this point; free its memory.
del bureau_5K
gc.collect()

# Bureau Balance EDA

In [None]:
# Load the bureau balance table and discard the ID column, which this EDA does not use
bur_balance = (
    pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv')
    .drop(columns=['SK_ID_BUREAU'])
)
print('Bureau Balance Data Shape:', bur_balance.shape)
bur_balance.head(5)

In [None]:
# Work on a reproducible 5,000-row sample of the bureau balance data to keep the EDA fast,
# then release the full table.
burbal_5K = bur_balance.sample(n=5000, random_state=1).copy()
del bur_balance
gc.collect()

In [None]:
# Missing values per bureau balance column: absolute count and share of rows (%)
burbal_nulls = burbal_5K.isna().sum()
count = burbal_nulls.sort_values(ascending=False)
percentage = (burbal_nulls / len(burbal_5K) * 100).sort_values(ascending=False)

missing_bal = pd.concat([count, percentage], axis=1, keys=['Count', 'Percentage'])
print('Count and percentage of missing values for Bureau Balance Dataset:')
missing_bal.head(3)

In [None]:
# Count plots: number of observations per category for each categorical bureau balance column
graph_objects(frame=burbal_5K)

In [None]:
# Distribution of MONTHS_BALANCE in the bureau balance sample (missing values excluded)
ax = plt.figure(figsize=(12, 5)).gca()
ax.set_title("Distribution of MONTHS_BALANCE")
sns.distplot(burbal_5K['MONTHS_BALANCE'].dropna(), ax=ax)
plt.show()

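* MONTHS_BALANCE contains negative values. This is expected rather than inconsistent: per the Home Credit column descriptions, the column counts months relative to the application date (-1 is the most recent month).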

In [None]:
# MONTHS_BALANCE distribution split by STATUS
# pairplot builds its own figure, so no plt.figure() call precedes it
# (one would only leave an empty extra figure in the output).
# `ax` holds the grid object returned by pairplot.
ax = sns.pairplot(burbal_5K, vars=['MONTHS_BALANCE'], hue='STATUS')
plt.show()

# The bureau balance sample is not used after this point; free its memory.
del burbal_5K
gc.collect()

* MONTHS_BALANCE and STATUS are highly correlated.

# Credit Card Balance EDA

In [None]:
# Load the credit card balances and discard the ID columns, which this EDA does not use
credit = (
    pd.read_csv('../input/home-credit-default-risk/credit_card_balance.csv')
    .drop(columns=['SK_ID_PREV', 'SK_ID_CURR'])
)
# Label fixed: this is the credit card balance table, not a test set.
print('Credit Card Balance Shape:', credit.shape)
credit.head(5)

In [None]:
# Work on a reproducible 5,000-row sample of the credit card balance data to keep the EDA fast,
# then release the full table.
credit_5K = credit.sample(n=5000, random_state=1).copy()
del credit
gc.collect()

In [None]:
# Missing values per credit card balance column: absolute count and share of rows (%)
credit_nulls = credit_5K.isna().sum()
count = credit_nulls.sort_values(ascending=False)
percentage = (credit_nulls / len(credit_5K) * 100).sort_values(ascending=False)

missing_cred = pd.concat([count, percentage], axis=1, keys=['Count', 'Percentage'])
print('Count and percentage of missing values for Credit Balance Dataset:')
missing_cred.head(23)

In [None]:
# Count plots: number of observations per category for each categorical credit card balance column
graph_objects(frame=credit_5K)

In [None]:
# Pairwise relationship between AMT_PAYMENT_CURRENT and AMT_DRAWINGS_ATM_CURRENT, coloured by NAME_CONTRACT_STATUS
# pairplot builds its own figure, so no plt.figure() call precedes it
# (one would only leave an empty extra figure in the output).
# `ax` holds the grid object returned by pairplot.
ax = sns.pairplot(credit_5K, vars=['AMT_PAYMENT_CURRENT', 'AMT_DRAWINGS_ATM_CURRENT'], hue='NAME_CONTRACT_STATUS')
plt.show()

* AMT_PAYMENT_CURRENT and AMT_DRAWINGS_ATM_CURRENT do not have a significant relationship.
* Among the NAME_CONTRACT_STATUS values, Active is much more common than any other option.

In [None]:
# Correlation heatmap of the numeric credit card balance columns.
# Non-numeric columns are dropped explicitly: recent pandas raises when corr()
# meets object columns instead of silently skipping them.
corrs = credit_5K.select_dtypes(include='number').corr()

annot = True   # print the coefficient in each cell, not just the colour
cmap = "BuPu"  # blue/purple colour scheme
alpha = 0.5    # tone down the intensity of the colours
fmt = ".4f"    # format the numbers to four decimal places
cbar = False   # hide the scale bar on the right

plt.figure(figsize=(20, 20))
sns.heatmap(corrs, annot=annot, cmap=cmap, alpha=alpha, fmt=fmt, cbar=cbar)
plt.show()

# The credit card balance sample is not used after this point; free its memory.
del credit_5K
gc.collect()

# Installments Payments EDA

In [None]:
# Load the installments payments and discard the ID columns, which this EDA does not use
instal = (
    pd.read_csv('../input/home-credit-default-risk/installments_payments.csv')
    .drop(columns=['SK_ID_PREV', 'SK_ID_CURR'])
)
# Label fixed: this is the installments payments table, not a test set.
print('Installments Payments Shape:', instal.shape)
instal.head(5)

In [None]:
# Work on a reproducible 5,000-row sample of the installments payments data to keep the EDA fast,
# then release the full table.
instal_5K = instal.sample(n=5000, random_state=1).copy()
del instal
gc.collect()

In [None]:
# Missing values per installments payments column: absolute count and share of rows (%)
instal_nulls = instal_5K.isna().sum()
count = instal_nulls.sort_values(ascending=False)
percentage = (instal_nulls / len(instal_5K) * 100).sort_values(ascending=False)
missing_inst = pd.concat([count, percentage], axis=1, keys=['Count', 'Percentage'])
print('Count and percentage of missing values for Instalment Payments Dataset:')
missing_inst.head()

In [None]:
# Distribution of DAYS_ENTRY_PAYMENT in the installments payments sample (missing values excluded)
ax = plt.figure(figsize=(12, 5)).gca()
ax.set_title("Distribution of DAYS_ENTRY_PAYMENT")
sns.distplot(instal_5K['DAYS_ENTRY_PAYMENT'].dropna(), ax=ax)
plt.show()

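* DAYS_ENTRY_PAYMENT contains negative values. This is expected rather than inconsistent: per the Home Credit column descriptions, the column is measured in days relative to the application date of the current loan.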

In [None]:
# Distribution of AMT_PAYMENT in the installments payments sample (missing values excluded)
ax = plt.figure(figsize=(12, 5)).gca()
ax.set_title("Distribution of AMT_PAYMENT")
sns.distplot(instal_5K['AMT_PAYMENT'].dropna(), ax=ax)
plt.show()

* Most AMT_PAYMENT values are close to 0.

In [None]:
# Correlation heatmap of the numeric installments payments columns.
# Numeric columns are selected explicitly so corr() never meets a non-numeric
# column, which recent pandas rejects instead of silently skipping.
corrs = instal_5K.select_dtypes(include='number').corr()

annot = True   # print the coefficient in each cell, not just the colour
cmap = "BuPu"  # blue/purple colour scheme
alpha = 0.5    # tone down the intensity of the colours
fmt = ".4f"    # format the numbers to four decimal places
cbar = False   # hide the scale bar on the right

plt.figure(figsize=(20, 20))
sns.heatmap(corrs, annot=annot, cmap=cmap, alpha=alpha, fmt=fmt, cbar=cbar)
plt.show()

# The installments payments sample is not used after this point; free its memory.
del instal_5K
gc.collect()

# Previous Applications EDA

In [None]:
# Load the previous applications and discard the ID columns, which this EDA does not use
prev_app = (
    pd.read_csv('../input/home-credit-default-risk/previous_application.csv')
    .drop(columns=['SK_ID_PREV', 'SK_ID_CURR'])
)
# Label fixed: it previously announced the installments payments table (copy-paste error).
print('Previous Applications Shape:', prev_app.shape)
prev_app.head(20)

In [None]:
# Work on a reproducible 5,000-row sample of the previous applications to keep the EDA fast,
# then release the full table.
prev_5K = prev_app.sample(n=5000, random_state=1).copy()
del prev_app
gc.collect()

In [None]:
# Missing values per previous application column: absolute count and share of rows (%)
prev_nulls = prev_5K.isna().sum()
count = prev_nulls.sort_values(ascending=False)
percentage = (prev_nulls / len(prev_5K) * 100).sort_values(ascending=False)
missing_prev = pd.concat([count, percentage], axis=1, keys=['Count', 'Percentage'])
print('Count and percentage of missing values for Previous Applications Dataset:')
missing_prev.head(10)

In [None]:
# Count plots: number of observations per category for each categorical previous application column
graph_objects(frame=prev_5K)

In [None]:
# Pairwise relationships between AMT_DOWN_PAYMENT, AMT_GOODS_PRICE and AMT_ANNUITY, coloured by NAME_CASH_LOAN_PURPOSE
# pairplot builds its own figure, so no plt.figure() call precedes it
# (one would only leave an empty extra figure in the output).
# `ax` holds the grid object returned by pairplot.
ax = sns.pairplot(prev_5K, vars=['AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE', 'AMT_ANNUITY'], hue='NAME_CASH_LOAN_PURPOSE')
# NOTE(review): plt.xlim acts on the current axes only, not on every panel of
# the grid — confirm this is the intended clipping.
plt.xlim((0,1000000))
plt.show()

* AMT_ANNUITY and AMT_DOWN_PAYMENT are highly correlated, with a positive linear slope.
* AMT_ANNUITY and AMT_GOODS_PRICE are highly correlated, with a positive linear slope.
* AMT_GOODS_PRICE and AMT_DOWN_PAYMENT are highly correlated, with a positive linear slope.
* Among the NAME_CASH_LOAN_PURPOSE values, XNA is much more common than any other option.

In [None]:
# Pairwise relationships between AMT_DOWN_PAYMENT, AMT_GOODS_PRICE and AMT_ANNUITY, coloured by NAME_CONTRACT_STATUS
# pairplot builds its own figure, so no plt.figure() call precedes it
# (one would only leave an empty extra figure in the output).
# `ax` holds the grid object returned by pairplot.
ax = sns.pairplot(prev_5K, vars=['AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE', 'AMT_ANNUITY'], hue='NAME_CONTRACT_STATUS')
plt.show()

* AMT_ANNUITY and AMT_DOWN_PAYMENT are highly correlated, with a positive linear slope.
* AMT_ANNUITY and AMT_GOODS_PRICE are highly correlated, with a positive linear slope.
* AMT_GOODS_PRICE and AMT_DOWN_PAYMENT are highly correlated, with a positive linear slope.
* Among the NAME_CONTRACT_STATUS values, Approved is much more common than any other option.

In [None]:
# Correlation heatmap of the numeric previous application columns.
# Non-numeric columns are dropped explicitly: recent pandas raises when corr()
# meets object columns instead of silently skipping them.
corrs = prev_5K.select_dtypes(include='number').corr()

annot = True   # print the coefficient in each cell, not just the colour
cmap = "BuPu"  # blue/purple colour scheme
alpha = 0.5    # tone down the intensity of the colours
fmt = ".4f"    # format the numbers to four decimal places
cbar = False   # hide the scale bar on the right

plt.figure(figsize=(20, 20))
sns.heatmap(corrs, annot=annot, cmap=cmap, alpha=alpha, fmt=fmt, cbar=cbar)
plt.show()

# The previous applications sample is not used after this point; free its memory.
del prev_5K
gc.collect()