# Team 3 Final EDA Notebook 1 (Kahsai, Nichols, Pellerito)
An Exploration of the Application_Train Dataset

# Review Training Dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import gc

In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Load the application_train table from the competition input directory
train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')

train.head()

In [None]:
# Share of rows in each TARGET class (class count divided by total row count)
train['TARGET'].value_counts().div(len(train)).to_frame()

This dataset is not evenly distributed between those who defaulted and those who did not default on their loan.

TARGET == 0 --> Individuals who paid their loan

TARGET == 1 --> Individuals who defaulted on their loan

In [None]:
# Bar chart of the number of borrowers in each TARGET class
target_grid = sns.catplot(x='TARGET', kind='count', data=train)
target_grid.ax.set_title('Count of Borrowers by Loans Status')
plt.show()

The dataset is not evenly distributed along the target variable.

# Demographic Exploration

In [None]:
# Distribution of borrowers by family status (horizontal count bars)
family_grid = sns.catplot(y='NAME_FAMILY_STATUS', kind='count', data=train)
plt.xticks(rotation=60)
family_grid.ax.set_title('Borrower Family Status')
plt.show()

The majority of borrowers are married or single.

In [None]:
# Distribution of borrowers by occupation (horizontal count bars)
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train, y='OCCUPATION_TYPE', ax=ax)
# Title spelling corrected ("Borrowwer" -> "Borrower"); this text is rendered on the figure
ax.set_title('Borrower Occupations')
plt.show()

Laborers and Sales Staff make up the most frequent type of borrower.

In [None]:
# Distribution of borrowers by housing type (horizontal count bars)
housing_grid = sns.catplot(y='NAME_HOUSING_TYPE', kind='count', data=train)
plt.xticks(rotation=60)
housing_grid.ax.set_title('Borrower Living Situations')
plt.show()

Most borrowers live in a house or apartment.

In [None]:
# Age distribution: convert the DAYS_BIRTH day counts to positive whole years
X = train['DAYS_BIRTH'].div(365).abs().round()
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=train, x=X, color='purple', ax=ax)
ax.set_title('Distribution of Borrower Age')
ax.set_xlabel('Years')
plt.show()

Borrowers range in age from approximately 20 years old to 68 years old.

In [None]:
# Employment distribution: convert the DAYS_EMPLOYED day counts to positive whole years
X = train['DAYS_EMPLOYED'].div(365).abs().round()
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=train, x=X, color='red', ax=ax)
ax.set_title('Distribution of Borrower Length of Employment')
ax.set_xlabel('Years')
# Only the 0-40 year range is shown; anything beyond it is cut off by the axis limit
ax.set_xlim(0, 40)
plt.show()

The majority of borrowers have less than 15 years of employment experience.

In [None]:
# Credit distribution
sns.boxplot(data = train, x = 'AMT_CREDIT', color = 'orange')
plt.show()

Although there are borrowers with credit up to 4 Million, approximately three-quarters of borrowers have less than 1 Million in credit.

In [None]:
# Income distribution
sns.boxplot(data = train, x = 'AMT_INCOME_TOTAL', color = 'yellow')
plt.xlim(0, 750000)
plt.show()

Although some income levels are as high as 1 Million, approximately three-quarters of borrowers earn less than 200,000.

# Feature Correlation

In [None]:
# Build a table of pairwise feature correlations
# Show only those pairs whose absolute correlation is higher than 0.75

# Correlate numeric/boolean columns only. Selecting them explicitly keeps this
# cell working on pandas releases where DataFrame.corr() no longer skips text
# columns on its own.
feat_corr = train.select_dtypes(include=['number', 'bool']).corr()

# Keep the strict upper triangle (k = 1) so each pair appears once and the
# diagonal is excluded. The built-in bool replaces the np.bool alias, which
# has been removed from NumPy.
upper_triangle = np.triu(np.ones(feat_corr.shape, dtype=bool), k=1)
corr_df = feat_corr.where(upper_triangle)

# Reshape the matrix into one row per (Feature A, Feature B) pair
corr_df = corr_df.unstack().reset_index()
corr_df.columns = ['Feature A', 'Feature B', 'Correlation']
# Masked-out cells (lower triangle and diagonal) are NaN; discard them
corr_df.dropna(subset=['Correlation'], inplace=True)
corr_df['Correlation'] = round(corr_df['Correlation'], 2)
# Use the magnitude so strong negative correlations are reported too
corr_df['Correlation'] = abs(corr_df['Correlation'])
matrix = corr_df.sort_values(by='Correlation', ascending=False)
max_corr = matrix[matrix['Correlation'] > 0.75]
max_corr

As the mean, median and mode apartment measurements are highly correlated, we will remove two of the three sets of features. We will also remove OBS_30, OBS_60, DEF_30, DEF_60, and CNT_FAM_MEMBERS to eliminate additional redundancies in the data.

In [None]:
# Columns to remove from the training frame
dels2 = ['APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
         'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE',
         'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE',
         'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE',  'NONLIVINGAREA_MODE',
         'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI',
         'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI',
         'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI',
         'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI',
         'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE', 'WALLSMATERIAL_MODE',
         'EMERGENCYSTATE_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
         'OBS_60_CNT_SOCIAL_CIRCLE',  'DEF_60_CNT_SOCIAL_CIRCLE', 'CNT_FAM_MEMBERS',
         'ORGANIZATION_TYPE']

# Drop by column label. Passing the label list directly avoids materialising a
# throwaway sub-frame (train[dels2]) just to have its column names iterated.
train = train.drop(columns=dels2)

In [None]:
# Correlation of every numeric/boolean feature with TARGET, sorted ascending
# (most negative first). Columns are selected by dtype explicitly so the cell
# also runs on pandas releases where DataFrame.corr() does not skip text
# columns by itself.
corr1 = train.select_dtypes(include=['number', 'bool']).corr()['TARGET'].sort_values()

# Strongest negative correlation
print('Features with Strongest Negative Correlation:')
corr1.head()

In [None]:
# Strongest positive correlation
print('Features with Strongest Positive Correlation:')
corr1.tail()

This provides a good starting point for determining feature importance. All three EXT_SOURCE features have significantly higher correlation (2-3 times) than the other features, so we will be sure to consider them when constructing our model. Let's examine the features with an absolute correlation > 0.05.

In [None]:
# TARGET plus the features picked out from the correlation ranking
top_corr_cols = [
    'TARGET',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'DAYS_BIRTH',
    'REGION_RATING_CLIENT_W_CITY',
    'REGION_RATING_CLIENT',
    'DAYS_LAST_PHONE_CHANGE',
]
top_corr = train[top_corr_cols]

# Non-null counts and dtypes for the selected columns
top_corr.info()

In [None]:
top_corr.head()

EXT_SOURCE_1 and EXT_SOURCE_3 have a large number of missing values. We believe the three external sources are the credit bureaus and therefore these variables collectively build a borrower's credit score. To address the missing data, we will find the mean of any existing values among these three columns and fill null values with that mean. DAYS_BIRTH and DAYS_LAST_PHONE_CHANGE are presented as negative numbers, so we will convert them to positive years to make them easier to comprehend.

In [None]:
train.info(max_cols = 85)

# Find and Fix Additional Missing Data

Because of the importance of the three external sources and the high number of missing values, we will create a feature using the mean of the existing values in each row. Should no values exist, we will fill them with 0.2.

In [None]:
# Create the row-wise average of the available EXT_SOURCE values, used to fill
# in the missing ones. The columns are selected by name rather than by position
# so the result does not depend on the current column order.
ext_source_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

# mean(axis=1) skips NaN, i.e. it is the sum of the present values divided by
# how many are present. Rows with no value at all yield NaN and get 0.2.
train['AVG_EXT'] = train[ext_source_cols].mean(axis=1).fillna(0.2)

# Fill in missing values. Assign the filled column back instead of calling
# fillna(..., inplace=True) on an attribute access: that chained in-place form
# operates on a temporary and does not update `train` under pandas Copy-on-Write.
for ext_col in ext_source_cols:
    train[ext_col] = train[ext_col].fillna(train['AVG_EXT'])

gc.collect()

In [None]:
# Create function to find count of missing values by column
def missing_data(data):
    """Summarise missing values per column.

    Returns a DataFrame indexed by column name with two columns:
    'Total' (number of null entries) and 'Percent' (null entries as a
    percentage of the row count), each sorted in descending order.
    """
    null_mask = data.isnull()
    null_total = null_mask.sum()
    null_percent = null_total / null_mask.count() * 100
    ranked = [series.sort_values(ascending=False) for series in (null_total, null_percent)]
    return pd.concat(ranked, axis=1, keys=['Total', 'Percent'])

missing_data(train).head(20)


The number of missing values in this dataset is significant. To remedy this, we chose to drop additional features based on two factors: 1) their correlation values, and 2) the number of missing values.


In [None]:
# Percentage of missing values in every column
missing_prcnt = train.isnull().sum().div(len(train)).mul(100)

# Columns where more than 50% of the rows are missing
high_missing = missing_prcnt.loc[missing_prcnt.gt(50)]
high_missing_index = list(high_missing.index)

# TARGET correlations (taken from corr1) for those sparse columns, ascending.
# A column with no entry in corr1 is reindexed to NaN.
corr_missing = corr1.reindex(high_missing_index).sort_values()

# Columns to drop: every sparse column whose correlation is NOT beyond +/-0.035.
# NaN correlations compare as False, so those columns land in the drop list too.
is_correlated = corr_missing.abs().gt(0.035).to_numpy()
correlated_cols = corr_missing.index[is_correlated]
missing_cols_to_drop = corr_missing.index.difference(correlated_cols).tolist()

print(missing_cols_to_drop)
len(missing_cols_to_drop)

In [None]:
# Drop the identified columns from the training dataset.
# The label list is passed directly; building train[missing_cols_to_drop] first
# would copy those columns only to have their names iterated.
train = train.drop(columns=missing_cols_to_drop)
train.shape

In [None]:
gc.collect()

# Additional Feature Exploration

Improve visualizations with adjustments to a few features

In [None]:
# Convert the day-count columns to positive whole years (365 days per year).
# Maps each new year-based column to the day-based column it is derived from:
#   AGE                <- DAYS_BIRTH
#   PHONE_CHANGE_YEARS <- DAYS_LAST_PHONE_CHANGE
#   YEARS_EMPLOYED     <- DAYS_EMPLOYED
# NOTE(review): abs() also turns any large positive day counts into large year
# values; the pairplot discussion later in this notebook reports
# YEARS_EMPLOYED values around 1000.
year_sources = {
    'AGE': 'DAYS_BIRTH',
    'PHONE_CHANGE_YEARS': 'DAYS_LAST_PHONE_CHANGE',
    'YEARS_EMPLOYED': 'DAYS_EMPLOYED',
}
for year_col, day_col in year_sources.items():
    train[year_col] = train[day_col].div(365).abs().round()

# Remove unneeded columns: the helper average and the original day counts
train = train.drop(columns=['AVG_EXT', *year_sources.values()])

After creating a subset of the data, we will review a sampling of features that have been identified as significant thus far.

In [None]:
# Select sample of dataset for visualizations
train30k = train.sample(frac = 0.10, replace = False, random_state = 1)

In [None]:
# Build function for boxplots 
def class_box(feature):
    """Draw horizontal boxplots of one ``train30k`` column, one box per TARGET class.

    feature: name of the ``train30k`` column to plot on the x-axis.
    Reads the notebook-level ``train30k`` sample and shows the figure.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=train30k, x=feature, y='TARGET', orient="h", ax=ax)
    ax.set_title(f'Boxplot for {feature} by Class')
    plt.show()

In [None]:
# Build function for histograms
def class_hist(feature):
    """Draw a green histogram of one ``train30k`` column.

    feature: name of the ``train30k`` column whose distribution is plotted.
    Reads the notebook-level ``train30k`` sample and shows the figure.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data=train30k, x=feature, color='green', ax=ax)
    ax.set_title(f'Distribution of {feature} among Borrowers')
    plt.show()

### EXT_SOURCE_1

In [None]:
class_hist('EXT_SOURCE_1')

In [None]:
class_box('EXT_SOURCE_1')

#### EXT_SOURCE_2

In [None]:
class_hist('EXT_SOURCE_2')

In [None]:
# Spread of TARGET on EXT_SOURCE_2
class_box('EXT_SOURCE_2')

#### EXT_SOURCE_3

In [None]:
class_hist('EXT_SOURCE_3')

In [None]:
class_box('EXT_SOURCE_3')

From these visualizations, we glean that borrowers with higher scores from all three external sources were more likely to repay their loans.

#### AGE

In [None]:
class_hist('AGE')

In [None]:
class_box('AGE')

#### PHONE_CHANGE_YEARS

In [None]:
class_hist('PHONE_CHANGE_YEARS')

In [None]:
class_box('PHONE_CHANGE_YEARS')

It also appears that older borrowers are more likely to repay their loan. Although the median number of years since the last phone change is the same for those who default and those who do not, more of the people who recently changed their phone number default.

#### Defaulted Borrowers

In [None]:
# Age of borrowers who defaulted (TARGET == 1)
defaulted = train.loc[train['TARGET'] == 1]
fig, ax = plt.subplots(figsize=(12, 6))

age_ax = sns.histplot(data=defaulted, x='AGE', hue='TARGET')
age_ax.set_title('AGE - Defaulted Loans')

# Limit the x-axis to ages 20-70
ax.set_xlim(20, 70)
plt.show()

In [None]:
# Credit amount of borrowers who defaulted (TARGET == 1)
defaulted = train.loc[train['TARGET'] == 1]
fig, ax = plt.subplots(figsize=(12, 6))

credit_ax = sns.histplot(data=defaulted, x='AMT_CREDIT', hue='TARGET')
credit_ax.set_title('Credit Amount - Defaulted Loans')

# Limit the x-axis to credit amounts up to 1,500,000
ax.set_xlim(0, 1500000)
plt.show()

In [None]:
# Income of borrowers who defaulted (TARGET == 1)
defaulted = train.loc[train['TARGET'] == 1]
fig, ax = plt.subplots(figsize=(12, 6))

income_ax = sns.histplot(data=defaulted, x='AMT_INCOME_TOTAL', hue='TARGET')
income_ax.set_title('Income Level - Defaulted Loans')

# Limit the x-axis to incomes between 25,000 and 450,000
ax.set_xlim(25000, 450000)
plt.show()

From the above visualizations, the following assumptions can be made:
* younger borrowers are more likely to default than older borrowers.
* borrowers who have higher levels of credit are more likely to repay their loan
* borrowers who have higher levels of income are more likely to repay their loan

In [None]:
# Compare distribution of numeric features and target as well as the relationships between them
sns.pairplot(train30k[['TARGET','AGE','YEARS_EMPLOYED','AMT_INCOME_TOTAL',
                       'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE']], hue='TARGET')

YEARS_EMPLOYED has outliers that need to be addressed, as it is not possible to have 1000 years of employment. AMT_INCOME_TOTAL and AMT_ANNUITY also have a handful of outliers that could skew model performance. AMT_CREDIT and AMT_GOODS_PRICE are highly correlated, which we will consider when filling missing values.

# Feature Engineering
Based on the findings above, we chose to construct features to add predictive power to the dataset.

In [None]:
# Employment to age
# Years of employment relative to the borrower's age in years
train['EA_RATIO'] = train['YEARS_EMPLOYED'].div(train['AGE'])

# Credit to income
# Credit amount relative to total income
train['CI_RATIO'] = train['AMT_CREDIT'].div(train['AMT_INCOME_TOTAL'])

# Annuity to income
# AMT_ANNUITY relative to total income
# NOTE(review): the previous comment treated AMT_ANNUITY as annuity income; it
# may instead be the loan's periodic repayment amount - confirm against the
# dataset's column descriptions.
train['AI_RATIO'] = train['AMT_ANNUITY'].div(train['AMT_INCOME_TOTAL'])

# Credit to annuity
# Credit amount relative to AMT_ANNUITY (same caveat about its meaning as above)
train['CA_RATIO'] = train['AMT_CREDIT'].div(train['AMT_ANNUITY'])

# Credit to cost of goods
# Credit amount relative to the price of the goods for which the loan was given
train['CG_RATIO'] = train['AMT_CREDIT'].div(train['AMT_GOODS_PRICE'])


# Natural log of the important monetary features
# To limit the influence of outliers
log_sources = (
    ('log_INCOME', 'AMT_INCOME_TOTAL'),
    ('log_ANNUITY', 'AMT_ANNUITY'),
    ('log_CREDIT', 'AMT_CREDIT'),
    ('log_GOODS', 'AMT_GOODS_PRICE'),
)
for log_col, amount_col in log_sources:
    train[log_col] = np.log(train[amount_col])

In [None]:
# Flag loans whose credit amount is higher than the price of the goods
train['FLAG_CG_ratio'] = train['AMT_CREDIT'].gt(train['AMT_GOODS_PRICE'])

# Flag rows whose DAYS_ID_PUBLISH is below -4200
# (4200 days is about 11.5 years at 365 days per year)
train['DAYS_ID_4200'] = train['DAYS_ID_PUBLISH'].lt(-4200)