In [None]:
# Import necessary libraries.
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the application-level train and test tables from the Kaggle
# "home-credit-default-risk" input directory (paths are relative to the notebook).
train = pd.read_csv("../input/home-credit-default-risk/application_train.csv")
test = pd.read_csv("../input/home-credit-default-risk/application_test.csv")



**First 5 rows**

In [None]:
train.head()

In [None]:
test.head()

**Shape of the data**

In [None]:
# Report the dimensions of both tables. DataFrame.shape is a (rows, columns)
# tuple, so unpack it instead of printing the tuple as a number of "entries".
print("The application_train.csv has {} rows and {} columns.".format(*train.shape))
print("The application_test.csv has {} rows and {} columns.".format(*test.shape))
# Size of the test set expressed as a percentage of the training set
print("(Test Data/Train Data)%: {}".format( (test.shape[0]/train.shape[0])*100))

As expected, test dataset contains all the columns except the target label.

**Overview of the data**

In [None]:
train.info(verbose=True, show_counts=True)

**What do the summary statistics look like?**

In [None]:
train.describe(include="all")

**How are the target labels distributed? Did most people repay on time?**

0: Loan was repaid; 1: Loan was not repaid


In [None]:
plt.title('Distribution of TARGET variable values')
sns.countplot(x=train.TARGET)

In [None]:
train['TARGET'].value_counts()

This is clearly an imbalanced target: more people repaid their loans than had difficulties repaying.


### **What are the missing values and their column names?**

In [None]:
def missing_columns(dataframe):
    """
    Summarise the columns of ``dataframe`` that contain missing values.

    dataframe: the pandas DataFrame to inspect

    Returns a DataFrame indexed by column name, ordered by missing count
    (largest first), with the columns 'Missing Count', 'Missing Count Ratio'
    (count divided by the number of rows) and 'Missing Count %' (percentage
    rounded to one decimal). Columns whose ratio equals 0 are left out.
    """
    n_rows = len(dataframe)

    # nulls per column, most affected columns first
    null_counts = dataframe.isnull().sum().sort_values(ascending=False)

    # the same counts as a fraction of the rows and as a rounded percentage
    null_ratio = null_counts / n_rows
    null_pct = (100 * null_counts / n_rows).round(1)

    # place the three series side by side under readable headers
    summary = pd.concat([null_counts, null_ratio, null_pct], axis=1)
    summary.columns = ['Missing Count', 'Missing Count Ratio', 'Missing Count %']

    # keep only the columns whose missing ratio is non-zero
    has_missing = summary['Missing Count Ratio'] != 0
    return summary[has_missing]

In [None]:
missing_columns(train)

In [None]:
missing_columns(test)

We will have to handle these missing values (filling them in is known as imputation). Another option would be to drop the columns that have a large share of missing values. However, until we know the feature importances, we cannot decide which columns to keep and which ones to drop.

**What are the different datatypes of columns? - How many floats, integers, categoricals?**

In [None]:
print("Train dataset: \n{}".format(train.dtypes.value_counts()))
print()
print("Test dataset: \n{}".format(test.dtypes.value_counts())) 


In the test dataset, the count of 40 int64 columns reflects the missing target label, which is expected.



### Turn every column data type of testing set similar to training set. Match datatypes of test in alignment with train.


In [None]:
# NOTE: this helper is disabled (commented out) and is not called in the cells shown here.
# def match_dtypes(training_df,testing_df,target_name='TARGET'):
#     """
#     Cast every non-target column of testing_df to the dtype of the same
#     column in training_df, then return testing_df.
#     """
#     for column_name in training_df.drop([target_name],axis=1).columns:
#          testing_df[column_name]= testing_df[column_name].astype(training_df[column_name].dtype)
        
#     return testing_df

**What are the different kinds of classes in every categorical column?**

In [None]:
# Number of unique classes in each object column
train.select_dtypes('object').apply(pd.Series.nunique)

In [None]:
test.select_dtypes('object').apply(pd.Series.nunique)

## **Preparing Dataset for Training**

### **Creating Training, Validation and Test Datasets**

In [None]:
# from sklearn.model_selection import train_test_split
# train_df, val_df = train_test_split(train, test_size=0.25,stratify=train['TARGET'], random_state=42)
# test_df = test


### **Handling Categorical variables - Label Encoding and One Hot Encoding.**

In [None]:
# Label-encode the categorical (object) columns that have at most two distinct
# non-null values.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
transform_counter = 0

# iterate through all the categorical columns
for col in train.select_dtypes('object').columns:

    # nunique() ignores NaN, so a selected column may still contain missing
    # values; astype(str) turns them into the literal 'nan', which receives
    # its own integer code.
    if train[col].nunique() <= 2:
        train_labels = train[col].astype(str)
        test_labels = test[col].astype(str)

        # Fit ONE encoder on the labels of both frames so that a given category
        # maps to the same integer in train and test. Fitting each frame
        # separately can assign different codes to the same category whenever
        # the two frames do not contain exactly the same set of labels.
        le.fit(pd.concat([train_labels, test_labels], ignore_index=True))
        train[col] = le.transform(train_labels)
        test[col] = le.transform(test_labels)

        transform_counter += 1

print("Label encoded {} columns.".format(transform_counter))

In [None]:
# one-hot encode of categorical variables
# One-hot encode the remaining categorical columns; drop_first=True removes the
# first level of each categorical so k levels become k-1 indicator columns.
# NOTE(review): train and test are encoded independently, so the resulting
# column sets (and the level that gets dropped) can differ between the two
# frames when their categories differ — confirm this is acceptable.
train = pd.get_dummies(train,drop_first=True)
test = pd.get_dummies(test,drop_first=True)



One-hot encoding will have added more columns; let's check how many there are now:


In [None]:
print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)



There is a mismatch in the count of columns for test and train. This can be fixed by aligning them.


In [None]:
# collect the target labels to support the aligning 

target = train['TARGET']


**Ensure train and test have the same number of columns by aligning.**

In [None]:
train, test = train.align(test,axis=1,join='inner')



Add the stored target column back into the train dataset.


In [None]:
train['TARGET'] = target

In [None]:
print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)

### **Exploratory Data Analysis on Encoded Dataset**

**Anomalies**

One problem we always want to be on the lookout for when doing EDA is anomalies within the data. These may be due to mis-typed numbers, errors in measuring equipment, or they could be valid but extreme measurements. One way to detect anomalies quantitatively is by looking at the statistics of a column using the describe method. The numbers in the DAYS_BIRTH column are negative because they are recorded relative to the current loan application date. Hence, to see these stats in years, we can multiply by -1 and divide by the number of days in a year (365).


In [None]:
(train['DAYS_BIRTH']/-365).describe()

Ages seem to be fine; nothing in particular seems to be off.

In [None]:
# Histogram of client age in years. DAYS_BIRTH is stored as a negative day
# count, so dividing by -365 yields a positive age.
fig, ax = plt.subplots(figsize=(12, 7))
# sns.distplot is deprecated; histplot draws the same count histogram when no
# KDE is requested. Pass ax explicitly so the figure created above is used.
sns.histplot(train['DAYS_BIRTH'] / -365, bins=5, kde=False, ax=ax)
ax.set_xlabel("Age of the client (Years)")



Most applicants are in the 30-40 year age range, which seems pretty normal.



**How many years has it been since the applicant started working?**

The DAYS_EMPLOYED column is negative because the days are counted relative to the time of the application. A negative value means the client started working that many days before the application; a positive value would mean the client starts working that many days after it. Negative values are the meaningful ones: a positive value could mean anything from the client starting a new job to the client being fired and resuming work later, which does not make sense either way, because a loan is unlikely to be given to a client without any work.


In [None]:
(train['DAYS_EMPLOYED']/365).describe()

This doesn't seem right, the maximum value (besides being positive) is about 1000 years!


**Who are these special people who got employed 1000 years after issuance of the loan?**


In [None]:
# Histogram of DAYS_EMPLOYED expressed in years (sign kept as stored).
fig, ax = plt.subplots(figsize=(12, 7))
# sns.distplot is deprecated; histplot draws a count histogram when no KDE is
# requested. Pass ax explicitly so the figure created above is used.
sns.histplot(train['DAYS_EMPLOYED'] / 365, kde=False, ax=ax)
ax.set_xlabel("Time before the loan application the persons started current employment(in years)")

So, how many of these 1000-year anomalies are there?

In [None]:
# Select the records whose DAYS_EMPLOYED, expressed in years, falls inside
# [900, 1100] (both ends included) and count them.
thousand_anomalies = train[(train['DAYS_EMPLOYED'] / 365).between(900, 1100)]
len(thousand_anomalies)

**Let's look at their ability to repay.**

In [None]:
fig, ax = plt.subplots(figsize=(12,7))
sns.countplot(x='TARGET',data=thousand_anomalies)

**Most anomalies were able to repay on time. But how can they be contrasted in relation to non anomalies?**

In [None]:
# Compare the default rate of the anomalous records with that of all the others.

# get the row labels of anomalies and non anomalies
anomalies_index = pd.Index(thousand_anomalies.index)
non_anomalies_index = train.index.difference(anomalies_index)

# get the non-anomalous records. The index holds row *labels*, so select with
# .loc; .iloc is positional and only agrees with labels on a default RangeIndex.
non_anomalies = train.loc[non_anomalies_index]

# class counts of TARGET for each group
anomalies_target = thousand_anomalies['TARGET'].value_counts()
non_anomalies_target = non_anomalies['TARGET'].value_counts()


def _default_rate_pct(target_counts):
    """Return the percentage of TARGET == 1 given a TARGET value_counts() Series."""
    # .get() avoids a KeyError when a class is absent from the group
    defaulted = target_counts.get(1, 0)
    repaid = target_counts.get(0, 0)
    total = defaulted + repaid
    # an empty group has no meaningful rate
    return 100 * defaulted / total if total else float('nan')


# find the default rate for anomalies and non anomalies

print("Anomalies have a default rate of {:0.2f}%".format(_default_rate_pct(anomalies_target)))
print("Non Anomalies have a default rate of {:0.2f}%".format(_default_rate_pct(non_anomalies_target)))

So, surprisingly, the anomalies have a lower default rate!


Handling the anomalies depends on the exact situation, with no set rules. One of the safest approaches is just to set the anomalies to a missing value and then have them filled in (using Imputation) before machine learning. In this case, since all the anomalies have the exact same value, we want to fill them in with the same value in case all of these loans share something in common. The anomalous values seem to have some importance, so we want to tell the machine learning model if we did in fact fill in these values. As a solution, we will fill in the anomalous values with not a number (np.nan) and then create a new boolean column indicating whether or not the value was anomalous.


In [None]:
# Record which rows carry the placeholder value 365243 before it is removed
train['DAYS_EMPLOYED_ANOM'] = train['DAYS_EMPLOYED'].eq(365243)

# Turn the placeholder into a missing value (NaN) on exactly those rows
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].mask(train['DAYS_EMPLOYED_ANOM'])

In [None]:
# Looking at the years employed for anomalies

plt.figure(figsize=(12,8))
(train['DAYS_EMPLOYED']/-365).plot.hist(title = 'Years Employment Histogram')
plt.xlabel("Years worked before application")

Now it all seems normal!

In [None]:
# Record which test rows carry the placeholder value 365243 before it is removed
test['DAYS_EMPLOYED_ANOM'] = test['DAYS_EMPLOYED'].eq(365243)

# Turn the placeholder into a missing value (NaN) on exactly those rows
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].mask(test['DAYS_EMPLOYED_ANOM'])

# Report how many test rows were flagged
print(
    "There are {:d} anomalies in the Test Dataset out of {:d} entries!".format(
        test["DAYS_EMPLOYED_ANOM"].sum(),
        len(test),
    )
)


### **Finding out the most correlated features for the TARGET variable.**

In [None]:
corr_train = train.corr()['TARGET']


**Looking at the top 10 most positively and negatively correlated features we get:**

In [None]:
print(corr_train.sort_values().tail(10))
corr_train.sort_values().head(10)


Since EXT_SOURCE_3, EXT_SOURCE_2, EXT_SOURCE_1 and DAYS_BIRTH are highly correlated (Relatively), let us also explore the possibility of having them as interaction variables.

**Initially filling up the missing values for the most correlated variables.**

In [None]:
# The original import statement named nothing after `import` (a syntax error);
# SimpleImputer is the class instantiated below.
from sklearn.impute import SimpleImputer

# Features singled out above as the most correlated with TARGET
poly_fitting_vars = ['EXT_SOURCE_3', 'EXT_SOURCE_2', 'EXT_SOURCE_1','DAYS_BIRTH']

# SimpleImputer matches missing entries by value, so pass the actual NaN marker
# (np.nan) rather than the string 'NaN'; missing entries are filled with the
# column median.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')