<a href="https://colab.research.google.com/github/rash-rc/credit-risk-modeling/blob/google-colab-notebook/source.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data Preparation

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#import libraries
import numpy as np
import pandas as pd


In [None]:
#import data
loan_data_backup = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/loan.csv')

In [None]:
loan_data = loan_data_backup.copy()

In [None]:
pd.options.display.max_columns = None

In [None]:
#loan_data

In [None]:
loan_data.head() #First 5 rows

In [None]:
loan_data.tail() #Last 5 rows

In [None]:
loan_data.columns.values

In [None]:
loan_data.info()

In [None]:
loan_data.dtypes.value_counts()

In [None]:
loan_data.dtypes

In [None]:
pd.set_option('display.max_rows', None)  # Show all rows (columns in this case)
print(loan_data.dtypes)

In [None]:
# Tabulate every column of loan_data next to its dtype as a two-column frame
# ('Column Name', 'Data Type') and print it.
columns_info = (
    loan_data.dtypes
    .rename_axis('Column Name')
    .reset_index(name='Data Type')
)
print(columns_info)

In [None]:
# Print one line per column giving its name and dtype.
# Iterating over dtypes.items() yields (name, dtype) pairs directly, so the
# frame does not have to be indexed again for every column.
for col, col_dtype in loan_data.dtypes.items():
    print(f"Column: {col}, Data Type: {col_dtype}")

### General Preprocessing

#### Preprocessing a few continuous variables

In [None]:
loan_data['emp_length'].unique()

In [None]:
# Reduce the textual employment length to a digit-only string:
#   1. drop the '+ years' suffix,
#   2. map '< 1 year' to '0',
#   3. treat missing values as '0',
#   4. strip the remaining 'years' / 'year' unit ('years' first, so that no
#      stray 's' is left behind).
# The column still holds strings at this point; it is converted to numbers in
# a later cell.
loan_data['emp_length_int'] = (
    loan_data['emp_length']
    .str.replace(r'\+ years', '', regex=True)
    .str.replace('< 1 year', str(0))
    .fillna(str(0))
    .str.replace('years', '')
    .str.replace('year', '')
)

In [None]:
loan_data['emp_length_int'].unique()

In [None]:
type(loan_data['emp_length_int'][0])

In [None]:
loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])

In [None]:
type(loan_data['emp_length_int'][0])

In [None]:
loan_data['term'].unique()

In [None]:
loan_data['term_int'] = loan_data['term'].str.replace('months', '')

In [None]:
loan_data['term_int'].unique()

In [None]:
loan_data['term_int'] = pd.to_numeric(loan_data['term_int'])

In [None]:
type(loan_data['term_int'][0])

In [None]:
loan_data['earliest_cr_line'].unique()

In [None]:
#loan_data['earliest_cr_line']

In [None]:
loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')

In [None]:
#loan_data['earliest_cr_line_date']

In [None]:
type(loan_data['earliest_cr_line_date'][0])

In [None]:
maxdate = loan_data['earliest_cr_line_date'].max()
maxdate

In [None]:
mindate = loan_data['earliest_cr_line_date'].min()
mindate

In [None]:
# To use the earliest-credit-line date in a regression we need the time that
# has elapsed since that date, which requires a fixed reference date.

# Usually the current date would be used, but because this data set is older
# the reference date is assumed to be 1 December 2017.

pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']

In [None]:
# Months are the conventional unit for this feature. The 'M' timedelta unit is
# deprecated, so the difference to the reference date is first expressed in
# days and then divided by 30.417 (about 365 / 12 days per month) before being
# rounded to whole months.
reference_date = pd.Timestamp('2017-12-01')
loan_data['months_since_earliest_cr_line'] = (
    (reference_date - loan_data['earliest_cr_line_date'])
    / np.timedelta64(1, 'D')
    / 30.417
).round()

In [None]:
#loan_data['months_since_earliest_cr_line']

In [None]:
loan_data['months_since_earliest_cr_line'].describe()

In [None]:
# The summary statistics above show a negative minimum, i.e. a negative time
# difference, which is impossible and therefore has to be addressed.

# Start by listing the affected rows. .loc selects rows (here via a boolean
# mask) and columns (by label) of a DataFrame in a single step.

loan_data.loc[
    loan_data['months_since_earliest_cr_line'] < 0,
    ['earliest_cr_line', 'earliest_cr_line_date', 'months_since_earliest_cr_line'],
]

In [None]:
# In the output above the earliest credit line date lies after the reference
# date (December 2017), which is not possible.
# Cause: the source strings carry two-digit years, and '%y' follows the POSIX
# convention (69-99 -> 1969-1999, 00-68 -> 2000-2068), so a year such as 1967
# is parsed as 2067.
# Removing those rows would be one option, but it throws data away.
# Instead the negative values are imputed. They belong to credit lines opened
# in the distant past (the 1960s), i.e. longer ago than any credit line that
# was parsed correctly, so they are replaced with the largest observed
# difference. The exact number of months is not recovered, but a very large
# value stays close to the real picture.

# A single .loc assignment is used on purpose: the chained form
# loan_data[col][mask] = value may write to a temporary copy and leave
# loan_data unchanged (always the case under pandas copy-on-write).
loan_data.loc[
    loan_data['months_since_earliest_cr_line'] < 0,
    'months_since_earliest_cr_line',
] = loan_data['months_since_earliest_cr_line'].max()

In [None]:
# Check that no negative values remain. Series.min() is vectorised and skips
# NaN, whereas the builtin min() iterates element by element in Python and
# gives an order-dependent result when NaN is present.
loan_data['months_since_earliest_cr_line'].min()

In [None]:
loan_data['issue_d'].unique()

In [None]:
loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')

In [None]:
#loan_data['issue_d_date']

In [None]:
loan_data['issue_d_date'].min()

In [None]:
loan_data['issue_d_date'].max()

In [None]:
# Whole months between the reference date (1 December 2017) and the issue date.
# pd.to_numeric() applied directly to a timedelta Series returns nanoseconds,
# not months, so the difference is converted to days and divided by 30.417
# (about 365 / 12 days per month) before rounding.
loan_data['months_since_issue_date'] = (
    (pd.to_datetime('2017-12-01') - loan_data['issue_d_date'])
    / np.timedelta64(1, 'D')
    / 30.417
).round()

In [None]:
#loan_data['months_since_issue_date']

In [None]:
# Months elapsed between the loan issue date and the reference date: the
# difference is expressed in days, divided by 30.417 (about 365 / 12 days per
# month) and rounded to whole months.
reference_date = pd.Timestamp('2017-12-01')
loan_data['months_since_issue_date'] = (
    (reference_date - loan_data['issue_d_date'])
    / np.timedelta64(1, 'D')
    / 30.417
).round()

In [None]:
#loan_data['months_since_issue_date']

In [None]:
loan_data['months_since_issue_date'].describe()

In [None]:
#Preprocessing discrete variables
#grade, sub_grade, home_ownership, verification_status, loan_status, purpose, addr_state, initial_list_status

loan_data.info()

With discrete features we want to create dummy variables for each of their categories. Dummy variables are binary indicators: 1 if an observation belongs to a category, 0 if it does not (e.g. for gender — F/M — to be useful in a statistical model, it has to be represented numerically by dummy variables).

We need only k-1 dummy variables to represent the information about k categories.

Note: It will be best to create a new dataframe where we will store all the new dummy variables and then concatenate it to the loan_data dataframe.

In [None]:
#pandas has a built-in function to create dummy variables for a given categorical variable - pd.get_dummies()

# pd.get_dummies(loan_data['grade'])



In [None]:
loan_data['grade'].unique()

In [None]:
dummies = pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')

In [None]:
dummies.tail()

In [None]:
sample = loan_data['grade'].sample(1000, random_state=42)
pd.get_dummies(sample)

In [None]:
# Build one dummy-variable frame per discrete feature. Each pair below is
# (source column, prefix); the resulting columns are named '<prefix>:<category>'.
# NOTE(review): the prefix for 'verification_status' is spelled
# 'verification_staus'. It is kept unchanged because later code may select the
# dummy columns by these exact names - confirm before correcting the spelling.
loan_data_dummies = [
    pd.get_dummies(loan_data[column], prefix=prefix, prefix_sep=':')
    for column, prefix in (
        ('grade', 'grade'),
        ('sub_grade', 'sub_grade'),
        ('home_ownership', 'home_ownership'),
        ('verification_status', 'verification_staus'),
        ('loan_status', 'loan_status'),
        ('purpose', 'purpose'),
        ('addr_state', 'addr_state'),
        ('initial_list_status', 'initial_list_status'),
    )
]

In [None]:
loan_data_dummies = pd.concat(loan_data_dummies, axis =1)

In [None]:
type(loan_data_dummies)

We need to specify whether we want to concatenate the inputs by rows or columns. We do that with the axis parameter.
By default, axis=0, which means that the inputs are concatenated by rows. If we want to concatenate by columns, we need to set axis=1.

In [None]:
loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)

In [None]:
loan_data.head()

In [None]:
loan_data.columns.values

Check for missing values and clean

A dedicated pandas method, `df.isnull()`, checks whether each data point is missing (True) or not (False).

In [None]:
#loan_data.isnull()

In [None]:
loan_data.head()

In [None]:
# Show all rows that have at least one null value
null_rows = loan_data[loan_data.isnull().any(axis=1)]
display(null_rows.head(20))

In [None]:
pd.options.display.max_rows = None
loan_data.isnull().sum()

One way to deal with missing values is to remove all observations (rows) that contain a missing value; another is to impute them.

In [None]:
pd.options.display.max_rows = 100

In [None]:
# Total revolving limit: missing values are replaced with the funded amount of
# the same row (fillna accepts a Series and aligns it on the index).
# The result is assigned back to the column. Calling fillna(inplace=True) on
# loan_data[col] operates on an intermediate object; pandas warns about this
# chained in-place call and it no longer updates loan_data under copy-on-write.
loan_data['total_rev_hi_lim'] = loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'])

In [None]:
pd.options.display.max_rows = None
loan_data['total_rev_hi_lim'].isnull().sum()

In [None]:
# Replace missing annual incomes with the mean of the observed values.
# Assigned back rather than using a chained fillna(inplace=True), which acts on
# an intermediate object and does not update loan_data under copy-on-write.
loan_data['annual_inc'] = loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean())

In [None]:
loan_data['annual_inc'].isnull().sum()

In [None]:
# Replace the missing values with zeroes.
# Assigned back rather than using a chained fillna(inplace=True), which acts on
# an intermediate object and does not update loan_data under copy-on-write.
loan_data['months_since_earliest_cr_line'] = loan_data['months_since_earliest_cr_line'].fillna(0)

In [None]:
loan_data['months_since_earliest_cr_line'].isnull().sum()

In [None]:
# Replace missing 'acc_now_delinq' values with 0.
# Assigned back rather than using a chained fillna(inplace=True), which acts on
# an intermediate object and does not update loan_data under copy-on-write.
loan_data['acc_now_delinq'] = loan_data['acc_now_delinq'].fillna(0)

In [None]:
loan_data['acc_now_delinq'].isnull().sum()

In [None]:
# Replace missing 'total_acc' values with 0.
# Assigned back rather than using a chained fillna(inplace=True), which acts on
# an intermediate object and does not update loan_data under copy-on-write.
loan_data['total_acc'] = loan_data['total_acc'].fillna(0)

In [None]:
loan_data['total_acc'].isnull().sum()

In [None]:
# Replace missing 'pub_rec' values with 0.
# Assigned back rather than using a chained fillna(inplace=True), which acts on
# an intermediate object and does not update loan_data under copy-on-write.
loan_data['pub_rec'] = loan_data['pub_rec'].fillna(0)

In [None]:
loan_data['pub_rec'].isnull().sum()

In [None]:
# Replace missing 'open_acc' values with 0.
# Assigned back rather than using a chained fillna(inplace=True), which acts on
# an intermediate object and does not update loan_data under copy-on-write.
loan_data['open_acc'] = loan_data['open_acc'].fillna(0)

In [None]:
loan_data['open_acc'].isnull().sum()

In [None]:
# Replace missing 'inq_last_6mths' values with 0.
# Assigned back rather than using a chained fillna(inplace=True), which acts on
# an intermediate object and does not update loan_data under copy-on-write.
loan_data['inq_last_6mths'] = loan_data['inq_last_6mths'].fillna(0)

In [None]:
loan_data['inq_last_6mths'].isnull().sum()

In [None]:
# Replace missing 'delinq_2yrs' values with 0.
# Assigned back rather than using a chained fillna(inplace=True), which acts on
# an intermediate object and does not update loan_data under copy-on-write.
loan_data['delinq_2yrs'] = loan_data['delinq_2yrs'].fillna(0)

In [None]:
loan_data['delinq_2yrs'].isnull().sum()

In [None]:
# Replace missing 'emp_length_int' values with 0.
# Assigned back rather than using a chained fillna(inplace=True), which acts on
# an intermediate object and does not update loan_data under copy-on-write.
loan_data['emp_length_int'] = loan_data['emp_length_int'].fillna(0)

In [None]:
loan_data['emp_length_int'].isnull().sum()

Analysis

In [None]:
# Expected Loss (EL) = Probability of default (PD) * Loss given default (LGD) * Exposure at default (EAD)