In [2]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

## Loan Status Classification

In [3]:
# ppp1 = pd.read_csv('../data/public_up_to_150k_1_230930.csv')
path = '../data/'
dir_list = os.listdir(path)[:-1] # Exclude the data dict
print(dir_list)

['public_up_to_150k_3_230930.csv', 'public_up_to_150k_11_230930.csv', 'public_up_to_150k_6_230930.csv', 'public_up_to_150k_1_230930.csv', 'public_up_to_150k_9_230930.csv', 'public_up_to_150k_4_230930.csv', 'public_up_to_150k_10_230930.csv', 'public_up_to_150k_7_230930.csv', 'public_up_to_150k_2_230930.csv', 'public_150k_plus_230930.csv', 'placeholder.txt', 'public_up_to_150k_8_230930.csv', 'public_up_to_150k_12_230930.csv', 'public_up_to_150k_5_230930.csv']


In [82]:
loans_1 = pd.read_csv(path + dir_list[0])
loans_2 = pd.read_csv(path + dir_list[1])
ppp = pd.concat([loans_1, loans_2])

In [5]:
ppp.shape

(1800000, 53)

In [6]:
ppp.columns

Index(['LoanNumber', 'DateApproved', 'SBAOfficeCode', 'ProcessingMethod',
       'BorrowerName', 'BorrowerAddress', 'BorrowerCity', 'BorrowerState',
       'BorrowerZip', 'LoanStatusDate', 'LoanStatus', 'Term',
       'SBAGuarantyPercentage', 'InitialApprovalAmount',
       'CurrentApprovalAmount', 'UndisbursedAmount', 'FranchiseName',
       'ServicingLenderLocationID', 'ServicingLenderName',
       'ServicingLenderAddress', 'ServicingLenderCity', 'ServicingLenderState',
       'ServicingLenderZip', 'RuralUrbanIndicator', 'HubzoneIndicator',
       'LMIIndicator', 'BusinessAgeDescription', 'ProjectCity',
       'ProjectCountyName', 'ProjectState', 'ProjectZip', 'CD', 'JobsReported',
       'NAICSCode', 'Race', 'Ethnicity', 'UTILITIES_PROCEED',
       'PAYROLL_PROCEED', 'MORTGAGE_INTEREST_PROCEED', 'RENT_PROCEED',
       'REFINANCE_EIDL_PROCEED', 'HEALTH_CARE_PROCEED',
       'DEBT_INTEREST_PROCEED', 'BusinessType', 'OriginatingLenderLocationID',
       'OriginatingLender', 'Originatin

In [10]:
print(ppp['LoanStatus'].value_counts())
print(np.sum(ppp['LoanStatus'].isna())) # no nulls

LoanStatus
Paid in Full    1634401
Charged Off      111965
Exemption 4       53634
Name: count, dtype: int64
0


In [15]:
# Baseline model and class imbalance
paid_frac = np.sum(ppp['LoanStatus'] == 'Paid in Full') / ppp.shape[0]
forgiven_frac = np.sum(ppp['LoanStatus'] == 'Charged Off') / ppp.shape[0]
ex4_frac = np.sum(ppp['LoanStatus'] == 'Exemption 4') / ppp.shape[0]
print(f'Paid in full: {round(paid_frac, 3)}')
print(f'Charged Off: {round(forgiven_frac, 3)}')
print(f'Exemption 4: {round(ex4_frac, 3)}')

Paid in full: 0.908
Charged Off: 0.062
Exemption 4: 0.03


'Paid in full' is the plurality class at 0.908.  Predicting 'paid in full' 90.8% of the time represents the baseline model which a successful classifier will beat.

### Random Forest

In [83]:
# type check
ppp['SBAOfficeCode'] = ppp['SBAOfficeCode'].astype(int)

In [84]:
np.sum(ppp['LoanStatusDate'].isna()) / ppp['LoanStatusDate'].size

0.029796666666666666

We know that about 2.97% of loans are still in arrears, and the LoanStatusDate will be null. We drop this from the model features for now because this will be highly correlated with the target and lead to data leakage.  We can add this feature back in at a later iteration of the model if we want to see whether the date the loan was forgiven or repaid can predict between these two classes only.

In [24]:
np.sum(ppp['UndisbursedAmount'] > 0)

4

In [26]:
ppp[ppp['UndisbursedAmount'] > 0][['UndisbursedAmount', 'CurrentApprovalAmount']]

Unnamed: 0,UndisbursedAmount,CurrentApprovalAmount
156358,78230.0,78230.0
149720,3000.0,28400.0
772551,0.75,2262.75
809276,330000.0,0.0


Nearly all loans have been disbursed, so we drop UndisbursedAmount from the model because it will not have predictive value.

In [30]:
ppp['RuralUrbanIndicator'].value_counts()

RuralUrbanIndicator
U    1618966
R     181034
Name: count, dtype: int64

In [31]:
np.sum(ppp['RuralUrbanIndicator'].isna())

0

In [32]:
ppp['HubzoneIndicator'].value_counts()

HubzoneIndicator
N    1308199
Y     491801
Name: count, dtype: int64

In [33]:
np.sum(ppp['BusinessAgeDescription'].isna())

0

In [35]:
np.sum(ppp['ProjectCountyName'].isna())

63

We do not know what Project features refer to and there are some missing values. We elect to ignore these features.

In [36]:
np.sum(ppp['CD'].isna())

54

The congressional district associated with a loan (CD) is important for potentially connecting this dataset to related COVID public health data.  There are some missing values, which we will handle as 'unrecorded'.

In [102]:
ppp['CD'] = ppp['CD'].where(ppp['CD'].notna(), 'unrecorded')
np.sum(ppp['CD'].isna())

0

In [103]:
np.sum(ppp['JobsReported'].isna()) # All businesses report number of jobs (business size)

0

In [88]:
ppp['NAICSCode'].dtypes

dtype('float64')

In [89]:
np.sum(ppp['NAICSCode'].isna())

19483

Nearly 20,000 loans do not report the NAICSCode (type of industry) the business operates in.  This information is important for the classifier. We designate unrecorded values as '000000'.

In [90]:
# temp_ppp = ppp

In [92]:
ppp['NAICSCode'] = ppp['NAICSCode'].where(ppp['NAICSCode'].notna(), 0)
np.sum(ppp['NAICSCode'].isna())

0

In [99]:
temp = ppp['NAICSCode'].astype(int).astype(str).replace(to_replace='0', value='000000')
np.sum(temp == '000000')

19483

In [100]:
ppp['NAICSCode'] = temp

In [101]:
ppp['NAICSCode'].dtypes

dtype('O')

In [None]:
# Choose features we believe will be most predictive
drop_features = ['LoanNumber', 'DateApproved', 'BorrowerName', 'BorrowerAddress', 'LoanStatusDate', 'UndisbursedAmount', 'ServicingLenderName', 'ServicingLenderAddress', 'ProjectCity', 'ProjectCountyName', 'ProjectState', 'ProjectZip', 
ppp.drop(

In [107]:
np.sum(ppp[['Race', 'Ethnicity', 'Gender', 'Veteran']].isna(), axis=0)

Race         0
Ethnicity    0
Gender       0
Veteran      0
dtype: int64

Demographic information of the business owner has no nulls.

In [108]:
np.sum(ppp[['UTILITIES_PROCEED', 'PAYROLL_PROCEED', 'MORTGAGE_INTEREST_PROCEED', 'RENT_PROCEED', 'REFINANCE_EIDL_PROCEED', 'HEALTH_CARE_PROCEED', 'DEBT_INTEREST_PROCEED']].isna(), axis=0)

UTILITIES_PROCEED            1279767
PAYROLL_PROCEED                 1379
MORTGAGE_INTEREST_PROCEED    1765119
RENT_PROCEED                 1730784
REFINANCE_EIDL_PROCEED       1784389
HEALTH_CARE_PROCEED          1779194
DEBT_INTEREST_PROCEED        1784319
dtype: int64