In [38]:
# Initial setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
plt.style.use('seaborn-white')
%matplotlib inline

In [39]:
# Load the lending dataset from CSV and preview its first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a saved
# row index -- confirm it is excluded before modelling.
lend = pd.read_csv(filepath_or_buffer='data/lending_ml.csv')
display(lend.head(n=5))

Unnamed: 0,purpose,yr_credit,dti,revol_util_dec,total_acc,addr_state,target
0,credit_card,26.0,27.65,0.837,9.0,AZ,1
1,car,12.0,1.0,0.094,4.0,GA,0
2,small_business,10.0,8.72,0.985,10.0,IL,1
3,other,15.0,20.0,2.1,37.0,CA,1
4,other,15.0,17.94,0.539,38.0,OR,1


### Features development

1. Get dummy variables for purpose

In [40]:
# One-hot encode the loan purpose, then append the indicator columns to the frame
purpose = pd.get_dummies(lend.purpose)
lend = pd.concat(objs=[lend, purpose], axis='columns')

2. Get dummy variable for addr_state

Note from the inferential statistics part of the exercise: some states, such as IA and MS, have so few records that treating their labels as separate features may lead to overfitting later. We will replace the label of any state with fewer than 200 records with `SML`, so that the model recognizes them collectively as low-applicant states in the feature columns.

In [41]:
# Cross-tabulate state against target; margins=True appends an 'All' row/column of totals
table_state = pd.crosstab(index=lend['addr_state'], columns=lend['target'], margins=True)

# Collect the index labels whose total record count is below 200
SML_list = table_state.loc[table_state['All'] < 200].index.tolist()

In [42]:
# Temporary boolean flag (dropped later): True when the record's state is in SML_list
lend['helper_col'] = lend.addr_state.isin(SML_list)

In [43]:
# Records flagged by the helper column get the label 'SML'; all others keep their state label
lend['state'] = np.where(lend['helper_col'], 'SML', lend['addr_state'])

In [45]:
# Drop the redundant columns now that 'state' carries the state label
lend.drop(columns=['addr_state', 'helper_col'], inplace=True)