In [1]:
# Initial setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
plt.style.use('seaborn-white')
%matplotlib inline

In [2]:
# Read off data
lend = pd.read_csv('data/lending_ml.csv')
display(lend.head())

Unnamed: 0,purpose,yr_credit,dti,revol_util_dec,total_acc,addr_state,target
0,credit_card,26.0,27.65,0.837,9.0,AZ,1
1,car,12.0,1.0,0.094,4.0,GA,0
2,small_business,10.0,8.72,0.985,10.0,IL,1
3,other,15.0,20.0,2.1,37.0,CA,1
4,other,15.0,17.94,0.539,38.0,OR,1


### Features Engineering

1. Get dummy variable for puropes

In [3]:
purpose = pd.get_dummies(lend['purpose'], drop_first=True)
lend = pd.concat([lend, purpose], axis=1)

2. Get dummy variable for addr_state

Note that from inference statistic part of the exercise. Some states such as IA and MS have so few records that treating those labels as a feature may run into overfit problems later. We will replace state label that has <200 records as SML to have the model recognize them as low applicant states in the feature columns.

In [4]:
# create a crosstab to get count by states
table_state = pd.crosstab(lend['addr_state'], lend['target'], margins=True)

# filter for states that have less than 200 records, put that into list
SML_list = table_state[table_state['All'] < 200].index.tolist()
SML = pd.DataFrame(SML_list)
SML.to_csv('data/SML.csv', index=False) # to be used later for the test set

In [5]:
lend['helper_col'] = lend['addr_state'].isin(SML_list) #this serves as helper column only and will be drop later

In [6]:
# if helper column is True, the record is from a small state, replace with 'SML', otherwise no change to the state label
lend['state'] = np.where(lend.helper_col == 1, 'SML', lend.addr_state)

In [7]:
# get dummy variables
state = pd.get_dummies(lend['state'], drop_first = True)
lend = pd.concat([lend, state], axis=1)

3. Get categorical feature column label for DTI grouping

In [8]:
# crate function for grouping dti based on the value range
def dti(data):
    if data['dti'] <= 5: return 5
    elif (data['dti'] > 5) & (data['dti'] <=10) : return 10
    elif (data['dti'] > 10) & (data['dti'] <=15) : return 15
    elif (data['dti'] > 15) & (data['dti'] <=20) : return 20
    elif (data['dti'] > 20) & (data['dti'] <=30) : return 30
    else: return 40

In [9]:
lend['dti_gp'] = lend.apply(dti, axis=1)

In [10]:
dti_gp = pd.get_dummies(lend['dti_gp'])
lend = pd.concat([lend, dti_gp], axis = 1)

4. fillna for missing records under revol_util_dec

In [11]:
lend['revol'] = lend['revol_util_dec'].fillna(0)

In [12]:
# drop the redunant columns and form the final dataframe for machine learning
lend_fin = lend.drop(['addr_state', 'helper_col', 'dti_gp', 'state', 'dti', 'purpose', 'revol_util_dec'], axis=1)

In [13]:
# creating dataset for feature and target to be fed to machine learning model
features = lend_fin.drop(['target'], axis=1).values
target = lend_fin.target.values

### Apply Logistics Regression

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split the data into a training and test set.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = .3, random_state=5)

clf = LogisticRegression(solver='liblinear')
# Fit the model on the trainng data.
clf.fit(X_train, y_train)
# Print the accuracy from the testing data.
print(accuracy_score(clf.predict(X_test), y_test))

0.8523261169967756


#### Tunning Model

In [15]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

def cv_score(clf, x, y, score_func=accuracy_score):
    result = 0
    nfold = 5
    for train, test in KFold(nfold).split(x): # split data into train/test groups, 5 times
        clf.fit(x[train], y[train]) # fit
        result += score_func(clf.predict(x[test]), y[test]) # evaluate score function on held-out data
    return result / nfold # average

In [16]:
clf = LogisticRegression(solver='liblinear')
score = cv_score(clf, X_train, y_train)
print('Accuracy score without reguluarization: {:.2%}'. format(score))

Accuracy score without reguluarization: 85.32%


In [17]:
# Perform Grid search to look for the best C
Cs = [0.001, 0.1, 1, 10, 100]
gridsearch = GridSearchCV(estimator=clf, param_grid={'C': Cs}, cv=5)
gridsearch.fit(X_train, y_train)
print('best parameter: {}'.format(gridsearch.best_params_))
print('best score: {:.2%}'.format(gridsearch.best_score_))

best parameter: {'C': 0.001}
best score: 85.32%


No improvement on accuracy score. We will still set C in the model to avoid data overfit

### Apply Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Create the model with 100 trees
model = RandomForestClassifier(n_estimators=100, bootstrap = True, max_features = 'sqrt', random_state=33)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [19]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))

[[  32 1571]
 [ 103 9149]]
              precision    recall  f1-score   support

           0       0.24      0.02      0.04      1603
           1       0.85      0.99      0.92      9252

   micro avg       0.85      0.85      0.85     10855
   macro avg       0.55      0.50      0.48     10855
weighted avg       0.76      0.85      0.79     10855

0.8457853523721787


#### Tunning model

In [20]:
param_grid = { 
    'n_estimators': [100, 150, 200, 250, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6],
    'criterion' :['gini', 'entropy'],
    'bootstrap': [True, False]
}

In [21]:
rfc = RandomForestClassifier(random_state=33)
CV_model = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)

In [None]:
CV_model.fit(X_train, y_train) # model stuck here
# CV_model.best_params_