In [1]:
# Initial setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
plt.style.use('seaborn-white')
%matplotlib inline

In [2]:
# Read off data
lend = pd.read_csv('data/lending_ml.csv')
display(lend.head())
print(lend.shape)

Unnamed: 0,home_ownership,purpose,yr_credit,dti,revol_util_dec,total_acc,addr_state,target
0,RENT,credit_card,26.0,27.65,0.837,9.0,AZ,1
1,RENT,car,12.0,1.0,0.094,4.0,GA,0
2,RENT,small_business,10.0,8.72,0.985,10.0,IL,1
3,RENT,other,15.0,20.0,2.1,37.0,CA,1
4,RENT,other,15.0,17.94,0.539,38.0,OR,1


(36182, 8)


#### This is a classification problem. 

The features being selected are homeownership, loan purpose, years of credit, debt to income ratio (DTI), % of credit usage, total credit lines, applicant's State

### Features Engineering

1. Get dummy variable for purpose and home_ownership

In [3]:
purpose = pd.get_dummies(lend['purpose'], drop_first=True)
home_own = pd.get_dummies(lend['home_ownership'], drop_first=True)
lend = pd.concat([lend, home_own, purpose], axis=1)

display(lend.head())

Unnamed: 0,home_ownership,purpose,yr_credit,dti,revol_util_dec,total_acc,addr_state,target,OTHER,OWN,...,home_improvement,house,major_purchase,medical,moving,other,renewable_energy,small_business,vacation,wedding
0,RENT,credit_card,26.0,27.65,0.837,9.0,AZ,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,RENT,car,12.0,1.0,0.094,4.0,GA,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,RENT,small_business,10.0,8.72,0.985,10.0,IL,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,RENT,other,15.0,20.0,2.1,37.0,CA,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,RENT,other,15.0,17.94,0.539,38.0,OR,1,0,0,...,0,0,0,0,0,1,0,0,0,0


2. Get dummy variable for addr_state

    - From [inference statistic](https://github.com/sittingman/lending_repayment/blob/master/inference_stat.ipynb) analysis, some states such as IA and MS have so only one record each that treating those labels as a feature may run into overfit problem later. 

    - We will replace state label that has <200 records as 'SML' to have the model recognize them as a group in the feature.

In [4]:
# create a crosstab to get count by states
table_state = pd.crosstab(lend['addr_state'], lend['target'], margins=True)

# filter for states that have more than 200 records, put that into list
LT_list = table_state[table_state['All'] > 200].index.tolist()

In [5]:
lend['helper_col'] = ~(lend['addr_state'].isin(LT_list)) # finding state records that do not match the LT_list

In [6]:
# if helper column is True, the record is from a small state, replace with 'SML', otherwise no change to the state label
lend['state'] = np.where(lend.helper_col == 1, 'SML', lend.addr_state)

In [7]:
# get dummy variables
state = pd.get_dummies(lend['state'], drop_first = True)
lend = pd.concat([lend, state], axis=1)

3. Get categorical feature column label for DTI grouping

    - from [inference statistic](https://github.com/sittingman/lending_repayment/blob/master/inference_stat.ipynb), DTI at the individual level does not show dependence on the loan paid off rate, but at the binning level it does. 
    - We will create a new column to hold the categorical DTI.

In [8]:
# crate function for grouping dti based on the value range
def dti(data):
    if data['dti'] <= 5: return 5
    elif (data['dti'] > 5) & (data['dti'] <=10) : return 10
    elif (data['dti'] > 10) & (data['dti'] <=15) : return 15
    elif (data['dti'] > 15) & (data['dti'] <=20) : return 20
    elif (data['dti'] > 20) & (data['dti'] <=30) : return 30
    elif (data['dti'] > 30): return 'high'

In [9]:
lend['dti_gp'] = lend.apply(dti, axis=1)

In [10]:
dti_gp = pd.get_dummies(lend['dti_gp'], drop_first = True)
lend = pd.concat([lend, dti_gp], axis = 1)

4. fillna for missing records under revol_util_dec

In [11]:
lend['revol'] = lend['revol_util_dec'].fillna(0)

In [12]:
# drop the redunant columns and form the final dataframe for machine learning
lend_fin = lend.drop(['home_ownership','addr_state', 'helper_col', 'dti_gp', 'dti', 'state', 'purpose', 'revol_util_dec'], axis=1)
print(lend_fin.columns)

Index([         'yr_credit',          'total_acc',             'target',
                    'OTHER',                'OWN',               'RENT',
              'credit_card', 'debt_consolidation',        'educational',
         'home_improvement',              'house',     'major_purchase',
                  'medical',             'moving',              'other',
         'renewable_energy',     'small_business',           'vacation',
                  'wedding',                 'AR',                 'AZ',
                       'CA',                 'CO',                 'CT',
                       'FL',                 'GA',                 'IL',
                       'KS',                 'KY',                 'LA',
                       'MA',                 'MD',                 'MI',
                       'MN',                 'MO',                 'NC',
                       'NJ',                 'NV',                 'NY',
                       'OH',                 'OK', 

In [13]:
# Final check to confirm that there is no missing values
lend_fin.isnull().values.any()

False

#### Create features and target numpy array

In [14]:
features = lend_fin.drop(['target'], axis=1).values
target = lend_fin.target.values

# check shape of the features and target sets
print(features.shape)
print(target.shape)

(36182, 54)
(36182,)


### Baseline Model

Assume targets to be all 1s or 0s. We use F1 score for evaluation.

In [15]:
print('Baseline F1 score {:.4}'.format(f1_score(target, np.ones(len(target)))))

Baseline F1 score 0.9206


Achieve 0.9206 for F1 score, which is quite high to begin with. This could be a result of Lending Club credit underwriting policy that has already filtered out many loans loans to begin with. As a result, we have imbalance dataset (i.e. more 1s than 0s).

Our goal is to see if we can further improve on the F1 score though models below.

In [16]:
# Split the data into a training and test set to be used for all models training
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = .25, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(27136, 54)
(9046, 54)
(27136,)
(9046,)


### Apply Logistic Regression

**Base Model**

In [17]:
clf = LogisticRegression(solver='liblinear')
# Fit the model on the trainng data.
clf.fit(X_train, y_train)
# Print the accuracy from the testing data.
y_pred_clf = clf.predict(X_test)
print('Logistic Regression F1 score {:.4}'.format(f1_score(y_test, y_pred_clf)))

Logistic Regression F1 score 0.9207


In [18]:
# Cross validation
def cv_score(clf, x, y, score_func=f1_score):
    result = 0
    nfold = 5
    for train, test in KFold(nfold).split(x): # split data into train/test groups, 5 times
        clf.fit(x[train], y[train]) # fit
        result += score_func(clf.predict(x[test]), y[test]) # evaluate score function on held-out data
    return result / nfold # average

In [19]:
clf = LogisticRegression(solver='liblinear')
score = cv_score(clf, X_train, y_train)
print('Cross validation F1 score for Logistic Regression without reguluarization: {:.4}'. format(score))

Cross validation F1 score for Logistic Regression without reguluarization: 0.9206


F1 score stays the same as the baseline.

#### Tunning Model

In [20]:
# Perform Grid search to look for the best C
Cs = [0.001, 0.1, 1, 10, 100]
gridsearch_lf = GridSearchCV(estimator=clf, param_grid={'C': Cs}, cv=5, scoring='f1')
gridsearch_lf.fit(X_train, y_train)
y_pred_clf_C = gridsearch_lf.best_estimator_.predict(X_test)
print('best parameter: {}'.format(gridsearch_lf.best_params_))
print('Logistic Regression (Tuned) F1 score {:.4}'.format(f1_score(y_test, y_pred_clf_C)))
print('cross validation F1 score with best C: {:.4}'.format(gridsearch_lf.best_score_))

best parameter: {'C': 0.001}
Logistic Regression (Tuned) F1 score 0.9207
cross validation F1 score with best C: 0.9206


No further improvement on F1 score. We will set C in the model as 0.001.

### Apply Random Forest

#### Base Model

In [21]:
# Create the model with 100 trees
model = RandomForestClassifier(n_estimators=100, bootstrap = True, max_features = 'sqrt', random_state=33)

model.fit(X_train, y_train)
y_pred_rf = model.predict(X_test)

In [22]:
print('Random Forest F1 Score is {:.4}'.format(f1_score(y_test, y_pred_rf)))

Random Forest F1 Score is 0.9171


In [23]:
#Cross validation
score_rf = cv_score(model, X_train, y_train)
print('Cross validation F1 score for Random Forest without reguluarization: {:.4}'. format(score))

Cross validation F1 score for Random Forest without reguluarization: 0.9206


The random forest baseline model, have the same performance as Logistic Regression, and equal to baseline model.

We will tune parameters which may help improving the performance

#### Tunning model

We pick n_estimators, max_depth, and max_features to optimize as they are more important to Random Forest model.

In [24]:
param_grid = { 
    'n_estimators': [10, 100, 200, 500, 1000],
    'max_depth' : [5, 10, 20, 50, 100],
    'max_features' : ['auto', 'sqrt']
}

In [25]:
rfc = RandomForestClassifier(random_state=33, bootstrap=True, n_jobs=2)
gridsearch_rf = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, n_jobs=3)

In [26]:
gridsearch_rf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=2,
            oob_score=False, random_state=33, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid={'n_estimators': [10, 100, 200, 500, 1000], 'max_depth': [5, 10, 20, 50, 100], 'max_features': ['auto', 'sqrt']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [27]:
print('best parameters: {}'.format(gridsearch_rf.best_params_))

best parameters: {'max_depth': 5, 'max_features': 'auto', 'n_estimators': 10}


In [29]:
# predicting using the model with the best estimators
y_pred_rf_tune = gridsearch_rf.best_estimator_.predict(X_test)

In [30]:
print('Random Forest (Tuned) F1 score: {:.4}'.format(f1_score(y_test,y_pred_rf_tune)))
print('Cross validation F1 score for Random Forest tuned: {:.4}'. format(gridsearch_rf.best_score_))

Random Forest (Tuned) F1 score: 0.9207
Cross validation F1 score for Random Forest tuned: 0.8529


F1 score performance for tuned Random Forest perform worse than Logistic Regression and also baseline.

We will stay with the base Random Forest classifier without tuning.

### Performance Summary

|Model | F1 score (cross validated)|
|----- | -------|
|Baseline | 0.9206|
|Logistic Regression | 0.9206 |
|Random Forest | 0.9206 |

Based on the F1 score results, it turns out that we don't have a winner that get higher F1 score than the baseline. 

### Measure errors

In [31]:
# importing error test data (2012-2013)
test_error = pd.read_csv('data\lending_test.csv')
test_error_1 = pd.read_csv('data\lending_test.csv')
print(test_error.head())
print(test_error.shape)

  home_ownership             purpose  yr_credit    dti  revol_util_dec  \
0       MORTGAGE  debt_consolidation       24.0  15.55           0.444   
1           RENT  debt_consolidation       15.0  16.73           0.545   
2       MORTGAGE  debt_consolidation       22.0  15.75           0.346   
3       MORTGAGE  debt_consolidation       19.0  18.55           0.546   
4           RENT  debt_consolidation       14.0  27.06           0.709   

   total_acc addr_state  target  
0       22.0         MA     1.0  
1       41.0         NY     1.0  
2       16.0         CO     0.0  
3       31.0         CA     1.0  
4       17.0         CA     1.0  
(188180, 8)


In [32]:
# Create a function for feature engineering steps above to streamline future data processing
def data_clean(dataset):
    purpose_f = pd.get_dummies(dataset['purpose'], drop_first=True)
    home_own_f = pd.get_dummies(dataset['home_ownership'], drop_first=True)
    dataset['helper_col'] = ~(dataset['addr_state'].isin(LT_list))
    dataset['state'] = np.where(dataset.helper_col == 1, 'SML', dataset.addr_state)
    state_f = pd.get_dummies(dataset['state'], drop_first = True)
    dataset['dti_gp'] = dataset.apply(dti, axis=1)
    dti_gp_f = pd.get_dummies(dataset['dti_gp'], drop_first=True)
    dataset['revol'] = dataset['revol_util_dec'].fillna(0)
    dataset = pd.concat([home_own_f, dataset, purpose_f, state_f, dti_gp], axis=1)    
    dataset = dataset.drop(['home_ownership','addr_state', 'helper_col', 'dti_gp', 'dti', 'state', 'purpose', 'revol_util_dec'], axis=1)
    return dataset

In [33]:
test_error_clean = data_clean(test_error_1)

In [34]:
test_error_clean.tail()

Unnamed: 0,NONE,OTHER,OWN,RENT,yr_credit,total_acc,target,revol,credit_card,debt_consolidation,...,SML,TX,UT,VA,WA,WI,10,15,20,30
188175,0,0,0,1,25.0,20.0,1.0,0.584,0,1,...,0,0,0,0,0,0,,,,
188176,0,0,0,1,9.0,30.0,1.0,0.331,0,0,...,0,0,0,0,0,0,,,,
188177,0,0,0,0,13.0,17.0,1.0,0.624,0,0,...,0,0,0,0,0,0,,,,
188178,0,0,0,0,12.0,27.0,0.0,0.932,0,0,...,0,1,0,0,0,0,,,,
188179,0,0,0,1,10.0,25.0,1.0,0.379,1,0,...,0,0,0,1,0,0,,,,


In [35]:
purpose_f = pd.get_dummies(test_error['purpose'], drop_first=True)
home_own_f = pd.get_dummies(test_error['home_ownership'], drop_first=True)
test_error['helper_col'] = ~(test_error['addr_state'].isin(LT_list))
test_error['state'] = np.where(test_error.helper_col == 1, 'SML', test_error.addr_state)
state_f = pd.get_dummies(test_error['state'], drop_first = True)
test_error['dti_gp'] = test_error.apply(dti, axis=1)
dti_gp_f = pd.get_dummies(test_error['dti_gp'], drop_first=True)
test_error['revol'] = test_error['revol_util_dec'].fillna(0)
test_error = pd.concat([home_own_f, test_error, purpose_f, state_f, dti_gp_f], axis=1)    
test_error = test_error.drop(['home_ownership','addr_state', 'helper_col', 'dti_gp', 'dti', 'state', 'purpose', 'revol_util_dec'], axis=1)

In [36]:
test_error.tail()

Unnamed: 0,NONE,OTHER,OWN,RENT,yr_credit,total_acc,target,revol,credit_card,debt_consolidation,...,TX,UT,VA,WA,WI,10,15,20,30,high
188175,0,0,0,1,25.0,20.0,1.0,0.584,0,1,...,0,0,0,0,0,0,0,1,0,0
188176,0,0,0,1,9.0,30.0,1.0,0.331,0,0,...,0,0,0,0,0,1,0,0,0,0
188177,0,0,0,0,13.0,17.0,1.0,0.624,0,0,...,0,0,0,0,0,0,0,0,0,0
188178,0,0,0,0,12.0,27.0,0.0,0.932,0,0,...,1,0,0,0,0,0,1,0,0,0
188179,0,0,0,1,10.0,25.0,1.0,0.379,1,0,...,0,0,1,0,0,1,0,0,0,0


In [37]:
# obatain target and features from the error test set
features_test = test_error.drop(['target','high'], axis=1).values
target_test = test_error.target.values

print(features_test.shape)
print(target_test.shape)

(188180, 54)
(188180,)


**Apply Logistic Regression with C=0.001**

In [38]:
y_pred_test = gridsearch_lf.predict(features_test)
print('Final F1 score for test data {:.4}'.format(f1_score(target_test, y_pred_test)))

Final F1 score for test data 0.9144


**Baseline F1 score**

In [41]:
print('Baseline F1 score {:.4}'.format(f1_score(target_test, np.ones(len(target_test)))))

Baseline F1 score 0.9144


The score remain drop by 0.006. The model performance is relatively consistence when applying to new data.

### Conclusion

- Lending Club has its own set of credit underwriting policy. Loan requests that didn't meet the policy were declined. The policy directly contributed to F1 score of 0.9206.
- Applying Logistic Regression and Random Forest do not result in higher F1 score. Further actions are needed and are listed below.

### Further Improvements

- Cross validate the models with more recent years' data to see if Logistic Regression is still the most recommended model.
- Access 1-2 more classification models to have a broader measurement for model performance. I would consider Naive Bayes and Support Vector Machines, for example.
- During features engineering stage, add in clustering to capture potential patterns that were not captured by looking at one feature at a time
- Run correlation test across features to confirm that features are independent of each other
- Adding the declined loan data from Lending Club to the dataset. This balances out the sample of loans being paid off versus default. We can also see if Lending Club credit underwriting policy denied loans that would have been good loans that will be paid off.