In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier


In [4]:
train_df = pd.read_csv(('Resources/2019loans.csv'))
test_df = pd.read_csv(('Resources/2020Q1loans.csv'))

In [5]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [6]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [7]:
train_df.dtypes

Unnamed: 0                      int64
index                           int64
loan_amnt                     float64
int_rate                      float64
installment                   float64
                               ...   
total_bal_ex_mort             float64
total_bc_limit                float64
total_il_high_credit_limit    float64
hardship_flag                  object
debt_settlement_flag           object
Length: 86, dtype: object

In [10]:
# Separating and prepping train data
X = train_df.drop("loan_status", axis = 1)
X
y = train_df["loan_status"]
y

X_dummies = pd.get_dummies(X)
y_labels = LabelEncoder().fit_transform(y)

In [11]:
# Separating and prepping test data

Z = test_df.drop("loan_status", axis = 1)
Z
v = test_df["loan_status"]
v

Z_dummies = pd.get_dummies(Z)
v_labels = LabelEncoder().fit_transform(v)

In [12]:
Z_dummies

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,67991,67991,40000.0,0.0819,814.70,140000.0,19.75,0.0,1.0,18.0,...,0,0,1,0,1,1,0,1,0,1
1,25429,25429,6000.0,0.1524,208.70,55000.0,11.52,2.0,0.0,8.0,...,0,0,1,0,1,1,0,1,0,1
2,38496,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,...,0,0,1,0,1,1,0,1,0,1
3,19667,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,...,0,0,1,0,1,1,0,1,0,1
4,37505,37505,3600.0,0.1240,120.27,50000.0,16.08,0.0,3.0,6.0,...,0,0,1,0,1,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,77282,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,...,1,0,1,1,0,1,0,1,0,1
4698,77291,77291,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,...,0,0,1,0,1,1,0,1,0,1
4699,77292,77292,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,...,0,1,1,1,0,1,0,1,0,1
4700,77297,77297,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,...,1,0,1,0,1,1,0,1,0,1


In [13]:
# add missing dummy variables to testing set
Z_dummies["debt_settlement_flag_Y"]=0
Z_dummies

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,67991,67991,40000.0,0.0819,814.70,140000.0,19.75,0.0,1.0,18.0,...,0,1,0,1,1,0,1,0,1,0
1,25429,25429,6000.0,0.1524,208.70,55000.0,11.52,2.0,0.0,8.0,...,0,1,0,1,1,0,1,0,1,0
2,38496,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,...,0,1,0,1,1,0,1,0,1,0
3,19667,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,...,0,1,0,1,1,0,1,0,1,0
4,37505,37505,3600.0,0.1240,120.27,50000.0,16.08,0.0,3.0,6.0,...,0,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,77282,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,...,0,1,1,0,1,0,1,0,1,0
4698,77291,77291,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,...,0,1,0,1,1,0,1,0,1,0
4699,77292,77292,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,...,1,1,1,0,1,0,1,0,1,0
4700,77297,77297,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,...,0,1,0,1,1,0,1,0,1,0


In [15]:
X_train=X_dummies
y_train=y_labels
X_test=Z_dummies
y_test=v_labels


In [17]:
# Train the Logistic Regression model on the unscaled data and print the model score
logreg_model = LogisticRegression()

logreg_model.fit(X_train, y_train)

print(f"Training Data Score: {logreg_model.score(X_train, y_train)}")
print(f"Testing Data Score: {logreg_model.score(X_test, y_test)}")

Training Data Score: 0.6485221674876848
Testing Data Score: 0.5253083794130158


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Train a Random Forest Classifier model and print the model score

ranfor_class = RandomForestClassifier(n_estimators = 200, random_state = 1)

ranfor_class.fit(X_train, y_train)

print(f"Training Data Score: {ranfor_class.score(X_train, y_train)}")
print(f"Testing Data Score: {ranfor_class.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.6210123351765207


In [19]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Train the Logistic Regression model on the scaled data and print the model score


# Create the Logistic Regression model
logreg_model = LogisticRegression()

# Fit/train the model using the training data from 2019 loans
logreg_model.fit(X_train_scaled, y_train)

print(f"Training Data Score: {logreg_model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {logreg_model.score(X_test_scaled, y_test)}")

# Confusion Matrix
y_true_log = y_test
y_pred_log = logreg_model.predict(X_test_scaled)
confusion_matrix(y_true_log, y_pred_log)
tn, fp, fn, tp = confusion_matrix(y_true_log, y_pred_log).ravel()
tn, fp, fn, tp
print(f"True negative: {tn}")
print(f"False positive: {fp}")
print(f"False negative: {fn}")
print(f"True postive: {tp}")

# Classification Report
print(classification_report(y_true_log, y_pred_log))

Training Data Score: 0.713136288998358
Testing Data Score: 0.7201190982560612
True negative: 1242
False positive: 1109
False negative: 207
True postive: 2144
              precision    recall  f1-score   support

           0       0.86      0.53      0.65      2351
           1       0.66      0.91      0.77      2351

    accuracy                           0.72      4702
   macro avg       0.76      0.72      0.71      4702
weighted avg       0.76      0.72      0.71      4702



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# Train a Random Forest Classifier model on the scaled data and print the model score


# Create Random Forest Classifier
ranfor_class = RandomForestClassifier(n_estimators = 200, random_state = 1)

# Fit/train the classifier with 2019 loan data
ranfor_class.fit(X_train_scaled, y_train)

# Validate the model using the test data
print(f"Training Data Score: {ranfor_class.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {ranfor_class.score(X_test_scaled, y_test)}")

# Confusion Matrix
y_true_rf = y_test
y_pred_rf = ranfor_class.predict(X_test_scaled)
confusion_matrix(y_true_rf, y_pred_rf)
tn, fp, fn, tp = confusion_matrix(y_true_rf, y_pred_rf).ravel()
tn, fp, fn, tp
print(f"True negative: {tn}")
print(f"False positive: {fp}")
print(f"False negative: {fn}")
print(f"True postive: {tp}")

# Classification Report
print(classification_report(y_true_rf, y_pred_rf))

Training Data Score: 1.0
Testing Data Score: 0.6214376860910251
True negative: 814
False positive: 1537
False negative: 243
True postive: 2108
              precision    recall  f1-score   support

           0       0.77      0.35      0.48      2351
           1       0.58      0.90      0.70      2351

    accuracy                           0.62      4702
   macro avg       0.67      0.62      0.59      4702
weighted avg       0.67      0.62      0.59      4702



Without fitting, the logistic regression model had a worse data score than the random forest model. However after fitting the dataset random forrest model's data
scored lower than the logistic regression. Looking at the false positive count, it seems that random forest was suffering from noise values more than the logistic regression model.