In [106]:
#import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [107]:
#import data
path = "Resources/lending_data.csv"
df = pd.read_csv(path)
df

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


In [108]:
#establish a prior for models to improve on
print("Based on the full dataset, always predicting a loan status of 0 will yield an accuracy of " + str(1-df["loan_status"].sum()/len(df["loan_status"])) + ".  To be any good, a model must significantly improve on that.")

Based on the full dataset, always predicting a loan status of 0 will yield an accuracy of 0.9677569129178704.  To be any good, a model must significantly improve on that.


# Prediction

The logistic regression splits the parameter space in half along a flat surface, while the random forest splits the data in a more complicated way. That leads me to believe that the random forest will have a significantly better fit to the training data. I think it will also be more accurate for the testing data, but not to near the extent as with the training data. In other words, I think the random forest will be both more accurate and more overfitted.


In [116]:
#split the data between independent and dependent variables and training/testing datasets
X = df.drop("loan_status",axis=1)
y = df["loan_status"]
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [117]:
#make the logistic regression model
model1 = LogisticRegression()
#fit the model
model1.fit(X_train,y_train)
#evaluate the model
print(f"Accuracy for training data:  {model1.score(X_train,y_train)}")
print(f"Accuracy for testing data:  {model1.score(X_test,y_test)}")
print(f"Confusion Matrix:")
print(confusion_matrix(y_test,model1.predict(X_test)))

Accuracy for training data:  0.9916941807676434
Accuracy for testing data:  0.9925711927362774
Confusion Matrix:
[[18676    87]
 [   57   564]]


In [118]:
#make the random forest model
model2 = RandomForestClassifier()
#fit the model
model2.fit(X_train,y_train)
#evaluate the model
print(f"Accuracy for training data:  {model2.score(X_train,y_train)}")
print(f"Accuracy for testing data:  {model2.score(X_test,y_test)}")
print(f"Confusion Matrix:")
print(confusion_matrix(y_test,model2.predict(X_test)))

Accuracy for training data:  0.9972485899023249
Accuracy for testing data:  0.9923648369789517
Confusion Matrix:
[[18681    82]
 [   66   555]]


# Analysis

Having tested both the logistic regression and random forest techniques multiple times across different training/testing data splits, the logistic regression usually had slightly better accuracy for the testing data (about 0.99206 vs  0.9918, a difference of about 0.00026).  Also, the random forest generally yielded much higher accuracy for the training data (about 0.9973 vs 0.992, a difference of about 0.0053).  This indicates that the random forest algorithm tended to overfit the training data and not generalize as well to the testing data.  In short, the logistic regression works better for this dataset.

In [91]:
#Warning:  this cell took 4-5 minutes to run on a pretty fast computer and is not required for the challenge
#this cell splits the data then fits and evaluates both models 100 times
#it then averages the accuracies of each model across the 100 iterations
#this is where I got the numbers I quoted in the analysis, which matched my observations running the above cells a few dozen times
log_train_acc = []
log_test_acc = []
for_train_acc = []
for_test_acc = []
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X,y)
    model1 = LogisticRegression()
    model1.fit(X_train,y_train)
    model2 = RandomForestClassifier()
    model2.fit(X_train,y_train)
    log_train_acc.append(model1.score(X_train,y_train))
    log_test_acc.append(model1.score(X_test,y_test))
    for_train_acc.append(model2.score(X_train,y_train))
    for_test_acc.append(model2.score(X_test,y_test))
print(f"Average logistic regression training accuracy: {np.mean(log_train_acc)}")
print(f"Average logistic regression testing accuracy: {np.mean(log_test_acc)}")
print(f"Average random forest training accuracy: {np.mean(for_train_acc)}")
print(f"Average random forest testing accuracy: {np.mean(for_test_acc)}")

Average logistic regression training accuracy: 0.9920011349566653
Average logistic regression testing accuracy: 0.9920625257944695
Average random forest training accuracy: 0.9973354312835331
Average random forest testing accuracy: 0.9918004539826661
