In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from pathlib import Path
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report


In [2]:
# Read both loan files in one pass: the 2019 file becomes the training frame
# and the 2020 Q1 file becomes the test frame.
train_df, test_df = (
    pd.read_csv(Path(csv_file))
    for csv_file in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)

In [3]:
# Convert the categorical target to integers for both the training and test data.
# The encoder is fitted once, on the training labels, and then reused for the
# test labels so that each class maps to the same integer in both sets. Two
# independently fitted encoders only agree when both files happen to contain
# exactly the same set of labels; with a shared encoder an unseen test label
# raises an error instead of being silently mis-numbered.
target_encoder = LabelEncoder().fit(train_df['target'])
y_train = target_encoder.transform(train_df['target'])
y_test = target_encoder.transform(test_df['target'])


In [4]:
# Build the training feature matrix: remove the target column, then one-hot
# encode the remaining categorical columns with get_dummies.
X = train_df.drop(columns='target')
X_train = pd.get_dummies(X)
X_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,0,1,1,0,1,0,1,0,1,0
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0,1,0,1,1,0,1,0,1,0
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,1,1,0,1,1,0,1,0,1,0
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,0,1,0,1,0,1,1,0,1,0
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,0,1,0,1,1,0,1,0,1,0


In [5]:
# Build the test feature matrix the same way as the training one: remove the
# target column, then one-hot encode the categorical columns.
X_t = test_df.drop(columns='target')
X_test = pd.get_dummies(X_t)
X_test.head()


Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,0,1,1,0,0,1,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,0,1,0,1,1,0,1,0,1
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,0,1,0,1,1,0,0,1,1
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,0,1,1,0,1,1,0,1,0,1
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1,0,1,0,1,1,0,1,0,1


In [6]:
# Identify which dummy columns exist in the training frame but not in the test
# frame (a category that never occurs in the test file produces no column).
new_column = X_train.columns.difference(X_test.columns).tolist()

# Align the test frame to the training columns. Reindexing adds the missing
# columns filled with 0, drops any column that exists only in the test frame,
# and puts the columns in the same order as the training frame, so the fitted
# models see each feature in the position they were trained on. It also builds
# a new frame instead of mutating X_test in place, so re-running the cell is safe.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [7]:
# Verify both dataframes have the same column names. The symmetric difference
# lists columns that are missing from either side (train-only or test-only),
# so an empty list means the two column sets match.
X_train.columns.symmetric_difference(X_test.columns).tolist()

[]

### Initial Analysis: I suspect that logistic regression (LR) will initially provide a more accurate model than the Random Forest (RF) because I believe the data may have a roughly linear relationship to the predicted value (low or high risk). Also, RF, without tuning or scaling, may tend to overfit the training data and thus prove less accurate on the test data.

In [9]:
# Fit logistic regression on the raw (unscaled) features, then report the
# accuracy on the training and test sets.
# Several max_iter values were tried; none gave significantly better results
# for the unscaled data.
clf_lr = LogisticRegression(max_iter=1000)
clf_lr.fit(X_train, y_train)

lr_train_score = clf_lr.score(X_train, y_train)
print(f"LR Training Data Score: {lr_train_score}")
lr_test_score = clf_lr.score(X_test, y_test)
print(f"LR Testing Data Score: {lr_test_score}")

LR Training Data Score: 0.6963054187192118
LR Testing Data Score: 0.5646533390046788


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [10]:
# Confusion matrix followed by per-class precision/recall/F1 for the
# logistic regression model on the unscaled test features.
y_true = y_test
y_pred = clf_lr.predict(X_test)
print(
    confusion_matrix(y_true, y_pred),
    classification_report(y_true, y_pred),
    sep="\n",
)


[[ 846 1505]
 [ 542 1809]]
              precision    recall  f1-score   support

           0       0.61      0.36      0.45      2351
           1       0.55      0.77      0.64      2351

    accuracy                           0.56      4702
   macro avg       0.58      0.56      0.55      4702
weighted avg       0.58      0.56      0.55      4702



In [11]:
# Fit a 500-tree random forest (seeded for reproducibility) on the unscaled
# features, then report accuracy on the training and test sets.
clf_rf = RandomForestClassifier(n_estimators=500, random_state=1)
clf_rf.fit(X_train, y_train)

rf_train_score = clf_rf.score(X_train, y_train)
print(f"RF Training Score: {rf_train_score}")
rf_test_score = clf_rf.score(X_test, y_test)
print(f"RF Testing Score: {rf_test_score}")


RF Training Score: 1.0
RF Testing Score: 0.646958740961293


In [12]:
# Confusion matrix and per-class precision/recall/F1 for the random forest on
# the unscaled test features. The confusion matrix is computed once and
# printed; the earlier bare call only repeated the work and discarded the result.
y_true = y_test
y_pred = clf_rf.predict(X_test)
print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred))

[[1961  390]
 [1270 1081]]
              precision    recall  f1-score   support

           0       0.61      0.83      0.70      2351
           1       0.73      0.46      0.57      2351

    accuracy                           0.65      4702
   macro avg       0.67      0.65      0.63      4702
weighted avg       0.67      0.65      0.63      4702



### Post comparison analysis: RF proved to be a better model than LR on the test data; however, both models have poor scores (RF = 0.647, LR = 0.565), so both unscaled fits have low accuracy. Assuming the label encoding maps high risk to class 0 and low risk to class 1, the LR is better at identifying the low-risk loans (class 1 recall 0.77) but misses most high-risk loans (class 0 recall 0.36). The RF was much better at identifying the high-risk loans (class 0 recall 0.83), but it misclassifies too many low-risk loans (class 1 recall 0.46), which is not good for the bank.

### Analysis - scaled values: I suspect that scaling the data will result in better results for both the LR and the RF models. Based on the low score for the LR model, I suspect that the relationship between the X values and the predicted values does not show linear characteristics. Therefore, RF should provide a more accurate prediction. 

In [13]:
# Standardise the features. The scaler learns its statistics from the training
# set only; the same fitted scaler is then applied to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Fit logistic regression on the standardised features, then report accuracy
# on the training and test sets.
classifier = LogisticRegression(max_iter=2000)
classifier.fit(X_train_scaled, y_train)

lr_scaled_train_score = classifier.score(X_train_scaled, y_train)
print(f"LR Training Data Score: {lr_scaled_train_score}")
lr_scaled_test_score = classifier.score(X_test_scaled, y_test)
print(f"LR Testing Data Score: {lr_scaled_test_score}")


LR Training Data Score: 0.7110837438423645
LR Testing Data Score: 0.7598894087622289


In [15]:
# Confusion matrix followed by per-class precision/recall/F1 for the
# logistic regression model on the standardised test features.
y_true = y_test
y_pred = classifier.predict(X_test_scaled)
print(
    confusion_matrix(y_true, y_pred),
    classification_report(y_true, y_pred),
    sep="\n",
)

[[1762  589]
 [ 540 1811]]
              precision    recall  f1-score   support

           0       0.77      0.75      0.76      2351
           1       0.75      0.77      0.76      2351

    accuracy                           0.76      4702
   macro avg       0.76      0.76      0.76      4702
weighted avg       0.76      0.76      0.76      4702



In [22]:
# Fit a 500-tree random forest (seeded for reproducibility) on the
# standardised features, then report accuracy on the training and test sets.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_scaled, y_train)

rf_scaled_train_score = clf.score(X_train_scaled, y_train)
print(f"RF Training Score: {rf_scaled_train_score}")
rf_scaled_test_score = clf.score(X_test_scaled, y_test)
print(f"RF Testing Data Score: {rf_scaled_test_score}")


RF Training Score: 1.0
RF Testing Data Score: 0.6480221182475542


In [23]:
# Confusion matrix followed by per-class precision/recall/F1 for the random
# forest on the standardised test features.
y_true = y_test
y_pred = clf.predict(X_test_scaled)
print(
    confusion_matrix(y_true, y_pred),
    classification_report(y_true, y_pred),
    sep="\n",
)

[[1960  391]
 [1264 1087]]
              precision    recall  f1-score   support

           0       0.61      0.83      0.70      2351
           1       0.74      0.46      0.57      2351

    accuracy                           0.65      4702
   macro avg       0.67      0.65      0.64      4702
weighted avg       0.67      0.65      0.64      4702



### Post scaled run analysis: After scaling the data, the LR shows marked improvement in the model results (test accuracy rose from 0.565 to 0.760). The RF model shows only a slight improvement after scaling of the X values (test accuracy 0.647 to 0.648). The LR improved in precision, recall and specificity.