In [20]:
from pathlib import Path
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.utils.random import sample_without_replacement
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier

In [21]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [22]:
train_df.shape

(12180, 86)

In [24]:
dependent_columns = ["loan_amnt", "int_rate", "installment", "home_ownership", "annual_inc", "verification_status", "loan_status", "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths", "open_acc", "pub_rec", "revol_bal", "total_acc", "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee", "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "collections_12_mths_ex_med", "policy_code", 
           "application_type", "acc_now_delinq", "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il", "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il", "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc", "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl", "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy", "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct", "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", 
           "mo_sin_rcnt_tl", "mort_acc", "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl", "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl", "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0", "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m", "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies", "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit", "total_il_high_credit_limit",
           "hardship_flag", "debt_settlement_flag"   
]
target = "loan_status"

In [25]:
# Convert categorical data to numeric and separate target feature for training data
x_train = train_df[dependent_columns]
y_train = train_df[target]
x_train_dummies= pd.get_dummies(x_train)
x_train_dummies.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,39728.0,...,1,1,0,1,1,0,1,0,1,0
1,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,9585.0,...,1,1,0,1,1,0,1,0,1,0
2,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,16708.0,...,1,1,0,1,1,0,1,0,1,0
3,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,8809.0,...,1,1,0,1,1,0,1,0,1,0
4,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,65420.0,...,1,1,0,1,1,0,1,0,1,0


In [26]:
# Convert categorical data to numeric and separate target feature for testing data
x_test_df = test_df[dependent_columns]
y_test_df = test_df[target]
x_test_dummies= pd.get_dummies(x_test_df)
x_test_dummies.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,loan_status_high_risk,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,0,1,1,0,1,1,0,1,0,1
1,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,0,1,1,0,1,1,0,1,0,1
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,0,1,1,0,1,1,0,1,0,1
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,0,1,1,0,1,1,0,1,0,1
4,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,0,1,1,0,1,1,0,1,0,1


In [27]:
# add missing dummy variables to testing set
for column in x_train_dummies:
    if column not in x_test_dummies:
        x_test_dummies [column] = 0
print (x_test_dummies.shape)
    

(4702, 94)


In [28]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
logistic_modle = classifier.fit(x_train_dummies, y_train)
print(f"Testing Data Score: {classifier.score(x_test_dummies, y_test_df)}")

Testing Data Score: 0.5165886856656742


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier().fit(x_train_dummies, y_train)
print(f'Testing Score: {clf.score(x_test_dummies, y_test_df)}')

Testing Score: 1.0


In [30]:
# Scale the data
scaler = StandardScaler().fit(x_train_dummies)
X_train_scaled = scaler.transform(x_train_dummies)
X_test_scaled = scaler.transform(x_test_dummies)

In [31]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier = LogisticRegression()
logistic_modle = classifier.fit(X_train_scaled, y_train)
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test_df)}")

Testing Data Score: 0.9997873245427478


In [32]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf = RandomForestClassifier().fit(X_train_scaled, y_train)
print(f'Testing Score: {clf.score(X_test_scaled, y_test_df)}')

Testing Score: 1.0


How do the model scores compare to each other, and to the previous results on unscaled data?

Looks like the model for the linear regression got way better, but the random forest classifier remained spot on. From what I can tell, the random forest classifier did not benefit much from the scaling, whereas the linear regression did.

How does this compare to your prediction?

Looks like I was totally underestimating the strength of scaling the data would do to the linear regression!

Notes

There is some strangeness to how I can run this whole notebook and get some significantly different values to the ones I have.