In [25]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import warnings
# Suppress every Python warning for the rest of the notebook.
# NOTE(review): this presumably also hides any solver convergence warnings
# raised by the models fitted below -- confirm that silencing them is intended.
warnings.filterwarnings('ignore')

In [2]:
# Read the 2019 loans file as the training set and the 2020 Q1 loans file as
# the testing set, then preview the first rows of the training data.
train_df, test_df = (
    pd.read_csv(Path(csv_file))
    for csv_file in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)
train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,7000.0,0.1894,256.38,MORTGAGE,75000.0,Not Verified,n,28.62,0.0,2.0,...,87.5,0.0,0.0,352260.0,62666.0,35000.0,10000.0,N,N,low_risk
1,40000.0,0.1614,975.71,MORTGAGE,102000.0,Source Verified,n,11.72,2.0,0.0,...,0.0,0.0,0.0,294664.0,109911.0,9000.0,71044.0,N,N,low_risk
2,11000.0,0.2055,294.81,RENT,45000.0,Verified,n,37.25,1.0,3.0,...,7.7,0.0,0.0,92228.0,36007.0,33000.0,46328.0,N,N,low_risk
3,4000.0,0.1612,140.87,MORTGAGE,38000.0,Not Verified,n,42.89,1.0,0.0,...,100.0,0.0,0.0,284273.0,52236.0,13500.0,52017.0,N,N,low_risk
4,14000.0,0.1797,505.93,MORTGAGE,43000.0,Source Verified,n,22.16,1.0,0.0,...,25.0,0.0,0.0,120280.0,88147.0,33300.0,78680.0,N,N,low_risk


In [5]:
# Separate the 'target' label from the features for both the training and the
# testing data, and one-hot encode the categorical feature columns so that
# every feature is numeric.
y_train = train_df['target']
y_test = test_df['target']
x_train = pd.get_dummies(train_df.drop(columns='target'))
x_test = pd.get_dummies(test_df.drop(columns='target'))

In [6]:
# Preview the first two rows of the encoded training features
x_train.head(n=2)

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,0,1,1,0,1,0,1,0,1,0
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0,1,0,1,1,0,1,0,1,0


In [7]:
# Preview the first two training labels
y_train.head(n=2)

0    low_risk
1    low_risk
Name: target, dtype: object

In [8]:
# Preview the first two rows of the encoded testing features
x_test.head(n=2)

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,0,1,1,0,0,1,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,0,1,0,1,1,0,1,0,1


In [9]:
# Preview the first two testing labels
y_test.head(n=2)

0    low_risk
1    low_risk
Name: target, dtype: object

In [10]:
# Align the testing features with the training features.
# get_dummies only creates an indicator column for categories that actually
# appear in a frame, so the two encoded frames can end up with different
# columns. The models need the testing data to have exactly the training
# columns, in the training order.
missing_columns = [column for column in x_train.columns if column not in x_test.columns]
extra_columns = [column for column in x_test.columns if column not in x_train.columns]

for column in missing_columns:
    print(f'Added dummy column {column} to testing data set')
for column in extra_columns:
    print(f'Dropped column {column} from testing data set (not present in training data)')

# reindex builds a new frame instead of mutating x_test column by column:
# missing dummies are added filled with 0, columns the models were never
# trained on are dropped, and the column order is made identical to x_train.
# Re-running this cell is a no-op once the frames are aligned.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

Added dummy column debt_settlement_flag_Y to testing data set


## Prediction - unscaled data
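I predict that the Random Forest Classifier will perform better here. Tree-based models split on one feature at a time, so they are largely insensitive to feature scale, whereas Logistic Regression's solver can struggle to converge when features span very different ranges (for example, `annual_inc` in the tens of thousands next to `int_rate` below 1).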

In [20]:
# Fit a default Logistic Regression on the raw (unscaled) features, then
# report its mean accuracy on the testing data.
# fit() returns the fitted estimator, so construction and training chain.
lc = LogisticRegression().fit(x_train, y_train)
lc.score(x_test, y_test)

0.5102084219481072

In [21]:
# Train a Random Forest Classifier on the raw (unscaled) features and print
# its mean accuracy on the testing data.
# random_state is fixed so the score is reproducible on Restart & Run All;
# without it the bootstrap samples and feature subsets differ on every run,
# so the printed score would change from run to run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(x_train, y_train)
rfc.score(x_test, y_test)

0.6410038281582305

## Analysis - unscaled data, results
The Random Forest Classifier scored significantly higher than the Logistic Regression (64% compared to 51%). This matched my prediction. I did not expect the gap between the scores to be quite this wide - the Logistic Regression barely broke a 50% score!

In [26]:
# Standardize the features.
# The scaler is fitted on the training data only and then applied to both
# sets, so the testing data is transformed with the training statistics.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [27]:
# Preview the first two rows of the scaled training features
# (wrapped in a DataFrame for a readable table; columns are positional)
pd.DataFrame(data=x_train_scaled).head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,82,83,84,85,86,87,88,89,90,91
0,-1.008699,0.91962,-0.85042,-0.104831,0.230684,-0.32162,1.758968,1.217675,-0.348464,0.997756,...,-0.435396,0.0,3.323775,-3.323775,0.417647,-0.417647,0.16843,-0.16843,0.020265,-0.020265
1,2.224099,0.387179,1.602612,0.109872,-0.437176,2.255569,-0.694141,-0.436716,-0.348464,1.133413,...,-0.435396,0.0,-0.300863,0.300863,0.417647,-0.417647,0.16843,-0.16843,0.020265,-0.020265


In [28]:
# Preview the first two rows of the scaled testing features
# (wrapped in a DataFrame for a readable table; columns are positional)
pd.DataFrame(data=x_test_scaled).head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,82,83,84,85,86,87,88,89,90,91
0,2.224099,-0.717637,1.195746,0.322189,-0.407538,-0.32162,0.532413,-0.767594,-0.348464,0.897613,...,-0.435396,0.0,-0.300863,0.300863,0.417647,-0.417647,-5.937171,5.937171,0.020265,-0.020265
1,0.700765,0.037289,0.228351,-0.346777,-0.30558,-0.32162,0.532413,-1.098472,-0.348464,-0.688659,...,-0.435396,0.0,-0.300863,0.300863,0.417647,-0.417647,0.16843,-0.16843,0.020265,-0.020265


## Prediction - scaled data
I predict that scaling the data will increase the Logistic Regression score and will not significantly impact the Random Forest Classifier score. As for which model will perform better with the scaled data, that is more difficult to predict since I think the two models will be competing on more even ground. Even though the Random Forest Classifier scored much higher than the Logistic Regression on the unscaled data, it did not get what I would consider a stellar score - only about 64%. Since I expect the Logistic Regression to improve substantially while the Random Forest Classifier stays about the same, I believe the Logistic Regression will get the higher score this time. *crosses fingers*

In [29]:
# Fit a default Logistic Regression on the standardized features, then
# report its mean accuracy on the standardized testing data.
# fit() returns the fitted estimator, so construction and training chain.
lc2 = LogisticRegression().fit(x_train_scaled, y_train)
lc2.score(x_test_scaled, y_test)

0.7598894087622289

In [30]:
# Train a Random Forest Classifier on the standardized features and print
# its mean accuracy on the standardized testing data.
# random_state is fixed so the score is reproducible on Restart & Run All.
# With an unseeded forest, run-to-run randomness is mixed into any
# comparison against a forest trained on the unscaled data.
rfc2 = RandomForestClassifier(random_state=42)
rfc2.fit(x_train_scaled, y_train)
rfc2.score(x_test_scaled, y_test)

0.6420672054444917

## Analysis - scaled data, results
The Logistic Regression performed better than the Random Forest Classifier while using the scaled data (76% to 64%). Also, the Random Forest Classifier got almost the exact same score with scaled and unscaled data. While this confirms my prediction, I am surprised at how much the Logistic Regression model improved! From 51% to 76% - quite a jump!