In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the two loan files: 2019loans.csv is used for training, 2020Q1loans.csv for testing.
resources_dir = Path('Resources')
train_df = pd.read_csv(resources_dir / '2019loans.csv')
test_df = pd.read_csv(resources_dir / '2020Q1loans.csv')

In [3]:
# Encode the training data: one-hot encode the categorical features and turn the
# text 'target' label into a single numeric column.

# Non-numeric columns, identified by opening the CSV and inspecting it by hand.
categorical_columns = [
    'home_ownership', 'verification_status', 'pymnt_plan', 'initial_list_status',
    'application_type', 'hardship_flag', 'debt_settlement_flag',
]
train_binary_encoded = pd.get_dummies(train_df, columns=categorical_columns)

# 'target' goes through an explicit lookup instead of get_dummies() so that it stays a
# single 0/1 column rather than two dummy columns. A label missing from the lookup
# raises KeyError.
target_num = {
    "low_risk": 0,
    "high_risk": 1
}
train_binary_encoded['target_num'] = [target_num[label] for label in train_df['target']]

# Remove the original text 'target' column, and also 'hardship_flag_Y':
# get_dummies() does not create a 'hardship_flag_Y' column for the test data, and both
# frames need the same columns for model fitting and scoring. Because 'hardship_flag_Y'
# and 'hardship_flag_N' are simple inverses, dropping 'hardship_flag_Y' loses no information.
columns = ['target', 'hardship_flag_Y']
train_binary_encoded = train_binary_encoded.drop(columns=columns)

train_binary_encoded.head()
               

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N,target_num
0,1500.0,0.1102,49.13,30000.0,23.04,0.0,0.0,5.0,0.0,3575.0,...,0,0,1,0,1,1,0,1,1,0
1,40000.0,0.139,928.66,88000.0,27.26,0.0,0.0,18.0,0.0,33951.0,...,0,1,1,0,1,1,0,1,1,0
2,16000.0,0.1033,518.76,75000.0,9.9,0.0,0.0,6.0,1.0,1601.0,...,1,0,1,0,1,1,0,1,1,0
3,7000.0,0.0756,217.94,75000.0,14.5,0.0,0.0,6.0,0.0,8036.0,...,1,0,1,0,1,1,0,1,1,0
4,35000.0,0.1774,883.83,135200.0,33.98,0.0,0.0,25.0,0.0,72360.0,...,1,0,1,0,1,1,0,1,1,0


In [4]:
# Encode the test data the same way as the training data (one-hot encode the categorical
# features, map the text 'target' label to 0/1), then align its columns to the training frame.
test_binary_encoded = pd.get_dummies(test_df, columns=['home_ownership', 'verification_status',
                                                         'pymnt_plan', 'initial_list_status',
                                                         'application_type','hardship_flag',
                                                         'debt_settlement_flag'])

target_num = {
    "low_risk": 0,
    "high_risk": 1
}
# Explicit lookup keeps the target as one 0/1 column; an unknown label raises KeyError.
test_binary_encoded['target_num'] = test_df['target'].apply(lambda x: target_num[x])

test_binary_encoded = test_binary_encoded.drop(columns='target')

# get_dummies() only creates columns for categories that actually occur in the frame it is
# given, so the two files can end up with different dummy columns. Reindexing onto the
# training frame's columns guarantees the same features in the same order: a dummy column
# missing from the test data is added as all zeros (that category never occurs here), and a
# column the training data never had is dropped (the model has no coefficient for it).
test_binary_encoded = test_binary_encoded.reindex(
    columns=train_binary_encoded.columns, fill_value=0
)

test_binary_encoded.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N,target_num
0,15000.0,0.0646,459.47,86000.0,33.78,0.0,0.0,14.0,0.0,19821.0,...,0,0,1,0,1,1,0,1,1,0
1,24000.0,0.1695,595.82,80000.0,26.7,0.0,0.0,21.0,0.0,39993.0,...,0,0,1,0,1,0,1,1,1,0
2,35000.0,0.0819,712.87,134000.0,19.61,0.0,0.0,8.0,0.0,34707.0,...,1,0,1,0,1,1,0,1,1,0
3,12000.0,0.1774,432.27,65000.0,11.27,0.0,0.0,11.0,0.0,12970.0,...,0,0,1,0,1,1,0,1,1,0
4,18475.0,0.288,584.19,95000.0,53.56,0.0,0.0,15.0,0.0,34968.0,...,1,0,1,0,1,0,1,1,1,0


## Predictions: Linear Regression vs Random Forest Classification
I expect that a simple linear regression will not suit this dataset for two reasons: there are multiple inputs affecting the risk of the loan, and that risk is a binary low or high risk classification rather than a continuous function that a regression would typically be used for.  The random forest classification will likely give better results. 

In [5]:
# Fit a LinearRegression model on the unscaled data and print its scores.
# Note: for a regressor, .score() returns R^2 (the coefficient of determination), not
# classification accuracy, so these numbers are not directly comparable with the
# RandomForestClassifier scores. R^2 can be arbitrarily negative on unseen data.
# NOTE(review): for a binary target, a classifier such as LogisticRegression would be the
# usual linear baseline -- confirm whether LinearRegression is intended here.

# Split each encoded frame into the numeric target (y) and the remaining feature columns (X).
y_train = train_binary_encoded['target_num']
X_train = train_binary_encoded.drop(columns='target_num')
y_test = test_binary_encoded['target_num']
X_test = test_binary_encoded.drop(columns='target_num')

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# R^2 on the data the model was fitted to, and on the held-out test data.
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")


Training Score: 0.1634934365560965
Testing Score: -5507501464.9423


In [6]:
# Fit a 500-tree RandomForestClassifier on the unscaled data and print its scores.
# For a classifier, .score() is the mean accuracy on the given data.
# random_state is fixed so that re-running the cell gives the same forest.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train, y_train)

print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')


Training Score: 1.0
Testing Score: 0.6661417322834645


## Results
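As expected, the linear regression did not work well with this dataset. I was surprised at just how poorly it performed, though: a score of only about 0.16 on the training data, and I'm not actually sure what a -5.5 billion testing score even means. (For `LinearRegression`, `score` reports R² rather than accuracy, so a negative value means the predictions fit the test data worse than simply predicting the mean of the target.) The Random Forest Classifier score of 100% on training data with only 66% on test data makes me suspect overfitting.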

## Predictions: Scaling
I expect scaling will marginally improve the results of both models; as far as I know, this is not a very large dataset for machine learning, and the output is a simple binary classification.

In [7]:
# Standardize the features.
# The scaler is fitted on the training features only; the test features are transformed
# with those same fitted statistics rather than being fitted separately.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [8]:
# Fit a LinearRegression model on the scaled data and print its scores.
# Note: for a regressor, .score() returns R^2 (the coefficient of determination), not
# classification accuracy. R^2 can be arbitrarily negative on unseen data.

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train_scaled, y_train)

# R^2 on the scaled training data, and on the scaled test data.
training_score = model.score(X_train_scaled, y_train)
testing_score = model.score(X_test_scaled, y_test)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.16349064603105823
Testing Score: -3.058533268096012e+29


In [9]:
# Fit a 500-tree RandomForestClassifier on the scaled data and print its scores.
# For a classifier, .score() is the mean accuracy on the given data.
# Same hyperparameters and seed as the unscaled run, so only the feature scaling differs.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_scaled, y_train)

print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6655511811023622


## Results
The linear regression training score was not improved by scaling; it was marginally lower (0.163491 vs. 0.163493 unscaled), and the testing score changed drastically, becoming even more negative. The Random Forest Classifier model still appears to be overfitted, and its testing score was actually very slightly lower with scaling. I'm not sure what to make of this.