In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
# Convert categorical data to numeric and separate target feature for training data
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [4]:
new_train_df = train_df.drop('loan_status', axis=1)
new_train_df

train_dummies = pd.get_dummies(new_train_df)
# print(train_dummies.columns)
train_dummies

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,321143,20000.0,0.1240,448.95,197000.0,11.28,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,11778,3000.0,0.1240,100.22,45000.0,18.08,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,...,0,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,354912,19975.0,0.2565,801.09,28000.0,28.42,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
12176,354944,354944,15000.0,0.1774,540.34,50000.0,23.43,4.0,0.0,16.0,...,1,1,0,1,1,0,1,0,1,0
12177,354973,354973,3600.0,0.1862,131.28,60000.0,28.80,0.0,1.0,14.0,...,0,1,0,1,1,0,1,0,1,0
12178,355002,355002,15000.0,0.0881,475.68,62000.0,11.44,0.0,0.0,5.0,...,0,1,0,1,0,1,1,0,1,0


In [5]:
# Converting output labels to 0 and 1
ytrain_label = LabelEncoder().fit_transform(train_df['loan_status'])
ytrain_label

array([1, 1, 1, ..., 0, 0, 0])

In [6]:
# Convert categorical data to numeric and separate target feature for testing data
new_test_df = train_df.drop('loan_status', axis=1)
new_test_df

test_dummies = pd.get_dummies(new_test_df)
# print(test_dummies.columns)
test_dummies

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,321143,20000.0,0.1240,448.95,197000.0,11.28,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,11778,3000.0,0.1240,100.22,45000.0,18.08,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,...,0,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,354912,19975.0,0.2565,801.09,28000.0,28.42,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
12176,354944,354944,15000.0,0.1774,540.34,50000.0,23.43,4.0,0.0,16.0,...,1,1,0,1,1,0,1,0,1,0
12177,354973,354973,3600.0,0.1862,131.28,60000.0,28.80,0.0,1.0,14.0,...,0,1,0,1,1,0,1,0,1,0
12178,355002,355002,15000.0,0.0881,475.68,62000.0,11.44,0.0,0.0,5.0,...,0,1,0,1,0,1,1,0,1,0


In [7]:
ytest_label = LabelEncoder().fit_transform(test_df['loan_status'])
ytest_label

array([1, 1, 1, ..., 0, 0, 0])

In [8]:
# add missing dummy variables to testing set

final_test = test_dummies.reindex(columns = train_dummies.columns, fill_value=0)
print(len(final_test))
print(len(train_dummies))

12180
12180


In [None]:
# I predict the random forest classifiier model will perform better because it is more suited towards heavy categorical data.  

In [9]:
X_train, X_test, y_train, y_test = train_test_split(train_dummies, ytrain_label, random_state=1)

In [10]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

classifier.fit(X_train, y_train)

print(f"The unscaled training score: {classifier.score(X_train, y_train)}")
print(f"The unscaled testing score: {classifier.score(X_test, y_test)}")

The unscaled training score: 0.655719759168035
The unscaled testing score: 0.6482758620689655


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=1, n_estimators=50).fit(X_train, y_train)
print(f'The unscaled training score: {clf.score(X_train, y_train)}')
print(f'The unscaled t score: {clf.score(X_test, y_test)}')


The unscaled training score: 1.0
The unscaled t score: 0.7796387520525452


In [None]:
# As predicted, the random forest classifier scored alot stronger than logistic regression.
# Although my training score being 1.0 is a red flag, my testing score came out more in line with what I was expecting compared to LR.


In [None]:
# Scaling the data should improve the scores of both of my models.
# Looking at the data source, there is a lot of range in numbers that need to be accounted for
# For example, annual incomes in the hundreds of thousands and interest rates being small decimals should not be treated equally.

In [13]:
# Scale the data

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Train the Logistic Regression model on the scaled data and print the model score

classifier = LogisticRegression()

classifier.fit(X_train_scaled, y_train)

print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.7056376573617953
Testing Data Score: 0.7155993431855501


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Train a Random Forest Classifier model on the scaled data and print the model score

clf = RandomForestClassifier(random_state=1, n_estimators=100).fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')


Training Score: 1.0
Testing Score: 0.7865353037766831


In [20]:
# As predicted, the scaled results are significantly stronger than unscaled.  
# Unscaled scores are essentially 50/50 while the scaled scores being around .8 are much more reliable for predictions.

print('For unscaled linear Regression')
print(f"The training score: {classifier.score(X_train, y_train)}")
print(f"The testing score: {classifier.score(X_test, y_test)}")
print('----------')

print('Unscaled Random Forest Classifier')
print(f'The training score: {clf.score(X_train, y_train)}')
print(f'The testing score: {clf.score(X_test, y_test)}')
print('----------')

print('Scaled LR')
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")
print('----------')

print('Scaled random forest classifier')
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

For unscaled linear Regression
The training score: 0.5087027914614122
The testing score: 0.496551724137931
----------
Unscaled Random Forest Classifier
The training score: 0.5346469622331691
The testing score: 0.5385878489326765
----------
Scaled LR
Training Data Score: 0.7056376573617953
Testing Data Score: 0.7155993431855501
----------
Scaled random forest classifier
Training Score: 1.0
Testing Score: 0.7865353037766831
