In [60]:
import numpy as np
import pandas as pd
from pathlib import Path

In [61]:
# Load the 2019 loans as the training set and the 2020 Q1 loans as the testing set
train_csv_path = Path('Resources/2019loans.csv')
test_csv_path = Path('Resources/2020Q1loans.csv')
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [62]:
# Remove the leftover index column and its unnamed copy from both datasets
index_like_columns = ['Unnamed: 0', 'index']
train_df = train_df.drop(columns=index_like_columns)
test_df = test_df.drop(columns=index_like_columns)

In [63]:
# One-hot encode the categorical columns of the 2019 training data
categorical_columns_2019 = [
    'home_ownership',
    'verification_status',
    'loan_status',
    'pymnt_plan',
    'initial_list_status',
    'application_type',
    'hardship_flag',
    'debt_settlement_flag',
]
train_df_numeric = pd.get_dummies(train_df, columns=categorical_columns_2019)

# Split off the target: the high-risk indicator is the label, and both
# loan_status dummy columns are removed from the feature matrix
loan_status_dummies = ['loan_status_high_risk', 'loan_status_low_risk']
y_train = train_df_numeric['loan_status_high_risk']
X_train = train_df_numeric.drop(columns=loan_status_dummies)

In [64]:
# Preview the first five rows of the encoded training data
train_df_numeric.head(n=5)

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,39728.0,...,1,1,0,1,1,0,1,0,1,0
1,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,9585.0,...,1,1,0,1,1,0,1,0,1,0
2,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,16708.0,...,1,1,0,1,1,0,1,0,1,0
3,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,8809.0,...,1,1,0,1,1,0,1,0,1,0
4,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,65420.0,...,1,1,0,1,1,0,1,0,1,0


In [65]:
# One-hot encode the categorical columns of the 2020 Q1 testing data
categorical_columns_2020 = [
    'home_ownership',
    'verification_status',
    'loan_status',
    'pymnt_plan',
    'initial_list_status',
    'application_type',
    'hardship_flag',
    'debt_settlement_flag',
]
test_df_numeric = pd.get_dummies(test_df, columns=categorical_columns_2020)

# Align the testing columns with the training columns instead of hard-coding
# one missing dummy. Any category absent from the 2020 data (here
# debt_settlement_flag=Y) is added as an all-zero column, and the column
# order is guaranteed to match what the models are fitted on.
test_df_numeric = test_df_numeric.reindex(columns=train_df_numeric.columns, fill_value=0)

# Separate target feature for testing data
X_test = test_df_numeric.drop(columns=['loan_status_high_risk', 'loan_status_low_risk'])
y_test = test_df_numeric['loan_status_high_risk']

In [66]:
# Preview the first five rows of the encoded testing data
test_df_numeric.head(n=5)

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,1,1,0,1,1,0,1,0,1,0
1,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,1,1,0,1,1,0,1,0,1,0
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,1,1,0,1,1,0,1,0,1,0
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,1,1,0,1,1,0,1,0,1,0
4,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,1,1,0,1,1,0,1,0,1,0


In [67]:
# Fit a logistic regression on the unscaled features and report its score on both sets
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=1, max_iter=4000)
clf.fit(X_train, y_train)

training_score = clf.score(X_train, y_train)
testing_score = clf.score(X_test, y_test)
print(f'Training Score: {training_score}')
print(f'Testing Score: {testing_score}')

Training Score: 0.6954022988505747
Testing Score: 0.5748617609527861


In [68]:
# Fit a random forest on the unscaled features and report its score on both sets
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=1)
clf.fit(X_train, y_train)

training_score = clf.score(X_train, y_train)
testing_score = clf.score(X_test, y_test)
print(f'Training Score: {training_score}')
print(f'Testing Score: {testing_score}')

Training Score: 1.0
Testing Score: 0.6544023819651212


In [69]:
# Standardize the features; the scaler is fitted on the training data only
# and then applied unchanged to both the training and the testing features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_test)
)

In [70]:
# Fit a logistic regression on the standardized features and report its score on both sets
clf = LogisticRegression(random_state=1, max_iter=500)
clf.fit(X_train_scaled, y_train)

training_score = clf.score(X_train_scaled, y_train)
testing_score = clf.score(X_test_scaled, y_test)
print(f'Training Score: {training_score}')
print(f'Testing Score: {testing_score}')

Training Score: 0.7078817733990148
Testing Score: 0.7677584006805614


In [71]:
# Fit a random forest on the standardized features and report its score on both sets
clf = RandomForestClassifier(random_state=1)
clf.fit(X_train_scaled, y_train)

training_score = clf.score(X_train_scaled, y_train)
testing_score = clf.score(X_test_scaled, y_test)
print(f'Training Score: {training_score}')
print(f'Testing Score: {testing_score}')

Training Score: 1.0
Testing Score: 0.6552530837941302


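Logistic regression with scaled data predicts more accurately and is not overfitting the data: its testing score (0.768) is higher than its training score (0.708). The testing score for scaled logistic regression is also higher than the testing score for the random forest classifier (0.655). Scaling the data improved both the testing and the training score for logistic regression but had essentially no effect on the RandomForest classifier results (training score 1.0 in both cases; testing score 0.654 unscaled vs. 0.655 scaled), which points to the random forest overfitting the training data.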