In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Read the 2019 loans (training set) and the 2020 Q1 loans (testing set).
train_path = Path('Resources/Generator/2019loans.csv')
test_path = Path('Resources/Generator/2020Q1loans.csv')
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Preview the training data; the last column, "target", holds the risk label.
train_df

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,7000.0,0.1894,256.38,MORTGAGE,75000.0,Not Verified,n,28.62,0.0,2.0,...,87.5,0.0,0.0,352260.0,62666.0,35000.0,10000.0,N,N,low_risk
1,40000.0,0.1614,975.71,MORTGAGE,102000.0,Source Verified,n,11.72,2.0,0.0,...,0.0,0.0,0.0,294664.0,109911.0,9000.0,71044.0,N,N,low_risk
2,11000.0,0.2055,294.81,RENT,45000.0,Verified,n,37.25,1.0,3.0,...,7.7,0.0,0.0,92228.0,36007.0,33000.0,46328.0,N,N,low_risk
3,4000.0,0.1612,140.87,MORTGAGE,38000.0,Not Verified,n,42.89,1.0,0.0,...,100.0,0.0,0.0,284273.0,52236.0,13500.0,52017.0,N,N,low_risk
4,14000.0,0.1797,505.93,MORTGAGE,43000.0,Source Verified,n,22.16,1.0,0.0,...,25.0,0.0,0.0,120280.0,88147.0,33300.0,78680.0,N,N,low_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,19975.0,0.2565,801.09,RENT,28000.0,Not Verified,n,28.42,0.0,0.0,...,16.7,0.0,0.0,50055.0,28192.0,18700.0,19055.0,N,N,high_risk
12176,15000.0,0.1774,540.34,RENT,50000.0,Verified,n,23.43,4.0,0.0,...,11.1,0.0,0.0,70324.0,57025.0,13300.0,54824.0,N,N,high_risk
12177,3600.0,0.1862,131.28,RENT,60000.0,Not Verified,n,28.80,0.0,1.0,...,0.0,0.0,0.0,83765.0,55156.0,14800.0,53065.0,N,N,high_risk
12178,15000.0,0.0881,475.68,MORTGAGE,62000.0,Source Verified,n,11.44,0.0,0.0,...,0.0,0.0,0.0,189930.0,23748.0,7000.0,32930.0,N,N,high_risk


In [4]:
# Separate the "target" label column from the feature columns of each data set.
X_train, y_train = train_df.drop(columns=["target"]), train_df["target"]
X_test, y_test = test_df.drop(columns=["target"]), test_df["target"]


In [5]:
# One-hot encode the categorical training features
# (the target was already split off in the previous cell).
X_train = pd.get_dummies(data=X_train)

In [6]:
# One-hot encode the categorical testing features
# (the target was already split off in an earlier cell).
X_test = pd.get_dummies(data=X_test)

In [7]:
# Align the testing features with the training features.
# The two sets were one-hot encoded separately, so a category value that is
# absent from one set produces no dummy column there and the column sets differ.
train_cols = list(X_train.columns)
test_cols = list(X_test.columns)  # testing columns as they were before alignment
# Dummy columns present in training but absent from testing (may be empty)
test_col_set = set(test_cols)
missing_col = [col for col in train_cols if col not in test_col_set]
# Reindex to the training layout: every missing column is added filled with
# zeros, any column unknown to the training set is dropped, and the column
# order matches X_train. Unlike inserting `missing_col[0]`, this works for
# zero, one, or many missing columns and is safe to re-run.
X_test = X_test.reindex(columns=train_cols, fill_value=0)

In [8]:
# Show the testing column names as captured before the testing set was
# brought in line with the training columns.
test_cols

['loan_amnt',
 'int_rate',
 'installment',
 'annual_inc',
 'dti',
 'delinq_2yrs',
 'inq_last_6mths',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'total_acc',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_amnt',
 'collections_12_mths_ex_med',
 'policy_code',
 'acc_now_delinq',
 'tot_coll_amt',
 'tot_cur_bal',
 'open_acc_6m',
 'open_act_il',
 'open_il_12m',
 'open_il_24m',
 'mths_since_rcnt_il',
 'total_bal_il',
 'il_util',
 'open_rv_12m',
 'open_rv_24m',
 'max_bal_bc',
 'all_util',
 'total_rev_hi_lim',
 'inq_fi',
 'total_cu_tl',
 'inq_last_12m',
 'acc_open_past_24mths',
 'avg_cur_bal',
 'bc_open_to_buy',
 'bc_util',
 'chargeoff_within_12_mths',
 'delinq_amnt',
 'mo_sin_old_il_acct',
 'mo_sin_old_rev_tl_op',
 'mo_sin_rcnt_rev_tl_op',
 'mo_sin_rcnt_tl',
 'mort_acc',
 'mths_since_recent_bc',
 'mths_since_recent_inq',
 'num_accts_ever_120_pd',
 'num_actv_bc_

In [9]:
# Both feature matrices should now report the same number of columns.
for feature_frame in (X_train, X_test):
    print(feature_frame.shape)

(12180, 92)
(4702, 92)


I predict that the Random Forest Model will more accurately predict the classifications.
From my experience and reading it is generally more accurate. 

In [10]:
# Fit a default Logistic Regression on the raw (unscaled) features, then report
# its accuracy on the training and testing sets.
from sklearn.linear_model import LogisticRegression

logistic_regression_model = LogisticRegression().fit(X_train, y_train)

print(
    f"Unscaled Logistic Regression Training Score: {logistic_regression_model.score(X_train, y_train)}",
    f"Unscaled Logistic Regression Testing Score: {logistic_regression_model.score(X_test, y_test)}",
    sep="\n",
)

Unscaled Logistic Regression Training Score: 0.65311986863711
Unscaled Logistic Regression Testing Score: 0.5072309655465759


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [11]:
# Fit a 500-tree Random Forest on the raw (unscaled) features; the fixed seed
# keeps the result reproducible.
from sklearn.ensemble import RandomForestClassifier

random_forest_classifier = RandomForestClassifier(
    n_estimators=500,
    random_state=1,
).fit(X_train, y_train)

In [12]:
# Report Random Forest accuracy on the training and testing sets (unscaled data)
print(
    f'Unscaled Random Forest Training Score: {random_forest_classifier.score(X_train, y_train)}',
    f'Unscaled Random Forest Testing Score: {random_forest_classifier.score(X_test, y_test)}',
    sep="\n",
)

Unscaled Random Forest Training Score: 1.0
Unscaled Random Forest Testing Score: 0.646958740961293


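As I predicted, the Random Forest model was more accurate on the testing data (about 0.65, versus about 0.51 for Logistic Regression). A testing score of 0.65 still leaves plenty of room for improvement, and the perfect training score of 1.0 suggests the Random Forest is overfitting the training data. 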

After scaling the data I think that both models will improve significantly. My hope is that they'll improve by 10-20% each. 

In [13]:
# Standardise the features. The scaler learns its statistics from the training
# set only and then applies that same transformation to the testing set.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Train the Logistic Regression model on the scaled data
# NOTE(review): the recorded output shows the solver still stops at its
# iteration limit on the scaled data, so the scores printed afterwards come
# from a model that has not fully converged. Consider passing a larger
# `max_iter` and re-running the notebook.
scaled_logistic_regression_model = LogisticRegression()
# Left as the cell's last expression so the fitted estimator is displayed.
scaled_logistic_regression_model.fit(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [15]:
# Report Logistic Regression accuracy on the scaled training and testing sets
print(
    f"Scaled Logistic Regression Training Score: {scaled_logistic_regression_model.score(X_train_scaled, y_train)}",
    f"Scaled Logistic Regression Testing Score: {scaled_logistic_regression_model.score(X_test_scaled, y_test)}",
    sep="\n",
)

Scaled Logistic Regression Training Score: 0.710919540229885
Scaled Logistic Regression Testing Score: 0.7598894087622289


In [20]:
# Fit a 500-tree Random Forest on the scaled features.
# NOTE: this seed (50) differs from the one used for the unscaled forest (1),
# so the two forests differ in random seed as well as in feature scaling.
scaled_random_forest_classifier = RandomForestClassifier(
    n_estimators=500,
    random_state=50,
).fit(X_train_scaled, y_train)


In [21]:
# Report Random Forest accuracy on the scaled training and testing sets
print(
    f'Scaled Random Forest Training Score: {scaled_random_forest_classifier.score(X_train_scaled, y_train)}',
    f'Scaled Random Forest Testing Score: {scaled_random_forest_classifier.score(X_test_scaled, y_test)}',
    sep="\n",
)

Scaled Random Forest Training Score: 1.0
Scaled Random Forest Testing Score: 0.6471714164185453


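The Logistic Regression testing score improved from about 0.51 to about 0.76 after scaling the data, a gain of roughly 25 percentage points, which was slightly above my expectations. However, the Random Forest testing score barely changed (about 0.647 both before and after scaling). This makes sense: decision trees split on feature thresholds, so rescaling the features does not change which splits are available. I tried different random states for the Random Forest model and got similar results. 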