In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
train_df = pd.read_csv(Path("Resources/2019loans.csv"))
test_df = pd.read_csv(Path("Resources/2020Q1loans.csv"))

In [3]:
# Convert categorical data to numeric and separate target feature for training data
train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,25000.0,0.2,662.35,RENT,45000.0,Verified,n,22.4,2.0,0.0,...,25.0,0.0,0.0,41532.0,12941.0,18000.0,20632.0,N,N,low_risk
1,35000.0,0.1862,900.62,MORTGAGE,130000.0,Not Verified,n,17.71,0.0,0.0,...,88.9,0.0,0.0,401978.0,69537.0,65200.0,19056.0,N,N,low_risk
2,6400.0,0.0881,202.96,RENT,70000.0,Not Verified,n,3.27,0.0,0.0,...,0.0,1.0,0.0,23400.0,3571.0,15800.0,4000.0,N,N,low_risk
3,7000.0,0.2,260.15,RENT,65000.0,Verified,n,15.66,0.0,0.0,...,50.0,0.0,0.0,64853.0,53589.0,5300.0,57453.0,N,N,low_risk
4,8500.0,0.143,291.75,OWN,41000.0,Not Verified,n,24.18,0.0,0.0,...,25.0,1.0,0.0,37469.0,19623.0,16000.0,18169.0,N,N,low_risk


In [4]:
a_train = train_df["target"]
b_train = train_df.drop(columns = ["target"])
b_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,25000.0,0.2,662.35,RENT,45000.0,Verified,n,22.4,2.0,0.0,...,81.0,25.0,0.0,0.0,41532.0,12941.0,18000.0,20632.0,N,N
1,35000.0,0.1862,900.62,MORTGAGE,130000.0,Not Verified,n,17.71,0.0,0.0,...,100.0,88.9,0.0,0.0,401978.0,69537.0,65200.0,19056.0,N,N
2,6400.0,0.0881,202.96,RENT,70000.0,Not Verified,n,3.27,0.0,0.0,...,100.0,0.0,1.0,0.0,23400.0,3571.0,15800.0,4000.0,N,N
3,7000.0,0.2,260.15,RENT,65000.0,Verified,n,15.66,0.0,0.0,...,100.0,50.0,0.0,0.0,64853.0,53589.0,5300.0,57453.0,N,N
4,8500.0,0.143,291.75,OWN,41000.0,Not Verified,n,24.18,0.0,0.0,...,100.0,25.0,1.0,0.0,37469.0,19623.0,16000.0,18169.0,N,N


In [5]:
#get dummies 
b_train_dummy = pd.get_dummies(b_train)
b_train_dummy.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,25000.0,0.2,662.35,45000.0,22.4,2.0,0.0,9.0,0.0,6637.0,...,0,1,1,0,1,0,1,1,0,1
1,35000.0,0.1862,900.62,130000.0,17.71,0.0,0.0,14.0,0.0,58513.0,...,0,0,1,0,1,1,0,1,0,1
2,6400.0,0.0881,202.96,70000.0,3.27,0.0,0.0,6.0,1.0,1238.0,...,0,0,1,0,1,1,0,1,0,1
3,7000.0,0.2,260.15,65000.0,15.66,0.0,0.0,12.0,0.0,4578.0,...,0,1,1,0,1,0,1,1,0,1
4,8500.0,0.143,291.75,41000.0,24.18,0.0,0.0,9.0,1.0,6540.0,...,0,0,1,0,1,1,0,1,0,1


In [6]:
print(b_train_dummy.columns)

Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_in

In [7]:
# Change labels to 0 and 1
a_train_label = LabelEncoder().fit_transform(train_df['target'])
a_train_label

array([1, 1, 1, ..., 0, 0, 0])

In [8]:
# Convert categorical data to numeric and separate target feature for testing data

In [9]:
a_test = test_df["target"]
b_test = test_df.drop(columns = ["target"])
b_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,8000.0,0.0819,251.4,MORTGAGE,53000.0,Not Verified,n,30.87,0.0,1.0,...,100.0,0.0,0.0,0.0,214538.0,52345.0,500.0,67066.0,N,N
1,30000.0,0.1102,652.58,MORTGAGE,120000.0,Not Verified,n,21.53,0.0,0.0,...,100.0,0.0,0.0,0.0,554901.0,101234.0,42500.0,119678.0,N,N
2,16000.0,0.0819,325.88,MORTGAGE,95000.0,Verified,n,23.37,0.0,1.0,...,100.0,42.9,1.0,0.0,292025.0,76609.0,29900.0,62902.0,N,N
3,3000.0,0.1524,104.35,MORTGAGE,50000.0,Not Verified,n,22.3,0.0,0.0,...,89.2,75.0,0.0,0.0,472470.0,219678.0,48100.0,171741.0,N,N
4,10000.0,0.2305,282.2,OWN,34000.0,Not Verified,n,5.58,0.0,2.0,...,100.0,0.0,0.0,0.0,14729.0,5309.0,12800.0,1929.0,N,N


In [10]:
#get dummies 
b_test_dummy = pd.get_dummies(b_test)
b_test_dummy.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N
0,8000.0,0.0819,251.4,53000.0,30.87,0.0,1.0,11.0,0.0,13802.0,...,1,0,0,1,0,1,0,1,1,1
1,30000.0,0.1102,652.58,120000.0,21.53,0.0,0.0,13.0,0.0,19427.0,...,1,0,0,1,0,1,0,1,1,1
2,16000.0,0.0819,325.88,95000.0,23.37,0.0,1.0,16.0,1.0,20786.0,...,0,0,1,1,0,1,1,0,1,1
3,3000.0,0.1524,104.35,50000.0,22.3,0.0,0.0,14.0,0.0,30125.0,...,1,0,0,1,0,1,1,0,1,1
4,10000.0,0.2305,282.2,34000.0,5.58,0.0,2.0,9.0,0.0,4166.0,...,1,0,0,1,0,1,1,0,1,1


In [11]:
b_test_dummy.columns

Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_in

In [12]:
# Change labels to 0 and 1
a_test_labels = LabelEncoder().fit_transform(test_df['target'])
a_test_labels

array([1, 1, 1, ..., 0, 0, 0])

In [13]:
# add missing dummy variables to testing set
missing_dummy = set(b_train_dummy.columns ) - set(b_test_dummy.columns )

for c in missing_dummy:
    b_test_dummy[c] = 0
    
b_test_dummy = b_test_dummy[b_train_dummy.columns]
b_test_dummy

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,8000.0,0.0819,251.40,53000.0,30.87,0.0,1.0,11.0,0.0,13802.0,...,0,0,1,0,1,0,1,1,0,1
1,30000.0,0.1102,652.58,120000.0,21.53,0.0,0.0,13.0,0.0,19427.0,...,0,0,1,0,1,0,1,1,0,1
2,16000.0,0.0819,325.88,95000.0,23.37,0.0,1.0,16.0,1.0,20786.0,...,0,1,1,0,1,1,0,1,0,1
3,3000.0,0.1524,104.35,50000.0,22.30,0.0,0.0,14.0,0.0,30125.0,...,0,0,1,0,1,1,0,1,0,1
4,10000.0,0.2305,282.20,34000.0,5.58,0.0,2.0,9.0,0.0,4166.0,...,0,0,1,0,1,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13519,30000.0,0.2055,1123.34,180000.0,12.06,0.0,0.0,8.0,0.0,4771.0,...,1,0,1,1,0,1,0,1,0,1
13520,17000.0,0.1524,591.32,240000.0,15.88,0.0,2.0,15.0,0.0,4796.0,...,1,0,1,0,1,1,0,1,0,1
13521,25000.0,0.2565,1002.62,60000.0,22.44,0.0,0.0,12.0,0.0,10979.0,...,1,0,1,1,0,1,0,1,0,1
13522,25000.0,0.1862,911.61,160000.0,10.12,0.0,3.0,9.0,1.0,4233.0,...,0,0,1,1,0,0,1,1,0,1


# Compare the following models LogisticRegression vs RandomForestClassifier
## Before Scaling 
Random forest tends to be the ideal candidate for estimating datasets that are more longitudinal in nature with more complexity and randomness, and usually scale and preprocessing the data does not tend to affect its results, so my guess is it will perform better in this case.

In [14]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(b_train_dummy,a_train_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [15]:
print(f"Training Data Score: {classifier.score(b_train_dummy,a_train_label)}")
print(f"Testing Data Score: {classifier.score(b_test_dummy,a_test_labels)}")

Training Data Score: 0.5981791697013839
Testing Data Score: 0.4927536231884058


In [16]:
# Train a Random Forest Classifier model and print the model score
forest_classifier = RandomForestClassifier(random_state=42, n_estimators=200)
forest_classifier.fit(b_train_dummy,a_train_label)

RandomForestClassifier(n_estimators=200, random_state=42)

In [17]:
print(f"Training Data Score: {forest_classifier.score(b_train_dummy,a_train_label)}")
print(f"Testing Data Score: {forest_classifier.score(b_test_dummy,a_test_labels)}")

Training Data Score: 1.0
Testing Data Score: 0.6346495119787046


## Results for Unscaled Data 

RandomForest achieved a perfect score for the training data, and a moderate 0.63 for the testing data, outperforming Logistic Regression by quite a significant percentage

## After Scaling
RandomForest is not very sensisitive to scaling and so my prediction is its score won't change much. However, after processing the data the Logistic Regression score will improve significantly. As to which model will perform better, I assume Random Forest will remain undefeated.

In [18]:
# Scale the data
scaler = StandardScaler().fit(b_train_dummy)

In [19]:
b_scaled_test = scaler.transform(b_test_dummy)
b_scaled_train = scaler.transform(b_train_dummy)

In [20]:
# Train the Logistic Regression model on the scaled data and print the model score

In [21]:
classifier = LogisticRegression()
classifier.fit(b_scaled_train,a_train_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [22]:
print(f"Training Data Score: {classifier.score(b_scaled_train,a_train_label)}")
print(f"Testing Data Score: {classifier.score(b_scaled_test,a_test_labels)}")

Training Data Score: 0.6921340131099781
Testing Data Score: 0.7896332446021888


In [23]:
# Train a Random Forest Classifier model on the scaled data and print the model score

In [24]:
forest_classifier = RandomForestClassifier(random_state=42, n_estimators=200)
forest_classifier.fit(b_scaled_train,a_train_label)

RandomForestClassifier(n_estimators=200, random_state=42)

In [25]:
print(f"Training Data Score: {forest_classifier.score(b_scaled_train,a_train_label)}")
print(f"Testing Data Score: {forest_classifier.score(b_scaled_test,a_test_labels)}")

Training Data Score: 1.0
Testing Data Score: 0.634575569358178


## Results after scaling
Random Forest's scores remained almost identical as predicted, and Logistic Regression's improved significantly. Nevertheless, I would have to say that Logistic Regression performed better overall contrary to my previous predictions.