In [1132]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [1133]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [1134]:
train_df['debt_settlement_flag'].value_counts()

N    12175
Y        5
Name: debt_settlement_flag, dtype: int64

In [1135]:
test_df['debt_settlement_flag'].value_counts()

N    4702
Name: debt_settlement_flag, dtype: int64

In [1136]:
# Need to make sure we have the correct number of columns
dummy = pd.get_dummies(train_df)
dummy.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,1,1,0,1,1,0,1,0,1,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,1,1,0,1,1,0,1,0,1,0
2,321143,321143,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,...,1,1,0,1,1,0,1,0,1,0
3,11778,11778,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,...,1,1,0,1,1,0,1,0,1,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,...,1,1,0,1,1,0,1,0,1,0


In [1137]:
testdata = test_df.copy()
testdata = testdata.drop(['loan_status', 'index', 'Unnamed: 0'], axis=1)
testdata.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,n,19.75,0.0,1.0,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,n,11.52,2.0,0.0,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,n,6.74,0.0,0.0,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,n,12.13,0.0,2.0,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,3600.0,0.124,120.27,RENT,50000.0,Not Verified,n,16.08,0.0,3.0,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [1138]:
# Seperate columns
testdummies = pd.get_dummies(testdata)
testdummies.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,0,0,1,0,1,1,0,1,0,1
1,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,0,0,1,0,1,1,0,1,0,1
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,0,0,1,0,1,1,0,1,0,1
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,0,0,1,0,1,1,0,1,0,1
4,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,0,0,1,0,1,1,0,1,0,1


In [1139]:
# Fixing the columns
testdummies['debt_settlement_flag_Y'] = 0
testdummies['debt_settlement_flag_Y'].value_counts()

0    4702
Name: debt_settlement_flag_Y, dtype: int64

In [1140]:
# Check for nulls
testdummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Data columns (total 92 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   loan_amnt                            4702 non-null   float64
 1   int_rate                             4702 non-null   float64
 2   installment                          4702 non-null   float64
 3   annual_inc                           4702 non-null   float64
 4   dti                                  4702 non-null   float64
 5   delinq_2yrs                          4702 non-null   float64
 6   inq_last_6mths                       4702 non-null   float64
 7   open_acc                             4702 non-null   float64
 8   pub_rec                              4702 non-null   float64
 9   revol_bal                            4702 non-null   float64
 10  total_acc                            4702 non-null   float64
 11  out_prncp                     

In [1141]:
traindata = train_df.copy()
traindata.shape


(12180, 86)

In [1142]:
traindata = traindata.drop(['loan_status', 'index', 'Unnamed: 0'], axis=1)
traindata.shape

(12180, 83)

In [1143]:
traindata_dummies = pd.get_dummies(traindata)
traindata_dummies.shape

(12180, 92)

In [1144]:
ytrain = train_df['loan_status']
ytest = test_df['loan_status']

Prediction before running the models: I would assume that the Random Forest Classifier model would best suit this data set as there are multiple points where smaller logic trees would better suit the type of predictions we need to make an accurate prediction. 

In [1145]:
# Complete the Logistic Regression Classifer Model

classifier = LogisticRegression()
classifier.fit(traindata_dummies, ytrain)
print(f"Testing Data Score: {classifier.score(testdummies, ytest)}")

Testing Data Score: 0.5161633347511697


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [1146]:
# Complete the Random Forest Classifer Model

clf = RandomForestClassifier(random_state = 1).fit(traindata_dummies, ytrain)
print(f'Testing Score: {clf.score(testdummies, ytest)}')


Testing Score: 0.6544023819651212


The model that performed better was the Random Forest classifier model, but both models still weren't very accurate. Random Forest Classifier model achieved 0.65 as a testing score, and logistical regression achieved   
0.52. Scaling the data should allow the logistical regression model to have more chances to learn as it will have more data to sample from. 

Now, for scaling the data:

In [1147]:
# Now to scale the data
scaler = StandardScaler().fit(traindata_dummies)
xtrain = scaler.transform(traindata_dummies)
xtest = scaler.transform(testdummies)

xtrain[0]

array([-0.39311205,  0.73658452, -0.08760946,  0.79027929,  0.30961463,
       -0.32002561, -0.69540144,  0.37691739, -0.35198632,  0.97933871,
        1.08333618, -0.07136838, -0.07121699, -1.09378946, -1.093659  ,
       -0.92422387, -0.9683145 , -0.15098128,  0.        ,  0.        ,
        0.05395363, -0.13312632,  0.        , -0.00906138, -0.13971415,
        1.80631885,  0.78540566,  1.17957176,  2.1409169 ,  3.13618293,
       -0.51642773,  0.74114448, -0.02074907, -0.22528046, -0.66929854,
        3.4553425 ,  0.61662261,  0.50360151, -0.28280057,  3.50659128,
       -0.13229478,  1.18604519,  1.17736048, -0.4308203 ,  0.95191111,
       -0.07108199, -0.00906138,  0.60175665,  1.90745971, -0.64875472,
       -0.64835039, -0.17563511, -0.66180723,  0.13191341, -0.35942048,
       -1.16762498, -0.52389304, -0.98968236, -0.50928902,  1.0718466 ,
       -0.33415066,  0.46534105, -0.52521034,  0.37926859,  0.        ,
       -0.00906138, -0.1488685 ,  1.32666689,  0.61758087,  0.50

In [1148]:
classifier_scaled = LogisticRegression()
classifier_scaled.fit(xtrain, ytrain)
print(f"Testing Data Score: {classifier_scaled.score(xtest, ytest)}")

Testing Data Score: 0.767333049766057


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [1149]:

clf_scaled = RandomForestClassifier(random_state = 1).fit(xtrain, ytrain)
print(f'Testing Score: {clf_scaled.score(xtest, ytest)}')

Testing Score: 0.6548277328796257


As I predicted, logistical regression achieved a higher score at 0.77 (instead of 0.52 before scaling). However, random forest classifier did not change, still achieving 0.65. I am not sure as to why the score did not change, other than having more data to sample from would not affect the decisions the logic trees would come up with. 