In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the 2019 loans (training set) and the 2020 Q1 loans (testing set).
train_path = Path('Resources/2019loans.csv')
test_path = Path('Resources/2020Q1loans.csv')

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Preview the first five rows of the training frame.
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [4]:
# Preview the first five rows of the testing frame.
test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [5]:
# List the column labels of the training frame, then of the testing frame.
for frame in (train_df, test_df):
    print(frame.columns)

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
       'pymnt_plan', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_ol

In [6]:
# Show (rows, columns) for the training frame and the testing frame, one per line.
print(train_df.shape, test_df.shape, sep='\n')

(12180, 86)
(4702, 86)


In [7]:
# Separate the target from the features for the training data (the categorical
# features are converted to numeric in a following cell).
# The risk attached to a loan is given by the 'loan_status' column.
# NOTE(review): 'Unnamed: 0' and 'index' look like row identifiers carried over
# from the CSV export and remain in the feature set here - confirm whether they
# should be dropped before modelling.

train_features = train_df.drop(columns='loan_status')
train_target = train_df['loan_status']

In [8]:
# Show the shape of the target vector, then of the feature frame.
print(train_target.shape, train_features.shape, sep='\n')

(12180,)
(12180, 85)


In [9]:
# One-hot encode the training features: string-valued columns such as
# home_ownership and verification_status are replaced by indicator columns,
# while numeric columns are kept as they are.
X_train = pd.get_dummies(train_features)
X_train

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,321143,20000.0,0.1240,448.95,197000.0,11.28,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,11778,3000.0,0.1240,100.22,45000.0,18.08,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,...,0,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,354912,19975.0,0.2565,801.09,28000.0,28.42,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
12176,354944,354944,15000.0,0.1774,540.34,50000.0,23.43,4.0,0.0,16.0,...,1,1,0,1,1,0,1,0,1,0
12177,354973,354973,3600.0,0.1862,131.28,60000.0,28.80,0.0,1.0,14.0,...,0,1,0,1,1,0,1,0,1,0
12178,355002,355002,15000.0,0.0881,475.68,62000.0,11.44,0.0,0.0,5.0,...,0,1,0,1,0,1,1,0,1,0


In [10]:
# Dimensions (rows, columns) of the encoded training features.
X_train.shape

(12180, 94)

In [11]:
# Column labels of the encoded training features.
X_train.columns

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc'

In [12]:
# Encode the string risk labels of the training target as integers.
target_encoder = LabelEncoder()
y_train = target_encoder.fit_transform(train_target)
y_train

array([1, 1, 1, ..., 0, 0, 0])

In [13]:
# Convert categorical data to numeric and separate target feature for testing data

test_target = test_df['loan_status']
test_features = test_df.drop('loan_status', axis=1)

# The testing features are one-hot encoded independently of the training
# features, so a dummy column is missing whenever a category does not occur in
# the testing data (the column counts printed below differ for that reason).
X_test = pd.get_dummies(test_features)

# Fit the encoder on the TRAINING labels and only transform the testing labels,
# so y_test is guaranteed to use the same label -> integer mapping as y_train.
# transform() raises ValueError if the testing data contains an unseen label.
y_test = LabelEncoder().fit(train_target).transform(test_target)

print("Shape: ", X_train.shape, X_test.shape)
y_test

Shape:  (12180, 94) (4702, 93)


array([1, 1, 1, ..., 0, 0, 0])

In [14]:
# Column labels of the encoded testing features.
X_test.columns

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc'

In [15]:
# Display the encoded training features.
X_train

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,321143,20000.0,0.1240,448.95,197000.0,11.28,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,11778,3000.0,0.1240,100.22,45000.0,18.08,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,...,0,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,354912,19975.0,0.2565,801.09,28000.0,28.42,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
12176,354944,354944,15000.0,0.1774,540.34,50000.0,23.43,4.0,0.0,16.0,...,1,1,0,1,1,0,1,0,1,0
12177,354973,354973,3600.0,0.1862,131.28,60000.0,28.80,0.0,1.0,14.0,...,0,1,0,1,1,0,1,0,1,0
12178,355002,355002,15000.0,0.0881,475.68,62000.0,11.44,0.0,0.0,5.0,...,0,1,0,1,0,1,1,0,1,0


In [16]:
# Display the encoded testing features.
X_test

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,67991,67991,40000.0,0.0819,814.70,140000.0,19.75,0.0,1.0,18.0,...,0,0,1,0,1,1,0,1,0,1
1,25429,25429,6000.0,0.1524,208.70,55000.0,11.52,2.0,0.0,8.0,...,0,0,1,0,1,1,0,1,0,1
2,38496,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,...,0,0,1,0,1,1,0,1,0,1
3,19667,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,...,0,0,1,0,1,1,0,1,0,1
4,37505,37505,3600.0,0.1240,120.27,50000.0,16.08,0.0,3.0,6.0,...,0,0,1,0,1,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,77282,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,...,1,0,1,1,0,1,0,1,0,1
4698,77291,77291,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,...,0,0,1,0,1,1,0,1,0,1
4699,77292,77292,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,...,0,1,1,1,0,1,0,1,0,1
4700,77297,77297,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,...,1,0,1,0,1,1,0,1,0,1


In [17]:
# add missing dummy variables to testing set

# Report every training column that the testing set lacks.
missing_columns = [column for column in X_train.columns if column not in X_test.columns]
for column in missing_columns:
    print(column)

# Align X_test with X_train in a single, idempotent step: columns missing from
# the testing set are added filled with 0, any column that exists only in the
# testing set is dropped, and the column order is made identical to X_train so
# both frames present the same features in the same positions to the models.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

print("Shape: ", X_train.shape, X_test.shape)

debt_settlement_flag_Y
Shape:  (12180, 94) (4702, 94)


In [18]:
# Re-check the testing column labels after the missing dummy column was added.
X_test.columns

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc'

In [19]:
##Prediction for which model will perform better in evaluating Risk Assessment for the LendingTree data;

 #This problem requires classifying the data; the target category is 'Loan Status', giving a binary 
  #assessment of low-risk vs. high-risk loans for LendingTree.

 #"RandomForest" is generally a better model for a diverse, non-relational dataset: because it randomly selects 
  #subsets of features, some trees of the forest can isolate the more important features, which increases the overall 
  #accuracy of the result.

 #Logistic Regression, on the other hand, does not predict the category of an observation directly; 
  #it estimates the probability that the observation belongs to a class, compares it against a threshold, and then 
  #categorizes the prediction as '1/low risk' or '0/high risk' (LabelEncoder numbers the labels in alphabetical order).
    
#RandomForest should perform better for the given dataset, as the customer records are totally random and 
 #unrelated. Accuracy of the prediction is of higher importance in case of critical financial analysis and risk
 #assessment.


In [20]:
# Train the Logistic Regression model on the unscaled data and print the model score

# *** Note: max_iter is set far above scikit-learn's default of 100 so the
# solver can converge on the unscaled features without a ConvergenceWarning ***
model_LR = LogisticRegression(max_iter=12180).fit(X_train, y_train)

# fit() returns the estimator itself, so displaying it shows the fitted model's repr.
model_LR

LogisticRegression(max_iter=12180)

In [21]:
# Mean accuracy of the logistic regression model on the unscaled training and testing data.
lr_train_score = model_LR.score(X_train, y_train)
lr_test_score = model_LR.score(X_test, y_test)

print(f"Unscaled Training Data Score for LogisticRegression: {lr_train_score}")
print(f"Unscaled Testing Data Score for LogisticRegression: {lr_test_score}")

Unscaled Training Data Score for LogisticRegression: 0.7016420361247947
Unscaled Testing Data Score for LogisticRegression: 0.5621012335176521


In [22]:
# Train a Random Forest Classifier model and print the model score

# A fixed random_state makes the bootstrap samples and the per-split feature
# subsets - and therefore the reported scores - reproducible across kernel
# restarts; without it every run trains a different forest.
model_RF = RandomForestClassifier(random_state=42)

model_RF.fit(X_train, y_train)

RandomForestClassifier()

In [23]:
# Mean accuracy of the random forest model on the unscaled training and testing data.
rf_train_score = model_RF.score(X_train, y_train)
rf_test_score = model_RF.score(X_test, y_test)

print(f"Unscaled Training Data Score for RandomForestClassifier: {rf_train_score}")
print(f"Unscaled Testing Data Score for RandomForestClassifier: {rf_test_score}")

Unscaled Training Data Score for RandomForestClassifier: 1.0
Unscaled Testing Data Score for RandomForestClassifier: 0.6108039132284134


In [24]:
#As predicted, RandomForest provided better scores when compared to LogisticRegression.
 #The comparison below prints the actual and the model-predicted labels for the first 20 testing rows.

# Both models are evaluated on the same 20-row sample of the testing data.
sample_features = X_test[:20]
sample_actual = list(y_test[:20])
lr_predicted = list(model_LR.predict(sample_features))
rf_predicted = list(model_RF.predict(sample_features))

print(f'Unscaled - LogisticRegression Actual:\t{sample_actual}')
print(f'Unscaled - LogisticRegression Predicted:{lr_predicted}')
print('\t\t\t\t------------------------------------------------------------------------')
print(f'Unscaled - RandomForest Actual:\t\t{sample_actual}')
print(f'Unscaled - RandomForest Predicted:\t{rf_predicted}')

Unscaled - LogisticRegression Actual:	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Unscaled - LogisticRegression Predicted:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1]
				------------------------------------------------------------------------
Unscaled - RandomForest Actual:		[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Unscaled - RandomForest Predicted:	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1]


In [25]:
# *** Prediction - SCALED DATA ***

 #RandomForest model should not be impacted much by scaling; however, LogisticRegression is expected to perform
 #better with the scaled dataset.

#My bets are still on RandomForest, for the same reasons stated above.

In [26]:
# Scale the data

# The scaler is fitted on the training features only; the same fitted scaler
# is applied to the testing features afterwards.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

print(X_train_scaled.shape)
X_train_scaled

(12180, 94)


array([[-1.31172014, -1.31172014, -0.39311205, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-0.46579523, -0.46579523,  0.35168119, ..., -0.17149859,
         0.02026518, -0.02026518],
       [ 1.3364188 ,  1.3364188 ,  0.25400339, ..., -0.17149859,
         0.02026518, -0.02026518],
       ...,
       [ 1.67571549,  1.67571549, -1.34791257, ..., -0.17149859,
         0.02026518, -0.02026518],
       [ 1.67600634,  1.67600634, -0.23438563, ..., -0.17149859,
         0.02026518, -0.02026518],
       [ 1.67906533,  1.67906533, -0.23438563, ..., -0.17149859,
         0.02026518, -0.02026518]])

In [27]:
# Apply the scaler that was fitted on the training features to the testing
# features, then show the resulting shape and array.
X_test_scaled = scaler.transform(X_test)
print(X_test_scaled.shape)
X_test_scaled

(4702, 94)


array([[-1.20255948, -1.20255948,  2.20755943, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-1.62943343, -1.62943343, -1.11348584, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-1.49837845, -1.49837845, -1.34791257, ..., -0.17149859,
         0.02026518, -0.02026518],
       ...,
       [-1.10927546, -1.10927546, -0.72277464, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-1.10922531, -1.10922531, -0.91813024, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-1.1091551 , -1.1091551 ,  1.23078141, ..., -0.17149859,
         0.02026518, -0.02026518]])

In [28]:
# Train the Logistic Regression model on the scaled data and print the model score

# Use a separate estimator instead of refitting model_LR: refitting would
# overwrite the model trained on the unscaled data, so re-running the earlier
# unscaled score/prediction cells would silently evaluate a model trained on
# scaled features against unscaled inputs.
model_LR_scaled = LogisticRegression(max_iter=12180)
model_LR_scaled.fit(X_train_scaled, y_train)

print(f"Scaled Training Data Score for LogisticRegression: {model_LR_scaled.score(X_train_scaled, y_train)}")
print(f"Scaled Testing Data Score for LogisticRegression: {model_LR_scaled.score(X_test_scaled, y_test)}")

Scaled Training Data Score for LogisticRegression: 0.7127257799671592
Scaled Testing Data Score for LogisticRegression: 0.7201190982560612


In [29]:
# Train a Random Forest Classifier model on the scaled data and print the model score

# Use a separate estimator instead of refitting model_RF, so the forest trained
# on the unscaled data stays available to the earlier cells. The fixed
# random_state keeps the reported scores reproducible across kernel restarts.
model_RF_scaled = RandomForestClassifier(random_state=42)
model_RF_scaled.fit(X_train_scaled, y_train)

print(f"Scaled Training Data Score for RandomForest: {model_RF_scaled.score(X_train_scaled, y_train)}")
print(f"Scaled Testing Data Score for RandomForest: {model_RF_scaled.score(X_test_scaled, y_test)}")

Scaled Training Data Score for RandomForest: 1.0
Scaled Testing Data Score for RandomForest: 0.5991067630795406


In [30]:
#As predicted, the performance of the LogisticRegression model improved considerably on the scaled data, whereas
 #the RandomForest test score went down, but not by much. 

#For this dataset, the LogisticRegression model performs much better when the dataset is scaled.  