In [1]:
# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read CSV file
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))
test_df

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.70,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.70,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.1240,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,77282,30000.0,0.1240,673.42,RENT,140480.0,Source Verified,high_risk,n,...,100.0,28.6,0.0,0.0,159688.0,110873.0,48400.0,107388.0,N,N
4698,77291,77291,24000.0,0.0756,747.22,RENT,50000.0,Not Verified,high_risk,n,...,100.0,0.0,0.0,0.0,62375.0,18928.0,13300.0,30775.0,N,N
4699,77292,77292,10000.0,0.2305,387.36,RENT,33000.0,Verified,high_risk,n,...,100.0,0.0,0.0,0.0,43250.0,33022.0,8500.0,29550.0,N,N
4700,77297,77297,8000.0,0.1862,205.86,RENT,38000.0,Source Verified,high_risk,n,...,95.0,0.0,1.0,0.0,31357.0,19595.0,1500.0,9657.0,N,N


In [3]:
# Convert categorical data to numeric and separate target feature for training data
X = train_df.drop(columns=['loan_status'])
y_train = train_df.loan_status

X_train= pd.get_dummies(X,drop_first=True)
X_train

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,total_il_high_credit_limit,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,initial_list_status_w,application_type_Joint App,hardship_flag_Y,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,170200.0,1,0,0,0,0,1,0,0,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,35398.0,1,0,0,1,0,1,0,0,0
2,321143,321143,20000.0,0.1240,448.95,197000.0,11.28,0.0,0.0,12.0,...,90340.0,1,0,0,1,0,1,0,0,0
3,11778,11778,3000.0,0.1240,100.22,45000.0,18.08,0.0,0.0,12.0,...,15406.0,0,0,1,0,0,1,0,0,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,...,58778.0,1,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,354912,19975.0,0.2565,801.09,28000.0,28.42,0.0,0.0,15.0,...,19055.0,0,0,1,0,0,1,0,0,0
12176,354944,354944,15000.0,0.1774,540.34,50000.0,23.43,4.0,0.0,16.0,...,54824.0,0,0,1,0,1,1,0,0,0
12177,354973,354973,3600.0,0.1862,131.28,60000.0,28.80,0.0,1.0,14.0,...,53065.0,0,0,1,0,0,1,0,0,0
12178,355002,355002,15000.0,0.0881,475.68,62000.0,11.44,0.0,0.0,5.0,...,32930.0,1,0,0,1,0,1,1,0,0


In [4]:
y_train

0         low_risk
1         low_risk
2         low_risk
3         low_risk
4         low_risk
           ...    
12175    high_risk
12176    high_risk
12177    high_risk
12178    high_risk
12179    high_risk
Name: loan_status, Length: 12180, dtype: object

In [5]:
# Convert categorical data to numeric and separate target feature for testing data
X_ = test_df.drop(columns=['loan_status'])
y_test = test_df.loan_status

X_test = pd.get_dummies(X_,drop_first=True)

In [6]:
# Add missing dummy variables to testing set
for data in X_train.columns:
    if data not in X_test.columns:
        X_test[data] = 0

In [7]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression(solver='sag', max_iter=1000)
classifier.fit(X_train, y_train)



LogisticRegression(max_iter=1000, solver='sag')

In [8]:
# View classifier score for X_test
classifier.score(X_test, y_test)

0.5325393449595917

In [9]:
# Train a Random Forest Classifier model and print the model score
clf= RandomForestClassifier (random_state=1, n_estimators=500).fit(X_train,y_train)
clf.score(X_test, y_test)

0.635048915355168

In [10]:
# Scale the data
scaler=StandardScaler().fit(X_train)
X_train_scaled= scaler.transform(X_train)
X_train_scaled

array([[-1.31172014, -1.31172014, -0.39311205, ..., -0.41370744,
        -0.17149859, -0.02026518],
       [-0.46579523, -0.46579523,  0.35168119, ..., -0.41370744,
        -0.17149859, -0.02026518],
       [ 1.3364188 ,  1.3364188 ,  0.25400339, ..., -0.41370744,
        -0.17149859, -0.02026518],
       ...,
       [ 1.67571549,  1.67571549, -1.34791257, ..., -0.41370744,
        -0.17149859, -0.02026518],
       [ 1.67600634,  1.67600634, -0.23438563, ...,  2.41716707,
        -0.17149859, -0.02026518],
       [ 1.67906533,  1.67906533, -0.23438563, ..., -0.41370744,
        -0.17149859, -0.02026518]])

In [11]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier = LogisticRegression(solver='sag', max_iter=1000)
classifier.fit(X_train_scaled,y_train)



LogisticRegression(max_iter=1000, solver='sag')

In [12]:
# View classifier score
classifier.score(X_test, y_test)

0.5710336027222459

In [13]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
clf.score(X_test, y_test)

0.515525308379413