In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [5]:
test_df

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,40000.0,0.1033,856.40,RENT,128700.0,Source Verified,n,12.47,0.0,1.0,...,57.1,0.0,0.0,63915.0,49510.0,49400.0,14515.0,Y,N,low_risk
1,24450.0,0.1430,572.72,MORTGAGE,44574.0,Not Verified,n,15.05,0.0,1.0,...,0.0,0.0,0.0,136425.0,19439.0,15500.0,18925.0,N,N,low_risk
2,13500.0,0.1430,316.23,OWN,60000.0,Not Verified,n,28.72,0.0,0.0,...,0.0,0.0,0.0,82124.0,65000.0,5400.0,61724.0,Y,N,low_risk
3,10625.0,0.1774,268.31,RENT,60000.0,Verified,n,15.70,0.0,4.0,...,20.0,0.0,0.0,54855.0,50335.0,23200.0,26255.0,N,N,low_risk
4,6375.0,0.1862,232.46,RENT,60000.0,Source Verified,n,35.50,0.0,0.0,...,75.0,0.0,0.0,90445.0,56541.0,15300.0,72345.0,N,N,low_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,30000.0,0.1240,673.42,RENT,140480.0,Source Verified,n,15.74,0.0,0.0,...,28.6,0.0,0.0,159688.0,110873.0,48400.0,107388.0,N,N,high_risk
4698,24000.0,0.0756,747.22,RENT,50000.0,Not Verified,n,26.81,0.0,0.0,...,0.0,0.0,0.0,62375.0,18928.0,13300.0,30775.0,N,N,high_risk
4699,10000.0,0.2305,387.36,RENT,33000.0,Verified,n,38.51,0.0,2.0,...,0.0,0.0,0.0,43250.0,33022.0,8500.0,29550.0,N,N,high_risk
4700,8000.0,0.1862,205.86,RENT,38000.0,Source Verified,n,16.36,0.0,1.0,...,0.0,1.0,0.0,31357.0,19595.0,1500.0,9657.0,N,N,high_risk


In [12]:
# Convert categorical data to numeric and separate target feature for training data
XTrn=train_df.drop('target',axis=1)
X_train=pd.get_dummies(XTrn)

yTrn=train_df['target']
y_train=LabelEncoder().fit_transform(yTrn)

In [13]:
# Convert categorical data to numeric and separate target feature for testing data
XTst=test_df.drop('target',axis=1)
X_test=pd.get_dummies(XTst)
 
yTst=test_df['target']
y_test=LabelEncoder().fit_transform(yTst)

In [18]:
# add missing dummy variables to testing set

# Get the name of missing columns
missing_columns = set(X_train.columns) - set(X_test.columns)

# Add the missing columns in the testing set and filling them with 0
for i in missing_columns:
    X_test[i] = 0

# # Align the order of the columns in the training and testing sets
X_train, X_test = X_train.align(X_test, axis=1)

# # Confirm that the columns are aligned and none is missing
X_train.columns==X_test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

# Using Unscaled Data

In [24]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier= LogisticRegression().fit(X_train,y_train)

print('Logisitc Regression - Unscaled Data')
print("------------------------------------")
print(f'Training Data Score: {classifier.score(X_train, y_train)}')
print(f'Testing Data Score: {classifier.score(X_test, y_test)}')

Logisitc Regression - Unscaled Data
------------------------------------
Training Data Score: 0.65311986863711
Testing Data Score: 0.5072309655465759


In [22]:
# Train a Random Forest Classifier model and print the model score
classifier=RandomForestClassifier(random_state=1,n_estimators=500).fit(X_train,y_train)

print('Random Forest Classifier - Unscaled Data')
print("------------------------------------")
print(f'Training Data Score: {classifier.score(X_train, y_train)}')
print(f'Testing Data Score: {classifier.score(X_test, y_test)}')

Random Forest Classifier - Unscaled Data
------------------------------------
Training Data Score: 1.0
Testing Data Score: 0.646958740961293


# Using Scaled Data

In [25]:
# Scale the data
scaler=StandardScaler().fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [27]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier= LogisticRegression(max_iter=5000,solver='lbfgs').fit(X_train_scaled,y_train)

print('Logisitc Regression - Scaled Data')
print("------------------------------------")
print(f'Training Data Score: {classifier.score(X_train_scaled, y_train)}')
print(f'Testing Data Score: {classifier.score(X_test_scaled, y_test)}')

Logisitc Regression - Scaled Data
------------------------------------
Training Data Score: 0.7108374384236453
Testing Data Score: 0.7598894087622289


In [28]:
# Train a Random Forest Classifier model on the scaled data and print the model score
classifier=RandomForestClassifier(random_state=1,n_estimators=500).fit(X_train_scaled,y_train)

print('Random Forest Classifier - Scaled Data')
print("------------------------------------")
print(f'Training Data Score: {classifier.score(X_train_scaled, y_train)}')
print(f'Testing Data Score: {classifier.score(X_test_scaled, y_test)}')

Random Forest Classifier - Scaled Data
------------------------------------
Training Data Score: 1.0
Testing Data Score: 0.6480221182475542
