In [91]:
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [92]:
def check_test_and_train_matching_columns():
    # Display warning if columns do not match
    inner_join = set(train_df.columns) & set(test_df.columns)
    full_join = set(train_df.columns) | set(test_df.columns)
    unmatching_columns = list(full_join - inner_join)

    if (len(unmatching_columns) != 0):
        print("columns count does not match at...")
        return unmatching_columns
    else:
        print("columns match!")

In [93]:
# Load data
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

check_test_and_train_matching_columns()

columns match!


In [94]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [95]:
#Convert categorical data to numeric
# Split the columns into quantitative and categorical
all_columns = set(train_df.columns) | set(test_df.columns)
all_quantitative_columns = set(train_df.describe().columns) | set(test_df.describe().columns)
all_categorical_columns = all_columns - all_quantitative_columns
all_categorical_columns

{'application_type',
 'debt_settlement_flag',
 'hardship_flag',
 'home_ownership',
 'initial_list_status',
 'loan_status',
 'pymnt_plan',
 'verification_status'}

In [96]:
train_df = pd.get_dummies(train_df, columns=all_categorical_columns)
test_df = pd.get_dummies(test_df, columns=all_categorical_columns)
check_test_and_train_matching_columns()

columns count does not match at...


  train_df = pd.get_dummies(train_df, columns=all_categorical_columns)
  test_df = pd.get_dummies(test_df, columns=all_categorical_columns)


['debt_settlement_flag_Y']

In [97]:
# Fix the missing values for the debt_settlement_flag columns we get_dummies'd earlier
# by adding missing dummy variables to testing set
test_df["debt_settlement_flag_Y"] = 0
check_test_and_train_matching_columns()

columns match!


In [98]:
train_df

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,home_ownership_OWN,home_ownership_RENT,loan_status_high_risk,loan_status_low_risk,pymnt_plan_n,application_type_Individual,application_type_Joint App,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,0,0,0,1,1,1,0,1,0,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,0,0,0,1,1,1,0,0,1,0
2,321143,321143,20000.0,0.1240,448.95,197000.0,11.28,0.0,0.0,12.0,...,0,0,0,1,1,1,0,0,1,0
3,11778,11778,3000.0,0.1240,100.22,45000.0,18.08,0.0,0.0,12.0,...,0,1,0,1,1,1,0,1,0,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,...,0,0,0,1,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,354912,19975.0,0.2565,801.09,28000.0,28.42,0.0,0.0,15.0,...,0,1,1,0,1,1,0,1,0,0
12176,354944,354944,15000.0,0.1774,540.34,50000.0,23.43,4.0,0.0,16.0,...,0,1,1,0,1,1,0,0,0,1
12177,354973,354973,3600.0,0.1862,131.28,60000.0,28.80,0.0,1.0,14.0,...,0,1,1,0,1,1,0,1,0,0
12178,355002,355002,15000.0,0.0881,475.68,62000.0,11.44,0.0,0.0,5.0,...,0,0,1,0,1,0,1,0,1,0


Separate target feature for training data

In [99]:
# we will train the model to be sensitive if the loans are of high risk
target_feature = "loan_status_high_risk"

In [100]:
# Split the training data
X_train  = train_df.drop(columns=[target_feature])
y_train = train_df[[target_feature]].values.ravel()
#
print(X_train.shape, y_train.shape)

(12180, 95) (12180,)


In [101]:
# Split the testing data
X_test  = test_df.drop(columns=[target_feature])
y_test = test_df[[target_feature]].values.ravel()
#
print(X_test.shape, y_test.shape)

(4702, 95) (4702,)


In [102]:
# Train the Logistic Regression model on the unscaled data and print the model score
logisticRegr = LogisticRegression(
    solver='lbfgs', 
    max_iter=100,
    random_state=0
)
logisticRegr.fit(X_train, y_train)
print("LogisticRegression score: ", logisticRegr.score(X_test, y_test))

LogisticRegression score:  0.5195661420672054


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Feature names must be in the same order as they were in fit.



In [103]:
# Train a Random Forest Classifier model and print the model score
randomForestClass = RandomForestClassifier(random_state=0)
randomForestClass.fit(X_train, y_train)
print("RandomForestClassifier score: ", randomForestClass.score(X_test, y_test))

RandomForestClassifier score:  0.5091450446618461


Feature names must be in the same order as they were in fit.



Make a prediction as to which model you think will perform better
I think the linear regression will be the better fit to the data, but without scaling, the random forest classifier could beat it.

In [104]:
# Scale the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Feature names must be in the same order as they were in fit.



In [105]:
# Train the Logistic Regression model on the scaled data and print the model score
logisticRegr_scaled = LogisticRegression(
    solver='lbfgs', 
    max_iter=100,
    random_state=0
)
logisticRegr_scaled.fit(X_train_scaled, y_train)
print("LogisticRegression scaled score: ", logisticRegr_scaled.score(X_test_scaled, y_test))

LogisticRegression scaled score:  0.5


In [106]:
# Train a Random Forest Classifier model on the scaled data and print the model score
randomForestClass_scaled = RandomForestClassifier(random_state=0)
randomForestClass_scaled.fit(X_train_scaled, y_train)
print("RandomForestClassifier scaled score: ", randomForestClass_scaled.score(X_test_scaled, y_test))

RandomForestClassifier scaled score:  0.5091450446618461


How do the model scores compare to each other, and to the previous results on unscaled data?
Looks like the model for the linear regression got way better, but the random forest classifier remained spot on. From what I can tell, the random forest classifier did not benefit much from the scaling, whereas the linear regression did.

How does this compare to your prediction?
Looks like I was totally underestimating the strength of scaling the data would do to the linear regression!

Notes
There is some strangeness to how I can run this whole notebook and get some significantly different values to the ones I have.