In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the rows that the saved classifiers are evaluated against.
# NOTE(review): this path uses a lowercase "resources/" directory while the
# pickled models below are read from "Resources/"; on a case-sensitive
# filesystem only one spelling can be right — confirm the directory name.
leftovers_csv = "resources/leftovers_to_test.csv"
df = pd.read_csv(leftovers_csv)

In [3]:
# Target dtype for each column of the loaded frame (applied with
# DataFrame.astype in the next cell).
retypes = {
    # Coded fields are cast to strings so that pd.get_dummies later
    # one-hot encodes them instead of treating the codes as numbers.
    'loan_type': 'str',
    # Numeric amount, kept as an integer.
    'loan_amount_000s': 'int64',
    # Prediction target.
    'action_taken': 'int8',
    'applicant_ethnicity': 'str',
    'co_applicant_ethnicity': 'str',
    'applicant_race_1': 'str',
    'co_applicant_race_1': 'str',
    'applicant_sex': 'str',
    'co_applicant_sex': 'str',
    # Numeric income, kept as an integer.
    'applicant_income_000s': 'int64',
}

In [4]:
# Cast every column listed in `retypes` to its target dtype.
df = df.astype(dtype=retypes)

In [5]:
# Feature frame: everything except the target and the 'Unnamed: 0' column
# (presumably the index written out when the CSV was saved — confirm).
non_feature_columns = ['action_taken', 'Unnamed: 0']
X = df.drop(columns=non_feature_columns)
X

Unnamed: 0,loan_type,loan_amount_000s,applicant_ethnicity,co_applicant_ethnicity,applicant_race_1,co_applicant_race_1,applicant_sex,co_applicant_sex,applicant_income_000s
0,1,144,2,2,5,5,1,2,154
1,1,300,2,5,5,8,1,5,119
2,1,263,1,5,5,8,1,5,141
3,1,187,2,5,5,8,1,5,76
4,1,361,2,2,1,5,2,1,107
...,...,...,...,...,...,...,...,...,...
15014602,1,76,2,2,5,5,2,1,80
15014603,1,195,2,2,5,5,2,1,45
15014604,3,480,2,2,5,5,2,1,225
15014605,1,627,2,5,2,8,1,5,191


In [6]:
# One-hot encode the string-typed columns; the two integer columns pass
# through unchanged. The printed column order is the layout that the
# hand-written form_info_* vectors further down are built to match.
X_dummies = pd.get_dummies(data=X)
print(X_dummies.columns)
X_dummies

Index(['loan_amount_000s', 'applicant_income_000s', 'loan_type_1',
       'loan_type_2', 'loan_type_3', 'loan_type_4', 'applicant_ethnicity_1',
       'applicant_ethnicity_2', 'co_applicant_ethnicity_1',
       'co_applicant_ethnicity_2', 'co_applicant_ethnicity_5',
       'applicant_race_1_1', 'applicant_race_1_2', 'applicant_race_1_3',
       'applicant_race_1_4', 'applicant_race_1_5', 'co_applicant_race_1_1',
       'co_applicant_race_1_2', 'co_applicant_race_1_3',
       'co_applicant_race_1_4', 'co_applicant_race_1_5',
       'co_applicant_race_1_8', 'applicant_sex_1', 'applicant_sex_2',
       'co_applicant_sex_1', 'co_applicant_sex_2', 'co_applicant_sex_5'],
      dtype='object')


Unnamed: 0,loan_amount_000s,applicant_income_000s,loan_type_1,loan_type_2,loan_type_3,loan_type_4,applicant_ethnicity_1,applicant_ethnicity_2,co_applicant_ethnicity_1,co_applicant_ethnicity_2,...,co_applicant_race_1_2,co_applicant_race_1_3,co_applicant_race_1_4,co_applicant_race_1_5,co_applicant_race_1_8,applicant_sex_1,applicant_sex_2,co_applicant_sex_1,co_applicant_sex_2,co_applicant_sex_5
0,144,154,1,0,0,0,0,1,0,1,...,0,0,0,1,0,1,0,0,1,0
1,300,119,1,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,1
2,263,141,1,0,0,0,1,0,0,0,...,0,0,0,0,1,1,0,0,0,1
3,187,76,1,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,1
4,361,107,1,0,0,0,0,1,0,1,...,0,0,0,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15014602,76,80,1,0,0,0,0,1,0,1,...,0,0,0,1,0,0,1,1,0,0
15014603,195,45,1,0,0,0,0,1,0,1,...,0,0,0,1,0,0,1,1,0,0
15014604,480,225,0,0,1,0,0,1,0,1,...,0,0,0,1,0,0,1,1,0,0
15014605,627,191,1,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,1


In [7]:
# Target vector used when scoring each loaded model.
target_column = 'action_taken'
y = df[target_column]

In [8]:
# Hand-built single examples, shaped like what the web form would submit.
# Each vector has 27 entries: [loan_amount_000s, applicant_income_000s]
# followed by the one-hot flags. The grouping below follows the
# X_dummies column order printed earlier in the notebook
# (assumes the pickled models were trained on that same layout — confirm).
_shared_one_hot = (
    [0, 1, 0, 0]            # loan_type_1..4
    + [0, 1]                # applicant_ethnicity_1..2
    + [0, 0, 1]             # co_applicant_ethnicity_1, 2, 5
    + [0, 1, 0, 0, 0]       # applicant_race_1_1..5
    + [1, 0, 0, 0, 0, 0]    # co_applicant_race_1_1..5, 8
    + [1, 0]                # applicant_sex_1..2
    + [0, 1, 0]             # co_applicant_sex_1, 2, 5
)

# High Loan Low Income
form_info_1 = [200, 10] + _shared_one_hot
# Low Loan High Income
form_info_2 = [10, 200] + _shared_one_hot
# In the Middle
form_info_3 = [300, 75] + _shared_one_hot

In [9]:
# Test LogisticRegression vs dataset
# SECURITY: pickle.load can execute arbitrary code while unpickling; only
# load model files that come from a trusted source.
# The context manager closes the file handle, which the previous
# pickle.load(open(...)) form left open.
# NOTE(review): directory is spelled 'Resources/' here but 'resources/' for
# the CSV — confirm the real name for case-sensitive filesystems.
with open('Resources/lr_classifier.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Mean accuracy of the loaded model on the encoded features and labels.
lr_result = loaded_model.score(X_dummies, y)
print(lr_result)



0.8544712492308324


In [11]:
# Test Logistic Regression vs extremes
# scikit-learn estimators take a 2-D (n_samples, n_features) array, so each
# form vector is turned into a single-row matrix.
model_input1 = np.array(form_info_1)[np.newaxis, :]
model_input2 = np.array(form_info_2)[np.newaxis, :]
model_input3 = np.array(form_info_3)[np.newaxis, :]

# Predicted class and class probabilities for each example. The *_3 names
# are read again by the final comparison cell.
lr_result1 = loaded_model.predict(model_input1)
lr_prob1 = loaded_model.predict_proba(model_input1)

lr_result2 = loaded_model.predict(model_input2)
lr_prob2 = loaded_model.predict_proba(model_input2)

lr_result3 = loaded_model.predict(model_input3)
lr_prob3 = loaded_model.predict_proba(model_input3)

# Fixed: the third probability line used to be labelled
# "Logistic Regression Middle:", identical to the prediction line above it.
lr_cases = (
    ("High Loan Low Income", lr_result1, lr_prob1),
    ("Low Loan High Income", lr_result2, lr_prob2),
    ("Middle", lr_result3, lr_prob3),
)
for case_name, case_pred, case_prob in lr_cases:
    print(f"Logistic Regression {case_name}: {case_pred}")
    print(f"Logistic Regression Predict Probability: {case_prob}")

Logistic Regression High Loan Low Income: [0]
Logistic Regression Predict Probability: [[0.98652877 0.01347123]]
Logistic Regression Low Loan High Income: [1]
Logistic Regression Predict Probability: [[7.10542736e-15 1.00000000e+00]]
Logistic Regression Middle: [1]
Logistic Regression Middle: [[0.16512952 0.83487048]]


In [12]:
# Test SVC vs dataset
# SECURITY: pickle.load can execute arbitrary code while unpickling; only
# load model files that come from a trusted source.
# The context manager closes the file handle, which the previous
# pickle.load(open(...)) form left open.
with open('Resources/svc_classifier.pkl', 'rb') as model_file:
    loaded_model3 = pickle.load(model_file)

# Mean accuracy of the loaded model on the encoded features and labels.
svc_result = loaded_model3.score(X_dummies, y)
print(svc_result)



0.8923311146272427


In [13]:
# SVC vs examples
# Only class predictions are produced here; predict_proba is not called
# for the SVC (presumably the pickled model was fit without
# probability=True — TODO confirm).
model_input1 = np.array(form_info_1).reshape(1, -1)
model_input2 = np.array(form_info_2).reshape(1, -1)
model_input3 = np.array(form_info_3).reshape(1, -1)

svc_result1 = loaded_model3.predict(model_input1)
svc_result2 = loaded_model3.predict(model_input2)
# svc_result3 is read again by the final comparison cell.
svc_result3 = loaded_model3.predict(model_input3)

svc_cases = (
    ("High Loan Low Income", svc_result1),
    ("Low Loan High Income", svc_result2),
    ("Middle", svc_result3),
)
for case_name, case_pred in svc_cases:
    print(f"SVC {case_name}: {case_pred}")


SVC High Loan Low Income: [0]
SVC Low Loan High Income: [1]
SVC Middle: [1]


In [14]:
# Test AdaBoostClassifier vs dataset
# SECURITY: pickle.load can execute arbitrary code while unpickling; only
# load model files that come from a trusted source.
# The context manager closes the file handle, which the previous
# pickle.load(open(...)) form left open.
with open('Resources/ab_classifier.pkl', 'rb') as model_file:
    loaded_model2 = pickle.load(model_file)

# Mean accuracy of the loaded model on the encoded features and labels.
ab_result = loaded_model2.score(X_dummies, y)
print(ab_result)



0.8931082911460819


In [15]:
# Ada Boost vs examples
# scikit-learn estimators take a 2-D (n_samples, n_features) array, so each
# form vector is turned into a single-row matrix.
model_input1 = np.array(form_info_1)[np.newaxis, :]
model_input2 = np.array(form_info_2)[np.newaxis, :]
model_input3 = np.array(form_info_3)[np.newaxis, :]

# Predicted class and class probabilities for each example. The *_3 names
# are read again by the final comparison cell.
ab_result1 = loaded_model2.predict(model_input1)
ab_prob1 = loaded_model2.predict_proba(model_input1)

ab_result2 = loaded_model2.predict(model_input2)
ab_prob2 = loaded_model2.predict_proba(model_input2)

ab_result3 = loaded_model2.predict(model_input3)
ab_prob3 = loaded_model2.predict_proba(model_input3)

# Fixed: the third probability line used to be labelled "Ada Boost Middle:",
# identical to the prediction line above it.
ab_cases = (
    ("High Loan Low Income", ab_result1, ab_prob1),
    ("Low Loan High Income", ab_result2, ab_prob2),
    ("Middle", ab_result3, ab_prob3),
)
for case_name, case_pred, case_prob in ab_cases:
    print(f"Ada Boost {case_name}: {case_pred}")
    print(f"Ada Boost Predict Probability: {case_prob}")


Ada Boost High Loan Low Income: [1]
Ada Boost Predict Probability: [[0.44391522 0.55608478]]
Ada Boost Low Loan High Income: [1]
Ada Boost Predict Probability: [[0.44391522 0.55608478]]
Ada Boost Middle: [1]
Ada Boost Middle: [[0.44391522 0.55608478]]


In [16]:
# Test KNClassifier vs dataset
# SECURITY: pickle.load can execute arbitrary code while unpickling; only
# load model files that come from a trusted source.
# The context manager closes the file handle, which the previous
# pickle.load(open(...)) form left open.
with open('Resources/kn_classifier.pkl', 'rb') as model_file:
    loaded_model4 = pickle.load(model_file)

# Mean accuracy of the loaded model on the encoded features and labels.
kn_result = loaded_model4.score(X_dummies, y)
print(kn_result)



0.8931082911460819


In [17]:
# Test KNClassifier vs examples
# scikit-learn estimators take a 2-D (n_samples, n_features) array, so each
# form vector is turned into a single-row matrix.
model_input1 = np.array(form_info_1)[np.newaxis, :]
model_input2 = np.array(form_info_2)[np.newaxis, :]
model_input3 = np.array(form_info_3)[np.newaxis, :]

# Predicted class and class probabilities for each example. The *_3 names
# are read again by the final comparison cell.
kn_result1 = loaded_model4.predict(model_input1)
kn_prob1 = loaded_model4.predict_proba(model_input1)

kn_result2 = loaded_model4.predict(model_input2)
kn_prob2 = loaded_model4.predict_proba(model_input2)

kn_result3 = loaded_model4.predict(model_input3)
kn_prob3 = loaded_model4.predict_proba(model_input3)

# Fixed: the third probability line used to be labelled
# "KNClassifier Middle:", identical to the prediction line above it.
kn_cases = (
    ("High Loan Low Income", kn_result1, kn_prob1),
    ("Low Loan High Income", kn_result2, kn_prob2),
    ("Middle", kn_result3, kn_prob3),
)
for case_name, case_pred, case_prob in kn_cases:
    print(f"KNClassifier {case_name}: {case_pred}")
    print(f"KNClassifier Predict Probability: {case_prob}")

KNClassifier High Loan Low Income: [1]
KNClassifier Predict Probability: [[0.2 0.8]]
KNClassifier Low Loan High Income: [1]
KNClassifier Predict Probability: [[0.13333333 0.86666667]]
KNClassifier Middle: [1]
KNClassifier Middle: [[0.2 0.8]]


In [18]:
# Test Random Forest vs dataset
# SECURITY: pickle.load can execute arbitrary code while unpickling; only
# load model files that come from a trusted source.
# The context manager closes the file handle, which the previous
# pickle.load(open(...)) form left open.
with open('Resources/rf_classifier.pkl', 'rb') as model_file:
    loaded_model5 = pickle.load(model_file)

# Mean accuracy of the loaded model on the encoded features and labels.
rf_result = loaded_model5.score(X_dummies, y)
print(rf_result)



0.892255188564043


In [20]:
# Test Random Forest vs examples
# scikit-learn estimators take a 2-D (n_samples, n_features) array, so each
# form vector is turned into a single-row matrix.
model_input1 = np.array(form_info_1)[np.newaxis, :]
model_input2 = np.array(form_info_2)[np.newaxis, :]
model_input3 = np.array(form_info_3)[np.newaxis, :]

# Predicted class and class probabilities for each example. The *_3 names
# are read again by the final comparison cell.
rf_result1 = loaded_model5.predict(model_input1)
rf_prob1 = loaded_model5.predict_proba(model_input1)

rf_result2 = loaded_model5.predict(model_input2)
rf_prob2 = loaded_model5.predict_proba(model_input2)

rf_result3 = loaded_model5.predict(model_input3)
rf_prob3 = loaded_model5.predict_proba(model_input3)

# Fixed: the third probability line used to be labelled
# "Random Forest Middle:", identical to the prediction line above it.
rf_cases = (
    ("High Loan Low Income", rf_result1, rf_prob1),
    ("Low Loan High Income", rf_result2, rf_prob2),
    ("Middle", rf_result3, rf_prob3),
)
for case_name, case_pred, case_prob in rf_cases:
    print(f"Random Forest {case_name}: {case_pred}")
    print(f"Random Forest Predict Probability: {case_prob}")

Random Forest High Loan Low Income: [1]
Random Forest Predict Probability: [[0.275 0.725]]
Random Forest Low Loan High Income: [1]
Random Forest Predict Probability: [[0.22 0.78]]
Random Forest Middle: [1]
Random Forest Middle: [[0.225 0.775]]


In [21]:
# Side-by-side view of the .score() results computed above for each model.
# NOTE(review): in the recorded run K Neighbors and Ada Boost report the
# exact same score; worth checking that this is not an artefact (e.g. both
# predicting a single class) — unverified.
accuracy_by_model = {
    'Logistic Regression': lr_result,
    'SVC': svc_result,
    'K Neighbors': kn_result,
    'Ada Boost': ab_result,
    'Random Forest': rf_result,
}
for model_name, model_accuracy in accuracy_by_model.items():
    print(f'{model_name}: {model_accuracy}')


Logistic Regression: 0.8544712492308324
SVC: 0.8923311146272427
K Neighbors: 0.8931082911460819
Ada Boost: 0.8931082911460819
Random Forest: 0.892255188564043


In [22]:
# How much agreement is there between the models on the "middle" example?
# Reprints the prediction (and probabilities where available) that each
# model produced for form_info_3 in the cells above.
print(f"Random Forest Middle: {rf_result3}")
print(f"Random Forest Prob Middle: {rf_prob3}")

print(f"KNClassifier Middle: {kn_result3}")
print(f"KNClassifier Prob Middle: {kn_prob3}")

print(f"Ada Boost Middle: {ab_result3}")
# Fixed copy-paste label: ab_prob3 holds the Ada Boost probabilities but was
# printed under "KNClassifier Prob Middle".
print(f"Ada Boost Prob Middle: {ab_prob3}")

# No probability line for the SVC: none was computed for it above.
print(f"SVC Middle: {svc_result3}")

print(f"Logistic Regression Middle: {lr_result3}")
print(f"Logistic Regression Prob Middle: {lr_prob3}")

Random Forest Middle: [1]
Random Forest Prob Middle: [[0.225 0.775]]
KNClassifier Middle: [1]
KNClassifier Prob Middle: [[0.2 0.8]]
Ada Boost Middle: [1]
KNClassifier Prob Middle: [[0.44391522 0.55608478]]
SVC Middle: [1]
Logistic Regression Middle: [1]
Logistic Regression Prob Middle: [[0.16512952 0.83487048]]
