In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import numpy as np
from numpy import savetxt

In [2]:
df=pd.read_csv("resources/medium_loans_no_geo.csv")

In [3]:
retypes={'agency_code':'str'
,'loan_type':'str'
,'property_type':'str'
,'loan_amount_000s':'int64'
,'preapproval':'str'
,'action_taken':'int8'
,'applicant_ethnicity':'str'
,'co_applicant_ethnicity':'str'
,'applicant_race_1':'str'
,'co_applicant_race_1':'str'
,'applicant_sex':'str'
,'co_applicant_sex':'str'
,'applicant_income_000s':'int64'
,'purchaser_type':'str'}

In [4]:
# categorical fields to strings for one-hot encoding
df = df.astype(retypes)


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Unnamed: 0              500000 non-null  int64 
 1   agency_code             500000 non-null  object
 2   loan_type               500000 non-null  object
 3   property_type           500000 non-null  object
 4   loan_purpose            500000 non-null  int64 
 5   loan_amount_000s        500000 non-null  int64 
 6   preapproval             500000 non-null  object
 7   action_taken            500000 non-null  int8  
 8   applicant_ethnicity     500000 non-null  object
 9   co_applicant_ethnicity  500000 non-null  object
 10  applicant_race_1        500000 non-null  object
 11  co_applicant_race_1     500000 non-null  object
 12  applicant_sex           500000 non-null  object
 13  co_applicant_sex        500000 non-null  object
 14  applicant_income_000s   500000 non-n

In [30]:
X = df.drop(columns=['action_taken','Unnamed: 0'])
X

Unnamed: 0,loan_type,property_type,loan_amount_000s,preapproval,applicant_ethnicity,co_applicant_ethnicity,applicant_race_1,co_applicant_race_1,applicant_sex,co_applicant_sex,applicant_income_000s
0,1,2,84,3,3,5,5,8,2,5,30
1,1,1,533,3,2,2,5,5,1,2,287
2,1,1,307,3,2,5,5,8,1,5,126
3,1,1,258,2,2,5,3,8,2,5,49
4,1,1,288,2,2,3,6,6,1,2,191
...,...,...,...,...,...,...,...,...,...,...,...
499995,1,1,171,2,2,2,4,5,1,2,120
499996,2,1,134,2,2,5,5,8,2,5,78
499997,1,1,84,3,2,5,5,8,2,5,42
499998,2,1,73,3,2,2,5,5,1,2,25


In [31]:
X_dummies = pd.get_dummies(X)
print(X_dummies.columns)
X_dummies

Index(['loan_amount_000s', 'applicant_income_000s', 'loan_type_1',
       'loan_type_2', 'loan_type_3', 'loan_type_4', 'property_type_1',
       'property_type_2', 'preapproval_1', 'preapproval_2', 'preapproval_3',
       'applicant_ethnicity_1', 'applicant_ethnicity_2',
       'applicant_ethnicity_3', 'applicant_ethnicity_4',
       'co_applicant_ethnicity_1', 'co_applicant_ethnicity_2',
       'co_applicant_ethnicity_3', 'co_applicant_ethnicity_4',
       'co_applicant_ethnicity_5', 'applicant_race_1_1', 'applicant_race_1_2',
       'applicant_race_1_3', 'applicant_race_1_4', 'applicant_race_1_5',
       'applicant_race_1_6', 'applicant_race_1_7', 'co_applicant_race_1_1',
       'co_applicant_race_1_2', 'co_applicant_race_1_3',
       'co_applicant_race_1_4', 'co_applicant_race_1_5',
       'co_applicant_race_1_6', 'co_applicant_race_1_7',
       'co_applicant_race_1_8', 'applicant_sex_1', 'applicant_sex_2',
       'applicant_sex_3', 'applicant_sex_4', 'co_applicant_sex_1',
       'c

Unnamed: 0,loan_amount_000s,applicant_income_000s,loan_type_1,loan_type_2,loan_type_3,loan_type_4,property_type_1,property_type_2,preapproval_1,preapproval_2,...,co_applicant_race_1_8,applicant_sex_1,applicant_sex_2,applicant_sex_3,applicant_sex_4,co_applicant_sex_1,co_applicant_sex_2,co_applicant_sex_3,co_applicant_sex_4,co_applicant_sex_5
0,84,30,1,0,0,0,0,1,0,0,...,1,0,1,0,0,0,0,0,0,1
1,533,287,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,307,126,1,0,0,0,1,0,0,0,...,1,1,0,0,0,0,0,0,0,1
3,258,49,1,0,0,0,1,0,0,1,...,1,0,1,0,0,0,0,0,0,1
4,288,191,1,0,0,0,1,0,0,1,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,171,120,1,0,0,0,1,0,0,1,...,0,1,0,0,0,0,1,0,0,0
499996,134,78,0,1,0,0,1,0,0,1,...,1,0,1,0,0,0,0,0,0,1
499997,84,42,1,0,0,0,1,0,0,0,...,1,0,1,0,0,0,0,0,0,1
499998,73,25,0,1,0,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [32]:
# Split our preprocessed data into our features and target arrays

y = df['action_taken']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, random_state=42)

In [33]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [45]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=500)
classifier

LogisticRegression(max_iter=500)

In [46]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=500)

In [47]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8725546666666667
Testing Data Score: 0.87264
