In [1]:
# --- Data manipulation ---
import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# so anything later cells would warn about (deprecations, failed or
# non-converged fits, ...) is hidden from the reader.
warnings.filterwarnings('ignore')

# --- Defined Functions ---
import sys
# Put the project directory on the import path so the two helper modules below
# can be imported.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine unless the same directory exists there.
sys.path.append('/home/adedapo/code/roski10/Project_Mortgages/')
from dapo_clean_data import clean_data
from dapo_preprocessing import preprocess_and_resample

from sklearn import set_config

# --- Data Modeling ---
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFECV

# Clean Data

In [2]:
# Location of the raw CSV, relative to the notebook's working directory.
file = "data/Washington_State_HDMA-2016.csv"

# Load the raw table; ',' is used as the decimal separator while parsing numbers.
# NOTE(review): confirm the file really uses ',' decimals - if it uses '.',
# numeric columns will not be parsed as floats here.
data_original = pd.read_csv(
    filepath_or_buffer=file,
    decimal=',',
)

In [3]:
# Run the project's cleaning helper on the raw frame and keep the result as `data`.
data = clean_data(data_original)

# Preprocessing

In [4]:
# Project helper that returns train/test features and targets in a single call.
# NOTE(review): judging by its name and the balanced class counts displayed for
# y_train, it presumably resamples the training split - confirm in
# dapo_preprocessing, and check that the test split is left untouched.
X_train, X_test, y_train, y_test = preprocess_and_resample(data)

In [5]:
# Dimensions (rows, columns) of the training features.
X_train.shape

(310870, 67)

In [6]:
# Dimensions (rows, columns) of the test features.
X_test.shape

(84968, 67)

In [7]:
# Number of training rows per target class.
y_train.value_counts()

1.0    155435
0.0    155435
Name: ordinalencoder__loan_status, dtype: int64

In [8]:
# Peek at the first two rows of the training features.
X_train.head(2)

Unnamed: 0,pipeline__tract_to_msamd_income,pipeline__population,pipeline__minority_population,pipeline__number_of_owner_occupied_units,pipeline__number_of_1_to_4_family_units,pipeline__loan_amount_000s,pipeline__hud_median_family_income,pipeline__applicant_income_000s,onehotencoder__property_type_name_Manufactured housing,onehotencoder__property_type_name_Multifamily dwelling,...,onehotencoder__agency_name_Department of Housing and Urban Development,onehotencoder__agency_name_Federal Deposit Insurance Corporation,onehotencoder__agency_name_Federal Reserve System,onehotencoder__agency_name_National Credit Union Administration,onehotencoder__agency_name_Office of the Comptroller of the Currency,onehotencoder__region_Eastern Washington,onehotencoder__region_Northern Cascades,onehotencoder__region_Olympic Peninsula,onehotencoder__region_Southwest Washington,onehotencoder__region_Western Region
0,-1.446076,0.181832,1.0039,-0.246196,-0.480725,-0.030033,1.049424,-0.257218,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.81496,-0.872252,-0.888183,-1.073426,-1.039653,-0.075359,-0.25859,-0.081002,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Creating a Model

In [9]:
# Disabled baseline: a LogisticRegression with default settings, fitted on the
# training split and scored by accuracy on the test split. The whole cell is
# commented out, so running it does nothing.
# NOTE(review): the output stored with this cell and the accuracy noted in the
# last comment differ, so they come from different runs.
# model = LogisticRegression()

# # Train model on training set
# model.fit(X_train, y_train)

# # Make predictions on test set
# y_pred = model.predict(X_test)

# # Evaluate model performance
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy}")

# # Accuracy: 0.6159730722154223

Accuracy: 0.6129719423783071


## Simple Logistic Regression

In [10]:
# Tune a logistic regression over regularisation strength, penalty and solver.
# random_state pins the data shuffling performed by the 'liblinear' and 'saga'
# solvers so repeated runs give the same scores.
model = LogisticRegression(random_state=42)

# One sub-grid per solver family, because not every solver supports every
# penalty: 'lbfgs' only handles 'l2'. Crossing it with 'l1' makes those fits
# fail, and with warnings silenced the failures go unnoticed.
param_grid = [
    {
        'C': [0.01, 0.1, 1],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
    },
    {
        'C': [0.01, 0.1, 1],
        'penalty': ['l2'],
        'solver': ['lbfgs'],
    },
]

# Instantiate Grid Search (2-fold cross-validation, default accuracy scoring)
grid = GridSearchCV(model, param_grid, cv=2)

# Fit data to Grid Search
grid.fit(X_train, y_train)

# Print the best hyperparameter value
print('Best hyperparameter value: ', grid.best_params_)

# Print the mean cross-validation score for the best hyperparameter value
print('Mean cross-validation score: ', grid.best_score_)

print('Best estimator value: ', grid.best_estimator_)

# Result recorded from an earlier run (before the grid was split per solver):
# Best hyperparameter value:  {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
# Mean cross-validation score:  0.6414771447872101
# Best estimator value:  LogisticRegression(C=0.01, solver='liblinear')

Best hyperparameter value:  {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
Mean cross-validation score:  0.6414771447872101
Best estimator value:  LogisticRegression(C=0.01, solver='liblinear')


## RandomForestClassifier

In [11]:
# Random forest using the settings that the grid search in this section reports
# as best (max_depth=30, max_features='sqrt', n_estimators=200).
rfc = RandomForestClassifier(
    max_depth=30,
    max_features='sqrt',
    n_estimators=200,
    # Fixed seed: bootstrap samples and feature draws are otherwise different on
    # every run, which is why recorded accuracies for this cell disagree.
    random_state=42,
    # Build trees on all cores; with a fixed seed this does not change the result.
    n_jobs=-1,
)

# Train the classifier
rfc.fit(X_train, y_train)

# Predict the test data
y_pred = rfc.predict(X_test)

# Calculate the accuracy score on the held-out split
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Accuracy recorded from an earlier, unseeded run: 0.7526833631484794

Accuracy: 0.7502706901421712


In [11]:
# Base estimator for the search. The seed makes each candidate forest
# reproducible, so the ranking of parameter combinations is stable across runs.
# NOTE: this rebinds `rfc` to an unfitted estimator (GridSearchCV fits clones);
# use grid_search.best_estimator_ for the tuned, fitted model.
rfc = RandomForestClassifier(random_state=42)

# Define the parameter grid (3 x 3 x 2 = 18 candidates)
param_grid2 = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'max_features': ['sqrt', 'log2']
}

# Create a grid search object (2-fold CV); candidates are evaluated in parallel,
# which affects only the run time, not the scores.
grid_search = GridSearchCV(rfc, param_grid2, cv=2, n_jobs=-1)

# Fit the grid search object to the training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameter value
print('Best hyperparameter value: ', grid_search.best_params_)

# Print the mean cross-validation score for the best hyperparameter value
print('Mean cross-validation score: ', grid_search.best_score_)

# Result recorded from an earlier, unseeded run:
# Best hyperparameter value:  {'max_depth': 30, 'max_features': 'sqrt', 'n_estimators': 200}
# Mean cross-validation score:  0.8069353749155596

Best hyperparameter value:  {'max_depth': 30, 'max_features': 'sqrt', 'n_estimators': 200}
Mean cross-validation score:  0.8069353749155596


## XGBoost

In [7]:
# Gradient-boosted trees with the binary logistic objective and otherwise
# default hyperparameters, built and fitted on the training split in one step.
xgb_clf = XGBClassifier(objective='binary:logistic').fit(X_train, y_train)

# Score the fitted model on the held-out split.
y_pred = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

# Accuracy: 0.7641818096224461

Accuracy: 0.7250023538273233


In [13]:
# Unfitted base estimator handed to the search.
xgb_clf = XGBClassifier(objective='binary:logistic')

# Search space: learning rate, number of boosting rounds, L1 (reg_alpha) and
# L2 (reg_lambda) regularisation weights.
parameters = dict(
    learning_rate=[0.1, 0.3],
    n_estimators=[50, 100, 200],
    reg_alpha=[0.0, 0.1, 0.5],
    reg_lambda=[0.0, 0.1, 0.5],
)

# Exhaustive search with 2-fold cross-validation on the training data.
grid_s = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=2)
grid_s.fit(X_train, y_train)

# Report the winning combination and its mean cross-validation score.
print('Best hyperparameter value: ', grid_s.best_params_)
print('Mean cross-validation score: ', grid_s.best_score_)

# Best hyperparameter value:  {'learning_rate': 0.3, 'n_estimators': 200, 'reg_alpha': 0.1, 'reg_lambda': 0.5}
# Mean cross-validation score:  0.80176279473735

Best hyperparameter value:  {'learning_rate': 0.3, 'n_estimators': 200, 'reg_alpha': 0.1, 'reg_lambda': 0.5}
Mean cross-validation score:  0.80176279473735


## K-Nearest Neighbors (KNN)

In [9]:
# k-nearest-neighbours classifier with default settings, built and fitted on the
# training split in one step.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Score the fitted model on the held-out split.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

# Accuracy: 0.6301313435646361

Accuracy: 0.6310022596742303


In [None]:
# Create KNN model (base estimator for the search)
knn = KNeighborsClassifier(n_neighbors=5)

# Define the parameter grid.
# Only parameters that change the fitted decision rule are searched:
#   n_neighbors - how many neighbours vote
#   weights     - uniform vs. distance-weighted votes
#   p           - Minkowski distance order
# 'algorithm' and 'leaf_size' are deliberately left at their defaults: they
# select how neighbours are looked up (speed/memory), not which neighbours are
# found, so searching over them multiplies the number of fits (288 vs. 24
# candidates) without producing different scores.
params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3]
}

# Create a grid search object (2-fold CV); candidates are evaluated in parallel
# because every KNN prediction scans the large training set.
grid_se = GridSearchCV(knn, params, cv=2, n_jobs=-1)

# Fit the grid search object to the training data
grid_se.fit(X_train, y_train)

# Print the best hyperparameter value
print('Best hyperparameter value: ', grid_se.best_params_)

# Print the mean cross-validation score for the best hyperparameter value
print('Mean cross-validation score: ', grid_se.best_score_)