In [1]:
# --- Data Manipulation ---
import pandas as pd

# --- Defined Functions ---
import sys
sys.path.append('/home/adedapo/code/roski10/Project_Mortgages/ml_logic/')

from clean_data import clean_data
from preprocessing import preprocess_and_resample

import warnings
warnings.filterwarnings('ignore')

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
file = "raw_data/Washington_State_HDMA-2016.csv"
data_original = pd.read_csv(file, decimal=',')

In [3]:
data = clean_data(data_original)


✅ data cleaned


In [4]:
X_train, X_test, y_train, y_test = preprocess_and_resample(data)


✅ X_train_sm, with shape (310356, 67)
✅ X_test, with shape (84968, 67)
✅ y_train_sm, with shape (310356,)
✅ y_test, with shape (84968,)


In [5]:
# Define the XGBoost classifier
xgb_clf = XGBClassifier(objective='binary:logistic', learning_rate= 0.3, n_estimators=200, reg_alpha=0.1, reg_lambda=0.5)

# Train the classifier
xgb_clf.fit(X_train, y_train)

# Predict the test data
y_pred = xgb_clf.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.7894030693908295


In [6]:
# Get feature importances
importances = xgb_clf.feature_importances_

# Create a list of (feature, importance) tuples and sort it in descending order by importance
feature_importances = list(zip(X_train.columns, importances))
feature_importances.sort(key=lambda x: x[1], reverse=True)

# Print feature importances in order of importance
for feature, importance in feature_importances:
    print(f"{feature}: {importance}")

onehotencoder__applicant_sex_name_Male: 0.06484796851873398
onehotencoder__loan_purpose_name_Home purchase: 0.043750178068876266
onehotencoder__lien_status_name_Secured by a first lien: 0.04297970235347748
onehotencoder__agency_name_Consumer Financial Protection Bureau: 0.04018291085958481
onehotencoder__loan_type_name_FHA-insured: 0.03470155596733093
onehotencoder__preapproval_name_Preapproval was not requested: 0.033214271068573
onehotencoder__loan_type_name_VA-guaranteed: 0.03239839896559715
onehotencoder__applicant_race_name_1_White: 0.03192327916622162
onehotencoder__co_applicant_race_name_1_White: 0.03160567209124565
onehotencoder__agency_name_Department of Housing and Urban Development: 0.03128965198993683
onehotencoder__loan_purpose_name_Home improvement: 0.030304670333862305
onehotencoder__agency_name_Federal Deposit Insurance Corporation: 0.028848225250840187
pipeline__hud_median_family_income: 0.028486331924796104
onehotencoder__property_type_name_Manufactured housing: 0.027

In [8]:
from sklearn.inspection import permutation_importance

# Compute feature importance with permutation
result = permutation_importance(xgb_clf, X_test, y_test, n_repeats=10)

# Print feature importance
for i in result.importances_mean.argsort()[::-1]:
    print(f"{X_train.columns[i]:<30}"
          f"{result.importances_mean[i]:.3f}"
          f" +/- {result.importances_std[i]:.3f}")

pipeline__applicant_income_000s0.081 +/- 0.002
pipeline__number_of_owner_occupied_units0.046 +/- 0.001
pipeline__loan_amount_000s    0.037 +/- 0.001
onehotencoder__loan_purpose_name_Home purchase0.032 +/- 0.001
pipeline__number_of_1_to_4_family_units0.027 +/- 0.001
pipeline__hud_median_family_income0.021 +/- 0.001
pipeline__tract_to_msamd_income0.020 +/- 0.001
pipeline__minority_population 0.014 +/- 0.000
pipeline__population          0.013 +/- 0.000
onehotencoder__loan_type_name_Conventional0.009 +/- 0.000
onehotencoder__agency_name_Department of Housing and Urban Development0.009 +/- 0.001
onehotencoder__region_Southwest Washington0.008 +/- 0.000
onehotencoder__loan_type_name_FHA-insured0.003 +/- 0.000
onehotencoder__applicant_sex_name_Male0.003 +/- 0.000
onehotencoder__lien_status_name_Secured by a first lien0.003 +/- 0.000
onehotencoder__loan_type_name_VA-guaranteed0.002 +/- 0.000
onehotencoder__applicant_sex_name_Female0.002 +/- 0.000
onehotencoder__region_Eastern Washington0.002 