In [1]:
# --- Data Manipulation ---
import pandas as pd

# --- Defined Functions ---
import sys
sys.path.append('/home/adedapo/code/roski10/Project_Mortgages/ml_logic/')

from clean_data import clean_data
from preprocessing import preprocess_and_resample

import warnings
warnings.filterwarnings('ignore')

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Location of the raw CSV; this is a relative path, so it is resolved against
# the kernel's current working directory.
file = "raw_data/Washington_State_HDMA-2016.csv"
# decimal=',' tells pandas to treat a comma as the decimal separator when
# parsing numeric fields.
# NOTE(review): confirm the file really uses comma decimals.
data_original = pd.read_csv(file, decimal=',')

In [3]:
# Pass the raw frame through the project's clean_data helper (imported from the
# local clean_data module) and bind the result to a new name.
# NOTE(review): whether clean_data also mutates data_original in place is not
# visible from this notebook - confirm in clean_data.py.
data = clean_data(data_original)


✅ data cleaned


In [4]:
# Hand the cleaned data to the project's preprocess_and_resample helper and
# unpack its four return values as train/test features and targets.
# NOTE(review): the names the helper prints (X_train_sm, y_train_sm) suggest
# that only the training split is resampled - confirm in preprocessing.py.
X_train, X_test, y_train, y_test = preprocess_and_resample(data)


✅ X_train_sm, with shape (311114, 67)
✅ X_test_processed, with shape (84968, 67)
✅ y_train_sm, with shape (311114,)
✅ y_test, with shape (84968,)


In [5]:
# Hyperparameters for the boosted-tree binary classifier, gathered in one
# place so they can be read and tuned at a glance.
xgb_params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.3,
    'n_estimators': 200,
    'reg_alpha': 0.1,   # L1 regularisation term on the weights
    'reg_lambda': 0.5,  # L2 regularisation term on the weights
}
xgb_clf = XGBClassifier(**xgb_params)

# Fit on the training split.
xgb_clf.fit(X_train, y_train)

# Score the fitted model on the test split and report plain accuracy.
y_pred = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.7888852273797194


In [6]:
# Importance scores exposed by the fitted model, one per training column.
importances = xgb_clf.feature_importances_

# Pair each column name with its score and rank the pairs, highest score first.
feature_importances = sorted(
    zip(X_train.columns, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Report the ranking, one "name: score" line per feature.
for name, score in feature_importances:
    print(f"{name}: {score}")

applicant_sex_name_Male: 0.06180758774280548
lien_status_name_Secured by a first lien: 0.04786307364702225
loan_purpose_name_Home purchase: 0.04489203914999962
applicant_race_name_1_White: 0.04250352084636688
loan_type_name_FHA-insured: 0.040145840495824814
agency_name_Federal Deposit Insurance Corporation: 0.03446587547659874
co_applicant_race_name_1_White: 0.034317124634981155
preapproval_name_Preapproval was not requested: 0.0335385724902153
agency_name_Consumer Financial Protection Bureau: 0.03201712295413017
hud_median_family_income: 0.03110465221107006
loan_purpose_name_Home improvement: 0.02994738146662712
agency_name_Department of Housing and Urban Development: 0.029164090752601624
applicant_income_000s: 0.026687661185860634
owner_occupancy_name_Not owner-occupied as a principal dwelling: 0.025766927748918533
property_type_name_One-to-four family dwelling (other than manufactured housing): 0.024369705468416214
applicant_ethnicity_name_Hispanic or Latino: 0.02433655597269535
loa

In [7]:
from sklearn.inspection import permutation_importance

# Permutation importance on the test split: each feature column is shuffled
# n_repeats times and the resulting drop in the model's score is recorded.
# A fixed random_state makes the shuffles, and therefore the reported ranking,
# reproducible across kernel restarts.
result = permutation_importance(
    xgb_clf, X_test, y_test, n_repeats=10, random_state=42
)

# Size the name column from the longest feature name (plus a two-space gutter)
# so that long one-hot column names no longer run into the numbers.
feature_names = X_train.columns
name_width = max((len(str(name)) for name in feature_names), default=0) + 2

# Print features from most to least important as "name  mean +/- std".
for i in result.importances_mean.argsort()[::-1]:
    print(f"{feature_names[i]:<{name_width}}"
          f"{result.importances_mean[i]:.3f}"
          f" +/- {result.importances_std[i]:.3f}")

applicant_income_000s         0.084 +/- 0.001
number_of_owner_occupied_units0.073 +/- 0.001
number_of_1_to_4_family_units 0.061 +/- 0.001
loan_amount_000s              0.041 +/- 0.001
hud_median_family_income      0.027 +/- 0.001
loan_purpose_name_Home purchase0.025 +/- 0.001
tract_to_msamd_income         0.014 +/- 0.001
population                    0.014 +/- 0.001
region_Southwest Washington   0.014 +/- 0.000
minority_population           0.012 +/- 0.001
agency_name_Department of Housing and Urban Development0.009 +/- 0.001
loan_type_name_Conventional   0.003 +/- 0.000
loan_type_name_FHA-insured    0.002 +/- 0.000
lien_status_name_Secured by a first lien0.002 +/- 0.000
lien_status_name_Secured by a subordinate lien0.002 +/- 0.000
owner_occupancy_name_Not owner-occupied as a principal dwelling0.001 +/- 0.000
property_type_name_Manufactured housing0.001 +/- 0.000
region_Western Region         0.001 +/- 0.000
applicant_sex_name_Male       0.001 +/- 0.000
region_Eastern Washington     0.