In [1]:
# --- Data Manipulation ---
import pandas as pd

# --- Defined Functions ---
import sys
sys.path.append('/home/adedapo/code/roski10/Project_Mortgages/')

from dapo_clean_data import clean_data
from dapo_preprocessing import preprocess_and_resample

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV

In [2]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [3]:
file = "data/Washington_State_HDMA-2016.csv"
data_original = pd.read_csv(file, decimal=',')

In [4]:
data = clean_data(data_original)

In [5]:
X_train, X_test, y_train, y_test = preprocess_and_resample(data)

In [6]:
X_train.shape

(310588, 67)

In [7]:
y_train.value_counts()

0.0    155294
1.0    155294
Name: ordinalencoder__loan_status, dtype: int64

In [8]:
# # Train a logistic regression model
# clf = LogisticRegression(random_state=0).fit(X_train, y_train)

# # Get the coefficients and feature names
# coef = clf.coef_[0]
# feature_names = X_train.columns.tolist()

# # Create a list of tuples containing the feature name and coefficient
# coef_tuples = list(zip(feature_names, coef))

# # Sort the tuples by coefficient in descending order
# sorted_coef = sorted(coef_tuples, key=lambda x: abs(x[1]), reverse=True)

# # Print out the sorted list of feature names and coefficients
# for feature, coef in sorted_coef:
#     print(feature, ':', coef)

In [9]:
model = LogisticRegression(C=1.0, penalty ='l1',solver='liblinear', max_iter=10000)
model.fit(X_train, y_train)

In [10]:
from sklearn.metrics import accuracy_score

# Predict the test data
y_pred = model.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.6158436117126448


In [11]:
# Get the coefficients (weights) of the model
coefficients = model.coef_
    
# Create a dataframe of feature names and their corresponding coefficients
feature_importance = pd.DataFrame(list(zip(X_train.columns, coefficients[0])), columns=['Feature', 'Coefficient'])

# Sort the features by their absolute coefficient values in descending order
feature_importance['abs_coef'] = feature_importance['Coefficient'].abs()
feature_importance = feature_importance.sort_values('abs_coef', ascending=False)

# Print the top 10 features with the highest absolute coefficient values
print(feature_importance.head(20))

                                              Feature  Coefficient  abs_coef
27        onehotencoder__hoepa_status_name_HOEPA loan     2.309479  2.309479
24  onehotencoder__lien_status_name_Not secured by...    -0.855443  0.855443
55  onehotencoder__applicant_ethnicity_name_Not ap...     0.741525  0.741525
43  onehotencoder__co_applicant_ethnicity_name_Not...     0.727007  0.727007
22     onehotencoder__loan_purpose_name_Home purchase     0.653415  0.653415
61  onehotencoder__agency_name_Office of the Compt...     0.622175  0.622175
58  onehotencoder__agency_name_Federal Deposit Ins...     0.595158  0.595158
8   onehotencoder__property_type_name_Manufactured...    -0.442022  0.442022
9   onehotencoder__property_type_name_Multifamily ...    -0.418025  0.418025
32  onehotencoder__co_applicant_sex_name_Not appli...    -0.401402  0.401402
23       onehotencoder__loan_purpose_name_Refinancing    -0.389400  0.389400
20        onehotencoder__loan_type_name_VA-guaranteed     0.362331  0.362331

In [12]:
from sklearn.inspection import permutation_importance

# Compute feature importance with permutation
result = permutation_importance(model, X_test, y_test, n_repeats=10)

# Print feature importance
for i in result.importances_mean.argsort()[::-1]:
    print(f"{X_train.columns[i]:<30}"
          f"{result.importances_mean[i]:.3f}"
          f" +/- {result.importances_std[i]:.3f}")

onehotencoder__co_applicant_sex_name_Female0.009 +/- 0.000
onehotencoder__co_applicant_sex_name_No co-applicant0.008 +/- 0.000
onehotencoder__co_applicant_race_name_1_White0.008 +/- 0.000
pipeline__hud_median_family_income0.007 +/- 0.001
onehotencoder__loan_purpose_name_Home purchase0.006 +/- 0.001
pipeline__tract_to_msamd_income0.004 +/- 0.000
onehotencoder__loan_type_name_VA-guaranteed0.003 +/- 0.000
pipeline__applicant_income_000s0.003 +/- 0.000
onehotencoder__agency_name_Federal Deposit Insurance Corporation0.002 +/- 0.000
onehotencoder__lien_status_name_Not secured by a lien0.002 +/- 0.000
onehotencoder__loan_type_name_FHA-insured0.002 +/- 0.000
onehotencoder__property_type_name_Manufactured housing0.002 +/- 0.000
onehotencoder__agency_name_National Credit Union Administration0.002 +/- 0.000
pipeline__population          0.001 +/- 0.001
onehotencoder__applicant_race_name_1_Asian0.001 +/- 0.000
onehotencoder__applicant_race_name_1_Black or African American0.001 +/- 0.000
onehotenco

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Define the RandomForest classifier
rfc = RandomForestClassifier(max_depth = 30, max_features = 'sqrt', n_estimators = 200)

# Train the classifier
rfc.fit(X_train, y_train)

# Predict the test data
y_pred = rfc.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.7487524715186894


In [14]:
from xgboost import XGBClassifier

# Define the XGBoost classifier
xgb_clf = XGBClassifier(objective='binary:logistic', learning_rate=0.1, max_depth=7, n_estimators=50)

# Train the classifier
xgb_clf.fit(X_train, y_train)

# Predict the test data
y_pred = xgb_clf.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.7257320402975238


In [15]:
from sklearn.ensemble import VotingClassifier

# Define the ensemble model
ensemble_model = VotingClassifier(estimators=[('rf', rfc), ('xgb', xgb_clf)], voting='soft')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
accuracy = ensemble_model.score(X_test, y_test)
print('Accuracy: ', accuracy)

NameError: name 'xgb_clf_clf' is not defined