In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Load processed data
X_train = pd.read_csv('../data/processed/X_train.csv')
X_test = pd.read_csv('../data/processed/X_test.csv')
y_train = pd.read_csv('../data/processed/y_train.csv').iloc[:, 0]
y_test = pd.read_csv('../data/processed/y_test.csv').iloc[:, 0]

In [10]:
# Train logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)


In [11]:
# Predict and evaluate
y_pred_log = log_reg.predict(X_test)
y_pred_prob_log = log_reg.predict_proba(X_test)[:, 1]

In [12]:
print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_log))
print("AUC Score:", roc_auc_score(y_test, y_pred_prob_log))

Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.87      1.00      0.93    394356
           1       0.37      0.00      0.01     57772

    accuracy                           0.87    452128
   macro avg       0.62      0.50      0.47    452128
weighted avg       0.81      0.87      0.81    452128

AUC Score: 0.6872732807304265


In [13]:
# Train random forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [15]:
# Predict and evaluate
y_pred_rf = rf_model.predict(X_test)
y_pred_prob_rf = rf_model.predict_proba(X_test)[:, 1]

In [16]:
print("Random Forest Results:")
print(classification_report(y_test, y_pred_rf))
print("AUC Score:", roc_auc_score(y_test, y_pred_prob_rf))

Random Forest Results:
              precision    recall  f1-score   support

           0       0.87      1.00      0.93    394356
           1       0.44      0.00      0.01     57772

    accuracy                           0.87    452128
   macro avg       0.66      0.50      0.47    452128
weighted avg       0.82      0.87      0.81    452128

AUC Score: 0.6893206083381063


In [17]:
# Feature importance
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print(feature_importance.head(10))

            Feature  Importance
1          int_rate    0.117918
11        revol_bal    0.113876
4               dti    0.113860
12       revol_util    0.109005
2       installment    0.104323
3        annual_inc    0.094651
13        total_acc    0.082881
0         loan_amnt    0.074206
9          open_acc    0.066226
7   fico_range_high    0.036033


In [19]:
# Save models
import joblib
joblib.dump(log_reg, '../models/logistic_regression.pkl')
joblib.dump(rf_model, '../models/random_forest.pkl')

['../models/random_forest.pkl']