In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
import joblib
import os

In [None]:
# Read the pre-split feature frames and label frames from ../data/processed.
X_train, X_test = (
    pd.read_csv('../data/processed/X_train.csv'),
    pd.read_csv('../data/processed/X_test.csv'),
)
y_train, y_test = (
    pd.read_csv('../data/processed/y_train.csv'),
    pd.read_csv('../data/processed/y_test.csv'),
)

# Quick sanity check: (rows, columns) of the two feature frames.
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (25932, 26), Test shape: (6484, 26)


In [None]:
# Inspect Processed Data
# X_train / X_test were already read from ../data/processed by the preceding
# load cell, so the four CSVs are not parsed a second time here.
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# Check data types
print("X_train dtypes:")
print(X_train.dtypes)

# Show up to five distinct values from each of the first five columns, to see
# what the feature values look like after preprocessing.
print("Unique values in first few columns:")
for col in X_train.columns[:5]:
    print(f"{col}: {X_train[col].unique()[:5]}")

Train shape: (25932, 26), Test shape: (6484, 26)
X_train dtypes:
person_age                           float64
person_income                        float64
person_emp_length                    float64
loan_amnt                            float64
loan_int_rate                        float64
loan_percent_income                  float64
cb_person_cred_hist_length           float64
debt_to_income                       float64
loan_grade_encoded                     int64
cb_person_default_on_file_encoded      int64
person_home_ownership_OTHER             bool
person_home_ownership_OWN               bool
person_home_ownership_RENT              bool
loan_intent_EDUCATION                   bool
loan_intent_HOMEIMPROVEMENT             bool
loan_intent_MEDICAL                     bool
loan_intent_PERSONAL                    bool
loan_intent_VENTURE                     bool
age_group_26-35                         bool
age_group_36-45                         bool
age_group_46-55                    

## Handle Class Imbalance with Class Weights

In [None]:
# Compute class weights
# 'balanced' gives each class n_samples / (n_classes * count(class)), so the
# less frequent class receives the larger weight.
y_train_labels = y_train.values.ravel()
class_labels = np.unique(y_train_labels)  # computed once, reused for the dict keys
class_weights = compute_class_weight('balanced', classes=class_labels, y=y_train_labels)

# .tolist() converts NumPy scalars to built-in int/float so the mapping prints
# as plain numbers instead of np.int64(...)/np.float64(...) wrappers; the keys
# still compare and hash equal to the NumPy labels the estimators look up.
class_weight_dict = dict(zip(class_labels.tolist(), class_weights.tolist()))
print("Class weights:", class_weight_dict)

Class weights: {np.int64(0): np.float64(0.6399486698583485), np.int64(1): np.float64(2.2863692470463763)}


## Baseline Model

In [None]:
# Baseline: class-weighted logistic regression fitted on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
baseline_model = LogisticRegression(
    class_weight=class_weight_dict,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train.values.ravel())

# Hard labels, plus the probability in column 1 (the second class) for ROC AUC.
y_pred_baseline = baseline_model.predict(X_test)
y_pred_proba_baseline = baseline_model.predict_proba(X_test)[:, 1]

# Per-class precision/recall/F1 first, then the threshold-free ROC AUC.
print("Baseline Model - Logistic Regression")
print(classification_report(y_test, y_pred_baseline))
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba_baseline)}")

Baseline Model - Logistic Regression
              precision    recall  f1-score   support

           0       0.93      0.80      0.86      5066
           1       0.53      0.80      0.64      1418

    accuracy                           0.80      6484
   macro avg       0.73      0.80      0.75      6484
weighted avg       0.85      0.80      0.82      6484

ROC AUC: 0.8742035317170194


## Random Forest Model

In [None]:
# Untuned random forest (library defaults) with the same class weights as the baseline.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_model = RandomForestClassifier(
    class_weight=class_weight_dict,
    random_state=42,
).fit(X_train, y_train.values.ravel())

# Hard labels, plus the probability in column 1 (the second class) for ROC AUC.
y_pred_rf = rf_model.predict(X_test)
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Per-class precision/recall/F1 first, then the threshold-free ROC AUC.
print("Random Forest Model")
print(classification_report(y_test, y_pred_rf))
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba_rf)}")

Random Forest Model
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      5066
           1       0.97      0.72      0.83      1418

    accuracy                           0.93      6484
   macro avg       0.95      0.86      0.89      6484
weighted avg       0.94      0.93      0.93      6484

ROC AUC: 0.9323473172459222


## Hyperparameter Tuning

In [None]:
# Exhaustive search over 2 x 3 x 2 = 12 random-forest configurations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
}

# Each configuration is scored by ROC AUC with 3-fold cross-validation;
# n_jobs=-1 lets the search run in parallel on all available cores.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(class_weight=class_weight_dict, random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
).fit(X_train, y_train.values.ravel())

# Keep the best estimator for the evaluation cell, then report its settings.
best_rf_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

Best parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}


In [None]:
# Score the tuned forest on the test split.
# Column 1 of predict_proba (the second class) feeds ROC AUC; predict() gives hard labels.
y_pred_proba_best = best_rf_model.predict_proba(X_test)[:, 1]
y_pred_best = best_rf_model.predict(X_test)

# Per-class precision/recall/F1 first, then the threshold-free ROC AUC.
print("Best Random Forest Model")
print(classification_report(y_test, y_pred_best))
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba_best)}")

Best Random Forest Model
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      5066
           1       0.97      0.72      0.82      1418

    accuracy                           0.93      6484
   macro avg       0.95      0.86      0.89      6484
weighted avg       0.94      0.93      0.93      6484

ROC AUC: 0.9344456419271261


## Save Model

In [None]:
# Persist the tuned forest with joblib.
# exist_ok=True makes the directory creation a no-op when ../models already exists.
os.makedirs(name='../models', exist_ok=True)
joblib.dump(value=best_rf_model, filename='../models/best_model.pkl')
print("Model saved.")

Model saved.
