In [1]:
from dataclasses import dataclass
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix
)
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score, cross_validate
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# ===========================
# CONFIG
# ===========================

@dataclass
class Config:
    """Settings for the credit-risk baseline run.

    Groups the data location, split/CV settings, model hyperparameters,
    per-loan economics and the decision-threshold grid in one place.
    """

    # --- Data ---
    # NOTE(review): absolute, machine-specific path; pass
    # Config(csv_path=...) when running on another machine.
    csv_path: str = "/Users/peekay/Downloads/Loan_default.csv"
    target: str = "Default"                    # name of the label column
    drop_cols: Tuple[str, ...] = ("LoanID",)   # drop identifiers if present
    test_size: float = 0.2                     # fraction held out for testing
    random_state: int = 42                     # seed shared by split, CV and models
    cv_folds: int = 5

    # --- Logistic Regression hyperparams ---
    lr_use_class_weight_balanced: bool = False
    lr_solver: str = "liblinear"   # "liblinear" or "lbfgs"
    lr_C: float = 1.0
    lr_max_iter: int = 2000

    # --- Random Forest toggle (optional) ---
    run_random_forest: bool = True
    rf_n_estimators: int = 400
    rf_max_depth: Optional[int] = None   # None leaves tree depth unlimited
    rf_min_samples_leaf: int = 1
    rf_class_weight_balanced: bool = False

    # --- Business economics (adjust as needed) ---
    # presumably (typical amount) * (rate); the factors are not explained
    # anywhere in this file -- TODO confirm their meaning.
    revenue_per_good: float = 125_000 * 0.13   # ~16,250 per good loan approved
    loss_per_default: float = 144_000 * 0.16   # ~23,040 per defaulted loan approved

    # --- Threshold sweep ---
    threshold_low: float = 0.05
    threshold_high: float = 0.95
    # 37 evenly spaced points over [0.05, 0.95] give a 0.025 step, which puts
    # 0.50 on the grid (its middle point).
    threshold_points: int = 37

# Module-level default configuration used by the script entry point.
CFG = Config()

# ===========================
# DATA & PREPROCESSING
# ===========================

def load_data(cfg: Config) -> pd.DataFrame:
    """Read the CSV at ``cfg.csv_path`` and discard identifier columns.

    Every name in ``cfg.drop_cols`` that exists in the file is removed;
    names that are not present are skipped without error.
    """
    frame = pd.read_csv(cfg.csv_path)
    # errors="ignore" skips labels that are absent, matching a
    # drop-only-if-present policy in a single call.
    return frame.drop(columns=list(cfg.drop_cols), errors="ignore")

def split_features(df: pd.DataFrame, target: str) -> Tuple[pd.DataFrame, pd.Series, List[str], List[str]]:
    """Separate the label from the predictors and group predictors by dtype.

    Returns ``(X, y, num_cols, cat_cols)``: ``y`` is the ``target`` column
    cast to int, ``X`` is every other column, ``num_cols`` names the numeric
    predictors and ``cat_cols`` names all remaining predictors, both in
    column order.
    """
    labels = df[target].astype(int)
    features = df.drop(columns=target)

    num_cols = list(features.select_dtypes(include=[np.number]).columns)
    numeric_names = set(num_cols)
    # Anything that is not numeric is treated as categorical.
    cat_cols = [name for name in features.columns if name not in numeric_names]

    return features, labels, num_cols, cat_cols

def build_preprocessor(num_cols: List[str], cat_cols: List[str]) -> ColumnTransformer:
    """Create the column-wise preprocessing applied ahead of every model.

    Numeric columns: median imputation followed by standard scaling.
    Categorical columns: most-frequent imputation followed by dense one-hot
    encoding; categories not seen during fit are ignored at transform time.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
    branches = [
        ("num", Pipeline(steps=numeric_steps), num_cols),
        ("cat", Pipeline(steps=categorical_steps), cat_cols),
    ]
    return ColumnTransformer(transformers=branches)

# ===========================
# MODELS
# ===========================

def build_logistic_pipeline(pre: ColumnTransformer, cfg: Config) -> Pipeline:
    """Assemble preprocessing + logistic regression as one pipeline.

    Args:
        pre: Preprocessor template. It is cloned, so the returned pipeline
            owns an independent, unfitted copy and ``pre`` is never mutated.
        cfg: Supplies the ``lr_*`` hyperparameters and ``random_state``.

    Returns:
        An unfitted ``Pipeline`` with steps ``"pre"`` and ``"clf"``.
    """
    class_weight = "balanced" if cfg.lr_use_class_weight_balanced else None
    lr = LogisticRegression(
        solver=cfg.lr_solver,
        C=cfg.lr_C,
        max_iter=cfg.lr_max_iter,
        class_weight=class_weight,
        # n_jobs is only passed for solvers other than liblinear.
        n_jobs=None if cfg.lr_solver == "liblinear" else -1,
        random_state=cfg.random_state
    )
    # Pipeline.fit fits its steps in place; cloning keeps pipelines built from
    # the same template from overwriting each other's fitted preprocessor.
    return Pipeline([("pre", clone(pre)), ("clf", lr)])

def build_rf_pipeline(pre: ColumnTransformer, cfg: Config) -> Pipeline:
    """Assemble preprocessing + random forest as one pipeline.

    Args:
        pre: Preprocessor template. It is cloned, so the returned pipeline
            owns an independent, unfitted copy and ``pre`` is never mutated.
        cfg: Supplies the ``rf_*`` hyperparameters and ``random_state``.

    Returns:
        An unfitted ``Pipeline`` with steps ``"pre"`` and ``"clf"``.
    """
    class_weight = "balanced" if cfg.rf_class_weight_balanced else None
    rf = RandomForestClassifier(
        n_estimators=cfg.rf_n_estimators,
        max_depth=cfg.rf_max_depth,
        min_samples_leaf=cfg.rf_min_samples_leaf,
        class_weight=class_weight,
        n_jobs=-1,
        random_state=cfg.random_state
    )
    # Pipeline.fit fits its steps in place; cloning keeps pipelines built from
    # the same template from overwriting each other's fitted preprocessor.
    return Pipeline([("pre", clone(pre)), ("clf", rf)])

# ===========================
# BUSINESS EVALUATION
# ===========================

def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def business_metrics(y_true: np.ndarray,
                     y_prob: np.ndarray,
                     threshold: float,
                     revenue_per_good: float,
                     loss_per_default: float):
    """
    Score one decision threshold in classification and money terms.

    Interpret y_pred=1 as 'decline' (predicted default).
    Approved = predicted 0.
    Revenue from approved good = TN * revenue_per_good
    Loss from approved default = FN * loss_per_default

    Args:
        y_true: Binary labels (1 = default, 0 = no default); any array-like.
        y_prob: Predicted default scores aligned with ``y_true``; any array-like.
        threshold: Scores >= threshold are declined.
        revenue_per_good: Revenue earned per approved non-defaulting loan.
        loss_per_default: Loss incurred per approved defaulting loan.

    Returns:
        dict with keys ``threshold``, ``tn``, ``fp``, ``fn``, ``tp``,
        ``precision``, ``recall``, ``f1``, ``accuracy``, ``revenue``,
        ``loss`` and ``profit``. Ratios with a zero denominator are 0.0.
    """
    # np.asarray lets callers pass lists or pandas Series as well as arrays.
    actual = np.asarray(y_true)
    declined = np.asarray(y_prob) >= threshold
    is_default = actual == 1
    is_good = actual == 0

    # Count the four confusion cells once; every metric below is derived from
    # them instead of re-scanning the data per metric.
    tp = int(np.count_nonzero(declined & is_default))
    fp = int(np.count_nonzero(declined & is_good))
    fn = int(np.count_nonzero(~declined & is_default))
    tn = int(np.count_nonzero(~declined & is_good))

    revenue = tn * revenue_per_good
    loss = fn * loss_per_default
    profit = revenue - loss
    return {
        "threshold": threshold,
        "tn": tn, "fp": fp, "fn": fn, "tp": tp,
        "precision": _safe_ratio(tp, tp + fp),
        "recall": _safe_ratio(tp, tp + fn),
        "f1": _safe_ratio(2 * tp, 2 * tp + fp + fn),
        "accuracy": _safe_ratio(tp + tn, actual.size),
        "revenue": revenue, "loss": loss, "profit": profit
    }

def sweep_thresholds(y_true: np.ndarray,
                     y_prob: np.ndarray,
                     low: float, high: float, points: int,
                     revenue_per_good: float, loss_per_default: float):
    """Evaluate an evenly spaced grid of thresholds and pick the most profitable.

    ``points`` thresholds are taken from ``low`` to ``high`` inclusive and each
    is scored with ``business_metrics``. Returns ``(best, grid)``: ``grid`` is
    the list of per-threshold result dicts in ascending threshold order and
    ``best`` is the entry with the highest ``"profit"`` (the lowest threshold
    wins a tie).
    """
    def profit_of(row):
        return row["profit"]

    grid = []
    for cutoff in np.linspace(low, high, points):
        grid.append(
            business_metrics(y_true, y_prob, cutoff, revenue_per_good, loss_per_default)
        )

    # max() returns the first maximal entry, so ties go to the lowest threshold.
    return max(grid, key=profit_of), grid

# ===========================
# TRAIN / EVAL
# ===========================

def evaluate_pipeline(name: str, pipe: Pipeline,
                      X_train, y_train, X_test, y_test,
                      cfg: Config):
    """Cross-validate, fit and report one model; prints results, returns None.

    Steps:
      1. Stratified k-fold CV on the training split (accuracy and F1).
      2. Fit on the full training split and score the test split at a 0.50
         threshold (accuracy, precision, recall, F1, ROC AUC, confusion).
      3. Sweep decision thresholds and print the most profitable one.

    Args:
        name: Label printed in the section header.
        pipe: Unfitted pipeline whose final step is named ``"clf"``.
        X_train, y_train: Training features and binary labels.
        X_test, y_test: Held-out features and binary labels.
        cfg: Supplies CV folds, seed, economics and the threshold grid.
    """
    print(f"\n===== {name} =====")
    # quick CV (accuracy & F1) on TRAIN -- a single multi-metric pass fits each
    # fold once rather than once per metric.
    skf = StratifiedKFold(n_splits=cfg.cv_folds, shuffle=True, random_state=cfg.random_state)
    cv_scores = cross_validate(pipe, X_train, y_train, cv=skf, scoring=("accuracy", "f1"))
    cv_acc = cv_scores["test_accuracy"]
    cv_f1 = cv_scores["test_f1"]
    print(f"CV {cfg.cv_folds}-fold | Acc: {cv_acc.mean():.4f} ± {cv_acc.std():.4f} | F1: {cv_f1.mean():.4f} ± {cv_f1.std():.4f}")

    # fit & test
    pipe.fit(X_train, y_train)
    if hasattr(pipe.named_steps["clf"], "predict_proba"):
        y_prob = pipe.predict_proba(X_test)[:, 1]
    else:
        # pseudo-probabilities for models without predict_proba (not needed here)
        y_prob = pipe.predict(X_test)

    # default 0.50
    y_pred = (y_prob >= 0.5).astype(int)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_prob)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    print("\n-- Test metrics @ threshold=0.50 --")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(f"ROC AUC:   {auc:.4f}")
    print(f"Confusion  TN={tn:,}  FP={fp:,}  FN={fn:,}  TP={tp:,}")
    print("\nClassification report:")
    print(classification_report(y_test, y_pred, target_names=["No Default", "Default"]))

    # business sweep
    # NOTE(review): the threshold is chosen on the same test split it is
    # reported on, so the printed profit is an optimistic estimate.
    # np.asarray accepts a Series, list or array for y_test.
    best, _ = sweep_thresholds(
        np.asarray(y_test), y_prob,
        cfg.threshold_low, cfg.threshold_high, cfg.threshold_points,
        cfg.revenue_per_good, cfg.loss_per_default
    )
    print("\n-- Business-optimal threshold (sweep) --")
    print(f"Best threshold: {best['threshold']:.4f}")
    print(f"Profit:  ${best['profit']:,.0f}")
    print(f"Revenue: ${best['revenue']:,.0f}  |  Loss: ${best['loss']:,.0f}")
    print(f"Acc: {best['accuracy']:.4f}  Prec: {best['precision']:.4f}  Rec: {best['recall']:.4f}  F1: {best['f1']:.4f}")
    print(f"Confusion  TN={best['tn']:,}  FP={best['fp']:,}  FN={best['fn']:,}  TP={best['tp']:,}")

def main(cfg: Config):
    """Run the full baseline experiment described by ``cfg``.

    Loads the data, builds the preprocessor, makes one stratified train/test
    split, then evaluates logistic regression and, when
    ``cfg.run_random_forest`` is set, a random forest on that same split.
    """
    print("=== Credit Risk Baselines: Logistic Regression (+ optional Random Forest) ===")
    frame = load_data(cfg)
    X, y, num_cols, cat_cols = split_features(frame, cfg.target)
    print(f"Dataset: {frame.shape}, Default rate={y.mean():.3%}")
    print(f"Numeric: {len(num_cols)} | Categorical: {len(cat_cols)}")

    pre = build_preprocessor(num_cols, cat_cols)

    # Split once and reuse for all models for a fair comparison
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=cfg.test_size, random_state=cfg.random_state, stratify=y
    )

    # Logistic regression always runs; the random forest is an optional baseline.
    candidates = [("Logistic Regression", build_logistic_pipeline)]
    if cfg.run_random_forest:
        candidates.append(("Random Forest", build_rf_pipeline))

    # Each pipeline is built just before it is evaluated, in listed order.
    for label, builder in candidates:
        evaluate_pipeline(label, builder(pre, cfg), X_train, y_train, X_test, y_test, cfg)

# Run with the module-level default configuration when executed as a script.
if __name__ == "__main__":
    main(CFG)


=== Credit Risk Baselines: Logistic Regression (+ optional Random Forest) ===
Dataset: (255347, 17), Default rate=11.613%
Numeric: 9 | Categorical: 7

===== Logistic Regression =====
CV 5-fold | Acc: 0.8852 ± 0.0002 | F1: 0.0641 ± 0.0042

-- Test metrics @ threshold=0.50 --
Accuracy:  0.8853
Precision: 0.6084
Recall:    0.0341
F1-score:  0.0645
ROC AUC:   0.7531
Confusion  TN=45,009  FP=130  FN=5,729  TP=202

Classification report:
              precision    recall  f1-score   support

  No Default       0.89      1.00      0.94     45139
     Default       0.61      0.03      0.06      5931

    accuracy                           0.89     51070
   macro avg       0.75      0.52      0.50     51070
weighted avg       0.85      0.89      0.84     51070


-- Business-optimal threshold (sweep) --
Best threshold: 0.3750
Profit:  $600,976,010
Revenue: $720,346,250  |  Loss: $119,370,240
Acc: 0.8827  Prec: 0.4808  Rec: 0.1265  F1: 0.2002
Confusion  TN=44,329  FP=810  FN=5,181  TP=750

===== 