# JP Morgan Chase Forage Virtual Experience
# Task 3: Loan Default Prediction & Expected Loss Estimation
# Author: Rohan Veer
# Date: October 2025
#
# Note: A detailed presentation of results and insights
# is available separately (PDF file titled:
# "Loan_Default_Prediction_Presentation.pdf").
# View the full project presentation here:
# https://drive.google.com/file/d/1digY6FcX3dgUah_Oy8PT_phEXW4UIsbk/view?usp=sharing


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import joblib
import os

In [None]:
# Seed shared by the train/test split and both classifiers so runs are repeatable.
RANDOM_STATE = 42
# Default input file read by load_data().
CSV_PATH = "Task 3 and 4_Loan_Data.csv"  # update if your file is elsewhere
# Output locations for the artifacts that main() writes with joblib.dump.
LOGISTIC_MODEL_PATH = "logistic_model.joblib"
SCALER_PATH = "scaler.joblib"
RF_MODEL_PATH = "rf_model.joblib"

In [None]:
def load_data(path=CSV_PATH):
    """Read the CSV at *path* into a DataFrame.

    Raises FileNotFoundError when nothing exists at *path*.
    """
    if os.path.exists(path):
        return pd.read_csv(path)
    raise FileNotFoundError(f"CSV file not found at {path}. Put the CSV in same folder or update CSV_PATH.")

In [None]:
def prepare_data(df):
    """Build stratified train/test splits plus standardized copies of the features.

    The "customer_id" and "default" columns are removed from the feature
    matrix; "default" is used as the target. 20% of the rows are held out for
    testing. The scaler is fitted on the training features only and then
    applied to the held-out features.

    Returns the tuple
    (X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled, scaler).
    """
    features = df.drop(columns=["customer_id", "default"])
    target = df["default"]

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=target,
    )

    # Fit on the training rows only; the test rows reuse those statistics.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, train_scaled, test_scaled, scaler

In [None]:
def train_models(X_train, X_train_scaled, y_train):
    """Fit and return (logistic_regression, random_forest).

    The logistic regression is trained on the scaled features, the random
    forest on the unscaled ones. Both use RANDOM_STATE as their seed.
    """
    logistic = LogisticRegression(
        max_iter=1000,
        random_state=RANDOM_STATE,
    ).fit(X_train_scaled, y_train)

    forest = RandomForestClassifier(
        n_estimators=200,
        random_state=RANDOM_STATE,
    ).fit(X_train, y_train)

    return logistic, forest


In [None]:
def evaluate_model(model, X_test, X_test_scaled, y_test, model_type="logistic"):
    """Score *model* on the held-out data and return (accuracy, roc_auc).

    When model_type is "logistic" the scaled features are used; any other
    value selects the unscaled features.
    """
    features = X_test_scaled if model_type == "logistic" else X_test

    predicted_labels = model.predict(features)
    # Column 1 of predict_proba is used as the score for ROC-AUC.
    positive_scores = model.predict_proba(features)[:, 1]

    return accuracy_score(y_test, predicted_labels), roc_auc_score(y_test, positive_scores)

In [None]:
def predict_expected_loss(model, scaler, borrower_info, recovery_rate=0.1):
    """
    Predict PD and Expected Loss for a single borrower.

    Expected loss is computed as
        PD * (1 - recovery_rate) * loan_amt_outstanding.

    model: fitted classifier exposing predict_proba; the probability in
      column index 1 is taken as the PD.
    scaler: fitted transformer exposing transform; applied to the borrower's
      features before they are passed to the model.
    borrower_info: dict with keys:
      credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding,
      income, years_employed, fico_score
      (any additional keys are ignored)
    recovery_rate: fraction of the outstanding loan assumed to be recovered
      on default; must lie in [0, 1].
    returns: (pd_value, expected_loss)
    raises: ValueError if a required key is missing from borrower_info or
      recovery_rate is outside [0, 1].
    """
    cols = ["credit_lines_outstanding", "loan_amt_outstanding", "total_debt_outstanding",
            "income", "years_employed", "fico_score"]

    # Without this check a missing key silently becomes a NaN column in the
    # DataFrame and only fails later with an unrelated-looking error.
    missing = [col for col in cols if col not in borrower_info]
    if missing:
        raise ValueError(f"borrower_info is missing required keys: {missing}")

    # A rate outside [0, 1] would yield a negative or inflated loss figure.
    # Written so that NaN is rejected as well.
    if not 0 <= recovery_rate <= 1:
        raise ValueError(f"recovery_rate must be between 0 and 1, got {recovery_rate!r}")

    # Passing columns= fixes the feature order and drops any extra keys.
    borrower_df = pd.DataFrame([borrower_info], columns=cols)

    borrower_scaled = scaler.transform(borrower_df)
    pd_value = model.predict_proba(borrower_scaled)[:, 1][0]
    expected_loss = pd_value * (1 - recovery_rate) * borrower_info["loan_amt_outstanding"]
    return pd_value, expected_loss

In [None]:

def main():
    """Run the whole pipeline: load, split, train, evaluate, persist, and score one sample borrower."""
    # 1. Read the dataset from disk.
    loans = load_data(CSV_PATH)
    print("Loaded data shape:", loans.shape)

    # 2. Split into train/test and build the scaled feature matrices.
    (X_train, X_test, y_train, y_test,
     X_train_scaled, X_test_scaled, scaler) = prepare_data(loans)
    print("Train/test split sizes:", X_train.shape, X_test.shape)

    # 3. Fit both classifiers.
    log_model, rf_model = train_models(X_train, X_train_scaled, y_train)

    # 4. Score both classifiers on the held-out split before reporting.
    log_acc, log_auc = evaluate_model(
        log_model, X_test, X_test_scaled, y_test, model_type="logistic"
    )
    rf_acc, rf_auc = evaluate_model(
        rf_model, X_test, X_test_scaled, y_test, model_type="rf"
    )

    print("\nMODEL PERFORMANCE")
    print(f"Logistic Regression - Accuracy: {log_acc:.4f}, ROC-AUC: {log_auc:.6f}")
    print(f"Random Forest       - Accuracy: {rf_acc:.4f}, ROC-AUC: {rf_auc:.6f}")

    # 5. Persist both models and the scaler (the message reports only the first two paths).
    artifacts = (
        (log_model, LOGISTIC_MODEL_PATH),
        (scaler, SCALER_PATH),
        (rf_model, RF_MODEL_PATH),
    )
    for artifact, destination in artifacts:
        joblib.dump(artifact, destination)
    print(f"\nSaved logistic model to {LOGISTIC_MODEL_PATH} and scaler to {SCALER_PATH}")

    # 6. Score one hand-written borrower with the logistic model.
    borrower = {
        "credit_lines_outstanding": 4,
        "loan_amt_outstanding": 20000.0,
        "total_debt_outstanding": 15000.0,
        "income": 60000.0,
        "years_employed": 5,
        "fico_score": 620,
    }
    pd_value, expected_loss = predict_expected_loss(log_model, scaler, borrower, recovery_rate=0.1)
    print("\nSAMPLE BORROWER PREDICTION")
    print(f"Predicted PD: {pd_value:.6f}")
    print(f"Expected Loss (recovery 10%): {expected_loss:.2f}")

    # 7. Write the sample prediction next to its inputs as a one-row CSV.
    row = {"pd": pd_value, "expected_loss": expected_loss}
    row.update(borrower)
    pd.DataFrame([row]).to_csv("example_prediction.csv", index=False)
    print("Saved example_prediction.csv with sample prediction.")

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Loaded data shape: (10000, 8)
Train/test split sizes: (8000, 6) (2000, 6)

MODEL PERFORMANCE
Logistic Regression - Accuracy: 0.9990, ROC-AUC: 0.999988
Random Forest       - Accuracy: 0.9960, ROC-AUC: 0.999893

Saved logistic model to logistic_model.joblib and scaler to scaler.joblib

SAMPLE BORROWER PREDICTION
Predicted PD: 0.994597
Expected Loss (recovery 10%): 17902.74
Saved example_prediction.csv with sample prediction.
