In [2]:
# ----------------------------
# Task 3: Probability of Default & Expected Loss
# ----------------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score, brier_score_loss

# ----------------------------
# 1. Load Dataset
# ----------------------------
df = pd.read_csv("Task 3 and 4_Loan_Data.csv")

# Make column names lowercase for consistency
df.columns = [c.strip().lower() for c in df.columns]

# Check label column
label_col = "default"  # should be binary 0/1

# ----------------------------
# 2. Feature Engineering
# ----------------------------
df["payment_to_income"] = df["loan_amt_outstanding"] / df["income"]
df["debt_to_income"] = df["total_debt_outstanding"] / df["income"]

features = [
    "credit_lines_outstanding",
    "debt_to_income",
    "payment_to_income",
    "years_employed",
    "fico_score"
]

# Drop rows with missing values
df = df.dropna(subset=features + [label_col])

X = df[features]
y = df[label_col].astype(int)

# ----------------------------
# 3. Train/Test Split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ----------------------------
# 4. Model (Pipeline with Scaling + Logistic Regression)
# ----------------------------
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(class_weight="balanced", solver="liblinear", max_iter=2000))
])

clf.fit(X_train, y_train)

# ----------------------------
# 5. Evaluation
# ----------------------------
y_pred_proba = clf.predict_proba(X_test)[:, 1]
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_proba))
print("Brier Score (calibration):", brier_score_loss(y_test, y_pred_proba))

# ----------------------------
# 6. Functions for New Borrowers
# ----------------------------
def estimate_pd(borrower_dict):
    """Return probability of default (PD) for a borrower dict"""
    # Keep only model features
    X_new = pd.DataFrame([{f: borrower_dict[f] for f in features}])
    return float(clf.predict_proba(X_new)[:, 1][0])

def expected_loss(borrower_dict, recovery_rate=0.10):
    """Return Expected Loss = PD × (1 - Recovery) × Loan Outstanding"""
    PD = estimate_pd(borrower_dict)
    EAD = borrower_dict.get("loan_amt_outstanding", None)
    if EAD is None:
        raise ValueError("Need 'loan_amt_outstanding' in borrower_dict")
    LGD = 1 - recovery_rate
    EL = PD * LGD * EAD
    return {"PD": PD, "LGD": LGD, "EAD": EAD, "Expected_Loss": EL}


# ----------------------------
# 7. Example Test Borrowers
# ----------------------------
test_cases = [
    {
        "credit_lines_outstanding": 8,
        "debt_to_income": 0.9,
        "payment_to_income": 0.5,
        "years_employed": 1,
        "fico_score": 580,
        "loan_amt_outstanding": 12000
    },
    {
        "credit_lines_outstanding": 5,
        "debt_to_income": 0.4,
        "payment_to_income": 0.2,
        "years_employed": 6,
        "fico_score": 680,
        "loan_amt_outstanding": 10000
    },
    {
        "credit_lines_outstanding": 12,
        "debt_to_income": 0.2,
        "payment_to_income": 0.1,
        "years_employed": 10,
        "fico_score": 760,
        "loan_amt_outstanding": 15000
    }
]

for borrower in test_cases:
    print(borrower)
    print("PD:", round(estimate_pd(borrower), 3))
    print(expected_loss(borrower))
    print("-----")


Accuracy: 0.993
ROC-AUC: 1.0
Brier Score (calibration): 0.007034629479037304
{'credit_lines_outstanding': 8, 'debt_to_income': 0.9, 'payment_to_income': 0.5, 'years_employed': 1, 'fico_score': 580, 'loan_amt_outstanding': 12000}
PD: 1.0
{'PD': 1.0, 'LGD': 0.9, 'EAD': 12000, 'Expected_Loss': 10800.0}
-----
{'credit_lines_outstanding': 5, 'debt_to_income': 0.4, 'payment_to_income': 0.2, 'years_employed': 6, 'fico_score': 680, 'loan_amt_outstanding': 10000}
PD: 1.0
{'PD': 0.9999982478409976, 'LGD': 0.9, 'EAD': 10000, 'Expected_Loss': 8999.984230568978}
-----
{'credit_lines_outstanding': 12, 'debt_to_income': 0.2, 'payment_to_income': 0.1, 'years_employed': 10, 'fico_score': 760, 'loan_amt_outstanding': 15000}
PD: 1.0
{'PD': 0.9999999999533458, 'LGD': 0.9, 'EAD': 15000, 'Expected_Loss': 13499.999999370168}
-----


In [3]:
test_case = {
    "credit_lines_outstanding": 8,
    "debt_to_income": 0.9,
    "payment_to_income": 0.5,
    "years_employed": 1,
    "fico_score": 580,
    "loan_amt_outstanding": 12000
}

print("PD:", round(estimate_pd(test_case), 3))
print(expected_loss(test_case))


PD: 1.0
{'PD': 1.0, 'LGD': 0.9, 'EAD': 12000, 'Expected_Loss': 10800.0}
