In [1]:
import pandas as pd
import numpy as np

In [11]:
df = pd.read_csv('Task 3 and 4_Loan_Data.csv')

In [5]:
df


Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.752520,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.830850,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0
...,...,...,...,...,...,...,...,...
9995,3972488,0,3033.647103,2553.733144,42691.62787,5,697,0
9996,6184073,1,4146.239304,5458.163525,79969.50521,8,615,0
9997,6694516,2,3088.223727,4813.090925,38192.67591,5,596,0
9998,3942961,0,3288.901666,1043.099660,50929.37206,2,647,0


In [6]:
df.info

<bound method DataFrame.info of       customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0         8153374                         0           5221.545193   
1         7442532                         5           1958.928726   
2         2256073                         0           3363.009259   
3         4885975                         0           4766.648001   
4         4700614                         1           1345.827718   
...           ...                       ...                   ...   
9995      3972488                         0           3033.647103   
9996      6184073                         1           4146.239304   
9997      6694516                         2           3088.223727   
9998      3942961                         0           3288.901666   
9999      5533570                         1           1917.652480   

      total_debt_outstanding       income  years_employed  fico_score  default  
0                3915.471226  78039.38546               5 

In [15]:
# ===============================
# Loan Default Prediction & Expected Loss Estimation (Fixed Version)
# ===============================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# --- Load dataset ---
df = pd.read_csv("Task 3 and 4_Loan_Data.csv")

# --- Prepare features and target ---
X = df.drop(columns=["default", "customer_id"])
y = df["default"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# --- Logistic Regression ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_scaled, y_train)
log_pred = log_model.predict_proba(X_test_scaled)[:, 1]

# --- Random Forest ---
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict_proba(X_test)[:, 1]

# --- Evaluate ---
auc_log = roc_auc_score(y_test, log_pred)
auc_rf = roc_auc_score(y_test, rf_pred)

print(f"Logistic Regression AUC: {auc_log:.4f}")
print(f"Random Forest AUC: {auc_rf:.4f}")

# --- Expected Loss Function ---
def expected_loss(model, borrower_features, recovery_rate=0.1, scaler=None):
    """
    Compute Expected Loss for a single borrower.

    borrower_features: dict or Series with borrower details
    recovery_rate: float (default 0.1)
    scaler: optional StandardScaler if model was trained with scaled data
    """
    # Convert to DataFrame and drop irrelevant columns if present
    x = pd.DataFrame([borrower_features]).copy()
    for col in ["default", "customer_id"]:
        if col in x.columns:
            x.drop(columns=col, inplace=True)

    # Align columns to match training order
    x = x[X.columns]

    # Predict PD
    if scaler:
        x_scaled = scaler.transform(x)
        pd_est = model.predict_proba(x_scaled)[:, 1][0]
    else:
        pd_est = model.predict_proba(x)[:, 1][0]

    # Compute Expected Loss
    lgd = 1 - recovery_rate
    ead = borrower_features["loan_amt_outstanding"]
    el = pd_est * lgd * ead

    return {"PD": round(pd_est, 4), "Expected_Loss": round(el, 2)}

# --- Example borrower from test set ---
example_borrower = X_test.iloc[0].to_dict()

print("\nExample Borrower Features:")
print(example_borrower)

# --- Predictions ---
result_log = expected_loss(log_model, example_borrower, scaler=scaler)
result_rf = expected_loss(rf_model, example_borrower)

print("\nPredicted PD and Expected Loss:")
print("Logistic Regression:", result_log)
print("Random Forest:", result_rf)

# --- Borrower who actually defaulted ---
default_borrower = df[df["default"] == 1].iloc[0].to_dict()

result_log_default = expected_loss(log_model, default_borrower, scaler=scaler)
result_rf_default = expected_loss(rf_model, default_borrower)

print("\nFor an Actual Defaulter:")
print("Logistic Regression:", result_log_default)
print("Random Forest:", result_rf_default)


Logistic Regression AUC: 1.0000
Random Forest AUC: 0.9999

Example Borrower Features:
{'credit_lines_outstanding': 1.0, 'loan_amt_outstanding': 3151.996604, 'total_debt_outstanding': 5862.286797, 'income': 52183.14695, 'years_employed': 5.0, 'fico_score': 677.0}

Predicted PD and Expected Loss:
Logistic Regression: {'PD': np.float64(0.0), 'Expected_Loss': np.float64(0.0)}
Random Forest: {'PD': np.float64(0.0), 'Expected_Loss': np.float64(0.0)}

For an Actual Defaulter:
Logistic Regression: {'PD': np.float64(1.0), 'Expected_Loss': np.float64(1763.03)}
Random Forest: {'PD': np.float64(1.0), 'Expected_Loss': np.float64(1763.04)}
