# Credit Risk Modeling (PD)
Objective: Predict the Probability of Default (PD) for borrowers to estimate expected capital loss.

Methods: Logistic Regression, Scikit-Learn.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the raw loan book, then print a five-row preview followed by the
# column index so the schema is visible before any modelling.
df = pd.read_csv('Loan_Data.csv')
for preview in (df.head(), df.columns):
    print(preview)

   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975                         0           4766.648001   
4      4700614                         1           1345.827718   

   total_debt_outstanding       income  years_employed  fico_score  default  
0             3915.471226  78039.38546               5         605        0  
1             8228.752520  26648.43525               2         572        1  
2             2027.830850  65866.71246               4         602        0  
3             2501.730397  74356.88347               5         612        0  
4             1768.826187  23448.32631               6         631        0  
Index(['customer_id', 'credit_lines_outstanding', 'loan_amt_outstanding',
       'total_debt_outstanding', 'income', 'years_employed', 

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# 1. LOAD THE DATA
# Use the exact file name, including letter case, that the earlier cell reads:
# 'loan_data.csv' and 'Loan_Data.csv' are different files on case-sensitive
# filesystems, so a mismatch raises FileNotFoundError there.
df = pd.read_csv('Loan_Data.csv')

# 2. PRE-PROCESSING
# 'default' is the target and 'customer_id' has no predictive power, so
# neither column goes into the feature matrix.
non_feature_columns = ['customer_id', 'default']
y = df['default']
X = df.drop(columns=non_feature_columns)

# Hold out 20% of the rows so the model is scored on data it has not seen.
# The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 3. TRAIN THE MODEL (Logistic Regression)
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# 4. EVALUATE
# Score the held-out rows: overall accuracy first, then per-class metrics.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print("Model Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)

# 5. THE EXPECTED LOSS FUNCTION
def calculate_expected_loss(loan_properties, model, recovery_rate=0.10):
    """
    Calculate the Expected Loss (EL) for a single loan.

    EL = PD * LGD * EAD
    - PD: Probability of Default, read from ``model.predict_proba``.
    - LGD: Loss Given Default, ``1 - recovery_rate`` (0.90 by default).
    - EAD: Exposure at Default, the loan's ``'loan_amt_outstanding'``.

    Parameters
    ----------
    loan_properties : dict or pandas.Series
        Feature values for one borrower, keyed by feature name. Must
        contain ``'loan_amt_outstanding'``.
    model : fitted classifier
        Object exposing ``predict_proba``; column 1 of its output is used as
        the probability of default.
    recovery_rate : float, default 0.10
        Fraction of the exposure recovered after a default, in [0, 1].

    Returns
    -------
    float
        Expected loss, in the same units as ``'loan_amt_outstanding'``.

    Raises
    ------
    ValueError
        If ``recovery_rate`` is outside [0, 1].
    """
    if not 0.0 <= recovery_rate <= 1.0:
        raise ValueError(
            f"recovery_rate must be between 0 and 1, got {recovery_rate!r}"
        )

    # Wrap the single record in a one-row DataFrame so the model receives
    # named columns.
    input_df = pd.DataFrame([loan_properties])

    # A dict may list its keys in any order, but the model expects the column
    # order it saw during fit. Reorder when the model recorded its feature
    # names and the record supplies exactly those names; any other mismatch
    # is left for the model's own input validation to report.
    feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is not None:
        expected_columns = list(feature_names)
        if set(input_df.columns) == set(expected_columns):
            input_df = input_df[expected_columns]

    # predict_proba returns [[prob_0, prob_1]]; prob_1 is the default class.
    pd_probability = model.predict_proba(input_df)[0][1]

    # Exposure at default is the outstanding loan amount.
    loan_amount = loan_properties['loan_amt_outstanding']

    loss_given_default = 1.0 - recovery_rate
    return float(pd_probability * loss_given_default * loan_amount)

# --- TESTING THE FUNCTION ---
# Use the first borrower in the test set as a quick end-to-end check of the
# PD -> expected-loss calculation.
sample_borrower = X_test.iloc[0]

# Calculate the loss
loss = calculate_expected_loss(sample_borrower, model)

# A single row taken from a frame with both int and float columns is upcast
# to float, so the FICO score is formatted with no decimals to print as the
# whole number it is (593 rather than 593.0).
print("\n--- Prediction for Sample Borrower ---")
print(f"Loan Amount: ${sample_borrower['loan_amt_outstanding']:,.2f}")
print(f"Income:      ${sample_borrower['income']:,.2f}")
print(f"FICO Score:  {sample_borrower['fico_score']:.0f}")
print(f"Expected Loss: ${loss:,.2f}")

Model Accuracy: 0.997

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1652
           1       1.00      0.98      0.99       348

    accuracy                           1.00      2000
   macro avg       1.00      0.99      0.99      2000
weighted avg       1.00      1.00      1.00      2000


--- Prediction for Sample Borrower ---
Loan Amount: $3,584.84
Income:      $72,005.84
FICO Score:  593.0
Expected Loss: $0.00


In [4]:
# Find the first borrower in the test set whose predicted probability of
# default exceeds the risk threshold, and report their expected loss.
RISK_THRESHOLD = 0.5  # PD above this marks a borrower as risky

print("--- Searching for Risky Borrowers ---")

# Score the whole test set in one call rather than building a one-row
# DataFrame and invoking the model once per borrower.
default_probabilities = model.predict_proba(X_test)[:, 1]
risky_positions = np.flatnonzero(default_probabilities > RISK_THRESHOLD)

if risky_positions.size == 0:
    # Say so explicitly instead of ending the cell with no output.
    print(f"No borrower in the test set has a PD above {RISK_THRESHOLD:.0%}.")
else:
    position = risky_positions[0]  # first match in test-set row order
    index = X_test.index[position]
    row = X_test.iloc[position]
    pd_probability = default_probabilities[position]
    loss = calculate_expected_loss(row, model)
    print(f"Found Risky Borrower (Index {index}):")
    print(f"  Income:      ${row['income']:,.2f}")
    print(f"  Loan Amount: ${row['loan_amt_outstanding']:,.2f}")
    # The row is upcast to float, so format the FICO score without decimals.
    print(f"  FICO Score:  {row['fico_score']:.0f}")
    print(f"  Prob of Default: {pd_probability:.2%}")
    print(f"  Expected Loss:   ${loss:,.2f}")

--- Searching for Risky Borrowers ---
Found Risky Borrower (Index 1731):
  Income:      $86,303.95
  Loan Amount: $5,343.19
  FICO Score:  607.0
  Prob of Default: 100.00%
  Expected Loss:   $4,808.87
