# 1️⃣ Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
    classification_report,
    precision_recall_curve
)

import joblib


# 2️⃣ Load Dataset & Split Features / Target

In [2]:
dataset = pd.read_csv("german_credit_cleaned.csv")

X = dataset.drop("risk", axis=1)
y = dataset["risk"]

print(X.shape)


(1000, 15)


# 3️⃣ Identify Column Types

In [3]:
bin_cols = [
    col for col in X.columns
    if X[col].dropna().nunique() == 2
    and set(X[col].dropna().unique()).issubset({0, 1})
]

num_cols = [
    col for col in X.select_dtypes(include="number").columns
    if col not in bin_cols
]

cat_cols = X.select_dtypes(include="object").columns.to_list()

print("Binary:", bin_cols)
print("Numerical:", num_cols)
print("Categorical:", cat_cols)


Binary: ['job_stability', 'housing_stability', 'has_saving_account', 'has_checking_account']
Numerical: ['age', 'job', 'credit_amount', 'duration', 'credit_per_month']
Categorical: ['sex', 'housing', 'saving_accounts', 'checking_account', 'purpose', 'age_risk_band']


# 4️⃣ Preprocessing Pipeline

In [4]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_cols),
        ("bin", "passthrough", bin_cols)
    ]
)


# 5️⃣ Train / Test Split (Stratified)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


# 6️⃣ Models (Cost-Sensitive)

In [6]:
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        class_weight="balanced"
    ),

    "Random Forest": RandomForestClassifier(
        n_estimators=300,
        max_depth=8,
        class_weight="balanced",
        random_state=42
    ),

    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1],
        eval_metric="logloss",
        random_state=42
    )
}


# 7️⃣ Train, Evaluate (ROC-AUC + PR-AUC)

In [7]:
results = []

for name, model in models.items():
    pipe = Pipeline([
        ("preprocessing", preprocessor),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)

    y_proba = pipe.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, y_proba)
    pr_auc = average_precision_score(y_test, y_proba)

    results.append({
        "Model": name,
        "ROC_AUC": roc_auc,
        "PR_AUC": pr_auc
    })

results_df = pd.DataFrame(results).sort_values("PR_AUC", ascending=False)
print(results_df)


                 Model   ROC_AUC    PR_AUC
1        Random Forest  0.779881  0.664593
2              XGBoost  0.756429  0.619475
0  Logistic Regression  0.741786  0.593602


# 8️⃣ Select Best Model (Random Forest)

In [8]:
best_model = Pipeline([
    ("preprocessing", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=300,
        max_depth=8,
        class_weight="balanced",
        random_state=42
    ))
])

best_model.fit(X_train, y_train)


# 9️⃣ Business-Driven Threshold Optimization

In [9]:
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Bank costs
COST_FN = 100_000   # default approved
COST_FP = 10_000    # good customer rejected

thresholds = [0.1, 0.2, 0.3, 0.35, 0.4]
losses = []

for t in thresholds:
    y_pred = (y_pred_proba >= t).astype(int)
    cm = confusion_matrix(y_test, y_pred)

    FN = cm[1, 0]
    FP = cm[0, 1]

    total_loss = FN*COST_FN + FP*COST_FP
    losses.append((t, total_loss))

loss_df = pd.DataFrame(losses, columns=["Threshold", "Total_Loss"])
print(loss_df)


   Threshold  Total_Loss
0       0.10     1320000
1       0.20     1520000
2       0.30     1760000
3       0.35     1840000
4       0.40     1970000


# Final Chosen Threshold (Risk-Averse Bank)

In [10]:
final_threshold = 0.35

y_final = (y_pred_proba >= final_threshold).astype(int)

print(confusion_matrix(y_test, y_final))
print(classification_report(y_test, y_final))


[[86 54]
 [13 47]]
              precision    recall  f1-score   support

           0       0.87      0.61      0.72       140
           1       0.47      0.78      0.58        60

    accuracy                           0.67       200
   macro avg       0.67      0.70      0.65       200
weighted avg       0.75      0.67      0.68       200



# 1️⃣0️⃣ Save Production-Ready Model

In [11]:
joblib.dump(best_model, "credit_risk_random_forest.pkl")
print("Model saved successfully!")


Model saved successfully!


# 1️⃣1️⃣ Load & Predict (Simulation)

In [12]:
loaded_model = joblib.load("credit_risk_random_forest.pkl")

sample_proba = loaded_model.predict_proba(X_test.iloc[:5])[:, 1]
sample_pred = (sample_proba >= final_threshold).astype(int)

print(sample_pred)


[1 1 1 1 1]


In [16]:
def risk_bucket(p):
    if p >= 0.6:
        return "HIGH RISK (Reject)"
    elif p >= 0.35:
        return "MEDIUM RISK (Manual Review)"
    else:
        return "LOW RISK (Approve)"

for p in sample_proba:
    print(p, risk_bucket(p))


0.41000388353282285 MEDIUM RISK (Manual Review)
0.48272803664943353 MEDIUM RISK (Manual Review)
0.6475163284011647 HIGH RISK (Reject)
0.5628779776235405 MEDIUM RISK (Manual Review)
0.49154675624527566 MEDIUM RISK (Manual Review)


In [None]:
output = pd.DataFrame({
    "Actual": y_test.iloc[:5].values,
    "Predicted_Probability": sample_proba,
    "Decision": sample_pred
})

print(output)
"""
“This is due to a deliberately low decision threshold selected based on asymmetric business costs.
Since false negatives are significantly more expensive for the bank than false positives, the model 
is tuned to be conservative, prioritizing  recall  of defaulters  even at the expense of rejecting 
some good customers.”  

"""

   Actual  Predicted_Probability  Decision
0       0               0.410004         1
1       0               0.482728         1
2       1               0.647516         1
3       0               0.562878         1
4       1               0.491547         1
