In [4]:
# DL_final_Calibration.py
# Approach: Uncertainty and Risk-Aware Triage using Probability Calibration.
# This script trains, evaluates, and compares an uncalibrated model against a calibrated
# model to demonstrate the improvement in probability reliability for risk assessment.
# It uses append-only audit logs for accountability and traceability.

# Env: Python 3.10+ recommended. scikit-learn >=1.2 (CalibratedClassifierCV's `estimator=` argument), pandas, numpy.

import os, json, time, uuid, platform
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_recall_fscore_support, brier_score_loss, classification_report
)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV

# ================== CONFIG ==================
# Input dataset (absolute path; adjust to wherever the CSV actually lives).
csv_path = "/content/diabetes_dataset.csv"
# Version tag recorded in the audit header.
model_version = "v3.0-calibration"
# Seed shared by the data splits and the classifier.
random_seed = 42
# Triage cut-offs applied to the calibrated probability:
# at or below the low value -> low-risk flag, at or above the high value -> high-risk flag.
risk_threshold_low, risk_threshold_high = 0.25, 0.75

# ================== SETUP ==================
# Make sure the audit-log directory is present before anything is written to it.
os.makedirs("audit_logs", exist_ok=True)

# Per-execution identifiers used for traceability: a random UUID and the UTC start time.
run_id = f"{uuid.uuid4()}"
timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

# Helper function to calculate Expected Calibration Error (ECE)
def expected_calibration_error(y_true, y_prob, n_bins=15):
    """Return the Expected Calibration Error (ECE) for binary probabilities.

    Predictions are grouped into ``n_bins`` equal-width confidence bins over
    [0, 1]. The ECE is the sum, over non-empty bins, of
    ``|mean(y_prob) - mean(y_true)|`` weighted by the fraction of all samples
    that fall in the bin.

    Args:
        y_true: Array-like of 0/1 labels, same length as ``y_prob``.
        y_prob: Array-like of predicted positive-class probabilities.
        n_bins: Number of equal-width bins.

    Returns:
        The ECE as a float (``0.0`` for empty input).
    """
    # Coerce to plain arrays so lists and pandas objects are masked positionally.
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)
    if y_prob.size == 0:
        return 0.0

    bin_limits = np.linspace(0, 1, n_bins + 1)
    ece = 0.0
    for bin_index, (bin_lower, bin_upper) in enumerate(zip(bin_limits[:-1], bin_limits[1:])):
        # Bins are right-closed (lower, upper]. The first bin also includes its
        # lower edge; otherwise probabilities of exactly 0.0 (common after
        # isotonic calibration) would belong to no bin and be ignored.
        if bin_index == 0:
            above_lower = y_prob >= bin_lower
        else:
            above_lower = y_prob > bin_lower
        in_bin = above_lower & (y_prob <= bin_upper)
        prop_in_bin = in_bin.mean()
        if prop_in_bin > 0:
            accuracy_in_bin = y_true[in_bin].mean()
            avg_confidence_in_bin = y_prob[in_bin].mean()
            ece += abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
    return float(ece)

# ================== DATA PREPARATION ==================
# Load the dataset; a missing file is fatal for the whole workflow.
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"CRITICAL ERROR: The file '{csv_path}' was not found. Please check the path.")
    # Raise SystemExit directly instead of calling the interactive `exit()` helper:
    # it does not depend on the `site` module, stops execution immediately, and
    # reports a non-zero status so callers can detect the failure.
    raise SystemExit(1) from None
else:
    print(f"Dataset '{csv_path}' loaded successfully.")

# Define target and features
target_col = 'diagnosed_diabetes'
# Drop rows with NaN in the target column before splitting
df.dropna(subset=[target_col], inplace=True)
# Drop the target and the closely related 'diabetes_stage' column to prevent data leakage
feature_cols = df.drop(columns=[target_col, 'diabetes_stage']).columns

X = df[feature_cols]
y = df[target_col]

# Split data: 60% train, 20% calibration, 20% test.
# First carve off 40%, then halve it; both splits are stratified on the target
# and share the same seed so the partition is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=random_seed, stratify=y)
X_calib, X_test, y_calib, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=random_seed, stratify=y_temp)

# Identify column types for preprocessing
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numerical_features = X.select_dtypes(include=['number']).columns

# Create the preprocessing pipeline:
# numeric columns are standardised, categorical columns are one-hot encoded
# (categories unseen at fit time are ignored at transform time), and any column
# matched by neither selector is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# ================== MODELING & CALIBRATION ==================
print("\n--- Training and Calibrating the Model ---")

# Define a base model known to benefit from calibration (e.g., Gradient Boosting)
base_classifier = GradientBoostingClassifier(n_estimators=100, random_state=random_seed)

# 1. Create and train the UNCALIBRATED model pipeline
uncalibrated_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', base_classifier)
])
uncalibrated_pipeline.fit(X_train, y_train)
print("Uncalibrated model trained.")

# 2. Create and train the CALIBRATED model
# This wraps the already-trained classifier and learns a correction function
# (isotonic regression, a non-parametric method) on the held-out calibration split.
fitted_classifier = uncalibrated_pipeline.named_steps['classifier']
try:
    # scikit-learn >= 1.6 deprecates cv='prefit'; the supported way to calibrate
    # an already-fitted model is to wrap it in FrozenEstimator.
    from sklearn.frozen import FrozenEstimator
except ImportError:
    # Older scikit-learn: FrozenEstimator does not exist, cv='prefit' is the API.
    calibrated_classifier = CalibratedClassifierCV(
        estimator=fitted_classifier,
        method='isotonic',
        cv='prefit'
    )
else:
    # ensemble=False fits a single calibrator on all calibration rows; the frozen
    # classifier is never refit, matching the behaviour of the old cv='prefit'.
    calibrated_classifier = CalibratedClassifierCV(
        estimator=FrozenEstimator(fitted_classifier),
        method='isotonic',
        ensemble=False
    )
# Pre-process the calibration data before fitting the calibrator, because the
# calibrator wraps only the classifier step, not the whole pipeline.
X_calib_processed = uncalibrated_pipeline.named_steps['preprocessor'].transform(X_calib)
calibrated_classifier.fit(X_calib_processed, y_calib)
print("Model calibration complete.")

# ================== EVALUATION ==================
print("\n--- Comparative Evaluation ---")

# Uncalibrated model: the pipeline handles preprocessing internally.
prob_uncalib = uncalibrated_pipeline.predict_proba(X_test)[:, 1]
y_pred_uncalib = uncalibrated_pipeline.predict(X_test)

# Calibrated model: it wraps only the classifier, so the test features are
# run through the fitted preprocessor first.
fitted_preprocessor = uncalibrated_pipeline.named_steps['preprocessor']
X_test_processed = fitted_preprocessor.transform(X_test)
prob_calib = calibrated_classifier.predict_proba(X_test_processed)[:, 1]
y_pred_calib = calibrated_classifier.predict(X_test_processed)

# --- Performance Metrics ---
class_display_names = ["No Diabetes", "Diabetes"]
uncalibrated_report = classification_report(y_test, y_pred_uncalib, target_names=class_display_names)
calibrated_report = classification_report(y_test, y_pred_calib, target_names=class_display_names)

print("\n--- Performance (Accuracy, etc.) ---")
print("\n[Uncalibrated Model]")
print(uncalibrated_report)
print("\n[Calibrated Model]")
print(calibrated_report)


# --- Calibration Metrics (Lower is better) ---
print("\n--- Calibration Quality (Lower is Better) ---")
y_test_values = y_test.values  # plain array for the ECE helper
brier_uncalib, brier_calib = (
    brier_score_loss(y_test, positive_probs)
    for positive_probs in (prob_uncalib, prob_calib)
)
ece_uncalib, ece_calib = (
    expected_calibration_error(y_test_values, positive_probs)
    for positive_probs in (prob_uncalib, prob_calib)
)

print(f"Brier Score (Uncalibrated): {brier_uncalib:.4f}")
print(f"Brier Score (Calibrated):   {brier_calib:.4f}")
print(f"\nExpected Cal. Error (Uncalibrated): {ece_uncalib:.4f}")
print(f"Expected Cal. Error (Calibrated):   {ece_calib:.4f}")


# ================== AUDIT LOGGING ==================
print("\n--- Writing Audit Logs for Triage Comparison ---")

# Persist environment header for accountability
run_header = {
    "run_id": run_id,
    "timestamp_utc": timestamp,
    "env": {
        "python": platform.python_version(),
        "sklearn": sklearn.__version__,
    },
    "model": {
        "type": "ProbabilityCalibrationComparison",
        "version": model_version,
        "base_classifier": type(base_classifier).__name__,
        "calibration_method": "isotonic"
    },
    "risk_thresholds": {"high": risk_threshold_high, "low": risk_threshold_low},
    "target_col": target_col
}
# Snapshot of the most recent run, kept at its original path for existing readers.
with open("audit_logs/run_header_calibration.json", "w", encoding="utf-8") as f:
    json.dump(run_header, f, indent=2)
# Append-only history of run headers (one JSON object per line), so the
# environment of earlier runs is not lost when the snapshot is overwritten.
with open("audit_logs/run_headers_calibration.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(run_header) + "\n")

# Write a log for each prediction, comparing calibrated vs. uncalibrated outputs.
# The file is opened in append mode: records from earlier runs are preserved and
# remain distinguishable through their "run_id" field.
log_path = "audit_logs/predictions_calibration.jsonl"
with open(log_path, "a", encoding="utf-8") as f:
    for record_index, true_label, p_uncalib, p_calib in zip(
        X_test.index, y_test, prob_uncalib, prob_calib
    ):
        # Assign a risk flag based on the TRUSTED (calibrated) probability
        if p_calib >= risk_threshold_high:
            risk_assessment = "HIGH_RISK"
        elif p_calib <= risk_threshold_low:
            risk_assessment = "LOW_RISK"
        else:
            risk_assessment = "UNCERTAIN_TRIAGE"  # Case for manual review

        # Cast numpy scalars to built-in types so they are JSON serialisable.
        record = {
            "run_id": run_id,
            "record_index": int(record_index),
            "true_label": int(true_label),
            "uncalibrated_prob": float(p_uncalib),
            "calibrated_prob": float(p_calib),
            "calibration_impact": float(p_calib - p_uncalib),
            "risk_assessment": risk_assessment
        }
        f.write(json.dumps(record) + "\n")

print(f"Audit logs successfully written to '{log_path}'.")
print("\n--- Workflow Complete ---")

Dataset '/content/diabetes_dataset.csv' loaded successfully.

--- Training and Calibrating the Model ---
Uncalibrated model trained.
Model calibration complete.

--- Comparative Evaluation ---

--- Performance (Accuracy, etc.) ---

[Uncalibrated Model]
              precision    recall  f1-score   support

 No Diabetes       0.83      1.00      0.91      8001
    Diabetes       1.00      0.87      0.93     11999

    accuracy                           0.92     20000
   macro avg       0.92      0.93      0.92     20000
weighted avg       0.93      0.92      0.92     20000


[Calibrated Model]




              precision    recall  f1-score   support

 No Diabetes       0.83      1.00      0.91      8001
    Diabetes       1.00      0.87      0.93     11999

    accuracy                           0.92     20000
   macro avg       0.92      0.93      0.92     20000
weighted avg       0.93      0.92      0.92     20000


--- Calibration Quality (Lower is Better) ---
Brier Score (Uncalibrated): 0.0663
Brier Score (Calibrated):   0.0663

Expected Cal. Error (Uncalibrated): 0.0042
Expected Cal. Error (Calibrated):   0.0041

--- Writing Audit Logs for Triage Comparison ---
Audit logs successfully written to 'audit_logs/predictions_calibration.jsonl'.

--- Workflow Complete ---
