# Metadata Postprocessing & Metrics Calculation


## Imports and Global Variables

In [34]:

import os
from collections import OrderedDict
import pandas as pd
import numpy as np
import os
import re
import glob
from sklearn.calibration import expit
import sys
import yaml

# Set sys.path to the parent directory of the current working directory
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
from src.eval.metrics import calculate_all_metrics

In [35]:
# Global task and dataset order for consistent plotting
TASK_MAPPING = OrderedDict([
    ("mortality", "Mortality"),
    ("aki", "AKI"),
    ("sepsis", "Sepsis"),
])

DATASET_MAPPING = OrderedDict([
    ("hirid", "HiRID"),
    ("miiv", "MIMIC-IV"),
    ("eicu", "eICU"),
])
MODEL_LIST = [
    "RandomForest", "LightGBM", "XGBoost", "CNN", "InceptionTime", "LSTM", "GRU",
    "GPT4o", "Gemini2p5flash", "Llama3", "Mistral", "DeepseekR1Llama8b", "Gemma3", "MedGemma"
]

CONVML_MODELS = ["RandomForest", "XGBoost", "LightGBM"]
CONVDL_MODELS = ["CNN", "InceptionTime", "GRU", "LSTM"]
LLM_MODELS = {
    "OpenAIo3": "OpenAI-o3",
    "ClaudeSonnet4": "Claude-Sonnet-4",
    "Grok4": "Grok-4",
    "Gemini2p5pro": "Gemini-2.5-Pro",
    "Gemini2p5flash": "Gemini-2.5-Flash",
    "Llama3": "Llama-3.1-8B-Instruct",
    "DeepseekR1Llama8b": "Deepseek-R1-Distill-Llama-8B",
    "Mistral": "Mistral-7B-Instruct-v0.3",
    "Gemma3": "Gemma-3-4B-it",  # 4B or 12B
    "MedGemma": "MedGemma-4B-it",
}

SUBGROUP_TYPES = ["Sex", "Age", "BMI"]
SEX_LIST = ["Male", "Female"]
AGE_LIST = ["18-65 Years", "65-75 Years", "75-91 Years"]
BMI_LIST = ["BMI < 18.5 kg/m2", "BMI 18.5-25 kg/m2", "BMI 25-30 kg/m2", "BMI > 30 kg/m2"]

PROMPTING_ID_DISPLAY_MAPPING = {
    "sarvari_2024_aggregation_preprocessor": "Aggregation",
    "zhu_2024b_zero_shot_preprocessor": "Zero-Shot",
    "zhu_2024b_one_shot_preprocessor": "One-Shot",
    "liu_2023_few_shot_preprocessor": "Few-Shot (3)",
    "zhu_2024a_cot_preprocessor": "CoT",
    "zhu_2024c_categorization_summary_agent_preprocessor": "SumAgent",
    "collaborative_reasoning_agent_preprocessor": "ColAgent",
    "clinical_workflow_agent_preprocessor": "ClinFlowAgent",
    "hybrid_reasoning_agent_preprocessor": "HybReAgent",
}

METRICS_MAPPING = {
    "auroc": "AUROC",
    "auprc": "AUPRC",
    "normalized_auprc": "Normalized AUPRC",
    "minpse": "Min(+P, Se)",
    "recall": "Sensitivity (Recall)",
    "specificity": "Specificity",
    "precision": "Precision",
    "f1_score": "F1 Score",
    "accuracy": "Accuracy",
    "balanced_accuracy": "Balanced Accuracy",
    "mcc": "MCC",
    "kappa": "Cohen's Kappa",
}

MODEL_CONFIG_PATH_MAPPING = {
    "configs/model_configs/Gemma34BModel.yaml": "Gemma3",
    "configs/model_configs/MedGemma4bModel.yaml": "MedGemma",
}

## Input Setup

In [36]:
# Utility to generate all valid LLM model/prompting_id input paths
def get_llm_prompting_id_paths(llms_root_dir, prompting_id_display_mapping, allowed_models=None, allowed_prompting_ids=None):
    """
    Scan the LLMs results directory and return all valid (model, prompting_id) subdirectory paths.
    Only includes subdirectories that match a key in PROMPTING_ID_DISPLAY_MAPPING.
    Optionally filter by allowed_models and/or allowed_prompting_ids.
    Args:
        llms_root_dir (str): Path to the root LLMs directory (e.g., .../results_benchmark/llms)
        prompting_id_display_mapping (dict): Mapping of valid prompting_id subdirectory names
        allowed_models (list, optional): List of allowed model directory names (str)
        allowed_prompting_ids (list, optional): List of allowed prompting_id subdirectory names (str)
    Returns:
        list: List of full paths for all valid (model, prompting_id) combinations
    """
    valid_prompting_ids = set(prompting_id_display_mapping.keys())
    if allowed_prompting_ids is not None:
        valid_prompting_ids = valid_prompting_ids & set(allowed_prompting_ids)
    all_paths = []
    if not os.path.isdir(llms_root_dir):
        print(f"LLMs root directory not found: {llms_root_dir}")
        return all_paths
    for model_name in os.listdir(llms_root_dir):
        if allowed_models is not None and model_name not in allowed_models:
            continue
        model_dir = os.path.join(llms_root_dir, model_name)
        if not os.path.isdir(model_dir):
            continue
        for subdir in os.listdir(model_dir):
            if subdir not in valid_prompting_ids:
                continue
            subdir_path = os.path.join(model_dir, subdir)
            if os.path.isdir(subdir_path):
                all_paths.append(subdir_path)
    return all_paths


def get_model_name_from_config(config_path):
    try:
        with open(config_path, "r") as f:
            config = yaml.safe_load(f)
        load_models = config.get("load_models", [])
        if isinstance(load_models, list) and load_models:
            model_config_path = load_models[0]
            return MODEL_CONFIG_PATH_MAPPING.get(model_config_path, None)
    except Exception as e:
        print(f"Error reading config for model name: {e}")
    return None

In [37]:
# Get parent directory of current working directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
results_path = os.path.join(parent_dir, "output")

# Add all baseline model paths
outputfolder_path_list = [
    os.path.join(results_path, r"baseline_models/20250603_160304_CNN"),
    os.path.join(results_path, r"baseline_models/20250603_192812_GRU"),
    os.path.join(results_path, r"baseline_models/20250604_094650_LSTM"),
    os.path.join(results_path, r"baseline_models/20250617_131406_InceptionTime"),
    os.path.join(results_path, r"baseline_models/20250618_192129_XGBoost"),
    os.path.join(results_path, r"baseline_models/20250619_004541_RandomForest"),
]

# Dynamically add all valid LLM model/prompting_id paths
llms_root = os.path.join(results_path, "llms")
llm_prompting_paths = get_llm_prompting_id_paths(
    llms_root,
    PROMPTING_ID_DISPLAY_MAPPING,
    allowed_models=[
        "o3",
        "claudesonnet4",
        "grok4",
        "gemini2p5pro",
        "gemini2p5flash",
        "llama3p18b",
        "deepseekr1llama8b",
        "mistral7b",
        "gemma34b",
        "medgemma4b",
    ],
    allowed_prompting_ids=[
        "sarvari_2024_aggregation_preprocessor",
        "zhu_2024b_zero_shot_preprocessor",
        "zhu_2024b_one_shot_preprocessor",
        "liu_2023_few_shot_preprocessor",
        "zhu_2024a_cot_preprocessor",
        "zhu_2024c_categorization_summary_agent_preprocessor",
        "collaborative_reasoning_agent_preprocessor",
        "clinical_workflow_agent_preprocessor",
        "hybrid_reasoning_agent_preprocessor",
    ]
)
outputfolder_path_list += llm_prompting_paths
print(f"Total output folders (Baselines + LLMs): {len(outputfolder_path_list)}")
for path in outputfolder_path_list:
    print(path)

# Global output directory for all visualizations
OUTPUT_BASE_DIR = os.path.join("..", "visualizations", "benchmark_baseline_models")

Total output folders (Baselines + LLMs): 68
/Users/sophiaehlers/Documents/pulse_agents/output/baseline_models/20250603_160304_CNN
/Users/sophiaehlers/Documents/pulse_agents/output/baseline_models/20250603_192812_GRU
/Users/sophiaehlers/Documents/pulse_agents/output/baseline_models/20250604_094650_LSTM
/Users/sophiaehlers/Documents/pulse_agents/output/baseline_models/20250617_131406_InceptionTime
/Users/sophiaehlers/Documents/pulse_agents/output/baseline_models/20250618_192129_XGBoost
/Users/sophiaehlers/Documents/pulse_agents/output/baseline_models/20250619_004541_RandomForest
/Users/sophiaehlers/Documents/pulse_agents/output/llms/gemma34b/clinical_workflow_agent_preprocessor
/Users/sophiaehlers/Documents/pulse_agents/output/llms/gemma34b/hybrid_reasoning_agent_preprocessor
/Users/sophiaehlers/Documents/pulse_agents/output/llms/gemma34b/sarvari_2024_aggregation_preprocessor
/Users/sophiaehlers/Documents/pulse_agents/output/llms/gemma34b/zhu_2024b_one_shot_preprocessor
/Users/sophiaehle

## Data Loading and Preparation

### Function Definitions

In [38]:
def categorize_files(outputfolder_path_list):
    """
    Categorize files in the output folders into metrics report files, metadata files, and log files.

    Args:
        outputfolder_path_list (list): List of output folder paths.

    Returns:
        dict: A dictionary containing categorized files.
    """
    file_list = []
    file_sources = {}  # Track which folder each file came from
    
    for outputfolder_path in outputfolder_path_list:
        folder_files = glob.glob(os.path.join(outputfolder_path, "*"))
        for file_path in folder_files:
            if file_path in file_sources:
                print(f"⚠️  Duplicate file found:")
                print(f"     File: {file_path}")
                print(f"     First found in: {file_sources[file_path]}")
                print(f"     Also found in: {outputfolder_path}")
            else:
                file_sources[file_path] = outputfolder_path
        file_list.extend(folder_files)

    # Remove duplicates to ensure unique file paths
    file_list = list(set(file_list))

    categorized_files = {
        "metrics_report_files": [f for f in file_list if "metrics_report" in f],
        "metadata_files": [f for f in file_list if "metadata" in f],
        "log_files": [f for f in file_list if "log" in f],
        "config_files": [f for f in file_list if "config" in f],
    }

    print(f"Found {len(categorized_files['metrics_report_files'])} metrics report files, {len(categorized_files['metadata_files'])} metadata files, {len(categorized_files['log_files'])} log files.")
    return categorized_files


def load_metadata(metadata_path_list):
    """
    Load metadata from CSV files into a DataFrame.
    Enhanced version with better error handling and path parsing.
    Adds a 'prompting_id' column for LLM models using the config yaml in the same directory.
    Adds an 'is_agent' column: True if unmapped prompting_id contains 'agent', else False.
    For LLM metadata files, if the column 'System Message Index' exists, only keep rows where its value is 1.
    For ambiguous Gemma3/MedGemma cases, uses config_copy.yaml to assign correct model name.
    """
    df_mdata = pd.DataFrame()

    for m_path in metadata_path_list:
        try:
            df = pd.read_csv(m_path)
            df["source_file"] = m_path

            filename = os.path.basename(m_path)
            folder_path = os.path.dirname(m_path)
            folder_name = os.path.basename(folder_path)
            patterns = [
                r"([^_]+)_([^_]+)_([^_]+)_(\d{8}_\d{6})_metadata\.csv$",
                r"([^_]+)_([^_]+)_([^_]+)_output_metadata\.csv$",
                r"([^_]+)_([^_]+)_([^_]+)_metadata\.csv$",
                r"([^_]+)_([^_]+)_([^_]+).*metadata\.csv$",
            ]
            extracted = False
            timestamp = "Unknown"
            folder_match = re.search(r"(\d{8}_\d{6})", folder_name)
            if folder_match:
                timestamp = folder_match.group(1)
            model_name, task, dataset = None, None, None

            # Try to extract model_name, task, dataset from filename
            for i, pattern in enumerate(patterns):
                match = re.search(pattern, filename)
                if match:
                    if i == 0:
                        model_name, task, dataset, timestamp = match.groups()
                    else:
                        model_name, task, dataset = match.groups()
                    if model_name.endswith("Model"):
                        model_name = model_name[:-5]
                    extracted = True
                    break

            # --- Robust model name assignment for Gemma3/MedGemma ---
            # If ambiguous (model_name == "Gemma3"), use config_copy.yaml to resolve
            if model_name == "Gemma3":
                config_files = [f for f in os.listdir(folder_path) if f.startswith("config_copy") and f.endswith(".yaml")]
                if config_files:
                    config_path = os.path.join(folder_path, config_files[0])
                    resolved_name = get_model_name_from_config(config_path)
                    if resolved_name:
                        model_name = resolved_name

            df["model_name"] = model_name
            df["task"] = task
            df["dataset"] = dataset
            df["timestamp"] = timestamp

            # Add prompting_id column (empty by default)
            df["prompting_id"] = "" # Use empty string for missing prompting_id

            # If LLM, try to get prompting_id from config yaml in the same folder
            llm_names = list(LLM_MODELS.keys()) + list(LLM_MODELS.values())
            if model_name in llm_names:
                config_files = [f for f in os.listdir(folder_path) if f.endswith(".yaml") or f.endswith(".yml")]
                for config_file in config_files:
                    config_path = os.path.join(folder_path, config_file)
                    try:
                        with open(config_path, "r") as f:
                            config = yaml.safe_load(f)
                        # Try to get prompting_ids from config
                        prompting_ids = None
                        if "prompting" in config and "prompting_ids" in config["prompting"]:
                            prompting_ids = config["prompting"]["prompting_ids"]
                        if prompting_ids:
                            # Use first prompting_id if list, else as string
                            if isinstance(prompting_ids, list):
                                df["prompting_id"] = prompting_ids[0]
                            else:
                                df["prompting_id"] = prompting_ids
                            break
                    except Exception:
                        continue

            # Add is_agent column: True if unmapped prompting_id contains 'agent', else False
            def compute_is_agent_row(row):
                if model_name in llm_names and isinstance(row['prompting_id'], str):
                    return 'agent' in row['prompting_id'].lower()
                return False
            df['is_agent'] = df.apply(compute_is_agent_row, axis=1)

            df_mdata = pd.concat([df_mdata, df], ignore_index=True)
        except Exception as e:
            print(f"Error loading metadata from {m_path}: {e}")
            continue

    if not df_mdata.empty:
        df_mdata["model_type"] = df_mdata.apply(
            lambda row: categorize_model_type(
                row["model_name"],
                context=f"file={row.get('source_file', 'unknown')}, row_index={row.name}, model_name={row.get('model_name')}",
            ),
            axis=1,
        )
        df_mdata = unpack_metadata_by_model_type(df_mdata)
        df_mdata = apply_task_dataset_mappings(df_mdata)
        # Apply all name mappings before ordering
        df_mdata = apply_prompting_id_mapping(df_mdata)
        df_mdata = map_llm_model_names(df_mdata)
        # Now order using display names only
        df_mdata = order_dataframe_by_mappings(df_mdata)
        # Add model_prompting_id column: LLMs get model_name+prompting_id, others just model_name
        def get_model_prompting_id(row):
            if row['model_type'] == 'LLM':
                return f"{row['model_name']}, {row['prompting_id']}" if row['prompting_id'] else row['model_name']
            else:
                return row["model_name"]
        df_mdata["model_prompting_id"] = df_mdata.apply(get_model_prompting_id, axis=1)
        print(
            f"Loaded metadata: {df_mdata.shape[0]} rows, {df_mdata.shape[1]} columns."
        )
        return df_mdata

def apply_prompting_id_mapping(df_metadata):
    """
    Map the prompting_id column to display names using PROMPTING_ID_DISPLAY_MAPPING.
    """
    if 'prompting_id' in df_metadata.columns:
        df_metadata['prompting_id'] = df_metadata['prompting_id'].map(PROMPTING_ID_DISPLAY_MAPPING).fillna(df_metadata['prompting_id'])
    return df_metadata


# Map LLM model names to display names using the LLM_MODELS dictionary
def map_llm_model_names(df_metadata):
    """
    Map LLM model names to their display names using the LLM_MODELS dictionary.
    """
    if 'model_name' in df_metadata.columns:
        df_metadata['model_name'] = df_metadata['model_name'].replace(LLM_MODELS)
    return df_metadata


def categorize_model_type(model_name, context=None):
    """
    Categorize model into convML, convDL, or LLM based on model name.
    """
    if model_name in CONVML_MODELS:
        return "convML"
    elif model_name in CONVDL_MODELS:
        return "convDL"
    elif model_name in LLM_MODELS.keys() or model_name in LLM_MODELS.values():
        return "LLM"
    else:
        msg = f"Warning: Unknown model type for {model_name}, defaulting to 'Unknown'"
        if context:
            msg += f" | Context: {context}"
        print(msg)
        return "Unknown"


def unpack_metadata_by_model_type(df_metadata):
    """
    Unpack metadata based on model type (convDL vs convML vs LLM).
    
    Args:
        df_metadata (DataFrame): Metadata DataFrame with model_type column
        
    Returns:
        DataFrame: Unpacked DataFrame where each row is one sample
    """
    def parse_array_string(array_str):
        """Parse string representation of numpy array to actual array."""
        import ast
        import numpy as np
        try:
            # Remove newlines and extra spaces, then evaluate
            clean_str = ' '.join(array_str.split())
            # Handle numpy array string format
            if array_str.startswith('[[') or array_str.startswith('['):
                # Convert string to actual array
                array_str = array_str.replace('[', '').replace(']', '').replace('\n', ' ')
                values = [float(x) for x in array_str.split() if x.strip()]
                return np.array(values)
            else:
                return ast.literal_eval(array_str)
        except:
            print(f"Warning: Could not parse array string: {array_str[:100]}...")
            return None
    
    unpacked_rows = []
    
    for idx, row in df_metadata.iterrows():
        model_type = row['model_type']
        
        if model_type == 'convDL':
            # convDL models have batch structure - need to unpack predictions and labels
            try:
                # Parse predictions and labels
                predictions = parse_array_string(row['prediction'])
                labels = parse_array_string(row['label'])
                
                # Parse demographic arrays (these are repeated for the batch)
                age_array = parse_array_string(row['age'])
                sex_array = parse_array_string(row['sex'])
                height_array = parse_array_string(row['height'])
                weight_array = parse_array_string(row['weight'])
                
                if predictions is not None and labels is not None:
                    # Create individual rows for each prediction/label pair
                    for i in range(len(predictions)):
                        new_row = row.copy()
                        new_row['prediction'] = predictions[i]
                        new_row['label'] = labels[i]
                        new_row['age'] = age_array[i] if age_array is not None and i < len(age_array) else None
                        new_row['sex'] = sex_array[i] if sex_array is not None and i < len(sex_array) else None
                        new_row['height'] = height_array[i] if height_array is not None and i < len(height_array) else None
                        new_row['weight'] = weight_array[i] if weight_array is not None and i < len(weight_array) else None
                        unpacked_rows.append(new_row)
                else:
                    print(f"Warning: Could not parse predictions/labels for row {idx}")
                    unpacked_rows.append(row)
                    
            except Exception as e:
                print(f"Error unpacking convDL row {idx}: {e}")
                unpacked_rows.append(row)
                
        elif model_type == 'convML':
            # convML models already have individual sample structure - parse arrays
            try:
                # Parse the arrays but keep individual values
                predictions = parse_array_string(row['prediction'])
                labels = parse_array_string(row['label'])
                age_array = parse_array_string(row['age'])
                sex_array = parse_array_string(row['sex'])
                height_array = parse_array_string(row['height'])
                weight_array = parse_array_string(row['weight'])
                
                if predictions is not None and labels is not None:
                    # Create individual rows for each sample
                    for i in range(len(predictions)):
                        new_row = row.copy()
                        new_row['prediction'] = predictions[i]
                        new_row['label'] = labels[i]
                        new_row['age'] = age_array[i] if age_array is not None and i < len(age_array) else None
                        new_row['sex'] = sex_array[i] if sex_array is not None and i < len(sex_array) else None
                        new_row['height'] = height_array[i] if height_array is not None and i < len(height_array) else None
                        new_row['weight'] = weight_array[i] if weight_array is not None and i < len(weight_array) else None
                        unpacked_rows.append(new_row)
                else:
                    print(f"Warning: Could not parse arrays for row {idx}")
                    unpacked_rows.append(row)
                    
            except Exception as e:
                print(f"Error unpacking convML row {idx}: {e}")
                unpacked_rows.append(row)
                
        elif model_type == 'LLM':
            # LLM models are already unpacked - extract target label and predicted probability
            # LLM metadata structure: Input Prompt, Target Label, Predicted Probability, Predicted Diagnosis, 
            # Predicted Explanation, Tokenization Time, Inference Time, Input Tokens, Output Tokens
            new_row = row.copy()
            
            # Map LLM columns to standard format
            if 'Target Label' in row:
                new_row['label'] = row['Target Label']
            elif 'target_label' in row:
                new_row['label'] = row['target_label']
            
            if 'Predicted Probability' in row:
                new_row['prediction'] = row['Predicted Probability']
            elif 'predicted_probability' in row:
                new_row['prediction'] = row['predicted_probability']
            
            # Set placeholder demographics - these will be mapped from convML models later
            new_row['age'] = None
            new_row['sex'] = None
            new_row['height'] = None
            new_row['weight'] = None
            
            unpacked_rows.append(new_row)
            
        else:
            # Unknown model type - keep as is
            context = f"file={row.get('source_file', 'unknown')}, row_index={idx}, model_name={row.get('model_name')}"
            print(f"Warning: Unknown model type '{model_type}' for row {idx} | Context: {context}")
            unpacked_rows.append(row)
    
    # Create new DataFrame from unpacked rows
    if unpacked_rows:
        df_unpacked = pd.DataFrame(unpacked_rows)
        print(f"Unpacked {len(df_metadata)} batch rows into {len(df_unpacked)} individual sample rows")
        
        # Show breakdown by model type
        model_type_counts = df_unpacked['model_type'].value_counts()
        print(f"Breakdown by model type:")
        for model_type, count in model_type_counts.items():
            print(f"  {model_type}: {count:,} samples")
        
        return df_unpacked
    else:
        print("Warning: No rows were successfully unpacked")
        return df_metadata


def apply_task_dataset_mappings(df_metadata):
    """
    Apply task and dataset name mappings according to global TASK_MAPPING and DATASET_MAPPING.
    Args:
        df_metadata (DataFrame): Metadata DataFrame
    Returns:
        DataFrame: DataFrame with mapped task and dataset names
    """
    # Create reverse mappings to find keys from current values
    task_reverse_mapping = {}
    for key, display_name in TASK_MAPPING.items():
        task_reverse_mapping[key] = display_name
        task_reverse_mapping[key.lower()] = display_name
        task_reverse_mapping[key.capitalize()] = display_name
        task_reverse_mapping[key.upper()] = display_name
    dataset_reverse_mapping = {}
    for key, display_name in DATASET_MAPPING.items():
        dataset_reverse_mapping[key] = display_name
        dataset_reverse_mapping[key.lower()] = display_name
        dataset_reverse_mapping[key.capitalize()] = display_name
        dataset_reverse_mapping[key.upper()] = display_name
    # Apply task mappings
    original_tasks = df_metadata['task'].unique()
    df_metadata['task'] = df_metadata['task'].map(task_reverse_mapping).fillna(df_metadata['task'])
    mapped_tasks = df_metadata['task'].unique()
    # Apply dataset mappings
    original_datasets = df_metadata['dataset'].unique()
    df_metadata['dataset'] = df_metadata['dataset'].map(dataset_reverse_mapping).fillna(df_metadata['dataset'])
    mapped_datasets = df_metadata['dataset'].unique()
    return df_metadata


def order_dataframe_by_mappings(df_metadata):
    """
    Order the dataframe according to the order defined in global mappings.
    Hierarchical ordering: model_type (convML → convDL → LLM) → model_name → prompting_id → dataset → task
    NOTE: This function expects display names to already be mapped before calling.
    Args:
        df_metadata (DataFrame): Metadata DataFrame with mapped display names
    Returns:
        DataFrame: Ordered DataFrame
    """
    # Define order based on mappings
    task_order = list(TASK_MAPPING.values())
    dataset_order = list(DATASET_MAPPING.values())
    
    # Create model type ordering
    model_type_order = ['convML', 'convDL', 'LLM']
    
    # Create model name ordering within each model type - use display names only
    convml_model_order = CONVML_MODELS
    convdl_model_order = CONVDL_MODELS
    # For LLM models, use only the display names
    llm_model_order = list(LLM_MODELS.values())
    
    # Create prompting_id ordering (for LLMs) - use only display names
    prompting_id_order = list(PROMPTING_ID_DISPLAY_MAPPING.values())
    
    # Create categorical columns for proper sorting
    df_metadata['model_type'] = pd.Categorical(
        df_metadata['model_type'],
        categories=model_type_order,
        ordered=True
    )
    
    # Create a combined model ordering with display names only
    all_model_order = convml_model_order + convdl_model_order + llm_model_order
    df_metadata['model_name'] = pd.Categorical(
        df_metadata['model_name'], 
        categories=all_model_order,
        ordered=True
    )
    
    # Create prompting_id categorical (empty string for non-LLM models)
    if 'prompting_id' in df_metadata.columns:
        # Add empty string to the beginning and include only display names
        prompting_id_categories = [''] + prompting_id_order
        df_metadata['prompting_id'] = pd.Categorical(
            df_metadata['prompting_id'],
            categories=prompting_id_categories,
            ordered=True
        )
    
    df_metadata['task'] = pd.Categorical(
        df_metadata['task'],
        categories=task_order,
        ordered=True
    )
    df_metadata['dataset'] = pd.Categorical(
        df_metadata['dataset'],
        categories=dataset_order,
        ordered=True
    )
    
    # Sort hierarchically: model_type → model_name → prompting_id → dataset → task
    sort_columns = ['model_type', 'model_name']
    if 'prompting_id' in df_metadata.columns:
        sort_columns.append('prompting_id')
    sort_columns.extend(['dataset', 'task'])
    
    df_metadata = df_metadata.sort_values(sort_columns, ignore_index=True)
    return df_metadata

In [39]:
def add_sample_index_to_metadata(df_metadata):
    """
    Add sample_index to metadata to ensure consistent sample alignment across models.
    For convML/convDL: index by (model_name, task, dataset).
    For LLM: index by (model_name, prompting_id, task, dataset).
    For agent LLMs (is_agent=True): sample_index increments at Step Name == 1 or Step Name == 'SAMPLE_METADATA'.
    Uses original_row_index to guarantee sample_index follows original row order.
    Args:
        df_metadata (DataFrame): Metadata DataFrame after unpacking
    Returns:
        DataFrame: DataFrame with sample_index column added
    """
    df_result = df_metadata.copy()
    # convML/convDL
    mask_ml = df_result['model_type'].isin(['convML', 'convDL'])
    group_cols_ml = ['model_name', 'task', 'dataset']
    for keys, group in df_result[mask_ml].groupby(group_cols_ml, observed=True):
        idx = group.index
        # Sort by original_row_index to preserve original order
        group_sorted = group.sort_values('original_row_index')
        sample_indices = np.arange(len(group_sorted)).astype(int)
        df_result.loc[group_sorted.index, 'sample_index'] = sample_indices
    # LLM
    mask_llm = df_result['model_type'] == 'LLM'
    group_cols_llm = ['model_name', 'prompting_id', 'task', 'dataset']
    for keys, group in df_result[mask_llm].groupby(group_cols_llm, observed=True):
        idx = group.index
        is_agent = group['is_agent'].iloc[0] if 'is_agent' in group else False
        # Sort by original_row_index to preserve original order
        group_sorted = group.sort_values('original_row_index')

        if is_agent:
            sample_indices = np.full(len(group_sorted), -1, dtype=int)
            current_index = -1
            prompting_id = group['prompting_id'].iloc[0] if 'prompting_id' in group else ''

            for i, (_, row) in enumerate(group_sorted.iterrows()):
                step_name = row.get('Step Name', None)
                step_number = row.get('Step Number', None)

                # Different increment logic based on prompting_id
                if prompting_id == 'SumAgent':
                    # For SumAgent: increment at Step Number == 1
                    if step_number == 1:
                        current_index += 1
                else:
                    # For other agents: increment at Step Name == 'SAMPLE_METADATA'
                    if isinstance(step_name, str) and step_name.strip().upper() == 'SAMPLE_METADATA':
                        current_index += 1

                sample_indices[i] = current_index
            df_result.loc[group_sorted.index, 'sample_index'] = sample_indices.astype(int)
        else:
            sample_indices = np.arange(len(group_sorted)).astype(int)
            df_result.loc[group_sorted.index, 'sample_index'] = sample_indices
    # Ensure column is int type (if possible)
    if 'sample_index' in df_result.columns:
        try:
            df_result['sample_index'] = df_result['sample_index'].astype(int)
        except Exception:
            pass
    print(f"Sample indices added for all model-type groups (using original_row_index order).")
    return df_result

In [40]:
def quality_control_check_sample_counts(df_metadata):
    """
    Quality control function to check if task-dataset combinations have 
    the same number of rows for each model_name.
    
    Args:
        df_metadata (DataFrame): Metadata DataFrame with unpacked samples
        
    Returns:
        dict: Quality control results with warnings and statistics
    """
    qc_results = {
        'passed': True,
        'warnings': [],
        'statistics': {},
        'inconsistencies': []
    }
    task_dataset_combinations = df_metadata.groupby(['task', 'dataset'])
    for (task, dataset), group in task_dataset_combinations:
        model_counts = group['model_name'].value_counts().sort_index()
        combination_key = f"{task}_{dataset}"
        qc_results['statistics'][combination_key] = model_counts.to_dict()
        unique_counts = model_counts.unique()
        if len(unique_counts) != 1:
            qc_results['passed'] = False
            expected_count = model_counts.mode().iloc[0]
            inconsistent_models = model_counts[model_counts != expected_count]
            inconsistency = {
                'task': task,
                'dataset': dataset,
                'expected_count': expected_count,
                'inconsistent_models': inconsistent_models.to_dict()
            }
            qc_results['inconsistencies'].append(inconsistency)
            warning_msg = f"Inconsistent sample counts in {task}-{dataset}: {inconsistent_models.to_dict()}"
            qc_results['warnings'].append(warning_msg)
    total_combinations = len(task_dataset_combinations)
    inconsistent_combinations = len(qc_results['inconsistencies'])
    consistent_combinations = total_combinations - inconsistent_combinations
    print(f"QC: {consistent_combinations} consistent, {inconsistent_combinations} inconsistent task-dataset combinations.")
    demographics_qc = check_demographics_overlap_convml(df_metadata)
    qc_results['demographics_check'] = demographics_qc
    if not demographics_qc['passed']:
        qc_results['passed'] = False
        qc_results['warnings'].extend(demographics_qc['warnings'])
    return qc_results

In [41]:
def map_convml_demographics_to_all_models(df_metadata):
    """
    Map demographics data from convML models to all convDL and LLM models using sample_index for alignment.
    For LLMs, mapping is performed for each unique (model_name, prompting_id, task, dataset) group.
    Args:
        df_metadata (DataFrame): Metadata DataFrame with mixed demographics and sample_index column
        
    Returns:
        DataFrame: Updated DataFrame with consistent demographics across all models
    """
    print("\n--- Demographics Mapping Step ---")
    df_result = df_metadata.copy()
    demographic_cols = ['age', 'sex', 'height', 'weight']
    required_cols = demographic_cols + ['sample_index']
    missing_cols = [col for col in required_cols if col not in df_result.columns]
    if missing_cols:
        print(f"   ❌ Missing required columns: {missing_cols}")
        return df_result
    mapping_stats = {
        'combinations_processed': 0,
        'convdl_models_updated': set(),
        'llm_models_updated': set(),
        'samples_updated': 0,
        'successful_combinations': 0,
        'failed_combinations': []
    }
    combinations = df_result.groupby(['task', 'dataset'], observed=True).size().index.tolist()
    for task, dataset in combinations:
        combination_mask = (df_result['task'] == task) & (df_result['dataset'] == dataset)
        combination_data = df_result[combination_mask].copy()
        convml_data = combination_data[combination_data['model_type'] == 'convML']
        convdl_data = combination_data[combination_data['model_type'] == 'convDL']
        llm_data = combination_data[combination_data['model_type'] == 'LLM']
        if convml_data.empty:
            continue
        if convdl_data.empty and llm_data.empty:
            continue
        mapping_stats['combinations_processed'] += 1
        convml_models = convml_data['model_name'].unique()
        reference_model = convml_models[0]
        reference_data = convml_data[convml_data['model_name'] == reference_model].set_index('sample_index')
        combination_success = True
        # Update convDL models as before
        if not convdl_data.empty:
            convdl_models = convdl_data['model_name'].unique()
            for convdl_model in convdl_models:
                convdl_model_data = convdl_data[convdl_data['model_name'] == convdl_model].set_index('sample_index')
                if not reference_data.index.equals(convdl_model_data.index):
                    mapping_stats['failed_combinations'].append(f"{task}-{dataset} convDL {convdl_model}")
                    combination_success = False
                    continue
                for sample_idx in reference_data.index:
                    row_mask = (df_result['model_name'] == convdl_model) & \
                              (df_result['task'] == task) & \
                              (df_result['dataset'] == dataset) & \
                              (df_result['sample_index'] == sample_idx)
                    for col in demographic_cols:
                        df_result.loc[row_mask, col] = reference_data.loc[sample_idx, col]
                mapping_stats['convdl_models_updated'].add(convdl_model)
                mapping_stats['samples_updated'] += len(convdl_model_data)
        # Update LLM models for each (model_name, prompting_id, task, dataset)
        if not llm_data.empty:
            llm_group_cols = ['model_name', 'prompting_id', 'task', 'dataset']
            llm_groups = llm_data.groupby(llm_group_cols, observed=True)
            for llm_keys, llm_group in llm_groups:
                llm_model, prompting_id, llm_task, llm_dataset = llm_keys
                if llm_task != task or llm_dataset != dataset:
                    continue
                llm_model_data = llm_group.set_index('sample_index')
                is_agent = llm_group['is_agent'].iloc[0] if 'is_agent' in llm_group else False

                if is_agent:
                    # For agent, map demographics to all rows with sample_index in reference_data
                    valid_indices = reference_data.index.intersection(llm_model_data.index)
                    for sample_idx in valid_indices:
                        row_mask = (df_result['model_name'] == llm_model) & \
                                  (df_result['task'] == task) & \
                                  (df_result['dataset'] == dataset) & \
                                  (df_result['sample_index'] == sample_idx) & \
                                  (df_result['prompting_id'] == prompting_id)
                        for col in demographic_cols:
                            df_result.loc[row_mask, col] = reference_data.loc[sample_idx, col]
                    mapping_stats['llm_models_updated'].add((llm_model, prompting_id, task, dataset))
                    mapping_stats['samples_updated'] += len(valid_indices)
                else:
                    if not reference_data.index.equals(llm_model_data.index):
                        fail_str = f"{task}-{dataset} LLM {llm_model} (prompting_id={prompting_id})"
                        mapping_stats['failed_combinations'].append(fail_str)
                        combination_success = False
                        continue
                    for sample_idx in reference_data.index:
                        row_mask = (df_result['model_name'] == llm_model) & \
                                  (df_result['task'] == task) & \
                                  (df_result['dataset'] == dataset) & \
                                  (df_result['sample_index'] == sample_idx) & \
                                  (df_result['prompting_id'] == prompting_id)
                        for col in demographic_cols:
                            df_result.loc[row_mask, col] = reference_data.loc[sample_idx, col]
                    mapping_stats['llm_models_updated'].add((llm_model, prompting_id, task, dataset))
                    mapping_stats['samples_updated'] += len(llm_model_data)
        if combination_success:
            mapping_stats['successful_combinations'] += 1
    total_convdl_updated = len(mapping_stats['convdl_models_updated'])
    total_llm_updated = len(mapping_stats['llm_models_updated'])
    print(f"   Processed {mapping_stats['combinations_processed']} task-dataset combinations.")
    print(f"   Updated {total_convdl_updated} convDL models and {total_llm_updated} LLM model/prompting_id/task/dataset combinations.")
    print(f"   Total samples updated: {mapping_stats['samples_updated']:,}")
    if mapping_stats['failed_combinations']:
        print(f"   Failed to map {len(mapping_stats['failed_combinations'])} model combinations.")
    verification_result = verify_demographics_mapping(df_result)
    if verification_result['perfect_consistency']:
        print(f"   Demographics mapping verification PASSED: all combinations consistent.")
    else:
        print(f"   Demographics mapping verification FAILED: {len(verification_result['inconsistent_combinations'])} inconsistent combinations.")
    print("--- End Demographics Mapping ---\n")
    return df_result


def verify_demographics_mapping(df_metadata):
    """
    Verify that all models now have identical demographics for each group used in mapping.
    For convML/convDL: check (model_name, task, dataset).
    For LLM: check (model_name, prompting_id, task, dataset).
    Returns:
        dict: Verification results
    """
    verification_results = {
        'perfect_consistency': True,
        'inconsistent_combinations': [],
        'consistent_combinations': 0
    }
    demographic_cols = ['age', 'sex', 'height', 'weight']
    # convML/convDL
    mask_ml = df_metadata['model_type'].isin(['convML', 'convDL'])
    group_cols_ml = ['task', 'dataset']
    for (task, dataset), group in df_metadata[mask_ml].groupby(group_cols_ml, observed=True):
        models = group['model_name'].unique()
        if len(models) < 2:
            continue
        reference_model = models[0]
        reference_data = group[group['model_name'] == reference_model].set_index('sample_index')[demographic_cols]
        all_models_consistent = True
        inconsistent_models = []
        for model in models[1:]:
            model_data = group[group['model_name'] == model].set_index('sample_index')[demographic_cols]
            if not reference_data.index.equals(model_data.index):
                all_models_consistent = False
                inconsistent_models.append(model)
                continue
            demographics_match = True
            for col in demographic_cols:
                if col == 'sex':
                    if not (reference_data[col] == model_data[col]).all():
                        demographics_match = False
                        break
                else:
                    if not np.allclose(reference_data[col], model_data[col], rtol=1e-6, equal_nan=True):
                        demographics_match = False
                        break
            if not demographics_match:
                all_models_consistent = False
                inconsistent_models.append(model)
        if all_models_consistent:
            verification_results['consistent_combinations'] += 1
        else:
            verification_results['perfect_consistency'] = False
            verification_results['inconsistent_combinations'].append({
                'task': task,
                'dataset': dataset,
                'reference_model': reference_model,
                'inconsistent_models': inconsistent_models
            })
    # LLM
    mask_llm = df_metadata['model_type'] == 'LLM'
    group_cols_llm = ['task', 'dataset', 'model_name']
    for (task, dataset, model_name), group in df_metadata[mask_llm].groupby(group_cols_llm, observed=True):
        prompting_ids = group['prompting_id'].unique()
        # Find convML reference row count for this task/dataset
        convml_mask = (
            (df_metadata['model_type'] == 'convML') &
            (df_metadata['task'] == task) &
            (df_metadata['dataset'] == dataset)
        )
        convml_ref_count = None
        if convml_mask.any():
            convml_ref_count = len(df_metadata[convml_mask]['sample_index'].unique())
        # Find all prompting_id groups matching convML reference row count
        matching_promptings = []
        for pid in prompting_ids:
            pid_group = group[group['prompting_id'] == pid]
            if convml_ref_count is not None and len(pid_group['sample_index'].unique()) == convml_ref_count:
                matching_promptings.append(pid)
        # Report only those that do not match convML reference
        inconsistent_promptings = []
        for pid in prompting_ids:
            pid_group = group[group['prompting_id'] == pid]
            if convml_ref_count is None or len(pid_group['sample_index'].unique()) != convml_ref_count:
                inconsistent_promptings.append(pid)
        if len(inconsistent_promptings) == 0:
            verification_results['consistent_combinations'] += 1
        else:
            verification_results['perfect_consistency'] = False
            verification_results['inconsistent_combinations'].append({
                'task': task,
                'dataset': dataset,
                'model_name': model_name,
                'inconsistent_prompting_ids': inconsistent_promptings
            })
    total_combinations = verification_results['consistent_combinations'] + len(verification_results['inconsistent_combinations'])
    if verification_results['perfect_consistency']:
        print(f"   ✅ Verification: All {total_combinations} mapped groups have consistent demographics")
    else:
        print(f"   ❌ Verification: {len(verification_results['inconsistent_combinations'])}/{total_combinations} mapped groups have inconsistent demographics")
        for inc in verification_results['inconsistent_combinations']:
            if 'inconsistent_models' in inc:
                print(f"     - {inc['task']}-{inc['dataset']}: {inc['inconsistent_models']}")
            else:
                print(f"     - {inc['task']}-{inc['dataset']} {inc['model_name']}: {inc['inconsistent_prompting_ids']}")
    return verification_results

### Running Data Loading and Preparation

In [42]:
def prepare_and_save_metadata(outputfolder_path_list, save_path="./notebook_output/postprocessed_data/pulse_metadata.csv"):
    """
    Prepare df_metadata: load, add indices, map demographics, add probabilities, reorder columns, and save.
    Returns the prepared DataFrame.
    """
    cat_files = categorize_files(outputfolder_path_list)

    # Load the metadata from identified files
    df_metadata = load_metadata(cat_files["metadata_files"])

    # Add original_row_index before any sorting or mapping
    df_metadata['original_row_index'] = np.arange(len(df_metadata))

    # Add sample_index for consistent sample alignment across models
    print("\n🔢 Adding sample_index for consistent sample alignment...")
    df_metadata = add_sample_index_to_metadata(df_metadata)

    # Display basic information about the loaded data
    print(f"\nLoaded metadata: {len(df_metadata)} records, columns: {list(df_metadata.columns)}")
    print(f"Models: {list(df_metadata['model_name'].unique())}")
    print(f"Prompting IDs: {list(df_metadata['prompting_id'].unique())}")
    print(f"Tasks: {list(df_metadata['task'].unique())}")
    print(f"Datasets: {list(df_metadata['dataset'].unique())}")

    # Map convML demographics to all models for consistency (summary printed inside function)
    df_metadata = map_convml_demographics_to_all_models(df_metadata)

    # Add probabilities column (convert convDL logits to probabilities, copy convML and LLM probabilities)
    print("\n🔄 Adding probabilities column...")
    df_metadata['probabilities'] = df_metadata['prediction'].copy()

    convdl_mask = df_metadata['model_type'] == 'convDL'
    if convdl_mask.any():
        df_metadata.loc[convdl_mask, 'probabilities'] = expit(df_metadata.loc[convdl_mask, 'prediction'])
        convdl_models = df_metadata[convdl_mask]['model_name'].unique()
        print(f"   ✅ Converted logits to probabilities for convDL models: {list(convdl_models)}")

    convml_mask = df_metadata['model_type'] == 'convML'
    if convml_mask.any():
        convml_models = df_metadata[convml_mask]['model_name'].unique()
        print(f"   ✅ Copied probabilities for convML models: {list(convml_models)}")

    llm_mask = df_metadata['model_type'] == 'LLM'
    if llm_mask.any():
        llm_models = df_metadata[llm_mask]['model_name'].unique()
        print(f"   ✅ Copied probabilities for LLM models: {list(llm_models)}")
    print(f"   📊 Probabilities column added - range: [{df_metadata['probabilities'].min():.3f}, {df_metadata['probabilities'].max():.3f}]")

    # Reorder columns to have key columns first
    first_cols = [
        'model_type', 'model_name', 'is_agent', 'prompting_id', 'task', 'dataset',
        'sample_index', 'original_row_index', 'sex', 'age', 'height', 'weight',
        'probabilities', 'prediction', 'label'
    ]
    first_cols_present = [col for col in first_cols if col in df_metadata.columns]
    other_cols = [col for col in df_metadata.columns if col not in first_cols_present]
    # Place source_file as the last column if it exists
    if "source_file" in df_metadata.columns:
        df_metadata = df_metadata[first_cols_present + other_cols + ["source_file"]]
    else:
        df_metadata = df_metadata[first_cols_present + other_cols]

    # Save the prepared metadata DataFrame
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    df_metadata.to_csv(save_path, index=False)
    print(f"Saved metadata to {save_path}")

    # Show first few rows
    print("\nFirst 5 rows of metadata:")
    display(df_metadata.head())

    return df_metadata

# Prepare and save metadata
df_metadata = prepare_and_save_metadata(outputfolder_path_list, save_path="./notebook_output/postprocessed_data/pulse_metadata.csv")

Found 68 metrics report files, 612 metadata files, 118 log files.
Unpacked 1025336 batch rows into 1062044 individual sample rows
Breakdown by model type:
  LLM: 1,024,910 samples
  convDL: 24,756 samples
  convML: 12,378 samples
Loaded metadata: 1062044 rows, 92 columns.

🔢 Adding sample_index for consistent sample alignment...
Sample indices added for all model-type groups (using original_row_index order).

Loaded metadata: 1062044 records, columns: ['Sample ID', 'Step Name', 'Step Number', 'Target Label', 'metadata_patient_age', 'metadata_patient_sex', 'metadata_patient_weight', 'metadata_patient_height', 'metadata_monitoring_hours', 'metadata_total_features_available', 'metadata_data_completeness_score', 'metadata_imputation_percentage', 'metadata_agent_type', 'metadata_sample_start_time', 'System Message', 'Input Prompt', 'Output', 'Predicted Probability', 'Predicted Diagnosis', 'Predicted Explanation', 'Requested Tests', 'Confidence', 'Tokenization Time', 'Inference Time', 'Input

Unnamed: 0,model_type,model_name,is_agent,prompting_id,task,dataset,sample_index,original_row_index,sex,age,...,metadata_specialist_probability_variance,metadata_specialist_probability_range,metadata_average_specialist_confidence,metadata_successful_specialists_count,metadata_failed_specialists_count,metadata_highest_prob_specialist,metadata_highest_conf_specialist,metadata_specialists_with_data,model_prompting_id,source_file
0,convML,RandomForest,False,,Mortality,HiRID,0,0,1.0,55.0,...,,,,,,,,,RandomForest,/Users/sophiaehlers/Documents/pulse_agents/out...
1,convML,RandomForest,False,,Mortality,HiRID,1,1,0.0,50.0,...,,,,,,,,,RandomForest,/Users/sophiaehlers/Documents/pulse_agents/out...
2,convML,RandomForest,False,,Mortality,HiRID,2,2,0.0,30.0,...,,,,,,,,,RandomForest,/Users/sophiaehlers/Documents/pulse_agents/out...
3,convML,RandomForest,False,,Mortality,HiRID,3,3,0.0,60.0,...,,,,,,,,,RandomForest,/Users/sophiaehlers/Documents/pulse_agents/out...
4,convML,RandomForest,False,,Mortality,HiRID,4,4,1.0,75.0,...,,,,,,,,,RandomForest,/Users/sophiaehlers/Documents/pulse_agents/out...


In [43]:
# Debug: Check if each model_prompting_id has all 9 task-dataset combinations
print("🔍 DEBUG: Checking task-dataset combination completeness for each model_prompting_id\n")

# Define expected combinations
expected_tasks = list(TASK_MAPPING.values())  # ["Mortality", "AKI", "Sepsis"]
expected_datasets = list(DATASET_MAPPING.values())  # ["HiRID", "MIMIC-IV", "eICU"]
expected_combinations = [(task, dataset) for task in expected_tasks for dataset in expected_datasets]
print(f"Expected {len(expected_combinations)} combinations: {expected_combinations}\n")

# Get actual combinations for each model_prompting_id
model_prompting_ids = df_metadata['model_prompting_id'].unique()
all_complete = True
missing_summary = []

for model_prompting_id in sorted(model_prompting_ids):
    model_data = df_metadata[df_metadata['model_prompting_id'] == model_prompting_id]
    actual_combinations = set(zip(model_data['task'], model_data['dataset']))
    expected_combinations_set = set(expected_combinations)
    
    missing_combinations = expected_combinations_set - actual_combinations
    
    if missing_combinations:
        all_complete = False
        missing_list = sorted(list(missing_combinations))
        missing_summary.append((model_prompting_id, missing_list))
        print(f"❌ {model_prompting_id}")
        print(f"   Missing {len(missing_combinations)}/{len(expected_combinations)} combinations:")
        for task, dataset in missing_list:
            print(f"   - {task} + {dataset}")
        print()
    else:
        print(f"✅ {model_prompting_id} - All {len(expected_combinations)} combinations present")

print(f"\n📊 SUMMARY:")
if all_complete:
    print(f"🎉 All {len(model_prompting_ids)} model_prompting_ids have complete task-dataset combinations!")
else:
    complete_count = len(model_prompting_ids) - len(missing_summary)
    print(f"⚠️  {complete_count}/{len(model_prompting_ids)} model_prompting_ids are complete")
    print(f"   {len(missing_summary)} model_prompting_ids have missing combinations:")
    for model_prompting_id, missing_list in missing_summary:
        print(f"   - {model_prompting_id}: {len(missing_list)} missing")

🔍 DEBUG: Checking task-dataset combination completeness for each model_prompting_id

Expected 9 combinations: [('Mortality', 'HiRID'), ('Mortality', 'MIMIC-IV'), ('Mortality', 'eICU'), ('AKI', 'HiRID'), ('AKI', 'MIMIC-IV'), ('AKI', 'eICU'), ('Sepsis', 'HiRID'), ('Sepsis', 'MIMIC-IV'), ('Sepsis', 'eICU')]

✅ CNN - All 9 combinations present
✅ Claude-Sonnet-4, HybReAgent - All 9 combinations present
✅ Claude-Sonnet-4, Zero-Shot - All 9 combinations present
✅ Deepseek-R1-Distill-Llama-8B, Aggregation - All 9 combinations present
✅ Deepseek-R1-Distill-Llama-8B, ClinFlowAgent - All 9 combinations present
✅ Deepseek-R1-Distill-Llama-8B, CoT - All 9 combinations present
✅ Deepseek-R1-Distill-Llama-8B, ColAgent - All 9 combinations present
✅ Deepseek-R1-Distill-Llama-8B, Few-Shot (3) - All 9 combinations present
✅ Deepseek-R1-Distill-Llama-8B, HybReAgent - All 9 combinations present
✅ Deepseek-R1-Distill-Llama-8B, One-Shot - All 9 combinations present
✅ Deepseek-R1-Distill-Llama-8B, SumAgent -

## Metrics Calculation

### Performance Metrics

In [44]:
def calculate_subgroup_metrics(df_metadata):
    """
    Calculate subgroup metrics for all models, tasks, datasets and subgroups.
    Also includes overall metrics (without subgroups) as the first row for each combination.
    Adds 'prompting_id' and 'is_agent' columns to the metrics DataFrame.
    Args:
        df_metadata (DataFrame): Metadata DataFrame with predictions and labels
    Returns:
        DataFrame: Single DataFrame with overall and all subgroup metrics
    """
    df_metadata = add_bmi_to_metadata(df_metadata)
    all_metrics_list = []
    # Group by model_type, model_name, is_agent, prompting_id, task, dataset (in this order, only if columns exist)
    group_cols = []
    for col in ['model_type', 'model_name', 'is_agent', 'prompting_id', 'task', 'dataset']:
        if col in df_metadata.columns:
            group_cols.append(col)
    for group_keys, group in df_metadata.groupby(group_cols):
        # Unpack group keys in the correct order
        key_dict = dict(zip(group_cols, group_keys if isinstance(group_keys, tuple) else [group_keys]))
        model = key_dict.get('model_name', '')
        is_agent = key_dict.get('is_agent', False)
        prompting_id = key_dict.get('prompting_id', '')
        task = key_dict.get('task', '')
        dataset = key_dict.get('dataset', '')
        # Filter for agent: only Step Name == 'final_prediction'
        group_for_metrics = group
        if is_agent and 'Step Name' in group.columns:
            group_for_metrics = group[group['Step Name'] == 'final_prediction']
        overall_metrics = calculate_overall_metrics_helper(group_for_metrics, model, task, dataset)
        if overall_metrics:
            overall_metrics['prompting_id'] = prompting_id
            overall_metrics['is_agent'] = is_agent
            all_metrics_list.append(overall_metrics)
        all_subgroups = calculate_all_subgroup_metrics(group_for_metrics, model, task, dataset)
        for m in all_subgroups:
            m['prompting_id'] = prompting_id
            m['is_agent'] = is_agent
        all_metrics_list.extend(all_subgroups)
    df_metrics = pd.DataFrame(all_metrics_list)
    print(f"📈 Summary:")
    print(f"   Total metric combinations: {len(df_metrics)}")
    return df_metrics


def calculate_overall_metrics_helper(group_data, model, task, dataset):
    """Helper function to calculate overall metrics for the entire group."""
    try:
        predictions = group_data['probabilities'].values  # Use probabilities column
        labels = group_data['label'].values

        # Calculate basic metrics
        metrics = calculate_all_metrics(labels, predictions)

        # Add metadata and counts, with key columns first
        metrics_ordered = {
            "model_name": model,
            "is_agent": (
                group_data["is_agent"].iloc[0]
                if "is_agent" in group_data.columns
                else False
            ),
            "prompting_id": (
                group_data["prompting_id"].iloc[0]
                if "prompting_id" in group_data.columns
                else ""
            ),
            "task": task,
            "dataset": dataset,
            "subgroup_type": "Overall",
            "subgroup_value": "All",
            "sample_count": len(predictions),
            "positive_count": int(np.sum(labels)),
            "negative_count": int(len(labels) - np.sum(labels)),
            "positive_rate": np.mean(labels),
        }
        metrics_ordered.update(metrics)
        metrics = metrics_ordered

        return metrics

    except Exception as e:
        return None


def add_bmi_to_metadata(df_metadata):
    """
    Add BMI calculation and categorization to the metadata DataFrame.
    
    Args:
        df_metadata (DataFrame): Metadata DataFrame with height and weight
        
    Returns:
        DataFrame: DataFrame with BMI and BMI_category columns added
    """
    df_result = df_metadata.copy()

    # Calculate BMI: weight (kg) / height (m)^2
    # Height is in cm, convert to meters
    height_m = df_result['height'] / 100
    df_result['BMI'] = df_result['weight'] / (height_m ** 2)

    # Categorize BMI according to WHO standards
    def categorize_bmi(bmi):
        if pd.isna(bmi):
            return 'Unknown'
        elif bmi < 18.5:
            return 'BMI < 18.5 kg/m2'
        elif bmi < 25.0:
            return 'BMI 18.5-25 kg/m2'
        elif bmi < 30.0:
            return 'BMI 25-30 kg/m2'
        else:
            return 'BMI > 30 kg/m2'

    df_result['bmi_category'] = df_result['BMI'].apply(categorize_bmi)

    # Order BMI categories correctly
    df_result["bmi_category"] = pd.Categorical(
        df_result["bmi_category"], categories=BMI_LIST, ordered=True
    )

    return df_result

def calculate_all_subgroup_metrics(group_data, model, task, dataset):
    """Calculate metrics for all subgroups (sex, age, BMI) in a single function."""
    all_subgroups = []
    
    # 1. Sex subgroups
    sex_groups = {
        'Male': group_data[group_data['sex'] == 1],
        'Female': group_data[group_data['sex'] == 0]
    }
    
    for sex, sex_group in sex_groups.items():
        if len(sex_group) > 0 and len(sex_group['label'].unique()) >= 2:
            metrics = calculate_subgroup_metrics_helper(sex_group, model, task, dataset, 'Sex', sex)
            if metrics:
                all_subgroups.append(metrics)
    
    # 2. Age subgroups
    age_groups = {
        '18-65 Years': group_data[(group_data['age'] >= 18) & (group_data['age'] < 65)],
        '65-75 Years': group_data[(group_data['age'] >= 65) & (group_data['age'] < 75)],
        '75-91 Years': group_data[(group_data['age'] >= 75) & (group_data['age'] <= 91)]
    }
    
    for age_range, age_group in age_groups.items():
        if len(age_group) > 0 and len(age_group['label'].unique()) >= 2:
            metrics = calculate_subgroup_metrics_helper(age_group, model, task, dataset, 'Age', age_range)
            if metrics:
                all_subgroups.append(metrics)
    
    # 3. BMI subgroups
    bmi_categories = group_data['bmi_category'].unique()
    bmi_categories = [cat for cat in BMI_LIST if cat != 'Unknown']
    
    for bmi_category in bmi_categories:
        bmi_group = group_data[group_data['bmi_category'] == bmi_category]
        if len(bmi_group) > 0 and len(bmi_group['label'].unique()) >= 2:
            metrics = calculate_subgroup_metrics_helper(bmi_group, model, task, dataset, 'BMI', bmi_category)
            if metrics:
                all_subgroups.append(metrics)
    
    return all_subgroups


def calculate_subgroup_metrics_helper(subgroup_data, model, task, dataset, subgroup_type, subgroup_value):
    """Helper function to calculate metrics for a single subgroup."""
    try:
        predictions = subgroup_data['probabilities'].values  # Use probabilities column
        labels = subgroup_data['label'].values
        
        # Calculate basic metrics
        metrics = calculate_all_metrics(labels, predictions)
        
        # Add metadata and counts
        metrics.update({
            'model_name': model,
            'task': task,
            'dataset': dataset,
            'subgroup_type': subgroup_type,
            'subgroup_value': subgroup_value,
            'sample_count': len(predictions),
            'positive_count': int(np.sum(labels)),
            'negative_count': int(len(labels) - np.sum(labels)),
            'positive_rate': np.mean(labels)
        })
        
        return metrics
        
    except Exception as e:
        return None


def display_metrics_summary(df_metrics):
    """Display a simplified summary of calculated metrics."""
    print(f"📈 Summary:")
    print(f"   Total metric combinations: {len(df_metrics)}")

    print("📊 SUBGROUP DEFINITIONS:")
    for subgroup_type in sorted(df_metrics['subgroup_type'].unique()):
        values = sorted(df_metrics[df_metrics['subgroup_type'] == subgroup_type]['subgroup_value'].unique())
        print(f"   {subgroup_type}: {values}")

    # Check for subgroups with no positive labels
    zero_positive = df_metrics[df_metrics['positive_count'] == 0]
    if not zero_positive.empty:
        print(f"\n⚠️  SUBGROUPS WITH NO POSITIVE LABELS ({len(zero_positive)} found):")
        for _, row in zero_positive.iterrows():
            print(f"   {row['model_name']} - {row['task']} - {row['dataset']} - {row['subgroup_type']}:{row['subgroup_value']}")
    else:
        print(f"\n✅ All subgroups have positive labels")

    print(f"\n📋 FIRST 10 ROWS OF df_metrics:")

# Calculate metrics (overall + subgroups)
df_metrics = calculate_subgroup_metrics(df_metadata)

# Add model_type to metrics DataFrame (copied from metadata)
if 'model_name' in df_metrics.columns and 'model_name' in df_metadata.columns:
    model_type_map = dict(zip(df_metadata['model_name'], df_metadata['model_type']))
    df_metrics['model_type'] = df_metrics['model_name'].map(model_type_map)

# Add model_prompting_id column to metrics DataFrame
if 'model_name' in df_metrics.columns and 'model_type' in df_metrics.columns:
    def get_model_prompting_id_metrics(row):
        if row.get('model_type') == 'LLM' and row.get('prompting_id', ''):
            return f"{row['model_name']}, {row['prompting_id']}"
        else:
            return row['model_name']
    df_metrics['model_prompting_id'] = df_metrics.apply(get_model_prompting_id_metrics, axis=1)

# Reorder columns to have model_type first
cols = list(df_metrics.columns)
if "model_type" in cols:
    cols = ["model_type"] + [c for c in cols if c != "model_type"]
    df_metrics = df_metrics[cols]

# Rename columns
df_metrics = df_metrics.rename(
    columns={
        **METRICS_MAPPING,
        'model_name': 'Model',
        'task': 'Task',
        'dataset': 'Dataset'
    }
)

# After renaming columns
metric_cols = [
    "AUROC", "AUPRC", "Normalized AUPRC", "Min(+P, Se)", "Sensitivity (Recall)",
    "Specificity", "Precision", "F1 Score", "Accuracy", "Balanced Accuracy", "MCC", "Cohen's Kappa"
]
# Place these after your key columns (e.g., Model, Task, Dataset, etc.)
key_cols = ["model_type", "Model", "Task", "Dataset", "subgroup_type", "subgroup_value"]
df_metrics = df_metrics[key_cols + metric_cols + [c for c in df_metrics.columns if c not in key_cols + metric_cols]]

# Save df_metrics as pulse_metrics.csv in ./notebook_output/data
os.makedirs("./notebook_output/postprocessed_data", exist_ok=True)
df_metrics.to_csv("./notebook_output/postprocessed_data/pulse_metrics.csv", index=False)
print("Saved metrics to ./notebook_output/postprocessed_data/pulse_metrics.csv")

# Display summary
display_metrics_summary(df_metrics)
display(df_metrics.head(10))

  for group_keys, group in df_metadata.groupby(group_cols):


📈 Summary:
   Total metric combinations: 5704
Saved metrics to ./notebook_output/data/pulse_metrics.csv
📈 Summary:
   Total metric combinations: 5704
📊 SUBGROUP DEFINITIONS:
   Age: ['18-65 Years', '65-75 Years', '75-91 Years']
   BMI: ['BMI 18.5-25 kg/m2', 'BMI 25-30 kg/m2', 'BMI < 18.5 kg/m2', 'BMI > 30 kg/m2']
   Overall: ['All']
   Sex: ['Female', 'Male']

✅ All subgroups have positive labels

📋 FIRST 10 ROWS OF df_metrics:


Unnamed: 0,model_type,Model,Task,Dataset,subgroup_type,subgroup_value,AUROC,AUPRC,Normalized AUPRC,"Min(+P, Se)",...,Balanced Accuracy,MCC,Cohen's Kappa,is_agent,prompting_id,sample_count,positive_count,negative_count,positive_rate,model_prompting_id
0,convML,RandomForest,Mortality,HiRID,Overall,All,0.902,0.614,5.58,0.5,...,0.591,0.406,0.283,False,,100,11,89,0.11,RandomForest
1,convML,RandomForest,Mortality,HiRID,Sex,Male,0.888,0.7,5.162,0.545,...,0.625,0.473,0.366,False,,59,8,51,0.135593,RandomForest
2,convML,RandomForest,Mortality,HiRID,Sex,Female,0.917,0.285,3.891,0.375,...,0.5,0.0,0.0,False,,41,3,38,0.073171,RandomForest
3,convML,RandomForest,Mortality,HiRID,Age,18-65 Years,0.969,0.817,9.528,0.667,...,0.667,0.56,0.478,False,,35,3,32,0.085714,RandomForest
4,convML,RandomForest,Mortality,HiRID,Age,65-75 Years,0.889,0.559,5.593,0.5,...,0.5,0.0,0.0,False,,30,3,27,0.1,RandomForest
5,convML,RandomForest,Mortality,HiRID,Age,75-91 Years,0.863,0.518,3.628,0.429,...,0.6,0.42,0.3,False,,35,5,30,0.142857,RandomForest
6,convML,RandomForest,Mortality,HiRID,BMI,BMI 18.5-25 kg/m2,0.909,0.682,6.311,0.5,...,0.625,0.479,0.373,False,,37,4,33,0.108108,RandomForest
7,convML,RandomForest,Mortality,HiRID,BMI,BMI 25-30 kg/m2,0.885,0.515,4.018,0.5,...,0.5,0.0,0.0,False,,39,5,34,0.128205,RandomForest
8,convML,RandomForest,Mortality,HiRID,BMI,BMI > 30 kg/m2,1.0,1.0,9.0,1.0,...,0.75,0.686,0.64,False,,18,2,16,0.111111,RandomForest
9,convML,RandomForest,Mortality,MIMIC-IV,Overall,All,0.823,0.652,5.434,0.667,...,0.5,0.0,0.0,False,,100,12,88,0.12,RandomForest


### Save Performance Metrics in CSV Tables

In [53]:
# --- Save baseline model metrics in publication-ready format ---
def save_baseline_metrics_table(df_metrics, save_path="./notebook_output/metrics_tables/baseline_metrics.csv"):
    """
    Save baseline model metrics in a publication-ready format with hierarchical rows (see screenshot).
    Only includes convML and convDL models, and only overall metrics (not subgroups).
    """
    # Ensure output directory exists
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Filter for convML and convDL, and only overall metrics
    df = df_metrics[
        (df_metrics["model_type"].isin(["convML", "convDL"])) &
        (df_metrics["subgroup_type"] == "Overall")
    ].copy()

    # Define the order for model types, models, tasks, datasets
    model_type_order = ["convML", "convDL"]
    convml_model_order = ["RandomForest", "XGBoost"]
    convdl_model_order = ["CNN", "InceptionTime", "LSTM", "GRU"]
    model_order = convml_model_order + convdl_model_order
    task_order = ["Mortality", "AKI", "Sepsis"]
    dataset_order = ["HiRID", "MIMIC-IV", "eICU"]

    # Prepare columns for output
    metric_cols = [
        "AUROC", "AUPRC", "Normalized AUPRC", "Min(+P, Se)", "Sensitivity (Recall)", "Specificity",
        "Precision", "F1 Score", "Accuracy", "Balanced Accuracy", "MCC", "Cohen's Kappa"
    ]
    # Ensure all metric columns exist
    for col in metric_cols:
        if col not in df.columns:
            df[col] = ""

    # Build the hierarchical table
    rows = []
    for model_type in model_type_order:
        models = convml_model_order if model_type == "convML" else convdl_model_order
        for model in models:
            for task in task_order:
                for dataset in dataset_order:
                    # Find the row in df
                    row = df[
                        (df["model_type"] == model_type) &
                        (df["Model"] == model) &
                        (df["Task"] == task) &
                        (df["Dataset"] == dataset)
                    ]
                    # Prepare row values
                    row_dict = {
                        "Model Type": model_type,
                        "Model": model,
                        "Task": task,
                        "Dataset": dataset
                    }
                    if not row.empty:
                        for col in metric_cols:
                            row_dict[col] = row.iloc[0][col]
                    else:
                        for col in metric_cols:
                            row_dict[col] = ""
                    rows.append(row_dict)

    # Convert to DataFrame
    df_out = pd.DataFrame(rows)

    # Replace repeated values with empty string for hierarchical effect
    for col in ["Model Type", "Model", "Task"]:
        last_val = None
        for i in range(len(df_out)):
            if df_out.loc[i, col] == last_val:
                df_out.loc[i, col] = ""
            else:
                last_val = df_out.loc[i, col]

    # Save to CSV
    df_out.to_csv(save_path, index=False)
    print(f"Saved baseline metrics table to {save_path}")

# Usage:
save_baseline_metrics_table(
    df_metrics,
    save_path="./notebook_output/metrics_tables/baseline_all_metrics.csv",
)

Saved baseline metrics table to ./notebook_output/metrics_tables/baseline_all_metrics.csv


In [55]:
def save_llm_metrics_table(
    df_metrics,
    is_agent=None,
    save_path=None,
):
    """
    Save LLM model metrics in publication-ready format with hierarchical rows.
    If is_agent is None, save all LLM metrics (both agent and non-agent) into one file.
    """
    # Ensure output directory exists
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Filter for LLMs, only overall metrics
    df = df_metrics[
        (df_metrics["model_type"] == "LLM")
        & (df_metrics["subgroup_type"] == "Overall")
    ].copy()

    # Filter by is_agent if not None
    if is_agent is not None:
        df = df[df["is_agent"] == is_agent]

    # Define the order for models, prompting methods, tasks, datasets
    if is_agent is None:
        prompting_order = list(PROMPTING_ID_DISPLAY_MAPPING.values())
    elif is_agent:
        prompting_order = [
            v for k, v in PROMPTING_ID_DISPLAY_MAPPING.items() if "agent" in k.lower()
        ]
    else:
        prompting_order = [
            v for k, v in PROMPTING_ID_DISPLAY_MAPPING.items() if "agent" not in k.lower()
        ]

    model_order = list(LLM_MODELS.values())
    task_order = ["Mortality", "AKI", "Sepsis"]
    dataset_order = ["HiRID", "MIMIC-IV", "eICU"]

    # Prepare columns for output
    metric_cols = [
        "AUROC", "AUPRC", "Normalized AUPRC", "Min(+P, Se)", "Sensitivity (Recall)",
        "Specificity", "Precision", "F1 Score", "Accuracy", "Balanced Accuracy", "MCC", "Cohen's Kappa"
    ]

    # Ensure all metric columns exist
    for col in metric_cols:
        if col not in df.columns:
            df[col] = ""

    # Set correct column name for second column
    if is_agent is None:
        second_col = "Prompting/Agent"
    else:
        second_col = "Agent" if is_agent else "Prompting Method"

    rows = []
    for model in model_order:
        for prompting in prompting_order:
            for task in task_order:
                for dataset in dataset_order:
                    row = df[
                        (df["Model"] == model)
                        & (df["prompting_id"] == prompting)
                        & (df["Task"] == task)
                        & (df["Dataset"] == dataset)
                    ]
                    row_dict = {
                        "Model": model,
                        second_col: prompting,
                        "Task": task,
                        "Dataset": dataset,
                    }
                    if not row.empty:
                        for col in metric_cols:
                            row_dict[col] = row.iloc[0][col]
                    else:
                        for col in metric_cols:
                            row_dict[col] = ""
                    rows.append(row_dict)

    df_out = pd.DataFrame(rows)

    # Replace repeated values with empty string for hierarchical effect
    for col in ["Model", second_col, "Task"]:
        last_val = None
        for i in range(len(df_out)):
            if df_out.loc[i, col] == last_val:
                df_out.loc[i, col] = ""
            else:
                last_val = df_out.loc[i, col]

    df_out.to_csv(save_path, index=False)
    print(f"Saved LLM metrics table (is_agent={is_agent}) to {save_path}")

# Usage:
# Save all LLM metrics (both agent and non-agent) into one file
save_llm_metrics_table(
    df_metrics,
    is_agent=None,
    save_path="./notebook_output/metrics_tables/llm_all_metrics.csv",
)

Saved LLM metrics table (is_agent=None) to ./notebook_output/metrics_tables/llm_all_metrics.csv


In [56]:
def save_paper_ready_metrics_tables(df_metrics, output_dir="./notebook_output/metrics_tables"):
    """
    Save one CSV per task (mortality, aki, sepsis) in the publication-ready format.
    Only AUROC and AUPRC are included. Structure matches the provided screenshot.
    """
    import os

    # --- Define all groupings and orders ---
    tasks = ["Mortality", "AKI", "Sepsis"]
    datasets = ["HiRID", "MIMIC-IV", "eICU"]
    metric_cols = ["AUROC", "AUPRC"]

    # Baseline models
    convml_models = ["RandomForest", "XGBoost", "LightGBM"]
    convdl_models = ["CNN", "InceptionTime", "LSTM", "GRU"]

    # LLMs
    proprietary_llms = ["OpenAI-o3", "Claude-Sonnet-4", "Grok-4", "Gemini-2.5-Pro", "Gemini-2.5-Flash"]
    open_llms = ["Llama-3.1-8B-Instruct", "Deepseek-R1-Distill-Llama-8B", "Mistral-7B-Instruct-v0.3", "Gemma-3-4B-it", "MedGemma-4B-it"]

    # Prompting methods (order and display names)
    prompting_methods = [
        ("Aggregation (Sarvari et al., 2024)", "Aggregation"),
        ("Zero-Shot (Zhu et al., 2024b)", "Zero-Shot"),
        ("One-Shot (Zhu et al., 2024b)", "One-Shot"),
        ("Few-Shot (3) (Liu et al., 2023)", "Few-Shot (3)"),
        ("CoT (Zhu et al., 2024a)", "CoT"),
    ]
    agentic_methods = [
        ("SumAgent", "SumAgent"),
        ("ColAgent", "ColAgent"),
        ("ClinFlowAgent", "ClinFlowAgent"),
        ("HybReAgent", "HybReAgent"),
    ]

    # Helper for LLM model category
    def get_llm_model_category(model):
        return "Proprietary LLM" if model in proprietary_llms else "Open-Source LLM"

    # Helper for Method Category
    def get_method_category(method):
        return "Standard Prompting" if method in [x[0] for x in prompting_methods] else "Agentic"

    # --- Build and save table for each task ---
    os.makedirs(output_dir, exist_ok=True)
    for task in tasks:
        rows = []
        # Baseline rows
        for method_cat, method, model_cat, model in [
            ("Baseline", "", "convML", m) for m in convml_models
        ] + [
            ("Baseline", "", "convDL", m) for m in convdl_models
        ]:
            row = {
                "Method Cat": method_cat,
                "Method": method,
                "Model Category": model_cat,
                "Model": model,
            }
            for ds in datasets:
                for metric in metric_cols:
                    val = df_metrics.loc[
                        (df_metrics["Model"] == model)
                        & (df_metrics["Task"] == task)
                        & (df_metrics["Dataset"] == ds)
                        & (df_metrics["subgroup_type"] == "Overall"),
                        metric,
                    ]
                    row[f"{ds} {metric}"] = f"{val.iloc[0]:.3f}" if not val.empty and pd.notnull(val.iloc[0]) else ""
            rows.append(row)

        # LLM rows (Standard Prompting)
        for method_disp, method_key in prompting_methods:
            for model_cat, model_list in [("Proprietary LLM", proprietary_llms), ("Open-Source LLM", open_llms)]:
                for model in model_list:
                    row = {
                        "Method Cat": "Standard Prompting",
                        "Method": method_disp,
                        "Model Category": model_cat,
                        "Model": model,
                    }
                    for ds in datasets:
                        for metric in metric_cols:
                            val = df_metrics.loc[
                                (df_metrics["Model"] == model)
                                & (df_metrics["Task"] == task)
                                & (df_metrics["Dataset"] == ds)
                                & (df_metrics["prompting_id"] == method_key)
                                & (df_metrics["subgroup_type"] == "Overall"),
                                metric,
                            ]
                            row[f"{ds} {metric}"] = f"{val.iloc[0]:.3f}" if not val.empty and pd.notnull(val.iloc[0]) else ""
                    rows.append(row)

        # LLM rows (Agentic)
        for method_disp, method_key in agentic_methods:
            for model_cat, model_list in [("Proprietary LLM", proprietary_llms), ("Open-Source LLM", open_llms)]:
                for model in model_list:
                    row = {
                        "Method Cat": "Agentic",
                        "Method": method_disp,
                        "Model Category": model_cat,
                        "Model": model,
                    }
                    for ds in datasets:
                        for metric in metric_cols:
                            val = df_metrics.loc[
                                (df_metrics["Model"] == model)
                                & (df_metrics["Task"] == task)
                                & (df_metrics["Dataset"] == ds)
                                & (df_metrics["prompting_id"] == method_key)
                                & (df_metrics["subgroup_type"] == "Overall"),
                                metric,
                            ]
                            row[f"{ds} {metric}"] = f"{val.iloc[0]:.3f}" if not val.empty and pd.notnull(val.iloc[0]) else ""
                    rows.append(row)

        # --- Build DataFrame and save ---
        col_order = [
            "Method Cat", "Method", "Model Category", "Model",
            "HiRID AUROC", "HiRID AUPRC",
            "MIMIC-IV AUROC", "MIMIC-IV AUPRC",
            "eICU AUROC", "eICU AUPRC",
        ]
        df_out = pd.DataFrame(rows)
        # Ensure all columns exist
        for col in col_order:
            if col not in df_out.columns:
                df_out[col] = ""
        df_out = df_out[col_order]

        # Hierarchical effect: replace repeated values with ""
        for col in ["Method Cat", "Method", "Model Category"]:
            last_val = None
            for i in range(len(df_out)):
                if df_out.loc[i, col] == last_val:
                    df_out.loc[i, col] = ""
                else:
                    last_val = df_out.loc[i, col]

        save_path = os.path.join(output_dir, f"paper_auroc_auprc_{task.lower()}.csv")
        df_out.to_csv(save_path, index=False)
        print(f"Saved paper-ready table for {task}: {save_path}")

# Usage:
save_paper_ready_metrics_tables(df_metrics)

Saved paper-ready table for Mortality: ./notebook_output/metrics_tables/paper_auroc_auprc_mortality.csv
Saved paper-ready table for AKI: ./notebook_output/metrics_tables/paper_auroc_auprc_aki.csv
Saved paper-ready table for Sepsis: ./notebook_output/metrics_tables/paper_auroc_auprc_sepsis.csv


### Operational Metrics

In [48]:
def calculate_llm_operational_metrics(df_metadata):
    """
    Calculate average and total operational metrics for LLM models at all aggregation levels.
    For higher aggregation levels, average the metrics from the most fine-grained level.
    Ensures consistency between avg step and avg sample metrics.
    """

    llm_mask = df_metadata["model_type"] == "LLM"
    df_metadata_llm = df_metadata[llm_mask].copy()
    if df_metadata_llm.empty:
        print("No LLM models found in metadata. Skipping operational metrics.")
        return pd.DataFrame()

    # Remove unused categories
    for col in [
        "model_type",
        "model_name",
        "prompting_id",
        "task",
        "dataset",
        "is_agent",
    ]:
        if col in df_metadata_llm.columns and isinstance(
            df_metadata_llm[col].dtype, pd.CategoricalDtype
        ):
            df_metadata_llm[col] = df_metadata_llm[col].cat.remove_unused_categories()

    # Ensure columns exist
    for col in [
        "Tokenization Time",
        "Inference Time",
        "Input Tokens",
        "Output Tokens",
        "sample_index",
        "Step Name",
        "Step Number",
    ]:
        if col not in df_metadata_llm.columns:
            df_metadata_llm[col] = np.nan

    all_cols = [
        "model_type",
        "model_name",
        "is_agent",
        "prompting_id",
        "task",
        "dataset",
    ]
    fine_grained_level = [
        "model_type",
        "model_name",
        "is_agent",
        "prompting_id",
        "task",
        "dataset",
    ]
    agg_levels = [
        (["model_type"], "model_type"),
        (["model_type", "model_name"], "model_type+model_name"),
        (["model_type", "model_name", "is_agent"], "model_type+model_name+is_agent"),
        (
            ["model_type", "model_name", "is_agent", "prompting_id"],
            "model_type+model_name+is_agent+prompting_id",
        ),
        (
            ["model_type", "model_name", "is_agent", "prompting_id", "task"],
            "model_type+model_name+is_agent+prompting_id+task",
        ),
        (
            ["model_type", "model_name", "is_agent", "prompting_id", "task", "dataset"],
            "model_type+model_name+is_agent+prompting_id+task+dataset",
        ),
    ]

    # --- Step 1: Calculate fine-grained metrics ---
    fine_metrics = []
    groupby_obj = df_metadata_llm.groupby(fine_grained_level, observed=True)
    for keys, group in groupby_obj:
        is_agent = group["is_agent"].iloc[0] if "is_agent" in group else False
        if is_agent:
            group_filtered = group[~(group["Step Name"] == "SAMPLE_METADATA")]
        else:
            group_filtered = group

        sample_count = len(group_filtered)
        if "sample_index" in group_filtered.columns and len(group_filtered) > 0:
            n_unique_sample_index = group_filtered["sample_index"].nunique()
            avg_num_steps = (
                sample_count / n_unique_sample_index
                if n_unique_sample_index > 0
                else np.nan
            )

            sample_groups = group_filtered.groupby("sample_index", observed=True)
            sample_tokenization = sample_groups["Tokenization Time"].sum()
            sample_inference = sample_groups["Inference Time"].sum()
            sample_input_tokens = sample_groups["Input Tokens"].sum()
            sample_output_tokens = sample_groups["Output Tokens"].sum()

            avg_sample_tokenization_time = (
                sample_tokenization.mean() if len(sample_tokenization) > 0 else np.nan
            )
            avg_sample_inference_time = (
                sample_inference.mean() if len(sample_inference) > 0 else np.nan
            )
            avg_sample_input_tokens = (
                sample_input_tokens.mean() if len(sample_input_tokens) > 0 else np.nan
            )
            avg_sample_output_tokens = (
                sample_output_tokens.mean() if len(sample_output_tokens) > 0 else np.nan
            )
        else:
            avg_num_steps = np.nan if is_agent else 1
            avg_sample_tokenization_time = (
                group_filtered["Tokenization Time"].mean()
                if len(group_filtered) > 0
                else np.nan
            )
            avg_sample_inference_time = (
                group_filtered["Inference Time"].mean()
                if len(group_filtered) > 0
                else np.nan
            )
            avg_sample_input_tokens = (
                group_filtered["Input Tokens"].mean()
                if len(group_filtered) > 0
                else np.nan
            )
            avg_sample_output_tokens = (
                group_filtered["Output Tokens"].mean()
                if len(group_filtered) > 0
                else np.nan
            )

        # Adjust sample_count for is_agent==True
        adj_sample_count = sample_count
        if is_agent and avg_num_steps > 0:
            adj_sample_count = sample_count / avg_num_steps
            if not np.isclose(adj_sample_count, round(adj_sample_count)):
                print(
                    f"[WARNING] Adjusted sample_count is not integer for group {keys}: {adj_sample_count}"
                )
            adj_sample_count = int(round(adj_sample_count))

        # Step metrics (per row)
        avg_step_tokenization_time = (
            group_filtered["Tokenization Time"].mean()
            if len(group_filtered) > 0
            else np.nan
        )
        avg_step_inference_time = (
            group_filtered["Inference Time"].mean()
            if len(group_filtered) > 0
            else np.nan
        )
        avg_step_input_tokens = (
            group_filtered["Input Tokens"].mean() if len(group_filtered) > 0 else np.nan
        )
        avg_step_output_tokens = (
            group_filtered["Output Tokens"].mean()
            if len(group_filtered) > 0
            else np.nan
        )

        # Total metrics
        total_tokenization_time = group_filtered["Tokenization Time"].sum()
        total_inference_time = group_filtered["Inference Time"].sum()
        total_input_tokens = group_filtered["Input Tokens"].sum()
        total_output_tokens = group_filtered["Output Tokens"].sum()

        fine_metrics.append(
            {
                **dict(
                    zip(fine_grained_level, keys if isinstance(keys, tuple) else [keys])
                ),
                "sample_count": adj_sample_count,
                "avg_num_steps": avg_num_steps,
                "avg_step_tokenization_time": avg_step_tokenization_time,
                "avg_step_inference_time": avg_step_inference_time,
                "avg_step_input_tokens": avg_step_input_tokens,
                "avg_step_output_tokens": avg_step_output_tokens,
                "avg_sample_tokenization_time": avg_sample_tokenization_time,
                "avg_sample_inference_time": avg_sample_inference_time,
                "avg_sample_input_tokens": avg_sample_input_tokens,
                "avg_sample_output_tokens": avg_sample_output_tokens,
                "total_tokenization_time": total_tokenization_time,
                "total_inference_time": total_inference_time,
                "total_input_tokens": total_input_tokens,
                "total_output_tokens": total_output_tokens,
            }
        )
    fine_df = pd.DataFrame(fine_metrics)

    # --- Step 2: Aggregate for higher levels ---
    all_results = []
    for level, level_name in agg_levels:
        # For each group at this level, average the fine-grained metrics
        grouped = fine_df.groupby(level, observed=True)
        agg_rows = []
        for keys, group in grouped:
            # For sample_count, sum (not mean)
            sample_count = group["sample_count"].sum()
            # For all other metrics, take mean
            avg_num_steps = group["avg_num_steps"].mean()
            avg_step_tokenization_time = group["avg_step_tokenization_time"].mean()
            avg_step_inference_time = group["avg_step_inference_time"].mean()
            avg_step_input_tokens = group["avg_step_input_tokens"].mean()
            avg_step_output_tokens = group["avg_step_output_tokens"].mean()
            avg_sample_tokenization_time = group["avg_sample_tokenization_time"].mean()
            avg_sample_inference_time = group["avg_sample_inference_time"].mean()
            avg_sample_input_tokens = group["avg_sample_input_tokens"].mean()
            avg_sample_output_tokens = group["avg_sample_output_tokens"].mean()
            total_tokenization_time = group["total_tokenization_time"].sum()
            total_inference_time = group["total_inference_time"].sum()
            total_input_tokens = group["total_input_tokens"].sum()
            total_output_tokens = group["total_output_tokens"].sum()

            row = {
                **dict(zip(level, keys if isinstance(keys, tuple) else [keys])),
                "aggregation_level": level_name,
                "sample_count": sample_count,
                "avg_num_steps": avg_num_steps,
                "avg_step_tokenization_time": avg_step_tokenization_time,
                "avg_step_inference_time": avg_step_inference_time,
                "avg_step_input_tokens": avg_step_input_tokens,
                "avg_step_output_tokens": avg_step_output_tokens,
                "avg_sample_tokenization_time": avg_sample_tokenization_time,
                "avg_sample_inference_time": avg_sample_inference_time,
                "avg_sample_input_tokens": avg_sample_input_tokens,
                "avg_sample_output_tokens": avg_sample_output_tokens,
                "total_tokenization_time": total_tokenization_time,
                "total_inference_time": total_inference_time,
                "total_input_tokens": total_input_tokens,
                "total_output_tokens": total_output_tokens,
            }
            # Fill missing columns with 'all'
            for col in all_cols:
                if col not in row:
                    row[col] = "all"
            agg_rows.append(row)
        agg_df = pd.DataFrame(agg_rows)
        # Reorder columns
        agg_df = agg_df[
            all_cols
            + [
                "aggregation_level",
                "sample_count",
                "avg_num_steps",
                "avg_step_tokenization_time",
                "avg_step_inference_time",
                "avg_step_input_tokens",
                "avg_step_output_tokens",
                "avg_sample_tokenization_time",
                "avg_sample_inference_time",
                "avg_sample_input_tokens",
                "avg_sample_output_tokens",
                "total_tokenization_time",
                "total_inference_time",
                "total_input_tokens",
                "total_output_tokens",
            ]
        ]
        all_results.append(agg_df)

    combined = pd.concat(all_results, ignore_index=True)
    float_cols = combined.select_dtypes(include=["float"]).columns
    combined[float_cols] = combined[float_cols].round(4)
    for col in ["total_input_tokens", "total_output_tokens"]:
        if col in combined.columns:
            combined[col] = combined[col].fillna(0).astype(int)
    return combined


# Calculate operational metrics
df_metrics_operational = calculate_llm_operational_metrics(df_metadata)

# Add model_prompting_id column to operational metrics DataFrame
if 'model_name' in df_metrics_operational.columns and 'model_type' in df_metrics_operational.columns:
    def get_model_prompting_id_operational(row):
        if row.get('model_type') == 'LLM' and row.get('prompting_id', ''):
            return f"{row['model_name']}, {row['prompting_id']}"
        else:
            return row['model_name']
    df_metrics_operational['model_prompting_id'] = df_metrics_operational.apply(get_model_prompting_id_operational, axis=1)

# Save df_metrics_operational
os.makedirs("./notebook_output/postprocessed_data", exist_ok=True)
save_path_operational = "./notebook_output/postprocessed_data/pulse_metrics_operational.csv"
df_metrics_operational.to_csv(save_path_operational, index=False)
print(f"Saved all-level operational metrics to {save_path_operational}")
display(df_metrics_operational.head())

Saved all-level operational metrics to ./notebook_output/data/pulse_metrics_operational.csv


Unnamed: 0,model_type,model_name,is_agent,prompting_id,task,dataset,aggregation_level,sample_count,avg_num_steps,avg_step_tokenization_time,...,avg_step_output_tokens,avg_sample_tokenization_time,avg_sample_inference_time,avg_sample_input_tokens,avg_sample_output_tokens,total_tokenization_time,total_inference_time,total_input_tokens,total_output_tokens,model_prompting_id
0,LLM,all,all,all,all,all,model_type,383638,2.3441,0.0041,...,308.7802,0.0059,27.3315,6103.3365,763.5133,1859.4811,10052990.0,1819684243,286565447,"all, all"
1,LLM,Claude-Sonnet-4,all,all,all,all,model_type+model_name,12378,2.1314,0.0,...,869.9855,0.0,35.4292,4174.4118,1672.5284,0.0,437232.6,43922761,20892503,"Claude-Sonnet-4, all"
2,LLM,Deepseek-R1-Distill-Llama-8B,all,all,all,all,model_type+model_name,55701,2.4917,0.0067,...,930.4386,0.007,68.3631,5808.6491,2264.5434,300.5096,3580474.0,260729707,123696223,"Deepseek-R1-Distill-Llama-8B, all"
3,LLM,Gemini-2.5-Flash,all,all,all,all,model_type+model_name,55701,2.3231,0.0,...,174.7053,0.0,5.0338,6563.2477,430.2607,0.0,268448.4,275650601,23091933,"Gemini-2.5-Flash, all"
4,LLM,Gemini-2.5-Pro,all,all,all,all,model_type+model_name,12378,2.1811,0.0,...,139.9383,0.0,44.4366,4510.8939,322.6688,0.0,552209.6,45122663,4144375,"Gemini-2.5-Pro, all"
