# Agent-Specific Evaluation Notebook of the Summary Agent

In [1]:
import os
import pandas as pd
import numpy as np
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(project_root)

## Setup and Configuration

In [None]:
# Set paths to pre-processed CSVs
metadata_path = os.path.join("./notebook_output/postprocessed_data", "pulse_metadata.csv")
metrics_path = os.path.join("./notebook_output/postprocessed_data", "pulse_metrics.csv")
metrics_op_path = os.path.join("./notebook_output/postprocessed_data", "pulse_metrics_operational.csv")

# Load metadata and metrics
df_metadata = pd.read_csv(metadata_path)
print(f"Loaded metadata: {df_metadata.shape}")

# Load performance metrics with and without subgroups
df_metrics_subgroup = pd.read_csv(metrics_path)
print(f"Loaded metrics with subgroups: {df_metrics_subgroup.shape}")

df_metrics = df_metrics_subgroup[
    df_metrics_subgroup["subgroup_type"] == "Overall"
].copy()
df_metrics = df_metrics.drop(columns=["subgroup_type", "subgroup_value"])
print(f"Loaded metrics without subgroups: {df_metrics.shape}")

# Load operational metrics
df_metrics_op = pd.read_csv(metrics_op_path)
print(f"Loaded operational metrics: {df_metrics_op.shape}")


# Split all loaded dataframes into _agents and _benchmark
def split_agents_benchmark(df):
    # Always include convML and convDL
    base = df[(df["model_type"] == "convML") | (df["model_type"] == "convDL")]
    # _agents: add is_agent == True
    agents = pd.concat(
        [base, df[df["is_agent"] == True]], ignore_index=True
    ).drop_duplicates()
    # _benchmark: add is_agent == False
    benchmark = pd.concat(
        [base, df[df["is_agent"] == False]], ignore_index=True
    ).drop_duplicates()
    return agents, benchmark


# Split all loaded dataframes into _agents and _benchmark
def split_agents_benchmark_op(df):
    # Always include convML and convDL
    base = df[(df["model_type"] == "convML") | (df["model_type"] == "convDL")]
    # _agents: include convML/convDL plus all rows where is_agent is True or 'all' (as string)
    agents = pd.concat(
        [base, df[df["is_agent"].isin(["True", "all"])]], ignore_index=True
    ).drop_duplicates()
    # _benchmark: include convML/convDL plus all rows where is_agent is False or 'all' (as string)
    benchmark = pd.concat(
        [base, df[df["is_agent"].isin(["False", "all"])]], ignore_index=True
    ).drop_duplicates()
    return agents, benchmark


df_metadata_agents, df_metadata_benchmark = split_agents_benchmark(df_metadata)
df_metrics_subgroup_agents, df_metrics_subgroup_benchmark = split_agents_benchmark(
    df_metrics_subgroup
)
df_metrics_agents, df_metrics_benchmark = split_agents_benchmark(df_metrics)
df_metrics_op_agents, df_metrics_op_benchmark = split_agents_benchmark_op(df_metrics_op)
# Drop specified columns from df_metrics_op_benchmark
df_metrics_op_benchmark = df_metrics_op_benchmark.drop(
    columns=[
        "avg_step_tokenization_time",
        "avg_step_inference_time",
        "avg_step_input_tokens",
        "avg_step_output_tokens",
    ],
    errors="ignore",
)

print(
    f"df_metadata_agents: {df_metadata_agents.shape}, df_metadata_benchmark: {df_metadata_benchmark.shape}"
)
print(
    f"df_metrics_subgroup_agents: {df_metrics_subgroup_agents.shape}, df_metrics_subgroup_benchmark: {df_metrics_subgroup_benchmark.shape}"
)
print(
    f"df_metrics_agents: {df_metrics_agents.shape}, df_metrics_benchmark: {df_metrics_benchmark.shape}"
)
print(
    f"df_metrics_op_agents: {df_metrics_op_agents.shape}, df_metrics_op_benchmark: {df_metrics_op_benchmark.shape}"
)

Loaded metadata: (12298, 30)
Loaded metrics with subgroups: (160, 25)
Loaded metrics without subgroups: (18, 23)
Loaded operational metrics: (16, 22)
df_metadata_agents: (6189, 30), df_metadata_benchmark: (12298, 30)
df_metrics_subgroup_agents: (84, 25), df_metrics_subgroup_benchmark: (160, 25)
df_metrics_agents: (9, 23), df_metrics_benchmark: (18, 23)
df_metrics_op_agents: (2, 22), df_metrics_op_benchmark: (16, 18)


In [None]:
# Global output directory for all visualizations
OUTPUT_BASE_DIR = os.path.join(".", "notebook_output", "pulse_agents_sumagent")

TASK_MAPPING = {
    "mortality": ("Mortality", {"case": "#D14E70", "control": "#F0B0C0"}),  # Raspberry
    "aki": ("AKI", {"case": "#3A78B5", "control": "#A0D0F0"}),  # Blue
    "sepsis": ("Sepsis", {"case": "#F9C27B", "control": "#FFE6A7"}),  # Amber/Gold
}

DATASET_MAPPING = {
    "hirid": ("HiRID", "#793FBB"),  # Purple
    "miiv": ("MIMIC-IV", "#17AB6F"),  # Green
    "eicu": ("eICU", "#FC8D5F"),  # Orange
}
MODEL_MAPPING = {
    # convML: slightly darker, more prominent blues (light to dark)
    "RandomForest": "#c9daf6",  # light blue, slightly darker and bluer
    "XGBoost": "#4f8ad1",  # deeper blue, more saturated
    # convDL: slightly darker, more prominent greens (light to dark)
    "CNN": "#c6eedb",  # light mint green, slightly darker
    "InceptionTime": "#7fd6b1",  # medium mint green, slightly darker
    "LSTM": "#3ca97d",  # deeper green, more saturated
    "GRU": "#217a5a",  # even deeper green
    # LLMs: vivid, distinct, friendly (legend only)
    "OpenAI-o3": "#ff66c3",  # vivid magenta (first agent color in LLM_COLOR_FAMILIES)
    "Claude-Sonnet-4": "#1080e2",  # vivid blue (first agent color in LLM_COLOR_FAMILIES)
    "Grok-4": "#b44fd1",  # vivid purple (first agent color in LLM_COLOR_FAMILIES)
    "Gemini-2.5-Pro": "#8be600",  # vivid lime green (first agent color in LLM_COLOR_FAMILIES)
    "Gemini-2.5-Flash": "#ff7c1a",  # vivid gold (first agent color in LLM_COLOR_FAMILIES)
    "Llama-3.1-8B-Instruct": "#4cb0c3",  # vivid turquoise (first agent color in LLM_COLOR_FAMILIES)
    "Deepseek-R1-Distill-Llama-8B": "#ffa233",  # vivid orange (first agent color in LLM_COLOR_FAMILIES)
    "Mistral-7B-Instruct-v0.3": "#8f5ad8",  # vivid violet (first agent color in LLM_COLOR_FAMILIES)
    "Gemma-3-4B-it": "#00e699",  # vivid mint green (first agent color in LLM_COLOR_FAMILIES)
    "MedGemma-4B-it": "#c2338c",  # vivid magenta (first agent color in LLM_COLOR_FAMILIES)
}

PROMPTING_ID_MAPPING = {
    "Aggregation": "#1f77b4",  # Vivid Blue
    "Zero-Shot": "#ff7f0e",  # Vibrant Orange
    "One-Shot": "#9cf177",  # Modern Purple
    "Few-Shot (3)": "#fbe844",  # Bold Red
    "CoT": "#2ca02c",  # Fresh Green
    "SumAgent": "#ebb128",  # Playful Pink
    "ColAgent": "#b141e9",  # Lively Cyan
    "ClinFlowAgent": "#36eaab",  # Warm Brown
    "HybReAgent": "#f13636",  # Energetic Yellow-Green
}

CONVML_MODELS = ["RandomForest", "XGBoost"]
CONVDL_MODELS = ["CNN", "InceptionTime", "GRU", "LSTM"]
LLM_MODELS = [
    "OpenAI-o3",
    "Claude-Sonnet-4",
    "Grok-4",
    "Gemini-2.5-Pro",
    "Gemini-2.5-Flash",
    "Llama-3.1-8B-Instruct",
    "Deepseek-R1-Distill-Llama-8B",
    "Mistral-7B-Instruct-v0.3",
    "Gemma-3-4B-it", 
    "MedGemma-4B-it",
]

SUBGROUP_MAPPING = {
    "Sex": {"Male": "#1f4e79", "Female": "#c5282f"},  # Dark blue, Dark red
    "Age": {
        "18-65 Years": "#ddbf94",  # Light beige
        "65-75 Years": "#c19a6b",  # Medium beige
        "75-91 Years": "#8b7355",  # Dark beige
    },
    "BMI": {
        "BMI < 18.5 kg/m2": "#b8b8b8",  # Light silver
        "BMI 18.5-25 kg/m2": "#9d9d9d",  # Medium light silver
        "BMI 25-30 kg/m2": "#808080",  # Medium dark silver
        "BMI > 30 kg/m2": "#2c3539",  # Dark gunmetal
    },
}

METRICS_LIST = [
    "AUROC",
    "AUPRC",
    "Normalized AUPRC",
    "Min(+P, Se)",
    "Sensitivity (Recall)",
    "Specificity",
    "Precision",
    "F1 Score",
    "Accuracy",
    "Balanced Accuracy",
    "MCC",
    "Cohen's Kappa",
]

LLM_ALL_PROMPTING_IDS = sorted(
    x
    for x in df_metadata.loc[
        ~df_metadata["model_prompting_id"].isin(CONVML_MODELS + CONVDL_MODELS),
        "model_prompting_id",
    ].unique()
    if isinstance(x, str)
)

# Define agentic prompting methods
AGENTIC_PROMPTING_IDS = [
    "SumAgent",
    "ColAgent",
    "ClinFlowAgent",
    "HybReAgent",
]

# Define standard prompting methods
STANDARD_PROMPTING_IDS = [
    "Aggregation",
    "Zero-Shot",
    "One-Shot",
    "Few-Shot (3)",
    "CoT",
]

LLM_COLOR_FAMILIES = {
    "OpenAI-o3": {
        "standard_prompting": [
            "#fff0fa",  # 1 - very pale pink
            "#ffd6f2",  # 2 - light bubblegum
            "#ffb3e6",  # 3 - pastel bubblegum
            "#ff99db",  # 4 - soft bubblegum
            "#ff7fcf",  # 5 - bubblegum
        ],  # Pastel magenta to vivid pink
        "agents": [
            "#ff66c3",  # 6 - vivid bubblegum
            "#ff4db8",  # 7 - strong bubblegum
            "#ff33ac",  # 8 - deep bubblegum
            "#ff199f",  # 9 - deepest bubblegum
        ],  # Deeper magenta to lighter berry
    },
    "Claude-Sonnet-4": {
        "standard_prompting": [
            "#eaf6ff",
            "#b6dcff",
            "#7fc0ff",
            "#4da3ff",
            "#1987ff",
        ],  # Pastel blue to vivid azure
        "agents": [
            "#1080e2",
            "#0c6cc5",
            "#1152ad",
            "#114d83",
        ],  # Deeper blue to lighter navy
    },
    "Grok-4": {
        "standard_prompting": [
            "#fbe6ff",  # 1 - very light pink-lilac
            "#f3c2fa",  # 2 - pastel pink-lilac
            "#e6a8f7",  # 3 - soft lilac
            "#d98cf0",  # 4 - light magenta-lilac
            "#c96ae6",  # 5 - medium lilac
        ],  # Pastel lilac to vivid purple
        "agents": [
            "#b44fd1",  # 6 - vivid lilac
            "#9443c3",  # 7 - strong blue-violet
            "#7d2ca2",  # 8 - deep blue-violet
            "#61247C",  # 9 - darkest, but lighter than pure indigo
        ],  # Deeper purple to lighter violet
    },
    "Gemini-2.5-Pro": {
        "standard_prompting": [
            "#f7ffe0",  # 1 - very pale lime
            "#eaffb3",  # 2 - light lime
            "#d4ff66",  # 3 - pastel lime
            "#baff33",  # 4 - soft lime
            "#a0ff00",  # 5 - vivid lime
        ],  # Pastel green to vivid lime
        "agents": [
            "#8be600",  # 6 - lime green
            "#7acc1a",  # 7 - yellow-green
            "#6ab300",  # 8 - olive-lime
            "#5a9900",  # 9 - darkest, yellowish green
        ],  # Deeper lime/green to lighter green
    },
    "Gemini-2.5-Flash": {
        "standard_prompting": [
            "#fff4e6",  # 1 - very light orange
            "#ffd9b3",  # 2 - light peach
            "#ffc285",  # 3 - soft orange
            "#ffad5c",  # 4 - rich orange
            "#ff9800",  # 5 - vivid orange
        ],  # Pastel yellow to vivid gold
        "agents": [
            "#ff7c1a",  # 6 - deep orange
            "#ff5a36",  # 7 - orange-red
            "#e64a19",  # 8 - strong reddish orange
            "#e65c00",
        ],  # Deeper gold to lighter orange-brown
    },
    "Llama-3.1-8B-Instruct": {
        "standard_prompting": [
            "#e0ffff",  # 1 - very light aqua
            "#b3eaf2",  # 2 - lighter aqua
            "#99dbe6",  # 3 - light pastel aqua
            "#7fcddb",  # 4 - pastel aqua
            "#66becf",  # 5 - soft aqua
        ],  # Pastel aqua to vivid turquoise
        "agents": [
            "#4cb0c3",  # 6 - medium aqua
            "#3391a7",  # 7 - deeper aqua
            "#2a7c8c",  # 8 - dark teal-blue
            "#2a6a7c",  # 9 - darkest, deep blue-teal
        ],  # Deeper turquoise to lighter teal
    },
    "Deepseek-R1-Distill-Llama-8B": {
        "standard_prompting": [
            "#fff3e0",  # 1 - very light peach
            "#ffe3c2",  # 2 - light cream
            "#ffd39f",  # 3 - pale yellow-orange
            "#ffc278",  # 4 - soft yellow-orange
            "#ffb34d",  # 5 - light amber
        ],  # Pastel peach to vivid orange
        "agents": [
            "#ffa233",  # 6 - amber
            "#e88c1a",  # 7 - brownish amber
            "#c97a1a",  # 8 - brown-orange
            "#be7835",  # 9 - brown
        ],  # Deeper orange to lighter brown
    },
    "Mistral-7B-Instruct-v0.3": {
        "standard_prompting": [
            "#f6efff",  # 1 - very light lavender
            "#e3d1fa",  # 2 - pale lavender
            "#d1b3f6",  # 3 - light pastel purple
            "#be95ea",  # 4 - soft purple
            "#a97fdc",  # 5 - medium purple
        ],  # Pastel lilac to vivid violet
        "agents": [
            "#8f5ad8",  # 6 - vivid purple
            "#7a3fc7",  # 7 - strong purple
            "#6a1bb1",  # 8 - deep purple
            "#502491",  # 9 - darkest, but not black
        ],  # Deeper violet to lighter purple
    },
    "Gemma-3-4B-it": {
        "standard_prompting": [
            "#e6fff7",  # 1 - very pale blue-green
            "#b3ffe6",  # 2 - light blue-green
            "#80ffd1",  # 3 - pastel blue-green
            "#4dffbe",  # 4 - soft blue-green
            "#1affaa",  # 5 - vivid blue-green
        ],  # Pastel mint to vivid green
        "agents": [
            "#00e699",  # 6 - cool green
            "#00cc88",  # 7 - blueish green
            "#00b377",  # 8 - deep blue-green
            "#009966",  # 9 - darkest, blueish green
        ],  # Deeper mint/green to lighter green
    },
    "MedGemma-4B-it": {
        "standard_prompting": [
            "#fbe6f6",  # 1 - very pale berry
            "#f5c2e3",  # 2 - light berry
            "#ee99cc",  # 3 - pastel berry
            "#e673b8",  # 4 - soft berry
            "#d94da3",  # 5 - berry
        ],  # Pastel pink to vivid magenta
        "agents": [
            "#c2338c",  # 6 - vivid berry
            "#a61a70",  # 7 - strong berry
            "#8c005a",  # 8 - deep berry
            "#66003f",  # 9 - darkest berry
        ],  # Deeper magenta to lighter pink/purple
    },
}

# Create unified model-prompting mapping
MODEL_PROMPTING_ID_MAPPING = {}

# Add convML and convDL models (unchanged)
for model in CONVML_MODELS + CONVDL_MODELS:
    MODEL_PROMPTING_ID_MAPPING[model] = MODEL_MAPPING[model]

# Add LLM models with integrated color schemes
for model, color_family in LLM_COLOR_FAMILIES.items():
    # Assign agent colors
    for i, agent_id in enumerate(AGENTIC_PROMPTING_IDS):
        key = f"{model}, {agent_id}"
        if i < len(color_family["agents"]):
            MODEL_PROMPTING_ID_MAPPING[key] = color_family["agents"][i]
        else:
            # Fallback to cycling through available agent colors
            MODEL_PROMPTING_ID_MAPPING[key] = color_family["agents"][
                i % len(color_family["agents"])
            ]

    # Assign standard prompting colors
    for i, standard_id in enumerate(STANDARD_PROMPTING_IDS):
        key = f"{model}, {standard_id}"
        if i < len(color_family["standard_prompting"]):
            MODEL_PROMPTING_ID_MAPPING[key] = color_family["standard_prompting"][i]
        else:
            # Fallback to cycling through available standard colors
            MODEL_PROMPTING_ID_MAPPING[key] = color_family["standard_prompting"][
                i % len(color_family["standard_prompting"])
            ]

# Function to get model-specific color
def get_model_color(model):
    if model in MODEL_MAPPING:
        # Use the color defined in MODEL_MAPPING for convML/convDL models
        return MODEL_MAPPING[model]

### Agent-specific Metadata Overview

In [None]:
# Filter df_metadata_agents to only include Summary Agent
df_metadata_suma = df_metadata_agents[df_metadata_agents["prompting_id"] == "SumAgent"].copy()
print(f"df_metadata_suma: {df_metadata_suma.shape}")

# Delete columns that have only missing values
columns_before = df_metadata_suma.shape[1]
df_metadata_suma = df_metadata_suma.dropna(axis=1, how='all')
columns_after = df_metadata_suma.shape[1]
print(f"Removed {columns_before - columns_after} columns with only missing values")
print(f"df_metadata_suma after cleanup: {df_metadata_suma.shape}")

# Comprehensive analysis of df_metadata_suma characteristics
print("=" * 80)
print("COMPREHENSIVE SUMMARY OF SUMMARY AGENT METADATA")
print("=" * 80)

# Basic shape and structure
print(f"\n1. DATASET OVERVIEW")
print(f"   Shape: {df_metadata_suma.shape[0]} rows × {df_metadata_suma.shape[1]} columns")
print(f"   Total experiments: {len(df_metadata_suma)}")

# Column information
print(f"\n2. AVAILABLE COLUMNS")
print(f"   Columns: {list(df_metadata_suma.columns)}")

# Model distribution
print(f"\n3. MODEL DISTRIBUTION")
if 'model' in df_metadata_suma.columns:
    model_counts = df_metadata_suma['model'].value_counts()
    print(f"   Models tested: {len(model_counts)} unique models")
    for model, count in model_counts.items():
        print(f"   - {model}: {count} rows")
else:
    print("   No 'model' column found")

# Task distribution
print(f"\n4. TASK DISTRIBUTION")
if 'task' in df_metadata_suma.columns:
    task_counts = df_metadata_suma['task'].value_counts()
    print(f"   Tasks evaluated: {len(task_counts)} unique tasks")
    for task, count in task_counts.items():
        print(f"   - {task}: {count} rows")
else:
    print("   No 'task' column found")

# Dataset distribution
print(f"\n5. DATASET DISTRIBUTION")
if 'dataset' in df_metadata_suma.columns:
    dataset_counts = df_metadata_suma['dataset'].value_counts()
    print(f"   Datasets used: {len(dataset_counts)} unique datasets")
    for dataset, count in dataset_counts.items():
        print(f"   - {dataset}: {count} rows")
else:
    print("   No 'dataset' column found")

# Model type distribution
print(f"\n6. MODEL TYPE DISTRIBUTION")
if 'model_type' in df_metadata_suma.columns:
    model_type_counts = df_metadata_suma['model_type'].value_counts()
    print(f"   Model types: {len(model_type_counts)} unique types")
    for model_type, count in model_type_counts.items():
        print(f"   - {model_type}: {count} experiments")
else:
    print("   No 'model_type' column found")

# Prompting ID verification
print(f"\n7. PROMPTING METHOD VERIFICATION")
if 'prompting_id' in df_metadata_suma.columns:
    prompting_counts = df_metadata_suma['prompting_id'].value_counts()
    print(f"   Prompting methods: {len(prompting_counts)} unique methods")
    for prompting, count in prompting_counts.items():
        print(f"   - {prompting}: {count} experiments")
else:
    print("   No 'prompting_id' column found")

# Agent verification
print(f"\n8. AGENT STATUS VERIFICATION")
if 'is_agent' in df_metadata_suma.columns:
    agent_counts = df_metadata_suma['is_agent'].value_counts()
    print(f"   Agent status distribution:")
    for status, count in agent_counts.items():
        print(f"   - is_agent={status}: {count} experiments")
else:
    print("   No 'is_agent' column found")

# Cross-tabulation analysis
print(f"\n9. CROSS-TABULATION ANALYSIS")
if all(col in df_metadata_suma.columns for col in ['model', 'task', 'dataset']):
    print(f"   Model × Task × Dataset combinations:")
    cross_tab = df_metadata_suma.groupby(['model', 'task', 'dataset']).size().reset_index(name='count')
    print(f"   Total unique combinations: {len(cross_tab)}")
    
    # Show coverage matrix
    print(f"\n   Coverage by Task and Dataset:")
    coverage = df_metadata_suma.pivot_table(
        index='task', 
        columns='dataset', 
        values='model', 
        aggfunc='count', 
        fill_value=0
    )
    print(coverage)
    
    print(f"\n   Coverage by Model and Task:")
    model_task_coverage = df_metadata_suma.pivot_table(
        index='model', 
        columns='task', 
        values='dataset', 
        aggfunc='count', 
        fill_value=0
    )
    print(model_task_coverage)

# Model-prompting ID combinations
print(f"\n10. MODEL-PROMPTING COMBINATIONS")
if 'model_prompting_id' in df_metadata_suma.columns:
    model_prompting_counts = df_metadata_suma['model_prompting_id'].value_counts()
    print(f"    Unique model-prompting combinations: {len(model_prompting_counts)}")
    for combo, count in model_prompting_counts.items():
        print(f"    - {combo}: {count} experiments")

# Temporal information (if available)
print(f"\n11. TEMPORAL INFORMATION")
time_columns = [col for col in df_metadata_suma.columns if any(keyword in col.lower() for keyword in ['time', 'date', 'duration', 'created'])]
if time_columns:
    print(f"    Time-related columns found: {time_columns}")
    for col in time_columns:
        if df_metadata_suma[col].dtype in ['datetime64[ns]', 'object']:
            try:
                # Try to convert to datetime if not already
                temp_series = pd.to_datetime(df_metadata_suma[col], errors='coerce')
                if not temp_series.isna().all():
                    print(f"    - {col}: {temp_series.min()} to {temp_series.max()}")
            except:
                print(f"    - {col}: Unable to parse as datetime")
        else:
            print(f"    - {col}: {df_metadata_suma[col].describe()}")
else:
    print("    No time-related columns found")

# Configuration and parameter information
print(f"\n12. CONFIGURATION PARAMETERS")
config_columns = [col for col in df_metadata_suma.columns if any(keyword in col.lower() for keyword in ['config', 'param', 'setting', 'seed', 'temperature', 'max_tokens'])]
if config_columns:
    print(f"    Configuration columns found: {config_columns}")
    for col in config_columns:
        unique_vals = df_metadata_suma[col].nunique()
        if unique_vals <= 10:  # Show all values if few unique
            print(f"    - {col}: {sorted(df_metadata_suma[col].unique())}")
        else:
            print(f"    - {col}: {unique_vals} unique values, range: {df_metadata_suma[col].min()} to {df_metadata_suma[col].max()}")
else:
    print("    No configuration-related columns found")

# File and path information
print(f"\n13. FILE AND PATH INFORMATION")
path_columns = [col for col in df_metadata_suma.columns if any(keyword in col.lower() for keyword in ['path', 'file', 'dir', 'output'])]
if path_columns:
    print(f"    Path-related columns found: {path_columns}")
    for col in path_columns:
        unique_count = df_metadata_suma[col].nunique()
        print(f"    - {col}: {unique_count} unique paths")
        if unique_count <= 5:  # Show examples for few paths
            print(f"      Examples: {list(df_metadata_suma[col].unique()[:3])}")
else:
    print("    No path-related columns found")

# Missing data analysis
print(f"\n14. DATA COMPLETENESS ANALYSIS")
missing_data = df_metadata_suma.isnull().sum()
missing_percentage = (missing_data / len(df_metadata_suma)) * 100
print(f"    Columns with missing data:")
for col, missing_count in missing_data[missing_data > 0].items():
    print(f"    - {col}: {missing_count} missing ({missing_percentage[col]:.1f}%)")
if missing_data.sum() == 0:
    print("    No missing data found - dataset is complete!")

# Sample data preview
print(f"\n15. SAMPLE DATA PREVIEW")
print("    First 3 rows:")
print(df_metadata_suma.head(3).to_string())

# Data types
print(f"\n16. DATA TYPES")
print("    Column data types:")
for col, dtype in df_metadata_suma.dtypes.items():
    print(f"    - {col}: {dtype}")

# Statistical summary for numerical columns
print(f"\n17. NUMERICAL COLUMNS SUMMARY")
numerical_cols = df_metadata_suma.select_dtypes(include=[np.number]).columns
if len(numerical_cols) > 0:
    print("    Numerical columns statistics:")
    print(df_metadata_suma[numerical_cols].describe())
else:
    print("    No numerical columns found")

print(f"\n" + "=" * 80)
print("SUMMARY COMPLETE - Ready for detailed agent evaluation analysis")
print("=" * 80)

df_metadata_suma: (49512, 94)
Removed 63 columns with only missing values
df_metadata_suma after cleanup: (49512, 31)
COMPREHENSIVE SUMMARY OF SUMMARY AGENT METADATA

1. DATASET OVERVIEW
   Shape: 49512 rows × 31 columns
   Total experiments: 49512

2. AVAILABLE COLUMNS
   Columns: ['model_type', 'model_name', 'is_agent', 'prompting_id', 'task', 'dataset', 'sample_index', 'original_row_index', 'sex', 'age', 'height', 'weight', 'probabilities', 'prediction', 'label', 'Input Prompt', 'Target Label', 'Predicted Probability', 'Predicted Diagnosis', 'Predicted Explanation', 'Tokenization Time', 'Inference Time', 'Input Tokens', 'Output Tokens', 'timestamp', 'Sample ID', 'Step Name', 'Step Number', 'System Message', 'Output', 'model_prompting_id']

3. MODEL DISTRIBUTION
   No 'model' column found

4. TASK DISTRIBUTION
   Tasks evaluated: 3 unique tasks
   - AKI: 23600 rows
   - Sepsis: 23512 rows
   - Mortality: 2400 rows

5. DATASET DISTRIBUTION
   Datasets used: 3 unique datasets
   - MIMI

  temp_series = pd.to_datetime(df_metadata_suma[col], errors='coerce')


    - Output: 40210 unique paths

14. DATA COMPLETENESS ANALYSIS
    Columns with missing data:
    - probabilities: 24756 missing (50.0%)
    - prediction: 24756 missing (50.0%)
    - Predicted Probability: 24756 missing (50.0%)
    - Predicted Diagnosis: 24756 missing (50.0%)
    - Predicted Explanation: 24756 missing (50.0%)

15. SAMPLE DATA PREVIEW
    First 3 rows:
      model_type        model_name  is_agent   prompting_id       task dataset  sample_index  original_row_index  sex   age  height  weight  probabilities  prediction  label                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

# Evaluation and Validation of Summary Agent Behaviour

In [9]:
# Create output directory
os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)
print(f"Analysis output directory: {OUTPUT_BASE_DIR}")

# Data preprocessing for analysis
print("\n" + "="*80)
print("DATA PREPROCESSING FOR SUMMARY AGENT ANALYSIS")
print("="*80)

# Clean and prepare the data
df_suma_clean = df_metadata_suma.copy()

# Categorize step types for Summary Agent
def categorize_step(step_name):
    """Categorize steps for Summary Agent analysis."""
    if pd.isna(step_name):
        return "Unknown"
    step_name = str(step_name).lower()
    if step_name == "feature_analysis":
        return "Feature Analysis"
    elif step_name == "final_prediction":
        return "Final Prediction"
    else:
        return "Other"

df_suma_clean['Step_Category'] = df_suma_clean['Step Name'].apply(categorize_step)

# Basic step analysis
print(f"\nStep distribution:")
step_counts = df_suma_clean['Step_Category'].value_counts()
for step, count in step_counts.items():
    percentage = (count / len(df_suma_clean)) * 100
    print(f"  {step}: {count:,} ({percentage:.1f}%)")

# Print what "Other" step category includes
if "Other" in step_counts.index:
    other_steps = df_suma_clean[df_suma_clean['Step_Category'] == "Other"]['Step Name'].unique()
    print("\nStep Name values included in 'Other':")
    for val in other_steps:
        print(f"  - {val}")

print(f"\nModel distribution in SUMA data:")
model_counts = df_suma_clean['model_name'].value_counts()
for model, count in model_counts.items():
    percentage = (count / len(df_suma_clean)) * 100
    print(f"  {model}: {count:,} ({percentage:.1f}%)")

print(f"\nTask-Dataset distribution:")
task_dataset_counts = df_suma_clean.groupby(['task', 'dataset']).size().sort_values(ascending=False)
for (task, dataset), count in task_dataset_counts.items():
    percentage = (count / len(df_suma_clean)) * 100
    print(f"  {task}-{dataset}: {count:,} ({percentage:.1f}%)")

print(f"\nData cleaning summary:")
print(f"  Original shape: {df_metadata_suma.shape}")
print(f"  Cleaned shape (adding numeric confidence column and standardized step categories): {df_suma_clean.shape}")
print(f"  Steps categorized: {df_suma_clean['Step_Category'].notna().sum():,}")

Analysis output directory: ./notebook_output/pulse_agents_suma

DATA PREPROCESSING FOR SUMMARY AGENT ANALYSIS

Step distribution:
  Feature Analysis: 24,756 (50.0%)
  Final Prediction: 24,756 (50.0%)

Model distribution in SUMA data:
  Gemini-2.5-Flash: 12,378 (25.0%)
  Llama-3.1-8B-Instruct: 12,378 (25.0%)
  Deepseek-R1-Distill-Llama-8B: 12,378 (25.0%)
  Mistral-7B-Instruct-v0.3: 12,378 (25.0%)

Task-Dataset distribution:
  Sepsis-MIMIC-IV: 7,920 (16.0%)
  AKI-MIMIC-IV: 7,872 (15.9%)
  AKI-eICU: 7,872 (15.9%)
  AKI-HiRID: 7,856 (15.9%)
  Sepsis-eICU: 7,840 (15.8%)
  Sepsis-HiRID: 7,752 (15.7%)
  Mortality-HiRID: 800 (1.6%)
  Mortality-MIMIC-IV: 800 (1.6%)
  Mortality-eICU: 800 (1.6%)

Data cleaning summary:
  Original shape: (49512, 31)
  Cleaned shape (adding numeric confidence column and standardized step categories): (49512, 32)
  Steps categorized: 49,512
