# In [1]:
# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.16.1
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# # EQBench3 Results Report Generator

import json
import pandas as pd
import numpy as np
from IPython.display import HTML, display
from collections import defaultdict
import re
import os
import sys
import html # For escaping HTML content
import statistics # For averaging rubric scores
from typing import Dict, List, Any, Optional, Tuple
from datetime import datetime, timezone
from utils.constants import MODEL_NAME_SUBS

# --- Make the project's 'core' and 'utils' directories importable ---
# SCRIPT_DIR is the *parent* of the current working directory, so 'core' and
# 'utils' are looked up as siblings of the directory the notebook runs from.
# NOTE(review): the relative "data/..." paths used elsewhere in this file
# suggest the notebook may instead run from the project root -- confirm which
# layout is intended.
SCRIPT_DIR = os.path.dirname(os.path.abspath('./'))
CORE_DIR = os.path.join(SCRIPT_DIR, 'core')
UTILS_DIR = os.path.join(SCRIPT_DIR, 'utils')

# Prepend every candidate that exists and is not on sys.path yet.
for dir_path in (CORE_DIR, UTILS_DIR):
    if not os.path.isdir(dir_path) or dir_path in sys.path:
        continue
    sys.path.insert(0, dir_path)
    print(f"Added {dir_path} to sys.path")

# --- Import EQBench specific functions ---
try:
    # Project constants (used for scenario type checks etc.)
    import utils.constants as eqbench_constants
except ImportError:
    print("Warning: Could not import 'constants' from utils. EQBench specific constants unavailable.")

    class DummyConstants:
        # Empty stand-ins: every "scenario_id in <set>" check is simply False.
        NO_RP_SCENARIO_IDS = set()
        MESSAGE_DRAFTING_SCENARIO_IDS = set()

    eqbench_constants = DummyConstants()
else:
    print("Successfully imported eqbench_constants.")

try:
    # Rubric score calculation lives in the project's core package.
    from core.benchmark import calculate_final_rubric_score
except ImportError as e:
    print(f"Error importing calculate_final_rubric_score from core.benchmark: {e}", file=sys.stderr)
    print("Please ensure core/benchmark.py exists and is in the Python path.", file=sys.stderr)

    def calculate_final_rubric_score(run_data: Dict[str, Any]) -> Tuple[Optional[float], Optional[str]]:
        """Fallback used when the real function cannot be imported.

        Ignores ``run_data``, prints a warning and returns
        ``(None, "Function not imported")``.
        """
        print("Warning: Using dummy calculate_final_rubric_score function.")
        return None, "Function not imported"
else:
    print("Successfully imported calculate_final_rubric_score.")

# --- Configuration Variables ---
# Alternative local inputs, kept for reference:
#RUNS_FILE = "1.json"
#ELO_RESULTS_FILE = "elo_results.json"
RUNS_FILE = "data/canonical_leaderboard_results.json.gz"
ELO_RESULTS_FILE = "data/canonical_leaderboard_elo_results.json.gz"
# Model names to leave out of listings (e.g., reference models); adjust as needed.
# This must be a set: a bare `{}` literal is an empty *dict*, which does not
# match the `set` type this value is passed around as.
MODELS_TO_IGNORE: set[str] = set()



















# --- Helper Functions (Adapted from Creative Writing) ---

# ---------------------------------------------------------------------------
# EQ‑Bench  •  Aggregate‑metrics printer  •  Drop‑in for refactored script
# ---------------------------------------------------------------------------
from collections import defaultdict
from statistics import mean

# --- group definitions -----------------------------------------------------

# Aggregate column name -> rubric metric names whose scores are pooled into
# that column's mean by calculate_and_print_metrics_eqbench.  Metric names are
# compared after lower-casing, and the key order here is the CSV column order.
# Commented-out entries are earlier groupings kept for reference.
RUBRIC_GROUPS = {
    #"humanlike":  ["conversational", "reactive", "humanlike"],
    "humanlike":  ["conversational", "humanlike"],
    #"uptight":    ["safety_conscious", "moralising"],
    "safe":    ["safety_conscious"],
    "assertive":  ["boundary_setting", "challenging"],
    #"social_iq":  ["pragmatic_ei", "social_intelligence", "message_tailoring"],
    "social_iq":  ["social_dexterity", "message_tailoring"],
    "warm": ["warmth", "validating"],
    #"analytical": ["analytical", "emotional_reasoning"],
    "analytical": ["analytical"],
    "insightful": ["depth_of_insight"],
    "empathy": ["demonstrated_empathy"],
    "compliant": ["compliant"],
    "moral": ["moralising"],
    "pragmatic": ["pragmatic_ei"]
}

# Rubric keys that count toward rubric_score_0_100; any other key in a rubric
# dict is ignored by that function.
STANDARD_ALLOWED_RUBRIC_CRITERIA: set[str] = {
    "demonstrated_empathy",
    "pragmatic_ei",
    "depth_of_insight",
    "social_dexterity",
    "emotional_reasoning",
    "message_tailoring",
    "theory_of_mind",
    "subtext_identification",
    "intellectual_grounding",
    "correctness",
}

# ---------------------------------------------------------------------------
# Hard-wired master display order for all scenario IDs
# ---------------------------------------------------------------------------
_CUSTOM_SCENARIO_ORDER = [
    1, 3, 6, 304, 2, 301, 303, 208, 302, 4, 15, 9, 11, 13,
    101, 131, 132, 133, 134, 136, 137, 138, 139, 140, 141, 145,
    401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412,
    413, 414, 415, 416, 417, 418, 419, 420,
]
# Scenario ID (as a string) -> zero-based position in the list above.
_SCENARIO_RANK = {str(sid): rank for rank, sid in enumerate(_CUSTOM_SCENARIO_ORDER)}
# One more than the list length, i.e. larger than every rank in _SCENARIO_RANK.
# NOTE(review): presumably the sort rank for IDs missing from the list; its
# use is not visible in this part of the file -- confirm.
_MAX_RANK_FALLBACK = len(_CUSTOM_SCENARIO_ORDER) + 1

def rubric_score_0_100(rubric_dict: dict[str, Any] | None) -> str:
    """
    Return the mean of the *allowed* numeric rubric metrics ×5 (to 0-100),
    formatted with one decimal.  If no numeric scores are present, return
    the literal string 'Unscored'.
    """
    if not isinstance(rubric_dict, dict):
        return "Unscored"

    vals = [
        v for k, v in rubric_dict.items()
        if k in STANDARD_ALLOWED_RUBRIC_CRITERIA and isinstance(v, (int, float))
    ]
    if not vals:
        return "Unscored"

    return f"{statistics.mean(vals) * 5:.1f}"

# ---------------------------------------------------------------------------
def calculate_and_print_metrics_eqbench(
    runs_file: str = RUNS_FILE,
    elo_file: str = ELO_RESULTS_FILE,
    models_to_ignore: set | None = None,
):
    """
    Print a CSV to stdout with one row per model found in the ELO file:
        model_name, elo_norm, rubric_0_100, <one column per RUBRIC_GROUPS key>,
        ci_low_norm, ci_high_norm

    Rows are ordered by ELO, highest first; models without a numeric ELO sort
    last.  Missing or non-numeric values are printed as 'N/A'.  Fields are
    written through the csv module, so a display name containing a comma or a
    quote is quoted instead of corrupting the row.

    Args:
        runs_file: Path handed to load_json_file for the runs data.
        elo_file: Path handed to load_json_file for the ELO results.
        models_to_ignore: Model names to skip; defaults to MODELS_TO_IGNORE.

    Returns:
        None.  If either file loads as empty/falsy, a message is printed and
        no CSV is emitted.
    """
    import csv  # local import keeps this block self-contained

    if models_to_ignore is None:
        models_to_ignore = set(MODELS_TO_IGNORE)

    # -------- load data ----------------------------------------------------
    runs_data = load_json_file(runs_file)
    elo_data = load_json_file(elo_file)

    if not runs_data:
        print(f"[metrics] Runs file '{runs_file}' missing or empty.")
        return
    if not elo_data:
        print(f"[metrics] ELO file '{elo_file}' missing or empty.")
        return

    # -------- harvest rubric scores ---------------------------------------
    # model -> lower-cased metric name -> every numeric score seen for it.
    # Malformed (non-dict) entries are skipped rather than raising.
    per_model_metric_scores: dict[str, dict[str, list[float]]] = (
        defaultdict(lambda: defaultdict(list))
    )
    for run in runs_data.values():
        if not isinstance(run, dict):
            continue
        model = run.get("test_model")
        if not model or model in models_to_ignore:
            continue
        scenario_tasks = run.get("scenario_tasks", {})
        if not isinstance(scenario_tasks, dict):
            continue
        for iter_data in scenario_tasks.values():
            if not isinstance(iter_data, dict):
                continue
            for task in iter_data.values():
                if not isinstance(task, dict):
                    continue
                rs = task.get("rubric_scores")
                if not isinstance(rs, dict):
                    continue
                for metric, val in rs.items():
                    if isinstance(val, (int, float)):
                        per_model_metric_scores[model][metric.lower()].append(val)

    # -------- aggregate by group ------------------------------------------
    group_avgs: dict[str, dict[str, float | None]] = {}
    for model, metrics in per_model_metric_scores.items():
        group_avgs[model] = {}
        for group, metric_names in RUBRIC_GROUPS.items():
            vals = []
            for m in metric_names:
                vals.extend(metrics.get(m, []))
            group_avgs[model][group] = round(mean(vals), 2) if vals else None

    # -------- overall rubric score (0-100) ---------------------------------
    # Uses the run located by find_run_key_for_model for each model and
    # scales the raw average returned by calculate_final_rubric_score by 5.
    overall_rubric_scaled: dict[str, float | None] = {}
    for model in per_model_metric_scores:
        run_key = find_run_key_for_model(runs_data, model)
        run_data = runs_data.get(run_key, {}) if run_key else {}
        avg_raw, _ = calculate_final_rubric_score(run_data)
        overall_rubric_scaled[model] = (
            avg_raw * 5 if isinstance(avg_raw, (int, float)) else None
        )

    # -------- ELO + CI look-ups -------------------------------------------
    elo_norm_lookup: dict[str, float | None] = {}
    ci_low_norm_lookup: dict[str, float | None] = {}
    ci_high_norm_lookup: dict[str, float | None] = {}

    for m, d in elo_data.items():
        if m == "__metadata__" or not isinstance(d, dict):
            continue
        elo_norm_lookup[m] = d.get("elo_norm", d.get("elo"))
        ci_low_norm_lookup[m] = d.get("ci_low_norm")
        ci_high_norm_lookup[m] = d.get("ci_high_norm")

    def _fmt(value, spec: str) -> str:
        """Format a numeric value with *spec*, or 'N/A' when it is not numeric."""
        return format(value, spec) if isinstance(value, (int, float)) else "N/A"

    def _elo_sort_key(model_name: str) -> float:
        """Numeric ELO for sorting; anything non-numeric sorts last.

        An `or -inf` shortcut would also demote a legitimate ELO of 0.
        """
        value = elo_norm_lookup.get(model_name)
        return value if isinstance(value, (int, float)) else float("-inf")

    # -------- CSV header ---------------------------------------------------
    writer = csv.writer(sys.stdout, lineterminator="\n")
    writer.writerow([
        "model_name",
        "elo_norm",
        "rubric_0_100",
        *RUBRIC_GROUPS.keys(),
        "ci_low_norm",
        "ci_high_norm",
    ])

    # -------- print one line per model -------------------------------------
    for model in sorted(elo_norm_lookup, key=_elo_sort_key, reverse=True):
        if model in models_to_ignore:
            continue

        row = [
            get_updated_model_name(model),
            _fmt(elo_norm_lookup.get(model), ".1f"),
            _fmt(overall_rubric_scaled.get(model), ".1f"),
        ]

        # group means
        group_vals = group_avgs.get(model, {})
        row.extend(_fmt(group_vals.get(g), ".2f") for g in RUBRIC_GROUPS)

        # CI bounds
        row.append(_fmt(ci_low_norm_lookup.get(model), ".1f"))
        row.append(_fmt(ci_high_norm_lookup.get(model), ".1f"))

        writer.writerow(row)


from utils.file_io import load_json_file

def sanitize_model_name(model_name: str) -> str:
    """Sanitize a model name for use in filenames and HTML IDs.

    Steps, in order: '/' becomes '__'; each of the characters <>:"|?*\\
    becomes '-'; every run of whitespace, colons or periods becomes a single
    '-'; if the result does not start with a letter (including when it is
    empty) it is prefixed with 'm-'; finally the result is lower-cased.

    Args:
        model_name: The raw model name.

    Returns:
        The sanitized, lower-cased name.  An empty input yields 'm-'.
    """
    sanitized = model_name.replace("/", "__")  # Replace slashes first
    # Characters unsafe in Windows filenames, replaced in a single pass.
    sanitized = sanitized.translate(str.maketrans(dict.fromkeys('<>:"|?*\\', '-')))
    # Collapse other potentially problematic runs (spaces, colons, periods).
    sanitized = re.sub(r'[\s:.]+', '-', sanitized)
    # Ensure it starts with a letter for a valid ID.  Slicing instead of
    # indexing keeps an empty name from raising IndexError.
    if not sanitized[:1].isalpha():
        sanitized = "m-" + sanitized
    return sanitized.lower()

def get_updated_model_name(original: str) -> str:
    """Look up *original* in MODEL_NAME_SUBS.

    Returns the mapped name when an entry exists; otherwise *original* is
    returned unchanged.
    """
    display_name = MODEL_NAME_SUBS.get(original, original)
    return display_name

# --- EQBench Specific Helper Functions (Adapted from Matchup Inspo) ---

def find_run_key_for_model(results_data: Dict, model_name: str) -> Optional[str]:
    """Find the top-level run key corresponding to a model name in EQBench runs.

    Lookup order:
      1. The first run whose 'test_model' field equals *model_name*.
      2. Otherwise, the first run key that contains *model_name* (with '/'
         replaced by '__') as a substring; a warning is printed in that case.

    Args:
        results_data: Mapping of run key -> run details.
        model_name: Model name to look for.

    Returns:
        The matching run key, or None when nothing matches or *model_name*
        is not a non-empty string.
    """
    # An empty name is a substring of every key and None equals a missing
    # 'test_model' field, so both would yield a bogus match below.
    if not isinstance(model_name, str) or not model_name:
        return None

    # Prioritize exact match on the 'test_model' field.
    for run_key, run_details in results_data.items():
        if isinstance(run_details, dict) and run_details.get("test_model") == model_name:
            return run_key

    # Fallback: match on the model name being *part* of the key.  Less
    # reliable, but helpful if naming conventions vary.
    sanitized_search_name = model_name.replace('/', '__')
    for run_key in results_data:
        if isinstance(run_key, str) and sanitized_search_name in run_key:
            print(f"Warning: Found run key '{run_key}' via partial key match for model '{model_name}'. Using this.")
            return run_key

    return None

def find_scenario_run_data(results_data: Dict, target_run_key: Optional[str], scenario_id: str, iteration_index: int) -> Optional[Dict]:
    """Return the task dict for one scenario/iteration inside one run.

    Walks results_data[target_run_key]["scenario_tasks"][str(iteration_index)]
    [str(scenario_id)].  Returns None as soon as the run key is falsy or
    absent, or any level along that path (including the final value) is not
    a dict.
    """
    if not target_run_key or target_run_key not in results_data:
        return None

    # Structure is {iteration_str: {scenario_id_str: data}} under "scenario_tasks".
    node = results_data[target_run_key]
    for step in ("scenario_tasks", str(iteration_index), str(scenario_id)):
        if not isinstance(node, dict):
            return None
        node = node.get(step)

    return node if isinstance(node, dict) else None


def parse_structured_response(content: str, scenario_id: str) -> Dict[str, str]:
    """Split assistant content into its named '# Heading' sections.

    The set of headings depends on whether *scenario_id* is listed in
    eqbench_constants.MESSAGE_DRAFTING_SCENARIO_IDS.  Each section maps to its
    stripped text, or '' when its heading is not found.  If every section is
    empty while *content* is non-empty, ``{"raw": content}`` is returned so
    the caller can display the unparsed text instead.
    """
    if scenario_id in eqbench_constants.MESSAGE_DRAFTING_SCENARIO_IDS:
        section_patterns = {
            "perspective_taking": r"#\s*Perspective[- ]taking\s*\n([\s\S]*?)(?=#\s*Draft brainstorming|\Z)",
            "draft_brainstorming": r"#\s*Draft brainstorming\s*\n([\s\S]*?)(?=#\s*Draft|\Z)",
            "draft": r"#\s*Draft\s*\n([\s\S]*?)(?=--|\Z)",
        }
    else:
        # Standard role-play layout
        section_patterns = {
            "thinking_feeling": r"#\s*I'm thinking & feeling\s*\n([\s\S]*?)(?=#\s*They're thinking & feeling|#\s*My response|\Z)",
            "their_thinking_feeling": r"#\s*They're thinking & feeling\s*\n([\s\S]*?)(?=#\s*My response|\Z)",
            "response": r"#\s*My response\s*\n([\s\S]*?)(?=--|\Z)",
        }

    search_flags = re.IGNORECASE | re.MULTILINE
    sections = {}
    for name, pattern in section_patterns.items():
        found = re.search(pattern, content, flags=search_flags)
        sections[name] = found.group(1).strip() if found else ""

    # Nothing extracted from non-empty content: hand back the raw text.
    if content and not any(sections.values()):
        return {"raw": content}

    return sections


def format_conversation_for_html(history: List[Dict[str, str]], scenario_id: str) -> str:
    """Format EQBench conversation history into HTML, preserving structure.

    Assistant messages are split into their structured sections via
    parse_structured_response, unless *scenario_id* is listed in
    eqbench_constants.NO_RP_SCENARIO_IDS.  All other messages are shown
    verbatim.  Message text and the role are HTML-escaped.

    Args:
        history: Messages, each a dict with 'role' and 'content'.
        scenario_id: Scenario the conversation belongs to.

    Returns:
        A "<div class='conversation'>" fragment, or a "missing-data"
        paragraph when *history* is empty.
    """
    if not history:
        return "<p class='missing-data'><em>No conversation history found.</em></p>"

    is_no_rp = scenario_id in eqbench_constants.NO_RP_SCENARIO_IDS

    # (parsed-section key, heading shown above it), in display order.
    if scenario_id in eqbench_constants.MESSAGE_DRAFTING_SCENARIO_IDS:
        section_headers = (
            ("perspective_taking", "# Perspective-taking"),
            ("draft_brainstorming", "# Draft brainstorming"),
            ("draft", "# Draft"),
        )
    else:  # Standard RP
        section_headers = (
            ("thinking_feeling", "# I'm thinking & feeling"),
            ("their_thinking_feeling", "# They're thinking & feeling"),
            ("response", "# My response"),
        )

    pieces = ["<div class='conversation'>"]
    for msg in history:
        role = msg.get('role', 'unknown')
        # A null 'content' is treated like an empty one, so the assistant
        # branch does not crash inside the regex parser.
        raw_content = msg.get('content') or ''
        # The role comes from the data file, so escape it before it lands in
        # an attribute or in markup.
        role_class = html.escape(role.lower()) if role else 'unknown'
        role_display = html.escape(f"{role.capitalize()}:") if role else "Unknown Role:"

        if role == 'assistant' and not is_no_rp:
            parsed = parse_structured_response(raw_content, scenario_id)
            if "raw" in parsed:  # Parsing yielded nothing, show raw
                formatted_content = f"<pre>{html.escape(parsed['raw'])}</pre>"
            else:
                # Emit a header + body for every section that has text.
                formatted_content = "\n".join(
                    f"<strong>{header}</strong>\n<pre>{html.escape(parsed[key])}</pre>"
                    for key, header in section_headers
                    if parsed.get(key)
                )
                # Re-add the '--' separator if the original's last line had one.
                content_lines = raw_content.splitlines()
                if content_lines and "--" in content_lines[-1]:
                    formatted_content += "\n<pre>--</pre>"

                if not formatted_content:  # Fallback: nothing to show at all
                    formatted_content = f"<pre>{html.escape(raw_content)}</pre>"
        else:  # User message or NO_RP assistant message
            formatted_content = f"<pre>{html.escape(raw_content if raw_content else '(No content)')}</pre>"

        pieces.append(f"<div class='message {role_class}'>")
        pieces.append(f"  <strong>{role_display}</strong>")
        pieces.append(formatted_content)  # Already contains <pre> tags
        pieces.append("</div>")
    pieces.append("</div>")
    return "".join(pieces)

import json, re

def extract_judge_reasoning(raw_text: str) -> Optional[str]:
    """
    Pull the `chain_of_thought_reasoning` field out of a raw judge blob.

    The text between the first '{' and the last '}' is parsed as JSON, so
    leading/trailing noise (warnings, markdown) around the object is
    tolerated.  Returns None when the input is not a string, contains no
    such brace pair, or the enclosed text is not valid JSON.
    """
    if not isinstance(raw_text, str):
        return None

    # Outermost brace span: first opening brace through last closing brace.
    start = raw_text.find('{')
    end = raw_text.rfind('}')
    if start == -1 or end < start:
        return None

    try:
        blob = json.loads(raw_text[start:end + 1])
    except json.JSONDecodeError:
        return None
    return blob.get("chain_of_thought_reasoning")

import re, json

def substitute_judge_codes(text: str,
                           order_str: str,
                           test_model: str,
                           neighbor_model: str) -> str:
    """
    Replace the short codes (e.g. 'A0493', 'A0488') that the judge uses
    with the actual model names.

    * `order_str` looks like  "A0493:test / A0488:other"
    * test_model / neighbor_model come from entry["pair"]

    Every whole-word occurrence of a code is swapped for the last
    '/'-separated part of `get_updated_model_name(<model>)`, capitalized.
    All codes are replaced in one pass with a function replacement, so a
    model name is inserted literally (backslashes are not read as regex
    escapes) and inserted names are never re-scanned for other codes.

    Returns `text` unchanged when `text` or `order_str` is not a string, or
    when `order_str` contains no code:role pairs.
    """
    if not (isinstance(text, str) and isinstance(order_str, str)):
        return text          # nothing to do

    # Build code -> role map from the "order" field,
    # e.g. {'A0493': 'test', 'A0488': 'other'}
    role_map = {
        m.group(1): m.group(2)
        for m in re.finditer(r'([A-Za-z]\d+)\s*:\s*(test|other|neighbor)', order_str)
    }
    if not role_map:
        return text

    # Translate to code -> display name ('other' and 'neighbor' both mean
    # the neighbor model).
    code_to_model = {}
    for code, role in role_map.items():
        model = test_model if role == 'test' else neighbor_model
        code_to_model[code] = get_updated_model_name(model).split('/')[-1].capitalize()

    # Longest codes first so the alternation never prefers a shorter code.
    alternatives = '|'.join(
        re.escape(code) for code in sorted(code_to_model, key=len, reverse=True)
    )
    pattern = re.compile(rf'\b(?:{alternatives})\b')
    return pattern.sub(lambda m: code_to_model[m.group(0)], text)

import gzip, base64

def gzip_b64(s: str) -> str:
    """Encode text as UTF-8, gzip it, and return the Base-64 form as ASCII text."""
    utf8_bytes = s.encode('utf-8')
    compressed = gzip.compress(utf8_bytes)
    encoded = base64.b64encode(compressed)
    return encoded.decode('ascii')

def generate_matchup_section_html(
    comparison_entry: Dict,
    results_data: Dict,
    target_model: str,
    uniq: str,  # <- pass in a short string that is unique within the opponent group ("1", "2", ...)
) -> str:
    """
    Build the HTML fragment for a single pair-wise comparison section on the
    "Matchups" tab.  The `uniq` parameter is concatenated onto all element IDs
    so every matchup gets its own distinct ID (preventing toggle collisions).

    The collapsible body (outcome box, judge reasoning, both transcripts) is
    gzip + Base-64 encoded into the `data-matchup` attribute of an initially
    empty container; only the clickable header is emitted as plain markup.

    Args:
        comparison_entry: One comparison record; reads 'scenario_id', 'pair',
            'outcome_for_test_model', 'judge_response', 'order', 'plus_diff'.
        results_data: Runs data used to look up both conversation transcripts.
        target_model: The model the report is about ("this model").
        uniq: Suffix that makes the element IDs unique.

    Returns:
        The HTML fragment, or a short error <div> when the scenario/iteration
        is missing, a model name is missing, or `target_model` is neither
        side of the pair.
    """

    # ------------------------------------------------------------
    # 1) Extract basic matchup metadata
    # ------------------------------------------------------------
    pair = comparison_entry.get("pair")
    if not isinstance(pair, dict):
        pair = {}
    scenario_id      = comparison_entry.get("scenario_id")
    iteration_idx    = pair.get("iteration_index")
    elo_test_model   = pair.get("test_model")
    elo_neighbor_mod = pair.get("neighbor_model")

    # Validate before touching the model names, so incomplete entries reach
    # the error paths below instead of raising AttributeError.
    if scenario_id is None or iteration_idx is None:
        return "<div class='comparison-section error-message'>Skipped: missing scenario or iteration.</div>"

    # ------------------------------------------------------------
    # 2) Work out which side is "this model" (A) and the opponent (B)
    # ------------------------------------------------------------
    if elo_test_model == target_model:
        model_A_name   = elo_test_model
        model_B_name   = elo_neighbor_mod
        outcome_value  = comparison_entry.get("outcome_for_test_model", "N/A")
    elif elo_neighbor_mod == target_model:
        model_A_name   = elo_neighbor_mod
        model_B_name   = elo_test_model
        # The stored outcome is from the test model's side; flip win/loss.
        raw_outcome    = comparison_entry.get("outcome_for_test_model")
        outcome_value  = 1.0 if raw_outcome == 0.0 else 0.0 if raw_outcome == 1.0 else raw_outcome
    else:
        return "<div class='comparison-section error-message'>Internal error – model mismatch.</div>"

    if not (isinstance(model_A_name, str) and isinstance(model_B_name, str)):
        return "<div class='comparison-section error-message'>Skipped: missing model name.</div>"

    # Display versions of the two model names
    model_A_label = get_updated_model_name(model_A_name).split('/')[-1].capitalize()
    model_B_label = get_updated_model_name(model_B_name).split('/')[-1].capitalize()

    # ------------------------------------------------------------
    # 3) Pull the run-data and conversation transcripts
    # ------------------------------------------------------------
    run_key_A = find_run_key_for_model(results_data, model_A_name)
    run_key_B = find_run_key_for_model(results_data, model_B_name)

    model_A_task = find_scenario_run_data(results_data, run_key_A, scenario_id, iteration_idx)
    model_B_task = find_scenario_run_data(results_data, run_key_B, scenario_id, iteration_idx)

    model_A_convo = model_A_task.get("conversation_history", []) if model_A_task else []
    model_B_convo = model_B_task.get("conversation_history", []) if model_B_task else []

    # Judge reasoning (convert short codes -> model names).  A dict response
    # is read directly; a raw string response goes through
    # extract_judge_reasoning, which digs the field out of the JSON text.
    judge_blob = comparison_entry.get("judge_response", {})
    if isinstance(judge_blob, dict):
        judge_reason = judge_blob.get("chain_of_thought_reasoning", "")
    elif isinstance(judge_blob, str):
        judge_reason = extract_judge_reasoning(judge_blob)
    else:
        judge_reason = ""
    if not isinstance(judge_reason, str):
        judge_reason = ""
    judge_reason = substitute_judge_codes(
        judge_reason,
        comparison_entry.get("order", ""),
        elo_test_model,
        elo_neighbor_mod,
    )

    # ------------------------------------------------------------
    # 4) Outcome summary for header line
    # ------------------------------------------------------------
    if outcome_value == 1.0:
        win_loss = "Win"
        sign     = "+"
    elif outcome_value == 0.0:
        win_loss = "Loss"
        sign     = "−"
    else:
        win_loss = "Draw"
        sign     = "±"

    plus_diff = comparison_entry.get("plus_diff", 0)
    if not isinstance(plus_diff, (int, float)):
        plus_diff = 0  # e.g. an explicit null; abs() below needs a number

    # ------------------------------------------------------------
    # 5) Build unique IDs and assemble HTML
    # ------------------------------------------------------------
    opponent_sanitized = sanitize_model_name(model_B_name)
    anchor_id = (
        f"matchup_s{scenario_id}_i{iteration_idx}_vs_{opponent_sanitized}_{uniq}"
    )

    missing_convo = '<p class="missing-data">Conversation not found.</p>'
    convo_A_html = format_conversation_for_html(model_A_convo, scenario_id) if model_A_convo else missing_convo
    convo_B_html = format_conversation_for_html(model_B_convo, scenario_id) if model_B_convo else missing_convo

    # ------------------------------------------------------------------
    # 5a) Build ONLY the **inner content** (no header, no outer wrapper)
    # ------------------------------------------------------------------
    inner_html = f"""
        <div class="info-box small-info-box">          
          <strong>Outcome for this model:</strong> {win_loss}
        </div>

        <div class="section judge-section matchup-judge">
          <h5>Judge&apos;s Reasoning:</h5>
          <pre>{html.escape(judge_reason or '(none provided)')}</pre>
        </div>

        <div class="comparison-grid">
          <div class="model-column">
            <h6>{html.escape(model_A_label)}</h6>
            {convo_A_html}
          </div>
          <div class="model-column">
            <h6>{html.escape(model_B_label)}</h6>
            {convo_B_html}
          </div>
        </div>
    """

    # gzip + base-64 encode -> ready for the data-attribute
    compressed_html = gzip_b64(inner_html)

    # ------------------------------------------------------------------
    # 5b) Return placeholder markup:
    #     - outer <div class="comparison-section"> stays in the DOM
    #     - header is visible & clickable
    #     - collapsible <div id="...-content"> gets the compressed payload
    # ------------------------------------------------------------------
    cat, title = SCENARIO_META.get(str(scenario_id),
                                   (f"Scenario {scenario_id}", ""))  # fallback
    scenario_label = f"{cat} | {title}" if title else cat

    return f"""
    <div class="comparison-section" id="{anchor_id}">

      <!-- clickable summary header -->
      <div class="comparison-header"
           onclick="expandMatchup('{anchor_id}-content')">
        <span class="toggle-icon">+</span>
        {html.escape(scenario_label)} | [{win_loss}] ({sign}{abs(plus_diff)})
      </div>

      <!-- content container (initially empty) -->
      <div id="{anchor_id}-content"
           class="collapsible-content"
           data-matchup="{compressed_html}"
           data-expanded="false"></div>
    </div>
    """



# ---------------------------------------------------------------------------
# Scenario-ID → (category, title)  lookup  (STRICT)
# ---------------------------------------------------------------------------
def load_scenario_metadata() -> dict[str, tuple[str, str]]:
    """
    Load the scenario-ID -> (category, title) mapping from
    data/scenario_prompts.txt, looked up first under ./data/ and then under
    ../data/ (relative to the current working directory).

    A metadata line looks like:
        ######## 11 | Work Dilemma | Lunchroom Theft Scapegoat
    Blank lines, lines starting with '//' and any line that does not match
    that shape are skipped.

    Returns:
        Mapping of scenario ID (string) to (category, title).

    Raises:
        FileNotFoundError: the file exists in neither location.
        ValueError: a metadata line has an empty category or title.
        RuntimeError: the file contains no metadata lines at all.
    """
    # Search both locations named in the error message below.
    candidate_paths = [
        './data/scenario_prompts.txt',
        '../data/scenario_prompts.txt',
    ]
    meta_path = next((p for p in candidate_paths if os.path.isfile(p)), None)
    if meta_path is None:
        raise FileNotFoundError("scenario_prompts.txt not found in ./data/ or ../data/")

    line_pat = re.compile(r'^#{8}\s*(\d+)\s*\|\s*([^|]+?)\s*\|\s*(.+)$')
    mapping: dict[str, tuple[str, str]] = {}

    with open(meta_path, encoding='utf-8') as fh:
        for ln_no, ln in enumerate(fh, 1):
            ln = ln.rstrip()

            # ignore blank lines & comments
            if not ln or ln.lstrip().startswith('//'):
                continue

            m = line_pat.match(ln)
            if not m:               # not a metadata line -> just skip
                continue

            sid, cat, title = (s.strip() for s in m.groups())
            if not cat or not title:
                raise ValueError(f"[meta] Missing category/title on line {ln_no}")
            mapping[sid] = (cat, title)

    if not mapping:
        raise RuntimeError("[meta] File parsed but produced an empty mapping")

    print(f"[meta] Loaded {len(mapping)} scenarios from {meta_path}")
    return mapping

def scenario_label_str(sid: str) -> tuple[str, str]:
    """Return the SCENARIO_META entry for *sid* (converted with str()).

    Raises:
        KeyError: with a '[meta] ...' message when the ID has no entry.
    """
    try:
        label = SCENARIO_META[str(sid)]
    except KeyError:
        raise KeyError(f"[meta] Scenario ID {sid} missing from metadata file")
    return label


# Built once when this module/cell executes.  load_scenario_metadata raises
# if the metadata file is missing or yields no entries, so a failure here
# stops execution at this point.
SCENARIO_META = load_scenario_metadata()



# --- Main Report Generation Function ---

def generate_model_report_eqbench(model_name: str, run_key: Optional[str] = None, save_to_file: bool = False) -> HTML:
    """
    Generate an HTML report for a specific model's EQBench3 results,
    including scenario outputs and pairwise matchups.
    """
    # --- Data Loading ---
    runs_data = load_json_file(RUNS_FILE)
    elo_data = load_json_file(ELO_RESULTS_FILE)

    if not runs_data:
        return HTML(f"<h2>Error: Runs data file '{RUNS_FILE}' not found or empty.</h2>")
    if not elo_data:
        print(f"Warning: ELO data file '{ELO_RESULTS_FILE}' not found or empty. Matchup data will be unavailable.")
        all_comparisons = [] # Ensure it's an empty list if file is missing
    else:
        all_comparisons = elo_data.get("__metadata__", {}).get("global_pairwise_comparisons", [])


    # --- Find the correct run for the model ---
    if run_key is None:
        run_key = find_run_key_for_model(runs_data, model_name)
        if not run_key:
            # Try finding via substituted name as a fallback
            substituted_name = get_updated_model_name(model_name)
            if substituted_name != model_name:
                 run_key = find_run_key_for_model(runs_data, substituted_name)

            if not run_key:
                 return HTML(f"<h2>No runs found for model: {html.escape(model_name)} (or its substitution)</h2>")
            else:
                 print(f"Using run key '{run_key}' found via substituted name '{substituted_name}'")
                 model_name = substituted_name # Use the name associated with the found run key

    if run_key not in runs_data:
        return HTML(f"<h2>Run key '{html.escape(run_key)}' not found in {RUNS_FILE}</h2>")

    run_data = runs_data[run_key]
    original_model_name = run_data.get("test_model", model_name) # Get model name from run data if possible
    display_model_name = get_updated_model_name(original_model_name)

    scenario_tasks = run_data.get("scenario_tasks", {})
    if not scenario_tasks:
        print(f"Warning: No scenario tasks found for run: {run_key}")
        # Allow report generation but indicate missing task data

    # --- Calculate/Retrieve Overall Scores ---
    # ELO Scores
    model_elo_data = elo_data.get(original_model_name, {})
    elo_raw = model_elo_data.get("elo", "N/A")
    elo_norm = model_elo_data.get("elo_norm", "N/A")
    elo_raw_display = f"{elo_raw:.1f}" if isinstance(elo_raw, (int, float)) else "N/A"
    elo_norm_display = f"{elo_norm:.1f}" if isinstance(elo_norm, (int, float)) else "N/A"

    # Rubric Score (Calculate from run_data)
    avg_rubric_score, rubric_err = calculate_final_rubric_score(run_data)
    # Scale score 0-20 to 0-100
    rubric_score_scaled = avg_rubric_score * 5 if isinstance(avg_rubric_score, (int, float)) else None
    rubric_score_display = f"{rubric_score_scaled:.1f}" if rubric_score_scaled is not None else "N/A"

    if rubric_err:
        rubric_score_display += f" (Error: {rubric_err})"
    # Check if rubric was explicitly skipped in the run config
    elif run_data.get("results", {}).get("average_rubric_score") == "Skipped" or run_data.get("run_rubric") is False:
         rubric_score_display = "Skipped"


    # --- HTML Generation ---
    html_output = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>EQBench3 Report: {display_model_name}</title>
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <style>
            /* --- Base Variables & Font Defaults (Copied from Creative Writing) --- */
            /* ----------------------------------------------------
            2) Base Variables & Font Defaults  –  Cozy LIGHT
            ---------------------------------------------------- */
            /* ----------------------------------------------------
            2) Base Variables & Font Defaults  –  Cozy LIGHT
            ---------------------------------------------------- */
            :root {{
                /* Theme identity ---------------------------------- */
                --theme-name:                  'cozy';

                /* Fonts (unchanged) --------------------------------*/
                --font-body-cozy:              'Tiempos Text', Georgia, serif;
                --font-heading-cozy:           'Lora', serif;

                --font-body-modern:            'Inter', sans-serif;
                --font-heading-modern:         'Besley', 'Merriweather', serif;

                --font-ui:                     'Lora', serif;

                /* Active font vars */
                --font-body:                   var(--font-body-cozy);
                --font-heading:                var(--font-heading-cozy);

                /* Page & text ------------------------------------- */
                --bg-color:                    #fffbf8;   /* warm ivory */
                --text-color:                  #000000;   /* near‑black, subtle warmth */

                --header-color:                #5a3d32;   /* rich brown */
                --subheader-color:             #8a7163;   /* warm taupe */

                /* Borders & accents ------------------------------- */
                --border-color:                #e6e0d7;   /* light beige‑grey */
                --accent-border-color:         #d0c0b0;   /* soft brown‑beige */

                /* Panels / boxes ---------------------------------- */
                --container-bg:                #ffffff;   /* main blocks */
                --iter-header-bg:              #f8f4f1;   /* iteration / opponent bars */
                --iter-header-hover-bg:        #f1ede9;

                --prompt-header-bg:            #fef9f7;   /* scenario / comparison headers */
                --prompt-header-hover-bg:      #f7f2ef;

                --prompt-display-bg:           #fcf8f5;   /* italic prompt strip */

                /* Judge / rubric panels --------------------------- */
                --judge-bg:                    #f5f5f7;   /* neutral light grey */
                --judge-border:                #cdcfd6;   /* quiet blue‑grey */
                --judge-text:                  #000;

                /* Misc UI ----------------------------------------- */
                --toggle-icon-color:           #a28c7c;
                --shadow-color:                rgba(0,0,0,0.08);

                --link-color:                  #8a7163;   /* match subheader */
                --link-hover-color:            #5a3d32;   /* match header */

                /* Toggle / select gadgets */
                --toggle-bg:                   #ccc;
                --toggle-checked-bg:           #8a7163;
                --toggle-knob-bg:              #ffffff;

                --select-text-color:           var(--subheader-color);
                --select-chevron-color:        var(--subheader-color);
                --select-bg:                   transparent;
                --select-border:               none;

                /* Tabs -------------------------------------------- */
                --tab-bg:                      #ded7ce;
                --tab-text:                    var(--subheader-color);
                --tab-active-bg:               var(--container-bg);
                --tab-active-text:             var(--header-color);
                --tab-border:                  var(--border-color);

                /* Utility colours --------------------------------- */
                --missing-data-color:          #999;
                --error-message-color:         #d9534f;
            }}




            /* --- Cozy Dark Mode Variables --- */
            /* --- Cozy Dark‑mode Variables (updated) --- */
            body.theme-cozy.dark-mode {{
                --bg-color:                #2a2527;   /* page background unchanged */
                --text-color:              #fff9f2;
                --header-color:            #f7eee0;
                --subheader-color:         #e9dfd0;

                --border-color:            #3e3936;
                --accent-border-color:     #6a5349;

                /* ↓ darker “panels” */
                --container-bg:            #262224;   /* main boxes, convo bubbles */
                --iter-header-bg:          #2b2628;   /* iteration/opponent bars */
                --iter-header-hover-bg:    #353033;
                --prompt-header-bg:        #272324;   /* scenario / comparison headers */
                --prompt-header-hover-bg:  #302b2d;
                --prompt-display-bg:       #242021;   /* italic prompt strip */
                --judge-bg:                #232427;   /* judge + rubric boxes */
                --judge-border:            #4a4542;
                --shadow-color:            #0c0705;

                --link-color:              #d0bca8;
                --link-hover-color:        #ebdac5;

                --toggle-icon-color:       #c0b0a0;
                --select-text-color:       var(--subheader-color);
                --select-chevron-color:    var(--subheader-color);

                --tab-bg:                  #3e3936;
                --tab-text:                var(--subheader-color);
                --tab-active-bg:           var(--container-bg);
                --tab-active-text:         var(--header-color);
                --tab-border:              #4e4944;

                --missing-data-color:      #aaa;
                --error-message-color:     #e57373;
                --judge-text:              #fff9f2;
            }}


            /* --- Modern Theme Variables --- */
            body.theme-modern {{
                --theme-name: 'modern';
                --font-body: var(--font-body-modern); --font-heading: var(--font-heading-modern);
                --bg-color: #ffffff; --text-color: #000000; --header-color: #000000;
                --subheader-color: #495057; --border-color: #dee2e6; --accent-border-color: #adb5bd;
                --container-bg: #ffffff; --iter-header-bg: #f8f9fa; --iter-header-hover-bg: #e9ecef;
                --prompt-header-bg: #ffffff; --prompt-header-hover-bg: #f8f9fa; --judge-bg: #f1f3f5;
                --judge-border: #ced4da; --judge-text: #000; --prompt-display-bg: #f8f9fa;
                --toggle-icon-color: #6c757d; --shadow-color: rgba(0, 0, 0, 0.1);
                --link-color: #007bff; --link-hover-color: #0056b3;
                --select-text-color: var(--subheader-color); --select-chevron-color: var(--subheader-color);
                --tab-bg: #e9ecef; --tab-text: var(--subheader-color); --tab-active-bg: var(--container-bg);
                --tab-active-text: var(--header-color); --tab-border: var(--border-color);
                --missing-data-color: #6c757d; --error-message-color: #dc3545;
            }}

            /* --- Modern Dark Mode Variables --- */
            body.theme-modern.dark-mode {{
                --bg-color: #1a1a1a; --text-color: #e9ecef; --header-color: #ffffff;
                --subheader-color: #adb5bd; --border-color: #495057; --accent-border-color: #6c757d;
                --container-bg: #212529; --iter-header-bg: #343a40; --iter-header-hover-bg: #495057;
                --prompt-header-bg: #2c3034; --prompt-header-hover-bg: #343a40; --judge-bg: #343a40;
                --judge-border: #495057; --judge-text: #ced4da; --prompt-display-bg: #343a40;
                --toggle-icon-color: #adb5bd; --shadow-color: rgba(0, 0, 0, 0.3);
                --link-color: #69b1ff; --link-hover-color: #a8d1ff;
                --select-text-color: var(--subheader-color); --select-chevron-color: var(--subheader-color);
                --tab-bg: #343a40; --tab-text: var(--subheader-color); --tab-active-bg: var(--container-bg);
                --tab-active-text: var(--header-color); --tab-border: #495057;
                --missing-data-color: #adb5bd; --error-message-color: #f08080;
            }}

            /* --- Base Global Styles (Adapted) --- */
            body {{
                font-family: var(--font-body); line-height: 1.7; color: var(--text-color);
                background-color: var(--bg-color); max-width: 1100px; /* Wider for side-by-side */
                margin: 30px auto; padding: 40px 50px; border: 1px solid var(--border-color);
                box-shadow: 0 5px 15px var(--shadow-color); transition: background-color 0.3s, color 0.3s, border-color 0.3s;
            }}
            h1, h2, h3, h4, h5, h6 {{ /* Added h5, h6 */
                font-family: var(--font-heading); color: var(--header-color);
                margin-top: 1.5em; margin-bottom: 0.8em; line-height: 1.3; transition: color 0.3s;
            }}
            h1 {{ /* Main Title */
                text-align: center; font-size: 2.2em; border-bottom: 2px solid var(--accent-border-color);
                padding-bottom: 15px; margin-bottom: 1em; font-weight: 700;
                transition: border-color 0.3s; font-family: var(--font-ui) !important;
            }}
            h2 {{ /* Iteration Header / Matchup Opponent Header */
                font-size: 1.6em; font-weight: 700; margin-top: 2em;
                border-bottom: 1px solid var(--border-color); padding-bottom: 5px;
            }}
            h3 {{ /* Scenario Header / Matchup Section Header */
                font-size: 1.3em; font-style: italic; font-weight: 400;
                color: var(--subheader-color); margin-top: 1.5em;
            }}
             h4 {{ /* Matchup Sub-header (Scenario ID/Iter) */
                font-size: 1.1em; font-weight: 700; color: var(--header-color);
                 margin-top: 1.2em; margin-bottom: 0.5em;
            }}
             h5 {{ /* Judge Reasoning / Model Column Headers */
                 font-size: 1.0em; font-weight: bold; color: var(--subheader-color);
                 margin-top: 1em; margin-bottom: 0.5em;
             }}
             h6 {{ /* Conversation Headers within Matchups */
                 font-size: 0.95em; font-weight: bold; color: var(--text-color);
                 margin-top: 0.8em; margin-bottom: 0.3em;
             }}
            strong {{ font-weight: bold; color: var(--header-color); transition: color 0.3s; }}
            a {{ color: var(--link-color); text-decoration: none; transition: color 0.3s; }}
            a:hover {{ color: var(--link-hover-color); text-decoration: underline; }}
            pre {{ /* Styling for code/conversation blocks */
                background-color: var(--prompt-display-bg); padding: 10px 15px; border-radius: 4px;
                white-space: pre-wrap; word-wrap: break-word; font-family: monospace; /* Use monospace for convo */
                font-size: 0.95em; border: 1px solid var(--border-color); margin-bottom: 1em;
                transition: background-color 0.3s, border-color 0.3s;
            }}
            .missing-data {{ font-style: italic; color: var(--missing-data-color); }}
            .error-message {{ color: var(--error-message-color); font-weight: bold; }}

            /* --- Top Controls & Header Info --- */
            .top-controls {{
                display: flex; justify-content: space-between; align-items: center;
                margin-bottom: 20px; padding-bottom: 10px; border-bottom: 1px solid var(--border-color);
                transition: border-color 0.3s; font-family: var(--font-ui) !important;
            }}
            .back-button {{ font-family: var(--font-ui) !important; font-size: 1em; color: var(--select-text-color); transition: color 0.3s; }}
            .controls-right {{ display: flex; align-items: center; gap: 15px; }}
            .control-select-wrapper {{ position: relative; display: inline-block; }}
            .control-select {{
                font-family: var(--font-ui) !important; font-size: 0.9em; color: var(--select-text-color);
                background-color: var(--select-bg); border: none; padding: 2px 5px 2px 18px; margin: 0;
                cursor: pointer; appearance: none; -webkit-appearance: none; -moz-appearance: none;
                transition: color 0.3s; border-radius: 0;
            }}
            .control-select:focus {{ outline: none; }}
            .control-select-wrapper::before {{
                content: '▼'; font-size: 0.6em; color: var(--select-chevron-color); position: absolute;
                left: 5px; top: 50%; transform: translateY(-50%); pointer-events: none; transition: color 0.3s;
            }}
            .control-select option {{ background-color: var(--bg-color); color: var(--text-color); font-family: var(--font-ui); }}
            .mode-toggle {{ display: flex; align-items: center; font-family: var(--font-ui) !important; }}
            .mode-toggle .form-check-input {{ opacity: 0; width: 0; height: 0; position: absolute; }}
            .mode-toggle .form-check-label {{
                font-family: var(--font-ui) !important; font-size: 0.9em; color: var(--subheader-color);
                cursor: pointer; transition: color 0.3s; user-select: none; padding: 2px 5px;
            }}
            .mode-toggle .form-check-label:hover {{ color: var(--link-hover-color); }}
            .header-info {{
                 background-color: var(--prompt-display-bg); border: 1px solid var(--border-color);
                 padding: 15px 20px; margin-bottom: 25px; border-radius: 5px; font-size: 0.95em;
                 transition: background-color 0.3s, border-color 0.3s;
            }}
            .header-info strong {{ color: var(--header-color); }}

            /* --- Tabs --- */
            .tab-container {{ margin-bottom: 20px; border-bottom: 2px solid var(--tab-border); }}
            .tab-button {{
                padding: 10px 20px; cursor: pointer; background-color: var(--tab-bg);
                border: 1px solid var(--tab-border); border-bottom: none; border-radius: 5px 5px 0 0;
                margin-right: 5px; margin-bottom: -2px; /* Overlap border */
                display: inline-block; color: var(--tab-text); font-family: var(--font-ui);
                transition: background-color 0.3s, color 0.3s, border-color 0.3s;
            }}
            .tab-button.active {{
                background-color: var(--tab-active-bg); color: var(--tab-active-text);
                border-color: var(--tab-border); border-bottom: 2px solid var(--tab-active-bg); /* Cover bottom border */
                font-weight: bold;
            }}
            .tab-content {{ display: none; padding-top: 20px; }}
            .tab-content.active {{ display: block; }}

            /* --- Scenario Output Styling --- */
            .iteration-container {{
                margin: 30px 0; border: 1px solid var(--border-color); border-radius: 4px;
                overflow: hidden; background-color: var(--container-bg);
                box-shadow: 0 2px 5px rgba(0,0,0,0.05);
                transition: background-color 0.3s, border-color 0.3s, box-shadow 0.3s;
            }}
            .iteration-header {{ /* Use h2 style */
                background: var(--iter-header-bg); padding: 12px 20px; cursor: pointer;
                position: relative; border-bottom: 1px solid var(--border-color);
                font-size: 1.2em; font-weight: 700; color: var(--header-color);
                transition: background-color 0.3s, border-color 0.3s, color 0.3s;
            }}
            .iteration-header:hover {{ background: var(--iter-header-hover-bg); }}
            .scenario-container {{ border-top: 1px dashed var(--accent-border-color); transition: border-color 0.3s; }}
            .scenario-container:first-child {{ border-top: none; }}
            .scenario-header {{ /* Use h3 style */
                background: var(--prompt-header-bg); padding: 10px 20px; cursor: pointer;
                font-size: 1.1em; font-weight: 400; color: var(--subheader-color);
                transition: background-color 0.3s, color 0.3s;
            }}
            .scenario-header:hover {{ background: var(--prompt-header-hover-bg); }}
            .content-block {{ /* Contains prompt, convo, debrief */
                padding: 15px 25px; border-top: 1px solid var(--border-color);
                background-color: var(--container-bg);
                transition: background-color 0.3s, border-color 0.3s;
            }}
            .scenario-prompt-display {{ /* Specific style for the initial prompt */
                font-style: italic; color: var(--subheader-color); margin-bottom: 1em;
                padding: 10px 15px; background-color: var(--prompt-display-bg);
                border-left: 3px solid var(--accent-border-color); white-space: pre-wrap;
                font-family: var(--font-body);
                transition: background-color 0.3s, border-color 0.3s, color 0.3s, font-style 0.3s;
            }}
            .collapsible-content {{ display: none; padding: 0; background-color: var(--container-bg); transition: background-color 0.3s; }}
            .expanded {{ display: block; }}
            .toggle-icon {{
                display: inline-block; width: 20px; text-align: center; font-weight: bold;
                margin-right: 8px; color: var(--toggle-icon-color); transition: color 0.3s;
            }}
            .rubric-scores-display {{
                 margin-top: 10px; padding: 10px; background-color: var(--judge-bg);
                 border: 1px dashed var(--judge-border); border-radius: 3px; font-size: 0.9em;
                 color: var(--judge-text); transition: background-color 0.3s, border-color 0.3s, color 0.3s;
            }}
            .rubric-scores-display table {{ width: auto; border-collapse: collapse; margin-top: 5px; }}
            .rubric-scores-display th, .rubric-scores-display td {{ border: 1px solid var(--judge-border); padding: 4px 8px; text-align: left; }}
            .rubric-scores-display th {{ background-color: rgba(0,0,0,0.05); }} /* Slight shade */

            /* --- Matchup Styling --- */
            .matchups-opponent-group {{ /* NEW: Container for one opponent's matchups */
                 margin-bottom: 30px; border: 1px solid var(--border-color); border-radius: 4px;
                 background-color: var(--container-bg); overflow: hidden; /* For border radius */
                 transition: background-color 0.3s, border-color 0.3s;
            }}
            .matchups-opponent-header {{ /* NEW: Collapsible header for opponent */
                 background: var(--iter-header-bg); padding: 12px 20px; cursor: pointer;
                 position: relative; border-bottom: 1px solid var(--border-color);
                 font-size: 1.2em; font-weight: 700; color: var(--header-color);
                 transition: background-color 0.3s, border-color 0.3s, color 0.3s;
            }}
             .matchups-opponent-header:hover {{ background: var(--iter-header-hover-bg); }}
             .matchups-opponent-content {{ /* NEW: Content div for opponent matchups */
                 padding: 15px 20px;
                 /* Inherits background from parent .matchups-opponent-group */
             }}

            .comparison-section {{ /* Individual matchup */
                 border-top: 1px dashed var(--accent-border-color); padding-top: 15px; margin-top: 15px;
            }}
            .matchups-opponent-content .comparison-section:first-child {{ border-top: none; margin-top: 0; padding-top: 0; }} /* Remove top border for first matchup in group */

            .comparison-header {{
                background: var(--prompt-header-bg);
                padding: 8px 15px;
                cursor: pointer;
                font-size: 1.05em;
                font-weight: 600;
                color: var(--subheader-color);
                border: 1px solid var(--border-color);
                border-radius: 4px;
                transition: background-color 0.3s, color 0.3s;
            }}
            .comparison-header:hover {{ background: var(--prompt-header-hover-bg); }}

            .info-box.small-info-box {{ font-size: 0.85em; padding: 8px; margin-bottom: 10px; }}
            .matchup-judge {{ background-color: var(--judge-bg); border-color: var(--judge-border); padding: 10px 15px; margin-bottom: 15px; }}
            .matchup-judge h5 {{ margin-top: 0; color: var(--judge-text); }}
            .matchup-judge pre {{ background-color: var(--container-bg); border-color: var(--border-color); color: var(--judge-text); font-size: 0.9em; }}
            .comparison-grid {{ display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-top: 15px; }}
            .model-column {{ border: 1px solid var(--border-color); padding: 0 15px 15px 15px; border-radius: 5px; background-color: var(--prompt-header-bg); }}
            .model-column h6 {{ border-bottom: 1px solid var(--border-color); padding-bottom: 5px; margin-bottom: 10px; }}
            .conversation {{ border: none; padding: 0; margin-top: 5px; background: transparent; }}
            .conversation .message {{ margin-bottom: 8px; padding: 8px; border-radius: 4px; border: 1px solid var(--border-color); }}
            .conversation .user {{ background-color: var(--iter-header-bg); border-left: 3px solid var(--link-color); }}
            .conversation .assistant {{ background-color: var(--container-bg); border-left: 3px solid var(--accent-border-color); }}
            .conversation strong {{ display: block; margin-bottom: 3px; font-size: 0.85em; color: var(--subheader-color); }}
            .judge-reasoning-block {{
                background: var(--judge-bg);
                border: 1px dashed var(--judge-border);
                padding: 10px 15px;
                margin: 10px 0 15px;
                border-radius: 3px;
                color: var(--judge-text);
            }}
            /* Ensure pre within conversation inherits body font but uses monospace */
            .conversation pre {{
                font-size: 0.9em; padding: 5px; margin-bottom: 0; background: transparent; border: none;
                font-family: monospace, var(--font-body);
            }}
            /* Style for structured assistant response headers */
            .conversation .message.assistant > strong:not(:first-child) {{ /* Target headers after the main "Assistant:" */
                 margin-top: 8px; /* Add space before section headers */
                 font-size: 0.9em;
                 color: var(--header-color);
            }}


            /* --- Theme Specific Overrides (Copied) --- */
            body.theme-cozy.dark-mode {{ box-shadow: 0 5px 20px var(--shadow-color); background-image: linear-gradient(to bottom, #211f21, #232022); }}
            body.theme-cozy.dark-mode .iteration-container,
            body.theme-cozy.dark-mode .matchups-opponent-group {{ /* Apply shadow to opponent group too */
                 box-shadow: 0 2px 8px #000000; border-color: var(--border-color);
            }}
            body.theme-cozy.dark-mode h1 {{ text-shadow: 0 1px 2px #000000; }}
            body.theme-cozy.dark-mode .content-block {{ border-color: var(--border-color); }}
            body.theme-cozy.dark-mode .scenario-prompt-display {{ border-left: 3px solid var(--accent-border-color); background-color: #362e2b; }}
            body.theme-modern {{ padding: 35px 45px; }}
            body.theme-modern h1 {{ font-weight: 600; border-bottom-width: 1px; }}
            body.theme-modern h2 {{ font-weight: 600; }}
            body.theme-modern h3 {{ font-weight: 500; font-style: normal; }}
            body.theme-modern .iteration-header,
            body.theme-modern .matchups-opponent-header {{ font-weight: 600; }} /* Apply to opponent header */
            body.theme-modern .scenario-header {{ font-weight: 500; font-style: normal; }}
            body.theme-modern .scenario-prompt-display {{ border-left-width: 4px; border-radius: 3px; font-style: normal; }}
            body.theme-modern .judge-content {{ border-style: solid; border-width: 1px; }} /* Keep if judge content used elsewhere */
            body.theme-modern strong {{ font-weight: 600; }}

            /* --- Font Application --- */
            h1.main-title, .back-button, .control-select, .form-check-label, .top-controls, .tab-button {{
                font-family: var(--font-ui) !important;
            }}
            /* Allow iteration/scenario/opponent headers to use selected font */
            .iteration-header,
            .scenario-header,
            .matchups-opponent-header,
            .comparison-header,
            .model-column h6,
            .matchup-judge h5,
            .conversation .message strong 
            h2, h3, h4, h5, h6 {{
                font-family: var(--font-body) !important;   /* was --font-heading */
            }}

            /* Body text uses --font-body */
            body, .rubric-scores-display {{
                 font-family: var(--font-body);
            }}
             /* Ensure monospace for actual code/convo blocks */
            pre, .conversation pre, .matchup-judge pre, .content-block pre {{
                font-family: var(--font-body) !important;
            }}


            /* --- Mobile Responsiveness --- */
            /* --- Mobile Responsiveness --- */
        @media screen and (max-width: 900px) {{
            /* overall page chrome - UPDATED */
            body {{
                max-width: 100%;
                margin: 10px !important;      /* force zero margin */
                padding: 0px !important;     /* force zero padding */
                border: none !important;   /* remove border completely */
                box-shadow: none !important;
                overflow-x: hidden;        /* prevent horizontal scrolling */
            }}

            /* top-level containers - ADDED */
            .tab-content {{
                padding: 5px 0 0 0; /* minimal top padding, no side padding */
            }}

            /* title sizes */
            h1 {{ font-size: 1.8em; padding: 0 4px; }}
            h2 {{ font-size: 1.4em; }}
            h3 {{ font-size: 1.2em; }}

            /* control bar */
            .top-controls {{ 
                flex-direction: column; 
                align-items: flex-start; 
                gap: 8px; 
                padding: 5px;      /* add minimal padding */
                margin: 0;         /* remove margin */
            }}
            .controls-right {{ width: 100%; justify-content: space-between; }}

            /* headers & blocks */
            .iteration-header,
            .scenario-header,
            .matchups-opponent-header {{ 
                padding: 10px 6px; 
            }}

            .content-block,
            .matchups-opponent-content {{ 
                padding: 10px 6px; 
            }}

            /* buttons & tab bar */
            .tab-container {{
                margin: 10px 0;     /* reduce vertical margin, eliminate horizontal */
            }}
            .tab-button {{ 
                padding: 6px 8px; 
                font-size: 0.9em;
                margin-right: 2px;  /* reduce space between tabs */
            }}

            /* ensure side-by-side columns stack */
            .comparison-grid {{ grid-template-columns: 1fr; }}

            /* tighten matchup & scenario blocks */
            .matchups-opponent-content,
            .content-block {{
                padding: 5px 0;     /* minimal vertical padding, no horizontal padding */
            }}

            /* kill the grid's left / right gap when it collapses to one column */
            .comparison-grid {{
                gap: 12px 0;        /* keep vertical breathing-room only */
                margin: 0;          /* no margin */
            }}

            /* ---------- remove all horizontal margin/padding from containers ---------- */

            /* 1 – outer wrapper (.iteration-container / opponent group) */
            .iteration-container,
            .matchups-opponent-group {{
                margin: 10px 0;     /* keep minimal vertical margin, zero horizontal */
                border-left: none;  /* remove side borders */
                border-right: none;
                border-radius: 0;   /* remove border radius */
                width: 100%;        /* ensure full width */
            }}

            /* 2 – model column inside the comparison grid */
            .model-column {{
                margin: 5px 0;      /* minimal vertical margin, no horizontal */
                border-left: none;  /* remove side borders */
                border-right: none;
                padding-left: 2px;  /* minimal horizontal padding */
                padding-right: 2px;
                border-radius: 0;   /* remove border radius */
            }}

            /* 3 – individual conversation bubbles */
            .conversation .message {{
                margin: 5px 0;      /* minimal vertical margin, no horizontal */
                border-left-width: 1px;
                border-right-width: 0;
                border-radius: 0;   /* remove border radius */
            }}

            /* 4 - header info box */
            .header-info {{
                margin: 5px 0;      /* minimal vertical margin, no horizontal */
                padding: 10px 5px;  /* reduce padding */
                border-radius: 0;   /* remove border radius */
            }}

            /* 5 - other info boxes and panels */
            .info-box,
            .rubric-scores-display,
            .matchup-judge,
            .judge-reasoning-block,
            pre {{
                margin: 5px 0;     /* minimal vertical margin, no horizontal */
                padding: 5px;      /* minimal padding */
                border-radius: 0;  /* remove border radius */
            }}
        }}

        </style>
        <!-- Font Imports (Keep from Creative Writing) -->
        <link rel="preconnect" href="https://fonts.googleapis.com">
        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
        <link href="https://fonts.googleapis.com/css2?family=Lora:ital,wght@0,400..700;1,400..700&family=Merriweather:ital,wght@0,300;0,400;0,700;1,300;1,400;1,700&display=swap" rel="stylesheet">
        <!-- Note: Other fonts are loaded dynamically via JS -->
    </head>
    <body class="theme-modern">
        <div class="top-controls">
             <div class="nav-left">
                 <a href="javascript:history.back()" class="back-button">← Back</a>
             </div>
             <div class="controls-right">
                 <div class="control-select-wrapper">
                     <select id="themeSelector" class="control-select" aria-label="Select Theme">
                        <option value="cozy">Cozy</option>
                        <option value="modern" selected>Modern</option>
                    </select>
                 </div>
                 <div class="control-select-wrapper">
                     <select id="fontSelector" class="control-select" aria-label="Select Font">
                         <option value="tiempos">Tiempos Text</option>
                         <option value="bookerly">Bookerly</option>
                         <option value="bitter">Bitter Pro</option>
                         <option value="roboto">Roboto</option>
                         <option value="inter">Inter</option>
                         <option value="source_sans">Source Sans 3</option>
                         <option value="open_sans">Open Sans</option>
                         <option value="fira_sans">Fira Sans</option>
                         <option value="besley">Besley</option>
                     </select>
                 </div>
                 <div class="mode-toggle">
                     <input class="form-check-input" type="checkbox" id="darkModeToggle">
                     <label class="form-check-label" for="darkModeToggle" id="toggleLabel">Light</label>
                 </div>
             </div>
         </div>

        <h1 class="main-title">EQBench3 Report: {display_model_name}</h1>

        <div class="header-info">
            <strong>Model:</strong> {html.escape(original_model_name)}<br>
            <strong>Run Key:</strong> {html.escape(run_key)}<br>
            <strong>ELO (Normalized):</strong> {elo_norm_display}<br>
            <strong>ELO (Raw):</strong> {elo_raw_display}<br>
            <strong>Rubric Score (0-100):</strong> {rubric_score_display}
        </div>

        <!-- Tab Navigation -->
        <div class="tab-container">
            <button class="tab-button active" onclick="openTab(event, 'scenarioOutputs')">Scenario Outputs</button>
            <button class="tab-button" onclick="openTab(event, 'matchups')">Matchups</button>
        </div>

        <!-- Tab Content: Scenario Outputs -->
        <div id="scenarioOutputs" class="tab-content active">
            <h2>Scenario Outputs</h2>
    """

    # --- Process Scenario Tasks ---
    iterations_data = defaultdict(list) # { iter_idx: [task_data, ...] }
    if isinstance(scenario_tasks, dict):
        for iter_idx_str, scenarios_in_iter in scenario_tasks.items():
            if isinstance(scenarios_in_iter, dict):
                try:
                    iter_idx = int(iter_idx_str)
                    for scenario_id, task_data in scenarios_in_iter.items():
                         if isinstance(task_data, dict) and task_data.get("status") in ["completed", "rubric_scored", "error"]: # Include errored tasks
                            task_data['scenario_id_parsed'] = scenario_id # Add scenario_id for sorting/display
                            iterations_data[iter_idx].append(task_data)
                except ValueError:
                    print(f"Warning: Invalid iteration index '{iter_idx_str}' in run {run_key}. Skipping.")
    else:
        html_output += "<p class='error-message'>Scenario task data is missing or not in the expected format.</p>"

    # Sort iterations by index
    sorted_iterations = sorted(iterations_data.items())

    if not sorted_iterations:
         html_output += "<p class='missing-data'>No completed or processed scenario tasks found for this run.</p>"

    for display_idx, (iter_idx, tasks_in_iter) in enumerate(sorted_iterations):
        is_first_iter = display_idx == 0
        html_output += f"""
        <div class="iteration-container">
            <div class="iteration-header" onclick="toggleContent('iteration-{iter_idx}')">
                <span class="toggle-icon">{'−' if is_first_iter else '+'}</span>
                Iteration {iter_idx} ({len(tasks_in_iter)} scenarios processed)
            </div>
            <div id="iteration-{iter_idx}" class="collapsible-content {'expanded' if is_first_iter else ''}">
        """
        # Sort scenarios within iteration (e.g., numerically by ID)
        def get_scenario_sort_key(task: dict) -> tuple[int, int]:
            """
            Primary key  = explicit master order
            Secondary    = numeric ID (so unknown IDs still sort sensibly)
            """
            sid = str(task.get('scenario_id_parsed', ''))
            rank = _SCENARIO_RANK.get(sid, _MAX_RANK_FALLBACK)
            try:
                sid_int = int(sid)
            except ValueError:
                sid_int = _MAX_RANK_FALLBACK
            return (rank, sid_int)

        tasks_in_iter.sort(key=get_scenario_sort_key)

        for task_data in tasks_in_iter:
            scenario_id = task_data.get('scenario_id_parsed', 'N/A')
            scenario_html_id = f"scenario-{iter_idx}-{scenario_id}"
            status = task_data.get("status", "Unknown")
            error_msg = task_data.get("error")

            # Extract data safely
            prompts = task_data.get("prompts", [])
            scenario_prompt = prompts[0] if prompts else "N/A"
            conversation_history = task_data.get("conversation_history", [])
            debrief_response = task_data.get("debrief_response") # Can be None or empty string
            rubric_scores = task_data.get("rubric_scores") # Dict or None
            raw_judge = task_data.get("raw_rubric_judge_text")
            judge_reasoning = extract_judge_reasoning(raw_judge)

            label_cat, label_title = scenario_label_str(scenario_id)
            scenario_label = f"{html.escape(label_cat)} | {html.escape(label_title)}"

            score_raw  = rubric_score_0_100(rubric_scores)
            score_part = f" | Score: {score_raw}" if score_raw != "Unscored" else " | Unscored"

            html_output += f"""
                        <div class="scenario-container">
                            <div class="scenario-header" onclick="toggleContent('{scenario_html_id}')">
                                <span class="toggle-icon">+</span>
                                {scenario_label}{score_part}
                                {f'<span style="color: var(--error-message-color);"> - Error</span>' if status == "error" or score_raw == "Unscored" else ""}
                            </div>
                            <div id="{scenario_html_id}" class="collapsible-content">
                                <div class="content-block">
                                    <h5>Conversation History:</h5>
                                    {format_conversation_for_html(conversation_history, scenario_id)}
                                    <h5>Self-Debrief:</h5>
                                    <pre>{html.escape(debrief_response) if debrief_response is not None else "<span class='missing-data'>(No debrief response)</span>"}</pre>
            """

            if judge_reasoning:
                html_output += f"""
                <div class="judge-reasoning-block">
                    <strong>Judge Analysis:</strong>
                    <pre>{html.escape(judge_reasoning)}</pre>
                </div>
                """
            # Display Rubric Scores if available
            if isinstance(rubric_scores, dict):
                html_output += "<div class='rubric-scores-display'><strong>Rubric Scores:</strong><br>"
                # Filter out non-score entries like reasoning if present
                scores_only = {k: v for k, v in rubric_scores.items() if isinstance(v, (int, float))}
                if scores_only:
                     html_output += "<table><thead><tr><th>Metric</th><th>Score</th></tr></thead><tbody>"
                     for metric, score in sorted(scores_only.items()):
                         html_output += f"<tr><td>{html.escape(metric)}</td><td>{html.escape(str(score))}</td></tr>"
                     html_output += "</tbody></table>"
                else:
                     html_output += "<span class='missing-data'>(No numeric scores found)</span>"
                html_output += "</div>"
            elif task_data.get("status") == "rubric_scored": # Status says scored but no dict found
                 html_output += "<div class='rubric-scores-display'><span class='missing-data'>(Rubric scores expected but not found)</span></div>"

            # Display error message if status is error
            if status == "error" and error_msg:
                 html_output += f"<div class='error-message' style='margin-top: 10px;'><strong>Error Details:</strong> {html.escape(error_msg)}</div>"

            html_output += """
                    </div>
                </div>
            </div>""" # End scenario-container
        html_output += """
            </div>
        </div>""" # End iteration-container

    html_output += """
        </div> <!-- End Tab Content: scenarioOutputs -->

        <!-- Tab Content: Matchups -->
        <div id="matchups" class="tab-content">
            <h2>Pairwise Matchups</h2>
    """

    # --- Process Matchups ---
    model_comparisons = []
    if all_comparisons:
        target_model_set = {original_model_name} # Use the name from the run data
        for entry in all_comparisons:
            pair = entry.get("pair", {})
            test_model = pair.get("test_model")
            neighbor_model = pair.get("neighbor_model")
            if test_model and neighbor_model and target_model_set.intersection({test_model, neighbor_model}):
                 model_comparisons.append(entry)
    else:
         html_output += "<p class='missing-data'>No global comparison data found in ELO results file.</p>"

    # ── Deduplicate mirror match‑ups ──────────────────────────────
    # A scenario/iteration can appear twice—once with your model in
    # test_model and once in neighbor_model. Keep only the first.
    deduped = {}
    for entry in model_comparisons:
        sid      = entry.get("scenario_id")
        iter_idx = entry.get("pair", {}).get("iteration_index")
        a        = entry.get("pair", {}).get("test_model")
        b        = entry.get("pair", {}).get("neighbor_model")
        key = (sid, iter_idx, frozenset({a, b}))  # unordered pair key
        if key not in deduped:
            deduped[key] = entry

    #model_comparisons = list(deduped.values())

    if not model_comparisons:
        html_output += f"<p class='missing-data'>No matchups involving {display_model_name} found.</p>"
    else:
        # Group comparisons by opponent
        comparisons_by_opponent = defaultdict(list)
        for entry in model_comparisons:
             pair = entry.get("pair", {})
             test_model = pair.get("test_model")
             neighbor_model = pair.get("neighbor_model")
             opponent = neighbor_model if test_model == original_model_name else test_model
             comparisons_by_opponent[opponent].append(entry)

        # Sort opponents (e.g., alphabetically by display name)
        sorted_opponents = sorted(comparisons_by_opponent.keys(), key=lambda x: get_updated_model_name(x))

        for opponent_name in sorted_opponents:
            opponent_display_name = get_updated_model_name(opponent_name)
            opponent_sanitized = sanitize_model_name(opponent_name) # For ID
            opponent_comparisons = comparisons_by_opponent[opponent_name]
            # Sort comparisons within opponent group (e.g., by scenario, then iteration)
            def _comparison_sort_key(comp: dict) -> tuple[int, int]:
                sid = str(comp.get("scenario_id", ''))
                rank = _SCENARIO_RANK.get(sid, _MAX_RANK_FALLBACK)
                iter_idx = comp.get("pair", {}).get("iteration_index", 0)
                return (rank, iter_idx)

            opponent_comparisons.sort(key=_comparison_sort_key)


            opponent_group_id = f"opponent-group-{opponent_sanitized}"
            html_output += f"""
            <div class="matchups-opponent-group">
                 <div class="matchups-opponent-header" onclick="toggleContent('{opponent_group_id}')">
                     <span class="toggle-icon">+</span>
                     vs. {html.escape(opponent_display_name)} ({len(opponent_comparisons)} comparisons)
                 </div>
                 <div id="{opponent_group_id}" class="collapsible-content matchups-opponent-content">
            """
            processed_count = 0
            error_count = 0
            for idx, comparison_entry in enumerate(opponent_comparisons, start=1):
                 try:
                    fragment = generate_matchup_section_html(
                        comparison_entry=comparison_entry,
                        results_data=runs_data,
                        target_model=original_model_name,
                        uniq=str(idx),          # ensures every ID is distinct
                    )
                    # Check if the fragment indicates an error before appending
                    if "error-message" not in fragment:
                        html_output += fragment
                        processed_count += 1
                    else:
                        html_output += fragment # Append error message fragment
                        error_count += 1
                 except Exception as e:
                    scenario_id = comparison_entry.get("scenario_id", "UNK_S")
                    iter_idx = comparison_entry.get("pair", {}).get("iteration_index", "UNK_I")
                    print(f"!! Critical Error generating matchup section S:{scenario_id} I:{iter_idx} vs {opponent_name}: {e}")
                    html_output += f"<div class='comparison-section error-message'>Error generating section for S:{scenario_id}/I:{iter_idx}. See console logs.</div>"
                    error_count += 1

            if error_count > 0:
                 print(f"  -> Encountered errors in {error_count} matchup sections against {opponent_name}.")
            html_output += """
                 </div>
            </div>""" # End matchups-opponent-group

    html_output += """
        </div> <!-- End Tab Content: matchups -->

        <!-- JavaScript (Adapted from Creative Writing) -->
        <!-- pako for gzip inflation -->
        <script src="https://cdnjs.cloudflare.com/ajax/libs/pako/2.1.0/pako.min.js"            
            crossorigin="anonymous" referrerpolicy="no-referrer"></script>
        <script>
            // --- DOM Elements ---
            const body = document.body;
            const themeSelector = document.getElementById('themeSelector');
            const fontSelector = document.getElementById('fontSelector');
            const darkModeToggle = document.getElementById('darkModeToggle');
            const toggleLabel = document.getElementById('toggleLabel');
            const tabButtons = document.querySelectorAll('.tab-button');
            const tabContents = document.querySelectorAll('.tab-content');

            // --- Constants (Font Maps, Defaults - Copied) ---
            const FONT_MAP = { /* ... Font mappings ... */
                'tiempos': "'Tiempos Text', Georgia, serif", 'bookerly': "'Bookerly', Georgia, serif",
                'bitter': "'Bitter Pro', Georgia, serif", 'roboto': "'Roboto', sans-serif",
                'inter': "'Inter', sans-serif", 'source_sans': "'Source Sans 3', sans-serif",
                'open_sans': "'Open Sans', sans-serif", 'fira_sans': "'Fira Sans', sans-serif",
                'besley': "'Besley', 'Merriweather', serif"
            };
            const FONT_DEFINITIONS = { /* ... Font definitions with URLs ... */
                'tiempos': { family: 'Tiempos Text', variants: [ { weight: 400, style: 'normal', url: 'fonts/tiempos_text/TiemposText-Regular.woff2' }, { weight: 400, style: 'italic', url: 'fonts/tiempos_text/TiemposText-RegularItalic.woff2' }, { weight: 700, style: 'normal', url: 'fonts/tiempos_text/TiemposText-Bold.woff2' } ], fallback: 'Georgia, serif' },
                'bookerly': { family: 'Bookerly', variants: [ { weight: 400, style: 'normal', url: 'fonts/bookerly/Bookerly.woff' }, { weight: 400, style: 'italic', url: 'fonts/bookerly/Bookerly Italic.woff' }, { weight: 700, style: 'normal', url: 'fonts/bookerly/Bookerly Bold.woff' } ], fallback: 'Georgia, serif' },
                'bitter': { family: 'Bitter Pro', variants: [ { weight: 400, style: 'normal', url: 'fonts/bitter_pro/BitterPro-Regular.ttf' }, { weight: 400, style: 'italic', url: 'fonts/bitter_pro/BitterPro-Italic.ttf' }, { weight: 700, style: 'normal', url: 'fonts/bitter_pro/BitterPro-Bold.ttf' } ], fallback: 'Georgia, serif' },
                'roboto': { family: 'Roboto', variants: [ { weight: 400, style: 'normal', url: 'fonts/roboto/static/Roboto-Regular.ttf' }, { weight: 400, style: 'italic', url: 'fonts/roboto/static/Roboto-Italic.ttf' }, { weight: 700, style: 'normal', url: 'fonts/roboto/static/Roboto-Bold.ttf' } ], fallback: 'sans-serif' },
                'inter': { family: 'Inter', variants: [ { weight: 400, style: 'normal', url: 'fonts/inter/static/Inter-Regular.ttf' }, { weight: 400, style: 'italic', url: 'fonts/inter/static/Inter-Italic.ttf' }, { weight: 700, style: 'normal', url: 'fonts/inter/static/Inter-Bold.ttf' } ], fallback: 'sans-serif' },
                'source_sans': { family: 'Source Sans 3', variants: [ { weight: 400, style: 'normal', url: 'fonts/source_sans_3/static/SourceSans3-Regular.ttf' }, { weight: 400, style: 'italic', url: 'fonts/source_sans_3/static/SourceSans3-Italic.ttf' }, { weight: 700, style: 'normal', url: 'fonts/source_sans_3/static/SourceSans3-Bold.ttf' } ], fallback: 'sans-serif' },
                'open_sans': { family: 'Open Sans', variants: [ { weight: 400, style: 'normal', url: 'fonts/open_sans/static/OpenSans-Regular.ttf' }, { weight: 400, style: 'italic', url: 'fonts/open_sans/static/OpenSans-Italic.ttf' }, { weight: 700, style: 'normal', url: 'fonts/open_sans/static/OpenSans-Bold.ttf' } ], fallback: 'sans-serif' },
                'fira_sans': { family: 'Fira Sans', variants: [ { weight: 400, style: 'normal', url: 'fonts/fira_sans/FiraSans-Regular.ttf' }, { weight: 400, style: 'italic', url: 'fonts/fira_sans/FiraSans-Italic.ttf' }, { weight: 700, style: 'normal', url: 'fonts/fira_sans/FiraSans-Bold.ttf' } ], fallback: 'sans-serif' },
                'besley': { family: 'Besley', variants: [ { weight: 400, style: 'normal', url: 'fonts/besley/Besley-VariableFont_wght.ttf' }, { weight: 400, style: 'italic', url: 'fonts/besley/Besley-Italic-VariableFont_wght.ttf' } ], fallback: 'serif' }
            };
            const SANS_FONTS = ['roboto', 'inter', 'source_sans', 'open_sans', 'fira_sans'];
            const THEME_DEFAULT_FONTS = { 'cozy': 'tiempos', 'modern': 'open_sans' };
            const THEME_DEFAULT_HEAD_FONTS = { 'cozy': "'Lora', serif", 'modern': "'Besley', 'Merriweather', serif" };
            const loadedFonts = new Set();
            const STORAGE_PREFIX = 'eqbench_report_'; // Use different prefix
            const KEYS = {
                THEME: `${STORAGE_PREFIX}theme`, FONT: `${STORAGE_PREFIX}font`,
                DARK_MODE: `${STORAGE_PREFIX}darkMode`, ACTIVE_TAB: `${STORAGE_PREFIX}activeTab`
            };

            // --- Dynamic Font Loading (Copied) ---
            async function loadFontFace(fontKey) { /* ... Same as creative writing ... */
                 if (loadedFonts.has(fontKey)) return;
                 const fontDef = FONT_DEFINITIONS[fontKey];
                 if (!fontDef) { console.warn(`Font definition not found for: ${fontKey}`); return; }
                 try {
                     const fontLoadPromises = fontDef.variants.map(variant => {
                         const fontFace = new FontFace(fontDef.family, `url(${variant.url})`, { weight: variant.weight, style: variant.style });
                         return fontFace.load().then(loadedFont => { document.fonts.add(loadedFont); return loadedFont; });
                     });
                     await Promise.all(fontLoadPromises);
                     loadedFonts.add(fontKey); console.log(`Loaded font: ${fontDef.family}`);
                 } catch (err) { console.error(`Error loading font ${fontDef.family}:`, err); }
            }

            // --- Content Toggling (Copied) ---
            function toggleContent(id) { /* ... Same as creative writing ... */
                 const element = document.getElementById(id); if (!element) return;
                 const isExpanded = element.classList.contains('expanded');
                 const header = element.previousElementSibling; // Assumes header is directly before content
                 const toggleIcon = header ? header.querySelector('.toggle-icon') : null;
                 if (isExpanded) { element.classList.remove('expanded'); if (toggleIcon) toggleIcon.textContent = '+'; }
                 else { element.classList.add('expanded'); if (toggleIcon) toggleIcon.textContent = '−'; }
            }

            // --- Shared Settings Saving (Copied, uses new KEYS) ---
            function saveSettings(type, value) { localStorage.setItem(KEYS[type], value); }

            // --- Dark Mode (Copied, uses new KEYS) ---
            function setDarkMode(isDark) { /* ... Same as creative writing, uses new KEYS.DARK_MODE ... */
                 body.classList.toggle('dark-mode', isDark);
                 toggleLabel.textContent = isDark ? 'Dark' : 'Light';
                 if (darkModeToggle.checked !== isDark) { darkModeToggle.checked = isDark; }
                 saveSettings('DARK_MODE', isDark);
            }

            // --- Theme Selection (Copied, uses new KEYS) ---
            function applyTheme(themeName) { /* ... Same as creative writing, uses new KEYS.THEME/FONT ... */
                 body.classList.remove('theme-cozy', 'theme-modern'); body.classList.add(`theme-${themeName}`);
                 if (themeSelector.value !== themeName) { themeSelector.value = themeName; }
                 saveSettings('THEME', themeName);
                 const savedFont = localStorage.getItem(KEYS.FONT);
                 const defaultFont = THEME_DEFAULT_FONTS[themeName] || 'tiempos';
                 applyFont(savedFont || defaultFont); // Apply font after theme change
            }

            // --- Font Selection (Copied, uses new KEYS) ---
            async function applyFont(fontValue) { /* ... Same as creative writing, uses new KEYS.FONT/THEME ... */
                 await loadFontFace(fontValue); // Ensure font is loaded
                 const fontFamily = FONT_MAP[fontValue];
                 const currentTheme = localStorage.getItem(KEYS.THEME) || 'cozy';
                 let headingFontFamily = THEME_DEFAULT_HEAD_FONTS[currentTheme];
                 if (fontFamily) {
                     body.style.setProperty('--font-body', fontFamily);
                     // Determine heading font based on theme and selected body font
                     if (currentTheme === 'modern') { headingFontFamily = THEME_DEFAULT_HEAD_FONTS['modern']; }
                     else { headingFontFamily = THEME_DEFAULT_HEAD_FONTS['cozy']; }
                     // Special case for Besley
                     if (fontValue === 'besley') { headingFontFamily = FONT_MAP['besley']; }

                     body.style.setProperty('--font-heading', headingFontFamily);

                     if (fontSelector.value !== fontValue) { fontSelector.value = fontValue; }
                     saveSettings('FONT', fontValue);
                 } else {
                     console.warn("Font value not found:", fontValue);
                     const theme = localStorage.getItem(KEYS.THEME) || 'cozy';
                     applyFont(THEME_DEFAULT_FONTS[theme]); // Fallback to theme default
                 }
            }

            // --- Tab Switching ---
            function openTab(evt, tabName) {
                // Hide all tab content
                tabContents.forEach(content => content.classList.remove('active'));
                // Deactivate all tab buttons
                tabButtons.forEach(button => button.classList.remove('active'));
                // Show the selected tab content
                const targetTabContent = document.getElementById(tabName);
                if (targetTabContent) {
                    targetTabContent.classList.add('active');
                }
                // Activate the clicked tab button
                if (evt && evt.currentTarget) {
                    evt.currentTarget.classList.add('active');
                }
                // Save the active tab
                saveSettings('ACTIVE_TAB', tabName);
            }

            // --- Initial Settings Application (Adapted) ---
            async function applyInitialSettings() {
                // Dark Mode
                const savedDarkMode = localStorage.getItem(KEYS.DARK_MODE);
                const prefersDark = window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches;
                setDarkMode(savedDarkMode !== null ? (savedDarkMode === 'true') : prefersDark);
                // Theme
                const savedTheme = localStorage.getItem(KEYS.THEME) || 'modern';
                applyTheme(savedTheme); // This will also trigger applyFont
                // Font (Set selector value after applyTheme->applyFont finishes)
                const savedFont = localStorage.getItem(KEYS.FONT) || THEME_DEFAULT_FONTS[savedTheme];
                await applyFont(savedFont); // Ensure font is applied before setting selector
                fontSelector.value = savedFont; // Set selector value

                // Active Tab
                const savedTab = localStorage.getItem(KEYS.ACTIVE_TAB) || 'scenarioOutputs';
                const savedTabButton = document.querySelector(`.tab-button[onclick*="'${savedTab}'"]`);
                if (savedTabButton) {
                     // Directly manipulate classes instead of simulating click
                     tabContents.forEach(content => content.classList.remove('active'));
                     tabButtons.forEach(button => button.classList.remove('active'));
                     const targetTabContent = document.getElementById(savedTab);
                     if (targetTabContent) targetTabContent.classList.add('active');
                     savedTabButton.classList.add('active');
                } else {
                     // Fallback if saved tab button not found
                     const firstTabContent = document.getElementById('scenarioOutputs');
                     const firstTabButton = document.querySelector('.tab-button');
                     if (firstTabContent) firstTabContent.classList.add('active');
                     if (firstTabButton) firstTabButton.classList.add('active');
                }
            }

            // --- Run Initialization ---
            // Use DOMContentLoaded to ensure elements are ready before applying settings
            document.addEventListener('DOMContentLoaded', applyInitialSettings);

            /* --------------------------------------------------
            Wire up the style controls
            -------------------------------------------------- */
            darkModeToggle.addEventListener('change', () => {
            setDarkMode(darkModeToggle.checked);
            });

            themeSelector.addEventListener('change', () => {
            applyTheme(themeSelector.value);
            });

            fontSelector.addEventListener('change', () => {
            applyFont(fontSelector.value);
            });

            // System theme listener
            window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', event => {
                if (localStorage.getItem(KEYS.DARK_MODE) === null) { // Only if user hasn't set preference
                    setDarkMode(event.matches);
                }
            });

            /**
            * Expand or collapse a compressed‑HTML matchup section.
            * ------------------------------------------------------------------
            *  • The target container must have  id="<id>"  and
            *      data-matchup="<base64‑encoded, gzip|raw‑deflate compressed html>"
            *  • The function caches the decompressed HTML so subsequent toggles
            *    don’t re‑inflate the payload.
            *  • If the section is already visible the call collapses it.
            *
            * Requires:  pako 2.x  (https://cdnjs.cloudflare.com/ajax/libs/pako/2.1.0/pako.min.js)
            */
            function expandMatchup(id) {
                const container = document.getElementById(id);
                if (!container) {
                    console.error(`expandMatchup: element #${id} not found`);
                    return;
                }

                /* ---------- toggle collapse if already expanded ---------- */
                if (container.dataset.expanded === 'true') {
                    container.style.display = 'none';
                    container.dataset.expanded = 'false';
                    return;
                }

                /* ---------- use cached html if we decompressed before ---------- */
                if (container.dataset.decompressed) {
                    container.innerHTML = container.dataset.decompressed;
                    container.style.display = 'block';
                    container.dataset.expanded = 'true';
                    return;
                }

                /* ---------- first‑time expand: decode + inflate ---------- */
                const b64 = container.dataset.matchup || container.getAttribute('data-matchup');
                if (!b64) {
                    console.error(`expandMatchup: #${id} is missing data-matchup attribute`);
                    return;
                }

                try {
                    /* 1) base64 ⇨ Uint8Array */
                    const binary = atob(b64);
                    const bytes  = new Uint8Array(binary.length);
                    for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);

                    /* 2) gzip first, raw‑deflate fallback */
                    let html;
                    try {
                        html = pako.ungzip(bytes, { to: 'string' });
                    } catch (gzipErr) {
                        html = pako.inflateRaw(bytes, { to: 'string' });
                    }

                    /* 3) inject + cache */
                    container.innerHTML = html;
                    container.dataset.decompressed = html;
                    container.style.display = 'block';
                    container.dataset.expanded = 'true';

                } catch (err) {
                    console.error('expandMatchup: decompression failed', err);
                }
            }


        </script>
    </body>
    </html>
    """

    # --- Save to File ---
    if save_to_file:
        os.makedirs("results", exist_ok=True)
        # Use the display name for the filename
        sanitized_name = sanitize_model_name(display_model_name)
        filename = f"results/{sanitized_name}.html"
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(html_output)
            print(f"Report saved to {filename}")
        except IOError as e:
            print(f"Error saving report to {filename}: {e}")
        except Exception as e:
            print(f"An unexpected error occurred during file saving: {e}")


    return HTML(html_output)


# --- Wrapper Functions ---
def view_model_report(model_name, run_key=None, save_to_file=False):
    """Build a model's HTML report and render it inline.

    All three arguments are forwarded unchanged to
    generate_model_report_eqbench; the object it returns is handed to
    IPython's display(). Nothing is returned.
    """
    display(generate_model_report_eqbench(model_name, run_key, save_to_file))

def save_model_report(model_name, run_key=None):
    """Build a model's HTML report with saving enabled.

    Delegates to generate_model_report_eqbench with save_to_file=True. The
    object that call returns is deliberately discarded, so nothing is
    displayed and nothing is returned.
    """
    generate_model_report_eqbench(
        model_name,
        run_key,
        save_to_file=True,
    )

def list_available_models_eqbench():
    """Print the ELO leaderboard and return the model names in ranked order.

    Loads ELO_RESULTS_FILE through load_json_file. Each entry is ranked by
    ``elo_norm`` when that is numeric, otherwise by the raw ``elo``; entries
    with neither sort last. The ``__metadata__`` key is ignored and non-dict
    entries are reported and skipped.

    Returns:
        list: Original (un-substituted) model names, best first. An empty
        list when the ELO file yields no data.
    """
    elo_data = load_json_file(ELO_RESULTS_FILE)
    if not elo_data:
        print(f"No ELO data found in {ELO_RESULTS_FILE}.")
        return []

    def _is_number(value):
        return isinstance(value, (int, float))

    def _fmt(value):
        # One decimal for numbers, a placeholder for anything else.
        return f"{value:.1f}" if _is_number(value) else "N/A"

    print(f"Available models in {ELO_RESULTS_FILE} (sorted by Normalized ELO):")

    entries = []
    for model_name, model_data in elo_data.items():
        if model_name == "__metadata__":
            continue
        if not isinstance(model_data, dict):
            print(f"Warning: Skipping invalid entry for '{model_name}' in ELO data.")
            continue

        elo_norm, elo_raw = model_data.get("elo_norm"), model_data.get("elo")
        if _is_number(elo_norm):
            sort_score = elo_norm
        elif _is_number(elo_raw):
            sort_score = elo_raw  # normalized value missing: rank by raw ELO
        else:
            sort_score = -float('inf')
        entries.append((sort_score, model_name, elo_norm, elo_raw))

    # Stable descending sort, so ties keep their order from the file.
    ranked = sorted(entries, key=lambda entry: entry[0], reverse=True)

    model_names_only = []
    for rank, (_, name, elo_n, elo_r) in enumerate(ranked, 1):
        print(f"{rank}. {get_updated_model_name(name)} (ELO Norm: {_fmt(elo_n)}, Raw: {_fmt(elo_r)})")
        model_names_only.append(name)  # callers get the original name

    return model_names_only

def _parse_run_start_time(value):
    """Return *value* parsed as a timezone-aware datetime, or None if unusable."""
    if not isinstance(value, str):
        return None
    try:
        parsed = datetime.fromisoformat(value.replace('Z', '+00:00'))
    except ValueError:
        return None  # not an ISO-8601 timestamp
    if parsed.utcoffset() is None:
        # Naive and aware datetimes cannot be compared, so give naive values
        # an offset. UTC is an assumption used for ordering only.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def list_model_runs_eqbench(model_name):
    """Print and return the run keys recorded for a model, newest first.

    Loads RUNS_FILE through load_json_file and keeps the dict entries whose
    ``test_model`` equals either *model_name* or its substituted display name
    (get_updated_model_name). Runs are ordered by ``start_time`` descending;
    runs whose start time is missing or cannot be parsed are listed last.

    Args:
        model_name: Model identifier, original or display form.

    Returns:
        list: Matching run keys in printed order; empty when the runs file
        yields no data or no run matches.
    """
    runs_data = load_json_file(RUNS_FILE)
    if not runs_data:
        print(f"No runs data found in {RUNS_FILE}.")
        return []

    display_name = get_updated_model_name(model_name)
    # Check both original and substituted names
    names_to_check = {model_name, display_name}

    matching_runs = []
    for key, data in runs_data.items():
        if not isinstance(data, dict) or data.get("test_model") not in names_to_check:
            continue
        raw_start = data.get("start_time")
        # A missing or null start_time is shown as "Unknown Time"; any other
        # value is shown as-is (stringified if it is not already a string).
        start_time_str = "Unknown Time" if raw_start is None else str(raw_start)
        status = data.get("status", "Unknown Status")
        matching_runs.append((key, start_time_str, status, _parse_run_start_time(raw_start)))

    if not matching_runs:
        print(f"No runs found for model: {display_name}")
        return []

    print(f"\nAvailable runs for {display_name}:")
    # Newest first. Runs without a usable timestamp get the smallest possible
    # aware datetime, which places them at the end of the descending order.
    oldest = datetime.min.replace(tzinfo=timezone.utc)
    matching_runs.sort(key=lambda run: run[3] if run[3] is not None else oldest, reverse=True)

    run_keys_only = []
    for idx, (key, time_str, status, _) in enumerate(matching_runs, 1):
        print(f"{idx}. {key} (Started: {time_str}, Status: {status})")
        run_keys_only.append(key)

    return run_keys_only


# --- Main Execution Block ---
if __name__ == "__main__":
    # Reports are written under ./results, so make sure it exists up front.
    os.makedirs("results", exist_ok=True)

    # Step 1: print the leaderboard and collect the model names.
    print("--- Available Models ---")
    models = list_available_models_eqbench()
    print("-" * 24)

    calculate_and_print_metrics_eqbench()

    # Step 2: generate and save an HTML report for every non-ignored model.
    print("\nGenerating and saving HTML reports for all models...")
    if not models:
        print("\nNo models found in ELO data to generate reports for.")
    else:
        saved_count = error_count = 0
        for model in models:
            if model in MODELS_TO_IGNORE:
                print(f"Skipping ignored model: {get_updated_model_name(model)}")
                continue
            print(f"Processing report for: {get_updated_model_name(model)}")
            try:
                # No run key is passed; run selection is left to the generator.
                save_model_report(model)
            except Exception as e:
                # One failing model must not abort the batch: log and move on.
                print(f"  ERROR generating report for {model}: {e}")
                import traceback
                traceback.print_exc()
                error_count += 1
            else:
                saved_count += 1
        print(f"\nFinished saving reports. Saved: {saved_count}, Errors: {error_count}.")

    # Step 3: optional inline display of the top model's report.
    # The get_ipython() probe is disabled, so the flag is hard-wired off and
    # the display branch below never runs until it is switched on by hand.
    in_ipython = False

    if not in_ipython:
        print("\nSkipping direct display (not running in IPython/Jupyter environment).")
    elif not models:
        print("\nSkipping direct display (no models found).")
    else:
        target_model_to_view = models[0]  # first entry = top-ranked model
        print(f"\nDisplaying report for {get_updated_model_name(target_model_to_view)} in IPython...")
        # A run key may also be given: view_model_report(target_model_to_view, run_key='specific_run_key')
        view_model_report(target_model_to_view)

    print("\nScript finished.")

Successfully imported eqbench_constants.
Successfully imported calculate_final_rubric_score.
[meta] Loaded 46 scenarios from ./data/scenario_prompts.txt
--- Available Models ---
Available models in data/canonical_leaderboard_elo_results.json.gz (sorted by Normalized ELO):
1. o3 (ELO Norm: 1500.0, Raw: 1515.5)
2. chatgpt-4o-latest-2025-03-27 (ELO Norm: 1311.8, Raw: 1435.8)
3. chatgpt-4o-latest-2025-04-25 (ELO Norm: 1304.2, Raw: 1432.6)
4. openai/o4-mini (ELO Norm: 1303.2, Raw: 1432.2)
5. deepseek-ai/DeepSeek-R1 (ELO Norm: 1234.8, Raw: 1403.2)
6. gpt-4.1 (ELO Norm: 1220.1, Raw: 1397.0)
7. Qwen/Qwen3-235B-A22B (ELO Norm: 1213.4, Raw: 1394.1)
8. gemini-2.5-pro-preview-03-25 (ELO Norm: 1203.9, Raw: 1390.1)
9. deepseek-ai/DeepSeek-V3-0324 (ELO Norm: 1194.8, Raw: 1386.2)
10. qwen/qwq-32b (ELO Norm: 1136.7, Raw: 1361.6)
11. gpt-4.1-mini (ELO Norm: 1113.8, Raw: 1351.9)
12. gpt-4.5-preview-2025-02-27 (ELO Norm: 1104.9, Raw: 1348.1)
13. claude-3-5-sonnet-20241022 (ELO Norm: 1101.0, Raw: 1346.4)
1

In [2]:
# ╔════════════════════════════════════════════════════════════════════╗
# ║  EQ‑BENCH · Build criteria dataframe + JS chartData object (radar) ║
# ╚════════════════════════════════════════════════════════════════════╝
import os, json, math, numpy as np, pandas as pd
from collections import defaultdict
from statistics import mean

# ---------------------------------------------------------------------
# CONFIG – tweak lists here, everything else is automatic
# ---------------------------------------------------------------------
# Path of the runs JSON; reuses RUNS_FILE from the report script.
RUNS_PATH          = RUNS_FILE                # already defined in the report script
# A criterion column is kept only if at least this many rows have scores for
# it. Rows are built one per run entry, so this is a count of runs.
MIN_OCCURRENCES    = 5                        # minimum number of rows with this criterion
# Relative scores compare a row with up to this many rows on each side of it
# in the ranking by overall score.
N_NEIGHBORS        = 6                      # window size for relative scores

# Criteria to invert (20 - value) and rename to "Inverted_<name>". Currently
# empty, so nothing is inverted. NOTE(review): `{}` is an empty dict, not a
# set; this cell only uses it for membership tests, where either works.
NEGATIVE_CRITERIA  = {}
# Column renames applied after the inversion step; empty means no renames.
RENAME_MAP = {                                # after inversion / cleaning

}

# optional group-combines (same structure as the original notebook)
# Each entry replaces its component columns by their row-wise mean.
COMBINATIONS = {
    # "Aggregate Name": [list_of_component_columns]
    # (provide *post-rename* column names here)
    # e.g.  "Humanness": ["humanlike", "conversational", "reactive"],
}

# Whitelist of rubric criteria: scores for any other criterion name are
# ignored when the per-run means are collected.
CRITERIA_LIST = [
    "demonstrated_empathy", "pragmatic_ei", "depth_of_insight",
    "social_dexterity", "emotional_reasoning", "message_tailoring",
    "boundary_setting", "safety_conscious", "moralising",
    #"sycophantic",
     "compliant", "challenging",
    "warmth", "validating", "analytical",
    "reactive", "conversational", "humanlike",
]

# ---------------------------------------------------------------------
# 1. Load runs JSON
# ---------------------------------------------------------------------
if not os.path.exists(RUNS_PATH):
    raise FileNotFoundError(f"Runs file not found: {RUNS_PATH}")

# Loaded through the report script's load_json_file helper rather than a plain
# json.load. Other callers in this file treat a falsy return value as "no
# data", so the result is validated before it is iterated.
runs_data = load_json_file(RUNS_PATH)
if not isinstance(runs_data, dict):
    raise ValueError(f"Runs file did not yield a JSON object: {RUNS_PATH}")

# ---------------------------------------------------------------------
# 2. Extract per-run rubric means & overall score (0-100)
# ---------------------------------------------------------------------
model_rows = []                      # one row per run that has rubric data
criterion_counts = defaultdict(int)  # rows containing each criterion (min-occurrence filter)

for run_key, run in runs_data.items():
    if not isinstance(run, dict):
        # Malformed entries are skipped instead of aborting the whole cell.
        print(f"Warning: Skipping invalid run entry '{run_key}' in runs data.")
        continue

    # Fall back to the run key when the run does not name its model.
    model = run.get("test_model", run_key)

    # overall (0-100): the helper's first return value is scaled by 5
    # (presumably from a 0-20 rubric average - confirm against the helper).
    raw_overall, _ = calculate_final_rubric_score(run)
    overall = raw_overall * 5 if isinstance(raw_overall, (int, float)) else np.nan

    # scenario_tasks is walked as {scenario: {task_key: task}}; a null or
    # otherwise non-dict level is treated as empty.
    scenario_tasks = run.get("scenario_tasks") or {}
    if not isinstance(scenario_tasks, dict):
        scenario_tasks = {}

    per_crit = defaultdict(list)
    for iter_dict in scenario_tasks.values():
        if not isinstance(iter_dict, dict):
            continue
        for task in iter_dict.values():
            if not isinstance(task, dict):
                continue
            scores = task.get("rubric_scores", {})
            if not isinstance(scores, dict):
                continue
            for crit, val in scores.items():
                # Only numeric scores for whitelisted criteria are collected.
                if isinstance(val, (int, float)) and crit in CRITERIA_LIST:
                    per_crit[crit].append(val)

    if not per_crit:
        continue  # no usable rubric scores in this run

    # aggregate means + count occurrences
    agg = {}
    for crit, vals in per_crit.items():
        agg[crit] = mean(vals)
        criterion_counts[crit] += 1

    model_rows.append({"model": model, "overall": overall, **agg})

if not model_rows:
    raise ValueError("No rubric data found in any run.")

# ---------------------------------------------------------------------
# 3. Build raw DataFrame & keep criteria meeting min_occurrences
# ---------------------------------------------------------------------
# Keep only criteria that occur in at least MIN_OCCURRENCES rows, then drop
# rows that have no overall score.
keep_cols = [c for c, n in criterion_counts.items() if n >= MIN_OCCURRENCES]
df = pd.DataFrame(model_rows)
df = df[["model", "overall", *keep_cols]].dropna(subset=["overall"]).reset_index(drop=True)

# ---------------------------------------------------------------------
# 4. Invert negative metrics (20-score scale) and rename
# ---------------------------------------------------------------------
inverted_cols = [col for col in keep_cols if col in NEGATIVE_CRITERIA]
for col in inverted_cols:
    df[col] = 20 - df[col]
df.rename(columns={col: f"Inverted_{col}" for col in inverted_cols}, inplace=True)

df.rename(columns=RENAME_MAP, inplace=True)

# ---------------------------------------------------------------------
# 5. Combine criteria if COMBINATIONS dict is non-empty
# ---------------------------------------------------------------------
# Source columns are removed only after every aggregate has been computed, so
# a column listed in several combinations contributes to each of them.
consumed_cols = set()
for new_name, src_cols in COMBINATIONS.items():
    existing = [c for c in src_cols if c in df.columns]
    if existing:
        df[new_name] = df[existing].mean(axis=1, skipna=True)
        # An aggregate that reuses the name of one of its own sources must
        # not be scheduled for removal on its own account.
        consumed_cols.update(c for c in existing if c != new_name)

cols_to_drop = [c for c in df.columns if c in consumed_cols]
if cols_to_drop:
    df.drop(columns=cols_to_drop, inplace=True)

# ---------------------------------------------------------------------
# 6. Relative scores vs ±N_NEIGHBORS models by overall
# ---------------------------------------------------------------------
# Rank rows by overall score (best first); every remaining column except the
# identifiers is treated as a criterion.
df = df.sort_values("overall", ascending=False).reset_index(drop=True)
abs_cols = [c for c in df.columns if c not in {"model", "overall"}]

n_rows = len(df)
for idx, row in df.iterrows():
    # Window of up to N_NEIGHBORS rows on each side, clipped at the ends of
    # the ranking and excluding the row itself.
    window_start = max(0, idx - N_NEIGHBORS)
    window_stop = min(n_rows, idx + N_NEIGHBORS + 1)
    nbr_idx = [*range(window_start, idx), *range(idx + 1, window_stop)]
    neighbours = df.loc[nbr_idx]

    for col in abs_cols:
        cur = row[col]
        neigh_vals = neighbours[col].dropna()
        # Relative score = own value minus the neighbours' mean; undefined
        # (NaN) when either side has no data.
        if pd.notna(cur) and len(neigh_vals):
            rel = cur - neigh_vals.mean()
        else:
            rel = np.nan
        df.at[idx, f"relative_{col}"] = rel

# ---------------------------------------------------------------------
# 7.  Build JS object (radar + strengths / weaknesses)
# ---------------------------------------------------------------------
def signed_log(x):
    """Return log10(|x| + 1) carrying the sign of *x*; falsy input gives 0.0."""
    if not x:
        return 0.0
    magnitude = math.log10(abs(x) + 1)
    return math.copysign(magnitude, x)

def _normalise_relative(v, min_v, median_v, max_v):
    """Map *v* piecewise-linearly so that min -> -1, median -> 0, max -> 1."""
    if min_v == max_v:
        return 0.0
    if v <= median_v:
        return -1 + (v - min_v) / (median_v - min_v) if median_v > min_v else -1
    return (v - median_v) / (max_v - median_v) if max_v > median_v else 1


chart_dict = {}
for _, row in df.iterrows():
    model_disp = get_updated_model_name(row["model"])

    # Absolute radar: every criterion that has a value for this row.
    abs_points = [(col, round(float(row[col]), 2)) for col in abs_cols if pd.notna(row[col])]
    abs_labels = [label for label, _value in abs_points]
    abs_vals = [value for _label, value in abs_points]

    # Relative radar: signed-log of each available relative score.
    rel_pairs = [
        (col, float(row[f"relative_{col}"]))
        for col in abs_cols
        if pd.notna(row[f"relative_{col}"])
    ]
    rel_labels = [label for label, _value in rel_pairs]
    rel_vals_log = [round(signed_log(value), 2) for _label, value in rel_pairs]

    # strengths / weaknesses: normalise to min=-1, median=0, max=1
    strengths, weaknesses = [], []
    if rel_pairs:
        ordered = sorted(value for _label, value in rel_pairs)
        min_v, max_v = ordered[0], ordered[-1]
        mid = len(ordered) // 2
        median_v = ordered[mid] if len(ordered) % 2 else (ordered[mid - 1] + ordered[mid]) / 2

        norm = sorted(
            ((crit, round(_normalise_relative(v, min_v, median_v, max_v), 2)) for crit, v in rel_pairs),
            key=lambda pair: pair[1],
        )
        # Lowest five are weaknesses (ascending); highest five are strengths
        # (descending).
        weaknesses = [{"criterion": c, "relativeScore": v} for c, v in norm[:5]]
        strengths = [{"criterion": c, "relativeScore": v} for c, v in reversed(norm[-5:])]

    chart_dict[model_disp] = {
        "absoluteRadar": {"labels": abs_labels, "values": abs_vals},
        "relativeRadarLog": {"labels": rel_labels, "values": rel_vals_log},
        "strengths": strengths,
        "weaknesses": weaknesses,
    }

# Emit the data as a JavaScript constant declaration.
chart_json = json.dumps(chart_dict, indent=2)
chart_data_js = "const chartData = " + chart_json + ";"
print(chart_data_js)

# Keep a copy of the final frame under its own name for further analysis.
df_eqbench = df.copy()


const chartData = {
  "o3": {
    "absoluteRadar": {
      "labels": [
        "depth_of_insight",
        "emotional_reasoning",
        "demonstrated_empathy",
        "pragmatic_ei",
        "social_dexterity",
        "message_tailoring",
        "boundary_setting",
        "safety_conscious",
        "moralising",
        "compliant",
        "challenging",
        "warmth",
        "validating",
        "analytical",
        "reactive",
        "conversational",
        "humanlike"
      ],
      "values": [
        17.98,
        17.56,
        17.32,
        16.95,
        16.14,
        16.41,
        14.77,
        16.18,
        7.32,
        12.5,
        12.23,
        14.68,
        16.27,
        18.59,
        8.45,
        15.09,
        16.95
      ]
    },
    "relativeRadarLog": {
      "labels": [
        "depth_of_insight",
        "emotional_reasoning",
        "demonstrated_empathy",
        "pragmatic_ei",
        "social_dexterity",
        "message_tailoring"