In [1]:
##############################################################################
# 1) Family assignments
##############################################################################
# Note: You said “gemma & gemini are both google.” So any model name 
# containing "gemma" or "gemini" is also mapped to Google in this dict.

from model_name_subs import MODEL_NAME_SUBS

model_to_family = {
    'google/gemini-2.0-flash-001': 'Google',
    'anthropic/claude-3.7-sonnet': 'Anthropic',
    'anthropic/claude-3.5-sonnet': 'Anthropic',
    'deepseek/deepseek-chat-v3-0324': 'DeepSeek',
    'google/gemma-2-9b-it': 'Google',
    'deepseek/deepseek-r1': 'DeepSeek',
    'anthropic/claude-3.5-haiku-20241022': 'Anthropic',
    'openai/gpt-4o-mini': 'OpenAI',
    'openai/gpt-4.5-preview': 'OpenAI',
    'mistralai/mistral-small-3.1-24b-instruct-2503': 'Mistral',
    'meta-llama/llama-3.1-8b-instruct': 'Meta-Llama',
    'meta-llama/llama-3.1-70b-instruct': 'Meta-Llama',
    'meta-llama/llama-3.1-405b-instruct': 'Meta-Llama',
    'google/gemma-3-27b-it': 'Google',
    'qwen/qwq-32b': 'Qwen',
    'openai/chatgpt-4o-latest': 'OpenAI',
    'cohere/command-a': 'Cohere',
    'mistralai/mistral-nemo': 'Mistral',
    'mistralai/mistral-small-24b-instruct-2501': 'Mistral',
    'meta-llama/llama-3.2-3b-instruct': 'Meta-Llama',
    'gemini-2.5-pro-exp-03-25': 'Google',  # 'gemini' => Google
    'google/gemma-3-4b-it': 'Google',
    'google/gemma-3-12b-it': 'Google',
    'sam-paech/Darkest-muse-v1': 'Sam-Paech',
    'ifable/gemma-2-Ifable-9B': 'Google',  # 'gemma' => Google
    'ToastyPigeon/Gemma-3-Starshine-12B': 'Google',  # 'Gemma' => Google
    'allura-org/Gemma-3-Glitter-12B': 'Google',       # 'Gemma' => Google
    'liquid/lfm-7b': 'Liquid',
    'chatgpt-4o-latest': 'OpenAI',
    'anthropic/claude-3-haiku': 'Anthropic',
    'rekaai/reka-flash-3:free': 'Reka',
    'meta-llama/llama-3.2-1b-instruct': 'Meta-Llama',
    'google/gemma-3-4b-it:free': 'Google',
    'x-ai/grok-3-beta': 'grok-3-beta',
    'x-ai/grok-3-mini-beta': 'grok-3-mini-beta',
}

##############################################################################
# Family colors to match the above
##############################################################################
family_colors = {
    'Google':     '#8a5cf5',
    'Anthropic':  '#ffc13b',
    'DeepSeek':   '#1eb980',
    'OpenAI':     '#ff5c8d',
    'Mistral':    '#ff6e40',
    'Meta-Llama': '#1e3d59',
    'Qwen':       '#b2df8a',
    'Cohere':     '#bebada',
    'Sam-Paech':  '#f28e2c',
    'Liquid':     '#767676',
    'Reka':       '#fb8072',
    # If not in dictionary, default to "Other"
    'Other':      '#cccccc'
}


MODELS_TO_IGNORE = [
        'mistralai/ministral-3b',
        'ministral-3b',
        'google/gemma-3-4b-it:free'
    ]


In [2]:
import json
import pandas as pd
import numpy as np
from IPython.display import HTML, display
from collections import defaultdict
import re
import os
import sys
from typing import Dict, List, Any, Optional, Tuple
from core.metrics import calculate_repetition_metric, get_top_repetitive_words, get_multi_prompt_ngrams, calculate_slop_index_new

# --- Add core directory to Python path ---
SCRIPT_DIR = os.path.dirname(os.path.abspath('./')) # Assumes running from parent dir of script
CORE_DIR = os.path.join(SCRIPT_DIR, 'core')
if CORE_DIR not in sys.path:
    sys.path.insert(0, CORE_DIR)




# --- Helper function to update model name if a substitution exists ---
def get_updated_model_name(original: str) -> str:
    return MODEL_NAME_SUBS.get(original, original)

# --- Import metrics functions ---
try:
    from core.metrics import calculate_slop_index, calculate_complexity_index
except ImportError as e:
    print(f"Error importing metrics from core.metrics: {e}", file=sys.stderr)
    print("Please ensure core/metrics.py exists and is in the Python path.", file=sys.stderr)
    # Define dummy functions if import fails to avoid crashing later
    def calculate_slop_index(text: str) -> float: return -1.0
    def calculate_complexity_index(text: str) -> float: return -1.0

# Config variables
RUNS_FILE = "creative_bench_runs.json"
#RUNS_FILE = "repro_testing.json"
ELO_RESULTS_FILE = "elo_results.json"
ELO_RESULTS_UPDATED_FILE = "elo_results_with_metrics_repro.json"

PROMPTS_ORDER = [
    "25", "9", "8", "33", "31", "4", "3", "32", "20", "30",
    "15", "19", "18", "7", "28", "6", "5", "16", "1", "2",
    "10", "11", "12", "13", "14", "17", "21", "22", "23",
    "24", "26", "29"
]

# --- Existing Functions (load_json_file, sanitize_model_name, etc.) ---
def load_json_file(file_path: str) -> Dict:
    """Load data from a JSON file."""
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return {}
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            return json.load(f)
        except json.JSONDecodeError:
            print(f"Error decoding JSON from {file_path}")
            return {}

def sanitize_model_name(model_name: str) -> str:
    """Sanitize model name for use in filenames."""
    sanitized = model_name.replace("/", "__")
    unsafe_chars = r'<>:"|?*\\'
    for char in unsafe_chars:
        sanitized = sanitized.replace(char, '-')
    return sanitized

def calculate_creative_writing_scores(runs_data: Dict, model_name: str) -> Tuple[float, Dict]:
    """
    Calculate the creative writing scores for a model using the same methodology as generate_model_report.
    
    Args:
        runs_data: The dictionary containing all run data
        model_name: The name of the model to calculate the score for
        
    Returns:
        Tuple of (overall_average_score, iterations_dict)
        where iterations_dict maps iteration IDs to their score and prompts
    """
    # Get negative criteria if available
    neg_criteria = []
    try:
        neg_criteria_file = 'data/negative_criteria.txt'
        if os.path.exists(neg_criteria_file):
            with open(neg_criteria_file, 'r') as f:
                neg_criteria = [line.strip().lower() for line in list(f.readlines())]
    except FileNotFoundError:
        print(f"Warning: {neg_criteria_file} not found. Negative criteria scoring adjustment will not be applied.")
    
    # Find matching runs for the model
    matching_runs = [k for k, v in runs_data.items() if v.get("test_model") == model_name]
    if not matching_runs:
        return 0.0, {}
    
    # Use the most recent run
    run_key = matching_runs[-1]
    run_data = runs_data[run_key]
    
    creative_tasks = run_data.get("creative_tasks", {})
    if not creative_tasks:
        return 0.0, {}
    
    # Calculate scores by iteration
    iterations = {}
    total_score_sum = 0
    total_score_count = 0
    
    for iter_idx, prompt_data in creative_tasks.items():
        iter_score_sum = 0
        iter_score_count = 0
        
        for prompt_id, task_data in prompt_data.items():
            if task_data.get("status") not in ["completed", "judged"]:
                continue
            
            results_by_mod = task_data.get("results_by_modifier", {})
            for seed_mod, block in results_by_mod.items():
                j_scores = block.get("judge_scores", {})
                for metric, val in j_scores.items():
                    if isinstance(val, (int, float)) and val <= 20:
                        score_val = (20 - val) if metric.lower() in neg_criteria else val
                        iter_score_sum += score_val
                        iter_score_count += 1
                        total_score_sum += score_val
                        total_score_count += 1
        
        iterations[iter_idx] = {
            "score": round(iter_score_sum / iter_score_count, 2) if iter_score_count > 0 else 0,
            "prompts": prompt_data
        }
    
    overall_avg_score = round(total_score_sum / total_score_count, 2) if total_score_count > 0 else 0.0
    return overall_avg_score, iterations

def generate_model_report(model_name: str, run_key: Optional[str] = None, save_to_file: bool = False) -> HTML:
    """
    Generate an HTML report for a specific model with theme and font selection,
    including a back button and dark mode toggle.

    Args:
        model_name: The name of the model to generate the report for
        run_key: Optional specific run key to use
        save_to_file: Whether to save the report to an HTML file

    Returns:
        An HTML object containing the report
    """
    # --- Data Loading and Processing (Identical to previous version) ---
    runs_data = load_json_file(RUNS_FILE)
    elo_data = load_json_file(ELO_RESULTS_FILE)

    if run_key is None:
        matching_runs = [k for k, v in runs_data.items() if v.get("test_model") == model_name]
        if not matching_runs:
            return HTML(f"<h2>No runs found for model: {model_name}</h2>")
        run_key = matching_runs[-1]

    if run_key not in runs_data:
        return HTML(f"<h2>Run key not found: {run_key}</h2>")

    run_data = runs_data[run_key]
    original_model_name = run_data.get("test_model", model_name)
    display_model_name = get_updated_model_name(original_model_name) # Use updated name for display

    creative_tasks = run_data.get("creative_tasks", {})
    if not creative_tasks:
        return HTML(f"<h2>No creative tasks found for run: {run_key}</h2>")

    # --- Data Processing ---
    creative_prompts = {}
    try:
        # Adjust path relative to SCRIPT_DIR if needed
        creative_prompts_file = run_data.get("creative_prompts_file", os.path.join(SCRIPT_DIR, "data/creative_writing_prompts_v3.json"))
        if os.path.exists(creative_prompts_file):
            creative_prompts = load_json_file(creative_prompts_file)
        else:
             print(f"Warning: Creative prompts file not found at {creative_prompts_file}")
    except Exception as e:
        print(f"Warning: Could not load creative prompts: {str(e)}")

    # Use the new function to calculate overall score and iterations
    overall_avg_score, iterations = calculate_creative_writing_scores(runs_data, model_name)
    
    # Sort iterations by score (descending)
    sorted_iterations = sorted(iterations.items(), key=lambda x: x[1]["score"], reverse=True)
    # --- End Data Processing ---

    # --- HTML Generation with Themes, Fonts, Back Button, Dark Mode ---
    html_output = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Model Outputs: {display_model_name}</title>
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <style>
            /* ----------------------------------------------------
            1) Font Imports & Face Definitions
            ---------------------------------------------------- */
            /* Lora (Used for Cozy Headers) */
            @import url('https://fonts.googleapis.com/css2?family=Lora:ital,wght@0,400..700;1,400..700&display=swap');
            /* Merriweather (Used for Modern Headers - fallback) */
            @import url('https://fonts.googleapis.com/css2?family=Merriweather:ital,wght@0,300;0,400;0,700;1,300;1,400;1,700&display=swap');

            /* Dynamic font loading will be handled with JavaScript */
            /* We'll keep the font declarations in CSS for fallback purposes in case JS fails */

            /* ----------------------------------------------------
            2) Base Variables & Font Defaults
            ---------------------------------------------------- */
            :root {{
                /* Default Theme: Cozy Light */
                --theme-name: 'cozy'; /* JS uses this */

                /* Fonts */
                --font-body-cozy: 'Tiempos Text', Georgia, serif;
                --font-heading-cozy: 'Lora', serif;
                --font-body-modern: 'Inter', sans-serif; /* Changed modern default body */
                --font-heading-modern: 'Besley', 'Merriweather', serif;
                --font-ui: 'Lora', sans-serif; /* For controls */

                /* Default to Cozy fonts */
                --font-body: var(--font-body-cozy);
                --font-heading: var(--font-heading-cozy);

                /* Cozy Light Colors */
                --bg-color: #fdfaf6;
                --text-color: #3a3a3a;
                --header-color: #5c4033;
                --subheader-color: #7a6a60;
                --border-color: #e0dcd1;
                --accent-border-color: #d3c0a5;
                --container-bg: #fffcf7;
                --iter-header-bg: #f5f0e8;
                --iter-header-hover-bg: #ede8de;
                --prompt-header-bg: #faf5ef;
                --prompt-header-hover-bg: #f5f0e8;
                --judge-bg: #f3f6f9;
                --judge-border: #c8d7e6;
                --judge-text: #555;
                --prompt-display-bg: #f9f6f0;
                --toggle-icon-color: #8a7a70;
                --shadow-color: rgba(0, 0, 0, 0.08);
                --link-color: #7a6a60;
                --link-hover-color: #5c4033;
                --toggle-bg: #ccc; /* Not used visually now */
                --toggle-checked-bg: #7a6a60; /* Not used visually now */
                --toggle-knob-bg: white; /* Not used visually now */
                --select-text-color: var(--subheader-color);
                --select-chevron-color: var(--subheader-color);
                --select-bg: transparent;
                --select-border: none;
            }}

            /* ----------------------------------------------------
            3) Cozy Dark Mode Variables
            ---------------------------------------------------- */
            body.theme-cozy.dark-mode {{
                --bg-color: #2a2527;
                --text-color: #fff9f2;
                --header-color: #f7eee0;
                --subheader-color: #e9dfd0;
                --border-color: #3e3936;
                --accent-border-color: #6a5349;
                --container-bg: #312c2e;
                --iter-header-bg: #342e2f;
                --iter-header-hover-bg: #413935;
                --prompt-header-bg: #312b2d;
                --prompt-header-hover-bg: #3a3234;
                --judge-bg: #2f3136;
                --judge-border: #4e4944;
                --judge-text: #fcf5eb;
                --prompt-display-bg: #302a2c;
                --toggle-icon-color: #c0b0a0;
                --shadow-color: #0c0705;
                --link-color: #d0bca8;
                --link-hover-color: #ebdac5;
                --toggle-bg: #524740; /* Not used visually now */
                --toggle-checked-bg: #9a8778; /* Not used visually now */
                --toggle-knob-bg: #ede6dc; /* Not used visually now */
                --select-text-color: var(--subheader-color);
                --select-chevron-color: var(--subheader-color);
            }}

            /* ----------------------------------------------------
            4) Modern Theme Variables (Light & Dark)
            ---------------------------------------------------- */
            body.theme-modern {{
                --theme-name: 'modern'; /* JS uses this */

                /* Fonts */
                --font-body: var(--font-body-modern);
                --font-heading: var(--font-heading-modern);

                /* Modern Light Colors */
                --bg-color: #ffffff;
                --text-color: #212529;
                --header-color: #000000;
                --subheader-color: #495057;
                --border-color: #dee2e6;
                --accent-border-color: #adb5bd;
                --container-bg: #ffffff;
                --iter-header-bg: #f8f9fa;
                --iter-header-hover-bg: #e9ecef;
                --prompt-header-bg: #ffffff;
                --prompt-header-hover-bg: #f8f9fa;
                --judge-bg: #f1f3f5;
                --judge-border: #ced4da;
                --judge-text: #343a40;
                --prompt-display-bg: #f8f9fa;
                --toggle-icon-color: #6c757d;
                --shadow-color: rgba(0, 0, 0, 0.1);
                --link-color: #007bff;
                --link-hover-color: #0056b3;
                --toggle-bg: #ced4da; /* Not used visually now */
                --toggle-checked-bg: #007bff; /* Not used visually now */
                --toggle-knob-bg: white; /* Not used visually now */
                --select-text-color: var(--subheader-color);
                --select-chevron-color: var(--subheader-color);
            }}

            body.theme-modern.dark-mode {{
                /* Modern Dark Colors */
                --bg-color: #1a1a1a;
                --text-color: #e9ecef;
                --header-color: #ffffff;
                --subheader-color: #adb5bd;
                --border-color: #495057;
                --accent-border-color: #6c757d;
                --container-bg: #212529;
                --iter-header-bg: #343a40;
                --iter-header-hover-bg: #495057;
                --prompt-header-bg: #2c3034;
                --prompt-header-hover-bg: #343a40;
                --judge-bg: #343a40;
                --judge-border: #495057;
                --judge-text: #ced4da;
                --prompt-display-bg: #343a40;
                --toggle-icon-color: #adb5bd;
                --shadow-color: rgba(0, 0, 0, 0.3);
                --link-color: #69b1ff;
                --link-hover-color: #a8d1ff;
                --toggle-bg: #495057; /* Not used visually now */
                --toggle-checked-bg: #0d6efd; /* Not used visually now */
                --toggle-knob-bg: #dee2e6; /* Not used visually now */
                --select-text-color: var(--subheader-color);
                --select-chevron-color: var(--subheader-color);
            }}


            /* ----------------------------------------------------
            5) Base Global Styles (Theme Independent)
            ---------------------------------------------------- */
            body {{
                font-family: var(--font-body);
                line-height: 1.7;
                color: var(--text-color);
                background-color: var(--bg-color);
                max-width: 900px;
                margin: 30px auto;
                padding: 40px 50px;
                border: 1px solid var(--border-color);
                box-shadow: 0 5px 15px var(--shadow-color);
                transition: background-color 0.3s, color 0.3s, border-color 0.3s;
            }}
            h1, h2, h3, h4 {{
                font-family: var(--font-heading);
                color: var(--header-color);
                margin-top: 2em;
                margin-bottom: 0.8em;
                line-height: 1.3;
                transition: color 0.3s;
            }}
            h1 {{
                text-align: center;
                font-size: 2.5em;
                border-bottom: 2px solid var(--accent-border-color);
                padding-bottom: 15px;
                margin-bottom: 1.5em;
                font-weight: 700;
                transition: border-color 0.3s;
                font-family: var(--font-ui) !important; /* Keep title in UI font */
            }}
            h2 {{
                font-size: 1.8em;
                font-weight: 700;
            }}
            h3 {{
                font-size: 1.4em;
                font-style: italic;
                font-weight: 400;
                color: var(--subheader-color);
            }}
            strong {{
                font-weight: bold;
                color: var(--header-color);
                transition: color 0.3s;
            }}
            a {{
                color: var(--link-color);
                text-decoration: none;
                transition: color 0.3s;
            }}
            a:hover {{
                color: var(--link-hover-color);
                text-decoration: underline;
            }}
            .top-controls {{
                display: flex;
                justify-content: space-between; /* Align items to opposite ends */
                align-items: center;
                margin-bottom: 20px;
                padding-bottom: 10px;
                border-bottom: 1px solid var(--border-color);
                transition: border-color 0.3s;
                font-family: var(--font-ui) !important; /* Keep controls in UI font */
            }}
            .back-button {{
                font-family: var(--font-ui) !important;
                font-size: 1em;
                color: var(--select-text-color); /* Add this line to match other nav elements */
                transition: color 0.3s; /* Add transition for smooth theme changes */
            }}
            
            /* Controls right side container */
            .controls-right {{
                display: flex;
                align-items: center;
                gap: 15px; /* Space between controls */
            }}

            /* ----------------------------------------------------
            6) Theme Specific Overrides & Effects
            ---------------------------------------------------- */

            /* Cozy Theme Specifics */
            body.theme-cozy {{
                /* Existing body styles are cozy defaults */
            }}
            body.theme-cozy.dark-mode {{
                box-shadow: 0 5px 20px var(--shadow-color);
                background-image: linear-gradient(to bottom, #211f21, #232022);
            }}
            body.theme-cozy.dark-mode .iteration-container {{
                box-shadow: 0 2px 8px #000000;
                border-color: var(--border-color);
            }}
            body.theme-cozy.dark-mode h1 {{
                text-shadow: 0 1px 2px #000000;
            }}
            body.theme-cozy.dark-mode .content-block {{
                border-color: var(--border-color);
            }}
            body.theme-cozy.dark-mode .prompt-text-display {{
                border-left: 3px solid var(--accent-border-color);
                background-color: #362e2b;
            }}
            body.theme-cozy.dark-mode .scores-container {{
                color: #b0a598;
            }}

            /* Modern Theme Specifics */
            body.theme-modern {{
                padding: 35px 45px;
            }}
            body.theme-modern h1 {{
                font-weight: 600;
                border-bottom-width: 1px;
            }}
            body.theme-modern h2 {{
                font-weight: 600;
            }}
            body.theme-modern h3 {{
                font-weight: 500; /* Use Medium for Inter/Modern */
                font-style: normal;
            }}
            body.theme-modern .iteration-header {{
                font-weight: 600; /* Besley */
            }}
            body.theme-modern .prompt-header {{
                font-weight: 500; /* Besley */
                font-style: normal;
            }}
            body.theme-modern .prompt-text-display {{
                border-left-width: 4px;
                border-radius: 3px;
                font-style: normal; /* Modern prompt less italic */
            }}
            body.theme-modern .judge-content {{
                border-style: solid;
                border-width: 1px;
            }}
            body.theme-modern strong {{
                font-weight: 600; /* Use SemiBold for Inter/Modern */
            }}


            /* ----------------------------------------------------
            7) Components / Containers (Theme Independent Styles)
            ---------------------------------------------------- */

            /* --- Selectors (Theme, Font) --- */
            .control-select-wrapper {{
                position: relative;
                display: inline-block;
            }}
            .control-select {{
                font-family: var(--font-ui) !important;
                font-size: 0.9em;
                color: var(--select-text-color);
                background-color: var(--select-bg);
                border: none;
                padding: 2px 5px 2px 18px; /* top/bottom, right, left (space for chevron) */
                margin: 0;
                cursor: pointer;
                appearance: none;
                -webkit-appearance: none;
                -moz-appearance: none;
                transition: color 0.3s;
                border-radius: 0; /* Ensure no default rounding */
            }}
            .control-select:focus {{
                outline: none;
            }}
            /* Custom Chevron */
            .control-select-wrapper::before {{ /* Changed from ::after */
                content: '▼';
                font-size: 0.6em;
                color: var(--select-chevron-color);
                position: absolute;
                left: 5px; /* Position on the left */
                top: 50%;
                transform: translateY(-50%);
                pointer-events: none;
                transition: color 0.3s;
            }}
            .control-select option {{
                background-color: var(--bg-color);
                color: var(--text-color);
                font-family: var(--font-ui); /* Ensure options use UI font */
            }}

            /* --- Dark Mode Toggle --- */
            .mode-toggle {{
                display: flex;
                align-items: center;
                font-family: var(--font-ui) !important;
            }}
            .mode-toggle .form-check-input {{ /* The hidden checkbox */
                opacity: 0;
                width: 0;
                height: 0;
                position: absolute;
            }}
            /* No visual switch span needed */
            .mode-toggle .form-check-label {{ /* The clickable text */
                font-family: var(--font-ui) !important;
                font-size: 0.9em;
                color: var(--subheader-color);
                cursor: pointer;
                transition: color 0.3s;
                user-select: none; /* Prevent text selection on click */
                padding: 2px 5px; /* Add some padding for easier clicking */
            }}
            .mode-toggle .form-check-label:hover {{
                color: var(--link-hover-color); /* Use link hover color for feedback */
            }}


            /* --- Report Content Containers --- */
            .iteration-container {{
                margin: 30px 0;
                border: 1px solid var(--border-color);
                border-radius: 4px;
                overflow: hidden;
                background-color: var(--container-bg);
                box-shadow: 0 2px 5px rgba(0,0,0,0.05);
                transition: background-color 0.3s, border-color 0.3s, box-shadow 0.3s;
            }}
            .iteration-header {{
                background: var(--iter-header-bg);
                padding: 12px 20px;
                cursor: pointer;
                position: relative;
                border-bottom: 1px solid var(--border-color);                
                font-size: 1.2em;
                font-weight: 700;
                color: var(--header-color);
                transition: background-color 0.3s, border-color 0.3s, color 0.3s;
            }}
            .iteration-header:hover {{
                background: var(--iter-header-hover-bg);
            }}
            .prompt-container {{
                border-top: 1px dashed var(--accent-border-color);
                transition: border-color 0.3s;
            }}
            .prompt-container:first-child {{
                border-top: none;
            }}
            .prompt-header {{
                background: var(--prompt-header-bg);
                padding: 10px 20px;
                cursor: pointer;
                font-size: 1.1em;
                font-weight: 400;
                color: var(--subheader-color);
                transition: background-color 0.3s, color 0.3s;
            }}
            .prompt-header:hover {{
                background: var(--prompt-header-hover-bg);
            }}
            .content-block {{
                padding: 15px 25px;
                border-top: 1px solid var(--border-color);
                background-color: var(--container-bg);
                transition: background-color 0.3s, border-color 0.3s;
            }}
            .response-content {{
                white-space: pre-wrap;
                font-family: var(--font-body);
                font-size: 1.05em;
                line-height: 1.7;
                margin-bottom: 15px;
                color: var(--text-color);
                transition: color 0.3s;
            }}
            .judge-content {{
                white-space: pre-wrap;
                font-family: var(--font-body);
                font-size: 1.0em;
                line-height: 1.6;
                background: var(--judge-bg);
                border: 1px dashed var(--judge-border);
                padding: 10px 15px;
                margin-top: 10px;
                border-radius: 3px;
                color: var(--judge-text);
                transition: background-color 0.3s, border-color 0.3s, color 0.3s;
            }}
            .prompt-text-display {{
                font-style: italic; /* Default italic */
                color: var(--subheader-color);
                margin-bottom: 1em;
                padding: 10px 15px;
                background-color: var(--prompt-display-bg);
                border-left: 3px solid var(--accent-border-color);
                white-space: pre-wrap;
                font-family: var(--font-body);
                transition: background-color 0.3s, border-color 0.3s, color 0.3s, font-style 0.3s;
            }}
            .collapsible-content {{
                display: none;
                padding: 0;
                background-color: var(--container-bg);
                transition: background-color 0.3s;
            }}
            .expanded {{
                display: block;
            }}
            .toggle-icon {{
                display: inline-block;
                width: 20px;
                text-align: center;
                font-weight: bold;
                margin-right: 8px;
                color: var(--toggle-icon-color);
                transition: color 0.3s;
            }}
            .scores-container {{
                margin-left: 20px;
                font-style: italic;
                color: #888;
                font-size: 0.9em;
            }}

            /* Make certain elements always use the UI font */
            h1, 
            .back-button,
            .control-select,
            .form-check-label,
            .top-controls {{
                font-family: var(--font-ui) !important; /* Override with UI font */
            }}

            h1.main-title, /* Add a class to the main title */
            .back-button,
            .control-select,
            .form-check-label,
            .top-controls {{
                font-family: var(--font-ui) !important; /* Override with UI font */
            }}

            /* Allow iteration and prompt headers to use selected font */
            .iteration-header,
            .prompt-header {{
                font-family: var(--font-body) !important;
            }}

            /* Mobile Responsiveness Adjustments */
            @media screen and (max-width: 768px) {{
    /* Body / Layout */
    body.theme-cozy,
    body.theme-modern {{
        max-width: 100%;
        margin: 10px 5px;
        padding: 15px 10px;
    }}

    /* Headings */
    body.theme-cozy h1,
    body.theme-modern h1 {{
        font-size: 1.8em;
        padding-bottom: 10px;
        margin-bottom: 1em;
    }}

    body.theme-cozy h2,
    body.theme-modern h2 {{
        font-size: 1.5em;
    }}

    body.theme-cozy h3,
    body.theme-modern h3 {{
        font-size: 1.2em;
    }}

    /* Iteration / Prompt headers */
    body.theme-cozy .iteration-header,
    body.theme-modern .iteration-header {{
        padding: 10px 12px;
    }}

    body.theme-cozy .prompt-header,
    body.theme-modern .prompt-header {{
        padding: 8px 12px;
    }}

    /* Content blocks */
    body.theme-cozy .content-block,
    body.theme-modern .content-block {{
        padding: 10px 12px;
    }}

    /* Top controls layout */
    body.theme-cozy .top-controls,
    body.theme-modern .top-controls {{
        flex-direction: column;
        align-items: flex-start;
        gap: 10px;
    }}

    body.theme-cozy .controls-right,
    body.theme-modern .controls-right {{
        width: 100%;
        justify-content: space-between;
    }}
}}



        </style>
    </head>
    <body class="theme-cozy">
        <div class="top-controls">
            <div class="nav-left">
                <a href="javascript:history.back()" class="back-button">← Back</a>
            </div>
            
            <div class="controls-right">
                <div class="control-select-wrapper">
                    <select id="themeSelector" class="control-select" aria-label="Select Theme">
                        <option value="cozy">Cozy</option>
                        <option value="modern">Modern</option>
                    </select>
                </div>

                <div class="control-select-wrapper">
                    <select id="fontSelector" class="control-select" aria-label="Select Font">
                        <option value="tiempos">Tiempos Text</option>
                        <option value="bookerly">Bookerly</option>
                        <option value="bitter">Bitter Pro</option>
                        <option value="roboto">Roboto</option>
                        <option value="inter">Inter</option>
                        <option value="source_sans">Source Sans 3</option>
                        <option value="open_sans">Open Sans</option>
                        <option value="fira_sans">Fira Sans</option>
                        <option value="besley">Besley</option>
                    </select>
                </div>

                <div class="mode-toggle">
                    <input class="form-check-input" type="checkbox" id="darkModeToggle">
                    <label class="form-check-label" for="darkModeToggle" id="toggleLabel">Light</label>
                </div>
            </div>
        </div>

        <h1 class="main-title">Sample Outputs: {display_model_name}</h1>
    """

    for display_idx, (iter_idx, iter_data) in enumerate(sorted_iterations):
        is_first = display_idx == 0
        html_output += f"""
        <div class="iteration-container">
            <div class="iteration-header" onclick="toggleContent('iteration-{iter_idx}')">
                <span class="toggle-icon">{'−' if is_first else '+'}</span>
                Iteration {display_idx + 1} — Avg Score: {round(iter_data['score']*5, 1)}
            </div>
            <div id="iteration-{iter_idx}" class="collapsible-content {'expanded' if is_first else ''}">
        """
        prompt_data = iter_data["prompts"]
        prompt_items = []
        for prompt_id, task_data in prompt_data.items():
            if task_data.get("status") not in ["completed", "judged"]: continue
            prompt_text = task_data.get("base_prompt", "")
            if not prompt_text: continue

            prompt_category = "Unknown Category"
            prompt_title = f"Prompt {prompt_id}"
            if prompt_id in creative_prompts:
                prompt_info = creative_prompts[prompt_id]
                prompt_category = prompt_info.get("category", prompt_category)
                prompt_title = prompt_info.get("title", prompt_title)

            all_responses = []
            total_score = 0
            score_count = 0
            results_by_mod = task_data.get("results_by_modifier", {})
            for seed_mod, block in results_by_mod.items():
                response_text = block.get("model_response", "")
                raw_judge_text = block.get("raw_judge_text", "")
                j_scores = block.get("judge_scores", {})
                response_scores_list = []
                for metric, val in j_scores.items():
                    if isinstance(val, (int, float)):
                        score_val = (20 - val) if metric.lower() in neg_criteria else val
                        total_score += score_val
                        score_count += 1
                        response_scores_list.append(f"{metric}: {val}")
                all_responses.append({
                    "text": response_text, "judge_text": raw_judge_text, "scores": ", ".join(response_scores_list)
                })
            avg_score = round(total_score / score_count, 2) if score_count > 0 else 0
            prompt_items.append({
                "id": prompt_id, "prompt": prompt_text, "category": prompt_category,
                "title": prompt_title, "responses": all_responses, "avg_score": avg_score
            })

        def get_prompt_order(prompt_item):
            try: return PROMPTS_ORDER.index(prompt_item["id"])
            except ValueError: return len(PROMPTS_ORDER)
        prompt_items.sort(key=get_prompt_order)

        for pidx, item in enumerate(prompt_items):
            prompt_html_id = f"prompt-{iter_idx}-{item['id']}"
            html_output += f"""
            <div class="prompt-container">
                <div class="prompt-header" onclick="toggleContent('{prompt_html_id}')">
                    <span class="toggle-icon">+</span>
                    {item['category'].capitalize()}: {item['title']} — Score: {round(item['avg_score']*5, 1)}
                </div>
                <div id="{prompt_html_id}" class="collapsible-content">
                    <div class="content-block">
                        <div class="prompt-text-display">
<strong>Prompt:</strong><br>{item['prompt']}
                        </div>"""
            for ridx, response in enumerate(item["responses"]):
                html_output += f"""
                        <div class="response-content">
<strong>Model Output:</strong><br>{response['text']}
                        </div>"""
                if response["judge_text"]:
                    scores_display = f"<br><i>Scores: {response['scores']}</i>" if response['scores'] else ""
                    html_output += f"""
                        <div class="judge-content">
<strong>Judge Evaluation:</strong><br>{response['judge_text']} {scores_display}
                        </div>"""
                if ridx < len(item["responses"]) - 1:
                    html_output += "<hr style='border: none; border-top: 1px dotted var(--border-color); margin: 15px 0; transition: border-color 0.3s;'>"
            html_output += """
                    </div>
                </div>
            </div>"""
        html_output += """
            </div>
        </div>"""
    # --- End Iteration Loop ---

    # --- JavaScript for Toggling, Dark Mode, Themes, Fonts with Dynamic Font Loading ---
    html_output += """
        <script>
            // --- DOM Elements ---
            const body = document.body;
            const themeSelector = document.getElementById('themeSelector');
            const fontSelector = document.getElementById('fontSelector');
            const darkModeToggle = document.getElementById('darkModeToggle');
            const toggleLabel = document.getElementById('toggleLabel');

            // --- Constants ---
            const FONT_MAP = {
                'tiempos': "'Tiempos Text', Georgia, serif",
                'bookerly': "'Bookerly', Georgia, serif",
                'bitter': "'Bitter Pro', Georgia, serif",
                'roboto': "'Roboto', sans-serif",
                'inter': "'Inter', sans-serif",
                'source_sans': "'Source Sans 3', sans-serif",
                'open_sans': "'Open Sans', sans-serif",
                'fira_sans': "'Fira Sans', sans-serif",
                'besley': "'Besley', 'Merriweather', serif"
            };
            
            // Font definitions with URLs for dynamic loading
            const FONT_DEFINITIONS = {
                'tiempos': {
                    family: 'Tiempos Text',
                    variants: [
                        { weight: 400, style: 'normal', url: 'fonts/tiempos_text/TiemposText-Regular.woff2' },
                        { weight: 400, style: 'italic', url: 'fonts/tiempos_text/TiemposText-RegularItalic.woff2' },
                        { weight: 700, style: 'normal', url: 'fonts/tiempos_text/TiemposText-Bold.woff2' }
                    ],
                    fallback: 'Georgia, serif'
                },
                'bookerly': {
                    family: 'Bookerly',
                    variants: [
                        { weight: 400, style: 'normal', url: 'fonts/bookerly/Bookerly.woff' },
                        { weight: 400, style: 'italic', url: 'fonts/bookerly/Bookerly Italic.woff' },
                        { weight: 700, style: 'normal', url: 'fonts/bookerly/Bookerly Bold.woff' }
                    ],
                    fallback: 'Georgia, serif'
                },
                'bitter': {
                    family: 'Bitter Pro',
                    variants: [
                        { weight: 400, style: 'normal', url: 'fonts/bitter_pro/BitterPro-Regular.ttf' },
                        { weight: 400, style: 'italic', url: 'fonts/bitter_pro/BitterPro-Italic.ttf' },
                        { weight: 700, style: 'normal', url: 'fonts/bitter_pro/BitterPro-Bold.ttf' }
                    ],
                    fallback: 'Georgia, serif'
                },
                'roboto': {
                    family: 'Roboto',
                    variants: [
                        { weight: 400, style: 'normal', url: 'fonts/roboto/static/Roboto-Regular.ttf' },
                        { weight: 400, style: 'italic', url: 'fonts/roboto/static/Roboto-Italic.ttf' },
                        { weight: 700, style: 'normal', url: 'fonts/roboto/static/Roboto-Bold.ttf' }
                    ],
                    fallback: 'sans-serif'
                },
                'inter': {
                    family: 'Inter',
                    variants: [
                        { weight: 400, style: 'normal', url: 'fonts/inter/static/Inter-Regular.ttf' },
                        { weight: 400, style: 'italic', url: 'fonts/inter/static/Inter-Italic.ttf' },
                        { weight: 700, style: 'normal', url: 'fonts/inter/static/Inter-Bold.ttf' }
                    ],
                    fallback: 'sans-serif'
                },
                'source_sans': {
                    family: 'Source Sans 3',
                    variants: [
                        { weight: 400, style: 'normal', url: 'fonts/source_sans_3/static/SourceSans3-Regular.ttf' },
                        { weight: 400, style: 'italic', url: 'fonts/source_sans_3/static/SourceSans3-Italic.ttf' },
                        { weight: 700, style: 'normal', url: 'fonts/source_sans_3/static/SourceSans3-Bold.ttf' }
                    ],
                    fallback: 'sans-serif'
                },
                'open_sans': {
                    family: 'Open Sans',
                    variants: [
                        { weight: 400, style: 'normal', url: 'fonts/open_sans/static/OpenSans-Regular.ttf' },
                        { weight: 400, style: 'italic', url: 'fonts/open_sans/static/OpenSans-Italic.ttf' },
                        { weight: 700, style: 'normal', url: 'fonts/open_sans/static/OpenSans-Bold.ttf' }
                    ],
                    fallback: 'sans-serif'
                },
                'fira_sans': {
                    family: 'Fira Sans',
                    variants: [
                        { weight: 400, style: 'normal', url: 'fonts/fira_sans/FiraSans-Regular.ttf' },
                        { weight: 400, style: 'italic', url: 'fonts/fira_sans/FiraSans-Italic.ttf' },
                        { weight: 700, style: 'normal', url: 'fonts/fira_sans/FiraSans-Bold.ttf' }
                    ],
                    fallback: 'sans-serif'
                },
                'besley': {
                    family: 'Besley',
                    variants: [
                        { weight: 400, style: 'normal', url: 'fonts/besley/Besley-VariableFont_wght.ttf' },
                        { weight: 400, style: 'italic', url: 'fonts/besley/Besley-Italic-VariableFont_wght.ttf' }
                    ],
                    fallback: 'serif'
                }
            };
            
            // Define which fonts are generally sans-serif for logic purposes
            const SANS_FONTS = ['roboto', 'inter', 'source_sans', 'open_sans', 'fira_sans'];

            const THEME_DEFAULT_FONTS = {
                'cozy': 'tiempos',
                'modern': 'inter'
            };
            const THEME_DEFAULT_HEAD_FONTS = {
                 'cozy': "'Lora', serif",
                 'modern': "'Besley', 'Merriweather', serif"
            };
            
            // Keep track of loaded fonts to avoid loading the same font multiple times
            const loadedFonts = new Set();

            // --- Dynamic Font Loading ---
            async function loadFontFace(fontKey) {
                if (loadedFonts.has(fontKey)) return; // Skip if already loaded
                
                const fontDef = FONT_DEFINITIONS[fontKey];
                if (!fontDef) {
                    console.warn(`Font definition not found for: ${fontKey}`);
                    return;
                }
                
                try {
                    const fontLoadPromises = fontDef.variants.map(variant => {
                        const fontFace = new FontFace(
                            fontDef.family,
                            `url(${variant.url})`,
                            {
                                weight: variant.weight,
                                style: variant.style
                            }
                        );
                        
                        return fontFace.load().then(loadedFont => {
                            document.fonts.add(loadedFont);
                            return loadedFont;
                        });
                    });
                    
                    await Promise.all(fontLoadPromises);
                    loadedFonts.add(fontKey);
                    console.log(`Loaded font: ${fontDef.family}`);
                } catch (err) {
                    console.error(`Error loading font ${fontDef.family}:`, err);
                    // Fall back silently - CSS will use fallback fonts
                }
            }

            // --- Content Toggling ---
            function toggleContent(id) {
                const element = document.getElementById(id);
                if (!element) return;
                const isExpanded = element.classList.contains('expanded');
                const header = element.previousElementSibling;
                const toggleIcon = header ? header.querySelector('.toggle-icon') : null;

                if (isExpanded) {
                    element.classList.remove('expanded');
                    if (toggleIcon) toggleIcon.textContent = '+';
                } else {
                    element.classList.add('expanded');
                    if (toggleIcon) toggleIcon.textContent = '−';
                }
            }

            // --- Shared settings with consistent keys ---
            const STORAGE_PREFIX = 'model_viewer_';
            const KEYS = {
                THEME: `${STORAGE_PREFIX}theme`,
                FONT: `${STORAGE_PREFIX}font`,
                DARK_MODE: `modelViewerDarkModeEnabled`
            };

            // Save settings with consistent keys
            function saveSettings(type, value) {
                localStorage.setItem(KEYS[type], value);
            }

            // --- Dark Mode ---
            function setDarkMode(isDark) {
                body.classList.toggle('dark-mode', isDark);
                toggleLabel.textContent = isDark ? 'Dark' : 'Light';
                if (darkModeToggle.checked !== isDark) {
                    darkModeToggle.checked = isDark;
                }
                saveSettings('DARK_MODE', isDark);
            }


            // --- Theme Selection ---
            function applyTheme(themeName) {
                body.classList.remove('theme-cozy', 'theme-modern');
                body.classList.add(`theme-${themeName}`);
                if (themeSelector.value !== themeName) {
                    themeSelector.value = themeName;
                }
                saveSettings('THEME', themeName);
                
                // Re-apply font based on theme's default or user's saved preference
                const savedFont = localStorage.getItem(KEYS.FONT);
                const defaultFont = THEME_DEFAULT_FONTS[themeName] || 'tiempos';
                applyFont(savedFont || defaultFont);
            }

            // --- Font Selection ---
            async function applyFont(fontValue) {
                // First, load the font faces dynamically
                await loadFontFace(fontValue);
                
                const fontFamily = FONT_MAP[fontValue];
                const currentTheme = localStorage.getItem(KEYS.THEME) || 'cozy';
                let headingFontFamily = THEME_DEFAULT_HEAD_FONTS[currentTheme]; // Default heading for theme

                if (fontFamily) {
                    // Set body font - content text only, not UI elements
                    body.style.setProperty('--font-body', fontFamily);

                    // Determine appropriate heading font based on selected body font and theme
                    if (currentTheme === 'modern') {
                        headingFontFamily = THEME_DEFAULT_HEAD_FONTS['modern'];
                    } else { 
                        headingFontFamily = THEME_DEFAULT_HEAD_FONTS['cozy'];
                    }
                    
                    // Special case: If Besley is explicitly selected, use it for heading regardless of theme
                    if (fontValue === 'besley') {
                        headingFontFamily = FONT_MAP['besley'];
                    }

                    // Set the content heading font - not UI elements
                    body.style.setProperty('--font-heading', headingFontFamily);

                    // Update the selector value if needed
                    if (fontSelector.value !== fontValue) {
                        fontSelector.value = fontValue;
                    }
                    
                    saveSettings('FONT', fontValue);
                } else {
                    console.warn("Font value not found in FONT_MAP:", fontValue);
                    // Fallback to theme default
                    const theme = localStorage.getItem(KEYS.THEME) || 'cozy';
                    applyFont(THEME_DEFAULT_FONTS[theme]);
                }
            }

            // --- Event Listeners ---
            darkModeToggle.addEventListener('change', function() {
                setDarkMode(this.checked);
            });

            themeSelector.addEventListener('change', function() {
                applyTheme(this.value);
            });

            fontSelector.addEventListener('change', function() {
                applyFont(this.value);
            });

            // --- Initial Settings Application ---
            async function applyInitialSettings() {
                const savedDarkMode = localStorage.getItem(KEYS.DARK_MODE);
                const prefersDark = window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches;
                setDarkMode(savedDarkMode !== null ? (savedDarkMode === 'true') : prefersDark);

                const savedTheme = localStorage.getItem(KEYS.THEME) || 'cozy';
                applyTheme(savedTheme);

                const savedFont = localStorage.getItem(KEYS.FONT) || THEME_DEFAULT_FONTS[savedTheme];
                await applyFont(savedFont);
                
                fontSelector.value = savedFont || THEME_DEFAULT_FONTS[savedTheme];
            }

            applyInitialSettings();

            // Optional: Listen for system theme changes ONLY if no preference is saved
            window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', event => {
                if (localStorage.getItem('darkModeEnabled') === null) {
                    setDarkMode(event.matches);
                }
            });

        </script>
    </body>
    </html>
    """

    if save_to_file:
        os.makedirs("results", exist_ok=True)
        sanitized_name = sanitize_model_name(get_updated_model_name(original_model_name))
        filename = f"results/{sanitized_name}.html"
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(html_output)
            print(f"Report saved to {filename}")
        except IOError as e:
            print(f"Error saving report to {filename}: {e}")

    return HTML(html_output)


# --- Helper Functions (Identical to original) ---
def view_model_report(model_name, run_key=None, save_to_file=False):
    """Display the HTML report for a given model and optionally save it."""
    report = generate_model_report(model_name, run_key, save_to_file)
    display(report)

def save_model_report(model_name, run_key=None):
    """Generate and save the HTML report for a given model."""
    generate_model_report(model_name, run_key, save_to_file=True)

def list_available_models():
    """List all models available in the ELO results file."""
    elo_data = load_json_file(ELO_RESULTS_FILE)
    if not elo_data:
        print("No ELO data found.")
        return []
    models = []
    print("Available models (sorted by ELO):")
    for model_name, model_data in elo_data.items():
        elo_score = model_data.get("elo", -float('inf'))
        models.append((model_name, elo_score))
    models.sort(key=lambda x: x[1] if isinstance(x[1], (int, float)) else -float('inf'), reverse=True)
    for rank, (name, elo) in enumerate(models, 1):
        elo_display = f"{elo:.0f}" if isinstance(elo, (int, float)) else "N/A"
        print(f"{rank}. {get_updated_model_name(name)} (ELO: {elo_display})") # Use updated name
    return [name for name, _ in models]

def list_model_runs(model_name):
    """List all runs available for a specific model."""
    runs_data = load_json_file(RUNS_FILE)
    if not runs_data:
        print("No runs data found.")
        return []
    matching_runs = []
    for key, data in runs_data.items():
        if data.get("test_model") == model_name:
            start_time = data.get("start_time", "Unknown Time")
            status = data.get("status", "Unknown Status")
            matching_runs.append((key, start_time, status))
    if not matching_runs:
        print(f"No runs found for model: {model_name}")
        return []
    print(f"\nAvailable runs for {get_updated_model_name(model_name)}:") # Use updated name
    matching_runs.sort(key=lambda x: x[0])
    for idx, (key, time, status) in enumerate(matching_runs, 1):
        print(f"{idx}. {key} (Started: {time}, Status: {status})")
    return [key for key, _, _ in matching_runs]


import html # Import the html module for escaping

# Assume MODELS_TO_IGNORE and get_updated_model_name are defined elsewhere

def format_slop_profile_string(elo_data_with_metrics: Dict[str, Dict]) -> str:
    """
    Formats repetitive word and n-gram data into a single multi-line string
    with HTML formatting, including clickable dendrogram thumbnails.
    
    Args:
        elo_data_with_metrics: The dictionary containing model data
        
    Returns:
        A multi-line string containing the formatted slop profiles with thumbnails
    """
    output_string = ""

    # Sort models by normalized ELO descending
    sorted_models = sorted(
        elo_data_with_metrics.items(),
        key=lambda item: (
            item[1].get("normalized_elo", -float('inf'))
            if isinstance(item[1].get("normalized_elo"), (int, float))
            else (
                item[1].get("elo", -float('inf'))
                if isinstance(item[1].get("elo"), (int, float))
                else -float('inf')
            )
        ),
        reverse=True
    )

    for model_name, data in sorted_models:
        if model_name in MODELS_TO_IGNORE:
            continue

        updated_name = get_updated_model_name(model_name)
        sanitized_name = sanitize_model_name(updated_name)
        
        output_string += f"##### {updated_name}\n"
        
        # Add dendrogram thumbnails with links to full-size images
        output_string += "<div class='dendrogram-thumbnails'>\n"
        
        # Circular dendrogram thumbnail
        circular_path = f"results/creative-writing-v3/hybrid_parsimony/charts/{sanitized_name}__phylo_tree_parsimony_circular.png"
        output_string += f"  <a href='{circular_path}' target='_blank' class='dendrogram-link'>\n"
        output_string += f"    <img src='{circular_path}' alt='Circular dendrogram for {html.escape(updated_name)}' class='dendrogram-thumb circular-thumb' />\n"
        output_string += f"    <span class='dendrogram-caption'>Circular View</span>\n"
        output_string += f"  </a>\n"
        
        # Rectangular dendrogram thumbnail
        rectangular_path = f"results/creative-writing-v3/hybrid_parsimony/charts/{sanitized_name}__phylo_tree_parsimony_rectangular.png"
        output_string += f"  <a href='{rectangular_path}' target='_blank' class='dendrogram-link'>\n"
        output_string += f"    <img src='{rectangular_path}' alt='Rectangular dendrogram for {html.escape(updated_name)}' class='dendrogram-thumb rect-thumb' />\n"
        output_string += f"    <span class='dendrogram-caption'>Rectangular View</span>\n"
        output_string += f"  </a>\n"
        output_string += "</div>\n\n"
        
        # Continue with existing similar models section
        top_5 = data.get("top_5_similar", [])
        if top_5:
            output_string += "<h4>Most Similar To:</h4>\n"
            output_string += "<div class='slop-similar-section'>\n"
            for item in top_5:
                dist_str = f"{item['distance']:.3f}"
                output_string += f"<div class='slop-similar'>{html.escape(item['model'])} (distance={dist_str})</div>\n"
            output_string += "</div>\n"
            output_string += "\n"

        # Continue with the rest of the existing code for repetitive words, bigrams, trigrams
        rep_words = data.get('top_repetitive_words', [])
        output_string += "<h4>Top Repetitive Words</h4>\n"
        if rep_words:
            output_string += "<div class='slop-section-items'>\n"
            items_html = []
            for item in rep_words[:50]:
                word = item.get('word', 'N/A')
                safe_word = html.escape(word)
                items_html.append(f"<span class='slop-word-item'>{safe_word}</span>")
            output_string += " ".join(items_html)
            output_string += "\n</div>\n"
        else:
            output_string += "<p><i>No multi-prompt repetitive words found.</i></p>\n"

        # Bigrams section (unchanged)
        bigrams = data.get('top_multi_prompt_bigrams', [])
        output_string += "<h4>Top Bigrams</h4>\n"
        if bigrams:
            output_string += "<div class='slop-section-items'>\n"
            items_html = []
            for item in bigrams[:30]:
                ngram = item.get('ngram', 'N/A')
                freq = item.get('frequency', 0)
                safe_ngram = html.escape(ngram)
                items_html.append(f"<span class='slop-ngram-item'>{safe_ngram} ({freq})</span>")
            output_string += " ".join(items_html)
            output_string += "\n</div>\n"
        else:
            output_string += "<p><i>No multi-prompt bigrams found.</i></p>\n"

        # Trigrams section (unchanged)
        trigrams = data.get('top_multi_prompt_trigrams', [])
        output_string += "<h4>Top Trigrams</h4>\n"
        if trigrams:
            output_string += "<div class='slop-section-items'>\n"
            items_html = []
            for item in trigrams[:30]:
                ngram = item.get('ngram', 'N/A')
                freq = item.get('frequency', 0)
                safe_ngram = html.escape(ngram)
                items_html.append(f"<span class='slop-ngram-item'>{safe_ngram} ({freq})</span>")
            output_string += " ".join(items_html)
            output_string += "\n</div>\n"
        else:
            output_string += "<p><i>No multi-prompt trigrams found.</i></p>\n"

        output_string += "\n"

    return output_string.strip()


# --- ADDED: import for distance computation
from scipy.spatial.distance import pdist, squareform

# --- ADDED: compute top-5 nearest models (via combined Jaccard features)
def calculate_combined_jaccard_similarities(elo_data_with_metrics: Dict[str, Dict], top_n: int = 1500):
    """
    Builds a presence/absence matrix of "combined" features (top_repetitive_words,
    bigrams, trigrams) for each model, computes pairwise Jaccard distances,
    and stores each model’s top 5 neighbors under 'top_5_similar'.
    """
    model_names = list(elo_data_with_metrics.keys())
    model_to_features = {}

    for m in model_names:
        info = elo_data_with_metrics[m]
        words = info.get("top_repetitive_words", [])
        bigrams = info.get("top_multi_prompt_bigrams", [])
        trigrams = info.get("top_multi_prompt_trigrams", [])

        w_count = 120 #top_n // 3
        b_count = 40 #top_n // 3
        t_count = 40 #top_n // 3

        words_sorted = sorted(words, key=lambda x: x.get("score", 0), reverse=True)[:w_count]
        bigrams_sorted = sorted(bigrams, key=lambda x: x.get("frequency", 0), reverse=True)[:b_count]
        trigrams_sorted = sorted(trigrams, key=lambda x: x.get("frequency", 0), reverse=True)[:t_count]

        word_set = set(x["word"] for x in words_sorted)
        bigram_set = set(x["ngram"] for x in bigrams_sorted)
        trigram_set = set(x["ngram"] for x in trigrams_sorted)

        combined_set = word_set.union(bigram_set).union(trigram_set)
        model_to_features[m] = combined_set

    all_models = sorted(model_names)
    global_vocab = set()
    for feats in model_to_features.values():
        global_vocab.update(feats)
    if len(all_models) < 2 or not global_vocab:
        return

    global_vocab = sorted(global_vocab)
    df = pd.DataFrame(0, index=all_models, columns=global_vocab, dtype=int)
    for m in all_models:
        for ft in model_to_features[m]:
            if ft in df.columns:
                df.loc[m, ft] = 1

    dist_array = pdist(df.values, metric="jaccard")
    dist_matrix = squareform(dist_array)

    for i, m in enumerate(all_models):
        row_dist = dist_matrix[i, :]
        pair_list = [(all_models[j], row_dist[j]) for j in range(len(all_models)) if j != i]
        pair_list.sort(key=lambda x: x[1])  # ascending
        top_5 = pair_list[:5]
        elo_data_with_metrics[m]["top_5_similar"] = [
            {"model": get_updated_model_name(t[0]), "distance": t[1]} for t in top_5
        ]


def calculate_and_print_metrics(save_updated_elo: bool = True, print_slop_profile: bool = True): # Added print_slop_profile flag
    """
    Calculates aggregated metrics (length, vocab, slop, repetition) AND
    extracts top multi-prompt N-grams for all models found in the runs file.
    Repetition metrics & N-grams only consider words/sequences appearing in multiple prompts.
    Merges metrics with ELO data, prints results, optionally saves the updated ELO data,
    and optionally prints the formatted slop profile string.
    """
    print("\nCalculating aggregated metrics and N-grams...")
    runs_data = load_json_file(RUNS_FILE)
    elo_data = load_json_file(ELO_RESULTS_FILE)

    if not runs_data:
        print(f"Runs data file ('{RUNS_FILE}') is empty or not found. Cannot calculate metrics.")
        return

    # Structure: { model_name: { prompt_id: [text1, text2, ...], ... }, ... }
    model_texts_by_prompt = defaultdict(lambda: defaultdict(list))
    print("Extracting text from runs (grouped by prompt)...")
    processed_runs = 0
    # ... (rest of text extraction logic remains the same) ...
    for run_key, run_data in runs_data.items():
        model_name = run_data.get("test_model")
        if model_name in MODELS_TO_IGNORE:
            continue
        if not model_name: continue
        creative_tasks = run_data.get("creative_tasks", {})
        if not creative_tasks: continue

        run_has_text = False
        for iter_idx, prompt_data in creative_tasks.items():
            for prompt_id, task_data in prompt_data.items():
                if task_data.get("status") not in ["completed", "judged"]: continue
                results_by_mod = task_data.get("results_by_modifier", {})
                for seed_mod, block in results_by_mod.items():
                    response_text = block.get("model_response")
                    if isinstance(response_text, str) and response_text.strip():
                        model_texts_by_prompt[model_name][prompt_id].append(response_text)
                        run_has_text = True
        if run_has_text: processed_runs += 1

    print(f"Extracted text for {len(model_texts_by_prompt)} models from {processed_runs} runs.")
    if not model_texts_by_prompt:
        print("No model text found in any run. Cannot calculate metrics.")
        return

    print("Calculating metrics and extracting N-grams per model...")
    model_metrics = {}
    model_repetitive_words = {}
    model_top_bigrams = {}
    model_top_trigrams = {}

    # --- Iterate through models ---
    for model_name, prompts_data in model_texts_by_prompt.items():
        all_responses_flat = [] # For slop, complexity, avg_length
        texts_with_ids_list = [] # For repetition metrics
        repetition_score = 0.0  # Default score
        top_repetitive_words = [] # Default list
        top_bigrams = []        # Default list
        top_trigrams = []       # Default list

        # Basic check: Does the model have *any* data?
        if not prompts_data:
            print(f"Skipping {model_name}: No prompt data found.")
            continue

        print(f"\n  Processing {model_name} (Responses from {len(prompts_data)} prompts)...")

        # Check for multi-prompt data BEFORE calculating repetition/n-grams
        has_multi_prompt_data = len(prompts_data) >= 2

        # Populate lists needed for different metrics
        for prompt_id, texts in prompts_data.items():
            all_responses_flat.extend(texts)
            # Only add to list for repetition if multi-prompt data exists
            if has_multi_prompt_data:
                for text in texts:
                    if isinstance(text, str) and text.strip(): # Ensure only valid text is added
                        texts_with_ids_list.append((text, prompt_id))

        # --- Calculate N-grams, etc. (only if multi-prompt data) ---
        if has_multi_prompt_data and texts_with_ids_list:
            ngram_calculation_error = False
            word_extraction_error = False
            
            # Calculate total text length for normalization
            total_text_length = sum(len(text.split()) for text, _ in texts_with_ids_list)
            
            print("      Calculating N-grams and N-gram Repetition Score (multi-prompt)...")
            try:
                top_bigram_count = 0
                top_trigram_count = 0
                
                top_bigrams = get_multi_prompt_ngrams(prompts_data, n=2, top_k=200, min_prompt_ids=2)
                if top_bigrams:
                    model_top_bigrams[model_name] = top_bigrams
                    top_bigram_count = sum(freq for ngram, freq in top_bigrams[:40])
                    print(f"        Found {len(top_bigrams)} top bigrams meeting criteria.")
                else:
                    print("        No bigrams met the multi-prompt criteria.")

                top_trigrams = get_multi_prompt_ngrams(prompts_data, n=3, top_k=200, min_prompt_ids=2)
                if top_trigrams:
                    model_top_trigrams[model_name] = top_trigrams
                    top_trigram_count = sum(freq for ngram, freq in top_trigrams[:40])
                    print(f"        Found {len(top_trigrams)} top trigrams meeting criteria.")
                else:
                    print("        No trigrams met the multi-prompt criteria.")
                    
                # Calculate normalized repetition score
                if total_text_length > 0:
                    repetition_score = (top_bigram_count + top_trigram_count) / total_text_length * 1000
                else:
                    repetition_score = 0

            except NameError:
                print("      ERROR: `get_multi_prompt_ngrams` function not found. Skipping N-gram calculation and score.")
                ngram_calculation_error = True
                repetition_score = 'Error'
            except Exception as e:
                print(f"      ERROR calculating N-grams for {model_name}: {e}")
                ngram_calculation_error = True
                repetition_score = 'Error'

            if not ngram_calculation_error:
                print(f"        Calculated N-gram repetition score (normalized by text length * 1000): {repetition_score:.4f}")

            # --- Extract Top Repetitive Words ---
            print("      Extracting top repetitive words (multi-prompt)...")
            try:
                top_repetitive_words = get_top_repetitive_words(texts_with_ids_list, top_n=1000, min_prompt_ids=2) # Get more initially
                if top_repetitive_words:
                    model_repetitive_words[model_name] = top_repetitive_words # Store all found
                    print(f"        Found {len(top_repetitive_words)} repetitive words meeting criteria.")
                else:
                    print("        No words met the multi-prompt repetitive word criteria.")
            except NameError:
                print("      ERROR: `get_top_repetitive_words` function not found. Skipping word extraction.")
                word_extraction_error = True
            except Exception as e:
                print(f"      ERROR extracting repetitive words for {model_name}: {e}")
                word_extraction_error = True

        elif not has_multi_prompt_data:
            print(f"      Skipping N-grams, Repetition Score, and Repetitive Words: Only 1 prompt ID found.")
        elif not texts_with_ids_list:
            print("      Skipping N-grams, Repetition Score, and Repetitive Words: No valid text entries found after filtering.")

        # --- Calculate Other Metrics ---
        print("      Calculating other metrics (length, vocab, slop)...")
        num_responses = len(all_responses_flat)
        if num_responses == 0:
            print(f"      Skipping length/vocab/slop metrics: No text after flattening.")
            avg_length = 0.0
            vocab_complexity = 0.0
            slop_score = 0.0
        else:
            total_chars = sum(len(r) for r in all_responses_flat if isinstance(r, str))
            avg_length = round(total_chars / num_responses, 2)
            all_text_combined = "\n\n".join(r for r in all_responses_flat if isinstance(r, str))
            if not all_text_combined.strip():
                 print("      Warning: Combined text is empty after joining, cannot calculate vocab/slop.")
                 vocab_complexity = 0.0
                 slop_score = 0.0
            else:
                try:
                    vocab_complexity = calculate_complexity_index(all_text_combined)
                except Exception as e:
                    print(f"      ERROR calculating vocab complexity for {model_name}: {e}")
                    vocab_complexity = 'Error'
                try:
                    slop_score = calculate_slop_index_new(all_text_combined)
                except Exception as e:
                    print(f"      ERROR calculating slop score for {model_name}: {e}")
                    slop_score = 'Error'

        model_metrics[model_name] = {
            'avg_length': avg_length,
            'vocab_complexity': round(vocab_complexity, 4) if isinstance(vocab_complexity, (int, float)) and vocab_complexity != float('inf') else str(vocab_complexity),
            'slop_score': round(slop_score, 4) if isinstance(slop_score, (int, float)) else str(slop_score),
            'repetition_score': round(repetition_score, 4) if isinstance(repetition_score, (int, float)) else str(repetition_score) # Store potentially updated score
        }

        # --- Print Summary for Model ---
        print(f"    Metrics - Avg Len: {avg_length:.0f}, Vocab K: {model_metrics[model_name]['vocab_complexity']}, "
              f"Slop: {model_metrics[model_name]['slop_score']}, Repetition (multi-prompt): {model_metrics[model_name]['repetition_score']}")

        if top_repetitive_words:
            filtered_top_words = top_repetitive_words[:10] # Limit printout in console
            print(f"    Top multi-prompt repetitive words: " + ", ".join([f"{word} ({score:.1f}x)" for word, score in filtered_top_words]))
        elif has_multi_prompt_data:
             print("    No words met the multi-prompt repetition criteria.")

        if top_bigrams:
            print("    Top multi-prompt Bigrams:")
            for bg, freq in top_bigrams[:5]: # Limit printout
                print(f"      - {' '.join(bg)} ({freq})")
        elif has_multi_prompt_data:
            print("    No bigrams met the multi-prompt criteria.")

        if top_trigrams:
            print("    Top multi-prompt Trigrams:")
            for tg, freq in top_trigrams[:5]: # Limit printout
                print(f"      - {' '.join(tg)} ({freq})")
        elif has_multi_prompt_data:
            print("    No trigrams met the multi-prompt criteria.")


    # --- Merging metrics with ELO data ---
    print("\nMerging metrics with ELO data...")
    updated_elo_data = elo_data.copy() if isinstance(elo_data, dict) else {}

    # Add calculated metrics
    for model_name, metrics in model_metrics.items():
        if model_name not in updated_elo_data:
            updated_elo_data[model_name] = {}
            print(f"  Note: Model '{model_name}' found in runs but not in ELO data. Added entry.")
        updated_elo_data[model_name].update(metrics)

    # Add repetitive words if found
    for model_name, words_list in model_repetitive_words.items():
        if model_name in updated_elo_data:
             updated_elo_data[model_name]['top_repetitive_words'] = [
                 {"word": word, "score": float(score)}
                 for word, score in words_list
             ]

    # Add N-grams if found
    for model_name, ngrams_list in model_top_bigrams.items():
         if model_name in updated_elo_data:
             updated_elo_data[model_name]['top_multi_prompt_bigrams'] = [
                 {"ngram": ' '.join(ngram), "frequency": int(freq)}
                 for ngram, freq in ngrams_list
             ]
    for model_name, ngrams_list in model_top_trigrams.items():
         if model_name in updated_elo_data:
             updated_elo_data[model_name]['top_multi_prompt_trigrams'] = [
                 {"ngram": ' '.join(ngram), "frequency": int(freq)}
                 for ngram, freq in ngrams_list
             ]

    # --- Set default values for models present in ELO but potentially missing metrics ---
    default_metrics = {
        'avg_length': 0.0, 'vocab_complexity': 'N/A', 'slop_score': 'N/A',
        'repetition_score': 0.0, 'top_repetitive_words': [],
        'top_multi_prompt_bigrams': [], 'top_multi_prompt_trigrams': []
    }
    all_model_names = set(updated_elo_data.keys())
    for model_name in all_model_names:
        if model_name not in updated_elo_data: continue
        for key, default_value in default_metrics.items():
            updated_elo_data[model_name].setdefault(key, default_value)


    # --- Normalize ELO scores ---
    print("\nNormalizing ELO scores...")
    raw_elo_scores = {}
    for model_name, data in updated_elo_data.items():
        elo = data.get('elo')
        if isinstance(elo, (int, float)):
            raw_elo_scores[model_name] = elo

    anchor_models = {
        'deepseek/deepseek-r1': 1500,
        'meta-llama/llama-3.2-1b-instruct': 200
    }

    normalized_scores = normalize_elo_scores(raw_elo_scores, anchor_models)

    normalized_count = 0
    for model_name, normalized_elo in normalized_scores.items():
        if model_name in updated_elo_data:
            updated_elo_data[model_name]['normalized_elo'] = round(normalized_elo, 1)
            normalized_count += 1

    print(f"  Normalized ELO scores for {normalized_count} models using anchor models.")


    # --- Print CSV Results ---
    print("\n--- Aggregated Metrics Results (CSV Format) ---")
    print("model_name,elo_score,creative_writing_score,avg_length,vocab_complexity,slop_score,repetition_score")
    sorted_models = sorted(
        updated_elo_data.items(),
        key=lambda item: (
            item[1].get("normalized_elo", -float('inf'))
            if isinstance(item[1].get("normalized_elo"), (int, float))
            else (
                item[1].get("elo", -float('inf'))
                if isinstance(item[1].get("elo"), (int, float))
                else -float('inf')
            )
        ),
        reverse=True
    )

    for model_name, data in sorted_models:
        if model_name in MODELS_TO_IGNORE:
            continue
        updated_name = get_updated_model_name(model_name)
        elo = data.get('elo', 'N/A')
        elo_display = f"{elo:.1f}" if isinstance(elo, (int, float)) else 'N/A'

        norm_elo = data.get('normalized_elo', 'N/A')
        norm_elo_display = f"{norm_elo:.1f}" if isinstance(norm_elo, (int, float)) else 'N/A'

        # Calculate creative writing score from scratch
        creative_score, iterations = calculate_creative_writing_scores(runs_data, model_name)
        if iterations:  # If we got valid iterations data
            creative_score_display = f"{creative_score:.2f}"
            
            # Optionally, update the elo_data with our calculated score for consistency
            updated_elo_data[model_name]['calculated_creative_score'] = creative_score
        else:
            # Fall back to existing score if no scores were found
            creative_score = data.get('creative_writing_rubric_score_agg', 'N/A')
            creative_score_display = f"{creative_score:.2f}" if isinstance(creative_score, (int, float)) else 'N/A'

        avg_len = data.get('avg_length', 'N/A')
        avg_len_display = f"{avg_len:.0f}" if isinstance(avg_len, (int, float)) else 'N/A'

        vocab = data.get('vocab_complexity', 'N/A')
        vocab_display = f"{float(vocab):.2f}" if isinstance(vocab, (int, float)) else str(vocab)

        slop = data.get('slop_score', 'N/A')
        slop_display = f"{float(slop):.2f}" if isinstance(slop, (int, float)) else str(slop)

        repetition = data.get('repetition_score', 'N/A')
        repetition_display = f"{float(repetition):.2f}" if isinstance(repetition, (int, float)) else str(repetition)

        safe_model_name = f'"{updated_name}"' if ',' in updated_name else updated_name
        print(f"{safe_model_name},{norm_elo_display},{creative_score_display},{avg_len_display},{vocab_display},{slop_display},{repetition_display}")

    # --- ADDED: call our new combined Jaccard similarity function
    calculate_combined_jaccard_similarities(updated_elo_data, top_n=1500)

    # --- Save Updated ELO Data ---
    if save_updated_elo:
        print(f"\nSaving updated ELO data with metrics (and N-grams) to {ELO_RESULTS_UPDATED_FILE}...")
        try:
            with open(ELO_RESULTS_UPDATED_FILE, 'w', encoding='utf-8') as f:
                json.dump(updated_elo_data, f, indent=2, ensure_ascii=False)
            print("Save successful.")
        except IOError as e:
            print(f"Error saving updated ELO data to {ELO_RESULTS_UPDATED_FILE}: {e}")
        except TypeError as e:
             print(f"Error serializing updated ELO data to JSON: {e}. Check for non-serializable types.")


    # --- Generate and Print Slop Profile String ---
    if print_slop_profile:
        print("\n--- Generating Slop Profile String for JS ---")
        slop_profile_output = format_slop_profile_string(updated_elo_data)
        print("\n----- BEGIN SLOP PROFILE STRING -----")
        print(slop_profile_output)
        print("----- END SLOP PROFILE STRING -----\n")


def normalize_elo_scores(raw_scores, anchor_models=None):
    """
    Normalize ELO scores by anchoring specific models to predefined values.
    
    Args:
        raw_scores (dict): Dictionary of model names to raw ELO scores
        anchor_models (dict, optional): Dictionary mapping model names to their anchor values.
            Default: {'deepseek/deepseek-r1': 1500, 'mistralai/ministral-3b': 200}
            
    Returns:
        dict: Dictionary of model names to normalized ELO scores
    """
    if anchor_models is None:
        anchor_models = {
            'deepseek/deepseek-r1': 1500,
            'meta-llama/llama-3.2-1b-instruct': 200
        }
    
    # First check if we have at least two anchor models in our raw scores
    valid_anchors = {k: v for k, v in anchor_models.items() if k in raw_scores}
    
    if len(valid_anchors) < 2:
        print(f"Warning: Not enough anchor models found in scores. "
              f"Found {len(valid_anchors)} of {len(anchor_models)}. "
              f"Returning raw scores.")
        return {k: v for k, v in raw_scores.items()}
    
    # Get first two valid anchors to calculate normalization
    anchor_items = list(valid_anchors.items())
    model_a, target_a = anchor_items[0]
    model_b, target_b = anchor_items[1]
    
    raw_a = raw_scores[model_a]
    raw_b = raw_scores[model_b]
    
    # Avoid division by zero
    if raw_a == raw_b:
        scale = 1.0
    else:
        scale = (target_a - target_b) / (raw_a - raw_b)
    
    shift = target_a - (scale * raw_a)
    
    # Apply the transformation to all scores
    normalized_scores = {model: (score * scale + shift) for model, score in raw_scores.items()}
    
    return normalized_scores


# --- Main Execution Block ---
if __name__ == "__main__":
    # Ensure the results directory exists for saving reports
    os.makedirs("results", exist_ok=True)

    # 1. List available models (optional, uses updated names)
    print("--- Available Models ---")
    models = list_available_models()
    print("-" * 24)

    # 2. Calculate and print the aggregated metrics
    #    Set save_updated_elo=True to save to ELO_RESULTS_UPDATED_FILE
    calculate_and_print_metrics(save_updated_elo=True) # Set to True to save the file
    # print("-" * 24)

    # 3. Example: Generate and save reports for *all* models
    #    (using the updated generate_model_report function)
    
    print("\nGenerating and saving HTML reports for all models...")
    if models and True:
        for model in models:
            if model in MODELS_TO_IGNORE:
                continue
            print(f"Processing report for: {get_updated_model_name(model)}")
            try:
                save_model_report(model) # This now generates the report with themes/fonts
            except Exception as e:
                print(f"  ERROR generating report for {model}: {e}")
        print("\nFinished saving reports.")
    else:
        print("\nNo models found in ELO data to generate reports for.")

    # 4. Example: View a report directly in IPython/Jupyter (if available)
    # if models and 'IPython' in sys.modules:
    #     print(f"\nDisplaying report for {get_updated_model_name(models[0])} in IPython...")
    #     view_model_report(models[0]) # Display the first model's report
    # else:
    #      print("\nSkipping direct display (not in IPython or no models found).")

    print("\nScript finished.")


--- Available Models ---
Available models (sorted by ELO):
1. o3 (ELO: 1665)
2. deepseek-ai/DeepSeek-R1 (ELO: 1585)
3. chatgpt-4o-latest-2025-03-27 (ELO: 1455)
4. optimus-alpha (ELO: 1450)
5. deepseek-ai/DeepSeek-V3-0324 (ELO: 1444)
6. gpt-4.1 (ELO: 1411)
7. gemini-2.5-pro-exp-03-25 (ELO: 1401)
8. chatgpt-4o-latest-2025-01-29 (ELO: 1376)
9. quasar-alpha (ELO: 1376)
10. claude-3-5-sonnet-20241022 (ELO: 1362)
11. qwen/qwq-32b (ELO: 1358)
12. RekaAI/reka-flash-3 (ELO: 1352)
13. claude-3-7-sonnet-20250219 (ELO: 1341)
14. google/gemma-3-27b-it (ELO: 1263)
15. gpt-4.5-preview (ELO: 1218)
16. grok-3-beta (ELO: 1177)
17. anthropic/claude-3.5-haiku-20241022 (ELO: 1149)
18. CohereForAI/c4ai-command-a-03-2025 (ELO: 1143)
19. gpt-4.1-mini (ELO: 1141)
20. google/gemma-3-12b-it (ELO: 1122)
21. sam-paech/Darkest-muse-v1 (ELO: 1116)
22. gemini-2.0-flash-001 (ELO: 1110)
23. allura-org/Gemma-3-Glitter-12B (ELO: 1092)
24. google/gemma-3-4b-it (ELO: 1052)
25. ifable/gemma-2-Ifable-9B (ELO: 1018)
26. mistr

In [3]:
import os
import json
import subprocess
import tempfile
import shutil
from ete3 import Tree, TreeStyle, NodeStyle, TextFace, faces

import os
# Force Qt to run in 'offscreen' mode so ETE won't crash in headless environments
os.environ["QT_QPA_PLATFORM"] = "offscreen"

# -----------------------------
# GLOBAL FLAG: RUN CONSENSE?
# -----------------------------
RUN_CONSENSE = True  # set to True to produce a single consensus from multiple best trees

MODELS_TO_IGNORE = [
    'mistralai/ministral-3b',
    'ministral-3b',
    'google/gemma-3-4b-it:free',
    'unsloth/gemma-2-9b-it'
]



# --- Helper functions ---
def get_updated_model_name(original):
    """Get the updated model name if a substitution exists."""
    return MODEL_NAME_SUBS.get(original, original)

def sanitize_model_name(model_name):
    """Sanitize model name for use in filenames."""
    sanitized = model_name.replace("/", "__")
    unsafe_chars = r'<>:"|?*\\'
    for char in unsafe_chars:
        sanitized = sanitized.replace(char, '-')
    return sanitized

def layout_fn_with_highlight(node, focus_model_name=None, highlight_color="#FF0000"):
    """
    Colors nodes by model family and highlights the focal model.
    - Internal nodes: standard style
    - Leaf nodes: colored by family, with optional highlighting
    """
    if not node.is_leaf():
        # Internal node style
        style = NodeStyle()
        style["size"] = 0
        style["hz_line_width"] = 2
        style["vt_line_width"] = 2
        node.set_style(style)
        return

    # Leaf node: figure out color and label
    leaf_label = node.name

    # If the node has an 'original_name' (PHYLIP code), use that
    if hasattr(node, 'original_name'):
        leaf_label = node.original_name

    # Access family information
    family = model_to_family.get(leaf_label, "Other")
    circle_color = family_colors.get(family, "#cccccc")

    # Highlight this model if it's the focus
    if leaf_label == focus_model_name:
        circle_color = highlight_color
        text_face = TextFace(get_updated_model_name(leaf_label), fsize=10, fgcolor=highlight_color)
    else:
        text_face = TextFace(get_updated_model_name(leaf_label), fsize=10, fgcolor="black")

    # Attach label face to the branch
    faces.add_face_to_node(text_face, node, column=0, position="branch-right")

    # Circle style
    style = NodeStyle()
    style["size"] = 8
    style["fgcolor"] = circle_color
    style["shape"] = "circle"
    style["hz_line_width"] = 2
    style["vt_line_width"] = 2
    node.set_style(style)

def render_ete_tree_focus(ete_tree, focus_model_name, output_image, layout="c"):
    """
    Renders an ETE3 tree, highlighting one particular leaf.
    layout="c" => circular, layout="r" => rectangular
    """
    ts = TreeStyle()
    ts.mode = layout
    ts.show_leaf_name = False
    ts.show_branch_length = False
    ts.show_scale = False
    if layout == 'r':
        ts.branch_vertical_margin = 10
    
    # Set width based on layout type
    width = 1200 if layout == 'c' else 500

    def custom_layout(node):
        layout_fn_with_highlight(node, focus_model_name)

    ts.layout_fn = custom_layout
    ete_tree.render(output_image, w=width, units="px", tree_style=ts)
    print(f"Saved {layout.upper()} tree with highlight on '{get_updated_model_name(focus_model_name)}' => {output_image} (width: {width}px)")

def run_pars_command(env=None, timeout=300):
    """
    Executes PHYLIP 'pars' with minimal interactive input (i.e., 'Y' for defaults).
    If you want multiple jumbles, modify 'pars_input' below accordingly.
    """
    # Currently, 'pars_input = "Y\n"' just accepts all defaults.
    # If you want to enable jumbles (for multiple equally parsimonious trees), you could do:
    #   pars_input = "J\n5\n1\nY\nY\n"
    # and so on, adjusting each question asked by 'pars'.
    pars_input = "Y\n"

    result = subprocess.run(
        ["pars"],
        input=pars_input,
        text=True,
        capture_output=True,
        env=env,
        timeout=timeout
    )
    return result

def run_consense_command(env=None, timeout=300):
    """
    Executes PHYLIP 'consense' with defaults.
    """
    if os.path.exists("outfile"):
        os.remove("outfile")
    if os.path.exists("outtree"):
        os.remove("outtree")

    cons_input = "Y\n"
    result = subprocess.run(
        ["consense"],
        input=cons_input,
        text=True,
        capture_output=True,
        env=env,
        timeout=timeout
    )
    return result


def build_hybrid_parsimony_trees(
    elo_file="elo_results_with_metrics.json",
    top_n=1500,
    output_dir="results/hybrid_parsimony",
    phylip_path="/usr/local/bin"
):
    """
    1) Extract feature matrices from ELO data
    2) Create PHYLIP input with a translation table for model names
    3) Run parsimony analysis (pars)
    4) Optionally run consense if RUN_CONSENSE = True
    5) Load final tree into ETE3 with mapped model names
    6) Render multiple views (circular/rectangular) with highlighting
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    charts_dir = os.path.join(output_dir, "charts")
    if not os.path.exists(charts_dir):
        os.makedirs(charts_dir, exist_ok=True)

    # --- 1. Load and process ELO data ---
    print("Loading and processing feature data...")
    with open(elo_file, "r", encoding="utf-8") as f:
        elo_data = json.load(f)

    # Collect features for each model
    model_to_feats = {}
    for raw_name, info in elo_data.items():
        if raw_name in MODELS_TO_IGNORE:
            continue

        # Just an example with fewer features:
        w_count = 1000#100 #200
        b_count = 200#20
        t_count = 200#20
        #w_count = top_n // 3
        #b_count = top_n // 3
        #t_count = top_n // 3

        words = info.get("top_repetitive_words", [])
        bigrams = info.get("top_multi_prompt_bigrams", [])
        trigrams = info.get("top_multi_prompt_trigrams", [])

        sorted_w = sorted(words, key=lambda x: x.get('score', 0), reverse=True)[:w_count]
        sorted_b = sorted(bigrams, key=lambda x: x.get('frequency', 0), reverse=True)[:b_count]
        sorted_t = sorted(trigrams, key=lambda x: x.get('frequency', 0), reverse=True)[:t_count]

        top_words = [x["word"] for x in sorted_w]
        top_bigs = [x["ngram"] for x in sorted_b]
        top_tris = [x["ngram"] for x in sorted_t]

        combined_feats = set(top_words + top_bigs + top_tris)
        if combined_feats:
            model_to_feats[raw_name] = combined_feats

    all_models = sorted(model_to_feats.keys())
    if len(all_models) < 2:
        print("Not enough models with features to build a parsimony tree.")
        return

    print(f"Found {len(all_models)} models with feature data.")

    # Build global vocabulary
    global_vocab = set()
    for feats in model_to_feats.values():
        global_vocab.update(feats)
    global_vocab = sorted(global_vocab)
    print(f"Total feature space size: {len(global_vocab)} features")

    # --- 2. Create PHYLIP input file ---
    print("Creating PHYLIP input files...")
    temp_dir = tempfile.mkdtemp(prefix="phylip_")
    try:
        code_to_model = {}
        model_to_code = {}
        for i, model in enumerate(all_models):
            code = f"M{i+1:04d}"  # M0001, M0002, ...
            code_to_model[code] = model
            model_to_code[model] = code

        translation_file = os.path.join(output_dir, "model_codes.json")
        with open(translation_file, "w", encoding="utf-8") as f:
            json.dump({"code_to_model": code_to_model, "model_to_code": model_to_code},
                      f, indent=2, ensure_ascii=False)

        phylip_file = os.path.join(temp_dir, "infile")
        with open(phylip_file, "w") as f:
            n_taxa = len(all_models)
            n_chars = len(global_vocab)
            f.write(f"{n_taxa} {n_chars}\n")
            for model in all_models:
                code = model_to_code[model]
                feats = model_to_feats[model]
                bits = ["1" if feat in feats else "0" for feat in global_vocab]
                bitstring = "".join(bits)
                f.write(f"{code.ljust(10)} {bitstring}\n")

        # --- 3. Run PHYLIP parsimony analysis ---
        print("Running PHYLIP parsimony analysis...")
        current_dir = os.getcwd()
        output_dir_abs = os.path.abspath(output_dir)
        charts_dir_abs = os.path.join(output_dir_abs, "charts")
        os.makedirs(output_dir_abs, exist_ok=True)
        os.makedirs(charts_dir_abs, exist_ok=True)

        os.chdir(temp_dir)
        try:
            env = os.environ.copy()
            if phylip_path and os.path.exists(phylip_path):
                env["PATH"] = f"{phylip_path}:{env.get('PATH', '')}"
                print(f"Added {phylip_path} to PATH")

            # Also try the original location
            original_phylip_path = "/usr/lib/phylip/bin"
            if os.path.exists(original_phylip_path):
                env["PATH"] = f"{original_phylip_path}:{env.get('PATH', '')}"
                print(f"Added {original_phylip_path} to PATH")

            # Check if 'pars' is in PATH
            which_result = subprocess.run(["which", "pars"],
                                          capture_output=True,
                                          text=True,
                                          env=env)
            pars_path = which_result.stdout.strip()
            if not pars_path:
                print("ERROR: Could not find 'pars' executable in the PATH.")
                print("Falling back to hierarchical clustering.")
                return

            print(f"Using pars from: {pars_path}")

            # Actually run pars
            result = run_pars_command(env=env, timeout=300)
            print(f"PARS STDOUT (first 500 chars):\n{result.stdout[:500]}...")
            if result.stderr:
                print(f"PARS STDERR:\n{result.stderr}")

            if result.returncode != 0:
                print(f"PHYLIP Error: pars returned code {result.returncode}")
                print("Falling back to hierarchical clustering approach.")
                return

            # See if outfile/outtree exist
            print("Files after running pars:")
            for file in os.listdir('.'):
                print(f" - {file} ({os.path.getsize(file)} bytes)")

            # Copy output files to final location
            output_files_found = False
            for file in ["outfile", "outtree"]:
                if os.path.exists(file):
                    output_files_found = True
                    dest_file = os.path.join(output_dir_abs, file)
                    shutil.copy(file, dest_file)
                    print(f"Copied {file} to {dest_file}")

            # Save a copy of infile
            shutil.copy("infile", os.path.join(output_dir_abs, "infile"))

            if not output_files_found:
                print("ERROR: No output files created by PHYLIP 'pars'.")
                print("Falling back to hierarchical clustering.")
                return

            print("PHYLIP parsimony finished successfully.")

            # ------------------------------
            # Optional: run consense
            # ------------------------------
            final_tree_path = os.path.join(output_dir_abs, "outtree")  # default
            if RUN_CONSENSE:
                print("RUN_CONSENSE = True, so we'll run 'consense' to build a consensus tree.")

                # Move 'outtree' → 'intree' for consense
                intree_path = os.path.join(output_dir_abs, "intree")
                if os.path.exists(intree_path):
                    os.remove(intree_path)
                shutil.copy(final_tree_path, intree_path)

                # Run 'consense' in the same temp_dir context
                # Because consense also expects an "intree" in the working directory
                # We'll rename in the temp_dir, run consense, then copy results out
                # So copy "intree" from output_dir_abs → temp_dir
                intree_local = os.path.join(temp_dir, "intree")
                if os.path.exists(intree_local):
                    os.remove(intree_local)
                shutil.copy(intree_path, intree_local)

                which_cons = subprocess.run(["which", "consense"], capture_output=True, text=True, env=env)
                cons_path = which_cons.stdout.strip()
                if not cons_path:
                    print("ERROR: 'consense' not found in PATH. Skipping consensus.")
                else:
                    print(f"Running consense from: {cons_path}")                    
                    res_cons = run_consense_command(env=env, timeout=300)
                    print(f"CONSENSE STDOUT (first 500 chars):\n{res_cons.stdout[:500]}...")
                    if res_cons.stderr:
                        print(f"CONSENSE STDERR:\n{res_cons.stderr}")

                    if res_cons.returncode != 0:
                        print(f"ERROR: consense returned code {res_cons.returncode}")
                    else:
                        # consense writes 'outfile' and 'outtree'
                        # We'll rename 'outtree' → 'outtree_consensus'
                        outtree_cons_path = os.path.join(temp_dir, "outtree")
                        if os.path.exists(outtree_cons_path):
                            consensus_dest = os.path.join(output_dir_abs, "outtree_consensus")
                            shutil.copy(outtree_cons_path, consensus_dest)
                            print(f"Consensus tree => {consensus_dest}")
                            final_tree_path = consensus_dest  # the final we parse

        finally:
            os.chdir(current_dir)

        # --- 4. Load final tree into ETE3 ---
        if not os.path.exists(final_tree_path):
            print(f"Error: final tree not found: {final_tree_path}")
            return

        # PHYLIP format=1 typically means branch lengths + supports are recognized
        tree = Tree(final_tree_path, format=1)

        # Map short codes back to original names
        for leaf in tree.get_leaves():
            original_code = leaf.name.strip()
            if original_code in code_to_model:
                leaf.original_name = code_to_model[original_code]
                leaf.name = code_to_model[original_code]
            else:
                # Might be an already-correct name or something else
                pass

        # Basic tree
        basic_tree_path = os.path.join(output_dir, "parsimony_tree_basic.png")
        ts = TreeStyle()
        ts.show_leaf_name = False
        ts.mode = "r"

        def basic_layout(node):
            layout_fn_with_highlight(node)

        ts.layout_fn = basic_layout
        tree.render(basic_tree_path, w=800, units="px", tree_style=ts)
        print(f"Saved basic tree: {basic_tree_path}")

        # --- 5. Per-model visualizations
        print("Generating per-model visualizations...")
        for model_name in all_models:
            updated_model = get_updated_model_name(model_name)
            sanitized = sanitize_model_name(updated_model)

            # Circular layout
            circ_png = os.path.join(charts_dir, f"{sanitized}__phylo_tree_parsimony_circular.png")
            render_ete_tree_focus(tree, model_name, circ_png, layout="c")

            # Rectangular layout
            rect_png = os.path.join(charts_dir, f"{sanitized}__phylo_tree_parsimony_rectangular.png")
            render_ete_tree_focus(tree, model_name, rect_png, layout="r")

        # Also save as Nexus
        nexus_path = os.path.join(output_dir, "parsimony_tree.nex")
        with open(nexus_path, "w") as f:
            f.write("#NEXUS\nBEGIN TREES;\n")
            f.write(f"  TREE parsimony = {tree.write(format=8)};\n")
            f.write("END;\n")

        print(f"Success! Generated circular & rectangular phylogenetic trees for {len(all_models)} models.")
        if RUN_CONSENSE:
            print(f"(Used CONSENSE for a final consensus tree.)")
        print(f"Reference files in: {output_dir}")
        print(f"Per-model visualizations in: {charts_dir}")

    finally:
        shutil.rmtree(temp_dir)


def build_fallback_hierarchical_trees(
    elo_file="elo_results_with_metrics.json",
    top_n=1500,
    output_dir="results/hierarchical_clustering"
):
    """
    Fallback method that uses scipy for hierarchical clustering.
    """
    import pandas as pd
    from scipy.spatial.distance import pdist
    from scipy.cluster.hierarchy import linkage, to_tree

    print("Using fallback hierarchical clustering method...")

    output_dir_abs = os.path.abspath(output_dir)
    if not os.path.exists(output_dir_abs):
        os.makedirs(output_dir_abs, exist_ok=True)

    charts_dir_abs = os.path.join(output_dir_abs, "charts")
    if not os.path.exists(charts_dir_abs):
        os.makedirs(charts_dir_abs, exist_ok=True)

    print("Loading and processing feature data...")
    with open(elo_file, "r", encoding="utf-8") as f:
        elo_data = json.load(f)

    model_to_feats = {}
    for raw_name, info in elo_data.items():
        if raw_name in MODELS_TO_IGNORE:
            continue

        w_count = top_n // 3
        b_count = top_n // 3
        t_count = top_n // 3

        words = info.get("top_repetitive_words", [])
        bigrams = info.get("top_multi_prompt_bigrams", [])
        trigrams = info.get("top_multi_prompt_trigrams", [])

        sorted_w = sorted(words, key=lambda x: x.get('score', 0), reverse=True)[:w_count]
        sorted_b = sorted(bigrams, key=lambda x: x.get('frequency', 0), reverse=True)[:b_count]
        sorted_t = sorted(trigrams, key=lambda x: x.get('frequency', 0), reverse=True)[:t_count]

        top_words = [x["word"] for x in sorted_w]
        top_bigs = [x["ngram"] for x in sorted_b]
        top_tris = [x["ngram"] for x in sorted_t]

        combined_feats = set(top_words + top_bigs + top_tris)
        if combined_feats:
            model_to_feats[raw_name] = combined_feats

    all_models = sorted(model_to_feats.keys())
    if len(all_models) < 2:
        print("Not enough models with features to build a tree.")
        return

    print(f"Found {len(all_models)} models with feature data.")

    global_vocab = set()
    for feats in model_to_feats.values():
        global_vocab.update(feats)
    global_vocab = sorted(global_vocab)
    print(f"Total feature space size: {len(global_vocab)} features")

    print("Building feature matrix...")
    df = pd.DataFrame(0, index=all_models, columns=global_vocab, dtype=int)
    for m in all_models:
        for ft in model_to_feats[m]:
            if ft in df.columns:
                df.loc[m, ft] = 1

    print("Performing hierarchical clustering...")
    dist_matrix = pdist(df.values, metric='jaccard')
    linked = linkage(dist_matrix, method='complete')

    root_node = to_tree(linked, rd=False)
    ete_tree = Tree()

    def scipy_cluster_to_ete(scipy_node, ete_parent, id_to_label):
        if scipy_node.is_leaf():
            leaf_name = id_to_label[scipy_node.id]
            ete_parent.name = leaf_name
        else:
            left_child = ete_parent.add_child()
            scipy_cluster_to_ete(scipy_node.left, left_child, id_to_label)
            right_child = ete_parent.add_child()
            scipy_cluster_to_ete(scipy_node.right, right_child, id_to_label)

    id_to_label = dict(enumerate(df.index))
    scipy_cluster_to_ete(root_node, ete_tree, id_to_label)

    basic_tree_path = os.path.join(output_dir_abs, "hierarchical_tree_basic.png")
    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.mode = "r"

    def basic_layout(node):
        layout_fn_with_highlight(node)

    ts.layout_fn = basic_layout
    ete_tree.render(basic_tree_path, w=800, units="px", tree_style=ts)
    print(f"Saved basic tree: {basic_tree_path}")

    print("Generating per-model visualizations...")
    for model_name in all_models:
        updated_model = get_updated_model_name(model_name)
        sanitized = sanitize_model_name(updated_model)

        circ_png = os.path.join(charts_dir_abs, f"{sanitized}__hierarchical_circular.png")
        render_ete_tree_focus(ete_tree, model_name, circ_png, layout="c")

        rect_png = os.path.join(charts_dir_abs, f"{sanitized}__hierarchical_rectangular.png")
        render_ete_tree_focus(ete_tree, model_name, rect_png, layout="r")

    print(f"Success! Generated circular & rectangular hierarchical trees for {len(all_models)} models.")
    print(f"Reference files in: {output_dir_abs}")
    print(f"Per-model visualizations in: {charts_dir_abs}")
    return ete_tree


if __name__ == "__main__":
    # Try to find phylip installation automatically
    possible_phylip_paths = [
        "/usr/lib/phylip/bin",
        "/usr/local/bin",
        "/opt/homebrew/bin",
        "/usr/local/phylip",
        os.path.expanduser("~/phylip")
    ]
    existing_paths = [p for p in possible_phylip_paths if os.path.exists(p)]
    phylip_path = None
    for path in existing_paths:
        if os.path.exists(os.path.join(path, "pars")):
            phylip_path = path
            print(f"Found PHYLIP at: {phylip_path}")
            break

    if phylip_path:
        print(f"Attempting parsimony trees using PHYLIP from {phylip_path}...")
        build_hybrid_parsimony_trees(
            elo_file="elo_results_with_metrics.json",
            top_n=1500,
            output_dir="results/hybrid_parsimony",
            phylip_path=phylip_path
        )
    else:
        print("Could not locate PHYLIP installation. Searched:")
        for p in possible_phylip_paths:
            print(f" - {p}")

    #print("\nAttempting fallback hierarchical clustering method...")
    #build_fallback_hierarchical_trees(
    #    elo_file="elo_results_with_metrics.json",
    #    top_n=1500,
    #    output_dir="results/hierarchical_clustering"
    #)


Found PHYLIP at: /usr/lib/phylip/bin
Attempting parsimony trees using PHYLIP from /usr/lib/phylip/bin...
Loading and processing feature data...
Found 46 models with feature data.
Total feature space size: 13468 features
Creating PHYLIP input files...
Running PHYLIP parsimony analysis...
Added /usr/lib/phylip/bin to PATH
Added /usr/lib/phylip/bin to PATH
Using pars from: /usr/lib/phylip/bin/pars
PARS STDOUT (first 500 chars):
[2J[H[2J[H
Discrete character parsimony algorithm, version 3.697

Setting for this run:
  U                 Search for best tree?  Yes
  S                        Search option?  More thorough search
  V              Number of trees to save?  100
  J     Randomize input order of species?  No. Use input order
  O                        Outgroup root?  No, use as outgroup species 1
  T              Use Threshold parsimony?  No, use ordinary parsimony
  W                       Sites weighted?  N...
Files after running pars:
 - outtree (992 bytes)
 - outfile (7225 b

QStandardPaths: XDG_RUNTIME_DIR not set, defaulting to '/tmp/runtime-sam'


Saved basic tree: results/hybrid_parsimony/parsimony_tree_basic.png
Generating per-model visualizations...
Saved C tree with highlight on 'ToastyPigeon/Gemma-3-Starshine-12B' => results/hybrid_parsimony/charts/ToastyPigeon__Gemma-3-Starshine-12B__phylo_tree_parsimony_circular.png (width: 1200px)
Saved R tree with highlight on 'ToastyPigeon/Gemma-3-Starshine-12B' => results/hybrid_parsimony/charts/ToastyPigeon__Gemma-3-Starshine-12B__phylo_tree_parsimony_rectangular.png (width: 500px)
Saved C tree with highlight on 'allura-org/Gemma-3-Glitter-12B' => results/hybrid_parsimony/charts/allura-org__Gemma-3-Glitter-12B__phylo_tree_parsimony_circular.png (width: 1200px)
Saved R tree with highlight on 'allura-org/Gemma-3-Glitter-12B' => results/hybrid_parsimony/charts/allura-org__Gemma-3-Glitter-12B__phylo_tree_parsimony_rectangular.png (width: 500px)
Saved C tree with highlight on 'anthropic/claude-3-haiku' => results/hybrid_parsimony/charts/anthropic__claude-3-haiku__phylo_tree_parsimony_circ

In [4]:
import os
import json
import pandas as pd
import numpy as np
from collections import defaultdict
import math

def build_creative_criteria_df(
    runs_file_path="creative_bench_runs.json",
    min_occurrences=5,
    n_neighbors=6
):
    """
    Reads the runs JSON, extracts per-model creative criteria scores, applies transformations
    (inverting negative criteria, combining certain criteria, etc.), calculates relative
    scores, and returns a DataFrame containing both absolute and relative columns.

    Returns:
        pd.DataFrame: Columns = ['model', 'overall_score', <absolute criteria...>, <relative_criteria...>]
    """
    # -- 1) Load JSON data --
    if not os.path.exists(runs_file_path):
        print(f"File not found: {runs_file_path}")
        return pd.DataFrame()

    with open(runs_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if not data:
        print(f"No data in file: {runs_file_path}")
        return pd.DataFrame()

    # -- 2) Functions from your pipeline, condensed --

    def extract_model_results(data_dict):
        """
        For each model run, gather:
          - overall_score (eqbench_creative_score)
          - criteria_aggregates = mean of all judge_scores per criterion
        Returns {model_name: {overall_score, criteria_aggregates:{crit: val}}}.
        """
        models_data = {}
        for run_key, run_data in data_dict.items():
            model_name = run_data.get("test_model", run_key)
            overall_score = run_data.get('results', {}).get('benchmark_results', {}).get('eqbench_creative_score', None)

            criteria_scores = defaultdict(list)
            if 'creative_tasks' in run_data:
                for iteration_dict in run_data['creative_tasks'].values():
                    for prompt_info in iteration_dict.values():
                        for mod_data in prompt_info.get('results_by_modifier', {}).values():
                            if 'judge_scores' in mod_data:
                                for crit, score in mod_data['judge_scores'].items():
                                    # Coerce to float if numeric and check that score <= 20
                                    try:
                                        score_value = float(score)
                                        if score_value <= 20:  # Only include scores that are <= 20
                                            criteria_scores[crit].append(score_value)
                                    except (ValueError, TypeError):
                                        pass

            criteria_aggregates = {
                c: np.mean(scores) if scores else np.nan
                for c, scores in criteria_scores.items()
            }

            models_data[model_name] = {
                'overall_score': overall_score,
                'criteria_aggregates': criteria_aggregates
            }
        return models_data

    def create_models_df(models_data_dict, min_occ=5):
        """
        Creates an initial DataFrame with columns:
          model, overall_score, and each included criterion (that meets min occurrences).
        """
        # List of included criteria (original approach)
        included_criteria = [
            "Adherence to Instructions",
            "Believable Character Actions",
            "Nuanced Characters",
            "Consistent Voice/Tone of Writing",
            "Imagery and Descriptive Quality",
            "Elegant Prose",
            "Emotionally Engaging",
            "Emotionally Complex",
            "Coherent",
            "Meandering",
            "Weak Dialogue",
            "Tell-Don't-Show",
            "Unsurprising or Uncreative",
            "Amateurish",
            "Purple Prose",
            "Overwrought",
            "Incongruent Ending Positivity",
            "Unearned Transformations",
            "Well-earned Lightness or Darkness",
            "Sentences Flow Naturally",
            "Overall Reader Engagement",
            "Overall Impression"
        ]

        # Count occurrences
        criteria_counts = defaultdict(int)
        all_criteria_in_data = set()
        for m_data in models_data_dict.values():
            for c in m_data['criteria_aggregates'].keys():
                all_criteria_in_data.add(c)
                # see if it's in included_criteria (case-insensitive match)
                for inc in included_criteria:
                    if c.lower() == inc.lower():
                        criteria_counts[c] += 1
                        break

        valid_criteria = {
            c for c in criteria_counts
            if criteria_counts[c] >= min_occ
        }

        # Build rows
        rows = []
        for model, model_data in models_data_dict.items():
            row = {
                'model': model,
                'overall_score': model_data['overall_score']
            }
            for c in valid_criteria:
                row[c] = model_data['criteria_aggregates'].get(c, np.nan)
            rows.append(row)

        df = pd.DataFrame(rows)
        df = df.dropna(subset=['overall_score'])
        if not df.empty:
            df = df.sort_values('overall_score', ascending=False).reset_index(drop=True)

        return df

    def preprocess_criteria_scores(df):
        """
        Invert negative criteria (20 - score).
        """
        negative_criteria = [
            "Unearned Transformations",
            "Incongruent Ending Positivity",
            "Overwrought",
            "Purple Prose",
            "Amateurish",
            "Unsurprising or Uncreative",
            "Tell-Don't-Show",
            "Weak Dialogue",
            "Meandering"
        ]
        processed_df = df.copy()
        inverted_map = {}
        for col in processed_df.columns:
            if col not in ['model', 'overall_score']:
                # check case-insensitive match
                if any(n.lower() == col.lower() for n in negative_criteria):
                    processed_df[col] = 20 - processed_df[col]
                    new_col = f"Inverted_{col}"
                    processed_df.rename(columns={col: new_col}, inplace=True)
                    inverted_map[col] = new_col
        return processed_df, inverted_map

    def transform_criteria(df, inverted_map, apply_transformations=True):
        """
        Rename, combine, ignore certain criteria.
        """
        if not apply_transformations:
            return df

        transformed_df = df.copy()

        # Criteria to ignore
        ignore_criteria = ["Overall Impression", "Overall Reader Engagement"]
        to_drop = []
        for crit in ignore_criteria:
            # check if it's in the DF or if it was inverted
            if crit in transformed_df.columns:
                to_drop.append(crit)
            elif crit in inverted_map and inverted_map[crit] in transformed_df.columns:
                to_drop.append(inverted_map[crit])

        if to_drop:
            transformed_df.drop(columns=to_drop, inplace=True, errors='ignore')

        # Rename map
        rename_map = {
            "Inverted_Weak Dialogue": "Strong Dialogue",
            "Inverted_Tell-Don't-Show": "Show-Don't-Tell",
            "Inverted_Unsurprising or Uncreative": "Creativity",
            "Inverted_Amateurish": "Avoids Amateurish Prose",
            "Adherence to Instructions": "Instruction Following",
            "Inverted_Meandering": "Pacing",
            "Imagery and Descriptive Quality": "Descriptive Imagery",
            "Consistent Voice/Tone of Writing": "Consistent Voice & Tone",
            "Sentences Flow Naturally": "Sentence Flow",
        }
        for old_name, new_name in rename_map.items():
            if old_name in transformed_df.columns:
                transformed_df.rename(columns={old_name: new_name}, inplace=True)

        # Combine
        combinations = {
            "Emotional Depth": ["Emotionally Complex", "Emotionally Engaging"],
            "Avoids Positivity Bias": ["Well-earned Lightness or Darkness", "Inverted_Unearned Transformations", "Inverted_Incongruent Ending Positivity"],
            "Avoids Purple Prose": ["Inverted_Overwrought", "Inverted_Purple Prose"],
            "Believable Characters": ["Nuanced Characters", "Believable Character Actions"],
        }
        drop_after = []
        for new_name, src_cols in combinations.items():
            existing_cols = [c for c in src_cols if c in transformed_df.columns]
            if not existing_cols:
                continue
            transformed_df[new_name] = transformed_df[existing_cols].mean(axis=1, skipna=True)
            drop_after.extend(existing_cols)
        # drop the combined ones
        drop_after = list(set(drop_after))
        for col in drop_after:
            if col in transformed_df.columns and col not in combinations:
                # ensure we don't accidentally drop the newly created column
                if col not in combinations.keys():
                    transformed_df.drop(columns=[col], inplace=True, errors='ignore')

        return transformed_df

    def calculate_relative_scores(df, n_neighbors=3):
        """
        For each model, compare each criterion to the average among the N neighbors above/below it.
        Adds columns named 'relative_<criterion>'.
        """
        if len(df) < 2:
            # not enough models
            base = df[['model', 'overall_score']].copy()
            return base

        # ensure sorted
        sorted_df = df.sort_values('overall_score', ascending=False).reset_index(drop=True)
        criteria_cols = [c for c in sorted_df.columns if c not in ['model', 'overall_score']]

        relative_df = sorted_df[['model', 'overall_score']].copy()

        for idx, row in sorted_df.iterrows():
            start_i = max(0, idx - n_neighbors)
            end_i = min(len(sorted_df) - 1, idx + n_neighbors)
            neighbor_indices = [i for i in range(start_i, end_i + 1) if i != idx]

            for crit in criteria_cols:
                cur_val = row[crit]
                if pd.isna(cur_val):
                    rel_val = np.nan
                else:
                    neighbor_vals = sorted_df.loc[neighbor_indices, crit].dropna()
                    if len(neighbor_vals) > 0:
                        rel_val = cur_val - neighbor_vals.mean()
                    else:
                        rel_val = np.nan
                relative_df.loc[idx, f'relative_{crit}'] = rel_val

        return relative_df

    # -- 3) Build the DataFrame --
    # Step A: extract per-model aggregates
    models_data_dict = extract_model_results(data)

    # Step B: initial DF
    df_raw = create_models_df(models_data_dict, min_occ=min_occurrences)
    if df_raw.empty:
        print("No data after creating models DF. Returning empty DataFrame.")
        return df_raw

    # Step C: invert negative
    df_processed, inverted_map = preprocess_criteria_scores(df_raw)

    # Step D: transform
    df_transformed = transform_criteria(df_processed, inverted_map, apply_transformations=True)

    # Step E: relative
    df_rel = calculate_relative_scores(df_transformed, n_neighbors=n_neighbors)

    # Step F: merge
    full_df = pd.merge(
        df_transformed, df_rel,
        on=['model', 'overall_score'],
        how='left'
    )

    return full_df


def format_model_charts_js_object(
    full_df,
    top_n=5,
):
    """
    Creates a single JS object definition of the form:

        const chartData = {
          "DisplayModelName": {
            "absoluteRadar": {
              "labels": [...],
              "values": [...]
            },
            "relativeRadarLog": {
              "labels": [...],
              "values": [...]
            },
            "strengths": [...],
            "weaknesses": [...]
          },
          "OtherModelName": { ... }
        };

    - Skips any model in MODELS_TO_IGNORE.
    - Renames model with MODEL_NAME_SUBS (model -> substituted name).
    - top_n controls how many strengths/weaknesses to include.
    - Rounds all numeric values to 2 decimals.
    - Advanced normalization: Sets min->-1, median->0, max->1 with proportional scaling.

    Args:
        full_df (pd.DataFrame): DataFrame containing absolute & relative criteria columns
                                (like from build_creative_criteria_df()).
        top_n (int): How many top/bottom items to list as strengths/weaknesses.
        MODEL_NAME_SUBS (dict): e.g. {"openai/gpt-4":"GPT-4"}
        MODELS_TO_IGNORE (list): e.g. ["mistralai/ministral-3b"]

    Returns:
        str: Valid JS code as a string:
             const chartData = { "ModelA": {...}, "ModelB": {...} };
    """
    import json
    import math
    import numpy as np

    def get_updated_model_name(original: str) -> str:
        # Same logic you use in your code:
        return MODEL_NAME_SUBS.get(original, original)

    def signed_log(x):
        """Signed log10 transform: sign(x)*log10(|x| + 1)."""
        if x == 0:
            return 0.0
        return math.copysign(math.log10(abs(x) + 1), x)

    # Identify columns
    absolute_cols = [
        c for c in full_df.columns
        if c not in ["model", "overall_score"] and not c.startswith("relative_")
    ]
    relative_cols = [c for c in full_df.columns if c.startswith("relative_")]

    # Build a Python dict to represent the final JS object
    chart_data_dict = {}

    for _, row in full_df.iterrows():
        original_name = row["model"]
        if original_name in MODELS_TO_IGNORE:
            continue

        display_name = get_updated_model_name(original_name)

        # (1) Gather absolute radar data
        abs_labels = []
        abs_values = []
        for col in absolute_cols:
            val = row[col]
            if pd.isna(val):
                continue
            abs_labels.append(col)
            abs_values.append(round(float(val), 2))  # round to 2 decimals

        # (2) Gather relative radar (log scale) data
        rel_labels = []
        rel_values_log = []
        for col in relative_cols:
            val = row[col]
            if pd.isna(val):
                continue
            base_name = col.replace("relative_", "")
            log_val = signed_log(float(val))
            rel_labels.append(base_name)
            rel_values_log.append(round(log_val, 2))  # round to 2 decimals

        # (3) Strengths & Weaknesses (raw relative values)
        rel_pairs = []
        for col in relative_cols:
            val = row[col]
            if pd.isna(val):
                continue
            base_name = col.replace("relative_", "")
            rel_pairs.append((base_name, float(val)))
        
        # Skip if not enough relative pairs
        if len(rel_pairs) < 3:  # Need at least 3 values for min/median/max
            continue
            
        # Sort the values for finding median
        sorted_values = sorted([pair[1] for pair in rel_pairs])
        
        # Find the min, max, and median values
        min_value = sorted_values[0]
        max_value = sorted_values[-1]
        
        # Find the median (middle value with equal number above and below)
        n = len(sorted_values)
        if n % 2 == 1:  # Odd number of values
            median_value = sorted_values[n // 2]
        else:  # Even number of values
            median_value = (sorted_values[n // 2 - 1] + sorted_values[n // 2]) / 2
        
        # Create normalized pairs with min->-1, median->0, max->1
        normalized_pairs = []
        
        # Handle edge cases
        if min_value == max_value:  # All values are the same
            normalized_pairs = [(crit, 0.0) for crit, val in rel_pairs]
        elif min_value == median_value == max_value:  # All values are the same
            normalized_pairs = [(crit, 0.0) for crit, val in rel_pairs]
        elif min_value == median_value:  # Only two distinct values
            normalized_pairs = [
                (crit, -1.0 if val == min_value else 1.0) 
                for crit, val in rel_pairs
            ]
        elif median_value == max_value:  # Only two distinct values
            normalized_pairs = [
                (crit, -1.0 if val == min_value else 1.0) 
                for crit, val in rel_pairs
            ]
        else:
            # Normal case: three different anchor points
            for criterion, value in rel_pairs:
                if value <= median_value:
                    # Scale from min (-1) to median (0)
                    if median_value > min_value:  # Avoid division by zero
                        norm_val = -1.0 + (value - min_value) * (1.0) / (median_value - min_value)
                    else:
                        norm_val = -1.0
                else:
                    # Scale from median (0) to max (1)
                    if max_value > median_value:  # Avoid division by zero
                        norm_val = 0.0 + (value - median_value) * (1.0) / (max_value - median_value)
                    else:
                        norm_val = 1.0
                        
                normalized_pairs.append((criterion, norm_val))
            
        # Sort by normalized value (ascending)
        normalized_pairs.sort(key=lambda x: x[1])
        
        # Get top_n weaknesses (lowest values) and top_n strengths (highest values)
        weaknesses = normalized_pairs[:top_n]
        strengths = normalized_pairs[-top_n:]
        strengths.reverse()  # Show highest first
        
        # Round normalized values to 2 decimal places for output
        strengths_list = [
            {"criterion": crit, "relativeScore": round(val, 2)}
            for (crit, val) in strengths
        ]
        weaknesses_list = [
            {"criterion": crit, "relativeScore": round(val, 2)}
            for (crit, val) in weaknesses
        ]

        # Assemble the data structure for this model
        chart_data_dict[display_name] = {
            "absoluteRadar": {
                "labels": abs_labels,
                "values": abs_values
            },
            "relativeRadarLog": {
                "labels": rel_labels,
                "values": rel_values_log
            },
            "strengths": strengths_list,
            "weaknesses": weaknesses_list
        }

    # Now produce valid JS code
    js_object_str = json.dumps(chart_data_dict, indent=2)

    # Return the final code as a string
    return f"const chartData = {js_object_str};"


df = build_creative_criteria_df(
    runs_file_path="creative_bench_runs.json",
    #runs_file_path="repro_testing.json",
    min_occurrences=5,
    n_neighbors=10
)
chart_data_str = format_model_charts_js_object(df, top_n=5)
print(chart_data_str)


const chartData = {
  "o3": {
    "absoluteRadar": {
      "labels": [
        "Show-Don't-Tell",
        "Sentence Flow",
        "Strong Dialogue",
        "Coherent",
        "Pacing",
        "Instruction Following",
        "Elegant Prose",
        "Creativity",
        "Consistent Voice & Tone",
        "Descriptive Imagery",
        "Avoids Amateurish Prose",
        "Emotional Depth",
        "Avoids Positivity Bias",
        "Avoids Purple Prose",
        "Believable Characters"
      ],
      "values": [
        17.11,
        17.75,
        16.87,
        18.73,
        17.0,
        19.09,
        17.12,
        15.95,
        18.49,
        17.95,
        17.69,
        16.81,
        18.04,
        16.57,
        17.58
      ]
    },
    "relativeRadarLog": {
      "labels": [
        "Show-Don't-Tell",
        "Sentence Flow",
        "Strong Dialogue",
        "Coherent",
        "Pacing",
        "Instruction Following",
        "Elegant Prose",
        "Creativity",
 