## Setup
To evaluate large language model (LLM) performance at scale, we leveraged batch inference APIs from all three model providers included in the experiments—OpenAI, Mistral, and Anthropic—across all design decision variations.
For each provider, a systematic procedure was implemented to generate, upload, and execute batch requests for both base and fine-tuned models, adhering to the respective provider’s API documentation.

#### Imports
See `requirements.txt` for full dependency versions.

In [None]:
import os
import json
import pandas as pd
import math
import glob
import re

from openai import OpenAI
from mistralai import Mistral
from anthropic import Anthropic
from anthropic.types.message_create_params import MessageCreateParamsNonStreaming
from anthropic.types.messages.batch_create_params import Request
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

#### Global Paths, Directories, and Variables

In [None]:
# Demo study root: the parent of the current working directory
DEMO_PATH = os.path.abspath("..")

# Folder holding the API key .txt files. It is taken from the API_KEY_DIR
# environment variable; otherwise replace 'PATH' with the folder path.
API_KEY_DIR = os.path.expanduser(os.environ.get("API_KEY_DIR", 'PATH'))

# Project folders and the validation data set
LLM_API = os.path.join(DEMO_PATH, 'LLM_API')
PROMPT_DIR = os.path.join(LLM_API, 'prompt_templates')
VAL_PATH = os.path.join(DEMO_PATH, 'training_validation_data', 'demo_app_updates_validation_real_1000.csv')

# Folders for the generated batch request files
# (the Anthropic folder is also the one scanned for result files later on)
OPENAI_BATCH_DIR = os.path.join(LLM_API, 'OpenAI_batches', 'raw')
MISTRAL_BATCH_DIR = os.path.join(LLM_API, 'Mistral_batches', 'raw')
ANTHROPIC_BATCH_DIR = os.path.join(LLM_API, 'Anthropic_batches')

# Folders for the downloaded batch result files
OPENAI_BATCH_RESULTS_DIR = os.path.join(LLM_API, 'OpenAI_batches', "results")
MISTRAL_BATCH_RESULTS_DIR = os.path.join(LLM_API, 'Mistral_batches', 'results')

# Seed passed to the providers that accept one (reproducibility)
SEED = 94032

# Upper bound of request lines per batch file (a safeguard only; one model's
# requests are not expected to exceed it)
MAX_LINES_PER_BATCH = 50000


def _read_api_key(filename):
    """Return the whitespace-stripped content of a key file inside API_KEY_DIR."""
    with open(os.path.join(API_KEY_DIR, filename), encoding="utf-8") as key_file:
        return key_file.read().strip()


# Load API keys
ANTHROPIC_API_KEY = _read_api_key("anthropic_api_key_ai-measurement.txt")
MISTRAL_API_KEY = _read_api_key("mistral_api_key_ai-measurement.txt")
OPENAI_API_KEY = _read_api_key("openai_api_key_ai-measurement.txt")

# One API client per provider
MISTRAL_CLIENT = Mistral(api_key=MISTRAL_API_KEY)
OPENAI_CLIENT = OpenAI(api_key=OPENAI_API_KEY)
ANTHROPIC_CLIENT = Anthropic(api_key=ANTHROPIC_API_KEY)

### Models for API requests
# OpenAI GPT models (default set); one batch file is generated per entry
GPT_MODELS = [
    "gpt-3.5-turbo-0125",
    "gpt-4o-mini-2024-07-18",
    "gpt-4o-2024-08-06",
    "gpt-4.1-2025-04-14",
    "gpt-4.1-mini-2025-04-14",
    "gpt-4.1-nano-2025-04-14",
    "o3-2025-04-16",
    "o4-mini-2025-04-16",
]

# Subset of GPT_MODELS (the "o-series") whose requests carry no temperature;
# they receive a `reasoning_effort` setting instead.
GPT_REASONING_MODELS = [
    "o3-2025-04-16",
    "o4-mini-2025-04-16",
]

# Text file listing the fine-tuned OpenAI model identifiers, one per line
# (produced by notebook 4-2, which has to be run first)
GPT_FT_MODELS_PATH = os.path.join(LLM_API, "fine-tuned_models", "GPT_fine-tuned.txt")

# Mistral models (default set)
MISTRAL_MODELS = [
    "mistral-large-2411",
    "mistral-medium-2505",
    "mistral-small-2503",
    "open-mistral-nemo-2407",
    "ministral-8b-2410",
    "ministral-3b-2410",
]
# Text file listing the fine-tuned Mistral model identifiers, one per line
# (produced by notebook 4-2, which has to be run first)
MISTRAL_FT_MODELS_PATH = os.path.join(LLM_API, "fine-tuned_models", "MISTRAL_fine-tuned.txt")

# Anthropic Claude models (default set)
CLAUDE_MODELS = [
    "claude-sonnet-4-20250514",
    "claude-3-7-sonnet-20250219",
    "claude-opus-4-20250514",
    "claude-3-5-haiku-20241022",
    "claude-3-5-sonnet-20241022",
    "claude-3-haiku-20240307",
    "claude-3-opus-20240229",
]   # NOTE: reasoning models can easily get very expensive
###

# Prompt template files, keyed by the prompt name used in the request IDs
PROMPT_FILES = {
    "default":         "updates_prompt_default.txt",
    "few_shot":        "updates_prompt_few-shot.txt",
    "automatic_cot":   "updates_prompt_automatic-cot.txt",
    "manual_cot":      "updates_prompt_manual-cot.txt",
    "contrastive_cot": "updates_prompt_contrastive-cot.txt",
}

# Load every template as stripped text. Path.read_text opens and closes the
# file itself, so no file handle is left open by the comprehension.
PROMPTS = {
    key: Path(PROMPT_DIR, fname).read_text(encoding="utf-8").strip()
    for key, fname in PROMPT_FILES.items()
}

# Other settings
# Temperatures swept for the "default" prompt
TEMPERATURE_RANGE = [0, 0.5, 1.0, 1.5]
# Number of repetitions of every request
REPEATED_RUNS = 3

In [None]:
# Load validation data used to build the batch requests. Column dtypes are
# inferred by pandas here; the "Process results" section reloads the same
# file with `id` read as a string.
df = pd.read_csv(VAL_PATH)

## OpenAI Batches
We generate JSONL batch files for **default** and **fine-tuned** GPT models, upload them, and create 24 h batch-jobs via the OpenAI API.

### Batch Creation
We build one JSONL batch per GPT model. Run either the Default or the Fine-Tuned Models cell.

#### Default Models

In [None]:
# Make sure the output folder exists before any batch file is written
os.makedirs(OPENAI_BATCH_DIR, exist_ok=True)

# Extract (id, update text) once; every model/prompt/temperature/run reuses it.
# pandas reads empty CSV cells as NaN, a truthy float that `(value or '')`
# does not catch, so missing texts are mapped to '' explicitly.
update_rows = []
for _, row in df.iterrows():
    raw_text = row.get('whats_new', '')
    update_text = '' if pd.isna(raw_text) else str(raw_text).strip()
    update_rows.append((row['id'], update_text))

# Process for each model
for model in GPT_MODELS:
    is_reasoning = model in GPT_REASONING_MODELS
    # Sanitize model name for filenames
    model_safe = model.replace('/', '-')
    # Container for this model's request lines
    entries = []

    # Iterate over every prompt template
    for prompt_name, prompt_template in PROMPTS.items():
        # o-series requests carry no temperature; the other models get the
        # temperature sweep on the default prompt only
        if is_reasoning or prompt_name != 'default':
            temps = [None]
        else:
            temps = TEMPERATURE_RANGE

        for temp in temps:
            # Temperature tag in tenths; "no temperature" is tagged as 0
            temp_tag = int((temp or 0) * 10)
            for run in range(1, REPEATED_RUNS + 1):
                for row_id, update_text in update_rows:
                    # Compose user prompt
                    prompt = f"{prompt_template}\n\nApp update text: {update_text}"
                    # Unique ID encodes model/prompt/temp/run/row id
                    custom_id = f"{model}__{prompt_name}__t{temp_tag}__run{run}__id{row_id}"
                    body = {
                        'model': model,
                        'messages': [{'role': 'user', 'content': prompt}],
                        'max_completion_tokens': 1000,
                        'seed': SEED
                    }
                    if temp is not None:
                        body['temperature'] = temp
                    if is_reasoning:
                        body['reasoning_effort'] = 'low'

                    entries.append({
                        'custom_id': custom_id,
                        'method': 'POST',
                        'url': '/v1/chat/completions',
                        'body': body
                    })

    # Paginate entries to at most MAX_LINES_PER_BATCH lines per file
    total = len(entries)
    num_files = math.ceil(total / MAX_LINES_PER_BATCH)

    for part in range(num_files):
        start = part * MAX_LINES_PER_BATCH
        end = min(start + MAX_LINES_PER_BATCH, total)
        batch_lines = entries[start:end]

        # Files are numbered 01, 02, ... (a single file is numbered 01)
        batch_fname = f"openai_batch_{model_safe}_{part + 1:02d}.jsonl"
        batch_path = os.path.join(OPENAI_BATCH_DIR, batch_fname)

        with open(batch_path, 'w', encoding='utf-8') as fout:
            fout.writelines(json.dumps(entry, ensure_ascii=False) + '\n' for entry in batch_lines)

        print(f"Wrote {len(batch_lines)} entries for {model} to {batch_path} (lines {start + 1}–{end})")

#### Fine-Tuned Models
Note: Before running this code, run notebook 4-2 to generate fine-tuned models via fine-tuning API.

In [None]:
# Make sure the output folder exists before any batch file is written
os.makedirs(OPENAI_BATCH_DIR, exist_ok=True)

# Read every non-empty line as a model identifier
with open(GPT_FT_MODELS_PATH, encoding="utf-8") as f:
    FT_MODELS = [line.strip() for line in f if line.strip()]

# Extract (id, update text) once; every model/prompt/temperature/run reuses it.
# pandas reads empty CSV cells as NaN, a truthy float that `(value or "")`
# does not catch, so missing texts are mapped to "" explicitly.
update_rows = []
for _, row in df.iterrows():
    raw_text = row.get("whats_new", "")
    update_text = "" if pd.isna(raw_text) else str(raw_text).strip()
    update_rows.append((row["id"], update_text))

# Process for each fine-tuned model
for ft_model in FT_MODELS:
    # Parse out the "type-size" token (e.g. "3-5-turbo-equal-2000"),
    # i.e. the fourth colon-separated field of the identifier
    parts = ft_model.split(":")
    type_size = parts[3] if len(parts) > 3 else ""
    is_var_model = "2000" in type_size

    # Models tagged "2000" get every prompt (with a temperature sweep on the
    # default prompt); all other models get the default prompt only, without
    # a temperature setting.
    if is_var_model:
        prompt_items = list(PROMPTS.items())
    else:
        prompt_items = [("default", PROMPTS["default"])]

    id_prefix = ft_model.replace('/', '-')
    # Container for this model's request lines
    entries = []

    for prompt_name, prompt_template in prompt_items:
        sweep = is_var_model and prompt_name == "default"
        temps = TEMPERATURE_RANGE if sweep else [None]
        for temp in temps:
            # Temperature tag in tenths; "no temperature" is tagged as 0
            temp_tag = int((temp or 0) * 10)
            for run in range(1, REPEATED_RUNS + 1):
                for row_id, update_text in update_rows:
                    # Compose user prompt
                    prompt = f"{prompt_template}\n\nApp update text: {update_text}"
                    # Unique ID encodes model/prompt/temp/run/row id
                    custom_id = (
                        f"{id_prefix}__{prompt_name}__t{temp_tag}"
                        f"__run{run}__id{row_id}"
                    )
                    body = {
                        "model": ft_model,
                        "messages": [{"role": "user", "content": prompt}],
                        "max_completion_tokens": 1000,
                        "seed": SEED,
                    }
                    if temp is not None:
                        body["temperature"] = temp
                    entries.append({
                        "custom_id": custom_id,
                        "method": "POST",
                        "url": "/v1/chat/completions",
                        "body": body
                    })

    # Paginate entries to at most MAX_LINES_PER_BATCH lines per file
    total = len(entries)
    num_files = math.ceil(total / MAX_LINES_PER_BATCH)
    # Sanitize model name for filenames
    model_safe = re.sub(r'[:/\\\s]+', '-', ft_model)
    for part in range(num_files):
        start = part * MAX_LINES_PER_BATCH
        end = min(start + MAX_LINES_PER_BATCH, total)
        batch_lines = entries[start:end]

        # Files are numbered 01, 02, ... (a single file is numbered 01)
        batch_fname = f"openai_batch_{model_safe}_{part + 1:02d}.jsonl"
        batch_path = os.path.join(OPENAI_BATCH_DIR, batch_fname)

        with open(batch_path, "w", encoding="utf-8") as fout:
            fout.writelines(json.dumps(entry, ensure_ascii=False) + "\n" for entry in batch_lines)

        print(f"Wrote {len(batch_lines)} entries for {ft_model} to {batch_path}"
              f" (lines {start + 1}–{end})")

### Upload Batches
We upload every JSONL batch to OpenAI and capture file IDs. Run the batch creation cell for either the default or the fine-tuned models first.

In [None]:
# All generated OpenAI batch files, in sorted order
batch_pattern = os.path.join(OPENAI_BATCH_DIR, "openai_batch_*.jsonl")
batch_paths = sorted(glob.glob(batch_pattern))

# Upload the files one by one; the returned file IDs are kept for job creation
uploaded_file_ids = []
for path in batch_paths:
    print(f"Uploading {path}...")
    with open(path, 'rb') as batch_file:
        file_resp = OPENAI_CLIENT.files.create(purpose='batch', file=batch_file)
    print(f"Uploaded: id={file_resp.id}, filename={file_resp.filename}, bytes={file_resp.bytes}")
    uploaded_file_ids.append(file_resp.id)

print("All batch files uploaded.\n")

### Create Batch Jobs
We create a 24 h chat-completion batch job for each uploaded file.

In [None]:
# One chat-completion batch job (24h completion window) per uploaded file
for file_id in uploaded_file_ids:
    job_metadata = {"description": f"eval job for {file_id}"}
    batch = OPENAI_CLIENT.batches.create(
        input_file_id=file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata=job_metadata,
    )
    print(f"Created batch {batch.id} for {file_id}, status: {batch.status}")

## Mistral Batches
We generate JSONL batch files for default and fine-tuned Mistral models, upload them, and create batch-jobs via the Mistral API.

### Batch Creation
We build one JSONL batch per Mistral model. Run either Default or Fine-Tuned Models.

#### Default Models

In [None]:
# Make sure the output folder exists before any batch file is written
os.makedirs(MISTRAL_BATCH_DIR, exist_ok=True)

# Extract (id, update text) once; every model/prompt/temperature/run reuses it.
# pandas reads empty CSV cells as NaN, a truthy float that `(value or '')`
# does not catch, so missing texts are mapped to '' explicitly.
update_rows = []
for _, row in df.iterrows():
    raw_text = row.get('whats_new', '')
    update_text = '' if pd.isna(raw_text) else str(raw_text).strip()
    update_rows.append((row['id'], update_text))

# Process for each model
for model in MISTRAL_MODELS:
    # Container for this model's request lines. The request body carries no
    # model name; the model is passed when the batch job is created.
    entries = []
    for prompt_name, prompt_template in PROMPTS.items():
        # Only the "default" prompt gets a temperature sweep
        temps = TEMPERATURE_RANGE if prompt_name == 'default' else [None]
        for temp in temps:
            # Temperature tag in tenths; "no temperature" is tagged as 0
            temp_tag = int((temp or 0) * 10)
            for run in range(1, REPEATED_RUNS + 1):
                for row_id, update_text in update_rows:
                    # Compose user prompt
                    prompt = f"{prompt_template}\n\nApp update text: {update_text}"
                    # Unique ID encodes model/prompt/temp/run/row id
                    custom_id = f"{model}__{prompt_name}__t{temp_tag}__run{run}__id{row_id}"
                    body = {
                        "max_tokens": 1000,
                        "messages": [{"role": "user", "content": prompt}],
                    }
                    if temp is not None:
                        body["temperature"] = temp
                    body["random_seed"] = SEED
                    entries.append({
                        "custom_id": custom_id,
                        "body": body
                    })

    # Paginate entries to at most MAX_LINES_PER_BATCH lines per file
    total = len(entries)
    num_files = math.ceil(total / MAX_LINES_PER_BATCH)

    for part in range(num_files):
        start = part * MAX_LINES_PER_BATCH
        end = min(start + MAX_LINES_PER_BATCH, total)
        batch_lines = entries[start:end]

        # Sanitized model name plus a two-digit file number (01, 02, ...)
        fname = f"mistral_batch_{model.replace('/', '-')}_{part + 1:02d}.jsonl"
        path = os.path.join(MISTRAL_BATCH_DIR, fname)

        with open(path, 'w', encoding='utf-8') as fout:
            fout.writelines(json.dumps(entry, ensure_ascii=False) + "\n" for entry in batch_lines)
        print(f"Wrote {len(batch_lines)} entries for {model} to {path}")

#### Fine-Tuned Models
Note: Before running this code, execute notebook 4-2 to generate fine-tuned models via fine-tuning API.

In [None]:
# Make sure the output folder exists before any batch file is written
os.makedirs(MISTRAL_BATCH_DIR, exist_ok=True)

# Read every non-empty line as a model identifier
with open(MISTRAL_FT_MODELS_PATH, encoding="utf-8") as f:
    raw_ft = [line.strip() for line in f if line.strip()]

# Extract (id, update text) once; every model/prompt/temperature/run reuses it.
# pandas reads empty CSV cells as NaN, a truthy float that `(value or "")`
# does not catch, so missing texts are mapped to "" explicitly.
update_rows = []
for _, row in df.iterrows():
    raw_text = row.get("whats_new", "")
    update_text = "" if pd.isna(raw_text) else str(raw_text).strip()
    update_rows.append((row["id"], update_text))

# Process for each fine-tuned model
for ft_full in raw_ft:
    # An identifier needs at least five colon-separated fields
    parts = ft_full.split(":")
    if len(parts) < 5:
        raise ValueError(f"Bad line in MISTRAL_fine-tuned.txt: {ft_full}")
    # The training-data tag is searched in every field from the fifth onward.
    # NOTE(review): the job-creation cell rebuilds the model name from five
    # fields (ft:<base>:<rev>:<date>:<sha>), which suggests the tag is a sixth
    # field; looking at parts[4] alone would then never match. Confirm against
    # the layout of MISTRAL_fine-tuned.txt.
    suffix = ":".join(parts[4:])
    is_var = "real_2000" in suffix
    # Sanitize model name for filenames and request IDs
    safe_ft = re.sub(r"[:/\\\s]+", "-", ft_full)

    # Models tagged "real_2000" get every prompt (with a temperature sweep on
    # the default prompt); all other models get the default prompt only,
    # without a temperature setting.
    if is_var:
        prompt_items = list(PROMPTS.items())
    else:
        prompt_items = [("default", PROMPTS["default"])]

    # Container for this model's request lines
    entries = []
    for prompt_name, prompt_template in prompt_items:
        sweep = is_var and prompt_name == "default"
        temps = TEMPERATURE_RANGE if sweep else [None]
        for temp in temps:
            # Temperature tag in tenths; "no temperature" is tagged as 0
            temp_tag = int((temp or 0) * 10)
            for run in range(1, REPEATED_RUNS + 1):
                for row_id, txt in update_rows:
                    # Compose user prompt
                    prompt = f"{prompt_template}\n\nApp update text: {txt}"
                    # Unique ID encodes model/prompt/temp/run/row id
                    cid = f"{safe_ft}__{prompt_name}__t{temp_tag}__run{run}__id{row_id}"
                    body = {
                        "max_tokens": 1000,
                        "messages": [{"role": "user", "content": prompt}],
                        "random_seed": SEED,
                    }
                    if temp is not None:
                        body["temperature"] = temp
                    entries.append({"custom_id": cid, "body": body})

    # Paginate entries to at most MAX_LINES_PER_BATCH lines per file
    total = len(entries)
    num_files = math.ceil(total / MAX_LINES_PER_BATCH)
    for part in range(num_files):
        chunk = entries[part * MAX_LINES_PER_BATCH:(part + 1) * MAX_LINES_PER_BATCH]

        # Files are numbered 01, 02, ... (a single file is numbered 01)
        fname = f"mistral_batch_{safe_ft}_{part + 1:02d}.jsonl"
        path = os.path.join(MISTRAL_BATCH_DIR, fname)

        with open(path, "w", encoding="utf-8") as fout:
            fout.writelines(json.dumps(e, ensure_ascii=False) + "\n" for e in chunk)

        print(f"Wrote {len(chunk)} entries for {ft_full} → {path}")

### Upload Batches
We upload every JSONL batch to Mistral and capture file IDs. Run the batch creation cell for either the default or the fine-tuned models first.

In [None]:
# All generated Mistral batch files, in sorted order
batch_pattern = os.path.join(MISTRAL_BATCH_DIR, "mistral_batch_*.jsonl")
batch_paths = sorted(glob.glob(batch_pattern))

# Upload the files one by one and remember which model each file belongs to
uploaded = []
for path in batch_paths:
    fname = os.path.basename(path)
    # File names look like mistral_batch_<model>_<nn>.jsonl; the model is the
    # third underscore-separated token (a model name containing an underscore
    # is cut at its first underscore)
    model = fname.split("_")[2]

    print(f"Uploading {fname} (model={model})…")
    with open(path, 'rb') as batch_file:
        up = MISTRAL_CLIENT.files.upload(
            purpose="batch",
            file={"file_name": fname, "content": batch_file},
        )
    print(f"Uploaded: id={up.id}")
    uploaded.append({"model": model, "file_id": up.id})

print("All Mistral batch files uploaded.\n")

### Create Batch Jobs
We create a batch job for each uploaded file.

In [None]:
# One batch job per uploaded file
for item in uploaded:
    file_id, model = item["file_id"], item["model"]

    if not model.startswith("ft-"):
        # Default models are passed through unchanged
        real_model = model
    else:
        # Fine-tuned models: rebuild the colon-separated name from the
        # hyphen-separated, filename-safe form, e.g.
        # "ft-mistral-small-latest-d1ef7e20-20250527-314de464-..." becomes
        # "ft:mistral-small-latest:d1ef7e20:20250527:314de464".
        # The base model name is taken to be exactly three tokens long.
        tokens = model.split("-")
        base_name = "-".join(tokens[1:4])
        real_model = ":".join([tokens[0], base_name, tokens[4], tokens[5], tokens[6]])

    # Submit batch job to Mistral API
    job = MISTRAL_CLIENT.batch.jobs.create(
        input_files=[file_id],
        model=real_model,
        endpoint="/v1/chat/completions",
        metadata={"description": f"classification batch for {real_model}"},
    )
    print(f"Created batch job {job.id} for file {file_id} (model={real_model}), status={job.status}")

## Anthropic Batches
We create batch requests for Claude models and process JSONL results into CSV format.

### Batch Requests
We build batch requests for each Claude model with various prompts and temperature settings.

In [None]:
# Model names are shortened because custom_id is limited to 64 characters
def short_model_name(full_model: str) -> str:
    """
    Return a Claude model name without its first and last hyphen-separated token.

    e.g. "claude-3-7-sonnet-20250219" -> "3-7-sonnet"
    Names with fewer than three tokens yield an empty string.
    """
    # Cut everything up to and including the first hyphen ("claude-")
    _, _, without_first = full_model.partition("-")
    # Cut everything from the last remaining hyphen onward ("-<date>")
    middle, _, _ = without_first.rpartition("-")
    return middle

In [None]:
# Extract (id, update text) once; every model/prompt/temperature/run reuses it.
# pandas reads empty CSV cells as NaN, a truthy float that `(value or "")`
# does not catch, so missing texts are mapped to "" explicitly.
update_rows = []
for _, row in df.iterrows():
    raw_text = row.get("whats_new", "")
    update_text = "" if pd.isna(raw_text) else str(raw_text).strip()
    update_rows.append((row["id"], update_text))

# Process for each model
for model in CLAUDE_MODELS:
    short_model = short_model_name(model)
    batch_requests = []

    # Iterate through all prompt templates
    for prompt_name, prompt_template in PROMPTS.items():
        # Only the "default" prompt gets a temperature sweep;
        # temperatures above 1.0 are left out for Claude
        base_temps = TEMPERATURE_RANGE if prompt_name == "default" else [None]
        temps = [t for t in base_temps if t is None or t <= 1.0]

        for temp in temps:
            # Temperature tag in tenths; "no temperature" is tagged as 0
            temp_tag = int((temp or 0) * 10)
            for run in range(1, REPEATED_RUNS + 1):
                for row_id, update_text in update_rows:
                    # Compose user prompt
                    prompt = f"{prompt_template}\n\nApp update text: {update_text}"

                    # Unique ID encodes model/prompt/temp/run/row id
                    custom_id = f"{short_model}_{prompt_name}_t{temp_tag}_r{run}_i{row_id}"
                    # An over-long ID cannot simply be cut: truncation removes
                    # the row id at the end, so results could no longer be
                    # mapped back and IDs would collide. Fail fast instead.
                    if len(custom_id) > 64:
                        raise ValueError(
                            f"custom_id exceeds 64 characters and cannot be shortened safely: {custom_id}"
                        )

                    # Build request parameters (a missing temperature is sent as 0)
                    params = MessageCreateParamsNonStreaming(
                        model=model,
                        max_tokens=1000,
                        temperature=(temp or 0),
                        messages=[{"role": "user", "content": prompt}],
                    )
                    batch_requests.append(Request(custom_id=custom_id, params=params))

    # Submit the complete batch for model
    message_batch = ANTHROPIC_CLIENT.messages.batches.create(
        requests=batch_requests
    )
    print(
        f"Created batch for {model} "
        f"with {len(batch_requests)} requests: "
        f"{message_batch.id} (status={message_batch.processing_status})"
    )

## Process results
We process JSONL result files and convert them to CSV format for analysis. Note: Download the batch result files from each provider's developer platform before running these cells.

In [None]:
# Reload the validation data with `id` kept as a string and stripped of
# surrounding whitespace, so it can be matched against the IDs in the results
df = pd.read_csv(VAL_PATH, dtype={'id': str}).assign(
    id=lambda frame: frame['id'].str.strip()
)

### Helper Functions
Utility functions for ID normalization and record parsing across different LLM providers.

#### Helper: normalize ID string by removing 'id' prefix

In [None]:
def normalize_id_str(id_part):
    """Return the row ID from an ID component such as 'id123'.

    Surrounding whitespace and a single leading 'id' prefix are removed.
    Any further occurrence of 'id' is part of the ID itself and is kept.
    """
    cleaned = id_part.strip()
    if cleaned.startswith('id'):
        cleaned = cleaned[2:]
    return cleaned.strip()

#### Provider-Specific Parsers
Each LLM provider has different JSONL response formats requiring specialized parsing.

In [None]:
def parse_openai_record(record):
    """Turn one OpenAI batch result record into (id, column name, content).

    The custom_id is expected to hold at least five '__'-separated fields:
    model, prompt type, temperature tag, run and row id. Returns None when
    there are fewer fields or when the response carries no message content.
    """
    fields = record.get('custom_id', '').split('__')
    if len(fields) < 5:
        return None
    model, prompt_type, temp, run, id_part = fields[:5]
    id_str = normalize_id_str(id_part)

    # Column name: the first four fields, with ':', '-' and '.' turned into '_'
    col = "__".join((model, prompt_type, temp, run))
    for char in (':', '-', '.'):
        col = col.replace(char, '_')

    # The answer text sits in the first choice of the response body
    try:
        first_choice = record['response']['body']['choices'][0]
        content = first_choice['message']['content']
    except Exception:
        return None
    return id_str, col, content

def parse_mistral_record(record):
    """Turn one Mistral batch result record into (id, column name, content).

    The custom_id is expected to hold at least five '__'-separated fields:
    model, prompt type, temperature tag, run and row id. Returns None when
    there are fewer fields or when the response carries no message content.
    """
    pieces = record.get('custom_id', '').split('__')
    if len(pieces) < 5:
        return None
    model, prompt_type, temp, run, id_part = pieces[:5]
    id_str = normalize_id_str(id_part)

    # Column name: the first four fields, with ':', '-' and '.' mapped to '_'
    raw_col = f"{model}__{prompt_type}__{temp}__{run}"
    col = raw_col.translate(str.maketrans(':-.', '___'))

    # The answer text sits in the first choice of the response body
    try:
        choices = record['response']['body']['choices']
        content = choices[0]['message']['content']
    except Exception:
        return None
    return id_str, col, content

def parse_anthropic_record(record):
    """Parse one Anthropic batch result record.

    The custom_id is split on single underscores. The first field is the
    model, the last three are temperature tag, run and row id (prefixed with
    the marker 'i'), and everything in between is the prompt type, which may
    itself contain underscores.

    Returns None when the custom_id has fewer than five fields, otherwise
    (id_str, column name, content). content is '' when the record holds no
    message text.
    """
    parts = record.get('custom_id', '').split('_')
    if len(parts) < 5:
        return None

    model = parts[0]
    prompt_type = '_'.join(parts[1:-3])
    temp, run, id_part = parts[-3:]

    # Remove exactly one leading 'i' marker. Stripping every leading 'i', or
    # every 'id' substring, would corrupt IDs that contain those characters.
    if id_part.startswith('i'):
        id_part = id_part[1:]
    id_str = id_part.strip()

    # Create standardized column name
    col = f"{model}__{prompt_type}__{temp}__{run}".replace(':','_').replace('-','_').replace('.','_')

    # Extract response content; a missing or null result/message yields ''
    msg = (record.get('result') or {}).get('message') or {}
    try:
        content = msg['content'][0]['text']
    except (KeyError, IndexError, TypeError):
        content = ''
    return id_str, col, content

#### File Processing Functions
Core functions for processing JSONL files and parallel execution.

In [None]:
def process_file(path, parser):
    """Read one JSONL result file and group the parsed contents.

    Every line is decoded as JSON and handed to `parser`. Lines that are not
    valid JSON and records for which the parser returns a falsy value are
    skipped. Returns a dict mapping column name -> {id: content}.
    """
    print(f"Processing {path.name}")
    columns = {}
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                # Malformed line: ignore it
                continue

            result = parser(record)
            if not result:
                continue

            id_str, col, content = result
            # One inner dict per column, keyed by row ID
            if col not in columns:
                columns[col] = {}
            columns[col][id_str] = content

    print(f"  Mapped {len(columns)} columns from {path.name}")
    return columns

def process_results_parallel(directory, parser, df, workers=10):
    """Parse all JSONL files below `directory` and add their contents to `df`.

    Files are found recursively, parsed by `process_file` in a thread pool and
    merged into one mapping of column name -> {id: content}. One column per
    mapping key is then written into `df` (in place) by looking up `df['id']`.

    Files are handled in sorted path order and merged in that same order, so
    the resulting column order, and the winner when two files provide the same
    column/id pair (the later path), do not depend on thread timing.

    Args:
        directory: folder that is searched recursively for '*.jsonl' files.
        parser: callable turning one decoded record into
            (id, column name, content) or a falsy value.
        df: dataframe with an 'id' column; receives the new columns.
        workers: maximum number of worker threads.

    Returns:
        A copy of `df` including the added columns.
    """
    # Find all JSONL files in directory and subdirectories (stable order)
    files = sorted(Path(directory).rglob('*.jsonl'))
    print(f"Found {len(files)} files in {directory}")

    combined = {}

    # Parse files in parallel; executor.map yields results in input order
    with ThreadPoolExecutor(max_workers=workers) as executor:
        for file_map in executor.map(lambda f: process_file(f, parser), files):
            # Merge results from each file
            for col, id_map in file_map.items():
                combined.setdefault(col, {}).update(id_map)

    # Add new columns to dataframe
    for col, id_map in combined.items():
        df[col] = df['id'].map(id_map)

    return df.copy()

#### Process All Provider Results
Execute processing for each LLM provider in sequence.

In [None]:
# Merge the results of every provider into the validation dataframe, one
# provider after the other (result folder, matching record parser)
provider_sources = (
    (OPENAI_BATCH_RESULTS_DIR, parse_openai_record),
    (MISTRAL_BATCH_RESULTS_DIR, parse_mistral_record),
    (ANTHROPIC_BATCH_DIR, parse_anthropic_record),
)
for results_dir, record_parser in provider_sources:
    df = process_results_parallel(results_dir, record_parser, df)

In [None]:
# Write the validation data together with all raw model outputs to CSV
preds_csv_path = os.path.join(DEMO_PATH, 'output_data', 'validation_with_model_preds_LLM.csv')
df.to_csv(preds_csv_path, index=False)

## Clean Results
Standardize raw model outputs into valid single-label strings (1–7).
Cleans and validates outputs from LLMs or API responses, ensuring only acceptable class labels are retained.

### Cleaning Prep and Helper Function

In [None]:
# Input and metadata columns; every other column is treated as a model
# prediction column and gets its labels cleaned
EXCLUDE_COLS = [
    'release_date',
    'version_display',
    'whats_new',
    'body',
    'update_classification',
    'id',
    'app_id',
    'previous_version',
    'previous_release_date',
    'previous_id',
]

def extract_single_label(text):
    """
    Normalize prediction output into a single class label string (1–7).
    Returns '0' as a fallback for invalid predictions.

    Handles:
    - Integers or floats like 3, 3.0 (NaN, fractions and out-of-range values give '0')
    - Strings like '3', ' (3) ', or even '3 ; 4' (keeps only the last valid single label)

    Only standalone digits count as labels: a digit that is part of a longer
    number ('10', '2024', '75%') is ignored, so string input is treated the
    same way as numeric input, where 10 is invalid.
    """
    if isinstance(text, float):
        if pd.isna(text):
            return '0'  # fallback
        if text.is_integer() and 1 <= int(text) <= 7:
            return str(int(text))
        return '0'

    if isinstance(text, int):
        return str(text) if 1 <= text <= 7 else '0'

    # Everything else is parsed as text: take the last digit 1–7 that has no
    # digit directly before or after it
    matches = re.findall(r'(?<!\d)[1-7](?!\d)', str(text))
    return matches[-1] if matches else '0'

### Load Model Output File and Apply Cleaning

In [None]:
# Read the raw model outputs written by the results-processing step
raw_preds_path = os.path.join(DEMO_PATH, 'output_data', 'validation_with_model_preds_LLM.csv')
df = pd.read_csv(raw_preds_path)

In [None]:
# Every column that is not an input/metadata column holds model predictions
metadata_cols = set(EXCLUDE_COLS)
model_cols = [name for name in df.columns if name not in metadata_cols]

# Reduce each prediction column to single label strings
for name in model_cols:
    df[name] = df[name].apply(extract_single_label)

# Write the cleaned predictions next to the raw ones
save_path = os.path.join(DEMO_PATH, 'output_data', 'validation_with_model_preds_LLM_cleaned.csv')
df.to_csv(save_path, index=False)