## Setup
In this notebook, we compute the same evaluation metrics as in demonstration study 2
for a small set of LLM prediction columns (frontier models only).

#### Imports

In [None]:
import os
import re
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score

#### Paths

In [None]:
# Project root is the parent of the current working directory; every other
# location is derived from it.
DEMO_PATH = os.path.abspath("..")
OUT_DIR = os.path.join(DEMO_PATH, "output_data")
VISUAL_DIR = os.path.join(DEMO_PATH, "paper_visuals")
# Input table read by the loading cell further down.
LLM_PATH = os.path.join(OUT_DIR, "02_validation_with_model_preds_LLM_cleaned.csv")

## Pre-Processing
In this section, we define regular expressions and helper functions for parsing model output columns. This includes extracting metadata, standardizing column names, and processing LLM results. These routines are used to prepare the results tables for the online appendix.

In [None]:
# Default length of the multi-hot label vectors (used as the default
# `num_labels` of the parsing helpers).
NUM_LABELS = 9
# Matches run columns named "<prefix>__run<N>" or "<prefix>__r<N>"; the named
# groups `prefix` and `run` expose the part before the run suffix and the
# run number.
RUN_COL_RE = re.compile(r"^(?P<prefix>.+?)__(?:run|r)(?P<run>\d+)$")

In [None]:
def standardize_llm_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df` whose column names are normalized.

    Two rewrites are applied to every column name, in this order:
    - a trailing '__review' is dropped
    - a single underscore in front of a temperature token is doubled,
      e.g. '_t0' -> '__t0' (an existing '__t0' is left alone)

    The input frame is not modified.
    """
    def _normalize(name):
        without_suffix = re.sub(r'__review$', '', name)
        # The look-behind keeps an already-doubled underscore from growing.
        return re.sub(r'(?<!_)_(t\d+)', r'__\1', without_suffix)

    renamed = df.copy()
    renamed.columns = list(map(_normalize, renamed.columns))
    return renamed


def parse_model_pred(val, num_labels=NUM_LABELS):
    """
    Parse a prediction cell into a multi-hot vector of length num_labels.

    Expected format: "0;3;7" (semicolon-separated label ids); spaces are
    ignored. Empty, 'nan', 'None', or '9' cells are all-negative.

    Tokens are handled as follows:
    - an ASCII integer inside [0, num_labels) sets that position to 1
    - an integral float such as "3.0" counts as label 3; a column that
      holds only single numeric ids plus missing values can be read back
      as floats, which would otherwise turn every prediction into
      all-negative without any warning
    - anything else (out-of-range ids, fractional numbers, text) is skipped
    """
    text = str(val).strip()
    if text in ("", "9", "nan", "None"):
        return [0] * num_labels

    vec = [0] * num_labels
    for tok in text.replace(" ", "").split(";"):
        whole, dot, frac = tok.partition(".")
        # Accept only an all-zero fractional part ("3.0", "3.00"); reject
        # "3.", "3.5" and multi-dot tokens.
        if dot and (not frac or frac.strip("0")):
            continue
        # isascii() guards against characters such as "²", for which
        # isdigit() is true but int() raises ValueError.
        if whole.isascii() and whole.isdigit():
            k = int(whole)
            if 0 <= k < num_labels:
                vec[k] = 1
    return vec


def parse_multi_hot(val, num_labels=NUM_LABELS):
    """
    Convert a ground-truth cell into a list of exactly num_labels ints.

    - list / numpy array: each element is cast to int
    - str: every single digit character found in the text becomes one entry
    - anything else: all zeros

    Results shorter than num_labels are right-padded with zeros; longer
    ones are truncated.
    """
    if isinstance(val, (list, np.ndarray)):
        values = [int(item) for item in val]
    elif isinstance(val, str):
        values = [int(ch) for ch in re.findall(r"\d", val)]
    else:
        return [0] * num_labels

    padded = values + [0] * num_labels
    return padded[:num_labels]


def multilabel_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """
    Compute multilabel scores for two (n_samples, num_labels) binary matrices.

    The returned dict holds, in this order: one 'f1_<i>' entry per label
    column, 'f1_macro', 'f1_micro', 'f1_weighted', 'subset_acc' (share of
    rows predicted exactly right) and 'hamming_loss' (share of individual
    cells that differ). All values are plain Python floats.
    """
    label_scores = f1_score(y_true, y_pred, average=None, zero_division=0)

    metrics = {}
    for idx in range(y_true.shape[1]):
        metrics[f"f1_{idx}"] = float(label_scores[idx])

    for key, averaging in (
        ("f1_macro", "macro"),
        ("f1_micro", "micro"),
        ("f1_weighted", "weighted"),
    ):
        metrics[key] = float(
            f1_score(y_true, y_pred, average=averaging, zero_division=0)
        )

    rows_exact = (y_true == y_pred).all(axis=1)
    metrics["subset_acc"] = float(np.mean(rows_exact))

    cells_wrong = y_true != y_pred
    metrics["hamming_loss"] = float(np.mean(cells_wrong))
    return metrics


def consistency_across_runs(df: pd.DataFrame, cols: list) -> float:
    """
    Share of rows for which every run column yields the same multi-hot vector.

    Each column in `cols` is parsed with parse_model_pred and all runs are
    compared against the first one. Returns NaN when fewer than two columns
    are given.
    """
    if len(cols) < 2:
        return np.nan

    per_run = []
    for col in cols:
        per_run.append(np.vstack(df[col].apply(parse_model_pred).values))  # (n, L)
    stacked = np.stack(per_run, axis=1)  # (n, runs, L)

    # Broadcast the first run against all runs, then require agreement over
    # both the run axis and the label axis.
    first_run = stacked[:, :1, :]
    rows_agree = (stacked == first_run).all(axis=(1, 2))
    return float(np.mean(rows_agree))


def parse_prefix(prefix: str) -> dict:
    """
    Parse a column prefix of the form model__prompt__tX.

    Returns a dict with:
    - model: first '__'-separated part, unchanged
    - provider: 'OpenAI', 'Mistral', 'Anthropic' or None, guessed from the
      lower-cased model name
    - prompt_type: second part, or None when absent
    - temperature: for a third part 't<N>' the value N / 10 (e.g. 't7' ->
      0.7); None when the part is absent or N is not an integer
    """
    parts = prefix.split("__")  # str.split always yields at least one part
    model_raw = parts[0]
    prompt_type = parts[1] if len(parts) >= 2 else None
    tpart = parts[2] if len(parts) >= 3 else None

    temperature = None
    if tpart is not None and tpart.startswith("t"):
        try:
            temperature = int(tpart[1:]) / 10.0
        except ValueError:
            # Not a numeric temperature token (e.g. 'text'); leave unset.
            temperature = None

    m = model_raw.lower()
    # Only 'o' followed by a digit (o1, o3, o4-mini, ...) counts as OpenAI.
    # A bare startswith("o") also captured names such as 'open-mistral-7b'
    # or 'opus-...', hiding them from the branches below.
    if m.startswith("gpt") or re.match(r"o\d", m):
        provider = "OpenAI"
    elif any(x in m for x in ("mistral", "ministral", "mixtral")):
        provider = "Mistral"
    elif any(x in m for x in ("haiku", "opus", "sonnet")):
        provider = "Anthropic"
    else:
        provider = None

    return dict(model=model_raw, provider=provider, prompt_type=prompt_type, temperature=temperature)


def _run_num(colname: str) -> int:
    """Return the run number encoded in `colname`, or 10**9 if it is not a run column."""
    match = RUN_COL_RE.match(colname)
    if match is None:
        # Large sentinel so non-run names sort after every real run.
        return 10**9
    return int(match.group("run"))


## Main function for compiling results DataFrame for all models

In [None]:
def build_llm_multilabel_results(df: pd.DataFrame, truth_col="multi_hot") -> pd.DataFrame:
    """
    Builds a tidy results table for LLM multilabel predictions.

    Columns matching RUN_COL_RE are grouped by their prefix; each prefix
    yields one output row. Metrics use a representative run (run1 if
    present, else the smallest run number); consistency uses all runs of
    the prefix.

    Parameters
    ----------
    df : DataFrame holding the ground-truth column and the run columns.
    truth_col : name of the ground-truth column (parsed with parse_multi_hot).

    Returns
    -------
    DataFrame sorted by provider, model, prompt_type and temperature
    (missing values last). If `df` contains no run columns, an empty
    DataFrame is returned instead of failing on the sort keys.

    Raises
    ------
    KeyError if `truth_col` is not a column of `df`.
    """
    # Group run columns by prefix with a single regex match per column.
    prefix_to_cols = {}
    for c in df.columns:
        m = RUN_COL_RE.match(c)
        if m:
            prefix_to_cols.setdefault(m.group("prefix"), []).append(c)

    # Ground truth
    y_true = np.vstack(df[truth_col].apply(parse_multi_hot).values).astype(int)

    records = []
    for prefix, cols in prefix_to_cols.items():
        cols_sorted = sorted(cols, key=_run_num)
        # Prefer an explicit run 1; otherwise fall back to the lowest run.
        run1 = next(
            (c for c in cols_sorted if c.endswith(("__run1", "__r1"))),
            cols_sorted[0],
        )

        meta = parse_prefix(prefix)

        # Representative predictions
        y_pred = np.vstack(df[run1].apply(parse_model_pred).values).astype(int)

        rec = {
            **meta,
            "prefix": prefix,
            "run_used": run1,
            "n_runs_available": len(cols_sorted),
            **multilabel_metrics(y_true, y_pred),
            "consistency_all_runs": consistency_across_runs(df, cols_sorted),
        }
        records.append(rec)

    out = pd.DataFrame.from_records(records)
    if out.empty:
        # No run columns: the frame has no 'provider' etc. to sort on, and
        # sort_values would raise an unhelpful KeyError.
        return out

    return out.sort_values(
        ["provider", "model", "prompt_type", "temperature"], na_position="last"
    )


## Load Data + Compile Outputs
Loads the LLM file, standardizes column names, computes results, and writes CSV output.

In [None]:
# Read the LLM prediction table; low_memory=False makes pandas parse the file
# in one pass rather than in chunks.
df_raw = pd.read_csv(LLM_PATH, low_memory=False)
# Normalize column names so the run-column regex can group them by prefix.
df = standardize_llm_columns(df_raw)

# If your truth column is not named "multi_hot", change truth_col=... below.
results_df = build_llm_multilabel_results(df, truth_col="multi_hot")

# Persist the full results table (one row per prefix) without the index.
out_csv = os.path.join(OUT_DIR, "performance_reviews_label-specific_frontier.csv")
results_df.to_csv(out_csv, index=False)

## Tables
In this section, we generate Table E9 for the online appendix, based on the processed results.

#### Output

In [None]:
# Column groups of the exported table: identifying metadata, one F1 column
# per label, and the summary metrics.
BASE_COLS = ["model", "provider", "prompt_type", "temperature", "run_used", "n_runs_available"]
F1_COLS   = [f"f1_{i}" for i in range(NUM_LABELS)]
SUM_COLS  = ["f1_macro", "f1_micro", "f1_weighted", "subset_acc", "hamming_loss", "consistency_all_runs"]

TABLE_COLS = BASE_COLS + F1_COLS + SUM_COLS
# Keep only columns that actually exist so a partial results table still exports.
TABLE_COLS = [c for c in TABLE_COLS if c in results_df.columns]

table_all = results_df[TABLE_COLS].copy()

# The visuals directory may be missing (e.g. on a fresh checkout); create it
# so the export does not fail on a nonexistent path.
os.makedirs(VISUAL_DIR, exist_ok=True)
out_xlsx = os.path.join(VISUAL_DIR, "table_F9-2_02.xlsx")
table_all.to_excel(out_xlsx, index=False)