## Setup
In this notebook, we compute the same evaluation metrics as in the demonstration studies for a small set of LLM prediction columns (frontier models) and create Table F9 for the Online Appendix.

#### Imports

In [None]:
import os
import re
import numpy as np
import pandas as pd

from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score, cohen_kappa_score
)
from statsmodels.stats.inter_rater import fleiss_kappa

#### Paths

In [None]:
# All locations are derived from the parent of the current working directory.
DEMO_PATH = os.path.abspath(os.pardir)

# Data produced by earlier steps and written by this notebook.
OUT_DIR = os.path.join(DEMO_PATH, "output_data")

# Destination for the exported tables.
VISUAL_DIR = os.path.join(DEMO_PATH, "paper_visuals")

# Input file holding the ground truth and the LLM prediction columns.
LLM_PATH = os.path.join(OUT_DIR, "01_validation_with_model_preds_LLM_temp_cleaned.csv")

## Pre-Processing
In this section, we define regular expressions and helper functions for parsing model output columns. This includes extracting metadata, standardizing column names, and processing LLM results. These routines are used to prepare the results for Table F9 for the Online Appendix.

In [None]:
# Column holding the ground-truth class label.
TRUTH_COL = "update_classification"

# The seven class labels, in ascending order.
LABELS_1_7 = [*range(1, 8)]

# Prediction columns look like "<prefix>__run<N>" or "<prefix>__r<N>";
# the named groups expose the prefix and the run number.
RUN_COL_RE = re.compile(r"^(?P<prefix>.+?)__(?:run|r)(?P<run>\d+)$")

In [None]:
def innovation_dummy(s: pd.Series) -> pd.Series:
    """
    Collapse class labels into a binary innovation indicator.

    A label of 2 or lower maps to 1, anything higher maps to 0. The result
    uses the nullable ``Int64`` dtype, so missing labels stay missing.
    """
    labels = s.astype("Int64")
    is_innovation = labels.le(2)
    return is_innovation.astype("Int64")


def compute_binary_metrics(y_true: pd.Series, y_pred: pd.Series) -> dict:
    """
    Score binary predictions against binary ground truth.

    Both inputs are cast to plain ``int`` first. Returns a dict with the keys
    ``f1``, ``precision``, ``recall`` and ``accuracy``; the first three use
    ``zero_division=0``.
    """
    truth = y_true.astype(int)
    guess = y_pred.astype(int)

    # The three scorers share one call signature, so drive them from a table.
    scorers = {"f1": f1_score, "precision": precision_score, "recall": recall_score}
    metrics = {
        name: scorer(truth, guess, average="binary", zero_division=0)
        for name, scorer in scorers.items()
    }
    metrics["accuracy"] = accuracy_score(truth, guess)
    return metrics


def compute_multiclass_metrics(y_true: pd.Series, y_pred: pd.Series, labels=LABELS_1_7) -> dict:
    """
    Score multiclass predictions against ground truth.

    Both inputs are cast to plain ``int`` first. The returned dict holds one
    ``f1_<label>`` entry per element of ``labels`` (in that order), followed
    by ``f1_macro`` and ``f1_weighted``. All values are Python floats and all
    scores use ``zero_division=0``.
    """
    truth = y_true.astype(int)
    guess = y_pred.astype(int)

    def _f1(average):
        # Single place for the arguments shared by the three F1 variants.
        return f1_score(truth, guess, labels=labels, average=average, zero_division=0)

    scores = {
        f"f1_{lab}": float(value)
        for lab, value in zip(labels, _f1(None))
    }
    scores["f1_macro"] = float(_f1("macro"))
    scores["f1_weighted"] = float(_f1("weighted"))
    return scores


def _consistency_rate(preds_df: pd.DataFrame) -> float:
    """
    Share of rows in which every column holds the same value.

    Each column is one run and each row one item. Returns NaN when there are
    fewer than two columns or no rows.
    """
    n_items, n_runs = preds_df.shape
    if n_runs < 2 or n_items == 0:
        return np.nan

    distinct_per_item = preds_df.nunique(axis=1)
    return float(distinct_per_item.eq(1).mean())


def _mean_cohen_kappa(preds_df: pd.DataFrame) -> float:
    """
    Average Cohen's kappa over every unordered pair of columns.

    Each column is one run. Returns NaN when there are fewer than two columns
    or no rows.
    """
    run_names = list(preds_df.columns)
    if len(run_names) < 2 or preds_df.shape[0] == 0:
        return np.nan

    # Pair each run with every run that comes after it.
    kappas = [
        cohen_kappa_score(preds_df[first], preds_df[second])
        for pos, first in enumerate(run_names)
        for second in run_names[pos + 1:]
    ]
    return float(np.mean(kappas)) if kappas else np.nan


def _fleiss_from_preds(preds_df: pd.DataFrame, labels: list) -> float:
    """
    Fleiss' kappa across the columns (runs) of ``preds_df``.

    For every row, count how many columns equal each entry of ``labels``;
    the resulting items-by-labels count table is passed to ``fleiss_kappa``.
    """
    count_table = []
    for ratings in preds_df.to_numpy():
        count_table.append([(ratings == label).sum() for label in labels])
    return float(fleiss_kappa(np.array(count_table)))


def parse_prefix(prefix: str) -> dict:
    """
    Parse a run-column prefix of the form ``model__prompt__tX``.

    The prefix is split on ``"__"``. The first part is the model name, the
    second (if present) the prompt type, and the third (if present) the
    temperature tag.

    Returns a dict with the keys:
    - ``model``: first part of the prefix, unchanged.
    - ``provider``: "OpenAI", "Mistral", "Anthropic", or None if the model
      name matches none of the known patterns.
    - ``prompt_type``: second part, or None if absent.
    - ``temperature``: the integer after ``t`` divided by 10
      (t0 -> 0.0, t10 -> 1.0), or None if the tag is absent or not an integer.
    """
    parts = prefix.split("__")
    # str.split always yields at least one element, so parts[0] is safe.
    model_raw = parts[0]
    prompt_type = parts[1] if len(parts) >= 2 else None
    tpart = parts[2] if len(parts) >= 3 else None

    temperature = None
    if tpart is not None and tpart.startswith("t"):
        try:
            temperature = int(tpart[1:]) / 10.0  # t0 -> 0.0, t10 -> 1.0
        except ValueError:
            # Tag starts with "t" but is not followed by an integer.
            temperature = None

    m = model_raw.lower()
    # OpenAI's o-series is matched as "o" + digit. A bare startswith("o")
    # would also capture names such as "open-mistral-..." or "opus-..." and
    # keep them from reaching the Mistral / Anthropic branches below.
    if m.startswith(("gpt", "openai")) or re.match(r"o\d", m):
        provider = "OpenAI"
    elif "mistral" in m or "ministral" in m:
        provider = "Mistral"
    elif any(x in m for x in ("claude", "haiku", "opus", "sonnet")):
        provider = "Anthropic"
    else:
        provider = None

    return dict(model=model_raw, provider=provider, prompt_type=prompt_type, temperature=temperature)


def _run_num(colname: str) -> int:
    """
    Sort key for run columns: the run number parsed from the column name.

    Names that do not match ``RUN_COL_RE`` get a very large number so they
    sort after every real run.
    """
    matched = RUN_COL_RE.match(colname)
    if matched is None:
        return 10**9
    return int(matched.group("run"))

## Main function for compiling results DataFrame for all models

In [None]:
def _agreement_stats(preds: pd.DataFrame, labels: list) -> dict:
    """
    Summarize run-to-run agreement for one model configuration.

    Rows with a missing prediction in any run are dropped first, so the three
    statistics are computed on the same items. Returns a dict with the keys
    ``consistency_all_runs``, ``mean_cohen_kappa`` and ``fleiss_kappa``.
    """
    complete = preds.dropna(axis=0, how="any")

    consistency = _consistency_rate(complete)
    mean_ck = _mean_cohen_kappa(complete)

    fleiss = np.nan
    if complete.shape[1] >= 2 and complete.shape[0] > 0:
        try:
            fleiss = _fleiss_from_preds(complete, labels=labels)
        except Exception:
            # Deliberate best-effort: a failing Fleiss computation must not
            # abort the whole table, so the cell is left as NaN.
            fleiss = np.nan

    return {
        "consistency_all_runs": consistency,
        "mean_cohen_kappa": mean_ck,
        "fleiss_kappa": fleiss,
    }


def _sorted_results(records: list) -> pd.DataFrame:
    """
    Turn result records into a DataFrame sorted by provider, model, temperature.

    An empty record list yields an empty DataFrame; sorting is skipped in that
    case because the sort columns do not exist.
    """
    frame = pd.DataFrame(records)
    if frame.empty:
        return frame
    return frame.sort_values(["provider", "model", "temperature"], na_position="last")


def build_llm_results(df: pd.DataFrame):
    """
    Build the per-model result tables used for the output tables.

    Columns of ``df`` whose names match ``RUN_COL_RE`` are grouped by prefix;
    each prefix is one model configuration with one or more runs. For every
    configuration:

    - performance metrics are computed on a single representative run (the
      column ending in ``__run1`` / ``__r1`` if present, otherwise the run
      with the lowest run number), using only rows where both the ground
      truth and that run's prediction are present;
    - agreement statistics are computed across all runs of the configuration.

    Returns a tuple ``(innovation_df, classification_df)``: the first holds the
    binary innovation metrics, the second the multiclass metrics. Both carry
    the parsed model metadata and are sorted by provider, model and
    temperature. If ``df`` has no run columns, both are empty DataFrames.
    """
    # Group run columns by prefix
    prefix_to_cols = {}
    for c in df.columns:
        m = RUN_COL_RE.match(c)
        if m:
            prefix_to_cols.setdefault(m.group("prefix"), []).append(c)

    # Ground truth
    y_true_cls = df[TRUTH_COL].astype("Int64")
    y_true_innov = innovation_dummy(y_true_cls)

    innovation_records = []
    classification_records = []

    for prefix, cols in prefix_to_cols.items():
        cols_sorted = sorted(cols, key=_run_num)

        # choose representative run
        run1 = next(
            (c for c in cols_sorted if c.endswith(("__run1", "__r1"))),
            cols_sorted[0],
        )

        # Fields shared by the innovation and the classification record.
        base = {
            **parse_prefix(prefix),
            "prefix": prefix,
            "run_used": run1,
            "n_runs_available": len(cols_sorted),
        }

        # representative predictions
        y_pred_run1_cls = df[run1].astype("Int64")
        valid = y_true_cls.notna() & y_pred_run1_cls.notna()

        # predictions of all runs, as class labels and as innovation dummy
        preds_cls = pd.DataFrame({c: df[c].astype("Int64") for c in cols_sorted})
        preds_innov = pd.DataFrame({c: innovation_dummy(preds_cls[c]) for c in cols_sorted})

        # --- Innovation metrics (binary) ---
        innov_metrics = dict(f1=np.nan, precision=np.nan, recall=np.nan, accuracy=np.nan)
        if valid.any():
            innov_metrics = compute_binary_metrics(
                y_true_innov[valid],
                innovation_dummy(y_pred_run1_cls[valid]),
            )

        innovation_records.append({
            **base,
            **innov_metrics,
            **_agreement_stats(preds_innov, labels=[0, 1]),
        })

        # --- Classification metrics (multiclass) ---
        cls_metrics = {f"f1_{lab}": np.nan for lab in LABELS_1_7}
        cls_metrics.update(dict(f1_macro=np.nan, f1_weighted=np.nan))
        if valid.any():
            cls_metrics = compute_multiclass_metrics(
                y_true_cls[valid],
                y_pred_run1_cls[valid],
                labels=LABELS_1_7,
            )

        classification_records.append({
            **base,
            **cls_metrics,
            **_agreement_stats(preds_cls, labels=LABELS_1_7),
        })

    return _sorted_results(innovation_records), _sorted_results(classification_records)

## Load Data + Compile Outputs

In [None]:
# Where the two result tables are written
innovation_out_path = os.path.join(OUT_DIR, "performance_updates_innovation_frontier.csv")
classification_out_path = os.path.join(OUT_DIR, "performance_updates_label-specific_frontier.csv")

# Read the validation set with the LLM prediction columns
df = pd.read_csv(LLM_PATH, low_memory=False)

# One row per model configuration: binary and multiclass results
innovation_df, classification_df = build_llm_results(df)

# Persist both tables without the index column
innovation_df.to_csv(innovation_out_path, index=False)
classification_df.to_csv(classification_out_path, index=False)

## Tables
In this section, we generate Table F9 for the Online Appendix, based on the processed results.

#### Output and Helper Function

In [None]:
# Table output columns, composed from the parts both tables share.
_META_COLS = ["model", "provider", "prompt_type", "temperature", "run_used", "n_runs_available"]
_AGREEMENT_COLS = ["fleiss_kappa", "mean_cohen_kappa", "consistency_all_runs"]

# Binary (innovation) table: metadata, binary metrics, agreement statistics.
BINARY_COLS = [
    *_META_COLS,
    "f1", "precision", "recall", "accuracy",
    *_AGREEMENT_COLS,
]

# Multiclass table: metadata, per-class F1, aggregate F1, agreement statistics.
MULTICLASS_COLS = [
    *_META_COLS,
    *[f"f1_{label}" for label in range(1, 8)],
    "f1_macro", "f1_weighted",
    *_AGREEMENT_COLS,
]

# Select rows by keyword filters and keep the requested columns
def make_table(df, columns, **filters):
    """
    Return a filtered copy of ``df`` restricted to ``columns``.

    Each keyword in ``filters`` names a column of ``df``. A list, tuple or set
    value keeps rows whose entry is one of the given values; any other value
    keeps rows whose entry equals it. Rows where the filter column is missing
    are always kept. Entries of ``columns`` that are not in ``df`` are skipped.
    """
    keep = pd.Series(True, index=df.index)
    for name, wanted in filters.items():
        column = df[name]
        if isinstance(wanted, (list, tuple, set)):
            hit = column.isin(wanted)
        else:
            hit = column == wanted
        keep = keep & (hit | column.isna())

    present = [name for name in columns if name in df.columns]
    return df.loc[keep, present].copy()

#### Build and export all tables

In [None]:
# No filters are passed, so every model configuration is kept; only the
# column selection and order are applied.
table_innov = make_table(innovation_df, BINARY_COLS)
table_cls = make_table(classification_df, MULTICLASS_COLS)

# to_excel does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(VISUAL_DIR, exist_ok=True)

table_innov.to_excel(os.path.join(VISUAL_DIR, "table_F9-1_01.xlsx"), index=False)
table_cls.to_excel(os.path.join(VISUAL_DIR, "table_F9-2_01.xlsx"), index=False)