## Setup
In this notebook, we compute the same evaluation metrics as in the demonstration studies for a small set of LLM prediction columns (frontier models) and create Figure E6 for the Online Appendix.

#### Imports

In [None]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score

#### Paths

In [None]:
# Demo root: the parent of the current working directory, made absolute.
DEMO_PATH = os.path.abspath(os.pardir)

# Sub-directories below the demo root (data read below, figure written at the end).
OUT_DIR, VISUAL_DIR = (
    os.path.join(DEMO_PATH, sub_dir) for sub_dir in ("output_data", "paper_visuals")
)

# CSV inputs loaded by the prediction sections of this notebook.
NLP_10K_PATH, NLP_20K_PATH, LLM_20K_PATH = (
    os.path.join(OUT_DIR, file_name)
    for file_name in (
        "10000_validation_with_model_preds_NLP.csv",
        "20000_validation_with_model_preds_NLP.csv",
        "20000_ft_validation_with_model_preds_LLM_cleaned.csv",
    )
)

## Pre-Processing
In this section, we define regular expressions and helper functions for parsing model output columns. This includes extracting metadata, standardizing column names, and processing model predictions. These routines are used to prepare Figure E6 for the Online Appendix.

In [None]:
# Column used as the reference (true) class when scoring predictions.
TRUTH_COL = "update_classification"
# The seven class ids, 1 through 7 inclusive.
LABELS_1_7 = [1, 2, 3, 4, 5, 6, 7]

# Matches names that end in "__run1" or "__r1".
RUN1_RE = re.compile(r"__(run1|r1)$")

In [None]:
## Helper Functions
def macro_f1_multiclass(y_true: pd.Series, y_pred: pd.Series, labels=LABELS_1_7) -> float:
    """
    Macro-averaged F1 score over the given class labels (default: 1..7).

    Both series are cast to nullable integers, and any row where the truth
    or the prediction is missing is excluded before scoring.
    """
    truth = y_true.astype("Int64")
    pred = y_pred.astype("Int64")

    # Keep only rows where neither side is NA.
    keep = ~(truth.isna() | pred.isna())

    score = f1_score(
        truth[keep].astype(int),
        pred[keep].astype(int),
        labels=labels,
        average="macro",
        zero_division=0,
    )
    return float(score)

## Predictions

#### NLP 10k Training Sample

In [None]:
# Validation rows with NLP model predictions (10k training sample).
df_10k_nlp = pd.read_csv(NLP_10K_PATH, low_memory=False)
y_true_10k_nlp = df_10k_nlp[TRUTH_COL]

# Model key -> prediction column. Predictions are coded 0..6 and are shifted
# to 1..7 below before scoring.
NLP_COLS = dict(
    NaiveBayes="tfidf__for__ml__NaiveBayes_pred",
    SVM="tfidf__for__ml__SVM_pred",
    XGBoost="tfidf__for__ml__XGBoost_pred",
    bert="bert_for_ml_pred",
    xlnet="xlnet_for_ml_pred",
)

# Macro-F1 per model key.
macro10k_nlp = {}
for key, col in NLP_COLS.items():
    # Non-numeric entries become NaN and are dropped inside the metric helper.
    y_pred_10k_nlp = pd.to_numeric(df_10k_nlp[col], errors="coerce").add(1)
    macro10k_nlp[key] = macro_f1_multiclass(y_true_10k_nlp, y_pred_10k_nlp)

#### NLP 20k Training Sample

In [None]:
# Validation rows with NLP model predictions (20k training sample).
df_20k_nlp = pd.read_csv(NLP_20K_PATH, low_memory=False)
y_true_20k_nlp = df_20k_nlp[TRUTH_COL]

# Same schema as the 10k NLP file, so the NLP_COLS mapping defined in the
# 10k cell is reused here instead of being declared a second time (the two
# copies were identical and could silently drift apart).

# Macro-F1 per model key.
macro20k_nlp = {}
for key, col in NLP_COLS.items():
    # Predictions are coded 0..6 -> shift to 1..7; non-numeric entries become
    # NaN and are dropped inside the metric helper.
    y_pred_20k_nlp = pd.to_numeric(df_20k_nlp[col], errors="coerce") + 1
    macro20k_nlp[key] = macro_f1_multiclass(y_true_20k_nlp, y_pred_20k_nlp)

#### LLM 20k Training Sample

In [None]:
# Validation rows with LLM predictions (20k training sample).
df_20k_llm = pd.read_csv(LLM_20K_PATH, low_memory=False)
y_true_20k_llm = df_20k_llm[TRUTH_COL]

# Restrict to first-run columns, i.e. names ending in "__run1" or "__r1".
run1_cols = [c for c in df_20k_llm.columns if RUN1_RE.search(c)]


def pick_first_contains(substr: str):
    """
    Return the first run-1 column whose name contains `substr`.

    Columns are searched in file order; returns None when nothing matches.
    """
    return next((c for c in run1_cols if substr in c), None)

# Model key -> run-1 prediction column (None when the file has no such column).
LLM_RUN1_COLS = {
    "gpt_4_1_nano": pick_first_contains("ft_gpt_4_1_nano"),
    "ministral_8b": pick_first_contains("ft_ministral_8b"),
}

# Macro-F1 per model key; models without a matching column are left out.
macro20k_llm = {}
for key, col in LLM_RUN1_COLS.items():
    if col is None:
        # Still best-effort, but no longer silent: say which model is missing
        # so an absent score is not mistaken for a real result.
        print(f"[warn] no run-1 prediction column found for '{key}'; skipping")
        continue
    # LLM predictions are scored as-is: no +1 shift, unlike the NLP columns.
    y_pred_20k_llm = pd.to_numeric(df_20k_llm[col], errors="coerce")
    macro20k_llm[key] = macro_f1_multiclass(y_true_20k_llm, y_pred_20k_llm)

## Figure

#### Assemble Plot Data

In [None]:
# Training size -> list of (model key, macro-F1, display label) tuples.
# The display label is what ends up in the figure legend.
# Fix: the Naive Bayes label was mis-encoded ("Na誰ve"); it now reads "Naïve".
data_by_size = {
    "2k": [                                             # hard-coded 2k baselines (macro-F1; from main article)
        ("NaiveBayes", 0.524, "Naïve Bayes"),
        ("SVM", 0.506, "SVM"),
        ("XGBoost", 0.501, "XGBoost"),
        ("xlnet", 0.547, "XLNet"),
        ("bert", 0.530, "BERT"),
        ("ministral_8b", 0.538, "Ministral 8B"),
        ("gpt_4_1_nano", 0.710, "GPT-4.1 Nano"),
    ],
    "10k": [
        # NLP @ 10k (no LLM entries at this size)
        ("NaiveBayes", macro10k_nlp["NaiveBayes"], "Naïve Bayes"),
        ("SVM", macro10k_nlp["SVM"], "SVM"),
        ("XGBoost", macro10k_nlp["XGBoost"], "XGBoost"),
        ("xlnet", macro10k_nlp["xlnet"], "XLNet"),
        ("bert", macro10k_nlp["bert"], "BERT"),
    ],
    "20k": [
        # NLP @ 20k
        ("NaiveBayes", macro20k_nlp["NaiveBayes"], "Naïve Bayes"),
        ("SVM", macro20k_nlp["SVM"], "SVM"),
        ("XGBoost", macro20k_nlp["XGBoost"], "XGBoost"),
        ("xlnet", macro20k_nlp["xlnet"], "XLNet"),
        ("bert", macro20k_nlp["bert"], "BERT"),
        # LLM @ 20k (run1); NaN when no score was computed for the model
        ("ministral_8b", macro20k_llm.get("ministral_8b", np.nan), "Ministral 8B"),
        ("gpt_4_1_nano", macro20k_llm.get("gpt_4_1_nano", np.nan), "GPT-4.1 Nano"),
    ],
}

#### Create Figure: Macro Avg. F1 vs. Scaled Training Size

In [None]:
plt.rc("font", family="Times New Roman")

# Per-model styling. Only "color" is read by the plotting loop below; the
# legend text comes from the display labels stored in data_by_size.
STYLE_LLM = {
    "gpt_4_1_nano": {"color": "#6BAED6", "marker": "^", "label": "GPT-4.1 Nano"},
    "ministral_8b": {"color": "#74C476", "marker": "p", "label": "Ministral 8B"},
}
STYLE_NON = {
    # Fix: label was mis-encoded ("Na誰ve"); it now reads "Naïve".
    "NaiveBayes": {"color": "#FF6347", "marker": "D", "label": "Naïve Bayes"},
    "SVM": {"color": "#FA8072", "marker": "v", "label": "SVM"},
    "XGBoost": {"color": "#FF4500", "marker": "p", "label": "XGBoost"},
    "bert": {"color": "#2F4F4F", "marker": "X", "label": "BERT"},
    "xlnet": {"color": "#D3D3D3", "marker": "D", "label": "XLNet"},
}

STYLE = {
    **STYLE_NON,
    **STYLE_LLM,
}

# One bar group per training size, in data_by_size insertion order.
sizes = list(data_by_size.keys())
n_groups = len(sizes)

# Layout constants, in x-axis data units.
bar_width = 0.08
inner_gap = 0.02
group_gap = 0.4

# Space the groups by the widest one so that no two groups overlap.
max_bars = max(len(v) for v in data_by_size.values())
group_span = max_bars * (bar_width + inner_gap) + group_gap
group_centers = np.arange(n_groups) * group_span

fig, ax = plt.subplots(figsize=(10, 6))
# model key -> (bar artist, display label), recorded the first time a model is drawn
handles_done = {}

for gi, size in enumerate(sizes):
    models_for_size = data_by_size[size]
    n_bars = len(models_for_size)

    # NOTE(review): ax.bar centers each bar on x, so a group's visual midpoint
    # sits half a slot to the left of its tick. Left unchanged on purpose to
    # keep the figure layout as published.
    total_width = n_bars * (bar_width + inner_gap)
    start_x = group_centers[gi] - total_width / 2.0

    for bi, (model_key, macro_f1, nice_label) in enumerate(models_for_size):
        x = start_x + bi * (bar_width + inner_gap)

        bar = ax.bar(
            x, macro_f1,
            width=bar_width,
            color=STYLE[model_key]["color"],
            edgecolor="black"
        )

        if model_key not in handles_done:
            handles_done[model_key] = (bar[0], nice_label)

# optional teacher/reference line
teacher_f1 = 0.744
ax.axhline(teacher_f1, color="black", linestyle="--", linewidth=1)
ax.text(
    group_centers[0] + group_gap * 0.2,
    teacher_f1,
    # Derived from teacher_f1 so the annotation cannot drift from the line.
    f"Teacher Model (F1 = {teacher_f1:.3f})",
    va="bottom", ha="left", fontsize=9
)

ax.set_xticks(group_centers)
ax.set_xticklabels(sizes)
ax.set_xlabel("Training-Data Size (N)")
ax.set_ylabel("Macro Avg. F1-Score")
ax.set_title("Teacher-Student Setup: Macro Avg. F1-Score vs. Scaled Training Size")

# Fixed legend order; models that were never drawn are skipped.
legend_order = ["NaiveBayes", "SVM", "XGBoost", "xlnet", "bert", "ministral_8b", "gpt_4_1_nano"]
legend_handles = [handles_done[k][0] for k in legend_order if k in handles_done]
legend_labels = [handles_done[k][1] for k in legend_order if k in handles_done]

# Legend is anchored just outside the right edge of the axes.
ax.legend(
    legend_handles, legend_labels,
    title="Student Model",
    loc="lower left",
    bbox_to_anchor=(1.02, 0.02),
    borderaxespad=0.5
)

# Reserve the right 20% of the figure for the outside legend.
plt.tight_layout(rect=[0, 0, 0.8, 1])
ax.grid(axis="y", linestyle="--", alpha=0.3)

# Create the output directory if needed so savefig does not fail on a fresh checkout.
os.makedirs(VISUAL_DIR, exist_ok=True)
out_png = os.path.join(VISUAL_DIR, "figure_E6.png")
plt.savefig(out_png, dpi=300, bbox_inches="tight")
plt.show()