## Setup
In this notebook, we generate all tables and figures based on the results of demonstration study 1, for both the main paper and the online appendix. This includes data filtering, statistical summaries, and all visualization scripts. All output files are saved to the `paper_visuals` and `output_data` directories.

#### Imports
See `requirements.txt` for the full list of dependencies and their versions.

In [None]:
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa

#### Global Variables, Paths, and Settings

In [None]:
# Root of the demonstration study: the parent of the current working directory
DEMO_PATH = os.path.abspath(os.pardir)

# Output folders that live directly below the study root
OUT_DIR, VISUAL_DIR = (
    os.path.join(DEMO_PATH, folder)
    for folder in ("output_data", "paper_visuals")
)

# Prediction / classification files read from the output-data folder
LLM_PATH, MODELS_PATH, LITERATURE_PATH = (
    os.path.join(OUT_DIR, csv_name)
    for csv_name in (
        "validation_with_model_preds_LLM_cleaned.csv",
        "validation_with_model_preds_NLP.csv",
        "validation_literature_classification.csv",
    )
)

## Pre-Processing
In this section, we define regular expressions and helper functions for parsing model output columns. This includes extracting metadata (e.g., data type, size, classifier/model, and vectorizer) for classical ML, CNN, and PLM models, as well as standardizing column names and processing LLM results. These routines are used to prepare the results tables and figures for the paper and online appendix.

In [None]:
# Vocabulary that appears in model output column names; used for metadata extraction
DATA_TYPES   = ['real', 'equal']
DATA_SIZES   = ['2000','1000','500','250','100']
CLASSIFIERS  = ['RandomForest','KNeighbors','LogisticRegression',
                'NaiveBayes','SVM','XGBoost']
CNN_MODELS   = ['TextCNN42B']
PLM_MODELS   = ['roberta','bert','electra','xlnet']
GLOVE_VECTORIZERS = ["6B-100d","6B-300d","42B-300d","840B-300d"]


def _alternation(options):
    """Return a regex alternation body that matches any of *options* literally."""
    # Escaping keeps the patterns correct even if a name ever contains a
    # regex metacharacter (the GloVe names already contain "-").
    return "|".join(re.escape(option) for option in options)


_TYPES_ALT = _alternation(DATA_TYPES)
_SIZES_ALT = _alternation(DATA_SIZES)
_CLF_ALT   = _alternation(CLASSIFIERS)

# Pre-compiled, case-insensitive patterns matching the output columns of each model family
# <vectorizer>__<distribution>__<size>__<classifier>_pred
ML_PATTERN    = re.compile(
    rf'^(.+?)__({_TYPES_ALT})__({_SIZES_ALT})__({_CLF_ALT})_pred$', re.I)
# <cnn model>__<distribution>__<size>_pred
CNN_PATTERN   = re.compile(
    rf'^({_alternation(CNN_MODELS)})__({_TYPES_ALT})__({_SIZES_ALT})_pred$', re.I)
# <plm model>_<distribution>_<size>_pred
PLM_PATTERN   = re.compile(
    rf'^({_alternation(PLM_MODELS)})_({_TYPES_ALT})_({_SIZES_ALT})_pred$', re.I)
# <glove vectorizer>_<distribution>_<size>__<classifier>_pred
GLOVE_PATTERN = re.compile(
    rf'^({_alternation(GLOVE_VECTORIZERS)})_({_TYPES_ALT})_({_SIZES_ALT})__({_CLF_ALT})_pred$', re.I)

# Known LLM keys, longest first so that substring lookups hit the most specific
# key (e.g. "gpt_4_1_mini" before "gpt_4_1")
MODEL_KEYS = sorted([
    # OpenAI
    "gpt_4_1_nano", "gpt_4_1_mini", "gpt_4_1",
    # Mistral
    "mistral_large", "mistral_medium", "mistral_small",
    "open_mistral_nemo", "ministral_8b", "ministral_3b",
    # Anthropic
    "4_sonnet", "sonnet_4",
    "3_5_haiku", "3_haiku",
    "3_7_sonnet", "3_5_sonnet",
    "3_opus", "opus_4",
    # OpenAI (legacy keys)
    "gpt_4o_mini", "gpt_4o",
    "gpt_3_5_turbo_0125",
    "o4_mini_2025_04_16", "o3_2025_04_16",
], key=len, reverse=True)


def _provider_of(model_key):
    """Return the provider name for a model key, or None if it is not recognised."""
    # Mistral is tested first: "open_mistral_nemo" starts with "o" but is not OpenAI.
    if "mistral" in model_key or "ministral" in model_key:
        return "Mistral"
    # Substring (not suffix) test so that "sonnet_4" and "opus_4" are recognised too.
    if any(family in model_key for family in ("haiku", "sonnet", "opus")):
        return "Anthropic"
    # OpenAI keys are "gpt..." or the o-series ("o3_...", "o4_..."), i.e. "o" + digit.
    if model_key.startswith("gpt") or re.match(r"o\d", model_key):
        return "OpenAI"
    return None


# Mapping of every model key to its provider (OpenAI, Mistral, Anthropic)
PROVIDER = {k: _provider_of(k) for k in MODEL_KEYS}

#### Helper functions for column and prediction parsing

In [None]:
# Model metadata parsing helpers
def parse_ml_info(col):
    """
    Extract model metadata from an ML/CNN/PLM prediction column name.

    The column name is tried against the GloVe, generic ML, CNN and PLM
    patterns, in that order. The first match yields a dictionary with the
    keys model, category, vectorizer, distribution and size (size as int).
    Returns None when no pattern matches.
    """
    # (pattern, category, whether the pattern captures a vectorizer + classifier)
    families = (
        (GLOVE_PATTERN, "ML", True),
        (ML_PATTERN, "ML", True),
        (CNN_PATTERN, "CNN", False),
        (PLM_PATTERN, "PLM", False),
    )
    for pattern, category, has_vectorizer in families:
        hit = pattern.match(col)
        if not hit:
            continue
        if has_vectorizer:
            # groups: vectorizer, distribution, size, classifier
            vectorizer, distribution, size, model = hit.groups()
        else:
            # groups: model, distribution, size
            model, distribution, size = hit.groups()
            vectorizer = None
        return dict(
            model=model,
            category=category,
            vectorizer=vectorizer,
            distribution=distribution,
            size=int(size),
        )
    return None


def parse_llm_info(prefix):
    """
    Parse an LLM column prefix of the form ``<core>__<prompt_type>__<temp>``.

    Parameters
    ----------
    prefix : str
        Column name with the run suffix already removed.

    Returns
    -------
    dict or None
        None if the prefix does not consist of exactly three "__"-separated
        parts. Otherwise a dictionary with the keys model, category ("LLM"),
        provider, prompt_type, temperature, fine_tuned, distribution, size
        and vectorizer (always None). Fields that cannot be derived from the
        prefix are None; an unrecognised model falls back to the raw core
        string.
    """
    parts = prefix.split("__")
    if len(parts) != 3:
        return None
    core, prompt_type, temp_str = parts

    # Fine-tuned models are marked by a leading "ft_", which is stripped
    # before the model key lookup.
    fine_tuned = int(core.startswith("ft_"))
    if fine_tuned:
        core = core[3:]

    # MODEL_KEYS is ordered longest-first, so the most specific key wins.
    model = next((mk for mk in MODEL_KEYS if mk in core), None)
    provider = PROVIDER.get(model)

    # Optional training-data descriptor, e.g. "_real_2000"
    m_ds = re.search(rf"_(real|equal)_({'|'.join(DATA_SIZES)})", core)
    distribution, size = (m_ds.group(1), int(m_ds.group(2))) if m_ds else (None, None)

    # Temperature is encoded in tenths: "t0" -> 0.0, "t15" -> 1.5
    temperature = None
    if temp_str.startswith("t"):
        try:
            temperature = int(temp_str[1:]) / 10.0
        except ValueError:
            # Non-numeric suffix: leave the temperature undefined. Only the
            # int() conversion can fail here, so nothing broader is caught.
            temperature = None

    return dict(
        model=model or core, category="LLM", provider=provider, prompt_type=prompt_type,
        temperature=temperature, fine_tuned=fine_tuned, distribution=distribution, size=size,
        vectorizer=None
    )

# Metric computation helpers
def innovation_dummy(labels: pd.Series) -> pd.Series:
    """
    Collapse multiclass labels into the binary innovation target.

    Labels less than or equal to 2 become 1 (positive class); every other
    value becomes 0 (negative class).
    """
    is_innovation = labels <= 2
    return is_innovation.astype(int)

def compute_binary_metrics(y_true, y_pred):
    """
    Compute F1, precision, recall and accuracy for binary predictions.

    F1, precision and recall use average="binary" with zero_division=0.
    NOTE: the F1 value is stored under the key "f1_macro" although it is
    computed with binary averaging, not macro averaging.
    Returns a dictionary of metric names to scores.
    """
    binary_kwargs = dict(average="binary", zero_division=0)
    scores = {
        "f1_macro": f1_score(y_true, y_pred, **binary_kwargs),
        "precision": precision_score(y_true, y_pred, **binary_kwargs),
        "recall": recall_score(y_true, y_pred, **binary_kwargs),
        "accuracy": accuracy_score(y_true, y_pred),
    }
    return scores

def compute_multiclass_f1(y_true, y_pred, labels=range(1,8)):
    """
    Compute per-class F1, macro F1 and weighted F1 for multiclass predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted class labels.
    labels : iterable of int
        Classes for which a per-class score ``f1_<label>`` is reported.

    Returns
    -------
    dict
        ``f1_<label>`` for every entry of *labels*, plus ``f1_macro`` and
        ``f1_weighted``. The two averaged scores are computed without a
        ``labels`` restriction, exactly as before.
    """
    # Materialise once so that any iterable (including a one-shot iterator) works.
    label_list = list(labels)
    per_class = f1_score(y_true, y_pred, labels=label_list, average=None, zero_division=0)
    # Scores come back in the order of label_list; pair them positionally
    # instead of assuming the labels are 1..k.
    out = {f'f1_{lab}': score for lab, score in zip(label_list, per_class)}
    out['f1_macro'] = f1_score(y_true, y_pred, average='macro', zero_division=0)
    out['f1_weighted'] = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    return out

def compute_fleiss_kappa_binary(preds_df):
    """
    Fleiss' kappa across several runs ("raters") of binary predictions.

    Each row of preds_df is one item and each column one run. Per item, the
    number of 0-votes and 1-votes is counted and passed to fleiss_kappa.
    Returns the kappa value.
    """
    votes = preds_df.values
    negatives = (1 - votes).sum(1)
    positives = votes.sum(1)
    count_table = np.stack([negatives, positives], axis=1)
    return fleiss_kappa(count_table)

def compute_fleiss_kappa_multiclass(preds_df, labels=range(1,8)):
    """
    Fleiss' kappa across several runs ("raters") of multiclass predictions.

    For every item (row of preds_df) the number of runs that voted for each
    entry of labels is counted; the resulting table is passed to fleiss_kappa.
    Returns the kappa value.
    """
    count_table = [
        [(item_votes == lab).sum() for lab in labels]
        for item_votes in preds_df.values
    ]
    return fleiss_kappa(np.array(count_table))

#### Main function for compiling results DataFrame for all models

In [None]:
def build_results_table(df_ml, df_llm, task="innovation"):
    """
    Build a tidy DataFrame with one row per predictor (literature approaches,
    ML, CNN, PLM, LLM), holding its metadata, performance metrics and, for
    LLMs, run-to-run consistency. Used for generating result tables and figures.

    Parameters
    ----------
    df_ml : DataFrame
        Holds the ground-truth column "update_classification", the ML/CNN/PLM
        prediction columns and, optionally, the literature-approach columns.
    df_llm : DataFrame
        Holds the ground-truth column and one column per LLM run.
    task : str
        "innovation" evaluates the binary innovation dummy with
        compute_binary_metrics; any other value evaluates the integer labels
        with compute_multiclass_f1.

    Returns
    -------
    DataFrame
        One record per predictor. Columns that cannot be cast to int are
        skipped silently.
    """
    records = []

    # Ground-truth column and the label transformation that matches the task
    TRUTH_COL = "update_classification"
    label_fn = innovation_dummy if task=="innovation" else (lambda x: x.astype(int))

    # Literature approaches (only evaluated for the binary innovation task;
    # their columns are used as binary predictions without transformation)
    if task == "innovation":
        literature_cols = ['first_digit', 'second_digit', 'KF23_innovation', 'AK23_innovation']
        y_true_lit = label_fn(df_ml[TRUTH_COL])
        for col in literature_cols:
            if col not in df_ml.columns:
                continue
            try:
                y_pred = df_ml[col].astype(int)
            except Exception:
                # Column not castable to int: skip this approach
                continue
            metrics = compute_binary_metrics(y_true_lit, y_pred)
            rec = {
                "raw_name": col,
                "raw_prefix": None,
                "model": col,
                "category": "Literature",
                "provider": None,
                "prompt_type": None,
                "temperature": None,
                "fine_tuned": None,
                "distribution": None,
                "size": None,
                "vectorizer": None,
                **metrics,
                "fleiss_kappa": np.nan,
                "mean_cohen_kappa": np.nan,
                "consistency_all_runs": np.nan
            }
            records.append(rec)

    # ML / CNN / PLM models: every column whose name parse_ml_info recognises
    y_true_ml = label_fn(df_ml[TRUTH_COL])
    for col in df_ml.columns:
        info = parse_ml_info(col)
        if info is None: continue
        # Predictions are stored as 0–6 and shifted to 1–7 before label_fn is applied.
        # parse_ml_info only returns the categories ML, CNN and PLM, so the
        # else-branch of the conditional below is never taken.
        try:
            y_pred_raw = df_ml[col].astype(int) + 1 if info['category'] in ('ML','CNN','PLM') else df_ml[col].astype(int)
        except Exception: continue
        y_pred = label_fn(y_pred_raw)
        if task=="innovation":
            metrics = compute_binary_metrics(y_true_ml, y_pred)
        else:
            metrics = compute_multiclass_f1(y_true_ml, y_pred)
        # LLM-only metadata fields are absent from `info` and therefore None here
        rec = {
            "raw_name": col, "raw_prefix": None, **info,
            "provider": info.get("provider", None),
            "prompt_type": info.get("prompt_type", None),
            "temperature": info.get("temperature", None),
            "fine_tuned": info.get("fine_tuned", None),
            **metrics,
            "fleiss_kappa": np.nan, "mean_cohen_kappa": np.nan, "consistency_all_runs": np.nan
        }
        records.append(rec)

    # LLMs: run columns either contain "__run" or end in "__r<digits>"
    y_true_llm = label_fn(df_llm[TRUTH_COL])
    all_llm_cols = [c for c in df_llm.columns if ("__run" in c) or re.search(r"__r\d+$", c)]
    # Group the run columns by their shared prefix (column name without run suffix)
    prefix_to_runs = {}
    for col in all_llm_cols:
        prefix = col.rsplit("__run", 1)[0] if "__run" in col else col.rsplit("__r", 1)[0]
        prefix_to_runs.setdefault(prefix, []).append(col)
    for prefix, run_cols in prefix_to_runs.items():
        info = parse_llm_info(prefix)
        if not info: continue
        # Performance metrics are computed on the first run only
        run1_col = next((c for c in run_cols if c.endswith("__run1") or c.endswith("__r1")), None)
        if not run1_col: continue
        try:
            y_pred_run1 = label_fn(df_llm[run1_col].astype(int))
        except Exception: continue
        if task=="innovation":
            metrics = compute_binary_metrics(y_true_llm, y_pred_run1)
        else:
            metrics = compute_multiclass_f1(y_true_llm, y_pred_run1)
        # Consistency statistics across all runs; they stay NaN if anything fails
        fleiss_k = mean_ck = consistency = np.nan
        try:
            preds_df = pd.DataFrame({
                rc: label_fn(df_llm[rc].astype(int))
                for rc in run_cols if rc in df_llm
            })
            # NOTE(review): a run column that fails astype(int) above (presumably
            # the case for missing values) raises before this point and is caught
            # by the outer except, so this dropna may never remove rows — confirm.
            preds_df = preds_df.dropna(axis=0, how='any')
            # Only if there are >=2 runs and non-empty
            if preds_df.shape[1] > 1 and preds_df.shape[0] > 0:
                try:
                    if task=="innovation":
                        fleiss_k = compute_fleiss_kappa_binary(preds_df)
                    else:
                        fleiss_k = compute_fleiss_kappa_multiclass(preds_df)
                except Exception:
                    fleiss_k = np.nan
                try:
                    # Cohen's kappa for every unordered pair of runs, then averaged
                    cohens = [
                        cohen_kappa_score(preds_df[a], preds_df[b])
                        for i, a in enumerate(preds_df.columns)
                        for j, b in enumerate(preds_df.columns) if j > i
                    ]
                    mean_ck = float(np.mean(cohens)) if cohens else np.nan
                except Exception:
                    mean_ck = np.nan
                try:
                    # Share of items on which all runs give the same prediction
                    consistency = (preds_df.nunique(axis=1) == 1).mean()
                except Exception:
                    consistency = np.nan
        except Exception:
            fleiss_k = mean_ck = consistency = np.nan
        rec = {
            "raw_name": None, "raw_prefix": prefix, **info, **metrics,
            "fleiss_kappa": fleiss_k,
            "mean_cohen_kappa": mean_ck,
            "consistency_all_runs": consistency
        }
        records.append(rec)
    return pd.DataFrame.from_records(records)

#### Load Model Predictions and Compile Results Table
We load the output prediction files for classical models (`MODELS_PATH`) and LLMs (`LLM_PATH`). We standardize column names, parse predictions, and compile a tidy results table containing all relevant evaluation metrics and metadata for each predictor. Processed results are saved in the `output_data` directory for downstream table and figure generation.

In [None]:
# Load prediction files
df_models_raw = pd.read_csv(MODELS_PATH, low_memory=False)
df_llm_raw    = pd.read_csv(LLM_PATH,    low_memory=False)
df_lit        = pd.read_csv(LITERATURE_PATH, low_memory=False)

# Attach the literature-approach columns to the model predictions.
# The two files are joined purely by row position, so they must describe the
# same observations in the same order.
literature_cols = ['first_digit', 'second_digit', 'KF23_innovation', 'AK23_innovation']
cols_to_add = [c for c in literature_cols if c in df_lit.columns and c not in df_models_raw.columns]
if cols_to_add:
    # A row-count mismatch would make concat pad with NaN and silently
    # misalign predictions and labels, so fail loudly instead.
    if len(df_lit) != len(df_models_raw):
        raise ValueError(
            "Cannot merge literature classifications by position: "
            f"{LITERATURE_PATH} has {len(df_lit)} rows but "
            f"{MODELS_PATH} has {len(df_models_raw)} rows."
        )
    df_models_raw = pd.concat(
        [df_models_raw.reset_index(drop=True), df_lit[cols_to_add].reset_index(drop=True)],
        axis=1
    )

# Compile results for all models and LLMs: binary innovation task
innovation_binary_df = build_results_table(df_models_raw, df_llm_raw, task="innovation")
innovation_binary_df.to_csv(os.path.join(OUT_DIR, "performance_updates_innovation.csv"), index=False)

# Compile results for all models and LLMs: label-specific (multiclass) task
classification_df = build_results_table(df_models_raw, df_llm_raw, task="classification")
classification_df.to_csv(os.path.join(OUT_DIR, "performance_updates_label-specific.csv"), index=False)

## Visuals
In this section, we generate all tables (Tables 2, 3, 5, and 6) and figures (Figures 6, 7, and C1) for the paper and online appendix, based on the processed results.

In [None]:
# Re-read the result tables written by the previous step, so that this
# section can be run from the saved CSV files
_innovation_csv = os.path.join(OUT_DIR, "performance_updates_innovation.csv")
_classification_csv = os.path.join(OUT_DIR, "performance_updates_label-specific.csv")
innovation_binary_df = pd.read_csv(_innovation_csv)
classification_df = pd.read_csv(_classification_csv)

### Tables

In [None]:
# Model groupings used to select rows for the individual tables
LITERATURE_APPROACHES = ['first_digit', 'second_digit', 'KF23_innovation', 'AK23_innovation']
CLASSICAL_MODELS = ["KNeighbors", "LogisticRegression", "NaiveBayes", "RandomForest", "SVM", "XGBoost"]
CNN_MODELS = ["TextCNN42B"]
PLM_MODELS = ["bert", "electra", "roberta", "xlnet"]
LLM_BASE_MODELS = [
    "3_5_haiku", "sonnet_4", "4_sonnet",
    "mistral_small", "mistral_large",
    "gpt_4_1_nano", "gpt_4_1_mini", "gpt_4_1",
]
GPT4_FAMILY = ["gpt_4_1", "gpt_4_1_mini", "gpt_4_1_nano"]

ALL_COMPARATORS = [*CLASSICAL_MODELS, *CNN_MODELS, *PLM_MODELS, *LLM_BASE_MODELS]

# Output columns: Table 2 (binary task) uses a different metric set than the
# label-specific tables. Both share the metadata and consistency columns.
_METADATA_COLS = [
    "model", "category", "provider", "fine_tuned", "temperature",
    "distribution", "size", "prompt_type", "vectorizer",
]
_CONSISTENCY_COLS = ["fleiss_kappa", "mean_cohen_kappa", "consistency_all_runs"]

# Binary task: summary metrics
BINARY_COLS = _METADATA_COLS + ["f1_macro", "precision", "recall", "accuracy"] + _CONSISTENCY_COLS

# Multiclass task: per-label F1 plus averaged F1
MULTICLASS_COLS = (
    _METADATA_COLS
    + [f"f1_{label}" for label in range(1, 8)]
    + ["f1_macro", "f1_weighted"]
    + _CONSISTENCY_COLS
)


def _or_missing(value):
    """Filter value that accepts either `value` or a missing entry."""
    return (value, np.nan)


# Training data of the representative setting (shared by all tables)
_REPRESENTATIVE_DATA = {
    "distribution": _or_missing("real"),
    "size": _or_missing(2000),
}
# Full representative setting used when all comparators are shown side by side
_ALL_COMPARATOR_FILTERS = {
    "temperature": _or_missing(0.0),
    **_REPRESENTATIVE_DATA,
    "prompt_type": _or_missing("default"),
    "vectorizer": _or_missing("tfidf"),
}

# Table specs for each output table: model list, row filters and task
TABLE_SPECS = {
    # Innovation binary, all comparators, representative setting
    "table2": {
        "models": ALL_COMPARATORS + LITERATURE_APPROACHES,
        "filters": dict(_ALL_COMPARATOR_FILTERS),
        "task": "innovation",
    },
    # Label-specific, all comparators, representative setting
    "table3": {
        "models": ALL_COMPARATORS,
        "filters": dict(_ALL_COMPARATOR_FILTERS),
        "task": "classification",
    },
    # Label-specific, all LLMs, representative setting
    "table5": {
        "models": None,   # all LLMs
        "filters": {
            "category": "LLM",
            "temperature": 0.0,
            **_REPRESENTATIVE_DATA,
            "prompt_type": "default",
        },
        "task": "classification",
    },
    # Label-specific, only GPT-4 family models, representative setting
    "table6": {
        "models": GPT4_FAMILY,
        "filters": {
            "temperature": 0.0,
            **_REPRESENTATIVE_DATA,
        },
        "task": "classification",
    },
}

#### Function to construct tables based on filters and models

In [None]:
def make_table(df, columns, models=None, **filters):
    """
    Return a filtered copy of df restricted to the requested columns.

    - models: list or None. If given, only rows whose "model" is listed are kept.
    - filters: column=value pairs. A list/tuple/set keeps rows whose value is
      one of the entries; a scalar keeps rows equal to it. In both cases rows
      with a missing value in that column are kept as well.
    - columns: requested output columns; names absent from df are ignored.
    """
    keep = pd.Series(True, index=df.index)
    if models is not None:
        keep = keep & df["model"].isin(models)

    for name, wanted in filters.items():
        values = df[name]
        if isinstance(wanted, (list, tuple, set)):
            matches = values.isin(wanted)
        else:
            matches = values == wanted
        # Missing entries always pass the filter
        keep = keep & (matches | values.isna())

    present = [c for c in columns if c in df.columns]
    return df.loc[keep, present].copy()

#### Build and export all tables

In [None]:
# Make sure the output folder exists so the export does not fail on a fresh checkout
os.makedirs(VISUAL_DIR, exist_ok=True)

for tname, spec in TABLE_SPECS.items():
    # Pick the results table and column layout that belong to the table's task
    if spec["task"] == "innovation":
        df_use = innovation_binary_df
        colspec = BINARY_COLS
    else:
        df_use = classification_df
        colspec = MULTICLASS_COLS

    table_df = make_table(
        df_use,
        columns=colspec,
        models=spec.get("models"),
        **spec.get("filters", {})
    )
    # One Excel file per table, e.g. "table2_updates.xlsx"
    out_path = os.path.join(VISUAL_DIR, f"{tname}_updates.xlsx")
    table_df.to_excel(out_path, index=False)

### Figures

In [None]:
# Set global matplotlib visual style for figures
# (applies to every figure created after this cell has run)
plt.rcParams.update({
    # Typography
    "font.family"      : "Times New Roman",
    "font.size"        : 11,
    "axes.titlesize"   : 14,
    "axes.labelsize"   : 12,
    "xtick.labelsize"  : 10,
    "ytick.labelsize"  : 10,
    "legend.fontsize"  : 8,
    # Grid
    "grid.linestyle"   : "--",
    "grid.linewidth"   : 0.5,
    # Figure saving defaults
    "savefig.dpi"      : 300,
    "savefig.bbox"     : "tight",
})

# Define color and marker palettes for LLM and non-LLM models.
# Keys are the values of the "model" column in the results tables; each entry
# gives the line color, the marker and the legend label of that model.
# The plotting functions look models up by key, so every plotted model must
# have an entry here.
STYLE_LLM = {
    "gpt_4_1":       {"color": "#08306B", "marker": "o", "label": "GPT-4.1"},
    "gpt_4_1_mini":  {"color": "#2171B5", "marker": "s", "label": "GPT-4.1 Mini"},
    "gpt_4_1_nano":  {"color": "#6BAED6", "marker": "^", "label": "GPT-4.1 Nano"},
    "mistral_large": {"color": "#006D2C", "marker": "D", "label": "Mistral Large"},
    "mistral_small": {"color": "#74C476", "marker": "p", "label": "Mistral Small"},
    "3_5_haiku":     {"color": "#A50F15", "marker": "X", "label": "Claude Haiku 3.5"},
    "sonnet_4":      {"color": "#FB6A4A", "marker": "*", "label": "Claude Sonnet 4"},
}
# Classical ML models, the CNN and the PLMs
STYLE_NON = {
    "RandomForest":       {"color": "#8B0000", "marker": "o", "label": "Random Forest"},
    "KNeighbors":         {"color": "#B22222", "marker": "s", "label": "K-Nearest Neighbor"},
    "LogisticRegression": {"color": "#DC143C", "marker": "^", "label": "Logistic Regression"},
    "NaiveBayes":         {"color": "#FF6347", "marker": "D", "label": "Naïve Bayes"},
    "SVM":                {"color": "#FA8072", "marker": "v", "label": "SVM"},
    "XGBoost":            {"color": "#FF4500", "marker": "p", "label": "XGBoost"},
    "TextCNN42B":         {"color": "#FFD700", "marker": "h", "label": "CNN"},
    "bert":               {"color": "#2F4F4F", "marker": "X", "label": "BERT"},
    "electra":            {"color": "#696969", "marker": "*", "label": "ELECTRA"},
    "roberta":            {"color": "#A9A9A9", "marker": "+", "label": "RoBERTa"},
    "xlnet":              {"color": "#D3D3D3", "marker": "D", "label": "XLNet"},
}

def build_legend_handles(style_dict, dashed_note=False):
    """
    Create legend handles and labels from a style palette.

    One solid line with the model's color and marker is produced per entry of
    style_dict, in dictionary order. With dashed_note=True an extra black
    dashed line labelled "Fine-Tuned Versions" is appended.
    Returns the tuple (handles, labels).
    """
    handles, labels = [], []
    for style in style_dict.values():
        handles.append(
            Line2D([0],[0], color=style["color"], marker=style["marker"],
                   linestyle="-", linewidth=1, markersize=6)
        )
        labels.append(style["label"])

    if dashed_note:
        handles.append(Line2D([0],[0], color="black", linestyle="--", linewidth=1))
        labels.append("Fine-Tuned Versions")
    return handles, labels

#### Functions to plot temperature, data size, and data distribution variations

In [None]:
# Plot performance curves over temperature for LLMs
def plot_llm_temp(classification_df, metric, ylabel, title, filename, ylim):
    """
    Draw one curve of `metric` over temperature per (model, fine_tuned) pair.

    Base models are drawn solid, fine-tuned versions dashed. The figure is
    saved as `filename` inside VISUAL_DIR and then shown.
    Used for Figure 6 (main paper).
    """
    fig, ax = plt.subplots(figsize=(8,5))

    curves = classification_df.groupby(["model", "fine_tuned"])
    for (model_key, is_fine_tuned), curve in curves:
        style = STYLE_LLM[model_key]
        line_style = "--" if is_fine_tuned else "-"
        curve = curve.sort_values("temperature")
        ax.plot(
            curve["temperature"], curve[metric],
            color=style["color"], marker=style["marker"],
            linestyle=line_style, linewidth=1, markersize=6,
        )

    legend_handles, legend_labels = build_legend_handles(STYLE_LLM, dashed_note=True)
    ax.legend(
        legend_handles, legend_labels,
        loc="lower left", frameon=True, framealpha=.7, edgecolor="black",
    )
    ax.set(xlabel="Temperature", ylabel=ylabel, title=title,
           xticks=[0, 0.5, 1, 1.5], ylim=ylim)
    ax.grid(True)

    target = os.path.join(VISUAL_DIR, filename)
    fig.savefig(target)
    plt.show()

# Plot F1 vs. data size for both non-LLM and LLM models
def plot_f1_vs_size(df_dist, df_base_llm, xticks, dist_label, dist=None):
    """
    Plot macro F1 as a function of training-data size for one distribution:
    (a) ML/CNN/PLM models and (b) LLMs.
    Used for Figures 7 (main paper) and C1 (Online Appendix).

    Parameters
    ----------
    df_dist : DataFrame
        Result rows of a single training-data distribution.
    df_base_llm : DataFrame
        Rows of the base (not fine-tuned) LLMs; drawn as horizontal reference lines.
    xticks : list
        Training-data sizes used as x ticks; the last entry is the right
        end of the base-LLM reference lines.
    dist_label : str
        Human-readable distribution name used in the figure titles.
    dist : str, optional
        Distribution key used in the output file names. If omitted, it is
        taken from the single value of df_dist["distribution"].

    Raises
    ------
    ValueError
        If `dist` is omitted and df_dist does not contain exactly one
        distribution value.
    """
    # The file names need the distribution key. It used to be read from a
    # global loop variable; derive it from the data instead so the function
    # works wherever it is called from.
    if dist is None:
        observed = df_dist["distribution"].dropna().unique()
        if len(observed) != 1:
            raise ValueError(
                "Cannot infer the distribution key from df_dist "
                f"(found {list(observed)}); pass dist= explicitly."
            )
        dist = observed[0]

    # non-LLM slice (tfidf only)
    df_nonllm = df_dist[
        (df_dist["category"] != "LLM") &
        ((df_dist["vectorizer"] == "tfidf") | df_dist["vectorizer"].isna())
    ].copy()
    if not df_nonllm.empty:
        fig, ax = plt.subplots(figsize=(8,5))
        for m, grp in df_nonllm.groupby("model"):
            props = STYLE_NON[m]
            grp   = grp.sort_values("size")
            ax.plot(grp["size"], grp["f1_macro"],
                    color=props["color"], marker=props["marker"],
                    linestyle="-", linewidth=1)
        h, l = build_legend_handles(STYLE_NON)
        ax.legend(h, l, loc="lower right", frameon=True, framealpha=.7)
        ax.set(xlabel="Training-Data Size (N)", ylabel="Macro Avg. F1-Score",
               title=f"Demonstration Study 1: Macro Avg. F1-Score vs. Size\n({dist_label} Distribution, non-LLM)",
               xticks=xticks, ylim=(0, 0.7))
        ax.grid(True); plt.tight_layout()
        fig.savefig(os.path.join(VISUAL_DIR, f"figure_C1_nonLLM_f1-size-{dist}_updates.png"))
        plt.show()

    # LLM slice (base point + dashed fine-tune curve)
    df_llm_ft = df_dist[
        (df_dist["category"] == "LLM") &
        (df_dist["fine_tuned"] != 0)
    ].copy()
    fig, ax = plt.subplots(figsize=(8,5))
    x0, xmax = 0, xticks[-1]

    ## Plot base LLMs as horizontal lines from x=0 (default) to max
    for m, row in df_base_llm.groupby("model"):
        props = STYLE_LLM[m]
        # One base score per model is expected; the first row is used
        f1    = row["f1_macro"].iloc[0]
        ax.scatter(x0, f1, color=props["color"], marker=props["marker"], s=36, zorder=3)
        ax.hlines(f1, x0, xmax, color=props["color"], linewidth=1, zorder=2)

    ## Dashed curves for fine-tuned LLMs
    for m, grp in df_llm_ft.groupby("model"):
        props = STYLE_LLM[m]
        grp   = grp.sort_values("size")
        ax.plot(grp["size"], grp["f1_macro"],
                color=props["color"], marker=props["marker"],
                linestyle="--", linewidth=1, markersize=6)
    h, l = build_legend_handles(STYLE_LLM, dashed_note=True)
    # x=0 is the extra tick for the base (not fine-tuned) models
    xtick_full = [0] + list(xticks)
    ax.legend(h, l, loc="lower right", frameon=True, framealpha=.7)
    ax.set(xlabel="Training-Data Size (N)", ylabel="Macro Avg. F1-Score",
           title=f"Demonstration Study 1: Macro Avg. F1-Score vs. Size\n({dist_label} Distribution, LLM)",
           xticks=xtick_full,
           xticklabels=['0\n(default)'] + list(map(str, xticks)),
           ylim=(0.2, 0.8))
    ax.grid(True); plt.tight_layout()
    fig.savefig(os.path.join(VISUAL_DIR,f"figure_7_LLM_f1-size-{dist}_updates.png"))
    plt.show()

#### Plot and export all figures

In [None]:
# Temperature plots (LLM only): styled LLMs with the default prompt, trained
# on the representative data of size 2000 or not trained at all (missing values)
wanted_llm = list(STYLE_LLM.keys())
_is_wanted_llm   = classification_df["model"].isin(wanted_llm)
_real_or_missing = (classification_df["distribution"] == "real") | classification_df["distribution"].isna()
_2000_or_missing = (classification_df["size"] == 2000) | classification_df["size"].isna()
_default_prompt  = classification_df["prompt_type"] == "default"
df_plot = classification_df[
    _is_wanted_llm & _real_or_missing & _2000_or_missing & _default_prompt
].copy()

# Data size plots: all non-LLM rows combined with the styled LLMs
_is_llm     = classification_df["category"] == "LLM"
df_mlcnnplm = classification_df[~_is_llm]
df_llm_sel  = classification_df[_is_llm & _is_wanted_llm]
df_combined = pd.concat([df_mlcnnplm, df_llm_sel], ignore_index=True)

# Keep default runs only: default prompt, temperature 0 and tfidf vectorizer
# (rows where a setting does not apply carry a missing value and are kept)
_prompt_ok     = (df_combined["prompt_type"] == "default") | df_combined["prompt_type"].isna()
_temp_ok       = (df_combined["temperature"] == 0.0) | df_combined["temperature"].isna()
_vectorizer_ok = (df_combined["vectorizer"] == "tfidf") | df_combined["vectorizer"].isna()
df_cond = df_combined[_prompt_ok & _temp_ok & _vectorizer_ok].copy()

In [None]:
# Temperature curves: Figure 6 (F1 and consistency vs. temperature)
# NOTE(review): these file names end in "_reviews" while every other output of
# this notebook ends in "_updates" — confirm the suffix is intended.
_figure6_panels = [
    dict(
        metric="f1_macro",
        ylabel="Macro Avg. F1-Score",
        title="Demonstration Study 1: Macro Avg. F1-Score and Temperature",
        filename="figure_6_LLM_f1-temperature_reviews.png",
        ylim=(0.25, .80),
    ),
    dict(
        metric="consistency_all_runs",
        ylabel="Consistency Rate",
        title="Demonstration Study 1: Consistency Rate and Temperature",
        filename="figure_6_LLM_consistency-temperature_reviews.png",
        ylim=(0.80, 1.00),
    ),
]
for _panel in _figure6_panels:
    plot_llm_temp(df_plot, **_panel)

# F1 vs size: Figures 7 and C1 (LLM and non-LLM)
label_map = {"real": "Representative", "equal": "Balanced"}
xticks = [100, 250, 500, 1000, 2000]

# The base (not fine-tuned) LLM rows do not depend on the distribution,
# so they are selected once for both figures
df_base_llm = df_cond[
    (df_cond["category"] == "LLM") &
    (df_cond["fine_tuned"] == 0) &
    (df_cond["model"].isin(STYLE_LLM.keys()))
].copy()

for dist in ["real", "equal"]:
    dist_lbl = label_map[dist]
    # Rows trained on this distribution with a known training-data size
    _trained_on_dist = (df_cond["distribution"] == dist) & df_cond["size"].notna()
    df_dist = df_cond[_trained_on_dist].copy()
    plot_f1_vs_size(df_dist, df_base_llm, xticks, dist_lbl)