## Setup
In this notebook, we generate all tables and figures based on the results of demonstration study 2, for both the main paper and the online appendix. This includes data filtering, statistical summaries, and all visualization scripts. All output files are saved to the `paper_visuals` and `output_data` directories.

#### Imports
See `requirements.txt` for the full list of pinned dependency versions.

In [None]:
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.metrics import f1_score

#### Global Variables, Paths, and Settings

In [None]:
# Root of the demo study: the parent of the current working directory
DEMO_PATH = os.path.abspath("..")

# Data and figure locations, all resolved relative to the demo study root
OUT_DIR = os.path.join(DEMO_PATH, "output_data")
VISUAL_DIR = os.path.join(DEMO_PATH, "paper_visuals")
LLM_PATH = os.path.join(OUT_DIR, "validation_with_model_preds_LLM_cleaned.csv")
MODELS_PATH = os.path.join(OUT_DIR, "validation_ovr_with_model_preds_NLP.csv")

# Length of every multi-hot label vector
NUM_LABELS = 9

## Pre-Processing
In this section, we define regular expressions and helper functions for parsing model output columns. This includes extracting metadata (e.g., data type, size, classifier/model, and vectorizer) for classical ML, CNN, and PLM models, as well as standardizing column names and processing LLM results. These routines are used to prepare the results tables and figures for the paper and online appendix.

In [None]:
# Vocabulary of tokens that may appear in model output column names
DATA_TYPES = ['real', 'equal']
DATA_SIZES = ['2000', '1000', '500', '250', '100']
CLASSIFIERS = [
    'RandomForest', 'KNeighbors', 'LogisticRegression',
    'NaiveBayes', 'SVM', 'XGBoost',
]
CNN_MODELS = ['TextCNN42B']
PLM_MODELS = ['roberta', 'bert', 'electra', 'xlnet']
GLOVE_VECTORIZERS = ["6B-100d", "6B-300d", "42B-300d", "840B-300d"]


def _alternation(tokens):
    """Join tokens into one capturing regex group of the form (a|b|c)."""
    return "(" + "|".join(tokens) + ")"


# Groups shared by several patterns
_DIST = _alternation(DATA_TYPES)
_SIZE = _alternation(DATA_SIZES)
_CLF = _alternation(CLASSIFIERS)

# Case-insensitive patterns, one per column naming scheme.
# ML:    <vectorizer>__<distribution>__<size>__<classifier>_pred
ML_PATTERN = re.compile(
    f"^(.+?)__{_DIST}__{_SIZE}__{_CLF}_pred$", re.IGNORECASE)
# CNN:   <model>__<distribution>__<size>_pred
CNN_PATTERN = re.compile(
    f"^{_alternation(CNN_MODELS)}__{_DIST}__{_SIZE}_pred$", re.IGNORECASE)
# PLM:   <model>_<distribution>_<size>_pred  (single underscores)
PLM_PATTERN = re.compile(
    f"^{_alternation(PLM_MODELS)}_{_DIST}_{_SIZE}_pred$", re.IGNORECASE)
# GloVe: <vectorizer>_<distribution>_<size>__<classifier>_pred
GLOVE_PATTERN = re.compile(
    f"^{_alternation(GLOVE_VECTORIZERS)}_{_DIST}_{_SIZE}__{_CLF}_pred$",
    re.IGNORECASE)

# Known LLM model keys, used to identify the model inside a column prefix.
# The list is sorted longest-first so that a substring lookup hits the most
# specific key first (e.g. "gpt_4_1_mini" before "gpt_4_1").
MODEL_KEYS = sorted([
    # OpenAI
    "gpt_4_1_nano", "gpt_4_1_mini", "gpt_4_1",
    # Mistral
    "mistral_large", "mistral_medium", "mistral_small",
    "open_mistral_nemo", "ministral_8b", "ministral_3b",
    # Anthropic
    "4_sonnet", "sonnet_4",
    "3_5_haiku", "3_haiku",
    "3_7_sonnet", "3_5_sonnet",
    "3_opus", "opus_4",
    # OpenAI (legacy keys)
    "gpt_4o_mini", "gpt_4o",
    "gpt_3_5_turbo_0125",
    "o4_mini_2025_04_16", "o3_2025_04_16",
], key=len, reverse=True)


def _provider_of(model_key):
    """Return the provider name for a model key, or None if it is not recognised."""
    # Mistral is checked first: "open_mistral_nemo" starts with "o" but is not
    # an OpenAI model.
    if "mistral" in model_key or "ministral" in model_key:
        return "Mistral"
    # Claude keys carry the family name at either end ("4_sonnet" and
    # "sonnet_4", "3_opus" and "opus_4"), so test the tokens, not the suffix.
    if {"haiku", "sonnet", "opus"} & set(model_key.split("_")):
        return "Anthropic"
    # OpenAI: GPT models and the o-series ("o3_...", "o4_mini_...").
    # Requiring a digit after "o" keeps "opus_4" out of this branch.
    if re.match(r"(gpt|o\d+)(_|$)", model_key):
        return "OpenAI"
    return None


# Mapping model keys to their provider (OpenAI, Mistral, Anthropic).
# Keys without a recognised provider are left out, so PROVIDER.get() yields None.
PROVIDER = {
    key: provider
    for key, provider in ((k, _provider_of(k)) for k in MODEL_KEYS)
    if provider is not None
}

#### Helper functions for column and prediction parsing

In [None]:
# Model metadata parsing helpers
def standardize_llm_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df` whose LLM column names are normalised.

    Two rewrites are applied to every column name: a trailing '__review'
    suffix is dropped, and a single underscore in front of a temperature
    token is doubled (e.g., '_t0' → '__t0'). The input frame is not modified.
    """
    renamed = []
    for name in df.columns:
        name = re.sub(r'__review$', '', name)
        # The lookbehind leaves an already doubled underscore untouched
        name = re.sub(r'(?<!_)_(t\d+)', r'__\1', name)
        renamed.append(name)

    out = df.copy()
    out.columns = renamed
    return out

def parse_ml_info(col: str) -> dict|None:
    """
    Extract model metadata from an ML/CNN/PLM prediction column name.

    The name is tried against the GloVe, generic ML, CNN and PLM patterns,
    in that order. The first match yields a metadata dictionary; if no
    pattern matches, None is returned.
    """
    # Classifier columns that name a vectorizer (GloVe naming first, then generic)
    for pattern in (GLOVE_PATTERN, ML_PATTERN):
        hit = pattern.match(col)
        if hit:
            vectorizer, distribution, size, classifier = hit.groups()
            return {
                "model": classifier,
                "category": "ML",
                "vectorizer": vectorizer,
                "distribution": distribution,
                "size": int(size),
            }

    # Neural models: the column name carries no vectorizer
    for category, pattern in (("CNN", CNN_PATTERN), ("PLM", PLM_PATTERN)):
        hit = pattern.match(col)
        if hit:
            name, distribution, size = hit.groups()
            return {
                "model": name,
                "category": category,
                "distribution": distribution,
                "size": int(size),
            }

    return None

def parse_llm_info(prefix: str) -> dict|None:
    """
    Extract LLM metadata from a column prefix of the form
    '<core>__<prompt_type>__<temperature token>'.

    Returns a dictionary with model, provider, prompt type, temperature,
    fine-tuning flag, distribution and size, or None when the prefix does not
    consist of exactly three '__'-separated parts.
    """
    pieces = prefix.split("__")
    if len(pieces) != 3:
        return None
    core, prompt_type, temp_token = pieces

    # A leading 'ft_' marks a fine-tuned model; strip it before the lookup
    is_fine_tuned = core.startswith("ft_")
    if is_fine_tuned:
        core = core[3:]

    # First known key contained in the core name (MODEL_KEYS order decides)
    model = None
    for key in MODEL_KEYS:
        if key in core:
            model = key
            break
    provider = PROVIDER.get(model)

    # Optional '_<distribution>_<size>' part inside the core name
    size_alternatives = '|'.join(DATA_SIZES)
    ds_match = re.search(rf"_(real|equal)_({size_alternatives})", core)
    if ds_match:
        distribution = ds_match.group(1)
        size = int(ds_match.group(2))
    else:
        distribution = None
        size = None

    # Temperature token 't<digits>' is given in tenths (e.g., t0 → 0.0)
    digits = temp_token[1:]
    if temp_token.startswith("t") and digits.isdigit():
        temperature = int(digits) / 10.0
    else:
        temperature = None

    return dict(
        vectorizer=None,
        distribution=distribution,
        size=size,
        model=model or core,   # unknown models keep their raw core name
        category="LLM",
        provider=provider,
        prompt_type=prompt_type,
        temperature=temperature,
        fine_tuned=int(is_fine_tuned),
    )

# Multilabel prediction parsing helpers
def parse_multi_hot(val, num_labels=NUM_LABELS):
    """
    Turn a stored label vector into a list of ints.

    Strings contribute one entry per digit character, lists and arrays are
    converted element by element, and any other value yields an all-zero
    vector of length `num_labels`.
    """
    if isinstance(val, str):
        return list(map(int, re.findall(r'\d', val)))
    if isinstance(val, (list, np.ndarray)):
        return [int(item) for item in val]
    return [0] * num_labels

def parse_model_pred(val, num_labels=NUM_LABELS):
    """
    Convert a ';'-separated list of label indices into a multi-hot vector.

    Empty input, 'nan' and '9' all yield the all-zero vector. Tokens that are
    not digits, or that fall outside 0..num_labels-1, are ignored.
    """
    text = str(val).strip()
    vec = [0] * num_labels
    if text in ("", "9", "nan"):
        return vec

    for token in text.replace(" ", "").split(";"):
        if not token.isdigit():
            continue
        idx = int(token)
        if 0 <= idx < num_labels:
            vec[idx] = 1
    return vec

# Metric computation helpers
def multilabel_metrics(y_true, y_pred):
    """
    Compute the multilabel metrics reported in the result tables.

    Returns a dictionary with one F1 score per label ('f1_<i>'), the macro,
    micro and weighted F1 averages, the subset accuracy (share of rows where
    every label matches) and the hamming loss (share of mismatching entries).
    """
    per_label = f1_score(y_true, y_pred, average=None, zero_division=0)

    metrics = {}
    for idx, score in enumerate(per_label):
        metrics[f"f1_{idx}"] = score
    for avg in ("macro", "micro", "weighted"):
        metrics[f"f1_{avg}"] = f1_score(y_true, y_pred, average=avg,
                                        zero_division=0)

    exact_rows = (y_true == y_pred).all(axis=1)
    metrics["subset_acc"] = np.mean(exact_rows)
    metrics["hamming_loss"] = np.mean(y_true != y_pred)
    return metrics

def consistency_across_runs(df, cols):
    """
    Return the fraction of samples on which all runs in `cols` give the
    same prediction vector.
    Used to assess LLM stability across repeated calls.
    """
    per_run = [np.vstack(df[c].apply(parse_model_pred).values) for c in cols]
    stacked = np.stack(per_run, axis=1)      # (samples, runs, labels)
    reference = stacked[:, :1, :]            # first run, kept broadcastable
    agrees = (stacked == reference).all(axis=(1, 2))
    return np.mean(agrees)

#### Main function for compiling results DataFrame for all models

In [None]:
def build_multilabel_results(df_ml, df_llm) -> pd.DataFrame:
    """
    Assemble one row of metrics and metadata per predictor (ML, CNN, PLM, LLM).

    ML/CNN/PLM columns of `df_ml` are scored against `df_ml['multi_hot']`.
    LLM columns of `df_llm` are grouped by prefix (the name without its
    '__run<n>' / '__r<n>' suffix); the alphabetically first run is scored
    against `df_llm['multi_hot']`, and consistency is computed across all runs
    of a prefix when there is more than one.
    """
    non_prediction_cols = {
        'review_id', 'user_id', 'title', 'body', 'label', 'id', 'app_id',
        'review_text_plain', 'review_text_tagged',
        'split_labels', 'sorted_labels', 'multi_hot',
    }

    def _stack(series, parser):
        # One parsed vector per row, stacked into a 2-D array
        return np.vstack(series.apply(parser).values)

    # Ground-truth label arrays
    truth_ml = _stack(df_ml['multi_hot'], parse_multi_hot)
    truth_llm = _stack(df_llm['multi_hot'], parse_multi_hot)

    rows = []

    # ML / CNN / PLM models: one row per recognised prediction column
    for col in df_ml.columns:
        if col in non_prediction_cols:
            continue
        meta = parse_ml_info(col)
        if not meta:
            continue
        preds = _stack(df_ml[col], parse_multi_hot)
        row = dict(meta)
        row["raw_name"] = col
        row.update(multilabel_metrics(truth_ml, preds))
        row["consistency"] = np.nan
        rows.append(row)

    # LLMs: collect the run columns that belong to the same prefix
    run_suffix = re.compile(r"(.+?)(__run\d+|__r\d+)$")
    runs_by_prefix = {}
    for col in df_llm.columns:
        if col in non_prediction_cols:
            continue
        hit = run_suffix.match(col)
        prefix = hit.group(1) if hit else col
        runs_by_prefix.setdefault(prefix, []).append(col)

    for prefix, run_cols in runs_by_prefix.items():
        meta = parse_llm_info(prefix)
        if not meta:
            continue
        first_run = min(run_cols)
        preds = _stack(df_llm[first_run], parse_model_pred)
        row = dict(meta)
        row["raw_name"] = first_run
        row.update(multilabel_metrics(truth_llm, preds))
        if len(run_cols) > 1:
            row["consistency"] = consistency_across_runs(df_llm, run_cols)
        else:
            row["consistency"] = np.nan
        rows.append(row)

    return pd.DataFrame.from_records(rows)

#### Load Model Predictions and Compile Results Table
We load the output prediction files for classical models (`MODELS_PATH`) and LLMs (`LLM_PATH`). We standardize column names, parse predictions, and compile a tidy results table containing all relevant evaluation metrics and metadata for each predictor. Processed results are saved in the `output_data` directory for downstream table and figure generation.

In [None]:
# Load prediction files: one CSV for the ML/CNN/PLM models, one for the LLMs
df_models_raw = pd.read_csv(MODELS_PATH, low_memory=False)
df_llm_raw    = pd.read_csv(LLM_PATH, low_memory=False)

# Standardize LLM column names (drops '__review', doubles '_t<n>' underscores);
# the ML/CNN/PLM frame is copied unchanged so the raw frames stay untouched
df_llm    = standardize_llm_columns(df_llm_raw)
df_models = df_models_raw.copy()

# Compile results for all models and LLMs: one row per predictor with
# its metadata, per-label and averaged F1 scores, and consistency
results_df = build_multilabel_results(df_models, df_llm)
# Persist the table; the Visuals section reads this file back in
results_df.to_csv(os.path.join(OUT_DIR, "performance_reviews_label-specific.csv"), index=False)

## Visuals
In this section, we generate all tables (Tables 4, 5, and 6) and figures (Figures 6, 7, and C1) for the paper and online appendix, based on the processed results.


In [None]:
# Load the processed results table from the previous steps
df = pd.read_csv(os.path.join(OUT_DIR, "performance_reviews_label-specific.csv"))

# Remove one bad row: a fine-tuned Mistral Small run whose F1 scores are all zero.
# Only the F1 columns are inspected. Metadata columns such as `size` (250) and
# `fine_tuned` (1) are numeric and non-zero for this row, so testing every
# numeric column for zero can never match it.
_is_bad_run = (
    df["raw_name"]
    == "ft_mistral_small_latest_d1ef7e20_20250529_bba6ea02_real_250__default__t0__run1"
)
_all_f1_zero = df.filter(regex=r"^f1_").eq(0).all(axis=1)
df = df[~(_is_bad_run & _all_f1_zero)]

### Tables

In [None]:
# Model families used to select rows for the tables.
# NOTE: CNN_MODELS and PLM_MODELS are re-assigned here with the values used
# for table filtering.
CLASSICAL_MODELS = [
    "KNeighbors", "LogisticRegression", "NaiveBayes",
    "RandomForest", "SVM", "XGBoost",
]
CNN_MODELS = ["TextCNN42B"]
PLM_MODELS = ["bert", "electra", "roberta", "xlnet"]
LLM_BASE_MODELS = [
    "3_5_haiku", "sonnet_4", "4_sonnet",
    "mistral_small", "mistral_large",
    "gpt_4_1_nano", "gpt_4_1_mini", "gpt_4_1",
]
GPT4_FAMILY = ["gpt_4_1", "gpt_4_1_mini", "gpt_4_1_nano"]

ALL_COMPARATORS = [
    *CLASSICAL_MODELS,
    *CNN_MODELS,
    *PLM_MODELS,
    *LLM_BASE_MODELS,
]

# One entry per output table: "models" restricts the model column (None keeps
# every model) and "filters" maps a column to a required value. A tuple lists
# the accepted values; make_table also keeps rows where that column is missing.
TABLE_SPECS = {
    # All key comparators, representative setting
    "table4": {
        "models": ALL_COMPARATORS,
        "filters": {
            "temperature": (0.0, np.nan),
            "distribution": ("real", np.nan),
            "size": (2000, np.nan),
            "prompt_type": ("default", np.nan),
            "vectorizer": ("tfidf", np.nan),
        },
    },
    # All LLMs, representative setting
    "table5": {
        "models": None,
        "filters": {
            "category": "LLM",
            "temperature": 0.0,
            "distribution": ("real", np.nan),
            "size": (2000, np.nan),
            "prompt_type": "default",
        },
    },
    # Only GPT-4 family models, representative setting
    "table6": {
        "models": GPT4_FAMILY,
        "filters": {
            "temperature": 0.0,
            "distribution": ("real", np.nan),
            "size": (2000, np.nan),
        },
    },
}

#### Function to construct tables based on filters and models

In [None]:
def make_table(df, models=None, **filters):
    """
    Return the rows and columns of `df` that make up one result table.

    - models: list or None. If a list, only rows whose `model` is in it are kept.
    - filters: column=value pairs. A scalar requires equality; a list, tuple
      or set keeps rows whose value is in the collection or is missing (NaN).

    The result is a copy restricted to the metadata, per-label F1, macro and
    weighted F1 and consistency columns that exist in `df`.
    """
    keep_rows = pd.Series(True, index=df.index)
    if models is not None:
        keep_rows = keep_rows & df["model"].isin(models)

    for column, wanted in filters.items():
        values = df[column]
        if isinstance(wanted, (list, tuple, set)):
            condition = values.isin(wanted) | values.isna()
        else:
            condition = values == wanted
        keep_rows = keep_rows & condition

    candidate_cols = ["model", "category", "provider", "fine_tuned",
                      "temperature", "distribution", "size", "prompt_type",
                      "vectorizer"]
    candidate_cols += [f"f1_{i}" for i in range(NUM_LABELS)]
    candidate_cols += ["f1_macro", "f1_weighted", "consistency"]
    present = [c for c in candidate_cols if c in df.columns]
    return df.loc[keep_rows, present].copy()

#### Build and export all tables

In [None]:
# Make sure the export directory exists: writing a file into a missing
# directory fails, and nothing earlier in the notebook creates it
os.makedirs(VISUAL_DIR, exist_ok=True)

# Build every table described in TABLE_SPECS and export it as
# '<table name>_reviews.xlsx'; the frames are kept in `tables` for inspection
tables = {}
for name, spec in TABLE_SPECS.items():
    tables[name] = make_table(
        df,
        models  = spec["models"],
        **spec["filters"]
    )
    out_path = os.path.join(VISUAL_DIR, f"{name}_reviews.xlsx")
    tables[name].to_excel(out_path, index=False)

### Figures

In [None]:
# Global matplotlib style for all figures, set one rc group at a time
# Typography
plt.rc("font", family="Times New Roman", size=11)
plt.rc("axes", titlesize=14, labelsize=12)
plt.rc("xtick", labelsize=10)
plt.rc("ytick", labelsize=10)
plt.rc("legend", fontsize=8)
# Grid
plt.rc("grid", linestyle="--", linewidth=0.5)
# Figure saving defaults
plt.rc("savefig", dpi=300, bbox="tight")

def _style_table(rows):
    """Expand (key, color, marker, label) rows into a nested style mapping."""
    return {
        key: {"color": color, "marker": marker, "label": label}
        for key, color, marker, label in rows
    }


# Color, marker and legend label per LLM
STYLE_LLM = _style_table([
    ("gpt_4_1",       "#08306B", "o", "GPT-4.1"),
    ("gpt_4_1_mini",  "#2171B5", "s", "GPT-4.1 Mini"),
    ("gpt_4_1_nano",  "#6BAED6", "^", "GPT-4.1 Nano"),
    ("mistral_large", "#006D2C", "D", "Mistral Large"),
    ("mistral_small", "#74C476", "p", "Mistral Small"),
    ("3_5_haiku",     "#A50F15", "X", "Claude Haiku 3.5"),
    ("sonnet_4",      "#FB6A4A", "*", "Claude Sonnet 4"),
])

# Color, marker and legend label per non-LLM model (ML, CNN, PLM)
STYLE_NON = _style_table([
    ("RandomForest",       "#8B0000", "o", "Random Forest"),
    ("KNeighbors",         "#B22222", "s", "K-Nearest Neighbor"),
    ("LogisticRegression", "#DC143C", "^", "Logistic Regression"),
    ("NaiveBayes",         "#FF6347", "D", "Naïve Bayes"),
    ("SVM",                "#FA8072", "v", "SVM"),
    ("XGBoost",            "#FF4500", "p", "XGBoost"),
    ("TextCNN42B",         "#FFD700", "h", "CNN"),
    ("bert",               "#2F4F4F", "X", "BERT"),
    ("electra",            "#696969", "*", "ELECTRA"),
    ("roberta",            "#A9A9A9", "+", "RoBERTa"),
    ("xlnet",              "#D3D3D3", "D", "XLNet"),
])

def build_legend_handles(style_dict, dashed_note=False):
    """
    Create one legend entry (solid line with marker) per model in `style_dict`.

    With `dashed_note=True`, a black dashed entry labelled
    "Fine-Tuned Versions" is appended. Returns the pair (handles, labels).
    """
    handles, labels = [], []
    for props in style_dict.values():
        handles.append(
            Line2D([0], [0], color=props["color"], marker=props["marker"],
                   linestyle="-", linewidth=1, markersize=6)
        )
        labels.append(props["label"])

    if dashed_note:
        handles.append(Line2D([0], [0], color="black", linestyle="--",
                              linewidth=1))
        labels.append("Fine-Tuned Versions")
    return handles, labels

#### Functions to plot temperature, data size, and data distribution variations

In [None]:
# Plot performance curves over temperature for LLMs
def plot_llm_temp(df, metric, ylabel, title, filename, ylim):
    """
    Draw `metric` against temperature, one curve per (model, fine_tuned) pair.
    Base models are drawn solid, fine-tuned versions dashed. The figure is
    saved as `filename` in VISUAL_DIR and then shown.
    Used for Figure 6 (main paper).
    """
    fig, ax = plt.subplots(figsize=(8,5))

    for key, curve in df.groupby(["model", "fine_tuned"]):
        model_name, is_fine_tuned = key
        style = STYLE_LLM[model_name]
        curve = curve.sort_values("temperature")
        ax.plot(
            curve["temperature"], curve[metric],
            color=style["color"], marker=style["marker"],
            linestyle="--" if is_fine_tuned else "-",
            linewidth=1, markersize=6,
        )

    legend_handles, legend_labels = build_legend_handles(STYLE_LLM,
                                                         dashed_note=True)
    ax.legend(legend_handles, legend_labels, loc="lower left",
              frameon=True, framealpha=.7, edgecolor="black")
    ax.set(xlabel="Temperature", ylabel=ylabel, title=title,
           xticks=[0, 0.5, 1, 1.5], ylim=ylim)
    ax.grid(True)

    target = os.path.join(VISUAL_DIR, filename)
    fig.savefig(target)
    plt.show()

# Plot F1 vs. data size for both non-LLM and LLM models
def plot_f1_vs_size(df_dist, df_base_llm, xticks, dist_label, dist=None):
    """
    Plot macro F1-score as a function of training data size for
    (a) ML/CNN/PLM models and (b) LLMs, for a given distribution.
    Used for Figures 7 (main paper) and C1 (Online Appendix).

    - df_dist: result rows of one distribution (needs `category`, `vectorizer`,
      `fine_tuned`, `model`, `size` and `f1_macro`).
    - df_base_llm: rows of the base (not fine-tuned) LLMs; the first `f1_macro`
      per model is drawn as a horizontal line.
    - xticks: training sizes shown on the x-axis; the last one is the right
      end of the base-model lines.
    - dist_label: distribution name shown in the titles.
    - dist: distribution key used in the output file names. If omitted, it is
      taken from the `distribution` column of `df_dist` when that column holds
      exactly one distinct value; otherwise a lower-cased slug of `dist_label`
      is used.
    """
    # Resolve the file-name key here instead of relying on a module-level
    # variable that happens to be called `dist` in the calling cell
    if dist is None:
        found = (df_dist["distribution"].dropna().unique()
                 if "distribution" in df_dist.columns else [])
        if len(found) == 1:
            dist = str(found[0])
        else:
            dist = re.sub(r"\W+", "-", str(dist_label).strip().lower())

    # non-LLM slice (tfidf only; models without a vectorizer are kept)
    df_nonllm = df_dist[
        (df_dist["category"] != "LLM") &
        ((df_dist["vectorizer"] == "tfidf") | df_dist["vectorizer"].isna())
    ].copy()
    if not df_nonllm.empty:
        fig, ax = plt.subplots(figsize=(8,5))
        for m, grp in df_nonllm.groupby("model"):
            props = STYLE_NON[m]
            grp   = grp.sort_values("size")
            ax.plot(grp["size"], grp["f1_macro"],
                    color=props["color"], marker=props["marker"],
                    linestyle="-", linewidth=1)
        h, l = build_legend_handles(STYLE_NON)
        ax.legend(h, l, loc="lower right", frameon=True, framealpha=.7)
        ax.set(xlabel="Training-Data Size (N)", ylabel="Macro Avg. F1-Score",
               title=f"Demonstration Study 2: Macro Avg. F1-Score vs. Size\n({dist_label} Distribution, non-LLM)",
               xticks=xticks, ylim=(0, 0.5))
        ax.grid(True)
        plt.tight_layout()
        fig.savefig(os.path.join(VISUAL_DIR, f"figure_C1_nonLLM_f1-size-{dist}_reviews.png"))
        plt.show()

    # LLM slice (base point + dashed fine-tune curve)
    df_llm_ft = df_dist[
        (df_dist["category"] == "LLM") &
        (df_dist["fine_tuned"] != 0)
    ].copy()
    fig, ax = plt.subplots(figsize=(8,5))
    x0, xmax = 0, xticks[-1]

    ## Plot base LLMs as horizontal lines from x=0 (default) to max
    for m, row in df_base_llm.groupby("model"):
        props = STYLE_LLM[m]
        f1    = row["f1_macro"].iloc[0]
        ax.scatter(x0, f1, color=props["color"], marker=props["marker"], s=36, zorder=3)
        ax.hlines(f1, x0, xmax, color=props["color"], linewidth=1, zorder=2)

    ## Dashed curves for fine-tuned LLMs
    for m, grp in df_llm_ft.groupby("model"):
        props = STYLE_LLM[m]
        grp   = grp.sort_values("size")
        ax.plot(grp["size"], grp["f1_macro"],
                color=props["color"], marker=props["marker"],
                linestyle="--", linewidth=1, markersize=6)
    h, l = build_legend_handles(STYLE_LLM, dashed_note=True)
    # Extra tick at 0 marks the base models, which have no training size
    xtick_full = [0, *xticks]
    ax.legend(h, l, loc="lower right", frameon=True, framealpha=.7)
    ax.set(xlabel="Training-Data Size (N)", ylabel="Macro Avg. F1-Score",
           title=f"Demonstration Study 2: Macro Avg. F1-Score vs. Size\n({dist_label} Distribution, LLM)",
           xticks=xtick_full,
           xticklabels=['0\n(default)'] + list(map(str, xticks)),
           ylim=(0.2, 0.8))
    ax.grid(True)
    plt.tight_layout()
    fig.savefig(os.path.join(VISUAL_DIR,f"figure_7_LLM_f1-size-{dist}_reviews.png"))
    plt.show()

#### Plot and export all figures

In [None]:
def _is_or_missing(column, value):
    """Boolean mask: entries equal to `value` or missing (NaN)."""
    return (column == value) | column.isna()


# Temperature plots (LLM only): styled models, real distribution and size
# 2000 (or neither given), default prompt
wanted_llm = list(STYLE_LLM)
df_plot = df[
    df["model"].isin(wanted_llm)
    & _is_or_missing(df["distribution"], "real")
    & _is_or_missing(df["size"], 2000)
    & (df["prompt_type"] == "default")
].copy()

# Data size plots: all ML/CNN/PLM rows plus the styled LLMs
df_mlcnnplm = df[df["category"] != "LLM"]
df_llm_sel = df[(df["category"] == "LLM") & df["model"].isin(wanted_llm)]
df_combined = pd.concat([df_mlcnnplm, df_llm_sel], ignore_index=True)

# Keep default runs only: default prompt, temperature 0 and tfidf vectorizer,
# where a missing value in any of these columns also passes
df_cond = df_combined[
    _is_or_missing(df_combined["prompt_type"], "default")
    & _is_or_missing(df_combined["temperature"], 0.0)
    & _is_or_missing(df_combined["vectorizer"], "tfidf")
].copy()

In [None]:
# Figure 6: macro F1 and consistency rate as functions of temperature
plot_llm_temp(
    df_plot,
    metric="f1_macro",
    ylabel="Macro Avg. F1-Score",
    title="Demonstration Study 2: Macro Avg. F1-Score and Temperature",
    filename="figure_6_LLM_f1-temperature_reviews.png",
    ylim=(0.25, 0.80),
)
plot_llm_temp(
    df_plot,
    metric="consistency",
    ylabel="Consistency Rate",
    title="Demonstration Study 2: Consistency Rate and Temperature",
    filename="figure_6_LLM_consistency-temperature_reviews.png",
    ylim=(0.60, 1.00),
)

# Figures 7 and C1: F1 vs. training size (LLM and non-LLM)
label_map = {"real": "Representative", "equal": "Balanced"}
xticks = [100, 250, 500, 1000, 2000]

# The base (not fine-tuned) LLM rows are the same for every distribution,
# so they are selected once up front
df_base_llm = df_cond[
    (df_cond["category"] == "LLM")
    & (df_cond["fine_tuned"] == 0)
    & df_cond["model"].isin(STYLE_LLM.keys())
].copy()

# The loop variable is deliberately named `dist`; do not rename it without
# checking plot_f1_vs_size, whose output file names are keyed on the distribution
for dist, dist_lbl in label_map.items():
    df_dist = df_cond[
        (df_cond["distribution"] == dist) & df_cond["size"].notna()
    ].copy()
    plot_f1_vs_size(df_dist, df_base_llm, xticks, dist_lbl)