# Final Visualizations — Publication Figures

Generates 7 publication-quality figures (300 DPI) from completed experiment results.
All data sourced from `results/` metrics CSVs and `FINAL_EXPERIMENT_SUMMARY.md`.

In [1]:
import os

import matplotlib
matplotlib.use("Agg")  # select the backend before pyplot is imported
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

# Output resolution and the directory that holds both the input metrics CSVs
# and the generated PNGs.
DPI = 300
RESULTS_DIR = "../results"

# Palette: one fixed color per modelling approach (approach -> hex color).
COLORS = dict([
    ("Classic ML", "#2ca02c"),  # green
    ("Zero-shot",  "#1f77b4"),  # blue
    ("Few-shot",   "#ff7f0e"),  # orange
    ("Fine-tuned", "#d62728"),  # red
])

# Global figure style: white backgrounds, faint grid, no top/right spines.
plt.rcParams["figure.facecolor"] = "white"
plt.rcParams["axes.facecolor"] = "white"
plt.rcParams["axes.grid"] = True
plt.rcParams["grid.alpha"] = 0.3
plt.rcParams["axes.spines.top"] = False
plt.rcParams["axes.spines.right"] = False
plt.rcParams["font.size"] = 10

print("Setup complete.")

Setup complete.


In [2]:
# Data definitions: all 16 parent-level experiments (from FINAL_EXPERIMENT_SUMMARY.md Table 1)
# Each experiment is written as one positional row; _FIELDS names the column
# for every position. params_b is None where no parameter count is given.
_FIELDS = ("name", "macro_f1", "micro_f1", "macro_auc", "approach", "cost", "params_b")
_ROWS = [
    ("XGBoost (tuned)",     0.6928, 0.7454, 0.9321, "Classic ML", 3.30,  None),
    ("XGBoost (baseline)",  0.6906, 0.7459, 0.9320, "Classic ML", 0.01,  None),
    ("DeepSeek V3.2+think", 0.6807, 0.7226, 0.8099, "Zero-shot",  6.73,  671),
    ("XGBoost (164K)",      0.6777, 0.7390, 0.9420, "Classic ML", 0.64,  None),
    ("Mistral Large 3 ZS",  0.6579, 0.7121, 0.7934, "Zero-shot",  0.01,  675),
    ("Mistral Large 3 FS",  0.6398, 0.6860, 0.7931, "Few-shot",   0.01,  675),
    ("DeepSeek V3.2 ZS",    0.6225, 0.6934, 0.7460, "Zero-shot",  1.39,  671),
    ("Ministral 8B FS",     0.5398, 0.5360, 0.7460, "Few-shot",   0.41,  8),
    ("Qwen3 FS tax+think",  0.5325, 0.5560, 0.7040, "Few-shot",   6.67,  8),
    ("Qwen3 FS taxonomy",   0.5255, 0.5436, 0.7089, "Few-shot",   0.45,  8),
    ("Qwen3 fine-tuned",    0.5098, 0.6318, 0.6999, "Fine-tuned", 10.83, 8),
    ("Qwen3 ZS taxonomy",   0.4990, 0.6050, 0.7270, "Zero-shot",  0.33,  8),
    ("Ministral 8B ZS",     0.4906, 0.5430, 0.7440, "Zero-shot",  0.25,  8),
    ("Ministral 8B FT",     0.4892, 0.5420, 0.7440, "Fine-tuned", 10.95, 8),
    ("Qwen3 ZS basic",      0.4590, 0.4730, 0.7270, "Zero-shot",  0.35,  8),
    ("Qwen3 FS basic",      0.4525, 0.4680, 0.7040, "Few-shot",   0.46,  8),
]

# Same list-of-dicts shape as before, so anything reading `experiments` still works.
experiments = [dict(zip(_FIELDS, record)) for record in _ROWS]

# Best Macro-F1 first, with a fresh 0..n-1 index.
_unsorted = pd.DataFrame(experiments)
df_exp = _unsorted.sort_values("macro_f1", ascending=False).reset_index(drop=True)
print(f"{len(df_exp)} experiments loaded.")
df_exp[["name", "macro_f1", "approach", "cost"]]

16 experiments loaded.


Unnamed: 0,name,macro_f1,approach,cost
0,XGBoost (tuned),0.6928,Classic ML,3.3
1,XGBoost (baseline),0.6906,Classic ML,0.01
2,DeepSeek V3.2+think,0.6807,Zero-shot,6.73
3,XGBoost (164K),0.6777,Classic ML,0.64
4,Mistral Large 3 ZS,0.6579,Zero-shot,0.01
5,Mistral Large 3 FS,0.6398,Few-shot,0.01
6,DeepSeek V3.2 ZS,0.6225,Zero-shot,1.39
7,Ministral 8B FS,0.5398,Few-shot,0.41
8,Qwen3 FS tax+think,0.5325,Few-shot,6.67
9,Qwen3 FS taxonomy,0.5255,Few-shot,0.45


In [3]:
# Figure 1: Grand Comparison Bar Chart
# One horizontal bar per experiment in df_exp (Macro-F1), colored by approach,
# plus a dashed reference line at the best Classic ML score.
fig1_path = os.path.join(RESULTS_DIR, "fig_grand_comparison.png")

if os.path.exists(fig1_path):
    # The PNG on disk acts as a cache: delete it to regenerate after df_exp changes.
    print(f"Figure 1 exists, skipping: {fig1_path}")
else:
    # Ascending order so the highest score ends up as the top bar.
    df_plot = df_exp.sort_values("macro_f1", ascending=True).reset_index(drop=True)

    fig, ax = plt.subplots(figsize=(10, 8))

    # Plain dict lookup so an approach missing from COLORS raises KeyError
    # instead of silently producing an uncolored bar.
    bar_colors = [COLORS[approach] for approach in df_plot["approach"]]
    ax.barh(range(len(df_plot)), df_plot["macro_f1"], color=bar_colors, edgecolor="white", linewidth=0.5)

    ax.set_yticks(range(len(df_plot)))
    ax.set_yticklabels(df_plot["name"], fontsize=10)
    ax.set_xlabel("Macro-F1", fontsize=11)
    # The count is taken from the data so the title cannot go stale.
    ax.set_title(f"Parent-Level Classification: All {len(df_plot)} Experiments", fontsize=13, fontweight="bold")

    # Value labels just past the end of each bar
    for i, val in enumerate(df_plot["macro_f1"]):
        ax.text(val + 0.005, i, f"{val:.3f}", va="center", fontsize=9)

    # Best Classic ML dashed line; it carries a label so the legend can explain it.
    best_ml = df_exp.loc[df_exp["approach"] == "Classic ML", "macro_f1"].max()
    best_ml_line = ax.axvline(best_ml, color=COLORS["Classic ML"], linestyle="--", linewidth=1.2, alpha=0.7,
                              label=f"Best Classic ML ({best_ml:.3f})")

    # Extra room on the right for the value labels.
    ax.set_xlim(0, df_exp["macro_f1"].max() + 0.06)

    # Legend: one patch per approach plus the reference line. Passing handles=
    # explicitly means only the artists listed here appear.
    legend_handles = [Patch(facecolor=c, label=a) for a, c in COLORS.items()]
    legend_handles.append(best_ml_line)
    ax.legend(handles=legend_handles, loc="lower right", fontsize=10, framealpha=0.9)

    plt.tight_layout()
    plt.savefig(fig1_path, dpi=DPI, bbox_inches="tight", facecolor="white")
    plt.close()
    print(f"Figure 1 saved: {fig1_path}")

Figure 1 exists, skipping: ../results\fig_grand_comparison.png


In [4]:
# Figure 2: Four-Way Approach Summary
# Grouped bars: for each approach, the Macro-F1 / Micro-F1 / Macro-AUC of its
# best experiment.
fig2_path = os.path.join(RESULTS_DIR, "fig_approach_summary.png")

if os.path.exists(fig2_path):
    # The PNG on disk acts as a cache: delete it to regenerate after df_exp changes.
    print(f"Figure 2 exists, skipping: {fig2_path}")
else:
    # Best result per approach = the df_exp row with the highest Macro-F1 for
    # that approach; the other two metrics are read from that same row.
    # Deriving this from df_exp (instead of retyping the numbers) keeps the
    # chart in sync with the experiment table.
    best_rows = df_exp.loc[df_exp.groupby("approach")["macro_f1"].idxmax()].set_index("approach")

    # Display label -> df_exp column
    metric_columns = {"Macro-F1": "macro_f1", "Micro-F1": "micro_f1", "Macro-AUC": "macro_auc"}

    # Approaches appear in COLORS order (Classic ML, Zero-shot, Few-shot, Fine-tuned).
    approach_best = {
        approach: {label: float(best_rows.loc[approach, column]) for label, column in metric_columns.items()}
        for approach in COLORS
        if approach in best_rows.index
    }

    approaches = list(approach_best.keys())
    metrics = list(metric_columns)
    metric_colors = ["#4c72b0", "#55a868", "#c44e52"]

    x = np.arange(len(approaches))
    width = 0.25

    fig, ax = plt.subplots(figsize=(10, 6))

    for i, (metric, color) in enumerate(zip(metrics, metric_colors)):
        vals = [approach_best[a][metric] for a in approaches]
        # (i - 1) * width centres the three bars of a group on the tick.
        bars = ax.bar(x + (i - 1) * width, vals, width, label=metric, color=color, edgecolor="white", linewidth=0.5)
        for bar, val in zip(bars, vals):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                    f"{val:.3f}", ha="center", va="bottom", fontsize=8)

    ax.set_xticks(x)
    ax.set_xticklabels(approaches, fontsize=11)
    ax.set_ylabel("Score", fontsize=11)
    ax.set_ylim(0, 1.05)
    ax.set_title("Best Result per Approach (Parent-Level, 13 Labels)", fontsize=13, fontweight="bold")
    ax.legend(fontsize=10, framealpha=0.9)

    plt.tight_layout()
    plt.savefig(fig2_path, dpi=DPI, bbox_inches="tight", facecolor="white")
    plt.close()
    print(f"Figure 2 saved: {fig2_path}")

Figure 2 exists, skipping: ../results\fig_approach_summary.png


In [5]:
# Figure 3: Per-Category F1 Heatmap (Top 5 experiments)
# Rows are categories, columns are experiments; each cell is that experiment's
# per-category F1 read from its metrics CSV.
# NOTE(review): the title says "Top 5", but the selection below is described as
# "key" experiments and appears to include models that are not in the top five
# of df_exp by Macro-F1 — confirm the wording.
fig3_path = os.path.join(RESULTS_DIR, "fig_category_heatmap.png")

if os.path.exists(fig3_path):
    # The PNG on disk acts as a cache: delete it to regenerate after the CSVs change.
    print(f"Figure 3 exists, skipping: {fig3_path}")
else:
    # Read F1 from 5 key experiment CSVs (column label -> file in RESULTS_DIR)
    heatmap_sources = {
        "XGBoost 32K":      "classic_ml_text_metrics.csv",
        "DeepSeek+Think":   "deepseek_v32_thinking_parent_metrics.csv",
        "Mistral Large ZS": "mistral_large_zs_metrics.csv",
        "DeepSeek ZS":      "deepseek_v32_parent_metrics.csv",
        "Qwen3 FS Tax":     "few_shot_taxonomy_metrics.csv",
    }

    f1_data = {}
    for label, fname in heatmap_sources.items():
        df_csv = pd.read_csv(os.path.join(RESULTS_DIR, fname))
        # Exclude MACRO/MICRO aggregate rows so only per-category rows remain
        df_cat = df_csv[~df_csv["Category"].isin(["MACRO", "MICRO"])]
        f1_data[label] = df_cat.set_index("Category")["F1"]

    # Series are aligned on Category; a category missing from a file becomes NaN.
    # Sort rows by XGBoost F1 descending
    df_heat = pd.DataFrame(f1_data).sort_values("XGBoost 32K", ascending=False)

    fig, ax = plt.subplots(figsize=(10, 8))

    im = ax.imshow(df_heat.values, cmap="YlOrRd", aspect="auto", vmin=0.3, vmax=0.85)
    # The notebook enables axes.grid globally; on an image the grid lines run
    # through the cell centres (where the ticks are) and across the annotations.
    ax.grid(False)

    ax.set_xticks(range(len(df_heat.columns)))
    ax.set_xticklabels(df_heat.columns, fontsize=10, rotation=20, ha="right")
    ax.set_yticks(range(len(df_heat.index)))
    ax.set_yticklabels(df_heat.index, fontsize=10)

    # Annotate cells
    n_rows, n_cols = df_heat.shape
    for i in range(n_rows):
        for j in range(n_cols):
            val = df_heat.iloc[i, j]
            if pd.isna(val):
                # No score for this category in this experiment's CSV.
                ax.text(j, i, "n/a", ha="center", va="center", fontsize=9, color="black")
                continue
            # White text on the darker cells, black on the lighter ones.
            text_color = "white" if val > 0.7 else "black"
            ax.text(j, i, f"{val:.3f}", ha="center", va="center", fontsize=9, color=text_color)

    ax.set_title("Per-Category F1 Scores Across Top 5 Experiments", fontsize=13, fontweight="bold")
    fig.colorbar(im, ax=ax, label="F1 Score", shrink=0.8)

    plt.tight_layout()
    plt.savefig(fig3_path, dpi=DPI, bbox_inches="tight", facecolor="white")
    plt.close()
    print(f"Figure 3 saved: {fig3_path}")

Figure 3 exists, skipping: ../results\fig_category_heatmap.png


In [6]:
# Figure 4: Cost vs. Performance Scatter
# One point per experiment in df_exp: x = cost (log scale), y = Macro-F1,
# color = approach, marker area grows with log2 of the parameter count.
fig4_path = os.path.join(RESULTS_DIR, "fig_cost_vs_performance.png")

if os.path.exists(fig4_path):
    # The PNG on disk acts as a cache: delete it to regenerate after df_exp changes.
    print(f"Figure 4 exists, skipping: {fig4_path}")
else:
    fig, ax = plt.subplots(figsize=(10, 7))

    for _, row in df_exp.iterrows():
        color = COLORS[row["approach"]]
        # Point size: log(params) for LLMs, fixed for Classic ML.
        # A missing parameter count reaches this loop as NaN (None is converted
        # when the numeric column is built), and `NaN is not None` is always
        # true, so the missing-value test must be pd.notna.
        if pd.notna(row["params_b"]) and row["params_b"] > 0:
            size = np.log2(row["params_b"]) * 20
        else:
            size = 60
        ax.scatter(row["cost"], row["macro_f1"], c=color, s=size, edgecolors="black",
                   linewidth=0.5, zorder=5, alpha=0.85)

    # Label each point. Offsets are in points, hand-tuned per experiment so
    # neighbouring labels do not overlap.
    offsets = {
        "XGBoost (tuned)":    (8, 6),
        "XGBoost (baseline)": (8, -10),
        "DeepSeek V3.2+think":(8, 6),
        "XGBoost (164K)":     (8, -10),
        "Mistral Large 3 ZS": (8, 6),
        "Mistral Large 3 FS": (8, -10),
        "DeepSeek V3.2 ZS":   (8, -12),
        "Ministral 8B FS":    (8, 6),
        "Qwen3 FS tax+think": (-5, -14),
        "Qwen3 FS taxonomy":  (8, 6),
        "Qwen3 fine-tuned":   (-5, 8),
        "Qwen3 ZS taxonomy":  (8, 6),
        "Ministral 8B ZS":    (8, -10),
        "Ministral 8B FT":    (8, -10),
        "Qwen3 ZS basic":     (8, 6),
        "Qwen3 FS basic":     (8, -10),
    }

    for _, row in df_exp.iterrows():
        # Experiments without a tuned offset fall back to (8, 4).
        ox, oy = offsets.get(row["name"], (8, 4))
        ax.annotate(row["name"], (row["cost"], row["macro_f1"]),
                    textcoords="offset points", xytext=(ox, oy),
                    fontsize=7.5, alpha=0.85)

    # XGBoost ceiling line (best Classic ML Macro-F1)
    best_ml = df_exp.loc[df_exp["approach"] == "Classic ML", "macro_f1"].max()
    ceiling_line = ax.axhline(best_ml, color="#2ca02c", linestyle="--", linewidth=1.2, alpha=0.7,
                              label=f"XGBoost ceiling ({best_ml:.3f})")

    ax.set_xscale("log")
    ax.set_xlabel("Cost ($, log scale)", fontsize=11)
    ax.set_ylabel("Macro-F1", fontsize=11)
    ax.set_title("Cost vs. Performance: All Parent-Level Experiments", fontsize=13, fontweight="bold")

    # Legend: with an explicit handles= list only the listed artists are shown,
    # so the ceiling line has to be added here for its label to appear.
    legend_handles = [Patch(facecolor=c, label=a) for a, c in COLORS.items()]
    legend_handles.append(ceiling_line)
    ax.legend(handles=legend_handles, loc="lower right", fontsize=9, framealpha=0.9)

    plt.tight_layout()
    plt.savefig(fig4_path, dpi=DPI, bbox_inches="tight", facecolor="white")
    plt.close()
    print(f"Figure 4 saved: {fig4_path}")

Figure 4 exists, skipping: ../results\fig_cost_vs_performance.png


In [7]:
# Figure 5: Dense vs MoE Architecture
# Two columns of points (x = 0 dense, x = 1 MoE), y = Macro-F1, with a dashed
# horizontal XGBoost reference line.
fig5_path = os.path.join(RESULTS_DIR, "fig_scale_effect.png")

if os.path.exists(fig5_path):
    # The PNG on disk acts as a cache: delete it to regenerate after edits below.
    print(f"Figure 5 exists, skipping: {fig5_path}")
else:
    # Data: Dense 8B vs MoE 670B+ zero-shot taxonomy-enriched results
    # params_b = total parameters (billions), active_b = active parameters (billions).
    dense_models = [
        {"name": "Qwen3-8B ZS", "macro_f1": 0.499, "params_b": 8, "active_b": 8},
        {"name": "Ministral 8B ZS", "macro_f1": 0.491, "params_b": 8, "active_b": 8},
    ]
    moe_models = [
        {"name": "Mistral Large 3 ZS", "macro_f1": 0.658, "params_b": 675, "active_b": 41},
        {"name": "DeepSeek V3.2 ZS", "macro_f1": 0.623, "params_b": 671, "active_b": 37},
        {"name": "DeepSeek V3.2+think", "macro_f1": 0.681, "params_b": 671, "active_b": 37},
    ]

    dense_color = "#d62728"
    moe_color = "#1f77b4"
    think_color = "#9467bd"
    xgb_color = "#2ca02c"

    # NOTE(review): 0.693 is labelled "baseline" here — confirm which XGBoost
    # run this reference value is meant to represent.
    xgb_score = 0.693
    xgb_label = f"XGBoost baseline ({xgb_score:.3f})"

    # Label offsets in points; alternating above/below keeps the labels of
    # neighbouring points in the same column apart.
    label_above, label_below = (12, 8), (12, -14)

    fig, ax = plt.subplots(figsize=(9, 6))

    # Plot dense models at x = 0
    for i, m in enumerate(dense_models):
        ax.scatter(0, m["macro_f1"], s=150, c=dense_color, edgecolors="black",
                   linewidth=0.8, zorder=5, marker="o")
        ax.annotate(f'{m["name"]}\n({m["params_b"]}B dense)',
                    (0, m["macro_f1"]), textcoords="offset points",
                    xytext=label_above if i % 2 == 0 else label_below,
                    fontsize=9, color=dense_color)

    # Plot MoE models at x = 1; "+think" runs get their own color and a diamond
    # marker, matching the legend entries below.
    for i, m in enumerate(moe_models):
        is_thinking = "think" in m["name"]
        color = think_color if is_thinking else moe_color
        ax.scatter(1, m["macro_f1"], s=150, c=color, edgecolors="black",
                   linewidth=0.8, zorder=5, marker="D" if is_thinking else "o")
        label = f'{m["name"]}\n({m["params_b"]}B/{m["active_b"]}B active)'
        ax.annotate(label, (1, m["macro_f1"]), textcoords="offset points",
                    xytext=label_above if i % 2 == 0 else label_below,
                    fontsize=9, color=color)

    # XGBoost baseline dashed line
    ax.axhline(xgb_score, color=xgb_color, linestyle="--", linewidth=1.5, alpha=0.7,
               label=xgb_label)

    # Styling
    ax.set_xticks([0, 1])
    ax.set_xticklabels(["Dense 8B", "MoE 670B+"], fontsize=12, fontweight="bold")
    ax.set_xlim(-0.5, 1.8)  # extra space on the right for the MoE labels
    ax.set_ylim(0.40, 0.78)
    ax.set_ylabel("Macro-F1", fontsize=11)
    ax.set_title("Dense vs MoE Architecture: Zero-Shot Classification Performance",
                 fontsize=13, fontweight="bold")

    # Legend built from proxy artists (Line2D comes from the notebook's import cell).
    legend_elements = [
        Line2D([0], [0], marker="o", color="w", markerfacecolor=dense_color,
               markersize=10, markeredgecolor="black", label="Dense 8B"),
        Line2D([0], [0], marker="o", color="w", markerfacecolor=moe_color,
               markersize=10, markeredgecolor="black", label="MoE 670B+ (zero-shot)"),
        Line2D([0], [0], marker="D", color="w", markerfacecolor=think_color,
               markersize=10, markeredgecolor="black", label="MoE 670B+ (+ thinking)"),
        Line2D([0], [0], color=xgb_color, linestyle="--", linewidth=1.5, label=xgb_label),
    ]
    ax.legend(handles=legend_elements, loc="upper left", fontsize=9, framealpha=0.9)

    plt.tight_layout()
    plt.savefig(fig5_path, dpi=DPI, bbox_inches="tight", facecolor="white")
    plt.close()
    print(f"Figure 5 saved: {fig5_path}")

Figure 5 exists, skipping: ../results\fig_scale_effect.png


In [8]:
# Figure 6: Subcategory Grand Comparison
# Horizontal bars of subcategory-level Macro-F1, one per experiment, with each
# score read from that experiment's metrics CSV.
fig6_path = os.path.join(RESULTS_DIR, "fig_sub_grand_comparison.png")

if os.path.exists(fig6_path):
    print(f"Figure 6 exists, skipping: {fig6_path}")
else:
    sub_experiments = [
        {"name": "XGBoost (baseline)",  "csv": "classic_ml_subcategory_metrics.csv",             "approach": "Classic ML"},
        {"name": "XGBoost (tuned)",     "csv": "classic_ml_tuned_subcategory_metrics.csv",       "approach": "Classic ML"},
        {"name": "Mistral Large 3 ZS",  "csv": "mistral_large_subcategory_metrics.csv",          "approach": "Zero-shot"},
        {"name": "DeepSeek V3.2 ZS",    "csv": "deepseek_v32_subcategory_metrics.csv",           "approach": "Zero-shot"},
        {"name": "DeepSeek V3.2+think",  "csv": "deepseek_v32_thinking_subcategory_metrics.csv", "approach": "Zero-shot"},
        {"name": "Qwen3-8B ZS",         "csv": "qwen_zero_shot_subcategory_metrics.csv",         "approach": "Zero-shot"},
    ]

    # Attach the aggregate score to every entry: the F1 value on the "MACRO"
    # row of its CSV.
    for entry in sub_experiments:
        metrics_table = pd.read_csv(os.path.join(RESULTS_DIR, entry["csv"]))
        is_macro = metrics_table["Category"] == "MACRO"
        entry["macro_f1"] = metrics_table.loc[is_macro, "F1"].values[0]

    # Ascending order so the highest score ends up as the top bar.
    df_sub = pd.DataFrame(sub_experiments)
    df_sub = df_sub.sort_values("macro_f1", ascending=True).reset_index(drop=True)
    positions = range(len(df_sub))

    fig, ax = plt.subplots(figsize=(10, 5))

    ax.barh(
        positions,
        df_sub["macro_f1"],
        color=[COLORS[approach] for approach in df_sub["approach"]],
        edgecolor="white",
        linewidth=0.5,
    )

    ax.set_yticks(positions)
    ax.set_yticklabels(df_sub["name"], fontsize=10)
    ax.set_xlabel("Macro-F1", fontsize=11)
    ax.set_title("Subcategory Classification (48 Labels): All Experiments", fontsize=13, fontweight="bold")

    # Print each score just past the end of its bar.
    for y_pos, score in enumerate(df_sub["macro_f1"]):
        ax.text(score + 0.005, y_pos, f"{score:.3f}", va="center", fontsize=9)

    # Dashed reference line at the best Classic ML score.
    is_classic = df_sub["approach"] == "Classic ML"
    best_ml_sub = df_sub.loc[is_classic, "macro_f1"].max()
    ax.axvline(best_ml_sub, color="#2ca02c", linestyle="--", linewidth=1.2, alpha=0.7)

    # Leave room on the right for the value labels.
    ax.set_xlim(0, df_sub["macro_f1"].max() + 0.08)

    # Only the two approaches that appear in this figure get a legend entry.
    legend_handles = [
        Patch(facecolor=COLORS["Classic ML"], label="Classic ML"),
        Patch(facecolor=COLORS["Zero-shot"], label="Zero-shot"),
    ]
    ax.legend(handles=legend_handles, loc="lower right", fontsize=10, framealpha=0.9)

    plt.tight_layout()
    plt.savefig(fig6_path, dpi=DPI, bbox_inches="tight", facecolor="white")
    plt.close()
    print(f"Figure 6 saved: {fig6_path}")

Figure 6 saved: ../results\fig_sub_grand_comparison.png


In [9]:
# Figure 7: Parent vs Subcategory Drop
# Paired bars per model (parent-level vs subcategory-level Macro-F1) with the
# signed difference printed above each pair.
fig7_path = os.path.join(RESULTS_DIR, "fig_parent_vs_sub.png")


def read_macro_f1(csv_name):
    """Return the F1 value on the "MACRO" row of a metrics CSV.

    Args:
        csv_name: File name inside RESULTS_DIR. The CSV must have "Category"
            and "F1" columns.

    Returns:
        The F1 value of the first row whose Category equals "MACRO".

    Raises:
        IndexError: if the file contains no "MACRO" row.
    """
    df_csv = pd.read_csv(os.path.join(RESULTS_DIR, csv_name))
    return df_csv.loc[df_csv["Category"] == "MACRO", "F1"].values[0]


if os.path.exists(fig7_path):
    # The PNG on disk acts as a cache: delete it to regenerate after the CSVs change.
    print(f"Figure 7 exists, skipping: {fig7_path}")
else:
    # Each entry pairs a model's parent-level CSV with its subcategory-level CSV.
    # "\n" in the names wraps the tick labels onto two lines.
    model_pairs = [
        {"name": "XGBoost\n(baseline)",   "parent_csv": "classic_ml_text_metrics.csv",              "sub_csv": "classic_ml_subcategory_metrics.csv"},
        {"name": "XGBoost\n(tuned)",      "parent_csv": "classic_ml_tuned_parent_metrics.csv",      "sub_csv": "classic_ml_tuned_subcategory_metrics.csv"},
        {"name": "Mistral\nLarge 3 ZS",   "parent_csv": "mistral_large_zs_metrics.csv",             "sub_csv": "mistral_large_subcategory_metrics.csv"},
        {"name": "DeepSeek\nV3.2 ZS",     "parent_csv": "deepseek_v32_parent_metrics.csv",          "sub_csv": "deepseek_v32_subcategory_metrics.csv"},
        {"name": "DeepSeek\nV3.2+think",  "parent_csv": "deepseek_v32_thinking_parent_metrics.csv", "sub_csv": "deepseek_v32_thinking_subcategory_metrics.csv"},
        {"name": "Qwen3-8B\nZS",         "parent_csv": "zero_shot_taxonomy_metrics.csv",            "sub_csv": "qwen_zero_shot_subcategory_metrics.csv"},
    ]

    for m in model_pairs:
        m["parent_f1"] = read_macro_f1(m["parent_csv"])
        m["sub_f1"] = read_macro_f1(m["sub_csv"])
        # Negative delta = subcategory score is lower than the parent score.
        m["delta"] = m["sub_f1"] - m["parent_f1"]

    names = [m["name"] for m in model_pairs]
    parent_vals = [m["parent_f1"] for m in model_pairs]
    sub_vals = [m["sub_f1"] for m in model_pairs]
    deltas = [m["delta"] for m in model_pairs]

    x = np.arange(len(names))
    width = 0.35

    fig, ax = plt.subplots(figsize=(12, 7))

    bars_p = ax.bar(x - width / 2, parent_vals, width, label="Parent (13 labels)", color="#1f77b4", edgecolor="white", linewidth=0.5)
    bars_s = ax.bar(x + width / 2, sub_vals, width, label="Subcategory (48 labels)", color="#ff7f0e", edgecolor="white", linewidth=0.5)

    # Value labels on bars
    for bars, vals in ((bars_p, parent_vals), (bars_s, sub_vals)):
        for bar, val in zip(bars, vals):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.008,
                    f"{val:.3f}", ha="center", va="bottom", fontsize=8.5)

    # Delta annotations above each pair (above the taller of the two bars)
    for i, delta in enumerate(deltas):
        y_top = max(parent_vals[i], sub_vals[i]) + 0.04
        ax.text(x[i], y_top, f"{delta:+.3f}", ha="center", va="bottom",
                fontsize=9, fontweight="bold", color="#d62728")

    ax.set_xticks(x)
    ax.set_xticklabels(names, fontsize=10)
    ax.set_ylabel("Macro-F1", fontsize=11)
    # Headroom is measured from the tallest bar of either series, so the delta
    # labels stay inside the axes even if a subcategory score exceeds every
    # parent score.
    ax.set_ylim(0, max(parent_vals + sub_vals) + 0.12)
    ax.set_title("Parent vs Subcategory: Macro-F1 Drop Across Models", fontsize=13, fontweight="bold")
    ax.legend(fontsize=10, framealpha=0.9)

    plt.tight_layout()
    plt.savefig(fig7_path, dpi=DPI, bbox_inches="tight", facecolor="white")
    plt.close()
    print(f"Figure 7 saved: {fig7_path}")

Figure 7 saved: ../results\fig_parent_vs_sub.png


In [10]:
# Summary: report, for each expected figure file, whether it is present in
# RESULTS_DIR and how large it is.
figures = [
    ("fig_grand_comparison.png",    "Fig 1: Grand Comparison Bar Chart"),
    ("fig_approach_summary.png",    "Fig 2: Four-Way Approach Summary"),
    ("fig_category_heatmap.png",    "Fig 3: Per-Category F1 Heatmap"),
    ("fig_cost_vs_performance.png", "Fig 4: Cost vs Performance Scatter"),
    ("fig_scale_effect.png",        "Fig 5: Model Scale Effect"),
    ("fig_sub_grand_comparison.png", "Fig 6: Subcategory Grand Comparison"),
    ("fig_parent_vs_sub.png",        "Fig 7: Parent vs Subcategory Drop"),
]

rule = "=" * 50

print(rule)
print("Figure Generation Summary")
print(rule)
for png_name, caption in figures:
    png_path = os.path.join(RESULTS_DIR, png_name)
    if not os.path.exists(png_path):
        print(f"  MISSING  {caption}")
        continue
    # Size on disk, converted from bytes to kilobytes.
    kilobytes = os.path.getsize(png_path) / 1024
    print(f"  OK  {caption} ({kilobytes:.0f} KB)")
print(rule)

Figure Generation Summary
  OK  Fig 1: Grand Comparison Bar Chart (252 KB)
  OK  Fig 2: Four-Way Approach Summary (120 KB)
  OK  Fig 3: Per-Category F1 Heatmap (420 KB)
  OK  Fig 4: Cost vs Performance Scatter (210 KB)
  OK  Fig 5: Model Scale Effect (194 KB)
  OK  Fig 6: Subcategory Grand Comparison (128 KB)
  OK  Fig 7: Parent vs Subcategory Drop (180 KB)
