# LLM Response Metrics - Paper Tables

In [None]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display, Markdown

# pandas display options used when rendering the tables below: the column,
# width and column-width settings are set to None and floats are shown with
# one decimal place.
for _option_name, _option_value in (
    ("display.max_columns", None),
    ("display.width", None),
    ("display.max_colwidth", None),
    ("display.precision", 1),
):
    pd.set_option(_option_name, _option_value)

In [None]:
metrics_path = Path("metrics_summary.jsonl")

# The file holds one JSON object per line. It is read as UTF-8 explicitly so
# the result does not depend on the platform's default encoding, and blank
# lines are skipped because json.loads would raise on them.
with open(metrics_path, "r", encoding="utf-8") as f:
    metrics = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(metrics)
print(f"Total records: {len(df)}")

# Keep only rows whose flag is exactly True; the explicit comparison also
# drops rows where the flag is missing instead of failing on them.
df = df[df["valid_for_metrics"] == True].copy()  # noqa: E712
print(f"Valid records: {len(df)}")

In [None]:
# Threshold used for "thinkbrake-prob" rows and for any model that has no
# dedicated entry in MODEL_THRESHOLDS.
DEFAULT_THRESHOLD = 0.25

# Raw identifiers found in the "model" column -> short names shown in tables.
MODEL_DISPLAY_NAMES = {
    "Qwen_Qwen3-4B-Thinking-2507": "Qwen3-4B-Thinking",
    "Qwen_Qwen3-4B": "Qwen3-4B",
    "Qwen_Qwen3-14B": "Qwen3-14B",
    "Qwen_Qwen3-32B": "Qwen3-32B",
    "deepseek-ai_DeepSeek-R1-Distill-Qwen-7B": "DeepSeek-R1-7B",
    "microsoft_phi-4-reasoning": "Phi-4-reasoning",
}

# Order in which models appear as row groups in the tables.
MODEL_ORDER = [
    "Qwen_Qwen3-4B-Thinking-2507",  # Qwen3-4B-Thinking
    "Qwen_Qwen3-4B",  # Qwen3-4B
    "Qwen_Qwen3-14B",  # Qwen3-14B
    "deepseek-ai_DeepSeek-R1-Distill-Qwen-7B",  # DS-R1-7B
    "Qwen_Qwen3-32B",  # Qwen3-32B
    "microsoft_phi-4-reasoning",  # Phi-4-reasoning
]

# Per-model value of the "threshold" column selected for "thinkbrake" rows.
# Every known model uses the default except the DeepSeek distill.
MODEL_THRESHOLDS = {
    **dict.fromkeys(MODEL_DISPLAY_NAMES, DEFAULT_THRESHOLD),
    "deepseek-ai_DeepSeek-R1-Distill-Qwen-7B": 1.0,
}

# Method identifiers found in the "method" column -> labels shown in tables.
METHOD_DISPLAY = {
    "rollout": "(baseline)",
    "nowait": "+NoWait",
    "thinkless": "+ThinkLess",
    "thinkbrake-prob": "+THINKBRAKE (prob)",
    "thinkbrake": "+THINKBRAKE",
    "oracle": "+Oracle",
}

# Add the short display name for each model; identifiers without an entry in
# MODEL_DISPLAY_NAMES keep their raw value.
mapped_names = df["model"].map(MODEL_DISPLAY_NAMES)
df["model_display"] = mapped_names.where(mapped_names.notna(), df["model"])

# Quick overview of what the loaded metrics contain.
unique_models = df["model_display"].unique().tolist()
print(f"Unique models: {unique_models}")
unique_methods = df["method"].unique().tolist()
print(f"Unique methods: {unique_methods}")
unique_benchmarks = df["benchmark"].unique().tolist()
print(f"Unique benchmarks: {unique_benchmarks}")

## Table 1: Reasoning Benchmarks (Math + General)


In [None]:
# Benchmarks that make up Table 1.
REASONING_BENCHMARKS = [
    "gsm8k",
    "math500",
    "aime2024",
    "aime2025",
    "gpqa-diamond",
    "arc-challenge",
]

# Keep the rows of those benchmarks that carry no sub-category.
is_reasoning_benchmark = df["benchmark"].isin(REASONING_BENCHMARKS)
has_no_sub_category = df["sub_category"].isna()
df_reasoning = df.loc[is_reasoning_benchmark & has_no_sub_category].copy()

print(f"Reasoning benchmark records: {len(df_reasoning)}")

In [None]:
def create_reasoning_table(df_data, model_order=None):
    """Build the accuracy/token table for the reasoning benchmarks.

    One row is produced for every (model, method) pair that has at least one
    record in ``df_data``. "thinkbrake" rows are restricted to the model's
    entry in ``MODEL_THRESHOLDS`` (falling back to ``DEFAULT_THRESHOLD``) and
    "thinkbrake-prob" rows to ``DEFAULT_THRESHOLD``; other methods are not
    filtered by threshold. When several records match a cell, the first one
    is used. Missing or NaN metrics are rendered as "–" and are excluded from
    the averages.

    Args:
        df_data: DataFrame with "model", "method", "benchmark", "threshold",
            "accuracy" and "avg_token_length" columns.
        model_order: Optional sequence of model ids that fixes the row order;
            ids not present in ``df_data`` are skipped. When None, models
            appear in the order ``df_data["model"].unique()`` returns them.

    Returns:
        DataFrame of display strings with "Model", "Method", one accuracy
        column and one "Token" column per benchmark, followed by "Avg Acc"
        and a final "Token" column. All token columns share the label
        "Token", so the frame has duplicate column names. The frame has no
        columns when no row was produced.
    """
    benchmarks = [
        "gsm8k",
        "math500",
        "aime2024",
        "aime2025",
        "gpqa-diamond",
        "arc-challenge",
    ]
    methods_to_show = [
        "rollout",
        "nowait",
        "thinkless",
        "thinkbrake-prob",
        "thinkbrake",
    ]
    missing = "–"

    def metric(frame, column):
        # First value of `column`, or None when there is no row or it is NaN/None.
        if frame.empty:
            return None
        value = frame[column].values[0]
        return value if pd.notna(value) else None

    if model_order is None:
        all_models = df_data["model"].unique()
    else:
        available_models = set(df_data["model"].unique())
        all_models = [m for m in model_order if m in available_models]

    rows = []

    for model in all_models:
        model_display = MODEL_DISPLAY_NAMES.get(model, model)
        model_data = df_data[df_data["model"] == model]

        # Threshold each threshold-based method must match for this model.
        required_threshold = {
            "thinkbrake": MODEL_THRESHOLDS.get(model, DEFAULT_THRESHOLD),
            "thinkbrake-prob": DEFAULT_THRESHOLD,
        }

        # Benchmarks for which this model has a baseline ("rollout") record;
        # only those contribute to a method's average token length.
        baseline_benchmarks = set(
            model_data.loc[model_data["method"] == "rollout", "benchmark"]
        )

        first_row_for_model = True
        for method in methods_to_show:
            method_data = model_data[model_data["method"] == method]
            if method in required_threshold:
                method_data = method_data[
                    method_data["threshold"] == required_threshold[method]
                ]

            if method_data.empty:
                continue

            row = {
                # The model name is printed only on the first row of its group.
                "Model": model_display if first_row_for_model else "",
                "Method": METHOD_DISPLAY.get(method, method),
            }
            first_row_for_model = False

            acc_values = []
            tok_values = []

            for bench in benchmarks:
                bench_row = method_data[method_data["benchmark"] == bench]
                acc = metric(bench_row, "accuracy")
                tokens = metric(bench_row, "avg_token_length")

                row[f"{bench}_acc"] = missing if acc is None else f"{acc:.1f}"
                row[f"{bench}_tok"] = missing if tokens is None else f"{tokens:.0f}"

                if acc is not None:
                    acc_values.append(acc)
                if tokens is not None and bench in baseline_benchmarks:
                    tok_values.append(tokens)

            row["avg_acc"] = f"{np.mean(acc_values):.1f}" if acc_values else missing

            # NOTE(review): the baseline row always shows "–" for the average
            # token length even though its per-benchmark tokens are listed;
            # kept as in the original — confirm this is intended.
            show_avg_tok = bool(acc_values) and bool(tok_values) and method != "rollout"
            row["avg_tok"] = f"{np.mean(tok_values):.0f}" if show_avg_tok else missing

            rows.append(row)

    result_df = pd.DataFrame(rows)

    # Rename columns for display
    rename_map = {
        "gsm8k_acc": "GSM8K Acc",
        "gsm8k_tok": "Token",
        "math500_acc": "MATH500 Acc",
        "math500_tok": "Token",
        "aime2024_acc": "AIME24 Acc",
        "aime2024_tok": "Token",
        "aime2025_acc": "AIME25 Acc",
        "aime2025_tok": "Token",
        "gpqa-diamond_acc": "GPQA-D Acc",
        "gpqa-diamond_tok": "Token",
        "arc-challenge_acc": "ARC-C Acc",
        "arc-challenge_tok": "Token",
        "avg_acc": "Avg Acc",
        "avg_tok": "Token",
    }
    result_df = result_df.rename(columns=rename_map)

    return result_df


# Build Table 1 with the models in MODEL_ORDER, then show it under its heading.
reasoning_table = create_reasoning_table(df_reasoning, model_order=MODEL_ORDER)
display(Markdown("### Table 1: Reasoning Benchmarks"), reasoning_table)

---
## Table 2: Tool Benchmarks (BFCL + MetaTool)

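BFCL sub-categories: simple, multiple, parallel, parallel_multiple. MetaTool sub-categories: task2_subtask1 (shown as "single") and task2_subtask4 (shown as "multiple").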

In [None]:
# Rows of the tool-use benchmarks; no sub-category filter is applied here.
is_tool_benchmark = df["benchmark"].isin(["bfcl-v1", "bfcl-v2", "meta-tool"])
df_tool = df.loc[is_tool_benchmark].copy()
print(f"Tool benchmark records: {len(df_tool)}")

tool_sub_categories = df_tool["sub_category"].dropna().unique().tolist()
print(f"Sub-categories: {tool_sub_categories}")

tool_benchmarks_found = df_tool["benchmark"].unique().tolist()
print(f"Benchmarks: {tool_benchmarks_found}")

In [None]:
def create_tool_table(
    df_data,
    model_order=None,
):
    """Build the accuracy/token table for the BFCL and MetaTool sub-categories.

    One row is produced for every (model, method) pair that has at least one
    record in ``df_data``. "thinkbrake" rows are restricted to the model's
    entry in ``MODEL_THRESHOLDS`` (falling back to ``DEFAULT_THRESHOLD``) and
    "thinkbrake-prob" rows to ``DEFAULT_THRESHOLD``; other methods are not
    filtered by threshold. When several records match a cell, the first one
    is used. Missing or NaN metrics are rendered as "–".

    Args:
        df_data: DataFrame with "model", "method", "benchmark",
            "sub_category", "threshold", "accuracy" and "avg_token_length"
            columns.
        model_order: Optional sequence of model ids that fixes the row order;
            ids not present in ``df_data`` are skipped. When None, models
            appear in the order ``df_data["model"].unique()`` returns them.

    Returns:
        DataFrame of display strings with "Model", "Method" and a
        "<prefix>_acc" / "<prefix>_tok" column pair per sub-category, e.g.
        "BFCLV1_simple_acc", "BFCLV2_parallel-multiple_tok", "MT_single_acc".
        The frame has no columns when no row was produced.
    """
    bfcl_benchmarks = ["bfcl-v1", "bfcl-v2"]
    bfcl_subcats = ["simple", "multiple", "parallel", "parallel_multiple"]
    metatool_subcats = ["task2_subtask1", "task2_subtask4"]
    metatool_display_names = {
        "task2_subtask1": "single",
        "task2_subtask4": "multiple",
    }
    methods_to_show = [
        "rollout",
        "nowait",
        "thinkless",
        "thinkbrake-prob",
        "thinkbrake",
    ]
    missing = "–"

    # (benchmark, sub_category, column prefix) for every column pair, in
    # display order: all BFCL columns first, then the MetaTool columns.
    column_specs = [
        (bench, subcat, f"{bench.upper().replace('-', '')}_{subcat.replace('_', '-')}")
        for bench in bfcl_benchmarks
        for subcat in bfcl_subcats
    ] + [
        ("meta-tool", subcat, f"MT_{metatool_display_names.get(subcat, subcat)}")
        for subcat in metatool_subcats
    ]

    def metric(frame, column):
        # First value of `column`, or None when there is no row or it is NaN/None.
        if frame.empty:
            return None
        value = frame[column].values[0]
        return value if pd.notna(value) else None

    if model_order is None:
        all_models = df_data["model"].unique()
    else:
        available_models = set(df_data["model"].unique())
        all_models = [m for m in model_order if m in available_models]

    rows = []

    for model in all_models:
        model_display = MODEL_DISPLAY_NAMES.get(model, model)
        model_data = df_data[df_data["model"] == model]

        # Threshold each threshold-based method must match for this model.
        required_threshold = {
            "thinkbrake": MODEL_THRESHOLDS.get(model, DEFAULT_THRESHOLD),
            "thinkbrake-prob": DEFAULT_THRESHOLD,
        }

        first_row_for_model = True
        for method in methods_to_show:
            method_data = model_data[model_data["method"] == method]
            if method in required_threshold:
                method_data = method_data[
                    method_data["threshold"] == required_threshold[method]
                ]

            if method_data.empty:
                continue

            row = {
                # The model name is printed only on the first row of its group.
                "Model": model_display if first_row_for_model else "",
                "Method": METHOD_DISPLAY.get(method, method),
            }
            first_row_for_model = False

            for bench, subcat, col_prefix in column_specs:
                data = method_data[
                    (method_data["benchmark"] == bench)
                    & (method_data["sub_category"] == subcat)
                ]
                acc = metric(data, "accuracy")
                tokens = metric(data, "avg_token_length")

                row[f"{col_prefix}_acc"] = missing if acc is None else f"{acc:.1f}"
                row[f"{col_prefix}_tok"] = (
                    missing if tokens is None else f"{tokens:.0f}"
                )

            rows.append(row)

    result_df = pd.DataFrame(rows)
    return result_df


# Build Table 2 with the models in MODEL_ORDER, then show it under its heading.
tool_table = create_tool_table(df_tool, model_order=MODEL_ORDER)
display(
    Markdown("### Table 2: Tool Benchmarks (BFCL + MetaTool Sub-Categories)"),
    tool_table,
)

---
## Table 3: Extended Metrics (pass@5, majority@8, avg@8)

Shows the additional metrics (pass@5, majority@8, avg@8) for AIME and MATH500.

In [None]:
def create_extended_metrics_table(
    df_data, main_model="Qwen_Qwen3-4B-Thinking-2507", model_order=None
):
    """Build the pass@5 / majority@8 / avg@8 table for MATH500 and AIME.

    Only records without a sub-category are considered. For every model a
    "rollout" row and a "thinkbrake" row are produced when matching records
    exist; "thinkbrake" rows are restricted to the model's entry in
    ``MODEL_THRESHOLDS`` (falling back to ``DEFAULT_THRESHOLD``). When
    several records match a cell, the first one is used. A metric that is
    absent, NaN/None, or whose column does not exist is rendered as "–".

    Args:
        df_data: DataFrame with "model", "method", "benchmark",
            "sub_category" and "threshold" columns, and optionally "pass@k"
            (a dict such as {"1": 68.75, "5": 82.55}), "majority_accuracy"
            and "avg@8".
        main_model: Kept for backward compatibility. The same methods are
            shown for every model, so this value does not affect the output.
        model_order: Optional sequence of model ids that fixes the row order;
            ids without matching records are skipped. When None, models
            appear in the order ``unique()`` returns them.

    Returns:
        DataFrame of display strings with "Model", "Method" and, for each
        benchmark, a "<BENCH> pass@5", a "maj@8" and an "avg@8" column. The
        "maj@8" and "avg@8" labels repeat, so the frame has duplicate column
        names. The frame has no columns when no row was produced.
    """
    benchmarks = ["math500", "aime2024", "aime2025"]
    methods_to_show = ["rollout", "thinkbrake"]
    missing = "–"

    def scalar_cell(frame, column):
        # One-decimal text of the first value of `column`, or the missing
        # marker when the column is absent, there is no row, or it is NaN.
        series = frame.get(column)
        if series is None or len(series) == 0 or pd.isna(series.values[0]):
            return missing
        return f"{series.values[0]:.1f}"

    def pass_at_5_cell(frame):
        # "pass@k" holds a dict keyed by the string form of k; pick k = 5.
        series = frame.get("pass@k")
        if series is None or len(series) == 0:
            return missing
        pass_k = series.values[0]
        if not isinstance(pass_k, dict):
            return missing
        value = pass_k.get("5")
        if value is None or pd.isna(value):
            return missing
        return f"{value:.1f}"

    # Records of the extended-metrics benchmarks that carry no sub-category.
    df_ext = df_data[
        (df_data["benchmark"].isin(benchmarks)) & (df_data["sub_category"].isna())
    ]

    if model_order is None:
        all_models = df_ext["model"].unique()
    else:
        available_models = set(df_ext["model"].unique())
        all_models = [m for m in model_order if m in available_models]

    rows = []

    for model in all_models:
        model_display = MODEL_DISPLAY_NAMES.get(model, model)
        model_data = df_ext[df_ext["model"] == model]
        threshold = MODEL_THRESHOLDS.get(model, DEFAULT_THRESHOLD)

        first_row_for_model = True
        for method in methods_to_show:
            method_data = model_data[model_data["method"] == method]
            if method == "thinkbrake":
                method_data = method_data[method_data["threshold"] == threshold]

            if method_data.empty:
                continue

            row = {
                # The model name is printed only on the first row of its group.
                "Model": model_display if first_row_for_model else "",
                "Method": METHOD_DISPLAY.get(method, method),
            }
            first_row_for_model = False

            for bench in benchmarks:
                bench_row = method_data[method_data["benchmark"] == bench]
                row[f"{bench}_pass5"] = pass_at_5_cell(bench_row)
                row[f"{bench}_maj8"] = scalar_cell(bench_row, "majority_accuracy")
                row[f"{bench}_avg8"] = scalar_cell(bench_row, "avg@8")

            rows.append(row)

    result_df = pd.DataFrame(rows)

    # Rename columns for display; only the first column of each benchmark
    # group carries the benchmark name.
    rename_map = {
        "math500_pass5": "MATH500 pass@5",
        "math500_maj8": "maj@8",
        "math500_avg8": "avg@8",
        "aime2024_pass5": "AIME24 pass@5",
        "aime2024_maj8": "maj@8",
        "aime2024_avg8": "avg@8",
        "aime2025_pass5": "AIME25 pass@5",
        "aime2025_maj8": "maj@8",
        "aime2025_avg8": "avg@8",
    }
    result_df = result_df.rename(columns=rename_map)

    return result_df


# Build Table 3 from all valid records with the models in MODEL_ORDER, then
# show it under its heading.
extended_table = create_extended_metrics_table(df, model_order=MODEL_ORDER)
display(
    Markdown("### Table 3: Extended Metrics (pass@5, majority@8, avg@8)"),
    extended_table,
)

## Threshold Analysis for ThinkBrake

In [None]:
# Benchmark -> parent category used to group the threshold analysis.
PARENT_CATEGORIES = {
    **dict.fromkeys(("gsm8k", "math500", "aime2024", "aime2025"), "math"),
    **dict.fromkeys(("gpqa-diamond", "arc-challenge"), "general"),
    **dict.fromkeys(("bfcl-v1", "bfcl-v2", "meta-tool"), "tool"),
}

# ThinkBrake records that carry no sub-category, tagged with their category.
uses_thinkbrake = df["method"] == "thinkbrake"
lacks_sub_category = df["sub_category"].isna()
df_tb = df.loc[uses_thinkbrake & lacks_sub_category].copy()
df_tb["category"] = df_tb["benchmark"].map(PARENT_CATEGORIES)

# Mean accuracy per (model, threshold) row and category column.
threshold_analysis = pd.pivot_table(
    df_tb,
    index=["model", "threshold"],
    columns="category",
    values="accuracy",
    aggfunc="mean",
)

print("ThinkBrake Threshold Analysis by Category (Average Accuracy)")
styled_analysis = threshold_analysis.style.format("{:.2f}").background_gradient(axis=0)
display(styled_analysis)