# LLM Response Metrics - Paper Tables

In [1]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display, Markdown

# Render tables untruncated (all columns, unlimited width and cell width) and
# show floats with a single decimal place.
for _option_name, _option_value in {
    "display.max_columns": None,
    "display.width": None,
    "display.max_colwidth": None,
    "display.precision": 1,
}.items():
    pd.set_option(_option_name, _option_value)

In [2]:
metrics_path = Path("metrics_summary.jsonl")

# The file is JSON Lines: one JSON object per line. Whitespace-only lines
# (e.g. a trailing newline at the end of the file) are skipped so they do not
# crash json.loads. The encoding is pinned so the read does not depend on the
# platform default.
with open(metrics_path, "r", encoding="utf-8") as f:
    metrics = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(metrics)
print(f"Total records: {len(df)}")

# Keep only records explicitly flagged valid; a missing flag (NaN) compares
# unequal to True and is therefore dropped as well.
df = df[df["valid_for_metrics"].eq(True)].copy()
print(f"Valid records: {len(df)}")

Total records: 870
Valid records: 846


In [3]:
# Model id as stored in the metrics records -> short name printed in the tables.
MODEL_DISPLAY_NAMES = dict(
    [
        ("Qwen_Qwen3-4B-Thinking-2507", "Qwen3-4B-Thinking"),
        ("Qwen_Qwen3-4B", "Qwen3-4B"),
        ("Qwen_Qwen3-14B", "Qwen3-14B"),
        ("Qwen_Qwen3-32B", "Qwen3-32B"),
        ("deepseek-ai_DeepSeek-R1-Distill-Qwen-7B", "DeepSeek-R1-7B"),
        ("microsoft_phi-4-reasoning", "Phi-4-reasoning"),
    ]
)

# Row order of the tables: the insertion order of the display-name mapping.
MODEL_ORDER = list(MODEL_DISPLAY_NAMES)

# Threshold whose "thinkbrake" rows are reported for each model. Most models
# use 0.1; the two exceptions are overridden explicitly.
MODEL_THRESHOLDS = {model_id: 0.1 for model_id in MODEL_ORDER}
MODEL_THRESHOLDS.update(
    {
        "Qwen_Qwen3-4B-Thinking-2507": 0.01,
        "microsoft_phi-4-reasoning": 2.5,
    }
)
# Fallback for models without an entry above.
DEFAULT_THRESHOLD = 0.25

# Method id -> label printed in the "Method" column.
METHOD_DISPLAY = {
    "rollout": "(baseline)",
    "nowait": "+NoWait",
    "thinkless": "+ThinkLess",
    "thinkbrake-prob": "+THINKBRAKE (prob)",
    "thinkbrake": "+THINKBRAKE",
    "oracle": "+Oracle",
}

# Unknown model ids keep their raw id as the display name.
df["model_display"] = df["model"].map(
    lambda model_id: MODEL_DISPLAY_NAMES.get(model_id, model_id)
)
print(f"Unique models: {df['model_display'].unique().tolist()}")
print(f"Unique methods: {df['method'].unique().tolist()}")
print(f"Unique benchmarks: {df['benchmark'].unique().tolist()}")

Unique models: ['Qwen3-4B', 'DeepSeek-R1-7B', 'Qwen3-4B-Thinking', 'Qwen3-32B', 'Qwen3-14B', 'Phi-4-reasoning']
Unique methods: ['rollout', 'thinkbrake-prob', 'thinkbrake', 'thinkless']
Unique benchmarks: ['bfcl-v1', 'meta-tool', 'bfcl-v2', 'gsm8k', 'aime2024', 'aime2025', 'math500', 'dapo-math', 'gpqa-diamond', 'arc-challenge']


## Table 1: Reasoning Benchmarks (Math + General)


In [4]:
# Benchmarks reported in Table 1.
REASONING_BENCHMARKS = [
    "gsm8k",
    "math500",
    "aime2024",
    "aime2025",
    "gpqa-diamond",
    "arc-challenge",
]

# Keep benchmark-level records only: rows that carry a sub_category are excluded.
in_reasoning_benchmarks = df["benchmark"].isin(REASONING_BENCHMARKS)
without_sub_category = df["sub_category"].isna()
df_reasoning = df.loc[in_reasoning_benchmarks & without_sub_category].copy()

print(f"Reasoning benchmark records: {len(df_reasoning)}")

Reasoning benchmark records: 270


In [5]:
def create_reasoning_table(df_data, model_order=None):
    """Build the Table 1 frame: accuracy and token length per reasoning benchmark.

    One row is produced for every (model, method) combination that has at least
    one matching record. "thinkbrake" rows are taken at the model's entry in
    ``MODEL_THRESHOLDS`` (falling back to ``DEFAULT_THRESHOLD``) and
    "thinkbrake-prob" rows at ``DEFAULT_THRESHOLD``; other methods are not
    filtered by threshold. When several records match a cell, the first is used.

    Args:
        df_data: DataFrame with the columns ``model``, ``method``, ``benchmark``,
            ``threshold``, ``accuracy`` and ``avg_token_length``.
        model_order: Optional sequence of model ids giving the row order; ids
            absent from ``df_data`` are skipped. When ``None``, models appear in
            order of first occurrence in ``df_data``.

    Returns:
        DataFrame of pre-formatted strings ("–" marks a missing value). The model
        name is written only on the first row of each model. After renaming,
        every token column carries the same label "Token", so column labels are
        not unique.
    """
    benchmarks = [
        "gsm8k",
        "math500",
        "aime2024",
        "aime2025",
        "gpqa-diamond",
        "arc-challenge",
    ]
    methods_to_show = [
        "rollout",
        "nowait",
        "thinkless",
        "thinkbrake-prob",
        "thinkbrake",
    ]

    if model_order is None:
        all_models = df_data["model"].unique()
    else:
        available_models = set(df_data["model"].unique())
        all_models = [m for m in model_order if m in available_models]

    rows = []

    for model in all_models:
        model_display = MODEL_DISPLAY_NAMES.get(model, model)
        model_data = df_data[df_data["model"] == model]

        # Methods that are reported at one specific threshold. Uses
        # DEFAULT_THRESHOLD (not a literal) so this table stays in step with
        # create_tool_table.
        method_thresholds = {
            "thinkbrake": MODEL_THRESHOLDS.get(model, DEFAULT_THRESHOLD),
            "thinkbrake-prob": DEFAULT_THRESHOLD,
        }

        # Benchmarks for which this model has a baseline (rollout) record; only
        # these contribute to a method's average token length.
        rollout = model_data[model_data["method"] == "rollout"]
        baseline_benchmarks = set(rollout["benchmark"])

        first_row_for_model = True
        for method in methods_to_show:
            method_data = model_data[model_data["method"] == method]
            if method in method_thresholds:
                method_data = method_data[
                    method_data["threshold"] == method_thresholds[method]
                ]

            if len(method_data) == 0:
                continue

            row = {
                "Model": model_display if first_row_for_model else "",
                "Method": METHOD_DISPLAY.get(method, method),
            }
            first_row_for_model = False

            acc_values = []
            token_values = []

            for bench in benchmarks:
                bench_row = method_data[method_data["benchmark"] == bench]
                if len(bench_row) == 0:
                    row[f"{bench}_acc"] = "–"
                    row[f"{bench}_tok"] = "–"
                    continue

                acc = bench_row["accuracy"].values[0]
                tokens = bench_row["avg_token_length"].values[0]

                row[f"{bench}_acc"] = f"{acc:.1f}"
                row[f"{bench}_tok"] = f"{tokens:.0f}"
                acc_values.append(acc)
                if bench in baseline_benchmarks:
                    token_values.append(tokens)

            # NOTE(review): the averages cover only the benchmarks present for
            # this row, so rows with missing benchmarks are not directly
            # comparable with complete rows.
            if acc_values:
                row["avg_acc"] = f"{np.mean(acc_values):.1f}"
                # NOTE(review): the baseline row deliberately shows "–" for the
                # average token length, as in the original table — confirm this
                # is intended rather than a leftover.
                if method != "rollout" and token_values:
                    row["avg_tok"] = f"{np.mean(token_values):.0f}"
                else:
                    row["avg_tok"] = "–"
            else:
                row["avg_acc"] = "–"
                row["avg_tok"] = "–"

            rows.append(row)

    result_df = pd.DataFrame(rows)

    # Rename columns for display
    rename_map = {
        "gsm8k_acc": "GSM8K Acc",
        "gsm8k_tok": "Token",
        "math500_acc": "MATH500 Acc",
        "math500_tok": "Token",
        "aime2024_acc": "AIME24 Acc",
        "aime2024_tok": "Token",
        "aime2025_acc": "AIME25 Acc",
        "aime2025_tok": "Token",
        "gpqa-diamond_acc": "GPQA-D Acc",
        "gpqa-diamond_tok": "Token",
        "arc-challenge_acc": "ARC-C Acc",
        "arc-challenge_tok": "Token",
        "avg_acc": "Avg Acc",
        "avg_tok": "Token",
    }
    result_df = result_df.rename(columns=rename_map)

    return result_df


# Build Table 1 in the paper's model order and render it under its heading.
reasoning_table = create_reasoning_table(df_reasoning, model_order=MODEL_ORDER)
display(
    Markdown("### Table 1: Reasoning Benchmarks"),
    reasoning_table,
)

### Table 1: Reasoning Benchmarks

Unnamed: 0,Model,Method,GSM8K Acc,Token,MATH500 Acc,Token.1,AIME24 Acc,Token.2,AIME25 Acc,Token.3,GPQA-D Acc,Token.4,ARC-C Acc,Token.5,Avg Acc,Token.6
0,Qwen3-4B-Thinking,(baseline),95.1,1578,95.3,6360,76.7,19284,72.5,21317,64.1,8455,94.3,1178,83.0,–
1,,+THINKBRAKE (prob),93.9,990,91.2,2677,35.0,5277,30.4,5119,47.0,2749,93.7,805,65.2,2936
2,,+THINKBRAKE,94.8,1279,96.6,5061,73.3,13149,70.0,14733,62.6,5937,94.0,943,81.9,6850
3,Qwen3-4B,(baseline),94.4,2407,96.0,5250,68.8,13000,59.6,15163,51.0,7497,94.0,798,77.3,–
4,,+THINKBRAKE (prob),92.8,1439,88.6,2558,29.2,4617,25.0,4879,47.5,3435,93.4,678,62.7,2934
5,,+THINKBRAKE,94.5,1682,95.4,4496,64.6,11350,60.4,13112,55.6,5273,93.9,706,77.4,6103
6,Qwen3-14B,(baseline),96.0,1912,96.8,4850,71.7,12807,69.2,14198,60.6,7379,95.9,621,81.7,–
7,,+ThinkLess,94.1,382,85.8,974,26.7,2909,23.3,2773,53.5,1297,–,–,56.7,1667
8,,+THINKBRAKE (prob),95.1,1307,89.0,2433,38.8,4092,26.2,3977,57.6,2483,94.7,510,66.9,2467
9,,+THINKBRAKE,95.0,1617,96.9,4478,77.9,11864,65.8,13342,61.1,4437,95.2,508,82.0,6041


---
## Table 2: Tool Benchmarks (BFCL + MetaTool)

BFCL sub-categories: simple, multiple, parallel, parallel_multiple. MetaTool sub-categories: task2_subtask1 (single), task2_subtask4 (multiple).

In [6]:
# Records of the tool-use benchmarks (BFCL v1/v2 and MetaTool), sub-categories included.
TOOL_BENCHMARKS = ["bfcl-v1", "bfcl-v2", "meta-tool"]
df_tool = df.loc[df["benchmark"].isin(TOOL_BENCHMARKS)].copy()

print(f"Tool benchmark records: {len(df_tool)}")
print(f"Sub-categories: {df_tool['sub_category'].dropna().unique().tolist()}")
print(f"Benchmarks: {df_tool['benchmark'].unique().tolist()}")

Tool benchmark records: 549
Sub-categories: ['simple', 'parallel', 'multiple', 'parallel_multiple', 'task2_subtask1', 'task2_subtask4']
Benchmarks: ['bfcl-v1', 'meta-tool', 'bfcl-v2']


In [7]:
def _format_tool_cells(method_data, benchmark, sub_category):
    """Return (accuracy, tokens) display strings for one benchmark/sub-category.

    Uses the first matching record; returns ("–", "–") when there is none.
    """
    match = method_data[
        (method_data["benchmark"] == benchmark)
        & (method_data["sub_category"] == sub_category)
    ]
    if len(match) == 0:
        return "–", "–"

    acc = match["accuracy"].values[0]
    tokens = match["avg_token_length"].values[0]
    return f"{acc:.1f}", f"{tokens:.0f}"


def create_tool_table(
    df_data,
    model_order=None,
):
    """Build the Table 2 frame: accuracy and token length per tool sub-category.

    One row is produced for every (model, method) combination that has at least
    one matching record. "thinkbrake" rows are taken at the model's entry in
    ``MODEL_THRESHOLDS`` (falling back to ``DEFAULT_THRESHOLD``) and
    "thinkbrake-prob" rows at ``DEFAULT_THRESHOLD``; other methods are not
    filtered by threshold.

    Args:
        df_data: DataFrame with the columns ``model``, ``method``, ``benchmark``,
            ``sub_category``, ``threshold``, ``accuracy`` and
            ``avg_token_length``.
        model_order: Optional sequence of model ids giving the row order; ids
            absent from ``df_data`` are skipped. When ``None``, models appear in
            order of first occurrence in ``df_data``.

    Returns:
        DataFrame of pre-formatted strings ("–" marks a missing value) with
        ``Model`` and ``Method`` columns followed by an ``_acc``/``_tok`` column
        pair for every BFCL v1/v2 sub-category and every MetaTool sub-category.
        The model name is written only on the first row of each model.
    """
    bfcl_subcats = ["simple", "multiple", "parallel", "parallel_multiple"]
    metatool_subcats = ["task2_subtask1", "task2_subtask4"]
    metatool_display_names = {
        "task2_subtask1": "single",
        "task2_subtask4": "multiple",
    }
    methods_to_show = [
        "rollout",
        "nowait",
        "thinkless",
        "thinkbrake-prob",
        "thinkbrake",
    ]

    if model_order is None:
        all_models = df_data["model"].unique()
    else:
        available_models = set(df_data["model"].unique())
        all_models = [m for m in model_order if m in available_models]

    rows = []

    for model in all_models:
        model_display = MODEL_DISPLAY_NAMES.get(model, model)
        model_data = df_data[df_data["model"] == model]

        # Methods that are reported at one specific threshold.
        method_thresholds = {
            "thinkbrake": MODEL_THRESHOLDS.get(model, DEFAULT_THRESHOLD),
            "thinkbrake-prob": DEFAULT_THRESHOLD,
        }

        first_row_for_model = True
        for method in methods_to_show:
            method_data = model_data[model_data["method"] == method]
            if method in method_thresholds:
                method_data = method_data[
                    method_data["threshold"] == method_thresholds[method]
                ]

            if len(method_data) == 0:
                continue

            row = {
                "Model": model_display if first_row_for_model else "",
                "Method": METHOD_DISPLAY.get(method, method),
            }
            first_row_for_model = False

            # BFCL sub-category columns, e.g. "BFCLV1_parallel-multiple_acc".
            for bench in ["bfcl-v1", "bfcl-v2"]:
                for subcat in bfcl_subcats:
                    col_prefix = (
                        f"{bench.upper().replace('-', '')}_{subcat.replace('_','-')}"
                    )
                    acc_text, tok_text = _format_tool_cells(method_data, bench, subcat)
                    row[f"{col_prefix}_acc"] = acc_text
                    row[f"{col_prefix}_tok"] = tok_text

            # MetaTool sub-category columns, e.g. "MT_single_acc".
            for subcat in metatool_subcats:
                display_name = metatool_display_names.get(subcat, subcat)
                col_prefix = f"MT_{display_name}"
                acc_text, tok_text = _format_tool_cells(
                    method_data, "meta-tool", subcat
                )
                row[f"{col_prefix}_acc"] = acc_text
                row[f"{col_prefix}_tok"] = tok_text

            rows.append(row)

    result_df = pd.DataFrame(rows)
    return result_df


# Build Table 2 in the paper's model order and render it under its heading.
tool_table = create_tool_table(df_tool, model_order=MODEL_ORDER)
display(
    Markdown("### Table 2: Tool Benchmarks (BFCL + MetaTool Sub-Categories)"),
    tool_table,
)

### Table 2: Tool Benchmarks (BFCL + MetaTool Sub-Categories)

Unnamed: 0,Model,Method,BFCLV1_simple_acc,BFCLV1_simple_tok,BFCLV1_multiple_acc,BFCLV1_multiple_tok,BFCLV1_parallel_acc,BFCLV1_parallel_tok,BFCLV1_parallel-multiple_acc,BFCLV1_parallel-multiple_tok,BFCLV2_simple_acc,BFCLV2_simple_tok,BFCLV2_multiple_acc,BFCLV2_multiple_tok,BFCLV2_parallel_acc,BFCLV2_parallel_tok,BFCLV2_parallel-multiple_acc,BFCLV2_parallel-multiple_tok,MT_single_acc,MT_single_tok,MT_multiple_acc,MT_multiple_tok
0,Qwen3-4B-Thinking,(baseline),88.2,1072,90.5,965,90.5,1293,84.5,1727,79.8,1094,83.4,1832,87.5,1473,79.2,2397,69.7,1946,85.5,1336
1,,+THINKBRAKE (prob),88.0,643,90.5,804,91.0,739,80.0,1027,79.5,713,81.7,1459,75.0,735,70.8,1588,70.8,1443,86.7,1166
2,,+THINKBRAKE,88.0,744,91.0,819,92.5,833,80.0,1122,81.0,822,82.1,1519,81.2,898,79.2,1744,72.8,1555,85.7,1223
3,Qwen3-4B,(baseline),88.2,607,92.0,744,90.0,844,83.0,1046,80.2,653,81.0,1426,81.2,837,70.8,1555,74.3,1069,90.5,856
4,,+THINKBRAKE (prob),87.3,525,93.0,710,90.5,719,84.0,996,80.2,608,81.8,1376,75.0,711,75.0,1357,73.5,1053,90.3,831
5,,+THINKBRAKE,87.8,531,92.5,705,89.5,705,83.5,921,79.1,594,81.2,1384,62.5,660,75.0,1396,72.7,1054,91.1,830
6,Qwen3-14B,(baseline),87.8,545,93.0,725,92.0,834,83.5,975,80.6,612,81.1,1372,62.5,944,70.8,1470,63.3,1091,84.9,873
7,,+THINKBRAKE (prob),88.4,494,92.5,709,91.5,638,86.0,819,77.9,550,80.7,1299,68.8,662,50.0,1256,64.6,1068,85.9,831
8,,+THINKBRAKE,87.8,493,92.0,695,93.0,657,85.0,837,78.3,565,79.7,1300,62.5,704,62.5,1358,66.3,1074,85.7,834
9,Qwen3-32B,(baseline),88.0,553,90.0,750,93.0,826,85.0,991,83.7,617,80.7,1342,68.8,725,70.8,1572,64.8,1048,84.3,857


---
## Table 3: Extended Metrics (pass@5, majority@8)

Additional metrics (pass@5 and majority-vote accuracy) for the AIME and MATH500 benchmarks.

In [8]:
def _first_cell_or_none(frame, column):
    """Return the first value of ``column`` in ``frame``, or None if the column or rows are missing."""
    if column not in frame.columns or len(frame) == 0:
        return None
    return frame[column].values[0]


def create_extended_metrics_table(
    df_data, main_model="Qwen_Qwen3-4B-Thinking-2507", model_order=None
):
    """Build the Table 3 frame: pass@5 and majority-vote accuracy per math benchmark.

    For every model, the baseline ("rollout") row and the "thinkbrake" row are
    reported. The "thinkbrake" row is taken at the model's entry in
    ``MODEL_THRESHOLDS``, falling back to ``DEFAULT_THRESHOLD``. Only
    benchmark-level records (``sub_category`` is NaN) of MATH500, AIME24 and
    AIME25 are considered.

    Args:
        df_data: DataFrame with the columns ``model``, ``method``, ``benchmark``,
            ``sub_category`` and ``threshold``. ``pass@k`` (a dict such as
            ``{"1": 68.75, "5": 82.55}``) and ``majority_accuracy`` are read
            when present.
        main_model: Accepted for backward compatibility. It does not currently
            change which methods are shown.
        model_order: Optional sequence of model ids giving the row order; ids
            absent from the filtered data are skipped. When ``None``, models
            appear in order of first occurrence.

    Returns:
        DataFrame of pre-formatted strings ("–" marks a missing value). Every
        row has the same columns: ``Model``, ``Method`` and, per benchmark, a
        pass@5 column and a "maj@8" column. The "maj@8" label repeats, so
        column labels are not unique.
    """
    benchmarks = ["math500", "aime2024", "aime2025"]
    methods_to_show = ["rollout", "thinkbrake"]

    # Benchmark-level records of the benchmarks covered by this table.
    df_ext = df_data[
        (df_data["benchmark"].isin(benchmarks)) & (df_data["sub_category"].isna())
    ].copy()

    if model_order is None:
        all_models = df_ext["model"].unique()
    else:
        available_models = set(df_ext["model"].unique())
        all_models = [m for m in model_order if m in available_models]

    rows = []

    for model in all_models:
        model_display = MODEL_DISPLAY_NAMES.get(model, model)
        model_data = df_ext[df_ext["model"] == model]
        threshold = MODEL_THRESHOLDS.get(model, DEFAULT_THRESHOLD)

        first_row_for_model = True
        for method in methods_to_show:
            method_data = model_data[model_data["method"] == method]
            if method == "thinkbrake":
                method_data = method_data[method_data["threshold"] == threshold]

            if len(method_data) == 0:
                continue

            row = {
                "Model": model_display if first_row_for_model else "",
                "Method": METHOD_DISPLAY.get(method, method),
            }
            first_row_for_model = False

            for bench in benchmarks:
                bench_row = method_data[method_data["benchmark"] == bench]

                # pass@k is a dict like {"1": 68.75, "5": 82.55}; anything else
                # (NaN, missing column, no record) renders as a dash.
                pass_k = _first_cell_or_none(bench_row, "pass@k")
                if isinstance(pass_k, dict) and "5" in pass_k:
                    row[f"{bench}_pass5"] = f"{pass_k['5']:.1f}"
                else:
                    row[f"{bench}_pass5"] = "–"

                maj = _first_cell_or_none(bench_row, "majority_accuracy")
                if maj is not None and pd.notna(maj):
                    row[f"{bench}_maj8"] = f"{maj:.1f}"
                else:
                    row[f"{bench}_maj8"] = "–"

            rows.append(row)

    result_df = pd.DataFrame(rows)

    # Rename columns
    rename_map = {
        "math500_pass5": "MATH500 pass@5",
        "math500_maj8": "maj@8",
        "aime2024_pass5": "AIME24 pass@5",
        "aime2024_maj8": "maj@8",
        "aime2025_pass5": "AIME25 pass@5",
        "aime2025_maj8": "maj@8",
    }
    result_df = result_df.rename(columns=rename_map)

    return result_df


# Build Table 3 from all valid records (the function does its own benchmark
# filtering) and render it under its heading.
extended_table = create_extended_metrics_table(df, model_order=MODEL_ORDER)
display(
    Markdown("### Table 3: Extended Metrics (pass@5, majority@8)"),
    extended_table,
)

### Table 3: Extended Metrics (pass@5, majority@8)

Unnamed: 0,Model,Method,MATH500 pass@5,maj@8,AIME24 pass@5,maj@8.1,AIME25 pass@5,maj@8.2
0,Qwen3-4B-Thinking,(baseline),98.3,97.2,86.2,86.7,84.1,80.0
1,,+THINKBRAKE,–,–,–,–,–,–
2,Qwen3-4B,(baseline),98.7,97.2,82.6,80.0,77.0,73.3
3,,+THINKBRAKE,98.3,96.8,80.4,76.7,74.6,66.7
4,Qwen3-14B,(baseline),99.2,97.8,82.1,80.0,79.6,76.7
5,,+THINKBRAKE,98.5,97.6,85.4,83.3,76.5,76.7
6,Qwen3-32B,(baseline),99.2,98.2,88.6,86.7,87.1,80.0
7,,+THINKBRAKE,–,–,87.5,83.3,85.1,83.3
8,DeepSeek-R1-7B,(baseline),97.9,96.6,76.1,76.7,61.4,50.0
9,,+THINKBRAKE,96.0,93.4,74.1,70.0,51.2,46.7


## Threshold Analysis for ThinkBrake

In [9]:
# Benchmark -> coarse category used to aggregate the threshold sweep.
PARENT_CATEGORIES = {
    **dict.fromkeys(["gsm8k", "math500", "aime2024", "aime2025"], "math"),
    **dict.fromkeys(["gpqa-diamond", "arc-challenge"], "general"),
    **dict.fromkeys(["bfcl-v1", "bfcl-v2", "meta-tool"], "tool"),
}

# ThinkBrake records at benchmark level (no sub_category), at every threshold.
is_thinkbrake = df["method"] == "thinkbrake"
is_benchmark_level = df["sub_category"].isna()
df_tb = df.loc[is_thinkbrake & is_benchmark_level].copy()

# Benchmarks without an entry in PARENT_CATEGORIES get a NaN category.
df_tb["category"] = df_tb["benchmark"].map(PARENT_CATEGORIES)

# Mean accuracy per (model, threshold) row and category column.
threshold_analysis = pd.pivot_table(
    df_tb,
    index=["model", "threshold"],
    columns="category",
    values="accuracy",
    aggfunc="mean",
)

print("ThinkBrake Threshold Analysis by Category (Average Accuracy)")
styled_threshold_analysis = threshold_analysis.style.format(
    "{:.2f}"
).background_gradient(axis=0)
display(styled_threshold_analysis)

ThinkBrake Threshold Analysis by Category (Average Accuracy)


Unnamed: 0_level_0,category,general,math,tool
model,threshold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Qwen_Qwen3-14B,0.1,78.17,83.92,80.22
Qwen_Qwen3-14B,0.25,76.19,83.97,79.25
Qwen_Qwen3-14B,1.0,76.02,83.7,79.62
Qwen_Qwen3-14B,2.5,76.02,86.42,79.99
Qwen_Qwen3-14B,5.0,52.53,82.84,
Qwen_Qwen3-32B,0.1,78.35,84.78,80.63
Qwen_Qwen3-32B,0.25,76.75,83.94,80.02
Qwen_Qwen3-32B,1.0,79.03,85.4,80.25
Qwen_Qwen3-32B,2.5,76.2,88.38,80.04
Qwen_Qwen3-4B,0.1,74.71,78.71,82.48
