# ThinkBrake: Threshold Sensitivity Experiment

In [1]:
import json
import pandas as pd
import seaborn as sns
from pathlib import Path

# Apply seaborn's "whitegrid" theme globally.
sns.set_theme(style="whitegrid")

# JSON Lines metrics file (one JSON record per line), resolved relative to the
# notebook's working directory.
METRICS_FILE = Path("../metrics_summary.jsonl")
# Models to process. A record matches when its "model" field equals the
# identifier verbatim or with "/" replaced by "_".
INCLUDE_MODELS = [
    "Qwen/Qwen3-4B-Thinking-2507",
    # "Qwen/Qwen3-4B",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "microsoft/phi-4-reasoning",
]
# Benchmark to analyse; compared against each record's "benchmark" field.
# CATEGORY = "dapo-math"
CATEGORY = "gsm8k-val"
# ThinkBrake thresholds kept in the analysis; records with any other threshold
# are filtered out.
INCLUDE_THRESHOLDS = [0.1, 0.25, 1.0, 2.5]

In [2]:
def load_metrics(file_path):
    """Parse a JSON Lines file into a list of records.

    Args:
        file_path: ``pathlib.Path`` pointing at the file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.
        If the file does not exist, a warning is printed and an empty list is
        returned.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    if file_path.exists():
        with open(file_path, "r", encoding="utf-8") as handle:
            # Whitespace-only lines are skipped rather than parsed.
            return [json.loads(raw_line) for raw_line in handle if raw_line.strip()]

    print(f"Warning: {file_path} not found.")
    return []


# Load every record once; the cells below filter this list per model/benchmark.
all_metrics = load_metrics(METRICS_FILE)
print(f"Total records loaded: {len(all_metrics)}")

Total records loaded: 1039


In [3]:
def _selected_threshold(raw_threshold, thresholds, tol=1e-6):
    """Return ``raw_threshold`` as a float if it is within ``tol`` of an entry in ``thresholds``, else None."""
    try:
        value = float(raw_threshold)
    except (ValueError, TypeError):
        return None
    if any(abs(value - t) < tol for t in thresholds):
        return value
    return None


def process_model_data(model_name, benchmark, all_metrics, include_thresholds=None):
    """Build the per-threshold ThinkBrake table for one model on one benchmark.

    Args:
        model_name: Model identifier. A record matches when its "model" field
            equals this string verbatim or with "/" replaced by "_".
        benchmark: Value the record's "benchmark" field must equal.
        all_metrics: Iterable of metric records (dicts).
        include_thresholds: Thresholds to keep. Defaults to the module-level
            ``INCLUDE_THRESHOLDS`` when omitted.

    Returns:
        ``(df, baseline_entry)``. ``df`` is a DataFrame with one row per
        selected "thinkbrake" record, sorted by threshold (empty if none).
        ``baseline_entry`` is the first matching record whose "method" is
        "rollout", or None.

    Notes:
        * The token range used for ``token_efficiency`` spans every matching
          record that has a non-None ``avg_token_length`` and is either the
          rollout baseline or carries a selected threshold.
        * When that range is degenerate (max == min) efficiency is 1.0, and
          ratios with a zero denominator yield 0 instead of raising, matching
          the guards used by the summary-table cell.
        * Records whose fields cannot be used numerically (e.g. None) are
          skipped.
    """
    if include_thresholds is None:
        include_thresholds = INCLUDE_THRESHOLDS

    accepted_names = (model_name, model_name.replace("/", "_"))
    model_metrics = [
        r
        for r in all_metrics
        if r.get("model") in accepted_names and r.get("benchmark") == benchmark
    ]

    # The None check must apply to rollout records too; otherwise a baseline
    # without a token length reaches min()/max() and raises TypeError.
    all_token_counts = [
        r["avg_token_length"]
        for r in model_metrics
        if r.get("avg_token_length") is not None
        and (
            r.get("method") == "rollout"
            or _selected_threshold(r.get("threshold"), include_thresholds) is not None
        )
    ]

    global_min_tokens = min(all_token_counts, default=0)
    global_max_tokens = max(all_token_counts, default=0)
    token_span = global_max_tokens - global_min_tokens

    baseline_entry = next(
        (r for r in model_metrics if r.get("method") == "rollout"), None
    )
    # A missing or None baseline length is treated as "no baseline tokens".
    base_tokens = (baseline_entry.get("avg_token_length") or 0) if baseline_entry else 0

    plot_data = []
    for r in model_metrics:
        if r.get("method") != "thinkbrake":
            continue
        t_val = _selected_threshold(r.get("threshold"), include_thresholds)
        if t_val is None:
            continue

        try:
            accuracy = r.get("accuracy", 0)
            avg_tok_len = r.get("avg_token_length", 0)

            token_reduction = None
            if base_tokens > 0:
                token_reduction = (base_tokens - avg_tok_len) / base_tokens * 100

            if token_span > 0:
                token_efficiency = 1 - (avg_tok_len - global_min_tokens) / token_span
            else:
                token_efficiency = 1.0

            e3_score = accuracy * accuracy / avg_tok_len if avg_tok_len > 0 else 0

            # NOTE(review): the rendered tables show accuracy on a 0-100 scale
            # while token_efficiency lies in [0, 1], so this harmonic mean is
            # dominated by efficiency -- confirm the intended scaling.
            harmonic_denominator = accuracy + token_efficiency
            overthinking_score = (
                (2 * accuracy * token_efficiency) / harmonic_denominator
                if harmonic_denominator
                else 0.0
            )
        except (ValueError, TypeError):
            # Best effort: skip records with non-numeric fields.
            continue

        plot_data.append(
            {
                "threshold": t_val,
                "accuracy": accuracy,
                "majority_accuracy": r.get("majority_accuracy"),
                "avg_token_length": avg_tok_len,
                "min_token_length": global_min_tokens,
                "max_token_length": global_max_tokens,
                "token_reduction": token_reduction,
                "e3_score": e3_score,
                "token_efficiency": token_efficiency,
                "overthinking_score": overthinking_score,
            }
        )

    df = pd.DataFrame(plot_data)
    if not df.empty:
        df = df.sort_values("threshold")

    return df, baseline_entry

In [4]:
# Collect the ThinkBrake table and rollout baseline for every configured model,
# keyed by the full model identifier.
processed_data = {}
for model in INCLUDE_MODELS:
    df, baseline = process_model_data(model, CATEGORY, all_metrics)

    # Nothing usable for this model: no ThinkBrake rows and no baseline.
    if df.empty and not baseline:
        print(f"No data found for {model}")
        continue

    processed_data[model] = {"df": df, "baseline": baseline}
    print(
        f"Processed {model}: {len(df)} ThinkBrake points, Baseline found: {baseline is not None}"
    )

Processed Qwen/Qwen3-4B-Thinking-2507: 4 ThinkBrake points, Baseline found: True
Processed deepseek-ai/DeepSeek-R1-Distill-Qwen-7B: 4 ThinkBrake points, Baseline found: True
Processed microsoft/phi-4-reasoning: 4 ThinkBrake points, Baseline found: True


In [8]:
# Render one summary table per processed model: an optional baseline row
# (threshold shown as "--") followed by one row per ThinkBrake threshold.
for model, data in processed_data.items():
    df, baseline = data["df"], data["baseline"]
    display_name = model.split("/")[-1]
    summary_list = []

    if baseline:
        # Token range comes from the first ThinkBrake row when there is one;
        # otherwise both ends collapse to the baseline's own average length.
        if df.empty:
            g_min = baseline.get("avg_token_length", 0)
            g_max = baseline.get("avg_token_length", 0)
        else:
            g_min = df["min_token_length"].iloc[0]
            g_max = df["max_token_length"].iloc[0]

        b_acc = baseline.get("accuracy", 0)
        b_len = baseline.get("avg_token_length", 0)

        # b_eff only feeds the commented-out "Efficiency" column below.
        if (g_max - g_min) > 0:
            b_eff = 1 - (b_len - g_min) / (g_max - g_min)
        else:
            b_eff = 1.0

        summary_list.append(
            {
                "Model": display_name,
                "Threshold": "--",
                "Accuracy": b_acc,
                "Avg Tokens": b_len,
                "E3 Score": (b_acc**2) / b_len if b_len > 0 else 0,
                # "Efficiency": b_eff,
                "Overthinking": "--",
            }
        )

    for _, row in df.iterrows():
        summary_list.append(
            {
                "Model": display_name,
                "Threshold": row["threshold"],
                "Accuracy": row["accuracy"],
                "Avg Tokens": row["avg_token_length"],
                "E3 Score": row["e3_score"],
                # "Efficiency": row["token_efficiency"],
                "Overthinking": row["overthinking_score"],
            }
        )

    summary_df = pd.DataFrame(summary_list)
    # Show every row, with floats rendered to two decimals.
    pd.set_option("display.max_rows", None)
    pd.set_option("display.float_format", "{:.2f}".format)
    display(summary_df)

Unnamed: 0,Model,Threshold,Accuracy,Avg Tokens,E3 Score,Overthinking
0,Qwen3-4B-Thinking-2507,--,95.4,1461.39,6.23,--
1,Qwen3-4B-Thinking-2507,0.10,94.2,1213.12,7.31,1.41
2,Qwen3-4B-Thinking-2507,0.25,94.8,1185.24,7.58,1.56
3,Qwen3-4B-Thinking-2507,1.00,94.3,1169.8,7.6,1.65
4,Qwen3-4B-Thinking-2507,2.50,94.0,1111.01,7.95,1.98


Unnamed: 0,Model,Threshold,Accuracy,Avg Tokens,E3 Score,Overthinking
0,DeepSeek-R1-Distill-Qwen-7B,--,93.5,1699.73,5.14,--
1,DeepSeek-R1-Distill-Qwen-7B,0.10,92.0,1331.04,6.36,1.14
2,DeepSeek-R1-Distill-Qwen-7B,0.25,92.5,1303.9,6.56,1.22
3,DeepSeek-R1-Distill-Qwen-7B,1.00,94.0,1159.68,7.62,1.66
4,DeepSeek-R1-Distill-Qwen-7B,2.50,91.5,1054.55,7.94,1.98


Unnamed: 0,Model,Threshold,Accuracy,Avg Tokens,E3 Score,Overthinking
0,phi-4-reasoning,--,92.0,1305.65,6.48,--
1,phi-4-reasoning,0.10,92.0,1196.16,7.08,0.77
2,phi-4-reasoning,0.25,92.0,1095.12,7.73,1.48
3,phi-4-reasoning,1.00,91.0,1022.79,8.1,1.98
4,phi-4-reasoning,2.50,91.0,1161.12,7.13,1.02
