# Analysis

#### Original parsing

In [9]:
import re
from fractions import Fraction
from typing import Optional


def extract_final_numeric_value(text: str) -> Optional[Fraction]:
    """
    Extract the final numeric value from an answer string.

    Heuristics (matches paper code):
    - Prefer a '#### <answer>' marker if present (GSM8K style)
    - Otherwise, take the last occurrence of a fraction or number
    - Supports integers, decimals, and fractions
    Returns a Fraction if parseable, else None.
    """
    if not text:
        return None

    candidate = None

    # 1. Prefer GSM8K-style final marker
    m = re.search(r"####\s*([^\n#]+)", text)
    if m:
        candidate = m.group(1).strip()
    else:
        # 2. Last fraction (a/b)
        frac_matches = list(
            re.finditer(r"-?\d[\d,]*\s*/\s*-?\d[\d,]*", text)
        )
        if frac_matches:
            candidate = frac_matches[-1].group(0)
        else:
            # 3. Last number (int or decimal)
            num_matches = list(
                re.finditer(r"-?\d[\d,]*(?:\.\d+)?", text)
            )
            if num_matches:
                candidate = num_matches[-1].group(0)

    if candidate is None:
        return None

    candidate = candidate.strip()

    # Strip trailing words / units (e.g., "7 dozen")
    m = re.match(
        r"(-?\d[\d,]*(?:\.\d+)?(?:\s*/\s*-?\d[\d,]*)?)",
        candidate,
    )
    if m:
        candidate = m.group(1)

    # Remove commas
    candidate = candidate.replace(",", "")

    # Parse into Fraction
    try:
        if "/" in candidate:
            a, b = [p.strip() for p in candidate.split("/", 1)]
            return Fraction(a) / Fraction(b)
        return Fraction(candidate)
    except Exception:
        return None


def local_numeric_compare(model_answer: str, correct_answer: str) -> bool:
    """
    Deterministically compare final numeric answers.
    This is exactly the fallback used in the paper.
    """
    gold = extract_final_numeric_value(correct_answer)
    pred = extract_final_numeric_value(model_answer)

    if gold is None or pred is None:
        return False

    return gold == pred


def evaluate_answer_parsing_only(model_answer: str, correct_answer: str):
    """
    Judge-free evaluation using numeric parsing only.
    Mirrors the paper's fallback behavior.
    """
    is_correct = local_numeric_compare(model_answer, correct_answer)
    response = "YES" if is_correct else "NO"
    return is_correct, response

In [None]:
import json
from pathlib import Path
from collections import defaultdict

# Path to results
path = Path("../results/rolling_qwen2.5-math-1.5b_gsm8k.jsonl")

models = ["base", "thinking", "hybrid"]

num_correct = defaultdict(int)
num_total = defaultdict(int)

num_seen = 0

with path.open() as f:
    for line in f:
        line = line.strip()
        if not line:
            continue

        data = json.loads(line)
        num_seen += 1

        gold_answer = data.get("gold_answer", "")
        answers = data.get("answers", {})

        for model in models:
            if model not in answers:
                continue

            model_answer = answers[model]

            is_correct, _ = evaluate_answer_parsing_only(
                model_answer=model_answer,
                correct_answer=gold_answer,
            )

            num_total[model] += 1
            if is_correct:
                num_correct[model] += 1

# ---- Report results ----
print(f"Evaluated on {num_seen} GSM8K examples (numeric parsing only)\n")

for model in models:
    total = num_total[model]
    correct = num_correct[model]
    acc = correct / total if total > 0 else 0.0

    print(
        f"{model:8s} | "
        f"accuracy = {acc:.4f} "
        f"({correct}/{total})"
    )


Evaluated on 500 GSM8K examples (numeric parsing only)

base     | accuracy = 0.6640 (332/500)
thinking | accuracy = 0.6340 (317/500)
hybrid   | accuracy = 0.5180 (259/500)


### Save original parser results

In [None]:
import pandas as pd

rows = []

with path.open() as f:
    for line_idx, line in enumerate(f):
        line = line.strip()
        if not line:
            continue

        data = json.loads(line)

        question = data.get("question", "")
        gold_answer = data.get("gold_answer", "")
        answers = data.get("answers", {})

        for model in models:
            if model not in answers:
                continue

            model_answer = answers[model]

            gold_parsed = extract_final_numeric_value(gold_answer)
            pred_parsed = extract_final_numeric_value(model_answer)

            is_correct = (
                gold_parsed is not None
                and pred_parsed is not None
                and gold_parsed == pred_parsed
            )

            rows.append({
                "line_idx": line_idx,
                "model": model,

                # Raw text
                "question": question,
                "gold_answer": gold_answer,
                "model_answer": model_answer,

                # Parsing outputs
                "gold_parsed": str(gold_parsed) if gold_parsed is not None else None,
                "pred_parsed": str(pred_parsed) if pred_parsed is not None else None,
                "is_correct": is_correct,

                # Metadata
                "parser": "original_numeric_parsing",
            })

full_df = pd.DataFrame(rows)

out_path = Path("../results/full_outputs_original_parsing.csv")
full_df.to_csv(out_path, index=False)

print(f"Saved full responses to {out_path}")

Saved full responses to results/full_outputs_original_parsing.csv


#### New parser

In [4]:
import re
from fractions import Fraction
from typing import Optional


def extract_final_numeric_value_extended(text: str) -> Optional[Fraction]:
    """
    Extended numeric answer extraction.

    Order of precedence:
    1. GSM8K '#### <answer>' marker
    2. LaTeX boxed answers: \\boxed{...}
    3. Explicit final-answer phrases
    4. Fallback: last numeric value (original heuristic)

    Returns a Fraction if parseable, else None.
    """
    if not text:
        return None

    # ---------- 1. GSM8K final marker ----------
    m = re.search(r"####\s*([^\n#]+)", text)
    if m:
        return _parse_number(m.group(1))

    # ---------- 2. LaTeX boxed answers ----------
    boxed = re.findall(r"\\boxed\{([^}]+)\}", text)
    if boxed:
        return _parse_number(boxed[-1])

    # ---------- 3. Final-answer cue phrases ----------
    final_patterns = [
        r"final answer[:\s]*([^\n]+)",
        r"the answer is[:\s]*([^\n]+)",
        r"therefore[,:\s]*([^\n]+)",
    ]

    for pat in final_patterns:
        matches = re.findall(pat, text, flags=re.IGNORECASE)
        if matches:
            val = _parse_number(matches[-1])
            if val is not None:
                return val

    # ---------- 4. Original fallback (last number) ----------
    return _extract_last_number(text)


def _parse_number(candidate: str) -> Optional[Fraction]:
    """Parse a numeric string into a Fraction."""
    if not candidate:
        return None

    candidate = candidate.strip()
    candidate = candidate.replace(",", "")
    candidate = candidate.replace("%", "")

    m = re.match(
        r"(-?\d+(?:\.\d+)?(?:\s*/\s*-?\d+)?)",
        candidate,
    )
    if not m:
        return None

    try:
        token = m.group(1)
        if "/" in token:
            a, b = token.split("/", 1)
            return Fraction(a.strip()) / Fraction(b.strip())
        return Fraction(token)
    except Exception:
        return None


def _extract_last_number(text: str) -> Optional[Fraction]:
    """Original last-number fallback."""
    nums = re.findall(r"-?\d+(?:\.\d+)?", text.replace(",", ""))
    if not nums:
        return None
    try:
        return Fraction(nums[-1])
    except Exception:
        return None


In [None]:
import json
from pathlib import Path
from collections import defaultdict

# Path to results
path = Path("../results/rolling_qwen2.5-math-1.5b_gsm8k.jsonl")

models = ["base", "thinking", "hybrid"]

num_correct = defaultdict(int)
num_total = defaultdict(int)

num_seen = 0

with path.open() as f:
    for line in f:
        line = line.strip()
        if not line:
            continue

        data = json.loads(line)
        num_seen += 1

        gold_answer = data.get("gold_answer", "")
        answers = data.get("answers", {})

        for model in models:
            if model not in answers:
                continue

            model_answer = answers[model]

            gold = extract_final_numeric_value_extended(gold_answer)
            pred = extract_final_numeric_value_extended(model_answer)

            is_correct = (
                gold is not None
                and pred is not None
                and gold == pred
            )

            num_total[model] += 1
            if is_correct:
                num_correct[model] += 1

# ---- Report results ----
print(f"Evaluated on {num_seen} GSM8K examples (extended numeric parsing)\n")

for model in models:
    total = num_total[model]
    correct = num_correct[model]
    acc = correct / total if total > 0 else 0.0

    print(
        f"{model:8s} | "
        f"accuracy = {acc:.4f} "
        f"({correct}/{total})"
    )

Evaluated on 500 GSM8K examples (extended numeric parsing)

base     | accuracy = 0.7840 (392/500)
thinking | accuracy = 0.7020 (351/500)
hybrid   | accuracy = 0.5940 (297/500)


#### Save the new parsing results

In [None]:
import pandas as pd

rows = []

with path.open() as f:
    for line_idx, line in enumerate(f):
        line = line.strip()
        if not line:
            continue

        data = json.loads(line)

        question = data.get("question", "")
        gold_answer = data.get("gold_answer", "")
        answers = data.get("answers", {})

        for model in models:
            if model not in answers:
                continue

            model_answer = answers[model]

            gold_parsed = extract_final_numeric_value_extended(gold_answer)
            pred_parsed = extract_final_numeric_value_extended(model_answer)

            is_correct = (
                gold_parsed is not None
                and pred_parsed is not None
                and gold_parsed == pred_parsed
            )

            rows.append({
                "line_idx": line_idx,
                "model": model,

                # Raw text
                "question": question,
                "gold_answer": gold_answer,
                "model_answer": model_answer,

                # Parsing outputs
                "gold_parsed": str(gold_parsed) if gold_parsed is not None else None,
                "pred_parsed": str(pred_parsed) if pred_parsed is not None else None,
                "is_correct": is_correct,

                # Metadata
                "parser": "extended_numeric_parsing",
            })

full_df_ext = pd.DataFrame(rows)

out_path = Path("../results/full_outputs_extended_parsing.csv")
full_df_ext.to_csv(out_path, index=False)

print(f"Saved extended parsing results to {out_path}")

Saved extended parsing results to results/full_outputs_extended_parsing.csv
