In [1]:
import json, os, re, pandas as pd
from collections import Counter


In [2]:
NUM_RE = re.compile(r"[+-]?\d+(?:/\d+)?")

In [3]:
def _latex_to_plain(s: str) -> str:
    s = re.sub(r"\\boxed{([^}]*)}", r"\1", s)                           # \boxed{…}
    s = re.sub(r"\\(?:d)?frac{([^}]*)}{([^}]*)}", r"\1/\2", s)          # \frac{a}{b}
    return s.replace(",", "").lstrip("$€£ ").strip()

In [4]:
def extract_numeric(text: str | None):
    """Return final numeric token - tries <answer>… first, else last number."""
    if not text:
        return None
    # closed tag
    m = re.search(r"<answer>(.*?)</answer>", text, re.I | re.S)
    if m and (n := NUM_RE.search(_latex_to_plain(m.group(1)))):
        return n.group(0)
    # open tag
    m = re.search(r"<answer>(.*)$", text, re.I | re.S)
    if m and (n := NUM_RE.search(_latex_to_plain(m.group(1)))):
        return n.group(0)
    # number before </answer>
    m = re.search(r"([+-]?\d+(?:/\d+)?)\s*</answer>", text, re.I | re.S)
    if m:
        return _latex_to_plain(m.group(1))
    # fallback
    nums = NUM_RE.findall(_latex_to_plain(text))
    return nums[-1] if nums else None

In [5]:
def _load_jsonl(path, tolerant=False):
    bad = 0; items = []
    with open(path, encoding="utf-8") as f:
        for ln_no, ln in enumerate(f, 1):
            ln = ln.strip()
            if not ln:
                continue
            try:
                items.append(json.loads(ln))
            except json.JSONDecodeError as e:
                bad += 1
                if not tolerant:
                    raise RuntimeError(f"{path}:{ln_no}\n{e}")
    if bad:
        print(f"[warn] skipped {bad} malformed lines")
    return items


In [6]:
def _load_jsonflex(path):
    """Accepts JSON-Lines, JSON array, or known HuggingFace aliases."""
    path = str(path)
    if os.path.exists(path):
        with open(path, encoding='utf-8') as f:
            first = f.read(1)
            f.seek(0)
            if first == '[':
                return json.load(f)
            items = []
            for ln in f:
                ln = ln.strip()
                if ln:
                    try:
                        items.append(json.loads(ln))
                    except json.JSONDecodeError:
                        pass
            return items
    alias = path.lower()
    if alias in {'math500', 'math-500', 'huggingfaceh4/math-500'}:
        try:
            from datasets import load_dataset
        except ImportError as exc:
            raise ImportError('Install `datasets` to load HuggingFace benchmarks.') from exc
        ds = load_dataset('HuggingFaceH4/MATH-500', split='test')
        records = []
        for row in ds:
            solution = row.get('solution') or row.get('answer')
            records.append({
                'problem': row.get('problem') or row.get('question'),
                'solution': solution,
                'answer': solution,
                'id': row.get('problem_id') or row.get('id'),
            })
        return records
    raise FileNotFoundError(f"Cannot load benchmark from '{path}'.")



In [7]:
def evaluate(pred_path: str, gold_path: str, show_errors: int = 5) -> float:
    pred, gold = _load_jsonflex(pred_path), _load_jsonflex(gold_path)
    if len(pred) != len(gold):
        print(f"[warn] len(pred)={len(pred)} ≠ len(gold)={len(gold)}")
    rows, correct = [], 0
    for i, (p, g) in enumerate(zip(pred, gold)):
        p_ans = extract_numeric(p.get("raw") or p.get("prediction"))
        g_ans = extract_numeric(g.get("answer") or g.get("solution"))
        ok = (p_ans == g_ans)
        correct += ok
        if not ok and len(rows) < show_errors:
            rows.append({"idx": i, "pred": p_ans, "gold": g_ans})
    acc = correct / len(pred) if pred else 0.0
    print(f"Accuracy: {correct}/{len(pred)}  ({acc*100:.2f}%)")
    if rows:
        display(pd.DataFrame(rows))
    return acc

In [9]:
evaluate("results/cot/math500_COT_20250905_124208.jsonl",  "HuggingFaceH4/MATH-500")

  from .autonotebook import tqdm as notebook_tqdm


Accuracy: 154/500  (30.80%)


Unnamed: 0,idx,pred,gold
0,0,3,2.0
1,1,1,3.0
2,2,14/3,3.0
3,3,9,196.0
4,4,4,


0.308

In [10]:
evaluate("results/static/math500_STATIC_COT_20250905_165844.jsonl",  "HuggingFaceH4/MATH-500")

Accuracy: 164/500  (32.80%)


Unnamed: 0,idx,pred,gold
0,1,1,3.0
1,2,14/3,3.0
2,3,9,196.0
3,4,4,
4,5,42,3.0


0.328