# ⚡ BlazeMetrics Benchmarking & Comparative Analysis

This notebook compares BlazeMetrics against popular Python implementations where available.
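- Baselines timed when installed: ROUGE (`rouge-score`), BLEU (`nltk`), chrF (`sacrebleu`), METEOR (`nltk`), WER (`jiwer`)
- BERTScore (`bert-score`) and MoverScore (`moverscore_v2`) are imported when available, but for these two metrics only the BlazeMetrics kernels are timed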
- Cleanly skips baselines when dependencies are missing
- Reports timings (min/avg over repeats) and exports CSV, Markdown, and JSON



In [None]:
import time, random, os, json
from typing import List, Dict, Any
import numpy as np

# Optional baselines (best-effort imports)
try:
    import nltk
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
    from nltk.translate.meteor_score import meteor_score as nltk_meteor
except Exception:
    nltk = None

try:
    import sacrebleu
except Exception:
    sacrebleu = None

try:
    from rouge_score import rouge_scorer
except Exception:
    rouge_scorer = None

try:
    import jiwer
except Exception:
    jiwer = None

try:
    from bert_score import score as bertscore_score
except Exception:
    bertscore_score = None

try:
    from moverscore_v2 import get_idf_dict, word_mover_score
except Exception:
    word_mover_score = None

from blazemetrics import (
    rouge_score as rg_rouge,
    bleu as rg_bleu,
    chrf_score as rg_chrf,
    meteor as rg_meteor,
    wer as rg_wer,
    bert_score_similarity as rg_bertsim,
    moverscore_greedy as rg_moverscore,
    compute_text_metrics as rg_compute_all,
)

print("✅ Imports complete (missing baselines will be skipped)")


In [None]:
def gen_corpus(n=1000, vocab=500, seed=123):
    """Build a reproducible synthetic corpus of candidate/reference sentences.

    Each sentence is a space-joined run of 5 to 20 tokens of the form
    ``t<k>`` with ``k`` drawn uniformly from ``1..vocab``. All randomness
    comes from a private ``random.Random(seed)``, so the same arguments
    always yield the same corpus.

    Args:
        n: Number of candidates (and of reference lists) to generate.
        vocab: Largest token id that may appear in a sentence.
        seed: Seed for the private random generator.

    Returns:
        A ``(candidates, references)`` tuple where ``candidates`` is a list
        of ``n`` strings and ``references`` is a list of ``n`` one-element
        lists of strings.
    """
    rng = random.Random(seed)

    def sentence(min_len=5, max_len=20):
        # Draw the length first, then one token id per position.
        tokens = []
        for _ in range(rng.randint(min_len, max_len)):
            tokens.append(f"t{rng.randint(1, vocab)}")
        return " ".join(tokens)

    # All candidates are drawn before any reference so the random stream is
    # consumed in a fixed order.
    candidates = []
    for _ in range(n):
        candidates.append(sentence())

    references = []
    for _ in range(n):
        references.append([sentence()])

    return candidates, references

# Build the synthetic corpus with gen_corpus's default arguments.
cands, refs = gen_corpus()
_n_cands, _n_refs = len(cands), len(refs)
print(f"Generated corpus: {_n_cands} candidates, {_n_refs} references")

# Random float32 matrices standing in for embeddings in the blazemetrics
# BERT similarity demo; the global NumPy seed keeps them reproducible.
np.random.seed(42)
_emb_shape = (128, 768)
cand_emb = np.random.rand(*_emb_shape).astype(np.float32)
ref_emb = np.random.rand(*_emb_shape).astype(np.float32)


In [None]:
def timeit(fn, repeat=3, warmup=1):
    """Time a zero-argument callable with ``time.perf_counter``.

    Args:
        fn: Callable invoked with no arguments; its return value is ignored.
        repeat: Number of timed invocations.
        warmup: Number of untimed invocations performed first.

    Returns:
        A ``(fastest, mean)`` tuple of wall-clock seconds over the timed
        invocations.

    Raises:
        ValueError: If ``repeat`` yields no timed runs (``min`` of an empty
            list).
    """
    # Untimed warm-up calls.
    for _ in range(warmup):
        fn()

    durations = []
    for _ in range(repeat):
        started = time.perf_counter()
        fn()
        durations.append(time.perf_counter() - started)

    fastest = min(durations)
    mean = sum(durations) / len(durations)
    return fastest, mean

# Timing table: metric label -> list of (implementation, min seconds, avg seconds).
results: Dict[str, list[tuple[str, float, float]]] = {}

# ROUGE
# blazemetrics is timed first; the rouge-score baseline is added when installed.
min_t, avg_t = timeit(lambda: rg_rouge(cands, refs, score_type="rouge_n", n=1))
_rouge_rows = results.setdefault("ROUGE-1", [])
_rouge_rows.append(("blazemetrics", min_t, avg_t))
if rouge_scorer is not None:
    scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=False)

    def _py_rouge1():
        """Return the ROUGE-1 F-measure of each candidate against its first reference."""
        fmeasures = []
        for i, cand in enumerate(cands):
            pair_scores = scorer.score(refs[i][0], cand)
            fmeasures.append(pair_scores["rouge1"].fmeasure)
        return fmeasures

    min_t, avg_t = timeit(_py_rouge1)
    results["ROUGE-1"].append(("rouge-score", min_t, avg_t))

# BLEU
# blazemetrics is timed first; the NLTK sentence-level baseline is added when installed.
min_t, avg_t = timeit(lambda: rg_bleu(cands, refs))
_bleu_rows = results.setdefault("BLEU", [])
_bleu_rows.append(("blazemetrics", min_t, avg_t))
if nltk is not None:
    ch = SmoothingFunction().method1

    def _py_bleu():
        """Return the smoothed sentence BLEU of each candidate against all its references."""
        return [
            sentence_bleu(
                [ref.split() for ref in rlist],
                cand.split(),
                smoothing_function=ch,
            )
            for cand, rlist in zip(cands, refs)
        ]

    min_t, avg_t = timeit(_py_bleu)
    results["BLEU"].append(("nltk", min_t, avg_t))

# chrF
# blazemetrics is timed first so the metric always has at least one entry.
min_t, avg_t = timeit(lambda: rg_chrf(cands, refs))
results.setdefault("chrF", []).append(("blazemetrics", min_t, avg_t))
if sacrebleu is not None:
    def _py_chrf():
        """Return the sacrebleu corpus-level chrF score for the whole corpus."""
        # sacrebleu wants references as "streams": one sequence per reference
        # position, each aligned with the hypotheses. Transposing the
        # per-candidate reference lists yields that layout. The reshaping is
        # kept inside the timed call, as in the other baselines, which also
        # unpack `refs` while being timed.
        ref_streams = [list(stream) for stream in zip(*refs)]
        return sacrebleu.corpus_chrf(cands, ref_streams).score

    try:
        min_t, avg_t = timeit(_py_chrf)
    except Exception as exc:
        # Best-effort baseline: report why it was skipped instead of hiding
        # the failure, and keep the rest of the benchmark running.
        print(f"Skipping sacrebleu chrF baseline: {exc!r}")
    else:
        results["chrF"].append(("sacrebleu", min_t, avg_t))

# METEOR
# blazemetrics is timed first so the metric always has at least one entry.
min_t, avg_t = timeit(lambda: rg_meteor(cands, refs))
results.setdefault("METEOR", []).append(("blazemetrics", min_t, avg_t))
if nltk is not None:
    def _py_meteor():
        """Return the NLTK METEOR score of each candidate against its references."""
        # Current NLTK releases require pre-tokenized input: a list of token
        # lists for the references and a token list for the hypothesis.
        # Raw strings raise TypeError there.
        return [
            nltk_meteor([ref.split() for ref in rlist], cand.split())
            for cand, rlist in zip(cands, refs)
        ]

    try:
        min_t, avg_t = timeit(_py_meteor)
    except Exception as exc:
        # Best-effort baseline: e.g. missing WordNet data (LookupError) or an
        # NLTK release with a different input contract. Report and continue
        # so the remaining metrics are still benchmarked.
        print(f"Skipping nltk METEOR baseline: {exc!r}")
    else:
        results["METEOR"].append(("nltk", min_t, avg_t))

# WER
# blazemetrics is timed first; the jiwer baseline is added when installed.
min_t, avg_t = timeit(lambda: rg_wer(cands, refs))
_wer_rows = results.setdefault("WER", [])
_wer_rows.append(("blazemetrics", min_t, avg_t))
if jiwer is not None:
    def _py_wer():
        """Return the jiwer word error rate of each candidate against its first reference."""
        rates = []
        for hyp, ref_list in zip(cands, refs):
            rates.append(jiwer.wer(ref_list[0], hyp))
        return rates

    min_t, avg_t = timeit(_py_wer)
    results["WER"].append(("jiwer", min_t, avg_t))

# blazemetrics-only kernels, timed in order:
#   - BERT similarity (kernel) on the random embedding matrices
#   - MoverScore (greedy kernel) on the text corpus
for _label, _kernel in (
    ("BERT-sim", lambda: rg_bertsim(cand_emb, ref_emb)),
    ("MoverScore (greedy)", lambda: rg_moverscore(cands, refs)),
):
    min_t, avg_t = timeit(_kernel)
    results.setdefault(_label, []).append(("blazemetrics", min_t, avg_t))

print("✅ Benchmarks complete")
# Trailing bare expression so the notebook displays the timing table.
results


In [None]:
# Pretty print & optional export
import pandas as pd
from pathlib import Path

# Flatten the per-metric timing lists into one record per (metric, implementation).
rows = [
    {"metric": section, "impl": name, "t_min_s": tmin, "t_avg_s": tavg}
    for section, entries in results.items()
    for name, tmin, tavg in entries
]

# Order by metric, then fastest average first within each metric.
df = pd.DataFrame(rows)
df = df.sort_values(["metric", "t_avg_s"]).reset_index(drop=True)
df


In [None]:
# Save results (optional)
# Everything is written under ./bench_outputs, created on first use.
out_dir = Path("bench_outputs")
out_dir.mkdir(exist_ok=True)
csv_path = out_dir / "benchmark_results.csv"
md_path = out_dir / "benchmark_summary.md"
json_path = out_dir / "benchmark_results.json"

# CSV: the sorted timing table, without the index column.
df.to_csv(csv_path, index=False)

# Markdown: one heading per metric followed by one bullet per implementation.
groups = df.groupby("metric")
lines = ["# Benchmark Summary\n"]
for metric, g in groups:
    lines.append(f"\n## {metric}\n")
    lines.extend(
        f"- {impl}: min={t_min:.4f}s avg={t_avg:.4f}s"
        for impl, t_min, t_avg in zip(g["impl"], g["t_min_s"], g["t_avg_s"])
    )
md_path.write_text("\n".join(lines), encoding="utf-8")

# JSON: the raw results mapping; default=float converts any value json
# cannot serialize natively.
json_payload = json.dumps(results, default=float, indent=2)
json_path.write_text(json_payload, encoding="utf-8")

print(f"Saved: {csv_path}, {md_path}, {json_path}")
