# Batch Evaluation Notebook
This notebook runs generation over the whole sample set (using the `fallback` backend as a proxy for a real model), computes ROUGE and BERTScore for each output against a deterministic reference (the fallback template output), and writes the per-row metrics to a CSV file.

In [None]:
!pip install -r ../requirements.txt
!pip install rouge-score bert-score


In [None]:
import pandas as pd
from app.utils import row_to_prompt, fallback_template
from app.llm_interface import generate_text
from rouge_score import rouge_scorer
from bert_score import score as bertscore

# Evaluate every row of the sample set and write per-row metrics to a CSV.
#
# ROUGE is cheap and is scored inside the loop. BERTScore is scored once for
# the whole batch after the loop: calling bertscore() per row repeats its
# model setup for every row, which dominates the runtime on larger inputs.
df = pd.read_csv('data/sample_stats.csv')
rows = []
scorer = rouge_scorer.RougeScorer(['rouge1','rougeL'], use_stemmer=True)

for i, r in df.iterrows():
    prompt = row_to_prompt(r.to_dict(), 'date','indicator','value','region')
    # The reference is the deterministic template output; the candidate comes
    # from the generation interface using the fallback backend.
    reference = fallback_template(prompt)
    candidate = generate_text(prompt, backend='fallback')
    rouge = scorer.score(reference, candidate)
    rows.append({'row_index': int(i), 'reference': reference, 'candidate': candidate, 'rouge1_f': rouge['rouge1'].fmeasure, 'rougeL_f': rouge['rougeL'].fmeasure})

# Single batched BERTScore call; skipped for an empty input because there is
# nothing to score.
if rows:
    P, R, F = bertscore([row['candidate'] for row in rows], [row['reference'] for row in rows], lang='en', rescale_with_baseline=True)
    # F holds one F1 value per (candidate, reference) pair, in input order.
    for row, f1 in zip(rows, F):
        row['bert_f1'] = float(f1)

# Fix the column order explicitly so the CSV always carries its header,
# including when the sample set has no rows.
out_df = pd.DataFrame(rows, columns=['row_index', 'reference', 'candidate', 'rouge1_f', 'rougeL_f', 'bert_f1'])
out_df.to_csv('data/eval_metrics.csv', index=False)
print('Wrote data/eval_metrics.csv')


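The CSV `data/eval_metrics.csv` contains one row per input sample with the columns `row_index`, `reference`, `candidate`, `rouge1_f`, `rougeL_f` (ROUGE-1 and ROUGE-L F-measures) and `bert_f1` (BERTScore F1).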