In [None]:
# translation_evaluator.py

import time
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd
from sacrebleu import sentence_bleu, sentence_chrf, sentence_chrf2


class TranslationEvaluator:
    """
    - load_data(): CSV or Excel
    - evaluate(): computes per-model & per-metric columns (plus optional per-model timing)
    - get_detailed_results(): DataFrame with those columns
    - get_model_metrics(): dict of all metric means
    - get_response_times(): DataFrame of run_id vs avg_response_time
    """

    def __init__(self):
        self._data: pd.DataFrame | None = None
        self._detailed_results: pd.DataFrame = pd.DataFrame()
        self._model_metrics: dict[str, float] = {}
        self._response_times: pd.DataFrame = pd.DataFrame(
            [], columns=["run_id", "avg_response_time"]
        )

    def load_data(self, file_path: Union[str, Path]) -> None:
        p = Path(file_path)
        if p.suffix.lower() == ".csv":
            self._data = pd.read_csv(p)
        elif p.suffix.lower() in (".xls", ".xlsx"):
            self._data = pd.read_excel(p)
        else:
            raise ValueError("Unsupported file format (must be .csv, .xls or .xlsx)")

    def evaluate(
        self,
        prediction_cols: List[str],
        reference_col:    str,
        metrics:          List[str],
        keep_cols:        List[str],
        run_id:           int | None = None,
        measure_time:     bool      = False,
    ) -> None:
        if self._data is None:
            raise RuntimeError("No data loaded; call load_data() first.")

        rows = []
        time_records: list[float] = []

        for _, r in self._data.iterrows():
            rec = {c: r[c] for c in keep_cols}
            ref = str(r[reference_col])

            for col in prediction_cols:
                pred = str(r[col])
                if measure_time:
                    start = time.time()

                # compute each metric and store under "{col} {METRIC}"
                for m in metrics:
                    m_up = m.upper()
                    if m_up == "BLEU":
                        sc = sentence_bleu(pred, [ref]).score
                    elif m_up == "CHRF":
                        sc = sentence_chrf(pred, [ref]).score
                    elif m_up in ("CHRF++", "CHRFF++", "CHR F++"):
                        sc = sentence_chrf2(pred, [ref]).score
                    else:
                        raise ValueError(f"Unsupported metric: {m!r}")
                    rec[f"{col} {m_up}"] = sc

                # optional timing column per model
                if measure_time and run_id is not None:
                    elapsed = time.time() - start
                    rec[f"{col} Response Time Run {run_id}"] = elapsed
                    time_records.append(elapsed)

            rows.append(rec)

        # build detailed-results DataFrame
        self._detailed_results = pd.DataFrame(rows)

        # aggregate model metrics: mean of every "{col} {METRIC}"
        metric_cols = [f"{c} {m.upper()}" for c in prediction_cols for m in metrics]
        self._model_metrics = {
            col: float(self._detailed_results[col].mean()) for col in metric_cols
        }

        # if timing was measured, record the run average
        if measure_time and run_id is not None:
            avg = float(np.mean(time_records))
            new = pd.DataFrame([{"run_id": run_id, "avg_response_time": avg}])
            self._response_times = pd.concat(
                [self._response_times, new], ignore_index=True
            )

    def get_detailed_results(self) -> pd.DataFrame:
        return self._detailed_results.copy()

    def get_model_metrics(self) -> dict[str, float]:
        return dict(self._model_metrics)

    def get_response_times(self) -> pd.DataFrame:
        return self._response_times.copy()


In [None]:
from translation_evaluator import TranslationEvaluator
import pandas as pd

def main():
    """Demo: score two model columns of input_data.xlsx and print the results."""
    evaluator = TranslationEvaluator()
    evaluator.load_data("input_data.xlsx")

    models   = ['base_madlad400_translation', 'finetuned_madlad400_translation']
    keep     = ["en", "es"]
    ref_col  = "en"
    metrics  = ["BLEU", "ChrF", "ChrF++"]

    # Run #1, with timing
    evaluator.evaluate(
        prediction_cols=models,
        reference_col=ref_col,
        metrics=metrics,
        keep_cols=keep,
        run_id=1,
        measure_time=True
    )

    # Snapshot the detailed results now: the next evaluate() call replaces
    # them, and the untimed run has no "Response Time Run 1" columns.
    df = evaluator.get_detailed_results()

    # Run #2, without timing
    evaluator.evaluate(
        prediction_cols=models,
        reference_col=ref_col,
        metrics=metrics,
        keep_cols=keep,
        run_id=2,
        measure_time=False
    )

    # View detailed results (from the timed run)
    print("Columns in detailed results:")
    print(df.columns.tolist())

    # Example: select exactly the columns you need:
    want = [
      'base_madlad400_translation BLEU',
      'base_madlad400_translation CHRF',
      'base_madlad400_translation Response Time Run 1',
      # etc. for the rest...
    ]
    print(df[want].head(), "\n")

    # View aggregated metrics
    print("Aggregated metrics:")
    print(pd.DataFrame([evaluator.get_model_metrics()]), "\n")

    # View timing table
    print("Response times by run:")
    print(evaluator.get_response_times())

if __name__ == "__main__":
    main()



In [None]:
# Translation_evaluator_Research.py

import time
import os
import warnings
from pathlib import Path
from typing import List, Optional, Union

import pandas as pd
from sacrebleu import sentence_bleu, corpus_bleu, sentence_chrf, corpus_chrf


class TranslationEvaluator:
    """
    Sentence- and corpus-level BLEU / chrF / chrF++ scoring with an Excel report.

    Typical use: load_data() -> evaluate() -> get_*(). evaluate() also writes
    a two-sheet workbook ("Detailed", "Summary") to ``output_file``.
    """

    # Upper-cased metric spelling -> column name used in the result frames.
    _METRIC_NAMES = {"BLEU": "BLEU", "CHRF": "CHRF", "CHRF++": "ChrF++"}

    def __init__(self, output_file: Union[str, Path] = "output_data.xlsx"):
        """
        :param output_file: where to write the Excel report
        """
        self._data: Optional[pd.DataFrame] = None
        self._detailed_results: Optional[pd.DataFrame] = None
        self._model_metrics: Optional[pd.DataFrame] = None
        self._output_file = Path(output_file)

        # suppress any tokenizer warnings
        # NOTE(review): this silences ALL warnings for the whole process, not
        # just tokenizer ones; kept as-is because callers may rely on it.
        warnings.simplefilter("ignore")

    def load_data(self, file_path: Union[str, Path]) -> pd.DataFrame:
        """
        Load your input_data.xlsx / .csv into a DataFrame.
        Expects at least one column for references (e.g. "en") and
        one or more columns of model outputs.

        :raises ValueError: on an unsupported extension or an empty table
        """
        p = Path(file_path)
        suffix = p.suffix.lower()
        if suffix in {".xls", ".xlsx"}:
            df = pd.read_excel(p)
        elif suffix == ".csv":
            df = pd.read_csv(p)
        else:
            raise ValueError(f"Unsupported extension: {p.suffix}")
        if df.empty:
            raise ValueError("Loaded DataFrame is empty.")
        self._data = df
        return df

    def evaluate(
        self,
        prediction_cols: List[str],
        reference_col: str = "en",
        metrics: Optional[List[str]] = None,
        keep_cols: Optional[List[str]] = None,
        measure_time: bool = False,
    ) -> None:
        """
        Compute per-sentence and corpus-level BLEU, CHRF and ChrF++.
        :param prediction_cols: list of your model-output column names
        :param reference_col: the column with your human/reference translation
        :param metrics: choose any of "BLEU", "CHRF", "ChrF++" (case-insensitive);
            defaults to all three
        :param keep_cols: any additional columns (e.g. "es") you want carried into the Detailed sheet
        :param measure_time: if True, also record per-sentence latency (time spent
            computing the sentence-level scores)
        :raises ValueError: if no data is loaded or a metric name is unsupported
        :raises KeyError: if a requested column is absent from the loaded data
        """
        if self._data is None:
            raise ValueError("Please call load_data() first.")

        # Default handled here so no mutable list lives in the signature.
        if metrics is None:
            metrics = ["BLEU", "CHRF", "ChrF++"]

        # Normalise spelling; reject unknown names instead of silently skipping them.
        requested = set()
        for m in metrics:
            name = self._METRIC_NAMES.get(str(m).upper())
            if name is None:
                raise ValueError(f"Unsupported metric: {m!r}")
            requested.add(name)
        # Fixed order keeps the result columns stable regardless of input order.
        active = [n for n in ("BLEU", "CHRF", "ChrF++") if n in requested]

        df = self._data.copy()
        keep_cols = keep_cols or []

        # sanity-check that all required columns exist
        for col in [reference_col] + prediction_cols + keep_cols:
            if col not in df.columns:
                raise KeyError(f"Column not found: {col}")

        detailed_records = []
        # 1) per-sentence scores (+ optional timing)
        for model in prediction_cols:
            for _, row in df.iterrows():
                hyp = str(row[model])
                ref = str(row[reference_col])
                rec = {"model": model}
                # perf_counter is monotonic and high-resolution, unlike time.time().
                t0 = time.perf_counter()

                if "BLEU" in active:
                    rec["BLEU"] = sentence_bleu(hyp, [ref]).score
                if "CHRF" in active:
                    rec["CHRF"] = sentence_chrf(hyp, [ref]).score
                if "ChrF++" in active:
                    # ChrF++ = word_order=2
                    rec["ChrF++"] = sentence_chrf(hyp, [ref], word_order=2).score

                if measure_time:
                    rec["response_time"] = time.perf_counter() - t0

                # carry over any additional columns (e.g. "es")
                for col in keep_cols:
                    rec[col] = row[col]

                # also include the reference text (stringified)
                rec[reference_col] = ref
                detailed_records.append(rec)

        detailed = pd.DataFrame(detailed_records)
        self._detailed_results = detailed

        # 2) corpus-level (aggregate) metrics
        refs = df[reference_col].astype(str).tolist()
        summary = []
        for model in prediction_cols:
            hyps = df[model].astype(str).tolist()
            rec = {"model": model}
            if "BLEU" in active:
                rec["BLEU"] = corpus_bleu(hyps, [refs]).score
            if "CHRF" in active:
                rec["CHRF"] = corpus_chrf(hyps, [refs]).score
            if "ChrF++" in active:
                rec["ChrF++"] = corpus_chrf(hyps, [refs], word_order=2).score
            if measure_time:
                times = detailed.loc[detailed["model"] == model, "response_time"]
                rec["avg_response_time"] = times.mean()
            summary.append(rec)

        self._model_metrics = pd.DataFrame(summary)

        # 3) write them both to an Excel file
        with pd.ExcelWriter(self._output_file) as writer:
            self._detailed_results.to_excel(
                writer, sheet_name="Detailed", index=False
            )
            self._model_metrics.to_excel(
                writer, sheet_name="Summary", index=False
            )

    def get_detailed_results(self) -> pd.DataFrame:
        """Return a copy of the per-sentence results of the latest evaluate() call."""
        if self._detailed_results is None:
            raise ValueError("No detailed results – did you call evaluate()?")
        return self._detailed_results.copy()

    def get_model_metrics(self) -> pd.DataFrame:
        """Return a copy of the per-model corpus-level summary of the latest run."""
        if self._model_metrics is None:
            raise ValueError("No model metrics – did you call evaluate()?")
        return self._model_metrics.copy()

    def get_response_times(self) -> pd.DataFrame:
        """
        Only valid if you passed measure_time=True to evaluate().
        """
        if (
            self._detailed_results is None
            or "response_time" not in self._detailed_results.columns
        ):
            raise ValueError("No timing data – did you evaluate(measure_time=True)?")
        return self._detailed_results[["model", "response_time"]]


if __name__ == "__main__":
    # --- example usage: score four model columns and print the three views ---
    model_columns = [
        "base_madlad400_translation",
        "finetuned_madlad400_translation",
        "finetuned_helsinki_translation",
        "base_helsinki_translation",
    ]

    evaluator = TranslationEvaluator(output_file="bert_translations_report.xlsx")
    evaluator.load_data("input_data.xlsx")

    # measure_time=True adds per-sentence timing; pass False if it is not needed
    evaluator.evaluate(
        prediction_cols=model_columns,
        reference_col="en",
        metrics=["BLEU", "CHRF", "ChrF++"],
        keep_cols=["es"],
        measure_time=True,
    )

    print("=== Detailed results (first 5 rows) ===")
    detailed_preview = evaluator.get_detailed_results().head()
    print(detailed_preview, "\n")

    print("=== Summary (corpus-level) ===")
    summary_table = evaluator.get_model_metrics()
    print(summary_table, "\n")

    print("=== Response times ===")
    timing_preview = evaluator.get_response_times().head()
    print(timing_preview)
