In [None]:
# translation_evaluator.py

import time
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd
from sacrebleu import sentence_bleu, sentence_chrf, sentence_chrf2


class TranslationEvaluator:
    """
    - load_data(): CSV or Excel
    - evaluate(): computes per-model & per-metric columns (plus optional per-model timing)
    - get_detailed_results(): DataFrame with those columns
    - get_model_metrics(): dict of all metric means
    - get_response_times(): DataFrame of run_id vs avg_response_time
    """

    def __init__(self):
        self._data: pd.DataFrame | None = None
        self._detailed_results: pd.DataFrame = pd.DataFrame()
        self._model_metrics: dict[str, float] = {}
        self._response_times: pd.DataFrame = pd.DataFrame(
            [], columns=["run_id", "avg_response_time"]
        )

    def load_data(self, file_path: Union[str, Path]) -> None:
        p = Path(file_path)
        if p.suffix.lower() == ".csv":
            self._data = pd.read_csv(p)
        elif p.suffix.lower() in (".xls", ".xlsx"):
            self._data = pd.read_excel(p)
        else:
            raise ValueError("Unsupported file format (must be .csv, .xls or .xlsx)")

    def evaluate(
        self,
        prediction_cols: List[str],
        reference_col:    str,
        metrics:          List[str],
        keep_cols:        List[str],
        run_id:           int | None = None,
        measure_time:     bool      = False,
    ) -> None:
        if self._data is None:
            raise RuntimeError("No data loaded; call load_data() first.")

        rows = []
        time_records: list[float] = []

        for _, r in self._data.iterrows():
            rec = {c: r[c] for c in keep_cols}
            ref = str(r[reference_col])

            for col in prediction_cols:
                pred = str(r[col])
                if measure_time:
                    start = time.time()

                # compute each metric and store under "{col} {METRIC}"
                for m in metrics:
                    m_up = m.upper()
                    if m_up == "BLEU":
                        sc = sentence_bleu(pred, [ref]).score
                    elif m_up == "CHRF":
                        sc = sentence_chrf(pred, [ref]).score
                    elif m_up in ("CHRF++", "CHRFF++", "CHR F++"):
                        sc = sentence_chrf2(pred, [ref]).score
                    else:
                        raise ValueError(f"Unsupported metric: {m!r}")
                    rec[f"{col} {m_up}"] = sc

                # optional timing column per model
                if measure_time and run_id is not None:
                    elapsed = time.time() - start
                    rec[f"{col} Response Time Run {run_id}"] = elapsed
                    time_records.append(elapsed)

            rows.append(rec)

        # build detailed-results DataFrame
        self._detailed_results = pd.DataFrame(rows)

        # aggregate model metrics: mean of every "{col} {METRIC}"
        metric_cols = [f"{c} {m.upper()}" for c in prediction_cols for m in metrics]
        self._model_metrics = {
            col: float(self._detailed_results[col].mean()) for col in metric_cols
        }

        # if timing was measured, record the run average
        if measure_time and run_id is not None:
            avg = float(np.mean(time_records))
            new = pd.DataFrame([{"run_id": run_id, "avg_response_time": avg}])
            self._response_times = pd.concat(
                [self._response_times, new], ignore_index=True
            )

    def get_detailed_results(self) -> pd.DataFrame:
        return self._detailed_results.copy()

    def get_model_metrics(self) -> dict[str, float]:
        return dict(self._model_metrics)

    def get_response_times(self) -> pd.DataFrame:
        return self._response_times.copy()


In [None]:
from translation_evaluator import TranslationEvaluator
import pandas as pd

def main():
    """Demo: evaluate two translation columns of input_data.xlsx and print results.

    Runs the evaluator twice (once timed, once untimed), then prints the
    detailed per-row results, the aggregated metric means and the timing table.
    """
    evaluator = TranslationEvaluator()
    evaluator.load_data("input_data.xlsx")

    models   = ['base_madlad400_translation', 'finetuned_madlad400_translation']
    keep     = ["en", "es"]
    ref_col  = "en"
    metrics  = ["BLEU", "ChrF", "ChrF++"]

    # Run #1, with timing
    evaluator.evaluate(
        prediction_cols=models,
        reference_col=ref_col,
        metrics=metrics,
        keep_cols=keep,
        run_id=1,
        measure_time=True
    )

    # Each evaluate() call replaces the detailed results, so the timing
    # columns of run #1 must be captured before run #2 overwrites them.
    run1 = evaluator.get_detailed_results()
    run1_timing = run1[[c for c in run1.columns if " Response Time Run " in c]]

    # Run #2, without timing
    evaluator.evaluate(
        prediction_cols=models,
        reference_col=ref_col,
        metrics=metrics,
        keep_cols=keep,
        run_id=2,
        measure_time=False
    )

    # View detailed results (run #2 scores plus the timing columns of run #1;
    # both frames cover the same rows in the same order, so the join aligns).
    df = evaluator.get_detailed_results()
    carried_over = [c for c in run1_timing.columns if c not in df.columns]
    df = df.join(run1_timing[carried_over])
    print("Columns in detailed results:")
    print(df.columns.tolist())

    # Example: select exactly the columns you need:
    want = [
      'base_madlad400_translation BLEU',
      'base_madlad400_translation CHRF',
      'base_madlad400_translation Response Time Run 1',
      # etc. for the rest...
    ]
    print(df[want].head(), "\n")

    # View aggregated metrics
    print("Aggregated metrics:")
    print(pd.DataFrame([evaluator.get_model_metrics()]), "\n")

    # View timing table
    print("Response times by run:")
    print(evaluator.get_response_times())

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
