# Evaluation

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sacrebleu import CHRF, TER, corpus_bleu

from src.config import RESULTS_DIR, TRANSLATIONS_DIR
from src.utils import load_file

Upload the translations to `data/translations` and revise the filenames below accordingly.

In [None]:
# Reference translations that every system output is scored against.
refs = load_file("data/test.tgt")

# Display name -> translation file name inside TRANSLATIONS_DIR.
translation_files = {
    "GRU-Base": "trans-base.txt",
    "GRU-Aug": "trans-aug.txt",
    "GRU-Aug-CBK": "trans-aug-cbk.txt",
}

# Load each system's output, keyed by the name used in tables and plots.
# NOTE(review): presumably load_file returns one sentence per line, aligned
# with `refs` -- confirm against src.utils.
models = {
    label: load_file(TRANSLATIONS_DIR / filename)
    for label, filename in translation_files.items()
}

Let us compute several evaluation metrics for each model, namely BLEU, chrF, and TER.

In [None]:
# Score every system in `models` against `refs` with BLEU, chrF and TER,
# print the scores, and collect them into `results_df` (one row per model).
results = []

# The metric objects are built once and reused for every model. They must keep
# their own names: rebinding them to the score returned by corpus_score()
# would break the next iteration, because a score object has no corpus_score().
chrf_metric = CHRF()
ter_metric = TER()

for name, preds in models.items():
    # sacrebleu expects a list of reference streams, hence the [refs] wrapper.
    bleu = corpus_bleu(preds, [refs])
    chrf = chrf_metric.corpus_score(preds, [refs])
    ter = ter_metric.corpus_score(preds, [refs])

    results.append(
        {
            # This key must match the column passed to set_index() below.
            "Model": name,
            "BLEU": bleu.score,
            "CHRF": chrf.score,
            "TER": ter.score,
        }
    )

    print(f"\n{name}")
    print(f"BLEU: {bleu.score:.2f}")
    print(f"CHRF: {chrf.score:.2f}")
    print(f"TER: {ter.score:.2f}")

# Index by model name so each metric becomes a column.
results_df = pd.DataFrame(results).set_index("Model")

Let us visualize the results.

In [None]:
# Bar chart of results_df: one bar series per column, labelled "Metric"
# in the legend.
ax = results_df.plot.bar(figsize=(10, 5))
ax.set_title("Translation Quality Comparison")
ax.set_ylabel("Score")
# Keep the x tick labels horizontal.
plt.xticks(rotation=0)
ax.legend(title="Metric")
plt.tight_layout()

Let us save the results.

In [None]:
# Write the metric table to a CSV file under RESULTS_DIR.
summary_path = RESULTS_DIR / "metrics_summary.csv"
results_df.to_csv(summary_path)