# Evaluation

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sacrebleu import CHRF, TER, corpus_bleu

from src.config import RESULTS_DIR, TRANSLATIONS_DIR
from src.utils import load_file

Upload the resulting translations to `data/translations` and revise the filepaths below accordingly.

In [None]:
# Load the reference translations and each system's output.
# NOTE(review): assumes `load_file` returns a list with one entry per segment —
# confirm against src.utils.
refs = load_file(TRANSLATIONS_DIR / "test.tgt")
models = {
    "GRU-Base": load_file(TRANSLATIONS_DIR / "base.txt"),
    "GRU-Aug": load_file(TRANSLATIONS_DIR / "aug.txt"),
    "GRU-Aug-CBK": load_file(TRANSLATIONS_DIR / "aug-cbk.txt"),
}

# Corpus-level metrics pair hypothesis i with reference i, so a hypothesis file
# with a different number of segments (e.g. a stale or truncated upload) would
# be scored against misaligned references. Fail here with a clear message.
for model_name, model_preds in models.items():
    if len(model_preds) != len(refs):
        raise ValueError(
            f"{model_name}: found {len(model_preds)} translations "
            f"but {len(refs)} references; the files are not aligned."
        )

Let's compute several evaluation metrics: BLEU, CHRF, and TER. Higher is better for BLEU and CHRF, while lower is better for TER (it is an error rate).

In [None]:
# Score every system against the same references and collect one row per model.
chrf = CHRF()
ter = TER()

results = []

for name, preds in models.items():
    # Each scorer takes a list of reference streams; a single stream is used here.
    bleu_score = corpus_bleu(preds, [refs])
    chrf_score = chrf.corpus_score(preds, [refs])
    ter_score = ter.corpus_score(preds, [refs])

    row = {
        "Model": name,
        "BLEU": bleu_score.score,
        "CHRF": chrf_score.score,
        "TER": ter_score.score,
    }
    results.append(row)

    # Per-model summary, rounded to two decimals.
    print(f"\n[{name}]")
    for metric_name in ("BLEU", "CHRF", "TER"):
        print(f"{metric_name}: {row[metric_name]:.2f}")

# One row per model, indexed by model name, one column per metric.
results_df = pd.DataFrame(results).set_index("Model")

Let's visualize the results.

In [None]:
# Draw one bar chart per metric, side by side, with each bar's score printed
# above it, then save the figure to RESULTS_DIR.
metrics = ["BLEU", "CHRF", "TER"]

fig, axes = plt.subplots(1, len(metrics), figsize=(18, 6))

for i, (ax, metric) in enumerate(zip(axes, metrics)):
    results_df[metric].plot(
        kind="bar",
        ax=ax,
        title=f"Translation Quality Comparison by {metric}",
    )

    ax.set_xlabel("")
    # Label the y-axis only on the leftmost panel.
    ax.set_ylabel("Score" if i == 0 else "")
    ax.tick_params(axis="x", rotation=0)

    # Write each bar's value just above its top edge.
    for container in ax.containers:
        for bar in container:
            height = bar.get_height()
            ax.annotate(
                f"{height:.2f}",
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),
                textcoords="offset points",
                ha="center",
                va="bottom",
                fontsize=9,
            )

# y > 1 places the title above the figure canvas so it cannot collide with the
# per-panel titles.
fig.suptitle(
    "Translation Quality Comparisons of Different Models Across Different Metrics",
    y=1.04,
    fontsize=16,
)

fig.tight_layout()

output_path = RESULTS_DIR / "translation_quality_comparisons.png"
# NOTE(review): assumes RESULTS_DIR is a pathlib.Path; creating it here keeps a
# fresh checkout (where the results directory may not exist yet) from failing.
output_path.parent.mkdir(parents=True, exist_ok=True)

# bbox_inches="tight" grows the saved bounding box to include artists outside
# the canvas. Without it the suptitle at y=1.04 is cropped out of the PNG even
# though the inline preview shows it.
fig.savefig(output_path, bbox_inches="tight")

Let's save the results.

In [None]:
# Persist the per-model metric table (index = model name) alongside the figure.
metrics_summary_path = RESULTS_DIR / "metrics_summary.csv"
results_df.to_csv(metrics_summary_path)