In [49]:
#!/usr/bin/env python
import pandas as pd
from pathlib import Path
from sacrebleu import corpus_bleu, CHRF, TER
import evaluate

output_dir = Path("translation_results")
priority_root = output_dir / "PRIORITY_LANGUAGES"
target_root = output_dir / "TARGET_LANGUAGES"
ref_path =  "datadf.csv"

ref_df = pd.read_csv(ref_path)
print(f"Refs loaded: {len(ref_df)} rows, {ref_df['iso_639_3'].nunique()} langs")

chrf = CHRF()
ter = TER()
meteor = evaluate.load("meteor")

def corpus_meteor(hyps, refs):
    return meteor.compute(predictions=hyps, references=refs)['meteor']

def compute_metrics(hyps, refs, tgt_lang):
    bleu = corpus_bleu(hyps, [refs]).score
    chrf_sc = chrf.corpus_score(hyps, [refs]).score
    ter_sc = ter.corpus_score(hyps, [refs]).score
    met_sc = corpus_meteor(hyps, refs)
    return {'bleu': bleu, 'chrf': chrf_sc, 'ter': ter_sc, 'meteor': met_sc}

def align_and_clean(ref_df, hyp_df, hyp_col, src_lang, tgt_lang):
    # Reference is target-language gold from FLORES
    tgt_ref_df = ref_df[ref_df['iso_639_3'] == tgt_lang][['id', 'text']].drop_duplicates('id')
    if 'id' not in hyp_df or hyp_col not in hyp_df:
        return None, None

    merged = hyp_df[['id', hyp_col]].merge(tgt_ref_df, on='id', how='inner')
    merged = merged.dropna(subset=['text', hyp_col])
    merged[hyp_col] = merged[hyp_col].astype(str).str.strip()
    merged['text'] = merged['text'].astype(str).str.strip()
    merged = merged[(merged[hyp_col] != '') & (merged['text'] != '')]

    if len(merged) == 0:
        return None, None

    hyps = merged[hyp_col].tolist()
    refs = merged['text'].tolist()
    print(f"  {src_lang}→{tgt_lang}: {len(hyps)} aligned pairs")
    return hyps, refs

  ref_df = pd.read_csv(ref_path)


Refs loaded: 223328 rows, 209 langs


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sp_hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\sp_hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sp_hp\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [50]:
evaluation_results = []
print("="*60)
print("FLORES EVAL (no metadata)")
print("="*60)

# TARGET_LANGUAGES
print("TARGET_LANGUAGES:")
for csv_file in target_root.glob("*.csv"):
    stem = csv_file.stem
    if "_to_" not in stem:
        continue
    src_lang, tgt_lang = stem.split("_to_")
    hyp_df = pd.read_csv(csv_file)
    hyps, refs = align_and_clean(ref_df, hyp_df, "target_text", src_lang, tgt_lang)
    if hyps is None:
        print(f"  Skip {csv_file.name}")
        continue
    metrics = compute_metrics(hyps, refs, tgt_lang)
    evaluation_results.append({
        "direction": f"{src_lang}_to_{tgt_lang}",
        "srclang": src_lang,
        "tgtlang": tgt_lang,
        "num_sentences": len(hyps),
        **metrics
    })
    print(f"  ✓ {src_lang}_to_{tgt_lang}: BLEU={metrics['bleu']:.1f}")

# PRIORITY_LANGUAGES
print("\nPRIORITY_LANGUAGES:")
for csv_file in priority_root.glob("*.csv"):
    stem = csv_file.stem
    if "_to_" not in stem:
        continue
    src_lang, tgt_lang = stem.split("_to_")
    hyp_df = pd.read_csv(csv_file)
    hyps, refs = align_and_clean(ref_df, hyp_df, "target_text", src_lang, tgt_lang)
    if hyps is None:
        print(f"  Skip {csv_file.name}")
        continue
    metrics = compute_metrics(hyps, refs, tgt_lang)
    evaluation_results.append({
        "direction": f"{src_lang}_to_{tgt_lang}",
        "srclang": src_lang,
        "tgtlang": tgt_lang,
        "num_sentences": len(hyps),
        **metrics
    })
    print(f"  ✓ {src_lang}_to_{tgt_lang}: BLEU={metrics['bleu']:.1f}")

# Save summary
if evaluation_results:
    eval_df = pd.DataFrame(evaluation_results)
    out_path = output_dir / "evaluation_summary.csv"
    eval_df.to_csv(out_path, index=False)
    print(f"\nSaved {len(eval_df)} rows to {out_path}")
else:
    print("No valid evaluations.")

Would you like to also log per-language averages (e.g., mean BLEU per tgt_lang) in a second CSV?


FLORES EVAL (no metadata)
TARGET_LANGUAGES:
  cmn→eng: 997 aligned pairs
  ✓ cmn_to_eng: BLEU=20.2
  cmn→fra: 997 aligned pairs
  ✓ cmn_to_fra: BLEU=17.7
  dan→eng: 997 aligned pairs
  ✓ dan_to_eng: BLEU=24.1
  dan→fra: 997 aligned pairs
  ✓ dan_to_fra: BLEU=18.5
  deu→eng: 997 aligned pairs
  ✓ deu_to_eng: BLEU=16.3
  deu→fra: 997 aligned pairs
  ✓ deu_to_fra: BLEU=16.0
  ell→eng: 997 aligned pairs
  ✓ ell_to_eng: BLEU=24.8
  ell→fra: 997 aligned pairs
  ✓ ell_to_fra: BLEU=21.8
  fin→eng: 997 aligned pairs
  ✓ fin_to_eng: BLEU=5.2
  fin→fra: 997 aligned pairs
  ✓ fin_to_fra: BLEU=5.8
  hin→eng: 997 aligned pairs
  ✓ hin_to_eng: BLEU=38.6
  hin→fra: 997 aligned pairs
  ✓ hin_to_fra: BLEU=27.5
  ita→eng: 997 aligned pairs
  ✓ ita_to_eng: BLEU=18.6
  ita→fra: 997 aligned pairs
  ✓ ita_to_fra: BLEU=21.4
  jpn→eng: 996 aligned pairs
  ✓ jpn_to_eng: BLEU=18.1
  jpn→fra: 997 aligned pairs
  ✓ jpn_to_fra: BLEU=15.5
  kor→eng: 997 aligned pairs
  ✓ kor_to_eng: BLEU=21.4
  kor→fra: 997 aligned 

Overall Performance
Your model was evaluated on 56 FLORES directions (~56k sentences total), with consistent 997-pair alignment per CSV. Average BLEU is ~20.3 across all pairs. Strongest results exceed 40; Asian languages (esp. cmn, jpn, kor) show biggest gaps.
​

Top Performers
Rank	Direction	BLEU	Notes
1	eng→por	45.4	Best overall (Romance)
2	eng→dan	41.1	Excellent Germanic
3	eng→swe	39.9	Strong Scandinavian
4	hin→eng	38.6	Top non-eng→ direction
5	por→eng	32.3	Bidirectional Romance
Key Insights
Language Family Effects

Romance/Germanic → eng/fra: 25-45 BLEU (por-eng 45.4, eng-deu 35.7).

Indic → eng/fra strong (hin→eng 38.6), but eng→hin drops to 31.3.

Asian languages weakest: eng→cmn 0.7, eng→jpn 0.0 (morphology/tokenization issues?).
​

Directionality Gaps

eng→eur: 25-45; non-eng→eng: 16-38.

fra→eur: consistently ~20 (fra→por 26.6).

Reverse Asian (cmn→eng 20.2 >> eng→cmn 0.7).
​

Low Performers

eng→jpn/kor/cmn: <13 (script/grammar challenges).

fin→eng/fra: 5-6 (agglutinative structure).
​

Recommendations
Prioritize fin/jpn/cmn tokenizers and Asian data. Bilingual baselines: eng-por (45+), hin-eng (38+). Scale training on low-BLEU pairs for balanced multilingual gains.