In [1]:
# ================================
# BTA: Folder-based Triple Aligner (EN–ID–SU)
# With Sundanese parenthetical-only flagging baked in
# ================================
from pathlib import Path
import re, html, json
import pandas as pd

# ------------
# CONFIG
# ------------
DIR_EN = Path("Bible/inggris/")     # folder with *_inggris.txt
DIR_ID = Path("Bible/indonesia/")  # folder with *_indonesia.txt
DIR_SU = Path("Bible/sunda/")   # folder with *_sunda.txt

SUFFIX_EN = "_inggris"
SUFFIX_ID = "_indonesia"
SUFFIX_SU = "_sunda"

USE_FILENAME_AS_BOOK = True
BOOK_NAME_NORMALIZATION = {}

# OUTPUT to Bible_Align/data  # <<< NEW
OUT_DIR = Path("Bible_Align/data")
SPLIT_BY_BOOK = True
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ------------
# Parsers & helpers
# ------------
WHITESPACE_RE = re.compile(r"\s+")
CHAPTER_HDR_RE = re.compile(r"^\s*([0-9A-Za-z .’'\-]+?)\s+(\d+)\s*$")
VERSE_LINE_RE   = re.compile(r"^\s*(\d+)\s+(.*\S)\s*$")

def clean_text(s: str) -> str:
    s = html.unescape(s).replace("\u00A0", " ")
    return WHITESPACE_RE.sub(" ", s).strip()

def normalize_book(name: str) -> str:
    name = clean_text(name)
    return BOOK_NAME_NORMALIZATION.get(name, name)

def base_from_name(fname: str, suffix: str) -> str:
    if fname.lower().endswith(".txt"): fname = fname[:-4]
    return fname[:-len(suffix)] if fname.endswith(suffix) else fname

def base_to_book(base: str) -> str:
    book = base.replace("_", " ").strip()
    if book and book[0].isdigit():
        parts = book.split(" ", 1)
        book = parts[0] + " " + (parts[1].title() if len(parts) == 2 else "")
    else:
        book = book.title()
    return normalize_book(book)

def parse_bible_plaintext(path: Path, lang_code: str, single_book: str | None = None):
    out, book, chapter = [], (normalize_book(single_book) if single_book else None), None
    if not path.exists(): return out
    with path.open("r", encoding="utf-8", errors="replace") as f:
        for raw in f:
            line = clean_text(raw)
            if not line: continue
            m_hdr = CHAPTER_HDR_RE.match(line)
            if m_hdr and single_book is None:
                book, chapter = normalize_book(m_hdr.group(1)), int(m_hdr.group(2)); continue
            m_v = VERSE_LINE_RE.match(line)
            if m_v:
                vnum, vtxt = int(m_v.group(1)), clean_text(m_v.group(2))
                if book is None and single_book: book = normalize_book(single_book)
                if book is None: book = "UNKNOWN_BOOK"
                if chapter is None: chapter = 1
                out.append({"book": book, "chapter": int(chapter), "verse": vnum, "text": vtxt, "lang": lang_code})
            elif out and out[-1]["lang"] == lang_code:
                out[-1]["text"] = clean_text(out[-1]["text"] + " " + line)
    return out

def to_frame(recs): return pd.DataFrame.from_records(recs, columns=["book","chapter","verse","lang","text"])

def align_three(df_en, df_id, df_su):
    wide_en = df_en.pivot_table(index=["book","chapter","verse"], values="text", aggfunc="first")
    wide_id = df_id.pivot_table(index=["book","chapter","verse"], values="text", aggfunc="first")
    wide_su = df_su.pivot_table(index=["book","chapter","verse"], values="text", aggfunc="first")
    aligned = wide_en.join(wide_id, how="outer", lsuffix="_en").join(wide_su, how="outer", rsuffix="_su")
    rename_map = {}
    for c in aligned.columns:
        lc = c.lower()
        rename_map[c] = "en_text" if lc.endswith("_en") or lc=="text_en" else \
                        "su_text" if lc.endswith("_su") or lc=="text_su" else \
                        "id_text"
    aligned = aligned.rename(columns=rename_map).reset_index().sort_values(["book","chapter","verse"])
    for col in ["en_text","id_text","su_text"]:
        if col not in aligned.columns: aligned[col] = pd.NA
    return aligned[["book","chapter","verse","en_text","id_text","su_text"]]

def mismatch_report(aligned: pd.DataFrame) -> pd.DataFrame:
    return aligned[aligned[["en_text","id_text","su_text"]].isna().any(axis=1)]

# <<< NEW: parenthetical-only detector for Sundanese
PAREN_REF_RE = re.compile(r"^\(\s*\d+\s*:\s*\d+(?:\s*[-–]\s*\d+)?\s*\)$")

def apply_sundanese_parenthetical_flag(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    su = df["su_text"].fillna("")
    mask = su.str.match(PAREN_REF_RE)
    df["su_missing_parenthetical"] = mask
    df.loc[mask, "su_text"] = pd.NA
    return df

# ------------
# Discover triplets and process
# ------------
def discover_triplets(dir_en: Path, dir_id: Path, dir_su: Path):
    en_files = {base_from_name(p.name, SUFFIX_EN): p for p in dir_en.glob("*.txt")}
    id_files = {base_from_name(p.name, SUFFIX_ID): p for p in dir_id.glob("*.txt")}
    su_files = {base_from_name(p.name, SUFFIX_SU): p for p in dir_su.glob("*.txt")}
    bases_all = set(en_files) | set(id_files) | set(su_files)
    triplets, missing = [], []
    for base in sorted(bases_all):
        p_en, p_id, p_su = en_files.get(base), id_files.get(base), su_files.get(base)
        (triplets.append((base, p_en, p_id, p_su)) if (p_en and p_id and p_su)
         else missing.append({"base": base, "has_en": bool(p_en), "has_id": bool(p_id), "has_su": bool(p_su)}))
    return triplets, missing

triplets, missing = discover_triplets(DIR_EN, DIR_ID, DIR_SU)
print(f"Discovered {len(triplets)} complete triplets.")
if missing:
    import pandas as _pd
    print(f"{len(missing)} bases missing at least one language. First 10:")
    print(_pd.DataFrame(missing).head(10).to_string(index=False))

all_aligned, coverage_rows = [], []

for base, p_en, p_id, p_su in triplets:
    single_book = base_to_book(base) if USE_FILENAME_AS_BOOK else None
    df_en = to_frame(parse_bible_plaintext(p_en, "en", single_book))
    df_id = to_frame(parse_bible_plaintext(p_id, "id", single_book))
    df_su = to_frame(parse_bible_plaintext(p_su, "su", single_book))
    aligned = align_three(df_en, df_id, df_su)

    # Apply Sundanese parenthetical-only rule  # <<< NEW
    aligned = apply_sundanese_parenthetical_flag(aligned)

    aligned["source_base"] = base
    all_aligned.append(aligned)

    total = aligned.shape[0]
    miss = mismatch_report(aligned)
    coverage_rows.append({
        "base": base,
        "book_inferred": single_book if single_book else "(headers)",
        "total_rows": total,
        "complete_rows": total - len(miss),
        "missing_rows": len(miss),
        "pct_complete": 0 if total == 0 else round(100.0*(total - len(miss))/total, 2),
        # how many rows were parenthetical-only in SU  # <<< NEW
        "su_parenthetical_only_rows": int(aligned["su_missing_parenthetical"].sum())
    })

# ------------
# Concatenate and save
# ------------
big = pd.concat(all_aligned, ignore_index=True) if all_aligned else \
      pd.DataFrame(columns=["book","chapter","verse","en_text","id_text","su_text","su_missing_parenthetical","source_base"])

combined_csv  = OUT_DIR / "BTA_aligned_all.csv"
combined_json = OUT_DIR / "BTA_aligned_all.jsonl"
big.to_csv(combined_csv, index=False, encoding="utf-8")

with combined_json.open("w", encoding="utf-8") as f:
    for _, r in big.iterrows():
        f.write(json.dumps({
            "book": r.get("book"),
            "chapter": int(r["chapter"]) if pd.notna(r.get("chapter")) else None,
            "verse": int(r["verse"]) if pd.notna(r.get("verse")) else None,
            "en_text": None if pd.isna(r.get("en_text")) else r.get("en_text"),
            "id_text": None if pd.isna(r.get("id_text")) else r.get("id_text"),
            "su_text": None if pd.isna(r.get("su_text")) else r.get("su_text"),
            "su_missing_parenthetical": bool(r.get("su_missing_parenthetical")),
            "source_base": r.get("source_base")
        }, ensure_ascii=False) + "\n")

print(f"Saved combined CSV: {combined_csv}")
print(f"Saved combined JSONL: {combined_json}")

# Coverage report (includes su_parenthetical_only_rows)  # <<< NEW
coverage = pd.DataFrame(coverage_rows).sort_values(["pct_complete","base"], ascending=[False, True])
coverage_csv = OUT_DIR / "coverage_by_file.csv"
coverage.to_csv(coverage_csv, index=False, encoding="utf-8")
print(f"Coverage summary saved: {coverage_csv}")
print(coverage.head(15).to_string(index=False))

# Optional per-book splits
if SPLIT_BY_BOOK and not big.empty:
    for book, grp in big.groupby("book"):
        safe_book = re.sub(r"[^0-9A-Za-z _.-]", "_", book or "UNKNOWN")
        (OUT_DIR / f"BTA_{safe_book}.csv").write_text(grp.sort_values(["chapter","verse"]).to_csv(index=False), encoding="utf-8")
    print(f"Per-book CSVs written in {OUT_DIR.resolve()}")


Discovered 66 complete triplets.
Saved combined CSV: Bible_Align\data\BTA_aligned_all.csv
Saved combined JSONL: Bible_Align\data\BTA_aligned_all.jsonl
Coverage summary saved: Bible_Align\data\coverage_by_file.csv
        base book_inferred  total_rows  complete_rows  missing_rows  pct_complete  su_parenthetical_only_rows
  1_korintus    1 Korintus          58             58             0         100.0                           0
    1_petrus      1 Petrus          25             25             0         100.0                           0
1_tesalonika  1 Tesalonika          28             28             0         100.0                           0
   1_yohanes     1 Yohanes          29             29             0         100.0                           0
 2_raja-raja   2 Raja-Raja          44             44             0         100.0                           0
    2_samuel      2 Samuel          51             51             0         100.0                           0
  2_tawarikh    2

In [3]:
# Preprocessing & Segmentation (robust to your columns)
# Accepts CSV or JSONL with columns like:
#   book, chapter, verse, su_text, id_text, en_text, ... (your file)
# or the simpler schema:
#   verse_id, su, id, en

import re, csv, json, unicodedata
from pathlib import Path

# ====== CONFIG ======
INPUT_PATH = Path("Bible_Align/data/BTA_aligned_all.csv")   # or .jsonl
OUT_DIR = Path("./out_preprocess")
WRITE_TOKENS = True
LOWERCASE = False

OUT_DIR.mkdir(parents=True, exist_ok=True)

# ====== Normalization ======
PUNCT_MAP = {"“":"\"", "”":"\"", "‘":"'", "’":"'", "–":"-", "—":"-", "…":"...", "«":"\"", "»":"\""}
TRANS = str.maketrans(PUNCT_MAP)

def normalize_text(text: str, lowercase: bool = LOWERCASE) -> str:
    if text is None:
        return ""
    s = unicodedata.normalize("NFKC", text).translate(TRANS)
    s = re.sub(r"\s+", " ", s).strip()
    if lowercase:
        s = s.lower()
    return s

# ====== Sentence segmentation ======
ID_ABBR = {"dr","sdr","sdri","dll","dsb","sbb","no","hlm"}
EN_ABBR = {"mr","mrs","ms","dr","prof","vs","etc","e.g","i.e"}
SU_ABBR = set()
ABBR = ID_ABBR | EN_ABBR | SU_ABBR
ABBR_PAT = re.compile(r"\b(?:(?:" + "|".join(sorted(ABBR)) + r")\.)$", re.IGNORECASE) if ABBR else None

def sent_split(lang: str, text: str):
    if not text:
        return []
    parts = re.split(r"(?<=[\.!\?])\s+(?=[A-Z0-9\"'“‘(])", text)
    merged = []
    for p in parts:
        p = p.strip()
        if not p: 
            continue
        if merged:
            prev = merged[-1]
            last_tok = prev.split()[-1] if prev.split() else ""
            if last_tok.endswith(".") and ABBR_PAT and ABBR_PAT.search(last_tok):
                merged[-1] = prev + " " + p
                continue
        merged.append(p)
    return merged

# ====== Tokenization ======
TOKEN_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ0-9']+|[^\w\s]", re.UNICODE)
def tokenize(text: str):
    return [t for t in TOKEN_RE.findall(text or "") if t.strip()]

# ====== Schema utilities ======
def build_verse_id(book: str, chapter, verse) -> str:
    # build a readable verse_id like "1 Korintus:1:1" or "Genesis:1:1"
    b = normalize_text(book, lowercase=False)
    # keep spaces in book to stay human-friendly
    return f"{b}:{str(chapter).strip()}:{str(verse).strip()}"

def unify_row(r: dict) -> dict:
    """
    Map an arbitrary source row to a unified record with keys:
    verse_id, su, id, en
    """
    keys = {k.lower(): k for k in r.keys()}

    # text fields (prefer *_text if present)
    su = r.get(keys.get("su_text",""), "") or r.get(keys.get("su",""), "")
    id_ = r.get(keys.get("id_text",""), "") or r.get(keys.get("id",""), "")
    en = r.get(keys.get("en_text",""), "") or r.get(keys.get("en",""), "")

    # verse_id direct or composed from book/chapter/verse
    if "verse_id" in keys:
        verse_id = r[keys["verse_id"]]
    else:
        # Need book, chapter, verse
        book = r.get(keys.get("book",""), "")
        chapter = r.get(keys.get("chapter",""), "")
        verse = r.get(keys.get("verse",""), "")
        if not (book and chapter and verse):
            raise ValueError("Cannot build verse_id: need columns 'book', 'chapter', 'verse' or 'verse_id'.")
        verse_id = build_verse_id(book, chapter, verse)

    return {
        "verse_id": normalize_text(verse_id, lowercase=False),
        "su": normalize_text(su),
        "id": normalize_text(id_),
        "en": normalize_text(en),
    }

def read_input(path: Path):
    rows = []
    if path.suffix.lower() == ".csv":
        with path.open("r", encoding="utf-8") as f:
            rd = csv.DictReader(f)
            for r in rd:
                rows.append(unify_row(r))
    elif path.suffix.lower() in (".jsonl", ".json"):
        with path.open("r", encoding="utf-8") as f:
            for line in f:
                if not line.strip(): 
                    continue
                obj = json.loads(line)
                rows.append(unify_row(obj))
    else:
        raise ValueError("Unsupported input format. Use .csv or .jsonl")
    return rows

# ====== Main processing ======
def process(input_path: Path, out_dir: Path):
    src_rows = read_input(input_path)

    out_rows = []
    for r in src_rows:
        verse_id = r["verse_id"]
        su_text, id_text, en_text = r["su"], r["id"], r["en"]

        su_sents = sent_split("su", su_text)
        id_sents = sent_split("id", id_text)
        en_sents = sent_split("en", en_text)

        max_len = max(len(su_sents), len(id_sents), len(en_sents)) or 1
        for i in range(max_len):
            su_i = su_sents[i] if i < len(su_sents) else ""
            id_i = id_sents[i] if i < len(id_sents) else ""
            en_i = en_sents[i] if i < len(en_sents) else ""
            if not (su_i or id_i or en_i):
                continue
            sent_idx = i + 1
            sent_id = f"{verse_id}#{sent_idx}"
            row = {
                "verse_id": verse_id,
                "sent_idx": sent_idx,
                "sent_id": sent_id,
                "su_sentence": su_i,
                "id_sentence": id_i,
                "en_sentence": en_i,
            }
            if WRITE_TOKENS:
                row["su_tokens"] = " ".join(tokenize(su_i))
                row["id_tokens"] = " ".join(tokenize(id_i))
                row["en_tokens"] = " ".join(tokenize(en_i))
            out_rows.append(row)

    out_csv = out_dir / "tri_sentences.csv"
    if out_rows:
        with out_csv.open("w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(out_rows[0].keys()))
            writer.writeheader()
            writer.writerows(out_rows)

    out_jsonl = out_dir / "tri_sentences.jsonl"
    with out_jsonl.open("w", encoding="utf-8") as f:
        for row in out_rows:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    print(f"Wrote {len(out_rows)} sentence rows to:")
    print(f" - {out_csv}")
    print(f" - {out_jsonl}")

# ====== RUN ======
process(INPUT_PATH, OUT_DIR)


Wrote 4875 sentence rows to:
 - out_preprocess\tri_sentences.csv
 - out_preprocess\tri_sentences.jsonl


In [4]:
import csv, os, shutil, subprocess, sys, re
from pathlib import Path
from typing import List, Set, Tuple

# ====== CONFIG ======
IN_SENTENCES = Path("./out_preprocess/tri_sentences.csv")  # from your preprocessing step
WORK_DIR     = Path("./out_align")
LANG_SRC     = "su"  # source for both pairs
LANG_TGT_1   = "id"  # pair 1: su-id
LANG_TGT_2   = "en"  # pair 2: su-en

# fast_align binaries (change if installed elsewhere)
FAST_ALIGN_BIN = shutil.which("fast_align")
ATOOLS_BIN     = shutil.which("atools")   # optional; we also have a Python symmetrizer below

WORK_DIR.mkdir(parents=True, exist_ok=True)

# ====== Helpers ======
def normalize_spaces(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "").strip())

def write_bitext(in_csv: Path, out_bitext_path: Path, src_col: str, tgt_col: str) -> int:
    n = 0
    with in_csv.open("r", encoding="utf-8") as fi, out_bitext_path.open("w", encoding="utf-8") as fo:
        rd = csv.DictReader(fi)
        for row in rd:
            src = normalize_spaces(row.get(src_col, ""))
            tgt = normalize_spaces(row.get(tgt_col, ""))
            # skip empty pairs
            if not src and not tgt:
                continue
            fo.write(f"{src} ||| {tgt}\n")
            n += 1
    return n

def run(cmd: List[str]) -> subprocess.CompletedProcess:
    return subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False)

def parse_alignment_line(line: str) -> Set[Tuple[int,int]]:
    # "0-0 1-2 ..."  -> {(0,0), (1,2), ...}
    pairs = set()
    for tok in line.strip().split():
        if "-" in tok:
            a,b = tok.split("-",1)
            try:
                pairs.add((int(a), int(b)))
            except:
                pass
    return pairs

def format_alignment_line(points: Set[Tuple[int,int]]) -> str:
    # sorted for stability
    return " ".join([f"{i}-{j}" for (i,j) in sorted(points)])

def grow_diag_final(forward: Set[Tuple[int,int]], reverse: Set[Tuple[int,int]],
                    src_len: int, tgt_len: int) -> Set[Tuple[int,int]]:
    """
    Standard grow-diag-final symmetrization (Och & Ney 2003-ish variant).
    - start with intersection
    - grow-diag: iteratively add neighbors present in union if they link an unaligned word
    - final: add remaining points from forward/reverse that link unaligned words
    """
    inter = forward & reverse
    union = forward | reverse
    alignment = set(inter)

    def aligned_src(a): return {i for (i,_) in a}
    def aligned_tgt(a): return {j for (_,j) in a}

    def neighbors(i,j):
        # 8-neighborhood
        for di in (-1,0,1):
            for dj in (-1,0,1):
                if di == 0 and dj == 0: 
                    continue
                ni, nj = i+di, j+dj
                if 0 <= ni < src_len and 0 <= nj < tgt_len:
                    yield (ni, nj)

    added = True
    while added:
        added = False
        A_src = aligned_src(alignment)
        A_tgt = aligned_tgt(alignment)
        cand = set()
        for (i,j) in alignment:
            for nb in neighbors(i,j):
                if nb in union and nb not in alignment:
                    si, tj = nb
                    if si not in A_src or tj not in A_tgt:
                        cand.add(nb)
        if cand:
            alignment |= cand
            added = True

    # final step: add remaining points from forward/reverse that link unaligned words
    A_src = aligned_src(alignment)
    A_tgt = aligned_tgt(alignment)
    for (i,j) in forward:
        if (i not in A_src) or (j not in A_tgt):
            alignment.add((i,j))
    A_src = aligned_src(alignment)
    A_tgt = aligned_tgt(alignment)
    for (i,j) in reverse:
        if (i not in A_src) or (j not in A_tgt):
            alignment.add((i,j))

    return alignment

def count_tokens_pair(line: str) -> Tuple[int,int]:
    # from "src ||| tgt" count tokens by whitespace
    src, tgt = line.split("|||")
    s = len(src.strip().split())
    t = len(tgt.strip().split())
    return s, t

# ====== Prepare bitext for both pairs ======
su_id_bitext = WORK_DIR / f"{LANG_SRC}_{LANG_TGT_1}.bitext"
su_en_bitext = WORK_DIR / f"{LANG_SRC}_{LANG_TGT_2}.bitext"

n1 = write_bitext(IN_SENTENCES, su_id_bitext, "su_sentence", "id_sentence")
n2 = write_bitext(IN_SENTENCES, su_en_bitext, "su_sentence", "en_sentence")
print(f"Wrote bitext:\n - {su_id_bitext} ({n1} lines)\n - {su_en_bitext} ({n2} lines)")

# ====== Align a single bitext with fast_align, returning (fwd, rev, symm) paths ======
def align_pair(bitext_path: Path, pair_tag: str):
    fwd_out = WORK_DIR / f"{pair_tag}.fwd.align"
    rev_out = WORK_DIR / f"{pair_tag}.rev.align"
    sym_out = WORK_DIR / f"{pair_tag}.sym.gdf"

    # Detect fast_align
    if FAST_ALIGN_BIN is None:
        print("\n[WARN] fast_align not found on PATH. Skipping execution.")
        print("You can run these commands locally once fast_align is installed:\n")
        print(f"  {FAST_ALIGN_BIN or 'fast_align'} -i {bitext_path} -d -o -v > {fwd_out}")
        print(f"  {FAST_ALIGN_BIN or 'fast_align'} -i {bitext_path} -d -o -v -r > {rev_out}")
        if ATOOLS_BIN:
            print(f"  {ATOOLS_BIN} -i {fwd_out} -j {rev_out} -c grow-diag-final-and > {sym_out}")
        else:
            print("  (Then symmetrize with grow-diag-final using the Python code in this cell.)")
        return None, None, None

    # Run forward
    cmd_fwd = [FAST_ALIGN_BIN, "-i", str(bitext_path), "-d", "-o", "-v"]
    res_fwd = run(cmd_fwd)
    if res_fwd.returncode != 0:
        print(res_fwd.stderr)
        raise RuntimeError("fast_align forward failed.")
    fwd_out.write_text(res_fwd.stdout, encoding="utf-8")

    # Run reverse
    cmd_rev = [FAST_ALIGN_BIN, "-i", str(bitext_path), "-d", "-o", "-v", "-r"]
    res_rev = run(cmd_rev)
    if res_rev.returncode != 0:
        print(res_rev.stderr)
        raise RuntimeError("fast_align reverse failed.")
    rev_out.write_text(res_rev.stdout, encoding="utf-8")

    # Symmetrize
    if ATOOLS_BIN is not None:
        cmd_sym = [ATOOLS_BIN, "-i", str(fwd_out), "-j", str(rev_out), "-c", "grow-diag-final-and"]
        res_sym = run(cmd_sym)
        if res_sym.returncode != 0:
            print(res_sym.stderr)
            raise RuntimeError("atools symmetrization failed.")
        sym_out.write_text(res_sym.stdout, encoding="utf-8")
    else:
        # Python grow-diag-final
        sym_lines = []
        with bitext_path.open("r", encoding="utf-8") as ftxt, \
             fwd_out.open("r", encoding="utf-8") as ff, \
             rev_out.open("r", encoding="utf-8") as fr:
            for src_tgt, lf, lr in zip(ftxt, ff, fr):
                src_len, tgt_len = count_tokens_pair(src_tgt)
                F = parse_alignment_line(lf)
                R = parse_alignment_line(lr)
                G = grow_diag_final(F, R, src_len, tgt_len)
                sym_lines.append(format_alignment_line(G))
        sym_out.write_text("\n".join(sym_lines) + "\n", encoding="utf-8")

    print(f"[{pair_tag}] forward -> {fwd_out.name} | reverse -> {rev_out.name} | symm -> {sym_out.name}")
    return fwd_out, rev_out, sym_out

# ====== Run alignments for both pairs ======
fwd1, rev1, sym1 = align_pair(su_id_bitext, f"{LANG_SRC}_{LANG_TGT_1}")
fwd2, rev2, sym2 = align_pair(su_en_bitext, f"{LANG_SRC}_{LANG_TGT_2}")

# ====== Attach symmetrized links back to the sentence CSV ======
enriched_csv = WORK_DIR / "tri_sentences_with_align.csv"
if sym1 is not None and sym2 is not None:
    # Read all alignments into memory
    su_id_links = sym1.read_text(encoding="utf-8").splitlines()
    su_en_links = sym2.read_text(encoding="utf-8").splitlines()

    with IN_SENTENCES.open("r", encoding="utf-8") as fi, enriched_csv.open("w", newline="", encoding="utf-8") as fo:
        rd = csv.DictReader(fi)
        out_fields = rd.fieldnames + ["su_id_align", "su_en_align"]
        wr = csv.DictWriter(fo, fieldnames=out_fields)
        wr.writeheader()

        i = 0
        for row in rd:
            # Only increment when we actually wrote a pair (same condition as write_bitext)
            src_ok = normalize_spaces(row.get("su_sentence",""))
            id_ok  = normalize_spaces(row.get("id_sentence",""))
            en_ok  = normalize_spaces(row.get("en_sentence",""))
            wrote_pair1 = bool(src_ok or id_ok)
            wrote_pair2 = bool(src_ok or en_ok)

            su_id_a = su_id_links[i] if wrote_pair1 and i < len(su_id_links) else ""
            su_en_a = su_en_links[i] if wrote_pair2 and i < len(su_en_links) else ""

            if wrote_pair1 or wrote_pair2:
                i += 1

            row["su_id_align"] = su_id_a
            row["su_en_align"] = su_en_a
            wr.writerow(row)

    print(f"Enriched CSV with alignments:\n - {enriched_csv}")
else:
    print("\n[INFO] fast_align not executed. Bitext files are ready; run fast_align manually,")
    print("then re-run the bottom block of this cell to attach alignments back into the CSV.")
    print(f"Bitext files:\n - {su_id_bitext}\n - {su_en_bitext}")


Wrote bitext:
 - out_align\su_id.bitext (4583 lines)
 - out_align\su_en.bitext (4803 lines)

[WARN] fast_align not found on PATH. Skipping execution.
You can run these commands locally once fast_align is installed:

  fast_align -i out_align\su_id.bitext -d -o -v > out_align\su_id.fwd.align
  fast_align -i out_align\su_id.bitext -d -o -v -r > out_align\su_id.rev.align
  (Then symmetrize with grow-diag-final using the Python code in this cell.)

[WARN] fast_align not found on PATH. Skipping execution.
You can run these commands locally once fast_align is installed:

  fast_align -i out_align\su_en.bitext -d -o -v > out_align\su_en.fwd.align
  fast_align -i out_align\su_en.bitext -d -o -v -r > out_align\su_en.rev.align
  (Then symmetrize with grow-diag-final using the Python code in this cell.)

[INFO] fast_align not executed. Bitext files are ready; run fast_align manually,
then re-run the bottom block of this cell to attach alignments back into the CSV.
Bitext files:
 - out_align\su_id

In [5]:
import csv
from pathlib import Path
from collections import Counter

# ====== CONFIG ======
IN_FILE = Path("./out_preprocess/tri_sentences.csv")   # from previous step
OUT_FILE = Path("./out_preprocess/corpus_stats.csv")

# ====== Tokenizer ======
import re
TOKEN_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ']+|[^\w\s]", re.UNICODE)

def tokenize(text: str):
    return [t for t in TOKEN_RE.findall(text or "") if t.strip()]

# ====== Collect stats ======
stats = {
    "su": {"books": set(), "verses": set(), "sentences": 0, "tokens": 0, "vocab": Counter()},
    "id": {"books": set(), "verses": set(), "sentences": 0, "tokens": 0, "vocab": Counter()},
    "en": {"books": set(), "verses": set(), "sentences": 0, "tokens": 0, "vocab": Counter()},
}

with IN_FILE.open("r", encoding="utf-8") as f:
    rd = csv.DictReader(f)
    for row in rd:
        verse_id = row["verse_id"]  # format like "1 Korintus:1:1"
        parts = verse_id.split(":")
        book = parts[0] if len(parts) >= 3 else "Unknown"
        verse = ":".join(parts[:3]) if len(parts) >= 3 else verse_id

        for lang, col in [("su","su_sentence"), ("id","id_sentence"), ("en","en_sentence")]:
            text = row.get(col,"").strip()
            if not text:
                continue
            toks = tokenize(text)
            stats[lang]["books"].add(book)
            stats[lang]["verses"].add(verse)
            stats[lang]["sentences"] += 1
            stats[lang]["tokens"] += len(toks)
            stats[lang]["vocab"].update([t.lower() for t in toks])

# ====== Write summary table ======
rows = []
for lang, st in stats.items():
    rows.append({
        "Language": lang,
        "Books": len(st["books"]),
        "Verses": len(st["verses"]),
        "Sentences": st["sentences"],
        "Tokens": st["tokens"],
        "Vocabulary Size": len(st["vocab"])
    })

with OUT_FILE.open("w", newline="", encoding="utf-8") as f:
    wr = csv.DictWriter(f, fieldnames=rows[0].keys())
    wr.writeheader()
    wr.writerows(rows)

print(f"Saved corpus statistics → {OUT_FILE}")
for r in rows:
    print(r)


Saved corpus statistics → out_preprocess\corpus_stats.csv
{'Language': 'su', 'Books': 66, 'Verses': 2610, 'Sentences': 4249, 'Tokens': 67699, 'Vocabulary Size': 7298}
{'Language': 'id', 'Books': 66, 'Verses': 2809, 'Sentences': 3496, 'Tokens': 72038, 'Vocabulary Size': 5807}
{'Language': 'en', 'Books': 66, 'Verses': 2809, 'Sentences': 3961, 'Tokens': 74430, 'Vocabulary Size': 5878}


In [1]:
from comet import load_from_checkpoint

# replace with the local path where you stored the checkpoint
comet_model = load_from_checkpoint("/path/to/your/comet/checkpoint.ckpt")


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


Exception: Invalid checkpoint path: \path\to\your\comet\checkpoint.ckpt

In [2]:
# MT results table: Unfiltered / Filtered / Weighted BLEU, ChrF, COMET
# Pairs supported: su-id, su-en (source su; target id/en)
# Weighted scores = weighted mean of sentence-level metric by 'weight' column (High=1.0, Medium=0.5, Low=0.0 by our earlier mapping)

import csv, math, re, json
from pathlib import Path
from typing import List, Dict, Tuple
from collections import defaultdict

import pandas as pd

# ======== CONFIG ========
TEST_FLAGGED_CSV = Path("./out_bibtralign/1kor_trilingual_release_flagged_noted.csv")  # or a test-only CSV
# If your flagged CSV has a 'split' column, set USE_SPLIT=True to restrict to split=='test'
USE_SPLIT = False

# Define your systems here:
# Each item: {"pair": "su-id" or "su-en", "system": "Name", "hyp_tsv": "path/to/hyp.tsv"}
# hyp.tsv must have columns: sent_id, hyp
SYSTEMS = [
    # Examples (edit these paths):
    # {"pair": "su-id", "system": "Moses-SMT", "hyp_tsv": "./runs/moses_su_id_test.tsv"},
    # {"pair": "su-id", "system": "Transformer-base", "hyp_tsv": "./runs/transformer_su_id_test.tsv"},
    # {"pair": "su-en", "system": "Transformer-big", "hyp_tsv": "./runs/transformer_su_en_test.tsv"},
]

# Optional: expected test size for sanity print (informational only)
EXPECTED_TEST_SIZE = 1000

# ======== Metric backends ========
# sacrebleu for BLEU/chrF (recommended)
try:
    import sacrebleu
except Exception as e:
    sacrebleu = None
    print("[WARN] sacrebleu not available. Install with: pip install sacrebleu")

# COMET (optional); if not present, we’ll output NaN
COMET_AVAILABLE = False
try:
    from comet import download_model, load_from_checkpoint
    _comet_ckpt = download_model("Unbabel/wmt22-comet-da")  # good default; change if desired
    comet_model = load_from_checkpoint(_comet_ckpt)
    COMET_AVAILABLE = True
except Exception as e:
    print("[INFO] COMET not available (or download blocked). We'll fill COMET columns with NaN.")

# ======== Helpers ========
def _lower(s: str) -> str:
    return (s or "").strip()

def load_flagged_test_rows(path: Path, pair: str, use_split: bool) -> pd.DataFrame:
    df = pd.read_csv(path, dtype=str)
    # flexible: columns may be mixed dtypes; cast where needed
    for col in ["weight"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0.0)
        else:
            df[col] = 0.0
    if "flag" not in df.columns:
        df["flag"] = "Medium"

    # choose src/ref by pair
    if pair == "su-id":
        src_col, ref_col = "su_sentence", "id_sentence"
    elif pair == "su-en":
        src_col, ref_col = "su_sentence", "en_sentence"
    else:
        raise ValueError(f"Unsupported pair: {pair}")

    need_cols = {"sent_id", src_col, ref_col, "flag", "weight"}
    missing = [c for c in need_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns in flagged CSV for pair {pair}: {missing}")

    # optionally restrict to split == test
    if use_split and "split" in df.columns:
        df = df[df["split"].astype(str).str.lower().eq("test")].copy()

    # keep only rows with a reference (non-empty)
    df = df[df[ref_col].astype(str).str.strip().ne("")].copy()

    # trim strings
    df["sent_id"] = df["sent_id"].astype(str).str.strip()
    df["src"] = df[src_col].astype(str).str.strip()
    df["ref"] = df[ref_col].astype(str).str.strip()

    # sanity
    if EXPECTED_TEST_SIZE:
        print(f"[{pair}] Loaded {len(df)} test rows (expected ~{EXPECTED_TEST_SIZE}).")

    return df[["sent_id","src","ref","flag","weight"]].copy()

def load_hyp_tsv(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, sep="\t", dtype=str)
    # allow comma-separated too
    if "sent_id" not in df.columns or "hyp" not in df.columns:
        # try CSV fallback
        df = pd.read_csv(path, dtype=str)
    assert "sent_id" in df.columns and "hyp" in df.columns, f"{path} must have columns: sent_id, hyp"
    df["sent_id"] = df["sent_id"].astype(str).str.strip()
    df["hyp"] = df["hyp"].astype(str).str.strip()
    return df[["sent_id","hyp"]].copy()

def join_test_hyp(test_df: pd.DataFrame, hyp_df: pd.DataFrame) -> pd.DataFrame:
    merged = test_df.merge(hyp_df, on="sent_id", how="inner")
    missing = len(test_df) - len(merged)
    if missing:
        print(f"[WARN] {missing} test rows had no hypothesis and were dropped after join.")
    return merged

def compute_bleu_chrf_lists(refs: List[str], hyps: List[str]) -> Tuple[float, float, List[float], List[float]]:
    """Return corpus BLEU/chrF and sentence-level lists for weighted averaging."""
    if sacrebleu is None:
        return float("nan"), float("nan"), [], []
    # corpus
    bleu_corpus = sacrebleu.corpus_bleu(hyps, [refs]).score
    chrf_corpus = sacrebleu.corpus_chrf(hyps, [refs]).score
    # sentence-level (for weighted)
    bleu_sent, chrf_sent = [], []
    for h, r in zip(hyps, refs):
        bleu_sent.append(sacrebleu.sentence_bleu(h, [r]).score)
        chrf_sent.append(sacrebleu.sentence_chrf(h, [r]).score)
    return bleu_corpus, chrf_corpus, bleu_sent, chrf_sent

def weighted_mean(scores: List[float], weights: List[float]) -> float:
    num = sum(s*w for s, w in zip(scores, weights))
    den = sum(weights)
    return (num/den) if den > 0 else float("nan")

def compute_comet(srcs: List[str], hyps: List[str], refs: List[str]) -> Tuple[float, List[float]]:
    """Return corpus mean COMET and sentence-level scores; NaN if COMET unavailable."""
    if not COMET_AVAILABLE:
        return float("nan"), []
    data = [{"src": s, "mt": h, "ref": r} for s, h, r in zip(srcs, hyps, refs)]
    out = comet_model.predict(data, batch_size=32, gpus=0)
    sent_scores = out["scores"]
    corpus = float(sum(sent_scores) / max(1, len(sent_scores)))
    return corpus, sent_scores

def evaluate_split(df: pd.DataFrame, use_filter: bool, use_weighted: bool) -> Dict[str, float]:
    """
    df columns: sent_id, src, ref, flag, weight, hyp
    - Unfiltered: use all rows
    - Filtered: use only flag=='High'
    - Weighted: weight column over sentence-level metrics (not true corpus BLEU)
    """
    if use_filter:
        df = df[df["flag"].astype(str).str.lower().eq("high")].copy()
    if len(df) == 0:
        return {"BLEU": float("nan"), "ChrF": float("nan"), "COMET": float("nan")}

    refs = df["ref"].tolist()
    hyps = df["hyp"].tolist()
    srcs = df["src"].tolist()
    weights = df["weight"].astype(float).tolist()

    bleu_corpus, chrf_corpus, bleu_sent, chrf_sent = compute_bleu_chrf_lists(refs, hyps)
    comet_corpus, comet_sent = compute_comet(srcs, hyps, refs)

    if use_weighted:
        # weighted average of sentence-level scores
        bleu = weighted_mean(bleu_sent, weights)
        chrf = weighted_mean(chrf_sent, weights)
        comet = weighted_mean(comet_sent, weights) if comet_sent else float("nan")
    else:
        # corpus scores
        bleu, chrf, comet = bleu_corpus, chrf_corpus, comet_corpus

    return {"BLEU": bleu, "ChrF": chrf, "COMET": comet}

# ======== Main: build the table ========
rows_out = []

for spec in SYSTEMS:
    pair = spec["pair"].strip().lower()
    system_name = spec["system"]
    hyp_path = Path(spec["hyp_tsv"])

    # Load test rows for pair, then join hypothesis
    test_df = load_flagged_test_rows(TEST_FLAGGED_CSV, pair, USE_SPLIT)
    hyp_df = load_hyp_tsv(hyp_path)
    merged = join_test_hyp(test_df, hyp_df)

    # Unfiltered / Filtered / Weighted
    unfiltered = evaluate_split(merged, use_filter=False, use_weighted=False)
    filtered   = evaluate_split(merged, use_filter=True,  use_weighted=False)
    weighted   = evaluate_split(merged, use_filter=False, use_weighted=True)

    rows_out.append({
        "Pair": pair,
        "System": system_name,
        "Unfiltered BLEU": round(unfiltered["BLEU"], 2) if pd.notna(unfiltered["BLEU"]) else float("nan"),
        "Filtered BLEU":   round(filtered["BLEU"],   2) if pd.notna(filtered["BLEU"])   else float("nan"),
        "Weighted BLEU":   round(weighted["BLEU"],   2) if pd.notna(weighted["BLEU"])   else float("nan"),
        "Unfiltered ChrF": round(unfiltered["ChrF"], 2) if pd.notna(unfiltered["ChrF"]) else float("nan"),
        "Filtered ChrF":   round(filtered["ChrF"],   2) if pd.notna(filtered["ChrF"])   else float("nan"),
        "Weighted ChrF":   round(weighted["ChrF"],   2) if pd.notna(weighted["ChrF"])   else float("nan"),
        "Unfiltered COMET":round(unfiltered["COMET"],3) if pd.notna(unfiltered["COMET"])else float("nan"),
        "Filtered COMET":  round(filtered["COMET"],  3) if pd.notna(filtered["COMET"])  else float("nan"),
        "Weighted COMET":  round(weighted["COMET"],  3) if pd.notna(weighted["COMET"])  else float("nan"),
    })

# Output table
df_out = pd.DataFrame(rows_out, columns=[
    "Pair","System",
    "Unfiltered BLEU","Filtered BLEU","Weighted BLEU",
    "Unfiltered ChrF","Filtered ChrF","Weighted ChrF",
    "Unfiltered COMET","Filtered COMET","Weighted COMET",
])

save_path = Path("./out_eval/mt_results_table.csv")
save_path.parent.mkdir(parents=True, exist_ok=True)
df_out.to_csv(save_path, index=False, encoding="utf-8")
display(df_out)
print(f"\nSaved results table → {save_path}")


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


.gitattributes: 0.00B [00:00, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


LICENSE: 0.00B [00:00, ?B/s]

model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jtkacer01\.cache\huggingface\hub\models--Unbabel--wmt22-comet-da\snapshots\2760a223ac957f30acfb18c8aa649b01cf1d75f2\checkpoints\model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Encoder model frozen.
C:\Users\jtkacer01\anaconda3\Lib\site-packages\pytorch_lightning\core\saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


Unnamed: 0,Pair,System,Unfiltered BLEU,Filtered BLEU,Weighted BLEU,Unfiltered ChrF,Filtered ChrF,Weighted ChrF,Unfiltered COMET,Filtered COMET,Weighted COMET



Saved results table → out_eval\mt_results_table.csv


In [1]:
# Python-only alignment and quality flags using multilingual sentence embeddings (LaBSE fallback)
# - Aligns within each verse: su-sentences ↔ id-sentences and su-sentences ↔ en-sentences
# - Greedy max-sim matching with threshold
# - Adds heuristic features (length ratio, punctuation diff, NE overlap) and flags High/Medium/Low
# - Writes: su_id_emb_align.csv, su_en_emb_align.csv, tri_sentences_with_embflag.csv

import os, re, math, csv
from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd

# ======== CONFIG ========
IN_SENT = Path("./out_preprocess/tri_sentences.csv")
OUT_DIR = Path("./out_align_py")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Embedding model preference list (first available will be used)
# LaBSE is best; the MiniLM model is small and downloads fast if needed.
PREFERRED_MODELS = [
    "sentence-transformers/LaBSE",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
]

DEVICE = "cpu"            # "cuda" if you have a GPU
BATCH_SIZE = 64
SIM_THRESH_HIGH = 0.80    # cosine similarity threshold for "High"
SIM_THRESH_MED  = 0.60    # threshold for "Medium" (between MED and HIGH)
LEN_GOOD_SUID   = 0.70    # length ratio good su↔id
LEN_GOOD_SUEN   = 0.60    # length ratio good su↔en

# ======== Tokenization / helpers ========
TOKEN_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ0-9']+|[^\w\s]", re.UNICODE)

def tokenize(s: str):
    return [t for t in TOKEN_RE.findall(s or "") if t.strip()]

def token_len(s: str) -> int:
    return len(tokenize(s))

def punct_profile(s: str):
    return (s.count(","), s.count(";"), s.count(":"), s.count("?"), s.count("!"))

NAME_VARIANTS = [
    {"abraham","ibrahim"},
    {"isaac","ishak","ishaq"},
    {"yakub","yakob","jacob"},
    {"musa","moses"},
    {"daud","david"},
    {"yohanes","john"},
    {"matheus","mateus","matthew"},
    {"yesus","jesus"},
    {"petros","petrus","peter"},
]

def name_set_loose(s: str):
    toks = set(t.lower() for t in tokenize(s) if t.isalpha())
    present = set()
    for i, group in enumerate(NAME_VARIANTS):
        if toks & group:
            present.add(i)
    return present

def ne_overlap(su: str, tgt: str) -> float:
    gs, gt = name_set_loose(su), name_set_loose(tgt)
    if not gs and not gt:
        return 1.0
    if not gs or not gt:
        return 0.0
    inter = len(gs & gt)
    union = len(gs | gt)
    return inter / max(1, union)

def len_ratio(a: str, b: str) -> float:
    la, lb = token_len(a), token_len(b)
    if la == 0 or lb == 0: return 0.0
    return min(la, lb) / max(la, lb)

def punct_diff(su: str, tgt: str) -> int:
    ps, pt = punct_profile(su), punct_profile(tgt)
    return sum(abs(a-b) for a,b in zip(ps, pt))

def flag_from_metrics(sim: float, lratio: float, p_diff: int, ne_ov: float, pair: str) -> str:
    # Pair-specific good length threshold
    len_good = LEN_GOOD_SUID if pair == "su-id" else LEN_GOOD_SUEN
    if sim >= SIM_THRESH_HIGH and lratio >= len_good and p_diff <= 1 and ne_ov >= 0.8:
        return "High"
    if sim >= SIM_THRESH_MED and lratio >= 0.5 and p_diff <= 2 and ne_ov >= 0.3:
        return "Medium"
    return "Low"

# ======== Embedding backend ========
def load_embedder():
    global st_model
    st_model = None
    err = None
    try:
        from sentence_transformers import SentenceTransformer
    except Exception as e:
        raise RuntimeError("Please install sentence-transformers: pip install sentence-transformers") from e

    for m in PREFERRED_MODELS:
        try:
            st_model = SentenceTransformer(m, device=DEVICE)
            print(f"[INFO] Loaded model: {m}")
            return st_model
        except Exception as e:
            err = e
            continue
    raise RuntimeError(f"Could not load any embedding model from {PREFERRED_MODELS}. Error: {err}")

def embed_texts(texts: list) -> np.ndarray:
    # sentence-transformers returns numpy array
    if not texts:
        return np.zeros((0, 768), dtype="float32")
    embs = st_model.encode(texts, batch_size=BATCH_SIZE, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True)
    return embs

def cosine_sim_matrix(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    # A, B are L2-normalized; cosine = dot
    return np.matmul(A, B.T)

def greedy_max_match(sim: np.ndarray, thresh: float) -> list:
    """
    Greedy bipartite matching on similarity matrix.
    Returns list of (i, j, sim_ij) with sim_ij >= thresh. Each row/col matched at most once.
    """
    if sim.size == 0:
        return []
    S = sim.copy()
    pairs = []
    used_r = set()
    used_c = set()
    # Flatten indices sorted by similarity descending
    flat = np.dstack(np.unravel_index(np.argsort(-S, axis=None), S.shape))[0]
    for r,c in flat:
        if r in used_r or c in used_c:
            continue
        s = S[r, c]
        if s < thresh:
            break
        pairs.append((int(r), int(c), float(s)))
        used_r.add(int(r))
        used_c.add(int(c))
    return pairs

# ======== Load sentence data ========
assert IN_SENT.exists(), f"Missing input: {IN_SENT}"
df = pd.read_csv(IN_SENT, dtype=str).fillna("")
need = {"verse_id","sent_idx","su_sentence","id_sentence","en_sentence"}
missing = need - set(df.columns)
if missing:
    raise ValueError(f"tri_sentences.csv must contain columns {sorted(need)}; missing {sorted(missing)}")

# group by verse to align within verse
groups = defaultdict(list)
for row in df.to_dict("records"):
    groups[row["verse_id"]].append(row)

# ======== Load model ========
st_model = load_embedder()

# ======== Align function per pair ========
def align_pair(pair: str, src_col: str, tgt_col: str, sim_thresh: float):
    align_rows = []  # per-link table
    per_row_flags = {}  # sent_id -> flag for best aligned target (only for reporting back)

    for verse_id, rows in groups.items():
        # collect per-verse sentence lists
        src_items = [(int(r["sent_idx"]), r[src_col]) for r in rows if r[src_col].strip()]
        tgt_items = [(int(r["sent_idx"]), r[tgt_col]) for r in rows if r[tgt_col].strip()]
        if not src_items or not tgt_items:
            continue

        src_idx, src_texts = zip(*src_items)
        tgt_idx, tgt_texts = zip(*tgt_items)

        # embeddings + sim matrix
        E_src = embed_texts(list(src_texts))
        E_tgt = embed_texts(list(tgt_texts))
        S = cosine_sim_matrix(E_src, E_tgt)

        # greedy matching above minimum sim threshold (use MED)
        pairs = greedy_max_match(S, SIM_THRESH_MED)

        # record links (and compute heuristics)
        for (i_local, j_local, sim_val) in pairs:
            i = src_idx[i_local]
            j = tgt_idx[j_local]
            su = src_texts[i_local]  # src is su_sentence by design
            tg = tgt_texts[j_local]

            lr = len_ratio(su, tg)
            pdiff = punct_diff(su, tg)
            neo = ne_overlap(su, tg)
            flag = flag_from_metrics(sim_val, lr, pdiff, neo, pair)

            align_rows.append({
                "verse_id": verse_id,
                "pair": pair,
                "su_idx": i,
                "tgt_idx": j,
                "su_text": su,
                "tgt_text": tg,
                "sim": round(sim_val, 4),
                "len_ratio": round(lr, 3),
                "punct_diff": int(pdiff),
                "ne_overlap": round(neo, 3),
                "flag": flag
            })

            # set per-row best flag back to sentence id (we keep max by sim)
            sent_id = f"{verse_id}#{i}"
            if sent_id not in per_row_flags or sim_val > per_row_flags[sent_id][0]:
                per_row_flags[sent_id] = (sim_val, flag)

    # Save link table
    out_links = OUT_DIR / f"{pair.replace('-','_')}_emb_align.csv"
    pd.DataFrame(align_rows,
                 columns=["verse_id","pair","su_idx","tgt_idx","su_text","tgt_text","sim","len_ratio","punct_diff","ne_overlap","flag"])\
      .to_csv(out_links, index=False, encoding="utf-8")

    print(f"[{pair}] Wrote {len(align_rows)} links → {out_links}")
    return per_row_flags

# ======== Run for both pairs ========
su_id_flags = align_pair("su-id", "su_sentence", "id_sentence", SIM_THRESH_MED)
su_en_flags = align_pair("su-en", "su_sentence", "en_sentence", SIM_THRESH_MED)

# ======== Attach per-row flags back to tri_sentences ========
def best_flag(sent_id: str, mapping: dict):
    if sent_id in mapping:
        return mapping[sent_id][1]
    return ""

df["sent_id"] = df["verse_id"].astype(str) + "#" + df["sent_idx"].astype(str)
df["flag_su_id_emb"] = df["sent_id"].map(lambda x: best_flag(x, su_id_flags))
df["flag_su_en_emb"] = df["sent_id"].map(lambda x: best_flag(x, su_en_flags))

out_sent = OUT_DIR / "tri_sentences_with_embflag.csv"
df.to_csv(out_sent, index=False, encoding="utf-8")
print(f"Wrote per-row flags → {out_sent}")

# Small summary
for col in ["flag_su_id_emb","flag_su_en_emb"]:
    vc = Counter(df[col].fillna("").tolist())
    print(f"{col} counts:", dict(vc))


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

[INFO] Loaded model: sentence-transformers/LaBSE
[su-id] Wrote 1803 links → out_align_py\su_id_emb_align.csv
[su-en] Wrote 1467 links → out_align_py\su_en_emb_align.csv
Wrote per-row flags → out_align_py\tri_sentences_with_embflag.csv
flag_su_id_emb counts: {'High': 276, 'Medium': 1097, '': 3072, 'Low': 430}
flag_su_en_emb counts: {'High': 211, 'Low': 234, 'Medium': 1022, '': 3408}


In [3]:
import csv
from pathlib import Path
import pandas as pd

# ========= CONFIG =========
IN_SENT_FLAGS = Path("./out_align_py/tri_sentences_with_embflag.csv")
OUT_DIR       = Path("./out_eval_ready")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# How to combine multiple sources of weights (if you later add others):
AGGREGATION = "identity"  # one of: "identity", "max", "mean", "min"
# If you want to produce a deterministic Test split:
CREATE_SPLIT = True
TEST_SIZE    = 1000       # set None to skip rebalancing by size

# Weight mapping
WEIGHT_MAP = {"High": 1.0, "Medium": 0.5, "Low": 0.0, "": 0.0}

# ========= Load =========
assert IN_SENT_FLAGS.exists(), f"Missing: {IN_SENT_FLAGS}"
df = pd.read_csv(IN_SENT_FLAGS, dtype=str).fillna("")

needed_cols = {
    "verse_id","sent_idx","sent_id",
    "su_sentence","id_sentence","en_sentence",
    "flag_su_id_emb","flag_su_en_emb"
}
missing = needed_cols - set(df.columns)
if missing:
    raise ValueError(f"Input must include columns {sorted(needed_cols)}; missing: {sorted(missing)}")

# ========= Map flags -> weights =========
def to_weight(flag: str) -> float:
    return WEIGHT_MAP.get(str(flag).strip(), 0.0)

df["weight_su_id_emb"] = df["flag_su_id_emb"].map(to_weight)
df["weight_su_en_emb"] = df["flag_su_en_emb"].map(to_weight)

# If you ever want a single "weight" column aggregated from multiple sources, choose here:
def agg_weights(row, pair: str):
    if pair == "su-id":
        vals = [row["weight_su_id_emb"]]
    elif pair == "su-en":
        vals = [row["weight_su_en_emb"]]
    else:
        vals = []
    if not vals:
        return 0.0
    if AGGREGATION == "max":
        return max(vals)
    if AGGREGATION == "mean":
        return float(sum(vals) / len(vals))
    if AGGREGATION == "min":
        return min(vals)
    # identity → the one we already chose per pair
    return vals[0]

# ========= Build pair-specific eval tables =========
def build_eval_pair(df_src: pd.DataFrame, pair: str) -> pd.DataFrame:
    if pair == "su-id":
        tgt_col = "id_sentence"
        flag_col = "flag_su_id_emb"
        wcol = "weight_su_id_emb"
    elif pair == "su-en":
        tgt_col = "en_sentence"
        flag_col = "flag_su_en_emb"
        wcol = "weight_su_en_emb"
    else:
        raise ValueError("pair must be 'su-id' or 'su-en'")

    sub = df_src[["sent_id","su_sentence",tgt_col,flag_col,wcol]].copy()
    sub.columns = ["sent_id","src","ref","flag","weight"]

    # ensure string flags
    sub["flag"] = sub["flag"].astype(str)

    if CREATE_SPLIT:
        # map flags to rank: High > Medium > Low
        sub["order_key"] = sub["flag"].map({"High": 2, "Medium": 1, "Low": 0}).fillna(0).astype(float)
        # sort by the **column name**, not the Series object
        sub = sub.sort_values(by=["order_key", "sent_id"], ascending=[False, True]).reset_index(drop=True)

        if TEST_SIZE is not None:
            TEST_SIZE_INT = int(TEST_SIZE)
            sub["split"] = ["test" if i < TEST_SIZE_INT else "train" for i in range(len(sub))]
        else:
            sub["split"] = "test"
        # optional: drop helper column
        sub = sub.drop(columns=["order_key"])
    else:
        sub["split"] = ""

    return sub

eval_su_id = build_eval_pair(df, "su-id")
eval_su_en = build_eval_pair(df, "su-en")

out_id = OUT_DIR / "eval_test_su_id.csv"
out_en = OUT_DIR / "eval_test_su_en.csv"
eval_su_id.to_csv(out_id, index=False, encoding="utf-8")
eval_su_en.to_csv(out_en, index=False, encoding="utf-8")

print("Wrote:")
print(" -", out_id)
print(" -", out_en)

# Show quick summaries
for name, sub in [("su-id", eval_su_id), ("su-en", eval_su_en)]:
    cnt = len(sub)
    test_cnt = (sub["split"] == "test").sum()
    flag_counts = sub["flag"].value_counts().to_dict()
    print(f"[{name}] rows={cnt}, test={test_cnt}, flags={flag_counts}")

# ========= (Optional) Tiny helper to create a 'sent_id,hyp' TSV template for systems =========
# This lets you hand a test list to your MT system to preserve sent_id ordering.
template_id = OUT_DIR / "test_template_su_id.tsv"
template_en = OUT_DIR / "test_template_su_en.tsv"
eval_su_id.loc[eval_su_id["split"].eq("test"), ["sent_id"]].assign(hyp="").to_csv(template_id, sep="\t", index=False)
eval_su_en.loc[eval_su_en["split"].eq("test"), ["sent_id"]].assign(hyp="").to_csv(template_en, sep="\t", index=False)
print("Templates written:")
print(" -", template_id)
print(" -", template_en)


Wrote:
 - out_eval_ready\eval_test_su_id.csv
 - out_eval_ready\eval_test_su_en.csv
[su-id] rows=4875, test=1000, flags={'': 3072, 'Medium': 1097, 'Low': 430, 'High': 276}
[su-en] rows=4875, test=1000, flags={'': 3408, 'Medium': 1022, 'Low': 234, 'High': 211}
Templates written:
 - out_eval_ready\test_template_su_id.tsv
 - out_eval_ready\test_template_su_en.tsv


In [2]:
pip install transformers sentencepiece accelerate torch --upgrade




ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\jtkacer01\\anaconda3\\Lib\\site-packages\\~orch\\lib\\asmjit.dll'
Consider using the `--user` option or check the permissions.




Collecting accelerate
  Obtaining dependency information for accelerate from https://files.pythonhosted.org/packages/5f/a0/d9ef19f780f319c21ee90ecfef4431cbeeca95bec7f14071785c17b6029b/accelerate-1.10.1-py3-none-any.whl.metadata
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/84/57/2f64161769610cf6b1c5ed782bd8a780e18a3c9d48931319f2887fa9d0b1/torch-2.8.0-cp311-cp311-win_amd64.whl.metadata
  Downloading torch-2.8.0-cp311-cp311-win_amd64.whl.metadata (30 kB)
Downloading accelerate-1.10.1-py3-none-any.whl (374 kB)
   ---------------------------------------- 0.0/374.9 kB ? eta -:--:--
   --------- ------------------------------ 92.2/374.9 kB 2.6 MB/s eta 0:00:01
   ---------------------------------------- 374.9/374.9 kB 4.7 MB/s eta 0:00:00
Downloading torch-2.8.0-cp311-cp311-win_amd64.whl (241.4 MB)
   ---------------------------------------- 0.0/241.4 MB ? eta -:--:

In [None]:
import pandas as pd
from pathlib import Path
from typing import List

# ===== INPUTS YOU ALREADY HAVE =====
EVAL_SU_ID = Path("out_eval_ready/eval_test_su_id.csv")     # has sent_id, src (Sundanese), ref (Indonesian), split
EVAL_SU_EN = Path("out_eval_ready/eval_test_su_en.csv")     # has sent_id, src (Sundanese), ref (English), split
TPL_SU_ID  = Path("out_eval_ready/test_template_su_id.tsv") # has sent_id
TPL_SU_EN  = Path("out_eval_ready/test_template_su_en.tsv") # has sent_id

# ===== OUTPUTS (for SYSTEMS) =====
OUT_DIR = Path("./runs"); OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_TSV_SU_ID = OUT_DIR / "nllb_su_id_test.tsv"
OUT_TSV_SU_EN = OUT_DIR / "nllb_su_en_test.tsv"

# ===== Translator setup (NLLB-200 distilled) =====
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# smaller/faster than full 3.3B:

# ===== NLLB setup (fixed) =====
MODEL_NAME = "facebook/nllb-200-distilled-600M"
print("Loading model:", MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

DEVICE = "cpu"
model = model.to(DEVICE)

# NLLB language codes
SRC = "sun_Latn"   # Sundanese (Latin)
TGT_ID = "ind_Latn"
TGT_EN = "eng_Latn"

# IMPORTANT: set source lang *on the tokenizer* (don't pass as kwarg to __call__)
tokenizer.src_lang = SRC

def _bos_id(tgt_code: str) -> int:
    # Newer tokenizer versions have lang_code_to_id
    if hasattr(tokenizer, "lang_code_to_id") and tgt_code in tokenizer.lang_code_to_id:
        return tokenizer.lang_code_to_id[tgt_code]
    # Fallback (older behavior)
    return tokenizer.convert_tokens_to_ids(tgt_code)

def translate_batch(texts, tgt_code: str, max_new_tokens=128, batch_size=32, num_beams=4):
    out = []
    bos_id = _bos_id(tgt_code)
    for i in range(0, len(texts), batch_size):
        chunk = [t if isinstance(t, str) else "" for t in texts[i:i+batch_size]]
        enc = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        enc = {k: v.to(DEVICE) for k, v in enc.items()}
        gen = model.generate(
            **enc,
            forced_bos_token_id=bos_id,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams
        )
        out.extend(tokenizer.batch_decode(gen, skip_special_tokens=True))
    return out

def build_system_tsv(eval_csv: Path, tpl_tsv: Path, tgt_code: str, out_tsv: Path):
    # Join template with eval to recover source sentences in the right order
    tpl = pd.read_csv(tpl_tsv, sep="\t", dtype=str)
    eva = pd.read_csv(eval_csv, dtype=str)
    assert {"sent_id","src"}.issubset(eva.columns), f"{eval_csv} must have sent_id, src"
    df = tpl.merge(eva[["sent_id","src"]], on="sent_id", how="left")
    assert df["src"].notna().all(), "Some sent_id from template not found in eval CSV."

    hyps = translate_batch(df["src"].tolist(), tgt_code=tgt_code, max_new_tokens=128, batch_size=32)
    df["hyp"] = hyps
    df[["sent_id","hyp"]].to_csv(out_tsv, sep="\t", index=False, encoding="utf-8")
    print("Wrote →", out_tsv)

# Build both pairs
build_system_tsv(EVAL_SU_ID, TPL_SU_ID, TGT_ID, OUT_TSV_SU_ID)  # su→id
build_system_tsv(EVAL_SU_EN, TPL_SU_EN, TGT_EN, OUT_TSV_SU_EN)  # su→en


Loading model: facebook/nllb-200-distilled-600M


In [1]:
# MT results table with COMET→BERTScore fallback.
# Inputs:
#   - TEST_FLAGGED_CSV: CSV with columns [sent_id, src, ref, flag, weight, split] (use your eval_test_* files)
#   - SYSTEMS: list of dicts {"pair": "su-id"/"su-en", "system": "...", "hyp_tsv": "..."} (sent_id, hyp)

import csv, math
from pathlib import Path
from typing import List, Dict, Tuple
import pandas as pd

# ======== CONFIG ========
# Point this to the eval file you generated for EACH pair before running for that pair:
TEST_FLAGGED_CSV = Path("./out_eval_ready/eval_test_su_id.csv")  # or eval_test_su_en.csv
USE_SPLIT = True  # use only split == test

SYSTEMS = [
    # Fill with your systems (examples below); you can run this cell multiple times for different sets/pairs.
    # {"pair": "su-id", "system": "Transformer-base", "hyp_tsv": "./runs/transformer_su_id_test.tsv"},
    # {"pair": "su-id", "system": "Moses-SMT",        "hyp_tsv": "./runs/moses_su_id_test.tsv"},
    # {"pair": "su-en", "system": "mBART50",          "hyp_tsv": "./runs/mbart50_su_en_test.tsv"},
]

EXPECTED_TEST_SIZE = 1000   # informational print only

# ======== Metric backends ========
# BLEU/ChrF (sacrebleu)
try:
    import sacrebleu
except Exception as e:
    sacrebleu = None
    print("[WARN] sacrebleu not available. Install with: pip install sacrebleu")

# Try COMET first
COMET_AVAILABLE = False
try:
    from comet import download_model, load_from_checkpoint
    _ckpt = download_model("Unbabel/wmt22-comet-da")
    comet_model = load_from_checkpoint(_ckpt)
    COMET_AVAILABLE = True
    print("[INFO] Using COMET: Unbabel/wmt22-comet-da")
except Exception:
    comet_model = None
    print("[INFO] COMET not available; will try BERTScore.")

# Prepare BERTScore fallback (lazy import inside function)
_BERT_OK = None

def _ensure_bertscore():
    global _BERT_OK, bert_score_score
    if _BERT_OK is not None:
        return _BERT_OK
    try:
        from bert_score import score as bert_score_score
        globals()["bert_score_score"] = bert_score_score
        _BERT_OK = True
    except Exception:
        _BERT_OK = False
    return _BERT_OK

# ======== Helpers ========
def load_flagged_test_rows(path: Path, pair: str, use_split: bool) -> pd.DataFrame:
    df = pd.read_csv(path, dtype=str)
    if "weight" in df.columns:
        df["weight"] = pd.to_numeric(df["weight"], errors="coerce").fillna(0.0)
    else:
        df["weight"] = 0.0
    if "flag" not in df.columns:
        df["flag"] = "Medium"

    # select target column based on pair
    if pair == "su-id":
        # in eval_test_* we already named columns "src","ref"
        pass
    elif pair == "su-en":
        pass
    else:
        raise ValueError(f"Unsupported pair: {pair}")

    need = {"sent_id","src","ref","flag","weight"}
    miss = [c for c in need if c not in df.columns]
    if miss:
        raise ValueError(f"Missing columns in {path}: {miss}")

    if use_split and "split" in df.columns:
        df = df[df["split"].astype(str).str.lower().eq("test")].copy()

    df = df[df["ref"].astype(str).str.strip().ne("")].copy()
    df["sent_id"] = df["sent_id"].astype(str).str.strip()
    df["src"] = df["src"].astype(str).str.strip()
    df["ref"] = df["ref"].astype(str).str.strip()
    if EXPECTED_TEST_SIZE:
        print(f"[{pair}] Loaded {len(df)} test rows (expected ~{EXPECTED_TEST_SIZE}).")
    return df[["sent_id","src","ref","flag","weight"]].copy()

def load_hyp_tsv(path: Path) -> pd.DataFrame:
    # support TSV (preferred) or CSV
    try:
        df = pd.read_csv(path, sep="\t", dtype=str)
    except Exception:
        df = pd.read_csv(path, dtype=str)
    assert "sent_id" in df.columns and "hyp" in df.columns, f"{path} must have columns: sent_id, hyp"
    df["sent_id"] = df["sent_id"].astype(str).str.strip()
    df["hyp"] = df["hyp"].astype(str).str.strip()
    return df[["sent_id","hyp"]].copy()

def join_test_hyp(test_df: pd.DataFrame, hyp_df: pd.DataFrame) -> pd.DataFrame:
    merged = test_df.merge(hyp_df, on="sent_id", how="inner")
    dropped = len(test_df) - len(merged)
    if dropped:
        print(f"[WARN] {dropped} test rows had no hypothesis and were dropped.")
    return merged

def compute_bleu_chrf_lists(refs: List[str], hyps: List[str]) -> Tuple[float, float, List[float], List[float]]:
    if sacrebleu is None:
        return float("nan"), float("nan"), [], []
    bleu_c = sacrebleu.corpus_bleu(hyps, [refs]).score
    chrf_c = sacrebleu.corpus_chrf(hyps, [refs]).score
    bleu_s, chrf_s = [], []
    for h, r in zip(hyps, refs):
        bleu_s.append(sacrebleu.sentence_bleu(h, [r]).score)
        chrf_s.append(sacrebleu.sentence_chrf(h, [r]).score)
    return bleu_c, chrf_c, bleu_s, chrf_s

def weighted_mean(scores: List[float], weights: List[float]) -> float:
    num = sum(s*w for s, w in zip(scores, weights))
    den = sum(weights)
    return (num/den) if den > 0 else float("nan")

def tgt_lang_for_pair(pair: str) -> str:
    # Language code for BERTScore; adjust if you use a different tag set
    return "id" if pair == "su-id" else "en"

def compute_comet_or_bertscore(pair: str, srcs: List[str], hyps: List[str], refs: List[str]) -> Tuple[float, List[float]]:
    # Try COMET first
    if COMET_AVAILABLE and comet_model is not None:
        data = [{"src": s, "mt": h, "ref": r} for s,h,r in zip(srcs, hyps, refs)]
        out = comet_model.predict(data, batch_size=32, gpus=0)
        sent = out["scores"]
        corpus = float(sum(sent) / max(1, len(sent)))
        return corpus, sent

    # Fallback: BERTScore F1
    if _ensure_bertscore():
        lang = tgt_lang_for_pair(pair)
        P, R, F1 = bert_score_score(hyps, refs, lang=lang, verbose=False)
        sent = [float(x) for x in F1.tolist()]
        corpus = float(sum(sent) / max(1, len(sent)))
        return corpus, sent

    # Final fallback
    return float("nan"), []

def evaluate_split(pair: str, df: pd.DataFrame, use_filter: bool, use_weighted: bool) -> Dict[str, float]:
    if use_filter:
        df = df[df["flag"].astype(str).str.lower().eq("high")].copy()
    if len(df) == 0:
        return {"BLEU": float("nan"), "ChrF": float("nan"), "COMET": float("nan")}

    refs = df["ref"].tolist()
    hyps = df["hyp"].tolist()
    srcs = df["src"].tolist()
    weights = df["weight"].astype(float).tolist()

    bleu_c, chrf_c, bleu_s, chrf_s = compute_bleu_chrf_lists(refs, hyps)
    comet_c, comet_s = compute_comet_or_bertscore(pair, srcs, hyps, refs)

    if use_weighted:
        bleu = weighted_mean(bleu_s, weights)
        chrf = weighted_mean(chrf_s, weights)
        comet = weighted_mean(comet_s, weights) if comet_s else float("nan")
    else:
        bleu, chrf, comet = bleu_c, chrf_c, comet_c

    return {"BLEU": bleu, "ChrF": chrf, "COMET": comet}

# ======== Build results table ========
rows_out = []

for spec in SYSTEMS:
    pair = spec["pair"].strip().lower()
    system_name = spec["system"]
    hyp_path = Path(spec["hyp_tsv"])

    test_df = load_flagged_test_rows(TEST_FLAGGED_CSV, pair, USE_SPLIT)
    hyp_df = load_hyp_tsv(hyp_path)
    merged = join_test_hyp(test_df, hyp_df)

    unfiltered = evaluate_split(pair, merged, use_filter=False, use_weighted=False)
    filtered   = evaluate_split(pair, merged, use_filter=True,  use_weighted=False)
    weighted   = evaluate_split(pair, merged, use_filter=False, use_weighted=True)

    rows_out.append({
        "Pair": pair,
        "System": system_name,
        "Unfiltered BLEU": round(unfiltered["BLEU"], 2) if pd.notna(unfiltered["BLEU"]) else float("nan"),
        "Filtered BLEU":   round(filtered["BLEU"],   2) if pd.notna(filtered["BLEU"])   else float("nan"),
        "Weighted BLEU":   round(weighted["BLEU"],   2) if pd.notna(weighted["BLEU"])   else float("nan"),
        "Unfiltered ChrF": round(unfiltered["ChrF"], 2) if pd.notna(unfiltered["ChrF"]) else float("nan"),
        "Filtered ChrF":   round(filtered["ChrF"],   2) if pd.notna(filtered["ChrF"])   else float("nan"),
        "Weighted ChrF":   round(weighted["ChrF"],   2) if pd.notna(weighted["ChrF"])   else float("nan"),
        # These columns are COMET if available, else BERTScore F1
        "Unfiltered COMET":round(unfiltered["COMET"],3) if pd.notna(unfiltered["COMET"])else float("nan"),
        "Filtered COMET":  round(filtered["COMET"],  3) if pd.notna(filtered["COMET"])  else float("nan"),
        "Weighted COMET":  round(weighted["COMET"],  3) if pd.notna(weighted["COMET"])  else float("nan"),
    })

df_out = pd.DataFrame(rows_out, columns=[
    "Pair","System",
    "Unfiltered BLEU","Filtered BLEU","Weighted BLEU",
    "Unfiltered ChrF","Filtered ChrF","Weighted ChrF",
    "Unfiltered COMET","Filtered COMET","Weighted COMET",
])

save_path = Path("./out_eval/mt_results_table.csv")
save_path.parent.mkdir(parents=True, exist_ok=True)
df_out.to_csv(save_path, index=False, encoding="utf-8")
display(df_out)
print(f"\nSaved results table → {save_path}")


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jtkacer01\.cache\huggingface\hub\models--Unbabel--wmt22-comet-da\snapshots\2760a223ac957f30acfb18c8aa649b01cf1d75f2\checkpoints\model.ckpt`
Encoder model frozen.


[INFO] Using COMET: Unbabel/wmt22-comet-da


C:\Users\jtkacer01\anaconda3\Lib\site-packages\pytorch_lightning\core\saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


Unnamed: 0,Pair,System,Unfiltered BLEU,Filtered BLEU,Weighted BLEU,Unfiltered ChrF,Filtered ChrF,Weighted ChrF,Unfiltered COMET,Filtered COMET,Weighted COMET



Saved results table → out_eval\mt_results_table.csv
