# Parallel Corpora

## Imports

In [11]:
from pathlib import Path
import re
from typing import Dict, Iterable, Union
import csv

import pandas as pd
import glob
import os

from itertools import zip_longest

from typing import List, Tuple, Optional

import numpy as np


## By verse
This script provides a reusable pipeline for building parallel corpora of Bible translations aligned by verse, including cases where verse identifiers are merged across languages. For each configured language pair, it loads all verse-aligned files from two input directories, parses them into dictionaries keyed by USFM identifiers (e.g., 1CO.5.12 or merged keys like 1CO.5.12+1CO.5.13), and automatically harmonizes verse boundaries between languages. When one language merges consecutive verses while the other keeps them separate, the script detects the overlap and merges the corresponding verses to maintain alignment. It then constructs a unified, verse-by-verse (or merged-verse) mapping, fills in missing entries with a placeholder token, and exports two synchronized outputs: a plain-text file with interleaved verse pairs and a CSV file containing structured metadata (book, chapter, verse label, and both language texts). The resulting corpus ensures consistent alignment even when translations differ in verse segmentation, making it suitable for multilingual text mining, translation studies, or machine learning applications.

In [19]:
# CONFIGURE: Language pairs
# Each entry names two input directories of verse-aligned .txt files and the
# TXT output path; the CSV is written next to it with the suffix ".csv".
# An optional "missing" key overrides the placeholder token for that pair.
PAIRS = [
    {   # Tagalog-Kapampangan
        "lang1_dir": "../parser/cj/parsed/Tagalog",
        "lang2_dir": "../parser/trish/parsed/Kapampangan",
        "out_txt": "tagalog_kapampangan_verse.txt",
    },
    {   # Tagalog-Bikolano
        "lang1_dir": "../parser/cj/parsed/Tagalog",
        "lang2_dir": "../parser/trish/parsed/Bikolano",
        "out_txt": "tagalog_bikolano_verse.txt",
    },
    {   # Cebuano-Spanish
        "lang1_dir": "../parser/yna/parsed/Cebuano",
        "lang2_dir": "../parser/yna/parsed/Spanish",
        "out_txt": "cebuano_spanish_verse.txt",
    },
    {   # Cebuano-Tausug
        "lang1_dir": "../parser/yna/parsed/Cebuano",
        "lang2_dir": "../parser/yna/parsed/Tausug",
        "out_txt": "cebuano_tausug_verse.txt",
    },
    {   # Chavacano-Spanish
        "lang1_dir": "../parser/yna/parsed/Chavacano",
        "lang2_dir": "../parser/yna/parsed/Spanish",
        "out_txt": "chavacano_spanish_verse.txt",
    },
    {   # Ivatan-Yami
        "lang1_dir": "../parser/cj/parsed/Ivatan",
        "lang2_dir": "../parser/cj/parsed/Yami",
        "out_txt": "ivatan_yami_verse.txt",
    },
    {   # Pangasinense-Ilokano
        "lang1_dir": "../parser/cj/parsed/Pangasinense",
        "lang2_dir": "../parser/trish/parsed/Ilokano",
        "out_txt": "pangasinense_ilokano_verse.txt",
    },
]

In [20]:
# Sanity check: show each configured directory as an absolute path and whether it exists.
for pair in PAIRS:
    for side in ("lang1_dir", "lang2_dir"):
        folder = Path(pair[side])
        print(folder.resolve(), folder.is_dir())


/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/cj/parsed/Tagalog True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/trish/parsed/Kapampangan True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/cj/parsed/Tagalog True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/trish/parsed/Bikolano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Cebuano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Spanish True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Cebuano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Tausug True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Chavacano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Spanish True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/cj/parsed/Ivatan True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/cj/parsed/

In [21]:
# helpers

# Placeholder written when a verse is absent in one language (the token includes the double quotes).
DEFAULT_MISSING = '"N/A"'
# Separator placed between verse texts when several verses are joined into one unit.
MERGE_JOIN_SEP   = " "
# Separator between atomic USFM IDs inside a merged key, e.g. "1CO.5.12+1CO.5.13".
KEY_JOIN_SEP     = "+"

# One input line: a key (one or more "BOOK.chapter.verse" IDs joined by "+"), whitespace, then text.
#   group(1) = the full key, group(2) = its first ID, group(3) = the text (ends on a non-space character).
LINE_RE = re.compile(
    r'^\s*(([0-9A-Z]+\.\d+\.\d+)(?:\+[0-9A-Z]+\.\d+\.\d+)*)\s+(.*\S)\s*$'
)

def _to_path(p):
    """Coerce *p* to a ``Path``; ``None`` and existing ``Path`` objects are returned as they are."""
    if p is None:
        return None
    if isinstance(p, Path):
        return p
    return Path(p)

def is_merged_key(usfm: str) -> bool:
    """Tell whether *usfm* contains the key separator, i.e. names more than one verse."""
    has_separator = KEY_JOIN_SEP in usfm
    return has_separator

def split_merged(usfm: str) -> list[str]:
    """Break a key on the key separator into its atomic IDs (a one-element list if it is not merged)."""
    atoms = usfm.split(KEY_JOIN_SEP)
    return atoms

def usfm_sort_key(usfm: str):
    """
    Build a sort key ``(book, chapter, verse)`` from a dotted ID such as ``"1CO.5.12"``.

    The chapter and verse fields become ints; a field that is absent or not
    all digits counts as 0. The book is compared as a plain string.
    """
    fields = usfm.split(".")

    def _number_at(pos: int) -> int:
        if pos < len(fields) and fields[pos].isdigit():
            return int(fields[pos])
        return 0

    return (fields[0] if fields else "", _number_at(1), _number_at(2))

def normalize_group_key(atom_ids: list[str]) -> str:
    """Return the canonical merged key: the atomic IDs sorted with ``usfm_sort_key`` and joined by the key separator."""
    ordered = list(atom_ids)
    ordered.sort(key=usfm_sort_key)
    return KEY_JOIN_SEP.join(ordered)

def split_usfm(usfm: str):
    """
    Split a dotted ID into ``(book, chapter, verse)``.

    Chapter and verse are ints; a field that is absent or not all digits
    becomes 0. Fields after the third are ignored.
    """
    pieces = usfm.split(".")
    book = pieces[0] if pieces else ""

    numbers = [int(raw) if raw.isdigit() else 0 for raw in pieces[1:3]]
    # Pad so that a missing chapter and/or verse reads as 0.
    while len(numbers) < 2:
        numbers.append(0)

    chap, verse = numbers
    return book, chap, verse

def _last_dir_name(p: Path) -> str:
    """Return the final component of *p*, or the parent's name when that component is empty."""
    own_name = p.name
    if own_name:
        return own_name
    return p.parent.name


def parse_txt_file(fp: Path) -> Dict[str, str]:
    """
    Read one .txt file into ``{key: text}``.

    Only lines matching ``LINE_RE`` are kept; the key may be a single ID or
    several IDs joined by '+'. If a key occurs more than once, the last
    occurrence supplies the text.
    """
    with fp.open("r", encoding="utf-8-sig", errors="replace") as handle:
        matches = (LINE_RE.match(line) for line in handle)
        return {m.group(1): m.group(3).strip() for m in matches if m}

def load_folder(folder: Path) -> Dict[str, str]:
    """
    Parse every ``*.txt`` directly inside *folder* into one ``{key: text}`` map.

    Files are visited in sorted path order; on a repeated key the file
    visited later wins.
    """
    merged: Dict[str, str] = {}
    for txt_path in sorted(folder.glob("*.txt")):
        for key, text in parse_txt_file(txt_path).items():
            merged[key] = text
    return merged


def collect_groups(lang_map: Dict[str, str]) -> list[set[str]]:
    """Return one set of atomic IDs for each merged key of *lang_map*, in key order."""
    return [set(split_merged(key)) for key in lang_map if is_merged_key(key)]

def union_overlapping_groups(group_sets: list[set[str]]) -> list[list[str]]:
    """
    Merge every set of atomic IDs that shares at least one ID with another set,
    repeating until a full pass performs no merge.

    Args:
        group_sets: sets of atomic IDs; they are copied, not mutated.

    Returns:
        One list per resulting group, each sorted with ``usfm_sort_key``.
    """
    # Work on copies so the caller's sets are left untouched.
    groups = [set(g) for g in group_sets]
    changed = True
    # Run whole passes until one completes without merging anything.
    while changed:
        changed = False
        out = []
        while groups:
            # Take the last pending set as the seed of a new group.
            g = groups.pop()
            merged = True
            # Rescan after every merge: a grown seed may now overlap sets it skipped before.
            while merged:
                merged = False
                # Walk backwards so popping index i does not shift the indexes still to visit.
                for i in range(len(groups)-1, -1, -1):
                    if g & groups[i]:
                        g |= groups[i]
                        groups.pop(i)
                        merged = True
                        changed = True
            out.append(g)
        groups = out
    return [sorted(list(g), key=usfm_sort_key) for g in groups]

def all_final_groups(lang1: Dict[str,str], lang2: Dict[str,str]) -> list[list[str]]:
    """
    Compute the alignment units (lists of atomic IDs) for a language pair.

    The result is the merged groups of both languages with overlapping groups
    unioned, followed by a one-element group for every atomic ID that no
    merged group covers (those singletons ordered with ``usfm_sort_key``).
    """
    merged_groups = union_overlapping_groups(collect_groups(lang1) + collect_groups(lang2))
    covered = {atom for grp in merged_groups for atom in grp}

    # Every atomic ID mentioned by either language, whether alone or inside a merged key.
    every_atom = set()
    for lang_map in (lang1, lang2):
        for key in lang_map:
            every_atom.update(split_merged(key) if is_merged_key(key) else [key])

    uncovered = sorted(every_atom - covered, key=usfm_sort_key)
    return merged_groups + [[atom] for atom in uncovered]

def map_atoms_signature(keys_map: Dict[str,str]) -> Dict[frozenset, str]:
    """
    Index the merged keys of *keys_map* by their set of atomic IDs.

    Returns ``{frozenset(atom_ids): merged_key_string}`` so a group can be
    matched to an existing merged key whatever the order of its IDs. If two
    merged keys have the same atom set, the later key is kept.
    """
    return {
        frozenset(split_merged(key)): key
        for key in keys_map
        if is_merged_key(key)
    }

def assemble_text_for_group(lang_map: Dict[str,str],
                            group_atoms: list[str],
                            missing: str) -> str:
    """
    Build one language's text for an alignment group.

    Resolution order:
      1. If the language has a merged key with exactly the group's atoms
         (in any order), return that key's text.
      2. Otherwise walk the atoms in the given order and join the pieces with
         ``MERGE_JOIN_SEP``. For each atom:
           - use its own text if the language has the atom as a key;
           - else, if a merged key of this language lying inside the group
             contains the atom, emit that merged key's text once, at the
             first such atom;
           - else emit `missing`.

    The middle case matters when this language merges only part of the group
    (e.g. it has 12+13 while the other language has 12+13+14): without it the
    text stored under the smaller merged key would be replaced by `missing`.

    Args:
        lang_map: ``{key: text}`` for one language.
        group_atoms: atomic IDs of the group, in output order.
        missing: placeholder for atoms with no text in this language.

    Returns:
        The assembled text for the group.
    """
    sig_to_key = map_atoms_signature(lang_map)
    sig = frozenset(group_atoms)
    if sig in sig_to_key:
        return lang_map[sig_to_key[sig]]

    # atom -> merged key of this language that covers it and lies inside the group
    covering_key: Dict[str, str] = {}
    for atoms, merged_key in sig_to_key.items():
        if atoms <= sig:
            for atom in atoms:
                covering_key.setdefault(atom, merged_key)

    parts = []
    emitted_keys = set()
    for atom in group_atoms:
        if atom in lang_map:
            parts.append(lang_map[atom])
        elif atom in covering_key:
            merged_key = covering_key[atom]
            # A merged key's text stands for all of its atoms: output it only once.
            if merged_key not in emitted_keys:
                emitted_keys.add(merged_key)
                parts.append(lang_map[merged_key])
        else:
            parts.append(missing)
    return MERGE_JOIN_SEP.join(parts)

def group_label(group_atoms: list[str]) -> tuple[str, str, str, str]:
    """
    Build the output columns ``(usfm_key, book, chapter, verse_label)`` for a group.

    - usfm_key: the atoms sorted with ``usfm_sort_key`` and joined by the key
      separator ('a+b+...').
    - book, chapter: taken from the first atom in sorted order (chapter as a string).
    - verse_label: the verse numbers joined by the key separator (e.g. '12+13')
      when every atom shares that book and chapter; otherwise 'mixed'.

    Raises:
        IndexError: if `group_atoms` is empty.
    """
    atoms_sorted = sorted(group_atoms, key=usfm_sort_key)
    key = KEY_JOIN_SEP.join(atoms_sorted)

    # Parse each atom once instead of re-splitting it for every comparison.
    parsed = [split_usfm(atom) for atom in atoms_sorted]
    book, chap, _ = parsed[0]

    if all(b == book and c == chap for b, c, _v in parsed):
        verse_label = KEY_JOIN_SEP.join(str(v) for _b, _c, v in parsed)
    else:
        verse_label = "mixed"

    return key, book, str(chap), verse_label


def process_pair(
    lang1_dir: Path,
    lang2_dir: Path,
    out_txt: Optional[Path] = None,
    missing: str = DEFAULT_MISSING,
) -> dict:
    """
    Build the verse-aligned corpus for one language pair and write it to disk.

    Two files are written:
      - TXT: two stacked lines per group, "<KEY> <lang1_text>" then "<KEY> <lang2_text>"
      - CSV (the TXT path with suffix ".csv") with header
        usfm, book, verse, chapter, language 1, language2

    Args:
        lang1_dir: directory with the first language's .txt files.
        lang2_dir: directory with the second language's .txt files.
        out_txt: TXT output path; when None it is derived from the two
            directory names as "<name1>_<name2>_verse.txt".
        missing: placeholder used where a language has no text for a verse.

    Returns:
        A summary dict (output paths, line/row/group counts, number of atoms
        absent from each language, and the separators/placeholder used).

    Raises:
        NotADirectoryError: if either input path is not a directory.
    """
    for folder in (lang1_dir, lang2_dir):
        # Explicit check rather than `assert`, which is skipped under `python -O`.
        if not folder.is_dir():
            raise NotADirectoryError(f"Not a directory: {folder}")

    if out_txt is None:
        name1 = _last_dir_name(lang1_dir)
        name2 = _last_dir_name(lang2_dir)
        out_txt = Path(f"{name1}_{name2}_verse.txt")
    out_csv = out_txt.with_suffix(".csv")

    lang1 = load_folder(lang1_dir)
    lang2 = load_folder(lang2_dir)

    groups = all_final_groups(lang1, lang2)
    # Order groups by their earliest atom.
    groups.sort(key=lambda g: usfm_sort_key(sorted(g, key=usfm_sort_key)[0]))

    # Assemble each group's label and texts once; both output files reuse them.
    records = []
    for atoms in groups:
        key, book, chap, verse_label = group_label(atoms)
        t1 = assemble_text_for_group(lang1, atoms, missing)
        t2 = assemble_text_for_group(lang2, atoms, missing)
        records.append((key, book, chap, verse_label, t1, t2))

    lines_out = []
    for key, _book, _chap, _label, t1, t2 in records:
        lines_out.append(f"{key} {t1}")
        lines_out.append(f"{key} {t2}")
    out_txt.write_text("\n".join(lines_out) + "\n", encoding="utf-8")

    with out_csv.open("w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(["usfm", "book", "verse", "chapter", "language 1", "language2"])
        for key, book, chap, verse_label, t1, t2 in records:
            w.writerow([key, book, verse_label, chap, t1, t2])

    # Atoms each language mentions, alone or inside a merged key.
    all_atoms_lang1 = {
        a for k in lang1 for a in (split_merged(k) if is_merged_key(k) else [k])
    }
    all_atoms_lang2 = {
        a for k in lang2 for a in (split_merged(k) if is_merged_key(k) else [k])
    }

    summary = {
        "out_txt": str(out_txt),
        "out_csv": str(out_csv),
        "txt_lines_written": len(lines_out),
        "csv_rows_written": len(groups) + 1,  # +1 for header
        "groups_total": len(groups),
        "missing_in_lang1_atoms": sum(1 for g in groups for a in g if a not in all_atoms_lang1),
        "missing_in_lang2_atoms": sum(1 for g in groups for a in g if a not in all_atoms_lang2),
        "missing_token": missing,
        "merge_join_sep": MERGE_JOIN_SEP,
        "key_join_sep": KEY_JOIN_SEP,
    }
    print(
        f"Processed {out_txt.name} & {out_csv.name} | groups: {summary['groups_total']} | "
        f"missing_atoms(lang1): {summary['missing_in_lang1_atoms']} | missing_atoms(lang2): {summary['missing_in_lang2_atoms']}"
    )
    return summary

# main
# Run the verse pipeline for every configured pair and collect the summaries.
all_summaries = []
for cfg in PAIRS:
    lang1_dir, lang2_dir, out_txt = (
        _to_path(cfg.get(field)) for field in ("lang1_dir", "lang2_dir", "out_txt")
    )
    missing = cfg.get("missing", DEFAULT_MISSING)

    # Both input directories are mandatory; out_txt may be omitted.
    if None in (lang1_dir, lang2_dir):
        raise ValueError("Each pair must include 'lang1_dir' and 'lang2_dir'.")

    summary = process_pair(lang1_dir=lang1_dir, lang2_dir=lang2_dir,
                           out_txt=out_txt, missing=missing)
    all_summaries.append(summary)

print("\nDone processing all pairs.")
for s in all_summaries:
    print(f"- Wrote TXT: {s['out_txt']} | CSV: {s['out_csv']}")

KeyboardInterrupt: 

## By sentence

This code builds parallel corpora for multiple language pairs by aligning sentence-level files book by book.

- Natural-sorted, deterministic traversal of files.
- Book detection from the `book` column of each CSV row (upper-cased), not from the filename.
- Verse-aware grouping with merged verse ID normalization (e.g., "1CO.5.12+1CO.5.13" -> "1CO.5.12-13").
- Monotone dynamic program (DP) for sentence alignment within each verse span:
  supports 1-1, 1-2, 2-1, and 2-2 merges.
- TXT output remains human-readable; CSV keeps structured IDs and text.



In [31]:
# CONFIGURE: Language pairs
# Each entry names two input directories, searched recursively for .csv files.
# NOTE: in this section "out_txt" is an output base name without extension;
# ".csv" and ".txt" are appended when the outputs are written.
PAIRS = [
    {   # Tagalog-Kapampangan
        "lang1_dir": "../parser/cj/parsed/Tagalog",
        "lang2_dir": "../parser/trish/parsed/Kapampangan",
        "out_txt": "tagalog_kapampangan_sentence",
    },
    {   # Tagalog-Bikolano
        "lang1_dir": "../parser/cj/parsed/Tagalog",
        "lang2_dir": "../parser/trish/parsed/Bikolano",
        "out_txt": "tagalog_bikolano_sentence",
    },
    {   # Cebuano-Spanish
        "lang1_dir": "../parser/yna/parsed/Cebuano",
        "lang2_dir": "../parser/yna/parsed/Spanish",
        "out_txt": "cebuano_spanish_sentence",
    },
    {   # Cebuano-Tausug
        "lang1_dir": "../parser/yna/parsed/Cebuano",
        "lang2_dir": "../parser/yna/parsed/Tausug",
        "out_txt": "cebuano_tausug_sentence",
    },
    {   # Chavacano-Spanish
        "lang1_dir": "../parser/yna/parsed/Chavacano",
        "lang2_dir": "../parser/yna/parsed/Spanish",
        "out_txt": "chavacano_spanish_sentence",
    },
    {   # Ivatan-Yami
        "lang1_dir": "../parser/cj/parsed/Ivatan",
        "lang2_dir": "../parser/cj/parsed/Yami",
        "out_txt": "ivatan_yami_sentence",
    },
    {   # Pangasinense-Ilokano
        "lang1_dir": "../parser/cj/parsed/Pangasinense",
        "lang2_dir": "../parser/trish/parsed/Ilokano",
        "out_txt": "pangasinense_ilokano_sentence",
    },
]

# Passed as out_dir; None means the base name is used as given, with no directory prefixed.
OUT_DIR = None
# True names the text columns "Language 1"/"Language 2" instead of using the directory names.
USE_FIXED_HEADERS = False
# Passed as keep_empty; True keeps rows whose text is blank or missing (stored as an empty string).
KEEP_EMPTY_LINES = True

In [32]:
# Sanity check: show each configured directory as an absolute path and whether it exists.
for pair in PAIRS:
    for side in ("lang1_dir", "lang2_dir"):
        folder = Path(pair[side])
        print(folder.resolve(), folder.is_dir())


/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/cj/parsed/Tagalog True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/trish/parsed/Kapampangan True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/cj/parsed/Tagalog True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/trish/parsed/Bikolano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Cebuano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Spanish True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Cebuano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Tausug True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Chavacano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Spanish True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/cj/parsed/Ivatan True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/cj/parsed/

In [None]:
# helpers


def _natural_key(s: str):
    """
    Return a key for natural sorting, so that "b2" sorts before "b10".

    The string is split into alternating text and digit runs; digit runs
    become ints and text runs are lower-cased.

    Digit runs are recognized by their position in the split result rather
    than with ``str.isdigit()``: ``isdigit()`` is also true for characters
    such as "²" that ``int()`` rejects, which made the key raise ValueError.
    """
    tokens = re.split(r'(\d+)', s)
    # With one capturing group, re.split yields text, digits, text, ...: odd slots are the digit runs.
    return [int(tok) if idx % 2 else tok.lower() for idx, tok in enumerate(tokens)]

def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* renamed so each column name has BOM characters removed, is stripped and lower-cased."""
    renamed = {
        col: col.replace("\ufeff", "").strip().lower()
        for col in df.columns
    }
    return df.rename(columns=renamed)

def _is_missing_str(s) -> bool:
    """Tell whether *s* is None, or its stripped string form is empty or one of nan/none/null (any case)."""
    if s is None:
        return True
    cleaned = str(s).strip()
    if not cleaned:
        return True
    return cleaned.lower() in ("nan", "none", "null")

def list_csvs(root_dir):
    """Return the paths of all .csv files under *root_dir* (searched recursively), in natural sort order."""
    candidates = glob.glob(os.path.join(root_dir, "**", "*.csv"), recursive=True)
    return sorted(filter(os.path.isfile, candidates), key=_natural_key)

def language_name_from_dir(dir_path):
    """Return the last component of the normalized *dir_path*, or "Language" when that component is empty."""
    leaf = os.path.basename(os.path.normpath(dir_path))
    if leaf:
        return leaf
    return "Language"

def ensure_ext(base, ext):
    """Return *base* with its extension (as split by ``os.path.splitext``) replaced by *ext*."""
    stem = os.path.splitext(base)[0]
    return stem + ext

def write_outputs(df, out_base, l1_name, l2_name):
    """
    Write *df* as "<out_base>.csv" and as a readable "<out_base>.txt".

    The TXT holds one five-line block per row (book/verse header, then an ID
    line and a text line for each language) followed by a blank line. NaN
    text is written as an empty line. The output directory is created if
    needed.

    Returns:
        ``(out_csv, out_txt)`` paths.
    """
    out_csv = ensure_ext(out_base, ".csv")
    out_txt = ensure_ext(out_base, ".txt")
    target_dir = os.path.dirname(out_csv) or "."
    os.makedirs(target_dir, exist_ok=True)

    df.to_csv(out_csv, index=False, encoding="utf-8")

    id1_col, text1_col = f"{l1_name} ID", f"{l1_name}"
    id2_col, text2_col = f"{l2_name} ID", f"{l2_name}"

    with open(out_txt, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            text1 = "" if pd.isna(row[text1_col]) else str(row[text1_col])
            text2 = "" if pd.isna(row[text2_col]) else str(row[text2_col])
            f.write(
                f"[{row['book']}] {row['verse']}\n"
                f"{l1_name} ({row[id1_col]})\n"
                f"{text1}\n"
                f"{l2_name} ({row[id2_col]})\n"
                f"{text2}\n\n"
            )

    return out_csv, out_txt

def read_records_usfm_schema(path, keep_empty=False):
    """
    Read one CSV of sentences/verses into a list of row dicts.

    Column names are matched case-insensitively. Required columns:
    usfm, book, chapter, verse, plus text (or sentence). Optional columns:
    iso6393, vid.

    Args:
        path: CSV file to read.
        keep_empty: when False, rows whose text is blank or missing are
            dropped; when True they are kept with an empty-string text.

    Returns:
        One dict per row with keys
        'usfm', 'book', 'chapter', 'verse', 'text', 'iso6393', 'vid'.
        A blank or missing vid is replaced by "<file stem>#<row index + 1>".

    Raises:
        RuntimeError: if the file cannot be read with any attempted encoding.
        ValueError: if a required column is absent.
    """
    last_err = None
    df = None
    # "utf-8-sig" reads UTF-8 with or without a BOM, so it goes first;
    # "latin-1" is the fallback for files that are not valid UTF-8.
    for enc in ("utf-8-sig", "latin-1"):
        try:
            df = pd.read_csv(path, dtype=str, encoding=enc, on_bad_lines="skip")
            break
        except Exception as e:  # deliberately broad: any failure moves on to the next encoding
            last_err = e
    if df is None:
        raise RuntimeError(f"Failed to read CSV: {path}\n{last_err}") from last_err

    df = _normalize_columns(df)

    required = {"usfm", "book", "chapter", "verse"}
    has_text = "text" in df.columns or "sentence" in df.columns
    missing = required - set(df.columns)
    if missing or not has_text:
        raise ValueError(f"CSV {path} missing columns. Needed {required} + text/sentence. Got: {list(df.columns)}")

    text_col = "text" if "text" in df.columns else "sentence"

    # Flatten carriage returns and trim; NaN becomes the string "nan", which is treated as missing below.
    tser = df[text_col].astype(str).map(lambda x: x.replace("\r", " ").strip())
    if not keep_empty:
        mask = ~tser.map(_is_missing_str)
        df = df[mask].copy()
        tser = tser[mask]

    df[text_col] = np.where(tser.map(_is_missing_str), "", tser)

    for opt in ["iso6393", "vid"]:
        if opt not in df.columns:
            df[opt] = ""

    rows = []
    stem = os.path.splitext(os.path.basename(path))[0]
    for i, r in df.iterrows():
        # An empty cell is read as NaN; str(NaN) is "nan", so test with
        # _is_missing_str instead of truthiness to reach the fallback ID.
        raw_vid = r["vid"]
        vid = f"{stem}#{i+1}" if _is_missing_str(raw_vid) else str(raw_vid).strip()
        raw_iso = r["iso6393"]
        iso = "" if _is_missing_str(raw_iso) else str(raw_iso).strip()
        rows.append({
            "usfm": str(r["usfm"]).strip(),
            "book": str(r["book"]).strip(),
            "chapter": str(r["chapter"]).strip(),
            "verse": str(r["verse"]).strip(),
            "text": str(r[text_col]),
            "iso6393": iso,
            "vid": vid,
        })
    return rows

# verse span handling


MERGE_SEP = "+"

def _normalize_verse_span(usfm_id: str) -> str:
    """
    Turn 'BOOK.C.V1+BOOK.C.V2+...' into 'BOOK.C.Vmin-Vmax'.

    - An ID without the merge separator passes through unchanged.
    - Empty pieces (e.g. from a trailing separator) are ignored; if only one
      piece remains it is returned on its own, and if none remains the input
      is returned unchanged.
    - Book and chapter are taken from the first piece.
    - A verse field that is not a plain integer contributes the digits it
      contains (e.g. '12a' -> 12), or 0 if it has none.
    """
    if MERGE_SEP not in usfm_id:
        return usfm_id
    parts = [p.strip() for p in usfm_id.split(MERGE_SEP) if p.strip()]
    if not parts:
        # Nothing but separators/whitespace: there is no span to build.
        return usfm_id
    if len(parts) == 1:
        # A stray separator around a single ID is not a range.
        return parts[0]

    book_ch = ".".join(parts[0].split(".")[:2])
    verses = []
    for p in parts:
        last = p.split(".")[-1]
        try:
            verses.append(int(last))
        except ValueError:
            digits = re.sub(r"\D+", "", last)
            verses.append(int(digits) if digits.isdigit() else 0)
    vmin, vmax = min(verses), max(verses)
    return f"{book_ch}.{vmin}-{vmax}"


def read_dir_grouped_by_bookverse(root_dir, keep_empty=False):
    """
    Read every CSV under *root_dir* and bucket its rows by book and verse span.

    Returns:
        ``{book_code: {verse_span_id: [{'id': vid, 'text': text}, ...]}}``
        where the book code is upper-cased and the span is the row's usfm
        passed through ``_normalize_verse_span``.

    A file that fails to load is reported with a "[WARN]" line and skipped.
    """
    books = {}
    for csv_path in list_csvs(root_dir):
        try:
            records = read_records_usfm_schema(csv_path, keep_empty=keep_empty)
        except Exception as e:
            print(f"[WARN] Skipping {csv_path}: {e}")
            continue
        for rec in records:
            book = rec["book"].upper()
            span = _normalize_verse_span(rec["usfm"])
            per_book = books.setdefault(book, {})
            bucket = per_book.setdefault(span, [])
            bucket.append({"id": rec["vid"], "text": rec["text"]})
    return books

NUM_RE = re.compile(r'\d+')

def _len_sim(a: str, b: str) -> float:
    """Length similarity in (0, 1]: 1.0 for equal lengths, lower as they differ (empty counts as length 1)."""
    la, lb = max(1, len(a)), max(1, len(b))
    return 1.0 - abs(la - lb) / max(la, lb)

def _anchors(a: str, b: str) -> float:
    """
    Bonus for surface cues shared by both strings.

    +0.15 if they share at least one digit run, +0.1 if they share at least
    one capitalized ASCII word (pattern ``\\b[A-Z][A-Za-z]+\\b``).
    """
    bonus = 0.0
    nums_a = set(NUM_RE.findall(a))
    nums_b = set(NUM_RE.findall(b))
    if nums_a & nums_b:
        bonus += 0.15
    names_a = set(re.findall(r'\b[A-Z][A-Za-z]+\b', a))
    names_b = set(re.findall(r'\b[A-Z][A-Za-z]+\b', b))
    if names_a and names_b and (names_a & names_b):
        bonus += 0.1
    return bonus

def _score_block(sa: list[str], tb: list[str], merge_penalty: float) -> float:
    """
    Score pairing the sentences `sa` (joined by spaces) with the sentences `tb`.

    The score is length similarity plus anchor bonus, minus `merge_penalty`
    for each sentence beyond the first on either side.

    NOTE: an empty side contributes -1 to the merge count, so a one-sided
    block gets `merge_penalty` added rather than subtracted. The aligner only
    reports that value for unaligned sentences; it never optimizes on it.
    """
    a = " ".join(sa).strip()
    b = " ".join(tb).strip()
    base = _len_sim(a, b) + _anchors(a, b)
    merges = (len(sa) - 1) + (len(tb) - 1)
    return base - merge_penalty * merges

def align_within_verse(sent_a: list[str], sent_b: list[str], merge_penalty: float = 0.15):
    """
    Align two sentence lists monotonically (no crossings) with dynamic programming.

    Returns a list of tuples ``(a_start, a_end, b_start, b_end, score)`` with
    half-open ranges, covering every sentence of both lists in order. Blocks
    are 1-1, 1-2, 2-1 or 2-2; a sentence that cannot be paired is returned as
    a one-sided block (``a_start == a_end`` or ``b_start == b_end``).

    The best path uses the fewest one-sided blocks possible and, among those,
    has the highest total block score. Leaving a sentence unpaired is a move
    of the DP itself, so the end cell is always reachable: when one side has
    more than twice as many sentences as the other (e.g. 1 vs 3) only the
    surplus sentences are left unpaired, instead of the alignment collapsing.
    When every sentence can be paired, paths without one-sided blocks always
    win, so the choice depends on block scores alone.

    Args:
        sent_a: sentences of the first language.
        sent_b: sentences of the second language.
        merge_penalty: penalty per extra sentence merged into a block.
    """
    m, n = len(sent_a), len(sent_b)
    # best[i][j] = (-unpaired_count, total_score) of the best path consuming
    # sent_a[:i] and sent_b[:j]; tuples compare lexicographically, so fewer
    # unpaired sentences always beats a higher score. None = not reached yet.
    best = [[None] * (n + 1) for _ in range(m + 1)]
    back = [[None] * (n + 1) for _ in range(m + 1)]
    best[0][0] = (0, 0.0)

    # Pairing moves first, then the two "leave one sentence unpaired" moves.
    moves = ((1, 1), (1, 2), (2, 1), (2, 2), (1, 0), (0, 1))

    # Row-major order: every predecessor of (i, j) is final before (i, j) is expanded.
    for i in range(m + 1):
        for j in range(n + 1):
            cur = best[i][j]
            if cur is None:
                continue
            neg_unpaired, total = cur
            for di, dj in moves:
                ii, jj = i + di, j + dj
                if ii > m or jj > n:
                    continue
                if di and dj:
                    gain = _score_block(sent_a[i:ii], sent_b[j:jj], merge_penalty)
                    cand = (neg_unpaired, total + gain)
                else:
                    cand = (neg_unpaired - 1, total)
                # Strict '>' keeps the first-found path on ties.
                if best[ii][jj] is None or cand > best[ii][jj]:
                    best[ii][jj] = cand
                    back[ii][jj] = (i, j)

    # Walk the back-pointers from the end cell to the origin.
    i, j = m, n
    path = []
    while i > 0 or j > 0:
        pi, pj = back[i][j]
        path.append((pi, i, pj, j, _score_block(sent_a[pi:i], sent_b[pj:j], merge_penalty)))
        i, j = pi, pj
    path.reverse()
    return path

# merge: book → verse → aligned rows

def merge_pair_by_verse_dirs(lang1_dir, lang2_dir, out_base_name,
                             out_dir=None, use_fixed_headers=False, keep_empty=False):
    """
    Align two language directories book by book and span by span, then write the result.

    For each verse span found in either language:
      - both languages have sentences: they are aligned with ``align_within_verse``;
      - only one language has sentences: each becomes a row with "N/A" on the other side.

    Args:
        lang1_dir, lang2_dir: input directories holding the CSV files.
        out_base_name: output base name (extensions are added when writing).
        out_dir: optional directory joined in front of `out_base_name`.
        use_fixed_headers: name the languages "Language 1"/"Language 2"
            instead of using the directory names.
        keep_empty: forwarded to the CSV reader.

    Returns:
        ``(out_csv, out_txt, df)``: the two output paths and the aligned DataFrame.

    A short report (book counts, per-book stats, first rows) is printed.
    """
    lang1_dir = os.path.normpath(lang1_dir)
    lang2_dir = os.path.normpath(lang2_dir)

    if use_fixed_headers:
        l1_name, l2_name = "Language 1", "Language 2"
    else:
        l1_name = language_name_from_dir(lang1_dir)
        l2_name = language_name_from_dir(lang2_dir)

    id1_col, text1_col = f"{l1_name} ID", f"{l1_name}"
    id2_col, text2_col = f"{l2_name} ID", f"{l2_name}"

    def new_row(book, span, id_a, text_a, id_b, text_b):
        # One output record; key order mirrors the column order of the result.
        return {
            "book": book, "verse": span,
            id1_col: id_a, text1_col: text_a,
            id2_col: id_b, text2_col: text_b,
        }

    l1_books = read_dir_grouped_by_bookverse(lang1_dir, keep_empty=keep_empty)
    l2_books = read_dir_grouped_by_bookverse(lang2_dir, keep_empty=keep_empty)

    all_books = sorted(set(l1_books.keys()) | set(l2_books.keys()), key=_natural_key)

    rows, per_book_stats = [], []

    for bk in all_books:
        v1 = l1_books.get(bk, {})
        v2 = l2_books.get(bk, {})
        aligned_units = 0

        for span in sorted(set(v1.keys()) | set(v2.keys()), key=_natural_key):
            entries_a = v1.get(span, [])
            entries_b = v2.get(span, [])
            a_ids = [e["id"] for e in entries_a]
            b_ids = [e["id"] for e in entries_b]
            a = [e["text"] for e in entries_a]
            b = [e["text"] for e in entries_b]

            if a and b:
                for (i0, i1, j0, j1, _sc) in align_within_verse(a, b):
                    has_a, has_b = i0 < i1, j0 < j1
                    rows.append(new_row(
                        bk, span,
                        ",".join(a_ids[i0:i1]) if has_a else "N/A",
                        " ".join(a[i0:i1]) if has_a else "N/A",
                        ",".join(b_ids[j0:j1]) if has_b else "N/A",
                        " ".join(b[j0:j1]) if has_b else "N/A",
                    ))
                    aligned_units += 1
            elif a:
                # Span present only in language 1.
                for sent_id, sent_text in zip(a_ids, a):
                    rows.append(new_row(bk, span, sent_id, sent_text, "N/A", "N/A"))
                    aligned_units += 1
            elif b:
                # Span present only in language 2.
                for sent_id, sent_text in zip(b_ids, b):
                    rows.append(new_row(bk, span, "N/A", "N/A", sent_id, sent_text))
                    aligned_units += 1

        per_book_stats.append({
            "book": bk,
            f"{l1_name}_rows": sum(map(len, v1.values())),
            f"{l2_name}_rows": sum(map(len, v2.values())),
            "aligned_units": aligned_units,
        })

    df = pd.DataFrame(rows, columns=["book", "verse", id1_col, text1_col, id2_col, text2_col])

    if out_dir is None:
        out_base = out_base_name
    else:
        out_base = os.path.join(out_dir, out_base_name)
    out_csv, out_txt = write_outputs(df, out_base, l1_name, l2_name)

    books1, books2 = set(l1_books.keys()), set(l2_books.keys())
    common_books = sorted(books1 & books2, key=_natural_key)
    missing_l1 = sorted(books2 - books1, key=_natural_key)
    missing_l2 = sorted(books1 - books2, key=_natural_key)

    print(f"Books (total): {len(all_books)}; Common: {len(common_books)}")
    for only_in, names in ((l2_name, missing_l1), (l1_name, missing_l2)):
        if names:
            print(f"Books only in {only_in}: {names[:10]}{'...' if len(names)>10 else ''}")
    print(f"Total aligned units (rows): {len(df)}")
    print(f"CSV → {out_csv}")
    print(f"TXT → {out_txt}")

    print("\nPer-book stats (first 6):")
    for stats in per_book_stats[:6]:
        print(stats)

    # Preview of the first rows.
    preview_count = min(3, len(df))
    for i in range(preview_count):
        r = df.iloc[i]
        print(f"[{i+1}] [{r['book']} {r['verse']}] "
              f"{l1_name}({r[id1_col]}): {r[text1_col]}  ||  "
              f"{l2_name}({r[id2_col]}): {r[text2_col]}")

    return out_csv, out_txt, df

# Run the sentence pipeline for every configured pair and remember the row counts.
results = []
for pair in PAIRS:
    csv_path, txt_path, df_out = merge_pair_by_verse_dirs(
        pair["lang1_dir"],
        pair["lang2_dir"],
        pair["out_txt"],
        out_dir=OUT_DIR,
        use_fixed_headers=USE_FIXED_HEADERS,
        keep_empty=KEEP_EMPTY_LINES,
    )
    results.append((csv_path, txt_path, len(df_out)))

print("\nDone.\nSummary:")
for csv_path, txt_path, nrows in results:
    print(f"  {os.path.basename(csv_path)} | rows={nrows}")

Books (total): 77; Common: 77
Total aligned units (rows): 35515
CSV → tagalog_kapampangan_sentence.csv
TXT → tagalog_kapampangan_sentence.txt

Per-book stats (first 6):
{'book': '1CH', 'Tagalog_rows': 867, 'Kapampangan_rows': 942, 'aligned_units': 954}
{'book': '1CO', 'Tagalog_rows': 436, 'Kapampangan_rows': 436, 'aligned_units': 436}
{'book': '1JN', 'Tagalog_rows': 104, 'Kapampangan_rows': 105, 'aligned_units': 106}
{'book': '1KI', 'Tagalog_rows': 810, 'Kapampangan_rows': 815, 'aligned_units': 819}
{'book': '1MA', 'Tagalog_rows': 921, 'Kapampangan_rows': 924, 'aligned_units': 927}
{'book': '1PE', 'Tagalog_rows': 105, 'Kapampangan_rows': 105, 'aligned_units': 105}
[1] [1CH 1CH.1.1] Tagalog(144): Si Adan ang ama ni Set at si Set ang ama ni Enos.  ||  Kapampangan(1141): Anak neng Adan i Set, at anak neng Set i Enos.
[2] [1CH 1CH.1.2] Tagalog(144): Si Enos ang ama ni Kenan at si Kenan ang ama ni Mahalalel na ama ni Jared.  ||  Kapampangan(1141): Anak neng Enos i Kenan, at anak neng Kenan 