# Parallel Corpora

## Imports

In [37]:
from pathlib import Path
import re
from typing import Dict, Iterable, Union
import csv

import pandas as pd
import glob
import os

from itertools import zip_longest

## By verse
This script provides a reusable pipeline for building parallel corpora of Bible translations by verse across multiple language pairs. Each pair is configured with input directories containing verse-aligned .txt files for two languages and an optional output filename. 

The program parses these files into dictionaries keyed by USFM identifiers (e.g., 1CO.5.12), applies a configurable de-duplication strategy (first, last, or join with a separator), and merges all verses within a folder. For each language pair, the script aligns verses across both languages, fills in missing entries with a customizable placeholder token, and writes an interleaved output file where verses appear in matched pairs. Output files are automatically named based on the language directories if not explicitly specified. 

A summary is produced for each pair, including counts of total verses, lines written, and missing verses per language. This design supports flexible scaling: additional language pairs can be added to the PAIRS list with minimal configuration, while defaults for deduplication, missing values, and join behavior ensure consistency across runs.

In [28]:
# CONFIGURE: Language pairs
# Each entry describes one by-verse language pair:
#   "lang1_dir" / "lang2_dir" - folders holding the parsed, verse-aligned .txt files
#   "out_txt"                 - name of the interleaved TXT output (the CSV reuses
#                               this name with a .csv suffix)
# The driver cell also reads the optional per-pair keys "dedup_strategy",
# "missing" and "join_sep"; when absent, the DEFAULT_* values are used.
PAIRS = [
    {   # Tagalog-Kapampangan
        "lang1_dir": "../parser/cj/parsed/Tagalog",
        "lang2_dir": "../parser/trish/parsed/Kapampangan",
        "out_txt": "tagalog_kapampangan_verse.txt",
    },
    {   # Tagalog-Bikolano
        "lang1_dir": "../parser/cj/parsed/Tagalog",
        "lang2_dir": "../parser/trish/parsed/Bikolano",
        "out_txt": "tagalog_bikolano_verse.txt",
    },
    {   # Cebuano-Spanish
        "lang1_dir": "../parser/yna/parsed/Cebuano",
        "lang2_dir": "../parser/yna/parsed/Spanish",
        "out_txt": "cebuano_spanish_verse.txt",
    },
    {   # Cebuano-Tausug
        "lang1_dir": "../parser/yna/parsed/Cebuano",
        "lang2_dir": "../parser/yna/parsed/Tausug",
        "out_txt": "cebuano_tausug_verse.txt",
    },
    {   # Chavacano-Spanish
        "lang1_dir": "../parser/yna/parsed/Chavacano",
        "lang2_dir": "../parser/yna/parsed/Spanish",
        "out_txt": "chavacano_spanish_verse.txt",
    },
    {   # Ivatan-Yami
        "lang1_dir": "../parser/cj/parsed/Ivatan",
        "lang2_dir": "../parser/cj/parsed/Yami",
        "out_txt": "ivatan_yami_verse.txt",
    },
    {   # Pangasinense-Ilokano
        "lang1_dir": "../parser/cj/parsed/Pangasinense",
        "lang2_dir": "../parser/trish/parsed/Ilokano",
        "out_txt": "pangasinense_ilokano_verse.txt",
    },
]

In [None]:
# Sanity check: show each configured folder as an absolute path together with
# whether it currently exists as a directory.
for pair in PAIRS:
    for dir_key in ("lang1_dir", "lang2_dir"):
        folder = Path(pair[dir_key])
        print(folder.resolve(), folder.is_dir())


/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/cj/parsed/Tagalog True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/trish/parsed/Kapampangan True
---
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/cj/parsed/Tagalog True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/trish/parsed/Bikolano True
---
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Cebuano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Spanish True
---
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Cebuano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Tausug True
---
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Chavacano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Spanish True
---
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/cj/parsed/Ivatan True
/Users/armina/Documents/GitHub/bible-dot-com-scrap

In [30]:
# Defaults / helpers

# How repeated USFM ids are resolved. parse_txt_file / load_folder act on
# "join" and "last"; any other value (including "first") keeps the first text.
DEFAULT_DEDUP_STRATEGY = "first"  
# Placeholder written when a verse exists in only one language.
# Note: the token itself contains the double-quote characters.
DEFAULT_MISSING = '"N/A"'
# Separator placed between variant texts under the "join" strategy.
DEFAULT_JOIN_SEP = " | "

# One verse per line: group 1 is the id (uppercase letters/digits, then
# ".<digits>.<digits>", e.g. 1CO.5.12); group 2 is the verse text, which must
# end in a non-whitespace character. Surrounding whitespace is ignored.
LINE_RE = re.compile(r'^\s*([0-9A-Z]+\.\d+\.\d+)\s+(.*\S)\s*$')

def _to_path(p: Union[str, Path, None]) -> Union[Path, None]:
    return None if p is None else (p if isinstance(p, Path) else Path(p))

def parse_txt_file(fp: Path, dedup_strategy: str, join_sep: str) -> Dict[str, str]:
    """
    Parse a single .txt file into {usfm_id: text}, deduping within the file.

    The file is read as UTF-8 (a leading BOM is dropped, undecodable bytes are
    replaced). Lines that do not match LINE_RE are skipped.

    When an id occurs more than once, dedup_strategy decides the result:
      - "join": every distinct text is kept, in order of first appearance,
                concatenated with join_sep
      - "last": the most recent text wins
      - any other value (including "first"): the first text wins

    Ids appear in the returned dict in order of first appearance.
    """
    # Track the variant texts per id as a list so that "join" can compare whole
    # texts. A substring test on the already-joined string would wrongly drop a
    # variant that merely happens to be contained in an earlier, longer one.
    variants: Dict[str, list] = {}
    with fp.open("r", encoding="utf-8-sig", errors="replace") as f:
        for raw in f:
            m = LINE_RE.match(raw)
            if m is None:
                continue
            usfm_id, text = m.group(1), m.group(2).strip()
            seen = variants.get(usfm_id)
            if seen is None:
                variants[usfm_id] = [text]
            elif dedup_strategy == "join":
                if text not in seen:
                    seen.append(text)
            elif dedup_strategy == "last":
                variants[usfm_id] = [text]
            # otherwise: keep the first text that was seen
    return {usfm_id: join_sep.join(texts) for usfm_id, texts in variants.items()}

def load_folder(folder: Path, dedup_strategy: str, join_sep: str) -> Dict[str, str]:
    """
    Merge all *.txt files in a folder into a single map {usfm_id: text}.

    Files are visited in sorted path order and each one is parsed with
    parse_txt_file. When an id is already present from an earlier file:
      - "join": the new text is appended after join_sep unless it already
                occurs inside the stored text
      - "last": the new text replaces the stored one
      - any other strategy: the stored (first) text is kept
    """
    merged: Dict[str, str] = {}
    for txt_path in sorted(folder.glob("*.txt")):
        file_verses = parse_txt_file(txt_path, dedup_strategy, join_sep)
        for usfm_id, text in file_verses.items():
            if usfm_id not in merged:
                merged[usfm_id] = text
                continue
            if dedup_strategy == "join":
                if text not in merged[usfm_id]:
                    merged[usfm_id] += join_sep + text
            elif dedup_strategy == "last":
                merged[usfm_id] = text
    return merged

def usfm_sort_key(usfm: str):
    """
    Build the sort key (book_code, chapter:int, verse:int) for a USFM id.

    A chapter or verse field that is absent or not purely digits counts as 0.
    Note: books are ordered lexically by code, not in canonical order.
    """
    fields = usfm.split(".")

    def _number_at(index: int) -> int:
        # Missing or non-numeric fields fall back to 0.
        if index < len(fields) and fields[index].isdigit():
            return int(fields[index])
        return 0

    book = fields[0] if fields else ""
    return (book, _number_at(1), _number_at(2))

def split_usfm(usfm: str):
    """
    Break a USFM id such as '1CO.5.11' into (book, chapter:int, verse:int).

    Chapter and verse default to 0 when the field is missing or not all digits;
    anything after the third dot-separated field is ignored.
    """
    # Pad with empty fields so the unpacking works for short ids as well.
    book, chapter_raw, verse_raw = (usfm.split(".") + ["", ""])[:3]
    chapter = int(chapter_raw) if chapter_raw.isdigit() else 0
    verse = int(verse_raw) if verse_raw.isdigit() else 0
    return book, chapter, verse

def _last_dir_name(p: Path) -> str:
    return p.name or p.parent.name

def process_pair(
    lang1_dir: Path,
    lang2_dir: Path,
    out_txt: Union[Path, None],
    dedup_strategy: str = DEFAULT_DEDUP_STRATEGY,
    missing: str = DEFAULT_MISSING,
    join_sep: str = DEFAULT_JOIN_SEP,
) -> dict:
    """
    Process one language pair and write:
      - TXT with stacked lines per USFM:  "<USFM> <lang1_text>" then "<USFM> <lang2_text>"
      - CSV with columns: usfm, book, verse, chapter, language 1, language2

    Parameters:
      lang1_dir, lang2_dir: folders with the .txt files of each language;
          both must be existing directories.
      out_txt: path of the TXT output. When None, "<dir1>_<dir2>_verse.txt" is
          built from the two folder names. The CSV path is out_txt with its
          suffix replaced by ".csv".
      dedup_strategy, join_sep: forwarded to load_folder.
      missing: text written for a verse that one language does not have.

    The folder that will hold the outputs is created if it does not exist.
    Verses are the union of both languages, ordered by usfm_sort_key.

    Returns a small summary dict (output paths, line/row counts, number of
    verses missing per language, and the settings used).
    """
    assert lang1_dir.is_dir(), f"Not a directory: {lang1_dir}"
    assert lang2_dir.is_dir(), f"Not a directory: {lang2_dir}"

    if out_txt is None:
        name1 = _last_dir_name(lang1_dir)
        name2 = _last_dir_name(lang2_dir)
        out_txt = Path(f"{name1}_{name2}_verse.txt")
    out_csv = out_txt.with_suffix(".csv")

    # Make sure the destination exists, so an out_txt that points into a
    # not-yet-created folder does not fail at write time.
    out_txt.parent.mkdir(parents=True, exist_ok=True)

    # Load data
    lang1 = load_folder(lang1_dir, dedup_strategy, join_sep)
    lang2 = load_folder(lang2_dir, dedup_strategy, join_sep)

    all_usfm = sorted(set(lang1) | set(lang2), key=usfm_sort_key)

    # Single pass: resolve both texts once per verse and reuse them for the TXT
    # lines, the CSV rows and the missing-verse counters.
    lines_out = []
    csv_rows = []
    missing_in_lang1 = 0
    missing_in_lang2 = 0
    for u in all_usfm:
        in_lang1 = u in lang1
        in_lang2 = u in lang2
        t1 = lang1[u] if in_lang1 else missing
        t2 = lang2[u] if in_lang2 else missing
        if not in_lang1:
            missing_in_lang1 += 1
        if not in_lang2:
            missing_in_lang2 += 1

        lines_out.append(f"{u} {t1}")
        lines_out.append(f"{u} {t2}")

        book, chap, ver = split_usfm(u)
        # Column order follows the header below: verse comes before chapter.
        csv_rows.append([u, book, ver, chap, t1, t2])

    # Write TXT
    out_txt.write_text("\n".join(lines_out) + "\n", encoding="utf-8")

    # Write CSV
    with out_csv.open("w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(["usfm", "book", "verse", "chapter", "language 1", "language2"])
        w.writerows(csv_rows)

    summary = {
        "out_txt": str(out_txt),
        "out_csv": str(out_csv),
        "txt_lines_written": len(lines_out),
        "csv_rows_written": len(all_usfm) + 1,  # +1 for header
        "verses_total": len(all_usfm),
        "missing_in_lang1": missing_in_lang1,
        "missing_in_lang2": missing_in_lang2,
        "dedup_strategy": dedup_strategy,
        "join_sep": join_sep,
        "missing_token": missing,
    }
    print(
        f"Processed {out_txt.name} & {out_csv.name} | verses: {summary['verses_total']} | "
        f"missing(lang1): {summary['missing_in_lang1']} | missing(lang2): {summary['missing_in_lang2']} | "
        f"dedup={dedup_strategy}"
    )
    return summary

In [31]:
# main: build the by-verse outputs for every configured pair

all_summaries = []
for cfg in PAIRS:
    # Path-like settings are normalised to Path objects (None stays None).
    lang1_dir, lang2_dir, out_txt = (
        _to_path(cfg.get(key)) for key in ("lang1_dir", "lang2_dir", "out_txt")
    )

    # Per-pair overrides fall back to the module-level defaults.
    dedup_strategy = cfg.get("dedup_strategy", DEFAULT_DEDUP_STRATEGY)
    missing = cfg.get("missing", DEFAULT_MISSING)
    join_sep = cfg.get("join_sep", DEFAULT_JOIN_SEP)

    if None in (lang1_dir, lang2_dir):
        raise ValueError("Each pair must include 'lang1_dir' and 'lang2_dir'.")

    summary = process_pair(
        lang1_dir=lang1_dir,
        lang2_dir=lang2_dir,
        out_txt=out_txt,
        dedup_strategy=dedup_strategy,
        missing=missing,
        join_sep=join_sep,
    )
    all_summaries += [summary]

print("\nDone processing all pairs.")
for s in all_summaries:
    print("- Wrote TXT: {out_txt} | CSV: {out_csv}".format(**s))

Processed tagalog_kapampangan_verse.txt & tagalog_kapampangan_verse.csv | verses: 35379 | missing(lang1): 1 | missing(lang2): 27 | dedup=first
Processed tagalog_bikolano_verse.txt & tagalog_bikolano_verse.csv | verses: 35379 | missing(lang1): 1 | missing(lang2): 36 | dedup=first
Processed cebuano_spanish_verse.txt & cebuano_spanish_verse.csv | verses: 31105 | missing(lang1): 17 | missing(lang2): 1 | dedup=first
Processed cebuano_tausug_verse.txt & cebuano_tausug_verse.csv | verses: 31104 | missing(lang1): 16 | missing(lang2): 23145 | dedup=first
Processed chavacano_spanish_verse.txt & chavacano_spanish_verse.csv | verses: 31105 | missing(lang1): 23146 | missing(lang2): 1 | dedup=first
Processed ivatan_yami_verse.txt & ivatan_yami_verse.csv | verses: 9566 | missing(lang1): 1 | missing(lang2): 1607 | dedup=first
Processed pangasinense_ilokano_verse.txt & pangasinense_ilokano_verse.csv | verses: 35326 | missing(lang1): 5 | missing(lang2): 4243 | dedup=first

Done processing all pairs.
- W

## By sentence

This code builds parallel corpora for multiple language pairs by aligning sentence-level CSV files book by book. The PAIRS list defines which languages to pair, with input directories and output filenames. Helper functions handle reading CSVs, normalizing columns, detecting missing values, and natural sorting of files. 

The script matches books between two languages by first building a map of all available books in each directory, using the book code extracted from filenames (e.g., MAT, 1CO). It then takes the union of both sets of book codes so every book that appears in either language is included. If a book exists in one language but not the other, the missing side is padded with placeholder entries (N/A). This ensures that all books are represented, with sentences aligned where both languages exist and gaps clearly marked when they do not.

Each book’s sentences are extracted, aligned with padding (N/A if missing), and combined into a DataFrame. The results are written both as CSV (for structured analysis) and TXT (for human-readable format). Along the way, the script logs statistics on books, rows, and missing entries. In short, it automates the process of matching bilingual sentence data into clean, aligned outputs.



In [34]:
# CONFIGURE: Language pairs
# NOTE: this rebinds PAIRS, replacing the by-verse configuration defined earlier.
# Each entry describes one by-sentence language pair:
#   "lang1_dir" / "lang2_dir" - folders searched recursively for sentence CSVs
#   "out_txt"                 - output base name WITHOUT extension; it is passed
#                               as out_base_name and ".csv" / ".txt" are added
PAIRS = [
    {   # Tagalog-Kapampangan
        "lang1_dir": "../parser/by_sentence/cj/sentence/Tagalog",
        "lang2_dir": "../parser/by_sentence/trish/sentence/Kapampangan",
        "out_txt": "tagalog_kapampangan_sentence",
    },
    {   # Tagalog-Bikolano
        "lang1_dir": "../parser/by_sentence/cj/sentence/Tagalog",
        "lang2_dir": "../parser/by_sentence/trish/sentence/Bikolano",
        "out_txt": "tagalog_bikolano_sentence",
    },
    {   # Cebuano-Spanish
        "lang1_dir": "../parser/by_sentence/yna/sentence/Cebuano",
        "lang2_dir": "../parser/by_sentence/yna/sentence/Spanish",
        "out_txt": "cebuano_spanish_sentence",
    },
    {   # Cebuano-Tausug
        "lang1_dir": "../parser/by_sentence/yna/sentence/Cebuano",
        "lang2_dir": "../parser/by_sentence/yna/sentence/Tausug",
        "out_txt": "cebuano_tausug_sentence",
    },
    {   # Chavacano-Spanish
        "lang1_dir": "../parser/by_sentence/yna/sentence/Chavacano",
        "lang2_dir": "../parser/by_sentence/yna/sentence/Spanish",
        "out_txt": "chavacano_spanish_sentence",
    },
    {   # Ivatan-Yami
        "lang1_dir": "../parser/by_sentence/cj/sentence/Ivatan",
        "lang2_dir": "../parser/by_sentence/cj/sentence/Yami",
        "out_txt": "ivatan_yami_sentence",
    },
    {   # Pangasinense-Ilokano
        "lang1_dir": "../parser/by_sentence/cj/sentence/Pangasinense",
        "lang2_dir": "../parser/by_sentence/trish/sentence/Ilokano",
        "out_txt": "pangasinense_ilokano_sentence",
    },
]

# Folder for the outputs; None means the base name is used as given.
OUT_DIR = None
# True -> columns are named "Language 1" / "Language 2";
# False -> columns are named after the last component of each language folder.
USE_FIXED_HEADERS = False
# Passed as keep_empty: True keeps blank/missing sentences as "" instead of dropping them.
KEEP_EMPTY_LINES = True

In [35]:
# Sanity check: print the absolute location of every configured sentence
# folder and whether it exists as a directory.
for pair in PAIRS:
    first_dir = Path(pair["lang1_dir"])
    print(first_dir.resolve(), first_dir.is_dir())
    second_dir = Path(pair["lang2_dir"])
    print(second_dir.resolve(), second_dir.is_dir())


/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/cj/sentence/Tagalog True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/trish/sentence/Kapampangan True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/cj/sentence/Tagalog True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/trish/sentence/Bikolano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/yna/sentence/Cebuano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/yna/sentence/Spanish True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/yna/sentence/Cebuano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/yna/sentence/Tausug True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/yna/sentence/Chavacano True
/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/yna/sentence/Spanish True
/Users/armin

In [40]:
def _natural_key(s: str):
    # natural sort so 10 comes after 9
    return [int(t) if t.isdigit() else t.lower() for t in re.split(r'(\d+)', s)]


def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    new_cols = {}
    for c in df.columns:
        nc = c.replace("\ufeff", "").strip().lower()  # strip BOM + normalize
        new_cols[c] = nc
    return df.rename(columns=new_cols)

def _is_missing_str(s) -> bool:
    if s is None:
        return True
    t = str(s).strip()
    return t == "" or t.lower() in {"nan", "none", "null"}

def read_csv_sentences(path, keep_empty=False):
    """
    Load the 'sentence' column of a CSV file as a list of strings.

    The file is tried as utf-8, then utf-8-sig, then latin-1; malformed rows
    are skipped by pandas. Column names are normalised first, and a file with
    exactly one column is accepted whatever that column is called.

    Carriage returns inside a sentence become spaces and the text is stripped.
    Missing values are dropped, or kept as "" when keep_empty=True.

    Raises RuntimeError if the file cannot be read with any encoding and
    ValueError if no 'sentence' column can be identified.
    """
    failure = None
    for codec in ("utf-8", "utf-8-sig", "latin-1"):
        try:
            frame = pd.read_csv(path, dtype=str, encoding=codec, on_bad_lines="skip")
        except Exception as exc:
            failure = exc
        else:
            break
    else:
        # Every encoding failed; report the last error that was seen.
        raise RuntimeError(f"Failed to read CSV: {path}\n{failure}")

    frame = _normalize_columns(frame)

    if "sentence" not in frame.columns:
        if frame.shape[1] != 1:
            raise ValueError(f"'sentence' column not found in {path}. Columns: {list(frame.columns)}")
        # A lone column is taken to be the sentence column.
        frame = frame.rename(columns={frame.columns[0]: "sentence"})

    def _tidy(value):
        return value.replace("\r", " ").strip()

    cleaned = frame["sentence"].astype(str).map(_tidy).tolist()
    if keep_empty:
        return ["" if _is_missing_str(item) else item for item in cleaned]
    return [item for item in cleaned if not _is_missing_str(item)]


def list_csvs(root_dir):
    """
    Recursively list CSVs under root_dir, natural-sorted for determinism.

    root_dir is treated as a literal path: characters that are special to glob
    ('*', '?', '[', ']') are escaped, so a folder such as 'data[v2]' is searched
    as named instead of being interpreted as a wildcard pattern.
    Only regular files are returned.
    """
    # os.fspath lets Path objects through; glob.escape itself expects str/bytes.
    literal_root = glob.escape(os.fspath(root_dir))
    pattern = os.path.join(literal_root, "**", "*.csv")
    matches = glob.glob(pattern, recursive=True)
    return sorted((p for p in matches if os.path.isfile(p)), key=_natural_key)

def language_name_from_dir(dir_path):
    """Return the last component of the normalised path, or "Language" if it is empty."""
    tail = os.path.basename(os.path.normpath(dir_path))
    return tail if tail else "Language"

def ensure_ext(base, ext):
    """Return *base* with its current extension (if any) replaced by *ext*."""
    return os.path.splitext(base)[0] + ext


def write_outputs(df, out_base):
    """
    Write *df* as "<out_base>.csv" and as a readable "<out_base>.txt".

    The target folder is created when needed. The TXT file holds one block per
    row: a header with the book and the first language's column name and id,
    that language's sentence, then the same two lines for the second language,
    followed by a blank line. NaN sentences are written as empty lines.
    Columns are read by position: book, id 1, sentence 1, id 2, sentence 2.

    Returns (csv_path, txt_path).
    """
    csv_path = ensure_ext(out_base, ".csv")
    txt_path = ensure_ext(out_base, ".txt")
    target_dir = os.path.dirname(csv_path) or "."
    os.makedirs(target_dir, exist_ok=True)

    df.to_csv(csv_path, index=False, encoding="utf-8")

    headers = list(df.columns)
    with open(txt_path, "w", encoding="utf-8") as handle:
        for _, row in df.iterrows():
            book = str(row[headers[0]])
            first_id = row[headers[1]]
            first_text = row[headers[2]]
            second_id = row[headers[3]]
            second_text = row[headers[4]]

            shown_first = "" if pd.isna(first_text) else str(first_text)
            shown_second = "" if pd.isna(second_text) else str(second_text)

            handle.write(
                f"[{book}] {headers[2]} ({first_id})\n"
                f"{shown_first}\n"
                f"{headers[4]} ({second_id})\n"
                f"{shown_second}\n\n"
            )

    return csv_path, txt_path


# Pattern for a book-code token (optional leading 1-3, then 2-3 capitals).
# extract_book_code below does not use it; it works purely on underscores.
BOOK_RE = re.compile(r'(?<![A-Z0-9])([1-3]?[A-Z]{2,3})(?![A-Z])')

def extract_book_code(path):
    """
    Return the upper-cased token found between the first and second underscore
    of the file name (directory and extension are ignored).
    Examples:
      'MBB05_MAT_raw_lines.csv'   -> 'MAT'
      'VTSP_1CO_raw_lines.csv'    -> '1CO'
    Returns None when the file name contains fewer than two underscores.
    """
    file_stem, _ = os.path.splitext(os.path.basename(path))
    # Only the first two underscores matter, so split at most twice.
    pieces = file_stem.split('_', 2)
    if len(pieces) < 3:
        return None
    return pieces[1].upper()

def build_book_map(root_dir):
    """
    Group the CSV files found under root_dir by book code.

    Returns dict: {book_code: [csv_paths_for_that_book_sorted]}; files whose
    name yields no book code are left out.
    """
    grouped = {}
    for csv_path in list_csvs(root_dir):
        book = extract_book_code(csv_path)
        if not book:
            continue
        if book not in grouped:
            grouped[book] = []
        grouped[book].append(csv_path)

    for paths in grouped.values():
        paths.sort(key=_natural_key)
    return grouped

def read_book_entries(file_list, keep_empty=False):
    """
    Read every file of one book, in the given order, into a flat list of
    entries: [{'id': '<file_stem>#<1based>', 'sentence': '<text>'}, ...].
    The counter in the id restarts at 1 for each file.
    """
    collected = []
    for csv_path in file_list:
        file_stem, _ = os.path.splitext(os.path.basename(csv_path))
        sentences = read_csv_sentences(csv_path, keep_empty=keep_empty)
        collected.extend(
            {"id": f"{file_stem}#{position}", "sentence": text}
            for position, text in enumerate(sentences, start=1)
        )
    return collected

def merge_pair_by_book(lang1_dir, lang2_dir, out_base_name,
                       out_dir=None, use_fixed_headers=False, keep_empty=False):
    """
    Align the sentences of two languages book by book and write CSV + TXT.

    Books are taken from the union of both folders. Within a book the entries
    of the two languages are paired by position; the shorter side is padded
    with "N/A" for both id and sentence.

    Parameters:
      lang1_dir, lang2_dir: folders searched for the sentence CSVs.
      out_base_name: output base name; joined onto out_dir when that is given.
      use_fixed_headers: name the columns "Language 1"/"Language 2" instead of
          using the folder names.
      keep_empty: forwarded to read_book_entries.

    Prints a short report (book counts, row total, output paths, the first
    per-book stats and up to three sample rows).

    Returns (csv_path, txt_path, DataFrame).
    """
    lang1_dir = os.path.normpath(lang1_dir)
    lang2_dir = os.path.normpath(lang2_dir)

    if use_fixed_headers:
        l1_name, l2_name = "Language 1", "Language 2"
    else:
        l1_name = language_name_from_dir(lang1_dir)
        l2_name = language_name_from_dir(lang2_dir)
    id1_col = f"{l1_name} ID"
    id2_col = f"{l2_name} ID"

    l1_books = build_book_map(lang1_dir)
    l2_books = build_book_map(lang2_dir)
    codes1 = set(l1_books)
    codes2 = set(l2_books)
    all_books = sorted(codes1 | codes2, key=_natural_key)

    # Stand-in entry for whichever language runs out of sentences first.
    placeholder = {"id": "N/A", "sentence": "N/A"}

    rows = []
    per_book_stats = []
    for bk in all_books:
        files1 = l1_books.get(bk, [])
        files2 = l2_books.get(bk, [])
        entries1 = read_book_entries(files1, keep_empty=keep_empty)
        entries2 = read_book_entries(files2, keep_empty=keep_empty)

        for left, right in zip_longest(entries1, entries2, fillvalue=placeholder):
            rows.append({
                "book": bk,
                id1_col: left["id"],
                l1_name: left["sentence"],
                id2_col: right["id"],
                l2_name: right["sentence"],
            })

        per_book_stats.append({
            "book": bk,
            f"{l1_name}_files": len(files1),
            f"{l2_name}_files": len(files2),
            f"{l1_name}_rows": len(entries1),
            f"{l2_name}_rows": len(entries2),
            "aligned_pairs": max(len(entries1), len(entries2)),
        })

    df = pd.DataFrame(rows, columns=["book", id1_col, l1_name, id2_col, l2_name])

    if out_dir is None:
        out_base = out_base_name
    else:
        out_base = os.path.join(out_dir, out_base_name)
    out_csv, out_txt = write_outputs(df, out_base)

    common_books = sorted(codes1 & codes2, key=_natural_key)
    only_in_l2 = sorted(codes2 - codes1, key=_natural_key)
    only_in_l1 = sorted(codes1 - codes2, key=_natural_key)

    print(f"Books (total): {len(all_books)}; Common books: {len(common_books)}")
    if only_in_l2:
        more = '...' if len(only_in_l2) > 10 else ''
        print(f"Books only in {l2_name}: {only_in_l2[:10]}{more}")
    if only_in_l1:
        more = '...' if len(only_in_l1) > 10 else ''
        print(f"Books only in {l1_name}: {only_in_l1[:10]}{more}")
    print(f"Total rows (after padding with N/A): {len(df)}")
    print(f"CSV → {out_csv}")
    print(f"TXT → {out_txt}")

    print("\nPer-book stats (first 6):")
    for stats in per_book_stats[:6]:
        print(stats)

    # Show up to three sample rows.
    for number, (_, sample) in enumerate(df.head(3).iterrows(), start=1):
        print(f"[{number}] [{sample['book']}] {l1_name}({sample[id1_col]}): {sample[l1_name]}  ||  "
              f"{l2_name}({sample[id2_col]}): {sample[l2_name]}")

    return out_csv, out_txt, df

# Run the by-sentence merge for every configured pair and record
# (csv path, txt path, number of rows) for each one.
results = []
for pair in PAIRS:
    csv_path, txt_path, df_out = merge_pair_by_book(
        lang1_dir=pair["lang1_dir"],
        lang2_dir=pair["lang2_dir"],
        out_base_name=pair["out_txt"],
        out_dir=OUT_DIR,
        use_fixed_headers=USE_FIXED_HEADERS,
        keep_empty=KEEP_EMPTY_LINES,
    )
    results += [(csv_path, txt_path, len(df_out))]

print("\nDone.")

Books (total): 77; Common books: 77
Total rows (after padding with N/A): 53139
CSV → tagalog_kapampangan_sentence.csv
TXT → tagalog_kapampangan_sentence.txt

Per-book stats (first 6):
{'book': '1CH', 'Tagalog_files': 1, 'Kapampangan_files': 1, 'Tagalog_rows': 1130, 'Kapampangan_rows': 1257, 'aligned_pairs': 1257}
{'book': '1CO', 'Tagalog_files': 1, 'Kapampangan_files': 1, 'Tagalog_rows': 606, 'Kapampangan_rows': 609, 'aligned_pairs': 609}
{'book': '1JN', 'Tagalog_files': 1, 'Kapampangan_files': 1, 'Tagalog_rows': 156, 'Kapampangan_rows': 159, 'aligned_pairs': 159}
{'book': '1KI', 'Tagalog_files': 1, 'Kapampangan_files': 1, 'Tagalog_rows': 1361, 'Kapampangan_rows': 1263, 'aligned_pairs': 1361}
{'book': '1MA', 'Tagalog_files': 1, 'Kapampangan_files': 1, 'Tagalog_rows': 1322, 'Kapampangan_rows': 1208, 'aligned_pairs': 1322}
{'book': '1PE', 'Tagalog_files': 1, 'Kapampangan_files': 1, 'Tagalog_rows': 154, 'Kapampangan_rows': 156, 'aligned_pairs': 156}
[1] [1CH] Tagalog(MBB05_1CH_raw#1): Si 