In [35]:
from pathlib import Path
import pandas as pd
import re

In [34]:
# Input folders that hold the parsed per-language CSV files (one folder per language).
# NOTE(review): these are absolute, machine-specific paths — presumably they must be
# adjusted before the notebook is run anywhere else; confirm.
Cebuano_dir = Path("/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Cebuano")
Tausug_dir  = Path("/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/parsed/Tausug")

# Destination of the merged CSV. The path is relative, so it resolves against the
# current working directory of the kernel.
out_csv   = Path("cebuano_tausug_verse.csv")

# Duplicate handling read by load_language_folder: "join" concatenates the distinct
# texts of a verse with " | "; any other value keeps only the first row per verse key.
DEDUP_STRATEGY = "first" 

def load_language_folder(folder: Path, lang_name: str, dedup_strategy=None) -> pd.DataFrame:
    """
    Load every ``*.csv`` directly under `folder` into one verse-level DataFrame.

    Each file must provide the columns ``usfm``, ``book``, ``chapter``, ``verse`` and
    ``text``; a file lacking any of them is skipped with a warning. All values are
    read as strings.

    Parameters
    ----------
    folder : Path
        Directory that is searched (non-recursively) for ``*.csv`` files.
    lang_name : str
        Name given to the verse text column in the result (e.g. 'Cebuano', 'Tausug').
    dedup_strategy : str or None, optional
        How rows sharing the same (usfm, book, chapter, verse) key are collapsed.
        ``"join"`` concatenates their distinct non-missing texts with ``" | "``;
        any other value keeps the first row. ``None`` (the default) falls back to
        the module-level ``DEDUP_STRATEGY``.

    Returns
    -------
    pd.DataFrame
        Columns ``usfm, book, chapter, verse, <lang_name>, chapter_num, verse_num``.
        The same columns are returned (with zero rows) when nothing could be loaded,
        so callers can always select the numeric helper columns.
    """
    key_cols = ["usfm","book","chapter","verse"]
    needed = key_cols + ["text"]

    def _finalize(frame: pd.DataFrame) -> pd.DataFrame:
        # Rename the text column and attach numeric chapter/verse columns for sorting;
        # values that are not numeric become NaN instead of raising.
        frame = frame.rename(columns={"text": lang_name})
        for col in ["chapter","verse"]:
            frame[col + "_num"] = pd.to_numeric(frame[col], errors="coerce")
        return frame

    csv_files = sorted(folder.glob("*.csv"))
    if not csv_files:
        print(f"[WARN] No CSV files found in {folder}")
        return _finalize(pd.DataFrame(columns=needed))

    frames = []
    for fp in csv_files:
        try:
            df = pd.read_csv(fp, dtype=str, encoding="utf-8")
        except UnicodeDecodeError:
            # Best-effort fallback: undecodable bytes are dropped, so say so
            # instead of losing characters silently.
            print(f"[WARN] {fp} is not valid UTF-8; undecodable bytes are ignored.")
            df = pd.read_csv(fp, dtype=str, encoding_errors="ignore")

        missing = [c for c in needed if c not in df.columns]
        if missing:
            print(f"[WARN] {fp} is missing columns {missing}; skipping.")
            continue

        frames.append(df[needed].copy())

    if not frames:
        return _finalize(pd.DataFrame(columns=needed))

    df_all = pd.concat(frames, ignore_index=True)

    # Only consult the module-level default when the caller did not choose a strategy.
    strategy = DEDUP_STRATEGY if dedup_strategy is None else dedup_strategy

    if strategy == "join":
        # dropna=False / sort=False keep rows with a missing key and the original
        # row order, matching what the keep-first branch does.
        agg = {"text": lambda s: " | ".join(s.dropna().astype(str).unique())}
        df_all = (
            df_all.groupby(key_cols, as_index=False, sort=False, dropna=False)
                  .agg(agg)
        )
    else:
        df_all = df_all.drop_duplicates(subset=key_cols, keep="first")

    return _finalize(df_all)

# Load both languages; each frame carries its verse text in a column named after the language.
df_bik = load_language_folder(Cebuano_dir, "Cebuano")
df_tag = load_language_folder(Tausug_dir,  "Tausug")

# Outer-join on the verse identity so that verses present in only one language survive.
on_keys = ["usfm","book","chapter","verse"]
left_part = df_bik[on_keys + ["Cebuano","chapter_num","verse_num"]]
right_part = df_tag[on_keys + ["Tausug","chapter_num","verse_num"]]
merged = left_part.merge(right_part, how="outer", on=on_keys, suffixes=("", "_y"))

# Fold the right-hand numeric helper columns into the left-hand ones
# (the left value wins; the right value fills the gaps).
for num_col in ("chapter_num", "verse_num"):
    right_values = merged.pop(num_col + "_y")
    merged[num_col] = merged[num_col].combine_first(right_values)

# Empty, whitespace-only and missing texts are all written out as "N/A".
for col in ["Cebuano", "Tausug"]:
    if col not in merged.columns:
        continue
    blanks_as_na = merged[col].replace(r"^\s*$", pd.NA, regex=True)
    merged[col] = blanks_as_na.fillna("N/A")

# Order by book, then numeric chapter and verse, then usfm; drop the helper columns.
sort_cols = ["book","chapter_num","verse_num","usfm"]
merged = merged.sort_values(by=sort_cols, kind="mergesort")
merged = merged[["usfm","book","chapter","verse","Cebuano","Tausug"]]

merged.to_csv(out_csv, index=False)
print(f"Wrote CSV {out_csv.resolve()}")

Wrote CSV /Users/armina/Documents/GitHub/bible-dot-com-scraper/parallel_corpora/cebuano_tausug_verse.csv


In [44]:
# Input folders with the parsed .txt verse files of the two languages to align.
# NOTE(review): absolute, machine-specific paths — presumably need adjusting on
# another machine; confirm.
lang1_dir = Path("/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/cj/parsed/Tagalog")  
lang2_dir = Path("/Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/trish/parsed/Kapampangan")      
# Output text file; the path is relative, so it resolves against the working directory.
out_txt   = Path("tagalog_kapampangan_verse.txt")
# "join" appends a differing duplicate text with " | "; any other value keeps the
# first text seen for a verse ID (read by parse_txt_file and load_folder).
DEDUP_STRATEGY = "first"  
# Placeholder written when a verse is absent from one language; the double quotes
# are part of the emitted text.
MISSING = '"N/A"'

# One verse per line: an ID of the form <digits/uppercase letters>.<digits>.<digits>,
# then whitespace, then the verse text captured without surrounding whitespace.
LINE_RE = re.compile(r'^\s*([0-9A-Z]+\.\d+\.\d+)\s+(.*\S)\s*$') 

def parse_txt_file(fp: Path):
    """
    Read one verse text file and return a dict mapping verse ID -> verse text.

    Lines that do not match LINE_RE are ignored. When a verse ID occurs more than
    once, the first text is kept; if DEDUP_STRATEGY is "join", a later text that is
    not already contained in the stored string is appended after " | ".
    """
    verses = {}
    # utf-8-sig strips a leading BOM; undecodable bytes are replaced rather than raising.
    with fp.open("r", encoding="utf-8-sig", errors="replace") as handle:
        for line in handle:
            hit = LINE_RE.match(line)
            if hit is None:
                continue
            verse_id = hit.group(1)
            body = hit.group(2).strip()
            if verse_id not in verses:
                verses[verse_id] = body
            elif DEDUP_STRATEGY == "join" and body not in verses[verse_id]:
                verses[verse_id] = verses[verse_id] + " | " + body
    return verses

def load_folder(folder: Path):
    """
    Parse every ``*.txt`` file in `folder` (in sorted path order) and merge the
    results into a single dict mapping verse ID -> verse text.

    The first text seen for an ID wins; when DEDUP_STRATEGY is "join", a text from
    a later file that is not already contained in the stored string is appended
    after " | ".
    """
    merged_verses = {}
    for txt_path in sorted(folder.glob("*.txt")):
        for verse_id, text in parse_txt_file(txt_path).items():
            if verse_id not in merged_verses:
                merged_verses[verse_id] = text
                continue
            # Duplicate ID: only the "join" strategy ever changes the stored text.
            if DEDUP_STRATEGY != "join":
                continue
            if text in merged_verses[verse_id]:
                continue
            merged_verses[verse_id] = merged_verses[verse_id] + " | " + text
    return merged_verses

def usfm_sort_key(usfm: str):
    """
    Build a sort key ``(book, chapter, verse)`` from an ID such as ``"GEN.1.10"``.

    The ID is split on "."; the first part is kept as a string, the second and
    third are converted to integers so that chapter/verse order is numeric
    ("GEN.1.2" sorts before "GEN.1.10"). A chapter or verse part that is missing
    or not a decimal number counts as 0. Parts beyond the third are ignored.

    Returns
    -------
    tuple[str, int, int]
    """
    def _as_int(token: str) -> int:
        # isdecimal(), not isdigit(): isdigit() also accepts characters such as
        # "²" that int() cannot parse, which would raise ValueError here.
        return int(token) if token.isdecimal() else 0

    parts = usfm.split(".")
    book = parts[0]
    chap = _as_int(parts[1]) if len(parts) > 1 else 0
    verse = _as_int(parts[2]) if len(parts) > 2 else 0
    return (book, chap, verse)

# main
# Load each language into a dict of verse ID -> text.
lang1 = load_folder(lang1_dir)
lang2 = load_folder(lang2_dir)

# Every verse ID found in either language, ordered by book, then numeric chapter and verse.
all_usfm = sorted(set(lang1) | set(lang2), key=usfm_sort_key)

# Two output lines per verse: first language, then second language;
# MISSING stands in for a verse that a language does not have.
lines_out = []
for u in all_usfm:
    t1 = lang1.get(u, MISSING)
    t2 = lang2.get(u, MISSING)
    lines_out.extend((f"{u} {t1}", f"{u} {t2}"))

payload = "\n".join(lines_out) + "\n"
out_txt.write_text(payload, encoding="utf-8")

# Summary: how many verse IDs lack a text in each language.
missing_first = sum(lang1.get(u) is None for u in all_usfm)
missing_second = sum(lang2.get(u) is None for u in all_usfm)

print(f"Done. Wrote {len(lines_out)} lines to {out_txt}")
print(f"Verses total: {len(all_usfm)} | missing in first: {missing_first} | "
      f"missing in second: {missing_second}")

Done. Wrote 70758 lines to tagalog_kapampangan_verse.txt
Verses total: 35379 | missing in first: 1 | missing in second: 27
