In [4]:
# --- Setup (OS-based)
import os
import re

# --- Configure your folders
# Both roots are relative paths, so they are resolved against the current
# working directory of the notebook/kernel at run time.
# NOTE(review): the recorded run printed "Processed 0 files." while the output
# root resolved under ".../bible-dot-com-scraper/parser/"; IN_ROOT therefore
# presumably pointed at ".../parser/parser/yna/Cebuano" -- confirm the
# intended location of the input tree.
IN_ROOT  = "parser/yna/Cebuano"    # tree that is walked for *.txt input files
OUT_ROOT = "sentence/yna/Cebuano"  # mirrored tree that receives the cleaned files

# --- Regexes
# "ORIGINAL TEXT" header label at the start of a line (case-insensitive)
HEADER_RE = re.compile(r'(?im)^\s*ORIGINAL\s+TEXT\s*\n?')
# Verse markers like "1CO.1.11" at the start of lines
VERSE_RE  = re.compile(r'(?m)^\s*(?:[1-3]?[A-Z]{2,}\.\d+\.\d+)\s*')
# Sentence boundary: a terminator (. ! ?), any run of closing quotes that
# belongs to the sentence, then the whitespace gap before the next sentence.
# Compiled once here instead of on every call of the cleaning function.
_SENTENCE_END_RE = re.compile(r"""([.!?][”"’']*)\s+""")

# --- Cleaning function
def clean_and_sentence_break(text: str) -> str:
    """Strip header/verse markers from *text* and put one sentence per paragraph.

    Processing steps:
      1. Remove "ORIGINAL TEXT" header labels.
      2. Remove verse markers such as "1CO.1.11" at line starts.
      3. Join all lines into a single space-separated run of text.
      4. Split after ``.``, ``!`` or ``?``; closing quotes that directly
         follow the terminator (including stacked ones such as ``.’”``)
         stay attached to the sentence they close.

    Args:
        text: Raw file contents.

    Returns:
        The sentences separated by blank lines, followed by one trailing
        newline. Input without any sentence content yields ``"\\n"``.
    """
    # 1) Drop header labels
    text = HEADER_RE.sub("", text)

    # 2) Strip verse markers at line starts
    text = VERSE_RE.sub("", text)

    # 3) Turn all newlines into spaces; collapse multiple spaces
    text = re.sub(r'[ \t]*\n[ \t]*', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text).strip()

    # 4) Mark every sentence boundary with "\n". Step 3 removed every newline,
    #    so "\n" is a safe sentinel to split on afterwards.
    marked = _SENTENCE_END_RE.sub(r'\1\n', text)
    sentences = [part.strip() for part in marked.split("\n") if part.strip()]

    # 5) One sentence per paragraph
    return "\n\n".join(sentences) + "\n"

# --- Batch process: mirror IN_ROOT into OUT_ROOT
# os.walk() ignores scan errors by default, so a missing IN_ROOT would just
# yield nothing and the run would end with "Processed 0 files.". IN_ROOT is
# resolved against the current working directory, so fail loudly and show the
# absolute path that was actually looked up.
if not os.path.isdir(IN_ROOT):
    raise FileNotFoundError(
        f"Input root not found: {os.path.abspath(IN_ROOT)}"
    )

count = 0
for root, _dirs, files in os.walk(IN_ROOT):
    for fname in files:
        # only plain-text files are processed (extension check is case-insensitive)
        if not fname.lower().endswith(".txt"):
            continue

        in_path = os.path.join(root, fname)
        # relative path (to mirror folder structure)
        rel_path = os.path.relpath(in_path, IN_ROOT)
        out_path = os.path.join(OUT_ROOT, rel_path)

        # ensure output directory exists
        out_dir = os.path.dirname(out_path)
        os.makedirs(out_dir, exist_ok=True)

        # read -> clean -> write
        # errors="ignore" silently drops bytes that are not valid UTF-8
        with open(in_path, "r", encoding="utf-8", errors="ignore") as f:
            raw = f.read()
        cleaned = clean_and_sentence_break(raw)
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(cleaned)

        count += 1

print(f"Processed {count} files.")
print("Output root:", os.path.abspath(OUT_ROOT))

Processed 0 files.
Output root: /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/sentence/yna/Cebuano
