In [1]:
import re
from pathlib import Path

# Short code identifying the book. Both the raw input file name and the
# output folder are derived from it, so switching books is a one-line change.
BOOK_CODE = "NOTW"
INPUT_PATH = Path(f"../data/raw/{BOOK_CODE}.txt")
OUT_DIR = Path(f"../data/processed/{BOOK_CODE}")

# Matches a heading line made only of upper-case letters, spaces and hyphens
# after the word CHAPTER: "CHAPTER TWO" / "CHAPTER THIRTY-SEVEN" etc.
# Lines containing lower-case letters, digits or punctuation do not match.
# Group 1 captures everything after "CHAPTER"; an all-caps trailing title
# would still match here and is only rejected when number conversion fails.
CHAPTER_LINE_RE = re.compile(r"^CHAPTER\s+([A-Z][A-Z\- ]+)$")

# Prologue/Epilogue markers: the word must start the line (any case) and end
# at a word boundary, so "EPILOGUES" does not match.
PROLOGUE_RE = re.compile(r"^PROLOGUE\b", re.IGNORECASE)
EPILOGUE_RE = re.compile(r"^EPILOGUE\b", re.IGNORECASE)

# Word-to-number (extend as needed)
WORD_NUM = {
    "ONE": 1, "TWO": 2, "THREE": 3, "FOUR": 4, "FIVE": 5,
    "SIX": 6, "SEVEN": 7, "EIGHT": 8, "NINE": 9, "TEN": 10,
    "ELEVEN": 11, "TWELVE": 12, "THIRTEEN": 13, "FOURTEEN": 14, "FIFTEEN": 15,
    "SIXTEEN": 16, "SEVENTEEN": 17, "EIGHTEEN": 18, "NINETEEN": 19, "TWENTY": 20,
    "THIRTY": 30, "FORTY": 40, "FIFTY": 50, "SIXTY": 60, "SEVENTY": 70,
    "EIGHTY": 80, "NINETY": 90, "HUNDRED": 100,
}


def words_to_int(s: str) -> int:
    """Convert a spelled-out number such as 'TWENTY-ONE' to an int.

    Words may be separated by any run of whitespace and/or hyphens and are
    looked up in WORD_NUM case-insensitively ('Thirty seven' works too).
    'HUNDRED' multiplies the running total by 100 (a bare 'HUNDRED' counts
    as 100); every other word is added to the running total. The word order
    is not validated beyond that.

    Raises:
        ValueError: if any word (including the empty string produced by
            empty input or a stray separator) is not a key of WORD_NUM.
    """
    total = 0
    for word in re.split(r"[\s\-]+", s.strip()):
        # Look up the upper-cased form but report the word as it was given.
        key = word.upper()
        if key not in WORD_NUM:
            raise ValueError(f"Unknown number word: {word}")
        value = WORD_NUM[key]
        if value == 100:
            # max(1, ...) lets "HUNDRED" on its own mean 100 instead of 0.
            total = max(1, total) * 100
        else:
            total += value
    return total

def is_real_chapter_start(lines, i) -> bool:
    """
    Decide whether the chapter heading at index i opens an actual chapter.

    Looks at the lines following i (up to index i + 11, bounded by the end of
    the list) and takes the first one that is not blank after stripping:
    - if that line is itself a chapter heading, i is treated as a list of
      headings (e.g. a table of contents) and the result is False;
    - if it is anything else, the result is True;
    - if every line in the window is blank, or there are no lines left,
      the result is False.
    """
    upper = min(i + 12, len(lines))
    stripped = (lines[j].strip() for j in range(i + 1, upper))
    first_text = next((s for s in stripped if s), None)
    if first_text is None:
        return False
    followed_by_heading = CHAPTER_LINE_RE.match(first_text) is not None
    return not followed_by_heading

def find_first_line_index(lines, pattern_re, start: int = 0) -> int | None:
    for i in range(start, len(lines)):
        if pattern_re.match(lines[i].strip()):
            return i
    return None

def _write_section(out_path, section_lines) -> None:
    """Write section_lines joined by newlines, trimmed, with one trailing newline."""
    out_path.write_text("\n".join(section_lines).strip() + "\n", encoding="utf-8")


def main():
    """Split the book at INPUT_PATH into one text file per section in OUT_DIR.

    Output files:
    - 000.txt: the prologue, when a prologue marker occurs before the first chapter.
    - NNN.txt: one file per detected chapter, named by its chapter number.
    - (last chapter number + 1).txt: the epilogue, when a marker follows the
      start of the last chapter.

    Raises:
        RuntimeError: if no chapter heading is detected.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    text = INPUT_PATH.read_text(encoding="utf-8", errors="replace")
    lines = text.splitlines()

    # --- Find real chapter starts ---
    chapter_starts: list[tuple[int, int]] = []  # (chapter number, line index)
    for i, line in enumerate(lines):
        m = CHAPTER_LINE_RE.match(line.strip())
        if not (m and is_real_chapter_start(lines, i)):
            continue
        try:
            chap_num = words_to_int(m.group(1))
        except ValueError:
            # Heading-shaped line whose words are not a number: deliberately skipped.
            continue
        chapter_starts.append((chap_num, i))

    # Order by chapter number, then by position in the file.
    chapter_starts.sort()
    if not chapter_starts:
        raise RuntimeError("No chapters found. Check regex against your chapter headings.")

    first_chapter_i = chapter_starts[0][1]
    last_chap_num, last_start_i = chapter_starts[-1]
    ep_out_num = last_chap_num + 1  # 93 for a 92-chapter book

    # --- Locate the epilogue (ONLY after last chapter; take the LAST epilogue hit) ---
    # Done before writing chapters so the final chapter can stop where the
    # epilogue begins instead of running to the end of the file.
    epilogue_hits = [
        i
        for i in range(last_start_i + 1, len(lines))
        if EPILOGUE_RE.match(lines[i].strip())
    ]
    epi_i = epilogue_hits[-1] if epilogue_hits else None

    # --- Prologue (write as 000.txt if it appears before first chapter) ---
    pro_i = find_first_line_index(lines, PROLOGUE_RE)
    if pro_i is not None and pro_i < first_chapter_i:
        _write_section(OUT_DIR / "000.txt", lines[pro_i:first_chapter_i])
        print("Wrote prologue to 000.txt")

    # --- Write chapters ---
    for idx, (chap_num, start_i) in enumerate(chapter_starts):
        if idx + 1 < len(chapter_starts):
            end_i = chapter_starts[idx + 1][1]
        elif epi_i is not None:
            # Last chapter: stop at the epilogue, which gets its own file.
            end_i = epi_i
        else:
            end_i = len(lines)
        _write_section(OUT_DIR / f"{chap_num:03d}.txt", lines[start_i:end_i])

    # --- Epilogue ---
    if epi_i is not None:
        _write_section(OUT_DIR / f"{ep_out_num:03d}.txt", lines[epi_i:])
        print(f"Wrote epilogue to {ep_out_num:03d}.txt")
    else:
        print("Did not detect an epilogue after the final chapter. (Nothing written.)")

    print(f"Found {len(chapter_starts)} chapters.")
    print(f"Wrote chapter files to: {OUT_DIR}")

# Run the splitter only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Wrote prologue to 000.txt
Wrote epilogue to 093.txt
Found 92 chapters.
Wrote chapter files to: ../data/processed/NOTW
