In [1]:
import json
from pathlib import Path
import re

# Short code identifying the book; used in the input/output paths and chunk IDs.
BOOK_CODE = "NOTW"
# Directory scanned for per-chapter "*.txt" files.
CHAPTER_DIR = Path(f"../data/processed/{BOOK_CODE}")
# JSON Lines output: one chunk record per line.
OUT_PATH = Path(f"../data/processed/{BOOK_CODE}_chunks.jsonl")

# Rough token estimate: ~4 chars per token.
# NOTE(review): the earlier notes here said ~800 and ~100 tokens, which does
# not match 4 chars/token for these values -- confirm the intended budget.
# TARGET_CHARS is a soft limit: a chunk can exceed it once the overlap is
# prepended or when a single paragraph is longer than the limit.
TARGET_CHARS = 5000      # ~1,250 tokens at 4 chars/token
OVERLAP_CHARS = 800      # ~200 tokens at 4 chars/token

# Paragraph boundary: a newline, optional whitespace, then one or more newlines.
PARA_SPLIT_RE = re.compile(r"\n\s*\n+")

def chunk_text(text: str):
    """Split *text* into overlapping, paragraph-aligned chunks.

    The text is split into paragraphs with ``PARA_SPLIT_RE``; paragraphs are
    stripped and empty ones dropped. Paragraphs are then packed greedily,
    joined by a blank line, until adding the next one would push the chunk
    past ``TARGET_CHARS``. Each new chunk starts with the last
    ``OVERLAP_CHARS`` characters of the previous chunk so context carries
    across the boundary.

    ``TARGET_CHARS`` is a soft limit: paragraphs are never split, so a single
    paragraph longer than the limit becomes its own oversized chunk, and the
    prepended overlap is not counted against the limit for the first
    paragraph of a chunk.

    Args:
        text: Raw text to chunk.

    Returns:
        A list of non-empty chunk strings in document order; an empty list
        when *text* contains no non-blank paragraphs.
    """
    paragraphs = [p.strip() for p in PARA_SPLIT_RE.split(text) if p.strip()]
    chunks = []
    current = ""
    for p in paragraphs:
        if not current:
            # Nothing accumulated yet: start the chunk with this paragraph
            # even if it is oversized, instead of flushing an empty chunk.
            current = p
        elif len(current) + len(p) <= TARGET_CHARS:
            current += "\n\n" + p
        else:
            chunks.append(current)
            # Carry the tail of the finished chunk into the next one. An
            # explicit start index is used because current[-0:] would be the
            # whole string when the overlap is configured as 0.
            if OVERLAP_CHARS > 0:
                overlap = current[max(len(current) - OVERLAP_CHARS, 0):]
            else:
                overlap = ""
            current = overlap + "\n\n" + p if overlap else p
    if current:
        chunks.append(current)
    return chunks

def main():
    """Chunk every chapter file and write the chunks as JSON Lines.

    Reads each ``*.txt`` file in ``CHAPTER_DIR``, whose stem must be the
    integer chapter number, splits it with ``chunk_text``, and writes one
    JSON object per chunk to ``OUT_PATH`` with the keys ``book``,
    ``chapter``, ``chunk_id`` and ``text``. ``OUT_PATH`` is overwritten.

    Raises:
        ValueError: If a chapter file's stem is not an integer. This is
            raised before ``OUT_PATH`` is opened, so existing output is not
            truncated by a bad filename.
    """
    # Sort by chapter number rather than by path string, so "10.txt" comes
    # after "2.txt" when filenames are not zero-padded. The file name breaks
    # ties (e.g. "1.txt" vs "01.txt") to keep the order deterministic.
    chapter_files = sorted(
        CHAPTER_DIR.glob("*.txt"),
        key=lambda path: (int(path.stem), path.name),
    )

    with OUT_PATH.open("w", encoding="utf-8") as out_f:
        for chapter_file in chapter_files:
            chapter_num = int(chapter_file.stem)
            # Undecodable bytes are replaced rather than aborting the run.
            text = chapter_file.read_text(encoding="utf-8", errors="replace").strip()

            chunks = chunk_text(text)
            # Chunk numbering is 1-based within each chapter.
            for i, chunk in enumerate(chunks, start=1):
                record = {
                    "book": BOOK_CODE,
                    "chapter": chapter_num,
                    "chunk_id": f"{BOOK_CODE}_{chapter_num:03d}_{i:03d}",
                    "text": chunk,
                }
                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(f"Wrote chunks to {OUT_PATH}")

# Run the chunking pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Wrote chunks to ../data/processed/NOTW_chunks.jsonl
