In [2]:
from pathlib import Path
import re, unicodedata, pandas as pd

IN_DIR  = Path("../books")       # run from emobook/notebooks/
OUT_DIR = Path("../books_clean")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- core: extract story between PG START/END (prefer modern START) ----
MODERN_START = re.compile(r"^\s*\*\*\*\s*start of (?:this|the) project gutenberg ebook.*$", re.I|re.M)
LEGACY_START = re.compile(r"^\s*the project gutenberg e?book of .*$", re.I|re.M)
END_RES = [re.compile(p, re.I|re.M) for p in [
    r"^\s*\*\*\*\s*end of (?:this|the) project gutenberg ebook.*$",
    r"^\s*end of (?:this|the) project gutenberg ebook.*$",
    r"^\s*end of project gutenberg'?s .*?$",
    r"^\s*end of the project gutenberg ebook of .*$",
    r"^\s*project gutenberg(?:™|) license.*$",
]]

def extract_gutenberg_story(text: str) -> str:
    t = text.replace("\r\n","\n").replace("\r","\n")
    if t.startswith("\ufeff"):  # strip BOM
        t = t.lstrip("\ufeff")

    m = MODERN_START.search(t)
    if m:
        # jump to the newline AFTER the marker line
        start_idx = t.find("\n", m.end())
        start_idx = start_idx + 1 if start_idx != -1 else m.end()
    else:
        mh = LEGACY_START.search(t)
        if mh:
            start_idx = t.find("\n", mh.end())
            start_idx = start_idx + 1 if start_idx != -1 else mh.end()
            # prefer a later modern START if present
            m2 = MODERN_START.search(t, pos=start_idx)
            if m2:
                start_idx = t.find("\n", m2.end())
                start_idx = start_idx + 1 if start_idx != -1 else m2.end()
        else:
            start_idx = 0  # no markers, pass through from start

    # earliest END marker AFTER start
    ends = [m.start() for R in END_RES if (m := R.search(t, pos=start_idx))]
    end_idx = min(ends) if ends else len(t)
    return t[start_idx:end_idx].lstrip("\n")

# --- optional: drop 'Contents' block safely (handles roman item lists like "I. ...") ---
ROMAN_ITEM = re.compile(r"(?:^|\n)\s*[IVXLCDM]+\.\s", re.M)
CHAP_OR_SCENE = re.compile(r"(?:^|\n)\s*(?:chapter|book|part|canto|volume|act|scene)\s+[ivxlcdm0-9]+\b", re.I|re.M)

def drop_contents_block(core: str):
    m = re.search(r"(?:^|\n)\s*contents\s*\n", core[:200_000], re.I)
    if not m:
        return core, False
    after = core[m.end():]
    c1 = CHAP_OR_SCENE.search(after)
    c2 = ROMAN_ITEM.search(after)
    idxs = [x.start() for x in (c1, c2) if x]
    return (after[min(idxs):], True) if idxs else (core, False)

# --- normalization (minimal, tokenizer-friendly) ---
def normalize_basic(txt: str) -> str:
    txt = unicodedata.normalize("NFC", txt)
    txt = txt.replace("“", '"').replace("”", '"').replace("‘", "'").replace("’", "'")
    txt = re.sub(r"[ \t]+", " ", txt)      # collapse spaces
    txt = re.sub(r"\n{3,}", "\n\n", txt)   # collapse blank lines
    return txt.strip()

# --- process all books in ../books ----------------------------------------
def preprocess_all(in_dir: Path, out_dir: Path, remove_contents=True):
    rows = []
    for p in sorted(in_dir.glob("*.txt")):
        raw = p.read_text(encoding="utf-8", errors="ignore")
        core = extract_gutenberg_story(raw)

        contents_removed = False
        if remove_contents:
            core, contents_removed = drop_contents_block(core)

        cleaned = normalize_basic(core)
        outp = out_dir / (p.stem + ".clean.txt")
        outp.write_text(cleaned, encoding="utf-8")

        rows.append({
            "file": p.name,
            "chars_raw": len(raw),
            "chars_clean": len(cleaned),
            "reduction_%": round(100*(1 - len(cleaned)/max(1, len(raw))), 2),
            "contents_removed": contents_removed
        })
        print(f"✔ {p.name} → {outp.name} (contents_removed={contents_removed})")

    df = pd.DataFrame(rows)
    df.to_csv(out_dir / "preprocess_report.csv", index=False)
    df

_ = preprocess_all(IN_DIR, OUT_DIR, remove_contents=True)


✔ Frankenstein.txt → Frankenstein.clean.txt (contents_removed=True)
✔ Mobi Dick.txt → Mobi Dick.clean.txt (contents_removed=True)
✔ Pride and Prejudice.txt → Pride and Prejudice.clean.txt (contents_removed=False)
✔ Romeo and Juliet.txt → Romeo and Juliet.clean.txt (contents_removed=True)
✔ The Adventures of Sherlock Holmes.txt → The Adventures of Sherlock Holmes.clean.txt (contents_removed=True)


In [3]:
from pathlib import Path
import re, pandas as pd

CLEAN_DIR = Path("../books_clean")

def audit(text: str):
    head = text[:5000]
    return {
        "has_pg_marker": bool(re.search(r"project gutenberg", text, re.I)),
        "has_license": bool(re.search(r"project gutenberg.*license", text, re.I)),
        "contents_in_head": bool(re.search(r"(?:^|\n)\s*contents\s*(?:\n|:)", head, re.I)),
        "has_illustrations": bool(re.search(r"\[?illustration[:\]]", text, re.I)),
        "stage_dir_count": len(re.findall(r"\[[^\[\]\n]{0,200}\]", text)),
        "act_scene_cues": len(re.findall(r"^\s*(ACT|SCENE)\s+[IVXLC]+", text, re.I|re.M)),
        "speaker_lines": len(re.findall(r"^[A-Z][A-Z '\-]{2,}[:\.]$", text, re.M)),
        "chars": len(text)
    }

rows = []
for p in sorted(CLEAN_DIR.glob("*.clean.txt")):
    t = p.read_text(encoding="utf-8")
    rows.append({"file": p.name, **audit(t)})

df = pd.DataFrame(rows)
display(df)


Unnamed: 0,file,has_pg_marker,has_license,contents_in_head,has_illustrations,stage_dir_count,act_scene_cues,speaker_lines,chars
0,Frankenstein.clean.txt,False,False,False,False,3,0,0,419006
1,Mobi Dick.clean.txt,True,False,False,False,1,0,2,1218030
2,Pride and Prejudice.clean.txt,False,False,False,True,125,0,58,721387
3,Romeo and Juliet.clean.txt,False,False,False,False,123,58,840,142317
4,The Adventures of Sherlock Holmes.clean.txt,False,False,False,False,0,0,1,561782


In [5]:
# In your 02_chunk.ipynb, replace split_sentences with this:

def split_sentences(text: str):
    """
    Pluggable sentence splitter:
    1) PySBD  2) BlingFire  3) NLTK Punkt  4) Regex fallback
    """
    # 1) PySBD
    try:
        import pysbd
        seg = pysbd.Segmenter(language='en', clean=False)
        return seg.segment(text)
    except Exception:
        pass

    # 2) BlingFire
    try:
        import blingfire
        s = blingfire.text_to_sentences(text)
        return [t.strip() for t in s.split('\n') if t.strip()]
    except Exception:
        pass

    # 3) NLTK Punkt
    try:
        import nltk
        try:
            _ = nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt', quiet=True)
        from nltk.tokenize import sent_tokenize
        return sent_tokenize(text)
    except Exception:
        pass

    # 4) Regex fallback (no look-behinds; protects common traps)
    import re
    t = text.replace("...", "<ELLIPSIS>")
    t = re.sub(r"(?<=\d)\.(?=\d)", "<DECIMAL_DOT>", t)
    # insert EOS markers after sentence-final punct followed by space+capital
    t = re.sub(r'([.!?]["\')\]]*)(\s+)(?=[A-Z])', r'\1<EOS>\2', t)
    parts = [p.strip() for p in t.split("<EOS>") if p.strip()]
    return [p.replace("<ELLIPSIS>", "...").replace("<DECIMAL_DOT>", ".") for p in parts]


In [6]:
# === Chunking pipeline (run after defining split_sentences) ==============

from pathlib import Path
import pandas as pd

# --- Config ---
CLEAN_DIR = Path("../books_clean")   # run notebook from emobook/notebooks/
CHUNK_DIR = Path("../chunks")
TARGET_WORDS = 120   # avg words per chunk
STRIDE_WORDS = 60    # overlap (words) between consecutive chunks (~50%)
CAP_WORDS = 220      # hard cap to avoid very long last sentences inflating size

CHUNK_DIR.mkdir(parents=True, exist_ok=True)

# --- Chunk builders ---
def chunk_by_words(sentences, target=TARGET_WORDS, stride=STRIDE_WORDS, cap=CAP_WORDS):
    """Pack sentences until ~target words; slide forward keeping `stride` words of overlap."""
    chunks, buf, cur_words = [], [], 0
    for s in sentences:
        w = len(s.split())
        if cur_words + w <= target or not buf:
            buf.append(s); cur_words += w
        else:
            text = " ".join(buf)
            words = text.split()
            chunks.append(" ".join(words[:cap]))
            if stride > 0:
                keep_words = " ".join(words[-stride:])
                buf = [keep_words, s]
                cur_words = len(keep_words.split()) + w
            else:
                buf = [s]
                cur_words = w
    if buf:
        text = " ".join(buf)
        words = text.split()
        chunks.append(" ".join(words[:cap]))
    return chunks

def chunk_text_by_words_direct(text: str, target=TARGET_WORDS, stride=STRIDE_WORDS, cap=CAP_WORDS):
    """Optional: chunk directly by words (no sentence split)."""
    tokens = text.split()
    chunks, i, step = [], 0, max(1, target - stride)
    while i < len(tokens):
        window = tokens[i:i+target]
        if not window: break
        chunks.append(" ".join(window[:cap]))
        i += step
    return chunks

# --- Driver ---
def build_chunks_for_all(use_sentence_split: bool = True,
                         target: int = TARGET_WORDS,
                         stride: int = STRIDE_WORDS,
                         cap: int = CAP_WORDS):
    stats = []
    for p in sorted(CLEAN_DIR.glob("*.clean.txt")):
        txt = p.read_text(encoding="utf-8", errors="ignore")

        if use_sentence_split:
            sents = split_sentences(txt)           # uses your pluggable splitter
            chunks = chunk_by_words(sents, target=target, stride=stride, cap=cap)
            method = "sentence+word-pack"
        else:
            chunks = chunk_text_by_words_direct(txt, target=target, stride=stride, cap=cap)
            method = "direct-word"

        out_csv = CHUNK_DIR / f"{p.stem}.chunks.csv"
        pd.DataFrame({
            "book": p.stem,
            "chunk_id": range(len(chunks)),
            "text": chunks
        }).to_csv(out_csv, index=False)

        n_words = [len(c.split()) for c in chunks] if chunks else []
        stats.append({
            "book": p.stem,
            "num_chunks": len(chunks),
            "words_total": len(txt.split()),
            "mean_words_per_chunk": round(sum(n_words)/len(n_words), 2) if n_words else 0,
            "min_words_per_chunk": min(n_words) if n_words else 0,
            "max_words_per_chunk": max(n_words) if n_words else 0,
            "target": target, "stride": stride, "cap": cap, "method": method
        })
        print(f"✔ {p.stem}: {len(chunks)} chunks → {out_csv.name}")

    stats_df = pd.DataFrame(stats)
    stats_df.to_csv(CHUNK_DIR / "chunks_summary.csv", index=False)
    display(stats_df)
    print(f"Saved summary → {CHUNK_DIR / 'chunks_summary.csv'}")

# --- Run it ---
build_chunks_for_all(use_sentence_split=True)

# --- Optional quick peek ---
try:
    any_file = next(CHUNK_DIR.glob("*.chunks.csv"))
    display(pd.read_csv(any_file).head(5))
except StopIteration:
    pass


✔ Frankenstein.clean: 1559 chunks → Frankenstein.clean.chunks.csv
✔ Mobi Dick.clean: 4194 chunks → Mobi Dick.clean.chunks.csv
✔ Pride and Prejudice.clean: 2598 chunks → Pride and Prejudice.clean.chunks.csv
✔ Romeo and Juliet.clean: 499 chunks → Romeo and Juliet.clean.chunks.csv
✔ The Adventures of Sherlock Holmes.clean: 2168 chunks → The Adventures of Sherlock Holmes.clean.chunks.csv


Unnamed: 0,book,num_chunks,words_total,mean_words_per_chunk,min_words_per_chunk,max_words_per_chunk,target,stride,cap,method
0,Frankenstein.clean,1559,75023,108.04,70,220,120,60,220,sentence+word-pack
1,Mobi Dick.clean,4194,212780,110.48,61,220,120,60,220,sentence+word-pack
2,Pride and Prejudice.clean,2598,127359,108.91,65,220,120,60,220,sentence+word-pack
3,Romeo and Juliet.clean,499,25946,111.88,66,205,120,60,220,sentence+word-pack
4,The Adventures of Sherlock Holmes.clean,2168,104496,108.08,62,220,120,60,220,sentence+word-pack


Saved summary → ../chunks/chunks_summary.csv


Unnamed: 0,book,chunk_id,text
0,Frankenstein.clean,0,Chapter 1 Chapter 2 Chapter 3 Chapter 4 Chapte...
1,Frankenstein.clean,1,"Letter 1 _To Mrs. Saville, England._ St. Peter..."
2,Frankenstein.clean,2,first task is to assure my dear sister of my w...
3,Frankenstein.clean,3,"of Petersburgh, I feel a cold northern breeze ..."
4,Frankenstein.clean,4,"of promise, my daydreams become more fervent a..."
