In [1]:
# EmoBook — Benchmark Downloader (Project Gutenberg, 25 titles)
# - Saves to emobook/data/raw/
# - Tries multiple URL patterns per ID (.txt first, then .zip)
# - Validates PG START marker when possible
# - Skips existing files unless force=True
# ------------------------------------------------------------

import os, time, io, zipfile, re, unicodedata
from pathlib import Path
import requests
import pandas as pd

# --- Paths ---------------------------------------------------
# Directory that receives the downloaded texts. The path is relative to the
# current working directory, so change it if your notebook is elsewhere.
RAW_DIR = Path("emobook/data/raw/")
ROOT = RAW_DIR  # alias of RAW_DIR
RAW_DIR.mkdir(parents=True, exist_ok=True)

# --- HTTP session with polite headers ------------------------
# Seconds passed as `timeout=` to every request made through S.
TIMEOUT = 30
# One shared session; the User-Agent identifies this client to the server.
S = requests.Session()
S.headers["User-Agent"] = "EmoBook/1.0 (research; contact: you@example.com)"

# --- PG URL candidates ---------------------------------------
def candidate_urls(gid: int):
    """Return the download URLs to try for Gutenberg ebook *gid*, in order.

    The four plain-text locations come first, followed by the three zip
    archives that serve as fallbacks.
    """
    g = str(gid)
    files_dir = f"https://www.gutenberg.org/files/{g}"
    cache_dir = f"https://www.gutenberg.org/cache/epub/{g}"

    plain_text = [
        f"{files_dir}/{g}-0.txt",
        f"{files_dir}/{g}.txt",
        f"{cache_dir}/pg{g}.txt",
        f"{cache_dir}/pg{g}.txt.utf8",
    ]
    zipped = [
        f"{files_dir}/{g}-0.zip",
        f"{files_dir}/{g}.zip",
        f"{cache_dir}/pg{g}.zip",
    ]
    return plain_text + zipped

# Case-insensitive match for the "*** START OF THE/THIS PROJECT GUTENBERG EBOOK"
# header line; the whitespace after the asterisks is optional.
PG_START = re.compile(
    r"\*\*\*\s*START OF (?:THIS|THE) PROJECT GUTENBERG EBOOK",
    flags=re.IGNORECASE,
)

# --- Safe filename -------------------------------------------
def slugify(s: str) -> str:
    """Turn *s* into an ASCII, filename-safe token.

    The text is NFKD-normalized and every non-ASCII code point is dropped
    (so accented letters keep their base letter). Each run of characters
    other than A-Z, a-z, 0-9 then becomes a single underscore, and
    leading/trailing underscores are removed.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
    underscored = re.sub(r"[^A-Za-z0-9]+", "_", ascii_only)
    return underscored.strip("_")

def out_name(title: str, author: str, gid: int) -> str:
    """Build the output filename ``<title>__<author>__<gid>.txt``.

    Title and author are passed through ``slugify``; the ID is appended as-is.
    """
    parts = [slugify(title), slugify(author), f"{gid}"]
    return "__".join(parts) + ".txt"

# --- Download function ---------------------------------------
def _decode_book_bytes(raw: bytes) -> str:
    """Decode *raw* as strict UTF-8, falling back to Latin-1."""
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        # Latin-1 maps every byte value, so this fallback cannot fail.
        return raw.decode("latin-1")


def _text_from_zip(content: bytes, gid: int):
    """Return the decoded text of the most likely book file in a zip, or None.

    Known Gutenberg member names for *gid* are preferred; otherwise the
    alphabetically first ``.txt`` member is used. Any failure to read the
    archive yields None so the caller can move on to the next URL.
    """
    preferred = [f"{gid}-0.txt", f"pg{gid}.txt", f"{gid}.txt", f"pg{gid}.txt.utf8"]
    try:
        with zipfile.ZipFile(io.BytesIO(content)) as z:
            names = z.namelist()
            pick = next((c for c in preferred if c in names), None)
            if pick is None:
                txts = sorted(n for n in names if n.lower().endswith(".txt"))
                pick = txts[0] if txts else None
            if pick is None:
                return None
            return _decode_book_bytes(z.read(pick))
    except Exception:
        # Deliberate best-effort: a corrupt, encrypted or otherwise unreadable
        # archive is treated the same as a missing one.
        return None


def fetch_book(title: str, author: str, primary_id: int, alt_ids=None, force=False, sleep=1.0):
    """Download one book as UTF-8 text into RAW_DIR and report what happened.

    IDs are tried in order (``primary_id`` first, then ``alt_ids``). For each
    ID, an already-saved file is reused without touching the network unless
    ``force`` is true; otherwise every URL from ``candidate_urls`` is tried
    until one yields non-empty text.

    Args:
        title: Book title; used for the output filename and the report.
        author: Author string; used for the output filename and the report.
        primary_id: Gutenberg ebook ID to try first.
        alt_ids: Optional iterable of fallback IDs.
        force: When true, download and overwrite even if the file exists.
        sleep: Seconds to wait after one ID fails before trying the next.

    Returns:
        A dict with keys ``title``, ``author``, ``gid_used``, ``saved_as``,
        ``status``, ``size_bytes``, ``url`` and ``tried``. ``status`` is one
        of ``"ok_marker"``, ``"ok_nomarker"``, ``"exists"`` or ``"failed"``.
        ``url`` is None when nothing was downloaded (``"exists"`` and
        ``"failed"``); ``tried`` lists every URL that was requested.
    """
    ids = [primary_id, *(alt_ids or [])]
    tried = []
    result = {
        "title": title,
        "author": author,
        "gid_used": None,
        "saved_as": "",
        "status": "failed",
        "size_bytes": 0,
        "url": None,
        "tried": tried,
    }

    for pos, gid in enumerate(ids):
        out = RAW_DIR / out_name(title, author, gid)

        # Check for an existing file BEFORE any request, so that re-running
        # the notebook does not download every book again.
        if not force and out.exists():
            result.update(
                gid_used=gid,
                saved_as=str(out),
                status="exists",
                size_bytes=out.stat().st_size,
            )
            return result

        for url in candidate_urls(gid):
            tried.append(url)
            try:
                r = S.get(url, timeout=TIMEOUT)
            except requests.RequestException:
                # Transport problem (DNS, timeout, reset...): try the next URL.
                continue
            if r.status_code != 200:
                continue

            if url.endswith(".zip"):
                text = _text_from_zip(r.content, gid)
            else:
                # Decode the raw bytes ourselves. For text/* responses without
                # a charset, requests assumes ISO-8859-1, which would turn a
                # UTF-8 book into mojibake if r.text were used directly.
                try:
                    text = r.content.decode("utf-8")
                except UnicodeDecodeError:
                    # Not UTF-8: honour the server-declared charset if any.
                    text = r.text if r.encoding else r.content.decode("latin-1")

            if not text:
                continue

            # Marker validation is soft: the file is kept either way and the
            # outcome is only reflected in the status.
            has_marker = bool(PG_START.search(text))
            out.write_text(text, encoding="utf-8")
            result.update(
                gid_used=gid,
                saved_as=str(out),
                status="ok_marker" if has_marker else "ok_nomarker",
                size_bytes=len(text.encode("utf-8")),
                url=url,
            )
            return result

        # Be polite between ID tries, but do not wait after the last one.
        if pos < len(ids) - 1:
            time.sleep(sleep)

    return result

# --- Benchmark list (25 titles) ------------------------------------------
# Note: multiple IDs provided where editions/translations vary on PG.
# Each entry: "title" and "author" (used for the filename and the report),
# "id" (primary Project Gutenberg ebook ID) and optional "alt" (fallback IDs
# tried in order when the primary cannot be downloaded).
# NOTE(review): a wrong alt ID silently saves a different book under this
# title's filename, so confirm every alt ID is the same work/translation.
books = [
    # Austen & Brontë
    {"title":"Pride and Prejudice", "author":"Jane Austen", "id":1342},
    {"title":"Sense and Sensibility", "author":"Jane Austen", "id":161, "alt":[21839]},
    {"title":"Jane Eyre", "author":"Charlotte Brontë", "id":1260, "alt":[10210]},
    {"title":"Wuthering Heights", "author":"Emily Brontë", "id":768},
    # Dickens
    {"title":"Great Expectations", "author":"Charles Dickens", "id":1400},
    {"title":"A Tale of Two Cities", "author":"Charles Dickens", "id":98},
    {"title":"Oliver Twist", "author":"Charles Dickens", "id":730},
    # American 19thC
    {"title":"Moby-Dick; or, The Whale", "author":"Herman Melville", "id":2701},
    {"title":"Adventures of Huckleberry Finn", "author":"Mark Twain", "id":76},
    {"title":"The Scarlet Letter", "author":"Nathaniel Hawthorne", "id":33},
    # Gothic / Horror
    {"title":"Frankenstein; or, The Modern Prometheus", "author":"Mary Shelley", "id":84},
    {"title":"Dracula", "author":"Bram Stoker", "id":345},
    {"title":"Strange Case of Dr. Jekyll and Mr. Hyde", "author":"Robert Louis Stevenson", "id":43},
    {"title":"The Picture of Dorian Gray", "author":"Oscar Wilde", "id":174},
    {"title":"Heart of Darkness", "author":"Joseph Conrad", "id":526},
    # Detective
    {"title":"The Adventures of Sherlock Holmes", "author":"Arthur Conan Doyle", "id":1661},
    {"title":"The Hound of the Baskervilles", "author":"Arthur Conan Doyle", "id":2852},
    # Russian (public-domain translations)
    {"title":"Crime and Punishment", "author":"Fyodor Dostoevsky (tr. Garnett)", "id":2554},
    {"title":"The Brothers Karamazov", "author":"Fyodor Dostoevsky (tr. Garnett)", "id":28054, "alt":[58054, 28049]},
    {"title":"War and Peace", "author":"Leo Tolstoy (tr. Maude)", "id":2600},
    # NOTE(review): 158 appears to be Austen's "Emma" on PG, not Anna
    # Karenina — verify and drop it if so.
    {"title":"Anna Karenina", "author":"Leo Tolstoy (tr. Garnett/Maude)", "id":1399, "alt":[158, 34901]},
    # French epics
    # 2600 was removed from the alt list below: it is the primary ID of
    # "War and Peace" above, so falling back to it would save the wrong novel.
    {"title":"Les Misérables", "author":"Victor Hugo (tr. Hapgood)", "id":135, "alt":[17489]},
    {"title":"The Count of Monte Cristo", "author":"Alexandre Dumas", "id":1184, "alt":[2760, 17989]},
    # Classical (English trans.)
    {"title":"The Odyssey", "author":"Homer (tr. Samuel Butler)", "id":1727},
    # Shakespeare
    {"title":"Romeo and Juliet", "author":"William Shakespeare", "id":1513},
]

# --- Run the downloader ---------------------------------------------------
# Fetch every benchmark title, printing one progress line per book.
rows = []
for b in books:
    row = fetch_book(
        b["title"],
        b["author"],
        b["id"],
        alt_ids=b.get("alt", []),
        force=False,
        sleep=0.5,
    )
    # Show only the file name; an em dash marks a book that was not saved.
    saved_name = Path(row["saved_as"]).name if row["saved_as"] else "—"
    print(f"{row['title']:<35} {row['status']:<12} id={row['gid_used']} -> {saved_name}")
    rows.append(row)

# Summarize the run as a table (the "tried" column is omitted from the view).
report = pd.DataFrame(rows)
report_columns = ["title", "author", "gid_used", "status", "size_bytes", "url", "saved_as"]
display(report[report_columns])

# A book counts as saved when it was downloaded now or was already on disk.
n_saved = report["status"].isin(["ok_marker", "ok_nomarker", "exists"]).sum()
print(f"\nSaved {n_saved} / {len(report)} files to: {RAW_DIR}")


Pride and Prejudice                 ok_marker    id=1342 -> Pride_and_Prejudice__Jane_Austen__1342.txt
Sense and Sensibility               ok_marker    id=161 -> Sense_and_Sensibility__Jane_Austen__161.txt
Jane Eyre                           ok_marker    id=1260 -> Jane_Eyre__Charlotte_Bronte__1260.txt
Wuthering Heights                   ok_marker    id=768 -> Wuthering_Heights__Emily_Bronte__768.txt
Great Expectations                  ok_marker    id=1400 -> Great_Expectations__Charles_Dickens__1400.txt
A Tale of Two Cities                ok_marker    id=98 -> A_Tale_of_Two_Cities__Charles_Dickens__98.txt
Oliver Twist                        ok_marker    id=730 -> Oliver_Twist__Charles_Dickens__730.txt
Moby-Dick; or, The Whale            ok_marker    id=2701 -> Moby_Dick_or_The_Whale__Herman_Melville__2701.txt
Adventures of Huckleberry Finn      ok_marker    id=76 -> Adventures_of_Huckleberry_Finn__Mark_Twain__76.txt
The Scarlet Letter                  ok_marker    id=33 -> The_Scarlet

Unnamed: 0,title,author,gid_used,status,size_bytes,url,saved_as
0,Pride and Prejudice,Jane Austen,1342,ok_marker,752575,https://www.gutenberg.org/files/1342/1342-0.txt,/Users/nageshs/Desktop/emobook/emobook/data/ra...
1,Sense and Sensibility,Jane Austen,161,ok_marker,712924,https://www.gutenberg.org/files/161/161-0.txt,/Users/nageshs/Desktop/emobook/emobook/data/ra...
2,Jane Eyre,Charlotte Brontë,1260,ok_marker,1065068,https://www.gutenberg.org/files/1260/1260-0.txt,/Users/nageshs/Desktop/emobook/emobook/data/ra...
3,Wuthering Heights,Emily Brontë,768,ok_marker,693647,https://www.gutenberg.org/files/768/768-0.txt,/Users/nageshs/Desktop/emobook/emobook/data/ra...
4,Great Expectations,Charles Dickens,1400,ok_marker,1038512,https://www.gutenberg.org/files/1400/1400-0.txt,/Users/nageshs/Desktop/emobook/emobook/data/ra...
5,A Tale of Two Cities,Charles Dickens,98,ok_marker,807231,https://www.gutenberg.org/files/98/98-0.txt,/Users/nageshs/Desktop/emobook/emobook/data/ra...
6,Oliver Twist,Charles Dickens,730,ok_marker,917357,https://www.gutenberg.org/files/730/730-0.txt,/Users/nageshs/Desktop/emobook/emobook/data/ra...
7,"Moby-Dick; or, The Whale",Herman Melville,2701,ok_marker,1256529,https://www.gutenberg.org/files/2701/2701-0.txt,/Users/nageshs/Desktop/emobook/emobook/data/ra...
8,Adventures of Huckleberry Finn,Mark Twain,76,ok_marker,590756,https://www.gutenberg.org/files/76/76-0.txt,/Users/nageshs/Desktop/emobook/emobook/data/ra...
9,The Scarlet Letter,Nathaniel Hawthorne,33,ok_marker,513556,https://www.gutenberg.org/files/33/33-0.txt,/Users/nageshs/Desktop/emobook/emobook/data/ra...



Saved 25 / 25 files to: /Users/nageshs/Desktop/emobook/emobook/data/raw
