In [None]:
# https://www.gutenberg.org/ebooks/offline_catalogs.html#the-project-gutenberg-catalog-metadata-in-machine-readable-format
!wget https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv

In [26]:
import pandas as pd

def attach_gutenberg_ids(
    my_catalog_path="../catalog.csv",
    pg_catalog_path="pg_catalog.csv",
    output_path="catalog_with_ids.csv"
):
    # Load your curated catalog and the PG catalog
    my_df = pd.read_csv(my_catalog_path)
    pg_df = pd.read_csv(pg_catalog_path)
    pg_df = pg_df[pg_df["Type"] == "Text"]

    # Normalize titles for matching
    my_df["_title_key"] = my_df["title"].astype(str).str.strip().str.lower()
    pg_df["_title_key"] = pg_df["Title"].astype(str).str.strip().str.lower()

    # Build a mapping from title -> Text# from pg_df
    title_to_id = pg_df.set_index("_title_key")["Text#"].to_dict()

    # Replace the 'id' only, preserve all other columns
    new_ids = []
    not_found = []
    
    for _, row in my_df.iterrows():
        key = row["_title_key"]
        if key in title_to_id:
            new_ids.append(title_to_id[key])
        else:
            new_ids.append(row["id"])  # keep original id if not found
            not_found.append(row["title"])

    # Create a new DataFrame with updated IDs
    my_df["id"] = new_ids
    my_df.drop(columns=["_title_key"], inplace=True)

    # Save to new CSV
    my_df.to_csv(output_path, index=False)
    print(f"Updated catalog saved to '{output_path}'")

    # Report unmatched titles
    if not_found:
        print("\nTitles NOT found in pg_catalog.csv:")
        for title in not_found:
            print(f" - {title}")
    else:
        print("✅ All titles matched successfully.")

attach_gutenberg_ids()

Updated catalog saved to 'catalog_with_ids.csv'

Titles NOT found in pg_catalog.csv:
 - Tales of the Grotesque and Arabesque


In [25]:
!grep "Jane" pg_catalog.csv

105,Text,1994-02-01,Persuasion,en,"Austen, Jane, 1775-1817",England -- Social life and customs -- 19th century -- Fiction; Psychological fiction; Young women -- Fiction; Motherless families -- Fiction; Rejection (Psychology) -- Fiction; First loves -- Fiction; Dysfunctional families -- Fiction; Ship captains -- Fiction; Love stories; Regency fiction,PR,Browsing: Culture/Civilization/Society; Browsing: Literature; Browsing: Fiction; Category: Novels; Category: British Literature
121,Text,1994-04-01,Northanger Abbey,en,"Austen, Jane, 1775-1817",England -- Social life and customs -- 19th century -- Fiction; Satire; England -- Fiction; Young women -- Fiction; Love stories; Gothic fiction; Horror tales -- Appreciation -- Fiction; Books and reading -- Fiction; Gentry -- England -- Fiction; Marriage -- Economic aspects -- Fiction,PR,Gothic Fiction; Browsing: Culture/Civilization/Society; Browsing: Literature; Browsing: Fiction; Category: Novels; Category: Classics of Literature; Category: Bri

In [31]:
import pandas as pd
import os
import requests

BASE_URL = "https://mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg"
OUTPUT_DIR = "books"

def get_possible_txt_urls(book_id):
    """Return list of possible URLs and corresponding filenames"""
    id_str = str(book_id)
    subpath = "/".join(id_str[:-1]) + f"/{id_str}" if len(id_str) > 1 else f"{id_str}"
    base_path = f"{BASE_URL}/{subpath}/{id_str}"

    return [
        (f"{base_path}.txt", f"{book_id}.txt"),
        (f"{base_path}-0.txt", f"{book_id}-0.txt"),
    ]

def download_and_save(url, dest_path):
    try:
        r = requests.get(url, stream=True, timeout=15)
        if r.ok:
            with open(dest_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
            return True
    except Exception as e:
        print(f"  ⚠️ Error downloading {url}: {e}")
    return False

def process_catalog(input_path="catalog_with_ids.csv", output_path="catalog_with_ids_and_files.csv",verbose=True):
    df = pd.read_csv(input_path)

    for i, row in df.iterrows():
        book_id = row["id"]
        if pd.isna(book_id):
            print(f"⚠️ Missing book ID at row {i}, skipping...")
            continue

        try:
            book_id = int(book_id)
        except:
            print(f"⚠️ Non Project Gutenburg book ID '{book_id}' at row {i}, skipping...")
            continue

        book_dir = os.path.join(OUTPUT_DIR, str(book_id))
        os.makedirs(book_dir, exist_ok=True)

        # Skip if already downloaded
        existing_files = os.listdir(book_dir)
        if existing_files:
            if verbose: print(f"✅ Already downloaded: {book_id}")
            dest_file_path = os.path.join(book_dir, existing_files[0])
            df.at[i, "file"] = dest_file_path
            continue


        if verbose: print(f"⬇️  Downloading book ID {book_id}...")

        for url, filename in get_possible_txt_urls(book_id):
            dest_file_path = os.path.join(book_dir, filename)
            if download_and_save(url, dest_file_path):
                print(f"   ✔️  Saved as {filename}")
                df.at[i, "file"] = dest_file_path
                break
        else:
            print(f"   ❌ Failed to download book ID {book_id}")

    df.to_csv(output_path, index=False)
    print(f"\n📄 Updated catalog saved to '{output_path}'")

process_catalog(verbose=False)


⚠️ Non Project Gutenburg book ID 'ng35' at row 34, skipping...

📄 Updated catalog saved to 'catalog_with_ids_and_files.csv'


In [32]:
!head catalog_with_ids_and_files.csv

id,title,author,year,description,file,tags,source,language
42671,Pride and Prejudice,Jane Austen,1813,A witty novel of manners and marriage among the British gentry.,books/42671/42671-0.txt,"literary-fiction/classic,romance",Project Gutenberg,English
15,"Moby-Dick; or, The Whale",Herman Melville,1851,An epic adventure of obsession and revenge on the high seas.,books/15/15-0.txt,"literary-fiction/classic,adventure,sea",Project Gutenberg,English
1661,The Adventures of Sherlock Holmes,Arthur Conan Doyle,1892,Twelve Sherlock Holmes stories that defined modern detective fiction.,books/1661/1661-0.txt,"mystery/classic,short-stories",Project Gutenberg,English
42324,"Frankenstein; or, The Modern Prometheus",Mary Shelley,1818,"The pioneering Gothic tale of science, ambition, and creation turned monster.",books/42324/42324-0.txt,"science-fiction/classic,gothic",Project Gutenberg,English
45839,Dracula,Bram Stoker,1897,The classic vampire novel that shaped modern horror.,books/45839/45839-0.txt,"h

In [34]:
!head -n 100 books/42671/42671-0.txt

*** START OF THE PROJECT GUTENBERG EBOOK 42671 ***

Note: Project Gutenberg also has an HTML version of this
      file which includes the original illustrations.
      See 42671-h.htm or 42671-h.zip:
      (http://www.gutenberg.org/files/42671/42671-h/42671-h.htm)
      or
      (http://www.gutenberg.org/files/42671/42671-h.zip)


      Images of the original pages are available through
      Internet Archive. See
      http://archive.org/stream/novelstextbasedo02austuoft#page/n23/mode/2up


Transcriber's note:

      Text enclosed by underscores is in italics (_italics_).

      A carat character is used to denote superscription. Multiple
      superscripted characters are enclosed by curly brackets
      (example: M^{rs}).





PRIDE AND PREJUDICE:

A Novel.

In Three Volumes.

By the Author of "Sense and Sensibility."

VOL. I.







London:
Printed for T. Egerton,
Military Library, Whitehall.
1813.




[Illustration: Morning Dress.

_Invented by M^{rs} Bell 26 Charlotte Street Bed

In [37]:
pd.read_csv('catalog_with_ids_and_files.csv').to_json('catalog.json', orient='records')

In [38]:
!mkdir library
!mv books library/
!mv catalog.json library/
!rm -f library.zip
!zip -9r library.zip library/

mkdir: cannot create directory ‘library’: File exists
mv: cannot stat 'books': No such file or directory
  adding: library/ (stored 0%)
  adding: library/books/ (stored 0%)
  adding: library/books/7872/ (stored 0%)
  adding: library/books/7869/ (stored 0%)
  adding: library/books/1661/ (stored 0%)
  adding: library/books/1661/1661-0.txt (deflated 63%)
  adding: library/books/2383/ (stored 0%)
  adding: library/books/2383/2383-0.txt (deflated 62%)
  adding: library/books/20270/ (stored 0%)
  adding: library/books/49487/ (stored 0%)
  adding: library/books/49487/49487-0.txt (deflated 64%)
  adding: library/books/26289/ (stored 0%)
  adding: library/books/15492/ (stored 0%)
  adding: library/books/15492/15492.txt (deflated 65%)
  adding: library/books/9288/ (stored 0%)
  adding: library/books/22382/ (stored 0%)
  adding: library/books/22382/22382-0.txt (deflated 62%)
  adding: library/books/35/ (stored 0%)
  adding: library/books/35/35-0.txt (deflated 62%)
  adding: library/books/21839/ (