In [5]:
import os
import requests
import re
import unicodedata

In [11]:
# Plain-text book URLs (Project Gutenberg cache) to download; entries are
# processed in list order by the download loop later in the notebook.
# NOTE(review): the titles noted beside each entry are inferred from the saved
# run output, which lists 12 files and so appears to predate pg98 being
# commented out — confirm before relying on them.
urls = [
    "https://www.gutenberg.org/cache/epub/730/pg730.txt",  # Oliver Twist
    "https://www.gutenberg.org/cache/epub/24022/pg24022.txt",  # A Christmas Carol
    "https://www.gutenberg.org/cache/epub/786/pg786.txt",  # Hard Times
    # "https://www.gutenberg.org/cache/epub/98/pg98.txt",
    "https://www.gutenberg.org/cache/epub/174/pg174.txt",  # The Picture of Dorian Gray
    "https://www.gutenberg.org/cache/epub/902/pg902.txt",  # The Happy Prince and Other Tales
    "https://www.gutenberg.org/cache/epub/14522/pg14522.txt",  # The Canterville Ghost
    # NOTE(review): this entry omits the "www." host prefix used by every other URL.
    "https://gutenberg.org/cache/epub/885/pg885.txt",  # An Ideal Husband
    "https://www.gutenberg.org/cache/epub/1513/pg1513.txt",  # Romeo and Juliet
    "https://www.gutenberg.org/cache/epub/23042/pg23042.txt",  # The Tempest
    "https://www.gutenberg.org/cache/epub/1533/pg1533.txt",  # Macbeth
    "https://www.gutenberg.org/cache/epub/27761/pg27761.txt"  # Hamlet, Prince of Denmark
    # Add more URLs here
]

In [12]:
# Function to sanitize folder and file names
def sanitize_name(name):
    """Return ``name`` reduced to characters safe for a folder or file name.

    Only alphanumeric characters, spaces, underscores and hyphens are kept;
    everything else (punctuation, path separators, control characters) is
    dropped. Remaining spaces are then turned into underscores.

    Args:
        name: Raw text such as a book title or an author name.

    Returns:
        The cleaned string; it may be empty if no character was allowed.
    """
    allowed_extras = (' ', '_', '-')
    kept = [ch for ch in name if ch.isalnum() or ch in allowed_extras]
    return ''.join(kept).replace(' ', '_')

# Function to extract title and author from raw text
def extract_title_author(text):
    """Pull the book title and author out of the raw downloaded text.

    The first ``Title: ...`` and the first ``Author: ...`` occurrences are
    used; each captured value runs to the end of its line and is passed
    through ``sanitize_name`` so it can serve as a path component.

    Args:
        text: Full text of the downloaded book.

    Returns:
        A ``(title, author)`` tuple of sanitized strings, or ``(None, None)``
        when either field is missing.
    """
    found_title = re.search(r"Title: (.+)", text)
    found_author = re.search(r"Author: (.+)", text)

    # Both fields are required; a partial match is treated as no match at all.
    if not (found_title and found_author):
        return None, None

    clean_title = sanitize_name(found_title.group(1))
    clean_author = sanitize_name(found_author.group(1))
    return clean_title, clean_author

# Function to download and save the text
def download_and_save_text(url, timeout=30):
    """Download one plain-text book and save it under its author's folder.

    The text is written as UTF-8 to ``book_datasets/<author>/<title>.txt``,
    with both names taken from ``extract_title_author``. The outcome is
    reported with ``print``. Request failures (connection errors, timeouts)
    are reported the same way instead of being raised, so one unreachable
    URL does not abort a batch of downloads.

    Args:
        url: Address of the plain-text book to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        None.

    Raises:
        OSError: If the target folder or file cannot be created or written.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failure: report it and let the caller move on.
        print(f"Failed to download URL: {url} ({exc})")
        return

    if response.status_code != 200:
        print(f"Failed to download URL: {url}")
        return

    raw_text = response.text
    title, author = extract_title_author(raw_text)

    # Empty strings (everything stripped by sanitizing) count as missing too.
    if not (title and author):
        print(f"Title and author not found for URL: {url}")
        return

    folder_name = f"book_datasets/{author}"
    os.makedirs(folder_name, exist_ok=True)

    file_path = f"{folder_name}/{title}.txt"
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(raw_text)

    print(f"Downloaded and saved: {file_path}")

# Download every book listed in `urls`, one request at a time and in list
# order; download_and_save_text prints a status line for each URL.
for url in urls:
    download_and_save_text(url)

Downloaded and saved: book_datasets/Charles_Dickens/Oliver_Twist.txt
Downloaded and saved: book_datasets/Charles_Dickens/A_Christmas_Carol.txt
Downloaded and saved: book_datasets/Charles_Dickens/Hard_Times.txt
Downloaded and saved: book_datasets/Charles_Dickens/A_Tale_of_Two_Cities.txt
Downloaded and saved: book_datasets/Oscar_Wilde/The_Picture_of_Dorian_Gray.txt
Downloaded and saved: book_datasets/Oscar_Wilde/The_Happy_Prince_and_Other_Tales.txt
Downloaded and saved: book_datasets/Oscar_Wilde/The_Canterville_Ghost.txt
Downloaded and saved: book_datasets/Oscar_Wilde/An_Ideal_Husband.txt
Downloaded and saved: book_datasets/William_Shakespeare/Romeo_and_Juliet.txt
Downloaded and saved: book_datasets/William_Shakespeare/The_Tempest.txt
Downloaded and saved: book_datasets/William_Shakespeare/Macbeth.txt
Downloaded and saved: book_datasets/William_Shakespeare/Hamlet_Prince_of_Denmark.txt
