In [1]:
from gutenbergpy.gutenbergcache import GutenbergCache, GutenbergCacheTypes
import os
import json
import pandas as pd
import numpy as np
import pickle as pkl
import random
import gutenbergpy
import gutenbergpy.textget


In [2]:
# from gutenbergpy.gutenbergcache import GutenbergCache
# #for sqlite
# GutenbergCache.create(refresh=True, download=True, unpack=True, parse=True, cache=True, deleteTemp=True)

## Exploring the Data Distribution of Project Gutenberg
Our first step in developing a sampling approach is to determine what metadata is available and how best to create a rich, diverse sample for language modeling. We start by initializing the cache and looking at the schema:

In [3]:
cache  = GutenbergCache.get_cache()
# from IPython.display import Image
# Image(filename='sqlitescheme.png')

#### Types of Media

We want our sample to consist solely of text based media; however, the type field is not well maintained for the majority of the works on project gutenberg.

In [4]:
types = [t for t in cache.native_query("SELECT DISTINCT id, name FROM types")]
types

[(1, 'Sound'),
 (2, 'Image'),
 (3, 'Dataset'),
 (4, 'Collection'),
 (5, 'Text'),
 (6, 'MovingImage'),
 (7, 'StillImage')]

#### Gathering Bookshelves

Books on Project Gutenberg are sorted into collections called bookshelves; there are Main Categories each with their own Bookshelf ID; we are interested in English Language (Language.id = 1) text (type.id = 5) texts. However, it also seems that bookshelves themselves are not well populated in the metadata either, meaning a new approach is warranted.

In [5]:
shelves = [sh for sh in cache.native_query(
        "SELECT * FROM (" +
        "SELECT bs.id, bs.name, COUNT(b.id) as BOOK_COUNT FROM " +
        "(SELECT DISTINCT id, name FROM bookshelves) as bs " +    
        "RIGHT JOIN " + 
        "(SELECT id, bookshelveid, languageid, typeid FROM books " + 
        "WHERE languageid=1) as b " +
        "ON bs.id=b.bookshelveid) as bbs " +
        "GROUP BY bbs.id, bbs.name " + 
        "ORDER BY BOOK_COUNT DESC;")]

shelves


[(1, 'The Great Round World And What Is Going On In It', 56694)]

#### Subjects vs. Book Subjects In Project Gutenberg

In [6]:

# Retrieves all subjects

subjects = [s for s in cache.native_query(
    "SELECT bs.subjectid, s.name, count(bs.bookid) AS book_count FROM book_subjects AS bs \
    LEFT JOIN subjects AS s \
    ON s.id = bs.subjectid \
    GROUP BY s.id, s.name \
    ORDER BY book_count DESC \
    LIMIT 25;")]

# See the script "subjectid.sql" for a full listing of the actual subject names
subjects

[(44, 'PS', 11113),
 (10, 'PR', 10009),
 (13, 'PZ', 7282),
 (16, 'PQ', 4843),
 (28, 'PT', 2956),
 (80, 'Science fiction', 2938),
 (14, 'Short stories', 2748),
 (2, 'AP', 2584),
 (71, 'Fiction', 1979),
 (46, 'DA', 1663),
 (105, 'Adventure stories', 1475),
 (146, 'PH', 1379),
 (18, 'PN', 992),
 (79, 'DC', 961),
 (308, 'Historical fiction', 947),
 (91, 'DS', 943),
 (30, 'Conduct of life -- Juvenile fiction', 876),
 (158, 'Love stories', 858),
 (291, 'D501', 827),
 (192, 'Detective and mystery stories', 818),
 (51, 'Man-woman relationships -- Fiction', 809),
 (197, 'BX', 745),
 (29, 'Poetry', 681),
 (68, 'QH', 670),
 (242, 'QL', 670)]

#### Querying a Subject

Some of the subject titles aren't particularly informative; in order to get a better picture of what each subject is, we can query them individually to retrieve top downloads for each subject:  

In [7]:
# Top N Downloads for subject ID:

top25 = [s for s in cache.native_query(
    "SELECT bs.subjectid, b.name, b.numdownloads FROM \
    (SELECT * FROM books \
     LEFT JOIN titles AS T \
     ON T.bookid = books.id) AS b \
    LEFT JOIN book_subjects AS bs \
    ON b.id = bs.bookid and bs.subjectid=44 \
    ORDER BY b.numdownloads DESC \
    LIMIT 25;")]

# See the script "subjectid.sql" for a full listing of the actual subject names

top25

[(None, 'Romeo and Juliet', 197127),
 (None, 'A Room with a View', 167726),
 (None, 'Middlemarch', 165079),
 (44, 'Little Women; Or, Meg, Jo, Beth, and Amy', 152480),
 (44, 'Moby Dick; Or, The Whale', 148386),
 (None, 'The Enchanted April', 147692),
 (44, 'The Blue Castle: a novel', 147236),
 (None, 'Cranford', 143503),
 (None, 'The Complete Works of William Shakespeare', 141886),
 (None, 'The Adventures of Ferdinand Count Fathom — Complete', 141299),
 (None, 'The Expedition of Humphry Clinker', 139335),
 (None, 'The Adventures of Roderick Random', 137522),
 (None, 'History of Tom Jones, a Foundling', 131062),
 (None, 'Twenty Years After', 128560),
 (None, 'Vingt ans après. English', 128560),
 (None, 'My Life — Volume 1', 128316),
 (None, 'Frankenstein; Or, The Modern Prometheus', 75145),
 (None, 'Pride and Prejudice', 68103),
 (None, 'Alice in Wonderland', 31948),
 (None, "Alice's Adventures in Wonderland", 31948),
 (44, 'The Great Gatsby', 25354),
 (44, 'The Yellow Wallpaper', 24783)

#### Querying an Author

In our analysis, we intend to have the model attempt to learn to detect different writers for based on their style of writing; in order to do this, we need a good sample of writers for each subject.

In [8]:
topAuthors = [s for s in cache.native_query(
    "SELECT a.authorid, a.name, count(a.bookid) as book_count from \
    (SELECT * from authors \
    LEFT JOIN book_authors \
    ON id = authorid) as a \
    LEFT JOIN \
    (SELECT * FROM books \
    WHERE languageid = 1) as b \
    ON b.id = a.bookid \
    GROUP BY a.id, a.name \
    HAVING book_count >= 5 \
    ORDER BY book_count DESC;")]

# We have an 80 / 20 rule for sampling; at least 10 passages per author, and 
# Note this is the authors with the most books, not the authors with most downloads
# Furthermore, there are many repeats in the list of authors; Shakespeare appears 3 times, Jules Verne 4
pd.DataFrame(topAuthors).to_csv("top_authors.csv")

# TODO:
# 1 ) Write Sampling Scheme At most 1 hour
# np.random.choice() #What is n? How many authors?

# 2 ) SQL CODE to then generate new 

# 3 ) Combine and consolidate text: Half a day
# Topics
# Authors
# Book Title
# Line Numbers
# And Text

# 4 ) Consolidate and split to train / test

# Bonus: Find better metadata

Notes:
 * No anonymous / (Various) / etc.
 * Key identifiers such as character names e.g. Captain Ahab

Filter for English Authors with enough books to sample
random subset of 5 works
10 samples from each work, 50 lines each
 * Each book has at min 500 lines
 * Every author has 2500 Lines

How many authors?

Issues:
Herman Melville's Moby Dick

Post Sampling:
* Text processing (removing punctuation)
    * Keep punctuation in one dataset -> We want punctuation for tet generation?
    * Strip from the other (learning language, classification, etc.) -> reduce number of tokens

.pkl (pickle)


In [9]:
cache  = GutenbergCache.get_cache()
# Get all English authors with at least 5 books
engAuthors = [s for s in cache.native_query("SELECT a.id, a.name\
    FROM authors a\
    JOIN book_authors ba ON a.id = ba.authorid\
    JOIN books b ON ba.bookid = b.id\
    WHERE b.languageid = 1 AND a.name NOT IN ('Anonymous', 'Various')\
    GROUP BY a.id, a.name \
    HAVING COUNT(b.id) >= 5\
    ORDER BY a.name;")]

engAuthors

[(24076, 'A Square'),
 (3758, 'A. L. O. E.'),
 (14741, 'A.E.'),
 (3756, 'A.L.O.E.'),
 (14738, 'AE'),
 (10528, 'Abbot, Willis J. (Willis John)'),
 (10525, 'Abbot, Willis John'),
 (24077, 'Abbott, E. A. (Edwin Abbott)'),
 (24078, 'Abbott, Edwin A. (Edwin Abbott)'),
 (24079, 'Abbott, Edwin Abbott'),
 (3185, 'Abbott, Eleanor Hallowell'),
 (3301, 'Abbott, Henry'),
 (5197, 'Abbott, Jacob'),
 (14425, 'Abbott, Jane'),
 (5201, 'Abbott, John S. C. (John Stevens Cabot)'),
 (5200, 'Abbott, John Stevens Cabot'),
 (10527, 'Abbott, Willis J.'),
 (10526, 'Abbott, Willis John'),
 (12239, 'Abdul-Baha'),
 (7639, 'Abernathy, Robert'),
 (7638, 'Abernathy, Robert Harwood'),
 (23085, 'Accum, Frederick'),
 (23084, 'Accum, Fredrick'),
 (23086, 'Accum, Friedrich Christian'),
 (4399, 'Acton, John Emerich Edward Dalberg Acton, Baron'),
 (4398, 'Acton, Lord'),
 (5423, 'Adair, Cecil'),
 (15674, 'Adams, Andy'),
 (20026, 'Adams, C. F. (Charles Francis)'),
 (1402, "Adams, Capt. 'Bruin'"),
 (20027, 'Adams, Charles F. (

In [10]:
# Count the number of authors in engAuthors
num_authors = len(engAuthors)
# Print the result
print(f"There are {num_authors} English authors >=5 works in the dataset.")

There are 4868 English authors >=5 works in the dataset.


In [11]:
# def get_works(engAuthors):
#     query2 = """
#         SELECT b.authorid, a.name, s.gutenbergbookid, t.name
#         FROM titles t
#
#         LEFT JOIN books s
#         ON s.id = t.bookid
#         LEFT JOIN book_authors b
#         ON b.bookid = s.id
#         LEFT JOIN authors a
#         ON b.authorid = a.id
#
#
#         WHERE b.authorid = {}
#     """
#
#     cursor = cache.native_query(query2)
#     gutenbergbookids = []
#     for row in cursor:
#         gutenbergbookids.append(row[2])
#     return gutenbergbookids

In [12]:
# for authorid in engAuthors:
#     gutenbergbooks = get_works(authorid)
#     print(gutenbergbooks)

In [13]:
engAuthor_ids = [str(a[0]) for a in engAuthors]
query = """
    SELECT a.id AS author_id, a.name AS author_name, b.id AS book_id, t.name AS book_title
    FROM authors a
    JOIN book_authors ba ON a.id = ba.authorid
    JOIN books b ON ba.bookid = b.id
    JOIN titles t ON t.bookid = b.id
    WHERE b.languageid = 1 AND a.name NOT IN ('Anonymous', 'Various') AND a.id IN ({})
    ORDER BY a.name, b.id
""".format(','.join(engAuthor_ids))

results = cache.native_query(query)
df = pd.DataFrame(results, columns=["author_id", "author_name", "book_id", "book_title"])
df

Unnamed: 0,author_id,author_name,book_id,book_title
0,24076,A Square,24287,How to Write Clearly: Rules and Exercises on E...
1,24076,A Square,35549,Silanus the Christian
2,24076,A Square,43047,Flatland: A Romance of Many Dimensions
3,24076,A Square,52765,Onesimus: Memoirs of a Disciple of St. Paul
4,24076,A Square,58073,Flatland: A Romance of Many Dimensions
...,...,...,...,...
89604,1427,孫子,23449,Sunzi bing fa. English
89605,1427,孫子,23449,The Art of War
89606,1427,孫子,33602,Sunzi bing fa. English
89607,1427,孫子,33602,The Art of War


In [14]:
num_books_per_author = 5
random_seed = 42

df_sampled = df.groupby("author_id", group_keys=False).apply(lambda x: x.sample(n=min(len(x), num_books_per_author), random_state=random_seed))
df_sampled

Unnamed: 0,author_id,author_name,book_id,book_title
80263,1,Verschillende,6193,"The Southern Literary Messenger, Vol. II., No...."
80686,1,Verschillende,14077,"Harper's Young People, January 17, 1882\nAn Il..."
80920,1,Verschillende,18579,"The American Missionary — Volume 39, No. 07, J..."
82528,1,Verschillende,51235,"Punch, Or The London Charivari, Volume 102, Ma..."
82890,1,Verschillende,58800,"Encyclopaedia Britannica, 11th Edition, 'Cinci..."
...,...,...,...,...
66640,45848,"Sherman, Francis",63980,In Memorabilia Mortis
66641,45848,"Sherman, Francis",64864,A Canadian Calendar: 12 Lyrics
66645,45848,"Sherman, Francis",68572,A Prelude
66642,45848,"Sherman, Francis",64864,A Canadian Calendar: Twelve Lyrics


In [15]:
book_texts = {}  # create an empty dictionary to store the book texts
for index, row in df_sampled.iterrows():
    book_id = row["book_id"]
    book_text = gutenbergpy.textget.get_text_by_id(book_id)
    book_texts[book_id] = book_text


TypeError: exceptions must derive from BaseException

In [None]:
id = df_sampled["book_id"].tolist()
id

In [None]:


import random
import gutenbergpy
import gutenbergpy.textget
import nltk # To split the book by sentences



import pandas as pd
import gutenbergpy
import gutenbergpy.textget

id = df_sampled["book_id"].tolist()

def get_text(id):
    raw_book = gutenbergpy.textget.get_text_by_id(id)
    clean_book = gutenbergpy.textget.strip_headers(raw_book)
    return clean_book, raw_book

# Create an empty DataFrame with two columns: "book_id" and "text"

# Loop over book IDs and append their text data to the DataFrame


# def trim_book(clean_book, trim_pct=0.1):
#     lines = clean_book.strip().decode('utf-8').replace('\n', ' ')
#     n = len(lines)
#     start = n * trim_pct // 1
#     end = n * (1 - trim_pct) // 1
#     return lines[int(start):int(end)]
#
#
# def get_sample_sentences(trimmed):
#     sentences = nltk.sent_tokenize(trimmed)
#     non_empty_sentences = [s for s in sentences if s.strip()]
#     n = len(non_empty_sentences)
#     sample_sentences = random.sample(range(n), k=min(n, 10))
#     sample = [(i+1, non_empty_sentences[i]) for i in sample_sentences]
#     return sample



In [16]:
for i in id:
    gutenbergBooks = get_text(i)
    print(gutenbergBooks)

TypeError: 'builtin_function_or_method' object is not iterable

In [18]:

# # Get all English authors with at least 5 books
# engAuthors = [s for s in cache.native_query("SELECT a.id, a.name\
#     FROM authors a\
#     JOIN book_authors ba ON a.id = ba.authorid\
#     JOIN books b ON ba.bookid = b.id\
#     WHERE b.languageid = 1\
#     GROUP BY a.id, a.name \
#     HAVING COUNT(b.id) >= 3\
#     ORDER BY a.name;")]
#
# engAuthors
# num_authors = len(engAuthors)
# num_authors

In [19]:
# # For each engauthor, get a random subset of 5 works
# for author in engAuthors:
#     # Get the author name
#     author_name = author[0]
#
#     # Get all works by the author
#     works = [s for s in cache.native_query(f"SELECT a.name\
#     FROM authors a\
#     JOIN book_authors ba ON a.id = ba.authorid\
#     JOIN books b ON ba.bookid = b.id\
#     WHERE b.languageid = 1 AND a.name NOT IN ('Anonymous', 'Various')\
#     GROUP BY a.name\
#     HAVING COUNT(DISTINCT b.id) >= 5")]
#
#     # Get a random subset of 5 works
#     subset = random.sample(works, min(len(works), 5))
#
#     # Print the result
#     print(works)
#     print (subset)
#     print(f"{author_name}: {subset}")

In [20]:
# import random
# from gutenbergpy.gutenbergcache import GutenbergCache
#
# cache = GutenbergCache.get_cache()
#
# # Get English authors with at least 5 distinct works
# eng_authors = [author[0] for author in cache.native_query("SELECT a.name\
# FROM authors a\
# JOIN book_authors ba ON a.id = ba.authorid\
# JOIN books b ON ba.bookid = b.id\
# WHERE b.languageid = 1 AND a.name NOT IN ('Anonymous', 'Various')\
# GROUP BY a.name\
# HAVING COUNT(DISTINCT b.id) >= 5")]
#
# # Get a random subset of 5 works for each author
# author_works = {}
# for author in eng_authors:
#     works = [work[0] for work in cache.native_query(f"""
#     SELECT t.name
#     FROM titles t
#     JOIN books b ON t.bookid = b.id
#     JOIN book_authors ba ON b.id = ba.bookid
#     JOIN authors a ON a.id = ba.authorid
#     WHERE a.name = '{author}' AND b.languageid = 1
#     """)]
#     random_works = random.sample(works, 5)
#     author_works[author] = random_works
#
#
# # Print the random works for each author
# for author, works in author_works.items():
#     print(f'{author}:')
#     for work in works:
#         print(f'  - {work}')
#     print()
