In [1]:
from gutenbergpy.gutenbergcache import GutenbergCache

In [2]:
import os
import json
import pandas as pd
import numpy as np

In [3]:
# Create the local Gutenberg metadata cache. The recorded output below shows that
# the call reports "Cache already exists" when a cache is already present.
GutenbergCache.create()

Cache already exists


In [4]:
# Handle to the metadata cache; later cells run raw SQL through cache.native_query(...).
cache = GutenbergCache.get_cache()

In [22]:
# Count the authors that have at least 5 works with languageid = 1
# (this notebook treats languageid 1 as English -- TODO confirm against the cache schema).
# Placeholder author names such as 'Anonymous' / 'Various' are excluded.
# NOTE: the WHERE clause filters on columns of both LEFT JOINed tables, so rows
# without a matching book or author are dropped, exactly as an inner join would.
query = """
    SELECT COUNT(*) FROM (
        SELECT a.authorid
        FROM book_authors a
        LEFT JOIN books b
        ON b.id = a.bookid
        LEFT JOIN authors u
        ON u.id = a.authorid
        WHERE b.languageid = 1
        AND u.name NOT IN ('Anonyme', 'Anonymous, Anonymous', 'Anonymous', 'Various, Various', 'Various', 'Verschillende')
        GROUP BY a.authorid
        HAVING COUNT(a.bookid) >= 5
    ) AS author_counts;
"""

# The outer SELECT COUNT(*) yields a single row holding the count.
cursor = cache.native_query(query)
for row in cursor:
    print(row)

(4864,)


In [21]:
# List the names of authors that have exactly 5 works with languageid = 1
# (HAVING ... = 5), excluding placeholder names such as 'Anonymous' / 'Various'.
# NOTE(review): the recorded output contains several spellings that look like the
# same person (e.g. 'Stendhal' / 'Beyle, Marie Henri'), so name variants appear to
# be stored as separate author rows -- confirm before treating author ids as
# unique people.
query = """
        SELECT u.name
        FROM book_authors a
        LEFT JOIN books b
        ON b.id = a.bookid
        LEFT JOIN authors u
        ON u.id = a.authorid
        WHERE b.languageid = 1
        AND u.name NOT IN ('Anonyme', 'Anonymous, Anonymous', 'Anonymous', 'Various, Various', 'Various', 'Verschillende')
        GROUP BY a.authorid
        HAVING COUNT(a.bookid) = 5
"""
# Each row is a 1-tuple containing the author name.
cursor = cache.native_query(query)
for row in cursor:
    print(row)

('Marie-Henri Beyle',)
('Beyle, Marie Henri',)
('Stendhal',)
('Meek, Sterner St. Paul',)
('Meek, S. P. (Sterner St. Paul)',)
('U.S. Marine Corps',)
('United States. Department of the Navy. Marine Corps',)
('United States. Marine Corps',)
('Dunne, Finley Peter',)
('Lie, Jonas Lauritz Idemil',)
('Lie, Jonas',)
('Townsend, G. A. (George Alfred)',)
('Bouquet, Johnny',)
('Townsend, George Alfred',)
('Knox, John',)
('Salvianus, Gildas',)
('Reed, Mr.',)
('Baxter, R. (Richard)',)
('Baxter, Richard',)
('Radcliffe, Mrs.',)
('Ratcliffe, Mrs.',)
('Rattcliffe, Anne',)
('Radcliffe, Anne',)
('Radcliffe, Ann Ward',)
('Ernst, Paul Frederick',)
('Ernst, Paul',)
('Seaman, Elizabeth Cochrane',)
('Cochran, Elizabeth',)
('Cochrane, Elizabeth',)
('Bly, Nellie',)
('Brewer, E. Cobham (Ebenezer Cobham)',)
('Brewer, Ebenezer Cobham',)
('Le Sage, Alain Rene',)
('Sage, Alain René Le',)
('Lesage, Alain-René',)
('Sage, Alain-René le',)
('Le Sage, Alain René',)
('Bergson, Henri Louis',)
('Bergson, H. (Henri)',)
('Ber

In [6]:
# Draw 5 random (authorid, title) rows from authors that have at least 5 works
# with languageid = 1.
# NOTE: LIMIT 5 applies to the whole result set, so this returns 5 rows overall,
# not 5 works per author. The subquery also does not exclude the placeholder
# names ('Anonymous', 'Various', ...) that the other queries in this notebook filter out.
queryy = """
    SELECT b.authorid, t.name
    FROM titles t
    LEFT JOIN book_authors b 
    ON b.bookid = t.bookid
    WHERE b.authorid IN (
        SELECT ba.authorid
        FROM book_authors ba
        LEFT JOIN books b
        ON b.id = ba.bookid
        WHERE b.languageid = 1
        GROUP BY ba.authorid
        HAVING COUNT(ba.bookid) >= 5
    )
    ORDER BY RANDOM()
    LIMIT 5;
"""
# Each row is an (authorid, title) tuple.
cursor = cache.native_query(queryy)
for row in cursor:
    print(row)

(87, 'Adventures of Huckleberry Finn, Chapters 31 to 35')
(1396, 'The Flying Boys to the Rescue')
(627, 'The Price of Power\nBeing Chapters from the Secret History of the Imperial Court of Russia')
(3876, 'Henry Dunbar: A Novel')
(2116, 'A Girl of High Adventure')


In [26]:
# Get random qualified author ids

def get_n_random_authorids(n):
    """Pick ``n`` distinct author ids at random from the qualifying authors.

    An author qualifies when it has at least 5 rows in ``book_authors`` whose
    book has ``languageid = 1`` and its name is not one of the placeholder
    names listed in the query ('Anonymous', 'Various', ...).

    Args:
        n: Number of author ids to draw. The draw is without replacement.

    Returns:
        The result of ``np.random.choice`` over the qualifying ids.
    """
    eligible_sql = """
        SELECT a.authorid
        FROM book_authors a
        LEFT JOIN books b
        ON b.id = a.bookid
        LEFT JOIN authors u
        ON u.id = a.authorid
        WHERE b.languageid = 1
        AND u.name NOT IN ('Anonyme', 'Anonymous, Anonymous', 'Anonymous', 'Various, Various', 'Various', 'Verschillende')
        GROUP BY a.authorid
        HAVING COUNT(a.bookid) >= 5
    """

    # Each result row is a 1-tuple; keep only the id itself.
    eligible_ids = []
    for record in cache.native_query(eligible_sql):
        eligible_ids.append(record[0])

    return np.random.choice(eligible_ids, size=n, replace=False)


In [27]:
# Draw one random qualifying author id; the recorded output shows a NumPy array.
authorids = get_n_random_authorids(1)
authorids

array([12083])

In [28]:
# Get 5 random works for a given qualified authorid

def get_5_random_works(authorid):
    """Return up to five distinct, randomly chosen Gutenberg book ids for an author.

    Only books with ``languageid = 1`` are considered, which is the same
    criterion the notebook uses to decide that an author qualifies (the
    notebook treats language id 1 as English -- TODO confirm against the
    cache schema).

    Args:
        authorid: Author id as stored in ``book_authors.authorid``. Anything
            accepted by ``int()`` works (plain ints, NumPy integers, ...).

    Returns:
        A list with at most five ``books.gutenbergbookid`` values, with no
        repeats.

    Raises:
        ValueError / TypeError: If ``authorid`` cannot be converted to an int.
    """
    # The id is inlined into the SQL text, so force it to be a plain integer
    # first; anything that is not a number fails here instead of reaching the
    # database.
    author_key = int(authorid)

    # Sample from books directly instead of from titles: a book with several
    # title rows would otherwise appear several times and could be drawn more
    # than once. GROUP BY collapses any remaining duplicates before the
    # random ordering and LIMIT are applied.
    query2 = """
        SELECT s.gutenbergbookid
        FROM book_authors b
        JOIN books s
        ON s.id = b.bookid
        WHERE b.authorid = {}
        AND s.languageid = 1
        GROUP BY s.gutenbergbookid
        ORDER BY RANDOM()
        LIMIT 5;
    """.format(author_key)

    cursor = cache.native_query(query2)
    return [row[0] for row in cursor]

In [45]:
# Print the sampled Gutenberg book ids for each selected author.
for authorid in authorids:
    gutenbergbookids = get_5_random_works(authorid)
    print(gutenbergbookids)

[51898, 39519, 50855, 49987, 44254]


In [32]:
# Randomly sample up to 10 sentences (with their 1-based sentence index) from each work, i.e. up to 50 sentences per author

import random
import gutenbergpy
import gutenbergpy.textget
import nltk # To split the book by sentences



def get_text(id):
    """Fetch a Gutenberg text by id and return it with and without headers.

    Args:
        id: Book id passed straight to ``gutenbergpy.textget.get_text_by_id``.

    Returns:
        A ``(clean_book, raw_book)`` tuple: the result of ``strip_headers``
        applied to the raw text, followed by the raw text itself.
    """
    textget = gutenbergpy.textget
    raw_book = textget.get_text_by_id(id)
    return textget.strip_headers(raw_book), raw_book

def trim_book(clean_book, trim_pct=0.1):
    """Flatten a book to one line and cut ``trim_pct`` of it off each end.

    The input is stripped, decoded as UTF-8 and every newline is replaced by
    a single space. The cut points are computed on the character count of
    that flattened text and floored to whole characters.

    Args:
        clean_book: Encoded book text (it must support ``.strip()`` and
            ``.decode('utf-8')``).
        trim_pct: Fraction of the text to drop from the start and from the end.

    Returns:
        The middle part of the flattened text as a ``str``.
    """
    decoded = clean_book.strip().decode('utf-8')
    flattened = ' '.join(decoded.split('\n'))
    total_chars = len(flattened)

    # Floor both boundaries so the slice indices are whole numbers.
    first = int(total_chars * trim_pct // 1)
    last = int(total_chars * (1 - trim_pct) // 1)
    return flattened[first:last]

# def get_sample_lines(trimmed):
#     non_empty_lines = [line for line in trimmed if line.strip()]
#     n = len(non_empty_lines)
#     sample_lines = random.sample(range(n), k=min(n, 10))
#     sample = [(i+1, non_empty_lines[i]) for i in sample_lines]
#     return sample

def get_sample_sentences(trimmed):
    """Randomly pick up to 10 sentences from a text.

    The text is split with ``nltk.sent_tokenize``; sentences that are empty
    after stripping whitespace are discarded before sampling.

    Args:
        trimmed: Text to split into sentences.

    Returns:
        A list of ``(position, sentence)`` tuples in random order, where
        ``position`` is the 1-based index of the sentence among the
        non-empty sentences.
    """
    kept = []
    for candidate in nltk.sent_tokenize(trimmed):
        if candidate.strip():
            kept.append(candidate)

    how_many = min(len(kept), 10)
    picks = random.sample(range(len(kept)), k=how_many)
    return [(pos + 1, kept[pos]) for pos in picks]

# def get_sample_sentences(trimmed, authorid, bookid):
#     sentences = nltk.sent_tokenize(trimmed)
#     non_empty_sentences = [s for s in sentences if s.strip()]
#     n = len(non_empty_sentences)
#     sample_sentences = random.sample(range(n), k=min(n, 10))
#     sample = [{'authorid': authorid, 'bookid': bookid, 'line_number': i+1, 'text': non_empty_sentences[i]} for i in sample_sentences]
#     return sample



# for authorid in authorids:
#     gutenbergbookids = get_5_random_works(authorid)
#     for id in gutenbergbookids:
#         clean, raw = get_text(id)
#         trimmed = trim_book(clean)
#         sample = get_sample_lines(trimmed)
#         print(f"Sample of 10 lines from work {id}:")
#         print(sample)

# for authorid in authorids:
#     gutenbergbookids = get_5_random_works(authorid)
#     for id in gutenbergbookids:
#         clean, raw = get_text(id)
#         trimmed = trim_book(clean)
#         sample = get_sample_sentences(trimmed)
#         print(f"Sample of 10 non-empty sentences from gutenberg book id {id}:")
#         print(sample)

In [33]:
# Combine and consolidate

def get_consolidated_results(n):
    """Sample sentences for ``n`` random authors and collect them in a DataFrame.

    For every author returned by ``get_n_random_authorids(n)`` the works
    returned by ``get_5_random_works`` are fetched, trimmed with
    ``trim_pct=0.1`` and passed to ``get_sample_sentences``.

    Args:
        n: Number of authors to sample.

    Returns:
        A DataFrame with one row per sampled sentence and the columns
        ``authorid``, ``gutenbergbookid``, ``line_number`` and ``text``.
    """
    def _sentence_records(author_pool):
        # Yield one record per sampled sentence, author by author, book by book.
        for authorid in author_pool:
            for gutenbergbookid in get_5_random_works(authorid):
                clean_book, _raw_book = get_text(gutenbergbookid)
                middle = trim_book(clean_book, trim_pct=0.1)
                for line_number, sentence in get_sample_sentences(middle):
                    yield {
                        'authorid': authorid,
                        'gutenbergbookid': gutenbergbookid,
                        'line_number': line_number,
                        'text': sentence,
                    }

    chosen_authors = get_n_random_authorids(n)
    return pd.DataFrame(list(_sentence_records(chosen_authors)))

In [34]:
# Build the sample DataFrame for one randomly chosen author and display it.
df = get_consolidated_results(1)
df

Unnamed: 0,authorid,gutenbergbookid,line_number,text
0,1148,4941,366,Back of the clouds there was a pretty good-siz...
1,1148,4941,346,"It was frittering snow, and the prospect of a ..."
2,1148,4941,1460,"Perhaps it's just as you say, and this is some..."
3,1148,4941,1312,Maurice shook his head in the negative.
4,1148,4941,1176,"You'd like to get rid of our responsibility, a..."
5,1148,4941,606,"He waited just three seconds, until Maurice, f..."
6,1148,4941,853,what are you after now?
7,1148,4941,1705,"exclaimed Maurice, pulling up."
8,1148,4941,510,"I've got a notion there are a lot around here,..."
9,1148,4941,191,"That little old gun is as good as ever, I do b..."
