In [3]:
from gutenbergpy.gutenbergcache import GutenbergCache, GutenbergCacheTypes
import os
import json
import pandas as pd
import numpy as np
import pickle as pkl

# Load the notebook configuration from config.json in the working directory.
with open('config.json', 'r') as f:
    config = json.load(f)

# Utils and Corpus are imported while the working directory is temporarily
# switched to config['REPODIR'] (presumably so the project modules resolve
# from the repository checkout -- confirm).  The try/finally guarantees the
# original working directory is restored even if one of the imports fails;
# previously a failed import left the kernel stranded in REPODIR.
cwd = os.getcwd()
os.chdir(config['REPODIR'])
try:
    import Utils as U
    from Corpus import Corpus
finally:
    os.chdir(cwd)

from collections import Counter, defaultdict
import itertools


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# TODO: Extract Gutenberg book IDs for English-language books first, THEN query the dataset

In [2]:
U.load_file('config.json', 'json')

{'REPODIR': '/Users/stephentoner/Desktop/Winter 2023/SI 699/SI699Project',
 'DATADIR': 'data/',
 'OUTPUT': 'output/'}

In [6]:
# Build the data directory with os.path.join rather than gluing the parts
# together with a hand-written '//' separator.
# NOTE(review): the output recorded under this cell is a UnicodeDecodeError on
# byte 0x80 at position 0, which suggests the pickle was opened in text mode --
# confirm that U.load_file opens 'pkl' files in binary mode.
corpus = U.load_file('corpus.pkl', 'pkl', os.path.join(config['REPODIR'], config['DATADIR']))

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte

In [6]:
# Fix the seed of NumPy's global random generator.
np.random.seed(699)

# Sampling constants.
# NOTE(review): none of these are used in the visible cells; the meanings
# below are inferred from the names only -- confirm against the sampling code.
CHUNK_LENGTH = 50       # presumably the size of one text chunk
CHUNKS_PER_WORK = 10    # presumably chunks drawn from each work
WORKS_PER_AUTHOR = 5    # presumably works drawn from each author
NUM_AUTHORS = 50        # presumably authors included in the sample

TypeError: load_file() missing 2 required positional arguments: 'filedir' and 'format'

## Exploring the Data Distribution of Project Gutenberg
Our first step in developing a sampling approach is to determine what metadata is available and how best to create a rich, diverse sample for language modeling. We start by loading the Project Gutenberg catalog and looking at its schema:

In [7]:
# Read the Project Gutenberg catalog CSV from the working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, so the file
# presumably carries a previously saved index column -- confirm whether it
# should be dropped or used as the index.
df = pd.read_csv("pg_catalog.csv")
# Preview the first rows to see which metadata columns are available.
df.head()

  df = pd.read_csv("pg_catalog.csv")


Unnamed: 0,Text#,Type,Issued,Title,Language,Authors,Subjects,LoCC,Bookshelves
0,1,Text,12/1/71,The Declaration of Independence of the United ...,en,"Jefferson, Thomas, 1743-1826","United States -- History -- Revolution, 1775-1...",E201; JK,Politics; American Revolutionary War; United S...
1,2,Text,12/1/72,The United States Bill of Rights\r\nThe Ten Or...,en,United States,Civil rights -- United States -- Sources; Unit...,JK; KF,Politics; American Revolutionary War; United S...
2,3,Text,11/1/73,John F. Kennedy's Inaugural Address,en,"Kennedy, John F. (John Fitzgerald), 1917-1963",United States -- Foreign relations -- 1961-196...,E838,
3,4,Text,11/1/73,Lincoln's Gettysburg Address\r\nGiven November...,en,"Lincoln, Abraham, 1809-1865",Consecration of cemeteries -- Pennsylvania -- ...,E456,US Civil War
4,5,Text,12/1/75,The United States Constitution,en,United States,United States -- Politics and government -- 17...,JK; KF,United States; Politics; American Revolutionar...


In [8]:
# Restrict the catalog to rows whose Type is "Text" and whose Language is
# exactly "en", using a single combined boolean mask.
catalog = df[(df["Type"] == "Text") & (df["Language"] == "en")]

#### Filter for Authors

In [9]:
# Per-author non-null counts of every column, ordered ascending by "Text#".
# NOTE(review): this groups the unfiltered `df`, not the English-text
# `catalog` built above -- confirm that is intended.
author_counts_by_column = df.groupby("Authors").count()
author_set = author_counts_by_column.sort_values(by="Text#")
# Display the "Text#" counts with the most prolific authors first.
author_set["Text#"].iloc[::-1]

Authors
Various                                                                                                          2690
Anonymous                                                                                                         827
Lytton, Edward Bulwer Lytton, Baron, 1803-1873                                                                    217
Twain, Mark, 1835-1910                                                                                            191
Shakespeare, William, 1564-1616                                                                                   169
                                                                                                                 ... 
Menzies, Sutherland, active 1840-1883; Wilton, Mary Margaret Stanley Egerton, Countess of, 1801-1858 [Editor]       1
Menéndez Pidal, Ramón, 1869-1968 [Compiler]                                                                         1
Menéndez y Pelayo, Marcelino, 1856-1912         

#### Subjects

In [26]:
# One list per distinct "Subjects" value in the catalog: the value is
# stringified, split on ';', and each piece is whitespace-trimmed.
subjects = [
    [piece.strip() for piece in str(raw_value).split(sep=";")]
    for raw_value in pd.unique(catalog['Subjects'])
]
subjects

[['United States -- History -- Revolution, 1775-1783 -- Sources',
  'United States. Declaration of Independence'],
 ['Civil rights -- United States -- Sources',
  'United States. Constitution. 1st-10th Amendments'],
 ['United States -- Foreign relations -- 1961-1963',
  'Presidents -- United States -- Inaugural addresses'],
 ['Consecration of cemeteries -- Pennsylvania -- Gettysburg',
  "Soldiers' National Cemetery (Gettysburg, Pa.)",
  'Lincoln, Abraham, 1809-1865. Gettysburg address'],
 ['United States -- Politics and government -- 1783-1789 -- Sources',
  'United States. Constitution'],
 ['Speeches, addresses, etc., American',
  'United States -- Politics and government -- 1775-1783 -- Sources',
  'Virginia -- Politics and government -- 1775-1783 -- Sources'],
 ['Massachusetts -- History -- New Plymouth, 1620-1691 -- Sources',
  'Pilgrims (New Plymouth Colony)',
  'Mayflower Compact (1620)'],
 ['United States -- Politics and government -- 1861-1865',
  'Presidents -- United States -

In [44]:
subject_set = set()        # every full subject string seen
major_subjects = set()     # see the review note inside the loop
subject_counts = Counter() # frequency of each ' -- '-separated subject term

for entry_subjects in subjects:
    subject_set.update(entry_subjects)
    # Break each full subject into its ' -- '-separated terms.
    split_subjects = [full_subject.split(' -- ') for full_subject in entry_subjects]
    # NOTE(review): this adds every term of the LAST subject of the entry
    # only; the original comment described the outermost term as the "meta
    # subject", so confirm whether the first term of each subject was meant.
    major_subjects.update(split_subjects[-1])
    # Count every term of every subject in the entry.
    subject_counts.update(
        term.strip() for terms in split_subjects for term in terms
    )

In [None]:
def extract_subjects(entry):
    """Split a ';'-separated catalog field into a list of trimmed names.

    Args:
        entry: Raw field value. Normally a string such as
            "History -- Sources; Poetry"; may also be a missing value
            (None or a float NaN). Any other object is stringified first.

    Returns:
        list[str]: The whitespace-trimmed, non-empty pieces in their original
        order. Missing values yield an empty list instead of the literal
        text 'nan' / 'None'.
    """
    # NaN is the only float that is not equal to itself.
    if entry is None or (isinstance(entry, float) and entry != entry):
        return []
    trimmed = (piece.strip() for piece in str(entry).split(sep=";"))
    # Drop empty pieces produced by doubled or trailing separators.
    return [piece for piece in trimmed if piece]

In [45]:
# The ten most frequent subject terms, most common first (counts discarded).
target_subjects = [term for term, _count in subject_counts.most_common(10)]

In [None]:
subject_set
# major_subjects

{'Nuclear energy -- Popular works',
 'Aztecs',
 'American poetry -- African American authors',
 'Legends -- France -- Translations into English',
 'Paranormal fiction, French -- Translations into English',
 'Faust, -approximately 1540 -- Fiction',
 'Social status -- Fiction',
 'Coleridge, Samuel Taylor, 1772-1834 -- Interviews',
 'Selkirk Range -- Description and travel',
 'Bahai Faith -- Doctrines',
 'Browning, Robert, 1812-1889',
 'Rosicrucians',
 'Success -- Fiction',
 'Clocks and watches -- History',
 'Pins and needles -- Juvenile fiction',
 'Copyright -- United States -- History',
 'Great Britain. Royal Navy -- History -- 16th century',
 'Drinking customs -- England',
 'Dream interpretation',
 'Gaul -- History -- 58 B.C.-511 A.D. -- Fiction',
 'Waterloo, Battle of, Waterloo, Belgium, 1815 -- Poetry',
 'Adams, Abigail, 1744-1818 -- Correspondence',
 'Dye plants',
 'El Morro National Monument (N.M.) -- Guidebooks',
 'Natural history -- Dominica',
 'Hotels -- Fiction',
 'Bible. Colos

#### Bookshelves

In [None]:
# One list per distinct "Bookshelves" value in the catalog: the value is
# stringified, split on ';', and each shelf name is whitespace-trimmed.
shelves = [
    [name.strip() for name in str(raw_value).split(sep=";")]
    for raw_value in pd.unique(catalog['Bookshelves'])
]

# Count how often each shelf name occurs across those lists.
shelf_counts = Counter(itertools.chain.from_iterable(shelves))
# The distinct shelf names are exactly the counter's keys.
shelf_set = set(shelf_counts)

len(shelf_set)

248

In [None]:
shelf_counts.most_common(50)

[('Best Books Ever Listings', 51),
 ('Movie Books', 43),
 ("Banned Books from Anne Haight's list", 41),
 ('Harvard Classics', 40),
 ('Bestsellers, American, 1895-1923', 31),
 ("Children's Literature", 30),
 ('Technology', 29),
 ('Historical Fiction', 26),
 ('World War I', 25),
 ("Children's History", 25),
 ("Children's Instructional Books", 24),
 ('US Civil War', 22),
 ('Adventure', 22),
 ("Children's Fiction", 21),
 ('Animal', 21),
 ("Children's Picture Books", 19),
 ('Native America', 18),
 ('Science Fiction', 18),
 ('India', 17),
 ('Biology', 17),
 ('Philosophy', 16),
 ('Humor', 16),
 ("Children's Myths, Fairy Tales, etc.", 16),
 ('Christianity', 15),
 ('Classical Antiquity', 15),
 ('Folklore', 15),
 ("Children's Book Series", 15),
 ('Germany', 15),
 ('United States', 14),
 ('Poetry', 14),
 ('Gothic Fiction', 14),
 ('Art', 14),
 ('United Kingdom', 14),
 ('Politics', 13),
 ('Slavery', 13),
 ('Horror', 13),
 ('Opera', 13),
 ('Contemporary Reviews', 13),
 ('Animals-Wild-Birds', 13),
 (

### Collecting Metadata for Dataset

In [40]:
# Load the pickled dataset; the cells below iterate over `data` and index each
# element by string keys, i.e. it is used as a sequence of dicts.
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
with open("data_v2.pkl", "rb") as datafile:
    data = pkl.load(datafile)

In [33]:
# Titles of the dataset records, in dataset order.
titles = [record['title'] for record in data]
# Whitespace-trimmed titles of every row in the full catalog frame.
cat_titles = [catalog_title.strip() for catalog_title in df['Title']]
# Number of records in the dataset.
len(titles)

2444

In [34]:
catalog

Unnamed: 0,Text#,Type,Issued,Title,Language,Authors,Subjects,LoCC,Bookshelves
0,1,Text,12/1/71,The Declaration of Independence of the United ...,en,"Jefferson, Thomas, 1743-1826","United States -- History -- Revolution, 1775-1...",E201; JK,Politics; American Revolutionary War; United S...
1,2,Text,12/1/72,The United States Bill of Rights\r\nThe Ten Or...,en,United States,Civil rights -- United States -- Sources; Unit...,JK; KF,Politics; American Revolutionary War; United S...
2,3,Text,11/1/73,John F. Kennedy's Inaugural Address,en,"Kennedy, John F. (John Fitzgerald), 1917-1963",United States -- Foreign relations -- 1961-196...,E838,
3,4,Text,11/1/73,Lincoln's Gettysburg Address\r\nGiven November...,en,"Lincoln, Abraham, 1809-1865",Consecration of cemeteries -- Pennsylvania -- ...,E456,US Civil War
4,5,Text,12/1/75,The United States Constitution,en,United States,United States -- Politics and government -- 17...,JK; KF,United States; Politics; American Revolutionar...
...,...,...,...,...,...,...,...,...,...
70185,70264,Text,3/11/23,Women as army surgeons,en,"Murray, Flora",,,
70186,70265,Text,3/11/23,The pot of basil,en,"Capes, Bernard, 1854-1918",,,
70187,70266,Text,3/12/23,Lyrical tales,en,"Robinson, Mary, 1758-1800",,,
70188,70267,Text,3/12/23,The condition of England,en,"Masterman, C. F. G.",,,


In [37]:
data[0]

{'author_id': '5959',
 'author_name': 'Sharkey, John Michael',
 'book_id': 3670,
 'gutenbergbookid': 33871,
 'title': 'Old Friends Are the Best',
 'text': ['',
  '',
  '',
  'CHAPTER VIII.',
  '',
  'JAMES TRIPLET, water in his eye, but fire in his heart, went home on',
  'wings. Arrived there, he anticipated curiosity by informing all hands he',
  'should answer no questions. Only in the intervals of a work, which was',
  'to take the family out of all its troubles, he should gradually unfold',
  'a tale, verging on the marvelous--a tale whose only fault was, that',
  'fiction, by which alone the family could hope to be great, paled beside',
  'it. He then seized some sheets of paper fished out some old dramatic',
  'sketches, and a list of _dramatis personae,_ prepared years ago, and',
  'plunged into a comedy. As he wrote, true to his promise, he painted,',
  'Triplet-wise, that story which we have coldly related, and made it',
  'appear, to all but Mrs. Triplet, that he was under t

In [41]:
# Map each Gutenberg book id ("Text#") to its raw "Subjects" value once,
# instead of re-filtering the whole catalog for every record.
subjects_by_book = dict(zip(catalog['Text#'], catalog['Subjects']))

# Frequency of every subject term across the dataset records.
dataset_subjects = Counter()

for d in data:
    raw_subjects = subjects_by_book.get(d['gutenbergbookid'])
    if isinstance(raw_subjects, str):
        # Split the field on ';' into subjects, then each subject on '--'
        # into its terms, trimming whitespace around every term.
        d['subjects'] = [
            term.strip()
            for subject in raw_subjects.split(";")
            for term in subject.split("--")
        ]
    else:
        # Book not present in `catalog`, or its Subjects value is missing.
        # The previous Series.to_string() lookup turned these cases into
        # display text (the empty-Series placeholder or 'NaN') that was then
        # counted as a subject; to_string() output may also be shortened by
        # pandas' display width settings -- TODO confirm.
        d['subjects'] = []
    dataset_subjects.update(d['subjects'])

In [53]:
# Give every record a single 'subject' label: the alphabetically first of its
# subject terms that is one of the target subjects, or "Other" if none match.
for record in data:
    matching_targets = sorted(
        term for term in record['subjects'] if term in target_subjects
    )
    record['subject'] = matching_targets[0] if matching_targets else "Other"

In [54]:
data[0]

{'author_id': '8824',
 'author_name': 'Dixon, Thomas F.',
 'book_id': 17427,
 'gutenbergbookid': 18721,
 'title': 'The Victim: A Romance of the Real Jefferson Davis',
 'text': ['us are very disappointing. No doubt you have found that justice in the',
  'United States goes only with a pure heart and a right purpose as it does',
  'everywhere else in the world. No doubt what you found here did not seem',
  'touched for you, after all, with the complete beauty of the ideal which',
  'you had conceived beforehand. But remember this: If we had grown at all',
  'poor in the ideal, you brought some of it with you. A man does not go',
  'out to seek the thing that is not in him. A man does not hope for the',
  'thing that he does not believe in, and if some of us have forgotten what',
  'America believed in, you, at any rate, imported in your own hearts a',
  'renewal of the belief. That is the reason that I, for one, make you',
  'welcome. If I have in any degree forgotten what America was in

In [None]:
# Turn the list of record dicts into a table and write it to CSV in the
# working directory (pandas' default settings, so the row index is written
# as the first column).
dataset_frame = pd.DataFrame(data)
dataset_frame.to_csv("data_w_subj_new.csv")