In [1]:
from gutenbergpy.gutenbergcache import GutenbergCache, GutenbergCacheTypes
import os
import json
import pandas as pd
import numpy as np
import pickle as pkl

# load config
with open('config.json', 'r') as f:
    config = json.load(f)
cwd = os.getcwd()
os.chdir(config['REPODIR'])
import Utils as U
os.chdir(cwd)

from collections import Counter, defaultdict
import itertools


In [2]:
# TODO: Extract gutenberg book ids based on english language, THEN query the dataset

In [2]:
U.load_file('config.json', 'json')

{'REPODIR': '/Users/stephentoner/Desktop/Winter 2023/SI 699/SI699Project',
 'DATADIR': 'data/',
 'OUTPUT': 'output/'}

In [6]:
# Constants
NUM_AUTHORS = 50
WORKS_PER_AUTHOR = 5
CHUNKS_PER_WORK = 10
CHUNK_LENGTH = 50
np.random.seed(699)

TypeError: load_file() missing 2 required positional arguments: 'filedir' and 'format'

## Exploring the Data Distribution of Project Gutenberg
Our first step in developing a sampling approach is to determine what metadata is available and how best to create a rich, diverse sample for language modeling. We start by initializing the cache and looking at the schema:

In [4]:
df = pd.read_csv("pg_catalog.csv")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'pg_catalog.csv'

In [None]:
catalog = df[df["Type"] == "Text"]
catalog = catalog[catalog["Language"] == "en"]

#### Filter for Authors

In [None]:
author_set = df.groupby("Authors").count().sort_values("Text#")
author_set['Text#'][::-1]

Authors
Various                                                                                                          2690
Anonymous                                                                                                         827
Lytton, Edward Bulwer Lytton, Baron, 1803-1873                                                                    217
Twain, Mark, 1835-1910                                                                                            191
Shakespeare, William, 1564-1616                                                                                   169
                                                                                                                 ... 
Menzies, Sutherland, active 1840-1883; Wilton, Mary Margaret Stanley Egerton, Countess of, 1801-1858 [Editor]       1
Menéndez Pidal, Ramón, 1869-1968 [Compiler]                                                                         1
Menéndez y Pelayo, Marcelino, 1856-1912         

#### Subjects

In [None]:
# Get distinct subjects
subjects = pd.unique(catalog['Subjects'])
subjects = [[temp.strip() for temp in str(subj).split(sep=";")] for subj in subjects]
# subjects

[['United States -- History -- Revolution, 1775-1783 -- Sources',
  'United States. Declaration of Independence'],
 ['Civil rights -- United States -- Sources',
  'United States. Constitution. 1st-10th Amendments'],
 ['United States -- Foreign relations -- 1961-1963',
  'Presidents -- United States -- Inaugural addresses'],
 ['Consecration of cemeteries -- Pennsylvania -- Gettysburg',
  "Soldiers' National Cemetery (Gettysburg, Pa.)",
  'Lincoln, Abraham, 1809-1865. Gettysburg address'],
 ['United States -- Politics and government -- 1783-1789 -- Sources',
  'United States. Constitution'],
 ['Speeches, addresses, etc., American',
  'United States -- Politics and government -- 1775-1783 -- Sources',
  'Virginia -- Politics and government -- 1775-1783 -- Sources'],
 ['Massachusetts -- History -- New Plymouth, 1620-1691 -- Sources',
  'Pilgrims (New Plymouth Colony)',
  'Mayflower Compact (1620)'],
 ['United States -- Politics and government -- 1861-1865',
  'Presidents -- United States -

In [None]:
subject_set = set()
major_subjects = set()
subject_counts = Counter()

for s in subjects:
    subject_set.update(s)
    sub_subjects = [subj.split(' -- ') for subj in s]
    major_subjects.update(sub_subjects[-1]) # Outermost -- is the meta subject; could retain inner subjects if desired
    for s2 in sub_subjects:
        subject_counts.update([_s.strip() for _s in s2])

In [None]:
def extract_subjects(entry):
    return [temp.strip() for temp in str(entry).split(sep=";")]

In [None]:
subject_counts.most_common(1000)

[('Fiction', 21825),
 ('Juvenile fiction', 16019),
 ('History', 5650),
 ('United States', 3286),
 ('Great Britain', 2265),
 ('England', 2237),
 ('Description and travel', 2161),
 ('Biography', 2106),
 ('Social life and customs', 1865),
 ('Conduct of life', 1732),
 ('Science fiction', 1680),
 ('19th century', 1670),
 ('Short stories', 1502),
 ('World War, 1914-1918', 1042),
 ('Adventure stories', 1026),
 ('Drama', 999),
 ('Juvenile literature', 899),
 ('Early works to 1800', 847),
 ('France', 827),
 ('Translations into English', 818),
 ('Poetry', 731),
 ('Man-woman relationships', 698),
 ('Civil War, 1861-1865', 691),
 ('Love stories', 633),
 ('20th century', 627),
 ('Women', 567),
 ('Children', 557),
 ('Young women', 542),
 ('Indians of North America', 540),
 ('Politics and government', 526),
 ('History and criticism', 516),
 ('Christian life', 451),
 ('Historical fiction', 449),
 ('Friendship', 437),
 ('Folklore', 430),
 ('India', 426),
 ('Canada', 403),
 ('Periodicals', 403),
 ('Voya

In [None]:
subject_set
# major_subjects

{'Nuclear energy -- Popular works',
 'Aztecs',
 'American poetry -- African American authors',
 'Legends -- France -- Translations into English',
 'Paranormal fiction, French -- Translations into English',
 'Faust, -approximately 1540 -- Fiction',
 'Social status -- Fiction',
 'Coleridge, Samuel Taylor, 1772-1834 -- Interviews',
 'Selkirk Range -- Description and travel',
 'Bahai Faith -- Doctrines',
 'Browning, Robert, 1812-1889',
 'Rosicrucians',
 'Success -- Fiction',
 'Clocks and watches -- History',
 'Pins and needles -- Juvenile fiction',
 'Copyright -- United States -- History',
 'Great Britain. Royal Navy -- History -- 16th century',
 'Drinking customs -- England',
 'Dream interpretation',
 'Gaul -- History -- 58 B.C.-511 A.D. -- Fiction',
 'Waterloo, Battle of, Waterloo, Belgium, 1815 -- Poetry',
 'Adams, Abigail, 1744-1818 -- Correspondence',
 'Dye plants',
 'El Morro National Monument (N.M.) -- Guidebooks',
 'Natural history -- Dominica',
 'Hotels -- Fiction',
 'Bible. Colos

### Bookshelves

In [None]:
shelves = pd.unique(catalog['Bookshelves'])

shelves = [[temp.strip() for temp in str(shelf).split(sep=";")] for shelf in shelves]
shelf_set = set()
shelf_counts = Counter()

for s in shelves:
    shelf_set.update(s)
    shelf_counts.update(s)
    
len(shelf_set)

248

In [None]:
shelf_counts.most_common(50)

[('Best Books Ever Listings', 51),
 ('Movie Books', 43),
 ("Banned Books from Anne Haight's list", 41),
 ('Harvard Classics', 40),
 ('Bestsellers, American, 1895-1923', 31),
 ("Children's Literature", 30),
 ('Technology', 29),
 ('Historical Fiction', 26),
 ('World War I', 25),
 ("Children's History", 25),
 ("Children's Instructional Books", 24),
 ('US Civil War', 22),
 ('Adventure', 22),
 ("Children's Fiction", 21),
 ('Animal', 21),
 ("Children's Picture Books", 19),
 ('Native America', 18),
 ('Science Fiction', 18),
 ('India', 17),
 ('Biology', 17),
 ('Philosophy', 16),
 ('Humor', 16),
 ("Children's Myths, Fairy Tales, etc.", 16),
 ('Christianity', 15),
 ('Classical Antiquity', 15),
 ('Folklore', 15),
 ("Children's Book Series", 15),
 ('Germany', 15),
 ('United States', 14),
 ('Poetry', 14),
 ('Gothic Fiction', 14),
 ('Art', 14),
 ('United Kingdom', 14),
 ('Politics', 13),
 ('Slavery', 13),
 ('Horror', 13),
 ('Opera', 13),
 ('Contemporary Reviews', 13),
 ('Animals-Wild-Birds', 13),
 (

### Collecting Metadata for Dataset

In [None]:
with open("data_v2.pkl", "rb") as datafile:
    data = pkl.load(datafile)

In [None]:
titles = [dat['title'] for dat in data]
cat_titles = [t.strip() for t in df['Title']]
len(titles)

2444

In [None]:
catalog

NameError: name 'catalog' is not defined

In [None]:
dataset_subjects = Counter()

for d in data:
    temp_subjects = catalog[catalog['Text#'] == d['gutenbergbookid']]['Subjects'].to_string(index = False).split(";")
    d['subjects'] = [dat.strip() for dat in list(itertools.chain.from_iterable([temp.strip().split("--") for temp in d['subjects']]))]
    dataset_subjects.update(d['subjects']) # Need

KeyError: 'subjects'

In [None]:
len(dataset_subjects)

len(data)

data[51]

# sum([subj[1] for subj in dataset_subjects.most_common(25)])

{'author_id': '972',
 'author_name': 'Masefield, John',
 'book_id': 27120,
 'gutenbergbookid': 61286,
 'title': 'Selected Poems',
 'text': ['         erqaumadlarlavulle!',
  '',
  '  Mel. 244.',
  '',
  'ILL.  2. Nâlegaq ânialik,',
  '         najoromagapkit,',
  '         qemergoluarlagit,',
  '         quviagigapkit!',
  '         Ittikattit mãna',
  '         erqiniarlakka,',
  '         ijîkik illutortuk',
  '         ullernaigilâkka!',
  '',
  'KOR.  Atâtâk! anernera aggangnut perqovara! Niaqune nairlugo toqolerpoq.',
  '',
  '  Mell. 22a.',
  '',
  'ILL.  3. Oqautsivit kingorlingat,',
  '         toqokuma, atorpara;',
  '         akkitudlartoq tarniga',
  '         aggangnut perqolârpara.',
  '',
  '      4. Sennerakkut angmartokut',
  '         tarniga angerartiuk;',
  '         oqautsivit kingorlingata',
  '         toqumne manigorlinga!',
  '',
  '  Mel. 232',
  '',
  'ANG.  5. Najuinaromavagit,',
  '         najormigamgatauq igvit,',
  '         tagga quviasutiga;',
  '',
  '

In [None]:
pd.DataFrame(data).to_csv("data_w_subj.csv")