In [1]:
import os
import json
import random
from pypdf import PdfReader
from tqdm.notebook import tqdm

In [2]:
# Source directories that are scanned for PDF files.
folders = [
    '/eberron/3e',
    '/eberron/5e',
]
# Destination directory for the extracted plain-text chunks.
corpus_folder = '/corpus/texts'

In [3]:
# Collect the document-information metadata of every PDF that exposes an
# outline, keyed by file name. PDFs with an empty outline are left out.
metadata = {}
for folder in tqdm(folders):
    for filename in tqdm(os.listdir(folder)):
        is_pdf = filename[-4:].lower() == '.pdf'
        if is_pdf:
            reader = PdfReader(f'{folder}/{filename}')
            if reader.outline:
                metadata[filename] = reader.metadata

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [4]:
def get_obj(outline, depth=1, obj='/Title'):
    """Flatten a (possibly nested) outline into a list of one field's values.

    Args:
        outline: Either a list (whose items may themselves be lists) or a
            single mapping-like outline entry.
        depth: Nesting level; only forwarded to recursive calls and does not
            influence the result.
        obj: Key to look up in every outline entry.

    Returns:
        A flat list with one element per non-list entry, in traversal order:
        the entry's value for ``obj``, or ``''`` when the key is missing or
        its value is ``None``.
    """
    # Leaf case: a single entry contributes exactly one element.
    if not isinstance(outline, list):
        has_value = obj in outline and outline[obj] is not None
        return [outline[obj] if has_value else '']

    # Nested case: concatenate the flattened children in order.
    collected = []
    for child in outline:
        collected.extend(get_obj(child, depth + 1, obj))
    return collected



In [5]:
def get_title(outline, depth=1):
    """Flatten a (possibly nested) outline into Markdown-style heading lines.

    Args:
        outline: Either a list (whose items may themselves be lists) or a
            single mapping-like outline entry holding a ``'/Title'`` key.
        depth: Heading level for a non-list entry; every level of list
            nesting increases it by one.

    Returns:
        A flat list with one string per non-list entry, in traversal order,
        formatted as ``'#' * depth + ' ' + title``. An entry whose title is
        missing or ``None`` yields a heading with an empty title instead of
        raising, so the result always has one element per entry (the same
        convention ``get_obj`` uses for missing values).
    """
    if isinstance(outline, list):
        titles = []
        for part in outline:
            titles += get_title(part, depth + 1)
        return titles

    # Tolerate entries without a usable title rather than failing on
    # string concatenation with None / a missing key.
    title = outline['/Title'] if '/Title' in outline else None
    if title is None:
        title = ''
    return ['#' * depth + ' ' + title]



In [6]:
# Split every PDF into overlapping text chunks of `chunk_size` pages and write
# each non-empty chunk to `corpus_folder` as
# '<pdf name>-<first page>-<last page>.txt' (0-based, inclusive page indices).
chunk_size = 2  # pdf pages per chunk
parts_of_books = {}
page_objects = {}
# Make sure the output folder exists so the first write cannot fail on it.
os.makedirs(corpus_folder, exist_ok=True)
for folder in tqdm(folders):
    for filename in tqdm(os.listdir(folder)):
        if not filename.lower().endswith('.pdf'):
            continue
        reader = PdfReader(f'{folder}/{filename}')
        num_pages = reader.get_num_pages()
        # Sliding window: a chunk starts on every page, so consecutive chunks
        # overlap by chunk_size - 1 pages.
        for first in range(num_pages):
            # Inclusive index of the chunk's last page, clamped to the end of
            # the document so the file name never points past the last page.
            last = min(first + chunk_size, num_pages) - 1
            text_filename = f'{filename[:-4]}-{first:03}-{last:03}.txt'
            # Slice ends are exclusive, hence `last + 1`; slicing up to `last`
            # would drop the final page of every chunk.
            pages = reader.pages[first:last + 1]
            text = "".join(page.extract_text() or "" for page in pages)
            if text:
                # Explicit UTF-8: extracted PDF text routinely contains
                # characters outside the platform's default encoding.
                with open(f'{corpus_folder}/{text_filename}', 'w', encoding='utf-8') as f:
                    f.write(text)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [7]:
# For every PDF build two lists keyed by file name: the outline titles as
# Markdown-style headings (parts_of_books) and the '/Page' value of each
# outline entry (page_objects). PDFs without an outline keep empty lists.
parts_of_books = {}
page_objects = {}
for folder in tqdm(folders):
    for filename in tqdm(os.listdir(folder)):
        is_pdf = filename[-4:].lower() == '.pdf'
        if not is_pdf:
            continue
        headings, targets = [], []
        parts_of_books[filename] = headings
        page_objects[filename] = targets
        reader = PdfReader(f'{folder}/{filename}')
        # An empty/missing outline simply contributes nothing.
        for part in (reader.outline or []):
            headings.extend(get_title(part, 1))
            targets.extend(get_obj(part, 1, '/Page'))


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [6]:
# List the directory entries of the 3e source folder as the cell output.
os.listdir('/eberron/3e')

['Across Eberron.pdf',
 'Blades Of The Quori.pdf',
 'D&D 3E Races of Eberron.pdf',
 'Dragonmarked.pdf',
 'Dragonmarks, Dragonshards & Dynasties of Power.pdf',
 'Dragons of eberron.pdf',
 'Eberron - Eyes_of_the_Lich_Queen.pdf',
 'Eberron Campaign Setting.pdf',
 'Eberron Character Sheet.pdf',
 'Eberron-CityofStormreach.pdf',
 'Encounter the Warforged.pdf',
 'Errata',
 "Explorer's Handbook.pdf",
 'Faiths of Eberron.pdf',
 'Fallen Angel.pdf',
 'Five Nations.pdf',
 'Grasp of The Emerald Claw.pdf',
 'Keith Baker - The City of Towers (Eberron_ The Dreaming Dark, Book 1)-Wizards of the Coast (2005).pdf',
 'Keith Baker - The Gates of Night (The Dreaming Dark, Book 3)-Wizards of the Coast (2006).pdf',
 'Keith Baker - The Shattered Land (Eberron_ The Dreaming Dark)-Wizards of the Coast (2005).pdf',
 'Keith Baker, Scott Fitzgerald Gray, Glenn McDonald, Chris Sims - Secrets of Sarlona (Dungeons & Dragons d20 3.5 Fantasy Roleplaying, Eberron Supplement)-Wizards of the Coast (2007).pdf',
 'Keith Bake

In [8]:
# Show the collected per-file metadata dictionary as the cell output.
metadata

{'Blades Of The Quori.pdf': {'/ModDate': "D:20050110134502+01'00'",
  '/CreationDate': 'D:20050110134458Z',
  '/Title': 'Blades of the Quori',
  '/Producer': 'Acrobat Web Capture 6.0'},
 'Dragons of eberron.pdf': {'/CreationDate': "D:20071229214352-06'00'",
  '/ModDate': "D:20240604203339-04'00'"},
 'Eberron - Eyes_of_the_Lich_Queen.pdf': {'/CreationDate': "D:20080209075428+01'00'",
  '/ModDate': "D:20080209151404-05'00'"},
 'Eberron-CityofStormreach.pdf': {'/CreationDate': "D:20080820143821-05'00'",
  '/Creator': 'Adobe InDesign CS2 (4.0.5)',
  '/Producer': 'PDF PT 3.10 (pdf-tools.com)',
  '/Title': 'City of Stormreach',
  '/ModDate': "D:20240522204102-04'00'"},
 'Keith Baker, Scott Fitzgerald Gray, Glenn McDonald, Chris Sims - Secrets of Sarlona (Dungeons & Dragons d20 3.5 Fantasy Roleplaying, Eberron Supplement)-Wizards of the Coast (2007).pdf': {'/CreationDate': 'D:20070603130541Z',
  '/Producer': 'ABBYY FineReader 8.0 Professional Edition',
  '/ModDate': "D:20240523122935-04'00'"}