In [101]:
import os
import json
import re
import string
from collections import Counter
import random

from tqdm.notebook import tqdm
from IPython.display import clear_output, display

In [22]:
# Input parameters
# DOCUMENT_VERSION selects which versioned corpus folder is read.
DOCUMENT_VERSION = '03'
CORPUS_FOLDER = '/jupyterlab/corpus/eberron/v{}'.format(DOCUMENT_VERSION)

In [23]:
# Output parameters
# Version tag for this notebook's output artefacts. It is not referenced by
# any of the cells visible in this part of the notebook.
ARTEFACT_VERSION = '02'

### Read Document Metadata

In [24]:
# Load the per-document metadata records (a JSON list of dicts).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open(os.path.join(CORPUS_FOLDER, 'metadata.json'), encoding='utf-8') as f:
    metadata = json.load(f)

# Index the records by file name without its extension (e.g.
# 'Across Eberron.pdf' -> 'Across Eberron') so a record can be looked up from
# the matching Markdown file's stem.
metadata_dict = {}
for data in metadata:
    metadata_dict[os.path.splitext(data['filename'])[0]] = data

In [91]:
# Display the first metadata record to show which keys each record carries.
metadata[0]

{'filename': 'Across Eberron.pdf',
 'edition': '3e',
 'pdf/title': '',
 'pdf/author': ''}

### First Pass (Fast): Split Into Sections

In [147]:
# Manual book-title overrides, keyed by the title the splitter would otherwise
# use (the Markdown file stem, or the PDF title for sample files).
title_fixes = dict([
    ('881665-eberron_cannith_cat2', 'House Cannith Catalogue #2'),
])

In [1255]:
# Exact-match lines that look like headings (stand-alone, capitalised) but
# must never start a new section: stat-block labels, running headers, etc.
# One entry per line so a missing comma cannot silently concatenate two
# adjacent string literals into a single, never-matching entry.
disallowed_section_titles = {
    'Actions',
    'Reactions',
    'Cannith Catalogue 2',
    'THREAT DISPATCH',
    'Legendary Actions',
    'Wizard Level Feature',
    'Languages —',
    '-',
    'Finesse',
    'Weight Properties',
    'Player’s Handbook',
    'Spell Resistance:',
    'Capital: Korth',
    'Combat',
    'Capital: Rhukaan Draal',
    'STR DEX CON INT WIS CHA',
    'Favor',
    'DIALECTS',
    'WIS CHA',
    'CLASS FEATURES',
}
# Patterns for heading-like lines that must not start a new section.
# The splitter applies each one with re.match (anchored at the start of the
# line) and re.IGNORECASE.
disallowed_section_title_regexp = [
    r'Skills\s+.+\+[0-9].*',
    r'Saving Throws\s+.+\+[0-9].*',
    r'.*\-level.*feature',
    r'Languages.*Common.*',
    r'^[0-9\s.\(\)]+$',
    # The comma after this entry used to be missing, which fused it with the
    # next pattern into a single 'Hit Points ... Challenge ...' pattern.
    r'Hit Points\s+[0-9]+.*',
    r'Challenge\s+[0-9]+.*',
    r'Damage Immunities.*',
    r'Damage Resistances.*',
    r'Level Adjustment:.*',
    r'Challenge Rating:.*',
    r'Initiative:.*',
    r'Treasure:.*',
    r'Environment: .*',
    r'Skills: .*',
    r'Feats: .*',
    r'Organization: .*',
    r'Base Atk .*',
    r'Base Attack .*',
    r'Special Attacks: .*',
    r'Range: .*',
    r'Spell Resistance: .*',
    r'Graft Location: .*',
    r'Weight: .*',
    r'Light: .*\.',
    r'Scripts: .*',
    r'Script: .*',
    r'Speakers: .*',
    r'Format: .*',
    # NOTE(review): the parentheses here are an unescaped capture group, so
    # this matches 'Knowledge arcana' but not 'Knowledge (arcana)' - confirm
    # which form was intended.
    r'Knowledge ([a-z]+)',
    # NOTE(review): the ordinal suffix 'rd' is not covered (e.g. '3rd level');
    # confirm whether that is deliberate.
    r'[0-9](st|nd|th).*level.*',
    r'.*KORRANBERG CHRONICLE: THREAT DISPATCH',
    r'SIDEBAR: .*',
    r'WIZARDS OF EBERRON: FIVE ARCANE TRADITIONS [0-9]+.*',
    r'.*DUNGEON DECEMBER 2004',
    r'.*[cves]\s+[0-9]+',
]

In [1256]:
# File stems (Markdown file name without '.md') that the section splitter
# skips entirely.
ignore_list = {
    'Eberron Character Sheet',
    'New & Expanded Feat List',
    '476764-sample',
    'SharnIndexIntegrated',
}

In [1257]:
def _append_section(text, word_count, book_title, file_name, edition, section_title, section_pages):
    """Store one section's text in `sections` and its provenance in `section_metadata`."""
    sections.append(text)
    section_metadata.append({
        'book_title': book_title,
        'file_name': file_name,
        'edition': edition,
        'section_title': section_title,
        'section_pages': section_pages,
        'initial_word_count': word_count
    })


def _may_be_section_title(candidate, current_title):
    """Return True when a stand-alone line passes every section-title heuristic."""
    # A title is written either in ALL CAPS or in Capitalised Words.
    if candidate.upper() != candidate and string.capwords(candidate) != candidate:
        return False
    # A repeat of the running title does not start a new section.
    if current_title.lower() == candidate.lower():
        return False
    # NOTE(review): is_integer_string is not defined in the cells visible
    # here; it must already exist in the kernel before this cell runs.
    if is_integer_string(candidate):
        return False
    if candidate in disallowed_section_titles:
        return False
    # Require at least four letters/dots once everything else is stripped.
    if len(re.sub(r'[^a-zA-Z\.]', '', candidate)) < 4:
        return False
    if ',' in candidate:
        return False
    return not any(
        re.match(regexp, candidate, re.IGNORECASE)
        for regexp in disallowed_section_title_regexp
    )


# Parallel lists: sections[i] is the text described by section_metadata[i].
sections = []
section_metadata = []
book_count = 0

# NOTE(review): os.listdir returns entries in arbitrary order, so the position
# of a given section in `sections` depends on the file system.
for file_count, file_name in enumerate(tqdm(os.listdir(CORPUS_FOLDER))):
    if not file_name.endswith('.md'):
        continue
    file_stem = file_name[:-3]
    if file_stem in ignore_list:
        continue

    book_metadata = metadata_dict[file_stem]
    pdf_title = book_metadata['pdf/title']
    edition = book_metadata['edition']
    # Prefer the PDF title only for '...sample' files; otherwise keep the file stem.
    book_title = file_stem
    if pdf_title and book_title.endswith('sample'):
        book_title = pdf_title
    book_title = title_fixes.get(book_title, book_title)

    current_page = 0
    current_section_title = ''
    current_section_lines = []
    empty_line_ctr = 0  # not used by any cell visible here
    file_path = os.path.join(CORPUS_FOLDER, file_name)
    # Explicit encoding: decoding must not depend on the platform's default locale.
    with open(file_path, encoding='utf-8') as f:
        document = f.read()
    lines = document.split('\n')
    line_count = len(lines)

    for line_no, line in enumerate(lines):
        # '\0' marks "no neighbour" so the first/last line never looks
        # as if it were surrounded by blank lines.
        previous_line = lines[line_no - 1].strip() if line_no > 0 else '\0'
        next_line = lines[line_no + 1].strip() if line_no < line_count - 1 else '\0'
        current_line = line.strip()

        if not current_line:
            continue

        # '## Page N' markers update the page counter and are never content.
        m = re.match(r'##\s+Page\s+([0-9]+)', line)
        if m:
            current_page = int(m[1])
            continue

        # Counts space-separated tokens only: two words separated by a line
        # break count as one, so this under-counts real words.
        current_section_word_count = len("\n".join(current_section_lines).split(' '))

        # A title candidate is preceded by a blank line and followed by a blank
        # line or by a line starting with one of these stat-block openers.
        stands_alone = previous_line == '' and (
            next_line == '' or next_line.startswith(('Medium', 'Large', 'District Type'))
        )
        if stands_alone and _may_be_section_title(current_line, current_section_title):
            # Flush the section collected so far unless it is too short to keep.
            if current_section_title and current_section_lines and current_section_word_count > 15:
                _append_section(
                    "\n".join(current_section_lines),
                    current_section_word_count,
                    book_title, file_name, edition,
                    current_section_title, current_section_pages,
                )
            current_section_title = current_line
            current_section_lines = []
            current_section_pages = (current_page, current_page)
            continue

        if current_section_word_count > 450:
            # Over-long section: emit a chunk taken from the raw lines and keep
            # collecting under the same title.
            # NOTE(review): start/end are computed from the number of collected
            # lines, but `lines` also contains the blank lines and page markers
            # that were skipped, so the window may not line up with the
            # collected text - confirm this is intended.
            end = line_no + 10
            start = line_no - len(current_section_lines) + 10
            text = "\n".join([
                raw_line for raw_line in lines[start:end]
                if raw_line and not raw_line.startswith('## Page ')
            ])
            _append_section(
                text, len(text.split(' ')),
                book_title, file_name, edition,
                current_section_title, current_section_pages,
            )
            current_section_lines = []
            current_section_pages = (current_page, current_page)
            continue

        # Lines before the first detected title are discarded.
        if current_section_title:
            current_section_lines.append(current_line)
            current_section_pages = (current_section_pages[0], current_page)

    # Flush whatever is left at the end of the book.
    # NOTE(review): this flush joins with spaces and applies no minimum length,
    # unlike the two flushes above - confirm this is intended.
    if current_section_lines:
        text = ' '.join(current_section_lines)
        _append_section(
            text, len(text.split(' ')),
            book_title, file_name, edition,
            current_section_title, current_section_pages,
        )
    # Development aid: uncomment to stop after the first 20 books.
    # book_count += 1
    # if book_count > 20:
    #     break
# Sanity checks on the first-pass split.
section_count = len(sections)
assert len(section_metadata) == section_count

# NOTE(review): these two positional checks depend on the order in which
# os.listdir returned the corpus files, which is arbitrary.
assert section_metadata[1]['section_title'] == 'Litmus Strips'
assert section_metadata[11]['section_title'] == 'Automatic Chatelaine'

# Titles that must be detected somewhere in the corpus. They are checked in
# one pass so a failure lists every missing title, not just the first.
section_titles = {d['section_title'] for d in section_metadata}
expected_section_titles = [
    'Automatic Chatelaine',
    'HORRID HYENA',
    'HORRID BADGER',
    'NAZTHARUNE RAKSHASA',
    'CRYSTEEL',
    'DENDRITIC',
    'IRONBARK',
    'DARKLEAF',
    'KNIGHT PHANTOM',
    'SHARN SKYMAGE',
    'WEAPONS OF KHORVAIRE',
    'ADVENTURING GEAR',
    '7. The Library',
    '8. The Entry Hall',
    'THE ORIGIN OF THE FIVE NATIONS',
    'AUNDAIR AT A GLANCE',
    'THE COMING OF GALIFAR',
    'Highhold',
    'PERIPLANAR OF ICE ~ PERISIAN',
    'HOUSE THARASHK',
    'ARGON',
    'METRON',
]
missing_section_titles = [t for t in expected_section_titles if t not in section_titles]
assert not missing_section_titles, f'Expected section titles not found: {missing_section_titles}'

# Cell output: total number of sections and the last book processed.
section_count, book_title

  0%|          | 0/127 [00:00<?, ?it/s]

(19991, 'Magic of Eberron')

In [1258]:
# TODO: For second pass, delete:
# Contents
# Thanks
# CREDITS


In [1259]:
# Spot check: one random (section title, initial word count) pair.
random.choice([
    (meta['section_title'], meta['initial_word_count'])
    for meta in section_metadata
])

('Maruk Ghaash’kala (clan) 159', 26)

In [1260]:
def get_sections_by_title(title):
    """
    Lazily yield the text of every section whose title equals `title`.

    :param title: Section title to look for (exact, case-sensitive match).
    :return: Generator over the matching entries of `sections`, in order.
    """
    matching_indices = (
        index
        for index, meta in enumerate(section_metadata)
        if meta['section_title'] == title
    )
    for index in matching_indices:
        yield sections[index]

In [1282]:
# Pick one book at random and show the set of section titles found in it.
book_titles = set(meta['book_title'] for meta in section_metadata)
book_title = random.choice(list(book_titles))
(
    book_title,
    {meta['section_title'] for meta in section_metadata if meta['book_title'] == book_title},
)

('1262926-Politics_of_Breland_v1.4',
 {'AS AN ADVENTURE',
  'CONCLUSION',
  'Crown New Cyre',
  'DEMOGRAPHICS',
  'INDUSTRY',
  'MILITARY',
  'PARLIAMENT',
  'REIGNITING THE LAST WAR',
  'THE CROWN',
  'THE HEIRS',
  'THE MACE OF PARLIAMENT'})

In [1274]:
def truncate(text, k):
    """
    Shorten `text` to its first `k` whitespace-separated words.

    When the text has more than `k` words, the kept words are joined with
    single spaces and ' ...' is appended. Otherwise the original string is
    returned unchanged, including its original whitespace.

    :param text: The input string.
    :param k: The maximum number of words to keep.
    :return: The truncated string, or `text` itself if it is short enough.
    """
    tokens = text.split()
    if len(tokens) <= k:
        return text
    kept = tokens[:k]
    return f"{' '.join(kept)} ..."
# Print a randomly chosen section: its index, title, metadata record and text.
i = random.randint(0, section_count - 1)
print(
    i,
    section_metadata[i]['section_title'],
    section_metadata[i],
    sections[i],
    sep='\n',
)

7053
DROW ENCOUNTERS
{'book_title': 'D&D 3E Races of Eberron', 'file_name': 'D&D 3E Races of Eberron.md', 'edition': '3e', 'section_title': 'DROW ENCOUNTERS', 'section_pages': (77, 77), 'initial_word_count': 71}
check can improve the attitude of a sentient being. The
scout rolls 1d20+3, or 1d20—1 if attempting to influence
a magical beast with an Intelligence score of 1 or 2.
Light Blindness: Abrupt exposure to bright light (such
assunlight or a daylight spell) blinds drow for 1 round.
On subsequent rounds, they are dazzled as long as they
remain in the affected area.
Possessions: +1 studded leather, masterwork longsword,
masterwork composite longbow (+2 Str bonus) with 20
arrows, 1,500 gp.


In [1254]:
# Section titles detected in one specific book, in document order.
[
    meta['section_title']
    for meta in section_metadata
    if meta['book_title'] == '1598836-Languages_of_Eberron_2E'
]

['CREDITS',
 'CONTENTS',
 'INTRODUCTION',
 'INTRODUCTION',
 'STARTING FEATURES',
 'STARTING FEATURES',
 'SCRIPTS',
 'SCRIPTS',
 'SCRIPTS',
 'LANGUAGE FAMILIES',
 'DEAD LANGUAGES',
 'SECRET LANGUAGES',
 'LANGUAGES',
 'CORVAGURI FAMILY',
 'ADARKHA',
 'CORVAGURI',
 'ENTUUSI',
 'ISOCHAR',
 'KHUNANI',
 'LOW KALUUNI',
 'SYRKHA',
 'AVENTUUAN +',
 'SUNYAGIRI +',
 'CRYPTOGRAPHIC FAMILY',
 'JOLAN',
 'THE NULAKESH HARMONIES',
 'THE REKKENMARK MANUAL',
 'SKIN CANT',
 'STONESPEECH',
 'WEAK LINK',
 'DHAKAAL FAMILY',
 'DARGUUL',
 "GAA'RAK",
 "TAARKA'VUUL",
 'EBERRAL FAMILY',
 'ALCATHAH',
 'ALULA',
 'EBERRAL',
 'EMEA',
 'VVAAR¥',
 'ASHTAGARI',
 'BAPHOM',
 'SHARGONIC',
 'XXIPHU',
 'YEENOGH',
 "OUR'AT",
 "« OUR'SIR",
 '© Script: Oural',
 "OUR'USH",
 'JUNGLE GIANTS',
 'ELEVENTH +',
 "SUL'AT +",
 "OTHER XEN'DRIKAN LANGUAGES",
 'QUORI FAMILY',
 'KOSTET',
 'LASHTI',
 'OLD KREEN +',
 'YANNAHI +',
 'RHIAVHAAN FAMILY',
 'DASKARI',
 'KORTHISH',
 'LHAZAARMAL',
 'METRON',
 'THALIAIS',
 'WROATI',
 'MALEERIQ +',
 '

In [1240]:
# (title, initial word count) for every section of one specific book.
[
    (meta['section_title'], meta['initial_word_count'])
    for meta in section_metadata
    if meta['book_title'] == '1598836-Languages_of_Eberron_2E'
]

[('CREDITS', 345),
 ('CONTENTS', 40),
 ('INTRODUCTION', 360),
 ('INTRODUCTION', 172),
 ('STARTING FEATURES', 359),
 ('STARTING FEATURES', 220),
 ('SCRIPTS', 364),
 ('SCRIPTS', 372),
 ('SCRIPTS', 418),
 ('LANGUAGE FAMILIES', 111),
 ('DEAD LANGUAGES', 80),
 ('SECRET LANGUAGES', 55),
 ('LANGUAGES', 52),
 ('CORVAGURI FAMILY', 22),
 ('ADARKHA', 86),
 ('CORVAGURI', 119),
 ('ENTUUSI', 407),
 ('ISOCHAR', 107),
 ('KHUNANI', 96),
 ('LOW KALUUNI', 124),
 ('SYRKHA', 56),
 ('AVENTUUAN +', 438),
 ('SUNYAGIRI +', 139),
 ('CRYPTOGRAPHIC FAMILY', 31),
 ('JOLAN', 114),
 ('THE NULAKESH HARMONIES', 80),
 ('THE REKKENMARK MANUAL', 72),
 ('Format: Signed', 73),
 ('STONESPEECH', 69),
 ('WEAK LINK', 58),
 ('DHAKAAL FAMILY', 30),
 ('DARGUUL', 169),
 ("GAA'RAK", 87),
 ("TAARKA'VUUL", 243),
 ('EBERRAL FAMILY', 26),
 ('ALCATHAH', 58),
 ('ALULA', 149),
 ('EBERRAL', 77),
 ('EMEA', 84),
 ('Speakers: Druids', 165),
 ('ASHTAGARI', 66),
 ('Speakers: Minotaurs', 86),
 ('SHARGONIC', 135),
 ('XXIPHU', 118),
 ('YEENOGH', 1

In [216]:
# Display the text of section `i`. This uses the index `i` drawn with
# random.randint earlier in the notebook, so it only works while `i` is still
# a valid index into the current `sections` list.
sections[i]

IndexError: list index out of range

In [225]:
# Scratch check of the word-count method used by the splitter: split(' ')
# counts tokens separated by single spaces.
len('Symbiotic Nature. A crawling gauntlet can’t be remo'.split(' '))

8