In [1]:
pdf_path = "./data/SRD_CC_v5.1.pdf"

In [2]:
from pprint import pprint

# PyMuPDF

In [6]:
import fitz

doc = fitz.open(pdf_path) # open a document

In [8]:
page = doc[3]
page

page 3 of ./data/SRD_CC_v5.1.pdf

In [9]:
import re

def strip_boilerplate(text: str) -> str:
    text = text
    page_boilerplate = 'System\xa0Reference\xa0Document\xa05.1\xa0\n\xa0 \d+\xa0\n'
    text = re.sub(page_boilerplate, '', text)
    text = re.sub('System\s+Reference\s+Document\s+5.1\s+\d+\s+', '', text)
    
    error_note = 'If you note any errors in this document, please let us know by \nemailing askdnd@wizards.com. \n'
    if error_note in text:
        text = text.replace(error_note, '')
    
    return text

In [10]:

def clean_page_contents(contents: str) -> str:
    contents = contents.replace('\t\r \xa0', ' ')
    contents = strip_boilerplate(contents)
    return contents

In [11]:
# collect all text blocks on all pages, and sort them by length (to see if we can find headers)

all_blocks = []

for i, page in enumerate(doc):
    blocks = page.get_text_blocks()
    for block in page.get_text_blocks():
        (x0, y0, x1, y1, block_text, block_no, block_type) = block

        block_content = clean_page_contents(block_text)
        if (block_content == ''):
            continue
        all_blocks.append((x0, y0, x1, y1, block_content, block_no, block_type))

len(all_blocks)

11166

In [12]:
import re
import math


def is_header(block) -> bool:
    (x0, y0, x1, y1, text, block_no, block_type) = block

    HEADER_LEFT_COLUMN_X = 57.599998474121094
    HEADER_RIGHT_COLUMN_X = 328.55999755859375
    TOLERANCE = 0.001

    if not math.isclose(x0, HEADER_LEFT_COLUMN_X, rel_tol=TOLERANCE) and not math.isclose(x0, HEADER_RIGHT_COLUMN_X, rel_tol=TOLERANCE):
        return False

    # if the first character is capital, and the line length is less than 20, and it ends in \n
    if len(text) == 0:
        return False
    if len(text) > 40:
        return False
    if text[-1] != '\n':
        return False
    if text[0].isupper():
        return True
    
    return False

headers = [block[4] for block in all_blocks if is_header(block)]

from pprint import pprint
pprint(headers)

['Legal\xa0Information\xa0\n',
 'Races \nRacial Traits \n',
 'Dwarf \nDwarf Traits \n',
 'Elf \nElf Traits \n',
 'Halfling \nHalfling Traits \n',
 'Human \nHuman Traits \n',
 'Dragonborn \nDragonborn Traits \n',
 'Draconic Ancestry \n',
 'Dragon \nDamage Type \nBreath Weapon \n',
 'Gold \nFire \n15 ft. cone (Dex. save) \n',
 'Green \nPoison \n15 ft. cone (Con. save) \n',
 'Red \nFire \n15 ft. cone (Dex. save) \n',
 'Silver \nCold \n15 ft. cone (Con. save) \n',
 'White \nCold \n15 ft. cone (Con. save) \n',
 'Gnome \nGnome Traits \n',
 'Half-Elf \nHalf-Elf Traits \n',
 'Half-Orc \nHalf-Orc Traits \n',
 'Tiefling \nTiefling Traits \n',
 'Barbarian \n',
 'The Barbarian \n',
 'Level \n',
 'Rage \n',
 'Unarmored Defense \n',
 'Reckless Attack \n',
 'Danger Sense \n',
 'Primal Path \n',
 'Ability Score Improvement \n',
 'Extra Attack \n',
 'Fast Movement \n',
 'Feral Instinct \n',
 'Brutal Critical \n',
 'Relentless Rage \n',
 'Persistent Rage \n',
 'Indomitable Might \n',
 'Primal Champion \

... okay.  The above approach sucks.  It's unreliable.  Positioning isn't a good enough indicator of a section header.

## Alternate approach - find heads by font sizes
Are the font colors / sizes predictable enough?

In [13]:
# ('Races'): 'GillSans-SemiBold', size ~25.920000076293945, color 9647668
CHAPTER = { 'font': 'GillSans-SemiBold', 'size': 25.920000076293945, 'color': 9647668 }

# ('Dwarf'): 'GillSans-SemiBold', size 18, color 9647668
SECTION = { 'font': 'GillSans-SemiBold', 'size': 18, 'color': 9647668 }

# ('Racial Traits'): 'GillSans-SemiBold', size ~13.920000076293945, color 9647668
SUBSECTION = { 'font': 'GillSans-SemiBold', 'size': 13.920000076293945, 'color': 9647668 }

# ('Ability Score Increase'): 'GillSans-SemiBold', size 12, color 9647668
SUBSUBSECTION = { 'font': 'GillSans-SemiBold', 'size': 12, 'color': 9647668 }

# TODO: term & definition - e.g., Adventuring Gear with the bold, italic lead in to the paragraph
# TODO: table header / capturing tables
# TODO: sidebars like "Self-Sufficiency" on page 73E
# TODO: handle two back-to-back sections, like "Using Ablility" followed by "Scores" on 76

# TODO: monster headers for individual monster stat blocks

In [14]:
for i, page in enumerate(doc):
    d = page.get_text('dict')
    blocks = d['blocks']
    for block in blocks:
        if "lines" in block.keys():
            spans = block['lines']
            for span in spans:
                data = span['spans']
                for lines in data:
                    if lines['font'] == CHAPTER['font'] and lines['size'] == CHAPTER['size'] and lines['color'] == CHAPTER['color']:
                        print(f"CHAPTER: {lines['text']}")
                    if lines['font'] == SECTION['font'] and lines['size'] == SECTION['size'] and lines['color'] == SECTION['color']:
                        print(f"  SECTION: {lines['text']}")
                    if lines['font'] == SUBSECTION['font'] and lines['size'] == SUBSECTION['size'] and lines['color'] == SUBSECTION['color']:
                        print(f"    SUBSECTION: {lines['text']}")
                    if lines['font'] == SUBSUBSECTION['font'] and lines['size'] == SUBSUBSECTION['size'] and lines['color'] == SUBSUBSECTION['color']:
                        print(f"      SUB-SUBSECTION: {lines['text']}")

CHAPTER: Races 
    SUBSECTION: Racial Traits 
      SUB-SUBSECTION: Ability Score Increase 
      SUB-SUBSECTION: Age 
      SUB-SUBSECTION: Alignment 
      SUB-SUBSECTION: Size 
      SUB-SUBSECTION: Speed 
      SUB-SUBSECTION: Languages 
      SUB-SUBSECTION: Subraces 
  SECTION: Dwarf 
    SUBSECTION: Dwarf Traits 
      SUB-SUBSECTION: Hill Dwarf 
  SECTION: Elf 
    SUBSECTION: Elf Traits 
      SUB-SUBSECTION: High Elf 
  SECTION: Halfling 
    SUBSECTION: Halfling Traits 
      SUB-SUBSECTION: Lightfoot 
  SECTION: Human 
    SUBSECTION: Human Traits 
  SECTION: Dragonborn 
    SUBSECTION: Dragonborn Traits 
  SECTION: Gnome 
    SUBSECTION: Gnome Traits 
      SUB-SUBSECTION: Rock Gnome 
  SECTION: Half-Elf 
    SUBSECTION: Half-Elf Traits 
  SECTION: Half-Orc 
    SUBSECTION: Half-Orc Traits 
  SECTION: Tiefling 
    SUBSECTION: Tiefling Traits 
CHAPTER: Barbarian 
  SECTION: Class Features 
      SUB-SUBSECTION: Hit Points 
      SUB-SUBSECTION: Proficiencies 
      SUB-SU

In [15]:
import re

def clean_text_lines(text: str) -> str:
    text = text.replace('\t\r \xa0', ' ')
    text = text.replace('\n', ' ')
    text = text.replace('\xa0', ' ')
    text = text.replace('-\xad‐‑', '-')
    text = strip_boilerplate(text)
    return text

In [16]:
# looks good, so let's record wayfinding and make document chunks
def is_heading_block(line, heading_traits) -> bool:
    if line['font'] == heading_traits['font'] and line['size'] == heading_traits['size'] and line['color'] == heading_traits['color']:
        return True
    return False

wayfinding = { 'chapter': 'Legal Information', 'section': None, 'subsection': None, 'subsubsection': None, 'starting_page': None}
chunk_content = ""
doc_chunks = []

for i, page in enumerate(doc):
    page_num = i+1

    d = page.get_text('dict')
    blocks = d['blocks']
    for block in blocks:
        if "lines" in block.keys():
            spans = block['lines']
            for span in spans:
                data = span['spans']
                for lines in data:
                    found_heading = False
                    trimmed_text = clean_text_lines(lines['text'].strip())
                    # clone last heading
                    last_heading = wayfinding.copy()

                    # each new heading resets the wayfinding for the lower levels
                    if is_heading_block(lines, CHAPTER):
                        wayfinding['chapter'] = trimmed_text
                        wayfinding['section'] = None
                        wayfinding['subsection'] = None
                        wayfinding['subsubsection'] = None
                        found_heading = True

                    if is_heading_block(lines, SECTION):
                        wayfinding['section'] = trimmed_text
                        wayfinding['subsection'] = None
                        wayfinding['subsubsection'] = None
                        found_heading = True

                    if is_heading_block(lines, SUBSECTION):
                        wayfinding['subsection'] = trimmed_text
                        wayfinding['subsubsection'] = None
                        found_heading = True

                    if is_heading_block(lines, SUBSUBSECTION):
                        wayfinding['subsubsection'] = trimmed_text
                        found_heading = True

                    if not found_heading:
                        chunk_content += trimmed_text + " "
                    else:
                        # end the content of the previous chunk, if there was any
                        if chunk_content != "":
                            previous_chunk = {
                                'location': last_heading,
                                'content': chunk_content
                            }
                            doc_chunks.append(previous_chunk)
                        chunk_content = ""
                        wayfinding['starting_page'] = page_num

# finish the last chunk
previous_chunk = {
    'location': wayfinding,
    'content': chunk_content
}
doc_chunks.append(previous_chunk)

print(len(doc_chunks))
pprint(doc_chunks[100])

1392
{'content': 'Drawing on the divine essence of nature itself, you can cast '
            'spells to shape that essence to your will. System Reference '
            'Document 5.1 20  ',
 'location': {'chapter': 'Druid',
              'section': 'Class Features',
              'starting_page': 19,
              'subsection': 'Spellcasting',
              'subsubsection': None}}


In [17]:
# show 5 randomly selected chunks
import random
for i in range(5):
    chunk_index = random.randint(0, len(doc_chunks))
    pprint(doc_chunks[chunk_index])

{'content': 'Weapon (any sword), legendary (requires attunement) You gain a +3 '
            'bonus to attack and damage rolls made with this magic weapon. The '
            'first time you attack with the sword on each of your turns, you '
            'can transfer some or all of the sword’s bonus to your Armor '
            'Class, instead of using the bonus on any attacks that turn. For '
            'example, you could reduce the bonus to your attack and damage '
            'rolls to +1 and gain a +2 bonus to AC. The adjusted bonuses '
            'remain in effect until the start of your next turn, although you '
            'must hold the sword to gain a bonus to AC from it. ',
 'location': {'chapter': 'Magic Items',
              'section': 'Magic Items A-Z',
              'starting_page': 218,
              'subsection': None,
              'subsubsection': 'Defender'}}
{'content': 'Wondrous item, rarity varies (requires attunement) An Ioun stone '
            'is named after 