In [1]:
import os
import json
import re
import string
from collections import Counter
import random
from tempfile import mkdtemp
import shutil
import pickle

from tqdm.notebook import tqdm
from IPython.display import clear_output, display

import pyarrow as pa
from sentence_transformers import SentenceTransformer
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
from pydantic import BaseModel

In [2]:
# Input parameters: which corpus snapshot is read.
DOCUMENT_VERSION = '03'
# The corpus folder name carries the document version as a 'v' suffix.
CORPUS_FOLDER = '/jupyterlab/corpus/eberron/v' + DOCUMENT_VERSION

In [3]:
# Output parameters: where the finished artefact is published.
ARTEFACT_VERSION = '03'
# The root folder can be overridden through the environment; '/artefact' is the fallback.
ARTEFACT_ROOT_FOLDER = os.getenv('ARTEFACT_ROOT_FOLDER', '/artefact')
ARTEFACT_FOLDER = os.path.join(ARTEFACT_ROOT_FOLDER, 'eberron', 'v' + ARTEFACT_VERSION)

### Read Document Metadata

In [4]:
# Load the per-document metadata list and index it by file name with the last
# four characters removed (presumably a '.pdf' extension - not verified here).
with open(os.path.join(CORPUS_FOLDER, 'metadata.json')) as f:
    metadata = json.load(f)
metadata_dict = {data['filename'][:-4]: data for data in metadata}

### First Pass (Fast): Split Into Sections

In [5]:
def is_integer_string(value):
    """
    Tell whether ``int()`` accepts the given value.

    :param value: The value to check (normally a string).
    :return: True when ``int(value)`` succeeds, False when it raises
        ``ValueError`` or ``TypeError``.
    """
    try:
        int(value)
    except (ValueError, TypeError):
        return False
    else:
        return True

In [6]:
# Manual overrides: book title derived from the file name -> title to use instead.
title_fixes = {'881665-eberron_cannith_cat2': 'House Cannith Catalogue #2'}

In [7]:
# Lines that are never accepted as section titles (exact, case-sensitive match).
# One entry per line: a missing comma between two adjacent string literals makes
# Python concatenate them silently, which previously merged
# 'Wizard Level Feature' and 'Languages —' into a single, useless entry.
disallowed_section_titles = {
    'Actions',
    'Reactions',
    'Cannith Catalogue 2',
    'THREAT DISPATCH',
    'Legendary Actions',
    'Wizard Level Feature',
    'Languages —',
    '-',
    'Finesse',
    'Weight Properties',
    'Player’s Handbook',
    'Spell Resistance:',
    'Capital: Korth',
    'Combat',
    'Capital: Rhukaan Draal',
    'STR DEX CON INT WIS CHA',
    'Favor',
    'DIALECTS',
    'WIS CHA',
    'CLASS FEATURES',
}
# Patterns for lines that are never accepted as section titles. Each pattern is
# applied with re.match (anchored at the start of the line) and re.IGNORECASE.
# Fixes compared to the previous version:
#   * a comma was missing after the 'Hit Points' pattern, so it was silently
#     concatenated with the 'Challenge' pattern into one regexp;
#   * the ordinal pattern listed st/nd/th but not rd, so "3rd ... level" lines
#     were the only ordinals that were not rejected.
disallowed_section_title_regexp = [
    r'Skills\s+.+\+[0-9].*',
    r'Saving Throws\s+.+\+[0-9].*',
    r'.*\-level.*feature',
    r'Languages.*Common.*',
    r'^[0-9\s.\(\)]+$',
    r'Hit Points\s+[0-9]+.*',
    r'Challenge\s+[0-9]+.*',
    r'Damage Immunities.*',
    r'Damage Resistances.*',
    r'Level Adjustment:.*',
    r'Challenge Rating:.*',
    r'Initiative:.*',
    r'Treasure:.*',
    r'Environment: .*',
    r'Skills: .*',
    r'Feats: .*',
    r'Organization: .*',
    r'Base Atk .*',
    r'Base Attack .*',
    r'Special Attacks: .*',
    r'Range: .*',
    r'Spell Resistance: .*',
    r'Graft Location: .*',
    r'Weight: .*',
    r'Light: .*\.',
    r'Scripts: .*',
    r'Script: .*',
    r'Speakers: .*',
    r'Format: .*',
    # NOTE(review): the parentheses here form a regexp group, they do not match
    # literal '(' / ')'. Confirm whether 'Knowledge (arcana)' was the intended target.
    r'Knowledge ([a-z]+)',
    r'[0-9](st|nd|rd|th).*level.*',
    r'.*KORRANBERG CHRONICLE: THREAT DISPATCH',
    r'SIDEBAR: .*',
    r'WIZARDS OF EBERRON: FIVE ARCANE TRADITIONS [0-9]+.*',
    r'.*DUNGEON DECEMBER 2004',
    r'.*[cves]\s+[0-9]+',
]

In [8]:
# File names (without the '.md' extension) that the section pass skips entirely.
ignore_list = {
    'Eberron Character Sheet',
    'New & Expanded Feat List',
    '476764-sample',
    'SharnIndexIntegrated',
}

In [9]:
# First pass over the corpus: read every markdown file line by line, detect
# section titles heuristically and collect the text of each section together
# with its metadata. sections[i] and section_metadata[i] describe the same chunk.
sections = []
section_metadata = []
book_count = 0  # NOTE(review): never read or updated in this cell

# NOTE(review): os.listdir returns entries in arbitrary order, so the order of
# the collected sections depends on the filesystem.
for file_count, file_name in enumerate(tqdm(os.listdir(CORPUS_FOLDER))):
    # Only '.md' files that are not on the ignore list are processed.
    if file_name[-3:] != '.md':
        continue
    if file_name[:-3] in ignore_list:
        continue
    book_title = file_name[:-3]
    pdf_title = metadata_dict[file_name[:-3]]['pdf/title']
    edition = metadata_dict[file_name[:-3]]['edition']
    # Files whose name ends in 'sample' take the title stored in the metadata,
    # when one is present.
    if pdf_title:
        if book_title.endswith('sample'):
            book_title = pdf_title
    book_title = title_fixes.get(book_title, book_title)
    # Parser state, reset for every file.
    current_page = 0
    current_section_title = ''
    current_section_lines = []
    empty_line_ctr = 0  # NOTE(review): assigned but not used below
    file_path = os.path.join(CORPUS_FOLDER, file_name)
    with open(file_path) as f:
        document = f.read()
        lines = document.split('\n')
        line_count = len(lines)
        for line_no, line in enumerate(lines):
            # Stripped neighbours of the current line; '\0' is a sentinel at the
            # file boundaries so that they never compare equal to ''.
            previous_line = lines[line_no - 1].strip() if line_no > 0 else '\0'
            next_line = lines[line_no + 1].strip() if line_no < line_count - 1 else '\0'
            current_line = line.strip()

            # Blank lines carry no content; they only matter as neighbours (above).
            if not current_line:
                continue

            # '## Page N' marker lines update the current page and are not
            # added to any section.
            m = re.match(r'##\s+Page\s+([0-9]+)', line)
            if m:
                current_page = int(m[1])
                continue

            # Size of the section collected so far, counted by splitting on single
            # spaces (an empty section therefore counts as 1).
            current_section_word_count = len(" ".join(current_section_lines).split(' '))
    
            # A title candidate is preceded by a blank line and followed either by
            # a blank line or by a line starting with 'Medium', 'Large' or
            # 'District Type'.
            if previous_line == '' and (next_line == '' or next_line.startswith('Medium') or next_line.startswith('Large') or next_line.startswith('District Type')):
                may_be_section_title = True
                # Reject lines that are neither all upper case nor in capwords form.
                if current_line.upper() != current_line and string.capwords(current_line) != current_line:
                    may_be_section_title = False
                # Reject a repeat of the current title (case-insensitive).
                if current_section_title.lower() == current_line.lower():
                    may_be_section_title = False
                # Reject plain integers.
                if may_be_section_title and is_integer_string(current_line):
                    may_be_section_title = False
                # Reject exact matches from the disallowed set.
                if may_be_section_title and current_line in disallowed_section_titles:
                    may_be_section_title = False
                # Reject lines with fewer than four ASCII letters / periods.
                if len(re.sub(r'[^a-zA-Z\.]', '', current_line)) < 4:
                    may_be_section_title = False
                # Reject anything containing a comma.
                if ',' in current_line:
                    may_be_section_title = False
                # Reject lines matching one of the disallowed patterns
                # (anchored at the start of the line, case-insensitive).
                if may_be_section_title:
                    for regexp in disallowed_section_title_regexp:
                        m = re.match(regexp, current_line, re.IGNORECASE)
                        if m:
                            may_be_section_title = False

                if may_be_section_title:
                    # Close the previous section. It is stored only when it has
                    # more than 15 words; shorter sections are dropped.
                    if current_section_title:
                        if current_section_lines and current_section_word_count > 15:
                            text = " ".join(current_section_lines)
                            if len(text.split(' ')) > 5:
                                sections.append(text)
                                section_metadata.append({
                                    'book_title': book_title,
                                    'file_name': file_name,
                                    'edition': edition,
                                    'section_title': current_section_title,
                                    'section_pages': current_section_pages,
                                    'initial_word_count': current_section_word_count
                                })
                    # Start the new section; the title line itself is not part of
                    # the section text.
                    current_section_title = current_line
                    current_section_lines = []
                    current_section_pages = (current_page, current_page)
                    continue

            # Oversized section: once more than 450 words have been collected, a
            # chunk is emitted under the current title. Its text is rebuilt from a
            # slice of the raw `lines` (empty lines and lines starting with
            # '## Page ' removed) and joined with newlines, unlike the
            # space-joined text used elsewhere.
            # NOTE(review): `start` subtracts a count of non-blank lines from an
            # index into the raw lines, and `end` reaches 10 raw lines past the
            # current one - confirm that this window is the intended one.
            if current_section_word_count > 450:
                end = line_no + 10
                start = line_no - len(current_section_lines) + 10
                text = "\n".join([l for l in lines[start:end] if l and not l.startswith('## Page ')])
                sections.append(text)
                section_metadata.append({
                    'book_title': book_title,
                    'file_name': file_name,
                    'edition': edition,
                    'section_title': current_section_title,
                    'section_pages': current_section_pages,
                    'initial_word_count': len(text.split(' '))
                })
                # The current line is not appended to the next chunk's line list.
                current_section_lines = []
                current_section_pages = (current_page, current_page)
                continue

            # Ordinary content line: collected only once a title has been seen;
            # the section's page range is extended to the current page.
            if current_section_title and current_line:
                current_section_lines.append(current_line)
                current_section_pages = (current_section_pages[0], current_page)
        # End of file: store whatever remains of the last section
        # (no minimum-length check is applied here).
        if current_section_lines:
            text = ' '.join(current_section_lines)
            sections.append(text)
            section_metadata.append({
                'book_title': book_title,
                'file_name': file_name,
                'edition': edition,
                'section_title': current_section_title,
                'section_pages': current_section_pages,
                'initial_word_count': len(text.split(' '))
            })
# Sanity checks for the first pass: texts and metadata must line up, and a set
# of section titles known to exist in the corpus must have been detected.
section_count = len(sections)
assert len(section_metadata) == section_count, (
    f'{len(section_metadata)} metadata entries for {section_count} sections'
)
# NOTE(review): these two positional checks depend on which file os.listdir
# yields first, so they may fail on another filesystem.
assert section_metadata[1]['section_title'] == 'Litmus Strips'
assert section_metadata[11]['section_title'] == 'Automatic Chatelaine'
section_titles = {d['section_title'] for d in section_metadata}
# Each expected title is listed once; previously 'DARKLEAF' was asserted four times.
expected_section_titles = [
    'Automatic Chatelaine',
    'HORRID HYENA',
    'HORRID BADGER',
    'NAZTHARUNE RAKSHASA',
    'CRYSTEEL',
    'DENDRITIC',
    'IRONBARK',
    'DARKLEAF',
    'KNIGHT PHANTOM',
    'SHARN SKYMAGE',
    'WEAPONS OF KHORVAIRE',
    'ADVENTURING GEAR',
    '7. The Library',
    '8. The Entry Hall',
    'THE ORIGIN OF THE FIVE NATIONS',
    'AUNDAIR AT A GLANCE',
    'THE COMING OF GALIFAR',
    'Highhold',
    'PERIPLANAR OF ICE ~ PERISIAN',
    'HOUSE THARASHK',
    'ARGON',
    'METRON',
]
# Report every missing title at once instead of stopping at the first one.
missing_section_titles = [
    title for title in expected_section_titles if title not in section_titles
]
assert not missing_section_titles, (
    f'Expected section titles not found: {missing_section_titles}'
)
section_count, book_title

  0%|          | 0/127 [00:00<?, ?it/s]

(21118, 'Magic of Eberron')

In [10]:
# TODO: For second pass, delete:
# Contents
# Thanks
# CREDITS


### Embed

In [11]:
# Embedding model, identified by name and loaded at a fixed revision.
model_name = 'Alibaba-NLP/gte-base-en-v1.5'
model_revision = 'a829fd0e060bb84554da0dfd354d0de0f7712b7f'
# model_name = 'sentence-transformers/all-MiniLM-L6-v2'
# Load the model and move it to the GPU in a single expression.
model = SentenceTransformer(
    model_name,
    trust_remote_code=True,
    revision=model_revision,
).to("cuda")




2025-01-23 19:26:05.901616: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-23 19:26:05.921589: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-23 19:26:05.927581: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [30]:
# Encode the sections one at a time, requesting normalised vectors.
embeddings = [
    model.encode(chunk, normalize_embeddings=True)
    for chunk in tqdm(sections)
]

  0%|          | 0/21118 [00:00<?, ?it/s]

In [31]:
# Every vector must have as many elements as the first one; show that size.
assert all(vector.size == embeddings[0].size for vector in embeddings)
embeddings[0].size

768

In [117]:
# Inspection only: show the Python type of a single embedding.
type(embeddings[0])

numpy.ndarray

### Save

In [102]:
# Fresh temporary directory in which the artefact files are assembled.
tmp_artefact_folder = mkdtemp()

In [103]:
# The LanceDB database lives in an 'embeddings' sub-folder of the temporary
# artefact folder; the folder is created if needed before connecting.
embeddings_folder = os.path.join(tmp_artefact_folder, 'embeddings')
os.makedirs(embeddings_folder, exist_ok=True)
db = lancedb.connect(embeddings_folder)

In [104]:
# Fetch the 'sentence-transformers' entry from LanceDB's embedding registry and
# create an embedding function for the model name used in this notebook.
# NOTE(review): only the name is passed here; model_revision is not - confirm
# whether this function should be pinned to the same revision as `model`.
model_registry = get_registry().get('sentence-transformers')
sentence_transformer_embeddings = model_registry.create(name=model_name)

In [105]:
class MetaData(BaseModel):
    """Descriptive fields stored alongside each section (nested in ``Document.metadata``)."""
    book_title: str
    file_name: str
    edition: str
    section_title: str
    # First and last page of the section, taken from its 'section_pages' pair.
    page_from: int
    page_to: int
    initial_word_count: int


In [106]:
class Document(LanceModel):
    """
    Row schema of the LanceDB table that stores the embedded sections.

    ``text`` is the source column and ``vector`` the vector column of the
    sentence-transformers embedding function. The descriptive fields are stored
    twice: nested in ``metadata`` and as flat top-level columns.
    """
    text: str = sentence_transformer_embeddings.SourceField()
    # VectorField must be *called*: it returns the field definition that marks
    # this column as the embedding function's vector column. Assigning the bare
    # method left the column unregistered with the embedding function.
    vector: Vector(embeddings[0].size) = sentence_transformer_embeddings.VectorField()
    metadata: MetaData
    book_title: str
    file_name: str
    edition: str
    section_title: str
    page_from: int
    page_to: int
    initial_word_count: int


In [107]:
table_name = "documents"
# Reuse the table when it already exists; otherwise create it from the Document schema.
if table_name in db.table_names():
    table = db.open_table(table_name)
else:
    table = db.create_table(table_name, schema=Document)

In [108]:
    # Preview: build the MetaData record of the last section as a schema check.
    # An explicit index is used instead of a loop variable left over from
    # another cell, so this cell also works on a freshly restarted kernel.
    preview_metadata = section_metadata[-1]
    MetaData(
        book_title=preview_metadata['book_title'],
        file_name=preview_metadata['file_name'],
        edition=preview_metadata['edition'],
        section_title=preview_metadata['section_title'],
        page_from=preview_metadata['section_pages'][0],
        page_to=preview_metadata['section_pages'][1],
        initial_word_count=preview_metadata['initial_word_count'],
    )

MetaData(book_title='Magic of Eberron', file_name='Magic of Eberron.md', edition='3e', section_title='Advancement: None', page_from=160, page_to=160, initial_word_count=150)

In [109]:
# Nothing may have been written to the table's data folder before the bulk insert.
assert not os.path.exists(os.path.join(embeddings_folder, 'documents.lance', 'data'))
data_to_insert = []
for i in range(len(sections)):
    meta = section_metadata[i]
    pages = meta['section_pages']
    # Flat columns: every metadata key except the page pair (original key order
    # kept), followed by the two page columns, the vector and the text.
    row = {key: value for key, value in meta.items() if key != 'section_pages'}
    row['page_from'] = pages[0]
    row['page_to'] = pages[1]
    row['vector'] = embeddings[i]
    row['text'] = sections[i]
    # The same descriptive fields once more, nested under 'metadata'.
    row['metadata'] = {
        'book_title': meta['book_title'],
        'file_name': meta['file_name'],
        'edition': meta['edition'],
        'section_title': meta['section_title'],
        'page_from': pages[0],
        'page_to': pages[1],
        'initial_word_count': meta['initial_word_count'],
    }
    data_to_insert.append(row)
    # Progress indicator, refreshed every 500 rows.
    if i % 500 == 0:
        clear_output()
        display(f'{i} / {len(sections)}')
table.add(data_to_insert)
# After the insert the data folder must exist.
assert os.path.exists(os.path.join(embeddings_folder, 'documents.lance', 'data'))

'21000 / 21118'

In [110]:
# Describe the artefact (versions, chunk count, embedding model) and store the
# description as a pickle file next to the embeddings.
model_description = str(model).replace('\n', '')
model_metadata = {
    'version': ARTEFACT_VERSION,
    'document_version': DOCUMENT_VERSION,
    'chunk_count': len(sections),
    'embedding_format': 'lancedb',
    'embedding_model': {
        'name': model_name,
        'str': model_description,
        'revision': model_revision,
    },
}
model_metadata_path = os.path.join(tmp_artefact_folder, 'model_metadata.pkl')
with open(model_metadata_path, 'wb') as f:
    pickle.dump(model_metadata, f)

In [111]:
# Publish the artefact: move the freshly built folder to ARTEFACT_FOLDER. An
# existing artefact is parked in a temporary backup first and is only deleted
# once the new one is in place.
backup_made = False
if os.path.exists(ARTEFACT_FOLDER):
    tmp_backup_folder = mkdtemp()
    # Moving into an existing directory places the folder *inside* it, i.e. at
    # tmp_backup_folder/<last path component of ARTEFACT_FOLDER>.
    shutil.move(ARTEFACT_FOLDER, tmp_backup_folder)
    backup_made = True
try:
    shutil.move(tmp_artefact_folder, ARTEFACT_FOLDER)
except Exception:
    # Do not leave the system without an artefact: discard any partial copy and
    # put the previous version back before re-raising the original error.
    if backup_made:
        if os.path.exists(ARTEFACT_FOLDER):
            shutil.rmtree(ARTEFACT_FOLDER)
        backup_name = os.path.basename(os.path.normpath(ARTEFACT_FOLDER))
        shutil.move(os.path.join(tmp_backup_folder, backup_name), ARTEFACT_FOLDER)
    raise
if backup_made:
    shutil.rmtree(tmp_backup_folder)

### Cross-check
### Do Not Remove, Do Not Ignore, Run To Make Sure Things Are There

In [112]:
# The published artefact must contain a model metadata file under either name.
assert any(
    os.path.exists(os.path.join(ARTEFACT_FOLDER, metadata_file))
    for metadata_file in ('model_metadata.pkl', 'model_metadata.json')
)

In [113]:
# Reload the published metadata and verify its contents and the files it implies.
# (Unpickling is acceptable here because the file was written by this notebook.)
with open(os.path.join(ARTEFACT_FOLDER, 'model_metadata.pkl'), 'rb') as f:
    model_metadata = pickle.load(f)

published_embeddings_folder = os.path.join(ARTEFACT_FOLDER, 'embeddings')
assert model_metadata['embedding_model']['str'].startswith('SentenceTransformer')
assert 'version' in model_metadata
assert model_metadata['version'] == ARTEFACT_VERSION
assert 'embedding_format' in model_metadata
if model_metadata['embedding_format'] == 'lancedb':
    assert os.path.exists(published_embeddings_folder)
assert os.path.exists(os.path.join(published_embeddings_folder, 'documents.lance', 'data'))

In [114]:
# Open the published table and make sure rows can be read back from it.
published_db = lancedb.connect(os.path.join(ARTEFACT_FOLDER, 'embeddings'))
test_table = published_db.open_table(table_name)
df = test_table.head().to_pandas()
assert len(df) == 5
df

[2025-01-23T20:24:25Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-23T20:24:25Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.


Unnamed: 0,text,vector,metadata,book_title,file_name,edition,section_title,page_from,page_to,initial_word_count
0,Cannith Catalogue 2 The following sections des...,"[0.043904938, -0.071870774, 0.06289965, 0.0281...","{'book_title': 'House Cannith Catalogue #2', '...",House Cannith Catalogue #2,881665-eberron_cannith_cat2.md,5e,DAVIDE QUATRINTI'S,1,2,30
1,This carnet contains ten strips of parchment a...,"[-0.012223399, -0.04427961, -0.0037638887, -0....","{'book_title': 'House Cannith Catalogue #2', '...",House Cannith Catalogue #2,881665-eberron_cannith_cat2.md,5e,Litmus Strips,2,2,150
2,Made of an extremely robust leather obtained b...,"[-0.013698697, -0.028550351, 0.002355276, 0.00...","{'book_title': 'House Cannith Catalogue #2', '...",House Cannith Catalogue #2,881665-eberron_cannith_cat2.md,5e,Muck Springers,2,2,89
3,"An apparently ordinary item, this box is made ...","[0.031405162, -0.040899698, 0.10993457, 0.0552...","{'book_title': 'House Cannith Catalogue #2', '...",House Cannith Catalogue #2,881665-eberron_cannith_cat2.md,5e,Metaprocessor,2,2,66
4,"A mass of writhing worms (50% of probability),...","[-0.019326776, -0.027411627, 0.043334335, -0.0...","{'book_title': 'House Cannith Catalogue #2', '...",House Cannith Catalogue #2,881665-eberron_cannith_cat2.md,5e,Input Output,2,2,101
