In [74]:
import os
import json
import re
import string
from collections import Counter
import random
from tempfile import mkdtemp
import shutil
import pickle
from typing import Optional

from tqdm.notebook import tqdm
from IPython.display import clear_output, display

import pyarrow as pa
from sentence_transformers import SentenceTransformer
import sentence_transformers
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
from pydantic import BaseModel

In [2]:
# Input parameters: which version of the corpus to read, and where it lives.
DOCUMENT_VERSION = '03'
CORPUS_FOLDER = '/jupyterlab/corpus/eberron/v' + DOCUMENT_VERSION

In [3]:
# Output parameters: version of the produced artefact and its destination folder.
# The root can be overridden through the ARTEFACT_ROOT_FOLDER environment variable.
ARTEFACT_VERSION = '04'
ARTEFACT_ROOT_FOLDER = os.getenv('ARTEFACT_ROOT_FOLDER', '/artefact')
ARTEFACT_FOLDER = os.path.join(
    ARTEFACT_ROOT_FOLDER,
    'eberron',
    'v' + ARTEFACT_VERSION,
)

### Read Document Metadata

In [4]:
# Load the per-document metadata records shipped with the corpus.
corpus_metadata_path = os.path.join(CORPUS_FOLDER, 'metadata.json')
with open(corpus_metadata_path) as f:
    metadata = json.load(f)

# Index the records by file name with its last four characters removed
# (presumably a '.pdf' extension - TODO confirm against metadata.json).
metadata_dict = {record['filename'][:-4]: record for record in metadata}

### First Pass (Fast): Split Into Sections

In [5]:
def is_integer_string(value):
    """
    Tell whether ``int(value)`` succeeds for the given value.

    :param value: The value (normally a string) to test.
    :return: True when ``int(value)`` can parse it, False when the conversion
             raises ``ValueError`` or ``TypeError``.
    """
    try:
        int(value)
    except (ValueError, TypeError):
        return False
    else:
        return True

In [6]:
# Manual book-title overrides, keyed by the title derived from the file name.
title_fixes = dict([
    ('881665-eberron_cannith_cat2', 'House Cannith Catalogue #2'),
])

In [7]:
# Exact strings that must never be treated as a section title, even when they
# pass the layout/capitalisation heuristics of the first pass.
# One entry per line with a trailing comma: a missing comma between two adjacent
# string literals silently concatenates them into a single (useless) entry.
disallowed_section_titles = {
    'Actions',
    'Reactions',
    'Cannith Catalogue 2',
    'THREAT DISPATCH',
    'Legendary Actions',
    'Wizard Level Feature',
    'Languages —',
    '-',
    'Finesse',
    'Weight Properties',
    'Player’s Handbook',
    'Spell Resistance:',
    'Capital: Korth',
    'Combat',
    'Capital: Rhukaan Draal',
    'STR DEX CON INT WIS CHA',
    'Favor',
    'DIALECTS',
    'WIS CHA',
    'CLASS FEATURES',
}
# Patterns (applied with re.match and re.IGNORECASE by the first pass) for lines
# that look like titles but are really stat-block rows, headers/footers, etc.
# Every pattern is followed by a comma: adjacent raw-string literals without a
# comma are concatenated by Python into one pattern.
# NOTE(review): '[0-9](st|nd|th).*level.*' has no 'rd' alternative, so it does not
# cover '3rd ... level' on its own - confirm whether that is intentional.
disallowed_section_title_regexp = [
    r'Skills\s+.+\+[0-9].*',
    r'Saving Throws\s+.+\+[0-9].*',
    r'.*\-level.*feature',
    r'Languages.*Common.*',
    r'^[0-9\s.\(\)]+$',
    r'Hit Points\s+[0-9]+.*',
    r'Challenge\s+[0-9]+.*',
    r'Damage Immunities.*',
    r'Damage Resistances.*',
    r'Level Adjustment:.*',
    r'Challenge Rating:.*',
    r'Initiative:.*',
    r'Treasure:.*',
    r'Environment: .*',
    r'Skills: .*',
    r'Feats: .*',
    r'Organization: .*',
    r'Base Atk .*',
    r'Base Attack .*',
    r'Special Attacks: .*',
    r'Range: .*',
    r'Spell Resistance: .*',
    r'Graft Location: .*',
    r'Weight: .*',
    r'Light: .*\.',
    r'Scripts: .*',
    r'Script: .*',
    r'Speakers: .*',
    r'Format: .*',
    r'Knowledge ([a-z]+)',
    r'[0-9](st|nd|th).*level.*',
    r'.*KORRANBERG CHRONICLE: THREAT DISPATCH',
    r'SIDEBAR: .*',
    r'WIZARDS OF EBERRON: FIVE ARCANE TRADITIONS [0-9]+.*',
    r'.*DUNGEON DECEMBER 2004',
    r'.*[cves]\s+[0-9]+',
]

In [8]:
# Corpus files (named without the '.md' extension) that are skipped entirely.
ignore_list = {
    'Eberron Character Sheet',
    'New & Expanded Feat List',
    '476764-sample',
    'SharnIndexIntegrated',
}

In [9]:
# First pass over the corpus: walk every Markdown file in CORPUS_FOLDER, detect
# section titles heuristically and collect each section's text in `sections`,
# with a parallel list of metadata dicts in `section_metadata`.
sections = []
section_metadata = []
book_count = 0  # NOTE(review): never updated or read in this cell

# NOTE(review): os.listdir() returns entries in arbitrary order, so the order of
# `sections` depends on the file system - confirm nothing relies on it.
for file_count, file_name in enumerate(tqdm(os.listdir(CORPUS_FOLDER))):
    # Only '.md' files that are not explicitly ignored are processed.
    if file_name[-3:] != '.md':
        continue
    if file_name[:-3] in ignore_list:
        continue
    # Default book title is the file name without '.md'; a missing entry in
    # metadata_dict raises KeyError here.
    book_title = file_name[:-3]
    pdf_title = metadata_dict[file_name[:-3]]['pdf/title']
    edition = metadata_dict[file_name[:-3]]['edition']
    # Files whose name ends in 'sample' use the PDF title instead, when there is one.
    if pdf_title:
        if book_title.endswith('sample'):
            book_title = pdf_title
    book_title = title_fixes.get(book_title, book_title)
    # Per-file parser state.
    current_page = 0
    current_section_title = ''
    current_section_lines = []
    empty_line_ctr = 0  # NOTE(review): not used anywhere in this cell
    file_path = os.path.join(CORPUS_FOLDER, file_name)
    with open(file_path) as f:
        document = f.read()
        lines = document.split('\n')
        line_count = len(lines)
        for line_no, line in enumerate(lines):
            # Neighbouring lines (stripped); '\0' is a sentinel at the file boundaries
            # so that the first/last line never counts as "surrounded by blank lines".
            previous_line = lines[line_no - 1].strip() if line_no > 0 else '\0'
            next_line = lines[line_no + 1].strip() if line_no < line_count - 1 else '\0'
            current_line = line.strip()

            if not current_line:
                continue

            # '## Page N' markers update the page counter and are not part of the text.
            m = re.match(r'##\s+Page\s+([0-9]+)', line)
            if m:
                current_page = int(m[1])
                continue

            # Word count of the text accumulated so far. Joining an empty list and
            # splitting on ' ' yields [''], so this is 1 (not 0) for an empty section.
            current_section_word_count = len(" ".join(current_section_lines).split(' '))
    
            # Title candidate: preceded by a blank line and followed by a blank line
            # or by a line starting with 'Medium', 'Large' or 'District Type'.
            if previous_line == '' and (next_line == '' or next_line.startswith('Medium') or next_line.startswith('Large') or next_line.startswith('District Type')):
                may_be_section_title = True
                # Must be either all upper case or capitalised word by word.
                if current_line.upper() != current_line and string.capwords(current_line) != current_line:
                    may_be_section_title = False
                # A repeat of the current title (case-insensitive) does not start a new section.
                if current_section_title.lower() == current_line.lower():
                    may_be_section_title = False
                if may_be_section_title and is_integer_string(current_line):
                    may_be_section_title = False
                if may_be_section_title and current_line in disallowed_section_titles:
                    may_be_section_title = False
                # Require at least four characters that are ASCII letters or dots.
                if len(re.sub(r'[^a-zA-Z\.]', '', current_line)) < 4:
                    may_be_section_title = False
                # Lines containing a comma are never titles.
                if ',' in current_line:
                    may_be_section_title = False
                if may_be_section_title:
                    for regexp in disallowed_section_title_regexp:
                        m = re.match(regexp, current_line, re.IGNORECASE)
                        if m:
                            may_be_section_title = False

                if may_be_section_title:
                    # Flush the previous section before starting the new one; sections
                    # whose accumulated word count is 15 or less are dropped.
                    if current_section_title:
                        if current_section_lines and current_section_word_count > 15:
                            text = " ".join(current_section_lines)
                            if len(text.split(' ')) > 5:
                                sections.append(text)
                                section_metadata.append({
                                    'book_title': book_title,
                                    'file_name': file_name,
                                    'edition': edition,
                                    'section_title': current_section_title,
                                    'section_pages': current_section_pages,
                                    'initial_word_count': current_section_word_count
                                })
                    current_section_title = current_line
                    current_section_lines = []
                    # (first page, last page) of the section; first assigned here.
                    current_section_pages = (current_page, current_page)
                    continue

            # Long sections are cut into chunks once more than 450 words have accumulated.
            if current_section_word_count > 450:
                # The chunk text is re-sliced from the raw file lines (joined with
                # newlines) instead of being taken from current_section_lines; the
                # window ends 10 raw lines after the current line.
                # NOTE(review): current_section_lines excludes blank and page-marker
                # lines while start/end index the raw lines, so the window may not line
                # up exactly with the accumulated text, and it overlaps the following
                # chunk - confirm this is intended.
                end = line_no + 10
                start = line_no - len(current_section_lines) + 10
                text = "\n".join([l for l in lines[start:end] if l and not l.startswith('## Page ')])
                sections.append(text)
                section_metadata.append({
                    'book_title': book_title,
                    'file_name': file_name,
                    'edition': edition,
                    'section_title': current_section_title,
                    'section_pages': current_section_pages,
                    'initial_word_count': len(text.split(' '))
                })
                current_section_lines = []
                current_section_pages = (current_page, current_page)
                # The current line is not appended to the next chunk's line buffer.
                continue

            # Body text is only collected once a section title has been seen in this file.
            if current_section_title and current_line:
                current_section_lines.append(current_line)
                current_section_pages = (current_section_pages[0], current_page)
        # End of file: flush whatever is left (no minimum word count applies here).
        if current_section_lines:
            text = ' '.join(current_section_lines)
            sections.append(text)
            section_metadata.append({
                'book_title': book_title,
                'file_name': file_name,
                'edition': edition,
                'section_title': current_section_title,
                'section_pages': current_section_pages,
                'initial_word_count': len(text.split(' '))
            })
# Sanity checks on the first pass: the two parallel lists must line up and a set
# of known section titles must have been detected.
section_count = len(sections)
assert len(section_metadata) == section_count, 'sections and section_metadata are out of sync'

# Positional checks. NOTE(review): these depend on the order in which the corpus
# files were visited - confirm that order is stable on the target file system.
assert section_metadata[1]['section_title'] == 'Litmus Strips', section_metadata[1]['section_title']
assert section_metadata[11]['section_title'] == 'Automatic Chatelaine', section_metadata[11]['section_title']

section_titles = {d['section_title'] for d in section_metadata}
expected_section_titles = {
    'Automatic Chatelaine',
    'HORRID HYENA',
    'HORRID BADGER',
    'NAZTHARUNE RAKSHASA',
    'CRYSTEEL',
    'DENDRITIC',
    'IRONBARK',
    'DARKLEAF',
    'KNIGHT PHANTOM',
    'SHARN SKYMAGE',
    'WEAPONS OF KHORVAIRE',
    'ADVENTURING GEAR',
    '7. The Library',
    '8. The Entry Hall',
    'THE ORIGIN OF THE FIVE NATIONS',
    'AUNDAIR AT A GLANCE',
    'THE COMING OF GALIFAR',
    'Highhold',
    'PERIPLANAR OF ICE ~ PERISIAN',
    'HOUSE THARASHK',
    'ARGON',
    'METRON',
}
# Report every missing title at once instead of stopping at the first one.
missing_section_titles = expected_section_titles - section_titles
assert not missing_section_titles, f'Expected section titles not found: {sorted(missing_section_titles)}'
section_count, book_title

  0%|          | 0/127 [00:00<?, ?it/s]

(21118, 'Magic of Eberron')

In [10]:
# TODO: For second pass, delete:
# Contents
# Thanks
# CREDITS


In [11]:
ls -al /jupyterlab/models/hf/hub

total 56
drwxr-xr-x 13 root root 4096 Feb  4 20:42 [0m[01;34m.[0m/
drwxr-xr-x  4 root root 4096 Jan  3 21:52 [01;34m..[0m/
drwxr-xr-x 12 root root 4096 Feb  4 20:42 [01;34m.locks[0m/
drwxr-xr-x  6 root root 4096 Jan  3 04:32 [01;34mmodels--Alibaba-NLP--gte-base-en-v1.5[0m/
drwxr-xr-x  5 root root 4096 Jan  3 04:32 [01;34mmodels--Alibaba-NLP--new-impl[0m/
drwxr-xr-x  6 root root 4096 Feb  4 20:22 [01;34mmodels--BAAI--bge-large-en-v1.5[0m/
drwxr-xr-x  6 root root 4096 Feb  4 20:23 [01;34mmodels--HIT-TMG--KaLM-embedding-multilingual-mini-instruct-v1.5[0m[K/
drwxr-xr-x  6 root root 4096 Feb  4 20:34 [01;34mmodels--intfloat--e5-mistral-7b-instruct[0m/
drwxr-xr-x  5 root root 4096 Feb  4 20:42 [01;34mmodels--jinaai--jina-embeddings-v3[0m/
drwxr-xr-x  6 root root 4096 Jan  4 17:38 [01;34mmodels--mistralai--Mistral-7B-Instruct-v0.3[0m/
drwxr-xr-x  6 root root 4096 Feb  3 18:45 [01;34mmodels--mistralai--Mistral-Small-24B-Instruct-2501[0m/
drwxr-xr-x  6 root root 4096 Jan

In [123]:
ls -al /jupyterlab/models/hf/hub/models--mistralai--Mistral-Small-24B-Instruct-2501/snapshots

total 12
drwxr-xr-x 3 root root 4096 Feb  3 18:26 [0m[01;34m.[0m/
drwxr-xr-x 6 root root 4096 Feb  3 18:45 [01;34m..[0m/
drwxr-xr-x 2 root root 4096 Feb  3 18:45 [01;34m20b2ed1c4e9af44b9ad125f79f713301e27737e2[0m/


In [13]:
!df

Filesystem     1K-blocks      Used Available Use% Mounted on
overlay         52416492  30778468  21638024  59% /
tmpfs              65536         0     65536   0% /dev
tmpfs           16193236         0  16193236   0% /sys/fs/cgroup
/dev/nvme2n1   515858840 103498116 412344340  21% /jupyterlab
/dev/nvme0n1p1  52416492  30778468  21638024  59% /etc/hosts
shm                65536         4     65532   1% /dev/shm
tmpfs           31696296        12  31696284   1% /run/secrets/kubernetes.io/serviceaccount
tmpfs           16193236        12  16193224   1% /proc/driver/nvidia
tmpfs           16193236      2120  16191116   1% /run/nvidia-persistenced/socket
tmpfs           16193236         0  16193236   0% /proc/acpi
tmpfs           16193236         0  16193236   0% /sys/firmware


### Embed

In [15]:
# Shows the interactive Hugging Face login widget in the notebook.
# NOTE(review): presumably only needed for gated/private models - confirm; this
# import also sits outside the import cell at the top of the notebook.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
ls -al /

In [14]:
# Embedding model used for this artefact. Exactly one (name, revision) pair is
# assigned; the revision is pinned so the artefact can be reproduced.
#
# Other models that were tried (name @ revision):
#   sentence-transformers/all-MiniLM-L6-v2 @ fa97f6e7cb1a59073dff9e6b13e2715cf7475ac9
#   HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5 @ 30cf7fd484e8c883443e0632e9a8e9caeffe2830
#   jinaai/jina-embeddings-v3 (no revision pinned)
#   intfloat/e5-mistral-7b-instruct (no revision pinned)
model_name = 'BAAI/bge-large-en-v1.5'
model_revision = 'd4aa6901d3a41ba39fb536a557fa166f842b0e09'

# NOTE(review): trust_remote_code=True lets the model repository execute its own
# Python code when loading; keep the revision pinned whenever this flag is set.
model = SentenceTransformer(model_name, trust_remote_code=True, revision=model_revision)
# Requires a CUDA device; this raises on a CPU-only machine.
model = model.to("cuda")




In [15]:
# Embed all sections in batches rather than one encode() call per chunk.
# encode() on a list returns one row per input; list() turns that into a list of
# 1-D numpy arrays so `embeddings[i]` and `embeddings[i].size` work as before.
embeddings = list(
    model.encode(
        sections,
        batch_size=32,
        normalize_embeddings=True,
        show_progress_bar=True,
    )
)

  0%|          | 0/21118 [00:00<?, ?it/s]

In [16]:
# Every embedding must have the same number of components as the first one;
# the cell then displays that dimension.
embedding_dim = embeddings[0].size
assert all(vec.size == embedding_dim for vec in embeddings)
embedding_dim

1024

In [17]:
# Inspection only: shows the Python type of a single embedding.
type(embeddings[0])

numpy.ndarray

### Save

In [110]:
# Build the artefact in a fresh temporary directory first; a later cell moves it
# to ARTEFACT_FOLDER.
tmp_artefact_folder = mkdtemp()

In [111]:
# The LanceDB database lives in the 'embeddings' sub-folder of the temporary artefact.
embeddings_folder = os.path.join(tmp_artefact_folder, 'embeddings')
os.makedirs(embeddings_folder, exist_ok=True)
db = lancedb.connect(embeddings_folder)

In [112]:
# LanceDB embedding-function object for the same model name, used to declare the
# source/vector fields of the table schema.
# NOTE(review): only the name is passed here, not model_revision - confirm that
# the registry resolves the same model revision as the one used for encoding.
model_registry = get_registry().get('sentence-transformers')
sentence_transformer_embeddings = model_registry.create(name=model_name)

In [113]:
class MetaData(BaseModel):
    """Per-section metadata stored as a nested struct on each document row.

    Every field is optional and defaults to None.
    """
    book_title: Optional[str] = None
    edition: Optional[str] = None
    file_name: Optional[str] = None
    initial_word_count: Optional[int] = None
    page_from: Optional[int] = None
    page_to: Optional[int] = None
    section_title: Optional[str] = None


In [114]:
class Document(LanceModel):
    """Schema of one row of the 'documents' table.

    `text` is the source column and `vector` the vector column of the
    sentence-transformers embedding function. The section metadata is stored
    twice: as the nested `metadata` struct and as flat top-level columns.
    """
    text: str = sentence_transformer_embeddings.SourceField()
    # VectorField must be *called*: the bare attribute is just a method object and
    # would not register this column as the embedding function's vector column.
    vector: Vector(embeddings[0].size) = sentence_transformer_embeddings.VectorField()
    metadata: MetaData
    book_title: str
    edition: str
    file_name: str
    initial_word_count: int
    page_from: int
    page_to: int
    section_title: str


In [115]:
# Reuse the table when it already exists in this database, otherwise create it
# from the Document schema.
table_name = "documents"
if table_name in db.table_names():
    table = db.open_table(table_name)
else:
    table = db.create_table(table_name, schema=Document)

In [117]:
# Assemble one row per section and insert them all in a single add() call.
lance_data_path = os.path.join(embeddings_folder, 'documents.lance', 'data')
# No data may have been written to the table before this cell runs.
assert not os.path.exists(lance_data_path)

data_to_insert = []
for i, chunk_text in enumerate(sections):
    chunk_meta = section_metadata[i]
    page_from, page_to = chunk_meta['section_pages']

    # Flat columns: the metadata fields without the page tuple, then the page
    # bounds, the embedding and the text.
    row = {key: value for key, value in chunk_meta.items() if key != 'section_pages'}
    row.update(
        page_from=page_from,
        page_to=page_to,
        vector=embeddings[i],
        text=chunk_text,
    )
    # The same metadata again, as the nested struct column.
    row['metadata'] = {
        'book_title': chunk_meta['book_title'],
        'edition': chunk_meta['edition'],
        'file_name': chunk_meta['file_name'],
        'initial_word_count': chunk_meta['initial_word_count'],
        'page_from': page_from,
        'page_to': page_to,
        'section_title': chunk_meta['section_title'],
    }
    data_to_insert.append(row)

    # Refresh the progress line every 500 rows.
    if i % 500 == 0:
        clear_output()
        display(f'{i} / {len(sections)}')

table.add(data_to_insert)
assert os.path.exists(lance_data_path)

'21000 / 21118'

In [118]:
# Describe how the artefact was produced and store that next to the embeddings.
embedding_model_info = {
    'name': model_name,
    # Single-line repr of the loaded model.
    'str': str(model).replace('\n', ''),
    'revision': model_revision,
    'sentence_transformers_version': sentence_transformers.__version__,
}
model_metadata = {
    'version': ARTEFACT_VERSION,
    'document_version': DOCUMENT_VERSION,
    'chunk_count': len(sections),
    'embedding_format': 'lancedb',
    'embedding_model': embedding_model_info,
}

model_metadata_path = os.path.join(tmp_artefact_folder, 'model_metadata.pkl')
with open(model_metadata_path, 'wb') as f:
    pickle.dump(model_metadata, f)

In [119]:
# Replace ARTEFACT_FOLDER with the freshly built artefact. Any existing artefact
# is first moved aside so it can be put back if installing the new one fails.
backup_made = False
tmp_backup_folder = None

# Make sure the destination's parent directory exists before moving into it.
artefact_parent_folder = os.path.dirname(ARTEFACT_FOLDER)
if artefact_parent_folder:
    os.makedirs(artefact_parent_folder, exist_ok=True)

if os.path.exists(ARTEFACT_FOLDER):
    tmp_backup_folder = mkdtemp()
    # tmp_backup_folder already exists, so the old artefact ends up *inside* it.
    shutil.move(ARTEFACT_FOLDER, tmp_backup_folder)
    backup_made = True

try:
    shutil.move(tmp_artefact_folder, ARTEFACT_FOLDER)
except Exception:
    # Roll back: drop any partially copied artefact and restore the previous one.
    if backup_made:
        if os.path.exists(ARTEFACT_FOLDER):
            shutil.rmtree(ARTEFACT_FOLDER)
        shutil.move(
            os.path.join(tmp_backup_folder, os.path.basename(ARTEFACT_FOLDER)),
            ARTEFACT_FOLDER,
        )
    raise

# The new artefact is in place; the backup is no longer needed.
if backup_made:
    shutil.rmtree(tmp_backup_folder)

### Cross-check
### Do Not Remove, Do Not Ignore, Run To Make Sure Things Are There

In [120]:
# The installed artefact must contain a model metadata file, as pickle or JSON.
# NOTE(review): the next cell opens only the '.pkl' variant.
assert os.path.exists(os.path.join(ARTEFACT_FOLDER, 'model_metadata.pkl')) or os.path.exists(os.path.join(ARTEFACT_FOLDER, 'model_metadata.json'))

In [121]:
# Reload the metadata from the installed artefact and verify its contents.
# pickle.load is only acceptable here because the file was written by this
# notebook; never unpickle files from an untrusted source.
installed_metadata_path = os.path.join(ARTEFACT_FOLDER, 'model_metadata.pkl')
with open(installed_metadata_path, 'rb') as f:
    model_metadata = pickle.load(f)

embedding_model_repr = model_metadata['embedding_model']['str']
assert embedding_model_repr.startswith('SentenceTransformer')
assert 'version' in model_metadata
assert model_metadata['version'] == ARTEFACT_VERSION
assert 'embedding_format' in model_metadata

installed_embeddings_folder = os.path.join(ARTEFACT_FOLDER, 'embeddings')
if model_metadata['embedding_format'] == 'lancedb':
    assert os.path.exists(installed_embeddings_folder)
# The table data directory is required regardless of the declared format.
assert os.path.exists(os.path.join(installed_embeddings_folder, 'documents.lance', 'data'))

In [122]:
# Open the installed table and display its first rows as a smoke test.
installed_db = lancedb.connect(os.path.join(ARTEFACT_FOLDER, 'embeddings'))
test_table = installed_db.open_table(table_name)
df = test_table.head().to_pandas()
assert len(df) == 5
df

[2025-02-05T00:20:57Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-02-05T00:20:57Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.


Unnamed: 0,text,vector,metadata,book_title,edition,file_name,initial_word_count,page_from,page_to,section_title
0,Cannith Catalogue 2 The following sections des...,"[-0.00028172263, 0.0038665095, 0.03842889, 0.0...","{'book_title': 'House Cannith Catalogue #2', '...",House Cannith Catalogue #2,5e,881665-eberron_cannith_cat2.md,30,1,2,DAVIDE QUATRINTI'S
1,This carnet contains ten strips of parchment a...,"[0.010030885, 0.0015645161, 0.05549274, 0.0111...","{'book_title': 'House Cannith Catalogue #2', '...",House Cannith Catalogue #2,5e,881665-eberron_cannith_cat2.md,150,2,2,Litmus Strips
2,Made of an extremely robust leather obtained b...,"[0.0031268378, 0.032974523, 0.020474432, -0.01...","{'book_title': 'House Cannith Catalogue #2', '...",House Cannith Catalogue #2,5e,881665-eberron_cannith_cat2.md,89,2,2,Muck Springers
3,"An apparently ordinary item, this box is made ...","[-0.012128428, 0.01501773, 0.035322897, -0.018...","{'book_title': 'House Cannith Catalogue #2', '...",House Cannith Catalogue #2,5e,881665-eberron_cannith_cat2.md,66,2,2,Metaprocessor
4,"A mass of writhing worms (50% of probability),...","[0.0016219477, 0.0032230471, 0.008808815, 0.01...","{'book_title': 'House Cannith Catalogue #2', '...",House Cannith Catalogue #2,5e,881665-eberron_cannith_cat2.md,101,2,2,Input Output
