In [34]:
import json
import os
import re
import string
import requests
import random
from tempfile import mkdtemp
from typing import Optional
from pydantic import BaseModel, Field
import pickle
import shutil
from time import sleep


from tqdm.notebook import tqdm

import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
import sentence_transformers
import numpy as np

In [2]:
%timeit
response = requests.get("http://embed:11434/api/tags")
[m['model'] for m in response.json()['models']]

['bge-large:335m', 'mxbai-embed-large:335m', 'all-minilm:33m', 'bge-m3:567m']

In [6]:
%timeit
response = requests.get("http://ollama:11434/api/tags")
[m['model'] for m in response.json()['models']]

['deepseek-r1:7b', 'deepseek-r1:1.5b', 'gemma2:2b']

In [4]:
%timeit
response = requests.post("http://ollama:11434/api/generate", json={
    "model": "gemma2:2b",
    "prompt": "Why is the sky blue?",
    "options": {"use_mmap": False},
    "stream": False
})

In [5]:
response

<Response [200]>

In [6]:
print(response.json()['response'])

The sky appears blue due to a phenomenon called **Rayleigh scattering**. Here's a breakdown:

**1. Sunlight Composition:** Sunlight contains all colors of the rainbow, which we perceive as white light. However, this light travels in different wavelengths (colors). 

**2. Earth's Atmosphere:** Earth's atmosphere is filled with tiny molecules like nitrogen and oxygen. These molecules are much smaller than the wavelength of visible light.

**3. Scattering of Light:** When sunlight enters the atmosphere, it collides with these molecules.  The shorter wavelength colors (blue and violet) scatter more effectively than longer wavelengths (red and orange).

**4. Why Blue?** This scattering effect causes blue light to bounce off in all directions, leading to what we see as the sky's vibrant blue color. 

**5. The Sun's Influence:**  The sun also plays a role; it emits more red and orange wavelengths than blue, which are then scattered throughout the atmosphere. This combination of factors create

In [7]:
# Input parameters
DOCUMENT_VERSION = '04'
CORPUS_FOLDER = f'/jupyterlab/corpus/eberron/v{DOCUMENT_VERSION}'
CORPUS_FOLDER = '/corpus/texts_v4'

In [8]:
# Output parameters
ARTEFACT_VERSION = '06'
ARTEFACT_ROOT_FOLDER = os.environ.get('ARTEFACT_ROOT_FOLDER', '/artefact')
ARTEFACT_FOLDER = os.path.join(ARTEFACT_ROOT_FOLDER, 'eberron', f'v{ARTEFACT_VERSION}')

In [9]:
def is_integer_string(value):
    """
    Check if the given value (string) represents an integer.
    
    :param value: The value to check.
    :return: True if the string represents an integer, False otherwise.
    """
    try:
        int(value)
        return True
    except (ValueError, TypeError):
        return False

# Read Document Metadata

In [10]:
with open(os.path.join(CORPUS_FOLDER, 'metadata.json')) as f:
    metadata = json.load(f)
metadata_dict = {}
for data in metadata:
    metadata_dict[data['filename'][:-4]] = data

# First Pass: Get TOC, Organize By Page

In [11]:
toc_pattern = re.compile(r".+\.+\s*\d+$")

In [12]:
toc = {}
toc_pages = {}
pages = {}
corpus = {}
for file_count, file_name in enumerate(tqdm(os.listdir(CORPUS_FOLDER))):
    
    if file_name[-3:] != '.md':
        continue
    book_title = file_name[:-3]
    toc[book_title] = []
    toc_pages[book_title] = []
    pages[book_title] = {}

    file_path = os.path.join(CORPUS_FOLDER, file_name)

    with open(file_path) as f:
        document = f.read()
        lines = document.split('\n')
        line_count = len(lines)
        current_page_content = []
        count_toc_lines = 0
        current_page = 0
        for line_no, line in enumerate(lines):
            previous_line = lines[line_no - 1].strip() if line_no > 0 else '\0'
            next_line = lines[line_no + 1].strip() if line_no < line_count - 1 else '\0'
            current_line = line.strip()

            m = re.match(r'##\s+Page\s+([0-9]+)', line)
            if m:
                if is_current_page_toc:
                    toc[file_name[:-3]].append(current_page_content)
                else:
                    pages[file_name[:-3]][current_page] = current_page_content
                current_page = int(m[1])
                count_toc_lines = 0
                current_page_content = []
                is_current_page_toc = False
                continue

            current_page_content.append(current_line)

            if toc_pattern.search(line):
                count_toc_lines += 1

            is_current_page_toc = (count_toc_lines > 5)



  0%|          | 0/159 [00:00<?, ?it/s]

In [13]:
line_pattern = re.compile(r"([^\.]+)\.[\.a-z]+\s*(\d+)")
chapter_line_pattern = re.compile(r"\s*chapter\s*(\d+):?[^0-9]*\s([0-9]+)")

In [14]:
chapter_pages = {}
for book, contents in toc.items():
    if not contents:
        continue
    chapter_pages[book] = {}
    current_chapter = 0
    for page in contents:
        for line in page:
            for match in chapter_line_pattern.finditer(line.lower()):
                previous_chapter = current_chapter
                current_chapter = int(match[1])
                chapter_pages[book][int(match[2])] = current_chapter


In [15]:
starting_pages = {}
for book, contents in toc.items():
    if not contents:
        continue
    starting_pages[book] = {}
    text = ""
    current_chapter = 0
    # print(book)
    for page in contents:
        for line in page:
            for match in chapter_line_pattern.finditer(line.lower()):
                # print(line)
                previous_chapter = current_chapter
                current_chapter = int(match[1])
            for match in line_pattern.finditer(line):
                if match[1].strip():
                    starting_pages[book][int(match[2])] = (match[1].strip(), current_chapter)
        text += '\n'.join(page)



# Second Pass: Organize by Section

In [16]:
sections = []
section_metadata = []

for file_count, file_name in enumerate(tqdm(os.listdir(CORPUS_FOLDER))):
    if file_name[-3:] != '.md':
        continue
    book_title = file_name[:-3]

    current_section_title = ''
    current_section_lines = []
    
    file_path = os.path.join(CORPUS_FOLDER, file_name)
    with open(file_path) as f:
        document = f.read()
        lines = document.split('\n')
        line_count = len(lines)
        for line_no, line in enumerate(lines):
            previous_line = lines[line_no - 1].strip() if line_no > 0 else '\0'
            next_line = lines[line_no + 1].strip() if line_no < line_count - 1 else '\0'
            current_line = line.strip()

            is_section_end = is_book_end = False

            m = re.match(r'##\s+Page\s+([0-9]+)', line)
            if m:
                current_page = int(m[1])
                pg = toc_title = chapter = None
                for pg, (toc_title, chapter) in starting_pages.get(book_title, {}).items():
                    if current_section_pages[0] < pg < current_section_pages[1]:
                        break
                if not chapter:
                    for pg, chapter in chapter_pages.get(book_title, {}).items():
                        if current_section_pages[0] < pg < current_section_pages[1]:
                            break
                continue

            current_section_word_count = len(" ".join(current_section_lines).split(' '))

            if re.match(r'^\[he', current_line):
                current_line = current_line.replace('[he', 'The')

    
            if previous_line == '' and (current_line.upper() == current_line or next_line == '' or next_line.startswith('Medium') or next_line.startswith('Large') or next_line.startswith('District Type') or next_line == '\0'):
                if next_line == '\0':
                    is_book_end = True
                else:
                    may_be_section_title = True
                    
                if current_line.upper() != current_line and string.capwords(current_line) != current_line:
                    may_be_section_title = False
                if current_section_title.lower() == current_line.lower():
                    may_be_section_title = False
                if may_be_section_title and is_integer_string(current_line):
                    may_be_section_title = False
                if len(re.sub(r'[^a-zA-Z\.]', '', current_line)) < 4:
                    may_be_section_title = False
                if ',' in current_line:
                    may_be_section_title = False
                if current_line.endswith('.') and not current_line.endswith('...'):
                    may_be_section_title = False

                if may_be_section_title or is_book_end:
                    if current_section_lines and current_section_word_count > 15 or is_book_end:
                        is_section_end = True
                        # text = current_section_title + '\n\n' + '\n'.join(current_section_lines)
                        text = '\n'.join(current_section_lines)
                        if len(text.split(' ')) > 5:
                            sections.append(text)
                            section_metadata.append({
                                'book_title': book_title,
                                'file_name': file_name,
                                'edition': metadata_dict[book_title]['edition'],
                                'pdf_book_title': metadata_dict[book_title]['pdf/title'],
                                'pdf_book_author': metadata_dict[book_title]['pdf/author'],
                                'book_category': metadata_dict[book_title]['book_category'],
                                'section_title': current_section_title,
                                'toc_title': toc_title,
                                'page_start': current_section_pages[0],
                                'page_end': current_section_pages[1],
                                'initial_word_count': current_section_word_count,
                                'chapter': chapter,
                            })
            if is_section_end or not current_section_title:
                current_section_title = current_line
                current_section_lines = []
                current_section_pages = (current_page, current_page)

            if current_section_title:
                current_section_lines.append(current_line)
                current_section_pages = (current_section_pages[0], current_page)
section_count = len(sections)


  0%|          | 0/159 [00:00<?, ?it/s]

In [17]:
len(sections)

21722

# Third Pass: Further Chunking

In [18]:
cache_dir = 'tmp'


In [19]:
file_list = sorted(os.listdir(cache_dir))
json_list = [filename for filename in file_list if filename.endswith('.json')]
md_list = [filename for filename in file_list if filename.endswith('.md')]
assert all([isinstance(int(filename.split('.')[0]), int) for filename in file_list])
# TODO: Change this to accommodate the new filename format.
chunk_metadata = []
chunks = []
with open(os.path.join(cache_dir, json_list[0]), 'r') as f:
    _metadata = json.load(f)
is_metadata_cached_for_all_md = False
if isinstance(_metadata, list):
    chunk_metadata = _metadata
    if len(chunk_metadata) == len(md_list):
        is_metadata_cached_for_all_md = True
else:
    if len(json_list) == len(md_list):
        is_metadata_cached_for_all_md = True
if not is_metadata_cached_for_all_md:
    raise NotImplementedError("Cache seems incomplete.")
chunk_count = int(max(json_list).split('.')[0]) + 1
if chunk_count != len(md_list):
    raise NotImplementedError("Cache seems incomplete.")
if len(chunk_metadata) == chunk_count:
    for filename in tqdm(md_list):
        with open(os.path.join(cache_dir, filename), 'r') as f:
            chunks.append(f.read())
else:
    for filename in tqdm(md_list):
        basename, extension = filename.split('.')
        with open(os.path.join(cache_dir, f'{basename}.md'), 'r') as f:
            chunks.append(f.read())
        with open(os.path.join(cache_dir, f'{basename}.json'), 'r') as f:
            chunk_metadata.append(json.load(f))
len(chunk_metadata), len(chunks), chunk_count

  0%|          | 0/22181 [00:00<?, ?it/s]

(22181, 22181, 22181)

In [20]:
section_metadata_hashes = [hashlib.sha1(json.dumps(d).encode('ascii')).hexdigest() for d in section_metadata]
chunk_metadata_hashes = [hashlib.sha1(json.dumps(d).encode('ascii')).hexdigest() for d in chunk_metadata]
len(set(section_metadata_hashes)), len(set(chunk_metadata_hashes))

NameError: name 'hashlib' is not defined

In [21]:
is_cache_recovered = False
if len(chunk_metadata) == len(chunks):
    if len(chunks) == chunk_count:
        is_cache_recovered = True
is_cache_recovered

True

In [34]:
if not is_cache_recovered:
    chunks = []
    chunk_metadata = []
    j = 0
    
    if os.exists(cache_dir):
        os.rename(cache_dir, f'{cache_dir}.bak')
    else
        os.mkdir('tmp')
    
    for i, section in enumerate(tqdm(sections)):
        if section_processsed[i]:
            continue
        if section_metadata[i]['initial_word_count'] > 400:
            _chunks = section.split('\n\n\n')
        else:
            _chunks = [section]
        for chunk in _chunks:
            prompt = f"The following text concerns a fantasy setting. Does it have at least one coherent paragraph? Answer with yes or no. Do not include anything else. The text:\n\n"
            response = requests.post("http://ollama:11434/api/generate", 
                                     # timeout=4*60,
                                     json={
                                        "model": "gemma2:2b",
                                        "prompt": prompt + '========================\n\n\n' + chunk + '\n\n\n========================',
                                        "options": {
                                            "use_mmap": False
                                        },
                                        "stream": False
                                    })
            output = response.json().get('response', '').lower().strip()
            if not (output.startswith('yes') or output.startswith('no')):
                _logs.append(f'Section {i} is not assessed properly as being a coherent paragraph. Output from LLM: {output}')
            is_coherent = output.startswith('yes')
            if not is_coherent:
                continue
    
    
            prompt = f"The following text may have errors and typos. Fix the errors and typos and return a clean text. Do not include anything else
            
            in your response. The text:\n\n"
            response = requests.post("http://ollama:11434/api/generate", json={
                "model": "gemma2:2b",
                "prompt": prompt + '========================\n\n\n' + chunk + '\n\n\n========================',
                "options": {
                    "temperature": 0.1, 
                    "use_mmap": False
                },
                "stream": False
            })
            corrected_chunk = response.json().get('response', '')
            chunks.append(corrected_chunk)
            chunk_metadata.append(section_metadata[i])
    
            # filename = str(j).zfill(no_of_digits)
            filename = str(i).zfill(no_of_digits) + '-' + str(j).zfill(no_of_digits)
            with open(os.path.join(cache_dir, f'{basename}.md'), 'w') as f:
                f.write(corrected_chunk)
            with open(os.path.join(cache_dir, f'{basename}.json'), 'w') as f:
                json.dump(section_metadata[i], f)
        section_processsed[i] = True


  0%|          | 0/21722 [00:00<?, ?it/s]

In [30]:
response = requests.get("http://ollama:11434/api/tags")
models = response.json().get('models', [])
[m['model'] for m in response.json()['models']]

['bge-m3:567m',
 'all-minilm:33m',
 'mxbai-embed-large:335m',
 'deepseek-r1:7b',
 'deepseek-r1:1.5b',
 'gemma2:2b']

In [27]:
response = requests.get("http://embed:11434/api/tags")
models = response.json().get('models', [])
[m['model'] for m in response.json()['models']]

['bge-large:335m', 'bge-m3:567m', 'all-minilm:33m', 'mxbai-embed-large:335m']

# Save the Embeddings

In [48]:
embedding_model_name = 'all-minilm'
embedding_model_tag = '33m'
# embedding_model_name = 'bge-large'
# embedding_model_tag = '335m'

In [49]:
response = requests.get("http://ollama:11434/api/tags")
model_names = [m['model'] for m in response.json()['models']]
while f'{embedding_model_name}:{embedding_model_tag}' not in model_names:
    response = requests.get("http://ollama:11434/api/tags")
    model_names = [m['model'] for m in response.json()['models']]
    sleep(60)

In [61]:
model_registry = get_registry().get('sentence-transformers')
sentence_transformer_embeddings = model_registry.create(name=embedding_model_name)

In [79]:
tmp_artefact_folder = mkdtemp()

In [51]:
# response = requests.post("http://ollama:11434/api/embed",
#                          json={
#                              "model": f"{embedding_model_name}:{embedding_model_tag}",
#                               "options": {
#                                   "use_mmap": False
#                               },
#                              "input": random.choice(sections),
#                          })
# embeddings = np.array(response.json().get('embeddings'))
# embeddings[0].shape

(384,)

In [80]:
def get_embeddings_vector(text: str) -> np.ndarray:
    response = requests.post("http://ollama:11434/api/embed",
                         json={
                             "model": f"{embedding_model_name}:{embedding_model_tag}",
                              "options": {
                                  "use_mmap": False
                              },
                             "input": text,
                         })
    embeddings = response.json().get('embeddings')
    return np.array(embeddings)[0]

In [81]:
embeddings_folder = os.path.join(tmp_artefact_folder, 'embeddings')
os.makedirs(embeddings_folder, exist_ok=True)
db = lancedb.connect(embeddings_folder)

In [82]:
embeddings = get_embeddings_vector(random.choice(sections))
embeddings.size

384

In [83]:
class MetaData(BaseModel):
    book_category: Optional[str] = None
    book_title: Optional[str] = None
    chapter: Optional[str] = None
    edition: Optional[str] = None
    file_name: Optional[str] = None
    initial_word_count: Optional[int] = None
    page_end: Optional[int] = None
    page_start: Optional[int] = None
    pdf_book_author: Optional[str] = None
    pdf_book_title: Optional[str] = None
    section_title: Optional[str] = None
    toc_title: Optional[str] = None


In [84]:
class Document(LanceModel):
    text: str = sentence_transformer_embeddings.SourceField()
    vector: Vector(embeddings.size) = sentence_transformer_embeddings.VectorField
    metadata: MetaData
    book_title: Optional[str] = None
    edition: Optional[str] = None
    pdf_book_title: Optional[str] = None
    pdf_book_author: Optional[str] = None
    book_category: Optional[str] = None
    file_name: Optional[str] = None
    initial_word_count: Optional[int] = None
    page_start: Optional[int] = None
    page_end: Optional[int] = None
    section_title: Optional[str] = None
    toc_title: Optional[str] = None
    chapter: Optional[str] = None


In [85]:
table_name = "documents"
if table_name not in db.table_names():
    table = db.create_table(
        table_name, 
        schema=Document
    )
else:
    table = db.open_table(table_name)

In [86]:
len(chunks)

22181

In [59]:
rows = []
for i, chunk in enumerate(tqdm(chunks)):
    # TODO: Change to get_embeddings_vector(chunk), remove [0] below.
    response = requests.post("http://ollama:11434/api/embed",
                                 json={
                                     "model": f"{embedding_model_name}:{embedding_model_tag}",
                                      "options": {
                                          "use_mmap": False
                                      },
                                     "input": chunk,
                                 })
    embeddings = response.json().get('embeddings')

    row = {}
    row['vector'] = embeddings[0]
    row['text'] = chunk
    row['metadata'] = dict(
        book_title=chunk_metadata[i]['book_title'],
        edition=chunk_metadata[i]['edition'],
        pdf_book_title=chunk_metadata[i]['pdf_book_title'],
        pdf_book_author=chunk_metadata[i]['pdf_book_author'],
        book_category=chunk_metadata[i]['book_category'],
        file_name=chunk_metadata[i]['file_name'],
        initial_word_count=chunk_metadata[i]['initial_word_count'],
        page_start=chunk_metadata[i]['page_start'],
        page_end=chunk_metadata[i]['page_end'],
        section_title=chunk_metadata[i]['section_title'],
        toc_title=chunk_metadata[i]['toc_title'],
        chapter=chunk_metadata[i]['chapter'],
    )
    row['book_title'] = chunk_metadata[i]['book_title']
    row['edition'] = chunk_metadata[i]['edition']
    row['pdf_book_title'] = chunk_metadata[i]['pdf_book_title']
    row['pdf_book_author'] = chunk_metadata[i]['pdf_book_author']
    row['book_category'] = chunk_metadata[i]['book_category']
    row['file_name'] = chunk_metadata[i]['file_name']
    row['initial_word_count'] = chunk_metadata[i]['initial_word_count']
    row['page_start'] = chunk_metadata[i]['page_start']
    row['page_end'] = chunk_metadata[i]['page_end']
    row['section_title'] = chunk_metadata[i]['section_title']
    row['toc_title'] = chunk_metadata[i]['toc_title']
    row['chapter'] = chunk_metadata[i]['chapter']
    rows.append(row)
    

  0%|          | 0/22181 [00:00<?, ?it/s]

In [None]:
# TODO: Move all the embedding generation here.

In [87]:
table.add(rows)

In [88]:
assert os.path.exists(os.path.join(embeddings_folder, 'documents.lance', 'data'))

In [89]:
model_metadata = {
    'version': ARTEFACT_VERSION,
    'document_version': DOCUMENT_VERSION,
    'chunk_count': len(sections),
    'embedding_format': 'lancedb',
    'embedding_model': {
        'name': embedding_model_name, 
        'tag': embedding_model_tag, 
    }
}
with open(os.path.join(tmp_artefact_folder, 'model_metadata.pkl'), 'wb') as f:
	pickle.dump(model_metadata, f)

In [90]:
backup_made = False
if os.path.exists(ARTEFACT_FOLDER):
    tmp_backup_folder = mkdtemp()
    shutil.move(ARTEFACT_FOLDER, tmp_backup_folder)
    backup_made = True
shutil.move(tmp_artefact_folder, ARTEFACT_FOLDER)
if backup_made:
    shutil.rmtree(tmp_backup_folder)

### Cross-check
### Do Not Remove, Do Not Ignore, Run To Make Sure Things Are There

In [91]:
assert os.path.exists(os.path.join(ARTEFACT_FOLDER, 'model_metadata.pkl')) or os.path.exists(os.path.join(ARTEFACT_FOLDER, 'model_metadata.json'))

In [92]:
with open(os.path.join(ARTEFACT_FOLDER, 'model_metadata.pkl'), 'rb') as f:
    model_metadata = pickle.load(f)

assert 'name' in  model_metadata['embedding_model']
assert 'version' in model_metadata
assert model_metadata['version'] == ARTEFACT_VERSION
assert 'embedding_format' in model_metadata
if model_metadata['embedding_format'] == 'lancedb':
    assert os.path.exists(os.path.join(ARTEFACT_FOLDER, 'embeddings'))
assert os.path.exists(os.path.join(ARTEFACT_FOLDER, 'embeddings', 'documents.lance', 'data'))

In [93]:
table_name = "documents"
test_table = lancedb.connect(os.path.join(ARTEFACT_FOLDER, 'embeddings')).open_table(table_name)
df = test_table.head().to_pandas()
assert len(df) == 5
df

[2025-05-01T06:53:23Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-05-01T06:53:23Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.


Unnamed: 0,text,vector,metadata,book_title,edition,pdf_book_title,pdf_book_author,book_category,file_name,initial_word_count,page_start,page_end,section_title,toc_title,chapter
0,CHANGELINGS\n\nThe question of identity is arg...,"[0.00035601965, 0.06574339, -0.013138834, 0.09...","{'book_category': 'eberron_5e_homebrew', 'book...",1090548-Blessed_of_The_Traveler,5e,Blessed of The Traveler: Queer Gender Identity...,Lex Permann,eberron_5e_homebrew,1090548-Blessed_of_The_Traveler.md,91,3,3,CHANGELINGS,,
1,BLESSED OF THE TRAVELER: QUEER GENDER IDENTITY...,"[0.0031311298, 0.08553847, -0.031814065, 0.021...","{'book_category': 'eberron_5e_homebrew', 'book...",1090548-Blessed_of_The_Traveler,5e,Blessed of The Traveler: Queer Gender Identity...,Lex Permann,eberron_5e_homebrew,1090548-Blessed_of_The_Traveler.md,323,3,4,BLESSED OF THE TRAVELER: QUEER GENDER IDENTITY...,,
2,"KALASHTAR\n\nFrom birth, a kalashtar stands ap...","[0.07028267, 0.058363415, -0.026595244, 0.0451...","{'book_category': 'eberron_5e_homebrew', 'book...",1090548-Blessed_of_The_Traveler,5e,Blessed of The Traveler: Queer Gender Identity...,Lex Permann,eberron_5e_homebrew,1090548-Blessed_of_The_Traveler.md,442,4,4,KALASHTAR,,
3,BLESSED OF THE TRAVELER: QUEER GENDER IDENTITY...,"[0.015201003, 0.08552606, -0.042522456, -0.018...","{'book_category': 'eberron_5e_homebrew', 'book...",1090548-Blessed_of_The_Traveler,5e,Blessed of The Traveler: Queer Gender Identity...,Lex Permann,eberron_5e_homebrew,1090548-Blessed_of_The_Traveler.md,60,4,5,BLESSED OF THE TRAVELER: QUEER GENDER IDENTITY...,,
4,TAIRNADAL ELVES\n\nAt the point when a Tairnad...,"[-0.074120075, 0.12916176, -0.0452898, -0.0289...","{'book_category': 'eberron_5e_homebrew', 'book...",1090548-Blessed_of_The_Traveler,5e,Blessed of The Traveler: Queer Gender Identity...,Lex Permann,eberron_5e_homebrew,1090548-Blessed_of_The_Traveler.md,482,5,5,TAIRNADAL ELVES,,


In [94]:
assert len(chunks) == len(chunk_metadata)
len(chunks)

22181

# === END OF PROCESS ===

In [None]:
prompt = f"The following text concerns a fantasy setting. Does it have at least one coherent paragraph? Answer with yes or no. Do not include anything else. The text:\n\n"
response = requests.post("http://ollama:11434/api/generate", json={
    "model": "gemma2:2b",
    "prompt": prompt + '========================\n\n\n' + section + '\n\n\n========================',
    "stream": False
})
output = response.json().get('response', '').lower()
is_coherent = output == 'yes' or output.startswith('yes')
print(response.json().get('response', response.json().get('error')))
# print(section)
is_coherent


In [307]:
prompt = f"The following text may have errors and typos. Fix the errors and typos and return a clean text. Do not include anything else in your response. The text:\n\n"
response = requests.post("http://ollama:11434/api/generate", json={
    "model": "gemma2:2b",
    "prompt": prompt + '========================\n\n\n' + section + '\n\n\n========================',
    "options": {"temperature": 0.1},
    "stream": False
})
corrected_text = response.json().get('response', '')
print(response.json().get('response', response.json().get('error')))



MARK OF PASSAGE

The Mark of Passage bestows magical powers related to transportation and teleportation.

House: The humans of House Orien are the bearers of the Mark of Passage. They control a worldwide Couriers Guild that uses its powers to carry parcels, messages, and people over great distances instantaneously—for a high price. The house has holdings and operates across Khorvaire, though it traces its origins to Aundair. The house’s Transportation Guild oversees lightning rail and caravan routes throughout central Khorvaire.

Unmarked members of House Orien work as mundane couriers, often within the boundaries of a single city or nation.

Marks: The aspects of the Mark of Passage grant the following benefits.

Least Mark of Passage: expeditious retreat 1/day, mount 1/day, or dimension leap 1/day; +2 bonus on Survival checks.

A character with the dimension leap spell-like ability can teleport up to a total of 10 feet per character level. 



In [293]:
prompt = f"The following text concerns a fantasy setting. Create an appropriate title for the text. Do not include anything else. Text:\n\n"
response = requests.post("http://ollama:11434/api/generate", json={
    "model": "gemma2:2b",
    "prompt": prompt + '========================\n\n\n' + corrected_text + '\n\n\n========================',
    "options": {"temperature": 0.1},
    "stream": False
})
title = response.json().get('response', '')
print(response.json().get('response', response.json().get('error')))



Mark of Passage: A Guide to Transportation and Teleportation 



In [301]:
category_list = "character, culture, community, location, trap, item, dragonmark, spell, power, monster, organization, concept, other"
prompt = f"The following text concerns a fantasy setting. Categorize the following text. Answer with one of the following: {category_list}. Include only one category. Do not include anything else. The text:\n\n"
response = requests.post("http://ollama:11434/api/generate", json={
    "model": "gemma2:2b",
    "prompt": prompt + '========================\n\n\n' + corrected_text + '\n\n\n========================',
    "options": {"temperature": 0.1},
    "stream": False
})
tag = response.json().get('response', '')
print(response.json().get('response', response.json().get('error')))



power 



In [302]:
prompt = f"The following text concerns a {tag} in a fantasy world. What is the name of this {tag}? Do not include anything else in your response. The text:\n\n"
response = requests.post("http://ollama:11434/api/generate", json={
    "model": "gemma2:2b",
    "prompt": prompt + '========================\n\n\n' + corrected_text + '\n\n\n========================',
    "stream": False
})
print(response.json().get('response', response.json().get('error')))



Mark of Passage 



In [303]:
prompt = f"The following text concerns a fantasy world. You are trying to find first or full names or individuals in the text. If there are individuals in the text, and their first or full names are included, list the names of the individuals. This has to be individuals, not groups of people. If not, just return 'no names'. Do not include anything else. The text:\n\n"
response = requests.post("http://ollama:11434/api/generate", json={
    "model": "gemma2:2b",
    "prompt": prompt + '========================\n\n\n' + corrected_text + '\n\n\n========================',
    "options": {"temperature": 0.1},
    "stream": False
})
print(response.json().get('response', response.json().get('error')))



no names 



In [304]:
prompt = f"The following text concerns a fantasy world. If there are monsters, list the names of the monsters in the text. If there are no monsters, just return 'no monsters'. Do not include anything else. The text:\n\n"
response = requests.post("http://ollama:11434/api/generate", json={
    "model": "gemma2:2b",
    "prompt": prompt + '========================\n\n\n' + corrected_text + '\n\n\n========================',
    "stream": False
})
print(response.json().get('response', response.json().get('error')))



no monsters 



In [305]:
prompt = f"The following text concerns a fantasy world. If there are locations, list the names of the locations in the text. If there are no locations, just return 'no locations'. Do not include anything else. The text:\n\n"
response = requests.post("http://ollama:11434/api/generate", json={
    "model": "gemma2:2b",
    "prompt": prompt + '========================\n\n\n' + corrected_text + '\n\n\n========================',
    "options": {"temperature": 0.1},
    "stream": False
})
print(response.json().get('response', response.json().get('error')))



Khorvaire, Aundair 



In [98]:
category_list = "character, location, trap, item, spell, power, monster, organization, concept"
prompt = f"The following text is coming from OCR and may have illegible lines and characters, page numbers and legal notices. Remove them and return a clean text. Do not include anything else. The text:\n\n"
response = requests.post("http://ollama:11434/api/generate", json={
    "model": "gemma2:2b",
    "prompt": prompt + '========================\n\n\n' + section + '\n\n\n========================',
    "stream": False
})
print(response.json().get('response', response.json().get('error')))



ACTIONS

Bite. Melee Weapon Attack: +4 to hit, reach 5 ft., one target. Hit: 5 (1d6 + 2) piercing damage plus 10 (3d6) poison damage.

Kiss. The vargouille kisses one incapacitated humanoid within 5 feet of it. The target must succeed on a DC 12 Charisma saving throw or become cursed. The cursed target loses 1 point of Charisma after each hour, as its head takes on fiendish aspects. The curse doesn’t advance while the target is in sunlight or the area of a daylight spell; don’t count that time. When the cursed target's Charisma becomes 2, it dies, and its head tears from its body and becomes a new vargouille. Casting remove curse, greater restoration, or a similar spell on the target before the transformation is complete can end the curse. Doing so undoes the changes made to the target by the curse.

Stunning Shriek. The vargouille shrieks. Each humanoid and beast within 30 feet of the vargouille and able to hear it must succeed on a DC 12 Wisdom saving throw or be frightened until the

In [25]:
response.json()

{'error': 'timed out waiting for llama runner to start - progress 0.00 - '}

In [20]:
try:
    with open(os.path.join('is_legible_text.json'), 'r') as is_legible_text_file:
        is_legible_text = json.load(is_legible_text_file)
        assert len(is_legible_text) == len(sections)
except (AssertionError, FileNotFoundError):
    prompt = f"The following text concerns a fantasy setting. Is this a complete, understandable text with multiple complete sentences and one coherent paragraph? Answer with yes or no. Do not include anything else:\n\n"
    is_legible_text = [None] * len(sections)
    for i, section in enumerate(tqdm(sections)):
        messages = [ 
            {"model": "gemma2:2b", "prompt": prompt + '========================\n\n\n' + section + '\n\n\n========================'}, 
        ]
        if section_metadata[i]['initial_word_count'] > 300:
            is_legible_text[i] = True
        else:
            output = pipe(messages, **generation_args)[0]['generated_text'].strip().strip('.').lower()
            is_legible_text[i] = output == 'yes' or output.startswith('yes')
            assert output in ['yes', 'no'] or output.startswith('yes') or output.startswith('no')

In [21]:
with open(os.path.join('is_legible_text.json'), 'w') as is_legible_text_file:
    json.dump(is_legible_text, is_legible_text_file)

In [22]:
sum(is_legible_text)

18937

In [23]:
i = random.choice([i for i in range(len(sections)) if is_legible_text[i]])
print(section_metadata[i])
print(sections[i])


{'book_title': '328949-Artificer_Book_1.0', 'file_name': '328949-Artificer_Book_1.0.md', 'edition': '5e', 'pdf_book_title': "Adam d'Cannith's Guide to Artificers v1.0", 'pdf_book_author': 'Bradford Yurkiw', 'book_category': 'eberron_5e_homebrew', 'section_title': 'WEAVER ALCHEMISTS', 'toc_title': None, 'page_start': 15, 'page_end': 15, 'initial_word_count': 28, 'chapter': None}
WEAVER ALCHEMISTS
Weaver Elixers may be a string wrapped around the finger

or a small article of clothing that falls apart when the potion’s
effects ends.



In [None]:
is_any_changes = True
for i, section in enumerate(tqdm(sections)):
    if is_any_changes:
        if os.path.exists('section_metadata.json'):
            with open('section_metadata.json', 'r') as section_metadata_file:
                section_metadata = json.load(section_metadata_file)
    if not is_legible_text[i]:
        continue
    is_any_changes = False
    category_list = "character, location, trap, item, spell, power, monster, organization, concept, other"
    section = '\n'.join(sections[i].split('\n')[1:])
    if 'ai_tags' not in section_metadata[i]:
        if len(section.split(' ')) > 450:
            continue
        messages = [ 
            # {"role": "system", "content": "You correct typos and OCR errors in text. Correct the text given by user without changing its content."}, 
            {"role": "user", "content": f"The following text concerns a fantasy setting. Categorize the following text. Answer with one of the following: {category_list}. Include only one category. Do not include anything else:\n\n" + section}, 
        ] 
        tags = [t.strip() for t in pipe(messages, **generation_args)[0]['generated_text'].strip().split(',')]
    
        section_metadata[i]['ai_tags'] = tags
        is_any_changes = True
    
        categories = category_list.split(", ")
        for tag in tags:
            if tag not in categories:
                break
            if tag != 'other':
                messages = [ 
                    # {"role": "system", "content": "You correct typos and OCR errors in text. Correct the text given by user without changing its content."}, 
                    {"role": "user", "content": f"The following text is about a {tag}. If you can find the name of this {tag} in the text, print it out. If it looks like an typo, correct it. Otherwise print ''. If there are multiple {tag}s, separate them by commas. Do not include anything else.:\n\n" + section}, 
                ] 
                output = pipe(messages, **generation_args)
                section_metadata[i]['labels'] = list(set(output[0]['generated_text'].strip().split(', ')))
                # print(tag, output[0]['generated_text'].strip())
    if 'is_table' not in section_metadata[i]:
        messages = [ 
            {"role": "user", "content": f"Is the following text a D&D table? If so, say yes, otherwise say no. Do not include anything else:\n\n" + section}, 
        ] 
        output = pipe(messages, **generation_args)[0]['generated_text'].strip().strip('.').lower()
        section_metadata[i]['is_table'] = output.startswith('yes')
        is_any_changes = True
    if is_any_changes:
        with open('section_metadata.json', 'w') as section_metadata_file:
            json.dump(section_metadata, section_metadata_file)

  0%|          | 0/20556 [00:00<?, ?it/s]

In [None]:
with open(os.path.join('sections_metadata.json'), 'w') as sections_metadata_file:
    json.dump(sections_metadata, sections_metadata_file)

In [None]:
folder_name = "sections"

num_digits = len(str(len(sections) - 1))
if os.path.exists(folder_name):
    shutil.rmtree(folder_name)
os.makedirs(folder_name)
for i, content in enumerate(tqdm(sections)):
    file_name = os.path.join(folder_name, f"{i:0{num_digits}}.md")  # Zero-padded index
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(content)

# == End of Chunking ==

In [75]:
df = pd.DataFrame(section_metadata)
print(df['initial_word_count'].describe())
bins = [60, 125, 275, 500, 1000, 1500, float("inf")]
df["bins"] = pd.cut(df["initial_word_count"], bins=bins, right=False)
df.groupby("bins").size().reset_index(name="count")

count    19993.000000
mean       234.841895
std        443.748781
min         11.000000
25%         60.000000
50%        132.000000
75%        275.000000
max      26124.000000
Name: initial_word_count, dtype: float64


  df.groupby("bins").size().reset_index(name="count")


Unnamed: 0,bins,count
0,"[60.0, 125.0)",4683
1,"[125.0, 275.0)",5399
2,"[275.0, 500.0)",2865
3,"[500.0, 1000.0)",1667
4,"[1000.0, 1500.0)",270
5,"[1500.0, inf)",202


In [19]:
i = random.choice([i for i in range(len(sections)) if is_legible_text[i]])
print(section_metadata[i])
print(sections[i])
print(len(sections[i].split(' ')))

{'book_title': '831833-Morgrave_Miscellany_2020_Full-Res', 'file_name': '831833-Morgrave_Miscellany_2020_Full-Res.md', 'edition': '5e', 'pdf_book_title': '', 'pdf_book_author': '', 'book_category': 'eberron_5e_kanon', 'section_title': 'TIEFLING ORIGIN', 'toc_title': 'Appenpix: Know Your FACULTY', 'page_start': 99, 'page_end': 99, 'initial_word_count': 403, 'chapter': 4}
TIEFLING ORIGIN
dé Tiefling Origin
1 Arcane Experiment. You weren't born a
tiefling, but you became one as a result of the
magebreeding programs conducted by House
Vadalis, or the result of a personal experiment
you conducted on yourself.

2 Cursed Child. The people of your town
considered you to be cursed. You were driven
from your home as a child and had to make
your own way in the world. Perhaps you eked
out a living as an urchin or turned to a life of
crime while living in the shadows of a major
city. Do you hate the common people because
of how they treated you, or have you forgiven
them?

3 Direct Influence. You h

In [17]:
for j, chunk in enumerate(sections[i].split(".\n\n")):
    print(j, len(chunk.split(' ')))

0 88
1 67
2 80
3 80


In [112]:
sample_section_indices = [i for i, s in enumerate(sections) if section_metadata[i]['file_name'] == 'D&D 3E Eberron Campaign Setting.md']
len(sample_section_indices)

1104

In [115]:
i = sample_section_indices[50]
# print(sample_sections[4])
print(section_metadata[i])
print(sections[i])


{'book_title': 'D&D 3E Eberron Campaign Setting', 'file_name': 'D&D 3E Eberron Campaign Setting.md', 'edition': '3e', 'pdf_book_title': 'Acr179.tmp', 'pdf_book_author': '', 'book_category': 'eberron_3e', 'section_title': 'CHARACTER RACES', 'toc_title': 'The Ruins of Dorasharn', 'page_start': 25, 'page_end': 25, 'initial_word_count': 679, 'chapter': 11}
CHARACTER RACES

J ————————————————————————TH——

THE NATURE OF THE WARFORGED

Before the death of King Jarot and the start of the Last
War, the master crafters of House Cannith turned their
creation forges to the task of churning out new con-
structs for a new age. Constructs designed for labor and
industry soon led to experiments with models developed
for exploration and defense. When King Jarot saw the
possibilities inherent in the work of House Cannith,
he began to outline his plan to protect Galifar from
the threats he imagined were gathering all around the
kingdom. King Jarot was growing more and more ner-
vous about the dangers he 

In [28]:
section = '\n'.join(sections[i].split('\n')[1:])
messages = [ 
    {"role": "user", "content": f"The following text concerns a fantasy setting. Is this a complete, understandable text with multiple complete sentences and one coherent paragraph? Answer with yes or no. Do not include anything else:\n\n" + section}, 
] 
output = pipe(messages, **generation_args)[0]['generated_text'].strip().strip('.').lower()
print(output)
if output.startswith('yes'):
    messages = [ 
        {"role": "user", "content": f"Put the following table into a table in markdown format:\n\n" + section}, 
    ] 
    output = pipe(messages, **generation_args)[0]['generated_text']
    print(output)

yes
 Here is the table in markdown format:

| Subrace Origin | Description |
|----------------|-------------|
| Arcane Experiment | You weren't born a tiefling, but you became one as a result of the magebreeding programs conducted by House Vadalis, or the result of a personal experiment you conducted on yourself. |
| Cursed Child | The people of your town considered you to be cursed. You were driven from your home as a child and had to make your own way in the world. Perhaps you eked out a living as an urchin or turned to a life of crime while living in the shadows of a major city. Do you hate the common people because of how they treated you, or have you forgiven them? |
| Direct Influence | You have a direct connection to a particular fiend. Your parents might have made a bargain with this being, you might believe you've descended from a demon, or your bloodline cursed, and your physical form is the result. This degree of direct influence is common among the Sakah tieflings of the De

In [None]:
section = '\n'.join(sections[i].split('\n')[1:])
messages = [ 
    {"role": "user", "content": f"Is the following text a D&D table? If so, say yes, otherwise say no. Do not include anything else:\n\n" + section}, 
] 
output = pipe(messages, **generation_args)[0]['generated_text'].strip().strip('.').lower()
output

In [30]:
print(section)

dé Tiefling Origin
1 Arcane Experiment. You weren't born a
tiefling, but you became one as a result of the
magebreeding programs conducted by House
Vadalis, or the result of a personal experiment
you conducted on yourself.

2 Cursed Child. The people of your town
considered you to be cursed. You were driven
from your home as a child and had to make
your own way in the world. Perhaps you eked
out a living as an urchin or turned to a life of
crime while living in the shadows of a major
city. Do you hate the common people because
of how they treated you, or have you forgiven
them?

3 Direct Influence. You have a direct connection
to a particular fiend. Your parents might have
made a bargain with this being, you might
believe you've descended from a demon, or
your bloodline cursed, and your physical form
is the result. This degree of direct influence
is common among the Sakah tieflings of the
Demon Wastes.

4 Mysterious Past. Something obliterated your
memories of your childhood. You woke 

In [33]:
i = random.choice(range(len(sections)))
print(section_metadata[i])
print(sections[i])
print(len(sections[i].split(' ')))

{'book_title': 'Eberron_ Rising From the Last War - Jeremy Crawford & James Wyatt & Keith Baker', 'file_name': 'Eberron_ Rising From the Last War - Jeremy Crawford & James Wyatt & Keith Baker.md', 'section_title': 'HUMAN ORIGINS', 'toc_title': 'MageWTight c', 'page_start': 30, 'page_end': 30, 'initial_word_count': 206, 'chapter': 0}
HUMAN ORIGINS
d10 Origin

1 An impoverished wizard from Aundair, striving to
prove you're as intelligent as any other Aundairian

2 Astreetwise rogue from Breland who wants nothing
more than to escape the bustle of Sharn forever

3 Adisplaced Cyran fighter who was engaged in war
outside Cyre at the time of the Mourning and is now
stranded with no home

4 Abarbarian from the Demon Wastes who repented
from a life of cruelty and hopes to atone for past evils

5  Akindly druid from the Eldeen Reaches who wants
to learn about the flora and fauna of the rest of Khor-
vaire and beyond

6  Acleric from Karrnath who aspires to learn the arts of
necromancy—for the no

In [85]:
category_list = "character, location, trap, item, spell, power, monster, organization, concept, other"
section = '\n'.join(sections[i].split('\n')[1:])
messages = [ 
    # {"role": "system", "content": "You correct typos and OCR errors in text. Correct the text given by user without changing its content."}, 
    {"role": "user", "content": f"The following text concerns a fantasy setting. Categorize the following text. Answer with one of the following: {category_list}. Include only one category. Do not include anything else:\n\n" + section}, 
] 
tags = [t.strip() for t in pipe(messages, **generation_args)[0]['generated_text'].strip().split(',')]

print(tags)
categories = category_list.split(", ")
for tag in tags:
    if tag not in categories:
        break
    if tag != 'other':
        messages = [ 
            # {"role": "system", "content": "You correct typos and OCR errors in text. Correct the text given by user without changing its content."}, 
            {"role": "user", "content": f"The following text is about a {tag}. If you can find the name of this {tag} in the text, print it out. If it looks like an typo, correct it. Otherwise print ''. If there are multiple {tag}s, separate them by commas. Do not include anything else.:\n\n" + section}, 
        ] 
        output = pipe(messages, **generation_args)
        print(tag, output[0]['generated_text'].strip())

RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [193]:
messages = [ 
    # {"role": "system", "content": "You correct typos and OCR errors in text. Correct the text given by user without changing its content."}, 
    {"role": "user", "content": "Categorize the following text. Answer with one of the following: person, location, item, spell, power, creature, other. Do not include anything else:\n\n" + sections[i]}, 
] 
tag = pipe(messages, **generation_args)[0]['generated_text'].strip()
print(tag)
if tag != 'other':
    messages = [ 
        # {"role": "system", "content": "You correct typos and OCR errors in text. Correct the text given by user without changing its content."}, 
        {"role": "user", "content": f"The following text is about a {tag}. If you can find the name of this {tag} in the text, print it out. Otherwise print ''. Do not include anything else.:\n\n" + sections[i]}, 
    ] 
    output = pipe(messages, **generation_args)
    print(output[0]['generated_text'].strip())

item
Feather Fall Talisman


In [63]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
12288 MiB, 10622 MiB, 1494 MiB


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [65]:
%%time
messages = [ 
    # {"role": "system", "content": "You correct typos and OCR errors in text. Correct the text given by user without changing its content."}, 
    {"role": "user", "content": "Is there a D&D NPC description in the following paragraph? If so, extract the information and print. Do not include anything about other characters, or any other inforamation.\n\n" + sections[i]}, 
] 
output = pipe(messages, **generation_args)
print(output[0]['generated_text'])

 Avtack Fort Ref Will
Level Bonus Save Save Save Special
Ist =1 +2 «2 +0 Catch the scent,
heirs mark,
hunter's insight +1,
swift tracker
Dragon's guidance,
sharpened senses

Ind +2 CER EE

3rd +1 «4 +5 +1 Improved dragonmark

4th +4 =4 Ld #1 Favored enemy

Sth +5 +4 +4 +] Hunter'sinsight+2,
trackless step

fh «th «5 «5 #2 Perfect awareness

Tih 7 #5 +5 +2 Track the trickles

Sch +8 wh [13 $2 Improved dragonmack

9th re] eh 46 +3 Favored enemy

Tdih +10 7 7 «3 Hunter's Insight +3

Class Skills (6 « Int modifier per level) Bluff, Climb, Disguise, Gather Information, Hide, Intimidate, Jump, Knowledge (grography), Knowledge (local), Knowledge {nsure), Listen, Move Silently, Open Lock, Search, Seniene Motive, Spot, Survival. Swim. and Use Rope.

af your dragonmark. You can have a maximum of five effective levels in dragonmark heir for the purpose of determining the caster level of your dragonmark.

Hunter's Insight (Ex): While fighting a creature who is the current target of your Ibeate cre

In [326]:
!nvidia-smi

Sat Mar 15 01:29:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.02              Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060        On  |   00000000:01:00.0  On |                  N/A |
| 30%   45C    P5             21W /  170W |   11771MiB /  12288MiB |     11%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [49]:
book = random.choice(list(pages.keys()))
page_no = random.choice(list(pages[book].keys()))
# book, page_no, len(pages[book]), pages[book][page_no]
print('\n'.join(pages[book][page_no]))


fleshy tendrils twined together form the
entire figure—robe and all.

Hashalags serve as the loremasters of
Dal Quor. Even those hashalaq without
Inspired vessels devote a great deal

of time studying EBERRON and its sur-
rounding planes. Unlike most quori
castes, they understand the ways of
magic as well as psionic power. In Dal
Quor, hashalags typically work as advi-
sors to the higher pow-

ers. They also

serve as inquisitors, policing the quori
and ensuring that the rivalries of the
tsucoras never threaten the greater plans
of the Dreaming Dark.

In its natural form, a hashalaq quori
is composed of hundreds of translucent
tendrils, similar to the tentacles of a
jellyfish. It can compress its tendrils
to form a wide range of shapes, from
a humanoid figure to a giant floating
hand. A point of blue light suspended
within the tendrils serves as its eyes
and ears; it can move this sense organ
around to suit its current shape.

Hashalags telepathically inherit the
languages of any crea

In [56]:
%%time
output = pipe(messages, **generation_args)
print(output[0]['generated_text'])

 Fleshy tendrils twined together form the entire figure—robe and all.

Hashalags serve as the loremasters of Dal Quor. Even those hashalaq without Inspired vessels devote a great deal of time studying EBERRON and its surrounding planes. Unlike most quori castes, they understand the ways of magic as well as psionic power. In Dal Quor, hashalags typically work as advisors to the higher powers. They also serve as inquisitors, policing the quori and ensuring that the rivalries of the tsucoras never threaten the greater plans of the Dreaming Dark.

In its natural form, a hashalaq quori is composed of hundreds of translucent tendrils, similar to the tentacles of a jellyfish. It can compress its tendrils to form a wide range of shapes, from a humanoid figure to a giant floating hand. A point of blue light suspended within the tendrils serves as its eyes and ears; it can move this sense organ around to suit its current shape.

Hashalags telepathically inherit the languages of any creature they

In [131]:
messages = [ 
    {"role": "system", "content": "You correct typos and OCR errors in text. Correct the text given by user without changing its content."}, 
    {"role": "user", "content": "Correct the typos in the following text without changing its content. Remove if there are unintentional double newlines:\n\n" + sections[i]}, 
] 
output = pipe(messages, **generation_args)
print(output[0]['generated_text'])

 Possessions: leather armor, crossbow with 10 bolts, dagger, 8 sp.

Development: The guards know they can't stand against the adventurers, so they decide to try to stall them by talking to them. During the discussion, the PCs can make a DC 20 Spot check to notice the third guard as he sneaks toward the entrance to the ruin.

Here are some of the questions the guards might answer.

Who Are You? “We are members of the Order of the Emerald Claw. We work for the advancement of the nation of Karrnath, despite the current king's lack of vision and ambition.”

Who Is Your Leader? “The great and powerful Garrow leads us.”

Is Your Leader a Vampire? (After a short pause) “Yes, a powerful vampire.” (This is a lie.)

Does Garrow Have the Schemas? “He has some objects in his pack, and he guards them fiercely, but we know nothing of detail.”

How Many Men Are with Garrow? “Nearly fifty, including a fierce warforged warrior.”

How Long Have They Been in the Ruin? “Since yesterday morning.”

The char

In [57]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
12288 MiB, 11898 MiB, 218 MiB


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [39]:
print('\n'.join(pages['1920353-AE01-07_-_The_Silvered_Edge_of_Twilight_1.3'][27]))


ASHBOUND DRUID

Medium humanoid (any race, shapechanger), neutral evil
Armor Class 11 (16 with barkskin)

Hit Points 27 (5d8 + 5)

Speed 30 ft.
STR DEX CON INT WIS CHA
1042) © 121) 13 (ela 1 20ely 15 (+2) | ATTEO)

Skills Medicine +4, Nature +3, Perception +4
Senses passive Perception 14

Languages Common, Druidic, Elven
Challenge 2 (450 XP)

Spellcasting. The druid is a 4th-level spellcaster. Its spellcasting
ability is Wisdom (spell save DC 12, +4 to hit with spell attacks). It
has the following druid spells prepared:

Cantrips (at will): druidcraft, produce flame, shillelagh
1st level (4 slots): entangle, faerie fire, healing word, thunderwave
2nd level (3 slots): flaming sphere, barkskin

Actions

Quarterstaff Melee Weapon Attack: +2 to hit (or +4 to hit with
shillelagh), reach 5 ft., one creature. Hit: 3 (1d6) bludgeoning
damage, or 4 (1d8) if used with two hands, or 6 (1d8 + 2)
bludgeoning damage with shillelagh.

Longbow. Ranged Weapon Attack: +5 to hit, range 150/600 ft., one


In [62]:
%%time
messages = [ 
    # {"role": "system", "content": "You correct typos and OCR errors in text. Correct the text given by user without changing its content."}, 
    {"role": "user", "content": "Is there a D&D NPC description in the following paragraph? If so, extract the information and print. Do not include anything about other characters, or any other inforamation." + '\n'.join(pages['Grasp of The Emerald Claw'][16])}, 
] 
output = pipe(messages, **generation_args)
print(output[0]['generated_text'])

 NPC Description:

Captain Chinxero:
- Race: Human
- Class: Fighter
- Level: 5
- CR: 5
- Hit Points: 42
- Armor Class: 16
- Speed: 20 ft.
- Attack: +10 melee (1d10+5/19-20, master-work bastard sword) or +8 ranged (1d8+3, masterwork composite longbow)
- Senses: None
- Languages: Common, Giant
- Skills: Bluff +2, Intimidate +6, Profession (riverboat captain) +3, Sense Motive +4
- Feats: Improved Initiative, Power Attack, Weapon Focus (bastard sword), Weapon Specialization (bastard sword)
- Possessions: Full plate, heavy steel shield, masterwork bastard sword, masterwork composite longbow (+3 Str bonus), 10 arrows (cold iron, silvery), 10 dagger
- Sailors:
  - Race: Human
  - Class: Warrior
  - Level: 1/2
  - Hit Points: 5
  - Armor Class: 10
  - Speed: 30 ft.
  - Attack: +4 melee (1d4+2, dagger)
  - Senses: None
  - Languages: Common
  - Skills: Climb +4, Jump +3, Swim +3
  - Feats: Improved Initiative, Weapon Focus (dagger)
  - Possessions: dagger

Part Three:

RIDING THE MARLOW

The Ma



 NPC Description:

Captain Chinxero:
- Race: Human
- Class: Fighter
- Level: 5
- CR: 5
- Hit Points: 42
- Armor Class: 16
- Speed: 20 ft.
- Attack: +10 melee (1d10+5/19-20, master-work bastard sword) or +8 ranged (1d8+3, masterwork composite longbow)
- Senses: None
- Languages: Common, Giant
- Skills: Bluff +2, Intimidate +6, Profession (riverboat captain) +3, Sense Motive +4
- Feats: Improved Initiative, Power Attack, Weapon Focus (bastard sword), Weapon Specialization (bastard sword)
- Possessions: Full plate, heavy steel shield, masterwork bastard sword, masterwork composite longbow (+3 Str bonus) with 10 arrows (10 cold iron arrows, 10 silver arrows)
- Sailors:
  - Race: Human
  - Class: Warrior
  - Level: 1/2
  - Hit Points: 5
  - Armor Class: 10
  - Speed: 30 ft.
  - Attack: +1 melee (1d4+2, dagger)
  - Senses: None
  - Languages: Common
  - Skills: Climb +4, Jump +3, Swim +3
  - Feats: Improved Initiative, Weapon Focus (dagger)
- Development: Once all preparations are made, Capt

In [31]:
messages = [ 
    {"role": "system", "content": "You correct typos and OCR errors in text. Correct the text given by user without changing its content."}, 
    {"role": "user", "content": "Correct the following text. Fix errors from OCR. Remove tables and character stats. Print the corrected text without tables or character stats, do not include anything else." + '\n'.join(pages['Grasp of The Emerald Claw'][16])}, 
] 


In [32]:
%%time
output = pipe(messages, **generation_args)
print(output[0]['generated_text'])

 Corrected text:

PART THREE:

RIDING THE MARLOW

The Marlow is an elemental-powered riverboat, one of the only such ships in use at Stormreach. Chinxero is a privateer, with no connection to any of the dragonmarked houses, although he works for many of them from time to time. The Marlow has a complement of ten sailors.

Captain Chinxero: Male human fighter 5; CR 5; Medium humanoid; HD 5d10+15; hp 42; Init +6; Spd 20 ft.; AC 16, touch 10, flat-footed 16; Grp +7; Atk or Full Atk +10 melee (1d10+5/19-20, master-work bastard sword) or +8 ranged (1d8+3, masterwork composite longbow); SA —; SQ —; AL N; SV Fort +8, Ref +2, Will +3; Str 16, Dex 11, Con 16, Int 10, Wis 12, Cha 10.

Skills and Feats: Bluff +2, Intimidate +6, Profession (riverboat captain) +3, Sense Motive +4; Exotic Weapon Proficiency (bastard sword), Improved Initiative, Power Attack, Weapon Focus (bastard sword), Weapon Specialization (bastard sword).

Languages: Common, Giant.

Possessions: full plate, heavy steel shield, ma

In [37]:
messages = [ 
    {"role": "system", "content": "You correct typos and OCR errors in text. Correct the text given by user without changing its content."}, 
    {"role": "user", "content": "Is there a D&D NPC description in the following paragraph? If so, extract the information and print. Do not include anything about other characters, or any other inforamation." + '\n'.join(pages['1920353-AE01-07_-_The_Silvered_Edge_of_Twilight_1.3'][27])}, 
] 


In [38]:
%%time
output = pipe(messages, **generation_args)
print(output[0]['generated_text'])

 Ashbound Druid:
- Medium humanoid (any race, shapechanger), neutral evil
- Armor Class 11 (16 with barkskin)
- Hit Points 27 (5d8 + 5)
- Speed 30 ft.
- STR DEX CON INT WIS CHA
- Skills Medicine +4, Nature +3, Perception +4
- Senses passive Perception 14
- Languages Common, Druidic, Elven
- Challenge 2 (450 XP)
- Spellcasting: Cantrips (at will): druidcraft, produce flame, shillelagh; 1st level (4 slots): entangle, faerie fire, healing word, thunderwave; 2nd level (3 slots): flaming sphere, barkskin
- Quarterstaff Melee Weapon Attack: +2 to hit (or +4 to hit with spell attacks)
- Longbow Ranged Weapon Attack: +5 to hit
- Wild Shape (Recharges after a Short or Long Rest): Assumes the shape of a giant eagle, retains its alignment, Intelligence, Wisdom, and Charisma scores, and known languages, but uses the eagle's statistics, including its hit points, and cannot cast spells in eagle form.

Hairy Wolf:
- Medium beast, neutral evil
- Armor Class 13 (natural armor)
- Hit Points 20 (3d8 + 6)

In [46]:
messages = [ 
    {"role": "system", "content": "You correct typos and OCR errors in text. Correct the text given by user without changing its content."}, 
    {"role": "user", "content": "Is there a D&D NPC description in the following paragraph? If so, extract the information and print. Do not include anything about other characters, or any other inforamation." + '\n'.join(pages['1920353-AE01-07_-_The_Silvered_Edge_of_Twilight_1.3'][27])}, 
] 


In [47]:
%%time
output = pipe(messages, **generation_args)
print(output[0]['generated_text'])

 Ashbound Druid:
- Medium humanoid (any race, shapechanger), neutral evil
- Armor Class 11 (16 with barkskin)
- Hit Points 27 (5d8 + 5)
- Speed 30 ft.
- STR DEX CON INT WIS CHA
- Skills Medicine +4, Nature +3, Perception +4
- Senses passive Perception 14
- Languages Common, Druidic, Elven
- Challenge 2 (450 XP)
- Spellcasting: Cantrips (at will): druidcraft, produce flame, shillelagh; 1st level (4 slots): entangle, faerie fire, healing word, thunderwave; 2nd level (3 slots): flaming sphere, barkskin
- Quarterstaff Melee Weapon Attack: +2 to hit (or +4 to hit with spell attacks)
- Longbow Ranged Weapon Attack: +5 to hit
- Wild Shape (Recharges after a Short or Long Rest): Assumes the shape of a giant eagle, retains its alignment, Intelligence, Wisdom, and Charisma scores, and known languages, but uses the eagle's statistics, including its hit points, and cannot cast spells in eagle form.

Hairy Wolf:
- Medium beast, neutral evil
- Armor Class 13 (natural armor)
- Hit Points 20 (3d8 + 6)

In [72]:
starting_pages

{'D&D 3E Eberron Campaign Setting': {7: ('The Tone of Eberron', None),
  8: ('TheWorld', None),
  11: ('Character Races', 1),
  14: ('Cnomes', 1),
  15: ('Half-Elves', 1),
  16: ('Half Ores', 1),
  24: ('Region of Origin', 1),
  27: ('Vital Statistics', 1),
  29: ('Character Classes', 2),
  33: ('Barbarian', 2),
  36: ('Druidi', 2),
  38: ('Fighter', 2),
  39: ('Paladin', 2),
  40: ('Psionic Classes', 2),
  41: ('Ranger', 2),
  45: ('Action Points', 3),
  47: ('Feats', 3),
  62: ('Dragonmarks', 3),
  67: ('Religion', 3),
  0: ('Horrid Animal', 11),
  73: ('Dragonmark Heir', 4),
  74: ('Eldeen Ranger', 4),
  77: ('Exorcist of the Silver Flame', 4),
  79: ('Extreme Explorer', 4),
  80: ('Heir of Siberys', 4),
  82: ('Master Inquisitive', 4),
  83: ('Warforged Juggernaut', 4),
  85: ('Weretouched Master', 4),
  89: ('Magic in the World', 5),
  92: ('Planes of Existence', 5),
  100: ('Outsiders in Eberron', 5),
  119: ('Adventuring Equipment', 6),
  120: ('Special Substances and Items', 6)

In [73]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
12288 MiB, 2178 MiB, 9938 MiB


# Second Pass - Organize by section

In [None]:
for file_count, file_name in enumerate(tqdm(os.listdir(CORPUS_FOLDER))):
    
    if file_name[-3:] != '.md':
        continue
    corpus[file_name[-3:]] = []
    toc[file_name[:-3]] = []

    file_path = os.path.join(CORPUS_FOLDER, file_name)

    with open(file_path) as f:
        document = f.read()
        lines = document.split('\n')
        line_count = len(lines)
        current_page_content = ""
        for line_no, line in enumerate(lines):
            previous_line = lines[line_no - 1].strip() if line_no > 0 else '\0'
            next_line = lines[line_no + 1].strip() if line_no < line_count - 1 else '\0'
            current_line = line.strip()

            if not current_line:
                continue

            m = re.match(r'##\s+Page\s+([0-9]+)', line)
            if m:
                if is_current_page_toc:
                    toc[file_name[:-3]].append(current_page_content)
                current_page = int(m[1])
                count_toc_lines = 0
                current_page_content = []
                is_current_page_toc = False
                continue


### First Pass (Fast): Split Into Sections

In [5]:
def is_integer_string(value):
    """
    Check if the given value (string) represents an integer.
    
    :param value: The value to check.
    :return: True if the string represents an integer, False otherwise.
    """
    try:
        int(value)
        return True
    except (ValueError, TypeError):
        return False

In [6]:
title_fixes = {
    '881665-eberron_cannith_cat2': 'House Cannith Catalogue #2'
}

In [7]:
disallowed_section_titles = {'Actions', 'Reactions', 'Cannith Catalogue 2', 'THREAT DISPATCH', 'Legendary Actions', 'Wizard Level Feature'
                             'Languages —', '-', 'Finesse', 'Weight Properties', 'Player’s Handbook','Spell Resistance:', 'Capital: Korth',
                             'Combat', 'Capital: Rhukaan Draal', 'STR DEX CON INT WIS CHA', 'Favor', 'DIALECTS', 'WIS CHA',
                             'CLASS FEATURES',
}
disallowed_section_title_regexp = [
    r'Skills\s+.+\+[0-9].*',
    r'Saving Throws\s+.+\+[0-9].*',
    r'.*\-level.*feature',
    r'Languages.*Common.*',
    r'^[0-9\s.\(\)]+$',
    r'Hit Points\s+[0-9]+.*'
    r'Challenge\s+[0-9]+.*',
    r'Damage Immunities.*',
    r'Damage Resistances.*',
    r'Level Adjustment:.*',
    r'Challenge Rating:.*',
    r'Initiative:.*',
    r'Treasure:.*',
    r'Environment: .*',
    r'Skills: .*',
    r'Feats: .*',
    r'Organization: .*',
    r'Base Atk .*',
    r'Base Attack .*',
    r'Special Attacks: .*',
    r'Range: .*',
    r'Spell Resistance: .*',
    r'Graft Location: .*',
    r'Weight: .*',
    r'Light: .*\.',
    r'Scripts: .*',
    r'Script: .*',
    r'Speakers: .*',
    r'Format: .*',
    r'Knowledge ([a-z]+)',
    r'[0-9](st|nd|th).*level.*',
    r'.*KORRANBERG CHRONICLE: THREAT DISPATCH',
    r'SIDEBAR: .*',
    r'WIZARDS OF EBERRON: FIVE ARCANE TRADITIONS [0-9]+.*',
    r'.*DUNGEON DECEMBER 2004',
    r'.*[cves]\s+[0-9]+',
]

In [8]:
ignore_list = {'Eberron Character Sheet', 'New & Expanded Feat List', '476764-sample', 'SharnIndexIntegrated'}

In [9]:
sections = []
section_metadata = []
book_count = 0

for file_count, file_name in enumerate(tqdm(os.listdir(CORPUS_FOLDER))):
    if file_name[-3:] != '.md':
        continue
    if file_name[:-3] in ignore_list:
        continue
    book_title = file_name[:-3]
    pdf_title = metadata_dict[file_name[:-3]]['pdf/title']
    edition = metadata_dict[file_name[:-3]]['edition']
    if pdf_title:
        if book_title.endswith('sample'):
            book_title = pdf_title
    book_title = title_fixes.get(book_title, book_title)
    current_page = 0
    current_section_title = ''
    current_section_lines = []
    empty_line_ctr = 0
    file_path = os.path.join(CORPUS_FOLDER, file_name)
    with open(file_path) as f:
        document = f.read()
        lines = document.split('\n')
        line_count = len(lines)
        for line_no, line in enumerate(lines):
            previous_line = lines[line_no - 1].strip() if line_no > 0 else '\0'
            next_line = lines[line_no + 1].strip() if line_no < line_count - 1 else '\0'
            current_line = line.strip()

            if not current_line:
                continue

            m = re.match(r'##\s+Page\s+([0-9]+)', line)
            if m:
                current_page = int(m[1])
                continue

            current_section_word_count = len(" ".join(current_section_lines).split(' '))
    
            if previous_line == '' and (next_line == '' or next_line.startswith('Medium') or next_line.startswith('Large') or next_line.startswith('District Type')):
                may_be_section_title = True
                if current_line.upper() != current_line and string.capwords(current_line) != current_line:
                    may_be_section_title = False
                if current_section_title.lower() == current_line.lower():
                    may_be_section_title = False
                if may_be_section_title and is_integer_string(current_line):
                    may_be_section_title = False
                if may_be_section_title and current_line in disallowed_section_titles:
                    may_be_section_title = False
                if len(re.sub(r'[^a-zA-Z\.]', '', current_line)) < 4:
                    may_be_section_title = False
                if ',' in current_line:
                    may_be_section_title = False
                if may_be_section_title:
                    for regexp in disallowed_section_title_regexp:
                        m = re.match(regexp, current_line, re.IGNORECASE)
                        if m:
                            may_be_section_title = False

                if may_be_section_title:
                    if current_section_title:
                        if current_section_lines and current_section_word_count > 15:
                            text = " ".join(current_section_lines)
                            if len(text.split(' ')) > 5:
                                sections.append(text)
                                section_metadata.append({
                                    'book_title': book_title,
                                    'file_name': file_name,
                                    'edition': edition,
                                    'section_title': current_section_title,
                                    'section_pages': current_section_pages,
                                    'initial_word_count': current_section_word_count
                                })
                    current_section_title = current_line
                    current_section_lines = []
                    current_section_pages = (current_page, current_page)
                    continue

            if current_section_word_count > 450:
                end = line_no + 10
                start = line_no - len(current_section_lines) + 10
                text = "\n".join([l for l in lines[start:end] if l and not l.startswith('## Page ')])
                sections.append(text)
                section_metadata.append({
                    'book_title': book_title,
                    'file_name': file_name,
                    'edition': edition,
                    'section_title': current_section_title,
                    'section_pages': current_section_pages,
                    'initial_word_count': len(text.split(' '))
                })
                current_section_lines = []
                current_section_pages = (current_page, current_page)
                continue

            if current_section_title and current_line:
                current_section_lines.append(current_line)
                current_section_pages = (current_section_pages[0], current_page)
        if current_section_lines:
            text = ' '.join(current_section_lines)
            sections.append(text)
            section_metadata.append({
                'book_title': book_title,
                'file_name': file_name,
                'edition': edition,
                'section_title': current_section_title,
                'section_pages': current_section_pages,
                'initial_word_count': len(text.split(' '))
            })
section_count = len(sections)
assert len(section_metadata) == section_count
assert section_metadata[1]['section_title'] == 'Litmus Strips'
assert section_metadata[11]['section_title'] == 'Automatic Chatelaine'
section_titles = {d['section_title'] for d in section_metadata}
assert 'Automatic Chatelaine' in section_titles
assert 'HORRID HYENA' in section_titles
assert 'HORRID BADGER' in section_titles
assert 'NAZTHARUNE RAKSHASA' in section_titles
assert 'CRYSTEEL' in section_titles
assert 'DENDRITIC' in section_titles
assert 'IRONBARK' in section_titles
assert 'DARKLEAF' in section_titles
assert 'KNIGHT PHANTOM' in section_titles
assert 'SHARN SKYMAGE' in section_titles
assert 'WEAPONS OF KHORVAIRE' in section_titles
assert 'ADVENTURING GEAR' in section_titles
assert '7. The Library' in section_titles
assert '8. The Entry Hall' in section_titles
assert 'THE ORIGIN OF THE FIVE NATIONS' in section_titles
assert 'AUNDAIR AT A GLANCE' in section_titles
assert 'THE COMING OF GALIFAR' in section_titles
assert 'Highhold' in section_titles
assert 'PERIPLANAR OF ICE ~ PERISIAN' in section_titles
assert 'HOUSE THARASHK' in section_titles
assert 'ARGON' in section_titles
assert 'METRON' in section_titles
assert 'DARKLEAF' in section_titles
assert 'DARKLEAF' in section_titles
section_count, book_title

  0%|          | 0/127 [00:00<?, ?it/s]

(21118, 'Magic of Eberron')

In [10]:
# TODO: For second pass, delete:
# Contents
# Thanks
# CREDITS


In [11]:
ls -al /jupyterlab/models/hf/hub

total 56
drwxr-xr-x 13 root root 4096 Feb  4 20:42 [0m[01;34m.[0m/
drwxr-xr-x  4 root root 4096 Jan  3 21:52 [01;34m..[0m/
drwxr-xr-x 12 root root 4096 Feb  4 20:42 [01;34m.locks[0m/
drwxr-xr-x  6 root root 4096 Jan  3 04:32 [01;34mmodels--Alibaba-NLP--gte-base-en-v1.5[0m/
drwxr-xr-x  5 root root 4096 Jan  3 04:32 [01;34mmodels--Alibaba-NLP--new-impl[0m/
drwxr-xr-x  6 root root 4096 Feb  4 20:22 [01;34mmodels--BAAI--bge-large-en-v1.5[0m/
drwxr-xr-x  6 root root 4096 Feb  4 20:23 [01;34mmodels--HIT-TMG--KaLM-embedding-multilingual-mini-instruct-v1.5[0m[K/
drwxr-xr-x  6 root root 4096 Feb  4 20:34 [01;34mmodels--intfloat--e5-mistral-7b-instruct[0m/
drwxr-xr-x  5 root root 4096 Feb  4 20:42 [01;34mmodels--jinaai--jina-embeddings-v3[0m/
drwxr-xr-x  6 root root 4096 Jan  4 17:38 [01;34mmodels--mistralai--Mistral-7B-Instruct-v0.3[0m/
drwxr-xr-x  6 root root 4096 Feb  3 18:45 [01;34mmodels--mistralai--Mistral-Small-24B-Instruct-2501[0m/
drwxr-xr-x  6 root root 4096 Jan

In [123]:
ls -al /jupyterlab/models/hf/hub/models--mistralai--Mistral-Small-24B-Instruct-2501/snapshots

total 12
drwxr-xr-x 3 root root 4096 Feb  3 18:26 [0m[01;34m.[0m/
drwxr-xr-x 6 root root 4096 Feb  3 18:45 [01;34m..[0m/
drwxr-xr-x 2 root root 4096 Feb  3 18:45 [01;34m20b2ed1c4e9af44b9ad125f79f713301e27737e2[0m/


In [13]:
!df

Filesystem     1K-blocks      Used Available Use% Mounted on
overlay         52416492  30778468  21638024  59% /
tmpfs              65536         0     65536   0% /dev
tmpfs           16193236         0  16193236   0% /sys/fs/cgroup
/dev/nvme2n1   515858840 103498116 412344340  21% /jupyterlab
/dev/nvme0n1p1  52416492  30778468  21638024  59% /etc/hosts
shm                65536         4     65532   1% /dev/shm
tmpfs           31696296        12  31696284   1% /run/secrets/kubernetes.io/serviceaccount
tmpfs           16193236        12  16193224   1% /proc/driver/nvidia
tmpfs           16193236      2120  16191116   1% /run/nvidia-persistenced/socket
tmpfs           16193236         0  16193236   0% /proc/acpi
tmpfs           16193236         0  16193236   0% /sys/firmware


### Embed

In [15]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
ls -al /

In [14]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model_revision = 'fa97f6e7cb1a59073dff9e6b13e2715cf7475ac9'
model_name = 'HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5'
model_revision = '30cf7fd484e8c883443e0632e9a8e9caeffe2830'
model_name = 'BAAI/bge-large-en-v1.5'
model_revision = 'd4aa6901d3a41ba39fb536a557fa166f842b0e09'
# model_name = 'jinaai/jina-embeddings-v3'
# model_revision = None
# model_name = 'intfloat/e5-mistral-7b-instruct'
# model_revision = None
# model_revision = None
model = SentenceTransformer(model_name, trust_remote_code=True, revision=model_revision)
model = model.to("cuda")




In [15]:
embeddings = []
for chunk in tqdm(sections):
    embedding = model.encode(chunk, normalize_embeddings=True)
    embeddings.append(embedding)

  0%|          | 0/21118 [00:00<?, ?it/s]

In [16]:
assert all([embedding.size == embeddings[0].size for embedding in embeddings])
embeddings[0].size

1024

In [17]:
type(embeddings[0])

numpy.ndarray