In [1]:
import os
import json
import pickle
import requests
import random

from tqdm.notebook import tqdm

import lancedb
import numpy as np
import pandas as pd


In [2]:
# Artefact version to evaluate. It is a zero-padded string because the cells
# below compare it lexicographically (e.g. `ARTEFACT_VERSION < '05'`).
ARTEFACT_VERSION = '06'

In [3]:
# Root folder for all artefacts; override it with the ARTEFACT_ROOT_FOLDER
# environment variable (defaults to '/artefact').
ARTEFACT_ROOT_FOLDER = os.environ.get('ARTEFACT_ROOT_FOLDER', '/artefact')
# Folder of the artefact under evaluation: <root>/eberron/v<ARTEFACT_VERSION>.
ARTEFACT_FOLDER = os.path.join(ARTEFACT_ROOT_FOLDER, 'eberron', f'v{ARTEFACT_VERSION}')

# Load the Artefact

In [4]:
# Load the model metadata: the JSON file is preferred, and the pickle file is
# only read when the JSON file does not exist.
try:
    with open(os.path.join(ARTEFACT_FOLDER, 'model_metadata.json'), 'r') as f:
        model_metadata = json.load(f)
except FileNotFoundError:
    # NOTE(review): pickle.load can execute arbitrary code; only load artefacts you trust.
    with open(os.path.join(ARTEFACT_FOLDER, 'model_metadata.pkl'), 'rb') as f:
        model_metadata = pickle.load(f)

# Sanity-check how the embedding model is described. Versions before '05' store
# a 'str' entry starting with 'SentenceTransformer'; later versions store
# string 'name' and 'tag' entries instead.
if ARTEFACT_VERSION < '05':
    assert model_metadata['embedding_model']['str'].startswith('SentenceTransformer')
else:
    assert isinstance(model_metadata['embedding_model']['name'], str)
    assert isinstance(model_metadata['embedding_model']['tag'], str)

In [5]:
# Load the stored embeddings in the format declared by the metadata.
if model_metadata['embedding_format'] == 'pickle':
    # NOTE(review): pickle.load can execute arbitrary code; only load artefacts you trust.
    with open(os.path.join(ARTEFACT_FOLDER, 'embeddings.pkl'), 'rb') as f:
        embeddings = pickle.load(f)
elif model_metadata['embedding_format'] == 'lancedb':
    # The LanceDB database lives in the 'embeddings' folder; the vectors are in
    # its 'documents' table.
    embeddings_folder = os.path.join(ARTEFACT_FOLDER, 'embeddings')
    db = lancedb.connect(embeddings_folder)
    table = db.open_table('documents')
else:
    # Fail here with a clear message instead of a NameError on `table` /
    # `embeddings` in a later cell.
    raise ValueError(
        f"Unsupported embedding_format: {model_metadata['embedding_format']!r} "
        "(expected 'pickle' or 'lancedb')"
    )

In [6]:
# Legacy artefacts only: load the per-chunk metadata from its JSON file.
if model_metadata['version'] < '02':
    # I used a JSON to store metadata only in artefact v01. v02 onwards, they are in lancedb fields
    with open(os.path.join(ARTEFACT_FOLDER, 'chunk_metadata.json'), 'r') as f:
        chunk_metadata = json.load(f)


In [7]:
# Legacy artefacts only: read every '.md' file under 'chunks/' into a list.
if model_metadata['version'] < '02':
    file_names = [f for f in os.listdir(os.path.join(ARTEFACT_FOLDER, 'chunks')) if f.endswith('.md')]
    file_names = sorted(file_names)
    # Pre-size the list so each chunk can be stored at the index given by its file name.
    chunks = [None] * len(file_names)
    for file_name in tqdm(file_names):
        file_path = os.path.join(ARTEFACT_FOLDER, 'chunks', file_name)
        with open(file_path, 'r') as f:
            # The part of the file name before the first '.' is parsed as the list index.
            chunks[int(file_name.split('.')[0])] = f.read()

# Load the Embedding Model

In [10]:
def get_embeddings_vector(text: str, base_url: str = "http://ollama:11434", timeout=None) -> np.ndarray:
    """
    Embed a single text with the embedding model named in `model_metadata`.

    The request is sent to the `/api/embed` endpoint of the server at `base_url`,
    using the model `<name>:<tag>` taken from `model_metadata['embedding_model']`.

    Args:
        text (str): The text to embed.
        base_url (str, optional): Base URL of the embedding server. The default keeps
            the original host. NOTE(review): other cells in this notebook talk to
            "http://embed:11434" — confirm which host is correct.
        timeout (float, optional): Seconds to wait for the server, passed to
            `requests.post`. The default (None) waits indefinitely, as before.

    Returns:
        np.ndarray: The first embedding returned by the server.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the response contains no embeddings.
    """
    model_spec = model_metadata['embedding_model']
    response = requests.post(
        f"{base_url}/api/embed",
        json={
            "model": model_spec['name'] + ':' + model_spec['tag'],
            "options": {
                "use_mmap": False
            },
            "input": text,
        },
        timeout=timeout,
    )
    # Surface HTTP failures here instead of failing later on a missing field.
    response.raise_for_status()
    embeddings = response.json().get('embeddings')
    if not embeddings:
        raise ValueError("The embedding server returned no 'embeddings' for the given input.")
    return np.array(embeddings)[0]

# Evaluate

In [8]:
# TODO: DO NOT CHANGE. Move to a common library.
# NOTE: both variants accept `k` and `metric`, because the evaluation loops in
# this notebook call `func(q, embedding_model, k=k)` regardless of the version.
if ARTEFACT_VERSION < '05':
    # The annotation is a string so that defining the function does not require
    # SentenceTransformer to be imported already.
    def retrieve_baseline(q: str, model: 'SentenceTransformer', k: int = 5, metric='cosine') -> pd.DataFrame:
        """
        Retrieve the top-k most similar documents from the LanceDB `table` for a query,
        embedding the query locally with a SentenceTransformer model.

        Args:
            q (str): The query string for which similar documents are to be retrieved.
            model (SentenceTransformer): The model used to generate the query embedding.
            k (int, optional): The number of top similar documents to retrieve. Default is 5.
            metric: Distance metric passed to the LanceDB search: cosine, l2, dot or hamming.

        Returns:
            pd.DataFrame: The top-k rows returned by the vector search, converted to pandas.

        Example:
            query = "What is the history of Eberron?"
            model = SentenceTransformer('all-MiniLM-L6-v2')
            results = retrieve_baseline(query, model)
            print(results)
        """
        embedding = model.encode(q, normalize_embeddings=True)
        results = table.search(embedding).metric(metric)

        return results.limit(k).to_pandas()
else:
    def retrieve_baseline(q: str, _: None = None, k: int = 5, metric='cosine') -> pd.DataFrame:
        """
        Retrieve the top-k most similar documents from the LanceDB `table` for a query,
        embedding the query through the HTTP embedding service.

        Args:
            q (str): The query string for which similar documents are to be retrieved.
            _ : Unused; kept so both variants can be called as `retrieve_baseline(q, model)`.
            k (int, optional): The number of top similar documents to retrieve. Default is 5.
            metric: Distance metric passed to the LanceDB search. Default is 'cosine'.

        Returns:
            pd.DataFrame: The top-k rows returned by the vector search, converted to pandas.

        Raises:
            requests.HTTPError: If the embedding service answers with an error status.
        """
        response = requests.post("http://embed:11434/api/embed",
                         json={
                             "model": model_metadata['embedding_model']['name'] + ':' + model_metadata['embedding_model']['tag'],
                              "options": {
                                  "use_mmap": False
                              },
                             "input": q,
                         })
        # Fail with the HTTP error itself rather than on a missing 'embeddings' field.
        response.raise_for_status()
        embeddings = response.json().get('embeddings')
        results = table.search(np.array(embeddings[0])).metric(metric)

        return results.limit(k).to_pandas()

In [9]:
%timeit
if ARTEFACT_VERSION >= '05':
    embedding_model = None
results = retrieve_baseline("Tell me about the languages of Eberron.", embedding_model)
# TODO: DEbug this error, it is actually a Kubernetes error.

[2025-05-09T00:51:43Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-05-09T00:51:43Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-05-09T00:51:43Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-05-09T00:51:46Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.


In [11]:
# Display the rows retrieved by the most recent retrieve_baseline call stored in `results`.
results

Unnamed: 0,text,vector,metadata,book_title,edition,pdf_book_title,pdf_book_author,book_category,file_name,initial_word_count,page_start,page_end,section_title,toc_title,chapter,_distance
0,STANDARD LANGUAGES OF EBERRON\n\nLanguage Main...,"[0.0033499573, -0.041267857, 0.0026151466, -0....","{'book_category': 'eberron_5e_canon', 'book_ti...",Eberron_ Rising From the Last War - Jeremy Cra...,5e,Eberron: Rising From the Last War,Jeremy Crawford & James Wyatt & Keith Baker,eberron_5e_canon,Eberron_ Rising From the Last War - Jeremy Cra...,96,7,7,STANDARD LANGUAGES OF EBERRON,MageWTight c,0.0,0.234881
1,CHAPTER 2 | CULTURES OF EBERRON \n,"[0.008676128, 0.10729609, 0.003150241, 0.03256...","{'book_category': 'eberron_5e_kanon', 'book_ti...",831833-Morgrave_Miscellany_2020_Full-Res,5e,,,eberron_5e_kanon,831833-Morgrave_Miscellany_2020_Full-Res.md,448,117,119,CHAPTER 2 | CULTURES OF EBERRON,Appenpix: Know Your FACULTY,4.0,0.23577
2,CHAPTER 2 | CULTURES OF EBERRON \n,"[0.008676128, 0.10729609, 0.003150241, 0.03256...","{'book_category': 'eberron_5e_kanon', 'book_ti...",831833-Morgrave_Miscellany_2020_Full-Res,5e,,,eberron_5e_kanon,831833-Morgrave_Miscellany_2020_Full-Res.md,1359,113,116,CHAPTER 2 | CULTURES OF EBERRON,Appenpix: Know Your FACULTY,4.0,0.23577
3,CHAPTER 2 | CULTURES OF EBERRON \n,"[0.008676128, 0.10729609, 0.003150241, 0.03256...","{'book_category': 'eberron_5e_kanon', 'book_ti...",831833-Morgrave_Miscellany_2020_Full-Res,5e,,,eberron_5e_kanon,831833-Morgrave_Miscellany_2020_Full-Res.md,763,129,131,CHAPTER 2 | CULTURES OF EBERRON,Appenpix: Know Your FACULTY,4.0,0.23577
4,INTRODUCTION\n\nMuch as Exploring Eberron is p...,"[0.0026689493, 0.0217771, -0.03418862, 0.01263...","{'book_category': 'my_eberron', 'book_title': ...",1598836-Languages_of_Eberron_2E,,Languages of Eberron 2E mk iii,,my_eberron,1598836-Languages_of_Eberron_2E.md,170,3,3,INTRODUCTION,,,0.274883


## Define the Test

In [101]:

# TODO: Add TF-IDF ReRanker
# table.create_fts_index('text')
# ImportError: Please install tantivy-py `pip install tantivy` to use the full text search feature.

# TODO: Do the TF-IDF MVP++ from Ben Clavié

In [138]:
# Each entry is (query, condition). The condition receives one retrieved row and
# returns True when that row counts as relevant for the query.
evaluation_queries_and_conditions = [
    ("Tell me about the languages of Eberron.", lambda x: 'language' in x['book_title'].lower() or 'language' in x['section_title'].lower() ),
    ("Who is Professor Dash Dannigan?", lambda x: 'dannigan' in x['text'].lower() ),
    ("Who is Commander Iyanna?", lambda x: 'iyanna' in x['text'].lower() ),
    ("Tell me about Menthis Plateau.", lambda x: 'menthis' in x['book_title'].lower() or 'menthis' in x['section_title'].lower() or 'menthis' in x['text'].lower()),
    ("Tell me about Eldeen Reaches.", lambda x: 'eldeen' in x['book_title'].lower() or 'eldeen' in x['section_title'].lower() or 'eldeen' in x['text'].lower()),
    # ("Tell me about the rivers of Khorvaire.", lambda x: 'eldeen' in x['book_title'].lower() or 'eldeen' in x['section_title'].lower() or 'eldeen' in x['text'].lower()),
    # file_name values carry the '.md' suffix, so the suffixed name must be accepted;
    # the bare name is still accepted for backward compatibility.
    ("Tell me about fashion in Khorvaire.", lambda x: x['file_name'] in ('496317-sample', '496317-sample.md') or 'fashion' in x['section_title'].lower() or 'clothing' in x['section_title'].lower()),
    ("Create a House Cannith item.", lambda x: 'cannith' in x['file_name'].lower() or 'cannith' in x['section_title'].lower() or 'making' in x['section_title'].lower()),
    ("Which one is the weakest of the quori?", lambda x: 'TSOREVA' in x['text'].upper() or 'quor' in x['text'].lower() or 'quor' in x['section_title'].lower() or 'quor' in x['book_title'].lower()),
    ("What are dolgrims?", lambda x: 'dolgrim' in x['text'].lower() or 'dolgrim' in x['section_title'].lower()),
    ("What is the name of the prominent university in Sharn?",  lambda x: 'morgrave' in x['text'].lower()),
]
# Give me some professor names in Morgrave.
# What type of magic does Karrnath use in its military?

In [139]:
# Score the retriever on every (query, condition) pair for several cut-offs k
# and record the results in model_metadata under the retriever's function name.
func = retrieve_baseline
model_metadata[func.__name__] = {}
for k in [5, 10, 25]:
    scores = []
    for q, cond in evaluation_queries_and_conditions:
        results = func(q, embedding_model, k=k)
        # Count how many of the returned rows satisfy the query's condition.
        score = 0
        for _, row in results.iterrows():
            score += cond(row)
        scores.append(score)
    # Matching rows divided by k, averaged over all queries.
    # NOTE(review): this is a mean precision@k although the key says 'MAP@k' — confirm the intended metric.
    model_metadata[func.__name__][f'MAP@{k}'] = (np.array(scores) / k).mean()

[2025-01-20T03:09:54Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:09:54Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:09:54Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:09:54Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:09:54Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported config

In [13]:
# Spot-check a single query by eye.
# NOTE(review): "Give ma" looks like a typo for "Give me"; top_row_evaluation uses
# the same spelling, so change both together if you fix it.
retrieve_baseline("Give ma an item from the House Cannith Catalogue.", embedding_model)

[2025-05-09T00:53:03Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-05-09T00:53:03Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-05-09T00:53:03Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-05-09T00:53:03Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.


Unnamed: 0,text,vector,metadata,book_title,edition,pdf_book_title,pdf_book_author,book_category,file_name,initial_word_count,page_start,page_end,section_title,toc_title,chapter,_distance
0,Cannith Catalogue\n\nThe following sections de...,"[0.033253603, 0.029232005, -0.0035685813, -0.0...","{'book_category': 'eberron_5e_homebrew', 'book...",881665-cc_fin,5e,MergedFile,Monica Rizzi,eberron_5e_homebrew,881665-cc_fin.md,31,2,2,Cannith Catalogue,,,0.457648
1,MAGIC ITEMS\n\nHouse Cannith artificers and ot...,"[-0.022306444, -0.0056243427, 0.015578911, 0.0...","{'book_category': 'eberron_3e', 'book_title': ...",Sharn - City of Towers,3e,Acr61.tmp,,eberron_3e,Sharn - City of Towers.md,31,170,170,MAGIC ITEMS,Dynamic Setting,0.0,0.467527
2,MAKING MY HOUSE BUILT THE MODERN WORLD. ORIEN ...,"[0.052608512, 0.03588805, -0.04046229, 0.01605...","{'book_category': 'eberron_5e_canon', 'book_ti...",Eberron_ Rising From the Last War - Jeremy Cra...,5e,Eberron: Rising From the Last War,Jeremy Crawford & James Wyatt & Keith Baker,eberron_5e_canon,Eberron_ Rising From the Last War - Jeremy Cra...,295,46,46,MAKING,MageWTight c,0.0,0.469044
3,Cannith South Additional Benefits®\n\nDC' Bene...,"[-0.006655756, -0.044040617, -0.007319519, 0.0...","{'book_category': 'eberron_3e', 'book_title': ...",Dragonmarked,3e,,,eberron_3e,Dragonmarked.md,63,22,22,Cannith South Additional Benefits®,in the Modern Day,31.0,0.483155
4,"HOUSE CANNITH\n\nHouse Cannith, consisting of ...","[0.00910592, 0.038783737, -0.009571925, -0.023...","{'book_category': 'eberron_3e', 'book_title': ...",Eberron Campaign Setting,3e,Eberron - Campaign Setting,,eberron_3e,Eberron Campaign Setting.md,674,234,235,HOUSE CANNITH,Conclusion,4.0,0.50178


In [136]:
def ranking_evaluation(column: pd.Series, expected_values: list):
    """
    Count how many leading entries of `column` are among `expected_values`.

    Counting stops at the first entry that is not expected, so the result is the
    length of the unbroken run of expected values at the top of the column.

    Args:
        column (pd.Series): Values in ranked order (best match first).
        expected_values (list): Values that count as a hit.

    Returns:
        int: Number of consecutive hits from the top; 0 when the first entry is a miss
             or the column is empty.
    """
    ranked_values = column.tolist()
    for position, candidate in enumerate(ranked_values):
        if candidate not in expected_values:
            # `position` equals the number of hits seen before this first miss.
            return position

    # Every entry was expected.
    return len(ranked_values)

In [141]:
def _leading_files_scorer(expected_files):
    """Build a scorer that counts how many top rows of a results frame come from `expected_files`."""
    return lambda frame: ranking_evaluation(frame['file_name'], expected_files)


# Each entry is (query, scorer). The scorer receives the whole results frame and
# returns the number of leading rows whose file_name is one of the expected files.
top_row_evaluation = [
    (
        "Tell me about fashion in Khorvaire.",
        _leading_files_scorer(['496317-sample.md']),
    ),
    (
        "Give ma an item from the House Cannith Catalogue.",
        _leading_files_scorer(['881665-eberron_cannith_cat2.md', '881665-cc_fin.md']),
    ),
]

In [142]:
# For each top-row query, retrieve k rows and apply its scorer to the whole
# results frame; the per-query scores are displayed at the end of the cell.
func = retrieve_baseline
k = 15
scores = []
for q, cond in top_row_evaluation:
    results = func(q, embedding_model, k=k)
    score = cond(results)
    scores.append(score)
# Storing this metric in model_metadata is currently disabled:
# model_metadata[func.__name__][f'MAPQ{k}'] = (np.array(scores) / k).mean()
scores

[2025-01-20T03:11:08Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:11:08Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:11:08Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:11:08Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:11:08Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported config

[1, 2]

## Save the Retrieval Evaluation

In [131]:
# Persist model_metadata (including any evaluation scores stored in it) as a pickle file.
# NOTE(review): the loading cell prefers model_metadata.json over this pickle whenever
# the JSON file exists — confirm the scores written here are actually read back.
with open(os.path.join(ARTEFACT_FOLDER, 'model_metadata.pkl'), 'wb') as f:
	pickle.dump(model_metadata, f)

### Cross-check
### Do Not Remove, Do Not Ignore, Run To Make Sure Things Are There

In [132]:
# The artefact must contain the model metadata in at least one of its two formats.
assert os.path.exists(os.path.join(ARTEFACT_FOLDER, 'model_metadata.pkl')) or os.path.exists(os.path.join(ARTEFACT_FOLDER, 'model_metadata.json'))

In [133]:
# Re-read the pickled metadata and verify that the artefact is complete.
# NOTE(review): pickle.load can execute arbitrary code; only load artefacts you trust.
with open(os.path.join(ARTEFACT_FOLDER, 'model_metadata.pkl'), 'rb') as f:
    model_metadata = pickle.load(f)

# The embedding model is described differently depending on the version, so
# apply the same version-aware check as the loading cell. Asserting on 'str'
# unconditionally raises KeyError for v05+ metadata.
if ARTEFACT_VERSION < '05':
    assert model_metadata['embedding_model']['str'].startswith('SentenceTransformer')
else:
    assert isinstance(model_metadata['embedding_model']['name'], str)
    assert isinstance(model_metadata['embedding_model']['tag'], str)
assert 'version' in model_metadata
assert model_metadata['version'] == ARTEFACT_VERSION
assert 'embedding_format' in model_metadata
# Check the embedding files that belong to the declared format only.
if model_metadata['embedding_format'] == 'lancedb':
    assert os.path.exists(os.path.join(ARTEFACT_FOLDER, 'embeddings'))
    assert os.path.exists(os.path.join(ARTEFACT_FOLDER, 'embeddings', 'documents.lance', 'data'))
elif model_metadata['embedding_format'] == 'pickle':
    assert os.path.exists(os.path.join(ARTEFACT_FOLDER, 'embeddings.pkl'))

# Experiment

In [15]:
# Ask the embedding server which models it has available and display the list.
response = requests.get("http://embed:11434/api/tags")
# Fall back to an empty list when the response has no 'models' entry.
models = response.json().get('models', [])
models

[{'name': 'all-minilm:33m',
  'model': 'all-minilm:33m',
  'modified_at': '2025-04-19T03:35:41.0085611Z',
  'size': 67319908,
  'digest': '4f5da3bd944d9ad1cd3acc7d065ee54367a4c703f51fb6295bd8bc5007ed0c4a',
  'details': {'parent_model': '',
   'format': 'gguf',
   'family': 'bert',
   'families': ['bert'],
   'parameter_size': '33M',
   'quantization_level': 'F16'}},
 {'name': 'mxbai-embed-large:335m',
  'model': 'mxbai-embed-large:335m',
  'modified_at': '2025-04-19T03:35:40.7171194Z',
  'size': 669615493,
  'digest': '468836162de7f81e041c43663fedbbba921dcea9b9fefea135685a39b2d83dd8',
  'details': {'parent_model': '',
   'format': 'gguf',
   'family': 'bert',
   'families': ['bert'],
   'parameter_size': '334M',
   'quantization_level': 'F16'}}]

In [28]:
# Embed one query on the server so it can be compared with the local embedding.
q = "Tell me about the languages of Eberron."

response = requests.post("http://embed:11434/api/embed",
                 json={
                     "model": 'mxbai-embed-large:335m',
                      "options": {
                          "use_mmap": False
                      },
                     "input": q,
                 })
# A bare `response` expression in the middle of a cell displays nothing; check
# the HTTP status explicitly so a failed request is reported here.
response.raise_for_status()
embeddings_from_server = response.json().get('embeddings')
# Display the dimensionality of the returned vector.
len(embeddings_from_server[0])

1024

In [23]:
# Embed the same query `q` locally with SentenceTransformer for comparison with the server.
# NOTE(review): this import sits outside the top import cell — presumably because
# sentence_transformers is only needed for this experiment; confirm before moving it.
from sentence_transformers import SentenceTransformer
# Use the model recorded under 'str' in the metadata, or the default name when that entry is absent.
# NOTE: this rebinds `embedding_model`, which earlier cells pass to retrieve_baseline.
embedding_model = SentenceTransformer(model_metadata['embedding_model'].get('str', 'mixedbread-ai/mxbai-embed-large-v1'), 
                                      trust_remote_code=True)
embedding_model = embedding_model.to("cpu")
embeddings_from_local = embedding_model.encode(q, normalize_embeddings=True)
# Display the shape of the local embedding.
embeddings_from_local.shape

(1024,)

In [31]:
# Display the locally computed embedding.
embeddings_from_local

array([ 0.00167786, -0.02518029, -0.01086697, ..., -0.01688098,
       -0.03808224,  0.00063177], dtype=float32)

In [32]:
# Display the server-computed embedding as an array, to compare by eye with the local one.
np.array(embeddings_from_server[0])

array([ 0.0017244 , -0.02524555, -0.01087736, ..., -0.01686934,
       -0.03814513,  0.00067584])