In [45]:
import os
import json
import pickle

from tqdm.notebook import tqdm

from sentence_transformers import SentenceTransformer
import lancedb
import numpy as np
import pandas as pd


In [30]:
# Version tag of the artefact under evaluation: it selects the `v<version>` sub-folder
# and is compared against the saved metadata in the final cross-check cell.
ARTEFACT_VERSION = '02'

In [93]:
# Root folder holding every artefact; overridable through the ARTEFACT_ROOT_FOLDER environment variable.
ARTEFACT_ROOT_FOLDER = os.environ.get('ARTEFACT_ROOT_FOLDER', '/artefact')
# Folder of the specific Eberron artefact version this notebook evaluates.
ARTEFACT_FOLDER = os.path.join(
    ARTEFACT_ROOT_FOLDER,
    'eberron',
    'v{}'.format(ARTEFACT_VERSION),
)

# Load the Artefact

In [95]:
# Prefer the JSON metadata; fall back to the pickle file when no JSON file exists.
metadata_json_path = os.path.join(ARTEFACT_FOLDER, 'model_metadata.json')
metadata_pickle_path = os.path.join(ARTEFACT_FOLDER, 'model_metadata.pkl')
try:
    with open(metadata_json_path, 'r') as metadata_file:
        model_metadata = json.load(metadata_file)
except FileNotFoundError:
    # NOTE(review): pickle.load can execute arbitrary code; only load artefacts from a trusted location.
    with open(metadata_pickle_path, 'rb') as metadata_file:
        model_metadata = pickle.load(metadata_file)

# Sanity check: the artefact must have been built with a SentenceTransformer embedding model.
embedding_model_repr = model_metadata['embedding_model']['str']
assert embedding_model_repr.startswith('SentenceTransformer')

In [94]:
# Open the stored embeddings in whichever format the artefact metadata declares.
#   'pickle'  -> `embeddings` is loaded from embeddings.pkl
#   'lancedb' -> `table` is the 'documents' table of the LanceDB database in ./embeddings
# `retrieve_baseline` below relies on `table`, so only 'lancedb' artefacts can be evaluated with it.
embedding_format = model_metadata['embedding_format']
if embedding_format == 'pickle':
    # NOTE(review): pickle.load can execute arbitrary code; only load artefacts from a trusted location.
    with open(os.path.join(ARTEFACT_FOLDER, 'embeddings.pkl'), 'rb') as f:
        embeddings = pickle.load(f)
elif embedding_format == 'lancedb':
    embeddings_folder = os.path.join(ARTEFACT_FOLDER, 'embeddings')
    db = lancedb.connect(embeddings_folder)
    table = db.open_table('documents')
else:
    # Fail here, with a clear message, instead of with a NameError on `table`/`embeddings` later on.
    raise ValueError(
        f"Unsupported embedding_format {embedding_format!r}; expected 'pickle' or 'lancedb'."
    )

In [96]:
if model_metadata['version'] == '01':
    # Only artefact v01 keeps the chunk metadata in a JSON file; from v02 onwards it lives in LanceDB fields.
    chunk_metadata_path = os.path.join(ARTEFACT_FOLDER, 'chunk_metadata.json')
    with open(chunk_metadata_path, 'r') as chunk_metadata_file:
        chunk_metadata = json.load(chunk_metadata_file)


In [97]:
if model_metadata['version'] == '01':
    # v01 artefacts store each chunk as `<index>.md`; the numeric file stem is the chunk's
    # position in the `chunks` list.
    chunks_folder = os.path.join(ARTEFACT_FOLDER, 'chunks')
    file_names = sorted(name for name in os.listdir(chunks_folder) if name.endswith('.md'))
    chunks = [None] * len(file_names)
    for file_name in tqdm(file_names):
        with open(os.path.join(chunks_folder, file_name), 'r') as chunk_file:
            chunk_text = chunk_file.read()
        chunks[int(file_name.split('.')[0])] = chunk_text

# Load the Embedding Model

In [98]:
# Instantiate the model (name + revision) recorded in the artefact metadata.
embedding_spec = model_metadata['embedding_model']
embedding_model = SentenceTransformer(
    embedding_spec['name'],
    trust_remote_code=True,
    revision=embedding_spec['revision'],
)
# Run on CPU; switch to the commented line below to use a GPU instead.
embedding_model = embedding_model.to("cpu")
# embedding_model = embedding_model.to("cuda")

# Evaluate

## Example

In [143]:
# TODO: DO NOT CHANGE. Move to a common library.
def retrieve_baseline(q: str, model: SentenceTransformer, k: int=5, metric: str='cosine') -> pd.DataFrame:
    """
    Retrieve the top-k nearest documents for a query string via vector search.

    The query is embedded with `model` (with `normalize_embeddings=True`) and searched
    against the module-level LanceDB `table`. Note that `table` is NOT a parameter: it is
    the global opened in the embeddings-loading cell, so that cell must have run first.

    Args:
        q (str): The query string for which similar documents are to be retrieved.
        model (SentenceTransformer): The model used to generate the embedding for the query.
        k (int, optional): The number of results to request from the search. Default is 5.
        metric (str, optional): Distance metric name handed to the search's `.metric()`
            call, e.g. 'cosine', 'l2', 'dot' or 'hamming'. Default is 'cosine'.

    Returns:
        pd.DataFrame: The search results converted with `.to_pandas()`. In the sample
                      outputs in this notebook the frame holds the stored fields (text, vector,
                      book_title, file_name, ...) plus a `_distance` column; with the default
                      cosine metric the rows appear ordered by ascending distance, i.e.
                      `_distance` is a distance, not a similarity score.

    Example:
        query = "What is the history of Eberron?"
        model = SentenceTransformer('all-MiniLM-L6-v2')
        results = retrieve_baseline(query, model)
        print(results)
    """
    # Embed the query as a normalized vector.
    embedding = model.encode(q, normalize_embeddings=True)
    # Build the vector search against the global `table` with the requested metric.
    results = table.search(embedding).metric(metric)

    # Cap the result count at k and materialise the hits as a DataFrame.
    return results.limit(k).to_pandas()


In [100]:
retrieve_baseline("Tell me about the languages of Eberron.", embedding_model)
# TODO: Debug this error; it is actually a Kubernetes error
# (presumably the lance_core CPU-count warnings printed in the output below - confirm).

[2025-01-20T02:38:54Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T02:38:54Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T02:38:54Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T02:38:54Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.


Unnamed: 0,text,vector,book_title,file_name,edition,section_title,page_from,page_to,initial_word_count,_distance
0,"In Eberron, languages reflect culture and geog...","[-0.029826447, -0.017027168, 0.05794495, 0.011...",Eberron_ Rising From the Last War - Jeremy Cra...,Eberron_ Rising From the Last War - Jeremy Cra...,5e,LANGUAGES,7,7,182,0.225519
1,Common languages in Eberron and their alphabet...,"[-0.0142765865, -0.018694216, 0.0049784207, -0...",Eberron Campaign Setting,Eberron Campaign Setting.md,3e,SPEAK LANGUAGE,49,50,190,0.264524
2,"Speakers: Elementals and fey of Fernia,\nLaman...","[-0.01267345, -0.0069440915, 0.01278992, 0.022...",1598836-Languages_of_Eberron_2E,1598836-Languages_of_Eberron_2E.md,5e,EBERRAL,18,18,91,0.270994
3,Common languages in Eberron and their alphabet...,"[-0.011303049, -0.015604924, -0.0069146133, -0...",D&D 3E Eberron Campaign Setting,D&D 3E Eberron Campaign Setting.md,3e,SPEAK LANGUAGE,49,50,192,0.271466
4,Eberral is the language of Lamannia and the\nn...,"[-0.012463877, 0.0048175147, 0.038165424, -0.0...",1598836-Languages_of_Eberron_2E,1598836-Languages_of_Eberron_2E.md,5e,EBERRAL FAMILY,17,17,30,0.300364


## Define the Test

In [101]:

# TODO: Add TF-IDF ReRanker
# table.create_fts_index('text')
# ImportError: Please install tantivy-py `pip install tantivy` to use the full text search feature.

# TODO: Do the TF-IDF MVP++ from Ben Clavié

In [138]:
def _mentions(term, *fields):
    """Build a row predicate that is True when `term` occurs, case-insensitively, in any of `fields`."""
    needle = term.lower()
    return lambda row: any(needle in row[field].lower() for field in fields)


def _any_of(*predicates):
    """Build a row predicate that is True when at least one of `predicates` accepts the row."""
    return lambda row: any(predicate(row) for predicate in predicates)


# Each entry is (query, condition). `condition` receives one retrieved row (anything indexable
# by column name, e.g. a pandas Series) and returns True when that row counts as relevant.
evaluation_queries_and_conditions = [
    ("Tell me about the languages of Eberron.", _mentions('language', 'book_title', 'section_title')),
    ("Who is Professor Dash Dannigan?", _mentions('dannigan', 'text')),
    ("Who is Commander Iyanna?", _mentions('iyanna', 'text')),
    ("Tell me about Menthis Plateau.", _mentions('menthis', 'book_title', 'section_title', 'text')),
    ("Tell me about Eldeen Reaches.", _mentions('eldeen', 'book_title', 'section_title', 'text')),
    # ("Tell me about the rivers of Khorvaire.", lambda x: 'eldeen' in x['book_title'].lower() or 'eldeen' in x['section_title'].lower() or 'eldeen' in x['text'].lower()),
    (
        "Tell me about fashion in Khorvaire.",
        _any_of(
            # Stored file names carry the '.md' extension, so the bare stem alone never matched.
            # Both spellings are accepted to stay backward-compatible.
            lambda row: row['file_name'] in ('496317-sample', '496317-sample.md'),
            _mentions('fashion', 'section_title'),
            _mentions('clothing', 'section_title'),
        ),
    ),
    (
        "Create a House Cannith item.",
        _any_of(
            _mentions('cannith', 'file_name', 'section_title'),
            _mentions('making', 'section_title'),
        ),
    ),
    (
        "Which one is the weakest of the quori?",
        _any_of(
            _mentions('tsoreva', 'text'),
            _mentions('quor', 'text', 'section_title', 'book_title'),
        ),
    ),
    ("What are dolgrims?", _mentions('dolgrim', 'text', 'section_title')),
    ("What is the name of the prominent university in Sharn?", _mentions('morgrave', 'text')),
]
# Give me some professor names in Morgrave.
# What type of magic does Karrnath use in its military?

In [139]:
# Score the retriever at several cut-offs and store the results in the metadata under its name.
# NOTE(review): the value stored under 'MAP@k' is the mean, over queries, of
# (matching rows among the returned results) / k - i.e. mean precision@k rather than
# mean average precision. The key is kept unchanged for compatibility.
func = retrieve_baseline
model_metadata[func.__name__] = {}
for k in [5, 10, 25]:
    scores = []
    for q, cond in evaluation_queries_and_conditions:
        results = func(q, embedding_model, k=k)
        # Count how many of the returned rows satisfy this query's relevance condition.
        score = sum(cond(row) for _, row in results.iterrows())
        scores.append(score)
    precision_per_query = np.array(scores) / k
    model_metadata[func.__name__][f'MAP@{k}'] = precision_per_query.mean()

[2025-01-20T03:09:54Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:09:54Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:09:54Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:09:54Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:09:54Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported config

In [134]:
# Ad-hoc spot check of the top-10 hits for a House Cannith Catalogue query.
# NOTE(review): "ma" looks like a typo for "me". The identical string is also used in
# `top_row_evaluation`, so change both together - editing it alters the query embedding.
retrieve_baseline("Give ma an item from the House Cannith Catalogue.", embedding_model, k=10)

[2025-01-20T03:00:06Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:00:06Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:00:06Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:00:06Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.


Unnamed: 0,text,vector,book_title,file_name,edition,section_title,page_from,page_to,initial_word_count,_distance
0,Cannith Catalogue 2\nThe following sections de...,"[0.043904938, -0.071870774, 0.06289965, 0.0281...",House Cannith Catalogue #2,881665-eberron_cannith_cat2.md,5e,DAVIDE QUATRINTI'S,1,2,30,0.30467
1,The following sections describe various useful...,"[0.031992488, -0.055226155, 0.07406002, 0.0398...",881665-cc_fin,881665-cc_fin.md,5e,Cannith Catalogue,2,2,27,0.371331
2,In the four years since Cyre hecame the Mournl...,"[0.04190334, -0.059561834, -0.02720324, 0.0461...",Shadows of The Last War,Shadows of The Last War.md,3e,ADVENTURE BACKGROUND,3,3,358,0.377653
3,MY HOUSE BUILT THE MODERN WORLD. ORIEN MAY\ndr...,"[0.037841205, -0.043679465, -0.030823583, 0.03...",Eberron_ Rising From the Last War - Jeremy Cra...,Eberron_ Rising From the Last War - Jeremy Cra...,5e,MAKING,46,46,283,0.398035
4,The leader of the regional enclave of a house ...,"[0.036026288, -0.03159787, 0.015698912, 0.0229...",Keith Baker - The City of Towers (Eberron_ The...,Keith Baker - The City of Towers (Eberron_ The...,3e,The Dragonmarked,219,220,326,0.414918
5,MANY oF us IN House D’CANNITH CARRY ON ARTIFIC...,"[0.010896899, -0.038355052, -0.00015329469, 0....",328949-Artificer_Book_1.0,328949-Artificer_Book_1.0.md,5e,Machine.,6,6,46,0.415166
6,of Mourning has left 1he house divided in\nres...,"[0.030273145, -0.052222442, -0.02130118, 0.008...",Dragonmarked,Dragonmarked.md,3e,TRY REQUIREMENTS,21,22,361,0.415255
7,Z\nN\n5\nQ\n=z\n“\nI\nZz\nQo\n=\nN\nZz\nLe\n©)...,"[0.015966594, -0.03208863, -0.0544327, 0.03742...",D&D 3E Eberron Campaign Setting,D&D 3E Eberron Campaign Setting.md,3e,HOUSE CANNITH,234,235,306,0.419075
8,=\n>\n=\nS\n=\nwn\n231\nnN\n=\n2\n=\n<\n>\n=\n...,"[0.014979331, -0.036476403, -0.048033714, 0.02...",Eberron Campaign Setting,Eberron Campaign Setting.md,3e,HOUSE CANNITH,234,235,306,0.42171
9,"You can speak to Jury d’Cannith at the Anvil, ...","[0.022721272, -0.052742325, -0.00131476, 0.020...",2255601-Frontiers_of_Eberron_Quickstone_v1.0.3,2255601-Frontiers_of_Eberron_Quickstone_v1.0.3.md,5e,SPELLCASTING,116,116,51,0.424834


In [136]:
def ranking_evaluation(column: pd.Series, expected_values: list):
    """
    Count how many leading entries of `column` are expected values.

    Walks the column from the top and stops at the first value that is not in
    `expected_values`; the result is the length of that unbroken leading run.

    Args:
        column (pd.Series): Values in ranked order (e.g. the `file_name` column of a result frame).
        expected_values (list): Values considered correct.

    Returns:
        int: Number of consecutive expected values at the head of the column
             (0 when the first value is unexpected or the column is empty).
    """
    ranked_values = column.tolist()
    for position, value in enumerate(ranked_values):
        if value not in expected_values:
            # The first miss ends the run; its position equals the number of hits before it.
            return position

    return len(ranked_values)

In [141]:
# Each entry is (query, scorer). The scorer receives the whole result frame and returns how many
# of the top-ranked rows come from the expected source file(s), as counted by `ranking_evaluation`.
top_row_evaluation = [
    (
        "Tell me about fashion in Khorvaire.",
        lambda hits: ranking_evaluation(hits['file_name'], ['496317-sample.md']),
    ),
    (
        "Give ma an item from the House Cannith Catalogue.",
        lambda hits: ranking_evaluation(
            hits['file_name'],
            ['881665-eberron_cannith_cat2.md', '881665-cc_fin.md'],
        ),
    ),
]

In [142]:
# Run every top-row query once at a fixed cut-off and collect one score per query.
func = retrieve_baseline
k = 15
scores = [
    cond(func(q, embedding_model, k=k))
    for q, cond in top_row_evaluation
]
# model_metadata[func.__name__][f'MAPQ{k}'] = (np.array(scores) / k).mean()
scores

[2025-01-20T03:11:08Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:11:08Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:11:08Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:11:08Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported configuration. using 1 CPU for compute intensive tasks.
[2025-01-20T03:11:08Z WARN  lance_core::utils::tokio] Number of CPUs is less than or equal to the number of IO core reservations. This is not a supported config

[1, 2]

## Save the Retrieval Evaluation

In [131]:
# Persist the metadata (which now also holds the retrieval scores) next to the artefact.
metadata_pickle_path = os.path.join(ARTEFACT_FOLDER, 'model_metadata.pkl')
with open(metadata_pickle_path, 'wb') as metadata_file:
    pickle.dump(model_metadata, metadata_file)

### Cross-check
### Do Not Remove, Do Not Ignore, Run To Make Sure Things Are There

In [132]:
# At least one of the two metadata files must exist in the artefact folder.
assert any(
    os.path.exists(os.path.join(ARTEFACT_FOLDER, metadata_name))
    for metadata_name in ('model_metadata.pkl', 'model_metadata.json')
)

In [133]:
# Re-load the pickle from disk so the checks below validate the saved file, not the in-memory dict.
# NOTE(review): pickle.load can execute arbitrary code; only load artefacts from a trusted location.
with open(os.path.join(ARTEFACT_FOLDER, 'model_metadata.pkl'), 'rb') as f:
    model_metadata = pickle.load(f)

assert model_metadata['embedding_model']['str'].startswith('SentenceTransformer')
assert 'version' in model_metadata
assert model_metadata['version'] == ARTEFACT_VERSION
assert 'embedding_format' in model_metadata
# The on-disk layout depends on the embedding format, so each format checks only its own files.
# The LanceDB data-folder check used to run unconditionally and failed for pickle artefacts.
if model_metadata['embedding_format'] == 'lancedb':
    assert os.path.exists(os.path.join(ARTEFACT_FOLDER, 'embeddings'))
    assert os.path.exists(os.path.join(ARTEFACT_FOLDER, 'embeddings', 'documents.lance', 'data'))
elif model_metadata['embedding_format'] == 'pickle':
    assert os.path.exists(os.path.join(ARTEFACT_FOLDER, 'embeddings.pkl'))