In [73]:
import os
import re
import json
import pickle
from tempfile import mkdtemp
import shutil

import numpy as np

import torch
from sentence_transformers import SentenceTransformer
from langchain_mistralai import ChatMistralAI

from tqdm.notebook import tqdm


In [13]:
ARTEFACT_VERSION = '01'

In [31]:
DOCUMENT_VERSION = '03'
CORPUS_FOLDER = f'/jupyterlab/corpus/eberron/v{DOCUMENT_VERSION}'
ARTEFACT_ROOT_FOLDER = os.environ.get('ARTEFACT_ROOT_FOLDER', '/artefact')
ARTEFACT_FOLDER = os.path.join(ARTEFACT_ROOT_FOLDER, 'eberron', f'v{ARTEFACT_VERSION}')



In [6]:
os.makedirs(ARTEFACT_FOLDER, exist_ok=True)

In [56]:
model_name = 'Alibaba-NLP/gte-base-en-v1.5'
model_revision = 'a829fd0e060bb84554da0dfd354d0de0f7712b7f'
model = SentenceTransformer(model_name, trust_remote_code=True, revision=model_revision)
model = model.to("cuda")




In [8]:
# Are we using cuda? Needs to be True
all([param.is_cuda for param in model.parameters()])

True

In [32]:
with open(os.path.join(CORPUS_FOLDER, 'metadata.json')) as f:
    metadata = json.load(f)
metadata_dict = {}
for data in metadata:
    metadata_dict[data['filename'][:-4]] = data

In [120]:
include_previous = 1
include_next = 1
drop_if_less_than_n_chars = 100
chunks = []
chunk_metadata = []
chunk_idx = 0
for file_name in os.listdir(CORPUS_FOLDER):
    file_path = os.path.join(CORPUS_FOLDER, file_name)
    with open(file_path) as f:
        document = f.read()
    page_splits = re.split(r'\n## .*\n', document)
    for idx, split in enumerate(page_splits):
        if len(split) < drop_if_less_than_n_chars:
            page_splits[idx] = ''
    page_index = {}
    for idx, split in enumerate(page_splits):

        if idx < include_previous or idx > len(page_splits) - include_next - 1:
            continue
        chunk_metadata.append(metadata_dict.get(file_name[:-3], {'filename': file_name[:-3] + '.pdf'}).copy())
        chunk_metadata[chunk_idx]['pages'] = list(range(idx-include_previous, idx+include_next+1))
        chunks.append('   '.join(page_splits[idx-include_previous:idx+include_next+1]))
        chunk_idx += 1



In [121]:
idx = 5
list(range(idx-include_previous, idx+include_next+1))

[4, 5, 6]

In [122]:
len(chunks), len(chunk_metadata), chunk_metadata[50:55]

(7639,
 7639,
 [{'filename': '248087-sample.pdf',
   'edition': '5e',
   'pdf/title': 'Korranberg Chronicle: Threat Dispatch',
   'pdf/author': 'Anthony J. Turco',
   'pages': [44, 45, 46]},
  {'filename': '248087-sample.pdf',
   'edition': '5e',
   'pdf/title': 'Korranberg Chronicle: Threat Dispatch',
   'pdf/author': 'Anthony J. Turco',
   'pages': [45, 46, 47]},
  {'filename': '248087-sample.pdf',
   'edition': '5e',
   'pdf/title': 'Korranberg Chronicle: Threat Dispatch',
   'pdf/author': 'Anthony J. Turco',
   'pages': [46, 47, 48]},
  {'filename': '248087-sample.pdf',
   'edition': '5e',
   'pdf/title': 'Korranberg Chronicle: Threat Dispatch',
   'pdf/author': 'Anthony J. Turco',
   'pages': [47, 48, 49]},
  {'filename': '248087-sample.pdf',
   'edition': '5e',
   'pdf/title': 'Korranberg Chronicle: Threat Dispatch',
   'pdf/author': 'Anthony J. Turco',
   'pages': [48, 49, 50]}])

In [123]:
embeddings = []
for chunk in tqdm(chunks):
    embedding = model.encode(chunk, normalize_embeddings=True)
    embeddings.append(embedding)

  0%|          | 0/7639 [00:00<?, ?it/s]

# Save the Artefact

In [130]:
# TODO: Turn into class
# TODO: Add lancedb
# TODO: Understand cross-encoder
tmp_artefact_folder = mkdtemp()


In [131]:
model_metadata = {
    'version': ARTEFACT_VERSION,
    'document_version': DOCUMENT_VERSION,
    'chunk_count': len(chunks),
    'embedding_format': 'pickle',
    'embedding_model': {
        'name': model_name, 
        'str': str(model).replace('\n', ''), 
        'revision': model_revision,
    }
}
with open(os.path.join(tmp_artefact_folder, 'model_metadata.json'), 'w') as f:
	json.dump(model_metadata, f)

In [132]:
with open(os.path.join(tmp_artefact_folder, 'chunk_metadata.json'), 'w') as f:
	json.dump(chunk_metadata, f)

In [133]:
with open(os.path.join(tmp_artefact_folder, 'embeddings.pkl'), 'wb') as f:
	pickle.dump(embeddings, f)

In [134]:

os.makedirs(os.path.join(tmp_artefact_folder, 'chunks'), exist_ok=True)
for i, chunk in enumerate(tqdm(chunks)):
    filename_width = len(str(len(chunks)))
    file_path = os.path.join(tmp_artefact_folder, 'chunks', str(i).zfill(filename_width) + '.md')
    with open(file_path, 'w') as f:
        f.write(chunk)

  0%|          | 0/7639 [00:00<?, ?it/s]

In [135]:
shutil.rmtree(ARTEFACT_FOLDER)
shutil.move(tmp_artefact_folder, ARTEFACT_FOLDER)

'/jupyterlab/artefacts/eberron/v01'

# Unserialize the Embeddings

In [136]:
with open(os.path.join(ARTEFACT_FOLDER, 'embeddings.pkl'), 'rb') as f:
    embeddings = pickle.load(f)


# Evaluate

In [137]:
# TODO: Experiment with BM25 and compare

In [153]:
# query = "Who is Dash Dannigan?"
# query = "Who is Commander Iyanna?"
# query = "Tell me about Menthis Plateau."
# query = "Tell me about Eldeen Reaches."
# query = "Tell me about the rivers of Khorvaire."
# query = "Tell me about Xen'drik."
# query = "Tell me about fashion in Khorvaire."
# query = "Create a House Cannith item."
# query = "Tell me about the lnaguages of Eberron."
# query = "What's the weakest of the quori" - TSOREVA
# query = "What are dolgrims?"
query_embed = model.encode(query, normalize_embeddings=True)

In [154]:
k = 7
similarities = torch.from_numpy(np.dot(embeddings, query_embed.T))
similarities.topk(k).indices.tolist()

[4113, 4115, 4147, 4114, 1077, 4141, 574]

In [155]:
for i in similarities.topk(k).indices.tolist():
    print(chunk_metadata[i])

{'filename': '496317-sample.pdf', 'edition': '5e', 'pdf/title': '', 'pdf/author': '', 'pages': [0, 1, 2]}
{'filename': '496317-sample.pdf', 'edition': '5e', 'pdf/title': '', 'pdf/author': '', 'pages': [2, 3, 4]}
{'filename': '496317-sample.pdf', 'edition': '5e', 'pdf/title': '', 'pdf/author': '', 'pages': [34, 35, 36]}
{'filename': '496317-sample.pdf', 'edition': '5e', 'pdf/title': '', 'pdf/author': '', 'pages': [1, 2, 3]}
{'filename': '2255601-Exploring_Eberron_1.05.pdf', 'edition': '5e', 'pdf/title': 'Exploring Eberron', 'pdf/author': 'Keith Baker', 'pages': [16, 17, 18]}
{'filename': '496317-sample.pdf', 'edition': '5e', 'pdf/title': '', 'pdf/author': '', 'pages': [28, 29, 30]}
{'filename': 'Five Nations.pdf', 'edition': '3e', 'pdf/title': 'Five Nations', 'pdf/author': '', 'pages': [80, 81, 82]}


In [156]:
# for i in similarities.topk(k).indices.tolist():
#     print(chunks[i])

In [157]:
# TODO: Add a model here to actually respond.