In [55]:
import os
import re
import json
import pickle

import numpy as np

import torch
from sentence_transformers import SentenceTransformer
# from langchain_mistralai import ChatMistralAI

from tqdm.notebook import tqdm


In [2]:
DOCUMENT_VERSION = 3
FOLDER = f'/corpus/texts_v{DOCUMENT_VERSION}'



In [12]:
model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
model = model.to("cuda")




In [18]:
# Are we using cuda? Needs to be True
all([param.is_cuda for param in model.parameters()])

True

In [70]:
with open('/corpus/metadata.json') as f:
    metadata = json.load(f)
metadata_dict = {}
for data in metadata:
    metadata_dict[data['filename'][:-4]] = data

In [5]:
include_previous = 1
include_next = 1
drop_if_less_than_n_chars = 100
chunks = []
chunk_metadata = []
chunk_idx = 0
for file_name in os.listdir(FOLDER):
    file_path = os.path.join(FOLDER, file_name)
    with open(file_path) as f:
        document = f.read()
    page_splits = re.split(r'\n## .*\n', document)
    for idx, split in enumerate(page_splits):
        if len(split) < drop_if_less_than_n_chars:
            page_splits[idx] = ''
    page_index = {}
    for idx, split in enumerate(page_splits):

        if idx < include_previous or idx > len(page_splits) - include_next - 1:
            continue
        chunk_metadata.append(metadata_dict.get(file_name[:-3], {'filename': file_name[:-3] + '.pdf'}))
        chunk_metadata[chunk_idx]['pages'] = tuple(range(idx-include_previous, idx+include_next))
        chunks.append('   '.join(page_splits[idx-include_previous:idx+include_next]))
        chunk_idx += 1



In [13]:
len(chunks), len(chunk_metadata), chunk_metadata[:2]

(7639,
 7639,
 [{'filename': '881665-eberron_cannith_cat2.pdf',
   'edition': '5e',
   'pdf/title': '',
   'pdf/author': '',
   'pages': (2, 3)},
  {'filename': '881665-eberron_cannith_cat2.pdf',
   'edition': '5e',
   'pdf/title': '',
   'pdf/author': '',
   'pages': (2, 3)}])

In [22]:
embeddings = []
for chunk in tqdm(chunks):
    embedding = model.encode(chunk, normalize_embeddings=True)
    embeddings.append(embedding)

  0%|          | 0/7639 [00:00<?, ?it/s]

In [59]:
with open('embeddings_v01.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

# Unserialize the Embeddings

In [60]:
with open('embeddings_v01.pkl', 'rb') as f:
    embeddings = pickle.load(f)


In [73]:
query = "Who is Dash Donnigan?"
query = "Who is Commander Iyanna?"
query = "Tell me about Menthis Plateau."
query = "Tell me about Eldeen Reaches."
query = "Tell me about the rivers of Khorvaire."
query = "Tell me about Xen'drik."
query = "Tell me about fashion in Khorvaire."
query = "Create a House Cannith item."
query = "Tell me about the lnaguages of Eberron."
query_embed = model.encode(query, normalize_embeddings=True)

In [74]:
k = 7
similarities = torch.from_numpy(np.dot(embeddings, query_embed.T))
similarities.topk(k).indices.tolist()

[2720, 5585, 2718, 3732, 4294, 4203, 3749]

In [75]:
for i in similarities.topk(k).indices.tolist():
    print(chunk_metadata[i])

{'filename': 'Eberron_ Rising From the Last War - Jeremy Crawford & James Wyatt & Keith Baker.pdf', 'edition': '5e', 'pdf/title': 'Eberron: Rising From the Last War', 'pdf/author': 'Jeremy Crawford & James Wyatt & Keith Baker', 'pages': (322, 323)}
{'filename': '1920353-Eberronicon_1.2.pdf', 'edition': '5e', 'pdf/title': 'Eberronicon: A Pocket Guide to the World', 'pdf/author': 'Across Eberron', 'pages': (53, 54)}
{'filename': 'Eberron_ Rising From the Last War - Jeremy Crawford & James Wyatt & Keith Baker.pdf', 'edition': '5e', 'pdf/title': 'Eberron: Rising From the Last War', 'pdf/author': 'Jeremy Crawford & James Wyatt & Keith Baker', 'pages': (322, 323)}
{'filename': "Wayfinder's Guide to Eberron - Keith Baker.pdf", 'edition': '5e', 'pdf/title': '', 'pdf/author': '', 'pages': (174, 175)}
{'filename': '297249-sample.pdf', 'pages': (2, 3)}
{'filename': '1598836-Languages_of_Eberron_2E.pdf', 'edition': '5e', 'pdf/title': 'Languages of Eberron 2E mk iii', 'pdf/author': '', 'pages': (38

In [69]:
for i in similarities.topk(k).indices.tolist():
    print(chunks[i])


one-half the total cost af houte expeditions to Xen'drik
or Cyre, on lung as such expeditions leave from Sharn and
include #1 least one Cannith South member.

Gear: Members of House Cannith are always well
cuthtied—anyi hing less would be an affront 10 the house's
pride. Those on Cannith business are fully outfitted with
mundane equipment. and those on personal business pay
only 25% of the mandard con for mundane gear.

Information: Members can have magic items identi-
fied for 50% of the normal cost and get sccess to maps and
reports from Cannith explorers in Cyre and Xen'drik,

Access Only members of the house and their assoc i-
ares have access to the house's forges and workshops.

Favored in House Benefits

Though all members of House Cannith have a level of
privilege, the split in the house has led to ill will between
competing factions, Requests from members of Can-
nith Eas maghi be ignared by Cannith South unless the
southern faction leely generous or will profit in some
way. 

In [40]:
len(docs_embed), docs_embed.size
# First number is the number of documents, second is I think the number of tokens.


(3, 2304)