In [None]:
# einops and datasets are presumably required by the NV-Embed-v2 remote model code loaded later in this notebook — TODO confirm.
%pip install einops datasets

In [None]:
# pynvml is presumably what utils.print_device_info() uses to query NVIDIA GPUs — TODO confirm against utils.py.
%pip install pynvml

Load articles and prune ones without abstracts, since we're using the abstracts for generating the embeddings.

In [None]:
from tinydb import TinyDB, Query

# Open the local TinyDB file and read every record from its 'articles' table.
db = TinyDB('db.json')
table = db.table('articles')

articles = table.all()
print(f'loaded {len(articles)} articles')

# Keep only records whose abstract is not the placeholder text; the abstracts
# are the input to the embedding model, so placeholder entries carry no signal.
articles = [
    article
    for article in articles
    if article['abstract'] != 'No abstract available.'
]
print(f'retaining {len(articles)} articles')

Stage the articles so that they can easily be loaded into the vector database.

In [None]:
# Build two parallel lists in the same article order: the abstract text to
# embed and the article link that serves as its identifier in the vector store.
documents = [article['abstract'] for article in articles]
ids = [article['link'] for article in articles]

To find semantically related documents, we'll use [Chroma](https://www.trychroma.com/), a lightweight vector data store. Chroma supports swappable embedding models, filtering using metadata, keyword search, and multiple distance measurements. We'll use these features for evaluating approaches to organizing papers for downstream processing (search, summarization, keyword extraction, etc.).

In [None]:
import chromadb
from chromadb.utils import embedding_functions

# Directory (relative to the notebook) where Chroma persists its data.
VECTOR_DB_PATH = "vectors_db"
client = chromadb.PersistentClient(path=VECTOR_DB_PATH)

In [None]:
import utils
# Report the available compute device(s) before loading the embedding model.
# print_device_info is defined in the project-local utils module (not shown
# here), so the exact output is not documented in this notebook.
utils.print_device_info()

In [None]:
# Single source of truth for the collection name, so that the delete and the
# create below can never drift apart.
COLLECTION_NAME = "articles2"

# Embedding function backed by NVIDIA's NV-Embed-v2 model.
# NOTE: trust_remote_code=True runs model code fetched from the model
# repository; only use it with sources you trust.
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="nvidia/NV-Embed-v2",
    # device='cuda',
    trust_remote_code=True
)

# Rebuild the collection from scratch so this cell is safe to re-run.
# Delete and create must target the same name: deleting a different name
# leaves the old collection in place, and create_collection then fails
# because the name already exists. get_or_create_collection runs first so
# the delete always has something to remove, even on a brand-new database.
# (A legacy "articles-nv_embed_v2-embeddings" collection, if one exists, is
# no longer touched here; remove it manually if it is not needed.)
client.get_or_create_collection(name=COLLECTION_NAME, embedding_function=ef)
client.delete_collection(name=COLLECTION_NAME)
collection = client.create_collection(
    name=COLLECTION_NAME,
    embedding_function=ef
)

In [None]:
from ipywidgets import IntProgress
from IPython.display import display

# Progress widget sized to the number of abstracts to ingest.
progress_bar = IntProgress(min=0, max=len(documents))
display(progress_bar)

# Add the abstracts one at a time, pairing each with its id, and advance the
# progress bar after every insert.
for doc_id, abstract in zip(ids, documents):
    collection.add(documents=abstract, ids=doc_id)
    progress_bar.value += 1

In [None]:
from transformers import AutoModel
from torch.nn import DataParallel

# Load NV-Embed-v2 directly through transformers, letting device_map='auto'
# decide where the weights are placed.
model = AutoModel.from_pretrained(
    "nvidia/NV-Embed-v2", trust_remote_code=True, device_map='auto'
)

# Wrap each direct submodule in DataParallel instead of wrapping the model as
# a whole (i.e. not `model = DataParallel(model)`), so that `model` itself
# stays the original object — presumably to keep its `encode` method callable.
# Iterate over a snapshot of the keys because entries are replaced in place.
for submodule_name in list(model._modules):
    model._modules[submodule_name] = DataParallel(model._modules[submodule_name])

In [None]:
# Smoke test: embed the first article's abstract.
# NV-Embed-v2's documented usage passes encode() a list of texts, so the
# single abstract is wrapped in a list; a bare string would presumably be
# iterated character by character — verify against the model card.
model.encode([articles[0]['abstract']])