In [None]:
%pip install --upgrade chromadb

In [None]:
%pip install --upgrade sentence_transformers

In [None]:
from tinydb import TinyDB, Query

# Sentinel abstract value; records carrying it are dropped below.
MISSING_ABSTRACT = 'No abstract available.'

# Read every record of the 'articles' table from the TinyDB JSON file.
db = TinyDB('db.json')
table = db.table('articles')
articles = table.all()
print(f'loaded {len(articles)} articles')

# Keep only the records whose abstract is not the sentinel.
articles = list(filter(lambda record: record['abstract'] != MISSING_ABSTRACT, articles))
print(f'retaining {len(articles)} articles')

In [None]:
# Two parallel lists built from `articles`: the abstract texts and the article
# links (position i in both lists refers to the same article).
documents = [entry['abstract'] for entry in articles]
ids = [entry['link'] for entry in articles]

In [None]:
import chromadb
from chromadb.utils import embedding_functions

# Location of the on-disk Chroma store (relative path).
VECTOR_DB_PATH = "vectors_db"

# Persistent client: collections created through it are stored under VECTOR_DB_PATH.
client = chromadb.PersistentClient(path=VECTOR_DB_PATH)

In [None]:
# Embed abstracts with the "allenai-specter" sentence-transformers model.
ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="allenai-specter")

# get_or_create_collection keeps this cell re-runnable: the client is persistent,
# so create_collection raises once "articles" already exists in the store.
collection = client.get_or_create_collection(
    name="articles",
    embedding_function=ef
)

# Write in fixed-size batches so a large corpus stays under Chroma's per-call
# batch limit and an empty corpus simply results in no calls. upsert (rather
# than add) lets a re-run overwrite records whose ids are already stored.
ADD_BATCH_SIZE = 500
for start in range(0, len(documents), ADD_BATCH_SIZE):
    stop = start + ADD_BATCH_SIZE
    collection.upsert(
        documents=documents[start:stop],
        ids=ids[start:stop]
    )

In [None]:
# Semantic search: ask the collection for the five nearest records to a
# free-text question and show the raw result as the cell output.
query_text = "infectious diseases transmitted by mosquitoes affecting children"
results = collection.query(query_texts=[query_text], n_results=5)

results

Embedding with the NVIDIA NV-Embed-v2 model: https://huggingface.co/nvidia/NV-Embed-v2

In [None]:
%pip install einops datasets

In [None]:
%pip install pynvml

In [None]:
# `utils` is a project-local module that is not part of this notebook.
import utils
# NOTE(review): presumably prints the available compute devices (GPU details) before
# the large embedding model is loaded — confirm against utils.print_device_info.
utils.print_device_info()

In [None]:
# Embedding function backed by nvidia/NV-Embed-v2. trust_remote_code=True allows
# the model repository's own Python code to run while the model is loaded.
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="nvidia/NV-Embed-v2",
    # device='cuda',  # optional: uncomment to pin the model to the GPU explicitly
    trust_remote_code=True
)

# Start from an empty "articles2" collection. Delete the old one only when it is
# actually present: an unconditional delete_collection raises on a fresh store.
# list_collections() returns Collection objects or plain names depending on the
# chromadb version, so both shapes are handled.
existing_names = {getattr(entry, "name", entry) for entry in client.list_collections()}
if "articles2" in existing_names:
    client.delete_collection(name="articles2")

collection = client.create_collection(
    name="articles2",
    embedding_function=ef
)

In [None]:
from ipywidgets import IntProgress
from IPython.display import display

# Progress widget with one tick per abstract.
progress_bar = IntProgress(min=0, max=len(documents))
display(progress_bar)

# Add the records one at a time so the bar advances after every document.
for position, text in enumerate(documents):
    collection.add(documents=text, ids=ids[position])
    progress_bar.value = position + 1

In [None]:
from transformers import AutoModel
from torch.nn import DataParallel

# Load NV-Embed-v2 directly through transformers; device_map='auto' leaves the
# device placement to the library.
model = AutoModel.from_pretrained("nvidia/NV-Embed-v2", device_map='auto', trust_remote_code=True)

# Wrap each top-level submodule in DataParallel individually, instead of
# wrapping the model as a whole (i.e. not `model = DataParallel(model)`).
for child_name in list(model._modules):
    model._modules[child_name] = DataParallel(model._modules[child_name])

In [None]:
# Smoke test: embed the first abstract. encode() takes a list of prompts, so the
# single abstract is wrapped in a one-element list; a bare string is not the
# documented input form and would not be embedded as one text.
model.encode([articles[0]['abstract']])

Generate embeddings using Allen AI SPECTER2: https://huggingface.co/allenai/specter2

In [None]:
%pip install adapters

In [None]:
from transformers import AutoTokenizer
from adapters import AutoAdapterModel

# Tokenizer and model are both loaded from the same base checkpoint.
SPECTER2_BASE = 'allenai/specter2_base'

tokenizer = AutoTokenizer.from_pretrained(SPECTER2_BASE)
# NOTE: this rebinds `model`, the same name the NV-Embed-v2 cells use.
model = AutoAdapterModel.from_pretrained(SPECTER2_BASE)

# Load the "allenai/specter2" adapter from the Hugging Face Hub, register it
# under the name "specter2" and make it the active adapter.
model.load_adapter(
    "allenai/specter2",
    source="hf",
    load_as="specter2",
    set_active=True,
)