In [None]:
# Install the `adapters` package; a later cell imports AutoAdapterModel from it.
%pip install adapters

Load articles and prune ones without abstracts, since we're using the abstracts for generating the embeddings.

In [None]:
from tinydb import TinyDB, Query

# Open the local TinyDB file and read every record from the "articles" table.
db = TinyDB("db.json")
table = db.table("articles")

# Keep the unfiltered list under its own name so the raw load stays inspectable.
articles_all = table.all()
print(f"loaded {len(articles_all)} articles")

# Placeholder text stored for articles that have no abstract.
NO_ABSTRACT_PLACEHOLDER = "No abstract available."

# Retain only articles with a usable abstract. Besides the placeholder, this also
# drops records whose "abstract" field is missing, not a string, or blank, so a
# malformed record cannot raise KeyError here or yield an empty document later.
articles = [
    article
    for article in articles_all
    if isinstance(article.get("abstract"), str)
    and article["abstract"].strip()
    and article["abstract"].strip() != NO_ABSTRACT_PLACEHOLDER
]
print(f"retaining {len(articles)} articles")

Stage the articles so that they can easily be loaded into the vector database.

In [None]:
# Parallel lists: documents[i] is the abstract text and ids[i] is the link of
# the same article, in the order the articles appear in `articles`.
documents = [entry["abstract"] for entry in articles]
ids = [entry["link"] for entry in articles]

For finding semantically related documents, we'll use Chroma (https://www.trychroma.com/), which is a lightweight vector data store. Chroma supports swappable embedding models, filtering using metadata, keyword search, and multiple distance metrics. We'll use these features for evaluating approaches to organizing papers for downstream processing (search, summarization, keyword extraction, etc.).

In [None]:
import chromadb
from chromadb.utils import embedding_functions

# Chroma client backed by on-disk storage at the relative path below.
vector_store_path = "vectors_db"
client = chromadb.PersistentClient(path=vector_store_path)

In [None]:
# TODO Use SPECTER2 model for automatically generating embeddings in Chroma

Generate embeddings using Allen AI SPECTER2 (https://huggingface.co/allenai/specter2). SPECTER2 uses the Hugging Face Adapter library for managing model extensions for specific tasks.

In [None]:
import torch
from transformers import AutoTokenizer
from adapters import AutoAdapterModel

# The tokenizer and the adapter-capable model are both loaded from the same
# base checkpoint.
base_checkpoint = "allenai/specter2_base"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoAdapterModel.from_pretrained(base_checkpoint)

# Load the "allenai/specter2" adapter, register it under the name "specter2",
# and make it the active adapter on the model.
model.load_adapter(
    "allenai/specter2",
    source="hf",
    load_as="specter2",
    set_active=True,
)

In [None]:
# Two small example papers used to exercise the embedding pipeline end to end.
papers = [
    {
        "title": "BERT",
        "abstract": "We introduce a new language representation model called BERT",
    },
    {
        "title": "Attention is all you need",
        "abstract": " The dominant sequence transduction models are based on complex recurrent or convolutional neural networks",
    },
]

# Concatenate title and abstract, joined by the tokenizer's separator token.
# A missing or empty abstract falls back to the empty string.
text_batch = [d["title"] + tokenizer.sep_token + (d.get("abstract") or "") for d in papers]

# Tokenize the batch: pad to a common length and truncate to at most 512 tokens.
inputs = tokenizer(
    text_batch,
    padding=True,
    truncation=True,
    return_tensors="pt",
    return_token_type_ids=False,
    max_length=512,
)

# Inference only: run the forward pass without gradient tracking so no autograd
# graph is built and the resulting embeddings do not require grad.
with torch.no_grad():
    output = model(**inputs)

# Use the hidden state of the first token of each sequence as its embedding.
embeddings = output.last_hidden_state[:, 0, :]

embeddings

In [None]:
def embed_input(tokenizer, text_batch):
    """Embed a batch of texts using the module-level ``model``.

    The forward pass runs under ``torch.no_grad()`` because this helper is used
    for inference only; the returned tensor therefore carries no autograd graph.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text_batch, ...)`` with
            padding, truncation (``max_length=512``) and ``return_tensors="pt"``;
            its result is unpacked as keyword arguments into ``model``.
        text_batch: The texts to embed, passed to ``tokenizer`` unchanged.

    Returns:
        ``output.last_hidden_state[:, 0, :]``: the hidden state of the first
        token of each sequence, one row per input text.

    Note:
        ``model`` is read from the enclosing module scope, not passed in, so the
        result depends on whichever adapter is active on it at call time.
    """
    # Preprocess the input into padded/truncated tensors.
    inputs = tokenizer(
        text_batch,
        padding=True,
        truncation=True,
        return_tensors="pt",
        return_token_type_ids=False,
        max_length=512,
    )
    # Disable gradient tracking: embeddings are only consumed, never trained on.
    with torch.no_grad():
        output = model(**inputs)
    # Take the first token of each sequence as that sequence's embedding.
    embeddings = output.last_hidden_state[:, 0, :]
    return embeddings


# Load the ad-hoc query adapter and make it the active adapter on `model`.
# NOTE(review): because set_active=True, later calls through `model` presumably
# use this adapter instead of the one loaded earlier; confirm before embedding
# documents after this cell.
adhoc_adapter_options = {
    "source": "hf",
    "load_as": "specter2_adhoc_query",
    "set_active": True,
}
model.load_adapter("allenai/specter2_adhoc_query", **adhoc_adapter_options)

# Embed a single short search query with the same helper used for text batches.
query = ["Bidirectional transformers"]
query_embedding = embed_input(tokenizer, query)