In [1]:
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Define the function that computes document similarity from LLM embeddings.

In [2]:
def _embed_document(doc, model, tokenizer, max_length):
    """Return one mean-pooled embedding for ``doc`` as a NumPy array.

    The document is tokenized (truncated to ``max_length`` tokens), passed
    through ``model`` without gradient tracking, and the model's
    ``last_hidden_state`` is averaged over ``dim=1``.
    """
    encoding = tokenizer(doc, return_tensors='pt', truncation=True, max_length=max_length)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(**encoding)
        # Each document is encoded on its own (no padding is requested), so a
        # plain mean over dim=1 does not average in any pad positions.
        pooled = outputs.last_hidden_state.mean(dim=1)

    # Hand scikit-learn a NumPy array explicitly rather than relying on the
    # implicit tensor -> array conversion.
    return pooled.cpu().numpy()


def compare_documents(doc1, doc2, model=None, tokenizer=None, max_length=200):
    """Compute the cosine similarity between the embeddings of two documents.

    Args:
        doc1: First document (text accepted by the tokenizer).
        doc2: Second document (text accepted by the tokenizer).
        model: Embedding model to use. Defaults to the notebook-level
            ``embedding_model`` (looked up at call time) so existing
            two-argument calls keep working.
        tokenizer: Tokenizer matching ``model``. Defaults to the
            notebook-level ``embedding_tokenizer`` (looked up at call time).
        max_length: Maximum number of tokens kept per document; longer
            documents are truncated. Defaults to 200.

    Returns:
        The cosine similarity of the two mean-pooled embeddings, taken from
        position ``[0][0]`` of scikit-learn's similarity matrix.
    """
    # Fall back to the notebook globals only when the caller did not pass an
    # explicit model/tokenizer; passing them avoids depending on which
    # "Model N" cell happened to run last.
    if model is None:
        model = embedding_model
    if tokenizer is None:
        tokenizer = embedding_tokenizer

    embeddings1 = _embed_document(doc1, model, tokenizer, max_length)
    embeddings2 = _embed_document(doc2, model, tokenizer, max_length)

    # cosine_similarity returns a matrix; with one row per input the single
    # score sits at [0][0].
    similarity_score = cosine_similarity(embeddings1, embeddings2)[0][0]

    return similarity_score


# Sample documents

In [3]:
# Two sample sentences used as input for every model comparison below.
doc1, doc2 = (
    "The quick brown fox jumps over the lazy dog.",
    "A lazy dog is jumped over by a quick brown fox.",
)

# Model 1 - distilbert-base-uncased

In [4]:
# Load the model and its matching tokenizer from the same checkpoint name.
embedding_model_name = "distilbert-base-uncased"
embedding_model = AutoModel.from_pretrained(embedding_model_name)
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)

# BERT-style tokenizers already define a pad token and have no EOS token, so
# an unconditional `pad_token = eos_token` would replace the existing pad
# token with None. Only fall back to EOS when no pad token is defined.
if embedding_tokenizer.pad_token is None:
    embedding_tokenizer.pad_token = embedding_tokenizer.eos_token

# compare_documents reads embedding_model / embedding_tokenizer set above.
similarity_score = compare_documents(doc1, doc2)
print("Similarity Score:", similarity_score)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Similarity Score: 0.9402323


# Model 2 - Writer/palmyra-small

In [5]:
# Load the model and its matching tokenizer from the same checkpoint name.
embedding_model_name = "Writer/palmyra-small"
embedding_model = AutoModel.from_pretrained(embedding_model_name)
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)

# Reuse EOS as the pad token only when the tokenizer does not define one, so
# a pad token shipped with the checkpoint is never overwritten.
if embedding_tokenizer.pad_token is None:
    embedding_tokenizer.pad_token = embedding_tokenizer.eos_token

# compare_documents reads embedding_model / embedding_tokenizer set above.
similarity_score = compare_documents(doc1, doc2)
print("Similarity Score:", similarity_score)

Similarity Score: 0.88224614


# Model 3 - gpt2

In [6]:
# Load the model and its matching tokenizer from the same checkpoint name.
embedding_model_name = "gpt2"
embedding_model = AutoModel.from_pretrained(embedding_model_name)
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)

# GPT-2's tokenizer has no pad token by default, so EOS is reused for padding.
# The guard keeps this assignment from clobbering a pad token if one exists.
if embedding_tokenizer.pad_token is None:
    embedding_tokenizer.pad_token = embedding_tokenizer.eos_token

# compare_documents reads embedding_model / embedding_tokenizer set above.
similarity_score = compare_documents(doc1, doc2)
print("Similarity Score:", similarity_score)

Similarity Score: 0.99865556
