In [1]:
import torch
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel

if torch.cuda.is_available():
    device = torch.device('cuda')
    print('running on GPU')    
else:
    device = torch.device('cpu')
    print('running on CPU')


running on GPU


In [2]:
query  = ['If I become involved in treatment, what do I need to know?']

corpus = ['Feeling comfortable with the professional you or your child is working with is critical to the success of your treatment. Finding the professional who best fits your needs may require some research.',
          'There are many types of mental health professionals. Finding the right one for you may require some research.',
          'There are many types of mental health professionals. The variety of providers and their services may be confusing. Each have various levels of education, training, and may have different areas of expertise. Finding the professional who best fits your needs may require some research.',
          'When healing from mental illness, early identification and treatment are of vital importance. Based on the nature of the illness, there are a range of effective treatments available. For any type of treatment, it is essential that the person affected is proactive and fully engaged in their own recovery process.\nMany people with mental illnesses who are diagnosed and treated respond well, although some might experience a return of symptoms. Even in such cases, with careful monitoring and management of the disorder, it is still quite possible to live a fulfilled and productive life.']

In [3]:
# iterate over the query list and append the string 'query: ' to the start of each item. Same the list as query_e5
query_e5 = ['query: ' + x for x in query]

# iterate over the corpus list and append the string 'corpus: ' to the start of each item. Same the list as corpus_e5
corpus_e5 = ['passage: ' + x for x in corpus]

In [8]:
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-large-v2')
model = AutoModel.from_pretrained('intfloat/e5-large-v2')

In [4]:
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


In [9]:
batch_query_dict = tokenizer(query_e5, max_length=512, padding=True, truncation=True, return_tensors='pt')
outputs = model(**batch_query_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_query_dict['attention_mask'])
query_embeddings_as_nparray = F.normalize(embeddings, p=2, dim=1).detach().numpy()

In [10]:
query_embeddings_as_nparray.shape

(1, 1024)

In [48]:
batch_corpus_dict = tokenizer(corpus_e5, max_length=512, padding=True, truncation=True, return_tensors='pt')
outputs = model(**batch_corpus_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_corpus_dict['attention_mask'])
#get the embeddings as a numpy array where each row is a vector of dimension 768
corpus_embeddings_as_nparray = F.normalize(embeddings, p=2, dim=1).detach().numpy()


In [47]:
corpus_embeddings_as_nparray.shape

#convert corpus_embeddings_as_nparray from a torch tensor of size (4, 768) to a numpy array of size (4, 768)
corpus_embeddings_as_nparray = corpus_embeddings_as_nparray.detach().numpy()
corpus_embeddings_as_nparray.shape


(4, 768)

In [50]:
from sklearn.metrics.pairwise import cosine_similarity

In [51]:
cosine_similarity(query_embeddings_as_nparray, corpus_embeddings_as_nparray)

array([[0.8379889 , 0.77799237, 0.7885269 , 0.8345152 ]], dtype=float32)