In [1]:
import transformers

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Mean pooling weighted by the attention mask, so masked positions do not dilute the average
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Model output whose first element holds the per-token
            embeddings.
        attention_mask: Mask with one entry per token; it is expanded over the
            embedding dimension and used as the averaging weight.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask sum, where
        the divisor is clamped to at least 1e-9 to avoid division by zero.
    """
    per_token = model_output[0]
    # Expand the mask to the embedding shape so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(per_token.size()).float()
    weighted_total = (per_token * weights).sum(1)
    weight_total = weights.sum(1).clamp(min=1e-9)
    return weighted_total / weight_total


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load the tokenizer and the model from the HuggingFace Hub (same checkpoint name for both)
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# Tokenize both sentences as one padded/truncated batch of PyTorch tensors
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings; no_grad because this is inference only
with torch.no_grad():
    model_output = model(**encoded_input)

# Debug dump of the raw token embeddings and the per-layer hidden states.
# NOTE(review): the model above is loaded without output_hidden_states=True, so
# `hidden_states` is presumably None here — confirm, and consider printing shapes
# instead of the full tensor.
print(model_output[0], model_output.hidden_states)
# Pool the token embeddings into one vector per sentence, weighted by the attention mask
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# L2-normalize each sentence vector (p=2 along dim=1)
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)

tensor([[[ 9.6638e-02, -1.7067e-01,  7.6038e-03,  ...,  2.3386e-02,
           1.0138e-01, -4.2516e-02],
         [ 6.8078e-02, -1.5394e-01, -1.2517e-01,  ..., -1.1160e-02,
           1.1047e-02, -1.3179e-03],
         [ 1.5145e-02, -3.7817e-01, -1.0184e-01,  ..., -8.7419e-02,
           1.0200e-01,  7.7285e-02],
         ...,
         [ 1.0243e-01, -3.4813e-01, -8.4518e-02,  ..., -1.1049e-01,
           8.5594e-02, -4.6411e-02],
         [-2.5399e-02,  7.7970e-03, -4.4150e-02,  ...,  9.5833e-02,
          -6.1752e-02, -6.1012e-03],
         [ 1.1801e-01, -1.2156e-01,  2.4940e-02,  ..., -9.5819e-03,
           1.4038e-01, -3.2524e-02]],

        [[ 1.2760e-01,  2.1539e-02, -4.1318e-02,  ..., -1.0661e-01,
          -1.9259e-01, -5.4022e-03],
         [ 1.4989e-01, -7.3531e-03, -9.6599e-02,  ..., -8.0247e-02,
          -3.2525e-01, -1.0313e-04],
         [ 2.1020e-01,  7.2041e-02, -3.7893e-02,  ...,  3.0247e-02,
          -3.0968e-01,  1.3309e-02],
         ...,
         [ 9.5321e-02,  2

In [3]:
# Word index of each token in the encoding produced above. In the recorded output
# the first and last entries are None (presumably the special tokens — confirm).
encoded_input.word_ids()

[None, 0, 1, 2, 3, 4, None]

In [4]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel


def get_word_idx(sent: str, word: str):
    """Return the position of the first occurrence of `word` in `sent`.

    The sentence is split on single space characters, so punctuation stays
    attached to its neighbouring word and consecutive spaces produce empty
    entries. Raises ValueError when `word` is not one of the resulting pieces.
    """
    pieces = sent.split(" ")
    position = pieces.index(word)
    return position


def get_hidden_states(encoded, token_ids_word, model, layers):
    """Return one vector for a word by averaging its subword-token states.

    The encoded input is pushed through `model`, the hidden states of the
    requested `layers` are stacked and summed, and the rows selected by
    `token_ids_word` are averaged.

    Args:
        encoded: Keyword arguments for the model call (unpacked with `**`).
        token_ids_word: Index (e.g. the tuple returned by `np.where`) selecting
            the token positions that belong to the word of interest.
        model: Callable whose result exposes a `hidden_states` sequence; it has
            to be configured to return hidden states.
        layers: Indices into `hidden_states` naming the layers to sum. There is
            no default here; the caller chooses the layers.

    Returns:
        The mean over the selected token rows of the summed layer outputs.
    """
    with torch.no_grad():
        output = model(**encoded)

    # One entry per layer
    states = output.hidden_states
    # Stack the requested layers and add them element-wise
    summed = torch.stack([states[i] for i in layers]).sum(0)
    # Remove only the leading (batch) dimension. A bare squeeze() would also
    # collapse a length-1 token axis, after which the indexing below would pick
    # scalars instead of token rows.
    summed = summed.squeeze(0)
    # Keep only the tokens that make up the requested word
    word_tokens_output = summed[token_ids_word]

    return word_tokens_output.mean(dim=0)


def get_word_vector(sent, idx, tokenizer, model, layers):
    """Return the embedding of the `idx`-th space-separated word of `sent`.

    Args:
        sent: The sentence as a single string.
        idx: Position of the word of interest after splitting `sent` on single
            spaces (the convention used by `get_word_idx`).
        tokenizer: Tokenizer whose encoding exposes `word_ids()`.
        model: Model handed on to `get_hidden_states`.
        layers: Hidden-state layer indices handed on to `get_hidden_states`.

    Returns:
        Whatever `get_hidden_states` returns for the tokens of that word.

    Raises:
        ValueError: If no token maps to word `idx`.
    """
    # Tokenize the pre-split words so that word_ids() counts words the same way
    # `idx` does. Tokenizing the raw string lets the tokenizer apply its own
    # word splitting (punctuation is typically split off), which shifts every
    # index after the first punctuation mark and silently selects the wrong token.
    # NOTE(review): some byte-level BPE tokenizers reportedly need
    # add_prefix_space=True for pre-split input — confirm before swapping models.
    words = sent.split(" ")
    encoded = tokenizer(words, is_split_into_words=True, return_tensors="pt")
    # Positions of all tokens that belong to the word of interest
    # (special tokens carry None and therefore never match).
    token_ids_word = np.where(np.array(encoded.word_ids(), dtype=object) == idx)
    if token_ids_word[0].size == 0:
        # Averaging zero rows would return NaN without any warning.
        raise ValueError(f"no tokens found for word index {idx!r}")

    return get_hidden_states(encoded, token_ids_word, model, layers)


def main(sent="I like cookies .", word="cookies", layers=None,
         model_name="sentence-transformers/all-mpnet-base-v2"):
    """Load a tokenizer and model, then return the embedding of `word` in `sent`.

    Args:
        sent: Sentence containing the word; words are separated by single spaces.
        word: The word to embed; must be one of the space-separated pieces of `sent`.
        layers: Hidden-state layer indices to sum; the last four when None.
        model_name: Checkpoint name passed to both `AutoTokenizer` and
            `AutoModel`. Defaults to the checkpoint that used to be hard-coded.

    Returns:
        The word embedding produced by `get_word_vector`.
    """
    # Use last four layers by default
    layers = [-4, -3, -2, -1] if layers is None else layers
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Hidden states of every layer are needed downstream, so request them at load time.
    model = AutoModel.from_pretrained(model_name, output_hidden_states=True)

    idx = get_word_idx(sent, word)

    return get_word_vector(sent, idx, tokenizer, model, layers)


# Run the demo with its default arguments when this code is the top-level module.
# NOTE(review): inside a notebook kernel __name__ is presumably '__main__', so this
# also runs (and discards its result) every time the cell executes — confirm.
if __name__ == '__main__':
    main()

In [5]:
# Contextual embedding of "bank" in a sentence about a river bank.
bank1 = main("She sat on the river bank across from a series of wide, large steps leading up a hill to the park where the Arch stood, framed against a black sky.", "bank")

In [6]:
# Contextual embedding of "river", used below as the comparison vector for `bank1`.
river = main("He swam across the river Thames.", "river")

In [7]:
def cosine_similarity(a, b):
    """Pairwise cosine similarity between the rows of `a` and the rows of `b`.

    Both inputs must have the same number of dimensions. One-dimensional
    inputs are treated as a single row each, so the result is always a matrix
    with one row per row of `a` and one column per row of `b`.
    """
    assert a.ndim == b.ndim
    if a.ndim == 1:
        # Promote plain vectors to one-row matrices.
        a, b = a.unsqueeze(0), b.unsqueeze(0)
    # Scale every row to unit length.
    unit_a = a / a.norm(dim=1)[:, None]
    unit_b = b / b.norm(dim=1)[:, None]
    return unit_a @ unit_b.transpose(0, 1)

In [8]:
# Cosine similarity by hand: dot product divided by both vector norms
# (`@` and `/` share precedence and evaluate left to right).
bank1@river/bank1.norm()/river.norm()

tensor(0.3417)

In [9]:
# Same comparison through the helper; the recorded output matches the manual value above.
cosine_similarity(bank1, river)

tensor([[0.3417]])

In [10]:
# Size of a single word vector.
river.shape

torch.Size([768])

In [14]:
# NOTE(review): the execution count jumps from 10 to 14 here, and this cell rebinds
# `sentences` and `model`, which an earlier cell defined with a different meaning.
from sentence_transformers import SentenceTransformer
# Single-word corpus to compare query words against
sentences = ["fall", "iron", "witch", "ham", "note", "cat", "beijing", "bear", "ambulance"]

# Encode the whole corpus once with the sentence-transformers checkpoint
model = SentenceTransformer('all-MiniLM-L6-v2')
corpus_embeddings = model.encode(sentences)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
def test_similarity(word, corpus_embeddings):
    query_embeddings = torch.tensor(model.encode(word))
    query_embeddings /= query_embeddings.norm()
    corpus_embeddings = torch.tensor(corpus_embeddings)
    corpus_embeddings = corpus_embeddings/corpus_embeddings.norm(dim=1)[:, None]
    return corpus_embeddings @ query_embeddings

In [16]:
# Similarity of the query word "wok" to each corpus word, in corpus order.
test_similarity("wok", corpus_embeddings)

tensor([0.1688, 0.3186, 0.3222, 0.3003, 0.3719, 0.2479, 0.2199, 0.2653, 0.2037])

In [17]:
from sentence_transformers import SentenceTransformer, util
# NOTE(review): rebinds the notebook-level `model`; anything that reads `model`
# afterwards (e.g. test_similarity) will use this checkpoint instead.
model = SentenceTransformer('nq-distilbert-base-v1')

# The query is encoded as a plain string
query_embedding = model.encode('How many people live in London?')

# The passages are encoded as [ [title1, text1], [title2, text2], ...]
passage_embedding = model.encode([['London', 'London has 9,787,426 inhabitants at the 2011 census.']])

# Cosine similarity between the query and the passage
print("Similarity:", util.pytorch_cos_sim(query_embedding, passage_embedding))

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/540 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/554 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Similarity: tensor([[0.6503]])
