In [1]:

import torch
from transformers import AutoTokenizer, AutoModel

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]


In [2]:
# Single source of truth for the checkpoint name: the tokenizer and the model
# must be loaded from the same checkpoint so the vocabulary matches the weights.
MODEL_NAME = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Demo inputs. The first sentence acts as the query in the final similarity
# cell; the remaining four are the candidates it is compared against.
sentences = [
    "chocolates are my favourite items.",
    (
        "The fish dreamed of escaping the fishbowl and into the toilet "
        "where he saw his friend go."
    ),
    "The person box was packed with jelly many dozens of months later.",
    "white chocolates and dark chocolates are favourites for many people.",
    "I love chocolates",
]

In [4]:
# Tokenize the whole batch by calling the tokenizer directly (the documented
# replacement for the deprecated `batch_encode_plus`); arguments are unchanged.
# NOTE(review): padding="max_length" pads every sentence to the model maximum
# (512 tokens per the embedding shape shown below). padding=True would pad only
# to the longest sentence in the batch and make the forward pass much cheaper.
encoding = tokenizer(
    sentences,
    add_special_tokens=True,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt",
)


In [5]:
# Attention mask requested from the tokenizer via return_attention_mask=True.
attention_mask = encoding["attention_mask"]


In [6]:
# outputs[0] holds the last hidden state and outputs[1] holds the pooler output.
# This is inference only, so run the forward pass without autograd: no graph is
# built and no activations are kept for a backward pass that never happens.
with torch.no_grad():
    outputs = model(**encoding)


In [7]:
# We want the per-token embeddings, i.e. the last hidden state
# (the first entry of the model output).
embeddings = outputs.last_hidden_state
embeddings.shape

torch.Size([5, 512, 768])

In [8]:
# Get the attention mask from the encoding; the following cells use it to
# weight the token embeddings before pooling.
attention_mask = encoding["attention_mask"]


In [9]:
# Append a trailing axis to the attention mask, broadcast it to the shape of
# `embeddings`, and cast to float so the two can be multiplied element-wise.
mask = attention_mask[..., None].expand_as(embeddings).float()
mask.shape

torch.Size([5, 512, 768])

In [10]:
# Inspect the expanded mask. In the recorded output each sentence shows rows of
# ones followed by rows of zeros (presumably real tokens, then padding).
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 

In [11]:
# Element-wise product with the mask: positions where the mask is 0 are zeroed
# out and therefore contribute nothing to the sum taken next.
masked_embeddings = torch.mul(embeddings, mask)
masked_embeddings.shape

torch.Size([5, 512, 768])

In [12]:
# Collapse dim 1 by summation, leaving one 768-wide vector per sentence.
summed = masked_embeddings.sum(dim=1)
summed.shape

torch.Size([5, 768])

In [14]:
# Sum the mask over dim 1 to get the divisor for the mean. The lower clamp
# keeps the divisor strictly positive so the division cannot hit zero.
summed_mask = mask.sum(dim=1).clamp(min=1e-9)
summed_mask.shape

torch.Size([5, 768])

In [15]:
# Mean pooling: masked sum of the embeddings divided by the summed mask.
mean_pooled = torch.div(summed, summed_mask)
mean_pooled.shape

torch.Size([5, 768])

In [16]:
# Display the mean-pooled vectors: one row per sentence (shape [5, 768] per the
# previous cell's output).
mean_pooled

tensor([[ 0.2974,  0.1260,  0.1364,  ...,  0.1278,  0.0883, -0.2417],
        [ 0.0527, -0.1373,  0.1023,  ..., -0.0057,  0.0762,  0.2454],
        [ 0.0273, -0.1436,  0.4123,  ..., -0.2376,  0.2174, -0.1170],
        [ 0.0664,  0.2313,  0.5076,  ..., -0.2370, -0.0501,  0.0009],
        [ 0.3953,  0.5431,  0.0312,  ...,  0.0463,  0.1247,  0.1673]],
       grad_fn=<DivBackward0>)

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Convert the pooled PyTorch tensor to a NumPy array for scikit-learn.
# The isinstance guard keeps this cell re-runnable: after the first run
# `mean_pooled` is already an ndarray, which has no .detach() method.
# .cpu() is a no-op for CPU tensors and makes the conversion device-safe.
if isinstance(mean_pooled, torch.Tensor):
    mean_pooled = mean_pooled.detach().cpu().numpy()

# Cosine similarity of the first sentence (kept 2-D by the [:1] slice) against
# each of the remaining sentences; the result has one row and one column per
# compared sentence.
cosine_similarity(mean_pooled[:1], mean_pooled[1:])

array([[0.5116807 , 0.5200646 , 0.76747036, 0.8000997 ]], dtype=float32)