In [None]:
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

In [1]:
# Toy corpus of six sentences that is embedded and compared with cosine
# similarity in the cells below.
# NOTE(review): the third sentence reads like a reworded version of the first
# ("coffin ... Jello" vs. "person box ... jelly"); presumably that is the pair
# the similarity scores are meant to surface -- confirm.
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "Standing on one's head at job interviews forms a lasting impression.",
    "It took him a month to finish the meal.",
    "He found a leprechaun in his walnut shell."
]

In [2]:
# Single source of truth for the checkpoint so the tokenizer and the encoder
# are always loaded from the same model.
MODEL_NAME = 'sentence-transformers/bert-base-nli-mean-tokens'
# Every sentence is truncated / padded to exactly this many tokens.
MAX_LENGTH = 128

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Tokenize the whole batch in one call instead of looping over sentences with
# the deprecated `encode_plus` and stacking the per-sentence tensors by hand.
# With padding='max_length' and return_tensors='pt' each returned tensor
# already has one row per sentence and MAX_LENGTH columns.
encoded = tokenizer(
    sentences,
    max_length=MAX_LENGTH,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)

# Keep `tokens` a plain dict holding just the two tensors used downstream
# (`model(**tokens)` and `tokens['attention_mask']`).
tokens = {
    'input_ids': encoded['input_ids'],
    'attention_mask': encoded['attention_mask'],
}

In [3]:
# Sanity check: one row of token ids per sentence, padded to the max length.
tokens['input_ids'].size()

torch.Size([6, 128])

In [4]:
# Inference only: run the forward pass without autograd so no computation
# graph is recorded for the hidden states (less memory, and the resulting
# tensors carry no grad_fn).
with torch.no_grad():
    outputs = model(**tokens)
# Show which output fields the model returned.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [5]:
# Per-token vectors taken from the model's `last_hidden_state` output.
embeddings = outputs['last_hidden_state']
embeddings

tensor([[[-6.9229e-02,  6.2300e-01,  3.5371e-02,  ...,  8.0334e-01,
           1.6314e+00,  3.2812e-01],
         [ 3.6730e-02,  6.8419e-01,  1.9460e-01,  ...,  8.4759e-02,
           1.4747e+00, -3.0080e-01],
         [-1.2142e-02,  6.5431e-01, -7.2717e-02,  ..., -3.2600e-02,
           1.7717e+00, -6.8121e-01],
         ...,
         [ 1.9532e-01,  1.1085e+00,  3.3905e-01,  ...,  1.2826e+00,
           1.0114e+00, -7.2754e-02],
         [ 9.0217e-02,  1.0288e+00,  3.2973e-01,  ...,  1.2940e+00,
           9.8650e-01, -1.1125e-01],
         [ 1.2404e-01,  9.7365e-01,  3.9329e-01,  ...,  1.1359e+00,
           8.7685e-01, -1.0435e-01]],

        [[-3.2124e-01,  8.2512e-01,  1.0554e+00,  ..., -1.8555e-01,
           1.5169e-01,  3.9366e-01],
         [-7.1457e-01,  1.0297e+00,  1.1217e+00,  ...,  3.3118e-02,
           2.3820e-01, -1.5632e-01],
         [-2.3522e-01,  1.1353e+00,  8.5941e-01,  ..., -4.3096e-01,
          -2.7242e-02, -2.9677e-01],
         ...,
         [-5.4000e-01,  3

In [6]:
# Dimensions of the per-token embedding tensor.
embeddings.size()

torch.Size([6, 128, 768])

In [7]:
# Pull out the tokenizer's attention mask; it is used below to mask the
# embeddings before they are averaged.
attention_mask = tokens['attention_mask']
attention_mask.size()

torch.Size([6, 128])

In [8]:
# Add a trailing axis to the attention mask and broadcast it to the shape of
# `embeddings`, as floats, so the two can be multiplied element-wise.
mask = attention_mask[..., None].expand_as(embeddings).float()
mask.size()

torch.Size([6, 128, 768])

In [9]:
# Element-wise product: positions where the mask is 0 become zero vectors.
masked_embeddings = torch.mul(embeddings, mask)
masked_embeddings.size()

torch.Size([6, 128, 768])

In [10]:
# Collapse axis 1 by summing the masked vectors of each sentence.
summed = masked_embeddings.sum(dim=1)
summed.size()

torch.Size([6, 768])

In [11]:
# Sum the mask over axis 1 to get the divisor for the mean; the lower clamp
# keeps the divisor strictly positive so the division below cannot hit zero.
summed_mask = mask.sum(dim=1).clamp(min=1e-9)
summed_mask.size()

torch.Size([6, 768])

In [12]:
# Mean pooling: summed masked embeddings divided by the summed mask.
mean_pooled = torch.div(summed, summed_mask)
mean_pooled

tensor([[ 0.0745,  0.8637,  0.1795,  ...,  0.7734,  1.7247, -0.1803],
        [-0.3715,  0.9729,  1.0840,  ..., -0.2552, -0.2759,  0.0358],
        [-0.5030,  0.7950, -0.1240,  ...,  0.1441,  0.9704, -0.1791],
        [-0.0132,  0.9773,  1.4516,  ..., -0.8462, -1.4004, -0.4118],
        [-0.2019,  0.0597,  0.8603,  ..., -0.0100,  0.8431, -0.0841],
        [-0.2131,  1.0175, -0.8833,  ...,  0.7371,  0.1947, -0.3011]],
       grad_fn=<DivBackward0>)

In [13]:
# Convert from PyTorch tensor to NumPy array for scikit-learn.
# The cell rebinds `mean_pooled`, so it is guarded to stay idempotent: on a
# second run the name already holds an ndarray, which has no `.detach()`.
# `.cpu()` is a no-op for CPU tensors and makes the conversion work when the
# tensor lives on another device.
if isinstance(mean_pooled, torch.Tensor):
    mean_pooled = mean_pooled.detach().cpu().numpy()

In [24]:
# Display the pooled sentence embeddings after the NumPy conversion.
mean_pooled

array([[ 0.0744615 ,  0.86369663,  0.17946403, ...,  0.77344006,
         1.7247488 , -0.18027496],
       [-0.37146333,  0.9729013 ,  1.0839937 , ..., -0.25521275,
        -0.2759373 ,  0.03575867],
       [-0.50298226,  0.794986  , -0.12402522, ...,  0.14406362,
         0.97037494, -0.17911562],
       [-0.01324306,  0.9772857 ,  1.451594  , ..., -0.846165  ,
        -1.4004318 , -0.41184372],
       [-0.20192645,  0.05970357,  0.8602745 , ..., -0.01000803,
         0.84306246, -0.0840771 ],
       [-0.21311913,  1.0174934 , -0.8832755 , ...,  0.73710376,
         0.19469155, -0.30111268]], dtype=float32)

In [76]:
# Cosine similarity of the first sentence against each remaining sentence.
# Row 0 is taken with a slice so it stays 2-D (one sample), which is the
# input layout cosine_similarity expects.
cosine_similarity(mean_pooled[:1], mean_pooled[1:])

array([[0.3308891 , 0.72192585, 0.17475507, 0.4470966 , 0.5548363 ]],
      dtype=float32)

In [89]:
%%time

result = cosine_similarity(mean_pooled,mean_pooled)
print(result)

[[1.         0.33088908 0.72192585 0.17475504 0.44709677 0.5548364 ]
 [0.33088908 0.99999964 0.24826953 0.2923194  0.20174855 0.2950728 ]
 [0.72192585 0.24826953 1.         0.25110355 0.5565801  0.41768277]
 [0.17475504 0.2923194  0.25110355 0.99999994 0.26012164 0.13192454]
 [0.44709677 0.20174855 0.5565801  0.26012164 1.0000002  0.22627155]
 [0.5548364  0.2950728  0.41768277 0.13192454 0.22627155 0.9999998 ]]
CPU times: user 1.07 ms, sys: 0 ns, total: 1.07 ms
Wall time: 973 µs


In [91]:
# (row, column) index pairs of `result` whose similarity is above 0.5.
# NOTE(review): this keeps the diagonal (each sentence vs. itself) and both
# orderings of every pair, e.g. [0, 2] and [2, 0] -- confirm that is intended.
rows = np.argwhere(result > 0.5)
rows

array([[0, 0],
       [0, 2],
       [0, 5],
       [1, 1],
       [2, 0],
       [2, 2],
       [2, 4],
       [3, 3],
       [4, 2],
       [4, 4],
       [5, 0],
       [5, 5]])

In [97]:
# Scratch check: build a small 2x3 array with a float dtype and inspect which
# concrete NumPy dtype that resolves to.
a = np.array([[1, 2, 4], [5, 8, 7]], dtype=float)
a.dtype

dtype('float64')

In [75]:

import numpy as np

%%time

for i in [mean_pooled]:
    for j in [mean_pooled]:
        similarity = cosine_similarity(i,j)
print(similarity)



l = []
for i in range(len(sentences)):
    lst = []
    for j in range(len(sentences)):
        lst.append(float(j))
    l.append(lst)
sent = np.asarray(l)
print(sent)

# dict = {}
# for A, B in zip(sent, similarity):  
#     dict[A] = [B]

# print(dict)



TypeError: unhashable type: 'list'