In [1]:
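# One-time environment setup: uncomment and run the lines below if any of
# these packages are missing from the kernel's environment.
# %pip install transformers
# %pip install ipywidgets
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# %pip install scikit-learn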

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch

In [3]:
# Checkpoint identifier, defined once so the tokenizer and the model are
# always loaded from the same checkpoint.
MODEL_NAME = 'sentence-transformers/bert-base-nli-mean-tokens'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [4]:
# Alternative prompt set, currently disabled: it is wrapped in a bare
# triple-quoted string, so the text is evaluated as a string expression and
# discarded. To use it, remove the two ''' delimiter lines and drop the
# `prompts` assignment that follows.
'''
prompts = [
    "nine hundred alda in meters. If you don't have any reference, try the following definition  and use fermi estimation to get in the ballpark :\n\nJochi Khasar, the Khan\u2019s brother, was known far and wide for his ability to hit his targets from more than nine hundred alda, a traditional Mongolian unit of measurement equal to the distance between the tips of the middle fingers of two outstretched arms.",
    "I wouldn't have expected a fathom to be that unit. I always thought it was used for depths, so I figured it'd be some nautical definition",
    "what's the world record furthest sniper shot?",
    "Yeah, so Jochi Kasar got a very significant % of that with a mongolian bow? I'm mildly skeptical because that's very impressive for medieval-ish tech"
]
'''

# Sentences to embed; the first one is later compared against the others.
prompts = [
    "write a python code to do sorting on this array",
    "can you implement this for bubble sort instead",
    "What are other ways to sort an array of number",
    "I want to use this for my project"
]

# Every prompt is truncated or padded to exactly this many token ids.
MAX_LENGTH = 128

# Tokenize all prompts in one batched call instead of encoding them one at a
# time and stacking the per-prompt tensors by hand. The tokenizer returns
# PyTorch tensors with one row per prompt.
encoded = tokenizer(prompts, max_length=MAX_LENGTH,
                    truncation=True, padding='max_length',
                    return_tensors='pt')

# Keep only the two entries the rest of the notebook reads, under the same
# keys as before, so `model(**tokens)` receives the same keyword arguments.
tokens = {
    'input_ids': encoded['input_ids'],
    'attention_mask': encoded['attention_mask'],
}

In [5]:
# Inference only: run the forward pass without recording an autograd graph,
# so the resulting tensors carry no gradient history.
with torch.no_grad():
    outputs = model(**tokens)
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [6]:
# Pull the `last_hidden_state` entry out of the model output and display it.
embeddings = outputs['last_hidden_state']
embeddings

tensor([[[-4.3190e-01,  2.9822e-01,  2.9383e-01,  ..., -9.5950e-02,
          -7.2829e-01,  5.2303e-01],
         [-1.7692e-02,  7.1392e-01,  6.8303e-01,  ..., -2.3937e-01,
          -1.2221e+00,  3.1039e-01],
         [-5.0117e-01,  3.8652e-01,  4.9108e-01,  ..., -9.2106e-02,
          -8.6441e-01,  4.7457e-01],
         ...,
         [-3.2257e-01,  4.8349e-04,  1.6670e-01,  ...,  1.0465e-01,
          -4.8503e-01,  3.7138e-01],
         [-4.0685e-01,  2.1174e-01,  2.7786e-01,  ...,  1.0413e-01,
          -4.8027e-01,  3.3841e-01],
         [-3.0408e-01,  1.9008e-01,  2.7161e-01,  ...,  6.3087e-02,
          -5.5654e-01,  3.0209e-01]],

        [[ 2.6178e-01, -9.5968e-01,  1.1267e+00,  ..., -9.2006e-01,
          -1.0057e+00,  2.4199e-01],
         [ 9.1039e-01, -1.3267e+00,  1.3404e+00,  ..., -5.2803e-01,
          -1.3458e+00, -1.2290e-01],
         [ 4.9865e-01, -1.2001e+00,  1.0422e+00,  ..., -5.9383e-01,
          -1.2277e+00,  3.2731e-02],
         ...,
         [ 3.3391e-01, -8

In [7]:
# Dimensions of the embeddings tensor.
embeddings.size()

torch.Size([4, 128, 768])

In [8]:
# Attention mask produced during tokenization, and its dimensions.
attention_mask = tokens['attention_mask']
attention_mask.size()

torch.Size([4, 128])

In [9]:
# Append a trailing axis to the attention mask, expand it to the same size
# as `embeddings`, and cast to float so the two can be multiplied
# element-wise.
mask = attention_mask[..., None].expand_as(embeddings).float()
mask.size()

torch.Size([4, 128, 768])

In [10]:
# Display the expanded float mask.
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 

In [11]:
# Element-wise product: entries where the mask is 0 become 0, so they add
# nothing to the sum taken in the next step.
masked_embeddings = torch.mul(embeddings, mask)
masked_embeddings.size()

torch.Size([4, 128, 768])

In [12]:
# Display the masked embeddings.
masked_embeddings

tensor([[[-0.4319,  0.2982,  0.2938,  ..., -0.0959, -0.7283,  0.5230],
         [-0.0177,  0.7139,  0.6830,  ..., -0.2394, -1.2221,  0.3104],
         [-0.5012,  0.3865,  0.4911,  ..., -0.0921, -0.8644,  0.4746],
         ...,
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000],
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000],
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000]],

        [[ 0.2618, -0.9597,  1.1267,  ..., -0.9201, -1.0057,  0.2420],
         [ 0.9104, -1.3267,  1.3404,  ..., -0.5280, -1.3458, -0.1229],
         [ 0.4987, -1.2001,  1.0422,  ..., -0.5938, -1.2277,  0.0327],
         ...,
         [ 0.0000, -0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000],
         [ 0.0000, -0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000],
         [ 0.0000, -0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000]],

        [[ 0.0392,  0.0113,  0.4152,  ...,  0.0799, -0.1277,  1.1582],
         [ 0.2711, -0.0037,  0.5989,  ...,  0

In [13]:
# Sum the masked embeddings along dimension 1.
summed = masked_embeddings.sum(dim=1)
summed.size()

torch.Size([4, 768])

In [14]:
# Sum the mask along dimension 1, then clamp to a tiny positive floor so
# the division that follows can never divide by zero.
summed_mask = mask.sum(dim=1).clamp(min=1e-9)
summed_mask.size()

torch.Size([4, 768])

In [15]:
# Display the clamped mask sums used as the divisor for mean pooling.
summed_mask

tensor([[12., 12., 12.,  ..., 12., 12., 12.],
        [10., 10., 10.,  ..., 10., 10., 10.],
        [12., 12., 12.,  ..., 12., 12., 12.],
        [10., 10., 10.,  ..., 10., 10., 10.]])

In [16]:
# Mean pooling: divide the masked sum by the clamped mask sum.
mean_pooled = torch.div(summed, summed_mask)
mean_pooled

tensor([[-0.3523,  0.4436,  0.4749,  ..., -0.1000, -0.9071,  0.3401],
        [ 0.5114, -0.7860,  1.2028,  ..., -0.6622, -1.1053,  0.2055],
        [-0.0403,  0.1121,  0.6267,  ..., -0.0490, -0.3097,  1.3088],
        [ 0.2524,  0.0227,  2.2247,  ..., -0.9334, -1.2291,  0.4203]],
       grad_fn=<DivBackward0>)

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# Convert the pooled embeddings from a PyTorch tensor to a NumPy array.
# The guard makes this cell safe to re-run: after the first run the name is
# bound to an ndarray, which has no `.detach()`. `.cpu()` leaves a CPU
# tensor unchanged and keeps the conversion working for a tensor on a GPU.
if torch.is_tensor(mean_pooled):
    mean_pooled = mean_pooled.detach().cpu().numpy()

# Cosine similarity between the first pooled embedding and each of the
# remaining ones; the `[:1]` slice keeps the first row two-dimensional.
cosine_similarity(
    mean_pooled[:1],
    mean_pooled[1:]
)

array([[0.32856137, 0.4381972 , 0.34922183]], dtype=float32)