In [1]:
import numpy as np
import torch
from transformers import BertTokenizer, BertModel

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

In [3]:
sentences = [
    'What the heck? Anyway, I do not care!', 
    'I am a British citizen.',
    'He is the king of England.', 
    'He is the king of Python, a good machine learning engineer.',
    'He is the king of Spain.'
]

In [4]:
tokenizer.tokenize(sentences[0])

['what', 'the', 'heck', '?', 'anyway', ',', 'i', 'do', 'not', 'care', '!']

In [5]:
input_ids = []
attention_masks = []
tokenized_texts = []

for sentence in sentences:
    marked_text = '[CLS]' + sentence + '[SEP]'
    
    encoded_dict = \
        tokenizer.encode_plus(
                         sentence,
                         add_special_tokens=True,
                         trunction=True,
                         max_length=48,
                         pad_to_max_length=True,
                         return_tensors='pt')
    
    
    tokenized_texts.append(tokenizer.tokenize(marked_text))
    
    input_ids.append(encoded_dict['input_ids'])

W0820 18:17:35.605566 46912496406208 tokenization_utils_base.py:1447] Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
W0820 18:17:35.607071 46912496406208 tokenization_utils.py:275] Keyword arguments {'trunction': True} not recognized.
W0820 18:17:35.610715 46912496406208 tokenization_utils_base.py:1447] Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `trunc

In [6]:
encoded_dict

{'input_ids': tensor([[ 101, 2002, 2003, 1996, 2332, 1997, 3577, 1012,  102,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [7]:
input_ids = torch.cat(input_ids, dim=0)
input_ids

tensor([[  101,  2054,  1996, 17752,  1029,  4312,  1010,  1045,  2079,  2025,
          2729,   999,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1045,  2572,  1037,  2329,  6926,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2002,  2003,  1996,  2332,  1997,  2563,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,

In [15]:
segments_id = torch.LongTensor(np.array(input_ids>0))

In [16]:
with torch.no_grad():
    vec1, vec2, hidden_state = model(input_ids, segments_id)

In [17]:
vec1.shape

torch.Size([5, 48, 768])

In [18]:
vec2.shape

torch.Size([5, 768])

In [19]:
for vec in hidden_state:
    print(vec.shape)

torch.Size([5, 48, 768])
torch.Size([5, 48, 768])
torch.Size([5, 48, 768])
torch.Size([5, 48, 768])
torch.Size([5, 48, 768])
torch.Size([5, 48, 768])
torch.Size([5, 48, 768])
torch.Size([5, 48, 768])
torch.Size([5, 48, 768])
torch.Size([5, 48, 768])
torch.Size([5, 48, 768])
torch.Size([5, 48, 768])
torch.Size([5, 48, 768])


In [20]:
vec1

tensor([[[ 0.2708,  0.3256,  0.0573,  ..., -0.0324,  0.3578,  0.5502],
         [ 0.3386,  0.4716, -0.1418,  ...,  0.4641,  0.7584, -0.0971],
         [-0.0324,  0.2588,  0.3092,  ...,  0.2473,  1.4089, -0.2652],
         ...,
         [ 0.6796,  0.3689,  0.7275,  ...,  0.4562,  0.0854,  0.3063],
         [ 0.1484,  0.0723,  0.4016,  ...,  0.5883,  0.1694,  0.1883],
         [ 0.2493,  0.2640,  0.4613,  ...,  0.2719,  0.2291,  0.3422]],

        [[ 0.1190,  0.3330, -0.2150,  ..., -0.2455,  0.4145,  0.7172],
         [ 0.5392,  0.2716, -0.4211,  ..., -0.2646,  0.8108,  0.7869],
         [ 0.5022,  0.1801, -0.0575,  ..., -0.7293,  0.6794,  0.8216],
         ...,
         [ 0.5527,  0.0522,  0.5830,  ..., -0.0652, -0.1205,  0.2030],
         [-0.1811, -0.3234, -0.0653,  ...,  0.2794,  0.5110, -0.0428],
         [-0.0302, -0.0978, -0.1857,  ...,  0.2594,  0.4029,  0.0332]],

        [[-0.4268,  0.2247, -0.2767,  ..., -0.3985,  0.4350,  0.6681],
         [-0.3115, -0.0246,  0.0555,  ..., -0

In [21]:
vec2

tensor([[-0.8521, -0.3779, -0.9208,  ..., -0.6868, -0.6339,  0.9125],
        [-0.8736, -0.4717, -0.8633,  ..., -0.6980, -0.6654,  0.8896],
        [-0.8704, -0.5004, -0.7735,  ..., -0.4925, -0.7196,  0.8515],
        [-0.7944, -0.5811, -0.9006,  ..., -0.6408, -0.7001,  0.7812],
        [-0.9025, -0.5265, -0.8410,  ..., -0.6424, -0.7370,  0.8596]])

In [22]:
token_embeddings = torch.stack(hidden_state, dim=0)
token_embeddings

tensor([[[[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
            3.8253e-02,  1.6400e-01],
          [ 9.8677e-01,  4.5401e-01, -1.0180e+00,  ...,  7.5422e-01,
            2.9288e-01, -1.5443e+00],
          [-7.7901e-01,  4.7813e-01,  8.5864e-02,  ..., -2.4891e-01,
            5.0784e-01, -5.6890e-01],
          ...,
          [ 1.4135e-01, -3.9700e-01, -6.2236e-02,  ...,  8.2293e-02,
           -2.5761e-01,  2.5720e-01],
          [ 2.0530e-01, -6.2230e-01, -2.4686e-01,  ..., -1.2149e-01,
           -1.6731e-01,  2.4943e-01],
          [ 4.1845e-01, -4.7524e-01, -1.8688e-01,  ..., -1.8926e-01,
           -2.6207e-01,  1.4601e-01]],

         [[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
            3.8253e-02,  1.6400e-01],
          [-3.4023e-04,  5.3974e-01, -2.8805e-01,  ...,  7.5731e-01,
            8.9008e-01,  1.6575e-01],
          [-6.3496e-01,  1.9748e-01,  2.5116e-01,  ..., -4.0819e-02,
            1.3468e+00, -6.9357e-01],
          ...,
     

In [23]:
token_embeddings.shape

torch.Size([13, 5, 48, 768])

In [24]:
# swap dimensions: [sentence, tokens, hidden layers, features]
token_embeddings = token_embeddings.permute(1, 2, 0, 3)

In [25]:
token_embeddings.shape

torch.Size([5, 48, 13, 768])

In [26]:
processed_embeddings = token_embeddings[:, :, 9:, :]
processed_embeddings.shape

torch.Size([5, 48, 4, 768])

In [27]:
embeddings = torch.reshape(processed_embeddings, (5, 48, -1))
embeddings = embeddings.detach().numpy()
embeddings.shape

(5, 48, 3072)

In [28]:
from scipy.spatial.distance import cosine

In [29]:
cosine(embeddings[2][4], embeddings[3][4])

0.3443491458892822

In [30]:
cosine(embeddings[2][4], embeddings[4][4])

0.06491076946258545

In [31]:
embeddings

array([[[ 0.24550708,  0.20207022, -0.615036  , ..., -0.0323792 ,
          0.3577894 ,  0.550167  ],
        [ 0.381902  ,  1.4579368 ,  0.01995093, ...,  0.46406403,
          0.75837046, -0.09713171],
        [-0.10853177,  0.5075319 ,  0.11464608, ...,  0.24733742,
          1.4089295 , -0.2651754 ],
        ...,
        [ 1.0337685 ,  1.2701299 ,  1.4596736 , ...,  0.45624638,
          0.08538438,  0.30630097],
        [-0.625774  ,  0.62008536, -0.14527622, ...,  0.58833843,
          0.1694477 ,  0.18828699],
        [-0.41849947,  1.0807803 ,  0.67688525, ...,  0.2719449 ,
          0.22910678,  0.3421992 ]],

       [[ 0.03494433, -0.22927988, -1.027218  , ..., -0.24553123,
          0.41452816,  0.7172312 ],
        [ 0.7127029 ,  0.44174722, -0.62983584, ..., -0.2645788 ,
          0.81075966,  0.7868859 ],
        [ 0.7622329 ,  0.00343795, -0.3987933 , ..., -0.7293487 ,
          0.67936385,  0.8215848 ],
        ...,
        [ 0.6663198 ,  0.15924904,  1.0996404 , ..., -

In [35]:
tokenized_texts

[['[CLS]',
  'what',
  'the',
  'heck',
  '?',
  'anyway',
  ',',
  'i',
  'do',
  'not',
  'care',
  '!',
  '[SEP]'],
 ['[CLS]', 'i', 'am', 'a', 'british', 'citizen', '.', '[SEP]'],
 ['[CLS]', 'he', 'is', 'the', 'king', 'of', 'england', '.', '[SEP]'],
 ['[CLS]',
  'he',
  'is',
  'the',
  'king',
  'of',
  'python',
  ',',
  'a',
  'good',
  'machine',
  'learning',
  'engineer',
  '.',
  '[SEP]'],
 ['[CLS]', 'he', 'is', 'the', 'king', 'of', 'spain', '.', '[SEP]']]

In [56]:
cosine(vec2[0], vec2[1])

0.003948211669921875