In [1]:
# INSTALL REQUIRED PACKAGES.

!pip install transformers -q

In [2]:
from transformers import BertModel, BertTokenizer
import torch

In [3]:
# Example input that is tokenized and embedded in the cells below.
sentence = 'I like to workout everyday.'

## **Extracting Embeddings from Pre-Trained BERT**

### **Tokenization**

In [4]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
# On first use its files (tokenizer_config.json, vocab.txt, tokenizer.json,
# config.json) are downloaded, as the progress bars below show.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
# Split the raw sentence into a list of token strings.
# As the output shows, the text is lower-cased, the final '.' becomes its own
# token, and no special tokens are added at this stage.
tokens = tokenizer.tokenize(sentence)
print(tokens)

['i', 'like', 'to', 'workout', 'everyday', '.']


Add the `[CLS]` (classification) token at the beginning and the `[SEP]` (separator) token at the end of the token list.

In [6]:
# Wrap the sequence in the special tokens: [CLS] opens it, [SEP] closes it.
tokens = ['[CLS]', *tokens, '[SEP]']
print(tokens)

['[CLS]', 'i', 'like', 'to', 'workout', 'everyday', '.', '[SEP]']


In [7]:
# Show the sequence length, then the tokens themselves, each on its own line.
print(len(tokens), tokens, sep='\n')

8
['[CLS]', 'i', 'like', 'to', 'workout', 'everyday', '.', '[SEP]']


In [8]:
# ADD PAD TOKENS
# Pad up to a fixed target length instead of appending a hard-coded number of
# [PAD] tokens. For the 8-token sequence above this adds exactly two pads, and
# re-running the cell adds nothing further because the length is already met.

PADDED_LENGTH = 10  # total sequence length after padding
tokens = tokens + ['[PAD]'] * (PADDED_LENGTH - len(tokens))
print(tokens)

['[CLS]', 'i', 'like', 'to', 'workout', 'everyday', '.', '[SEP]', '[PAD]', '[PAD]']


In [9]:
# Build the attention mask: 1 for every real token, 0 for every [PAD] position.
attention_mask = [int(token != '[PAD]') for token in tokens]
print(attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]


### **Token IDs**

In [10]:
# Map every token string to its integer id in the tokenizer's vocabulary.
# In the output below, [CLS] maps to 101, [SEP] to 102 and [PAD] to 0.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

[101, 1045, 2066, 2000, 27090, 10126, 1012, 102, 0, 0]


In [11]:
# Convert the Python lists to tensors with a leading batch dimension of size 1,
# by nesting each list inside an outer list before building the tensor.
token_ids = torch.tensor([token_ids])

attention_mask = torch.tensor([attention_mask])

### **Embeddings**

Embeddings from the top-most encoder layer (12th encoder)

In [12]:
# Load the pretrained BERT encoder for the same 'bert-base-uncased' checkpoint
# that the tokenizer was loaded from. On first use the weights file
# (model.safetensors, 440M per the progress bar below) is downloaded.
model = BertModel.from_pretrained('bert-base-uncased')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [13]:
# Run the forward pass to obtain the embeddings.
# We only extract embeddings here, so autograd is disabled: this avoids
# building a computation graph and the returned tensors carry no grad_fn.
with torch.no_grad():
    output = model(
        input_ids=token_ids,
        attention_mask=attention_mask,
    )

# Display the model output (last_hidden_state and pooler_output).
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.4208, -0.0586, -0.3886,  ..., -0.0695,  0.4563,  0.3193],
         [ 0.9853, -0.3510, -0.5643,  ..., -0.2356,  0.6147,  0.0520],
         [ 1.0001,  0.4729,  0.2697,  ...,  0.1627, -0.0112,  0.0290],
         ...,
         [ 0.8115,  0.0555,  0.0949,  ...,  0.3182, -0.4634, -0.4141],
         [ 0.8258,  0.0069, -0.0302,  ...,  0.1734,  0.0204, -0.0843],
         [ 0.5143, -0.1884, -0.0828,  ...,  0.4509, -0.0834, -0.0878]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-8.4668e-01, -4.7854e-01, -9.1003e-01,  7.3080e-01,  8.4408e-01,
         -2.0879e-01,  7.2997e-01,  1.9190e-01, -8.3580e-01, -9.9988e-01,
         -5.4148e-01,  9.8079e-01,  9.6518e-01,  5.5649e-01,  8.9265e-01,
         -7.2597e-01, -3.8701e-01, -5.8026e-01,  1.1038e-01, -2.4481e-01,
          7.1762e-01,  9.9999e-01, -1.8475e-01,  2.9750e-01,  4.0805e-01,
          9.9449e-01, -7.5387e-01,  9.1026e-01,  9.2421e-01,  7.493

In [14]:
# Shape of the per-token embeddings from the last encoder layer.
# `last_hidden_state` is the first field of the output, i.e. output[0].
output.last_hidden_state.shape

torch.Size([1, 10, 768])

In [15]:
# Pooled output for the whole sequence.
# `pooler_output` is the second field of the output, i.e. output[1].
output.pooler_output

tensor([[-8.4668e-01, -4.7854e-01, -9.1003e-01,  7.3080e-01,  8.4408e-01,
         -2.0879e-01,  7.2997e-01,  1.9190e-01, -8.3580e-01, -9.9988e-01,
         -5.4148e-01,  9.8079e-01,  9.6518e-01,  5.5649e-01,  8.9265e-01,
         -7.2597e-01, -3.8701e-01, -5.8026e-01,  1.1038e-01, -2.4481e-01,
          7.1762e-01,  9.9999e-01, -1.8475e-01,  2.9750e-01,  4.0805e-01,
          9.9449e-01, -7.5387e-01,  9.1026e-01,  9.2421e-01,  7.4930e-01,
         -5.8893e-01,  1.8151e-01, -9.9026e-01, -3.7076e-03, -8.7553e-01,
         -9.8144e-01,  4.9391e-01, -5.5191e-01,  1.7465e-01,  1.8843e-01,
         -8.9124e-01,  2.3312e-01,  9.9992e-01, -5.1525e-01,  3.7970e-01,
         -1.0369e-01, -1.0000e+00,  2.0750e-01, -8.9042e-01,  9.4703e-01,
          8.7286e-01,  9.4726e-01,  1.1170e-01,  4.0460e-01,  4.6015e-01,
         -4.3178e-01, -1.8588e-01, -1.7188e-02, -1.9675e-01, -5.2704e-01,
         -6.1443e-01,  4.2228e-01, -8.9837e-01, -7.2606e-01,  9.4511e-01,
          8.2882e-01, -6.0071e-02, -1.

In [16]:
# Shape of the pooled output, accessed by field name rather than as output[1].
output.pooler_output.shape

torch.Size([1, 768])