## **Extracting embeddings from pre-trained BERT**

In [1]:
!pip install transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from transformers import BertModel, BertTokenizer
import torch

In [3]:
# Load the pre-trained 'bert-base-uncased' checkpoint (config and weights are
# downloaded on first use, as the progress bars below show).
model = BertModel.from_pretrained('bert-base-uncased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [4]:
# Example input. 'MachineLearning' is deliberately written as one word, so the
# tokenizer has to break it into several sub-word pieces.
sentence = 'She is a MachineLearning Engineer and works in California'

# **Understanding Token IDs**
The token ids are indices in a vocabulary.

The ids themselves are not used directly during the training of a network; rather, they are transformed into vectors.

Say you are inputting three words whose ids are 12, 14, and 4. What is actually given as input is three vectors (say, each n-dimensional), where each id is mapped to a unique vector. These vectors could be one-hot, i.e. 1 at index 4 for token id 4 and zeros elsewhere, or they could be pre-trained embeddings such as GloVe.

Specifically, the token ID is used in the embedding layer, which you can view as a matrix whose row indices are the token IDs.

The token ID is the row index in the embedding matrix, so every row is a token representation.

There is one row for each item in the vocabulary, for instance 30K rows for 30K tokens.

Every token therefore has a (learned!) representation.

In [5]:
# Load the tokenizer for the same 'bert-base-uncased' checkpoint as the model,
# so the ids it produces index into that model's vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [6]:
# Split the sentence into lower-cased sub-word tokens; continuation pieces are
# prefixed with '##' (see the output of the next cell).
tokens = tokenizer.tokenize(sentence)

In [7]:
tokens

['she',
 'is',
 'a',
 'machine',
 '##lea',
 '##rn',
 '##ing',
 'engineer',
 'and',
 'works',
 'in',
 'california']

In [8]:
# Wrap the sentence in BERT's special tokens: [CLS] at the start, [SEP] at the end.
# Rebuilt from the tokenizer output (rather than from `tokens` itself) so that
# re-running this cell cannot stack additional [CLS]/[SEP] markers.
tokens = ['[CLS]'] + tokenizer.tokenize(sentence) + ['[SEP]']

In [9]:
print(tokens)

['[CLS]', 'she', 'is', 'a', 'machine', '##lea', '##rn', '##ing', 'engineer', 'and', 'works', 'in', 'california', '[SEP]']


In [10]:
len(tokens)

14

In [11]:
# Pad the sequence up to a fixed length of 16 tokens.
# The pad count is derived from the current length, so the cell is safe to
# re-run: once the list already has 16 tokens, nothing more is appended.
tokens = tokens + ['[PAD]'] * max(0, 16 - len(tokens))

In [12]:
print(len(tokens))

16


In [13]:
# Mask out padding: 0 for every '[PAD]' position, 1 for every real token.
attention_mask = [0 if tok == '[PAD]' else 1 for tok in tokens]

In [14]:
print(attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]


# **Unique token IDs**

In [15]:
# Look up the vocabulary index of every token. In the output printed below,
# [CLS] maps to 101, [SEP] to 102 and [PAD] to 0.
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [16]:
print(token_ids)

[101, 2016, 2003, 1037, 3698, 19738, 6826, 2075, 3992, 1998, 2573, 1999, 2662, 102, 0, 0]


In [17]:
# Reference only: the 14 unpadded tokens, shown so they can be compared
# position-by-position with the first 14 ids printed above.
['[CLS]', 'she', 'is', 'a', 'machine', '##lea', '##rn', '##ing', 'engineer', 'and', 'works', 'in', 'california', '[SEP]']

['[CLS]',
 'she',
 'is',
 'a',
 'machine',
 '##lea',
 '##rn',
 '##ing',
 'engineer',
 'and',
 'works',
 'in',
 'california',
 '[SEP]']

In [18]:
# Convert both lists to tensors with a leading batch dimension of 1,
# i.e. shape (1, sequence_length), which is the layout the model call expects.
# `as_tensor(...).reshape(1, -1)` is idempotent: if this cell is re-run when the
# names already hold (1, n) tensors, the shape stays (1, n) instead of gaining
# another leading dimension, and no tensor is re-wrapped in `torch.tensor`.
token_ids = torch.as_tensor(token_ids).reshape(1, -1)

attention_mask = torch.as_tensor(attention_mask).reshape(1, -1)

# **Getting the embedding**

In [19]:
# Forward pass to extract embeddings. This is inference only, so autograd is
# disabled: no computation graph is built and the returned tensors carry no
# grad_fn, which saves memory and time.
with torch.no_grad():
    output = model(token_ids, attention_mask=attention_mask)

In [20]:
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1925,  0.1684, -0.4252,  ..., -0.2599,  0.3736,  0.0529],
         [ 0.2417, -0.2748, -0.4909,  ...,  0.1372,  0.3408, -0.4655],
         [-0.0871,  0.0837,  0.2605,  ..., -0.4635, -0.0462,  0.2621],
         ...,
         [ 0.6711, -0.0076, -0.3847,  ..., -0.1289, -0.5171, -0.8002],
         [-0.2731,  0.1098, -0.5440,  ...,  0.0314,  0.4467, -0.3448],
         [-0.2387,  0.0119, -0.4760,  ...,  0.4656,  0.5837, -0.3774]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.9531, -0.4914, -0.8872,  0.9035,  0.8174, -0.2919,  0.9511,  0.4982,
         -0.7595, -1.0000, -0.6996,  0.9459,  0.9890,  0.4754,  0.9723, -0.8460,
         -0.1423, -0.7209,  0.4428, -0.7905,  0.7822,  1.0000,  0.2119,  0.4066,
          0.5813,  0.9923, -0.8380,  0.9670,  0.9746,  0.8324, -0.8227,  0.4136,
         -0.9931, -0.2821, -0.8860, -0.9961,  0.5261, -0.8722, -0.0915, -0.0950,
         -0.9237,  0.5106,  1.00

In [21]:
# Per-token embeddings: (batch, sequence length, hidden size).
output.last_hidden_state.shape

torch.Size([1, 16, 768])

In [22]:
# Same expression as the previous cell: element 0 of the output is the
# last_hidden_state tensor (one 768-dimensional vector per token position).
output[0].shape

torch.Size([1, 16, 768])

In [23]:
# Pooled sentence-level vector: (batch, hidden size).
output.pooler_output.shape

torch.Size([1, 768])