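### BERT embedding
- a table lookup process (three learned lookup tables whose outputs are summed)
  - word (token) embeddings: 30522 × 768 (vocabulary size × hidden size)
  - segment (token type) embeddings: 2 × 768
  - position embeddings: 512 × 768 (maximum sequence length × hidden size)
- post-processing (applied to the summed embeddings)
  - layer norm
  - dropout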

In [1]:
# Load the pretrained tokenizer and encoder. Both are created from the same
# checkpoint name, so the ids produced by `tokenizer` are looked up in the
# embedding tables of the matching `model`.
from transformers import BertModel, BertTokenizer
model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [5]:
# Short single-sentence example that is pushed through every embedding table below.
test_sentence = "today is a good day"

In [15]:
# Tokenize the sentence; return_tensors='pt' makes the tokenizer hand back
# PyTorch tensors instead of plain Python lists.
# NOTE(review): `input` shadows the Python builtin of the same name. Later
# cells read `input`, so a rename has to be done in all of them together.
input = tokenizer(test_sentence, return_tensors='pt')
# Token ids only; this is what indexes the word-embedding table.
input_ids = input['input_ids']

In [20]:
# Shape of the id tensor: one sentence with seven token ids (the 5 words of
# the sentence plus two extra ids added by the tokenizer, presumably the
# [CLS]/[SEP] special tokens).
input_ids.shape

torch.Size([1, 7])

In [21]:
# Word (token) embeddings: pick one row of the word-embedding table per token id.
word_lookup = model.embeddings.word_embeddings
word_embed = word_lookup(input_ids)
word_embed

tensor([[[ 1.3630e-02, -2.6490e-02, -2.3503e-02,  ...,  8.6805e-03,
           7.1340e-03,  1.5147e-02],
         [-6.9300e-02, -1.9083e-02, -5.6926e-02,  ..., -3.6711e-05,
          -1.1248e-02, -8.8072e-03],
         [-3.6044e-02, -2.4606e-02, -2.5735e-02,  ...,  3.3691e-03,
          -1.8300e-03,  2.6855e-02],
         ...,
         [ 1.1832e-03, -9.3691e-03, -3.8018e-02,  ..., -6.9194e-02,
          -8.0286e-02, -8.7258e-02],
         [-2.8552e-02,  1.1842e-03, -4.4948e-04,  ...,  5.7283e-03,
          -2.7316e-02, -4.4553e-02],
         [-1.4521e-02, -9.9615e-03,  6.0263e-03,  ..., -2.5035e-02,
           4.6379e-03, -1.5378e-03]]], grad_fn=<EmbeddingBackward0>)

In [22]:
# The lookup keeps the shape of input_ids and appends the embedding width:
# (1, 7) ids -> (1, 7, 768) vectors.
word_embed.shape

torch.Size([1, 7, 768])

In [18]:
# Segment (token type) ids produced by the tokenizer, one per token.
# They are all 0 for this single-sentence input.
input_type = input['token_type_ids']
input_type

tensor([[0, 0, 0, 0, 0, 0, 0]])

In [24]:
# Segment embeddings: every token carries segment id 0 here, so each position
# receives the same row of the token-type table.
segment_lookup = model.embeddings.token_type_embeddings
token_type_embed = segment_lookup(input_type)
token_type_embed

tensor([[[ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         ...,
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086]]],
       grad_fn=<EmbeddingBackward0>)

In [28]:
import torch

# Position ids are the indices 0..seq_len-1 along the sequence axis of input_ids.
# They are created with an explicit integer dtype and on the same device as
# input_ids, so the position lookup keeps working if the inputs and the model
# are moved off the CPU.
seq_len = input_ids.shape[1]
pos_ids = torch.arange(seq_len, dtype=torch.long, device=input_ids.device)
pos_ids

tensor([0, 1, 2, 3, 4, 5, 6])

In [30]:
# Position embeddings: one row of the position table per position index.
# pos_ids is 1-D, so the result has no batch axis.
position_lookup = model.embeddings.position_embeddings
pos_embed = position_lookup(pos_ids)
pos_embed

tensor([[ 1.7505e-02, -2.5631e-02, -3.6642e-02,  ...,  3.3437e-05,
          6.8312e-04,  1.5441e-02],
        [ 7.7580e-03,  2.2613e-03, -1.9444e-02,  ...,  2.8910e-02,
          2.9753e-02, -5.3247e-03],
        [-1.1287e-02, -1.9644e-03, -1.1573e-02,  ...,  1.4908e-02,
          1.8741e-02, -7.3140e-03],
        ...,
        [-5.6087e-03, -1.0445e-02, -7.2288e-03,  ...,  2.0837e-02,
          3.5402e-03,  4.7708e-03],
        [-3.0871e-03, -1.8956e-02, -1.8930e-02,  ...,  7.4045e-03,
          2.0183e-02,  3.4077e-03],
        [ 6.4257e-03, -1.7664e-02, -2.2067e-02,  ...,  6.7531e-04,
          1.1108e-02,  3.7521e-03]], grad_fn=<EmbeddingBackward0>)

In [31]:
# (7, 768): one vector per position and no leading batch axis, unlike
# word_embed and token_type_embed.
pos_embed.shape

torch.Size([7, 768])

In [17]:
# Show the complete tokenizer output: input_ids, token_type_ids and attention_mask.
input

{'input_ids': tensor([[ 101, 2651, 2003, 1037, 2204, 2154,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [32]:
# Sum the three embeddings element-wise. pos_embed is (7, 768), so a leading
# batch axis is added first to line it up with the (1, 7, 768) tensors.
pos_embed_batched = pos_embed[None]
word_embed + token_type_embed + pos_embed_batched

tensor([[[ 0.0316, -0.0411, -0.0564,  ...,  0.0021,  0.0044,  0.0219],
         [-0.0611, -0.0058, -0.0727,  ...,  0.0223,  0.0151, -0.0228],
         [-0.0469, -0.0156, -0.0336,  ...,  0.0117,  0.0135,  0.0109],
         ...,
         [-0.0040, -0.0088, -0.0415,  ..., -0.0550, -0.0801, -0.0911],
         [-0.0312, -0.0068, -0.0157,  ...,  0.0065, -0.0105, -0.0498],
         [-0.0077, -0.0166, -0.0123,  ..., -0.0310,  0.0124, -0.0064]]],
       grad_fn=<AddBackward0>)