In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '8'

from transformers import AutoTokenizer
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show

In [2]:
model_ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, cache_dir='./cache')
model = BertModel.from_pretrained(model_ckpt, cache_dir='./cache')
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (den

In [3]:
text = "Time flies like an arrow."
show(model, "bert", tokenizer, text, display_mode='light', layer=0, head=8)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
text_2 = "Fruit flies like a banana."
show(model, "bert", tokenizer, text_2, display_mode='light', layer=0, head=8)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
inputs = tokenizer(text, return_tensors='pt', add_special_tokens=False)
inputs

{'input_ids': tensor([[ 2051, 10029,  2066,  2019,  8612,  1012]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [6]:
input_ids = inputs['input_ids']
print(input_ids.shape)
input_ids

torch.Size([1, 6])


tensor([[ 2051, 10029,  2066,  2019,  8612,  1012]])

In [7]:
from torch import nn
from transformers import AutoConfig

In [8]:
config = AutoConfig.from_pretrained(model_ckpt, cache_dir='./cache')
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.39.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [9]:
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
token_emb

Embedding(30522, 768)

In [10]:
nn.Embedding.__dict__

mappingproxy({'__module__': 'torch.nn.modules.sparse',
              '__annotations__': {'num_embeddings': int,
               'embedding_dim': int,
               'padding_idx': typing.Optional[int],
               'max_norm': typing.Optional[float],
               'norm_type': float,
               'scale_grad_by_freq': bool,
               'weight': torch.Tensor,
               'freeze': bool,
               'sparse': bool},
              '__doc__': 'A simple lookup table that stores embeddings of a fixed dictionary and size.\n\n    This module is often used to store word embeddings and retrieve them using indices.\n    The input to the module is a list of indices, and the output is the corresponding\n    word embeddings.\n\n    Args:\n        num_embeddings (int): size of the dictionary of embeddings\n        embedding_dim (int): the size of each embedding vector\n        padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient

The token embeddings at this point are NOT contextualized. They are static embeddings. The subsequent attention layers will mix these token embeddings to disambiguate and inform the representation of each token with the content of its context.

In [11]:
token_emb.__dict__

{'training': True,
 '_parameters': OrderedDict([('weight', Parameter containing:
               tensor([[-1.0808, -0.4450, -0.7923,  ...,  0.7059, -0.4682,  1.5237],
                       [ 0.0546, -0.8265,  0.5422,  ..., -0.7210,  0.3176,  0.6092],
                       [ 1.3479,  1.5353, -1.1573,  ...,  1.0432,  1.7784, -1.2654],
                       ...,
                       [-0.0122, -0.5778,  0.6023,  ..., -0.5060,  0.6677, -1.9856],
                       [ 0.5653, -0.8399, -0.2348,  ..., -1.3068, -1.0952,  0.8706],
                       [-1.1287,  1.1438, -1.8044,  ...,  0.5818,  0.2767,  0.2174]],
                      requires_grad=True))]),
 '_buffers': OrderedDict(),
 '_non_persistent_buffers_set': set(),
 '_backward_pre_hooks': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_is_full_backward_hook': None,
 '_forward_hooks': OrderedDict(),
 '_forward_hooks_with_kwargs': OrderedDict(),
 '_forward_hooks_always_called': OrderedDict(),
 '_forward_pre_hooks': OrderedDi

In [12]:
# Generate the token embeddings by feeding the input_ids.
inputs_embeds = token_emb(input_ids)
print(inputs_embeds.shape)
inputs_embeds

torch.Size([1, 6, 768])


tensor([[[ 1.3969,  1.2611, -1.1523,  ...,  0.4819,  0.5900,  0.0288],
         [ 0.7508, -1.1430, -0.7338,  ...,  1.2043, -0.5933, -1.3617],
         [-1.1058,  0.5725, -1.5255,  ...,  0.3847, -0.9194,  0.5148],
         [-1.6923, -0.5253, -0.3221,  ..., -0.3957,  2.0860,  0.1895],
         [ 0.3731,  3.5894, -0.4419,  ..., -0.5011, -0.9587,  0.9027],
         [-0.9394, -0.4156, -1.6516,  ..., -0.7945, -0.7414,  0.9029]]],
       grad_fn=<EmbeddingBackward0>)

[batch_size, seq_len, hidden_dim] = [1, 6, 768]

Create the query, key and value vectors and calculate the attention scores using the dot product as the similarity function.

In [13]:
import torch
from math import sqrt

In [14]:
# Create the query, key and value.
# Keep query, key and value vectors equal for simplicity.
# In fact, they should be generated by applying independent weight matrices W_q, W_k and W_v to the input embeddings.
query = key = value = inputs_embeds
print(query, query.shape)

tensor([[[ 1.3969,  1.2611, -1.1523,  ...,  0.4819,  0.5900,  0.0288],
         [ 0.7508, -1.1430, -0.7338,  ...,  1.2043, -0.5933, -1.3617],
         [-1.1058,  0.5725, -1.5255,  ...,  0.3847, -0.9194,  0.5148],
         [-1.6923, -0.5253, -0.3221,  ..., -0.3957,  2.0860,  0.1895],
         [ 0.3731,  3.5894, -0.4419,  ..., -0.5011, -0.9587,  0.9027],
         [-0.9394, -0.4156, -1.6516,  ..., -0.7945, -0.7414,  0.9029]]],
       grad_fn=<EmbeddingBackward0>) torch.Size([1, 6, 768])


In [15]:
dim_k = key.size(-1)
dim_k

768

In [16]:
# attention scores
scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
print(scores, scores.shape)

tensor([[[25.6507,  0.2572, -1.9256, -0.8663, -0.1115,  1.5502],
         [ 0.2572, 27.7087, -0.9734,  0.4901,  0.7850,  0.8002],
         [-1.9256, -0.9734, 29.7933, -0.4361,  0.7908, -0.6917],
         [-0.8663,  0.4901, -0.4361, 27.5050, -1.1419,  0.1749],
         [-0.1115,  0.7850,  0.7908, -1.1419, 27.0164,  0.4325],
         [ 1.5502,  0.8002, -0.6917,  0.1749,  0.4325, 30.0708]]],
       grad_fn=<DivBackward0>) torch.Size([1, 6, 6])


In [17]:
import torch.nn.functional as F

In [18]:
weights = F.softmax(scores, dim=-1)
print(weights, weights.shape)
weights.sum(dim=-1)
print(weights)

tensor([[[1.0000e+00, 9.3693e-12, 1.0562e-12, 3.0463e-12, 6.4805e-12,
          3.4140e-11],
         [1.1966e-12, 1.0000e+00, 3.4957e-13, 1.5105e-12, 2.0287e-12,
          2.0598e-12],
         [1.6774e-14, 4.3469e-14, 1.0000e+00, 7.4389e-14, 2.5372e-13,
          5.7612e-14],
         [4.7695e-13, 1.8517e-12, 7.3334e-13, 1.0000e+00, 3.6207e-13,
          1.3511e-12],
         [1.6539e-12, 4.0539e-12, 4.0772e-12, 5.9021e-13, 1.0000e+00,
          2.8495e-12],
         [4.1084e-13, 1.9408e-13, 4.3654e-14, 1.0385e-13, 1.3436e-13,
          1.0000e+00]]], grad_fn=<SoftmaxBackward0>) torch.Size([1, 6, 6])
tensor([[[1.0000e+00, 9.3693e-12, 1.0562e-12, 3.0463e-12, 6.4805e-12,
          3.4140e-11],
         [1.1966e-12, 1.0000e+00, 3.4957e-13, 1.5105e-12, 2.0287e-12,
          2.0598e-12],
         [1.6774e-14, 4.3469e-14, 1.0000e+00, 7.4389e-14, 2.5372e-13,
          5.7612e-14],
         [4.7695e-13, 1.8517e-12, 7.3334e-13, 1.0000e+00, 3.6207e-13,
          1.3511e-12],
         [1.6539e-

In [19]:
# Multiply the attention weights by the values.
attn_outputs = torch.bmm(weights, value)
print(attn_outputs, attn_outputs.shape)

tensor([[[ 1.3969,  1.2611, -1.1523,  ...,  0.4819,  0.5900,  0.0288],
         [ 0.7508, -1.1430, -0.7338,  ...,  1.2043, -0.5933, -1.3617],
         [-1.1058,  0.5725, -1.5255,  ...,  0.3847, -0.9194,  0.5148],
         [-1.6923, -0.5253, -0.3221,  ..., -0.3957,  2.0860,  0.1895],
         [ 0.3731,  3.5894, -0.4419,  ..., -0.5011, -0.9587,  0.9027],
         [-0.9394, -0.4156, -1.6516,  ..., -0.7945, -0.7414,  0.9029]]],
       grad_fn=<BmmBackward0>) torch.Size([1, 6, 768])


Done!

In [20]:
def scaled_dot_product_attention(query, key, value):
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
    weights = F.softmax(scores, dim=-1)
    return torch.bmm(weights, value)

The self-attention layer applies three independent linear transformations to each embedding to generate the query, key and value vectors. These transformations project the embeddings, each with its own set of learnable parameters, allowing the self-attention layer to focus on different aspects of the sequence.

Why do we need multiple attention heads?
- The softmax of one head tends to focus on mostly one aspect of similarity. Having multiple heads allows the model to focus on several aspects at once.

In [21]:
class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        attn_outputs = scaled_dot_product_attention(
            self.q(hidden_state),
            self.k(hidden_state),
            self.v(hidden_state)
        )
        return attn_outputs

In practice, head_dim is chosen to be a multiple of embed_dim, so that the computation across each head is constant.

In [22]:
class MultiHeadAttention(nn.Module):
    """The full multi-head attention layer is the concatenation of the outputs of each single attention head."""
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        # https://pytorch.org/docs/stable/generated/torch.nn.ModuleList.html
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        # https://pytorch.org/docs/stable/generated/torch.cat.html
        x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
        x = self.output_linear(x)  # [batch_size, seq_len, hidden_dim]
        return x

In [23]:
multihead_attn = MultiHeadAttention(config)
multihead_attn

MultiHeadAttention(
  (heads): ModuleList(
    (0-11): 12 x AttentionHead(
      (q): Linear(in_features=768, out_features=64, bias=True)
      (k): Linear(in_features=768, out_features=64, bias=True)
      (v): Linear(in_features=768, out_features=64, bias=True)
    )
  )
  (output_linear): Linear(in_features=768, out_features=768, bias=True)
)

In [24]:
print(inputs_embeds, inputs_embeds.shape)

tensor([[[ 1.3969,  1.2611, -1.1523,  ...,  0.4819,  0.5900,  0.0288],
         [ 0.7508, -1.1430, -0.7338,  ...,  1.2043, -0.5933, -1.3617],
         [-1.1058,  0.5725, -1.5255,  ...,  0.3847, -0.9194,  0.5148],
         [-1.6923, -0.5253, -0.3221,  ..., -0.3957,  2.0860,  0.1895],
         [ 0.3731,  3.5894, -0.4419,  ..., -0.5011, -0.9587,  0.9027],
         [-0.9394, -0.4156, -1.6516,  ..., -0.7945, -0.7414,  0.9029]]],
       grad_fn=<EmbeddingBackward0>) torch.Size([1, 6, 768])


In [25]:
attn_output = multihead_attn(inputs_embeds)
print(attn_output, attn_output.shape)

tensor([[[-0.1338,  0.1524, -0.0356,  ...,  0.0996,  0.0351,  0.1458],
         [-0.1807,  0.1397,  0.0218,  ...,  0.2087,  0.0627,  0.0523],
         [-0.0435,  0.1799, -0.0861,  ...,  0.0962,  0.1237,  0.0492],
         [-0.1325,  0.1168, -0.0992,  ...,  0.1298,  0.0592,  0.0346],
         [-0.1124,  0.1286, -0.0149,  ...,  0.0169,  0.0377,  0.0984],
         [-0.1514,  0.1089, -0.1218,  ..., -0.0015,  0.1750,  0.0165]]],
       grad_fn=<ViewBackward0>) torch.Size([1, 6, 768])


In [26]:
from bertviz import head_view
from transformers import AutoModel

In [27]:
model = AutoModel.from_pretrained(model_ckpt, cache_dir='./cache')
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [28]:
sentence_a = "time flies like an arrow"
sentence_b = "fruit flies like a banana"

In [29]:
viz_inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt')
viz_inputs

{'input_ids': tensor([[  101,  2051, 10029,  2066,  2019,  8612,   102,  5909, 10029,  2066,
          1037, 15212,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [30]:
# output = model(**viz_inputs)
output = model(**viz_inputs, output_attentions=True)
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.5540,  0.4613, -0.3415,  ..., -0.5262,  0.1079,  0.2478],
         [-0.1491,  0.5334, -0.0309,  ...,  0.1952,  0.8541, -0.4381],
         [ 0.2653,  0.3182,  0.6856,  ..., -0.4000, -0.4291, -0.3609],
         ...,
         [-0.5878,  0.2938,  0.0078,  ...,  0.0288, -0.2259, -0.0198],
         [-0.2056,  0.1479, -0.1057,  ..., -0.3118, -0.2503, -0.9456],
         [ 1.0812,  0.1233, -0.4355,  ..., -0.0316, -0.7760, -0.4457]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.9860, -0.8872, -0.9969,  0.9886,  0.9506, -0.6760,  0.9906,  0.8082,
         -0.9908, -1.0000, -0.8833,  0.9979,  0.9949,  0.9057,  0.9906, -0.9584,
         -0.9100, -0.9055,  0.7985, -0.9519,  0.9562,  1.0000, -0.6571,  0.7525,
          0.9041,  0.9999, -0.9630,  0.9779,  0.9895,  0.8878, -0.9626,  0.8007,
         -0.9971, -0.7160, -0.9970, -0.9995,  0.8925, -0.9110, -0.6191, -0.6000,
         -0.9738,  0.8240,  1.00

In [31]:
attention = output.attentions
attention

(tensor([[[[5.0308e-02, 4.8489e-02, 3.3783e-02,  ..., 1.0662e-01,
            3.4201e-02, 1.5616e-01],
           [2.3186e-01, 5.4006e-02, 1.9014e-01,  ..., 1.2277e-03,
            6.9611e-03, 5.2383e-03],
           [3.9468e-02, 1.4838e-01, 2.1820e-01,  ..., 2.4379e-03,
            1.2240e-02, 4.4097e-03],
           ...,
           [8.4569e-02, 2.6371e-03, 4.6116e-03,  ..., 8.3565e-02,
            2.1223e-01, 1.5234e-01],
           [6.7516e-02, 1.4132e-03, 3.4118e-03,  ..., 3.7773e-02,
            2.0925e-01, 2.3536e-01],
           [5.3773e-02, 1.2619e-03, 1.4584e-03,  ..., 1.5566e-01,
            1.3315e-01, 3.4202e-01]],
 
          [[8.5989e-01, 1.3223e-02, 4.4054e-03,  ..., 1.9977e-02,
            5.8908e-03, 9.8486e-03],
           [5.3689e-03, 5.8076e-02, 3.4999e-01,  ..., 3.4334e-03,
            4.7788e-02, 1.2578e-02],
           [7.3117e-02, 1.1438e-01, 9.3617e-02,  ..., 3.7564e-03,
            2.1223e-02, 2.4680e-02],
           ...,
           [1.5048e-02, 1.2647e-02, 2.

In [32]:
print(viz_inputs.token_type_ids == 0)
sentence_b_start = (viz_inputs.token_type_ids == 0).sum(dim=1)
sentence_b_start

tensor([[ True,  True,  True,  True,  True,  True,  True, False, False, False,
         False, False, False]])


tensor([7])

In [33]:
tokens = tokenizer.convert_ids_to_tokens(viz_inputs.input_ids[0])
print(len(tokens))
tokens

13


['[CLS]',
 'time',
 'flies',
 'like',
 'an',
 'arrow',
 '[SEP]',
 'fruit',
 'flies',
 'like',
 'a',
 'banana',
 '[SEP]']

[source code of head_view()](https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://github.com/jessevig/bertviz/blob/master/bertviz/head_view.py&ved=2ahUKEwiqv_-QhrCFAxXChf0HHRMpC80QFnoECBUQAQ&usg=AOvVaw1dbdguAKCdGzyFJtLYCKIs)

In [34]:
head_view(attention, tokens, sentence_b_start, heads=[8])

<IPython.core.display.Javascript object>

### Feed-Forward Layer
The FFN processes each embedding independently.  
rule of thumb: hidden size of the first layer = 4 * size of embeddings, apply a [GELU activation](https://pytorch.org/docs/stable/generated/torch.nn.GELU.html).

In [35]:
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        x = self.linear_1(x)
        x = self.gelu(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x

In [36]:
feed_forward = FeedForward(config)
feed_forward

FeedForward(
  (linear_1): Linear(in_features=768, out_features=3072, bias=True)
  (linear_2): Linear(in_features=3072, out_features=768, bias=True)
  (gelu): GELU(approximate='none')
  (dropout): Dropout(p=0.1, inplace=False)
)

In [37]:
feed_forward.linear_1.in_features * 4 == feed_forward.linear_1.out_features

True

In [38]:
ff_outputs = feed_forward(attn_output)
print(ff_outputs, ff_outputs.shape)

tensor([[[ 0.0024, -0.0000,  0.0355,  ..., -0.0602,  0.0249, -0.0000],
         [ 0.0049, -0.0317,  0.0263,  ..., -0.0474,  0.0185, -0.0090],
         [ 0.0000, -0.0089,  0.0462,  ..., -0.0000,  0.0259, -0.0174],
         [-0.0088, -0.0139,  0.0000,  ..., -0.0442,  0.0000, -0.0240],
         [ 0.0006, -0.0131,  0.0335,  ..., -0.0000,  0.0143, -0.0228],
         [-0.0039, -0.0283,  0.0358,  ..., -0.0711,  0.0318, -0.0260]]],
       grad_fn=<MulBackward0>) torch.Size([1, 6, 768])


### Adding Layer Normalization
- Layer normalization: Normalize each input in a batch such that they have zero mean and unity variance.
- Skip connections: Pass a tensor to the next layer of the model without processing, and add it to the processed tensor.

Layer normalization:
- Post layer normalization: 
    - Place layer normalization between the skip connections.
    - Gradients can diverge. --> Training is not stable enough.
    - learning rate warm-up: gradually increase learning rate from a small value to some maximum value.
- Pre layer normalization: 
    - Place layer normalization within the span of the skip connections.
    - Training is more stable.

#### Adding pre layer normalization:

In [39]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def forward(self, x):
        # Apply layer normalization and then copy input into query, key, value.
        hidden_state = self.layer_norm_1(x)
        # Apply attention with a skip connection.
        x += self.attention(hidden_state)
        # Apply feed-forward layer with a skip connection.
        x += self.feed_forward(self.layer_norm_2(x))
        return x

In [40]:
encoder_layer = TransformerEncoderLayer(config)
encoder_layer

TransformerEncoderLayer(
  (layer_norm_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (layer_norm_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attention): MultiHeadAttention(
    (heads): ModuleList(
      (0-11): 12 x AttentionHead(
        (q): Linear(in_features=768, out_features=64, bias=True)
        (k): Linear(in_features=768, out_features=64, bias=True)
        (v): Linear(in_features=768, out_features=64, bias=True)
      )
    )
    (output_linear): Linear(in_features=768, out_features=768, bias=True)
  )
  (feed_forward): FeedForward(
    (linear_1): Linear(in_features=768, out_features=3072, bias=True)
    (linear_2): Linear(in_features=3072, out_features=768, bias=True)
    (gelu): GELU(approximate='none')
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [41]:
print(inputs_embeds.shape)
print(encoder_layer(inputs_embeds).shape)

torch.Size([1, 6, 768])
torch.Size([1, 6, 768])


### Positional Embeddings
Idea: Augment the token embeddings with a postion-dependent pattern of values arranged in a vector.  

[torch.arange()](https://pytorch.org/docs/stable/generated/torch.arange.html)

In [42]:
class Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        # The token embedding layer projects the input_ids to a dense hidden state.
        self.token_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        # The positional embedding layer projects the position_ids to a dense hidden state.
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout()

    def forward(self, input_ids):
        # Create position_ids for input sequence.
        seq_len = input_ids.size(1)
        print(f"Sequence length: {seq_len}")
        position_ids = torch.arange(start=0, end=seq_len, step=1, dtype=torch.long).unsqueeze(0)

        # Create token embeddings and positional embeddings.
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)

        # Combine token embeddings and positional embeddings.
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

In [43]:
embedding_layer = Embeddings(config)
embedding_layer

Embeddings(
  (token_embeddings): Embedding(30522, 768)
  (position_embeddings): Embedding(512, 768)
  (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [44]:
embedding_layer(inputs.input_ids)

Sequence length: 6


tensor([[[ 0.0000,  0.0000,  1.0609,  ..., -0.0000, -0.7060, -0.0000],
         [ 0.5761, -0.0000, -0.0000,  ..., -0.0000,  0.0000, -2.5691],
         [ 0.0000, -2.2611,  0.3830,  ..., -0.0000,  0.0000,  3.2704],
         [ 0.0000, -0.2408, -0.0000,  ...,  0.0000, -1.0626,  3.4184],
         [-0.0000,  0.0000, -0.0000,  ..., -0.0000,  0.0000,  0.0000],
         [-2.0633,  0.0000, -0.0000,  ..., -0.0000, -0.0000, -0.0000]]],
       grad_fn=<MulBackward0>)

Other positional representations:
- Absolute positional representations
    - Static patterns, e.g. modulated sine and cosine
    - Works well on small datasets
- Relative positional representations
    - Intuition: Surrounding tokens are most important.
    - Dynamic patterns: changes for each token, depending on the position of the sequence from which the token is attened to.

In [45]:
class TransformerEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.layers = nn.ModuleList(
            [TransformerEncoderLayer(config) for _ in range(config.num_hidden_layers)]
        )

    def forward(self, x):
        x = self.embeddings(x)
        for layer in self.layers:
            x = layer(x)
        return x

In [46]:
encoder = TransformerEncoder(config)
encoder

TransformerEncoder(
  (embeddings): Embeddings(
    (token_embeddings): Embedding(30522, 768)
    (position_embeddings): Embedding(512, 768)
    (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (layers): ModuleList(
    (0-11): 12 x TransformerEncoderLayer(
      (layer_norm_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layer_norm_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attention): MultiHeadAttention(
        (heads): ModuleList(
          (0-11): 12 x AttentionHead(
            (q): Linear(in_features=768, out_features=64, bias=True)
            (k): Linear(in_features=768, out_features=64, bias=True)
            (v): Linear(in_features=768, out_features=64, bias=True)
          )
        )
        (output_linear): Linear(in_features=768, out_features=768, bias=True)
      )
      (feed_forward): FeedForward(
        (linear_1): Linear(in_features=768, out_features=3072, b

In [47]:
print(encoder(inputs.input_ids), encoder(inputs.input_ids).shape)

Sequence length: 6
Sequence length: 6
tensor([[[ 2.3911,  0.9190, -2.9242,  ..., -2.4835, -3.6301,  2.6935],
         [-1.2722, -1.8229, -1.6285,  ...,  0.6255, -1.0470,  1.1359],
         [ 0.9295,  0.0059, -0.5548,  ..., -0.8678, -0.0946,  1.1840],
         [ 0.0103, -0.3058, -2.5968,  ...,  1.2101,  1.6774,  1.2296],
         [-0.1825, -0.0490, -1.4526,  ...,  4.3759, -1.7355,  3.1178],
         [ 1.3689, -3.1072, -2.2317,  ...,  1.2978, -0.8875,  1.1900]]],
       grad_fn=<AddBackward0>) torch.Size([1, 6, 768])


### Adding a Classification Head

Transformer $\rightarrow$ a task-independent body + a task-specific head

In [48]:
class TransformerForSequenceClassification(nn.Module):
    def __init__(self, config):
        super().__init__()
        # This is the body.
        self.encoder = TransformerEncoder(config)
        # Create a dropout.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Create a classification head.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, x):
        # Select the hidden state of the [CLS] token.
        x = self.encoder(x)[:, 0, :]
        x = self.dropout(x)
        x = self.classifier(x)
        return x

In [49]:
# Define the number of classes for our specific task.
print(config.num_labels)
config.num_labels = 3
print(config.num_labels)

2
3


In [50]:
encoder_classifier = TransformerForSequenceClassification(config)
encoder_classifier

TransformerForSequenceClassification(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (token_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.5, inplace=False)
    )
    (layers): ModuleList(
      (0-11): 12 x TransformerEncoderLayer(
        (layer_norm_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (layer_norm_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attention): MultiHeadAttention(
          (heads): ModuleList(
            (0-11): 12 x AttentionHead(
              (q): Linear(in_features=768, out_features=64, bias=True)
              (k): Linear(in_features=768, out_features=64, bias=True)
              (v): Linear(in_features=768, out_features=64, bias=True)
            )
          )
          (output_linear): Linear(in_features=768, out_features=768, bias=True)
        )
        

In [51]:
encoder_classifier(inputs.input_ids), encoder_classifier(inputs.input_ids).shape

Sequence length: 6
Sequence length: 6


(tensor([[ 0.5958, -0.0456, -0.2554]], grad_fn=<AddmmBackward0>),
 torch.Size([1, 3]))

In [52]:
inputs.input_ids.shape

torch.Size([1, 6])

For each example in the batch, the classifier returns the unnormalized logits for each candidate class.

### The Decoder
two attention sublayers:
- Masked multi-head self-attention
- Encoder-decoder attention

In [53]:
seq_len = inputs.input_ids.size(1)
print(seq_len)

6


A trick to create masked self-attention: Mask Matrix.  
Mask matrix: a lower triangular matrix with ones on the lower diagonal and zeros above.  

[torch.tril()](https://pytorch.org/docs/stable/generated/torch.tril.html)

In [54]:
mask = torch.tril(torch.ones(seq_len, seq_len))
print(mask, mask.shape, '\n')
mask = mask.unsqueeze(0)
print(mask, mask.shape)

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]]) torch.Size([6, 6]) 

tensor([[[1., 0., 0., 0., 0., 0.],
         [1., 1., 0., 0., 0., 0.],
         [1., 1., 1., 0., 0., 0.],
         [1., 1., 1., 1., 0., 0.],
         [1., 1., 1., 1., 1., 0.],
         [1., 1., 1., 1., 1., 1.]]]) torch.Size([1, 6, 6])


[masked_fill()](https://pytorch.org/docs/stable/generated/torch.Tensor.masked_fill_.html#torch.Tensor.masked_fill_)

In [55]:
print(scores, '\n')
scores_masked = scores.masked_fill(mask == 0, float('-inf'))
print(scores_masked)

tensor([[[25.6507,  0.2572, -1.9256, -0.8663, -0.1115,  1.5502],
         [ 0.2572, 27.7087, -0.9734,  0.4901,  0.7850,  0.8002],
         [-1.9256, -0.9734, 29.7933, -0.4361,  0.7908, -0.6917],
         [-0.8663,  0.4901, -0.4361, 27.5050, -1.1419,  0.1749],
         [-0.1115,  0.7850,  0.7908, -1.1419, 27.0164,  0.4325],
         [ 1.5502,  0.8002, -0.6917,  0.1749,  0.4325, 30.0708]]],
       grad_fn=<DivBackward0>) 

tensor([[[25.6507,    -inf,    -inf,    -inf,    -inf,    -inf],
         [ 0.2572, 27.7087,    -inf,    -inf,    -inf,    -inf],
         [-1.9256, -0.9734, 29.7933,    -inf,    -inf,    -inf],
         [-0.8663,  0.4901, -0.4361, 27.5050,    -inf,    -inf],
         [-0.1115,  0.7850,  0.7908, -1.1419, 27.0164,    -inf],
         [ 1.5502,  0.8002, -0.6917,  0.1749,  0.4325, 30.0708]]],
       grad_fn=<MaskedFillBackward0>)


The negative infinity values guarantee that the attention weights are all zero once the softmax is applied to the scores.

In [56]:
def scaled_dot_product_attention(query, key, value, mask=None):
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float('-inf'))
    weights = F.softmax(scores, dim=-1)
    attn_outputs = torch.bmm(weights, value)
    return attn_outputs