In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoConfig, AutoModel

In [3]:
"""
Pure Transformer Masking Experiment
Huggingface AutoTokenizer는 attention mask를 이용해 패딩값에 마스킹을 적용
"""

text = 'I am very hungry now. But I can not eat anymore. I am on diet.'
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
tokenizer.encode_plus(
    text,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_tensors=None,
    add_special_tokens=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'input_ids': [1, 273, 481, 379, 7092, 394, 260, 420, 273, 295, 298, 1672, 3731, 260, 273, 481, 277, 2609, 260, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [57]:
"""
Encoder Padding Token Maksing Method
1) 처음부터 row, col 함께 마스킹
2) col만 마스킹 하고 나중에 loss 계산할 때 row 마스킹
"""

"""Pytorch nn.Tensor.masked_fill_() Test """
x = torch.Tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
mask = torch.Tensor([[1, 2, 3], [4, 5, 6], [0, 0, 0]])

x.masked_fill(mask == 0, float('-inf'))

tensor([[1., 2., 3.],
        [4., 5., 6.],
        [-inf, -inf, -inf]])

In [64]:
pad_index = 1
src = torch.tensor([[4,6,5,1,1,1],[7,7,1,1,1,1]])  # 이게 문장 두개가 되는거구나
src = (src == pad_index)  # [batch, sequence_length]
print(src)
src.repeat(1, 6).view(2, 6, 6)  # [batch, sequence_length, sequence_length]

tensor([[False, False, False,  True,  True,  True],
        [False, False,  True,  True,  True,  True]])


tensor([[[False, False, False,  True,  True,  True],
         [False, False, False,  True,  True,  True],
         [False, False, False,  True,  True,  True],
         [False, False, False,  True,  True,  True],
         [False, False, False,  True,  True,  True],
         [False, False, False,  True,  True,  True]],

        [[False, False,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True]]])

In [89]:
(src != pad_index).int()

tensor([[1, 1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0, 0]], dtype=torch.int32)

In [62]:
""" 행과 열에 존재 하는 마스킹 값을 동시에 처리해 하나의 seqxseq 마스킹 행렬로 만드는게 쉽지가 않다. """

src = src.int()
row, col = src[1], src[1].view(-1, 1)
row_matrix = row.repeat(1, 6).view(6, 6)
col_matrix = col.repeat(1, 6).view(6, 6)
row_matrix | col_matrix

tensor([[0, 0, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1]], dtype=torch.int32)

In [14]:
(src != pad_index).int().repeat(1, 4).view(2, 4, 6).shape

torch.Size([2, 4, 6])

In [9]:
pad_index = 1
src = torch.tensor([[4,6,5,1,1,1],[7,7,1,1,1,1]])  # 이게 문장 두개가 되는거구나
src.shape

torch.Size([2, 6])

In [3]:
enc_x = (src != pad_index).int().repeat(1, src.shape[-1]).view(src.shape[0], src.shape[-1], src.shape[-1])
enc_x

tensor([[[1, 1, 1, 0, 0, 0],
         [1, 1, 1, 0, 0, 0],
         [1, 1, 1, 0, 0, 0],
         [1, 1, 1, 0, 0, 0],
         [1, 1, 1, 0, 0, 0],
         [1, 1, 1, 0, 0, 0]],

        [[1, 1, 0, 0, 0, 0],
         [1, 1, 0, 0, 0, 0],
         [1, 1, 0, 0, 0, 0],
         [1, 1, 0, 0, 0, 0],
         [1, 1, 0, 0, 0, 0],
         [1, 1, 0, 0, 0, 0]]], dtype=torch.int32)

In [4]:
q = torch.randn(2, 6, 64)
k = torch.randn(2, 6, 64)
attention_matrix = torch.matmul(q, k.transpose(-1, -2))
attention_matrix.masked_fill(enc_x == 0, float('-inf'))

tensor([[[  1.3864,   2.4091, -11.7833,     -inf,     -inf,     -inf],
         [ -8.7445,   5.7077,   0.4128,     -inf,     -inf,     -inf],
         [ 12.8448,  -4.3385,  14.7385,     -inf,     -inf,     -inf],
         [  5.9451,  -4.9951,   9.4302,     -inf,     -inf,     -inf],
         [ -7.9134,  -6.1186, -10.4371,     -inf,     -inf,     -inf],
         [ -4.3155,  -6.2908,  -0.2527,     -inf,     -inf,     -inf]],

        [[ 14.3429,   6.3632,     -inf,     -inf,     -inf,     -inf],
         [-13.8716,  -4.7118,     -inf,     -inf,     -inf,     -inf],
         [  2.7359,   9.3584,     -inf,     -inf,     -inf,     -inf],
         [-15.9456,  -5.7949,     -inf,     -inf,     -inf,     -inf],
         [  6.2284,  -6.7821,     -inf,     -inf,     -inf,     -inf],
         [-13.9051,   0.3745,     -inf,     -inf,     -inf,     -inf]]])

In [5]:
""" Testing for Decoder LM Mask """
pad_index = 1
trg = torch.tensor([[4,6,5,1],[7,7,1,1]])  # 이게 문장 두개가 되는거구나
pad_mask = (trg != pad_index).int().repeat(1, trg.shape[-1]).view(trg.shape[0], trg.shape[-1], trg.shape[-1])
lm_mask = torch.tril(torch.ones(trg.shape[0], trg.shape[-1], trg.shape[-1]))
pad_mask, pad_mask.shape, lm_mask, lm_mask.shape

(tensor([[[1, 1, 1, 0],
          [1, 1, 1, 0],
          [1, 1, 1, 0],
          [1, 1, 1, 0]],
 
         [[1, 1, 0, 0],
          [1, 1, 0, 0],
          [1, 1, 0, 0],
          [1, 1, 0, 0]]], dtype=torch.int32),
 torch.Size([2, 4, 4]),
 tensor([[[1., 0., 0., 0.],
          [1., 1., 0., 0.],
          [1., 1., 1., 0.],
          [1., 1., 1., 1.]],
 
         [[1., 0., 0., 0.],
          [1., 1., 0., 0.],
          [1., 1., 1., 0.],
          [1., 1., 1., 1.]]]),
 torch.Size([2, 4, 4]))

In [6]:
""" Testing for Decoder Mask """
dec_mask = pad_mask * lm_mask
dec_mask

tensor([[[1., 0., 0., 0.],
         [1., 1., 0., 0.],
         [1., 1., 1., 0.],
         [1., 1., 1., 0.]],

        [[1., 0., 0., 0.],
         [1., 1., 0., 0.],
         [1., 1., 0., 0.],
         [1., 1., 0., 0.]]])

In [8]:
""" Testing for Encoder-Decoder Mask """
enc_x.shape, dec_mask.shape

(torch.Size([2, 6, 6]), torch.Size([2, 4, 4]))

In [None]:
"""
Testing for Encoder-Decoder Mask
Make Rectangle Masking Matrix
"""
enc_dec_mask =


In [3]:
class Projector(nn.Module):
    """
    Making projection matrix(Q, K, V) for each attention head
    When you call this class, it returns projection matrix of each attention head
    For example, if you call this class with 8 heads, it returns 8 set of projection matrices (Q, K, V)
    Args:
        num_heads: number of heads in MHA, default 8
        dim_head: dimension of each attention head, default 64
    """
    def __init__(self, num_heads: int = 8, dim_head: int = 64) -> None:
        super(Projector, self).__init__()
        self.dim_model = num_heads * dim_head
        self.num_heads = num_heads
        self.dim_head = dim_head

    def __call__(self):
        fc_q = nn.Linear(self.dim_model, self.dim_head)
        fc_k = nn.Linear(self.dim_model, self.dim_head)
        fc_v = nn.Linear(self.dim_model, self.dim_head)
        return fc_q, fc_k, fc_v


class MultiHeadAttention(nn.Module):
    """
    Class for multi-head attention (MHA) module in vanilla transformer
    We apply linear transformation to input vector by each attention head's projection matrix (8, 512, 64)
    Other approaches are possible, such as using one projection matrix for all attention heads (1, 512, 512)
    and then split into each attention heads (8. 512, 64)
    Args:
        dim_model: dimension of model's latent vector space, default 512 from official paper
        num_heads: number of heads in MHA, default 8 from official paper
        dropout: dropout rate, default 0.1
    Math:
        MHA(Q, K, V) = Concat(Head1, Head2, ... Head8) * W_concat
    Reference:
        https://arxiv.org/abs/1706.03762
    """
    def __init__(self, dim_model: int = 512, num_heads: int = 8, dropout: float = 0.1) -> None:
        super(MultiHeadAttention, self).__init__()
        self.dim = dim_model
        self.num_heads = num_heads
        self.dropout = dropout
        self.dim_head = int(self.dim / self.num_heads)  # dimension of each attention head
        self.dot_scale = torch.sqrt(torch.tensor(self.dim_head))  # scale factor for Q•K^T Result

        # linear combination: projection matrix(Q_1, K_1, V_1, ... Q_n, K_n, V_n) for each attention head
        self.projector = Projector(self.num_heads, self.dim_head)  # init instance
        self.projector_list = [list(self.projector()) for _ in range(self.num_heads)]  # call instance
        self.fc_concat = nn.Linear(self.dim, self.dim)  # for concatenation of each attention head

    def forward(self, x: torch.Tensor, mask: bool = None) -> torch.Tensor:
        """
        1) make Q, K, V matrix for each attention head: [BS, HEAD, SEQ_LEN, DIM_HEAD], ex) [10, 8, 512, 64]
        2) Do self-attention in each attention head
            - Matmul (Q, K^T) with scale factor (sqrt(DIM_HEAD))
            - Mask for padding token (Option for Decoder)
            - Softmax
            - Matmul (Softmax, V)
        3) Concatenate each attention head & linear transformation (512, 512)
        """
        # 1) make Q, K, V matrix for each attention head
        Q, K, V = [], [], []

        for i in range(self.num_heads):
            Q.append(self.projector_list[i][0](x))
            K.append(self.projector_list[i][1](x))
            V.append(self.projector_list[i][2](x))

        Q = torch.stack(Q, dim=1)
        K = torch.stack(K, dim=1)
        V = torch.stack(V, dim=1)
        # 2) Do self-attention in each attention head
        attention_score = torch.matmul(Q, K.transpose(-1, -2)) / self.dot_scale
        if mask is not None:  # for padding token
            attention_score[mask] = float('-inf')
        attention_dist = F.softmax(attention_score, dim=-1)  # [BS, HEAD, SEQ_LEN, SEQ_LEN]
        attention_matrix = torch.matmul(attention_dist, V).transpose(1, 2).reshape(x.shape[0], x.shape[1], self.dim)  # [BS, SEQ_LEN, DIM]

        # 3) Concatenate each attention head & linear transformation (512, 512)
        x = self.fc_concat(attention_matrix)
        return x

In [None]:
""" Debug for MultiHeadAttention """

x = torch.randn(10, 512, 512)
test_head = MultiHeadAttention()
test_result = test_head(x)
test_result, test_result.shape

In [47]:
""" torch.reshape test for making input shape in Vision Transformers """
patch_size, num_patches = 16, 32
x = torch.randn(10, 3, 512, 512)
x = x.reshape(x.shape[0], num_patches**2, patch_size**2 * x.shape[1])
x.shape

torch.Size([10, 1024, 768])

In [48]:
""" Check Input Embedding shape """
input_embedding = nn.Linear(768, 1024)
x = input_embedding(x)
x.shape

torch.Size([10, 1024, 1024])

In [51]:
""" make classification token for Vision Transformers """
cls_token = torch.zeros(x.shape[0], 1, x.shape[2])  # can change init method
cls_token.shape

torch.Size([10, 1, 1024])

In [52]:
torch.cat([cls_token, x], dim=1).shape

torch.Size([10, 1025, 1024])

In [11]:
""" Test for Hybrid Model """
x = torch.randn(10, 3, 512, 512)
dim_model = 512
patch_size = 16
num_patches = 32
conv = nn.Conv2d(
            in_channels=3,
            out_channels=dim_model,
            kernel_size=patch_size,
            stride=16
)
x = conv(x).reshape(x.shape[0], dim_model, num_patches**2).transpose(-1, -2)
x.shape

torch.Size([10, 1024, 512])

In [4]:
32*32

1024

In [61]:
""" Test for DeBERTa Disentangled Self-Attention """
batch, sequence, dim_model, dim_head, k = 10, 1024, 2048, 64, 512
position_embedding = nn.Embedding(2*k, dim_model)
x = torch.randn(sequence, dim_model)  # [Batch, Sequence, Dim]
p_x = position_embedding(torch.arange(2*k))


fc_q = nn.Linear(dim_model, dim_head)
fc_k = nn.Linear(dim_model, dim_head)
fc_v = nn.Linear(dim_model, dim_head)
fc_qr = nn.Linear(dim_model, dim_head)  # projector for Relative Position Query matrix
fc_kr = nn.Linear(dim_model, dim_head)  # projector for Relative Position Key matrix

q = fc_q(x)
kr = fc_kr(p_x)

# c2p attention matrix
tmp_c2p= torch.stack(
    [torch.matmul(q[i, :], kr.transpose(-1, -2)) for i in range(x.shape[0])],
    dim=0
)
tmp_c2p, tmp_c2p.shape

(tensor([[-4.4441, -2.0285,  3.8037,  ..., -2.1064,  0.4153, -1.5477],
         [ 0.5457, -2.0689,  2.4735,  ...,  2.0869, -0.8964, -1.1958],
         [-1.5544, -1.4172,  0.4664,  ...,  1.3028,  0.0901,  1.3808],
         ...,
         [ 2.2167, -0.3013, -2.5602,  ..., -3.3431, -5.7226, -0.5048],
         [-1.1391,  1.3073,  4.0575,  ...,  2.8553, -1.3381, -3.5350],
         [ 0.8327,  3.4481,  7.1666,  ...,  0.8348,  8.0450,  1.2452]],
        grad_fn=<StackBackward0>),
 torch.Size([1024, 1024]))

i번째 토큰의 latent vector space 1024 차원에서 max relative position 값인 k만 뽑아 내는게 목적
그리고 빼는 것도 max sequence length 만큼 상대 위치 임베딩 토큰을 구하는거구나.

In [62]:
""" tmp_c2p matrix calculation """
tmp_c2p = torch.matmul(q, kr.transpose(-1, -2))
tmp_c2p, tmp_c2p.shape

(tensor([[-4.4441, -2.0285,  3.8037,  ..., -2.1064,  0.4153, -1.5477],
         [ 0.5457, -2.0689,  2.4735,  ...,  2.0869, -0.8964, -1.1958],
         [-1.5544, -1.4172,  0.4664,  ...,  1.3028,  0.0901,  1.3808],
         ...,
         [ 2.2167, -0.3013, -2.5602,  ..., -3.3431, -5.7226, -0.5048],
         [-1.1391,  1.3073,  4.0575,  ...,  2.8553, -1.3381, -3.5350],
         [ 0.8327,  3.4481,  7.1666,  ...,  0.8348,  8.0450,  1.2452]],
        grad_fn=<MmBackward0>),
 torch.Size([1024, 1024]))

In [63]:
""" Build Subtraction of token index in c2p matrix """
max_seq, max_relative_position = 1024, 512
q_index, k_index = torch.arange(max_seq), torch.arange(2*max_relative_position)
q_index, k_index

(tensor([   0,    1,    2,  ..., 1021, 1022, 1023]),
 tensor([   0,    1,    2,  ..., 1021, 1022, 1023]))

In [64]:
q_index.view(-1, 1), k_index.view(1, -1)  # like as transpose & vstack

(tensor([[   0],
         [   1],
         [   2],
         ...,
         [1021],
         [1022],
         [1023]]),
 tensor([[   0,    1,    2,  ..., 1021, 1022, 1023]]))

In [65]:
""" 지금 이건 i - j 값이 어느 범위에 속하는지 알아내는게 목적이니까 """
tmp_pos = q_index.view(-1, 1) - k_index.view(1, -1)
rel_pos_matrix = tmp_pos + max_relative_position
rel_pos_matrix

tensor([[ 512,  511,  510,  ..., -509, -510, -511],
        [ 513,  512,  511,  ..., -508, -509, -510],
        [ 514,  513,  512,  ..., -507, -508, -509],
        ...,
        [1533, 1532, 1531,  ...,  512,  511,  510],
        [1534, 1533, 1532,  ...,  513,  512,  511],
        [1535, 1534, 1533,  ...,  514,  513,  512]])

In [66]:
rel_pos_matrix = torch.clamp(rel_pos_matrix, 0, 2*max_relative_position - 1).repeat(10, 1, 1)
tmp_c2p = tmp_c2p.repeat(10, 1, 1)
rel_pos_matrix, rel_pos_matrix.shape, tmp_c2p.shape

(tensor([[[ 512,  511,  510,  ...,    0,    0,    0],
          [ 513,  512,  511,  ...,    0,    0,    0],
          [ 514,  513,  512,  ...,    0,    0,    0],
          ...,
          [1023, 1023, 1023,  ...,  512,  511,  510],
          [1023, 1023, 1023,  ...,  513,  512,  511],
          [1023, 1023, 1023,  ...,  514,  513,  512]],
 
         [[ 512,  511,  510,  ...,    0,    0,    0],
          [ 513,  512,  511,  ...,    0,    0,    0],
          [ 514,  513,  512,  ...,    0,    0,    0],
          ...,
          [1023, 1023, 1023,  ...,  512,  511,  510],
          [1023, 1023, 1023,  ...,  513,  512,  511],
          [1023, 1023, 1023,  ...,  514,  513,  512]],
 
         [[ 512,  511,  510,  ...,    0,    0,    0],
          [ 513,  512,  511,  ...,    0,    0,    0],
          [ 514,  513,  512,  ...,    0,    0,    0],
          ...,
          [1023, 1023, 1023,  ...,  512,  511,  510],
          [1023, 1023, 1023,  ...,  513,  512,  511],
          [1023, 1023, 1023,  .

In [74]:
"""
torch.gather 정리 필요 => dim은 내가 인덱싱을 적용하고 싶은 차원을 지정하는 것
인덱스 매개변수에 전달하는 행렬 안에 텐서 원소의 인덱스를 의미하는 숫자들이 마구 있는데,
저 숫자를 어느 차원에 적용할 것인가 그걸 지정 해주는 것
"""

torch.gather(tmp_c2p, dim=-2, index=rel_pos_matrix)

tensor([[[ 1.7927,  2.9470,  2.6425,  ..., -2.1064,  0.4153, -1.5477],
         [-0.2741,  0.8960,  4.7799,  ..., -2.1064,  0.4153, -1.5477],
         [-0.4359,  1.8331, -2.4628,  ..., -2.1064,  0.4153, -1.5477],
         ...,
         [ 0.8327,  3.4481,  7.1666,  ...,  1.8588, -0.0798,  3.0647],
         [ 0.8327,  3.4481,  7.1666,  ..., -0.8953,  3.0796,  1.9657],
         [ 0.8327,  3.4481,  7.1666,  ...,  0.1886,  0.8063, -0.7158]],

        [[ 1.7927,  2.9470,  2.6425,  ..., -2.1064,  0.4153, -1.5477],
         [-0.2741,  0.8960,  4.7799,  ..., -2.1064,  0.4153, -1.5477],
         [-0.4359,  1.8331, -2.4628,  ..., -2.1064,  0.4153, -1.5477],
         ...,
         [ 0.8327,  3.4481,  7.1666,  ...,  1.8588, -0.0798,  3.0647],
         [ 0.8327,  3.4481,  7.1666,  ..., -0.8953,  3.0796,  1.9657],
         [ 0.8327,  3.4481,  7.1666,  ...,  0.1886,  0.8063, -0.7158]],

        [[ 1.7927,  2.9470,  2.6425,  ..., -2.1064,  0.4153, -1.5477],
         [-0.2741,  0.8960,  4.7799,  ..., -2

In [73]:
""" 검증 완료 """
tmp_c2p[0][0][512]

tensor(0.9275, grad_fn=<SelectBackward0>)

In [69]:
tmp_c2p

tensor([[[-4.4441, -2.0285,  3.8037,  ..., -2.1064,  0.4153, -1.5477],
         [ 0.5457, -2.0689,  2.4735,  ...,  2.0869, -0.8964, -1.1958],
         [-1.5544, -1.4172,  0.4664,  ...,  1.3028,  0.0901,  1.3808],
         ...,
         [ 2.2167, -0.3013, -2.5602,  ..., -3.3431, -5.7226, -0.5048],
         [-1.1391,  1.3073,  4.0575,  ...,  2.8553, -1.3381, -3.5350],
         [ 0.8327,  3.4481,  7.1666,  ...,  0.8348,  8.0450,  1.2452]],

        [[-4.4441, -2.0285,  3.8037,  ..., -2.1064,  0.4153, -1.5477],
         [ 0.5457, -2.0689,  2.4735,  ...,  2.0869, -0.8964, -1.1958],
         [-1.5544, -1.4172,  0.4664,  ...,  1.3028,  0.0901,  1.3808],
         ...,
         [ 2.2167, -0.3013, -2.5602,  ..., -3.3431, -5.7226, -0.5048],
         [-1.1391,  1.3073,  4.0575,  ...,  2.8553, -1.3381, -3.5350],
         [ 0.8327,  3.4481,  7.1666,  ...,  0.8348,  8.0450,  1.2452]],

        [[-4.4441, -2.0285,  3.8037,  ..., -2.1064,  0.4153, -1.5477],
         [ 0.5457, -2.0689,  2.4735,  ...,  2

In [78]:
""" Test for p2c in disentangled self-attention """

if tmp_c2p is not None:
    print('t')

t
