In [4]:
import numpy as np
import math
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F

### Scaled Dot-Product Attention
$$\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$


In [34]:
# Toy example: the dot product of two random 10-dimensional vectors.
Q = torch.randn(10)
K = torch.randn(10)

# The raw dot product grows in magnitude with the number of dimensions.
print(torch.dot(Q, K))
# Large scores push softmax into its flat region, where the gradient is close
# to 0 (saturation). Dividing by sqrt(d_k) keeps the score variance around 1.
print(torch.dot(Q, K) / Q.size(0) ** 0.5)

tensor(0.4887)
tensor(0.1545)


In [35]:
# Extension to matrices.
# (L, d): a sentence of L tokens, each token represented by a d-dimensional vector.
L = 10
d = 3
Q_matrix = torch.randn(L, d)
K_matrix = torch.randn(L, d)
V_matrix = torch.randn(L, d)

# (L, d) @ (d, L) -> (L, L): one raw score per (query token, key token) pair.
scores = Q_matrix @ K_matrix.T
print(scores.shape)

# Scale by sqrt(d) before softmax so the scores do not grow with d.
scaled_scores = scores / (d**0.5)

# Softmax over the last axis: each query's weights over all keys sum to 1.
attention_weights = F.softmax(scaled_scores, dim=-1)

# (L, L) @ (L, d) -> (L, d): weighted average of the value vectors.
output = attention_weights @ V_matrix
output.shape

torch.Size([10, 10])


torch.Size([10, 3])

In [37]:
# Batched attention: the same computation with a leading batch axis B.
B, L, d = 2, 10, 3

# Drawn in the order Q, K, V, so a seeded run produces the same tensors as
# three separate torch.randn calls would.
Q, K, V = (torch.randn(B, L, d) for _ in range(3))

# (B, L, d) @ (B, d, L) -> (B, L, L); only the last two axes of K are swapped.
attention_scores = torch.matmul(Q, K.transpose(-2, -1))

# Scale by sqrt(d), then normalise over the key axis.
attention_weights = torch.softmax(attention_scores / d ** 0.5, dim=-1)

# (B, L, L) @ (B, L, d) -> (B, L, d)
output = torch.matmul(attention_weights, V)

print(f"입력 형태: {Q.shape}")  # torch.Size([2, 10, 3])
print(f"가중치 형태: {attention_weights.shape}")  # torch.Size([2, 10, 10])
print(f"출력 형태: {output.shape}")  # torch.Size([2, 10, 3])

입력 형태: torch.Size([2, 10, 3])
가중치 형태: torch.Size([2, 10, 10])
출력 형태: torch.Size([2, 10, 3])


In [43]:
# Linear projections: map d_model-dimensional inputs to d_k-dimensional
# queries, keys and values using three independent weight matrices.
B, L, d_model = 4, 10, 512
d_k = 64

X = torch.randn(B, L, d_model)

# Created in the order W_q, W_k, W_v; each is a leaf tensor that tracks gradients.
W_q, W_k, W_v = (
    torch.randn(d_model, d_k, requires_grad=True) for _ in range(3)
)

# (B, L, d_model) @ (d_model, d_k) -> (B, L, d_k)
Q = torch.matmul(X, W_q)
K = torch.matmul(X, W_k)
V = torch.matmul(X, W_v)

### Encoder
- Encoder Layer
    - Multi-Head Attention
    - PositionwiseFeedForward
    - LayerNorm

In [45]:
import torch

In [None]:
class EncoderLayer:
    """One Transformer encoder layer: self-attention, then a feed-forward block.

    Each sub-layer is wrapped as ``norm(x + dropout(sublayer(x)))``, i.e. the
    residual connection is added first and layer normalisation is applied last.

    This is a plain Python class (not an ``nn.Module``), so it provides its own
    ``__call__``, ``train`` and ``eval``.

    Args:
        d_model: Size of the last dimension of the input and output.
        heads: Number of attention heads passed to ``MultiHeadAttention``.
        d_ff: Hidden size passed to the feed-forward block.
        dropout: Dropout probability used on both sub-layer outputs.
    """

    def __init__(
        self,
        d_model: int,
        heads: int,
        d_ff: int,
        dropout: float = 0.1
    ):
        # Multi-Head Attention
        self.self_attn = MultiHeadAttention(d_model, heads)

        # Feed Forward
        # NOTE(review): `PositionalwiseFeedForward` is not defined in the visible
        # part of the notebook -- confirm the class name where it is defined.
        self.feed_forward = PositionalwiseFeedForward(d_model, d_ff, dropout)

        # Layer Norm -- qualified with `nn.` because only `torch.nn as nn` is
        # imported; the bare names `LayerNorm` / `Dropout` are not in scope.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout (shared by both residual branches)
        self.dropout = nn.Dropout(dropout)

    def __call__(
        self,
        x: torch.Tensor,
        mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Allow ``layer(x, mask)`` in the same way the sub-layers are called."""
        return self.forward(x, mask)

    def forward(
        self,
        x: torch.Tensor,
        mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Run the layer.

        Args:
            x: Input tensor whose last dimension is ``d_model``.
            mask: Optional mask forwarded unchanged to the attention sub-layer.

        Returns:
            A tensor with the same shape as ``x``.
        """
        # 1. Self-attention: query, key and value are all the same input.
        attn_output = self.self_attn(x, x, x, mask)
        # Add & Norm
        x = self.norm1(x + self.dropout(attn_output))

        # 2. Position-wise feed-forward sub-layer.
        ffn_output = self.feed_forward(x)
        # Add & Norm
        x = self.norm2(x + self.dropout(ffn_output))

        return x

    def train(self) -> None:
        """Switch the layer and its components to training mode (dropout on)."""
        self._set_mode(training=True)

    def eval(self) -> None:
        """Switch the layer and its components to evaluation mode (dropout off)."""
        self._set_mode(training=False)

    def _set_mode(self, training: bool) -> None:
        """Call ``train()`` or ``eval()`` on every component that provides it."""
        method_name = "train" if training else "eval"
        components = (
            self.self_attn,
            self.feed_forward,
            self.norm1,
            self.norm2,
            self.dropout,
        )
        for component in components:
            switch = getattr(component, method_name, None)
            if callable(switch):
                switch()


### 초기화 공부 필요!

In [None]:
class MultiHeadAttention:
    """Multi-head scaled dot-product attention built from raw tensors.

    Holds four ``(d_model, d_model)`` matrices: query, key and value
    projections plus an output projection. The projected tensors are split
    into ``heads`` slices of width ``d_model // heads``; attention is computed
    per head and the results are concatenated and projected again.

    Args:
        d_model: Size of the last dimension of the inputs and of the output.
        heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``heads``.
    """

    def __init__(self, d_model: int, heads: int):
        if d_model % heads != 0:
            raise ValueError("d_model must be divisible by heads.")

        self.d_model = d_model
        self.heads = heads
        self.d_head = d_model // heads
        self.training = True

        def scaled_weight() -> torch.Tensor:
            # Scale first, then enable gradients. Dividing a tensor that already
            # has requires_grad=True yields a non-leaf result whose `.grad`
            # is never populated by backward().
            weight = torch.randn(d_model, d_model) / math.sqrt(d_model)
            return weight.requires_grad_(True)

        # Weights (created in the order q, k, v, o)
        self.w_q = scaled_weight()
        self.w_k = scaled_weight()
        self.w_v = scaled_weight()

        self.w_o = scaled_weight()

    def __call__(self, query, key, value, mask=None):
        """Forward the call, including the optional mask, to ``forward``."""
        return self.forward(query, key, value, mask)

    def forward(self, query, key, value, mask=None):
        """Compute multi-head attention.

        Args:
            query, key, value: Tensors of shape ``[B, L, d_model]``.
            mask: Optional tensor broadcastable to ``[B, heads, L, L]``.
                Positions where ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape ``[B, L, d_model]``.
        """
        batch_size = query.size(0)

        # [B, L, d_model]
        Q = torch.matmul(query, self.w_q)
        K = torch.matmul(key, self.w_k)
        V = torch.matmul(value, self.w_v)

        # [B, L, d_model] -> [B, L, heads, d_head] -> [B, heads, L, d_head]
        Q = Q.view(batch_size, -1, self.heads, self.d_head).transpose(1, 2)
        K = K.view(batch_size, -1, self.heads, self.d_head).transpose(1, 2)
        V = V.view(batch_size, -1, self.heads, self.d_head).transpose(1, 2)

        # [B, heads, L, d_head] @ [B, heads, d_head, L] -> [B, heads, L, L]
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_head ** 0.5)

        if mask is not None:
            # A very negative score becomes a weight of 0 after softmax.
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)

        attention_weights = F.softmax(attention_scores, dim=-1)

        context = torch.matmul(attention_weights, V)

        # [B, heads, L, d_head] -> [B, L, heads, d_head] -> [B, L, heads * d_head]
        # transpose() leaves the tensor non-contiguous, so contiguous() is
        # required before view().
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

        output = torch.matmul(context, self.w_o)

        return output

    def parameters(self):
        """Return the four trainable weight tensors."""
        return [self.w_q, self.w_k, self.w_v, self.w_o]

    def train(self):
        """Set the ``training`` flag to True."""
        self.training = True

    def eval(self):
        """Set the ``training`` flag to False."""
        self.training = False

In [49]:
# Set-up for trying out the attention module: hyper-parameters, a model
# instance, a random input batch and a random target of the same shape.
d_model = 512
heads = 8
B, L = 10, 16

# The model is built first, then X, then target, so the random draws happen in
# the same order as before.
model = MultiHeadAttention(d_model=d_model, heads=heads)

X = torch.randn((B, L, d_model))
target = torch.randn((B, L, d_model))





<__main__.MultiHeadAttention at 0x11420b290>

In [52]:
# Toy 3x3 attention-score matrix for a sequence of one real token followed by
# two padding tokens.
scores = torch.tensor(
    [
        [10.5, 2.3, 1.1],  # scores of "Hi" looking at "Hi", "PAD", "PAD"
        [2.3, 0.5, 0.2],
        [1.1, 0.2, 0.1],
    ]
)

In [55]:
# Column mask: entries equal to 0 mark the key positions to hide.
mask = torch.tensor([1, 0, 0])
# The (3,) mask broadcasts across the rows of the (3, 3) score matrix, so whole
# columns are replaced. masked_fill returns a new tensor; `scores` is unchanged.
scores.masked_fill(mask.eq(0), -1e9)

tensor([[ 1.0500e+01, -1.0000e+09, -1.0000e+09],
        [ 2.3000e+00, -1.0000e+09, -1.0000e+09],
        [ 1.1000e+00, -1.0000e+09, -1.0000e+09]])

In [58]:
# NOTE(review): ad-hoc arithmetic; nothing in the visible notebook uses this
# value -- confirm whether the cell can be removed.
110000/7400

14.864864864864865

In [65]:
# Display a 10x3 tensor of standard-normal samples created with
# requires_grad=True. The result is not assigned; only its repr is shown.
torch.randn(10,3, requires_grad=True)

tensor([[-6.5467e-01, -2.2579e-03, -1.2866e+00],
        [ 3.3274e-01,  1.8297e-01,  1.2852e+00],
        [-2.1007e+00, -2.1623e-01,  3.8635e-01],
        [-3.0317e-01,  1.6471e+00,  4.1609e-01],
        [-7.2088e-02,  5.0962e-01, -3.1438e+00],
        [-1.4600e+00, -3.5646e-01, -1.5728e+00],
        [-3.7831e-02,  4.7514e-01, -4.3947e-01],
        [ 4.9029e-01, -1.1372e-01, -1.1840e+00],
        [-2.3882e-01, -9.1874e-02,  1.0300e+00],
        [-9.0995e-01, -1.2539e-02,  9.3577e-01]], requires_grad=True)

In [9]:
class MultiHeadAttention:
    """Multi-head scaled dot-product attention built from raw tensors.

    Holds four ``(d_model, d_model)`` weight matrices (query, key, value and
    output projections) as plain leaf tensors with ``requires_grad=True``, and
    offers a small ``nn.Module``-like API: ``parameters``, ``train``, ``eval``,
    ``to`` and ``zero_grad``.

    Args:
        d_model: Size of the last dimension of the inputs and of the output.
        heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``heads``.
    """

    # Attribute names of the trainable weights, in creation order.
    _WEIGHT_NAMES = ("w_q", "w_k", "w_v", "w_o")

    def __init__(
        self,
        d_model: int,
        heads: int,
    ):
        # Validate before any state is stored on the instance.
        if d_model % heads != 0:
            raise ValueError("d_model must be divisible by heads.")

        self.d_model = d_model
        self.heads = heads
        self.head_dim = d_model // heads
        self.training = True

        # Created in the order q, k, v, o so that a seeded run reproduces the
        # same weights.
        self.w_q = self._init_weight(d_model)
        self.w_k = self._init_weight(d_model)
        self.w_v = self._init_weight(d_model)
        self.w_o = self._init_weight(d_model)

    @staticmethod
    def _init_weight(d_model: int) -> torch.Tensor:
        """Return a ``(d_model, d_model)`` leaf tensor scaled by 1/sqrt(d_model)."""
        weight = torch.randn(d_model, d_model) / math.sqrt(d_model)
        # Gradients are enabled after scaling so the stored tensor stays a leaf.
        weight.requires_grad = True
        return weight

    def __call__(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Alias for :meth:`forward`."""
        return self.forward(query, key, value, mask)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Compute multi-head attention.

        Args:
            query, key, value: Tensors of shape ``[B, L, d_model]``.
            mask: Optional tensor broadcastable to ``[B, heads, L, L]``.
                Positions where ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape ``[B, L, d_model]``.
        """
        batch_size = query.size(0)

        # [B,L,d_model] -> [B,L,d_model]
        q = torch.matmul(query, self.w_q)
        k = torch.matmul(key, self.w_k)
        v = torch.matmul(value, self.w_v)

        # [B,L,d_model] -> [B,L,heads,head_dim] -> [B,heads,L,head_dim]
        q = q.view(batch_size, -1, self.heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, -1, self.heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, -1, self.heads, self.head_dim).transpose(1, 2)

        # Attention scores
        # [B,heads,L,head_dim] @ [B,heads,head_dim,L] -> [B,heads,L,L]
        attention_scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)

        # Padding mask: a very negative score becomes a weight of 0 after softmax.
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)

        # Attention weights
        # [B,heads,L,L] @ [B,heads,L,head_dim] -> [B,heads,L,head_dim]
        attention_weights = torch.softmax(attention_scores, dim=-1)
        context = torch.matmul(attention_weights, v)

        # Concatenate heads
        # [B,heads,L,head_dim] -> [B,L,heads,head_dim] -> [B,L,d_model]
        # transpose() leaves the tensor non-contiguous, so contiguous() is
        # required before view().
        context = context.transpose(1, 2).contiguous()
        context = context.view(batch_size, -1, self.d_model)

        output = torch.matmul(context, self.w_o)

        return output

    def parameters(self):
        """Return the four trainable weight tensors."""
        return [self.w_q, self.w_k, self.w_v, self.w_o]

    def train(self) -> None:
        """Set the ``training`` flag to True."""
        self.training = True

    def eval(self) -> None:
        """Set the ``training`` flag to False."""
        self.training = False

    def to(self, device: torch.device):
        """Move every weight to ``device`` and return ``self``.

        The copy is made under ``torch.no_grad()`` so each moved weight is a
        leaf tensor again. A plain ``w.to(device)`` on a tensor that requires
        grad returns a non-leaf result of an autograd op, and ``backward()``
        would never fill in its ``.grad``.

        When a weight is actually copied to a new device, the new tensor starts
        without an accumulated ``.grad``. When it is already on ``device``, the
        same tensor object is kept.
        """
        for name in self._WEIGHT_NAMES:
            weight = getattr(self, name)
            with torch.no_grad():
                moved = weight.to(device)
            if moved is not weight:
                moved.requires_grad_(weight.requires_grad)
            setattr(self, name, moved)

        return self

    def zero_grad(self) -> None:
        """Reset accumulated gradients of all weights to zero, in place."""
        for param in self.parameters():
            if param.grad is not None:
                param.grad.zero_()

In [13]:
# NLP Example
batch, sentence_length, embedding_dim = 20, 5, 10
embedding = torch.randn(batch, sentence_length, embedding_dim)
layer_norm = nn.LayerNorm(embedding_dim)
# Activate module

layer_norm(embedding).shape


torch.Size([20, 5, 10])

In [14]:
# Sum of all normalised values. With the default affine parameters each
# embedding vector is centred to zero mean, so the total is ~0 up to
# floating-point error.
layer_norm(embedding).sum()

tensor(1.9073e-06, grad_fn=<SumBackward0>)