In [1]:
import torch
import torch.nn as nn

In [2]:
# Sample sentence, tokenized by splitting on runs of whitespace.
input_text = "나는 최근 파리 여행을 다녀왔다"
input_text_list = input_text.split(sep=None)
print("input_text_list: ", input_text_list)

input_text_list:  ['나는', '최근', '파리', '여행을', '다녀왔다']


In [3]:
# Build the index -> word table first, then invert it to get word -> index.
idx2str = dict(enumerate(input_text_list))
str2idx = {word: idx for idx, word in idx2str.items()}
print("str2idx: ", str2idx)
print("idx2str: ", idx2str)

str2idx:  {'나는': 0, '최근': 1, '파리': 2, '여행을': 3, '다녀왔다': 4}
idx2str:  {0: '나는', 1: '최근', 2: '파리', 3: '여행을', 4: '다녀왔다'}


In [4]:
# Look up the integer id of every token, preserving sentence order.
input_ids = list(map(str2idx.__getitem__, input_text_list))
print("input_ids: ", input_ids)

input_ids:  [0, 1, 2, 3, 4]


In [5]:
embedding_dim = 16
# Embedding table with one row per entry of input_ids.
embedded_layer = nn.Embedding(num_embeddings=len(input_ids), embedding_dim=embedding_dim)

# Embed the ids and add a leading axis of size 1 in a single expression.
input_embeddings = embedded_layer(torch.tensor(input_ids)).unsqueeze(0)
input_embeddings.shape

torch.Size([1, 5, 16])

In [6]:
max_position = 12
# The token table is created before the position table so that the two layers
# draw their random initial weights in a fixed order.
embed_layer = nn.Embedding(num_embeddings=len(input_ids), embedding_dim=embedding_dim)
position_embed_layer = nn.Embedding(num_embeddings=max_position, embedding_dim=embedding_dim)

# Positions 0 .. len(input_ids) - 1, with a leading axis of size 1.
position_ids = torch.arange(len(input_ids), dtype=torch.long)[None, :]
position_ids.shape

torch.Size([1, 5])

In [7]:
# Token vectors, given a leading axis of size 1 so they line up with position_ids.
token_embeddings = embed_layer(torch.tensor(input_ids)).unsqueeze(0)
# Position vectors looked up from the position embedding table.
position_encodings = position_embed_layer(position_ids)
# The model input is the element-wise sum of the two.
input_embeddings = token_embeddings + position_encodings
input_embeddings.shape

torch.Size([1, 5, 16])

## **예제 2.4**

In [8]:
head_dim = 16

# Three independent linear projections (created in q, k, v order).
weight_q = nn.Linear(in_features=embedding_dim, out_features=head_dim)
weight_k = nn.Linear(in_features=embedding_dim, out_features=head_dim)
weight_v = nn.Linear(in_features=embedding_dim, out_features=head_dim)

# Project the same input embeddings into query, key and value tensors.
querys, keys, values = (
    projection(input_embeddings) for projection in (weight_q, weight_k, weight_v)
)
print(querys.shape, keys.shape, values.shape, sep="\n")

torch.Size([1, 5, 16])
torch.Size([1, 5, 16])
torch.Size([1, 5, 16])


## **예제 2.5**

In [9]:
from math import sqrt
import torch.nn.functional as F

def compute_attention(querys, keys, values, is_causal=False):
  """Scaled dot-product attention.

  Args:
    querys: Tensor of shape (..., query_len, dim_k).
    keys: Tensor of shape (..., key_len, dim_k).
    values: Tensor of shape (..., key_len, dim_v).
    is_causal: When True, query position i may only attend to key positions
      j <= i; later positions are masked out before the softmax.

  Returns:
    Tensor of shape (..., query_len, dim_v): the attention-weighted sum of
    ``values`` for every query position.
  """
  dim_k = querys.size(-1)
  # Scale by sqrt(dim_k) so the logits do not grow with the feature width.
  scores = querys @ keys.transpose(-2, -1) / sqrt(dim_k)
  if is_causal:
    query_length, key_length = scores.shape[-2:]
    # Lower-triangular mask: True where attention is allowed (key index <= query index).
    allowed = torch.ones(
        query_length, key_length, dtype=torch.bool, device=scores.device
    ).tril(diagonal=0)
    # -inf logits become exactly zero weight after the softmax.
    scores = scores.masked_fill(~allowed, float("-inf"))
  weights = F.softmax(scores, dim=-1)
  return weights @ values

## **예제 2.6**

In [10]:
print("원본 입력 형태: ", input_embeddings.shape)

# Run attention over the projected query/key/value tensors.
after_attention_embeddings = compute_attention(querys=querys, keys=keys, values=values)

print("어텐션 적용 후 형태: ", after_attention_embeddings.shape)

원본 입력 형태:  torch.Size([1, 5, 16])
어텐션 적용 후 형태:  torch.Size([1, 5, 16])


## **예제 2.7**

In [11]:
class AttentionHead(nn.Module):
  """A single attention head with its own query/key/value projections.

  Args:
    token_embed_dim: Size of the last dimension of the incoming tensors.
    head_dim: Output size of each of the three linear projections.
    is_causal: Forwarded to ``compute_attention`` on every call.
  """

  def __init__(self, token_embed_dim, head_dim, is_causal=False):
    super().__init__()
    self.is_causal = is_causal
    self.weight_q = nn.Linear(token_embed_dim, head_dim)
    self.weight_k = nn.Linear(token_embed_dim, head_dim)
    self.weight_v = nn.Linear(token_embed_dim, head_dim)

  def forward(self, querys, keys, values):
    """Project the inputs and return the attention output.

    Args:
      querys: Tensor whose last dimension is ``token_embed_dim``.
      keys: Tensor whose last dimension is ``token_embed_dim``.
      values: Tensor whose last dimension is ``token_embed_dim``.

    Returns:
      The result of ``compute_attention`` on the projected tensors; its last
      dimension is ``head_dim``.
    """
    outputs = compute_attention(
        self.weight_q(querys),
        self.weight_k(keys),
        self.weight_v(values),
        is_causal=self.is_causal
    )
    # Return the result explicitly; without it the module would yield None.
    return outputs

In [12]:
# Self-attention: the same tensor is supplied as queries, keys and values.
attention_head = AttentionHead(token_embed_dim=embedding_dim, head_dim=head_dim)
after_attention_embeddings = attention_head(
    querys=input_embeddings,
    keys=input_embeddings,
    values=input_embeddings,
)

## **예제 2.8**

In [13]:
class MultiHeadAttention(nn.Module):
  """Multi-head attention built on ``compute_attention``.

  The inputs are projected to ``d_model`` features, split into ``n_head``
  heads of ``d_model // n_head`` features each, attended per head, merged
  back together and passed through a final linear layer.

  Args:
    token_embed_dim: Size of the last dimension of the incoming tensors.
    d_model: Total width of the projections (all heads together) and of the output.
    n_head: Number of attention heads; must divide ``d_model``.
    is_causal: Forwarded to ``compute_attention`` on every call.

  Raises:
    ValueError: If ``d_model`` is not divisible by ``n_head``.
  """

  def __init__(self, token_embed_dim, d_model, n_head, is_causal=False):
    super().__init__()
    if d_model % n_head != 0:
      raise ValueError(
          f"d_model ({d_model}) must be divisible by n_head ({n_head})"
      )
    self.n_head = n_head
    self.d_model = d_model
    self.is_causal = is_causal
    # Project to d_model (not a notebook-level global) so the per-head split
    # in forward() is always consistent with the constructor arguments.
    self.weight_q = nn.Linear(token_embed_dim, d_model)
    self.weight_k = nn.Linear(token_embed_dim, d_model)
    self.weight_v = nn.Linear(token_embed_dim, d_model)
    self.concat_linear = nn.Linear(d_model, d_model)

  def forward(self, querys, keys, values):
    """Apply multi-head attention.

    Args:
      querys: Tensor of shape (batch, query_len, token_embed_dim).
      keys: Tensor of shape (batch, key_len, token_embed_dim).
      values: Tensor of shape (batch, key_len, token_embed_dim).

    Returns:
      Tensor of shape (batch, query_len, d_model).
    """
    batch_size, query_len, _ = querys.size()
    head_dim = self.d_model // self.n_head

    def split_heads(projected):
      # (batch, seq, d_model) -> (batch, n_head, seq, head_dim); the sequence
      # length is read from the tensor itself so keys/values may differ in
      # length from the queries.
      return projected.view(
          batch_size, projected.size(1), self.n_head, head_dim
      ).transpose(1, 2)

    querys = split_heads(self.weight_q(querys))
    keys = split_heads(self.weight_k(keys))
    values = split_heads(self.weight_v(values))
    attention = compute_attention(querys, keys, values, self.is_causal)
    # Merge heads: (batch, n_head, query_len, head_dim) -> (batch, query_len, d_model).
    output = attention.transpose(1, 2).contiguous().view(
        batch_size, query_len, self.d_model
    )
    output = self.concat_linear(output)
    return output



In [14]:
n_head = 4
# Token width and model width are both embedding_dim in this example.
mh_attention = MultiHeadAttention(
    token_embed_dim=embedding_dim,
    d_model=embedding_dim,
    n_head=n_head,
)
after_attention_embeddings = mh_attention(
    querys=input_embeddings,
    keys=input_embeddings,
    values=input_embeddings,
)
after_attention_embeddings.shape

torch.Size([1, 5, 16])

## **예제 2.9**

In [15]:
# Layer normalization over the last dimension (size embedding_dim).
norm = nn.LayerNorm(normalized_shape=embedding_dim)
norm_x = norm(input_embeddings)
norm_x.shape

torch.Size([1, 5, 16])

In [16]:
# Mean and standard deviation over the last dimension after LayerNorm.
# Tensor.std() applies Bessel's correction by default, whereas LayerNorm
# normalizes with the biased variance, so with N = embedding_dim = 16 the
# reported std is sqrt(N / (N - 1)) ~= 1.0328 rather than exactly 1.
norm_x.mean(-1).data, norm_x.std(-1).data

(tensor([[-1.4901e-08, -5.5879e-08, -2.9802e-08, -1.4901e-08, -1.1176e-08]]),
 tensor([[1.0328, 1.0328, 1.0328, 1.0328, 1.0328]]))

## **예제 2.10**

In [17]:
class PreLayerNormFeedForward(nn.Module):
  """Position-wise feed-forward block with pre-layer-normalization.

  Computes ``x + dropout2(linear2(dropout1(GELU(linear1(LayerNorm(x))))))``:
  the input is normalized before the feed-forward network, and the residual
  connection adds the original, un-normalized input back to the result.

  Args:
    d_model: Size of the last dimension of the input and output.
    dim_feedforward: Hidden width of the feed-forward network.
    dropout: Dropout probability used by both dropout layers.
  """

  def __init__(self, d_model, dim_feedforward, dropout):
    super().__init__()
    self.linear1 = nn.Linear(d_model, dim_feedforward)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.activation = nn.GELU()
    self.norm = nn.LayerNorm(d_model)

  def forward(self, x):
    """Apply the block to ``x`` and return a tensor of the same shape."""
    # Keep the un-normalized input for the skip connection so the identity
    # path stays intact.
    residual = x
    x = self.norm(x)
    x = self.linear2(self.dropout1(self.activation(self.linear1(x))))
    # Dropout is applied to the feed-forward branch only, never to the residual.
    return residual + self.dropout2(x)

## **예제 2.11**