<a href="https://colab.research.google.com/github/sugarghost/hanghae99_AI_PLUS_2/blob/main/2_HOMEWORK_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## [2주차] 심화과제: Multi-head Attention으로 감정 분석 모델 구현하기

In [1]:
!pip install datasets sacremoses



In [2]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence # [my code] 추가
from transformers import BertTokenizerFast
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)


ds = load_dataset("stanfordnlp/imdb")
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')


def collate_fn(batch):
  max_len = 400
  texts, labels = [], []
  for row in batch:
    labels.append(row['label'])
    texts.append(row['text'])

  texts = torch.LongTensor(tokenizer(texts, padding=True, truncation=True, max_length=max_len).input_ids)
  labels = torch.LongTensor(labels)

  return texts, labels


train_loader = DataLoader(
    ds['train'], batch_size=64, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    ds['test'], batch_size=64, shuffle=False, collate_fn=collate_fn
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


## Multi-head attention(MHA) 구현
  - Self-attention module을 MHA로 확장해주시면 됩니다. 여기서 MHA는 다음과 같이 구현합니다.
    1. 기존의 $W_q, W_k, W_v$를 사용하여 $Q, K, V$를 생성합니다. 이 부분은 코드 수정이 필요 없습니다.
    2. $Q, K, V \in \mathbb{R}^{S \times D}$가 있을 때, 이를 $Q, K, V \in \mathbb{R}^{S \times H \times D’}$으로 reshape 해줍니다. 여기서 $H$는 `n_heads`라는 인자로 받아야 하고, $D$가 $H$로 나눠 떨어지는 값이여야 하는 제약 조건이 필요합니다. $D = H \times D’$입니다.
    3. $Q, K, V$를 $Q, K, V \in \mathbb{R}^{H \times S \times D’}$의 shape으로 transpose해줍니다.
    4. $A = QK^T/\sqrt{D'} \in \mathbb{R}^{H \times S \times S}$를 기존의 self-attention과 똑같이 계산합니다. 이 부분은 코드 수정이 필요 없습니다.
    5. Mask를 더합니다. 기존과 $A$의 shape이 달라졌기 때문에 dimension을 어떻게 맞춰줘야할지 생각해줘야 합니다.
    6. $\hat{x} = \textrm{Softmax}(A)V \in \mathbb{R}^{H \times S \times D'}$를 계산해주고 transpose와 reshape을 통해 $\hat{x} \in \mathbb{R}^{S \times D}$의 shape으로 다시 만들어줍니다.
    7. 기존과 똑같이 $\hat{x} = \hat{x} W_o$를 곱해줘서 마무리 해줍니다. 이 또한 코드 수정이 필요 없습니다.

In [3]:
from torch import nn
from math import sqrt


class SelfAttention(nn.Module):
  def __init__(self, input_dim, d_model):
    super().__init__()

    self.input_dim = input_dim
    self.d_model = d_model

    self.wq = nn.Linear(input_dim, d_model)
    self.wk = nn.Linear(input_dim, d_model)
    self.wv = nn.Linear(input_dim, d_model)
    self.dense = nn.Linear(d_model, d_model)

    self.softmax = nn.Softmax(dim=-1)

  def forward(self, x, mask):
    q, k, v = self.wq(x), self.wk(x), self.wv(x)
    score = torch.matmul(q, k.transpose(-1, -2)) # (B, S, D) * (B, D, S) = (B, S, S)
    score = score / sqrt(self.d_model)

    if mask is not None:
      score = score + (mask * -1e9)

    score = self.softmax(score)
    result = torch.matmul(score, v)
    result = self.dense(result)

    return result


# [MY CODE] 멀티헤드 어텐션
class MultiHeadAttention(nn.Module):
    def __init__(self, input_dim, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        self.input_dim = input_dim
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_head = d_model // n_heads  # D' = D / H

        # Q, K, V 각각 가중치 매트릭스 생성
        self.W_q = nn.Linear(input_dim, d_model)
        self.W_k = nn.Linear(input_dim, d_model)
        self.W_v = nn.Linear(input_dim, d_model)
        self.dense = nn.Linear(d_model, d_model)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask):
        batch_size, seq_len, _ = x.shape


        # Q, K, V 생성
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)

        Q = Q.view(batch_size, seq_len, self.n_heads, self.d_head).transpose(1, 2)
        K = K.view(batch_size, seq_len, self.n_heads, self.d_head).transpose(1, 2)
        V = V.view(batch_size, seq_len, self.n_heads, self.d_head).transpose(1, 2)


        score = torch.matmul(Q, K.transpose(-1, -2))
        score = score / sqrt(self.d_head)

        if mask is not None:
          # [MY CODE] 마스크 차원 변경
          mask = mask.unsqueeze(1)
          score = score + (mask * -1e9)

        score = self.softmax(score)
        output = torch.matmul(score, V)
        output = output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        output = self.dense(output)

        return output

## Layer normalization, dropout, residual connection 구현
  - 다시 `TransformerLayer` class로 돌아와서 과제를 진행하시면 됩니다.
  - Attention module을 $MHA$, feed-forward layer를 $FFN$이라고 하겠습니다.

In [4]:
class TransformerLayer(nn.Module):
  def __init__(self, input_dim, d_model, n_heads, dff, dropout_param):
    super().__init__()

    self.input_dim = input_dim
    self.d_model = d_model
    self.dff = dff

    self.mha = MultiHeadAttention(input_dim, d_model, n_heads) # [MY CODE] 멀티헤드 어텐션 추가
    #self.sa = SelfAttention(input_dim, d_model)
    self.ffn = nn.Sequential(
      nn.Linear(d_model, dff),
      nn.ReLU(),
      nn.Linear(dff, d_model)
    )

    # [MY CODE] Layer 추가 및 드롭아웃 적용
    self.layernorm1 = nn.LayerNorm(d_model)
    self.layernorm2 = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout_param)

  def forward(self, x, mask):

    # [MY CODE] Multi-Head Attention 적용
    mha = self.mha(x, mask)
    mha = self.dropout(mha)
    x = self.layernorm1(mha + x)

    ffn = self.ffn(x)
    ffn = self.dropout(ffn)
    x = self.layernorm2(ffn + x)

    return x

In [5]:
import numpy as np


def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, None], np.arange(d_model)[None, :], d_model)
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    pos_encoding = angle_rads[None, ...]

    return torch.FloatTensor(pos_encoding)


max_len = 400
print(positional_encoding(max_len, 256).shape)

torch.Size([1, 400, 256])


## 5-layer 4-head Transformer
  - 기존 실습에서 사용한 hyper-parameter들과 위에서 구현한 Transformer를 가지고 5-layer 4-head Transformer의 성능 결과를 report해주시면 됩니다.

In [6]:
class TextClassifier(nn.Module):
  # [MY CODE] 매개변수 조정
  def __init__(self, vocab_size, d_model, n_layers, n_heads, dff, dropout_param = 0.1):
    super().__init__()

    self.vocab_size = vocab_size
    self.d_model = d_model
    self.n_layers = n_layers
    self.dff = dff

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = nn.parameter.Parameter(positional_encoding(max_len, d_model), requires_grad=False)
    self.layers = nn.ModuleList([TransformerLayer(d_model, d_model, n_heads, dff, dropout_param) for _ in range(n_layers)])
    self.classification = nn.Linear(d_model, 1)

  def forward(self, x):
    mask = (x == tokenizer.pad_token_id)
    mask = mask[:, None, :]
    seq_len = x.shape[1]

    x = self.embedding(x)
    x = x * sqrt(self.d_model)
    x = x + self.pos_encoding[:, :seq_len]

    for layer in self.layers:
      x = layer(x, mask)

    x = x[:, 0]
    x = self.classification(x)

    return x


model = TextClassifier(len(tokenizer), 32, 5, 4, 32, 0.1) # [MY CODE] 5-layer 4-head Transformer

In [7]:
from torch.optim import Adam

lr = 0.001
model = model.to('cuda')
loss_fn = nn.BCEWithLogitsLoss()

optimizer = Adam(model.parameters(), lr=lr)

In [8]:
import numpy as np
import matplotlib.pyplot as plt


def accuracy(model, dataloader):
  cnt = 0
  acc = 0

  for data in dataloader:
    inputs, labels = data
    inputs, labels = inputs.to('cuda'), labels.to('cuda')

    preds = model(inputs)
    # preds = torch.argmax(preds, dim=-1)
    preds = (preds > 0).long()[..., 0]

    cnt += labels.shape[0]
    acc += (labels == preds).sum().item()

  return acc / cnt

# [MY CODE] 측정 함수 추가
def plot_acc(train_accs, test_accs, label1='train', label2='test'):
  x = np.arange(len(train_accs))

  plt.plot(x, train_accs, label=label1)
  plt.plot(x, test_accs, label=label2)
  plt.legend()
  plt.show()

In [9]:
n_epochs = 50
# [MY CODE] train 로직 분리
def train(model, optimizer, trainloader, testloader, n_epochs):
  train_acc_list = []
  test_acc_list = []
  for epoch in range(n_epochs):
    total_loss = 0.
    model.train()
    for data in train_loader:
      model.zero_grad()
      inputs, labels = data
      inputs, labels = inputs.to('cuda'), labels.to('cuda').float()

      preds = model(inputs)[..., 0]
      loss = loss_fn(preds, labels)
      loss.backward()
      optimizer.step()

      total_loss += loss.item()

    print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

    with torch.no_grad():
      model.eval()
      train_acc = accuracy(model, train_loader)
      test_acc = accuracy(model, test_loader)
      train_acc_list.append(train_acc)
      test_acc_list.append(test_acc)
      print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")
  # MY CODE 끔직하게도 LIST를 내다 던져버림
  return train_acc, test_acc

In [10]:
train_acc_list, test_acc_list = train(model, optimizer, train_loader, test_loader, n_epochs)
plot_acc(train_acc_list, test_acc_list) # MY CODE 망함
# [LOG] 다시 돌릴 리소스가 모잘라서...기존 PRINT 내역으로 만족 학습은 안정적으로 된것같고 loss가 많이 줄은 상태라 test acc는 변동이 적음

Epoch   0 | Train Loss: 216.41581273078918
Epoch   1 | Train Loss: 146.84099520742893
Epoch   2 | Train Loss: 119.5985151603818
Epoch   3 | Train Loss: 95.06357414275408
Epoch   4 | Train Loss: 75.2876220792532
Epoch   5 | Train Loss: 55.99532072991133
Epoch   6 | Train Loss: 42.942104674875736
Epoch   7 | Train Loss: 31.157082250341773
Epoch   8 | Train Loss: 25.706993332132697
Epoch   9 | Train Loss: 22.0138824111782
Epoch  10 | Train Loss: 19.82174086244777
Epoch  11 | Train Loss: 16.57027206593193
Epoch  12 | Train Loss: 14.40194210736081
Epoch  13 | Train Loss: 13.799567785812542
Epoch  14 | Train Loss: 13.288637334946543
Epoch  15 | Train Loss: 12.95441692462191
Epoch  16 | Train Loss: 11.742997315945104
Epoch  17 | Train Loss: 11.555974917951971
Epoch  18 | Train Loss: 12.175457398523577
Epoch  19 | Train Loss: 10.716775225591846
Epoch  20 | Train Loss: 11.772582493256778
Epoch  21 | Train Loss: 10.46569542947691
Epoch  22 | Train Loss: 10.113245252985507
Epoch  23 | Train Loss:

TypeError: object of type 'float' has no len()