# Assignment 4: Attention
- The assignment is still under construction.
- But still, you can solve Problems 1 and 2
- If you find any error, please do not hesitate to report or make a question on Cyber Campus
    - Don't spend too much time trying to resolve an error. The code has not been thoroughly checked, so the error may not be your fault.

In [None]:
# If you are in Colab, install transformers 
!pip -q install transformers

In [None]:
import torch
import torch.nn as nn
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import numpy as np

from torch.nn.utils.rnn import PackedSequence, pad_sequence, pack_sequence, pad_packed_sequence, pack_padded_sequence
from torch.utils.data import DataLoader
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizerFast

import os

# Below helps to run tokenizer with multiprocessing
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Below helps to print Korean letters in plt
def fix_font():
    """Install the Nanum fonts and make Matplotlib use NanumBarunGothic.

    Runs ``apt-get`` and ``fc-cache`` through the shell, rebuilds Matplotlib's
    font cache, and then sets ``font.family`` / ``axes.unicode_minus`` in
    ``plt.rcParams``. Returns nothing.
    """
    # From https://HC.Dle.pw, By Jinseo Kim
    # v1.0.0
    import os
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    # Exit codes of the two shell commands are not checked.
    os.system("apt-get install -y fonts-nanum")
    os.system("fc-cache -fv")
    # NOTE(review): `_rebuild` is a private Matplotlib function and may be
    # missing in newer releases; the caller wraps this function in try/except
    # - confirm against the installed version.
    mpl.font_manager._rebuild()
    # Point the module-level `findfont` hooks at the font manager's bound method.
    findfont = mpl.font_manager.fontManager.findfont
    mpl.font_manager.findfont = findfont
    mpl.backends.backend_agg.findfont = findfont
    plt.rcParams['font.family'] = "NanumBarunGothic"
    plt.rcParams['axes.unicode_minus'] = False

try:
  fix_font()
except Exception:
  # Font installation is best-effort (it needs apt-get and private Matplotlib
  # APIs). On failure, still request the font so plots render Korean text if
  # the font happens to be available. `Exception` instead of a bare `except`
  # keeps Ctrl-C / SystemExit working.
  plt.rcParams['font.family'] = "NanumBarunGothic"
  plt.rcParams['axes.unicode_minus'] = False

#### Vectorization

In [None]:
# Example: vectorized dot products between two sequences of different length.
# Every encoder state is scored against every decoder state with one matmul.

e_states = torch.randn(100, 16)
d_states = torch.randn(80, 16)

# (100, 16) x (16, 80) = (100, 80); entry [i, j] is e_states[i] . d_states[j]
dot_product = torch.mm(e_states, d_states.t())
dot_product

## Problem 1: Implement Dot Product Attention (12 pts)

- Optimizing computation time is really important
    - Use `torch.mm()` or `torch.matmul()`
    - `torch.mm(a, b)` is a function for calculating the matrix multiplication of two matrices `a` and `b`
        - `a` and `b` has to be 2-dim tensors
        - `a.shape[1]` has to be equal to `b.shape[0]`
    - `torch.matmul()` is a function for matrix multiplication but with broadcasting
        - https://pytorch.org/docs/stable/generated/torch.matmul.html
        - It has less restriction on its input shape.
            - It automatically matches the dimension of two tensors following some rules
            - Therefore, it is a bit risky to use this function if you don't understand how it works

### Hint: Dot product as matrix multiplication.

- Let's say there are two vectors, $u=\begin{bmatrix}-3 \\ 2 \\ 1\end{bmatrix}$ and $v = \begin{bmatrix} 5 \\ 4 \\ 6\end{bmatrix}$
    - The dot product of the two vectors is $(-3 \times 5) + (2 \times 4) + (1 \times 6) = -1$
    - It is equivalent to $u^T \times v$
        - In this case $u\in\mathbb{R}^{3 \times 1}$ and $v\in\mathbb{R}^{3 \times 1}$
- In PyTorch, this can be described as below:
    - `u = torch.Tensor([-3, 2, 1])`
    - `v = torch.Tensor([5, 4, 6])`
    - The dot product of u and v can be calculated in any of the following ways:
        - `torch.mm(u.unsqueeze(0), v.unsqueeze(1))`
            - `u.unsqueeze(0).shape == [1, 3]`
            - `v.unsqueeze(1).shape == [3, 1]`
            - `unsqueeze()` returns a new tensor with a dimension of size one inserted at the specified position.
            - The result has shape of [1,1]
        - `torch.matmul(u, v)`
        - `u @ v`
            - `@` denotes matrix multiplication, which was introduced from Python 3.5
        - `(u * v).sum()`
            - This will be much slower than the others, because it first does an element-wise multiplication and then sums the result

In [None]:
'''
Hint: Dot product as matrix multiplcation.
'''

# Two 3-element vectors; their dot product is (-3*5) + (2*4) + (1*6) = -1.
u = torch.Tensor([-3, 2, 1])
v = torch.Tensor([5, 4, 6])

# Four equivalent ways of computing the same dot product. The torch.mm variant
# first reshapes u to [1, 3] and v to [3, 1], so its result has shape [1, 1].
print(f"Result of (u * v).sum() is {(u * v).sum()}. This computation is much slower than others because it use element-wise multiplication instead of matrix multiplication") 
print(f"Result of torch.mm(u.unsqueeze(0), v.unsqueeze(1)) is {torch.mm(u.unsqueeze(0), v.unsqueeze(1))}")
print(f"Result of torch.matmul(u, v) is {torch.matmul(u, v)}")
print(f"Result of u @ v is {u @ v}")

In [None]:
def get_attention_score_for_a_single_query(keys, query):
  '''
  Return the attention score of every vector in `keys` for a single `query`.

  You can regard `keys` as the encoder hidden states over time steps, and `query`
  as the decoder hidden state at one specific time step.
  The name 'keys' is used because these vectors are what the query is matched
  against when computing the attention score (match rate between key and query).

  For every C-dimensional key vector, the attention score is the dot product
  between that key and the query vector.

  Arguments:
    keys (torch.Tensor): Has a shape of [T, C]. The vectors the query attends to.
    query (torch.Tensor): Has a shape of [C]. The vector that attends to the keys.

  Output:
    attention_score (torch.Tensor): Real-valued scores that represent how much the
                                    query has to attend to each vector in keys.
                                    Has a shape of [T].

    attention_score[i] has to be the dot product between keys[i] and query.

  NOTE: This is an exercise stub; it currently returns None.

  TODO: Complete this function using torch.mm (matrix multiplication)
  Hint: You can use tensor.unsqueeze(dim) to insert a dimension of length 1
        without changing the values stored in the tensor.
  '''
  
  return


# Seeded fixture for Problem 1. The expected values in the next test cell
# presumably come from this exact seed and draw order (keys first, then query),
# so do not reorder these statements - TODO confirm.
torch.manual_seed(0)
num_t = 23
h_size = 16

keys = torch.randn(num_t, h_size)
query = torch.randn(h_size)

att_score = get_attention_score_for_a_single_query(keys, query)
att_score

In [None]:
# Test case for get_attention_score_for_a_single_query: the result must be a
# 1-D tensor with one score per key and must match the reference values.
shape_is_valid = att_score.ndim == 1 and len(att_score) == num_t
assert shape_is_valid, "Error: Check output shape"

answer = torch.Tensor([-3.0786,  2.1729,  1.7950, -5.0503,  3.3254,  0.2828, -0.9800, -1.8868,
         0.2550,  2.9389, -0.1799, -1.0586,  0.1465, -0.9441,  0.8888, -3.8108,
        -2.5662, -1.1660, -2.2327,  2.7087, -0.5800,  8.7984,  4.3816])
largest_deviation = (att_score - answer).abs().max()
assert largest_deviation < 1e-4, "Error: The output value is different"
print("Passed all the cases!")

In [None]:
def get_attention_weight_from_score(attention_score):
  '''
  Convert attention scores into attention weights.

  Argument:
    attention_score (torch.Tensor): Tensor of real numbers. Has a shape of [T].

  Output:
    attention_weight (torch.Tensor): Tensor of real numbers between 0 and 1 whose
                                     sum is 1. Has a shape of [T].

  NOTE: This is an exercise stub; it currently returns None.

  TODO: Complete this function
  '''
  # Only a single 1-D score vector is supported here.
  assert attention_score.ndim == 1
  
  return

# Turn the Problem 1 scores into weights; the bare name displays the result in the notebook.
att_weight = get_attention_weight_from_score(att_score)
att_weight

In [None]:
# Test case for get_attention_weight_from_score.
answer = torch.Tensor([0.0000,     0.0013,     0.0009,     0.0000,     0.0041,     0.0002,
            0.0001,     0.0000,     0.0002,     0.0028,     0.0001,     0.0001,
            0.0002,     0.0001,     0.0004,     0.0000,     0.0000,     0.0000,
            0.0000,     0.0022,     0.0001,     0.9756,     0.0118])
assert att_weight.shape == att_score.shape, 'Shape has to be remained the same'
# Compare with a tolerance: a float32 sum of a correct softmax can differ from
# 1 in the last bit, so an exact `== 1` check may reject a correct answer.
assert torch.abs(att_weight.sum() - 1) < 1e-6, "Sum of attention weight has to be 1"
assert torch.max(torch.abs(att_weight-answer)) < 1e-4, "Error: The output value is different"

print("Passed all the cases!")


In [None]:
def get_weighted_sum(values, attention_weight):
  '''
  Combine the value vectors into one attention vector using the attention weights.

  Argument:
    values (torch.Tensor): Has a shape of [T, C]. The vectors used to form the attention vector.
    attention_weight (torch.Tensor): Has a shape of [T]. The weight of each value vector
                                     in the attention vector.

  Output:
    attention_vector (torch.Tensor): Weighted sum of values using the attention weight.
                                     Has a shape of [C].

  NOTE: This is an exercise stub; it currently returns None.

  TODO: Complete this function using torch.mm
  '''
  return

# In simple dot-product attention, key and value are the same, so `keys` is
# passed as the values. The bare name displays the result in the notebook.
att_vec = get_weighted_sum(keys, att_weight) # In simple dot-product-attention, key and value are the same
att_vec

In [None]:
# Test case for get_weighted_sum: same shape as the query, values match the reference.
answer = torch.Tensor([ 0.6280,  3.8540, -0.1042,  0.3148,  0.3711, -0.5095, -0.9663,  1.3295,
         1.9003, -1.2611, -2.2939, -2.0338,  0.8757, -0.6726,  1.9071, -1.0711])
assert att_vec.shape == query.shape, 'Shape has to be remained the same'
largest_deviation = (att_vec - answer).abs().max()
assert largest_deviation < 1e-4, "Error: The output value is different"
print("Passed all the cases")


## Problem 2: Attention in Batch (16 pts)
- In this problem, you have to calculate attention with batch
- You can use `torch.bmm()` for batch matrix multiplication https://pytorch.org/docs/stable/generated/torch.bmm.html 
    - `torch.bmm()` takes two 3-dim tensor as its input
    - Each tensor has to be 3-dim (atensor.ndim==3)


In [None]:
# Hint for Problem 2
#
# A batch of matrix multiplications can be computed efficiently with
# torch.bmm() or torch.matmul().

torch.manual_seed(0)
# Draw order matters for reproducibility: left1, left2, right1, right2.
matrix_left1, matrix_left2 = torch.randn(5, 3), torch.randn(5, 3)
matrix_right1, matrix_right2 = torch.randn(3, 4), torch.randn(3, 4)

print(f"matrix_left1: \n{matrix_left1}")
print(f"matrix_left2: \n{matrix_left2}")
print(f"matrix_right1: \n{matrix_right1}")
print(f"matrix_right2: \n{matrix_right2}")

print("Let's assume that we have batch of matrix, which is stack of these two matices")
matrix_left = torch.stack([matrix_left1, matrix_left2])
matrix_right = torch.stack([matrix_right1, matrix_right2])

print(f"matrix_left: \n{matrix_left} \n which is shape of {matrix_left.shape}")
print(f"matrix_right: \n{matrix_right}\n which is shape of {matrix_right.shape}")


# Exhaustive method: one torch.mm() per sample inside a Python for loop.
# (This is SLOW when the batch or the matrices get much larger.)
mm_forloop_output = []
for mat_left, mat_right in zip(matrix_left, matrix_right):
  mm_forloop_output.append(torch.mm(mat_left, mat_right))

mm_forloop_stack = torch.stack(mm_forloop_output)
print(f"mat_mul_stack: \n{mm_forloop_stack}\n which is shape of {mm_forloop_stack.shape}")


# Good method: a single torch.bmm() call over the whole batch.
mat_mul_bmm = torch.bmm(matrix_left, matrix_right)
print(f"mat_mul_bmm: \n{mat_mul_bmm}\n which is shape of {mat_mul_bmm.shape}")


In [None]:
def get_attention_score_for_a_batch_query(keys, query):
  '''
  Return a batch of attention scores: for every sample in the batch, the score of
  each key vector for that sample's single query vector.

  You can regard `keys` as the encoder hidden states over time steps, and `query`
  as the decoder hidden state at one specific time step.
  The name 'keys' is used because these vectors are what the query is matched
  against when computing the attention score (match rate between key and query).

  For every C-dimensional key vector, the attention score is the dot product
  between that key and the query vector.

  Arguments:
    keys (torch.Tensor): Has a shape of [N, T, C]. The vectors the query attends to.
    query (torch.Tensor): Has a shape of [N, C]. One query vector per sample.

  Output:
    attention_score (torch.Tensor): Real-valued scores that represent how much the
                                    query has to attend to each vector in keys.
                                    Has a shape of [N, T].

    attention_score[n, i] has to be the dot product between keys[n, i] and query[n].

  NOTE: This is an exercise stub; it currently returns None.

  TODO: Complete this function without using for loop
  Hint: Use torch.bmm or torch.matmul after making both input tensors 3-dim tensors.

  '''
  return 

# Seeded fixture for the batched single-query score: N=6 samples, T=23 keys, C=16.
torch.manual_seed(0)
num_b = 6
num_t = 23
h_size = 16

keys = torch.randn(num_b,num_t, h_size)
query = torch.randn(num_b, h_size)
out = get_attention_score_for_a_batch_query(keys, query)

# Only the output shape [N, T] is checked here.
assert out.ndim == 2 and out.shape == torch.Size([num_b, num_t])

In [None]:
def get_attention_score_for_a_batch_multiple_query(keys, queries):
  '''
  Return a batch of attention scores for multiple queries per sample
  (instead of a single query per sample).

  You can regard `keys` as the encoder hidden states over time steps, and `queries`
  as the decoder hidden states over time steps.
  The name 'keys' is used because these vectors are what a query is matched
  against when computing the attention score (match rate between key and query).

  For every C-dimensional key vector, the attention score is the dot product
  between that key and a query vector.

  Arguments:
    keys (torch.Tensor): Has a shape of [N, Ts, C]. The vectors the queries attend to.
    queries (torch.Tensor): Has a shape of [N, Tt, C]. The vectors that attend to the keys.

  Output:
    attention_score (torch.Tensor): Real-valued scores that represent how much each
                                    query has to attend to each vector in keys.
                                    Has a shape of [N, Ts, Tt].

    attention_score[n, i, t] has to be the dot product between keys[n, i] and queries[n, t].

  NOTE: This is an exercise stub; it currently returns None.

  TODO: Complete this function without using for loop
  HINT: Use torch.bmm() with a proper transpose (permutation) of the given tensors. (You can use tensor.permute())
        Think about which dimension (axis) of the tensors has to be multiplied together and disappear
        after the matrix multiplication, and what shape the result tensor must have.
  '''
  return

# Seeded fixture for the batched multi-query score: N=6, Ts=23 keys, Tt=14 queries, C=16.
# The reference values in the next test cell presumably depend on this exact seed
# and draw order (keys first, then queries) - TODO confirm.
torch.manual_seed(0)
num_b = 6
num_ts = 23
num_tt = 14
h_size = 16

keys = torch.randn(num_b, num_ts, h_size)
queries = torch.randn(num_b, num_tt, h_size)
att_score = get_attention_score_for_a_batch_multiple_query(keys, queries)

att_score

In [None]:
# Test cases for get_attention_score_for_a_batch_multiple_query.
# `answer` is the expected row att_score[2, 4] (one key against all queries);
# `answer2` is the expected column att_score[3, :, 2] (all keys against one query).
answer = torch.Tensor([ 4.9620, -9.6091, -4.9472,  1.4543, -5.6273,  9.1436,  1.4172,  0.0464,
        -5.7033,  4.5473,  7.7498,  1.3405, -3.1877,  2.8759])
answer2 = torch.Tensor([[ 2.5171,  0.6216,  3.7929,  2.6163,  5.3290,  0.3592,  2.3067, -0.1099,
         1.8963,  0.4175, -1.4283,  1.4388, -2.7825, -1.3690, -1.9615, -1.9514,
        -6.4635,  1.9574,  0.1868,  8.5354,  4.6053,  2.8786, -2.1453]])

shape_is_valid = att_score.ndim == 3 and att_score.shape == torch.Size([num_b, num_ts, num_tt])
assert shape_is_valid, 'Check the output shape'
row_deviation = (att_score[2,4] - answer).abs().max()
assert row_deviation < 1e-4, 'Calculated result is wrong'
column_deviation = (att_score[3,:,2] - answer2).abs().max()
assert column_deviation < 1e-4,  'Calculated result is wrong'

print("Passed all the cases!")

In [None]:
def get_masked_softmax(attention_score, mask, mask_value=-1e10):
  '''
  Convert a batch of attention scores into attention weights while ignoring padded keys.

  During batch computation, each sequence in the batch can have a different length.
  To group them into a single tensor, the shorter sequences are usually padded.

  Arguments:
    attention_score (torch.Tensor): Real-valued scores that represent how much each query
                                    has to attend to each vector in keys.
                                    Has a shape of [N, Ts, Tt]
    mask (torch.Tensor): Tensor with a shape of [N, Ts] that tells whether the corresponding
                         key position is valid or not.
                         mask[n, t] == 1 if and only if input_batch[n,t] is not a padded value.
                         If input_batch[n,t] is a padded value, then mask[n,t] == 0
    mask_value (float): Defaults to -1e10. Presumably the score substituted at padded
                        positions before the softmax - TODO confirm intended use.

  Output:
    attention_weight (torch.Tensor): Attention weights between 0 and 1. The sum of
                                    attention_weight along the keys time-step dimension (Ts) is 1.
                                    Has a shape of [N, Ts, Tt]

    attention_weight[n, i, t] has to be the attention weight of values[n, i] for queries[n, t]

  NOTE: This is an exercise stub; it currently returns None.

  TODO: Complete this function without using for loop
  Hint: You can give -infinity value by -float("inf")

  '''

  return


'''
Don't change this codes
'''
# Build a mask shaped like att_score without its last axis, initially all ones.
# It inherits att_score's dtype (it is created with ones_like), so it is not
# necessarily a boolean tensor.
mask = torch.ones_like(att_score)[..., 0]
# Mark the tail of samples 4 and 5 as padding (key positions 15+ and 17+).
mask[4, 15:] = 0
mask[5, 17:] = 0

attention_weight = get_masked_softmax(att_score, mask)
attention_weight

In [None]:
# Test case for get_masked_softmax: padded key positions (15+ of sample 4) must get
# zero weight, and the weights must sum to 1 along the key axis.
answer = torch.Tensor([0.0120,     0.0002,     0.0901,     0.0003,     0.0259,     0.0036,
            0.5617,     0.0108,     0.2508,     0.0054,     0.0001,     0.0010,
            0.0000,     0.0005,     0.0375,     0.0000,     0.0000,     0.0000,
            0.0000,     0.0000,     0.0000,     0.0000,     0.0000])
column_deviation = (attention_weight[4,:,3] - answer).abs().max()
assert column_deviation < 1e-4
normalization_error = (attention_weight.sum(1) - 1).abs().max()
assert normalization_error < 1e-6

In [None]:
def get_batch_weighted_sum(values, attention_weight):
  '''
  Combine the value vectors of every sample into attention vectors, one per query,
  using the batched attention weights.

  Argument:
    values (torch.Tensor): Has a shape of [N, Ts, C]. The vectors used to form the attention vectors.
    attention_weight (torch.Tensor): Has a shape of [N, Ts, Tt]. The weight of each value vector
                      in each attention vector.
                      attention_weight[n, s, t] is the weight for values[n, s] that corresponds
                      to the query queries[n, t].

  Output:
    attention_vector (torch.Tensor): Weighted sum of values using the attention weight.
                                     Has a shape of [N, Tt, C]

  NOTE: This is an exercise stub; it currently returns None.

  TODO: Complete this function using torch.bmm (torch.mm only accepts 2-dim tensors)
  '''
  
  return

# `keys` is passed as the values here; the bare name displays the result in the notebook.
att_out = get_batch_weighted_sum(keys, attention_weight)
att_out

In [None]:
# Test cases for get_batch_weighted_sum.
#
# The first three asserts re-validate `att_score`, the input of the pipeline. They
# were previously the only checks in this cell, so `att_out` was never inspected.
answer = torch.Tensor([ 4.9620, -9.6091, -4.9472,  1.4543, -5.6273,  9.1436,  1.4172,  0.0464,
        -5.7033,  4.5473,  7.7498,  1.3405, -3.1877,  2.8759])
answer2 = torch.Tensor([[ 2.5171,  0.6216,  3.7929,  2.6163,  5.3290,  0.3592,  2.3067, -0.1099,
         1.8963,  0.4175, -1.4283,  1.4388, -2.7825, -1.3690, -1.9615, -1.9514,
        -6.4635,  1.9574,  0.1868,  8.5354,  4.6053,  2.8786, -2.1453]])
assert att_score.ndim == 3 and att_score.shape == torch.Size([num_b, num_ts, num_tt]), 'Check the output shape'
assert torch.max(torch.abs(att_score[2,4] - answer)) < 1e-4, 'Calculated result is wrong'
assert torch.max(torch.abs(att_score[3,:,2] - answer2)) < 1e-4,  'Calculated result is wrong'

# Checks on the actual output of get_batch_weighted_sum: shape [N, Tt, C] ...
assert att_out.ndim == 3 and att_out.shape == torch.Size([num_b, num_tt, h_size]), 'Check the output shape'
# ... and one spot check: att_out[n, t] must be the weighted sum of keys[n] with
# the weights attention_weight[n, :, t].
spot_reference = (attention_weight[2, :, 5].unsqueeze(1) * keys[2]).sum(0)
assert torch.max(torch.abs(att_out[2, 5] - spot_reference)) < 1e-4, 'Calculated result is wrong'

print("Passed all the cases!")

## Problem 3: Make seq2seq with attention (14 pts)
- Using Pre-defined `TranslatorBi` class, complete a new `TranslatorAtt` class
- If you implement it correctly, you can translate Korean sentences into English using the pre-trained weights

### 3-0 Prepare dataset and tokenizer

In [None]:
'''
Download dataset (originally from NIA AI-Hub)
'''

!gdown 1CpsqOuuuB3I_PG5DbuqH1ssCFVerU46g
!unzip -q nia-aihub-korean-english.zip

In [None]:
dataset_dir = Path('nia_korean_english')

# Only the first two xlsx files (in sorted path order) are used in the assignment.
data_list = sorted(dataset_dir.glob('*.xlsx'))[:2]
df = pd.concat([pd.read_excel(xlsx_path) for xlsx_path in data_list], axis=0)

### Download Pretrained Weights, and Tokenizers
To use the pretrained model correctly, you can use the pretrained vocabularies

In [None]:
!gdown 1lTo32Z9euLMSD1L1krgORceay9f-UU--
!unzip nlp_assignment4.zip

In [None]:
'''
for path in data_list:
  df = pd.read_excel(path)
  kor_text_path = path.parent / (path.stem+'_kor.txt') 
  eng_text_path = path.parent / (path.stem+'_eng.txt') 
  with open(kor_text_path, 'w', encoding='utf8') as f:
      f.write('\n'.join(df['원문']))
  with open(eng_text_path, 'w', encoding='utf8') as f:
      f.write('\n'.join(df['번역문']))


# Train Tokenizer
tokenizer = BertWordPieceTokenizer(strip_accents=False, lowercase=False)

vocab_size    = 32000  # Number of maximum size of the vocabulary
limit_alphabet= 6000   
min_frequency = 5 

corpus_file   =  [str(path.parent / (path.stem + '_kor.txt')) for path in data_list]
output_dir   = Path('hugging_kor_partial_%d'%(vocab_size))
output_dir.mkdir(exist_ok=True)

tokenizer.train(files=corpus_file,
               vocab_size=vocab_size,
               min_frequency=min_frequency,
               limit_alphabet=limit_alphabet, 
               show_progress=True)

tokenizer.save_model(str(output_dir))

limit_alphabet= 200
corpus_file   =  [str(path.parent / (path.stem + '_eng.txt')) for path in data_list]
output_dir   = Path('hugging_eng_partial_%d'%(vocab_size))
output_dir.mkdir(exist_ok=True)

tokenizer.train(files=corpus_file,
               vocab_size=vocab_size,
               min_frequency=min_frequency,
               limit_alphabet=limit_alphabet, 
               show_progress=True)

tokenizer.save_model(str(output_dir))
'''

In [None]:
# Load the pre-trained source (Korean) and target (English) vocabularies from the
# local directories extracted from the downloaded archive.
# NOTE(review): `lowercase` is the argument name used by BertWordPieceTokenizer;
# BertTokenizerFast documents `do_lower_case` instead, so this kwarg may have no
# effect here - confirm. It is left untouched because the pre-trained weights
# depend on the tokenization produced by this exact call.
src_tokenizer = BertTokenizerFast.from_pretrained('hugging_kor_32000',
                                                       strip_accents=False,
                                                       lowercase=False) 
tgt_tokenizer = BertTokenizerFast.from_pretrained('hugging_eng_32000',
                                                       strip_accents=False,
                                                       lowercase=False) 


In [None]:
class TranslationSet:
  '''
  Map-style dataset of (source, target) sentence pairs.

  Sentences are read from the '원문' (source) and '번역문' (target) columns of `df`
  and are tokenized lazily, one pair per __getitem__ call.
  '''
  def __init__(self, df, src_tokenizer, tgt_tokenizer):
    # Column 0 holds the source sentence, column 1 the target sentence.
    self.data = df[['원문', '번역문']].values
    self.src_tokenizer, self.tgt_tokenizer = src_tokenizer, tgt_tokenizer

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    '''Return the idx-th pair as two LongTensors of token ids: (source, target).'''
    src_str, tgt_str = self.data[idx, 0], self.data[idx, 1]

    # Convert each sentence into a list of token ids.
    token_ids = (self.src_tokenizer.encode(src_str), self.tgt_tokenizer.encode(tgt_str))

    return tuple(torch.LongTensor(ids) for ids in token_ids)
  
entireset = TranslationSet(df, src_tokenizer, tgt_tokenizer)

# 90% train / 5% validation / remainder test, with a fixed generator seed so the
# split is reproducible.
# (A fixed-size alternative would be [360000, 20000, 20000].)
num_train = int(len(entireset)*0.9)
num_valid = int(len(entireset)*0.05)
num_test = len(entireset) - num_train - num_valid
trainset, validset, testset = torch.utils.data.random_split(
  entireset,
  [num_train, num_valid, num_test],
  generator=torch.Generator().manual_seed(42),
)

print(f'Dataset Item Example: {entireset[0]}')
print(f'Length of split : Train {len(trainset)}, Valid {len(validset)}, Test {len(testset)}')

def pack_collate(raw_batch):
  '''
  Collate a list of (source_ids, target_ids) pairs into three PackedSequences.

  Returns:
    (sources, decoder_inputs, decoder_labels), where decoder_inputs is every target
    without its last token and decoder_labels is every target without its first
    token (i.e. the inputs shifted left by one position).
  '''
  def _pack(sequences):
    # Sequences come in arbitrary length order, so sorting is not enforced.
    return pack_sequence(sequences, enforce_sorted=False)

  targets = [sample[1] for sample in raw_batch]
  sources = _pack([sample[0] for sample in raw_batch])
  decoder_inputs = _pack([target[:-1] for target in targets])
  decoder_labels = _pack([target[1:] for target in targets])
  return sources, decoder_inputs, decoder_labels

# Options shared by every loader: pack variable-length samples, pin host memory.
_loader_options = dict(collate_fn=pack_collate, pin_memory=True)

# Training loaders shuffle and use worker processes; evaluation loaders do neither.
single_loader = DataLoader(trainset, batch_size=1, shuffle=True, num_workers=4, **_loader_options)
train_loader = DataLoader(trainset, batch_size=64, shuffle=True, num_workers=4, **_loader_options)
valid_loader = DataLoader(validset, batch_size=128, shuffle=False, num_workers=0, **_loader_options)
test_loader = DataLoader(testset, batch_size=128, shuffle=False, num_workers=0, **_loader_options)

# Peek at one training batch: (sources, decoder inputs, decoder labels).
batch = next(iter(train_loader))
batch

In [None]:
'''
Pre-defined class
'''


class Trainer:
  def __init__(self, model, optimizer, loss_fn, train_loader, valid_loader, device, model_name='nmt_model'):
    self.model = model
    self.optimizer = optimizer
    self.loss_fn = loss_fn
    self.train_loader = train_loader
    self.valid_loader = valid_loader
    
    self.model.to(device)
    
    self.grad_clip = 1.0
    self.best_valid_accuracy = 0
    self.device = device
    
    self.training_loss = []
    self.validation_loss = []
    self.validation_acc = []
    self.model_name = model_name

  def save_model(self, path):
    torch.save({'model':self.model.state_dict(), 'optim':self.optimizer.state_dict()}, path)
    
  def train_by_num_epoch(self, num_epochs):
    for epoch in tqdm(range(num_epochs)):
      self.model.train()
      for batch in tqdm(self.train_loader, leave=False):
        loss_value = self._train_by_single_batch(batch)
        self.training_loss.append(loss_value)
      self.model.eval()
      validation_loss, validation_acc = self.validate()
      self.validation_loss.append(validation_loss)
      self.validation_acc.append(validation_acc)
      
      if validation_acc > self.best_valid_accuracy:
        print(f"Saving the model with best validation accuracy: Epoch {epoch+1}, Acc: {validation_acc:.4f} ")
        self.save_model(f'{self.model_name}_best.pt')
      else:
        self.save_model(f'{self.model_name}_last.pt')
      self.best_valid_accuracy = max(validation_acc, self.best_valid_accuracy)

      
  def _train_by_single_batch(self, batch):
    '''
    This method updates self.model's parameter with a given batch
    
    batch (tuple): (batch_of_input_text, batch_of_label)
    
    You have to use variables below:
    
    self.model (Translator/torch.nn.Module): A neural network model
    self.optimizer (torch.optim.adam.Adam): Adam optimizer that optimizes model's parameter
    self.loss_fn (function): function for calculating BCE loss for a given prediction and target
    self.device (str): 'cuda' or 'cpu'

    output: loss (float): Mean binary cross entropy value for every sample in the training batch
    The model's parameters, optimizer's steps has to be updated inside this method
    '''
    
    src, tgt_i, tgt_o = batch
    pred = self.model(src.to(self.device), tgt_i.to(self.device))
    loss = self.loss_fn(pred.data, tgt_o.data)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip)
    optimizer.step()
    optimizer.zero_grad()
    
    return loss.item()

    
  def validate(self, external_loader=None):
    '''
    This method calculates accuracy and loss for given data loader.
    It can be used for validation step, or to get test set result
    
    input:
      data_loader: If there is no data_loader given, use self.valid_loader as default.
      
    output: 
      validation_loss (float): Mean Binary Cross Entropy value for every sample in validation set
      validation_accuracy (float): Mean Accuracy value for every sample in validation set
    '''
    
    ### Don't change this part
    if external_loader and isinstance(external_loader, DataLoader):
      loader = external_loader
      print('An arbitrary loader is used instead of Validation loader')
    else:
      loader = self.valid_loader
      
    self.model.eval()
    
    '''
    Write your code from here, using loader, self.model, self.loss_fn.
    '''
    validation_loss = 0
    validation_acc = 0
    num_total_tokens = 0
    with torch.no_grad():
      for batch in tqdm(loader, leave=False):
        
        src, tgt_i, tgt_o = batch
        pred = self.model(src.to(self.device), tgt_i.to(self.device))
        loss = self.loss_fn(pred.data, tgt_o.data)
        num_tokens = tgt_i.data.shape[0]
        validation_loss += loss.item() * num_tokens
        num_total_tokens += num_tokens
        
        acc = torch.sum(torch.argmax(pred.data, dim=-1) == tgt_o.to(self.device).data)
        validation_acc += acc.item()
        
    return validation_loss / num_total_tokens, validation_acc / num_total_tokens

def get_cross_entropy_loss(predicted_prob_distribution, indices_of_correct_token):
  '''
  Mean negative log-likelihood of the correct tokens.

  For a PackedSequence, pass its flat `.data`, so the input is a 2D tensor:

  predicted_prob_distribution has a shape of [num_entire_tokens_in_the_batch x vocab_size]
  indices_of_correct_token has a shape of [num_entire_tokens_in_the_batch]

  The probabilities are used as given (no softmax is applied here).
  '''
  token_positions = torch.arange(len(predicted_prob_distribution))
  # Pick, for every token position, the probability assigned to the correct token.
  correct_token_probs = predicted_prob_distribution[token_positions, indices_of_correct_token]
  negative_log_likelihood = correct_token_probs.log().neg()
  return negative_log_likelihood.mean()

In [None]:
'''
Pre-defined class

You don't need to change this code
'''
class TranslatorBi(nn.Module):
  '''
  Seq2seq translator: bidirectional GRU encoder followed by a unidirectional GRU decoder.

  The two encoder directions are merged by averaging, both for the per-time-step
  hidden states and for the final hidden states handed to the decoder. Every method
  accepts either a padded tensor or a PackedSequence and returns the matching kind.
  '''
  def __init__(self, src_tokenizer, tgt_tokenizer, hidden_size=256, num_layers=3):
    super().__init__()
    self.src_tokenizer, self.tgt_tokenizer = src_tokenizer, tgt_tokenizer
    self.src_vocab_size = src_tokenizer.vocab_size
    self.tgt_vocab_size = tgt_tokenizer.vocab_size

    # Sub-modules are created in the same order as before so that seeded
    # initialisation and the state-dict layout stay unchanged.
    self.src_embedder = nn.Embedding(self.src_vocab_size, hidden_size)
    self.tgt_embedder = nn.Embedding(self.tgt_vocab_size, hidden_size)

    gru_options = dict(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
    self.encoder = nn.GRU(bidirectional=True, **gru_options)
    self.decoder = nn.GRU(**gru_options)

    self.decoder_proj = nn.Linear(hidden_size, self.tgt_vocab_size)

  @staticmethod
  def _repack(data, like):
    '''Wrap `data` in a PackedSequence that reuses the batch layout of `like`.'''
    return PackedSequence(data, like.batch_sizes, like.sorted_indices, like.unsorted_indices)

  def run_encoder(self, x):
    '''
    Embed and encode the source batch.

    Returns (hidden states per time step, last hidden state per layer), both with the
    forward and backward directions averaged.
    '''
    packed = isinstance(x, PackedSequence)
    emb_x = self._repack(self.src_embedder(x.data), x) if packed else self.src_embedder(x)

    enc_hidden_state_by_t, last_hidden = self.encoder(emb_x)

    # A bidirectional GRU returns (num_layers * 2) last hidden states. Average the
    # left-to-right and right-to-left states to get (num_layers) of them.
    batch_size = last_hidden.shape[1]
    last_hidden_sum = last_hidden.reshape(self.encoder.num_layers, 2, batch_size, -1).mean(dim=1)

    if packed:
      by_direction = enc_hidden_state_by_t.data.reshape(-1, 2, last_hidden_sum.shape[-1])
      enc_hidden_state_by_t = self._repack(by_direction.mean(1), x)
    else:
      by_direction = enc_hidden_state_by_t.reshape(x.shape[0], x.shape[1], 2, -1)
      enc_hidden_state_by_t = by_direction.mean(dim=2)

    return enc_hidden_state_by_t, last_hidden_sum

  def run_decoder(self, y, last_hidden_state):
    '''Embed the target batch and decode it, starting from `last_hidden_state`.'''
    if isinstance(y, PackedSequence):
      emb_y = self._repack(self.tgt_embedder(y.data), y)
    else:
      emb_y = self.tgt_embedder(y)
    return self.decoder(emb_y, last_hidden_state)

  def forward(self, x, y):
    '''
    x (torch.Tensor or PackedSequence): Batch of source sentences
    y (torch.Tensor or PackedSequence): Batch of target sentences

    Returns the softmax distribution over the target vocabulary for every target
    position, as a tensor or PackedSequence matching the decoder output.
    '''
    _, last_hidden_sum = self.run_encoder(x)
    out, _ = self.run_decoder(y, last_hidden_sum)

    if not isinstance(out, PackedSequence):
      return torch.softmax(self.decoder_proj(out), dim=-1)

    probs = torch.softmax(self.decoder_proj(out.data), dim=-1)
    return self._repack(probs, y)



### Problem 3.1: Complete the Seq2Seq with Attention
- **Caution**: You have to concatenate [decoder_hidden_state; attention_out] for this implementation
    - You can use different order of concatenation, but the pre-trained model used that specific order, so please follow it so that you can use the pre-trained weight correctly

In [None]:
class TranslatorAtt(TranslatorBi):
  '''
  Translator that adds an attention step on top of TranslatorBi.

  The output projection takes the concatenation [decoder_hidden_state; attention_out],
  so its input width is hidden_size * 2.
  get_attention_vector() and the end of forward() are exercise stubs to be completed.
  '''
  def __init__(self, src_tokenizer, tgt_tokenizer, hidden_size=512, num_layers=3):
    super().__init__(src_tokenizer, tgt_tokenizer, hidden_size, num_layers)
    
    # TODO: define new self.decoder_proj
    self.decoder_proj = nn.Linear(hidden_size * 2, self.tgt_vocab_size)
    
  def get_attention_vector(self, encoder_hidden_states, decoder_hidden_states, mask):
    '''
    Arguments:
      encoder_hidden_states (torch.Tensor or PackedSequence): Encoder hidden states returned by run_encoder()
      decoder_hidden_states (torch.Tensor or PackedSequence): Decoder hidden states returned by run_decoder()
      mask (torch.Tensor): Source-side mask built in forward(), with shape [N, Ts].
        It is all ones for a Tensor input, and (padded source tokens != 0) for a PackedSequence input.
    Outputs:
      attention_vectors (torch.Tensor or PackedSequence)
    
    TODO: Complete this function
    If the inputs are PackedSequence, the output has to be a PackedSequence
    Use torch.nn.utils.rnn.pad_packed_sequence(packed_sequence, batch_first=True)
    '''
    is_packed = isinstance(encoder_hidden_states, PackedSequence)
    
    # Write your code from here

    return 
  
  def forward(self, x, y):
    '''
    Arguments:
      x (torch.Tensor or PackedSequence): Batch of source sentences
      y (torch.Tensor or PackedSequence): Batch of target sentences
    Output:
      prob_dist (torch.Tensor or PackedSequence): Batch of probability distribution of word for target sentence
    
    TODO: Complete this function
    '''

    is_packed = isinstance(x, PackedSequence)
    enc_hidden_state_by_t, last_hidden_sum = self.run_encoder(x)
    dec_hidden_state_by_t, decoder_last_hidden = self.run_decoder(y, last_hidden_sum)
    
    if is_packed:
      # Positions whose padded token id is 0 are treated as padding
      mask = pad_packed_sequence(x, batch_first=True)[0] != 0
    else:
      # Build the mask on the same device as x, so that it can be combined with
      # tensors computed from x when the model runs on GPU
      mask = torch.ones(x.shape[0], x.shape[1], device=x.device)
    attention_vec = self.get_attention_vector(enc_hidden_state_by_t, dec_hidden_state_by_t, mask)

    # Write your code from here
    # For the concatenation, you have to concat [dec_hidden_state_by_t; attention_vec], not [attention_vec; dec_hidden_state_by_t]
    return
# Quick smoke test: a small model run on one batch
model = TranslatorAtt(
  src_tokenizer=src_tokenizer,
  tgt_tokenizer=tgt_tokenizer,
  hidden_size=32,
  num_layers=2,
)

model(x=batch[0], y=batch[1])

#### Test your model
- To evaluate your implementation, you have to load the pretrained weight of the same model.
- If your implementation is correct, the resulting value would be the same

In [None]:
# Build the full-size model and restore the pretrained parameters on CPU
model = TranslatorAtt(src_tokenizer, tgt_tokenizer, hidden_size=512)
state_dict = torch.load('nmt_attention_512_grad1_lr1e-4_best.pt', map_location='cpu')['model']
model.eval()
model.load_state_dict(state_dict)

# Pre-computed inputs and reference outputs used by the test cells below
prob3_values = torch.load('assignment_4_values.pt')
single_batch_example = prob3_values['single_test_batch']
packed_batch_example = prob3_values['packed_test_batch']
correct_single_out = prob3_values['single_test_out']
correct_packed_out = prob3_values['packed_test_out']

In [None]:
'''
Test Case for Single-size Batch
'''
single_out = model(x=single_batch_example[0], y=single_batch_example[1])

# The output type has to mirror the input type, and the values have to match the reference
assert isinstance(single_out, torch.Tensor), "The output of model for Tensor has to be Tensor"
assert (single_out - correct_single_out).abs().max() < 1e-5, "The output value is different from the expected"

In [None]:
'''
Test Case for Batch with PackedSequence
'''
packed_out = model(packed_batch_example[0], packed_batch_example[1])

assert isinstance(packed_out, PackedSequence), "The output of model for PackedSequence has to be PackedSequence"
# Every check below inspects packed_out, the output computed in this cell
assert (packed_out.batch_sizes == correct_packed_out.batch_sizes).all(), "Output's batch_sizes is wrong"
assert (packed_out.sorted_indices == correct_packed_out.sorted_indices).all(), "Output's sorted_indices is wrong"

assert torch.max(torch.abs(packed_out.data - correct_packed_out.data)) < 1e-5,  "The output value is different from the expected"


### Train the model (Optional)
- You can try to train your model, or you can simply load the pretrained weights instead

In [None]:
# Fresh full-size model and its Adam optimizer
model = TranslatorAtt(src_tokenizer, tgt_tokenizer, hidden_size=512)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

trainer = Trainer(model, optimizer, get_cross_entropy_loss, train_loader, valid_loader, 'cuda', 'nmt_attention_512')

### Problem 3.2: Implement Inference with Attention Weights
- In this problem, you have to implement inference code that returns not only the translation of a given source sentence, but also the **attention weights** between the source sentence and the target sentence

In [None]:
def translate(model, source_sentence):
  '''
  Translate a single sentence step by step and collect the attention weight of every decoding step.

  NOTE: This is an exercise skeleton. The TODO section inside the loop has to define
  `selected_token` and `att_weight`, which the code after it reads. As written, nothing
  visible here updates `current_decoder_token` or `current_hidden`, so that is presumably
  also the job of the TODO section.

  Arguments:
    model (TranslatorAtt): Translator model with attention
    source_sentence (str): Sentence to translate

  Returns:
    input_tokens (list): Source sentence in a list of token in token_id
    predicted_tokens (list): Translated sentence in a list of token in token_id
    decoded_string (str): Translated sentence in string
    attention_map (torch.Tensor): Attention weight between each token of source sentence and target sentence. Has a shape of [Ts, Tt]
    
  '''
  
  input_tokens = model.src_tokenizer.encode(source_sentence)
  input_tensor = torch.LongTensor(input_tokens).unsqueeze(0)  # add a batch dimension of size 1
  mask = torch.ones_like(input_tensor)  # single sentence without padding, so every position is kept
  enc_hidden_state_by_t, last_hidden_sum = model.run_encoder(input_tensor)
  
  # Setup for 0th step
  current_hidden = last_hidden_sum
  current_decoder_token = torch.LongTensor([[2]]) # start of sentence token
  total_output = []
  total_attetion_weights = []

  for i in range(100): # You can change it to while True:
    emb = model.tgt_embedder(current_decoder_token)
    '''
    TODO: Complete the code here
    
    You have to 
      1) run decoder rnn for a single step
      2) get attention weight (variable name: att_weight) and attention vector.
         att_weight.shape == torch.Size([1, len(tokenized_sentence), 1])
      3) concat decoder out and attention vector
      4) calculate probabilty logit (variable name: logit)
    '''


    # Stop before recording anything for the end-of-sentence step
    if current_decoder_token == 3: ## end of sentence token
      break
    total_output.append(selected_token[0])
    total_attetion_weights.append(att_weight[0,:,0])  # keep only the source-length axis for this step
  predicted_tokens = torch.cat(total_output, dim=0).tolist()
  # Stack the per-step weight vectors along a new second axis -> [Ts, Tt]
  attention_map = torch.stack(total_attetion_weights, dim=1)
  
  return  input_tokens, predicted_tokens, model.tgt_tokenizer.decode(predicted_tokens), attention_map

# translate() builds its tensors on CPU, so keep the model there as well
model.cpu()
input_tokens, pred_tokens, translated_string, att_weights = translate(
  model=model,
  source_sentence='이 알고리즘을 사용하면 한국어 단어와 영어 단어가 어떻게 연결되는지를 알 수 있습니다.',
)
print(translated_string)

### Plot attention map
- If you completed `translate()`, you can visualize the result of attention weight as below

In [None]:
# Scale the figure with the number of target (x) and source (y) tokens
plt.figure(figsize=(0.8 * len(pred_tokens), 0.8 * len(input_tokens)))
# Decode every token id on its own so that each axis tick gets one token string
x_axis_label = list(map(model.tgt_tokenizer.decode, pred_tokens))
y_axis_label = list(map(model.src_tokenizer.decode, input_tokens))

plt.imshow(att_weights.detach())
plt.xticks(ticks=range(len(x_axis_label)), labels=x_axis_label, fontsize=15, rotation=45)
plt.yticks(ticks=range(len(y_axis_label)), labels=y_axis_label, fontsize=15)
None

## Problem 4: Self Attention (8 pts)
- In this problem, you will implement the key-query-value calculation that was used for Transformer
- Also, you have to implement simple self-attention (without multiheaded attention)

In [None]:
def get_key_query_value(input_tensor, kqv_layer):
  '''
  This function returns key, query, and value that are calculated from input_tensor using kqv_layer.

  NOTE: This is an exercise stub. The body is left for you to implement; as written it returns None.

  Arguments:
    input_tensor (torch.Tensor): Has a shape of [N, T, C]
    kqv_layer (torch.nn.Linear): Linear layer with in_features=C and out_features=Cn * 3
    
  Outputs:
    keys (torch.Tensor): Has a shape of [N, T, Cn]
    queries (torch.Tensor): Has a shape of [N, T, Cn]
    values (torch.Tensor): Has a shape of [N, T, Cn]
    
  Hint: Use torch.chunk() to split a tensor into given number of chunks
  '''
  return 

# Fixed seed: the test cell compares against values pre-computed from this exact setup
torch.manual_seed(0)
test = torch.randn((4, 17, 8))
linear = nn.Linear(in_features=8, out_features=16 * 3)
keys, queries, values = get_key_query_value(input_tensor=test, kqv_layer=linear)

In [None]:
'''
Test cases
'''
answer = torch.Tensor([-0.6166,  0.2079, -0.0225, -0.2324,  0.0254,  0.0093,  0.2242, -0.4207,
         0.1735, -0.3859,  0.1021, -0.4263,  0.6088,  0.2397,  0.7548,  0.0349])
answer2 = torch.Tensor([[ 0.8704, -0.2256,  0.6611,  0.0332, -0.5233, -0.1159,  0.1805,  0.7238,
         0.5590,  0.7260,  1.3096,  0.2465,  1.1961,  0.1751, -0.9674,  0.6297]])
# All three outputs have to be rank-3 tensors of the same expected shape
assert all(t.ndim == 3 for t in (keys, queries, values))
assert all(t.shape == torch.Size([4, 17, 16]) for t in (keys, queries, values))
# No element may coincide between any two of the three outputs
assert not ((keys == queries).any() or (keys == values).any() or (values == queries).any())
# Spot-check single positions against the pre-computed values
assert (queries[2, 13] - answer).abs().max() < 1e-4
assert (values[0, 3] - answer2).abs().max() < 1e-4

print('Passed all the cases!')

In [None]:
def get_self_attention(input_tensor, kqv_layer, mask):
  '''
  This function returns the output of self-attention for a given input tensor, using the given kqv_layer

  NOTE: This is an exercise stub. The body is left for you to implement; as written it returns None.
  
  Arguments:
    input_tensor (torch.Tensor): Has a shape of [N, T, C]
    kqv_layer (torch.nn.Linear): Linear layer with in_features=C and out_features=Cn * 3
    mask (torch.Tensor): Mask over time steps. The example call that follows this function passes
      a [N, T] tensor of ones with some trailing positions set to 0 (presumably 0 marks the
      positions to ignore -- confirm against get_masked_softmax())
    
  Outputs:
    output (torch.Tensor): Has a shape of [N, T, Cn]

  TODO: Complete this function using your completed functions of below:
        get_attention_score_for_a_batch_multiple_query()
        get_masked_softmax()
        get_batch_weighted_sum()
        get_key_query_value()
  '''
  return


# Fixed seed: the test cell compares against values pre-computed from this exact setup
torch.manual_seed(0)
test = torch.randn((4, 17, 8))
linear = nn.Linear(in_features=8, out_features=16 * 3)
# [N, T] mask of ones; the tail of samples 2 and 3 is zeroed out
mask = torch.ones_like(test)[..., 0]
mask[2, 4:] = 0
mask[3, 14:] = 0

att_vecs = get_self_attention(input_tensor=test, kqv_layer=linear, mask=mask)

In [None]:
'''
Test cases
'''
answer = torch.Tensor([-0.3316,  0.1992,  0.1699, -0.3703, -0.2126, -0.0147,  0.1185, -0.2360,
         0.2283,  0.1729,  0.0460,  0.1587,  0.1891,  0.4584, -0.3860,  0.0854])
answer2 = torch.Tensor([-0.9989,  0.4320,  0.0282, -0.6165, -0.0183,  0.1410,  0.6790, -1.3118,
         0.1059, -0.7182, -0.5426,  0.1642, -0.6460,  0.8397,  0.4638,  0.1082])
# Check the self-attention output itself: type, rank and the documented [N, T, Cn] shape
assert isinstance(att_vecs, torch.Tensor), "The output of get_self_attention has to be a Tensor"
assert att_vecs.ndim == 3
assert att_vecs.shape == torch.Size([4, 17, 16])
# Spot-check single positions against the pre-computed values
assert torch.max(torch.abs(att_vecs[3, 2]-answer)) < 1e-4
assert torch.max(torch.abs(att_vecs[0, 11]-answer2)) < 1e-4


print('Passed all the cases!')