In [76]:
!pip -q install transformers
!pip install koreanize-matplotlib



In [77]:
import torch
import torch.nn as nn
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import koreanize_matplotlib
import numpy as np

from torch.nn.utils.rnn import PackedSequence, pad_sequence, pack_sequence, pad_packed_sequence, pack_padded_sequence
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast

import os

# Below helps to run tokenizer with multiprocessing
os.environ["TOKENIZERS_PARALLELISM"] = "true"

### Vectorization

In [78]:
'''example of vectorization of dot product for the two different sequence length'''
e_states = torch.randn(100, 16)  # (input size, hidden size)
d_states = torch.rand(80, 16)  # (output size, hidden size)

dot_product = torch.mm(e_states, d_states.permute(1,0))  # (100, 16) x (16, 80) = (100, 80)
dot_product

tensor([[-2.2765, -0.0343, -4.8722,  ..., -3.9217, -4.8852, -2.4770],
        [ 0.5759,  1.3688,  1.4649,  ...,  1.4977,  2.5791, -0.4470],
        [ 4.3742,  3.6217,  2.2555,  ...,  5.5789,  3.7755,  2.9377],
        ...,
        [-0.8756, -2.6582, -3.0858,  ..., -1.9052, -4.6562, -3.1356],
        [-0.0086, -2.1049, -1.0701,  ..., -0.7043,  0.0869, -1.4096],
        [-4.2150, -3.2144, -5.7856,  ...,  1.0245, -3.6171, -3.2521]])

Q.) output sequence의 길이가 정해져있나?

## 1. Implement attention for a single batch


#### 1-1. Get attention score

In [79]:
def get_attention_score_for_a_single_query(keys, query):
    '''
    Compute dot-product attention scores of a single query against every key.

    'keys' can be regarded as the encoder hidden states over time, and 'query'
    as the decoder hidden state at one specific time step.

    Arguments: keys (torch.Tensor): Has a shape of [T, C]
               query (torch.Tensor): Has a shape of [C]

    Output: attention_score (torch.Tensor): Has a shape of [T];
            attention_score[i] is the dot product between keys[i] and query.
    '''
    query_column = query.unsqueeze(-1)             # [C] -> [C, 1] so that torch.mm accepts it
    score_column = torch.mm(keys, query_column)    # [T, C] x [C, 1] -> [T, 1]
    return score_column.squeeze(-1)                # drop the singleton column -> [T]


torch.manual_seed(0)
num_t = 23
h_size = 16

keys = torch.randn(num_t, h_size)
query = torch.randn(h_size)

att_score = get_attention_score_for_a_single_query(keys, query)
att_score

tensor([-3.0786,  2.1729,  1.7950, -5.0503,  3.3254,  0.2828, -0.9800, -1.8868,
         0.2550,  2.9389, -0.1799, -1.0586,  0.1465, -0.9441,  0.8888, -3.8108,
        -2.5662, -1.1660, -2.2327,  2.7087, -0.5800,  8.7984,  4.3816])

#### 1-2. Get attention weight

In [80]:
def get_attention_weight_from_score(attention_score):
    '''
    Convert attention scores into attention weights with a softmax.

    Argument: attention_score (torch.Tensor): Has a shape of [T]

    Output: attention_weight (torch.Tensor): Has a shape of [T]; non-negative and sums to 1.
    '''
    assert attention_score.ndim == 1
    # Normalize the *argument*. Reading a notebook-level variable here would silently
    # ignore the input and make the result depend on which cells ran before.
    att_weight = torch.softmax(attention_score, dim=0)

    return att_weight

att_weight = get_attention_weight_from_score(att_score)
att_weight

tensor([6.7782e-06, 1.2936e-03, 8.8653e-04, 9.4370e-07, 4.0957e-03, 1.9541e-04,
        5.5277e-05, 2.2321e-05, 1.9005e-04, 2.7829e-03, 1.2303e-04, 5.1099e-05,
        1.7052e-04, 5.7296e-05, 3.5821e-04, 3.2593e-06, 1.1314e-05, 4.5893e-05,
        1.5795e-05, 2.2107e-03, 8.2463e-05, 9.7556e-01, 1.1777e-02])

#### 1-3. Get attention vector

In [81]:
def get_weighted_sum(values, attention_weight):
    '''
    Combine the value vectors into one attention vector using the given weights.

    Argument: values (torch.Tensor): Has a shape of [T, C]
              attention_weight (torch.Tensor): Has a shape of [T]

    Output: attention_vector (torch.Tensor): Has a shape of [C];
            the sum over t of attention_weight[t] * values[t].
    '''
    values_by_channel = values.t()                     # [T, C] -> [C, T]
    weight_column = attention_weight.unsqueeze(-1)     # [T] -> [T, 1]
    summed = torch.mm(values_by_channel, weight_column)  # [C, T] x [T, 1] -> [C, 1]
    return summed.squeeze(-1)

att_vec = get_weighted_sum(keys, att_weight) # In simple dot-product-attention, key and value are the same
att_vec

tensor([ 0.6280,  3.8540, -0.1042,  0.3148,  0.3711, -0.5095, -0.9663,  1.3295,
         1.9003, -1.2611, -2.2939, -2.0338,  0.8757, -0.6726,  1.9071, -1.0711])

## 2. Implement attention in Batch

#### 2-1. Get attention score in batch

In [82]:
def get_attention_score_for_a_batch_query(keys, query):
    '''
    Batched dot-product attention scores: one query per sequence in the batch.

    'keys' can be regarded as the encoder hidden states over time, and 'query'
    as the decoder hidden state at one specific time step.

    Arguments: keys (torch.Tensor): Has a shape of [N, T, C]
               query (torch.Tensor): Has a shape of [N, C]

    Output: attention_score (torch.Tensor): Has a shape of [N, T]
            attention_score[n, i] is the dot product between keys[n, i] and query[n]
    '''
    query_columns = query.unsqueeze(-1)               # [N, C] -> [N, C, 1]
    score_columns = torch.bmm(keys, query_columns)    # [N, T, C] x [N, C, 1] -> [N, T, 1]
    return score_columns.squeeze(-1)

torch.manual_seed(0)
num_b = 6
num_t = 23
h_size = 16

keys = torch.randn(num_b,num_t, h_size)
query = torch.randn(num_b, h_size)
out = get_attention_score_for_a_batch_query(keys, query)

#### 2-2. Get attention score in batch for multiple queries
- Implement the same function, but for a batch of multiple queries per sequence

In [83]:
def get_attention_score_for_a_batch_multiple_query(keys, queries):
    '''
    Batched dot-product attention scores for several queries per sequence.

    'keys' can be regarded as the encoder hidden states over time, and 'queries'
    as the decoder hidden states over time.

    Arguments: keys (torch.Tensor): Has a shape of [N, Ts, C].
               queries (torch.Tensor): Has a shape of [N, Tt, C].

    Output: attention_score (torch.Tensor): Has a shape of [N, Ts, Tt]
            attention_score[n, i, t] is the dot product between keys[n, i] and queries[n, t]
    '''
    # Swap the time and channel axes of the queries so that the channel axis C is
    # the one contracted by the batched matrix product.
    queries_channel_first = queries.transpose(1, 2)    # [N, Tt, C] -> [N, C, Tt]
    return torch.bmm(keys, queries_channel_first)      # [N, Ts, C] x [N, C, Tt] -> [N, Ts, Tt]

torch.manual_seed(0)
num_b = 6
num_ts = 23
num_tt = 14
h_size = 16

keys = torch.randn(num_b, num_ts, h_size)
queries = torch.randn(num_b, num_tt, h_size)
att_score = get_attention_score_for_a_batch_multiple_query(keys, queries)

att_score

tensor([[[  6.1462,   2.7917,   3.2981,  ...,  -1.7558,  -1.9945,   1.7817],
         [ -2.0761,   1.5621,  -4.6314,  ...,  -2.9616,   5.0151,  -0.5098],
         [ -3.7923,   0.6755,  -2.5517,  ...,  -6.6489,   2.2012,   0.5882],
         ...,
         [  2.9819,  12.6860,   6.7435,  ...,   3.5522,  -7.0258,   2.3800],
         [ -5.4682,  -2.9139,  -0.3054,  ...,   6.4960,  -1.4581, -12.5525],
         [ -1.0037,   1.1092,   1.3248,  ...,   2.8827,   3.8804,  -5.4968]],

        [[ -2.9907,  -0.1470,  -0.1703,  ...,   2.4992,  -1.8304,   1.1768],
         [-15.4518,   2.2430,   4.9486,  ...,   4.4271,  -4.3865,  -9.2907],
         [  4.2723,  -0.6171,   2.6252,  ...,  -2.2281,  -2.5648,  -4.1481],
         ...,
         [  1.3169,  -1.1141,  -1.6058,  ...,   1.9466,   2.6665,  -4.1625],
         [ -7.9481,   5.0494,   0.7725,  ...,   0.5016,  -3.3123,  -7.8802],
         [  1.8112,  -3.9315,   1.6521,  ...,  -0.2215,  -0.1541,  -6.4050]],

        [[ -3.0427,   1.6135,  -0.4640,  ...

#### 2-3. Get masked softmax

In [84]:
def get_masked_softmax(attention_score, mask):
    '''
    Softmax over the source axis that ignores padded source positions.

    Sequences in a batch can have different lengths and are padded to a common
    length; padded positions must not receive any attention weight.

    Arguments: attention_score (torch.Tensor): Has a shape of [N, Ts, Tt]
               mask (torch.Tensor): Has a shape of [N, Ts].
                                    mask[n, s] is non-zero for a valid position and 0 for a padded one.

    Output: attention_weight (torch.Tensor): Has a shape of [N, Ts, Tt]
                                             attention_weight[n, i, t] is the weight of values[n, i] for queries[n, t]
    '''
    assert attention_score.ndim == 3 and mask.ndim == 2

    is_padding = (mask == 0).unsqueeze(-1)    # [N, Ts] -> [N, Ts, 1], broadcast over Tt
    # A score of -inf becomes exactly zero weight after the softmax.
    scores_without_padding = attention_score.masked_fill(is_padding, float('-inf'))
    return torch.softmax(scores_without_padding, dim=1)


'''
Don't change this codes
'''
mask = torch.ones_like(att_score)[..., 0]
mask[4, 15:] = 0
mask[5, 17:] = 0
att_score_modified = att_score.clone()
att_score_modified[4, 15:] = 0
attention_weight = get_masked_softmax(att_score, mask)
attention_weight_for_modified = get_masked_softmax(att_score_modified, mask)
attention_weight, attention_weight_for_modified

(tensor([[[8.5095e-02, 5.0265e-05, 2.6409e-02,  ..., 1.8632e-04,
           4.1884e-05, 1.4554e-03],
          [2.2856e-05, 1.4698e-05, 9.5063e-06,  ..., 5.5794e-05,
           4.6370e-02, 1.4716e-04],
          [4.1080e-06, 6.0563e-06, 7.6066e-05,  ..., 1.3970e-06,
           2.7809e-03, 4.4123e-04],
          ...,
          [3.5946e-03, 9.9608e-01, 8.2803e-01,  ..., 3.7625e-02,
           2.7350e-07, 2.6474e-03],
          [7.6878e-07, 1.6724e-07, 7.1906e-04,  ..., 7.1448e-01,
           7.1609e-05, 8.6639e-10],
          [6.6794e-05, 9.3445e-06, 3.6707e-03,  ..., 1.9264e-02,
           1.4910e-02, 1.0046e-06]],
 
         [[1.4556e-06, 7.2275e-05, 8.3620e-06,  ..., 7.2220e-02,
           2.6008e-04, 2.2393e-03],
          [5.6399e-12, 7.8883e-04, 1.3978e-03,  ..., 4.9651e-01,
           2.0184e-05, 6.3696e-08],
          [2.0765e-03, 4.5171e-05, 1.3690e-04,  ..., 6.3913e-04,
           1.2479e-04, 1.0902e-05],
          ...,
          [1.0810e-04, 2.7478e-05, 1.9901e-06,  ..., 4.155

#### 2-4. Implement weighted sum in batchified version

In [85]:
def get_batch_weighted_sum(values, attention_weight):
    '''
    Batched weighted sum of the value vectors, one result per query.

    Argument: values (torch.Tensor): Has a shape of [N, Ts, C]
              attention_weight (torch.Tensor): Has a shape of [N, Ts, Tt],
                                attention_weight[n, s, t] is the weight of values[n, s] for the query queries[n, t]

    Output: attention_vector (torch.Tensor): Has a shape of [N, Tt, C]
    '''
    weight_per_query = attention_weight.transpose(1, 2)    # [N, Ts, Tt] -> [N, Tt, Ts]
    return torch.bmm(weight_per_query, values)             # [N, Tt, Ts] x [N, Ts, C] -> [N, Tt, C]

att_out = get_batch_weighted_sum(keys, attention_weight)
att_out

tensor([[[-3.3373e-01, -2.0938e+00, -4.5334e-02,  ...,  9.5458e-01,
           1.0229e+00,  1.1447e+00],
         [-1.3264e-01, -5.9684e-02,  4.0668e-01,  ..., -2.2277e+00,
           1.4654e+00, -1.2140e+00],
         [-2.1483e-01, -3.9624e-02,  3.4638e-01,  ..., -1.8784e+00,
           1.3381e+00, -1.0378e+00],
         ...,
         [ 4.7269e-01,  2.7082e+00, -1.5966e-01,  ..., -8.5101e-01,
           1.6295e+00, -6.9686e-01],
         [ 1.3049e+00, -7.9595e-01,  5.8837e-01,  ..., -4.2750e-01,
           9.4185e-02, -6.0422e-01],
         [-2.1447e-01,  4.5967e-01,  9.0154e-01,  ..., -1.1010e+00,
           1.5383e+00, -6.2885e-01]],

        [[-2.0093e-02,  9.5785e-01, -1.0546e+00,  ..., -5.6727e-01,
           9.5331e-01, -1.4754e+00],
         [ 2.6579e-01, -9.2264e-01,  8.4929e-01,  ...,  2.4586e+00,
          -2.5894e+00,  2.3314e+00],
         [ 5.3756e-01, -9.0356e-01,  1.7743e-01,  ...,  2.2658e+00,
           3.0777e-01,  2.0951e+00],
         ...,
         [ 1.3244e-01,  8

## 3. Make seq2seq with attention
- Using Pre-defined `TranslatorBi` class, complete a new `TranslatorAtt` class

### 3-0. Prepare dataset and tokenizer

In [86]:
# download dataset (originally from NIA AI-Hub)
!gdown 13CGLEULYccogSLByHXPAxSveLZTtnj8c
!unzip nia_korean_english_csv.zip --q

Downloading...
From (original): https://drive.google.com/uc?id=13CGLEULYccogSLByHXPAxSveLZTtnj8c
From (redirected): https://drive.google.com/uc?id=13CGLEULYccogSLByHXPAxSveLZTtnj8c&confirm=t&uuid=6662b6d2-dbea-472e-980b-478f8fd091f6
To: /content/nia_korean_english_csv.zip
100% 190M/190M [00:02<00:00, 68.2MB/s]
Archive:  nia_korean_english_csv.zip
caution: filename not matched:  --q


In [87]:
!unzip --q 'nia_korean_english_csv.zip' -d '/content'

Archive:  nia_korean_english_csv.zip
replace /content/nia_korean_english.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/nia_korean_english.csv  y

replace /content/hugging_eng_32000/vocab.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename:   inflating: /content/hugging_eng_32000/vocab.txt  
replace /content/hugging_kor_32000/vocab.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [88]:
# Load the parallel corpus and the two tokenizers (source: 'hugging_kor_32000',
# target: 'hugging_eng_32000') from the extracted files in the working directory.
df = pd.read_csv('nia_korean_english.csv')

# NOTE(review): the documented BertTokenizerFast keyword is `do_lower_case`, not
# `lower_case`; `lower_case=False` below may therefore not disable lowercasing.
# Confirm against the setting the vocabularies were built with before changing it.
src_tokenizer = BertTokenizerFast.from_pretrained('hugging_kor_32000',
                                                  strip_accents=False,
                                                  lower_case=False)
tgt_tokenizer = BertTokenizerFast.from_pretrained('hugging_eng_32000',
                                                  strip_accents=False,
                                                  lower_case=False)

In [89]:
df.head()

Unnamed: 0,원문,번역문
0,'Bible Coloring'은 성경의 아름다운 이야기를 체험 할 수 있는 컬러링 ...,Bible Coloring' is a coloring application that...
1,씨티은행에서 일하세요?,Do you work at a City bank?
2,푸리토의 베스트셀러는 해외에서 입소문만으로 4차 완판을 기록하였다.,"PURITO's bestseller, which recorded 4th rough ..."
3,11장에서는 예수님이 이번엔 나사로를 무덤에서 불러내어 죽은 자 가운데서 살리셨습니다.,In Chapter 11 Jesus called Lazarus from the to...
4,"6.5, 7, 8 사이즈가 몇 개나 더 재입고 될지 제게 알려주시면 감사하겠습니다.",I would feel grateful to know how many stocks ...


In [90]:
class TranslationSet:
    '''
    Map-style dataset over a DataFrame of sentence pairs.

    Each item is a triple of LongTensors:
      (source ids, target ids without the last token, target ids without the first token)
    i.e. the decoder input and the same sequence shifted left by one as the label.
    '''

    def __init__(self, df, src_tokenizer, tgt_tokenizer):
        # df must provide the columns '원문' (source text) and '번역문' (target text).
        self.data = df
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        source_text, target_text = row['원문'], row['번역문']

        source_ids = self.src_tokenizer(source_text)['input_ids']
        target_ids = self.tgt_tokenizer(target_text)['input_ids']

        decoder_input = target_ids[:-1]    # everything except the final token
        decoder_label = target_ids[1:]     # same sequence shifted left by one
        return (
            torch.LongTensor(source_ids),
            torch.LongTensor(decoder_input),
            torch.LongTensor(decoder_label),
        )


# Build the full dataset and split it 90% / 5% / 5%. The test split takes the
# remainder so that the three sizes always add up to len(entire_set).
entire_set = TranslationSet(df, src_tokenizer, tgt_tokenizer)
num_total = len(entire_set)
num_train = int(num_total * 0.9)
num_valid = int(num_total * 0.05)
num_test = num_total - num_train - num_valid

# A dedicated seeded generator makes the split identical on every run, independent of
# how many random numbers earlier cells consumed from the global RNG.
train_set, valid_set, test_set = torch.utils.data.random_split(
    entire_set,
    [num_train, num_valid, num_test],
    generator=torch.Generator().manual_seed(0),
)

print(f"Data Item Example: {entire_set[0]}\n")
print(f"Length of split: {len(train_set)}, {len(valid_set)}, {len(test_set)} (train, valid, test)")

Data Item Example: (tensor([    2,    11,    70,  4665,  5209, 13306,    71, 12901,  9565, 12435,
           11,  3546, 14567,  4325,  8934,  8407,  7400,  4154,  3252,  6420,
        12985,  4996,  3397,  6461,    18,     3]), tensor([    2, 26268, 23067,    11,  1056,    69, 23067,  2803,  1067,  5155,
         1117,  1042,  2405,  4024,  5520,  1039,  1023, 26268,    18]), tensor([26268, 23067,    11,  1056,    69, 23067,  2803,  1067,  5155,  1117,
         1042,  2405,  4024,  5520,  1039,  1023, 26268,    18,     3]))

Length of split: 1442176, 80120, 80122 (train, valid, test)


In [96]:
def pack_collate(raw_batch):
    '''
    Collate a list of (source, target, shifted_target) tensor triples into three
    PackedSequences, one per field. Sequences do not need to be pre-sorted by length.
    '''
    sources, targets, shifted_targets = zip(*raw_batch)
    return tuple(
        pack_sequence(sequences, enforce_sorted=False)
        for sequences in (sources, targets, shifted_targets)
    )

single_loader = DataLoader(train_set, batch_size=1, collate_fn=pack_collate, shuffle=True, num_workers=4)
train_loader = DataLoader(train_set, batch_size=64, collate_fn=pack_collate, shuffle=True, num_workers=4)
valid_loader = DataLoader(valid_set, batch_size=128, collate_fn=pack_collate, shuffle=False, num_workers=0)
test_loader = DataLoader(test_set, batch_size=128, collate_fn=pack_collate, shuffle=True, num_workers=4)

batch = next(iter(train_loader))
batch



(PackedSequence(data=tensor([2, 2, 2,  ..., 3, 3, 3]), batch_sizes=tensor([64, 64, 64, 64, 64, 64, 64, 64, 64, 60, 57, 57, 56, 54, 51, 50, 49, 47,
         40, 37, 34, 34, 34, 33, 32, 30, 27, 25, 21, 20, 16, 15, 13, 13, 12, 11,
         11, 10,  8,  7,  6,  5,  5,  4,  4,  4,  2]), sorted_indices=tensor([19, 10, 63, 40, 56, 47,  2, 50, 42, 21, 61, 54, 20,  8, 29, 45,  6, 12,
         17, 37, 53, 39, 18, 22, 15,  3, 34, 27,  4, 35, 46, 30, 51, 60, 32, 52,
         33, 23, 14, 38, 16,  1, 59, 58, 55, 44, 43, 26,  0, 36,  5, 28,  9, 11,
         25, 13, 48, 57, 62, 31, 49, 41,  7, 24]), unsorted_indices=tensor([48, 41,  6, 25, 28, 50, 16, 62, 13, 52,  1, 53, 17, 55, 38, 24, 40, 18,
         22,  0, 12,  9, 23, 37, 63, 54, 47, 27, 51, 14, 31, 59, 34, 36, 26, 29,
         49, 19, 39, 21,  3, 61,  8, 46, 45, 15, 30,  5, 56, 60,  7, 32, 35, 20,
         11, 44,  4, 57, 43, 42, 33, 10, 58,  2])),
 PackedSequence(data=tensor([  2,   2,   2,  ...,  18, 269,   6]), batch_sizes=tensor([64, 64, 64,

In [None]:
class Trainer:
    '''
    Minimal training / validation loop for a sequence-to-sequence model.

    Every batch is a triple (src, tgt, shifted_tgt). The model is called as
    model(src, tgt) and its output is compared against shifted_tgt with loss_fn.
    Batches and model outputs may be plain tensors or PackedSequences.
    '''

    def __init__(self, model, optimizer, loss_fn, train_loader, valid_loader, device):
        self.model = model
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        # Stored because train_by_single_batch() and validate() move every batch to it.
        self.device = device

        self.model.to(device)

        self.best_valid_accuracy = 0
        self.training_loss = []      # one entry per training batch
        self.validation_loss = []    # one entry per epoch
        self.validation_acc = []     # one entry per epoch


    def save_model(self, path='kor_eng_translator_attention_model.pt'):
        '''Save the model and optimizer state dicts to `path`.'''
        torch.save({'model': self.model.state_dict(), 'optim': self.optimizer.state_dict()}, path)


    def train_by_num_epoch(self, num_epochs):
        '''
        Train for `num_epochs` epochs. After every epoch the model is validated,
        the metrics are recorded and the latest weights are saved.
        '''
        for epoch in tqdm(range(num_epochs)):
            self.model.train()
            with tqdm(self.train_loader, leave=False) as pbar:
                for batch in pbar:
                    loss = self.train_by_single_batch(batch)
                    self.training_loss.append(loss)
                    pbar.set_description(f"Epoch {epoch+1}, Loss {loss:.4f}")

            self.model.eval()
            validation_loss, validation_acc = self.validate()
            self.validation_loss.append(validation_loss)
            self.validation_acc.append(validation_acc)

            self.best_valid_accuracy = max(validation_acc, self.best_valid_accuracy)
            self.save_model('kor_eng_translator_attention_model_last.pt')


    def train_by_single_batch(self, batch):
        '''
        Run one optimization step.

        batch (tuple): (src, tgt, shifted_tgt)

        output: loss (float): value of loss_fn on this training batch
        '''
        src, tgt, shifted_tgt = batch
        src = src.to(self.device)
        tgt = tgt.to(self.device)
        shifted_tgt = shifted_tgt.to(self.device)

        output = self.model(src, tgt)

        if isinstance(output, PackedSequence):
            loss = self.loss_fn(output.data, shifted_tgt.data)
        else:
            loss = self.loss_fn(output, shifted_tgt)

        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()

        return loss.item()


    def validate(self, external_loader=None):
        '''
        Evaluate the model without updating it.

        input: external_loader (DataLoader, optional): evaluated instead of the
               validation loader when given

        output: validation_loss (float): loss averaged over every target token of the loader
                validation_accuracy (float): fraction of target tokens whose argmax prediction is correct
        '''
        if external_loader and isinstance(external_loader, DataLoader):
            loader = external_loader
            print('An arbitrary loader is used instead of Validation loader')
        else:
            loader = self.valid_loader

        self.model.eval()

        validation_loss = 0.0
        num_correct = 0
        num_data = 0

        with torch.inference_mode():
            for batch in loader:
                src, tgt, shifted_tgt = batch
                src = src.to(self.device)
                tgt = tgt.to(self.device)
                shifted_tgt = shifted_tgt.to(self.device)

                output = self.model(src, tgt)

                if isinstance(output, PackedSequence):
                    pred, target = output.data, shifted_tgt.data
                else:
                    pred, target = output, shifted_tgt

                loss = self.loss_fn(pred, target)
                num_tokens = target.numel()

                # loss_fn is treated as a per-token mean, so it is weighted by the batch's
                # token count before accumulating; dividing by num_data below then gives a
                # true per-token average instead of (sum of batch means) / (token count).
                validation_loss += loss.item() * num_tokens
                num_correct += (pred.argmax(dim=-1) == target).sum().item()
                num_data += num_tokens

        return validation_loss / num_data, num_correct / num_data



    def nll_loss(self, output, target, eps=1e-8):
        '''
        Mean negative log-likelihood of the correct tokens.

        output: predicted probability distribution, [num_tokens x vocab_size]
                (a [N, T, vocab_size] tensor is flattened to 2D first)
        target: indices of the correct tokens, [num_tokens] (a [N, T] tensor is flattened first)
        eps: added inside the log to avoid log(0)
        '''
        if output.ndim == 3:
            output = output.flatten(0, 1)
        if target.ndim == 2:
            target = target.flatten(0, 1)
        assert output.ndim == 2
        assert target.ndim == 1
        return -torch.log(output[torch.arange(len(target)), target] + eps).mean()



In [92]:
def nll_loss(pred, target, eps=1e-8):
  '''
  Mean negative log-likelihood of the correct tokens.

  pred: predicted probability distribution, [num_tokens x vocab_size]
        (for a PackedSequence, pass its 2D .data; a 3D [N, T, vocab_size] tensor is flattened)
  target: indices of the correct tokens, [num_tokens] (a 2D [N, T] tensor is flattened)
  eps: added inside the log to avoid log(0)
  '''
  flat_pred = pred.flatten(end_dim=1) if pred.ndim == 3 else pred
  flat_target = target.flatten() if target.ndim == 2 else target
  assert flat_pred.ndim == 2
  assert flat_target.ndim == 1

  # Pick, for every token, the probability assigned to its correct class.
  row_index = torch.arange(flat_target.shape[0])
  prob_of_correct = flat_pred[row_index, flat_target]
  return -(prob_of_correct + eps).log().mean()


In [93]:
'''
Pre-defined class

You don't need to change this code
'''
class TranslatorBi(nn.Module):
  '''
  Seq2seq translator: bidirectional GRU encoder, unidirectional GRU decoder.

  The decoder is initialized with the encoder's final hidden states (averaged over
  the two directions) and its outputs are projected to a probability distribution
  over the target vocabulary. Inputs may be padded tensors or PackedSequences.
  '''

  def __init__(self, src_tokenizer, tgt_tokenizer, hidden_size=256, num_layers=3):
    super().__init__()
    self.src_tokenizer = src_tokenizer
    self.tgt_tokenizer = tgt_tokenizer

    self.src_vocab_size = self.src_tokenizer.vocab_size
    self.tgt_vocab_size = self.tgt_tokenizer.vocab_size

    self.src_embedder = nn.Embedding(self.src_vocab_size, hidden_size)
    self.tgt_embedder = nn.Embedding(self.tgt_vocab_size, hidden_size)

    # Encoder and decoder share every GRU setting except the direction.
    gru_settings = dict(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
    self.encoder = nn.GRU(bidirectional=True, **gru_settings)
    self.decoder = nn.GRU(**gru_settings)

    self.decoder_proj = nn.Linear(hidden_size, self.tgt_vocab_size)

  @staticmethod
  def _embed(embedder, tokens):
    '''Apply an embedding layer to a tensor or, element-wise, to a PackedSequence.'''
    if not isinstance(tokens, PackedSequence):
      return embedder(tokens)
    return PackedSequence(
      embedder(tokens.data),
      batch_sizes=tokens.batch_sizes,
      sorted_indices=tokens.sorted_indices,
      unsorted_indices=tokens.unsorted_indices,
    )

  def run_encoder(self, x):
    '''
    Encode the source batch.

    Returns (hidden states per time step, final hidden states), both averaged over
    the forward and backward GRU so that their feature size is hidden_size.
    '''
    states_by_t, final_states = self.encoder(self._embed(self.src_embedder, x))

    # A bidirectional GRU returns (num_layers * 2) final states; average each
    # forward/backward pair to obtain (num_layers) states for the decoder.
    batch_size = final_states.shape[1]
    last_hidden_sum = final_states.reshape(self.encoder.num_layers, 2, batch_size, -1).mean(dim=1)

    if isinstance(x, PackedSequence):
      direction_mean = states_by_t.data.reshape(-1, 2, last_hidden_sum.shape[-1]).mean(1)
      states_by_t = PackedSequence(direction_mean, x.batch_sizes, x.sorted_indices, x.unsorted_indices)
    else:
      states_by_t = states_by_t.reshape(x.shape[0], x.shape[1], 2, -1).mean(dim=2)

    return states_by_t, last_hidden_sum

  def run_decoder(self, y, last_hidden_state):
    '''Decode the target batch starting from `last_hidden_state`; returns (outputs, final hidden states).'''
    return self.decoder(self._embed(self.tgt_embedder, y), last_hidden_state)

  def forward(self, x, y):
    '''
    x (torch.Tensor or PackedSequence): Batch of source sentences
    y (torch.Tensor or PackedSequence): Batch of target sentences

    Returns the per-token probability distribution over the target vocabulary,
    as a PackedSequence when the decoder output is packed, otherwise as a tensor.
    '''
    _, decoder_init = self.run_encoder(x)
    out, _ = self.run_decoder(y, decoder_init)

    if not isinstance(out, PackedSequence):
      return torch.softmax(self.decoder_proj(out), dim=-1)

    token_probs = torch.softmax(self.decoder_proj(out.data), dim=-1)
    return PackedSequence(
      token_probs,
      batch_sizes=y.batch_sizes,
      sorted_indices=y.sorted_indices,
      unsorted_indices=y.unsorted_indices,
    )


In [94]:
class TranslatorAtt(TranslatorBi):
  '''
  TranslatorBi extended with dot-product attention over the encoder hidden states.

  For every decoder time step an attention vector (weighted sum of encoder states)
  is concatenated to the decoder hidden state before the vocabulary projection.
  '''

  def __init__(self, src_tokenizer, tgt_tokenizer, hidden_size=512, num_layers=3):
    super().__init__(src_tokenizer, tgt_tokenizer, hidden_size, num_layers)

    # The projection input is [decoder hidden state; attention vector], hence 2 * hidden_size.
    self.decoder_proj = nn.Linear(hidden_size * 2, self.tgt_vocab_size)

  def get_attention_vector(self, encoder_hidden_states, decoder_hidden_states, mask):
    '''
    Arguments:
      encoder_hidden_states (torch.Tensor or PackedSequence): Hidden states of encoder GRU. Shape: [N, Ts, C]
      decoder_hidden_states (torch.Tensor or PackedSequence): Hidden states of decoder GRU. Shape: [N, Tt, C]
      mask (torch.Tensor): Masking tensor. Where the mask value is 0, the attention weight is zero. Shape: [N, Ts]

    Outputs:
      attention_vectors (torch.Tensor or PackedSequence): Attention vectors with the same shape as
                                decoder_hidden_states; a PackedSequence if the inputs were packed
      attention_weights (torch.Tensor): Zero-padded attention weights. Shape: [N, Ts, Tt]
    '''
    is_packed = isinstance(encoder_hidden_states, PackedSequence)
    if is_packed:
      encoder_hidden_states, source_lens = pad_packed_sequence(encoder_hidden_states, batch_first=True)
      decoder_hidden_states, target_lens = pad_packed_sequence(decoder_hidden_states, batch_first=True)

    # 1. Dot-product score between every encoder state (key) and decoder state (query): [N, Ts, Tt]
    attention_score = get_attention_score_for_a_batch_multiple_query(encoder_hidden_states, decoder_hidden_states)
    # 2. Softmax over the source axis, with padded source positions forced to zero weight
    attention_weight = get_masked_softmax(attention_score, mask)
    # 3. Weighted sum of encoder states for every decoder step: [N, Tt, C]
    attention_vec = get_batch_weighted_sum(encoder_hidden_states, attention_weight)

    if is_packed:
      # Padded decoder steps have all-zero queries and would otherwise receive uniform
      # weights; zero them so the returned weights are zero-padded along Tt as well.
      target_steps = torch.arange(attention_weight.shape[2], device=attention_weight.device)
      is_real_step = target_steps.unsqueeze(0) < target_lens.to(attention_weight.device).unsqueeze(1)  # [N, Tt]
      attention_weight = attention_weight * is_real_step.unsqueeze(1)

      # Re-pack with the decoder lengths. The batch is not length-sorted, so
      # enforce_sorted=False is required; target_lens is already a CPU tensor.
      attention_vec = pack_padded_sequence(attention_vec, target_lens, batch_first=True, enforce_sorted=False)

    return attention_vec, attention_weight

  def forward(self, x, y):
    '''
    Arguments:
      x (torch.Tensor or PackedSequence): Batch of source sentences
      y (torch.Tensor or PackedSequence): Batch of target sentences
    Output:
      prob_dist (torch.Tensor or PackedSequence): Batch of probability distribution of word for target sentence
    '''

    is_packed = isinstance(x, PackedSequence)
    enc_hidden_state_by_t, last_hidden_sum = self.run_encoder(x)
    dec_hidden_state_by_t, decoder_last_hidden = self.run_decoder(y, last_hidden_sum)

    if is_packed:
      # pad_packed_sequence pads with 0, so non-zero ids mark the real source tokens.
      mask = pad_packed_sequence(x, batch_first=True)[0] != 0
    else:
      # Create the mask on the input's device so masking also works off-CPU.
      mask = torch.ones(x.shape[0], x.shape[1], device=x.device)

    attention_vec, attention_weight = self.get_attention_vector(enc_hidden_state_by_t, dec_hidden_state_by_t, mask)

    # Concatenation order matters: [dec_hidden_state_by_t; attention_vec].
    if is_packed:
      concat = torch.cat([dec_hidden_state_by_t.data, attention_vec.data], dim=-1)
      probs = torch.softmax(self.decoder_proj(concat), dim=-1)
      return PackedSequence(probs, batch_sizes=y.batch_sizes, sorted_indices=y.sorted_indices, unsorted_indices=y.unsorted_indices)

    concat = torch.cat([dec_hidden_state_by_t, attention_vec], dim=-1)
    return torch.softmax(self.decoder_proj(concat), dim=-1)


model = TranslatorAtt(src_tokenizer, tgt_tokenizer, hidden_size=32, num_layers=2)

model(batch[0], batch[1])

NameError: name 'batch' is not defined