In [1]:
! pip install einops sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Library

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import numpy as np
from einops import rearrange, reduce, repeat

from tqdm import tqdm

import time
import copy
from collections import defaultdict
import joblib
import gc
import os


In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
N = 2
HIDDEN_DIM = 256
NUM_HEAD = 8 
INNER_DIM = 512

PAD_IDX = 0
EOS_IDX = 3

Datasets

In [5]:
ddir = '/content/drive/MyDrive/lecture/datasets/korean-parallel-corpora/bible/'

src_train_path = os.path.join(ddir,'src_train.pkl')
src_valid_path = os.path.join(ddir,'src_valid.pkl')
trg_train_path = os.path.join(ddir,'trg_train.pkl')
trg_valid_path = os.path.join(ddir,'trg_valid.pkl')


In [6]:
src_train = joblib.load(src_train_path)
src_valid = joblib.load(src_valid_path)
trg_train = joblib.load(trg_train_path)
trg_valid = joblib.load(trg_valid_path)

In [None]:
print(src_valid)

In [7]:
VOCAB_SIZE = 10000
SEQ_LEN = 60
BATCH_SIZE = 64

Dataloads

In [8]:
class TrainDataset(Dataset):
    def __init__(self, src_data, trg_data):
        super().__init__()

        assert len(src_data) == len(trg_data)

        self.src_data = src_data
        self.trg_data = trg_data

    def __len__(self):
        return len(self.src_data)
        
    def __getitem__ (self, idx):
        src = self.src_data[idx]
        trg_input = self.trg_data[idx]
        trg_output = trg_input[1:SEQ_LEN]
        trg_output = np.pad(trg_output, (0,1), 'constant', constant_values =0)
  
        return torch.Tensor(src).long(), torch.Tensor(trg_input).long(), torch.Tensor(trg_output).long()

train_dataset = TrainDataset(src_train, trg_train)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle= True, pin_memory=True)

In [9]:
class ValidDataset(Dataset):
    def __init__(self, src_data, trg_data):
        super().__init__()

        assert len(src_data) == len(trg_data)

        self.src_data = src_data
        self.trg_data = trg_data

    def __len__(self):
        return len(self.src_data)
        
    def __getitem__ (self, idx):
        src = self.src_data[idx]
        trg_input = self.trg_data[idx]
        trg_output = trg_input[1:SEQ_LEN]
        trg_output = np.pad(trg_output, (0,1), 'constant',constant_values= 0)

        return torch.Tensor(src).long(), torch.Tensor(trg_input).long(), torch.Tensor(trg_output).long()

valid_dataset = ValidDataset(src_valid, trg_valid)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle= False, pin_memory=True)

Transformer

In [10]:
class Transformer(nn.Module):
    def __init__(self, N = 2, hidden_dim = 256, num_head = 8, inner_dim = 512):
        super().__init__()
        self.encoder = Encoder(N, hidden_dim, num_head, inner_dim)
        self.decoder = Decoder(N, hidden_dim, num_head, inner_dim)

    def forward(self, enc_src, dec_src):

        enc_output = self.encoder(enc_src)
        logits, output = self.decoder(dec_src, enc_src, enc_output)

        return logits, output

Encoder

In [11]:
class Encoder(nn.Module):
    def __init__ (self, N, hidden_dim, num_head, inner_dim,max_length=100):
        super().__init__()

        # N : number of encoder layer repeated 
        self.N = N
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.embedding = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=hidden_dim, padding_idx=0)
        self.pos_embedding = nn.Embedding(max_length, hidden_dim)
        self.enc_layers = nn.ModuleList([EncoderLayer(hidden_dim, num_head, inner_dim) for _ in range(N)])

        self.dropout = nn.Dropout(p=0.1)

    def forward(self, input):
        
        batch_size = input.shape[0]
        seq_len = input.shape[1]


        mask = makeMask(input, option='padding')

        pos = torch.arange(0, seq_len).unsqueeze(0).repeat(batch_size, 1).to(device)

        output = self.dropout(self.embedding(input) + self.pos_embedding(pos))

        # Dropout
        output = self.dropout(output)

        # N encoder layer
        for layer in self.enc_layers:
            output = layer(output, mask)


        return output

In [12]:
class EncoderLayer(nn.Module):
    def __init__(self, hidden_dim, num_head, inner_dim):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim
        
        self.multiheadattention = Multiheadattention(hidden_dim, num_head)
        self.ffn = FFN(hidden_dim, inner_dim)
        self.layerNorm1 = nn.LayerNorm(hidden_dim)
        self.layerNorm2 = nn.LayerNorm(hidden_dim)


        self.dropout1 = nn.Dropout(p=0.1)
        self.dropout2 = nn.Dropout(p=0.1)


    def forward(self, input, mask = None):

        output = self.multiheadattention(srcQ= input, srcK = input, srcV = input, mask = mask)
        output = self.dropout1(output)
        output = input + output
        output = self.layerNorm1(output)

        output_ = self.ffn(output)
        output_ = self.dropout2(output_)
        output = output + output_
        output = self.layerNorm2(output)

        return output

class Multiheadattention

In [13]:
class Multiheadattention(nn.Module):
    def __init__(self, hidden_dim: int, num_head: int):
        super().__init__()

        # embedding_dim, d_model, 512 in paper
        self.hidden_dim = hidden_dim
        # 8 in paper
        self.num_head = num_head
        # head_dim, d_key, d_query, d_value, 64 in paper (= 512 / 8)
        self.head_dim = hidden_dim // num_head
        self.scale = torch.sqrt(torch.FloatTensor()).to(device)

        self.fcQ = nn.Linear(hidden_dim, hidden_dim)
        self.fcK = nn.Linear(hidden_dim, hidden_dim)
        self.fcV = nn.Linear(hidden_dim, hidden_dim)
        self.fcOut = nn.Linear(hidden_dim, hidden_dim)

        self.dropout = nn.Dropout(0.1)


    def forward(self, srcQ, srcK, srcV, mask=None):

        ##### SCALED DOT PRODUCT ATTENTION ######

        Q = self.fcQ(srcQ)
        K = self.fcK(srcK)
        V = self.fcV(srcV)

        Q = rearrange(
            Q, 'bs seq_len (num_head head_dim) -> bs num_head seq_len head_dim', num_head=self.num_head)
        K_T = rearrange(
            K, 'bs seq_len (num_head head_dim) -> bs num_head head_dim seq_len', num_head=self.num_head)
        V = rearrange(
            V, 'bs seq_len (num_head head_dim) -> bs num_head seq_len head_dim', num_head=self.num_head)
        
        attention_energy = torch.matmul(Q, K_T)

        if mask is not None :
 
            attention_energy = torch.masked_fill(attention_energy, (mask == 0), -1e+4)
            
        attention_energy = torch.softmax(attention_energy, dim = -1)

        result = torch.matmul(self.dropout(attention_energy),V)

        ##### END OF SCALED DOT PRODUCT ATTENTION ######

        # CONCAT
        result = rearrange(result, 'bs num_head seq_len head_dim -> bs seq_len (num_head head_dim)')

        result = self.fcOut(result)

        return result

FFN

In [14]:
class FFN(nn.Module):
    def __init__ (self, hidden_dim, inner_dim):
        super().__init__()
 
        self.hidden_dim = hidden_dim

        self.inner_dim = inner_dim 

        self.fc1 = nn.Linear(hidden_dim, inner_dim)
        self.fc2 = nn.Linear(inner_dim, hidden_dim)
        self.relu = nn.ReLU(inplace=False)
        self.dropout = nn.Dropout(0.1)
   
    def forward(self, input):
        output = input
        output = self.fc1(output)
        output2 = self.relu(output)
        output2 = self.dropout(output)
        output3 = self.fc2(output2)

        return output3

Decoder

In [15]:
class Decoder(nn.Module):
    def __init__ (self, N, hidden_dim, num_head, inner_dim, max_length=100):
        super().__init__()

        # N : number of encoder layer repeated 
        self.N = N
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.embedding = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=hidden_dim, padding_idx=0)
        self.pos_embedding = nn.Embedding(max_length, hidden_dim)

        self.dec_layers = nn.ModuleList([DecoderLayer(hidden_dim, num_head, inner_dim) for _ in range(N)])

        self.dropout = nn.Dropout(p=0.1)
        
        self.finalFc = nn.Linear(hidden_dim, VOCAB_SIZE)


    def forward(self, input, enc_src, enc_output):

        
        lookaheadMask = makeMask(input, option= 'lookahead')
        paddingMask = makeMask(enc_src, option = 'padding')

        # embedding layer
        output = self.embedding(input)

        # Dropout
        output = self.dropout(output)

        for layer in self.dec_layers:
            output = layer(output, enc_output, paddingMask, lookaheadMask)

        logits = self.finalFc(output)

        output = torch.softmax(logits, dim = -1)

        output = torch.argmax(output, dim = -1)


        return logits, output

In [16]:
class DecoderLayer(nn.Module):
    def __init__(self, hidden_dim, num_head, inner_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.inner_dim = inner_dim

        self.multiheadattention1 = Multiheadattention(hidden_dim, num_head)
        self.layerNorm1 = nn.LayerNorm(hidden_dim)
        self.multiheadattention2 = Multiheadattention(hidden_dim, num_head)
        self.layerNorm2 = nn.LayerNorm(hidden_dim)
        self.ffn = FFN(hidden_dim, inner_dim)
        self.layerNorm3 = nn.LayerNorm(hidden_dim)

        self.dropout1 = nn.Dropout(p=0.1)
        self.dropout2 = nn.Dropout(p=0.1)
        self.dropout3 = nn.Dropout(p=0.1)

    
    def forward(self, input, enc_output, paddingMask, lookaheadMask):

        # first multiheadattention
        output = self.multiheadattention1(input, input, input, lookaheadMask)
        output = self.dropout1(output)
        output = output + input
        output = self.layerNorm1(output)


        # second multiheadattention
        output_ = self.multiheadattention2(output, enc_output, enc_output, paddingMask)
        output_ = self.dropout2(output_)
        output = output_ + output
        output = self.layerNorm2(output)


        # Feedforward Network
        output_ = self.ffn(output)
        output_ = self.dropout3(output_)
        output = output + output_
        output = self.layerNorm3(output)



        return output

In [17]:

def makeMask(tensor, option: str) -> torch.Tensor:
  
    if option == 'padding':
        tmp = torch.full_like(tensor, fill_value=PAD_IDX).to(device)
       
        mask = (tensor != tmp).float()
        
        mask = rearrange(mask, 'bs seq_len -> bs 1 1 seq_len ')

    elif option == 'lookahead':

        padding_mask = makeMask(tensor, 'padding')
        padding_mask = repeat(
            padding_mask, 'bs 1 1 k_len -> bs 1 new k_len', new=padding_mask.shape[3])
        
        mask = torch.ones_like(padding_mask)
        mask = torch.tril(mask)

        mask = mask * padding_mask
        
    return mask

Model 생성

In [18]:
model = Transformer(N, HIDDEN_DIM, NUM_HEAD, INNER_DIM).to(device)

In [19]:
optimizer = torch.optim.Adam(params = model.parameters(), lr = 1e-4, weight_decay = 0)

def criterion(logits: torch.tensor, targets: torch.tensor):
    return nn.CrossEntropyLoss(ignore_index=PAD_IDX)(logits.view(-1,VOCAB_SIZE), targets.view(-1))

Training Function

In [20]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):

    model.train()

    dataset_size = 0
    running_loss = 0
    running_accuracy = 0
    accuracy = 0

    bar = tqdm(enumerate(dataloader), total=len(dataloader))

    for step, (src, trg_input, trg_output) in bar:
        src = src.to(device)
        trg_input = trg_input.to(device)
        trg_output = trg_output.to(device)

        batch_size = src.shape[0]

        logits, output = model(enc_src=src, dec_src=trg_input)
        loss = criterion(logits, trg_output)

        loss.backward()
    
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)  
     
        optimizer.step()

        # zero the parameter gradients
        optimizer.zero_grad()

        # change learning rate by Scheduler
        if scheduler is not None:
            scheduler.step()

        running_loss += loss.item() * batch_size
        running_accuracy = np.mean(
            output.view(-1).detach().cpu().numpy() == trg_output.view(-1).detach().cpu().numpy())

        accuracy += running_accuracy

        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size

        bar.set_postfix(
            Epoch=epoch, Train_Loss=epoch_loss, LR=optimizer.param_groups[0]["lr"], accuracy=accuracy / np.float(
                step+1)
        )

    accuracy /= len(dataloader)

    gc.collect()

    return epoch_loss, accuracy

Validation Frunction

In [21]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()

    dataset_size = 0
    running_loss = 0
    accuracy = 0

    bar = tqdm(enumerate(dataloader), total=len(dataloader))

    for step, (src, trg_input, trg_output) in bar:
        src = src.to(device)
        trg_input = trg_input.to(device)
        trg_output = trg_output.to(device)

        batch_size = src.shape[0]

        logits, output = model(enc_src = src, dec_src = trg_input)
        loss = criterion(logits, trg_output)

        running_loss += loss.item() * batch_size
        dataset_size += batch_size

     
        val_loss = running_loss / dataset_size
        running_accuracy = np.mean(output.view(-1).detach().cpu().numpy() == trg_output.view(-1).detach().cpu().numpy())
        
        accuracy += running_accuracy

        bar.set_postfix(
            Epoch=epoch, Valid_Loss=val_loss, LR=optimizer.param_groups[0]["lr"], accuracy = accuracy / np.float(step + 1)
        )

    accuracy /= len(dataloader)

    gc.collect()

    return val_loss, accuracy

Run

In [22]:
def run_training(
    model,
    optimizer,
    scheduler,
    device,
    num_epochs,
    metric_prefix="",
    file_prefix="",
    early_stopping=True,
    early_stopping_step=10,
):

    if torch.cuda.is_available():
        print("[INFO] Using GPU:{}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = np.inf
    history = defaultdict(list)
    early_stop_counter = 0

    for epoch in range(1, num_epochs + 1):
        gc.collect()

        train_epoch_loss, train_accuracy = train_one_epoch(
            model,
            optimizer,
            scheduler,
            dataloader= train_dataloader,
            device=device,
            epoch=epoch,
        )

        val_loss, val_accuracy = valid_one_epoch(
            model, valid_dataloader, device=device, epoch=epoch
        )

        history[f"{metric_prefix}Train Loss"].append(train_epoch_loss)
        history[f"{metric_prefix}Train Accuracy"].append(train_accuracy)
        history[f"{metric_prefix}Valid Loss"].append(val_loss)
        history[f"{metric_prefix}Valid Accuracy"].append(val_accuracy)


        print(f"Valid Loss : {val_loss}")

        if val_loss <= best_loss:
            early_stop_counter = 0

            print(
                f"Validation Loss improved( {best_loss} ---> {val_loss}  )"
            )

            # Update Best Loss
            best_loss = val_loss
            
            best_model_wts = copy.deepcopy(model.state_dict())

            PATH = "{}epoch{:.0f}_Loss{:.4f}.bin".format(file_prefix, epoch, best_loss)
            torch.save(model.state_dict(), PATH)
            torch.save(model.state_dict(), f"{file_prefix}best_{epoch}epoch.bin")

            print(f"Model Saved")

        elif early_stopping:
            early_stop_counter += 1
            if early_stop_counter > early_stopping_step:
                break
        
    end = time.time()
    time_elapsed = end - start
    print(
        "Training complete in {:.0f}h {:.0f}m {:.0f}s".format(
            time_elapsed // 3600,
            (time_elapsed % 3600) // 60,
            (time_elapsed % 3600) % 60,
        )
    )
    print("Best Loss: {:.4f}".format(best_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history


Training 실행

In [24]:
run_training(
    model = model,
    optimizer = optimizer,
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=100, eta_min=1e-5),
    device = device,
    num_epochs = 2000,
    metric_prefix="",
    file_prefix="",
    early_stopping=True,
    early_stopping_step=10,
)

[INFO] Using GPU:Tesla T4



Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  7%|▋         | 26/389 [00:01<00:25, 14.01it/s, Epoch=1, LR=7.55e-5, Train_Loss=8.27, accuracy=0.0414]


KeyboardInterrupt: ignored

In [81]:
torch.save(model.state_dict(), 'final.bin')

Load text data

In [25]:
import pandas as pd

In [26]:
DATASET_PATH = '/content/drive/MyDrive/lecture/datasets/korean-parallel-corpora/bible/'

In [27]:
en_train = open(os.path.join(DATASET_PATH, 'bible-all.en.txt'))
en_train_content = en_train.read()

en_train_list = en_train_content.split('\n')

In [28]:
ko_train = open(os.path.join(DATASET_PATH, 'bible-all.kr.txt'))
ko_train_content = ko_train.read()

ko_train_list = ko_train_content.split('\n')

In [29]:
en_train_list[:10]

['Genesis1.1  In the beginning God created the heavens and the earth.',
 'Genesis1.2  Now the earth was formless and empty, darkness was over the surface of the deep, and the Spirit of God was hovering over the waters.',
 'Genesis1.3  And God said, "Let there be light," and there was light.',
 'Genesis1.4  God saw that the light was good, and He separated the light from the darkness.',
 'Genesis1.5  God called the light "day," and the darkness he called "night." And there was evening, and there was morning--the first day.',
 'Genesis1.6  And God said, "Let there be an expanse between the waters to separate water from water."',
 'Genesis1.7  So God made the expanse and separated the water under the expanse from the water above it. And it was so.',
 'Genesis1.8  God called the expanse "sky." And there was evening, and there was morning--the second day.',
 'Genesis1.9  And God said, "Let the water under the sky be gathered to one place, and let dry ground appear." And it was so.',
 'Genes

In [30]:
data = pd.DataFrame()
data['en_raw'] = en_train_list
data['ko_raw'] = ko_train_list

In [31]:
data.head()

Unnamed: 0,en_raw,ko_raw
0,Genesis1.1 In the beginning God created the h...,Genesis1.1 태초에 하나님이 천지를 창조하셨다.
1,Genesis1.2 Now the earth was formless and emp...,"Genesis1.2 땅이 혼돈하고 공허하며, 어둠이 깊음 위에 있고, 하나님의 영..."
2,"Genesis1.3 And God said, ""Let there be light,...","Genesis1.3 하나님이 말씀하시기를 ""빛이 생겨라"" 하시니, 빛이 생겼다."
3,"Genesis1.4 God saw that the light was good, a...","Genesis1.4 그 빛이 하나님 보시기에 좋았다. 하나님이 빛과 어둠을 나누셔서,"
4,"Genesis1.5 God called the light ""day,"" and th...","Genesis1.5 빛을 낮이라고 하시고, 어둠을 밤이라고 하셨다. 저녁이 되고 ..."


In [32]:
len(data)

31104

In [33]:
data = data.reset_index(drop = True)
data.head()

Unnamed: 0,en_raw,ko_raw
0,Genesis1.1 In the beginning God created the h...,Genesis1.1 태초에 하나님이 천지를 창조하셨다.
1,Genesis1.2 Now the earth was formless and emp...,"Genesis1.2 땅이 혼돈하고 공허하며, 어둠이 깊음 위에 있고, 하나님의 영..."
2,"Genesis1.3 And God said, ""Let there be light,...","Genesis1.3 하나님이 말씀하시기를 ""빛이 생겨라"" 하시니, 빛이 생겼다."
3,"Genesis1.4 God saw that the light was good, a...","Genesis1.4 그 빛이 하나님 보시기에 좋았다. 하나님이 빛과 어둠을 나누셔서,"
4,"Genesis1.5 God called the light ""day,"" and th...","Genesis1.5 빛을 낮이라고 하시고, 어둠을 밤이라고 하셨다. 저녁이 되고 ..."


In [34]:
data['en'] = data['en_raw'].apply(lambda x: x.split(' ')[1:])
data['en'] = data['en'].apply(lambda x: (' ').join(x))
data['ko'] = data['ko_raw'].apply(lambda x: x.split(' ')[1:])
data['ko'] = data['ko'].apply(lambda x: (' ').join(x))

In [35]:
data = data[['en','ko']]
data.head()

Unnamed: 0,en,ko
0,In the beginning God created the heavens and ...,태초에 하나님이 천지를 창조하셨다.
1,"Now the earth was formless and empty, darknes...","땅이 혼돈하고 공허하며, 어둠이 깊음 위에 있고, 하나님의 영은 물 위에 움직이고..."
2,"And God said, ""Let there be light,"" and there...","하나님이 말씀하시기를 ""빛이 생겨라"" 하시니, 빛이 생겼다."
3,"God saw that the light was good, and He separ...","그 빛이 하나님 보시기에 좋았다. 하나님이 빛과 어둠을 나누셔서,"
4,"God called the light ""day,"" and the darkness ...","빛을 낮이라고 하시고, 어둠을 밤이라고 하셨다. 저녁이 되고 아침이 되니, 하루가..."


Load weight

In [36]:
WEIGHT_FILE = 'final.bin'
WEIGHT_PATH = '/content/drive/MyDrive/Storage/transformer_lecture/'

model = Transformer(N, HIDDEN_DIM, NUM_HEAD, INNER_DIM).to(device)
model.load_state_dict(torch.load(os.path.join(WEIGHT_PATH, WEIGHT_FILE), map_location=device))
model.eval()

Transformer(
  (encoder): Encoder(
    (embedding): Embedding(10000, 256, padding_idx=0)
    (pos_embedding): Embedding(100, 256)
    (enc_layers): ModuleList(
      (0): EncoderLayer(
        (multiheadattention): Multiheadattention(
          (fcQ): Linear(in_features=256, out_features=256, bias=True)
          (fcK): Linear(in_features=256, out_features=256, bias=True)
          (fcV): Linear(in_features=256, out_features=256, bias=True)
          (fcOut): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ffn): FFN(
          (fc1): Linear(in_features=256, out_features=512, bias=True)
          (fc2): Linear(in_features=512, out_features=256, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (layerNorm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (layerNorm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Drop

In [37]:
import sentencepiece as spm

SRC_MODEL_FILE = os.path.join(ddir,'src.model')
TRG_MODEL_FILE = os.path.join(ddir,'trg.model')

sp_src = spm.SentencePieceProcessor()
sp_src.Load(SRC_MODEL_FILE)
sp_trg = spm.SentencePieceProcessor()
sp_trg.Load(TRG_MODEL_FILE)

True

In [40]:
def predict(src_sentence):
  
    dec_sentence = ''

    enc_src = sp_src.EncodeAsIds(src_sentence)
    dec_src = []
    dec_src = np.insert(dec_src, 0, sp_trg.bos_id())

    enc_src = torch.Tensor(enc_src).view(1, -1).int().to(device)
    dec_src = torch.Tensor(dec_src).view(1, -1).int().to(device)
 
    last_token = None
    last_token_idx = 0

    while(True):

        enc_output = model.encoder(enc_src)

        dec_logits, dec_output = model.decoder(
            input=dec_src, enc_src=enc_src, enc_output=enc_output
        )

        last_token = dec_output[:, last_token_idx].item()
        last_token = torch.Tensor([last_token]).view(-1, 1).int().to(device)

        dec_src = torch.cat((dec_src, last_token), dim=-1)

        last_token_idx = last_token_idx + 1

        if last_token.item() is EOS_IDX:
            break

    return sp_trg.Decode(dec_src.tolist())

In [42]:
indices = np.random.choice(len(data['en']), 10, replace=False)
sentences = data['en'][indices].to_list()
answers = data['ko'][indices].to_list()

for idx in range(len(sentences)):
    sentence = sentences[idx]
    print(f'en = {sentence}')
    print(f'answer = {answers[idx]}')
    print(f'ko = {predict(sentence)}')

en =  They did not thirst when he led them through the deserts; he made water flow for them from the rock; he split the rock and water gushed out.
answer =  주께서 그들을 사막으로 인도하셨으나, 그들이 전혀 목마르지 않았다. 주께서는 바위에서 물을 내셔서 그들로 마시게 하셨고, 바위를 쪼개셔서 물이 솟아나게 하셨다.
ko = ['내가 보니, 야벳의 아들 세베소포타미아 사람 가운데서 으뜸이 되어, 나는 늘어나고 해서, 하나님께 영광을 돌면서, 하나님께 영광을 돌처럼 보이는 것은 무엇이든지 다 털어놓고, 하나님께 영광을 돌처럼 여 ⁇ 을 낳았다.']
en =  "I am an alien and a stranger among you. Sell me some property for a burial site here so I can bury my dead."
answer =  "나는 여러분 가운데서 나그네로, 떠돌이로 살고 있습니다. 죽은 나의 아내를 묻으려고 하는데, 무덤으로 쓸 땅을 여러분들에게서 좀 살 수 있게 해주시기를 바랍니다."
ko = ['"내가 어찌 그리도 저런한 말로만 해도, 내가 어찌 그리워하는구나" 하고 말할 수 있을까?"']
en =  Then the iron, the clay, the bronze, the silver and the gold were broken to pieces at the same time and became like chaff on a threshing floor in the summer. The wind swept them away without leaving a trace. But the rock that struck the statue became a huge mountain and filled the whole earth.
answer =  그 때에 쇠와 진흙과 놋쇠와 은과 금이 다 부서졌으며, 