In [3]:
!pip install 'tqdm' 'fsspec==2023.9.2' 'datasets==2.14.6' 'sentencepiece==0.1.97' 'sacrebleu==2.3.1'
# !pip install fsspec==2023.9.2
# !pip install datasets==2.14.6



In [4]:
import os
import sys
import math
import copy
import heapq
import datetime

from tqdm import tqdm
import numpy as np

import sacrebleu

import datasets

import sentencepiece as spm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [5]:
# Pick the best available accelerator: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

In [6]:
# Load the "iwslt2015-en-vi" configuration of the "mt_eng_vietnamese" dataset
# through the Hugging Face `datasets` library.
corpus = datasets.load_dataset("mt_eng_vietnamese", "iwslt2015-en-vi")

In [8]:
# "iwslt2015-en-vi" is a configuration name, not a dataset id: passing it alone makes
# load_dataset raise FileNotFoundError. It has to be combined with the
# "mt_eng_vietnamese" dataset name.
corpus = datasets.load_dataset("mt_eng_vietnamese", "iwslt2015-en-vi")

In [7]:
# Show the loaded DatasetDict (available splits, their features and row counts).
corpus

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 133318
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1269
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1269
    })
})

In [None]:
class DataPreparing:
    """Fetch the translation corpus once and dump it as line-aligned text files.

    For every split (``train``, ``validation``, ``test``) two files are written to
    ``save_data_dir``: ``<split>.<source_lang>`` and ``<split>.<target_lang>``.
    Line *k* of one file is the translation of line *k* of the other.

    Args:
        save_data_dir: Directory receiving the text files (created when missing,
            including any missing parent directories).
        source_lang: Key of the source sentence inside each ``translation`` record;
            also used as the file extension of the source files.
        target_lang: Key of the target sentence inside each ``translation`` record;
            also used as the file extension of the target files.
    """

    def __init__(self, save_data_dir, source_lang, target_lang):
        self.save_data_dir = save_data_dir
        self.source_lang = source_lang
        self.target_lang = target_lang

    def download_dataset(self):
        """Download and save every split unless ``save_data_dir`` already has content."""
        if not os.path.exists(self.save_data_dir):
            print('Create Foler')
            # makedirs rather than mkdir: the directory may be nested (for example
            # './transformer/data') and mkdir fails when a parent is missing.
            os.makedirs(self.save_data_dir, exist_ok=True)

        if len(os.listdir(self.save_data_dir)) != 0:
            print('Dataset exit!')
            return

        print('#1-Download Dataset')
        corpus = datasets.load_dataset("mt_eng_vietnamese", "iwslt2015-en-vi")

        print('#2-Save Dataset')
        for split in ['train', 'validation', 'test']:
            source_data, target_data = self.get_data(corpus[split])

            print('Source lang: {} - {}: {}'.format(self.source_lang, split, len(source_data)))
            print('Target lang: {} - {}: {}'.format(self.target_lang, split, len(target_data)))

            self.save_data(source_data, os.path.join(self.save_data_dir, split + '.' + self.source_lang))
            self.save_data(target_data, os.path.join(self.save_data_dir, split + '.' + self.target_lang))

    def get_data(self, corpus):
        """Split an iterable of ``{'translation': {lang: text}}`` records by language.

        Returns:
            ``(source_data, target_data)``: two lists of sentences in record order.
        """
        source_data = []
        target_data = []
        for record in corpus:
            pair = record['translation']
            source_data.append(pair[self.source_lang])
            target_data.append(pair[self.target_lang])
        return source_data, target_data

    def save_data(self, data, save_path):
        """Write ``data`` to ``save_path`` as UTF-8 text, exactly one sentence per line."""
        print('=> Save data => Path: {}'.format(save_path))
        # A line break inside a sentence would shift every following sentence and
        # silently misalign the source and target files, so it becomes a space.
        one_per_line = [
            sentence.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
            for sentence in data
        ]
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(one_per_line))

In [None]:
def train_sentencepiece(cfg, is_src=True):
    """Train a SentencePiece model on the training split of one language side.

    The model is written to ``{cfg.sp_dir}/<prefix>.model`` (plus the matching
    ``.vocab`` file), where ``<prefix>`` is ``cfg.src_model_prefix`` or
    ``cfg.tgt_model_prefix``.

    Args:
        cfg: Configuration providing ``data_dir``, ``sp_dir``, the language codes,
            the model prefixes, the special token ids (``pad_id``, ``sos_id``,
            ``eos_id``, ``unk_id``), ``sp_vocab_size``, ``character_coverage`` and
            ``model_type``.
        is_src: Train on the source language when true, otherwise on the target.
    """
    if is_src:
        lang, prefix = cfg.src_lang, cfg.src_model_prefix
    else:
        lang, prefix = cfg.tgt_lang, cfg.tgt_model_prefix
    train_file = f"{cfg.data_dir}/train.{lang}"
    model_prefix = f"{cfg.sp_dir}/{prefix}"

    print(f"===> Processing file: {train_file}")
    # makedirs: sp_dir may be nested below directories that do not exist yet.
    os.makedirs(cfg.sp_dir, exist_ok=True)

    # Keyword arguments instead of one whitespace-separated flag string, so that
    # paths containing spaces are passed through intact.
    spm.SentencePieceTrainer.Train(
        input=train_file,
        pad_id=cfg.pad_id,
        bos_id=cfg.sos_id,
        eos_id=cfg.eos_id,
        unk_id=cfg.unk_id,
        model_prefix=model_prefix,
        vocab_size=cfg.sp_vocab_size,
        character_coverage=cfg.character_coverage,
        model_type=cfg.model_type,
    )

In [None]:
class NMTDataset(Dataset):
    """Parallel-corpus dataset that yields fixed-length id tensors.

    Each item is ``(src, tgt_input, tgt_output)``:

    * ``src``        : ``[sos] + source ids + [eos]``
    * ``tgt_input``  : ``[sos] + target ids``
    * ``tgt_output`` : ``target ids + [eos]``

    all padded or truncated to ``cfg.seq_len`` by ``pad_or_truncate``.

    Args:
        cfg: Configuration providing ``data_dir``, ``sp_dir``, the language codes,
            tokenizer prefixes, special token ids and ``seq_len``.
        data_type: Split name, used as the file stem (``train``, ``validation``, ...).
    """

    def __init__(self, cfg, data_type="train"):
        super().__init__()
        self.cfg = cfg

        self.sp_src, self.sp_tgt = self.load_sp_tokenizer()
        self.src_texts, self.tgt_texts = self.read_data(data_type)

        src_tokenized_sequences = self.texts_to_sequences(self.src_texts, True)
        tgt_input_tokenized_sequences, tgt_output_tokenized_sequences = self.texts_to_sequences(self.tgt_texts, False)

        self.src_data = torch.LongTensor(src_tokenized_sequences)
        self.input_tgt_data = torch.LongTensor(tgt_input_tokenized_sequences)
        self.output_tgt_data = torch.LongTensor(tgt_output_tokenized_sequences)

    def read_data(self, data_type):
        """Read the source and target text files of one split.

        Returns:
            ``(src_texts, tgt_texts)``: lists of raw lines (line endings included).

        Raises:
            ValueError: If the two files do not contain the same number of lines.
        """
        src_path = f"{self.cfg.data_dir}/{data_type}.{self.cfg.src_lang}"
        tgt_path = f"{self.cfg.data_dir}/{data_type}.{self.cfg.tgt_lang}"

        # The files are written as UTF-8; read them back with the same encoding
        # instead of the platform default.
        print(f"===> Load data from: {src_path}")
        with open(src_path, 'r', encoding='utf-8') as f:
            src_texts = f.readlines()

        print(f"===> Load data from: {tgt_path}")
        with open(tgt_path, 'r', encoding='utf-8') as f:
            trg_texts = f.readlines()

        if len(src_texts) != len(trg_texts):
            raise ValueError(
                f"Parallel files are misaligned: {len(src_texts)} lines in {src_path} "
                f"vs {len(trg_texts)} lines in {tgt_path}"
            )

        return src_texts, trg_texts

    def load_sp_tokenizer(self):
        """Load the source and target SentencePiece models from ``cfg.sp_dir``."""
        sp_src = spm.SentencePieceProcessor()
        sp_src.Load(f"{self.cfg.sp_dir}/{self.cfg.src_model_prefix}.model")

        sp_tgt = spm.SentencePieceProcessor()
        sp_tgt.Load(f"{self.cfg.sp_dir}/{self.cfg.tgt_model_prefix}.model")

        return sp_src, sp_tgt

    def texts_to_sequences(self, texts, is_src=True):
        """Tokenize ``texts`` and bring every sequence to ``cfg.seq_len`` ids.

        Returns:
            For the source side a single list of id lists; for the target side a
            pair ``(decoder_inputs, decoder_outputs)`` of such lists.
        """
        seq_len, pad_id = self.cfg.seq_len, self.cfg.pad_id
        sos, eos = [self.cfg.sos_id], [self.cfg.eos_id]

        if is_src:
            src_tokenized_sequences = []
            for text in tqdm(texts):
                tokenized = self.sp_src.EncodeAsIds(text.strip())
                src_tokenized_sequences.append(
                    pad_or_truncate(sos + tokenized + eos, seq_len, pad_id)
                )
            return src_tokenized_sequences

        tgt_input_tokenized_sequences = []
        tgt_output_tokenized_sequences = []
        for text in tqdm(texts):
            tokenized = self.sp_tgt.EncodeAsIds(text.strip())
            # Decoder input is shifted right by one position relative to its output.
            tgt_input_tokenized_sequences.append(pad_or_truncate(sos + tokenized, seq_len, pad_id))
            tgt_output_tokenized_sequences.append(pad_or_truncate(tokenized + eos, seq_len, pad_id))

        return tgt_input_tokenized_sequences, tgt_output_tokenized_sequences

    def __getitem__(self, idx):
        return self.src_data[idx], self.input_tgt_data[idx], self.output_tgt_data[idx]

    def __len__(self):
        return self.src_data.shape[0]

def pad_or_truncate(tokenized_sequence, seq_len, pad_id):
    """Return a copy of ``tokenized_sequence`` that is exactly ``seq_len`` ids long.

    Shorter sequences are right-padded with ``pad_id``; longer ones are cut after
    ``seq_len`` ids, which drops any trailing ids (including an end-of-sequence id
    the caller appended). The input list is never modified.

    Args:
        tokenized_sequence: List of token ids.
        seq_len: Required output length.
        pad_id: Id used for padding.

    Returns:
        A new list of length ``seq_len``.
    """
    missing = seq_len - len(tokenized_sequence)
    if missing > 0:
        # Build a new list: `+=` would extend the caller's list in place.
        return tokenized_sequence + [pad_id] * missing
    return tokenized_sequence[:seq_len]

def get_data_loader(cfg, data_type='train'):
    """Create the dataset of one split together with a DataLoader over it.

    Args:
        cfg: Configuration forwarded to ``NMTDataset``; ``cfg.batch_size`` sets the
            batch size of the loader.
        data_type: Split name. Only ``'train'`` is shuffled.

    Returns:
        ``(dataset, dataloader)``.
    """
    nmt_dataset = NMTDataset(cfg, data_type)
    loader = DataLoader(
        nmt_dataset,
        batch_size=cfg.batch_size,
        shuffle=(data_type == 'train'),
    )
    return nmt_dataset, loader

In [None]:
class MultiheadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Size of the model dimension; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        drop_out: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``d_model`` is not a multiple of ``num_heads``.
    """

    def __init__(self, d_model, num_heads, drop_out=0.1):
        super().__init__()
        if d_model % num_heads != 0:
            # Without this check the reshape into heads would either fail with an
            # obscure view error or silently mix features of different positions.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        # Magnitude of the large negative score given to masked positions.
        self.inf = 1e9

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # W^Q, W^K, W^V in the paper
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(drop_out)
        self.attn_softmax = nn.Softmax(dim=-1)

        # Final output linear transformation
        self.w_0 = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q: Queries of shape (B, L_q, d_model).
            k: Keys of shape (B, L_k, d_model).
            v: Values of shape (B, L_k, d_model).
            mask: Optional tensor broadcastable to (B, 1, L_q, L_k) after a head
                axis is inserted at dim 1; positions equal to 0 are masked out.

        Returns:
            Tensor of shape (B, L_q, d_model).
        """
        batch_size = q.shape[0]

        # Linear projection, then split the feature axis into heads.
        q = self.w_q(q).view(batch_size, -1, self.num_heads, self.d_k) # (B, L, num_heads, d_k)
        k = self.w_k(k).view(batch_size, -1, self.num_heads, self.d_k) # (B, L, num_heads, d_k)
        v = self.w_v(v).view(batch_size, -1, self.num_heads, self.d_k) # (B, L, num_heads, d_k)

        # Move the head axis forward: (B, num_heads, L, d_k)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        attn_values = self.self_attention(q, k, v, mask=mask) # (B, num_heads, L, d_k)
        # Merge the heads back into one feature axis.
        concat_output = attn_values.transpose(1, 2)\
            .contiguous().view(batch_size, -1, self.d_model) # (B, L, d_model)

        return self.w_0(concat_output)

    def self_attention(self, q, k, v, mask=None):
        """Scaled dot-product attention on per-head tensors (B, num_heads, L, d_k)."""
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) # (B, num_heads, L, L)
        attn_scores = attn_scores / math.sqrt(self.d_k)

        # If there is a mask, push masked spots to a large negative score.
        if mask is not None:
            mask = mask.unsqueeze(1) # (B, 1, L) => (B, 1, 1, L) or (B, L, L) => (B, 1, L, L)
            attn_scores = attn_scores.masked_fill(mask == 0, -1 * self.inf)

        # Softmax over the keys, then weight the values V.
        attn_distribs = self.attn_softmax(attn_scores)

        attn_distribs = self.dropout(attn_distribs)
        attn_values = torch.matmul(attn_distribs, v) # (B, num_heads, L, d_k)

        return attn_values

class FeedFowardLayer(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size.
        drop_out: Dropout probability applied to the hidden activation.
    """

    def __init__(self, d_model, d_ff, drop_out=0.1):
        super().__init__()
        # Expansion and projection layers, both with bias.
        self.linear_1 = nn.Linear(in_features=d_model, out_features=d_ff, bias=True)
        self.relu = nn.ReLU()
        self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model, bias=True)
        self.dropout = nn.Dropout(p=drop_out)

    def forward(self, x):
        hidden = self.dropout(self.relu(self.linear_1(x))) # (B, L, d_ff)
        return self.linear_2(hidden) # (B, L, d_model)


class LayerNormalization(nn.Module):
    """Thin wrapper around ``nn.LayerNorm`` over the last (``d_model``) axis.

    Args:
        d_model: Size of the normalised feature axis.
        eps: Value added to the variance for numerical stability.
    """

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.layer = nn.LayerNorm(
            normalized_shape=[d_model],
            eps=self.eps,
            elementwise_affine=True,
        )

    def forward(self, x):
        return self.layer(x)

class PositionalEncoder(nn.Module):
    """Scale embeddings by sqrt(d_model) and add a fixed sinusoidal position table.

    The table is stored as a non-persistent buffer: it follows the module when it
    is moved with ``.to(...)`` but is not written to ``state_dict()``.

    NOTE(review): the exponent uses the raw column index (``2 * i / d_model``) for
    every column, whereas the Transformer paper shares one frequency per sin/cos
    pair (``2 * (i // 2) / d_model``). The values are kept as they were so that
    models trained with this table remain valid.

    Args:
        seq_len: Number of positions in the table (maximum input length).
        d_model: Feature size of the embeddings.
        device: Device on which the table is initially placed.
    """

    def __init__(self, seq_len, d_model, device):
        super().__init__()
        self.seq_len = seq_len
        self.d_model = d_model

        # Build the whole table at once in float64 on the CPU, then store float32.
        positions = torch.arange(seq_len, dtype=torch.float64).unsqueeze(1) # (L, 1)
        columns = torch.arange(d_model, dtype=torch.float64).unsqueeze(0) # (1, d_model)
        angles = positions / (10000.0 ** (2 * columns / d_model)) # (L, d_model)

        pe_matrix = torch.zeros(seq_len, d_model) # (L, d_model)
        pe_matrix[:, 0::2] = torch.sin(angles[:, 0::2]).float() # even columns
        pe_matrix[:, 1::2] = torch.cos(angles[:, 1::2]).float() # odd columns

        pe_matrix = pe_matrix.unsqueeze(0).to(device=device) # (1, L, d_model)
        self.register_buffer('positional_encoding', pe_matrix, persistent=False)

    def forward(self, x):
        """Return ``x * sqrt(d_model) + table`` for ``x`` of shape (B, L', d_model), L' <= seq_len."""
        x = x * math.sqrt(self.d_model) # (B, L', d_model)
        # Slice the table so inputs shorter than seq_len are accepted as well.
        x = x + self.positional_encoding[:, :x.size(1)] # (B, L', d_model)

        return x

class EncoderLayer(nn.Module):
    """Pre-norm encoder block: self-attention then feed-forward, each with a residual.

    Args:
        d_model: Feature size.
        num_heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward sub-layer.
        drop_out: Dropout probability used inside and after each sub-layer.
    """

    def __init__(self, d_model, num_heads, d_ff, drop_out=0.1):
        super().__init__()
        # Sub-layer 1: self-attention
        self.layer_norm_1 = LayerNormalization(d_model)
        self.multihead_attention = MultiheadAttention(d_model, num_heads, drop_out)
        self.drop_out_1 = nn.Dropout(drop_out)

        # Sub-layer 2: position-wise feed-forward
        self.layer_norm_2 = LayerNormalization(d_model)
        self.feed_forward = FeedFowardLayer(d_model, d_ff, drop_out)
        self.drop_out_2 = nn.Dropout(drop_out)

    def forward(self, x, e_mask):
        # Normalise first, attend, then add the result to the un-normalised input.
        normed = self.layer_norm_1(x) # (B, L, d_model)
        attended = self.multihead_attention(normed, normed, normed, mask=e_mask)
        x = x + self.drop_out_1(attended) # (B, L, d_model)

        normed = self.layer_norm_2(x) # (B, L, d_model)
        transformed = self.feed_forward(normed)
        x = x + self.drop_out_2(transformed) # (B, L, d_model)

        return x # (B, L, d_model)

class DecoderLayer(nn.Module):
    """Pre-norm decoder block.

    Three residual sub-layers in order: masked self-attention, attention over the
    encoder output, and a position-wise feed-forward network.

    Args:
        d_model: Feature size.
        num_heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward sub-layer.
        drop_out: Dropout probability used inside and after each sub-layer.
    """

    def __init__(self, d_model, num_heads, d_ff, drop_out=0.1):
        super().__init__()
        # Sub-layer 1: masked self-attention over the target sequence
        self.layer_norm_1 = LayerNormalization(d_model)
        self.masked_multihead_attention = MultiheadAttention(d_model, num_heads, drop_out)
        self.drop_out_1 = nn.Dropout(drop_out)

        # Sub-layer 2: attention over the encoder output
        self.layer_norm_2 = LayerNormalization(d_model)
        self.multihead_attention = MultiheadAttention(d_model, num_heads, drop_out)
        self.drop_out_2 = nn.Dropout(drop_out)

        # Sub-layer 3: position-wise feed-forward
        self.layer_norm_3 = LayerNormalization(d_model)
        self.feed_forward = FeedFowardLayer(d_model, d_ff, drop_out)
        self.drop_out_3 = nn.Dropout(drop_out)

    def forward(self, x, e_output, e_mask,  d_mask):
        normed = self.layer_norm_1(x) # (B, L, d_model)
        self_attended = self.masked_multihead_attention(normed, normed, normed, mask=d_mask)
        x = x + self.drop_out_1(self_attended) # (B, L, d_model)

        normed = self.layer_norm_2(x) # (B, L, d_model)
        # Queries come from the decoder, keys and values from the encoder output.
        cross_attended = self.multihead_attention(normed, e_output, e_output, mask=e_mask)
        x = x + self.drop_out_2(cross_attended) # (B, L, d_model)

        normed = self.layer_norm_3(x) # (B, L, d_model)
        transformed = self.feed_forward(normed)
        x = x + self.drop_out_3(transformed) # (B, L, d_model)

        return x # (B, L, d_model)

class Encoder(nn.Module):
    """Stack of ``num_layers`` encoder layers followed by a final layer norm.

    Args:
        num_layers: Number of stacked ``EncoderLayer`` blocks.
        d_model: Feature size.
        num_heads: Number of attention heads per layer.
        d_ff: Hidden size of each feed-forward sub-layer.
        drop_out: Dropout probability passed to every layer.
    """

    def __init__(self, num_layers, d_model, num_heads, d_ff, drop_out=0.1):
        super().__init__()
        self.num_layers = num_layers
        blocks = [EncoderLayer(d_model, num_heads, d_ff, drop_out) for _ in range(num_layers)]
        self.layers = nn.ModuleList(blocks)
        self.layer_norm = LayerNormalization(d_model)

    def forward(self, x, e_mask):
        # Feed the input through every layer in order, then normalise once.
        for block in self.layers:
            x = block(x, e_mask)

        return self.layer_norm(x)
    
class Decoder(nn.Module):
    """Stack of ``num_layers`` decoder layers followed by a final layer norm.

    Args:
        num_layers: Number of stacked ``DecoderLayer`` blocks.
        d_model: Feature size.
        num_heads: Number of attention heads per layer.
        d_ff: Hidden size of each feed-forward sub-layer.
        drop_out: Dropout probability passed to every layer. Defaults to 0.1, the
            same default the encoder and the individual layers use.
    """

    def __init__(self, num_layers, d_model, num_heads, d_ff, drop_out=0.1):
        super().__init__()
        self.num_layers = num_layers
        self.layers = nn.ModuleList(
            [DecoderLayer(d_model, num_heads, d_ff, drop_out) for _ in range(num_layers)]
        )
        self.layer_norm = LayerNormalization(d_model)

    def forward(self, x, e_output, e_mask, d_mask):
        """Run the target through every layer, each attending over ``e_output``."""
        for i in range(self.num_layers):
            x = self.layers[i](x, e_output, e_mask, d_mask)

        return self.layer_norm(x)

class Transformer(nn.Module):
    """Encoder-decoder Transformer producing log-probabilities over the vocabulary.

    Args:
        cfg: Configuration providing ``sp_vocab_size``, ``d_model``, ``seq_len``,
            ``device``, ``num_layers``, ``num_heads``, ``d_ff`` and ``drop_out``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        vocab_size = self.cfg.sp_vocab_size
        d_model = self.cfg.d_model

        # Separate embedding tables for source and target; both use the same size.
        self.src_embedding = nn.Embedding(vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(vocab_size, d_model)
        # One positional encoder is shared by both sides.
        self.positional_encoder = PositionalEncoder(self.cfg.seq_len, d_model, self.cfg.device)

        # Encoder and decoder stacks are built from the same hyper-parameters.
        stack_args = (
            self.cfg.num_layers,
            d_model,
            self.cfg.num_heads,
            self.cfg.d_ff,
            self.cfg.drop_out,
        )
        self.encoder = Encoder(*stack_args)
        self.decoder = Decoder(*stack_args)

        self.output_linear = nn.Linear(d_model, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, src_input, tgt_input, e_mask=None, d_mask=None):
        # Token ids -> scaled embeddings with positions: (B, L) => (B, L, d_model)
        src_encoded = self.positional_encoder(self.src_embedding(src_input))
        tgt_encoded = self.positional_encoder(self.tgt_embedding(tgt_input))

        e_output = self.encoder(src_encoded, e_mask) # (B, L, d_model)
        d_output = self.decoder(tgt_encoded, e_output, e_mask, d_mask) # (B, L, d_model)

        # Project to the vocabulary and take log-softmax: (B, L, trg_vocab_size)
        return self.softmax(self.output_linear(d_output))

In [None]:
class Trainer():
    """Build, train, validate and run inference with the Transformer NMT model.

    Args:
        cfg: Configuration object. Attributes read by this class include ``device``,
            ``learning_rate``, ``ckpt_dir``, ``ckpt_name``, ``sp_dir``, the tokenizer
            prefixes, the special token ids, ``seq_len`` and ``num_epochs``.
        is_train: When true, the loss function and the train/validation data
            loaders are created.
        load_ckpt: When true, model state, optimizer state and the best loss are
            restored from ``{cfg.ckpt_dir}/{cfg.ckpt_name}``; otherwise every
            parameter with more than one dimension is Xavier-initialised.

    Note:
        The loss ignores padding positions, so loss values are not comparable with
        a ``loss`` stored by a run that averaged over padding as well.
    """

    def __init__(self, cfg, is_train=True, load_ckpt=True):
        self.cfg = cfg
        ckpt_path = f"{self.cfg.ckpt_dir}/{self.cfg.ckpt_name}"

        print("Loading Transformer model & Adam optimizer...")
        self.model = Transformer(self.cfg).to(self.cfg.device)
        print(self.cfg.device)

        self.optim = torch.optim.Adam(self.model.parameters(), lr=self.cfg.learning_rate)

        self.best_loss = 100.0
        if load_ckpt:
            print("Loading checkpoint...")
            checkpoint = torch.load(ckpt_path, map_location=self.cfg.device)
            self.model.load_state_dict(checkpoint['model_state_dict'])
            self.optim.load_state_dict(checkpoint['optim_state_dict'])
            self.best_loss = float(checkpoint['loss'])
        else:
            print("Initializing the model...")
            for p in self.model.parameters():
                if p.dim() > 1:
                    nn.init.xavier_uniform_(p)

        # Prepare Tokenizer
        self.prepare_tokenizer()

        # The tokenizers are loaded in every mode so that inference() also works
        # right after training and does not depend on a checkpoint file existing.
        print("Loading sentencepiece tokenizer...")
        self.sp_src = spm.SentencePieceProcessor()
        self.sp_tgt = spm.SentencePieceProcessor()
        self.sp_src.Load(f"{self.cfg.sp_dir}/{self.cfg.src_model_prefix}.model")
        self.sp_tgt.Load(f"{self.cfg.sp_dir}/{self.cfg.tgt_model_prefix}.model")

        if is_train:
            # Load loss function
            print("Loading loss function...")
            # Padding positions carry no information; excluding them keeps the loss
            # from being dominated by trivially predictable pad targets.
            self.criterion = nn.NLLLoss(ignore_index=self.cfg.pad_id)

            # Load dataloaders
            print("Loading dataloaders...")
            self.train_dataset, self.train_loader = get_data_loader(self.cfg, 'train')
            self.valid_dataset, self.valid_loader = get_data_loader(self.cfg, 'validation')
        elif not os.path.exists(ckpt_path):
            print("Checkpoint path not exits...")

        print("Setting finished.")

    def prepare_tokenizer(self):
        """Train whichever SentencePiece model file is missing from ``cfg.sp_dir``."""
        sides = (
            (True, self.cfg.src_model_prefix),
            (False, self.cfg.tgt_model_prefix),
        )
        # Check the model files themselves: the directory may exist without them,
        # for instance after an interrupted training run.
        missing = [
            is_src for is_src, prefix in sides
            if not os.path.isfile(f"{self.cfg.sp_dir}/{prefix}.model")
        ]
        if missing:
            print('Training sentencepiece tokenizer...')
            for is_src in missing:
                train_sentencepiece(self.cfg, is_src=is_src)
        else:
            print('Tokenization already...')

    def train(self):
        """Run ``cfg.num_epochs`` epochs; save a checkpoint when validation loss improves."""
        print("Training...")

        for epoch in range(1, self.cfg.num_epochs+1):
            print(f"#################### Epoch: {epoch} ####################")

            self.model.train()
            train_losses = []
            start_time = datetime.datetime.now()

            bar = tqdm(enumerate(self.train_loader), total=len(self.train_loader), desc='TRAINING')

            for batch_idx, batch in bar:
                src_input, tgt_input, tgt_output = batch
                src_input, tgt_input, tgt_output = src_input.to(self.cfg.device), tgt_input.to(self.cfg.device), tgt_output.to(self.cfg.device)

                e_mask, d_mask = self.create_mask(src_input, tgt_input)

                logits = self.model(src_input, tgt_input, e_mask, d_mask)

                self.optim.zero_grad()

                # Flatten to (B * L, vocab) vs (B * L) for the token-level loss.
                loss = self.criterion(
                    logits.view(-1, logits.shape[-1]),
                    tgt_output.reshape(-1)
                )

                loss.backward()
                self.optim.step()

                train_losses.append(loss.item())

                del src_input, tgt_input, tgt_output, e_mask, d_mask, logits
                torch.cuda.empty_cache()

                bar.set_postfix(TRAIN="Epoch {} - Batch_Loss {:.2f} - Train_Loss {:.2f} - Best_Valid_Loss {:.2f}".format(
                    epoch,
                    loss.item(),
                    np.mean(train_losses),
                    self.best_loss
                    )
                )

            end_time = datetime.datetime.now()
            training_time = end_time - start_time

            mean_train_loss = np.mean(train_losses)
            print(f"Train loss: {mean_train_loss} || Time: {training_time} secs")

            valid_loss, valid_time = self.validation()

            if valid_loss < self.best_loss:
                # makedirs: the checkpoint directory may be nested.
                os.makedirs(self.cfg.ckpt_dir, exist_ok=True)

                # Store a plain Python float so the checkpoint holds no numpy
                # scalar, which torch.load's weights_only mode rejects.
                self.best_loss = float(valid_loss)
                state_dict = {
                    'model_state_dict': self.model.state_dict(),
                    'optim_state_dict': self.optim.state_dict(),
                    'loss': self.best_loss
                }
                torch.save(state_dict, f"{self.cfg.ckpt_dir}/{self.cfg.ckpt_name}")
                print(f"***** Current best checkpoint is saved. *****")

            print(f"Best valid loss: {self.best_loss}")
            print(f"Valid loss: {valid_loss} || Validation time: {valid_time}")

        print(f"Training finished!")

    def validation(self):
        """Evaluate on the validation loader.

        Returns:
            ``(mean_loss, elapsed)`` where ``elapsed`` is a string ending in " secs".
        """
        self.model.eval()

        valid_losses = []
        start_time = datetime.datetime.now()

        with torch.no_grad():
            bar = tqdm(enumerate(self.valid_loader), total=len(self.valid_loader), desc='VALIDATIION')
            for batch_idx, batch in bar:
                src_input, tgt_input, tgt_output = batch
                src_input, tgt_input, tgt_output = src_input.to(self.cfg.device), tgt_input.to(self.cfg.device), tgt_output.to(self.cfg.device)

                e_mask, d_mask = self.create_mask(src_input, tgt_input)

                logits = self.model(src_input, tgt_input, e_mask, d_mask)

                loss = self.criterion(
                    logits.view(-1, logits.shape[-1]),
                    tgt_output.reshape(-1)
                )

                valid_losses.append(loss.item())

                bar.set_postfix(TRAIN="Batch_Loss {:.2f} - Valid_Loss {:.2f}".format(
                    loss.item(),
                    np.mean(valid_losses)
                    )
                )

                del src_input, tgt_input, tgt_output, e_mask, d_mask, logits
                torch.cuda.empty_cache()

        end_time = datetime.datetime.now()
        validation_time = end_time - start_time

        mean_valid_loss = np.mean(valid_losses)

        return mean_valid_loss, f"{validation_time} secs"

    def inference(self, input_sentence):
        """Translate one sentence with greedy decoding and return the decoded text."""
        self.model.eval()

        print("Preprocessing input sentence...")
        tokenized = self.sp_src.EncodeAsIds(input_sentence)
        src_data = torch.LongTensor(
            pad_or_truncate([self.cfg.sos_id] + tokenized + [self.cfg.eos_id], self.cfg.seq_len, self.cfg.pad_id)
        ).unsqueeze(0).to(self.cfg.device)

        e_mask = (src_data != self.cfg.pad_id).unsqueeze(1).to(self.cfg.device) # (1, 1, L)

        start_time = datetime.datetime.now()

        print("Encoding input sentence...")
        # No gradients are needed for decoding; skip building the autograd graph.
        with torch.no_grad():
            src_data = self.model.src_embedding(src_data)
            src_data = self.model.positional_encoder(src_data)
            e_output = self.model.encoder(src_data, e_mask) # (1, L, d_model)

            result = self.greedy_search(e_output, e_mask)

        end_time = datetime.datetime.now()

        total_inference_time = end_time - start_time

        print(f"Input: {input_sentence}")
        print(f"Result: {result}")
        print(f"Inference finished! || Total inference time: {total_inference_time}secs")
        return result

    def greedy_search(self, e_output, e_mask):
        """Decode token by token, always taking the most probable next id.

        Args:
            e_output: Encoder output for a single sentence, (1, L, d_model).
            e_mask: Source padding mask, (1, 1, L).

        Returns:
            The decoded target string.
        """
        seq_len = self.cfg.seq_len
        device = self.cfg.device

        last_words = torch.LongTensor([self.cfg.pad_id] * seq_len).to(device) # (L)
        last_words[0] = self.cfg.sos_id # (L)
        cur_len = 1

        # The causal mask does not change between steps, so it is built once.
        nopeak_mask = self._nopeak_mask(seq_len, device) # (1, L, L)

        for i in range(seq_len):
            d_mask = (last_words.unsqueeze(0) != self.cfg.pad_id).unsqueeze(1).to(device) # (1, 1, L)
            d_mask = d_mask & nopeak_mask  # (1, L, L) padding false

            tgt_embedded = self.model.tgt_embedding(last_words.unsqueeze(0))
            tgt_positional_encoded = self.model.positional_encoder(tgt_embedded)
            decoder_output = self.model.decoder(
                tgt_positional_encoded,
                e_output,
                e_mask,
                d_mask
            ) # (1, L, d_model)

            output = self.model.softmax(
                self.model.output_linear(decoder_output)
            ) # (1, L, trg_vocab_size)

            output = torch.argmax(output, dim=-1) # (1, L)
            # The prediction at position i is the token that follows position i.
            last_word_id = output[0][i].item()

            if i < seq_len-1:
                last_words[i+1] = last_word_id
                cur_len += 1

            if last_word_id == self.cfg.eos_id:
                break

        # Drop the leading start token; cut at cur_len unless the buffer is full.
        if last_words[-1].item() == self.cfg.pad_id:
            decoded_output = last_words[1:cur_len].tolist()
        else:
            decoded_output = last_words[1:].tolist()
        decoded_output = self.sp_tgt.decode_ids(decoded_output)

        return decoded_output

    def create_mask(self, src_input, tgt_input):
        """Build the attention masks for one batch.

        Args:
            src_input: Source ids, (B, L_src).
            tgt_input: Decoder input ids, (B, L_tgt).

        Returns:
            ``(e_mask, d_mask)``: the source padding mask of shape (B, 1, L_src) and
            the target mask of shape (B, L_tgt, L_tgt), which combines the target
            padding mask with a lower-triangular (no-peek) mask.
        """
        e_mask = (src_input != self.cfg.pad_id).unsqueeze(1)  # (B, 1, L)
        d_mask = (tgt_input != self.cfg.pad_id).unsqueeze(1)  # (B, 1, L)

        # Sized from the batch itself rather than cfg.seq_len.
        nopeak_mask = self._nopeak_mask(tgt_input.size(1), self.cfg.device)  # (1, L, L)
        d_mask = d_mask & nopeak_mask  # (B, L, L) padding false

        return e_mask, d_mask

    @staticmethod
    def _nopeak_mask(length, device):
        """Return a (1, length, length) lower-triangular boolean mask on ``device``."""
        mask = torch.ones([1, length, length], dtype=torch.bool)
        return torch.tril(mask).to(device)

In [None]:
class BaseConfig:
    """Base configuration: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Instance attributes set here shadow class-level defaults of subclasses.
        for name in kwargs:
            setattr(self, name, kwargs[name])

class NMTConfig(BaseConfig):
    """Default paths and hyper-parameters for the vi -> en Transformer experiment.

    Any value can be overridden per instance through keyword arguments.
    """

    # --- Dataset ---
    data_dir = './transformer/data'
    src_lang = 'vi'
    tgt_lang = 'en'

    # --- Tokenizer (SentencePiece) ---
    sp_dir = f'{data_dir}/sp'
    pad_id, sos_id, eos_id, unk_id = 0, 1, 2, 3
    src_model_prefix = f'sp_{src_lang}'
    tgt_model_prefix = f'sp_{tgt_lang}'
    sp_vocab_size = 4000
    character_coverage = 1.0
    model_type = 'unigram'

    # --- Model ---
    num_heads = 8
    num_layers = 6
    d_model = 512
    d_ff = 2048
    drop_out = 0.1

    # --- Training ---
    # The device is fixed to the CPU here. Automatic selection would be:
    # device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
    device = 'cpu'
    learning_rate = 1e-4
    batch_size = 256
    seq_len = 150
    num_epochs = 20
    ckpt_dir = './transformer'
    ckpt_name = 'best_ckpt.tar'

In [None]:
# Instantiate the configuration with its class-level defaults (no overrides).
cfg = NMTConfig()

In [None]:
# Write the corpus splits as text files into cfg.data_dir; download_dataset() does
# nothing but print a message when that directory already has content.
data_pre = DataPreparing(cfg.data_dir, cfg.src_lang, cfg.tgt_lang)
data_pre.download_dataset()

In [None]:
# Train from scratch: load_ckpt=False initialises the weights instead of restoring
# them from a checkpoint file.
trainer = Trainer(cfg, is_train=True, load_ckpt=False)
trainer.train()