## Data Preparation

In [42]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np

import random
import math
import time

In [43]:
# Single seed shared by every random number generator used in this notebook.
SEED = 1234

# Request deterministic cuDNN algorithms.
torch.backends.cudnn.deterministic = True

# Seed the PyTorch generators first, then NumPy and Python's `random`.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [44]:
# Load the German and English spaCy pipelines; their tokenizers are used by
# tokenize_de / tokenize_en.
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')



In [45]:
def tokenize_de(text):
    """
    Split a German string into a list of token strings, using the tokenizer
    of the module-level `spacy_de` pipeline.
    """
    doc = spacy_de.tokenizer(text)
    return [token.text for token in doc]

def tokenize_en(text):
    """
    Split an English string into a list of token strings, using the tokenizer
    of the module-level `spacy_en` pipeline.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

In [46]:
# Source field: English text tokenized with tokenize_en, configured with
# '<sos>' / '<eos>' as start/end tokens, lower-casing and batch-first tensors.
SRC = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

# Target field: German text tokenized with tokenize_de, otherwise configured
# exactly like SRC.
TRG = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

In [47]:
# Load the Multi30k splits with the '.en' files bound to SRC and the '.de'
# files bound to TRG (English -> German); swap both tuples to reverse direction.
train_data, valid_data, test_data = Multi30k.splits(exts = ('.en', '.de'), fields = (SRC, TRG))
                                                    

In [48]:
# Sanity check: show the tokenized 'src' / 'trg' of the first training example.
print(vars(train_data.examples[0]))

{'src': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.'], 'trg': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']}


In [49]:
# Build the source and target vocabularies from the training split only,
# with a minimum token frequency of 2.
for field in (SRC, TRG):
    field.build_vocab(train_data, min_freq = 2)

In [50]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [51]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 128

# One BucketIterator per split, each configured with BATCH_SIZE and `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
     batch_size = BATCH_SIZE,
     device = device)

## Encoder

In [52]:
class Encoder(nn.Module):
    """Transformer encoder: scaled token embeddings plus learned positional
    embeddings, followed by a stack of ``EncoderLayer`` blocks.

    Args:
        input_dim: size of the source vocabulary (rows of the token embedding).
        hid_dim: embedding / hidden size.
        n_layers: number of stacked ``EncoderLayer`` blocks.
        n_heads: attention heads per layer.
        pf_dim: inner size of each layer's position-wise feedforward block.
        dropout: dropout probability.
        device: device the module is initially created for. Kept as
            ``self.device`` for backward compatibility; tensors created in
            ``forward`` follow the input's device instead.
        max_length: number of rows in the positional embedding table, i.e. the
            longest sequence ``forward`` accepts.
    """

    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length=100):
        super().__init__()

        self.device = device

        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)

        self.layers = nn.ModuleList([EncoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout)

        # Registered as a non-persistent buffer so that it moves together with
        # the module on .to()/.cuda() but is not written to state_dict, which
        # keeps existing checkpoints loadable.
        self.register_buffer("scale",
                             torch.sqrt(torch.FloatTensor([hid_dim])).to(device),
                             persistent=False)

    def forward(self, scr, scr_mask):
        """Encode a batch of source token indices.

        Args:
            scr: token indices, [batch_size, scr_len].
            scr_mask: mask forwarded unchanged to every layer,
                [batch_size, 1, 1, scr_len].

        Returns:
            Tensor [batch_size, scr_len, hid_dim].

        Raises:
            IndexError: if ``scr_len`` exceeds the positional embedding table.
        """
        batch_size = scr.shape[0]
        scr_len = scr.shape[1]

        max_length = self.pos_embedding.num_embeddings
        if scr_len > max_length:
            # Fail early with a readable message instead of an opaque
            # out-of-range lookup inside the embedding.
            raise IndexError(
                f"sequence length {scr_len} exceeds the encoder's max_length {max_length}"
            )

        # Build the position indices directly on the input's device.
        pos = torch.arange(0, scr_len, device=scr.device).unsqueeze(0).repeat(batch_size, 1)
        #pos = [batch_size, scr_len]

        scr = self.dropout((self.tok_embedding(scr) * self.scale) + self.pos_embedding(pos))
        #scr = [batch_size, scr_len, hid_dim]

        for layer in self.layers:
            scr = layer(scr, scr_mask)
            #scr = [batch_size, scr_len, hid_dim]

        return scr



#### Encoder Layer

In [53]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a position-wise
    feedforward network, each wrapped in dropout, a residual connection and
    layer normalisation (applied after the residual sum)."""

    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()

        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, scr, scr_mask):
        """Transform ``scr`` [batch_size, scr_len, hid_dim] under ``scr_mask``
        [batch_size, 1, 1, scr_len]; the output has the same shape as ``scr``."""

        # Sub-layer 1: self-attention (queries, keys and values are all `scr`);
        # the attention weights are not needed here.
        attended, _ = self.self_attention(scr, scr, scr, scr_mask)
        scr = self.self_attn_layer_norm(scr + self.dropout(attended))

        # Sub-layer 2: position-wise feedforward.
        transformed = self.positionwise_feedforward(scr)
        return self.ff_layer_norm(scr + self.dropout(transformed))

#### Multi Head Attention Layer

In [54]:
class MultiHeadAttentionLayer(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        hid_dim: model hidden size; must be divisible by ``n_heads``.
        n_heads: number of attention heads.
        dropout: dropout probability applied to the attention weights.
        device: device the scaling factor is initially created on.
    """

    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()

        assert hid_dim % n_heads == 0, "hid_dim must be divisible by n_heads"

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads

        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)

        self.fc_o = nn.Linear(hid_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

        # Non-persistent buffer: moves with the module on .to()/.cuda() but is
        # not saved in state_dict, so existing checkpoints still load.
        self.register_buffer("scale",
                             torch.sqrt(torch.FloatTensor([self.head_dim])).to(device),
                             persistent=False)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` over ``key`` / ``value``.

        Args:
            query: [batch_size, query_len, hid_dim].
            key: [batch_size, key_len, hid_dim].
            value: [batch_size, value_len, hid_dim] (value_len == key_len).
            mask: optional tensor broadcastable to
                [batch_size, n_heads, query_len, key_len]; positions where it
                equals 0 are excluded from the softmax.

        Returns:
            Tuple ``(x, attention)`` with ``x`` of shape
            [batch_size, query_len, hid_dim] and ``attention`` of shape
            [batch_size, n_heads, query_len, key_len].
        """
        batch_size = query.shape[0]

        # query_len may differ from key_len (e.g. decoder-to-encoder attention);
        # key_len and value_len are always equal.

        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)

        # Q = [batch_size, query_len, hid_dim]
        # K = [batch_size, key_len, hid_dim]
        # V = [batch_size, value_len, hid_dim]

        # Split the hidden dimension into heads and move the head axis forward.
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0,2,1,3)

        # Q = [batch_size, n_heads, query_len, head_dim]
        # K = [batch_size, n_heads, key_len, head_dim]
        # V = [batch_size, n_heads, value_len, head_dim]

        energy = torch.matmul(Q, K.permute(0,1,3,2)) / self.scale
        # energy = [batch_size, n_heads, query_len, key_len]

        if mask is not None:
            # A very negative score drives the softmax weight of masked
            # positions to zero.
            energy = energy.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(energy, dim=-1)
        # attention = [batch_size, n_heads, query_len, key_len]

        x = torch.matmul(self.dropout(attention), V)
        # x = [batch_size, n_heads, query_len, head_dim]

        x = x.permute(0,2,1,3).contiguous()
        # x = [batch_size, query_len, n_heads, head_dim]

        # Merge the heads back into a single hidden dimension.
        x = x.view(batch_size, -1, self.hid_dim)
        # x = [batch_size, query_len, hid_dim]

        x = self.fc_o(x)
        # x = [batch_size, query_len, hid_dim]

        return x, attention

#### Position-wise Feedforward Layer

In [55]:
class PositionwiseFeedforwardLayer(nn.Module):
    """Two-layer feedforward network applied to the last dimension of its
    input: hid_dim -> pf_dim (ReLU, dropout) -> hid_dim."""

    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()

        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` [batch_size, seq_len, hid_dim] to a tensor of the same shape."""

        # Expand to the inner width, then apply the non-linearity and dropout.
        hidden = self.fc_1(x)
        hidden = torch.relu(hidden)
        hidden = self.dropout(hidden)
        # hidden = [batch_size, seq_len, pf_dim]

        # Project back down to the model width.
        return self.fc_2(hidden)

## Decoder

In [56]:
class Decoder(nn.Module):
    """Transformer decoder: scaled token embeddings plus learned positional
    embeddings, a stack of ``DecoderLayer`` blocks and a final projection onto
    the target vocabulary.

    Args:
        output_dim: size of the target vocabulary.
        hid_dim: embedding / hidden size.
        n_layers: number of stacked ``DecoderLayer`` blocks.
        n_heads: attention heads per layer.
        pf_dim: inner size of each layer's position-wise feedforward block.
        dropout: dropout probability.
        device: device the module is initially created for. Kept as
            ``self.device`` for backward compatibility; tensors created in
            ``forward`` follow the input's device instead.
        max_length: number of rows in the positional embedding table, i.e. the
            longest target sequence ``forward`` accepts.
    """

    def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length = 100):
        super().__init__()

        self.device = device

        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)

        self.layers = nn.ModuleList([DecoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)])

        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

        # Non-persistent buffer: moves with the module on .to()/.cuda() but is
        # not saved in state_dict, so existing checkpoints still load.
        self.register_buffer("scale",
                             torch.sqrt(torch.FloatTensor([hid_dim])).to(device),
                             persistent=False)

    def forward(self, trg, enc_scr, trg_mask, scr_mask):
        """Decode target indices against the encoded source.

        Args:
            trg: target token indices, [batch_size, trg_len].
            enc_scr: encoder output, [batch_size, scr_len, hid_dim].
            trg_mask: [batch_size, 1, trg_len, trg_len].
            scr_mask: [batch_size, 1, 1, scr_len].

        Returns:
            Tuple ``(output, attention)``: ``output`` is
            [batch_size, trg_len, output_dim]; ``attention`` is the
            encoder-attention weights of the last layer,
            [batch_size, n_heads, trg_len, scr_len], or ``None`` when the
            decoder has no layers.

        Raises:
            IndexError: if ``trg_len`` exceeds the positional embedding table.
        """
        batch_size =  trg.shape[0]
        trg_len = trg.shape[1]

        max_length = self.pos_embedding.num_embeddings
        if trg_len > max_length:
            # Fail early with a readable message instead of an opaque
            # out-of-range lookup inside the embedding.
            raise IndexError(
                f"sequence length {trg_len} exceeds the decoder's max_length {max_length}"
            )

        # Build the position indices directly on the input's device.
        pos = torch.arange(0, trg_len, device=trg.device).unsqueeze(0).repeat(batch_size, 1)
        # pos = [batch_size, trg_len]

        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        # trg = [batch_size, trg_len, hid_dim]

        # Stays None only when the layer stack is empty.
        attention = None

        for layer in self.layers:
            trg, attention = layer(trg, enc_scr, trg_mask, scr_mask)
            # trg = [batch_size, trg_len, hid_dim]
            # attention = [batch_size, n_heads, trg_len, scr_len]

        output = self.fc_out(trg)
        # output = [batch_size, trg_len, output_dim]

        return output, attention

#### Decoder Layer

In [57]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, attention over the encoder
    output, and a position-wise feedforward network. Each sub-layer is
    followed by dropout, a residual connection and layer normalisation."""

    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()

        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_scr, trg_mask, scr_mask):
        """Run one decoder block.

        Inputs:
            trg      = [batch_size, trg_len, hid_dim]
            enc_scr  = [batch_size, scr_len, hid_dim]
            trg_mask = [batch_size, 1, trg_len, trg_len]
            scr_mask = [batch_size, 1, 1, scr_len]

        Returns ``(trg, attention)`` where ``trg`` keeps its input shape and
        ``attention`` = [batch_size, n_heads, trg_len, scr_len] holds the
        encoder-attention weights.
        """

        # Sub-layer 1: self-attention over the target (weights discarded).
        self_attended, _ = self.self_attention(trg, trg, trg, trg_mask)
        trg = self.self_attn_layer_norm(trg + self.dropout(self_attended))

        # Sub-layer 2: target queries attend over the encoder output.
        enc_attended, attention = self.encoder_attention(trg, enc_scr, enc_scr, scr_mask)
        trg = self.enc_attn_layer_norm(trg + self.dropout(enc_attended))

        # Sub-layer 3: position-wise feedforward.
        transformed = self.positionwise_feedforward(trg)
        trg = self.ff_layer_norm(trg + self.dropout(transformed))

        return trg, attention

## Seq2Seq Model

In [58]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that builds the padding and causal masks and
    runs the encoder followed by the decoder.

    Args:
        encoder: module called as ``encoder(scr, scr_mask)``.
        decoder: module called as ``decoder(trg, enc_scr, trg_mask, scr_mask)``.
        scr_pad_idx: index of the padding token in the source vocabulary.
        trg_pad_idx: index of the padding token in the target vocabulary.
        device: kept as ``self.device`` for backward compatibility; masks are
            created on the device of the tensor they are derived from.
    """

    def __init__(self, encoder, decoder, scr_pad_idx, trg_pad_idx, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.scr_pad_idx = scr_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_scr_mask(self, scr):
        """Return a boolean mask [batch_size, 1, 1, scr_len] that is True at
        every non-padding position of ``scr`` [batch_size, scr_len]."""

        scr_mask = (scr != self.scr_pad_idx).unsqueeze(1).unsqueeze(2)
        # scr_mask = [batch_size, 1, 1, scr_len]

        return scr_mask

    def make_trg_mask(self, trg):
        """Return a boolean mask [batch_size, 1, trg_len, trg_len] for ``trg``
        [batch_size, trg_len]: entry (i, j) is True when position j is not
        padding and j <= i, so a position never attends to later ones."""

        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        # trg_pad_mask = [batch_size, 1, 1, trg_len]

        trg_len = trg.shape[1]

        # Create the lower-triangular mask on the input's own device so the
        # `&` below cannot hit a device mismatch.
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = trg.device)).bool()
        # trg_sub_mask = [trg_len, trg_len]

        trg_mask = trg_pad_mask & trg_sub_mask
        # trg_mask = [batch_size, 1, trg_len, trg_len]

        return trg_mask

    def forward(self, scr, trg):
        """Encode ``scr`` [batch_size, scr_len] and decode ``trg``
        [batch_size, trg_len] against it.

        Returns:
            Tuple ``(output, attention)`` exactly as returned by the decoder:
            output = [batch_size, trg_len, output_dim],
            attention = [batch_size, n_heads, trg_len, scr_len].
        """

        scr_mask = self.make_scr_mask(scr)
        trg_mask = self.make_trg_mask(trg)

        # scr_mask = [batch_size, 1, 1, scr_len]
        # trg_mask = [batch_size, 1, trg_len, trg_len]

        enc_scr = self.encoder(scr, scr_mask)
        # enc_scr = [batch_size, scr_len, hid_dim]

        output, attention = self.decoder(trg, enc_scr, trg_mask, scr_mask)
        # output  = [batch_size, trg_len, output_dim]
        # attention = [batch_size, n_heads, trg_len, scr_len]

        return output, attention

#### Training & Testing

In [59]:
# Vocabulary sizes determine the embedding / output-projection dimensions.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HID_DIM = 256        # embedding / hidden size shared by encoder and decoder
ENC_LAYERS = 3       # number of encoder layers
DEC_LAYERS = 3       # number of decoder layers
ENC_HEADS = 8        # attention heads per encoder layer
DEC_HEADS = 8        # attention heads per decoder layer
ENC_PF_DIM = 512     # inner width of the encoder feedforward blocks
DEC_PF_DIM = 512     # inner width of the decoder feedforward blocks
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

# max_length is left at its default (100) for both modules.
enc = Encoder(INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, ENC_DROPOUT, device)

dec = Decoder(OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM, DEC_DROPOUT, device)

In [60]:
# Vocabulary indices of the padding tokens; the model uses them to build its
# masks.
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Assemble the full model and move its parameters to `device`.
model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [61]:
def initialize_weights(m):
    """Xavier-uniform initialise ``m.weight`` in place.

    Intended to be passed to ``nn.Module.apply``. Only a ``weight`` that is a
    tensor with more than one dimension is touched; modules that have no
    ``weight`` attribute, whose ``weight`` is ``None`` (for example
    ``nn.LayerNorm(..., elementwise_affine=False)``) or whose ``weight`` is
    one-dimensional are left unchanged.
    """
    weight = getattr(m, "weight", None)
    # The isinstance check also covers weight being None, which previously
    # raised AttributeError on `.dim()`.
    if isinstance(weight, torch.Tensor) and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)

# Run initialize_weights on the model and each of its submodules.
model.apply(initialize_weights)

Seq2Seq(
  (encoder): Encoder(
    (tok_embedding): Embedding(5893, 256)
    (pos_embedding): Embedding(100, 256)
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (positionwise_feedforward): PositionwiseFeedforwardLayer(
          (fc_1): Linear(in_features=256, out_features=512, bias=True)
          (fc_2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
     

In [62]:
# Step size for Adam.
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
# Target positions equal to the padding index do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [63]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training pass over ``iterator`` and return the mean batch loss.

    Args:
        model: called as ``model(src, trg_input)``; must return a tuple whose
            first element holds the scores [batch_size, trg_len - 1, output_dim].
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: optimizer stepped once per batch.
        criterion: loss applied to flattened scores and flattened targets.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()

        # The model sees every target token except the last one ...
        logits, _ = model(source, target[:, :-1])
        # logits = [batch_size, trg_len - 1, output_dim]

        vocab_size = logits.shape[-1]
        flat_logits = logits.contiguous().view(-1, vocab_size)
        # ... and is scored against every target token except the first one.
        flat_target = target[:, 1:].contiguous().view(-1)
        # flat_logits = [batch_size * (trg_len - 1), output_dim]
        # flat_target = [batch_size * (trg_len - 1)]

        batch_loss = criterion(flat_logits, flat_target)
        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

In [64]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    Args:
        model: called as ``model(src, trg_input)``; must return a tuple whose
            first element holds the scores [batch_size, trg_len - 1, output_dim].
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: loss applied to flattened scores and flattened targets.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()

    running_loss = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg

            # Feed all target tokens but the last; score against all but the first.
            logits, _ = model(source, target[:, :-1])
            # logits = [batch_size, trg_len - 1, output_dim]

            vocab_size = logits.shape[-1]
            flat_logits = logits.contiguous().view(-1, vocab_size)
            flat_target = target[:, 1:].contiguous().view(-1)
            # flat_logits = [batch_size * (trg_len - 1), output_dim]
            # flat_target = [batch_size * (trg_len - 1)]

            running_loss += criterion(flat_logits, flat_target).item()

    return running_loss / len(iterator)

In [65]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds, returned as ``(mins, secs)``."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [66]:
# Number of passes over the training data and the gradient-norm clipping
# threshold handed to train().
N_EPOCHS = 10
CLIP = 1

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Save the weights only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "my_translator.pt")

    # Report the losses and their exponentials (perplexity).
    print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train PPL = {math.exp(train_loss):7.3f}")
    print(f"\tVal. Loss: {valid_loss:.3f} | Val. PPL = {math.exp(valid_loss):7.3f}")

Epoch: 01 | Time: 5m 13s
	Train Loss: 4.365 | Train PPL =  78.643
	Val. Loss: 3.116 | Val. PPL =  22.550
Epoch: 02 | Time: 5m 13s
	Train Loss: 2.864 | Train PPL =  17.526
	Val. Loss: 2.334 | Val. PPL =  10.315
Epoch: 03 | Time: 5m 12s
	Train Loss: 2.246 | Train PPL =   9.448
	Val. Loss: 1.954 | Val. PPL =   7.060
Epoch: 04 | Time: 5m 12s
	Train Loss: 1.866 | Train PPL =   6.465
	Val. Loss: 1.751 | Val. PPL =   5.762
Epoch: 05 | Time: 5m 7s
	Train Loss: 1.593 | Train PPL =   4.917
	Val. Loss: 1.624 | Val. PPL =   5.075
Epoch: 06 | Time: 5m 8s
	Train Loss: 1.385 | Train PPL =   3.994
	Val. Loss: 1.546 | Val. PPL =   4.693
Epoch: 07 | Time: 5m 4s
	Train Loss: 1.220 | Train PPL =   3.388
	Val. Loss: 1.493 | Val. PPL =   4.451
Epoch: 08 | Time: 5m 10s
	Train Loss: 1.082 | Train PPL =   2.951
	Val. Loss: 1.482 | Val. PPL =   4.400
Epoch: 09 | Time: 8m 35s
	Train Loss: 0.965 | Train PPL =   2.624
	Val. Loss: 1.457 | Val. PPL =   4.293
Epoch: 10 | Time: 7m 59s
	Train Loss: 0.867 | Train PPL = 

In [67]:
# Restore the saved checkpoint. map_location remaps the stored tensors onto the
# current `device`, so a checkpoint written on a GPU also loads on a CPU-only
# machine.
model.load_state_dict(torch.load("my_translator.pt", map_location = device))

# Mean batch loss on the held-out test split.
test_loss = evaluate(model, test_iterator, criterion)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |")

| Test Loss: 1.488 | Test PPL:   4.428 |


#### Inference

In [68]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):
    """Greedily translate a single sentence.

    Args:
        sentence: a raw English string (tokenized with ``tokenize_en``) or an
            iterable of token strings; tokens are lower-cased either way.
        src_field: source field providing ``init_token``, ``eos_token`` and
            ``vocab.stoi``.
        trg_field: target field providing ``init_token``, ``eos_token``,
            ``vocab.stoi`` and ``vocab.itos``.
        model: model exposing ``make_scr_mask``, ``make_trg_mask``,
            ``encoder`` and ``decoder``; it is switched to eval mode.
        device: device the index tensors are moved to.
        max_len: maximum number of tokens to generate.

    Returns:
        Tuple ``(tokens, attention)``. ``tokens`` are the generated target
        tokens without the initial token; the end-of-sequence token is
        included when it was produced within ``max_len`` steps. ``attention``
        is the attention returned by the last decoder call, or ``None`` when
        no decoding step ran (``max_len <= 0``).
    """
    model.eval()

    if isinstance(sentence, str):
        # Reuse the tokenizer that is already loaded instead of loading a
        # spaCy model on every call.
        tokens = [token.lower() for token in tokenize_en(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]

    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)

    # Looked up once; compared against every predicted token below.
    eos_idx = trg_field.vocab.stoi[trg_field.eos_token]

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    # Defined up front so the return value is valid even if the loop never runs.
    attention = None

    with torch.no_grad():
        src_mask = model.make_scr_mask(src_tensor)
        enc_src = model.encoder(src_tensor, src_mask)

        for _ in range(max_len):

            # Decode the whole prefix generated so far and keep only the
            # prediction for its last position.
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

            trg_mask = model.make_trg_mask(trg_tensor)

            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)

            pred_token = output.argmax(2)[:,-1].item()

            trg_indexes.append(pred_token)

            if pred_token == eos_idx:
                break

    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]

    return trg_tokens[1:], attention

In [69]:
# Pick one example from the training split and show its tokens.
example_idx = 8

src = vars(train_data.examples[example_idx])["src"]
trg = vars(train_data.examples[example_idx])["trg"]

print(f"src = {src}")
print(f"trg = {trg}")

src = ['a', 'woman', 'with', 'a', 'large', 'purse', 'is', 'walking', 'by', 'a', 'gate', '.']
trg = ['eine', 'frau', 'mit', 'einer', 'großen', 'geldbörse', 'geht', 'an', 'einem', 'tor', 'vorbei', '.']


In [70]:
# Translate the selected training example with the current model.
translation, attention = translate_sentence(src, SRC, TRG, model, device)

print(f"predicted sentence = {translation}")

predicted sentence = ['eine', 'frau', 'mit', 'einer', 'großen', 'handtasche', 'geht', 'an', 'einem', 'tor', 'vorbei', '.', '<eos>']


In [71]:
# Pick one example from the validation split and show its tokens.
example_idx = 6

src = vars(valid_data.examples[example_idx])["src"]
trg = vars(valid_data.examples[example_idx])["trg"]

print(f"src = {src}")
print(f"trg = {trg}")

src = ['a', 'brown', 'dog', 'is', 'running', 'after', 'the', 'black', 'dog', '.']
trg = ['ein', 'brauner', 'hund', 'rennt', 'dem', 'schwarzen', 'hund', 'hinterher', '.']


In [72]:
# Translate the selected validation example with the current model.
translation, attention = translate_sentence(src, SRC, TRG, model, device)

print(f"predicted sentence = {translation}")

predicted sentence = ['ein', 'brauner', 'hund', 'rennt', 'nach', 'dem', 'schwarzen', 'hund', '.', '<eos>']


In [73]:
# Pick one example from the test split and show its tokens.
example_idx = 10

src = vars(test_data.examples[example_idx])['src']
trg = vars(test_data.examples[example_idx])['trg']

print(f'src = {src}')
print(f'trg = {trg}')

src = ['a', 'mother', 'and', 'her', 'young', 'song', 'enjoying', 'a', 'beautiful', 'day', 'outside', '.']
trg = ['eine', 'mutter', 'und', 'ihr', 'kleiner', 'sohn', 'genießen', 'einen', 'schönen', 'tag', 'im', 'freien', '.']


In [74]:
# Translate the selected test example with the current model.
translation, attention = translate_sentence(src, SRC, TRG, model, device)

print(f'predicted trg = {translation}')

predicted trg = ['eine', 'mutter', 'und', 'ihr', 'junge', 'genießen', 'ein', 'lied', 'im', 'freien', '.', '<eos>']


#### BLEU

In [75]:
from torchtext.data.metrics import bleu_score

def calculate_bleu(data, src_field, trg_field, model, device, max_len = 50):
    """Translate every example in ``data`` and score the result with
    ``bleu_score``.

    Args:
        data: iterable of examples whose ``vars()`` contain ``"src"`` and
            ``"trg"`` token lists.
        src_field, trg_field, model, device, max_len: forwarded to
            ``translate_sentence``.

    Returns:
        The value returned by ``bleu_score`` for the predicted token lists
        against one reference translation per example.
    """
    trgs = []
    pred_trgs = []

    for datum in data:

        src = vars(datum)["src"]
        trg = vars(datum)["trg"]

        pred_trg, _ = translate_sentence(src, src_field, trg_field, model, device, max_len)

        # Drop the trailing <eos> only when it is really there: a translation
        # cut off at max_len ends in a regular token that must be kept.
        if pred_trg and pred_trg[-1] == trg_field.eos_token:
            pred_trg = pred_trg[:-1]

        pred_trgs.append(pred_trg)
        # bleu_score receives a list of references per candidate.
        trgs.append([trg])

    return bleu_score(pred_trgs, trgs)

In [76]:
# Store the result under its own name: rebinding `bleu_score` would shadow the
# imported metric function and break every later call to calculate_bleu.
test_bleu = calculate_bleu(test_data, SRC, TRG, model, device)

print(f"BLEU score = {test_bleu*100:.2f}")

BLEU score = 33.02
