In [1]:
# ! python -m spacy download en_core_web_sm --quiet
# ! python -m spacy download de_core_news_sm --quiet

In [2]:
# references: 
# https://hussainwali.medium.com/transforming-your-text-data-with-pytorch-12ec1b1c9ae6
# https://github.com/bentrevett/pytorch-seq2seq/tree/main
# https://adeveloperdiary.com/data-science/deep-learning/nlp/machine-translation-recurrent-neural-network-pytorch/

In [1]:
import torch
import torch.nn as nn
import torchtext
import spacy
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
import tqdm
import numpy as np
import datasets
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "bentrevett/multi30k" dataset and pull out its three splits.
# Each example carries an English sentence under "en" and a German one under "de".
dataset = datasets.load_dataset("bentrevett/multi30k")
train_data = dataset["train"]
valid_data = dataset["validation"]
test_data = dataset["test"]

In [3]:
# Load the small English and German spaCy pipelines (the ones named in the
# commented-out download commands at the top of the notebook). Only their
# `.tokenizer` attribute is used by the cells below.
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [4]:
def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize one parallel example into English and German token lists.

    Args:
        example: mapping holding the raw sentences under the keys "en" and "de".
        en_nlp: object whose ``tokenizer`` attribute splits English text into
            items exposing a ``.text`` attribute.
        de_nlp: same as ``en_nlp`` but for the German sentence.
        max_length: maximum number of tokens kept per sentence; the sos/eos
            markers are added afterwards and are not counted.
        lower: when truthy, every token is lower-cased; otherwise the original
            casing is kept.
        sos_token: marker prepended to each token list.
        eos_token: marker appended to each token list.

    Returns:
        dict with the keys "en_tokens" and "de_tokens", each a list of strings.
    """
    def _tokenize(nlp, text):
        # Truncate first so the sos/eos markers are never cut off.
        tokens = [token.text for token in nlp.tokenizer(text)][:max_length]
        if lower:
            tokens = [token.lower() for token in tokens]
        return [sos_token, *tokens, eos_token]

    return {
        "en_tokens": _tokenize(en_nlp, example["en"]),
        "de_tokens": _tokenize(de_nlp, example["de"]),
    }

In [5]:
# Tokenization settings shared by all three splits.
sos_token = "<sos>"
eos_token = "<eos>"
lower = True
max_length = 1_000

# Extra keyword arguments handed to tokenize_example for every example.
fn_kwargs = dict(
    en_nlp=en_nlp,
    de_nlp=de_nlp,
    max_length=max_length,
    lower=lower,
    sos_token=sos_token,
    eos_token=eos_token,
)

# Each map call adds the "en_tokens" / "de_tokens" columns to the split.
train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

In [6]:
# Vocabulary settings.
unk_token = "<unk>"
pad_token = "<pad>"
min_freq = 2

# The same specials list is used for both languages so that the special
# tokens can share indices across the two vocabularies.
special_tokens = [unk_token, pad_token, sos_token, eos_token]

# Vocabularies are built from the training split only.
en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["en_tokens"],
    specials=special_tokens,
    min_freq=min_freq,
    max_tokens=1_000,
)

de_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["de_tokens"],
    specials=special_tokens,
    min_freq=min_freq,
    max_tokens=1_000,
)

In [7]:
# Look up the special-token indices once, then check that the German
# vocabulary agrees, because a single unk/pad index is used for both languages.
unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

assert unk_index == de_vocab[unk_token]
assert pad_index == de_vocab[pad_token]

# Out-of-vocabulary tokens map to <unk> instead of raising.
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

In [8]:
def numericalize_example(example, en_vocab, de_vocab):
    """Convert the token lists of one example into lists of vocabulary indices.

    Args:
        example: mapping with the keys "en_tokens" and "de_tokens".
        en_vocab: vocabulary exposing ``lookup_indices`` for the English tokens.
        de_vocab: vocabulary exposing ``lookup_indices`` for the German tokens.

    Returns:
        dict with the keys "en_ids" and "de_ids".
    """
    return {
        "en_ids": en_vocab.lookup_indices(example["en_tokens"]),
        "de_ids": de_vocab.lookup_indices(example["de_tokens"]),
    }

In [9]:
# Reuse the fn_kwargs name, this time for the numericalization step.
fn_kwargs = dict(en_vocab=en_vocab, de_vocab=de_vocab)

# Each map call adds the "en_ids" / "de_ids" columns to the split.
train_data = train_data.map(numericalize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs)

In [10]:
# Inspect the first training example after tokenization and numericalization.
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 'en_ids': [2, 16, 24, 15, 25, 778, 17, 57, 80, 202, 0, 5, 3],
 'de_ids': [2, 18, 26, 253, 30, 84, 20, 88, 7, 15, 110, 0, 0, 4, 3]}

In [11]:
# Return the id columns as torch tensors; output_all_columns=True keeps the
# remaining columns (raw text and token lists) in each example as well.
format_columns = ["en_ids", "de_ids"]
data_type = "torch"

train_data = train_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

valid_data = valid_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

test_data = test_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

In [17]:
# Same example again: "en_ids" and "de_ids" now come back as torch tensors.
train_data[0]

{'en_ids': tensor([  2,  16,  24,  15,  25, 778,  17,  57,  80, 202,   0,   5,   3]),
 'de_ids': tensor([  2,  18,  26, 253,  30,  84,  20,  88,   7,  15, 110,   0,   0,   4,
           3]),
 'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

In [18]:
def get_collate_fn(pad_index):
    """Build a collate function that pads the id sequences of a batch.

    Args:
        pad_index: value used to fill positions past the end of shorter sequences.

    Returns:
        A function taking a list of examples (each with "en_ids" and "de_ids"
        tensors) and returning a dict with the same two keys, where each value
        is the padded batch laid out as [seq_len, batch_sz].
    """
    def collate_fn(batch):
        padded = {}
        for key in ("en_ids", "de_ids"):
            sequences = [example[key] for example in batch]
            # pad_sequence defaults to batch_first=False, hence [seq_len, batch_sz].
            padded[key] = nn.utils.rnn.pad_sequence(sequences, padding_value=pad_index)
        return padded

    return collate_fn

In [19]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a DataLoader that pads each batch with ``pad_index``.

    Args:
        dataset: dataset whose examples contain "en_ids" and "de_ids" tensors.
        batch_size: number of examples per batch.
        pad_index: padding value forwarded to the collate function.
        shuffle: whether the loader shuffles the examples.

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=get_collate_fn(pad_index),
        shuffle=shuffle,
    )

In [20]:
# Number of sentence pairs per batch.
batch_size = 128

# Only the training loader shuffles; validation and test keep dataset order
# (get_data_loader defaults to shuffle=False).
train_dataloader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_dataloader = get_data_loader(valid_data, batch_size, pad_index)
test_dataloader = get_data_loader(test_data, batch_size, pad_index)

In [21]:
# Peek at one training batch (index 7) to confirm the padded layout.
# Note: list(...) iterates the entire loader just to pick that one batch.
list(train_dataloader)[7]["en_ids"].shape
# each batch is of shape [seq_len, batch_sz]

torch.Size([36, 128])

In [22]:
# RNN Encoder-Decoder implementation, following Cho et al. (2014): https://arxiv.org/pdf/1406.1078

### GRU Cell

In [37]:
class GRUEncoderCell(nn.Module):
    """One gated recurrent step: combines an input vector with the previous hidden state.

    Args:
        input_dim: size of the last dimension of the input ``x``.
        hidden_dim: size of the hidden state ``h``.
    """
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.input_size = input_dim
        self.hidden_size = hidden_dim
        # hidden_dim is the number of units in the layer. Each unit processes the
        # sentence one token at a time (one call per time step), and the gating
        # mechanisms below decide how much of the past it keeps.

        # Input projections (W*) map input_dim -> hidden_dim.
        self.W = nn.Linear(input_dim, hidden_dim, bias=True)
        self.Wz = nn.Linear(input_dim, hidden_dim, bias=True)
        self.Wr = nn.Linear(input_dim, hidden_dim, bias=True)
        # Recurrent projections (U*) map hidden_dim -> hidden_dim, NOT input_dim -> hidden_dim.
        self.U = nn.Linear(hidden_dim, hidden_dim, bias=True)
        self.Uz = nn.Linear(hidden_dim, hidden_dim, bias=True)
        self.Ur = nn.Linear(hidden_dim, hidden_dim, bias=True)

    def forward(self, x, h):
        """Return the next hidden state given input ``x`` and previous state ``h``."""
        update_gate = torch.sigmoid(self.Wz(x) + self.Uz(h))
        reset_gate = torch.sigmoid(self.Wr(x) + self.Ur(h))
        # The reset gate scales the projected previous state inside the candidate.
        candidate = torch.tanh(self.W(x) + reset_gate * self.U(h))
        carried_over = update_gate * h
        newly_written = (1 - update_gate) * candidate
        return carried_over + newly_written
        

In [38]:
class Encoder(nn.Module):
    """Encode a batch of source token ids into a single context vector per sentence.

    Args:
        vocab_sz: number of rows in the source embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the recurrent hidden state and of the context vector.
        n_layers: accepted for signature compatibility; not used by this implementation.
        p_dropout: probability for ``self.dropout``. The dropout module is
            constructed but not applied in ``forward``.
    """
    def __init__(self, vocab_sz, embedding_dim, hidden_dim, n_layers=1, p_dropout=0.5):
        super().__init__()
        self.vocab_sz = vocab_sz
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(vocab_sz, embedding_dim)
        self.gru = GRUEncoderCell(embedding_dim, hidden_dim)
        self.dropout = nn.Dropout(p_dropout)
        self.V = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, src):
        """Run the recurrent cell over ``src`` and return ``(None, context)``.

        Args:
            src: integer tensor of token ids laid out as [seq_len, batch_sz].

        Returns:
            A tuple ``(None, context)`` where ``context`` is
            [1, batch_sz, hidden_dim]. The first element is a placeholder.
        """
        seq_len, batch_sz = src.shape
        # embedded_src is [seq_len, batch_sz, embedding_dim]
        embedded_src = self.embedding(src)
        # The initial hidden state is [batch_sz, hidden_dim] (one row per sentence,
        # NOT [seq_len, batch_sz]). new_zeros allocates it on the same device and
        # with the same dtype as the embeddings, so the module also works when it
        # has been moved off the CPU.
        hidden = embedded_src.new_zeros(batch_sz, self.hidden_dim)
        for t in range(seq_len):
            # embedded_src[t] is [batch_sz, embedding_dim]; in W(x) + U(h) the
            # input x is [batch_sz, embedding_dim] and h is [batch_sz, hidden_dim].
            hidden = self.gru(embedded_src[t], hidden)
        # Summarise the final hidden state into the context vector.
        context = torch.tanh(self.V(hidden))
        return None, context.unsqueeze(0)

# wee test
# NOTE(review): the names assigned in this cell stay in the notebook's global
# namespace. In particular `batch_size` below rebinds the value (128) that was
# used earlier to build the data loaders.
voc_sz = 2
b_sz = 3
seq_l = 4
n_lrs = 1
x = torch.randint(0, 2, (seq_l, b_sz))
# input is [4, 3]
# Encoder's positional parameters are (vocab_sz, embedding_dim, hidden_dim, n_layers),
# so b_sz doubles as the embedding size and seq_l as the hidden size here.
enc = Encoder(voc_sz, b_sz, seq_l, n_lrs)
o, h = enc(x)
# o is None: the encoder only returns the context vector
print(h.shape)
# [1, batch_sz=3, hidden_dim=4]
assert list(h.shape) == [1, 3, 4]

# Example parameters
vocab_size = 50  # Size of vocabulary
embedding_dim = 16  # Size of embeddings
hidden_size = 32  # Size of hidden state
seq_len = 10
batch_size = 4

# Random input tokens (integer indices), laid out as [seq_len, batch_size]
input_tokens = torch.randint(0, vocab_size, (seq_len, batch_size))

# Initialize encoder
encoder = Encoder(vocab_size, embedding_dim, hidden_size)

# Forward pass; hidden_states is None, final_hidden_state is the context vector
hidden_states, final_hidden_state = encoder(input_tokens)

# print(f"Hidden states shape: {hidden_states.shape}")
print(f"Final hidden state shape: {final_hidden_state.shape}")

torch.Size([1, 3, 4])
Final hidden state shape: torch.Size([1, 4, 32])


In [171]:
class GRUDecoderCell(nn.Module):
    """Decoder state update built from factorized projections and a maxout activation.

    Despite the name, no gates are computed here: the previous hidden state, the
    current target embedding and the context are each projected through a
    ``factor_dim`` bottleneck to ``2 * hidden_dim`` units, summed, and reduced by
    taking the max over adjacent pairs.

    Args:
        factor_dim: size of the bottleneck in each two-layer projection.
        hidden_dim: size of the returned hidden state.
        io_dim: accepted for signature compatibility; not used.
        embedding_dim: size of the target embeddings.
        context_dim: size of the context vector.
        p_dropout: probability for ``self.dropout``. The dropout module is
            constructed but not applied in ``forward``.
    """
    def __init__(self, factor_dim, hidden_dim, io_dim, embedding_dim, context_dim, p_dropout=0.5):
        super().__init__()
        self.Oh1 = nn.Linear(hidden_dim, factor_dim, bias=True)
        # The second layer outputs 2 * hidden_dim because maxout keeps the larger of each pair.
        self.Oh2 = nn.Linear(factor_dim, hidden_dim * 2, bias=True)
        self.Oy1 = nn.Linear(embedding_dim, factor_dim, bias=True)
        self.Oy2 = nn.Linear(factor_dim, hidden_dim * 2, bias=True)
        self.Oc1 = nn.Linear(context_dim, factor_dim, bias=True)
        self.Oc2 = nn.Linear(factor_dim, hidden_dim * 2, bias=True)
        self.dropout = nn.Dropout(p_dropout)

    def forward(self, trg_embeddings, hidden, context):
        """Compute the next hidden state for one time step.

        Args:
            trg_embeddings: [1, batch_sz, embedding_dim] (as passed by the
                one-step decoder) or [batch_sz, embedding_dim].
            hidden: previous hidden state; last dimension is hidden_dim.
            context: context vector; last dimension is context_dim.

        Returns:
            ``(None, hidden)`` where ``hidden`` is [batch_sz, hidden_dim]. The
            first element is a placeholder. Previously it was the embedding size,
            leaked through the throwaway name ``_``.
        """
        # The batch axis is second-to-last for both accepted layouts.
        batch_sz = trg_embeddings.shape[-2]
        Oh = self.Oh2(self.Oh1(hidden))
        Oy = self.Oy2(self.Oy1(trg_embeddings))
        Oc = self.Oc2(self.Oc1(context))
        # Pre-activation state; the last dimension is hidden_dim * 2 after broadcasting.
        s_prime = Oh + Oy + Oc
        # Maxout: group the units into adjacent pairs and keep the larger of each.
        s_prime = s_prime.view(batch_sz, -1, 2)
        hidden = s_prime.max(dim=-1).values
        return None, hidden

# wee test
# NOTE(review): these names stay in the notebook's global namespace and some
# (e.g. fac_dim, ctxt_dim) are reused by later test cells.
fac_dim = 2
hid_dim = 4
embed_dim = 5
ctxt_dim = 4
io_dim = 3
sq_len = 3
btch_sz = 5
voc_sz = 2
# one time step of embeddings: [1, batch_sz, embedding_dim]
ttrg_embs = torch.randn(1, btch_sz, embed_dim)
# context and previous hidden state: [batch_sz, dim]
tctxt = torch.randn(btch_sz, ctxt_dim)
thidden = torch.randn(btch_sz, hid_dim)

test_dec = GRUDecoderCell(fac_dim, hid_dim, io_dim, embed_dim, ctxt_dim)
o, h = test_dec(ttrg_embs, thidden, tctxt)
# only the new hidden state is checked; the first return value is not used
assert list(h.shape) == [btch_sz, hid_dim]

In [208]:
class DecoderOneStep(nn.Module):
    """Decode one target time step: embed the token, update the state, emit logits.

    Args:
        input_output_dim: target vocabulary size (embedding rows and logit width).
        factor_dim: bottleneck size forwarded to ``GRUDecoderCell``.
        embedding_dim: size of the target embeddings.
        hidden_dim: size of the decoder hidden state.
        context_dim: size of the encoder context vector.
        n_layers: accepted for signature compatibility; not used.
        p_dropout: dropout probability applied to the embeddings.
    """
    def __init__(self, input_output_dim, factor_dim, embedding_dim, hidden_dim, context_dim, n_layers, p_dropout = 0.5):
        super().__init__()
        self.input_output_dim = input_output_dim
        self.embedding = nn.Embedding(input_output_dim, embedding_dim)
        # Pass this instance's own vocabulary size. The original read a notebook
        # global named `io_dim`, which only existed because a test cell defined it.
        self.gru = GRUDecoderCell(factor_dim, hidden_dim, input_output_dim, embedding_dim, context_dim, p_dropout)
        self.dropout = nn.Dropout(p_dropout)
        # Projects the hidden state to vocabulary logits so callers can argmax over tokens.
        self.fc = nn.Linear(hidden_dim, input_output_dim)

    def forward(self, trg, hidden, context):
        """Advance the decoder by one token.

        Args:
            trg: current target token ids, shape [batch_sz].
            hidden: previous hidden state, forwarded to the cell.
            context: encoder context, forwarded to the cell.

        Returns:
            ``(out, hidden)`` where ``out`` holds the logits as
            [batch_sz, input_output_dim] and ``hidden`` is the cell's new hidden
            state with a leading axis of size 1 added.
        """
        # Only one token per sentence arrives, so add the time axis: [1, batch_sz].
        trg = trg.unsqueeze(0)
        embeddings = self.dropout(self.embedding(trg))
        _, hidden = self.gru(embeddings, hidden, context)
        # Turn the hidden state into logits for prediction. The hidden state
        # itself is returned as well so decoding can continue from it.
        out = self.fc(hidden)
        # `out` is returned as-is. A squeeze(0) here would be a no-op for
        # batch_sz > 1 but would silently drop the batch axis when batch_sz == 1.
        return out, hidden.unsqueeze(0)


# wee test
# NOTE(review): these names stay in the notebook's global namespace after the cell runs.
io_dim = 5
fac_dim = 2
emb_dim = 10
h_dim = 4
ctxt_dim = 4
n_lrs = 1
voc_sz = 3
d1s = DecoderOneStep(io_dim, fac_dim, emb_dim, h_dim, ctxt_dim, n_lrs) 
# one token id per sentence: shape [batch_sz] with batch_sz == 4
single_step_input = torch.tensor([1,2,2,1])
h = torch.randn(4, h_dim) # [batch_sz, hidden_dim]; batch_sz is 4 to match single_step_input
ctxt = torch.randn(4, ctxt_dim)
o1s, h1s = d1s(single_step_input, h, ctxt)
# logits: [batch_sz, io_dim]
assert list(o1s.shape) == [4, 5]
assert list(h1s.shape) == [1, 4, 4] # leading 1 comes from hidden.unsqueeze(0) in DecoderOneStep.forward

In [210]:
class Decoder(nn.Module):
    """Unroll a one-step decoder over a whole target sequence.

    Args:
        one_step_decoder: module called as ``(input_, hidden, context)`` that
            returns ``(logits, hidden)`` and exposes ``input_output_dim``.
        hidden_emb: size of the decoder hidden state.
    """
    def __init__(self, one_step_decoder, hidden_emb):
        super().__init__()
        self.hidden_dim = hidden_emb
        self.one_step_decoder = one_step_decoder
        # Not used in forward (the context-based initial state is disabled).
        # Kept so the module's parameters stay the same.
        self.V = nn.Linear(hidden_emb, hidden_emb, bias=True)

    def forward(self, trg, context, teacher_forcing_ratio=0.5):
        """Produce logits for every target position.

        Args:
            trg: target token ids, shape [seq_len, batch_sz]; row 0 is the first
                decoder input.
            context: encoder context, forwarded unchanged to every step.
            teacher_forcing_ratio: probability, drawn once per time step for the
                whole batch, of feeding the ground-truth token instead of the
                model's own argmax prediction.

        Returns:
            Tensor of shape [seq_len, batch_sz, vocab_sz]. Row 0 is never
            written and stays all zeros.
        """
        seq_len, batch_sz = trg.shape
        vocab_sz = self.one_step_decoder.input_output_dim
        # Allocate on the context's device and dtype rather than always on CPU in float32.
        hidden = context.new_zeros(batch_sz, self.hidden_dim)
        predictions = context.new_zeros(seq_len, batch_sz, vocab_sz)
        # The first decoder input is the first row of the target.
        input_ = trg[0, :]
        for t in range(1, seq_len):
            # input_ is [batch_sz]; pred holds the logits for position t.
            pred, hidden = self.one_step_decoder(input_, hidden, context)
            predictions[t] = pred
            is_teacher_force = random.random() < teacher_forcing_ratio
            input_ = trg[t] if is_teacher_force else pred.argmax(dim=-1)
        return predictions

# wee test
# NOTE(review): fac_dim and ctxt_dim are not defined in this cell; they are
# picked up from the earlier test cells via the notebook's global namespace.
io_dim = 5
emb_dim = 10
h_dim = 4
n_lrs = 1
d1s = DecoderOneStep(io_dim, fac_dim, emb_dim, h_dim, ctxt_dim, n_lrs) 
# stands in for the encoder context: [1, batch_sz, hidden_dim]
h = torch.randn(1, 4, 4)
b_sz = 4
seq_l = 5
dec = Decoder(d1s, h_dim)
# random target ids: [seq_len, batch_sz]
y = torch.randint(0, 2, (seq_l, b_sz))
odec = dec(y, h)
# one logit vector per target position and sentence
assert list(odec.shape) == [seq_l, b_sz, io_dim]

In [211]:
class EncDec(nn.Module):
    """Sequence-to-sequence wrapper that chains an encoder and a decoder.

    Args:
        encoder: module called as ``encoder(src)`` that returns a pair whose
            second element is the context.
        decoder: module called as ``decoder(trg, context, teacher_forcing_ratio)``.
    """
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Encode ``src`` and decode against ``trg``.

        Args:
            src: source batch passed to the encoder.
            trg: target batch passed to the decoder.
            teacher_forcing_ratio: forwarded to the decoder. Defaults to 0.5,
                the same default as ``Decoder.forward``.

        Returns:
            Whatever the decoder returns (its predictions).
        """
        # The encoder's first return value is not used.
        _, context = self.encoder(src)
        return self.decoder(trg, context, teacher_forcing_ratio)

In [212]:
def init_model(en_vocab, de_vocab, pad_index, device=None):
    """Build the seq2seq model together with its optimizer and loss function.

    Args:
        en_vocab: source vocabulary; ``len(en_vocab)`` sizes the encoder embedding.
        de_vocab: target vocabulary; ``len(de_vocab)`` sizes the decoder embedding
            and the output layer.
        pad_index: target index ignored by the loss.
        device: optional device to move the model to before the optimizer is
            created. With the default ``None`` the model stays where PyTorch
            created it. When a device is given, callers must supply batches on
            that same device.

    Returns:
        ``(seq2seq, optimizer, criterion)``.
    """
    # Model hyperparameters.
    input_dim = len(en_vocab)
    output_dim = len(de_vocab)
    encoder_embedding_dim = 256
    decoder_embedding_dim = 256
    factorized_dim = 64
    hidden_dim = 512
    context_dim = 512
    encoder_dropout = 0.5
    decoder_dropout = 0.5
    n_layers = 1

    # Assemble the model.
    encoder = Encoder(input_dim, encoder_embedding_dim, hidden_dim, n_layers, encoder_dropout)
    decoder_one_step = DecoderOneStep(output_dim, factorized_dim, decoder_embedding_dim, hidden_dim, context_dim, n_layers, decoder_dropout)
    decoder = Decoder(decoder_one_step, hidden_dim)
    seq2seq = EncDec(encoder, decoder)

    # The original computed a device and then never used it. Placement is now an
    # explicit opt-in, so the default behaviour is unchanged.
    if device is not None:
        seq2seq = seq2seq.to(device)

    # Adam with PyTorch's default settings.
    optimizer = optim.Adam(seq2seq.parameters())

    # Padding positions in the target do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

    return seq2seq, optimizer, criterion

In [213]:
def _mean_loss(losses):
    """Return the arithmetic mean of ``losses``, or NaN when the list is empty."""
    return sum(losses) / len(losses) if losses else float("nan")


def _batch_loss(model, criterion, batch, teacher_forcing_ratio, device):
    """Run one batch through ``model`` and return the loss from ``criterion``."""
    # src is [src_seq_len, batch_sz] and trg is [trg_seq_len, batch_sz], as
    # produced by this notebook's collate function.
    src = batch["en_ids"].to(device)
    trg = batch["de_ids"].to(device)

    # y_pred is [trg_seq_len, batch_sz, trg_vocab_sz]
    y_pred = model(src, trg, teacher_forcing_ratio=teacher_forcing_ratio)
    trg_vocab_sz = y_pred.shape[-1]

    # Position 0 of the predictions is never filled by the decoder, so drop it
    # from both tensors. The loss takes logits of shape [N, C] and class indices
    # of shape [N], hence the flattening to
    # [(trg_seq_len - 1) * batch_sz, trg_vocab_sz] and [(trg_seq_len - 1) * batch_sz].
    y_pred = y_pred[1:].reshape(-1, trg_vocab_sz)
    trg = trg[1:].reshape(-1)
    return criterion(y_pred, trg)


def train(train_dataloader, val_dataloader, en_vocab, de_vocab, pad_index, n_epochs=3):
    """Train a freshly initialised model and report the per-epoch losses.

    Args:
        train_dataloader: iterable of batches with "en_ids" and "de_ids" tensors.
        val_dataloader: same structure, used for validation after every epoch.
        en_vocab: source vocabulary, forwarded to ``init_model``.
        de_vocab: target vocabulary, forwarded to ``init_model``.
        pad_index: padding index, forwarded to ``init_model``.
        n_epochs: number of passes over ``train_dataloader``.

    Returns:
        The trained model.
    """
    model, optimizer, criterion = init_model(en_vocab, de_vocab, pad_index)
    # Batches are moved to wherever the model's parameters live.
    device = next(model.parameters()).device
    clip = 1  # maximum gradient norm

    for epoch in tqdm.tqdm(range(n_epochs)):
        # training
        model.train()
        training_losses = []
        for batch in train_dataloader:
            optimizer.zero_grad()
            loss = _batch_loss(model, criterion, batch, 0.5, device)
            loss.backward()
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            training_losses.append(loss.item())
        print(f"epoch {epoch} average training loss: {_mean_loss(training_losses)}")

        # validation: no teacher forcing, no gradients
        model.eval()
        validation_losses = []
        with torch.no_grad():
            for batch in val_dataloader:
                loss = _batch_loss(model, criterion, batch, 0.0, device)
                validation_losses.append(loss.item())
        print(f"epoch {epoch} average validation loss: {_mean_loss(validation_losses)}")
    return model

In [214]:
# Train the model (3 epochs by default) and display its module tree.
model = train(train_dataloader, valid_dataloader, en_vocab, de_vocab, pad_index)
model

  0%|                                                   | 0/3 [00:00<?, ?it/s]

epoch 0 average training loss: 3.9202741358248674


 33%|██████████████                            | 1/3 [02:59<05:58, 179.18s/it]

epoch 0 average validation loss: 3.854813516139984
epoch 1 average training loss: 3.0957113866764017


 67%|████████████████████████████              | 2/3 [05:42<02:49, 169.90s/it]

epoch 1 average validation loss: 3.337374657392502
epoch 2 average training loss: 2.672259381164013


100%|██████████████████████████████████████████| 3/3 [08:23<00:00, 167.76s/it]

epoch 2 average validation loss: 3.1824993193149567





EncDec(
  (encoder): Encoder(
    (embedding): Embedding(998, 256)
    (gru): GRUEncoderCell(
      (W): Linear(in_features=256, out_features=512, bias=True)
      (Wz): Linear(in_features=256, out_features=512, bias=True)
      (Wr): Linear(in_features=256, out_features=512, bias=True)
      (U): Linear(in_features=512, out_features=512, bias=True)
      (Uz): Linear(in_features=512, out_features=512, bias=True)
      (Ur): Linear(in_features=512, out_features=512, bias=True)
    )
    (dropout): Dropout(p=0.5, inplace=False)
    (V): Linear(in_features=512, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (one_step_decoder): DecoderOneStep(
      (embedding): Embedding(998, 256)
      (gru): GRUDecoderCell(
        (Oh1): Linear(in_features=512, out_features=64, bias=True)
        (Oh2): Linear(in_features=64, out_features=1024, bias=True)
        (Oy1): Linear(in_features=256, out_features=64, bias=True)
        (Oy2): Linear(in_features=64, out_features=1024, bias=True)
 

In [215]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total


# Report the trainable-parameter count of the trained model with thousands separators.
print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 3,012,774 trainable parameters


In [225]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    """Greedily translate one English sentence with a trained model.

    Args:
        sentence: raw English text.
        model: trained model exposing ``encoder``, ``decoder.one_step_decoder``
            and ``decoder.hidden_dim``.
        en_nlp: object whose ``tokenizer`` splits the sentence into items with ``.text``.
        de_nlp: accepted for signature compatibility; not used.
        en_vocab: source vocabulary (``lookup_indices``).
        de_vocab: target vocabulary (``lookup_indices``, ``lookup_tokens``, ``[]``).
        sos_token: start-of-sequence marker.
        eos_token: end-of-sequence marker; decoding stops when it is predicted.
        device: device on which the input tensors are created.
        max_output_length: maximum number of decoding steps.

    Returns:
        List of predicted target tokens, without the sos/eos markers.
    """
    model.eval()
    with torch.no_grad():
        tokens = [token.text for token in en_nlp.tokenizer(sentence)]
        print(tokens)
        tokens = [sos_token] + [token.lower() for token in tokens] + [eos_token]
        ids = en_vocab.lookup_indices(tokens)
        print(f"in tokens: {ids}")
        # [src_seq_len, 1]: a batch holding the single sentence
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        _, context = model.encoder(tensor)
        # Batch size is 1 because a single sentence is translated. The hidden
        # size is read from the model instead of being hard-coded to 512.
        hidden = torch.zeros(1, model.decoder.hidden_dim, device=device)
        next_token = de_vocab.lookup_indices([sos_token])[0]
        eos_index = de_vocab[eos_token]
        outputs = []
        for _ in range(max_output_length):
            step_input = torch.LongTensor([next_token]).to(device)
            output, hidden = model.decoder.one_step_decoder(step_input, hidden, context)
            # Greedy decoding: take the highest-scoring token.
            next_token = output.argmax(dim=-1).item()
            if next_token == eos_index:
                break
            outputs.append(next_token)
        print(f"out tokens: {outputs}")
        tokens = de_vocab.lookup_tokens(outputs)
    return tokens

In [226]:
# Pick a random test example and compare the model's translation with the reference.
test_sentences = test_data
rando_idx = np.random.randint(low=0, high=len(test_sentences))
expected_translation = test_sentences[rando_idx]["de"]
sentence = test_sentences[rando_idx]["en"]

translation = translate_sentence(
    sentence=sentence,
    model=model,
    en_nlp=en_nlp,
    de_nlp=de_nlp,
    en_vocab=en_vocab,
    de_vocab=de_vocab,
    sos_token="<sos>",
    eos_token="<eos>",
    device="cpu",
)

print(f"\nsentence: {sentence}\n")
print(f"expected_translation: {expected_translation}\n")
# translation is a list of token strings; join them with single spaces
print(f"actual translation: {' '.join(translation)}")

['Several', 'Asian', 'men', 'wearing', 'black', 'clothing', 'in', 'some', 'kind', 'of', 'station', '.']
in tokens: [2, 113, 106, 30, 22, 26, 219, 6, 74, 958, 12, 481, 5, 3]
out tokens: [105, 315, 30, 7, 478, 883, 7, 6, 0, 4]

sentence: Several Asian men wearing black clothing in some kind of station.

expected_translation: Mehrere asiatische Männer in schwarzer Kleidung in einer Art Station.

actual translation: vier asiatische männer in uniform anzügen in einem <unk> .
