# Evaluating machine translation with the BLEU score

So far, we have evaluated our model (both for validation and test) using the same loss as for training (cross-entropy). However, this loss is not relevant or meaningful in terms of comparing sentences.

A more suitable metric is the [BLEU](https://en.wikipedia.org/wiki/BLEU) score, which compares a predicted translation to a (set of) references. It's implemented in torchtext, therefore easy to use.

Below, we provide the same preprocessing and model as in the previous lab (again, we use a subset and a small model for speed, but you can change these for performance).

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import math
import time

from matplotlib import pyplot as plt
import matplotlib.ticker as ticker

# We'll be using torchtext and spacy to do most of the pre-processing
import spacy
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score

# Set a random seed for reproducibility
# (the same value seeds Python's `random` module, NumPy and PyTorch)
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# German and English specific pipelines
# (in this notebook only their `.tokenizer` is used, by the tokenize_* functions below)
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

# Tokenizers
def tokenize_de(text):
    """Tokenize a German string with the spaCy tokenizer and return the token texts as a list."""
    doc = spacy_de.tokenizer(text)
    return [token.text for token in doc]

def tokenize_en(text):
    """Tokenize an English string with the spaCy tokenizer and return the token texts as a list."""
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

# Fields: both languages are lower-cased and wrapped in <sos> / <eos> tokens
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)

# Dataset (German source, English target)
train_data, valid_data, test_data = Multi30k.splits(root='data/', exts = ('.de', '.en'), fields = (SRC, TRG))

# Take a subset of the dataset (for speed)
# Each split is truncated from its OWN examples, so that the test subset is
# not a slice of the training data.
train_data.examples = train_data.examples[:1000]
valid_data.examples = valid_data.examples[:100]
test_data.examples = test_data.examples[:100]

# Vocabulary (built from the training subset only)
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

# Dataloader (here we keep the validation dataloader)
batch_size = 128
train_dataloader, valid_dataloader, test_dataloader = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size = batch_size)

# Fetch one example
example_batch = next(iter(train_dataloader))

In [None]:
# Index to string functions
def itos_list_de(tensor_indx):
    """Map a sequence of source (German) vocabulary indices to the corresponding tokens."""
    index_to_token = SRC.vocab.itos
    return [index_to_token[idx] for idx in tensor_indx]

def itos_list_en(tensor_indx):
    """Map a sequence of target (English) vocabulary indices to the corresponding tokens."""
    index_to_token = TRG.vocab.itos
    return [index_to_token[idx] for idx in tensor_indx]

In [None]:
# Define all the parameters of the network.
input_dim = len(SRC.vocab)    # size of the source vocabulary
output_dim = len(TRG.vocab)   # size of the target vocabulary
embedding_dim_enc = 32        # embedding size in the encoder
embedding_dim_dec = 32        # embedding size in the decoder
hidden_dim_enc = 50           # hidden size of the encoder GRU
hidden_dim_dec = 50           # hidden size of the decoder GRU
n_layers = 1                  # number of GRU layers (encoder and decoder)
dropout_rate = 0.5            # dropout applied to the embeddings

In [None]:
# GRU encoder
class Encoder(nn.Module):
    """GRU encoder: embeds the source indices, applies dropout, then runs a GRU."""

    def __init__(self, input_dim, embedding_dim_enc, hidden_dim_enc, n_layers, dropout_rate):
        super().__init__()

        # Layers, in the order they are applied: embedding -> dropout -> GRU
        self.embedding_layer = nn.Embedding(input_dim, embedding_dim_enc)
        self.dropout = nn.Dropout(dropout_rate)
        self.gru = nn.GRU(embedding_dim_enc, hidden_dim_enc, n_layers)

        # Keep the hyper-parameters available as attributes
        self.input_dim = input_dim
        self.embedding_dim_enc = embedding_dim_enc
        self.hidden_dim_enc = hidden_dim_enc
        self.n_layers = n_layers
        self.dropout_rate = dropout_rate

    def forward(self, src):
        """Encode `src` (a tensor of source indices).

        Returns both outputs of the GRU: the set of all outputs and the
        final hidden state (the context vector).
        """
        embedded = self.dropout(self.embedding_layer(src))
        enc_outputs, enc_context = self.gru(embedded)
        return enc_outputs, enc_context
    

# Attention module
class Attention(nn.Module):
    """Attention weights over the encoder outputs, given the decoder hidden state.

    The decoder hidden state is concatenated with every encoder output, passed
    through a linear layer followed by tanh (the "energy"), projected to a
    scalar score per source position, and normalized with a softmax over the
    source positions.
    """

    def __init__(self, hidden_dim_enc, hidden_dim_dec):
        super().__init__()

        # Define the energy layer and the weighted sum layer
        self.energy_layer = nn.Sequential(nn.Linear(hidden_dim_enc + hidden_dim_dec, hidden_dim_dec), torch.nn.Tanh())
        self.v = nn.Linear(hidden_dim_dec, 1, bias = False)

    def forward(self, dec_hidden, enc_outputs):
        """Compute the attention weights.

        Args:
            dec_hidden: decoder hidden state, [n_layers, batch_size, hidden_dim_dec]
                (a [batch_size, hidden_dim_dec] tensor is also accepted).
            enc_outputs: encoder outputs, [src_len, batch_size, hidden_dim_enc].

        Returns:
            A [batch_size, src_len] tensor whose rows sum to 1.
        """
        src_len = enc_outputs.shape[0]

        # Drop the layer dimension explicitly by keeping the last layer only.
        # A bare squeeze() would also remove the batch or feature dimension
        # whenever one of them has size 1, and cannot handle several layers.
        if dec_hidden.dim() == 3:
            dec_hidden = dec_hidden[-1]

        # Repeat decoder hidden state src_len times
        dec_hidden = dec_hidden.unsqueeze(0).repeat(src_len, 1, 1)

        # Permute dec_hidden and enc_outputs so that 'batch_size' is the first dimension
        dec_hidden = dec_hidden.permute(1, 0, 2)
        enc_outputs = enc_outputs.permute(1, 0, 2)

        # Concatenate the inputs (hidden state of the decoder and encoder outputs)
        # Apply the linear layer to get the energy
        comb_inputs = torch.cat((dec_hidden, enc_outputs), dim = 2)
        energy = self.energy_layer(comb_inputs)

        # Apply the second linear layer to get a [batch_size, src_len] attention vector,
        # normalized over the source positions
        attn = self.v(energy).squeeze(2)
        attn = torch.softmax(attn, dim=1)

        return attn

# GRU decoder with attention
class Decoder(nn.Module):
    """GRU decoder with attention, processing one target token per call."""

    def __init__(self, output_dim, embedding_dim_dec, hidden_dim_enc, hidden_dim_dec, n_layers, dropout_rate):
        super().__init__()

        # Layers and attention module
        self.embedding_layer = nn.Embedding(output_dim, embedding_dim_dec)
        self.dropout_layer = nn.Dropout(dropout_rate)
        self.gru = nn.GRU(embedding_dim_dec + hidden_dim_enc, hidden_dim_dec, n_layers)
        self.linear_layer = nn.Linear(hidden_dim_dec, output_dim)
        self.attention = Attention(hidden_dim_enc, hidden_dim_dec)

        # Keep the hyper-parameters available as attributes
        self.output_dim = output_dim
        self.embedding_dim_dec = embedding_dim_dec
        self.hidden_dim_enc = hidden_dim_enc
        self.hidden_dim_dec = hidden_dim_dec
        self.n_layers = n_layers
        self.dropout_rate = dropout_rate

    def forward(self, input_idx, input_hidden, enc_outputs):
        """Run one decoding step.

        Returns a tuple `(pred_proba, hidden, a)`: the output of the final
        linear layer, the new GRU hidden state, and the attention weights
        (with an extra dimension inserted at position 1).
        """
        # Embed the input token, apply dropout, and add a leading dimension
        embedded = self.embedding_layer(input_idx)
        embedded = self.dropout_layer(embedded).unsqueeze(0)

        # Attention weights, with an extra dimension so they can be used in a batched product
        attn_weights = self.attention(input_hidden, enc_outputs).unsqueeze(1)

        # Weighted combination of the encoder outputs (batch dimension first for bmm)
        context = torch.bmm(attn_weights, enc_outputs.permute(1, 0, 2))
        context = context.permute(1, 0, 2)

        # The GRU takes the embeddings concatenated with the weighted vector
        gru_input = torch.cat((embedded, context), dim=2)
        gru_output, hidden = self.gru(gru_input, input_hidden)

        # Remove the leading dimension and project onto the output vocabulary
        pred_proba = self.linear_layer(gru_output.squeeze(0))

        return pred_proba, hidden, attn_weights

# Full model
class Seq2Seq(nn.Module):
    """Full encoder-decoder model with greedy decoding.

    Args:
        encoder: module called as `encoder(src)`, returning `(enc_outputs, hidden)`.
        decoder: module called as `decoder(input_idx, hidden, enc_outputs)`,
            returning `(pred_proba, hidden, a)`; it must expose `output_dim`.
        sos_idx: index fed to the decoder at the first step. Defaults to 2,
            the value previously hard-coded here; it is expected to be the
            index of `<sos>` in the target vocabulary
            (NOTE(review): confirm with `TRG.vocab.stoi['<sos>']`).
    """

    def __init__(self, encoder, decoder, sos_idx=2):
        super().__init__()

        # Store the encoder, decoder, and the target vocabulary size
        self.encoder = encoder
        self.decoder = decoder
        self.trg_vocab_size = decoder.output_dim
        self.sos_idx = sos_idx

    def forward(self, src, trg_len):
        """Decode `trg_len` steps from `src`.

        At each step the decoder is fed its own most probable token from the
        previous step.

        Returns:
            pred_probas: [trg_len, batch_size, trg_vocab_size]; entry 0 is a
                one-hot vector on `sos_idx`, the others are the decoder outputs.
            attn: [batch_size, src_len, trg_len] attention weights (step 0 is zero).
        """
        batch_size = src.shape[-1]
        src_len = src.shape[0]

        # Allocate everything on the device of the input, so that the model
        # also works when it has been moved to a GPU
        device = src.device
        pred_probas = torch.zeros(trg_len, batch_size, self.trg_vocab_size, device=device)
        pred_probas[0, :, self.sos_idx] = 1
        attn = torch.zeros(batch_size, src_len, trg_len, device=device)

        enc_outputs, hidden = self.encoder(src)

        # First decoder input: the start-of-sentence index for every sentence
        input_idx = torch.full((batch_size,), self.sos_idx, dtype=torch.long, device=device)

        for t in range(1, trg_len):
            pred_proba_t, hidden, a = self.decoder(input_idx, hidden, enc_outputs)
            pred_probas[t, :, :] = pred_proba_t
            input_idx = pred_proba_t.argmax(-1)
            # Explicit reshape: a bare squeeze() would also drop the batch or
            # source dimension when it has size 1
            attn[:, :, t] = a.reshape(batch_size, src_len)

        return pred_probas, attn

In [None]:
# Instantiate the encoder, the decoder, and the full model
encoder = Encoder(input_dim, embedding_dim_enc, hidden_dim_enc, n_layers, dropout_rate)
decoder = Decoder(output_dim, embedding_dim_dec, hidden_dim_enc, hidden_dim_dec, n_layers, dropout_rate)
model = Seq2Seq(encoder, decoder)

## Computing BLEU

To compute the BLEU score, we first need to compute the set of all references and predicted sentences over the dataset. As an example, we do that hereafter on a single batch.

In [None]:
# Get the source and target sentences in the example batch
src, trg = example_batch.src, example_batch.trg

# Apply the model
# (the first dimension of trg is used as the number of decoding steps)
# NOTE(review): model.eval() has not been called at this point, so dropout is presumably still active - confirm this is intended
trg_len = trg.shape[0]
pred_probas, _ = model(src, trg_len)

# Get the predicted target indices (of maximum proba) for the batch
trg_pred = torch.argmax(pred_probas, -1)

# Take one sentence in the batch (for both trg and trg_pred) as example
# (sentences are indexed along the second dimension)
ind_sent = 0
trg_pred_ex = trg_pred[:, ind_sent]
trg_ex = trg[:, ind_sent]

# We need to convert the indices back to tokens.
translation_pred = itos_list_en(trg_pred_ex)
translation = itos_list_en(trg_ex)
print(translation)
print(translation_pred)

We need to define a `cut_tokens` function which removes the useless tokens for computing the BLEU score. For instance, if the input sentence is:

`["<sos>", "Hello", "world", "!", "<eos>", "<pad>", "<pad>", "<pad>"]`

then the output should be:

\["Hello", "world", "!"\]

In [None]:
# Remove the special tokens that are useless for computing the BLEU score
def cut_tokens(sentence, sos_token='<sos>', eos_token='<eos>', pad_token='<pad>'):
    """Return the tokens of `sentence` that matter for the BLEU score.

    The sentence is cut at the first `eos_token` (everything from there on is
    dropped), and any `sos_token` / `pad_token` before it is removed. If there
    is no `eos_token`, the whole sentence is kept (minus start/padding tokens).

    Args:
        sentence: list of string tokens.
        sos_token, eos_token, pad_token: the special tokens to strip.

    Returns:
        A new list of tokens; `sentence` itself is not modified.
    """
    sentence_cut = []
    for token in sentence:
        # Nothing after the end-of-sentence token is part of the translation
        if token == eos_token:
            break
        if token != sos_token and token != pad_token:
            sentence_cut.append(token)

    return sentence_cut

# TO DO: Apply it to translation and translation_pred


Now that we have seen how to get a clean predicted and reference sentence, we can simply loop over the sentences in the batch. To do so, we initialize empty lists `trg_all = []` and `trg_pred_all = []` to store the result. We then loop over sentences and perform the operations above until we have clean sentences. Finally, we append them to `trg_all` and `trg_pred_all`.

**Note**: storing the predicted sentences is straightforward (`trg_pred_all.append(translation_pred_cut)`). However, for the references, we should append a *list of sentences* instead of a sentence. Indeed, the BLEU function expects us to provide possibly many sentences as reference translation. As a result, to append the reference we need to do: `trg_all.append([translation_cut])`

In [None]:
# Initialize the list of predictions and references
# trg_pred_all: one predicted sentence (list of tokens) per entry
# trg_all: one list of reference sentences per entry
trg_pred_all = []
trg_all = []

# TO DO: as explained above, store all the reference and predicted target sentences in the batch


We can compute the BLEU score on this batch. BLEU ranges between 0 (bad) and 1 (perfect). The result on the batch should be 0 (or close to 0), because the model has not been trained yet.

In [None]:
# BLEU score of the predicted sentences against their references
bleu = bleu_score(trg_pred_all, trg_all)
print('BLEU score on the batch: ', bleu)

## Evaluation and training

We can now write a function that performs evaluation by computing the BLEU score over an `eval_dataloader`.

In [None]:
def _indices_to_sentences(indices):
    """Convert a [seq_len, batch_size] tensor of target indices into clean token lists.

    Each column is mapped to tokens with the target vocabulary; the sentence
    is cut at the first end-of-sentence token, and start / padding tokens are
    removed. Returns one list of tokens per sentence in the batch.
    """
    itos = TRG.vocab.itos
    eos_token = TRG.eos_token
    skipped_tokens = (TRG.init_token, TRG.pad_token)

    sentences = []
    for sentence_indices in indices.t().tolist():
        tokens = []
        for idx in sentence_indices:
            token = itos[idx]
            if token == eos_token:
                break
            if token not in skipped_tokens:
                tokens.append(token)
        sentences.append(tokens)

    return sentences


def evaluate_bleu(model, eval_dataloader, device='cpu'):
    """Compute the BLEU score of `model` over all the batches of `eval_dataloader`.

    Args:
        model: called as `model(src, trg_len)`, returning `(pred_probas, attn)`.
        eval_dataloader: iterable of batches exposing `.src` and `.trg`.
        device: device on which the model and the data are placed.

    Returns:
        The BLEU score returned by `bleu_score` on the whole dataset.

    Note:
        The model is switched to evaluation mode and is left in that mode.
    """
    model.eval()
    model.to(device)

    trg_pred_all = []
    trg_all = []

    # No gradient is needed for evaluation
    with torch.no_grad():
        for batch in eval_dataloader:

            # Get the source and target sentence, and the target length, copy it to device
            src, trg = batch.src.to(device), batch.trg.to(device)
            trg_len = trg.shape[0]

            # Apply the model
            pred_probas, _ = model(src, trg_len)

            # Get the predicted index (of maximum proba)
            trg_pred = torch.argmax(pred_probas, -1)

            # Store the clean predicted sentences, and the clean references
            # (each reference is wrapped in a list, since BLEU accepts several references per sentence)
            trg_pred_all.extend(_indices_to_sentences(trg_pred))
            trg_all.extend([reference] for reference in _indices_to_sentences(trg))

    # Get the BLEU score from the true and predicted targets
    bleu = bleu_score(trg_pred_all, trg_all)

    return bleu

In [None]:
# Training function with validation using the BLEU score
# It's similar to lab4.2, but here keep in mind that BLEU should be increased (unlike the previous lab where the loss should decrease)

def training_validation_bleu(model, train_dataloader, num_epochs, loss_fn, optimizer, model_name, valid_dataloader=None, device='cpu', verbose=True):
    """Train `model`, optionally validating with the BLEU score after each epoch.

    Args:
        model: called as `model(src, trg_len)`, returning `(pred_probas, attn)`.
        train_dataloader: iterable of batches exposing `.src` and `.trg`.
        num_epochs: number of passes over `train_dataloader`.
        loss_fn: loss called as `loss_fn(predictions, targets)`.
        optimizer: optimizer over the model parameters.
        model_name: path passed to `torch.save` to store the `state_dict` of the
            model each time the validation BLEU improves.
        valid_dataloader: if provided, BLEU is computed on it after every epoch;
            if None, no validation is performed and nothing is saved.
        device: device on which the model and the data are placed.
        verbose: if True, print the training loss (and validation BLEU) at each epoch.

    Returns:
        `(loss_train_total, bleu_val_total)`: the summed batch losses of each
        epoch, and the validation BLEU of each epoch (empty without validation).

    Note:
        On return the model holds the weights of the last epoch; the weights
        with the best validation BLEU are the ones saved at `model_name`.
    """
    model.to(device)
    loss_train_total = []
    bleu_val_total = []

    # Best validation BLEU so far: start below the minimum BLEU (0) so that
    # the first validated epoch is always saved
    bleu_val_opt = -1.0

    for epoch in range(num_epochs):

        # Validation switches the model to evaluation mode, so (re)enable
        # training mode at the beginning of every epoch
        model.train()
        loss_current_epoch = 0

        for batch in train_dataloader:

            # Get the source and target sentence, and the target length, copy it to device
            src, trg = batch.src.to(device), batch.trg.to(device)
            trg_len = trg.shape[0]

            # Set the gradients at 0
            optimizer.zero_grad()

            # Apply the model
            pred_probas, _ = model(src, trg_len)

            # Remove the first token (always <sos>) to compute the loss
            output_dim = pred_probas.shape[-1]
            pred_probas = pred_probas[1:]

            # Reshape the pred_probas and target
            pred_probas = pred_probas.view(-1, output_dim)
            trg = trg[1:].view(-1)

            # Backpropagation (with gradient clipping)
            loss = loss_fn(pred_probas, trg)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # Record the loss
            loss_current_epoch += loss.item()

        # At the end of each epoch, save the loss accumulated over batches and display it
        loss_train_total.append(loss_current_epoch)
        if verbose:
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss_current_epoch))

        # Perform validation: save the current model only if it increases
        # performance (i.e., increases BLEU) on the validation set
        if valid_dataloader is not None:
            bleu_val = evaluate_bleu(model, valid_dataloader, device)
            bleu_val_total.append(bleu_val)
            if verbose:
                print('--> Validation BLEU: {:.4f}'.format(bleu_val))
            if bleu_val > bleu_val_opt:
                bleu_val_opt = bleu_val
                torch.save(model.state_dict(), model_name)

    return loss_train_total, bleu_val_total

In [None]:
# Training parameters
num_epochs = 30
# Index of the padding token in the target vocabulary: these positions are ignored by the loss
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
loss_fn = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
# Adam over all the model parameters, with its default settings
optimizer = optim.Adam(model.parameters())

# TO DO: Train the model


In [None]:
# TO DO: Evaluate it on the test set in terms of BLEU


Again, the results won't be good since we used a simple model / subset. You can use the whole dataset and a big model for obtaining a decent performance (the BLEU score on the test set should be slightly above 0.2 after 10 epochs).

Feel free to play around with these scripts: you can compare the final performance with the one from the previous script (that is, not using validation with BLEU); you can display attention as we did in 5.1, etc. Enjoy!