In [1]:
!pip install spacy sacrebleu torchdata -U
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu, spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.5.2
    Uninstalling spacy-3.5.2:
      Successfully uninstalled spacy-3.5.2
Successfully installed colorama-0.4.6 portalocker-2.

In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

cuda


In [3]:
# We need to modify the URLs for the dataset since the links to the original dataset are broken
# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}


In [4]:
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')


# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found.
# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    vocab_transform[ln].set_default_index(UNK_IDX)

## Encoding
In the positional encoding, why are we using a combination of sinus and cosinus ?

    We want to modelize for each word the distance with other words. We want the way of calculating this to be length independant, deterministic, output a unique result for each word and it should generalize easily i.e values should be bounded. The best way to generate for each word this distance is using sinus and cosinus. Indeed they answer criterions previously mentioned. And result of cosine and sinus will correspond to the intuition that farther are words from a given word lower (means closer to 0) will be their distance associated, closer thery are bigger will be the distance (bigger means closer to 1 in that case).
    Cosiine and sinus have other practical properties like by modleing it like a mtrix multiplication we find that it can be modeled like a linear trnasformation. Also, distance between words is symetrical. That means that in the previous sentence "words" is as far of "distance" as symetrical is of "words".

In the Seq2SeqTransformer class,

    What is the parameter nhead for?
    
        The nhead parameter stands for the number of heads of the multi attention head layer.
    
    What is the point of the generator?
    
        "generator" is a linear layer apply to forward layers. This means every layers will take the same input.
    
Describe the goal of the create_mask function. Why does it handle differently the source and target masks?
    
    The create mask function return mask to apply to the output of attentions layer in the decoder. The goal of this masks is to prevent the transformer from making prediction on futur values. It handles source and target masks separatly because we do not necessarly need this precaution for source. Also we could want other behavior and we could have had it if the mask were different.


In [5]:
# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [6]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [7]:
torch.manual_seed(72)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [8]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

In [9]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

In [10]:
from timeit import default_timer as timer
NUM_EPOCHS = 25

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))




Epoch: 1, Train loss: 5.324, Val loss: 4.110, Epoch time = 45.054s
Epoch: 2, Train loss: 3.761, Val loss: 3.339, Epoch time = 44.297s
Epoch: 3, Train loss: 3.159, Val loss: 2.901, Epoch time = 43.881s
Epoch: 4, Train loss: 2.767, Val loss: 2.640, Epoch time = 44.451s
Epoch: 5, Train loss: 2.478, Val loss: 2.460, Epoch time = 43.612s
Epoch: 6, Train loss: 2.245, Val loss: 2.326, Epoch time = 44.693s
Epoch: 7, Train loss: 2.055, Val loss: 2.212, Epoch time = 43.431s
Epoch: 8, Train loss: 1.895, Val loss: 2.147, Epoch time = 44.747s
Epoch: 9, Train loss: 1.752, Val loss: 2.083, Epoch time = 43.562s
Epoch: 10, Train loss: 1.629, Val loss: 2.024, Epoch time = 44.668s
Epoch: 11, Train loss: 1.519, Val loss: 1.986, Epoch time = 43.550s
Epoch: 12, Train loss: 1.421, Val loss: 1.995, Epoch time = 44.519s
Epoch: 13, Train loss: 1.332, Val loss: 1.996, Epoch time = 43.533s
Epoch: 14, Train loss: 1.248, Val loss: 1.961, Epoch time = 44.073s
Epoch: 15, Train loss: 1.175, Val loss: 1.933, Epoch time

In [11]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys

# function to generate output sequence using greedy algorithm
def top_k_decode(model, src, src_mask, max_len, start_symbol, k, temperature):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        # Apply temperature to the probs
        logits = prob / temperature

        # Apply top-k sampling
        filtered_logits, indices = torch.topk(logits, k)
        probabilities = nn.Softmax(dim=-1)(filtered_logits)
        next_word_index = torch.multinomial(probabilities, num_samples=1).squeeze()
        next_word = indices[0][next_word_index.item()]

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# function to generate output sequence using greedy algorithm
def top_p_decode(model, src, src_mask, max_len, start_symbol, p, temperature):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        # Apply temperature to the prob
        logits = prob / temperature

        # Apply top-p (nucleus) sampling
        probabilities = nn.Softmax(dim=-1)(logits)
        sorted_logits, sorted_indices = torch.sort(probabilities, descending=True)
        cumulative_probs = torch.cumsum(sorted_logits, dim=-1)
        mask = cumulative_probs[0] < p
        mask[0] = True
        top = sorted_logits[0][mask]
        next_word_index = torch.multinomial(top, num_samples=1).squeeze()
        next_word = sorted_indices[0][next_word_index.item()]

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys

# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str, decode_function, **kargs):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = decode_function(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, **kargs).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

# Test

In [88]:
def test_translation(sample: str, target: str = None, k_list: list =[], p_list: list =[], temperature_list: list =[]) -> None:
  """
  Print the result of the different decode functions
  sample = the sample to test
  target = the target of the sample
  k_list = list of the k parameters to test
  p_list = list of the p parameters to test
  temperature_list = list of the temperature parameters to test
  return void
  """
  print(" ==== ")
  print("sample: '" + sample + "'")
  if target:
    print("target: '" + target + "'")
  
  print("\n==== Greedy ====")
  print("  '" + translate(transformer, sample, greedy_decode) + "'")
  
  print("\n==== Top K Decode ====")
  for k in k_list:
    for temperature in temperature_list:
      pred = translate(transformer, sample, top_k_decode, k=k, temperature=temperature)
      print(f"K={k} Temperature={temperature}: '{pred}'")
  
  print("\n==== Top P Decode ====")

  for p in p_list:
    for temperature in temperature_list:
      pred = translate(transformer, sample, top_p_decode, p=p, temperature=temperature)
      print(f"P={p} Temperature={temperature}: '{pred}'")

In [89]:
val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))

# Iterate on the i first element of the validation dataset and call test_translation
i = 3
for src, tgt in val_iter:
  test_translation(src, tgt, k_list=[2,3,5,10], p_list=[0.01,0.1,0.15], temperature_list=[1,2,3])
  if i < 0:
    break
  i -= 1

 ==== 
sample: 'Eine Gruppe von Männern lädt Baumwolle auf einen Lastwagen'
target: 'A group of men are loading cotton onto a truck'

==== Greedy ====
  ' A group of men are putting cotton into cotton candy . '

==== Top K Decode ====
K=2 Temperature=1: ' Group of men loading cotton into a truck . '
K=2 Temperature=2: ' A group of guys loading cotton into a truck . '
K=2 Temperature=3: ' A group of guys loading cotton candy onto a truck tunnel . '
K=3 Temperature=1: ' A group of men are loading into some cotton bowl . '
K=3 Temperature=2: ' The group of men is spinning dough into a truck . '
K=3 Temperature=3: ' Group of men loading cotton into a truck . '
K=5 Temperature=1: ' A group of men are putting cotton into into a truck . '
K=5 Temperature=2: ' A group of men are putting device into a truck . '
K=5 Temperature=3: ' A group of men making clay bowl raised onto truck . '
K=10 Temperature=1: ' A group of men loading cotton into a truck . '
K=10 Temperature=2: ' A group of men pole 

Compute the BLEU score of the model

In [90]:
from sacrebleu.metrics import BLEU, CHRF, TER

In [102]:
def score(k, p, temperature):
  """
  Compute the bleu score on the greedy, top_k and top_p decode.
  k = The k parameter for the top_k function
  p = The p parameter for the top_p function
  temperature = the temperature for the decode functions
  return greedy_score, top_k_score, top_p_score
  """
  bleu = BLEU()

  val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
  
  preds_k = []
  preds_p = []
  preds_greedy = []
  targets = []
  for src, tgt in val_iter:
    targets.append(tgt)

    pred_k = translate(transformer, src, top_k_decode, k=k, temperature=temperature)
    preds_k.append(pred_k)

    pred_p = translate(transformer, src, top_p_decode, p=p, temperature=temperature)
    preds_p.append(pred_p)

    pred_greedy = translate(transformer, src, greedy_decode)
    preds_greedy.append(pred_greedy)
  
  targets = [targets]
  greedy_score = bleu.corpus_score(preds_greedy, targets)
  top_k_score = bleu.corpus_score(preds_k, targets)
  top_p_score = bleu.corpus_score(preds_p, targets)

  return greedy_score, top_k_score, top_p_score

In [103]:
g, k, p = score(3, 0.15, 3)

In [104]:
print(f"Greedy score: {g}")
print(f"Top k score: {k}")
print(f"Top g score: {p}")

Greedy score: BLEU = 35.63 67.9/43.8/28.7/19.1 (BP = 0.997 ratio = 0.997 hyp_len = 13250 ref_len = 13289)
Top k score: BLEU = 18.48 53.9/25.5/12.7/6.7 (BP = 1.000 ratio = 1.007 hyp_len = 13376 ref_len = 13289)
Top g score: BLEU = 1.39 22.0/3.5/0.7/0.1 (BP = 1.000 ratio = 1.044 hyp_len = 13869 ref_len = 13289)


* The first number refers to the final BLEU score.
* The next 4 numbers represents the precision value for 1–4 ngram order.
* BP is the brevity penalty
* ratio indicates the ratio between hypothesis length and reference length
* hyp_len refers to the total number of characters for hypothesis text
* ref_len is the total number of characters for reference text

Here we can see that the greedy decode have the best final BLEU score.