### Objective

Build a machine translation model (Spanish --> English) using a standard encoder-decoder architecture.

### Setup

In [1]:
# Basic imports needed by the setup cells below
import os
import numpy as np

# Mount Google Drive at /content/gdrive (Colab-only API).
# force_remount=True requests a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
# Check for GPU free memory
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
#!pip install psutil
#!pip install humanize
import psutil
import humanize
import GPUtil as GPU
GPUs = GPU.getGPUs()

# Colab exposes at most one GPU and does not guarantee one at all, so fall
# back to None instead of raising IndexError on a CPU-only runtime.
gpu = GPUs[0] if GPUs else None


def printm():
  """Print a short memory report.

  Shows the host RAM still available, the resident size of this Python
  process and, when a GPU was detected, its free/used/total memory and
  memory utilisation as reported by GPUtil.
  """
  process = psutil.Process(os.getpid())
  print('='*40)
  print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
  if gpu is None:
    # No GPU attached to this runtime: report it instead of crashing.
    print("GPU RAM: no GPU detected")
  else:
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
  print('='*40)


printm()

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Gen RAM Free: 12.9 GB  | Proc size: 156.3 MB
GPU RAM Free: 15079MB | Used: 0MB | Util   0% | Total 15079MB


In [0]:
# Make the project folder on the mounted Google Drive the working directory,
# so that relative paths used below resolve inside it and models saved during
# training end up on the drive.
root_dir = "/content/gdrive/My Drive/NLP/Git_MT"
os.chdir(root_dir)

### Imports

In [0]:
# insert the path for utility custom functions
import sys
sys.path.insert(0, os.path.join(root_dir, 'code_utils'))

# custom python functions and classes
from utils import read_corpus, batch_iter, pad_sents
from vocab import Vocab, VocabEntry

In [0]:
# basic packages
import math
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter, namedtuple
from docopt import docopt
from itertools import chain
import json
from typing import List, Tuple, Dict, Set, Union
from docopt import docopt

#pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence



#others
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
from tqdm import tqdm
from IPython.core.debugger import set_trace

In [0]:
# Logger

import logging


def configure_logger(name: str = "Tensor", log_file: str = "Tensor_file.log") -> logging.Logger:
    """Return the logger `name`, attaching its handlers only once.

    The logger is set to DEBUG and writes every record both to `log_file`
    and to the console, formatted as ``time:name:message``.

    Handlers are attached only when the logger has none of its own, so
    re-running the notebook cell does not stack duplicate handlers (which
    would print and write every message several times).

    @param name (str): name of the logger to configure
    @param log_file (str): path of the log file
    @returns log (logging.Logger): the configured logger
    """
    log = logging.getLogger(name)
    log.setLevel(logging.DEBUG)

    # `log.handlers` lists only the handlers attached to this very logger;
    # handlers installed on the root logger by the environment are ignored.
    if not log.handlers:
        formatter = logging.Formatter('%(asctime)s:%(name)s:%(message)s')
        for handler in (logging.FileHandler(log_file), logging.StreamHandler()):
            handler.setFormatter(formatter)
            log.addHandler(handler)

    return log


logger = configure_logger()


# A helper function to check how tensor sizes change
def log_size(tsr: torch.Tensor, name: str) -> None:
    """Log the shape of `tsr`, labelled with `name`, at DEBUG level."""
    logger.debug(msg=f"{name} ==> size={tsr.shape}")

In [0]:
#! pip freeze > requirements.txt

### Load Data: EDA

In [9]:
# load data
train_es = 'en_es_data/train.es'
train_en = 'en_es_data/train.en'

dev_es = 'en_es_data/dev.es'
dev_en = 'en_es_data/dev.en'

test_es = 'en_es_data/test.es'
test_en = 'en_es_data/test.en'


train_data_src = read_corpus(train_es, source='src')
train_data_tgt = read_corpus(train_en, source='tgt')

dev_data_src = read_corpus(dev_es, source='src')
dev_data_tgt = read_corpus(dev_en, source='tgt')

test_data_src = read_corpus(test_es, source='src')
test_data_tgt = read_corpus(test_en, source='tgt')

# zip() stops at the shorter input, so a misaligned parallel corpus would be
# silently truncated into wrong sentence pairs; fail loudly instead.
assert len(train_data_src) == len(train_data_tgt), 'train.es and train.en differ in number of sentences'
assert len(dev_data_src) == len(dev_data_tgt), 'dev.es and dev.en differ in number of sentences'
assert len(test_data_src) == len(test_data_tgt), 'test.es and test.en differ in number of sentences'

# Pair every source sentence with its target sentence
train_data = list(zip(train_data_src,train_data_tgt))
dev_data = list(zip(dev_data_src,dev_data_tgt))
test_data = list(zip(test_data_src,test_data_tgt))

# Dataset sizes
print("=="*40)
print("Number of examples in train: {}".format(len(train_data)))
print("Number of examples in valid: {}".format(len(dev_data)))
print("Number of examples in test: {}".format(len(test_data)))
# Show the first dev pair as an example
print("=="*40)
print("Spanish --> English")
es, en = next(iter(dev_data))
print("Sp: {}".format(' '.join(es)))
print("En: {}".format(' '.join(en)))
print("=="*40)

# Build Vocab with train set

size = 50000
freq_cutoff= 2
vocab_file = 'en_es_data/vocab.json'

vocab = Vocab.build(train_data_src, train_data_tgt, size, freq_cutoff)
print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

vocab.save(vocab_file)
print('vocabulary saved to %s' % vocab_file)

# Explain the size difference between the two vocabularies.
# Adjacent string literals are used instead of backslash continuations: a
# backslash inside a literal keeps the next line's leading indentation as
# part of the printed text.
print("=="*40)
print('Note that the <s> and </s> tokens are added while vocab initialization.\n'
      'These tokens are also present in target top frequent words. \n'
      'That is why vocab size for target language is lesser by 2.')
print("=="*40)


# Sanity-check the word -> index conversion on two toy sentences
print("=="*40)
sents = ['I asgjsssd will be there for you.'.split(), 'This is spartaaaaaaaa.'.split()]
print("Tokenize:\n {} \n {}\n".format(*(' '.join(words) for words in sents)))

# Convert the sentences with the target-language vocabulary, on the CPU
print(vocab.tgt.to_input_tensor(sents, "cpu"))
#
print("=="*40)
print("Note that 3 and 0  are <unk> and <pad> tokens!")
print("=="*40)

Number of examples in train: 216617
Number of examples in valid: 851
Number of examples in test: 8064
Spanish --> English
Sp: El ao pasado proyect estas dos diapositivas para demostrar que la capa de hielo rtico, que durante los ltimos tres millones de aos ha sido del tamao de los 48 estados, se ha reducido en un 40 por ciento.
En: <s> Last year I showed these two slides so that  demonstrate that the arctic ice cap,  which for most of the last three million years  has been the size of the lower 48 states,  has shrunk by 40 percent. </s>
initialize source vocabulary ..
number of word types: 172418, number of word types w/ frequency >= 2: 80623
initialize target vocabulary ..
number of word types: 128873, number of word types w/ frequency >= 2: 64215
generated vocabulary, source 50004 words, target 50002 words
vocabulary saved to en_es_data/vocab.json
Note that the <s> and </s> tokens are added while vocab      initialization.
These tokens are also present in target      top frequent wor

### Encoder-decoder model parts

#### Model Embedding

In [0]:
class ModelEmbeddings(nn.Module):
    """
    Container for the word-embedding tables of the source and target languages.
    """
    def __init__(self, embed_size, vocab):
        """
        Create one embedding layer per language.

        @param embed_size (int): Embedding size (dimensionality)
        @param vocab (Vocab): Vocabulary object exposing `src` and `tgt`;
                              each must support len() and lookup of '<pad>'.
                              See vocab.py in code_utils.
        """
        super().__init__()
        self.embed_size = embed_size

        # Source-language table: one row per source word, <pad> index as padding_idx
        self.source = nn.Embedding(
            num_embeddings=len(vocab.src),
            embedding_dim=embed_size,
            padding_idx=vocab.src['<pad>'],
        )

        # Target-language table, built the same way from the target vocabulary
        self.target = nn.Embedding(
            num_embeddings=len(vocab.tgt),
            embedding_dim=embed_size,
            padding_idx=vocab.tgt['<pad>'],
        )
      


#### Encoder 

In [0]:
## Encoder Block 
class Encoder(nn.Module):

  """ LSTM encoder: turns a batch of padded source sentences into the final
  hidden and cell states of the LSTM.
  """

  def __init__(self, embed_size, hidden_size, word_mat, n_layers=1, dropout_rate=0.2):

    """ Set up the encoder
    @ param embed_size: embedding dimension
    @ param hidden_size: # of hidden units for LSTM
    @ param word_mat: ModelEmbeddings object; its `source` embedding is used
    @ param n_layers: # of LSTM layers
    @ param dropout_rate: dropout probability applied to the embeddings
    """

    super().__init__()

    # hyper-parameters, kept as attributes
    self.embed_size, self.hidden_size = embed_size, hidden_size
    self.n_layers, self.dropout_rate = n_layers, dropout_rate

    # embedding table of the source language
    self.embed_mat = word_mat.source

    # Layers
    self.seq_src = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, num_layers=n_layers)
    self.dropout = nn.Dropout(p=dropout_rate)

  def forward(self, source_padded, source_lengths):

    """ Encode a batch of source sentences
        @param: source_padded ==> Tensor (T,b) #T = max src length, b= batch size
        @param: source_lengths ==> List (b) : length of source sentences.
                pack_padded_sequence is called with its default settings, so
                the batch is expected to be ordered by decreasing length.
        @return last_hidden ==> Tensor (n_layers,b,h) # h = hidden_size
        @return last_cell ==> Tensor (n_layers,b,h) # h = hidden_size
    """
    embedded = self.embed_mat(source_padded)
    # pack the (dropped-out) embeddings so that padded steps are skipped by the LSTM
    packed = pack_padded_sequence(self.dropout(embedded), source_lengths)
    _, final_state = self.seq_src(packed)
    last_hidden, last_cell = final_state

    return last_hidden, last_cell


#### Decoder

In [0]:
# Decoder block
class Decoder(nn.Module):

  """ LSTM decoder: consumes one target token per call and scores every word
  of the target vocabulary as the next token.
  """

  def __init__(self, embed_size, hidden_size, word_mat, n_layers=1, dropout_rate =0.2):

    """ Set up the decoder
    @ param embed_size: embedding dimension
    @ param hidden_size: # of hidden units for LSTM
    @ param word_mat: ModelEmbeddings object; its `target` embedding is used
    @ param n_layers: # of LSTM layers
    @ param dropout_rate: dropout probability applied to the LSTM output
    """

    super().__init__()

    # hyper-parameters, kept as attributes
    self.embed_size, self.hidden_size = embed_size, hidden_size
    self.n_layers, self.dropout_rate = n_layers, dropout_rate

    # embedding table of the target language; its row count is the vocabulary size
    self.embed_mat = word_mat.target
    self.vocab_size = self.embed_mat.weight.shape[0]

    # Layers
    self.seq_tgt = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, num_layers=n_layers)
    self.dropout = nn.Dropout(p=dropout_rate)
    self.out = nn.Linear(in_features=hidden_size, out_features=self.vocab_size)

  def forward(self, Y_in, dec_state):

    """ Run a single decoding step.

    @ param: Y_in: indices of the input tokens at time t, Tensor (1,b)
    @ param: dec_state: (hidden, cell) LSTM state before this step
    @ returns dec_pred: Tensor (b, vocab_size) of unnormalized scores for the
              token at time t+1, and the updated (hidden, cell) state
    """

    step_embed = self.embed_mat(Y_in) #(1,b,embed_size)
    lstm_out, next_state = self.seq_tgt(step_embed, dec_state)
    # drop the length-1 time axis, then regularize
    features = self.dropout(lstm_out.squeeze(0)) #(b,hidden_size)

    return self.out(features), next_state

### Seq2Seq Model: Encoder-Decoder

In [0]:
class NMT(nn.Module):
  """ Encoder-decoder translation model.

  Wraps an encoder and a decoder that share hidden size, embedding size and
  number of layers, plus the vocabulary used to turn sentences into tensors.
  """

  def __init__(self, vocab, encoder, decoder):
    """ Assemble the model.
    @param vocab: vocabulary object exposing `src` and `tgt`
    @param encoder: module returning (hidden, cell) for a padded source batch
    @param decoder: module mapping (input tokens, state) to (scores, new state)
    """
    super(NMT, self).__init__()

    self.encoder = encoder
    self.decoder = decoder
    self.vocab = vocab

    # The encoder's final state initializes the decoder, so sizes must match
    assert encoder.hidden_size == decoder.hidden_size, \
    "Hidden dimensions of encoder and decoder must be equal!"

    assert encoder.embed_size == decoder.embed_size, \
    "Embedding dimensions of encoder and decoder must be equal!"

    assert encoder.n_layers == decoder.n_layers, \
    "Encoder and decoder must have equal number of layers!"

    self.embed_size = encoder.embed_size
    self.hidden_size = encoder.hidden_size
    self.n_layers = encoder.n_layers

  def forward(self, source, target):
    """ Score a batch of (source, target) sentence pairs.
    @param source: list of source sentences, each a list of words
    @param target: list of target sentences, each a list of words
                   (presumably wrapped in <s> ... </s> by read_corpus -- confirm)
    @returns scores: Tensor (b,), summed log-probability of the gold target
             words of each sentence; padded positions contribute zero
    """
    # Compute sentence lengths
    source_lengths = [len(s) for s in source]

    # Convert list of lists into tensors
    source_padded = self.vocab.src.to_input_tensor(source, device=self.device)   # Tensor: (src_len, b)
    target_padded_raw = self.vocab.tgt.to_input_tensor(target, device=self.device)   # Tensor: (tgt_len, b)

    # Encode source padded --> decoder initial hidden state
    dec_state = self.encoder(source_padded, source_lengths)

    # Decoder inputs: every time step except the last one
    # (chops off the <END> token for max length sentences).
    target_padded = target_padded_raw[:-1] #(tgt_len-1,b) ==> Tensor

    # Feed the gold tokens one time step at a time
    outputs = []
    for Y_t in torch.split(target_padded, 1, dim=0):   # Y_t ==> Tensor (1,b)
      dec_pred, dec_state = self.decoder(Y_t, dec_state) #dec_pred ==> (b, tgt_vocab_len)
      outputs.append(dec_pred)

    outputs = torch.stack(outputs) #(tgt_len-1,b,tgt_vocab_len)

    # log-probabilities over the target vocabulary
    P = F.log_softmax(outputs, dim = -1)

    # 1.0 for real tokens, 0.0 for padding
    target_masks = (target_padded_raw != self.vocab.tgt['<pad>']).float()

    # Log-probability of the gold next word at each step, zeroed on padding
    gold_next_words = target_padded_raw[1:].unsqueeze(-1)
    target_gold_words_log_prob = torch.gather(P, index=gold_next_words, dim=-1).squeeze(-1) * target_masks[1:]
    scores = target_gold_words_log_prob.sum(dim=0)

    return scores

  @staticmethod
  def load(model_path: str):
    """ Load the model from a file written by `save`.
    @param model_path (str): path to model
    @returns model (NMT): model rebuilt from the stored hyper-parameters and
             vocabulary, with the stored weights loaded
    """
    # NOTE(security): the checkpoint contains a pickled vocabulary object, so
    # it has to be fully unpickled -- only load checkpoints you trust.
    to_cpu = lambda storage, loc: storage
    try:
      params = torch.load(model_path, map_location=to_cpu, weights_only=False)
    except TypeError:
      # PyTorch releases that do not know the `weights_only` keyword
      params = torch.load(model_path, map_location=to_cpu)

    args = params['args']
    vocab = params['vocab']

    # Rebuild the same module layout that produced the state dict.
    # Checkpoints written before dropout rates were stored fall back to 0.2,
    # the default of the Encoder/Decoder constructors.
    word_mat = ModelEmbeddings(args['embed_size'], vocab)
    encoder = Encoder(args['embed_size'], args['hidden_size'], word_mat,
                      args['n_layers'], args.get('enc_dropout', 0.2))
    decoder = Decoder(args['embed_size'], args['hidden_size'], word_mat,
                      args['n_layers'], args.get('dec_dropout', 0.2))

    model = NMT(vocab, encoder, decoder)
    model.load_state_dict(params['state_dict'])

    return model

  def save(self, path: str):
    """ Save the model to a file.
    @param path (str): path to the model
    """
    print('save model parameters to [%s]' % path, file=sys.stderr)

    params = {
        # hyper-parameters needed by `load` to rebuild encoder and decoder
        'args': dict(embed_size=self.encoder.embed_size,
                     hidden_size=self.hidden_size,
                     n_layers=self.n_layers,
                     enc_dropout=getattr(self.encoder, 'dropout_rate', 0.2),
                     dec_dropout=getattr(self.decoder, 'dropout_rate', 0.2)),
        'vocab': self.vocab,
        'state_dict': self.state_dict()
    }

    torch.save(params, path)

  @property
  def device(self) -> torch.device:
    """ Device of the model, read from the encoder's embedding weights.
    """
    return self.encoder.embed_mat.weight.device
    

### Build Model

In [0]:
# Hyperparameters

embed_size, hidden_size, n_layers = 256, 512, 1
enc_dropout = dec_dropout = 0.2

# Build model: the embedding tables come first, then the encoder and decoder
# that use them, and finally the seq2seq wrapper around both.
word_mat = ModelEmbeddings(embed_size=embed_size, vocab=vocab)
enc = Encoder(embed_size, hidden_size, word_mat, n_layers=n_layers, dropout_rate=enc_dropout)
dec = Decoder(embed_size, hidden_size, word_mat, n_layers=n_layers, dropout_rate=dec_dropout)
model = NMT(vocab=vocab, encoder=enc, decoder=dec)

#### Initialize parameters and optimizer

In [42]:
uniform_init = 0.1

def init_weights(m):
    """Initialize every parameter of `m` uniformly in [-uniform_init, uniform_init].

    Embedding layers created with a `padding_idx` keep that row at zero: the
    uniform re-initialization would otherwise overwrite the all-zero padding
    vector that nn.Embedding sets up.

    @param m (nn.Module): module whose parameters (including those of its
                          sub-modules) are initialized in place
    """
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -uniform_init, uniform_init)

    # restore the zero vector of the padding token
    for module in m.modules():
        if isinstance(module, nn.Embedding) and module.padding_idx is not None:
            with torch.no_grad():
                module.weight[module.padding_idx].fill_(0)
        
# Run init_weights on the model and each of its sub-modules
model.apply(init_weights)

# Count total parameters
def count_parameters(model):
    """Return the number of elements over all parameters that require gradients."""
    trainable = (p for p in model.parameters() if p.requires_grad)
    return sum(map(torch.numel, trainable))

print(f'The model has {count_parameters(model):,} trainable parameters')

# Adam optimizer over all model parameters
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

# Move the model to the first CUDA device when one is visible, else keep it on the CPU
device = torch.device("cuda:0") if torch.cuda.device_count() > 0 else torch.device("cpu")
print('use device: %s' % device)
model = model.to(device)

The model has 54,406,482 trainable parameters
use device: cuda:0


## Training

#### Perplexity (PPL)

In [0]:
## Compute Perplexity to keep track of training

def evaluate_ppl(model, dev_data, batch_size=32):
    """ Evaluate perplexity on dev sentences
    @param model (NMT): NMT Model
    @param dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (batch size)
    @returns ppl (perplexity on dev sentences): exp of the summed negative
             log-likelihood divided by the number of target words to predict

    The model is switched to eval mode for the computation. If it was in
    training mode beforehand, that mode is restored on exit -- also when the
    evaluation raises.
    """
    was_training = model.training
    model.eval()

    cum_loss = 0.
    cum_tgt_words = 0.

    try:
        # no gradients are needed for evaluation
        with torch.no_grad():
            for src_sents, tgt_sents in batch_iter(dev_data, batch_size):
                loss = -model(src_sents, tgt_sents).sum()

                cum_loss += loss.item()
                cum_tgt_words += sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`

        ppl = np.exp(cum_loss / cum_tgt_words)
    finally:
        # never leave a training run stuck in eval mode
        if was_training:
            model.train()

    return ppl

#### Model training function

In [0]:
######## Train Model ########

model_save_path = 'NMT_LSTM_seq2seq_one_layer'

##
def train_model(model, optimizer, clip_grad =5.0, max_epoch =30, max_patience = 3, max_trial = 3, lr_decay = 0.5, train_batch_size = 128, log_every = 100, valid_niter = 1000):
  """ Train `model` with `optimizer`, validating periodically.

  Reads the notebook globals `train_data`, `dev_data`, `model_save_path`
  and `device`, and uses `batch_iter` and `evaluate_ppl`.

  @param model: model to train; calling it on (src_sents, tgt_sents) must
                return per-sentence log-likelihoods, and it must provide `save`
  @param optimizer: optimizer over the model parameters
  @param clip_grad: maximum gradient norm (gradients are clipped to it)
  @param max_epoch: number of passes over the training data
  @param max_patience: validations without improvement before the lr is decayed
  @param max_trial: number of lr decays after which training stops early
  @param lr_decay: factor applied to the learning rate at each decay
  @param train_batch_size: training batch size
  @param log_every: print a training report every this many iterations
  @param valid_niter: validate every this many iterations

  Whenever the dev perplexity improves, the model and the optimizer state are
  saved to `model_save_path` / `model_save_path + '.optim'`.
  """

  print('Training begins...')
  ## Temp variables
  num_trial = 0
  train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
  cum_examples = report_examples  = valid_num = 0
  hist_valid_scores = []
  train_time = begin_time = time.time()

  # put the model in training mode
  model.train()

  # iterate over the epochs
  for epoch in range(max_epoch):
    for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):

      train_iter += 1
      optimizer.zero_grad()
      batch_size = len(src_sents)

      # negative log-likelihood per sentence; optimize the batch average
      example_losses = -model(src_sents, tgt_sents)
      batch_loss = example_losses.sum()
      loss = batch_loss/batch_size
      loss.backward() # autograd

      # Clip gradient
      torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
      optimizer.step() # update parameters

      batch_losses_val = batch_loss.item()
      report_loss += batch_losses_val
      cum_loss += batch_losses_val

      tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
      report_tgt_words += tgt_words_num_to_predict
      cum_tgt_words += tgt_words_num_to_predict
      report_examples += batch_size
      cum_examples += batch_size

      # print interim report about training
      if train_iter % log_every == 0:
        print('| Epoch %d, Iter %d| Avg Loss = %.2f| Avg. ppl = %.2f| Speed %.2f words/sec| Time %.2f min|' % (epoch+1, train_iter, report_loss / report_examples, math.exp(report_loss / report_tgt_words),
                                                                                 report_tgt_words / (time.time() - train_time), (time.time() - begin_time)/60.0))

        train_time = time.time()
        report_loss = report_tgt_words = report_examples = 0.

      # validation
      if train_iter % valid_niter == 0:

        print('| <Train Summary> | Epoch %d, Iter %d| Cum. loss = %.2f| Cum. ppl = %.2f|' % (epoch+1, train_iter, cum_loss / cum_examples, np.exp(cum_loss / cum_tgt_words)))

        cum_loss = cum_examples = cum_tgt_words = 0.
        valid_num += 1

        print('Report on validation set:', file=sys.stderr)

        # compute dev. ppl; lower perplexity is better, so negate it to get a score
        dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
        valid_metric = -dev_ppl

        print('Validation:  Dev. ppl = %f' % (dev_ppl), file=sys.stderr)

        # learning rate scheduling
        is_better = (len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores))
        hist_valid_scores.append(valid_metric)

        if is_better:
          patience = 0
          print('Save currently the best model to [%s]' % model_save_path, file=sys.stderr)
          model.save(model_save_path)

          # also save the optimizers' state
          torch.save(optimizer.state_dict(), model_save_path + '.optim')

        elif patience < int(max_patience):
          patience += 1
          print('Hit patience %d' % patience, file=sys.stderr)

          if patience == int(max_patience):
            num_trial += 1
            print('Hit #%d trial' % num_trial, file=sys.stderr)

            if num_trial == int(max_trial):
              print('early stop!', file=sys.stderr)
              return

            # decay lr, and restore from previously best checkpoint
            lr = optimizer.param_groups[0]['lr'] * float(lr_decay)
            print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

            # load model
            params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
            model.load_state_dict(params['state_dict'])
            model = model.to(device)

            print('restore parameters of the optimizers', file=sys.stderr)
            optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

            # set new lr
            for param_group in optimizer.param_groups:
              param_group['lr'] = lr

            # reset patience
            patience = 0

  # The epoch loop itself enforces the limit. Checking for the last epoch
  # inside the validation branch (as before) ended training at the first
  # validation of the final epoch, so that epoch was never completed.
  print('Training stopped <-> Reached maximum number of epochs!', file=sys.stderr)

In [45]:
# Training configuration
max_epoch, train_batch_size = 30, 128

# Launch training; every other option keeps the default of train_model
train_model(model=model, optimizer=optimizer, train_batch_size=train_batch_size, max_epoch=max_epoch)

Training begins...
| Epoch 1, Iter 100| Avg Loss = 125.55| Avg. ppl = 1234.10| Speed 3880.12 words/sec| Time 0.97 min|
| Epoch 1, Iter 200| Avg Loss = 110.54| Avg. ppl = 506.61| Speed 3871.41 words/sec| Time 1.95 min|
| Epoch 1, Iter 300| Avg Loss = 104.44| Avg. ppl = 362.79| Speed 3818.79 words/sec| Time 2.94 min|
| Epoch 1, Iter 400| Avg Loss = 101.38| Avg. ppl = 305.26| Speed 3865.46 words/sec| Time 3.92 min|
| Epoch 1, Iter 500| Avg Loss = 97.59| Avg. ppl = 263.03| Speed 3797.05 words/sec| Time 4.90 min|
| Epoch 1, Iter 600| Avg Loss = 97.35| Avg. ppl = 237.80| Speed 3837.94 words/sec| Time 5.89 min|
| Epoch 1, Iter 700| Avg Loss = 95.62| Avg. ppl = 218.89| Speed 3817.28 words/sec| Time 6.88 min|
| Epoch 1, Iter 800| Avg Loss = 92.71| Avg. ppl = 195.04| Speed 3801.30 words/sec| Time 7.87 min|
| Epoch 1, Iter 900| Avg Loss = 91.98| Avg. ppl = 180.72| Speed 3830.55 words/sec| Time 8.85 min|
| Epoch 1, Iter 1000| Avg Loss = 89.82| Avg. ppl = 167.01| Speed 3805.90 words/sec| Time 9.84 

Report on validation set:
Validation:  Dev. ppl = 172.725205
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 1, Iter 1100| Avg Loss = 89.57| Avg. ppl = 156.58| Speed 3344.71 words/sec| Time 10.97 min|
| Epoch 1, Iter 1200| Avg Loss = 88.29| Avg. ppl = 147.12| Speed 3789.53 words/sec| Time 11.96 min|
| Epoch 1, Iter 1300| Avg Loss = 86.49| Avg. ppl = 137.82| Speed 3776.01 words/sec| Time 12.95 min|
| Epoch 1, Iter 1400| Avg Loss = 86.73| Avg. ppl = 132.59| Speed 3821.52 words/sec| Time 13.95 min|
| Epoch 1, Iter 1500| Avg Loss = 85.06| Avg. ppl = 123.91| Speed 3783.49 words/sec| Time 14.94 min|
| Epoch 1, Iter 1600| Avg Loss = 83.95| Avg. ppl = 119.23| Speed 3786.74 words/sec| Time 15.93 min|
| Epoch 2, Iter 1700| Avg Loss = 84.20| Avg. ppl = 113.32| Speed 3817.81 words/sec| Time 16.92 min|
| Epoch 2, Iter 1800| Avg Loss = 80.31| Avg. ppl = 92.75| Speed 3802.06 words/sec| Time 17.91 min|
| Epoch 2, Iter 1900| Avg Loss = 79.83| Avg. ppl = 90.46| Speed 3778.86 words/sec| Time 18.91 min|
| Epoch 2, Iter 2000| Avg Loss = 78.82| Avg. ppl = 87.68| Speed 3794.05 words/sec| Time 19.90 min|
| <

Report on validation set:
Validation:  Dev. ppl = 115.323163
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 2, Iter 2100| Avg Loss = 78.50| Avg. ppl = 85.78| Speed 3509.21 words/sec| Time 20.98 min|
| Epoch 2, Iter 2200| Avg Loss = 77.82| Avg. ppl = 84.01| Speed 3776.84 words/sec| Time 21.97 min|
| Epoch 2, Iter 2300| Avg Loss = 77.73| Avg. ppl = 81.99| Speed 3779.82 words/sec| Time 22.96 min|
| Epoch 2, Iter 2400| Avg Loss = 77.91| Avg. ppl = 80.33| Speed 3806.67 words/sec| Time 23.96 min|
| Epoch 2, Iter 2500| Avg Loss = 77.22| Avg. ppl = 79.77| Speed 3793.40 words/sec| Time 24.95 min|
| Epoch 2, Iter 2600| Avg Loss = 75.93| Avg. ppl = 77.60| Speed 3790.54 words/sec| Time 25.93 min|
| Epoch 2, Iter 2700| Avg Loss = 77.82| Avg. ppl = 76.69| Speed 3908.90 words/sec| Time 26.91 min|
| Epoch 2, Iter 2800| Avg Loss = 76.23| Avg. ppl = 74.13| Speed 3786.46 words/sec| Time 27.91 min|
| Epoch 2, Iter 2900| Avg Loss = 75.55| Avg. ppl = 73.04| Speed 3775.67 words/sec| Time 28.90 min|
| Epoch 2, Iter 3000| Avg Loss = 75.78| Avg. ppl = 71.36| Speed 3792.27 words/sec| Time 29.90 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 90.633073
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 2, Iter 3100| Avg Loss = 75.19| Avg. ppl = 69.37| Speed 3567.82 words/sec| Time 30.96 min|
| Epoch 2, Iter 3200| Avg Loss = 75.06| Avg. ppl = 69.35| Speed 3793.65 words/sec| Time 31.96 min|
| Epoch 2, Iter 3300| Avg Loss = 74.06| Avg. ppl = 67.56| Speed 3753.56 words/sec| Time 32.96 min|
| Epoch 3, Iter 3400| Avg Loss = 73.22| Avg. ppl = 63.65| Speed 3768.91 words/sec| Time 33.95 min|
| Epoch 3, Iter 3500| Avg Loss = 68.67| Avg. ppl = 49.30| Speed 3793.41 words/sec| Time 34.94 min|
| Epoch 3, Iter 3600| Avg Loss = 68.47| Avg. ppl = 48.23| Speed 3762.97 words/sec| Time 35.94 min|
| Epoch 3, Iter 3700| Avg Loss = 68.56| Avg. ppl = 48.82| Speed 3778.25 words/sec| Time 36.94 min|
| Epoch 3, Iter 3800| Avg Loss = 68.92| Avg. ppl = 48.38| Speed 3830.30 words/sec| Time 37.93 min|
| Epoch 3, Iter 3900| Avg Loss = 69.12| Avg. ppl = 49.19| Speed 3798.76 words/sec| Time 38.92 min|
| Epoch 3, Iter 4000| Avg Loss = 68.65| Avg. ppl = 48.46| Speed 3752.62 words/sec| Time 39.93 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 79.608680
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 3, Iter 4100| Avg Loss = 68.07| Avg. ppl = 48.22| Speed 3565.40 words/sec| Time 40.98 min|
| Epoch 3, Iter 4200| Avg Loss = 67.45| Avg. ppl = 47.18| Speed 3778.46 words/sec| Time 41.97 min|
| Epoch 3, Iter 4300| Avg Loss = 68.08| Avg. ppl = 47.25| Speed 3802.94 words/sec| Time 42.96 min|
| Epoch 3, Iter 4400| Avg Loss = 68.39| Avg. ppl = 47.23| Speed 3798.49 words/sec| Time 43.95 min|
| Epoch 3, Iter 4500| Avg Loss = 67.85| Avg. ppl = 47.17| Speed 3793.67 words/sec| Time 44.94 min|
| Epoch 3, Iter 4600| Avg Loss = 68.18| Avg. ppl = 47.25| Speed 3803.24 words/sec| Time 45.94 min|
| Epoch 3, Iter 4700| Avg Loss = 68.22| Avg. ppl = 46.66| Speed 3800.02 words/sec| Time 46.93 min|
| Epoch 3, Iter 4800| Avg Loss = 68.10| Avg. ppl = 47.03| Speed 3776.98 words/sec| Time 47.93 min|
| Epoch 3, Iter 4900| Avg Loss = 67.19| Avg. ppl = 45.48| Speed 3776.16 words/sec| Time 48.93 min|
| Epoch 3, Iter 5000| Avg Loss = 67.72| Avg. ppl = 45.85| Speed 3802.74 words/sec| Time 49.92 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 69.057400
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 4, Iter 5100| Avg Loss = 66.71| Avg. ppl = 42.53| Speed 3587.61 words/sec| Time 50.97 min|
| Epoch 4, Iter 5200| Avg Loss = 61.72| Avg. ppl = 32.01| Speed 3818.34 words/sec| Time 51.96 min|
| Epoch 4, Iter 5300| Avg Loss = 61.76| Avg. ppl = 32.33| Speed 3793.95 words/sec| Time 52.96 min|
| Epoch 4, Iter 5400| Avg Loss = 60.92| Avg. ppl = 31.89| Speed 3786.19 words/sec| Time 53.96 min|
| Epoch 4, Iter 5500| Avg Loss = 60.44| Avg. ppl = 31.85| Speed 3769.08 words/sec| Time 54.94 min|
| Epoch 4, Iter 5600| Avg Loss = 61.78| Avg. ppl = 32.89| Speed 3775.65 words/sec| Time 55.94 min|
| Epoch 4, Iter 5700| Avg Loss = 61.85| Avg. ppl = 33.00| Speed 3807.76 words/sec| Time 56.93 min|
| Epoch 4, Iter 5800| Avg Loss = 61.98| Avg. ppl = 32.86| Speed 3779.67 words/sec| Time 57.94 min|
| Epoch 4, Iter 5900| Avg Loss = 62.35| Avg. ppl = 33.41| Speed 3783.59 words/sec| Time 58.94 min|
| Epoch 4, Iter 6000| Avg Loss = 61.29| Avg. ppl = 33.37| Speed 3750.74 words/sec| Time 59.93 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 65.485592
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 4, Iter 6100| Avg Loss = 61.77| Avg. ppl = 33.23| Speed 3519.23 words/sec| Time 61.00 min|
| Epoch 4, Iter 6200| Avg Loss = 61.92| Avg. ppl = 33.32| Speed 3808.43 words/sec| Time 61.99 min|
| Epoch 4, Iter 6300| Avg Loss = 61.63| Avg. ppl = 33.08| Speed 3801.95 words/sec| Time 62.98 min|
| Epoch 4, Iter 6400| Avg Loss = 61.86| Avg. ppl = 33.30| Speed 3799.51 words/sec| Time 63.97 min|
| Epoch 4, Iter 6500| Avg Loss = 62.21| Avg. ppl = 33.51| Speed 3774.69 words/sec| Time 64.97 min|
| Epoch 4, Iter 6600| Avg Loss = 62.04| Avg. ppl = 33.10| Speed 3770.49 words/sec| Time 65.97 min|
| Epoch 4, Iter 6700| Avg Loss = 61.99| Avg. ppl = 33.27| Speed 3774.30 words/sec| Time 66.97 min|
| Epoch 5, Iter 6800| Avg Loss = 60.20| Avg. ppl = 30.01| Speed 3740.10 words/sec| Time 67.98 min|
| Epoch 5, Iter 6900| Avg Loss = 54.92| Avg. ppl = 22.72| Speed 3732.60 words/sec| Time 68.98 min|
| Epoch 5, Iter 7000| Avg Loss = 55.72| Avg. ppl = 23.17| Speed 3813.42 words/sec| Time 69.97 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 63.082473
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 5, Iter 7100| Avg Loss = 55.23| Avg. ppl = 23.40| Speed 3503.91 words/sec| Time 71.04 min|
| Epoch 5, Iter 7200| Avg Loss = 55.18| Avg. ppl = 23.19| Speed 3758.23 words/sec| Time 72.04 min|
| Epoch 5, Iter 7300| Avg Loss = 56.18| Avg. ppl = 24.07| Speed 3815.81 words/sec| Time 73.02 min|
| Epoch 5, Iter 7400| Avg Loss = 56.69| Avg. ppl = 24.35| Speed 3801.62 words/sec| Time 74.02 min|
| Epoch 5, Iter 7500| Avg Loss = 56.45| Avg. ppl = 24.30| Speed 3809.65 words/sec| Time 75.01 min|
| Epoch 5, Iter 7600| Avg Loss = 56.21| Avg. ppl = 24.08| Speed 3788.80 words/sec| Time 76.00 min|
| Epoch 5, Iter 7700| Avg Loss = 56.92| Avg. ppl = 24.85| Speed 3773.89 words/sec| Time 77.01 min|
| Epoch 5, Iter 7800| Avg Loss = 56.74| Avg. ppl = 24.85| Speed 3761.00 words/sec| Time 78.01 min|
| Epoch 5, Iter 7900| Avg Loss = 57.07| Avg. ppl = 25.38| Speed 3760.11 words/sec| Time 79.01 min|
| Epoch 5, Iter 8000| Avg Loss = 57.30| Avg. ppl = 25.22| Speed 3756.44 words/sec| Time 80.02 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 60.287420
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 5, Iter 8100| Avg Loss = 57.08| Avg. ppl = 25.53| Speed 3540.30 words/sec| Time 81.08 min|
| Epoch 5, Iter 8200| Avg Loss = 56.99| Avg. ppl = 25.31| Speed 3779.62 words/sec| Time 82.07 min|
| Epoch 5, Iter 8300| Avg Loss = 56.89| Avg. ppl = 25.25| Speed 3795.74 words/sec| Time 83.06 min|
| Epoch 5, Iter 8400| Avg Loss = 57.70| Avg. ppl = 25.48| Speed 3811.60 words/sec| Time 84.06 min|
| Epoch 6, Iter 8500| Avg Loss = 55.21| Avg. ppl = 22.12| Speed 3827.24 words/sec| Time 85.05 min|
| Epoch 6, Iter 8600| Avg Loss = 50.21| Avg. ppl = 17.33| Speed 3808.75 words/sec| Time 86.03 min|
| Epoch 6, Iter 8700| Avg Loss = 50.80| Avg. ppl = 17.69| Speed 3765.93 words/sec| Time 87.04 min|
| Epoch 6, Iter 8800| Avg Loss = 51.28| Avg. ppl = 18.02| Speed 3784.44 words/sec| Time 88.04 min|
| Epoch 6, Iter 8900| Avg Loss = 51.27| Avg. ppl = 17.99| Speed 3771.10 words/sec| Time 89.04 min|
| Epoch 6, Iter 9000| Avg Loss = 50.61| Avg. ppl = 18.19| Speed 3749.54 words/sec| Time 90.03 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 60.620972
Hit patience 1


| Epoch 6, Iter 9100| Avg Loss = 51.87| Avg. ppl = 18.68| Speed 3669.60 words/sec| Time 91.06 min|
| Epoch 6, Iter 9200| Avg Loss = 52.44| Avg. ppl = 19.06| Speed 3803.54 words/sec| Time 92.06 min|
| Epoch 6, Iter 9300| Avg Loss = 52.05| Avg. ppl = 19.04| Speed 3796.01 words/sec| Time 93.05 min|
| Epoch 6, Iter 9400| Avg Loss = 52.60| Avg. ppl = 19.21| Speed 3836.90 words/sec| Time 94.04 min|
| Epoch 6, Iter 9500| Avg Loss = 52.80| Avg. ppl = 19.61| Speed 3819.09 words/sec| Time 95.03 min|
| Epoch 6, Iter 9600| Avg Loss = 52.61| Avg. ppl = 19.66| Speed 3791.43 words/sec| Time 96.03 min|
| Epoch 6, Iter 9700| Avg Loss = 51.97| Avg. ppl = 19.48| Speed 3777.34 words/sec| Time 97.02 min|
| Epoch 6, Iter 9800| Avg Loss = 52.72| Avg. ppl = 20.03| Speed 3778.02 words/sec| Time 98.01 min|
| Epoch 6, Iter 9900| Avg Loss = 53.45| Avg. ppl = 20.28| Speed 3765.44 words/sec| Time 99.02 min|
| Epoch 6, Iter 10000| Avg Loss = 52.82| Avg. ppl = 20.10| Speed 3788.95 words/sec| Time 100.01 min|
| <Train

Report on validation set:
Validation:  Dev. ppl = 57.787351
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 6, Iter 10100| Avg Loss = 53.15| Avg. ppl = 20.25| Speed 3521.81 words/sec| Time 101.08 min|
| Epoch 7, Iter 10200| Avg Loss = 50.77| Avg. ppl = 17.34| Speed 3808.45 words/sec| Time 102.07 min|
| Epoch 7, Iter 10300| Avg Loss = 45.90| Avg. ppl = 13.65| Speed 3787.57 words/sec| Time 103.06 min|
| Epoch 7, Iter 10400| Avg Loss = 46.73| Avg. ppl = 14.13| Speed 3777.20 words/sec| Time 104.05 min|
| Epoch 7, Iter 10500| Avg Loss = 46.92| Avg. ppl = 14.31| Speed 3768.31 words/sec| Time 105.05 min|
| Epoch 7, Iter 10600| Avg Loss = 47.59| Avg. ppl = 14.70| Speed 3779.62 words/sec| Time 106.05 min|
| Epoch 7, Iter 10700| Avg Loss = 47.58| Avg. ppl = 14.86| Speed 3758.23 words/sec| Time 107.05 min|
| Epoch 7, Iter 10800| Avg Loss = 48.18| Avg. ppl = 15.12| Speed 3814.19 words/sec| Time 108.04 min|
| Epoch 7, Iter 10900| Avg Loss = 48.56| Avg. ppl = 15.32| Speed 3831.17 words/sec| Time 109.03 min|
| Epoch 7, Iter 11000| Avg Loss = 48.34| Avg. ppl = 15.35| Speed 3782.04 words/sec| Time 11

Report on validation set:
Validation:  Dev. ppl = 60.569942
Hit patience 1


| Epoch 7, Iter 11100| Avg Loss = 48.23| Avg. ppl = 15.65| Speed 3686.66 words/sec| Time 111.05 min|
| Epoch 7, Iter 11200| Avg Loss = 48.59| Avg. ppl = 15.80| Speed 3763.13 words/sec| Time 112.04 min|
| Epoch 7, Iter 11300| Avg Loss = 49.02| Avg. ppl = 15.79| Speed 3772.84 words/sec| Time 113.05 min|
| Epoch 7, Iter 11400| Avg Loss = 49.19| Avg. ppl = 16.23| Speed 3785.01 words/sec| Time 114.04 min|
| Epoch 7, Iter 11500| Avg Loss = 49.29| Avg. ppl = 16.21| Speed 3814.51 words/sec| Time 115.03 min|
| Epoch 7, Iter 11600| Avg Loss = 49.03| Avg. ppl = 16.27| Speed 3739.11 words/sec| Time 116.04 min|
| Epoch 7, Iter 11700| Avg Loss = 49.98| Avg. ppl = 16.55| Speed 3836.27 words/sec| Time 117.03 min|
| Epoch 7, Iter 11800| Avg Loss = 49.40| Avg. ppl = 16.52| Speed 3729.89 words/sec| Time 118.03 min|
| Epoch 8, Iter 11900| Avg Loss = 45.97| Avg. ppl = 13.64| Speed 3762.92 words/sec| Time 119.02 min|
| Epoch 8, Iter 12000| Avg Loss = 42.87| Avg. ppl = 11.31| Speed 3785.45 words/sec| Time 12

Report on validation set:
Validation:  Dev. ppl = 61.450189
Hit patience 2


| Epoch 8, Iter 12100| Avg Loss = 44.44| Avg. ppl = 11.79| Speed 3729.56 words/sec| Time 121.05 min|
| Epoch 8, Iter 12200| Avg Loss = 43.90| Avg. ppl = 11.97| Speed 3766.15 words/sec| Time 122.05 min|
| Epoch 8, Iter 12300| Avg Loss = 43.55| Avg. ppl = 12.01| Speed 3769.17 words/sec| Time 123.04 min|
| Epoch 8, Iter 12400| Avg Loss = 44.24| Avg. ppl = 12.26| Speed 3754.70 words/sec| Time 124.05 min|
| Epoch 8, Iter 12500| Avg Loss = 44.55| Avg. ppl = 12.54| Speed 3785.62 words/sec| Time 125.04 min|
| Epoch 8, Iter 12600| Avg Loss = 44.72| Avg. ppl = 12.63| Speed 3744.76 words/sec| Time 126.04 min|
| Epoch 8, Iter 12700| Avg Loss = 45.11| Avg. ppl = 12.88| Speed 3765.05 words/sec| Time 127.04 min|
| Epoch 8, Iter 12800| Avg Loss = 45.76| Avg. ppl = 13.19| Speed 3792.66 words/sec| Time 128.04 min|
| Epoch 8, Iter 12900| Avg Loss = 46.31| Avg. ppl = 13.45| Speed 3818.25 words/sec| Time 129.04 min|
| Epoch 8, Iter 13000| Avg Loss = 45.96| Avg. ppl = 13.43| Speed 3841.74 words/sec| Time 13

Report on validation set:
Validation:  Dev. ppl = 63.497742
Hit patience 3
Hit #1 trial
load previously best model and decay learning rate to 0.000500
restore parameters of the optimizers


| Epoch 8, Iter 13100| Avg Loss = 45.63| Avg. ppl = 13.46| Speed 3638.64 words/sec| Time 131.05 min|
| Epoch 8, Iter 13200| Avg Loss = 45.51| Avg. ppl = 13.36| Speed 3771.21 words/sec| Time 132.04 min|
| Epoch 8, Iter 13300| Avg Loss = 46.20| Avg. ppl = 13.53| Speed 3818.44 words/sec| Time 133.03 min|
| Epoch 8, Iter 13400| Avg Loss = 45.22| Avg. ppl = 13.24| Speed 3724.21 words/sec| Time 134.04 min|
| Epoch 8, Iter 13500| Avg Loss = 46.24| Avg. ppl = 13.69| Speed 3799.08 words/sec| Time 135.03 min|
| Epoch 9, Iter 13600| Avg Loss = 45.93| Avg. ppl = 13.32| Speed 3779.04 words/sec| Time 136.02 min|
| Epoch 9, Iter 13700| Avg Loss = 44.72| Avg. ppl = 12.62| Speed 3774.29 words/sec| Time 137.02 min|
| Epoch 9, Iter 13800| Avg Loss = 45.20| Avg. ppl = 12.67| Speed 3777.70 words/sec| Time 138.03 min|
| Epoch 9, Iter 13900| Avg Loss = 45.13| Avg. ppl = 12.82| Speed 3813.21 words/sec| Time 139.02 min|
| Epoch 9, Iter 14000| Avg Loss = 45.17| Avg. ppl = 12.94| Speed 3809.76 words/sec| Time 14

Report on validation set:
Validation:  Dev. ppl = 60.553490
Hit patience 1


| Epoch 9, Iter 14100| Avg Loss = 45.09| Avg. ppl = 13.16| Speed 3675.48 words/sec| Time 141.02 min|
| Epoch 9, Iter 14200| Avg Loss = 45.79| Avg. ppl = 13.20| Speed 3829.65 words/sec| Time 142.01 min|
| Epoch 9, Iter 14300| Avg Loss = 45.80| Avg. ppl = 13.20| Speed 3833.06 words/sec| Time 143.00 min|
| Epoch 9, Iter 14400| Avg Loss = 45.88| Avg. ppl = 13.44| Speed 3776.05 words/sec| Time 143.99 min|
| Epoch 9, Iter 14500| Avg Loss = 45.91| Avg. ppl = 13.49| Speed 3769.79 words/sec| Time 144.99 min|
| Epoch 9, Iter 14600| Avg Loss = 46.16| Avg. ppl = 13.48| Speed 3795.70 words/sec| Time 145.99 min|
| Epoch 9, Iter 14700| Avg Loss = 45.95| Avg. ppl = 13.65| Speed 3771.23 words/sec| Time 146.98 min|
| Epoch 9, Iter 14800| Avg Loss = 46.37| Avg. ppl = 13.77| Speed 3791.44 words/sec| Time 147.98 min|
| Epoch 9, Iter 14900| Avg Loss = 46.68| Avg. ppl = 13.93| Speed 3797.17 words/sec| Time 148.97 min|
| Epoch 9, Iter 15000| Avg Loss = 45.59| Avg. ppl = 13.66| Speed 3748.64 words/sec| Time 14

Report on validation set:
Validation:  Dev. ppl = 58.929521
Hit patience 2


| Epoch 9, Iter 15100| Avg Loss = 46.71| Avg. ppl = 13.88| Speed 3693.55 words/sec| Time 150.99 min|
| Epoch 9, Iter 15200| Avg Loss = 46.80| Avg. ppl = 14.04| Speed 3786.05 words/sec| Time 151.99 min|
| Epoch 10, Iter 15300| Avg Loss = 44.39| Avg. ppl = 12.10| Speed 3764.40 words/sec| Time 152.99 min|
| Epoch 10, Iter 15400| Avg Loss = 42.42| Avg. ppl = 10.97| Speed 3786.97 words/sec| Time 153.99 min|
| Epoch 10, Iter 15500| Avg Loss = 42.65| Avg. ppl = 11.07| Speed 3797.83 words/sec| Time 154.99 min|
| Epoch 10, Iter 15600| Avg Loss = 43.12| Avg. ppl = 11.35| Speed 3776.51 words/sec| Time 155.99 min|
| Epoch 10, Iter 15700| Avg Loss = 43.14| Avg. ppl = 11.42| Speed 3795.80 words/sec| Time 156.98 min|
| Epoch 10, Iter 15800| Avg Loss = 43.27| Avg. ppl = 11.56| Speed 3802.02 words/sec| Time 157.98 min|
| Epoch 10, Iter 15900| Avg Loss = 43.24| Avg. ppl = 11.59| Speed 3818.12 words/sec| Time 158.96 min|
| Epoch 10, Iter 16000| Avg Loss = 43.08| Avg. ppl = 11.62| Speed 3768.80 words/sec|

Report on validation set:
Validation:  Dev. ppl = 61.331199
Hit patience 3
Hit #2 trial
load previously best model and decay learning rate to 0.000250
restore parameters of the optimizers


| Epoch 10, Iter 16100| Avg Loss = 46.23| Avg. ppl = 13.56| Speed 3613.82 words/sec| Time 161.00 min|
| Epoch 10, Iter 16200| Avg Loss = 46.40| Avg. ppl = 13.38| Speed 3823.79 words/sec| Time 162.00 min|
| Epoch 10, Iter 16300| Avg Loss = 45.45| Avg. ppl = 13.18| Speed 3832.26 words/sec| Time 162.98 min|
| Epoch 10, Iter 16400| Avg Loss = 45.42| Avg. ppl = 13.27| Speed 3786.55 words/sec| Time 163.97 min|
| Epoch 10, Iter 16500| Avg Loss = 45.33| Avg. ppl = 13.12| Speed 3773.90 words/sec| Time 164.97 min|
| Epoch 10, Iter 16600| Avg Loss = 44.86| Avg. ppl = 13.04| Speed 3695.79 words/sec| Time 165.98 min|
| Epoch 10, Iter 16700| Avg Loss = 45.45| Avg. ppl = 13.17| Speed 3785.41 words/sec| Time 166.97 min|
| Epoch 10, Iter 16800| Avg Loss = 45.78| Avg. ppl = 13.30| Speed 3799.03 words/sec| Time 167.96 min|
| Epoch 10, Iter 16900| Avg Loss = 45.68| Avg. ppl = 13.37| Speed 3808.72 words/sec| Time 168.95 min|
| Epoch 11, Iter 17000| Avg Loss = 44.67| Avg. ppl = 12.52| Speed 3820.42 words/se

Report on validation set:
Validation:  Dev. ppl = 59.717359
Hit patience 1


| Epoch 11, Iter 17100| Avg Loss = 44.43| Avg. ppl = 12.25| Speed 3737.12 words/sec| Time 170.94 min|
| Epoch 11, Iter 17200| Avg Loss = 44.81| Avg. ppl = 12.39| Speed 3847.83 words/sec| Time 171.93 min|
| Epoch 11, Iter 17300| Avg Loss = 44.03| Avg. ppl = 12.22| Speed 3747.72 words/sec| Time 172.93 min|
| Epoch 11, Iter 17400| Avg Loss = 44.69| Avg. ppl = 12.43| Speed 3799.12 words/sec| Time 173.93 min|
| Epoch 11, Iter 17500| Avg Loss = 44.36| Avg. ppl = 12.40| Speed 3794.83 words/sec| Time 174.92 min|
| Epoch 11, Iter 17600| Avg Loss = 44.77| Avg. ppl = 12.56| Speed 3759.54 words/sec| Time 175.92 min|
| Epoch 11, Iter 17700| Avg Loss = 44.71| Avg. ppl = 12.50| Speed 3792.94 words/sec| Time 176.92 min|
| Epoch 11, Iter 17800| Avg Loss = 44.83| Avg. ppl = 12.54| Speed 3806.05 words/sec| Time 177.91 min|
| Epoch 11, Iter 17900| Avg Loss = 44.76| Avg. ppl = 12.54| Speed 3751.90 words/sec| Time 178.92 min|
| Epoch 11, Iter 18000| Avg Loss = 44.57| Avg. ppl = 12.58| Speed 3768.41 words/se

Report on validation set:
Validation:  Dev. ppl = 59.130569
Hit patience 2


| Epoch 11, Iter 18100| Avg Loss = 44.53| Avg. ppl = 12.67| Speed 3685.93 words/sec| Time 180.93 min|
| Epoch 11, Iter 18200| Avg Loss = 45.55| Avg. ppl = 12.85| Speed 3831.64 words/sec| Time 181.92 min|
| Epoch 11, Iter 18300| Avg Loss = 44.75| Avg. ppl = 12.71| Speed 3754.95 words/sec| Time 182.92 min|
| Epoch 11, Iter 18400| Avg Loss = 44.67| Avg. ppl = 12.73| Speed 3746.10 words/sec| Time 183.92 min|
| Epoch 11, Iter 18500| Avg Loss = 45.02| Avg. ppl = 12.80| Speed 3779.21 words/sec| Time 184.92 min|
| Epoch 11, Iter 18600| Avg Loss = 45.00| Avg. ppl = 12.90| Speed 3801.50 words/sec| Time 185.91 min|
| Epoch 12, Iter 18700| Avg Loss = 43.26| Avg. ppl = 11.48| Speed 3786.23 words/sec| Time 186.90 min|
| Epoch 12, Iter 18800| Avg Loss = 42.50| Avg. ppl = 11.16| Speed 3770.47 words/sec| Time 187.89 min|
| Epoch 12, Iter 18900| Avg Loss = 42.38| Avg. ppl = 11.17| Speed 3784.07 words/sec| Time 188.88 min|
| Epoch 12, Iter 19000| Avg Loss = 43.19| Avg. ppl = 11.45| Speed 3797.08 words/se

Report on validation set:
Validation:  Dev. ppl = 60.063730
Hit patience 3
Hit #3 trial
early stop!


### Evaluation

#### Beam Search

In [0]:
# Record type for one beam-search result: `value` holds the decoded sentence
# and `score` its accumulated log-likelihood.
Hypothesis = namedtuple(typename='Hypothesis', field_names=('value', 'score'))

In [0]:
def beam_search(model, src_sent: List[str], beam_size: int=5, max_decoding_time_step: int=70) -> List[Hypothesis]:
    """ Given a single source sentence, perform beam search, yielding translations in the target language.
    @param model: trained seq2seq model; this function uses its `vocab`, `device`, `encoder` and
            `decoder` attributes
    @param src_sent (List[str]): a single source sentence (words)
    @param beam_size (int): beam size
    @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
    @returns hypotheses (List[Hypothesis]): a list of hypothesis sorted by descending score; each
            hypothesis has two fields:
            value: List[str]: the decoded target sentence, represented as a list of words
            score: float: the log-likelihood of the target sentence
    """
    tgt_vocab = model.vocab.tgt
    vocab_size = len(tgt_vocab)

    src_sents_var = model.vocab.src.to_input_tensor([src_sent], model.device)

    # Decoder state carried from step to step; it starts as the encoder output
    # for the single source sentence and is re-indexed per surviving hypothesis below.
    dec_state = model.encoder(src_sents_var, [len(src_sent)])

    hypotheses = [['<s>']]
    hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=model.device)
    completed_hypotheses = []

    t = 0
    while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
        t += 1

        # Feed the last word of every live hypothesis as one decoding step.
        y_tm1 = torch.tensor([tgt_vocab[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=model.device)
        y_tm1 = torch.unsqueeze(y_tm1, dim=0)

        # NOTE(review): w_t is used here as per-word scores over the target vocabulary
        # (flat positions are decoded with len(model.vocab.tgt)); confirm that the decoder
        # already applies the vocabulary projection.
        w_t, (h_t, cell_t) = model.decoder(y_tm1, dec_state)
        log_p_t = F.log_softmax(w_t, dim=-1)

        # Score of every (live hypothesis, next word) continuation, flattened to 1-D.
        live_hyp_num = beam_size - len(completed_hypotheses)
        contiuating_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)

        # torch.topk raises when k exceeds the number of candidates, so clamp it.
        k = min(live_hyp_num, contiuating_hyp_scores.numel())
        top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores, k=k)

        new_hypotheses = []
        live_hyp_ids = []
        new_hyp_scores = []

        for flat_pos, cand_new_hyp_score in zip(top_cand_hyp_pos.tolist(), top_cand_hyp_scores.tolist()):
            # Split the flat position into (hypothesis index, word id) with integer
            # arithmetic on Python ints. Tensor `/` is true division on current
            # PyTorch and would produce a float that cannot index a list.
            prev_hyp_id, hyp_word_id = divmod(flat_pos, vocab_size)

            hyp_word = tgt_vocab.id2word[hyp_word_id]
            new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
            if hyp_word == '</s>':
                # Strip the leading '<s>' and trailing '</s>' from finished sentences.
                completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                       score=cand_new_hyp_score))
            else:
                new_hypotheses.append(new_hyp_sent)
                live_hyp_ids.append(prev_hyp_id)
                new_hyp_scores.append(cand_new_hyp_score)

        # Stop when the beam is full, or when no hypothesis is left to extend.
        if len(completed_hypotheses) == beam_size or not new_hypotheses:
            break

        # Keep only the decoder states that belong to the surviving hypotheses.
        live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=model.device)
        dec_state = (h_t[:, live_hyp_ids], cell_t[:, live_hyp_ids])

        hypotheses = new_hypotheses
        hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=model.device)

    # Nothing reached '</s>' in time: fall back to the first live hypothesis.
    if len(completed_hypotheses) == 0:
        completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                               score=hyp_scores[0].item()))

    completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

    return completed_hypotheses

#### BLEU
Evaluate the model's corpus-level BLEU score on the test set.

In [0]:
def test_beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    The model is switched to eval mode while decoding; if it was in training mode on entry,
    training mode is restored afterwards, even when decoding raises.
    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    try:
        # Decoding needs no gradients.
        with torch.no_grad():
            for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
                example_hyps = beam_search(model, src_sent, beam_size=beam_size, max_decoding_time_step=max_decoding_time_step)
                hypotheses.append(example_hyps)
    finally:
        # Do not leave the caller's model stuck in eval mode after a decoding error.
        if was_training:
            model.train(was_training)

    return hypotheses
  

In [0]:
def compute_corpus_level_bleu_score(references: List[List[str]], hypotheses: List[Hypothesis]) -> float:
    """ Given decoding results and reference sentences, compute corpus-level BLEU score.
    If the first reference starts with '<s>', the first and last token of every reference are
    dropped before scoring. An empty `references` list or an empty first reference skips that
    check instead of raising IndexError.
    @param references (List[List[str]]): a list of gold-standard reference target sentences
    @param hypotheses (List[Hypothesis]): a list of hypotheses, one for each reference
    @returns bleu_score: corpus-level BLEU score
    """
    # Only the first reference is inspected; the rest are assumed to be wrapped the same way.
    has_sentence_markers = bool(references) and bool(references[0]) and references[0][0] == '<s>'
    if has_sentence_markers:
        references = [ref[1:-1] for ref in references]

    # corpus_bleu is given a list of alternative references per hypothesis; here there is one each.
    bleu_score = corpus_bleu([[ref] for ref in references],
                             [hyp.value for hyp in hypotheses])
    return bleu_score

In [0]:
def decode(model, beam_size: int=5, max_decoding_time_step: int=70, output_file: str='test_output.txt'):
    """ Performs decoding on the test set, and saves the best-scoring decoding results.
    Also computes the corpus-level BLEU score against the gold-standard target sentences
    and prints it to stderr.
    @param model: trained NMT model to decode with; moved to "cuda:0" when CUDA is available
    @param beam_size (int): beam size passed to beam search
    @param max_decoding_time_step (int): maximum number of decoding steps per sentence
    @param output_file (str): path of the text file receiving one top translation per line
    """

    print("load test source sentences", file=sys.stderr)
    test_data_src = read_corpus(test_es, source='src')

    print("load test target sentences", file=sys.stderr)
    test_data_tgt = read_corpus(test_en, source='tgt')

    if torch.cuda.is_available():
        print("Transfer to cuda!!")
        model = model.to(torch.device("cuda:0"))

    hypotheses = test_beam_search(model, test_data_src,
                                  beam_size=beam_size,
                                  max_decoding_time_step=max_decoding_time_step)

    # Only the best-scoring hypothesis of each sentence is scored and saved.
    top_hypotheses = [hyps[0] for hyps in hypotheses]
    bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
    print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    # Explicit encoding so the output does not depend on the platform default.
    with open(output_file, 'w', encoding='utf-8') as f:
        for top_hyp in top_hypotheses:
            f.write(' '.join(top_hyp.value) + '\n')

In [51]:
# Decode the test set with the trained model: prints corpus BLEU and writes
# the top translation of every sentence to test_output.txt.
decode(model)

Transfer to cuda!!
Decoding:   0%|          | 0/8064 [00:00<?, ?it/s]

load test source sentences
load test target sentences


Decoding: 100%|██████████| 8064/8064 [07:01<00:00, 19.15it/s]


Corpus BLEU: 8.029102353804074


#### Inference

In [52]:
# Translate one hand-tokenised Spanish sentence and show the best hypothesis
# next to a human reference translation.
src_sent =['También', 'tenemos', 'que', 'tener', 'cuidado', 'con', 'el', 'hielo', 'se', 'resbala', 'fácilmente', 'en', 'él.']
en_ref = ['We', 'also', 'have', 'to', 'be', 'careful', 'with', 'the', 'ice,','it', 'slides', 'easily', 'on', 'it.']
en_hat = beam_search(model, src_sent, beam_size=5, max_decoding_time_step=70)
print(
    "==" * 40,
    "Model Translation:\n",
    ' '.join(en_hat[0].value),
    "\n",
    "Human Reference:\n",
    ' '.join(en_ref),
    "==" * 40,
    sep="\n",
)

Model Translation:

<unk> we have to be able to cross the ice course of <unk> <unk>


Human Reference:

We also have to be careful with the ice, it slides easily on it.
