### Objective
Build a machine translation model (Spanish --> English): LSTM Encoder-Decoder + Attention Mechanism

In [1]:
# Colab-only setup: make Google Drive available to this runtime.
import os
import numpy as np

# Mount Drive under /content/gdrive. Running this prompts for an authorization
# code (see the cell output). force_remount=True is passed -- presumably so a
# stale mount from an earlier session is replaced; confirm against Colab docs.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Work from the project folder on the mounted Drive so that the relative
# 'en_es_data/...' paths used by later cells resolve.
# NOTE(review): absolute, user-specific path -- adjust when running elsewhere.
root_dir = "/content/gdrive/My Drive/NLP/MT_ENSP"
os.chdir(root_dir)

### Imports

In [0]:
# basic packages
import sys
import os
import math
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from collections import Counter, namedtuple
from docopt import docopt
from itertools import chain
import json
from typing import List, Tuple, Dict, Set, Union
from docopt import docopt

#pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence



#others
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
from tqdm import tqdm
from IPython.core.debugger import set_trace

### Utility Functions

In [0]:
# post-padding for source/target sequences

def pad_sents(sents, pad_token):
    """ Pad list of sentences according to the longest sentence in the batch.

    Padding is appended at the end of each sentence (post-padding). New lists
    are built; the input sentences are not modified.

    @param sents (list[list[str]]): list of sentences, where each sentence
                                    is represented as a list of words (or word ids)
    @param pad_token (str): padding token (or padding id)
    @returns sents_padded (list[list[str]]): list of sentences where sentences shorter
        than the max length sentence are padded out with the pad_token, such that
        each sentence in the batch now has equal length. An empty batch yields
        an empty list.
    """
    # default=0 keeps an empty batch from raising ValueError inside max().
    max_len = max((len(sent) for sent in sents), default=0)
    return [sent + [pad_token] * (max_len - len(sent)) for sent in sents]

In [5]:
# verify padding: the shorter sentence should be extended with '<Pad>' up to
# the length of the longer one (expected result shown in the cell output).
# `sents` is also read by the VocabEntry check cell further down.
sents = [['a','clear','day'],['it','is','not','raining','today']]
pad_sents(sents,'<Pad>')

[['a', 'clear', 'day', '<Pad>', '<Pad>'],
 ['it', 'is', 'not', 'raining', 'today']]

In [0]:
# read from corpus: vocab building

def read_corpus(file_path, source):
    """ Read a corpus file that holds one sentence per line.

    Each line is stripped and then split on single spaces, so consecutive
    spaces inside a line produce empty-string tokens.

    @param file_path (str): path to file containing corpus
    @param source (str): "tgt" or "src" indicating whether text
        is of the source language or target language
    @returns data (list[list[str]]): one token list per line; target sentences
        are wrapped in '<s>' ... '</s>'
    """
    data = []
    # The context manager closes the handle as soon as reading is done instead
    # of leaving it to the garbage collector.
    with open(file_path) as corpus_file:
        for line in corpus_file:
            sent = line.strip().split(' ')
            # only append <s> and </s> to the target sentence
            if source == 'tgt':
                sent = ['<s>'] + sent + ['</s>']
            data.append(sent)

    return data

In [7]:
# verify if read_corpus is working: load the English dev file as source-side
# text (so no <s>/</s> are added) and display its second sentence.
# The '' token in the output comes from read_corpus splitting on single
# spaces: a double space in the file yields an empty token.
file_path = 'en_es_data/dev.en'
data = read_corpus(file_path, 'src')
data[1]

['But',
 'this',
 'understates',
 'the',
 'seriousness',
 'of',
 'this',
 'particular',
 'problem',
 '',
 'because',
 'it',
 "doesn't",
 'show',
 'the',
 'thickness',
 'of',
 'the',
 'ice.']

In [0]:
# generate batches for training

def batch_iter(data, batch_size, shuffle=False):
    """ Generate mini-batches of (source sentences, target sentences).

    Inside every batch the examples are ordered by source-sentence length,
    longest first.

    @param data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (int): batch size
    @param shuffle (boolean): whether to randomly shuffle the dataset
    @yields (src_sents, tgt_sents): two parallel lists for one batch
    """
    n_batches = math.ceil(len(data) / batch_size)
    order = list(range(len(data)))

    if shuffle:
        np.random.shuffle(order)

    for batch_no in range(n_batches):
        start = batch_no * batch_size
        chunk = [data[pos] for pos in order[start: start + batch_size]]

        # stable in-place sort, longest source sentence first
        chunk.sort(key=lambda pair: len(pair[0]), reverse=True)

        yield [pair[0] for pair in chunk], [pair[1] for pair in chunk]

In [9]:
# check batch_iter on the dev set: Spanish is read as the source side and
# English as the target side (so English sentences get <s>/</s>).
es_path = 'en_es_data/dev.es'
en_path = 'en_es_data/dev.en'

es_data = read_corpus(es_path, source = 'src')
en_data = read_corpus(en_path, source = 'tgt')

# pair up the parallel sentences as (src_sent, tgt_sent) tuples
data = list(zip(es_data, en_data))

# first four examples in batches of two; each printed batch should be
# ordered by source length, longest first
for src, tgt in batch_iter(data[:4],2):
  print(src, tgt)

[['El', 'ao', 'pasado', 'proyect', 'estas', 'dos', 'diapositivas', 'para', 'demostrar', 'que', 'la', 'capa', 'de', 'hielo', 'rtico,', 'que', 'durante', 'los', 'ltimos', 'tres', 'millones', 'de', 'aos', 'ha', 'sido', 'del', 'tamao', 'de', 'los', '48', 'estados,', 'se', 'ha', 'reducido', 'en', 'un', '40', 'por', 'ciento.'], ['Pero', 'esto', 'minimiza', 'la', 'seriedad', 'de', 'este', 'problema', 'concreto', 'porque', 'no', 'muestra', 'el', 'grosor', 'del', 'hielo.']] [['<s>', 'Last', 'year', 'I', 'showed', 'these', 'two', 'slides', 'so', 'that', '', 'demonstrate', 'that', 'the', 'arctic', 'ice', 'cap,', '', 'which', 'for', 'most', 'of', 'the', 'last', 'three', 'million', 'years', '', 'has', 'been', 'the', 'size', 'of', 'the', 'lower', '48', 'states,', '', 'has', 'shrunk', 'by', '40', 'percent.', '</s>'], ['<s>', 'But', 'this', 'understates', 'the', 'seriousness', 'of', 'this', 'particular', 'problem', '', 'because', 'it', "doesn't", 'show', 'the', 'thickness', 'of', 'the', 'ice.', '</s>'

### Build Vocab

In [0]:
class VocabEntry(object):
    """ Vocabulary Entry, i.e. structure containing either
    src or tgt language terms.

    Holds two dictionaries: `word2id` (word -> index) and `id2word`
    (index -> word). Unknown words map to the index of '<unk>'.
    """
    def __init__(self, word2id=None):
        """ Init VocabEntry Instance.
        @param word2id (dict): dictionary mapping words 2 indices. When omitted
            (or empty), the vocabulary starts with the four special tokens
            '<pad>', '<s>', '</s>' and '<unk>'. A supplied mapping must
            contain '<unk>'.
        """
        if word2id:
            self.word2id = word2id
        else:
            self.word2id = dict()
            self.word2id['<pad>'] = 0   # Pad Token
            self.word2id['<s>'] = 1 # Start Token
            self.word2id['</s>'] = 2    # End Token
            self.word2id['<unk>'] = 3   # Unknown Token
        self.unk_id = self.word2id['<unk>']
        # NOTE: this instance attribute has the same name as the `id2word`
        # method below and therefore hides it on every instance.
        self.id2word = {v: k for k, v in self.word2id.items()}

    def __getitem__(self, word):
        """ Retrieve word's index. Return the index for the unk
        token if the word is out of vocabulary.
        @param word (str): word to look up.
        @returns index (int): index of word
        """
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        """ Check if word is captured by VocabEntry.
        @param word (str): word to look up
        @returns contains (bool): whether word is contained
        """
        return word in self.word2id

    def __setitem__(self, key, value):
        """ Raise error, if one tries to edit the VocabEntry.
        Use `add` to insert new words.
        """
        raise ValueError('vocabulary is readonly')

    def __len__(self):
        """ Compute number of words in VocabEntry.
        @returns len (int): number of words in VocabEntry
        """
        return len(self.word2id)

    def __repr__(self):
        """ Representation of VocabEntry to be used
        when printing the object.
        """
        return 'Vocabulary[size=%d]' % len(self)

    def id2word(self, wid):
        """ Return mapping of index to word.

        NOTE(review): `__init__` always stores a dict under the same name, so
        on an instance `entry.id2word` is that dict and `entry.id2word(wid)`
        fails with "'dict' object is not callable". Index the dict
        (`entry.id2word[wid]`) or use `indices2words` instead. The method is
        kept only so the class-level name does not disappear.

        @param wid (int): word index
        @returns word (str): word corresponding to index
        """
        return self.id2word[wid]

    def add(self, word):
        """ Add word to VocabEntry, if it is previously unseen.
        @param word (str): word to add to VocabEntry
        @return index (int): index that the word has been assigned
        """
        if word not in self:
            # the next free index equals the current vocabulary size
            wid = self.word2id[word] = len(self)
            self.id2word[wid] = word
            return wid
        else:
            return self[word]

    def words2indices(self, sents):
        """ Convert list of words or list of sentences of words
        into list or list of list of indices.
        @param sents (list[str] or list[list[str]]): sentence(s) in words
        @return word_ids (list[int] or list[list[int]]): sentence(s) in indices;
            an empty input gives an empty list
        """
        # Guard first: peeking at sents[0] on an empty input would raise IndexError.
        if not sents:
            return []
        if isinstance(sents[0], list):
            return [[self[w] for w in s] for s in sents]
        else:
            return [self[w] for w in sents]

    def indices2words(self, word_ids):
        """ Convert list of indices into words.
        @param word_ids (list[int]): list of word ids
        @return sents (list[str]): list of words
        """
        return [self.id2word[w_id] for w_id in word_ids]

    def to_input_tensor(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size)
        """
        word_ids = self.words2indices(sents)
        sents_t = pad_sents(word_ids, self['<pad>'])
        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)
        # built as (batch_size, max_len); transpose to (max_len, batch_size)
        return torch.t(sents_var)

    @staticmethod
    def from_corpus(corpus, size, freq_cutoff=2):
        """ Given a corpus construct a Vocab Entry.
        @param corpus (list[list[str]]): corpus of text produced by read_corpus function
        @param size (int): maximum number of corpus words considered for the vocabulary
            (the four special tokens are present in addition)
        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word
        @returns vocab_entry (VocabEntry): VocabEntry instance produced from provided corpus
        """
        vocab_entry = VocabEntry()
        word_freq = Counter(chain(*corpus))
        valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff]
        print('number of word types: {}, number of word types w/ frequency >= {}: {}'
              .format(len(word_freq), freq_cutoff, len(valid_words)))
        # most frequent first; words already in the vocabulary (e.g. '<s>')
        # are not added a second time by `add`
        top_k_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)[:size]
        for word in top_k_words:
            vocab_entry.add(word)
        return vocab_entry

In [11]:
# Check Vocab Entry Object
lang = VocabEntry()
#
print('Check if <pad> token is inside vocab:')
print('<pad>' in lang) #__contains__#
print('Length of vocab:{}'.format(len(lang))) # __len__
print('Adding a new word')
lang.add('new') # add new entry 'add' method
print(lang) #__repr__
print('The index of "new" is:{}'.format(lang.word2id['new']))
## 
print('Generate a vocab with the from_corpus static method:')
en_vocab = VocabEntry.from_corpus(en_data, 100)
print('The token for the word "the" is:{}'.format(en_vocab.word2id['the']))
# 
print('check to_input_tensor method:')
# set device name: use the GPU when one is available, otherwise fall back to
# the CPU so this check also runs on a runtime without CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# `sents` is the two-sentence toy batch from the padding check cell
temp = en_vocab.to_input_tensor(sents,device)
print(temp)
# Note: output tensor shape--> (max_len, batch)
# len(en_vocab)

Check if <pad> token is inside vocab:
True
Length of vocab:4
Adding a new word
Vocabulary[size=5]
The index of "new" is:4
Generate a vocab with the from_corpus static method:
number of word types: 3955, number of word types w/ frequency >= 2: 1339
The token for the word "the" is:5
check to_input_tensor method:
tensor([[10, 18],
        [ 3, 11],
        [ 3, 37],
        [ 0,  3],
        [ 0,  3]], device='cuda:0')


In [0]:
class Vocab(object):
    """ Vocab encapsulating src and target languages.
    """
    def __init__(self, src_vocab: 'VocabEntry', tgt_vocab: 'VocabEntry'):
        """ Init Vocab.
        @param src_vocab (VocabEntry): VocabEntry for source language
        @param tgt_vocab (VocabEntry): VocabEntry for target language
        """
        self.src = src_vocab
        self.tgt = tgt_vocab

    @staticmethod
    def build(src_sents, tgt_sents, vocab_size, freq_cutoff) -> 'Vocab':
        """ Build Vocabulary.
        @param src_sents (list[list[str]]): Source sentences provided by read_corpus() function
        @param tgt_sents (list[list[str]]): Target sentences provided by read_corpus() function
        @param vocab_size (int): Size of vocabulary for both source and target languages
        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word.
        @returns Vocab object holding one VocabEntry per language
        """
        # the two corpora must be parallel (one target sentence per source sentence)
        assert len(src_sents) == len(tgt_sents)

        print('initialize source vocabulary ..')
        src = VocabEntry.from_corpus(src_sents, vocab_size, freq_cutoff)

        print('initialize target vocabulary ..')
        tgt = VocabEntry.from_corpus(tgt_sents, vocab_size, freq_cutoff)

        return Vocab(src, tgt)

    def save(self, file_path):
        """ Save Vocab to file as JSON dump.

        Only the two word -> index mappings are written, under the keys
        'src_word2id' and 'tgt_word2id'.

        @param file_path (str): file path to vocab file
        """
        payload = dict(src_word2id=self.src.word2id, tgt_word2id=self.tgt.word2id)
        # `with` flushes and closes the file even if serialization fails.
        with open(file_path, 'w') as vocab_file:
            json.dump(payload, vocab_file, indent=2)

    @staticmethod
    def load(file_path):
        """ Load vocabulary from JSON dump.
        @param file_path (str): file path to vocab file
        @returns Vocab object loaded from JSON dump
        """
        with open(file_path, 'r') as vocab_file:
            entry = json.load(vocab_file)
        src_word2id = entry['src_word2id']
        tgt_word2id = entry['tgt_word2id']

        return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id))

    def __repr__(self):
        """ Representation of Vocab to be used
        when printing the object.
        """
        return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt))


In [13]:
# Now build source [Spanish] and target [English] vocab from the training set

train_es = 'en_es_data/train.es'
train_en = 'en_es_data/train.en'
vocab_file = 'en_es_data/vocab.json'

src_sents = read_corpus(train_es, source='src')
tgt_sents = read_corpus(train_en, source='tgt')

# keep at most `size` corpus words per language, and only words that occur
# at least `freq_cutoff` times
size = 50000
freq_cutoff= 2

vocab = Vocab.build(src_sents, tgt_sents, size, freq_cutoff)
print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

vocab.save(vocab_file)
print('vocabulary saved to %s' % vocab_file)

# Explain the 50004 vs 50002 sizes shown in the output: the target corpus
# contains <s>/</s> on every line, so they take two of the top-`size` slots
# but are already in the vocabulary and add nothing new.
print('Note that the <s> and </s> tokens are added while vocab initialization.\n These tokens are also present in target top frequent words. \nThat is why vocab size for target language is lesser by 2.')

initialize source vocabulary ..
number of word types: 172418, number of word types w/ frequency >= 2: 80623
initialize target vocabulary ..
number of word types: 128873, number of word types w/ frequency >= 2: 64215
generated vocabulary, source 50004 words, target 50002 words
vocabulary saved to en_es_data/vocab.json
Note that the <s> and </s> tokens are added while vocab initialization.
 These tokens are also present in target top frequent words. 
That is why vocab size for target language is lesser by 2.


### Load Data and EDA

In [14]:
# load data: Spanish files are the source side, English files the target side
train_es = 'en_es_data/train.es'
train_en = 'en_es_data/train.en'

dev_es = 'en_es_data/dev.es'
dev_en = 'en_es_data/dev.en'

test_es = 'en_es_data/test.es'
test_en = 'en_es_data/test.en'


# target-side sentences are wrapped in <s> ... </s> by read_corpus
train_data_src = read_corpus(train_es, source='src')
train_data_tgt = read_corpus(train_en, source='tgt')

dev_data_src = read_corpus(dev_es, source='src')
dev_data_tgt = read_corpus(dev_en, source='tgt')

test_data_src = read_corpus(test_es, source='src')
test_data_tgt = read_corpus(test_en, source='tgt')

# pair the parallel sentences as (src_sent, tgt_sent) tuples
train_data = list(zip(train_data_src,train_data_tgt))
dev_data = list(zip(dev_data_src,dev_data_tgt))
test_data = list(zip(test_data_src,test_data_tgt))

# split sizes
print("=="*40)
print("Number of examples in train: {}".format(len(train_data)))
print("Number of examples in valid: {}".format(len(dev_data)))
print("Number of examples in test: {}".format(len(test_data)))
# show the first dev pair as a sanity check of the alignment
print("=="*40)
print("Spanish --> English")
es, en = next(iter(dev_data))
print("Sp: {}".format(' '.join(es)))
print("En: {}".format(' '.join(en)))
print("=="*40)


## Build Vocab
# Build Vocab with train set

# keep at most `size` corpus words per language, and only words that occur
# at least `freq_cutoff` times
size = 50000
freq_cutoff= 2
vocab_file = 'en_es_data/vocab.json'

vocab = Vocab.build(train_data_src, train_data_tgt, size, freq_cutoff)
print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

vocab.save(vocab_file)
print('vocabulary saved to %s' % vocab_file)

# Adjacent string literals are used for the long message: a backslash
# continuation inside the quotes would embed the next line's indentation
# in the printed text.
print("=="*40)
print('Note that the <s> and </s> tokens are added while vocab '
      'initialization.\nThese tokens are also present in target '
      'top frequent words. \nThat is why vocab size for target language is lesser by 2.')
print("=="*40)


# Check tokenization process: convert two toy English sentences, which
# include made-up words, to a padded id tensor with the target vocabulary
print("=="*40)
sents = [['I', 'asgjsssd', 'will', 'be', 'there', 'for', 'you.'], ['This', 'is', 'spartaaaaaaaa.']]
print("Tokenize:\n {} \n {}\n".format(' '.join(sents[0]), ' '.join(sents[1])))

# result has shape (max_sentence_length, batch_size)
print(vocab.tgt.to_input_tensor(sents, "cpu"))
# id 3 is '<unk>' (out-of-vocabulary word), id 0 is '<pad>' (padding)
print("=="*40)
print("Note that 3 and 0  are <unk> and <pad> tokens!")
print("=="*40)

Number of examples in train: 216617
Number of examples in valid: 851
Number of examples in test: 8064
Spanish --> English
Sp: El ao pasado proyect estas dos diapositivas para demostrar que la capa de hielo rtico, que durante los ltimos tres millones de aos ha sido del tamao de los 48 estados, se ha reducido en un 40 por ciento.
En: <s> Last year I showed these two slides so that  demonstrate that the arctic ice cap,  which for most of the last three million years  has been the size of the lower 48 states,  has shrunk by 40 percent. </s>
initialize source vocabulary ..
number of word types: 172418, number of word types w/ frequency >= 2: 80623
initialize target vocabulary ..
number of word types: 128873, number of word types w/ frequency >= 2: 64215
generated vocabulary, source 50004 words, target 50002 words
vocabulary saved to en_es_data/vocab.json
Note that the <s> and </s> tokens are added while vocab      initialization.
These tokens are also present in target      top frequent wor

### Embedding
Initialize embedding matrix for words in source and target languages

In [0]:
class ModelEmbeddings(nn.Module): 
    """
    Embedding lookup tables for the source and the target language.
    """
    def __init__(self, embed_size, vocab):
        """
        Create one embedding layer per language.

        @param embed_size (int): Embedding size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              (see the Vocab class defined in this notebook).
        """
        super().__init__()
        self.embed_size = embed_size

        # index of '<pad>' in each vocabulary, registered as padding_idx below
        pad_ids = (vocab.src['<pad>'], vocab.tgt['<pad>'])

        # source table first, then target, one row per vocabulary word
        self.source = nn.Embedding(
            num_embeddings=len(vocab.src),
            embedding_dim=embed_size,
            padding_idx=pad_ids[0],
        )
        self.target = nn.Embedding(
            num_embeddings=len(vocab.tgt),
            embedding_dim=embed_size,
            padding_idx=pad_ids[1],
        )


## Encoder-Decoder model
With a multiplicative (Luong-style global) attention mechanism

In [0]:
## Hypothesis record for beam search (used later): `value` holds the decoded
## target sentence as a list of words, `score` its log-likelihood
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])

### Neural Machine Translation model
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidrectional LSTM Encoder
        - Unidirection LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Set up the embeddings and every layer of the NMT model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              (see the Vocab class defined in this notebook).
        @param dropout_rate (float): Dropout probability, applied to the
                                     combined attention output
        """
        super().__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        h = hidden_size

        # Bidirectional encoder: each source position yields 2*h features.
        self.encoder = nn.LSTM(embed_size, h, bidirectional=True)
        # Decoder input is the target embedding concatenated with the previous
        # combined output (e + h).
        self.decoder = nn.LSTMCell(embed_size + h, h)
        # Project the concatenated final encoder states (2*h) down to the
        # decoder's initial hidden / cell state (h).
        self.h_projection = nn.Linear(2 * h, h, bias=False)
        self.c_projection = nn.Linear(2 * h, h, bias=False)
        # Project encoder states (2*h -> h) for the attention scores.
        self.att_projection = nn.Linear(2 * h, h, bias=False)
        # Merge [attention context (2*h); decoder hidden (h)] into h features.
        self.combined_output_projection = nn.Linear(3 * h, h, bias=False)
        # Scores over the target vocabulary.
        self.target_vocab_projection = nn.Linear(h, len(vocab.tgt), bias=False)
        self.dropout = nn.Dropout(dropout_rate)



    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Score a mini-batch: log-likelihood of each gold target sentence given
        its source sentence.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        src_lens = [len(sent) for sent in source]

        # words -> padded id tensors
        src_ids = self.vocab.src.to_input_tensor(source, device=self.device)   # Tensor: (src_len, b)
        tgt_ids = self.vocab.tgt.to_input_tensor(target, device=self.device)   # Tensor: (tgt_len, b)

        # encoder -> attention masks -> decoder
        enc_hiddens, dec_init_state = self.encode(src_ids, src_lens)
        enc_masks = self.generate_sent_masks(enc_hiddens, src_lens)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, tgt_ids)

        # log-probabilities over the target vocabulary at every decoding step
        vocab_scores = self.target_vocab_projection(combined_outputs)
        log_probs = F.log_softmax(vocab_scores, dim=-1)

        # 1.0 for real tokens, 0.0 for padding
        not_pad = (tgt_ids != self.vocab.tgt['<pad>']).float()

        # Step t predicts token t+1, hence the [1:] shift on gold words and mask.
        gold_next = tgt_ids[1:].unsqueeze(-1)
        gold_log_probs = torch.gather(log_probs, index=gold_next, dim=-1).squeeze(-1) * not_pad[1:]
        return gold_log_probs.sum(dim=0)


    def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Run the bidirectional encoder over the source batch and derive the
            decoder's initial state from the encoder's final states.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        embedded = self.model_embeddings.source(source_padded)  # (src_len, b, embed_size)

        # Pack before the LSTM and unpack afterwards (batch-first output).
        packed_in = pack_padded_sequence(embedded, source_lengths)
        packed_out, (h_n, c_n) = self.encoder(packed_in)  # initial (h0, c0) default to zero
        enc_hiddens, _ = pad_packed_sequence(packed_out, batch_first=True)

        # Index 0 / 1 of the first axis are the two directions; join them per
        # example into (b, 2*h).
        h_both = torch.cat((h_n[0], h_n[1]), dim=1)
        c_both = torch.cat((c_n[0], c_n[1]), dim=1)

        # Project 2*h -> h to fit the unidirectional decoder.
        dec_init_state = (self.h_projection(h_both), self.c_projection(c_both))
        return enc_hiddens, dec_init_state


    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
                dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor:
        """Unroll the decoder over the gold target words and collect the combined
        output vector of every step.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size.

        @returns combined_outputs (Tensor): combined output tensor of shape (tgt_len - 1, b, h):
                                        one vector per decoder input position (the last target
                                        position is never fed in), b = batch_size, h = hidden size
        """
        # Drop the last position: it is only ever a prediction target, never an input.
        decoder_inputs = target_padded[:-1]

        dec_state = dec_init_state

        # o_{t-1} starts out as zeros
        o_prev = torch.zeros(enc_hiddens.size(0), self.hidden_size, device=self.device)

        # Loop-invariant work: attention projection of the encoder states and
        # the embeddings of all decoder inputs.
        enc_hiddens_proj = self.att_projection(enc_hiddens)
        Y = self.model_embeddings.target(decoder_inputs)  # (tgt_len - 1, b, e)

        step_outputs = []
        for t in range(decoder_inputs.size(0)):
            Ybar_t = torch.cat((Y[t], o_prev), dim=1)  # (b, e + h)
            dec_state, o_prev, _ = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks)
            step_outputs.append(o_prev)

        return torch.stack(step_outputs)


    def step(self, Ybar_t: torch.Tensor,
            dec_state: Tuple[torch.Tensor, torch.Tensor],
            enc_hiddens: torch.Tensor,
            enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. Non-zero entries mark
                                    padding positions that must receive no attention. May be None (no masking).

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len) with the attention scores (before softmax);
                                masked positions hold -inf. Returned for sanity checks only.
        """
        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, _ = dec_state

        # Multiplicative attention: score each projected encoder state against
        # the new decoder hidden state.
        aug_dec_hidden = torch.unsqueeze(dec_hidden, dim=2)  # (b, h, 1)
        e_t = torch.bmm(enc_hiddens_proj, aug_dec_hidden)    # (b, src_len, h) x (b, h, 1) -> (b, src_len, 1)
        e_t = torch.squeeze(e_t, dim=2)                      # (b, src_len)

        # Set e_t to -inf where enc_masks is non-zero so padding gets zero weight
        # after the softmax. Out-of-place masked_fill with a boolean mask:
        # byte masks are deprecated and going through `.data` bypasses autograd.
        if enc_masks is not None:
            e_t = e_t.masked_fill(enc_masks.bool(), -float('inf'))

        alpha_t = F.softmax(e_t, dim=1)  # (b, src_len) attention distribution

        # Context vector: attention-weighted sum of the encoder states.
        aug_att = torch.unsqueeze(alpha_t, 2)      # (b, src_len, 1)
        tr_hiddens = enc_hiddens.transpose(1, 2)   # (b, 2*h, src_len)
        a_t = torch.bmm(tr_hiddens, aug_att)       # (b, 2*h, 1)
        a_t = torch.squeeze(a_t, dim=2)            # (b, 2*h)

        U_t = torch.cat((a_t, dec_hidden), dim=1)       # (b, 3*h)
        V_t = self.combined_output_projection(U_t)      # (b, h)
        combined_output = self.dropout(torch.tanh(V_t))

        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
        """ Build the padding masks that go with a batch of encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size.
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.

        @returns enc_masks (Tensor): Float tensor of sentence masks of shape (b, src_len),
                                    where src_len = max source length. An entry is 1 at positions
                                    past the sentence's real length (padding) and 0 elsewhere.
        """
        n_sents, max_len = enc_hiddens.size(0), enc_hiddens.size(1)
        pad_mask = torch.zeros((n_sents, max_len), dtype=torch.float)
        for row, real_len in enumerate(source_lengths):
            # everything from the sentence's real length onwards is padding
            pad_mask[row, real_len:] = 1
        return pad_mask.to(self.device)


    def beam_search(self, src_sent: List[str], beam_size: int=5, max_decoding_time_step: int=70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.

        Decoding stops once `beam_size` hypotheses have emitted `</s>` or after
        `max_decoding_time_step` steps. If nothing finished by then, the first
        live (unfinished) hypothesis is returned instead.

        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): hypotheses sorted by descending score; each has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        # Encode the sentence once; the encodings are shared (via expand) by every live hypothesis.
        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        vocab_size = len(self.vocab.tgt)

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            # One (broadcast) copy of the source encodings per live hypothesis.
            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num,
                                                                           src_encodings_att_linear.size(1),
                                                                           src_encodings_att_linear.size(2))

            # Feed the last word of every live hypothesis, concatenated with the previous attention output.
            y_tm1 = torch.tensor([self.vocab.tgt[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            # No mask is needed: a single sentence has no padding.
            (h_t, cell_t), att_t, _ = self.step(x, h_tm1,
                                                exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            # Score every (hypothesis, next word) continuation and keep the best ones;
            # the beam shrinks as hypotheses complete.
            live_hyp_num = beam_size - len(completed_hypotheses)
            continuing_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(continuing_hyp_scores, k=live_hyp_num)

            # A flat position encodes (hypothesis, word) as hyp_id * vocab_size + word_id.
            # Integer floor division is required here: `/` is true division on integer
            # tensors in current PyTorch and would yield float ids that cannot index
            # the Python list `hypotheses`.
            prev_hyp_ids = top_cand_hyp_pos // vocab_size
            hyp_word_ids = top_cand_hyp_pos % vocab_size

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids.tolist(),
                                                                    hyp_word_ids.tolist(),
                                                                    top_cand_hyp_scores.tolist()):
                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    # Finished: strip the leading <s> and trailing </s> from the stored sentence.
                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                           score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            # Carry forward only the decoder states that belong to surviving hypotheses.
            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        # Nothing reached </s> within the step budget: fall back to the first live hypothesis.
        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                   score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Device (CPU or GPU) on which new tensors should be placed; read from
        the source embedding weights of the model.
        """
        source_embedding_weight = self.model_embeddings.source.weight
        return source_embedding_weight.device

    @staticmethod
    def load(model_path: str):
        """ Rebuild a model from a checkpoint file holding the keys
        'args', 'vocab' and 'state_dict' (the layout written by `save`).
        @param model_path (str): path to model
        @returns model (NMT): model constructed from the stored arguments and vocab,
                              with the stored weights loaded
        """
        # The map_location callable returns each storage unchanged, i.e. tensors are
        # not moved back to the device they were saved from.
        checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
        constructor_args = checkpoint['args']
        restored = NMT(vocab=checkpoint['vocab'], **constructor_args)
        restored.load_state_dict(checkpoint['state_dict'])

        return restored

    def save(self, path: str):
        """ Save the model to a file.

        The checkpoint is a dict with the constructor arguments ('args'),
        the vocabulary ('vocab') and the weights ('state_dict').
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        constructor_args = dict(embed_size=self.model_embeddings.embed_size,
                                hidden_size=self.hidden_size,
                                dropout_rate=self.dropout_rate)
        checkpoint = {'args': constructor_args,
                      'vocab': self.vocab,
                      'state_dict': self.state_dict()}

        torch.save(checkpoint, path)


### Build Model

In [0]:
# Build the translation model: 256-d word embeddings, 256-d hidden state, dropout 0.3
model = NMT(
    embed_size=256,
    hidden_size=256,
    dropout_rate=0.3,
    vocab=vocab,
)
# Put the model in training mode; the trailing `;` hides the module repr in the notebook
model.train();

In [26]:
## Parameter Initialization
# Half-width of the uniform interval every parameter is drawn from
uniform_init = 0.1

def init_weights(m):
    """Re-draw every parameter reachable from module `m` from U(-uniform_init, uniform_init).

    Note: when used through `Module.apply`, the function is called once per
    submodule while `parameters()` itself recurses, so nested parameters are
    re-drawn once per enclosing module. The final distribution is unaffected.

    @param m (nn.Module): module whose parameters are overwritten in place
    """
    low, high = -uniform_init, uniform_init
    for weight in m.parameters():
        nn.init.uniform_(weight.data, low, high)
        
# Apply the uniform initialisation to the model and all of its submodules
model.apply(init_weights);

# Count total parameters
def count_parameters(model):
    """Return the number of scalar entries across all parameters that require gradients.

    @param model (nn.Module): module whose parameters are counted
    @returns total (int): number of trainable scalar parameters
    """
    total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            total += tensor.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

# Transfer the model to CUDA if available. This is done *before* building the
# optimizer so the optimizer is constructed over parameters that already live
# on the device they will be trained on (as the torch.optim docs recommend).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('use device: %s' % device)
model = model.to(device)

# Use the Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

The model has 40,833,024 trainable parameters
use device: cuda:0


### Training

#### Perplexity (PPL)

In [0]:
## Compute Perplexity to keep track of training

def evaluate_ppl(model, dev_data, batch_size=32):
    """ Evaluate perplexity on dev sentences.

    The model is switched to eval mode for the duration of the call and put
    back into training mode afterwards if that is how it was found, even when
    evaluation raises or is interrupted.

    @param model (NMT): NMT Model
    @param dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (int): batch size
    @returns ppl (float): perplexity on dev sentences, exp(total loss / number of target words)
    @raises ValueError: if `dev_data` yields no target words to predict
    """
    was_training = model.training
    model.eval()

    cum_loss = 0.
    cum_tgt_words = 0.

    try:
        # no_grad() disables gradient tracking: nothing is recorded for backprop
        with torch.no_grad():
            for src_sents, tgt_sents in batch_iter(dev_data, batch_size):
                # the model's output is negated and summed to get the batch loss
                loss = -model(src_sents, tgt_sents).sum()

                cum_loss += loss.item()
                tgt_word_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
                cum_tgt_words += tgt_word_num_to_predict
    finally:
        # restore the caller's mode even if evaluation failed part-way
        if was_training:
            model.train()

    if cum_tgt_words == 0:
        raise ValueError('evaluate_ppl: dev_data contains no target words to predict')

    ppl = np.exp(cum_loss / cum_tgt_words)

    return ppl

#### Model Training function

In [0]:
######## Train Model ########

# Path of the best checkpoint; the optimizer state is stored next to it with a '.optim' suffix
model_save_path = 'NMT_LSTM_seq2seq_one_layer'


def train_model(model, optimizer, clip_grad=5.0, max_epoch=30, max_patience=3, max_trial=3, lr_decay=0.5, train_batch_size=128, log_every=100, valid_niter=1000):
    """ Train `model` with periodic validation, checkpointing, learning-rate decay and early stopping.

    Relies on notebook-level names: `train_data`, `dev_data`, `batch_iter`,
    `evaluate_ppl` and `model_save_path`.

    Every `valid_niter` iterations the dev perplexity is computed. A new best
    score saves the model and optimizer state. Otherwise patience is increased;
    when it reaches `max_patience` the best checkpoint is restored and the
    learning rate is multiplied by `lr_decay`. After `max_trial` such decays
    training stops early. Otherwise training runs for `max_epoch` full epochs.

    @param model (NMT): model to train; its output on a batch is negated and summed to form the loss
    @param optimizer (torch.optim.Optimizer): optimizer over the model's parameters
    @param clip_grad (float): maximum gradient norm used for clipping
    @param max_epoch (int): number of passes over `train_data`
    @param max_patience (int): consecutive non-improving validations tolerated before decaying the lr
    @param max_trial (int): number of lr decays after which training stops early
    @param lr_decay (float): multiplicative factor applied to the learning rate on decay
    @param train_batch_size (int): training batch size
    @param log_every (int): print a training report every this many iterations
    @param valid_niter (int): validate every this many iterations
    """
    print('Training begins...')

    # Running counters: `report_*` are reset at every log line, `cum_*` at every validation
    num_trial = 0
    train_iter = patience = 0
    cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()

    # put the model in training mode
    model.train()

    # iterate over the epochs
    for epoch in range(max_epoch):
        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):

            train_iter += 1
            optimizer.zero_grad()
            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size  # optimise the per-sentence average
            loss.backward()

            # Clip gradient, then update parameters
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            # print interim report about training
            if train_iter % log_every == 0:
                print('| Epoch %d, Iter %d| Avg Loss = %.2f| Avg. ppl = %.2f| Speed %.2f words/sec| Time %.2f min|' % (
                    epoch + 1, train_iter,
                    report_loss / report_examples,
                    math.exp(report_loss / report_tgt_words),
                    report_tgt_words / (time.time() - train_time),
                    (time.time() - begin_time) / 60.0))

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # validation
            if train_iter % valid_niter == 0:

                print('| <Train Summary> | Epoch %d, Iter %d| Cum. loss = %.2f| Cum. ppl = %.2f|' % (
                    epoch + 1, train_iter, cum_loss / cum_examples, np.exp(cum_loss / cum_tgt_words)))

                cum_loss = cum_examples = cum_tgt_words = 0.

                print('Report on validation set:', file=sys.stderr)

                # compute dev. ppl; negate it so that a higher metric is better
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('Validation:  Dev. ppl = %f' % (dev_ppl), file=sys.stderr)

                # learning rate scheduling
                is_better = (len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores))
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('Save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')

                elif patience < int(max_patience):
                    patience += 1
                    print('Hit patience %d' % patience, file=sys.stderr)

                    if patience == int(max_patience):
                        num_trial += 1
                        print('Hit #%d trial' % num_trial, file=sys.stderr)

                        if num_trial == int(max_trial):
                            print('early stop!', file=sys.stderr)
                            return

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(lr_decay)
                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                        # load model; load_state_dict copies the values into the existing
                        # parameters, so the model stays on the device it is already on
                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr (after restoring, which would otherwise overwrite it)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

    # Reported only once every epoch has run to completion, so the final epoch
    # is no longer cut short at its first validation.
    print('Training stopped <-> Reached maximum number of epochs!', file=sys.stderr)
    return

In [0]:
# Weight vector over the target vocabulary: 1 for every word, 0 for the <pad> token.
# NOTE(review): the earlier comment mentioned attention, but this tensor is indexed by
# target-vocabulary id — presumably intended as a per-class loss weight; confirm where it is used.
vocab_mask = torch.ones(len(vocab.tgt))
vocab_mask[vocab.tgt['<pad>']] = 0

In [27]:
# Settings for this run (the remaining train_model arguments keep their defaults)
max_epoch, train_batch_size = 30, 64

# Launch training; progress is printed below and the best checkpoint is written to disk
train_model(
    model,
    optimizer,
    max_epoch=max_epoch,
    train_batch_size=train_batch_size,
)

Training begins...
| Epoch 1, Iter 100| Avg Loss = 131.76| Avg. ppl = 1663.93| Speed 6365.70 words/sec| Time 0.30 min|
| Epoch 1, Iter 200| Avg Loss = 116.58| Avg. ppl = 719.71| Speed 6294.99 words/sec| Time 0.60 min|
| Epoch 1, Iter 300| Avg Loss = 108.04| Avg. ppl = 477.81| Speed 6449.24 words/sec| Time 0.89 min|
| Epoch 1, Iter 400| Avg Loss = 105.62| Avg. ppl = 382.42| Speed 6371.60 words/sec| Time 1.18 min|
| Epoch 1, Iter 500| Avg Loss = 103.62| Avg. ppl = 333.05| Speed 6440.80 words/sec| Time 1.48 min|
| Epoch 1, Iter 600| Avg Loss = 98.07| Avg. ppl = 266.42| Speed 6339.53 words/sec| Time 1.78 min|
| Epoch 1, Iter 700| Avg Loss = 94.69| Avg. ppl = 216.89| Speed 6358.89 words/sec| Time 2.07 min|
| Epoch 1, Iter 800| Avg Loss = 91.02| Avg. ppl = 179.22| Speed 6270.09 words/sec| Time 2.37 min|
| Epoch 1, Iter 900| Avg Loss = 88.90| Avg. ppl = 155.06| Speed 6418.97 words/sec| Time 2.66 min|
| Epoch 1, Iter 1000| Avg Loss = 85.37| Avg. ppl = 131.82| Speed 6432.22 words/sec| Time 2.95

Report on validation set:
Validation:  Dev. ppl = 131.942455
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 1, Iter 1100| Avg Loss = 83.70| Avg. ppl = 113.51| Speed 5175.07 words/sec| Time 3.32 min|
| Epoch 1, Iter 1200| Avg Loss = 81.31| Avg. ppl = 103.84| Speed 6339.46 words/sec| Time 3.61 min|
| Epoch 1, Iter 1300| Avg Loss = 81.51| Avg. ppl = 94.29| Speed 6522.23 words/sec| Time 3.90 min|
| Epoch 1, Iter 1400| Avg Loss = 78.87| Avg. ppl = 84.22| Speed 6412.45 words/sec| Time 4.20 min|
| Epoch 1, Iter 1500| Avg Loss = 76.81| Avg. ppl = 78.91| Speed 6318.65 words/sec| Time 4.50 min|
| Epoch 1, Iter 1600| Avg Loss = 75.55| Avg. ppl = 70.58| Speed 6502.78 words/sec| Time 4.79 min|
| Epoch 1, Iter 1700| Avg Loss = 74.88| Avg. ppl = 66.83| Speed 6335.77 words/sec| Time 5.09 min|
| Epoch 1, Iter 1800| Avg Loss = 72.58| Avg. ppl = 62.97| Speed 6316.53 words/sec| Time 5.38 min|
| Epoch 1, Iter 1900| Avg Loss = 71.52| Avg. ppl = 57.56| Speed 6368.53 words/sec| Time 5.68 min|
| Epoch 1, Iter 2000| Avg Loss = 71.52| Avg. ppl = 55.33| Speed 6500.88 words/sec| Time 5.97 min|
| <Train Summary> 

Report on validation set:
Validation:  Dev. ppl = 59.153163
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 1, Iter 2100| Avg Loss = 69.56| Avg. ppl = 52.27| Speed 5285.53 words/sec| Time 6.33 min|
| Epoch 1, Iter 2200| Avg Loss = 70.29| Avg. ppl = 50.81| Speed 6409.16 words/sec| Time 6.63 min|
| Epoch 1, Iter 2300| Avg Loss = 68.46| Avg. ppl = 47.32| Speed 6339.45 words/sec| Time 6.92 min|
| Epoch 1, Iter 2400| Avg Loss = 66.56| Avg. ppl = 44.61| Speed 6403.40 words/sec| Time 7.22 min|
| Epoch 1, Iter 2500| Avg Loss = 65.47| Avg. ppl = 42.04| Speed 6496.91 words/sec| Time 7.50 min|
| Epoch 1, Iter 2600| Avg Loss = 65.48| Avg. ppl = 40.42| Speed 6389.41 words/sec| Time 7.80 min|
| Epoch 1, Iter 2700| Avg Loss = 65.47| Avg. ppl = 40.19| Speed 6355.93 words/sec| Time 8.10 min|
| Epoch 1, Iter 2800| Avg Loss = 64.94| Avg. ppl = 38.89| Speed 6529.01 words/sec| Time 8.39 min|
| Epoch 1, Iter 2900| Avg Loss = 63.58| Avg. ppl = 36.92| Speed 6413.32 words/sec| Time 8.68 min|
| Epoch 1, Iter 3000| Avg Loss = 63.06| Avg. ppl = 35.34| Speed 6373.76 words/sec| Time 8.98 min|
| <Train Summary> | 

Report on validation set:
Validation:  Dev. ppl = 36.348401
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 1, Iter 3100| Avg Loss = 63.21| Avg. ppl = 35.34| Speed 5317.87 words/sec| Time 9.33 min|
| Epoch 1, Iter 3200| Avg Loss = 61.40| Avg. ppl = 32.78| Speed 6405.25 words/sec| Time 9.62 min|
| Epoch 1, Iter 3300| Avg Loss = 61.50| Avg. ppl = 32.57| Speed 6471.59 words/sec| Time 9.91 min|
| Epoch 2, Iter 3400| Avg Loss = 59.83| Avg. ppl = 29.69| Speed 6172.92 words/sec| Time 10.22 min|
| Epoch 2, Iter 3500| Avg Loss = 56.77| Avg. ppl = 24.77| Speed 6446.86 words/sec| Time 10.51 min|
| Epoch 2, Iter 3600| Avg Loss = 55.88| Avg. ppl = 23.55| Speed 6341.96 words/sec| Time 10.81 min|
| Epoch 2, Iter 3700| Avg Loss = 55.82| Avg. ppl = 23.71| Speed 6367.27 words/sec| Time 11.10 min|
| Epoch 2, Iter 3800| Avg Loss = 55.73| Avg. ppl = 23.58| Speed 6322.57 words/sec| Time 11.40 min|
| Epoch 2, Iter 3900| Avg Loss = 55.22| Avg. ppl = 23.01| Speed 6329.86 words/sec| Time 11.70 min|
| Epoch 2, Iter 4000| Avg Loss = 54.88| Avg. ppl = 22.76| Speed 6361.24 words/sec| Time 11.99 min|
| <Train Summ

Report on validation set:
Validation:  Dev. ppl = 30.443960
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 2, Iter 4100| Avg Loss = 54.36| Avg. ppl = 21.71| Speed 5258.12 words/sec| Time 12.35 min|
| Epoch 2, Iter 4200| Avg Loss = 54.69| Avg. ppl = 21.94| Speed 6226.70 words/sec| Time 12.65 min|
| Epoch 2, Iter 4300| Avg Loss = 54.72| Avg. ppl = 21.66| Speed 6309.30 words/sec| Time 12.96 min|
| Epoch 2, Iter 4400| Avg Loss = 53.64| Avg. ppl = 21.21| Speed 6354.49 words/sec| Time 13.25 min|
| Epoch 2, Iter 4500| Avg Loss = 55.03| Avg. ppl = 21.50| Speed 6369.87 words/sec| Time 13.55 min|
| Epoch 2, Iter 4600| Avg Loss = 53.57| Avg. ppl = 21.06| Speed 6355.45 words/sec| Time 13.85 min|
| Epoch 2, Iter 4700| Avg Loss = 52.57| Avg. ppl = 19.77| Speed 6408.50 words/sec| Time 14.14 min|
| Epoch 2, Iter 4800| Avg Loss = 53.16| Avg. ppl = 20.37| Speed 6388.99 words/sec| Time 14.43 min|
| Epoch 2, Iter 4900| Avg Loss = 53.51| Avg. ppl = 19.95| Speed 6403.84 words/sec| Time 14.73 min|
| Epoch 2, Iter 5000| Avg Loss = 53.17| Avg. ppl = 19.98| Speed 6340.88 words/sec| Time 15.03 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 25.710404
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 2, Iter 5100| Avg Loss = 52.10| Avg. ppl = 19.93| Speed 5325.16 words/sec| Time 15.38 min|
| Epoch 2, Iter 5200| Avg Loss = 51.89| Avg. ppl = 19.48| Speed 6375.90 words/sec| Time 15.67 min|
| Epoch 2, Iter 5300| Avg Loss = 51.48| Avg. ppl = 18.76| Speed 6345.78 words/sec| Time 15.97 min|
| Epoch 2, Iter 5400| Avg Loss = 52.58| Avg. ppl = 19.07| Speed 6404.67 words/sec| Time 16.26 min|
| Epoch 2, Iter 5500| Avg Loss = 51.87| Avg. ppl = 18.56| Speed 6465.69 words/sec| Time 16.56 min|
| Epoch 2, Iter 5600| Avg Loss = 51.64| Avg. ppl = 18.63| Speed 6355.96 words/sec| Time 16.85 min|
| Epoch 2, Iter 5700| Avg Loss = 51.22| Avg. ppl = 18.38| Speed 6479.85 words/sec| Time 17.14 min|
| Epoch 2, Iter 5800| Avg Loss = 51.11| Avg. ppl = 18.06| Speed 6276.02 words/sec| Time 17.44 min|
| Epoch 2, Iter 5900| Avg Loss = 51.10| Avg. ppl = 18.58| Speed 6428.42 words/sec| Time 17.73 min|
| Epoch 2, Iter 6000| Avg Loss = 51.31| Avg. ppl = 17.67| Speed 6357.69 words/sec| Time 18.03 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 23.047655
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 2, Iter 6100| Avg Loss = 51.35| Avg. ppl = 17.79| Speed 5416.21 words/sec| Time 18.38 min|
| Epoch 2, Iter 6200| Avg Loss = 50.67| Avg. ppl = 17.60| Speed 6348.74 words/sec| Time 18.68 min|
| Epoch 2, Iter 6300| Avg Loss = 49.99| Avg. ppl = 17.28| Speed 6149.87 words/sec| Time 18.98 min|
| Epoch 2, Iter 6400| Avg Loss = 50.59| Avg. ppl = 17.41| Speed 6411.18 words/sec| Time 19.28 min|
| Epoch 2, Iter 6500| Avg Loss = 50.64| Avg. ppl = 17.25| Speed 6423.20 words/sec| Time 19.57 min|
| Epoch 2, Iter 6600| Avg Loss = 49.98| Avg. ppl = 16.99| Speed 6450.06 words/sec| Time 19.87 min|
| Epoch 2, Iter 6700| Avg Loss = 50.15| Avg. ppl = 16.84| Speed 6443.41 words/sec| Time 20.16 min|
| Epoch 3, Iter 6800| Avg Loss = 47.50| Avg. ppl = 14.97| Speed 6311.32 words/sec| Time 20.46 min|
| Epoch 3, Iter 6900| Avg Loss = 44.60| Avg. ppl = 12.37| Speed 6472.18 words/sec| Time 20.75 min|
| Epoch 3, Iter 7000| Avg Loss = 44.89| Avg. ppl = 12.32| Speed 6480.36 words/sec| Time 21.04 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 21.919477
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 3, Iter 7100| Avg Loss = 44.14| Avg. ppl = 12.27| Speed 5260.15 words/sec| Time 21.40 min|
| Epoch 3, Iter 7200| Avg Loss = 44.80| Avg. ppl = 12.66| Speed 6271.03 words/sec| Time 21.70 min|
| Epoch 3, Iter 7300| Avg Loss = 44.87| Avg. ppl = 12.48| Speed 6381.03 words/sec| Time 22.00 min|
| Epoch 3, Iter 7400| Avg Loss = 44.73| Avg. ppl = 12.61| Speed 6326.54 words/sec| Time 22.29 min|
| Epoch 3, Iter 7500| Avg Loss = 44.64| Avg. ppl = 12.39| Speed 6357.76 words/sec| Time 22.59 min|
| Epoch 3, Iter 7600| Avg Loss = 43.87| Avg. ppl = 12.29| Speed 6425.00 words/sec| Time 22.88 min|
| Epoch 3, Iter 7700| Avg Loss = 44.76| Avg. ppl = 12.29| Speed 6437.82 words/sec| Time 23.18 min|
| Epoch 3, Iter 7800| Avg Loss = 43.90| Avg. ppl = 12.15| Speed 6340.73 words/sec| Time 23.47 min|
| Epoch 3, Iter 7900| Avg Loss = 44.21| Avg. ppl = 12.31| Speed 6216.72 words/sec| Time 23.78 min|
| Epoch 3, Iter 8000| Avg Loss = 44.11| Avg. ppl = 12.30| Speed 6325.54 words/sec| Time 24.07 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 20.620818
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 3, Iter 8100| Avg Loss = 44.29| Avg. ppl = 12.29| Speed 5294.59 words/sec| Time 24.43 min|
| Epoch 3, Iter 8200| Avg Loss = 44.66| Avg. ppl = 12.56| Speed 6214.30 words/sec| Time 24.73 min|
| Epoch 3, Iter 8300| Avg Loss = 44.39| Avg. ppl = 12.42| Speed 6138.13 words/sec| Time 25.04 min|
| Epoch 3, Iter 8400| Avg Loss = 44.57| Avg. ppl = 12.31| Speed 6414.55 words/sec| Time 25.33 min|
| Epoch 3, Iter 8500| Avg Loss = 44.71| Avg. ppl = 12.46| Speed 6243.35 words/sec| Time 25.64 min|
| Epoch 3, Iter 8600| Avg Loss = 43.48| Avg. ppl = 12.09| Speed 6373.77 words/sec| Time 25.93 min|
| Epoch 3, Iter 8700| Avg Loss = 44.29| Avg. ppl = 12.21| Speed 6316.10 words/sec| Time 26.23 min|
| Epoch 3, Iter 8800| Avg Loss = 44.57| Avg. ppl = 12.23| Speed 6476.40 words/sec| Time 26.52 min|
| Epoch 3, Iter 8900| Avg Loss = 45.04| Avg. ppl = 12.47| Speed 6268.86 words/sec| Time 26.82 min|
| Epoch 3, Iter 9000| Avg Loss = 44.10| Avg. ppl = 12.19| Speed 6516.32 words/sec| Time 27.11 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 19.123942
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 3, Iter 9100| Avg Loss = 44.30| Avg. ppl = 12.42| Speed 5325.18 words/sec| Time 27.46 min|
| Epoch 3, Iter 9200| Avg Loss = 44.09| Avg. ppl = 12.26| Speed 6274.90 words/sec| Time 27.76 min|
| Epoch 3, Iter 9300| Avg Loss = 44.49| Avg. ppl = 12.18| Speed 6343.04 words/sec| Time 28.06 min|
| Epoch 3, Iter 9400| Avg Loss = 44.35| Avg. ppl = 12.25| Speed 6404.62 words/sec| Time 28.36 min|
| Epoch 3, Iter 9500| Avg Loss = 43.80| Avg. ppl = 11.83| Speed 6326.38 words/sec| Time 28.66 min|
| Epoch 3, Iter 9600| Avg Loss = 44.39| Avg. ppl = 12.08| Speed 6293.19 words/sec| Time 28.96 min|
| Epoch 3, Iter 9700| Avg Loss = 43.82| Avg. ppl = 12.17| Speed 6250.46 words/sec| Time 29.26 min|
| Epoch 3, Iter 9800| Avg Loss = 43.96| Avg. ppl = 12.14| Speed 6349.68 words/sec| Time 29.55 min|
| Epoch 3, Iter 9900| Avg Loss = 43.39| Avg. ppl = 12.01| Speed 6277.00 words/sec| Time 29.85 min|
| Epoch 3, Iter 10000| Avg Loss = 43.49| Avg. ppl = 12.16| Speed 6295.43 words/sec| Time 30.14 min|
| <Train 

Report on validation set:
Validation:  Dev. ppl = 18.360140
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 3, Iter 10100| Avg Loss = 44.49| Avg. ppl = 11.87| Speed 5361.26 words/sec| Time 30.50 min|
| Epoch 4, Iter 10200| Avg Loss = 41.46| Avg. ppl = 10.31| Speed 6364.47 words/sec| Time 30.80 min|
| Epoch 4, Iter 10300| Avg Loss = 38.02| Avg. ppl = 8.73| Speed 6307.30 words/sec| Time 31.10 min|
| Epoch 4, Iter 10400| Avg Loss = 38.94| Avg. ppl = 9.05| Speed 6385.54 words/sec| Time 31.39 min|
| Epoch 4, Iter 10500| Avg Loss = 39.17| Avg. ppl = 8.95| Speed 6326.52 words/sec| Time 31.69 min|
| Epoch 4, Iter 10600| Avg Loss = 39.03| Avg. ppl = 8.97| Speed 6463.83 words/sec| Time 31.99 min|
| Epoch 4, Iter 10700| Avg Loss = 38.83| Avg. ppl = 9.19| Speed 6295.59 words/sec| Time 32.28 min|
| Epoch 4, Iter 10800| Avg Loss = 39.22| Avg. ppl = 9.17| Speed 6378.88 words/sec| Time 32.58 min|
| Epoch 4, Iter 10900| Avg Loss = 38.80| Avg. ppl = 9.15| Speed 6369.83 words/sec| Time 32.87 min|
| Epoch 4, Iter 11000| Avg Loss = 39.69| Avg. ppl = 9.17| Speed 6393.88 words/sec| Time 33.17 min|
| <Train

Report on validation set:
Validation:  Dev. ppl = 18.333865
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 4, Iter 11100| Avg Loss = 39.10| Avg. ppl = 9.18| Speed 5290.14 words/sec| Time 33.53 min|
| Epoch 4, Iter 11200| Avg Loss = 40.04| Avg. ppl = 9.49| Speed 6360.48 words/sec| Time 33.83 min|
| Epoch 4, Iter 11300| Avg Loss = 39.54| Avg. ppl = 9.43| Speed 6355.68 words/sec| Time 34.12 min|
| Epoch 4, Iter 11400| Avg Loss = 40.70| Avg. ppl = 9.59| Speed 6418.22 words/sec| Time 34.42 min|
| Epoch 4, Iter 11500| Avg Loss = 39.72| Avg. ppl = 9.47| Speed 6436.89 words/sec| Time 34.71 min|
| Epoch 4, Iter 11600| Avg Loss = 39.53| Avg. ppl = 9.54| Speed 6230.33 words/sec| Time 35.01 min|
| Epoch 4, Iter 11700| Avg Loss = 39.68| Avg. ppl = 9.52| Speed 6467.96 words/sec| Time 35.30 min|
| Epoch 4, Iter 11800| Avg Loss = 39.89| Avg. ppl = 9.57| Speed 6445.10 words/sec| Time 35.60 min|
| Epoch 4, Iter 11900| Avg Loss = 38.83| Avg. ppl = 9.35| Speed 6216.95 words/sec| Time 35.89 min|
| Epoch 4, Iter 12000| Avg Loss = 39.65| Avg. ppl = 9.33| Speed 6450.52 words/sec| Time 36.19 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 18.013487
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 4, Iter 12100| Avg Loss = 39.68| Avg. ppl = 9.68| Speed 5313.52 words/sec| Time 36.54 min|
| Epoch 4, Iter 12200| Avg Loss = 39.54| Avg. ppl = 9.42| Speed 6424.47 words/sec| Time 36.83 min|
| Epoch 4, Iter 12300| Avg Loss = 40.37| Avg. ppl = 9.56| Speed 6415.48 words/sec| Time 37.13 min|
| Epoch 4, Iter 12400| Avg Loss = 39.63| Avg. ppl = 9.58| Speed 6271.83 words/sec| Time 37.43 min|
| Epoch 4, Iter 12500| Avg Loss = 40.32| Avg. ppl = 9.62| Speed 6238.13 words/sec| Time 37.73 min|
| Epoch 4, Iter 12600| Avg Loss = 40.07| Avg. ppl = 9.60| Speed 6373.05 words/sec| Time 38.03 min|
| Epoch 4, Iter 12700| Avg Loss = 39.99| Avg. ppl = 9.62| Speed 6406.52 words/sec| Time 38.32 min|
| Epoch 4, Iter 12800| Avg Loss = 39.35| Avg. ppl = 9.61| Speed 6401.37 words/sec| Time 38.61 min|
| Epoch 4, Iter 12900| Avg Loss = 39.62| Avg. ppl = 9.54| Speed 6330.96 words/sec| Time 38.91 min|
| Epoch 4, Iter 13000| Avg Loss = 39.85| Avg. ppl = 9.63| Speed 6361.35 words/sec| Time 39.20 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 18.120077
Hit patience 1


| Epoch 4, Iter 13100| Avg Loss = 39.70| Avg. ppl = 9.73| Speed 6124.99 words/sec| Time 39.51 min|
| Epoch 4, Iter 13200| Avg Loss = 39.66| Avg. ppl = 9.50| Speed 6372.54 words/sec| Time 39.80 min|
| Epoch 4, Iter 13300| Avg Loss = 40.78| Avg. ppl = 9.61| Speed 6480.29 words/sec| Time 40.10 min|
| Epoch 4, Iter 13400| Avg Loss = 39.97| Avg. ppl = 9.60| Speed 6304.14 words/sec| Time 40.40 min|
| Epoch 4, Iter 13500| Avg Loss = 41.20| Avg. ppl = 9.93| Speed 6399.75 words/sec| Time 40.70 min|
| Epoch 5, Iter 13600| Avg Loss = 36.36| Avg. ppl = 8.00| Speed 6295.38 words/sec| Time 40.99 min|
| Epoch 5, Iter 13700| Avg Loss = 34.61| Avg. ppl = 7.14| Speed 6263.54 words/sec| Time 41.29 min|
| Epoch 5, Iter 13800| Avg Loss = 35.15| Avg. ppl = 7.36| Speed 6390.94 words/sec| Time 41.59 min|
| Epoch 5, Iter 13900| Avg Loss = 35.18| Avg. ppl = 7.34| Speed 6368.68 words/sec| Time 41.88 min|
| Epoch 5, Iter 14000| Avg Loss = 35.80| Avg. ppl = 7.46| Speed 6404.76 words/sec| Time 42.18 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 17.857683
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 5, Iter 14100| Avg Loss = 36.16| Avg. ppl = 7.58| Speed 5407.03 words/sec| Time 42.53 min|
| Epoch 5, Iter 14200| Avg Loss = 35.29| Avg. ppl = 7.56| Speed 6251.25 words/sec| Time 42.83 min|
| Epoch 5, Iter 14300| Avg Loss = 35.59| Avg. ppl = 7.50| Speed 6415.14 words/sec| Time 43.12 min|
| Epoch 5, Iter 14400| Avg Loss = 35.91| Avg. ppl = 7.67| Speed 6315.81 words/sec| Time 43.42 min|
| Epoch 5, Iter 14500| Avg Loss = 36.04| Avg. ppl = 7.78| Speed 6338.66 words/sec| Time 43.71 min|
| Epoch 5, Iter 14600| Avg Loss = 36.56| Avg. ppl = 7.65| Speed 6528.91 words/sec| Time 44.01 min|
| Epoch 5, Iter 14700| Avg Loss = 36.36| Avg. ppl = 7.72| Speed 6474.60 words/sec| Time 44.30 min|
| Epoch 5, Iter 14800| Avg Loss = 35.81| Avg. ppl = 7.68| Speed 6389.50 words/sec| Time 44.59 min|
| Epoch 5, Iter 14900| Avg Loss = 35.58| Avg. ppl = 7.68| Speed 6297.61 words/sec| Time 44.89 min|
| Epoch 5, Iter 15000| Avg Loss = 35.97| Avg. ppl = 7.79| Speed 6377.94 words/sec| Time 45.18 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 17.946360
Hit patience 1


| Epoch 5, Iter 15100| Avg Loss = 36.62| Avg. ppl = 7.96| Speed 6092.25 words/sec| Time 45.49 min|
| Epoch 5, Iter 15200| Avg Loss = 36.51| Avg. ppl = 8.04| Speed 6367.73 words/sec| Time 45.79 min|
| Epoch 5, Iter 15300| Avg Loss = 36.87| Avg. ppl = 7.93| Speed 6393.76 words/sec| Time 46.08 min|
| Epoch 5, Iter 15400| Avg Loss = 36.35| Avg. ppl = 7.91| Speed 6420.52 words/sec| Time 46.37 min|
| Epoch 5, Iter 15500| Avg Loss = 36.20| Avg. ppl = 7.88| Speed 6344.45 words/sec| Time 46.67 min|
| Epoch 5, Iter 15600| Avg Loss = 36.95| Avg. ppl = 8.06| Speed 6318.29 words/sec| Time 46.97 min|
| Epoch 5, Iter 15700| Avg Loss = 36.85| Avg. ppl = 8.05| Speed 6304.39 words/sec| Time 47.27 min|
| Epoch 5, Iter 15800| Avg Loss = 36.70| Avg. ppl = 8.02| Speed 6312.49 words/sec| Time 47.57 min|
| Epoch 5, Iter 15900| Avg Loss = 37.13| Avg. ppl = 8.00| Speed 6359.80 words/sec| Time 47.86 min|
| Epoch 5, Iter 16000| Avg Loss = 37.13| Avg. ppl = 8.13| Speed 6375.43 words/sec| Time 48.16 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 17.396996
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 5, Iter 16100| Avg Loss = 36.89| Avg. ppl = 8.01| Speed 5347.40 words/sec| Time 48.51 min|
| Epoch 5, Iter 16200| Avg Loss = 37.12| Avg. ppl = 8.18| Speed 6335.74 words/sec| Time 48.81 min|
| Epoch 5, Iter 16300| Avg Loss = 37.28| Avg. ppl = 8.18| Speed 6424.82 words/sec| Time 49.11 min|
| Epoch 5, Iter 16400| Avg Loss = 37.81| Avg. ppl = 8.22| Speed 6392.22 words/sec| Time 49.41 min|
| Epoch 5, Iter 16500| Avg Loss = 37.24| Avg. ppl = 8.11| Speed 6371.99 words/sec| Time 49.70 min|
| Epoch 5, Iter 16600| Avg Loss = 37.15| Avg. ppl = 8.20| Speed 6390.37 words/sec| Time 50.00 min|
| Epoch 5, Iter 16700| Avg Loss = 37.57| Avg. ppl = 8.25| Speed 6257.02 words/sec| Time 50.30 min|
| Epoch 5, Iter 16800| Avg Loss = 36.78| Avg. ppl = 8.29| Speed 6377.29 words/sec| Time 50.59 min|
| Epoch 5, Iter 16900| Avg Loss = 37.39| Avg. ppl = 8.28| Speed 6367.68 words/sec| Time 50.89 min|
| Epoch 6, Iter 17000| Avg Loss = 33.47| Avg. ppl = 6.67| Speed 6353.89 words/sec| Time 51.18 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 17.833943
Hit patience 1


| Epoch 6, Iter 17100| Avg Loss = 32.41| Avg. ppl = 6.25| Speed 6161.82 words/sec| Time 51.49 min|
| Epoch 6, Iter 17200| Avg Loss = 32.28| Avg. ppl = 6.22| Speed 6352.89 words/sec| Time 51.79 min|
| Epoch 6, Iter 17300| Avg Loss = 32.86| Avg. ppl = 6.36| Speed 6492.31 words/sec| Time 52.08 min|
| Epoch 6, Iter 17400| Avg Loss = 33.05| Avg. ppl = 6.41| Speed 6358.62 words/sec| Time 52.38 min|
| Epoch 6, Iter 17500| Avg Loss = 33.24| Avg. ppl = 6.44| Speed 6344.51 words/sec| Time 52.68 min|
| Epoch 6, Iter 17600| Avg Loss = 32.75| Avg. ppl = 6.45| Speed 6326.18 words/sec| Time 52.97 min|
| Epoch 6, Iter 17700| Avg Loss = 32.68| Avg. ppl = 6.49| Speed 6176.67 words/sec| Time 53.28 min|
| Epoch 6, Iter 17800| Avg Loss = 33.23| Avg. ppl = 6.56| Speed 6470.75 words/sec| Time 53.57 min|
| Epoch 6, Iter 17900| Avg Loss = 33.67| Avg. ppl = 6.60| Speed 6324.61 words/sec| Time 53.87 min|
| Epoch 6, Iter 18000| Avg Loss = 33.78| Avg. ppl = 6.71| Speed 6411.28 words/sec| Time 54.16 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 17.812225
Hit patience 2


| Epoch 6, Iter 18100| Avg Loss = 33.47| Avg. ppl = 6.68| Speed 6085.35 words/sec| Time 54.47 min|
| Epoch 6, Iter 18200| Avg Loss = 33.71| Avg. ppl = 6.63| Speed 6315.42 words/sec| Time 54.77 min|
| Epoch 6, Iter 18300| Avg Loss = 34.25| Avg. ppl = 6.82| Speed 6260.93 words/sec| Time 55.08 min|
| Epoch 6, Iter 18400| Avg Loss = 34.47| Avg. ppl = 6.83| Speed 6489.56 words/sec| Time 55.37 min|
| Epoch 6, Iter 18500| Avg Loss = 33.94| Avg. ppl = 6.77| Speed 6416.90 words/sec| Time 55.67 min|
| Epoch 6, Iter 18600| Avg Loss = 34.01| Avg. ppl = 6.98| Speed 6345.21 words/sec| Time 55.96 min|
| Epoch 6, Iter 18700| Avg Loss = 33.74| Avg. ppl = 6.81| Speed 6304.29 words/sec| Time 56.26 min|
| Epoch 6, Iter 18800| Avg Loss = 34.52| Avg. ppl = 6.93| Speed 6345.06 words/sec| Time 56.56 min|
| Epoch 6, Iter 18900| Avg Loss = 33.98| Avg. ppl = 6.96| Speed 6313.45 words/sec| Time 56.85 min|
| Epoch 6, Iter 19000| Avg Loss = 33.55| Avg. ppl = 6.82| Speed 6308.69 words/sec| Time 57.15 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 17.880647
Hit patience 3
Hit #1 trial
load previously best model and decay learning rate to 0.000500
restore parameters of the optimizers


| Epoch 6, Iter 19100| Avg Loss = 32.94| Avg. ppl = 6.36| Speed 5893.62 words/sec| Time 57.47 min|
| Epoch 6, Iter 19200| Avg Loss = 32.01| Avg. ppl = 6.25| Speed 6347.00 words/sec| Time 57.77 min|
| Epoch 6, Iter 19300| Avg Loss = 31.91| Avg. ppl = 6.20| Speed 6293.20 words/sec| Time 58.06 min|
| Epoch 6, Iter 19400| Avg Loss = 32.38| Avg. ppl = 6.24| Speed 6317.56 words/sec| Time 58.36 min|
| Epoch 6, Iter 19500| Avg Loss = 32.50| Avg. ppl = 6.37| Speed 6334.93 words/sec| Time 58.66 min|
| Epoch 6, Iter 19600| Avg Loss = 32.19| Avg. ppl = 6.24| Speed 6270.38 words/sec| Time 58.96 min|
| Epoch 6, Iter 19700| Avg Loss = 32.78| Avg. ppl = 6.29| Speed 6368.50 words/sec| Time 59.25 min|
| Epoch 6, Iter 19800| Avg Loss = 32.21| Avg. ppl = 6.15| Speed 6366.16 words/sec| Time 59.55 min|
| Epoch 6, Iter 19900| Avg Loss = 32.01| Avg. ppl = 6.18| Speed 6388.01 words/sec| Time 59.84 min|
| Epoch 6, Iter 20000| Avg Loss = 32.57| Avg. ppl = 6.32| Speed 6447.76 words/sec| Time 60.14 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 17.379159
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 6, Iter 20100| Avg Loss = 32.29| Avg. ppl = 6.24| Speed 5207.45 words/sec| Time 60.50 min|
| Epoch 6, Iter 20200| Avg Loss = 32.45| Avg. ppl = 6.27| Speed 6464.47 words/sec| Time 60.79 min|
| Epoch 6, Iter 20300| Avg Loss = 32.15| Avg. ppl = 6.22| Speed 6441.13 words/sec| Time 61.08 min|
| Epoch 7, Iter 20400| Avg Loss = 30.99| Avg. ppl = 5.71| Speed 6287.20 words/sec| Time 61.38 min|
| Epoch 7, Iter 20500| Avg Loss = 30.91| Avg. ppl = 5.81| Speed 6423.14 words/sec| Time 61.67 min|
| Epoch 7, Iter 20600| Avg Loss = 30.89| Avg. ppl = 5.70| Speed 6476.13 words/sec| Time 61.97 min|
| Epoch 7, Iter 20700| Avg Loss = 31.14| Avg. ppl = 5.82| Speed 6383.91 words/sec| Time 62.26 min|
| Epoch 7, Iter 20800| Avg Loss = 31.02| Avg. ppl = 5.86| Speed 6406.89 words/sec| Time 62.55 min|
| Epoch 7, Iter 20900| Avg Loss = 31.91| Avg. ppl = 5.99| Speed 6420.00 words/sec| Time 62.85 min|
| Epoch 7, Iter 21000| Avg Loss = 31.27| Avg. ppl = 5.92| Speed 6303.12 words/sec| Time 63.15 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 17.775141
Hit patience 1


| Epoch 7, Iter 21100| Avg Loss = 31.80| Avg. ppl = 5.99| Speed 6068.86 words/sec| Time 63.46 min|
| Epoch 7, Iter 21200| Avg Loss = 31.16| Avg. ppl = 5.88| Speed 6310.27 words/sec| Time 63.76 min|
| Epoch 7, Iter 21300| Avg Loss = 31.64| Avg. ppl = 5.89| Speed 6369.22 words/sec| Time 64.06 min|
| Epoch 7, Iter 21400| Avg Loss = 31.12| Avg. ppl = 5.86| Speed 6319.26 words/sec| Time 64.35 min|
| Epoch 7, Iter 21500| Avg Loss = 31.36| Avg. ppl = 5.87| Speed 6320.21 words/sec| Time 64.65 min|
| Epoch 7, Iter 21600| Avg Loss = 31.57| Avg. ppl = 6.04| Speed 6298.78 words/sec| Time 64.95 min|
| Epoch 7, Iter 21700| Avg Loss = 31.90| Avg. ppl = 6.08| Speed 6493.07 words/sec| Time 65.24 min|
| Epoch 7, Iter 21800| Avg Loss = 32.01| Avg. ppl = 6.07| Speed 6341.20 words/sec| Time 65.54 min|
| Epoch 7, Iter 21900| Avg Loss = 31.72| Avg. ppl = 6.00| Speed 6341.76 words/sec| Time 65.84 min|
| Epoch 7, Iter 22000| Avg Loss = 32.08| Avg. ppl = 6.05| Speed 6451.92 words/sec| Time 66.13 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 17.731136
Hit patience 2


| Epoch 7, Iter 22100| Avg Loss = 31.53| Avg. ppl = 5.96| Speed 6159.05 words/sec| Time 66.44 min|
| Epoch 7, Iter 22200| Avg Loss = 31.94| Avg. ppl = 6.18| Speed 6332.37 words/sec| Time 66.73 min|
| Epoch 7, Iter 22300| Avg Loss = 31.89| Avg. ppl = 6.07| Speed 6311.93 words/sec| Time 67.03 min|
| Epoch 7, Iter 22400| Avg Loss = 32.04| Avg. ppl = 6.14| Speed 6389.47 words/sec| Time 67.32 min|
| Epoch 7, Iter 22500| Avg Loss = 31.95| Avg. ppl = 6.17| Speed 6365.63 words/sec| Time 67.62 min|
| Epoch 7, Iter 22600| Avg Loss = 31.94| Avg. ppl = 6.16| Speed 6230.21 words/sec| Time 67.92 min|
| Epoch 7, Iter 22700| Avg Loss = 32.07| Avg. ppl = 6.10| Speed 6386.30 words/sec| Time 68.22 min|
| Epoch 7, Iter 22800| Avg Loss = 32.11| Avg. ppl = 6.14| Speed 6434.30 words/sec| Time 68.51 min|
| Epoch 7, Iter 22900| Avg Loss = 32.35| Avg. ppl = 6.18| Speed 6494.78 words/sec| Time 68.80 min|
| Epoch 7, Iter 23000| Avg Loss = 31.60| Avg. ppl = 6.11| Speed 6253.05 words/sec| Time 69.10 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 17.158697
Save currently the best model to [NMT_LSTM_seq2seq_one_layer]
save model parameters to [NMT_LSTM_seq2seq_one_layer]


| Epoch 7, Iter 23100| Avg Loss = 32.43| Avg. ppl = 6.20| Speed 5199.81 words/sec| Time 69.46 min|
| Epoch 7, Iter 23200| Avg Loss = 32.03| Avg. ppl = 6.12| Speed 6246.19 words/sec| Time 69.77 min|
| Epoch 7, Iter 23300| Avg Loss = 32.85| Avg. ppl = 6.35| Speed 6381.77 words/sec| Time 70.06 min|
| Epoch 7, Iter 23400| Avg Loss = 31.80| Avg. ppl = 6.16| Speed 6342.50 words/sec| Time 70.36 min|
| Epoch 7, Iter 23500| Avg Loss = 32.42| Avg. ppl = 6.27| Speed 6379.11 words/sec| Time 70.65 min|
| Epoch 7, Iter 23600| Avg Loss = 32.21| Avg. ppl = 6.21| Speed 6377.29 words/sec| Time 70.95 min|
| Epoch 8, Iter 23700| Avg Loss = 32.16| Avg. ppl = 6.14| Speed 6387.10 words/sec| Time 71.24 min|
| Epoch 8, Iter 23800| Avg Loss = 28.36| Avg. ppl = 4.97| Speed 6127.04 words/sec| Time 71.55 min|
| Epoch 8, Iter 23900| Avg Loss = 28.81| Avg. ppl = 5.06| Speed 6394.88 words/sec| Time 71.85 min|
| Epoch 8, Iter 24000| Avg Loss = 28.85| Avg. ppl = 5.12| Speed 6350.42 words/sec| Time 72.14 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 17.783496
Hit patience 1


| Epoch 8, Iter 24100| Avg Loss = 28.67| Avg. ppl = 5.10| Speed 6124.08 words/sec| Time 72.45 min|
| Epoch 8, Iter 24200| Avg Loss = 29.22| Avg. ppl = 5.22| Speed 6284.25 words/sec| Time 72.75 min|
| Epoch 8, Iter 24300| Avg Loss = 29.35| Avg. ppl = 5.30| Speed 6328.17 words/sec| Time 73.05 min|
| Epoch 8, Iter 24400| Avg Loss = 29.26| Avg. ppl = 5.24| Speed 6312.44 words/sec| Time 73.34 min|
| Epoch 8, Iter 24500| Avg Loss = 29.13| Avg. ppl = 5.22| Speed 6354.35 words/sec| Time 73.64 min|
| Epoch 8, Iter 24600| Avg Loss = 29.92| Avg. ppl = 5.37| Speed 6359.65 words/sec| Time 73.94 min|
| Epoch 8, Iter 24700| Avg Loss = 29.43| Avg. ppl = 5.30| Speed 6336.34 words/sec| Time 74.24 min|
| Epoch 8, Iter 24800| Avg Loss = 29.38| Avg. ppl = 5.24| Speed 6405.39 words/sec| Time 74.53 min|
| Epoch 8, Iter 24900| Avg Loss = 29.64| Avg. ppl = 5.39| Speed 6362.59 words/sec| Time 74.83 min|
| Epoch 8, Iter 25000| Avg Loss = 30.34| Avg. ppl = 5.48| Speed 6437.89 words/sec| Time 75.12 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 18.061083
Hit patience 2


| Epoch 8, Iter 25100| Avg Loss = 30.26| Avg. ppl = 5.46| Speed 6192.69 words/sec| Time 75.43 min|
| Epoch 8, Iter 25200| Avg Loss = 29.69| Avg. ppl = 5.45| Speed 6358.62 words/sec| Time 75.72 min|
| Epoch 8, Iter 25300| Avg Loss = 29.48| Avg. ppl = 5.35| Speed 6266.80 words/sec| Time 76.02 min|
| Epoch 8, Iter 25400| Avg Loss = 29.89| Avg. ppl = 5.37| Speed 6424.49 words/sec| Time 76.32 min|
| Epoch 8, Iter 25500| Avg Loss = 29.47| Avg. ppl = 5.47| Speed 6188.92 words/sec| Time 76.62 min|
| Epoch 8, Iter 25600| Avg Loss = 30.06| Avg. ppl = 5.47| Speed 6204.17 words/sec| Time 76.92 min|
| Epoch 8, Iter 25700| Avg Loss = 30.40| Avg. ppl = 5.49| Speed 6341.51 words/sec| Time 77.22 min|
| Epoch 8, Iter 25800| Avg Loss = 29.93| Avg. ppl = 5.49| Speed 6391.33 words/sec| Time 77.51 min|
| Epoch 8, Iter 25900| Avg Loss = 30.05| Avg. ppl = 5.53| Speed 6206.72 words/sec| Time 77.82 min|
| Epoch 8, Iter 26000| Avg Loss = 30.44| Avg. ppl = 5.54| Speed 6371.79 words/sec| Time 78.11 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 18.005420
Hit patience 3
Hit #2 trial
load previously best model and decay learning rate to 0.000250
restore parameters of the optimizers


| Epoch 8, Iter 26100| Avg Loss = 29.12| Avg. ppl = 5.16| Speed 5858.17 words/sec| Time 78.44 min|
| Epoch 8, Iter 26200| Avg Loss = 29.02| Avg. ppl = 5.14| Speed 6370.67 words/sec| Time 78.73 min|
| Epoch 8, Iter 26300| Avg Loss = 28.85| Avg. ppl = 5.15| Speed 6336.25 words/sec| Time 79.03 min|
| Epoch 8, Iter 26400| Avg Loss = 28.93| Avg. ppl = 5.17| Speed 6402.48 words/sec| Time 79.32 min|
| Epoch 8, Iter 26500| Avg Loss = 28.71| Avg. ppl = 5.11| Speed 6264.21 words/sec| Time 79.62 min|
| Epoch 8, Iter 26600| Avg Loss = 28.82| Avg. ppl = 5.12| Speed 6368.22 words/sec| Time 79.92 min|
| Epoch 8, Iter 26700| Avg Loss = 29.09| Avg. ppl = 5.17| Speed 6362.36 words/sec| Time 80.22 min|
| Epoch 8, Iter 26800| Avg Loss = 28.86| Avg. ppl = 5.20| Speed 6305.41 words/sec| Time 80.51 min|
| Epoch 8, Iter 26900| Avg Loss = 29.30| Avg. ppl = 5.21| Speed 6336.68 words/sec| Time 80.81 min|
| Epoch 8, Iter 27000| Avg Loss = 29.09| Avg. ppl = 5.16| Speed 6408.95 words/sec| Time 81.11 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 17.412656
Hit patience 1


| Epoch 9, Iter 27100| Avg Loss = 28.90| Avg. ppl = 5.06| Speed 6205.27 words/sec| Time 81.41 min|
| Epoch 9, Iter 27200| Avg Loss = 28.26| Avg. ppl = 4.94| Speed 6360.70 words/sec| Time 81.71 min|
| Epoch 9, Iter 27300| Avg Loss = 28.71| Avg. ppl = 4.99| Speed 6452.14 words/sec| Time 82.00 min|
| Epoch 9, Iter 27400| Avg Loss = 28.48| Avg. ppl = 5.08| Speed 6314.70 words/sec| Time 82.30 min|
| Epoch 9, Iter 27500| Avg Loss = 28.93| Avg. ppl = 5.11| Speed 6417.10 words/sec| Time 82.59 min|
| Epoch 9, Iter 27600| Avg Loss = 28.71| Avg. ppl = 5.02| Speed 6391.18 words/sec| Time 82.89 min|
| Epoch 9, Iter 27700| Avg Loss = 28.49| Avg. ppl = 5.01| Speed 6424.83 words/sec| Time 83.18 min|
| Epoch 9, Iter 27800| Avg Loss = 27.98| Avg. ppl = 5.01| Speed 6324.06 words/sec| Time 83.48 min|
| Epoch 9, Iter 27900| Avg Loss = 28.63| Avg. ppl = 5.07| Speed 6420.47 words/sec| Time 83.77 min|
| Epoch 9, Iter 28000| Avg Loss = 28.73| Avg. ppl = 5.03| Speed 6248.30 words/sec| Time 84.07 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 17.605742
Hit patience 2


| Epoch 9, Iter 28100| Avg Loss = 28.89| Avg. ppl = 5.09| Speed 6099.48 words/sec| Time 84.38 min|
| Epoch 9, Iter 28200| Avg Loss = 28.73| Avg. ppl = 5.12| Speed 6273.67 words/sec| Time 84.68 min|
| Epoch 9, Iter 28300| Avg Loss = 28.63| Avg. ppl = 5.06| Speed 6266.13 words/sec| Time 84.98 min|
| Epoch 9, Iter 28400| Avg Loss = 28.71| Avg. ppl = 5.07| Speed 6372.52 words/sec| Time 85.28 min|
| Epoch 9, Iter 28500| Avg Loss = 28.25| Avg. ppl = 5.02| Speed 6385.47 words/sec| Time 85.57 min|
| Epoch 9, Iter 28600| Avg Loss = 28.84| Avg. ppl = 5.21| Speed 6440.26 words/sec| Time 85.86 min|
| Epoch 9, Iter 28700| Avg Loss = 29.10| Avg. ppl = 5.12| Speed 6329.76 words/sec| Time 86.16 min|
| Epoch 9, Iter 28800| Avg Loss = 28.78| Avg. ppl = 5.13| Speed 6261.38 words/sec| Time 86.46 min|
| Epoch 9, Iter 28900| Avg Loss = 28.80| Avg. ppl = 5.16| Speed 6404.84 words/sec| Time 86.75 min|
| Epoch 9, Iter 29000| Avg Loss = 29.20| Avg. ppl = 5.17| Speed 6289.36 words/sec| Time 87.05 min|
| <Train S

Report on validation set:
Validation:  Dev. ppl = 17.636673
Hit patience 3
Hit #3 trial
early stop!


### Evaluation

#### Beam search

In [0]:
def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Decode every source sentence with the model's own beam search.

    The model is switched to eval mode and gradients are disabled while decoding;
    train mode is switched back on afterwards if it was on at entry.
    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): forwarded to model.beam_search as `beam_size`
    @param max_decoding_time_step (int): forwarded to model.beam_search as `max_decoding_time_step`
    @returns hypotheses (List[List[Hypothesis]]): one entry per source sentence, in input order,
                                                  holding whatever model.beam_search returned for it.
    """
    # Remember the mode so the caller's model is left as it was found.
    resume_training = model.training
    model.eval()

    with torch.no_grad():
        decoded = [
            model.beam_search(sentence,
                              beam_size=beam_size,
                              max_decoding_time_step=max_decoding_time_step)
            for sentence in tqdm(test_data_src, desc='Decoding', file=sys.stdout)
        ]

    if resume_training:
        model.train(resume_training)

    return decoded

In [0]:
def compute_corpus_level_bleu_score(references: List[List[str]], hypotheses: List[Hypothesis]) -> float:
    """ Given decoding results and reference sentences, compute corpus-level BLEU score.

    If the first reference starts with '<s>', the first and last token of EVERY
    reference are dropped before scoring (this assumes all references are wrapped
    in sentence-boundary markers whenever the first one is -- TODO confirm against
    the corpus reader).
    @param references (List[List[str]]): a list of gold-standard reference target sentences
    @param hypotheses (List[Hypothesis]): a list of hypotheses, one for each reference
    @returns bleu_score: corpus-level BLEU score as returned by corpus_bleu;
                         0.0 when both `references` and `hypotheses` are empty
    """
    # Nothing to score: return early instead of indexing into an empty list.
    if not references and not hypotheses:
        return 0.0

    # Guard both index levels so that an empty reference list or an empty first
    # reference cannot raise IndexError during marker detection.
    has_boundary_tokens = (bool(references)
                           and bool(references[0])
                           and references[0][0] == '<s>')
    if has_boundary_tokens:
        references = [ref[1:-1] for ref in references]

    # corpus_bleu expects a list of reference *sets*; each hypothesis has exactly one reference here.
    bleu_score = corpus_bleu([[ref] for ref in references],
                             [hyp.value for hyp in hypotheses])
    return bleu_score

In [0]:
def decode(beam_size: int = 5, max_decoding_time_step: int = 70, output_path: str = 'test_output.txt') -> None:
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    Also computes the corpus-level BLEU score against the gold-standard target sentences
    and prints it to stderr.

    Reads the notebook-level names `test_es`, `test_en` and `model_save_path`.
    @param beam_size (int): beam size passed to beam_search
    @param max_decoding_time_step (int): decoding step limit passed to beam_search
    @param output_path (str): file that receives the top hypothesis of every
                              test sentence, one per line
    """

    print("load test source sentences", file=sys.stderr)
    test_data_src = read_corpus(test_es, source='src')

    print("load test target sentences", file=sys.stderr)
    test_data_tgt = read_corpus(test_en, source='tgt')

    print("load trained model", file=sys.stderr)
    model = NMT.load(model_save_path)

    # Decode on the first GPU when one is visible, otherwise stay on CPU.
    if torch.cuda.device_count() > 0:
        print("Transfer to cuda!!")
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=beam_size,
                             max_decoding_time_step=max_decoding_time_step)

    # Only the first hypothesis of each sentence is scored and saved.
    top_hypotheses = [hyps[0] for hyps in hypotheses]
    bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
    print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    # Explicit UTF-8 so non-ASCII tokens in a translation cannot fail on a
    # platform whose default encoding is not UTF-8.
    with open(output_path, 'w', encoding='utf-8') as f:
        for top_hyp in top_hypotheses:
            f.write(' '.join(top_hyp.value) + '\n')

In [31]:
# Evaluate the saved model on the test set: prints the corpus BLEU score to
# stderr and writes the top translation of each sentence to test_output.txt.
decode()

load test source sentences
load test target sentences
load trained model


Transfer to cuda!!
Decoding: 100%|██████████| 8064/8064 [05:25<00:00, 24.77it/s]


Corpus BLEU: 22.35384594841707


### Inference

In [33]:
# Qualitative check: translate one hand-picked Spanish sentence and print the
# result next to a human reference translation.
# NOTE(review): relies on a notebook-level `model` from an earlier cell (the
# `model` inside decode() is local) -- confirm which checkpoint it holds.
src_sent = 'También tenemos que tener cuidado con el hielo se resbala fácilmente en él.'.split()
en_ref = 'We also have to be careful with the ice, it slides easily on it.'.split()

# Presumably beam_size=5 and max_decoding_time_step=70, mirroring decode() -- verify
# against the positional order of NMT.beam_search.
en_hat = model.beam_search(src_sent, 5, 70)

print("==" * 40)
print("Model Translation:\n")
print(' '.join(en_hat[0].value))  # first hypothesis in the returned list
print("\n")
print("Human Reference:\n")
print(' '.join(en_ref))
print("==" * 40)

Model Translation:

We have to be careful with the ice <unk> <unk> <unk>


Human Reference:

We also have to be careful with the ice, it slides easily on it.
