## Objective
Build a machine translation model (Spainish --> English), combining sub-word modelling, CNN, and LSTM.

## Setup

In [1]:
# connect to google drive
import os
import numpy as np

# mount google drive
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Check for GPU free memory
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
#!pip install psutil
#!pip install humanize
import psutil
import humanize
import GPUtil as GPU
GPUs = GPU.getGPUs()

# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
  process = psutil.Process(os.getpid())
  print('='*40)
  print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
  print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
  print('='*40)
printm() 

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Gen RAM Free: 12.9 GB  | Proc size: 120.4 MB
GPU RAM Free: 15079MB | Used: 0MB | Util   0% | Total 15079MB


In [0]:
# change root directory such that models are saved in google drive during training
root_dir = "/content/gdrive/My Drive/NLP/Char-Based-Model"
os.chdir(root_dir)
!ls

Char-Based-LSTM.ipynb		      NMT_model
code_utils			      NMT_model.optim
en_es_data			      outputs
Machine-Translation-Char-Based.ipynb  __pycache__
MT_char.ipynb			      sanity_check_en_es_data
NMT_char_model.pt		      Tensor_file.log
NMT_char_model.pt.optim


## Imports

In [0]:
# insert the path for utility custom functions
import sys
sys.path.insert(0, os.path.join(root_dir, 'code_utils'))

# custom python functions and classes
from utils import read_corpus, batch_iter, pad_sents, pad_sents_char
from vocab import Vocab, VocabEntry

In [0]:
# basic packages
import math
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from collections import Counter, namedtuple
from docopt import docopt
from itertools import chain
import json
from typing import List, Tuple, Dict, Set, Union
from docopt import docopt

#pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence



#others
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
from tqdm import tqdm
from IPython.core.debugger import set_trace

In [0]:
# Logger

import logging
logger = logging.getLogger("tensor_tracker")

file_handler = logging.FileHandler("Tensor_file.log")
stream_handler = logging.StreamHandler()
formatter = logging.Formatter('%(message)s')

file_handler.setFormatter(formatter)
stream_handler.setFormatter(formatter)

logger.addHandler(file_handler)

#logger.addHandler(stream_handler)

logger.setLevel(logging.DEBUG)

# A helper function to check how tensor sizes change
def log_size(tsr: torch.Tensor, name: str):
    #cls = getclass()
    logger.debug(msg=f"{name} ==> size={tsr.shape}")

In [0]:
logger.debug(msg = 'a')

## Load Data and EDA

In [0]:
# load data
train_es = 'en_es_data/train.es'
train_en = 'en_es_data/train.en'

dev_es = 'en_es_data/dev.es'
dev_en = 'en_es_data/dev.en'

test_es = 'en_es_data/test.es'
test_en = 'en_es_data/test.en'


train_data_src = read_corpus(train_es, source='src')
train_data_tgt = read_corpus(train_en, source='tgt')

dev_data_src = read_corpus(dev_es, source='src')
dev_data_tgt = read_corpus(dev_en, source='tgt')

test_data_src = read_corpus(test_es, source='src')
test_data_tgt = read_corpus(test_en, source='tgt')

train_data = list(zip(train_data_src,train_data_tgt))
dev_data = list(zip(dev_data_src,dev_data_tgt))
test_data = list(zip(test_data_src,test_data_tgt))

#
print("=="*40)
print("Number of examples in train: {}".format(len(train_data)))
print("Number of examples in valid: {}".format(len(dev_data)))
print("Number of examples in test: {}".format(len(test_data)))
#
print("=="*40)
print("Spanish --> English")
es, en = next(iter(dev_data))
print("Sp: {}".format(' '.join(es)))
print("En: {}".format(' '.join(en)))
print("=="*40)


## Build Vocab
# Build Vocab with train set

size = 50000
freq_cutoff= 2
vocab_file = 'en_es_data/vocab.json'

vocab = Vocab.build(train_data_src, train_data_tgt, size, freq_cutoff)
print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

vocab.save(vocab_file)
print('vocabulary saved to %s' % vocab_file)

#
print("=="*40)
print('Note that the <s> and </s> tokens are added while vocab\
      initialization.\nThese tokens are also present in target\
      top frequent words. \nThat is why vocab size for target language is lesser by 2.')
print("=="*40)


# Check tokenization process
print("=="*40)
sents = [['I', 'asgjsssd', 'will', 'be', 'there', 'for', 'you.'], ['This', 'is', 'spartaaaaaaaa.']]
print("Tokenize:\n {} \n {}\n".format(' '.join(sents[0]), ' '.join(sents[1])))

print(vocab.tgt.to_input_tensor(sents, "cpu"))
#
print("=="*40)
print("Note that 3 and 0  are <unk> and <pad> tokens!")
print("=="*40)

Number of examples in train: 216617
Number of examples in valid: 851
Number of examples in test: 8064
Spanish --> English
Sp: El ao pasado proyect estas dos diapositivas para demostrar que la capa de hielo rtico, que durante los ltimos tres millones de aos ha sido del tamao de los 48 estados, se ha reducido en un 40 por ciento.
En: <s> Last year I showed these two slides so that  demonstrate that the arctic ice cap,  which for most of the last three million years  has been the size of the lower 48 states,  has shrunk by 40 percent. </s>
initialize source vocabulary ..
number of word types: 172418, number of word types w/ frequency >= 2: 80623
initialize target vocabulary ..
number of word types: 128873, number of word types w/ frequency >= 2: 64215
generated vocabulary, source 50004 words, target 50002 words
vocabulary saved to en_es_data/vocab.json
Note that the <s> and </s> tokens are added while vocab      initialization.
These tokens are also present in target      top frequent wor

## Char Based Embedding 

In [0]:
class CNN(nn.Module):
  """CNN part of the char-based-encoder system
  @param e_word: dimension of word embedding
  @param e_char: dimension of char embedding
  @param m_size: padded/truncated word length (e.g. number of character per word)
  @param kernel_size: convolition 1d kernel sizw
  """
  
  def __init__(self, e_char, e_word, m_size, kernel_size=5):
    
    super(CNN,self).__init__()
    
    # Attributes
    self.in_channels = e_char
    self.out_channels = e_word
    self.m_size = m_size
    self.kernel_size = kernel_size
    
    # Layers
    self.conv = nn.Conv1d(self.in_channels, self.out_channels, kernel_size)
    # input shape ==> (N, C_in, L) output shape ==> (N, C_out, N_out)
    self.maxpool = nn.MaxPool1d(kernel_size = m_size-kernel_size + 1)
    
    #torch.nn.MaxPool1d(kernel_size, stride=None, padding=0, dilation=1, return_indices=False, ceil_mode=False)
     
  
  def forward(self,X_reshaped):
    """First applies conv1d on X_reshaped and maxpool
    @param X_reshaped: Tensor(batch_size, e_char, m_word)
    @returns X_conv_out: Tensor(batch_size,e_word)
    """
    assert X_reshaped.size(-1) == self.m_size, "Words need to be padded/truncated to size m_word"
    X_conv = self.conv(X_reshaped) # X_conv ==> (batch_size, e_word, m_word-kernel_size + 1)
    X_conv_out = self.maxpool(X_conv) #X_conv_out ==> (batch_size, e_word)
    
    return X_conv_out.squeeze(dim=-1) # remove last dimension

In [0]:
class Highway(nn.Module):
  """Implements the highway network. It provides skip connection feature. 
     Useful for building deep networks.
  """
  def __init__(self, e_word):
    
    super(Highway,self).__init__()
    
    
    #attributes
    self.e_word = e_word # dimension of word embedding
    
    
    #layers
    self.fc_proj = nn.Linear(e_word, e_word)
    self.fc_gate = nn.Linear(e_word, e_word)
    
  def forward(self, X_conv_out):
    """ Forward function for highway
    @param X_conv: Features after Maxpooling the of convolution layer outputs ==>
    Tensor of shape (b,e_word) where e_word is embedding dimension of each word and b is 
    the batch size
    
    @returns X_highway Tensor of shape (b,e_word)
    """
    #set_trace()
    X_proj = F.relu(self.fc_proj(X_conv_out)) #(b,e_word)
    X_gate = torch.sigmoid(self.fc_gate(X_conv_out)) #(b,e_word)
    #X_gate = torch.zeros(X_conv_out.size())
    X_highway = X_gate*X_proj + (1-X_gate)*X_conv_out
    
    return X_highway

In [0]:
class ModelEmbeddings(nn.Module): 
    """
    Class that converts input words to their CNN-based embeddings.
    """
    def __init__(self, embed_size, vocab):
        """
        Init the Embedding layer for one language
        @param embed_size (int): Embedding size (dimensionality) for the output 
        @param vocab (VocabEntry): VocabEntry object. See vocab.py for documentation.
        """
        super(ModelEmbeddings, self).__init__()

        
        self.embed_size = embed_size
        self.vocab = vocab
        self.src_char_len = len(vocab.char2id)
        self.e_char = 50
        self.m_size = 21 # Maximum size of characters in a word (Should be consistent with pad_sents_char)
        self.dropout_rate = 0.3
        
        
        self.char_embeddings = nn.Embedding(self.src_char_len, self.e_char, padding_idx = vocab.char2id['<pad>'])
        self.dropout = nn.Dropout(p=self.dropout_rate)
        self.cnn = CNN(self.e_char, embed_size, self.m_size)
        self.highway = Highway(embed_size)

    def forward(self, input):
        """
        Looks up character-based CNN embeddings for the words in a batch of sentences.
        @param input: Tensor of integers of shape (sentence_length, batch_size, max_word_length) where
            each integer is an index into the character vocabulary

        @param output: Tensor of shape (sentence_length, batch_size, embed_size), containing the 
            CNN-based embeddings for each word of the sentences in the batch
        """

        output = []
        sentence_length = input.size(0)
        #set_trace()
        
        for i in range(sentence_length):
          #set_trace()
          X_t = input[i] #.squeeze(dim=0) #(batch, max_word_length)
          X_enc = self.char_embeddings(X_t) #(batch, max_word_length, e_char)
          X_reshaped = torch.transpose(X_enc,1,2) #(batch,e_char,max_word_len)
          X_conv_out = self.cnn(X_reshaped) #(batch, e_word)
          X_highway = self.highway(X_conv_out) #(batch, e_word)
          X_word = self.dropout(X_highway) #(batch, e_word)
          output.append(X_word)
          
        output = torch.stack(output, dim =0) #(sentence_length, batch, e_word)
        
        return output


## Char based Decoder

In [0]:
class CharDecoder(nn.Module):
    def __init__(self, hidden_size, char_embedding_size=50, target_vocab=None):
        """ Init Character Decoder.

        @param hidden_size (int): Hidden size of the decoder LSTM
        @param char_embedding_size (int): dimensionality of character embeddings
        @param target_vocab (VocabEntry): vocabulary for the target language. See vocab.py for details.
        """
        super(CharDecoder, self).__init__()
        
        self.target_vocab = target_vocab
        self.char_embedding_size = char_embedding_size
        self.hidden_size = hidden_size
        self.padding_index = target_vocab.char2id['<pad>']
        
        self.decoderCharEmb = nn.Embedding(len(target_vocab.char2id),char_embedding_size, padding_idx = target_vocab.char2id['<pad>'])
     
        self.char_output_projection = nn.Linear(hidden_size, len(target_vocab.char2id))
        
        self.charDecoder = nn.LSTM(input_size= char_embedding_size, hidden_size = hidden_size)
    
    def forward(self, input, dec_hidden=None):
        """ Forward pass of character decoder.

        @param input: tensor of integers, shape (length, batch)
        @param dec_hidden: internal state of the LSTM before reading the input characters. A tuple of two tensors of shape (1, batch, hidden_size)

        @returns scores: called s_t in the PDF, shape (length, batch, self.vocab_size)
        @returns dec_hidden: internal state of the LSTM after reading the input characters. A tuple of two tensors of shape (1, batch, hidden_size)
        """
        dec_input = self.decoderCharEmb(input) #(length, batch, char_embedding_size)
        dec_output, dec_hidden = self.charDecoder(dec_input, dec_hidden)
        
        ## dec_output ==> (length, batch, num_dir* hidden_size)
        ## Score compute
        
        scores = self.char_output_projection(dec_output) #scores ==> (length, batch, char_vocab_size)
        
        return scores, dec_hidden


    def train_forward(self, char_sequence, dec_hidden=None):
        """ Forward computation during training.

        @param char_sequence: tensor of integers, shape (length, batch). Note that "length" here and in forward() need not be the same.
        @param dec_hidden: initial internal state of the LSTM, obtained from the output of the word-level decoder. 
         A tuple of two tensors of shape (1, batch, hidden_size)

        @returns The cross-entropy loss, computed as the *sum* of cross-entropy losses of all the words in the batch.
        """

        ### char_sequence corresponds to the sequence x_1 ... x_{n+1} from the handout (e.g., <START>,m,u,s,i,c,<END>).
        
        input_dec = char_sequence[:-1,:] #(length-1,batch)
        targets = char_sequence[1:,:] #(length-1,batch)
        loss = nn.CrossEntropyLoss(reduction='sum', ignore_index = self.padding_index)
        
        #set_trace()
        scores, last_dec = self.forward(input_dec, dec_hidden)
        cum_loss = 0 
        #scores ==> (length-1,batch, vocab_size)
        for i in range(len(scores)):
          preds = scores[i] #(batch, vocab_size)
          target = targets[i] #(batch)
          loss_t = loss(preds,target) #number
          #set_trace()
          cum_loss += loss_t
        #set_trace()
        
        return cum_loss
        
        ### END YOUR CODE

    def decode_greedy(self, initialStates, device, max_length=21):
        """ Greedy decoding
        @param initialStates: initial internal state of the LSTM, a tuple of two tensors of size (1, batch, hidden_size)
        @param device: torch.device (indicates whether the model is on CPU or GPU)
        @param max_length: maximum length of words to decode

        @returns decodedWords: a list (of length batch) of strings, each of which has length <= max_length.
                              The decoded strings should NOT contain the start-of-word and end-of-word characters.
        """


        ### Use target_vocab.char2id and target_vocab.id2char to convert between integers and characters
        ### Use torch.tensor(..., device=device) to turn a list of character indices into a tensor.
        ### We use curly brackets as start-of-word and end-of-word characters. That is, use the character '{' for <START> and '}' for <END>.
        ### Their indices are self.target_vocab.start_of_word and self.target_vocab.end_of_word, respectively.
        
        
        batch = initialStates[0].size(1)
        
        char_in  = X = torch.tensor([[self.target_vocab.char2id['{'] for i in range(batch)]], device=device) #(1,batch)
        
        dec_hidden = initialStates
        
        output_word = []
        
        for i in range(max_length):
          scores, dec_hidden = self.forward(char_in, dec_hidden) 
          #char_id = torch.arg# scores ==> (length = 1, batch, vocab_size)
          char_in = torch.argmax(scores, dim = -1) #(1,batch)
          output_word.append(char_in)
        
        words = torch.squeeze(torch.stack(output_word),dim=1) #(max_length,batch)
        
        final_words = []
        
        for i in range(batch):
          word_ids = words[:,i]
          word = ''
          for idx in word_ids:
            if idx == self.target_vocab.char2id['}']:
              break
            else:
              word += self.target_vocab.id2char[idx.item()]
          final_words.append(word)
        return final_words

## Machine Translation 

### Encoder

In [0]:
class Encoder(nn.Module):
  """Encodes the input sequence"""
  
  def __init__(self, embed_size, hidden_size, vocab_src, dropout_rate=0.2):
    super(Encoder, self).__init__()
    
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.vocab = vocab_src # vocab in source language
    self.dropout_rate = dropout_rate
    
    self.encode_embeddings = ModelEmbeddings(embed_size, vocab_src)
    self.LSTM_encode = nn.LSTM(embed_size, hidden_size, bidirectional = True)
    self.h_projection = nn.Linear(2*hidden_size, hidden_size, bias = False)
    self.c_projection = nn.Linear(2*hidden_size, hidden_size, bias = False)
    self.dropout = nn.Dropout(dropout_rate)
    
  def forward(self, src_seq_padded, source_lengths):
    """ Input sequence is passed through bidirectional LSTM for encoding 
        @ param: src_seq_padded ==> Tensor (src_len, batch)
        @ Note that embed size is word embed dimension
        @ param: source_lengths: actual length of input sentences before padding
        @ returns: hidden states at each time step of sequence
        @ returns: initial hidden state for decoder
    """
    X = self.encode_embeddings(src_seq_padded)
    X = pack_padded_sequence(X, source_lengths)
    enc_hiddens, (last_hidden, last_cell) = self.LSTM_encode(X)
    
    enc_hiddens, _ = pad_packed_sequence(enc_hiddens, batch_first = True) #(batch, seq_len, hidden_size*2)
    
    last_hidden = torch.cat((last_hidden[0,:], last_cell[1,:]), dim = 1) #(batch, hideen_size*2)
    last_cell = torch.cat((last_cell[0,:],last_cell[1,:]), dim = 1) #(batch, hiddden_size*2)
    
    init_decoder_hidden = self.h_projection(last_hidden) #(batch,hidden_size)
    init_decoder_cell = self.c_projection(last_cell)
    
    decoder_state = (init_decoder_hidden, init_decoder_cell)
    
    return enc_hiddens, decoder_state
    

In [0]:
## Test encoder
embed_size = 5
hidden_size = 10
vocab_src = VocabEntry()
#len(vocab_src.char2id)
sent_test = [['hi', 'i', 'can', 'go'], ['do', 'it']]
sent_lens = [4,2]
sent_padded = vocab_src.to_input_tensor_char(sent_test, device = torch.device("cpu"))
#sent_padded.shape #(4,2,21)
encoder_model = Encoder(embed_size, hidden_size, vocab_src, dropout_rate=0.2)
enc_hiddens, decoder_state = encoder_model(sent_padded, sent_lens)

print('enc_hiddens shape = {}, shape expected = {}'.format(enc_hiddens.shape, (2,4,20)))
print('decoder_state[0] shape = {}, shape expected = {}'.format(decoder_state[0].shape, (2,10)))

enc_hiddens shape = torch.Size([2, 4, 20]), shape expected = (2, 4, 20)
decoder_state[0] shape = torch.Size([2, 10]), shape expected = (2, 10)


### Decoder 

In [0]:
class Decoder(nn.Module):
  """ Decodes character by character """
  def __init__(self, embed_size, hidden_size, vocab_tgt, dropout_rate=0.2):
    super(Decoder, self).__init__()
    
    # attributes
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.vocab = vocab_tgt
    self.dropout_rate = dropout_rate
    
    # layers
    self.decode_embeddings = ModelEmbeddings(embed_size, vocab_tgt)
    self.LSTM_decoder = nn.LSTMCell(embed_size+hidden_size, hidden_size)
    self.att_projection = nn.Linear(2*hidden_size, hidden_size, bias = False)
    self.combined_output_projection = nn.Linear(3*hidden_size, hidden_size, bias= False)
    self.dropout = nn.Dropout(dropout_rate)
    
  def forward(self, enc_hiddens, enc_masks, dec_state, target_padded):
    """ Decodes character by character
        @ param: enc_hiddens ==> Tensor (batch, src_len, 2*hidden_size)
        @ param: enc_masks ==> Tensor (batch, src_len) # mask over pad tokens (no attention given to masks)
        @ param: dec_state ==> Tuple (Tensor, Tensor) ==> Initial state of decoder (hidden_state, cell_state) #(batch, hidden_size)
        @ param: target_padded ==> gold stadard targets given to compute loss: Tensor (tgt_len, batch, max_word_len)
        @ return: combined_outputs ==> Tensor (tgt_len, batch, hidden_size)
    """
    # Chop the <End> token for max length sentences
    target_padded = target_padded[:-1]
    # initialize first combined output vector o_{t-1} as zeros
    batch_size = enc_hiddens.size(0)
    #set_trace()
    o_prev = torch.zeros(batch_size, self.hidden_size, device = self.att_projection.weight.device)
    
    # initialize a list that can be used to collect combined output at each step
    combined_outputs = []
    
    enc_hiddens_proj = self.att_projection(enc_hiddens) #(batch, len_src, hidden_size)
    Y =  self.decode_embeddings(target_padded) #(batch, len_tgt, embed_size)
    
    # split Y at each time step
    Y_splited = torch.split(Y, 1, dim = 0) # return tuple with Tensor elements of shape (1, batch, embed_size)
    
    tgt_len = target_padded.size(0)
    
    # start decoding
    for i in range(tgt_len):
      Y_t = torch.squeeze(Y_splited[i], dim=0) #(batch, embed_size)
      Ybar_t = torch.cat((Y_t, o_prev), dim =1) #(batch, embed_size + hidden_size) 
      dec_state, o_prev = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks)
      combined_outputs.append(o_prev)
    
    combined_outputs = torch.stack(combined_outputs)
    
    return combined_outputs
      
  ## Perform global attention mechanism
  def step(self, Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks):
    """ Returns combined output for current time step 
        @ param: Ybar_t ==> Tensor(batch, hidden_size+embed_size)
        @ param: dec_state ==> tuple(Tensor, Tensor) each Tensor of size (batch, hidden_size)
        @ param: enc_hiddens ==> Tensor (batch, len_src, 2*hidden_size)
        @ param: enc_hiddens_proj ==> Tensor (batch, len_src, hidden_size)
        @ param: enc_masks ==>  Tensor of sequence masks (batch, len_src)

        @ returns: dec_state ==> tuple(Tensor,Tensor) each Tensor (batch, hidden_size)
        @ returns: combined_output ==> Tensor (batch, hidden_size) # combined output at step t
    """

    combined_output = None

    ## compute attention scores
    dec_state = self.LSTM_decoder(Ybar_t, dec_state)
    dec_hidden, dec_cell = dec_state

    aug_dec_hidden = torch.unsqueeze(dec_hidden, dim=2) #(batch, hidden_size, 1)
    e_t = torch.bmm(enc_hiddens_proj, aug_dec_hidden) #(batch, len_src, hidden_size) * (batch, hidden_size, 1) ==> (batch, len_src, 1)
    e_t = torch.squeeze(e_t, dim =-1) #(batch, len_src)

    ## Set e_t to -inf if enc_mask has 1 (No attention to padded values)
    if enc_masks is not None:
      e_t.data.masked_fill_(enc_masks.byte(), -float('inf'))

    ## take softmax to get attention weights
    alpha_t = F.softmax(e_t, dim = 1) # (batch, len_src)
    aug_alpha_t = torch.unsqueeze(alpha_t, dim =-1) # (batch, len_src, 1)
    tr_hiddens = enc_hiddens.transpose(1,2) #(batch, 2*hidden_size, len_src)
    a_t = torch.bmm(tr_hiddens, aug_alpha_t) #(batch, 2*hidden_size,1)
    a_t = torch.squeeze(a_t, dim=-1) #(batch, 2*hidden_size)

    ## combined output
    U_t = torch.cat((a_t, dec_hidden), dim=1) #(batch, 3*hidden_size)
    V_t = self.combined_output_projection(U_t) #(batch, hidden_size)
    combined_output  = self.dropout(torch.tanh(V_t)) #(batch, hidden_size)

    return dec_state, combined_output

In [0]:
## test decoder model
embed_size = 5
hidden_size = 10

vocab_src = VocabEntry()
vocab_tgt = VocabEntry()

sent_src = [['Me', 'gustaria', 'una', 'no', 'cerveza'], ['Me', 'llamo', 'Mondly']]
sent_tgt = [['hi', 'i', 'can', 'go'], ['do', 'it']]

source_padded = vocab_src.to_input_tensor_char(sent_src, device = torch.device("cpu"))
sent_lens = [5,3]

target_padded = vocab_tgt.to_input_tensor_char(sent_tgt, device = torch.device("cpu"))

# source_padded.shape #(5,2,21)
# target_padded.shape #(4,2,21)

encoder_model = Encoder(embed_size, hidden_size, vocab_src, dropout_rate=0.2)
decoder_model = Decoder(embed_size, hidden_size, vocab_src, dropout_rate=0.2)

enc_hiddens, dec_state = encoder_model(source_padded, sent_lens)


print('enc_hiddens shape = {}, shape expected = {}'.format(enc_hiddens.shape, (2,5,20)))
print('decoder_state[0] shape = {}, shape expected = {}'.format(dec_state[0].shape, (2,10)))

## Decode 
enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
outputs = decoder_model(enc_hiddens, enc_masks, dec_state, target_padded)

##
print('output shape = {}, shape expected = {}'.format(outputs.shape, (3,2,10))) # len_tgt-1, batch, hidden_size

enc_hiddens shape = torch.Size([2, 5, 20]), shape expected = (2, 5, 20)
decoder_state[0] shape = torch.Size([2, 10]), shape expected = (2, 10)
output shape = torch.Size([3, 2, 10]), shape expected = (3, 2, 10)


### Machine Translation Model

In [0]:
class NMT_char(nn.Module):
  def __init__(self, embed_size, hidden_size, vocab, dropout_rate = 0.2, char_decoder = True):
    super(NMT_char, self).__init__()
    
    # attributes
    self.hidden_size = hidden_size
    self.embed_size = embed_size
    self.vocab = vocab
    self.dropout_rate = dropout_rate
    
    # layers/models
    self.encoder = Encoder(embed_size, hidden_size, vocab.src, dropout_rate)
    self.decoder = Decoder(embed_size, hidden_size, vocab.tgt, dropout_rate)
    
    self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
    self.dropout = nn.Dropout(dropout_rate)
    
    if char_decoder:
      self.charDecoder = CharDecoder(hidden_size, target_vocab = vocab.tgt)
    else:
      self.charDecoder = None
      
  def forward(self, source, target):
    """
    Machine translation model.
    @ param: source: source sentences List[List [st]]
    @ param target: target sentences List[List[str]]
    """
    source_lengths = [len(s) for s in source]
    
    target_padded = self.vocab.tgt.to_input_tensor(target, device = self.device) # Tensor: (tgt_len, batch)
    source_padded_chars = self.vocab.src.to_input_tensor_char(source, device = self.device) # Tensor: (src_len, batch, max_word_len)
    target_padded_chars = self.vocab.tgt.to_input_tensor_char(target, device = self.device) # Tensor: (tgt_len, batch, max_word_len)
    
    enc_hiddens, dec_init_state = self.encoder(source_padded_chars, source_lengths)
    enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
    combined_outputs = self.decoder(enc_hiddens, enc_masks, dec_init_state, target_padded_chars)
    
    ##
    #set_trace()
    token_scores = self.target_vocab_projection(combined_outputs) #(tgt_len, batch, tgt_vocab_size))
    P = F.log_softmax(token_scores, dim = -1) #(tgt_len, batch, tgt_vocab_size))
    
    # Zero out, probabilities for which we have nothing in the target text
    target_masks = (target_padded != self.vocab.tgt['<pad>']).float()
    
    # Compute log probability of generating true target words
    target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
    scores = target_gold_words_log_prob.sum() 
    
    if self.charDecoder is not None:
      max_word_len = target_padded_chars.shape[-1]
      #target_words = target_padded[1:].contiguous().view(-1,max_word_len)
      target_chars = target_padded_chars[1:].contiguous().view(-1,max_word_len) #(tgt_len-1*batch, max_word_len)
      target_outputs = combined_outputs.view(-1, self.hidden_size) # (tgt_len-1*batch, hidden_size)
      
      target_chars_oov = target_chars #torch.index_select(target_chars, dim=0, index=oovIndices)
      rnn_states_oov = target_outputs #torch.index_select(target_outputs, dim=0, index=oovIndices)
      oovs_losses = self.charDecoder.train_forward(target_chars_oov.t(), (rnn_states_oov.unsqueeze(0), rnn_states_oov.unsqueeze(0)))
      scores = scores - oovs_losses

    return scores
  
  

  def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
      """ Generate sentence masks for encoder hidden states.

      @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                   src_len = max source length, h = hidden size.
      @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.

      @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                  where src_len = max source length, h = hidden size.
      """
      enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
      for e_id, src_len in enumerate(source_lengths):
          enc_masks[e_id, src_len:] = 1
          
      return enc_masks.to(self.device)
    
      
  # Beam search
    
  def beam_search(self, src_sent, beam_size =5, max_decoding_time_step =70):
      """ Given a single source sentence, perform beam search, yielding translations in the target language.
      @param src_sent (List[str]): a single source sentence (words)
      @param beam_size (int): beam size
      @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
      @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields:
              value: List[str]: the decoded target sentence, represented as a list of words
              score: float: the log-likelihood of the target sentence
      """

      #self.device = self.att_projection.weight.device
      Hypothesis = namedtuple('Hypothesis', ['value', 'score'])
      src_sents_var = self.vocab.src.to_input_tensor_char([src_sent], self.device) #(len_src, 1, max_word_len)

      src_encodings, dec_init_vec = self.encoder(src_sents_var, [len(src_sent)]) # src_encodings => (1, len_src, hidden_size*2)
      src_encodings_att_linear = self.decoder.att_projection(src_encodings) # (1, len_src, hidden_size*2)

      h_tm1 = dec_init_vec #(tensor, tensor) => tensor (1,hidden_size)
      att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

      eos_id = self.vocab.tgt['</s>']

      hypotheses = [['<s>']]
      hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
      completed_hypotheses = []


      t = 0
      while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
          t += 1
          hyp_num = len(hypotheses)
          #set_trace()

          exp_src_encodings = src_encodings.expand(hyp_num, src_encodings.size(1), src_encodings.size(2)) # share decoder state across current hypotheses

          exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2))

          y_tm1 = self.vocab.tgt.to_input_tensor_char(list([hyp[-1]] for hyp in hypotheses), device=self.device) # get last word in hypotheses (1,len(hypotheses)) --> (len(hypotheses), 1, max_word_length)
          y_t_embed = self.decoder.decode_embeddings(y_tm1) #(len(hypotheses), 1, embed_size)
          y_t_embed = torch.squeeze(y_t_embed, dim=0) #(len(hypotheses), embed_size)


          x = torch.cat([y_t_embed, att_tm1], dim=-1)

          (h_t, cell_t), att_t = self.decoder.step(x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)

          # log probabilities over target words
          log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

          live_hyp_num = beam_size - len(completed_hypotheses)
          contiuating_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
          top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores, k=live_hyp_num)

          prev_hyp_ids = top_cand_hyp_pos / len(self.vocab.tgt)
          hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)

          new_hypotheses = []
          live_hyp_ids = []
          new_hyp_scores = []
          
          #set_trace()

          decoderStatesForUNKsHere = []
          for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
              prev_hyp_id = prev_hyp_id.item()
              hyp_word_id = hyp_word_id.item()
              cand_new_hyp_score = cand_new_hyp_score.item()

              hyp_word = self.vocab.tgt.id2word[hyp_word_id]

              # Record output layer in case UNK was generated
              if hyp_word == "<unk>":
                hyp_word = "<unk>"+str(len(decoderStatesForUNKsHere))
                decoderStatesForUNKsHere.append(att_t[prev_hyp_id])

              new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
              if hyp_word == '</s>':
                  completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                         score=cand_new_hyp_score))
              else:
                  new_hypotheses.append(new_hyp_sent)
                  live_hyp_ids.append(prev_hyp_id)
                  new_hyp_scores.append(cand_new_hyp_score)
                  
          #set_trace()

          if len(decoderStatesForUNKsHere) > 0 and self.charDecoder is not None: # decode UNKs
              decoderStatesForUNKsHere = torch.stack(decoderStatesForUNKsHere, dim=0)
              decodedWords = self.charDecoder.decode_greedy((decoderStatesForUNKsHere.unsqueeze(0), \
                                                             decoderStatesForUNKsHere.unsqueeze(0)), max_length=21, device=self.device)
              assert len(decodedWords) == decoderStatesForUNKsHere.size()[0], "Incorrect number of decoded words"
              for hyp in new_hypotheses:
                if hyp[-1].startswith("<unk>"):
                      hyp[-1] = decodedWords[int(hyp[-1][5:])]#[:-1]

          if len(completed_hypotheses) == beam_size:
              break

          live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
          h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
          att_tm1 = att_t[live_hyp_ids]

          hypotheses = new_hypotheses
          hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)
      
      #set_trace()
      
      if len(completed_hypotheses) == 0:
          completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                 score=hyp_scores[0].item()))

      completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
      
      return completed_hypotheses
    

  @property
  def device(self):
      """ Determine which device to place the Tensors upon, CPU or GPU.
      """
      return self.target_vocab_projection.weight.device

  @staticmethod
  def load(model_path: str, char_decoder=True):
      """ Load the model from a file.
      @param model_path (str): path to model
      """
      params = torch.load(model_path, map_location=lambda storage, loc: storage)
      args = params['args']
      model = NMT(vocab=params['vocab'], char_decoder= char_decoder, **args)
      model.load_state_dict(params['state_dict'])

      return model

  def save(self, path: str):
      """ Save the odel to a file.
      @param path (str): path to the model
      """
      print('save model parameters to [%s]' % path)

      params = {
          'args': dict(embed_size=self.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate),
          'vocab': self.vocab,
          'state_dict': self.state_dict()
      }

      torch.save(params, path)    

## Training

In [0]:
## Evaluation metric
def evaluate_ppl(model, dev_data, batch_size=32):
    """ Evaluate perplexity on dev sentences
    @param model (NMT): NMT_char Model
    @param dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (batch size)
    @returns ppl (perplixty on dev sentences)
    """
    was_training = model.training
    model.eval()

    cum_loss = 0.
    cum_tgt_words = 0.

    # no_grad() signals backend to throw away all gradients
    with torch.no_grad():
        for src_sents, tgt_sents in batch_iter(dev_data, batch_size):
            loss = -model(src_sents, tgt_sents).sum()

            cum_loss += loss.item()
            tgt_word_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            cum_tgt_words += tgt_word_num_to_predict

        ppl = np.exp(cum_loss / cum_tgt_words)

    if was_training:
        model.train()

    return ppl

In [0]:
## Train

def train_model(model, optimizer, clip_grad =5.0, max_epoch =30, max_patience = 3, max_trial = 3, lr_decay = 0.5, train_batch_size = 64, log_every = 100, valid_niter = 1000):
  
  
  print('Training begins...')
  ## Temp variables
  num_trial = 0
  train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
  cum_examples = report_examples  = valid_num = 0
  hist_valid_scores = []
  train_time = begin_time = time.time()
  
  # put the model in training mode
  model.train()
  
  # iterate over the epochs
  for epoch in range(max_epoch):
    for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
        
        train_iter += 1
        optimizer.zero_grad()
        batch_size = len(src_sents)
        
        example_losses = -model(src_sents, tgt_sents)
        batch_loss = example_losses.sum()
        loss = batch_loss/batch_size
        loss.backward() # autograd
        
        # Clip gradient
        grad_norn = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
        optimizer.step() # update parameters
        
        batch_losses_val = batch_loss.item()
        report_loss += batch_losses_val
        cum_loss += batch_losses_val
        
        tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
        report_tgt_words += tgt_words_num_to_predict
        cum_tgt_words += tgt_words_num_to_predict
        report_examples += batch_size
        cum_examples += batch_size
        
        # print interim report about training
        
        if train_iter % log_every == 0:
            #set_trace()
            print('| Epoch %d, Iter %d| Avg Loss = %.2f| Avg. ppl = %.2f| Speed %.2f words/sec| Time %.2f min|' % (epoch+1, train_iter, report_loss / report_examples, math.exp(report_loss / report_tgt_words),
                                                                                     report_tgt_words / (time.time() - train_time), (time.time() - begin_time)/60.0))

            train_time = time.time()
            report_loss = report_tgt_words = report_examples = 0.
        
        # validation
        if train_iter % valid_niter == 0:
            
            print('| <Train Summary> | Epoch %d, Iter %d| Cum. loss = %.2f| Cum. ppl = %.2f|' % (epoch+1, train_iter, cum_loss / cum_examples, np.exp(cum_loss / cum_tgt_words)))

            cum_loss = cum_examples = cum_tgt_words = 0.
            valid_num += 1

            print('Report on validation set:', file=sys.stderr)

            # compute dev. ppl and bleu
            dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
            valid_metric = -dev_ppl

            print('Validation:  Dev. ppl = %f' % (dev_ppl), file=sys.stderr)

            
            # learning rate scheduling
            
            is_better = (len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores))
            hist_valid_scores.append(valid_metric)

            if is_better:
                patience = 0
                print('Save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                model.save(model_save_path)

                # also save the optimizers' state
                torch.save(optimizer.state_dict(), model_save_path + '.optim')
                
            elif patience < int(max_patience):
                patience += 1
                print('Hit patience %d' % patience, file=sys.stderr)

                if patience == int(max_patience):
                    num_trial += 1
                    print('Hit #%d trial' % num_trial, file=sys.stderr)
                    
                    if num_trial == int(max_trial):
                        print('early stop!', file=sys.stderr)
                        return

                    # decay lr, and restore from previously best checkpoint
                    lr = optimizer.param_groups[0]['lr'] * float(lr_decay)
                    print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                    # load model
                    params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                    model.load_state_dict(params['state_dict'])
                    model = model.to(device)

                    print('restore parameters of the optimizers', file=sys.stderr)
                    optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                    # set new lr
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr

                    # reset patience
                    patience = 0

            if epoch +1 == int(max_epoch):
                print('Training stopped <-> Reached maximum number of epochs!', file=sys.stderr)
                return


In [0]:
# initialize the model
model = NMT_char(embed_size= 256, hidden_size=256, dropout_rate=0.3, vocab=vocab)

# define model saving path
model_save_path = 'NMT_char_model.pt'

# transfer the model to cuda if available
device = torch.device("cuda:0" if torch.cuda.device_count()>0 else "cpu")
print('Use device: %s' % device, file=sys.stderr)
model = model.to(device)

# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# parameter initialization
uniform_init = 0.1
if np.abs(uniform_init) > 0.:
  print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
  for p in model.parameters():
    p.data.uniform_(-uniform_init, uniform_init)


# train parameters
max_epoch =30
train_batch_size = 64

# train the model
train_model(model, optimizer, max_epoch =max_epoch, train_batch_size = train_batch_size)

Use device: cuda:0
uniformly initialize parameters [-0.100000, +0.100000]


Training begins...
| Epoch 1, Iter 100| Avg Loss = 413.37| Avg. ppl = 16136669329.75| Speed 3308.79 words/sec| Time 0.57 min|
| Epoch 1, Iter 200| Avg Loss = 337.06| Avg. ppl = 197750203.67| Speed 3253.57 words/sec| Time 1.15 min|
| Epoch 1, Iter 300| Avg Loss = 313.76| Avg. ppl = 45282543.43| Speed 3182.33 words/sec| Time 1.74 min|
| Epoch 1, Iter 400| Avg Loss = 292.17| Avg. ppl = 16321832.60| Speed 3178.64 words/sec| Time 2.33 min|
| Epoch 1, Iter 500| Avg Loss = 283.48| Avg. ppl = 8095768.21| Speed 3226.37 words/sec| Time 2.92 min|
| Epoch 1, Iter 600| Avg Loss = 267.50| Avg. ppl = 4260720.87| Speed 3150.62 words/sec| Time 3.51 min|
| Epoch 1, Iter 700| Avg Loss = 264.25| Avg. ppl = 2643083.10| Speed 3209.91 words/sec| Time 4.11 min|
| Epoch 1, Iter 800| Avg Loss = 258.06| Avg. ppl = 1958380.35| Speed 3161.89 words/sec| Time 4.71 min|
| Epoch 1, Iter 900| Avg Loss = 248.80| Avg. ppl = 1216200.06| Speed 3142.00 words/sec| Time 5.31 min|
| Epoch 1, Iter 1000| Avg Loss = 239.33| Avg. 

Report on validation set:
Validation:  Dev. ppl = 764692.806821
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 1, Iter 1100| Avg Loss = 230.13| Avg. ppl = 528805.81| Speed 2755.74 words/sec| Time 6.57 min|
| Epoch 1, Iter 1200| Avg Loss = 228.33| Avg. ppl = 362236.16| Speed 3170.24 words/sec| Time 7.17 min|
| Epoch 1, Iter 1300| Avg Loss = 217.73| Avg. ppl = 255817.32| Speed 3169.29 words/sec| Time 7.76 min|
| Epoch 1, Iter 1400| Avg Loss = 214.80| Avg. ppl = 195466.50| Speed 3169.71 words/sec| Time 8.35 min|
| Epoch 1, Iter 1500| Avg Loss = 210.47| Avg. ppl = 147539.63| Speed 3171.74 words/sec| Time 8.95 min|
| Epoch 1, Iter 1600| Avg Loss = 208.82| Avg. ppl = 119077.53| Speed 3217.85 words/sec| Time 9.54 min|
| Epoch 1, Iter 1700| Avg Loss = 205.45| Avg. ppl = 98668.34| Speed 3160.25 words/sec| Time 10.14 min|
| Epoch 1, Iter 1800| Avg Loss = 196.66| Avg. ppl = 78043.82| Speed 3135.71 words/sec| Time 10.74 min|
| Epoch 1, Iter 1900| Avg Loss = 195.58| Avg. ppl = 64861.58| Speed 3158.60 words/sec| Time 11.33 min|
| Epoch 1, Iter 2000| Avg Lo

Report on validation set:
Validation:  Dev. ppl = 59222.300336
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 1, Iter 2100| Avg Loss = 190.59| Avg. ppl = 47500.56| Speed 2950.77 words/sec| Time 12.58 min|
| Epoch 1, Iter 2200| Avg Loss = 184.28| Avg. ppl = 38028.27| Speed 3142.93 words/sec| Time 13.17 min|
| Epoch 1, Iter 2300| Avg Loss = 183.56| Avg. ppl = 31485.66| Speed 3171.67 words/sec| Time 13.77 min|
| Epoch 1, Iter 2400| Avg Loss = 180.24| Avg. ppl = 29172.01| Speed 3078.35 words/sec| Time 14.38 min|
| Epoch 1, Iter 2500| Avg Loss = 180.07| Avg. ppl = 26478.90| Speed 3132.72 words/sec| Time 14.98 min|
| Epoch 1, Iter 2600| Avg Loss = 179.86| Avg. ppl = 24284.27| Speed 3218.43 words/sec| Time 15.57 min|
| Epoch 1, Iter 2700| Avg Loss = 176.30| Avg. ppl = 19835.42| Speed 3162.87 words/sec| Time 16.17 min|
| Epoch 1, Iter 2800| Avg Loss = 173.31| Avg. ppl = 19186.40| Speed 3173.07 words/sec| Time 16.76 min|
| Epoch 1, Iter 2900| Avg Loss = 172.10| Avg. ppl = 16149.67| Speed 3154.71 words/sec| Time 17.36 min|
| Epoch 1, Iter 3000| Avg Lo

Report on validation set:
Validation:  Dev. ppl = 18589.042962
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 1, Iter 3100| Avg Loss = 168.09| Avg. ppl = 13772.77| Speed 2968.56 words/sec| Time 18.58 min|
| Epoch 1, Iter 3200| Avg Loss = 165.81| Avg. ppl = 12160.82| Speed 3182.22 words/sec| Time 19.17 min|
| Epoch 1, Iter 3300| Avg Loss = 164.87| Avg. ppl = 11895.77| Speed 3222.98 words/sec| Time 19.75 min|
| Epoch 2, Iter 3400| Avg Loss = 162.82| Avg. ppl = 10053.36| Speed 3120.52 words/sec| Time 20.36 min|
| Epoch 2, Iter 3500| Avg Loss = 160.47| Avg. ppl = 8765.39| Speed 3193.89 words/sec| Time 20.95 min|
| Epoch 2, Iter 3600| Avg Loss = 158.07| Avg. ppl = 8087.84| Speed 3153.41 words/sec| Time 21.54 min|
| Epoch 2, Iter 3700| Avg Loss = 159.14| Avg. ppl = 7985.04| Speed 3144.34 words/sec| Time 22.14 min|
| Epoch 2, Iter 3800| Avg Loss = 156.59| Avg. ppl = 7276.83| Speed 3139.99 words/sec| Time 22.74 min|
| Epoch 2, Iter 3900| Avg Loss = 155.62| Avg. ppl = 6597.48| Speed 3185.04 words/sec| Time 23.33 min|
| Epoch 2, Iter 4000| Avg Loss = 

Report on validation set:
Validation:  Dev. ppl = 8524.770864
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 2, Iter 4100| Avg Loss = 155.23| Avg. ppl = 6106.20| Speed 2997.37 words/sec| Time 24.56 min|
| Epoch 2, Iter 4200| Avg Loss = 154.29| Avg. ppl = 5480.70| Speed 3162.21 words/sec| Time 25.17 min|
| Epoch 2, Iter 4300| Avg Loss = 152.50| Avg. ppl = 5236.89| Speed 3157.33 words/sec| Time 25.77 min|
| Epoch 2, Iter 4400| Avg Loss = 151.03| Avg. ppl = 4811.31| Speed 3201.75 words/sec| Time 26.36 min|
| Epoch 2, Iter 4500| Avg Loss = 150.58| Avg. ppl = 4854.88| Speed 3247.47 words/sec| Time 26.95 min|
| Epoch 2, Iter 4600| Avg Loss = 149.42| Avg. ppl = 4562.59| Speed 3131.00 words/sec| Time 27.55 min|
| Epoch 2, Iter 4700| Avg Loss = 148.40| Avg. ppl = 4197.07| Speed 3189.27 words/sec| Time 28.15 min|
| Epoch 2, Iter 4800| Avg Loss = 148.32| Avg. ppl = 4211.55| Speed 3188.61 words/sec| Time 28.74 min|
| Epoch 2, Iter 4900| Avg Loss = 147.42| Avg. ppl = 4040.59| Speed 3192.46 words/sec| Time 29.33 min|
| Epoch 2, Iter 5000| Avg Loss = 143.

Report on validation set:
Validation:  Dev. ppl = 4964.702547
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 2, Iter 5100| Avg Loss = 145.75| Avg. ppl = 3776.30| Speed 2950.69 words/sec| Time 30.56 min|
| Epoch 2, Iter 5200| Avg Loss = 144.14| Avg. ppl = 3624.95| Speed 3157.96 words/sec| Time 31.15 min|
| Epoch 2, Iter 5300| Avg Loss = 142.54| Avg. ppl = 3339.45| Speed 3185.46 words/sec| Time 31.74 min|
| Epoch 2, Iter 5400| Avg Loss = 145.44| Avg. ppl = 3258.99| Speed 3153.49 words/sec| Time 32.35 min|
| Epoch 2, Iter 5500| Avg Loss = 143.18| Avg. ppl = 3270.93| Speed 3164.31 words/sec| Time 32.95 min|
| Epoch 2, Iter 5600| Avg Loss = 141.46| Avg. ppl = 3122.54| Speed 3199.21 words/sec| Time 33.53 min|
| Epoch 2, Iter 5700| Avg Loss = 142.98| Avg. ppl = 3028.06| Speed 3191.67 words/sec| Time 34.13 min|
| Epoch 2, Iter 5800| Avg Loss = 140.66| Avg. ppl = 2919.64| Speed 3199.99 words/sec| Time 34.72 min|
| Epoch 2, Iter 5900| Avg Loss = 138.08| Avg. ppl = 2729.25| Speed 3180.11 words/sec| Time 35.30 min|
| Epoch 2, Iter 6000| Avg Loss = 140.

Report on validation set:
Validation:  Dev. ppl = 3238.770114
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 2, Iter 6100| Avg Loss = 137.69| Avg. ppl = 2620.16| Speed 2969.27 words/sec| Time 36.52 min|
| Epoch 2, Iter 6200| Avg Loss = 138.47| Avg. ppl = 2543.41| Speed 3178.98 words/sec| Time 37.12 min|
| Epoch 2, Iter 6300| Avg Loss = 137.23| Avg. ppl = 2459.07| Speed 3187.41 words/sec| Time 37.71 min|
| Epoch 2, Iter 6400| Avg Loss = 134.95| Avg. ppl = 2272.59| Speed 3145.45 words/sec| Time 38.30 min|
| Epoch 2, Iter 6500| Avg Loss = 136.65| Avg. ppl = 2341.41| Speed 3218.20 words/sec| Time 38.88 min|
| Epoch 2, Iter 6600| Avg Loss = 135.08| Avg. ppl = 2284.64| Speed 3176.56 words/sec| Time 39.47 min|
| Epoch 2, Iter 6700| Avg Loss = 136.34| Avg. ppl = 2227.57| Speed 3252.26 words/sec| Time 40.05 min|
| Epoch 3, Iter 6800| Avg Loss = 133.43| Avg. ppl = 2031.23| Speed 3124.87 words/sec| Time 40.64 min|
| Epoch 3, Iter 6900| Avg Loss = 132.97| Avg. ppl = 1756.85| Speed 3121.65 words/sec| Time 41.25 min|
| Epoch 3, Iter 7000| Avg Loss = 130.

Report on validation set:
Validation:  Dev. ppl = 2417.697553
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 3, Iter 7100| Avg Loss = 132.35| Avg. ppl = 1760.37| Speed 3017.20 words/sec| Time 42.47 min|
| Epoch 3, Iter 7200| Avg Loss = 132.45| Avg. ppl = 1685.92| Speed 3167.10 words/sec| Time 43.07 min|
| Epoch 3, Iter 7300| Avg Loss = 129.08| Avg. ppl = 1554.29| Speed 3169.54 words/sec| Time 43.66 min|
| Epoch 3, Iter 7400| Avg Loss = 131.49| Avg. ppl = 1708.92| Speed 3215.78 words/sec| Time 44.25 min|
| Epoch 3, Iter 7500| Avg Loss = 130.50| Avg. ppl = 1584.80| Speed 3230.49 words/sec| Time 44.83 min|
| Epoch 3, Iter 7600| Avg Loss = 128.97| Avg. ppl = 1501.78| Speed 3107.14 words/sec| Time 45.44 min|
| Epoch 3, Iter 7700| Avg Loss = 130.28| Avg. ppl = 1574.87| Speed 3235.93 words/sec| Time 46.02 min|
| Epoch 3, Iter 7800| Avg Loss = 130.86| Avg. ppl = 1592.18| Speed 3219.01 words/sec| Time 46.61 min|
| Epoch 3, Iter 7900| Avg Loss = 127.38| Avg. ppl = 1466.25| Speed 3152.77 words/sec| Time 47.20 min|
| Epoch 3, Iter 8000| Avg Loss = 128.

Report on validation set:
Validation:  Dev. ppl = 1980.133993
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 3, Iter 8100| Avg Loss = 127.97| Avg. ppl = 1411.76| Speed 2926.75 words/sec| Time 48.44 min|
| Epoch 3, Iter 8200| Avg Loss = 128.34| Avg. ppl = 1368.32| Speed 3193.39 words/sec| Time 49.04 min|
| Epoch 3, Iter 8300| Avg Loss = 128.63| Avg. ppl = 1411.49| Speed 3169.75 words/sec| Time 49.63 min|
| Epoch 3, Iter 8400| Avg Loss = 128.27| Avg. ppl = 1417.86| Speed 3140.27 words/sec| Time 50.24 min|
| Epoch 3, Iter 8500| Avg Loss = 126.59| Avg. ppl = 1346.96| Speed 3214.66 words/sec| Time 50.82 min|
| Epoch 3, Iter 8600| Avg Loss = 126.71| Avg. ppl = 1300.20| Speed 3197.73 words/sec| Time 51.41 min|
| Epoch 3, Iter 8700| Avg Loss = 128.24| Avg. ppl = 1396.25| Speed 3215.46 words/sec| Time 52.00 min|
| Epoch 3, Iter 8800| Avg Loss = 126.18| Avg. ppl = 1270.11| Speed 3159.28 words/sec| Time 52.59 min|
| Epoch 3, Iter 8900| Avg Loss = 126.78| Avg. ppl = 1255.19| Speed 3211.21 words/sec| Time 53.18 min|
| Epoch 3, Iter 9000| Avg Loss = 125.

Report on validation set:
Validation:  Dev. ppl = 1715.254622
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 3, Iter 9100| Avg Loss = 125.72| Avg. ppl = 1259.14| Speed 3006.79 words/sec| Time 54.39 min|
| Epoch 3, Iter 9200| Avg Loss = 125.27| Avg. ppl = 1227.30| Speed 3219.36 words/sec| Time 54.98 min|
| Epoch 3, Iter 9300| Avg Loss = 124.87| Avg. ppl = 1232.15| Speed 3152.86 words/sec| Time 55.57 min|
| Epoch 3, Iter 9400| Avg Loss = 126.76| Avg. ppl = 1161.08| Speed 3205.33 words/sec| Time 56.17 min|
| Epoch 3, Iter 9500| Avg Loss = 123.92| Avg. ppl = 1160.62| Speed 3260.12 words/sec| Time 56.74 min|
| Epoch 3, Iter 9600| Avg Loss = 125.51| Avg. ppl = 1129.05| Speed 3208.14 words/sec| Time 57.34 min|
| Epoch 3, Iter 9700| Avg Loss = 125.40| Avg. ppl = 1120.42| Speed 3139.38 words/sec| Time 57.94 min|
| Epoch 3, Iter 9800| Avg Loss = 122.60| Avg. ppl = 1075.02| Speed 3177.75 words/sec| Time 58.53 min|
| Epoch 3, Iter 9900| Avg Loss = 124.60| Avg. ppl = 1121.39| Speed 3218.36 words/sec| Time 59.12 min|
| Epoch 3, Iter 10000| Avg Loss = 123

Report on validation set:
Validation:  Dev. ppl = 1403.582833
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 3, Iter 10100| Avg Loss = 123.29| Avg. ppl = 1121.89| Speed 2960.84 words/sec| Time 60.34 min|
| Epoch 4, Iter 10200| Avg Loss = 123.83| Avg. ppl = 978.13| Speed 3234.77 words/sec| Time 60.93 min|
| Epoch 4, Iter 10300| Avg Loss = 119.30| Avg. ppl = 846.57| Speed 3143.69 words/sec| Time 61.53 min|
| Epoch 4, Iter 10400| Avg Loss = 122.45| Avg. ppl = 887.86| Speed 3206.25 words/sec| Time 62.13 min|
| Epoch 4, Iter 10500| Avg Loss = 120.21| Avg. ppl = 892.58| Speed 3209.14 words/sec| Time 62.72 min|
| Epoch 4, Iter 10600| Avg Loss = 119.09| Avg. ppl = 807.65| Speed 3135.53 words/sec| Time 63.33 min|
| Epoch 4, Iter 10700| Avg Loss = 119.74| Avg. ppl = 900.37| Speed 3166.18 words/sec| Time 63.92 min|
| Epoch 4, Iter 10800| Avg Loss = 120.70| Avg. ppl = 873.21| Speed 3278.36 words/sec| Time 64.50 min|
| Epoch 4, Iter 10900| Avg Loss = 119.84| Avg. ppl = 871.93| Speed 3172.45 words/sec| Time 65.10 min|
| Epoch 4, Iter 11000| Avg Loss = 11

Report on validation set:
Validation:  Dev. ppl = 1240.418767
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 4, Iter 11100| Avg Loss = 119.11| Avg. ppl = 861.91| Speed 3050.50 words/sec| Time 66.31 min|
| Epoch 4, Iter 11200| Avg Loss = 119.64| Avg. ppl = 883.79| Speed 3108.54 words/sec| Time 66.91 min|
| Epoch 4, Iter 11300| Avg Loss = 118.50| Avg. ppl = 847.53| Speed 3202.29 words/sec| Time 67.50 min|
| Epoch 4, Iter 11400| Avg Loss = 119.57| Avg. ppl = 841.00| Speed 3244.25 words/sec| Time 68.08 min|
| Epoch 4, Iter 11500| Avg Loss = 117.44| Avg. ppl = 805.28| Speed 3206.96 words/sec| Time 68.67 min|
| Epoch 4, Iter 11600| Avg Loss = 119.21| Avg. ppl = 858.21| Speed 3189.70 words/sec| Time 69.26 min|
| Epoch 4, Iter 11700| Avg Loss = 116.87| Avg. ppl = 809.95| Speed 3196.78 words/sec| Time 69.84 min|
| Epoch 4, Iter 11800| Avg Loss = 118.82| Avg. ppl = 803.79| Speed 3222.65 words/sec| Time 70.43 min|
| Epoch 4, Iter 11900| Avg Loss = 119.84| Avg. ppl = 819.12| Speed 3203.26 words/sec| Time 71.02 min|
| Epoch 4, Iter 12000| Avg Loss = 117

Report on validation set:
Validation:  Dev. ppl = 1163.011883
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 4, Iter 12100| Avg Loss = 116.85| Avg. ppl = 757.35| Speed 2995.88 words/sec| Time 72.24 min|
| Epoch 4, Iter 12200| Avg Loss = 115.18| Avg. ppl = 736.04| Speed 3141.58 words/sec| Time 72.83 min|
| Epoch 4, Iter 12300| Avg Loss = 117.96| Avg. ppl = 814.09| Speed 3207.77 words/sec| Time 73.42 min|
| Epoch 4, Iter 12400| Avg Loss = 117.58| Avg. ppl = 769.27| Speed 3169.34 words/sec| Time 74.01 min|
| Epoch 4, Iter 12500| Avg Loss = 118.42| Avg. ppl = 766.17| Speed 3146.05 words/sec| Time 74.62 min|
| Epoch 4, Iter 12600| Avg Loss = 116.51| Avg. ppl = 779.22| Speed 3193.86 words/sec| Time 75.20 min|
| Epoch 4, Iter 12700| Avg Loss = 119.01| Avg. ppl = 797.16| Speed 3198.19 words/sec| Time 75.79 min|
| Epoch 4, Iter 12800| Avg Loss = 116.52| Avg. ppl = 773.64| Speed 3138.10 words/sec| Time 76.39 min|
| Epoch 4, Iter 12900| Avg Loss = 116.54| Avg. ppl = 775.65| Speed 3214.03 words/sec| Time 76.97 min|
| Epoch 4, Iter 13000| Avg Loss = 116

Report on validation set:
Validation:  Dev. ppl = 1076.141028
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 4, Iter 13100| Avg Loss = 117.23| Avg. ppl = 728.98| Speed 3008.83 words/sec| Time 78.18 min|
| Epoch 4, Iter 13200| Avg Loss = 115.14| Avg. ppl = 699.35| Speed 3219.77 words/sec| Time 78.76 min|
| Epoch 4, Iter 13300| Avg Loss = 116.13| Avg. ppl = 746.15| Speed 3230.34 words/sec| Time 79.34 min|
| Epoch 4, Iter 13400| Avg Loss = 117.52| Avg. ppl = 724.69| Speed 3167.29 words/sec| Time 79.94 min|
| Epoch 4, Iter 13500| Avg Loss = 117.35| Avg. ppl = 729.84| Speed 3210.07 words/sec| Time 80.54 min|
| Epoch 5, Iter 13600| Avg Loss = 116.25| Avg. ppl = 643.67| Speed 3214.59 words/sec| Time 81.13 min|
| Epoch 5, Iter 13700| Avg Loss = 111.29| Avg. ppl = 558.00| Speed 3200.02 words/sec| Time 81.72 min|
| Epoch 5, Iter 13800| Avg Loss = 112.05| Avg. ppl = 573.20| Speed 3221.48 words/sec| Time 82.30 min|
| Epoch 5, Iter 13900| Avg Loss = 114.91| Avg. ppl = 595.03| Speed 3198.33 words/sec| Time 82.90 min|
| Epoch 5, Iter 14000| Avg Loss = 110

Report on validation set:
Validation:  Dev. ppl = 979.254655
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 5, Iter 14100| Avg Loss = 114.85| Avg. ppl = 654.63| Speed 2985.60 words/sec| Time 84.14 min|
| Epoch 5, Iter 14200| Avg Loss = 112.84| Avg. ppl = 592.92| Speed 3211.13 words/sec| Time 84.73 min|
| Epoch 5, Iter 14300| Avg Loss = 113.79| Avg. ppl = 577.44| Speed 3167.13 words/sec| Time 85.33 min|
| Epoch 5, Iter 14400| Avg Loss = 114.89| Avg. ppl = 610.07| Speed 3241.72 words/sec| Time 85.92 min|
| Epoch 5, Iter 14500| Avg Loss = 111.93| Avg. ppl = 582.51| Speed 3182.81 words/sec| Time 86.51 min|
| Epoch 5, Iter 14600| Avg Loss = 113.63| Avg. ppl = 596.38| Speed 3228.49 words/sec| Time 87.10 min|
| Epoch 5, Iter 14700| Avg Loss = 114.36| Avg. ppl = 603.05| Speed 3218.46 words/sec| Time 87.69 min|
| Epoch 5, Iter 14800| Avg Loss = 114.08| Avg. ppl = 584.85| Speed 3237.58 words/sec| Time 88.28 min|
| Epoch 5, Iter 14900| Avg Loss = 112.97| Avg. ppl = 603.36| Speed 3129.23 words/sec| Time 88.88 min|
| Epoch 5, Iter 15000| Avg Loss = 112

Report on validation set:
Validation:  Dev. ppl = 879.443240
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 5, Iter 15100| Avg Loss = 111.19| Avg. ppl = 562.54| Speed 2970.24 words/sec| Time 90.10 min|
| Epoch 5, Iter 15200| Avg Loss = 111.43| Avg. ppl = 580.45| Speed 3207.18 words/sec| Time 90.69 min|
| Epoch 5, Iter 15300| Avg Loss = 111.67| Avg. ppl = 593.69| Speed 3162.42 words/sec| Time 91.27 min|
| Epoch 5, Iter 15400| Avg Loss = 111.47| Avg. ppl = 572.44| Speed 3182.63 words/sec| Time 91.86 min|
| Epoch 5, Iter 15500| Avg Loss = 112.68| Avg. ppl = 596.11| Speed 3186.66 words/sec| Time 92.45 min|
| Epoch 5, Iter 15600| Avg Loss = 112.61| Avg. ppl = 572.88| Speed 3198.69 words/sec| Time 93.04 min|
| Epoch 5, Iter 15700| Avg Loss = 112.27| Avg. ppl = 582.44| Speed 3207.09 words/sec| Time 93.63 min|
| Epoch 5, Iter 15800| Avg Loss = 111.86| Avg. ppl = 565.82| Speed 3116.74 words/sec| Time 94.24 min|
| Epoch 5, Iter 15900| Avg Loss = 113.47| Avg. ppl = 594.18| Speed 3238.56 words/sec| Time 94.82 min|
| Epoch 5, Iter 16000| Avg Loss = 109

Report on validation set:
Validation:  Dev. ppl = 853.409954
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 5, Iter 16100| Avg Loss = 112.55| Avg. ppl = 574.04| Speed 2977.85 words/sec| Time 96.05 min|
| Epoch 5, Iter 16200| Avg Loss = 112.50| Avg. ppl = 565.07| Speed 3187.74 words/sec| Time 96.64 min|
| Epoch 5, Iter 16300| Avg Loss = 110.18| Avg. ppl = 532.08| Speed 3182.62 words/sec| Time 97.23 min|
| Epoch 5, Iter 16400| Avg Loss = 111.04| Avg. ppl = 538.11| Speed 3103.68 words/sec| Time 97.84 min|
| Epoch 5, Iter 16500| Avg Loss = 110.73| Avg. ppl = 565.77| Speed 3214.34 words/sec| Time 98.41 min|
| Epoch 5, Iter 16600| Avg Loss = 110.32| Avg. ppl = 534.49| Speed 3126.01 words/sec| Time 99.01 min|
| Epoch 5, Iter 16700| Avg Loss = 110.69| Avg. ppl = 551.20| Speed 3169.96 words/sec| Time 99.60 min|
| Epoch 5, Iter 16800| Avg Loss = 111.70| Avg. ppl = 569.34| Speed 3224.99 words/sec| Time 100.19 min|
| Epoch 5, Iter 16900| Avg Loss = 111.09| Avg. ppl = 542.29| Speed 3188.25 words/sec| Time 100.78 min|
| Epoch 6, Iter 17000| Avg Loss = 1

Report on validation set:
Validation:  Dev. ppl = 818.582296
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 6, Iter 17100| Avg Loss = 107.12| Avg. ppl = 445.24| Speed 2973.12 words/sec| Time 102.01 min|
| Epoch 6, Iter 17200| Avg Loss = 109.00| Avg. ppl = 465.71| Speed 3219.92 words/sec| Time 102.60 min|
| Epoch 6, Iter 17300| Avg Loss = 107.99| Avg. ppl = 447.47| Speed 3198.96 words/sec| Time 103.19 min|
| Epoch 6, Iter 17400| Avg Loss = 107.89| Avg. ppl = 453.53| Speed 3202.35 words/sec| Time 103.78 min|
| Epoch 6, Iter 17500| Avg Loss = 108.13| Avg. ppl = 449.38| Speed 3148.74 words/sec| Time 104.38 min|
| Epoch 6, Iter 17600| Avg Loss = 108.71| Avg. ppl = 456.87| Speed 3174.19 words/sec| Time 104.97 min|
| Epoch 6, Iter 17700| Avg Loss = 109.10| Avg. ppl = 464.53| Speed 3161.63 words/sec| Time 105.57 min|
| Epoch 6, Iter 17800| Avg Loss = 108.45| Avg. ppl = 466.91| Speed 3234.98 words/sec| Time 106.16 min|
| Epoch 6, Iter 17900| Avg Loss = 108.23| Avg. ppl = 486.46| Speed 3164.74 words/sec| Time 106.74 min|
| Epoch 6, Iter 18000| Avg L

Report on validation set:
Validation:  Dev. ppl = 784.627137
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 6, Iter 18100| Avg Loss = 107.91| Avg. ppl = 447.13| Speed 2944.17 words/sec| Time 107.97 min|
| Epoch 6, Iter 18200| Avg Loss = 109.07| Avg. ppl = 455.96| Speed 3157.49 words/sec| Time 108.57 min|
| Epoch 6, Iter 18300| Avg Loss = 109.23| Avg. ppl = 466.88| Speed 3169.91 words/sec| Time 109.17 min|
| Epoch 6, Iter 18400| Avg Loss = 108.73| Avg. ppl = 450.57| Speed 3195.19 words/sec| Time 109.76 min|
| Epoch 6, Iter 18500| Avg Loss = 108.67| Avg. ppl = 453.48| Speed 3160.40 words/sec| Time 110.36 min|
| Epoch 6, Iter 18600| Avg Loss = 108.15| Avg. ppl = 455.75| Speed 3201.29 words/sec| Time 110.95 min|
| Epoch 6, Iter 18700| Avg Loss = 109.95| Avg. ppl = 475.43| Speed 3225.65 words/sec| Time 111.54 min|
| Epoch 6, Iter 18800| Avg Loss = 109.13| Avg. ppl = 473.31| Speed 3241.96 words/sec| Time 112.12 min|
| Epoch 6, Iter 18900| Avg Loss = 106.92| Avg. ppl = 439.61| Speed 3141.72 words/sec| Time 112.72 min|
| Epoch 6, Iter 19000| Avg L

Report on validation set:
Validation:  Dev. ppl = 762.712660
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 6, Iter 19100| Avg Loss = 110.26| Avg. ppl = 472.55| Speed 2983.73 words/sec| Time 113.96 min|
| Epoch 6, Iter 19200| Avg Loss = 107.69| Avg. ppl = 451.43| Speed 3172.54 words/sec| Time 114.55 min|
| Epoch 6, Iter 19300| Avg Loss = 105.88| Avg. ppl = 432.12| Speed 3225.72 words/sec| Time 115.12 min|
| Epoch 6, Iter 19400| Avg Loss = 107.48| Avg. ppl = 459.55| Speed 3143.94 words/sec| Time 115.72 min|
| Epoch 6, Iter 19500| Avg Loss = 106.59| Avg. ppl = 447.35| Speed 3142.64 words/sec| Time 116.31 min|
| Epoch 6, Iter 19600| Avg Loss = 106.49| Avg. ppl = 441.69| Speed 3138.94 words/sec| Time 116.91 min|
| Epoch 6, Iter 19700| Avg Loss = 107.85| Avg. ppl = 442.28| Speed 3170.83 words/sec| Time 117.50 min|
| Epoch 6, Iter 19800| Avg Loss = 108.73| Avg. ppl = 467.95| Speed 3211.42 words/sec| Time 118.09 min|
| Epoch 6, Iter 19900| Avg Loss = 109.12| Avg. ppl = 453.22| Speed 3167.87 words/sec| Time 118.69 min|
| Epoch 6, Iter 20000| Avg L

Report on validation set:
Validation:  Dev. ppl = 727.473818
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 6, Iter 20100| Avg Loss = 108.33| Avg. ppl = 442.55| Speed 2992.33 words/sec| Time 119.91 min|
| Epoch 6, Iter 20200| Avg Loss = 108.34| Avg. ppl = 460.29| Speed 3180.37 words/sec| Time 120.51 min|
| Epoch 6, Iter 20300| Avg Loss = 106.97| Avg. ppl = 437.04| Speed 3170.56 words/sec| Time 121.10 min|
| Epoch 7, Iter 20400| Avg Loss = 102.58| Avg. ppl = 364.21| Speed 3191.02 words/sec| Time 121.68 min|
| Epoch 7, Iter 20500| Avg Loss = 103.80| Avg. ppl = 372.37| Speed 3154.92 words/sec| Time 122.27 min|
| Epoch 7, Iter 20600| Avg Loss = 103.78| Avg. ppl = 365.18| Speed 3138.78 words/sec| Time 122.87 min|
| Epoch 7, Iter 20700| Avg Loss = 105.43| Avg. ppl = 368.77| Speed 3145.88 words/sec| Time 123.47 min|
| Epoch 7, Iter 20800| Avg Loss = 105.68| Avg. ppl = 380.69| Speed 3181.55 words/sec| Time 124.07 min|
| Epoch 7, Iter 20900| Avg Loss = 105.15| Avg. ppl = 374.72| Speed 3196.94 words/sec| Time 124.66 min|
| Epoch 7, Iter 21000| Avg L

Report on validation set:
Validation:  Dev. ppl = 684.834031
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 7, Iter 21100| Avg Loss = 103.84| Avg. ppl = 379.56| Speed 2994.75 words/sec| Time 125.88 min|
| Epoch 7, Iter 21200| Avg Loss = 105.08| Avg. ppl = 387.46| Speed 3205.00 words/sec| Time 126.47 min|
| Epoch 7, Iter 21300| Avg Loss = 103.25| Avg. ppl = 377.47| Speed 3114.13 words/sec| Time 127.06 min|
| Epoch 7, Iter 21400| Avg Loss = 106.41| Avg. ppl = 395.46| Speed 3208.00 words/sec| Time 127.65 min|
| Epoch 7, Iter 21500| Avg Loss = 105.70| Avg. ppl = 389.09| Speed 3152.40 words/sec| Time 128.25 min|
| Epoch 7, Iter 21600| Avg Loss = 104.32| Avg. ppl = 361.26| Speed 3145.10 words/sec| Time 128.85 min|
| Epoch 7, Iter 21700| Avg Loss = 105.20| Avg. ppl = 377.30| Speed 3166.53 words/sec| Time 129.45 min|
| Epoch 7, Iter 21800| Avg Loss = 105.25| Avg. ppl = 363.85| Speed 3216.78 words/sec| Time 130.04 min|
| Epoch 7, Iter 21900| Avg Loss = 103.56| Avg. ppl = 371.86| Speed 3151.05 words/sec| Time 130.64 min|
| Epoch 7, Iter 22000| Avg L

Report on validation set:
Validation:  Dev. ppl = 672.666501
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 7, Iter 22100| Avg Loss = 104.04| Avg. ppl = 380.74| Speed 3027.61 words/sec| Time 131.85 min|
| Epoch 7, Iter 22200| Avg Loss = 105.97| Avg. ppl = 397.41| Speed 3135.93 words/sec| Time 132.46 min|
| Epoch 7, Iter 22300| Avg Loss = 105.84| Avg. ppl = 374.04| Speed 3267.42 words/sec| Time 133.04 min|
| Epoch 7, Iter 22400| Avg Loss = 105.95| Avg. ppl = 375.33| Speed 3180.19 words/sec| Time 133.64 min|
| Epoch 7, Iter 22500| Avg Loss = 106.16| Avg. ppl = 399.69| Speed 3199.33 words/sec| Time 134.23 min|
| Epoch 7, Iter 22600| Avg Loss = 104.65| Avg. ppl = 384.11| Speed 3185.38 words/sec| Time 134.82 min|
| Epoch 7, Iter 22700| Avg Loss = 104.82| Avg. ppl = 401.94| Speed 3197.93 words/sec| Time 135.40 min|
| Epoch 7, Iter 22800| Avg Loss = 105.64| Avg. ppl = 387.64| Speed 3200.93 words/sec| Time 135.99 min|
| Epoch 7, Iter 22900| Avg Loss = 104.28| Avg. ppl = 364.62| Speed 3170.12 words/sec| Time 136.59 min|
| Epoch 7, Iter 23000| Avg L

Report on validation set:
Validation:  Dev. ppl = 655.519211
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 7, Iter 23100| Avg Loss = 104.77| Avg. ppl = 380.49| Speed 2964.85 words/sec| Time 137.81 min|
| Epoch 7, Iter 23200| Avg Loss = 106.14| Avg. ppl = 390.24| Speed 3231.92 words/sec| Time 138.39 min|
| Epoch 7, Iter 23300| Avg Loss = 105.20| Avg. ppl = 373.26| Speed 3160.52 words/sec| Time 138.99 min|
| Epoch 7, Iter 23400| Avg Loss = 104.76| Avg. ppl = 361.75| Speed 3151.21 words/sec| Time 139.60 min|
| Epoch 7, Iter 23500| Avg Loss = 104.58| Avg. ppl = 368.89| Speed 3217.58 words/sec| Time 140.18 min|
| Epoch 7, Iter 23600| Avg Loss = 105.79| Avg. ppl = 393.26| Speed 3314.06 words/sec| Time 140.75 min|
| Epoch 8, Iter 23700| Avg Loss = 105.13| Avg. ppl = 376.73| Speed 3164.43 words/sec| Time 141.35 min|
| Epoch 8, Iter 23800| Avg Loss = 102.09| Avg. ppl = 314.34| Speed 3172.68 words/sec| Time 141.94 min|
| Epoch 8, Iter 23900| Avg Loss = 100.93| Avg. ppl = 308.37| Speed 3213.28 words/sec| Time 142.53 min|
| Epoch 8, Iter 24000| Avg L

Report on validation set:
Validation:  Dev. ppl = 630.951642
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 8, Iter 24100| Avg Loss = 103.10| Avg. ppl = 320.57| Speed 3007.74 words/sec| Time 143.74 min|
| Epoch 8, Iter 24200| Avg Loss = 101.05| Avg. ppl = 321.73| Speed 3146.74 words/sec| Time 144.34 min|
| Epoch 8, Iter 24300| Avg Loss = 102.54| Avg. ppl = 314.35| Speed 3139.73 words/sec| Time 144.94 min|
| Epoch 8, Iter 24400| Avg Loss = 102.28| Avg. ppl = 332.73| Speed 3178.06 words/sec| Time 145.53 min|
| Epoch 8, Iter 24500| Avg Loss = 101.83| Avg. ppl = 318.58| Speed 3179.39 words/sec| Time 146.13 min|
| Epoch 8, Iter 24600| Avg Loss = 102.39| Avg. ppl = 319.32| Speed 3166.57 words/sec| Time 146.72 min|
| Epoch 8, Iter 24700| Avg Loss = 102.24| Avg. ppl = 328.66| Speed 3156.51 words/sec| Time 147.32 min|
| Epoch 8, Iter 24800| Avg Loss = 104.59| Avg. ppl = 336.42| Speed 3194.19 words/sec| Time 147.92 min|
| Epoch 8, Iter 24900| Avg Loss = 101.00| Avg. ppl = 310.86| Speed 3139.66 words/sec| Time 148.52 min|
| Epoch 8, Iter 25000| Avg L

Report on validation set:
Validation:  Dev. ppl = 595.575111
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 8, Iter 25100| Avg Loss = 102.32| Avg. ppl = 347.06| Speed 2963.13 words/sec| Time 149.74 min|
| Epoch 8, Iter 25200| Avg Loss = 102.51| Avg. ppl = 342.56| Speed 3153.04 words/sec| Time 150.34 min|
| Epoch 8, Iter 25300| Avg Loss = 101.75| Avg. ppl = 322.86| Speed 3174.49 words/sec| Time 150.93 min|
| Epoch 8, Iter 25400| Avg Loss = 103.02| Avg. ppl = 334.74| Speed 3159.69 words/sec| Time 151.53 min|
| Epoch 8, Iter 25500| Avg Loss = 101.25| Avg. ppl = 325.41| Speed 3179.99 words/sec| Time 152.11 min|
| Epoch 8, Iter 25600| Avg Loss = 102.40| Avg. ppl = 349.48| Speed 3166.66 words/sec| Time 152.70 min|
| Epoch 8, Iter 25700| Avg Loss = 103.01| Avg. ppl = 332.55| Speed 3210.06 words/sec| Time 153.29 min|
| Epoch 8, Iter 25800| Avg Loss = 103.76| Avg. ppl = 328.47| Speed 3151.71 words/sec| Time 153.90 min|
| Epoch 8, Iter 25900| Avg Loss = 101.85| Avg. ppl = 327.39| Speed 3218.65 words/sec| Time 154.48 min|
| Epoch 8, Iter 26000| Avg L

Report on validation set:
Validation:  Dev. ppl = 587.557402
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 8, Iter 26100| Avg Loss = 101.14| Avg. ppl = 323.29| Speed 2956.87 words/sec| Time 155.69 min|
| Epoch 8, Iter 26200| Avg Loss = 103.12| Avg. ppl = 334.36| Speed 3220.69 words/sec| Time 156.28 min|
| Epoch 8, Iter 26300| Avg Loss = 103.54| Avg. ppl = 347.21| Speed 3210.68 words/sec| Time 156.87 min|
| Epoch 8, Iter 26400| Avg Loss = 102.11| Avg. ppl = 351.34| Speed 3147.98 words/sec| Time 157.46 min|
| Epoch 8, Iter 26500| Avg Loss = 103.83| Avg. ppl = 329.97| Speed 3190.07 words/sec| Time 158.06 min|
| Epoch 8, Iter 26600| Avg Loss = 101.39| Avg. ppl = 339.22| Speed 3145.19 words/sec| Time 158.65 min|
| Epoch 8, Iter 26700| Avg Loss = 103.79| Avg. ppl = 335.29| Speed 3172.36 words/sec| Time 159.25 min|
| Epoch 8, Iter 26800| Avg Loss = 101.67| Avg. ppl = 321.92| Speed 3175.42 words/sec| Time 159.84 min|
| Epoch 8, Iter 26900| Avg Loss = 102.00| Avg. ppl = 328.91| Speed 3247.53 words/sec| Time 160.42 min|
| Epoch 8, Iter 27000| Avg L

Report on validation set:
Validation:  Dev. ppl = 561.609681
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 9, Iter 27100| Avg Loss = 102.39| Avg. ppl = 325.29| Speed 3001.07 words/sec| Time 161.63 min|
| Epoch 9, Iter 27200| Avg Loss = 99.14| Avg. ppl = 273.10| Speed 3190.38 words/sec| Time 162.23 min|
| Epoch 9, Iter 27300| Avg Loss = 99.75| Avg. ppl = 290.64| Speed 3130.15 words/sec| Time 162.82 min|
| Epoch 9, Iter 27400| Avg Loss = 99.20| Avg. ppl = 274.41| Speed 3193.72 words/sec| Time 163.41 min|
| Epoch 9, Iter 27500| Avg Loss = 102.34| Avg. ppl = 292.06| Speed 3183.00 words/sec| Time 164.02 min|
| Epoch 9, Iter 27600| Avg Loss = 101.46| Avg. ppl = 296.01| Speed 3164.15 words/sec| Time 164.62 min|
| Epoch 9, Iter 27700| Avg Loss = 99.28| Avg. ppl = 290.95| Speed 3220.96 words/sec| Time 165.20 min|
| Epoch 9, Iter 27800| Avg Loss = 100.77| Avg. ppl = 285.95| Speed 3191.00 words/sec| Time 165.79 min|
| Epoch 9, Iter 27900| Avg Loss = 100.62| Avg. ppl = 292.87| Speed 3165.01 words/sec| Time 166.39 min|
| Epoch 9, Iter 28000| Avg Loss 

Report on validation set:
Validation:  Dev. ppl = 590.877289
Hit patience 1


| Epoch 9, Iter 28100| Avg Loss = 101.46| Avg. ppl = 294.43| Speed 3209.59 words/sec| Time 167.58 min|
| Epoch 9, Iter 28200| Avg Loss = 98.78| Avg. ppl = 284.87| Speed 3196.65 words/sec| Time 168.16 min|
| Epoch 9, Iter 28300| Avg Loss = 99.43| Avg. ppl = 293.06| Speed 3170.05 words/sec| Time 168.75 min|
| Epoch 9, Iter 28400| Avg Loss = 98.07| Avg. ppl = 276.53| Speed 3192.36 words/sec| Time 169.33 min|
| Epoch 9, Iter 28500| Avg Loss = 100.88| Avg. ppl = 298.71| Speed 3134.96 words/sec| Time 169.93 min|
| Epoch 9, Iter 28600| Avg Loss = 101.52| Avg. ppl = 298.25| Speed 3204.46 words/sec| Time 170.53 min|
| Epoch 9, Iter 28700| Avg Loss = 101.76| Avg. ppl = 302.31| Speed 3196.46 words/sec| Time 171.12 min|
| Epoch 9, Iter 28800| Avg Loss = 99.04| Avg. ppl = 292.89| Speed 3156.72 words/sec| Time 171.71 min|
| Epoch 9, Iter 28900| Avg Loss = 101.58| Avg. ppl = 305.07| Speed 3226.98 words/sec| Time 172.30 min|
| Epoch 9, Iter 29000| Avg Loss = 99.84| Avg. ppl = 290.87| Speed 3180.34 wor

Report on validation set:
Validation:  Dev. ppl = 555.064584
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 9, Iter 29100| Avg Loss = 100.20| Avg. ppl = 287.82| Speed 2998.49 words/sec| Time 173.52 min|
| Epoch 9, Iter 29200| Avg Loss = 100.77| Avg. ppl = 299.86| Speed 3163.16 words/sec| Time 174.11 min|
| Epoch 9, Iter 29300| Avg Loss = 100.51| Avg. ppl = 294.35| Speed 3182.05 words/sec| Time 174.71 min|
| Epoch 9, Iter 29400| Avg Loss = 101.60| Avg. ppl = 306.26| Speed 3204.52 words/sec| Time 175.30 min|
| Epoch 9, Iter 29500| Avg Loss = 101.16| Avg. ppl = 308.41| Speed 3199.95 words/sec| Time 175.88 min|
| Epoch 9, Iter 29600| Avg Loss = 100.91| Avg. ppl = 294.56| Speed 3167.25 words/sec| Time 176.48 min|
| Epoch 9, Iter 29700| Avg Loss = 100.62| Avg. ppl = 300.95| Speed 3122.88 words/sec| Time 177.08 min|
| Epoch 9, Iter 29800| Avg Loss = 101.73| Avg. ppl = 302.18| Speed 3298.23 words/sec| Time 177.66 min|
| Epoch 9, Iter 29900| Avg Loss = 99.86| Avg. ppl = 299.09| Speed 3128.27 words/sec| Time 178.26 min|
| Epoch 9, Iter 30000| Avg Lo

Report on validation set:
Validation:  Dev. ppl = 559.824012
Hit patience 1


| Epoch 9, Iter 30100| Avg Loss = 100.96| Avg. ppl = 301.69| Speed 3033.30 words/sec| Time 179.47 min|
| Epoch 9, Iter 30200| Avg Loss = 102.01| Avg. ppl = 311.62| Speed 3283.13 words/sec| Time 180.05 min|
| Epoch 9, Iter 30300| Avg Loss = 99.18| Avg. ppl = 296.58| Speed 3161.64 words/sec| Time 180.64 min|
| Epoch 9, Iter 30400| Avg Loss = 99.34| Avg. ppl = 295.98| Speed 3163.23 words/sec| Time 181.22 min|
| Epoch 10, Iter 30500| Avg Loss = 100.21| Avg. ppl = 279.09| Speed 3191.37 words/sec| Time 181.82 min|
| Epoch 10, Iter 30600| Avg Loss = 97.10| Avg. ppl = 241.08| Speed 3233.66 words/sec| Time 182.40 min|
| Epoch 10, Iter 30700| Avg Loss = 99.77| Avg. ppl = 268.41| Speed 3164.36 words/sec| Time 183.00 min|
| Epoch 10, Iter 30800| Avg Loss = 98.31| Avg. ppl = 258.96| Speed 3137.77 words/sec| Time 183.60 min|
| Epoch 10, Iter 30900| Avg Loss = 98.05| Avg. ppl = 248.83| Speed 3187.97 words/sec| Time 184.20 min|
| Epoch 10, Iter 31000| Avg Loss = 98.64| Avg. ppl = 265.86| Speed 3154.78

Report on validation set:
Validation:  Dev. ppl = 540.865459
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 10, Iter 31100| Avg Loss = 97.28| Avg. ppl = 252.99| Speed 2983.31 words/sec| Time 185.42 min|
| Epoch 10, Iter 31200| Avg Loss = 97.49| Avg. ppl = 247.06| Speed 3145.60 words/sec| Time 186.02 min|
| Epoch 10, Iter 31300| Avg Loss = 97.88| Avg. ppl = 268.02| Speed 3108.39 words/sec| Time 186.63 min|
| Epoch 10, Iter 31400| Avg Loss = 97.25| Avg. ppl = 261.86| Speed 3119.71 words/sec| Time 187.22 min|
| Epoch 10, Iter 31500| Avg Loss = 97.44| Avg. ppl = 256.90| Speed 3163.98 words/sec| Time 187.81 min|
| Epoch 10, Iter 31600| Avg Loss = 98.06| Avg. ppl = 261.15| Speed 3214.99 words/sec| Time 188.40 min|
| Epoch 10, Iter 31700| Avg Loss = 98.99| Avg. ppl = 268.95| Speed 3214.31 words/sec| Time 188.99 min|
| Epoch 10, Iter 31800| Avg Loss = 100.43| Avg. ppl = 275.92| Speed 3232.43 words/sec| Time 189.58 min|
| Epoch 10, Iter 31900| Avg Loss = 98.78| Avg. ppl = 273.57| Speed 3176.01 words/sec| Time 190.17 min|
| Epoch 10, Iter 32000| Avg

Report on validation set:
Validation:  Dev. ppl = 558.524091
Hit patience 1


| Epoch 10, Iter 32100| Avg Loss = 97.18| Avg. ppl = 261.15| Speed 3079.03 words/sec| Time 191.36 min|
| Epoch 10, Iter 32200| Avg Loss = 99.48| Avg. ppl = 270.23| Speed 3128.32 words/sec| Time 191.97 min|
| Epoch 10, Iter 32300| Avg Loss = 99.07| Avg. ppl = 264.62| Speed 3203.39 words/sec| Time 192.56 min|
| Epoch 10, Iter 32400| Avg Loss = 99.71| Avg. ppl = 284.05| Speed 3199.44 words/sec| Time 193.15 min|
| Epoch 10, Iter 32500| Avg Loss = 97.97| Avg. ppl = 256.79| Speed 3151.46 words/sec| Time 193.75 min|
| Epoch 10, Iter 32600| Avg Loss = 98.14| Avg. ppl = 263.89| Speed 3160.15 words/sec| Time 194.34 min|
| Epoch 10, Iter 32700| Avg Loss = 99.42| Avg. ppl = 273.85| Speed 3213.87 words/sec| Time 194.93 min|
| Epoch 10, Iter 32800| Avg Loss = 98.69| Avg. ppl = 259.89| Speed 3165.91 words/sec| Time 195.53 min|
| Epoch 10, Iter 32900| Avg Loss = 99.70| Avg. ppl = 274.73| Speed 3249.73 words/sec| Time 196.11 min|
| Epoch 10, Iter 33000| Avg Loss = 98.85| Avg. ppl = 276.23| Speed 3190.4

Report on validation set:
Validation:  Dev. ppl = 522.510481
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 10, Iter 33100| Avg Loss = 98.25| Avg. ppl = 276.70| Speed 3010.11 words/sec| Time 197.32 min|
| Epoch 10, Iter 33200| Avg Loss = 97.26| Avg. ppl = 261.24| Speed 3225.27 words/sec| Time 197.90 min|
| Epoch 10, Iter 33300| Avg Loss = 98.62| Avg. ppl = 262.50| Speed 3198.97 words/sec| Time 198.49 min|
| Epoch 10, Iter 33400| Avg Loss = 100.87| Avg. ppl = 275.20| Speed 3217.46 words/sec| Time 199.08 min|
| Epoch 10, Iter 33500| Avg Loss = 99.29| Avg. ppl = 282.44| Speed 3207.61 words/sec| Time 199.67 min|
| Epoch 10, Iter 33600| Avg Loss = 99.43| Avg. ppl = 278.43| Speed 3175.16 words/sec| Time 200.26 min|
| Epoch 10, Iter 33700| Avg Loss = 99.79| Avg. ppl = 289.92| Speed 3195.48 words/sec| Time 200.85 min|
| Epoch 10, Iter 33800| Avg Loss = 100.23| Avg. ppl = 282.08| Speed 3172.84 words/sec| Time 201.44 min|
| Epoch 11, Iter 33900| Avg Loss = 98.01| Avg. ppl = 257.72| Speed 3229.16 words/sec| Time 202.03 min|
| Epoch 11, Iter 34000| Av

Report on validation set:
Validation:  Dev. ppl = 526.362694
Hit patience 1


| Epoch 11, Iter 34100| Avg Loss = 96.44| Avg. ppl = 227.60| Speed 3126.30 words/sec| Time 203.22 min|
| Epoch 11, Iter 34200| Avg Loss = 95.69| Avg. ppl = 224.40| Speed 3237.96 words/sec| Time 203.80 min|
| Epoch 11, Iter 34300| Avg Loss = 95.48| Avg. ppl = 231.67| Speed 3109.76 words/sec| Time 204.41 min|
| Epoch 11, Iter 34400| Avg Loss = 98.73| Avg. ppl = 243.59| Speed 3199.41 words/sec| Time 205.01 min|
| Epoch 11, Iter 34500| Avg Loss = 97.62| Avg. ppl = 238.63| Speed 3163.41 words/sec| Time 205.61 min|
| Epoch 11, Iter 34600| Avg Loss = 95.56| Avg. ppl = 237.97| Speed 3122.24 words/sec| Time 206.20 min|
| Epoch 11, Iter 34700| Avg Loss = 98.12| Avg. ppl = 245.12| Speed 3257.41 words/sec| Time 206.79 min|
| Epoch 11, Iter 34800| Avg Loss = 96.45| Avg. ppl = 246.52| Speed 3148.92 words/sec| Time 207.38 min|
| Epoch 11, Iter 34900| Avg Loss = 97.82| Avg. ppl = 240.53| Speed 3227.46 words/sec| Time 207.97 min|
| Epoch 11, Iter 35000| Avg Loss = 97.31| Avg. ppl = 243.46| Speed 3194.0

Report on validation set:
Validation:  Dev. ppl = 503.706882
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 11, Iter 35100| Avg Loss = 97.70| Avg. ppl = 246.21| Speed 3007.55 words/sec| Time 209.19 min|
| Epoch 11, Iter 35200| Avg Loss = 98.13| Avg. ppl = 246.68| Speed 3209.21 words/sec| Time 209.78 min|
| Epoch 11, Iter 35300| Avg Loss = 98.60| Avg. ppl = 245.82| Speed 3211.71 words/sec| Time 210.38 min|
| Epoch 11, Iter 35400| Avg Loss = 97.52| Avg. ppl = 248.85| Speed 3202.11 words/sec| Time 210.97 min|
| Epoch 11, Iter 35500| Avg Loss = 97.10| Avg. ppl = 241.79| Speed 3115.17 words/sec| Time 211.57 min|
| Epoch 11, Iter 35600| Avg Loss = 97.85| Avg. ppl = 257.71| Speed 3220.63 words/sec| Time 212.16 min|
| Epoch 11, Iter 35700| Avg Loss = 97.10| Avg. ppl = 249.18| Speed 3179.27 words/sec| Time 212.75 min|
| Epoch 11, Iter 35800| Avg Loss = 96.61| Avg. ppl = 241.45| Speed 3249.98 words/sec| Time 213.32 min|
| Epoch 11, Iter 35900| Avg Loss = 98.19| Avg. ppl = 261.90| Speed 3217.48 words/sec| Time 213.91 min|
| Epoch 11, Iter 36000| Avg 

Report on validation set:
Validation:  Dev. ppl = 494.733653
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 11, Iter 36100| Avg Loss = 96.78| Avg. ppl = 252.69| Speed 2988.40 words/sec| Time 215.13 min|
| Epoch 11, Iter 36200| Avg Loss = 97.01| Avg. ppl = 247.41| Speed 3197.48 words/sec| Time 215.71 min|
| Epoch 11, Iter 36300| Avg Loss = 96.18| Avg. ppl = 242.31| Speed 3173.38 words/sec| Time 216.30 min|
| Epoch 11, Iter 36400| Avg Loss = 96.91| Avg. ppl = 249.82| Speed 3204.89 words/sec| Time 216.89 min|
| Epoch 11, Iter 36500| Avg Loss = 97.46| Avg. ppl = 254.65| Speed 3203.91 words/sec| Time 217.47 min|
| Epoch 11, Iter 36600| Avg Loss = 97.92| Avg. ppl = 253.37| Speed 3177.60 words/sec| Time 218.07 min|
| Epoch 11, Iter 36700| Avg Loss = 97.91| Avg. ppl = 248.47| Speed 3191.65 words/sec| Time 218.66 min|
| Epoch 11, Iter 36800| Avg Loss = 98.45| Avg. ppl = 255.78| Speed 3191.84 words/sec| Time 219.25 min|
| Epoch 11, Iter 36900| Avg Loss = 98.16| Avg. ppl = 250.75| Speed 3226.92 words/sec| Time 219.84 min|
| Epoch 11, Iter 37000| Avg 

Report on validation set:
Validation:  Dev. ppl = 478.419461
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 11, Iter 37100| Avg Loss = 97.16| Avg. ppl = 250.22| Speed 3044.91 words/sec| Time 221.05 min|
| Epoch 11, Iter 37200| Avg Loss = 98.20| Avg. ppl = 259.95| Speed 3217.06 words/sec| Time 221.64 min|
| Epoch 12, Iter 37300| Avg Loss = 95.71| Avg. ppl = 223.43| Speed 3206.31 words/sec| Time 222.22 min|
| Epoch 12, Iter 37400| Avg Loss = 95.53| Avg. ppl = 209.44| Speed 3238.49 words/sec| Time 222.81 min|
| Epoch 12, Iter 37500| Avg Loss = 95.23| Avg. ppl = 213.70| Speed 3238.86 words/sec| Time 223.40 min|
| Epoch 12, Iter 37600| Avg Loss = 93.63| Avg. ppl = 211.52| Speed 3146.86 words/sec| Time 223.99 min|
| Epoch 12, Iter 37700| Avg Loss = 94.77| Avg. ppl = 222.12| Speed 3190.60 words/sec| Time 224.58 min|
| Epoch 12, Iter 37800| Avg Loss = 94.87| Avg. ppl = 220.25| Speed 3210.63 words/sec| Time 225.16 min|
| Epoch 12, Iter 37900| Avg Loss = 95.22| Avg. ppl = 217.05| Speed 3238.99 words/sec| Time 225.74 min|
| Epoch 12, Iter 38000| Avg 

Report on validation set:
Validation:  Dev. ppl = 488.564996
Hit patience 1


| Epoch 12, Iter 38100| Avg Loss = 96.81| Avg. ppl = 222.25| Speed 3096.90 words/sec| Time 226.95 min|
| Epoch 12, Iter 38200| Avg Loss = 96.29| Avg. ppl = 226.78| Speed 3178.29 words/sec| Time 227.54 min|
| Epoch 12, Iter 38300| Avg Loss = 95.59| Avg. ppl = 226.10| Speed 3175.72 words/sec| Time 228.14 min|
| Epoch 12, Iter 38400| Avg Loss = 96.14| Avg. ppl = 215.41| Speed 3190.16 words/sec| Time 228.74 min|
| Epoch 12, Iter 38500| Avg Loss = 94.53| Avg. ppl = 220.15| Speed 3141.75 words/sec| Time 229.33 min|
| Epoch 12, Iter 38600| Avg Loss = 94.89| Avg. ppl = 225.17| Speed 3134.24 words/sec| Time 229.93 min|
| Epoch 12, Iter 38700| Avg Loss = 96.25| Avg. ppl = 228.01| Speed 3158.13 words/sec| Time 230.53 min|
| Epoch 12, Iter 38800| Avg Loss = 95.91| Avg. ppl = 237.34| Speed 3159.39 words/sec| Time 231.12 min|
| Epoch 12, Iter 38900| Avg Loss = 95.68| Avg. ppl = 235.42| Speed 3203.71 words/sec| Time 231.70 min|
| Epoch 12, Iter 39000| Avg Loss = 95.13| Avg. ppl = 223.10| Speed 3192.4

Report on validation set:
Validation:  Dev. ppl = 497.884536
Hit patience 2


| Epoch 12, Iter 39100| Avg Loss = 95.89| Avg. ppl = 225.67| Speed 3149.82 words/sec| Time 232.89 min|
| Epoch 12, Iter 39200| Avg Loss = 98.35| Avg. ppl = 238.33| Speed 3221.21 words/sec| Time 233.48 min|
| Epoch 12, Iter 39300| Avg Loss = 96.51| Avg. ppl = 233.24| Speed 3183.60 words/sec| Time 234.08 min|
| Epoch 12, Iter 39400| Avg Loss = 98.17| Avg. ppl = 255.88| Speed 3217.26 words/sec| Time 234.66 min|
| Epoch 12, Iter 39500| Avg Loss = 96.50| Avg. ppl = 234.55| Speed 3169.40 words/sec| Time 235.26 min|
| Epoch 12, Iter 39600| Avg Loss = 96.19| Avg. ppl = 226.29| Speed 3279.59 words/sec| Time 235.83 min|
| Epoch 12, Iter 39700| Avg Loss = 96.20| Avg. ppl = 237.56| Speed 3173.65 words/sec| Time 236.43 min|
| Epoch 12, Iter 39800| Avg Loss = 96.32| Avg. ppl = 250.18| Speed 3192.90 words/sec| Time 237.01 min|
| Epoch 12, Iter 39900| Avg Loss = 97.11| Avg. ppl = 238.83| Speed 3179.06 words/sec| Time 237.60 min|
| Epoch 12, Iter 40000| Avg Loss = 96.19| Avg. ppl = 234.78| Speed 3159.0

Report on validation set:
Validation:  Dev. ppl = 472.882123
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 12, Iter 40100| Avg Loss = 95.17| Avg. ppl = 229.87| Speed 2958.09 words/sec| Time 238.83 min|
| Epoch 12, Iter 40200| Avg Loss = 95.78| Avg. ppl = 230.26| Speed 3224.01 words/sec| Time 239.41 min|
| Epoch 12, Iter 40300| Avg Loss = 97.14| Avg. ppl = 240.91| Speed 3169.41 words/sec| Time 240.01 min|
| Epoch 12, Iter 40400| Avg Loss = 97.46| Avg. ppl = 241.36| Speed 3199.83 words/sec| Time 240.60 min|
| Epoch 12, Iter 40500| Avg Loss = 96.00| Avg. ppl = 237.75| Speed 3175.20 words/sec| Time 241.19 min|
| Epoch 12, Iter 40600| Avg Loss = 96.28| Avg. ppl = 243.08| Speed 3114.76 words/sec| Time 241.79 min|
| Epoch 13, Iter 40700| Avg Loss = 94.27| Avg. ppl = 210.34| Speed 3123.69 words/sec| Time 242.39 min|
| Epoch 13, Iter 40800| Avg Loss = 93.64| Avg. ppl = 199.95| Speed 3127.08 words/sec| Time 242.99 min|
| Epoch 13, Iter 40900| Avg Loss = 93.59| Avg. ppl = 194.36| Speed 3146.00 words/sec| Time 243.59 min|
| Epoch 13, Iter 41000| Avg 

Report on validation set:
Validation:  Dev. ppl = 470.880589
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 13, Iter 41100| Avg Loss = 94.58| Avg. ppl = 195.98| Speed 3012.09 words/sec| Time 244.83 min|
| Epoch 13, Iter 41200| Avg Loss = 94.59| Avg. ppl = 200.89| Speed 3119.80 words/sec| Time 245.44 min|
| Epoch 13, Iter 41300| Avg Loss = 94.02| Avg. ppl = 201.73| Speed 3211.92 words/sec| Time 246.02 min|
| Epoch 13, Iter 41400| Avg Loss = 95.96| Avg. ppl = 219.08| Speed 3183.69 words/sec| Time 246.62 min|
| Epoch 13, Iter 41500| Avg Loss = 93.79| Avg. ppl = 201.41| Speed 3111.47 words/sec| Time 247.23 min|
| Epoch 13, Iter 41600| Avg Loss = 94.77| Avg. ppl = 219.21| Speed 3166.10 words/sec| Time 247.82 min|
| Epoch 13, Iter 41700| Avg Loss = 94.75| Avg. ppl = 211.61| Speed 3194.54 words/sec| Time 248.41 min|
| Epoch 13, Iter 41800| Avg Loss = 95.09| Avg. ppl = 224.79| Speed 3109.96 words/sec| Time 249.01 min|
| Epoch 13, Iter 41900| Avg Loss = 95.87| Avg. ppl = 213.07| Speed 3158.76 words/sec| Time 249.62 min|
| Epoch 13, Iter 42000| Avg 

Report on validation set:
Validation:  Dev. ppl = 474.874284
Hit patience 1


| Epoch 13, Iter 42100| Avg Loss = 93.91| Avg. ppl = 208.92| Speed 3024.61 words/sec| Time 250.83 min|
| Epoch 13, Iter 42200| Avg Loss = 95.97| Avg. ppl = 220.11| Speed 3148.94 words/sec| Time 251.43 min|
| Epoch 13, Iter 42300| Avg Loss = 95.84| Avg. ppl = 219.19| Speed 3225.21 words/sec| Time 252.02 min|
| Epoch 13, Iter 42400| Avg Loss = 94.40| Avg. ppl = 213.99| Speed 3111.57 words/sec| Time 252.62 min|
| Epoch 13, Iter 42500| Avg Loss = 95.84| Avg. ppl = 232.10| Speed 3135.13 words/sec| Time 253.22 min|
| Epoch 13, Iter 42600| Avg Loss = 93.76| Avg. ppl = 212.30| Speed 3164.10 words/sec| Time 253.81 min|
| Epoch 13, Iter 42700| Avg Loss = 95.08| Avg. ppl = 231.61| Speed 3076.73 words/sec| Time 254.41 min|
| Epoch 13, Iter 42800| Avg Loss = 95.79| Avg. ppl = 219.14| Speed 3187.18 words/sec| Time 255.01 min|
| Epoch 13, Iter 42900| Avg Loss = 93.81| Avg. ppl = 210.02| Speed 3150.77 words/sec| Time 255.60 min|
| Epoch 13, Iter 43000| Avg Loss = 96.14| Avg. ppl = 233.58| Speed 3141.7

Report on validation set:
Validation:  Dev. ppl = 442.781733
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 13, Iter 43100| Avg Loss = 94.43| Avg. ppl = 218.82| Speed 2920.80 words/sec| Time 256.84 min|
| Epoch 13, Iter 43200| Avg Loss = 95.78| Avg. ppl = 222.38| Speed 3108.26 words/sec| Time 257.45 min|
| Epoch 13, Iter 43300| Avg Loss = 96.19| Avg. ppl = 234.81| Speed 3157.44 words/sec| Time 258.04 min|
| Epoch 13, Iter 43400| Avg Loss = 96.37| Avg. ppl = 228.95| Speed 3164.99 words/sec| Time 258.64 min|
| Epoch 13, Iter 43500| Avg Loss = 95.66| Avg. ppl = 228.63| Speed 3185.33 words/sec| Time 259.23 min|
| Epoch 13, Iter 43600| Avg Loss = 94.73| Avg. ppl = 219.40| Speed 3114.87 words/sec| Time 259.83 min|
| Epoch 13, Iter 43700| Avg Loss = 96.05| Avg. ppl = 222.75| Speed 3230.63 words/sec| Time 260.42 min|
| Epoch 13, Iter 43800| Avg Loss = 94.50| Avg. ppl = 217.35| Speed 3241.21 words/sec| Time 261.00 min|
| Epoch 13, Iter 43900| Avg Loss = 95.29| Avg. ppl = 226.12| Speed 3222.14 words/sec| Time 261.58 min|
| Epoch 13, Iter 44000| Avg 

Report on validation set:
Validation:  Dev. ppl = 440.886462
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 14, Iter 44100| Avg Loss = 93.36| Avg. ppl = 199.76| Speed 2972.19 words/sec| Time 262.81 min|
| Epoch 14, Iter 44200| Avg Loss = 91.84| Avg. ppl = 180.09| Speed 3175.67 words/sec| Time 263.41 min|
| Epoch 14, Iter 44300| Avg Loss = 92.49| Avg. ppl = 198.42| Speed 3147.16 words/sec| Time 264.00 min|
| Epoch 14, Iter 44400| Avg Loss = 92.99| Avg. ppl = 186.15| Speed 3220.35 words/sec| Time 264.59 min|
| Epoch 14, Iter 44500| Avg Loss = 93.59| Avg. ppl = 195.75| Speed 3242.74 words/sec| Time 265.17 min|
| Epoch 14, Iter 44600| Avg Loss = 92.86| Avg. ppl = 190.26| Speed 3184.87 words/sec| Time 265.77 min|
| Epoch 14, Iter 44700| Avg Loss = 93.19| Avg. ppl = 201.65| Speed 3187.64 words/sec| Time 266.35 min|
| Epoch 14, Iter 44800| Avg Loss = 93.66| Avg. ppl = 199.15| Speed 3195.31 words/sec| Time 266.94 min|
| Epoch 14, Iter 44900| Avg Loss = 92.01| Avg. ppl = 199.32| Speed 3213.56 words/sec| Time 267.52 min|
| Epoch 14, Iter 45000| Avg 

Report on validation set:
Validation:  Dev. ppl = 464.560321
Hit patience 1


| Epoch 14, Iter 45100| Avg Loss = 93.59| Avg. ppl = 197.24| Speed 3128.19 words/sec| Time 268.71 min|
| Epoch 14, Iter 45200| Avg Loss = 95.13| Avg. ppl = 205.24| Speed 3126.19 words/sec| Time 269.32 min|
| Epoch 14, Iter 45300| Avg Loss = 94.35| Avg. ppl = 199.54| Speed 3152.23 words/sec| Time 269.92 min|
| Epoch 14, Iter 45400| Avg Loss = 92.62| Avg. ppl = 198.76| Speed 3204.45 words/sec| Time 270.50 min|
| Epoch 14, Iter 45500| Avg Loss = 94.42| Avg. ppl = 212.55| Speed 3187.91 words/sec| Time 271.09 min|
| Epoch 14, Iter 45600| Avg Loss = 94.80| Avg. ppl = 199.00| Speed 3211.77 words/sec| Time 271.69 min|
| Epoch 14, Iter 45700| Avg Loss = 94.87| Avg. ppl = 208.76| Speed 3172.05 words/sec| Time 272.28 min|
| Epoch 14, Iter 45800| Avg Loss = 93.37| Avg. ppl = 203.88| Speed 3192.19 words/sec| Time 272.87 min|
| Epoch 14, Iter 45900| Avg Loss = 94.55| Avg. ppl = 205.98| Speed 3221.56 words/sec| Time 273.46 min|
| Epoch 14, Iter 46000| Avg Loss = 94.30| Avg. ppl = 211.38| Speed 3148.9

Report on validation set:
Validation:  Dev. ppl = 455.318549
Hit patience 2


| Epoch 14, Iter 46100| Avg Loss = 95.01| Avg. ppl = 217.19| Speed 3068.18 words/sec| Time 274.67 min|
| Epoch 14, Iter 46200| Avg Loss = 93.09| Avg. ppl = 206.42| Speed 3149.66 words/sec| Time 275.26 min|
| Epoch 14, Iter 46300| Avg Loss = 94.77| Avg. ppl = 205.94| Speed 3176.41 words/sec| Time 275.86 min|
| Epoch 14, Iter 46400| Avg Loss = 92.95| Avg. ppl = 207.79| Speed 3167.23 words/sec| Time 276.44 min|
| Epoch 14, Iter 46500| Avg Loss = 94.16| Avg. ppl = 198.79| Speed 3195.42 words/sec| Time 277.04 min|
| Epoch 14, Iter 46600| Avg Loss = 94.48| Avg. ppl = 219.21| Speed 3190.37 words/sec| Time 277.62 min|
| Epoch 14, Iter 46700| Avg Loss = 94.43| Avg. ppl = 205.43| Speed 3167.87 words/sec| Time 278.22 min|
| Epoch 14, Iter 46800| Avg Loss = 96.25| Avg. ppl = 215.80| Speed 3271.65 words/sec| Time 278.80 min|
| Epoch 14, Iter 46900| Avg Loss = 94.02| Avg. ppl = 209.24| Speed 3131.39 words/sec| Time 279.40 min|
| Epoch 14, Iter 47000| Avg Loss = 95.87| Avg. ppl = 218.24| Speed 3205.6

Report on validation set:
Validation:  Dev. ppl = 431.797966
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 14, Iter 47100| Avg Loss = 94.24| Avg. ppl = 212.92| Speed 2988.27 words/sec| Time 280.62 min|
| Epoch 14, Iter 47200| Avg Loss = 96.06| Avg. ppl = 211.98| Speed 3178.92 words/sec| Time 281.22 min|
| Epoch 14, Iter 47300| Avg Loss = 93.70| Avg. ppl = 203.81| Speed 3129.32 words/sec| Time 281.83 min|
| Epoch 15, Iter 47400| Avg Loss = 95.61| Avg. ppl = 221.67| Speed 3225.95 words/sec| Time 282.41 min|
| Epoch 15, Iter 47500| Avg Loss = 92.47| Avg. ppl = 181.03| Speed 3188.86 words/sec| Time 283.00 min|
| Epoch 15, Iter 47600| Avg Loss = 90.93| Avg. ppl = 172.18| Speed 3141.66 words/sec| Time 283.60 min|
| Epoch 15, Iter 47700| Avg Loss = 91.35| Avg. ppl = 178.05| Speed 3196.77 words/sec| Time 284.19 min|
| Epoch 15, Iter 47800| Avg Loss = 90.84| Avg. ppl = 175.02| Speed 3147.77 words/sec| Time 284.79 min|
| Epoch 15, Iter 47900| Avg Loss = 92.94| Avg. ppl = 185.71| Speed 3242.48 words/sec| Time 285.37 min|
| Epoch 15, Iter 48000| Avg 

Report on validation set:
Validation:  Dev. ppl = 443.279596
Hit patience 1


| Epoch 15, Iter 48100| Avg Loss = 92.73| Avg. ppl = 188.58| Speed 3082.31 words/sec| Time 286.57 min|
| Epoch 15, Iter 48200| Avg Loss = 90.66| Avg. ppl = 182.98| Speed 3183.01 words/sec| Time 287.15 min|
| Epoch 15, Iter 48300| Avg Loss = 91.17| Avg. ppl = 184.97| Speed 3117.99 words/sec| Time 287.75 min|
| Epoch 15, Iter 48400| Avg Loss = 93.41| Avg. ppl = 198.31| Speed 3167.38 words/sec| Time 288.34 min|
| Epoch 15, Iter 48500| Avg Loss = 92.73| Avg. ppl = 187.14| Speed 3150.00 words/sec| Time 288.94 min|
| Epoch 15, Iter 48600| Avg Loss = 93.17| Avg. ppl = 188.98| Speed 3204.35 words/sec| Time 289.53 min|
| Epoch 15, Iter 48700| Avg Loss = 93.57| Avg. ppl = 194.08| Speed 3228.82 words/sec| Time 290.12 min|
| Epoch 15, Iter 48800| Avg Loss = 93.33| Avg. ppl = 197.37| Speed 3228.50 words/sec| Time 290.70 min|
| Epoch 15, Iter 48900| Avg Loss = 93.76| Avg. ppl = 200.47| Speed 3185.76 words/sec| Time 291.30 min|
| Epoch 15, Iter 49000| Avg Loss = 93.59| Avg. ppl = 198.14| Speed 3199.8

Report on validation set:
Validation:  Dev. ppl = 446.002864
Hit patience 2


| Epoch 15, Iter 49100| Avg Loss = 92.71| Avg. ppl = 196.50| Speed 3036.78 words/sec| Time 292.50 min|
| Epoch 15, Iter 49200| Avg Loss = 93.54| Avg. ppl = 199.02| Speed 3171.44 words/sec| Time 293.10 min|
| Epoch 15, Iter 49300| Avg Loss = 93.04| Avg. ppl = 190.45| Speed 3184.48 words/sec| Time 293.69 min|
| Epoch 15, Iter 49400| Avg Loss = 92.42| Avg. ppl = 196.71| Speed 3211.58 words/sec| Time 294.27 min|
| Epoch 15, Iter 49500| Avg Loss = 93.12| Avg. ppl = 203.42| Speed 3158.73 words/sec| Time 294.86 min|
| Epoch 15, Iter 49600| Avg Loss = 92.16| Avg. ppl = 195.97| Speed 3198.16 words/sec| Time 295.45 min|
| Epoch 15, Iter 49700| Avg Loss = 93.48| Avg. ppl = 196.17| Speed 3192.01 words/sec| Time 296.04 min|
| Epoch 15, Iter 49800| Avg Loss = 94.21| Avg. ppl = 211.33| Speed 3138.43 words/sec| Time 296.64 min|
| Epoch 15, Iter 49900| Avg Loss = 93.32| Avg. ppl = 200.18| Speed 3207.17 words/sec| Time 297.22 min|
| Epoch 15, Iter 50000| Avg Loss = 93.59| Avg. ppl = 199.58| Speed 3144.2

Report on validation set:
Validation:  Dev. ppl = 430.449229
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 15, Iter 50100| Avg Loss = 94.00| Avg. ppl = 194.48| Speed 2964.12 words/sec| Time 298.46 min|
| Epoch 15, Iter 50200| Avg Loss = 94.23| Avg. ppl = 200.53| Speed 3226.47 words/sec| Time 299.05 min|
| Epoch 15, Iter 50300| Avg Loss = 95.73| Avg. ppl = 208.43| Speed 3215.56 words/sec| Time 299.64 min|
| Epoch 15, Iter 50400| Avg Loss = 94.63| Avg. ppl = 202.22| Speed 3211.24 words/sec| Time 300.24 min|
| Epoch 15, Iter 50500| Avg Loss = 94.30| Avg. ppl = 199.30| Speed 3185.93 words/sec| Time 300.83 min|
| Epoch 15, Iter 50600| Avg Loss = 93.35| Avg. ppl = 201.93| Speed 3183.22 words/sec| Time 301.42 min|
| Epoch 15, Iter 50700| Avg Loss = 93.21| Avg. ppl = 199.17| Speed 3194.08 words/sec| Time 302.01 min|
| Epoch 16, Iter 50800| Avg Loss = 93.65| Avg. ppl = 194.14| Speed 3184.84 words/sec| Time 302.60 min|
| Epoch 16, Iter 50900| Avg Loss = 91.24| Avg. ppl = 167.22| Speed 3144.37 words/sec| Time 303.21 min|
| Epoch 16, Iter 51000| Avg 

Report on validation set:
Validation:  Dev. ppl = 447.436192
Hit patience 1


| Epoch 16, Iter 51100| Avg Loss = 91.03| Avg. ppl = 170.83| Speed 3109.18 words/sec| Time 304.40 min|
| Epoch 16, Iter 51200| Avg Loss = 89.00| Avg. ppl = 164.64| Speed 3104.52 words/sec| Time 305.00 min|
| Epoch 16, Iter 51300| Avg Loss = 91.48| Avg. ppl = 172.18| Speed 3188.81 words/sec| Time 305.59 min|
| Epoch 16, Iter 51400| Avg Loss = 91.07| Avg. ppl = 169.65| Speed 3181.14 words/sec| Time 306.19 min|
| Epoch 16, Iter 51500| Avg Loss = 91.34| Avg. ppl = 178.05| Speed 3142.56 words/sec| Time 306.78 min|
| Epoch 16, Iter 51600| Avg Loss = 93.56| Avg. ppl = 190.59| Speed 3254.58 words/sec| Time 307.37 min|
| Epoch 16, Iter 51700| Avg Loss = 90.79| Avg. ppl = 174.34| Speed 3191.87 words/sec| Time 307.96 min|
| Epoch 16, Iter 51800| Avg Loss = 91.76| Avg. ppl = 177.08| Speed 3210.39 words/sec| Time 308.54 min|
| Epoch 16, Iter 51900| Avg Loss = 93.52| Avg. ppl = 193.61| Speed 3125.51 words/sec| Time 309.15 min|
| Epoch 16, Iter 52000| Avg Loss = 92.51| Avg. ppl = 184.11| Speed 3200.0

Report on validation set:
Validation:  Dev. ppl = 431.402748
Hit patience 2


| Epoch 16, Iter 52100| Avg Loss = 91.29| Avg. ppl = 179.90| Speed 3047.91 words/sec| Time 310.36 min|
| Epoch 16, Iter 52200| Avg Loss = 92.04| Avg. ppl = 181.50| Speed 3171.46 words/sec| Time 310.95 min|
| Epoch 16, Iter 52300| Avg Loss = 91.69| Avg. ppl = 184.82| Speed 3148.79 words/sec| Time 311.55 min|
| Epoch 16, Iter 52400| Avg Loss = 93.66| Avg. ppl = 190.46| Speed 3245.37 words/sec| Time 312.13 min|
| Epoch 16, Iter 52500| Avg Loss = 92.24| Avg. ppl = 179.02| Speed 3170.93 words/sec| Time 312.73 min|
| Epoch 16, Iter 52600| Avg Loss = 92.43| Avg. ppl = 191.53| Speed 3118.79 words/sec| Time 313.33 min|
| Epoch 16, Iter 52700| Avg Loss = 92.90| Avg. ppl = 189.56| Speed 3204.28 words/sec| Time 313.92 min|
| Epoch 16, Iter 52800| Avg Loss = 93.98| Avg. ppl = 194.70| Speed 3163.41 words/sec| Time 314.52 min|
| Epoch 16, Iter 52900| Avg Loss = 91.63| Avg. ppl = 186.17| Speed 3158.97 words/sec| Time 315.12 min|
| Epoch 16, Iter 53000| Avg Loss = 93.86| Avg. ppl = 195.12| Speed 3171.6

Report on validation set:
Validation:  Dev. ppl = 429.408976
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 16, Iter 53100| Avg Loss = 93.67| Avg. ppl = 196.66| Speed 2957.96 words/sec| Time 316.35 min|
| Epoch 16, Iter 53200| Avg Loss = 90.71| Avg. ppl = 178.39| Speed 3198.50 words/sec| Time 316.94 min|
| Epoch 16, Iter 53300| Avg Loss = 92.55| Avg. ppl = 201.89| Speed 3130.59 words/sec| Time 317.53 min|
| Epoch 16, Iter 53400| Avg Loss = 94.10| Avg. ppl = 195.62| Speed 3180.56 words/sec| Time 318.13 min|
| Epoch 16, Iter 53500| Avg Loss = 92.65| Avg. ppl = 201.81| Speed 3160.80 words/sec| Time 318.72 min|
| Epoch 16, Iter 53600| Avg Loss = 92.18| Avg. ppl = 192.35| Speed 3177.84 words/sec| Time 319.31 min|
| Epoch 16, Iter 53700| Avg Loss = 92.65| Avg. ppl = 192.98| Speed 3151.25 words/sec| Time 319.90 min|
| Epoch 16, Iter 53800| Avg Loss = 93.01| Avg. ppl = 192.59| Speed 3123.30 words/sec| Time 320.51 min|
| Epoch 16, Iter 53900| Avg Loss = 93.15| Avg. ppl = 194.68| Speed 3205.01 words/sec| Time 321.10 min|
| Epoch 16, Iter 54000| Avg 

Report on validation set:
Validation:  Dev. ppl = 421.431247
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 16, Iter 54100| Avg Loss = 93.06| Avg. ppl = 196.85| Speed 2927.02 words/sec| Time 322.32 min|
| Epoch 17, Iter 54200| Avg Loss = 92.56| Avg. ppl = 187.06| Speed 3160.39 words/sec| Time 322.92 min|
| Epoch 17, Iter 54300| Avg Loss = 91.60| Avg. ppl = 172.96| Speed 3200.01 words/sec| Time 323.51 min|
| Epoch 17, Iter 54400| Avg Loss = 89.94| Avg. ppl = 171.90| Speed 3179.85 words/sec| Time 324.10 min|
| Epoch 17, Iter 54500| Avg Loss = 90.59| Avg. ppl = 168.38| Speed 3230.61 words/sec| Time 324.68 min|
| Epoch 17, Iter 54600| Avg Loss = 90.51| Avg. ppl = 170.89| Speed 3191.72 words/sec| Time 325.27 min|
| Epoch 17, Iter 54700| Avg Loss = 91.37| Avg. ppl = 174.76| Speed 3138.64 words/sec| Time 325.87 min|
| Epoch 17, Iter 54800| Avg Loss = 89.54| Avg. ppl = 163.54| Speed 3227.39 words/sec| Time 326.45 min|
| Epoch 17, Iter 54900| Avg Loss = 91.66| Avg. ppl = 169.14| Speed 3158.53 words/sec| Time 327.06 min|
| Epoch 17, Iter 55000| Avg 

Report on validation set:
Validation:  Dev. ppl = 445.994442
Hit patience 1


| Epoch 17, Iter 55100| Avg Loss = 90.43| Avg. ppl = 169.13| Speed 3075.31 words/sec| Time 328.26 min|
| Epoch 17, Iter 55200| Avg Loss = 87.90| Avg. ppl = 162.51| Speed 3138.21 words/sec| Time 328.85 min|
| Epoch 17, Iter 55300| Avg Loss = 91.40| Avg. ppl = 178.77| Speed 3168.51 words/sec| Time 329.44 min|
| Epoch 17, Iter 55400| Avg Loss = 91.83| Avg. ppl = 182.88| Speed 3142.84 words/sec| Time 330.04 min|
| Epoch 17, Iter 55500| Avg Loss = 91.63| Avg. ppl = 172.71| Speed 3199.32 words/sec| Time 330.63 min|
| Epoch 17, Iter 55600| Avg Loss = 91.85| Avg. ppl = 176.89| Speed 3230.17 words/sec| Time 331.22 min|
| Epoch 17, Iter 55700| Avg Loss = 92.55| Avg. ppl = 181.36| Speed 3148.20 words/sec| Time 331.82 min|
| Epoch 17, Iter 55800| Avg Loss = 91.82| Avg. ppl = 178.62| Speed 3167.24 words/sec| Time 332.42 min|
| Epoch 17, Iter 55900| Avg Loss = 91.41| Avg. ppl = 179.98| Speed 3179.17 words/sec| Time 333.01 min|
| Epoch 17, Iter 56000| Avg Loss = 93.54| Avg. ppl = 189.17| Speed 3173.2

Report on validation set:
Validation:  Dev. ppl = 431.133946
Hit patience 2


| Epoch 17, Iter 56100| Avg Loss = 91.32| Avg. ppl = 179.45| Speed 3071.52 words/sec| Time 334.22 min|
| Epoch 17, Iter 56200| Avg Loss = 93.34| Avg. ppl = 184.07| Speed 3178.42 words/sec| Time 334.82 min|
| Epoch 17, Iter 56300| Avg Loss = 92.16| Avg. ppl = 188.02| Speed 3211.07 words/sec| Time 335.41 min|
| Epoch 17, Iter 56400| Avg Loss = 93.46| Avg. ppl = 184.47| Speed 3183.86 words/sec| Time 336.01 min|
| Epoch 17, Iter 56500| Avg Loss = 92.25| Avg. ppl = 184.44| Speed 3153.87 words/sec| Time 336.60 min|
| Epoch 17, Iter 56600| Avg Loss = 92.23| Avg. ppl = 176.58| Speed 3183.72 words/sec| Time 337.20 min|
| Epoch 17, Iter 56700| Avg Loss = 92.22| Avg. ppl = 176.12| Speed 3191.69 words/sec| Time 337.80 min|
| Epoch 17, Iter 56800| Avg Loss = 91.39| Avg. ppl = 182.49| Speed 3118.40 words/sec| Time 338.40 min|
| Epoch 17, Iter 56900| Avg Loss = 92.75| Avg. ppl = 190.10| Speed 3178.21 words/sec| Time 338.99 min|
| Epoch 17, Iter 57000| Avg Loss = 92.25| Avg. ppl = 194.65| Speed 3240.3

Report on validation set:
Validation:  Dev. ppl = 431.101479
Hit patience 3
Hit #1 trial
load previously best model and decay learning rate to 0.000500
restore parameters of the optimizers


| Epoch 17, Iter 57100| Avg Loss = 89.09| Avg. ppl = 156.69| Speed 3024.08 words/sec| Time 340.19 min|
| Epoch 17, Iter 57200| Avg Loss = 89.29| Avg. ppl = 151.93| Speed 3255.87 words/sec| Time 340.77 min|
| Epoch 17, Iter 57300| Avg Loss = 87.33| Avg. ppl = 140.22| Speed 3142.44 words/sec| Time 341.37 min|
| Epoch 17, Iter 57400| Avg Loss = 87.90| Avg. ppl = 145.64| Speed 3221.19 words/sec| Time 341.95 min|
| Epoch 17, Iter 57500| Avg Loss = 86.94| Avg. ppl = 146.99| Speed 3205.12 words/sec| Time 342.53 min|
| Epoch 18, Iter 57600| Avg Loss = 88.02| Avg. ppl = 150.07| Speed 3142.22 words/sec| Time 343.13 min|
| Epoch 18, Iter 57700| Avg Loss = 87.76| Avg. ppl = 147.20| Speed 3216.14 words/sec| Time 343.71 min|
| Epoch 18, Iter 57800| Avg Loss = 89.14| Avg. ppl = 147.45| Speed 3217.81 words/sec| Time 344.30 min|
| Epoch 18, Iter 57900| Avg Loss = 86.45| Avg. ppl = 144.83| Speed 3177.40 words/sec| Time 344.89 min|
| Epoch 18, Iter 58000| Avg Loss = 88.29| Avg. ppl = 145.18| Speed 3206.2

Report on validation set:
Validation:  Dev. ppl = 388.688160
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 18, Iter 58100| Avg Loss = 87.64| Avg. ppl = 140.19| Speed 3017.92 words/sec| Time 346.10 min|
| Epoch 18, Iter 58200| Avg Loss = 88.30| Avg. ppl = 145.64| Speed 3217.98 words/sec| Time 346.69 min|
| Epoch 18, Iter 58300| Avg Loss = 87.31| Avg. ppl = 143.62| Speed 3187.08 words/sec| Time 347.28 min|
| Epoch 18, Iter 58400| Avg Loss = 86.58| Avg. ppl = 139.10| Speed 3171.90 words/sec| Time 347.87 min|
| Epoch 18, Iter 58500| Avg Loss = 87.53| Avg. ppl = 140.63| Speed 3195.69 words/sec| Time 348.46 min|
| Epoch 18, Iter 58600| Avg Loss = 87.90| Avg. ppl = 148.93| Speed 3094.02 words/sec| Time 349.07 min|
| Epoch 18, Iter 58700| Avg Loss = 89.89| Avg. ppl = 155.23| Speed 3185.20 words/sec| Time 349.66 min|
| Epoch 18, Iter 58800| Avg Loss = 87.62| Avg. ppl = 147.51| Speed 3174.95 words/sec| Time 350.25 min|
| Epoch 18, Iter 58900| Avg Loss = 89.01| Avg. ppl = 147.40| Speed 3133.26 words/sec| Time 350.86 min|
| Epoch 18, Iter 59000| Avg 

Report on validation set:
Validation:  Dev. ppl = 373.550138
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 18, Iter 59100| Avg Loss = 88.00| Avg. ppl = 142.94| Speed 2984.07 words/sec| Time 352.08 min|
| Epoch 18, Iter 59200| Avg Loss = 89.76| Avg. ppl = 148.49| Speed 3182.61 words/sec| Time 352.68 min|
| Epoch 18, Iter 59300| Avg Loss = 87.39| Avg. ppl = 143.81| Speed 3205.78 words/sec| Time 353.27 min|
| Epoch 18, Iter 59400| Avg Loss = 87.32| Avg. ppl = 145.50| Speed 3172.87 words/sec| Time 353.86 min|
| Epoch 18, Iter 59500| Avg Loss = 88.86| Avg. ppl = 147.14| Speed 3197.85 words/sec| Time 354.45 min|
| Epoch 18, Iter 59600| Avg Loss = 87.31| Avg. ppl = 139.97| Speed 3217.52 words/sec| Time 355.04 min|
| Epoch 18, Iter 59700| Avg Loss = 86.52| Avg. ppl = 142.08| Speed 3204.17 words/sec| Time 355.62 min|
| Epoch 18, Iter 59800| Avg Loss = 88.17| Avg. ppl = 143.80| Speed 3140.02 words/sec| Time 356.22 min|
| Epoch 18, Iter 59900| Avg Loss = 88.07| Avg. ppl = 151.09| Speed 3254.25 words/sec| Time 356.80 min|
| Epoch 18, Iter 60000| Avg 

Report on validation set:
Validation:  Dev. ppl = 372.445787
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 18, Iter 60100| Avg Loss = 87.70| Avg. ppl = 146.76| Speed 2948.85 words/sec| Time 358.04 min|
| Epoch 18, Iter 60200| Avg Loss = 88.75| Avg. ppl = 150.35| Speed 3147.17 words/sec| Time 358.64 min|
| Epoch 18, Iter 60300| Avg Loss = 88.63| Avg. ppl = 142.47| Speed 3237.16 words/sec| Time 359.23 min|
| Epoch 18, Iter 60400| Avg Loss = 87.56| Avg. ppl = 142.05| Speed 3191.76 words/sec| Time 359.82 min|
| Epoch 18, Iter 60500| Avg Loss = 88.67| Avg. ppl = 147.26| Speed 3176.92 words/sec| Time 360.41 min|
| Epoch 18, Iter 60600| Avg Loss = 87.45| Avg. ppl = 143.00| Speed 3232.14 words/sec| Time 361.00 min|
| Epoch 18, Iter 60700| Avg Loss = 88.42| Avg. ppl = 149.92| Speed 3172.04 words/sec| Time 361.59 min|
| Epoch 18, Iter 60800| Avg Loss = 88.48| Avg. ppl = 153.53| Speed 3143.39 words/sec| Time 362.19 min|
| Epoch 18, Iter 60900| Avg Loss = 89.23| Avg. ppl = 150.21| Speed 3245.99 words/sec| Time 362.77 min|
| Epoch 19, Iter 61000| Avg 

Report on validation set:
Validation:  Dev. ppl = 379.247698
Hit patience 1


| Epoch 19, Iter 61100| Avg Loss = 86.60| Avg. ppl = 137.60| Speed 3101.46 words/sec| Time 363.97 min|
| Epoch 19, Iter 61200| Avg Loss = 87.13| Avg. ppl = 132.47| Speed 3286.38 words/sec| Time 364.55 min|
| Epoch 19, Iter 61300| Avg Loss = 85.97| Avg. ppl = 128.53| Speed 3144.46 words/sec| Time 365.15 min|
| Epoch 19, Iter 61400| Avg Loss = 86.32| Avg. ppl = 133.46| Speed 3245.11 words/sec| Time 365.73 min|
| Epoch 19, Iter 61500| Avg Loss = 86.66| Avg. ppl = 136.97| Speed 3162.97 words/sec| Time 366.32 min|
| Epoch 19, Iter 61600| Avg Loss = 86.74| Avg. ppl = 134.01| Speed 3153.32 words/sec| Time 366.92 min|
| Epoch 19, Iter 61700| Avg Loss = 86.53| Avg. ppl = 138.97| Speed 3180.01 words/sec| Time 367.51 min|
| Epoch 19, Iter 61800| Avg Loss = 87.60| Avg. ppl = 136.31| Speed 3234.29 words/sec| Time 368.10 min|
| Epoch 19, Iter 61900| Avg Loss = 88.38| Avg. ppl = 141.16| Speed 3234.81 words/sec| Time 368.69 min|
| Epoch 19, Iter 62000| Avg Loss = 87.76| Avg. ppl = 140.20| Speed 3165.7

Report on validation set:
Validation:  Dev. ppl = 374.599398
Hit patience 2


| Epoch 19, Iter 62100| Avg Loss = 87.05| Avg. ppl = 138.89| Speed 3015.09 words/sec| Time 369.91 min|
| Epoch 19, Iter 62200| Avg Loss = 86.21| Avg. ppl = 140.42| Speed 3128.75 words/sec| Time 370.50 min|
| Epoch 19, Iter 62300| Avg Loss = 85.40| Avg. ppl = 131.79| Speed 3194.86 words/sec| Time 371.09 min|
| Epoch 19, Iter 62400| Avg Loss = 87.23| Avg. ppl = 137.88| Speed 3141.63 words/sec| Time 371.69 min|
| Epoch 19, Iter 62500| Avg Loss = 87.61| Avg. ppl = 139.70| Speed 3169.31 words/sec| Time 372.29 min|
| Epoch 19, Iter 62600| Avg Loss = 86.77| Avg. ppl = 138.24| Speed 3199.68 words/sec| Time 372.87 min|
| Epoch 19, Iter 62700| Avg Loss = 88.88| Avg. ppl = 144.80| Speed 3172.15 words/sec| Time 373.47 min|
| Epoch 19, Iter 62800| Avg Loss = 87.67| Avg. ppl = 142.77| Speed 3171.96 words/sec| Time 374.07 min|
| Epoch 19, Iter 62900| Avg Loss = 86.89| Avg. ppl = 137.39| Speed 3180.36 words/sec| Time 374.66 min|
| Epoch 19, Iter 63000| Avg Loss = 87.18| Avg. ppl = 138.82| Speed 3171.9

Report on validation set:
Validation:  Dev. ppl = 367.103108
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 19, Iter 63100| Avg Loss = 88.11| Avg. ppl = 142.74| Speed 2910.37 words/sec| Time 375.91 min|
| Epoch 19, Iter 63200| Avg Loss = 88.13| Avg. ppl = 139.93| Speed 3212.03 words/sec| Time 376.50 min|
| Epoch 19, Iter 63300| Avg Loss = 87.21| Avg. ppl = 129.76| Speed 3211.06 words/sec| Time 377.09 min|
| Epoch 19, Iter 63400| Avg Loss = 86.47| Avg. ppl = 137.02| Speed 3149.75 words/sec| Time 377.69 min|
| Epoch 19, Iter 63500| Avg Loss = 87.21| Avg. ppl = 139.56| Speed 3157.64 words/sec| Time 378.28 min|
| Epoch 19, Iter 63600| Avg Loss = 87.23| Avg. ppl = 141.15| Speed 3191.15 words/sec| Time 378.87 min|
| Epoch 19, Iter 63700| Avg Loss = 86.97| Avg. ppl = 135.85| Speed 3186.22 words/sec| Time 379.47 min|
| Epoch 19, Iter 63800| Avg Loss = 86.76| Avg. ppl = 137.39| Speed 3141.63 words/sec| Time 380.07 min|
| Epoch 19, Iter 63900| Avg Loss = 86.57| Avg. ppl = 143.39| Speed 3167.48 words/sec| Time 380.65 min|
| Epoch 19, Iter 64000| Avg 

Report on validation set:
Validation:  Dev. ppl = 365.281612
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 19, Iter 64100| Avg Loss = 87.92| Avg. ppl = 145.66| Speed 2945.63 words/sec| Time 381.89 min|
| Epoch 19, Iter 64200| Avg Loss = 89.41| Avg. ppl = 147.23| Speed 3137.24 words/sec| Time 382.50 min|
| Epoch 19, Iter 64300| Avg Loss = 87.93| Avg. ppl = 147.81| Speed 3171.89 words/sec| Time 383.09 min|
| Epoch 20, Iter 64400| Avg Loss = 85.92| Avg. ppl = 124.05| Speed 3121.07 words/sec| Time 383.70 min|
| Epoch 20, Iter 64500| Avg Loss = 85.32| Avg. ppl = 124.47| Speed 3180.92 words/sec| Time 384.29 min|
| Epoch 20, Iter 64600| Avg Loss = 86.97| Avg. ppl = 132.57| Speed 3171.68 words/sec| Time 384.89 min|
| Epoch 20, Iter 64700| Avg Loss = 86.33| Avg. ppl = 133.16| Speed 3125.62 words/sec| Time 385.49 min|
| Epoch 20, Iter 64800| Avg Loss = 86.76| Avg. ppl = 138.38| Speed 3245.24 words/sec| Time 386.07 min|
| Epoch 20, Iter 64900| Avg Loss = 87.20| Avg. ppl = 129.82| Speed 3211.80 words/sec| Time 386.67 min|
| Epoch 20, Iter 65000| Avg 

Report on validation set:
Validation:  Dev. ppl = 371.543406
Hit patience 1


| Epoch 20, Iter 65100| Avg Loss = 85.84| Avg. ppl = 135.21| Speed 3069.91 words/sec| Time 387.87 min|
| Epoch 20, Iter 65200| Avg Loss = 86.02| Avg. ppl = 132.67| Speed 3182.46 words/sec| Time 388.46 min|
| Epoch 20, Iter 65300| Avg Loss = 85.60| Avg. ppl = 131.71| Speed 3107.32 words/sec| Time 389.06 min|
| Epoch 20, Iter 65400| Avg Loss = 87.06| Avg. ppl = 130.20| Speed 3196.15 words/sec| Time 389.66 min|
| Epoch 20, Iter 65500| Avg Loss = 85.75| Avg. ppl = 131.03| Speed 3215.42 words/sec| Time 390.24 min|
| Epoch 20, Iter 65600| Avg Loss = 85.34| Avg. ppl = 131.33| Speed 3104.82 words/sec| Time 390.84 min|
| Epoch 20, Iter 65700| Avg Loss = 86.65| Avg. ppl = 133.73| Speed 3145.79 words/sec| Time 391.44 min|
| Epoch 20, Iter 65800| Avg Loss = 87.87| Avg. ppl = 139.28| Speed 3174.31 words/sec| Time 392.04 min|
| Epoch 20, Iter 65900| Avg Loss = 87.07| Avg. ppl = 132.70| Speed 3152.53 words/sec| Time 392.65 min|
| Epoch 20, Iter 66000| Avg Loss = 85.11| Avg. ppl = 128.69| Speed 3118.6

Report on validation set:
Validation:  Dev. ppl = 370.371332
Hit patience 2


| Epoch 20, Iter 66100| Avg Loss = 85.38| Avg. ppl = 132.24| Speed 2965.64 words/sec| Time 393.87 min|
| Epoch 20, Iter 66200| Avg Loss = 86.86| Avg. ppl = 137.72| Speed 3084.37 words/sec| Time 394.48 min|
| Epoch 20, Iter 66300| Avg Loss = 86.55| Avg. ppl = 136.49| Speed 3150.83 words/sec| Time 395.08 min|
| Epoch 20, Iter 66400| Avg Loss = 85.54| Avg. ppl = 135.95| Speed 3135.93 words/sec| Time 395.67 min|
| Epoch 20, Iter 66500| Avg Loss = 87.82| Avg. ppl = 137.94| Speed 3118.20 words/sec| Time 396.28 min|
| Epoch 20, Iter 66600| Avg Loss = 86.00| Avg. ppl = 129.58| Speed 3133.66 words/sec| Time 396.88 min|
| Epoch 20, Iter 66700| Avg Loss = 87.13| Avg. ppl = 133.37| Speed 3180.67 words/sec| Time 397.48 min|
| Epoch 20, Iter 66800| Avg Loss = 86.85| Avg. ppl = 136.41| Speed 3144.46 words/sec| Time 398.08 min|
| Epoch 20, Iter 66900| Avg Loss = 87.22| Avg. ppl = 141.97| Speed 3130.85 words/sec| Time 398.68 min|
| Epoch 20, Iter 67000| Avg Loss = 87.09| Avg. ppl = 132.14| Speed 3133.4

Report on validation set:
Validation:  Dev. ppl = 370.091211
Hit patience 3
Hit #2 trial
load previously best model and decay learning rate to 0.000250
restore parameters of the optimizers


| Epoch 20, Iter 67100| Avg Loss = 85.88| Avg. ppl = 125.56| Speed 3037.94 words/sec| Time 399.91 min|
| Epoch 20, Iter 67200| Avg Loss = 84.50| Avg. ppl = 122.84| Speed 3141.11 words/sec| Time 400.51 min|
| Epoch 20, Iter 67300| Avg Loss = 85.67| Avg. ppl = 124.82| Speed 3161.25 words/sec| Time 401.11 min|
| Epoch 20, Iter 67400| Avg Loss = 84.79| Avg. ppl = 121.08| Speed 3141.28 words/sec| Time 401.71 min|
| Epoch 20, Iter 67500| Avg Loss = 83.72| Avg. ppl = 116.49| Speed 3142.76 words/sec| Time 402.30 min|
| Epoch 20, Iter 67600| Avg Loss = 83.83| Avg. ppl = 119.64| Speed 3174.91 words/sec| Time 402.89 min|
| Epoch 20, Iter 67700| Avg Loss = 86.71| Avg. ppl = 127.73| Speed 3179.66 words/sec| Time 403.49 min|
| Epoch 21, Iter 67800| Avg Loss = 84.58| Avg. ppl = 119.04| Speed 3072.27 words/sec| Time 404.10 min|
| Epoch 21, Iter 67900| Avg Loss = 85.16| Avg. ppl = 117.71| Speed 3197.41 words/sec| Time 404.70 min|
| Epoch 21, Iter 68000| Avg Loss = 85.75| Avg. ppl = 124.44| Speed 3089.8

Report on validation set:
Validation:  Dev. ppl = 351.334072
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 21, Iter 68100| Avg Loss = 84.22| Avg. ppl = 116.93| Speed 2928.50 words/sec| Time 405.96 min|
| Epoch 21, Iter 68200| Avg Loss = 85.18| Avg. ppl = 120.97| Speed 3082.23 words/sec| Time 406.57 min|
| Epoch 21, Iter 68300| Avg Loss = 83.84| Avg. ppl = 116.32| Speed 3111.92 words/sec| Time 407.18 min|
| Epoch 21, Iter 68400| Avg Loss = 85.17| Avg. ppl = 124.77| Speed 3116.70 words/sec| Time 407.78 min|
| Epoch 21, Iter 68500| Avg Loss = 83.28| Avg. ppl = 115.25| Speed 3117.65 words/sec| Time 408.38 min|
| Epoch 21, Iter 68600| Avg Loss = 83.88| Avg. ppl = 116.01| Speed 3138.32 words/sec| Time 408.98 min|
| Epoch 21, Iter 68700| Avg Loss = 83.91| Avg. ppl = 118.82| Speed 3090.45 words/sec| Time 409.59 min|
| Epoch 21, Iter 68800| Avg Loss = 85.08| Avg. ppl = 124.01| Speed 3139.94 words/sec| Time 410.19 min|
| Epoch 21, Iter 68900| Avg Loss = 84.17| Avg. ppl = 120.23| Speed 3112.23 words/sec| Time 410.79 min|
| Epoch 21, Iter 69000| Avg 

Report on validation set:
Validation:  Dev. ppl = 352.535000
Hit patience 1


| Epoch 21, Iter 69100| Avg Loss = 85.16| Avg. ppl = 125.38| Speed 3050.02 words/sec| Time 412.01 min|
| Epoch 21, Iter 69200| Avg Loss = 84.98| Avg. ppl = 122.60| Speed 3190.17 words/sec| Time 412.60 min|
| Epoch 21, Iter 69300| Avg Loss = 85.09| Avg. ppl = 119.10| Speed 3127.23 words/sec| Time 413.21 min|
| Epoch 21, Iter 69400| Avg Loss = 85.76| Avg. ppl = 124.72| Speed 3125.86 words/sec| Time 413.82 min|
| Epoch 21, Iter 69500| Avg Loss = 84.05| Avg. ppl = 123.48| Speed 3142.54 words/sec| Time 414.41 min|
| Epoch 21, Iter 69600| Avg Loss = 84.45| Avg. ppl = 116.27| Speed 3121.40 words/sec| Time 415.01 min|
| Epoch 21, Iter 69700| Avg Loss = 86.68| Avg. ppl = 128.09| Speed 3126.35 words/sec| Time 415.62 min|
| Epoch 21, Iter 69800| Avg Loss = 84.81| Avg. ppl = 125.19| Speed 3170.43 words/sec| Time 416.22 min|
| Epoch 21, Iter 69900| Avg Loss = 84.90| Avg. ppl = 120.56| Speed 3110.16 words/sec| Time 416.82 min|
| Epoch 21, Iter 70000| Avg Loss = 85.02| Avg. ppl = 127.01| Speed 3176.6

Report on validation set:
Validation:  Dev. ppl = 347.563508
Save currently the best model to [NMT_char_model.pt]


save model parameters to [NMT_char_model.pt]
| Epoch 21, Iter 70100| Avg Loss = 84.92| Avg. ppl = 123.27| Speed 2958.91 words/sec| Time 418.05 min|
| Epoch 21, Iter 70200| Avg Loss = 85.42| Avg. ppl = 128.05| Speed 3119.87 words/sec| Time 418.65 min|
| Epoch 21, Iter 70300| Avg Loss = 85.03| Avg. ppl = 119.00| Speed 3117.15 words/sec| Time 419.26 min|
| Epoch 21, Iter 70400| Avg Loss = 84.91| Avg. ppl = 124.02| Speed 3186.39 words/sec| Time 419.85 min|
| Epoch 21, Iter 70500| Avg Loss = 85.45| Avg. ppl = 126.13| Speed 3155.85 words/sec| Time 420.45 min|
| Epoch 21, Iter 70600| Avg Loss = 85.86| Avg. ppl = 122.77| Speed 3156.97 words/sec| Time 421.05 min|
| Epoch 21, Iter 70700| Avg Loss = 84.70| Avg. ppl = 119.48| Speed 3182.30 words/sec| Time 421.64 min|
| Epoch 21, Iter 70800| Avg Loss = 83.95| Avg. ppl = 126.72| Speed 3123.81 words/sec| Time 422.23 min|
| Epoch 21, Iter 70900| Avg Loss = 86.30| Avg. ppl = 130.25| Speed 3199.69 words/sec| Time 422.82 min|
| Epoch 21, Iter 71000| Avg 

Report on validation set:
Validation:  Dev. ppl = 347.665908
Hit patience 1


| Epoch 22, Iter 71100| Avg Loss = 85.68| Avg. ppl = 123.70| Speed 3052.36 words/sec| Time 424.03 min|
| Epoch 22, Iter 71200| Avg Loss = 81.82| Avg. ppl = 109.83| Speed 3212.31 words/sec| Time 424.61 min|
| Epoch 22, Iter 71300| Avg Loss = 82.46| Avg. ppl = 109.23| Speed 3179.11 words/sec| Time 425.20 min|
| Epoch 22, Iter 71400| Avg Loss = 84.12| Avg. ppl = 115.80| Speed 3130.33 words/sec| Time 425.81 min|
| Epoch 22, Iter 71500| Avg Loss = 84.12| Avg. ppl = 118.90| Speed 3243.65 words/sec| Time 426.38 min|
| Epoch 22, Iter 71600| Avg Loss = 84.33| Avg. ppl = 116.74| Speed 3184.65 words/sec| Time 426.98 min|
| Epoch 22, Iter 71700| Avg Loss = 84.36| Avg. ppl = 116.22| Speed 3173.55 words/sec| Time 427.57 min|
| Epoch 22, Iter 71800| Avg Loss = 84.67| Avg. ppl = 117.24| Speed 3207.11 words/sec| Time 428.17 min|
| Epoch 22, Iter 71900| Avg Loss = 83.22| Avg. ppl = 116.08| Speed 3229.40 words/sec| Time 428.74 min|
| Epoch 22, Iter 72000| Avg Loss = 84.12| Avg. ppl = 117.48| Speed 3158.9

Report on validation set:
Validation:  Dev. ppl = 348.905873
Hit patience 2


| Epoch 22, Iter 72100| Avg Loss = 83.06| Avg. ppl = 114.80| Speed 3080.82 words/sec| Time 429.95 min|
| Epoch 22, Iter 72200| Avg Loss = 84.27| Avg. ppl = 112.70| Speed 3140.02 words/sec| Time 430.55 min|
| Epoch 22, Iter 72300| Avg Loss = 84.55| Avg. ppl = 115.06| Speed 3236.56 words/sec| Time 431.14 min|
| Epoch 22, Iter 72400| Avg Loss = 84.59| Avg. ppl = 118.88| Speed 3173.78 words/sec| Time 431.73 min|
| Epoch 22, Iter 72500| Avg Loss = 84.75| Avg. ppl = 119.08| Speed 3147.89 words/sec| Time 432.33 min|
| Epoch 22, Iter 72600| Avg Loss = 84.95| Avg. ppl = 123.76| Speed 3161.49 words/sec| Time 432.93 min|
| Epoch 22, Iter 72700| Avg Loss = 84.94| Avg. ppl = 119.76| Speed 3143.33 words/sec| Time 433.53 min|
| Epoch 22, Iter 72800| Avg Loss = 84.21| Avg. ppl = 116.74| Speed 3195.43 words/sec| Time 434.12 min|
| Epoch 22, Iter 72900| Avg Loss = 83.55| Avg. ppl = 119.13| Speed 3156.83 words/sec| Time 434.71 min|
| Epoch 22, Iter 73000| Avg Loss = 86.36| Avg. ppl = 124.37| Speed 3259.6

Report on validation set:
Validation:  Dev. ppl = 355.497405
Hit patience 3
Hit #3 trial
early stop!


In [0]:
model.device

device(type='cuda', index=0)

## Evaluation

In [0]:
# Load the model
model = NMT_char(embed_size= 256, hidden_size=256, dropout_rate=0.3, vocab=vocab) # blueprint

params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
model.load_state_dict(params['state_dict'])

device = torch.device("cuda:0" if torch.cuda.device_count()>0 else "cpu")
model = model.to(device)

# Evaluation mode
model.eval()

NMT_char(
  (encoder): Encoder(
    (encode_embeddings): ModelEmbeddings(
      (char_embeddings): Embedding(96, 50, padding_idx=0)
      (dropout): Dropout(p=0.3)
      (cnn): CNN(
        (conv): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
        (maxpool): MaxPool1d(kernel_size=17, stride=17, padding=0, dilation=1, ceil_mode=False)
      )
      (highway): Highway(
        (fc_proj): Linear(in_features=256, out_features=256, bias=True)
        (fc_gate): Linear(in_features=256, out_features=256, bias=True)
      )
    )
    (LSTM_encode): LSTM(256, 256, bidirectional=True)
    (h_projection): Linear(in_features=512, out_features=256, bias=False)
    (c_projection): Linear(in_features=512, out_features=256, bias=False)
    (dropout): Dropout(p=0.3)
  )
  (decoder): Decoder(
    (decode_embeddings): ModelEmbeddings(
      (char_embeddings): Embedding(96, 50, padding_idx=0)
      (dropout): Dropout(p=0.3)
      (cnn): CNN(
        (conv): Conv1d(50, 256, kernel_size=(5,), stride=(1

In [0]:
##
# beam search check
src_sent =['Tambien', 'tenemos', 'que', 'tener', 'cuidado', 'con', 'el', 'hielo', 'se', 'resbala', 'facilmente', 'en', 'el.']
en_ref = ['We', 'also', 'have', 'to', 'be', 'careful', 'with', 'the', 'ice,','it', 'slides', 'easily', 'on', 'it.']
en_hat = model.beam_search(src_sent,5,70)
print("=="*40)
print("Model Translation:\n")
print('{}'.format(' '.join(en_hat[0].value)))
print("\n")
print("Human Reference:\n")
print('{}'.format(' '.join(en_ref)))
print("=="*40)

Model Translation:

We also have to be careful with the ice in the ice faster in the ice.


Human Reference:

We also have to be careful with the ice, it slides easily on it.


In [0]:
#Now, we can have to be able to be able to see the way to be in the way in the world.

## Bleu

In [0]:
###########################
## Compute Blue using nltk 
###########################

Hypothesis = namedtuple('Hypothesis', ['value', 'score'])

def compute_corpus_level_bleu_score(references: List[List[str]], hypotheses: List[Hypothesis]) -> float:
    """ Given decoding results and reference sentences, compute corpus-level BLEU score.
    @param references (List[List[str]]): a list of gold-standard reference target sentences
    @param hypotheses (List[Hypothesis]): a list of hypotheses, one for each reference
    @returns bleu_score: corpus-level BLEU score
    """
    if references[0][0] == '<s>':
        references = [ref[1:-1] for ref in references]
    bleu_score = corpus_bleu([[ref] for ref in references],
                             [hyp.value for hyp in hypotheses])
    return bleu_score
  
###########################
## Model Beam Search for test set 
###########################



def beam_search(model: NMT_char, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        for src_sent in tqdm(test_data_src):
            example_hyps = model.beam_search(src_sent, beam_size=beam_size, max_decoding_time_step=max_decoding_time_step)

            hypotheses.append(example_hyps)

    if was_training: model.train(was_training)

    return hypotheses


###########################
## Decoding using beam search 
###########################
  
def decode():
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """

    print("load test source sentences", file=sys.stderr)
    test_data_src = read_corpus(test_es, source='src')
    
    
    print("load test target sentences", file=sys.stderr)
    test_data_tgt = read_corpus(test_en, source='tgt')

    hypotheses = beam_search(model, test_data_src,
                             beam_size=5,
                             max_decoding_time_step=70)

    top_hypotheses = [hyps[0] for hyps in hypotheses]
    bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
    print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)
    
    with open('model_translation.txt', 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')

In [0]:
decode()

load test source sentences
load test target sentences
100%|██████████| 8064/8064 [09:11<00:00, 14.63it/s]
Corpus BLEU: 24.555234003232144


## Inference

In [0]:
# Read in machine translations without char decoder
with open('test_outputs_a4.txt','r') as f:
  nmt = f.readlines()
  
# NLTK
from nltk.translate import bleu  
from nltk.translate.bleu_score import SmoothingFunction
smoothie = SmoothingFunction().method4

In [0]:
idx = 399

print(' '.join(test_data_src[idx][:]))
print('\nRef:')
print(' '.join(test_data_tgt[idx][1:-1]))

score = corpus_bleu([test_data_src[idx]], [model.beam_search(test_data_src[idx])[0].value], smoothing_function=smoothie)
print('\nNMT_char BLEU = {:.2f}'.format(score*100))
print(' '.join(model.beam_search(test_data_src[idx][1:-1])[0].value))


score = corpus_bleu([test_data_src[idx][1:-1]], [nmt[idx][:]], smoothing_function=smoothie)
print('\nNMT BLEU = {:.2f}'.format(score*100))
print(''.join(nmt[idx][:]))



print()

Y nunca las he visto.

Ref:
And I've never seen them.

NMT_char BLEU = 0.00
I never have them.

NMT BLEU = 14.80
And I've never seen them.




In [0]:
model.beam_search(test_data_src[idx])[0].value

['How',
 'do',
 'you',
 'call',
 'the',
 'leader',
 'of',
 'a',
 'Republican',
 'country?']

## Conclusion