# Content with notebooks

You can also create content with Jupyter Notebooks. This means that you can include
code blocks and their outputs in your book.

## Markdown + notebooks

Because notebook text cells are written in Markdown, you can embed images, HTML, etc. in your pages!

![](https://myst-parser.readthedocs.io/en/latest/_static/logo-wide.svg)

You can also $add_{math}$ and

$$
math^{blocks}
$$

or

$$
\begin{aligned}
\mbox{mean} la_{tex} \\ \\
math blocks
\end{aligned}
$$

But make sure you \$Escape \$your \$dollar signs \$you want to keep!

## MyST markdown

MyST markdown works in Jupyter Notebooks as well. For more information about MyST markdown, check
out [the MyST guide in Jupyter Book](https://jupyterbook.org/content/myst.html),
or see [the MyST markdown documentation](https://myst-parser.readthedocs.io/en/latest/).

## Code blocks and outputs

Jupyter Book will also embed your code blocks and output in your book.
For example, here's some sample PyTorch code that trains a small RNN language model:

In [1]:
import numpy as np

In [2]:
import data
from model import MyRNNModel
import torch
from torch import nn
import time
import math

In [3]:
# Run configuration. Every value below is read by later cells as a module-level global.
seed = 14_584_234        # handed to torch.manual_seed
train_batch_size = 250   # number of parallel columns in the training split
sequence_length = 5      # upper bound on the rows taken per chunk
p_dropout = 0.0          # last argument passed to MyRNNModel
size_embedding = 50
# NOTE(review): despite the name, this value is passed to MyRNNModel right after
# the embedding size, so it is presumably the hidden-state width rather than a
# layer count -- confirm against model.py.
n_hidden_layers = 50
n_rnn_layers = 1

In [4]:
# Set the random seed manually for reproducibility.
torch.manual_seed(seed)

# Pick the best available accelerator, falling back to the CPU.
# The MPS probe goes through torch.backends.mps, which is the documented
# availability check; torch.mps.is_available() is not present in older PyTorch
# releases and raises AttributeError there. getattr() additionally keeps
# builds that predate the MPS backend working.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print('Using %s device' % device)

Using cuda device


In [5]:
# Build the corpus object from 'ptb_data' using the project-local `data` module.
# Later cells read its .train, .valid, .test and .dictionary attributes.
# NOTE(review): 'ptb_data' is presumably a Penn Treebank data directory -- confirm.
corpus = data.Corpus('ptb_data')

def batchify(data, bsz):
    """Split a token stream into ``bsz`` parallel columns and move it to ``device``.

    The stream is cut into ``bsz`` equal-length contiguous pieces; trailing
    elements that do not fill a complete row are dropped. Each piece becomes
    one column, so for a 1-D input the result has shape
    ``(data.size(0) // bsz, bsz)``.
    """
    # How many rows every column holds once the stream is divided evenly.
    rows = data.size(0) // bsz
    # Discard the leftover tail that would not fit cleanly.
    trimmed = data.narrow(0, 0, rows * bsz)
    # One piece per row first, then transpose so that each piece is a column.
    columns = trimmed.view(bsz, -1).t().contiguous()
    return columns.to(device)

# The validation and test splits share one fixed batch size; the training
# split uses train_batch_size instead.
eval_batch_size = 10
train_data = batchify(corpus.train, train_batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [6]:
# Number of entries in the corpus dictionary, passed to the model below.
ntokens = len(corpus.dictionary)
# Build the project-local model and move it to the selected device.
# NOTE(review): 'RNN_TANH' presumably selects a tanh (Elman) recurrent cell -- confirm in model.py.
model = MyRNNModel('RNN_TANH', ntokens, size_embedding, n_hidden_layers, n_rnn_layers, p_dropout).to(device)
# nn.NLLLoss expects log-probabilities as input.
# NOTE(review): presumably MyRNNModel ends in a log-softmax -- confirm in model.py.
criterion = nn.NLLLoss()

In [7]:
def repackage_hidden(h):
    """Return the hidden state detached from the graph that produced it.

    A single tensor is detached directly. Any other value is treated as an
    iterable of hidden states and is rebuilt, recursively, as a tuple of
    detached states.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

In [8]:
def get_batch(source, i):
    """Slice one (input, target) pair out of ``source`` starting at row ``i``.

    The input is at most ``sequence_length`` consecutive rows. The target is
    the same window shifted forward by one row and flattened to 1-D.
    """
    # Clamp the window so the shifted target slice never runs past the end.
    span = min(sequence_length, len(source) - 1 - i)
    stop = i + span
    inputs = source[i:stop]
    shifted = source[i + 1:stop + 1].view(-1)
    return inputs, shifted


def evaluate(data_source):
    """Return the average loss of ``model`` over ``data_source``.

    ``data_source`` is walked in chunks of at most ``sequence_length`` rows
    with gradients disabled, carrying the hidden state from chunk to chunk.
    Each chunk's loss is weighted by its number of rows and the sum is
    divided by ``len(data_source) - 1``.

    Args:
        data_source: a batchified tensor whose dim 1 is the batch dimension
            (the layout produced by ``batchify``).

    Returns:
        The weighted average loss as a Python float.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    # Size the initial hidden state from the data itself rather than from a
    # global, so the function also works for splits batchified with another
    # batch size.
    hidden = model.init_hidden(data_source.size(1))
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, sequence_length):
            inputs, targets = get_batch(data_source, i)
            output, hidden = model(inputs, hidden)
            hidden = repackage_hidden(hidden)
            # Weight by chunk length so a shorter final chunk counts
            # proportionally in the average.
            total_loss += len(inputs) * criterion(output, targets).item()
    return total_loss / (len(data_source) - 1)

In [9]:
def train():
    """Run one pass over ``train_data``, updating ``model`` with plain SGD.

    Takes no arguments: it reads the module-level ``lr``, ``epoch`` and
    ``log_interval`` at call time, along with ``model``, ``criterion``,
    ``train_data``, ``train_batch_size`` and ``sequence_length``. Every
    ``log_interval`` batches it prints the mean loss, its perplexity and the
    time per batch since the previous report.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(train_batch_size)

    for batch, i in enumerate(range(0, train_data.size(0) - 1, sequence_length)):
        inputs, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        hidden = repackage_hidden(hidden)
        output, hidden = model(inputs, hidden)

        loss = criterion(output, targets)
        loss.backward()

        # NOTE: no gradient clipping is applied here. Calling
        # torch.nn.utils.clip_grad_norm_ on model.parameters() before the
        # update is the usual guard against exploding gradients in RNNs.
        # Manual SGD step: p <- p - lr * grad. It runs under no_grad so the
        # in-place update is not recorded by autograd, and it skips parameters
        # that received no gradient in this backward pass.
        with torch.no_grad():
            for p in model.parameters():
                if p.grad is None:
                    continue
                p.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // sequence_length, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


In [10]:
# Loop over epochs.
lr = 2.5
n_epochs = 20
best_val_loss = None
log_interval = 1

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, n_epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # Compare against None explicitly so that a best loss of exactly 0.0
        # is not mistaken for "no best loss recorded yet".
        if best_val_loss is None or val_loss < best_val_loss:
            with open('ex1_Elman.torch', 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

| epoch   1 |     1/  743 batches | lr 2.50 | ms/batch 332.69 | loss 18.32 | ppl 90292192.60
| epoch   1 |     2/  743 batches | lr 2.50 | ms/batch 20.19 | loss  8.94 | ppl  7642.21
| epoch   1 |     3/  743 batches | lr 2.50 | ms/batch 20.06 | loss  8.51 | ppl  4983.76
| epoch   1 |     4/  743 batches | lr 2.50 | ms/batch 19.91 | loss  8.00 | ppl  2972.74
| epoch   1 |     5/  743 batches | lr 2.50 | ms/batch 19.99 | loss  8.20 | ppl  3641.72
| epoch   1 |     6/  743 batches | lr 2.50 | ms/batch 20.04 | loss  8.09 | ppl  3275.08
| epoch   1 |     7/  743 batches | lr 2.50 | ms/batch 20.59 | loss  7.76 | ppl  2349.30
| epoch   1 |     8/  743 batches | lr 2.50 | ms/batch 21.72 | loss  7.58 | ppl  1949.78
| epoch   1 |     9/  743 batches | lr 2.50 | ms/batch 21.13 | loss  7.58 | ppl  1965.41
| epoch   1 |    10/  743 batches | lr 2.50 | ms/batch 21.23 | loss  7.53 | ppl  1868.93
| epoch   1 |    11/  743 batches | lr 2.50 | ms/batch 23.78 | loss  7.69 | ppl  2176.01
| epoch   1 |    

| epoch   1 |    93/  743 batches | lr 2.50 | ms/batch 20.29 | loss  8.25 | ppl  3811.81
| epoch   1 |    94/  743 batches | lr 2.50 | ms/batch 20.79 | loss 10.64 | ppl 41593.46
| epoch   1 |    95/  743 batches | lr 2.50 | ms/batch 20.05 | loss  8.00 | ppl  2991.05
| epoch   1 |    96/  743 batches | lr 2.50 | ms/batch 19.92 | loss  8.30 | ppl  4017.05
| epoch   1 |    97/  743 batches | lr 2.50 | ms/batch 19.81 | loss  8.07 | ppl  3210.38
| epoch   1 |    98/  743 batches | lr 2.50 | ms/batch 19.69 | loss  8.32 | ppl  4114.10
| epoch   1 |    99/  743 batches | lr 2.50 | ms/batch 19.96 | loss  9.55 | ppl 14027.20
| epoch   1 |   100/  743 batches | lr 2.50 | ms/batch 19.88 | loss 11.69 | ppl 119214.73
| epoch   1 |   101/  743 batches | lr 2.50 | ms/batch 20.02 | loss  8.80 | ppl  6644.55
| epoch   1 |   102/  743 batches | lr 2.50 | ms/batch 20.10 | loss  9.15 | ppl  9458.58
| epoch   1 |   103/  743 batches | lr 2.50 | ms/batch 19.96 | loss  8.72 | ppl  6099.79
| epoch   1 |   104/

| epoch   1 |   190/  743 batches | lr 2.50 | ms/batch 21.37 | loss  7.39 | ppl  1625.37
| epoch   1 |   191/  743 batches | lr 2.50 | ms/batch 20.41 | loss  7.68 | ppl  2156.51
| epoch   1 |   192/  743 batches | lr 2.50 | ms/batch 20.03 | loss  7.92 | ppl  2758.47
| epoch   1 |   193/  743 batches | lr 2.50 | ms/batch 19.80 | loss  7.49 | ppl  1788.66
| epoch   1 |   194/  743 batches | lr 2.50 | ms/batch 19.75 | loss  7.61 | ppl  2024.13
| epoch   1 |   195/  743 batches | lr 2.50 | ms/batch 19.90 | loss  7.89 | ppl  2660.06
| epoch   1 |   196/  743 batches | lr 2.50 | ms/batch 20.11 | loss  7.63 | ppl  2053.77
| epoch   1 |   197/  743 batches | lr 2.50 | ms/batch 20.01 | loss  7.44 | ppl  1695.13
| epoch   1 |   198/  743 batches | lr 2.50 | ms/batch 19.87 | loss  7.16 | ppl  1280.80
| epoch   1 |   199/  743 batches | lr 2.50 | ms/batch 19.97 | loss  7.33 | ppl  1525.37
| epoch   1 |   200/  743 batches | lr 2.50 | ms/batch 19.72 | loss  7.08 | ppl  1188.60
| epoch   1 |   201/ 

| epoch   1 |   292/  743 batches | lr 2.50 | ms/batch 26.43 | loss  6.75 | ppl   850.63
| epoch   1 |   293/  743 batches | lr 2.50 | ms/batch 21.20 | loss  6.88 | ppl   975.24
| epoch   1 |   294/  743 batches | lr 2.50 | ms/batch 22.92 | loss  6.89 | ppl   986.70
| epoch   1 |   295/  743 batches | lr 2.50 | ms/batch 22.22 | loss  6.81 | ppl   909.14
| epoch   1 |   296/  743 batches | lr 2.50 | ms/batch 21.50 | loss  6.90 | ppl   988.33
| epoch   1 |   297/  743 batches | lr 2.50 | ms/batch 22.40 | loss  6.89 | ppl   977.50
| epoch   1 |   298/  743 batches | lr 2.50 | ms/batch 20.71 | loss  7.15 | ppl  1272.01
| epoch   1 |   299/  743 batches | lr 2.50 | ms/batch 20.44 | loss  7.02 | ppl  1121.56
| epoch   1 |   300/  743 batches | lr 2.50 | ms/batch 20.69 | loss  6.95 | ppl  1045.59
| epoch   1 |   301/  743 batches | lr 2.50 | ms/batch 20.98 | loss  6.82 | ppl   918.31
| epoch   1 |   302/  743 batches | lr 2.50 | ms/batch 21.60 | loss  6.81 | ppl   906.83
| epoch   1 |   303/ 

| epoch   1 |   394/  743 batches | lr 2.50 | ms/batch 23.17 | loss  6.72 | ppl   828.83
| epoch   1 |   395/  743 batches | lr 2.50 | ms/batch 22.00 | loss  6.80 | ppl   901.74
| epoch   1 |   396/  743 batches | lr 2.50 | ms/batch 21.79 | loss  7.00 | ppl  1094.78
| epoch   1 |   397/  743 batches | lr 2.50 | ms/batch 21.69 | loss  6.84 | ppl   934.80
| epoch   1 |   398/  743 batches | lr 2.50 | ms/batch 20.85 | loss  6.97 | ppl  1062.06
| epoch   1 |   399/  743 batches | lr 2.50 | ms/batch 20.75 | loss  6.85 | ppl   945.40
| epoch   1 |   400/  743 batches | lr 2.50 | ms/batch 20.67 | loss  6.85 | ppl   948.32
| epoch   1 |   401/  743 batches | lr 2.50 | ms/batch 20.43 | loss  6.68 | ppl   798.44
| epoch   1 |   402/  743 batches | lr 2.50 | ms/batch 20.61 | loss  6.78 | ppl   878.09
| epoch   1 |   403/  743 batches | lr 2.50 | ms/batch 20.52 | loss  6.82 | ppl   913.16
| epoch   1 |   404/  743 batches | lr 2.50 | ms/batch 21.29 | loss  6.86 | ppl   954.71
| epoch   1 |   405/ 

| epoch   1 |   494/  743 batches | lr 2.50 | ms/batch 25.96 | loss  6.75 | ppl   851.92
| epoch   1 |   495/  743 batches | lr 2.50 | ms/batch 22.33 | loss  6.56 | ppl   706.54
| epoch   1 |   496/  743 batches | lr 2.50 | ms/batch 21.90 | loss  6.71 | ppl   816.81
| epoch   1 |   497/  743 batches | lr 2.50 | ms/batch 22.56 | loss  6.67 | ppl   787.52
| epoch   1 |   498/  743 batches | lr 2.50 | ms/batch 22.57 | loss  6.81 | ppl   910.07
| epoch   1 |   499/  743 batches | lr 2.50 | ms/batch 21.66 | loss  6.76 | ppl   859.12
| epoch   1 |   500/  743 batches | lr 2.50 | ms/batch 21.79 | loss  6.68 | ppl   798.00
| epoch   1 |   501/  743 batches | lr 2.50 | ms/batch 21.74 | loss  6.74 | ppl   843.05
| epoch   1 |   502/  743 batches | lr 2.50 | ms/batch 22.22 | loss  6.75 | ppl   852.00
| epoch   1 |   503/  743 batches | lr 2.50 | ms/batch 21.45 | loss  6.65 | ppl   772.06
| epoch   1 |   504/  743 batches | lr 2.50 | ms/batch 23.66 | loss  6.67 | ppl   789.14
| epoch   1 |   505/ 

| epoch   1 |   593/  743 batches | lr 2.50 | ms/batch 24.14 | loss  6.64 | ppl   762.78
| epoch   1 |   594/  743 batches | lr 2.50 | ms/batch 22.86 | loss  6.60 | ppl   732.30
| epoch   1 |   595/  743 batches | lr 2.50 | ms/batch 22.73 | loss  6.60 | ppl   736.30
| epoch   1 |   596/  743 batches | lr 2.50 | ms/batch 21.69 | loss  6.57 | ppl   714.27
| epoch   1 |   597/  743 batches | lr 2.50 | ms/batch 22.86 | loss  6.60 | ppl   738.32
| epoch   1 |   598/  743 batches | lr 2.50 | ms/batch 23.03 | loss  6.79 | ppl   886.89
| epoch   1 |   599/  743 batches | lr 2.50 | ms/batch 22.12 | loss  6.96 | ppl  1049.53
| epoch   1 |   600/  743 batches | lr 2.50 | ms/batch 22.80 | loss  6.76 | ppl   866.08
| epoch   1 |   601/  743 batches | lr 2.50 | ms/batch 23.97 | loss  6.80 | ppl   900.35
| epoch   1 |   602/  743 batches | lr 2.50 | ms/batch 27.39 | loss  6.78 | ppl   881.27
| epoch   1 |   603/  743 batches | lr 2.50 | ms/batch 29.47 | loss  6.71 | ppl   821.82
| epoch   1 |   604/ 

| epoch   1 |   689/  743 batches | lr 2.50 | ms/batch 24.65 | loss  6.75 | ppl   855.27
| epoch   1 |   690/  743 batches | lr 2.50 | ms/batch 25.59 | loss  6.68 | ppl   798.41
| epoch   1 |   691/  743 batches | lr 2.50 | ms/batch 24.09 | loss  6.56 | ppl   708.22
| epoch   1 |   692/  743 batches | lr 2.50 | ms/batch 24.38 | loss  6.76 | ppl   862.75
| epoch   1 |   693/  743 batches | lr 2.50 | ms/batch 23.15 | loss  6.72 | ppl   828.06
| epoch   1 |   694/  743 batches | lr 2.50 | ms/batch 23.47 | loss  6.59 | ppl   725.49
| epoch   1 |   695/  743 batches | lr 2.50 | ms/batch 22.16 | loss  6.69 | ppl   802.32
| epoch   1 |   696/  743 batches | lr 2.50 | ms/batch 22.20 | loss  6.61 | ppl   739.65
| epoch   1 |   697/  743 batches | lr 2.50 | ms/batch 22.91 | loss  6.70 | ppl   809.74
| epoch   1 |   698/  743 batches | lr 2.50 | ms/batch 27.32 | loss  6.66 | ppl   779.77
| epoch   1 |   699/  743 batches | lr 2.50 | ms/batch 25.16 | loss  6.62 | ppl   747.17
| epoch   1 |   700/ 

| epoch   2 |    38/  743 batches | lr 2.50 | ms/batch 20.97 | loss  6.80 | ppl   896.28
| epoch   2 |    39/  743 batches | lr 2.50 | ms/batch 21.19 | loss  6.72 | ppl   829.67
| epoch   2 |    40/  743 batches | lr 2.50 | ms/batch 20.22 | loss  6.72 | ppl   825.45
| epoch   2 |    41/  743 batches | lr 2.50 | ms/batch 20.24 | loss  6.90 | ppl   990.89
| epoch   2 |    42/  743 batches | lr 2.50 | ms/batch 20.10 | loss  6.78 | ppl   882.41
| epoch   2 |    43/  743 batches | lr 2.50 | ms/batch 20.55 | loss  7.07 | ppl  1174.46
| epoch   2 |    44/  743 batches | lr 2.50 | ms/batch 20.60 | loss  7.09 | ppl  1200.26
| epoch   2 |    45/  743 batches | lr 2.50 | ms/batch 20.36 | loss  7.15 | ppl  1268.61
| epoch   2 |    46/  743 batches | lr 2.50 | ms/batch 20.40 | loss  7.18 | ppl  1313.87
| epoch   2 |    47/  743 batches | lr 2.50 | ms/batch 20.51 | loss  7.22 | ppl  1362.30
| epoch   2 |    48/  743 batches | lr 2.50 | ms/batch 22.04 | loss  7.13 | ppl  1252.90
| epoch   2 |    49/ 

| epoch   2 |   138/  743 batches | lr 2.50 | ms/batch 23.67 | loss  6.71 | ppl   818.73
| epoch   2 |   139/  743 batches | lr 2.50 | ms/batch 22.35 | loss  6.54 | ppl   691.06
| epoch   2 |   140/  743 batches | lr 2.50 | ms/batch 21.35 | loss  6.72 | ppl   831.44
| epoch   2 |   141/  743 batches | lr 2.50 | ms/batch 23.11 | loss  6.65 | ppl   770.46
| epoch   2 |   142/  743 batches | lr 2.50 | ms/batch 21.28 | loss  6.76 | ppl   862.75
| epoch   2 |   143/  743 batches | lr 2.50 | ms/batch 21.43 | loss  6.65 | ppl   773.32
| epoch   2 |   144/  743 batches | lr 2.50 | ms/batch 22.57 | loss  6.55 | ppl   702.02
| epoch   2 |   145/  743 batches | lr 2.50 | ms/batch 21.03 | loss  6.48 | ppl   649.71
| epoch   2 |   146/  743 batches | lr 2.50 | ms/batch 21.91 | loss  6.64 | ppl   761.93
| epoch   2 |   147/  743 batches | lr 2.50 | ms/batch 21.88 | loss  6.74 | ppl   843.77
| epoch   2 |   148/  743 batches | lr 2.50 | ms/batch 22.37 | loss  6.75 | ppl   851.40
| epoch   2 |   149/ 

| epoch   2 |   235/  743 batches | lr 2.50 | ms/batch 29.04 | loss  7.09 | ppl  1204.03
| epoch   2 |   236/  743 batches | lr 2.50 | ms/batch 26.29 | loss  6.93 | ppl  1023.10
| epoch   2 |   237/  743 batches | lr 2.50 | ms/batch 22.16 | loss  6.91 | ppl  1006.13
| epoch   2 |   238/  743 batches | lr 2.50 | ms/batch 23.40 | loss  6.78 | ppl   877.81
| epoch   2 |   239/  743 batches | lr 2.50 | ms/batch 23.34 | loss  6.72 | ppl   825.26
| epoch   2 |   240/  743 batches | lr 2.50 | ms/batch 23.12 | loss  6.70 | ppl   811.37
| epoch   2 |   241/  743 batches | lr 2.50 | ms/batch 22.26 | loss  6.72 | ppl   829.56
| epoch   2 |   242/  743 batches | lr 2.50 | ms/batch 22.44 | loss  6.71 | ppl   823.70
| epoch   2 |   243/  743 batches | lr 2.50 | ms/batch 22.76 | loss  6.89 | ppl   986.29
| epoch   2 |   244/  743 batches | lr 2.50 | ms/batch 23.77 | loss  6.87 | ppl   963.38
| epoch   2 |   245/  743 batches | lr 2.50 | ms/batch 25.72 | loss  6.94 | ppl  1035.19
| epoch   2 |   246/ 

| epoch   2 |   328/  743 batches | lr 2.50 | ms/batch 34.97 | loss  6.59 | ppl   726.44
| epoch   2 |   329/  743 batches | lr 2.50 | ms/batch 33.68 | loss  6.58 | ppl   719.95
| epoch   2 |   330/  743 batches | lr 2.50 | ms/batch 28.79 | loss  6.66 | ppl   783.66
| epoch   2 |   331/  743 batches | lr 2.50 | ms/batch 28.91 | loss  6.71 | ppl   817.42
| epoch   2 |   332/  743 batches | lr 2.50 | ms/batch 32.32 | loss  6.65 | ppl   769.76
| epoch   2 |   333/  743 batches | lr 2.50 | ms/batch 28.20 | loss  6.68 | ppl   792.68
| epoch   2 |   334/  743 batches | lr 2.50 | ms/batch 26.53 | loss  6.61 | ppl   739.67
| epoch   2 |   335/  743 batches | lr 2.50 | ms/batch 33.61 | loss  6.62 | ppl   748.12
| epoch   2 |   336/  743 batches | lr 2.50 | ms/batch 33.20 | loss  6.55 | ppl   702.49
| epoch   2 |   337/  743 batches | lr 2.50 | ms/batch 23.78 | loss  6.60 | ppl   732.24
| epoch   2 |   338/  743 batches | lr 2.50 | ms/batch 22.96 | loss  6.74 | ppl   846.57
| epoch   2 |   339/ 

| epoch   2 |   430/  743 batches | lr 2.50 | ms/batch 24.88 | loss  6.91 | ppl  1006.23
| epoch   2 |   431/  743 batches | lr 2.50 | ms/batch 22.51 | loss  6.76 | ppl   861.78
| epoch   2 |   432/  743 batches | lr 2.50 | ms/batch 21.98 | loss  6.82 | ppl   913.74
| epoch   2 |   433/  743 batches | lr 2.50 | ms/batch 21.47 | loss  6.58 | ppl   719.34
| epoch   2 |   434/  743 batches | lr 2.50 | ms/batch 21.34 | loss  6.77 | ppl   874.77
| epoch   2 |   435/  743 batches | lr 2.50 | ms/batch 21.39 | loss  6.74 | ppl   845.91
| epoch   2 |   436/  743 batches | lr 2.50 | ms/batch 21.61 | loss  7.30 | ppl  1475.00
| epoch   2 |   437/  743 batches | lr 2.50 | ms/batch 21.50 | loss  6.99 | ppl  1083.38
| epoch   2 |   438/  743 batches | lr 2.50 | ms/batch 22.04 | loss  7.16 | ppl  1287.93
| epoch   2 |   439/  743 batches | lr 2.50 | ms/batch 21.37 | loss  7.63 | ppl  2057.13
| epoch   2 |   440/  743 batches | lr 2.50 | ms/batch 21.84 | loss  8.41 | ppl  4511.85
| epoch   2 |   441/ 

| epoch   2 |   524/  743 batches | lr 2.50 | ms/batch 21.56 | loss  6.65 | ppl   774.37
| epoch   2 |   525/  743 batches | lr 2.50 | ms/batch 21.85 | loss  6.63 | ppl   759.19
| epoch   2 |   526/  743 batches | lr 2.50 | ms/batch 20.82 | loss  6.72 | ppl   829.59
| epoch   2 |   527/  743 batches | lr 2.50 | ms/batch 20.98 | loss  7.36 | ppl  1570.90
| epoch   2 |   528/  743 batches | lr 2.50 | ms/batch 20.93 | loss  8.98 | ppl  7951.22
| epoch   2 |   529/  743 batches | lr 2.50 | ms/batch 20.94 | loss  7.26 | ppl  1415.52
| epoch   2 |   530/  743 batches | lr 2.50 | ms/batch 20.97 | loss  7.27 | ppl  1440.53
| epoch   2 |   531/  743 batches | lr 2.50 | ms/batch 21.13 | loss  7.02 | ppl  1119.25
| epoch   2 |   532/  743 batches | lr 2.50 | ms/batch 21.13 | loss  7.04 | ppl  1136.51
| epoch   2 |   533/  743 batches | lr 2.50 | ms/batch 21.09 | loss  6.95 | ppl  1045.37
| epoch   2 |   534/  743 batches | lr 2.50 | ms/batch 23.37 | loss  6.89 | ppl   983.94
| epoch   2 |   535/ 

| epoch   2 |   624/  743 batches | lr 2.50 | ms/batch 21.45 | loss  6.71 | ppl   822.35
| epoch   2 |   625/  743 batches | lr 2.50 | ms/batch 21.82 | loss  6.52 | ppl   676.49
| epoch   2 |   626/  743 batches | lr 2.50 | ms/batch 21.26 | loss  6.69 | ppl   801.19
| epoch   2 |   627/  743 batches | lr 2.50 | ms/batch 21.85 | loss  6.80 | ppl   898.63
| epoch   2 |   628/  743 batches | lr 2.50 | ms/batch 20.94 | loss  6.91 | ppl  1003.56
| epoch   2 |   629/  743 batches | lr 2.50 | ms/batch 21.36 | loss  6.92 | ppl  1014.28
| epoch   2 |   630/  743 batches | lr 2.50 | ms/batch 20.89 | loss  6.79 | ppl   887.78
| epoch   2 |   631/  743 batches | lr 2.50 | ms/batch 20.63 | loss  6.87 | ppl   960.98
| epoch   2 |   632/  743 batches | lr 2.50 | ms/batch 20.40 | loss  6.66 | ppl   777.59
| epoch   2 |   633/  743 batches | lr 2.50 | ms/batch 20.98 | loss  6.67 | ppl   792.21
| epoch   2 |   634/  743 batches | lr 2.50 | ms/batch 21.67 | loss  6.70 | ppl   815.95
| epoch   2 |   635/ 

| epoch   2 |   718/  743 batches | lr 2.50 | ms/batch 23.47 | loss  8.31 | ppl  4047.79
| epoch   2 |   719/  743 batches | lr 2.50 | ms/batch 24.93 | loss  8.09 | ppl  3265.36
| epoch   2 |   720/  743 batches | lr 2.50 | ms/batch 23.77 | loss  8.02 | ppl  3037.99
| epoch   2 |   721/  743 batches | lr 2.50 | ms/batch 23.53 | loss  7.94 | ppl  2805.78
| epoch   2 |   722/  743 batches | lr 2.50 | ms/batch 24.07 | loss  7.90 | ppl  2706.86
| epoch   2 |   723/  743 batches | lr 2.50 | ms/batch 27.15 | loss  7.62 | ppl  2041.06
| epoch   2 |   724/  743 batches | lr 2.50 | ms/batch 30.68 | loss  7.73 | ppl  2265.50
| epoch   2 |   725/  743 batches | lr 2.50 | ms/batch 29.69 | loss  7.32 | ppl  1509.47
| epoch   2 |   726/  743 batches | lr 2.50 | ms/batch 32.23 | loss  7.47 | ppl  1758.89
| epoch   2 |   727/  743 batches | lr 2.50 | ms/batch 28.19 | loss  7.09 | ppl  1201.63
| epoch   2 |   728/  743 batches | lr 2.50 | ms/batch 26.68 | loss  7.16 | ppl  1282.36
| epoch   2 |   729/ 

| epoch   3 |    66/  743 batches | lr 0.62 | ms/batch 24.01 | loss  6.58 | ppl   718.38
| epoch   3 |    67/  743 batches | lr 0.62 | ms/batch 25.36 | loss  6.77 | ppl   875.54
| epoch   3 |    68/  743 batches | lr 0.62 | ms/batch 23.14 | loss  6.67 | ppl   790.96
| epoch   3 |    69/  743 batches | lr 0.62 | ms/batch 24.72 | loss  6.60 | ppl   732.91
| epoch   3 |    70/  743 batches | lr 0.62 | ms/batch 24.27 | loss  6.61 | ppl   743.92
| epoch   3 |    71/  743 batches | lr 0.62 | ms/batch 22.88 | loss  6.55 | ppl   700.35
| epoch   3 |    72/  743 batches | lr 0.62 | ms/batch 22.90 | loss  6.52 | ppl   677.24
| epoch   3 |    73/  743 batches | lr 0.62 | ms/batch 25.86 | loss  6.47 | ppl   642.98
| epoch   3 |    74/  743 batches | lr 0.62 | ms/batch 23.04 | loss  6.65 | ppl   770.63
| epoch   3 |    75/  743 batches | lr 0.62 | ms/batch 21.69 | loss  6.53 | ppl   686.70
| epoch   3 |    76/  743 batches | lr 0.62 | ms/batch 24.05 | loss  6.60 | ppl   733.05
| epoch   3 |    77/ 

| epoch   3 |   167/  743 batches | lr 0.62 | ms/batch 24.97 | loss  6.63 | ppl   761.05
| epoch   3 |   168/  743 batches | lr 0.62 | ms/batch 23.17 | loss  6.63 | ppl   757.96
| epoch   3 |   169/  743 batches | lr 0.62 | ms/batch 22.77 | loss  6.61 | ppl   743.97
| epoch   3 |   170/  743 batches | lr 0.62 | ms/batch 22.98 | loss  6.51 | ppl   673.38
| epoch   3 |   171/  743 batches | lr 0.62 | ms/batch 21.97 | loss  6.64 | ppl   768.53
| epoch   3 |   172/  743 batches | lr 0.62 | ms/batch 23.30 | loss  6.64 | ppl   765.28
| epoch   3 |   173/  743 batches | lr 0.62 | ms/batch 22.13 | loss  6.39 | ppl   596.29
| epoch   3 |   174/  743 batches | lr 0.62 | ms/batch 21.55 | loss  6.59 | ppl   729.70
| epoch   3 |   175/  743 batches | lr 0.62 | ms/batch 21.57 | loss  6.62 | ppl   752.45
| epoch   3 |   176/  743 batches | lr 0.62 | ms/batch 22.69 | loss  6.62 | ppl   747.14
| epoch   3 |   177/  743 batches | lr 0.62 | ms/batch 25.84 | loss  6.46 | ppl   638.94
| epoch   3 |   178/ 

| epoch   3 |   266/  743 batches | lr 0.62 | ms/batch 23.53 | loss  6.56 | ppl   706.79
| epoch   3 |   267/  743 batches | lr 0.62 | ms/batch 26.36 | loss  6.43 | ppl   618.27
| epoch   3 |   268/  743 batches | lr 0.62 | ms/batch 23.53 | loss  6.48 | ppl   649.51
| epoch   3 |   269/  743 batches | lr 0.62 | ms/batch 22.30 | loss  6.48 | ppl   652.39
| epoch   3 |   270/  743 batches | lr 0.62 | ms/batch 22.98 | loss  6.52 | ppl   680.20
| epoch   3 |   271/  743 batches | lr 0.62 | ms/batch 23.38 | loss  6.41 | ppl   605.06
| epoch   3 |   272/  743 batches | lr 0.62 | ms/batch 22.22 | loss  6.60 | ppl   737.12
| epoch   3 |   273/  743 batches | lr 0.62 | ms/batch 21.94 | loss  6.57 | ppl   715.30
| epoch   3 |   274/  743 batches | lr 0.62 | ms/batch 21.97 | loss  6.56 | ppl   709.49
| epoch   3 |   275/  743 batches | lr 0.62 | ms/batch 24.11 | loss  6.53 | ppl   684.74
| epoch   3 |   276/  743 batches | lr 0.62 | ms/batch 24.46 | loss  6.51 | ppl   671.32
| epoch   3 |   277/ 

| epoch   3 |   365/  743 batches | lr 0.62 | ms/batch 21.99 | loss  6.61 | ppl   740.11
| epoch   3 |   366/  743 batches | lr 0.62 | ms/batch 22.33 | loss  6.53 | ppl   688.51
| epoch   3 |   367/  743 batches | lr 0.62 | ms/batch 21.04 | loss  6.45 | ppl   635.50
| epoch   3 |   368/  743 batches | lr 0.62 | ms/batch 20.90 | loss  6.61 | ppl   745.72
| epoch   3 |   369/  743 batches | lr 0.62 | ms/batch 20.98 | loss  6.63 | ppl   756.51
| epoch   3 |   370/  743 batches | lr 0.62 | ms/batch 21.03 | loss  6.44 | ppl   625.25
| epoch   3 |   371/  743 batches | lr 0.62 | ms/batch 21.01 | loss  6.58 | ppl   718.82
| epoch   3 |   372/  743 batches | lr 0.62 | ms/batch 21.23 | loss  6.63 | ppl   757.64
| epoch   3 |   373/  743 batches | lr 0.62 | ms/batch 21.21 | loss  6.59 | ppl   727.80
| epoch   3 |   374/  743 batches | lr 0.62 | ms/batch 20.93 | loss  6.54 | ppl   690.98
| epoch   3 |   375/  743 batches | lr 0.62 | ms/batch 22.46 | loss  6.51 | ppl   669.96
| epoch   3 |   376/ 

| epoch   3 |   467/  743 batches | lr 0.62 | ms/batch 22.82 | loss  6.56 | ppl   707.71
| epoch   3 |   468/  743 batches | lr 0.62 | ms/batch 21.18 | loss  6.56 | ppl   705.20
| epoch   3 |   469/  743 batches | lr 0.62 | ms/batch 20.49 | loss  6.62 | ppl   753.58
| epoch   3 |   470/  743 batches | lr 0.62 | ms/batch 20.65 | loss  6.52 | ppl   680.92
| epoch   3 |   471/  743 batches | lr 0.62 | ms/batch 20.35 | loss  6.53 | ppl   685.25
| epoch   3 |   472/  743 batches | lr 0.62 | ms/batch 20.40 | loss  6.56 | ppl   703.73
| epoch   3 |   473/  743 batches | lr 0.62 | ms/batch 21.02 | loss  6.61 | ppl   740.21
| epoch   3 |   474/  743 batches | lr 0.62 | ms/batch 20.83 | loss  6.55 | ppl   702.62
| epoch   3 |   475/  743 batches | lr 0.62 | ms/batch 20.84 | loss  6.53 | ppl   682.64
| epoch   3 |   476/  743 batches | lr 0.62 | ms/batch 20.92 | loss  6.53 | ppl   687.27
| epoch   3 |   477/  743 batches | lr 0.62 | ms/batch 21.64 | loss  6.50 | ppl   668.43
| epoch   3 |   478/ 

| epoch   3 |   567/  743 batches | lr 0.62 | ms/batch 21.29 | loss  6.49 | ppl   657.43
| epoch   3 |   568/  743 batches | lr 0.62 | ms/batch 22.33 | loss  6.74 | ppl   844.30
| epoch   3 |   569/  743 batches | lr 0.62 | ms/batch 21.14 | loss  6.38 | ppl   592.31
| epoch   3 |   570/  743 batches | lr 0.62 | ms/batch 21.22 | loss  6.65 | ppl   773.61
| epoch   3 |   571/  743 batches | lr 0.62 | ms/batch 21.48 | loss  6.64 | ppl   761.83
| epoch   3 |   572/  743 batches | lr 0.62 | ms/batch 21.94 | loss  6.53 | ppl   684.48
| epoch   3 |   573/  743 batches | lr 0.62 | ms/batch 21.49 | loss  6.58 | ppl   723.87
| epoch   3 |   574/  743 batches | lr 0.62 | ms/batch 21.33 | loss  6.55 | ppl   696.94
| epoch   3 |   575/  743 batches | lr 0.62 | ms/batch 21.17 | loss  6.57 | ppl   712.37
| epoch   3 |   576/  743 batches | lr 0.62 | ms/batch 21.13 | loss  6.56 | ppl   709.07
| epoch   3 |   577/  743 batches | lr 0.62 | ms/batch 21.44 | loss  6.51 | ppl   674.42
| epoch   3 |   578/ 

| epoch   3 |   663/  743 batches | lr 0.62 | ms/batch 23.73 | loss  6.49 | ppl   660.24
| epoch   3 |   664/  743 batches | lr 0.62 | ms/batch 23.36 | loss  6.57 | ppl   713.52
| epoch   3 |   665/  743 batches | lr 0.62 | ms/batch 24.10 | loss  6.58 | ppl   717.47
| epoch   3 |   666/  743 batches | lr 0.62 | ms/batch 23.43 | loss  6.63 | ppl   755.51
| epoch   3 |   667/  743 batches | lr 0.62 | ms/batch 24.03 | loss  6.48 | ppl   654.46
| epoch   3 |   668/  743 batches | lr 0.62 | ms/batch 23.60 | loss  6.42 | ppl   616.83
| epoch   3 |   669/  743 batches | lr 0.62 | ms/batch 25.09 | loss  6.57 | ppl   716.14
| epoch   3 |   670/  743 batches | lr 0.62 | ms/batch 22.49 | loss  6.50 | ppl   667.34
| epoch   3 |   671/  743 batches | lr 0.62 | ms/batch 22.47 | loss  6.47 | ppl   645.22
| epoch   3 |   672/  743 batches | lr 0.62 | ms/batch 23.63 | loss  6.58 | ppl   722.11
| epoch   3 |   673/  743 batches | lr 0.62 | ms/batch 24.92 | loss  6.42 | ppl   613.14
| epoch   3 |   674/ 

| epoch   4 |    19/  743 batches | lr 0.62 | ms/batch 22.97 | loss  6.67 | ppl   785.01
| epoch   4 |    20/  743 batches | lr 0.62 | ms/batch 22.81 | loss  6.50 | ppl   664.67
| epoch   4 |    21/  743 batches | lr 0.62 | ms/batch 22.65 | loss  6.45 | ppl   634.37
| epoch   4 |    22/  743 batches | lr 0.62 | ms/batch 22.69 | loss  6.57 | ppl   710.76
| epoch   4 |    23/  743 batches | lr 0.62 | ms/batch 22.49 | loss  6.49 | ppl   656.29
| epoch   4 |    24/  743 batches | lr 0.62 | ms/batch 22.83 | loss  6.56 | ppl   706.47
| epoch   4 |    25/  743 batches | lr 0.62 | ms/batch 22.50 | loss  6.51 | ppl   671.25
| epoch   4 |    26/  743 batches | lr 0.62 | ms/batch 21.88 | loss  6.47 | ppl   648.44
| epoch   4 |    27/  743 batches | lr 0.62 | ms/batch 22.48 | loss  6.45 | ppl   629.61
| epoch   4 |    28/  743 batches | lr 0.62 | ms/batch 22.38 | loss  6.49 | ppl   661.30
| epoch   4 |    29/  743 batches | lr 0.62 | ms/batch 23.79 | loss  6.58 | ppl   717.49
| epoch   4 |    30/ 

| epoch   4 |   118/  743 batches | lr 0.62 | ms/batch 25.60 | loss  6.50 | ppl   663.75
| epoch   4 |   119/  743 batches | lr 0.62 | ms/batch 25.12 | loss  6.53 | ppl   683.61
| epoch   4 |   120/  743 batches | lr 0.62 | ms/batch 25.13 | loss  6.61 | ppl   745.28
| epoch   4 |   121/  743 batches | lr 0.62 | ms/batch 23.66 | loss  6.54 | ppl   689.51
| epoch   4 |   122/  743 batches | lr 0.62 | ms/batch 34.35 | loss  6.58 | ppl   721.21
| epoch   4 |   123/  743 batches | lr 0.62 | ms/batch 25.16 | loss  6.68 | ppl   792.50
| epoch   4 |   124/  743 batches | lr 0.62 | ms/batch 43.16 | loss  6.64 | ppl   766.28
| epoch   4 |   125/  743 batches | lr 0.62 | ms/batch 49.64 | loss  6.58 | ppl   719.83
| epoch   4 |   126/  743 batches | lr 0.62 | ms/batch 39.39 | loss  6.63 | ppl   756.35
| epoch   4 |   127/  743 batches | lr 0.62 | ms/batch 41.80 | loss  6.71 | ppl   816.94
| epoch   4 |   128/  743 batches | lr 0.62 | ms/batch 35.45 | loss  6.55 | ppl   698.97
| epoch   4 |   129/ 

| epoch   4 |   217/  743 batches | lr 0.62 | ms/batch 22.31 | loss  6.46 | ppl   641.09
| epoch   4 |   218/  743 batches | lr 0.62 | ms/batch 23.48 | loss  6.41 | ppl   610.93
| epoch   4 |   219/  743 batches | lr 0.62 | ms/batch 22.03 | loss  6.48 | ppl   648.88
| epoch   4 |   220/  743 batches | lr 0.62 | ms/batch 24.32 | loss  6.61 | ppl   744.68
| epoch   4 |   221/  743 batches | lr 0.62 | ms/batch 24.51 | loss  6.65 | ppl   776.06
| epoch   4 |   222/  743 batches | lr 0.62 | ms/batch 23.38 | loss  6.43 | ppl   618.66
| epoch   4 |   223/  743 batches | lr 0.62 | ms/batch 23.85 | loss  6.39 | ppl   594.36
| epoch   4 |   224/  743 batches | lr 0.62 | ms/batch 22.14 | loss  6.72 | ppl   827.11
| epoch   4 |   225/  743 batches | lr 0.62 | ms/batch 22.79 | loss  6.46 | ppl   636.23
| epoch   4 |   226/  743 batches | lr 0.62 | ms/batch 23.84 | loss  6.47 | ppl   647.13
| epoch   4 |   227/  743 batches | lr 0.62 | ms/batch 24.49 | loss  6.44 | ppl   623.59
| epoch   4 |   228/ 

| epoch   4 |   310/  743 batches | lr 0.62 | ms/batch 41.26 | loss  6.57 | ppl   715.73
| epoch   4 |   311/  743 batches | lr 0.62 | ms/batch 31.83 | loss  6.51 | ppl   668.91
| epoch   4 |   312/  743 batches | lr 0.62 | ms/batch 30.23 | loss  6.45 | ppl   634.14
| epoch   4 |   313/  743 batches | lr 0.62 | ms/batch 31.32 | loss  6.53 | ppl   683.86
| epoch   4 |   314/  743 batches | lr 0.62 | ms/batch 31.93 | loss  6.46 | ppl   638.04
| epoch   4 |   315/  743 batches | lr 0.62 | ms/batch 38.21 | loss  6.56 | ppl   704.04
| epoch   4 |   316/  743 batches | lr 0.62 | ms/batch 48.22 | loss  6.48 | ppl   650.97
| epoch   4 |   317/  743 batches | lr 0.62 | ms/batch 44.93 | loss  6.66 | ppl   778.68
| epoch   4 |   318/  743 batches | lr 0.62 | ms/batch 39.46 | loss  6.54 | ppl   692.71
| epoch   4 |   319/  743 batches | lr 0.62 | ms/batch 43.85 | loss  6.55 | ppl   696.61
| epoch   4 |   320/  743 batches | lr 0.62 | ms/batch 36.97 | loss  6.67 | ppl   790.08
| epoch   4 |   321/ 

| epoch   4 |   408/  743 batches | lr 0.62 | ms/batch 33.74 | loss  6.41 | ppl   606.43
| epoch   4 |   409/  743 batches | lr 0.62 | ms/batch 31.56 | loss  6.57 | ppl   715.11
| epoch   4 |   410/  743 batches | lr 0.62 | ms/batch 31.94 | loss  6.53 | ppl   684.33
| epoch   4 |   411/  743 batches | lr 0.62 | ms/batch 30.15 | loss  6.62 | ppl   753.57
| epoch   4 |   412/  743 batches | lr 0.62 | ms/batch 28.43 | loss  6.57 | ppl   716.68
| epoch   4 |   413/  743 batches | lr 0.62 | ms/batch 26.93 | loss  6.46 | ppl   637.12
| epoch   4 |   414/  743 batches | lr 0.62 | ms/batch 27.40 | loss  6.52 | ppl   677.53
| epoch   4 |   415/  743 batches | lr 0.62 | ms/batch 43.37 | loss  6.43 | ppl   619.82
| epoch   4 |   416/  743 batches | lr 0.62 | ms/batch 29.34 | loss  6.59 | ppl   726.52
| epoch   4 |   417/  743 batches | lr 0.62 | ms/batch 25.96 | loss  6.48 | ppl   650.78
| epoch   4 |   418/  743 batches | lr 0.62 | ms/batch 26.56 | loss  6.54 | ppl   690.21
| epoch   4 |   419/ 

| epoch   4 |   503/  743 batches | lr 0.62 | ms/batch 25.62 | loss  6.50 | ppl   667.52
| epoch   4 |   504/  743 batches | lr 0.62 | ms/batch 26.31 | loss  6.59 | ppl   730.09
| epoch   4 |   505/  743 batches | lr 0.62 | ms/batch 23.82 | loss  6.67 | ppl   791.66
| epoch   4 |   506/  743 batches | lr 0.62 | ms/batch 25.70 | loss  6.64 | ppl   767.86
| epoch   4 |   507/  743 batches | lr 0.62 | ms/batch 23.18 | loss  6.74 | ppl   844.08
| epoch   4 |   508/  743 batches | lr 0.62 | ms/batch 23.32 | loss  6.63 | ppl   756.24
| epoch   4 |   509/  743 batches | lr 0.62 | ms/batch 23.21 | loss  6.63 | ppl   761.22
| epoch   4 |   510/  743 batches | lr 0.62 | ms/batch 23.80 | loss  6.55 | ppl   699.85
| epoch   4 |   511/  743 batches | lr 0.62 | ms/batch 24.09 | loss  6.57 | ppl   714.43
| epoch   4 |   512/  743 batches | lr 0.62 | ms/batch 24.14 | loss  6.57 | ppl   716.27
| epoch   4 |   513/  743 batches | lr 0.62 | ms/batch 23.78 | loss  6.57 | ppl   716.18
| epoch   4 |   514/ 

| epoch   4 |   604/  743 batches | lr 0.62 | ms/batch 25.19 | loss  6.53 | ppl   686.80
| epoch   4 |   605/  743 batches | lr 0.62 | ms/batch 27.21 | loss  6.61 | ppl   741.31
| epoch   4 |   606/  743 batches | lr 0.62 | ms/batch 24.04 | loss  6.47 | ppl   642.78
| epoch   4 |   607/  743 batches | lr 0.62 | ms/batch 23.76 | loss  6.54 | ppl   691.84
| epoch   4 |   608/  743 batches | lr 0.62 | ms/batch 23.36 | loss  6.56 | ppl   705.46
| epoch   4 |   609/  743 batches | lr 0.62 | ms/batch 23.63 | loss  6.51 | ppl   671.25
| epoch   4 |   610/  743 batches | lr 0.62 | ms/batch 23.52 | loss  6.46 | ppl   638.94
| epoch   4 |   611/  743 batches | lr 0.62 | ms/batch 24.81 | loss  6.56 | ppl   709.23
| epoch   4 |   612/  743 batches | lr 0.62 | ms/batch 23.93 | loss  6.40 | ppl   604.27
| epoch   4 |   613/  743 batches | lr 0.62 | ms/batch 30.56 | loss  6.48 | ppl   651.80
| epoch   4 |   614/  743 batches | lr 0.62 | ms/batch 23.71 | loss  6.54 | ppl   692.00
| epoch   4 |   615/ 

| epoch   4 |   699/  743 batches | lr 0.62 | ms/batch 37.06 | loss  6.42 | ppl   611.04
| epoch   4 |   700/  743 batches | lr 0.62 | ms/batch 33.56 | loss  6.47 | ppl   646.10
| epoch   4 |   701/  743 batches | lr 0.62 | ms/batch 29.77 | loss  6.44 | ppl   627.87
| epoch   4 |   702/  743 batches | lr 0.62 | ms/batch 27.74 | loss  6.54 | ppl   693.49
| epoch   4 |   703/  743 batches | lr 0.62 | ms/batch 31.23 | loss  6.48 | ppl   648.79
| epoch   4 |   704/  743 batches | lr 0.62 | ms/batch 26.89 | loss  6.51 | ppl   671.44
| epoch   4 |   705/  743 batches | lr 0.62 | ms/batch 29.92 | loss  6.56 | ppl   703.52
| epoch   4 |   706/  743 batches | lr 0.62 | ms/batch 40.56 | loss  6.46 | ppl   641.70
| epoch   4 |   707/  743 batches | lr 0.62 | ms/batch 37.69 | loss  6.44 | ppl   625.12
| epoch   4 |   708/  743 batches | lr 0.62 | ms/batch 27.50 | loss  6.62 | ppl   750.46
| epoch   4 |   709/  743 batches | lr 0.62 | ms/batch 27.76 | loss  6.50 | ppl   664.99
| epoch   4 |   710/ 

| epoch   5 |    51/  743 batches | lr 0.62 | ms/batch 21.82 | loss  6.50 | ppl   664.75
| epoch   5 |    52/  743 batches | lr 0.62 | ms/batch 22.53 | loss  6.55 | ppl   700.96
| epoch   5 |    53/  743 batches | lr 0.62 | ms/batch 22.10 | loss  6.60 | ppl   732.81
| epoch   5 |    54/  743 batches | lr 0.62 | ms/batch 21.05 | loss  6.59 | ppl   725.60
| epoch   5 |    55/  743 batches | lr 0.62 | ms/batch 21.13 | loss  6.53 | ppl   685.66
| epoch   5 |    56/  743 batches | lr 0.62 | ms/batch 21.58 | loss  6.47 | ppl   644.92
| epoch   5 |    57/  743 batches | lr 0.62 | ms/batch 21.05 | loss  6.57 | ppl   712.42
| epoch   5 |    58/  743 batches | lr 0.62 | ms/batch 21.83 | loss  6.72 | ppl   830.57
| epoch   5 |    59/  743 batches | lr 0.62 | ms/batch 20.89 | loss  6.55 | ppl   699.18
| epoch   5 |    60/  743 batches | lr 0.62 | ms/batch 20.59 | loss  6.60 | ppl   735.93
| epoch   5 |    61/  743 batches | lr 0.62 | ms/batch 21.64 | loss  6.64 | ppl   762.09
| epoch   5 |    62/ 

| epoch   5 |   149/  743 batches | lr 0.62 | ms/batch 22.60 | loss  6.64 | ppl   761.61
| epoch   5 |   150/  743 batches | lr 0.62 | ms/batch 24.27 | loss  6.58 | ppl   717.91
| epoch   5 |   151/  743 batches | lr 0.62 | ms/batch 23.92 | loss  6.63 | ppl   756.29
| epoch   5 |   152/  743 batches | lr 0.62 | ms/batch 24.89 | loss  6.44 | ppl   627.42
| epoch   5 |   153/  743 batches | lr 0.62 | ms/batch 27.33 | loss  6.53 | ppl   685.72
| epoch   5 |   154/  743 batches | lr 0.62 | ms/batch 25.85 | loss  6.49 | ppl   660.87
| epoch   5 |   155/  743 batches | lr 0.62 | ms/batch 46.21 | loss  6.53 | ppl   685.46
| epoch   5 |   156/  743 batches | lr 0.62 | ms/batch 35.09 | loss  6.35 | ppl   571.08
| epoch   5 |   157/  743 batches | lr 0.62 | ms/batch 35.95 | loss  6.47 | ppl   643.61
| epoch   5 |   158/  743 batches | lr 0.62 | ms/batch 29.31 | loss  6.44 | ppl   627.45
| epoch   5 |   159/  743 batches | lr 0.62 | ms/batch 27.80 | loss  6.49 | ppl   660.30
| epoch   5 |   160/ 

| epoch   5 |   246/  743 batches | lr 0.62 | ms/batch 33.54 | loss  6.47 | ppl   648.48
| epoch   5 |   247/  743 batches | lr 0.62 | ms/batch 41.94 | loss  6.59 | ppl   725.26
| epoch   5 |   248/  743 batches | lr 0.62 | ms/batch 27.26 | loss  6.59 | ppl   728.87
| epoch   5 |   249/  743 batches | lr 0.62 | ms/batch 29.28 | loss  6.45 | ppl   635.35
| epoch   5 |   250/  743 batches | lr 0.62 | ms/batch 48.91 | loss  6.43 | ppl   620.97
| epoch   5 |   251/  743 batches | lr 0.62 | ms/batch 53.16 | loss  6.44 | ppl   623.81
| epoch   5 |   252/  743 batches | lr 0.62 | ms/batch 52.53 | loss  6.56 | ppl   708.02
| epoch   5 |   253/  743 batches | lr 0.62 | ms/batch 51.29 | loss  6.43 | ppl   618.73
| epoch   5 |   254/  743 batches | lr 0.62 | ms/batch 35.55 | loss  6.35 | ppl   574.95
| epoch   5 |   255/  743 batches | lr 0.62 | ms/batch 39.39 | loss  6.51 | ppl   675.10
| epoch   5 |   256/  743 batches | lr 0.62 | ms/batch 33.75 | loss  6.39 | ppl   593.34
| epoch   5 |   257/ 

| epoch   5 |   341/  743 batches | lr 0.62 | ms/batch 25.05 | loss  6.49 | ppl   657.64
| epoch   5 |   342/  743 batches | lr 0.62 | ms/batch 22.55 | loss  6.52 | ppl   677.99
| epoch   5 |   343/  743 batches | lr 0.62 | ms/batch 22.82 | loss  6.49 | ppl   661.68
| epoch   5 |   344/  743 batches | lr 0.62 | ms/batch 23.16 | loss  6.38 | ppl   588.74
| epoch   5 |   345/  743 batches | lr 0.62 | ms/batch 22.71 | loss  6.57 | ppl   711.23
| epoch   5 |   346/  743 batches | lr 0.62 | ms/batch 29.21 | loss  6.53 | ppl   684.18
| epoch   5 |   347/  743 batches | lr 0.62 | ms/batch 25.94 | loss  6.49 | ppl   661.17
| epoch   5 |   348/  743 batches | lr 0.62 | ms/batch 24.80 | loss  6.47 | ppl   646.57
| epoch   5 |   349/  743 batches | lr 0.62 | ms/batch 25.00 | loss  6.55 | ppl   695.89
| epoch   5 |   350/  743 batches | lr 0.62 | ms/batch 45.68 | loss  6.42 | ppl   611.35
| epoch   5 |   351/  743 batches | lr 0.62 | ms/batch 34.46 | loss  6.45 | ppl   630.41
| epoch   5 |   352/ 

| epoch   5 |   439/  743 batches | lr 0.62 | ms/batch 39.29 | loss  6.60 | ppl   734.75
| epoch   5 |   440/  743 batches | lr 0.62 | ms/batch 41.15 | loss  6.47 | ppl   645.96
| epoch   5 |   441/  743 batches | lr 0.62 | ms/batch 38.31 | loss  6.51 | ppl   670.09
| epoch   5 |   442/  743 batches | lr 0.62 | ms/batch 49.25 | loss  6.54 | ppl   689.32
| epoch   5 |   443/  743 batches | lr 0.62 | ms/batch 29.64 | loss  6.67 | ppl   788.12
| epoch   5 |   444/  743 batches | lr 0.62 | ms/batch 26.99 | loss  6.47 | ppl   646.26
| epoch   5 |   445/  743 batches | lr 0.62 | ms/batch 25.35 | loss  6.55 | ppl   702.43
| epoch   5 |   446/  743 batches | lr 0.62 | ms/batch 40.39 | loss  6.55 | ppl   696.05
| epoch   5 |   447/  743 batches | lr 0.62 | ms/batch 28.82 | loss  6.63 | ppl   756.98
| epoch   5 |   448/  743 batches | lr 0.62 | ms/batch 23.91 | loss  6.51 | ppl   672.04
| epoch   5 |   449/  743 batches | lr 0.62 | ms/batch 25.50 | loss  6.49 | ppl   659.89
| epoch   5 |   450/ 

| epoch   5 |   533/  743 batches | lr 0.62 | ms/batch 25.17 | loss  6.48 | ppl   649.91
| epoch   5 |   534/  743 batches | lr 0.62 | ms/batch 22.81 | loss  6.51 | ppl   673.81
| epoch   5 |   535/  743 batches | lr 0.62 | ms/batch 21.20 | loss  6.46 | ppl   639.24
| epoch   5 |   536/  743 batches | lr 0.62 | ms/batch 21.37 | loss  6.50 | ppl   664.24
| epoch   5 |   537/  743 batches | lr 0.62 | ms/batch 21.39 | loss  6.61 | ppl   745.46
| epoch   5 |   538/  743 batches | lr 0.62 | ms/batch 21.06 | loss  6.44 | ppl   624.05
| epoch   5 |   539/  743 batches | lr 0.62 | ms/batch 21.75 | loss  6.44 | ppl   626.11
| epoch   5 |   540/  743 batches | lr 0.62 | ms/batch 21.37 | loss  6.48 | ppl   652.90
| epoch   5 |   541/  743 batches | lr 0.62 | ms/batch 21.28 | loss  6.48 | ppl   654.12
| epoch   5 |   542/  743 batches | lr 0.62 | ms/batch 22.14 | loss  6.43 | ppl   622.67
| epoch   5 |   543/  743 batches | lr 0.62 | ms/batch 21.43 | loss  6.38 | ppl   589.18
| epoch   5 |   544/ 

| epoch   5 |   633/  743 batches | lr 0.62 | ms/batch 22.53 | loss  6.50 | ppl   663.56
| epoch   5 |   634/  743 batches | lr 0.62 | ms/batch 23.93 | loss  6.52 | ppl   678.08
| epoch   5 |   635/  743 batches | lr 0.62 | ms/batch 22.68 | loss  6.49 | ppl   656.45
| epoch   5 |   636/  743 batches | lr 0.62 | ms/batch 24.39 | loss  6.43 | ppl   619.34
| epoch   5 |   637/  743 batches | lr 0.62 | ms/batch 24.90 | loss  6.43 | ppl   622.40
| epoch   5 |   638/  743 batches | lr 0.62 | ms/batch 24.42 | loss  6.55 | ppl   698.95
| epoch   5 |   639/  743 batches | lr 0.62 | ms/batch 23.94 | loss  6.36 | ppl   578.91
| epoch   5 |   640/  743 batches | lr 0.62 | ms/batch 24.07 | loss  6.58 | ppl   722.00
| epoch   5 |   641/  743 batches | lr 0.62 | ms/batch 23.84 | loss  6.47 | ppl   646.43
| epoch   5 |   642/  743 batches | lr 0.62 | ms/batch 24.14 | loss  6.44 | ppl   625.98
| epoch   5 |   643/  743 batches | lr 0.62 | ms/batch 25.10 | loss  6.42 | ppl   611.58
| epoch   5 |   644/ 

| epoch   5 |   734/  743 batches | lr 0.62 | ms/batch 23.53 | loss  6.49 | ppl   657.50
| epoch   5 |   735/  743 batches | lr 0.62 | ms/batch 23.79 | loss  6.40 | ppl   603.33
| epoch   5 |   736/  743 batches | lr 0.62 | ms/batch 24.12 | loss  6.37 | ppl   583.04
| epoch   5 |   737/  743 batches | lr 0.62 | ms/batch 23.42 | loss  6.47 | ppl   646.09
| epoch   5 |   738/  743 batches | lr 0.62 | ms/batch 23.42 | loss  6.64 | ppl   763.64
| epoch   5 |   739/  743 batches | lr 0.62 | ms/batch 23.66 | loss  6.55 | ppl   699.45
| epoch   5 |   740/  743 batches | lr 0.62 | ms/batch 22.90 | loss  6.34 | ppl   563.98
| epoch   5 |   741/  743 batches | lr 0.62 | ms/batch 22.39 | loss  6.47 | ppl   644.30
| epoch   5 |   742/  743 batches | lr 0.62 | ms/batch 24.04 | loss  6.52 | ppl   676.94
| epoch   5 |   743/  743 batches | lr 0.62 | ms/batch 15.91 | loss  6.37 | ppl   584.61
-----------------------------------------------------------------------------------------
| end of epoch   5 |

| epoch   6 |    83/  743 batches | lr 0.62 | ms/batch 23.37 | loss  6.38 | ppl   590.65
| epoch   6 |    84/  743 batches | lr 0.62 | ms/batch 23.58 | loss  6.45 | ppl   634.54
| epoch   6 |    85/  743 batches | lr 0.62 | ms/batch 22.11 | loss  6.45 | ppl   630.26
| epoch   6 |    86/  743 batches | lr 0.62 | ms/batch 22.76 | loss  6.34 | ppl   567.70
| epoch   6 |    87/  743 batches | lr 0.62 | ms/batch 22.83 | loss  6.53 | ppl   686.55
| epoch   6 |    88/  743 batches | lr 0.62 | ms/batch 23.44 | loss  6.42 | ppl   616.91
| epoch   6 |    89/  743 batches | lr 0.62 | ms/batch 24.80 | loss  6.52 | ppl   675.44
| epoch   6 |    90/  743 batches | lr 0.62 | ms/batch 22.69 | loss  6.44 | ppl   623.86
| epoch   6 |    91/  743 batches | lr 0.62 | ms/batch 22.83 | loss  6.48 | ppl   651.44
| epoch   6 |    92/  743 batches | lr 0.62 | ms/batch 23.45 | loss  6.47 | ppl   642.43
| epoch   6 |    93/  743 batches | lr 0.62 | ms/batch 26.61 | loss  6.48 | ppl   651.93
| epoch   6 |    94/ 

| epoch   6 |   178/  743 batches | lr 0.62 | ms/batch 32.46 | loss  6.46 | ppl   642.13
| epoch   6 |   179/  743 batches | lr 0.62 | ms/batch 26.66 | loss  6.47 | ppl   646.68
| epoch   6 |   180/  743 batches | lr 0.62 | ms/batch 23.79 | loss  6.37 | ppl   584.50
| epoch   6 |   181/  743 batches | lr 0.62 | ms/batch 23.35 | loss  6.51 | ppl   675.07
| epoch   6 |   182/  743 batches | lr 0.62 | ms/batch 23.46 | loss  6.45 | ppl   632.62
| epoch   6 |   183/  743 batches | lr 0.62 | ms/batch 24.03 | loss  6.56 | ppl   704.63
| epoch   6 |   184/  743 batches | lr 0.62 | ms/batch 24.46 | loss  6.53 | ppl   682.09
| epoch   6 |   185/  743 batches | lr 0.62 | ms/batch 25.31 | loss  6.47 | ppl   644.50
| epoch   6 |   186/  743 batches | lr 0.62 | ms/batch 25.82 | loss  6.55 | ppl   700.17
| epoch   6 |   187/  743 batches | lr 0.62 | ms/batch 29.28 | loss  6.51 | ppl   671.77
| epoch   6 |   188/  743 batches | lr 0.62 | ms/batch 23.97 | loss  6.41 | ppl   606.02
| epoch   6 |   189/ 

| epoch   6 |   278/  743 batches | lr 0.62 | ms/batch 26.37 | loss  6.46 | ppl   640.07
| epoch   6 |   279/  743 batches | lr 0.62 | ms/batch 25.18 | loss  6.45 | ppl   631.95
| epoch   6 |   280/  743 batches | lr 0.62 | ms/batch 24.40 | loss  6.42 | ppl   616.52
| epoch   6 |   281/  743 batches | lr 0.62 | ms/batch 24.78 | loss  6.43 | ppl   618.67
| epoch   6 |   282/  743 batches | lr 0.62 | ms/batch 23.40 | loss  6.36 | ppl   578.69
| epoch   6 |   283/  743 batches | lr 0.62 | ms/batch 22.76 | loss  6.44 | ppl   626.42
| epoch   6 |   284/  743 batches | lr 0.62 | ms/batch 24.39 | loss  6.34 | ppl   567.93
| epoch   6 |   285/  743 batches | lr 0.62 | ms/batch 23.27 | loss  6.42 | ppl   616.76
| epoch   6 |   286/  743 batches | lr 0.62 | ms/batch 27.51 | loss  6.42 | ppl   615.14
| epoch   6 |   287/  743 batches | lr 0.62 | ms/batch 37.99 | loss  6.55 | ppl   697.31
| epoch   6 |   288/  743 batches | lr 0.62 | ms/batch 27.83 | loss  6.45 | ppl   633.00
| epoch   6 |   289/ 

| epoch   6 |   376/  743 batches | lr 0.62 | ms/batch 24.22 | loss  6.45 | ppl   635.64
| epoch   6 |   377/  743 batches | lr 0.62 | ms/batch 25.91 | loss  6.37 | ppl   584.52
| epoch   6 |   378/  743 batches | lr 0.62 | ms/batch 24.66 | loss  6.45 | ppl   630.75
| epoch   6 |   379/  743 batches | lr 0.62 | ms/batch 24.41 | loss  6.30 | ppl   543.17
| epoch   6 |   380/  743 batches | lr 0.62 | ms/batch 24.45 | loss  6.44 | ppl   628.06
| epoch   6 |   381/  743 batches | lr 0.62 | ms/batch 23.90 | loss  6.50 | ppl   668.33
| epoch   6 |   382/  743 batches | lr 0.62 | ms/batch 24.08 | loss  6.38 | ppl   587.42
| epoch   6 |   383/  743 batches | lr 0.62 | ms/batch 24.17 | loss  6.50 | ppl   665.15
| epoch   6 |   384/  743 batches | lr 0.62 | ms/batch 23.50 | loss  6.45 | ppl   632.01
| epoch   6 |   385/  743 batches | lr 0.62 | ms/batch 26.86 | loss  6.41 | ppl   610.15
| epoch   6 |   386/  743 batches | lr 0.62 | ms/batch 28.05 | loss  6.36 | ppl   575.57
| epoch   6 |   387/ 

| epoch   6 |   470/  743 batches | lr 0.62 | ms/batch 24.09 | loss  6.40 | ppl   600.84
| epoch   6 |   471/  743 batches | lr 0.62 | ms/batch 28.27 | loss  6.40 | ppl   598.97
| epoch   6 |   472/  743 batches | lr 0.62 | ms/batch 24.75 | loss  6.44 | ppl   627.53
| epoch   6 |   473/  743 batches | lr 0.62 | ms/batch 26.69 | loss  6.49 | ppl   659.29
| epoch   6 |   474/  743 batches | lr 0.62 | ms/batch 26.63 | loss  6.42 | ppl   616.70
| epoch   6 |   475/  743 batches | lr 0.62 | ms/batch 28.12 | loss  6.40 | ppl   600.65
| epoch   6 |   476/  743 batches | lr 0.62 | ms/batch 26.42 | loss  6.42 | ppl   613.36
| epoch   6 |   477/  743 batches | lr 0.62 | ms/batch 25.69 | loss  6.39 | ppl   598.53
| epoch   6 |   478/  743 batches | lr 0.62 | ms/batch 26.67 | loss  6.49 | ppl   656.59
| epoch   6 |   479/  743 batches | lr 0.62 | ms/batch 28.05 | loss  6.47 | ppl   642.44
| epoch   6 |   480/  743 batches | lr 0.62 | ms/batch 25.81 | loss  6.51 | ppl   671.36
| epoch   6 |   481/ 

| epoch   6 |   563/  743 batches | lr 0.62 | ms/batch 23.74 | loss  6.44 | ppl   628.75
| epoch   6 |   564/  743 batches | lr 0.62 | ms/batch 24.17 | loss  6.47 | ppl   647.37
| epoch   6 |   565/  743 batches | lr 0.62 | ms/batch 23.02 | loss  6.52 | ppl   679.21
| epoch   6 |   566/  743 batches | lr 0.62 | ms/batch 23.45 | loss  6.42 | ppl   612.53
| epoch   6 |   567/  743 batches | lr 0.62 | ms/batch 23.14 | loss  6.36 | ppl   576.03
| epoch   6 |   568/  743 batches | lr 0.62 | ms/batch 23.54 | loss  6.63 | ppl   759.05
| epoch   6 |   569/  743 batches | lr 0.62 | ms/batch 23.64 | loss  6.24 | ppl   512.22
| epoch   6 |   570/  743 batches | lr 0.62 | ms/batch 23.62 | loss  6.52 | ppl   677.15
| epoch   6 |   571/  743 batches | lr 0.62 | ms/batch 21.87 | loss  6.52 | ppl   678.56
| epoch   6 |   572/  743 batches | lr 0.62 | ms/batch 23.71 | loss  6.39 | ppl   594.92
| epoch   6 |   573/  743 batches | lr 0.62 | ms/batch 27.21 | loss  6.43 | ppl   618.49
| epoch   6 |   574/ 

| epoch   6 |   661/  743 batches | lr 0.62 | ms/batch 32.58 | loss  6.38 | ppl   588.10
| epoch   6 |   662/  743 batches | lr 0.62 | ms/batch 29.29 | loss  6.41 | ppl   605.99
| epoch   6 |   663/  743 batches | lr 0.62 | ms/batch 26.37 | loss  6.34 | ppl   567.41
| epoch   6 |   664/  743 batches | lr 0.62 | ms/batch 25.40 | loss  6.41 | ppl   608.89
| epoch   6 |   665/  743 batches | lr 0.62 | ms/batch 27.21 | loss  6.43 | ppl   620.57
| epoch   6 |   666/  743 batches | lr 0.62 | ms/batch 28.11 | loss  6.47 | ppl   642.95
| epoch   6 |   667/  743 batches | lr 0.62 | ms/batch 26.85 | loss  6.32 | ppl   556.37
| epoch   6 |   668/  743 batches | lr 0.62 | ms/batch 26.78 | loss  6.27 | ppl   529.83
| epoch   6 |   669/  743 batches | lr 0.62 | ms/batch 36.09 | loss  6.45 | ppl   633.36
| epoch   6 |   670/  743 batches | lr 0.62 | ms/batch 27.68 | loss  6.38 | ppl   591.19
| epoch   6 |   671/  743 batches | lr 0.62 | ms/batch 28.59 | loss  6.33 | ppl   560.88
| epoch   6 |   672/ 

| epoch   7 |    16/  743 batches | lr 0.62 | ms/batch 24.63 | loss  6.49 | ppl   659.04
| epoch   7 |    17/  743 batches | lr 0.62 | ms/batch 23.14 | loss  6.47 | ppl   646.95
| epoch   7 |    18/  743 batches | lr 0.62 | ms/batch 22.75 | loss  6.34 | ppl   565.80
| epoch   7 |    19/  743 batches | lr 0.62 | ms/batch 23.74 | loss  6.50 | ppl   666.45
| epoch   7 |    20/  743 batches | lr 0.62 | ms/batch 22.34 | loss  6.36 | ppl   578.91
| epoch   7 |    21/  743 batches | lr 0.62 | ms/batch 22.73 | loss  6.27 | ppl   528.28
| epoch   7 |    22/  743 batches | lr 0.62 | ms/batch 22.38 | loss  6.40 | ppl   599.78
| epoch   7 |    23/  743 batches | lr 0.62 | ms/batch 24.52 | loss  6.32 | ppl   556.40
| epoch   7 |    24/  743 batches | lr 0.62 | ms/batch 22.72 | loss  6.38 | ppl   587.64
| epoch   7 |    25/  743 batches | lr 0.62 | ms/batch 24.96 | loss  6.36 | ppl   580.90
| epoch   7 |    26/  743 batches | lr 0.62 | ms/batch 25.27 | loss  6.35 | ppl   569.87
| epoch   7 |    27/ 

| epoch   7 |   117/  743 batches | lr 0.62 | ms/batch 24.36 | loss  6.48 | ppl   653.36
| epoch   7 |   118/  743 batches | lr 0.62 | ms/batch 23.64 | loss  6.31 | ppl   552.64
| epoch   7 |   119/  743 batches | lr 0.62 | ms/batch 24.05 | loss  6.40 | ppl   600.72
| epoch   7 |   120/  743 batches | lr 0.62 | ms/batch 24.94 | loss  6.49 | ppl   659.75
| epoch   7 |   121/  743 batches | lr 0.62 | ms/batch 24.10 | loss  6.39 | ppl   593.32
| epoch   7 |   122/  743 batches | lr 0.62 | ms/batch 27.37 | loss  6.46 | ppl   638.15
| epoch   7 |   123/  743 batches | lr 0.62 | ms/batch 25.46 | loss  6.54 | ppl   693.19
| epoch   7 |   124/  743 batches | lr 0.62 | ms/batch 24.54 | loss  6.49 | ppl   656.53
| epoch   7 |   125/  743 batches | lr 0.62 | ms/batch 24.03 | loss  6.43 | ppl   622.27
| epoch   7 |   126/  743 batches | lr 0.62 | ms/batch 29.37 | loss  6.48 | ppl   653.09
| epoch   7 |   127/  743 batches | lr 0.62 | ms/batch 27.67 | loss  6.55 | ppl   698.87
| epoch   7 |   128/ 

| epoch   7 |   217/  743 batches | lr 0.62 | ms/batch 22.93 | loss  6.30 | ppl   543.17
| epoch   7 |   218/  743 batches | lr 0.62 | ms/batch 25.96 | loss  6.23 | ppl   507.71
| epoch   7 |   219/  743 batches | lr 0.62 | ms/batch 23.32 | loss  6.31 | ppl   549.16
| epoch   7 |   220/  743 batches | lr 0.62 | ms/batch 23.02 | loss  6.44 | ppl   624.61
| epoch   7 |   221/  743 batches | lr 0.62 | ms/batch 21.66 | loss  6.50 | ppl   662.19
| epoch   7 |   222/  743 batches | lr 0.62 | ms/batch 22.73 | loss  6.26 | ppl   520.97
| epoch   7 |   223/  743 batches | lr 0.62 | ms/batch 22.78 | loss  6.24 | ppl   512.35
| epoch   7 |   224/  743 batches | lr 0.62 | ms/batch 22.39 | loss  6.59 | ppl   728.10
| epoch   7 |   225/  743 batches | lr 0.62 | ms/batch 22.70 | loss  6.30 | ppl   545.40
| epoch   7 |   226/  743 batches | lr 0.62 | ms/batch 24.64 | loss  6.29 | ppl   540.78
| epoch   7 |   227/  743 batches | lr 0.62 | ms/batch 24.77 | loss  6.27 | ppl   528.83
| epoch   7 |   228/ 

| epoch   7 |   315/  743 batches | lr 0.62 | ms/batch 22.66 | loss  6.39 | ppl   593.03
| epoch   7 |   316/  743 batches | lr 0.62 | ms/batch 22.74 | loss  6.30 | ppl   545.03
| epoch   7 |   317/  743 batches | lr 0.62 | ms/batch 21.09 | loss  6.50 | ppl   662.21
| epoch   7 |   318/  743 batches | lr 0.62 | ms/batch 21.18 | loss  6.37 | ppl   583.84
| epoch   7 |   319/  743 batches | lr 0.62 | ms/batch 21.75 | loss  6.41 | ppl   605.08
| epoch   7 |   320/  743 batches | lr 0.62 | ms/batch 22.12 | loss  6.53 | ppl   682.48
| epoch   7 |   321/  743 batches | lr 0.62 | ms/batch 22.50 | loss  6.36 | ppl   580.80
| epoch   7 |   322/  743 batches | lr 0.62 | ms/batch 22.11 | loss  6.55 | ppl   696.40
| epoch   7 |   323/  743 batches | lr 0.62 | ms/batch 21.13 | loss  6.46 | ppl   637.05
| epoch   7 |   324/  743 batches | lr 0.62 | ms/batch 21.13 | loss  6.32 | ppl   557.86
| epoch   7 |   325/  743 batches | lr 0.62 | ms/batch 21.17 | loss  6.24 | ppl   511.67
| epoch   7 |   326/ 

| epoch   7 |   410/  743 batches | lr 0.62 | ms/batch 23.36 | loss  6.35 | ppl   573.04
| epoch   7 |   411/  743 batches | lr 0.62 | ms/batch 23.94 | loss  6.48 | ppl   649.36
| epoch   7 |   412/  743 batches | lr 0.62 | ms/batch 21.89 | loss  6.41 | ppl   607.57
| epoch   7 |   413/  743 batches | lr 0.62 | ms/batch 21.74 | loss  6.29 | ppl   540.67
| epoch   7 |   414/  743 batches | lr 0.62 | ms/batch 21.39 | loss  6.35 | ppl   573.80
| epoch   7 |   415/  743 batches | lr 0.62 | ms/batch 21.24 | loss  6.23 | ppl   509.99
| epoch   7 |   416/  743 batches | lr 0.62 | ms/batch 21.46 | loss  6.41 | ppl   608.08
| epoch   7 |   417/  743 batches | lr 0.62 | ms/batch 21.58 | loss  6.32 | ppl   557.28
| epoch   7 |   418/  743 batches | lr 0.62 | ms/batch 21.20 | loss  6.35 | ppl   572.48
| epoch   7 |   419/  743 batches | lr 0.62 | ms/batch 21.15 | loss  6.29 | ppl   540.20
| epoch   7 |   420/  743 batches | lr 0.62 | ms/batch 21.50 | loss  6.43 | ppl   621.30
| epoch   7 |   421/ 

| epoch   7 |   510/  743 batches | lr 0.62 | ms/batch 25.70 | loss  6.26 | ppl   524.67
| epoch   7 |   511/  743 batches | lr 0.62 | ms/batch 23.50 | loss  6.37 | ppl   583.53
| epoch   7 |   512/  743 batches | lr 0.62 | ms/batch 21.77 | loss  6.34 | ppl   567.10
| epoch   7 |   513/  743 batches | lr 0.62 | ms/batch 21.58 | loss  6.33 | ppl   562.50
| epoch   7 |   514/  743 batches | lr 0.62 | ms/batch 21.59 | loss  6.27 | ppl   530.87
| epoch   7 |   515/  743 batches | lr 0.62 | ms/batch 21.58 | loss  6.41 | ppl   610.14
| epoch   7 |   516/  743 batches | lr 0.62 | ms/batch 22.97 | loss  6.22 | ppl   503.21
| epoch   7 |   517/  743 batches | lr 0.62 | ms/batch 22.01 | loss  6.43 | ppl   617.18
| epoch   7 |   518/  743 batches | lr 0.62 | ms/batch 22.20 | loss  6.41 | ppl   608.43
| epoch   7 |   519/  743 batches | lr 0.62 | ms/batch 21.62 | loss  6.36 | ppl   578.73
| epoch   7 |   520/  743 batches | lr 0.62 | ms/batch 24.94 | loss  6.57 | ppl   715.65
| epoch   7 |   521/ 

| epoch   7 |   605/  743 batches | lr 0.62 | ms/batch 56.08 | loss  6.45 | ppl   633.39
| epoch   7 |   606/  743 batches | lr 0.62 | ms/batch 63.58 | loss  6.27 | ppl   527.96
| epoch   7 |   607/  743 batches | lr 0.62 | ms/batch 57.65 | loss  6.34 | ppl   568.33
| epoch   7 |   608/  743 batches | lr 0.62 | ms/batch 49.12 | loss  6.41 | ppl   609.31
| epoch   7 |   609/  743 batches | lr 0.62 | ms/batch 52.10 | loss  6.31 | ppl   550.58
| epoch   7 |   610/  743 batches | lr 0.62 | ms/batch 85.66 | loss  6.26 | ppl   521.15
| epoch   7 |   611/  743 batches | lr 0.62 | ms/batch 48.74 | loss  6.42 | ppl   613.84
| epoch   7 |   612/  743 batches | lr 0.62 | ms/batch 53.07 | loss  6.25 | ppl   517.49
| epoch   7 |   613/  743 batches | lr 0.62 | ms/batch 76.61 | loss  6.33 | ppl   560.65
| epoch   7 |   614/  743 batches | lr 0.62 | ms/batch 93.11 | loss  6.37 | ppl   581.73
| epoch   7 |   615/  743 batches | lr 0.62 | ms/batch 36.12 | loss  6.33 | ppl   560.33
| epoch   7 |   616/ 

| epoch   7 |   700/  743 batches | lr 0.62 | ms/batch 41.42 | loss  6.27 | ppl   530.63
| epoch   7 |   701/  743 batches | lr 0.62 | ms/batch 28.24 | loss  6.28 | ppl   534.82
| epoch   7 |   702/  743 batches | lr 0.62 | ms/batch 26.69 | loss  6.36 | ppl   577.66
| epoch   7 |   703/  743 batches | lr 0.62 | ms/batch 25.29 | loss  6.28 | ppl   535.38
| epoch   7 |   704/  743 batches | lr 0.62 | ms/batch 27.12 | loss  6.36 | ppl   577.84
| epoch   7 |   705/  743 batches | lr 0.62 | ms/batch 27.67 | loss  6.39 | ppl   593.04
| epoch   7 |   706/  743 batches | lr 0.62 | ms/batch 25.84 | loss  6.27 | ppl   526.30
| epoch   7 |   707/  743 batches | lr 0.62 | ms/batch 26.49 | loss  6.28 | ppl   532.02
| epoch   7 |   708/  743 batches | lr 0.62 | ms/batch 23.20 | loss  6.44 | ppl   629.46
| epoch   7 |   709/  743 batches | lr 0.62 | ms/batch 23.94 | loss  6.35 | ppl   573.92
| epoch   7 |   710/  743 batches | lr 0.62 | ms/batch 24.72 | loss  6.28 | ppl   532.45
| epoch   7 |   711/ 

| epoch   8 |    47/  743 batches | lr 0.62 | ms/batch 21.69 | loss  6.32 | ppl   555.78
| epoch   8 |    48/  743 batches | lr 0.62 | ms/batch 22.32 | loss  6.34 | ppl   565.69
| epoch   8 |    49/  743 batches | lr 0.62 | ms/batch 20.68 | loss  6.41 | ppl   607.83
| epoch   8 |    50/  743 batches | lr 0.62 | ms/batch 20.97 | loss  6.46 | ppl   641.49
| epoch   8 |    51/  743 batches | lr 0.62 | ms/batch 20.83 | loss  6.33 | ppl   558.40
| epoch   8 |    52/  743 batches | lr 0.62 | ms/batch 21.55 | loss  6.37 | ppl   584.33
| epoch   8 |    53/  743 batches | lr 0.62 | ms/batch 20.86 | loss  6.41 | ppl   610.25
| epoch   8 |    54/  743 batches | lr 0.62 | ms/batch 21.30 | loss  6.42 | ppl   612.99
| epoch   8 |    55/  743 batches | lr 0.62 | ms/batch 20.89 | loss  6.37 | ppl   584.54
| epoch   8 |    56/  743 batches | lr 0.62 | ms/batch 20.89 | loss  6.28 | ppl   534.84
| epoch   8 |    57/  743 batches | lr 0.62 | ms/batch 22.41 | loss  6.39 | ppl   596.22
| epoch   8 |    58/ 

| epoch   8 |   147/  743 batches | lr 0.62 | ms/batch 21.41 | loss  6.30 | ppl   544.34
| epoch   8 |   148/  743 batches | lr 0.62 | ms/batch 21.05 | loss  6.30 | ppl   545.72
| epoch   8 |   149/  743 batches | lr 0.62 | ms/batch 20.43 | loss  6.46 | ppl   641.44
| epoch   8 |   150/  743 batches | lr 0.62 | ms/batch 20.55 | loss  6.40 | ppl   599.24
| epoch   8 |   151/  743 batches | lr 0.62 | ms/batch 21.29 | loss  6.48 | ppl   653.10
| epoch   8 |   152/  743 batches | lr 0.62 | ms/batch 21.32 | loss  6.27 | ppl   528.29
| epoch   8 |   153/  743 batches | lr 0.62 | ms/batch 21.30 | loss  6.36 | ppl   577.09
| epoch   8 |   154/  743 batches | lr 0.62 | ms/batch 21.91 | loss  6.32 | ppl   557.74
| epoch   8 |   155/  743 batches | lr 0.62 | ms/batch 22.05 | loss  6.37 | ppl   586.63
| epoch   8 |   156/  743 batches | lr 0.62 | ms/batch 21.81 | loss  6.15 | ppl   468.21
| epoch   8 |   157/  743 batches | lr 0.62 | ms/batch 21.11 | loss  6.32 | ppl   555.02
| epoch   8 |   158/ 

| epoch   8 |   247/  743 batches | lr 0.62 | ms/batch 22.69 | loss  6.45 | ppl   630.84
| epoch   8 |   248/  743 batches | lr 0.62 | ms/batch 23.15 | loss  6.43 | ppl   620.30
| epoch   8 |   249/  743 batches | lr 0.62 | ms/batch 24.15 | loss  6.27 | ppl   528.59
| epoch   8 |   250/  743 batches | lr 0.62 | ms/batch 22.57 | loss  6.25 | ppl   516.56
| epoch   8 |   251/  743 batches | lr 0.62 | ms/batch 24.11 | loss  6.28 | ppl   533.06
| epoch   8 |   252/  743 batches | lr 0.62 | ms/batch 23.12 | loss  6.38 | ppl   589.86
| epoch   8 |   253/  743 batches | lr 0.62 | ms/batch 22.74 | loss  6.27 | ppl   531.03
| epoch   8 |   254/  743 batches | lr 0.62 | ms/batch 25.60 | loss  6.18 | ppl   484.18
| epoch   8 |   255/  743 batches | lr 0.62 | ms/batch 22.70 | loss  6.36 | ppl   578.23
| epoch   8 |   256/  743 batches | lr 0.62 | ms/batch 24.06 | loss  6.18 | ppl   482.14
| epoch   8 |   257/  743 batches | lr 0.62 | ms/batch 28.06 | loss  6.25 | ppl   518.23
| epoch   8 |   258/ 

| epoch   8 |   341/  743 batches | lr 0.62 | ms/batch 39.20 | loss  6.33 | ppl   560.48
| epoch   8 |   342/  743 batches | lr 0.62 | ms/batch 29.12 | loss  6.32 | ppl   556.96
| epoch   8 |   343/  743 batches | lr 0.62 | ms/batch 28.57 | loss  6.32 | ppl   556.25
| epoch   8 |   344/  743 batches | lr 0.62 | ms/batch 26.28 | loss  6.19 | ppl   489.23
| epoch   8 |   345/  743 batches | lr 0.62 | ms/batch 38.00 | loss  6.41 | ppl   605.17
| epoch   8 |   346/  743 batches | lr 0.62 | ms/batch 31.61 | loss  6.33 | ppl   562.28
| epoch   8 |   347/  743 batches | lr 0.62 | ms/batch 28.39 | loss  6.33 | ppl   561.67
| epoch   8 |   348/  743 batches | lr 0.62 | ms/batch 28.32 | loss  6.31 | ppl   548.69
| epoch   8 |   349/  743 batches | lr 0.62 | ms/batch 34.41 | loss  6.38 | ppl   588.70
| epoch   8 |   350/  743 batches | lr 0.62 | ms/batch 25.63 | loss  6.24 | ppl   514.77
| epoch   8 |   351/  743 batches | lr 0.62 | ms/batch 24.99 | loss  6.24 | ppl   511.26
| epoch   8 |   352/ 

| epoch   8 |   436/  743 batches | lr 0.62 | ms/batch 24.74 | loss  6.47 | ppl   644.90
| epoch   8 |   437/  743 batches | lr 0.62 | ms/batch 23.44 | loss  6.39 | ppl   593.49
| epoch   8 |   438/  743 batches | lr 0.62 | ms/batch 22.33 | loss  6.40 | ppl   604.42
| epoch   8 |   439/  743 batches | lr 0.62 | ms/batch 21.11 | loss  6.44 | ppl   626.66
| epoch   8 |   440/  743 batches | lr 0.62 | ms/batch 21.45 | loss  6.31 | ppl   549.04
| epoch   8 |   441/  743 batches | lr 0.62 | ms/batch 21.66 | loss  6.33 | ppl   562.58
| epoch   8 |   442/  743 batches | lr 0.62 | ms/batch 22.36 | loss  6.34 | ppl   567.67
| epoch   8 |   443/  743 batches | lr 0.62 | ms/batch 21.91 | loss  6.49 | ppl   660.33
| epoch   8 |   444/  743 batches | lr 0.62 | ms/batch 21.75 | loss  6.29 | ppl   540.54
| epoch   8 |   445/  743 batches | lr 0.62 | ms/batch 20.47 | loss  6.40 | ppl   600.96
| epoch   8 |   446/  743 batches | lr 0.62 | ms/batch 20.53 | loss  6.39 | ppl   593.49
| epoch   8 |   447/ 

| epoch   8 |   532/  743 batches | lr 0.62 | ms/batch 65.08 | loss  6.18 | ppl   481.19
| epoch   8 |   533/  743 batches | lr 0.62 | ms/batch 42.12 | loss  6.32 | ppl   555.23
| epoch   8 |   534/  743 batches | lr 0.62 | ms/batch 37.58 | loss  6.34 | ppl   568.79
| epoch   8 |   535/  743 batches | lr 0.62 | ms/batch 31.83 | loss  6.29 | ppl   540.50
| epoch   8 |   536/  743 batches | lr 0.62 | ms/batch 26.29 | loss  6.33 | ppl   560.02
| epoch   8 |   537/  743 batches | lr 0.62 | ms/batch 21.98 | loss  6.45 | ppl   630.43
| epoch   8 |   538/  743 batches | lr 0.62 | ms/batch 21.34 | loss  6.26 | ppl   522.82
| epoch   8 |   539/  743 batches | lr 0.62 | ms/batch 22.23 | loss  6.27 | ppl   526.95
| epoch   8 |   540/  743 batches | lr 0.62 | ms/batch 23.75 | loss  6.29 | ppl   541.60
| epoch   8 |   541/  743 batches | lr 0.62 | ms/batch 22.00 | loss  6.32 | ppl   556.49
| epoch   8 |   542/  743 batches | lr 0.62 | ms/batch 21.34 | loss  6.24 | ppl   514.65
| epoch   8 |   543/ 

| epoch   8 |   629/  743 batches | lr 0.62 | ms/batch 24.11 | loss  6.24 | ppl   514.98
| epoch   8 |   630/  743 batches | lr 0.62 | ms/batch 23.54 | loss  6.25 | ppl   516.31
| epoch   8 |   631/  743 batches | lr 0.62 | ms/batch 23.37 | loss  6.40 | ppl   601.71
| epoch   8 |   632/  743 batches | lr 0.62 | ms/batch 23.29 | loss  6.26 | ppl   522.16
| epoch   8 |   633/  743 batches | lr 0.62 | ms/batch 23.29 | loss  6.31 | ppl   548.47
| epoch   8 |   634/  743 batches | lr 0.62 | ms/batch 21.97 | loss  6.35 | ppl   573.16
| epoch   8 |   635/  743 batches | lr 0.62 | ms/batch 21.93 | loss  6.32 | ppl   554.00
| epoch   8 |   636/  743 batches | lr 0.62 | ms/batch 21.70 | loss  6.25 | ppl   519.16
| epoch   8 |   637/  743 batches | lr 0.62 | ms/batch 21.68 | loss  6.26 | ppl   524.06
| epoch   8 |   638/  743 batches | lr 0.62 | ms/batch 22.33 | loss  6.41 | ppl   607.16
| epoch   8 |   639/  743 batches | lr 0.62 | ms/batch 25.33 | loss  6.18 | ppl   480.98
| epoch   8 |   640/ 

| epoch   8 |   723/  743 batches | lr 0.62 | ms/batch 46.43 | loss  6.38 | ppl   590.50
| epoch   8 |   724/  743 batches | lr 0.62 | ms/batch 38.28 | loss  6.36 | ppl   575.62
| epoch   8 |   725/  743 batches | lr 0.62 | ms/batch 36.19 | loss  6.39 | ppl   595.69
| epoch   8 |   726/  743 batches | lr 0.62 | ms/batch 45.49 | loss  6.43 | ppl   618.99
| epoch   8 |   727/  743 batches | lr 0.62 | ms/batch 43.14 | loss  6.43 | ppl   620.07
| epoch   8 |   728/  743 batches | lr 0.62 | ms/batch 55.08 | loss  6.32 | ppl   557.64
| epoch   8 |   729/  743 batches | lr 0.62 | ms/batch 29.58 | loss  6.34 | ppl   564.25
| epoch   8 |   730/  743 batches | lr 0.62 | ms/batch 27.42 | loss  6.22 | ppl   502.95
| epoch   8 |   731/  743 batches | lr 0.62 | ms/batch 27.34 | loss  6.32 | ppl   556.39
| epoch   8 |   732/  743 batches | lr 0.62 | ms/batch 28.12 | loss  6.37 | ppl   586.14
| epoch   8 |   733/  743 batches | lr 0.62 | ms/batch 55.14 | loss  6.32 | ppl   555.74
| epoch   8 |   734/ 

| epoch   9 |    71/  743 batches | lr 0.62 | ms/batch 29.00 | loss  6.26 | ppl   522.65
| epoch   9 |    72/  743 batches | lr 0.62 | ms/batch 22.96 | loss  6.22 | ppl   500.20
| epoch   9 |    73/  743 batches | lr 0.62 | ms/batch 22.20 | loss  6.16 | ppl   473.44
| epoch   9 |    74/  743 batches | lr 0.62 | ms/batch 24.00 | loss  6.38 | ppl   591.79
| epoch   9 |    75/  743 batches | lr 0.62 | ms/batch 25.39 | loss  6.25 | ppl   517.91
| epoch   9 |    76/  743 batches | lr 0.62 | ms/batch 27.34 | loss  6.33 | ppl   561.17
| epoch   9 |    77/  743 batches | lr 0.62 | ms/batch 24.56 | loss  6.35 | ppl   572.45
| epoch   9 |    78/  743 batches | lr 0.62 | ms/batch 26.29 | loss  6.29 | ppl   541.53
| epoch   9 |    79/  743 batches | lr 0.62 | ms/batch 24.79 | loss  6.29 | ppl   537.48
| epoch   9 |    80/  743 batches | lr 0.62 | ms/batch 34.23 | loss  6.25 | ppl   519.08
| epoch   9 |    81/  743 batches | lr 0.62 | ms/batch 24.98 | loss  6.45 | ppl   632.81
| epoch   9 |    82/ 

| epoch   9 |   171/  743 batches | lr 0.62 | ms/batch 26.19 | loss  6.37 | ppl   582.17
| epoch   9 |   172/  743 batches | lr 0.62 | ms/batch 46.62 | loss  6.38 | ppl   587.25
| epoch   9 |   173/  743 batches | lr 0.62 | ms/batch 31.99 | loss  6.12 | ppl   453.12
| epoch   9 |   174/  743 batches | lr 0.62 | ms/batch 29.74 | loss  6.35 | ppl   574.45
| epoch   9 |   175/  743 batches | lr 0.62 | ms/batch 39.59 | loss  6.36 | ppl   579.81
| epoch   9 |   176/  743 batches | lr 0.62 | ms/batch 34.18 | loss  6.36 | ppl   580.85
| epoch   9 |   177/  743 batches | lr 0.62 | ms/batch 57.08 | loss  6.17 | ppl   479.16
| epoch   9 |   178/  743 batches | lr 0.62 | ms/batch 39.73 | loss  6.31 | ppl   551.18
| epoch   9 |   179/  743 batches | lr 0.62 | ms/batch 48.55 | loss  6.32 | ppl   554.50
| epoch   9 |   180/  743 batches | lr 0.62 | ms/batch 26.30 | loss  6.19 | ppl   485.59
| epoch   9 |   181/  743 batches | lr 0.62 | ms/batch 26.87 | loss  6.34 | ppl   566.02
| epoch   9 |   182/ 

| epoch   9 |   271/  743 batches | lr 0.62 | ms/batch 25.88 | loss  6.16 | ppl   473.66
| epoch   9 |   272/  743 batches | lr 0.62 | ms/batch 24.23 | loss  6.38 | ppl   589.16
| epoch   9 |   273/  743 batches | lr 0.62 | ms/batch 25.41 | loss  6.30 | ppl   543.96
| epoch   9 |   274/  743 batches | lr 0.62 | ms/batch 24.18 | loss  6.30 | ppl   543.19
| epoch   9 |   275/  743 batches | lr 0.62 | ms/batch 21.70 | loss  6.24 | ppl   513.15
| epoch   9 |   276/  743 batches | lr 0.62 | ms/batch 22.43 | loss  6.23 | ppl   509.16
| epoch   9 |   277/  743 batches | lr 0.62 | ms/batch 24.46 | loss  6.28 | ppl   534.25
| epoch   9 |   278/  743 batches | lr 0.62 | ms/batch 23.50 | loss  6.31 | ppl   548.89
| epoch   9 |   279/  743 batches | lr 0.62 | ms/batch 24.25 | loss  6.28 | ppl   531.30
| epoch   9 |   280/  743 batches | lr 0.62 | ms/batch 27.86 | loss  6.26 | ppl   523.63
| epoch   9 |   281/  743 batches | lr 0.62 | ms/batch 24.86 | loss  6.28 | ppl   533.40
| epoch   9 |   282/ 

| epoch   9 |   372/  743 batches | lr 0.62 | ms/batch 22.56 | loss  6.37 | ppl   586.87
| epoch   9 |   373/  743 batches | lr 0.62 | ms/batch 23.69 | loss  6.33 | ppl   563.03
| epoch   9 |   374/  743 batches | lr 0.62 | ms/batch 23.43 | loss  6.30 | ppl   542.76
| epoch   9 |   375/  743 batches | lr 0.62 | ms/batch 22.58 | loss  6.19 | ppl   486.64
| epoch   9 |   376/  743 batches | lr 0.62 | ms/batch 21.43 | loss  6.27 | ppl   531.09
| epoch   9 |   377/  743 batches | lr 0.62 | ms/batch 21.51 | loss  6.24 | ppl   515.13
| epoch   9 |   378/  743 batches | lr 0.62 | ms/batch 21.42 | loss  6.29 | ppl   540.20
| epoch   9 |   379/  743 batches | lr 0.62 | ms/batch 22.14 | loss  6.12 | ppl   456.30
| epoch   9 |   380/  743 batches | lr 0.62 | ms/batch 21.25 | loss  6.28 | ppl   531.93
| epoch   9 |   381/  743 batches | lr 0.62 | ms/batch 21.26 | loss  6.33 | ppl   563.73
| epoch   9 |   382/  743 batches | lr 0.62 | ms/batch 21.19 | loss  6.21 | ppl   495.93
| epoch   9 |   383/ 

| epoch   9 |   470/  743 batches | lr 0.62 | ms/batch 21.58 | loss  6.25 | ppl   516.86
| epoch   9 |   471/  743 batches | lr 0.62 | ms/batch 22.81 | loss  6.25 | ppl   515.54
| epoch   9 |   472/  743 batches | lr 0.62 | ms/batch 23.34 | loss  6.29 | ppl   540.51
| epoch   9 |   473/  743 batches | lr 0.62 | ms/batch 23.08 | loss  6.36 | ppl   580.51
| epoch   9 |   474/  743 batches | lr 0.62 | ms/batch 21.60 | loss  6.27 | ppl   529.72
| epoch   9 |   475/  743 batches | lr 0.62 | ms/batch 21.52 | loss  6.28 | ppl   534.48
| epoch   9 |   476/  743 batches | lr 0.62 | ms/batch 21.88 | loss  6.28 | ppl   533.11
| epoch   9 |   477/  743 batches | lr 0.62 | ms/batch 21.74 | loss  6.26 | ppl   523.02
| epoch   9 |   478/  743 batches | lr 0.62 | ms/batch 21.49 | loss  6.35 | ppl   572.34
| epoch   9 |   479/  743 batches | lr 0.62 | ms/batch 21.99 | loss  6.32 | ppl   557.16
| epoch   9 |   480/  743 batches | lr 0.62 | ms/batch 24.74 | loss  6.37 | ppl   582.69
| epoch   9 |   481/ 

| epoch   9 |   566/  743 batches | lr 0.62 | ms/batch 24.41 | loss  6.30 | ppl   545.80
| epoch   9 |   567/  743 batches | lr 0.62 | ms/batch 29.75 | loss  6.22 | ppl   500.59
| epoch   9 |   568/  743 batches | lr 0.62 | ms/batch 24.50 | loss  6.50 | ppl   662.09
| epoch   9 |   569/  743 batches | lr 0.62 | ms/batch 24.23 | loss  6.11 | ppl   448.88
| epoch   9 |   570/  743 batches | lr 0.62 | ms/batch 23.96 | loss  6.36 | ppl   578.45
| epoch   9 |   571/  743 batches | lr 0.62 | ms/batch 26.50 | loss  6.37 | ppl   586.56
| epoch   9 |   572/  743 batches | lr 0.62 | ms/batch 23.90 | loss  6.26 | ppl   524.70
| epoch   9 |   573/  743 batches | lr 0.62 | ms/batch 24.18 | loss  6.28 | ppl   535.31
| epoch   9 |   574/  743 batches | lr 0.62 | ms/batch 24.92 | loss  6.26 | ppl   521.91
| epoch   9 |   575/  743 batches | lr 0.62 | ms/batch 29.65 | loss  6.31 | ppl   547.47
| epoch   9 |   576/  743 batches | lr 0.62 | ms/batch 27.03 | loss  6.26 | ppl   524.45
| epoch   9 |   577/ 

| epoch   9 |   662/  743 batches | lr 0.62 | ms/batch 24.27 | loss  6.31 | ppl   547.49
| epoch   9 |   663/  743 batches | lr 0.62 | ms/batch 26.82 | loss  6.21 | ppl   497.84
| epoch   9 |   664/  743 batches | lr 0.62 | ms/batch 24.08 | loss  6.29 | ppl   539.28
| epoch   9 |   665/  743 batches | lr 0.62 | ms/batch 24.63 | loss  6.33 | ppl   560.93
| epoch   9 |   666/  743 batches | lr 0.62 | ms/batch 26.68 | loss  6.35 | ppl   570.81
| epoch   9 |   667/  743 batches | lr 0.62 | ms/batch 25.10 | loss  6.20 | ppl   491.76
| epoch   9 |   668/  743 batches | lr 0.62 | ms/batch 23.63 | loss  6.14 | ppl   465.98
| epoch   9 |   669/  743 batches | lr 0.62 | ms/batch 22.88 | loss  6.35 | ppl   572.62
| epoch   9 |   670/  743 batches | lr 0.62 | ms/batch 21.73 | loss  6.25 | ppl   518.20
| epoch   9 |   671/  743 batches | lr 0.62 | ms/batch 21.84 | loss  6.21 | ppl   496.66
| epoch   9 |   672/  743 batches | lr 0.62 | ms/batch 23.00 | loss  6.35 | ppl   574.17
| epoch   9 |   673/ 

| epoch  10 |    18/  743 batches | lr 0.62 | ms/batch 21.78 | loss  6.17 | ppl   479.91
| epoch  10 |    19/  743 batches | lr 0.62 | ms/batch 23.22 | loss  6.37 | ppl   586.93
| epoch  10 |    20/  743 batches | lr 0.62 | ms/batch 21.71 | loss  6.25 | ppl   516.31
| epoch  10 |    21/  743 batches | lr 0.62 | ms/batch 22.46 | loss  6.14 | ppl   461.95
| epoch  10 |    22/  743 batches | lr 0.62 | ms/batch 20.62 | loss  6.27 | ppl   527.63
| epoch  10 |    23/  743 batches | lr 0.62 | ms/batch 20.63 | loss  6.23 | ppl   506.86
| epoch  10 |    24/  743 batches | lr 0.62 | ms/batch 20.64 | loss  6.26 | ppl   521.82
| epoch  10 |    25/  743 batches | lr 0.62 | ms/batch 20.75 | loss  6.25 | ppl   517.41
| epoch  10 |    26/  743 batches | lr 0.62 | ms/batch 20.67 | loss  6.21 | ppl   496.87
| epoch  10 |    27/  743 batches | lr 0.62 | ms/batch 20.45 | loss  6.18 | ppl   483.03
| epoch  10 |    28/  743 batches | lr 0.62 | ms/batch 21.45 | loss  6.17 | ppl   476.77
| epoch  10 |    29/ 

| epoch  10 |   117/  743 batches | lr 0.62 | ms/batch 22.41 | loss  6.39 | ppl   593.59
| epoch  10 |   118/  743 batches | lr 0.62 | ms/batch 22.50 | loss  6.21 | ppl   497.59
| epoch  10 |   119/  743 batches | lr 0.62 | ms/batch 20.94 | loss  6.27 | ppl   529.77
| epoch  10 |   120/  743 batches | lr 0.62 | ms/batch 20.92 | loss  6.36 | ppl   578.25
| epoch  10 |   121/  743 batches | lr 0.62 | ms/batch 20.72 | loss  6.22 | ppl   501.29
| epoch  10 |   122/  743 batches | lr 0.62 | ms/batch 20.78 | loss  6.34 | ppl   564.86
| epoch  10 |   123/  743 batches | lr 0.62 | ms/batch 20.64 | loss  6.43 | ppl   618.77
| epoch  10 |   124/  743 batches | lr 0.62 | ms/batch 20.78 | loss  6.35 | ppl   574.27
| epoch  10 |   125/  743 batches | lr 0.62 | ms/batch 20.66 | loss  6.28 | ppl   532.98
| epoch  10 |   126/  743 batches | lr 0.62 | ms/batch 20.95 | loss  6.34 | ppl   567.74
| epoch  10 |   127/  743 batches | lr 0.62 | ms/batch 21.69 | loss  6.42 | ppl   610.96
| epoch  10 |   128/ 

| epoch  10 |   216/  743 batches | lr 0.62 | ms/batch 27.34 | loss  6.23 | ppl   508.33
| epoch  10 |   217/  743 batches | lr 0.62 | ms/batch 23.98 | loss  6.20 | ppl   494.33
| epoch  10 |   218/  743 batches | lr 0.62 | ms/batch 23.43 | loss  6.13 | ppl   461.16
| epoch  10 |   219/  743 batches | lr 0.62 | ms/batch 22.17 | loss  6.17 | ppl   479.23
| epoch  10 |   220/  743 batches | lr 0.62 | ms/batch 22.16 | loss  6.34 | ppl   565.11
| epoch  10 |   221/  743 batches | lr 0.62 | ms/batch 22.87 | loss  6.40 | ppl   601.32
| epoch  10 |   222/  743 batches | lr 0.62 | ms/batch 22.67 | loss  6.12 | ppl   454.53
| epoch  10 |   223/  743 batches | lr 0.62 | ms/batch 21.91 | loss  6.11 | ppl   450.14
| epoch  10 |   224/  743 batches | lr 0.62 | ms/batch 22.09 | loss  6.45 | ppl   632.78
| epoch  10 |   225/  743 batches | lr 0.62 | ms/batch 22.23 | loss  6.17 | ppl   477.80
| epoch  10 |   226/  743 batches | lr 0.62 | ms/batch 25.58 | loss  6.19 | ppl   487.72
| epoch  10 |   227/ 

| epoch  10 |   315/  743 batches | lr 0.62 | ms/batch 25.18 | loss  6.26 | ppl   521.52
| epoch  10 |   316/  743 batches | lr 0.62 | ms/batch 23.08 | loss  6.19 | ppl   487.12
| epoch  10 |   317/  743 batches | lr 0.62 | ms/batch 21.99 | loss  6.38 | ppl   591.40
| epoch  10 |   318/  743 batches | lr 0.62 | ms/batch 21.85 | loss  6.24 | ppl   515.14
| epoch  10 |   319/  743 batches | lr 0.62 | ms/batch 21.81 | loss  6.28 | ppl   531.61
| epoch  10 |   320/  743 batches | lr 0.62 | ms/batch 22.60 | loss  6.42 | ppl   611.10
| epoch  10 |   321/  743 batches | lr 0.62 | ms/batch 23.12 | loss  6.26 | ppl   520.76
| epoch  10 |   322/  743 batches | lr 0.62 | ms/batch 24.50 | loss  6.44 | ppl   627.79
| epoch  10 |   323/  743 batches | lr 0.62 | ms/batch 23.92 | loss  6.33 | ppl   562.63
| epoch  10 |   324/  743 batches | lr 0.62 | ms/batch 21.64 | loss  6.21 | ppl   496.80
| epoch  10 |   325/  743 batches | lr 0.62 | ms/batch 25.02 | loss  6.11 | ppl   450.78
| epoch  10 |   326/ 

| epoch  10 |   411/  743 batches | lr 0.62 | ms/batch 56.08 | loss  6.35 | ppl   573.35
| epoch  10 |   412/  743 batches | lr 0.62 | ms/batch 37.10 | loss  6.31 | ppl   547.61
| epoch  10 |   413/  743 batches | lr 0.62 | ms/batch 33.55 | loss  6.20 | ppl   491.77
| epoch  10 |   414/  743 batches | lr 0.62 | ms/batch 27.86 | loss  6.25 | ppl   517.52
| epoch  10 |   415/  743 batches | lr 0.62 | ms/batch 27.00 | loss  6.13 | ppl   457.18
| epoch  10 |   416/  743 batches | lr 0.62 | ms/batch 28.84 | loss  6.33 | ppl   561.54
| epoch  10 |   417/  743 batches | lr 0.62 | ms/batch 27.28 | loss  6.23 | ppl   508.72
| epoch  10 |   418/  743 batches | lr 0.62 | ms/batch 25.44 | loss  6.28 | ppl   532.10
| epoch  10 |   419/  743 batches | lr 0.62 | ms/batch 30.84 | loss  6.18 | ppl   484.70
| epoch  10 |   420/  743 batches | lr 0.62 | ms/batch 26.48 | loss  6.31 | ppl   549.12
| epoch  10 |   421/  743 batches | lr 0.62 | ms/batch 27.75 | loss  6.20 | ppl   494.15
| epoch  10 |   422/ 

| epoch  10 |   505/  743 batches | lr 0.62 | ms/batch 23.56 | loss  6.31 | ppl   548.58
| epoch  10 |   506/  743 batches | lr 0.62 | ms/batch 25.33 | loss  6.27 | ppl   528.83
| epoch  10 |   507/  743 batches | lr 0.62 | ms/batch 23.44 | loss  6.33 | ppl   558.82
| epoch  10 |   508/  743 batches | lr 0.62 | ms/batch 21.68 | loss  6.33 | ppl   561.06
| epoch  10 |   509/  743 batches | lr 0.62 | ms/batch 21.46 | loss  6.29 | ppl   539.51
| epoch  10 |   510/  743 batches | lr 0.62 | ms/batch 21.64 | loss  6.15 | ppl   470.27
| epoch  10 |   511/  743 batches | lr 0.62 | ms/batch 21.52 | loss  6.25 | ppl   516.34
| epoch  10 |   512/  743 batches | lr 0.62 | ms/batch 21.30 | loss  6.24 | ppl   512.68
| epoch  10 |   513/  743 batches | lr 0.62 | ms/batch 21.58 | loss  6.23 | ppl   506.14
| epoch  10 |   514/  743 batches | lr 0.62 | ms/batch 21.64 | loss  6.17 | ppl   477.00
| epoch  10 |   515/  743 batches | lr 0.62 | ms/batch 21.69 | loss  6.29 | ppl   538.35
| epoch  10 |   516/ 

| epoch  10 |   598/  743 batches | lr 0.62 | ms/batch 41.18 | loss  6.22 | ppl   503.57
| epoch  10 |   599/  743 batches | lr 0.62 | ms/batch 67.84 | loss  6.36 | ppl   578.69
| epoch  10 |   600/  743 batches | lr 0.62 | ms/batch 49.69 | loss  6.14 | ppl   465.22
| epoch  10 |   601/  743 batches | lr 0.62 | ms/batch 33.80 | loss  6.27 | ppl   528.73
| epoch  10 |   602/  743 batches | lr 0.62 | ms/batch 44.46 | loss  6.22 | ppl   505.04
| epoch  10 |   603/  743 batches | lr 0.62 | ms/batch 43.05 | loss  6.21 | ppl   499.11
| epoch  10 |   604/  743 batches | lr 0.62 | ms/batch 33.31 | loss  6.26 | ppl   524.84
| epoch  10 |   605/  743 batches | lr 0.62 | ms/batch 26.66 | loss  6.35 | ppl   572.95
| epoch  10 |   606/  743 batches | lr 0.62 | ms/batch 29.15 | loss  6.16 | ppl   474.91
| epoch  10 |   607/  743 batches | lr 0.62 | ms/batch 44.48 | loss  6.23 | ppl   510.05
| epoch  10 |   608/  743 batches | lr 0.62 | ms/batch 44.32 | loss  6.34 | ppl   564.95
| epoch  10 |   609/ 

| epoch  10 |   698/  743 batches | lr 0.62 | ms/batch 60.70 | loss  6.22 | ppl   501.54
| epoch  10 |   699/  743 batches | lr 0.62 | ms/batch 27.96 | loss  6.09 | ppl   440.57
| epoch  10 |   700/  743 batches | lr 0.62 | ms/batch 26.93 | loss  6.15 | ppl   470.89
| epoch  10 |   701/  743 batches | lr 0.62 | ms/batch 25.65 | loss  6.20 | ppl   494.59
| epoch  10 |   702/  743 batches | lr 0.62 | ms/batch 27.51 | loss  6.26 | ppl   522.41
| epoch  10 |   703/  743 batches | lr 0.62 | ms/batch 25.64 | loss  6.19 | ppl   485.80
| epoch  10 |   704/  743 batches | lr 0.62 | ms/batch 30.66 | loss  6.26 | ppl   521.02
| epoch  10 |   705/  743 batches | lr 0.62 | ms/batch 28.42 | loss  6.28 | ppl   535.48
| epoch  10 |   706/  743 batches | lr 0.62 | ms/batch 27.98 | loss  6.15 | ppl   469.13
| epoch  10 |   707/  743 batches | lr 0.62 | ms/batch 24.86 | loss  6.17 | ppl   477.13
| epoch  10 |   708/  743 batches | lr 0.62 | ms/batch 23.56 | loss  6.33 | ppl   559.61
| epoch  10 |   709/ 

| epoch  11 |    46/  743 batches | lr 0.62 | ms/batch 22.57 | loss  6.18 | ppl   482.40
| epoch  11 |    47/  743 batches | lr 0.62 | ms/batch 22.16 | loss  6.23 | ppl   506.21
| epoch  11 |    48/  743 batches | lr 0.62 | ms/batch 21.80 | loss  6.25 | ppl   517.89
| epoch  11 |    49/  743 batches | lr 0.62 | ms/batch 21.84 | loss  6.32 | ppl   553.49
| epoch  11 |    50/  743 batches | lr 0.62 | ms/batch 21.86 | loss  6.39 | ppl   595.26
| epoch  11 |    51/  743 batches | lr 0.62 | ms/batch 21.66 | loss  6.23 | ppl   506.17
| epoch  11 |    52/  743 batches | lr 0.62 | ms/batch 20.50 | loss  6.28 | ppl   534.80
| epoch  11 |    53/  743 batches | lr 0.62 | ms/batch 20.78 | loss  6.34 | ppl   564.92
| epoch  11 |    54/  743 batches | lr 0.62 | ms/batch 22.07 | loss  6.31 | ppl   551.92
| epoch  11 |    55/  743 batches | lr 0.62 | ms/batch 22.10 | loss  6.29 | ppl   539.38
| epoch  11 |    56/  743 batches | lr 0.62 | ms/batch 21.54 | loss  6.20 | ppl   494.70
| epoch  11 |    57/ 

| epoch  11 |   146/  743 batches | lr 0.62 | ms/batch 23.15 | loss  6.25 | ppl   516.27
| epoch  11 |   147/  743 batches | lr 0.62 | ms/batch 23.05 | loss  6.19 | ppl   489.70
| epoch  11 |   148/  743 batches | lr 0.62 | ms/batch 21.24 | loss  6.22 | ppl   504.94
| epoch  11 |   149/  743 batches | lr 0.62 | ms/batch 22.30 | loss  6.39 | ppl   595.04
| epoch  11 |   150/  743 batches | lr 0.62 | ms/batch 21.17 | loss  6.31 | ppl   547.43
| epoch  11 |   151/  743 batches | lr 0.62 | ms/batch 20.96 | loss  6.36 | ppl   580.01
| epoch  11 |   152/  743 batches | lr 0.62 | ms/batch 20.99 | loss  6.17 | ppl   476.95
| epoch  11 |   153/  743 batches | lr 0.62 | ms/batch 21.08 | loss  6.27 | ppl   530.44
| epoch  11 |   154/  743 batches | lr 0.62 | ms/batch 21.30 | loss  6.25 | ppl   515.47
| epoch  11 |   155/  743 batches | lr 0.62 | ms/batch 21.27 | loss  6.28 | ppl   535.84
| epoch  11 |   156/  743 batches | lr 0.62 | ms/batch 22.44 | loss  6.05 | ppl   423.58
| epoch  11 |   157/ 

| epoch  11 |   247/  743 batches | lr 0.62 | ms/batch 22.49 | loss  6.36 | ppl   576.49
| epoch  11 |   248/  743 batches | lr 0.62 | ms/batch 24.29 | loss  6.35 | ppl   574.82
| epoch  11 |   249/  743 batches | lr 0.62 | ms/batch 21.85 | loss  6.18 | ppl   485.16
| epoch  11 |   250/  743 batches | lr 0.62 | ms/batch 22.65 | loss  6.15 | ppl   467.27
| epoch  11 |   251/  743 batches | lr 0.62 | ms/batch 22.08 | loss  6.16 | ppl   472.99
| epoch  11 |   252/  743 batches | lr 0.62 | ms/batch 21.90 | loss  6.29 | ppl   538.23
| epoch  11 |   253/  743 batches | lr 0.62 | ms/batch 21.80 | loss  6.21 | ppl   495.56
| epoch  11 |   254/  743 batches | lr 0.62 | ms/batch 21.72 | loss  6.08 | ppl   436.17
| epoch  11 |   255/  743 batches | lr 0.62 | ms/batch 21.89 | loss  6.27 | ppl   528.24
| epoch  11 |   256/  743 batches | lr 0.62 | ms/batch 21.74 | loss  6.06 | ppl   429.20
| epoch  11 |   257/  743 batches | lr 0.62 | ms/batch 24.18 | loss  6.14 | ppl   463.45
| epoch  11 |   258/ 

| epoch  11 |   347/  743 batches | lr 0.62 | ms/batch 21.98 | loss  6.24 | ppl   514.90
| epoch  11 |   348/  743 batches | lr 0.62 | ms/batch 24.38 | loss  6.20 | ppl   494.11
| epoch  11 |   349/  743 batches | lr 0.62 | ms/batch 23.44 | loss  6.28 | ppl   536.23
| epoch  11 |   350/  743 batches | lr 0.62 | ms/batch 21.96 | loss  6.13 | ppl   461.51
| epoch  11 |   351/  743 batches | lr 0.62 | ms/batch 22.96 | loss  6.13 | ppl   460.72
| epoch  11 |   352/  743 batches | lr 0.62 | ms/batch 22.62 | loss  6.27 | ppl   530.44
| epoch  11 |   353/  743 batches | lr 0.62 | ms/batch 21.92 | loss  6.24 | ppl   513.89
| epoch  11 |   354/  743 batches | lr 0.62 | ms/batch 21.95 | loss  6.24 | ppl   511.04
| epoch  11 |   355/  743 batches | lr 0.62 | ms/batch 22.46 | loss  6.25 | ppl   519.32
| epoch  11 |   356/  743 batches | lr 0.62 | ms/batch 22.29 | loss  6.25 | ppl   520.57
| epoch  11 |   357/  743 batches | lr 0.62 | ms/batch 27.22 | loss  6.09 | ppl   441.67
| epoch  11 |   358/ 

| epoch  11 |   440/  743 batches | lr 0.62 | ms/batch 44.53 | loss  6.21 | ppl   500.18
| epoch  11 |   441/  743 batches | lr 0.62 | ms/batch 34.87 | loss  6.21 | ppl   495.72
| epoch  11 |   442/  743 batches | lr 0.62 | ms/batch 28.93 | loss  6.25 | ppl   516.34
| epoch  11 |   443/  743 batches | lr 0.62 | ms/batch 25.05 | loss  6.39 | ppl   596.62
| epoch  11 |   444/  743 batches | lr 0.62 | ms/batch 25.27 | loss  6.20 | ppl   492.19
| epoch  11 |   445/  743 batches | lr 0.62 | ms/batch 22.42 | loss  6.32 | ppl   553.98
| epoch  11 |   446/  743 batches | lr 0.62 | ms/batch 22.46 | loss  6.32 | ppl   555.27
| epoch  11 |   447/  743 batches | lr 0.62 | ms/batch 22.51 | loss  6.37 | ppl   584.29
| epoch  11 |   448/  743 batches | lr 0.62 | ms/batch 22.40 | loss  6.24 | ppl   515.35
| epoch  11 |   449/  743 batches | lr 0.62 | ms/batch 24.53 | loss  6.25 | ppl   516.81
| epoch  11 |   450/  743 batches | lr 0.62 | ms/batch 22.88 | loss  6.14 | ppl   463.50
| epoch  11 |   451/ 

| epoch  11 |   538/  743 batches | lr 0.62 | ms/batch 24.51 | loss  6.16 | ppl   472.70
| epoch  11 |   539/  743 batches | lr 0.62 | ms/batch 25.36 | loss  6.21 | ppl   496.20
| epoch  11 |   540/  743 batches | lr 0.62 | ms/batch 22.29 | loss  6.23 | ppl   506.77
| epoch  11 |   541/  743 batches | lr 0.62 | ms/batch 22.37 | loss  6.23 | ppl   509.36
| epoch  11 |   542/  743 batches | lr 0.62 | ms/batch 22.38 | loss  6.18 | ppl   483.92
| epoch  11 |   543/  743 batches | lr 0.62 | ms/batch 23.41 | loss  6.12 | ppl   453.43
| epoch  11 |   544/  743 batches | lr 0.62 | ms/batch 22.46 | loss  6.40 | ppl   599.74
| epoch  11 |   545/  743 batches | lr 0.62 | ms/batch 22.20 | loss  6.32 | ppl   557.69
| epoch  11 |   546/  743 batches | lr 0.62 | ms/batch 22.19 | loss  6.21 | ppl   496.57
| epoch  11 |   547/  743 batches | lr 0.62 | ms/batch 22.36 | loss  6.23 | ppl   510.00
| epoch  11 |   548/  743 batches | lr 0.62 | ms/batch 25.04 | loss  6.25 | ppl   519.15
| epoch  11 |   549/ 

| epoch  11 |   635/  743 batches | lr 0.62 | ms/batch 24.09 | loss  6.21 | ppl   497.85
| epoch  11 |   636/  743 batches | lr 0.62 | ms/batch 22.21 | loss  6.12 | ppl   456.25
| epoch  11 |   637/  743 batches | lr 0.62 | ms/batch 21.26 | loss  6.15 | ppl   467.19
| epoch  11 |   638/  743 batches | lr 0.62 | ms/batch 21.49 | loss  6.31 | ppl   548.49
| epoch  11 |   639/  743 batches | lr 0.62 | ms/batch 23.12 | loss  6.07 | ppl   431.85
| epoch  11 |   640/  743 batches | lr 0.62 | ms/batch 21.99 | loss  6.28 | ppl   536.29
| epoch  11 |   641/  743 batches | lr 0.62 | ms/batch 24.48 | loss  6.18 | ppl   481.73
| epoch  11 |   642/  743 batches | lr 0.62 | ms/batch 23.58 | loss  6.16 | ppl   475.45
| epoch  11 |   643/  743 batches | lr 0.62 | ms/batch 23.22 | loss  6.14 | ppl   463.00
| epoch  11 |   644/  743 batches | lr 0.62 | ms/batch 24.39 | loss  6.23 | ppl   507.89
| epoch  11 |   645/  743 batches | lr 0.62 | ms/batch 26.16 | loss  6.26 | ppl   520.87
| epoch  11 |   646/ 

| epoch  11 |   731/  743 batches | lr 0.62 | ms/batch 29.95 | loss  6.25 | ppl   515.61
| epoch  11 |   732/  743 batches | lr 0.62 | ms/batch 26.82 | loss  6.31 | ppl   548.09
| epoch  11 |   733/  743 batches | lr 0.62 | ms/batch 24.98 | loss  6.24 | ppl   513.54
| epoch  11 |   734/  743 batches | lr 0.62 | ms/batch 25.74 | loss  6.27 | ppl   527.06
| epoch  11 |   735/  743 batches | lr 0.62 | ms/batch 24.95 | loss  6.15 | ppl   470.16
| epoch  11 |   736/  743 batches | lr 0.62 | ms/batch 27.41 | loss  6.11 | ppl   449.82
| epoch  11 |   737/  743 batches | lr 0.62 | ms/batch 26.36 | loss  6.21 | ppl   499.26
| epoch  11 |   738/  743 batches | lr 0.62 | ms/batch 24.69 | loss  6.36 | ppl   576.14
| epoch  11 |   739/  743 batches | lr 0.62 | ms/batch 24.01 | loss  6.31 | ppl   550.73
| epoch  11 |   740/  743 batches | lr 0.62 | ms/batch 23.38 | loss  6.14 | ppl   465.83
| epoch  11 |   741/  743 batches | lr 0.62 | ms/batch 24.10 | loss  6.18 | ppl   481.46
| epoch  11 |   742/ 

| epoch  12 |    85/  743 batches | lr 0.62 | ms/batch 23.20 | loss  6.19 | ppl   487.99
| epoch  12 |    86/  743 batches | lr 0.62 | ms/batch 23.51 | loss  6.08 | ppl   437.02
| epoch  12 |    87/  743 batches | lr 0.62 | ms/batch 21.41 | loss  6.26 | ppl   522.71
| epoch  12 |    88/  743 batches | lr 0.62 | ms/batch 22.42 | loss  6.15 | ppl   467.95
| epoch  12 |    89/  743 batches | lr 0.62 | ms/batch 22.94 | loss  6.28 | ppl   532.90
| epoch  12 |    90/  743 batches | lr 0.62 | ms/batch 22.14 | loss  6.17 | ppl   478.10
| epoch  12 |    91/  743 batches | lr 0.62 | ms/batch 24.36 | loss  6.19 | ppl   490.05
| epoch  12 |    92/  743 batches | lr 0.62 | ms/batch 23.89 | loss  6.21 | ppl   495.33
| epoch  12 |    93/  743 batches | lr 0.62 | ms/batch 23.41 | loss  6.22 | ppl   501.07
| epoch  12 |    94/  743 batches | lr 0.62 | ms/batch 25.03 | loss  6.11 | ppl   451.32
| epoch  12 |    95/  743 batches | lr 0.62 | ms/batch 26.21 | loss  6.16 | ppl   473.89
| epoch  12 |    96/ 

| epoch  12 |   183/  743 batches | lr 0.62 | ms/batch 21.64 | loss  6.29 | ppl   541.22
| epoch  12 |   184/  743 batches | lr 0.62 | ms/batch 23.13 | loss  6.26 | ppl   522.32
| epoch  12 |   185/  743 batches | lr 0.62 | ms/batch 20.89 | loss  6.22 | ppl   504.65
| epoch  12 |   186/  743 batches | lr 0.62 | ms/batch 20.85 | loss  6.25 | ppl   518.37
| epoch  12 |   187/  743 batches | lr 0.62 | ms/batch 20.94 | loss  6.24 | ppl   514.66
| epoch  12 |   188/  743 batches | lr 0.62 | ms/batch 20.46 | loss  6.16 | ppl   472.90
| epoch  12 |   189/  743 batches | lr 0.62 | ms/batch 20.60 | loss  6.11 | ppl   449.00
| epoch  12 |   190/  743 batches | lr 0.62 | ms/batch 20.29 | loss  6.27 | ppl   529.36
| epoch  12 |   191/  743 batches | lr 0.62 | ms/batch 20.06 | loss  6.10 | ppl   443.82
| epoch  12 |   192/  743 batches | lr 0.62 | ms/batch 20.36 | loss  6.15 | ppl   468.95
| epoch  12 |   193/  743 batches | lr 0.62 | ms/batch 20.64 | loss  6.02 | ppl   413.57
| epoch  12 |   194/ 

| epoch  12 |   276/  743 batches | lr 0.62 | ms/batch 22.19 | loss  6.13 | ppl   458.22
| epoch  12 |   277/  743 batches | lr 0.62 | ms/batch 21.46 | loss  6.19 | ppl   485.77
| epoch  12 |   278/  743 batches | lr 0.62 | ms/batch 21.22 | loss  6.21 | ppl   495.82
| epoch  12 |   279/  743 batches | lr 0.62 | ms/batch 20.98 | loss  6.19 | ppl   489.04
| epoch  12 |   280/  743 batches | lr 0.62 | ms/batch 21.50 | loss  6.19 | ppl   487.82
| epoch  12 |   281/  743 batches | lr 0.62 | ms/batch 21.17 | loss  6.17 | ppl   478.73
| epoch  12 |   282/  743 batches | lr 0.62 | ms/batch 21.02 | loss  6.12 | ppl   455.60
| epoch  12 |   283/  743 batches | lr 0.62 | ms/batch 21.25 | loss  6.18 | ppl   483.68
| epoch  12 |   284/  743 batches | lr 0.62 | ms/batch 21.22 | loss  6.10 | ppl   444.45
| epoch  12 |   285/  743 batches | lr 0.62 | ms/batch 21.23 | loss  6.11 | ppl   451.64
| epoch  12 |   286/  743 batches | lr 0.62 | ms/batch 22.22 | loss  6.18 | ppl   484.06
| epoch  12 |   287/ 

| epoch  12 |   373/  743 batches | lr 0.62 | ms/batch 22.58 | loss  6.24 | ppl   514.38
| epoch  12 |   374/  743 batches | lr 0.62 | ms/batch 22.87 | loss  6.20 | ppl   493.88
| epoch  12 |   375/  743 batches | lr 0.62 | ms/batch 22.26 | loss  6.12 | ppl   453.97
| epoch  12 |   376/  743 batches | lr 0.62 | ms/batch 22.31 | loss  6.17 | ppl   479.88
| epoch  12 |   377/  743 batches | lr 0.62 | ms/batch 22.27 | loss  6.15 | ppl   468.07
| epoch  12 |   378/  743 batches | lr 0.62 | ms/batch 22.63 | loss  6.18 | ppl   483.85
| epoch  12 |   379/  743 batches | lr 0.62 | ms/batch 22.51 | loss  6.01 | ppl   407.89
| epoch  12 |   380/  743 batches | lr 0.62 | ms/batch 34.95 | loss  6.18 | ppl   482.37
| epoch  12 |   381/  743 batches | lr 0.62 | ms/batch 41.76 | loss  6.24 | ppl   512.58
| epoch  12 |   382/  743 batches | lr 0.62 | ms/batch 38.61 | loss  6.11 | ppl   448.44
| epoch  12 |   383/  743 batches | lr 0.62 | ms/batch 38.20 | loss  6.27 | ppl   528.52
| epoch  12 |   384/ 

| epoch  12 |   473/  743 batches | lr 0.62 | ms/batch 24.27 | loss  6.28 | ppl   531.16
| epoch  12 |   474/  743 batches | lr 0.62 | ms/batch 22.81 | loss  6.17 | ppl   479.29
| epoch  12 |   475/  743 batches | lr 0.62 | ms/batch 22.21 | loss  6.19 | ppl   486.01
| epoch  12 |   476/  743 batches | lr 0.62 | ms/batch 23.71 | loss  6.19 | ppl   489.83
| epoch  12 |   477/  743 batches | lr 0.62 | ms/batch 22.44 | loss  6.17 | ppl   478.58
| epoch  12 |   478/  743 batches | lr 0.62 | ms/batch 23.01 | loss  6.24 | ppl   510.90
| epoch  12 |   479/  743 batches | lr 0.62 | ms/batch 23.33 | loss  6.23 | ppl   509.18
| epoch  12 |   480/  743 batches | lr 0.62 | ms/batch 23.46 | loss  6.27 | ppl   530.39
| epoch  12 |   481/  743 batches | lr 0.62 | ms/batch 23.92 | loss  6.12 | ppl   452.95
| epoch  12 |   482/  743 batches | lr 0.62 | ms/batch 25.83 | loss  6.21 | ppl   500.11
| epoch  12 |   483/  743 batches | lr 0.62 | ms/batch 26.24 | loss  6.24 | ppl   514.87
| epoch  12 |   484/ 

| epoch  12 |   570/  743 batches | lr 0.62 | ms/batch 22.31 | loss  6.30 | ppl   542.47
| epoch  12 |   571/  743 batches | lr 0.62 | ms/batch 23.20 | loss  6.29 | ppl   540.99
| epoch  12 |   572/  743 batches | lr 0.62 | ms/batch 21.22 | loss  6.17 | ppl   479.49
| epoch  12 |   573/  743 batches | lr 0.62 | ms/batch 21.22 | loss  6.20 | ppl   492.49
| epoch  12 |   574/  743 batches | lr 0.62 | ms/batch 20.92 | loss  6.16 | ppl   472.00
| epoch  12 |   575/  743 batches | lr 0.62 | ms/batch 20.83 | loss  6.20 | ppl   493.60
| epoch  12 |   576/  743 batches | lr 0.62 | ms/batch 21.00 | loss  6.18 | ppl   482.59
| epoch  12 |   577/  743 batches | lr 0.62 | ms/batch 20.89 | loss  6.13 | ppl   458.67
| epoch  12 |   578/  743 batches | lr 0.62 | ms/batch 21.07 | loss  6.16 | ppl   474.95
| epoch  12 |   579/  743 batches | lr 0.62 | ms/batch 21.31 | loss  6.19 | ppl   488.99
| epoch  12 |   580/  743 batches | lr 0.62 | ms/batch 22.08 | loss  6.20 | ppl   491.62
| epoch  12 |   581/ 

| epoch  12 |   667/  743 batches | lr 0.62 | ms/batch 28.86 | loss  6.07 | ppl   431.03
| epoch  12 |   668/  743 batches | lr 0.62 | ms/batch 27.74 | loss  6.04 | ppl   418.32
| epoch  12 |   669/  743 batches | lr 0.62 | ms/batch 23.70 | loss  6.23 | ppl   506.69
| epoch  12 |   670/  743 batches | lr 0.62 | ms/batch 23.75 | loss  6.16 | ppl   474.24
| epoch  12 |   671/  743 batches | lr 0.62 | ms/batch 22.72 | loss  6.11 | ppl   450.15
| epoch  12 |   672/  743 batches | lr 0.62 | ms/batch 22.54 | loss  6.26 | ppl   523.91
| epoch  12 |   673/  743 batches | lr 0.62 | ms/batch 22.36 | loss  6.03 | ppl   414.70
| epoch  12 |   674/  743 batches | lr 0.62 | ms/batch 22.13 | loss  6.30 | ppl   546.54
| epoch  12 |   675/  743 batches | lr 0.62 | ms/batch 22.60 | loss  6.20 | ppl   493.70
| epoch  12 |   676/  743 batches | lr 0.62 | ms/batch 24.30 | loss  6.14 | ppl   466.21
| epoch  12 |   677/  743 batches | lr 0.62 | ms/batch 24.27 | loss  6.14 | ppl   464.03
| epoch  12 |   678/ 

| epoch  13 |    18/  743 batches | lr 0.62 | ms/batch 21.87 | loss  6.08 | ppl   435.92
| epoch  13 |    19/  743 batches | lr 0.62 | ms/batch 21.99 | loss  6.30 | ppl   543.88
| epoch  13 |    20/  743 batches | lr 0.62 | ms/batch 20.58 | loss  6.15 | ppl   469.76
| epoch  13 |    21/  743 batches | lr 0.62 | ms/batch 20.84 | loss  6.06 | ppl   426.52
| epoch  13 |    22/  743 batches | lr 0.62 | ms/batch 20.60 | loss  6.16 | ppl   474.04
| epoch  13 |    23/  743 batches | lr 0.62 | ms/batch 20.94 | loss  6.12 | ppl   456.28
| epoch  13 |    24/  743 batches | lr 0.62 | ms/batch 20.87 | loss  6.15 | ppl   470.85
| epoch  13 |    25/  743 batches | lr 0.62 | ms/batch 20.72 | loss  6.13 | ppl   461.58
| epoch  13 |    26/  743 batches | lr 0.62 | ms/batch 20.98 | loss  6.10 | ppl   447.50
| epoch  13 |    27/  743 batches | lr 0.62 | ms/batch 20.81 | loss  6.10 | ppl   444.85
| epoch  13 |    28/  743 batches | lr 0.62 | ms/batch 22.40 | loss  6.09 | ppl   440.59
| epoch  13 |    29/ 

| epoch  13 |   118/  743 batches | lr 0.62 | ms/batch 22.21 | loss  6.11 | ppl   450.69
| epoch  13 |   119/  743 batches | lr 0.62 | ms/batch 22.08 | loss  6.17 | ppl   478.92
| epoch  13 |   120/  743 batches | lr 0.62 | ms/batch 21.28 | loss  6.28 | ppl   534.32
| epoch  13 |   121/  743 batches | lr 0.62 | ms/batch 20.76 | loss  6.14 | ppl   466.29
| epoch  13 |   122/  743 batches | lr 0.62 | ms/batch 20.67 | loss  6.24 | ppl   515.26
| epoch  13 |   123/  743 batches | lr 0.62 | ms/batch 20.79 | loss  6.36 | ppl   577.23
| epoch  13 |   124/  743 batches | lr 0.62 | ms/batch 21.00 | loss  6.26 | ppl   524.86
| epoch  13 |   125/  743 batches | lr 0.62 | ms/batch 20.95 | loss  6.19 | ppl   486.56
| epoch  13 |   126/  743 batches | lr 0.62 | ms/batch 21.06 | loss  6.27 | ppl   528.65
| epoch  13 |   127/  743 batches | lr 0.62 | ms/batch 20.84 | loss  6.34 | ppl   566.93
| epoch  13 |   128/  743 batches | lr 0.62 | ms/batch 21.90 | loss  6.15 | ppl   470.84
| epoch  13 |   129/ 

| epoch  13 |   218/  743 batches | lr 0.62 | ms/batch 21.96 | loss  6.01 | ppl   408.77
| epoch  13 |   219/  743 batches | lr 0.62 | ms/batch 22.67 | loss  6.09 | ppl   442.23
| epoch  13 |   220/  743 batches | lr 0.62 | ms/batch 21.40 | loss  6.25 | ppl   515.45
| epoch  13 |   221/  743 batches | lr 0.62 | ms/batch 21.84 | loss  6.31 | ppl   548.14
| epoch  13 |   222/  743 batches | lr 0.62 | ms/batch 21.52 | loss  6.01 | ppl   409.24
| epoch  13 |   223/  743 batches | lr 0.62 | ms/batch 21.14 | loss  6.03 | ppl   415.68
| epoch  13 |   224/  743 batches | lr 0.62 | ms/batch 21.11 | loss  6.39 | ppl   596.86
| epoch  13 |   225/  743 batches | lr 0.62 | ms/batch 21.09 | loss  6.11 | ppl   448.61
| epoch  13 |   226/  743 batches | lr 0.62 | ms/batch 21.31 | loss  6.09 | ppl   441.76
| epoch  13 |   227/  743 batches | lr 0.62 | ms/batch 21.31 | loss  6.02 | ppl   410.51
| epoch  13 |   228/  743 batches | lr 0.62 | ms/batch 22.09 | loss  6.18 | ppl   482.80
| epoch  13 |   229/ 

| epoch  13 |   318/  743 batches | lr 0.62 | ms/batch 22.30 | loss  6.18 | ppl   484.98
| epoch  13 |   319/  743 batches | lr 0.62 | ms/batch 22.93 | loss  6.19 | ppl   488.74
| epoch  13 |   320/  743 batches | lr 0.62 | ms/batch 21.63 | loss  6.33 | ppl   560.75
| epoch  13 |   321/  743 batches | lr 0.62 | ms/batch 21.35 | loss  6.15 | ppl   469.58
| epoch  13 |   322/  743 batches | lr 0.62 | ms/batch 22.04 | loss  6.35 | ppl   573.73
| epoch  13 |   323/  743 batches | lr 0.62 | ms/batch 21.24 | loss  6.25 | ppl   519.95
| epoch  13 |   324/  743 batches | lr 0.62 | ms/batch 21.24 | loss  6.14 | ppl   464.62
| epoch  13 |   325/  743 batches | lr 0.62 | ms/batch 22.09 | loss  6.03 | ppl   415.61
| epoch  13 |   326/  743 batches | lr 0.62 | ms/batch 20.78 | loss  6.15 | ppl   466.72
| epoch  13 |   327/  743 batches | lr 0.62 | ms/batch 20.69 | loss  6.07 | ppl   434.03
| epoch  13 |   328/  743 batches | lr 0.62 | ms/batch 21.98 | loss  6.16 | ppl   472.19
| epoch  13 |   329/ 

| epoch  13 |   418/  743 batches | lr 0.62 | ms/batch 22.64 | loss  6.18 | ppl   482.85
| epoch  13 |   419/  743 batches | lr 0.62 | ms/batch 22.58 | loss  6.08 | ppl   437.46
| epoch  13 |   420/  743 batches | lr 0.62 | ms/batch 21.31 | loss  6.21 | ppl   500.15
| epoch  13 |   421/  743 batches | lr 0.62 | ms/batch 21.48 | loss  6.13 | ppl   458.23
| epoch  13 |   422/  743 batches | lr 0.62 | ms/batch 21.46 | loss  6.15 | ppl   467.86
| epoch  13 |   423/  743 batches | lr 0.62 | ms/batch 21.63 | loss  6.24 | ppl   514.74
| epoch  13 |   424/  743 batches | lr 0.62 | ms/batch 22.83 | loss  6.00 | ppl   404.23
| epoch  13 |   425/  743 batches | lr 0.62 | ms/batch 22.40 | loss  6.11 | ppl   448.86
| epoch  13 |   426/  743 batches | lr 0.62 | ms/batch 21.14 | loss  6.27 | ppl   529.25
| epoch  13 |   427/  743 batches | lr 0.62 | ms/batch 21.37 | loss  6.07 | ppl   430.74
| epoch  13 |   428/  743 batches | lr 0.62 | ms/batch 21.05 | loss  6.16 | ppl   474.76
| epoch  13 |   429/ 

| epoch  13 |   518/  743 batches | lr 0.62 | ms/batch 21.27 | loss  6.21 | ppl   498.81
| epoch  13 |   519/  743 batches | lr 0.62 | ms/batch 22.43 | loss  6.12 | ppl   455.02
| epoch  13 |   520/  743 batches | lr 0.62 | ms/batch 22.08 | loss  6.36 | ppl   581.04
| epoch  13 |   521/  743 batches | lr 0.62 | ms/batch 21.55 | loss  6.07 | ppl   430.77
| epoch  13 |   522/  743 batches | lr 0.62 | ms/batch 21.55 | loss  6.21 | ppl   499.34
| epoch  13 |   523/  743 batches | lr 0.62 | ms/batch 21.34 | loss  6.05 | ppl   423.89
| epoch  13 |   524/  743 batches | lr 0.62 | ms/batch 21.22 | loss  6.05 | ppl   425.05
| epoch  13 |   525/  743 batches | lr 0.62 | ms/batch 21.38 | loss  6.11 | ppl   448.41
| epoch  13 |   526/  743 batches | lr 0.62 | ms/batch 21.11 | loss  6.19 | ppl   488.23
| epoch  13 |   527/  743 batches | lr 0.62 | ms/batch 20.96 | loss  6.01 | ppl   409.12
| epoch  13 |   528/  743 batches | lr 0.62 | ms/batch 21.35 | loss  6.19 | ppl   490.12
| epoch  13 |   529/ 

| epoch  13 |   618/  743 batches | lr 0.62 | ms/batch 22.81 | loss  6.37 | ppl   583.11
| epoch  13 |   619/  743 batches | lr 0.62 | ms/batch 22.47 | loss  6.07 | ppl   433.22
| epoch  13 |   620/  743 batches | lr 0.62 | ms/batch 21.38 | loss  6.19 | ppl   488.11
| epoch  13 |   621/  743 batches | lr 0.62 | ms/batch 21.47 | loss  6.23 | ppl   509.39
| epoch  13 |   622/  743 batches | lr 0.62 | ms/batch 22.07 | loss  6.10 | ppl   445.16
| epoch  13 |   623/  743 batches | lr 0.62 | ms/batch 23.08 | loss  6.18 | ppl   483.01
| epoch  13 |   624/  743 batches | lr 0.62 | ms/batch 21.59 | loss  6.19 | ppl   489.28
| epoch  13 |   625/  743 batches | lr 0.62 | ms/batch 21.37 | loss  6.04 | ppl   421.68
| epoch  13 |   626/  743 batches | lr 0.62 | ms/batch 21.81 | loss  6.14 | ppl   461.92
| epoch  13 |   627/  743 batches | lr 0.62 | ms/batch 21.38 | loss  6.14 | ppl   466.06
| epoch  13 |   628/  743 batches | lr 0.62 | ms/batch 21.20 | loss  6.10 | ppl   445.38
| epoch  13 |   629/ 

| epoch  13 |   718/  743 batches | lr 0.62 | ms/batch 22.57 | loss  6.28 | ppl   533.56
| epoch  13 |   719/  743 batches | lr 0.62 | ms/batch 22.89 | loss  6.25 | ppl   519.65
| epoch  13 |   720/  743 batches | lr 0.62 | ms/batch 21.58 | loss  6.05 | ppl   423.82
| epoch  13 |   721/  743 batches | lr 0.62 | ms/batch 21.22 | loss  6.28 | ppl   532.16
| epoch  13 |   722/  743 batches | lr 0.62 | ms/batch 21.39 | loss  6.31 | ppl   548.03
| epoch  13 |   723/  743 batches | lr 0.62 | ms/batch 21.27 | loss  6.26 | ppl   525.04
| epoch  13 |   724/  743 batches | lr 0.62 | ms/batch 20.89 | loss  6.21 | ppl   498.59
| epoch  13 |   725/  743 batches | lr 0.62 | ms/batch 20.95 | loss  6.23 | ppl   509.50
| epoch  13 |   726/  743 batches | lr 0.62 | ms/batch 21.34 | loss  6.27 | ppl   526.98
| epoch  13 |   727/  743 batches | lr 0.62 | ms/batch 21.22 | loss  6.28 | ppl   535.29
| epoch  13 |   728/  743 batches | lr 0.62 | ms/batch 22.83 | loss  6.18 | ppl   482.15
| epoch  13 |   729/ 

| epoch  14 |    68/  743 batches | lr 0.62 | ms/batch 23.96 | loss  6.24 | ppl   513.61
| epoch  14 |    69/  743 batches | lr 0.62 | ms/batch 22.65 | loss  6.10 | ppl   446.62
| epoch  14 |    70/  743 batches | lr 0.62 | ms/batch 21.91 | loss  6.17 | ppl   477.37
| epoch  14 |    71/  743 batches | lr 0.62 | ms/batch 21.83 | loss  6.08 | ppl   437.84
| epoch  14 |    72/  743 batches | lr 0.62 | ms/batch 21.97 | loss  6.07 | ppl   432.22
| epoch  14 |    73/  743 batches | lr 0.62 | ms/batch 21.61 | loss  6.02 | ppl   410.13
| epoch  14 |    74/  743 batches | lr 0.62 | ms/batch 21.89 | loss  6.22 | ppl   501.09
| epoch  14 |    75/  743 batches | lr 0.62 | ms/batch 21.53 | loss  6.11 | ppl   448.57
| epoch  14 |    76/  743 batches | lr 0.62 | ms/batch 21.80 | loss  6.17 | ppl   476.95
| epoch  14 |    77/  743 batches | lr 0.62 | ms/batch 21.61 | loss  6.21 | ppl   497.54
| epoch  14 |    78/  743 batches | lr 0.62 | ms/batch 21.75 | loss  6.16 | ppl   471.91
| epoch  14 |    79/ 

| epoch  14 |   168/  743 batches | lr 0.62 | ms/batch 22.31 | loss  6.25 | ppl   519.89
| epoch  14 |   169/  743 batches | lr 0.62 | ms/batch 22.90 | loss  6.23 | ppl   506.60
| epoch  14 |   170/  743 batches | lr 0.62 | ms/batch 21.28 | loss  6.11 | ppl   449.15
| epoch  14 |   171/  743 batches | lr 0.62 | ms/batch 21.98 | loss  6.23 | ppl   507.36
| epoch  14 |   172/  743 batches | lr 0.62 | ms/batch 21.24 | loss  6.22 | ppl   500.33
| epoch  14 |   173/  743 batches | lr 0.62 | ms/batch 21.23 | loss  5.94 | ppl   379.77
| epoch  14 |   174/  743 batches | lr 0.62 | ms/batch 21.19 | loss  6.20 | ppl   492.12
| epoch  14 |   175/  743 batches | lr 0.62 | ms/batch 22.74 | loss  6.23 | ppl   505.57
| epoch  14 |   176/  743 batches | lr 0.62 | ms/batch 21.48 | loss  6.21 | ppl   497.55
| epoch  14 |   177/  743 batches | lr 0.62 | ms/batch 22.64 | loss  6.04 | ppl   418.59
| epoch  14 |   178/  743 batches | lr 0.62 | ms/batch 21.38 | loss  6.19 | ppl   487.66
| epoch  14 |   179/ 

| epoch  14 |   268/  743 batches | lr 0.62 | ms/batch 21.96 | loss  6.03 | ppl   413.73
| epoch  14 |   269/  743 batches | lr 0.62 | ms/batch 22.86 | loss  6.08 | ppl   437.53
| epoch  14 |   270/  743 batches | lr 0.62 | ms/batch 21.62 | loss  6.09 | ppl   443.01
| epoch  14 |   271/  743 batches | lr 0.62 | ms/batch 21.62 | loss  6.02 | ppl   412.49
| epoch  14 |   272/  743 batches | lr 0.62 | ms/batch 21.86 | loss  6.25 | ppl   519.13
| epoch  14 |   273/  743 batches | lr 0.62 | ms/batch 21.74 | loss  6.14 | ppl   464.15
| epoch  14 |   274/  743 batches | lr 0.62 | ms/batch 21.63 | loss  6.14 | ppl   464.41
| epoch  14 |   275/  743 batches | lr 0.62 | ms/batch 21.49 | loss  6.06 | ppl   429.29
| epoch  14 |   276/  743 batches | lr 0.62 | ms/batch 21.88 | loss  6.05 | ppl   425.31
| epoch  14 |   277/  743 batches | lr 0.62 | ms/batch 21.89 | loss  6.13 | ppl   460.40
| epoch  14 |   278/  743 batches | lr 0.62 | ms/batch 21.87 | loss  6.16 | ppl   475.48
| epoch  14 |   279/ 

| epoch  14 |   364/  743 batches | lr 0.62 | ms/batch 22.12 | loss  6.21 | ppl   498.01
| epoch  14 |   365/  743 batches | lr 0.62 | ms/batch 22.91 | loss  6.19 | ppl   490.13
| epoch  14 |   366/  743 batches | lr 0.62 | ms/batch 21.93 | loss  6.14 | ppl   466.02
| epoch  14 |   367/  743 batches | lr 0.62 | ms/batch 21.92 | loss  6.05 | ppl   423.01
| epoch  14 |   368/  743 batches | lr 0.62 | ms/batch 22.22 | loss  6.22 | ppl   504.21
| epoch  14 |   369/  743 batches | lr 0.62 | ms/batch 22.02 | loss  6.24 | ppl   512.27
| epoch  14 |   370/  743 batches | lr 0.62 | ms/batch 22.13 | loss  6.02 | ppl   410.31
| epoch  14 |   371/  743 batches | lr 0.62 | ms/batch 22.51 | loss  6.15 | ppl   470.07
| epoch  14 |   372/  743 batches | lr 0.62 | ms/batch 22.12 | loss  6.23 | ppl   507.90
| epoch  14 |   373/  743 batches | lr 0.62 | ms/batch 22.26 | loss  6.20 | ppl   491.46
| epoch  14 |   374/  743 batches | lr 0.62 | ms/batch 25.90 | loss  6.14 | ppl   464.63
| epoch  14 |   375/ 

| epoch  14 |   460/  743 batches | lr 0.62 | ms/batch 21.65 | loss  6.18 | ppl   481.66
| epoch  14 |   461/  743 batches | lr 0.62 | ms/batch 22.79 | loss  6.22 | ppl   502.37
| epoch  14 |   462/  743 batches | lr 0.62 | ms/batch 21.61 | loss  6.21 | ppl   495.44
| epoch  14 |   463/  743 batches | lr 0.62 | ms/batch 21.53 | loss  6.22 | ppl   502.02
| epoch  14 |   464/  743 batches | lr 0.62 | ms/batch 21.39 | loss  6.24 | ppl   513.61
| epoch  14 |   465/  743 batches | lr 0.62 | ms/batch 21.61 | loss  6.05 | ppl   422.69
| epoch  14 |   466/  743 batches | lr 0.62 | ms/batch 21.77 | loss  6.19 | ppl   486.24
| epoch  14 |   467/  743 batches | lr 0.62 | ms/batch 21.49 | loss  6.16 | ppl   471.29
| epoch  14 |   468/  743 batches | lr 0.62 | ms/batch 22.80 | loss  6.11 | ppl   448.97
| epoch  14 |   469/  743 batches | lr 0.62 | ms/batch 21.84 | loss  6.24 | ppl   510.88
| epoch  14 |   470/  743 batches | lr 0.62 | ms/batch 21.55 | loss  6.12 | ppl   456.99
| epoch  14 |   471/ 

| epoch  14 |   556/  743 batches | lr 0.62 | ms/batch 25.84 | loss  6.00 | ppl   404.63
| epoch  14 |   557/  743 batches | lr 0.62 | ms/batch 23.22 | loss  6.00 | ppl   404.65
| epoch  14 |   558/  743 batches | lr 0.62 | ms/batch 22.10 | loss  6.22 | ppl   501.00
| epoch  14 |   559/  743 batches | lr 0.62 | ms/batch 22.11 | loss  6.13 | ppl   459.95
| epoch  14 |   560/  743 batches | lr 0.62 | ms/batch 21.99 | loss  6.22 | ppl   505.02
| epoch  14 |   561/  743 batches | lr 0.62 | ms/batch 22.31 | loss  6.20 | ppl   491.43
| epoch  14 |   562/  743 batches | lr 0.62 | ms/batch 22.35 | loss  6.15 | ppl   467.21
| epoch  14 |   563/  743 batches | lr 0.62 | ms/batch 22.10 | loss  6.19 | ppl   486.88
| epoch  14 |   564/  743 batches | lr 0.62 | ms/batch 23.62 | loss  6.21 | ppl   495.78
| epoch  14 |   565/  743 batches | lr 0.62 | ms/batch 22.01 | loss  6.25 | ppl   519.06
| epoch  14 |   566/  743 batches | lr 0.62 | ms/batch 25.80 | loss  6.19 | ppl   487.72
| epoch  14 |   567/ 

| epoch  14 |   655/  743 batches | lr 0.62 | ms/batch 23.25 | loss  6.11 | ppl   449.26
| epoch  14 |   656/  743 batches | lr 0.62 | ms/batch 24.16 | loss  6.21 | ppl   495.72
| epoch  14 |   657/  743 batches | lr 0.62 | ms/batch 21.81 | loss  6.09 | ppl   440.68
| epoch  14 |   658/  743 batches | lr 0.62 | ms/batch 21.60 | loss  6.17 | ppl   480.30
| epoch  14 |   659/  743 batches | lr 0.62 | ms/batch 21.74 | loss  6.17 | ppl   475.91
| epoch  14 |   660/  743 batches | lr 0.62 | ms/batch 21.53 | loss  6.06 | ppl   429.87
| epoch  14 |   661/  743 batches | lr 0.62 | ms/batch 21.32 | loss  6.06 | ppl   429.56
| epoch  14 |   662/  743 batches | lr 0.62 | ms/batch 21.99 | loss  6.16 | ppl   471.17
| epoch  14 |   663/  743 batches | lr 0.62 | ms/batch 21.69 | loss  6.08 | ppl   438.15
| epoch  14 |   664/  743 batches | lr 0.62 | ms/batch 21.60 | loss  6.15 | ppl   470.04
| epoch  14 |   665/  743 batches | lr 0.62 | ms/batch 21.75 | loss  6.17 | ppl   476.82
| epoch  14 |   666/ 

| epoch  15 |     8/  743 batches | lr 0.62 | ms/batch 22.29 | loss  6.25 | ppl   517.31
| epoch  15 |     9/  743 batches | lr 0.62 | ms/batch 23.02 | loss  6.17 | ppl   479.77
| epoch  15 |    10/  743 batches | lr 0.62 | ms/batch 21.21 | loss  6.15 | ppl   470.21
| epoch  15 |    11/  743 batches | lr 0.62 | ms/batch 21.15 | loss  6.16 | ppl   473.71
| epoch  15 |    12/  743 batches | lr 0.62 | ms/batch 21.18 | loss  6.23 | ppl   505.34
| epoch  15 |    13/  743 batches | lr 0.62 | ms/batch 20.48 | loss  6.12 | ppl   455.64
| epoch  15 |    14/  743 batches | lr 0.62 | ms/batch 20.56 | loss  6.20 | ppl   493.18
| epoch  15 |    15/  743 batches | lr 0.62 | ms/batch 20.82 | loss  6.20 | ppl   491.26
| epoch  15 |    16/  743 batches | lr 0.62 | ms/batch 20.94 | loss  6.24 | ppl   514.24
| epoch  15 |    17/  743 batches | lr 0.62 | ms/batch 21.05 | loss  6.19 | ppl   489.60
| epoch  15 |    18/  743 batches | lr 0.62 | ms/batch 22.54 | loss  6.03 | ppl   417.59
| epoch  15 |    19/ 

| epoch  15 |   108/  743 batches | lr 0.62 | ms/batch 24.41 | loss  6.10 | ppl   444.95
| epoch  15 |   109/  743 batches | lr 0.62 | ms/batch 24.53 | loss  6.09 | ppl   440.12
| epoch  15 |   110/  743 batches | lr 0.62 | ms/batch 22.38 | loss  6.07 | ppl   431.23
| epoch  15 |   111/  743 batches | lr 0.62 | ms/batch 21.90 | loss  6.13 | ppl   459.51
| epoch  15 |   112/  743 batches | lr 0.62 | ms/batch 21.80 | loss  5.95 | ppl   384.88
| epoch  15 |   113/  743 batches | lr 0.62 | ms/batch 22.10 | loss  6.08 | ppl   435.48
| epoch  15 |   114/  743 batches | lr 0.62 | ms/batch 21.85 | loss  6.17 | ppl   477.42
| epoch  15 |   115/  743 batches | lr 0.62 | ms/batch 22.02 | loss  6.04 | ppl   418.02
| epoch  15 |   116/  743 batches | lr 0.62 | ms/batch 22.03 | loss  6.31 | ppl   551.33
| epoch  15 |   117/  743 batches | lr 0.62 | ms/batch 22.32 | loss  6.24 | ppl   513.40
| epoch  15 |   118/  743 batches | lr 0.62 | ms/batch 26.01 | loss  6.06 | ppl   428.50
| epoch  15 |   119/ 

| epoch  15 |   208/  743 batches | lr 0.62 | ms/batch 21.41 | loss  6.15 | ppl   468.84
| epoch  15 |   209/  743 batches | lr 0.62 | ms/batch 22.57 | loss  6.16 | ppl   475.60
| epoch  15 |   210/  743 batches | lr 0.62 | ms/batch 21.67 | loss  5.96 | ppl   386.27
| epoch  15 |   211/  743 batches | lr 0.62 | ms/batch 21.58 | loss  6.19 | ppl   489.32
| epoch  15 |   212/  743 batches | lr 0.62 | ms/batch 22.27 | loss  6.01 | ppl   409.36
| epoch  15 |   213/  743 batches | lr 0.62 | ms/batch 21.40 | loss  6.03 | ppl   415.70
| epoch  15 |   214/  743 batches | lr 0.62 | ms/batch 21.56 | loss  6.17 | ppl   478.80
| epoch  15 |   215/  743 batches | lr 0.62 | ms/batch 21.30 | loss  6.10 | ppl   446.09
| epoch  15 |   216/  743 batches | lr 0.62 | ms/batch 21.31 | loss  6.10 | ppl   444.89
| epoch  15 |   217/  743 batches | lr 0.62 | ms/batch 21.32 | loss  6.08 | ppl   437.56
| epoch  15 |   218/  743 batches | lr 0.62 | ms/batch 21.35 | loss  5.98 | ppl   395.06
| epoch  15 |   219/ 

| epoch  15 |   308/  743 batches | lr 0.62 | ms/batch 21.40 | loss  6.24 | ppl   511.54
| epoch  15 |   309/  743 batches | lr 0.62 | ms/batch 22.17 | loss  6.12 | ppl   453.04
| epoch  15 |   310/  743 batches | lr 0.62 | ms/batch 21.74 | loss  6.15 | ppl   470.99
| epoch  15 |   311/  743 batches | lr 0.62 | ms/batch 21.75 | loss  6.10 | ppl   447.72
| epoch  15 |   312/  743 batches | lr 0.62 | ms/batch 21.76 | loss  6.04 | ppl   420.04
| epoch  15 |   313/  743 batches | lr 0.62 | ms/batch 21.44 | loss  6.11 | ppl   451.38
| epoch  15 |   314/  743 batches | lr 0.62 | ms/batch 21.43 | loss  6.02 | ppl   411.56
| epoch  15 |   315/  743 batches | lr 0.62 | ms/batch 21.75 | loss  6.14 | ppl   462.60
| epoch  15 |   316/  743 batches | lr 0.62 | ms/batch 22.14 | loss  6.08 | ppl   437.53
| epoch  15 |   317/  743 batches | lr 0.62 | ms/batch 21.67 | loss  6.27 | ppl   529.12
| epoch  15 |   318/  743 batches | lr 0.62 | ms/batch 21.91 | loss  6.13 | ppl   460.09
| epoch  15 |   319/ 

| epoch  15 |   406/  743 batches | lr 0.62 | ms/batch 26.82 | loss  6.16 | ppl   472.04
| epoch  15 |   407/  743 batches | lr 0.62 | ms/batch 26.22 | loss  6.23 | ppl   509.50
| epoch  15 |   408/  743 batches | lr 0.62 | ms/batch 24.68 | loss  6.00 | ppl   405.02
| epoch  15 |   409/  743 batches | lr 0.62 | ms/batch 24.18 | loss  6.18 | ppl   484.96
| epoch  15 |   410/  743 batches | lr 0.62 | ms/batch 26.28 | loss  6.10 | ppl   444.33
| epoch  15 |   411/  743 batches | lr 0.62 | ms/batch 24.07 | loss  6.23 | ppl   508.97
| epoch  15 |   412/  743 batches | lr 0.62 | ms/batch 24.25 | loss  6.16 | ppl   475.49
| epoch  15 |   413/  743 batches | lr 0.62 | ms/batch 21.92 | loss  6.05 | ppl   425.77
| epoch  15 |   414/  743 batches | lr 0.62 | ms/batch 22.01 | loss  6.11 | ppl   449.41
| epoch  15 |   415/  743 batches | lr 0.62 | ms/batch 22.25 | loss  6.00 | ppl   405.28
| epoch  15 |   416/  743 batches | lr 0.62 | ms/batch 23.12 | loss  6.20 | ppl   492.34
| epoch  15 |   417/ 

| epoch  15 |   502/  743 batches | lr 0.62 | ms/batch 25.38 | loss  6.10 | ppl   444.80
| epoch  15 |   503/  743 batches | lr 0.62 | ms/batch 25.72 | loss  6.07 | ppl   432.61
| epoch  15 |   504/  743 batches | lr 0.62 | ms/batch 24.01 | loss  6.05 | ppl   425.61
| epoch  15 |   505/  743 batches | lr 0.62 | ms/batch 23.91 | loss  6.20 | ppl   491.93
| epoch  15 |   506/  743 batches | lr 0.62 | ms/batch 23.91 | loss  6.12 | ppl   456.22
| epoch  15 |   507/  743 batches | lr 0.62 | ms/batch 23.69 | loss  6.21 | ppl   497.15
| epoch  15 |   508/  743 batches | lr 0.62 | ms/batch 23.52 | loss  6.20 | ppl   491.32
| epoch  15 |   509/  743 batches | lr 0.62 | ms/batch 23.60 | loss  6.18 | ppl   484.50
| epoch  15 |   510/  743 batches | lr 0.62 | ms/batch 26.66 | loss  6.01 | ppl   405.81
| epoch  15 |   511/  743 batches | lr 0.62 | ms/batch 23.59 | loss  6.12 | ppl   456.28
| epoch  15 |   512/  743 batches | lr 0.62 | ms/batch 25.91 | loss  6.10 | ppl   446.95
| epoch  15 |   513/ 

| epoch  15 |   596/  743 batches | lr 0.62 | ms/batch 31.39 | loss  5.98 | ppl   396.87
| epoch  15 |   597/  743 batches | lr 0.62 | ms/batch 34.25 | loss  6.00 | ppl   405.30
| epoch  15 |   598/  743 batches | lr 0.62 | ms/batch 31.63 | loss  6.09 | ppl   441.29
| epoch  15 |   599/  743 batches | lr 0.62 | ms/batch 29.01 | loss  6.22 | ppl   502.30
| epoch  15 |   600/  743 batches | lr 0.62 | ms/batch 30.56 | loss  6.03 | ppl   414.36
| epoch  15 |   601/  743 batches | lr 0.62 | ms/batch 32.99 | loss  6.14 | ppl   463.34
| epoch  15 |   602/  743 batches | lr 0.62 | ms/batch 31.02 | loss  6.10 | ppl   445.11
| epoch  15 |   603/  743 batches | lr 0.62 | ms/batch 31.56 | loss  6.09 | ppl   439.32
| epoch  15 |   604/  743 batches | lr 0.62 | ms/batch 34.78 | loss  6.12 | ppl   454.33
| epoch  15 |   605/  743 batches | lr 0.62 | ms/batch 29.00 | loss  6.23 | ppl   505.35
| epoch  15 |   606/  743 batches | lr 0.62 | ms/batch 27.47 | loss  6.04 | ppl   418.78
| epoch  15 |   607/ 

| epoch  15 |   693/  743 batches | lr 0.62 | ms/batch 48.04 | loss  6.16 | ppl   471.41
| epoch  15 |   694/  743 batches | lr 0.62 | ms/batch 31.08 | loss  6.11 | ppl   449.11
| epoch  15 |   695/  743 batches | lr 0.62 | ms/batch 29.30 | loss  6.14 | ppl   465.17
| epoch  15 |   696/  743 batches | lr 0.62 | ms/batch 28.37 | loss  6.10 | ppl   446.36
| epoch  15 |   697/  743 batches | lr 0.62 | ms/batch 29.35 | loss  6.17 | ppl   475.96
| epoch  15 |   698/  743 batches | lr 0.62 | ms/batch 40.88 | loss  6.10 | ppl   443.99
| epoch  15 |   699/  743 batches | lr 0.62 | ms/batch 32.61 | loss  5.99 | ppl   398.99
| epoch  15 |   700/  743 batches | lr 0.62 | ms/batch 34.45 | loss  6.03 | ppl   413.95
| epoch  15 |   701/  743 batches | lr 0.62 | ms/batch 29.05 | loss  6.11 | ppl   449.41
| epoch  15 |   702/  743 batches | lr 0.62 | ms/batch 26.15 | loss  6.12 | ppl   456.85
| epoch  15 |   703/  743 batches | lr 0.62 | ms/batch 29.48 | loss  6.06 | ppl   430.24
| epoch  15 |   704/ 

| epoch  16 |    47/  743 batches | lr 0.62 | ms/batch 20.83 | loss  6.10 | ppl   445.76
| epoch  16 |    48/  743 batches | lr 0.62 | ms/batch 21.15 | loss  6.12 | ppl   453.72
| epoch  16 |    49/  743 batches | lr 0.62 | ms/batch 20.17 | loss  6.18 | ppl   480.97
| epoch  16 |    50/  743 batches | lr 0.62 | ms/batch 20.38 | loss  6.23 | ppl   508.44
| epoch  16 |    51/  743 batches | lr 0.62 | ms/batch 20.33 | loss  6.10 | ppl   447.21
| epoch  16 |    52/  743 batches | lr 0.62 | ms/batch 20.28 | loss  6.15 | ppl   467.71
| epoch  16 |    53/  743 batches | lr 0.62 | ms/batch 20.32 | loss  6.17 | ppl   478.83
| epoch  16 |    54/  743 batches | lr 0.62 | ms/batch 20.27 | loss  6.20 | ppl   494.22
| epoch  16 |    55/  743 batches | lr 0.62 | ms/batch 20.47 | loss  6.18 | ppl   482.17
| epoch  16 |    56/  743 batches | lr 0.62 | ms/batch 20.50 | loss  6.07 | ppl   431.10
| epoch  16 |    57/  743 batches | lr 0.62 | ms/batch 20.20 | loss  6.17 | ppl   480.41
| epoch  16 |    58/ 

| epoch  16 |   149/  743 batches | lr 0.62 | ms/batch 22.07 | loss  6.23 | ppl   508.42
| epoch  16 |   150/  743 batches | lr 0.62 | ms/batch 21.66 | loss  6.17 | ppl   479.57
| epoch  16 |   151/  743 batches | lr 0.62 | ms/batch 20.84 | loss  6.28 | ppl   531.15
| epoch  16 |   152/  743 batches | lr 0.62 | ms/batch 20.94 | loss  6.03 | ppl   415.81
| epoch  16 |   153/  743 batches | lr 0.62 | ms/batch 21.05 | loss  6.13 | ppl   459.84
| epoch  16 |   154/  743 batches | lr 0.62 | ms/batch 20.93 | loss  6.10 | ppl   447.57
| epoch  16 |   155/  743 batches | lr 0.62 | ms/batch 21.12 | loss  6.14 | ppl   464.81
| epoch  16 |   156/  743 batches | lr 0.62 | ms/batch 20.92 | loss  5.92 | ppl   373.11
| epoch  16 |   157/  743 batches | lr 0.62 | ms/batch 20.94 | loss  6.10 | ppl   444.71
| epoch  16 |   158/  743 batches | lr 0.62 | ms/batch 20.95 | loss  6.05 | ppl   422.62
| epoch  16 |   159/  743 batches | lr 0.62 | ms/batch 21.62 | loss  6.11 | ppl   448.33
| epoch  16 |   160/ 

| epoch  16 |   249/  743 batches | lr 0.62 | ms/batch 22.10 | loss  6.07 | ppl   433.42
| epoch  16 |   250/  743 batches | lr 0.62 | ms/batch 21.97 | loss  6.04 | ppl   418.12
| epoch  16 |   251/  743 batches | lr 0.62 | ms/batch 21.24 | loss  6.02 | ppl   413.29
| epoch  16 |   252/  743 batches | lr 0.62 | ms/batch 21.36 | loss  6.16 | ppl   472.40
| epoch  16 |   253/  743 batches | lr 0.62 | ms/batch 21.18 | loss  6.02 | ppl   410.14
| epoch  16 |   254/  743 batches | lr 0.62 | ms/batch 21.06 | loss  5.97 | ppl   392.70
| epoch  16 |   255/  743 batches | lr 0.62 | ms/batch 21.38 | loss  6.14 | ppl   465.17
| epoch  16 |   256/  743 batches | lr 0.62 | ms/batch 21.30 | loss  5.93 | ppl   376.37
| epoch  16 |   257/  743 batches | lr 0.62 | ms/batch 21.15 | loss  6.01 | ppl   408.55
| epoch  16 |   258/  743 batches | lr 0.62 | ms/batch 22.10 | loss  6.00 | ppl   405.04
| epoch  16 |   259/  743 batches | lr 0.62 | ms/batch 22.10 | loss  6.01 | ppl   407.47
| epoch  16 |   260/ 

| epoch  16 |   348/  743 batches | lr 0.62 | ms/batch 28.59 | loss  6.09 | ppl   440.80
| epoch  16 |   349/  743 batches | lr 0.62 | ms/batch 26.55 | loss  6.16 | ppl   473.53
| epoch  16 |   350/  743 batches | lr 0.62 | ms/batch 23.85 | loss  6.00 | ppl   402.16
| epoch  16 |   351/  743 batches | lr 0.62 | ms/batch 23.72 | loss  6.03 | ppl   415.90
| epoch  16 |   352/  743 batches | lr 0.62 | ms/batch 24.20 | loss  6.16 | ppl   473.30
| epoch  16 |   353/  743 batches | lr 0.62 | ms/batch 26.41 | loss  6.10 | ppl   445.59
| epoch  16 |   354/  743 batches | lr 0.62 | ms/batch 23.83 | loss  6.12 | ppl   455.55
| epoch  16 |   355/  743 batches | lr 0.62 | ms/batch 23.48 | loss  6.13 | ppl   460.94
| epoch  16 |   356/  743 batches | lr 0.62 | ms/batch 23.42 | loss  6.10 | ppl   447.21
| epoch  16 |   357/  743 batches | lr 0.62 | ms/batch 23.76 | loss  5.95 | ppl   384.80
| epoch  16 |   358/  743 batches | lr 0.62 | ms/batch 28.52 | loss  6.15 | ppl   467.87
| epoch  16 |   359/ 

| epoch  16 |   443/  743 batches | lr 0.62 | ms/batch 28.62 | loss  6.27 | ppl   530.43
| epoch  16 |   444/  743 batches | lr 0.62 | ms/batch 25.87 | loss  6.06 | ppl   429.83
| epoch  16 |   445/  743 batches | lr 0.62 | ms/batch 24.82 | loss  6.25 | ppl   517.38
| epoch  16 |   446/  743 batches | lr 0.62 | ms/batch 26.97 | loss  6.20 | ppl   494.85
| epoch  16 |   447/  743 batches | lr 0.62 | ms/batch 24.42 | loss  6.29 | ppl   537.12
| epoch  16 |   448/  743 batches | lr 0.62 | ms/batch 27.10 | loss  6.15 | ppl   466.54
| epoch  16 |   449/  743 batches | lr 0.62 | ms/batch 25.00 | loss  6.13 | ppl   459.49
| epoch  16 |   450/  743 batches | lr 0.62 | ms/batch 25.03 | loss  6.00 | ppl   401.73
| epoch  16 |   451/  743 batches | lr 0.62 | ms/batch 24.55 | loss  6.13 | ppl   459.83
| epoch  16 |   452/  743 batches | lr 0.62 | ms/batch 28.17 | loss  6.11 | ppl   452.27
| epoch  16 |   453/  743 batches | lr 0.62 | ms/batch 29.60 | loss  6.23 | ppl   507.53
| epoch  16 |   454/ 

| epoch  16 |   536/  743 batches | lr 0.62 | ms/batch 26.74 | loss  6.10 | ppl   447.66
| epoch  16 |   537/  743 batches | lr 0.62 | ms/batch 26.02 | loss  6.23 | ppl   507.25
| epoch  16 |   538/  743 batches | lr 0.62 | ms/batch 27.06 | loss  6.06 | ppl   428.11
| epoch  16 |   539/  743 batches | lr 0.62 | ms/batch 25.88 | loss  6.08 | ppl   438.26
| epoch  16 |   540/  743 batches | lr 0.62 | ms/batch 25.61 | loss  6.08 | ppl   438.50
| epoch  16 |   541/  743 batches | lr 0.62 | ms/batch 26.70 | loss  6.11 | ppl   450.87
| epoch  16 |   542/  743 batches | lr 0.62 | ms/batch 26.71 | loss  6.05 | ppl   422.72
| epoch  16 |   543/  743 batches | lr 0.62 | ms/batch 24.34 | loss  6.01 | ppl   409.43
| epoch  16 |   544/  743 batches | lr 0.62 | ms/batch 28.16 | loss  6.30 | ppl   544.93
| epoch  16 |   545/  743 batches | lr 0.62 | ms/batch 34.70 | loss  6.18 | ppl   481.15
| epoch  16 |   546/  743 batches | lr 0.62 | ms/batch 26.81 | loss  6.11 | ppl   452.13
| epoch  16 |   547/ 

| epoch  16 |   629/  743 batches | lr 0.62 | ms/batch 39.16 | loss  6.01 | ppl   408.12
| epoch  16 |   630/  743 batches | lr 0.62 | ms/batch 37.10 | loss  6.07 | ppl   432.87
| epoch  16 |   631/  743 batches | lr 0.62 | ms/batch 28.51 | loss  6.23 | ppl   508.76
| epoch  16 |   632/  743 batches | lr 0.62 | ms/batch 27.35 | loss  6.06 | ppl   427.33
| epoch  16 |   633/  743 batches | lr 0.62 | ms/batch 27.01 | loss  6.12 | ppl   455.98
| epoch  16 |   634/  743 batches | lr 0.62 | ms/batch 26.84 | loss  6.16 | ppl   473.21
| epoch  16 |   635/  743 batches | lr 0.62 | ms/batch 27.12 | loss  6.11 | ppl   449.75
| epoch  16 |   636/  743 batches | lr 0.62 | ms/batch 26.34 | loss  5.99 | ppl   399.30
| epoch  16 |   637/  743 batches | lr 0.62 | ms/batch 40.01 | loss  6.04 | ppl   418.89
| epoch  16 |   638/  743 batches | lr 0.62 | ms/batch 33.37 | loss  6.20 | ppl   491.17
| epoch  16 |   639/  743 batches | lr 0.62 | ms/batch 31.56 | loss  5.93 | ppl   374.46
| epoch  16 |   640/ 

| epoch  16 |   724/  743 batches | lr 0.62 | ms/batch 35.27 | loss  6.13 | ppl   458.98
| epoch  16 |   725/  743 batches | lr 0.62 | ms/batch 31.09 | loss  6.18 | ppl   482.77
| epoch  16 |   726/  743 batches | lr 0.62 | ms/batch 26.02 | loss  6.23 | ppl   508.71
| epoch  16 |   727/  743 batches | lr 0.62 | ms/batch 26.31 | loss  6.23 | ppl   506.01
| epoch  16 |   728/  743 batches | lr 0.62 | ms/batch 26.37 | loss  6.11 | ppl   452.45
| epoch  16 |   729/  743 batches | lr 0.62 | ms/batch 27.40 | loss  6.13 | ppl   457.77
| epoch  16 |   730/  743 batches | lr 0.62 | ms/batch 30.97 | loss  6.04 | ppl   418.84
| epoch  16 |   731/  743 batches | lr 0.62 | ms/batch 27.68 | loss  6.13 | ppl   459.11
| epoch  16 |   732/  743 batches | lr 0.62 | ms/batch 32.36 | loss  6.25 | ppl   517.73
| epoch  16 |   733/  743 batches | lr 0.62 | ms/batch 31.04 | loss  6.19 | ppl   485.69
| epoch  16 |   734/  743 batches | lr 0.62 | ms/batch 26.60 | loss  6.15 | ppl   466.55
| epoch  16 |   735/ 

| epoch  17 |    78/  743 batches | lr 0.62 | ms/batch 29.99 | loss  6.09 | ppl   439.71
| epoch  17 |    79/  743 batches | lr 0.62 | ms/batch 25.47 | loss  6.06 | ppl   427.11
| epoch  17 |    80/  743 batches | lr 0.62 | ms/batch 35.72 | loss  6.01 | ppl   405.56
| epoch  17 |    81/  743 batches | lr 0.62 | ms/batch 25.34 | loss  6.27 | ppl   531.07
| epoch  17 |    82/  743 batches | lr 0.62 | ms/batch 46.07 | loss  6.12 | ppl   453.81
| epoch  17 |    83/  743 batches | lr 0.62 | ms/batch 38.09 | loss  6.02 | ppl   412.94
| epoch  17 |    84/  743 batches | lr 0.62 | ms/batch 65.32 | loss  6.08 | ppl   435.66
| epoch  17 |    85/  743 batches | lr 0.62 | ms/batch 35.10 | loss  6.08 | ppl   438.38
| epoch  17 |    86/  743 batches | lr 0.62 | ms/batch 21.45 | loss  5.97 | ppl   391.12
| epoch  17 |    87/  743 batches | lr 0.62 | ms/batch 28.19 | loss  6.15 | ppl   466.65
| epoch  17 |    88/  743 batches | lr 0.62 | ms/batch 24.68 | loss  6.04 | ppl   420.46
| epoch  17 |    89/ 

| epoch  17 |   173/  743 batches | lr 0.62 | ms/batch 22.40 | loss  5.90 | ppl   365.34
| epoch  17 |   174/  743 batches | lr 0.62 | ms/batch 22.56 | loss  6.15 | ppl   468.97
| epoch  17 |   175/  743 batches | lr 0.62 | ms/batch 21.82 | loss  6.19 | ppl   486.73
| epoch  17 |   176/  743 batches | lr 0.62 | ms/batch 23.10 | loss  6.16 | ppl   471.54
| epoch  17 |   177/  743 batches | lr 0.62 | ms/batch 22.99 | loss  5.95 | ppl   385.61
| epoch  17 |   178/  743 batches | lr 0.62 | ms/batch 21.61 | loss  6.14 | ppl   463.92
| epoch  17 |   179/  743 batches | lr 0.62 | ms/batch 22.44 | loss  6.12 | ppl   456.76
| epoch  17 |   180/  743 batches | lr 0.62 | ms/batch 21.98 | loss  5.98 | ppl   395.35
| epoch  17 |   181/  743 batches | lr 0.62 | ms/batch 21.47 | loss  6.13 | ppl   457.49
| epoch  17 |   182/  743 batches | lr 0.62 | ms/batch 21.39 | loss  6.05 | ppl   424.58
| epoch  17 |   183/  743 batches | lr 0.62 | ms/batch 23.90 | loss  6.18 | ppl   480.79
| epoch  17 |   184/ 

| epoch  17 |   273/  743 batches | lr 0.62 | ms/batch 25.26 | loss  6.08 | ppl   437.50
| epoch  17 |   274/  743 batches | lr 0.62 | ms/batch 23.20 | loss  6.07 | ppl   431.32
| epoch  17 |   275/  743 batches | lr 0.62 | ms/batch 22.39 | loss  6.01 | ppl   407.65
| epoch  17 |   276/  743 batches | lr 0.62 | ms/batch 22.14 | loss  6.01 | ppl   406.04
| epoch  17 |   277/  743 batches | lr 0.62 | ms/batch 22.90 | loss  6.08 | ppl   438.17
| epoch  17 |   278/  743 batches | lr 0.62 | ms/batch 21.98 | loss  6.09 | ppl   440.08
| epoch  17 |   279/  743 batches | lr 0.62 | ms/batch 24.22 | loss  6.10 | ppl   446.24
| epoch  17 |   280/  743 batches | lr 0.62 | ms/batch 22.14 | loss  6.07 | ppl   432.52
| epoch  17 |   281/  743 batches | lr 0.62 | ms/batch 22.35 | loss  6.06 | ppl   428.44
| epoch  17 |   282/  743 batches | lr 0.62 | ms/batch 22.34 | loss  6.02 | ppl   410.12
| epoch  17 |   283/  743 batches | lr 0.62 | ms/batch 24.30 | loss  6.07 | ppl   434.59
| epoch  17 |   284/ 

| epoch  17 |   372/  743 batches | lr 0.62 | ms/batch 30.70 | loss  6.16 | ppl   475.54
| epoch  17 |   373/  743 batches | lr 0.62 | ms/batch 26.45 | loss  6.18 | ppl   483.80
| epoch  17 |   374/  743 batches | lr 0.62 | ms/batch 27.38 | loss  6.11 | ppl   448.14
| epoch  17 |   375/  743 batches | lr 0.62 | ms/batch 27.33 | loss  6.06 | ppl   427.14
| epoch  17 |   376/  743 batches | lr 0.62 | ms/batch 27.29 | loss  6.09 | ppl   443.57
| epoch  17 |   377/  743 batches | lr 0.62 | ms/batch 26.34 | loss  6.05 | ppl   422.43
| epoch  17 |   378/  743 batches | lr 0.62 | ms/batch 24.53 | loss  6.06 | ppl   430.32
| epoch  17 |   379/  743 batches | lr 0.62 | ms/batch 24.22 | loss  5.89 | ppl   359.80
| epoch  17 |   380/  743 batches | lr 0.62 | ms/batch 26.48 | loss  6.08 | ppl   436.62
| epoch  17 |   381/  743 batches | lr 0.62 | ms/batch 28.90 | loss  6.16 | ppl   474.64
| epoch  17 |   382/  743 batches | lr 0.62 | ms/batch 24.38 | loss  6.06 | ppl   427.26
| epoch  17 |   383/ 

| epoch  17 |   471/  743 batches | lr 0.62 | ms/batch 21.34 | loss  6.05 | ppl   426.00
| epoch  17 |   472/  743 batches | lr 0.62 | ms/batch 22.78 | loss  6.12 | ppl   455.47
| epoch  17 |   473/  743 batches | lr 0.62 | ms/batch 21.77 | loss  6.18 | ppl   485.15
| epoch  17 |   474/  743 batches | lr 0.62 | ms/batch 22.33 | loss  6.10 | ppl   445.65
| epoch  17 |   475/  743 batches | lr 0.62 | ms/batch 22.07 | loss  6.12 | ppl   454.78
| epoch  17 |   476/  743 batches | lr 0.62 | ms/batch 23.94 | loss  6.11 | ppl   449.25
| epoch  17 |   477/  743 batches | lr 0.62 | ms/batch 22.28 | loss  6.08 | ppl   435.64
| epoch  17 |   478/  743 batches | lr 0.62 | ms/batch 22.49 | loss  6.12 | ppl   454.57
| epoch  17 |   479/  743 batches | lr 0.62 | ms/batch 24.67 | loss  6.11 | ppl   452.07
| epoch  17 |   480/  743 batches | lr 0.62 | ms/batch 28.04 | loss  6.15 | ppl   467.59
| epoch  17 |   481/  743 batches | lr 0.62 | ms/batch 27.52 | loss  6.00 | ppl   403.68
| epoch  17 |   482/ 

| epoch  17 |   564/  743 batches | lr 0.62 | ms/batch 24.06 | loss  6.15 | ppl   467.05
| epoch  17 |   565/  743 batches | lr 0.62 | ms/batch 23.97 | loss  6.17 | ppl   478.17
| epoch  17 |   566/  743 batches | lr 0.62 | ms/batch 22.63 | loss  6.11 | ppl   448.36
| epoch  17 |   567/  743 batches | lr 0.62 | ms/batch 24.30 | loss  6.02 | ppl   411.90
| epoch  17 |   568/  743 batches | lr 0.62 | ms/batch 22.42 | loss  6.32 | ppl   553.10
| epoch  17 |   569/  743 batches | lr 0.62 | ms/batch 22.26 | loss  5.88 | ppl   356.12
| epoch  17 |   570/  743 batches | lr 0.62 | ms/batch 22.23 | loss  6.20 | ppl   494.99
| epoch  17 |   571/  743 batches | lr 0.62 | ms/batch 22.16 | loss  6.21 | ppl   497.49
| epoch  17 |   572/  743 batches | lr 0.62 | ms/batch 22.79 | loss  6.06 | ppl   426.71
| epoch  17 |   573/  743 batches | lr 0.62 | ms/batch 22.43 | loss  6.11 | ppl   450.28
| epoch  17 |   574/  743 batches | lr 0.62 | ms/batch 25.54 | loss  6.06 | ppl   427.08
| epoch  17 |   575/ 

| epoch  17 |   658/  743 batches | lr 0.62 | ms/batch 24.17 | loss  6.09 | ppl   442.08
| epoch  17 |   659/  743 batches | lr 0.62 | ms/batch 24.05 | loss  6.11 | ppl   451.10
| epoch  17 |   660/  743 batches | lr 0.62 | ms/batch 22.97 | loss  6.00 | ppl   401.45
| epoch  17 |   661/  743 batches | lr 0.62 | ms/batch 22.26 | loss  5.98 | ppl   395.01
| epoch  17 |   662/  743 batches | lr 0.62 | ms/batch 22.74 | loss  6.08 | ppl   439.00
| epoch  17 |   663/  743 batches | lr 0.62 | ms/batch 22.59 | loss  6.02 | ppl   411.70
| epoch  17 |   664/  743 batches | lr 0.62 | ms/batch 22.29 | loss  6.09 | ppl   442.82
| epoch  17 |   665/  743 batches | lr 0.62 | ms/batch 22.71 | loss  6.08 | ppl   435.28
| epoch  17 |   666/  743 batches | lr 0.62 | ms/batch 22.33 | loss  6.12 | ppl   455.31
| epoch  17 |   667/  743 batches | lr 0.62 | ms/batch 22.33 | loss  5.98 | ppl   393.93
| epoch  17 |   668/  743 batches | lr 0.62 | ms/batch 25.20 | loss  5.96 | ppl   385.87
| epoch  17 |   669/ 

| epoch  18 |     8/  743 batches | lr 0.62 | ms/batch 22.52 | loss  6.15 | ppl   467.47
| epoch  18 |     9/  743 batches | lr 0.62 | ms/batch 22.02 | loss  6.06 | ppl   427.48
| epoch  18 |    10/  743 batches | lr 0.62 | ms/batch 20.86 | loss  6.07 | ppl   432.95
| epoch  18 |    11/  743 batches | lr 0.62 | ms/batch 21.15 | loss  6.05 | ppl   423.78
| epoch  18 |    12/  743 batches | lr 0.62 | ms/batch 20.95 | loss  6.15 | ppl   469.84
| epoch  18 |    13/  743 batches | lr 0.62 | ms/batch 20.56 | loss  6.04 | ppl   418.06
| epoch  18 |    14/  743 batches | lr 0.62 | ms/batch 20.59 | loss  6.12 | ppl   455.17
| epoch  18 |    15/  743 batches | lr 0.62 | ms/batch 20.88 | loss  6.12 | ppl   454.21
| epoch  18 |    16/  743 batches | lr 0.62 | ms/batch 20.86 | loss  6.13 | ppl   459.17
| epoch  18 |    17/  743 batches | lr 0.62 | ms/batch 20.80 | loss  6.09 | ppl   440.77
| epoch  18 |    18/  743 batches | lr 0.62 | ms/batch 21.96 | loss  5.94 | ppl   378.14
| epoch  18 |    19/ 

| epoch  18 |   108/  743 batches | lr 0.62 | ms/batch 21.54 | loss  6.04 | ppl   418.86
| epoch  18 |   109/  743 batches | lr 0.62 | ms/batch 22.68 | loss  6.05 | ppl   425.02
| epoch  18 |   110/  743 batches | lr 0.62 | ms/batch 22.05 | loss  5.99 | ppl   399.19
| epoch  18 |   111/  743 batches | lr 0.62 | ms/batch 23.06 | loss  6.06 | ppl   427.28
| epoch  18 |   112/  743 batches | lr 0.62 | ms/batch 22.97 | loss  5.90 | ppl   364.34
| epoch  18 |   113/  743 batches | lr 0.62 | ms/batch 21.31 | loss  6.03 | ppl   416.00
| epoch  18 |   114/  743 batches | lr 0.62 | ms/batch 21.89 | loss  6.12 | ppl   453.36
| epoch  18 |   115/  743 batches | lr 0.62 | ms/batch 22.57 | loss  5.98 | ppl   395.05
| epoch  18 |   116/  743 batches | lr 0.62 | ms/batch 21.05 | loss  6.25 | ppl   519.84
| epoch  18 |   117/  743 batches | lr 0.62 | ms/batch 20.82 | loss  6.19 | ppl   487.49
| epoch  18 |   118/  743 batches | lr 0.62 | ms/batch 21.05 | loss  6.11 | ppl   449.83
| epoch  18 |   119/ 

| epoch  18 |   208/  743 batches | lr 0.62 | ms/batch 21.79 | loss  6.10 | ppl   443.81
| epoch  18 |   209/  743 batches | lr 0.62 | ms/batch 22.29 | loss  6.09 | ppl   440.80
| epoch  18 |   210/  743 batches | lr 0.62 | ms/batch 22.33 | loss  5.93 | ppl   375.14
| epoch  18 |   211/  743 batches | lr 0.62 | ms/batch 22.46 | loss  6.11 | ppl   449.73
| epoch  18 |   212/  743 batches | lr 0.62 | ms/batch 21.66 | loss  5.90 | ppl   366.59
| epoch  18 |   213/  743 batches | lr 0.62 | ms/batch 21.38 | loss  5.94 | ppl   380.29
| epoch  18 |   214/  743 batches | lr 0.62 | ms/batch 21.32 | loss  6.09 | ppl   443.55
| epoch  18 |   215/  743 batches | lr 0.62 | ms/batch 21.19 | loss  6.03 | ppl   414.64
| epoch  18 |   216/  743 batches | lr 0.62 | ms/batch 21.25 | loss  6.02 | ppl   410.00
| epoch  18 |   217/  743 batches | lr 0.62 | ms/batch 20.88 | loss  6.01 | ppl   406.76
| epoch  18 |   218/  743 batches | lr 0.62 | ms/batch 21.26 | loss  5.90 | ppl   365.55
| epoch  18 |   219/ 

| epoch  18 |   307/  743 batches | lr 0.62 | ms/batch 25.26 | loss  6.14 | ppl   465.83
| epoch  18 |   308/  743 batches | lr 0.62 | ms/batch 22.99 | loss  6.13 | ppl   460.83
| epoch  18 |   309/  743 batches | lr 0.62 | ms/batch 23.50 | loss  6.03 | ppl   417.69
| epoch  18 |   310/  743 batches | lr 0.62 | ms/batch 23.46 | loss  6.06 | ppl   430.39
| epoch  18 |   311/  743 batches | lr 0.62 | ms/batch 22.02 | loss  6.02 | ppl   412.25
| epoch  18 |   312/  743 batches | lr 0.62 | ms/batch 22.01 | loss  5.95 | ppl   384.26
| epoch  18 |   313/  743 batches | lr 0.62 | ms/batch 21.49 | loss  6.06 | ppl   429.94
| epoch  18 |   314/  743 batches | lr 0.62 | ms/batch 21.97 | loss  5.92 | ppl   371.93
| epoch  18 |   315/  743 batches | lr 0.62 | ms/batch 21.60 | loss  6.05 | ppl   422.52
| epoch  18 |   316/  743 batches | lr 0.62 | ms/batch 21.40 | loss  6.00 | ppl   405.13
| epoch  18 |   317/  743 batches | lr 0.62 | ms/batch 23.55 | loss  6.18 | ppl   483.76
| epoch  18 |   318/ 

| epoch  18 |   405/  743 batches | lr 0.62 | ms/batch 25.54 | loss  6.02 | ppl   411.11
| epoch  18 |   406/  743 batches | lr 0.62 | ms/batch 24.54 | loss  6.08 | ppl   438.40
| epoch  18 |   407/  743 batches | lr 0.62 | ms/batch 22.45 | loss  6.14 | ppl   464.94
| epoch  18 |   408/  743 batches | lr 0.62 | ms/batch 22.53 | loss  5.90 | ppl   366.19
| epoch  18 |   409/  743 batches | lr 0.62 | ms/batch 22.53 | loss  6.10 | ppl   444.34
| epoch  18 |   410/  743 batches | lr 0.62 | ms/batch 22.34 | loss  6.01 | ppl   405.72
| epoch  18 |   411/  743 batches | lr 0.62 | ms/batch 22.57 | loss  6.14 | ppl   465.01
| epoch  18 |   412/  743 batches | lr 0.62 | ms/batch 23.59 | loss  6.08 | ppl   435.03
| epoch  18 |   413/  743 batches | lr 0.62 | ms/batch 22.33 | loss  5.97 | ppl   393.17
| epoch  18 |   414/  743 batches | lr 0.62 | ms/batch 23.09 | loss  6.01 | ppl   406.74
| epoch  18 |   415/  743 batches | lr 0.62 | ms/batch 24.72 | loss  5.92 | ppl   371.71
| epoch  18 |   416/ 

| epoch  18 |   502/  743 batches | lr 0.62 | ms/batch 26.96 | loss  6.00 | ppl   403.71
| epoch  18 |   503/  743 batches | lr 0.62 | ms/batch 23.34 | loss  5.97 | ppl   393.16
| epoch  18 |   504/  743 batches | lr 0.62 | ms/batch 22.66 | loss  5.97 | ppl   389.76
| epoch  18 |   505/  743 batches | lr 0.62 | ms/batch 22.50 | loss  6.11 | ppl   448.52
| epoch  18 |   506/  743 batches | lr 0.62 | ms/batch 22.47 | loss  6.04 | ppl   420.69
| epoch  18 |   507/  743 batches | lr 0.62 | ms/batch 22.35 | loss  6.15 | ppl   469.18
| epoch  18 |   508/  743 batches | lr 0.62 | ms/batch 22.24 | loss  6.11 | ppl   451.35
| epoch  18 |   509/  743 batches | lr 0.62 | ms/batch 22.49 | loss  6.09 | ppl   441.29
| epoch  18 |   510/  743 batches | lr 0.62 | ms/batch 22.81 | loss  5.92 | ppl   371.03
| epoch  18 |   511/  743 batches | lr 0.62 | ms/batch 22.43 | loss  6.03 | ppl   415.66
| epoch  18 |   512/  743 batches | lr 0.62 | ms/batch 26.79 | loss  6.05 | ppl   423.34
| epoch  18 |   513/ 

| epoch  18 |   601/  743 batches | lr 0.62 | ms/batch 24.87 | loss  6.08 | ppl   435.64
| epoch  18 |   602/  743 batches | lr 0.62 | ms/batch 22.33 | loss  6.02 | ppl   410.59
| epoch  18 |   603/  743 batches | lr 0.62 | ms/batch 22.52 | loss  5.99 | ppl   399.52
| epoch  18 |   604/  743 batches | lr 0.62 | ms/batch 21.87 | loss  6.02 | ppl   412.81
| epoch  18 |   605/  743 batches | lr 0.62 | ms/batch 22.31 | loss  6.11 | ppl   451.85
| epoch  18 |   606/  743 batches | lr 0.62 | ms/batch 22.46 | loss  5.94 | ppl   380.99
| epoch  18 |   607/  743 batches | lr 0.62 | ms/batch 23.29 | loss  6.01 | ppl   409.31
| epoch  18 |   608/  743 batches | lr 0.62 | ms/batch 23.49 | loss  6.12 | ppl   456.78
| epoch  18 |   609/  743 batches | lr 0.62 | ms/batch 22.71 | loss  5.97 | ppl   392.90
| epoch  18 |   610/  743 batches | lr 0.62 | ms/batch 24.97 | loss  5.95 | ppl   383.92
| epoch  18 |   611/  743 batches | lr 0.62 | ms/batch 24.23 | loss  6.12 | ppl   452.83
| epoch  18 |   612/ 

| epoch  18 |   699/  743 batches | lr 0.62 | ms/batch 30.56 | loss  5.89 | ppl   361.65
| epoch  18 |   700/  743 batches | lr 0.62 | ms/batch 36.08 | loss  5.94 | ppl   378.58
| epoch  18 |   701/  743 batches | lr 0.62 | ms/batch 27.22 | loss  6.00 | ppl   402.93
| epoch  18 |   702/  743 batches | lr 0.62 | ms/batch 31.70 | loss  6.02 | ppl   412.93
| epoch  18 |   703/  743 batches | lr 0.62 | ms/batch 31.93 | loss  5.98 | ppl   395.48
| epoch  18 |   704/  743 batches | lr 0.62 | ms/batch 33.14 | loss  6.05 | ppl   423.48
| epoch  18 |   705/  743 batches | lr 0.62 | ms/batch 34.25 | loss  6.07 | ppl   431.41
| epoch  18 |   706/  743 batches | lr 0.62 | ms/batch 35.63 | loss  5.96 | ppl   385.77
| epoch  18 |   707/  743 batches | lr 0.62 | ms/batch 30.71 | loss  6.01 | ppl   405.48
| epoch  18 |   708/  743 batches | lr 0.62 | ms/batch 32.69 | loss  6.15 | ppl   468.60
| epoch  18 |   709/  743 batches | lr 0.62 | ms/batch 27.17 | loss  6.00 | ppl   403.03
| epoch  18 |   710/ 

| epoch  19 |    50/  743 batches | lr 0.62 | ms/batch 21.85 | loss  6.15 | ppl   466.95
| epoch  19 |    51/  743 batches | lr 0.62 | ms/batch 21.94 | loss  5.99 | ppl   399.94
| epoch  19 |    52/  743 batches | lr 0.62 | ms/batch 20.79 | loss  6.05 | ppl   422.38
| epoch  19 |    53/  743 batches | lr 0.62 | ms/batch 20.66 | loss  6.05 | ppl   423.11
| epoch  19 |    54/  743 batches | lr 0.62 | ms/batch 20.54 | loss  6.14 | ppl   462.67
| epoch  19 |    55/  743 batches | lr 0.62 | ms/batch 20.62 | loss  6.08 | ppl   435.82
| epoch  19 |    56/  743 batches | lr 0.62 | ms/batch 20.77 | loss  6.00 | ppl   403.22
| epoch  19 |    57/  743 batches | lr 0.62 | ms/batch 20.91 | loss  6.08 | ppl   436.86
| epoch  19 |    58/  743 batches | lr 0.62 | ms/batch 20.84 | loss  6.29 | ppl   539.64
| epoch  19 |    59/  743 batches | lr 0.62 | ms/batch 20.98 | loss  6.08 | ppl   439.11
| epoch  19 |    60/  743 batches | lr 0.62 | ms/batch 22.34 | loss  6.14 | ppl   463.05
| epoch  19 |    61/ 

| epoch  19 |   150/  743 batches | lr 0.62 | ms/batch 21.81 | loss  6.09 | ppl   442.13
| epoch  19 |   151/  743 batches | lr 0.62 | ms/batch 22.41 | loss  6.18 | ppl   480.70
| epoch  19 |   152/  743 batches | lr 0.62 | ms/batch 21.78 | loss  5.94 | ppl   381.15
| epoch  19 |   153/  743 batches | lr 0.62 | ms/batch 21.22 | loss  6.05 | ppl   424.11
| epoch  19 |   154/  743 batches | lr 0.62 | ms/batch 21.14 | loss  5.99 | ppl   399.58
| epoch  19 |   155/  743 batches | lr 0.62 | ms/batch 21.14 | loss  6.03 | ppl   416.77
| epoch  19 |   156/  743 batches | lr 0.62 | ms/batch 21.42 | loss  5.83 | ppl   339.16
| epoch  19 |   157/  743 batches | lr 0.62 | ms/batch 21.26 | loss  5.97 | ppl   390.40
| epoch  19 |   158/  743 batches | lr 0.62 | ms/batch 21.30 | loss  5.94 | ppl   379.17
| epoch  19 |   159/  743 batches | lr 0.62 | ms/batch 21.42 | loss  6.03 | ppl   414.91
| epoch  19 |   160/  743 batches | lr 0.62 | ms/batch 23.02 | loss  6.05 | ppl   425.01
| epoch  19 |   161/ 

| epoch  19 |   250/  743 batches | lr 0.62 | ms/batch 22.60 | loss  5.94 | ppl   380.12
| epoch  19 |   251/  743 batches | lr 0.62 | ms/batch 22.42 | loss  5.93 | ppl   375.73
| epoch  19 |   252/  743 batches | lr 0.62 | ms/batch 21.91 | loss  6.05 | ppl   424.54
| epoch  19 |   253/  743 batches | lr 0.62 | ms/batch 21.23 | loss  5.95 | ppl   382.13
| epoch  19 |   254/  743 batches | lr 0.62 | ms/batch 21.37 | loss  5.87 | ppl   354.12
| epoch  19 |   255/  743 batches | lr 0.62 | ms/batch 21.02 | loss  6.08 | ppl   436.65
| epoch  19 |   256/  743 batches | lr 0.62 | ms/batch 21.09 | loss  5.81 | ppl   333.54
| epoch  19 |   257/  743 batches | lr 0.62 | ms/batch 21.46 | loss  5.90 | ppl   365.88
| epoch  19 |   258/  743 batches | lr 0.62 | ms/batch 21.50 | loss  5.89 | ppl   361.14
| epoch  19 |   259/  743 batches | lr 0.62 | ms/batch 21.48 | loss  5.91 | ppl   368.48
| epoch  19 |   260/  743 batches | lr 0.62 | ms/batch 23.17 | loss  6.15 | ppl   468.36
| epoch  19 |   261/ 

| epoch  19 |   350/  743 batches | lr 0.62 | ms/batch 24.66 | loss  5.89 | ppl   359.68
| epoch  19 |   351/  743 batches | lr 0.62 | ms/batch 25.11 | loss  5.91 | ppl   367.11
| epoch  19 |   352/  743 batches | lr 0.62 | ms/batch 22.44 | loss  6.07 | ppl   433.67
| epoch  19 |   353/  743 batches | lr 0.62 | ms/batch 22.25 | loss  6.00 | ppl   403.92
| epoch  19 |   354/  743 batches | lr 0.62 | ms/batch 22.29 | loss  6.03 | ppl   416.90
| epoch  19 |   355/  743 batches | lr 0.62 | ms/batch 24.37 | loss  6.04 | ppl   419.99
| epoch  19 |   356/  743 batches | lr 0.62 | ms/batch 23.85 | loss  5.98 | ppl   394.56
| epoch  19 |   357/  743 batches | lr 0.62 | ms/batch 23.74 | loss  5.88 | ppl   356.62
| epoch  19 |   358/  743 batches | lr 0.62 | ms/batch 23.51 | loss  6.03 | ppl   415.36
| epoch  19 |   359/  743 batches | lr 0.62 | ms/batch 23.14 | loss  5.99 | ppl   399.72
| epoch  19 |   360/  743 batches | lr 0.62 | ms/batch 25.04 | loss  6.04 | ppl   420.43
| epoch  19 |   361/ 

| epoch  19 |   448/  743 batches | lr 0.62 | ms/batch 23.82 | loss  6.03 | ppl   415.70
| epoch  19 |   449/  743 batches | lr 0.62 | ms/batch 26.40 | loss  6.01 | ppl   407.13
| epoch  19 |   450/  743 batches | lr 0.62 | ms/batch 23.96 | loss  5.94 | ppl   378.22
| epoch  19 |   451/  743 batches | lr 0.62 | ms/batch 26.76 | loss  6.05 | ppl   422.39
| epoch  19 |   452/  743 batches | lr 0.62 | ms/batch 24.14 | loss  6.04 | ppl   417.97
| epoch  19 |   453/  743 batches | lr 0.62 | ms/batch 23.87 | loss  6.16 | ppl   472.53
| epoch  19 |   454/  743 batches | lr 0.62 | ms/batch 23.99 | loss  6.10 | ppl   443.79
| epoch  19 |   455/  743 batches | lr 0.62 | ms/batch 24.40 | loss  6.04 | ppl   421.86
| epoch  19 |   456/  743 batches | lr 0.62 | ms/batch 25.00 | loss  6.08 | ppl   436.40
| epoch  19 |   457/  743 batches | lr 0.62 | ms/batch 25.92 | loss  6.19 | ppl   487.12
| epoch  19 |   458/  743 batches | lr 0.62 | ms/batch 26.53 | loss  6.11 | ppl   448.84
| epoch  19 |   459/ 

| epoch  19 |   549/  743 batches | lr 0.62 | ms/batch 26.33 | loss  5.94 | ppl   380.59
| epoch  19 |   550/  743 batches | lr 0.62 | ms/batch 25.73 | loss  6.07 | ppl   431.40
| epoch  19 |   551/  743 batches | lr 0.62 | ms/batch 24.16 | loss  6.15 | ppl   468.91
| epoch  19 |   552/  743 batches | lr 0.62 | ms/batch 25.91 | loss  6.19 | ppl   485.95
| epoch  19 |   553/  743 batches | lr 0.62 | ms/batch 24.67 | loss  6.00 | ppl   403.81
| epoch  19 |   554/  743 batches | lr 0.62 | ms/batch 24.72 | loss  5.92 | ppl   370.63
| epoch  19 |   555/  743 batches | lr 0.62 | ms/batch 24.62 | loss  5.89 | ppl   362.98
| epoch  19 |   556/  743 batches | lr 0.62 | ms/batch 24.29 | loss  5.88 | ppl   357.60
| epoch  19 |   557/  743 batches | lr 0.62 | ms/batch 24.99 | loss  5.86 | ppl   351.06
| epoch  19 |   558/  743 batches | lr 0.62 | ms/batch 28.37 | loss  6.12 | ppl   453.40
| epoch  19 |   559/  743 batches | lr 0.62 | ms/batch 25.97 | loss  5.99 | ppl   398.96
| epoch  19 |   560/ 

| epoch  19 |   644/  743 batches | lr 0.62 | ms/batch 38.70 | loss  6.02 | ppl   411.44
| epoch  19 |   645/  743 batches | lr 0.62 | ms/batch 33.98 | loss  6.04 | ppl   418.78
| epoch  19 |   646/  743 batches | lr 0.62 | ms/batch 26.91 | loss  5.95 | ppl   382.78
| epoch  19 |   647/  743 batches | lr 0.62 | ms/batch 27.46 | loss  5.89 | ppl   362.68
| epoch  19 |   648/  743 batches | lr 0.62 | ms/batch 27.47 | loss  5.96 | ppl   388.26
| epoch  19 |   649/  743 batches | lr 0.62 | ms/batch 26.48 | loss  6.06 | ppl   429.41
| epoch  19 |   650/  743 batches | lr 0.62 | ms/batch 27.67 | loss  6.16 | ppl   474.53
| epoch  19 |   651/  743 batches | lr 0.62 | ms/batch 38.91 | loss  6.05 | ppl   424.30
| epoch  19 |   652/  743 batches | lr 0.62 | ms/batch 41.42 | loss  6.06 | ppl   429.22
| epoch  19 |   653/  743 batches | lr 0.62 | ms/batch 33.09 | loss  6.00 | ppl   405.38
| epoch  19 |   654/  743 batches | lr 0.62 | ms/batch 30.89 | loss  5.88 | ppl   359.32
| epoch  19 |   655/ 

| epoch  19 |   741/  743 batches | lr 0.62 | ms/batch 27.29 | loss  5.96 | ppl   387.67
| epoch  19 |   742/  743 batches | lr 0.62 | ms/batch 30.74 | loss  6.07 | ppl   431.87
| epoch  19 |   743/  743 batches | lr 0.62 | ms/batch 23.23 | loss  5.93 | ppl   377.61
-----------------------------------------------------------------------------------------
| end of epoch  19 | time: 23.50s | valid loss  6.02 | valid ppl   411.04
-----------------------------------------------------------------------------------------
| epoch  20 |     1/  743 batches | lr 0.62 | ms/batch 44.07 | loss 12.96 | ppl 422987.20
| epoch  20 |     2/  743 batches | lr 0.62 | ms/batch 21.05 | loss  6.25 | ppl   517.61
| epoch  20 |     3/  743 batches | lr 0.62 | ms/batch 21.21 | loss  6.26 | ppl   525.38
| epoch  20 |     4/  743 batches | lr 0.62 | ms/batch 21.06 | loss  6.20 | ppl   494.26
| epoch  20 |     5/  743 batches | lr 0.62 | ms/batch 20.88 | loss  5.97 | ppl   392.70
| epoch  20 |     6/  743 batches

| epoch  20 |    95/  743 batches | lr 0.62 | ms/batch 21.38 | loss  5.96 | ppl   387.69
| epoch  20 |    96/  743 batches | lr 0.62 | ms/batch 22.94 | loss  6.12 | ppl   455.64
| epoch  20 |    97/  743 batches | lr 0.62 | ms/batch 21.40 | loss  5.91 | ppl   368.80
| epoch  20 |    98/  743 batches | lr 0.62 | ms/batch 21.59 | loss  5.88 | ppl   359.32
| epoch  20 |    99/  743 batches | lr 0.62 | ms/batch 21.83 | loss  6.07 | ppl   433.60
| epoch  20 |   100/  743 batches | lr 0.62 | ms/batch 22.22 | loss  5.85 | ppl   347.21
| epoch  20 |   101/  743 batches | lr 0.62 | ms/batch 21.02 | loss  5.99 | ppl   400.84
| epoch  20 |   102/  743 batches | lr 0.62 | ms/batch 21.22 | loss  6.06 | ppl   427.31
| epoch  20 |   103/  743 batches | lr 0.62 | ms/batch 21.48 | loss  5.97 | ppl   390.64
| epoch  20 |   104/  743 batches | lr 0.62 | ms/batch 21.33 | loss  5.95 | ppl   383.12
| epoch  20 |   105/  743 batches | lr 0.62 | ms/batch 21.38 | loss  5.95 | ppl   382.92
| epoch  20 |   106/ 

| epoch  20 |   195/  743 batches | lr 0.62 | ms/batch 25.07 | loss  6.03 | ppl   416.27
| epoch  20 |   196/  743 batches | lr 0.62 | ms/batch 22.57 | loss  5.90 | ppl   365.23
| epoch  20 |   197/  743 batches | lr 0.62 | ms/batch 21.64 | loss  5.94 | ppl   378.35
| epoch  20 |   198/  743 batches | lr 0.62 | ms/batch 21.69 | loss  5.83 | ppl   339.01
| epoch  20 |   199/  743 batches | lr 0.62 | ms/batch 21.87 | loss  6.08 | ppl   435.09
| epoch  20 |   200/  743 batches | lr 0.62 | ms/batch 21.80 | loss  5.92 | ppl   373.40
| epoch  20 |   201/  743 batches | lr 0.62 | ms/batch 22.46 | loss  5.96 | ppl   385.93
| epoch  20 |   202/  743 batches | lr 0.62 | ms/batch 21.83 | loss  5.93 | ppl   377.68
| epoch  20 |   203/  743 batches | lr 0.62 | ms/batch 21.53 | loss  5.99 | ppl   398.27
| epoch  20 |   204/  743 batches | lr 0.62 | ms/batch 21.48 | loss  5.86 | ppl   351.83
| epoch  20 |   205/  743 batches | lr 0.62 | ms/batch 21.61 | loss  5.88 | ppl   358.86
| epoch  20 |   206/ 

| epoch  20 |   295/  743 batches | lr 0.62 | ms/batch 24.70 | loss  6.07 | ppl   434.51
| epoch  20 |   296/  743 batches | lr 0.62 | ms/batch 23.58 | loss  6.06 | ppl   430.22
| epoch  20 |   297/  743 batches | lr 0.62 | ms/batch 21.46 | loss  5.91 | ppl   369.60
| epoch  20 |   298/  743 batches | lr 0.62 | ms/batch 21.40 | loss  6.17 | ppl   478.04
| epoch  20 |   299/  743 batches | lr 0.62 | ms/batch 21.71 | loss  5.99 | ppl   398.81
| epoch  20 |   300/  743 batches | lr 0.62 | ms/batch 21.83 | loss  6.07 | ppl   432.18
| epoch  20 |   301/  743 batches | lr 0.62 | ms/batch 21.87 | loss  5.99 | ppl   397.91
| epoch  20 |   302/  743 batches | lr 0.62 | ms/batch 21.47 | loss  5.97 | ppl   390.76
| epoch  20 |   303/  743 batches | lr 0.62 | ms/batch 21.61 | loss  5.98 | ppl   396.99
| epoch  20 |   304/  743 batches | lr 0.62 | ms/batch 21.58 | loss  6.00 | ppl   404.28
| epoch  20 |   305/  743 batches | lr 0.62 | ms/batch 21.67 | loss  5.99 | ppl   400.01
| epoch  20 |   306/ 

| epoch  20 |   393/  743 batches | lr 0.62 | ms/batch 26.12 | loss  6.08 | ppl   437.39
| epoch  20 |   394/  743 batches | lr 0.62 | ms/batch 27.31 | loss  5.95 | ppl   384.36
| epoch  20 |   395/  743 batches | lr 0.62 | ms/batch 24.55 | loss  6.02 | ppl   409.54
| epoch  20 |   396/  743 batches | lr 0.62 | ms/batch 25.00 | loss  6.19 | ppl   490.07
| epoch  20 |   397/  743 batches | lr 0.62 | ms/batch 24.98 | loss  5.86 | ppl   352.42
| epoch  20 |   398/  743 batches | lr 0.62 | ms/batch 24.61 | loss  6.01 | ppl   407.89
| epoch  20 |   399/  743 batches | lr 0.62 | ms/batch 25.15 | loss  6.00 | ppl   403.72
| epoch  20 |   400/  743 batches | lr 0.62 | ms/batch 25.50 | loss  6.04 | ppl   421.17
| epoch  20 |   401/  743 batches | lr 0.62 | ms/batch 24.49 | loss  5.95 | ppl   383.06
| epoch  20 |   402/  743 batches | lr 0.62 | ms/batch 30.91 | loss  6.04 | ppl   419.08
| epoch  20 |   403/  743 batches | lr 0.62 | ms/batch 27.87 | loss  5.99 | ppl   398.43
| epoch  20 |   404/ 

| epoch  20 |   491/  743 batches | lr 0.62 | ms/batch 24.32 | loss  6.04 | ppl   421.82
| epoch  20 |   492/  743 batches | lr 0.62 | ms/batch 28.16 | loss  5.99 | ppl   400.53
| epoch  20 |   493/  743 batches | lr 0.62 | ms/batch 26.61 | loss  6.08 | ppl   437.13
| epoch  20 |   494/  743 batches | lr 0.62 | ms/batch 22.19 | loss  6.06 | ppl   428.06
| epoch  20 |   495/  743 batches | lr 0.62 | ms/batch 22.22 | loss  5.93 | ppl   375.24
| epoch  20 |   496/  743 batches | lr 0.62 | ms/batch 23.25 | loss  6.06 | ppl   429.37
| epoch  20 |   497/  743 batches | lr 0.62 | ms/batch 22.04 | loss  5.93 | ppl   376.42
| epoch  20 |   498/  743 batches | lr 0.62 | ms/batch 22.00 | loss  6.14 | ppl   463.70
| epoch  20 |   499/  743 batches | lr 0.62 | ms/batch 24.23 | loss  6.00 | ppl   403.20
| epoch  20 |   500/  743 batches | lr 0.62 | ms/batch 24.46 | loss  5.90 | ppl   364.76
| epoch  20 |   501/  743 batches | lr 0.62 | ms/batch 26.38 | loss  6.01 | ppl   409.28
| epoch  20 |   502/ 

| epoch  20 |   592/  743 batches | lr 0.62 | ms/batch 23.98 | loss  5.95 | ppl   382.83
| epoch  20 |   593/  743 batches | lr 0.62 | ms/batch 23.44 | loss  5.94 | ppl   378.88
| epoch  20 |   594/  743 batches | lr 0.62 | ms/batch 22.89 | loss  5.90 | ppl   364.03
| epoch  20 |   595/  743 batches | lr 0.62 | ms/batch 22.74 | loss  5.92 | ppl   371.65
| epoch  20 |   596/  743 batches | lr 0.62 | ms/batch 22.42 | loss  5.85 | ppl   348.83
| epoch  20 |   597/  743 batches | lr 0.62 | ms/batch 22.27 | loss  5.85 | ppl   346.62
| epoch  20 |   598/  743 batches | lr 0.62 | ms/batch 23.29 | loss  5.96 | ppl   386.42
| epoch  20 |   599/  743 batches | lr 0.62 | ms/batch 22.16 | loss  6.08 | ppl   435.03
| epoch  20 |   600/  743 batches | lr 0.62 | ms/batch 22.19 | loss  5.89 | ppl   359.89
| epoch  20 |   601/  743 batches | lr 0.62 | ms/batch 22.45 | loss  6.03 | ppl   414.20
| epoch  20 |   602/  743 batches | lr 0.62 | ms/batch 26.08 | loss  5.96 | ppl   388.01
| epoch  20 |   603/ 

| epoch  20 |   685/  743 batches | lr 0.62 | ms/batch 27.81 | loss  5.98 | ppl   395.03
| epoch  20 |   686/  743 batches | lr 0.62 | ms/batch 36.77 | loss  5.99 | ppl   399.36
| epoch  20 |   687/  743 batches | lr 0.62 | ms/batch 32.81 | loss  5.94 | ppl   379.48
| epoch  20 |   688/  743 batches | lr 0.62 | ms/batch 32.47 | loss  6.05 | ppl   423.90
| epoch  20 |   689/  743 batches | lr 0.62 | ms/batch 27.19 | loss  6.05 | ppl   424.42
| epoch  20 |   690/  743 batches | lr 0.62 | ms/batch 28.54 | loss  6.02 | ppl   409.80
| epoch  20 |   691/  743 batches | lr 0.62 | ms/batch 26.62 | loss  5.89 | ppl   359.67
| epoch  20 |   692/  743 batches | lr 0.62 | ms/batch 27.09 | loss  6.10 | ppl   447.91
| epoch  20 |   693/  743 batches | lr 0.62 | ms/batch 31.83 | loss  6.02 | ppl   410.26
| epoch  20 |   694/  743 batches | lr 0.62 | ms/batch 31.16 | loss  5.98 | ppl   395.66
| epoch  20 |   695/  743 batches | lr 0.62 | ms/batch 30.53 | loss  6.01 | ppl   406.40
| epoch  20 |   696/ 

In [11]:
# Display the Corpus object built earlier to confirm it exists in this session.
corpus

<data.Corpus at 0x7f0a96828410>

In [12]:
# Shape of the training-split tensor (the same tensor passed to batchify above).
corpus.train.shape

torch.Size([929589])

In [13]:
# Shape of the test-split tensor.
corpus.test.shape

torch.Size([82430])

In [14]:
# Shape of the validation-split tensor.
corpus.valid.shape

torch.Size([73760])

In [15]:
# Number of entries in the dictionary's index-to-word table.
len(corpus.dictionary.idx2word)

10000

In [16]:
# Look up the integer index stored for the '<unk>' key.
corpus.dictionary.word2idx['<unk>']

26

In [17]:
# Look up the integer index stored for the '<eos>' key.
corpus.dictionary.word2idx['<eos>']

24

In [18]:
corpus.

SyntaxError: invalid syntax (1008075712.py, line 1)

There is a lot more that you can do with outputs (such as including interactive outputs)
with your book. For more information about this, see [the Jupyter Book documentation](https://jupyterbook.org).

In [21]:
from mymodel import MyElmanNetwork
import torch.optim as optim

# Smoke test for MyElmanNetwork: fit random token sequences to random per-step
# class labels for a few epochs and print the loss, then print the model's
# outputs on the same input. `torch` and `nn` come from an earlier cell.

# Example hyperparameters
vocab_size = 10000
embedding_dim = 100
input_size = embedding_dim  # input width is set equal to the embedding width
hidden_size = 20
output_size = 5
# NOTE(review): `sequence_length` is also assigned (to 5) in an earlier cell;
# running this cell overwrites that value for every cell executed afterwards.
sequence_length = 6
batch_size = 3
num_epochs = 10
learning_rate = 0.001

# Create an instance of the Elman network
elman_net = MyElmanNetwork(input_size, hidden_size, output_size, vocab_size, embedding_dim)

# Random integer token ids and random integer class labels, one per
# (batch element, time step)
input_sequence = torch.randint(0, vocab_size, (batch_size, sequence_length))
target_labels = torch.randint(0, output_size, (batch_size, sequence_length))

# Define the loss function and the optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(elman_net.parameters(), lr=learning_rate)

# The loss is fed 2-D scores and 1-D class indices. The targets never change
# between epochs, so flatten them once here instead of on every iteration.
target_labels = target_labels.reshape(-1)

# Training loop: every epoch is a single full-batch step on the same data
for epoch in range(num_epochs):
    # Clear gradients accumulated by the previous step
    optimizer.zero_grad()

    # Forward pass
    outputs = elman_net(input_sequence)

    # Collapse all leading dimensions so each row holds one step's class scores.
    # reshape (rather than view) also works if the model output is
    # non-contiguous, where view would raise.
    outputs = outputs.reshape(-1, output_size)

    # Calculate the loss
    loss = loss_function(outputs, target_labels)

    # Backward pass and optimization step
    loss.backward()
    optimizer.step()

    # Print the loss for each epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# The final forward pass is only inspected, never back-propagated through,
# so run it without recording an autograd graph.
with torch.no_grad():
    final_outputs = elman_net(input_sequence)
print(final_outputs)

Epoch 1/10, Loss: 1.6084262132644653
Epoch 2/10, Loss: 1.5708050727844238
Epoch 3/10, Loss: 1.5337529182434082
Epoch 4/10, Loss: 1.4973175525665283
Epoch 5/10, Loss: 1.4615721702575684
Epoch 6/10, Loss: 1.4265872240066528
Epoch 7/10, Loss: 1.3924317359924316
Epoch 8/10, Loss: 1.3591649532318115
Epoch 9/10, Loss: 1.326827883720398
Epoch 10/10, Loss: 1.2954357862472534
tensor([[[-0.4557,  0.3464,  0.6389,  0.3665, -0.7939],
         [ 0.1077,  0.0440,  0.5277,  0.1991,  0.1102],
         [-0.2033,  0.5700, -0.0068,  0.3846,  0.2028],
         [ 0.2616, -0.4129,  0.1387,  0.0085,  0.7012],
         [-0.4419,  0.3845,  0.0186,  0.4997, -0.4066],
         [-0.2927,  0.0781,  0.9332,  0.6109, -0.3657]],

        [[-0.0044, -0.1858,  0.0045,  0.6286,  0.2181],
         [ 0.0687,  0.1818, -0.2695,  0.2359, -0.0188],
         [-0.5020, -0.2445,  0.2477,  0.6203, -0.5808],
         [-0.6771, -0.3951,  0.8126,  0.3976, -0.4112],
         [-0.0098,  0.5429, -0.4774,  0.2871, -0.4462],
         [ 0