# Imports and Installs



In [None]:
import math
import random
import os
import time
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import tqdm
import importlib
from pathlib import Path
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import AutoTokenizer
from itertools import chain

# Initial Models

We will build the following modules:

1. `ChunkwiseLinearAttentionHead`: encapsulates the linear attention mechanism, computed using a chunkwise parallel algorithm.

2. `CLAWrapperNextToken`: wrapper around a _single_ ChunkwiseLinearAttentionHead for next-token prediction. The wrapper consists of an embedding layer to convert tokens into vectors for the head, the head itself, a residual feed-forward block, and a final fully-connected layer that projects onto token logits.

3. `CLAWrapperTextClassification`: wrapper around a _single_ ChunkwiseLinearAttentionHead for text classification. Its architecture is similar to that of `CLAWrapperNextToken`, except that the final linear layer's output dimension is `num_classes` instead of `vocab_size`.

In [None]:
# class Parallel

In [None]:
class ChunkwiseLinearAttentionHead(nn.Module):
  """Single causal linear-attention head computed chunk by chunk.

  The sequence is split into chunks of ``chunk_size`` tokens. Within a chunk
  the (unnormalized, softmax-free) attention ``(Q K^T * M) V`` is computed in
  parallel using a lower-triangular mask ``M``. Contributions from earlier
  chunks are carried by a running state ``S`` that accumulates ``V^T K``.

  Args:
    in_features: width of the input vectors.
    out_features: width of the query/key/value projections and of the output.
    chunk_size: tokens per chunk; the input length must be a multiple of it.
    seq_len: nominal sequence length. Kept as an attribute; ``forward`` works
      from the length of the tensor it receives.
  """

  def __init__(self, in_features, out_features, chunk_size, seq_len):
    super().__init__()
    self.in_features = in_features
    self.out_features = out_features
    self.chunk_size = chunk_size
    self.seq_len = seq_len

    self.Q_linear = nn.Linear(in_features, out_features, bias=False)
    self.K_linear = nn.Linear(in_features, out_features, bias=False)
    self.V_linear = nn.Linear(in_features, out_features, bias=False)

    # torch.tril already returns a tensor; wrapping it in torch.tensor(...)
    # only makes a copy and triggers a copy-construct UserWarning.
    # Registered as a buffer so it follows the module across devices.
    self.register_buffer("M", torch.tril(torch.ones(chunk_size, chunk_size)))

  def rope(self):
    """Placeholder: rotary position embeddings are not implemented."""
    pass

  def forward(self, x):
    """Apply causal linear attention.

    Args:
      x: tensor indexed as (batch, length, in_features).

    Returns:
      Tensor of shape (batch, length, out_features).

    Raises:
      ValueError: if ``length`` is not a multiple of ``chunk_size``.
    """
    batch, length = x.shape[0], x.shape[1]
    if length % self.chunk_size != 0:
      raise ValueError(
          f"sequence length {length} is not a multiple of chunk_size {self.chunk_size}"
      )
    n_chunks = length // self.chunk_size

    # Project, then view each projection as (batch, chunk, position, feature)
    blocked = (batch, n_chunks, self.chunk_size, self.out_features)
    Q = self.Q_linear(x).reshape(blocked)
    K = self.K_linear(x).reshape(blocked)
    V = self.V_linear(x).reshape(blocked)

    # Intra-chunk term: causal attention restricted to each chunk
    intra = (Q @ K.transpose(-2, -1) * self.M) @ V

    # Inter-chunk term: every chunk reads the state built from earlier chunks.
    # The state is created with the projections' dtype/device so that the
    # matmul below also works for non-float32 modules.
    S = torch.zeros(
        batch, self.out_features, self.out_features, device=Q.device, dtype=Q.dtype
    )
    inter = []
    for i in range(n_chunks):
      inter.append(Q[:, i] @ S.transpose(-2, -1))
      S = S + V[:, i].transpose(-2, -1) @ K[:, i]

    out = intra + torch.stack(inter, dim=1) if inter else intra
    return out.reshape(batch, -1, self.out_features)

In [None]:
class CLAWrapperNextToken(nn.Module):
  """Next-token model built around one ChunkwiseLinearAttentionHead.

  Pipeline: token embedding + sinusoidal position encoding -> attention head
  -> residual feed-forward block -> linear projection to vocabulary logits.
  """

  def __init__(self, in_features, out_features, chunk_size, vocab_size, seq_len, ffn_multiplier=4, p=0.1):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, in_features)
    self.head = ChunkwiseLinearAttentionHead(in_features, out_features, chunk_size, seq_len)

    hidden_width = ffn_multiplier * out_features
    ffn_layers = [
        nn.Linear(out_features, hidden_width),
        nn.GELU(),
        nn.Dropout(p),
        nn.Linear(hidden_width, out_features),
        nn.Dropout(p),
    ]
    self.ffn = nn.Sequential(*ffn_layers)

    self.fc = nn.Linear(out_features, vocab_size)

  def forward(self, x, attention_mask=None):
    """Return unnormalized vocabulary logits for every position.

    ``attention_mask`` is accepted but not used by this method.
    """
    token_vectors = self.embed(x)
    length, width = token_vectors.shape[1], token_vectors.shape[2]
    device = token_vectors.device

    # Sinusoidal absolute position encodings: sin on even features, cos on odd
    positions = torch.arange(length, device=device).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, width, 2, device=device) * -(math.log(10000.0) / width))
    angles = positions * div_term
    encodings = torch.zeros_like(token_vectors)
    encodings[:, :, 0::2] = torch.sin(angles)
    encodings[:, :, 1::2] = torch.cos(angles)

    attended = self.head(token_vectors + encodings)

    # Residual feed-forward, then project each position onto the vocabulary
    return self.fc(attended + self.ffn(attended))

In [None]:
class CLAWrapperNextToken2(nn.Module):
  """Next-token model that stacks two ChunkwiseLinearAttentionHead layers.

  The first head's output passes through a residual feed-forward block and is
  mapped back to the input width by ``fc_mid``; the position-encoded embeddings
  are added again before the second head. A second residual feed-forward block
  and a final linear layer produce vocabulary logits.
  """

  def __init__(self, in_features, out_features, chunk_size, vocab_size, seq_len, ffn_multiplier=4, p=0.1):
    super().__init__()
    hidden_width = ffn_multiplier * out_features

    def make_ffn():
      # Linear -> GELU -> Dropout -> Linear -> Dropout
      return nn.Sequential(
          nn.Linear(out_features, hidden_width),
          nn.GELU(),
          nn.Dropout(p),
          nn.Linear(hidden_width, out_features),
          nn.Dropout(p),
      )

    # Submodules are created in the same order as they are applied
    self.embed = nn.Embedding(vocab_size, in_features)
    self.head1 = ChunkwiseLinearAttentionHead(in_features, out_features, chunk_size, seq_len)
    self.ffn = make_ffn()

    self.fc_mid = nn.Linear(out_features, in_features)
    self.head2 = ChunkwiseLinearAttentionHead(in_features, out_features, chunk_size, seq_len)
    self.ffn2 = make_ffn()

    self.fc = nn.Linear(out_features, vocab_size)

  def forward(self, x, attention_mask=None):
    """Return unnormalized vocabulary logits for every position.

    ``attention_mask`` is accepted but not used by this method.
    """
    token_vectors = self.embed(x)
    length, width = token_vectors.shape[1], token_vectors.shape[2]
    device = token_vectors.device

    # Sinusoidal absolute position encodings: sin on even features, cos on odd
    positions = torch.arange(length, device=device).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, width, 2, device=device) * -(math.log(10000.0) / width))
    angles = positions * div_term
    encodings = torch.zeros_like(token_vectors)
    encodings[:, :, 0::2] = torch.sin(angles)
    encodings[:, :, 1::2] = torch.cos(angles)
    encoded = token_vectors + encodings

    first_pass = self.head1(encoded)

    # Bring the first layer's output back to the head input width, then
    # re-inject the encoded embeddings before the second head
    bridged = self.fc_mid(first_pass + self.ffn(first_pass))
    second_pass = self.head2(bridged + encoded)

    # Residual feed-forward, then project each position onto the vocabulary
    return self.fc(second_pass + self.ffn2(second_pass))

In [None]:
class CLAWrapperTextClassification(nn.Module):
  """Sequence classifier built around one ChunkwiseLinearAttentionHead.

  Tokens are embedded, passed through the attention head, and the vector at
  one position per sequence is projected onto ``num_classes`` logits.
  """

  def __init__(self, in_features, out_features, chunk_size, vocab_size, num_classes, seq_len):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, in_features)
    self.head = ChunkwiseLinearAttentionHead(in_features, out_features, chunk_size, seq_len)
    self.fc = nn.Linear(out_features, num_classes)

  def forward(self, x, attention_mask=None):
    """Return unnormalized class logits, one row per sequence.

    Args:
      x: token ids, one row per sequence.
      attention_mask: optional tensor with one row per sequence. The position
        used for classification is ``attention_mask.sum(dim=1) - 1``; without a
        mask the last position is used.
    """
    embeds = self.embed(x)
    attention_vectors = self.head(embeds)
    device = attention_vectors.device

    if attention_mask is not None:
      # Keep the index on the activations' device and make it integral, so
      # that float or differently-placed masks still index correctly.
      cutoff_indices = attention_mask.to(device).sum(dim=1).long() - 1
    else:
      cutoff_indices = -1

    # Row selector created on the same device as the tensor being indexed
    arange_selector = torch.arange(attention_vectors.shape[0], device=device)

    # attention_vectors is B x L x D -- pick one position per row to get B x D
    out = self.fc(attention_vectors[arange_selector, cutoff_indices, :])

    # return unnormalized logits for cross entropy loss
    return out

## Tests of initial models

In [None]:
# Smoke test: run one attention head on random data and compare shapes.
batch_size = 64
seq_len = 1024
dims = 512

data = torch.randn(batch_size, seq_len, dims)
# Head projects dims -> 3*dims features, using chunks of 64 tokens
linhead = ChunkwiseLinearAttentionHead(dims, 3*dims, 64, seq_len)
# `data`, `linhead` and `out` are reused by the AWQ simulation cells further down
out = linhead(data)
print()
print(data.shape)
print(out.shape)

  self.M = torch.tensor(torch.tril(torch.ones(chunk_size, chunk_size)))


torch.Size([64, 1024, 1536])
torch.Size([64, 1024, 1536])
torch.Size([64, 1024, 1536])

torch.Size([64, 1024, 512])
torch.Size([64, 1024, 1536])


In [None]:
# List every (name, module) pair of the head; index 0 is the head itself (name '')
for i, k in enumerate(linhead.named_modules()):
  print(i)
  print(k)
  print()

0
('', ChunkwiseLinearAttentionHead(
  (Q_linear): Linear(in_features=512, out_features=1536, bias=False)
  (K_linear): Linear(in_features=512, out_features=1536, bias=False)
  (V_linear): Linear(in_features=512, out_features=1536, bias=False)
))

1
('Q_linear', Linear(in_features=512, out_features=1536, bias=False))

2
('K_linear', Linear(in_features=512, out_features=1536, bias=False))

3
('V_linear', Linear(in_features=512, out_features=1536, bias=False))



# AWQ simulation

In [None]:
# core quantization method (simulated quantization)
def pseudo_quantize_tensor(w, n_bit=4, q_group_size=-1):
    """Simulate asymmetric integer quantization of a weight tensor.

    Each group is mapped onto the integer range ``[0, 2**n_bit - 1]`` and then
    mapped back, so the result has the original shape but only takes values
    representable after quantization.

    Args:
        w: weight tensor. With ``q_group_size > 0`` its last dimension must be
            a multiple of the group size; otherwise it must be 2-D.
        n_bit: number of bits of the simulated integer format.
        q_group_size: number of consecutive values sharing one scale and zero
            point. A value <= 0 means one group per row of ``w``.

    Returns:
        A new tensor with the same shape as ``w``.
    """
    org_w_shape = w.shape
    if q_group_size > 0:
        assert org_w_shape[-1] % q_group_size == 0
        w = w.reshape(-1, q_group_size)

    assert w.dim() == 2
    # Shape every intermediate must keep. Taken from the tensor itself so that
    # the default q_group_size=-1 (per-row groups) passes the checks below.
    grouped_shape = w.shape

    # Calculate the maximum (\alpha) and minimum values (\beta) of each group.
    max_val = w.amax(dim=1, keepdim=True)
    assert max_val.shape == (grouped_shape[0], 1)
    min_val = w.amin(dim=1, keepdim=True)
    assert min_val.shape == (grouped_shape[0], 1)

    # Calculate the scale factor and zero point.  (Formula 1 & 2)
    # The clamp keeps the scale non-zero for constant groups.
    max_int = 2 ** n_bit - 1
    scales = (max_val - min_val).clamp(min=1e-5) / max_int
    assert scales.shape == max_val.shape
    zeros = (-torch.round(min_val / scales)).clamp_(0, max_int)
    assert zeros.shape == min_val.shape

    assert torch.isnan(scales).sum() == 0
    assert torch.isnan(w).sum() == 0

    # Quantize W: Map values in the range [\beta, \alpha] to lie within [0, 2^b - 1] (Formula 3)
    w = torch.clamp(torch.round(w / scales) + zeros, 0, max_int)
    assert w.shape == grouped_shape

    # Dequantize W (pseudo quantization, the inverse transformation of Formula 3)
    w = (w - zeros) * scales
    assert w.shape == grouped_shape

    assert torch.isnan(w).sum() == 0

    return w.reshape(org_w_shape)

@torch.no_grad()
def pseudo_quantize_model_weight(
    model, w_bit, q_group_size,
):
    """Replace the weight of every nn.Linear in ``model`` by its pseudo-quantized version.

    The model is modified in place; nothing is returned.
    """
    linear_layers = [
        module for _, module in model.named_modules() if isinstance(module, nn.Linear)
    ]
    for layer in linear_layers:
        layer.weight.data = pseudo_quantize_tensor(
            layer.weight.data, n_bit=w_bit, q_group_size=q_group_size
        )

In [1]:
@torch.no_grad()
def get_calib_feat(model, tokenizer, dataset):
    """Record per-channel input activation magnitudes of every nn.Linear.

    A forward hook is attached to each nn.Linear in ``model``; the model is then
    run on every batch of ``dataset`` and the hooks are removed again.

    Args:
        model: module called as ``model(input_ids)``.
        tokenizer: not used; kept so existing callers keep working.
        dataset: iterable of batches, each a mapping with an ``'input_ids'``
            entry. The entry is either a tensor, which is used as is, or a
            sequence of per-position tensors, which is stacked along dim 1.

    Returns:
        Dict mapping module name -> list with one CPU tensor per forward call,
        holding the mean absolute input value of each input channel.

    Raises:
        ValueError: if a batch's ``'input_ids'`` cannot be stacked into a tensor.
    """
    # `partial` is not imported at file level, so import it locally
    from functools import partial

    input_dict = dict()

    def stat_input_max_hook(m, x, y, name):
        if isinstance(x, tuple):
            x = x[0]
        # Flatten all leading dims; reshape also copes with non-contiguous inputs
        channel_mean = x.reshape(-1, x.shape[-1]).abs().mean(dim=0).cpu().detach()
        input_dict.setdefault(name, []).append(channel_mean)

    hooks = [
        m.register_forward_hook(partial(stat_input_max_hook, name=name))
        for name, m in model.named_modules()
        if isinstance(m, nn.Linear)
    ]

    print("Collecting activation scales...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    try:
        for batch in tqdm.tqdm(dataset):
            input_ids = batch['input_ids']
            if not isinstance(input_ids, torch.Tensor):
                try:
                    input_ids = torch.stack(list(input_ids), dim=1)
                except (TypeError, RuntimeError) as err:
                    raise ValueError(
                        f"could not stack batch['input_ids'] into a tensor: {input_ids!r}"
                    ) from err

            model(input_ids.to(device, non_blocking=True))
    finally:
        # Always detach the hooks, even when a forward pass fails
        for hook in hooks:
            hook.remove()
    return input_dict

NameError: name 'torch' is not defined

In [None]:
@torch.no_grad()
def pseudo_quantize_model_salient_weight_fp16(
    model, w_bit, q_group_size, input_feat
):
    """Pseudo-quantize every nn.Linear while leaving its most important input channels untouched.

    For each linear layer the statistics in ``input_feat[name]`` are summed to
    rank input channels. The top ``in_features // 100`` channels keep their
    original weights; all other weights are pseudo-quantized. The model is
    modified in place.
    """
    for layer_name, layer in model.named_modules():
        if not isinstance(layer, nn.Linear):
            continue

        channel_importance = sum(input_feat[layer_name]).float()
        n_salient = int(layer.weight.data.shape[1]//100)
        salient_channels = torch.topk(channel_importance, n_salient).indices
        assert salient_channels.dim() == 1

        # Save the salient columns before quantizing, then put them back
        protected_columns = layer.weight.data[:, salient_channels].clone()
        layer.weight.data = pseudo_quantize_tensor(
            layer.weight.data, n_bit=w_bit, q_group_size=q_group_size
        )
        layer.weight.data[:, salient_channels] = protected_columns

In [None]:
# Overwrites the weights of linhead's linear layers in place (3-bit, groups of 128),
# so this cell must run after the unquantized reference `out` has been computed.
pseudo_quantize_model_weight(linhead, w_bit=3, q_group_size=128)

# Evaluate the model
out_quant_pure = linhead(data)
# Total absolute deviation from the unquantized output
print((out_quant_pure - out).abs().sum())

KeyboardInterrupt: 

In [None]:
# Mean absolute deviation per output element
print(((out_quant_pure - out).abs().sum())/(out_quant_pure.numel()))

In [None]:
# Mean of the unquantized output over all elements
out.mean(axis=None)

# Load Data for Tasks

There are two datasets we will consider in this notebook.

1. Penn Treebank -- this is a next-token prediction task used by early variants of GPT.

2. IMDB database -- this is a binary text classification task, sorting movie reviews into positive/negative sentiment.

## Penn Treebank

We will consider two variants with Penn Treebank. We can either feed each sentence as an independent input, or we can replace end of sentences with an EOS token, and divide the input into strings of fixed length.

We will build data pipelines for both functionalities here, and experiment with them as if they were separate tasks. In some ways, the Penn Treebank task may have a part (a) and a part (b).

The components of this module are as follows:
1. Vocab: maps tokens to ints and ints to tokens. Converts unseen tokens into `<unk>`.
2. Tokenizer: uses the vocab object to take raw text files and convert them into token streams.
3. Dataloaders: load train/val/test data for torch models. Training and evaluation build contexts differently: the training split is cut into non-overlapping chunks, whereas val and test use a sliding window with stride 1.

In [None]:
!unzip /content/ptb-copy-paste.zip

Archive:  /content/ptb-copy-paste.zip
   creating: ptb-copy-paste/
  inflating: ptb-copy-paste/train.txt  
  inflating: __MACOSX/ptb-copy-paste/._train.txt  
  inflating: ptb-copy-paste/test.txt  
  inflating: __MACOSX/ptb-copy-paste/._test.txt  
  inflating: ptb-copy-paste/val.txt  
  inflating: __MACOSX/ptb-copy-paste/._val.txt  


In [None]:
# Files scanned when building the vocabulary (train and val; test.txt is not included)
read_dirs = ['/content/ptb-copy-paste/train.txt', '/content/ptb-copy-paste/val.txt']

### Vocab

In [None]:
class Vocab:
    """
    Simple word-level vocabulary.

    - Built from whitespace-tokenized text files.
    - Provides stoi/itos mappings; ids are assigned in first-seen order.
    - Special tokens receive the last ids, in the order given by ``specials``.
    - OOV tokens map to <unk>.

    Args:
        min_freq: stored as ``self.min_freq``; NOT applied by ``build``.
        max_size: stored as ``self.max_size``; NOT applied by ``build``.
        specials: list of reserved tokens; falls back to ["<unk>", "<eos>"]
            when None or empty. The first entry is the unknown token.
    """
    def __init__(self, min_freq, max_size, specials):
        self.min_freq = int(min_freq)
        self.max_size = max_size if (max_size is None) else int(max_size)
        self.specials = specials or ["<unk>", "<eos>"]
        # mappings
        self.stoi = {}  # Dict[str, int]
        self.itos = []  # List[str]
        self._frozen = False

    @property
    def unk_token(self) -> str:
        return self.specials[0]

    @property
    def eos_token(self) -> str:
        # second special if present, otherwise the literal "<eos>"
        return self.specials[1] if len(self.specials) > 1 else "<eos>"

    def file_tokens(self, file_list):
        """Yield one token list per line of every file, each ending with '<eos>'."""
        for path in file_list:
            # separate name for the handle so the path variable is not shadowed
            with open(path, "r", encoding="utf-8") as handle:
                for raw in handle:
                    s = raw.split()
                    s.append('<eos>')
                    yield s

    def build(self, file_list=None):
        """
        Build the vocabulary from text files and freeze it.

        Args:
            file_list: paths of the files to read. When None, the module-level
                ``read_dirs`` list is used (looked up at call time).

        Raises:
            RuntimeError: if the vocabulary has already been built.
        """
        if self._frozen:
            raise RuntimeError("Vocab is frozen; cannot rebuild.")

        if file_list is None:
            file_list = read_dirs

        reserved = set(self.specials)
        for token_stream in self.file_tokens(file_list=file_list):
            for token in token_stream:
                # specials found in the text are skipped here and added below
                if token not in self.stoi and token not in reserved:
                    self.stoi[token] = len(self.itos)
                    self.itos.append(token)

        for special in self.specials:
            if special not in self.stoi:
                self.stoi[special] = len(self.itos)
                self.itos.append(special)

        self._frozen = True

    def __len__(self):
        return len(self.itos)

    def to_id(self, tok: str):
        """
        Map token -> id; unknowns -> <unk>.
        """
        if tok in self.stoi:
            return self.stoi[tok]
        return self.stoi[self.unk_token]

    def to_token(self, idx: int):
        """Map id -> token."""
        return self.itos[idx]

    def ids(self, tokens):
        """Map an iterable of tokens to a list of ids."""
        return [self.to_id(t) for t in tokens]

In [None]:
# specials=None selects the default specials ["<unk>", "<eos>"]
v = Vocab(min_freq=0, max_size=None, specials=None)
# bare expression in the middle of a cell: evaluated but not displayed
v.specials
# with no argument, build() reads the files listed in read_dirs
v.build()

'preliminary'

In [None]:
# Save Vocab object
import pickle
with open('/content/ptb-vocab-word.pkl', 'wb') as f:
  pickle.dump(v, f)

In [None]:
# Test load
import pickle
with open('/content/ptb-vocab-word.pkl', 'rb') as f:
  v2 = pickle.load(f)
  print(v2.specials)
  print(len(v2))
  print(v2.ids(['the', 'fat', 'rat', 'was', 'eaten', 'by', 'the', 'cat', 'quickly']))

['<unk>', '<eos>']
10000
[30, 1795, 9998, 52, 9998, 227, 30, 3108, 1843]


In [None]:
len(v2)

10000

### Tokenizer



In [None]:
# use v2.ids() for tokenization

### Preprocessing

In [None]:
# Directory holding the raw train/val/test text files
ptb_root_dir = '/content/ptb-copy-paste/'
# Directory the tokenized DatasetDict is saved to
ptb_write_dir = '/content/ptb-copy-paste-tokenized/'

In [None]:
def load_ptb_help(filename):
  """Tokenize one text file into a flat int64 array of vocabulary ids.

  Each line is split on whitespace and terminated with '<eos>'; tokens are
  mapped to ids with the module-level vocabulary ``v2``.
  """
  # Collect ids in a list and convert once: concatenating arrays per line
  # re-copies the whole corpus on every line.
  token_ids = []
  # utf-8 matches the encoding used when the vocabulary was built
  with open(filename, 'r', encoding='utf-8') as f:
    for raw in f:
      s = raw.split()
      s.append('<eos>')
      token_ids.extend(v2.ids(s))

  return np.array(token_ids, dtype=np.int64)

def load_ptb_train():
  """Token ids of train.txt."""
  return load_ptb_help(ptb_root_dir + 'train.txt')

def load_ptb_val():
  """Token ids of val.txt."""
  return load_ptb_help(ptb_root_dir + 'val.txt')

def load_ptb_test():
  """Token ids of test.txt."""
  return load_ptb_help(ptb_root_dir + 'test.txt')

def load_ptb_all():
  """Token ids of all three splits, keyed 'train', 'val' and 'test'."""
  return {'train': load_ptb_train(), 'val': load_ptb_val(), 'test': load_ptb_test()}

In [None]:
# Tokenize all three splits and report the number of tokens in each
all_ptb_tokens = load_ptb_all()
for k in all_ptb_tokens:
  print(k, len(all_ptb_tokens[k]))

train 929589
val 73760
test 82430


### Dataloader

We need to convert a large corpus of tokens into datasets, and then collect those datasets into a dataset dict.

Thus, the functionalities we need to build are as follows:
1. Convert corpus into dictionary of fields
2. Convert fields into `Dataset`
3. Aggregate `Dataset` objects into `DatasetDict`.

In [None]:
def corpus_to_fields(corpus, seq_len, stride=None):
  """Cut a token stream into fixed-length inputs with next-token labels.

  Args:
    corpus: 1-D sequence of token ids (anything that supports ``len`` and slicing).
    seq_len: number of tokens per example.
    stride: distance between the starts of consecutive examples; defaults to
      ``seq_len``, i.e. non-overlapping examples.

  Returns:
    Dict with arrays 'input_ids', 'labels' (the inputs shifted by one token)
    and 'attention_mask' (all ones).
  """
  if stride is None:
    stride = seq_len

  # Trim the tail so that len(corpus) == k * seq_len + 1: the extra token is the
  # label of the last input position. Taking the remainder of (len - 1) also
  # covers lengths that are an exact multiple of seq_len, which would otherwise
  # be cut down to a single token.
  n_tokens = len(corpus)
  usable = n_tokens - (n_tokens - 1) % seq_len if n_tokens else 0
  corpus = corpus[:usable]

  start_indices = np.arange(0, len(corpus) - seq_len, stride)

  input_ids = np.array([corpus[i:i+seq_len] for i in start_indices])
  labels = np.array([corpus[i+1:i+1+seq_len] for i in start_indices])

  attention_mask = np.ones_like(input_ids)

  return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask}

In [None]:
# Replace each split's token array by a Dataset of 32-token examples.
# train: stride 32 (non-overlapping examples); val/test: stride 1 (sliding window).
# NOTE(review): the recorded output below shows `val` with 0 rows -- confirm that
# corpus_to_fields handles a corpus whose length is a multiple of seq_len.
for k in all_ptb_tokens:
  stride = 32 if k == 'train' else 1
  all_ptb_tokens[k] = Dataset.from_dict(corpus_to_fields(all_ptb_tokens[k], seq_len=32, stride=stride))

all_ptb_tokens = DatasetDict(all_ptb_tokens)

In [None]:
# Persist the tokenized splits under ptb_write_dir and print a summary
all_ptb_tokens.save_to_disk(ptb_write_dir)
print(f"Saved packed dataset to: {ptb_write_dir}")
print(all_ptb_tokens)

Saving the dataset (0/1 shards):   0%|          | 0/29049 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards): 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/82369 [00:00<?, ? examples/s]

Saved packed dataset to: /content/ptb-copy-paste-tokenized/
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 29049
    })
    val: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 0
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 82369
    })
})


In [None]:
!zip -r /content/ptb_copy_paste_tokenized.zip /content/ptb-copy-paste-tokenized

updating: content/ptb-copy-paste-tokenized/ (stored 0%)
updating: content/ptb-copy-paste-tokenized/dataset_dict.json (deflated 3%)
updating: content/ptb-copy-paste-tokenized/test/ (stored 0%)
updating: content/ptb-copy-paste-tokenized/test/dataset_info.json (deflated 69%)
updating: content/ptb-copy-paste-tokenized/test/state.json (deflated 38%)
updating: content/ptb-copy-paste-tokenized/test/data-00001-of-00002.arrow (deflated 99%)
updating: content/ptb-copy-paste-tokenized/test/data-00000-of-00002.arrow (deflated 99%)
updating: content/ptb-copy-paste-tokenized/val/ (stored 0%)
updating: content/ptb-copy-paste-tokenized/val/dataset_info.json (deflated 69%)
updating: content/ptb-copy-paste-tokenized/val/data-00000-of-00001.arrow (deflated 99%)
updating: content/ptb-copy-paste-tokenized/val/state.json (deflated 39%)
updating: content/ptb-copy-paste-tokenized/train/ (stored 0%)
updating: content/ptb-copy-paste-tokenized/train/dataset_info.json (deflated 69%)
updating: content/ptb-copy-pas

In [None]:
!unzip /content/ptb_copy_paste_tokenized.zip
!rm -rf /content/ptb-copy-paste-tokenized
!mv -f /content/content/ptb-copy-paste-tokenized/ /content
!rm -rf /content/content

!ls

Archive:  /content/ptb_copy_paste_tokenized.zip
   creating: content/ptb-copy-paste-tokenized/
  inflating: content/ptb-copy-paste-tokenized/dataset_dict.json  
   creating: content/ptb-copy-paste-tokenized/test/
  inflating: content/ptb-copy-paste-tokenized/test/dataset_info.json  
  inflating: content/ptb-copy-paste-tokenized/test/state.json  
  inflating: content/ptb-copy-paste-tokenized/test/data-00001-of-00002.arrow  
  inflating: content/ptb-copy-paste-tokenized/test/data-00000-of-00002.arrow  
   creating: content/ptb-copy-paste-tokenized/val/
  inflating: content/ptb-copy-paste-tokenized/val/dataset_info.json  
  inflating: content/ptb-copy-paste-tokenized/val/data-00000-of-00001.arrow  
  inflating: content/ptb-copy-paste-tokenized/val/state.json  
   creating: content/ptb-copy-paste-tokenized/train/
  inflating: content/ptb-copy-paste-tokenized/train/dataset_info.json  
  inflating: content/ptb-copy-paste-tokenized/train/data-00000-of-00001.arrow  
  inflating: content/ptb-co

In [None]:
# Reload the tokenized splits from disk, replacing the in-memory copy
all_ptb_tokens = DatasetDict.load_from_disk(ptb_write_dir)

In [None]:
all_ptb_tokens

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 29049
    })
    val: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 0
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 82369
    })
})

In [None]:
# Note: rebinds the notebook-level `batch_size` (it was 64 in the model smoke test)
batch_size = 128
ptb_train_loader = DataLoader(all_ptb_tokens["train"], batch_size=batch_size, shuffle=True)
# The val loader is disabled; the DatasetDict printed above reports 0 rows for `val`.
# ptb_val_loader = DataLoader(all_ptb_tokens["val"], batch_size=batch_size, shuffle=True)
ptb_test_loader = DataLoader(all_ptb_tokens["test"], batch_size=batch_size, shuffle=True)

## IMDB Reviews

In [None]:
!unzip imdb-dataset-classic.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: imdb_dataset_classic /train/pos/9260_7.txt  
  inflating: __MACOSX/imdb_dataset_classic /train/pos/._9260_7.txt  
  inflating: imdb_dataset_classic /train/pos/1599_7.txt  
  inflating: __MACOSX/imdb_dataset_classic /train/pos/._1599_7.txt  
  inflating: imdb_dataset_classic /train/pos/2174_8.txt  
  inflating: __MACOSX/imdb_dataset_classic /train/pos/._2174_8.txt  
  inflating: imdb_dataset_classic /train/pos/2309_9.txt  
  inflating: __MACOSX/imdb_dataset_classic /train/pos/._2309_9.txt  
  inflating: imdb_dataset_classic /train/pos/12034_10.txt  
  inflating: __MACOSX/imdb_dataset_classic /train/pos/._12034_10.txt  
  inflating: imdb_dataset_classic /train/pos/11703_9.txt  
  inflating: __MACOSX/imdb_dataset_classic /train/pos/._11703_9.txt  
  inflating: imdb_dataset_classic /train/pos/5619_9.txt  
  inflating: __MACOSX/imdb_dataset_classic /train/pos/._5619_9.txt  
  inflating: imdb_dataset_classic /train

### Tokenizer

For the IMDB review dataset, we will be tokenizing by using the BERT tokenizer.

In [None]:
# Load the pretrained 'bert-base-cased' tokenizer and try it on one sentence
imdb_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
sample_sentence = "Like an alien, I am sent here to destroy you. And there's a million others just like me, who walk, talk, and act like me."
vv = imdb_tokenizer(sample_sentence)
vv

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'input_ids': [101, 2409, 1126, 8143, 117, 146, 1821, 1850, 1303, 1106, 5535, 1128, 119, 1262, 1175, 112, 188, 170, 1550, 1639, 1198, 1176, 1143, 117, 1150, 2647, 117, 2037, 117, 1105, 2496, 1176, 1143, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
imdb_tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [None]:
len(imdb_tokenizer)

28996

In [None]:
# Check for longest review in train/neg, train/pos, test/neg, and test/pos
# Lengths are measured in characters of the raw text (len of the file contents), not in tokens.

imdb_root_dir = '/content/imdb_dataset_classic/'
os.listdir(imdb_root_dir)
dirs_to_consider = ['train/neg', 'train/pos', 'test/neg', 'test/pos']
for d in dirs_to_consider:
  max_len = 0
  for f in os.listdir(imdb_root_dir + d):
    # `f` is rebound from the file name to the open file handle inside the with-block
    with open(imdb_root_dir + d + '/' + f, 'r') as f:
      max_len = max(max_len, len(f.read()))
  print(d, max_len)

train/neg 8969
train/pos 13704
test/neg 6385
test/pos 12988


In [None]:
dirs_to_consider

['train/neg', 'train/pos', 'test/neg', 'test/pos']

### Preprocessing

Here, we will use the tokenizer we created to tokenize all the data in `imdb_dataset_classic`. We will then store the tokenized copies as PyArrow datasets, for future use in training.

In [None]:
# Directory holding the raw review files
imdb_root_dir = '/content/imdb_dataset_classic/'
# Directory the tokenized DatasetDict is saved to
imdb_write_dir = '/content/imdb_dataset_classic_tokenized/'
# Sub-directories to read; load_files labels directories ending in 'neg' as 0, others as 1
dirs_to_consider = ['train/neg', 'train/pos', 'test/neg', 'test/pos']

We load the PyArrow datasets, which consist of tokenized input streams and labels. We attach these datasets to dataloaders, which will be used for training and evaluation loops later on.

In [None]:
# load files
def load_files(split='all'):
  """Yield {'text', 'label'} records for the review files of the requested split.

  ``split`` is 'all' or a prefix of the entries in ``dirs_to_consider``
  (e.g. 'train'). Directories that do not match are printed and skipped.
  Reviews from directories ending in 'neg' get label 0, all others label 1.
  """
  for subdir in dirs_to_consider:
    selected = split == 'all' or subdir.startswith(split)
    if not selected:
      print(subdir)
      continue

    cur_directory = imdb_root_dir + subdir
    label = 0 if cur_directory.endswith('neg') else 1

    for review_path in Path(cur_directory).glob('*.txt'):
      yield {
          'text': Path(review_path).read_text(encoding="utf-8", errors="ignore"),
          'label': label,
      }

def load_files_train():
  """Records of the 'train' directories only."""
  return load_files(split='train')

def load_files_test():
  """Records of the 'test' directories only."""
  return load_files(split='test')


# Materialize the generators as datasets: all reviews, train only, test only
raw_dataset = Dataset.from_generator(load_files)
train_dataset = Dataset.from_generator(load_files_train)
test_dataset = Dataset.from_generator(load_files_test)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

test/neg
test/pos


Generating train split: 0 examples [00:00, ? examples/s]

train/neg
train/pos


In [None]:
raw_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})

In [None]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [None]:
test_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [None]:
test_dataset[0]

{'text': 'I carefully checked if there\'s another movie named as this one, and there isn\'t ! But I really don\'t think we all saw the same movie ! There\'s no way ! How can you vote more than "1" for this movie ?! The idea of this movie let\'s say it\'s acceptable. Oh, and the acting of Dan Gordon (Chris) is quite good. But those are the only two things acceptable in this project. The others are... awful ? It\'s a very delicate word to describe the acting of the other actors, the directing, the (so said) "special" effects, even the way that the crew was filming ! I don\'t even like the way that the camera operators were moving to record the scenes ! This may be the most miserable film I\'ve ever seen. I really don\'t remember a movie lower than this one... Maybe there is, but... I don\'t think so... Ehh, what\'s done, it\'s done... That\'s the movie and there\'s too late for anyone to change anything. I\'ve voted "1", but my realistic vote starts with a "-" (minus) in front....',
 'la

In [None]:
# Bundle the train and test datasets under their split names
imdb_ds_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

In [None]:
def imdb_tokenize(batch):
  """Tokenize the 'text' field of a batch, truncating and padding to 512 tokens."""
  return imdb_tokenizer(batch["text"], truncation=True, padding='max_length', max_length=512)

# batched=True passes groups of examples to imdb_tokenize instead of single rows
imdb_ds_dict_tokenized = imdb_ds_dict.map(imdb_tokenize, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
imdb_ds_dict_tokenized.keys()

dict_keys(['train', 'test'])

In [None]:
imdb_ds_dict_tokenized["train"].features

{'text': Value('string'),
 'label': Value('int64'),
 'input_ids': List(Value('int32')),
 'token_type_ids': List(Value('int8')),
 'attention_mask': List(Value('int8'))}

In [None]:
# Persist the tokenized splits under imdb_write_dir and print a summary
imdb_ds_dict_tokenized.save_to_disk(imdb_write_dir)
print(f"Saved packed dataset to: {imdb_write_dir}")
print(imdb_ds_dict_tokenized)

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Saved packed dataset to: /content/imdb_dataset_classic_tokenized/
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
})


In [None]:
# Trigger a browser download from the Colab runtime.
# NOTE(review): this path is the dataset *directory* written by save_to_disk;
# the .zip archive is only created by the zip cell that follows — confirm
# whether the archive path should be downloaded instead.
from google.colab import files
files.download('/content/imdb_dataset_classic_tokenized')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Recursively archive the saved dataset directory into a single .zip file.
!zip -r /content/imdb_dataset_classic_tokenized.zip /content/imdb_dataset_classic_tokenized

  adding: content/imdb_dataset_classic_tokenized/ (stored 0%)
  adding: content/imdb_dataset_classic_tokenized/test/ (stored 0%)
  adding: content/imdb_dataset_classic_tokenized/test/dataset_info.json (deflated 69%)
  adding: content/imdb_dataset_classic_tokenized/test/state.json (deflated 38%)
  adding: content/imdb_dataset_classic_tokenized/test/data-00000-of-00001.arrow (deflated 77%)
  adding: content/imdb_dataset_classic_tokenized/train/ (stored 0%)
  adding: content/imdb_dataset_classic_tokenized/train/dataset_info.json (deflated 69%)
  adding: content/imdb_dataset_classic_tokenized/train/state.json (deflated 37%)
  adding: content/imdb_dataset_classic_tokenized/train/data-00000-of-00001.arrow (deflated 77%)
  adding: content/imdb_dataset_classic_tokenized/dataset_dict.json (stored 0%)


In [None]:
# Restore the dataset from the archive. The zip stores entries under a leading
# content/ prefix, so extraction produces a nested content/ directory
# (presumably /content/content when run from /content — confirm the working
# directory). The existing dataset directory is removed, the extracted copy is
# moved into its place, and the leftover nested directory is deleted.
!unzip /content/imdb_dataset_classic_tokenized.zip
!rm -rf /content/imdb_dataset_classic_tokenized
!mv -f /content/content/imdb_dataset_classic_tokenized/ /content
!rm -rf /content/content

!ls

Archive:  /content/imdb_dataset_classic_tokenized.zip
   creating: content/imdb_dataset_classic_tokenized/
   creating: content/imdb_dataset_classic_tokenized/test/
  inflating: content/imdb_dataset_classic_tokenized/test/dataset_info.json  
  inflating: content/imdb_dataset_classic_tokenized/test/state.json  
  inflating: content/imdb_dataset_classic_tokenized/test/data-00000-of-00001.arrow  
   creating: content/imdb_dataset_classic_tokenized/train/
  inflating: content/imdb_dataset_classic_tokenized/train/dataset_info.json  
  inflating: content/imdb_dataset_classic_tokenized/train/state.json  
  inflating: content/imdb_dataset_classic_tokenized/train/data-00000-of-00001.arrow  
 extracting: content/imdb_dataset_classic_tokenized/dataset_dict.json  
imdb_dataset_classic_tokenized	imdb_dataset_classic_tokenized.zip  sample_data


In [None]:
# Reload the tokenized splits from imdb_write_dir, rebinding
# imdb_ds_dict_tokenized to the on-disk copy.
imdb_ds_dict_tokenized = DatasetDict.load_from_disk(imdb_write_dir)

### Dataloaders

In [None]:
batch_size = 64

# One shuffled DataLoader per split; both splits use the same batch size.
imdb_train_loader, imdb_test_loader = (
  DataLoader(imdb_ds_dict_tokenized[split_name], batch_size=batch_size, shuffle=True)
  for split_name in ("train", "test")
)

# Metrics

We need different metrics for each of the tasks.

1. Penn Treebank -- this is next-token prediction. Therefore, we need the multiclass (N = vocab size) cross-entropy (log-loss) over predicted tokens for training, and perplexity for evaluation.

2. IMDB -- this is binary classification. Therefore, we need binary log-loss for training, as well as accuracy, F1 score, precision, and recall at different thresholds for evaluation. AUC functionality would be ideal here too.

## Penn Tree Bank

In [None]:
class PennTreeBankMetrics:
  """Static metric helpers for next-token prediction.

  Consists entirely of static methods, so it is never instantiated.
  """

  @staticmethod
  def loss(y_pred, y_true):
    """Return the mean cross-entropy over all token positions.

    Args:
      y_pred: logits of shape B x L x V.
      y_true: target token ids of shape B x L.

    Returns:
      A scalar tensor (the default mean reduction of F.cross_entropy).
    """
    # reshape (rather than view) also accepts non-contiguous inputs such as
    # shifted slices like logits[:, :-1], on which view raises a RuntimeError.
    flat_logits = y_pred.reshape(-1, y_pred.shape[-1])
    flat_targets = y_true.reshape(-1)

    return F.cross_entropy(flat_logits, flat_targets)

  @staticmethod
  def perplexity(y_pred, y_true):
    """Return exp(cross-entropy) for the same inputs accepted by `loss`."""
    return torch.exp(PennTreeBankMetrics.loss(y_pred, y_true))

In [None]:
# Scratch cell: sample a random float tensor of shape (3, 5)
# (presumably stand-in logits for trying out the metrics — confirm).
torch.randn(3, 5)

tensor([[ 0.2054, -1.5499, -0.4270, -0.4302,  0.3877],
        [ 1.2357,  0.3781,  0.6738, -1.7964, -0.0053],
        [ 1.8661, -0.3323, -0.9979,  1.0900, -0.2780]])

In [None]:
# Scratch cell: sample three int64 values from [0, 5)
# (presumably stand-in class targets for trying out the metrics — confirm).
torch.randint(5, (3,), dtype=torch.int64)

tensor([0, 1, 4])

## IMDB

In [None]:
class IMDBMetrics:
  """Static metric helpers for binary classification.

  Every method takes `y_pred` as B x 2 logits (the same tensor that is fed to
  cross-entropy) and `y_true` as B integer labels in {0, 1}. The
  threshold-swept metrics return one value per entry of `THRESHOLDS`.
  Consists entirely of static methods, so it is never instantiated.
  """

  # Decision thresholds applied to the predicted probability of label 1.
  THRESHOLDS = np.arange(0, 1, 0.05)

  @staticmethod
  def loss(y_pred, y_true):
    """Return the mean cross-entropy; the B x 2 logits are passed as-is."""
    return F.cross_entropy(y_pred, y_true)

  @staticmethod
  def accuracy(y_pred, y_true):
    """Return the fraction of rows whose arg-max class equals the label."""
    return (y_pred.argmax(dim=1) == y_true).float().mean()

  @staticmethod
  def _confusion_counts(y_pred, y_true):
    """Return (tp, fp, fn) as float arrays with one entry per threshold."""
    # Thresholds live in [0, 1), so compare them against softmax
    # probabilities rather than against unbounded logits.
    pos_prob = F.softmax(y_pred.detach().float(), dim=1)[:, 1]
    is_pos = y_true == 1
    is_neg = y_true == 0

    tp, fp, fn = [], [], []
    for t in IMDBMetrics.THRESHOLDS:
      flagged = pos_prob > float(t)
      # .item() yields plain Python numbers, so this also works when the
      # tensors live on a GPU.
      tp.append((flagged & is_pos).sum().item())
      fp.append((flagged & is_neg).sum().item())
      # Complement of `flagged`: a probability exactly equal to the
      # threshold counts as a negative prediction instead of being dropped.
      fn.append((~flagged & is_pos).sum().item())
    return (np.array(tp, dtype=float),
            np.array(fp, dtype=float),
            np.array(fn, dtype=float))

  @staticmethod
  def _safe_divide(numerator, denominator):
    """Element-wise division that yields 0.0 where the denominator is 0."""
    result = np.zeros_like(numerator, dtype=float)
    np.divide(numerator, denominator, out=result, where=denominator != 0)
    return result

  @staticmethod
  def precision(y_pred, y_true):
    """Return TP / (TP + FP) per threshold (0.0 when nothing is flagged)."""
    tp, fp, _ = IMDBMetrics._confusion_counts(y_pred, y_true)
    return IMDBMetrics._safe_divide(tp, tp + fp)

  @staticmethod
  def recall(y_pred, y_true):
    """Return TP / (TP + FN) per threshold (0.0 when there are no positives)."""
    tp, _, fn = IMDBMetrics._confusion_counts(y_pred, y_true)
    return IMDBMetrics._safe_divide(tp, tp + fn)

  @staticmethod
  def f1(y_pred, y_true):
    """Return the harmonic mean of precision and recall per threshold.

    Entries where precision + recall is 0 are reported as 0.0.
    """
    ps = IMDBMetrics.precision(y_pred, y_true)
    rs = IMDBMetrics.recall(y_pred, y_true)
    return IMDBMetrics._safe_divide(2 * ps * rs, ps + rs)

  @staticmethod
  def auc(y_pred, y_true):
    """Return the area under the precision-recall curve.

    The curve is sampled at `THRESHOLDS`, anchored at (recall=0, precision=1),
    and integrated with the trapezoidal rule over ascending recall, so the
    result is non-negative.
    """
    tp, fp, fn = IMDBMetrics._confusion_counts(y_pred, y_true)
    # Thresholds that flag nothing have undefined precision; they are
    # represented by the (0, 1) anchor instead of a spurious precision of 0.
    has_predictions = (tp + fp) > 0
    ps = IMDBMetrics._safe_divide(tp, tp + fp)[has_predictions]
    rs = IMDBMetrics._safe_divide(tp, tp + fn)[has_predictions]

    # Recall never increases as the threshold rises, so reversing the sweep
    # gives an ascending x-axis.
    recall_axis = np.concatenate(([0.0], rs[::-1]))
    precision_axis = np.concatenate(([1.0], ps[::-1]))
    widths = np.diff(recall_axis)
    heights = (precision_axis[1:] + precision_axis[:-1]) / 2.0
    return np.sum(widths * heights)

# Train + Val Loop

## Penn Treebank

In [None]:
# Build the next-token model with 64-dim features, chunk size 16 and sequence
# length 32; the vocabulary size is taken from v2. The trailing bare `model`
# makes the notebook display the module tree.
model = CLAWrapperNextToken(in_features=64, out_features=64, chunk_size=16, vocab_size=len(v2), seq_len=32)
model

  M = torch.tensor(torch.tril(torch.ones(chunk_size, chunk_size)))


CLAWrapperNextToken(
  (embed): Embedding(10000, 64)
  (head): ChunkwiseLinearAttentionHead(
    (Q_linear): Linear(in_features=64, out_features=64, bias=False)
    (K_linear): Linear(in_features=64, out_features=64, bias=False)
    (V_linear): Linear(in_features=64, out_features=64, bias=False)
  )
  (ffn): Sequential(
    (0): Linear(in_features=64, out_features=256, bias=True)
    (1): GELU(approximate='none')
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=256, out_features=64, bias=True)
    (4): Dropout(p=0.1, inplace=False)
  )
  (fc): Linear(in_features=64, out_features=10000, bias=True)
)

In [None]:
# NOTE: rebinds `model` to a CLAWrapperNextToken2 with 32-dim features, so any
# later cell that uses `model` sees this instance if this cell ran last.
model = CLAWrapperNextToken2(in_features=32, out_features=32, chunk_size=16, vocab_size=len(v2), seq_len=32)

  M = torch.tensor(torch.tril(torch.ones(chunk_size, chunk_size)))


In [None]:
epochs = 500
lr = 5e-4
betas = (0.9, 0.999)
weight_decay = 0.01
log_every = 100   # training steps between running-loss printouts
eval_every = 5    # epochs between evaluations on the test loader
data_fields = ['input_ids', 'labels']

# Single optimizer: AdamW with decoupled weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)


def _prepare_batch(batch, fields, device):
  """Stack each requested field along dim 1 and move it to `device`.

  Assumes the loader yields every field as a sequence of equally sized
  tensors (one per token position) — TODO confirm against the dataset format.

  Raises:
    ValueError: if a field cannot be stacked (e.g. ragged or non-tensor data).
  """
  prepared = {}
  for k in fields:
    try:
      stacked = torch.stack(batch[k], dim=1)
    except (TypeError, RuntimeError) as err:
      raise ValueError(f"Could not stack batch field {k!r}: {batch[k]}") from err
    # torch.stack already returns a tensor; re-wrapping it in torch.tensor()
    # would copy it and emit a UserWarning.
    prepared[k] = stacked.to(device, non_blocking=True)
  return prepared


for epoch in tqdm.tqdm(range(epochs)):
  # Evaluation below switches to eval mode, so re-enable train mode each epoch.
  model.train()
  total_loss = 0.0
  # Wrapping the loader (not enumerate) lets tqdm know the total; leave=False
  # keeps finished inner bars from piling up in the cell output.
  for step, batch in enumerate(tqdm.tqdm(ptb_train_loader, leave=False)):
    batch = _prepare_batch(batch, data_fields, device)

    y_pred = model(batch['input_ids'])
    loss = PennTreeBankMetrics.loss(y_pred, batch['labels'])

    total_loss += loss.item()

    # Clear gradients before backward so stale gradients left behind by an
    # interrupted run cannot leak into this step.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Periodic loss updates on training (running mean over the epoch so far)
    if (step + 1) % log_every == 0:
      avg_train_loss = total_loss / (step + 1)
      print(f"Epoch: {epoch + 1}, Step: {step + 1}, Loss: {avg_train_loss}")

  if (epoch + 1) % eval_every != 0:
    continue

  model.eval()
  with torch.no_grad():
    losses = []
    for batch in tqdm.tqdm(ptb_test_loader, leave=False):
      batch = _prepare_batch(batch, data_fields, device)
      y_pred = model(batch['input_ids'])

      loss = PennTreeBankMetrics.loss(y_pred, batch['labels'])
      losses.append(loss.item())

    avg_test_loss = np.mean(losses)
    # Perplexity is exp(mean loss); averaging per-batch exp(loss) values
    # overstates it. NOTE: the mean over batches equals the per-token mean
    # only when every batch holds the same number of tokens.
    avg_test_ppl = np.exp(avg_test_loss)
    print()
    print(f"Epoch: {epoch + 1}, Test Loss: {avg_test_loss}, Test Perplexity: {avg_test_ppl}")
    print()

  0%|          | 0/500 [00:00<?, ?it/s]
  batch = {k: torch.tensor(batch[k]).to(device, non_blocking=True) for k in data_fields}

5it [00:00, 46.15it/s][A
10it [00:00, 47.35it/s][A
15it [00:00, 47.38it/s][A
20it [00:00, 47.69it/s][A
25it [00:00, 47.91it/s][A
30it [00:00, 48.05it/s][A
35it [00:00, 48.11it/s][A
40it [00:00, 48.08it/s][A
45it [00:00, 48.25it/s][A
50it [00:01, 48.43it/s][A
55it [00:01, 48.58it/s][A
60it [00:01, 48.65it/s][A
65it [00:01, 48.63it/s][A
70it [00:01, 48.62it/s][A
75it [00:01, 48.48it/s][A
80it [00:01, 48.42it/s][A
85it [00:01, 48.36it/s][A
90it [00:01, 48.43it/s][A
95it [00:01, 48.49it/s][A
100it [00:02, 48.53it/s][A
105it [00:02, 48.64it/s][A

Epoch: 1, Step: 100, Loss: 4.949657382965088



110it [00:02, 48.68it/s][A
115it [00:02, 48.71it/s][A
120it [00:02, 48.77it/s][A
125it [00:02, 48.75it/s][A
130it [00:02, 48.78it/s][A
135it [00:02, 48.81it/s][A
140it [00:02, 48.75it/s][A
145it [00:02, 48.75it/s][A
150it [00:03, 48.72it/s][A
155it [00:03, 48.83it/s][A
160it [00:03, 48.91it/s][A
165it [00:03, 48.98it/s][A
170it [00:03, 49.00it/s][A
175it [00:03, 49.02it/s][A
180it [00:03, 48.90it/s][A
185it [00:03, 48.92it/s][A
190it [00:03, 48.61it/s][A
195it [00:04, 48.73it/s][A
200it [00:04, 48.66it/s][A
205it [00:04, 48.52it/s][A

Epoch: 1, Step: 200, Loss: 4.963284723758697



210it [00:04, 48.54it/s][A
215it [00:04, 47.98it/s][A
220it [00:04, 48.10it/s][A
227it [00:04, 48.46it/s]
  0%|          | 1/500 [00:04<38:58,  4.69s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.11it/s][A
10it [00:00, 47.94it/s][A
15it [00:00, 48.27it/s][A
20it [00:00, 45.77it/s][A
25it [00:00, 46.80it/s][A
30it [00:00, 46.83it/s][A
35it [00:00, 46.65it/s][A
40it [00:00, 47.04it/s][A
45it [00:00, 47.15it/s][A
50it [00:01, 47.34it/s][A
55it [00:01, 47.77it/s][A
60it [00:01, 48.05it/s][A
65it [00:01, 48.32it/s][A
70it [00:01, 48.51it/s][A
75it [00:01, 48.67it/s][A
80it [00:01, 48.75it/s][A
85it [00:01, 48.87it/s][A
90it [00:01, 48.84it/s][A
95it [00:01, 48.92it/s][A
100it [00:02, 48.88it/s][A
105it [00:02, 48.89it/s][A

Epoch: 2, Step: 100, Loss: 4.94382426738739



110it [00:02, 48.85it/s][A
115it [00:02, 48.96it/s][A
120it [00:02, 48.86it/s][A
125it [00:02, 48.70it/s][A
130it [00:02, 48.75it/s][A
135it [00:02, 48.65it/s][A
140it [00:02, 48.60it/s][A
145it [00:03, 48.53it/s][A
150it [00:03, 48.49it/s][A
155it [00:03, 48.51it/s][A
160it [00:03, 48.61it/s][A
165it [00:03, 48.38it/s][A
170it [00:03, 48.03it/s][A
175it [00:03, 47.95it/s][A
180it [00:03, 48.01it/s][A
185it [00:03, 48.16it/s][A
190it [00:03, 48.24it/s][A
195it [00:04, 48.26it/s][A
200it [00:04, 48.36it/s][A
205it [00:04, 48.38it/s][A

Epoch: 2, Step: 200, Loss: 4.9501638007164



210it [00:04, 48.50it/s][A
215it [00:04, 48.53it/s][A
220it [00:04, 48.59it/s][A
227it [00:04, 48.23it/s]
  0%|          | 2/500 [00:09<39:00,  4.70s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.59it/s][A
10it [00:00, 48.37it/s][A
15it [00:00, 48.48it/s][A
20it [00:00, 48.59it/s][A
25it [00:00, 48.46it/s][A
30it [00:00, 48.69it/s][A
35it [00:00, 48.83it/s][A
40it [00:00, 48.95it/s][A
45it [00:00, 48.92it/s][A
50it [00:01, 48.90it/s][A
55it [00:01, 48.92it/s][A
60it [00:01, 48.84it/s][A
65it [00:01, 48.87it/s][A
70it [00:01, 48.67it/s][A
75it [00:01, 48.69it/s][A
80it [00:01, 48.78it/s][A
85it [00:01, 48.86it/s][A
90it [00:01, 48.79it/s][A
95it [00:01, 48.69it/s][A
100it [00:02, 48.78it/s][A
105it [00:02, 48.78it/s][A

Epoch: 3, Step: 100, Loss: 4.935206613540649



110it [00:02, 48.76it/s][A
115it [00:02, 48.70it/s][A
120it [00:02, 48.58it/s][A
125it [00:02, 48.45it/s][A
130it [00:02, 48.52it/s][A
135it [00:02, 48.53it/s][A
140it [00:02, 48.46it/s][A
145it [00:02, 48.51it/s][A
150it [00:03, 48.59it/s][A
155it [00:03, 48.51it/s][A
160it [00:03, 48.36it/s][A
165it [00:03, 48.19it/s][A
170it [00:03, 48.23it/s][A
175it [00:03, 48.31it/s][A
180it [00:03, 48.36it/s][A
185it [00:03, 48.42it/s][A
190it [00:03, 48.45it/s][A
195it [00:04, 48.49it/s][A
200it [00:04, 48.48it/s][A
205it [00:04, 48.51it/s][A

Epoch: 3, Step: 200, Loss: 4.949130411148071



210it [00:04, 48.26it/s][A
215it [00:04, 48.20it/s][A
220it [00:04, 48.35it/s][A
227it [00:04, 48.55it/s]
  1%|          | 3/500 [00:14<38:51,  4.69s/it]
0it [00:00, ?it/s][A
5it [00:00, 49.06it/s][A
10it [00:00, 48.79it/s][A
15it [00:00, 48.72it/s][A
20it [00:00, 48.71it/s][A
25it [00:00, 48.71it/s][A
30it [00:00, 47.75it/s][A
35it [00:00, 47.93it/s][A
40it [00:00, 48.11it/s][A
45it [00:00, 48.32it/s][A
50it [00:01, 48.53it/s][A
55it [00:01, 48.64it/s][A
60it [00:01, 48.66it/s][A
65it [00:01, 48.73it/s][A
70it [00:01, 48.79it/s][A
75it [00:01, 48.83it/s][A
80it [00:01, 48.82it/s][A
85it [00:01, 47.81it/s][A
90it [00:01, 47.87it/s][A
95it [00:01, 48.03it/s][A
100it [00:02, 48.21it/s][A
105it [00:02, 48.55it/s][A

Epoch: 4, Step: 100, Loss: 4.92980947971344



110it [00:02, 48.40it/s][A
115it [00:02, 48.19it/s][A
120it [00:02, 48.31it/s][A
125it [00:02, 48.40it/s][A
130it [00:02, 48.16it/s][A
135it [00:02, 47.77it/s][A
140it [00:02, 47.43it/s][A
145it [00:03, 47.55it/s][A
150it [00:03, 47.80it/s][A
155it [00:03, 48.07it/s][A
160it [00:03, 48.06it/s][A
165it [00:03, 48.08it/s][A
170it [00:03, 48.31it/s][A
175it [00:03, 48.31it/s][A
180it [00:03, 48.46it/s][A
185it [00:03, 48.53it/s][A
190it [00:03, 48.72it/s][A
195it [00:04, 48.83it/s][A
200it [00:04, 48.86it/s][A
205it [00:04, 48.89it/s][A

Epoch: 4, Step: 200, Loss: 4.945190358161926



210it [00:04, 48.83it/s][A
215it [00:04, 48.61it/s][A
220it [00:04, 48.61it/s][A
227it [00:04, 48.36it/s]
  1%|          | 4/500 [00:18<38:47,  4.69s/it]
0it [00:00, ?it/s][A
5it [00:00, 47.73it/s][A
10it [00:00, 48.12it/s][A
15it [00:00, 48.40it/s][A
20it [00:00, 48.51it/s][A
25it [00:00, 48.57it/s][A
30it [00:00, 48.52it/s][A
35it [00:00, 48.45it/s][A
40it [00:00, 48.34it/s][A
45it [00:00, 48.30it/s][A
50it [00:01, 48.41it/s][A
55it [00:01, 48.50it/s][A
60it [00:01, 48.62it/s][A
65it [00:01, 48.65it/s][A
70it [00:01, 48.73it/s][A
75it [00:01, 48.77it/s][A
80it [00:01, 48.76it/s][A
85it [00:01, 48.11it/s][A
90it [00:01, 48.30it/s][A
95it [00:01, 47.90it/s][A
100it [00:02, 47.91it/s][A
105it [00:02, 48.07it/s][A

Epoch: 5, Step: 100, Loss: 4.920273098945618



110it [00:02, 48.24it/s][A
115it [00:02, 48.28it/s][A
120it [00:02, 48.38it/s][A
125it [00:02, 48.26it/s][A
130it [00:02, 48.08it/s][A
135it [00:02, 48.06it/s][A
140it [00:02, 48.11it/s][A
145it [00:03, 48.23it/s][A
150it [00:03, 48.40it/s][A
155it [00:03, 48.02it/s][A
160it [00:03, 48.19it/s][A
165it [00:03, 48.40it/s][A
170it [00:03, 48.38it/s][A
175it [00:03, 48.49it/s][A
180it [00:03, 48.57it/s][A
185it [00:03, 48.63it/s][A
190it [00:03, 48.69it/s][A
195it [00:04, 48.69it/s][A
200it [00:04, 48.51it/s][A
205it [00:04, 48.56it/s][A

Epoch: 5, Step: 200, Loss: 4.941899123191834



210it [00:04, 48.37it/s][A
215it [00:04, 48.42it/s][A
220it [00:04, 48.34it/s][A
227it [00:04, 48.35it/s]

  batch = {k: torch.tensor(batch[k]).to(device, non_blocking=True) for k in data_fields}

7it [00:00, 61.63it/s][A
14it [00:00, 61.85it/s][A
21it [00:00, 62.27it/s][A
28it [00:00, 62.59it/s][A
35it [00:00, 62.85it/s][A
42it [00:00, 62.94it/s][A
49it [00:00, 62.95it/s][A
56it [00:00, 63.01it/s][A
63it [00:01, 63.09it/s][A
70it [00:01, 63.10it/s][A
77it [00:01, 62.93it/s][A
84it [00:01, 62.76it/s][A
91it [00:01, 62.85it/s][A
98it [00:01, 62.95it/s][A
105it [00:01, 62.97it/s][A
112it [00:01, 63.10it/s][A
119it [00:01, 63.03it/s][A
126it [00:02, 63.14it/s][A
133it [00:02, 62.95it/s][A
140it [00:02, 62.87it/s][A
147it [00:02, 62.67it/s][A
154it [00:02, 62.84it/s][A
161it [00:02, 62.92it/s][A
168it [00:02, 62.98it/s][A
175it [00:02, 63.05it/s][A
182it [00:02, 63.15it/s][A
189it [00:03, 63.27it/s][A
196it [00:03, 63.19it/s][A
203it [00:03, 63.00it/s][A
21


Epoch: 5, Test Loss: 5.42355499800688, Test Perplexity: 227.31422417208276




0it [00:00, ?it/s][A
5it [00:00, 47.76it/s][A
10it [00:00, 47.10it/s][A
15it [00:00, 47.36it/s][A
20it [00:00, 47.73it/s][A
25it [00:00, 47.89it/s][A
30it [00:00, 47.80it/s][A
35it [00:00, 47.60it/s][A
40it [00:00, 47.75it/s][A
45it [00:00, 47.69it/s][A
50it [00:01, 47.26it/s][A
55it [00:01, 46.85it/s][A
60it [00:01, 46.87it/s][A
65it [00:01, 47.22it/s][A
70it [00:01, 47.49it/s][A
75it [00:01, 47.47it/s][A
80it [00:01, 47.60it/s][A
85it [00:01, 47.84it/s][A
90it [00:01, 47.99it/s][A
95it [00:01, 47.98it/s][A
100it [00:02, 47.66it/s][A
105it [00:02, 47.45it/s][A

Epoch: 6, Step: 100, Loss: 4.924993233680725



110it [00:02, 47.58it/s][A
115it [00:02, 47.83it/s][A
120it [00:02, 47.76it/s][A
125it [00:02, 47.74it/s][A
130it [00:02, 47.58it/s][A
135it [00:02, 47.62it/s][A
140it [00:02, 47.66it/s][A
145it [00:03, 47.81it/s][A
150it [00:03, 47.73it/s][A
155it [00:03, 47.65it/s][A
160it [00:03, 47.67it/s][A
165it [00:03, 47.22it/s][A
170it [00:03, 46.75it/s][A
175it [00:03, 46.49it/s][A
180it [00:03, 46.51it/s][A
185it [00:03, 46.88it/s][A
190it [00:04, 47.27it/s][A
195it [00:04, 47.69it/s][A
200it [00:04, 47.87it/s][A
205it [00:04, 47.72it/s][A

Epoch: 6, Step: 200, Loss: 4.936957929134369



210it [00:04, 47.49it/s][A
215it [00:04, 46.99it/s][A
220it [00:04, 47.00it/s][A
227it [00:04, 47.43it/s]
  1%|          | 6/500 [00:38<58:59,  7.16s/it]  
0it [00:00, ?it/s][A
5it [00:00, 44.58it/s][A
10it [00:00, 44.85it/s][A
15it [00:00, 45.18it/s][A
20it [00:00, 46.17it/s][A
25it [00:00, 46.53it/s][A
30it [00:00, 46.79it/s][A
35it [00:00, 47.10it/s][A
40it [00:00, 47.42it/s][A
45it [00:00, 47.64it/s][A
50it [00:01, 47.33it/s][A
55it [00:01, 47.31it/s][A
60it [00:01, 46.82it/s][A
65it [00:01, 46.43it/s][A
70it [00:01, 45.92it/s][A
75it [00:01, 46.08it/s][A
80it [00:01, 46.46it/s][A
85it [00:01, 46.86it/s][A
90it [00:01, 47.27it/s][A
95it [00:02, 47.63it/s][A
100it [00:02, 47.67it/s][A
105it [00:02, 47.94it/s][A

Epoch: 7, Step: 100, Loss: 4.914947280883789



110it [00:02, 47.98it/s][A
115it [00:02, 47.98it/s][A
120it [00:02, 47.53it/s][A
125it [00:02, 47.22it/s][A
130it [00:02, 47.03it/s][A
135it [00:02, 46.96it/s][A
140it [00:02, 46.52it/s][A
145it [00:03, 46.67it/s][A
150it [00:03, 46.92it/s][A
155it [00:03, 46.85it/s][A
160it [00:03, 46.79it/s][A
165it [00:03, 46.87it/s][A
170it [00:03, 46.88it/s][A
175it [00:03, 47.30it/s][A
180it [00:03, 47.70it/s][A
185it [00:03, 47.94it/s][A
190it [00:04, 48.12it/s][A
195it [00:04, 48.23it/s][A
200it [00:04, 48.33it/s][A
205it [00:04, 48.05it/s][A

Epoch: 7, Step: 200, Loss: 4.933672702312469



210it [00:04, 47.90it/s][A
215it [00:04, 48.08it/s][A
220it [00:04, 47.84it/s][A
227it [00:04, 47.21it/s]
  1%|▏         | 7/500 [00:43<52:32,  6.40s/it]
0it [00:00, ?it/s][A
5it [00:00, 47.89it/s][A
10it [00:00, 47.60it/s][A
15it [00:00, 47.49it/s][A
20it [00:00, 47.36it/s][A
25it [00:00, 47.41it/s][A
30it [00:00, 47.57it/s][A
35it [00:00, 47.74it/s][A
40it [00:00, 46.67it/s][A
45it [00:00, 46.64it/s][A
50it [00:01, 46.48it/s][A
55it [00:01, 46.33it/s][A
60it [00:01, 46.55it/s][A
65it [00:01, 46.70it/s][A
70it [00:01, 46.37it/s][A
75it [00:01, 46.77it/s][A
80it [00:01, 47.18it/s][A
85it [00:01, 47.52it/s][A
90it [00:01, 47.33it/s][A
95it [00:02, 47.07it/s][A
100it [00:02, 46.16it/s][A
105it [00:02, 45.19it/s][A

Epoch: 8, Step: 100, Loss: 4.91900595664978



110it [00:02, 45.20it/s][A
115it [00:02, 45.63it/s][A
120it [00:02, 46.01it/s][A
125it [00:02, 46.17it/s][A
130it [00:02, 46.24it/s][A
135it [00:02, 46.27it/s][A
140it [00:03, 46.52it/s][A
145it [00:03, 46.64it/s][A
150it [00:03, 46.69it/s][A
155it [00:03, 46.60it/s][A
160it [00:03, 45.96it/s][A
165it [00:03, 45.94it/s][A
170it [00:03, 46.04it/s][A
175it [00:03, 46.25it/s][A
180it [00:03, 45.98it/s][A
185it [00:03, 46.27it/s][A
190it [00:04, 46.81it/s][A
195it [00:04, 46.95it/s][A
200it [00:04, 46.79it/s][A
205it [00:04, 47.02it/s][A

Epoch: 8, Step: 200, Loss: 4.930286254882812



210it [00:04, 47.12it/s][A
215it [00:04, 47.38it/s][A
220it [00:04, 47.66it/s][A
227it [00:04, 46.70it/s]
  2%|▏         | 8/500 [00:48<48:26,  5.91s/it]
0it [00:00, ?it/s][A
5it [00:00, 47.14it/s][A
10it [00:00, 47.01it/s][A
15it [00:00, 46.71it/s][A
20it [00:00, 47.18it/s][A
25it [00:00, 47.35it/s][A
30it [00:00, 47.40it/s][A
35it [00:00, 47.37it/s][A
40it [00:00, 47.59it/s][A
45it [00:00, 47.65it/s][A
50it [00:01, 47.84it/s][A
55it [00:01, 47.77it/s][A
60it [00:01, 47.80it/s][A
65it [00:01, 47.69it/s][A
70it [00:01, 47.61it/s][A
75it [00:01, 47.51it/s][A
80it [00:01, 47.40it/s][A
85it [00:01, 47.42it/s][A
90it [00:01, 46.98it/s][A
95it [00:02, 46.89it/s][A
100it [00:02, 46.87it/s][A
105it [00:02, 46.99it/s][A

Epoch: 9, Step: 100, Loss: 4.9109120321273805



110it [00:02, 46.82it/s][A
115it [00:02, 45.92it/s][A
120it [00:02, 46.26it/s][A
125it [00:02, 46.69it/s][A
130it [00:02, 46.77it/s][A
135it [00:02, 46.41it/s][A
140it [00:02, 46.43it/s][A
145it [00:03, 46.41it/s][A
150it [00:03, 46.45it/s][A
155it [00:03, 46.57it/s][A
160it [00:03, 47.08it/s][A
165it [00:03, 47.42it/s][A
170it [00:03, 47.78it/s][A
175it [00:03, 47.88it/s][A
180it [00:03, 48.02it/s][A
185it [00:03, 48.24it/s][A
190it [00:04, 48.19it/s][A
195it [00:04, 48.16it/s][A
200it [00:04, 47.99it/s][A
205it [00:04, 48.06it/s][A

Epoch: 9, Step: 200, Loss: 4.927876183986664



210it [00:04, 48.00it/s][A
215it [00:04, 48.18it/s][A
220it [00:04, 48.10it/s][A
227it [00:04, 47.33it/s]
  2%|▏         | 9/500 [00:52<45:30,  5.56s/it]
0it [00:00, ?it/s][A
5it [00:00, 47.04it/s][A
10it [00:00, 46.45it/s][A
15it [00:00, 46.31it/s][A
20it [00:00, 46.24it/s][A
25it [00:00, 46.55it/s][A
30it [00:00, 46.92it/s][A
35it [00:00, 47.08it/s][A
40it [00:00, 47.17it/s][A
45it [00:00, 47.55it/s][A
50it [00:01, 47.94it/s][A
55it [00:01, 48.12it/s][A
60it [00:01, 48.14it/s][A
65it [00:01, 48.14it/s][A
70it [00:01, 48.06it/s][A
75it [00:01, 48.11it/s][A
80it [00:01, 48.22it/s][A
85it [00:01, 48.22it/s][A
90it [00:01, 48.27it/s][A
95it [00:01, 48.37it/s][A
100it [00:02, 48.46it/s][A
105it [00:02, 48.58it/s][A

Epoch: 10, Step: 100, Loss: 4.904231991767883



110it [00:02, 48.63it/s][A
115it [00:02, 48.64it/s][A
120it [00:02, 48.52it/s][A
125it [00:02, 48.54it/s][A
130it [00:02, 48.28it/s][A
135it [00:02, 48.00it/s][A
140it [00:02, 47.63it/s][A
145it [00:03, 47.49it/s][A
150it [00:03, 47.43it/s][A
155it [00:03, 47.30it/s][A
160it [00:03, 47.16it/s][A
165it [00:03, 47.22it/s][A
170it [00:03, 47.37it/s][A
175it [00:03, 47.48it/s][A
180it [00:03, 47.73it/s][A
185it [00:03, 47.86it/s][A
190it [00:03, 47.92it/s][A
195it [00:04, 48.14it/s][A
200it [00:04, 48.21it/s][A
205it [00:04, 47.86it/s][A

Epoch: 10, Step: 200, Loss: 4.919809508323669



210it [00:04, 47.43it/s][A
215it [00:04, 47.13it/s][A
220it [00:04, 46.90it/s][A
227it [00:04, 47.65it/s]

0it [00:00, ?it/s][A
7it [00:00, 61.37it/s][A
14it [00:00, 61.81it/s][A
21it [00:00, 62.28it/s][A
28it [00:00, 62.31it/s][A
35it [00:00, 62.43it/s][A
42it [00:00, 62.76it/s][A
49it [00:00, 62.59it/s][A
56it [00:00, 62.57it/s][A
63it [00:01, 62.20it/s][A
70it [00:01, 62.48it/s][A
77it [00:01, 62.66it/s][A
84it [00:01, 62.97it/s][A
91it [00:01, 63.08it/s][A
98it [00:01, 62.99it/s][A
105it [00:01, 63.06it/s][A
112it [00:01, 62.93it/s][A
119it [00:01, 62.85it/s][A
126it [00:02, 62.90it/s][A
133it [00:02, 63.01it/s][A
140it [00:02, 63.10it/s][A
147it [00:02, 63.24it/s][A
154it [00:02, 63.32it/s][A
161it [00:02, 63.41it/s][A
168it [00:02, 63.47it/s][A
175it [00:02, 63.53it/s][A
182it [00:02, 63.42it/s][A
189it [00:03, 63.44it/s][A
196it [00:03, 62.97it/s][A
203it [00:03, 63.01it/s][A
210it [00:03, 63.16it/s][A
217it [00:03, 63.23it/s][A
224it [00:03, 


Epoch: 10, Test Loss: 5.423792765007256, Test Perplexity: 227.36972375241865




0it [00:00, ?it/s][A
5it [00:00, 48.25it/s][A
10it [00:00, 48.04it/s][A
15it [00:00, 48.31it/s][A
20it [00:00, 48.37it/s][A
25it [00:00, 48.05it/s][A
30it [00:00, 47.94it/s][A
35it [00:00, 47.95it/s][A
40it [00:00, 48.04it/s][A
45it [00:00, 48.14it/s][A
50it [00:01, 48.32it/s][A
55it [00:01, 48.26it/s][A
60it [00:01, 48.03it/s][A
65it [00:01, 47.72it/s][A
70it [00:01, 47.58it/s][A
75it [00:01, 47.27it/s][A
80it [00:01, 47.18it/s][A
85it [00:01, 46.96it/s][A
90it [00:01, 47.28it/s][A
95it [00:01, 47.63it/s][A
100it [00:02, 47.86it/s][A
105it [00:02, 48.04it/s][A

Epoch: 11, Step: 100, Loss: 4.907064785957337



110it [00:02, 47.98it/s][A
115it [00:02, 47.87it/s][A
120it [00:02, 47.97it/s][A
125it [00:02, 48.16it/s][A
130it [00:02, 48.07it/s][A
135it [00:02, 48.09it/s][A
140it [00:02, 48.15it/s][A
145it [00:03, 48.36it/s][A
150it [00:03, 48.44it/s][A
155it [00:03, 48.36it/s][A
160it [00:03, 48.32it/s][A
165it [00:03, 48.25it/s][A
170it [00:03, 48.24it/s][A
175it [00:03, 48.20it/s][A
180it [00:03, 48.26it/s][A
185it [00:03, 48.04it/s][A
190it [00:03, 48.17it/s][A
195it [00:04, 48.27it/s][A
200it [00:04, 48.30it/s][A
205it [00:04, 48.27it/s][A

Epoch: 11, Step: 200, Loss: 4.920467677116394



210it [00:04, 48.35it/s][A
215it [00:04, 48.26it/s][A
220it [00:04, 48.27it/s][A
227it [00:04, 48.04it/s]
  2%|▏         | 11/500 [01:12<59:51,  7.35s/it]  
0it [00:00, ?it/s][A
5it [00:00, 45.51it/s][A
10it [00:00, 45.53it/s][A
15it [00:00, 45.80it/s][A
20it [00:00, 46.35it/s][A
25it [00:00, 46.87it/s][A
30it [00:00, 45.01it/s][A
35it [00:00, 45.86it/s][A
40it [00:00, 46.18it/s][A
45it [00:00, 46.02it/s][A
50it [00:01, 45.53it/s][A
55it [00:01, 45.14it/s][A
60it [00:01, 44.87it/s][A
65it [00:01, 45.41it/s][A
70it [00:01, 45.97it/s][A
75it [00:01, 46.53it/s][A
80it [00:01, 46.73it/s][A
85it [00:01, 46.95it/s][A
90it [00:01, 46.91it/s][A
95it [00:02, 46.10it/s][A
100it [00:02, 46.09it/s][A
105it [00:02, 46.66it/s][A

Epoch: 12, Step: 100, Loss: 4.905740580558777



110it [00:02, 46.77it/s][A
115it [00:02, 47.00it/s][A
120it [00:02, 47.22it/s][A
125it [00:02, 47.25it/s][A
130it [00:02, 47.49it/s][A
135it [00:02, 47.58it/s][A
140it [00:03, 47.47it/s][A
145it [00:03, 47.58it/s][A
150it [00:03, 47.84it/s][A
155it [00:03, 47.84it/s][A
160it [00:03, 47.80it/s][A
165it [00:03, 47.83it/s][A
170it [00:03, 47.83it/s][A
175it [00:03, 47.90it/s][A
180it [00:03, 48.08it/s][A
185it [00:03, 47.99it/s][A
190it [00:04, 48.03it/s][A
195it [00:04, 47.99it/s][A
200it [00:04, 47.97it/s][A
205it [00:04, 47.91it/s][A

Epoch: 12, Step: 200, Loss: 4.916433064937592



210it [00:04, 48.01it/s][A
215it [00:04, 48.14it/s][A
220it [00:04, 48.23it/s][A
227it [00:04, 47.01it/s]
  2%|▏         | 12/500 [01:17<53:31,  6.58s/it]
0it [00:00, ?it/s][A
5it [00:00, 47.28it/s][A
10it [00:00, 47.31it/s][A
15it [00:00, 47.38it/s][A
20it [00:00, 47.35it/s][A
25it [00:00, 47.47it/s][A
30it [00:00, 47.59it/s][A
35it [00:00, 47.76it/s][A
40it [00:00, 47.80it/s][A
45it [00:00, 47.79it/s][A
50it [00:01, 47.81it/s][A
55it [00:01, 47.70it/s][A
60it [00:01, 47.80it/s][A
65it [00:01, 47.91it/s][A
70it [00:01, 47.95it/s][A
75it [00:01, 48.01it/s][A
80it [00:01, 48.20it/s][A
85it [00:01, 48.41it/s][A
90it [00:01, 48.47it/s][A
95it [00:01, 48.43it/s][A
100it [00:02, 48.44it/s][A
105it [00:02, 48.52it/s][A

Epoch: 13, Step: 100, Loss: 4.900459780693054



110it [00:02, 48.29it/s][A
115it [00:02, 48.37it/s][A
120it [00:02, 47.71it/s][A
125it [00:02, 48.04it/s][A
130it [00:02, 48.27it/s][A
135it [00:02, 48.35it/s][A
140it [00:02, 48.34it/s][A
145it [00:03, 48.18it/s][A
150it [00:03, 48.16it/s][A
155it [00:03, 47.95it/s][A
160it [00:03, 47.80it/s][A
165it [00:03, 47.90it/s][A
170it [00:03, 47.89it/s][A
175it [00:03, 47.78it/s][A
180it [00:03, 47.86it/s][A
185it [00:03, 47.95it/s][A
190it [00:03, 48.01it/s][A
195it [00:04, 47.67it/s][A
200it [00:04, 47.56it/s][A
205it [00:04, 47.68it/s][A

Epoch: 13, Step: 200, Loss: 4.911067080497742



210it [00:04, 47.56it/s][A
215it [00:04, 47.59it/s][A
220it [00:04, 47.85it/s][A
227it [00:04, 47.92it/s]
  3%|▎         | 13/500 [01:22<48:53,  6.02s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.64it/s][A
10it [00:00, 47.98it/s][A
15it [00:00, 48.33it/s][A
20it [00:00, 48.39it/s][A
25it [00:00, 48.47it/s][A
30it [00:00, 48.57it/s][A
35it [00:00, 48.65it/s][A
40it [00:00, 48.37it/s][A
45it [00:00, 48.36it/s][A
50it [00:01, 48.40it/s][A
55it [00:01, 48.43it/s][A
60it [00:01, 48.46it/s][A
65it [00:01, 48.44it/s][A
70it [00:01, 48.38it/s][A
75it [00:01, 48.39it/s][A
80it [00:01, 48.41it/s][A
85it [00:01, 48.45it/s][A
90it [00:01, 48.09it/s][A
95it [00:01, 47.34it/s][A
100it [00:02, 47.18it/s][A
105it [00:02, 47.34it/s][A

Epoch: 14, Step: 100, Loss: 4.894035406112671



110it [00:02, 47.51it/s][A
115it [00:02, 47.40it/s][A
120it [00:02, 47.42it/s][A
125it [00:02, 47.43it/s][A
130it [00:02, 47.49it/s][A
135it [00:02, 47.52it/s][A
140it [00:02, 47.46it/s][A
145it [00:03, 47.51it/s][A
150it [00:03, 47.21it/s][A
155it [00:03, 47.21it/s][A
160it [00:03, 47.12it/s][A
165it [00:03, 47.46it/s][A
170it [00:03, 47.44it/s][A
175it [00:03, 47.57it/s][A
180it [00:03, 47.78it/s][A
185it [00:03, 47.85it/s][A
190it [00:03, 48.01it/s][A
195it [00:04, 48.05it/s][A
200it [00:04, 48.03it/s][A
205it [00:04, 48.13it/s][A

Epoch: 14, Step: 200, Loss: 4.911860859394073



210it [00:04, 47.99it/s][A
215it [00:04, 47.97it/s][A
220it [00:04, 47.83it/s][A
227it [00:04, 47.85it/s]
  3%|▎         | 14/500 [01:27<45:39,  5.64s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.37it/s][A
10it [00:00, 47.93it/s][A
15it [00:00, 48.16it/s][A
20it [00:00, 48.28it/s][A
25it [00:00, 48.31it/s][A
30it [00:00, 48.24it/s][A
35it [00:00, 48.20it/s][A
40it [00:00, 48.22it/s][A
45it [00:00, 48.20it/s][A
50it [00:01, 48.19it/s][A
55it [00:01, 48.21it/s][A
60it [00:01, 48.13it/s][A
65it [00:01, 48.19it/s][A
70it [00:01, 48.24it/s][A
75it [00:01, 48.25it/s][A
80it [00:01, 48.11it/s][A
85it [00:01, 47.93it/s][A
90it [00:01, 48.02it/s][A
95it [00:01, 48.15it/s][A
100it [00:02, 48.18it/s][A
105it [00:02, 48.08it/s][A

Epoch: 15, Step: 100, Loss: 4.898897523880005



110it [00:02, 48.01it/s][A
115it [00:02, 48.03it/s][A
120it [00:02, 47.90it/s][A
125it [00:02, 47.96it/s][A
130it [00:02, 48.09it/s][A
135it [00:02, 48.09it/s][A
140it [00:02, 48.01it/s][A
145it [00:03, 48.06it/s][A
150it [00:03, 48.20it/s][A
155it [00:03, 48.23it/s][A
160it [00:03, 47.93it/s][A
165it [00:03, 47.65it/s][A
170it [00:03, 47.52it/s][A
175it [00:03, 47.55it/s][A
180it [00:03, 47.35it/s][A
185it [00:03, 47.45it/s][A
190it [00:03, 47.50it/s][A
195it [00:04, 47.68it/s][A
200it [00:04, 47.62it/s][A
205it [00:04, 47.68it/s][A

Epoch: 15, Step: 200, Loss: 4.9051805591583255



210it [00:04, 47.52it/s][A
215it [00:04, 47.71it/s][A
220it [00:04, 47.65it/s][A
227it [00:04, 47.89it/s]

0it [00:00, ?it/s][A
7it [00:00, 61.06it/s][A
14it [00:00, 61.67it/s][A
21it [00:00, 62.29it/s][A
28it [00:00, 62.40it/s][A
35it [00:00, 62.51it/s][A
42it [00:00, 62.20it/s][A
49it [00:00, 62.31it/s][A
56it [00:00, 62.02it/s][A
63it [00:01, 62.13it/s][A
70it [00:01, 62.11it/s][A
77it [00:01, 62.29it/s][A
84it [00:01, 62.48it/s][A
91it [00:01, 62.65it/s][A
98it [00:01, 62.74it/s][A
105it [00:01, 62.33it/s][A
112it [00:01, 62.37it/s][A
119it [00:01, 62.02it/s][A
126it [00:02, 61.96it/s][A
133it [00:02, 61.86it/s][A
140it [00:02, 62.13it/s][A
147it [00:02, 62.17it/s][A
154it [00:02, 62.15it/s][A
161it [00:02, 62.31it/s][A
168it [00:02, 62.33it/s][A
175it [00:02, 62.53it/s][A
182it [00:02, 62.54it/s][A
189it [00:03, 62.63it/s][A
196it [00:03, 62.62it/s][A
203it [00:03, 62.69it/s][A
210it [00:03, 62.82it/s][A
217it [00:03, 62.94it/s][A
224it [00:03, 


Epoch: 15, Test Loss: 5.4236319124328425, Test Perplexity: 227.31504679614713




0it [00:00, ?it/s][A
5it [00:00, 47.36it/s][A
10it [00:00, 47.09it/s][A
15it [00:00, 47.36it/s][A
20it [00:00, 47.62it/s][A
25it [00:00, 47.57it/s][A
30it [00:00, 47.30it/s][A
35it [00:00, 47.16it/s][A
40it [00:00, 47.13it/s][A
45it [00:00, 47.10it/s][A
50it [00:01, 47.42it/s][A
55it [00:01, 47.73it/s][A
60it [00:01, 47.92it/s][A
65it [00:01, 47.96it/s][A
70it [00:01, 48.02it/s][A
75it [00:01, 48.13it/s][A
80it [00:01, 47.76it/s][A
85it [00:01, 47.73it/s][A
90it [00:01, 47.79it/s][A
95it [00:01, 47.96it/s][A
100it [00:02, 47.97it/s][A
105it [00:02, 48.27it/s][A

Epoch: 16, Step: 100, Loss: 4.885257749557495



110it [00:02, 48.34it/s][A
115it [00:02, 48.46it/s][A
120it [00:02, 48.23it/s][A
125it [00:02, 47.70it/s][A
130it [00:02, 47.55it/s][A
135it [00:02, 46.89it/s][A
140it [00:02, 47.16it/s][A
145it [00:03, 47.26it/s][A
150it [00:03, 47.65it/s][A
155it [00:03, 47.74it/s][A
160it [00:03, 48.03it/s][A
165it [00:03, 47.86it/s][A
170it [00:03, 47.84it/s][A
175it [00:03, 47.39it/s][A
180it [00:03, 47.07it/s][A
185it [00:03, 47.26it/s][A
190it [00:03, 47.57it/s][A
195it [00:04, 47.69it/s][A
200it [00:04, 47.93it/s][A
205it [00:04, 47.94it/s][A

Epoch: 16, Step: 200, Loss: 4.902988827228546



210it [00:04, 47.68it/s][A
215it [00:04, 47.63it/s][A
220it [00:04, 47.45it/s][A
227it [00:04, 47.61it/s]
  3%|▎         | 16/500 [01:46<59:24,  7.36s/it]  
0it [00:00, ?it/s][A
5it [00:00, 48.04it/s][A
10it [00:00, 45.52it/s][A
15it [00:00, 45.70it/s][A
20it [00:00, 45.90it/s][A
25it [00:00, 46.20it/s][A
30it [00:00, 46.93it/s][A
35it [00:00, 47.19it/s][A
40it [00:00, 47.02it/s][A
45it [00:00, 46.98it/s][A
50it [00:01, 47.25it/s][A
55it [00:01, 47.20it/s][A
60it [00:01, 47.07it/s][A
65it [00:01, 46.73it/s][A
70it [00:01, 46.64it/s][A
75it [00:01, 46.43it/s][A
80it [00:01, 46.94it/s][A
85it [00:01, 47.33it/s][A
90it [00:01, 47.56it/s][A
95it [00:02, 47.76it/s][A
100it [00:02, 48.08it/s][A
105it [00:02, 48.11it/s][A

Epoch: 17, Step: 100, Loss: 4.889044013023376



110it [00:02, 48.14it/s][A
115it [00:02, 48.25it/s][A
120it [00:02, 48.36it/s][A
125it [00:02, 48.44it/s][A
130it [00:02, 48.52it/s][A
135it [00:02, 48.64it/s][A
140it [00:02, 48.27it/s][A
145it [00:03, 48.33it/s][A
150it [00:03, 48.52it/s][A
155it [00:03, 48.47it/s][A
160it [00:03, 48.60it/s][A
165it [00:03, 48.73it/s][A
170it [00:03, 48.79it/s][A
175it [00:03, 48.71it/s][A
180it [00:03, 48.79it/s][A
185it [00:03, 48.70it/s][A
190it [00:03, 48.43it/s][A
195it [00:04, 48.62it/s][A
200it [00:04, 48.81it/s][A
205it [00:04, 48.74it/s][A

Epoch: 17, Step: 200, Loss: 4.900312678813934



210it [00:04, 48.47it/s][A
215it [00:04, 48.55it/s][A
220it [00:04, 48.63it/s][A
227it [00:04, 47.89it/s]
  3%|▎         | 17/500 [01:51<52:56,  6.58s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.70it/s][A
10it [00:00, 48.76it/s][A
15it [00:00, 47.28it/s][A
20it [00:00, 47.49it/s][A
25it [00:00, 47.96it/s][A
30it [00:00, 48.18it/s][A
35it [00:00, 48.19it/s][A
40it [00:00, 48.35it/s][A
45it [00:00, 48.44it/s][A
50it [00:01, 48.50it/s][A
55it [00:01, 48.55it/s][A
60it [00:01, 48.59it/s][A
65it [00:01, 48.52it/s][A
70it [00:01, 48.75it/s][A
75it [00:01, 48.47it/s][A
80it [00:01, 48.67it/s][A
85it [00:01, 48.60it/s][A
90it [00:01, 48.74it/s][A
95it [00:01, 48.56it/s][A
100it [00:02, 48.42it/s][A
105it [00:02, 48.25it/s][A

Epoch: 18, Step: 100, Loss: 4.8861067533493046



110it [00:02, 48.11it/s][A
115it [00:02, 48.27it/s][A
120it [00:02, 48.45it/s][A
125it [00:02, 48.51it/s][A
130it [00:02, 48.58it/s][A
135it [00:02, 48.37it/s][A
140it [00:02, 48.53it/s][A
145it [00:02, 48.43it/s][A
150it [00:03, 48.57it/s][A
155it [00:03, 48.65it/s][A
160it [00:03, 48.65it/s][A
165it [00:03, 48.56it/s][A
170it [00:03, 48.53it/s][A
175it [00:03, 48.65it/s][A
180it [00:03, 48.77it/s][A
185it [00:03, 48.64it/s][A
190it [00:03, 48.72it/s][A
195it [00:04, 48.47it/s][A
200it [00:04, 48.63it/s][A
205it [00:04, 48.67it/s][A

Epoch: 18, Step: 200, Loss: 4.893946073055267



210it [00:04, 48.70it/s][A
215it [00:04, 48.37it/s][A
220it [00:04, 48.45it/s][A
227it [00:04, 48.44it/s]
  4%|▎         | 18/500 [01:56<48:16,  6.01s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.62it/s][A
10it [00:00, 48.37it/s][A
15it [00:00, 48.25it/s][A
20it [00:00, 48.09it/s][A
25it [00:00, 47.80it/s][A
30it [00:00, 47.92it/s][A
35it [00:00, 48.20it/s][A
40it [00:00, 48.26it/s][A
45it [00:00, 48.40it/s][A
50it [00:01, 48.54it/s][A
55it [00:01, 48.61it/s][A
60it [00:01, 48.50it/s][A
65it [00:01, 48.44it/s][A
70it [00:01, 48.29it/s][A
75it [00:01, 47.93it/s][A
80it [00:01, 47.96it/s][A
85it [00:01, 47.76it/s][A
90it [00:01, 47.94it/s][A
95it [00:01, 48.23it/s][A
100it [00:02, 48.32it/s][A
105it [00:02, 47.58it/s][A

Epoch: 19, Step: 100, Loss: 4.885489211082459



110it [00:02, 46.97it/s][A
115it [00:02, 47.07it/s][A
120it [00:02, 47.09it/s][A
125it [00:02, 47.12it/s][A
130it [00:02, 45.35it/s][A
135it [00:02, 45.99it/s][A
140it [00:02, 46.56it/s][A
145it [00:03, 46.88it/s][A
150it [00:03, 47.12it/s][A
155it [00:03, 46.93it/s][A
160it [00:03, 46.44it/s][A
165it [00:03, 46.39it/s][A
170it [00:03, 46.83it/s][A
175it [00:03, 47.31it/s][A
180it [00:03, 47.44it/s][A
185it [00:03, 47.49it/s][A
190it [00:03, 47.73it/s][A
195it [00:04, 47.83it/s][A
200it [00:04, 47.75it/s][A
205it [00:04, 47.63it/s][A

Epoch: 19, Step: 200, Loss: 4.892434005737305



210it [00:04, 47.21it/s][A
215it [00:04, 47.46it/s][A
220it [00:04, 47.90it/s][A
227it [00:04, 47.56it/s]
  4%|▍         | 19/500 [02:01<45:12,  5.64s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.65it/s][A
10it [00:00, 48.59it/s][A
15it [00:00, 48.70it/s][A
20it [00:00, 48.23it/s][A
25it [00:00, 48.08it/s][A
30it [00:00, 48.07it/s][A
35it [00:00, 47.74it/s][A
40it [00:00, 47.62it/s][A
45it [00:00, 47.90it/s][A
50it [00:01, 47.98it/s][A
55it [00:01, 48.00it/s][A
60it [00:01, 48.18it/s][A
65it [00:01, 48.25it/s][A
70it [00:01, 48.24it/s][A
75it [00:01, 48.36it/s][A
80it [00:01, 48.44it/s][A
85it [00:01, 48.35it/s][A
90it [00:01, 48.44it/s][A
95it [00:01, 48.47it/s][A
100it [00:02, 48.28it/s][A
105it [00:02, 48.16it/s][A

Epoch: 20, Step: 100, Loss: 4.872687845230103



110it [00:02, 48.01it/s][A
115it [00:02, 48.17it/s][A
120it [00:02, 48.40it/s][A
125it [00:02, 48.13it/s][A
130it [00:02, 48.15it/s][A
135it [00:02, 48.25it/s][A
140it [00:02, 48.42it/s][A
145it [00:03, 48.48it/s][A
150it [00:03, 48.20it/s][A
155it [00:03, 48.37it/s][A
160it [00:03, 48.46it/s][A
165it [00:03, 48.47it/s][A
170it [00:03, 48.61it/s][A
175it [00:03, 48.44it/s][A
180it [00:03, 48.49it/s][A
185it [00:03, 48.35it/s][A
190it [00:03, 48.38it/s][A
195it [00:04, 48.47it/s][A
200it [00:04, 48.36it/s][A
205it [00:04, 48.54it/s][A

Epoch: 20, Step: 200, Loss: 4.890772073268891



210it [00:04, 48.60it/s][A
215it [00:04, 48.62it/s][A
220it [00:04, 48.69it/s][A
227it [00:04, 48.31it/s]

0it [00:00, ?it/s][A
7it [00:00, 61.73it/s][A
14it [00:00, 62.13it/s][A
21it [00:00, 62.69it/s][A
28it [00:00, 62.90it/s][A
35it [00:00, 62.84it/s][A
42it [00:00, 63.08it/s][A
49it [00:00, 63.18it/s][A
56it [00:00, 62.91it/s][A
63it [00:01, 62.90it/s][A
70it [00:01, 62.92it/s][A
77it [00:01, 63.04it/s][A
84it [00:01, 63.20it/s][A
91it [00:01, 63.16it/s][A
98it [00:01, 63.15it/s][A
105it [00:01, 63.12it/s][A
112it [00:01, 63.10it/s][A
119it [00:01, 63.19it/s][A
126it [00:02, 63.15it/s][A
133it [00:02, 62.96it/s][A
140it [00:02, 63.02it/s][A
147it [00:02, 62.93it/s][A
154it [00:02, 62.86it/s][A
161it [00:02, 62.98it/s][A
168it [00:02, 63.07it/s][A
175it [00:02, 63.05it/s][A
182it [00:02, 63.07it/s][A
189it [00:03, 63.05it/s][A
196it [00:03, 62.96it/s][A
203it [00:03, 62.99it/s][A
210it [00:03, 63.07it/s][A
217it [00:03, 63.11it/s][A
224it [00:03, 


Epoch: 20, Test Loss: 5.4203291842656105, Test Perplexity: 226.5225746676048




0it [00:00, ?it/s][A
5it [00:00, 47.94it/s][A
10it [00:00, 48.23it/s][A
15it [00:00, 48.36it/s][A
20it [00:00, 48.46it/s][A
25it [00:00, 48.22it/s][A
30it [00:00, 48.39it/s][A
35it [00:00, 48.36it/s][A
40it [00:00, 48.36it/s][A
45it [00:00, 48.20it/s][A
50it [00:01, 48.27it/s][A
55it [00:01, 48.29it/s][A
60it [00:01, 48.44it/s][A
65it [00:01, 48.63it/s][A
70it [00:01, 48.74it/s][A
75it [00:01, 48.77it/s][A
80it [00:01, 48.78it/s][A
85it [00:01, 48.40it/s][A
90it [00:01, 48.37it/s][A
95it [00:01, 48.47it/s][A
100it [00:02, 48.50it/s][A
105it [00:02, 48.55it/s][A

Epoch: 21, Step: 100, Loss: 4.873905501365662



110it [00:02, 48.57it/s][A
115it [00:02, 48.61it/s][A
120it [00:02, 48.58it/s][A
125it [00:02, 48.65it/s][A
130it [00:02, 48.69it/s][A
135it [00:02, 48.63it/s][A
140it [00:02, 48.64it/s][A
145it [00:02, 48.60it/s][A
150it [00:03, 48.50it/s][A
155it [00:03, 48.68it/s][A
160it [00:03, 48.73it/s][A
165it [00:03, 48.58it/s][A
170it [00:03, 48.33it/s][A
175it [00:03, 48.12it/s][A
180it [00:03, 48.31it/s][A
185it [00:03, 48.29it/s][A
190it [00:03, 48.54it/s][A
195it [00:04, 48.36it/s][A
200it [00:04, 48.19it/s][A
205it [00:04, 48.02it/s][A

Epoch: 21, Step: 200, Loss: 4.884306688308715



210it [00:04, 48.26it/s][A
215it [00:04, 48.28it/s][A
220it [00:04, 48.15it/s][A
227it [00:04, 48.37it/s]
  4%|▍         | 21/500 [02:20<58:24,  7.32s/it]  
0it [00:00, ?it/s][A
5it [00:00, 47.99it/s][A
10it [00:00, 48.04it/s][A
15it [00:00, 48.25it/s][A
20it [00:00, 48.32it/s][A
25it [00:00, 47.37it/s][A
30it [00:00, 47.19it/s][A
35it [00:00, 47.08it/s][A
40it [00:00, 47.48it/s][A
45it [00:00, 47.55it/s][A
50it [00:01, 47.28it/s][A
55it [00:01, 45.90it/s][A
60it [00:01, 46.43it/s][A
65it [00:01, 46.30it/s][A
70it [00:01, 46.58it/s][A
75it [00:01, 46.66it/s][A
80it [00:01, 46.96it/s][A
85it [00:01, 45.86it/s][A
90it [00:01, 46.05it/s][A
95it [00:02, 46.47it/s][A
100it [00:02, 46.85it/s][A
105it [00:02, 47.27it/s][A

Epoch: 22, Step: 100, Loss: 4.86775845527649



110it [00:02, 47.27it/s][A
115it [00:02, 47.53it/s][A
120it [00:02, 47.95it/s][A
125it [00:02, 48.19it/s][A
130it [00:02, 48.27it/s][A
135it [00:02, 48.44it/s][A
140it [00:02, 48.31it/s][A
145it [00:03, 48.22it/s][A
150it [00:03, 48.38it/s][A
155it [00:03, 48.57it/s][A
160it [00:03, 48.59it/s][A
165it [00:03, 48.67it/s][A
170it [00:03, 48.73it/s][A
175it [00:03, 48.35it/s][A
180it [00:03, 48.22it/s][A
185it [00:03, 48.34it/s][A
190it [00:03, 48.24it/s][A
195it [00:04, 47.99it/s][A
200it [00:04, 47.75it/s][A
205it [00:04, 47.70it/s][A

Epoch: 22, Step: 200, Loss: 4.881404449939728



210it [00:04, 48.07it/s][A
215it [00:04, 48.29it/s][A
220it [00:04, 48.48it/s][A
227it [00:04, 47.65it/s]
  4%|▍         | 22/500 [02:25<52:11,  6.55s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.34it/s][A
10it [00:00, 48.25it/s][A
15it [00:00, 48.01it/s][A
20it [00:00, 48.24it/s][A
25it [00:00, 48.13it/s][A
30it [00:00, 48.16it/s][A
35it [00:00, 48.23it/s][A
40it [00:00, 48.27it/s][A
45it [00:00, 48.42it/s][A
50it [00:01, 48.55it/s][A
55it [00:01, 48.34it/s][A
60it [00:01, 48.34it/s][A
65it [00:01, 48.08it/s][A
70it [00:01, 48.31it/s][A
75it [00:01, 48.33it/s][A
80it [00:01, 48.25it/s][A
85it [00:01, 48.39it/s][A
90it [00:01, 48.40it/s][A
95it [00:01, 48.34it/s][A
100it [00:02, 48.23it/s][A
105it [00:02, 48.22it/s][A

Epoch: 23, Step: 100, Loss: 4.867558135986328



110it [00:02, 47.87it/s][A
115it [00:02, 47.97it/s][A
120it [00:02, 48.03it/s][A
125it [00:02, 48.13it/s][A
130it [00:02, 48.41it/s][A
135it [00:02, 48.57it/s][A
140it [00:02, 48.68it/s][A
145it [00:03, 48.73it/s][A
150it [00:03, 48.56it/s][A
155it [00:03, 48.28it/s][A
160it [00:03, 47.86it/s][A
165it [00:03, 47.61it/s][A
170it [00:03, 47.74it/s][A
175it [00:03, 47.86it/s][A
180it [00:03, 48.01it/s][A
185it [00:03, 48.11it/s][A
190it [00:03, 48.14it/s][A
195it [00:04, 48.11it/s][A
200it [00:04, 47.88it/s][A
205it [00:04, 47.81it/s][A

Epoch: 23, Step: 200, Loss: 4.880541515350342



210it [00:04, 47.60it/s][A
215it [00:04, 47.31it/s][A
220it [00:04, 47.14it/s][A
227it [00:04, 48.04it/s]
  5%|▍         | 23/500 [02:30<47:44,  6.00s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.95it/s][A
10it [00:00, 47.67it/s][A
15it [00:00, 48.12it/s][A
20it [00:00, 48.16it/s][A
25it [00:00, 47.82it/s][A
30it [00:00, 48.10it/s][A
35it [00:00, 48.27it/s][A
40it [00:00, 48.30it/s][A
45it [00:00, 48.14it/s][A
50it [00:01, 48.34it/s][A
55it [00:01, 48.50it/s][A
60it [00:01, 48.36it/s][A
65it [00:01, 48.19it/s][A
70it [00:01, 48.21it/s][A
75it [00:01, 48.09it/s][A
80it [00:01, 48.21it/s][A
85it [00:01, 48.05it/s][A
90it [00:01, 48.15it/s][A
95it [00:01, 47.91it/s][A
100it [00:02, 48.08it/s][A
105it [00:02, 48.03it/s][A

Epoch: 24, Step: 100, Loss: 4.854595060348511



110it [00:02, 48.16it/s][A
115it [00:02, 48.14it/s][A
120it [00:02, 47.42it/s][A
125it [00:02, 47.09it/s][A
130it [00:02, 46.56it/s][A
135it [00:02, 46.51it/s][A
140it [00:02, 46.86it/s][A
145it [00:03, 46.86it/s][A
150it [00:03, 47.14it/s][A
155it [00:03, 47.54it/s][A
160it [00:03, 47.79it/s][A
165it [00:03, 47.15it/s][A
170it [00:03, 47.16it/s][A
175it [00:03, 46.45it/s][A
180it [00:03, 46.07it/s][A
185it [00:03, 46.26it/s][A
190it [00:03, 46.82it/s][A
195it [00:04, 47.22it/s][A
200it [00:04, 47.57it/s][A
205it [00:04, 47.51it/s][A

Epoch: 24, Step: 200, Loss: 4.8768985033035275



210it [00:04, 47.80it/s][A
215it [00:04, 47.95it/s][A
220it [00:04, 48.07it/s][A
227it [00:04, 47.61it/s]
  5%|▍         | 24/500 [02:35<44:41,  5.63s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.73it/s][A
10it [00:00, 48.55it/s][A
15it [00:00, 48.68it/s][A
20it [00:00, 48.69it/s][A
25it [00:00, 48.58it/s][A
30it [00:00, 48.63it/s][A
35it [00:00, 48.65it/s][A
40it [00:00, 48.67it/s][A
45it [00:00, 48.81it/s][A
50it [00:01, 48.89it/s][A
55it [00:01, 48.64it/s][A
60it [00:01, 48.69it/s][A
65it [00:01, 48.75it/s][A
70it [00:01, 48.76it/s][A
75it [00:01, 48.57it/s][A
80it [00:01, 48.28it/s][A
85it [00:01, 47.99it/s][A
90it [00:01, 47.89it/s][A
95it [00:01, 47.82it/s][A
100it [00:02, 47.81it/s][A
105it [00:02, 47.88it/s][A

Epoch: 25, Step: 100, Loss: 4.8633404016494755



110it [00:02, 47.89it/s][A
115it [00:02, 48.00it/s][A
120it [00:02, 48.06it/s][A
125it [00:02, 48.06it/s][A
130it [00:02, 48.24it/s][A
135it [00:02, 48.33it/s][A
140it [00:02, 48.24it/s][A
145it [00:02, 48.32it/s][A
150it [00:03, 48.26it/s][A
155it [00:03, 48.45it/s][A
160it [00:03, 48.47it/s][A
165it [00:03, 48.56it/s][A
170it [00:03, 48.36it/s][A
175it [00:03, 48.36it/s][A
180it [00:03, 48.36it/s][A
185it [00:03, 48.45it/s][A
190it [00:03, 48.42it/s][A
195it [00:04, 48.60it/s][A
200it [00:04, 48.43it/s][A
205it [00:04, 48.59it/s][A

Epoch: 25, Step: 200, Loss: 4.873378837108612



210it [00:04, 48.61it/s][A
215it [00:04, 48.75it/s][A
220it [00:04, 48.85it/s][A
227it [00:04, 48.41it/s]

0it [00:00, ?it/s][A
7it [00:00, 61.20it/s][A
14it [00:00, 61.46it/s][A
21it [00:00, 61.84it/s][A
28it [00:00, 61.72it/s][A
35it [00:00, 61.87it/s][A
42it [00:00, 62.20it/s][A
49it [00:00, 62.30it/s][A
56it [00:00, 62.63it/s][A
63it [00:01, 62.60it/s][A
70it [00:01, 62.53it/s][A
77it [00:01, 62.77it/s][A
84it [00:01, 62.88it/s][A
91it [00:01, 62.60it/s][A
98it [00:01, 62.80it/s][A
105it [00:01, 62.98it/s][A
112it [00:01, 62.69it/s][A
119it [00:01, 62.91it/s][A
126it [00:02, 62.99it/s][A
133it [00:02, 62.97it/s][A
140it [00:02, 62.88it/s][A
147it [00:02, 63.02it/s][A
154it [00:02, 62.89it/s][A
161it [00:02, 62.79it/s][A
168it [00:02, 62.98it/s][A
175it [00:02, 63.01it/s][A
182it [00:02, 63.21it/s][A
189it [00:03, 63.06it/s][A
196it [00:03, 62.61it/s][A
203it [00:03, 62.73it/s][A
210it [00:03, 62.81it/s][A
217it [00:03, 62.71it/s][A
224it [00:03, 


Epoch: 25, Test Loss: 5.415561048880868, Test Perplexity: 225.5109957345524




0it [00:00, ?it/s][A
5it [00:00, 48.59it/s][A
10it [00:00, 48.66it/s][A
15it [00:00, 48.73it/s][A
20it [00:00, 48.54it/s][A
25it [00:00, 48.56it/s][A
30it [00:00, 48.68it/s][A
35it [00:00, 48.72it/s][A
40it [00:00, 48.81it/s][A
45it [00:00, 48.77it/s][A
50it [00:01, 48.72it/s][A
55it [00:01, 48.67it/s][A
60it [00:01, 48.56it/s][A
65it [00:01, 48.57it/s][A
70it [00:01, 48.55it/s][A
75it [00:01, 48.45it/s][A
80it [00:01, 48.37it/s][A
85it [00:01, 48.20it/s][A
90it [00:01, 48.40it/s][A
95it [00:01, 48.40it/s][A
100it [00:02, 48.57it/s][A
105it [00:02, 48.69it/s][A

Epoch: 26, Step: 100, Loss: 4.8549098300933835



110it [00:02, 48.62it/s][A
115it [00:02, 48.74it/s][A
120it [00:02, 48.82it/s][A
125it [00:02, 48.63it/s][A
130it [00:02, 48.69it/s][A
135it [00:02, 48.74it/s][A
140it [00:02, 48.72it/s][A
145it [00:02, 48.67it/s][A
150it [00:03, 48.80it/s][A
155it [00:03, 48.73it/s][A
160it [00:03, 48.75it/s][A
165it [00:03, 47.54it/s][A
170it [00:03, 47.83it/s][A
175it [00:03, 48.07it/s][A
180it [00:03, 48.15it/s][A
185it [00:03, 48.01it/s][A
190it [00:03, 48.05it/s][A
195it [00:04, 48.20it/s][A
200it [00:04, 48.14it/s][A
205it [00:04, 46.73it/s][A

Epoch: 26, Step: 200, Loss: 4.869188694953919



210it [00:04, 47.23it/s][A
215it [00:04, 47.63it/s][A
220it [00:04, 47.97it/s][A
227it [00:04, 48.33it/s]
  5%|▌         | 26/500 [02:54<57:49,  7.32s/it]  
0it [00:00, ?it/s][A
5it [00:00, 48.26it/s][A
10it [00:00, 48.12it/s][A
15it [00:00, 47.98it/s][A
20it [00:00, 48.23it/s][A
25it [00:00, 48.20it/s][A
30it [00:00, 48.27it/s][A
35it [00:00, 48.24it/s][A
40it [00:00, 47.28it/s][A
45it [00:00, 46.72it/s][A
50it [00:01, 46.60it/s][A
55it [00:01, 46.85it/s][A
60it [00:01, 47.21it/s][A
65it [00:01, 47.59it/s][A
70it [00:01, 47.54it/s][A
75it [00:01, 47.61it/s][A
80it [00:01, 47.66it/s][A
85it [00:01, 47.68it/s][A
90it [00:01, 47.27it/s][A
95it [00:02, 47.14it/s][A
100it [00:02, 46.70it/s][A
105it [00:02, 46.51it/s][A

Epoch: 27, Step: 100, Loss: 4.850432801246643



110it [00:02, 46.72it/s][A
115it [00:02, 47.17it/s][A
120it [00:02, 47.47it/s][A
125it [00:02, 47.76it/s][A
130it [00:02, 48.00it/s][A
135it [00:02, 48.08it/s][A
140it [00:02, 48.16it/s][A
145it [00:03, 48.25it/s][A
150it [00:03, 48.17it/s][A
155it [00:03, 48.24it/s][A
160it [00:03, 48.17it/s][A
165it [00:03, 47.98it/s][A
170it [00:03, 48.06it/s][A
175it [00:03, 48.16it/s][A
180it [00:03, 48.25it/s][A
185it [00:03, 48.28it/s][A
190it [00:03, 48.30it/s][A
195it [00:04, 48.34it/s][A
200it [00:04, 48.45it/s][A
205it [00:04, 48.50it/s][A

Epoch: 27, Step: 200, Loss: 4.865983426570892



210it [00:04, 48.58it/s][A
215it [00:04, 48.54it/s][A
220it [00:04, 48.55it/s][A
227it [00:04, 47.83it/s]
  5%|▌         | 27/500 [02:59<51:37,  6.55s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.79it/s][A
10it [00:00, 48.67it/s][A
15it [00:00, 48.02it/s][A
20it [00:00, 48.21it/s][A
25it [00:00, 48.37it/s][A
30it [00:00, 48.40it/s][A
35it [00:00, 48.43it/s][A
40it [00:00, 48.46it/s][A
45it [00:00, 48.56it/s][A
50it [00:01, 48.58it/s][A
55it [00:01, 48.45it/s][A
60it [00:01, 48.32it/s][A
65it [00:01, 48.30it/s][A
70it [00:01, 48.22it/s][A
75it [00:01, 48.26it/s][A
80it [00:01, 48.37it/s][A
85it [00:01, 48.37it/s][A
90it [00:01, 48.48it/s][A
95it [00:01, 48.52it/s][A
100it [00:02, 48.54it/s][A
105it [00:02, 48.61it/s][A

Epoch: 28, Step: 100, Loss: 4.843945899009705



110it [00:02, 48.47it/s][A
115it [00:02, 48.51it/s][A
120it [00:02, 48.46it/s][A
125it [00:02, 48.45it/s][A
130it [00:02, 48.37it/s][A
135it [00:02, 48.34it/s][A
140it [00:02, 48.45it/s][A
145it [00:02, 48.41it/s][A
150it [00:03, 48.16it/s][A
155it [00:03, 48.13it/s][A
160it [00:03, 48.04it/s][A
165it [00:03, 48.26it/s][A
170it [00:03, 48.27it/s][A
175it [00:03, 48.24it/s][A
180it [00:03, 48.39it/s][A
185it [00:03, 48.48it/s][A
190it [00:03, 48.61it/s][A
195it [00:04, 48.77it/s][A
200it [00:04, 48.80it/s][A
205it [00:04, 48.65it/s][A

Epoch: 28, Step: 200, Loss: 4.8627214431762695



210it [00:04, 48.18it/s][A
215it [00:04, 48.04it/s][A
220it [00:04, 47.84it/s][A
227it [00:04, 48.34it/s]
  6%|▌         | 28/500 [03:04<47:08,  5.99s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.27it/s][A
10it [00:00, 47.72it/s][A
15it [00:00, 48.17it/s][A
20it [00:00, 48.27it/s][A
25it [00:00, 48.41it/s][A
30it [00:00, 48.20it/s][A
35it [00:00, 48.16it/s][A
40it [00:00, 48.24it/s][A
45it [00:00, 48.33it/s][A
50it [00:01, 48.34it/s][A
55it [00:01, 48.43it/s][A
60it [00:01, 48.37it/s][A
65it [00:01, 48.37it/s][A
70it [00:01, 47.98it/s][A
75it [00:01, 47.87it/s][A
80it [00:01, 47.18it/s][A
85it [00:01, 47.11it/s][A
90it [00:01, 47.34it/s][A
95it [00:01, 47.73it/s][A
100it [00:02, 47.86it/s][A
105it [00:02, 48.08it/s][A

Epoch: 29, Step: 100, Loss: 4.850341081619263



110it [00:02, 48.25it/s][A
115it [00:02, 48.29it/s][A
120it [00:02, 48.34it/s][A
125it [00:02, 48.21it/s][A
130it [00:02, 48.06it/s][A
135it [00:02, 47.54it/s][A
140it [00:02, 47.22it/s][A
145it [00:03, 47.35it/s][A
150it [00:03, 47.32it/s][A
155it [00:03, 47.46it/s][A
160it [00:03, 47.54it/s][A
165it [00:03, 47.56it/s][A
170it [00:03, 47.56it/s][A
175it [00:03, 47.48it/s][A
180it [00:03, 46.83it/s][A
185it [00:03, 46.02it/s][A
190it [00:03, 45.79it/s][A
195it [00:04, 45.66it/s][A
200it [00:04, 45.95it/s][A
205it [00:04, 46.57it/s][A

Epoch: 29, Step: 200, Loss: 4.861494204998016



210it [00:04, 47.04it/s][A
215it [00:04, 47.55it/s][A
220it [00:04, 47.76it/s][A
227it [00:04, 47.59it/s]
  6%|▌         | 29/500 [03:08<44:10,  5.63s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.49it/s][A
10it [00:00, 48.37it/s][A
15it [00:00, 48.06it/s][A
20it [00:00, 48.01it/s][A
25it [00:00, 48.26it/s][A
30it [00:00, 48.24it/s][A
35it [00:00, 48.35it/s][A
40it [00:00, 48.48it/s][A
45it [00:00, 47.99it/s][A
50it [00:01, 47.79it/s][A
55it [00:01, 47.72it/s][A
60it [00:01, 47.74it/s][A
65it [00:01, 47.91it/s][A
70it [00:01, 48.05it/s][A
75it [00:01, 47.90it/s][A
80it [00:01, 47.71it/s][A
85it [00:01, 47.83it/s][A
90it [00:01, 48.04it/s][A
95it [00:01, 48.21it/s][A
100it [00:02, 48.33it/s][A
105it [00:02, 48.43it/s][A

Epoch: 30, Step: 100, Loss: 4.83866102218628



110it [00:02, 48.51it/s][A
115it [00:02, 48.34it/s][A
120it [00:02, 48.43it/s][A
125it [00:02, 48.55it/s][A
130it [00:02, 48.54it/s][A
135it [00:02, 48.37it/s][A
140it [00:02, 48.22it/s][A
145it [00:03, 48.25it/s][A
150it [00:03, 48.28it/s][A
155it [00:03, 48.48it/s][A
160it [00:03, 48.64it/s][A
165it [00:03, 48.68it/s][A
170it [00:03, 48.72it/s][A
175it [00:03, 48.57it/s][A
180it [00:03, 48.59it/s][A
185it [00:03, 48.33it/s][A
190it [00:03, 48.18it/s][A
195it [00:04, 48.03it/s][A
200it [00:04, 47.78it/s][A
205it [00:04, 47.73it/s][A

Epoch: 30, Step: 200, Loss: 4.859467678070068



210it [00:04, 47.60it/s][A
215it [00:04, 47.98it/s][A
220it [00:04, 48.33it/s][A
227it [00:04, 48.17it/s]

0it [00:00, ?it/s][A
7it [00:00, 61.63it/s][A
14it [00:00, 62.23it/s][A
21it [00:00, 62.71it/s][A
28it [00:00, 62.99it/s][A
35it [00:00, 63.11it/s][A
42it [00:00, 63.29it/s][A
49it [00:00, 63.24it/s][A
56it [00:00, 63.28it/s][A
63it [00:00, 63.30it/s][A
70it [00:01, 63.41it/s][A
77it [00:01, 63.46it/s][A
84it [00:01, 63.50it/s][A
91it [00:01, 63.50it/s][A
98it [00:01, 62.95it/s][A
105it [00:01, 62.92it/s][A
112it [00:01, 63.19it/s][A
119it [00:01, 63.29it/s][A
126it [00:01, 63.42it/s][A
133it [00:02, 63.24it/s][A
140it [00:02, 63.22it/s][A
147it [00:02, 63.27it/s][A
154it [00:02, 63.30it/s][A
161it [00:02, 63.20it/s][A
168it [00:02, 62.99it/s][A
175it [00:02, 62.99it/s][A
182it [00:02, 63.04it/s][A
189it [00:02, 63.11it/s][A
196it [00:03, 62.80it/s][A
203it [00:03, 62.88it/s][A
210it [00:03, 63.17it/s][A
217it [00:03, 63.13it/s][A
224it [00:03, 


Epoch: 30, Test Loss: 5.410739461087292, Test Perplexity: 224.47454554397868




0it [00:00, ?it/s][A
5it [00:00, 47.98it/s][A
10it [00:00, 47.80it/s][A
15it [00:00, 48.08it/s][A
20it [00:00, 48.05it/s][A
25it [00:00, 48.11it/s][A
30it [00:00, 48.16it/s][A
35it [00:00, 48.21it/s][A
40it [00:00, 48.10it/s][A
45it [00:00, 46.69it/s][A
50it [00:01, 46.83it/s][A
55it [00:01, 47.11it/s][A
60it [00:01, 45.81it/s][A
65it [00:01, 45.10it/s][A
70it [00:01, 45.13it/s][A
75it [00:01, 45.82it/s][A
80it [00:01, 46.10it/s][A
85it [00:01, 46.38it/s][A
90it [00:01, 46.38it/s][A
95it [00:02, 46.52it/s][A
100it [00:02, 46.24it/s][A
105it [00:02, 46.29it/s][A

Epoch: 31, Step: 100, Loss: 4.855305619239807



110it [00:02, 46.31it/s][A
115it [00:02, 46.86it/s][A
120it [00:02, 47.11it/s][A
125it [00:02, 47.42it/s][A
130it [00:02, 47.80it/s][A
135it [00:02, 47.90it/s][A
140it [00:02, 48.05it/s][A
145it [00:03, 48.14it/s][A
150it [00:03, 48.29it/s][A
155it [00:03, 48.31it/s][A
160it [00:03, 48.31it/s][A
165it [00:03, 48.41it/s][A
170it [00:03, 48.46it/s][A
175it [00:03, 48.53it/s][A
180it [00:03, 48.51it/s][A
185it [00:03, 48.62it/s][A
190it [00:04, 48.57it/s][A
195it [00:04, 48.53it/s][A
200it [00:04, 48.46it/s][A
205it [00:04, 48.51it/s][A

Epoch: 31, Step: 200, Loss: 4.855062630176544



210it [00:04, 48.22it/s][A
215it [00:04, 48.27it/s][A
220it [00:04, 48.23it/s][A
227it [00:04, 47.50it/s]
  6%|▌         | 31/500 [03:28<57:19,  7.33s/it]  
0it [00:00, ?it/s][A
5it [00:00, 47.98it/s][A
10it [00:00, 47.91it/s][A
15it [00:00, 48.38it/s][A
20it [00:00, 48.51it/s][A
25it [00:00, 48.63it/s][A
30it [00:00, 48.66it/s][A
35it [00:00, 48.81it/s][A
40it [00:00, 48.43it/s][A
45it [00:00, 48.33it/s][A
50it [00:01, 48.53it/s][A
55it [00:01, 47.92it/s][A
60it [00:01, 47.93it/s][A
65it [00:01, 47.67it/s][A
70it [00:01, 47.38it/s][A
75it [00:01, 47.72it/s][A
80it [00:01, 47.84it/s][A
85it [00:01, 47.98it/s][A
90it [00:01, 47.75it/s][A
95it [00:01, 47.68it/s][A
100it [00:02, 47.11it/s][A
105it [00:02, 47.16it/s][A

Epoch: 32, Step: 100, Loss: 4.8467328023910525



110it [00:02, 46.55it/s][A
115it [00:02, 46.76it/s][A
120it [00:02, 46.73it/s][A
125it [00:02, 47.19it/s][A
130it [00:02, 47.46it/s][A
135it [00:02, 47.34it/s][A
140it [00:02, 47.62it/s][A
145it [00:03, 47.86it/s][A
150it [00:03, 47.93it/s][A
155it [00:03, 48.19it/s][A
160it [00:03, 48.33it/s][A
165it [00:03, 48.39it/s][A
170it [00:03, 48.42it/s][A
175it [00:03, 48.50it/s][A
180it [00:03, 48.21it/s][A
185it [00:03, 47.90it/s][A
190it [00:03, 47.82it/s][A
195it [00:04, 47.62it/s][A
200it [00:04, 47.74it/s][A
205it [00:04, 47.64it/s][A

Epoch: 32, Step: 200, Loss: 4.853377377986908



210it [00:04, 47.98it/s][A
215it [00:04, 48.01it/s][A
220it [00:04, 47.82it/s][A
227it [00:04, 47.85it/s]
  6%|▋         | 32/500 [03:33<51:09,  6.56s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.69it/s][A
10it [00:00, 48.59it/s][A
15it [00:00, 48.18it/s][A
20it [00:00, 48.20it/s][A
25it [00:00, 47.90it/s][A
30it [00:00, 48.06it/s][A
35it [00:00, 47.70it/s][A
40it [00:00, 47.89it/s][A
45it [00:00, 47.90it/s][A
50it [00:01, 48.12it/s][A
55it [00:01, 48.15it/s][A
60it [00:01, 48.26it/s][A
65it [00:01, 48.24it/s][A
70it [00:01, 48.20it/s][A
75it [00:01, 48.34it/s][A
80it [00:01, 48.50it/s][A
85it [00:01, 48.63it/s][A
90it [00:01, 48.73it/s][A
95it [00:01, 48.87it/s][A
100it [00:02, 48.89it/s][A
105it [00:02, 48.77it/s][A

Epoch: 33, Step: 100, Loss: 4.837917218208313



110it [00:02, 48.17it/s][A
115it [00:02, 48.32it/s][A
120it [00:02, 48.21it/s][A
125it [00:02, 48.44it/s][A
130it [00:02, 48.28it/s][A
135it [00:02, 47.99it/s][A
140it [00:02, 48.18it/s][A
145it [00:03, 48.45it/s][A
150it [00:03, 48.62it/s][A
155it [00:03, 48.67it/s][A
160it [00:03, 48.27it/s][A
165it [00:03, 48.51it/s][A
170it [00:03, 48.72it/s][A
175it [00:03, 48.82it/s][A
180it [00:03, 48.92it/s][A
185it [00:03, 48.83it/s][A
190it [00:03, 48.87it/s][A
195it [00:04, 48.91it/s][A
200it [00:04, 48.99it/s][A
205it [00:04, 48.86it/s][A

Epoch: 33, Step: 200, Loss: 4.850223326683045



210it [00:04, 48.63it/s][A
215it [00:04, 48.37it/s][A
220it [00:04, 48.44it/s][A
227it [00:04, 48.43it/s]
  7%|▋         | 33/500 [03:38<46:41,  6.00s/it]
0it [00:00, ?it/s][A
5it [00:00, 49.07it/s][A
10it [00:00, 48.85it/s][A
15it [00:00, 48.80it/s][A
20it [00:00, 48.84it/s][A
25it [00:00, 48.84it/s][A
30it [00:00, 47.91it/s][A
35it [00:00, 47.95it/s][A
40it [00:00, 47.86it/s][A
45it [00:00, 48.22it/s][A
50it [00:01, 47.92it/s][A
55it [00:01, 48.17it/s][A
60it [00:01, 48.23it/s][A
65it [00:01, 48.16it/s][A
70it [00:01, 48.32it/s][A
75it [00:01, 48.33it/s][A
80it [00:01, 48.46it/s][A
85it [00:01, 48.32it/s][A
90it [00:01, 48.48it/s][A
95it [00:01, 48.44it/s][A
100it [00:02, 48.38it/s][A
105it [00:02, 48.48it/s][A

Epoch: 34, Step: 100, Loss: 4.838865566253662



110it [00:02, 48.44it/s][A
115it [00:02, 48.60it/s][A
120it [00:02, 48.68it/s][A
125it [00:02, 48.17it/s][A
130it [00:02, 47.95it/s][A
135it [00:02, 48.10it/s][A
140it [00:02, 48.33it/s][A
145it [00:02, 48.48it/s][A
150it [00:03, 47.67it/s][A
155it [00:03, 47.37it/s][A
160it [00:03, 47.65it/s][A
165it [00:03, 47.96it/s][A
170it [00:03, 48.08it/s][A
175it [00:03, 47.92it/s][A
180it [00:03, 47.87it/s][A
185it [00:03, 47.58it/s][A
190it [00:03, 47.85it/s][A
195it [00:04, 47.28it/s][A
200it [00:04, 47.24it/s][A
205it [00:04, 47.63it/s][A

Epoch: 34, Step: 200, Loss: 4.847582662105561



210it [00:04, 47.52it/s][A
215it [00:04, 47.38it/s][A
220it [00:04, 47.75it/s][A
227it [00:04, 48.06it/s]
  7%|▋         | 34/500 [03:42<43:37,  5.62s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.64it/s][A
10it [00:00, 48.31it/s][A
15it [00:00, 48.22it/s][A
20it [00:00, 48.37it/s][A
25it [00:00, 48.45it/s][A
30it [00:00, 48.55it/s][A
35it [00:00, 48.65it/s][A
40it [00:00, 47.87it/s][A
45it [00:00, 47.80it/s][A
50it [00:01, 48.09it/s][A
55it [00:01, 48.24it/s][A
60it [00:01, 48.35it/s][A
65it [00:01, 48.36it/s][A
70it [00:01, 48.50it/s][A
75it [00:01, 48.59it/s][A
80it [00:01, 48.37it/s][A
85it [00:01, 48.04it/s][A
90it [00:01, 47.66it/s][A
95it [00:01, 47.73it/s][A
100it [00:02, 47.72it/s][A
105it [00:02, 47.77it/s][A

Epoch: 35, Step: 100, Loss: 4.836358461380005



110it [00:02, 47.66it/s][A
115it [00:02, 47.51it/s][A
120it [00:02, 46.95it/s][A
125it [00:02, 47.33it/s][A
130it [00:02, 47.52it/s][A
135it [00:02, 47.89it/s][A
140it [00:02, 48.03it/s][A
145it [00:03, 48.01it/s][A
150it [00:03, 48.06it/s][A
155it [00:03, 48.05it/s][A
160it [00:03, 48.12it/s][A
165it [00:03, 48.15it/s][A
170it [00:03, 47.87it/s][A
175it [00:03, 47.78it/s][A
180it [00:03, 47.95it/s][A
185it [00:03, 47.78it/s][A
190it [00:03, 47.86it/s][A
195it [00:04, 48.01it/s][A
200it [00:04, 48.18it/s][A
205it [00:04, 48.47it/s][A

Epoch: 35, Step: 200, Loss: 4.846602523326874



210it [00:04, 48.47it/s][A
215it [00:04, 48.65it/s][A
220it [00:04, 48.68it/s][A
227it [00:04, 48.06it/s]

0it [00:00, ?it/s][A
7it [00:00, 61.41it/s][A
14it [00:00, 61.55it/s][A
21it [00:00, 62.38it/s][A
28it [00:00, 62.69it/s][A
35it [00:00, 62.82it/s][A
42it [00:00, 62.98it/s][A
49it [00:00, 63.11it/s][A
56it [00:00, 63.21it/s][A
63it [00:01, 63.18it/s][A
70it [00:01, 62.58it/s][A
77it [00:01, 62.53it/s][A
84it [00:01, 62.80it/s][A
91it [00:01, 62.74it/s][A
98it [00:01, 62.52it/s][A
105it [00:01, 62.77it/s][A
112it [00:01, 62.98it/s][A
119it [00:01, 63.15it/s][A
126it [00:02, 63.22it/s][A
133it [00:02, 63.21it/s][A
140it [00:02, 63.26it/s][A
147it [00:02, 62.81it/s][A
154it [00:02, 62.79it/s][A
161it [00:02, 62.95it/s][A
168it [00:02, 63.05it/s][A
175it [00:02, 62.90it/s][A
182it [00:02, 63.07it/s][A
189it [00:03, 63.15it/s][A
196it [00:03, 63.20it/s][A
203it [00:03, 63.09it/s][A
210it [00:03, 63.05it/s][A
217it [00:03, 63.04it/s][A
224it [00:03, 


Epoch: 35, Test Loss: 5.417065724082615, Test Perplexity: 225.8927239838594




0it [00:00, ?it/s][A
5it [00:00, 47.94it/s][A
10it [00:00, 48.10it/s][A
15it [00:00, 48.18it/s][A
20it [00:00, 48.24it/s][A
25it [00:00, 48.12it/s][A
30it [00:00, 48.03it/s][A
35it [00:00, 47.89it/s][A
40it [00:00, 48.04it/s][A
45it [00:00, 48.01it/s][A
50it [00:01, 47.91it/s][A
55it [00:01, 48.09it/s][A
60it [00:01, 48.04it/s][A
65it [00:01, 48.26it/s][A
70it [00:01, 48.00it/s][A
75it [00:01, 48.09it/s][A
80it [00:01, 48.26it/s][A
85it [00:01, 48.45it/s][A
90it [00:01, 48.63it/s][A
95it [00:01, 48.60it/s][A
100it [00:02, 48.69it/s][A
105it [00:02, 48.65it/s][A

Epoch: 36, Step: 100, Loss: 4.8377819871902465



110it [00:02, 48.55it/s][A
115it [00:02, 48.43it/s][A
120it [00:02, 48.51it/s][A
125it [00:02, 48.59it/s][A
130it [00:02, 48.40it/s][A
135it [00:02, 48.47it/s][A
140it [00:02, 48.11it/s][A
145it [00:03, 48.24it/s][A
150it [00:03, 48.42it/s][A
155it [00:03, 48.39it/s][A
160it [00:03, 48.38it/s][A
165it [00:03, 48.39it/s][A
170it [00:03, 48.12it/s][A
175it [00:03, 48.28it/s][A
180it [00:03, 48.44it/s][A
185it [00:03, 48.46it/s][A
190it [00:03, 48.41it/s][A
195it [00:04, 48.52it/s][A
200it [00:04, 48.66it/s][A
205it [00:04, 48.78it/s][A

Epoch: 36, Step: 200, Loss: 4.844729948043823



210it [00:04, 48.49it/s][A
215it [00:04, 48.55it/s][A
220it [00:04, 48.54it/s][A
227it [00:04, 48.33it/s]
  7%|▋         | 36/500 [04:02<56:36,  7.32s/it]  
0it [00:00, ?it/s][A
5it [00:00, 49.42it/s][A
10it [00:00, 48.07it/s][A
15it [00:00, 48.60it/s][A
20it [00:00, 48.65it/s][A
25it [00:00, 48.75it/s][A
30it [00:00, 48.45it/s][A
35it [00:00, 48.41it/s][A
40it [00:00, 48.26it/s][A
45it [00:00, 48.32it/s][A
50it [00:01, 48.37it/s][A
55it [00:01, 48.21it/s][A
60it [00:01, 48.33it/s][A
65it [00:01, 48.31it/s][A
70it [00:01, 47.99it/s][A
75it [00:01, 47.38it/s][A
80it [00:01, 47.28it/s][A
85it [00:01, 47.55it/s][A
90it [00:01, 47.85it/s][A
95it [00:01, 48.04it/s][A
100it [00:02, 48.21it/s][A
105it [00:02, 48.37it/s][A

Epoch: 37, Step: 100, Loss: 4.830890641212464



110it [00:02, 48.20it/s][A
115it [00:02, 48.30it/s][A
120it [00:02, 48.33it/s][A
125it [00:02, 48.12it/s][A
130it [00:02, 47.81it/s][A
135it [00:02, 47.46it/s][A
140it [00:02, 47.64it/s][A
145it [00:03, 47.57it/s][A
150it [00:03, 47.89it/s][A
155it [00:03, 48.05it/s][A
160it [00:03, 48.30it/s][A
165it [00:03, 48.29it/s][A
170it [00:03, 47.92it/s][A
175it [00:03, 47.99it/s][A
180it [00:03, 47.89it/s][A
185it [00:03, 48.09it/s][A
190it [00:03, 48.23it/s][A
195it [00:04, 48.31it/s][A
200it [00:04, 48.05it/s][A
205it [00:04, 47.91it/s][A

Epoch: 37, Step: 200, Loss: 4.8409698414802556



210it [00:04, 48.04it/s][A
215it [00:04, 48.23it/s][A
220it [00:04, 48.24it/s][A
227it [00:04, 48.09it/s]
  7%|▋         | 37/500 [04:07<50:28,  6.54s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.12it/s][A
10it [00:00, 48.13it/s][A
15it [00:00, 48.13it/s][A
20it [00:00, 47.67it/s][A
25it [00:00, 47.37it/s][A
30it [00:00, 47.78it/s][A
35it [00:00, 48.07it/s][A
40it [00:00, 48.18it/s][A
45it [00:00, 48.36it/s][A
50it [00:01, 48.39it/s][A
55it [00:01, 48.18it/s][A
60it [00:01, 48.25it/s][A
65it [00:01, 47.76it/s][A
70it [00:01, 47.66it/s][A
75it [00:01, 47.87it/s][A
80it [00:01, 48.12it/s][A
85it [00:01, 48.43it/s][A
90it [00:01, 48.41it/s][A
95it [00:01, 48.01it/s][A
100it [00:02, 47.86it/s][A
105it [00:02, 47.90it/s][A

Epoch: 38, Step: 100, Loss: 4.8252650213241575



110it [00:02, 47.91it/s][A
115it [00:02, 47.94it/s][A
120it [00:02, 47.79it/s][A
125it [00:02, 47.94it/s][A
130it [00:02, 47.99it/s][A
135it [00:02, 48.20it/s][A
140it [00:02, 47.83it/s][A
145it [00:03, 47.95it/s][A
150it [00:03, 48.07it/s][A
155it [00:03, 47.79it/s][A
160it [00:03, 47.92it/s][A
165it [00:03, 48.18it/s][A
170it [00:03, 47.93it/s][A
175it [00:03, 48.22it/s][A
180it [00:03, 48.48it/s][A
185it [00:03, 48.35it/s][A
190it [00:03, 48.04it/s][A
195it [00:04, 48.10it/s][A
200it [00:04, 48.14it/s][A
205it [00:04, 48.25it/s][A

Epoch: 38, Step: 200, Loss: 4.837960326671601



210it [00:04, 48.29it/s][A
215it [00:04, 48.35it/s][A
220it [00:04, 48.49it/s][A
227it [00:04, 48.06it/s]
  8%|▊         | 38/500 [04:12<46:10,  6.00s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.75it/s][A
10it [00:00, 48.46it/s][A
15it [00:00, 48.37it/s][A
20it [00:00, 48.35it/s][A
25it [00:00, 48.51it/s][A
30it [00:00, 48.54it/s][A
35it [00:00, 48.47it/s][A
40it [00:00, 48.32it/s][A
45it [00:00, 48.37it/s][A
50it [00:01, 48.58it/s][A
55it [00:01, 48.44it/s][A
60it [00:01, 48.60it/s][A
65it [00:01, 48.71it/s][A
70it [00:01, 48.61it/s][A
75it [00:01, 48.66it/s][A
80it [00:01, 48.69it/s][A
85it [00:01, 48.47it/s][A
90it [00:01, 48.48it/s][A
95it [00:01, 48.33it/s][A
100it [00:02, 48.42it/s][A
105it [00:02, 48.31it/s][A

Epoch: 39, Step: 100, Loss: 4.8237970924377445



110it [00:02, 48.58it/s][A
115it [00:02, 48.38it/s][A
120it [00:02, 48.24it/s][A
125it [00:02, 47.89it/s][A
130it [00:02, 48.07it/s][A
135it [00:02, 48.29it/s][A
140it [00:02, 48.42it/s][A
145it [00:02, 48.62it/s][A
150it [00:03, 48.72it/s][A
155it [00:03, 48.44it/s][A
160it [00:03, 47.89it/s][A
165it [00:03, 47.11it/s][A
170it [00:03, 46.75it/s][A
175it [00:03, 46.48it/s][A
180it [00:03, 46.90it/s][A
185it [00:03, 47.51it/s][A
190it [00:03, 45.90it/s][A
195it [00:04, 46.28it/s][A
200it [00:04, 46.92it/s][A
205it [00:04, 47.58it/s][A

Epoch: 39, Step: 200, Loss: 4.8339360809326175



210it [00:04, 47.74it/s][A
215it [00:04, 47.37it/s][A
220it [00:04, 47.47it/s][A
227it [00:04, 47.92it/s]
  8%|▊         | 39/500 [04:16<43:10,  5.62s/it]
0it [00:00, ?it/s][A
5it [00:00, 47.95it/s][A
10it [00:00, 48.06it/s][A
15it [00:00, 48.43it/s][A
20it [00:00, 48.53it/s][A
25it [00:00, 48.74it/s][A
30it [00:00, 48.42it/s][A
35it [00:00, 48.53it/s][A
40it [00:00, 48.52it/s][A
45it [00:00, 48.38it/s][A
50it [00:01, 48.24it/s][A
55it [00:01, 48.29it/s][A
60it [00:01, 48.44it/s][A
65it [00:01, 48.55it/s][A
70it [00:01, 48.41it/s][A
75it [00:01, 48.31it/s][A
80it [00:01, 48.49it/s][A
85it [00:01, 48.48it/s][A
90it [00:01, 48.36it/s][A
95it [00:01, 48.50it/s][A
100it [00:02, 48.01it/s][A
105it [00:02, 48.22it/s][A

Epoch: 40, Step: 100, Loss: 4.819097127914429



110it [00:02, 48.37it/s][A
115it [00:02, 48.48it/s][A
120it [00:02, 48.31it/s][A
125it [00:02, 48.48it/s][A
130it [00:02, 48.53it/s][A
135it [00:02, 48.41it/s][A
140it [00:02, 48.19it/s][A
145it [00:02, 48.11it/s][A
150it [00:03, 48.25it/s][A
155it [00:03, 48.36it/s][A
160it [00:03, 48.44it/s][A
165it [00:03, 48.35it/s][A
170it [00:03, 48.26it/s][A
175it [00:03, 48.24it/s][A
180it [00:03, 48.33it/s][A
185it [00:03, 48.32it/s][A
190it [00:03, 48.25it/s][A
195it [00:04, 48.04it/s][A
200it [00:04, 47.92it/s][A
205it [00:04, 48.08it/s][A

Epoch: 40, Step: 200, Loss: 4.832674894332886



210it [00:04, 48.17it/s][A
215it [00:04, 48.37it/s][A
220it [00:04, 48.53it/s][A
227it [00:04, 48.33it/s]

0it [00:00, ?it/s][A
7it [00:00, 60.94it/s][A
14it [00:00, 61.21it/s][A
21it [00:00, 61.07it/s][A
28it [00:00, 61.41it/s][A
35it [00:00, 61.75it/s][A
42it [00:00, 61.50it/s][A
49it [00:00, 61.77it/s][A
56it [00:00, 62.06it/s][A
63it [00:01, 62.24it/s][A
70it [00:01, 62.41it/s][A
77it [00:01, 62.48it/s][A
84it [00:01, 62.26it/s][A
91it [00:01, 62.46it/s][A
98it [00:01, 62.75it/s][A
105it [00:01, 62.86it/s][A
112it [00:01, 62.99it/s][A
119it [00:01, 63.07it/s][A
126it [00:02, 63.09it/s][A
133it [00:02, 63.06it/s][A
140it [00:02, 63.09it/s][A
147it [00:02, 63.10it/s][A
154it [00:02, 63.04it/s][A
161it [00:02, 63.11it/s][A
168it [00:02, 63.15it/s][A
175it [00:02, 63.16it/s][A
182it [00:02, 62.99it/s][A
189it [00:03, 63.11it/s][A
196it [00:03, 63.03it/s][A
203it [00:03, 62.52it/s][A
210it [00:03, 62.48it/s][A
217it [00:03, 62.61it/s][A
224it [00:03, 


Epoch: 40, Test Loss: 5.412922385316458, Test Perplexity: 224.91920087352304




0it [00:00, ?it/s][A
5it [00:00, 42.01it/s][A
10it [00:00, 43.44it/s][A
15it [00:00, 43.30it/s][A
20it [00:00, 43.60it/s][A
25it [00:00, 43.69it/s][A
30it [00:00, 43.86it/s][A
35it [00:00, 44.06it/s][A
40it [00:00, 45.17it/s][A
45it [00:01, 46.11it/s][A
50it [00:01, 46.67it/s][A
55it [00:01, 47.35it/s][A
60it [00:01, 47.95it/s][A
65it [00:01, 48.05it/s][A
70it [00:01, 48.17it/s][A
75it [00:01, 48.18it/s][A
80it [00:01, 47.96it/s][A
85it [00:01, 48.09it/s][A
90it [00:01, 48.32it/s][A
95it [00:02, 48.52it/s][A
100it [00:02, 48.33it/s][A
105it [00:02, 48.50it/s][A

Epoch: 41, Step: 100, Loss: 4.8178127479553225



110it [00:02, 48.26it/s][A
115it [00:02, 48.30it/s][A
120it [00:02, 48.30it/s][A
125it [00:02, 48.21it/s][A
130it [00:02, 48.11it/s][A
135it [00:02, 47.97it/s][A
140it [00:02, 47.75it/s][A
145it [00:03, 47.57it/s][A
150it [00:03, 47.63it/s][A
155it [00:03, 47.86it/s][A
160it [00:03, 48.13it/s][A
165it [00:03, 48.31it/s][A
170it [00:03, 48.29it/s][A
175it [00:03, 48.34it/s][A
180it [00:03, 48.49it/s][A
185it [00:03, 48.41it/s][A
190it [00:04, 48.46it/s][A
195it [00:04, 48.52it/s][A
200it [00:04, 48.54it/s][A
205it [00:04, 48.53it/s][A

Epoch: 41, Step: 200, Loss: 4.826229286193848



210it [00:04, 48.35it/s][A
215it [00:04, 48.41it/s][A
220it [00:04, 48.47it/s][A
227it [00:04, 47.52it/s]
  8%|▊         | 41/500 [04:36<56:07,  7.34s/it]  
0it [00:00, ?it/s][A
5it [00:00, 48.96it/s][A
10it [00:00, 48.81it/s][A
15it [00:00, 48.44it/s][A
20it [00:00, 48.26it/s][A
25it [00:00, 48.32it/s][A
30it [00:00, 48.44it/s][A
35it [00:00, 48.46it/s][A
40it [00:00, 48.08it/s][A
45it [00:00, 48.28it/s][A
50it [00:01, 48.33it/s][A
55it [00:01, 48.19it/s][A
60it [00:01, 48.10it/s][A
65it [00:01, 48.02it/s][A
70it [00:01, 48.16it/s][A
75it [00:01, 48.20it/s][A
80it [00:01, 48.44it/s][A
85it [00:01, 47.39it/s][A
90it [00:01, 47.12it/s][A
95it [00:01, 46.96it/s][A
100it [00:02, 47.38it/s][A
105it [00:02, 47.68it/s][A

Epoch: 42, Step: 100, Loss: 4.817641735076904



110it [00:02, 47.71it/s][A
115it [00:02, 47.81it/s][A
120it [00:02, 47.97it/s][A
125it [00:02, 48.08it/s][A
130it [00:02, 48.09it/s][A
135it [00:02, 47.96it/s][A
140it [00:02, 47.96it/s][A
145it [00:03, 47.74it/s][A
150it [00:03, 47.39it/s][A
155it [00:03, 47.71it/s][A
160it [00:03, 47.92it/s][A
165it [00:03, 48.08it/s][A
170it [00:03, 47.89it/s][A
175it [00:03, 48.01it/s][A
180it [00:03, 48.11it/s][A
185it [00:03, 48.06it/s][A
190it [00:03, 47.96it/s][A
195it [00:04, 47.83it/s][A
200it [00:04, 47.94it/s][A
205it [00:04, 48.04it/s][A

Epoch: 42, Step: 200, Loss: 4.827890293598175



210it [00:04, 48.06it/s][A
215it [00:04, 47.99it/s][A
220it [00:04, 48.03it/s][A
227it [00:04, 47.96it/s]
  8%|▊         | 42/500 [04:41<50:02,  6.56s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.40it/s][A
10it [00:00, 48.45it/s][A
15it [00:00, 48.27it/s][A
20it [00:00, 48.42it/s][A
25it [00:00, 48.17it/s][A
30it [00:00, 48.00it/s][A
35it [00:00, 48.24it/s][A
40it [00:00, 48.29it/s][A
45it [00:00, 48.29it/s][A
50it [00:01, 48.40it/s][A
55it [00:01, 48.36it/s][A
60it [00:01, 48.37it/s][A
65it [00:01, 48.05it/s][A
70it [00:01, 47.87it/s][A
75it [00:01, 47.49it/s][A
80it [00:01, 47.81it/s][A
85it [00:01, 48.09it/s][A
90it [00:01, 47.96it/s][A
95it [00:01, 47.72it/s][A
100it [00:02, 47.74it/s][A
105it [00:02, 47.69it/s][A

Epoch: 43, Step: 100, Loss: 4.806503281593323



110it [00:02, 47.71it/s][A
115it [00:02, 47.59it/s][A
120it [00:02, 47.83it/s][A
125it [00:02, 47.94it/s][A
130it [00:02, 48.08it/s][A
135it [00:02, 47.95it/s][A
140it [00:02, 47.61it/s][A
145it [00:03, 47.86it/s][A
150it [00:03, 47.98it/s][A
155it [00:03, 47.94it/s][A
160it [00:03, 47.55it/s][A
165it [00:03, 47.62it/s][A
170it [00:03, 47.44it/s][A
175it [00:03, 47.44it/s][A
180it [00:03, 47.55it/s][A
185it [00:03, 47.65it/s][A
190it [00:03, 47.73it/s][A
195it [00:04, 47.88it/s][A
200it [00:04, 47.49it/s][A
205it [00:04, 46.79it/s][A

Epoch: 43, Step: 200, Loss: 4.8233221864700315



210it [00:04, 46.83it/s][A
215it [00:04, 47.02it/s][A
220it [00:04, 47.18it/s][A
227it [00:04, 47.74it/s]
  9%|▊         | 43/500 [04:46<45:49,  6.02s/it]
0it [00:00, ?it/s][A
5it [00:00, 47.50it/s][A
10it [00:00, 47.13it/s][A
15it [00:00, 47.52it/s][A
20it [00:00, 47.88it/s][A
25it [00:00, 47.36it/s][A
30it [00:00, 47.16it/s][A
35it [00:00, 46.52it/s][A
40it [00:00, 46.80it/s][A
45it [00:00, 46.64it/s][A
50it [00:01, 47.01it/s][A
55it [00:01, 47.23it/s][A
60it [00:01, 47.07it/s][A
65it [00:01, 47.18it/s][A
70it [00:01, 47.48it/s][A
75it [00:01, 47.76it/s][A
80it [00:01, 48.14it/s][A
85it [00:01, 48.39it/s][A
90it [00:01, 48.55it/s][A
95it [00:01, 48.74it/s][A
100it [00:02, 48.45it/s][A
105it [00:02, 48.17it/s][A

Epoch: 44, Step: 100, Loss: 4.813006820678711



110it [00:02, 48.11it/s][A
115it [00:02, 48.16it/s][A
120it [00:02, 48.25it/s][A
125it [00:02, 48.17it/s][A
130it [00:02, 48.22it/s][A
135it [00:02, 48.00it/s][A
140it [00:02, 48.12it/s][A
145it [00:03, 47.94it/s][A
150it [00:03, 47.58it/s][A
155it [00:03, 47.80it/s][A
160it [00:03, 47.75it/s][A
165it [00:03, 47.85it/s][A
170it [00:03, 47.86it/s][A
175it [00:03, 46.99it/s][A
180it [00:03, 46.57it/s][A
185it [00:03, 46.57it/s][A
190it [00:03, 46.89it/s][A
195it [00:04, 47.05it/s][A
200it [00:04, 47.25it/s][A
205it [00:04, 47.38it/s][A

Epoch: 44, Step: 200, Loss: 4.82508670091629



210it [00:04, 47.29it/s][A
215it [00:04, 47.57it/s][A
220it [00:04, 47.38it/s][A
227it [00:04, 47.51it/s]
  9%|▉         | 44/500 [04:50<42:54,  5.65s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.87it/s][A
10it [00:00, 45.89it/s][A
15it [00:00, 46.65it/s][A
20it [00:00, 47.33it/s][A
25it [00:00, 47.67it/s][A
30it [00:00, 47.93it/s][A
35it [00:00, 48.09it/s][A
40it [00:00, 48.07it/s][A
45it [00:00, 48.08it/s][A
50it [00:01, 48.04it/s][A
55it [00:01, 48.09it/s][A
60it [00:01, 48.13it/s][A
65it [00:01, 48.10it/s][A
70it [00:01, 48.12it/s][A
75it [00:01, 48.05it/s][A
80it [00:01, 47.68it/s][A
85it [00:01, 47.80it/s][A
90it [00:01, 47.95it/s][A
95it [00:01, 48.08it/s][A
100it [00:02, 48.24it/s][A
105it [00:02, 48.36it/s][A

Epoch: 45, Step: 100, Loss: 4.804991130828857



110it [00:02, 48.10it/s][A
115it [00:02, 48.24it/s][A
120it [00:02, 48.29it/s][A
125it [00:02, 48.25it/s][A
130it [00:02, 48.35it/s][A
135it [00:02, 48.37it/s][A
140it [00:02, 48.40it/s][A
145it [00:03, 48.50it/s][A
150it [00:03, 48.61it/s][A
155it [00:03, 48.54it/s][A
160it [00:03, 48.38it/s][A
165it [00:03, 48.32it/s][A
170it [00:03, 48.22it/s][A
175it [00:03, 48.19it/s][A
180it [00:03, 48.20it/s][A
185it [00:03, 48.20it/s][A
190it [00:03, 48.19it/s][A
195it [00:04, 48.21it/s][A
200it [00:04, 48.14it/s][A
205it [00:04, 48.07it/s][A

Epoch: 45, Step: 200, Loss: 4.817917468547821



210it [00:04, 48.08it/s][A
215it [00:04, 48.19it/s][A
220it [00:04, 47.84it/s][A
227it [00:04, 48.00it/s]

0it [00:00, ?it/s][A
7it [00:00, 60.93it/s][A
14it [00:00, 61.57it/s][A
21it [00:00, 61.81it/s][A
28it [00:00, 62.59it/s][A
35it [00:00, 63.01it/s][A
42it [00:00, 62.80it/s][A
49it [00:00, 62.29it/s][A
56it [00:00, 62.61it/s][A
63it [00:01, 62.73it/s][A
70it [00:01, 63.00it/s][A
77it [00:01, 62.55it/s][A
84it [00:01, 62.42it/s][A
91it [00:01, 62.49it/s][A
98it [00:01, 62.17it/s][A
105it [00:01, 62.04it/s][A
112it [00:01, 61.80it/s][A
119it [00:01, 61.92it/s][A
126it [00:02, 61.59it/s][A
133it [00:02, 61.69it/s][A
140it [00:02, 62.01it/s][A
147it [00:02, 62.07it/s][A
154it [00:02, 62.46it/s][A
161it [00:02, 62.68it/s][A
168it [00:02, 62.01it/s][A
175it [00:02, 62.36it/s][A
182it [00:02, 62.50it/s][A
189it [00:03, 62.52it/s][A
196it [00:03, 62.52it/s][A
203it [00:03, 61.77it/s][A
210it [00:03, 61.66it/s][A
217it [00:03, 61.78it/s][A
224it [00:03, 


Epoch: 45, Test Loss: 5.413643061744501, Test Perplexity: 225.07272954905255




0it [00:00, ?it/s][A
5it [00:00, 47.98it/s][A
10it [00:00, 48.18it/s][A
15it [00:00, 48.16it/s][A
20it [00:00, 48.39it/s][A
25it [00:00, 48.44it/s][A
30it [00:00, 48.67it/s][A
35it [00:00, 48.57it/s][A
40it [00:00, 48.74it/s][A
45it [00:00, 47.93it/s][A
50it [00:01, 47.82it/s][A
55it [00:01, 48.02it/s][A
60it [00:01, 48.32it/s][A
65it [00:01, 48.00it/s][A
70it [00:01, 48.27it/s][A
75it [00:01, 48.46it/s][A
80it [00:01, 48.64it/s][A
85it [00:01, 48.76it/s][A
90it [00:01, 48.87it/s][A
95it [00:01, 48.53it/s][A
100it [00:02, 48.53it/s][A
105it [00:02, 48.59it/s][A

Epoch: 46, Step: 100, Loss: 4.8090434694290165



110it [00:02, 48.62it/s][A
115it [00:02, 48.63it/s][A
120it [00:02, 48.66it/s][A
125it [00:02, 48.64it/s][A
130it [00:02, 48.63it/s][A
135it [00:02, 48.69it/s][A
140it [00:02, 48.56it/s][A
145it [00:02, 48.59it/s][A
150it [00:03, 48.33it/s][A
155it [00:03, 48.34it/s][A
160it [00:03, 48.34it/s][A
165it [00:03, 48.43it/s][A
170it [00:03, 48.55it/s][A
175it [00:03, 48.70it/s][A
180it [00:03, 48.68it/s][A
185it [00:03, 48.50it/s][A
190it [00:03, 48.19it/s][A
195it [00:04, 47.80it/s][A
200it [00:04, 47.98it/s][A
205it [00:04, 48.23it/s][A

Epoch: 46, Step: 200, Loss: 4.8175844073295595



210it [00:04, 48.30it/s][A
215it [00:04, 48.19it/s][A
220it [00:04, 48.28it/s][A
227it [00:04, 48.37it/s]
  9%|▉         | 46/500 [05:10<55:29,  7.33s/it]  
0it [00:00, ?it/s][A
5it [00:00, 47.66it/s][A
10it [00:00, 46.51it/s][A
15it [00:00, 46.16it/s][A
20it [00:00, 47.02it/s][A
25it [00:00, 47.60it/s][A
30it [00:00, 47.58it/s][A
35it [00:00, 47.84it/s][A
40it [00:00, 48.00it/s][A
45it [00:00, 47.94it/s][A
50it [00:01, 48.18it/s][A
55it [00:01, 47.98it/s][A
60it [00:01, 48.11it/s][A
65it [00:01, 48.11it/s][A
70it [00:01, 47.91it/s][A
75it [00:01, 48.11it/s][A
80it [00:01, 48.08it/s][A
85it [00:01, 48.23it/s][A
90it [00:01, 48.35it/s][A
95it [00:01, 47.29it/s][A
100it [00:02, 46.91it/s][A
105it [00:02, 46.97it/s][A

Epoch: 47, Step: 100, Loss: 4.798100428581238



110it [00:02, 47.27it/s][A
115it [00:02, 47.51it/s][A
120it [00:02, 47.57it/s][A
125it [00:02, 47.93it/s][A
130it [00:02, 46.26it/s][A
135it [00:02, 46.79it/s][A
140it [00:02, 46.89it/s][A
145it [00:03, 46.64it/s][A
150it [00:03, 46.40it/s][A
155it [00:03, 46.82it/s][A
160it [00:03, 47.27it/s][A
165it [00:03, 47.56it/s][A
170it [00:03, 47.89it/s][A
175it [00:03, 47.91it/s][A
180it [00:03, 48.09it/s][A
185it [00:03, 48.34it/s][A
190it [00:03, 48.33it/s][A
195it [00:04, 48.36it/s][A
200it [00:04, 48.45it/s][A
205it [00:04, 48.47it/s][A

Epoch: 47, Step: 200, Loss: 4.814676020145416



210it [00:04, 48.45it/s][A
215it [00:04, 48.50it/s][A
220it [00:04, 48.56it/s][A
227it [00:04, 47.72it/s]
  9%|▉         | 47/500 [05:15<49:32,  6.56s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.51it/s][A
10it [00:00, 48.56it/s][A
15it [00:00, 48.79it/s][A
20it [00:00, 48.20it/s][A
25it [00:00, 48.15it/s][A
30it [00:00, 48.39it/s][A
35it [00:00, 48.13it/s][A
40it [00:00, 48.33it/s][A
45it [00:00, 48.36it/s][A
50it [00:01, 48.07it/s][A
55it [00:01, 48.26it/s][A
60it [00:01, 48.34it/s][A
65it [00:01, 48.23it/s][A
70it [00:01, 48.36it/s][A
75it [00:01, 48.52it/s][A
80it [00:01, 48.61it/s][A
85it [00:01, 48.50it/s][A
90it [00:01, 48.61it/s][A
95it [00:01, 48.73it/s][A
100it [00:02, 48.67it/s][A
105it [00:02, 48.76it/s][A

Epoch: 48, Step: 100, Loss: 4.800694589614868



110it [00:02, 48.67it/s][A
115it [00:02, 48.51it/s][A
120it [00:02, 48.35it/s][A
125it [00:02, 48.55it/s][A
130it [00:02, 48.76it/s][A
135it [00:02, 48.81it/s][A
140it [00:02, 48.78it/s][A
145it [00:02, 48.74it/s][A
150it [00:03, 48.85it/s][A
155it [00:03, 48.86it/s][A
160it [00:03, 48.70it/s][A
165it [00:03, 48.60it/s][A
170it [00:03, 48.34it/s][A
175it [00:03, 48.13it/s][A
180it [00:03, 48.16it/s][A
185it [00:03, 48.03it/s][A
190it [00:03, 47.80it/s][A
195it [00:04, 47.98it/s][A
200it [00:04, 48.02it/s][A
205it [00:04, 48.18it/s][A

Epoch: 48, Step: 200, Loss: 4.813360579013825



210it [00:04, 48.26it/s][A
215it [00:04, 48.29it/s][A
220it [00:04, 48.15it/s][A
227it [00:04, 48.36it/s]
 10%|▉         | 48/500 [05:20<45:13,  6.00s/it]
0it [00:00, ?it/s][A
5it [00:00, 48.57it/s][A
10it [00:00, 47.57it/s][A
15it [00:00, 48.18it/s][A
20it [00:00, 47.74it/s][A
25it [00:00, 47.99it/s][A
30it [00:00, 48.14it/s][A
35it [00:00, 48.29it/s][A
40it [00:00, 48.27it/s][A
45it [00:00, 47.79it/s][A
50it [00:01, 47.90it/s][A
55it [00:01, 48.04it/s][A
60it [00:01, 48.10it/s][A
65it [00:01, 48.21it/s][A
70it [00:01, 48.33it/s][A
75it [00:01, 48.46it/s][A
80it [00:01, 47.84it/s][A
85it [00:01, 47.76it/s][A
90it [00:01, 47.76it/s][A
95it [00:01, 47.93it/s][A
100it [00:02, 48.08it/s][A
105it [00:02, 48.29it/s][A

Epoch: 49, Step: 100, Loss: 4.800514221191406



110it [00:02, 48.20it/s][A
115it [00:02, 48.37it/s][A
120it [00:02, 48.00it/s][A
125it [00:02, 47.67it/s][A
130it [00:02, 47.13it/s][A
135it [00:02, 47.08it/s][A
140it [00:02, 47.21it/s][A
145it [00:03, 47.58it/s][A
150it [00:03, 47.42it/s][A
155it [00:03, 47.20it/s][A
160it [00:03, 47.62it/s][A
165it [00:03, 47.85it/s][A
170it [00:03, 47.62it/s][A
175it [00:03, 47.90it/s][A
180it [00:03, 47.90it/s][A
185it [00:03, 48.04it/s][A
190it [00:03, 47.15it/s][A
195it [00:04, 47.15it/s][A
200it [00:04, 47.38it/s][A
205it [00:04, 45.50it/s][A

Epoch: 49, Step: 200, Loss: 4.810184350013733



210it [00:04, 46.10it/s][A
215it [00:04, 46.54it/s][A
220it [00:04, 47.12it/s][A
227it [00:04, 47.62it/s]
 10%|▉         | 49/500 [05:24<42:20,  5.63s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.19it/s][A
10it [00:00, 45.33it/s][A
15it [00:00, 46.06it/s][A
20it [00:00, 45.29it/s][A
25it [00:00, 45.21it/s][A
30it [00:00, 46.16it/s][A
35it [00:00, 46.78it/s][A
40it [00:00, 46.60it/s][A
45it [00:00, 47.04it/s][A
50it [00:01, 47.52it/s][A
55it [00:01, 47.81it/s][A
60it [00:01, 48.24it/s][A
65it [00:01, 48.51it/s][A
70it [00:01, 48.11it/s][A
75it [00:01, 48.38it/s][A
80it [00:01, 48.59it/s][A
85it [00:01, 48.42it/s][A
90it [00:01, 48.55it/s][A
95it [00:02, 48.35it/s][A
100it [00:02, 47.76it/s][A
105it [00:02, 47.90it/s][A

Epoch: 50, Step: 100, Loss: 4.794444689750671



110it [00:02, 48.11it/s][A
115it [00:02, 48.39it/s][A
120it [00:02, 48.54it/s][A
125it [00:02, 48.44it/s][A
130it [00:02, 48.54it/s][A
135it [00:02, 48.43it/s][A
140it [00:02, 48.53it/s][A
145it [00:03, 48.11it/s][A
150it [00:03, 47.91it/s][A
155it [00:03, 48.27it/s][A
160it [00:03, 48.45it/s][A
165it [00:03, 48.57it/s][A
170it [00:03, 48.55it/s][A
175it [00:03, 48.44it/s][A
180it [00:03, 48.59it/s][A
185it [00:03, 48.26it/s][A
190it [00:03, 48.33it/s][A
195it [00:04, 48.41it/s][A
200it [00:04, 48.61it/s][A
205it [00:04, 47.98it/s][A

Epoch: 50, Step: 200, Loss: 4.804558434486389



210it [00:04, 48.12it/s][A
215it [00:04, 48.18it/s][A
220it [00:04, 48.40it/s][A
227it [00:04, 47.92it/s]

0it [00:00, ?it/s][A
7it [00:00, 61.64it/s][A
14it [00:00, 62.68it/s][A
21it [00:00, 62.91it/s][A
28it [00:00, 63.18it/s][A
35it [00:00, 63.35it/s][A
42it [00:00, 63.05it/s][A
49it [00:00, 62.90it/s][A
56it [00:00, 62.45it/s][A
63it [00:01, 62.54it/s][A
70it [00:01, 62.82it/s][A
77it [00:01, 62.59it/s][A
84it [00:01, 62.68it/s][A
91it [00:01, 62.76it/s][A
98it [00:01, 63.09it/s][A
105it [00:01, 63.11it/s][A
112it [00:01, 63.18it/s][A
119it [00:01, 63.20it/s][A
126it [00:02, 63.33it/s][A
133it [00:02, 63.19it/s][A
140it [00:02, 63.11it/s][A
147it [00:02, 63.19it/s][A
154it [00:02, 63.20it/s][A
161it [00:02, 63.28it/s][A
168it [00:02, 63.24it/s][A
175it [00:02, 63.32it/s][A
182it [00:02, 63.28it/s][A
189it [00:02, 63.29it/s][A
196it [00:03, 63.10it/s][A
203it [00:03, 62.92it/s][A
210it [00:03, 62.90it/s][A
217it [00:03, 62.84it/s][A
224it [00:03, 


Epoch: 50, Test Loss: 5.406356989967157, Test Perplexity: 223.38458780324237




0it [00:00, ?it/s][A
5it [00:00, 47.44it/s][A
10it [00:00, 47.50it/s][A
15it [00:00, 47.78it/s][A
20it [00:00, 47.95it/s][A
25it [00:00, 48.16it/s][A
30it [00:00, 48.24it/s][A
35it [00:00, 48.36it/s][A
40it [00:00, 48.45it/s][A
45it [00:00, 48.37it/s][A
50it [00:01, 48.47it/s][A
55it [00:01, 48.52it/s][A
60it [00:01, 48.71it/s][A
65it [00:01, 48.83it/s][A
70it [00:01, 48.60it/s][A
75it [00:01, 48.81it/s][A
80it [00:01, 48.89it/s][A
85it [00:01, 48.92it/s][A
90it [00:01, 48.95it/s][A
95it [00:01, 48.84it/s][A
100it [00:02, 48.63it/s][A
105it [00:02, 48.38it/s][A

Epoch: 51, Step: 100, Loss: 4.786552000045776



110it [00:02, 48.45it/s][A
115it [00:02, 48.41it/s][A
120it [00:02, 48.10it/s][A
125it [00:02, 48.32it/s][A
130it [00:02, 48.45it/s][A
135it [00:02, 48.25it/s][A
140it [00:02, 48.26it/s][A
145it [00:02, 48.07it/s][A
150it [00:03, 47.51it/s][A
155it [00:03, 46.38it/s][A
160it [00:03, 45.77it/s][A
165it [00:03, 45.45it/s][A
170it [00:03, 45.19it/s][A
175it [00:03, 44.81it/s][A
180it [00:03, 44.44it/s][A
185it [00:03, 44.48it/s][A
190it [00:04, 44.50it/s][A
195it [00:04, 44.58it/s][A
200it [00:04, 43.84it/s][A
205it [00:04, 44.09it/s][A

Epoch: 51, Step: 200, Loss: 4.80556333065033



210it [00:04, 43.81it/s][A
215it [00:04, 43.85it/s][A
220it [00:04, 44.00it/s][A
227it [00:04, 46.77it/s]
 10%|█         | 51/500 [05:44<55:13,  7.38s/it]  
0it [00:00, ?it/s][A
5it [00:00, 44.28it/s][A
10it [00:00, 44.40it/s][A
15it [00:00, 44.00it/s][A
20it [00:00, 43.72it/s][A
25it [00:00, 43.64it/s][A
30it [00:00, 43.71it/s][A
35it [00:00, 44.18it/s][A
40it [00:00, 44.70it/s][A
45it [00:01, 44.80it/s][A
50it [00:01, 44.26it/s][A
55it [00:01, 44.63it/s][A
60it [00:01, 44.94it/s][A
65it [00:01, 44.91it/s][A
70it [00:01, 45.16it/s][A
75it [00:01, 45.54it/s][A
80it [00:01, 45.96it/s][A
85it [00:01, 46.11it/s][A
90it [00:02, 45.98it/s][A
95it [00:02, 45.23it/s][A
100it [00:02, 44.95it/s][A
105it [00:02, 44.85it/s][A

Epoch: 52, Step: 100, Loss: 4.786493649482727



110it [00:02, 44.48it/s][A
115it [00:02, 45.00it/s][A
120it [00:02, 44.99it/s][A
125it [00:02, 45.20it/s][A
130it [00:02, 45.24it/s][A
135it [00:03, 44.96it/s][A
140it [00:03, 44.83it/s][A
145it [00:03, 44.91it/s][A
150it [00:03, 44.91it/s][A
155it [00:03, 44.90it/s][A
160it [00:03, 45.32it/s][A
165it [00:03, 45.21it/s][A
170it [00:03, 45.16it/s][A
175it [00:03, 45.36it/s][A
180it [00:04, 45.54it/s][A
185it [00:04, 45.63it/s][A
190it [00:04, 45.72it/s][A
195it [00:04, 45.64it/s][A
200it [00:04, 45.83it/s][A
205it [00:04, 45.78it/s][A

Epoch: 52, Step: 200, Loss: 4.802151036262512



210it [00:04, 45.77it/s][A
215it [00:04, 45.97it/s][A
220it [00:04, 46.10it/s][A
227it [00:05, 45.16it/s]
 10%|█         | 52/500 [05:49<49:50,  6.68s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.03it/s][A
10it [00:00, 45.39it/s][A
15it [00:00, 45.78it/s][A
20it [00:00, 45.29it/s][A
25it [00:00, 45.43it/s][A
30it [00:00, 44.96it/s][A
35it [00:00, 45.24it/s][A
40it [00:00, 45.15it/s][A
45it [00:00, 45.26it/s][A
50it [00:01, 44.91it/s][A
55it [00:01, 45.11it/s][A
60it [00:01, 45.38it/s][A
65it [00:01, 45.53it/s][A
70it [00:01, 45.44it/s][A
75it [00:01, 45.50it/s][A
80it [00:01, 45.62it/s][A
85it [00:01, 45.93it/s][A
90it [00:01, 45.64it/s][A
95it [00:02, 45.95it/s][A
100it [00:02, 45.99it/s][A
105it [00:02, 45.91it/s][A

Epoch: 53, Step: 100, Loss: 4.791191825866699



110it [00:02, 45.93it/s][A
115it [00:02, 46.13it/s][A
120it [00:02, 46.28it/s][A
125it [00:02, 46.31it/s][A
130it [00:02, 46.30it/s][A
135it [00:02, 46.27it/s][A
140it [00:03, 46.21it/s][A
145it [00:03, 46.13it/s][A
150it [00:03, 46.23it/s][A
155it [00:03, 46.31it/s][A
160it [00:03, 46.15it/s][A
165it [00:03, 45.89it/s][A
170it [00:03, 46.06it/s][A
175it [00:03, 46.23it/s][A
180it [00:03, 46.06it/s][A
185it [00:04, 46.19it/s][A
190it [00:04, 46.14it/s][A
195it [00:04, 46.18it/s][A
200it [00:04, 46.29it/s][A
205it [00:04, 46.19it/s][A

Epoch: 53, Step: 200, Loss: 4.800136449337006



210it [00:04, 45.91it/s][A
215it [00:04, 46.03it/s][A
220it [00:04, 45.58it/s][A
227it [00:04, 45.80it/s]
 11%|█         | 53/500 [05:54<45:53,  6.16s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.39it/s][A
10it [00:00, 46.09it/s][A
15it [00:00, 45.43it/s][A
20it [00:00, 45.56it/s][A
25it [00:00, 45.72it/s][A
30it [00:00, 45.67it/s][A
35it [00:00, 45.85it/s][A
40it [00:00, 45.77it/s][A
45it [00:00, 45.57it/s][A
50it [00:01, 45.12it/s][A
55it [00:01, 45.44it/s][A
60it [00:01, 45.03it/s][A
65it [00:01, 45.48it/s][A
70it [00:01, 45.37it/s][A
75it [00:01, 45.38it/s][A
80it [00:01, 45.45it/s][A
85it [00:01, 45.16it/s][A
90it [00:01, 45.13it/s][A
95it [00:02, 45.24it/s][A
100it [00:02, 45.08it/s][A
105it [00:02, 45.42it/s][A

Epoch: 54, Step: 100, Loss: 4.7843520450592045



110it [00:02, 45.41it/s][A
115it [00:02, 45.37it/s][A
120it [00:02, 45.24it/s][A
125it [00:02, 45.46it/s][A
130it [00:02, 45.56it/s][A
135it [00:02, 44.99it/s][A
140it [00:03, 44.32it/s][A
145it [00:03, 44.44it/s][A
150it [00:03, 44.80it/s][A
155it [00:03, 44.92it/s][A
160it [00:03, 44.21it/s][A
165it [00:03, 44.28it/s][A
170it [00:03, 44.51it/s][A
175it [00:03, 44.89it/s][A
180it [00:03, 44.84it/s][A
185it [00:04, 44.79it/s][A
190it [00:04, 44.98it/s][A
195it [00:04, 45.11it/s][A
200it [00:04, 44.44it/s][A
205it [00:04, 44.71it/s][A

Epoch: 54, Step: 200, Loss: 4.797415292263031



210it [00:04, 44.56it/s][A
215it [00:04, 44.00it/s][A
220it [00:04, 44.28it/s][A
227it [00:05, 45.04it/s]
 11%|█         | 54/500 [05:59<43:18,  5.83s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.44it/s][A
10it [00:00, 46.21it/s][A
15it [00:00, 46.25it/s][A
20it [00:00, 46.10it/s][A
25it [00:00, 45.44it/s][A
30it [00:00, 45.60it/s][A
35it [00:00, 45.56it/s][A
40it [00:00, 45.70it/s][A
45it [00:00, 45.83it/s][A
50it [00:01, 45.64it/s][A
55it [00:01, 45.77it/s][A
60it [00:01, 45.87it/s][A
65it [00:01, 45.88it/s][A
70it [00:01, 45.95it/s][A
75it [00:01, 45.78it/s][A
80it [00:01, 45.87it/s][A
85it [00:01, 45.48it/s][A
90it [00:01, 45.59it/s][A
95it [00:02, 45.66it/s][A
100it [00:02, 45.69it/s][A
105it [00:02, 45.77it/s][A

Epoch: 55, Step: 100, Loss: 4.783058729171753



110it [00:02, 45.81it/s][A
115it [00:02, 45.82it/s][A
120it [00:02, 45.78it/s][A
125it [00:02, 45.88it/s][A
130it [00:02, 45.72it/s][A
135it [00:02, 45.75it/s][A
140it [00:03, 45.18it/s][A
145it [00:03, 45.50it/s][A
150it [00:03, 45.29it/s][A
155it [00:03, 45.40it/s][A
160it [00:03, 44.91it/s][A
165it [00:03, 45.20it/s][A
170it [00:03, 45.32it/s][A
175it [00:03, 45.47it/s][A
180it [00:03, 45.36it/s][A
185it [00:04, 45.39it/s][A
190it [00:04, 45.38it/s][A
195it [00:04, 45.49it/s][A
200it [00:04, 45.63it/s][A
205it [00:04, 45.67it/s][A

Epoch: 55, Step: 200, Loss: 4.7950461173057555



210it [00:04, 45.09it/s][A
215it [00:04, 45.27it/s][A
220it [00:04, 45.54it/s][A
227it [00:04, 45.58it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.46it/s][A
13it [00:00, 60.13it/s][A
20it [00:00, 60.56it/s][A
27it [00:00, 60.50it/s][A
34it [00:00, 60.53it/s][A
41it [00:00, 60.36it/s][A
48it [00:00, 60.62it/s][A
55it [00:00, 60.12it/s][A
62it [00:01, 60.66it/s][A
69it [00:01, 60.56it/s][A
76it [00:01, 60.76it/s][A
83it [00:01, 60.75it/s][A
90it [00:01, 60.75it/s][A
97it [00:01, 60.83it/s][A
104it [00:01, 60.29it/s][A
111it [00:01, 60.54it/s][A
118it [00:01, 60.59it/s][A
125it [00:02, 60.51it/s][A
132it [00:02, 60.33it/s][A
139it [00:02, 60.52it/s][A
146it [00:02, 60.57it/s][A
153it [00:02, 60.64it/s][A
160it [00:02, 60.75it/s][A
167it [00:02, 60.73it/s][A
174it [00:02, 60.41it/s][A
181it [00:02, 60.01it/s][A
188it [00:03, 59.47it/s][A
194it [00:03, 59.57it/s][A
201it [00:03, 59.85it/s][A
208it [00:03, 60.11it/s][A
215it [00:03, 60.48it/s][A
222it [00:03, 


Epoch: 55, Test Loss: 5.41240484299867, Test Perplexity: 224.85842867075286




0it [00:00, ?it/s][A
5it [00:00, 46.95it/s][A
10it [00:00, 46.28it/s][A
15it [00:00, 46.31it/s][A
20it [00:00, 46.31it/s][A
25it [00:00, 45.45it/s][A
30it [00:00, 45.79it/s][A
35it [00:00, 45.93it/s][A
40it [00:00, 46.01it/s][A
45it [00:00, 46.09it/s][A
50it [00:01, 46.13it/s][A
55it [00:01, 46.06it/s][A
60it [00:01, 46.08it/s][A
65it [00:01, 46.14it/s][A
70it [00:01, 46.08it/s][A
75it [00:01, 46.07it/s][A
80it [00:01, 46.11it/s][A
85it [00:01, 46.04it/s][A
90it [00:01, 46.11it/s][A
95it [00:02, 46.35it/s][A
100it [00:02, 46.47it/s][A
105it [00:02, 46.57it/s][A

Epoch: 56, Step: 100, Loss: 4.776369667053222



110it [00:02, 46.39it/s][A
115it [00:02, 46.40it/s][A
120it [00:02, 46.34it/s][A
125it [00:02, 46.06it/s][A
130it [00:02, 46.14it/s][A
135it [00:02, 46.32it/s][A
140it [00:03, 46.41it/s][A
145it [00:03, 46.41it/s][A
150it [00:03, 46.41it/s][A
155it [00:03, 45.74it/s][A
160it [00:03, 45.68it/s][A
165it [00:03, 46.08it/s][A
170it [00:03, 46.20it/s][A
175it [00:03, 46.19it/s][A
180it [00:03, 46.30it/s][A
185it [00:04, 46.24it/s][A
190it [00:04, 46.33it/s][A
195it [00:04, 46.32it/s][A
200it [00:04, 46.17it/s][A
205it [00:04, 46.31it/s][A

Epoch: 56, Step: 200, Loss: 4.792399315834046



210it [00:04, 46.31it/s][A
215it [00:04, 46.47it/s][A
220it [00:04, 46.36it/s][A
227it [00:04, 46.18it/s]
 11%|█         | 56/500 [06:20<56:22,  7.62s/it]  
0it [00:00, ?it/s][A
5it [00:00, 45.79it/s][A
10it [00:00, 45.42it/s][A
15it [00:00, 45.23it/s][A
20it [00:00, 45.56it/s][A
25it [00:00, 45.77it/s][A
30it [00:00, 45.86it/s][A
35it [00:00, 45.36it/s][A
40it [00:00, 45.07it/s][A
45it [00:00, 44.92it/s][A
50it [00:01, 45.15it/s][A
55it [00:01, 45.38it/s][A
60it [00:01, 45.39it/s][A
65it [00:01, 42.79it/s][A
70it [00:01, 43.24it/s][A
75it [00:01, 43.75it/s][A
80it [00:01, 43.99it/s][A
85it [00:01, 43.89it/s][A
90it [00:02, 44.31it/s][A
95it [00:02, 44.09it/s][A
100it [00:02, 44.63it/s][A
105it [00:02, 44.70it/s][A

Epoch: 57, Step: 100, Loss: 4.777989730834961



110it [00:02, 44.56it/s][A
115it [00:02, 44.78it/s][A
120it [00:02, 44.65it/s][A
125it [00:02, 44.71it/s][A
130it [00:02, 44.82it/s][A
135it [00:03, 45.14it/s][A
140it [00:03, 44.98it/s][A
145it [00:03, 45.16it/s][A
150it [00:03, 45.15it/s][A
155it [00:03, 45.25it/s][A
160it [00:03, 44.88it/s][A
165it [00:03, 44.89it/s][A
170it [00:03, 45.02it/s][A
175it [00:03, 45.27it/s][A
180it [00:04, 45.27it/s][A
185it [00:04, 45.03it/s][A
190it [00:04, 45.30it/s][A
195it [00:04, 44.99it/s][A
200it [00:04, 45.24it/s][A
205it [00:04, 45.15it/s][A

Epoch: 57, Step: 200, Loss: 4.790709252357483



210it [00:04, 45.04it/s][A
215it [00:04, 44.98it/s][A
220it [00:04, 44.89it/s][A
227it [00:05, 44.87it/s]
 11%|█▏        | 57/500 [06:25<50:35,  6.85s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.44it/s][A
10it [00:00, 45.78it/s][A
15it [00:00, 45.63it/s][A
20it [00:00, 45.62it/s][A
25it [00:00, 45.67it/s][A
30it [00:00, 45.92it/s][A
35it [00:00, 46.00it/s][A
40it [00:00, 45.83it/s][A
45it [00:00, 45.75it/s][A
50it [00:01, 45.80it/s][A
55it [00:01, 45.53it/s][A
60it [00:01, 45.52it/s][A
65it [00:01, 45.35it/s][A
70it [00:01, 45.25it/s][A
75it [00:01, 45.55it/s][A
80it [00:01, 45.23it/s][A
85it [00:01, 45.51it/s][A
90it [00:01, 45.58it/s][A
95it [00:02, 45.61it/s][A
100it [00:02, 45.46it/s][A
105it [00:02, 45.52it/s][A

Epoch: 58, Step: 100, Loss: 4.782105145454406



110it [00:02, 45.46it/s][A
115it [00:02, 45.45it/s][A
120it [00:02, 45.69it/s][A
125it [00:02, 45.54it/s][A
130it [00:02, 45.56it/s][A
135it [00:02, 45.72it/s][A
140it [00:03, 45.94it/s][A
145it [00:03, 45.92it/s][A
150it [00:03, 45.98it/s][A
155it [00:03, 45.90it/s][A
160it [00:03, 45.87it/s][A
165it [00:03, 46.04it/s][A
170it [00:03, 45.78it/s][A
175it [00:03, 45.67it/s][A
180it [00:03, 45.23it/s][A
185it [00:04, 45.13it/s][A
190it [00:04, 45.28it/s][A
195it [00:04, 44.77it/s][A
200it [00:04, 44.80it/s][A
205it [00:04, 44.98it/s][A

Epoch: 58, Step: 200, Loss: 4.792130274772644



210it [00:04, 45.15it/s][A
215it [00:04, 45.08it/s][A
220it [00:04, 44.97it/s][A
227it [00:04, 45.45it/s]
 12%|█▏        | 58/500 [06:30<46:22,  6.30s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.82it/s][A
10it [00:00, 45.55it/s][A
15it [00:00, 45.40it/s][A
20it [00:00, 45.59it/s][A
25it [00:00, 45.68it/s][A
30it [00:00, 45.57it/s][A
35it [00:00, 45.65it/s][A
40it [00:00, 45.73it/s][A
45it [00:00, 45.83it/s][A
50it [00:01, 45.94it/s][A
55it [00:01, 45.48it/s][A
60it [00:01, 45.43it/s][A
65it [00:01, 45.37it/s][A
70it [00:01, 45.44it/s][A
75it [00:01, 45.56it/s][A
80it [00:01, 45.69it/s][A
85it [00:01, 45.79it/s][A
90it [00:01, 45.46it/s][A
95it [00:02, 45.59it/s][A
100it [00:02, 45.08it/s][A
105it [00:02, 45.23it/s][A

Epoch: 59, Step: 100, Loss: 4.768302655220031



110it [00:02, 44.78it/s][A
115it [00:02, 45.12it/s][A
120it [00:02, 44.98it/s][A
125it [00:02, 45.03it/s][A
130it [00:02, 45.36it/s][A
135it [00:02, 45.35it/s][A
140it [00:03, 45.13it/s][A
145it [00:03, 45.05it/s][A
150it [00:03, 44.58it/s][A
155it [00:03, 44.60it/s][A
160it [00:03, 44.18it/s][A
165it [00:03, 44.55it/s][A
170it [00:03, 44.55it/s][A
175it [00:03, 44.78it/s][A
180it [00:03, 45.02it/s][A
185it [00:04, 44.37it/s][A
190it [00:04, 44.70it/s][A
195it [00:04, 45.05it/s][A
200it [00:04, 43.78it/s][A
205it [00:04, 43.98it/s][A

Epoch: 59, Step: 200, Loss: 4.785933332443237



210it [00:04, 44.38it/s][A
215it [00:04, 44.55it/s][A
220it [00:04, 44.66it/s][A
227it [00:05, 45.02it/s]
 12%|█▏        | 59/500 [06:35<43:31,  5.92s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.17it/s][A
10it [00:00, 45.98it/s][A
15it [00:00, 45.74it/s][A
20it [00:00, 45.37it/s][A
25it [00:00, 45.56it/s][A
30it [00:00, 45.24it/s][A
35it [00:00, 45.41it/s][A
40it [00:00, 45.62it/s][A
45it [00:00, 45.67it/s][A
50it [00:01, 45.78it/s][A
55it [00:01, 45.58it/s][A
60it [00:01, 45.47it/s][A
65it [00:01, 45.50it/s][A
70it [00:01, 45.65it/s][A
75it [00:01, 45.68it/s][A
80it [00:01, 45.75it/s][A
85it [00:01, 45.82it/s][A
90it [00:01, 45.80it/s][A
95it [00:02, 45.82it/s][A
100it [00:02, 45.94it/s][A
105it [00:02, 46.04it/s][A

Epoch: 60, Step: 100, Loss: 4.76907256603241



110it [00:02, 45.73it/s][A
115it [00:02, 45.52it/s][A
120it [00:02, 45.72it/s][A
125it [00:02, 45.10it/s][A
130it [00:02, 45.26it/s][A
135it [00:02, 45.40it/s][A
140it [00:03, 45.21it/s][A
145it [00:03, 45.44it/s][A
150it [00:03, 45.56it/s][A
155it [00:03, 45.59it/s][A
160it [00:03, 45.70it/s][A
165it [00:03, 45.77it/s][A
170it [00:03, 45.68it/s][A
175it [00:03, 45.86it/s][A
180it [00:03, 45.93it/s][A
185it [00:04, 45.76it/s][A
190it [00:04, 45.49it/s][A
195it [00:04, 45.64it/s][A
200it [00:04, 45.66it/s][A
205it [00:04, 45.87it/s][A

Epoch: 60, Step: 200, Loss: 4.786338109970092



210it [00:04, 45.21it/s][A
215it [00:04, 45.43it/s][A
220it [00:04, 45.68it/s][A
227it [00:04, 45.61it/s]

0it [00:00, ?it/s][A
6it [00:00, 56.14it/s][A
12it [00:00, 57.86it/s][A
19it [00:00, 59.02it/s][A
26it [00:00, 59.69it/s][A
33it [00:00, 59.98it/s][A
40it [00:00, 60.08it/s][A
47it [00:00, 60.12it/s][A
54it [00:00, 60.39it/s][A
61it [00:01, 60.32it/s][A
68it [00:01, 60.26it/s][A
75it [00:01, 59.90it/s][A
81it [00:01, 59.89it/s][A
88it [00:01, 60.27it/s][A
95it [00:01, 59.68it/s][A
101it [00:01, 59.73it/s][A
107it [00:01, 59.70it/s][A
114it [00:01, 59.94it/s][A
120it [00:02, 59.95it/s][A
127it [00:02, 59.97it/s][A
133it [00:02, 59.36it/s][A
139it [00:02, 59.53it/s][A
146it [00:02, 59.83it/s][A
152it [00:02, 59.64it/s][A
158it [00:02, 59.64it/s][A
164it [00:02, 59.51it/s][A
171it [00:02, 59.84it/s][A
177it [00:02, 59.46it/s][A
184it [00:03, 60.04it/s][A
191it [00:03, 60.25it/s][A
198it [00:03, 60.15it/s][A
205it [00:03, 60.49it/s][A
212it [00:03, 


Epoch: 60, Test Loss: 5.407727529543527, Test Perplexity: 223.81622544282712




0it [00:00, ?it/s][A
5it [00:00, 46.37it/s][A
10it [00:00, 46.43it/s][A
15it [00:00, 46.38it/s][A
20it [00:00, 46.04it/s][A
25it [00:00, 46.70it/s][A
30it [00:00, 46.92it/s][A
35it [00:00, 45.75it/s][A
40it [00:00, 45.21it/s][A
45it [00:00, 45.51it/s][A
50it [00:01, 45.74it/s][A
55it [00:01, 45.85it/s][A
60it [00:01, 46.00it/s][A
65it [00:01, 46.12it/s][A
70it [00:01, 45.83it/s][A
75it [00:01, 45.59it/s][A
80it [00:01, 45.45it/s][A
85it [00:01, 45.28it/s][A
90it [00:01, 45.43it/s][A
95it [00:02, 45.76it/s][A
100it [00:02, 45.69it/s][A
105it [00:02, 45.43it/s][A

Epoch: 61, Step: 100, Loss: 4.769519710540772



110it [00:02, 45.07it/s][A
115it [00:02, 45.47it/s][A
120it [00:02, 45.31it/s][A
125it [00:02, 45.33it/s][A
130it [00:02, 45.36it/s][A
135it [00:02, 45.44it/s][A
140it [00:03, 45.64it/s][A
145it [00:03, 45.67it/s][A
150it [00:03, 45.72it/s][A
155it [00:03, 45.70it/s][A
160it [00:03, 45.43it/s][A
165it [00:03, 45.37it/s][A
170it [00:03, 45.26it/s][A
175it [00:03, 45.37it/s][A
180it [00:03, 45.14it/s][A
185it [00:04, 44.72it/s][A
190it [00:04, 45.09it/s][A
195it [00:04, 44.62it/s][A
200it [00:04, 44.36it/s][A
205it [00:04, 44.26it/s][A

Epoch: 61, Step: 200, Loss: 4.783968029022216



210it [00:04, 44.46it/s][A
215it [00:04, 44.74it/s][A
220it [00:04, 42.85it/s][A
227it [00:05, 45.23it/s]
 12%|█▏        | 61/500 [06:56<56:19,  7.70s/it]  
0it [00:00, ?it/s][A
5it [00:00, 46.12it/s][A
10it [00:00, 44.94it/s][A
15it [00:00, 44.48it/s][A
20it [00:00, 43.69it/s][A
25it [00:00, 44.01it/s][A
30it [00:00, 43.23it/s][A
35it [00:00, 43.87it/s][A
40it [00:00, 44.42it/s][A
45it [00:01, 44.85it/s][A
50it [00:01, 45.30it/s][A
55it [00:01, 45.27it/s][A
60it [00:01, 45.42it/s][A
65it [00:01, 44.82it/s][A
70it [00:01, 45.11it/s][A
75it [00:01, 44.88it/s][A
80it [00:01, 45.12it/s][A
85it [00:01, 44.94it/s][A
90it [00:02, 45.08it/s][A
95it [00:02, 45.09it/s][A
100it [00:02, 45.14it/s][A
105it [00:02, 45.03it/s][A

Epoch: 62, Step: 100, Loss: 4.766783947944641



110it [00:02, 44.88it/s][A
115it [00:02, 44.93it/s][A
120it [00:02, 44.98it/s][A
125it [00:02, 44.95it/s][A
130it [00:02, 44.79it/s][A
135it [00:03, 44.22it/s][A
140it [00:03, 44.49it/s][A
145it [00:03, 44.46it/s][A
150it [00:03, 44.75it/s][A
155it [00:03, 44.95it/s][A
160it [00:03, 45.13it/s][A
165it [00:03, 45.46it/s][A
170it [00:03, 45.70it/s][A
175it [00:03, 45.73it/s][A
180it [00:04, 45.78it/s][A
185it [00:04, 45.95it/s][A
190it [00:04, 46.05it/s][A
195it [00:04, 46.01it/s][A
200it [00:04, 46.02it/s][A
205it [00:04, 46.06it/s][A

Epoch: 62, Step: 200, Loss: 4.7791431784629825



210it [00:04, 45.87it/s][A
215it [00:04, 45.90it/s][A
220it [00:04, 46.01it/s][A
227it [00:05, 45.13it/s]
 12%|█▏        | 62/500 [07:01<50:21,  6.90s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.96it/s][A
10it [00:00, 46.18it/s][A
15it [00:00, 46.04it/s][A
20it [00:00, 45.64it/s][A
25it [00:00, 45.24it/s][A
30it [00:00, 45.35it/s][A
35it [00:00, 45.57it/s][A
40it [00:00, 45.55it/s][A
45it [00:00, 44.77it/s][A
50it [00:01, 45.17it/s][A
55it [00:01, 44.93it/s][A
60it [00:01, 45.23it/s][A
65it [00:01, 44.62it/s][A
70it [00:01, 45.11it/s][A
75it [00:01, 45.18it/s][A
80it [00:01, 45.54it/s][A
85it [00:01, 45.48it/s][A
90it [00:01, 44.78it/s][A
95it [00:02, 45.00it/s][A
100it [00:02, 44.96it/s][A
105it [00:02, 45.28it/s][A

Epoch: 63, Step: 100, Loss: 4.757365007400512



110it [00:02, 45.04it/s][A
115it [00:02, 45.43it/s][A
120it [00:02, 45.63it/s][A
125it [00:02, 45.56it/s][A
130it [00:02, 45.43it/s][A
135it [00:02, 45.45it/s][A
140it [00:03, 45.45it/s][A
145it [00:03, 45.62it/s][A
150it [00:03, 45.66it/s][A
155it [00:03, 45.76it/s][A
160it [00:03, 45.83it/s][A
165it [00:03, 45.86it/s][A
170it [00:03, 45.91it/s][A
175it [00:03, 45.86it/s][A
180it [00:03, 45.76it/s][A
185it [00:04, 45.91it/s][A
190it [00:04, 45.99it/s][A
195it [00:04, 45.92it/s][A
200it [00:04, 45.75it/s][A
205it [00:04, 45.72it/s][A

Epoch: 63, Step: 200, Loss: 4.777986760139465



210it [00:04, 45.67it/s][A
215it [00:04, 45.67it/s][A
220it [00:04, 45.72it/s][A
227it [00:04, 45.47it/s]
 13%|█▎        | 63/500 [07:06<46:05,  6.33s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.67it/s][A
10it [00:00, 44.79it/s][A
15it [00:00, 45.26it/s][A
20it [00:00, 45.52it/s][A
25it [00:00, 45.53it/s][A
30it [00:00, 44.54it/s][A
35it [00:00, 44.26it/s][A
40it [00:00, 44.68it/s][A
45it [00:01, 44.80it/s][A
50it [00:01, 45.09it/s][A
55it [00:01, 44.81it/s][A
60it [00:01, 45.23it/s][A
65it [00:01, 44.94it/s][A
70it [00:01, 45.32it/s][A
75it [00:01, 44.97it/s][A
80it [00:01, 45.15it/s][A
85it [00:01, 44.77it/s][A
90it [00:02, 44.61it/s][A
95it [00:02, 44.56it/s][A
100it [00:02, 44.88it/s][A
105it [00:02, 45.13it/s][A

Epoch: 64, Step: 100, Loss: 4.760225667953491



110it [00:02, 45.42it/s][A
115it [00:02, 45.49it/s][A
120it [00:02, 45.30it/s][A
125it [00:02, 45.33it/s][A
130it [00:02, 45.13it/s][A
135it [00:03, 44.91it/s][A
140it [00:03, 45.05it/s][A
145it [00:03, 45.07it/s][A
150it [00:03, 45.18it/s][A
155it [00:03, 45.37it/s][A
160it [00:03, 44.80it/s][A
165it [00:03, 44.95it/s][A
170it [00:03, 45.10it/s][A
175it [00:03, 45.25it/s][A
180it [00:03, 45.26it/s][A
185it [00:04, 45.25it/s][A
190it [00:04, 45.37it/s][A
195it [00:04, 45.32it/s][A
200it [00:04, 45.32it/s][A
205it [00:04, 45.25it/s][A

Epoch: 64, Step: 200, Loss: 4.773718507289886



210it [00:04, 45.16it/s][A
215it [00:04, 45.53it/s][A
220it [00:04, 45.66it/s][A
227it [00:05, 45.10it/s]
 13%|█▎        | 64/500 [07:11<43:10,  5.94s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.94it/s][A
10it [00:00, 45.24it/s][A
15it [00:00, 45.21it/s][A
20it [00:00, 45.20it/s][A
25it [00:00, 45.23it/s][A
30it [00:00, 45.63it/s][A
35it [00:00, 45.86it/s][A
40it [00:00, 45.66it/s][A
45it [00:00, 45.77it/s][A
50it [00:01, 46.02it/s][A
55it [00:01, 45.86it/s][A
60it [00:01, 45.79it/s][A
65it [00:01, 45.80it/s][A
70it [00:01, 45.69it/s][A
75it [00:01, 45.82it/s][A
80it [00:01, 45.81it/s][A
85it [00:01, 45.80it/s][A
90it [00:01, 45.69it/s][A
95it [00:02, 45.43it/s][A
100it [00:02, 45.63it/s][A
105it [00:02, 45.40it/s][A

Epoch: 65, Step: 100, Loss: 4.763287491798401



110it [00:02, 45.44it/s][A
115it [00:02, 45.42it/s][A
120it [00:02, 45.47it/s][A
125it [00:02, 45.73it/s][A
130it [00:02, 45.88it/s][A
135it [00:02, 46.15it/s][A
140it [00:03, 46.12it/s][A
145it [00:03, 45.33it/s][A
150it [00:03, 45.40it/s][A
155it [00:03, 45.34it/s][A
160it [00:03, 45.45it/s][A
165it [00:03, 45.22it/s][A
170it [00:03, 45.46it/s][A
175it [00:03, 45.18it/s][A
180it [00:03, 45.39it/s][A
185it [00:04, 45.00it/s][A
190it [00:04, 44.72it/s][A
195it [00:04, 44.65it/s][A
200it [00:04, 44.39it/s][A
205it [00:04, 44.81it/s][A

Epoch: 65, Step: 200, Loss: 4.773828492164612



210it [00:04, 44.97it/s][A
215it [00:04, 45.34it/s][A
220it [00:04, 45.21it/s][A
227it [00:04, 45.42it/s]

0it [00:00, ?it/s][A
6it [00:00, 55.35it/s][A
13it [00:00, 58.76it/s][A
20it [00:00, 59.73it/s][A
26it [00:00, 59.09it/s][A
33it [00:00, 59.95it/s][A
40it [00:00, 60.34it/s][A
47it [00:00, 60.72it/s][A
54it [00:00, 60.78it/s][A
61it [00:01, 60.84it/s][A
68it [00:01, 60.79it/s][A
75it [00:01, 60.89it/s][A
82it [00:01, 60.54it/s][A
89it [00:01, 60.65it/s][A
96it [00:01, 60.74it/s][A
103it [00:01, 60.84it/s][A
110it [00:01, 60.86it/s][A
117it [00:01, 60.92it/s][A
124it [00:02, 60.74it/s][A
131it [00:02, 60.22it/s][A
138it [00:02, 60.32it/s][A
145it [00:02, 60.26it/s][A
152it [00:02, 57.81it/s][A
159it [00:02, 58.62it/s][A
166it [00:02, 59.38it/s][A
173it [00:02, 59.90it/s][A
180it [00:02, 60.38it/s][A
187it [00:03, 60.38it/s][A
194it [00:03, 59.89it/s][A
201it [00:03, 59.99it/s][A
208it [00:03, 59.87it/s][A
215it [00:03, 60.18it/s][A
222it [00:03, 


Epoch: 65, Test Loss: 5.416369276017136, Test Perplexity: 225.75840740322326




0it [00:00, ?it/s][A
5it [00:00, 44.41it/s][A
10it [00:00, 44.39it/s][A
15it [00:00, 45.22it/s][A
20it [00:00, 45.55it/s][A
25it [00:00, 45.68it/s][A
30it [00:00, 45.24it/s][A
35it [00:00, 45.41it/s][A
40it [00:00, 45.31it/s][A
45it [00:00, 45.29it/s][A
50it [00:01, 45.30it/s][A
55it [00:01, 45.16it/s][A
60it [00:01, 45.43it/s][A
65it [00:01, 45.29it/s][A
70it [00:01, 45.43it/s][A
75it [00:01, 45.51it/s][A
80it [00:01, 45.35it/s][A
85it [00:01, 45.67it/s][A
90it [00:01, 45.67it/s][A
95it [00:02, 45.62it/s][A
100it [00:02, 45.60it/s][A
105it [00:02, 45.66it/s][A

Epoch: 66, Step: 100, Loss: 4.762142400741578



110it [00:02, 45.63it/s][A
115it [00:02, 45.10it/s][A
120it [00:02, 45.39it/s][A
125it [00:02, 45.57it/s][A
130it [00:02, 45.35it/s][A
135it [00:02, 45.10it/s][A
140it [00:03, 44.98it/s][A
145it [00:03, 44.64it/s][A
150it [00:03, 45.16it/s][A
155it [00:03, 43.63it/s][A
160it [00:03, 44.46it/s][A
165it [00:03, 44.39it/s][A
170it [00:03, 44.88it/s][A
175it [00:03, 44.79it/s][A
180it [00:03, 44.69it/s][A
185it [00:04, 44.02it/s][A
190it [00:04, 44.15it/s][A
195it [00:04, 44.46it/s][A
200it [00:04, 44.61it/s][A
205it [00:04, 44.92it/s][A

Epoch: 66, Step: 200, Loss: 4.772976801395417



210it [00:04, 44.86it/s][A
215it [00:04, 45.18it/s][A
220it [00:04, 45.48it/s][A
227it [00:05, 45.09it/s]
 13%|█▎        | 66/500 [07:31<55:44,  7.71s/it]  
0it [00:00, ?it/s][A
5it [00:00, 46.36it/s][A
10it [00:00, 45.84it/s][A
15it [00:00, 45.63it/s][A
20it [00:00, 45.62it/s][A
25it [00:00, 45.94it/s][A
30it [00:00, 45.93it/s][A
35it [00:00, 45.90it/s][A
40it [00:00, 45.97it/s][A
45it [00:00, 45.81it/s][A
50it [00:01, 45.73it/s][A
55it [00:01, 45.57it/s][A
60it [00:01, 45.71it/s][A
65it [00:01, 45.69it/s][A
70it [00:01, 45.93it/s][A
75it [00:01, 45.99it/s][A
80it [00:01, 45.82it/s][A
85it [00:01, 45.74it/s][A
90it [00:01, 45.28it/s][A
95it [00:02, 45.33it/s][A
100it [00:02, 45.19it/s][A
105it [00:02, 45.15it/s][A

Epoch: 67, Step: 100, Loss: 4.756384744644165



110it [00:02, 44.97it/s][A
115it [00:02, 45.18it/s][A
120it [00:02, 45.16it/s][A
125it [00:02, 45.19it/s][A
130it [00:02, 45.28it/s][A
135it [00:02, 45.28it/s][A
140it [00:03, 45.16it/s][A
145it [00:03, 45.10it/s][A
150it [00:03, 45.29it/s][A
155it [00:03, 45.48it/s][A
160it [00:03, 45.40it/s][A
165it [00:03, 45.51it/s][A
170it [00:03, 45.24it/s][A
175it [00:03, 45.14it/s][A
180it [00:03, 44.80it/s][A
185it [00:04, 45.16it/s][A
190it [00:04, 45.00it/s][A
195it [00:04, 45.08it/s][A
200it [00:04, 45.28it/s][A
205it [00:04, 45.54it/s][A

Epoch: 67, Step: 200, Loss: 4.7688646984100345



210it [00:04, 45.50it/s][A
215it [00:04, 45.64it/s][A
220it [00:04, 45.64it/s][A
227it [00:04, 45.43it/s]
 13%|█▎        | 67/500 [07:36<49:45,  6.89s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.74it/s][A
10it [00:00, 44.34it/s][A
15it [00:00, 44.19it/s][A
20it [00:00, 45.15it/s][A
25it [00:00, 45.38it/s][A
30it [00:00, 45.85it/s][A
35it [00:00, 45.74it/s][A
40it [00:00, 45.44it/s][A
45it [00:00, 45.25it/s][A
50it [00:01, 44.85it/s][A
55it [00:01, 44.91it/s][A
60it [00:01, 44.99it/s][A
65it [00:01, 45.09it/s][A
70it [00:01, 44.84it/s][A
75it [00:01, 44.74it/s][A
80it [00:01, 45.21it/s][A
85it [00:01, 45.29it/s][A
90it [00:01, 45.48it/s][A
95it [00:02, 45.60it/s][A
100it [00:02, 45.82it/s][A
105it [00:02, 45.79it/s][A

Epoch: 68, Step: 100, Loss: 4.7561793756484985



110it [00:02, 45.65it/s][A
115it [00:02, 45.71it/s][A
120it [00:02, 45.86it/s][A
125it [00:02, 46.10it/s][A
130it [00:02, 45.95it/s][A
135it [00:02, 45.73it/s][A
140it [00:03, 45.80it/s][A
145it [00:03, 45.67it/s][A
150it [00:03, 45.81it/s][A
155it [00:03, 45.79it/s][A
160it [00:03, 45.68it/s][A
165it [00:03, 45.84it/s][A
170it [00:03, 45.83it/s][A
175it [00:03, 45.58it/s][A
180it [00:03, 45.63it/s][A
185it [00:04, 45.70it/s][A
190it [00:04, 45.95it/s][A
195it [00:04, 45.07it/s][A
200it [00:04, 44.80it/s][A
205it [00:04, 44.91it/s][A

Epoch: 68, Step: 200, Loss: 4.766790668964386



210it [00:04, 45.20it/s][A
215it [00:04, 45.45it/s][A
220it [00:04, 45.44it/s][A
227it [00:04, 45.41it/s]
 14%|█▎        | 68/500 [07:41<45:33,  6.33s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.63it/s][A
10it [00:00, 43.45it/s][A
15it [00:00, 42.98it/s][A
20it [00:00, 43.06it/s][A
25it [00:00, 43.32it/s][A
30it [00:00, 43.92it/s][A
35it [00:00, 44.14it/s][A
40it [00:00, 44.55it/s][A
45it [00:01, 44.58it/s][A
50it [00:01, 44.87it/s][A
55it [00:01, 45.14it/s][A
60it [00:01, 45.34it/s][A
65it [00:01, 44.95it/s][A
70it [00:01, 44.90it/s][A
75it [00:01, 45.11it/s][A
80it [00:01, 45.33it/s][A
85it [00:01, 45.44it/s][A
90it [00:02, 45.47it/s][A
95it [00:02, 45.45it/s][A
100it [00:02, 45.60it/s][A
105it [00:02, 45.75it/s][A

Epoch: 69, Step: 100, Loss: 4.7569818019866945



110it [00:02, 45.52it/s][A
115it [00:02, 45.69it/s][A
120it [00:02, 45.50it/s][A
125it [00:02, 45.33it/s][A
130it [00:02, 45.11it/s][A
135it [00:03, 45.10it/s][A
140it [00:03, 44.92it/s][A
145it [00:03, 45.11it/s][A
150it [00:03, 45.28it/s][A
155it [00:03, 45.15it/s][A
160it [00:03, 45.21it/s][A
165it [00:03, 45.44it/s][A
170it [00:03, 45.39it/s][A
175it [00:03, 45.34it/s][A
180it [00:04, 44.80it/s][A
185it [00:04, 44.62it/s][A
190it [00:04, 44.76it/s][A
195it [00:04, 44.29it/s][A
200it [00:04, 44.64it/s][A
205it [00:04, 44.07it/s][A

Epoch: 69, Step: 200, Loss: 4.7673407912254335



210it [00:04, 44.26it/s][A
215it [00:04, 44.45it/s][A
220it [00:04, 44.63it/s][A
227it [00:05, 44.81it/s]
 14%|█▍        | 69/500 [07:46<42:44,  5.95s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.32it/s][A
10it [00:00, 44.88it/s][A
15it [00:00, 45.20it/s][A
20it [00:00, 45.20it/s][A
25it [00:00, 45.15it/s][A
30it [00:00, 45.12it/s][A
35it [00:00, 44.99it/s][A
40it [00:00, 44.71it/s][A
45it [00:01, 44.73it/s][A
50it [00:01, 44.60it/s][A
55it [00:01, 44.63it/s][A
60it [00:01, 44.58it/s][A
65it [00:01, 45.09it/s][A
70it [00:01, 45.20it/s][A
75it [00:01, 45.31it/s][A
80it [00:01, 45.51it/s][A
85it [00:01, 45.59it/s][A
90it [00:01, 45.35it/s][A
95it [00:02, 45.34it/s][A
100it [00:02, 45.12it/s][A
105it [00:02, 45.26it/s][A

Epoch: 70, Step: 100, Loss: 4.747731485366821



110it [00:02, 45.26it/s][A
115it [00:02, 45.31it/s][A
120it [00:02, 45.40it/s][A
125it [00:02, 45.08it/s][A
130it [00:02, 45.36it/s][A
135it [00:02, 45.42it/s][A
140it [00:03, 45.66it/s][A
145it [00:03, 45.68it/s][A
150it [00:03, 45.80it/s][A
155it [00:03, 45.97it/s][A
160it [00:03, 45.88it/s][A
165it [00:03, 45.56it/s][A
170it [00:03, 45.55it/s][A
175it [00:03, 45.52it/s][A
180it [00:03, 45.48it/s][A
185it [00:04, 45.48it/s][A
190it [00:04, 45.16it/s][A
195it [00:04, 45.41it/s][A
200it [00:04, 45.68it/s][A
205it [00:04, 45.86it/s][A

Epoch: 70, Step: 200, Loss: 4.7636412596702575



210it [00:04, 46.00it/s][A
215it [00:04, 46.19it/s][A
220it [00:04, 46.21it/s][A
227it [00:04, 45.42it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.93it/s][A
13it [00:00, 61.13it/s][A
20it [00:00, 61.26it/s][A
27it [00:00, 61.31it/s][A
34it [00:00, 61.44it/s][A
41it [00:00, 60.87it/s][A
48it [00:00, 60.36it/s][A
55it [00:00, 60.36it/s][A
62it [00:01, 60.73it/s][A
69it [00:01, 61.09it/s][A
76it [00:01, 61.20it/s][A
83it [00:01, 61.35it/s][A
90it [00:01, 61.35it/s][A
97it [00:01, 60.94it/s][A
104it [00:01, 60.99it/s][A
111it [00:01, 60.94it/s][A
118it [00:01, 59.08it/s][A
124it [00:02, 58.45it/s][A
131it [00:02, 59.40it/s][A
138it [00:02, 60.06it/s][A
145it [00:02, 60.36it/s][A
152it [00:02, 60.48it/s][A
159it [00:02, 60.69it/s][A
166it [00:02, 60.77it/s][A
173it [00:02, 60.34it/s][A
180it [00:02, 60.65it/s][A
187it [00:03, 60.99it/s][A
194it [00:03, 60.77it/s][A
201it [00:03, 61.04it/s][A
208it [00:03, 61.17it/s][A
215it [00:03, 61.40it/s][A
222it [00:03, 


Epoch: 70, Test Loss: 5.412887402943203, Test Perplexity: 225.00169386774857




0it [00:00, ?it/s][A
5it [00:00, 44.89it/s][A
10it [00:00, 45.30it/s][A
15it [00:00, 45.52it/s][A
20it [00:00, 45.51it/s][A
25it [00:00, 45.03it/s][A
30it [00:00, 45.42it/s][A
35it [00:00, 45.26it/s][A
40it [00:00, 44.75it/s][A
45it [00:00, 45.16it/s][A
50it [00:01, 44.98it/s][A
55it [00:01, 45.14it/s][A
60it [00:01, 44.84it/s][A
65it [00:01, 44.69it/s][A
70it [00:01, 44.59it/s][A
75it [00:01, 44.78it/s][A
80it [00:01, 44.84it/s][A
85it [00:01, 44.92it/s][A
90it [00:02, 44.91it/s][A
95it [00:02, 45.20it/s][A
100it [00:02, 45.32it/s][A
105it [00:02, 44.67it/s][A

Epoch: 71, Step: 100, Loss: 4.755911355018616



110it [00:02, 44.77it/s][A
115it [00:02, 44.76it/s][A
120it [00:02, 44.90it/s][A
125it [00:02, 44.64it/s][A
130it [00:02, 45.01it/s][A
135it [00:03, 44.76it/s][A
140it [00:03, 44.93it/s][A
145it [00:03, 45.19it/s][A
150it [00:03, 45.41it/s][A
155it [00:03, 45.34it/s][A
160it [00:03, 45.58it/s][A
165it [00:03, 45.17it/s][A
170it [00:03, 45.40it/s][A
175it [00:03, 45.28it/s][A
180it [00:03, 45.15it/s][A
185it [00:04, 44.98it/s][A
190it [00:04, 45.36it/s][A
195it [00:04, 45.13it/s][A
200it [00:04, 45.38it/s][A
205it [00:04, 45.29it/s][A

Epoch: 71, Step: 200, Loss: 4.764356892108918



210it [00:04, 45.08it/s][A
215it [00:04, 45.29it/s][A
220it [00:04, 45.38it/s][A
227it [00:05, 45.11it/s]
 14%|█▍        | 71/500 [08:07<55:12,  7.72s/it]  
0it [00:00, ?it/s][A
5it [00:00, 46.02it/s][A
10it [00:00, 46.01it/s][A
15it [00:00, 45.67it/s][A
20it [00:00, 46.01it/s][A
25it [00:00, 45.86it/s][A
30it [00:00, 45.83it/s][A
35it [00:00, 45.86it/s][A
40it [00:00, 45.31it/s][A
45it [00:00, 44.67it/s][A
50it [00:01, 45.13it/s][A
55it [00:01, 45.37it/s][A
60it [00:01, 45.66it/s][A
65it [00:01, 45.74it/s][A
70it [00:01, 45.89it/s][A
75it [00:01, 45.84it/s][A
80it [00:01, 46.02it/s][A
85it [00:01, 45.27it/s][A
90it [00:01, 45.59it/s][A
95it [00:02, 45.92it/s][A
100it [00:02, 45.77it/s][A
105it [00:02, 45.82it/s][A

Epoch: 72, Step: 100, Loss: 4.744762539863586



110it [00:02, 45.54it/s][A
115it [00:02, 45.58it/s][A
120it [00:02, 45.58it/s][A
125it [00:02, 45.59it/s][A
130it [00:02, 45.15it/s][A
135it [00:02, 45.12it/s][A
140it [00:03, 45.23it/s][A
145it [00:03, 45.29it/s][A
150it [00:03, 45.32it/s][A
155it [00:03, 44.64it/s][A
160it [00:03, 44.55it/s][A
165it [00:03, 44.79it/s][A
170it [00:03, 44.93it/s][A
175it [00:03, 44.94it/s][A
180it [00:03, 44.85it/s][A
185it [00:04, 44.13it/s][A
190it [00:04, 44.15it/s][A
195it [00:04, 44.17it/s][A
200it [00:04, 44.44it/s][A
205it [00:04, 44.55it/s][A

Epoch: 72, Step: 200, Loss: 4.759185998439789



210it [00:04, 44.85it/s][A
215it [00:04, 45.23it/s][A
220it [00:04, 44.80it/s][A
227it [00:05, 45.23it/s]
 14%|█▍        | 72/500 [08:12<49:18,  6.91s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.14it/s][A
10it [00:00, 43.98it/s][A
15it [00:00, 44.62it/s][A
20it [00:00, 44.78it/s][A
25it [00:00, 44.83it/s][A
30it [00:00, 44.72it/s][A
35it [00:00, 45.01it/s][A
40it [00:00, 44.94it/s][A
45it [00:01, 45.02it/s][A
50it [00:01, 44.84it/s][A
55it [00:01, 44.96it/s][A
60it [00:01, 45.13it/s][A
65it [00:01, 45.32it/s][A
70it [00:01, 45.19it/s][A
75it [00:01, 44.95it/s][A
80it [00:01, 45.31it/s][A
85it [00:01, 45.66it/s][A
90it [00:02, 44.75it/s][A
95it [00:02, 44.85it/s][A
100it [00:02, 45.18it/s][A
105it [00:02, 45.39it/s][A

Epoch: 73, Step: 100, Loss: 4.751061449050903



110it [00:02, 45.56it/s][A
115it [00:02, 45.69it/s][A
120it [00:02, 45.79it/s][A
125it [00:02, 45.22it/s][A
130it [00:02, 44.91it/s][A
135it [00:03, 44.53it/s][A
140it [00:03, 44.85it/s][A
145it [00:03, 45.02it/s][A
150it [00:03, 45.10it/s][A
155it [00:03, 45.35it/s][A
160it [00:03, 44.56it/s][A
165it [00:03, 44.83it/s][A
170it [00:03, 44.62it/s][A
175it [00:03, 44.05it/s][A
180it [00:04, 44.12it/s][A
185it [00:04, 44.25it/s][A
190it [00:04, 44.84it/s][A
195it [00:04, 45.21it/s][A
200it [00:04, 45.41it/s][A
205it [00:04, 45.48it/s][A

Epoch: 73, Step: 200, Loss: 4.759843258857727



210it [00:04, 45.45it/s][A
215it [00:04, 45.60it/s][A
220it [00:04, 45.67it/s][A
227it [00:05, 45.03it/s]
 15%|█▍        | 73/500 [08:17<45:12,  6.35s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.07it/s][A
10it [00:00, 45.93it/s][A
15it [00:00, 44.85it/s][A
20it [00:00, 45.14it/s][A
25it [00:00, 45.16it/s][A
30it [00:00, 45.33it/s][A
35it [00:00, 45.35it/s][A
40it [00:00, 45.59it/s][A
45it [00:00, 45.55it/s][A
50it [00:01, 44.65it/s][A
55it [00:01, 44.89it/s][A
60it [00:01, 44.61it/s][A
65it [00:01, 44.38it/s][A
70it [00:01, 44.43it/s][A
75it [00:01, 43.97it/s][A
80it [00:01, 44.48it/s][A
85it [00:01, 44.65it/s][A
90it [00:02, 42.97it/s][A
95it [00:02, 43.67it/s][A
100it [00:02, 43.77it/s][A
105it [00:02, 44.07it/s][A

Epoch: 74, Step: 100, Loss: 4.7354475784301755



110it [00:02, 43.88it/s][A
115it [00:02, 43.82it/s][A
120it [00:02, 43.78it/s][A
125it [00:02, 44.44it/s][A
130it [00:02, 44.73it/s][A
135it [00:03, 44.93it/s][A
140it [00:03, 45.12it/s][A
145it [00:03, 44.66it/s][A
150it [00:03, 44.87it/s][A
155it [00:03, 44.40it/s][A
160it [00:03, 44.87it/s][A
165it [00:03, 44.85it/s][A
170it [00:03, 45.15it/s][A
175it [00:03, 44.96it/s][A
180it [00:04, 44.35it/s][A
185it [00:04, 44.79it/s][A
190it [00:04, 44.78it/s][A
195it [00:04, 44.88it/s][A
200it [00:04, 44.60it/s][A
205it [00:04, 44.99it/s][A

Epoch: 74, Step: 200, Loss: 4.75389666557312



210it [00:04, 44.97it/s][A
215it [00:04, 44.95it/s][A
220it [00:04, 44.78it/s][A
227it [00:05, 44.58it/s]
 15%|█▍        | 74/500 [08:22<42:25,  5.97s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.33it/s][A
10it [00:00, 44.75it/s][A
15it [00:00, 44.08it/s][A
20it [00:00, 44.63it/s][A
25it [00:00, 44.87it/s][A
30it [00:00, 45.04it/s][A
35it [00:00, 45.21it/s][A
40it [00:00, 45.34it/s][A
45it [00:00, 45.54it/s][A
50it [00:01, 45.63it/s][A
55it [00:01, 45.09it/s][A
60it [00:01, 45.31it/s][A
65it [00:01, 45.35it/s][A
70it [00:01, 45.75it/s][A
75it [00:01, 46.07it/s][A
80it [00:01, 46.25it/s][A
85it [00:01, 46.11it/s][A
90it [00:01, 45.00it/s][A
95it [00:02, 45.26it/s][A
100it [00:02, 44.89it/s][A
105it [00:02, 44.90it/s][A

Epoch: 75, Step: 100, Loss: 4.742854108810425



110it [00:02, 45.39it/s][A
115it [00:02, 45.79it/s][A
120it [00:02, 46.15it/s][A
125it [00:02, 46.33it/s][A
130it [00:02, 46.22it/s][A
135it [00:02, 45.86it/s][A
140it [00:03, 45.57it/s][A
145it [00:03, 45.26it/s][A
150it [00:03, 45.41it/s][A
155it [00:03, 45.63it/s][A
160it [00:03, 45.78it/s][A
165it [00:03, 46.06it/s][A
170it [00:03, 46.04it/s][A
175it [00:03, 45.82it/s][A
180it [00:03, 45.87it/s][A
185it [00:04, 45.22it/s][A
190it [00:04, 45.08it/s][A
195it [00:04, 45.37it/s][A
200it [00:04, 45.00it/s][A
205it [00:04, 45.61it/s][A

Epoch: 75, Step: 200, Loss: 4.754009582996368



210it [00:04, 45.64it/s][A
215it [00:04, 45.70it/s][A
220it [00:04, 45.22it/s][A
227it [00:04, 45.45it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.91it/s][A
12it [00:00, 58.17it/s][A
18it [00:00, 58.70it/s][A
24it [00:00, 58.79it/s][A
31it [00:00, 60.29it/s][A
38it [00:00, 59.88it/s][A
45it [00:00, 60.12it/s][A
52it [00:00, 60.28it/s][A
59it [00:00, 60.35it/s][A
66it [00:01, 60.44it/s][A
73it [00:01, 60.49it/s][A
80it [00:01, 59.79it/s][A
86it [00:01, 59.77it/s][A
93it [00:01, 60.21it/s][A
100it [00:01, 60.15it/s][A
107it [00:01, 60.03it/s][A
114it [00:01, 60.43it/s][A
121it [00:02, 60.65it/s][A
128it [00:02, 60.44it/s][A
135it [00:02, 60.60it/s][A
142it [00:02, 60.70it/s][A
149it [00:02, 60.87it/s][A
156it [00:02, 60.87it/s][A
163it [00:02, 60.87it/s][A
170it [00:02, 60.55it/s][A
177it [00:02, 60.60it/s][A
184it [00:03, 60.16it/s][A
191it [00:03, 60.22it/s][A
198it [00:03, 60.32it/s][A
205it [00:03, 60.16it/s][A
212it [00:03, 60.31it/s][A
219it [00:03, 


Epoch: 75, Test Loss: 5.409318533743391, Test Perplexity: 224.0928011829068




0it [00:00, ?it/s][A
5it [00:00, 45.03it/s][A
10it [00:00, 44.95it/s][A
15it [00:00, 43.98it/s][A
20it [00:00, 44.53it/s][A
25it [00:00, 43.89it/s][A
30it [00:00, 43.88it/s][A
35it [00:00, 43.95it/s][A
40it [00:00, 43.61it/s][A
45it [00:01, 43.83it/s][A
50it [00:01, 42.89it/s][A
55it [00:01, 43.48it/s][A
60it [00:01, 43.95it/s][A
65it [00:01, 44.47it/s][A
70it [00:01, 44.99it/s][A
75it [00:01, 45.38it/s][A
80it [00:01, 45.63it/s][A
85it [00:01, 45.90it/s][A
90it [00:02, 45.80it/s][A
95it [00:02, 45.79it/s][A
100it [00:02, 45.74it/s][A
105it [00:02, 45.80it/s][A

Epoch: 76, Step: 100, Loss: 4.7436652183532715



110it [00:02, 45.81it/s][A
115it [00:02, 45.80it/s][A
120it [00:02, 45.88it/s][A
125it [00:02, 45.92it/s][A
130it [00:02, 46.00it/s][A
135it [00:02, 46.10it/s][A
140it [00:03, 46.03it/s][A
145it [00:03, 45.92it/s][A
150it [00:03, 45.46it/s][A
155it [00:03, 45.59it/s][A
160it [00:03, 45.80it/s][A
165it [00:03, 45.82it/s][A
170it [00:03, 46.04it/s][A
175it [00:03, 45.99it/s][A
180it [00:03, 46.02it/s][A
185it [00:04, 46.03it/s][A
190it [00:04, 45.84it/s][A
195it [00:04, 45.84it/s][A
200it [00:04, 45.93it/s][A
205it [00:04, 45.92it/s][A

Epoch: 76, Step: 200, Loss: 4.75453385591507



210it [00:04, 45.81it/s][A
215it [00:04, 45.84it/s][A
220it [00:04, 45.99it/s][A
227it [00:05, 45.34it/s]
 15%|█▌        | 76/500 [08:43<54:38,  7.73s/it]  
0it [00:00, ?it/s][A
5it [00:00, 42.17it/s][A
10it [00:00, 44.01it/s][A
15it [00:00, 44.68it/s][A
20it [00:00, 44.67it/s][A
25it [00:00, 45.10it/s][A
30it [00:00, 45.36it/s][A
35it [00:00, 45.04it/s][A
40it [00:00, 45.07it/s][A
45it [00:01, 44.70it/s][A
50it [00:01, 44.19it/s][A
55it [00:01, 44.58it/s][A
60it [00:01, 44.78it/s][A
65it [00:01, 45.22it/s][A
70it [00:01, 45.38it/s][A
75it [00:01, 45.38it/s][A
80it [00:01, 45.12it/s][A
85it [00:01, 45.06it/s][A
90it [00:02, 45.24it/s][A
95it [00:02, 45.46it/s][A
100it [00:02, 45.58it/s][A
105it [00:02, 45.68it/s][A

Epoch: 77, Step: 100, Loss: 4.7442197370529176



110it [00:02, 45.80it/s][A
115it [00:02, 45.66it/s][A
120it [00:02, 45.90it/s][A
125it [00:02, 45.74it/s][A
130it [00:02, 45.72it/s][A
135it [00:02, 45.32it/s][A
140it [00:03, 45.12it/s][A
145it [00:03, 44.53it/s][A
150it [00:03, 44.07it/s][A
155it [00:03, 44.43it/s][A
160it [00:03, 44.57it/s][A
165it [00:03, 44.62it/s][A
170it [00:03, 44.96it/s][A
175it [00:03, 45.02it/s][A
180it [00:03, 45.07it/s][A
185it [00:04, 45.24it/s][A
190it [00:04, 44.67it/s][A
195it [00:04, 44.80it/s][A
200it [00:04, 44.97it/s][A
205it [00:04, 44.91it/s][A

Epoch: 77, Step: 200, Loss: 4.751196730136871



210it [00:04, 44.90it/s][A
215it [00:04, 45.14it/s][A
220it [00:04, 45.39it/s][A
227it [00:05, 45.04it/s]
 15%|█▌        | 77/500 [08:48<48:49,  6.93s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.97it/s][A
10it [00:00, 45.85it/s][A
15it [00:00, 45.92it/s][A
20it [00:00, 46.04it/s][A
25it [00:00, 45.94it/s][A
30it [00:00, 45.78it/s][A
35it [00:00, 45.72it/s][A
40it [00:00, 45.73it/s][A
45it [00:00, 45.83it/s][A
50it [00:01, 45.78it/s][A
55it [00:01, 45.00it/s][A
60it [00:01, 44.96it/s][A
65it [00:01, 45.14it/s][A
70it [00:01, 45.21it/s][A
75it [00:01, 45.29it/s][A
80it [00:01, 45.37it/s][A
85it [00:01, 45.50it/s][A
90it [00:01, 45.56it/s][A
95it [00:02, 45.32it/s][A
100it [00:02, 44.60it/s][A
105it [00:02, 44.56it/s][A

Epoch: 78, Step: 100, Loss: 4.74250786781311



110it [00:02, 44.23it/s][A
115it [00:02, 44.43it/s][A
120it [00:02, 44.50it/s][A
125it [00:02, 44.68it/s][A
130it [00:02, 45.19it/s][A
135it [00:02, 45.09it/s][A
140it [00:03, 45.22it/s][A
145it [00:03, 45.14it/s][A
150it [00:03, 45.15it/s][A
155it [00:03, 45.28it/s][A
160it [00:03, 45.37it/s][A
165it [00:03, 44.52it/s][A
170it [00:03, 44.99it/s][A
175it [00:03, 45.05it/s][A
180it [00:03, 45.29it/s][A
185it [00:04, 45.31it/s][A
190it [00:04, 44.49it/s][A
195it [00:04, 43.96it/s][A
200it [00:04, 44.73it/s][A
205it [00:04, 45.02it/s][A

Epoch: 78, Step: 200, Loss: 4.752372317314148



210it [00:04, 45.21it/s][A
215it [00:04, 45.19it/s][A
220it [00:04, 45.49it/s][A
227it [00:05, 45.16it/s]
 16%|█▌        | 78/500 [08:53<44:42,  6.36s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.56it/s][A
10it [00:00, 44.64it/s][A
15it [00:00, 45.17it/s][A
20it [00:00, 45.42it/s][A
25it [00:00, 45.41it/s][A
30it [00:00, 45.37it/s][A
35it [00:00, 45.44it/s][A
40it [00:00, 45.52it/s][A
45it [00:00, 45.37it/s][A
50it [00:01, 45.24it/s][A
55it [00:01, 44.51it/s][A
60it [00:01, 44.16it/s][A
65it [00:01, 44.80it/s][A
70it [00:01, 45.12it/s][A
75it [00:01, 45.28it/s][A
80it [00:01, 45.33it/s][A
85it [00:01, 45.66it/s][A
90it [00:01, 45.62it/s][A
95it [00:02, 45.59it/s][A
100it [00:02, 45.87it/s][A
105it [00:02, 46.05it/s][A

Epoch: 79, Step: 100, Loss: 4.729995746612548



110it [00:02, 46.07it/s][A
115it [00:02, 46.29it/s][A
120it [00:02, 46.37it/s][A
125it [00:02, 46.12it/s][A
130it [00:02, 46.09it/s][A
135it [00:02, 45.86it/s][A
140it [00:03, 45.79it/s][A
145it [00:03, 45.81it/s][A
150it [00:03, 45.95it/s][A
155it [00:03, 46.11it/s][A
160it [00:03, 45.75it/s][A
165it [00:03, 46.09it/s][A
170it [00:03, 46.31it/s][A
175it [00:03, 46.46it/s][A
180it [00:03, 46.55it/s][A
185it [00:04, 46.67it/s][A
190it [00:04, 46.60it/s][A
195it [00:04, 46.71it/s][A
200it [00:04, 46.00it/s][A
205it [00:04, 46.29it/s][A

Epoch: 79, Step: 200, Loss: 4.747496342658996



210it [00:04, 46.34it/s][A
215it [00:04, 45.61it/s][A
220it [00:04, 45.53it/s][A
227it [00:04, 45.68it/s]
 16%|█▌        | 79/500 [08:58<41:41,  5.94s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.68it/s][A
10it [00:00, 45.86it/s][A
15it [00:00, 46.15it/s][A
20it [00:00, 46.00it/s][A
25it [00:00, 45.94it/s][A
30it [00:00, 46.12it/s][A
35it [00:00, 46.20it/s][A
40it [00:00, 45.17it/s][A
45it [00:00, 45.62it/s][A
50it [00:01, 45.67it/s][A
55it [00:01, 45.75it/s][A
60it [00:01, 45.63it/s][A
65it [00:01, 45.86it/s][A
70it [00:01, 46.33it/s][A
75it [00:01, 46.28it/s][A
80it [00:01, 45.50it/s][A
85it [00:01, 45.39it/s][A
90it [00:01, 45.54it/s][A
95it [00:02, 45.36it/s][A
100it [00:02, 45.20it/s][A
105it [00:02, 45.38it/s][A

Epoch: 80, Step: 100, Loss: 4.722667355537414



110it [00:02, 45.01it/s][A
115it [00:02, 45.29it/s][A
120it [00:02, 45.04it/s][A
125it [00:02, 45.26it/s][A
130it [00:02, 45.11it/s][A
135it [00:02, 45.21it/s][A
140it [00:03, 44.51it/s][A
145it [00:03, 44.43it/s][A
150it [00:03, 43.78it/s][A
155it [00:03, 43.62it/s][A
160it [00:03, 43.27it/s][A
165it [00:03, 43.42it/s][A
170it [00:03, 43.82it/s][A
175it [00:03, 43.88it/s][A
180it [00:03, 44.12it/s][A
185it [00:04, 44.22it/s][A
190it [00:04, 44.19it/s][A
195it [00:04, 44.36it/s][A
200it [00:04, 44.60it/s][A
205it [00:04, 44.90it/s][A

Epoch: 80, Step: 200, Loss: 4.743602321147919



210it [00:04, 44.85it/s][A
215it [00:04, 45.13it/s][A
220it [00:04, 44.65it/s][A
227it [00:05, 44.96it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.54it/s][A
13it [00:00, 59.96it/s][A
20it [00:00, 60.29it/s][A
27it [00:00, 60.37it/s][A
34it [00:00, 60.63it/s][A
41it [00:00, 60.84it/s][A
48it [00:00, 60.45it/s][A
55it [00:00, 60.30it/s][A
62it [00:01, 60.17it/s][A
69it [00:01, 60.07it/s][A
76it [00:01, 60.06it/s][A
83it [00:01, 59.96it/s][A
89it [00:01, 59.62it/s][A
96it [00:01, 59.94it/s][A
103it [00:01, 60.14it/s][A
110it [00:01, 60.18it/s][A
117it [00:01, 60.24it/s][A
124it [00:02, 59.88it/s][A
130it [00:02, 59.86it/s][A
137it [00:02, 60.06it/s][A
144it [00:02, 60.31it/s][A
151it [00:02, 60.48it/s][A
158it [00:02, 60.58it/s][A
165it [00:02, 60.70it/s][A
172it [00:02, 60.61it/s][A
179it [00:02, 60.61it/s][A
186it [00:03, 60.31it/s][A
193it [00:03, 60.06it/s][A
200it [00:03, 60.19it/s][A
207it [00:03, 60.30it/s][A
214it [00:03, 60.50it/s][A
221it [00:03, 


Epoch: 80, Test Loss: 5.405390411430264, Test Perplexity: 223.32848351046167




0it [00:00, ?it/s][A
5it [00:00, 46.03it/s][A
10it [00:00, 45.97it/s][A
15it [00:00, 45.79it/s][A
20it [00:00, 45.51it/s][A
25it [00:00, 45.22it/s][A
30it [00:00, 45.49it/s][A
35it [00:00, 45.47it/s][A
40it [00:00, 45.51it/s][A
45it [00:00, 45.62it/s][A
50it [00:01, 45.89it/s][A
55it [00:01, 45.92it/s][A
60it [00:01, 45.74it/s][A
65it [00:01, 45.69it/s][A
70it [00:01, 45.51it/s][A
75it [00:01, 45.55it/s][A
80it [00:01, 45.20it/s][A
85it [00:01, 45.25it/s][A
90it [00:01, 45.45it/s][A
95it [00:02, 45.55it/s][A
100it [00:02, 45.47it/s][A
105it [00:02, 45.46it/s][A

Epoch: 81, Step: 100, Loss: 4.7261476850509645



110it [00:02, 44.92it/s][A
115it [00:02, 44.31it/s][A
120it [00:02, 44.99it/s][A
125it [00:02, 45.19it/s][A
130it [00:02, 45.38it/s][A
135it [00:02, 45.57it/s][A
140it [00:03, 45.89it/s][A
145it [00:03, 45.86it/s][A
150it [00:03, 45.76it/s][A
155it [00:03, 45.54it/s][A
160it [00:03, 45.39it/s][A
165it [00:03, 45.39it/s][A
170it [00:03, 45.12it/s][A
175it [00:03, 45.07it/s][A
180it [00:03, 44.80it/s][A
185it [00:04, 44.84it/s][A
190it [00:04, 45.03it/s][A
195it [00:04, 45.10it/s][A
200it [00:04, 44.79it/s][A
205it [00:04, 44.96it/s][A

Epoch: 81, Step: 200, Loss: 4.7410100555419925



210it [00:04, 44.36it/s][A
215it [00:04, 44.16it/s][A
220it [00:04, 44.49it/s][A
227it [00:05, 45.22it/s]
 16%|█▌        | 81/500 [09:19<53:59,  7.73s/it]  
0it [00:00, ?it/s][A
5it [00:00, 46.20it/s][A
10it [00:00, 46.23it/s][A
15it [00:00, 45.89it/s][A
20it [00:00, 45.46it/s][A
25it [00:00, 45.73it/s][A
30it [00:00, 45.78it/s][A
35it [00:00, 45.67it/s][A
40it [00:00, 45.86it/s][A
45it [00:00, 45.76it/s][A
50it [00:01, 45.79it/s][A
55it [00:01, 45.83it/s][A
60it [00:01, 45.84it/s][A
65it [00:01, 45.58it/s][A
70it [00:01, 44.88it/s][A
75it [00:01, 45.07it/s][A
80it [00:01, 45.31it/s][A
85it [00:01, 45.27it/s][A
90it [00:01, 45.20it/s][A
95it [00:02, 45.06it/s][A
100it [00:02, 45.37it/s][A
105it [00:02, 45.10it/s][A

Epoch: 82, Step: 100, Loss: 4.728482413291931



110it [00:02, 44.83it/s][A
115it [00:02, 45.18it/s][A
120it [00:02, 44.91it/s][A
125it [00:02, 45.22it/s][A
130it [00:02, 45.17it/s][A
135it [00:02, 45.24it/s][A
140it [00:03, 45.23it/s][A
145it [00:03, 45.33it/s][A
150it [00:03, 45.43it/s][A
155it [00:03, 45.50it/s][A
160it [00:03, 45.57it/s][A
165it [00:03, 45.68it/s][A
170it [00:03, 45.70it/s][A
175it [00:03, 45.36it/s][A
180it [00:03, 45.54it/s][A
185it [00:04, 45.48it/s][A
190it [00:04, 45.68it/s][A
195it [00:04, 45.67it/s][A
200it [00:04, 45.46it/s][A
205it [00:04, 45.55it/s][A

Epoch: 82, Step: 200, Loss: 4.739694371223449



210it [00:04, 44.73it/s][A
215it [00:04, 44.10it/s][A
220it [00:04, 44.36it/s][A
227it [00:05, 45.26it/s]
 16%|█▋        | 82/500 [09:24<48:11,  6.92s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.76it/s][A
10it [00:00, 45.02it/s][A
15it [00:00, 45.22it/s][A
20it [00:00, 45.35it/s][A
25it [00:00, 45.10it/s][A
30it [00:00, 45.02it/s][A
35it [00:00, 44.74it/s][A
40it [00:00, 44.76it/s][A
45it [00:01, 44.76it/s][A
50it [00:01, 44.83it/s][A
55it [00:01, 45.14it/s][A
60it [00:01, 45.10it/s][A
65it [00:01, 45.16it/s][A
70it [00:01, 45.36it/s][A
75it [00:01, 44.74it/s][A
80it [00:01, 45.15it/s][A
85it [00:01, 45.45it/s][A
90it [00:01, 45.38it/s][A
95it [00:02, 44.89it/s][A
100it [00:02, 45.04it/s][A
105it [00:02, 44.90it/s][A

Epoch: 83, Step: 100, Loss: 4.727716226577758



110it [00:02, 44.55it/s][A
115it [00:02, 44.87it/s][A
120it [00:02, 44.94it/s][A
125it [00:02, 45.00it/s][A
130it [00:02, 45.04it/s][A
135it [00:02, 45.27it/s][A
140it [00:03, 45.32it/s][A
145it [00:03, 45.09it/s][A
150it [00:03, 45.43it/s][A
155it [00:03, 45.78it/s][A
160it [00:03, 45.94it/s][A
165it [00:03, 46.24it/s][A
170it [00:03, 46.29it/s][A
175it [00:03, 46.35it/s][A
180it [00:03, 46.71it/s][A
185it [00:04, 46.70it/s][A
190it [00:04, 46.62it/s][A
195it [00:04, 46.77it/s][A
200it [00:04, 47.00it/s][A
205it [00:04, 46.78it/s][A

Epoch: 83, Step: 200, Loss: 4.739360201358795



210it [00:04, 46.70it/s][A
215it [00:04, 46.76it/s][A
220it [00:04, 46.70it/s][A
227it [00:04, 45.58it/s]
 17%|█▋        | 83/500 [09:29<44:02,  6.34s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.59it/s][A
10it [00:00, 45.55it/s][A
15it [00:00, 45.88it/s][A
20it [00:00, 45.79it/s][A
25it [00:00, 45.47it/s][A
30it [00:00, 46.02it/s][A
35it [00:00, 46.08it/s][A
40it [00:00, 45.86it/s][A
45it [00:00, 45.80it/s][A
50it [00:01, 45.78it/s][A
55it [00:01, 45.56it/s][A
60it [00:01, 45.18it/s][A
65it [00:01, 45.16it/s][A
70it [00:01, 45.45it/s][A
75it [00:01, 45.63it/s][A
80it [00:01, 45.78it/s][A
85it [00:01, 45.79it/s][A
90it [00:01, 45.91it/s][A
95it [00:02, 45.33it/s][A
100it [00:02, 45.46it/s][A
105it [00:02, 45.42it/s][A

Epoch: 84, Step: 100, Loss: 4.73411678314209



110it [00:02, 45.28it/s][A
115it [00:02, 45.51it/s][A
120it [00:02, 45.68it/s][A
125it [00:02, 45.64it/s][A
130it [00:02, 46.44it/s][A
135it [00:02, 46.42it/s][A
140it [00:03, 46.65it/s][A
145it [00:03, 46.53it/s][A
150it [00:03, 46.00it/s][A
155it [00:03, 45.91it/s][A
160it [00:03, 45.92it/s][A
165it [00:03, 46.07it/s][A
170it [00:03, 46.14it/s][A
175it [00:03, 46.01it/s][A
180it [00:03, 45.95it/s][A
185it [00:04, 45.16it/s][A
190it [00:04, 45.43it/s][A
195it [00:04, 45.22it/s][A
200it [00:04, 45.36it/s][A
205it [00:04, 45.68it/s][A

Epoch: 84, Step: 200, Loss: 4.734867420196533



210it [00:04, 45.69it/s][A
215it [00:04, 45.82it/s][A
220it [00:04, 45.80it/s][A
227it [00:04, 45.70it/s]
 17%|█▋        | 84/500 [09:34<41:05,  5.93s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.83it/s][A
10it [00:00, 45.87it/s][A
15it [00:00, 45.67it/s][A
20it [00:00, 45.44it/s][A
25it [00:00, 45.70it/s][A
30it [00:00, 44.92it/s][A
35it [00:00, 45.25it/s][A
40it [00:00, 45.23it/s][A
45it [00:00, 44.92it/s][A
50it [00:01, 44.33it/s][A
55it [00:01, 44.45it/s][A
60it [00:01, 44.58it/s][A
65it [00:01, 44.16it/s][A
70it [00:01, 44.77it/s][A
75it [00:01, 45.10it/s][A
80it [00:01, 45.20it/s][A
85it [00:01, 45.35it/s][A
90it [00:02, 44.81it/s][A
95it [00:02, 44.57it/s][A
100it [00:02, 44.13it/s][A
105it [00:02, 43.32it/s][A

Epoch: 85, Step: 100, Loss: 4.719993605613708



110it [00:02, 42.72it/s][A
115it [00:02, 43.51it/s][A
120it [00:02, 44.14it/s][A
125it [00:02, 44.64it/s][A
130it [00:02, 45.07it/s][A
135it [00:03, 45.19it/s][A
140it [00:03, 45.04it/s][A
145it [00:03, 44.42it/s][A
150it [00:03, 44.23it/s][A
155it [00:03, 44.51it/s][A
160it [00:03, 44.96it/s][A
165it [00:03, 45.28it/s][A
170it [00:03, 45.59it/s][A
175it [00:03, 45.63it/s][A
180it [00:04, 45.70it/s][A
185it [00:04, 45.25it/s][A
190it [00:04, 45.55it/s][A
195it [00:04, 45.51it/s][A
200it [00:04, 45.51it/s][A
205it [00:04, 45.43it/s][A

Epoch: 85, Step: 200, Loss: 4.734386644363403



210it [00:04, 45.46it/s][A
215it [00:04, 45.78it/s][A
220it [00:04, 45.73it/s][A
227it [00:05, 44.96it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.42it/s][A
13it [00:00, 59.87it/s][A
19it [00:00, 59.92it/s][A
25it [00:00, 59.60it/s][A
31it [00:00, 59.63it/s][A
37it [00:00, 59.37it/s][A
44it [00:00, 59.93it/s][A
51it [00:00, 60.29it/s][A
58it [00:00, 60.54it/s][A
65it [00:01, 60.33it/s][A
72it [00:01, 60.22it/s][A
79it [00:01, 60.19it/s][A
86it [00:01, 59.73it/s][A
93it [00:01, 60.12it/s][A
100it [00:01, 60.33it/s][A
107it [00:01, 60.41it/s][A
114it [00:01, 60.44it/s][A
121it [00:02, 60.52it/s][A
128it [00:02, 60.46it/s][A
135it [00:02, 60.26it/s][A
142it [00:02, 60.01it/s][A
149it [00:02, 59.94it/s][A
156it [00:02, 60.14it/s][A
163it [00:02, 59.92it/s][A
170it [00:02, 60.11it/s][A
177it [00:02, 60.21it/s][A
184it [00:03, 60.41it/s][A
191it [00:03, 60.40it/s][A
198it [00:03, 60.06it/s][A
205it [00:03, 59.89it/s][A
211it [00:03, 59.70it/s][A
217it [00:03, 


Epoch: 85, Test Loss: 5.409943291119167, Test Perplexity: 224.37828525993393




0it [00:00, ?it/s][A
5it [00:00, 46.45it/s][A
10it [00:00, 46.42it/s][A
15it [00:00, 46.11it/s][A
20it [00:00, 45.63it/s][A
25it [00:00, 45.75it/s][A
30it [00:00, 45.23it/s][A
35it [00:00, 45.40it/s][A
40it [00:00, 45.14it/s][A
45it [00:00, 45.36it/s][A
50it [00:01, 45.68it/s][A
55it [00:01, 44.84it/s][A
60it [00:01, 44.32it/s][A
65it [00:01, 44.92it/s][A
70it [00:01, 45.18it/s][A
75it [00:01, 44.58it/s][A
80it [00:01, 44.89it/s][A
85it [00:01, 44.93it/s][A
90it [00:01, 45.15it/s][A
95it [00:02, 45.35it/s][A
100it [00:02, 45.47it/s][A
105it [00:02, 45.59it/s][A

Epoch: 86, Step: 100, Loss: 4.7194359540939335



110it [00:02, 45.43it/s][A
115it [00:02, 45.53it/s][A
120it [00:02, 45.37it/s][A
125it [00:02, 45.21it/s][A
130it [00:02, 45.33it/s][A
135it [00:02, 45.40it/s][A
140it [00:03, 45.52it/s][A
145it [00:03, 45.54it/s][A
150it [00:03, 45.67it/s][A
155it [00:03, 45.80it/s][A
160it [00:03, 45.93it/s][A
165it [00:03, 45.90it/s][A
170it [00:03, 45.21it/s][A
175it [00:03, 45.58it/s][A
180it [00:03, 45.70it/s][A
185it [00:04, 45.67it/s][A
190it [00:04, 45.80it/s][A
195it [00:04, 45.45it/s][A
200it [00:04, 45.30it/s][A
205it [00:04, 45.24it/s][A

Epoch: 86, Step: 200, Loss: 4.732020008563995



210it [00:04, 45.33it/s][A
215it [00:04, 45.15it/s][A
220it [00:04, 45.36it/s][A
227it [00:05, 45.35it/s]
 17%|█▋        | 86/500 [09:55<53:18,  7.73s/it]  
0it [00:00, ?it/s][A
5it [00:00, 45.25it/s][A
10it [00:00, 45.55it/s][A
15it [00:00, 45.30it/s][A
20it [00:00, 44.54it/s][A
25it [00:00, 44.98it/s][A
30it [00:00, 44.46it/s][A
35it [00:00, 44.01it/s][A
40it [00:00, 44.59it/s][A
45it [00:01, 44.81it/s][A
50it [00:01, 45.16it/s][A
55it [00:01, 45.22it/s][A
60it [00:01, 45.05it/s][A
65it [00:01, 45.01it/s][A
70it [00:01, 45.18it/s][A
75it [00:01, 45.28it/s][A
80it [00:01, 45.34it/s][A
85it [00:01, 45.24it/s][A
90it [00:01, 45.33it/s][A
95it [00:02, 45.18it/s][A
100it [00:02, 45.45it/s][A
105it [00:02, 45.18it/s][A

Epoch: 87, Step: 100, Loss: 4.720080571174622



110it [00:02, 44.91it/s][A
115it [00:02, 45.04it/s][A
120it [00:02, 44.55it/s][A
125it [00:02, 44.95it/s][A
130it [00:02, 45.23it/s][A
135it [00:02, 45.49it/s][A
140it [00:03, 45.46it/s][A
145it [00:03, 45.13it/s][A
150it [00:03, 44.07it/s][A
155it [00:03, 44.62it/s][A
160it [00:03, 44.93it/s][A
165it [00:03, 44.85it/s][A
170it [00:03, 44.98it/s][A
175it [00:03, 44.92it/s][A
180it [00:04, 45.08it/s][A
185it [00:04, 44.92it/s][A
190it [00:04, 44.76it/s][A
195it [00:04, 44.66it/s][A
200it [00:04, 44.85it/s][A
205it [00:04, 44.96it/s][A

Epoch: 87, Step: 200, Loss: 4.732063353061676



210it [00:04, 45.27it/s][A
215it [00:04, 45.74it/s][A
220it [00:04, 46.15it/s][A
227it [00:05, 45.10it/s]
 17%|█▋        | 87/500 [10:00<47:37,  6.92s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.01it/s][A
10it [00:00, 45.08it/s][A
15it [00:00, 44.67it/s][A
20it [00:00, 45.08it/s][A
25it [00:00, 45.09it/s][A
30it [00:00, 45.36it/s][A
35it [00:00, 45.87it/s][A
40it [00:00, 46.09it/s][A
45it [00:00, 46.30it/s][A
50it [00:01, 46.69it/s][A
55it [00:01, 46.84it/s][A
60it [00:01, 46.83it/s][A
65it [00:01, 47.02it/s][A
70it [00:01, 47.03it/s][A
75it [00:01, 47.05it/s][A
80it [00:01, 47.28it/s][A
85it [00:01, 47.16it/s][A
90it [00:01, 46.99it/s][A
95it [00:02, 47.00it/s][A
100it [00:02, 46.83it/s][A
105it [00:02, 46.81it/s][A

Epoch: 88, Step: 100, Loss: 4.713689980506897



110it [00:02, 46.55it/s][A
115it [00:02, 46.65it/s][A
120it [00:02, 45.90it/s][A
125it [00:02, 46.23it/s][A
130it [00:02, 46.30it/s][A
135it [00:02, 46.08it/s][A
140it [00:03, 46.01it/s][A
145it [00:03, 46.31it/s][A
150it [00:03, 45.48it/s][A
155it [00:03, 45.94it/s][A
160it [00:03, 45.94it/s][A
165it [00:03, 45.97it/s][A
170it [00:03, 46.19it/s][A
175it [00:03, 46.22it/s][A
180it [00:03, 46.16it/s][A
185it [00:04, 45.83it/s][A
190it [00:04, 46.46it/s][A
195it [00:04, 46.38it/s][A
200it [00:04, 46.53it/s][A
205it [00:04, 46.21it/s][A

Epoch: 88, Step: 200, Loss: 4.729729447364807



210it [00:04, 45.86it/s][A
215it [00:04, 45.78it/s][A
220it [00:04, 46.03it/s][A
227it [00:04, 46.22it/s]
 18%|█▊        | 88/500 [10:05<43:22,  6.32s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.29it/s][A
10it [00:00, 45.79it/s][A
15it [00:00, 46.14it/s][A
20it [00:00, 45.97it/s][A
25it [00:00, 45.94it/s][A
30it [00:00, 45.91it/s][A
35it [00:00, 44.80it/s][A
40it [00:00, 45.30it/s][A
45it [00:00, 45.38it/s][A
50it [00:01, 45.25it/s][A
55it [00:01, 45.38it/s][A
60it [00:01, 45.64it/s][A
65it [00:01, 45.51it/s][A
70it [00:01, 45.33it/s][A
75it [00:01, 45.48it/s][A
80it [00:01, 45.72it/s][A
85it [00:01, 45.80it/s][A
90it [00:01, 45.31it/s][A
95it [00:02, 45.52it/s][A
100it [00:02, 45.26it/s][A
105it [00:02, 45.43it/s][A

Epoch: 89, Step: 100, Loss: 4.713501052856445



110it [00:02, 45.42it/s][A
115it [00:02, 45.53it/s][A
120it [00:02, 45.50it/s][A
125it [00:02, 45.44it/s][A
130it [00:02, 45.52it/s][A
135it [00:02, 45.64it/s][A
140it [00:03, 45.59it/s][A
145it [00:03, 45.59it/s][A
150it [00:03, 45.08it/s][A
155it [00:03, 45.22it/s][A
160it [00:03, 44.63it/s][A
165it [00:03, 45.11it/s][A
170it [00:03, 45.36it/s][A
175it [00:03, 45.15it/s][A
180it [00:03, 45.23it/s][A
185it [00:04, 45.55it/s][A
190it [00:04, 45.35it/s][A
195it [00:04, 44.99it/s][A
200it [00:04, 45.13it/s][A
205it [00:04, 45.18it/s][A

Epoch: 89, Step: 200, Loss: 4.728921856880188



210it [00:04, 44.42it/s][A
215it [00:04, 44.01it/s][A
220it [00:04, 44.32it/s][A
227it [00:05, 45.26it/s]
 18%|█▊        | 89/500 [10:10<40:36,  5.93s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.39it/s][A
10it [00:00, 44.55it/s][A
15it [00:00, 44.80it/s][A
20it [00:00, 44.61it/s][A
25it [00:00, 44.78it/s][A
30it [00:00, 44.26it/s][A
35it [00:00, 43.54it/s][A
40it [00:00, 43.86it/s][A
45it [00:01, 43.99it/s][A
50it [00:01, 44.66it/s][A
55it [00:01, 45.03it/s][A
60it [00:01, 45.10it/s][A
65it [00:01, 45.22it/s][A
70it [00:01, 45.22it/s][A
75it [00:01, 44.98it/s][A
80it [00:01, 45.34it/s][A
85it [00:01, 45.50it/s][A
90it [00:02, 45.25it/s][A
95it [00:02, 45.49it/s][A
100it [00:02, 45.57it/s][A
105it [00:02, 45.50it/s][A

Epoch: 90, Step: 100, Loss: 4.716008186340332



110it [00:02, 45.30it/s][A
115it [00:02, 45.29it/s][A
120it [00:02, 45.14it/s][A
125it [00:02, 45.29it/s][A
130it [00:02, 45.58it/s][A
135it [00:02, 45.74it/s][A
140it [00:03, 45.89it/s][A
145it [00:03, 46.01it/s][A
150it [00:03, 46.05it/s][A
155it [00:03, 45.37it/s][A
160it [00:03, 45.38it/s][A
165it [00:03, 45.59it/s][A
170it [00:03, 44.94it/s][A
175it [00:03, 45.24it/s][A
180it [00:03, 45.52it/s][A
185it [00:04, 45.66it/s][A
190it [00:04, 44.83it/s][A
195it [00:04, 45.11it/s][A
200it [00:04, 45.30it/s][A
205it [00:04, 45.51it/s][A

Epoch: 90, Step: 200, Loss: 4.728647377490997



210it [00:04, 45.36it/s][A
215it [00:04, 45.57it/s][A
220it [00:04, 45.60it/s][A
227it [00:05, 45.15it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.45it/s][A
13it [00:00, 59.75it/s][A
19it [00:00, 59.80it/s][A
25it [00:00, 59.81it/s][A
32it [00:00, 60.26it/s][A
39it [00:00, 60.21it/s][A
46it [00:00, 58.56it/s][A
53it [00:00, 59.27it/s][A
60it [00:01, 59.75it/s][A
66it [00:01, 59.80it/s][A
72it [00:01, 59.70it/s][A
79it [00:01, 60.10it/s][A
86it [00:01, 58.75it/s][A
93it [00:01, 59.45it/s][A
99it [00:01, 59.59it/s][A
106it [00:01, 60.06it/s][A
113it [00:01, 58.70it/s][A
119it [00:02, 58.86it/s][A
126it [00:02, 59.68it/s][A
132it [00:02, 59.62it/s][A
139it [00:02, 60.08it/s][A
146it [00:02, 60.18it/s][A
153it [00:02, 59.49it/s][A
160it [00:02, 59.89it/s][A
166it [00:02, 58.89it/s][A
172it [00:02, 58.11it/s][A
179it [00:03, 59.03it/s][A
185it [00:03, 58.99it/s][A
192it [00:03, 59.49it/s][A
198it [00:03, 59.44it/s][A
205it [00:03, 59.86it/s][A
212it [00:03, 6


Epoch: 90, Test Loss: 5.414180720815007, Test Perplexity: 225.29366830861346




0it [00:00, ?it/s][A
5it [00:00, 45.54it/s][A
10it [00:00, 45.30it/s][A
15it [00:00, 45.15it/s][A
20it [00:00, 45.33it/s][A
25it [00:00, 45.52it/s][A
30it [00:00, 45.36it/s][A
35it [00:00, 45.49it/s][A
40it [00:00, 45.19it/s][A
45it [00:00, 45.37it/s][A
50it [00:01, 45.59it/s][A
55it [00:01, 45.84it/s][A
60it [00:01, 44.94it/s][A
65it [00:01, 45.29it/s][A
70it [00:01, 45.45it/s][A
75it [00:01, 45.60it/s][A
80it [00:01, 45.08it/s][A
85it [00:01, 45.04it/s][A
90it [00:01, 45.42it/s][A
95it [00:02, 45.55it/s][A
100it [00:02, 45.33it/s][A
105it [00:02, 45.55it/s][A

Epoch: 91, Step: 100, Loss: 4.720094971656799



110it [00:02, 45.66it/s][A
115it [00:02, 45.86it/s][A
120it [00:02, 45.65it/s][A
125it [00:02, 45.57it/s][A
130it [00:02, 45.52it/s][A
135it [00:02, 45.73it/s][A
140it [00:03, 45.81it/s][A
145it [00:03, 45.70it/s][A
150it [00:03, 45.59it/s][A
155it [00:03, 45.53it/s][A
160it [00:03, 45.23it/s][A
165it [00:03, 45.51it/s][A
170it [00:03, 45.31it/s][A
175it [00:03, 45.15it/s][A
180it [00:03, 45.20it/s][A
185it [00:04, 45.38it/s][A
190it [00:04, 45.50it/s][A
195it [00:04, 45.80it/s][A
200it [00:04, 45.17it/s][A
205it [00:04, 45.37it/s][A

Epoch: 91, Step: 200, Loss: 4.723913848400116



210it [00:04, 45.50it/s][A
215it [00:04, 45.56it/s][A
220it [00:04, 45.62it/s][A
227it [00:04, 45.44it/s]
 18%|█▊        | 91/500 [10:31<52:40,  7.73s/it]  
0it [00:00, ?it/s][A
5it [00:00, 46.24it/s][A
10it [00:00, 45.90it/s][A
15it [00:00, 46.04it/s][A
20it [00:00, 45.15it/s][A
25it [00:00, 45.55it/s][A
30it [00:00, 45.62it/s][A
35it [00:00, 45.46it/s][A
40it [00:00, 45.66it/s][A
45it [00:00, 44.95it/s][A
50it [00:01, 45.55it/s][A
55it [00:01, 45.92it/s][A
60it [00:01, 46.32it/s][A
65it [00:01, 46.67it/s][A
70it [00:01, 46.90it/s][A
75it [00:01, 46.71it/s][A
80it [00:01, 46.18it/s][A
85it [00:01, 45.99it/s][A
90it [00:01, 45.88it/s][A
95it [00:02, 46.01it/s][A
100it [00:02, 46.00it/s][A
105it [00:02, 46.18it/s][A

Epoch: 92, Step: 100, Loss: 4.720780415534973



110it [00:02, 45.91it/s][A
115it [00:02, 46.02it/s][A
120it [00:02, 46.04it/s][A
125it [00:02, 45.59it/s][A
130it [00:02, 45.36it/s][A
135it [00:02, 45.26it/s][A
140it [00:03, 44.79it/s][A
145it [00:03, 44.76it/s][A
150it [00:03, 45.31it/s][A
155it [00:03, 45.41it/s][A
160it [00:03, 45.57it/s][A
165it [00:03, 45.13it/s][A
170it [00:03, 45.39it/s][A
175it [00:03, 45.38it/s][A
180it [00:03, 45.38it/s][A
185it [00:04, 45.10it/s][A
190it [00:04, 45.49it/s][A
195it [00:04, 45.76it/s][A
200it [00:04, 45.43it/s][A
205it [00:04, 45.62it/s][A

Epoch: 92, Step: 200, Loss: 4.7257307076454165



210it [00:04, 45.49it/s][A
215it [00:04, 45.40it/s][A
220it [00:04, 45.09it/s][A
227it [00:04, 45.57it/s]
 18%|█▊        | 92/500 [10:36<46:57,  6.91s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.29it/s][A
10it [00:00, 45.97it/s][A
15it [00:00, 45.72it/s][A
20it [00:00, 45.31it/s][A
25it [00:00, 45.03it/s][A
30it [00:00, 45.31it/s][A
35it [00:00, 45.54it/s][A
40it [00:00, 45.90it/s][A
45it [00:00, 46.09it/s][A
50it [00:01, 46.39it/s][A
55it [00:01, 46.04it/s][A
60it [00:01, 45.96it/s][A
65it [00:01, 45.92it/s][A
70it [00:01, 45.72it/s][A
75it [00:01, 45.64it/s][A
80it [00:01, 45.57it/s][A
85it [00:01, 45.19it/s][A
90it [00:01, 45.28it/s][A
95it [00:02, 44.60it/s][A
100it [00:02, 44.88it/s][A
105it [00:02, 45.21it/s][A

Epoch: 93, Step: 100, Loss: 4.719469170570374



110it [00:02, 45.28it/s][A
115it [00:02, 44.99it/s][A
120it [00:02, 44.93it/s][A
125it [00:02, 45.39it/s][A
130it [00:02, 45.46it/s][A
135it [00:02, 45.60it/s][A
140it [00:03, 45.57it/s][A
145it [00:03, 45.52it/s][A
150it [00:03, 45.69it/s][A
155it [00:03, 45.50it/s][A
160it [00:03, 45.57it/s][A
165it [00:03, 45.50it/s][A
170it [00:03, 45.61it/s][A
175it [00:03, 45.56it/s][A
180it [00:03, 45.54it/s][A
185it [00:04, 45.54it/s][A
190it [00:04, 45.66it/s][A
195it [00:04, 45.40it/s][A
200it [00:04, 45.57it/s][A
205it [00:04, 44.92it/s][A

Epoch: 93, Step: 200, Loss: 4.723190457820892



210it [00:04, 45.21it/s][A
215it [00:04, 45.36it/s][A
220it [00:04, 45.15it/s][A
227it [00:04, 45.45it/s]
 19%|█▊        | 93/500 [10:41<42:57,  6.33s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.50it/s][A
10it [00:00, 46.12it/s][A
15it [00:00, 45.97it/s][A
20it [00:00, 45.90it/s][A
25it [00:00, 45.89it/s][A
30it [00:00, 45.89it/s][A
35it [00:00, 45.48it/s][A
40it [00:00, 45.09it/s][A
45it [00:00, 45.29it/s][A
50it [00:01, 45.43it/s][A
55it [00:01, 45.46it/s][A
60it [00:01, 45.52it/s][A
65it [00:01, 45.65it/s][A
70it [00:01, 45.68it/s][A
75it [00:01, 45.75it/s][A
80it [00:01, 45.56it/s][A
85it [00:01, 45.01it/s][A
90it [00:01, 45.26it/s][A
95it [00:02, 45.41it/s][A
100it [00:02, 45.63it/s][A
105it [00:02, 45.66it/s][A

Epoch: 94, Step: 100, Loss: 4.710491256713867



110it [00:02, 45.55it/s][A
115it [00:02, 44.67it/s][A
120it [00:02, 44.74it/s][A
125it [00:02, 44.96it/s][A
130it [00:02, 45.21it/s][A
135it [00:02, 45.33it/s][A
140it [00:03, 45.50it/s][A
145it [00:03, 44.79it/s][A
150it [00:03, 44.87it/s][A
155it [00:03, 44.73it/s][A
160it [00:03, 44.06it/s][A
165it [00:03, 44.41it/s][A
170it [00:03, 44.42it/s][A
175it [00:03, 44.74it/s][A
180it [00:03, 44.88it/s][A
185it [00:04, 43.96it/s][A
190it [00:04, 44.34it/s][A
195it [00:04, 43.92it/s][A
200it [00:04, 42.75it/s][A
205it [00:04, 43.06it/s][A

Epoch: 94, Step: 200, Loss: 4.718372180461883



210it [00:04, 43.78it/s][A
215it [00:04, 44.31it/s][A
220it [00:04, 44.74it/s][A
227it [00:05, 44.93it/s]
 19%|█▉        | 94/500 [10:46<40:15,  5.95s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.83it/s][A
10it [00:00, 45.02it/s][A
15it [00:00, 44.33it/s][A
20it [00:00, 45.05it/s][A
25it [00:00, 44.96it/s][A
30it [00:00, 44.55it/s][A
35it [00:00, 44.64it/s][A
40it [00:00, 44.91it/s][A
45it [00:01, 43.99it/s][A
50it [00:01, 43.53it/s][A
55it [00:01, 44.08it/s][A
60it [00:01, 44.44it/s][A
65it [00:01, 44.70it/s][A
70it [00:01, 45.07it/s][A
75it [00:01, 44.91it/s][A
80it [00:01, 44.71it/s][A
85it [00:01, 44.67it/s][A
90it [00:02, 45.05it/s][A
95it [00:02, 45.13it/s][A
100it [00:02, 45.21it/s][A
105it [00:02, 45.22it/s][A

Epoch: 95, Step: 100, Loss: 4.707922191619873



110it [00:02, 45.15it/s][A
115it [00:02, 45.03it/s][A
120it [00:02, 44.71it/s][A
125it [00:02, 43.65it/s][A
130it [00:02, 43.99it/s][A
135it [00:03, 43.70it/s][A
140it [00:03, 44.05it/s][A
145it [00:03, 44.40it/s][A
150it [00:03, 44.66it/s][A
155it [00:03, 44.71it/s][A
160it [00:03, 45.16it/s][A
165it [00:03, 45.11it/s][A
170it [00:03, 45.19it/s][A
175it [00:03, 45.35it/s][A
180it [00:04, 45.47it/s][A
185it [00:04, 44.70it/s][A
190it [00:04, 45.09it/s][A
195it [00:04, 45.13it/s][A
200it [00:04, 45.28it/s][A
205it [00:04, 45.29it/s][A

Epoch: 95, Step: 200, Loss: 4.71869616985321



210it [00:04, 45.22it/s][A
215it [00:04, 45.39it/s][A
220it [00:04, 45.53it/s][A
227it [00:05, 44.79it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.31it/s][A
13it [00:00, 60.32it/s][A
20it [00:00, 60.84it/s][A
27it [00:00, 60.42it/s][A
34it [00:00, 60.55it/s][A
41it [00:00, 60.91it/s][A
48it [00:00, 60.96it/s][A
55it [00:00, 61.06it/s][A
62it [00:01, 61.17it/s][A
69it [00:01, 61.35it/s][A
76it [00:01, 61.35it/s][A
83it [00:01, 61.28it/s][A
90it [00:01, 61.25it/s][A
97it [00:01, 61.24it/s][A
104it [00:01, 61.26it/s][A
111it [00:01, 61.17it/s][A
118it [00:01, 61.05it/s][A
125it [00:02, 60.45it/s][A
132it [00:02, 60.22it/s][A
139it [00:02, 60.46it/s][A
146it [00:02, 60.54it/s][A
153it [00:02, 60.58it/s][A
160it [00:02, 60.62it/s][A
167it [00:02, 60.66it/s][A
174it [00:02, 60.45it/s][A
181it [00:02, 59.38it/s][A
188it [00:03, 59.75it/s][A
195it [00:03, 59.98it/s][A
202it [00:03, 60.22it/s][A
209it [00:03, 60.40it/s][A
216it [00:03, 59.83it/s][A
222it [00:03, 


Epoch: 95, Test Loss: 5.411852248707173, Test Perplexity: 224.74813113005266




0it [00:00, ?it/s][A
5it [00:00, 45.96it/s][A
10it [00:00, 45.47it/s][A
15it [00:00, 45.35it/s][A
20it [00:00, 45.28it/s][A
25it [00:00, 44.45it/s][A
30it [00:00, 45.12it/s][A
35it [00:00, 45.45it/s][A
40it [00:00, 45.54it/s][A
45it [00:00, 45.42it/s][A
50it [00:01, 45.66it/s][A
55it [00:01, 45.15it/s][A
60it [00:01, 45.37it/s][A
65it [00:01, 45.65it/s][A
70it [00:01, 45.72it/s][A
75it [00:01, 45.67it/s][A
80it [00:01, 45.79it/s][A
85it [00:01, 45.50it/s][A
90it [00:01, 45.43it/s][A
95it [00:02, 45.56it/s][A
100it [00:02, 45.43it/s][A
105it [00:02, 45.56it/s][A

Epoch: 96, Step: 100, Loss: 4.70419981956482



110it [00:02, 45.71it/s][A
115it [00:02, 46.23it/s][A
120it [00:02, 46.42it/s][A
125it [00:02, 46.17it/s][A
130it [00:02, 46.46it/s][A
135it [00:02, 46.37it/s][A
140it [00:03, 45.58it/s][A
145it [00:03, 46.07it/s][A
150it [00:03, 46.46it/s][A
155it [00:03, 46.52it/s][A
160it [00:03, 46.85it/s][A
165it [00:03, 46.97it/s][A
170it [00:03, 47.10it/s][A
175it [00:03, 47.14it/s][A
180it [00:03, 47.09it/s][A
185it [00:04, 46.84it/s][A
190it [00:04, 46.75it/s][A
195it [00:04, 46.77it/s][A
200it [00:04, 46.75it/s][A
205it [00:04, 46.76it/s][A

Epoch: 96, Step: 200, Loss: 4.717828996181488



210it [00:04, 46.42it/s][A
215it [00:04, 46.24it/s][A
220it [00:04, 46.58it/s][A
227it [00:04, 46.07it/s]
 19%|█▉        | 96/500 [11:06<51:57,  7.72s/it]  
0it [00:00, ?it/s][A
5it [00:00, 46.22it/s][A
10it [00:00, 43.58it/s][A
15it [00:00, 44.13it/s][A
20it [00:00, 44.69it/s][A
25it [00:00, 45.37it/s][A
30it [00:00, 45.92it/s][A
35it [00:00, 45.98it/s][A
40it [00:00, 46.19it/s][A
45it [00:00, 46.21it/s][A
50it [00:01, 45.91it/s][A
55it [00:01, 45.42it/s][A
60it [00:01, 45.60it/s][A
65it [00:01, 45.15it/s][A
70it [00:01, 45.20it/s][A
75it [00:01, 45.42it/s][A
80it [00:01, 46.02it/s][A
85it [00:01, 46.51it/s][A
90it [00:01, 45.96it/s][A
95it [00:02, 46.09it/s][A
100it [00:02, 46.06it/s][A
105it [00:02, 46.69it/s][A

Epoch: 97, Step: 100, Loss: 4.694586772918701



110it [00:02, 47.16it/s][A
115it [00:02, 47.35it/s][A
120it [00:02, 47.04it/s][A
125it [00:02, 46.86it/s][A
130it [00:02, 46.44it/s][A
135it [00:02, 46.36it/s][A
140it [00:03, 46.02it/s][A
145it [00:03, 45.59it/s][A
150it [00:03, 45.70it/s][A
155it [00:03, 45.74it/s][A
160it [00:03, 45.09it/s][A
165it [00:03, 45.42it/s][A
170it [00:03, 45.66it/s][A
175it [00:03, 45.52it/s][A
180it [00:03, 45.45it/s][A
185it [00:04, 45.44it/s][A
190it [00:04, 45.46it/s][A
195it [00:04, 44.63it/s][A
200it [00:04, 45.02it/s][A
205it [00:04, 45.10it/s][A

Epoch: 97, Step: 200, Loss: 4.71590315580368



210it [00:04, 45.14it/s][A
215it [00:04, 45.26it/s][A
220it [00:04, 45.18it/s][A
227it [00:04, 45.67it/s]
 19%|█▉        | 97/500 [11:11<46:18,  6.89s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.16it/s][A
10it [00:00, 43.91it/s][A
15it [00:00, 44.59it/s][A
20it [00:00, 45.16it/s][A
25it [00:00, 45.47it/s][A
30it [00:00, 45.43it/s][A
35it [00:00, 45.30it/s][A
40it [00:00, 44.36it/s][A
45it [00:01, 44.70it/s][A
50it [00:01, 43.97it/s][A
55it [00:01, 44.37it/s][A
60it [00:01, 44.54it/s][A
65it [00:01, 44.75it/s][A
70it [00:01, 45.01it/s][A
75it [00:01, 44.88it/s][A
80it [00:01, 45.21it/s][A
85it [00:01, 44.64it/s][A
90it [00:02, 44.76it/s][A
95it [00:02, 44.64it/s][A
100it [00:02, 44.87it/s][A
105it [00:02, 44.46it/s][A

Epoch: 98, Step: 100, Loss: 4.6994394159317014



110it [00:02, 44.51it/s][A
115it [00:02, 44.79it/s][A
120it [00:02, 44.84it/s][A
125it [00:02, 45.03it/s][A
130it [00:02, 45.18it/s][A
135it [00:03, 45.16it/s][A
140it [00:03, 45.03it/s][A
145it [00:03, 44.96it/s][A
150it [00:03, 44.30it/s][A
155it [00:03, 44.63it/s][A
160it [00:03, 43.83it/s][A
165it [00:03, 44.43it/s][A
170it [00:03, 44.94it/s][A
175it [00:03, 45.36it/s][A
180it [00:04, 45.33it/s][A
185it [00:04, 45.28it/s][A
190it [00:04, 44.79it/s][A
195it [00:04, 45.09it/s][A
200it [00:04, 44.49it/s][A
205it [00:04, 45.01it/s][A

Epoch: 98, Step: 200, Loss: 4.714145457744598



210it [00:04, 45.25it/s][A
215it [00:04, 44.51it/s][A
220it [00:04, 44.91it/s][A
227it [00:05, 44.80it/s]
 20%|█▉        | 98/500 [11:16<42:31,  6.35s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.29it/s][A
10it [00:00, 46.21it/s][A
15it [00:00, 46.13it/s][A
20it [00:00, 45.54it/s][A
25it [00:00, 45.89it/s][A
30it [00:00, 46.00it/s][A
35it [00:00, 45.68it/s][A
40it [00:00, 45.84it/s][A
45it [00:00, 45.77it/s][A
50it [00:01, 45.95it/s][A
55it [00:01, 45.97it/s][A
60it [00:01, 45.87it/s][A
65it [00:01, 45.67it/s][A
70it [00:01, 45.46it/s][A
75it [00:01, 44.15it/s][A
80it [00:01, 44.51it/s][A
85it [00:01, 44.85it/s][A
90it [00:01, 44.94it/s][A
95it [00:02, 45.19it/s][A
100it [00:02, 45.39it/s][A
105it [00:02, 45.42it/s][A

Epoch: 99, Step: 100, Loss: 4.692651901245117



110it [00:02, 44.55it/s][A
115it [00:02, 44.37it/s][A
120it [00:02, 44.56it/s][A
125it [00:02, 44.64it/s][A
130it [00:02, 44.62it/s][A
135it [00:02, 44.22it/s][A
140it [00:03, 44.74it/s][A
145it [00:03, 44.90it/s][A
150it [00:03, 45.35it/s][A
155it [00:03, 44.82it/s][A
160it [00:03, 45.07it/s][A
165it [00:03, 45.41it/s][A
170it [00:03, 45.53it/s][A
175it [00:03, 45.59it/s][A
180it [00:03, 45.56it/s][A
185it [00:04, 45.63it/s][A
190it [00:04, 45.72it/s][A
195it [00:04, 45.87it/s][A
200it [00:04, 45.81it/s][A
205it [00:04, 45.64it/s][A

Epoch: 99, Step: 200, Loss: 4.710748863220215



210it [00:04, 45.61it/s][A
215it [00:04, 45.69it/s][A
220it [00:04, 45.74it/s][A
227it [00:05, 45.24it/s]
 20%|█▉        | 99/500 [11:21<39:45,  5.95s/it]
0it [00:00, ?it/s][A
5it [00:00, 41.79it/s][A
10it [00:00, 43.14it/s][A
15it [00:00, 44.42it/s][A
20it [00:00, 44.70it/s][A
25it [00:00, 45.03it/s][A
30it [00:00, 45.38it/s][A
35it [00:00, 45.51it/s][A
40it [00:00, 45.49it/s][A
45it [00:01, 45.23it/s][A
50it [00:01, 45.47it/s][A
55it [00:01, 45.07it/s][A
60it [00:01, 45.52it/s][A
65it [00:01, 45.58it/s][A
70it [00:01, 45.50it/s][A
75it [00:01, 45.05it/s][A
80it [00:01, 45.33it/s][A
85it [00:01, 45.47it/s][A
90it [00:01, 45.36it/s][A
95it [00:02, 45.33it/s][A
100it [00:02, 45.47it/s][A
105it [00:02, 44.52it/s][A

Epoch: 100, Step: 100, Loss: 4.696447896957397



110it [00:02, 44.97it/s][A
115it [00:02, 45.29it/s][A
120it [00:02, 45.65it/s][A
125it [00:02, 45.59it/s][A
130it [00:02, 45.58it/s][A
135it [00:02, 45.16it/s][A
140it [00:03, 45.37it/s][A
145it [00:03, 45.35it/s][A
150it [00:03, 45.47it/s][A
155it [00:03, 45.49it/s][A
160it [00:03, 45.76it/s][A
165it [00:03, 45.56it/s][A
170it [00:03, 45.46it/s][A
175it [00:03, 45.63it/s][A
180it [00:03, 45.05it/s][A
185it [00:04, 45.37it/s][A
190it [00:04, 45.13it/s][A
195it [00:04, 45.15it/s][A
200it [00:04, 44.97it/s][A
205it [00:04, 45.35it/s][A

Epoch: 100, Step: 200, Loss: 4.710663893222809



210it [00:04, 45.40it/s][A
215it [00:04, 45.52it/s][A
220it [00:04, 45.58it/s][A
227it [00:05, 45.19it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.53it/s][A
12it [00:00, 56.34it/s][A
19it [00:00, 58.51it/s][A
25it [00:00, 58.21it/s][A
31it [00:00, 58.79it/s][A
37it [00:00, 57.16it/s][A
43it [00:00, 57.89it/s][A
49it [00:00, 57.21it/s][A
55it [00:00, 58.01it/s][A
62it [00:01, 58.99it/s][A
68it [00:01, 59.16it/s][A
74it [00:01, 59.31it/s][A
81it [00:01, 59.59it/s][A
87it [00:01, 59.59it/s][A
93it [00:01, 59.41it/s][A
99it [00:01, 59.46it/s][A
105it [00:01, 59.52it/s][A
111it [00:01, 59.42it/s][A
117it [00:01, 58.17it/s][A
123it [00:02, 58.01it/s][A
130it [00:02, 59.02it/s][A
136it [00:02, 58.24it/s][A
143it [00:02, 59.13it/s][A
150it [00:02, 59.63it/s][A
157it [00:02, 60.04it/s][A
164it [00:02, 60.26it/s][A
171it [00:02, 60.16it/s][A
178it [00:03, 59.47it/s][A
185it [00:03, 59.72it/s][A
191it [00:03, 58.69it/s][A
198it [00:03, 59.39it/s][A
204it [00:03, 59


Epoch: 100, Test Loss: 5.41662641711857, Test Perplexity: 225.77041590435906




0it [00:00, ?it/s][A
5it [00:00, 45.37it/s][A
10it [00:00, 46.40it/s][A
15it [00:00, 46.60it/s][A
20it [00:00, 46.80it/s][A
25it [00:00, 46.98it/s][A
30it [00:00, 46.28it/s][A
35it [00:00, 46.21it/s][A
40it [00:00, 46.34it/s][A
45it [00:00, 46.26it/s][A
50it [00:01, 46.15it/s][A
55it [00:01, 46.27it/s][A
60it [00:01, 46.44it/s][A
65it [00:01, 46.42it/s][A
70it [00:01, 46.52it/s][A
75it [00:01, 45.86it/s][A
80it [00:01, 45.98it/s][A
85it [00:01, 45.32it/s][A
90it [00:01, 44.81it/s][A
95it [00:02, 44.87it/s][A
100it [00:02, 44.86it/s][A
105it [00:02, 45.40it/s][A

Epoch: 101, Step: 100, Loss: 4.694608426094055



110it [00:02, 45.61it/s][A
115it [00:02, 45.74it/s][A
120it [00:02, 45.80it/s][A
125it [00:02, 45.88it/s][A
130it [00:02, 45.85it/s][A
135it [00:02, 45.96it/s][A
140it [00:03, 45.83it/s][A
145it [00:03, 46.13it/s][A
150it [00:03, 45.55it/s][A
155it [00:03, 45.65it/s][A
160it [00:03, 45.03it/s][A
165it [00:03, 45.33it/s][A
170it [00:03, 45.03it/s][A
175it [00:03, 45.64it/s][A
180it [00:03, 45.69it/s][A
185it [00:04, 45.61it/s][A
190it [00:04, 45.50it/s][A
195it [00:04, 45.49it/s][A
200it [00:04, 45.71it/s][A
205it [00:04, 45.65it/s][A

Epoch: 101, Step: 200, Loss: 4.709160192012787



210it [00:04, 45.09it/s][A
215it [00:04, 44.63it/s][A
220it [00:04, 43.99it/s][A
227it [00:04, 45.55it/s]
 20%|██        | 101/500 [11:42<51:24,  7.73s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.08it/s][A
10it [00:00, 45.93it/s][A
15it [00:00, 45.56it/s][A
20it [00:00, 45.62it/s][A
25it [00:00, 45.75it/s][A
30it [00:00, 45.87it/s][A
35it [00:00, 46.02it/s][A
40it [00:00, 46.00it/s][A
45it [00:00, 46.03it/s][A
50it [00:01, 46.21it/s][A
55it [00:01, 46.07it/s][A
60it [00:01, 45.83it/s][A
65it [00:01, 45.82it/s][A
70it [00:01, 46.01it/s][A
75it [00:01, 45.95it/s][A
80it [00:01, 45.85it/s][A
85it [00:01, 45.85it/s][A
90it [00:01, 44.87it/s][A
95it [00:02, 45.26it/s][A
100it [00:02, 45.29it/s][A
105it [00:02, 44.73it/s][A

Epoch: 102, Step: 100, Loss: 4.694116730690002



110it [00:02, 45.03it/s][A
115it [00:02, 45.14it/s][A
120it [00:02, 45.26it/s][A
125it [00:02, 44.92it/s][A
130it [00:02, 45.22it/s][A
135it [00:02, 45.30it/s][A
140it [00:03, 45.52it/s][A
145it [00:03, 45.56it/s][A
150it [00:03, 45.68it/s][A
155it [00:03, 45.82it/s][A
160it [00:03, 45.87it/s][A
165it [00:03, 45.86it/s][A
170it [00:03, 45.98it/s][A
175it [00:03, 45.37it/s][A
180it [00:03, 45.67it/s][A
185it [00:04, 45.72it/s][A
190it [00:04, 45.48it/s][A
195it [00:04, 44.70it/s][A
200it [00:04, 45.23it/s][A
205it [00:04, 45.38it/s][A

Epoch: 102, Step: 200, Loss: 4.705979030132294



210it [00:04, 45.39it/s][A
215it [00:04, 45.04it/s][A
220it [00:04, 45.24it/s][A
227it [00:04, 45.49it/s]
 20%|██        | 102/500 [11:47<45:49,  6.91s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.54it/s][A
10it [00:00, 43.56it/s][A
15it [00:00, 44.92it/s][A
20it [00:00, 45.02it/s][A
25it [00:00, 45.06it/s][A
30it [00:00, 44.70it/s][A
35it [00:00, 45.30it/s][A
40it [00:00, 45.62it/s][A
45it [00:00, 45.31it/s][A
50it [00:01, 45.58it/s][A
55it [00:01, 45.55it/s][A
60it [00:01, 44.83it/s][A
65it [00:01, 44.83it/s][A
70it [00:01, 44.96it/s][A
75it [00:01, 45.09it/s][A
80it [00:01, 45.14it/s][A
85it [00:01, 45.25it/s][A
90it [00:01, 45.46it/s][A
95it [00:02, 45.50it/s][A
100it [00:02, 45.65it/s][A
105it [00:02, 45.81it/s][A

Epoch: 103, Step: 100, Loss: 4.695198731422424



110it [00:02, 45.80it/s][A
115it [00:02, 45.82it/s][A
120it [00:02, 46.09it/s][A
125it [00:02, 45.95it/s][A
130it [00:02, 45.69it/s][A
135it [00:02, 45.81it/s][A
140it [00:03, 46.04it/s][A
145it [00:03, 45.96it/s][A
150it [00:03, 45.75it/s][A
155it [00:03, 45.76it/s][A
160it [00:03, 45.62it/s][A
165it [00:03, 45.66it/s][A
170it [00:03, 45.07it/s][A
175it [00:03, 45.04it/s][A
180it [00:03, 45.36it/s][A
185it [00:04, 45.54it/s][A
190it [00:04, 45.04it/s][A
195it [00:04, 45.34it/s][A
200it [00:04, 44.94it/s][A
205it [00:04, 45.14it/s][A

Epoch: 103, Step: 200, Loss: 4.704902727603912



210it [00:04, 45.18it/s][A
215it [00:04, 44.35it/s][A
220it [00:04, 44.97it/s][A
227it [00:05, 45.32it/s]
 21%|██        | 103/500 [11:52<41:57,  6.34s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.58it/s][A
10it [00:00, 43.02it/s][A
15it [00:00, 43.15it/s][A
20it [00:00, 41.24it/s][A
25it [00:00, 42.58it/s][A
30it [00:00, 43.52it/s][A
35it [00:00, 44.00it/s][A
40it [00:00, 44.41it/s][A
45it [00:01, 44.25it/s][A
50it [00:01, 44.41it/s][A
55it [00:01, 44.12it/s][A
60it [00:01, 44.02it/s][A
65it [00:01, 44.05it/s][A
70it [00:01, 44.04it/s][A
75it [00:01, 44.56it/s][A
80it [00:01, 45.07it/s][A
85it [00:01, 45.34it/s][A
90it [00:02, 45.45it/s][A
95it [00:02, 45.47it/s][A
100it [00:02, 45.72it/s][A
105it [00:02, 45.87it/s][A

Epoch: 104, Step: 100, Loss: 4.682259998321533



110it [00:02, 45.34it/s][A
115it [00:02, 45.37it/s][A
120it [00:02, 45.42it/s][A
125it [00:02, 44.57it/s][A
130it [00:02, 44.85it/s][A
135it [00:03, 44.95it/s][A
140it [00:03, 45.08it/s][A
145it [00:03, 45.39it/s][A
150it [00:03, 45.53it/s][A
155it [00:03, 45.39it/s][A
160it [00:03, 45.28it/s][A
165it [00:03, 45.26it/s][A
170it [00:03, 45.10it/s][A
175it [00:03, 45.20it/s][A
180it [00:04, 44.37it/s][A
185it [00:04, 44.68it/s][A
190it [00:04, 45.01it/s][A
195it [00:04, 45.20it/s][A
200it [00:04, 45.45it/s][A
205it [00:04, 45.61it/s][A

Epoch: 104, Step: 200, Loss: 4.705690982341767



210it [00:04, 45.63it/s][A
215it [00:04, 45.22it/s][A
220it [00:04, 45.58it/s][A
227it [00:05, 44.83it/s]
 21%|██        | 104/500 [11:57<39:19,  5.96s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.28it/s][A
10it [00:00, 46.08it/s][A
15it [00:00, 45.46it/s][A
20it [00:00, 45.81it/s][A
25it [00:00, 45.77it/s][A
30it [00:00, 45.95it/s][A
35it [00:00, 46.02it/s][A
40it [00:00, 45.64it/s][A
45it [00:00, 45.28it/s][A
50it [00:01, 45.40it/s][A
55it [00:01, 45.26it/s][A
60it [00:01, 45.24it/s][A
65it [00:01, 45.32it/s][A
70it [00:01, 45.16it/s][A
75it [00:01, 45.03it/s][A
80it [00:01, 45.25it/s][A
85it [00:01, 45.03it/s][A
90it [00:01, 44.74it/s][A
95it [00:02, 45.02it/s][A
100it [00:02, 44.73it/s][A
105it [00:02, 45.05it/s][A

Epoch: 105, Step: 100, Loss: 4.694388661384583



110it [00:02, 45.14it/s][A
115it [00:02, 45.22it/s][A
120it [00:02, 45.23it/s][A
125it [00:02, 45.24it/s][A
130it [00:02, 45.42it/s][A
135it [00:02, 45.41it/s][A
140it [00:03, 45.41it/s][A
145it [00:03, 45.48it/s][A
150it [00:03, 45.49it/s][A
155it [00:03, 45.43it/s][A
160it [00:03, 45.35it/s][A
165it [00:03, 44.32it/s][A
170it [00:03, 44.47it/s][A
175it [00:03, 44.82it/s][A
180it [00:03, 45.21it/s][A
185it [00:04, 45.39it/s][A
190it [00:04, 45.44it/s][A
195it [00:04, 45.76it/s][A
200it [00:04, 44.94it/s][A
205it [00:04, 45.18it/s][A

Epoch: 105, Step: 200, Loss: 4.7057384014129635



210it [00:04, 45.20it/s][A
215it [00:04, 44.56it/s][A
220it [00:04, 44.70it/s][A
227it [00:05, 45.19it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.51it/s][A
13it [00:00, 60.71it/s][A
20it [00:00, 61.05it/s][A
27it [00:00, 60.92it/s][A
34it [00:00, 60.99it/s][A
41it [00:00, 60.67it/s][A
48it [00:00, 60.56it/s][A
55it [00:00, 60.59it/s][A
62it [00:01, 60.68it/s][A
69it [00:01, 60.53it/s][A
76it [00:01, 60.63it/s][A
83it [00:01, 60.71it/s][A
90it [00:01, 60.14it/s][A
97it [00:01, 59.95it/s][A
103it [00:01, 59.95it/s][A
110it [00:01, 60.07it/s][A
117it [00:01, 60.23it/s][A
124it [00:02, 60.16it/s][A
131it [00:02, 60.28it/s][A
138it [00:02, 60.42it/s][A
145it [00:02, 60.31it/s][A
152it [00:02, 60.27it/s][A
159it [00:02, 60.15it/s][A
166it [00:02, 59.87it/s][A
172it [00:02, 59.63it/s][A
178it [00:02, 59.64it/s][A
184it [00:03, 59.63it/s][A
190it [00:03, 59.67it/s][A
197it [00:03, 59.86it/s][A
204it [00:03, 60.02it/s][A
210it [00:03, 59.98it/s][A
216it [00:03, 


Epoch: 105, Test Loss: 5.414917497901443, Test Perplexity: 225.4503316938507




0it [00:00, ?it/s][A
5it [00:00, 44.94it/s][A
10it [00:00, 43.60it/s][A
15it [00:00, 44.80it/s][A
20it [00:00, 45.32it/s][A
25it [00:00, 45.60it/s][A
30it [00:00, 45.69it/s][A
35it [00:00, 45.91it/s][A
40it [00:00, 46.07it/s][A
45it [00:00, 45.89it/s][A
50it [00:01, 45.92it/s][A
55it [00:01, 45.66it/s][A
60it [00:01, 45.66it/s][A
65it [00:01, 44.56it/s][A
70it [00:01, 44.93it/s][A
75it [00:01, 45.37it/s][A
80it [00:01, 45.62it/s][A
85it [00:01, 45.84it/s][A
90it [00:01, 45.96it/s][A
95it [00:02, 46.04it/s][A
100it [00:02, 45.82it/s][A
105it [00:02, 44.52it/s][A

Epoch: 106, Step: 100, Loss: 4.693712639808655



110it [00:02, 44.39it/s][A
115it [00:02, 44.31it/s][A
120it [00:02, 43.98it/s][A
125it [00:02, 44.51it/s][A
130it [00:02, 44.78it/s][A
135it [00:03, 43.34it/s][A
140it [00:03, 42.27it/s][A
145it [00:03, 42.46it/s][A
150it [00:03, 42.84it/s][A
155it [00:03, 43.17it/s][A
160it [00:03, 43.71it/s][A
165it [00:03, 43.69it/s][A
170it [00:03, 44.29it/s][A
175it [00:03, 44.64it/s][A
180it [00:04, 44.82it/s][A
185it [00:04, 43.92it/s][A
190it [00:04, 44.11it/s][A
195it [00:04, 44.44it/s][A
200it [00:04, 44.46it/s][A
205it [00:04, 44.62it/s][A

Epoch: 106, Step: 200, Loss: 4.705040712356567



210it [00:04, 44.59it/s][A
215it [00:04, 45.04it/s][A
220it [00:04, 44.74it/s][A
227it [00:05, 44.67it/s]
 21%|██        | 106/500 [12:18<50:49,  7.74s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.58it/s][A
10it [00:00, 45.48it/s][A
15it [00:00, 45.47it/s][A
20it [00:00, 45.20it/s][A
25it [00:00, 45.09it/s][A
30it [00:00, 45.03it/s][A
35it [00:00, 45.02it/s][A
40it [00:00, 45.21it/s][A
45it [00:00, 44.90it/s][A
50it [00:01, 45.19it/s][A
55it [00:01, 45.13it/s][A
60it [00:01, 45.10it/s][A
65it [00:01, 44.95it/s][A
70it [00:01, 44.84it/s][A
75it [00:01, 44.53it/s][A
80it [00:01, 45.18it/s][A
85it [00:01, 45.48it/s][A
90it [00:01, 45.57it/s][A
95it [00:02, 45.70it/s][A
100it [00:02, 45.82it/s][A
105it [00:02, 45.67it/s][A

Epoch: 107, Step: 100, Loss: 4.682676291465759



110it [00:02, 45.64it/s][A
115it [00:02, 45.69it/s][A
120it [00:02, 45.85it/s][A
125it [00:02, 45.39it/s][A
130it [00:02, 44.97it/s][A
135it [00:02, 44.91it/s][A
140it [00:03, 45.20it/s][A
145it [00:03, 45.12it/s][A
150it [00:03, 45.01it/s][A
155it [00:03, 45.18it/s][A
160it [00:03, 45.19it/s][A
165it [00:03, 45.44it/s][A
170it [00:03, 44.49it/s][A
175it [00:03, 44.95it/s][A
180it [00:03, 45.23it/s][A
185it [00:04, 45.62it/s][A
190it [00:04, 45.78it/s][A
195it [00:04, 45.79it/s][A
200it [00:04, 46.02it/s][A
205it [00:04, 45.72it/s][A

Epoch: 107, Step: 200, Loss: 4.698754448890686



210it [00:04, 45.54it/s][A
215it [00:04, 45.65it/s][A
220it [00:04, 45.68it/s][A
227it [00:05, 45.32it/s]
 21%|██▏       | 107/500 [12:23<45:20,  6.92s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.35it/s][A
10it [00:00, 45.30it/s][A
15it [00:00, 45.50it/s][A
20it [00:00, 45.79it/s][A
25it [00:00, 45.51it/s][A
30it [00:00, 45.52it/s][A
35it [00:00, 45.46it/s][A
40it [00:00, 45.38it/s][A
45it [00:00, 45.51it/s][A
50it [00:01, 45.75it/s][A
55it [00:01, 45.88it/s][A
60it [00:01, 45.75it/s][A
65it [00:01, 44.92it/s][A
70it [00:01, 45.19it/s][A
75it [00:01, 45.15it/s][A
80it [00:01, 45.28it/s][A
85it [00:01, 45.38it/s][A
90it [00:01, 45.58it/s][A
95it [00:02, 45.20it/s][A
100it [00:02, 45.38it/s][A
105it [00:02, 45.38it/s][A

Epoch: 108, Step: 100, Loss: 4.682157802581787



110it [00:02, 45.44it/s][A
115it [00:02, 45.47it/s][A
120it [00:02, 45.45it/s][A
125it [00:02, 45.32it/s][A
130it [00:02, 45.41it/s][A
135it [00:02, 45.68it/s][A
140it [00:03, 45.32it/s][A
145it [00:03, 45.37it/s][A
150it [00:03, 44.46it/s][A
155it [00:03, 44.11it/s][A
160it [00:03, 44.38it/s][A
165it [00:03, 44.35it/s][A
170it [00:03, 44.37it/s][A
175it [00:03, 44.65it/s][A
180it [00:03, 45.11it/s][A
185it [00:04, 44.95it/s][A
190it [00:04, 45.09it/s][A
195it [00:04, 44.98it/s][A
200it [00:04, 45.02it/s][A
205it [00:04, 44.82it/s][A

Epoch: 108, Step: 200, Loss: 4.6998877787590025



210it [00:04, 44.60it/s][A
215it [00:04, 44.49it/s][A
220it [00:04, 44.56it/s][A
227it [00:05, 44.99it/s]
 22%|██▏       | 108/500 [12:28<41:33,  6.36s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.80it/s][A
10it [00:00, 45.71it/s][A
15it [00:00, 44.73it/s][A
20it [00:00, 45.13it/s][A
25it [00:00, 45.39it/s][A
30it [00:00, 45.29it/s][A
35it [00:00, 45.34it/s][A
40it [00:00, 45.35it/s][A
45it [00:00, 45.39it/s][A
50it [00:01, 45.47it/s][A
55it [00:01, 45.30it/s][A
60it [00:01, 45.49it/s][A
65it [00:01, 45.64it/s][A
70it [00:01, 45.72it/s][A
75it [00:01, 45.38it/s][A
80it [00:01, 45.29it/s][A
85it [00:01, 45.18it/s][A
90it [00:01, 45.19it/s][A
95it [00:02, 44.56it/s][A
100it [00:02, 44.93it/s][A
105it [00:02, 45.01it/s][A

Epoch: 109, Step: 100, Loss: 4.691226024627685



110it [00:02, 44.05it/s][A
115it [00:02, 44.54it/s][A
120it [00:02, 44.85it/s][A
125it [00:02, 45.02it/s][A
130it [00:02, 45.22it/s][A
135it [00:02, 45.47it/s][A
140it [00:03, 45.38it/s][A
145it [00:03, 45.34it/s][A
150it [00:03, 45.32it/s][A
155it [00:03, 44.36it/s][A
160it [00:03, 44.74it/s][A
165it [00:03, 44.64it/s][A
170it [00:03, 44.90it/s][A
175it [00:03, 45.12it/s][A
180it [00:03, 45.30it/s][A
185it [00:04, 45.37it/s][A
190it [00:04, 45.29it/s][A
195it [00:04, 45.26it/s][A
200it [00:04, 44.48it/s][A
205it [00:04, 44.93it/s][A

Epoch: 109, Step: 200, Loss: 4.700233778953552



210it [00:04, 45.29it/s][A
215it [00:04, 45.47it/s][A
220it [00:04, 45.66it/s][A
227it [00:05, 45.18it/s]
 22%|██▏       | 109/500 [12:33<38:50,  5.96s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.79it/s][A
10it [00:00, 45.56it/s][A
15it [00:00, 45.39it/s][A
20it [00:00, 45.64it/s][A
25it [00:00, 45.61it/s][A
30it [00:00, 45.54it/s][A
35it [00:00, 45.68it/s][A
40it [00:00, 45.93it/s][A
45it [00:00, 45.95it/s][A
50it [00:01, 45.74it/s][A
55it [00:01, 45.40it/s][A
60it [00:01, 45.67it/s][A
65it [00:01, 45.72it/s][A
70it [00:01, 45.81it/s][A
75it [00:01, 45.89it/s][A
80it [00:01, 45.93it/s][A
85it [00:01, 46.05it/s][A
90it [00:01, 45.98it/s][A
95it [00:02, 45.11it/s][A
100it [00:02, 45.10it/s][A
105it [00:02, 45.22it/s][A

Epoch: 110, Step: 100, Loss: 4.676161432266236



110it [00:02, 45.29it/s][A
115it [00:02, 45.41it/s][A
120it [00:02, 45.49it/s][A
125it [00:02, 45.71it/s][A
130it [00:02, 45.26it/s][A
135it [00:02, 45.21it/s][A
140it [00:03, 45.44it/s][A
145it [00:03, 45.59it/s][A
150it [00:03, 45.00it/s][A
155it [00:03, 45.36it/s][A
160it [00:03, 45.49it/s][A
165it [00:03, 45.65it/s][A
170it [00:03, 45.66it/s][A
175it [00:03, 44.79it/s][A
180it [00:03, 45.03it/s][A
185it [00:04, 45.00it/s][A
190it [00:04, 45.12it/s][A
195it [00:04, 45.15it/s][A
200it [00:04, 45.27it/s][A
205it [00:04, 45.36it/s][A

Epoch: 110, Step: 200, Loss: 4.693288674354553



210it [00:04, 45.44it/s][A
215it [00:04, 45.61it/s][A
220it [00:04, 45.66it/s][A
227it [00:05, 45.31it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.45it/s][A
12it [00:00, 58.77it/s][A
18it [00:00, 57.85it/s][A
25it [00:00, 59.18it/s][A
32it [00:00, 59.72it/s][A
39it [00:00, 59.97it/s][A
45it [00:00, 59.89it/s][A
51it [00:00, 59.38it/s][A
57it [00:00, 58.34it/s][A
63it [00:01, 58.54it/s][A
69it [00:01, 58.72it/s][A
75it [00:01, 58.89it/s][A
82it [00:01, 59.74it/s][A
88it [00:01, 58.97it/s][A
95it [00:01, 59.64it/s][A
102it [00:01, 59.90it/s][A
109it [00:01, 60.20it/s][A
116it [00:01, 60.38it/s][A
123it [00:02, 60.36it/s][A
130it [00:02, 60.57it/s][A
137it [00:02, 60.68it/s][A
144it [00:02, 61.01it/s][A
151it [00:02, 61.22it/s][A
158it [00:02, 61.30it/s][A
165it [00:02, 61.42it/s][A
172it [00:02, 61.40it/s][A
179it [00:02, 61.15it/s][A
186it [00:03, 61.02it/s][A
193it [00:03, 61.05it/s][A
200it [00:03, 61.17it/s][A
207it [00:03, 61.23it/s][A
214it [00:03, 6


Epoch: 110, Test Loss: 5.422363785483082, Test Perplexity: 227.07334101422234




0it [00:00, ?it/s][A
5it [00:00, 43.66it/s][A
10it [00:00, 44.58it/s][A
15it [00:00, 44.94it/s][A
20it [00:00, 45.01it/s][A
25it [00:00, 45.08it/s][A
30it [00:00, 44.87it/s][A
35it [00:00, 43.93it/s][A
40it [00:00, 44.51it/s][A
45it [00:01, 44.73it/s][A
50it [00:01, 45.05it/s][A
55it [00:01, 45.27it/s][A
60it [00:01, 45.17it/s][A
65it [00:01, 43.75it/s][A
70it [00:01, 43.97it/s][A
75it [00:01, 44.20it/s][A
80it [00:01, 44.59it/s][A
85it [00:01, 44.43it/s][A
90it [00:02, 44.53it/s][A
95it [00:02, 44.86it/s][A
100it [00:02, 44.25it/s][A
105it [00:02, 44.77it/s][A

Epoch: 111, Step: 100, Loss: 4.686342163085937



110it [00:02, 45.05it/s][A
115it [00:02, 44.91it/s][A
120it [00:02, 44.17it/s][A
125it [00:02, 44.86it/s][A
130it [00:02, 44.44it/s][A
135it [00:03, 44.88it/s][A
140it [00:03, 44.18it/s][A
145it [00:03, 44.87it/s][A
150it [00:03, 44.93it/s][A
155it [00:03, 45.01it/s][A
160it [00:03, 44.93it/s][A
165it [00:03, 44.96it/s][A
170it [00:03, 45.13it/s][A
175it [00:03, 45.36it/s][A
180it [00:04, 45.56it/s][A
185it [00:04, 45.82it/s][A
190it [00:04, 45.92it/s][A
195it [00:04, 45.81it/s][A
200it [00:04, 45.80it/s][A
205it [00:04, 45.70it/s][A

Epoch: 111, Step: 200, Loss: 4.696682257652283



210it [00:04, 45.80it/s][A
215it [00:04, 45.39it/s][A
220it [00:04, 45.53it/s][A
227it [00:05, 44.89it/s]
 22%|██▏       | 111/500 [12:54<50:07,  7.73s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.98it/s][A
10it [00:00, 46.03it/s][A
15it [00:00, 45.81it/s][A
20it [00:00, 45.92it/s][A
25it [00:00, 45.81it/s][A
30it [00:00, 45.69it/s][A
35it [00:00, 45.68it/s][A
40it [00:00, 45.75it/s][A
45it [00:00, 45.82it/s][A
50it [00:01, 45.78it/s][A
55it [00:01, 45.75it/s][A
60it [00:01, 45.58it/s][A
65it [00:01, 45.37it/s][A
70it [00:01, 44.98it/s][A
75it [00:01, 44.93it/s][A
80it [00:01, 44.53it/s][A
85it [00:01, 44.89it/s][A
90it [00:01, 45.08it/s][A
95it [00:02, 44.93it/s][A
100it [00:02, 45.17it/s][A
105it [00:02, 44.50it/s][A

Epoch: 112, Step: 100, Loss: 4.679286804199219



110it [00:02, 44.53it/s][A
115it [00:02, 44.18it/s][A
120it [00:02, 44.32it/s][A
125it [00:02, 44.79it/s][A
130it [00:02, 44.01it/s][A
135it [00:02, 44.53it/s][A
140it [00:03, 44.83it/s][A
145it [00:03, 45.37it/s][A
150it [00:03, 45.34it/s][A
155it [00:03, 45.39it/s][A
160it [00:03, 45.30it/s][A
165it [00:03, 45.14it/s][A
170it [00:03, 44.89it/s][A
175it [00:03, 45.04it/s][A
180it [00:03, 45.22it/s][A
185it [00:04, 45.22it/s][A
190it [00:04, 45.34it/s][A
195it [00:04, 45.19it/s][A
200it [00:04, 45.22it/s][A
205it [00:04, 45.12it/s][A

Epoch: 112, Step: 200, Loss: 4.692903051376343



210it [00:04, 45.11it/s][A
215it [00:04, 45.35it/s][A
220it [00:04, 45.33it/s][A
227it [00:05, 45.16it/s]
 22%|██▏       | 112/500 [12:59<44:45,  6.92s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.50it/s][A
10it [00:00, 45.62it/s][A
15it [00:00, 45.43it/s][A
20it [00:00, 45.05it/s][A
25it [00:00, 44.95it/s][A
30it [00:00, 45.11it/s][A
35it [00:00, 45.40it/s][A
40it [00:00, 45.38it/s][A
45it [00:00, 45.56it/s][A
50it [00:01, 44.45it/s][A
55it [00:01, 44.75it/s][A
60it [00:01, 44.97it/s][A
65it [00:01, 45.14it/s][A
70it [00:01, 45.53it/s][A
75it [00:01, 44.97it/s][A
80it [00:01, 44.72it/s][A
85it [00:01, 45.22it/s][A
90it [00:01, 45.24it/s][A
95it [00:02, 44.94it/s][A
100it [00:02, 44.95it/s][A
105it [00:02, 44.30it/s][A

Epoch: 113, Step: 100, Loss: 4.68285101890564



110it [00:02, 44.29it/s][A
115it [00:02, 44.26it/s][A
120it [00:02, 44.63it/s][A
125it [00:02, 44.29it/s][A
130it [00:02, 44.55it/s][A
135it [00:03, 44.69it/s][A
140it [00:03, 43.10it/s][A
145it [00:03, 43.24it/s][A
150it [00:03, 43.60it/s][A
155it [00:03, 44.17it/s][A
160it [00:03, 44.36it/s][A
165it [00:03, 44.55it/s][A
170it [00:03, 44.53it/s][A
175it [00:03, 44.94it/s][A
180it [00:04, 45.15it/s][A
185it [00:04, 45.37it/s][A
190it [00:04, 45.51it/s][A
195it [00:04, 45.58it/s][A
200it [00:04, 45.56it/s][A
205it [00:04, 45.15it/s][A

Epoch: 113, Step: 200, Loss: 4.691733202934265



210it [00:04, 45.14it/s][A
215it [00:04, 45.12it/s][A
220it [00:04, 45.17it/s][A
227it [00:05, 44.83it/s]
 23%|██▎       | 113/500 [13:04<41:03,  6.37s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.78it/s][A
10it [00:00, 45.96it/s][A
15it [00:00, 44.79it/s][A
20it [00:00, 45.35it/s][A
25it [00:00, 45.17it/s][A
30it [00:00, 45.37it/s][A
35it [00:00, 45.47it/s][A
40it [00:00, 45.45it/s][A
45it [00:00, 45.59it/s][A
50it [00:01, 45.72it/s][A
55it [00:01, 45.67it/s][A
60it [00:01, 45.04it/s][A
65it [00:01, 44.87it/s][A
70it [00:01, 45.14it/s][A
75it [00:01, 45.44it/s][A
80it [00:01, 44.99it/s][A
85it [00:01, 45.33it/s][A
90it [00:01, 45.15it/s][A
95it [00:02, 45.25it/s][A
100it [00:02, 44.62it/s][A
105it [00:02, 44.54it/s][A

Epoch: 114, Step: 100, Loss: 4.687446722984314



110it [00:02, 44.55it/s][A
115it [00:02, 44.74it/s][A
120it [00:02, 44.96it/s][A
125it [00:02, 44.79it/s][A
130it [00:02, 45.12it/s][A
135it [00:02, 45.09it/s][A
140it [00:03, 45.42it/s][A
145it [00:03, 45.30it/s][A
150it [00:03, 44.80it/s][A
155it [00:03, 44.93it/s][A
160it [00:03, 45.27it/s][A
165it [00:03, 45.47it/s][A
170it [00:03, 45.55it/s][A
175it [00:03, 44.74it/s][A
180it [00:03, 45.27it/s][A
185it [00:04, 45.51it/s][A
190it [00:04, 44.55it/s][A
195it [00:04, 45.02it/s][A
200it [00:04, 45.20it/s][A
205it [00:04, 44.23it/s][A

Epoch: 114, Step: 200, Loss: 4.691538219451904



210it [00:04, 44.37it/s][A
215it [00:04, 43.92it/s][A
220it [00:04, 44.49it/s][A
227it [00:05, 45.00it/s]
 23%|██▎       | 114/500 [13:09<38:24,  5.97s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.19it/s][A
10it [00:00, 45.73it/s][A
15it [00:00, 44.31it/s][A
20it [00:00, 44.58it/s][A
25it [00:00, 44.98it/s][A
30it [00:00, 45.33it/s][A
35it [00:00, 44.38it/s][A
40it [00:00, 44.67it/s][A
45it [00:01, 45.20it/s][A
50it [00:01, 45.52it/s][A
55it [00:01, 44.54it/s][A
60it [00:01, 44.70it/s][A
65it [00:01, 44.74it/s][A
70it [00:01, 44.74it/s][A
75it [00:01, 43.91it/s][A
80it [00:01, 44.44it/s][A
85it [00:01, 44.54it/s][A
90it [00:02, 44.89it/s][A
95it [00:02, 45.08it/s][A
100it [00:02, 45.35it/s][A
105it [00:02, 45.20it/s][A

Epoch: 115, Step: 100, Loss: 4.690096240043641



110it [00:02, 44.92it/s][A
115it [00:02, 45.01it/s][A
120it [00:02, 45.23it/s][A
125it [00:02, 45.30it/s][A
130it [00:02, 45.24it/s][A
135it [00:03, 44.63it/s][A
140it [00:03, 44.13it/s][A
145it [00:03, 44.50it/s][A
150it [00:03, 44.02it/s][A
155it [00:03, 43.72it/s][A
160it [00:03, 44.13it/s][A
165it [00:03, 43.51it/s][A
170it [00:03, 43.71it/s][A
175it [00:03, 43.43it/s][A
180it [00:04, 42.68it/s][A
185it [00:04, 43.39it/s][A
190it [00:04, 43.54it/s][A
195it [00:04, 43.40it/s][A
200it [00:04, 42.54it/s][A
205it [00:04, 43.03it/s][A

Epoch: 115, Step: 200, Loss: 4.693267648220062



210it [00:04, 42.46it/s][A
215it [00:04, 42.11it/s][A
220it [00:04, 41.64it/s][A
227it [00:05, 44.07it/s]

0it [00:00, ?it/s][A
6it [00:00, 56.30it/s][A
12it [00:00, 58.28it/s][A
19it [00:00, 59.43it/s][A
25it [00:00, 59.53it/s][A
32it [00:00, 60.13it/s][A
39it [00:00, 60.04it/s][A
46it [00:00, 60.41it/s][A
53it [00:00, 60.50it/s][A
60it [00:01, 59.29it/s][A
67it [00:01, 59.64it/s][A
74it [00:01, 60.33it/s][A
81it [00:01, 60.57it/s][A
88it [00:01, 60.94it/s][A
95it [00:01, 60.90it/s][A
102it [00:01, 61.16it/s][A
109it [00:01, 61.35it/s][A
116it [00:01, 61.17it/s][A
123it [00:02, 60.87it/s][A
130it [00:02, 60.91it/s][A
137it [00:02, 60.96it/s][A
144it [00:02, 60.85it/s][A
151it [00:02, 60.93it/s][A
158it [00:02, 61.05it/s][A
165it [00:02, 61.06it/s][A
172it [00:02, 61.07it/s][A
179it [00:02, 60.91it/s][A
186it [00:03, 60.00it/s][A
193it [00:03, 58.98it/s][A
200it [00:03, 59.57it/s][A
207it [00:03, 60.42it/s][A
214it [00:03, 60.75it/s][A
221it [00:03, 


Epoch: 115, Test Loss: 5.418360188140632, Test Perplexity: 226.26069664807054




0it [00:00, ?it/s][A
5it [00:00, 43.30it/s][A
10it [00:00, 43.73it/s][A
15it [00:00, 43.91it/s][A
20it [00:00, 44.14it/s][A
25it [00:00, 44.64it/s][A
30it [00:00, 44.70it/s][A
35it [00:00, 45.09it/s][A
40it [00:00, 45.11it/s][A
45it [00:01, 45.28it/s][A
50it [00:01, 44.33it/s][A
55it [00:01, 44.77it/s][A
60it [00:01, 45.04it/s][A
65it [00:01, 44.96it/s][A
70it [00:01, 45.22it/s][A
75it [00:01, 45.38it/s][A
80it [00:01, 45.41it/s][A
85it [00:01, 45.42it/s][A
90it [00:02, 45.48it/s][A
95it [00:02, 45.52it/s][A
100it [00:02, 45.84it/s][A
105it [00:02, 45.82it/s][A

Epoch: 116, Step: 100, Loss: 4.671337251663208



110it [00:02, 45.76it/s][A
115it [00:02, 45.61it/s][A
120it [00:02, 44.84it/s][A
125it [00:02, 45.17it/s][A
130it [00:02, 45.28it/s][A
135it [00:02, 45.39it/s][A
140it [00:03, 45.64it/s][A
145it [00:03, 44.80it/s][A
150it [00:03, 45.25it/s][A
155it [00:03, 45.18it/s][A
160it [00:03, 44.71it/s][A
165it [00:03, 45.11it/s][A
170it [00:03, 45.38it/s][A
175it [00:03, 45.52it/s][A
180it [00:03, 45.70it/s][A
185it [00:04, 45.81it/s][A
190it [00:04, 45.81it/s][A
195it [00:04, 45.79it/s][A
200it [00:04, 45.07it/s][A
205it [00:04, 44.81it/s][A

Epoch: 116, Step: 200, Loss: 4.6869519591331485



210it [00:04, 44.98it/s][A
215it [00:04, 45.07it/s][A
220it [00:04, 45.35it/s][A
227it [00:05, 45.18it/s]
 23%|██▎       | 116/500 [13:30<49:40,  7.76s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.34it/s][A
10it [00:00, 45.84it/s][A
15it [00:00, 45.59it/s][A
20it [00:00, 45.26it/s][A
25it [00:00, 45.13it/s][A
30it [00:00, 45.24it/s][A
35it [00:00, 45.12it/s][A
40it [00:00, 45.07it/s][A
45it [00:00, 44.93it/s][A
50it [00:01, 45.20it/s][A
55it [00:01, 45.33it/s][A
60it [00:01, 45.49it/s][A
65it [00:01, 45.50it/s][A
70it [00:01, 45.36it/s][A
75it [00:01, 45.52it/s][A
80it [00:01, 45.56it/s][A
85it [00:01, 45.41it/s][A
90it [00:01, 44.80it/s][A
95it [00:02, 44.62it/s][A
100it [00:02, 45.16it/s][A
105it [00:02, 45.48it/s][A

Epoch: 117, Step: 100, Loss: 4.680673041343689



110it [00:02, 45.06it/s][A
115it [00:02, 45.16it/s][A
120it [00:02, 45.40it/s][A
125it [00:02, 45.59it/s][A
130it [00:02, 45.80it/s][A
135it [00:02, 45.76it/s][A
140it [00:03, 45.76it/s][A
145it [00:03, 46.07it/s][A
150it [00:03, 46.30it/s][A
155it [00:03, 46.01it/s][A
160it [00:03, 45.31it/s][A
165it [00:03, 45.42it/s][A
170it [00:03, 45.70it/s][A
175it [00:03, 45.85it/s][A
180it [00:03, 46.00it/s][A
185it [00:04, 45.67it/s][A
190it [00:04, 45.90it/s][A
195it [00:04, 46.03it/s][A
200it [00:04, 45.89it/s][A
205it [00:04, 45.75it/s][A

Epoch: 117, Step: 200, Loss: 4.689140627384186



210it [00:04, 45.45it/s][A
215it [00:04, 45.50it/s][A
220it [00:04, 45.28it/s][A
227it [00:04, 45.42it/s]
 23%|██▎       | 117/500 [13:35<44:15,  6.93s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.06it/s][A
10it [00:00, 45.51it/s][A
15it [00:00, 44.75it/s][A
20it [00:00, 44.60it/s][A
25it [00:00, 44.42it/s][A
30it [00:00, 44.88it/s][A
35it [00:00, 44.74it/s][A
40it [00:00, 45.08it/s][A
45it [00:01, 45.05it/s][A
50it [00:01, 44.58it/s][A
55it [00:01, 45.10it/s][A
60it [00:01, 44.79it/s][A
65it [00:01, 43.88it/s][A
70it [00:01, 43.94it/s][A
75it [00:01, 43.64it/s][A
80it [00:01, 43.95it/s][A
85it [00:01, 44.26it/s][A
90it [00:02, 44.65it/s][A
95it [00:02, 45.03it/s][A
100it [00:02, 44.89it/s][A
105it [00:02, 44.80it/s][A

Epoch: 118, Step: 100, Loss: 4.670951600074768



110it [00:02, 44.22it/s][A
115it [00:02, 44.31it/s][A
120it [00:02, 43.46it/s][A
125it [00:02, 43.70it/s][A
130it [00:02, 44.43it/s][A
135it [00:03, 44.95it/s][A
140it [00:03, 45.27it/s][A
145it [00:03, 45.55it/s][A
150it [00:03, 45.89it/s][A
155it [00:03, 45.89it/s][A
160it [00:03, 45.94it/s][A
165it [00:03, 45.91it/s][A
170it [00:03, 45.66it/s][A
175it [00:03, 45.65it/s][A
180it [00:04, 45.73it/s][A
185it [00:04, 46.02it/s][A
190it [00:04, 45.84it/s][A
195it [00:04, 45.90it/s][A
200it [00:04, 45.92it/s][A
205it [00:04, 46.04it/s][A

Epoch: 118, Step: 200, Loss: 4.684603133201599



210it [00:04, 45.67it/s][A
215it [00:04, 45.52it/s][A
220it [00:04, 45.73it/s][A
227it [00:05, 45.05it/s]
 24%|██▎       | 118/500 [13:40<40:32,  6.37s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.65it/s][A
10it [00:00, 46.00it/s][A
15it [00:00, 45.59it/s][A
20it [00:00, 46.01it/s][A
25it [00:00, 45.92it/s][A
30it [00:00, 45.88it/s][A
35it [00:00, 45.28it/s][A
40it [00:00, 45.56it/s][A
45it [00:00, 45.51it/s][A
50it [00:01, 45.75it/s][A
55it [00:01, 45.83it/s][A
60it [00:01, 45.89it/s][A
65it [00:01, 45.94it/s][A
70it [00:01, 45.97it/s][A
75it [00:01, 45.91it/s][A
80it [00:01, 45.94it/s][A
85it [00:01, 45.92it/s][A
90it [00:01, 45.73it/s][A
95it [00:02, 45.67it/s][A
100it [00:02, 45.67it/s][A
105it [00:02, 45.71it/s][A

Epoch: 119, Step: 100, Loss: 4.675535550117493



110it [00:02, 45.54it/s][A
115it [00:02, 45.66it/s][A
120it [00:02, 45.79it/s][A
125it [00:02, 45.90it/s][A
130it [00:02, 45.25it/s][A
135it [00:02, 45.26it/s][A
140it [00:03, 45.37it/s][A
145it [00:03, 45.36it/s][A
150it [00:03, 45.42it/s][A
155it [00:03, 45.43it/s][A
160it [00:03, 45.38it/s][A
165it [00:03, 45.48it/s][A
170it [00:03, 45.56it/s][A
175it [00:03, 45.66it/s][A
180it [00:03, 45.75it/s][A
185it [00:04, 45.46it/s][A
190it [00:04, 45.72it/s][A
195it [00:04, 45.78it/s][A
200it [00:04, 45.81it/s][A
205it [00:04, 45.79it/s][A

Epoch: 119, Step: 200, Loss: 4.682239031791687



210it [00:04, 45.71it/s][A
215it [00:04, 45.62it/s][A
220it [00:04, 45.68it/s][A
227it [00:04, 45.64it/s]
 24%|██▍       | 119/500 [13:45<37:46,  5.95s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.75it/s][A
10it [00:00, 46.48it/s][A
15it [00:00, 46.34it/s][A
20it [00:00, 46.34it/s][A
25it [00:00, 46.40it/s][A
30it [00:00, 46.42it/s][A
35it [00:00, 46.33it/s][A
40it [00:00, 46.15it/s][A
45it [00:00, 46.01it/s][A
50it [00:01, 46.06it/s][A
55it [00:01, 46.15it/s][A
60it [00:01, 46.17it/s][A
65it [00:01, 46.11it/s][A
70it [00:01, 46.31it/s][A
75it [00:01, 46.52it/s][A
80it [00:01, 46.29it/s][A
85it [00:01, 45.44it/s][A
90it [00:01, 45.34it/s][A
95it [00:02, 45.60it/s][A
100it [00:02, 45.63it/s][A
105it [00:02, 45.36it/s][A

Epoch: 120, Step: 100, Loss: 4.66790322303772



110it [00:02, 44.94it/s][A
115it [00:02, 45.35it/s][A
120it [00:02, 44.83it/s][A
125it [00:02, 43.74it/s][A
130it [00:02, 43.54it/s][A
135it [00:02, 43.33it/s][A
140it [00:03, 42.88it/s][A
145it [00:03, 43.40it/s][A
150it [00:03, 44.01it/s][A
155it [00:03, 44.42it/s][A
160it [00:03, 44.72it/s][A
165it [00:03, 45.06it/s][A
170it [00:03, 45.19it/s][A
175it [00:03, 45.26it/s][A
180it [00:03, 45.53it/s][A
185it [00:04, 45.37it/s][A
190it [00:04, 45.59it/s][A
195it [00:04, 45.33it/s][A
200it [00:04, 45.07it/s][A
205it [00:04, 45.01it/s][A

Epoch: 120, Step: 200, Loss: 4.683789758682251



210it [00:04, 44.56it/s][A
215it [00:04, 44.78it/s][A
220it [00:04, 44.71it/s][A
227it [00:05, 45.22it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.99it/s][A
12it [00:00, 56.97it/s][A
18it [00:00, 58.05it/s][A
25it [00:00, 59.09it/s][A
32it [00:00, 59.82it/s][A
38it [00:00, 59.66it/s][A
45it [00:00, 60.04it/s][A
52it [00:00, 60.31it/s][A
59it [00:00, 60.44it/s][A
66it [00:01, 60.43it/s][A
73it [00:01, 60.25it/s][A
80it [00:01, 60.12it/s][A
87it [00:01, 60.15it/s][A
94it [00:01, 60.40it/s][A
101it [00:01, 60.58it/s][A
108it [00:01, 60.59it/s][A
115it [00:01, 60.68it/s][A
122it [00:02, 60.68it/s][A
129it [00:02, 60.57it/s][A
136it [00:02, 60.47it/s][A
143it [00:02, 60.28it/s][A
150it [00:02, 60.55it/s][A
157it [00:02, 60.41it/s][A
164it [00:02, 60.49it/s][A
171it [00:02, 60.52it/s][A
178it [00:02, 59.30it/s][A
184it [00:03, 59.46it/s][A
190it [00:03, 59.61it/s][A
196it [00:03, 59.40it/s][A
203it [00:03, 59.95it/s][A
210it [00:03, 60.07it/s][A
217it [00:03, 


Epoch: 120, Test Loss: 5.419418090618915, Test Perplexity: 226.46561704363143




0it [00:00, ?it/s][A
5it [00:00, 46.04it/s][A
10it [00:00, 45.65it/s][A
15it [00:00, 45.57it/s][A
20it [00:00, 45.54it/s][A
25it [00:00, 45.52it/s][A
30it [00:00, 45.09it/s][A
35it [00:00, 45.17it/s][A
40it [00:00, 44.91it/s][A
45it [00:00, 44.86it/s][A
50it [00:01, 45.25it/s][A
55it [00:01, 45.27it/s][A
60it [00:01, 44.72it/s][A
65it [00:01, 44.95it/s][A
70it [00:01, 44.99it/s][A
75it [00:01, 45.04it/s][A
80it [00:01, 44.05it/s][A
85it [00:01, 44.56it/s][A
90it [00:01, 44.89it/s][A
95it [00:02, 44.57it/s][A
100it [00:02, 44.77it/s][A
105it [00:02, 44.55it/s][A

Epoch: 121, Step: 100, Loss: 4.674841122627258



110it [00:02, 44.65it/s][A
115it [00:02, 44.65it/s][A
120it [00:02, 44.66it/s][A
125it [00:02, 44.95it/s][A
130it [00:02, 45.13it/s][A
135it [00:03, 45.23it/s][A
140it [00:03, 45.34it/s][A
145it [00:03, 44.97it/s][A
150it [00:03, 45.22it/s][A
155it [00:03, 45.42it/s][A
160it [00:03, 45.44it/s][A
165it [00:03, 45.33it/s][A
170it [00:03, 45.51it/s][A
175it [00:03, 45.53it/s][A
180it [00:03, 45.58it/s][A
185it [00:04, 45.67it/s][A
190it [00:04, 45.72it/s][A
195it [00:04, 45.97it/s][A
200it [00:04, 45.76it/s][A
205it [00:04, 45.62it/s][A

Epoch: 121, Step: 200, Loss: 4.681891250610351



210it [00:04, 45.30it/s][A
215it [00:04, 45.61it/s][A
220it [00:04, 45.45it/s][A
227it [00:05, 45.16it/s]
 24%|██▍       | 121/500 [14:06<48:54,  7.74s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.30it/s][A
10it [00:00, 46.07it/s][A
15it [00:00, 45.71it/s][A
20it [00:00, 45.66it/s][A
25it [00:00, 45.57it/s][A
30it [00:00, 45.10it/s][A
35it [00:00, 44.40it/s][A
40it [00:00, 44.87it/s][A
45it [00:00, 45.33it/s][A
50it [00:01, 44.99it/s][A
55it [00:01, 45.25it/s][A
60it [00:01, 45.46it/s][A
65it [00:01, 45.76it/s][A
70it [00:01, 45.75it/s][A
75it [00:01, 45.49it/s][A
80it [00:01, 45.65it/s][A
85it [00:01, 45.72it/s][A
90it [00:01, 45.75it/s][A
95it [00:02, 45.50it/s][A
100it [00:02, 45.45it/s][A
105it [00:02, 45.75it/s][A

Epoch: 122, Step: 100, Loss: 4.67062843799591



110it [00:02, 45.69it/s][A
115it [00:02, 45.58it/s][A
120it [00:02, 45.22it/s][A
125it [00:02, 44.91it/s][A
130it [00:02, 45.27it/s][A
135it [00:02, 45.11it/s][A
140it [00:03, 44.76it/s][A
145it [00:03, 45.10it/s][A
150it [00:03, 45.32it/s][A
155it [00:03, 45.44it/s][A
160it [00:03, 45.54it/s][A
165it [00:03, 45.46it/s][A
170it [00:03, 45.41it/s][A
175it [00:03, 44.33it/s][A
180it [00:03, 44.37it/s][A
185it [00:04, 44.52it/s][A
190it [00:04, 44.69it/s][A
195it [00:04, 45.01it/s][A
200it [00:04, 43.91it/s][A
205it [00:04, 44.02it/s][A

Epoch: 122, Step: 200, Loss: 4.68156096458435



210it [00:04, 44.39it/s][A
215it [00:04, 44.25it/s][A
220it [00:04, 44.12it/s][A
227it [00:05, 45.00it/s]
 24%|██▍       | 122/500 [14:11<43:41,  6.93s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.20it/s][A
10it [00:00, 44.59it/s][A
15it [00:00, 44.79it/s][A
20it [00:00, 44.93it/s][A
25it [00:00, 44.80it/s][A
30it [00:00, 44.86it/s][A
35it [00:00, 45.10it/s][A
40it [00:00, 45.22it/s][A
45it [00:01, 45.19it/s][A
50it [00:01, 45.22it/s][A
55it [00:01, 45.25it/s][A
60it [00:01, 45.31it/s][A
65it [00:01, 45.43it/s][A
70it [00:01, 44.36it/s][A
75it [00:01, 44.70it/s][A
80it [00:01, 43.79it/s][A
85it [00:01, 44.40it/s][A
90it [00:02, 43.86it/s][A
95it [00:02, 44.41it/s][A
100it [00:02, 44.85it/s][A
105it [00:02, 45.16it/s][A

Epoch: 123, Step: 100, Loss: 4.667743778228759



110it [00:02, 45.25it/s][A
115it [00:02, 45.04it/s][A
120it [00:02, 44.89it/s][A
125it [00:02, 44.83it/s][A
130it [00:02, 44.87it/s][A
135it [00:03, 44.70it/s][A
140it [00:03, 43.82it/s][A
145it [00:03, 44.31it/s][A
150it [00:03, 44.47it/s][A
155it [00:03, 44.63it/s][A
160it [00:03, 44.87it/s][A
165it [00:03, 45.25it/s][A
170it [00:03, 45.42it/s][A
175it [00:03, 45.48it/s][A
180it [00:04, 45.40it/s][A
185it [00:04, 45.49it/s][A
190it [00:04, 44.48it/s][A
195it [00:04, 44.80it/s][A
200it [00:04, 44.00it/s][A
205it [00:04, 44.50it/s][A

Epoch: 123, Step: 200, Loss: 4.678501265048981



210it [00:04, 44.59it/s][A
215it [00:04, 44.86it/s][A
220it [00:04, 44.93it/s][A
227it [00:05, 44.72it/s]
 25%|██▍       | 123/500 [14:16<40:04,  6.38s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.34it/s][A
10it [00:00, 45.16it/s][A
15it [00:00, 44.89it/s][A
20it [00:00, 45.15it/s][A
25it [00:00, 45.16it/s][A
30it [00:00, 45.37it/s][A
35it [00:00, 45.37it/s][A
40it [00:00, 45.41it/s][A
45it [00:00, 45.68it/s][A
50it [00:01, 45.61it/s][A
55it [00:01, 45.68it/s][A
60it [00:01, 45.80it/s][A
65it [00:01, 45.94it/s][A
70it [00:01, 45.88it/s][A
75it [00:01, 45.62it/s][A
80it [00:01, 45.46it/s][A
85it [00:01, 45.33it/s][A
90it [00:01, 45.63it/s][A
95it [00:02, 45.83it/s][A
100it [00:02, 46.07it/s][A
105it [00:02, 46.32it/s][A

Epoch: 124, Step: 100, Loss: 4.66941963672638



110it [00:02, 45.65it/s][A
115it [00:02, 45.84it/s][A
120it [00:02, 45.99it/s][A
125it [00:02, 46.23it/s][A
130it [00:02, 45.16it/s][A
135it [00:02, 45.56it/s][A
140it [00:03, 45.84it/s][A
145it [00:03, 46.05it/s][A
150it [00:03, 46.28it/s][A
155it [00:03, 46.29it/s][A
160it [00:03, 46.26it/s][A
165it [00:03, 46.48it/s][A
170it [00:03, 46.64it/s][A
175it [00:03, 46.58it/s][A
180it [00:03, 46.39it/s][A
185it [00:04, 46.27it/s][A
190it [00:04, 46.29it/s][A
195it [00:04, 46.32it/s][A
200it [00:04, 46.47it/s][A
205it [00:04, 46.47it/s][A

Epoch: 124, Step: 200, Loss: 4.674861071109771



210it [00:04, 46.40it/s][A
215it [00:04, 46.65it/s][A
220it [00:04, 46.74it/s][A
227it [00:04, 45.96it/s]
 25%|██▍       | 124/500 [14:21<37:16,  5.95s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.83it/s][A
10it [00:00, 45.81it/s][A
15it [00:00, 44.71it/s][A
20it [00:00, 45.25it/s][A
25it [00:00, 45.32it/s][A
30it [00:00, 45.19it/s][A
35it [00:00, 45.12it/s][A
40it [00:00, 45.33it/s][A
45it [00:00, 45.82it/s][A
50it [00:01, 46.16it/s][A
55it [00:01, 44.69it/s][A
60it [00:01, 44.33it/s][A
65it [00:01, 43.82it/s][A
70it [00:01, 43.42it/s][A
75it [00:01, 42.88it/s][A
80it [00:01, 42.73it/s][A
85it [00:01, 43.54it/s][A
90it [00:02, 43.93it/s][A
95it [00:02, 43.60it/s][A
100it [00:02, 44.35it/s][A
105it [00:02, 44.87it/s][A

Epoch: 125, Step: 100, Loss: 4.668564019203186



110it [00:02, 44.86it/s][A
115it [00:02, 44.77it/s][A
120it [00:02, 44.97it/s][A
125it [00:02, 44.95it/s][A
130it [00:02, 44.54it/s][A
135it [00:03, 43.88it/s][A
140it [00:03, 44.30it/s][A
145it [00:03, 44.56it/s][A
150it [00:03, 44.79it/s][A
155it [00:03, 44.85it/s][A
160it [00:03, 45.14it/s][A
165it [00:03, 45.47it/s][A
170it [00:03, 45.51it/s][A
175it [00:03, 45.17it/s][A
180it [00:04, 45.27it/s][A
185it [00:04, 45.20it/s][A
190it [00:04, 44.25it/s][A
195it [00:04, 44.73it/s][A
200it [00:04, 45.13it/s][A
205it [00:04, 44.88it/s][A

Epoch: 125, Step: 200, Loss: 4.677781252861023



210it [00:04, 45.02it/s][A
215it [00:04, 45.25it/s][A
220it [00:04, 45.09it/s][A
227it [00:05, 44.72it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.82it/s][A
13it [00:00, 59.85it/s][A
19it [00:00, 59.60it/s][A
25it [00:00, 59.50it/s][A
32it [00:00, 59.90it/s][A
39it [00:00, 60.10it/s][A
46it [00:00, 60.19it/s][A
53it [00:00, 60.05it/s][A
60it [00:01, 60.06it/s][A
67it [00:01, 59.84it/s][A
73it [00:01, 59.72it/s][A
79it [00:01, 59.31it/s][A
85it [00:01, 59.18it/s][A
91it [00:01, 59.36it/s][A
97it [00:01, 59.10it/s][A
104it [00:01, 59.69it/s][A
111it [00:01, 59.92it/s][A
118it [00:01, 60.29it/s][A
125it [00:02, 60.05it/s][A
132it [00:02, 60.21it/s][A
139it [00:02, 59.02it/s][A
146it [00:02, 59.45it/s][A
153it [00:02, 59.83it/s][A
159it [00:02, 58.43it/s][A
165it [00:02, 58.37it/s][A
171it [00:02, 57.36it/s][A
178it [00:03, 58.25it/s][A
184it [00:03, 58.61it/s][A
191it [00:03, 59.19it/s][A
197it [00:03, 59.33it/s][A
203it [00:03, 58.52it/s][A
210it [00:03, 5


Epoch: 125, Test Loss: 5.4277759575695725, Test Perplexity: 228.3900886677807




0it [00:00, ?it/s][A
5it [00:00, 41.74it/s][A
10it [00:00, 42.44it/s][A
15it [00:00, 43.98it/s][A
20it [00:00, 44.55it/s][A
25it [00:00, 44.94it/s][A
30it [00:00, 45.34it/s][A
35it [00:00, 45.48it/s][A
40it [00:00, 45.52it/s][A
45it [00:01, 45.34it/s][A
50it [00:01, 45.28it/s][A
55it [00:01, 45.29it/s][A
60it [00:01, 45.57it/s][A
65it [00:01, 45.73it/s][A
70it [00:01, 45.61it/s][A
75it [00:01, 44.62it/s][A
80it [00:01, 45.02it/s][A
85it [00:01, 45.41it/s][A
90it [00:01, 45.63it/s][A
95it [00:02, 45.54it/s][A
100it [00:02, 45.69it/s][A
105it [00:02, 45.79it/s][A

Epoch: 126, Step: 100, Loss: 4.66197883605957



110it [00:02, 45.90it/s][A
115it [00:02, 44.55it/s][A
120it [00:02, 44.92it/s][A
125it [00:02, 45.15it/s][A
130it [00:02, 45.49it/s][A
135it [00:02, 45.73it/s][A
140it [00:03, 45.53it/s][A
145it [00:03, 45.42it/s][A
150it [00:03, 45.50it/s][A
155it [00:03, 45.61it/s][A
160it [00:03, 45.64it/s][A
165it [00:03, 45.60it/s][A
170it [00:03, 45.74it/s][A
175it [00:03, 45.26it/s][A
180it [00:03, 45.42it/s][A
185it [00:04, 45.28it/s][A
190it [00:04, 45.33it/s][A
195it [00:04, 45.21it/s][A
200it [00:04, 45.35it/s][A
205it [00:04, 45.31it/s][A

Epoch: 126, Step: 200, Loss: 4.6716255903244015



210it [00:04, 45.16it/s][A
215it [00:04, 45.20it/s][A
220it [00:04, 45.31it/s][A
227it [00:05, 45.27it/s]
 25%|██▌       | 126/500 [14:42<48:23,  7.76s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.10it/s][A
10it [00:00, 45.25it/s][A
15it [00:00, 44.42it/s][A
20it [00:00, 44.89it/s][A
25it [00:00, 45.25it/s][A
30it [00:00, 45.47it/s][A
35it [00:00, 45.74it/s][A
40it [00:00, 45.90it/s][A
45it [00:00, 45.84it/s][A
50it [00:01, 45.94it/s][A
55it [00:01, 45.88it/s][A
60it [00:01, 45.86it/s][A
65it [00:01, 45.83it/s][A
70it [00:01, 45.97it/s][A
75it [00:01, 45.37it/s][A
80it [00:01, 45.63it/s][A
85it [00:01, 45.95it/s][A
90it [00:01, 45.36it/s][A
95it [00:02, 45.57it/s][A
100it [00:02, 45.11it/s][A
105it [00:02, 45.25it/s][A

Epoch: 127, Step: 100, Loss: 4.658008728027344



110it [00:02, 45.31it/s][A
115it [00:02, 45.58it/s][A
120it [00:02, 45.54it/s][A
125it [00:02, 45.73it/s][A
130it [00:02, 45.80it/s][A
135it [00:02, 45.78it/s][A
140it [00:03, 45.67it/s][A
145it [00:03, 45.30it/s][A
150it [00:03, 44.63it/s][A
155it [00:03, 43.83it/s][A
160it [00:03, 44.04it/s][A
165it [00:03, 44.42it/s][A
170it [00:03, 44.69it/s][A
175it [00:03, 45.02it/s][A
180it [00:03, 45.09it/s][A
185it [00:04, 45.44it/s][A
190it [00:04, 45.61it/s][A
195it [00:04, 45.50it/s][A
200it [00:04, 45.78it/s][A
205it [00:04, 45.73it/s][A

Epoch: 127, Step: 200, Loss: 4.675130717754364



210it [00:04, 45.67it/s][A
215it [00:04, 45.72it/s][A
220it [00:04, 45.77it/s][A
227it [00:04, 45.42it/s]
 25%|██▌       | 127/500 [14:47<43:06,  6.94s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.34it/s][A
10it [00:00, 46.17it/s][A
15it [00:00, 45.81it/s][A
20it [00:00, 45.07it/s][A
25it [00:00, 45.34it/s][A
30it [00:00, 45.57it/s][A
35it [00:00, 45.55it/s][A
40it [00:00, 45.61it/s][A
45it [00:00, 45.43it/s][A
50it [00:01, 45.23it/s][A
55it [00:01, 45.37it/s][A
60it [00:01, 45.30it/s][A
65it [00:01, 45.29it/s][A
70it [00:01, 45.37it/s][A
75it [00:01, 45.38it/s][A
80it [00:01, 45.52it/s][A
85it [00:01, 45.47it/s][A
90it [00:01, 45.24it/s][A
95it [00:02, 45.37it/s][A
100it [00:02, 45.59it/s][A
105it [00:02, 45.55it/s][A

Epoch: 128, Step: 100, Loss: 4.663965940475464



110it [00:02, 45.49it/s][A
115it [00:02, 45.68it/s][A
120it [00:02, 45.78it/s][A
125it [00:02, 46.02it/s][A
130it [00:02, 45.92it/s][A
135it [00:02, 45.93it/s][A
140it [00:03, 45.95it/s][A
145it [00:03, 45.79it/s][A
150it [00:03, 45.86it/s][A
155it [00:03, 45.92it/s][A
160it [00:03, 46.02it/s][A
165it [00:03, 46.11it/s][A
170it [00:03, 46.22it/s][A
175it [00:03, 46.34it/s][A
180it [00:03, 46.29it/s][A
185it [00:04, 46.44it/s][A
190it [00:04, 46.51it/s][A
195it [00:04, 46.55it/s][A
200it [00:04, 46.36it/s][A
205it [00:04, 46.30it/s][A

Epoch: 128, Step: 200, Loss: 4.669168207645416



210it [00:04, 46.29it/s][A
215it [00:04, 45.98it/s][A
220it [00:04, 46.04it/s][A
227it [00:04, 45.80it/s]
 26%|██▌       | 128/500 [14:52<39:19,  6.34s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.98it/s][A
10it [00:00, 45.31it/s][A
15it [00:00, 45.59it/s][A
20it [00:00, 45.28it/s][A
25it [00:00, 45.40it/s][A
30it [00:00, 45.68it/s][A
35it [00:00, 46.06it/s][A
40it [00:00, 46.41it/s][A
45it [00:00, 46.32it/s][A
50it [00:01, 46.56it/s][A
55it [00:01, 46.77it/s][A
60it [00:01, 46.66it/s][A
65it [00:01, 46.59it/s][A
70it [00:01, 46.43it/s][A
75it [00:01, 46.61it/s][A
80it [00:01, 46.83it/s][A
85it [00:01, 46.81it/s][A
90it [00:01, 46.91it/s][A
95it [00:02, 46.77it/s][A
100it [00:02, 46.44it/s][A
105it [00:02, 46.47it/s][A

Epoch: 129, Step: 100, Loss: 4.656576437950134



110it [00:02, 46.82it/s][A
115it [00:02, 47.00it/s][A
120it [00:02, 46.74it/s][A
125it [00:02, 46.34it/s][A
130it [00:02, 46.14it/s][A
135it [00:02, 45.92it/s][A
140it [00:03, 45.66it/s][A
145it [00:03, 45.70it/s][A
150it [00:03, 45.55it/s][A
155it [00:03, 45.44it/s][A
160it [00:03, 45.32it/s][A
165it [00:03, 44.97it/s][A
170it [00:03, 43.89it/s][A
175it [00:03, 44.42it/s][A
180it [00:03, 44.94it/s][A
185it [00:04, 45.33it/s][A
190it [00:04, 45.57it/s][A
195it [00:04, 45.40it/s][A
200it [00:04, 45.29it/s][A
205it [00:04, 45.18it/s][A

Epoch: 129, Step: 200, Loss: 4.671185412406921



210it [00:04, 44.52it/s][A
215it [00:04, 44.72it/s][A
220it [00:04, 43.38it/s][A
227it [00:04, 45.60it/s]
 26%|██▌       | 129/500 [14:57<36:41,  5.93s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.04it/s][A
10it [00:00, 45.52it/s][A
15it [00:00, 45.68it/s][A
20it [00:00, 45.46it/s][A
25it [00:00, 45.72it/s][A
30it [00:00, 45.63it/s][A
35it [00:00, 45.48it/s][A
40it [00:00, 45.65it/s][A
45it [00:00, 45.64it/s][A
50it [00:01, 45.49it/s][A
55it [00:01, 45.59it/s][A
60it [00:01, 45.68it/s][A
65it [00:01, 45.68it/s][A
70it [00:01, 45.51it/s][A
75it [00:01, 45.52it/s][A
80it [00:01, 45.63it/s][A
85it [00:01, 45.85it/s][A
90it [00:01, 45.87it/s][A
95it [00:02, 45.80it/s][A
100it [00:02, 45.79it/s][A
105it [00:02, 46.00it/s][A

Epoch: 130, Step: 100, Loss: 4.657813496589661



110it [00:02, 45.88it/s][A
115it [00:02, 45.94it/s][A
120it [00:02, 45.57it/s][A
125it [00:02, 45.39it/s][A
130it [00:02, 45.44it/s][A
135it [00:02, 45.77it/s][A
140it [00:03, 45.76it/s][A
145it [00:03, 45.95it/s][A
150it [00:03, 45.23it/s][A
155it [00:03, 44.23it/s][A
160it [00:03, 44.60it/s][A
165it [00:03, 44.68it/s][A
170it [00:03, 44.99it/s][A
175it [00:03, 45.31it/s][A
180it [00:03, 45.18it/s][A
185it [00:04, 45.23it/s][A
190it [00:04, 44.39it/s][A
195it [00:04, 44.74it/s][A
200it [00:04, 45.00it/s][A
205it [00:04, 45.39it/s][A

Epoch: 130, Step: 200, Loss: 4.668508777618408



210it [00:04, 45.66it/s][A
215it [00:04, 45.72it/s][A
220it [00:04, 45.87it/s][A
227it [00:05, 45.40it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.02it/s][A
12it [00:00, 58.64it/s][A
18it [00:00, 59.09it/s][A
24it [00:00, 59.32it/s][A
31it [00:00, 59.70it/s][A
38it [00:00, 60.15it/s][A
45it [00:00, 60.32it/s][A
52it [00:00, 60.32it/s][A
59it [00:00, 60.41it/s][A
66it [00:01, 60.46it/s][A
73it [00:01, 60.56it/s][A
80it [00:01, 60.48it/s][A
87it [00:01, 60.50it/s][A
94it [00:01, 60.43it/s][A
101it [00:01, 60.25it/s][A
108it [00:01, 60.37it/s][A
115it [00:01, 60.64it/s][A
122it [00:02, 60.41it/s][A
129it [00:02, 60.46it/s][A
136it [00:02, 60.54it/s][A
143it [00:02, 60.53it/s][A
150it [00:02, 60.46it/s][A
157it [00:02, 60.46it/s][A
164it [00:02, 60.56it/s][A
171it [00:02, 60.15it/s][A
178it [00:02, 60.39it/s][A
185it [00:03, 60.54it/s][A
192it [00:03, 60.51it/s][A
199it [00:03, 60.40it/s][A
206it [00:03, 60.46it/s][A
213it [00:03, 60.27it/s][A
220it [00:03, 


Epoch: 130, Test Loss: 5.423753334128338, Test Perplexity: 227.47901850161344




0it [00:00, ?it/s][A
5it [00:00, 45.08it/s][A
10it [00:00, 45.58it/s][A
15it [00:00, 45.63it/s][A
20it [00:00, 45.61it/s][A
25it [00:00, 45.71it/s][A
30it [00:00, 45.20it/s][A
35it [00:00, 45.12it/s][A
40it [00:00, 45.14it/s][A
45it [00:00, 45.24it/s][A
50it [00:01, 45.49it/s][A
55it [00:01, 45.61it/s][A
60it [00:01, 45.36it/s][A
65it [00:01, 45.26it/s][A
70it [00:01, 45.34it/s][A
75it [00:01, 45.13it/s][A
80it [00:01, 45.09it/s][A
85it [00:01, 45.28it/s][A
90it [00:01, 45.65it/s][A
95it [00:02, 45.59it/s][A
100it [00:02, 45.60it/s][A
105it [00:02, 45.73it/s][A

Epoch: 131, Step: 100, Loss: 4.655928573608398



110it [00:02, 45.63it/s][A
115it [00:02, 45.69it/s][A
120it [00:02, 45.09it/s][A
125it [00:02, 45.22it/s][A
130it [00:02, 45.17it/s][A
135it [00:02, 45.29it/s][A
140it [00:03, 45.56it/s][A
145it [00:03, 45.64it/s][A
150it [00:03, 45.77it/s][A
155it [00:03, 45.91it/s][A
160it [00:03, 45.76it/s][A
165it [00:03, 45.83it/s][A
170it [00:03, 45.88it/s][A
175it [00:03, 45.81it/s][A
180it [00:03, 45.09it/s][A
185it [00:04, 45.39it/s][A
190it [00:04, 45.52it/s][A
195it [00:04, 45.59it/s][A
200it [00:04, 45.62it/s][A
205it [00:04, 45.62it/s][A

Epoch: 131, Step: 200, Loss: 4.667023108005524



210it [00:04, 45.60it/s][A
215it [00:04, 45.62it/s][A
220it [00:04, 45.62it/s][A
227it [00:04, 45.47it/s]
 26%|██▌       | 131/500 [15:17<47:23,  7.71s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.47it/s][A
10it [00:00, 45.86it/s][A
15it [00:00, 45.74it/s][A
20it [00:00, 45.40it/s][A
25it [00:00, 44.88it/s][A
30it [00:00, 45.00it/s][A
35it [00:00, 44.87it/s][A
40it [00:00, 44.58it/s][A
45it [00:01, 44.43it/s][A
50it [00:01, 44.45it/s][A
55it [00:01, 44.67it/s][A
60it [00:01, 44.81it/s][A
65it [00:01, 44.59it/s][A
70it [00:01, 44.85it/s][A
75it [00:01, 44.80it/s][A
80it [00:01, 43.80it/s][A
85it [00:01, 44.30it/s][A
90it [00:02, 44.39it/s][A
95it [00:02, 44.41it/s][A
100it [00:02, 44.17it/s][A
105it [00:02, 44.50it/s][A

Epoch: 132, Step: 100, Loss: 4.658748307228088



110it [00:02, 44.50it/s][A
115it [00:02, 44.59it/s][A
120it [00:02, 44.82it/s][A
125it [00:02, 44.84it/s][A
130it [00:02, 44.79it/s][A
135it [00:03, 44.31it/s][A
140it [00:03, 44.72it/s][A
145it [00:03, 44.92it/s][A
150it [00:03, 45.03it/s][A
155it [00:03, 45.03it/s][A
160it [00:03, 45.06it/s][A
165it [00:03, 45.24it/s][A
170it [00:03, 45.53it/s][A
175it [00:03, 45.24it/s][A
180it [00:04, 44.59it/s][A
185it [00:04, 45.04it/s][A
190it [00:04, 45.45it/s][A
195it [00:04, 45.83it/s][A
200it [00:04, 46.08it/s][A
205it [00:04, 46.19it/s][A

Epoch: 132, Step: 200, Loss: 4.66700124502182



210it [00:04, 46.30it/s][A
215it [00:04, 46.53it/s][A
220it [00:04, 46.58it/s][A
227it [00:05, 45.02it/s]
 26%|██▋       | 132/500 [15:22<42:22,  6.91s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.58it/s][A
10it [00:00, 46.66it/s][A
15it [00:00, 46.71it/s][A
20it [00:00, 46.72it/s][A
25it [00:00, 45.62it/s][A
30it [00:00, 45.95it/s][A
35it [00:00, 45.96it/s][A
40it [00:00, 46.10it/s][A
45it [00:00, 46.17it/s][A
50it [00:01, 46.14it/s][A
55it [00:01, 46.24it/s][A
60it [00:01, 46.43it/s][A
65it [00:01, 46.29it/s][A
70it [00:01, 46.49it/s][A
75it [00:01, 46.55it/s][A
80it [00:01, 46.35it/s][A
85it [00:01, 46.07it/s][A
90it [00:01, 45.87it/s][A
95it [00:02, 45.97it/s][A
100it [00:02, 45.84it/s][A
105it [00:02, 46.10it/s][A

Epoch: 133, Step: 100, Loss: 4.654439644813538



110it [00:02, 46.14it/s][A
115it [00:02, 46.08it/s][A
120it [00:02, 46.15it/s][A
125it [00:02, 46.17it/s][A
130it [00:02, 45.45it/s][A
135it [00:02, 45.79it/s][A
140it [00:03, 46.05it/s][A
145it [00:03, 46.20it/s][A
150it [00:03, 46.45it/s][A
155it [00:03, 46.37it/s][A
160it [00:03, 46.06it/s][A
165it [00:03, 46.19it/s][A
170it [00:03, 46.76it/s][A
175it [00:03, 46.55it/s][A
180it [00:03, 46.41it/s][A
185it [00:04, 46.24it/s][A
190it [00:04, 46.25it/s][A
195it [00:04, 46.25it/s][A
200it [00:04, 46.27it/s][A
205it [00:04, 46.27it/s][A

Epoch: 133, Step: 200, Loss: 4.662742712497711



210it [00:04, 46.21it/s][A
215it [00:04, 46.24it/s][A
220it [00:04, 45.76it/s][A
227it [00:04, 46.13it/s]
 27%|██▋       | 133/500 [15:27<38:36,  6.31s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.63it/s][A
10it [00:00, 46.15it/s][A
15it [00:00, 45.69it/s][A
20it [00:00, 45.76it/s][A
25it [00:00, 45.18it/s][A
30it [00:00, 45.65it/s][A
35it [00:00, 45.70it/s][A
40it [00:00, 45.52it/s][A
45it [00:00, 45.64it/s][A
50it [00:01, 45.66it/s][A
55it [00:01, 45.70it/s][A
60it [00:01, 45.62it/s][A
65it [00:01, 45.86it/s][A
70it [00:01, 44.91it/s][A
75it [00:01, 45.10it/s][A
80it [00:01, 45.36it/s][A
85it [00:01, 45.44it/s][A
90it [00:01, 44.88it/s][A
95it [00:02, 44.69it/s][A
100it [00:02, 45.15it/s][A
105it [00:02, 44.39it/s][A

Epoch: 134, Step: 100, Loss: 4.652976808547973



110it [00:02, 43.53it/s][A
115it [00:02, 43.95it/s][A
120it [00:02, 44.19it/s][A
125it [00:02, 44.22it/s][A
130it [00:02, 44.33it/s][A
135it [00:02, 44.51it/s][A
140it [00:03, 44.86it/s][A
145it [00:03, 44.44it/s][A
150it [00:03, 44.73it/s][A
155it [00:03, 45.10it/s][A
160it [00:03, 44.99it/s][A
165it [00:03, 44.23it/s][A
170it [00:03, 44.52it/s][A
175it [00:03, 44.77it/s][A
180it [00:04, 45.02it/s][A
185it [00:04, 45.26it/s][A
190it [00:04, 44.45it/s][A
195it [00:04, 44.90it/s][A
200it [00:04, 45.09it/s][A
205it [00:04, 45.16it/s][A

Epoch: 134, Step: 200, Loss: 4.664713749885559



210it [00:04, 45.11it/s][A
215it [00:04, 44.76it/s][A
220it [00:04, 44.96it/s][A
227it [00:05, 44.96it/s]
 27%|██▋       | 134/500 [15:32<36:12,  5.93s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.90it/s][A
10it [00:00, 45.66it/s][A
15it [00:00, 45.37it/s][A
20it [00:00, 45.30it/s][A
25it [00:00, 45.20it/s][A
30it [00:00, 45.22it/s][A
35it [00:00, 45.23it/s][A
40it [00:00, 45.13it/s][A
45it [00:00, 45.14it/s][A
50it [00:01, 45.35it/s][A
55it [00:01, 45.30it/s][A
60it [00:01, 45.31it/s][A
65it [00:01, 44.96it/s][A
70it [00:01, 44.57it/s][A
75it [00:01, 44.22it/s][A
80it [00:01, 44.67it/s][A
85it [00:01, 44.83it/s][A
90it [00:01, 44.87it/s][A
95it [00:02, 45.19it/s][A
100it [00:02, 45.15it/s][A
105it [00:02, 45.27it/s][A

Epoch: 135, Step: 100, Loss: 4.651953949928283



110it [00:02, 45.15it/s][A
115it [00:02, 45.17it/s][A
120it [00:02, 44.31it/s][A
125it [00:02, 44.05it/s][A
130it [00:02, 44.71it/s][A
135it [00:03, 44.97it/s][A
140it [00:03, 45.36it/s][A
145it [00:03, 45.51it/s][A
150it [00:03, 45.50it/s][A
155it [00:03, 45.45it/s][A
160it [00:03, 45.48it/s][A
165it [00:03, 45.63it/s][A
170it [00:03, 45.11it/s][A
175it [00:03, 45.09it/s][A
180it [00:03, 45.20it/s][A
185it [00:04, 45.29it/s][A
190it [00:04, 45.23it/s][A
195it [00:04, 45.17it/s][A
200it [00:04, 45.32it/s][A
205it [00:04, 44.78it/s][A

Epoch: 135, Step: 200, Loss: 4.660847997665405



210it [00:04, 45.10it/s][A
215it [00:04, 45.29it/s][A
220it [00:04, 45.48it/s][A
227it [00:05, 45.13it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.88it/s][A
12it [00:00, 58.33it/s][A
19it [00:00, 59.71it/s][A
25it [00:00, 59.81it/s][A
31it [00:00, 59.74it/s][A
37it [00:00, 59.80it/s][A
44it [00:00, 60.03it/s][A
50it [00:00, 59.94it/s][A
57it [00:00, 60.01it/s][A
64it [00:01, 60.19it/s][A
71it [00:01, 60.36it/s][A
78it [00:01, 60.56it/s][A
85it [00:01, 60.45it/s][A
92it [00:01, 60.27it/s][A
99it [00:01, 60.37it/s][A
106it [00:01, 60.45it/s][A
113it [00:01, 60.51it/s][A
120it [00:01, 60.75it/s][A
127it [00:02, 60.68it/s][A
134it [00:02, 60.76it/s][A
141it [00:02, 60.13it/s][A
148it [00:02, 60.32it/s][A
155it [00:02, 59.85it/s][A
162it [00:02, 60.02it/s][A
169it [00:02, 58.53it/s][A
175it [00:02, 58.37it/s][A
182it [00:03, 59.35it/s][A
188it [00:03, 58.56it/s][A
195it [00:03, 59.31it/s][A
202it [00:03, 59.84it/s][A
208it [00:03, 59.06it/s][A
215it [00:03, 5


Epoch: 135, Test Loss: 5.4271027819710485, Test Perplexity: 228.20783098588078




0it [00:00, ?it/s][A
5it [00:00, 45.32it/s][A
10it [00:00, 45.08it/s][A
15it [00:00, 45.42it/s][A
20it [00:00, 45.43it/s][A
25it [00:00, 44.17it/s][A
30it [00:00, 44.85it/s][A
35it [00:00, 45.07it/s][A
40it [00:00, 45.21it/s][A
45it [00:01, 44.53it/s][A
50it [00:01, 44.96it/s][A
55it [00:01, 44.85it/s][A
60it [00:01, 44.65it/s][A
65it [00:01, 44.80it/s][A
70it [00:01, 45.00it/s][A
75it [00:01, 45.13it/s][A
80it [00:01, 45.20it/s][A
85it [00:01, 45.28it/s][A
90it [00:02, 44.45it/s][A
95it [00:02, 44.56it/s][A
100it [00:02, 44.67it/s][A
105it [00:02, 44.91it/s][A

Epoch: 136, Step: 100, Loss: 4.652377061843872



110it [00:02, 45.21it/s][A
115it [00:02, 45.61it/s][A
120it [00:02, 45.39it/s][A
125it [00:02, 45.41it/s][A
130it [00:02, 44.62it/s][A
135it [00:03, 44.97it/s][A
140it [00:03, 45.01it/s][A
145it [00:03, 45.13it/s][A
150it [00:03, 45.24it/s][A
155it [00:03, 45.61it/s][A
160it [00:03, 45.81it/s][A
165it [00:03, 45.82it/s][A
170it [00:03, 45.94it/s][A
175it [00:03, 45.88it/s][A
180it [00:03, 45.79it/s][A
185it [00:04, 45.40it/s][A
190it [00:04, 45.17it/s][A
195it [00:04, 44.68it/s][A
200it [00:04, 45.00it/s][A
205it [00:04, 45.05it/s][A

Epoch: 136, Step: 200, Loss: 4.661670217514038



210it [00:04, 45.04it/s][A
215it [00:04, 45.39it/s][A
220it [00:04, 45.68it/s][A
227it [00:05, 45.15it/s]
 27%|██▋       | 136/500 [15:53<46:56,  7.74s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.78it/s][A
10it [00:00, 46.01it/s][A
15it [00:00, 45.31it/s][A
20it [00:00, 45.46it/s][A
25it [00:00, 45.52it/s][A
30it [00:00, 45.38it/s][A
35it [00:00, 45.68it/s][A
40it [00:00, 45.83it/s][A
45it [00:00, 46.02it/s][A
50it [00:01, 46.17it/s][A
55it [00:01, 46.51it/s][A
60it [00:01, 46.69it/s][A
65it [00:01, 46.71it/s][A
70it [00:01, 46.92it/s][A
75it [00:01, 46.72it/s][A
80it [00:01, 46.75it/s][A
85it [00:01, 46.65it/s][A
90it [00:01, 46.51it/s][A
95it [00:02, 46.22it/s][A
100it [00:02, 46.26it/s][A
105it [00:02, 46.39it/s][A

Epoch: 137, Step: 100, Loss: 4.6556517601013185



110it [00:02, 46.59it/s][A
115it [00:02, 46.46it/s][A
120it [00:02, 46.59it/s][A
125it [00:02, 46.54it/s][A
130it [00:02, 46.74it/s][A
135it [00:02, 45.65it/s][A
140it [00:03, 45.89it/s][A
145it [00:03, 45.84it/s][A
150it [00:03, 46.06it/s][A
155it [00:03, 46.04it/s][A
160it [00:03, 46.27it/s][A
165it [00:03, 46.61it/s][A
170it [00:03, 46.57it/s][A
175it [00:03, 46.50it/s][A
180it [00:03, 46.41it/s][A
185it [00:04, 46.20it/s][A
190it [00:04, 46.39it/s][A
195it [00:04, 46.44it/s][A
200it [00:04, 46.73it/s][A
205it [00:04, 46.66it/s][A

Epoch: 137, Step: 200, Loss: 4.659664070606231



210it [00:04, 46.20it/s][A
215it [00:04, 45.88it/s][A
220it [00:04, 45.80it/s][A
227it [00:04, 46.27it/s]
 27%|██▋       | 137/500 [15:58<41:40,  6.89s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.17it/s][A
10it [00:00, 46.46it/s][A
15it [00:00, 44.80it/s][A
20it [00:00, 45.40it/s][A
25it [00:00, 45.41it/s][A
30it [00:00, 45.41it/s][A
35it [00:00, 45.41it/s][A
40it [00:00, 45.39it/s][A
45it [00:00, 45.21it/s][A
50it [00:01, 45.09it/s][A
55it [00:01, 45.20it/s][A
60it [00:01, 45.01it/s][A
65it [00:01, 45.10it/s][A
70it [00:01, 44.98it/s][A
75it [00:01, 45.06it/s][A
80it [00:01, 45.31it/s][A
85it [00:01, 45.59it/s][A
90it [00:01, 45.73it/s][A
95it [00:02, 45.60it/s][A
100it [00:02, 45.71it/s][A
105it [00:02, 45.61it/s][A

Epoch: 138, Step: 100, Loss: 4.640897450447082



110it [00:02, 45.46it/s][A
115it [00:02, 45.52it/s][A
120it [00:02, 45.85it/s][A
125it [00:02, 45.56it/s][A
130it [00:02, 45.64it/s][A
135it [00:02, 45.48it/s][A
140it [00:03, 45.53it/s][A
145it [00:03, 45.70it/s][A
150it [00:03, 45.69it/s][A
155it [00:03, 45.48it/s][A
160it [00:03, 45.55it/s][A
165it [00:03, 44.74it/s][A
170it [00:03, 44.02it/s][A
175it [00:03, 44.35it/s][A
180it [00:03, 44.09it/s][A
185it [00:04, 43.63it/s][A
190it [00:04, 43.38it/s][A
195it [00:04, 44.10it/s][A
200it [00:04, 44.35it/s][A
205it [00:04, 44.72it/s][A

Epoch: 138, Step: 200, Loss: 4.659286551475525



210it [00:04, 44.47it/s][A
215it [00:04, 44.91it/s][A
220it [00:04, 45.13it/s][A
227it [00:05, 45.10it/s]
 28%|██▊       | 138/500 [16:03<38:12,  6.33s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.33it/s][A
10it [00:00, 45.67it/s][A
15it [00:00, 45.51it/s][A
20it [00:00, 45.63it/s][A
25it [00:00, 45.66it/s][A
30it [00:00, 45.41it/s][A
35it [00:00, 44.96it/s][A
40it [00:00, 44.92it/s][A
45it [00:01, 43.95it/s][A
50it [00:01, 44.51it/s][A
55it [00:01, 44.80it/s][A
60it [00:01, 44.47it/s][A
65it [00:01, 44.76it/s][A
70it [00:01, 44.85it/s][A
75it [00:01, 44.98it/s][A
80it [00:01, 45.10it/s][A
85it [00:01, 45.24it/s][A
90it [00:01, 45.17it/s][A
95it [00:02, 45.07it/s][A
100it [00:02, 45.08it/s][A
105it [00:02, 45.40it/s][A

Epoch: 139, Step: 100, Loss: 4.64950273513794



110it [00:02, 44.95it/s][A
115it [00:02, 44.75it/s][A
120it [00:02, 45.16it/s][A
125it [00:02, 45.48it/s][A
130it [00:02, 45.55it/s][A
135it [00:02, 44.89it/s][A
140it [00:03, 45.09it/s][A
145it [00:03, 44.99it/s][A
150it [00:03, 44.30it/s][A
155it [00:03, 44.84it/s][A
160it [00:03, 45.25it/s][A
165it [00:03, 45.64it/s][A
170it [00:03, 45.89it/s][A
175it [00:03, 45.93it/s][A
180it [00:03, 45.79it/s][A
185it [00:04, 45.50it/s][A
190it [00:04, 45.65it/s][A
195it [00:04, 45.76it/s][A
200it [00:04, 45.73it/s][A
205it [00:04, 44.64it/s][A

Epoch: 139, Step: 200, Loss: 4.659283015727997



210it [00:04, 45.10it/s][A
215it [00:04, 45.31it/s][A
220it [00:04, 45.18it/s][A
227it [00:05, 45.15it/s]
 28%|██▊       | 139/500 [16:08<35:45,  5.94s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.20it/s][A
10it [00:00, 45.17it/s][A
15it [00:00, 45.24it/s][A
20it [00:00, 45.60it/s][A
25it [00:00, 45.79it/s][A
30it [00:00, 45.45it/s][A
35it [00:00, 45.70it/s][A
40it [00:00, 45.48it/s][A
45it [00:00, 45.77it/s][A
50it [00:01, 45.82it/s][A
55it [00:01, 45.78it/s][A
60it [00:01, 45.75it/s][A
65it [00:01, 45.89it/s][A
70it [00:01, 45.32it/s][A
75it [00:01, 45.21it/s][A
80it [00:01, 45.29it/s][A
85it [00:01, 44.33it/s][A
90it [00:01, 44.87it/s][A
95it [00:02, 45.04it/s][A
100it [00:02, 45.14it/s][A
105it [00:02, 44.94it/s][A

Epoch: 140, Step: 100, Loss: 4.647640948295593



110it [00:02, 45.04it/s][A
115it [00:02, 45.26it/s][A
120it [00:02, 45.42it/s][A
125it [00:02, 45.60it/s][A
130it [00:02, 45.71it/s][A
135it [00:02, 44.86it/s][A
140it [00:03, 44.72it/s][A
145it [00:03, 44.55it/s][A
150it [00:03, 44.99it/s][A
155it [00:03, 44.55it/s][A
160it [00:03, 44.84it/s][A
165it [00:03, 44.99it/s][A
170it [00:03, 45.20it/s][A
175it [00:03, 45.38it/s][A
180it [00:03, 45.52it/s][A
185it [00:04, 45.51it/s][A
190it [00:04, 45.59it/s][A
195it [00:04, 45.43it/s][A
200it [00:04, 45.37it/s][A
205it [00:04, 45.30it/s][A

Epoch: 140, Step: 200, Loss: 4.656942474842071



210it [00:04, 45.21it/s][A
215it [00:04, 45.47it/s][A
220it [00:04, 45.68it/s][A
227it [00:05, 45.27it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.04it/s][A
13it [00:00, 59.79it/s][A
20it [00:00, 60.34it/s][A
27it [00:00, 60.48it/s][A
34it [00:00, 60.58it/s][A
41it [00:00, 60.51it/s][A
48it [00:00, 60.35it/s][A
55it [00:00, 60.29it/s][A
62it [00:01, 60.14it/s][A
69it [00:01, 59.75it/s][A
75it [00:01, 59.78it/s][A
82it [00:01, 60.11it/s][A
89it [00:01, 60.26it/s][A
96it [00:01, 59.25it/s][A
102it [00:01, 59.40it/s][A
109it [00:01, 59.73it/s][A
116it [00:01, 60.05it/s][A
123it [00:02, 60.33it/s][A
130it [00:02, 59.63it/s][A
137it [00:02, 59.88it/s][A
144it [00:02, 60.19it/s][A
151it [00:02, 60.35it/s][A
158it [00:02, 60.17it/s][A
165it [00:02, 60.38it/s][A
172it [00:02, 60.42it/s][A
179it [00:02, 60.68it/s][A
186it [00:03, 58.99it/s][A
192it [00:03, 58.97it/s][A
198it [00:03, 59.12it/s][A
204it [00:03, 59.15it/s][A
210it [00:03, 59.20it/s][A
217it [00:03, 


Epoch: 140, Test Loss: 5.435927705735153, Test Perplexity: 230.25213606461236




0it [00:00, ?it/s][A
5it [00:00, 46.23it/s][A
10it [00:00, 45.65it/s][A
15it [00:00, 45.50it/s][A
20it [00:00, 45.66it/s][A
25it [00:00, 45.93it/s][A
30it [00:00, 45.35it/s][A
35it [00:00, 45.32it/s][A
40it [00:00, 45.31it/s][A
45it [00:00, 45.17it/s][A
50it [00:01, 45.15it/s][A
55it [00:01, 44.75it/s][A
60it [00:01, 44.68it/s][A
65it [00:01, 44.82it/s][A
70it [00:01, 45.23it/s][A
75it [00:01, 45.40it/s][A
80it [00:01, 45.42it/s][A
85it [00:01, 44.84it/s][A
90it [00:01, 45.14it/s][A
95it [00:02, 45.04it/s][A
100it [00:02, 45.12it/s][A
105it [00:02, 44.82it/s][A

Epoch: 141, Step: 100, Loss: 4.6420895290374755



110it [00:02, 44.86it/s][A
115it [00:02, 44.85it/s][A
120it [00:02, 44.73it/s][A
125it [00:02, 45.16it/s][A
130it [00:02, 44.90it/s][A
135it [00:02, 45.16it/s][A
140it [00:03, 45.08it/s][A
145it [00:03, 45.30it/s][A
150it [00:03, 44.99it/s][A
155it [00:03, 45.41it/s][A
160it [00:03, 44.74it/s][A
165it [00:03, 45.50it/s][A
170it [00:03, 45.77it/s][A
175it [00:03, 45.60it/s][A
180it [00:03, 45.35it/s][A
185it [00:04, 44.87it/s][A
190it [00:04, 43.26it/s][A
195it [00:04, 43.82it/s][A
200it [00:04, 43.69it/s][A
205it [00:04, 44.65it/s][A

Epoch: 141, Step: 200, Loss: 4.657730572223663



210it [00:04, 45.28it/s][A
215it [00:04, 45.80it/s][A
220it [00:04, 46.26it/s][A
227it [00:05, 45.18it/s]
 28%|██▊       | 141/500 [16:29<46:11,  7.72s/it]
0it [00:00, ?it/s][A
5it [00:00, 47.11it/s][A
10it [00:00, 47.04it/s][A
15it [00:00, 47.07it/s][A
20it [00:00, 47.29it/s][A
25it [00:00, 46.96it/s][A
30it [00:00, 47.01it/s][A
35it [00:00, 47.19it/s][A
40it [00:00, 47.16it/s][A
45it [00:00, 46.12it/s][A
50it [00:01, 46.61it/s][A
55it [00:01, 47.16it/s][A
60it [00:01, 47.30it/s][A
65it [00:01, 46.71it/s][A
70it [00:01, 46.51it/s][A
75it [00:01, 46.27it/s][A
80it [00:01, 46.06it/s][A
85it [00:01, 46.20it/s][A
90it [00:01, 46.23it/s][A
95it [00:02, 46.25it/s][A
100it [00:02, 46.15it/s][A
105it [00:02, 46.05it/s][A

Epoch: 142, Step: 100, Loss: 4.642607188224792



110it [00:02, 45.90it/s][A
115it [00:02, 45.84it/s][A
120it [00:02, 45.93it/s][A
125it [00:02, 46.06it/s][A
130it [00:02, 45.94it/s][A
135it [00:02, 46.01it/s][A
140it [00:03, 45.98it/s][A
145it [00:03, 45.92it/s][A
150it [00:03, 45.79it/s][A
155it [00:03, 45.63it/s][A
160it [00:03, 44.60it/s][A
165it [00:03, 45.09it/s][A
170it [00:03, 45.54it/s][A
175it [00:03, 45.70it/s][A
180it [00:03, 45.69it/s][A
185it [00:04, 45.76it/s][A
190it [00:04, 45.85it/s][A
195it [00:04, 45.48it/s][A
200it [00:04, 45.60it/s][A
205it [00:04, 45.35it/s][A

Epoch: 142, Step: 200, Loss: 4.653441035747528



210it [00:04, 45.55it/s][A
215it [00:04, 45.78it/s][A
220it [00:04, 45.84it/s][A
227it [00:04, 46.05it/s]
 28%|██▊       | 142/500 [16:34<41:04,  6.88s/it]
0it [00:00, ?it/s][A
4it [00:00, 38.30it/s][A
9it [00:00, 42.72it/s][A
14it [00:00, 44.21it/s][A
19it [00:00, 43.47it/s][A
24it [00:00, 43.98it/s][A
29it [00:00, 44.35it/s][A
34it [00:00, 44.49it/s][A
39it [00:00, 44.88it/s][A
44it [00:00, 45.00it/s][A
49it [00:01, 45.09it/s][A
54it [00:01, 44.91it/s][A
59it [00:01, 44.98it/s][A
64it [00:01, 45.09it/s][A
69it [00:01, 45.34it/s][A
74it [00:01, 45.54it/s][A
79it [00:01, 45.36it/s][A
84it [00:01, 45.61it/s][A
89it [00:01, 45.70it/s][A
94it [00:02, 45.64it/s][A
99it [00:02, 45.77it/s][A
104it [00:02, 45.54it/s][A

Epoch: 143, Step: 100, Loss: 4.644268851280213



109it [00:02, 44.64it/s][A
114it [00:02, 44.06it/s][A
119it [00:02, 44.21it/s][A
124it [00:02, 44.33it/s][A
129it [00:02, 44.73it/s][A
134it [00:02, 44.90it/s][A
139it [00:03, 45.01it/s][A
144it [00:03, 45.11it/s][A
149it [00:03, 45.07it/s][A
154it [00:03, 44.70it/s][A
159it [00:03, 44.29it/s][A
164it [00:03, 44.39it/s][A
169it [00:03, 44.72it/s][A
174it [00:03, 44.87it/s][A
179it [00:03, 44.78it/s][A
184it [00:04, 44.84it/s][A
189it [00:04, 44.98it/s][A
194it [00:04, 44.25it/s][A
199it [00:04, 43.03it/s][A
204it [00:04, 43.35it/s][A


Epoch: 143, Step: 200, Loss: 4.65229079246521


209it [00:04, 43.74it/s][A
214it [00:04, 44.12it/s][A
219it [00:04, 44.50it/s][A
227it [00:05, 44.67it/s]
 29%|██▊       | 143/500 [16:39<37:44,  6.34s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.86it/s][A
10it [00:00, 45.42it/s][A
15it [00:00, 45.46it/s][A
20it [00:00, 43.51it/s][A
25it [00:00, 43.94it/s][A
30it [00:00, 44.42it/s][A
35it [00:00, 45.03it/s][A
40it [00:00, 45.46it/s][A
45it [00:00, 45.68it/s][A
50it [00:01, 46.11it/s][A
55it [00:01, 45.93it/s][A
60it [00:01, 45.72it/s][A
65it [00:01, 45.67it/s][A
70it [00:01, 45.91it/s][A
75it [00:01, 45.97it/s][A
80it [00:01, 45.92it/s][A
85it [00:01, 45.91it/s][A
90it [00:01, 45.65it/s][A
95it [00:02, 45.76it/s][A
100it [00:02, 45.63it/s][A
105it [00:02, 44.56it/s][A

Epoch: 144, Step: 100, Loss: 4.6388374614715575



110it [00:02, 44.74it/s][A
115it [00:02, 44.99it/s][A
120it [00:02, 45.30it/s][A
125it [00:02, 45.53it/s][A
130it [00:02, 45.80it/s][A
135it [00:02, 45.24it/s][A
140it [00:03, 45.59it/s][A
145it [00:03, 45.49it/s][A
150it [00:03, 45.44it/s][A
155it [00:03, 45.51it/s][A
160it [00:03, 45.42it/s][A
165it [00:03, 45.67it/s][A
170it [00:03, 45.59it/s][A
175it [00:03, 45.78it/s][A
180it [00:03, 45.69it/s][A
185it [00:04, 45.68it/s][A
190it [00:04, 45.54it/s][A
195it [00:04, 45.46it/s][A
200it [00:04, 44.43it/s][A
205it [00:04, 44.99it/s][A

Epoch: 144, Step: 200, Loss: 4.6524978089332585



210it [00:04, 45.20it/s][A
215it [00:04, 45.43it/s][A
220it [00:04, 45.69it/s][A
227it [00:05, 45.40it/s]
 29%|██▉       | 144/500 [16:44<35:15,  5.94s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.02it/s][A
10it [00:00, 45.56it/s][A
15it [00:00, 45.22it/s][A
20it [00:00, 45.11it/s][A
25it [00:00, 45.27it/s][A
30it [00:00, 45.51it/s][A
35it [00:00, 45.47it/s][A
40it [00:00, 44.58it/s][A
45it [00:00, 44.73it/s][A
50it [00:01, 44.96it/s][A
55it [00:01, 45.21it/s][A
60it [00:01, 45.17it/s][A
65it [00:01, 44.95it/s][A
70it [00:01, 44.89it/s][A
75it [00:01, 45.23it/s][A
80it [00:01, 45.29it/s][A
85it [00:01, 44.71it/s][A
90it [00:01, 45.19it/s][A
95it [00:02, 45.42it/s][A
100it [00:02, 45.41it/s][A
105it [00:02, 45.45it/s][A

Epoch: 145, Step: 100, Loss: 4.644445238113403



110it [00:02, 44.31it/s][A
115it [00:02, 44.85it/s][A
120it [00:02, 45.29it/s][A
125it [00:02, 45.19it/s][A
130it [00:02, 45.15it/s][A
135it [00:02, 44.27it/s][A
140it [00:03, 44.95it/s][A
145it [00:03, 45.20it/s][A
150it [00:03, 45.30it/s][A
155it [00:03, 45.42it/s][A
160it [00:03, 45.34it/s][A
165it [00:03, 45.42it/s][A
170it [00:03, 45.40it/s][A
175it [00:03, 45.43it/s][A
180it [00:03, 45.32it/s][A
185it [00:04, 45.19it/s][A
190it [00:04, 45.45it/s][A
195it [00:04, 45.36it/s][A
200it [00:04, 45.13it/s][A
205it [00:04, 44.49it/s][A

Epoch: 145, Step: 200, Loss: 4.649911305904388



210it [00:04, 44.28it/s][A
215it [00:04, 44.47it/s][A
220it [00:04, 44.68it/s][A
227it [00:05, 45.08it/s]

0it [00:00, ?it/s][A
6it [00:00, 56.48it/s][A
12it [00:00, 58.37it/s][A
18it [00:00, 59.02it/s][A
24it [00:00, 59.17it/s][A
30it [00:00, 59.01it/s][A
36it [00:00, 59.27it/s][A
42it [00:00, 59.03it/s][A
48it [00:00, 58.88it/s][A
54it [00:00, 59.15it/s][A
60it [00:01, 59.33it/s][A
66it [00:01, 58.08it/s][A
72it [00:01, 58.32it/s][A
79it [00:01, 59.00it/s][A
86it [00:01, 59.39it/s][A
92it [00:01, 59.24it/s][A
98it [00:01, 58.98it/s][A
104it [00:01, 59.02it/s][A
110it [00:01, 59.15it/s][A
116it [00:01, 59.01it/s][A
122it [00:02, 59.03it/s][A
129it [00:02, 59.54it/s][A
135it [00:02, 59.62it/s][A
142it [00:02, 59.97it/s][A
149it [00:02, 60.06it/s][A
156it [00:02, 59.87it/s][A
163it [00:02, 60.17it/s][A
170it [00:02, 60.43it/s][A
177it [00:02, 59.24it/s][A
184it [00:03, 59.83it/s][A
191it [00:03, 60.20it/s][A
198it [00:03, 60.40it/s][A
205it [00:03, 59


Epoch: 145, Test Loss: 5.428747709493459, Test Perplexity: 228.64055090957547




0it [00:00, ?it/s][A
5it [00:00, 45.87it/s][A
10it [00:00, 45.93it/s][A
15it [00:00, 45.91it/s][A
20it [00:00, 46.02it/s][A
25it [00:00, 46.21it/s][A
30it [00:00, 46.31it/s][A
35it [00:00, 46.41it/s][A
40it [00:00, 46.58it/s][A
45it [00:00, 46.63it/s][A
50it [00:01, 45.18it/s][A
55it [00:01, 45.58it/s][A
60it [00:01, 45.83it/s][A
65it [00:01, 44.77it/s][A
70it [00:01, 44.88it/s][A
75it [00:01, 44.33it/s][A
80it [00:01, 44.76it/s][A
85it [00:01, 45.16it/s][A
90it [00:01, 45.36it/s][A
95it [00:02, 45.45it/s][A
100it [00:02, 45.70it/s][A
105it [00:02, 45.47it/s][A

Epoch: 146, Step: 100, Loss: 4.639589881896972



110it [00:02, 45.08it/s][A
115it [00:02, 44.53it/s][A
120it [00:02, 44.78it/s][A
125it [00:02, 45.36it/s][A
130it [00:02, 45.67it/s][A
135it [00:02, 44.92it/s][A
140it [00:03, 44.35it/s][A
145it [00:03, 44.92it/s][A
150it [00:03, 45.19it/s][A
155it [00:03, 45.22it/s][A
160it [00:03, 45.29it/s][A
165it [00:03, 45.31it/s][A
170it [00:03, 45.32it/s][A
175it [00:03, 45.30it/s][A
180it [00:03, 44.89it/s][A
185it [00:04, 45.04it/s][A
190it [00:04, 45.17it/s][A
195it [00:04, 45.23it/s][A
200it [00:04, 45.24it/s][A
205it [00:04, 45.14it/s][A

Epoch: 146, Step: 200, Loss: 4.649214074611664



210it [00:04, 45.21it/s][A
215it [00:04, 45.17it/s][A
220it [00:04, 45.09it/s][A
227it [00:05, 45.29it/s]
 29%|██▉       | 146/500 [17:05<45:36,  7.73s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.09it/s][A
10it [00:00, 45.38it/s][A
15it [00:00, 45.73it/s][A
20it [00:00, 45.68it/s][A
25it [00:00, 45.55it/s][A
30it [00:00, 45.31it/s][A
35it [00:00, 45.48it/s][A
40it [00:00, 45.61it/s][A
45it [00:00, 45.70it/s][A
50it [00:01, 45.75it/s][A
55it [00:01, 45.41it/s][A
60it [00:01, 45.41it/s][A
65it [00:01, 45.51it/s][A
70it [00:01, 44.49it/s][A
75it [00:01, 44.82it/s][A
80it [00:01, 45.09it/s][A
85it [00:01, 44.70it/s][A
90it [00:01, 45.26it/s][A
95it [00:02, 45.21it/s][A
100it [00:02, 45.30it/s][A
105it [00:02, 45.52it/s][A

Epoch: 147, Step: 100, Loss: 4.630336470603943



110it [00:02, 45.06it/s][A
115it [00:02, 45.13it/s][A
120it [00:02, 45.32it/s][A
125it [00:02, 45.37it/s][A
130it [00:02, 45.61it/s][A
135it [00:02, 45.68it/s][A
140it [00:03, 45.87it/s][A
145it [00:03, 45.90it/s][A
150it [00:03, 45.85it/s][A
155it [00:03, 45.93it/s][A
160it [00:03, 44.80it/s][A
165it [00:03, 45.21it/s][A
170it [00:03, 44.34it/s][A
175it [00:03, 45.03it/s][A
180it [00:03, 45.35it/s][A
185it [00:04, 45.66it/s][A
190it [00:04, 45.86it/s][A
195it [00:04, 45.35it/s][A
200it [00:04, 45.57it/s][A
205it [00:04, 45.70it/s][A

Epoch: 147, Step: 200, Loss: 4.647117211818695



210it [00:04, 45.73it/s][A
215it [00:04, 45.95it/s][A
220it [00:04, 45.98it/s][A
227it [00:04, 45.42it/s]
 29%|██▉       | 147/500 [17:10<40:40,  6.91s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.55it/s][A
10it [00:00, 46.53it/s][A
15it [00:00, 45.89it/s][A
20it [00:00, 45.63it/s][A
25it [00:00, 45.52it/s][A
30it [00:00, 45.50it/s][A
35it [00:00, 45.25it/s][A
40it [00:00, 45.53it/s][A
45it [00:00, 45.73it/s][A
50it [00:01, 45.84it/s][A
55it [00:01, 45.70it/s][A
60it [00:01, 44.81it/s][A
65it [00:01, 44.92it/s][A
70it [00:01, 43.96it/s][A
75it [00:01, 44.45it/s][A
80it [00:01, 44.79it/s][A
85it [00:01, 44.81it/s][A
90it [00:01, 45.00it/s][A
95it [00:02, 45.12it/s][A
100it [00:02, 44.51it/s][A
105it [00:02, 44.86it/s][A

Epoch: 148, Step: 100, Loss: 4.645346097946167



110it [00:02, 44.68it/s][A
115it [00:02, 45.03it/s][A
120it [00:02, 45.05it/s][A
125it [00:02, 44.43it/s][A
130it [00:02, 44.27it/s][A
135it [00:03, 44.49it/s][A
140it [00:03, 43.82it/s][A
145it [00:03, 44.04it/s][A
150it [00:03, 44.12it/s][A
155it [00:03, 44.66it/s][A
160it [00:03, 44.84it/s][A
165it [00:03, 44.93it/s][A
170it [00:03, 44.37it/s][A
175it [00:03, 42.77it/s][A
180it [00:04, 43.16it/s][A
185it [00:04, 43.69it/s][A
190it [00:04, 44.36it/s][A
195it [00:04, 44.83it/s][A
200it [00:04, 45.04it/s][A
205it [00:04, 44.90it/s][A

Epoch: 148, Step: 200, Loss: 4.649107465744018



210it [00:04, 44.39it/s][A
215it [00:04, 44.12it/s][A
220it [00:04, 44.23it/s][A
227it [00:05, 44.66it/s]
 30%|██▉       | 148/500 [17:15<37:20,  6.36s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.24it/s][A
10it [00:00, 46.07it/s][A
15it [00:00, 45.30it/s][A
20it [00:00, 45.26it/s][A
25it [00:00, 45.30it/s][A
30it [00:00, 45.30it/s][A
35it [00:00, 45.37it/s][A
40it [00:00, 45.47it/s][A
45it [00:00, 45.40it/s][A
50it [00:01, 45.49it/s][A
55it [00:01, 45.72it/s][A
60it [00:01, 45.69it/s][A
65it [00:01, 45.55it/s][A
70it [00:01, 45.37it/s][A
75it [00:01, 45.16it/s][A
80it [00:01, 44.56it/s][A
85it [00:01, 44.70it/s][A
90it [00:01, 44.84it/s][A
95it [00:02, 44.92it/s][A
100it [00:02, 45.11it/s][A
105it [00:02, 45.14it/s][A

Epoch: 149, Step: 100, Loss: 4.627364168167114



110it [00:02, 45.00it/s][A
115it [00:02, 44.25it/s][A
120it [00:02, 44.55it/s][A
125it [00:02, 44.82it/s][A
130it [00:02, 45.25it/s][A
135it [00:02, 45.44it/s][A
140it [00:03, 45.61it/s][A
145it [00:03, 45.71it/s][A
150it [00:03, 45.56it/s][A
155it [00:03, 45.52it/s][A
160it [00:03, 45.35it/s][A
165it [00:03, 45.07it/s][A
170it [00:03, 45.33it/s][A
175it [00:03, 45.47it/s][A
180it [00:03, 45.69it/s][A
185it [00:04, 45.90it/s][A
190it [00:04, 45.41it/s][A
195it [00:04, 45.76it/s][A
200it [00:04, 45.85it/s][A
205it [00:04, 45.91it/s][A

Epoch: 149, Step: 200, Loss: 4.645972361564636



210it [00:04, 45.84it/s][A
215it [00:04, 44.85it/s][A
220it [00:04, 45.11it/s][A
227it [00:05, 45.31it/s]
 30%|██▉       | 149/500 [17:20<34:51,  5.96s/it]
0it [00:00, ?it/s][A
5it [00:00, 41.63it/s][A
10it [00:00, 43.75it/s][A
15it [00:00, 44.88it/s][A
20it [00:00, 45.29it/s][A
25it [00:00, 45.31it/s][A
30it [00:00, 45.36it/s][A
35it [00:00, 45.35it/s][A
40it [00:00, 45.63it/s][A
45it [00:01, 45.12it/s][A
50it [00:01, 45.42it/s][A
55it [00:01, 45.37it/s][A
60it [00:01, 45.38it/s][A
65it [00:01, 45.40it/s][A
70it [00:01, 45.14it/s][A
75it [00:01, 45.13it/s][A
80it [00:01, 45.26it/s][A
85it [00:01, 45.61it/s][A
90it [00:01, 45.46it/s][A
95it [00:02, 45.44it/s][A
100it [00:02, 45.62it/s][A
105it [00:02, 45.53it/s][A

Epoch: 150, Step: 100, Loss: 4.634470672607422



110it [00:02, 45.49it/s][A
115it [00:02, 45.24it/s][A
120it [00:02, 45.19it/s][A
125it [00:02, 45.39it/s][A
130it [00:02, 45.42it/s][A
135it [00:02, 45.56it/s][A
140it [00:03, 45.61it/s][A
145it [00:03, 45.68it/s][A
150it [00:03, 45.73it/s][A
155it [00:03, 45.79it/s][A
160it [00:03, 45.73it/s][A
165it [00:03, 45.56it/s][A
170it [00:03, 45.38it/s][A
175it [00:03, 45.60it/s][A
180it [00:03, 45.47it/s][A
185it [00:04, 45.37it/s][A
190it [00:04, 44.94it/s][A
195it [00:04, 44.89it/s][A
200it [00:04, 44.61it/s][A
205it [00:04, 44.69it/s][A

Epoch: 150, Step: 200, Loss: 4.64500590801239



210it [00:04, 44.53it/s][A
215it [00:04, 44.80it/s][A
220it [00:04, 44.46it/s][A
227it [00:05, 45.06it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.88it/s][A
12it [00:00, 57.81it/s][A
18it [00:00, 57.91it/s][A
24it [00:00, 58.61it/s][A
31it [00:00, 59.52it/s][A
38it [00:00, 59.86it/s][A
44it [00:00, 59.84it/s][A
51it [00:00, 60.19it/s][A
58it [00:00, 60.14it/s][A
65it [00:01, 60.48it/s][A
72it [00:01, 60.47it/s][A
79it [00:01, 60.58it/s][A
86it [00:01, 60.61it/s][A
93it [00:01, 59.74it/s][A
99it [00:01, 59.75it/s][A
106it [00:01, 59.92it/s][A
112it [00:01, 59.88it/s][A
119it [00:01, 60.01it/s][A
126it [00:02, 60.35it/s][A
133it [00:02, 59.84it/s][A
140it [00:02, 60.28it/s][A
147it [00:02, 60.19it/s][A
154it [00:02, 60.49it/s][A
161it [00:02, 60.39it/s][A
168it [00:02, 60.42it/s][A
175it [00:02, 60.43it/s][A
182it [00:03, 60.18it/s][A
189it [00:03, 60.38it/s][A
196it [00:03, 60.45it/s][A
203it [00:03, 60.59it/s][A
210it [00:03, 60.51it/s][A
217it [00:03, 5


Epoch: 150, Test Loss: 5.43481961762683, Test Perplexity: 230.0059036349658




0it [00:00, ?it/s][A
5it [00:00, 45.07it/s][A
10it [00:00, 45.41it/s][A
15it [00:00, 45.71it/s][A
20it [00:00, 44.99it/s][A
25it [00:00, 44.83it/s][A
30it [00:00, 44.75it/s][A
35it [00:00, 44.97it/s][A
40it [00:00, 44.89it/s][A
45it [00:01, 44.66it/s][A
50it [00:01, 43.85it/s][A
55it [00:01, 44.59it/s][A
60it [00:01, 44.90it/s][A
65it [00:01, 43.96it/s][A
70it [00:01, 43.87it/s][A
75it [00:01, 44.52it/s][A
80it [00:01, 44.51it/s][A
85it [00:01, 44.51it/s][A
90it [00:02, 44.70it/s][A
95it [00:02, 44.86it/s][A
100it [00:02, 44.21it/s][A
105it [00:02, 44.67it/s][A

Epoch: 151, Step: 100, Loss: 4.6340098428726195



110it [00:02, 44.25it/s][A
115it [00:02, 44.63it/s][A
120it [00:02, 44.84it/s][A
125it [00:02, 44.97it/s][A
130it [00:02, 43.91it/s][A
135it [00:03, 44.48it/s][A
140it [00:03, 44.71it/s][A
145it [00:03, 44.91it/s][A
150it [00:03, 44.95it/s][A
155it [00:03, 45.21it/s][A
160it [00:03, 44.29it/s][A
165it [00:03, 44.77it/s][A
170it [00:03, 44.83it/s][A
175it [00:03, 44.21it/s][A
180it [00:04, 44.78it/s][A
185it [00:04, 45.17it/s][A
190it [00:04, 45.44it/s][A
195it [00:04, 45.66it/s][A
200it [00:04, 45.65it/s][A
205it [00:04, 45.59it/s][A

Epoch: 151, Step: 200, Loss: 4.64419823884964



210it [00:04, 45.42it/s][A
215it [00:04, 45.50it/s][A
220it [00:04, 45.61it/s][A
227it [00:05, 44.84it/s]
 30%|███       | 151/500 [17:41<45:04,  7.75s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.85it/s][A
10it [00:00, 45.27it/s][A
15it [00:00, 45.47it/s][A
20it [00:00, 44.82it/s][A
25it [00:00, 45.06it/s][A
30it [00:00, 43.88it/s][A
35it [00:00, 44.58it/s][A
40it [00:00, 44.81it/s][A
45it [00:01, 45.24it/s][A
50it [00:01, 45.55it/s][A
55it [00:01, 45.54it/s][A
60it [00:01, 45.19it/s][A
65it [00:01, 45.03it/s][A
70it [00:01, 45.21it/s][A
75it [00:01, 45.16it/s][A
80it [00:01, 45.27it/s][A
85it [00:01, 45.23it/s][A
90it [00:01, 45.17it/s][A
95it [00:02, 45.28it/s][A
100it [00:02, 44.99it/s][A
105it [00:02, 44.94it/s][A

Epoch: 152, Step: 100, Loss: 4.63506172657013



110it [00:02, 44.83it/s][A
115it [00:02, 45.05it/s][A
120it [00:02, 45.02it/s][A
125it [00:02, 45.18it/s][A
130it [00:02, 45.03it/s][A
135it [00:02, 45.31it/s][A
140it [00:03, 45.41it/s][A
145it [00:03, 45.46it/s][A
150it [00:03, 44.67it/s][A
155it [00:03, 44.94it/s][A
160it [00:03, 44.99it/s][A
165it [00:03, 45.01it/s][A
170it [00:03, 45.28it/s][A
175it [00:03, 45.41it/s][A
180it [00:03, 44.45it/s][A
185it [00:04, 44.73it/s][A
190it [00:04, 44.94it/s][A
195it [00:04, 45.03it/s][A
200it [00:04, 45.26it/s][A
205it [00:04, 44.73it/s][A

Epoch: 152, Step: 200, Loss: 4.644687030315399



210it [00:04, 45.00it/s][A
215it [00:04, 45.32it/s][A
220it [00:04, 45.61it/s][A
227it [00:05, 45.11it/s]
 30%|███       | 152/500 [17:46<40:13,  6.94s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.91it/s][A
10it [00:00, 46.21it/s][A
15it [00:00, 44.39it/s][A
20it [00:00, 45.10it/s][A
25it [00:00, 44.35it/s][A
30it [00:00, 44.85it/s][A
35it [00:00, 44.79it/s][A
40it [00:00, 45.29it/s][A
45it [00:01, 44.36it/s][A
50it [00:01, 44.31it/s][A
55it [00:01, 44.55it/s][A
60it [00:01, 44.02it/s][A
65it [00:01, 44.64it/s][A
70it [00:01, 44.85it/s][A
75it [00:01, 44.61it/s][A
80it [00:01, 44.19it/s][A
85it [00:01, 44.52it/s][A
90it [00:02, 44.81it/s][A
95it [00:02, 44.66it/s][A
100it [00:02, 44.77it/s][A
105it [00:02, 43.85it/s][A

Epoch: 153, Step: 100, Loss: 4.623190140724182



110it [00:02, 43.90it/s][A
115it [00:02, 44.44it/s][A
120it [00:02, 44.97it/s][A
125it [00:02, 44.77it/s][A
130it [00:02, 45.01it/s][A
135it [00:03, 45.30it/s][A
140it [00:03, 45.56it/s][A
145it [00:03, 45.01it/s][A
150it [00:03, 44.84it/s][A
155it [00:03, 44.91it/s][A
160it [00:03, 45.09it/s][A
165it [00:03, 45.19it/s][A
170it [00:03, 44.98it/s][A
175it [00:03, 45.16it/s][A
180it [00:04, 45.27it/s][A
185it [00:04, 45.53it/s][A
190it [00:04, 45.56it/s][A
195it [00:04, 45.67it/s][A
200it [00:04, 45.61it/s][A
205it [00:04, 45.42it/s][A

Epoch: 153, Step: 200, Loss: 4.639765586853027



210it [00:04, 45.24it/s][A
215it [00:04, 45.44it/s][A
220it [00:04, 44.30it/s][A
227it [00:05, 44.85it/s]
 31%|███       | 153/500 [17:51<36:51,  6.37s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.32it/s][A
10it [00:00, 44.80it/s][A
15it [00:00, 44.99it/s][A
20it [00:00, 44.98it/s][A
25it [00:00, 44.90it/s][A
30it [00:00, 45.18it/s][A
35it [00:00, 45.42it/s][A
40it [00:00, 44.62it/s][A
45it [00:01, 45.05it/s][A
50it [00:01, 45.08it/s][A
55it [00:01, 45.27it/s][A
60it [00:01, 45.56it/s][A
65it [00:01, 45.56it/s][A
70it [00:01, 44.87it/s][A
75it [00:01, 45.17it/s][A
80it [00:01, 45.38it/s][A
85it [00:01, 45.49it/s][A
90it [00:01, 45.61it/s][A
95it [00:02, 45.60it/s][A
100it [00:02, 45.77it/s][A
105it [00:02, 45.78it/s][A

Epoch: 154, Step: 100, Loss: 4.6368483877182



110it [00:02, 45.56it/s][A
115it [00:02, 45.50it/s][A
120it [00:02, 45.66it/s][A
125it [00:02, 45.78it/s][A
130it [00:02, 45.46it/s][A
135it [00:02, 45.52it/s][A
140it [00:03, 45.61it/s][A
145it [00:03, 45.74it/s][A
150it [00:03, 45.52it/s][A
155it [00:03, 45.52it/s][A
160it [00:03, 45.50it/s][A
165it [00:03, 45.51it/s][A
170it [00:03, 45.54it/s][A
175it [00:03, 45.24it/s][A
180it [00:03, 45.53it/s][A
185it [00:04, 45.86it/s][A
190it [00:04, 45.22it/s][A
195it [00:04, 45.19it/s][A
200it [00:04, 45.15it/s][A
205it [00:04, 45.10it/s][A

Epoch: 154, Step: 200, Loss: 4.645300078392029



210it [00:04, 45.12it/s][A
215it [00:04, 45.33it/s][A
220it [00:04, 45.12it/s][A
227it [00:05, 45.33it/s]
 31%|███       | 154/500 [17:56<34:23,  5.96s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.17it/s][A
10it [00:00, 44.51it/s][A
15it [00:00, 44.58it/s][A
20it [00:00, 44.72it/s][A
25it [00:00, 44.62it/s][A
30it [00:00, 44.92it/s][A
35it [00:00, 45.16it/s][A
40it [00:00, 45.29it/s][A
45it [00:00, 45.49it/s][A
50it [00:01, 45.76it/s][A
55it [00:01, 45.91it/s][A
60it [00:01, 46.07it/s][A
65it [00:01, 45.77it/s][A
70it [00:01, 45.20it/s][A
75it [00:01, 44.88it/s][A
80it [00:01, 45.27it/s][A
85it [00:01, 45.35it/s][A
90it [00:01, 45.25it/s][A
95it [00:02, 45.32it/s][A
100it [00:02, 45.57it/s][A
105it [00:02, 45.69it/s][A

Epoch: 155, Step: 100, Loss: 4.622346076965332



110it [00:02, 45.53it/s][A
115it [00:02, 44.90it/s][A
120it [00:02, 45.13it/s][A
125it [00:02, 45.41it/s][A
130it [00:02, 45.54it/s][A
135it [00:02, 45.29it/s][A
140it [00:03, 45.31it/s][A
145it [00:03, 45.37it/s][A
150it [00:03, 45.40it/s][A
155it [00:03, 45.34it/s][A
160it [00:03, 44.31it/s][A
165it [00:03, 44.67it/s][A
170it [00:03, 43.81it/s][A
175it [00:03, 44.12it/s][A
180it [00:03, 44.81it/s][A
185it [00:04, 45.21it/s][A
190it [00:04, 45.60it/s][A
195it [00:04, 44.47it/s][A
200it [00:04, 45.14it/s][A
205it [00:04, 44.97it/s][A

Epoch: 155, Step: 200, Loss: 4.641774888038635



210it [00:04, 44.78it/s][A
215it [00:04, 44.08it/s][A
220it [00:04, 44.63it/s][A
227it [00:05, 45.07it/s]

0it [00:00, ?it/s][A
6it [00:00, 56.75it/s][A
12it [00:00, 56.28it/s][A
19it [00:00, 58.23it/s][A
25it [00:00, 56.76it/s][A
31it [00:00, 57.62it/s][A
37it [00:00, 58.04it/s][A
44it [00:00, 59.26it/s][A
51it [00:00, 59.84it/s][A
57it [00:00, 58.52it/s][A
63it [00:01, 58.73it/s][A
70it [00:01, 59.64it/s][A
77it [00:01, 60.37it/s][A
84it [00:01, 60.82it/s][A
91it [00:01, 60.98it/s][A
98it [00:01, 61.05it/s][A
105it [00:01, 61.16it/s][A
112it [00:01, 61.05it/s][A
119it [00:01, 61.19it/s][A
126it [00:02, 61.03it/s][A
133it [00:02, 60.91it/s][A
140it [00:02, 60.91it/s][A
147it [00:02, 60.88it/s][A
154it [00:02, 61.09it/s][A
161it [00:02, 61.20it/s][A
168it [00:02, 59.79it/s][A
175it [00:02, 60.36it/s][A
182it [00:03, 60.76it/s][A
189it [00:03, 61.00it/s][A
196it [00:03, 60.38it/s][A
203it [00:03, 59.70it/s][A
210it [00:03, 59.97it/s][A
217it [00:03, 6


Epoch: 155, Test Loss: 5.43129525273483, Test Perplexity: 229.16193941957462




0it [00:00, ?it/s][A
5it [00:00, 45.28it/s][A
10it [00:00, 45.38it/s][A
15it [00:00, 45.82it/s][A
20it [00:00, 45.88it/s][A
25it [00:00, 46.05it/s][A
30it [00:00, 45.98it/s][A
35it [00:00, 45.96it/s][A
40it [00:00, 45.82it/s][A
45it [00:00, 45.87it/s][A
50it [00:01, 45.40it/s][A
55it [00:01, 45.63it/s][A
60it [00:01, 45.09it/s][A
65it [00:01, 44.91it/s][A
70it [00:01, 44.10it/s][A
75it [00:01, 44.05it/s][A
80it [00:01, 44.70it/s][A
85it [00:01, 43.91it/s][A
90it [00:02, 43.64it/s][A
95it [00:02, 43.58it/s][A
100it [00:02, 44.11it/s][A
105it [00:02, 44.63it/s][A

Epoch: 156, Step: 100, Loss: 4.624070172309875



110it [00:02, 44.66it/s][A
115it [00:02, 44.88it/s][A
120it [00:02, 44.96it/s][A
125it [00:02, 44.55it/s][A
130it [00:02, 44.12it/s][A
135it [00:03, 44.36it/s][A
140it [00:03, 44.66it/s][A
145it [00:03, 44.76it/s][A
150it [00:03, 44.98it/s][A
155it [00:03, 44.56it/s][A
160it [00:03, 44.75it/s][A
165it [00:03, 44.90it/s][A
170it [00:03, 44.99it/s][A
175it [00:03, 45.05it/s][A
180it [00:04, 45.09it/s][A
185it [00:04, 45.27it/s][A
190it [00:04, 45.31it/s][A
195it [00:04, 45.47it/s][A
200it [00:04, 45.56it/s][A
205it [00:04, 45.48it/s][A

Epoch: 156, Step: 200, Loss: 4.6380258917808534



210it [00:04, 45.40it/s][A
215it [00:04, 45.43it/s][A
220it [00:04, 45.45it/s][A
227it [00:05, 44.99it/s]
 31%|███       | 156/500 [18:17<44:33,  7.77s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.36it/s][A
10it [00:00, 46.09it/s][A
15it [00:00, 46.18it/s][A
20it [00:00, 46.13it/s][A
25it [00:00, 45.16it/s][A
30it [00:00, 45.47it/s][A
35it [00:00, 45.42it/s][A
40it [00:00, 45.26it/s][A
45it [00:00, 45.01it/s][A
50it [00:01, 45.31it/s][A
55it [00:01, 44.91it/s][A
60it [00:01, 45.17it/s][A
65it [00:01, 44.47it/s][A
70it [00:01, 45.00it/s][A
75it [00:01, 45.27it/s][A
80it [00:01, 45.25it/s][A
85it [00:01, 44.64it/s][A
90it [00:01, 45.00it/s][A
95it [00:02, 44.13it/s][A
100it [00:02, 44.52it/s][A
105it [00:02, 44.80it/s][A

Epoch: 157, Step: 100, Loss: 4.631019678115845



110it [00:02, 44.87it/s][A
115it [00:02, 45.20it/s][A
120it [00:02, 45.31it/s][A
125it [00:02, 45.48it/s][A
130it [00:02, 45.36it/s][A
135it [00:02, 44.69it/s][A
140it [00:03, 44.91it/s][A
145it [00:03, 45.23it/s][A
150it [00:03, 45.54it/s][A
155it [00:03, 45.17it/s][A
160it [00:03, 45.21it/s][A
165it [00:03, 45.22it/s][A
170it [00:03, 45.36it/s][A
175it [00:03, 45.58it/s][A
180it [00:03, 45.52it/s][A
185it [00:04, 45.74it/s][A
190it [00:04, 45.83it/s][A
195it [00:04, 45.99it/s][A
200it [00:04, 44.99it/s][A
205it [00:04, 44.64it/s][A

Epoch: 157, Step: 200, Loss: 4.639083786010742



210it [00:04, 43.11it/s][A
215it [00:04, 43.57it/s][A
220it [00:04, 43.87it/s][A
227it [00:05, 44.99it/s]
 31%|███▏      | 157/500 [18:22<39:45,  6.96s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.50it/s][A
10it [00:00, 45.62it/s][A
15it [00:00, 44.39it/s][A
20it [00:00, 43.71it/s][A
25it [00:00, 43.39it/s][A
30it [00:00, 43.24it/s][A
35it [00:00, 43.74it/s][A
40it [00:00, 43.14it/s][A
45it [00:01, 43.44it/s][A
50it [00:01, 44.07it/s][A
55it [00:01, 44.64it/s][A
60it [00:01, 45.03it/s][A
65it [00:01, 45.26it/s][A
70it [00:01, 45.43it/s][A
75it [00:01, 45.62it/s][A
80it [00:01, 45.69it/s][A
85it [00:01, 45.61it/s][A
90it [00:02, 45.84it/s][A
95it [00:02, 45.89it/s][A
100it [00:02, 46.03it/s][A
105it [00:02, 45.87it/s][A

Epoch: 158, Step: 100, Loss: 4.6224581813812256



110it [00:02, 45.73it/s][A
115it [00:02, 45.81it/s][A
120it [00:02, 45.75it/s][A
125it [00:02, 45.10it/s][A
130it [00:02, 45.19it/s][A
135it [00:02, 45.52it/s][A
140it [00:03, 45.35it/s][A
145it [00:03, 45.54it/s][A
150it [00:03, 45.49it/s][A
155it [00:03, 44.99it/s][A
160it [00:03, 45.17it/s][A
165it [00:03, 45.02it/s][A
170it [00:03, 44.94it/s][A
175it [00:03, 45.15it/s][A
180it [00:03, 45.35it/s][A
185it [00:04, 45.50it/s][A
190it [00:04, 45.57it/s][A
195it [00:04, 45.33it/s][A
200it [00:04, 45.36it/s][A
205it [00:04, 45.41it/s][A

Epoch: 158, Step: 200, Loss: 4.638714027404785



210it [00:04, 45.30it/s][A
215it [00:04, 45.25it/s][A
220it [00:04, 45.46it/s][A
227it [00:05, 45.12it/s]
 32%|███▏      | 158/500 [18:27<36:21,  6.38s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.28it/s][A
10it [00:00, 44.74it/s][A
15it [00:00, 44.60it/s][A
20it [00:00, 44.43it/s][A
25it [00:00, 44.63it/s][A
30it [00:00, 44.65it/s][A
35it [00:00, 44.81it/s][A
40it [00:00, 45.22it/s][A
45it [00:01, 45.19it/s][A
50it [00:01, 45.39it/s][A
55it [00:01, 45.61it/s][A
60it [00:01, 45.63it/s][A
65it [00:01, 45.13it/s][A
70it [00:01, 45.34it/s][A
75it [00:01, 44.37it/s][A
80it [00:01, 44.82it/s][A
85it [00:01, 45.07it/s][A
90it [00:01, 45.29it/s][A
95it [00:02, 45.33it/s][A
100it [00:02, 45.35it/s][A
105it [00:02, 45.33it/s][A

Epoch: 159, Step: 100, Loss: 4.624372296333313



110it [00:02, 45.04it/s][A
115it [00:02, 45.09it/s][A
120it [00:02, 44.87it/s][A
125it [00:02, 45.07it/s][A
130it [00:02, 45.09it/s][A
135it [00:02, 45.24it/s][A
140it [00:03, 45.38it/s][A
145it [00:03, 45.45it/s][A
150it [00:03, 45.49it/s][A
155it [00:03, 45.39it/s][A
160it [00:03, 45.52it/s][A
165it [00:03, 45.65it/s][A
170it [00:03, 45.62it/s][A
175it [00:03, 45.45it/s][A
180it [00:03, 45.48it/s][A
185it [00:04, 45.56it/s][A
190it [00:04, 45.34it/s][A
195it [00:04, 45.38it/s][A
200it [00:04, 45.48it/s][A
205it [00:04, 45.54it/s][A

Epoch: 159, Step: 200, Loss: 4.637479677200317



210it [00:04, 45.49it/s][A
215it [00:04, 45.48it/s][A
220it [00:04, 45.57it/s][A
227it [00:05, 45.25it/s]
 32%|███▏      | 159/500 [18:32<33:56,  5.97s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.32it/s][A
10it [00:00, 46.25it/s][A
15it [00:00, 45.89it/s][A
20it [00:00, 45.37it/s][A
25it [00:00, 45.52it/s][A
30it [00:00, 45.53it/s][A
35it [00:00, 44.58it/s][A
40it [00:00, 44.21it/s][A
45it [00:01, 44.45it/s][A
50it [00:01, 44.81it/s][A
55it [00:01, 44.85it/s][A
60it [00:01, 44.81it/s][A
65it [00:01, 45.01it/s][A
70it [00:01, 44.83it/s][A
75it [00:01, 44.77it/s][A
80it [00:01, 45.01it/s][A
85it [00:01, 44.54it/s][A
90it [00:02, 44.51it/s][A
95it [00:02, 44.15it/s][A
100it [00:02, 44.61it/s][A
105it [00:02, 44.77it/s][A

Epoch: 160, Step: 100, Loss: 4.610256662368775



110it [00:02, 44.93it/s][A
115it [00:02, 45.16it/s][A
120it [00:02, 45.37it/s][A
125it [00:02, 45.50it/s][A
130it [00:02, 45.55it/s][A
135it [00:03, 44.84it/s][A
140it [00:03, 44.90it/s][A
145it [00:03, 45.11it/s][A
150it [00:03, 45.43it/s][A
155it [00:03, 45.45it/s][A
160it [00:03, 45.35it/s][A
165it [00:03, 45.33it/s][A
170it [00:03, 45.52it/s][A
175it [00:03, 45.22it/s][A
180it [00:03, 44.78it/s][A
185it [00:04, 45.36it/s][A
190it [00:04, 45.22it/s][A
195it [00:04, 45.38it/s][A
200it [00:04, 44.55it/s][A
205it [00:04, 45.21it/s][A

Epoch: 160, Step: 200, Loss: 4.631965613365173



210it [00:04, 45.49it/s][A
215it [00:04, 45.76it/s][A
220it [00:04, 46.17it/s][A
227it [00:05, 45.13it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.71it/s][A
13it [00:00, 60.54it/s][A
20it [00:00, 60.80it/s][A
27it [00:00, 60.79it/s][A
34it [00:00, 60.69it/s][A
41it [00:00, 60.78it/s][A
48it [00:00, 60.62it/s][A
55it [00:00, 60.68it/s][A
62it [00:01, 60.80it/s][A
69it [00:01, 60.81it/s][A
76it [00:01, 60.71it/s][A
83it [00:01, 60.74it/s][A
90it [00:01, 60.86it/s][A
97it [00:01, 60.96it/s][A
104it [00:01, 60.92it/s][A
111it [00:01, 61.04it/s][A
118it [00:01, 60.95it/s][A
125it [00:02, 60.67it/s][A
132it [00:02, 60.96it/s][A
139it [00:02, 61.51it/s][A
146it [00:02, 61.56it/s][A
153it [00:02, 61.28it/s][A
160it [00:02, 61.03it/s][A
167it [00:02, 60.89it/s][A
174it [00:02, 60.77it/s][A
181it [00:02, 60.62it/s][A
188it [00:03, 60.74it/s][A
195it [00:03, 60.77it/s][A
202it [00:03, 60.46it/s][A
209it [00:03, 60.45it/s][A
216it [00:03, 60.51it/s][A
223it [00:03, 


Epoch: 160, Test Loss: 5.437523746342393, Test Perplexity: 230.6642580210052




0it [00:00, ?it/s][A
4it [00:00, 38.49it/s][A
9it [00:00, 42.49it/s][A
14it [00:00, 43.77it/s][A
19it [00:00, 43.68it/s][A
24it [00:00, 43.77it/s][A
29it [00:00, 44.19it/s][A
34it [00:00, 44.41it/s][A
39it [00:00, 44.63it/s][A
44it [00:00, 44.83it/s][A
49it [00:01, 44.53it/s][A
54it [00:01, 44.80it/s][A
59it [00:01, 45.13it/s][A
64it [00:01, 45.50it/s][A
69it [00:01, 45.37it/s][A
74it [00:01, 45.25it/s][A
79it [00:01, 45.06it/s][A
84it [00:01, 45.18it/s][A
89it [00:01, 45.27it/s][A
94it [00:02, 44.94it/s][A
99it [00:02, 45.18it/s][A
104it [00:02, 45.12it/s][A
109it [00:02, 45.25it/s][A

Epoch: 161, Step: 100, Loss: 4.626642012596131



114it [00:02, 45.18it/s][A
119it [00:02, 44.90it/s][A
124it [00:02, 45.13it/s][A
129it [00:02, 44.76it/s][A
134it [00:02, 45.02it/s][A
139it [00:03, 45.14it/s][A
144it [00:03, 45.39it/s][A
149it [00:03, 45.33it/s][A
154it [00:03, 45.45it/s][A
159it [00:03, 44.41it/s][A
164it [00:03, 44.62it/s][A
169it [00:03, 43.72it/s][A
174it [00:03, 44.24it/s][A
179it [00:03, 44.76it/s][A
184it [00:04, 45.14it/s][A
189it [00:04, 45.45it/s][A
194it [00:04, 45.63it/s][A
199it [00:04, 45.64it/s][A
204it [00:04, 45.52it/s][A
209it [00:04, 45.28it/s][A

Epoch: 161, Step: 200, Loss: 4.63585601568222



214it [00:04, 45.27it/s][A
219it [00:04, 45.35it/s][A
227it [00:05, 44.84it/s]
 32%|███▏      | 161/500 [18:53<43:49,  7.76s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.10it/s][A
10it [00:00, 42.90it/s][A
15it [00:00, 43.86it/s][A
20it [00:00, 44.60it/s][A
25it [00:00, 44.93it/s][A
30it [00:00, 44.85it/s][A
35it [00:00, 45.06it/s][A
40it [00:00, 45.30it/s][A
45it [00:01, 45.41it/s][A
50it [00:01, 45.50it/s][A
55it [00:01, 44.60it/s][A
60it [00:01, 45.16it/s][A
65it [00:01, 45.47it/s][A
70it [00:01, 44.64it/s][A
75it [00:01, 44.85it/s][A
80it [00:01, 45.21it/s][A
85it [00:01, 45.21it/s][A
90it [00:02, 45.28it/s][A
95it [00:02, 45.27it/s][A
100it [00:02, 45.34it/s][A
105it [00:02, 45.58it/s][A

Epoch: 162, Step: 100, Loss: 4.614226889610291



110it [00:02, 44.60it/s][A
115it [00:02, 44.90it/s][A
120it [00:02, 44.62it/s][A
125it [00:02, 44.38it/s][A
130it [00:02, 44.45it/s][A
135it [00:03, 44.47it/s][A
140it [00:03, 44.87it/s][A
145it [00:03, 44.97it/s][A
150it [00:03, 45.19it/s][A
155it [00:03, 45.35it/s][A
160it [00:03, 45.19it/s][A
165it [00:03, 44.91it/s][A
170it [00:03, 44.52it/s][A
175it [00:03, 44.51it/s][A
180it [00:04, 44.49it/s][A
185it [00:04, 44.64it/s][A
190it [00:04, 44.27it/s][A
195it [00:04, 44.86it/s][A
200it [00:04, 45.01it/s][A
205it [00:04, 45.25it/s][A

Epoch: 162, Step: 200, Loss: 4.630957136154175



210it [00:04, 44.86it/s][A
215it [00:04, 45.14it/s][A
220it [00:04, 45.39it/s][A
227it [00:05, 44.92it/s]
 32%|███▏      | 162/500 [18:58<39:08,  6.95s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.80it/s][A
10it [00:00, 45.71it/s][A
15it [00:00, 45.73it/s][A
20it [00:00, 44.67it/s][A
25it [00:00, 44.80it/s][A
30it [00:00, 45.09it/s][A
35it [00:00, 45.35it/s][A
40it [00:00, 45.43it/s][A
45it [00:00, 45.52it/s][A
50it [00:01, 45.68it/s][A
55it [00:01, 44.75it/s][A
60it [00:01, 45.09it/s][A
65it [00:01, 44.97it/s][A
70it [00:01, 43.82it/s][A
75it [00:01, 43.88it/s][A
80it [00:01, 44.46it/s][A
85it [00:01, 44.79it/s][A
90it [00:02, 45.12it/s][A
95it [00:02, 45.30it/s][A
100it [00:02, 44.56it/s][A
105it [00:02, 45.11it/s][A

Epoch: 163, Step: 100, Loss: 4.629640884399414



110it [00:02, 45.21it/s][A
115it [00:02, 45.18it/s][A
120it [00:02, 45.28it/s][A
125it [00:02, 44.48it/s][A
130it [00:02, 44.84it/s][A
135it [00:03, 45.05it/s][A
140it [00:03, 45.15it/s][A
145it [00:03, 45.27it/s][A
150it [00:03, 45.31it/s][A
155it [00:03, 45.32it/s][A
160it [00:03, 45.33it/s][A
165it [00:03, 45.24it/s][A
170it [00:03, 43.90it/s][A
175it [00:03, 44.46it/s][A
180it [00:04, 43.81it/s][A
185it [00:04, 44.42it/s][A
190it [00:04, 44.48it/s][A
195it [00:04, 44.53it/s][A
200it [00:04, 44.59it/s][A
205it [00:04, 44.84it/s][A

Epoch: 163, Step: 200, Loss: 4.631272783279419



210it [00:04, 43.23it/s][A
215it [00:04, 43.96it/s][A
220it [00:04, 44.34it/s][A
227it [00:05, 44.79it/s]
 33%|███▎      | 163/500 [19:03<35:51,  6.38s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.62it/s][A
10it [00:00, 43.49it/s][A
15it [00:00, 44.05it/s][A
20it [00:00, 43.07it/s][A
25it [00:00, 43.86it/s][A
30it [00:00, 44.29it/s][A
35it [00:00, 44.26it/s][A
40it [00:00, 44.54it/s][A
45it [00:01, 44.67it/s][A
50it [00:01, 44.41it/s][A
55it [00:01, 43.91it/s][A
60it [00:01, 44.35it/s][A
65it [00:01, 44.25it/s][A
70it [00:01, 43.59it/s][A
75it [00:01, 44.41it/s][A
80it [00:01, 44.96it/s][A
85it [00:01, 45.20it/s][A
90it [00:02, 45.17it/s][A
95it [00:02, 45.49it/s][A
100it [00:02, 45.79it/s][A
105it [00:02, 45.79it/s][A

Epoch: 164, Step: 100, Loss: 4.618046822547913



110it [00:02, 45.63it/s][A
115it [00:02, 45.61it/s][A
120it [00:02, 45.89it/s][A
125it [00:02, 45.91it/s][A
130it [00:02, 45.71it/s][A
135it [00:03, 45.79it/s][A
140it [00:03, 45.70it/s][A
145it [00:03, 44.75it/s][A
150it [00:03, 45.03it/s][A
155it [00:03, 45.10it/s][A
160it [00:03, 45.33it/s][A
165it [00:03, 45.41it/s][A
170it [00:03, 45.41it/s][A
175it [00:03, 45.63it/s][A
180it [00:03, 45.87it/s][A
185it [00:04, 45.20it/s][A
190it [00:04, 45.47it/s][A
195it [00:04, 45.39it/s][A
200it [00:04, 44.27it/s][A
205it [00:04, 44.88it/s][A

Epoch: 164, Step: 200, Loss: 4.6314760994911195



210it [00:04, 45.09it/s][A
215it [00:04, 45.58it/s][A
220it [00:04, 45.72it/s][A
227it [00:05, 45.04it/s]
 33%|███▎      | 164/500 [19:08<33:29,  5.98s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.28it/s][A
10it [00:00, 44.81it/s][A
15it [00:00, 43.27it/s][A
20it [00:00, 44.07it/s][A
25it [00:00, 44.79it/s][A
30it [00:00, 44.22it/s][A
35it [00:00, 45.13it/s][A
40it [00:00, 45.61it/s][A
45it [00:00, 45.86it/s][A
50it [00:01, 46.15it/s][A
55it [00:01, 46.29it/s][A
60it [00:01, 46.46it/s][A
65it [00:01, 46.24it/s][A
70it [00:01, 46.20it/s][A
75it [00:01, 45.88it/s][A
80it [00:01, 45.89it/s][A
85it [00:01, 45.98it/s][A
90it [00:01, 46.25it/s][A
95it [00:02, 46.41it/s][A
100it [00:02, 46.34it/s][A
105it [00:02, 46.18it/s][A

Epoch: 165, Step: 100, Loss: 4.617070527076721



110it [00:02, 46.19it/s][A
115it [00:02, 46.21it/s][A
120it [00:02, 45.32it/s][A
125it [00:02, 45.85it/s][A
130it [00:02, 46.05it/s][A
135it [00:02, 46.29it/s][A
140it [00:03, 45.08it/s][A
145it [00:03, 45.30it/s][A
150it [00:03, 45.54it/s][A
155it [00:03, 46.11it/s][A
160it [00:03, 46.83it/s][A
165it [00:03, 45.83it/s][A
170it [00:03, 45.91it/s][A
175it [00:03, 45.75it/s][A
180it [00:03, 44.51it/s][A
185it [00:04, 44.87it/s][A
190it [00:04, 45.17it/s][A
195it [00:04, 45.36it/s][A
200it [00:04, 45.61it/s][A
205it [00:04, 45.37it/s][A

Epoch: 165, Step: 200, Loss: 4.629683434963226



210it [00:04, 45.36it/s][A
215it [00:04, 45.51it/s][A
220it [00:04, 45.63it/s][A
227it [00:04, 45.60it/s]

0it [00:00, ?it/s][A
6it [00:00, 54.53it/s][A
13it [00:00, 58.23it/s][A
20it [00:00, 59.63it/s][A
26it [00:00, 59.54it/s][A
32it [00:00, 59.42it/s][A
39it [00:00, 59.74it/s][A
45it [00:00, 59.56it/s][A
51it [00:00, 59.55it/s][A
57it [00:00, 59.06it/s][A
63it [00:01, 59.01it/s][A
69it [00:01, 59.24it/s][A
76it [00:01, 59.70it/s][A
82it [00:01, 58.04it/s][A
88it [00:01, 57.54it/s][A
94it [00:01, 57.98it/s][A
100it [00:01, 58.27it/s][A
107it [00:01, 58.95it/s][A
113it [00:01, 58.76it/s][A
119it [00:02, 58.58it/s][A
125it [00:02, 58.80it/s][A
132it [00:02, 59.23it/s][A
138it [00:02, 59.03it/s][A
144it [00:02, 58.99it/s][A
150it [00:02, 59.00it/s][A
157it [00:02, 59.48it/s][A
163it [00:02, 59.55it/s][A
170it [00:02, 59.88it/s][A
177it [00:02, 59.99it/s][A
183it [00:03, 59.29it/s][A
190it [00:03, 59.92it/s][A
197it [00:03, 60.20it/s][A
204it [00:03, 6


Epoch: 165, Test Loss: 5.439151080498784, Test Perplexity: 231.00125567513223




0it [00:00, ?it/s][A
5it [00:00, 43.44it/s][A
10it [00:00, 44.97it/s][A
15it [00:00, 45.47it/s][A
20it [00:00, 45.35it/s][A
25it [00:00, 44.09it/s][A
30it [00:00, 44.60it/s][A
35it [00:00, 44.77it/s][A
40it [00:00, 44.94it/s][A
45it [00:01, 45.08it/s][A
50it [00:01, 45.33it/s][A
55it [00:01, 45.56it/s][A
60it [00:01, 45.49it/s][A
65it [00:01, 45.41it/s][A
70it [00:01, 45.12it/s][A
75it [00:01, 44.94it/s][A
80it [00:01, 45.04it/s][A
85it [00:01, 45.05it/s][A
90it [00:01, 44.97it/s][A
95it [00:02, 45.11it/s][A
100it [00:02, 45.22it/s][A
105it [00:02, 45.27it/s][A

Epoch: 166, Step: 100, Loss: 4.616263022422791



110it [00:02, 44.96it/s][A
115it [00:02, 44.76it/s][A
120it [00:02, 44.49it/s][A
125it [00:02, 44.64it/s][A
130it [00:02, 43.82it/s][A
135it [00:03, 44.32it/s][A
140it [00:03, 44.63it/s][A
145it [00:03, 45.03it/s][A
150it [00:03, 45.12it/s][A
155it [00:03, 45.21it/s][A
160it [00:03, 44.86it/s][A
165it [00:03, 44.58it/s][A
170it [00:03, 44.54it/s][A
175it [00:03, 44.70it/s][A
180it [00:04, 44.92it/s][A
185it [00:04, 45.27it/s][A
190it [00:04, 45.50it/s][A
195it [00:04, 45.58it/s][A
200it [00:04, 45.62it/s][A
205it [00:04, 45.51it/s][A

Epoch: 166, Step: 200, Loss: 4.631761004924774



210it [00:04, 45.36it/s][A
215it [00:04, 45.23it/s][A
220it [00:04, 45.49it/s][A
227it [00:05, 45.01it/s]
 33%|███▎      | 166/500 [19:29<43:12,  7.76s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.94it/s][A
10it [00:00, 45.50it/s][A
15it [00:00, 45.31it/s][A
20it [00:00, 44.78it/s][A
25it [00:00, 44.78it/s][A
30it [00:00, 44.87it/s][A
35it [00:00, 44.06it/s][A
40it [00:00, 44.58it/s][A
45it [00:01, 44.47it/s][A
50it [00:01, 43.95it/s][A
55it [00:01, 44.08it/s][A
60it [00:01, 44.38it/s][A
65it [00:01, 44.14it/s][A
70it [00:01, 44.30it/s][A
75it [00:01, 44.19it/s][A
80it [00:01, 44.66it/s][A
85it [00:01, 42.92it/s][A
90it [00:02, 43.25it/s][A
95it [00:02, 43.47it/s][A
100it [00:02, 43.95it/s][A
105it [00:02, 44.14it/s][A

Epoch: 167, Step: 100, Loss: 4.611418385505676



110it [00:02, 44.14it/s][A
115it [00:02, 44.40it/s][A
120it [00:02, 44.45it/s][A
125it [00:02, 44.84it/s][A
130it [00:02, 45.07it/s][A
135it [00:03, 43.90it/s][A
140it [00:03, 44.56it/s][A
145it [00:03, 44.60it/s][A
150it [00:03, 44.71it/s][A
155it [00:03, 44.94it/s][A
160it [00:03, 44.86it/s][A
165it [00:03, 44.92it/s][A
170it [00:03, 45.05it/s][A
175it [00:03, 45.07it/s][A
180it [00:04, 45.42it/s][A
185it [00:04, 45.26it/s][A
190it [00:04, 45.35it/s][A
195it [00:04, 45.39it/s][A
200it [00:04, 45.60it/s][A
205it [00:04, 45.61it/s][A

Epoch: 167, Step: 200, Loss: 4.630962209701538



210it [00:04, 45.33it/s][A
215it [00:04, 44.31it/s][A
220it [00:04, 44.27it/s][A
227it [00:05, 44.52it/s]
 33%|███▎      | 167/500 [19:34<38:38,  6.96s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.90it/s][A
10it [00:00, 45.13it/s][A
15it [00:00, 45.55it/s][A
20it [00:00, 45.20it/s][A
25it [00:00, 45.31it/s][A
30it [00:00, 45.19it/s][A
35it [00:00, 45.28it/s][A
40it [00:00, 45.27it/s][A
45it [00:00, 45.05it/s][A
50it [00:01, 44.16it/s][A
55it [00:01, 44.37it/s][A
60it [00:01, 44.54it/s][A
65it [00:01, 44.71it/s][A
70it [00:01, 44.96it/s][A
75it [00:01, 45.07it/s][A
80it [00:01, 45.28it/s][A
85it [00:01, 45.21it/s][A
90it [00:01, 45.33it/s][A
95it [00:02, 45.28it/s][A
100it [00:02, 45.22it/s][A
105it [00:02, 45.38it/s][A

Epoch: 168, Step: 100, Loss: 4.609603114128113



110it [00:02, 45.32it/s][A
115it [00:02, 45.50it/s][A
120it [00:02, 45.32it/s][A
125it [00:02, 45.39it/s][A
130it [00:02, 45.39it/s][A
135it [00:02, 45.64it/s][A
140it [00:03, 45.71it/s][A
145it [00:03, 45.18it/s][A
150it [00:03, 45.05it/s][A
155it [00:03, 45.15it/s][A
160it [00:03, 45.14it/s][A
165it [00:03, 44.26it/s][A
170it [00:03, 44.46it/s][A
175it [00:03, 44.79it/s][A
180it [00:03, 44.87it/s][A
185it [00:04, 45.03it/s][A
190it [00:04, 45.27it/s][A
195it [00:04, 45.23it/s][A
200it [00:04, 45.26it/s][A
205it [00:04, 44.97it/s][A

Epoch: 168, Step: 200, Loss: 4.626046311855316



210it [00:04, 44.82it/s][A
215it [00:04, 44.70it/s][A
220it [00:04, 45.15it/s][A
227it [00:05, 45.12it/s]
 34%|███▎      | 168/500 [19:39<35:19,  6.39s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.60it/s][A
10it [00:00, 46.23it/s][A
15it [00:00, 45.92it/s][A
20it [00:00, 44.42it/s][A
25it [00:00, 44.54it/s][A
30it [00:00, 43.86it/s][A
35it [00:00, 44.32it/s][A
40it [00:00, 44.54it/s][A
45it [00:01, 44.94it/s][A
50it [00:01, 45.40it/s][A
55it [00:01, 45.60it/s][A
60it [00:01, 45.81it/s][A
65it [00:01, 45.73it/s][A
70it [00:01, 45.72it/s][A
75it [00:01, 45.83it/s][A
80it [00:01, 45.69it/s][A
85it [00:01, 45.96it/s][A
90it [00:01, 46.30it/s][A
95it [00:02, 46.48it/s][A
100it [00:02, 46.34it/s][A
105it [00:02, 46.27it/s][A

Epoch: 169, Step: 100, Loss: 4.6101823949813845



110it [00:02, 45.35it/s][A
115it [00:02, 45.38it/s][A
120it [00:02, 45.58it/s][A
125it [00:02, 45.57it/s][A
130it [00:02, 45.95it/s][A
135it [00:02, 46.12it/s][A
140it [00:03, 46.21it/s][A
145it [00:03, 46.34it/s][A
150it [00:03, 46.39it/s][A
155it [00:03, 45.85it/s][A
160it [00:03, 45.93it/s][A
165it [00:03, 45.74it/s][A
170it [00:03, 45.02it/s][A
175it [00:03, 45.24it/s][A
180it [00:03, 45.46it/s][A
185it [00:04, 45.98it/s][A
190it [00:04, 46.38it/s][A
195it [00:04, 46.43it/s][A
200it [00:04, 46.28it/s][A
205it [00:04, 46.51it/s][A

Epoch: 169, Step: 200, Loss: 4.625425391197204



210it [00:04, 46.81it/s][A
215it [00:04, 47.02it/s][A
220it [00:04, 47.01it/s][A
227it [00:04, 45.82it/s]
 34%|███▍      | 169/500 [19:44<32:51,  5.96s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.06it/s][A
10it [00:00, 43.96it/s][A
15it [00:00, 44.84it/s][A
20it [00:00, 45.34it/s][A
25it [00:00, 45.22it/s][A
30it [00:00, 45.40it/s][A
35it [00:00, 45.53it/s][A
40it [00:00, 45.58it/s][A
45it [00:00, 45.75it/s][A
50it [00:01, 45.80it/s][A
55it [00:01, 45.93it/s][A
60it [00:01, 46.03it/s][A
65it [00:01, 45.95it/s][A
70it [00:01, 46.01it/s][A
75it [00:01, 45.92it/s][A
80it [00:01, 46.11it/s][A
85it [00:01, 45.30it/s][A
90it [00:01, 45.28it/s][A
95it [00:02, 45.00it/s][A
100it [00:02, 44.49it/s][A
105it [00:02, 44.65it/s][A

Epoch: 170, Step: 100, Loss: 4.599601922035217



110it [00:02, 45.00it/s][A
115it [00:02, 45.14it/s][A
120it [00:02, 45.16it/s][A
125it [00:02, 45.37it/s][A
130it [00:02, 45.53it/s][A
135it [00:02, 45.45it/s][A
140it [00:03, 45.39it/s][A
145it [00:03, 45.36it/s][A
150it [00:03, 45.25it/s][A
155it [00:03, 45.06it/s][A
160it [00:03, 45.36it/s][A
165it [00:03, 45.27it/s][A
170it [00:03, 45.30it/s][A
175it [00:03, 45.10it/s][A
180it [00:03, 45.10it/s][A
185it [00:04, 44.98it/s][A
190it [00:04, 45.27it/s][A
195it [00:04, 45.14it/s][A
200it [00:04, 45.03it/s][A
205it [00:04, 45.08it/s][A

Epoch: 170, Step: 200, Loss: 4.62339195728302



210it [00:04, 44.49it/s][A
215it [00:04, 44.65it/s][A
220it [00:04, 44.77it/s][A
227it [00:05, 45.20it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.19it/s][A
12it [00:00, 55.55it/s][A
18it [00:00, 55.45it/s][A
24it [00:00, 56.55it/s][A
31it [00:00, 58.13it/s][A
37it [00:00, 58.72it/s][A
44it [00:00, 59.46it/s][A
51it [00:00, 59.79it/s][A
57it [00:00, 59.37it/s][A
64it [00:01, 59.73it/s][A
70it [00:01, 59.05it/s][A
77it [00:01, 59.76it/s][A
84it [00:01, 60.01it/s][A
91it [00:01, 60.14it/s][A
98it [00:01, 60.14it/s][A
105it [00:01, 58.39it/s][A
111it [00:01, 58.37it/s][A
117it [00:01, 58.79it/s][A
124it [00:02, 59.30it/s][A
130it [00:02, 59.44it/s][A
137it [00:02, 59.95it/s][A
143it [00:02, 59.96it/s][A
149it [00:02, 59.83it/s][A
155it [00:02, 59.54it/s][A
162it [00:02, 59.85it/s][A
168it [00:02, 59.80it/s][A
174it [00:02, 59.73it/s][A
180it [00:03, 58.19it/s][A
186it [00:03, 58.68it/s][A
192it [00:03, 58.97it/s][A
199it [00:03, 59.59it/s][A
205it [00:03, 5


Epoch: 170, Test Loss: 5.442608020320442, Test Perplexity: 231.86073679953628




0it [00:00, ?it/s][A
5it [00:00, 45.22it/s][A
10it [00:00, 44.67it/s][A
15it [00:00, 44.67it/s][A
20it [00:00, 44.53it/s][A
25it [00:00, 44.77it/s][A
30it [00:00, 44.99it/s][A
35it [00:00, 45.37it/s][A
40it [00:00, 45.45it/s][A
45it [00:00, 45.53it/s][A
50it [00:01, 45.85it/s][A
55it [00:01, 45.93it/s][A
60it [00:01, 45.77it/s][A
65it [00:01, 45.79it/s][A
70it [00:01, 45.73it/s][A
75it [00:01, 46.01it/s][A
80it [00:01, 46.05it/s][A
85it [00:01, 45.24it/s][A
90it [00:01, 44.19it/s][A
95it [00:02, 44.93it/s][A
100it [00:02, 45.24it/s][A
105it [00:02, 45.10it/s][A

Epoch: 171, Step: 100, Loss: 4.602416777610779



110it [00:02, 44.92it/s][A
115it [00:02, 45.16it/s][A
120it [00:02, 45.28it/s][A
125it [00:02, 45.38it/s][A
130it [00:02, 45.31it/s][A
135it [00:02, 45.22it/s][A
140it [00:03, 45.33it/s][A
145it [00:03, 44.42it/s][A
150it [00:03, 44.74it/s][A
155it [00:03, 44.94it/s][A
160it [00:03, 45.06it/s][A
165it [00:03, 45.24it/s][A
170it [00:03, 45.34it/s][A
175it [00:03, 45.26it/s][A
180it [00:03, 45.28it/s][A
185it [00:04, 45.13it/s][A
190it [00:04, 45.01it/s][A
195it [00:04, 45.16it/s][A
200it [00:04, 45.13it/s][A
205it [00:04, 44.55it/s][A

Epoch: 171, Step: 200, Loss: 4.625293757915497



210it [00:04, 43.03it/s][A
215it [00:04, 43.60it/s][A
220it [00:04, 43.86it/s][A
227it [00:05, 45.00it/s]
 34%|███▍      | 171/500 [20:05<42:33,  7.76s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.65it/s][A
10it [00:00, 45.08it/s][A
15it [00:00, 45.04it/s][A
20it [00:00, 44.39it/s][A
25it [00:00, 43.88it/s][A
30it [00:00, 43.83it/s][A
35it [00:00, 43.69it/s][A
40it [00:00, 44.25it/s][A
45it [00:01, 44.59it/s][A
50it [00:01, 44.92it/s][A
55it [00:01, 45.12it/s][A
60it [00:01, 44.92it/s][A
65it [00:01, 44.93it/s][A
70it [00:01, 44.31it/s][A
75it [00:01, 44.59it/s][A
80it [00:01, 45.02it/s][A
85it [00:01, 45.29it/s][A
90it [00:02, 45.50it/s][A
95it [00:02, 45.65it/s][A
100it [00:02, 44.84it/s][A
105it [00:02, 44.97it/s][A

Epoch: 172, Step: 100, Loss: 4.613464341163636



110it [00:02, 45.19it/s][A
115it [00:02, 45.29it/s][A
120it [00:02, 45.62it/s][A
125it [00:02, 45.98it/s][A
130it [00:02, 46.05it/s][A
135it [00:02, 45.95it/s][A
140it [00:03, 45.93it/s][A
145it [00:03, 46.02it/s][A
150it [00:03, 46.08it/s][A
155it [00:03, 45.88it/s][A
160it [00:03, 45.79it/s][A
165it [00:03, 45.77it/s][A
170it [00:03, 45.55it/s][A
175it [00:03, 45.65it/s][A
180it [00:03, 45.63it/s][A
185it [00:04, 45.46it/s][A
190it [00:04, 45.66it/s][A
195it [00:04, 45.82it/s][A
200it [00:04, 45.76it/s][A
205it [00:04, 45.64it/s][A

Epoch: 172, Step: 200, Loss: 4.624428753852844



210it [00:04, 45.27it/s][A
215it [00:04, 45.38it/s][A
220it [00:04, 45.56it/s][A
227it [00:05, 45.28it/s]
 34%|███▍      | 172/500 [20:10<37:55,  6.94s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.30it/s][A
10it [00:00, 44.90it/s][A
15it [00:00, 45.23it/s][A
20it [00:00, 45.45it/s][A
25it [00:00, 45.31it/s][A
30it [00:00, 45.30it/s][A
35it [00:00, 45.40it/s][A
40it [00:00, 45.55it/s][A
45it [00:00, 45.61it/s][A
50it [00:01, 45.86it/s][A
55it [00:01, 46.15it/s][A
60it [00:01, 46.17it/s][A
65it [00:01, 45.84it/s][A
70it [00:01, 45.86it/s][A
75it [00:01, 45.74it/s][A
80it [00:01, 45.73it/s][A
85it [00:01, 46.22it/s][A
90it [00:01, 46.37it/s][A
95it [00:02, 46.54it/s][A
100it [00:02, 46.66it/s][A
105it [00:02, 46.89it/s][A

Epoch: 173, Step: 100, Loss: 4.613665690422058



110it [00:02, 46.35it/s][A
115it [00:02, 46.16it/s][A
120it [00:02, 46.29it/s][A
125it [00:02, 46.27it/s][A
130it [00:02, 46.29it/s][A
135it [00:02, 46.37it/s][A
140it [00:03, 46.46it/s][A
145it [00:03, 46.56it/s][A
150it [00:03, 46.66it/s][A
155it [00:03, 46.64it/s][A
160it [00:03, 46.72it/s][A
165it [00:03, 46.22it/s][A
170it [00:03, 45.27it/s][A
175it [00:03, 45.11it/s][A
180it [00:03, 45.37it/s][A
185it [00:04, 45.72it/s][A
190it [00:04, 45.79it/s][A
195it [00:04, 44.84it/s][A
200it [00:04, 45.23it/s][A
205it [00:04, 45.31it/s][A

Epoch: 173, Step: 200, Loss: 4.625329649448394



210it [00:04, 45.43it/s][A
215it [00:04, 45.70it/s][A
220it [00:04, 45.91it/s][A
227it [00:04, 45.88it/s]
 35%|███▍      | 173/500 [20:15<34:33,  6.34s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.43it/s][A
10it [00:00, 44.93it/s][A
15it [00:00, 45.72it/s][A
20it [00:00, 46.29it/s][A
25it [00:00, 45.98it/s][A
30it [00:00, 45.94it/s][A
35it [00:00, 46.76it/s][A
40it [00:00, 46.86it/s][A
45it [00:00, 45.74it/s][A
50it [00:01, 45.41it/s][A
55it [00:01, 45.39it/s][A
60it [00:01, 45.36it/s][A
65it [00:01, 45.51it/s][A
70it [00:01, 45.47it/s][A
75it [00:01, 45.29it/s][A
80it [00:01, 45.54it/s][A
85it [00:01, 45.21it/s][A
90it [00:01, 45.03it/s][A
95it [00:02, 44.86it/s][A
100it [00:02, 44.66it/s][A
105it [00:02, 44.68it/s][A

Epoch: 174, Step: 100, Loss: 4.610821642875671



110it [00:02, 44.02it/s][A
115it [00:02, 44.17it/s][A
120it [00:02, 44.63it/s][A
125it [00:02, 44.13it/s][A
130it [00:02, 43.92it/s][A
135it [00:02, 44.57it/s][A
140it [00:03, 45.12it/s][A
145it [00:03, 45.49it/s][A
150it [00:03, 45.61it/s][A
155it [00:03, 45.58it/s][A
160it [00:03, 45.57it/s][A
165it [00:03, 45.51it/s][A
170it [00:03, 45.54it/s][A
175it [00:03, 45.56it/s][A
180it [00:03, 45.46it/s][A
185it [00:04, 45.56it/s][A
190it [00:04, 45.76it/s][A
195it [00:04, 45.75it/s][A
200it [00:04, 44.92it/s][A
205it [00:04, 44.84it/s][A

Epoch: 174, Step: 200, Loss: 4.623535308837891



210it [00:04, 44.72it/s][A
215it [00:04, 45.00it/s][A
220it [00:04, 44.90it/s][A
227it [00:05, 45.21it/s]
 35%|███▍      | 174/500 [20:20<32:18,  5.95s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.52it/s][A
10it [00:00, 46.11it/s][A
15it [00:00, 45.89it/s][A
20it [00:00, 45.77it/s][A
25it [00:00, 45.80it/s][A
30it [00:00, 45.66it/s][A
35it [00:00, 45.72it/s][A
40it [00:00, 44.63it/s][A
45it [00:00, 45.02it/s][A
50it [00:01, 45.37it/s][A
55it [00:01, 45.47it/s][A
60it [00:01, 45.60it/s][A
65it [00:01, 45.67it/s][A
70it [00:01, 45.30it/s][A
75it [00:01, 45.52it/s][A
80it [00:01, 45.26it/s][A
85it [00:01, 45.32it/s][A
90it [00:01, 45.41it/s][A
95it [00:02, 45.62it/s][A
100it [00:02, 45.76it/s][A
105it [00:02, 45.65it/s][A

Epoch: 175, Step: 100, Loss: 4.611460371017456



110it [00:02, 45.66it/s][A
115it [00:02, 44.92it/s][A
120it [00:02, 45.02it/s][A
125it [00:02, 45.12it/s][A
130it [00:02, 45.22it/s][A
135it [00:02, 45.38it/s][A
140it [00:03, 45.43it/s][A
145it [00:03, 45.14it/s][A
150it [00:03, 45.15it/s][A
155it [00:03, 45.31it/s][A
160it [00:03, 45.27it/s][A
165it [00:03, 45.56it/s][A
170it [00:03, 45.68it/s][A
175it [00:03, 45.59it/s][A
180it [00:03, 45.54it/s][A
185it [00:04, 45.58it/s][A
190it [00:04, 45.69it/s][A
195it [00:04, 45.44it/s][A
200it [00:04, 45.58it/s][A
205it [00:04, 45.17it/s][A

Epoch: 175, Step: 200, Loss: 4.621395952701569



210it [00:04, 45.06it/s][A
215it [00:04, 45.12it/s][A
220it [00:04, 45.35it/s][A
227it [00:04, 45.41it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.03it/s][A
12it [00:00, 56.61it/s][A
19it [00:00, 58.71it/s][A
26it [00:00, 59.62it/s][A
33it [00:00, 60.14it/s][A
40it [00:00, 60.02it/s][A
46it [00:00, 59.55it/s][A
52it [00:00, 59.67it/s][A
58it [00:00, 58.47it/s][A
65it [00:01, 59.34it/s][A
71it [00:01, 57.95it/s][A
77it [00:01, 58.51it/s][A
83it [00:01, 58.89it/s][A
90it [00:01, 59.46it/s][A
97it [00:01, 59.82it/s][A
104it [00:01, 59.93it/s][A
111it [00:01, 60.16it/s][A
118it [00:01, 60.38it/s][A
125it [00:02, 60.37it/s][A
132it [00:02, 60.33it/s][A
139it [00:02, 59.99it/s][A
145it [00:02, 59.67it/s][A
151it [00:02, 59.75it/s][A
157it [00:02, 59.63it/s][A
164it [00:02, 59.88it/s][A
170it [00:02, 59.89it/s][A
177it [00:02, 59.99it/s][A
183it [00:03, 59.91it/s][A
189it [00:03, 59.88it/s][A
195it [00:03, 59.64it/s][A
201it [00:03, 58.44it/s][A
207it [00:03, 5


Epoch: 175, Test Loss: 5.449584524083582, Test Perplexity: 233.481464836168




0it [00:00, ?it/s][A
5it [00:00, 42.35it/s][A
10it [00:00, 44.70it/s][A
15it [00:00, 45.39it/s][A
20it [00:00, 45.29it/s][A
25it [00:00, 45.59it/s][A
30it [00:00, 45.68it/s][A
35it [00:00, 45.53it/s][A
40it [00:00, 44.43it/s][A
45it [00:00, 44.93it/s][A
50it [00:01, 45.27it/s][A
55it [00:01, 44.68it/s][A
60it [00:01, 44.82it/s][A
65it [00:01, 45.16it/s][A
70it [00:01, 45.47it/s][A
75it [00:01, 45.65it/s][A
80it [00:01, 45.53it/s][A
85it [00:01, 44.89it/s][A
90it [00:01, 45.27it/s][A
95it [00:02, 45.43it/s][A
100it [00:02, 45.71it/s][A
105it [00:02, 45.88it/s][A

Epoch: 176, Step: 100, Loss: 4.607045297622681



110it [00:02, 45.97it/s][A
115it [00:02, 45.76it/s][A
120it [00:02, 45.72it/s][A
125it [00:02, 45.23it/s][A
130it [00:02, 45.39it/s][A
135it [00:02, 45.19it/s][A
140it [00:03, 44.77it/s][A
145it [00:03, 44.89it/s][A
150it [00:03, 45.11it/s][A
155it [00:03, 43.46it/s][A
160it [00:03, 42.99it/s][A
165it [00:03, 43.80it/s][A
170it [00:03, 43.87it/s][A
175it [00:03, 44.35it/s][A
180it [00:04, 44.65it/s][A
185it [00:04, 44.37it/s][A
190it [00:04, 44.32it/s][A
195it [00:04, 44.39it/s][A
200it [00:04, 43.76it/s][A
205it [00:04, 44.24it/s][A

Epoch: 176, Step: 200, Loss: 4.621118919849396



210it [00:04, 44.23it/s][A
215it [00:04, 43.42it/s][A
220it [00:04, 44.15it/s][A
227it [00:05, 44.77it/s]
 35%|███▌      | 176/500 [20:41<41:50,  7.75s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.50it/s][A
10it [00:00, 45.48it/s][A
15it [00:00, 45.74it/s][A
20it [00:00, 45.72it/s][A
25it [00:00, 45.59it/s][A
30it [00:00, 45.47it/s][A
35it [00:00, 44.66it/s][A
40it [00:00, 43.90it/s][A
45it [00:01, 44.54it/s][A
50it [00:01, 44.56it/s][A
55it [00:01, 45.06it/s][A
60it [00:01, 45.24it/s][A
65it [00:01, 44.78it/s][A
70it [00:01, 44.09it/s][A
75it [00:01, 44.67it/s][A
80it [00:01, 44.66it/s][A
85it [00:01, 44.83it/s][A
90it [00:02, 45.00it/s][A
95it [00:02, 45.33it/s][A
100it [00:02, 45.62it/s][A
105it [00:02, 45.81it/s][A

Epoch: 177, Step: 100, Loss: 4.605040760040283



110it [00:02, 45.33it/s][A
115it [00:02, 45.92it/s][A
120it [00:02, 46.02it/s][A
125it [00:02, 45.84it/s][A
130it [00:02, 45.93it/s][A
135it [00:02, 46.14it/s][A
140it [00:03, 45.22it/s][A
145it [00:03, 45.60it/s][A
150it [00:03, 45.60it/s][A
155it [00:03, 44.83it/s][A
160it [00:03, 45.30it/s][A
165it [00:03, 44.74it/s][A
170it [00:03, 44.98it/s][A
175it [00:03, 44.77it/s][A
180it [00:03, 45.13it/s][A
185it [00:04, 44.30it/s][A
190it [00:04, 44.73it/s][A
195it [00:04, 45.04it/s][A
200it [00:04, 44.95it/s][A
205it [00:04, 44.99it/s][A

Epoch: 177, Step: 200, Loss: 4.618687756061554



210it [00:04, 44.85it/s][A
215it [00:04, 44.71it/s][A
220it [00:04, 44.96it/s][A
227it [00:05, 45.14it/s]
 35%|███▌      | 177/500 [20:46<37:19,  6.93s/it]
0it [00:00, ?it/s][A
5it [00:00, 47.31it/s][A
10it [00:00, 45.42it/s][A
15it [00:00, 46.18it/s][A
20it [00:00, 46.81it/s][A
25it [00:00, 46.87it/s][A
30it [00:00, 45.48it/s][A
35it [00:00, 45.73it/s][A
40it [00:00, 45.94it/s][A
45it [00:00, 46.18it/s][A
50it [00:01, 46.42it/s][A
55it [00:01, 46.60it/s][A
60it [00:01, 46.35it/s][A
65it [00:01, 46.20it/s][A
70it [00:01, 45.19it/s][A
75it [00:01, 45.63it/s][A
80it [00:01, 45.21it/s][A
85it [00:01, 45.49it/s][A
90it [00:01, 45.52it/s][A
95it [00:02, 45.68it/s][A
100it [00:02, 45.50it/s][A
105it [00:02, 45.71it/s][A

Epoch: 178, Step: 100, Loss: 4.611075811386108



110it [00:02, 46.42it/s][A
115it [00:02, 46.71it/s][A
120it [00:02, 46.61it/s][A
125it [00:02, 46.15it/s][A
130it [00:02, 45.76it/s][A
135it [00:02, 44.84it/s][A
140it [00:03, 44.92it/s][A
145it [00:03, 45.22it/s][A
150it [00:03, 45.60it/s][A
155it [00:03, 45.35it/s][A
160it [00:03, 45.43it/s][A
165it [00:03, 45.26it/s][A
170it [00:03, 45.19it/s][A
175it [00:03, 45.27it/s][A
180it [00:03, 45.07it/s][A
185it [00:04, 45.21it/s][A
190it [00:04, 45.16it/s][A
195it [00:04, 43.91it/s][A
200it [00:04, 43.78it/s][A
205it [00:04, 44.32it/s][A

Epoch: 178, Step: 200, Loss: 4.619670495986939



210it [00:04, 44.67it/s][A
215it [00:04, 44.64it/s][A
220it [00:04, 44.34it/s][A
227it [00:04, 45.41it/s]
 36%|███▌      | 178/500 [20:51<34:05,  6.35s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.81it/s][A
10it [00:00, 45.37it/s][A
15it [00:00, 44.79it/s][A
20it [00:00, 44.76it/s][A
25it [00:00, 44.66it/s][A
30it [00:00, 44.75it/s][A
35it [00:00, 44.70it/s][A
40it [00:00, 45.11it/s][A
45it [00:01, 45.25it/s][A
50it [00:01, 45.44it/s][A
55it [00:01, 45.44it/s][A
60it [00:01, 45.43it/s][A
65it [00:01, 45.34it/s][A
70it [00:01, 45.06it/s][A
75it [00:01, 45.00it/s][A
80it [00:01, 45.15it/s][A
85it [00:01, 45.10it/s][A
90it [00:01, 45.27it/s][A
95it [00:02, 45.47it/s][A
100it [00:02, 45.64it/s][A
105it [00:02, 45.70it/s][A

Epoch: 179, Step: 100, Loss: 4.607732086181641



110it [00:02, 45.60it/s][A
115it [00:02, 45.53it/s][A
120it [00:02, 44.77it/s][A
125it [00:02, 44.78it/s][A
130it [00:02, 45.05it/s][A
135it [00:02, 45.17it/s][A
140it [00:03, 44.85it/s][A
145it [00:03, 45.21it/s][A
150it [00:03, 45.54it/s][A
155it [00:03, 45.52it/s][A
160it [00:03, 45.63it/s][A
165it [00:03, 45.62it/s][A
170it [00:03, 45.80it/s][A
175it [00:03, 45.82it/s][A
180it [00:03, 45.79it/s][A
185it [00:04, 45.76it/s][A
190it [00:04, 45.91it/s][A
195it [00:04, 45.70it/s][A
200it [00:04, 45.27it/s][A
205it [00:04, 45.50it/s][A

Epoch: 179, Step: 200, Loss: 4.615628814697265



210it [00:04, 44.92it/s][A
215it [00:04, 44.85it/s][A
220it [00:04, 44.93it/s][A
227it [00:05, 45.22it/s]
 36%|███▌      | 179/500 [20:56<31:51,  5.95s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.68it/s][A
10it [00:00, 45.40it/s][A
15it [00:00, 45.68it/s][A
20it [00:00, 45.59it/s][A
25it [00:00, 43.79it/s][A
30it [00:00, 44.24it/s][A
35it [00:00, 44.63it/s][A
40it [00:00, 45.08it/s][A
45it [00:00, 45.52it/s][A
50it [00:01, 45.42it/s][A
55it [00:01, 45.69it/s][A
60it [00:01, 45.75it/s][A
65it [00:01, 45.43it/s][A
70it [00:01, 44.33it/s][A
75it [00:01, 44.57it/s][A
80it [00:01, 44.85it/s][A
85it [00:01, 44.72it/s][A
90it [00:02, 44.85it/s][A
95it [00:02, 45.02it/s][A
100it [00:02, 44.11it/s][A
105it [00:02, 44.58it/s][A

Epoch: 180, Step: 100, Loss: 4.600912156105042



110it [00:02, 44.71it/s][A
115it [00:02, 44.90it/s][A
120it [00:02, 45.02it/s][A
125it [00:02, 44.95it/s][A
130it [00:02, 45.24it/s][A
135it [00:03, 45.39it/s][A
140it [00:03, 45.22it/s][A
145it [00:03, 44.89it/s][A
150it [00:03, 45.20it/s][A
155it [00:03, 45.43it/s][A
160it [00:03, 45.56it/s][A
165it [00:03, 45.55it/s][A
170it [00:03, 45.75it/s][A
175it [00:03, 45.95it/s][A
180it [00:03, 46.10it/s][A
185it [00:04, 46.17it/s][A
190it [00:04, 46.10it/s][A
195it [00:04, 46.17it/s][A
200it [00:04, 46.22it/s][A
205it [00:04, 46.01it/s][A

Epoch: 180, Step: 200, Loss: 4.621275494098663



210it [00:04, 45.74it/s][A
215it [00:04, 45.64it/s][A
220it [00:04, 45.51it/s][A
227it [00:05, 45.23it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.83it/s][A
12it [00:00, 57.69it/s][A
19it [00:00, 59.47it/s][A
26it [00:00, 60.08it/s][A
33it [00:00, 60.15it/s][A
40it [00:00, 60.19it/s][A
47it [00:00, 60.10it/s][A
54it [00:00, 60.10it/s][A
61it [00:01, 60.23it/s][A
68it [00:01, 60.13it/s][A
75it [00:01, 60.39it/s][A
82it [00:01, 60.44it/s][A
89it [00:01, 60.45it/s][A
96it [00:01, 60.26it/s][A
103it [00:01, 59.74it/s][A
109it [00:01, 59.41it/s][A
115it [00:01, 58.58it/s][A
121it [00:02, 57.68it/s][A
127it [00:02, 57.68it/s][A
133it [00:02, 58.23it/s][A
139it [00:02, 57.48it/s][A
145it [00:02, 58.17it/s][A
152it [00:02, 58.99it/s][A
159it [00:02, 59.47it/s][A
166it [00:02, 59.91it/s][A
173it [00:02, 60.20it/s][A
180it [00:03, 60.11it/s][A
187it [00:03, 60.45it/s][A
194it [00:03, 60.03it/s][A
201it [00:03, 60.07it/s][A
208it [00:03, 60.03it/s][A
215it [00:03, 


Epoch: 180, Test Loss: 5.452890581225756, Test Perplexity: 234.26188981903266




0it [00:00, ?it/s][A
5it [00:00, 44.57it/s][A
10it [00:00, 45.87it/s][A
15it [00:00, 46.17it/s][A
20it [00:00, 44.83it/s][A
25it [00:00, 45.18it/s][A
30it [00:00, 45.02it/s][A
35it [00:00, 45.25it/s][A
40it [00:00, 44.99it/s][A
45it [00:00, 45.20it/s][A
50it [00:01, 45.34it/s][A
55it [00:01, 45.55it/s][A
60it [00:01, 45.28it/s][A
65it [00:01, 45.32it/s][A
70it [00:01, 45.29it/s][A
75it [00:01, 45.38it/s][A
80it [00:01, 45.21it/s][A
85it [00:01, 44.20it/s][A
90it [00:01, 44.54it/s][A
95it [00:02, 44.72it/s][A
100it [00:02, 44.74it/s][A
105it [00:02, 44.94it/s][A

Epoch: 181, Step: 100, Loss: 4.5980661725997924



110it [00:02, 43.83it/s][A
115it [00:02, 43.68it/s][A
120it [00:02, 43.15it/s][A
125it [00:02, 43.65it/s][A
130it [00:02, 44.23it/s][A
135it [00:03, 44.66it/s][A
140it [00:03, 44.33it/s][A
145it [00:03, 44.80it/s][A
150it [00:03, 45.13it/s][A
155it [00:03, 45.39it/s][A
160it [00:03, 44.72it/s][A
165it [00:03, 44.51it/s][A
170it [00:03, 44.35it/s][A
175it [00:03, 44.67it/s][A
180it [00:04, 45.29it/s][A
185it [00:04, 45.59it/s][A
190it [00:04, 45.88it/s][A
195it [00:04, 45.93it/s][A
200it [00:04, 46.28it/s][A
205it [00:04, 46.05it/s][A

Epoch: 181, Step: 200, Loss: 4.615265643596649



210it [00:04, 45.92it/s][A
215it [00:04, 45.62it/s][A
220it [00:04, 45.56it/s][A
227it [00:05, 45.03it/s]
 36%|███▌      | 181/500 [21:17<41:15,  7.76s/it]
0it [00:00, ?it/s][A
5it [00:00, 47.19it/s][A
10it [00:00, 47.19it/s][A
15it [00:00, 47.07it/s][A
20it [00:00, 46.80it/s][A
25it [00:00, 46.85it/s][A
30it [00:00, 46.67it/s][A
35it [00:00, 46.38it/s][A
40it [00:00, 46.39it/s][A
45it [00:00, 46.59it/s][A
50it [00:01, 46.74it/s][A
55it [00:01, 46.56it/s][A
60it [00:01, 46.23it/s][A
65it [00:01, 46.33it/s][A
70it [00:01, 46.50it/s][A
75it [00:01, 46.49it/s][A
80it [00:01, 46.12it/s][A
85it [00:01, 45.95it/s][A
90it [00:01, 45.76it/s][A
95it [00:02, 45.72it/s][A
100it [00:02, 45.86it/s][A
105it [00:02, 45.53it/s][A

Epoch: 182, Step: 100, Loss: 4.598894672393799



110it [00:02, 45.16it/s][A
115it [00:02, 45.53it/s][A
120it [00:02, 45.93it/s][A
125it [00:02, 46.18it/s][A
130it [00:02, 43.55it/s][A
135it [00:02, 44.20it/s][A
140it [00:03, 45.08it/s][A
145it [00:03, 45.25it/s][A
150it [00:03, 45.51it/s][A
155it [00:03, 46.46it/s][A
160it [00:03, 46.61it/s][A
165it [00:03, 46.59it/s][A
170it [00:03, 46.16it/s][A
175it [00:03, 45.89it/s][A
180it [00:03, 45.88it/s][A
185it [00:04, 45.96it/s][A
190it [00:04, 46.01it/s][A
195it [00:04, 45.76it/s][A
200it [00:04, 45.89it/s][A
205it [00:04, 46.12it/s][A

Epoch: 182, Step: 200, Loss: 4.613462631702423



210it [00:04, 45.68it/s][A
215it [00:04, 45.60it/s][A
220it [00:04, 44.39it/s][A
227it [00:04, 45.83it/s]
 36%|███▋      | 182/500 [21:22<36:40,  6.92s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.47it/s][A
10it [00:00, 46.06it/s][A
15it [00:00, 46.06it/s][A
20it [00:00, 46.27it/s][A
25it [00:00, 46.21it/s][A
30it [00:00, 45.63it/s][A
35it [00:00, 45.16it/s][A
40it [00:00, 45.24it/s][A
45it [00:00, 44.85it/s][A
50it [00:01, 45.05it/s][A
55it [00:01, 45.03it/s][A
60it [00:01, 45.02it/s][A
65it [00:01, 45.05it/s][A
70it [00:01, 45.16it/s][A
75it [00:01, 45.12it/s][A
80it [00:01, 45.08it/s][A
85it [00:01, 44.29it/s][A
90it [00:01, 44.45it/s][A
95it [00:02, 44.66it/s][A
100it [00:02, 44.86it/s][A
105it [00:02, 44.75it/s][A

Epoch: 183, Step: 100, Loss: 4.587661890983582



110it [00:02, 44.48it/s][A
115it [00:02, 43.62it/s][A
120it [00:02, 43.20it/s][A
125it [00:02, 42.95it/s][A
130it [00:02, 43.66it/s][A
135it [00:03, 44.24it/s][A
140it [00:03, 44.60it/s][A
145it [00:03, 44.69it/s][A
150it [00:03, 44.98it/s][A
155it [00:03, 44.93it/s][A
160it [00:03, 45.09it/s][A
165it [00:03, 44.97it/s][A
170it [00:03, 44.33it/s][A
175it [00:03, 44.42it/s][A
180it [00:04, 44.37it/s][A
185it [00:04, 44.01it/s][A
190it [00:04, 44.66it/s][A
195it [00:04, 44.97it/s][A
200it [00:04, 45.23it/s][A
205it [00:04, 45.44it/s][A

Epoch: 183, Step: 200, Loss: 4.61225836277008



210it [00:04, 45.48it/s][A
215it [00:04, 45.77it/s][A
220it [00:04, 44.87it/s][A
227it [00:05, 44.80it/s]
 37%|███▋      | 183/500 [21:27<33:37,  6.37s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.54it/s][A
10it [00:00, 44.96it/s][A
15it [00:00, 44.43it/s][A
20it [00:00, 43.51it/s][A
25it [00:00, 44.04it/s][A
30it [00:00, 43.76it/s][A
35it [00:00, 43.97it/s][A
40it [00:00, 44.36it/s][A
45it [00:01, 44.89it/s][A
50it [00:01, 45.28it/s][A
55it [00:01, 45.49it/s][A
60it [00:01, 45.62it/s][A
65it [00:01, 44.95it/s][A
70it [00:01, 45.04it/s][A
75it [00:01, 45.06it/s][A
80it [00:01, 44.18it/s][A
85it [00:01, 44.41it/s][A
90it [00:02, 44.77it/s][A
95it [00:02, 44.87it/s][A
100it [00:02, 45.03it/s][A
105it [00:02, 45.29it/s][A

Epoch: 184, Step: 100, Loss: 4.602095408439636



110it [00:02, 45.24it/s][A
115it [00:02, 45.38it/s][A
120it [00:02, 45.28it/s][A
125it [00:02, 45.33it/s][A
130it [00:02, 45.42it/s][A
135it [00:03, 45.41it/s][A
140it [00:03, 45.47it/s][A
145it [00:03, 45.59it/s][A
150it [00:03, 45.75it/s][A
155it [00:03, 45.60it/s][A
160it [00:03, 45.41it/s][A
165it [00:03, 45.42it/s][A
170it [00:03, 44.74it/s][A
175it [00:03, 45.02it/s][A
180it [00:04, 45.24it/s][A
185it [00:04, 44.97it/s][A
190it [00:04, 45.12it/s][A
195it [00:04, 45.25it/s][A
200it [00:04, 45.51it/s][A
205it [00:04, 44.79it/s][A

Epoch: 184, Step: 200, Loss: 4.615586004257202



210it [00:04, 45.00it/s][A
215it [00:04, 43.98it/s][A
220it [00:04, 44.59it/s][A
227it [00:05, 44.94it/s]
 37%|███▋      | 184/500 [21:32<31:27,  5.97s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.64it/s][A
10it [00:00, 43.72it/s][A
15it [00:00, 44.91it/s][A
20it [00:00, 44.93it/s][A
25it [00:00, 45.15it/s][A
30it [00:00, 45.40it/s][A
35it [00:00, 45.03it/s][A
40it [00:00, 45.27it/s][A
45it [00:00, 45.55it/s][A
50it [00:01, 45.16it/s][A
55it [00:01, 45.48it/s][A
60it [00:01, 45.61it/s][A
65it [00:01, 45.41it/s][A
70it [00:01, 45.61it/s][A
75it [00:01, 44.77it/s][A
80it [00:01, 44.46it/s][A
85it [00:01, 44.87it/s][A
90it [00:02, 44.13it/s][A
95it [00:02, 44.45it/s][A
100it [00:02, 44.82it/s][A
105it [00:02, 44.17it/s][A

Epoch: 185, Step: 100, Loss: 4.591388635635376



110it [00:02, 43.53it/s][A
115it [00:02, 43.84it/s][A
120it [00:02, 44.08it/s][A
125it [00:02, 44.26it/s][A
130it [00:02, 44.85it/s][A
135it [00:03, 45.19it/s][A
140it [00:03, 45.29it/s][A
145it [00:03, 45.49it/s][A
150it [00:03, 45.54it/s][A
155it [00:03, 45.61it/s][A
160it [00:03, 44.76it/s][A
165it [00:03, 44.11it/s][A
170it [00:03, 44.34it/s][A
175it [00:03, 44.83it/s][A
180it [00:04, 44.75it/s][A
185it [00:04, 44.58it/s][A
190it [00:04, 44.44it/s][A
195it [00:04, 44.60it/s][A
200it [00:04, 44.67it/s][A
205it [00:04, 44.41it/s][A

Epoch: 185, Step: 200, Loss: 4.613015303611755



210it [00:04, 44.67it/s][A
215it [00:04, 44.62it/s][A
220it [00:04, 44.88it/s][A
227it [00:05, 44.76it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.25it/s][A
12it [00:00, 57.09it/s][A
18it [00:00, 55.06it/s][A
25it [00:00, 57.23it/s][A
32it [00:00, 58.43it/s][A
38it [00:00, 58.63it/s][A
45it [00:00, 59.35it/s][A
52it [00:00, 59.92it/s][A
58it [00:00, 58.87it/s][A
65it [00:01, 59.33it/s][A
72it [00:01, 59.89it/s][A
78it [00:01, 57.82it/s][A
85it [00:01, 58.87it/s][A
91it [00:01, 58.73it/s][A
98it [00:01, 59.54it/s][A
104it [00:01, 58.74it/s][A
110it [00:01, 58.06it/s][A
116it [00:01, 58.52it/s][A
123it [00:02, 59.15it/s][A
130it [00:02, 59.60it/s][A
137it [00:02, 59.90it/s][A
144it [00:02, 60.14it/s][A
151it [00:02, 57.92it/s][A
157it [00:02, 58.36it/s][A
163it [00:02, 58.79it/s][A
170it [00:02, 59.29it/s][A
176it [00:02, 59.47it/s][A
182it [00:03, 58.65it/s][A
189it [00:03, 59.39it/s][A
195it [00:03, 59.26it/s][A
202it [00:03, 59.71it/s][A
208it [00:03, 5


Epoch: 185, Test Loss: 5.44183511689583, Test Perplexity: 231.7294960495848




0it [00:00, ?it/s][A
5it [00:00, 45.57it/s][A
10it [00:00, 46.04it/s][A
15it [00:00, 43.84it/s][A
20it [00:00, 44.67it/s][A
25it [00:00, 44.48it/s][A
30it [00:00, 44.96it/s][A
35it [00:00, 44.14it/s][A
40it [00:00, 44.38it/s][A
45it [00:01, 44.88it/s][A
50it [00:01, 45.29it/s][A
55it [00:01, 45.71it/s][A
60it [00:01, 46.00it/s][A
65it [00:01, 46.05it/s][A
70it [00:01, 45.97it/s][A
75it [00:01, 45.95it/s][A
80it [00:01, 45.73it/s][A
85it [00:01, 44.13it/s][A
90it [00:01, 44.65it/s][A
95it [00:02, 44.84it/s][A
100it [00:02, 45.46it/s][A
105it [00:02, 45.97it/s][A

Epoch: 186, Step: 100, Loss: 4.593449234962463



110it [00:02, 45.11it/s][A
115it [00:02, 45.38it/s][A
120it [00:02, 45.51it/s][A
125it [00:02, 45.84it/s][A
130it [00:02, 45.80it/s][A
135it [00:02, 46.14it/s][A
140it [00:03, 45.66it/s][A
145it [00:03, 45.62it/s][A
150it [00:03, 45.74it/s][A
155it [00:03, 45.86it/s][A
160it [00:03, 46.05it/s][A
165it [00:03, 46.29it/s][A
170it [00:03, 46.05it/s][A
175it [00:03, 45.77it/s][A
180it [00:03, 45.67it/s][A
185it [00:04, 45.77it/s][A
190it [00:04, 45.99it/s][A
195it [00:04, 46.18it/s][A
200it [00:04, 46.18it/s][A
205it [00:04, 45.93it/s][A

Epoch: 186, Step: 200, Loss: 4.61461341381073



210it [00:04, 45.92it/s][A
215it [00:04, 45.95it/s][A
220it [00:04, 45.93it/s][A
227it [00:04, 45.50it/s]
 37%|███▋      | 186/500 [21:53<40:42,  7.78s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.40it/s][A
10it [00:00, 46.02it/s][A
15it [00:00, 45.69it/s][A
20it [00:00, 45.66it/s][A
25it [00:00, 45.77it/s][A
30it [00:00, 45.73it/s][A
35it [00:00, 45.64it/s][A
40it [00:00, 45.57it/s][A
45it [00:00, 45.45it/s][A
50it [00:01, 45.60it/s][A
55it [00:01, 45.71it/s][A
60it [00:01, 45.66it/s][A
65it [00:01, 45.45it/s][A
70it [00:01, 45.49it/s][A
75it [00:01, 45.44it/s][A
80it [00:01, 45.07it/s][A
85it [00:01, 45.25it/s][A
90it [00:01, 45.11it/s][A
95it [00:02, 45.38it/s][A
100it [00:02, 45.18it/s][A
105it [00:02, 44.83it/s][A

Epoch: 187, Step: 100, Loss: 4.59216600894928



110it [00:02, 44.91it/s][A
115it [00:02, 45.21it/s][A
120it [00:02, 45.30it/s][A
125it [00:02, 45.56it/s][A
130it [00:02, 45.79it/s][A
135it [00:02, 45.94it/s][A
140it [00:03, 45.90it/s][A
145it [00:03, 46.06it/s][A
150it [00:03, 46.15it/s][A
155it [00:03, 46.22it/s][A
160it [00:03, 46.08it/s][A
165it [00:03, 46.03it/s][A
170it [00:03, 45.21it/s][A
175it [00:03, 45.30it/s][A
180it [00:03, 45.25it/s][A
185it [00:04, 44.18it/s][A
190it [00:04, 44.73it/s][A
195it [00:04, 45.04it/s][A
200it [00:04, 45.21it/s][A
205it [00:04, 45.36it/s][A

Epoch: 187, Step: 200, Loss: 4.608919153213501



210it [00:04, 45.03it/s][A
215it [00:04, 45.14it/s][A
220it [00:04, 44.47it/s][A
227it [00:05, 45.35it/s]
 37%|███▋      | 187/500 [21:58<36:14,  6.95s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.51it/s][A
10it [00:00, 45.38it/s][A
15it [00:00, 45.32it/s][A
20it [00:00, 45.66it/s][A
25it [00:00, 45.70it/s][A
30it [00:00, 45.58it/s][A
35it [00:00, 45.29it/s][A
40it [00:00, 44.90it/s][A
45it [00:01, 44.37it/s][A
50it [00:01, 44.38it/s][A
55it [00:01, 44.81it/s][A
60it [00:01, 45.15it/s][A
65it [00:01, 45.41it/s][A
70it [00:01, 45.46it/s][A
75it [00:01, 43.94it/s][A
80it [00:01, 43.57it/s][A
85it [00:01, 43.87it/s][A
90it [00:02, 44.21it/s][A
95it [00:02, 44.40it/s][A
100it [00:02, 44.51it/s][A
105it [00:02, 44.79it/s][A

Epoch: 188, Step: 100, Loss: 4.589791479110718



110it [00:02, 43.69it/s][A
115it [00:02, 43.20it/s][A
120it [00:02, 43.70it/s][A
125it [00:02, 43.92it/s][A
130it [00:02, 44.32it/s][A
135it [00:03, 43.69it/s][A
140it [00:03, 44.32it/s][A
145it [00:03, 44.34it/s][A
150it [00:03, 44.56it/s][A
155it [00:03, 44.81it/s][A
160it [00:03, 44.81it/s][A
165it [00:03, 44.72it/s][A
170it [00:03, 44.96it/s][A
175it [00:03, 45.11it/s][A
180it [00:04, 45.23it/s][A
185it [00:04, 45.26it/s][A
190it [00:04, 44.53it/s][A
195it [00:04, 44.81it/s][A
200it [00:04, 44.93it/s][A
205it [00:04, 45.09it/s][A

Epoch: 188, Step: 200, Loss: 4.607406165599823



210it [00:04, 44.60it/s][A
215it [00:04, 44.84it/s][A
220it [00:04, 45.16it/s][A
227it [00:05, 44.67it/s]
 38%|███▊      | 188/500 [22:03<33:13,  6.39s/it]
0it [00:00, ?it/s][A
5it [00:00, 41.64it/s][A
10it [00:00, 43.72it/s][A
15it [00:00, 44.32it/s][A
20it [00:00, 44.78it/s][A
25it [00:00, 44.92it/s][A
30it [00:00, 45.12it/s][A
35it [00:00, 44.91it/s][A
40it [00:00, 44.71it/s][A
45it [00:01, 44.81it/s][A
50it [00:01, 44.08it/s][A
55it [00:01, 44.36it/s][A
60it [00:01, 44.78it/s][A
65it [00:01, 44.53it/s][A
70it [00:01, 44.60it/s][A
75it [00:01, 44.79it/s][A
80it [00:01, 44.89it/s][A
85it [00:01, 44.85it/s][A
90it [00:02, 45.02it/s][A
95it [00:02, 45.26it/s][A
100it [00:02, 44.44it/s][A
105it [00:02, 44.87it/s][A

Epoch: 189, Step: 100, Loss: 4.596023535728454



110it [00:02, 45.07it/s][A
115it [00:02, 45.12it/s][A
120it [00:02, 45.36it/s][A
125it [00:02, 44.72it/s][A
130it [00:02, 44.92it/s][A
135it [00:03, 44.95it/s][A
140it [00:03, 45.33it/s][A
145it [00:03, 44.64it/s][A
150it [00:03, 45.02it/s][A
155it [00:03, 45.05it/s][A
160it [00:03, 45.14it/s][A
165it [00:03, 45.27it/s][A
170it [00:03, 45.45it/s][A
175it [00:03, 45.56it/s][A
180it [00:04, 45.74it/s][A
185it [00:04, 45.63it/s][A
190it [00:04, 44.25it/s][A
195it [00:04, 44.69it/s][A
200it [00:04, 44.76it/s][A
205it [00:04, 44.76it/s][A

Epoch: 189, Step: 200, Loss: 4.609124555587768



210it [00:04, 44.54it/s][A
215it [00:04, 44.41it/s][A
220it [00:04, 44.56it/s][A
227it [00:05, 44.82it/s]
 38%|███▊      | 189/500 [22:08<31:03,  5.99s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.54it/s][A
10it [00:00, 45.71it/s][A
15it [00:00, 45.36it/s][A
20it [00:00, 45.32it/s][A
25it [00:00, 45.40it/s][A
30it [00:00, 45.38it/s][A
35it [00:00, 45.55it/s][A
40it [00:00, 45.74it/s][A
45it [00:00, 45.37it/s][A
50it [00:01, 45.55it/s][A
55it [00:01, 45.72it/s][A
60it [00:01, 45.65it/s][A
65it [00:01, 45.82it/s][A
70it [00:01, 45.51it/s][A
75it [00:01, 45.34it/s][A
80it [00:01, 45.29it/s][A
85it [00:01, 45.37it/s][A
90it [00:01, 44.14it/s][A
95it [00:02, 44.82it/s][A
100it [00:02, 44.90it/s][A
105it [00:02, 44.08it/s][A

Epoch: 190, Step: 100, Loss: 4.588828845024109



110it [00:02, 44.33it/s][A
115it [00:02, 44.56it/s][A
120it [00:02, 44.71it/s][A
125it [00:02, 44.64it/s][A
130it [00:02, 44.56it/s][A
135it [00:03, 43.68it/s][A
140it [00:03, 44.33it/s][A
145it [00:03, 44.44it/s][A
150it [00:03, 44.66it/s][A
155it [00:03, 44.46it/s][A
160it [00:03, 44.36it/s][A
165it [00:03, 44.38it/s][A
170it [00:03, 44.82it/s][A
175it [00:03, 44.86it/s][A
180it [00:04, 44.88it/s][A
185it [00:04, 45.05it/s][A
190it [00:04, 45.33it/s][A
195it [00:04, 45.46it/s][A
200it [00:04, 44.80it/s][A
205it [00:04, 45.24it/s][A

Epoch: 190, Step: 200, Loss: 4.6069156002998355



210it [00:04, 44.97it/s][A
215it [00:04, 45.09it/s][A
220it [00:04, 44.89it/s][A
227it [00:05, 44.94it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.31it/s][A
13it [00:00, 60.02it/s][A
19it [00:00, 59.82it/s][A
26it [00:00, 60.13it/s][A
33it [00:00, 60.06it/s][A
40it [00:00, 60.06it/s][A
47it [00:00, 60.07it/s][A
54it [00:00, 60.30it/s][A
61it [00:01, 60.20it/s][A
68it [00:01, 60.37it/s][A
75it [00:01, 60.19it/s][A
82it [00:01, 58.19it/s][A
88it [00:01, 57.92it/s][A
94it [00:01, 58.10it/s][A
100it [00:01, 58.25it/s][A
106it [00:01, 58.72it/s][A
113it [00:01, 59.27it/s][A
119it [00:02, 59.35it/s][A
125it [00:02, 59.31it/s][A
131it [00:02, 59.39it/s][A
137it [00:02, 58.30it/s][A
143it [00:02, 58.71it/s][A
149it [00:02, 59.07it/s][A
156it [00:02, 59.52it/s][A
162it [00:02, 57.97it/s][A
169it [00:02, 58.79it/s][A
175it [00:02, 58.27it/s][A
182it [00:03, 59.15it/s][A
188it [00:03, 59.14it/s][A
194it [00:03, 58.11it/s][A
200it [00:03, 58.02it/s][A
207it [00:03, 


Epoch: 190, Test Loss: 5.453363111300498, Test Perplexity: 234.33857774438326




0it [00:00, ?it/s][A
5it [00:00, 46.31it/s][A
10it [00:00, 46.54it/s][A
15it [00:00, 46.39it/s][A
20it [00:00, 46.62it/s][A
25it [00:00, 47.19it/s][A
30it [00:00, 47.21it/s][A
35it [00:00, 46.70it/s][A
40it [00:00, 46.36it/s][A
45it [00:00, 46.23it/s][A
50it [00:01, 45.96it/s][A
55it [00:01, 45.57it/s][A
60it [00:01, 45.79it/s][A
65it [00:01, 46.05it/s][A
70it [00:01, 45.92it/s][A
75it [00:01, 45.70it/s][A
80it [00:01, 45.09it/s][A
85it [00:01, 44.95it/s][A
90it [00:01, 44.65it/s][A
95it [00:02, 44.06it/s][A
100it [00:02, 44.35it/s][A
105it [00:02, 44.85it/s][A

Epoch: 191, Step: 100, Loss: 4.591465563774109



110it [00:02, 45.06it/s][A
115it [00:02, 45.04it/s][A
120it [00:02, 45.03it/s][A
125it [00:02, 45.02it/s][A
130it [00:02, 44.94it/s][A
135it [00:02, 44.89it/s][A
140it [00:03, 45.05it/s][A
145it [00:03, 45.19it/s][A
150it [00:03, 45.32it/s][A
155it [00:03, 45.31it/s][A
160it [00:03, 44.25it/s][A
165it [00:03, 44.47it/s][A
170it [00:03, 44.70it/s][A
175it [00:03, 44.96it/s][A
180it [00:03, 45.03it/s][A
185it [00:04, 45.01it/s][A
190it [00:04, 45.10it/s][A
195it [00:04, 44.97it/s][A
200it [00:04, 45.12it/s][A
205it [00:04, 45.40it/s][A

Epoch: 191, Step: 200, Loss: 4.605924623012543



210it [00:04, 45.38it/s][A
215it [00:04, 45.40it/s][A
220it [00:04, 45.49it/s][A
227it [00:05, 45.33it/s]
 38%|███▊      | 191/500 [22:29<39:58,  7.76s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.86it/s][A
10it [00:00, 45.65it/s][A
15it [00:00, 45.43it/s][A
20it [00:00, 45.70it/s][A
25it [00:00, 45.61it/s][A
30it [00:00, 45.55it/s][A
35it [00:00, 45.70it/s][A
40it [00:00, 45.67it/s][A
45it [00:00, 45.55it/s][A
50it [00:01, 45.32it/s][A
55it [00:01, 45.35it/s][A
60it [00:01, 45.18it/s][A
65it [00:01, 44.98it/s][A
70it [00:01, 45.30it/s][A
75it [00:01, 45.52it/s][A
80it [00:01, 45.57it/s][A
85it [00:01, 45.82it/s][A
90it [00:01, 45.76it/s][A
95it [00:02, 44.53it/s][A
100it [00:02, 44.79it/s][A
105it [00:02, 44.89it/s][A

Epoch: 192, Step: 100, Loss: 4.597879815101623



110it [00:02, 45.12it/s][A
115it [00:02, 45.23it/s][A
120it [00:02, 45.20it/s][A
125it [00:02, 44.83it/s][A
130it [00:02, 45.16it/s][A
135it [00:02, 45.36it/s][A
140it [00:03, 45.65it/s][A
145it [00:03, 45.79it/s][A
150it [00:03, 45.74it/s][A
155it [00:03, 45.88it/s][A
160it [00:03, 45.18it/s][A
165it [00:03, 45.38it/s][A
170it [00:03, 45.16it/s][A
175it [00:03, 45.32it/s][A
180it [00:03, 45.39it/s][A
185it [00:04, 45.41it/s][A
190it [00:04, 45.31it/s][A
195it [00:04, 44.94it/s][A
200it [00:04, 44.91it/s][A
205it [00:04, 44.95it/s][A

Epoch: 192, Step: 200, Loss: 4.606764833927155



210it [00:04, 44.49it/s][A
215it [00:04, 44.80it/s][A
220it [00:04, 44.71it/s][A
227it [00:05, 45.23it/s]
 38%|███▊      | 192/500 [22:34<35:37,  6.94s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.49it/s][A
10it [00:00, 44.75it/s][A
15it [00:00, 44.43it/s][A
20it [00:00, 42.94it/s][A
25it [00:00, 43.17it/s][A
30it [00:00, 43.78it/s][A
35it [00:00, 43.94it/s][A
40it [00:00, 44.40it/s][A
45it [00:01, 44.80it/s][A
50it [00:01, 45.31it/s][A
55it [00:01, 44.36it/s][A
60it [00:01, 44.31it/s][A
65it [00:01, 44.89it/s][A
70it [00:01, 45.28it/s][A
75it [00:01, 45.35it/s][A
80it [00:01, 45.38it/s][A
85it [00:01, 44.33it/s][A
90it [00:02, 44.77it/s][A
95it [00:02, 43.90it/s][A
100it [00:02, 44.58it/s][A
105it [00:02, 44.81it/s][A

Epoch: 193, Step: 100, Loss: 4.5952845525741575



110it [00:02, 45.14it/s][A
115it [00:02, 45.31it/s][A
120it [00:02, 44.64it/s][A
125it [00:02, 44.81it/s][A
130it [00:02, 44.86it/s][A
135it [00:03, 45.02it/s][A
140it [00:03, 45.19it/s][A
145it [00:03, 45.42it/s][A
150it [00:03, 45.54it/s][A
155it [00:03, 45.61it/s][A
160it [00:03, 45.59it/s][A
165it [00:03, 45.52it/s][A
170it [00:03, 45.15it/s][A
175it [00:03, 45.07it/s][A
180it [00:04, 45.16it/s][A
185it [00:04, 45.11it/s][A
190it [00:04, 45.22it/s][A
195it [00:04, 45.23it/s][A
200it [00:04, 44.11it/s][A
205it [00:04, 44.53it/s][A

Epoch: 193, Step: 200, Loss: 4.604372854232788



210it [00:04, 44.84it/s][A
215it [00:04, 45.24it/s][A
220it [00:04, 45.07it/s][A
227it [00:05, 44.86it/s]
 39%|███▊      | 193/500 [22:39<32:37,  6.38s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.96it/s][A
10it [00:00, 45.21it/s][A
15it [00:00, 45.32it/s][A
20it [00:00, 45.16it/s][A
25it [00:00, 44.66it/s][A
30it [00:00, 43.94it/s][A
35it [00:00, 44.45it/s][A
40it [00:00, 44.73it/s][A
45it [00:01, 44.73it/s][A
50it [00:01, 43.48it/s][A
55it [00:01, 44.03it/s][A
60it [00:01, 44.47it/s][A
65it [00:01, 44.79it/s][A
70it [00:01, 43.63it/s][A
75it [00:01, 43.46it/s][A
80it [00:01, 44.02it/s][A
85it [00:01, 44.19it/s][A
90it [00:02, 44.15it/s][A
95it [00:02, 44.52it/s][A
100it [00:02, 44.54it/s][A
105it [00:02, 44.47it/s][A

Epoch: 194, Step: 100, Loss: 4.590706033706665



110it [00:02, 44.97it/s][A
115it [00:02, 43.91it/s][A
120it [00:02, 43.30it/s][A
125it [00:02, 44.20it/s][A
130it [00:02, 44.73it/s][A
135it [00:03, 44.27it/s][A
140it [00:03, 44.79it/s][A
145it [00:03, 45.20it/s][A
150it [00:03, 45.50it/s][A
155it [00:03, 45.76it/s][A
160it [00:03, 45.76it/s][A
165it [00:03, 45.69it/s][A
170it [00:03, 44.63it/s][A
175it [00:03, 44.76it/s][A
180it [00:04, 44.96it/s][A
185it [00:04, 45.01it/s][A
190it [00:04, 44.95it/s][A
195it [00:04, 45.30it/s][A
200it [00:04, 45.56it/s][A
205it [00:04, 45.59it/s][A

Epoch: 194, Step: 200, Loss: 4.606719648838043



210it [00:04, 44.81it/s][A
215it [00:04, 45.21it/s][A
220it [00:04, 45.29it/s][A
227it [00:05, 44.71it/s]
 39%|███▉      | 194/500 [22:44<30:32,  5.99s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.27it/s][A
10it [00:00, 46.11it/s][A
15it [00:00, 46.01it/s][A
20it [00:00, 45.59it/s][A
25it [00:00, 44.62it/s][A
30it [00:00, 44.34it/s][A
35it [00:00, 44.24it/s][A
40it [00:00, 43.62it/s][A
45it [00:01, 44.24it/s][A
50it [00:01, 44.68it/s][A
55it [00:01, 43.05it/s][A
60it [00:01, 42.91it/s][A
65it [00:01, 42.08it/s][A
70it [00:01, 43.04it/s][A
75it [00:01, 43.34it/s][A
80it [00:01, 43.67it/s][A
85it [00:01, 43.17it/s][A
90it [00:02, 43.41it/s][A
95it [00:02, 44.30it/s][A
100it [00:02, 43.95it/s][A
105it [00:02, 44.65it/s][A

Epoch: 195, Step: 100, Loss: 4.5903400039672855



110it [00:02, 44.96it/s][A
115it [00:02, 45.26it/s][A
120it [00:02, 45.25it/s][A
125it [00:02, 45.52it/s][A
130it [00:02, 45.38it/s][A
135it [00:03, 45.43it/s][A
140it [00:03, 45.26it/s][A
145it [00:03, 45.48it/s][A
150it [00:03, 45.78it/s][A
155it [00:03, 44.73it/s][A
160it [00:03, 44.70it/s][A
165it [00:03, 44.91it/s][A
170it [00:03, 45.32it/s][A
175it [00:03, 45.40it/s][A
180it [00:04, 45.05it/s][A
185it [00:04, 45.16it/s][A
190it [00:04, 45.33it/s][A
195it [00:04, 45.57it/s][A
200it [00:04, 45.32it/s][A
205it [00:04, 45.35it/s][A

Epoch: 195, Step: 200, Loss: 4.607088348865509



210it [00:04, 45.36it/s][A
215it [00:04, 45.33it/s][A
220it [00:04, 45.22it/s][A
227it [00:05, 44.68it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.15it/s][A
13it [00:00, 59.70it/s][A
20it [00:00, 60.30it/s][A
27it [00:00, 60.57it/s][A
34it [00:00, 60.41it/s][A
41it [00:00, 60.48it/s][A
48it [00:00, 58.14it/s][A
54it [00:00, 58.14it/s][A
61it [00:01, 58.96it/s][A
68it [00:01, 59.51it/s][A
74it [00:01, 58.94it/s][A
81it [00:01, 59.68it/s][A
88it [00:01, 59.96it/s][A
95it [00:01, 60.12it/s][A
102it [00:01, 60.39it/s][A
109it [00:01, 60.50it/s][A
116it [00:01, 60.25it/s][A
123it [00:02, 60.32it/s][A
130it [00:02, 59.33it/s][A
137it [00:02, 59.83it/s][A
143it [00:02, 59.45it/s][A
150it [00:02, 60.14it/s][A
157it [00:02, 60.59it/s][A
164it [00:02, 60.74it/s][A
171it [00:02, 60.77it/s][A
178it [00:02, 60.79it/s][A
185it [00:03, 60.74it/s][A
192it [00:03, 60.95it/s][A
199it [00:03, 61.16it/s][A
206it [00:03, 61.21it/s][A
213it [00:03, 61.26it/s][A
220it [00:03, 


Epoch: 195, Test Loss: 5.450954321008291, Test Perplexity: 233.79310373341815




0it [00:00, ?it/s][A
5it [00:00, 45.87it/s][A
10it [00:00, 45.44it/s][A
15it [00:00, 45.18it/s][A
20it [00:00, 44.90it/s][A
25it [00:00, 44.96it/s][A
30it [00:00, 45.09it/s][A
35it [00:00, 45.02it/s][A
40it [00:00, 45.30it/s][A
45it [00:00, 45.22it/s][A
50it [00:01, 44.25it/s][A
55it [00:01, 44.66it/s][A
60it [00:01, 45.06it/s][A
65it [00:01, 45.26it/s][A
70it [00:01, 45.35it/s][A
75it [00:01, 45.27it/s][A
80it [00:01, 45.14it/s][A
85it [00:01, 45.04it/s][A
90it [00:01, 45.16it/s][A
95it [00:02, 45.47it/s][A
100it [00:02, 45.63it/s][A
105it [00:02, 45.75it/s][A

Epoch: 196, Step: 100, Loss: 4.598071322441101



110it [00:02, 44.19it/s][A
115it [00:02, 44.56it/s][A
120it [00:02, 44.76it/s][A
125it [00:02, 44.47it/s][A
130it [00:02, 44.68it/s][A
135it [00:02, 44.90it/s][A
140it [00:03, 45.08it/s][A
145it [00:03, 44.93it/s][A
150it [00:03, 45.14it/s][A
155it [00:03, 45.26it/s][A
160it [00:03, 44.99it/s][A
165it [00:03, 44.71it/s][A
170it [00:03, 44.95it/s][A
175it [00:03, 44.20it/s][A
180it [00:04, 44.61it/s][A
185it [00:04, 44.78it/s][A
190it [00:04, 44.59it/s][A
195it [00:04, 44.56it/s][A
200it [00:04, 44.65it/s][A
205it [00:04, 44.07it/s][A

Epoch: 196, Step: 200, Loss: 4.603971862792969



210it [00:04, 44.17it/s][A
215it [00:04, 44.55it/s][A
220it [00:04, 44.96it/s][A
227it [00:05, 44.89it/s]
 39%|███▉      | 196/500 [23:05<39:25,  7.78s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.78it/s][A
10it [00:00, 44.71it/s][A
15it [00:00, 44.97it/s][A
20it [00:00, 45.00it/s][A
25it [00:00, 44.88it/s][A
30it [00:00, 45.00it/s][A
35it [00:00, 45.04it/s][A
40it [00:00, 45.21it/s][A
45it [00:01, 44.08it/s][A
50it [00:01, 43.40it/s][A
55it [00:01, 43.95it/s][A
60it [00:01, 44.52it/s][A
65it [00:01, 44.70it/s][A
70it [00:01, 44.79it/s][A
75it [00:01, 44.66it/s][A
80it [00:01, 44.87it/s][A
85it [00:01, 44.94it/s][A
90it [00:02, 44.83it/s][A
95it [00:02, 45.05it/s][A
100it [00:02, 45.47it/s][A
105it [00:02, 45.71it/s][A

Epoch: 197, Step: 100, Loss: 4.581510453224182



110it [00:02, 45.40it/s][A
115it [00:02, 45.17it/s][A
120it [00:02, 44.39it/s][A
125it [00:02, 44.83it/s][A
130it [00:02, 44.76it/s][A
135it [00:03, 45.06it/s][A
140it [00:03, 45.12it/s][A
145it [00:03, 45.24it/s][A
150it [00:03, 45.45it/s][A
155it [00:03, 45.36it/s][A
160it [00:03, 44.85it/s][A
165it [00:03, 44.59it/s][A
170it [00:03, 43.93it/s][A
175it [00:03, 43.82it/s][A
180it [00:04, 43.34it/s][A
185it [00:04, 43.87it/s][A
190it [00:04, 44.27it/s][A
195it [00:04, 44.65it/s][A
200it [00:04, 45.00it/s][A
205it [00:04, 45.30it/s][A

Epoch: 197, Step: 200, Loss: 4.604759855270386



210it [00:04, 44.66it/s][A
215it [00:04, 44.50it/s][A
220it [00:04, 44.26it/s][A
227it [00:05, 44.71it/s]
 39%|███▉      | 197/500 [23:10<35:12,  6.97s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.10it/s][A
10it [00:00, 45.90it/s][A
15it [00:00, 46.12it/s][A
20it [00:00, 46.11it/s][A
25it [00:00, 45.62it/s][A
30it [00:00, 45.02it/s][A
35it [00:00, 45.32it/s][A
40it [00:00, 44.49it/s][A
45it [00:00, 44.97it/s][A
50it [00:01, 45.39it/s][A
55it [00:01, 45.44it/s][A
60it [00:01, 44.91it/s][A
65it [00:01, 44.48it/s][A
70it [00:01, 44.63it/s][A
75it [00:01, 44.62it/s][A
80it [00:01, 45.06it/s][A
85it [00:01, 45.15it/s][A
90it [00:01, 45.33it/s][A
95it [00:02, 45.62it/s][A
100it [00:02, 45.69it/s][A
105it [00:02, 45.72it/s][A

Epoch: 198, Step: 100, Loss: 4.585039849281311



110it [00:02, 45.54it/s][A
115it [00:02, 45.75it/s][A
120it [00:02, 45.83it/s][A
125it [00:02, 45.66it/s][A
130it [00:02, 45.77it/s][A
135it [00:02, 45.72it/s][A
140it [00:03, 45.81it/s][A
145it [00:03, 45.57it/s][A
150it [00:03, 45.82it/s][A
155it [00:03, 45.89it/s][A
160it [00:03, 45.59it/s][A
165it [00:03, 45.59it/s][A
170it [00:03, 45.67it/s][A
175it [00:03, 45.76it/s][A
180it [00:03, 44.57it/s][A
185it [00:04, 45.04it/s][A
190it [00:04, 45.38it/s][A
195it [00:04, 45.53it/s][A
200it [00:04, 45.51it/s][A
205it [00:04, 45.78it/s][A

Epoch: 198, Step: 200, Loss: 4.599643919467926



210it [00:04, 45.65it/s][A
215it [00:04, 45.46it/s][A
220it [00:04, 45.61it/s][A
227it [00:05, 45.37it/s]
 40%|███▉      | 198/500 [23:15<32:07,  6.38s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.59it/s][A
10it [00:00, 45.55it/s][A
15it [00:00, 44.73it/s][A
20it [00:00, 45.09it/s][A
25it [00:00, 45.03it/s][A
30it [00:00, 45.08it/s][A
35it [00:00, 44.79it/s][A
40it [00:00, 45.27it/s][A
45it [00:00, 45.04it/s][A
50it [00:01, 45.23it/s][A
55it [00:01, 44.03it/s][A
60it [00:01, 44.57it/s][A
65it [00:01, 44.90it/s][A
70it [00:01, 45.19it/s][A
75it [00:01, 45.04it/s][A
80it [00:01, 45.24it/s][A
85it [00:01, 45.38it/s][A
90it [00:01, 45.46it/s][A
95it [00:02, 45.48it/s][A
100it [00:02, 45.31it/s][A
105it [00:02, 45.30it/s][A

Epoch: 199, Step: 100, Loss: 4.588068089485168



110it [00:02, 45.20it/s][A
115it [00:02, 45.27it/s][A
120it [00:02, 45.28it/s][A
125it [00:02, 45.45it/s][A
130it [00:02, 45.55it/s][A
135it [00:02, 45.59it/s][A
140it [00:03, 45.64it/s][A
145it [00:03, 45.54it/s][A
150it [00:03, 45.61it/s][A
155it [00:03, 45.46it/s][A
160it [00:03, 45.38it/s][A
165it [00:03, 45.21it/s][A
170it [00:03, 45.33it/s][A
175it [00:03, 45.25it/s][A
180it [00:03, 43.65it/s][A
185it [00:04, 43.67it/s][A
190it [00:04, 43.88it/s][A
195it [00:04, 44.11it/s][A
200it [00:04, 42.58it/s][A
205it [00:04, 42.24it/s][A

Epoch: 199, Step: 200, Loss: 4.5974547386169435



210it [00:04, 43.05it/s][A
215it [00:04, 43.68it/s][A
220it [00:04, 43.52it/s][A
227it [00:05, 44.61it/s]
 40%|███▉      | 199/500 [23:20<30:04,  5.99s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.96it/s][A
10it [00:00, 42.92it/s][A
15it [00:00, 44.30it/s][A
20it [00:00, 44.86it/s][A
25it [00:00, 45.20it/s][A
30it [00:00, 45.16it/s][A
35it [00:00, 45.36it/s][A
40it [00:00, 45.44it/s][A
45it [00:00, 45.69it/s][A
50it [00:01, 45.79it/s][A
55it [00:01, 45.95it/s][A
60it [00:01, 45.87it/s][A
65it [00:01, 45.87it/s][A
70it [00:01, 46.00it/s][A
75it [00:01, 45.85it/s][A
80it [00:01, 45.62it/s][A
85it [00:01, 45.29it/s][A
90it [00:01, 45.10it/s][A
95it [00:02, 45.09it/s][A
100it [00:02, 45.30it/s][A
105it [00:02, 45.45it/s][A

Epoch: 200, Step: 100, Loss: 4.597847394943237



110it [00:02, 45.51it/s][A
115it [00:02, 45.37it/s][A
120it [00:02, 45.41it/s][A
125it [00:02, 45.34it/s][A
130it [00:02, 45.03it/s][A
135it [00:02, 45.39it/s][A
140it [00:03, 45.60it/s][A
145it [00:03, 45.85it/s][A
150it [00:03, 45.78it/s][A
155it [00:03, 44.34it/s][A
160it [00:03, 44.73it/s][A
165it [00:03, 45.12it/s][A
170it [00:03, 45.41it/s][A
175it [00:03, 45.69it/s][A
180it [00:03, 45.88it/s][A
185it [00:04, 45.91it/s][A
190it [00:04, 46.13it/s][A
195it [00:04, 45.89it/s][A
200it [00:04, 46.14it/s][A
205it [00:04, 45.13it/s][A

Epoch: 200, Step: 200, Loss: 4.600345160961151



210it [00:04, 45.69it/s][A
215it [00:04, 46.15it/s][A
220it [00:04, 46.16it/s][A
227it [00:04, 45.49it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.24it/s][A
13it [00:00, 59.78it/s][A
19it [00:00, 57.83it/s][A
26it [00:00, 59.17it/s][A
33it [00:00, 59.74it/s][A
40it [00:00, 60.21it/s][A
47it [00:00, 60.55it/s][A
54it [00:00, 59.70it/s][A
61it [00:01, 60.09it/s][A
68it [00:01, 60.40it/s][A
75it [00:01, 60.63it/s][A
82it [00:01, 60.87it/s][A
89it [00:01, 60.93it/s][A
96it [00:01, 60.89it/s][A
103it [00:01, 61.17it/s][A
110it [00:01, 61.37it/s][A
117it [00:01, 61.25it/s][A
124it [00:02, 61.22it/s][A
131it [00:02, 61.31it/s][A
138it [00:02, 61.33it/s][A
145it [00:02, 61.47it/s][A
152it [00:02, 61.09it/s][A
159it [00:02, 61.07it/s][A
166it [00:02, 61.27it/s][A
173it [00:02, 61.22it/s][A
180it [00:02, 60.96it/s][A
187it [00:03, 61.22it/s][A
194it [00:03, 61.33it/s][A
201it [00:03, 61.26it/s][A
208it [00:03, 61.14it/s][A
215it [00:03, 61.02it/s][A
222it [00:03, 


Epoch: 200, Test Loss: 5.460950178389224, Test Perplexity: 236.23418523658137




0it [00:00, ?it/s][A
5it [00:00, 45.58it/s][A
10it [00:00, 45.88it/s][A
15it [00:00, 45.76it/s][A
20it [00:00, 45.83it/s][A
25it [00:00, 45.71it/s][A
30it [00:00, 45.63it/s][A
35it [00:00, 45.58it/s][A
40it [00:00, 45.78it/s][A
45it [00:00, 45.68it/s][A
50it [00:01, 45.79it/s][A
55it [00:01, 45.04it/s][A
60it [00:01, 44.25it/s][A
65it [00:01, 44.80it/s][A
70it [00:01, 44.98it/s][A
75it [00:01, 45.00it/s][A
80it [00:01, 44.50it/s][A
85it [00:01, 44.92it/s][A
90it [00:01, 44.62it/s][A
95it [00:02, 43.63it/s][A
100it [00:02, 44.29it/s][A
105it [00:02, 44.78it/s][A

Epoch: 201, Step: 100, Loss: 4.582982153892517



110it [00:02, 45.01it/s][A
115it [00:02, 45.09it/s][A
120it [00:02, 45.27it/s][A
125it [00:02, 45.32it/s][A
130it [00:02, 44.60it/s][A
135it [00:02, 45.02it/s][A
140it [00:03, 45.03it/s][A
145it [00:03, 44.61it/s][A
150it [00:03, 45.22it/s][A
155it [00:03, 45.41it/s][A
160it [00:03, 45.29it/s][A
165it [00:03, 45.17it/s][A
170it [00:03, 44.85it/s][A
175it [00:03, 44.44it/s][A
180it [00:03, 44.76it/s][A
185it [00:04, 45.18it/s][A
190it [00:04, 45.30it/s][A
195it [00:04, 45.25it/s][A
200it [00:04, 44.93it/s][A
205it [00:04, 44.73it/s][A

Epoch: 201, Step: 200, Loss: 4.6012840700149535



210it [00:04, 44.70it/s][A
215it [00:04, 44.58it/s][A
220it [00:04, 44.94it/s][A
227it [00:05, 45.00it/s]
 40%|████      | 201/500 [23:41<38:40,  7.76s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.93it/s][A
10it [00:00, 46.04it/s][A
15it [00:00, 45.45it/s][A
20it [00:00, 45.39it/s][A
25it [00:00, 44.95it/s][A
30it [00:00, 44.85it/s][A
35it [00:00, 44.05it/s][A
40it [00:00, 44.04it/s][A
45it [00:01, 42.90it/s][A
50it [00:01, 43.15it/s][A
55it [00:01, 43.37it/s][A
60it [00:01, 44.02it/s][A
65it [00:01, 43.99it/s][A
70it [00:01, 44.38it/s][A
75it [00:01, 44.47it/s][A
80it [00:01, 44.54it/s][A
85it [00:01, 44.61it/s][A
90it [00:02, 43.86it/s][A
95it [00:02, 44.04it/s][A
100it [00:02, 44.04it/s][A
105it [00:02, 44.44it/s][A

Epoch: 202, Step: 100, Loss: 4.573452105522156



110it [00:02, 44.67it/s][A
115it [00:02, 44.85it/s][A
120it [00:02, 44.89it/s][A
125it [00:02, 43.56it/s][A
130it [00:02, 44.11it/s][A
135it [00:03, 44.35it/s][A
140it [00:03, 44.70it/s][A
145it [00:03, 44.78it/s][A
150it [00:03, 44.81it/s][A
155it [00:03, 44.96it/s][A
160it [00:03, 44.13it/s][A
165it [00:03, 43.85it/s][A
170it [00:03, 44.18it/s][A
175it [00:03, 44.59it/s][A
180it [00:04, 45.02it/s][A
185it [00:04, 45.21it/s][A
190it [00:04, 45.18it/s][A
195it [00:04, 45.15it/s][A
200it [00:04, 45.29it/s][A
205it [00:04, 45.07it/s][A

Epoch: 202, Step: 200, Loss: 4.596026167869568



210it [00:04, 45.17it/s][A
215it [00:04, 44.99it/s][A
220it [00:04, 45.27it/s][A
227it [00:05, 44.49it/s]
 40%|████      | 202/500 [23:46<34:35,  6.96s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.05it/s][A
10it [00:00, 45.16it/s][A
15it [00:00, 45.08it/s][A
20it [00:00, 44.83it/s][A
25it [00:00, 45.21it/s][A
30it [00:00, 44.82it/s][A
35it [00:00, 45.16it/s][A
40it [00:00, 44.90it/s][A
45it [00:01, 44.90it/s][A
50it [00:01, 43.93it/s][A
55it [00:01, 44.40it/s][A
60it [00:01, 44.84it/s][A
65it [00:01, 44.87it/s][A
70it [00:01, 45.07it/s][A
75it [00:01, 45.20it/s][A
80it [00:01, 45.32it/s][A
85it [00:01, 45.44it/s][A
90it [00:02, 45.33it/s][A
95it [00:02, 45.25it/s][A
100it [00:02, 45.38it/s][A
105it [00:02, 45.43it/s][A

Epoch: 203, Step: 100, Loss: 4.578468060493469



110it [00:02, 45.31it/s][A
115it [00:02, 45.43it/s][A
120it [00:02, 45.30it/s][A
125it [00:02, 45.41it/s][A
130it [00:02, 45.39it/s][A
135it [00:02, 45.44it/s][A
140it [00:03, 45.37it/s][A
145it [00:03, 44.40it/s][A
150it [00:03, 44.83it/s][A
155it [00:03, 45.12it/s][A
160it [00:03, 45.13it/s][A
165it [00:03, 45.24it/s][A
170it [00:03, 44.91it/s][A
175it [00:03, 45.17it/s][A
180it [00:03, 45.36it/s][A
185it [00:04, 45.13it/s][A
190it [00:04, 45.08it/s][A
195it [00:04, 45.43it/s][A
200it [00:04, 45.66it/s][A
205it [00:04, 45.36it/s][A

Epoch: 203, Step: 200, Loss: 4.598312947750092



210it [00:04, 45.10it/s][A
215it [00:04, 45.04it/s][A
220it [00:04, 45.21it/s][A
227it [00:05, 45.12it/s]
 41%|████      | 203/500 [23:51<31:36,  6.39s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.53it/s][A
10it [00:00, 46.04it/s][A
15it [00:00, 46.10it/s][A
20it [00:00, 45.94it/s][A
25it [00:00, 45.70it/s][A
30it [00:00, 45.67it/s][A
35it [00:00, 45.44it/s][A
40it [00:00, 45.09it/s][A
45it [00:00, 45.37it/s][A
50it [00:01, 45.24it/s][A
55it [00:01, 45.02it/s][A
60it [00:01, 45.19it/s][A
65it [00:01, 44.91it/s][A
70it [00:01, 45.08it/s][A
75it [00:01, 45.00it/s][A
80it [00:01, 44.75it/s][A
85it [00:01, 45.08it/s][A
90it [00:01, 45.01it/s][A
95it [00:02, 45.04it/s][A
100it [00:02, 44.75it/s][A
105it [00:02, 44.62it/s][A

Epoch: 204, Step: 100, Loss: 4.577832975387573



110it [00:02, 44.64it/s][A
115it [00:02, 44.75it/s][A
120it [00:02, 44.74it/s][A
125it [00:02, 44.88it/s][A
130it [00:02, 45.10it/s][A
135it [00:02, 45.05it/s][A
140it [00:03, 44.82it/s][A
145it [00:03, 44.01it/s][A
150it [00:03, 44.22it/s][A
155it [00:03, 43.65it/s][A
160it [00:03, 43.77it/s][A
165it [00:03, 44.11it/s][A
170it [00:03, 44.49it/s][A
175it [00:03, 44.86it/s][A
180it [00:04, 45.09it/s][A
185it [00:04, 45.38it/s][A
190it [00:04, 45.32it/s][A
195it [00:04, 45.45it/s][A
200it [00:04, 45.39it/s][A
205it [00:04, 45.21it/s][A

Epoch: 204, Step: 200, Loss: 4.593793709278106



210it [00:04, 45.31it/s][A
215it [00:04, 44.62it/s][A
220it [00:04, 45.24it/s][A
227it [00:05, 44.98it/s]
 41%|████      | 204/500 [23:56<29:31,  5.98s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.93it/s][A
10it [00:00, 46.03it/s][A
15it [00:00, 44.15it/s][A
20it [00:00, 44.74it/s][A
25it [00:00, 44.86it/s][A
30it [00:00, 45.13it/s][A
35it [00:00, 45.45it/s][A
40it [00:00, 44.88it/s][A
45it [00:00, 45.36it/s][A
50it [00:01, 45.45it/s][A
55it [00:01, 45.38it/s][A
60it [00:01, 45.37it/s][A
65it [00:01, 45.77it/s][A
70it [00:01, 45.83it/s][A
75it [00:01, 46.00it/s][A
80it [00:01, 46.31it/s][A
85it [00:01, 45.79it/s][A
90it [00:01, 46.17it/s][A
95it [00:02, 45.85it/s][A
100it [00:02, 46.10it/s][A
105it [00:02, 46.29it/s][A

Epoch: 205, Step: 100, Loss: 4.5728456497192385



110it [00:02, 45.97it/s][A
115it [00:02, 45.37it/s][A
120it [00:02, 45.90it/s][A
125it [00:02, 46.11it/s][A
130it [00:02, 46.35it/s][A
135it [00:02, 46.45it/s][A
140it [00:03, 46.44it/s][A
145it [00:03, 46.56it/s][A
150it [00:03, 46.10it/s][A
155it [00:03, 46.30it/s][A
160it [00:03, 46.49it/s][A
165it [00:03, 46.51it/s][A
170it [00:03, 45.52it/s][A
175it [00:03, 46.04it/s][A
180it [00:03, 46.13it/s][A
185it [00:04, 46.07it/s][A
190it [00:04, 45.29it/s][A
195it [00:04, 46.04it/s][A
200it [00:04, 46.51it/s][A
205it [00:04, 46.35it/s][A

Epoch: 205, Step: 200, Loss: 4.591950323581695



210it [00:04, 46.02it/s][A
215it [00:04, 46.04it/s][A
220it [00:04, 45.86it/s][A
227it [00:04, 45.83it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.23it/s][A
13it [00:00, 59.15it/s][A
19it [00:00, 57.39it/s][A
25it [00:00, 57.87it/s][A
31it [00:00, 58.14it/s][A
38it [00:00, 58.99it/s][A
45it [00:00, 59.53it/s][A
52it [00:00, 60.07it/s][A
59it [00:00, 60.21it/s][A
66it [00:01, 60.03it/s][A
73it [00:01, 60.05it/s][A
80it [00:01, 60.35it/s][A
87it [00:01, 60.50it/s][A
94it [00:01, 60.63it/s][A
101it [00:01, 60.72it/s][A
108it [00:01, 60.78it/s][A
115it [00:01, 60.95it/s][A
122it [00:02, 60.83it/s][A
129it [00:02, 60.90it/s][A
136it [00:02, 60.49it/s][A
143it [00:02, 60.89it/s][A
150it [00:02, 60.97it/s][A
157it [00:02, 60.67it/s][A
164it [00:02, 60.58it/s][A
171it [00:02, 60.47it/s][A
178it [00:02, 60.53it/s][A
185it [00:03, 60.45it/s][A
192it [00:03, 59.58it/s][A
199it [00:03, 59.90it/s][A
206it [00:03, 60.05it/s][A
213it [00:03, 59.91it/s][A
219it [00:03, 


Epoch: 205, Test Loss: 5.456296837107735, Test Perplexity: 235.00949497104432




0it [00:00, ?it/s][A
5it [00:00, 45.08it/s][A
10it [00:00, 45.33it/s][A
15it [00:00, 45.35it/s][A
20it [00:00, 45.50it/s][A
25it [00:00, 45.71it/s][A
30it [00:00, 45.61it/s][A
35it [00:00, 45.01it/s][A
40it [00:00, 45.25it/s][A
45it [00:00, 45.30it/s][A
50it [00:01, 45.36it/s][A
55it [00:01, 45.42it/s][A
60it [00:01, 45.51it/s][A
65it [00:01, 44.78it/s][A
70it [00:01, 45.07it/s][A
75it [00:01, 45.04it/s][A
80it [00:01, 45.20it/s][A
85it [00:01, 45.33it/s][A
90it [00:01, 45.45it/s][A
95it [00:02, 45.57it/s][A
100it [00:02, 45.73it/s][A
105it [00:02, 45.81it/s][A

Epoch: 206, Step: 100, Loss: 4.5832678413391115



110it [00:02, 45.45it/s][A
115it [00:02, 45.28it/s][A
120it [00:02, 45.08it/s][A
125it [00:02, 45.35it/s][A
130it [00:02, 45.55it/s][A
135it [00:02, 45.86it/s][A
140it [00:03, 45.90it/s][A
145it [00:03, 45.09it/s][A
150it [00:03, 45.41it/s][A
155it [00:03, 45.45it/s][A
160it [00:03, 45.52it/s][A
165it [00:03, 45.41it/s][A
170it [00:03, 45.40it/s][A
175it [00:03, 45.35it/s][A
180it [00:03, 45.50it/s][A
185it [00:04, 45.36it/s][A
190it [00:04, 45.14it/s][A
195it [00:04, 44.49it/s][A
200it [00:04, 44.47it/s][A
205it [00:04, 44.01it/s][A

Epoch: 206, Step: 200, Loss: 4.594583189487457



210it [00:04, 44.34it/s][A
215it [00:04, 44.49it/s][A
220it [00:04, 44.79it/s][A
227it [00:05, 45.19it/s]
 41%|████      | 206/500 [24:17<37:58,  7.75s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.10it/s][A
10it [00:00, 43.60it/s][A
15it [00:00, 44.14it/s][A
20it [00:00, 44.36it/s][A
25it [00:00, 44.45it/s][A
30it [00:00, 44.43it/s][A
35it [00:00, 44.62it/s][A
40it [00:00, 44.82it/s][A
45it [00:01, 44.99it/s][A
50it [00:01, 45.07it/s][A
55it [00:01, 45.18it/s][A
60it [00:01, 44.86it/s][A
65it [00:01, 44.94it/s][A
70it [00:01, 45.27it/s][A
75it [00:01, 45.15it/s][A
80it [00:01, 45.19it/s][A
85it [00:01, 45.25it/s][A
90it [00:02, 45.38it/s][A
95it [00:02, 45.17it/s][A
100it [00:02, 45.20it/s][A
105it [00:02, 45.25it/s][A

Epoch: 207, Step: 100, Loss: 4.586424283981323



110it [00:02, 45.31it/s][A
115it [00:02, 45.39it/s][A
120it [00:02, 45.18it/s][A
125it [00:02, 45.34it/s][A
130it [00:02, 45.38it/s][A
135it [00:02, 45.55it/s][A
140it [00:03, 45.84it/s][A
145it [00:03, 45.74it/s][A
150it [00:03, 45.64it/s][A
155it [00:03, 45.49it/s][A
160it [00:03, 45.20it/s][A
165it [00:03, 45.15it/s][A
170it [00:03, 45.16it/s][A
175it [00:03, 45.33it/s][A
180it [00:03, 45.44it/s][A
185it [00:04, 43.88it/s][A
190it [00:04, 44.19it/s][A
195it [00:04, 44.62it/s][A
200it [00:04, 44.90it/s][A
205it [00:04, 44.88it/s][A

Epoch: 207, Step: 200, Loss: 4.594496042728424



210it [00:04, 45.11it/s][A
215it [00:04, 45.23it/s][A
220it [00:04, 45.45it/s][A
227it [00:05, 44.97it/s]
 41%|████▏     | 207/500 [24:22<33:53,  6.94s/it]
0it [00:00, ?it/s][A
4it [00:00, 39.91it/s][A
9it [00:00, 43.25it/s][A
14it [00:00, 43.91it/s][A
19it [00:00, 44.52it/s][A
24it [00:00, 44.61it/s][A
29it [00:00, 44.81it/s][A
34it [00:00, 45.01it/s][A
39it [00:00, 45.11it/s][A
44it [00:00, 45.37it/s][A
49it [00:01, 45.46it/s][A
54it [00:01, 45.56it/s][A
59it [00:01, 45.45it/s][A
64it [00:01, 45.47it/s][A
69it [00:01, 44.92it/s][A
74it [00:01, 45.23it/s][A
79it [00:01, 45.48it/s][A
84it [00:01, 45.39it/s][A
89it [00:01, 45.42it/s][A
94it [00:02, 45.33it/s][A
99it [00:02, 45.56it/s][A
104it [00:02, 45.64it/s][A
109it [00:02, 45.75it/s][A

Epoch: 208, Step: 100, Loss: 4.588309602737427



114it [00:02, 45.46it/s][A
119it [00:02, 45.37it/s][A
124it [00:02, 44.93it/s][A
129it [00:02, 44.85it/s][A
134it [00:02, 45.06it/s][A
139it [00:03, 44.95it/s][A
144it [00:03, 44.97it/s][A
149it [00:03, 44.99it/s][A
154it [00:03, 44.99it/s][A
159it [00:03, 45.30it/s][A
164it [00:03, 45.52it/s][A
169it [00:03, 44.89it/s][A
174it [00:03, 45.19it/s][A
179it [00:03, 45.36it/s][A
184it [00:04, 45.31it/s][A
189it [00:04, 44.84it/s][A
194it [00:04, 45.25it/s][A
199it [00:04, 44.40it/s][A
204it [00:04, 45.01it/s][A
209it [00:04, 44.88it/s][A

Epoch: 208, Step: 200, Loss: 4.59266275882721



214it [00:04, 44.77it/s][A
219it [00:04, 44.87it/s][A
227it [00:05, 45.03it/s]
 42%|████▏     | 208/500 [24:27<31:00,  6.37s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.71it/s][A
10it [00:00, 45.94it/s][A
15it [00:00, 45.59it/s][A
20it [00:00, 45.73it/s][A
25it [00:00, 45.25it/s][A
30it [00:00, 44.51it/s][A
35it [00:00, 44.95it/s][A
40it [00:00, 44.63it/s][A
45it [00:00, 44.97it/s][A
50it [00:01, 45.25it/s][A
55it [00:01, 45.42it/s][A
60it [00:01, 45.76it/s][A
65it [00:01, 45.86it/s][A
70it [00:01, 45.76it/s][A
75it [00:01, 44.70it/s][A
80it [00:01, 44.63it/s][A
85it [00:01, 44.37it/s][A
90it [00:01, 44.47it/s][A
95it [00:02, 44.81it/s][A
100it [00:02, 45.01it/s][A
105it [00:02, 44.97it/s][A

Epoch: 209, Step: 100, Loss: 4.570659756660461



110it [00:02, 45.19it/s][A
115it [00:02, 45.49it/s][A
120it [00:02, 44.23it/s][A
125it [00:02, 43.38it/s][A
130it [00:02, 43.82it/s][A
135it [00:03, 44.10it/s][A
140it [00:03, 44.54it/s][A
145it [00:03, 45.05it/s][A
150it [00:03, 45.10it/s][A
155it [00:03, 45.24it/s][A
160it [00:03, 45.34it/s][A
165it [00:03, 45.68it/s][A
170it [00:03, 46.01it/s][A
175it [00:03, 44.62it/s][A
180it [00:04, 43.80it/s][A
185it [00:04, 44.64it/s][A
190it [00:04, 45.21it/s][A
195it [00:04, 44.85it/s][A
200it [00:04, 45.32it/s][A
205it [00:04, 45.64it/s][A

Epoch: 209, Step: 200, Loss: 4.592768752574921



210it [00:04, 45.83it/s][A
215it [00:04, 44.97it/s][A
220it [00:04, 45.53it/s][A
227it [00:05, 45.05it/s]
 42%|████▏     | 209/500 [24:32<28:58,  5.97s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.38it/s][A
10it [00:00, 46.14it/s][A
15it [00:00, 45.98it/s][A
20it [00:00, 45.87it/s][A
25it [00:00, 45.36it/s][A
30it [00:00, 45.63it/s][A
35it [00:00, 45.20it/s][A
40it [00:00, 45.00it/s][A
45it [00:00, 44.92it/s][A
50it [00:01, 45.25it/s][A
55it [00:01, 45.30it/s][A
60it [00:01, 45.40it/s][A
65it [00:01, 45.28it/s][A
70it [00:01, 45.42it/s][A
75it [00:01, 45.32it/s][A
80it [00:01, 45.17it/s][A
85it [00:01, 45.23it/s][A
90it [00:01, 44.64it/s][A
95it [00:02, 44.98it/s][A
100it [00:02, 45.16it/s][A
105it [00:02, 45.07it/s][A

Epoch: 210, Step: 100, Loss: 4.577213869094849



110it [00:02, 45.27it/s][A
115it [00:02, 45.25it/s][A
120it [00:02, 44.90it/s][A
125it [00:02, 44.61it/s][A
130it [00:02, 44.95it/s][A
135it [00:02, 44.98it/s][A
140it [00:03, 44.88it/s][A
145it [00:03, 44.98it/s][A
150it [00:03, 45.16it/s][A
155it [00:03, 44.87it/s][A
160it [00:03, 44.99it/s][A
165it [00:03, 45.00it/s][A
170it [00:03, 44.25it/s][A
175it [00:03, 44.83it/s][A
180it [00:03, 45.15it/s][A
185it [00:04, 45.07it/s][A
190it [00:04, 45.28it/s][A
195it [00:04, 45.47it/s][A
200it [00:04, 45.63it/s][A
205it [00:04, 45.48it/s][A

Epoch: 210, Step: 200, Loss: 4.588627336025238



210it [00:04, 45.17it/s][A
215it [00:04, 45.44it/s][A
220it [00:04, 45.49it/s][A
227it [00:05, 45.18it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.86it/s][A
13it [00:00, 59.44it/s][A
20it [00:00, 60.00it/s][A
27it [00:00, 60.21it/s][A
34it [00:00, 59.69it/s][A
40it [00:00, 57.13it/s][A
47it [00:00, 58.32it/s][A
54it [00:00, 58.99it/s][A
61it [00:01, 59.37it/s][A
68it [00:01, 59.65it/s][A
74it [00:01, 58.24it/s][A
80it [00:01, 58.72it/s][A
87it [00:01, 59.20it/s][A
93it [00:01, 57.05it/s][A
100it [00:01, 58.10it/s][A
107it [00:01, 58.85it/s][A
113it [00:01, 59.04it/s][A
119it [00:02, 57.34it/s][A
125it [00:02, 57.57it/s][A
131it [00:02, 58.02it/s][A
137it [00:02, 57.77it/s][A
144it [00:02, 58.74it/s][A
151it [00:02, 59.26it/s][A
158it [00:02, 59.74it/s][A
165it [00:02, 60.00it/s][A
172it [00:02, 60.21it/s][A
179it [00:03, 58.79it/s][A
185it [00:03, 58.75it/s][A
191it [00:03, 58.39it/s][A
197it [00:03, 57.65it/s][A
203it [00:03, 58.12it/s][A
209it [00:03, 


Epoch: 210, Test Loss: 5.459942386017083, Test Perplexity: 235.91693025198043




0it [00:00, ?it/s][A
5it [00:00, 41.34it/s][A
10it [00:00, 43.59it/s][A
15it [00:00, 44.33it/s][A
20it [00:00, 44.69it/s][A
25it [00:00, 43.78it/s][A
30it [00:00, 44.81it/s][A
35it [00:00, 45.44it/s][A
40it [00:00, 45.31it/s][A
45it [00:01, 45.24it/s][A
50it [00:01, 45.18it/s][A
55it [00:01, 44.96it/s][A
60it [00:01, 45.21it/s][A
65it [00:01, 44.03it/s][A
70it [00:01, 44.46it/s][A
75it [00:01, 44.79it/s][A
80it [00:01, 44.80it/s][A
85it [00:01, 44.74it/s][A
90it [00:02, 45.21it/s][A
95it [00:02, 45.38it/s][A
100it [00:02, 45.42it/s][A
105it [00:02, 45.50it/s][A

Epoch: 211, Step: 100, Loss: 4.578412094116211



110it [00:02, 44.79it/s][A
115it [00:02, 43.91it/s][A
120it [00:02, 44.45it/s][A
125it [00:02, 44.60it/s][A
130it [00:02, 44.96it/s][A
135it [00:03, 45.10it/s][A
140it [00:03, 44.05it/s][A
145it [00:03, 44.81it/s][A
150it [00:03, 45.00it/s][A
155it [00:03, 44.16it/s][A
160it [00:03, 44.27it/s][A
165it [00:03, 44.32it/s][A
170it [00:03, 44.07it/s][A
175it [00:03, 44.21it/s][A
180it [00:04, 44.80it/s][A
185it [00:04, 44.86it/s][A
190it [00:04, 45.08it/s][A
195it [00:04, 45.02it/s][A
200it [00:04, 44.95it/s][A
205it [00:04, 44.75it/s][A

Epoch: 211, Step: 200, Loss: 4.591754128932953



210it [00:04, 44.85it/s][A
215it [00:04, 45.00it/s][A
220it [00:04, 44.98it/s][A
227it [00:05, 44.74it/s]
 42%|████▏     | 211/500 [24:53<37:36,  7.81s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.36it/s][A
10it [00:00, 45.24it/s][A
15it [00:00, 44.94it/s][A
20it [00:00, 44.80it/s][A
25it [00:00, 44.94it/s][A
30it [00:00, 45.08it/s][A
35it [00:00, 45.08it/s][A
40it [00:00, 44.99it/s][A
45it [00:01, 45.03it/s][A
50it [00:01, 45.19it/s][A
55it [00:01, 44.38it/s][A
60it [00:01, 44.89it/s][A
65it [00:01, 44.42it/s][A
70it [00:01, 44.72it/s][A
75it [00:01, 45.10it/s][A
80it [00:01, 45.43it/s][A
85it [00:01, 45.37it/s][A
90it [00:01, 45.52it/s][A
95it [00:02, 45.73it/s][A
100it [00:02, 46.00it/s][A
105it [00:02, 45.81it/s][A

Epoch: 212, Step: 100, Loss: 4.578305630683899



110it [00:02, 45.59it/s][A
115it [00:02, 45.40it/s][A
120it [00:02, 45.57it/s][A
125it [00:02, 45.64it/s][A
130it [00:02, 45.65it/s][A
135it [00:02, 45.77it/s][A
140it [00:03, 45.96it/s][A
145it [00:03, 45.82it/s][A
150it [00:03, 45.56it/s][A
155it [00:03, 44.26it/s][A
160it [00:03, 44.58it/s][A
165it [00:03, 44.66it/s][A
170it [00:03, 44.86it/s][A
175it [00:03, 45.11it/s][A
180it [00:03, 45.45it/s][A
185it [00:04, 45.73it/s][A
190it [00:04, 45.86it/s][A
195it [00:04, 45.69it/s][A
200it [00:04, 45.82it/s][A
205it [00:04, 45.41it/s][A

Epoch: 212, Step: 200, Loss: 4.592381963729858



210it [00:04, 45.24it/s][A
215it [00:04, 45.13it/s][A
220it [00:04, 45.38it/s][A
227it [00:05, 45.28it/s]
 42%|████▏     | 212/500 [24:58<33:27,  6.97s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.13it/s][A
10it [00:00, 46.16it/s][A
15it [00:00, 45.19it/s][A
20it [00:00, 43.46it/s][A
25it [00:00, 43.47it/s][A
30it [00:00, 43.44it/s][A
35it [00:00, 44.18it/s][A
40it [00:00, 44.74it/s][A
45it [00:01, 44.49it/s][A
50it [00:01, 44.48it/s][A
55it [00:01, 44.79it/s][A
60it [00:01, 45.07it/s][A
65it [00:01, 44.96it/s][A
70it [00:01, 44.72it/s][A
75it [00:01, 45.26it/s][A
80it [00:01, 45.51it/s][A
85it [00:01, 45.49it/s][A
90it [00:02, 44.63it/s][A
95it [00:02, 44.85it/s][A
100it [00:02, 45.13it/s][A
105it [00:02, 45.19it/s][A

Epoch: 213, Step: 100, Loss: 4.5770235347747805



110it [00:02, 45.07it/s][A
115it [00:02, 45.23it/s][A
120it [00:02, 45.51it/s][A
125it [00:02, 45.72it/s][A
130it [00:02, 44.73it/s][A
135it [00:03, 45.38it/s][A
140it [00:03, 45.94it/s][A
145it [00:03, 46.37it/s][A
150it [00:03, 45.39it/s][A
155it [00:03, 45.21it/s][A
160it [00:03, 44.78it/s][A
165it [00:03, 45.26it/s][A
170it [00:03, 45.66it/s][A
175it [00:03, 45.31it/s][A
180it [00:03, 45.02it/s][A
185it [00:04, 45.30it/s][A
190it [00:04, 45.43it/s][A
195it [00:04, 44.84it/s][A
200it [00:04, 45.30it/s][A
205it [00:04, 45.18it/s][A

Epoch: 213, Step: 200, Loss: 4.589159920215606



210it [00:04, 44.76it/s][A
215it [00:04, 45.05it/s][A
220it [00:04, 45.00it/s][A
227it [00:05, 44.86it/s]
 43%|████▎     | 213/500 [25:03<30:36,  6.40s/it]
0it [00:00, ?it/s][A
5it [00:00, 41.75it/s][A
10it [00:00, 44.05it/s][A
15it [00:00, 44.74it/s][A
20it [00:00, 45.24it/s][A
25it [00:00, 44.43it/s][A
30it [00:00, 44.91it/s][A
35it [00:00, 45.23it/s][A
40it [00:00, 45.45it/s][A
45it [00:00, 45.65it/s][A
50it [00:01, 45.42it/s][A
55it [00:01, 45.61it/s][A
60it [00:01, 46.26it/s][A
65it [00:01, 46.60it/s][A
70it [00:01, 46.59it/s][A
75it [00:01, 46.06it/s][A
80it [00:01, 45.69it/s][A
85it [00:01, 45.43it/s][A
90it [00:01, 45.48it/s][A
95it [00:02, 44.24it/s][A
100it [00:02, 43.13it/s][A
105it [00:02, 43.97it/s][A

Epoch: 214, Step: 100, Loss: 4.582294702529907



110it [00:02, 44.28it/s][A
115it [00:02, 44.50it/s][A
120it [00:02, 44.60it/s][A
125it [00:02, 44.64it/s][A
130it [00:02, 44.76it/s][A
135it [00:02, 44.92it/s][A
140it [00:03, 44.51it/s][A
145it [00:03, 45.00it/s][A
150it [00:03, 44.96it/s][A
155it [00:03, 44.80it/s][A
160it [00:03, 45.16it/s][A
165it [00:03, 45.21it/s][A
170it [00:03, 44.09it/s][A
175it [00:03, 44.66it/s][A
180it [00:04, 44.66it/s][A
185it [00:04, 44.87it/s][A
190it [00:04, 43.96it/s][A
195it [00:04, 44.31it/s][A
200it [00:04, 44.85it/s][A
205it [00:04, 44.87it/s][A

Epoch: 214, Step: 200, Loss: 4.591495132446289



210it [00:04, 43.57it/s][A
215it [00:04, 44.25it/s][A
220it [00:04, 44.81it/s][A
227it [00:05, 44.88it/s]
 43%|████▎     | 214/500 [25:08<28:35,  6.00s/it]
0it [00:00, ?it/s][A
5it [00:00, 41.82it/s][A
10it [00:00, 43.77it/s][A
15it [00:00, 45.06it/s][A
20it [00:00, 45.36it/s][A
25it [00:00, 45.47it/s][A
30it [00:00, 45.36it/s][A
35it [00:00, 45.06it/s][A
40it [00:00, 45.34it/s][A
45it [00:00, 45.60it/s][A
50it [00:01, 45.56it/s][A
55it [00:01, 45.53it/s][A
60it [00:01, 45.59it/s][A
65it [00:01, 45.46it/s][A
70it [00:01, 45.61it/s][A
75it [00:01, 45.75it/s][A
80it [00:01, 44.75it/s][A
85it [00:01, 45.16it/s][A
90it [00:01, 45.44it/s][A
95it [00:02, 45.76it/s][A
100it [00:02, 45.84it/s][A
105it [00:02, 45.71it/s][A

Epoch: 215, Step: 100, Loss: 4.56986738204956



110it [00:02, 45.65it/s][A
115it [00:02, 45.90it/s][A
120it [00:02, 46.03it/s][A
125it [00:02, 45.39it/s][A
130it [00:02, 44.78it/s][A
135it [00:02, 44.56it/s][A
140it [00:03, 44.72it/s][A
145it [00:03, 44.87it/s][A
150it [00:03, 44.76it/s][A
155it [00:03, 43.67it/s][A
160it [00:03, 44.35it/s][A
165it [00:03, 44.40it/s][A
170it [00:03, 44.25it/s][A
175it [00:03, 44.78it/s][A
180it [00:03, 45.06it/s][A
185it [00:04, 45.35it/s][A
190it [00:04, 45.46it/s][A
195it [00:04, 45.32it/s][A
200it [00:04, 45.40it/s][A
205it [00:04, 45.41it/s][A

Epoch: 215, Step: 200, Loss: 4.588175177574158



210it [00:04, 45.07it/s][A
215it [00:04, 45.23it/s][A
220it [00:04, 45.15it/s][A
227it [00:05, 45.13it/s]

0it [00:00, ?it/s][A
6it [00:00, 52.60it/s][A
12it [00:00, 56.61it/s][A
18it [00:00, 57.65it/s][A
24it [00:00, 58.14it/s][A
30it [00:00, 56.46it/s][A
36it [00:00, 56.38it/s][A
42it [00:00, 56.99it/s][A
49it [00:00, 58.17it/s][A
56it [00:00, 58.86it/s][A
62it [00:01, 56.84it/s][A
68it [00:01, 57.21it/s][A
74it [00:01, 57.72it/s][A
80it [00:01, 58.15it/s][A
86it [00:01, 56.82it/s][A
93it [00:01, 58.09it/s][A
99it [00:01, 57.23it/s][A
106it [00:01, 58.35it/s][A
112it [00:01, 58.74it/s][A
119it [00:02, 59.20it/s][A
125it [00:02, 59.31it/s][A
132it [00:02, 59.65it/s][A
139it [00:02, 59.88it/s][A
146it [00:02, 60.28it/s][A
153it [00:02, 59.89it/s][A
160it [00:02, 60.08it/s][A
167it [00:02, 60.20it/s][A
174it [00:02, 60.06it/s][A
181it [00:03, 59.98it/s][A
187it [00:03, 59.94it/s][A
194it [00:03, 60.14it/s][A
201it [00:03, 60.10it/s][A
208it [00:03, 60


Epoch: 215, Test Loss: 5.4630604812077115, Test Perplexity: 236.6032638312867




0it [00:00, ?it/s][A
5it [00:00, 45.87it/s][A
10it [00:00, 45.50it/s][A
15it [00:00, 45.40it/s][A
20it [00:00, 44.90it/s][A
25it [00:00, 43.99it/s][A
30it [00:00, 43.40it/s][A
35it [00:00, 42.91it/s][A
40it [00:00, 43.85it/s][A
45it [00:01, 44.37it/s][A
50it [00:01, 44.27it/s][A
55it [00:01, 44.55it/s][A
60it [00:01, 44.69it/s][A
65it [00:01, 44.44it/s][A
70it [00:01, 44.49it/s][A
75it [00:01, 44.33it/s][A
80it [00:01, 43.25it/s][A
85it [00:01, 42.39it/s][A
90it [00:02, 42.93it/s][A
95it [00:02, 43.58it/s][A
100it [00:02, 43.72it/s][A
105it [00:02, 43.93it/s][A

Epoch: 216, Step: 100, Loss: 4.577405114173889



110it [00:02, 43.41it/s][A
115it [00:02, 44.05it/s][A
120it [00:02, 44.33it/s][A
125it [00:02, 44.63it/s][A
130it [00:02, 44.77it/s][A
135it [00:03, 44.86it/s][A
140it [00:03, 44.83it/s][A
145it [00:03, 44.59it/s][A
150it [00:03, 44.89it/s][A
155it [00:03, 44.96it/s][A
160it [00:03, 45.11it/s][A
165it [00:03, 45.30it/s][A
170it [00:03, 44.08it/s][A
175it [00:03, 44.78it/s][A
180it [00:04, 45.15it/s][A
185it [00:04, 44.83it/s][A
190it [00:04, 44.72it/s][A
195it [00:04, 44.65it/s][A
200it [00:04, 44.87it/s][A
205it [00:04, 45.20it/s][A

Epoch: 216, Step: 200, Loss: 4.58625180721283



210it [00:04, 45.40it/s][A
215it [00:04, 45.57it/s][A
220it [00:04, 45.51it/s][A
227it [00:05, 44.50it/s]
 43%|████▎     | 216/500 [25:29<37:00,  7.82s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.38it/s][A
10it [00:00, 43.93it/s][A
15it [00:00, 43.26it/s][A
20it [00:00, 44.04it/s][A
25it [00:00, 44.22it/s][A
30it [00:00, 44.47it/s][A
35it [00:00, 44.56it/s][A
40it [00:00, 43.54it/s][A
45it [00:01, 43.77it/s][A
50it [00:01, 44.15it/s][A
55it [00:01, 44.55it/s][A
60it [00:01, 44.91it/s][A
65it [00:01, 45.23it/s][A
70it [00:01, 45.54it/s][A
75it [00:01, 45.56it/s][A
80it [00:01, 45.71it/s][A
85it [00:01, 45.79it/s][A
90it [00:02, 44.67it/s][A
95it [00:02, 44.85it/s][A
100it [00:02, 44.79it/s][A
105it [00:02, 45.25it/s][A

Epoch: 217, Step: 100, Loss: 4.573549513816833



110it [00:02, 45.16it/s][A
115it [00:02, 44.98it/s][A
120it [00:02, 44.84it/s][A
125it [00:02, 45.13it/s][A
130it [00:02, 45.24it/s][A
135it [00:03, 44.24it/s][A
140it [00:03, 44.93it/s][A
145it [00:03, 44.65it/s][A
150it [00:03, 44.84it/s][A
155it [00:03, 44.89it/s][A
160it [00:03, 45.10it/s][A
165it [00:03, 45.00it/s][A
170it [00:03, 45.40it/s][A
175it [00:03, 45.59it/s][A
180it [00:04, 45.65it/s][A
185it [00:04, 45.78it/s][A
190it [00:04, 45.71it/s][A
195it [00:04, 45.59it/s][A
200it [00:04, 45.62it/s][A
205it [00:04, 45.74it/s][A

Epoch: 217, Step: 200, Loss: 4.589562702178955



210it [00:04, 44.81it/s][A
215it [00:04, 43.75it/s][A
220it [00:04, 44.75it/s][A
227it [00:05, 44.88it/s]
 43%|████▎     | 217/500 [25:34<32:58,  6.99s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.97it/s][A
10it [00:00, 46.28it/s][A
15it [00:00, 46.22it/s][A
20it [00:00, 46.50it/s][A
25it [00:00, 46.32it/s][A
30it [00:00, 46.29it/s][A
35it [00:00, 46.53it/s][A
40it [00:00, 45.26it/s][A
45it [00:00, 45.67it/s][A
50it [00:01, 45.96it/s][A
55it [00:01, 45.89it/s][A
60it [00:01, 45.63it/s][A
65it [00:01, 45.77it/s][A
70it [00:01, 46.06it/s][A
75it [00:01, 45.95it/s][A
80it [00:01, 45.77it/s][A
85it [00:01, 46.05it/s][A
90it [00:01, 45.59it/s][A
95it [00:02, 44.76it/s][A
100it [00:02, 44.02it/s][A
105it [00:02, 44.28it/s][A

Epoch: 218, Step: 100, Loss: 4.582425222396851



110it [00:02, 44.24it/s][A
115it [00:02, 44.44it/s][A
120it [00:02, 44.69it/s][A
125it [00:02, 44.89it/s][A
130it [00:02, 44.89it/s][A
135it [00:02, 44.54it/s][A
140it [00:03, 42.95it/s][A
145it [00:03, 43.38it/s][A
150it [00:03, 42.43it/s][A
155it [00:03, 43.06it/s][A
160it [00:03, 43.69it/s][A
165it [00:03, 44.13it/s][A
170it [00:03, 44.44it/s][A
175it [00:03, 44.58it/s][A
180it [00:04, 44.76it/s][A
185it [00:04, 44.91it/s][A
190it [00:04, 44.95it/s][A
195it [00:04, 45.10it/s][A
200it [00:04, 45.16it/s][A
205it [00:04, 45.25it/s][A

Epoch: 218, Step: 200, Loss: 4.5853319215774535



210it [00:04, 45.27it/s][A
215it [00:04, 44.83it/s][A
220it [00:04, 45.09it/s][A
227it [00:05, 44.98it/s]
 44%|████▎     | 218/500 [25:39<30:07,  6.41s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.79it/s][A
10it [00:00, 45.24it/s][A
15it [00:00, 43.94it/s][A
20it [00:00, 44.81it/s][A
25it [00:00, 45.10it/s][A
30it [00:00, 45.35it/s][A
35it [00:00, 45.62it/s][A
40it [00:00, 45.71it/s][A
45it [00:00, 45.77it/s][A
50it [00:01, 45.65it/s][A
55it [00:01, 45.42it/s][A
60it [00:01, 45.48it/s][A
65it [00:01, 45.22it/s][A
70it [00:01, 45.41it/s][A
75it [00:01, 44.41it/s][A
80it [00:01, 44.26it/s][A
85it [00:01, 44.70it/s][A
90it [00:02, 43.91it/s][A
95it [00:02, 44.03it/s][A
100it [00:02, 44.35it/s][A
105it [00:02, 44.78it/s][A

Epoch: 219, Step: 100, Loss: 4.573855214118957



110it [00:02, 44.83it/s][A
115it [00:02, 44.07it/s][A
120it [00:02, 44.29it/s][A
125it [00:02, 44.71it/s][A
130it [00:02, 45.04it/s][A
135it [00:03, 44.73it/s][A
140it [00:03, 43.29it/s][A
145it [00:03, 43.92it/s][A
150it [00:03, 43.98it/s][A
155it [00:03, 44.40it/s][A
160it [00:03, 44.76it/s][A
165it [00:03, 45.09it/s][A
170it [00:03, 44.83it/s][A
175it [00:03, 45.15it/s][A
180it [00:04, 45.02it/s][A
185it [00:04, 44.96it/s][A
190it [00:04, 44.85it/s][A
195it [00:04, 45.19it/s][A
200it [00:04, 45.36it/s][A
205it [00:04, 45.37it/s][A

Epoch: 219, Step: 200, Loss: 4.587757678031921



210it [00:04, 45.33it/s][A
215it [00:04, 45.58it/s][A
220it [00:04, 45.73it/s][A
227it [00:05, 44.92it/s]
 44%|████▍     | 219/500 [25:44<28:06,  6.00s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.74it/s][A
10it [00:00, 45.67it/s][A
15it [00:00, 44.89it/s][A
20it [00:00, 45.27it/s][A
25it [00:00, 45.48it/s][A
30it [00:00, 45.76it/s][A
35it [00:00, 45.95it/s][A
40it [00:00, 44.90it/s][A
45it [00:00, 45.43it/s][A
50it [00:01, 45.52it/s][A
55it [00:01, 45.46it/s][A
60it [00:01, 45.44it/s][A
65it [00:01, 45.38it/s][A
70it [00:01, 45.07it/s][A
75it [00:01, 45.21it/s][A
80it [00:01, 45.32it/s][A
85it [00:01, 45.49it/s][A
90it [00:01, 45.47it/s][A
95it [00:02, 45.23it/s][A
100it [00:02, 45.39it/s][A
105it [00:02, 45.41it/s][A

Epoch: 220, Step: 100, Loss: 4.570661411285401



110it [00:02, 45.58it/s][A
115it [00:02, 45.44it/s][A
120it [00:02, 45.55it/s][A
125it [00:02, 44.43it/s][A
130it [00:02, 45.01it/s][A
135it [00:02, 44.87it/s][A
140it [00:03, 44.56it/s][A
145it [00:03, 44.67it/s][A
150it [00:03, 43.84it/s][A
155it [00:03, 44.67it/s][A
160it [00:03, 44.81it/s][A
165it [00:03, 44.80it/s][A
170it [00:03, 45.08it/s][A
175it [00:03, 45.04it/s][A
180it [00:03, 43.58it/s][A
185it [00:04, 43.99it/s][A
190it [00:04, 44.38it/s][A
195it [00:04, 44.56it/s][A
200it [00:04, 44.35it/s][A
205it [00:04, 44.10it/s][A

Epoch: 220, Step: 200, Loss: 4.585714461803437



210it [00:04, 43.02it/s][A
215it [00:04, 42.48it/s][A
220it [00:04, 42.96it/s][A
227it [00:05, 44.73it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.49it/s][A
13it [00:00, 59.16it/s][A
20it [00:00, 59.78it/s][A
26it [00:00, 57.81it/s][A
32it [00:00, 57.76it/s][A
39it [00:00, 58.74it/s][A
46it [00:00, 59.42it/s][A
53it [00:00, 59.71it/s][A
59it [00:01, 57.40it/s][A
66it [00:01, 58.57it/s][A
72it [00:01, 58.28it/s][A
79it [00:01, 58.93it/s][A
86it [00:01, 59.42it/s][A
92it [00:01, 58.44it/s][A
99it [00:01, 59.14it/s][A
105it [00:01, 56.72it/s][A
111it [00:01, 57.58it/s][A
117it [00:02, 58.04it/s][A
124it [00:02, 58.87it/s][A
130it [00:02, 58.97it/s][A
137it [00:02, 59.38it/s][A
143it [00:02, 57.45it/s][A
149it [00:02, 57.80it/s][A
156it [00:02, 58.80it/s][A
162it [00:02, 58.80it/s][A
169it [00:02, 59.42it/s][A
175it [00:02, 58.90it/s][A
181it [00:03, 58.68it/s][A
187it [00:03, 58.95it/s][A
194it [00:03, 59.57it/s][A
201it [00:03, 59.73it/s][A
208it [00:03, 6


Epoch: 220, Test Loss: 5.4728348292178985, Test Perplexity: 238.97796429462315




0it [00:00, ?it/s][A
5it [00:00, 42.31it/s][A
10it [00:00, 43.42it/s][A
15it [00:00, 44.51it/s][A
20it [00:00, 43.58it/s][A
25it [00:00, 43.70it/s][A
30it [00:00, 44.24it/s][A
35it [00:00, 44.86it/s][A
40it [00:00, 44.40it/s][A
45it [00:01, 44.95it/s][A
50it [00:01, 44.68it/s][A
55it [00:01, 45.03it/s][A
60it [00:01, 44.89it/s][A
65it [00:01, 44.85it/s][A
70it [00:01, 45.03it/s][A
75it [00:01, 44.05it/s][A
80it [00:01, 44.54it/s][A
85it [00:01, 44.85it/s][A
90it [00:02, 44.83it/s][A
95it [00:02, 45.11it/s][A
100it [00:02, 44.85it/s][A
105it [00:02, 44.99it/s][A

Epoch: 221, Step: 100, Loss: 4.561671524047852



110it [00:02, 45.16it/s][A
115it [00:02, 45.36it/s][A
120it [00:02, 45.33it/s][A
125it [00:02, 45.12it/s][A
130it [00:02, 44.73it/s][A
135it [00:03, 45.02it/s][A
140it [00:03, 45.40it/s][A
145it [00:03, 45.42it/s][A
150it [00:03, 45.69it/s][A
155it [00:03, 45.90it/s][A
160it [00:03, 46.06it/s][A
165it [00:03, 46.12it/s][A
170it [00:03, 46.11it/s][A
175it [00:03, 46.28it/s][A
180it [00:03, 46.03it/s][A
185it [00:04, 45.97it/s][A
190it [00:04, 45.65it/s][A
195it [00:04, 45.82it/s][A
200it [00:04, 46.23it/s][A
205it [00:04, 46.33it/s][A

Epoch: 221, Step: 200, Loss: 4.582257418632508



210it [00:04, 46.41it/s][A
215it [00:04, 46.53it/s][A
220it [00:04, 46.21it/s][A
227it [00:05, 45.22it/s]
 44%|████▍     | 221/500 [26:05<36:19,  7.81s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.44it/s][A
10it [00:00, 45.89it/s][A
15it [00:00, 46.09it/s][A
20it [00:00, 46.22it/s][A
25it [00:00, 45.99it/s][A
30it [00:00, 44.99it/s][A
35it [00:00, 44.64it/s][A
40it [00:00, 44.79it/s][A
45it [00:01, 43.78it/s][A
50it [00:01, 44.26it/s][A
55it [00:01, 44.90it/s][A
60it [00:01, 45.43it/s][A
65it [00:01, 45.29it/s][A
70it [00:01, 45.48it/s][A
75it [00:01, 45.62it/s][A
80it [00:01, 45.78it/s][A
85it [00:01, 45.86it/s][A
90it [00:01, 45.98it/s][A
95it [00:02, 45.88it/s][A
100it [00:02, 46.06it/s][A
105it [00:02, 46.12it/s][A

Epoch: 222, Step: 100, Loss: 4.569985566139221



110it [00:02, 46.08it/s][A
115it [00:02, 46.29it/s][A
120it [00:02, 46.52it/s][A
125it [00:02, 46.45it/s][A
130it [00:02, 47.08it/s][A
135it [00:02, 47.53it/s][A
140it [00:03, 46.93it/s][A
145it [00:03, 46.22it/s][A
150it [00:03, 45.63it/s][A
155it [00:03, 45.68it/s][A
160it [00:03, 45.41it/s][A
165it [00:03, 45.44it/s][A
170it [00:03, 45.27it/s][A
175it [00:03, 45.22it/s][A
180it [00:03, 45.20it/s][A
185it [00:04, 45.04it/s][A
190it [00:04, 45.02it/s][A
195it [00:04, 44.78it/s][A
200it [00:04, 44.56it/s][A
205it [00:04, 44.73it/s][A

Epoch: 222, Step: 200, Loss: 4.584848668575287



210it [00:04, 44.73it/s][A
215it [00:04, 44.54it/s][A
220it [00:04, 44.93it/s][A
227it [00:04, 45.45it/s]
 44%|████▍     | 222/500 [26:10<32:16,  6.97s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.14it/s][A
10it [00:00, 42.38it/s][A
15it [00:00, 43.93it/s][A
20it [00:00, 43.36it/s][A
25it [00:00, 44.32it/s][A
30it [00:00, 44.51it/s][A
35it [00:00, 43.89it/s][A
40it [00:00, 44.52it/s][A
45it [00:01, 44.67it/s][A
50it [00:01, 43.84it/s][A
55it [00:01, 42.38it/s][A
60it [00:01, 42.33it/s][A
65it [00:01, 42.79it/s][A
70it [00:01, 43.46it/s][A
75it [00:01, 43.90it/s][A
80it [00:01, 44.35it/s][A
85it [00:01, 44.63it/s][A
90it [00:02, 44.74it/s][A
95it [00:02, 44.53it/s][A
100it [00:02, 44.79it/s][A
105it [00:02, 44.93it/s][A

Epoch: 223, Step: 100, Loss: 4.574692149162292



110it [00:02, 44.89it/s][A
115it [00:02, 44.12it/s][A
120it [00:02, 44.67it/s][A
125it [00:02, 44.55it/s][A
130it [00:02, 44.85it/s][A
135it [00:03, 44.75it/s][A
140it [00:03, 44.69it/s][A
145it [00:03, 44.86it/s][A
150it [00:03, 45.20it/s][A
155it [00:03, 45.54it/s][A
160it [00:03, 45.80it/s][A
165it [00:03, 45.77it/s][A
170it [00:03, 45.92it/s][A
175it [00:03, 46.05it/s][A
180it [00:04, 45.65it/s][A
185it [00:04, 45.40it/s][A
190it [00:04, 45.40it/s][A
195it [00:04, 45.16it/s][A
200it [00:04, 45.25it/s][A
205it [00:04, 45.08it/s][A

Epoch: 223, Step: 200, Loss: 4.580474495887756



210it [00:04, 43.76it/s][A
215it [00:04, 44.48it/s][A
220it [00:04, 44.92it/s][A
227it [00:05, 44.47it/s]
 45%|████▍     | 223/500 [26:15<29:35,  6.41s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.09it/s][A
10it [00:00, 45.53it/s][A
15it [00:00, 45.68it/s][A
20it [00:00, 45.50it/s][A
25it [00:00, 45.36it/s][A
30it [00:00, 45.66it/s][A
35it [00:00, 45.39it/s][A
40it [00:00, 45.47it/s][A
45it [00:00, 44.97it/s][A
50it [00:01, 44.90it/s][A
55it [00:01, 44.94it/s][A
60it [00:01, 45.01it/s][A
65it [00:01, 44.85it/s][A
70it [00:01, 44.92it/s][A
75it [00:01, 44.85it/s][A
80it [00:01, 45.03it/s][A
85it [00:01, 45.05it/s][A
90it [00:01, 45.04it/s][A
95it [00:02, 44.40it/s][A
100it [00:02, 44.54it/s][A
105it [00:02, 43.42it/s][A

Epoch: 224, Step: 100, Loss: 4.564328184127808



110it [00:02, 43.67it/s][A
115it [00:02, 44.24it/s][A
120it [00:02, 44.07it/s][A
125it [00:02, 44.49it/s][A
130it [00:02, 44.50it/s][A
135it [00:03, 44.53it/s][A
140it [00:03, 44.40it/s][A
145it [00:03, 44.66it/s][A
150it [00:03, 44.40it/s][A
155it [00:03, 44.57it/s][A
160it [00:03, 44.98it/s][A
165it [00:03, 44.98it/s][A
170it [00:03, 45.25it/s][A
175it [00:03, 45.05it/s][A
180it [00:04, 45.17it/s][A
185it [00:04, 45.32it/s][A
190it [00:04, 45.45it/s][A
195it [00:04, 45.35it/s][A
200it [00:04, 45.49it/s][A
205it [00:04, 45.56it/s][A

Epoch: 224, Step: 200, Loss: 4.583308515548706



210it [00:04, 45.19it/s][A
215it [00:04, 45.34it/s][A
220it [00:04, 45.54it/s][A
227it [00:05, 44.90it/s]
 45%|████▍     | 224/500 [26:20<27:37,  6.00s/it]
0it [00:00, ?it/s][A
4it [00:00, 39.78it/s][A
9it [00:00, 41.81it/s][A
14it [00:00, 43.67it/s][A
19it [00:00, 44.37it/s][A
24it [00:00, 43.22it/s][A
29it [00:00, 44.29it/s][A
34it [00:00, 44.63it/s][A
39it [00:00, 45.15it/s][A
44it [00:00, 45.11it/s][A
49it [00:01, 45.16it/s][A
54it [00:01, 45.21it/s][A
59it [00:01, 45.09it/s][A
64it [00:01, 44.72it/s][A
69it [00:01, 44.72it/s][A
74it [00:01, 44.65it/s][A
79it [00:01, 44.83it/s][A
84it [00:01, 44.91it/s][A
89it [00:01, 44.65it/s][A
94it [00:02, 43.41it/s][A
99it [00:02, 42.41it/s][A
104it [00:02, 42.86it/s][A


Epoch: 225, Step: 100, Loss: 4.565298528671264


109it [00:02, 43.38it/s][A
114it [00:02, 42.86it/s][A
119it [00:02, 43.19it/s][A
124it [00:02, 43.16it/s][A
129it [00:02, 43.72it/s][A
134it [00:03, 44.27it/s][A
139it [00:03, 44.85it/s][A
144it [00:03, 45.22it/s][A
149it [00:03, 45.24it/s][A
154it [00:03, 45.51it/s][A
159it [00:03, 45.63it/s][A
164it [00:03, 44.61it/s][A
169it [00:03, 44.89it/s][A
174it [00:03, 45.05it/s][A
179it [00:04, 45.26it/s][A
184it [00:04, 45.58it/s][A
189it [00:04, 45.69it/s][A
194it [00:04, 45.61it/s][A
199it [00:04, 45.74it/s][A
204it [00:04, 45.72it/s][A
209it [00:04, 45.77it/s][A

Epoch: 225, Step: 200, Loss: 4.58179176568985



214it [00:04, 45.61it/s][A
219it [00:04, 45.66it/s][A
227it [00:05, 44.65it/s]

0it [00:00, ?it/s][A
6it [00:00, 55.20it/s][A
12it [00:00, 57.34it/s][A
19it [00:00, 58.88it/s][A
26it [00:00, 59.70it/s][A
32it [00:00, 59.65it/s][A
38it [00:00, 59.43it/s][A
44it [00:00, 59.27it/s][A
51it [00:00, 59.83it/s][A
58it [00:00, 60.13it/s][A
65it [00:01, 60.38it/s][A
72it [00:01, 60.42it/s][A
79it [00:01, 60.40it/s][A
86it [00:01, 58.41it/s][A
92it [00:01, 58.59it/s][A
99it [00:01, 59.40it/s][A
105it [00:01, 59.30it/s][A
112it [00:01, 59.61it/s][A
118it [00:01, 59.63it/s][A
125it [00:02, 59.88it/s][A
131it [00:02, 59.71it/s][A
138it [00:02, 59.96it/s][A
145it [00:02, 60.15it/s][A
152it [00:02, 59.98it/s][A
158it [00:02, 58.12it/s][A
164it [00:02, 58.37it/s][A
170it [00:02, 58.49it/s][A
176it [00:02, 58.74it/s][A
182it [00:03, 57.38it/s][A
189it [00:03, 58.35it/s][A
196it [00:03, 59.08it/s][A
202it [00:03, 58.98it/s][A
209it [00:03, 59.44it/s][A
215it [00:03, 5


Epoch: 225, Test Loss: 5.4657454912706935, Test Perplexity: 237.35309231207236




0it [00:00, ?it/s][A
5it [00:00, 45.65it/s][A
10it [00:00, 45.88it/s][A
15it [00:00, 46.26it/s][A
20it [00:00, 45.15it/s][A
25it [00:00, 45.89it/s][A
30it [00:00, 46.09it/s][A
35it [00:00, 46.02it/s][A
40it [00:00, 45.68it/s][A
45it [00:00, 45.81it/s][A
50it [00:01, 45.40it/s][A
55it [00:01, 45.61it/s][A
60it [00:01, 45.98it/s][A
65it [00:01, 46.21it/s][A
70it [00:01, 46.32it/s][A
75it [00:01, 46.13it/s][A
80it [00:01, 45.97it/s][A
85it [00:01, 46.19it/s][A
90it [00:01, 46.21it/s][A
95it [00:02, 46.06it/s][A
100it [00:02, 46.17it/s][A
105it [00:02, 46.15it/s][A

Epoch: 226, Step: 100, Loss: 4.5631236600875855



110it [00:02, 46.09it/s][A
115it [00:02, 46.03it/s][A
120it [00:02, 46.10it/s][A
125it [00:02, 45.02it/s][A
130it [00:02, 45.36it/s][A
135it [00:02, 45.65it/s][A
140it [00:03, 45.79it/s][A
145it [00:03, 46.10it/s][A
150it [00:03, 46.35it/s][A
155it [00:03, 46.17it/s][A
160it [00:03, 45.87it/s][A
165it [00:03, 46.56it/s][A
170it [00:03, 46.81it/s][A
175it [00:03, 46.00it/s][A
180it [00:03, 45.97it/s][A
185it [00:04, 44.83it/s][A
190it [00:04, 45.17it/s][A
195it [00:04, 44.05it/s][A
200it [00:04, 43.11it/s][A
205it [00:04, 42.45it/s][A

Epoch: 226, Step: 200, Loss: 4.5782731342315675



210it [00:04, 43.21it/s][A
215it [00:04, 43.91it/s][A
220it [00:04, 44.21it/s][A
227it [00:04, 45.40it/s]
 45%|████▌     | 226/500 [26:41<35:35,  7.79s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.28it/s][A
10it [00:00, 45.73it/s][A
15it [00:00, 45.58it/s][A
20it [00:00, 45.25it/s][A
25it [00:00, 44.98it/s][A
30it [00:00, 45.24it/s][A
35it [00:00, 45.22it/s][A
40it [00:00, 43.24it/s][A
45it [00:01, 43.53it/s][A
50it [00:01, 43.82it/s][A
55it [00:01, 44.07it/s][A
60it [00:01, 43.87it/s][A
65it [00:01, 44.57it/s][A
70it [00:01, 45.12it/s][A
75it [00:01, 45.41it/s][A
80it [00:01, 45.48it/s][A
85it [00:01, 45.44it/s][A
90it [00:02, 44.42it/s][A
95it [00:02, 44.70it/s][A
100it [00:02, 44.97it/s][A
105it [00:02, 45.28it/s][A

Epoch: 227, Step: 100, Loss: 4.558564028739929



110it [00:02, 45.43it/s][A
115it [00:02, 45.39it/s][A
120it [00:02, 45.40it/s][A
125it [00:02, 43.86it/s][A
130it [00:02, 44.30it/s][A
135it [00:03, 44.57it/s][A
140it [00:03, 44.41it/s][A
145it [00:03, 44.45it/s][A
150it [00:03, 44.35it/s][A
155it [00:03, 44.14it/s][A
160it [00:03, 44.35it/s][A
165it [00:03, 44.56it/s][A
170it [00:03, 43.81it/s][A
175it [00:03, 43.94it/s][A
180it [00:04, 44.24it/s][A
185it [00:04, 44.40it/s][A
190it [00:04, 44.65it/s][A
195it [00:04, 44.10it/s][A
200it [00:04, 44.00it/s][A
205it [00:04, 44.02it/s][A

Epoch: 227, Step: 200, Loss: 4.576403694152832



210it [00:04, 43.81it/s][A
215it [00:04, 43.99it/s][A
220it [00:04, 44.63it/s][A
227it [00:05, 44.45it/s]
 45%|████▌     | 227/500 [26:46<31:47,  6.99s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.88it/s][A
10it [00:00, 45.20it/s][A
15it [00:00, 44.38it/s][A
20it [00:00, 45.20it/s][A
25it [00:00, 45.46it/s][A
30it [00:00, 45.50it/s][A
35it [00:00, 45.61it/s][A
40it [00:00, 43.57it/s][A
45it [00:01, 44.28it/s][A
50it [00:01, 44.87it/s][A
55it [00:01, 45.22it/s][A
60it [00:01, 45.54it/s][A
65it [00:01, 45.59it/s][A
70it [00:01, 45.86it/s][A
75it [00:01, 45.99it/s][A
80it [00:01, 46.06it/s][A
85it [00:01, 46.04it/s][A
90it [00:01, 45.66it/s][A
95it [00:02, 44.70it/s][A
100it [00:02, 44.82it/s][A
105it [00:02, 44.17it/s][A

Epoch: 228, Step: 100, Loss: 4.5692593240737915



110it [00:02, 44.58it/s][A
115it [00:02, 44.96it/s][A
120it [00:02, 45.30it/s][A
125it [00:02, 45.33it/s][A
130it [00:02, 45.50it/s][A
135it [00:02, 45.57it/s][A
140it [00:03, 45.28it/s][A
145it [00:03, 45.20it/s][A
150it [00:03, 45.38it/s][A
155it [00:03, 45.72it/s][A
160it [00:03, 45.88it/s][A
165it [00:03, 45.77it/s][A
170it [00:03, 45.80it/s][A
175it [00:03, 45.60it/s][A
180it [00:03, 45.66it/s][A
185it [00:04, 44.77it/s][A
190it [00:04, 44.42it/s][A
195it [00:04, 43.13it/s][A
200it [00:04, 43.97it/s][A
205it [00:04, 43.44it/s][A

Epoch: 228, Step: 200, Loss: 4.574950292110443



210it [00:04, 44.17it/s][A
215it [00:04, 44.59it/s][A
220it [00:04, 45.03it/s][A
227it [00:05, 45.03it/s]
 46%|████▌     | 228/500 [26:52<29:02,  6.41s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.30it/s][A
10it [00:00, 45.64it/s][A
15it [00:00, 45.18it/s][A
20it [00:00, 45.59it/s][A
25it [00:00, 45.32it/s][A
30it [00:00, 45.29it/s][A
35it [00:00, 44.99it/s][A
40it [00:00, 45.26it/s][A
45it [00:00, 44.91it/s][A
50it [00:01, 45.31it/s][A
55it [00:01, 44.24it/s][A
60it [00:01, 43.88it/s][A
65it [00:01, 44.37it/s][A
70it [00:01, 44.90it/s][A
75it [00:01, 45.33it/s][A
80it [00:01, 44.24it/s][A
85it [00:01, 44.53it/s][A
90it [00:02, 44.57it/s][A
95it [00:02, 44.66it/s][A
100it [00:02, 44.85it/s][A
105it [00:02, 45.15it/s][A

Epoch: 229, Step: 100, Loss: 4.573218574523926



110it [00:02, 45.22it/s][A
115it [00:02, 45.42it/s][A
120it [00:02, 45.55it/s][A
125it [00:02, 44.98it/s][A
130it [00:02, 45.01it/s][A
135it [00:03, 44.82it/s][A
140it [00:03, 44.73it/s][A
145it [00:03, 44.68it/s][A
150it [00:03, 44.57it/s][A
155it [00:03, 44.64it/s][A
160it [00:03, 44.55it/s][A
165it [00:03, 44.10it/s][A
170it [00:03, 44.28it/s][A
175it [00:03, 44.35it/s][A
180it [00:04, 44.63it/s][A
185it [00:04, 44.62it/s][A
190it [00:04, 45.04it/s][A
195it [00:04, 44.88it/s][A
200it [00:04, 45.08it/s][A
205it [00:04, 44.60it/s][A

Epoch: 229, Step: 200, Loss: 4.579794628620148



210it [00:04, 43.67it/s][A
215it [00:04, 43.90it/s][A
220it [00:04, 43.99it/s][A
227it [00:05, 44.70it/s]
 46%|████▌     | 229/500 [26:57<27:08,  6.01s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.68it/s][A
10it [00:00, 45.72it/s][A
15it [00:00, 45.36it/s][A
20it [00:00, 45.44it/s][A
25it [00:00, 43.96it/s][A
30it [00:00, 44.35it/s][A
35it [00:00, 44.12it/s][A
40it [00:00, 44.23it/s][A
45it [00:01, 44.34it/s][A
50it [00:01, 44.77it/s][A
55it [00:01, 45.15it/s][A
60it [00:01, 45.28it/s][A
65it [00:01, 45.41it/s][A
70it [00:01, 45.48it/s][A
75it [00:01, 44.31it/s][A
80it [00:01, 44.14it/s][A
85it [00:01, 44.54it/s][A
90it [00:02, 44.69it/s][A
95it [00:02, 44.96it/s][A
100it [00:02, 45.35it/s][A
105it [00:02, 45.55it/s][A

Epoch: 230, Step: 100, Loss: 4.560625290870666



110it [00:02, 45.11it/s][A
115it [00:02, 45.23it/s][A
120it [00:02, 45.27it/s][A
125it [00:02, 45.12it/s][A
130it [00:02, 45.31it/s][A
135it [00:03, 45.49it/s][A
140it [00:03, 45.15it/s][A
145it [00:03, 45.21it/s][A
150it [00:03, 45.15it/s][A
155it [00:03, 45.27it/s][A
160it [00:03, 45.20it/s][A
165it [00:03, 45.45it/s][A
170it [00:03, 45.52it/s][A
175it [00:03, 45.18it/s][A
180it [00:03, 45.33it/s][A
185it [00:04, 45.09it/s][A
190it [00:04, 45.33it/s][A
195it [00:04, 44.97it/s][A
200it [00:04, 45.30it/s][A
205it [00:04, 44.72it/s][A

Epoch: 230, Step: 200, Loss: 4.575642938613892



210it [00:04, 44.75it/s][A
215it [00:04, 44.34it/s][A
220it [00:04, 44.76it/s][A
227it [00:05, 44.95it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.28it/s][A
12it [00:00, 56.24it/s][A
19it [00:00, 58.54it/s][A
25it [00:00, 57.85it/s][A
32it [00:00, 58.77it/s][A
39it [00:00, 59.46it/s][A
46it [00:00, 60.02it/s][A
52it [00:00, 57.59it/s][A
59it [00:01, 58.70it/s][A
66it [00:01, 59.22it/s][A
73it [00:01, 59.73it/s][A
80it [00:01, 59.96it/s][A
87it [00:01, 60.11it/s][A
94it [00:01, 60.25it/s][A
101it [00:01, 59.14it/s][A
107it [00:01, 59.31it/s][A
113it [00:01, 59.47it/s][A
120it [00:02, 59.82it/s][A
127it [00:02, 60.05it/s][A
134it [00:02, 58.50it/s][A
141it [00:02, 59.16it/s][A
148it [00:02, 59.65it/s][A
155it [00:02, 59.94it/s][A
161it [00:02, 59.83it/s][A
167it [00:02, 58.48it/s][A
174it [00:02, 59.20it/s][A
181it [00:03, 59.69it/s][A
187it [00:03, 59.46it/s][A
193it [00:03, 58.78it/s][A
200it [00:03, 59.40it/s][A
206it [00:03, 58.59it/s][A
213it [00:03, 


Epoch: 230, Test Loss: 5.467398390266466, Test Perplexity: 237.70967237105282




0it [00:00, ?it/s][A
5it [00:00, 42.21it/s][A
10it [00:00, 44.13it/s][A
15it [00:00, 44.41it/s][A
20it [00:00, 44.84it/s][A
25it [00:00, 44.53it/s][A
30it [00:00, 45.15it/s][A
35it [00:00, 45.14it/s][A
40it [00:00, 45.22it/s][A
45it [00:01, 45.34it/s][A
50it [00:01, 45.20it/s][A
55it [00:01, 45.44it/s][A
60it [00:01, 45.50it/s][A
65it [00:01, 45.80it/s][A
70it [00:01, 45.79it/s][A
75it [00:01, 44.59it/s][A
80it [00:01, 45.03it/s][A
85it [00:01, 45.13it/s][A
90it [00:01, 45.21it/s][A
95it [00:02, 45.27it/s][A
100it [00:02, 45.34it/s][A
105it [00:02, 45.36it/s][A

Epoch: 231, Step: 100, Loss: 4.559993591308594



110it [00:02, 45.28it/s][A
115it [00:02, 45.48it/s][A
120it [00:02, 45.54it/s][A
125it [00:02, 45.21it/s][A
130it [00:02, 43.94it/s][A
135it [00:02, 44.53it/s][A
140it [00:03, 44.89it/s][A
145it [00:03, 43.73it/s][A
150it [00:03, 44.20it/s][A
155it [00:03, 44.72it/s][A
160it [00:03, 44.63it/s][A
165it [00:03, 44.90it/s][A
170it [00:03, 45.16it/s][A
175it [00:03, 44.92it/s][A
180it [00:03, 45.42it/s][A
185it [00:04, 45.70it/s][A
190it [00:04, 45.88it/s][A
195it [00:04, 45.97it/s][A
200it [00:04, 45.65it/s][A
205it [00:04, 45.62it/s][A

Epoch: 231, Step: 200, Loss: 4.572317733764648



210it [00:04, 45.60it/s][A
215it [00:04, 45.42it/s][A
220it [00:04, 44.18it/s][A
227it [00:05, 45.01it/s]
 46%|████▌     | 231/500 [27:18<34:56,  7.79s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.10it/s][A
10it [00:00, 44.03it/s][A
15it [00:00, 44.26it/s][A
20it [00:00, 44.53it/s][A
25it [00:00, 44.51it/s][A
30it [00:00, 44.45it/s][A
35it [00:00, 44.60it/s][A
40it [00:00, 44.89it/s][A
45it [00:01, 45.03it/s][A
50it [00:01, 44.38it/s][A
55it [00:01, 44.50it/s][A
60it [00:01, 44.53it/s][A
65it [00:01, 44.35it/s][A
70it [00:01, 44.22it/s][A
75it [00:01, 44.21it/s][A
80it [00:01, 44.53it/s][A
85it [00:01, 44.91it/s][A
90it [00:02, 44.97it/s][A
95it [00:02, 45.16it/s][A
100it [00:02, 44.12it/s][A
105it [00:02, 44.63it/s][A

Epoch: 232, Step: 100, Loss: 4.554596214294434



110it [00:02, 43.87it/s][A
115it [00:02, 43.58it/s][A
120it [00:02, 43.19it/s][A
125it [00:02, 42.40it/s][A
130it [00:02, 42.95it/s][A
135it [00:03, 43.51it/s][A
140it [00:03, 43.80it/s][A
145it [00:03, 44.41it/s][A
150it [00:03, 43.17it/s][A
155it [00:03, 44.01it/s][A
160it [00:03, 44.48it/s][A
165it [00:03, 44.54it/s][A
170it [00:03, 44.75it/s][A
175it [00:03, 44.66it/s][A
180it [00:04, 44.98it/s][A
185it [00:04, 45.02it/s][A
190it [00:04, 45.02it/s][A
195it [00:04, 44.87it/s][A
200it [00:04, 45.06it/s][A
205it [00:04, 45.01it/s][A

Epoch: 232, Step: 200, Loss: 4.574945800304413



210it [00:04, 44.72it/s][A
215it [00:04, 45.00it/s][A
220it [00:04, 45.01it/s][A
227it [00:05, 44.41it/s]
 46%|████▋     | 232/500 [27:23<31:13,  6.99s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.48it/s][A
10it [00:00, 46.24it/s][A
15it [00:00, 45.82it/s][A
20it [00:00, 44.56it/s][A
25it [00:00, 44.90it/s][A
30it [00:00, 44.60it/s][A
35it [00:00, 44.29it/s][A
40it [00:00, 44.86it/s][A
45it [00:01, 44.83it/s][A
50it [00:01, 44.86it/s][A
55it [00:01, 44.77it/s][A
60it [00:01, 45.09it/s][A
65it [00:01, 44.93it/s][A
70it [00:01, 44.85it/s][A
75it [00:01, 44.79it/s][A
80it [00:01, 44.60it/s][A
85it [00:01, 44.87it/s][A
90it [00:02, 44.71it/s][A
95it [00:02, 44.81it/s][A
100it [00:02, 44.96it/s][A
105it [00:02, 45.01it/s][A

Epoch: 233, Step: 100, Loss: 4.551572833061218



110it [00:02, 44.70it/s][A
115it [00:02, 45.06it/s][A
120it [00:02, 44.85it/s][A
125it [00:02, 44.82it/s][A
130it [00:02, 44.91it/s][A
135it [00:03, 45.04it/s][A
140it [00:03, 44.97it/s][A
145it [00:03, 45.01it/s][A
150it [00:03, 44.84it/s][A
155it [00:03, 45.12it/s][A
160it [00:03, 45.03it/s][A
165it [00:03, 45.10it/s][A
170it [00:03, 45.11it/s][A
175it [00:03, 44.51it/s][A
180it [00:04, 43.85it/s][A
185it [00:04, 43.65it/s][A
190it [00:04, 44.16it/s][A
195it [00:04, 44.78it/s][A
200it [00:04, 45.13it/s][A
205it [00:04, 45.04it/s][A

Epoch: 233, Step: 200, Loss: 4.576173431873322



210it [00:04, 44.96it/s][A
215it [00:04, 45.13it/s][A
220it [00:04, 44.55it/s][A
227it [00:05, 44.83it/s]
 47%|████▋     | 233/500 [27:28<28:32,  6.41s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.11it/s][A
10it [00:00, 46.32it/s][A
15it [00:00, 46.38it/s][A
20it [00:00, 46.22it/s][A
25it [00:00, 44.72it/s][A
30it [00:00, 44.82it/s][A
35it [00:00, 45.16it/s][A
40it [00:00, 45.34it/s][A
45it [00:00, 45.30it/s][A
50it [00:01, 45.63it/s][A
55it [00:01, 45.86it/s][A
60it [00:01, 45.71it/s][A
65it [00:01, 45.39it/s][A
70it [00:01, 45.23it/s][A
75it [00:01, 45.21it/s][A
80it [00:01, 45.34it/s][A
85it [00:01, 44.05it/s][A
90it [00:01, 44.38it/s][A
95it [00:02, 44.81it/s][A
100it [00:02, 44.95it/s][A
105it [00:02, 44.96it/s][A

Epoch: 234, Step: 100, Loss: 4.560834250450134



110it [00:02, 44.81it/s][A
115it [00:02, 44.94it/s][A
120it [00:02, 45.28it/s][A
125it [00:02, 44.40it/s][A
130it [00:02, 43.89it/s][A
135it [00:03, 43.77it/s][A
140it [00:03, 44.14it/s][A
145it [00:03, 44.14it/s][A
150it [00:03, 44.29it/s][A
155it [00:03, 44.41it/s][A
160it [00:03, 44.38it/s][A
165it [00:03, 44.38it/s][A
170it [00:03, 44.32it/s][A
175it [00:03, 44.21it/s][A
180it [00:04, 44.13it/s][A
185it [00:04, 44.24it/s][A
190it [00:04, 44.35it/s][A
195it [00:04, 44.37it/s][A
200it [00:04, 43.57it/s][A
205it [00:04, 44.36it/s][A

Epoch: 234, Step: 200, Loss: 4.5741772055625916



210it [00:04, 43.08it/s][A
215it [00:04, 43.23it/s][A
220it [00:04, 43.77it/s][A
227it [00:05, 44.60it/s]
 47%|████▋     | 234/500 [27:33<26:40,  6.02s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.56it/s][A
10it [00:00, 44.05it/s][A
15it [00:00, 44.67it/s][A
20it [00:00, 42.67it/s][A
25it [00:00, 43.12it/s][A
30it [00:00, 43.96it/s][A
35it [00:00, 44.11it/s][A
40it [00:00, 44.41it/s][A
45it [00:01, 44.36it/s][A
50it [00:01, 44.72it/s][A
55it [00:01, 44.57it/s][A
60it [00:01, 43.49it/s][A
65it [00:01, 43.91it/s][A
70it [00:01, 44.42it/s][A
75it [00:01, 44.81it/s][A
80it [00:01, 45.10it/s][A
85it [00:01, 45.14it/s][A
90it [00:02, 45.24it/s][A
95it [00:02, 45.45it/s][A
100it [00:02, 45.51it/s][A
105it [00:02, 45.64it/s][A

Epoch: 235, Step: 100, Loss: 4.555517120361328



110it [00:02, 44.81it/s][A
115it [00:02, 45.17it/s][A
120it [00:02, 44.27it/s][A
125it [00:02, 44.52it/s][A
130it [00:02, 44.78it/s][A
135it [00:03, 44.62it/s][A
140it [00:03, 44.95it/s][A
145it [00:03, 44.05it/s][A
150it [00:03, 44.32it/s][A
155it [00:03, 43.61it/s][A
160it [00:03, 44.32it/s][A
165it [00:03, 44.34it/s][A
170it [00:03, 44.60it/s][A
175it [00:03, 44.76it/s][A
180it [00:04, 45.11it/s][A
185it [00:04, 45.34it/s][A
190it [00:04, 44.24it/s][A
195it [00:04, 44.60it/s][A
200it [00:04, 44.77it/s][A
205it [00:04, 45.02it/s][A

Epoch: 235, Step: 200, Loss: 4.570631487369537



210it [00:04, 44.85it/s][A
215it [00:04, 44.44it/s][A
220it [00:04, 44.73it/s][A
227it [00:05, 44.58it/s]

0it [00:00, ?it/s][A
6it [00:00, 55.33it/s][A
12it [00:00, 57.40it/s][A
18it [00:00, 58.29it/s][A
25it [00:00, 59.34it/s][A
32it [00:00, 59.73it/s][A
39it [00:00, 60.05it/s][A
46it [00:00, 60.46it/s][A
53it [00:00, 60.51it/s][A
60it [00:01, 60.65it/s][A
67it [00:01, 60.22it/s][A
74it [00:01, 60.51it/s][A
81it [00:01, 60.73it/s][A
88it [00:01, 60.95it/s][A
95it [00:01, 61.06it/s][A
102it [00:01, 61.27it/s][A
109it [00:01, 59.00it/s][A
116it [00:01, 59.85it/s][A
123it [00:02, 58.03it/s][A
129it [00:02, 57.25it/s][A
136it [00:02, 58.56it/s][A
143it [00:02, 59.22it/s][A
150it [00:02, 59.74it/s][A
157it [00:02, 60.07it/s][A
164it [00:02, 60.48it/s][A
171it [00:02, 60.81it/s][A
178it [00:02, 60.83it/s][A
185it [00:03, 58.65it/s][A
192it [00:03, 59.26it/s][A
199it [00:03, 59.68it/s][A
205it [00:03, 59.72it/s][A
212it [00:03, 60.00it/s][A
219it [00:03, 


Epoch: 235, Test Loss: 5.472055798731976, Test Perplexity: 238.76840347384814




0it [00:00, ?it/s][A
5it [00:00, 45.75it/s][A
10it [00:00, 45.41it/s][A
15it [00:00, 45.81it/s][A
20it [00:00, 45.67it/s][A
25it [00:00, 45.69it/s][A
30it [00:00, 45.71it/s][A
35it [00:00, 45.68it/s][A
40it [00:00, 45.32it/s][A
45it [00:00, 44.93it/s][A
50it [00:01, 44.67it/s][A
55it [00:01, 44.90it/s][A
60it [00:01, 45.09it/s][A
65it [00:01, 44.67it/s][A
70it [00:01, 45.07it/s][A
75it [00:01, 45.38it/s][A
80it [00:01, 45.28it/s][A
85it [00:01, 45.06it/s][A
90it [00:01, 45.04it/s][A
95it [00:02, 45.00it/s][A
100it [00:02, 45.06it/s][A
105it [00:02, 44.53it/s][A

Epoch: 236, Step: 100, Loss: 4.565484900474548



110it [00:02, 44.60it/s][A
115it [00:02, 44.72it/s][A
120it [00:02, 44.75it/s][A
125it [00:02, 44.89it/s][A
130it [00:02, 44.24it/s][A
135it [00:03, 44.42it/s][A
140it [00:03, 44.25it/s][A
145it [00:03, 44.06it/s][A
150it [00:03, 42.82it/s][A
155it [00:03, 43.14it/s][A
160it [00:03, 43.73it/s][A
165it [00:03, 43.65it/s][A
170it [00:03, 43.00it/s][A
175it [00:03, 42.51it/s][A
180it [00:04, 43.09it/s][A
185it [00:04, 42.48it/s][A
190it [00:04, 42.97it/s][A
195it [00:04, 42.87it/s][A
200it [00:04, 41.99it/s][A
205it [00:04, 41.81it/s][A

Epoch: 236, Step: 200, Loss: 4.571282584667205



210it [00:04, 42.39it/s][A
215it [00:04, 43.21it/s][A
220it [00:04, 43.98it/s][A
227it [00:05, 44.15it/s]
 47%|████▋     | 236/500 [27:54<34:28,  7.83s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.66it/s][A
10it [00:00, 42.87it/s][A
15it [00:00, 43.22it/s][A
20it [00:00, 43.63it/s][A
25it [00:00, 43.87it/s][A
30it [00:00, 43.83it/s][A
35it [00:00, 43.23it/s][A
40it [00:00, 43.42it/s][A
45it [00:01, 43.97it/s][A
50it [00:01, 44.23it/s][A
55it [00:01, 44.39it/s][A
60it [00:01, 42.97it/s][A
65it [00:01, 43.45it/s][A
70it [00:01, 43.88it/s][A
75it [00:01, 44.34it/s][A
80it [00:01, 44.51it/s][A
85it [00:01, 44.76it/s][A
90it [00:02, 44.71it/s][A
95it [00:02, 45.12it/s][A
100it [00:02, 44.95it/s][A
105it [00:02, 45.14it/s][A

Epoch: 237, Step: 100, Loss: 4.55996383190155



110it [00:02, 44.98it/s][A
115it [00:02, 44.95it/s][A
120it [00:02, 45.14it/s][A
125it [00:02, 44.96it/s][A
130it [00:02, 45.29it/s][A
135it [00:03, 45.27it/s][A
140it [00:03, 45.57it/s][A
145it [00:03, 45.22it/s][A
150it [00:03, 45.33it/s][A
155it [00:03, 44.98it/s][A
160it [00:03, 44.67it/s][A
165it [00:03, 45.09it/s][A
170it [00:03, 45.01it/s][A
175it [00:03, 45.14it/s][A
180it [00:04, 45.49it/s][A
185it [00:04, 45.68it/s][A
190it [00:04, 45.62it/s][A
195it [00:04, 45.54it/s][A
200it [00:04, 45.33it/s][A
205it [00:04, 45.32it/s][A

Epoch: 237, Step: 200, Loss: 4.573156094551086



210it [00:04, 44.64it/s][A
215it [00:04, 44.13it/s][A
220it [00:04, 44.47it/s][A
227it [00:05, 44.60it/s]
 47%|████▋     | 237/500 [27:59<30:44,  7.01s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.34it/s][A
10it [00:00, 45.62it/s][A
15it [00:00, 45.25it/s][A
20it [00:00, 45.32it/s][A
25it [00:00, 45.26it/s][A
30it [00:00, 45.47it/s][A
35it [00:00, 44.23it/s][A
40it [00:00, 44.72it/s][A
45it [00:00, 45.05it/s][A
50it [00:01, 45.30it/s][A
55it [00:01, 44.28it/s][A
60it [00:01, 44.66it/s][A
65it [00:01, 44.72it/s][A
70it [00:01, 44.91it/s][A
75it [00:01, 45.03it/s][A
80it [00:01, 44.93it/s][A
85it [00:01, 45.00it/s][A
90it [00:02, 45.05it/s][A
95it [00:02, 45.36it/s][A
100it [00:02, 45.65it/s][A
105it [00:02, 45.60it/s][A

Epoch: 238, Step: 100, Loss: 4.556123042106629



110it [00:02, 45.32it/s][A
115it [00:02, 45.41it/s][A
120it [00:02, 45.45it/s][A
125it [00:02, 45.39it/s][A
130it [00:02, 45.27it/s][A
135it [00:02, 45.43it/s][A
140it [00:03, 45.52it/s][A
145it [00:03, 45.60it/s][A
150it [00:03, 45.19it/s][A
155it [00:03, 45.39it/s][A
160it [00:03, 45.37it/s][A
165it [00:03, 45.40it/s][A
170it [00:03, 45.38it/s][A
175it [00:03, 44.96it/s][A
180it [00:03, 45.21it/s][A
185it [00:04, 45.31it/s][A
190it [00:04, 45.32it/s][A
195it [00:04, 45.30it/s][A
200it [00:04, 45.36it/s][A
205it [00:04, 45.01it/s][A

Epoch: 238, Step: 200, Loss: 4.572664513587951



210it [00:04, 43.54it/s][A
215it [00:04, 43.86it/s][A
220it [00:04, 44.14it/s][A
227it [00:05, 44.98it/s]
 48%|████▊     | 238/500 [28:04<28:02,  6.42s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.95it/s][A
10it [00:00, 45.24it/s][A
15it [00:00, 45.10it/s][A
20it [00:00, 45.09it/s][A
25it [00:00, 45.23it/s][A
30it [00:00, 44.76it/s][A
35it [00:00, 44.35it/s][A
40it [00:00, 44.42it/s][A
45it [00:01, 44.64it/s][A
50it [00:01, 44.95it/s][A
55it [00:01, 44.80it/s][A
60it [00:01, 44.74it/s][A
65it [00:01, 44.76it/s][A
70it [00:01, 44.76it/s][A
75it [00:01, 44.89it/s][A
80it [00:01, 44.63it/s][A
85it [00:01, 44.29it/s][A
90it [00:02, 44.49it/s][A
95it [00:02, 44.63it/s][A
100it [00:02, 44.78it/s][A
105it [00:02, 45.01it/s][A

Epoch: 239, Step: 100, Loss: 4.548583154678345



110it [00:02, 44.97it/s][A
115it [00:02, 45.15it/s][A
120it [00:02, 45.38it/s][A
125it [00:02, 45.51it/s][A
130it [00:02, 45.55it/s][A
135it [00:03, 45.20it/s][A
140it [00:03, 45.25it/s][A
145it [00:03, 45.26it/s][A
150it [00:03, 45.31it/s][A
155it [00:03, 43.84it/s][A
160it [00:03, 44.46it/s][A
165it [00:03, 44.88it/s][A
170it [00:03, 45.21it/s][A
175it [00:03, 45.58it/s][A
180it [00:04, 45.68it/s][A
185it [00:04, 45.63it/s][A
190it [00:04, 45.83it/s][A
195it [00:04, 45.06it/s][A
200it [00:04, 45.25it/s][A
205it [00:04, 45.33it/s][A

Epoch: 239, Step: 200, Loss: 4.56842691898346



210it [00:04, 45.16it/s][A
215it [00:04, 45.29it/s][A
220it [00:04, 45.46it/s][A
227it [00:05, 45.04it/s]
 48%|████▊     | 239/500 [28:09<26:08,  6.01s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.89it/s][A
10it [00:00, 45.48it/s][A
15it [00:00, 45.43it/s][A
20it [00:00, 45.12it/s][A
25it [00:00, 44.66it/s][A
30it [00:00, 44.86it/s][A
35it [00:00, 45.12it/s][A
40it [00:00, 45.23it/s][A
45it [00:00, 45.10it/s][A
50it [00:01, 45.33it/s][A
55it [00:01, 45.38it/s][A
60it [00:01, 45.24it/s][A
65it [00:01, 45.35it/s][A
70it [00:01, 45.61it/s][A
75it [00:01, 45.61it/s][A
80it [00:01, 45.61it/s][A
85it [00:01, 45.51it/s][A
90it [00:01, 45.59it/s][A
95it [00:02, 45.48it/s][A
100it [00:02, 45.69it/s][A
105it [00:02, 45.70it/s][A

Epoch: 240, Step: 100, Loss: 4.557170557975769



110it [00:02, 44.91it/s][A
115it [00:02, 45.28it/s][A
120it [00:02, 44.44it/s][A
125it [00:02, 43.25it/s][A
130it [00:02, 44.36it/s][A
135it [00:02, 44.13it/s][A
140it [00:03, 45.09it/s][A
145it [00:03, 45.86it/s][A
150it [00:03, 45.74it/s][A
155it [00:03, 46.01it/s][A
160it [00:03, 46.26it/s][A
165it [00:03, 46.22it/s][A
170it [00:03, 46.30it/s][A
175it [00:03, 46.46it/s][A
180it [00:03, 46.22it/s][A
185it [00:04, 46.30it/s][A
190it [00:04, 46.43it/s][A
195it [00:04, 46.27it/s][A
200it [00:04, 46.16it/s][A
205it [00:04, 46.31it/s][A

Epoch: 240, Step: 200, Loss: 4.57107342004776



210it [00:04, 46.25it/s][A
215it [00:04, 46.25it/s][A
220it [00:04, 46.40it/s][A
227it [00:04, 45.53it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.24it/s][A
12it [00:00, 58.67it/s][A
19it [00:00, 59.82it/s][A
25it [00:00, 57.62it/s][A
32it [00:00, 58.91it/s][A
39it [00:00, 59.77it/s][A
45it [00:00, 59.78it/s][A
51it [00:00, 59.71it/s][A
58it [00:00, 60.18it/s][A
65it [00:01, 59.38it/s][A
71it [00:01, 59.37it/s][A
78it [00:01, 59.94it/s][A
85it [00:01, 60.58it/s][A
92it [00:01, 60.54it/s][A
99it [00:01, 60.80it/s][A
106it [00:01, 59.07it/s][A
113it [00:01, 60.35it/s][A
120it [00:02, 59.59it/s][A
127it [00:02, 60.28it/s][A
134it [00:02, 58.33it/s][A
140it [00:02, 58.33it/s][A
146it [00:02, 57.27it/s][A
152it [00:02, 57.86it/s][A
158it [00:02, 58.46it/s][A
164it [00:02, 57.89it/s][A
171it [00:02, 58.84it/s][A
178it [00:03, 59.37it/s][A
185it [00:03, 59.78it/s][A
191it [00:03, 59.49it/s][A
197it [00:03, 57.64it/s][A
203it [00:03, 57.86it/s][A
209it [00:03, 5


Epoch: 240, Test Loss: 5.475783903406273, Test Perplexity: 239.68830786284454




0it [00:00, ?it/s][A
4it [00:00, 39.52it/s][A
9it [00:00, 42.94it/s][A
14it [00:00, 43.28it/s][A
19it [00:00, 44.18it/s][A
24it [00:00, 44.29it/s][A
29it [00:00, 44.50it/s][A
34it [00:00, 43.80it/s][A
39it [00:00, 44.22it/s][A
44it [00:00, 44.68it/s][A
49it [00:01, 45.02it/s][A
54it [00:01, 45.17it/s][A
59it [00:01, 45.20it/s][A
64it [00:01, 45.31it/s][A
69it [00:01, 45.23it/s][A
74it [00:01, 45.29it/s][A
79it [00:01, 45.36it/s][A
84it [00:01, 45.36it/s][A
89it [00:01, 45.43it/s][A
94it [00:02, 45.61it/s][A
99it [00:02, 45.57it/s][A
104it [00:02, 45.60it/s][A
109it [00:02, 45.68it/s][A

Epoch: 241, Step: 100, Loss: 4.552192301750183



114it [00:02, 45.25it/s][A
119it [00:02, 45.45it/s][A
124it [00:02, 44.88it/s][A
129it [00:02, 44.22it/s][A
134it [00:02, 43.32it/s][A
139it [00:03, 43.99it/s][A
144it [00:03, 44.27it/s][A
149it [00:03, 44.35it/s][A
154it [00:03, 44.62it/s][A
159it [00:03, 44.28it/s][A
164it [00:03, 44.63it/s][A
169it [00:03, 44.52it/s][A
174it [00:03, 43.98it/s][A
179it [00:04, 44.19it/s][A
184it [00:04, 43.86it/s][A
189it [00:04, 42.98it/s][A
194it [00:04, 43.72it/s][A
199it [00:04, 43.87it/s][A
204it [00:04, 44.24it/s][A

Epoch: 241, Step: 200, Loss: 4.5685117101669315



209it [00:04, 43.35it/s][A
214it [00:04, 43.13it/s][A
219it [00:04, 43.68it/s][A
227it [00:05, 44.42it/s]
 48%|████▊     | 241/500 [28:30<33:41,  7.80s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.45it/s][A
10it [00:00, 45.48it/s][A
15it [00:00, 44.39it/s][A
20it [00:00, 44.89it/s][A
25it [00:00, 45.00it/s][A
30it [00:00, 45.16it/s][A
35it [00:00, 43.48it/s][A
40it [00:00, 44.24it/s][A
45it [00:01, 44.37it/s][A
50it [00:01, 44.74it/s][A
55it [00:01, 44.73it/s][A
60it [00:01, 44.62it/s][A
65it [00:01, 44.59it/s][A
70it [00:01, 45.05it/s][A
75it [00:01, 45.26it/s][A
80it [00:01, 45.42it/s][A
85it [00:01, 45.43it/s][A
90it [00:02, 45.35it/s][A
95it [00:02, 45.26it/s][A
100it [00:02, 45.09it/s][A
105it [00:02, 45.14it/s][A

Epoch: 242, Step: 100, Loss: 4.557015419006348



110it [00:02, 45.20it/s][A
115it [00:02, 44.36it/s][A
120it [00:02, 44.51it/s][A
125it [00:02, 44.89it/s][A
130it [00:02, 45.35it/s][A
135it [00:03, 44.39it/s][A
140it [00:03, 44.56it/s][A
145it [00:03, 43.83it/s][A
150it [00:03, 44.42it/s][A
155it [00:03, 44.82it/s][A
160it [00:03, 44.60it/s][A
165it [00:03, 43.95it/s][A
170it [00:03, 44.56it/s][A
175it [00:03, 44.63it/s][A
180it [00:04, 44.22it/s][A
185it [00:04, 44.60it/s][A
190it [00:04, 44.77it/s][A
195it [00:04, 44.11it/s][A
200it [00:04, 44.73it/s][A
205it [00:04, 45.02it/s][A

Epoch: 242, Step: 200, Loss: 4.572249145507812



210it [00:04, 45.04it/s][A
215it [00:04, 45.18it/s][A
220it [00:04, 45.12it/s][A
227it [00:05, 44.77it/s]
 48%|████▊     | 242/500 [28:35<30:02,  6.98s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.68it/s][A
10it [00:00, 46.01it/s][A
15it [00:00, 44.80it/s][A
20it [00:00, 44.59it/s][A
25it [00:00, 44.82it/s][A
30it [00:00, 45.11it/s][A
35it [00:00, 45.29it/s][A
40it [00:00, 45.28it/s][A
45it [00:00, 45.45it/s][A
50it [00:01, 45.69it/s][A
55it [00:01, 45.80it/s][A
60it [00:01, 45.66it/s][A
65it [00:01, 45.42it/s][A
70it [00:01, 44.71it/s][A
75it [00:01, 44.71it/s][A
80it [00:01, 44.16it/s][A
85it [00:01, 44.53it/s][A
90it [00:01, 44.74it/s][A
95it [00:02, 44.89it/s][A
100it [00:02, 43.55it/s][A
105it [00:02, 43.43it/s][A

Epoch: 243, Step: 100, Loss: 4.550173449516296



110it [00:02, 44.09it/s][A
115it [00:02, 44.46it/s][A
120it [00:02, 44.23it/s][A
125it [00:02, 44.64it/s][A
130it [00:02, 44.62it/s][A
135it [00:03, 45.01it/s][A
140it [00:03, 45.11it/s][A
145it [00:03, 45.05it/s][A
150it [00:03, 44.10it/s][A
155it [00:03, 44.62it/s][A
160it [00:03, 45.13it/s][A
165it [00:03, 45.36it/s][A
170it [00:03, 45.14it/s][A
175it [00:03, 45.43it/s][A
180it [00:04, 44.48it/s][A
185it [00:04, 43.28it/s][A
190it [00:04, 43.83it/s][A
195it [00:04, 44.22it/s][A
200it [00:04, 44.62it/s][A
205it [00:04, 44.79it/s][A

Epoch: 243, Step: 200, Loss: 4.569741079807281



210it [00:04, 44.76it/s][A
215it [00:04, 43.86it/s][A
220it [00:04, 44.46it/s][A
227it [00:05, 44.65it/s]
 49%|████▊     | 243/500 [28:40<27:28,  6.42s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.41it/s][A
10it [00:00, 43.42it/s][A
15it [00:00, 43.15it/s][A
20it [00:00, 43.80it/s][A
25it [00:00, 44.14it/s][A
30it [00:00, 44.70it/s][A
35it [00:00, 45.03it/s][A
40it [00:00, 44.05it/s][A
45it [00:01, 44.40it/s][A
50it [00:01, 44.90it/s][A
55it [00:01, 45.13it/s][A
60it [00:01, 45.11it/s][A
65it [00:01, 45.25it/s][A
70it [00:01, 45.32it/s][A
75it [00:01, 45.37it/s][A
80it [00:01, 45.60it/s][A
85it [00:01, 45.62it/s][A
90it [00:02, 44.12it/s][A
95it [00:02, 44.50it/s][A
100it [00:02, 44.84it/s][A
105it [00:02, 45.11it/s][A

Epoch: 244, Step: 100, Loss: 4.552105135917664



110it [00:02, 45.21it/s][A
115it [00:02, 45.27it/s][A
120it [00:02, 45.44it/s][A
125it [00:02, 45.58it/s][A
130it [00:02, 45.48it/s][A
135it [00:03, 45.43it/s][A
140it [00:03, 45.22it/s][A
145it [00:03, 45.25it/s][A
150it [00:03, 45.11it/s][A
155it [00:03, 45.50it/s][A
160it [00:03, 45.70it/s][A
165it [00:03, 45.82it/s][A
170it [00:03, 45.96it/s][A
175it [00:03, 44.95it/s][A
180it [00:03, 45.15it/s][A
185it [00:04, 45.63it/s][A
190it [00:04, 45.79it/s][A
195it [00:04, 45.93it/s][A
200it [00:04, 45.66it/s][A
205it [00:04, 45.80it/s][A

Epoch: 244, Step: 200, Loss: 4.567891628742218



210it [00:04, 45.08it/s][A
215it [00:04, 45.60it/s][A
220it [00:04, 45.83it/s][A
227it [00:05, 45.18it/s]
 49%|████▉     | 244/500 [28:45<25:35,  6.00s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.04it/s][A
10it [00:00, 45.49it/s][A
15it [00:00, 46.17it/s][A
20it [00:00, 45.92it/s][A
25it [00:00, 46.07it/s][A
30it [00:00, 45.70it/s][A
35it [00:00, 45.95it/s][A
40it [00:00, 45.47it/s][A
45it [00:00, 44.90it/s][A
50it [00:01, 45.38it/s][A
55it [00:01, 45.67it/s][A
60it [00:01, 45.95it/s][A
65it [00:01, 46.06it/s][A
70it [00:01, 46.10it/s][A
75it [00:01, 46.18it/s][A
80it [00:01, 46.23it/s][A
85it [00:01, 46.22it/s][A
90it [00:01, 46.22it/s][A
95it [00:02, 46.32it/s][A
100it [00:02, 46.19it/s][A
105it [00:02, 46.09it/s][A

Epoch: 245, Step: 100, Loss: 4.558723158836365



110it [00:02, 46.12it/s][A
115it [00:02, 46.09it/s][A
120it [00:02, 45.04it/s][A
125it [00:02, 45.10it/s][A
130it [00:02, 45.38it/s][A
135it [00:02, 45.67it/s][A
140it [00:03, 45.57it/s][A
145it [00:03, 45.78it/s][A
150it [00:03, 45.27it/s][A
155it [00:03, 45.29it/s][A
160it [00:03, 45.15it/s][A
165it [00:03, 45.07it/s][A
170it [00:03, 45.18it/s][A
175it [00:03, 45.19it/s][A
180it [00:03, 45.26it/s][A
185it [00:04, 44.04it/s][A
190it [00:04, 44.32it/s][A
195it [00:04, 44.64it/s][A
200it [00:04, 43.84it/s][A
205it [00:04, 44.23it/s][A

Epoch: 245, Step: 200, Loss: 4.563758623600006



210it [00:04, 44.43it/s][A
215it [00:04, 44.94it/s][A
220it [00:04, 44.75it/s][A
227it [00:05, 45.39it/s]

0it [00:00, ?it/s][A
6it [00:00, 55.29it/s][A
13it [00:00, 58.64it/s][A
20it [00:00, 59.63it/s][A
27it [00:00, 59.89it/s][A
33it [00:00, 59.87it/s][A
40it [00:00, 60.21it/s][A
47it [00:00, 60.39it/s][A
54it [00:00, 59.34it/s][A
60it [00:01, 58.22it/s][A
66it [00:01, 58.35it/s][A
73it [00:01, 59.19it/s][A
79it [00:01, 58.95it/s][A
85it [00:01, 59.01it/s][A
91it [00:01, 57.54it/s][A
97it [00:01, 57.75it/s][A
103it [00:01, 57.54it/s][A
109it [00:01, 58.25it/s][A
116it [00:01, 59.04it/s][A
122it [00:02, 58.56it/s][A
129it [00:02, 59.18it/s][A
135it [00:02, 57.88it/s][A
142it [00:02, 58.73it/s][A
149it [00:02, 59.34it/s][A
156it [00:02, 59.62it/s][A
162it [00:02, 59.57it/s][A
169it [00:02, 60.01it/s][A
175it [00:02, 59.69it/s][A
182it [00:03, 60.10it/s][A
189it [00:03, 58.73it/s][A
195it [00:03, 57.00it/s][A
201it [00:03, 57.74it/s][A
207it [00:03, 5


Epoch: 245, Test Loss: 5.47070350484078, Test Perplexity: 238.48707521331977




0it [00:00, ?it/s][A
5it [00:00, 45.74it/s][A
10it [00:00, 45.47it/s][A
15it [00:00, 43.67it/s][A
20it [00:00, 43.39it/s][A
25it [00:00, 44.22it/s][A
30it [00:00, 44.10it/s][A
35it [00:00, 44.17it/s][A
40it [00:00, 44.32it/s][A
45it [00:01, 44.29it/s][A
50it [00:01, 44.42it/s][A
55it [00:01, 44.65it/s][A
60it [00:01, 44.96it/s][A
65it [00:01, 45.03it/s][A
70it [00:01, 45.18it/s][A
75it [00:01, 45.23it/s][A
80it [00:01, 45.56it/s][A
85it [00:01, 44.58it/s][A
90it [00:02, 44.81it/s][A
95it [00:02, 44.65it/s][A
100it [00:02, 44.80it/s][A
105it [00:02, 44.83it/s][A

Epoch: 246, Step: 100, Loss: 4.542213401794434



110it [00:02, 44.93it/s][A
115it [00:02, 45.07it/s][A
120it [00:02, 45.29it/s][A
125it [00:02, 45.47it/s][A
130it [00:02, 45.68it/s][A
135it [00:03, 45.78it/s][A
140it [00:03, 45.46it/s][A
145it [00:03, 45.35it/s][A
150it [00:03, 44.18it/s][A
155it [00:03, 44.33it/s][A
160it [00:03, 44.49it/s][A
165it [00:03, 44.44it/s][A
170it [00:03, 43.77it/s][A
175it [00:03, 44.22it/s][A
180it [00:04, 44.01it/s][A
185it [00:04, 44.68it/s][A
190it [00:04, 44.99it/s][A
195it [00:04, 44.85it/s][A
200it [00:04, 45.14it/s][A
205it [00:04, 45.21it/s][A

Epoch: 246, Step: 200, Loss: 4.566883358955383



210it [00:04, 45.23it/s][A
215it [00:04, 44.14it/s][A
220it [00:04, 44.59it/s][A
227it [00:05, 44.75it/s]
 49%|████▉     | 246/500 [29:06<33:03,  7.81s/it]
0it [00:00, ?it/s][A
5it [00:00, 41.67it/s][A
10it [00:00, 41.43it/s][A
15it [00:00, 43.25it/s][A
20it [00:00, 44.42it/s][A
25it [00:00, 45.05it/s][A
30it [00:00, 44.68it/s][A
35it [00:00, 44.90it/s][A
40it [00:00, 44.80it/s][A
45it [00:01, 44.88it/s][A
50it [00:01, 45.05it/s][A
55it [00:01, 45.12it/s][A
60it [00:01, 45.13it/s][A
65it [00:01, 44.95it/s][A
70it [00:01, 45.09it/s][A
75it [00:01, 45.08it/s][A
80it [00:01, 45.34it/s][A
85it [00:01, 45.60it/s][A
90it [00:02, 45.21it/s][A
95it [00:02, 45.54it/s][A
100it [00:02, 45.35it/s][A
105it [00:02, 45.28it/s][A

Epoch: 247, Step: 100, Loss: 4.559559941291809



110it [00:02, 45.11it/s][A
115it [00:02, 45.05it/s][A
120it [00:02, 45.06it/s][A
125it [00:02, 44.06it/s][A
130it [00:02, 44.67it/s][A
135it [00:03, 44.64it/s][A
140it [00:03, 44.74it/s][A
145it [00:03, 44.95it/s][A
150it [00:03, 45.07it/s][A
155it [00:03, 44.77it/s][A
160it [00:03, 44.62it/s][A
165it [00:03, 45.16it/s][A
170it [00:03, 44.79it/s][A
175it [00:03, 45.21it/s][A
180it [00:04, 44.98it/s][A
185it [00:04, 45.25it/s][A
190it [00:04, 45.06it/s][A
195it [00:04, 45.20it/s][A
200it [00:04, 44.91it/s][A
205it [00:04, 45.26it/s][A

Epoch: 247, Step: 200, Loss: 4.563697028160095



210it [00:04, 44.97it/s][A
215it [00:04, 45.23it/s][A
220it [00:04, 44.98it/s][A
227it [00:05, 44.84it/s]
 49%|████▉     | 247/500 [29:11<29:27,  6.99s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.44it/s][A
10it [00:00, 43.84it/s][A
15it [00:00, 44.78it/s][A
20it [00:00, 45.45it/s][A
25it [00:00, 45.70it/s][A
30it [00:00, 45.80it/s][A
35it [00:00, 45.80it/s][A
40it [00:00, 45.49it/s][A
45it [00:00, 45.89it/s][A
50it [00:01, 45.06it/s][A
55it [00:01, 45.28it/s][A
60it [00:01, 45.31it/s][A
65it [00:01, 45.39it/s][A
70it [00:01, 45.41it/s][A
75it [00:01, 45.40it/s][A
80it [00:01, 45.41it/s][A
85it [00:01, 45.57it/s][A
90it [00:01, 44.55it/s][A
95it [00:02, 44.82it/s][A
100it [00:02, 44.93it/s][A
105it [00:02, 44.32it/s][A

Epoch: 248, Step: 100, Loss: 4.545671105384827



110it [00:02, 44.42it/s][A
115it [00:02, 44.73it/s][A
120it [00:02, 44.40it/s][A
125it [00:02, 44.60it/s][A
130it [00:02, 44.93it/s][A
135it [00:02, 44.89it/s][A
140it [00:03, 45.02it/s][A
145it [00:03, 44.79it/s][A
150it [00:03, 44.25it/s][A
155it [00:03, 43.95it/s][A
160it [00:03, 43.69it/s][A
165it [00:03, 44.02it/s][A
170it [00:03, 42.88it/s][A
175it [00:03, 43.67it/s][A
180it [00:04, 43.09it/s][A
185it [00:04, 43.67it/s][A
190it [00:04, 44.16it/s][A
195it [00:04, 44.29it/s][A
200it [00:04, 44.91it/s][A
205it [00:04, 45.36it/s][A

Epoch: 248, Step: 200, Loss: 4.565112547874451



210it [00:04, 45.60it/s][A
215it [00:04, 45.97it/s][A
220it [00:04, 44.59it/s][A
227it [00:05, 44.78it/s]
 50%|████▉     | 248/500 [29:16<26:55,  6.41s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.51it/s][A
10it [00:00, 46.34it/s][A
15it [00:00, 44.79it/s][A
20it [00:00, 43.91it/s][A
25it [00:00, 44.94it/s][A
30it [00:00, 45.16it/s][A
35it [00:00, 43.94it/s][A
40it [00:00, 44.72it/s][A
45it [00:01, 45.27it/s][A
50it [00:01, 45.59it/s][A
55it [00:01, 44.69it/s][A
60it [00:01, 45.31it/s][A
65it [00:01, 45.49it/s][A
70it [00:01, 45.76it/s][A
75it [00:01, 45.41it/s][A
80it [00:01, 44.93it/s][A
85it [00:01, 45.39it/s][A
90it [00:01, 45.62it/s][A
95it [00:02, 45.46it/s][A
100it [00:02, 44.99it/s][A
105it [00:02, 44.17it/s][A

Epoch: 249, Step: 100, Loss: 4.5513733911514285



110it [00:02, 44.90it/s][A
115it [00:02, 45.25it/s][A
120it [00:02, 45.17it/s][A
125it [00:02, 45.27it/s][A
130it [00:02, 45.38it/s][A
135it [00:02, 45.36it/s][A
140it [00:03, 45.27it/s][A
145it [00:03, 45.18it/s][A
150it [00:03, 45.26it/s][A
155it [00:03, 45.32it/s][A
160it [00:03, 45.40it/s][A
165it [00:03, 45.23it/s][A
170it [00:03, 45.45it/s][A
175it [00:03, 45.22it/s][A
180it [00:03, 45.69it/s][A
185it [00:04, 45.57it/s][A
190it [00:04, 45.96it/s][A
195it [00:04, 44.42it/s][A
200it [00:04, 44.69it/s][A
205it [00:04, 44.20it/s][A

Epoch: 249, Step: 200, Loss: 4.568753950595855



210it [00:04, 44.41it/s][A
215it [00:04, 43.80it/s][A
220it [00:04, 44.10it/s][A
227it [00:05, 44.99it/s]
 50%|████▉     | 249/500 [29:21<25:06,  6.00s/it]
0it [00:00, ?it/s][A
5it [00:00, 41.88it/s][A
10it [00:00, 43.81it/s][A
15it [00:00, 44.64it/s][A
20it [00:00, 45.07it/s][A
25it [00:00, 44.89it/s][A
30it [00:00, 44.62it/s][A
35it [00:00, 45.15it/s][A
40it [00:00, 44.91it/s][A
45it [00:01, 44.00it/s][A
50it [00:01, 44.47it/s][A
55it [00:01, 44.51it/s][A
60it [00:01, 44.78it/s][A
65it [00:01, 45.05it/s][A
70it [00:01, 45.19it/s][A
75it [00:01, 45.20it/s][A
80it [00:01, 45.38it/s][A
85it [00:01, 45.15it/s][A
90it [00:02, 45.19it/s][A
95it [00:02, 45.07it/s][A
100it [00:02, 45.24it/s][A
105it [00:02, 45.46it/s][A

Epoch: 250, Step: 100, Loss: 4.541120443344116



110it [00:02, 45.25it/s][A
115it [00:02, 45.31it/s][A
120it [00:02, 45.06it/s][A
125it [00:02, 45.24it/s][A
130it [00:02, 45.43it/s][A
135it [00:03, 45.39it/s][A
140it [00:03, 45.63it/s][A
145it [00:03, 45.72it/s][A
150it [00:03, 45.81it/s][A
155it [00:03, 45.75it/s][A
160it [00:03, 45.34it/s][A
165it [00:03, 45.31it/s][A
170it [00:03, 45.13it/s][A
175it [00:03, 45.09it/s][A
180it [00:03, 44.83it/s][A
185it [00:04, 44.88it/s][A
190it [00:04, 45.21it/s][A
195it [00:04, 44.63it/s][A
200it [00:04, 44.03it/s][A
205it [00:04, 43.92it/s][A

Epoch: 250, Step: 200, Loss: 4.562890267372131



210it [00:04, 44.02it/s][A
215it [00:04, 44.11it/s][A
220it [00:04, 43.78it/s][A
227it [00:05, 44.83it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.28it/s][A
12it [00:00, 59.08it/s][A
18it [00:00, 55.28it/s][A
25it [00:00, 57.43it/s][A
32it [00:00, 58.57it/s][A
39it [00:00, 59.33it/s][A
45it [00:00, 57.98it/s][A
52it [00:00, 58.99it/s][A
59it [00:01, 59.43it/s][A
65it [00:01, 59.51it/s][A
71it [00:01, 59.17it/s][A
78it [00:01, 59.54it/s][A
84it [00:01, 56.84it/s][A
91it [00:01, 57.94it/s][A
98it [00:01, 58.80it/s][A
104it [00:01, 56.53it/s][A
111it [00:01, 57.65it/s][A
118it [00:02, 58.49it/s][A
124it [00:02, 58.11it/s][A
131it [00:02, 59.13it/s][A
137it [00:02, 57.15it/s][A
144it [00:02, 58.46it/s][A
150it [00:02, 58.74it/s][A
157it [00:02, 59.39it/s][A
164it [00:02, 59.70it/s][A
170it [00:02, 59.74it/s][A
176it [00:03, 58.36it/s][A
183it [00:03, 58.98it/s][A
189it [00:03, 58.33it/s][A
195it [00:03, 58.66it/s][A
201it [00:03, 58.63it/s][A
207it [00:03, 5


Epoch: 250, Test Loss: 5.478697535414133, Test Perplexity: 240.39058827465365




0it [00:00, ?it/s][A
5it [00:00, 44.53it/s][A
10it [00:00, 42.81it/s][A
15it [00:00, 43.50it/s][A
20it [00:00, 43.99it/s][A
25it [00:00, 44.18it/s][A
30it [00:00, 44.67it/s][A
35it [00:00, 45.00it/s][A
40it [00:00, 45.11it/s][A
45it [00:01, 45.03it/s][A
50it [00:01, 45.31it/s][A
55it [00:01, 45.21it/s][A
60it [00:01, 45.33it/s][A
65it [00:01, 45.38it/s][A
70it [00:01, 45.55it/s][A
75it [00:01, 45.16it/s][A
80it [00:01, 45.48it/s][A
85it [00:01, 44.33it/s][A
90it [00:02, 44.61it/s][A
95it [00:02, 44.92it/s][A
100it [00:02, 45.07it/s][A
105it [00:02, 45.35it/s][A

Epoch: 251, Step: 100, Loss: 4.5480366134643555



110it [00:02, 45.44it/s][A
115it [00:02, 45.63it/s][A
120it [00:02, 45.16it/s][A
125it [00:02, 45.27it/s][A
130it [00:02, 45.02it/s][A
135it [00:03, 45.06it/s][A
140it [00:03, 45.15it/s][A
145it [00:03, 45.15it/s][A
150it [00:03, 45.25it/s][A
155it [00:03, 45.25it/s][A
160it [00:03, 44.16it/s][A
165it [00:03, 44.54it/s][A
170it [00:03, 44.59it/s][A
175it [00:03, 44.72it/s][A
180it [00:04, 45.02it/s][A
185it [00:04, 44.72it/s][A
190it [00:04, 45.17it/s][A
195it [00:04, 45.29it/s][A
200it [00:04, 45.09it/s][A
205it [00:04, 44.75it/s][A

Epoch: 251, Step: 200, Loss: 4.562463345527649



210it [00:04, 43.92it/s][A
215it [00:04, 44.43it/s][A
220it [00:04, 44.60it/s][A
227it [00:05, 44.81it/s]
 50%|█████     | 251/500 [29:42<32:25,  7.81s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.82it/s][A
10it [00:00, 45.38it/s][A
15it [00:00, 43.42it/s][A
20it [00:00, 42.72it/s][A
25it [00:00, 43.66it/s][A
30it [00:00, 43.71it/s][A
35it [00:00, 44.34it/s][A
40it [00:00, 44.44it/s][A
45it [00:01, 44.78it/s][A
50it [00:01, 45.00it/s][A
55it [00:01, 44.06it/s][A
60it [00:01, 43.26it/s][A
65it [00:01, 43.66it/s][A
70it [00:01, 44.04it/s][A
75it [00:01, 44.48it/s][A
80it [00:01, 44.55it/s][A
85it [00:01, 44.62it/s][A
90it [00:02, 44.92it/s][A
95it [00:02, 45.19it/s][A
100it [00:02, 45.47it/s][A
105it [00:02, 45.48it/s][A

Epoch: 252, Step: 100, Loss: 4.548039784431458



110it [00:02, 45.33it/s][A
115it [00:02, 45.31it/s][A
120it [00:02, 45.37it/s][A
125it [00:02, 45.35it/s][A
130it [00:02, 45.23it/s][A
135it [00:03, 45.29it/s][A
140it [00:03, 45.51it/s][A
145it [00:03, 45.68it/s][A
150it [00:03, 44.66it/s][A
155it [00:03, 44.09it/s][A
160it [00:03, 44.69it/s][A
165it [00:03, 44.90it/s][A
170it [00:03, 45.27it/s][A
175it [00:03, 45.18it/s][A
180it [00:04, 44.63it/s][A
185it [00:04, 45.12it/s][A
190it [00:04, 45.13it/s][A
195it [00:04, 45.00it/s][A
200it [00:04, 45.36it/s][A
205it [00:04, 45.49it/s][A

Epoch: 252, Step: 200, Loss: 4.566733739376068



210it [00:04, 45.33it/s][A
215it [00:04, 45.43it/s][A
220it [00:04, 45.16it/s][A
227it [00:05, 44.73it/s]
 50%|█████     | 252/500 [29:47<28:54,  6.99s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.66it/s][A
10it [00:00, 45.16it/s][A
15it [00:00, 44.86it/s][A
20it [00:00, 43.98it/s][A
25it [00:00, 44.48it/s][A
30it [00:00, 45.06it/s][A
35it [00:00, 44.59it/s][A
40it [00:00, 44.40it/s][A
45it [00:01, 45.04it/s][A
50it [00:01, 45.30it/s][A
55it [00:01, 45.42it/s][A
60it [00:01, 45.08it/s][A
65it [00:01, 45.44it/s][A
70it [00:01, 45.02it/s][A
75it [00:01, 44.42it/s][A
80it [00:01, 44.53it/s][A
85it [00:01, 44.57it/s][A
90it [00:02, 45.08it/s][A
95it [00:02, 45.27it/s][A
100it [00:02, 45.41it/s][A
105it [00:02, 45.27it/s][A

Epoch: 253, Step: 100, Loss: 4.549706134796143



110it [00:02, 45.26it/s][A
115it [00:02, 45.62it/s][A
120it [00:02, 45.80it/s][A
125it [00:02, 46.06it/s][A
130it [00:02, 45.12it/s][A
135it [00:02, 45.59it/s][A
140it [00:03, 45.93it/s][A
145it [00:03, 44.99it/s][A
150it [00:03, 45.53it/s][A
155it [00:03, 44.52it/s][A
160it [00:03, 45.25it/s][A
165it [00:03, 45.69it/s][A
170it [00:03, 45.36it/s][A
175it [00:03, 45.53it/s][A
180it [00:03, 45.70it/s][A
185it [00:04, 45.63it/s][A
190it [00:04, 45.88it/s][A
195it [00:04, 45.25it/s][A
200it [00:04, 45.53it/s][A
205it [00:04, 45.43it/s][A

Epoch: 253, Step: 200, Loss: 4.561656808853149



210it [00:04, 44.73it/s][A
215it [00:04, 45.64it/s][A
220it [00:04, 46.07it/s][A
227it [00:05, 45.31it/s]
 51%|█████     | 253/500 [29:52<26:20,  6.40s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.45it/s][A
10it [00:00, 45.25it/s][A
15it [00:00, 43.24it/s][A
20it [00:00, 42.98it/s][A
25it [00:00, 42.55it/s][A
30it [00:00, 43.65it/s][A
35it [00:00, 44.26it/s][A
40it [00:00, 44.51it/s][A
45it [00:01, 43.86it/s][A
50it [00:01, 44.35it/s][A
55it [00:01, 43.96it/s][A
60it [00:01, 44.26it/s][A
65it [00:01, 44.00it/s][A
70it [00:01, 42.51it/s][A
75it [00:01, 43.16it/s][A
80it [00:01, 43.79it/s][A
85it [00:01, 42.86it/s][A
90it [00:02, 43.55it/s][A
95it [00:02, 43.24it/s][A
100it [00:02, 42.09it/s][A
105it [00:02, 43.15it/s][A

Epoch: 254, Step: 100, Loss: 4.54031882762909



110it [00:02, 42.59it/s][A
115it [00:02, 43.49it/s][A
120it [00:02, 42.84it/s][A
125it [00:02, 43.25it/s][A
130it [00:02, 43.47it/s][A
135it [00:03, 42.97it/s][A
140it [00:03, 42.48it/s][A
145it [00:03, 42.12it/s][A
150it [00:03, 42.93it/s][A
155it [00:03, 42.81it/s][A
160it [00:03, 43.67it/s][A
165it [00:03, 43.94it/s][A
170it [00:03, 44.30it/s][A
175it [00:04, 44.57it/s][A
180it [00:04, 44.87it/s][A
185it [00:04, 44.56it/s][A
190it [00:04, 44.96it/s][A
195it [00:04, 45.20it/s][A
200it [00:04, 45.36it/s][A
205it [00:04, 45.40it/s][A

Epoch: 254, Step: 200, Loss: 4.560274448394775



210it [00:04, 44.97it/s][A
215it [00:04, 45.02it/s][A
220it [00:05, 45.36it/s][A
227it [00:05, 43.79it/s]
 51%|█████     | 254/500 [29:58<24:44,  6.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.61it/s][A
10it [00:00, 45.22it/s][A
15it [00:00, 45.16it/s][A
20it [00:00, 45.37it/s][A
25it [00:00, 45.55it/s][A
30it [00:00, 45.39it/s][A
35it [00:00, 45.55it/s][A
40it [00:00, 44.83it/s][A
45it [00:01, 44.26it/s][A
50it [00:01, 43.66it/s][A
55it [00:01, 44.41it/s][A
60it [00:01, 43.96it/s][A
65it [00:01, 44.45it/s][A
70it [00:01, 44.53it/s][A
75it [00:01, 44.55it/s][A
80it [00:01, 44.68it/s][A
85it [00:01, 44.57it/s][A
90it [00:02, 44.25it/s][A
95it [00:02, 44.38it/s][A
100it [00:02, 44.14it/s][A
105it [00:02, 44.41it/s][A

Epoch: 255, Step: 100, Loss: 4.543233466148377



110it [00:02, 44.64it/s][A
115it [00:02, 44.36it/s][A
120it [00:02, 44.20it/s][A
125it [00:02, 44.10it/s][A
130it [00:02, 44.26it/s][A
135it [00:03, 44.40it/s][A
140it [00:03, 44.64it/s][A
145it [00:03, 44.01it/s][A
150it [00:03, 44.32it/s][A
155it [00:03, 44.65it/s][A
160it [00:03, 45.00it/s][A
165it [00:03, 45.15it/s][A
170it [00:03, 45.25it/s][A
175it [00:03, 45.56it/s][A
180it [00:04, 44.35it/s][A
185it [00:04, 44.91it/s][A
190it [00:04, 43.84it/s][A
195it [00:04, 44.62it/s][A
200it [00:04, 44.90it/s][A
205it [00:04, 44.89it/s][A

Epoch: 255, Step: 200, Loss: 4.560602147579193



210it [00:04, 45.07it/s][A
215it [00:04, 45.37it/s][A
220it [00:04, 45.72it/s][A
227it [00:05, 44.69it/s]

0it [00:00, ?it/s][A
6it [00:00, 55.94it/s][A
13it [00:00, 59.12it/s][A
20it [00:00, 60.12it/s][A
27it [00:00, 58.73it/s][A
33it [00:00, 58.96it/s][A
39it [00:00, 57.72it/s][A
46it [00:00, 58.79it/s][A
52it [00:00, 56.88it/s][A
58it [00:01, 55.43it/s][A
64it [00:01, 55.77it/s][A
70it [00:01, 56.69it/s][A
77it [00:01, 57.91it/s][A
83it [00:01, 58.34it/s][A
90it [00:01, 59.20it/s][A
96it [00:01, 59.38it/s][A
103it [00:01, 59.78it/s][A
110it [00:01, 59.98it/s][A
116it [00:02, 56.93it/s][A
123it [00:02, 58.06it/s][A
129it [00:02, 58.55it/s][A
136it [00:02, 59.10it/s][A
143it [00:02, 59.57it/s][A
150it [00:02, 59.82it/s][A
156it [00:02, 59.60it/s][A
163it [00:02, 59.79it/s][A
169it [00:02, 59.70it/s][A
175it [00:02, 59.44it/s][A
181it [00:03, 58.00it/s][A
188it [00:03, 58.83it/s][A
194it [00:03, 58.88it/s][A
200it [00:03, 58.82it/s][A
207it [00:03, 5


Epoch: 255, Test Loss: 5.485157015160744, Test Perplexity: 241.9771207963458




0it [00:00, ?it/s][A
4it [00:00, 38.79it/s][A
9it [00:00, 43.26it/s][A
14it [00:00, 44.04it/s][A
19it [00:00, 44.73it/s][A
24it [00:00, 45.30it/s][A
29it [00:00, 45.33it/s][A
34it [00:00, 44.11it/s][A
39it [00:00, 44.08it/s][A
44it [00:00, 44.79it/s][A
49it [00:01, 45.21it/s][A
54it [00:01, 45.39it/s][A
59it [00:01, 45.60it/s][A
64it [00:01, 45.31it/s][A
69it [00:01, 45.67it/s][A
74it [00:01, 45.82it/s][A
79it [00:01, 45.65it/s][A
84it [00:01, 44.31it/s][A
89it [00:01, 44.39it/s][A
94it [00:02, 44.93it/s][A
99it [00:02, 45.28it/s][A
104it [00:02, 43.91it/s][A


Epoch: 256, Step: 100, Loss: 4.548169536590576


109it [00:02, 44.07it/s][A
114it [00:02, 44.58it/s][A
119it [00:02, 44.70it/s][A
124it [00:02, 44.81it/s][A
129it [00:02, 44.77it/s][A
134it [00:02, 44.91it/s][A
139it [00:03, 43.77it/s][A
144it [00:03, 44.29it/s][A
149it [00:03, 44.94it/s][A
154it [00:03, 44.55it/s][A
159it [00:03, 45.06it/s][A
164it [00:03, 45.26it/s][A
169it [00:03, 45.34it/s][A
174it [00:03, 45.43it/s][A
179it [00:03, 45.38it/s][A
184it [00:04, 45.46it/s][A
189it [00:04, 45.44it/s][A
194it [00:04, 45.53it/s][A
199it [00:04, 45.53it/s][A
204it [00:04, 45.57it/s][A
209it [00:04, 45.34it/s][A

Epoch: 256, Step: 200, Loss: 4.558927443027496



214it [00:04, 45.14it/s][A
219it [00:04, 45.09it/s][A
227it [00:05, 44.88it/s]
 51%|█████     | 256/500 [30:19<31:51,  7.84s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.99it/s][A
10it [00:00, 41.75it/s][A
15it [00:00, 42.70it/s][A
20it [00:00, 42.82it/s][A
25it [00:00, 43.21it/s][A
30it [00:00, 43.70it/s][A
35it [00:00, 43.87it/s][A
40it [00:00, 44.32it/s][A
45it [00:01, 43.48it/s][A
50it [00:01, 44.03it/s][A
55it [00:01, 44.40it/s][A
60it [00:01, 44.52it/s][A
65it [00:01, 44.63it/s][A
70it [00:01, 45.14it/s][A
75it [00:01, 45.56it/s][A
80it [00:01, 44.91it/s][A
85it [00:01, 45.43it/s][A
90it [00:02, 45.00it/s][A
95it [00:02, 44.00it/s][A
100it [00:02, 43.73it/s][A
105it [00:02, 44.60it/s][A

Epoch: 257, Step: 100, Loss: 4.545926098823547



110it [00:02, 45.03it/s][A
115it [00:02, 43.69it/s][A
120it [00:02, 44.57it/s][A
125it [00:02, 44.35it/s][A
130it [00:02, 45.24it/s][A
135it [00:03, 45.77it/s][A
140it [00:03, 46.02it/s][A
145it [00:03, 46.11it/s][A
150it [00:03, 46.04it/s][A
155it [00:03, 44.17it/s][A
160it [00:03, 44.18it/s][A
165it [00:03, 43.87it/s][A
170it [00:03, 43.63it/s][A
175it [00:03, 43.53it/s][A
180it [00:04, 44.15it/s][A
185it [00:04, 43.91it/s][A
190it [00:04, 43.70it/s][A
195it [00:04, 43.27it/s][A
200it [00:04, 44.02it/s][A
205it [00:04, 44.12it/s][A

Epoch: 257, Step: 200, Loss: 4.560956661701202



210it [00:04, 44.32it/s][A
215it [00:04, 44.86it/s][A
220it [00:04, 45.37it/s][A
227it [00:05, 44.33it/s]
 51%|█████▏    | 257/500 [30:24<28:26,  7.02s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.23it/s][A
10it [00:00, 45.74it/s][A
15it [00:00, 45.96it/s][A
20it [00:00, 45.25it/s][A
25it [00:00, 45.49it/s][A
30it [00:00, 46.36it/s][A
35it [00:00, 46.55it/s][A
40it [00:00, 45.50it/s][A
45it [00:00, 45.63it/s][A
50it [00:01, 45.11it/s][A
55it [00:01, 45.21it/s][A
60it [00:01, 44.59it/s][A
65it [00:01, 43.58it/s][A
70it [00:01, 43.39it/s][A
75it [00:01, 44.08it/s][A
80it [00:01, 44.64it/s][A
85it [00:01, 44.41it/s][A
90it [00:02, 44.55it/s][A
95it [00:02, 43.38it/s][A
100it [00:02, 43.88it/s][A
105it [00:02, 43.91it/s][A

Epoch: 258, Step: 100, Loss: 4.546387004852295



110it [00:02, 44.03it/s][A
115it [00:02, 43.23it/s][A
120it [00:02, 43.71it/s][A
125it [00:02, 44.14it/s][A
130it [00:02, 44.53it/s][A
135it [00:03, 44.87it/s][A
140it [00:03, 45.03it/s][A
145it [00:03, 43.76it/s][A
150it [00:03, 43.34it/s][A
155it [00:03, 43.69it/s][A
160it [00:03, 44.10it/s][A
165it [00:03, 44.40it/s][A
170it [00:03, 44.66it/s][A
175it [00:03, 44.04it/s][A
180it [00:04, 44.24it/s][A
185it [00:04, 44.61it/s][A
190it [00:04, 44.82it/s][A
195it [00:04, 44.96it/s][A
200it [00:04, 44.85it/s][A
205it [00:04, 44.12it/s][A

Epoch: 258, Step: 200, Loss: 4.559431710243225



210it [00:04, 43.13it/s][A
215it [00:04, 43.85it/s][A
220it [00:04, 44.09it/s][A
227it [00:05, 44.43it/s]
 52%|█████▏    | 258/500 [30:29<26:00,  6.45s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.71it/s][A
10it [00:00, 43.59it/s][A
15it [00:00, 44.01it/s][A
20it [00:00, 42.87it/s][A
25it [00:00, 43.06it/s][A
30it [00:00, 43.69it/s][A
35it [00:00, 44.20it/s][A
40it [00:00, 44.38it/s][A
45it [00:01, 44.30it/s][A
50it [00:01, 44.23it/s][A
55it [00:01, 44.29it/s][A
60it [00:01, 44.50it/s][A
65it [00:01, 44.79it/s][A
70it [00:01, 45.02it/s][A
75it [00:01, 45.14it/s][A
80it [00:01, 45.48it/s][A
85it [00:01, 45.75it/s][A
90it [00:02, 45.58it/s][A
95it [00:02, 45.44it/s][A
100it [00:02, 44.39it/s][A
105it [00:02, 44.48it/s][A

Epoch: 259, Step: 100, Loss: 4.5351389074325565



110it [00:02, 44.52it/s][A
115it [00:02, 44.61it/s][A
120it [00:02, 43.42it/s][A
125it [00:02, 43.75it/s][A
130it [00:02, 43.65it/s][A
135it [00:03, 44.07it/s][A
140it [00:03, 44.24it/s][A
145it [00:03, 44.44it/s][A
150it [00:03, 44.01it/s][A
155it [00:03, 43.69it/s][A
160it [00:03, 44.08it/s][A
165it [00:03, 44.43it/s][A
170it [00:03, 44.76it/s][A
175it [00:03, 45.04it/s][A
180it [00:04, 45.22it/s][A
185it [00:04, 45.19it/s][A
190it [00:04, 45.25it/s][A
195it [00:04, 45.27it/s][A
200it [00:04, 45.21it/s][A
205it [00:04, 44.66it/s][A

Epoch: 259, Step: 200, Loss: 4.559661712646484



210it [00:04, 43.67it/s][A
215it [00:04, 42.90it/s][A
220it [00:04, 43.63it/s][A
227it [00:05, 44.33it/s]
 52%|█████▏    | 259/500 [30:34<24:18,  6.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.74it/s][A
10it [00:00, 44.08it/s][A
15it [00:00, 44.47it/s][A
20it [00:00, 45.05it/s][A
25it [00:00, 45.14it/s][A
30it [00:00, 44.02it/s][A
35it [00:00, 43.58it/s][A
40it [00:00, 43.79it/s][A
45it [00:01, 44.33it/s][A
50it [00:01, 44.17it/s][A
55it [00:01, 44.29it/s][A
60it [00:01, 44.68it/s][A
65it [00:01, 44.82it/s][A
70it [00:01, 44.54it/s][A
75it [00:01, 44.83it/s][A
80it [00:01, 43.89it/s][A
85it [00:01, 44.06it/s][A
90it [00:02, 43.59it/s][A
95it [00:02, 43.98it/s][A
100it [00:02, 44.33it/s][A
105it [00:02, 44.51it/s][A

Epoch: 260, Step: 100, Loss: 4.545696287155152



110it [00:02, 44.75it/s][A
115it [00:02, 45.06it/s][A
120it [00:02, 45.41it/s][A
125it [00:02, 45.70it/s][A
130it [00:02, 45.47it/s][A
135it [00:03, 45.49it/s][A
140it [00:03, 45.54it/s][A
145it [00:03, 45.03it/s][A
150it [00:03, 45.05it/s][A
155it [00:03, 44.66it/s][A
160it [00:03, 44.94it/s][A
165it [00:03, 44.83it/s][A
170it [00:03, 43.87it/s][A
175it [00:03, 44.39it/s][A
180it [00:04, 44.95it/s][A
185it [00:04, 44.65it/s][A
190it [00:04, 43.66it/s][A
195it [00:04, 44.19it/s][A
200it [00:04, 44.40it/s][A
205it [00:04, 44.69it/s][A

Epoch: 260, Step: 200, Loss: 4.557419893741607



210it [00:04, 44.70it/s][A
215it [00:04, 43.74it/s][A
220it [00:04, 43.79it/s][A
227it [00:05, 44.52it/s]

0it [00:00, ?it/s][A
6it [00:00, 51.70it/s][A
12it [00:00, 56.00it/s][A
19it [00:00, 58.03it/s][A
26it [00:00, 59.15it/s][A
33it [00:00, 59.66it/s][A
39it [00:00, 58.11it/s][A
45it [00:00, 58.61it/s][A
52it [00:00, 59.26it/s][A
58it [00:00, 59.35it/s][A
64it [00:01, 59.28it/s][A
70it [00:01, 58.87it/s][A
76it [00:01, 58.97it/s][A
82it [00:01, 59.26it/s][A
88it [00:01, 59.37it/s][A
94it [00:01, 59.51it/s][A
101it [00:01, 59.73it/s][A
108it [00:01, 60.20it/s][A
115it [00:01, 60.40it/s][A
122it [00:02, 60.16it/s][A
129it [00:02, 59.90it/s][A
135it [00:02, 58.31it/s][A
141it [00:02, 57.87it/s][A
147it [00:02, 56.94it/s][A
153it [00:02, 57.41it/s][A
159it [00:02, 56.77it/s][A
166it [00:02, 58.03it/s][A
173it [00:02, 58.75it/s][A
180it [00:03, 59.26it/s][A
186it [00:03, 59.33it/s][A
192it [00:03, 59.49it/s][A
199it [00:03, 59.95it/s][A
205it [00:03, 5


Epoch: 260, Test Loss: 5.487924204109619, Test Perplexity: 242.78254064713946




0it [00:00, ?it/s][A
5it [00:00, 45.93it/s][A
10it [00:00, 45.75it/s][A
15it [00:00, 45.50it/s][A
20it [00:00, 45.60it/s][A
25it [00:00, 45.31it/s][A
30it [00:00, 45.22it/s][A
35it [00:00, 45.47it/s][A
40it [00:00, 45.58it/s][A
45it [00:00, 45.41it/s][A
50it [00:01, 45.46it/s][A
55it [00:01, 45.47it/s][A
60it [00:01, 45.55it/s][A
65it [00:01, 45.67it/s][A
70it [00:01, 45.40it/s][A
75it [00:01, 45.40it/s][A
80it [00:01, 45.54it/s][A
85it [00:01, 45.75it/s][A
90it [00:01, 46.04it/s][A
95it [00:02, 45.77it/s][A
100it [00:02, 45.46it/s][A
105it [00:02, 45.75it/s][A

Epoch: 261, Step: 100, Loss: 4.537444996833801



110it [00:02, 45.60it/s][A
115it [00:02, 45.60it/s][A
120it [00:02, 45.74it/s][A
125it [00:02, 44.90it/s][A
130it [00:02, 45.21it/s][A
135it [00:02, 45.33it/s][A
140it [00:03, 45.41it/s][A
145it [00:03, 45.42it/s][A
150it [00:03, 45.58it/s][A
155it [00:03, 45.75it/s][A
160it [00:03, 45.76it/s][A
165it [00:03, 45.05it/s][A
170it [00:03, 45.57it/s][A
175it [00:03, 45.83it/s][A
180it [00:03, 45.89it/s][A
185it [00:04, 45.88it/s][A
190it [00:04, 46.03it/s][A
195it [00:04, 45.78it/s][A
200it [00:04, 45.78it/s][A
205it [00:04, 45.73it/s][A

Epoch: 261, Step: 200, Loss: 4.555927138328553



210it [00:04, 45.56it/s][A
215it [00:04, 45.78it/s][A
220it [00:04, 45.82it/s][A
227it [00:04, 45.56it/s]
 52%|█████▏    | 261/500 [30:55<31:11,  7.83s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.15it/s][A
10it [00:00, 45.15it/s][A
15it [00:00, 45.03it/s][A
20it [00:00, 45.49it/s][A
25it [00:00, 45.55it/s][A
30it [00:00, 44.26it/s][A
35it [00:00, 44.64it/s][A
40it [00:00, 44.96it/s][A
45it [00:01, 45.02it/s][A
50it [00:01, 44.44it/s][A
55it [00:01, 44.62it/s][A
60it [00:01, 44.23it/s][A
65it [00:01, 42.45it/s][A
70it [00:01, 42.63it/s][A
75it [00:01, 43.69it/s][A
80it [00:01, 44.61it/s][A
85it [00:01, 44.79it/s][A
90it [00:02, 44.33it/s][A
95it [00:02, 43.87it/s][A
100it [00:02, 43.93it/s][A
105it [00:02, 43.15it/s][A

Epoch: 262, Step: 100, Loss: 4.5473061418533325



110it [00:02, 43.55it/s][A
115it [00:02, 43.56it/s][A
120it [00:02, 43.46it/s][A
125it [00:02, 43.88it/s][A
130it [00:02, 44.25it/s][A
135it [00:03, 44.66it/s][A
140it [00:03, 42.80it/s][A
145it [00:03, 43.41it/s][A
150it [00:03, 43.70it/s][A
155it [00:03, 43.92it/s][A
160it [00:03, 44.30it/s][A
165it [00:03, 44.16it/s][A
170it [00:03, 43.67it/s][A
175it [00:03, 43.93it/s][A
180it [00:04, 44.18it/s][A
185it [00:04, 44.62it/s][A
190it [00:04, 44.80it/s][A
195it [00:04, 45.07it/s][A
200it [00:04, 44.29it/s][A
205it [00:04, 44.71it/s][A

Epoch: 262, Step: 200, Loss: 4.556938190460205



210it [00:04, 43.65it/s][A
215it [00:04, 44.23it/s][A
220it [00:04, 43.58it/s][A
227it [00:05, 44.00it/s]
 52%|█████▏    | 262/500 [31:00<27:53,  7.03s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.56it/s][A
10it [00:00, 44.45it/s][A
15it [00:00, 45.19it/s][A
20it [00:00, 45.41it/s][A
25it [00:00, 45.62it/s][A
30it [00:00, 45.12it/s][A
35it [00:00, 45.28it/s][A
40it [00:00, 45.50it/s][A
45it [00:00, 45.12it/s][A
50it [00:01, 45.19it/s][A
55it [00:01, 45.26it/s][A
60it [00:01, 45.50it/s][A
65it [00:01, 45.43it/s][A
70it [00:01, 44.34it/s][A
75it [00:01, 44.29it/s][A
80it [00:01, 44.47it/s][A
85it [00:01, 44.02it/s][A
90it [00:02, 44.13it/s][A
95it [00:02, 44.46it/s][A
100it [00:02, 44.58it/s][A
105it [00:02, 44.89it/s][A

Epoch: 263, Step: 100, Loss: 4.541307530403137



110it [00:02, 45.08it/s][A
115it [00:02, 45.02it/s][A
120it [00:02, 43.77it/s][A
125it [00:02, 43.69it/s][A
130it [00:02, 42.74it/s][A
135it [00:03, 43.21it/s][A
140it [00:03, 43.17it/s][A
145it [00:03, 43.70it/s][A
150it [00:03, 43.39it/s][A
155it [00:03, 43.96it/s][A
160it [00:03, 44.13it/s][A
165it [00:03, 44.30it/s][A
170it [00:03, 43.33it/s][A
175it [00:03, 43.93it/s][A
180it [00:04, 44.24it/s][A
185it [00:04, 44.14it/s][A
190it [00:04, 44.48it/s][A
195it [00:04, 44.81it/s][A
200it [00:04, 44.97it/s][A
205it [00:04, 44.79it/s][A

Epoch: 263, Step: 200, Loss: 4.55660849571228



210it [00:04, 44.55it/s][A
215it [00:04, 44.67it/s][A
220it [00:04, 45.05it/s][A
227it [00:05, 44.47it/s]
 53%|█████▎    | 263/500 [31:05<25:29,  6.45s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.19it/s][A
10it [00:00, 45.67it/s][A
15it [00:00, 45.06it/s][A
20it [00:00, 45.12it/s][A
25it [00:00, 45.14it/s][A
30it [00:00, 45.20it/s][A
35it [00:00, 44.97it/s][A
40it [00:00, 45.13it/s][A
45it [00:00, 44.98it/s][A
50it [00:01, 45.09it/s][A
55it [00:01, 44.00it/s][A
60it [00:01, 44.20it/s][A
65it [00:01, 44.60it/s][A
70it [00:01, 45.08it/s][A
75it [00:01, 44.59it/s][A
80it [00:01, 44.75it/s][A
85it [00:01, 44.56it/s][A
90it [00:02, 43.53it/s][A
95it [00:02, 43.08it/s][A
100it [00:02, 43.66it/s][A
105it [00:02, 43.98it/s][A

Epoch: 264, Step: 100, Loss: 4.546272649765014



110it [00:02, 44.04it/s][A
115it [00:02, 43.97it/s][A
120it [00:02, 44.12it/s][A
125it [00:02, 44.31it/s][A
130it [00:02, 44.13it/s][A
135it [00:03, 44.21it/s][A
140it [00:03, 44.20it/s][A
145it [00:03, 44.27it/s][A
150it [00:03, 44.57it/s][A
155it [00:03, 44.14it/s][A
160it [00:03, 44.38it/s][A
165it [00:03, 44.43it/s][A
170it [00:03, 42.61it/s][A
175it [00:03, 42.44it/s][A
180it [00:04, 42.75it/s][A
185it [00:04, 42.95it/s][A
190it [00:04, 43.37it/s][A
195it [00:04, 43.93it/s][A
200it [00:04, 43.56it/s][A
205it [00:04, 44.00it/s][A

Epoch: 264, Step: 200, Loss: 4.557169711589813



210it [00:04, 44.03it/s][A
215it [00:04, 44.27it/s][A
220it [00:04, 44.31it/s][A
227it [00:05, 44.16it/s]
 53%|█████▎    | 264/500 [31:11<23:50,  6.06s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.84it/s][A
10it [00:00, 45.44it/s][A
15it [00:00, 45.68it/s][A
20it [00:00, 45.72it/s][A
25it [00:00, 45.54it/s][A
30it [00:00, 45.53it/s][A
35it [00:00, 45.34it/s][A
40it [00:00, 44.02it/s][A
45it [00:01, 44.54it/s][A
50it [00:01, 44.83it/s][A
55it [00:01, 45.27it/s][A
60it [00:01, 44.43it/s][A
65it [00:01, 44.18it/s][A
70it [00:01, 44.70it/s][A
75it [00:01, 44.71it/s][A
80it [00:01, 44.59it/s][A
85it [00:01, 44.80it/s][A
90it [00:02, 44.82it/s][A
95it [00:02, 44.69it/s][A
100it [00:02, 44.73it/s][A
105it [00:02, 43.67it/s][A

Epoch: 265, Step: 100, Loss: 4.544969019889831



110it [00:02, 44.02it/s][A
115it [00:02, 43.77it/s][A
120it [00:02, 43.42it/s][A
125it [00:02, 43.98it/s][A
130it [00:02, 43.32it/s][A
135it [00:03, 44.03it/s][A
140it [00:03, 43.73it/s][A
145it [00:03, 44.30it/s][A
150it [00:03, 43.69it/s][A
155it [00:03, 44.19it/s][A
160it [00:03, 43.41it/s][A
165it [00:03, 43.52it/s][A
170it [00:03, 43.94it/s][A
175it [00:03, 43.50it/s][A
180it [00:04, 44.02it/s][A
185it [00:04, 44.47it/s][A
190it [00:04, 44.87it/s][A
195it [00:04, 44.96it/s][A
200it [00:04, 44.91it/s][A
205it [00:04, 43.07it/s][A

Epoch: 265, Step: 200, Loss: 4.554798798561096



210it [00:04, 43.26it/s][A
215it [00:04, 43.51it/s][A
220it [00:04, 43.91it/s][A
227it [00:05, 44.28it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.43it/s][A
12it [00:00, 58.74it/s][A
18it [00:00, 59.12it/s][A
25it [00:00, 59.63it/s][A
31it [00:00, 59.71it/s][A
37it [00:00, 57.61it/s][A
43it [00:00, 57.81it/s][A
50it [00:00, 58.75it/s][A
56it [00:00, 59.00it/s][A
62it [00:01, 59.20it/s][A
69it [00:01, 59.63it/s][A
75it [00:01, 59.66it/s][A
82it [00:01, 60.08it/s][A
89it [00:01, 60.03it/s][A
96it [00:01, 60.15it/s][A
103it [00:01, 59.86it/s][A
109it [00:01, 59.69it/s][A
115it [00:01, 59.31it/s][A
121it [00:02, 58.34it/s][A
127it [00:02, 57.24it/s][A
134it [00:02, 58.31it/s][A
141it [00:02, 58.95it/s][A
147it [00:02, 59.04it/s][A
153it [00:02, 59.06it/s][A
160it [00:02, 59.64it/s][A
167it [00:02, 59.86it/s][A
173it [00:02, 59.86it/s][A
179it [00:03, 59.64it/s][A
185it [00:03, 59.36it/s][A
192it [00:03, 59.80it/s][A
198it [00:03, 59.72it/s][A
204it [00:03, 5


Epoch: 265, Test Loss: 5.486477173633457, Test Perplexity: 242.3639380413553




0it [00:00, ?it/s][A
5it [00:00, 46.28it/s][A
10it [00:00, 46.31it/s][A
15it [00:00, 44.45it/s][A
20it [00:00, 45.55it/s][A
25it [00:00, 45.77it/s][A
30it [00:00, 46.05it/s][A
35it [00:00, 45.87it/s][A
40it [00:00, 44.77it/s][A
45it [00:00, 45.01it/s][A
50it [00:01, 45.04it/s][A
55it [00:01, 45.32it/s][A
60it [00:01, 45.59it/s][A
65it [00:01, 45.93it/s][A
70it [00:01, 44.85it/s][A
75it [00:01, 44.97it/s][A
80it [00:01, 45.17it/s][A
85it [00:01, 45.69it/s][A
90it [00:01, 45.98it/s][A
95it [00:02, 45.91it/s][A
100it [00:02, 45.06it/s][A
105it [00:02, 45.33it/s][A

Epoch: 266, Step: 100, Loss: 4.532316131591797



110it [00:02, 44.99it/s][A
115it [00:02, 44.91it/s][A
120it [00:02, 44.81it/s][A
125it [00:02, 44.63it/s][A
130it [00:02, 43.79it/s][A
135it [00:02, 43.57it/s][A
140it [00:03, 43.93it/s][A
145it [00:03, 44.24it/s][A
150it [00:03, 43.76it/s][A
155it [00:03, 43.31it/s][A
160it [00:03, 44.01it/s][A
165it [00:03, 44.40it/s][A
170it [00:03, 44.70it/s][A
175it [00:03, 44.59it/s][A
180it [00:04, 45.03it/s][A
185it [00:04, 45.27it/s][A
190it [00:04, 44.64it/s][A
195it [00:04, 44.48it/s][A
200it [00:04, 44.70it/s][A
205it [00:04, 44.70it/s][A

Epoch: 266, Step: 200, Loss: 4.553318412303924



210it [00:04, 44.03it/s][A
215it [00:04, 43.42it/s][A
220it [00:04, 43.76it/s][A
227it [00:05, 44.66it/s]
 53%|█████▎    | 266/500 [31:32<30:39,  7.86s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.03it/s][A
10it [00:00, 43.62it/s][A
15it [00:00, 41.78it/s][A
20it [00:00, 41.57it/s][A
25it [00:00, 41.88it/s][A
30it [00:00, 42.47it/s][A
35it [00:00, 43.33it/s][A
40it [00:00, 43.84it/s][A
45it [00:01, 44.26it/s][A
50it [00:01, 44.54it/s][A
55it [00:01, 44.64it/s][A
60it [00:01, 44.10it/s][A
65it [00:01, 44.49it/s][A
70it [00:01, 42.49it/s][A
75it [00:01, 43.14it/s][A
80it [00:01, 43.76it/s][A
85it [00:01, 44.21it/s][A
90it [00:02, 43.82it/s][A
95it [00:02, 44.33it/s][A
100it [00:02, 44.61it/s][A
105it [00:02, 44.93it/s][A

Epoch: 267, Step: 100, Loss: 4.540383176803589



110it [00:02, 44.28it/s][A
115it [00:02, 43.16it/s][A
120it [00:02, 43.66it/s][A
125it [00:02, 44.19it/s][A
130it [00:02, 43.53it/s][A
135it [00:03, 44.11it/s][A
140it [00:03, 44.41it/s][A
145it [00:03, 44.58it/s][A
150it [00:03, 44.67it/s][A
155it [00:03, 44.74it/s][A
160it [00:03, 44.76it/s][A
165it [00:03, 44.75it/s][A
170it [00:03, 44.17it/s][A
175it [00:03, 44.35it/s][A
180it [00:04, 44.41it/s][A
185it [00:04, 44.38it/s][A
190it [00:04, 44.71it/s][A
195it [00:04, 45.07it/s][A
200it [00:04, 45.13it/s][A
205it [00:04, 43.56it/s][A

Epoch: 267, Step: 200, Loss: 4.552402067184448



210it [00:04, 42.55it/s][A
215it [00:04, 43.50it/s][A
220it [00:05, 43.63it/s][A
227it [00:05, 43.85it/s]
 53%|█████▎    | 267/500 [31:37<27:24,  7.06s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.73it/s][A
10it [00:00, 44.96it/s][A
15it [00:00, 45.23it/s][A
20it [00:00, 45.19it/s][A
25it [00:00, 45.19it/s][A
30it [00:00, 45.32it/s][A
35it [00:00, 45.26it/s][A
40it [00:00, 44.91it/s][A
45it [00:00, 44.95it/s][A
50it [00:01, 45.00it/s][A
55it [00:01, 44.88it/s][A
60it [00:01, 44.01it/s][A
65it [00:01, 44.66it/s][A
70it [00:01, 44.41it/s][A
75it [00:01, 44.77it/s][A
80it [00:01, 44.87it/s][A
85it [00:01, 44.47it/s][A
90it [00:02, 45.04it/s][A
95it [00:02, 44.99it/s][A
100it [00:02, 43.83it/s][A
105it [00:02, 44.50it/s][A

Epoch: 268, Step: 100, Loss: 4.554163861274719



110it [00:02, 44.50it/s][A
115it [00:02, 44.43it/s][A
120it [00:02, 44.71it/s][A
125it [00:02, 43.42it/s][A
130it [00:02, 44.03it/s][A
135it [00:03, 44.58it/s][A
140it [00:03, 44.82it/s][A
145it [00:03, 45.06it/s][A
150it [00:03, 44.59it/s][A
155it [00:03, 44.51it/s][A
160it [00:03, 44.01it/s][A
165it [00:03, 44.65it/s][A
170it [00:03, 44.54it/s][A
175it [00:03, 45.05it/s][A
180it [00:04, 45.07it/s][A
185it [00:04, 44.13it/s][A
190it [00:04, 44.51it/s][A
195it [00:04, 44.27it/s][A
200it [00:04, 44.79it/s][A
205it [00:04, 44.87it/s][A

Epoch: 268, Step: 200, Loss: 4.552851908206939



210it [00:04, 45.14it/s][A
215it [00:04, 45.23it/s][A
220it [00:04, 45.31it/s][A
227it [00:05, 44.75it/s]
 54%|█████▎    | 268/500 [31:42<24:59,  6.46s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.32it/s][A
10it [00:00, 44.67it/s][A
15it [00:00, 44.07it/s][A
20it [00:00, 44.42it/s][A
25it [00:00, 42.94it/s][A
30it [00:00, 41.62it/s][A
35it [00:00, 42.09it/s][A
40it [00:00, 43.01it/s][A
45it [00:01, 43.57it/s][A
50it [00:01, 42.98it/s][A
55it [00:01, 43.76it/s][A
60it [00:01, 43.72it/s][A
65it [00:01, 43.27it/s][A
70it [00:01, 43.66it/s][A
75it [00:01, 43.66it/s][A
80it [00:01, 44.36it/s][A
85it [00:01, 44.81it/s][A
90it [00:02, 45.32it/s][A
95it [00:02, 45.47it/s][A
100it [00:02, 45.29it/s][A
105it [00:02, 45.38it/s][A

Epoch: 269, Step: 100, Loss: 4.528756775856018



110it [00:02, 44.33it/s][A
115it [00:02, 44.53it/s][A
120it [00:02, 44.74it/s][A
125it [00:02, 44.86it/s][A
130it [00:02, 45.08it/s][A
135it [00:03, 45.37it/s][A
140it [00:03, 44.43it/s][A
145it [00:03, 44.76it/s][A
150it [00:03, 44.91it/s][A
155it [00:03, 45.16it/s][A
160it [00:03, 45.39it/s][A
165it [00:03, 45.57it/s][A
170it [00:03, 45.43it/s][A
175it [00:03, 45.71it/s][A
180it [00:04, 45.66it/s][A
185it [00:04, 45.29it/s][A
190it [00:04, 45.48it/s][A
195it [00:04, 45.61it/s][A
200it [00:04, 45.44it/s][A
205it [00:04, 45.49it/s][A

Epoch: 269, Step: 200, Loss: 4.550368459224701



210it [00:04, 45.18it/s][A
215it [00:04, 43.93it/s][A
220it [00:04, 44.51it/s][A
227it [00:05, 44.54it/s]
 54%|█████▍    | 269/500 [31:47<23:18,  6.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.06it/s][A
10it [00:00, 43.80it/s][A
15it [00:00, 44.64it/s][A
20it [00:00, 44.76it/s][A
25it [00:00, 45.05it/s][A
30it [00:00, 45.16it/s][A
35it [00:00, 45.35it/s][A
40it [00:00, 45.20it/s][A
45it [00:01, 45.07it/s][A
50it [00:01, 45.14it/s][A
55it [00:01, 45.37it/s][A
60it [00:01, 44.64it/s][A
65it [00:01, 44.97it/s][A
70it [00:01, 45.08it/s][A
75it [00:01, 45.27it/s][A
80it [00:01, 45.50it/s][A
85it [00:01, 45.23it/s][A
90it [00:01, 45.16it/s][A
95it [00:02, 45.31it/s][A
100it [00:02, 45.42it/s][A
105it [00:02, 45.16it/s][A

Epoch: 270, Step: 100, Loss: 4.53965057849884



110it [00:02, 44.95it/s][A
115it [00:02, 45.27it/s][A
120it [00:02, 45.14it/s][A
125it [00:02, 45.43it/s][A
130it [00:02, 45.66it/s][A
135it [00:02, 45.66it/s][A
140it [00:03, 45.80it/s][A
145it [00:03, 45.48it/s][A
150it [00:03, 45.35it/s][A
155it [00:03, 45.58it/s][A
160it [00:03, 45.64it/s][A
165it [00:03, 43.65it/s][A
170it [00:03, 44.18it/s][A
175it [00:03, 44.46it/s][A
180it [00:03, 44.77it/s][A
185it [00:04, 44.99it/s][A
190it [00:04, 44.82it/s][A
195it [00:04, 44.14it/s][A
200it [00:04, 44.76it/s][A
205it [00:04, 45.12it/s][A

Epoch: 270, Step: 200, Loss: 4.553193874359131



210it [00:04, 45.21it/s][A
215it [00:04, 44.88it/s][A
220it [00:04, 45.04it/s][A
227it [00:05, 45.06it/s]

0it [00:00, ?it/s][A
6it [00:00, 51.83it/s][A
12it [00:00, 55.12it/s][A
18it [00:00, 54.23it/s][A
24it [00:00, 53.19it/s][A
30it [00:00, 55.26it/s][A
36it [00:00, 56.64it/s][A
42it [00:00, 57.56it/s][A
49it [00:00, 58.64it/s][A
55it [00:00, 58.92it/s][A
61it [00:01, 57.40it/s][A
67it [00:01, 57.76it/s][A
73it [00:01, 58.17it/s][A
79it [00:01, 58.34it/s][A
85it [00:01, 58.78it/s][A
92it [00:01, 59.46it/s][A
98it [00:01, 59.21it/s][A
105it [00:01, 59.56it/s][A
111it [00:01, 59.46it/s][A
117it [00:02, 58.23it/s][A
124it [00:02, 59.04it/s][A
130it [00:02, 58.95it/s][A
136it [00:02, 59.15it/s][A
142it [00:02, 57.58it/s][A
148it [00:02, 57.82it/s][A
154it [00:02, 58.14it/s][A
160it [00:02, 58.40it/s][A
166it [00:02, 58.54it/s][A
172it [00:02, 57.68it/s][A
178it [00:03, 57.98it/s][A
184it [00:03, 58.42it/s][A
190it [00:03, 58.73it/s][A
197it [00:03, 59


Epoch: 270, Test Loss: 5.492775209942219, Test Perplexity: 243.88000220541628




0it [00:00, ?it/s][A
4it [00:00, 36.55it/s][A
9it [00:00, 41.87it/s][A
14it [00:00, 41.94it/s][A
19it [00:00, 41.41it/s][A
24it [00:00, 41.95it/s][A
29it [00:00, 43.04it/s][A
34it [00:00, 43.58it/s][A
39it [00:00, 44.26it/s][A
44it [00:01, 44.71it/s][A
49it [00:01, 44.79it/s][A
54it [00:01, 44.89it/s][A
59it [00:01, 43.74it/s][A
64it [00:01, 44.28it/s][A
69it [00:01, 44.57it/s][A
74it [00:01, 44.84it/s][A
79it [00:01, 44.74it/s][A
84it [00:01, 45.03it/s][A
89it [00:02, 44.94it/s][A
94it [00:02, 43.42it/s][A
99it [00:02, 43.28it/s][A
104it [00:02, 43.77it/s][A
109it [00:02, 44.17it/s][A

Epoch: 271, Step: 100, Loss: 4.531298112869263



114it [00:02, 44.42it/s][A
119it [00:02, 44.76it/s][A
124it [00:02, 44.87it/s][A
129it [00:02, 44.93it/s][A
134it [00:03, 44.88it/s][A
139it [00:03, 44.40it/s][A
144it [00:03, 43.20it/s][A
149it [00:03, 43.23it/s][A
154it [00:03, 43.68it/s][A
159it [00:03, 43.87it/s][A
164it [00:03, 44.32it/s][A
169it [00:03, 44.57it/s][A
174it [00:03, 43.81it/s][A
179it [00:04, 44.45it/s][A
184it [00:04, 44.41it/s][A
189it [00:04, 44.66it/s][A
194it [00:04, 44.56it/s][A
199it [00:04, 44.55it/s][A
204it [00:04, 44.81it/s][A
209it [00:04, 44.68it/s][A

Epoch: 271, Step: 200, Loss: 4.549949576854706



214it [00:04, 44.57it/s][A
219it [00:04, 43.55it/s][A
227it [00:05, 44.04it/s]
 54%|█████▍    | 271/500 [32:08<29:59,  7.86s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.01it/s][A
10it [00:00, 44.85it/s][A
15it [00:00, 43.46it/s][A
20it [00:00, 44.34it/s][A
25it [00:00, 44.40it/s][A
30it [00:00, 44.72it/s][A
35it [00:00, 44.72it/s][A
40it [00:00, 44.80it/s][A
45it [00:01, 44.67it/s][A
50it [00:01, 44.68it/s][A
55it [00:01, 44.79it/s][A
60it [00:01, 44.96it/s][A
65it [00:01, 45.12it/s][A
70it [00:01, 44.36it/s][A
75it [00:01, 44.73it/s][A
80it [00:01, 44.89it/s][A
85it [00:01, 45.05it/s][A
90it [00:02, 45.23it/s][A
95it [00:02, 45.12it/s][A
100it [00:02, 45.25it/s][A
105it [00:02, 45.20it/s][A

Epoch: 272, Step: 100, Loss: 4.533909502029419



110it [00:02, 44.97it/s][A
115it [00:02, 45.29it/s][A
120it [00:02, 44.42it/s][A
125it [00:02, 44.65it/s][A
130it [00:02, 44.92it/s][A
135it [00:03, 45.00it/s][A
140it [00:03, 44.97it/s][A
145it [00:03, 45.03it/s][A
150it [00:03, 44.78it/s][A
155it [00:03, 44.69it/s][A
160it [00:03, 44.30it/s][A
165it [00:03, 44.36it/s][A
170it [00:03, 44.73it/s][A
175it [00:03, 44.95it/s][A
180it [00:04, 44.96it/s][A
185it [00:04, 44.76it/s][A
190it [00:04, 44.42it/s][A
195it [00:04, 44.47it/s][A
200it [00:04, 44.74it/s][A
205it [00:04, 43.33it/s][A

Epoch: 272, Step: 200, Loss: 4.553067979812622



210it [00:04, 43.77it/s][A
215it [00:04, 44.17it/s][A
220it [00:04, 44.48it/s][A
227it [00:05, 44.68it/s]
 54%|█████▍    | 272/500 [32:13<26:41,  7.03s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.08it/s][A
10it [00:00, 44.36it/s][A
15it [00:00, 44.92it/s][A
20it [00:00, 43.79it/s][A
25it [00:00, 44.34it/s][A
30it [00:00, 44.40it/s][A
35it [00:00, 44.45it/s][A
40it [00:00, 44.47it/s][A
45it [00:01, 45.06it/s][A
50it [00:01, 45.24it/s][A
55it [00:01, 44.94it/s][A
60it [00:01, 45.12it/s][A
65it [00:01, 45.21it/s][A
70it [00:01, 45.25it/s][A
75it [00:01, 45.03it/s][A
80it [00:01, 44.14it/s][A
85it [00:01, 44.53it/s][A
90it [00:02, 44.32it/s][A
95it [00:02, 43.50it/s][A
100it [00:02, 43.70it/s][A
105it [00:02, 44.13it/s][A

Epoch: 273, Step: 100, Loss: 4.527854380607605



110it [00:02, 44.31it/s][A
115it [00:02, 44.57it/s][A
120it [00:02, 44.50it/s][A
125it [00:02, 44.73it/s][A
130it [00:02, 44.71it/s][A
135it [00:03, 44.91it/s][A
140it [00:03, 44.26it/s][A
145it [00:03, 43.00it/s][A
150it [00:03, 43.55it/s][A
155it [00:03, 42.71it/s][A
160it [00:03, 43.27it/s][A
165it [00:03, 43.75it/s][A
170it [00:03, 44.28it/s][A
175it [00:03, 44.32it/s][A
180it [00:04, 44.60it/s][A
185it [00:04, 44.49it/s][A
190it [00:04, 44.42it/s][A
195it [00:04, 44.24it/s][A
200it [00:04, 44.36it/s][A
205it [00:04, 43.87it/s][A

Epoch: 273, Step: 200, Loss: 4.551146626472473



210it [00:04, 43.90it/s][A
215it [00:04, 43.28it/s][A
220it [00:04, 43.86it/s][A
227it [00:05, 44.28it/s]
 55%|█████▍    | 273/500 [32:18<24:25,  6.46s/it]
0it [00:00, ?it/s][A
4it [00:00, 39.78it/s][A
9it [00:00, 41.51it/s][A
14it [00:00, 43.52it/s][A
19it [00:00, 43.83it/s][A
24it [00:00, 44.49it/s][A
29it [00:00, 44.52it/s][A
34it [00:00, 44.75it/s][A
39it [00:00, 44.84it/s][A
44it [00:01, 43.60it/s][A
49it [00:01, 43.32it/s][A
54it [00:01, 42.73it/s][A
59it [00:01, 43.32it/s][A
64it [00:01, 43.65it/s][A
69it [00:01, 43.77it/s][A
74it [00:01, 44.32it/s][A
79it [00:01, 44.65it/s][A
84it [00:01, 44.44it/s][A
89it [00:02, 44.56it/s][A
94it [00:02, 43.79it/s][A
99it [00:02, 42.81it/s][A
104it [00:02, 42.10it/s][A

Epoch: 274, Step: 100, Loss: 4.533366260528564



109it [00:02, 42.53it/s][A
114it [00:02, 43.43it/s][A
119it [00:02, 42.37it/s][A
124it [00:02, 42.98it/s][A
129it [00:02, 43.61it/s][A
134it [00:03, 44.25it/s][A
139it [00:03, 43.61it/s][A
144it [00:03, 43.88it/s][A
149it [00:03, 42.89it/s][A
154it [00:03, 43.33it/s][A
159it [00:03, 43.94it/s][A
164it [00:03, 44.24it/s][A
169it [00:03, 44.49it/s][A
174it [00:03, 44.22it/s][A
179it [00:04, 44.20it/s][A
184it [00:04, 43.07it/s][A
189it [00:04, 42.99it/s][A
194it [00:04, 42.02it/s][A
199it [00:04, 42.90it/s][A
204it [00:04, 43.59it/s][A
209it [00:04, 44.06it/s][A

Epoch: 274, Step: 200, Loss: 4.549201800823211



214it [00:04, 44.32it/s][A
219it [00:05, 44.32it/s][A
227it [00:05, 43.64it/s]
 55%|█████▍    | 274/500 [32:24<22:54,  6.08s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.59it/s][A
10it [00:00, 45.57it/s][A
15it [00:00, 45.43it/s][A
20it [00:00, 43.83it/s][A
25it [00:00, 44.26it/s][A
30it [00:00, 44.47it/s][A
35it [00:00, 44.65it/s][A
40it [00:00, 44.59it/s][A
45it [00:01, 44.95it/s][A
50it [00:01, 44.77it/s][A
55it [00:01, 44.90it/s][A
60it [00:01, 44.95it/s][A
65it [00:01, 44.88it/s][A
70it [00:01, 45.03it/s][A
75it [00:01, 43.94it/s][A
80it [00:01, 44.29it/s][A
85it [00:01, 42.40it/s][A
90it [00:02, 43.27it/s][A
95it [00:02, 43.92it/s][A
100it [00:02, 44.32it/s][A
105it [00:02, 44.33it/s][A

Epoch: 275, Step: 100, Loss: 4.5368410587310795



110it [00:02, 44.36it/s][A
115it [00:02, 43.45it/s][A
120it [00:02, 44.01it/s][A
125it [00:02, 43.00it/s][A
130it [00:02, 43.58it/s][A
135it [00:03, 44.14it/s][A
140it [00:03, 44.48it/s][A
145it [00:03, 44.64it/s][A
150it [00:03, 44.78it/s][A
155it [00:03, 44.95it/s][A
160it [00:03, 45.00it/s][A
165it [00:03, 45.02it/s][A
170it [00:03, 44.41it/s][A
175it [00:03, 44.04it/s][A
180it [00:04, 44.00it/s][A
185it [00:04, 44.28it/s][A
190it [00:04, 43.43it/s][A
195it [00:04, 44.07it/s][A
200it [00:04, 44.58it/s][A
205it [00:04, 44.64it/s][A

Epoch: 275, Step: 200, Loss: 4.54835471868515



210it [00:04, 42.97it/s][A
215it [00:04, 43.79it/s][A
220it [00:04, 44.33it/s][A
227it [00:05, 44.28it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.02it/s][A
13it [00:00, 59.85it/s][A
20it [00:00, 60.30it/s][A
27it [00:00, 59.96it/s][A
34it [00:00, 60.26it/s][A
41it [00:00, 57.54it/s][A
47it [00:00, 55.07it/s][A
54it [00:00, 56.86it/s][A
60it [00:01, 57.67it/s][A
66it [00:01, 58.18it/s][A
73it [00:01, 58.88it/s][A
80it [00:01, 59.34it/s][A
87it [00:01, 59.80it/s][A
94it [00:01, 59.92it/s][A
101it [00:01, 60.13it/s][A
108it [00:01, 60.28it/s][A
115it [00:01, 59.90it/s][A
122it [00:02, 60.12it/s][A
129it [00:02, 59.99it/s][A
136it [00:02, 59.96it/s][A
143it [00:02, 60.25it/s][A
150it [00:02, 60.24it/s][A
157it [00:02, 60.27it/s][A
164it [00:02, 58.97it/s][A
170it [00:02, 59.08it/s][A
177it [00:02, 59.55it/s][A
184it [00:03, 59.81it/s][A
191it [00:03, 60.27it/s][A
198it [00:03, 59.62it/s][A
204it [00:03, 59.48it/s][A
210it [00:03, 57.54it/s][A
216it [00:03, 


Epoch: 275, Test Loss: 5.493361675961418, Test Perplexity: 243.9729906402019




0it [00:00, ?it/s][A
5it [00:00, 44.04it/s][A
10it [00:00, 44.36it/s][A
15it [00:00, 44.53it/s][A
20it [00:00, 44.69it/s][A
25it [00:00, 44.78it/s][A
30it [00:00, 44.49it/s][A
35it [00:00, 44.77it/s][A
40it [00:00, 44.46it/s][A
45it [00:01, 44.69it/s][A
50it [00:01, 44.76it/s][A
55it [00:01, 44.20it/s][A
60it [00:01, 44.26it/s][A
65it [00:01, 43.25it/s][A
70it [00:01, 43.61it/s][A
75it [00:01, 43.94it/s][A
80it [00:01, 44.17it/s][A
85it [00:01, 43.70it/s][A
90it [00:02, 42.99it/s][A
95it [00:02, 41.96it/s][A
100it [00:02, 42.39it/s][A
105it [00:02, 42.76it/s][A

Epoch: 276, Step: 100, Loss: 4.5317529296875



110it [00:02, 42.86it/s][A
115it [00:02, 43.20it/s][A
120it [00:02, 43.46it/s][A
125it [00:02, 43.90it/s][A
130it [00:02, 44.32it/s][A
135it [00:03, 44.67it/s][A
140it [00:03, 45.02it/s][A
145it [00:03, 45.10it/s][A
150it [00:03, 45.44it/s][A
155it [00:03, 45.10it/s][A
160it [00:03, 45.12it/s][A
165it [00:03, 45.09it/s][A
170it [00:03, 44.58it/s][A
175it [00:03, 44.47it/s][A
180it [00:04, 44.60it/s][A
185it [00:04, 44.83it/s][A
190it [00:04, 44.95it/s][A
195it [00:04, 44.97it/s][A
200it [00:04, 44.65it/s][A
205it [00:04, 44.29it/s][A

Epoch: 276, Step: 200, Loss: 4.548237795829773



210it [00:04, 44.30it/s][A
215it [00:04, 44.60it/s][A
220it [00:04, 44.87it/s][A
227it [00:05, 44.27it/s]
 55%|█████▌    | 276/500 [32:45<29:29,  7.90s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.21it/s][A
10it [00:00, 44.47it/s][A
15it [00:00, 44.84it/s][A
20it [00:00, 43.21it/s][A
25it [00:00, 43.50it/s][A
30it [00:00, 43.77it/s][A
35it [00:00, 44.28it/s][A
40it [00:00, 43.31it/s][A
45it [00:01, 43.51it/s][A
50it [00:01, 43.93it/s][A
55it [00:01, 44.63it/s][A
60it [00:01, 44.98it/s][A
65it [00:01, 44.57it/s][A
70it [00:01, 44.52it/s][A
75it [00:01, 44.39it/s][A
80it [00:01, 44.82it/s][A
85it [00:01, 44.74it/s][A
90it [00:02, 44.74it/s][A
95it [00:02, 43.67it/s][A
100it [00:02, 43.39it/s][A
105it [00:02, 43.72it/s][A

Epoch: 277, Step: 100, Loss: 4.527586526870728



110it [00:02, 43.50it/s][A
115it [00:02, 42.20it/s][A
120it [00:02, 42.52it/s][A
125it [00:02, 43.27it/s][A
130it [00:02, 43.49it/s][A
135it [00:03, 43.87it/s][A
140it [00:03, 44.35it/s][A
145it [00:03, 44.36it/s][A
150it [00:03, 44.14it/s][A
155it [00:03, 43.91it/s][A
160it [00:03, 44.08it/s][A
165it [00:03, 43.24it/s][A
170it [00:03, 44.02it/s][A
175it [00:03, 43.70it/s][A
180it [00:04, 44.32it/s][A
185it [00:04, 44.68it/s][A
190it [00:04, 44.91it/s][A
195it [00:04, 43.63it/s][A
200it [00:04, 44.01it/s][A
205it [00:04, 44.38it/s][A

Epoch: 277, Step: 200, Loss: 4.5471322631835935



210it [00:04, 44.66it/s][A
215it [00:04, 43.63it/s][A
220it [00:05, 43.91it/s][A
227it [00:05, 43.91it/s]
 55%|█████▌    | 277/500 [32:50<26:18,  7.08s/it]
0it [00:00, ?it/s][A
5it [00:00, 41.90it/s][A
10it [00:00, 44.12it/s][A
15it [00:00, 44.49it/s][A
20it [00:00, 42.19it/s][A
25it [00:00, 42.70it/s][A
30it [00:00, 43.55it/s][A
35it [00:00, 44.02it/s][A
40it [00:00, 44.25it/s][A
45it [00:01, 42.78it/s][A
50it [00:01, 42.74it/s][A
55it [00:01, 43.53it/s][A
60it [00:01, 43.81it/s][A
65it [00:01, 42.35it/s][A
70it [00:01, 42.06it/s][A
75it [00:01, 42.82it/s][A
80it [00:01, 43.46it/s][A
85it [00:01, 43.78it/s][A
90it [00:02, 42.97it/s][A
95it [00:02, 43.57it/s][A
100it [00:02, 43.54it/s][A
105it [00:02, 43.58it/s][A

Epoch: 278, Step: 100, Loss: 4.537404127120972



110it [00:02, 43.74it/s][A
115it [00:02, 43.57it/s][A
120it [00:02, 44.08it/s][A
125it [00:02, 44.30it/s][A
130it [00:02, 43.37it/s][A
135it [00:03, 43.91it/s][A
140it [00:03, 44.22it/s][A
145it [00:03, 44.22it/s][A
150it [00:03, 43.64it/s][A
155it [00:03, 42.96it/s][A
160it [00:03, 42.49it/s][A
165it [00:03, 43.18it/s][A
170it [00:03, 43.73it/s][A
175it [00:04, 43.87it/s][A
180it [00:04, 44.35it/s][A
185it [00:04, 44.79it/s][A
190it [00:04, 44.94it/s][A
195it [00:04, 45.12it/s][A
200it [00:04, 45.23it/s][A
205it [00:04, 44.94it/s][A

Epoch: 278, Step: 200, Loss: 4.546557941436768



210it [00:04, 44.63it/s][A
215it [00:04, 43.81it/s][A
220it [00:05, 44.45it/s][A
227it [00:05, 43.68it/s]
 56%|█████▌    | 278/500 [32:55<24:06,  6.52s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.52it/s][A
10it [00:00, 44.97it/s][A
15it [00:00, 45.13it/s][A
20it [00:00, 45.01it/s][A
25it [00:00, 45.21it/s][A
30it [00:00, 45.40it/s][A
35it [00:00, 45.11it/s][A
40it [00:00, 44.20it/s][A
45it [00:01, 44.23it/s][A
50it [00:01, 43.85it/s][A
55it [00:01, 44.09it/s][A
60it [00:01, 44.22it/s][A
65it [00:01, 44.34it/s][A
70it [00:01, 44.59it/s][A
75it [00:01, 44.96it/s][A
80it [00:01, 45.07it/s][A
85it [00:01, 43.66it/s][A
90it [00:02, 44.00it/s][A
95it [00:02, 44.08it/s][A
100it [00:02, 44.17it/s][A
105it [00:02, 44.02it/s][A

Epoch: 279, Step: 100, Loss: 4.5342765188217165



110it [00:02, 43.93it/s][A
115it [00:02, 44.02it/s][A
120it [00:02, 43.84it/s][A
125it [00:02, 44.44it/s][A
130it [00:02, 44.84it/s][A
135it [00:03, 44.52it/s][A
140it [00:03, 44.78it/s][A
145it [00:03, 44.99it/s][A
150it [00:03, 45.16it/s][A
155it [00:03, 45.08it/s][A
160it [00:03, 45.14it/s][A
165it [00:03, 45.39it/s][A
170it [00:03, 44.45it/s][A
175it [00:03, 43.65it/s][A
180it [00:04, 44.36it/s][A
185it [00:04, 44.69it/s][A
190it [00:04, 44.84it/s][A
195it [00:04, 44.91it/s][A
200it [00:04, 44.95it/s][A
205it [00:04, 44.87it/s][A

Epoch: 279, Step: 200, Loss: 4.54843900680542



210it [00:04, 44.91it/s][A
215it [00:04, 45.04it/s][A
220it [00:04, 44.91it/s][A
227it [00:05, 44.61it/s]
 56%|█████▌    | 279/500 [33:00<22:25,  6.09s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.98it/s][A
10it [00:00, 45.12it/s][A
15it [00:00, 45.36it/s][A
20it [00:00, 43.61it/s][A
25it [00:00, 44.37it/s][A
30it [00:00, 44.53it/s][A
35it [00:00, 44.98it/s][A
40it [00:00, 44.44it/s][A
45it [00:01, 44.06it/s][A
50it [00:01, 44.73it/s][A
55it [00:01, 44.74it/s][A
60it [00:01, 45.15it/s][A
65it [00:01, 44.67it/s][A
70it [00:01, 45.19it/s][A
75it [00:01, 45.69it/s][A
80it [00:01, 45.96it/s][A
85it [00:01, 46.12it/s][A
90it [00:01, 46.22it/s][A
95it [00:02, 45.29it/s][A
100it [00:02, 44.88it/s][A
105it [00:02, 43.99it/s][A

Epoch: 280, Step: 100, Loss: 4.540381832122803



110it [00:02, 43.57it/s][A
115it [00:02, 44.09it/s][A
120it [00:02, 43.59it/s][A
125it [00:02, 44.33it/s][A
130it [00:02, 44.74it/s][A
135it [00:03, 44.73it/s][A
140it [00:03, 44.64it/s][A
145it [00:03, 43.67it/s][A
150it [00:03, 44.12it/s][A
155it [00:03, 44.45it/s][A
160it [00:03, 43.74it/s][A
165it [00:03, 44.38it/s][A
170it [00:03, 44.85it/s][A
175it [00:03, 43.78it/s][A
180it [00:04, 44.22it/s][A
185it [00:04, 44.53it/s][A
190it [00:04, 43.80it/s][A
195it [00:04, 44.32it/s][A
200it [00:04, 44.90it/s][A
205it [00:04, 45.16it/s][A

Epoch: 280, Step: 200, Loss: 4.5468691945075985



210it [00:04, 45.04it/s][A
215it [00:04, 45.73it/s][A
220it [00:04, 46.33it/s][A
227it [00:05, 44.76it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.22it/s][A
13it [00:00, 59.78it/s][A
20it [00:00, 60.47it/s][A
27it [00:00, 60.26it/s][A
34it [00:00, 60.15it/s][A
41it [00:00, 60.32it/s][A
48it [00:00, 58.72it/s][A
54it [00:00, 59.04it/s][A
61it [00:01, 59.52it/s][A
68it [00:01, 60.01it/s][A
75it [00:01, 59.88it/s][A
81it [00:01, 58.91it/s][A
87it [00:01, 57.05it/s][A
93it [00:01, 57.84it/s][A
100it [00:01, 58.90it/s][A
107it [00:01, 59.46it/s][A
113it [00:01, 59.47it/s][A
119it [00:02, 59.58it/s][A
125it [00:02, 59.41it/s][A
131it [00:02, 59.50it/s][A
137it [00:02, 57.26it/s][A
143it [00:02, 57.83it/s][A
149it [00:02, 58.39it/s][A
155it [00:02, 58.83it/s][A
161it [00:02, 58.94it/s][A
167it [00:02, 57.43it/s][A
173it [00:02, 58.16it/s][A
179it [00:03, 58.42it/s][A
185it [00:03, 58.57it/s][A
192it [00:03, 59.13it/s][A
198it [00:03, 57.72it/s][A
204it [00:03, 


Epoch: 280, Test Loss: 5.498955644435764, Test Perplexity: 245.42764059505106




0it [00:00, ?it/s][A
5it [00:00, 43.95it/s][A
10it [00:00, 43.96it/s][A
15it [00:00, 44.38it/s][A
20it [00:00, 44.43it/s][A
25it [00:00, 44.60it/s][A
30it [00:00, 44.84it/s][A
35it [00:00, 44.73it/s][A
40it [00:00, 44.64it/s][A
45it [00:01, 44.54it/s][A
50it [00:01, 43.45it/s][A
55it [00:01, 43.86it/s][A
60it [00:01, 44.15it/s][A
65it [00:01, 44.46it/s][A
70it [00:01, 44.75it/s][A
75it [00:01, 44.83it/s][A
80it [00:01, 44.83it/s][A
85it [00:01, 44.97it/s][A
90it [00:02, 44.30it/s][A
95it [00:02, 44.39it/s][A
100it [00:02, 44.43it/s][A
105it [00:02, 44.62it/s][A

Epoch: 281, Step: 100, Loss: 4.532222437858581



110it [00:02, 44.64it/s][A
115it [00:02, 44.82it/s][A
120it [00:02, 44.87it/s][A
125it [00:02, 44.79it/s][A
130it [00:02, 45.04it/s][A
135it [00:03, 44.82it/s][A
140it [00:03, 44.12it/s][A
145it [00:03, 44.58it/s][A
150it [00:03, 44.89it/s][A
155it [00:03, 44.58it/s][A
160it [00:03, 44.69it/s][A
165it [00:03, 43.70it/s][A
170it [00:03, 44.26it/s][A
175it [00:03, 44.45it/s][A
180it [00:04, 44.75it/s][A
185it [00:04, 44.73it/s][A
190it [00:04, 44.74it/s][A
195it [00:04, 44.96it/s][A
200it [00:04, 44.84it/s][A
205it [00:04, 44.01it/s][A

Epoch: 281, Step: 200, Loss: 4.544533061981201



210it [00:04, 43.23it/s][A
215it [00:04, 44.04it/s][A
220it [00:04, 44.15it/s][A
227it [00:05, 44.44it/s]
 56%|█████▌    | 281/500 [33:21<28:48,  7.89s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.11it/s][A
10it [00:00, 45.63it/s][A
15it [00:00, 45.27it/s][A
20it [00:00, 44.03it/s][A
25it [00:00, 44.55it/s][A
30it [00:00, 45.02it/s][A
35it [00:00, 43.56it/s][A
40it [00:00, 43.47it/s][A
45it [00:01, 44.03it/s][A
50it [00:01, 44.34it/s][A
55it [00:01, 44.23it/s][A
60it [00:01, 44.49it/s][A
65it [00:01, 44.89it/s][A
70it [00:01, 44.50it/s][A
75it [00:01, 44.77it/s][A
80it [00:01, 44.99it/s][A
85it [00:01, 45.08it/s][A
90it [00:02, 45.33it/s][A
95it [00:02, 45.31it/s][A
100it [00:02, 45.16it/s][A
105it [00:02, 44.81it/s][A

Epoch: 282, Step: 100, Loss: 4.532097406387329



110it [00:02, 43.78it/s][A
115it [00:02, 42.23it/s][A
120it [00:02, 43.13it/s][A
125it [00:02, 43.50it/s][A
130it [00:02, 43.97it/s][A
135it [00:03, 43.08it/s][A
140it [00:03, 42.63it/s][A
145it [00:03, 42.14it/s][A
150it [00:03, 43.10it/s][A
155it [00:03, 42.67it/s][A
160it [00:03, 43.52it/s][A
165it [00:03, 44.01it/s][A
170it [00:03, 43.64it/s][A
175it [00:03, 43.37it/s][A
180it [00:04, 43.78it/s][A
185it [00:04, 44.07it/s][A
190it [00:04, 44.43it/s][A
195it [00:04, 44.60it/s][A
200it [00:04, 44.59it/s][A
205it [00:04, 44.77it/s][A

Epoch: 282, Step: 200, Loss: 4.545292525291443



210it [00:04, 43.21it/s][A
215it [00:04, 42.86it/s][A
220it [00:05, 42.77it/s][A
227it [00:05, 43.74it/s]
 56%|█████▋    | 282/500 [33:27<25:44,  7.08s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.74it/s][A
10it [00:00, 44.27it/s][A
15it [00:00, 42.83it/s][A
20it [00:00, 43.97it/s][A
25it [00:00, 44.62it/s][A
30it [00:00, 44.81it/s][A
35it [00:00, 44.68it/s][A
40it [00:00, 44.54it/s][A
45it [00:01, 44.02it/s][A
50it [00:01, 42.86it/s][A
55it [00:01, 43.55it/s][A
60it [00:01, 43.97it/s][A
65it [00:01, 43.16it/s][A
70it [00:01, 43.66it/s][A
75it [00:01, 43.96it/s][A
80it [00:01, 44.31it/s][A
85it [00:01, 44.68it/s][A
90it [00:02, 44.90it/s][A
95it [00:02, 44.86it/s][A
100it [00:02, 44.65it/s][A
105it [00:02, 45.00it/s][A

Epoch: 283, Step: 100, Loss: 4.530879883766175



110it [00:02, 45.19it/s][A
115it [00:02, 45.15it/s][A
120it [00:02, 44.73it/s][A
125it [00:02, 44.77it/s][A
130it [00:02, 43.82it/s][A
135it [00:03, 44.17it/s][A
140it [00:03, 44.27it/s][A
145it [00:03, 44.38it/s][A
150it [00:03, 44.50it/s][A
155it [00:03, 44.68it/s][A
160it [00:03, 44.54it/s][A
165it [00:03, 44.33it/s][A
170it [00:03, 44.31it/s][A
175it [00:03, 44.33it/s][A
180it [00:04, 44.57it/s][A
185it [00:04, 44.53it/s][A
190it [00:04, 43.32it/s][A
195it [00:04, 43.84it/s][A
200it [00:04, 43.65it/s][A
205it [00:04, 44.29it/s][A

Epoch: 283, Step: 200, Loss: 4.5469523048400875



210it [00:04, 44.08it/s][A
215it [00:04, 44.54it/s][A
220it [00:04, 43.35it/s][A
227it [00:05, 44.14it/s]
 57%|█████▋    | 283/500 [33:32<23:30,  6.50s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.74it/s][A
10it [00:00, 44.78it/s][A
15it [00:00, 43.06it/s][A
20it [00:00, 44.30it/s][A
25it [00:00, 43.79it/s][A
30it [00:00, 44.34it/s][A
35it [00:00, 44.45it/s][A
40it [00:00, 44.62it/s][A
45it [00:01, 44.66it/s][A
50it [00:01, 43.58it/s][A
55it [00:01, 44.39it/s][A
60it [00:01, 44.91it/s][A
65it [00:01, 44.93it/s][A
70it [00:01, 44.94it/s][A
75it [00:01, 45.04it/s][A
80it [00:01, 44.93it/s][A
85it [00:01, 45.07it/s][A
90it [00:02, 44.87it/s][A
95it [00:02, 45.03it/s][A
100it [00:02, 45.30it/s][A
105it [00:02, 45.58it/s][A

Epoch: 284, Step: 100, Loss: 4.537372999191284



110it [00:02, 45.33it/s][A
115it [00:02, 45.49it/s][A
120it [00:02, 45.56it/s][A
125it [00:02, 45.54it/s][A
130it [00:02, 45.45it/s][A
135it [00:03, 45.43it/s][A
140it [00:03, 45.39it/s][A
145it [00:03, 45.26it/s][A
150it [00:03, 45.13it/s][A
155it [00:03, 45.19it/s][A
160it [00:03, 45.00it/s][A
165it [00:03, 45.15it/s][A
170it [00:03, 45.15it/s][A
175it [00:03, 45.24it/s][A
180it [00:04, 45.35it/s][A
185it [00:04, 45.64it/s][A
190it [00:04, 45.01it/s][A
195it [00:04, 45.49it/s][A
200it [00:04, 45.21it/s][A
205it [00:04, 45.60it/s][A

Epoch: 284, Step: 200, Loss: 4.5488072824478145



210it [00:04, 45.38it/s][A
215it [00:04, 45.42it/s][A
220it [00:04, 45.49it/s][A
227it [00:05, 44.97it/s]
 57%|█████▋    | 284/500 [33:37<21:50,  6.07s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.57it/s][A
10it [00:00, 44.54it/s][A
15it [00:00, 43.21it/s][A
20it [00:00, 44.15it/s][A
25it [00:00, 44.39it/s][A
30it [00:00, 43.79it/s][A
35it [00:00, 43.48it/s][A
40it [00:00, 43.88it/s][A
45it [00:01, 44.00it/s][A
50it [00:01, 44.36it/s][A
55it [00:01, 44.37it/s][A
60it [00:01, 44.30it/s][A
65it [00:01, 44.51it/s][A
70it [00:01, 44.71it/s][A
75it [00:01, 45.08it/s][A
80it [00:01, 45.39it/s][A
85it [00:01, 44.51it/s][A
90it [00:02, 44.66it/s][A
95it [00:02, 44.90it/s][A
100it [00:02, 44.81it/s][A
105it [00:02, 44.34it/s][A

Epoch: 285, Step: 100, Loss: 4.535392580032348



110it [00:02, 44.63it/s][A
115it [00:02, 44.74it/s][A
120it [00:02, 44.69it/s][A
125it [00:02, 44.64it/s][A
130it [00:02, 44.03it/s][A
135it [00:03, 44.43it/s][A
140it [00:03, 44.72it/s][A
145it [00:03, 44.73it/s][A
150it [00:03, 44.83it/s][A
155it [00:03, 44.83it/s][A
160it [00:03, 44.70it/s][A
165it [00:03, 44.67it/s][A
170it [00:03, 44.62it/s][A
175it [00:03, 44.55it/s][A
180it [00:04, 44.37it/s][A
185it [00:04, 44.61it/s][A
190it [00:04, 44.76it/s][A
195it [00:04, 43.77it/s][A
200it [00:04, 43.98it/s][A
205it [00:04, 43.96it/s][A

Epoch: 285, Step: 200, Loss: 4.543766644001007



210it [00:04, 44.05it/s][A
215it [00:04, 44.36it/s][A
220it [00:04, 44.30it/s][A
227it [00:05, 44.44it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.64it/s][A
12it [00:00, 59.08it/s][A
18it [00:00, 59.46it/s][A
24it [00:00, 59.37it/s][A
30it [00:00, 58.99it/s][A
36it [00:00, 59.03it/s][A
43it [00:00, 59.56it/s][A
49it [00:00, 58.08it/s][A
55it [00:00, 58.39it/s][A
62it [00:01, 59.14it/s][A
68it [00:01, 58.93it/s][A
74it [00:01, 58.25it/s][A
80it [00:01, 58.57it/s][A
86it [00:01, 58.58it/s][A
92it [00:01, 58.76it/s][A
98it [00:01, 57.36it/s][A
104it [00:01, 56.11it/s][A
110it [00:01, 57.04it/s][A
116it [00:01, 57.33it/s][A
122it [00:02, 57.79it/s][A
129it [00:02, 58.73it/s][A
135it [00:02, 58.96it/s][A
141it [00:02, 59.24it/s][A
147it [00:02, 56.22it/s][A
153it [00:02, 56.99it/s][A
159it [00:02, 57.84it/s][A
165it [00:02, 58.15it/s][A
171it [00:02, 55.92it/s][A
177it [00:03, 56.52it/s][A
184it [00:03, 57.84it/s][A
190it [00:03, 58.40it/s][A
196it [00:03, 58


Epoch: 285, Test Loss: 5.493650142450511, Test Perplexity: 244.1432550086738




0it [00:00, ?it/s][A
4it [00:00, 38.30it/s][A
9it [00:00, 40.65it/s][A
14it [00:00, 43.04it/s][A
19it [00:00, 43.98it/s][A
24it [00:00, 44.38it/s][A
29it [00:00, 44.85it/s][A
34it [00:00, 45.14it/s][A
39it [00:00, 45.27it/s][A
44it [00:00, 45.39it/s][A
49it [00:01, 45.28it/s][A
54it [00:01, 45.41it/s][A
59it [00:01, 44.22it/s][A
64it [00:01, 44.68it/s][A
69it [00:01, 44.85it/s][A
74it [00:01, 44.90it/s][A
79it [00:01, 45.29it/s][A
84it [00:01, 45.39it/s][A
89it [00:01, 45.32it/s][A
94it [00:02, 44.98it/s][A
99it [00:02, 44.66it/s][A
104it [00:02, 44.34it/s][A

Epoch: 286, Step: 100, Loss: 4.530690913200378



109it [00:02, 44.05it/s][A
114it [00:02, 43.77it/s][A
119it [00:02, 44.00it/s][A
124it [00:02, 44.14it/s][A
129it [00:02, 44.15it/s][A
134it [00:03, 43.24it/s][A
139it [00:03, 43.41it/s][A
144it [00:03, 42.47it/s][A
149it [00:03, 43.22it/s][A
154it [00:03, 43.63it/s][A
159it [00:03, 44.01it/s][A
164it [00:03, 43.19it/s][A
169it [00:03, 43.50it/s][A
174it [00:03, 43.95it/s][A
179it [00:04, 44.25it/s][A
184it [00:04, 44.53it/s][A
189it [00:04, 44.82it/s][A
194it [00:04, 44.82it/s][A
199it [00:04, 44.81it/s][A
204it [00:04, 45.08it/s][A

Epoch: 286, Step: 200, Loss: 4.5432322573661805



209it [00:04, 43.35it/s][A
214it [00:04, 43.00it/s][A
219it [00:04, 43.33it/s][A
227it [00:05, 44.14it/s]
 57%|█████▋    | 286/500 [33:58<28:09,  7.89s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.16it/s][A
10it [00:00, 45.45it/s][A
15it [00:00, 45.47it/s][A
20it [00:00, 45.11it/s][A
25it [00:00, 45.03it/s][A
30it [00:00, 45.03it/s][A
35it [00:00, 45.02it/s][A
40it [00:00, 44.94it/s][A
45it [00:01, 43.79it/s][A
50it [00:01, 44.22it/s][A
55it [00:01, 43.92it/s][A
60it [00:01, 42.94it/s][A
65it [00:01, 43.61it/s][A
70it [00:01, 43.88it/s][A
75it [00:01, 43.72it/s][A
80it [00:01, 44.09it/s][A
85it [00:01, 42.09it/s][A
90it [00:02, 41.77it/s][A
95it [00:02, 42.57it/s][A
100it [00:02, 43.29it/s][A
105it [00:02, 43.82it/s][A

Epoch: 287, Step: 100, Loss: 4.52142867565155



110it [00:02, 43.32it/s][A
115it [00:02, 43.35it/s][A
120it [00:02, 43.61it/s][A
125it [00:02, 43.84it/s][A
130it [00:02, 44.05it/s][A
135it [00:03, 44.10it/s][A
140it [00:03, 44.23it/s][A
145it [00:03, 44.44it/s][A
150it [00:03, 44.61it/s][A
155it [00:03, 44.92it/s][A
160it [00:03, 44.97it/s][A
165it [00:03, 45.24it/s][A
170it [00:03, 45.44it/s][A
175it [00:03, 45.35it/s][A
180it [00:04, 45.47it/s][A
185it [00:04, 45.35it/s][A
190it [00:04, 45.47it/s][A
195it [00:04, 45.33it/s][A
200it [00:04, 43.72it/s][A
205it [00:04, 44.31it/s][A

Epoch: 287, Step: 200, Loss: 4.544605741500854



210it [00:04, 44.71it/s][A
215it [00:04, 45.08it/s][A
220it [00:04, 45.29it/s][A
227it [00:05, 44.33it/s]
 57%|█████▋    | 287/500 [34:03<25:04,  7.06s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.51it/s][A
10it [00:00, 44.14it/s][A
15it [00:00, 44.53it/s][A
20it [00:00, 41.89it/s][A
25it [00:00, 42.91it/s][A
30it [00:00, 43.49it/s][A
35it [00:00, 43.77it/s][A
40it [00:00, 44.18it/s][A
45it [00:01, 44.19it/s][A
50it [00:01, 44.39it/s][A
55it [00:01, 44.03it/s][A
60it [00:01, 44.01it/s][A
65it [00:01, 44.26it/s][A
70it [00:01, 44.57it/s][A
75it [00:01, 44.70it/s][A
80it [00:01, 44.80it/s][A
85it [00:01, 44.88it/s][A
90it [00:02, 45.13it/s][A
95it [00:02, 44.57it/s][A
100it [00:02, 44.82it/s][A
105it [00:02, 45.06it/s][A

Epoch: 288, Step: 100, Loss: 4.521958885192871



110it [00:02, 45.36it/s][A
115it [00:02, 44.49it/s][A
120it [00:02, 45.13it/s][A
125it [00:02, 44.15it/s][A
130it [00:02, 44.93it/s][A
135it [00:03, 44.89it/s][A
140it [00:03, 45.22it/s][A
145it [00:03, 44.38it/s][A
150it [00:03, 44.95it/s][A
155it [00:03, 45.15it/s][A
160it [00:03, 45.02it/s][A
165it [00:03, 45.13it/s][A
170it [00:03, 44.91it/s][A
175it [00:03, 44.46it/s][A
180it [00:04, 44.97it/s][A
185it [00:04, 45.05it/s][A
190it [00:04, 44.38it/s][A
195it [00:04, 44.98it/s][A
200it [00:04, 45.38it/s][A
205it [00:04, 45.67it/s][A

Epoch: 288, Step: 200, Loss: 4.541720242500305



210it [00:04, 45.73it/s][A
215it [00:04, 45.72it/s][A
220it [00:04, 45.85it/s][A
227it [00:05, 44.75it/s]
 58%|█████▊    | 288/500 [34:08<22:50,  6.47s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.28it/s][A
10it [00:00, 45.57it/s][A
15it [00:00, 45.42it/s][A
20it [00:00, 45.55it/s][A
25it [00:00, 45.26it/s][A
30it [00:00, 45.54it/s][A
35it [00:00, 45.60it/s][A
40it [00:00, 44.71it/s][A
45it [00:00, 45.16it/s][A
50it [00:01, 44.32it/s][A
55it [00:01, 44.53it/s][A
60it [00:01, 44.77it/s][A
65it [00:01, 45.23it/s][A
70it [00:01, 44.98it/s][A
75it [00:01, 45.05it/s][A
80it [00:01, 45.57it/s][A
85it [00:01, 45.53it/s][A
90it [00:01, 44.56it/s][A
95it [00:02, 44.88it/s][A
100it [00:02, 45.12it/s][A
105it [00:02, 45.32it/s][A

Epoch: 289, Step: 100, Loss: 4.537022233009338



110it [00:02, 45.09it/s][A
115it [00:02, 45.12it/s][A
120it [00:02, 45.24it/s][A
125it [00:02, 45.30it/s][A
130it [00:02, 44.89it/s][A
135it [00:02, 44.89it/s][A
140it [00:03, 45.07it/s][A
145it [00:03, 45.25it/s][A
150it [00:03, 45.59it/s][A
155it [00:03, 44.94it/s][A
160it [00:03, 45.17it/s][A
165it [00:03, 45.35it/s][A
170it [00:03, 45.21it/s][A
175it [00:03, 45.20it/s][A
180it [00:03, 45.18it/s][A
185it [00:04, 44.83it/s][A
190it [00:04, 44.68it/s][A
195it [00:04, 44.65it/s][A
200it [00:04, 44.89it/s][A
205it [00:04, 45.01it/s][A

Epoch: 289, Step: 200, Loss: 4.540957114696503



210it [00:04, 44.80it/s][A
215it [00:04, 45.07it/s][A
220it [00:04, 45.12it/s][A
227it [00:05, 45.02it/s]
 58%|█████▊    | 289/500 [34:13<21:14,  6.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.91it/s][A
10it [00:00, 45.63it/s][A
15it [00:00, 45.10it/s][A
20it [00:00, 43.26it/s][A
25it [00:00, 44.30it/s][A
30it [00:00, 45.01it/s][A
35it [00:00, 45.29it/s][A
40it [00:00, 45.16it/s][A
45it [00:01, 45.06it/s][A
50it [00:01, 45.24it/s][A
55it [00:01, 43.56it/s][A
60it [00:01, 44.38it/s][A
65it [00:01, 44.61it/s][A
70it [00:01, 45.09it/s][A
75it [00:01, 45.32it/s][A
80it [00:01, 45.47it/s][A
85it [00:01, 45.46it/s][A
90it [00:02, 45.24it/s][A
95it [00:02, 45.24it/s][A
100it [00:02, 45.23it/s][A
105it [00:02, 45.07it/s][A

Epoch: 290, Step: 100, Loss: 4.52874342918396



110it [00:02, 44.97it/s][A
115it [00:02, 44.25it/s][A
120it [00:02, 44.83it/s][A
125it [00:02, 44.33it/s][A
130it [00:02, 44.83it/s][A
135it [00:03, 44.75it/s][A
140it [00:03, 45.04it/s][A
145it [00:03, 44.82it/s][A
150it [00:03, 45.05it/s][A
155it [00:03, 45.26it/s][A
160it [00:03, 45.51it/s][A
165it [00:03, 44.32it/s][A
170it [00:03, 44.80it/s][A
175it [00:03, 44.07it/s][A
180it [00:04, 44.74it/s][A
185it [00:04, 44.93it/s][A
190it [00:04, 45.16it/s][A
195it [00:04, 45.05it/s][A
200it [00:04, 45.18it/s][A
205it [00:04, 45.33it/s][A

Epoch: 290, Step: 200, Loss: 4.540789709091187



210it [00:04, 43.73it/s][A
215it [00:04, 42.25it/s][A
220it [00:04, 43.43it/s][A
227it [00:05, 44.65it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.66it/s][A
13it [00:00, 59.52it/s][A
20it [00:00, 60.06it/s][A
27it [00:00, 59.95it/s][A
34it [00:00, 60.14it/s][A
41it [00:00, 60.20it/s][A
48it [00:00, 60.26it/s][A
55it [00:00, 59.38it/s][A
61it [00:01, 59.40it/s][A
68it [00:01, 59.92it/s][A
74it [00:01, 59.92it/s][A
80it [00:01, 59.55it/s][A
87it [00:01, 59.89it/s][A
93it [00:01, 59.70it/s][A
100it [00:01, 59.98it/s][A
106it [00:01, 58.94it/s][A
112it [00:01, 59.16it/s][A
118it [00:01, 56.33it/s][A
124it [00:02, 57.33it/s][A
131it [00:02, 58.21it/s][A
137it [00:02, 58.65it/s][A
143it [00:02, 58.88it/s][A
149it [00:02, 58.95it/s][A
155it [00:02, 59.11it/s][A
161it [00:02, 59.23it/s][A
167it [00:02, 59.36it/s][A
174it [00:02, 59.73it/s][A
181it [00:03, 59.92it/s][A
187it [00:03, 59.75it/s][A
194it [00:03, 59.98it/s][A
200it [00:03, 59.94it/s][A
206it [00:03, 


Epoch: 290, Test Loss: 5.494730628795505, Test Perplexity: 244.38979358554627




0it [00:00, ?it/s][A
5it [00:00, 46.18it/s][A
10it [00:00, 45.02it/s][A
15it [00:00, 45.42it/s][A
20it [00:00, 45.84it/s][A
25it [00:00, 45.72it/s][A
30it [00:00, 45.92it/s][A
35it [00:00, 45.86it/s][A
40it [00:00, 45.23it/s][A
45it [00:00, 45.54it/s][A
50it [00:01, 45.47it/s][A
55it [00:01, 44.08it/s][A
60it [00:01, 44.39it/s][A
65it [00:01, 44.72it/s][A
70it [00:01, 44.64it/s][A
75it [00:01, 45.05it/s][A
80it [00:01, 43.99it/s][A
85it [00:01, 44.10it/s][A
90it [00:02, 44.16it/s][A
95it [00:02, 44.34it/s][A
100it [00:02, 43.66it/s][A
105it [00:02, 42.13it/s][A

Epoch: 291, Step: 100, Loss: 4.52157389163971



110it [00:02, 42.80it/s][A
115it [00:02, 43.48it/s][A
120it [00:02, 44.20it/s][A
125it [00:02, 44.39it/s][A
130it [00:02, 44.74it/s][A
135it [00:03, 44.91it/s][A
140it [00:03, 44.92it/s][A
145it [00:03, 44.63it/s][A
150it [00:03, 44.90it/s][A
155it [00:03, 45.07it/s][A
160it [00:03, 45.13it/s][A
165it [00:03, 45.25it/s][A
170it [00:03, 45.23it/s][A
175it [00:03, 45.23it/s][A
180it [00:04, 45.17it/s][A
185it [00:04, 45.22it/s][A
190it [00:04, 45.17it/s][A
195it [00:04, 45.03it/s][A
200it [00:04, 44.91it/s][A
205it [00:04, 45.03it/s][A

Epoch: 291, Step: 200, Loss: 4.539892587661743



210it [00:04, 44.81it/s][A
215it [00:04, 44.35it/s][A
220it [00:04, 44.28it/s][A
227it [00:05, 44.55it/s]
 58%|█████▊    | 291/500 [34:34<27:19,  7.85s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.29it/s][A
10it [00:00, 43.18it/s][A
15it [00:00, 44.46it/s][A
20it [00:00, 44.99it/s][A
25it [00:00, 45.15it/s][A
30it [00:00, 45.04it/s][A
35it [00:00, 45.05it/s][A
40it [00:00, 44.79it/s][A
45it [00:01, 44.88it/s][A
50it [00:01, 45.12it/s][A
55it [00:01, 45.14it/s][A
60it [00:01, 44.78it/s][A
65it [00:01, 44.76it/s][A
70it [00:01, 45.00it/s][A
75it [00:01, 45.09it/s][A
80it [00:01, 45.16it/s][A
85it [00:01, 45.25it/s][A
90it [00:02, 45.31it/s][A
95it [00:02, 45.39it/s][A
100it [00:02, 45.55it/s][A
105it [00:02, 45.71it/s][A

Epoch: 292, Step: 100, Loss: 4.529580020904541



110it [00:02, 45.58it/s][A
115it [00:02, 45.52it/s][A
120it [00:02, 45.58it/s][A
125it [00:02, 44.47it/s][A
130it [00:02, 44.63it/s][A
135it [00:03, 44.73it/s][A
140it [00:03, 44.99it/s][A
145it [00:03, 45.23it/s][A
150it [00:03, 44.91it/s][A
155it [00:03, 45.05it/s][A
160it [00:03, 45.12it/s][A
165it [00:03, 44.02it/s][A
170it [00:03, 44.70it/s][A
175it [00:03, 44.71it/s][A
180it [00:04, 43.31it/s][A
185it [00:04, 44.12it/s][A
190it [00:04, 44.96it/s][A
195it [00:04, 45.01it/s][A
200it [00:04, 45.43it/s][A
205it [00:04, 44.46it/s][A

Epoch: 292, Step: 200, Loss: 4.540905771255493



210it [00:04, 44.23it/s][A
215it [00:04, 45.03it/s][A
220it [00:04, 44.34it/s][A
227it [00:05, 44.87it/s]
 58%|█████▊    | 292/500 [34:39<24:18,  7.01s/it]
0it [00:00, ?it/s][A
5it [00:00, 41.98it/s][A
10it [00:00, 43.94it/s][A
15it [00:00, 45.07it/s][A
20it [00:00, 43.31it/s][A
25it [00:00, 42.14it/s][A
30it [00:00, 42.41it/s][A
35it [00:00, 43.62it/s][A
40it [00:00, 44.41it/s][A
45it [00:01, 44.99it/s][A
50it [00:01, 45.45it/s][A
55it [00:01, 45.79it/s][A
60it [00:01, 46.07it/s][A
65it [00:01, 44.23it/s][A
70it [00:01, 45.01it/s][A
75it [00:01, 45.39it/s][A
80it [00:01, 45.80it/s][A
85it [00:01, 45.91it/s][A
90it [00:02, 45.83it/s][A
95it [00:02, 44.54it/s][A
100it [00:02, 44.92it/s][A
105it [00:02, 45.37it/s][A

Epoch: 293, Step: 100, Loss: 4.5310953330993655



110it [00:02, 45.55it/s][A
115it [00:02, 45.99it/s][A
120it [00:02, 46.57it/s][A
125it [00:02, 46.95it/s][A
130it [00:02, 45.03it/s][A
135it [00:02, 45.29it/s][A
140it [00:03, 45.33it/s][A
145it [00:03, 43.84it/s][A
150it [00:03, 44.49it/s][A
155it [00:03, 43.89it/s][A
160it [00:03, 44.60it/s][A
165it [00:03, 44.70it/s][A
170it [00:03, 44.82it/s][A
175it [00:03, 44.89it/s][A
180it [00:04, 44.81it/s][A
185it [00:04, 44.74it/s][A
190it [00:04, 45.02it/s][A
195it [00:04, 44.86it/s][A
200it [00:04, 45.09it/s][A
205it [00:04, 45.23it/s][A

Epoch: 293, Step: 200, Loss: 4.540171668529511



210it [00:04, 45.09it/s][A
215it [00:04, 45.31it/s][A
220it [00:04, 45.26it/s][A
227it [00:05, 44.90it/s]
 59%|█████▊    | 293/500 [34:44<22:10,  6.43s/it]
0it [00:00, ?it/s][A
5it [00:00, 41.24it/s][A
10it [00:00, 43.48it/s][A
15it [00:00, 44.43it/s][A
20it [00:00, 44.34it/s][A
25it [00:00, 44.51it/s][A
30it [00:00, 44.47it/s][A
35it [00:00, 44.61it/s][A
40it [00:00, 44.81it/s][A
45it [00:01, 44.01it/s][A
50it [00:01, 44.31it/s][A
55it [00:01, 44.35it/s][A
60it [00:01, 42.85it/s][A
65it [00:01, 43.62it/s][A
70it [00:01, 43.79it/s][A
75it [00:01, 43.75it/s][A
80it [00:01, 42.15it/s][A
85it [00:01, 42.40it/s][A
90it [00:02, 42.60it/s][A
95it [00:02, 43.08it/s][A
100it [00:02, 43.40it/s][A
105it [00:02, 43.97it/s][A

Epoch: 294, Step: 100, Loss: 4.5409828519821165



110it [00:02, 43.78it/s][A
115it [00:02, 43.12it/s][A
120it [00:02, 44.00it/s][A
125it [00:02, 43.91it/s][A
130it [00:02, 44.33it/s][A
135it [00:03, 44.39it/s][A
140it [00:03, 44.76it/s][A
145it [00:03, 44.92it/s][A
150it [00:03, 45.12it/s][A
155it [00:03, 44.15it/s][A
160it [00:03, 44.62it/s][A
165it [00:03, 44.88it/s][A
170it [00:03, 45.12it/s][A
175it [00:03, 45.08it/s][A
180it [00:04, 45.31it/s][A
185it [00:04, 44.32it/s][A
190it [00:04, 44.77it/s][A
195it [00:04, 44.48it/s][A
200it [00:04, 43.41it/s][A
205it [00:04, 44.06it/s][A

Epoch: 294, Step: 200, Loss: 4.543834848403931



210it [00:04, 43.92it/s][A
215it [00:04, 44.15it/s][A
220it [00:04, 44.54it/s][A
227it [00:05, 44.03it/s]
 59%|█████▉    | 294/500 [34:50<20:45,  6.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.74it/s][A
10it [00:00, 45.58it/s][A
15it [00:00, 43.72it/s][A
20it [00:00, 44.57it/s][A
25it [00:00, 44.77it/s][A
30it [00:00, 45.04it/s][A
35it [00:00, 45.28it/s][A
40it [00:00, 44.22it/s][A
45it [00:01, 44.79it/s][A
50it [00:01, 44.61it/s][A
55it [00:01, 45.02it/s][A
60it [00:01, 43.91it/s][A
65it [00:01, 44.53it/s][A
70it [00:01, 44.62it/s][A
75it [00:01, 44.80it/s][A
80it [00:01, 45.10it/s][A
85it [00:01, 45.01it/s][A
90it [00:02, 45.17it/s][A
95it [00:02, 44.97it/s][A
100it [00:02, 45.32it/s][A
105it [00:02, 45.11it/s][A

Epoch: 295, Step: 100, Loss: 4.524688568115234



110it [00:02, 44.81it/s][A
115it [00:02, 43.99it/s][A
120it [00:02, 44.37it/s][A
125it [00:02, 44.71it/s][A
130it [00:02, 44.86it/s][A
135it [00:03, 44.87it/s][A
140it [00:03, 44.82it/s][A
145it [00:03, 45.03it/s][A
150it [00:03, 45.27it/s][A
155it [00:03, 44.25it/s][A
160it [00:03, 44.44it/s][A
165it [00:03, 44.53it/s][A
170it [00:03, 43.26it/s][A
175it [00:03, 43.97it/s][A
180it [00:04, 44.36it/s][A
185it [00:04, 44.67it/s][A
190it [00:04, 44.95it/s][A
195it [00:04, 44.96it/s][A
200it [00:04, 45.12it/s][A
205it [00:04, 45.16it/s][A

Epoch: 295, Step: 200, Loss: 4.538718304634094



210it [00:04, 45.05it/s][A
215it [00:04, 43.98it/s][A
220it [00:04, 44.57it/s][A
227it [00:05, 44.70it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.54it/s][A
12it [00:00, 58.49it/s][A
18it [00:00, 56.54it/s][A
24it [00:00, 57.51it/s][A
30it [00:00, 56.48it/s][A
36it [00:00, 57.57it/s][A
43it [00:00, 58.79it/s][A
49it [00:00, 56.99it/s][A
56it [00:00, 58.18it/s][A
62it [00:01, 58.67it/s][A
69it [00:01, 59.29it/s][A
76it [00:01, 59.73it/s][A
82it [00:01, 59.68it/s][A
88it [00:01, 59.38it/s][A
95it [00:01, 59.91it/s][A
102it [00:01, 60.10it/s][A
109it [00:01, 60.04it/s][A
116it [00:01, 60.16it/s][A
123it [00:02, 58.88it/s][A
129it [00:02, 58.43it/s][A
135it [00:02, 58.62it/s][A
141it [00:02, 56.37it/s][A
147it [00:02, 57.20it/s][A
153it [00:02, 57.84it/s][A
159it [00:02, 57.88it/s][A
165it [00:02, 58.03it/s][A
172it [00:02, 58.90it/s][A
178it [00:03, 58.90it/s][A
184it [00:03, 58.46it/s][A
190it [00:03, 58.75it/s][A
196it [00:03, 57.64it/s][A
202it [00:03, 5


Epoch: 295, Test Loss: 5.504302784522868, Test Perplexity: 246.66987157312238




0it [00:00, ?it/s][A
5it [00:00, 41.79it/s][A
10it [00:00, 44.30it/s][A
15it [00:00, 44.48it/s][A
20it [00:00, 45.02it/s][A
25it [00:00, 45.37it/s][A
30it [00:00, 45.64it/s][A
35it [00:00, 45.60it/s][A
40it [00:00, 45.70it/s][A
45it [00:01, 44.58it/s][A
50it [00:01, 43.26it/s][A
55it [00:01, 43.98it/s][A
60it [00:01, 43.08it/s][A
65it [00:01, 43.82it/s][A
70it [00:01, 43.16it/s][A
75it [00:01, 43.81it/s][A
80it [00:01, 43.91it/s][A
85it [00:01, 44.36it/s][A
90it [00:02, 44.76it/s][A
95it [00:02, 44.82it/s][A
100it [00:02, 44.82it/s][A
105it [00:02, 44.96it/s][A

Epoch: 296, Step: 100, Loss: 4.526413245201111



110it [00:02, 44.83it/s][A
115it [00:02, 44.97it/s][A
120it [00:02, 44.42it/s][A
125it [00:02, 43.20it/s][A
130it [00:02, 42.43it/s][A
135it [00:03, 42.87it/s][A
140it [00:03, 42.49it/s][A
145it [00:03, 43.32it/s][A
150it [00:03, 43.98it/s][A
155it [00:03, 44.28it/s][A
160it [00:03, 44.58it/s][A
165it [00:03, 44.54it/s][A
170it [00:03, 44.52it/s][A
175it [00:03, 44.75it/s][A
180it [00:04, 43.48it/s][A
185it [00:04, 44.10it/s][A
190it [00:04, 44.71it/s][A
195it [00:04, 44.22it/s][A
200it [00:04, 43.48it/s][A
205it [00:04, 43.92it/s][A

Epoch: 296, Step: 200, Loss: 4.538164381980896



210it [00:04, 43.40it/s][A
215it [00:04, 43.54it/s][A
220it [00:04, 43.82it/s][A
227it [00:05, 43.93it/s]
 59%|█████▉    | 296/500 [35:11<26:46,  7.88s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.63it/s][A
10it [00:00, 46.41it/s][A
15it [00:00, 44.52it/s][A
20it [00:00, 44.59it/s][A
25it [00:00, 43.53it/s][A
30it [00:00, 42.80it/s][A
35it [00:00, 43.57it/s][A
40it [00:00, 44.18it/s][A
45it [00:01, 44.77it/s][A
50it [00:01, 44.02it/s][A
55it [00:01, 44.53it/s][A
60it [00:01, 44.86it/s][A
65it [00:01, 45.12it/s][A
70it [00:01, 45.42it/s][A
75it [00:01, 45.69it/s][A
80it [00:01, 44.76it/s][A
85it [00:01, 45.36it/s][A
90it [00:02, 44.69it/s][A
95it [00:02, 45.29it/s][A
100it [00:02, 45.66it/s][A
105it [00:02, 46.01it/s][A

Epoch: 297, Step: 100, Loss: 4.513353319168091



110it [00:02, 46.02it/s][A
115it [00:02, 45.66it/s][A
120it [00:02, 45.68it/s][A
125it [00:02, 45.39it/s][A
130it [00:02, 45.51it/s][A
135it [00:02, 45.61it/s][A
140it [00:03, 45.20it/s][A
145it [00:03, 45.19it/s][A
150it [00:03, 45.11it/s][A
155it [00:03, 45.72it/s][A
160it [00:03, 46.24it/s][A
165it [00:03, 46.53it/s][A
170it [00:03, 46.15it/s][A
175it [00:03, 45.93it/s][A
180it [00:03, 45.45it/s][A
185it [00:04, 45.51it/s][A
190it [00:04, 45.19it/s][A
195it [00:04, 45.15it/s][A
200it [00:04, 45.25it/s][A
205it [00:04, 44.91it/s][A

Epoch: 297, Step: 200, Loss: 4.53686776638031



210it [00:04, 45.10it/s][A
215it [00:04, 45.00it/s][A
220it [00:04, 44.85it/s][A
227it [00:05, 45.14it/s]
 59%|█████▉    | 297/500 [35:16<23:45,  7.02s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.24it/s][A
10it [00:00, 44.74it/s][A
15it [00:00, 44.70it/s][A
20it [00:00, 44.75it/s][A
25it [00:00, 44.94it/s][A
30it [00:00, 44.88it/s][A
35it [00:00, 44.39it/s][A
40it [00:00, 44.83it/s][A
45it [00:01, 45.21it/s][A
50it [00:01, 45.52it/s][A
55it [00:01, 44.32it/s][A
60it [00:01, 44.67it/s][A
65it [00:01, 44.91it/s][A
70it [00:01, 44.97it/s][A
75it [00:01, 43.90it/s][A
80it [00:01, 44.28it/s][A
85it [00:01, 44.69it/s][A
90it [00:02, 45.10it/s][A
95it [00:02, 45.19it/s][A
100it [00:02, 44.98it/s][A
105it [00:02, 45.14it/s][A

Epoch: 298, Step: 100, Loss: 4.530598435401917



110it [00:02, 45.11it/s][A
115it [00:02, 45.08it/s][A
120it [00:02, 45.14it/s][A
125it [00:02, 45.35it/s][A
130it [00:02, 45.46it/s][A
135it [00:03, 45.66it/s][A
140it [00:03, 44.41it/s][A
145it [00:03, 45.01it/s][A
150it [00:03, 45.05it/s][A
155it [00:03, 45.19it/s][A
160it [00:03, 44.31it/s][A
165it [00:03, 44.84it/s][A
170it [00:03, 44.53it/s][A
175it [00:03, 44.36it/s][A
180it [00:04, 44.25it/s][A
185it [00:04, 44.63it/s][A
190it [00:04, 44.58it/s][A
195it [00:04, 44.99it/s][A
200it [00:04, 44.83it/s][A
205it [00:04, 43.98it/s][A

Epoch: 298, Step: 200, Loss: 4.538981075286865



210it [00:04, 44.66it/s][A
215it [00:04, 44.78it/s][A
220it [00:04, 44.75it/s][A
227it [00:05, 44.77it/s]
 60%|█████▉    | 298/500 [35:21<21:40,  6.44s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.54it/s][A
10it [00:00, 45.38it/s][A
15it [00:00, 43.38it/s][A
20it [00:00, 43.39it/s][A
25it [00:00, 43.63it/s][A
30it [00:00, 43.95it/s][A
35it [00:00, 42.28it/s][A
40it [00:00, 43.22it/s][A
45it [00:01, 43.75it/s][A
50it [00:01, 44.17it/s][A
55it [00:01, 44.33it/s][A
60it [00:01, 44.37it/s][A
65it [00:01, 44.84it/s][A
70it [00:01, 44.89it/s][A
75it [00:01, 44.69it/s][A
80it [00:01, 44.55it/s][A
85it [00:01, 44.77it/s][A
90it [00:02, 44.98it/s][A
95it [00:02, 43.79it/s][A
100it [00:02, 44.25it/s][A
105it [00:02, 44.61it/s][A

Epoch: 299, Step: 100, Loss: 4.529387321472168



110it [00:02, 44.61it/s][A
115it [00:02, 44.45it/s][A
120it [00:02, 44.35it/s][A
125it [00:02, 44.01it/s][A
130it [00:02, 44.43it/s][A
135it [00:03, 44.72it/s][A
140it [00:03, 44.00it/s][A
145it [00:03, 43.21it/s][A
150it [00:03, 43.69it/s][A
155it [00:03, 43.94it/s][A
160it [00:03, 43.95it/s][A
165it [00:03, 44.18it/s][A
170it [00:03, 44.31it/s][A
175it [00:03, 44.75it/s][A
180it [00:04, 45.05it/s][A
185it [00:04, 44.72it/s][A
190it [00:04, 44.85it/s][A
195it [00:04, 44.82it/s][A
200it [00:04, 45.14it/s][A
205it [00:04, 45.28it/s][A

Epoch: 299, Step: 200, Loss: 4.536997923851013



210it [00:04, 44.86it/s][A
215it [00:04, 44.80it/s][A
220it [00:04, 44.15it/s][A
227it [00:05, 44.32it/s]
 60%|█████▉    | 299/500 [35:26<20:14,  6.04s/it]
0it [00:00, ?it/s][A
4it [00:00, 39.89it/s][A
9it [00:00, 43.10it/s][A
14it [00:00, 44.19it/s][A
19it [00:00, 44.68it/s][A
24it [00:00, 45.05it/s][A
29it [00:00, 45.33it/s][A
34it [00:00, 44.30it/s][A
39it [00:00, 43.60it/s][A
44it [00:00, 44.26it/s][A
49it [00:01, 44.76it/s][A
54it [00:01, 44.32it/s][A
59it [00:01, 44.58it/s][A
64it [00:01, 44.85it/s][A
69it [00:01, 44.98it/s][A
74it [00:01, 43.84it/s][A
79it [00:01, 44.42it/s][A
84it [00:01, 43.72it/s][A
89it [00:02, 44.36it/s][A
94it [00:02, 44.70it/s][A
99it [00:02, 45.16it/s][A
104it [00:02, 45.13it/s][A
109it [00:02, 45.35it/s][A

Epoch: 300, Step: 100, Loss: 4.526637349128723



114it [00:02, 45.28it/s][A
119it [00:02, 44.28it/s][A
124it [00:02, 44.69it/s][A
129it [00:02, 42.82it/s][A
134it [00:03, 43.87it/s][A
139it [00:03, 44.15it/s][A
144it [00:03, 44.64it/s][A
149it [00:03, 44.13it/s][A
154it [00:03, 43.58it/s][A
159it [00:03, 43.93it/s][A
164it [00:03, 43.64it/s][A
169it [00:03, 44.25it/s][A
174it [00:03, 44.57it/s][A
179it [00:04, 45.02it/s][A
184it [00:04, 45.26it/s][A
189it [00:04, 45.52it/s][A
194it [00:04, 45.41it/s][A
199it [00:04, 45.45it/s][A
204it [00:04, 45.38it/s][A
209it [00:04, 45.51it/s][A

Epoch: 300, Step: 200, Loss: 4.5380357646942135



214it [00:04, 45.28it/s][A
219it [00:04, 45.32it/s][A
227it [00:05, 44.60it/s]

0it [00:00, ?it/s][A
6it [00:00, 56.95it/s][A
12it [00:00, 58.00it/s][A
19it [00:00, 59.12it/s][A
25it [00:00, 59.20it/s][A
32it [00:00, 59.99it/s][A
38it [00:00, 59.76it/s][A
44it [00:00, 57.19it/s][A
50it [00:00, 57.39it/s][A
57it [00:00, 58.59it/s][A
63it [00:01, 58.88it/s][A
70it [00:01, 59.35it/s][A
76it [00:01, 59.26it/s][A
82it [00:01, 59.29it/s][A
88it [00:01, 59.49it/s][A
95it [00:01, 59.87it/s][A
101it [00:01, 59.40it/s][A
107it [00:01, 59.40it/s][A
113it [00:01, 59.55it/s][A
120it [00:02, 59.81it/s][A
126it [00:02, 59.85it/s][A
132it [00:02, 59.76it/s][A
138it [00:02, 59.78it/s][A
144it [00:02, 59.83it/s][A
150it [00:02, 59.87it/s][A
157it [00:02, 59.99it/s][A
163it [00:02, 59.85it/s][A
170it [00:02, 60.02it/s][A
176it [00:02, 57.85it/s][A
182it [00:03, 57.92it/s][A
188it [00:03, 58.42it/s][A
194it [00:03, 58.63it/s][A
200it [00:03, 55.81it/s][A
206it [00:03, 5


Epoch: 300, Test Loss: 5.505028083457709, Test Perplexity: 246.90217630611443




0it [00:00, ?it/s][A
5it [00:00, 44.35it/s][A
10it [00:00, 45.15it/s][A
15it [00:00, 45.92it/s][A
20it [00:00, 45.98it/s][A
25it [00:00, 45.19it/s][A
30it [00:00, 44.66it/s][A
35it [00:00, 44.19it/s][A
40it [00:00, 43.34it/s][A
45it [00:01, 43.09it/s][A
50it [00:01, 43.54it/s][A
55it [00:01, 44.41it/s][A
60it [00:01, 44.87it/s][A
65it [00:01, 44.83it/s][A
70it [00:01, 44.58it/s][A
75it [00:01, 44.56it/s][A
80it [00:01, 44.66it/s][A
85it [00:01, 44.76it/s][A
90it [00:02, 45.06it/s][A
95it [00:02, 45.14it/s][A
100it [00:02, 43.92it/s][A
105it [00:02, 44.65it/s][A

Epoch: 301, Step: 100, Loss: 4.523493957519531



110it [00:02, 44.98it/s][A
115it [00:02, 45.12it/s][A
120it [00:02, 45.46it/s][A
125it [00:02, 45.57it/s][A
130it [00:02, 45.53it/s][A
135it [00:03, 45.63it/s][A
140it [00:03, 45.65it/s][A
145it [00:03, 45.55it/s][A
150it [00:03, 45.82it/s][A
155it [00:03, 45.92it/s][A
160it [00:03, 45.52it/s][A
165it [00:03, 45.64it/s][A
170it [00:03, 45.76it/s][A
175it [00:03, 46.46it/s][A
180it [00:03, 46.89it/s][A
185it [00:04, 46.52it/s][A
190it [00:04, 46.18it/s][A
195it [00:04, 45.73it/s][A
200it [00:04, 45.47it/s][A
205it [00:04, 45.41it/s][A

Epoch: 301, Step: 200, Loss: 4.5356564354896545



210it [00:04, 45.12it/s][A
215it [00:04, 45.18it/s][A
220it [00:04, 45.16it/s][A
227it [00:05, 45.16it/s]
 60%|██████    | 301/500 [35:47<26:02,  7.85s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.10it/s][A
10it [00:00, 45.08it/s][A
15it [00:00, 45.48it/s][A
20it [00:00, 45.48it/s][A
25it [00:00, 45.11it/s][A
30it [00:00, 45.34it/s][A
35it [00:00, 45.09it/s][A
40it [00:00, 45.33it/s][A
45it [00:00, 44.86it/s][A
50it [00:01, 45.00it/s][A
55it [00:01, 45.13it/s][A
60it [00:01, 45.27it/s][A
65it [00:01, 44.40it/s][A
70it [00:01, 44.84it/s][A
75it [00:01, 45.02it/s][A
80it [00:01, 45.23it/s][A
85it [00:01, 45.45it/s][A
90it [00:02, 44.46it/s][A
95it [00:02, 44.65it/s][A
100it [00:02, 44.89it/s][A
105it [00:02, 45.13it/s][A

Epoch: 302, Step: 100, Loss: 4.516329188346862



110it [00:02, 45.36it/s][A
115it [00:02, 45.23it/s][A
120it [00:02, 45.17it/s][A
125it [00:02, 44.37it/s][A
130it [00:02, 44.66it/s][A
135it [00:03, 44.83it/s][A
140it [00:03, 44.74it/s][A
145it [00:03, 43.66it/s][A
150it [00:03, 44.02it/s][A
155it [00:03, 44.07it/s][A
160it [00:03, 44.05it/s][A
165it [00:03, 43.47it/s][A
170it [00:03, 43.87it/s][A
175it [00:03, 44.24it/s][A
180it [00:04, 44.74it/s][A
185it [00:04, 44.77it/s][A
190it [00:04, 44.93it/s][A
195it [00:04, 45.36it/s][A
200it [00:04, 45.52it/s][A
205it [00:04, 45.49it/s][A

Epoch: 302, Step: 200, Loss: 4.533493821620941



210it [00:04, 44.55it/s][A
215it [00:04, 45.11it/s][A
220it [00:04, 45.49it/s][A
227it [00:05, 44.85it/s]
 60%|██████    | 302/500 [35:52<23:09,  7.02s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.86it/s][A
10it [00:00, 45.94it/s][A
15it [00:00, 45.92it/s][A
20it [00:00, 46.03it/s][A
25it [00:00, 46.08it/s][A
30it [00:00, 45.88it/s][A
35it [00:00, 45.89it/s][A
40it [00:00, 45.31it/s][A
45it [00:00, 45.42it/s][A
50it [00:01, 45.51it/s][A
55it [00:01, 45.13it/s][A
60it [00:01, 45.27it/s][A
65it [00:01, 44.78it/s][A
70it [00:01, 45.10it/s][A
75it [00:01, 45.11it/s][A
80it [00:01, 44.86it/s][A
85it [00:01, 44.46it/s][A
90it [00:01, 44.33it/s][A
95it [00:02, 44.25it/s][A
100it [00:02, 43.05it/s][A
105it [00:02, 43.80it/s][A

Epoch: 303, Step: 100, Loss: 4.52520649433136



110it [00:02, 43.84it/s][A
115it [00:02, 44.33it/s][A
120it [00:02, 44.59it/s][A
125it [00:02, 44.93it/s][A
130it [00:02, 43.93it/s][A
135it [00:03, 44.22it/s][A
140it [00:03, 43.93it/s][A
145it [00:03, 43.95it/s][A
150it [00:03, 43.10it/s][A
155it [00:03, 43.77it/s][A
160it [00:03, 44.42it/s][A
165it [00:03, 44.75it/s][A
170it [00:03, 45.05it/s][A
175it [00:03, 45.27it/s][A
180it [00:04, 45.21it/s][A
185it [00:04, 45.27it/s][A
190it [00:04, 44.60it/s][A
195it [00:04, 44.84it/s][A
200it [00:04, 45.03it/s][A
205it [00:04, 45.30it/s][A

Epoch: 303, Step: 200, Loss: 4.536070046424865



210it [00:04, 44.96it/s][A
215it [00:04, 45.00it/s][A
220it [00:04, 44.88it/s][A
227it [00:05, 44.67it/s]
 61%|██████    | 303/500 [35:57<21:08,  6.44s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.07it/s][A
10it [00:00, 44.73it/s][A
15it [00:00, 44.84it/s][A
20it [00:00, 45.15it/s][A
25it [00:00, 45.20it/s][A
30it [00:00, 45.32it/s][A
35it [00:00, 45.44it/s][A
40it [00:00, 45.39it/s][A
45it [00:00, 45.01it/s][A
50it [00:01, 45.24it/s][A
55it [00:01, 45.18it/s][A
60it [00:01, 45.18it/s][A
65it [00:01, 45.12it/s][A
70it [00:01, 43.77it/s][A
75it [00:01, 44.10it/s][A
80it [00:01, 44.58it/s][A
85it [00:01, 44.89it/s][A
90it [00:02, 45.03it/s][A
95it [00:02, 44.92it/s][A
100it [00:02, 44.95it/s][A
105it [00:02, 45.11it/s][A

Epoch: 304, Step: 100, Loss: 4.52212929725647



110it [00:02, 45.23it/s][A
115it [00:02, 45.35it/s][A
120it [00:02, 44.34it/s][A
125it [00:02, 44.76it/s][A
130it [00:02, 45.06it/s][A
135it [00:03, 45.15it/s][A
140it [00:03, 44.19it/s][A
145it [00:03, 44.65it/s][A
150it [00:03, 44.63it/s][A
155it [00:03, 45.02it/s][A
160it [00:03, 45.27it/s][A
165it [00:03, 45.29it/s][A
170it [00:03, 45.33it/s][A
175it [00:03, 45.35it/s][A
180it [00:04, 44.63it/s][A
185it [00:04, 45.03it/s][A
190it [00:04, 44.35it/s][A
195it [00:04, 43.33it/s][A
200it [00:04, 44.02it/s][A
205it [00:04, 44.65it/s][A

Epoch: 304, Step: 200, Loss: 4.533237433433532



210it [00:04, 44.71it/s][A
215it [00:04, 45.06it/s][A
220it [00:04, 45.17it/s][A
227it [00:05, 44.80it/s]
 61%|██████    | 304/500 [36:02<19:41,  6.03s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.28it/s][A
10it [00:00, 45.13it/s][A
15it [00:00, 44.60it/s][A
20it [00:00, 45.07it/s][A
25it [00:00, 44.24it/s][A
30it [00:00, 44.77it/s][A
35it [00:00, 44.69it/s][A
40it [00:00, 43.61it/s][A
45it [00:01, 43.27it/s][A
50it [00:01, 44.13it/s][A
55it [00:01, 44.56it/s][A
60it [00:01, 44.91it/s][A
65it [00:01, 44.91it/s][A
70it [00:01, 44.80it/s][A
75it [00:01, 44.62it/s][A
80it [00:01, 45.02it/s][A
85it [00:01, 45.25it/s][A
90it [00:02, 45.20it/s][A
95it [00:02, 43.40it/s][A
100it [00:02, 43.87it/s][A
105it [00:02, 44.35it/s][A

Epoch: 305, Step: 100, Loss: 4.531480803489685



110it [00:02, 44.56it/s][A
115it [00:02, 44.92it/s][A
120it [00:02, 45.18it/s][A
125it [00:02, 45.44it/s][A
130it [00:02, 44.80it/s][A
135it [00:03, 45.14it/s][A
140it [00:03, 44.05it/s][A
145it [00:03, 43.33it/s][A
150it [00:03, 43.64it/s][A
155it [00:03, 43.37it/s][A
160it [00:03, 43.88it/s][A
165it [00:03, 44.49it/s][A
170it [00:03, 43.04it/s][A
175it [00:03, 43.47it/s][A
180it [00:04, 43.84it/s][A
185it [00:04, 43.72it/s][A
190it [00:04, 43.72it/s][A
195it [00:04, 43.26it/s][A
200it [00:04, 42.58it/s][A
205it [00:04, 42.00it/s][A

Epoch: 305, Step: 200, Loss: 4.536306591033935



210it [00:04, 42.98it/s][A
215it [00:04, 43.21it/s][A
220it [00:04, 43.71it/s][A
227it [00:05, 44.11it/s]

0it [00:00, ?it/s][A
6it [00:00, 52.75it/s][A
12it [00:00, 55.85it/s][A
19it [00:00, 57.98it/s][A
25it [00:00, 57.54it/s][A
31it [00:00, 58.10it/s][A
38it [00:00, 58.92it/s][A
44it [00:00, 58.18it/s][A
50it [00:00, 57.01it/s][A
56it [00:00, 57.46it/s][A
63it [00:01, 58.57it/s][A
69it [00:01, 56.44it/s][A
76it [00:01, 57.71it/s][A
82it [00:01, 58.34it/s][A
88it [00:01, 57.81it/s][A
94it [00:01, 58.42it/s][A
101it [00:01, 58.99it/s][A
107it [00:01, 59.13it/s][A
113it [00:01, 59.05it/s][A
120it [00:02, 59.87it/s][A
126it [00:02, 59.86it/s][A
133it [00:02, 59.98it/s][A
139it [00:02, 59.84it/s][A
146it [00:02, 60.01it/s][A
153it [00:02, 60.22it/s][A
160it [00:02, 60.25it/s][A
167it [00:02, 60.21it/s][A
174it [00:02, 60.39it/s][A
181it [00:03, 58.18it/s][A
188it [00:03, 58.89it/s][A
194it [00:03, 59.16it/s][A
200it [00:03, 59.20it/s][A
206it [00:03, 5


Epoch: 305, Test Loss: 5.508456710702884, Test Perplexity: 247.72738104873562




0it [00:00, ?it/s][A
4it [00:00, 39.63it/s][A
9it [00:00, 43.43it/s][A
14it [00:00, 44.71it/s][A
19it [00:00, 43.31it/s][A
24it [00:00, 44.10it/s][A
29it [00:00, 44.66it/s][A
34it [00:00, 44.51it/s][A
39it [00:00, 44.76it/s][A
44it [00:00, 44.79it/s][A
49it [00:01, 43.86it/s][A
54it [00:01, 44.50it/s][A
59it [00:01, 44.90it/s][A
64it [00:01, 45.33it/s][A
69it [00:01, 45.64it/s][A
74it [00:01, 45.75it/s][A
79it [00:01, 45.42it/s][A
84it [00:01, 45.57it/s][A
89it [00:01, 45.50it/s][A
94it [00:02, 45.69it/s][A
99it [00:02, 45.76it/s][A
104it [00:02, 45.94it/s][A
109it [00:02, 45.84it/s][A

Epoch: 306, Step: 100, Loss: 4.525108580589294



114it [00:02, 45.66it/s][A
119it [00:02, 45.52it/s][A
124it [00:02, 44.91it/s][A
129it [00:02, 45.02it/s][A
134it [00:02, 45.00it/s][A
139it [00:03, 45.03it/s][A
144it [00:03, 45.29it/s][A
149it [00:03, 44.43it/s][A
154it [00:03, 43.42it/s][A
159it [00:03, 44.07it/s][A
164it [00:03, 44.31it/s][A
169it [00:03, 44.78it/s][A
174it [00:03, 44.63it/s][A
179it [00:03, 44.90it/s][A
184it [00:04, 42.86it/s][A
189it [00:04, 43.75it/s][A
194it [00:04, 44.25it/s][A
199it [00:04, 44.56it/s][A
204it [00:04, 43.01it/s][A

Epoch: 306, Step: 200, Loss: 4.533523099422455



209it [00:04, 42.55it/s][A
214it [00:04, 43.23it/s][A
219it [00:04, 43.41it/s][A
227it [00:05, 44.54it/s]
 61%|██████    | 306/500 [36:23<25:19,  7.83s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.77it/s][A
10it [00:00, 44.42it/s][A
15it [00:00, 43.99it/s][A
20it [00:00, 44.69it/s][A
25it [00:00, 45.12it/s][A
30it [00:00, 45.36it/s][A
35it [00:00, 44.77it/s][A
40it [00:00, 44.73it/s][A
45it [00:01, 45.03it/s][A
50it [00:01, 45.45it/s][A
55it [00:01, 45.51it/s][A
60it [00:01, 45.55it/s][A
65it [00:01, 45.63it/s][A
70it [00:01, 45.66it/s][A
75it [00:01, 44.60it/s][A
80it [00:01, 43.31it/s][A
85it [00:01, 42.65it/s][A
90it [00:02, 43.28it/s][A
95it [00:02, 43.94it/s][A
100it [00:02, 44.22it/s][A
105it [00:02, 44.30it/s][A

Epoch: 307, Step: 100, Loss: 4.520574231147766



110it [00:02, 44.71it/s][A
115it [00:02, 44.96it/s][A
120it [00:02, 45.12it/s][A
125it [00:02, 45.16it/s][A
130it [00:02, 43.82it/s][A
135it [00:03, 44.36it/s][A
140it [00:03, 44.78it/s][A
145it [00:03, 44.90it/s][A
150it [00:03, 45.15it/s][A
155it [00:03, 45.20it/s][A
160it [00:03, 45.30it/s][A
165it [00:03, 45.45it/s][A
170it [00:03, 45.48it/s][A
175it [00:03, 45.24it/s][A
180it [00:04, 44.98it/s][A
185it [00:04, 45.19it/s][A
190it [00:04, 45.40it/s][A
195it [00:04, 45.30it/s][A
200it [00:04, 45.25it/s][A
205it [00:04, 45.20it/s][A

Epoch: 307, Step: 200, Loss: 4.533129925727844



210it [00:04, 45.34it/s][A
215it [00:04, 45.32it/s][A
220it [00:04, 44.96it/s][A
227it [00:05, 44.77it/s]
 61%|██████▏   | 307/500 [36:29<22:32,  7.01s/it]
0it [00:00, ?it/s][A
4it [00:00, 37.44it/s][A
9it [00:00, 40.49it/s][A
14it [00:00, 42.13it/s][A
19it [00:00, 43.38it/s][A
24it [00:00, 44.17it/s][A
29it [00:00, 44.40it/s][A
34it [00:00, 44.52it/s][A
39it [00:00, 44.39it/s][A
44it [00:01, 44.66it/s][A
49it [00:01, 44.67it/s][A
54it [00:01, 43.23it/s][A
59it [00:01, 42.94it/s][A
64it [00:01, 43.68it/s][A
69it [00:01, 43.81it/s][A
74it [00:01, 44.24it/s][A
79it [00:01, 44.01it/s][A
84it [00:01, 44.13it/s][A
89it [00:02, 44.71it/s][A
94it [00:02, 44.55it/s][A
99it [00:02, 44.55it/s][A
104it [00:02, 44.82it/s][A
109it [00:02, 45.02it/s][A

Epoch: 308, Step: 100, Loss: 4.51302930355072



114it [00:02, 44.70it/s][A
119it [00:02, 45.02it/s][A
124it [00:02, 44.76it/s][A
129it [00:02, 44.86it/s][A
134it [00:03, 44.82it/s][A
139it [00:03, 44.93it/s][A
144it [00:03, 44.93it/s][A
149it [00:03, 44.86it/s][A
154it [00:03, 44.99it/s][A
159it [00:03, 44.88it/s][A
164it [00:03, 45.17it/s][A
169it [00:03, 44.95it/s][A
174it [00:03, 44.96it/s][A
179it [00:04, 45.20it/s][A
184it [00:04, 45.27it/s][A
189it [00:04, 45.39it/s][A
194it [00:04, 45.54it/s][A
199it [00:04, 43.62it/s][A
204it [00:04, 44.27it/s][A
209it [00:04, 44.59it/s][A

Epoch: 308, Step: 200, Loss: 4.534573073387146



214it [00:04, 44.75it/s][A
219it [00:04, 43.89it/s][A
227it [00:05, 44.39it/s]
 62%|██████▏   | 308/500 [36:34<20:36,  6.44s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.09it/s][A
10it [00:00, 45.03it/s][A
15it [00:00, 44.62it/s][A
20it [00:00, 44.72it/s][A
25it [00:00, 45.16it/s][A
30it [00:00, 45.28it/s][A
35it [00:00, 45.31it/s][A
40it [00:00, 45.29it/s][A
45it [00:00, 45.21it/s][A
50it [00:01, 43.99it/s][A
55it [00:01, 44.30it/s][A
60it [00:01, 44.50it/s][A
65it [00:01, 44.49it/s][A
70it [00:01, 44.65it/s][A
75it [00:01, 44.68it/s][A
80it [00:01, 44.76it/s][A
85it [00:01, 44.95it/s][A
90it [00:02, 45.17it/s][A
95it [00:02, 43.93it/s][A
100it [00:02, 44.11it/s][A
105it [00:02, 43.27it/s][A

Epoch: 309, Step: 100, Loss: 4.5264460611343384



110it [00:02, 43.32it/s][A
115it [00:02, 42.67it/s][A
120it [00:02, 43.27it/s][A
125it [00:02, 44.03it/s][A
130it [00:02, 44.52it/s][A
135it [00:03, 45.03it/s][A
140it [00:03, 45.09it/s][A
145it [00:03, 45.36it/s][A
150it [00:03, 45.40it/s][A
155it [00:03, 45.48it/s][A
160it [00:03, 44.26it/s][A
165it [00:03, 44.63it/s][A
170it [00:03, 44.85it/s][A
175it [00:03, 45.05it/s][A
180it [00:04, 44.79it/s][A
185it [00:04, 43.75it/s][A
190it [00:04, 44.14it/s][A
195it [00:04, 44.59it/s][A
200it [00:04, 45.01it/s][A
205it [00:04, 45.06it/s][A

Epoch: 309, Step: 200, Loss: 4.533868751525879



210it [00:04, 44.89it/s][A
215it [00:04, 44.80it/s][A
220it [00:04, 45.11it/s][A
227it [00:05, 44.60it/s]
 62%|██████▏   | 309/500 [36:39<19:12,  6.04s/it]
0it [00:00, ?it/s][A
4it [00:00, 39.81it/s][A
9it [00:00, 42.81it/s][A
14it [00:00, 42.23it/s][A
19it [00:00, 43.47it/s][A
24it [00:00, 44.28it/s][A
29it [00:00, 44.57it/s][A
34it [00:00, 45.05it/s][A
39it [00:00, 44.90it/s][A
44it [00:00, 44.89it/s][A
49it [00:01, 45.25it/s][A
54it [00:01, 44.96it/s][A
59it [00:01, 43.67it/s][A
64it [00:01, 43.59it/s][A
69it [00:01, 44.02it/s][A
74it [00:01, 44.55it/s][A
79it [00:01, 44.06it/s][A
84it [00:01, 44.20it/s][A
89it [00:02, 44.17it/s][A
94it [00:02, 44.04it/s][A
99it [00:02, 44.18it/s][A
104it [00:02, 43.07it/s][A

Epoch: 310, Step: 100, Loss: 4.525750660896302



109it [00:02, 41.68it/s][A
114it [00:02, 42.39it/s][A
119it [00:02, 43.09it/s][A
124it [00:02, 43.57it/s][A
129it [00:02, 43.11it/s][A
134it [00:03, 43.44it/s][A
139it [00:03, 44.09it/s][A
144it [00:03, 44.67it/s][A
149it [00:03, 44.98it/s][A
154it [00:03, 45.24it/s][A
159it [00:03, 45.42it/s][A
164it [00:03, 44.83it/s][A
169it [00:03, 45.42it/s][A
174it [00:03, 45.51it/s][A
179it [00:04, 45.34it/s][A
184it [00:04, 44.10it/s][A
189it [00:04, 44.45it/s][A
194it [00:04, 43.42it/s][A
199it [00:04, 44.16it/s][A
204it [00:04, 44.36it/s][A

Epoch: 310, Step: 200, Loss: 4.531926031112671



209it [00:04, 43.80it/s][A
214it [00:04, 44.47it/s][A
219it [00:04, 44.88it/s][A
227it [00:05, 44.05it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.21it/s][A
13it [00:00, 60.52it/s][A
20it [00:00, 60.21it/s][A
27it [00:00, 58.39it/s][A
33it [00:00, 57.47it/s][A
40it [00:00, 58.64it/s][A
47it [00:00, 59.29it/s][A
54it [00:00, 59.77it/s][A
61it [00:01, 60.21it/s][A
68it [00:01, 60.42it/s][A
75it [00:01, 60.76it/s][A
82it [00:01, 60.93it/s][A
89it [00:01, 60.95it/s][A
96it [00:01, 60.76it/s][A
103it [00:01, 61.02it/s][A
110it [00:01, 59.92it/s][A
117it [00:01, 60.36it/s][A
124it [00:02, 60.51it/s][A
131it [00:02, 60.65it/s][A
138it [00:02, 60.65it/s][A
145it [00:02, 60.83it/s][A
152it [00:02, 60.92it/s][A
159it [00:02, 61.06it/s][A
166it [00:02, 61.34it/s][A
173it [00:02, 61.33it/s][A
180it [00:02, 61.11it/s][A
187it [00:03, 60.15it/s][A
194it [00:03, 60.43it/s][A
201it [00:03, 58.35it/s][A
208it [00:03, 59.01it/s][A
215it [00:03, 59.68it/s][A
221it [00:03, 


Epoch: 310, Test Loss: 5.509894073379706, Test Perplexity: 248.1297109378791




0it [00:00, ?it/s][A
5it [00:00, 44.73it/s][A
10it [00:00, 45.43it/s][A
15it [00:00, 44.11it/s][A
20it [00:00, 45.00it/s][A
25it [00:00, 45.30it/s][A
30it [00:00, 45.30it/s][A
35it [00:00, 43.15it/s][A
40it [00:00, 44.02it/s][A
45it [00:01, 44.47it/s][A
50it [00:01, 44.59it/s][A
55it [00:01, 44.91it/s][A
60it [00:01, 44.91it/s][A
65it [00:01, 44.86it/s][A
70it [00:01, 45.15it/s][A
75it [00:01, 45.26it/s][A
80it [00:01, 45.47it/s][A
85it [00:01, 44.93it/s][A
90it [00:02, 44.73it/s][A
95it [00:02, 45.00it/s][A
100it [00:02, 45.34it/s][A
105it [00:02, 45.60it/s][A

Epoch: 311, Step: 100, Loss: 4.512681756019592



110it [00:02, 45.34it/s][A
115it [00:02, 45.18it/s][A
120it [00:02, 45.12it/s][A
125it [00:02, 42.86it/s][A
130it [00:02, 43.73it/s][A
135it [00:03, 44.06it/s][A
140it [00:03, 44.70it/s][A
145it [00:03, 44.65it/s][A
150it [00:03, 44.80it/s][A
155it [00:03, 44.87it/s][A
160it [00:03, 42.86it/s][A
165it [00:03, 43.39it/s][A
170it [00:03, 42.74it/s][A
175it [00:03, 43.66it/s][A
180it [00:04, 43.96it/s][A
185it [00:04, 44.51it/s][A
190it [00:04, 43.47it/s][A
195it [00:04, 44.07it/s][A
200it [00:04, 44.27it/s][A
205it [00:04, 44.51it/s][A

Epoch: 311, Step: 200, Loss: 4.533329796791077



210it [00:04, 44.74it/s][A
215it [00:04, 45.01it/s][A
220it [00:04, 43.36it/s][A
227it [00:05, 44.43it/s]
 62%|██████▏   | 311/500 [37:00<24:44,  7.85s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.66it/s][A
10it [00:00, 45.79it/s][A
15it [00:00, 44.45it/s][A
20it [00:00, 44.81it/s][A
25it [00:00, 45.18it/s][A
30it [00:00, 43.95it/s][A
35it [00:00, 43.60it/s][A
40it [00:00, 43.63it/s][A
45it [00:01, 44.35it/s][A
50it [00:01, 44.68it/s][A
55it [00:01, 44.91it/s][A
60it [00:01, 45.34it/s][A
65it [00:01, 45.15it/s][A
70it [00:01, 45.21it/s][A
75it [00:01, 45.43it/s][A
80it [00:01, 45.50it/s][A
85it [00:01, 45.60it/s][A
90it [00:02, 45.47it/s][A
95it [00:02, 45.36it/s][A
100it [00:02, 45.45it/s][A
105it [00:02, 45.43it/s][A

Epoch: 312, Step: 100, Loss: 4.515909104347229



110it [00:02, 45.00it/s][A
115it [00:02, 45.25it/s][A
120it [00:02, 45.22it/s][A
125it [00:02, 44.22it/s][A
130it [00:02, 44.68it/s][A
135it [00:03, 43.87it/s][A
140it [00:03, 43.91it/s][A
145it [00:03, 43.67it/s][A
150it [00:03, 44.00it/s][A
155it [00:03, 43.05it/s][A
160it [00:03, 42.68it/s][A
165it [00:03, 43.74it/s][A
170it [00:03, 42.26it/s][A
175it [00:03, 43.03it/s][A
180it [00:04, 43.55it/s][A
185it [00:04, 43.82it/s][A
190it [00:04, 43.45it/s][A
195it [00:04, 43.29it/s][A
200it [00:04, 43.79it/s][A
205it [00:04, 44.23it/s][A

Epoch: 312, Step: 200, Loss: 4.531410412788391



210it [00:04, 44.63it/s][A
215it [00:04, 44.91it/s][A
220it [00:04, 43.93it/s][A
227it [00:05, 44.38it/s]
 62%|██████▏   | 312/500 [37:05<22:02,  7.03s/it]
0it [00:00, ?it/s][A
4it [00:00, 37.87it/s][A
9it [00:00, 42.37it/s][A
14it [00:00, 43.65it/s][A
19it [00:00, 42.56it/s][A
24it [00:00, 43.74it/s][A
29it [00:00, 41.86it/s][A
34it [00:00, 41.88it/s][A
39it [00:00, 41.59it/s][A
44it [00:01, 42.77it/s][A
49it [00:01, 43.41it/s][A
54it [00:01, 43.97it/s][A
59it [00:01, 42.38it/s][A
64it [00:01, 43.11it/s][A
69it [00:01, 43.77it/s][A
74it [00:01, 44.21it/s][A
79it [00:01, 44.44it/s][A
84it [00:01, 44.68it/s][A
89it [00:02, 44.80it/s][A
94it [00:02, 43.52it/s][A
99it [00:02, 43.86it/s][A
104it [00:02, 44.25it/s][A
109it [00:02, 44.39it/s][A

Epoch: 313, Step: 100, Loss: 4.514090995788575



114it [00:02, 44.03it/s][A
119it [00:02, 44.35it/s][A
124it [00:02, 44.80it/s][A
129it [00:02, 43.87it/s][A
134it [00:03, 44.30it/s][A
139it [00:03, 44.39it/s][A
144it [00:03, 44.70it/s][A
149it [00:03, 44.58it/s][A
154it [00:03, 44.48it/s][A
159it [00:03, 43.60it/s][A
164it [00:03, 44.23it/s][A
169it [00:03, 43.45it/s][A
174it [00:03, 44.12it/s][A
179it [00:04, 44.30it/s][A
184it [00:04, 44.63it/s][A
189it [00:04, 44.35it/s][A
194it [00:04, 44.45it/s][A
199it [00:04, 43.18it/s][A
204it [00:04, 43.91it/s][A
209it [00:04, 44.19it/s][A

Epoch: 313, Step: 200, Loss: 4.529977209568024



214it [00:04, 44.52it/s][A
219it [00:04, 44.89it/s][A
227it [00:05, 43.89it/s]
 63%|██████▎   | 313/500 [37:10<20:10,  6.48s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.46it/s][A
10it [00:00, 42.95it/s][A
15it [00:00, 44.05it/s][A
20it [00:00, 44.63it/s][A
25it [00:00, 44.89it/s][A
30it [00:00, 45.18it/s][A
35it [00:00, 45.36it/s][A
40it [00:00, 45.35it/s][A
45it [00:01, 43.74it/s][A
50it [00:01, 44.24it/s][A
55it [00:01, 43.92it/s][A
60it [00:01, 44.13it/s][A
65it [00:01, 44.50it/s][A
70it [00:01, 44.88it/s][A
75it [00:01, 44.81it/s][A
80it [00:01, 44.95it/s][A
85it [00:01, 43.43it/s][A
90it [00:02, 43.94it/s][A
95it [00:02, 43.71it/s][A
100it [00:02, 43.88it/s][A
105it [00:02, 44.43it/s][A

Epoch: 314, Step: 100, Loss: 4.51415629863739



110it [00:02, 44.72it/s][A
115it [00:02, 45.02it/s][A
120it [00:02, 45.16it/s][A
125it [00:02, 45.06it/s][A
130it [00:02, 45.17it/s][A
135it [00:03, 45.31it/s][A
140it [00:03, 43.91it/s][A
145it [00:03, 44.29it/s][A
150it [00:03, 44.71it/s][A
155it [00:03, 44.84it/s][A
160it [00:03, 45.10it/s][A
165it [00:03, 45.13it/s][A
170it [00:03, 45.37it/s][A
175it [00:03, 45.54it/s][A
180it [00:04, 45.57it/s][A
185it [00:04, 45.29it/s][A
190it [00:04, 44.84it/s][A
195it [00:04, 44.00it/s][A
200it [00:04, 44.29it/s][A
205it [00:04, 42.52it/s][A

Epoch: 314, Step: 200, Loss: 4.526305265426636



210it [00:04, 42.74it/s][A
215it [00:04, 43.50it/s][A
220it [00:04, 44.06it/s][A
227it [00:05, 44.38it/s]
 63%|██████▎   | 314/500 [37:15<18:48,  6.07s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.35it/s][A
10it [00:00, 43.02it/s][A
15it [00:00, 43.49it/s][A
20it [00:00, 43.26it/s][A
25it [00:00, 44.14it/s][A
30it [00:00, 43.17it/s][A
35it [00:00, 44.06it/s][A
40it [00:00, 44.34it/s][A
45it [00:01, 44.92it/s][A
50it [00:01, 44.73it/s][A
55it [00:01, 44.80it/s][A
60it [00:01, 44.97it/s][A
65it [00:01, 44.72it/s][A
70it [00:01, 44.63it/s][A
75it [00:01, 44.92it/s][A
80it [00:01, 45.15it/s][A
85it [00:01, 45.17it/s][A
90it [00:02, 45.53it/s][A
95it [00:02, 45.71it/s][A
100it [00:02, 45.28it/s][A
105it [00:02, 45.73it/s][A

Epoch: 315, Step: 100, Loss: 4.509218034744262



110it [00:02, 45.57it/s][A
115it [00:02, 45.71it/s][A
120it [00:02, 46.02it/s][A
125it [00:02, 46.20it/s][A
130it [00:02, 46.21it/s][A
135it [00:02, 46.49it/s][A
140it [00:03, 46.21it/s][A
145it [00:03, 46.30it/s][A
150it [00:03, 46.40it/s][A
155it [00:03, 46.52it/s][A
160it [00:03, 46.43it/s][A
165it [00:03, 46.26it/s][A
170it [00:03, 46.24it/s][A
175it [00:03, 46.31it/s][A
180it [00:03, 46.33it/s][A
185it [00:04, 45.85it/s][A
190it [00:04, 45.88it/s][A
195it [00:04, 45.94it/s][A
200it [00:04, 45.92it/s][A
205it [00:04, 45.51it/s][A

Epoch: 315, Step: 200, Loss: 4.527005999088288



210it [00:04, 45.74it/s][A
215it [00:04, 46.06it/s][A
220it [00:04, 46.30it/s][A
227it [00:04, 45.52it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.33it/s][A
13it [00:00, 60.26it/s][A
20it [00:00, 60.85it/s][A
27it [00:00, 58.47it/s][A
34it [00:00, 59.75it/s][A
41it [00:00, 60.51it/s][A
48it [00:00, 61.01it/s][A
55it [00:00, 61.50it/s][A
62it [00:01, 61.42it/s][A
69it [00:01, 61.24it/s][A
76it [00:01, 61.21it/s][A
83it [00:01, 58.86it/s][A
89it [00:01, 57.99it/s][A
95it [00:01, 56.66it/s][A
102it [00:01, 58.23it/s][A
109it [00:01, 59.63it/s][A
115it [00:01, 58.96it/s][A
122it [00:02, 59.40it/s][A
128it [00:02, 59.52it/s][A
134it [00:02, 57.30it/s][A
141it [00:02, 58.31it/s][A
148it [00:02, 59.11it/s][A
154it [00:02, 57.32it/s][A
161it [00:02, 58.23it/s][A
168it [00:02, 58.95it/s][A
174it [00:02, 59.03it/s][A
180it [00:03, 59.18it/s][A
186it [00:03, 59.26it/s][A
192it [00:03, 59.09it/s][A
199it [00:03, 59.49it/s][A
205it [00:03, 59.02it/s][A
212it [00:03, 


Epoch: 315, Test Loss: 5.503845992295639, Test Perplexity: 246.63832639611286




0it [00:00, ?it/s][A
5it [00:00, 44.71it/s][A
10it [00:00, 44.69it/s][A
15it [00:00, 45.04it/s][A
20it [00:00, 45.25it/s][A
25it [00:00, 44.96it/s][A
30it [00:00, 44.88it/s][A
35it [00:00, 44.78it/s][A
40it [00:00, 44.77it/s][A
45it [00:01, 44.79it/s][A
50it [00:01, 44.81it/s][A
55it [00:01, 43.79it/s][A
60it [00:01, 44.24it/s][A
65it [00:01, 44.82it/s][A
70it [00:01, 44.77it/s][A
75it [00:01, 44.91it/s][A
80it [00:01, 44.93it/s][A
85it [00:01, 45.06it/s][A
90it [00:02, 45.20it/s][A
95it [00:02, 44.91it/s][A
100it [00:02, 44.98it/s][A
105it [00:02, 45.05it/s][A

Epoch: 316, Step: 100, Loss: 4.5139737319946285



110it [00:02, 44.79it/s][A
115it [00:02, 44.32it/s][A
120it [00:02, 44.82it/s][A
125it [00:02, 45.05it/s][A
130it [00:02, 45.39it/s][A
135it [00:03, 45.16it/s][A
140it [00:03, 43.81it/s][A
145it [00:03, 44.35it/s][A
150it [00:03, 44.53it/s][A
155it [00:03, 44.89it/s][A
160it [00:03, 44.88it/s][A
165it [00:03, 45.07it/s][A
170it [00:03, 45.37it/s][A
175it [00:03, 44.21it/s][A
180it [00:04, 44.56it/s][A
185it [00:04, 44.69it/s][A
190it [00:04, 44.86it/s][A
195it [00:04, 44.93it/s][A
200it [00:04, 44.83it/s][A
205it [00:04, 45.20it/s][A

Epoch: 316, Step: 200, Loss: 4.529845163822174



210it [00:04, 45.21it/s][A
215it [00:04, 44.48it/s][A
220it [00:04, 44.65it/s][A
227it [00:05, 44.77it/s]
 63%|██████▎   | 316/500 [37:36<24:04,  7.85s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.61it/s][A
10it [00:00, 41.32it/s][A
15it [00:00, 43.27it/s][A
20it [00:00, 43.79it/s][A
25it [00:00, 42.91it/s][A
30it [00:00, 43.82it/s][A
35it [00:00, 44.50it/s][A
40it [00:00, 43.38it/s][A
45it [00:01, 42.79it/s][A
50it [00:01, 43.12it/s][A
55it [00:01, 43.66it/s][A
60it [00:01, 44.22it/s][A
65it [00:01, 44.36it/s][A
70it [00:01, 44.21it/s][A
75it [00:01, 44.38it/s][A
80it [00:01, 43.06it/s][A
85it [00:01, 43.58it/s][A
90it [00:02, 43.63it/s][A
95it [00:02, 43.90it/s][A
100it [00:02, 43.73it/s][A
105it [00:02, 44.11it/s][A

Epoch: 317, Step: 100, Loss: 4.519116721153259



110it [00:02, 44.36it/s][A
115it [00:02, 44.64it/s][A
120it [00:02, 44.98it/s][A
125it [00:02, 44.12it/s][A
130it [00:02, 44.62it/s][A
135it [00:03, 42.34it/s][A
140it [00:03, 43.30it/s][A
145it [00:03, 44.13it/s][A
150it [00:03, 44.57it/s][A
155it [00:03, 44.88it/s][A
160it [00:03, 44.58it/s][A
165it [00:03, 45.21it/s][A
170it [00:03, 44.00it/s][A
175it [00:03, 44.37it/s][A
180it [00:04, 44.72it/s][A
185it [00:04, 45.03it/s][A
190it [00:04, 45.19it/s][A
195it [00:04, 45.18it/s][A
200it [00:04, 45.24it/s][A
205it [00:04, 45.60it/s][A

Epoch: 317, Step: 200, Loss: 4.528416364192963



210it [00:04, 45.51it/s][A
215it [00:04, 45.64it/s][A
220it [00:04, 44.47it/s][A
227it [00:05, 44.20it/s]
 63%|██████▎   | 317/500 [37:41<21:27,  7.04s/it]
0it [00:00, ?it/s][A
4it [00:00, 39.46it/s][A
9it [00:00, 42.98it/s][A
14it [00:00, 44.30it/s][A
19it [00:00, 44.43it/s][A
24it [00:00, 44.58it/s][A
29it [00:00, 45.17it/s][A
34it [00:00, 44.97it/s][A
39it [00:00, 45.22it/s][A
44it [00:00, 45.34it/s][A
49it [00:01, 43.28it/s][A
54it [00:01, 44.05it/s][A
59it [00:01, 44.37it/s][A
64it [00:01, 43.86it/s][A
69it [00:01, 44.38it/s][A
74it [00:01, 44.74it/s][A
79it [00:01, 44.87it/s][A
84it [00:01, 44.04it/s][A
89it [00:02, 44.32it/s][A
94it [00:02, 44.63it/s][A
99it [00:02, 43.85it/s][A
104it [00:02, 44.44it/s][A
109it [00:02, 44.72it/s][A

Epoch: 318, Step: 100, Loss: 4.506298508644104



114it [00:02, 44.08it/s][A
119it [00:02, 44.69it/s][A
124it [00:02, 45.08it/s][A
129it [00:02, 45.05it/s][A
134it [00:03, 44.48it/s][A
139it [00:03, 43.68it/s][A
144it [00:03, 44.07it/s][A
149it [00:03, 44.23it/s][A
154it [00:03, 44.42it/s][A
159it [00:03, 44.59it/s][A
164it [00:03, 44.72it/s][A
169it [00:03, 42.94it/s][A
174it [00:03, 43.53it/s][A
179it [00:04, 42.69it/s][A
184it [00:04, 43.42it/s][A
189it [00:04, 43.87it/s][A
194it [00:04, 43.40it/s][A
199it [00:04, 44.28it/s][A
204it [00:04, 44.10it/s][A
209it [00:04, 44.66it/s][A

Epoch: 318, Step: 200, Loss: 4.528522243499756



214it [00:04, 43.24it/s][A
219it [00:04, 43.62it/s][A
227it [00:05, 44.15it/s]
 64%|██████▎   | 318/500 [37:47<19:37,  6.47s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.29it/s][A
10it [00:00, 45.57it/s][A
15it [00:00, 45.46it/s][A
20it [00:00, 45.43it/s][A
25it [00:00, 45.42it/s][A
30it [00:00, 44.68it/s][A
35it [00:00, 44.78it/s][A
40it [00:00, 44.89it/s][A
45it [00:01, 44.83it/s][A
50it [00:01, 45.04it/s][A
55it [00:01, 45.31it/s][A
60it [00:01, 45.46it/s][A
65it [00:01, 45.42it/s][A
70it [00:01, 45.47it/s][A
75it [00:01, 45.52it/s][A
80it [00:01, 45.12it/s][A
85it [00:01, 45.38it/s][A
90it [00:01, 44.15it/s][A
95it [00:02, 43.18it/s][A
100it [00:02, 43.53it/s][A
105it [00:02, 43.84it/s][A

Epoch: 319, Step: 100, Loss: 4.507984890937805



110it [00:02, 42.41it/s][A
115it [00:02, 43.23it/s][A
120it [00:02, 43.92it/s][A
125it [00:02, 44.35it/s][A
130it [00:02, 44.74it/s][A
135it [00:03, 44.95it/s][A
140it [00:03, 45.13it/s][A
145it [00:03, 43.73it/s][A
150it [00:03, 43.32it/s][A
155it [00:03, 43.33it/s][A
160it [00:03, 44.06it/s][A
165it [00:03, 44.63it/s][A
170it [00:03, 45.30it/s][A
175it [00:03, 45.75it/s][A
180it [00:04, 44.39it/s][A
185it [00:04, 44.37it/s][A
190it [00:04, 45.06it/s][A
195it [00:04, 45.42it/s][A
200it [00:04, 44.50it/s][A
205it [00:04, 45.18it/s][A

Epoch: 319, Step: 200, Loss: 4.528040471076966



210it [00:04, 45.05it/s][A
215it [00:04, 44.31it/s][A
220it [00:04, 45.15it/s][A
227it [00:05, 44.69it/s]
 64%|██████▍   | 319/500 [37:52<18:15,  6.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.25it/s][A
10it [00:00, 46.12it/s][A
15it [00:00, 46.42it/s][A
20it [00:00, 46.50it/s][A
25it [00:00, 46.52it/s][A
30it [00:00, 46.65it/s][A
35it [00:00, 46.76it/s][A
40it [00:00, 46.75it/s][A
45it [00:00, 46.30it/s][A
50it [00:01, 45.97it/s][A
55it [00:01, 45.95it/s][A
60it [00:01, 46.16it/s][A
65it [00:01, 46.47it/s][A
70it [00:01, 46.71it/s][A
75it [00:01, 46.42it/s][A
80it [00:01, 46.50it/s][A
85it [00:01, 45.41it/s][A
90it [00:01, 45.83it/s][A
95it [00:02, 45.72it/s][A
100it [00:02, 44.90it/s][A
105it [00:02, 45.15it/s][A

Epoch: 320, Step: 100, Loss: 4.505907912254333



110it [00:02, 44.95it/s][A
115it [00:02, 44.10it/s][A
120it [00:02, 44.72it/s][A
125it [00:02, 44.62it/s][A
130it [00:02, 44.99it/s][A
135it [00:02, 45.20it/s][A
140it [00:03, 45.19it/s][A
145it [00:03, 44.21it/s][A
150it [00:03, 44.78it/s][A
155it [00:03, 45.09it/s][A
160it [00:03, 45.25it/s][A
165it [00:03, 45.23it/s][A
170it [00:03, 45.11it/s][A
175it [00:03, 45.26it/s][A
180it [00:03, 45.21it/s][A
185it [00:04, 43.33it/s][A
190it [00:04, 42.87it/s][A
195it [00:04, 43.78it/s][A
200it [00:04, 44.37it/s][A
205it [00:04, 44.54it/s][A

Epoch: 320, Step: 200, Loss: 4.523421764373779



210it [00:04, 44.50it/s][A
215it [00:04, 44.78it/s][A
220it [00:04, 43.53it/s][A
227it [00:05, 45.01it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.03it/s][A
12it [00:00, 56.78it/s][A
19it [00:00, 58.59it/s][A
26it [00:00, 59.37it/s][A
32it [00:00, 59.56it/s][A
38it [00:00, 59.55it/s][A
45it [00:00, 59.83it/s][A
52it [00:00, 60.12it/s][A
59it [00:00, 59.97it/s][A
65it [00:01, 59.76it/s][A
71it [00:01, 57.36it/s][A
77it [00:01, 58.09it/s][A
83it [00:01, 58.39it/s][A
89it [00:01, 56.51it/s][A
95it [00:01, 57.39it/s][A
101it [00:01, 58.10it/s][A
108it [00:01, 58.89it/s][A
115it [00:01, 59.43it/s][A
121it [00:02, 59.32it/s][A
127it [00:02, 59.28it/s][A
134it [00:02, 59.73it/s][A
141it [00:02, 59.97it/s][A
148it [00:02, 60.10it/s][A
155it [00:02, 60.01it/s][A
161it [00:02, 59.94it/s][A
167it [00:02, 59.68it/s][A
173it [00:02, 59.65it/s][A
179it [00:03, 59.62it/s][A
186it [00:03, 59.94it/s][A
192it [00:03, 59.83it/s][A
198it [00:03, 58.03it/s][A
204it [00:03, 5


Epoch: 320, Test Loss: 5.505645574990266, Test Perplexity: 246.9917975005156




0it [00:00, ?it/s][A
5it [00:00, 45.99it/s][A
10it [00:00, 45.93it/s][A
15it [00:00, 45.61it/s][A
20it [00:00, 45.64it/s][A
25it [00:00, 45.51it/s][A
30it [00:00, 45.29it/s][A
35it [00:00, 45.13it/s][A
40it [00:00, 45.05it/s][A
45it [00:00, 45.02it/s][A
50it [00:01, 44.86it/s][A
55it [00:01, 45.19it/s][A
60it [00:01, 45.02it/s][A
65it [00:01, 44.86it/s][A
70it [00:01, 44.92it/s][A
75it [00:01, 44.70it/s][A
80it [00:01, 43.61it/s][A
85it [00:01, 44.20it/s][A
90it [00:02, 44.67it/s][A
95it [00:02, 44.76it/s][A
100it [00:02, 44.89it/s][A
105it [00:02, 44.98it/s][A

Epoch: 321, Step: 100, Loss: 4.516005043983459



110it [00:02, 44.64it/s][A
115it [00:02, 44.60it/s][A
120it [00:02, 44.78it/s][A
125it [00:02, 44.16it/s][A
130it [00:02, 44.25it/s][A
135it [00:03, 44.22it/s][A
140it [00:03, 44.28it/s][A
145it [00:03, 44.61it/s][A
150it [00:03, 44.62it/s][A
155it [00:03, 44.82it/s][A
160it [00:03, 44.82it/s][A
165it [00:03, 45.00it/s][A
170it [00:03, 44.93it/s][A
175it [00:03, 44.40it/s][A
180it [00:04, 44.12it/s][A
185it [00:04, 44.03it/s][A
190it [00:04, 44.43it/s][A
195it [00:04, 44.66it/s][A
200it [00:04, 44.96it/s][A
205it [00:04, 43.90it/s][A

Epoch: 321, Step: 200, Loss: 4.527534894943237



210it [00:04, 43.70it/s][A
215it [00:04, 44.18it/s][A
220it [00:04, 44.07it/s][A
227it [00:05, 44.59it/s]
 64%|██████▍   | 321/500 [38:13<23:23,  7.84s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.04it/s][A
10it [00:00, 44.13it/s][A
15it [00:00, 44.90it/s][A
20it [00:00, 45.50it/s][A
25it [00:00, 45.65it/s][A
30it [00:00, 45.40it/s][A
35it [00:00, 45.46it/s][A
40it [00:00, 45.68it/s][A
45it [00:00, 45.55it/s][A
50it [00:01, 45.66it/s][A
55it [00:01, 45.61it/s][A
60it [00:01, 45.65it/s][A
65it [00:01, 45.86it/s][A
70it [00:01, 45.88it/s][A
75it [00:01, 45.72it/s][A
80it [00:01, 43.67it/s][A
85it [00:01, 44.37it/s][A
90it [00:02, 43.48it/s][A
95it [00:02, 44.28it/s][A
100it [00:02, 44.44it/s][A
105it [00:02, 45.04it/s][A

Epoch: 322, Step: 100, Loss: 4.51517566204071



110it [00:02, 45.10it/s][A
115it [00:02, 45.49it/s][A
120it [00:02, 44.98it/s][A
125it [00:02, 45.10it/s][A
130it [00:02, 45.35it/s][A
135it [00:02, 45.37it/s][A
140it [00:03, 45.54it/s][A
145it [00:03, 45.67it/s][A
150it [00:03, 45.74it/s][A
155it [00:03, 43.88it/s][A
160it [00:03, 44.15it/s][A
165it [00:03, 44.02it/s][A
170it [00:03, 44.33it/s][A
175it [00:03, 44.75it/s][A
180it [00:03, 45.07it/s][A
185it [00:04, 45.43it/s][A
190it [00:04, 45.57it/s][A
195it [00:04, 45.64it/s][A
200it [00:04, 45.67it/s][A
205it [00:04, 45.58it/s][A

Epoch: 322, Step: 200, Loss: 4.527118673324585



210it [00:04, 45.32it/s][A
215it [00:04, 45.05it/s][A
220it [00:04, 45.10it/s][A
227it [00:05, 45.08it/s]
 64%|██████▍   | 322/500 [38:18<20:46,  7.00s/it]
0it [00:00, ?it/s][A
5it [00:00, 41.38it/s][A
10it [00:00, 43.73it/s][A
15it [00:00, 44.46it/s][A
20it [00:00, 45.11it/s][A
25it [00:00, 45.44it/s][A
30it [00:00, 45.41it/s][A
35it [00:00, 45.59it/s][A
40it [00:00, 45.55it/s][A
45it [00:00, 45.49it/s][A
50it [00:01, 45.57it/s][A
55it [00:01, 45.62it/s][A
60it [00:01, 45.71it/s][A
65it [00:01, 45.75it/s][A
70it [00:01, 45.77it/s][A
75it [00:01, 45.68it/s][A
80it [00:01, 45.68it/s][A
85it [00:01, 45.82it/s][A
90it [00:01, 45.80it/s][A
95it [00:02, 45.78it/s][A
100it [00:02, 45.72it/s][A
105it [00:02, 45.84it/s][A

Epoch: 323, Step: 100, Loss: 4.504758691787719



110it [00:02, 45.35it/s][A
115it [00:02, 45.43it/s][A
120it [00:02, 45.16it/s][A
125it [00:02, 44.97it/s][A
130it [00:02, 44.18it/s][A
135it [00:02, 44.34it/s][A
140it [00:03, 43.42it/s][A
145it [00:03, 43.87it/s][A
150it [00:03, 44.22it/s][A
155it [00:03, 43.39it/s][A
160it [00:03, 43.70it/s][A
165it [00:03, 44.31it/s][A
170it [00:03, 44.68it/s][A
175it [00:03, 44.00it/s][A
180it [00:04, 44.43it/s][A
185it [00:04, 44.68it/s][A
190it [00:04, 43.75it/s][A
195it [00:04, 44.07it/s][A
200it [00:04, 44.76it/s][A
205it [00:04, 43.55it/s][A

Epoch: 323, Step: 200, Loss: 4.52551029920578



210it [00:04, 44.13it/s][A
215it [00:04, 44.38it/s][A
220it [00:04, 44.68it/s][A
227it [00:05, 44.67it/s]
 65%|██████▍   | 323/500 [38:23<18:57,  6.43s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.12it/s][A
10it [00:00, 42.05it/s][A
15it [00:00, 43.02it/s][A
20it [00:00, 43.95it/s][A
25it [00:00, 44.64it/s][A
30it [00:00, 44.90it/s][A
35it [00:00, 44.99it/s][A
40it [00:00, 44.81it/s][A
45it [00:01, 44.98it/s][A
50it [00:01, 44.89it/s][A
55it [00:01, 45.11it/s][A
60it [00:01, 45.14it/s][A
65it [00:01, 43.67it/s][A
70it [00:01, 43.17it/s][A
75it [00:01, 43.95it/s][A
80it [00:01, 44.52it/s][A
85it [00:01, 43.63it/s][A
90it [00:02, 44.72it/s][A
95it [00:02, 45.22it/s][A
100it [00:02, 45.55it/s][A
105it [00:02, 45.71it/s][A

Epoch: 324, Step: 100, Loss: 4.50500937461853



110it [00:02, 45.66it/s][A
115it [00:02, 44.48it/s][A
120it [00:02, 45.21it/s][A
125it [00:02, 45.44it/s][A
130it [00:02, 44.44it/s][A
135it [00:03, 45.14it/s][A
140it [00:03, 45.10it/s][A
145it [00:03, 45.83it/s][A
150it [00:03, 45.94it/s][A
155it [00:03, 46.25it/s][A
160it [00:03, 45.97it/s][A
165it [00:03, 45.54it/s][A
170it [00:03, 45.42it/s][A
175it [00:03, 45.13it/s][A
180it [00:04, 45.19it/s][A
185it [00:04, 45.25it/s][A
190it [00:04, 45.46it/s][A
195it [00:04, 45.54it/s][A
200it [00:04, 45.75it/s][A
205it [00:04, 45.93it/s][A

Epoch: 324, Step: 200, Loss: 4.525285789966583



210it [00:04, 45.69it/s][A
215it [00:04, 45.94it/s][A
220it [00:04, 44.25it/s][A
227it [00:05, 44.97it/s]
 65%|██████▍   | 324/500 [38:28<17:38,  6.01s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.08it/s][A
10it [00:00, 45.11it/s][A
15it [00:00, 45.33it/s][A
20it [00:00, 45.36it/s][A
25it [00:00, 44.89it/s][A
30it [00:00, 44.97it/s][A
35it [00:00, 42.57it/s][A
40it [00:00, 43.18it/s][A
45it [00:01, 43.76it/s][A
50it [00:01, 44.26it/s][A
55it [00:01, 43.27it/s][A
60it [00:01, 44.13it/s][A
65it [00:01, 44.45it/s][A
70it [00:01, 44.79it/s][A
75it [00:01, 45.01it/s][A
80it [00:01, 44.17it/s][A
85it [00:01, 44.68it/s][A
90it [00:02, 45.01it/s][A
95it [00:02, 45.10it/s][A
100it [00:02, 44.15it/s][A
105it [00:02, 43.65it/s][A

Epoch: 325, Step: 100, Loss: 4.515923137664795



110it [00:02, 42.68it/s][A
115it [00:02, 42.12it/s][A
120it [00:02, 43.25it/s][A
125it [00:02, 43.81it/s][A
130it [00:02, 44.33it/s][A
135it [00:03, 44.64it/s][A
140it [00:03, 45.01it/s][A
145it [00:03, 45.20it/s][A
150it [00:03, 44.98it/s][A
155it [00:03, 43.12it/s][A
160it [00:03, 43.74it/s][A
165it [00:03, 42.98it/s][A
170it [00:03, 42.39it/s][A
175it [00:03, 43.01it/s][A
180it [00:04, 43.80it/s][A
185it [00:04, 44.32it/s][A
190it [00:04, 44.86it/s][A
195it [00:04, 44.15it/s][A
200it [00:04, 44.49it/s][A
205it [00:04, 45.06it/s][A

Epoch: 325, Step: 200, Loss: 4.526403744220733



210it [00:04, 45.27it/s][A
215it [00:04, 45.30it/s][A
220it [00:04, 45.33it/s][A
227it [00:05, 44.24it/s]

0it [00:00, ?it/s][A
6it [00:00, 52.62it/s][A
13it [00:00, 57.42it/s][A
19it [00:00, 58.46it/s][A
26it [00:00, 59.46it/s][A
32it [00:00, 59.53it/s][A
39it [00:00, 59.98it/s][A
45it [00:00, 59.96it/s][A
51it [00:00, 59.79it/s][A
57it [00:00, 58.83it/s][A
63it [00:01, 58.99it/s][A
69it [00:01, 59.23it/s][A
76it [00:01, 59.66it/s][A
83it [00:01, 60.13it/s][A
90it [00:01, 59.35it/s][A
96it [00:01, 59.10it/s][A
102it [00:01, 58.76it/s][A
108it [00:01, 58.75it/s][A
114it [00:01, 56.66it/s][A
120it [00:02, 57.36it/s][A
127it [00:02, 58.34it/s][A
134it [00:02, 58.97it/s][A
140it [00:02, 59.16it/s][A
146it [00:02, 59.11it/s][A
152it [00:02, 59.03it/s][A
158it [00:02, 58.29it/s][A
164it [00:02, 57.69it/s][A
170it [00:02, 55.78it/s][A
176it [00:03, 56.52it/s][A
182it [00:03, 57.11it/s][A
188it [00:03, 56.76it/s][A
194it [00:03, 57.14it/s][A
200it [00:03, 5


Epoch: 325, Test Loss: 5.512232377662421, Test Perplexity: 248.70667314233248




0it [00:00, ?it/s][A
5it [00:00, 44.71it/s][A
10it [00:00, 45.01it/s][A
15it [00:00, 42.80it/s][A
20it [00:00, 41.93it/s][A
25it [00:00, 42.70it/s][A
30it [00:00, 42.92it/s][A
35it [00:00, 42.65it/s][A
40it [00:00, 43.10it/s][A
45it [00:01, 43.91it/s][A
50it [00:01, 44.14it/s][A
55it [00:01, 44.31it/s][A
60it [00:01, 43.15it/s][A
65it [00:01, 43.59it/s][A
70it [00:01, 43.05it/s][A
75it [00:01, 43.66it/s][A
80it [00:01, 44.20it/s][A
85it [00:01, 44.72it/s][A
90it [00:02, 43.57it/s][A
95it [00:02, 43.89it/s][A
100it [00:02, 43.45it/s][A
105it [00:02, 43.75it/s][A

Epoch: 326, Step: 100, Loss: 4.507268252372742



110it [00:02, 44.07it/s][A
115it [00:02, 43.35it/s][A
120it [00:02, 43.37it/s][A
125it [00:02, 42.56it/s][A
130it [00:02, 43.17it/s][A
135it [00:03, 42.33it/s][A
140it [00:03, 42.63it/s][A
145it [00:03, 43.26it/s][A
150it [00:03, 41.48it/s][A
155it [00:03, 42.62it/s][A
160it [00:03, 43.25it/s][A
165it [00:03, 43.26it/s][A
170it [00:03, 43.64it/s][A
175it [00:04, 43.71it/s][A
180it [00:04, 42.61it/s][A
185it [00:04, 42.70it/s][A
190it [00:04, 43.37it/s][A
195it [00:04, 43.76it/s][A
200it [00:04, 42.89it/s][A
205it [00:04, 42.44it/s][A

Epoch: 326, Step: 200, Loss: 4.52267147064209



210it [00:04, 43.10it/s][A
215it [00:04, 43.82it/s][A
220it [00:05, 44.40it/s][A
227it [00:05, 43.39it/s]
 65%|██████▌   | 326/500 [38:49<22:51,  7.88s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.91it/s][A
10it [00:00, 45.16it/s][A
15it [00:00, 45.42it/s][A
20it [00:00, 43.97it/s][A
25it [00:00, 44.80it/s][A
30it [00:00, 45.17it/s][A
35it [00:00, 45.02it/s][A
40it [00:00, 45.19it/s][A
45it [00:00, 45.31it/s][A
50it [00:01, 44.82it/s][A
55it [00:01, 44.88it/s][A
60it [00:01, 44.86it/s][A
65it [00:01, 44.02it/s][A
70it [00:01, 43.09it/s][A
75it [00:01, 43.74it/s][A
80it [00:01, 44.25it/s][A
85it [00:01, 44.52it/s][A
90it [00:02, 44.95it/s][A
95it [00:02, 44.82it/s][A
100it [00:02, 44.90it/s][A
105it [00:02, 44.31it/s][A

Epoch: 327, Step: 100, Loss: 4.51202356338501



110it [00:02, 44.48it/s][A
115it [00:02, 44.79it/s][A
120it [00:02, 44.81it/s][A
125it [00:02, 43.73it/s][A
130it [00:02, 44.00it/s][A
135it [00:03, 44.40it/s][A
140it [00:03, 44.49it/s][A
145it [00:03, 44.29it/s][A
150it [00:03, 44.60it/s][A
155it [00:03, 44.65it/s][A
160it [00:03, 44.88it/s][A
165it [00:03, 44.87it/s][A
170it [00:03, 43.60it/s][A
175it [00:03, 43.97it/s][A
180it [00:04, 44.25it/s][A
185it [00:04, 44.10it/s][A
190it [00:04, 44.00it/s][A
195it [00:04, 44.45it/s][A
200it [00:04, 44.84it/s][A
205it [00:04, 45.16it/s][A

Epoch: 327, Step: 200, Loss: 4.522358682155609



210it [00:04, 45.37it/s][A
215it [00:04, 45.60it/s][A
220it [00:04, 44.27it/s][A
227it [00:05, 44.57it/s]
 65%|██████▌   | 327/500 [38:54<20:18,  7.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.13it/s][A
10it [00:00, 44.79it/s][A
15it [00:00, 42.90it/s][A
20it [00:00, 44.16it/s][A
25it [00:00, 44.76it/s][A
30it [00:00, 44.98it/s][A
35it [00:00, 45.26it/s][A
40it [00:00, 45.15it/s][A
45it [00:01, 45.62it/s][A
50it [00:01, 45.80it/s][A
55it [00:01, 45.53it/s][A
60it [00:01, 45.52it/s][A
65it [00:01, 45.71it/s][A
70it [00:01, 45.82it/s][A
75it [00:01, 46.00it/s][A
80it [00:01, 46.28it/s][A
85it [00:01, 46.57it/s][A
90it [00:01, 46.28it/s][A
95it [00:02, 46.32it/s][A
100it [00:02, 45.20it/s][A
105it [00:02, 43.95it/s][A

Epoch: 328, Step: 100, Loss: 4.505660424232483



110it [00:02, 44.54it/s][A
115it [00:02, 43.99it/s][A
120it [00:02, 44.76it/s][A
125it [00:02, 45.29it/s][A
130it [00:02, 44.45it/s][A
135it [00:02, 44.98it/s][A
140it [00:03, 45.16it/s][A
145it [00:03, 44.84it/s][A
150it [00:03, 43.87it/s][A
155it [00:03, 43.95it/s][A
160it [00:03, 44.38it/s][A
165it [00:03, 44.73it/s][A
170it [00:03, 45.75it/s][A
175it [00:03, 44.50it/s][A
180it [00:03, 44.84it/s][A
185it [00:04, 44.27it/s][A
190it [00:04, 43.29it/s][A
195it [00:04, 42.45it/s][A
200it [00:04, 42.77it/s][A
205it [00:04, 42.54it/s][A

Epoch: 328, Step: 200, Loss: 4.522030048370361



210it [00:04, 43.07it/s][A
215it [00:04, 43.84it/s][A
220it [00:04, 44.40it/s][A
227it [00:05, 44.68it/s]
 66%|██████▌   | 328/500 [38:59<18:30,  6.46s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.41it/s][A
10it [00:00, 45.62it/s][A
15it [00:00, 45.57it/s][A
20it [00:00, 45.38it/s][A
25it [00:00, 44.15it/s][A
30it [00:00, 44.84it/s][A
35it [00:00, 43.11it/s][A
40it [00:00, 43.32it/s][A
45it [00:01, 43.87it/s][A
50it [00:01, 44.18it/s][A
55it [00:01, 43.26it/s][A
60it [00:01, 44.03it/s][A
65it [00:01, 44.43it/s][A
70it [00:01, 44.89it/s][A
75it [00:01, 44.22it/s][A
80it [00:01, 44.34it/s][A
85it [00:01, 43.44it/s][A
90it [00:02, 43.80it/s][A
95it [00:02, 42.94it/s][A
100it [00:02, 43.32it/s][A
105it [00:02, 43.77it/s][A

Epoch: 329, Step: 100, Loss: 4.506514525413513



110it [00:02, 43.83it/s][A
115it [00:02, 44.00it/s][A
120it [00:02, 44.07it/s][A
125it [00:02, 43.30it/s][A
130it [00:02, 42.77it/s][A
135it [00:03, 43.36it/s][A
140it [00:03, 43.90it/s][A
145it [00:03, 44.31it/s][A
150it [00:03, 43.28it/s][A
155it [00:03, 43.65it/s][A
160it [00:03, 42.98it/s][A
165it [00:03, 43.72it/s][A
170it [00:03, 43.86it/s][A
175it [00:03, 44.36it/s][A
180it [00:04, 44.74it/s][A
185it [00:04, 44.93it/s][A
190it [00:04, 45.06it/s][A
195it [00:04, 44.82it/s][A
200it [00:04, 45.10it/s][A
205it [00:04, 45.23it/s][A

Epoch: 329, Step: 200, Loss: 4.522788712978363



210it [00:04, 45.08it/s][A
215it [00:04, 45.26it/s][A
220it [00:04, 43.91it/s][A
227it [00:05, 44.09it/s]
 66%|██████▌   | 329/500 [39:04<17:17,  6.07s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.89it/s][A
10it [00:00, 45.49it/s][A
15it [00:00, 45.60it/s][A
20it [00:00, 45.57it/s][A
25it [00:00, 45.14it/s][A
30it [00:00, 45.36it/s][A
35it [00:00, 44.52it/s][A
40it [00:00, 44.51it/s][A
45it [00:01, 44.70it/s][A
50it [00:01, 44.78it/s][A
55it [00:01, 44.87it/s][A
60it [00:01, 44.00it/s][A
65it [00:01, 44.47it/s][A
70it [00:01, 44.43it/s][A
75it [00:01, 44.86it/s][A
80it [00:01, 45.06it/s][A
85it [00:01, 45.29it/s][A
90it [00:02, 45.37it/s][A
95it [00:02, 45.12it/s][A
100it [00:02, 44.95it/s][A
105it [00:02, 44.33it/s][A

Epoch: 330, Step: 100, Loss: 4.522676930427552



110it [00:02, 43.17it/s][A
115it [00:02, 43.89it/s][A
120it [00:02, 43.30it/s][A
125it [00:02, 42.62it/s][A
130it [00:02, 43.32it/s][A
135it [00:03, 43.30it/s][A
140it [00:03, 43.69it/s][A
145it [00:03, 44.13it/s][A
150it [00:03, 44.18it/s][A
155it [00:03, 44.27it/s][A
160it [00:03, 44.77it/s][A
165it [00:03, 44.98it/s][A
170it [00:03, 44.60it/s][A
175it [00:03, 44.88it/s][A
180it [00:04, 43.86it/s][A
185it [00:04, 43.73it/s][A
190it [00:04, 44.11it/s][A
195it [00:04, 44.10it/s][A
200it [00:04, 44.07it/s][A
205it [00:04, 44.47it/s][A

Epoch: 330, Step: 200, Loss: 4.526398983001709



210it [00:04, 44.63it/s][A
215it [00:04, 44.82it/s][A
220it [00:04, 43.55it/s][A
227it [00:05, 44.35it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.12it/s][A
12it [00:00, 58.65it/s][A
18it [00:00, 58.32it/s][A
24it [00:00, 58.72it/s][A
30it [00:00, 58.89it/s][A
36it [00:00, 59.02it/s][A
42it [00:00, 58.90it/s][A
48it [00:00, 59.14it/s][A
54it [00:00, 59.25it/s][A
61it [00:01, 59.64it/s][A
67it [00:01, 59.70it/s][A
74it [00:01, 59.94it/s][A
80it [00:01, 59.95it/s][A
86it [00:01, 59.72it/s][A
93it [00:01, 59.95it/s][A
99it [00:01, 59.81it/s][A
105it [00:01, 59.83it/s][A
111it [00:01, 58.32it/s][A
117it [00:01, 58.05it/s][A
124it [00:02, 58.79it/s][A
130it [00:02, 56.66it/s][A
136it [00:02, 57.05it/s][A
142it [00:02, 57.81it/s][A
148it [00:02, 57.97it/s][A
154it [00:02, 58.35it/s][A
160it [00:02, 58.53it/s][A
167it [00:02, 59.13it/s][A
174it [00:02, 59.67it/s][A
180it [00:03, 59.74it/s][A
186it [00:03, 58.70it/s][A
193it [00:03, 59.21it/s][A
199it [00:03, 59


Epoch: 330, Test Loss: 5.512295475657682, Test Perplexity: 248.784027241772




0it [00:00, ?it/s][A
5it [00:00, 44.10it/s][A
10it [00:00, 44.87it/s][A
15it [00:00, 45.13it/s][A
20it [00:00, 43.75it/s][A
25it [00:00, 44.46it/s][A
30it [00:00, 43.63it/s][A
35it [00:00, 43.46it/s][A
40it [00:00, 42.34it/s][A
45it [00:01, 42.07it/s][A
50it [00:01, 41.53it/s][A
55it [00:01, 41.96it/s][A
60it [00:01, 41.74it/s][A
65it [00:01, 42.71it/s][A
70it [00:01, 43.16it/s][A
75it [00:01, 43.87it/s][A
80it [00:01, 44.22it/s][A
85it [00:01, 44.33it/s][A
90it [00:02, 44.53it/s][A
95it [00:02, 44.77it/s][A
100it [00:02, 44.98it/s][A
105it [00:02, 45.07it/s][A

Epoch: 331, Step: 100, Loss: 4.504891004562378



110it [00:02, 45.22it/s][A
115it [00:02, 45.21it/s][A
120it [00:02, 45.14it/s][A
125it [00:02, 45.04it/s][A
130it [00:02, 45.33it/s][A
135it [00:03, 45.50it/s][A
140it [00:03, 45.45it/s][A
145it [00:03, 45.14it/s][A
150it [00:03, 45.05it/s][A
155it [00:03, 45.17it/s][A
160it [00:03, 45.31it/s][A
165it [00:03, 43.55it/s][A
170it [00:03, 43.67it/s][A
175it [00:03, 44.09it/s][A
180it [00:04, 44.33it/s][A
185it [00:04, 43.99it/s][A
190it [00:04, 44.43it/s][A
195it [00:04, 43.43it/s][A
200it [00:04, 43.67it/s][A
205it [00:04, 44.21it/s][A

Epoch: 331, Step: 200, Loss: 4.522383568286895



210it [00:04, 44.29it/s][A
215it [00:04, 44.49it/s][A
220it [00:04, 44.84it/s][A
227it [00:05, 44.17it/s]
 66%|██████▌   | 331/500 [39:26<22:12,  7.88s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.92it/s][A
10it [00:00, 46.24it/s][A
15it [00:00, 46.16it/s][A
20it [00:00, 46.31it/s][A
25it [00:00, 44.36it/s][A
30it [00:00, 44.80it/s][A
35it [00:00, 44.96it/s][A
40it [00:00, 45.56it/s][A
45it [00:00, 44.73it/s][A
50it [00:01, 45.27it/s][A
55it [00:01, 45.42it/s][A
60it [00:01, 45.52it/s][A
65it [00:01, 45.39it/s][A
70it [00:01, 45.11it/s][A
75it [00:01, 45.03it/s][A
80it [00:01, 45.06it/s][A
85it [00:01, 45.18it/s][A
90it [00:01, 45.34it/s][A
95it [00:02, 45.15it/s][A
100it [00:02, 45.23it/s][A
105it [00:02, 44.25it/s][A

Epoch: 332, Step: 100, Loss: 4.513092436790466



110it [00:02, 44.60it/s][A
115it [00:02, 44.79it/s][A
120it [00:02, 44.93it/s][A
125it [00:02, 45.03it/s][A
130it [00:02, 45.29it/s][A
135it [00:02, 45.72it/s][A
140it [00:03, 45.94it/s][A
145it [00:03, 45.20it/s][A
150it [00:03, 44.68it/s][A
155it [00:03, 45.29it/s][A
160it [00:03, 44.80it/s][A
165it [00:03, 45.01it/s][A
170it [00:03, 44.81it/s][A
175it [00:03, 45.13it/s][A
180it [00:03, 44.61it/s][A
185it [00:04, 45.11it/s][A
190it [00:04, 44.52it/s][A
195it [00:04, 45.04it/s][A
200it [00:04, 44.31it/s][A
205it [00:04, 44.79it/s][A

Epoch: 332, Step: 200, Loss: 4.519648070335388



210it [00:04, 44.70it/s][A
215it [00:04, 44.86it/s][A
220it [00:04, 45.66it/s][A
227it [00:05, 45.13it/s]
 66%|██████▋   | 332/500 [39:31<19:40,  7.03s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.55it/s][A
10it [00:00, 46.46it/s][A
15it [00:00, 46.34it/s][A
20it [00:00, 46.23it/s][A
25it [00:00, 44.87it/s][A
30it [00:00, 45.18it/s][A
35it [00:00, 45.15it/s][A
40it [00:00, 45.34it/s][A
45it [00:00, 45.69it/s][A
50it [00:01, 45.27it/s][A
55it [00:01, 45.23it/s][A
60it [00:01, 44.88it/s][A
65it [00:01, 44.12it/s][A
70it [00:01, 44.66it/s][A
75it [00:01, 44.90it/s][A
80it [00:01, 45.12it/s][A
85it [00:01, 45.21it/s][A
90it [00:01, 45.29it/s][A
95it [00:02, 44.83it/s][A
100it [00:02, 43.61it/s][A
105it [00:02, 43.59it/s][A

Epoch: 333, Step: 100, Loss: 4.511786489486695



110it [00:02, 43.33it/s][A
115it [00:02, 43.44it/s][A
120it [00:02, 43.13it/s][A
125it [00:02, 43.52it/s][A
130it [00:02, 44.11it/s][A
135it [00:03, 44.06it/s][A
140it [00:03, 44.74it/s][A
145it [00:03, 44.75it/s][A
150it [00:03, 44.99it/s][A
155it [00:03, 43.70it/s][A
160it [00:03, 44.37it/s][A
165it [00:03, 44.44it/s][A
170it [00:03, 44.78it/s][A
175it [00:03, 44.84it/s][A
180it [00:04, 45.04it/s][A
185it [00:04, 45.22it/s][A
190it [00:04, 45.28it/s][A
195it [00:04, 45.48it/s][A
200it [00:04, 44.47it/s][A
205it [00:04, 44.49it/s][A

Epoch: 333, Step: 200, Loss: 4.5221523404121395



210it [00:04, 44.53it/s][A
215it [00:04, 43.33it/s][A
220it [00:04, 43.88it/s][A
227it [00:05, 44.59it/s]
 67%|██████▋   | 333/500 [39:36<17:56,  6.45s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.79it/s][A
10it [00:00, 45.38it/s][A
15it [00:00, 44.33it/s][A
20it [00:00, 44.46it/s][A
25it [00:00, 44.51it/s][A
30it [00:00, 44.46it/s][A
35it [00:00, 44.79it/s][A
40it [00:00, 45.08it/s][A
45it [00:01, 44.96it/s][A
50it [00:01, 44.60it/s][A
55it [00:01, 44.98it/s][A
60it [00:01, 45.21it/s][A
65it [00:01, 45.30it/s][A
70it [00:01, 45.36it/s][A
75it [00:01, 45.61it/s][A
80it [00:01, 45.60it/s][A
85it [00:01, 44.54it/s][A
90it [00:02, 44.84it/s][A
95it [00:02, 44.71it/s][A
100it [00:02, 43.83it/s][A
105it [00:02, 44.38it/s][A

Epoch: 334, Step: 100, Loss: 4.500499987602234



110it [00:02, 44.64it/s][A
115it [00:02, 44.91it/s][A
120it [00:02, 43.25it/s][A
125it [00:02, 43.84it/s][A
130it [00:02, 44.29it/s][A
135it [00:03, 44.69it/s][A
140it [00:03, 45.05it/s][A
145it [00:03, 43.81it/s][A
150it [00:03, 44.40it/s][A
155it [00:03, 44.73it/s][A
160it [00:03, 44.75it/s][A
165it [00:03, 44.90it/s][A
170it [00:03, 45.04it/s][A
175it [00:03, 44.98it/s][A
180it [00:04, 44.45it/s][A
185it [00:04, 44.86it/s][A
190it [00:04, 44.93it/s][A
195it [00:04, 44.62it/s][A
200it [00:04, 44.81it/s][A
205it [00:04, 45.08it/s][A

Epoch: 334, Step: 200, Loss: 4.519663488864898



210it [00:04, 45.18it/s][A
215it [00:04, 45.24it/s][A
220it [00:04, 45.23it/s][A
227it [00:05, 44.79it/s]
 67%|██████▋   | 334/500 [39:41<16:41,  6.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.16it/s][A
10it [00:00, 44.45it/s][A
15it [00:00, 44.85it/s][A
20it [00:00, 45.04it/s][A
25it [00:00, 45.10it/s][A
30it [00:00, 45.08it/s][A
35it [00:00, 44.12it/s][A
40it [00:00, 44.35it/s][A
45it [00:01, 43.51it/s][A
50it [00:01, 44.29it/s][A
55it [00:01, 44.63it/s][A
60it [00:01, 44.97it/s][A
65it [00:01, 45.13it/s][A
70it [00:01, 45.01it/s][A
75it [00:01, 45.26it/s][A
80it [00:01, 45.57it/s][A
85it [00:01, 45.53it/s][A
90it [00:02, 45.58it/s][A
95it [00:02, 44.42it/s][A
100it [00:02, 44.91it/s][A
105it [00:02, 44.95it/s][A

Epoch: 335, Step: 100, Loss: 4.511427235603333



110it [00:02, 44.58it/s][A
115it [00:02, 44.77it/s][A
120it [00:02, 43.96it/s][A
125it [00:02, 44.21it/s][A
130it [00:02, 44.31it/s][A
135it [00:03, 44.49it/s][A
140it [00:03, 44.56it/s][A
145it [00:03, 44.64it/s][A
150it [00:03, 44.30it/s][A
155it [00:03, 44.47it/s][A
160it [00:03, 44.41it/s][A
165it [00:03, 44.08it/s][A
170it [00:03, 44.30it/s][A
175it [00:03, 43.90it/s][A
180it [00:04, 44.38it/s][A
185it [00:04, 44.50it/s][A
190it [00:04, 44.90it/s][A
195it [00:04, 44.95it/s][A
200it [00:04, 45.17it/s][A
205it [00:04, 45.48it/s][A

Epoch: 335, Step: 200, Loss: 4.51962657213211



210it [00:04, 45.53it/s][A
215it [00:04, 45.54it/s][A
220it [00:04, 45.47it/s][A
227it [00:05, 44.73it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.88it/s][A
13it [00:00, 60.04it/s][A
20it [00:00, 60.25it/s][A
27it [00:00, 60.02it/s][A
34it [00:00, 60.36it/s][A
41it [00:00, 60.19it/s][A
48it [00:00, 57.98it/s][A
55it [00:00, 58.76it/s][A
61it [00:01, 58.97it/s][A
68it [00:01, 59.48it/s][A
75it [00:01, 59.85it/s][A
81it [00:01, 59.74it/s][A
88it [00:01, 60.01it/s][A
95it [00:01, 60.19it/s][A
102it [00:01, 60.32it/s][A
109it [00:01, 60.37it/s][A
116it [00:01, 58.66it/s][A
123it [00:02, 59.14it/s][A
130it [00:02, 59.72it/s][A
136it [00:02, 59.67it/s][A
143it [00:02, 59.88it/s][A
149it [00:02, 59.60it/s][A
155it [00:02, 59.49it/s][A
161it [00:02, 59.45it/s][A
167it [00:02, 59.41it/s][A
173it [00:02, 59.53it/s][A
179it [00:03, 59.36it/s][A
185it [00:03, 59.21it/s][A
191it [00:03, 59.41it/s][A
197it [00:03, 59.58it/s][A
203it [00:03, 59.43it/s][A
210it [00:03, 


Epoch: 335, Test Loss: 5.521805314543824, Test Perplexity: 251.06724136068215




0it [00:00, ?it/s][A
5it [00:00, 44.34it/s][A
10it [00:00, 42.77it/s][A
15it [00:00, 44.15it/s][A
20it [00:00, 44.66it/s][A
25it [00:00, 43.77it/s][A
30it [00:00, 44.13it/s][A
35it [00:00, 44.76it/s][A
40it [00:00, 43.56it/s][A
45it [00:01, 44.06it/s][A
50it [00:01, 44.66it/s][A
55it [00:01, 44.89it/s][A
60it [00:01, 45.07it/s][A
65it [00:01, 45.15it/s][A
70it [00:01, 45.03it/s][A
75it [00:01, 45.24it/s][A
80it [00:01, 44.37it/s][A
85it [00:01, 44.63it/s][A
90it [00:02, 44.93it/s][A
95it [00:02, 44.70it/s][A
100it [00:02, 44.92it/s][A
105it [00:02, 44.96it/s][A

Epoch: 336, Step: 100, Loss: 4.5095667552948



110it [00:02, 44.79it/s][A
115it [00:02, 44.97it/s][A
120it [00:02, 45.19it/s][A
125it [00:02, 45.15it/s][A
130it [00:02, 45.14it/s][A
135it [00:03, 45.29it/s][A
140it [00:03, 44.97it/s][A
145it [00:03, 43.97it/s][A
150it [00:03, 44.50it/s][A
155it [00:03, 43.69it/s][A
160it [00:03, 44.38it/s][A
165it [00:03, 44.80it/s][A
170it [00:03, 44.91it/s][A
175it [00:03, 45.38it/s][A
180it [00:04, 45.73it/s][A
185it [00:04, 45.96it/s][A
190it [00:04, 45.13it/s][A
195it [00:04, 45.50it/s][A
200it [00:04, 45.70it/s][A
205it [00:04, 45.89it/s][A

Epoch: 336, Step: 200, Loss: 4.5200118684768675



210it [00:04, 44.50it/s][A
215it [00:04, 44.96it/s][A
220it [00:04, 45.03it/s][A
227it [00:05, 44.71it/s]
 67%|██████▋   | 336/500 [40:02<21:28,  7.86s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.37it/s][A
10it [00:00, 45.70it/s][A
15it [00:00, 45.30it/s][A
20it [00:00, 46.04it/s][A
25it [00:00, 46.52it/s][A
30it [00:00, 46.56it/s][A
35it [00:00, 44.91it/s][A
40it [00:00, 45.36it/s][A
45it [00:00, 45.37it/s][A
50it [00:01, 45.24it/s][A
55it [00:01, 43.57it/s][A
60it [00:01, 43.62it/s][A
65it [00:01, 44.09it/s][A
70it [00:01, 44.48it/s][A
75it [00:01, 44.49it/s][A
80it [00:01, 44.69it/s][A
85it [00:01, 44.60it/s][A
90it [00:02, 44.46it/s][A
95it [00:02, 42.92it/s][A
100it [00:02, 43.62it/s][A
105it [00:02, 43.80it/s][A

Epoch: 337, Step: 100, Loss: 4.511164817810059



110it [00:02, 43.36it/s][A
115it [00:02, 43.75it/s][A
120it [00:02, 44.38it/s][A
125it [00:02, 44.63it/s][A
130it [00:02, 44.82it/s][A
135it [00:03, 44.80it/s][A
140it [00:03, 44.80it/s][A
145it [00:03, 44.39it/s][A
150it [00:03, 44.10it/s][A
155it [00:03, 44.13it/s][A
160it [00:03, 44.34it/s][A
165it [00:03, 44.24it/s][A
170it [00:03, 44.31it/s][A
175it [00:03, 44.87it/s][A
180it [00:04, 44.21it/s][A
185it [00:04, 44.25it/s][A
190it [00:04, 44.28it/s][A
195it [00:04, 44.12it/s][A
200it [00:04, 44.38it/s][A
205it [00:04, 44.67it/s][A

Epoch: 337, Step: 200, Loss: 4.518890371322632



210it [00:04, 44.30it/s][A
215it [00:04, 44.30it/s][A
220it [00:04, 44.33it/s][A
227it [00:05, 44.44it/s]
 67%|██████▋   | 337/500 [40:07<19:06,  7.03s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.55it/s][A
10it [00:00, 43.96it/s][A
15it [00:00, 43.81it/s][A
20it [00:00, 44.33it/s][A
25it [00:00, 44.50it/s][A
30it [00:00, 44.68it/s][A
35it [00:00, 44.99it/s][A
40it [00:00, 43.91it/s][A
45it [00:01, 44.04it/s][A
50it [00:01, 43.03it/s][A
55it [00:01, 43.45it/s][A
60it [00:01, 43.78it/s][A
65it [00:01, 44.21it/s][A
70it [00:01, 44.33it/s][A
75it [00:01, 44.40it/s][A
80it [00:01, 44.78it/s][A
85it [00:01, 44.60it/s][A
90it [00:02, 44.76it/s][A
95it [00:02, 44.77it/s][A
100it [00:02, 44.82it/s][A
105it [00:02, 44.60it/s][A

Epoch: 338, Step: 100, Loss: 4.511522507667541



110it [00:02, 44.59it/s][A
115it [00:02, 44.81it/s][A
120it [00:02, 44.63it/s][A
125it [00:02, 44.64it/s][A
130it [00:02, 43.69it/s][A
135it [00:03, 43.65it/s][A
140it [00:03, 43.79it/s][A
145it [00:03, 44.33it/s][A
150it [00:03, 44.38it/s][A
155it [00:03, 43.57it/s][A
160it [00:03, 44.12it/s][A
165it [00:03, 44.35it/s][A
170it [00:03, 44.49it/s][A
175it [00:03, 44.74it/s][A
180it [00:04, 44.38it/s][A
185it [00:04, 44.54it/s][A
190it [00:04, 44.71it/s][A
195it [00:04, 44.49it/s][A
200it [00:04, 44.58it/s][A
205it [00:04, 44.63it/s][A

Epoch: 338, Step: 200, Loss: 4.5195649099349975



210it [00:04, 44.54it/s][A
215it [00:04, 44.61it/s][A
220it [00:04, 44.51it/s][A
227it [00:05, 44.22it/s]
 68%|██████▊   | 338/500 [40:12<17:27,  6.46s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.67it/s][A
10it [00:00, 45.29it/s][A
15it [00:00, 45.01it/s][A
20it [00:00, 44.86it/s][A
25it [00:00, 44.57it/s][A
30it [00:00, 44.94it/s][A
35it [00:00, 45.08it/s][A
40it [00:00, 45.12it/s][A
45it [00:00, 45.34it/s][A
50it [00:01, 45.41it/s][A
55it [00:01, 45.33it/s][A
60it [00:01, 45.18it/s][A
65it [00:01, 44.81it/s][A
70it [00:01, 42.88it/s][A
75it [00:01, 43.65it/s][A
80it [00:01, 44.13it/s][A
85it [00:01, 44.33it/s][A
90it [00:02, 44.53it/s][A
95it [00:02, 44.40it/s][A
100it [00:02, 44.57it/s][A
105it [00:02, 44.71it/s][A

Epoch: 339, Step: 100, Loss: 4.504084806442261



110it [00:02, 44.54it/s][A
115it [00:02, 44.64it/s][A
120it [00:02, 44.71it/s][A
125it [00:02, 44.96it/s][A
130it [00:02, 44.90it/s][A
135it [00:03, 44.90it/s][A
140it [00:03, 44.83it/s][A
145it [00:03, 44.78it/s][A
150it [00:03, 45.07it/s][A
155it [00:03, 44.75it/s][A
160it [00:03, 44.93it/s][A
165it [00:03, 45.01it/s][A
170it [00:03, 45.16it/s][A
175it [00:03, 44.32it/s][A
180it [00:04, 44.43it/s][A
185it [00:04, 44.20it/s][A
190it [00:04, 44.66it/s][A
195it [00:04, 44.56it/s][A
200it [00:04, 44.67it/s][A
205it [00:04, 43.40it/s][A

Epoch: 339, Step: 200, Loss: 4.519044842720032



210it [00:04, 44.10it/s][A
215it [00:04, 44.56it/s][A
220it [00:04, 44.87it/s][A
227it [00:05, 44.55it/s]
 68%|██████▊   | 339/500 [40:17<16:14,  6.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.57it/s][A
10it [00:00, 44.66it/s][A
15it [00:00, 44.50it/s][A
20it [00:00, 43.04it/s][A
25it [00:00, 43.52it/s][A
30it [00:00, 44.13it/s][A
35it [00:00, 44.27it/s][A
40it [00:00, 44.43it/s][A
45it [00:01, 44.11it/s][A
50it [00:01, 44.40it/s][A
55it [00:01, 44.35it/s][A
60it [00:01, 44.06it/s][A
65it [00:01, 44.37it/s][A
70it [00:01, 44.44it/s][A
75it [00:01, 44.56it/s][A
80it [00:01, 44.73it/s][A
85it [00:01, 44.87it/s][A
90it [00:02, 44.76it/s][A
95it [00:02, 45.08it/s][A
100it [00:02, 44.99it/s][A
105it [00:02, 44.99it/s][A

Epoch: 340, Step: 100, Loss: 4.501077661514282



110it [00:02, 44.95it/s][A
115it [00:02, 45.08it/s][A
120it [00:02, 44.47it/s][A
125it [00:02, 43.53it/s][A
130it [00:02, 43.96it/s][A
135it [00:03, 42.13it/s][A
140it [00:03, 42.90it/s][A
145it [00:03, 43.66it/s][A
150it [00:03, 44.21it/s][A
155it [00:03, 43.00it/s][A
160it [00:03, 43.56it/s][A
165it [00:03, 44.05it/s][A
170it [00:03, 44.53it/s][A
175it [00:03, 44.83it/s][A
180it [00:04, 45.08it/s][A
185it [00:04, 45.27it/s][A
190it [00:04, 45.37it/s][A
195it [00:04, 45.36it/s][A
200it [00:04, 44.01it/s][A
205it [00:04, 44.23it/s][A

Epoch: 340, Step: 200, Loss: 4.517588148117065



210it [00:04, 44.28it/s][A
215it [00:04, 44.61it/s][A
220it [00:04, 42.83it/s][A
227it [00:05, 44.19it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.44it/s][A
13it [00:00, 59.94it/s][A
19it [00:00, 59.53it/s][A
25it [00:00, 59.43it/s][A
31it [00:00, 59.15it/s][A
37it [00:00, 59.35it/s][A
44it [00:00, 59.77it/s][A
50it [00:00, 59.77it/s][A
57it [00:00, 59.98it/s][A
64it [00:01, 60.35it/s][A
71it [00:01, 60.23it/s][A
78it [00:01, 60.32it/s][A
85it [00:01, 60.06it/s][A
92it [00:01, 59.98it/s][A
98it [00:01, 59.69it/s][A
105it [00:01, 59.92it/s][A
111it [00:01, 59.71it/s][A
117it [00:01, 59.38it/s][A
123it [00:02, 57.74it/s][A
129it [00:02, 58.09it/s][A
135it [00:02, 58.33it/s][A
141it [00:02, 55.66it/s][A
147it [00:02, 56.78it/s][A
154it [00:02, 57.95it/s][A
161it [00:02, 58.70it/s][A
168it [00:02, 59.27it/s][A
174it [00:02, 59.25it/s][A
180it [00:03, 59.21it/s][A
187it [00:03, 59.54it/s][A
194it [00:03, 59.86it/s][A
201it [00:03, 60.11it/s][A
208it [00:03, 6


Epoch: 340, Test Loss: 5.516446296472727, Test Perplexity: 249.74704571836483




0it [00:00, ?it/s][A
5it [00:00, 46.84it/s][A
10it [00:00, 45.66it/s][A
15it [00:00, 45.71it/s][A
20it [00:00, 44.47it/s][A
25it [00:00, 44.86it/s][A
30it [00:00, 45.26it/s][A
35it [00:00, 45.81it/s][A
40it [00:00, 45.64it/s][A
45it [00:00, 45.50it/s][A
50it [00:01, 45.22it/s][A
55it [00:01, 45.01it/s][A
60it [00:01, 44.32it/s][A
65it [00:01, 45.45it/s][A
70it [00:01, 46.07it/s][A
75it [00:01, 46.38it/s][A
80it [00:01, 46.05it/s][A
85it [00:01, 45.69it/s][A
90it [00:01, 45.52it/s][A
95it [00:02, 45.17it/s][A
100it [00:02, 45.15it/s][A
105it [00:02, 45.22it/s][A

Epoch: 341, Step: 100, Loss: 4.50035252571106



110it [00:02, 45.20it/s][A
115it [00:02, 45.06it/s][A
120it [00:02, 45.40it/s][A
125it [00:02, 45.47it/s][A
130it [00:02, 44.04it/s][A
135it [00:02, 44.11it/s][A
140it [00:03, 44.50it/s][A
145it [00:03, 44.89it/s][A
150it [00:03, 44.89it/s][A
155it [00:03, 44.79it/s][A
160it [00:03, 44.29it/s][A
165it [00:03, 44.54it/s][A
170it [00:03, 42.92it/s][A
175it [00:03, 42.85it/s][A
180it [00:04, 43.51it/s][A
185it [00:04, 44.14it/s][A
190it [00:04, 44.60it/s][A
195it [00:04, 44.61it/s][A
200it [00:04, 44.74it/s][A
205it [00:04, 44.87it/s][A

Epoch: 341, Step: 200, Loss: 4.519996035099029



210it [00:04, 44.41it/s][A
215it [00:04, 44.80it/s][A
220it [00:04, 44.84it/s][A
227it [00:05, 44.85it/s]
 68%|██████▊   | 341/500 [40:38<20:47,  7.84s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.74it/s][A
10it [00:00, 45.45it/s][A
15it [00:00, 44.97it/s][A
20it [00:00, 44.82it/s][A
25it [00:00, 45.17it/s][A
30it [00:00, 45.17it/s][A
35it [00:00, 45.04it/s][A
40it [00:00, 45.08it/s][A
45it [00:00, 45.24it/s][A
50it [00:01, 45.40it/s][A
55it [00:01, 45.60it/s][A
60it [00:01, 45.59it/s][A
65it [00:01, 45.70it/s][A
70it [00:01, 45.81it/s][A
75it [00:01, 45.67it/s][A
80it [00:01, 45.43it/s][A
85it [00:01, 45.30it/s][A
90it [00:01, 45.09it/s][A
95it [00:02, 44.56it/s][A
100it [00:02, 44.55it/s][A
105it [00:02, 43.34it/s][A

Epoch: 342, Step: 100, Loss: 4.503092308044433



110it [00:02, 44.19it/s][A
115it [00:02, 43.53it/s][A
120it [00:02, 44.21it/s][A
125it [00:02, 42.73it/s][A
130it [00:02, 43.10it/s][A
135it [00:03, 43.05it/s][A
140it [00:03, 43.17it/s][A
145it [00:03, 43.47it/s][A
150it [00:03, 42.41it/s][A
155it [00:03, 42.03it/s][A
160it [00:03, 42.88it/s][A
165it [00:03, 43.51it/s][A
170it [00:03, 43.69it/s][A
175it [00:03, 43.37it/s][A
180it [00:04, 43.54it/s][A
185it [00:04, 43.62it/s][A
190it [00:04, 43.18it/s][A
195it [00:04, 43.58it/s][A
200it [00:04, 42.74it/s][A
205it [00:04, 42.96it/s][A

Epoch: 342, Step: 200, Loss: 4.517937796115875



210it [00:04, 43.35it/s][A
215it [00:04, 43.59it/s][A
220it [00:04, 43.67it/s][A
227it [00:05, 44.03it/s]
 68%|██████▊   | 342/500 [40:44<18:32,  7.04s/it]
0it [00:00, ?it/s][A
4it [00:00, 39.02it/s][A
9it [00:00, 41.92it/s][A
14it [00:00, 41.69it/s][A
19it [00:00, 42.74it/s][A
24it [00:00, 43.39it/s][A
29it [00:00, 43.49it/s][A
34it [00:00, 43.65it/s][A
39it [00:00, 43.79it/s][A
44it [00:01, 43.81it/s][A
49it [00:01, 44.25it/s][A
54it [00:01, 44.43it/s][A
59it [00:01, 44.68it/s][A
64it [00:01, 44.52it/s][A
69it [00:01, 44.73it/s][A
74it [00:01, 44.90it/s][A
79it [00:01, 44.77it/s][A
84it [00:01, 44.79it/s][A
89it [00:02, 43.69it/s][A
94it [00:02, 44.25it/s][A
99it [00:02, 44.27it/s][A
104it [00:02, 43.27it/s][A

Epoch: 343, Step: 100, Loss: 4.493028926849365



109it [00:02, 42.64it/s][A
114it [00:02, 43.08it/s][A
119it [00:02, 42.86it/s][A
124it [00:02, 43.34it/s][A
129it [00:02, 42.65it/s][A
134it [00:03, 43.51it/s][A
139it [00:03, 42.78it/s][A
144it [00:03, 43.33it/s][A
149it [00:03, 43.96it/s][A
154it [00:03, 43.52it/s][A
159it [00:03, 44.01it/s][A
164it [00:03, 42.90it/s][A
169it [00:03, 43.67it/s][A
174it [00:03, 44.07it/s][A
179it [00:04, 43.77it/s][A
184it [00:04, 44.09it/s][A
189it [00:04, 44.40it/s][A
194it [00:04, 44.57it/s][A
199it [00:04, 43.98it/s][A
204it [00:04, 44.07it/s][A
209it [00:04, 44.55it/s][A

Epoch: 343, Step: 200, Loss: 4.517855050563813



214it [00:04, 44.52it/s][A
219it [00:04, 44.46it/s][A
227it [00:05, 43.84it/s]
 69%|██████▊   | 343/500 [40:49<16:57,  6.48s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.89it/s][A
10it [00:00, 44.30it/s][A
15it [00:00, 44.21it/s][A
20it [00:00, 44.14it/s][A
25it [00:00, 42.84it/s][A
30it [00:00, 42.39it/s][A
35it [00:00, 43.23it/s][A
40it [00:00, 43.74it/s][A
45it [00:01, 44.46it/s][A
50it [00:01, 44.97it/s][A
55it [00:01, 44.93it/s][A
60it [00:01, 45.02it/s][A
65it [00:01, 45.18it/s][A
70it [00:01, 45.18it/s][A
75it [00:01, 44.97it/s][A
80it [00:01, 45.15it/s][A
85it [00:01, 45.26it/s][A
90it [00:02, 45.35it/s][A
95it [00:02, 45.15it/s][A
100it [00:02, 45.22it/s][A
105it [00:02, 45.24it/s][A

Epoch: 344, Step: 100, Loss: 4.501414260864258



110it [00:02, 44.03it/s][A
115it [00:02, 44.18it/s][A
120it [00:02, 44.22it/s][A
125it [00:02, 44.17it/s][A
130it [00:02, 44.07it/s][A
135it [00:03, 44.62it/s][A
140it [00:03, 43.44it/s][A
145it [00:03, 42.78it/s][A
150it [00:03, 43.67it/s][A
155it [00:03, 44.25it/s][A
160it [00:03, 44.56it/s][A
165it [00:03, 44.73it/s][A
170it [00:03, 43.85it/s][A
175it [00:03, 44.40it/s][A
180it [00:04, 44.62it/s][A
185it [00:04, 43.40it/s][A
190it [00:04, 43.47it/s][A
195it [00:04, 43.76it/s][A
200it [00:04, 43.56it/s][A
205it [00:04, 43.46it/s][A

Epoch: 344, Step: 200, Loss: 4.514267833232879



210it [00:04, 43.87it/s][A
215it [00:04, 44.31it/s][A
220it [00:04, 44.55it/s][A
227it [00:05, 44.30it/s]
 69%|██████▉   | 344/500 [40:54<15:47,  6.07s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.26it/s][A
10it [00:00, 46.02it/s][A
15it [00:00, 45.46it/s][A
20it [00:00, 45.28it/s][A
25it [00:00, 45.33it/s][A
30it [00:00, 45.03it/s][A
35it [00:00, 44.88it/s][A
40it [00:00, 44.54it/s][A
45it [00:01, 44.48it/s][A
50it [00:01, 44.34it/s][A
55it [00:01, 43.57it/s][A
60it [00:01, 43.81it/s][A
65it [00:01, 42.98it/s][A
70it [00:01, 43.67it/s][A
75it [00:01, 44.01it/s][A
80it [00:01, 44.41it/s][A
85it [00:01, 44.48it/s][A
90it [00:02, 44.86it/s][A
95it [00:02, 44.94it/s][A
100it [00:02, 45.04it/s][A
105it [00:02, 44.66it/s][A

Epoch: 345, Step: 100, Loss: 4.50685562133789



110it [00:02, 44.56it/s][A
115it [00:02, 44.70it/s][A
120it [00:02, 44.86it/s][A
125it [00:02, 44.98it/s][A
130it [00:02, 45.12it/s][A
135it [00:03, 45.15it/s][A
140it [00:03, 45.09it/s][A
145it [00:03, 45.47it/s][A
150it [00:03, 45.60it/s][A
155it [00:03, 45.64it/s][A
160it [00:03, 45.57it/s][A
165it [00:03, 45.64it/s][A
170it [00:03, 44.90it/s][A
175it [00:03, 45.26it/s][A
180it [00:04, 45.60it/s][A
185it [00:04, 45.64it/s][A
190it [00:04, 45.81it/s][A
195it [00:04, 45.78it/s][A
200it [00:04, 45.51it/s][A
205it [00:04, 45.39it/s][A

Epoch: 345, Step: 200, Loss: 4.517482678890229



210it [00:04, 44.91it/s][A
215it [00:04, 45.19it/s][A
220it [00:04, 45.12it/s][A
227it [00:05, 44.88it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.27it/s][A
12it [00:00, 58.52it/s][A
19it [00:00, 59.56it/s][A
25it [00:00, 59.48it/s][A
31it [00:00, 59.23it/s][A
37it [00:00, 59.13it/s][A
43it [00:00, 56.65it/s][A
50it [00:00, 57.91it/s][A
56it [00:00, 58.48it/s][A
62it [00:01, 56.54it/s][A
68it [00:01, 57.36it/s][A
74it [00:01, 57.75it/s][A
80it [00:01, 58.37it/s][A
86it [00:01, 58.67it/s][A
92it [00:01, 58.62it/s][A
99it [00:01, 59.41it/s][A
106it [00:01, 59.86it/s][A
112it [00:01, 59.88it/s][A
118it [00:02, 59.83it/s][A
125it [00:02, 59.93it/s][A
131it [00:02, 59.88it/s][A
138it [00:02, 60.14it/s][A
145it [00:02, 59.62it/s][A
151it [00:02, 57.20it/s][A
157it [00:02, 57.97it/s][A
163it [00:02, 57.88it/s][A
170it [00:02, 58.64it/s][A
176it [00:02, 58.75it/s][A
182it [00:03, 58.84it/s][A
188it [00:03, 58.96it/s][A
194it [00:03, 56.84it/s][A
201it [00:03, 58


Epoch: 345, Test Loss: 5.519364437701539, Test Perplexity: 250.3899770464216




0it [00:00, ?it/s][A
5it [00:00, 43.47it/s][A
10it [00:00, 44.65it/s][A
15it [00:00, 45.03it/s][A
20it [00:00, 44.82it/s][A
25it [00:00, 44.58it/s][A
30it [00:00, 44.79it/s][A
35it [00:00, 44.83it/s][A
40it [00:00, 44.89it/s][A
45it [00:01, 44.80it/s][A
50it [00:01, 44.91it/s][A
55it [00:01, 44.84it/s][A
60it [00:01, 44.87it/s][A
65it [00:01, 44.76it/s][A
70it [00:01, 44.53it/s][A
75it [00:01, 44.62it/s][A
80it [00:01, 44.66it/s][A
85it [00:01, 44.70it/s][A
90it [00:02, 44.70it/s][A
95it [00:02, 44.91it/s][A
100it [00:02, 45.05it/s][A
105it [00:02, 43.80it/s][A

Epoch: 346, Step: 100, Loss: 4.498478908538818



110it [00:02, 44.10it/s][A
115it [00:02, 44.26it/s][A
120it [00:02, 42.81it/s][A
125it [00:02, 43.46it/s][A
130it [00:02, 44.03it/s][A
135it [00:03, 44.40it/s][A
140it [00:03, 44.76it/s][A
145it [00:03, 45.06it/s][A
150it [00:03, 45.13it/s][A
155it [00:03, 45.12it/s][A
160it [00:03, 44.94it/s][A
165it [00:03, 45.21it/s][A
170it [00:03, 45.37it/s][A
175it [00:03, 45.25it/s][A
180it [00:04, 45.15it/s][A
185it [00:04, 45.36it/s][A
190it [00:04, 45.26it/s][A
195it [00:04, 45.22it/s][A
200it [00:04, 44.92it/s][A
205it [00:04, 44.96it/s][A

Epoch: 346, Step: 200, Loss: 4.516178517341614



210it [00:04, 44.91it/s][A
215it [00:04, 45.25it/s][A
220it [00:04, 45.26it/s][A
227it [00:05, 44.68it/s]
 69%|██████▉   | 346/500 [41:15<20:09,  7.86s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.35it/s][A
10it [00:00, 45.02it/s][A
15it [00:00, 44.87it/s][A
20it [00:00, 44.33it/s][A
25it [00:00, 44.59it/s][A
30it [00:00, 43.11it/s][A
35it [00:00, 43.76it/s][A
40it [00:00, 43.32it/s][A
45it [00:01, 43.44it/s][A
50it [00:01, 43.48it/s][A
55it [00:01, 41.70it/s][A
60it [00:01, 42.31it/s][A
65it [00:01, 43.16it/s][A
70it [00:01, 43.80it/s][A
75it [00:01, 44.14it/s][A
80it [00:01, 44.35it/s][A
85it [00:01, 44.49it/s][A
90it [00:02, 44.44it/s][A
95it [00:02, 44.63it/s][A
100it [00:02, 44.57it/s][A
105it [00:02, 44.67it/s][A

Epoch: 347, Step: 100, Loss: 4.4980945920944215



110it [00:02, 44.37it/s][A
115it [00:02, 44.35it/s][A
120it [00:02, 44.01it/s][A
125it [00:02, 43.25it/s][A
130it [00:02, 43.95it/s][A
135it [00:03, 44.41it/s][A
140it [00:03, 44.74it/s][A
145it [00:03, 44.60it/s][A
150it [00:03, 44.27it/s][A
155it [00:03, 43.44it/s][A
160it [00:03, 43.58it/s][A
165it [00:03, 43.90it/s][A
170it [00:03, 44.33it/s][A
175it [00:03, 44.65it/s][A
180it [00:04, 45.07it/s][A
185it [00:04, 45.27it/s][A
190it [00:04, 45.23it/s][A
195it [00:04, 45.02it/s][A
200it [00:04, 45.18it/s][A
205it [00:04, 45.33it/s][A

Epoch: 347, Step: 200, Loss: 4.516886816024781



210it [00:04, 44.95it/s][A
215it [00:04, 45.25it/s][A
220it [00:04, 45.51it/s][A
227it [00:05, 44.23it/s]
 69%|██████▉   | 347/500 [41:20<17:57,  7.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.43it/s][A
10it [00:00, 44.88it/s][A
15it [00:00, 44.83it/s][A
20it [00:00, 43.31it/s][A
25it [00:00, 43.99it/s][A
30it [00:00, 44.31it/s][A
35it [00:00, 44.81it/s][A
40it [00:00, 43.63it/s][A
45it [00:01, 44.43it/s][A
50it [00:01, 44.59it/s][A
55it [00:01, 44.86it/s][A
60it [00:01, 44.59it/s][A
65it [00:01, 44.70it/s][A
70it [00:01, 44.89it/s][A
75it [00:01, 44.94it/s][A
80it [00:01, 44.93it/s][A
85it [00:01, 44.67it/s][A
90it [00:02, 44.86it/s][A
95it [00:02, 44.98it/s][A
100it [00:02, 44.04it/s][A
105it [00:02, 44.35it/s][A

Epoch: 348, Step: 100, Loss: 4.502005491256714



110it [00:02, 44.54it/s][A
115it [00:02, 44.86it/s][A
120it [00:02, 45.06it/s][A
125it [00:02, 45.13it/s][A
130it [00:02, 45.16it/s][A
135it [00:03, 45.27it/s][A
140it [00:03, 45.19it/s][A
145it [00:03, 45.34it/s][A
150it [00:03, 45.38it/s][A
155it [00:03, 44.38it/s][A
160it [00:03, 44.51it/s][A
165it [00:03, 44.79it/s][A
170it [00:03, 45.07it/s][A
175it [00:03, 44.05it/s][A
180it [00:04, 44.31it/s][A
185it [00:04, 44.50it/s][A
190it [00:04, 44.62it/s][A
195it [00:04, 44.87it/s][A
200it [00:04, 44.66it/s][A
205it [00:04, 45.09it/s][A

Epoch: 348, Step: 200, Loss: 4.51244389295578



210it [00:04, 43.94it/s][A
215it [00:04, 44.58it/s][A
220it [00:04, 44.94it/s][A
227it [00:05, 44.68it/s]
 70%|██████▉   | 348/500 [41:25<16:20,  6.45s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.86it/s][A
10it [00:00, 40.83it/s][A
15it [00:00, 42.60it/s][A
20it [00:00, 43.40it/s][A
25it [00:00, 43.97it/s][A
30it [00:00, 44.59it/s][A
35it [00:00, 44.90it/s][A
40it [00:00, 45.19it/s][A
45it [00:01, 44.05it/s][A
50it [00:01, 44.14it/s][A
55it [00:01, 43.88it/s][A
60it [00:01, 44.14it/s][A
65it [00:01, 42.66it/s][A
70it [00:01, 43.24it/s][A
75it [00:01, 43.61it/s][A
80it [00:01, 44.12it/s][A
85it [00:01, 43.17it/s][A
90it [00:02, 43.75it/s][A
95it [00:02, 43.90it/s][A
100it [00:02, 43.90it/s][A
105it [00:02, 43.62it/s][A

Epoch: 349, Step: 100, Loss: 4.500284910202026



110it [00:02, 42.94it/s][A
115it [00:02, 43.51it/s][A
120it [00:02, 43.80it/s][A
125it [00:02, 44.43it/s][A
130it [00:02, 44.74it/s][A
135it [00:03, 45.02it/s][A
140it [00:03, 44.90it/s][A
145it [00:03, 43.83it/s][A
150it [00:03, 43.14it/s][A
155it [00:03, 43.41it/s][A
160it [00:03, 44.26it/s][A
165it [00:03, 44.73it/s][A
170it [00:03, 45.04it/s][A
175it [00:03, 45.24it/s][A
180it [00:04, 45.39it/s][A
185it [00:04, 44.50it/s][A
190it [00:04, 43.77it/s][A
195it [00:04, 44.39it/s][A
200it [00:04, 44.32it/s][A
205it [00:04, 44.21it/s][A

Epoch: 349, Step: 200, Loss: 4.515175960063934



210it [00:04, 43.86it/s][A
215it [00:04, 44.57it/s][A
220it [00:04, 44.76it/s][A
227it [00:05, 44.09it/s]
 70%|██████▉   | 349/500 [41:30<15:15,  6.06s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.87it/s][A
10it [00:00, 43.01it/s][A
15it [00:00, 44.15it/s][A
20it [00:00, 43.95it/s][A
25it [00:00, 44.75it/s][A
30it [00:00, 45.16it/s][A
35it [00:00, 45.46it/s][A
40it [00:00, 45.55it/s][A
45it [00:01, 45.49it/s][A
50it [00:01, 45.47it/s][A
55it [00:01, 45.55it/s][A
60it [00:01, 45.33it/s][A
65it [00:01, 45.44it/s][A
70it [00:01, 45.39it/s][A
75it [00:01, 45.46it/s][A
80it [00:01, 45.50it/s][A
85it [00:01, 44.33it/s][A
90it [00:02, 44.32it/s][A
95it [00:02, 43.38it/s][A
100it [00:02, 43.97it/s][A
105it [00:02, 44.67it/s][A

Epoch: 350, Step: 100, Loss: 4.49682430267334



110it [00:02, 44.81it/s][A
115it [00:02, 45.02it/s][A
120it [00:02, 45.20it/s][A
125it [00:02, 43.11it/s][A
130it [00:02, 43.57it/s][A
135it [00:03, 43.93it/s][A
140it [00:03, 44.24it/s][A
145it [00:03, 44.45it/s][A
150it [00:03, 43.52it/s][A
155it [00:03, 43.86it/s][A
160it [00:03, 44.30it/s][A
165it [00:03, 44.68it/s][A
170it [00:03, 44.73it/s][A
175it [00:03, 44.74it/s][A
180it [00:04, 42.71it/s][A
185it [00:04, 43.42it/s][A
190it [00:04, 44.14it/s][A
195it [00:04, 44.70it/s][A
200it [00:04, 45.03it/s][A
205it [00:04, 45.21it/s][A

Epoch: 350, Step: 200, Loss: 4.51589661359787



210it [00:04, 45.25it/s][A
215it [00:04, 45.50it/s][A
220it [00:04, 45.68it/s][A
227it [00:05, 44.65it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.16it/s][A
13it [00:00, 60.50it/s][A
20it [00:00, 58.68it/s][A
26it [00:00, 57.09it/s][A
33it [00:00, 58.71it/s][A
39it [00:00, 59.07it/s][A
46it [00:00, 59.79it/s][A
53it [00:00, 60.14it/s][A
60it [00:01, 60.22it/s][A
67it [00:01, 60.51it/s][A
74it [00:01, 58.54it/s][A
80it [00:01, 56.91it/s][A
87it [00:01, 58.27it/s][A
94it [00:01, 59.21it/s][A
100it [00:01, 59.36it/s][A
107it [00:01, 59.84it/s][A
114it [00:01, 60.10it/s][A
121it [00:02, 60.01it/s][A
128it [00:02, 60.27it/s][A
135it [00:02, 58.35it/s][A
141it [00:02, 58.53it/s][A
148it [00:02, 59.39it/s][A
155it [00:02, 59.86it/s][A
161it [00:02, 57.32it/s][A
168it [00:02, 58.57it/s][A
174it [00:02, 56.99it/s][A
181it [00:03, 58.10it/s][A
187it [00:03, 57.43it/s][A
194it [00:03, 58.38it/s][A
200it [00:03, 56.17it/s][A
206it [00:03, 57.04it/s][A
213it [00:03, 


Epoch: 350, Test Loss: 5.526290561101451, Test Perplexity: 252.21287484494795




0it [00:00, ?it/s][A
5it [00:00, 44.53it/s][A
10it [00:00, 44.98it/s][A
15it [00:00, 44.75it/s][A
20it [00:00, 43.02it/s][A
25it [00:00, 43.31it/s][A
30it [00:00, 43.78it/s][A
35it [00:00, 44.22it/s][A
40it [00:00, 44.39it/s][A
45it [00:01, 44.39it/s][A
50it [00:01, 44.44it/s][A
55it [00:01, 44.48it/s][A
60it [00:01, 43.79it/s][A
65it [00:01, 44.01it/s][A
70it [00:01, 43.29it/s][A
75it [00:01, 42.74it/s][A
80it [00:01, 43.52it/s][A
85it [00:01, 44.23it/s][A
90it [00:02, 43.60it/s][A
95it [00:02, 44.19it/s][A
100it [00:02, 44.50it/s][A
105it [00:02, 44.89it/s][A

Epoch: 351, Step: 100, Loss: 4.503736691474915



110it [00:02, 45.04it/s][A
115it [00:02, 45.14it/s][A
120it [00:02, 45.18it/s][A
125it [00:02, 43.61it/s][A
130it [00:02, 42.89it/s][A
135it [00:03, 43.24it/s][A
140it [00:03, 43.67it/s][A
145it [00:03, 43.38it/s][A
150it [00:03, 43.84it/s][A
155it [00:03, 44.31it/s][A
160it [00:03, 44.21it/s][A
165it [00:03, 44.57it/s][A
170it [00:03, 43.25it/s][A
175it [00:03, 43.53it/s][A
180it [00:04, 43.75it/s][A
185it [00:04, 43.75it/s][A
190it [00:04, 44.33it/s][A
195it [00:04, 44.47it/s][A
200it [00:04, 44.72it/s][A
205it [00:04, 44.82it/s][A

Epoch: 351, Step: 200, Loss: 4.512971558570862



210it [00:04, 43.27it/s][A
215it [00:04, 42.33it/s][A
220it [00:05, 43.06it/s][A
227it [00:05, 43.95it/s]
 70%|███████   | 351/500 [41:52<19:37,  7.90s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.80it/s][A
10it [00:00, 45.48it/s][A
15it [00:00, 42.96it/s][A
20it [00:00, 44.07it/s][A
25it [00:00, 43.48it/s][A
30it [00:00, 44.01it/s][A
35it [00:00, 44.37it/s][A
40it [00:00, 44.67it/s][A
45it [00:01, 44.73it/s][A
50it [00:01, 44.76it/s][A
55it [00:01, 44.32it/s][A
60it [00:01, 44.90it/s][A
65it [00:01, 45.17it/s][A
70it [00:01, 45.17it/s][A
75it [00:01, 45.21it/s][A
80it [00:01, 45.22it/s][A
85it [00:01, 44.99it/s][A
90it [00:02, 45.26it/s][A
95it [00:02, 45.28it/s][A
100it [00:02, 43.62it/s][A
105it [00:02, 44.14it/s][A

Epoch: 352, Step: 100, Loss: 4.4872402000427245



110it [00:02, 44.54it/s][A
115it [00:02, 44.68it/s][A
120it [00:02, 44.78it/s][A
125it [00:02, 44.85it/s][A
130it [00:02, 45.01it/s][A
135it [00:03, 45.21it/s][A
140it [00:03, 44.02it/s][A
145it [00:03, 44.73it/s][A
150it [00:03, 44.84it/s][A
155it [00:03, 44.70it/s][A
160it [00:03, 45.00it/s][A
165it [00:03, 44.92it/s][A
170it [00:03, 44.54it/s][A
175it [00:03, 44.42it/s][A
180it [00:04, 44.69it/s][A
185it [00:04, 43.35it/s][A
190it [00:04, 44.07it/s][A
195it [00:04, 43.62it/s][A
200it [00:04, 44.22it/s][A
205it [00:04, 44.35it/s][A

Epoch: 352, Step: 200, Loss: 4.51136864900589



210it [00:04, 44.17it/s][A
215it [00:04, 44.54it/s][A
220it [00:04, 44.83it/s][A
227it [00:05, 44.47it/s]
 70%|███████   | 352/500 [41:57<17:25,  7.06s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.72it/s][A
10it [00:00, 45.48it/s][A
15it [00:00, 45.19it/s][A
20it [00:00, 44.95it/s][A
25it [00:00, 44.93it/s][A
30it [00:00, 45.16it/s][A
35it [00:00, 45.35it/s][A
40it [00:00, 45.22it/s][A
45it [00:00, 45.10it/s][A
50it [00:01, 45.28it/s][A
55it [00:01, 44.65it/s][A
60it [00:01, 45.10it/s][A
65it [00:01, 45.39it/s][A
70it [00:01, 43.57it/s][A
75it [00:01, 43.98it/s][A
80it [00:01, 44.03it/s][A
85it [00:01, 44.60it/s][A
90it [00:02, 44.91it/s][A
95it [00:02, 45.02it/s][A
100it [00:02, 45.15it/s][A
105it [00:02, 45.29it/s][A

Epoch: 353, Step: 100, Loss: 4.496561822891235



110it [00:02, 45.29it/s][A
115it [00:02, 45.26it/s][A
120it [00:02, 43.72it/s][A
125it [00:02, 44.25it/s][A
130it [00:02, 44.43it/s][A
135it [00:03, 44.72it/s][A
140it [00:03, 44.76it/s][A
145it [00:03, 43.87it/s][A
150it [00:03, 44.35it/s][A
155it [00:03, 44.52it/s][A
160it [00:03, 44.51it/s][A
165it [00:03, 44.32it/s][A
170it [00:03, 44.31it/s][A
175it [00:03, 44.09it/s][A
180it [00:04, 43.96it/s][A
185it [00:04, 44.00it/s][A
190it [00:04, 44.12it/s][A
195it [00:04, 43.62it/s][A
200it [00:04, 43.96it/s][A
205it [00:04, 43.88it/s][A

Epoch: 353, Step: 200, Loss: 4.514184405803681



210it [00:04, 43.20it/s][A
215it [00:04, 43.67it/s][A
220it [00:04, 43.83it/s][A
227it [00:05, 44.42it/s]
 71%|███████   | 353/500 [42:02<15:52,  6.48s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.07it/s][A
10it [00:00, 44.50it/s][A
15it [00:00, 43.46it/s][A
20it [00:00, 44.12it/s][A
25it [00:00, 44.79it/s][A
30it [00:00, 44.77it/s][A
35it [00:00, 45.13it/s][A
40it [00:00, 44.13it/s][A
45it [00:01, 44.49it/s][A
50it [00:01, 44.90it/s][A
55it [00:01, 45.21it/s][A
60it [00:01, 45.53it/s][A
65it [00:01, 45.40it/s][A
70it [00:01, 45.46it/s][A
75it [00:01, 45.58it/s][A
80it [00:01, 45.55it/s][A
85it [00:01, 44.78it/s][A
90it [00:02, 45.04it/s][A
95it [00:02, 44.90it/s][A
100it [00:02, 45.13it/s][A
105it [00:02, 45.24it/s][A

Epoch: 354, Step: 100, Loss: 4.503573455810547



110it [00:02, 43.93it/s][A
115it [00:02, 44.40it/s][A
120it [00:02, 44.72it/s][A
125it [00:02, 44.73it/s][A
130it [00:02, 44.87it/s][A
135it [00:03, 45.10it/s][A
140it [00:03, 43.75it/s][A
145it [00:03, 44.57it/s][A
150it [00:03, 44.62it/s][A
155it [00:03, 44.82it/s][A
160it [00:03, 45.06it/s][A
165it [00:03, 45.02it/s][A
170it [00:03, 43.70it/s][A
175it [00:03, 44.19it/s][A
180it [00:04, 43.73it/s][A
185it [00:04, 44.45it/s][A
190it [00:04, 44.71it/s][A
195it [00:04, 45.00it/s][A
200it [00:04, 43.92it/s][A
205it [00:04, 44.57it/s][A

Epoch: 354, Step: 200, Loss: 4.511958105564117



210it [00:04, 44.51it/s][A
215it [00:04, 45.14it/s][A
220it [00:04, 45.39it/s][A
227it [00:05, 44.76it/s]
 71%|███████   | 354/500 [42:07<14:44,  6.06s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.47it/s][A
10it [00:00, 45.66it/s][A
15it [00:00, 44.29it/s][A
20it [00:00, 45.09it/s][A
25it [00:00, 45.52it/s][A
30it [00:00, 45.81it/s][A
35it [00:00, 46.17it/s][A
40it [00:00, 46.23it/s][A
45it [00:00, 44.37it/s][A
50it [00:01, 44.92it/s][A
55it [00:01, 44.92it/s][A
60it [00:01, 45.35it/s][A
65it [00:01, 45.50it/s][A
70it [00:01, 45.57it/s][A
75it [00:01, 45.72it/s][A
80it [00:01, 45.49it/s][A
85it [00:01, 44.29it/s][A
90it [00:01, 44.81it/s][A
95it [00:02, 45.30it/s][A
100it [00:02, 45.64it/s][A
105it [00:02, 45.90it/s][A

Epoch: 355, Step: 100, Loss: 4.503434472084045



110it [00:02, 45.91it/s][A
115it [00:02, 45.71it/s][A
120it [00:02, 45.79it/s][A
125it [00:02, 46.12it/s][A
130it [00:02, 45.88it/s][A
135it [00:02, 45.95it/s][A
140it [00:03, 46.21it/s][A
145it [00:03, 46.48it/s][A
150it [00:03, 46.36it/s][A
155it [00:03, 46.41it/s][A
160it [00:03, 46.30it/s][A
165it [00:03, 46.29it/s][A
170it [00:03, 46.35it/s][A
175it [00:03, 46.09it/s][A
180it [00:03, 45.84it/s][A
185it [00:04, 45.87it/s][A
190it [00:04, 46.28it/s][A
195it [00:04, 46.20it/s][A
200it [00:04, 44.26it/s][A
205it [00:04, 44.58it/s][A

Epoch: 355, Step: 200, Loss: 4.514569861888885



210it [00:04, 44.64it/s][A
215it [00:04, 44.81it/s][A
220it [00:04, 43.91it/s][A
227it [00:04, 45.41it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.12it/s][A
12it [00:00, 58.63it/s][A
18it [00:00, 58.71it/s][A
24it [00:00, 58.78it/s][A
30it [00:00, 58.77it/s][A
36it [00:00, 58.94it/s][A
42it [00:00, 58.85it/s][A
48it [00:00, 58.44it/s][A
54it [00:00, 58.41it/s][A
60it [00:01, 56.03it/s][A
66it [00:01, 56.87it/s][A
72it [00:01, 57.26it/s][A
78it [00:01, 57.68it/s][A
84it [00:01, 57.96it/s][A
90it [00:01, 58.29it/s][A
96it [00:01, 58.48it/s][A
102it [00:01, 58.81it/s][A
108it [00:01, 59.00it/s][A
114it [00:01, 58.60it/s][A
120it [00:02, 58.48it/s][A
126it [00:02, 56.98it/s][A
132it [00:02, 57.56it/s][A
138it [00:02, 58.08it/s][A
144it [00:02, 58.37it/s][A
150it [00:02, 58.53it/s][A
156it [00:02, 58.73it/s][A
162it [00:02, 55.67it/s][A
168it [00:02, 56.71it/s][A
174it [00:03, 57.39it/s][A
180it [00:03, 56.73it/s][A
186it [00:03, 57.36it/s][A
192it [00:03, 57


Epoch: 355, Test Loss: 5.52877950890464, Test Perplexity: 252.8929819379534




0it [00:00, ?it/s][A
5it [00:00, 42.18it/s][A
10it [00:00, 43.80it/s][A
15it [00:00, 44.47it/s][A
20it [00:00, 44.90it/s][A
25it [00:00, 45.03it/s][A
30it [00:00, 44.06it/s][A
35it [00:00, 44.48it/s][A
40it [00:00, 44.27it/s][A
45it [00:01, 44.69it/s][A
50it [00:01, 44.98it/s][A
55it [00:01, 44.20it/s][A
60it [00:01, 44.23it/s][A
65it [00:01, 44.30it/s][A
70it [00:01, 44.73it/s][A
75it [00:01, 44.99it/s][A
80it [00:01, 44.63it/s][A
85it [00:01, 44.55it/s][A
90it [00:02, 44.59it/s][A
95it [00:02, 44.58it/s][A
100it [00:02, 44.16it/s][A
105it [00:02, 43.48it/s][A

Epoch: 356, Step: 100, Loss: 4.496564903259277



110it [00:02, 43.18it/s][A
115it [00:02, 42.67it/s][A
120it [00:02, 43.34it/s][A
125it [00:02, 43.71it/s][A
130it [00:02, 43.97it/s][A
135it [00:03, 44.22it/s][A
140it [00:03, 43.55it/s][A
145it [00:03, 43.86it/s][A
150it [00:03, 44.36it/s][A
155it [00:03, 44.54it/s][A
160it [00:03, 44.83it/s][A
165it [00:03, 44.82it/s][A
170it [00:03, 45.11it/s][A
175it [00:03, 45.22it/s][A
180it [00:04, 45.26it/s][A
185it [00:04, 45.41it/s][A
190it [00:04, 45.60it/s][A
195it [00:04, 45.66it/s][A
200it [00:04, 45.67it/s][A
205it [00:04, 44.41it/s][A

Epoch: 356, Step: 200, Loss: 4.511920590400695



210it [00:04, 44.71it/s][A
215it [00:04, 45.09it/s][A
220it [00:04, 45.06it/s][A
227it [00:05, 44.51it/s]
 71%|███████   | 356/500 [42:28<18:49,  7.85s/it]
0it [00:00, ?it/s][A
4it [00:00, 39.51it/s][A
9it [00:00, 43.60it/s][A
14it [00:00, 44.28it/s][A
19it [00:00, 44.16it/s][A
24it [00:00, 44.62it/s][A
29it [00:00, 44.77it/s][A
34it [00:00, 45.14it/s][A
39it [00:00, 44.93it/s][A
44it [00:00, 45.37it/s][A
49it [00:01, 44.80it/s][A
54it [00:01, 43.34it/s][A
59it [00:01, 43.76it/s][A
64it [00:01, 43.21it/s][A
69it [00:01, 43.91it/s][A
74it [00:01, 43.34it/s][A
79it [00:01, 42.86it/s][A
84it [00:01, 43.15it/s][A
89it [00:02, 43.97it/s][A
94it [00:02, 44.20it/s][A
99it [00:02, 44.61it/s][A
104it [00:02, 44.41it/s][A

Epoch: 357, Step: 100, Loss: 4.498000540733337



109it [00:02, 44.40it/s][A
114it [00:02, 44.61it/s][A
119it [00:02, 44.82it/s][A
124it [00:02, 44.97it/s][A
129it [00:02, 45.21it/s][A
134it [00:03, 45.49it/s][A
139it [00:03, 45.46it/s][A
144it [00:03, 44.30it/s][A
149it [00:03, 44.68it/s][A
154it [00:03, 44.92it/s][A
159it [00:03, 45.26it/s][A
164it [00:03, 45.33it/s][A
169it [00:03, 45.10it/s][A
174it [00:03, 45.07it/s][A
179it [00:04, 45.36it/s][A
184it [00:04, 45.32it/s][A
189it [00:04, 44.17it/s][A
194it [00:04, 44.52it/s][A
199it [00:04, 44.81it/s][A
204it [00:04, 44.89it/s][A
209it [00:04, 45.12it/s][A

Epoch: 357, Step: 200, Loss: 4.512954611778259



214it [00:04, 45.05it/s][A
219it [00:04, 43.71it/s][A
227it [00:05, 44.48it/s]
 71%|███████▏  | 357/500 [42:33<16:44,  7.02s/it]
0it [00:00, ?it/s][A
4it [00:00, 39.43it/s][A
9it [00:00, 42.49it/s][A
14it [00:00, 43.74it/s][A
19it [00:00, 41.42it/s][A
24it [00:00, 41.89it/s][A
29it [00:00, 43.37it/s][A
34it [00:00, 43.79it/s][A
39it [00:00, 44.48it/s][A
44it [00:01, 44.92it/s][A
49it [00:01, 45.36it/s][A
54it [00:01, 45.63it/s][A
59it [00:01, 45.43it/s][A
64it [00:01, 45.55it/s][A
69it [00:01, 45.42it/s][A
74it [00:01, 45.31it/s][A
79it [00:01, 45.31it/s][A
84it [00:01, 43.77it/s][A
89it [00:02, 44.11it/s][A
94it [00:02, 43.41it/s][A
99it [00:02, 43.93it/s][A
104it [00:02, 44.10it/s][A
109it [00:02, 44.38it/s][A

Epoch: 358, Step: 100, Loss: 4.495690288543702



114it [00:02, 43.72it/s][A
119it [00:02, 44.33it/s][A
124it [00:02, 44.81it/s][A
129it [00:02, 44.96it/s][A
134it [00:03, 44.72it/s][A
139it [00:03, 44.23it/s][A
144it [00:03, 43.49it/s][A
149it [00:03, 43.65it/s][A
154it [00:03, 43.98it/s][A
159it [00:03, 44.33it/s][A
164it [00:03, 44.68it/s][A
169it [00:03, 44.57it/s][A
174it [00:03, 44.84it/s][A
179it [00:04, 44.97it/s][A
184it [00:04, 45.24it/s][A
189it [00:04, 44.95it/s][A
194it [00:04, 44.33it/s][A
199it [00:04, 44.33it/s][A
204it [00:04, 44.48it/s][A


Epoch: 358, Step: 200, Loss: 4.511067168712616


209it [00:04, 44.36it/s][A
214it [00:04, 43.94it/s][A
219it [00:04, 44.25it/s][A
227it [00:05, 44.22it/s]
 72%|███████▏  | 358/500 [42:38<15:17,  6.46s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.88it/s][A
10it [00:00, 44.23it/s][A
15it [00:00, 44.87it/s][A
20it [00:00, 44.95it/s][A
25it [00:00, 44.49it/s][A
30it [00:00, 44.82it/s][A
35it [00:00, 44.85it/s][A
40it [00:00, 45.43it/s][A
45it [00:00, 45.82it/s][A
50it [00:01, 44.65it/s][A
55it [00:01, 44.79it/s][A
60it [00:01, 45.09it/s][A
65it [00:01, 44.60it/s][A
70it [00:01, 44.80it/s][A
75it [00:01, 45.00it/s][A
80it [00:01, 45.29it/s][A
85it [00:01, 43.80it/s][A
90it [00:02, 44.63it/s][A
95it [00:02, 45.05it/s][A
100it [00:02, 45.29it/s][A
105it [00:02, 44.10it/s][A

Epoch: 359, Step: 100, Loss: 4.494859652519226



110it [00:02, 43.08it/s][A
115it [00:02, 43.76it/s][A
120it [00:02, 42.83it/s][A
125it [00:02, 43.67it/s][A
130it [00:02, 44.45it/s][A
135it [00:03, 45.14it/s][A
140it [00:03, 43.31it/s][A
145it [00:03, 43.96it/s][A
150it [00:03, 44.40it/s][A
155it [00:03, 44.71it/s][A
160it [00:03, 44.79it/s][A
165it [00:03, 45.21it/s][A
170it [00:03, 45.59it/s][A
175it [00:03, 46.02it/s][A
180it [00:04, 46.28it/s][A
185it [00:04, 46.36it/s][A
190it [00:04, 46.39it/s][A
195it [00:04, 46.41it/s][A
200it [00:04, 46.24it/s][A
205it [00:04, 46.27it/s][A

Epoch: 359, Step: 200, Loss: 4.5110271525383



210it [00:04, 45.84it/s][A
215it [00:04, 45.83it/s][A
220it [00:04, 45.42it/s][A
227it [00:05, 45.02it/s]
 72%|███████▏  | 359/500 [42:43<14:10,  6.03s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.61it/s][A
10it [00:00, 45.47it/s][A
15it [00:00, 45.50it/s][A
20it [00:00, 45.39it/s][A
25it [00:00, 45.38it/s][A
30it [00:00, 45.38it/s][A
35it [00:00, 45.29it/s][A
40it [00:00, 45.11it/s][A
45it [00:00, 45.08it/s][A
50it [00:01, 45.28it/s][A
55it [00:01, 45.41it/s][A
60it [00:01, 45.44it/s][A
65it [00:01, 45.54it/s][A
70it [00:01, 45.42it/s][A
75it [00:01, 45.45it/s][A
80it [00:01, 45.42it/s][A
85it [00:01, 45.45it/s][A
90it [00:01, 45.58it/s][A
95it [00:02, 44.48it/s][A
100it [00:02, 44.84it/s][A
105it [00:02, 45.20it/s][A

Epoch: 360, Step: 100, Loss: 4.499761114120483



110it [00:02, 44.76it/s][A
115it [00:02, 45.03it/s][A
120it [00:02, 43.30it/s][A
125it [00:02, 44.05it/s][A
130it [00:02, 43.38it/s][A
135it [00:03, 44.18it/s][A
140it [00:03, 44.54it/s][A
145it [00:03, 44.64it/s][A
150it [00:03, 43.31it/s][A
155it [00:03, 43.32it/s][A
160it [00:03, 42.83it/s][A
165it [00:03, 43.04it/s][A
170it [00:03, 42.18it/s][A
175it [00:03, 43.28it/s][A
180it [00:04, 42.15it/s][A
185it [00:04, 42.80it/s][A
190it [00:04, 43.15it/s][A
195it [00:04, 43.44it/s][A
200it [00:04, 43.62it/s][A
205it [00:04, 43.89it/s][A

Epoch: 360, Step: 200, Loss: 4.512452828884125



210it [00:04, 43.90it/s][A
215it [00:04, 44.16it/s][A
220it [00:04, 44.74it/s][A
227it [00:05, 44.38it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.50it/s][A
13it [00:00, 59.74it/s][A
19it [00:00, 59.68it/s][A
25it [00:00, 59.61it/s][A
32it [00:00, 59.95it/s][A
38it [00:00, 59.70it/s][A
44it [00:00, 59.74it/s][A
50it [00:00, 59.27it/s][A
57it [00:00, 59.67it/s][A
64it [00:01, 59.94it/s][A
70it [00:01, 58.34it/s][A
77it [00:01, 58.91it/s][A
83it [00:01, 59.18it/s][A
89it [00:01, 59.30it/s][A
95it [00:01, 59.39it/s][A
101it [00:01, 59.20it/s][A
107it [00:01, 58.73it/s][A
113it [00:01, 56.86it/s][A
119it [00:02, 57.49it/s][A
126it [00:02, 58.73it/s][A
133it [00:02, 59.30it/s][A
140it [00:02, 59.65it/s][A
146it [00:02, 59.65it/s][A
153it [00:02, 59.99it/s][A
159it [00:02, 59.81it/s][A
166it [00:02, 60.02it/s][A
172it [00:02, 58.83it/s][A
178it [00:03, 57.83it/s][A
184it [00:03, 58.38it/s][A
190it [00:03, 58.67it/s][A
197it [00:03, 59.27it/s][A
203it [00:03, 5


Epoch: 360, Test Loss: 5.526613244358797, Test Perplexity: 252.28484645393326




0it [00:00, ?it/s][A
5it [00:00, 42.24it/s][A
10it [00:00, 42.70it/s][A
15it [00:00, 42.20it/s][A
20it [00:00, 43.60it/s][A
25it [00:00, 42.78it/s][A
30it [00:00, 43.87it/s][A
35it [00:00, 44.21it/s][A
40it [00:00, 44.69it/s][A
45it [00:01, 42.96it/s][A
50it [00:01, 43.50it/s][A
55it [00:01, 42.67it/s][A
60it [00:01, 43.53it/s][A
65it [00:01, 42.80it/s][A
70it [00:01, 43.70it/s][A
75it [00:01, 44.32it/s][A
80it [00:01, 43.57it/s][A
85it [00:01, 43.92it/s][A
90it [00:02, 44.46it/s][A
95it [00:02, 43.31it/s][A
100it [00:02, 44.04it/s][A
105it [00:02, 43.36it/s][A

Epoch: 361, Step: 100, Loss: 4.494347367286682



110it [00:02, 43.85it/s][A
115it [00:02, 44.21it/s][A
120it [00:02, 44.57it/s][A
125it [00:02, 44.62it/s][A
130it [00:02, 44.08it/s][A
135it [00:03, 43.08it/s][A
140it [00:03, 42.42it/s][A
145it [00:03, 41.54it/s][A
150it [00:03, 42.73it/s][A
155it [00:03, 43.39it/s][A
160it [00:03, 44.24it/s][A
165it [00:03, 44.61it/s][A
170it [00:03, 44.83it/s][A
175it [00:04, 45.07it/s][A
180it [00:04, 44.80it/s][A
185it [00:04, 44.52it/s][A
190it [00:04, 44.33it/s][A
195it [00:04, 44.28it/s][A
200it [00:04, 44.50it/s][A
205it [00:04, 43.44it/s][A

Epoch: 361, Step: 200, Loss: 4.509308340549469



210it [00:04, 43.73it/s][A
215it [00:04, 43.93it/s][A
220it [00:05, 42.66it/s][A
227it [00:05, 43.66it/s]
 72%|███████▏  | 361/500 [43:04<18:14,  7.88s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.23it/s][A
10it [00:00, 44.64it/s][A
15it [00:00, 44.33it/s][A
20it [00:00, 44.92it/s][A
25it [00:00, 45.18it/s][A
30it [00:00, 45.30it/s][A
35it [00:00, 45.24it/s][A
40it [00:00, 43.78it/s][A
45it [00:01, 44.37it/s][A
50it [00:01, 44.83it/s][A
55it [00:01, 44.85it/s][A
60it [00:01, 45.09it/s][A
65it [00:01, 45.32it/s][A
70it [00:01, 45.28it/s][A
75it [00:01, 45.53it/s][A
80it [00:01, 45.43it/s][A
85it [00:01, 45.49it/s][A
90it [00:01, 45.49it/s][A
95it [00:02, 45.52it/s][A
100it [00:02, 43.80it/s][A
105it [00:02, 43.95it/s][A

Epoch: 362, Step: 100, Loss: 4.50640685081482



110it [00:02, 43.96it/s][A
115it [00:02, 44.30it/s][A
120it [00:02, 44.75it/s][A
125it [00:02, 45.04it/s][A
130it [00:02, 44.94it/s][A
135it [00:03, 44.61it/s][A
140it [00:03, 44.71it/s][A
145it [00:03, 45.06it/s][A
150it [00:03, 44.91it/s][A
155it [00:03, 44.83it/s][A
160it [00:03, 45.06it/s][A
165it [00:03, 45.29it/s][A
170it [00:03, 45.44it/s][A
175it [00:03, 45.39it/s][A
180it [00:04, 44.06it/s][A
185it [00:04, 44.44it/s][A
190it [00:04, 44.79it/s][A
195it [00:04, 44.97it/s][A
200it [00:04, 43.72it/s][A
205it [00:04, 44.33it/s][A

Epoch: 362, Step: 200, Loss: 4.510318665504456



210it [00:04, 44.87it/s][A
215it [00:04, 45.23it/s][A
220it [00:04, 45.07it/s][A
227it [00:05, 44.75it/s]
 72%|███████▏  | 362/500 [43:09<16:10,  7.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.45it/s][A
10it [00:00, 44.03it/s][A
15it [00:00, 44.33it/s][A
20it [00:00, 42.88it/s][A
25it [00:00, 44.07it/s][A
30it [00:00, 44.60it/s][A
35it [00:00, 44.70it/s][A
40it [00:00, 44.97it/s][A
45it [00:01, 45.06it/s][A
50it [00:01, 44.84it/s][A
55it [00:01, 44.75it/s][A
60it [00:01, 43.77it/s][A
65it [00:01, 44.69it/s][A
70it [00:01, 45.35it/s][A
75it [00:01, 44.75it/s][A
80it [00:01, 44.60it/s][A
85it [00:01, 43.85it/s][A
90it [00:02, 44.12it/s][A
95it [00:02, 43.02it/s][A
100it [00:02, 43.77it/s][A
105it [00:02, 44.23it/s][A

Epoch: 363, Step: 100, Loss: 4.4909580087661745



110it [00:02, 44.53it/s][A
115it [00:02, 44.76it/s][A
120it [00:02, 45.21it/s][A
125it [00:02, 44.34it/s][A
130it [00:02, 44.88it/s][A
135it [00:03, 42.87it/s][A
140it [00:03, 43.11it/s][A
145it [00:03, 44.14it/s][A
150it [00:03, 44.23it/s][A
155it [00:03, 45.02it/s][A
160it [00:03, 45.51it/s][A
165it [00:03, 45.79it/s][A
170it [00:03, 45.71it/s][A
175it [00:03, 45.82it/s][A
180it [00:04, 45.88it/s][A
185it [00:04, 45.49it/s][A
190it [00:04, 44.71it/s][A
195it [00:04, 45.18it/s][A
200it [00:04, 45.15it/s][A
205it [00:04, 44.20it/s][A

Epoch: 363, Step: 200, Loss: 4.508966150283814



210it [00:04, 43.61it/s][A
215it [00:04, 43.90it/s][A
220it [00:04, 43.86it/s][A
227it [00:05, 44.49it/s]
 73%|███████▎  | 363/500 [43:15<14:44,  6.46s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.62it/s][A
10it [00:00, 45.12it/s][A
15it [00:00, 45.42it/s][A
20it [00:00, 45.95it/s][A
25it [00:00, 46.55it/s][A
30it [00:00, 45.92it/s][A
35it [00:00, 45.35it/s][A
40it [00:00, 45.22it/s][A
45it [00:00, 44.06it/s][A
50it [00:01, 43.04it/s][A
55it [00:01, 43.78it/s][A
60it [00:01, 44.29it/s][A
65it [00:01, 44.50it/s][A
70it [00:01, 44.65it/s][A
75it [00:01, 44.86it/s][A
80it [00:01, 43.50it/s][A
85it [00:01, 43.95it/s][A
90it [00:02, 44.10it/s][A
95it [00:02, 44.26it/s][A
100it [00:02, 44.14it/s][A
105it [00:02, 44.58it/s][A

Epoch: 364, Step: 100, Loss: 4.497816348075867



110it [00:02, 44.54it/s][A
115it [00:02, 45.04it/s][A
120it [00:02, 45.07it/s][A
125it [00:02, 45.20it/s][A
130it [00:02, 44.24it/s][A
135it [00:03, 44.62it/s][A
140it [00:03, 44.63it/s][A
145it [00:03, 45.14it/s][A
150it [00:03, 45.04it/s][A
155it [00:03, 45.25it/s][A
160it [00:03, 45.28it/s][A
165it [00:03, 45.52it/s][A
170it [00:03, 44.68it/s][A
175it [00:03, 44.98it/s][A
180it [00:04, 44.97it/s][A
185it [00:04, 44.68it/s][A
190it [00:04, 44.87it/s][A
195it [00:04, 44.91it/s][A
200it [00:04, 45.10it/s][A
205it [00:04, 45.44it/s][A

Epoch: 364, Step: 200, Loss: 4.5075352263450625



210it [00:04, 45.21it/s][A
215it [00:04, 45.02it/s][A
220it [00:04, 43.49it/s][A
227it [00:05, 44.59it/s]
 73%|███████▎  | 364/500 [43:20<13:42,  6.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.98it/s][A
10it [00:00, 45.12it/s][A
15it [00:00, 44.67it/s][A
20it [00:00, 44.65it/s][A
25it [00:00, 44.94it/s][A
30it [00:00, 43.82it/s][A
35it [00:00, 44.58it/s][A
40it [00:00, 45.09it/s][A
45it [00:01, 45.36it/s][A
50it [00:01, 45.34it/s][A
55it [00:01, 45.02it/s][A
60it [00:01, 45.26it/s][A
65it [00:01, 45.03it/s][A
70it [00:01, 45.19it/s][A
75it [00:01, 45.31it/s][A
80it [00:01, 45.44it/s][A
85it [00:01, 45.54it/s][A
90it [00:01, 45.57it/s][A
95it [00:02, 45.36it/s][A
100it [00:02, 45.39it/s][A
105it [00:02, 45.40it/s][A

Epoch: 365, Step: 100, Loss: 4.4905160665512085



110it [00:02, 44.97it/s][A
115it [00:02, 45.06it/s][A
120it [00:02, 45.10it/s][A
125it [00:02, 45.19it/s][A
130it [00:02, 45.37it/s][A
135it [00:02, 44.51it/s][A
140it [00:03, 44.87it/s][A
145it [00:03, 44.95it/s][A
150it [00:03, 45.08it/s][A
155it [00:03, 45.22it/s][A
160it [00:03, 45.43it/s][A
165it [00:03, 45.50it/s][A
170it [00:03, 45.38it/s][A
175it [00:03, 45.52it/s][A
180it [00:03, 45.58it/s][A
185it [00:04, 45.56it/s][A
190it [00:04, 45.61it/s][A
195it [00:04, 45.64it/s][A
200it [00:04, 45.60it/s][A
205it [00:04, 45.59it/s][A

Epoch: 365, Step: 200, Loss: 4.5080919313430785



210it [00:04, 45.46it/s][A
215it [00:04, 45.57it/s][A
220it [00:04, 45.44it/s][A
227it [00:05, 45.12it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.39it/s][A
13it [00:00, 60.05it/s][A
20it [00:00, 60.27it/s][A
27it [00:00, 60.34it/s][A
34it [00:00, 59.98it/s][A
40it [00:00, 59.80it/s][A
47it [00:00, 59.99it/s][A
53it [00:00, 59.34it/s][A
59it [00:00, 59.29it/s][A
65it [00:01, 57.41it/s][A
71it [00:01, 58.13it/s][A
77it [00:01, 58.67it/s][A
83it [00:01, 58.91it/s][A
89it [00:01, 59.07it/s][A
95it [00:01, 59.11it/s][A
101it [00:01, 59.20it/s][A
107it [00:01, 57.83it/s][A
113it [00:01, 57.78it/s][A
119it [00:02, 58.27it/s][A
125it [00:02, 58.65it/s][A
132it [00:02, 59.42it/s][A
138it [00:02, 59.11it/s][A
145it [00:02, 59.61it/s][A
151it [00:02, 59.48it/s][A
157it [00:02, 59.41it/s][A
164it [00:02, 59.80it/s][A
171it [00:02, 60.09it/s][A
178it [00:03, 60.16it/s][A
185it [00:03, 60.11it/s][A
192it [00:03, 58.19it/s][A
198it [00:03, 57.37it/s][A
204it [00:03, 5


Epoch: 365, Test Loss: 5.530613775579085, Test Perplexity: 253.20354281597255




0it [00:00, ?it/s][A
5it [00:00, 41.26it/s][A
10it [00:00, 43.43it/s][A
15it [00:00, 44.33it/s][A
20it [00:00, 44.69it/s][A
25it [00:00, 44.73it/s][A
30it [00:00, 42.46it/s][A
35it [00:00, 42.87it/s][A
40it [00:00, 42.31it/s][A
45it [00:01, 42.86it/s][A
50it [00:01, 43.71it/s][A
55it [00:01, 43.98it/s][A
60it [00:01, 44.31it/s][A
65it [00:01, 44.37it/s][A
70it [00:01, 44.60it/s][A
75it [00:01, 44.21it/s][A
80it [00:01, 44.62it/s][A
85it [00:01, 44.58it/s][A
90it [00:02, 44.80it/s][A
95it [00:02, 44.77it/s][A
100it [00:02, 44.76it/s][A
105it [00:02, 43.34it/s][A

Epoch: 366, Step: 100, Loss: 4.488124804496765



110it [00:02, 43.12it/s][A
115it [00:02, 43.62it/s][A
120it [00:02, 43.82it/s][A
125it [00:02, 44.20it/s][A
130it [00:02, 44.60it/s][A
135it [00:03, 44.95it/s][A
140it [00:03, 45.16it/s][A
145it [00:03, 45.06it/s][A
150it [00:03, 45.35it/s][A
155it [00:03, 45.23it/s][A
160it [00:03, 45.22it/s][A
165it [00:03, 45.03it/s][A
170it [00:03, 45.15it/s][A
175it [00:03, 45.10it/s][A
180it [00:04, 44.82it/s][A
185it [00:04, 44.95it/s][A
190it [00:04, 45.04it/s][A
195it [00:04, 45.23it/s][A
200it [00:04, 43.24it/s][A
205it [00:04, 43.82it/s][A

Epoch: 366, Step: 200, Loss: 4.509239773750306



210it [00:04, 43.94it/s][A
215it [00:04, 44.41it/s][A
220it [00:04, 44.55it/s][A
227it [00:05, 44.31it/s]
 73%|███████▎  | 366/500 [43:41<17:32,  7.86s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.38it/s][A
10it [00:00, 45.93it/s][A
15it [00:00, 45.70it/s][A
20it [00:00, 45.89it/s][A
25it [00:00, 45.79it/s][A
30it [00:00, 45.33it/s][A
35it [00:00, 45.25it/s][A
40it [00:00, 45.24it/s][A
45it [00:00, 44.89it/s][A
50it [00:01, 43.71it/s][A
55it [00:01, 43.94it/s][A
60it [00:01, 44.42it/s][A
65it [00:01, 44.79it/s][A
70it [00:01, 42.73it/s][A
75it [00:01, 43.63it/s][A
80it [00:01, 44.28it/s][A
85it [00:01, 44.76it/s][A
90it [00:02, 45.12it/s][A
95it [00:02, 45.35it/s][A
100it [00:02, 45.66it/s][A
105it [00:02, 44.19it/s][A

Epoch: 367, Step: 100, Loss: 4.493236327171326



110it [00:02, 44.55it/s][A
115it [00:02, 45.07it/s][A
120it [00:02, 45.26it/s][A
125it [00:02, 45.48it/s][A
130it [00:02, 44.89it/s][A
135it [00:03, 44.40it/s][A
140it [00:03, 44.74it/s][A
145it [00:03, 45.06it/s][A
150it [00:03, 45.11it/s][A
155it [00:03, 45.51it/s][A
160it [00:03, 45.22it/s][A
165it [00:03, 44.88it/s][A
170it [00:03, 45.12it/s][A
175it [00:03, 45.21it/s][A
180it [00:04, 45.23it/s][A
185it [00:04, 45.18it/s][A
190it [00:04, 45.17it/s][A
195it [00:04, 44.75it/s][A
200it [00:04, 45.28it/s][A
205it [00:04, 45.67it/s][A

Epoch: 367, Step: 200, Loss: 4.507664663791656



210it [00:04, 45.60it/s][A
215it [00:04, 44.76it/s][A
220it [00:04, 45.40it/s][A
227it [00:05, 44.96it/s]
 73%|███████▎  | 367/500 [43:46<15:33,  7.02s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.42it/s][A
10it [00:00, 46.31it/s][A
15it [00:00, 46.36it/s][A
20it [00:00, 45.52it/s][A
25it [00:00, 45.94it/s][A
30it [00:00, 44.29it/s][A
35it [00:00, 45.17it/s][A
40it [00:00, 45.25it/s][A
45it [00:00, 45.04it/s][A
50it [00:01, 45.33it/s][A
55it [00:01, 46.11it/s][A
60it [00:01, 46.56it/s][A
65it [00:01, 46.56it/s][A
70it [00:01, 46.30it/s][A
75it [00:01, 46.07it/s][A
80it [00:01, 45.83it/s][A
85it [00:01, 45.62it/s][A
90it [00:01, 45.41it/s][A
95it [00:02, 45.37it/s][A
100it [00:02, 45.42it/s][A
105it [00:02, 45.53it/s][A

Epoch: 368, Step: 100, Loss: 4.480028338432312



110it [00:02, 45.26it/s][A
115it [00:02, 45.35it/s][A
120it [00:02, 44.55it/s][A
125it [00:02, 44.96it/s][A
130it [00:02, 44.95it/s][A
135it [00:02, 45.00it/s][A
140it [00:03, 45.14it/s][A
145it [00:03, 44.17it/s][A
150it [00:03, 43.14it/s][A
155it [00:03, 43.60it/s][A
160it [00:03, 43.79it/s][A
165it [00:03, 44.14it/s][A
170it [00:03, 44.51it/s][A
175it [00:03, 44.44it/s][A
180it [00:03, 44.70it/s][A
185it [00:04, 44.98it/s][A
190it [00:04, 45.14it/s][A
195it [00:04, 45.26it/s][A
200it [00:04, 45.01it/s][A
205it [00:04, 44.99it/s][A

Epoch: 368, Step: 200, Loss: 4.507594573497772



210it [00:04, 44.82it/s][A
215it [00:04, 44.18it/s][A
220it [00:04, 44.29it/s][A
227it [00:05, 44.86it/s]
 74%|███████▎  | 368/500 [43:51<14:08,  6.43s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.40it/s][A
10it [00:00, 45.13it/s][A
15it [00:00, 45.05it/s][A
20it [00:00, 45.11it/s][A
25it [00:00, 45.26it/s][A
30it [00:00, 44.45it/s][A
35it [00:00, 44.56it/s][A
40it [00:00, 44.96it/s][A
45it [00:00, 45.24it/s][A
50it [00:01, 44.45it/s][A
55it [00:01, 44.63it/s][A
60it [00:01, 45.00it/s][A
65it [00:01, 45.09it/s][A
70it [00:01, 45.20it/s][A
75it [00:01, 44.75it/s][A
80it [00:01, 44.66it/s][A
85it [00:01, 44.97it/s][A
90it [00:02, 45.08it/s][A
95it [00:02, 45.17it/s][A
100it [00:02, 45.57it/s][A
105it [00:02, 45.57it/s][A

Epoch: 369, Step: 100, Loss: 4.487192826271057



110it [00:02, 45.10it/s][A
115it [00:02, 44.54it/s][A
120it [00:02, 44.85it/s][A
125it [00:02, 44.65it/s][A
130it [00:02, 44.84it/s][A
135it [00:03, 45.06it/s][A
140it [00:03, 45.31it/s][A
145it [00:03, 45.61it/s][A
150it [00:03, 45.28it/s][A
155it [00:03, 44.73it/s][A
160it [00:03, 44.89it/s][A
165it [00:03, 44.86it/s][A
170it [00:03, 44.95it/s][A
175it [00:03, 44.69it/s][A
180it [00:04, 44.69it/s][A
185it [00:04, 44.84it/s][A
190it [00:04, 44.70it/s][A
195it [00:04, 44.21it/s][A
200it [00:04, 44.34it/s][A
205it [00:04, 44.47it/s][A

Epoch: 369, Step: 200, Loss: 4.508255746364593



210it [00:04, 42.71it/s][A
215it [00:04, 43.20it/s][A
220it [00:04, 43.27it/s][A
227it [00:05, 44.66it/s]
 74%|███████▍  | 369/500 [43:56<13:09,  6.03s/it]
0it [00:00, ?it/s][A
4it [00:00, 38.50it/s][A
9it [00:00, 41.47it/s][A
14it [00:00, 42.41it/s][A
19it [00:00, 42.87it/s][A
24it [00:00, 42.18it/s][A
29it [00:00, 43.19it/s][A
34it [00:00, 43.88it/s][A
39it [00:00, 44.48it/s][A
44it [00:01, 44.87it/s][A
49it [00:01, 45.01it/s][A
54it [00:01, 45.30it/s][A
59it [00:01, 45.58it/s][A
64it [00:01, 44.43it/s][A
69it [00:01, 44.56it/s][A
74it [00:01, 44.67it/s][A
79it [00:01, 45.04it/s][A
84it [00:01, 44.86it/s][A
89it [00:02, 45.21it/s][A
94it [00:02, 45.46it/s][A
99it [00:02, 45.32it/s][A
104it [00:02, 45.31it/s][A
109it [00:02, 45.48it/s][A

Epoch: 370, Step: 100, Loss: 4.489448356628418



114it [00:02, 45.43it/s][A
119it [00:02, 45.42it/s][A
124it [00:02, 45.38it/s][A
129it [00:02, 44.29it/s][A
134it [00:03, 43.48it/s][A
139it [00:03, 44.35it/s][A
144it [00:03, 44.62it/s][A
149it [00:03, 44.65it/s][A
154it [00:03, 44.82it/s][A
159it [00:03, 45.04it/s][A
164it [00:03, 45.30it/s][A
169it [00:03, 45.45it/s][A
174it [00:03, 45.63it/s][A
179it [00:04, 44.68it/s][A
184it [00:04, 43.76it/s][A
189it [00:04, 44.21it/s][A
194it [00:04, 44.30it/s][A
199it [00:04, 44.49it/s][A
204it [00:04, 44.27it/s][A
209it [00:04, 44.54it/s][A

Epoch: 370, Step: 200, Loss: 4.505172221660614



214it [00:04, 42.96it/s][A
219it [00:04, 43.62it/s][A
227it [00:05, 44.38it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.64it/s][A
12it [00:00, 59.33it/s][A
18it [00:00, 59.13it/s][A
24it [00:00, 59.40it/s][A
31it [00:00, 59.91it/s][A
37it [00:00, 59.89it/s][A
44it [00:00, 60.05it/s][A
51it [00:00, 60.30it/s][A
58it [00:00, 60.51it/s][A
65it [00:01, 57.94it/s][A
71it [00:01, 57.06it/s][A
77it [00:01, 57.47it/s][A
83it [00:01, 58.13it/s][A
89it [00:01, 58.60it/s][A
95it [00:01, 58.76it/s][A
102it [00:01, 59.32it/s][A
108it [00:01, 59.21it/s][A
114it [00:01, 59.22it/s][A
120it [00:02, 57.73it/s][A
127it [00:02, 58.90it/s][A
134it [00:02, 59.48it/s][A
141it [00:02, 59.95it/s][A
147it [00:02, 59.83it/s][A
154it [00:02, 60.19it/s][A
161it [00:02, 60.35it/s][A
168it [00:02, 60.45it/s][A
175it [00:02, 59.12it/s][A
182it [00:03, 59.72it/s][A
189it [00:03, 59.92it/s][A
195it [00:03, 59.77it/s][A
202it [00:03, 60.10it/s][A
209it [00:03, 57.91it/s][A
216it [00:03, 5


Epoch: 370, Test Loss: 5.537168764919969, Test Perplexity: 255.04802668316765




0it [00:00, ?it/s][A
5it [00:00, 42.87it/s][A
10it [00:00, 44.10it/s][A
15it [00:00, 42.14it/s][A
20it [00:00, 42.70it/s][A
25it [00:00, 42.28it/s][A
30it [00:00, 43.13it/s][A
35it [00:00, 43.68it/s][A
40it [00:00, 43.93it/s][A
45it [00:01, 44.31it/s][A
50it [00:01, 44.66it/s][A
55it [00:01, 44.91it/s][A
60it [00:01, 45.10it/s][A
65it [00:01, 45.04it/s][A
70it [00:01, 44.96it/s][A
75it [00:01, 44.99it/s][A
80it [00:01, 45.19it/s][A
85it [00:01, 45.16it/s][A
90it [00:02, 45.25it/s][A
95it [00:02, 45.24it/s][A
100it [00:02, 45.42it/s][A
105it [00:02, 45.32it/s][A

Epoch: 371, Step: 100, Loss: 4.491168098449707



110it [00:02, 45.29it/s][A
115it [00:02, 45.28it/s][A
120it [00:02, 45.45it/s][A
125it [00:02, 45.53it/s][A
130it [00:02, 45.85it/s][A
135it [00:03, 45.30it/s][A
140it [00:03, 45.80it/s][A
145it [00:03, 45.95it/s][A
150it [00:03, 45.84it/s][A
155it [00:03, 45.98it/s][A
160it [00:03, 46.29it/s][A
165it [00:03, 46.24it/s][A
170it [00:03, 46.31it/s][A
175it [00:03, 46.29it/s][A
180it [00:03, 46.14it/s][A
185it [00:04, 46.44it/s][A
190it [00:04, 46.14it/s][A
195it [00:04, 46.02it/s][A
200it [00:04, 45.90it/s][A
205it [00:04, 46.18it/s][A

Epoch: 371, Step: 200, Loss: 4.506401407718658



210it [00:04, 44.86it/s][A
215it [00:04, 45.16it/s][A
220it [00:04, 45.43it/s][A
227it [00:05, 45.19it/s]
 74%|███████▍  | 371/500 [44:17<16:50,  7.84s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.47it/s][A
10it [00:00, 45.76it/s][A
15it [00:00, 45.13it/s][A
20it [00:00, 44.00it/s][A
25it [00:00, 43.24it/s][A
30it [00:00, 43.91it/s][A
35it [00:00, 43.81it/s][A
40it [00:00, 44.69it/s][A
45it [00:01, 44.97it/s][A
50it [00:01, 44.56it/s][A
55it [00:01, 43.59it/s][A
60it [00:01, 43.41it/s][A
65it [00:01, 44.34it/s][A
70it [00:01, 44.68it/s][A
75it [00:01, 43.42it/s][A
80it [00:01, 44.58it/s][A
85it [00:01, 43.68it/s][A
90it [00:02, 44.05it/s][A
95it [00:02, 43.69it/s][A
100it [00:02, 43.95it/s][A
105it [00:02, 43.32it/s][A

Epoch: 372, Step: 100, Loss: 4.493436245918274



110it [00:02, 42.93it/s][A
115it [00:02, 43.63it/s][A
120it [00:02, 42.80it/s][A
125it [00:02, 43.47it/s][A
130it [00:02, 43.96it/s][A
135it [00:03, 44.34it/s][A
140it [00:03, 44.66it/s][A
145it [00:03, 44.93it/s][A
150it [00:03, 43.56it/s][A
155it [00:03, 44.05it/s][A
160it [00:03, 44.29it/s][A
165it [00:03, 44.64it/s][A
170it [00:03, 44.84it/s][A
175it [00:03, 44.94it/s][A
180it [00:04, 44.93it/s][A
185it [00:04, 45.15it/s][A
190it [00:04, 45.23it/s][A
195it [00:04, 44.97it/s][A
200it [00:04, 44.57it/s][A
205it [00:04, 44.81it/s][A

Epoch: 372, Step: 200, Loss: 4.504803986549377



210it [00:04, 44.91it/s][A
215it [00:04, 44.99it/s][A
220it [00:04, 45.15it/s][A
227it [00:05, 44.27it/s]
 74%|███████▍  | 372/500 [44:22<14:59,  7.02s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.06it/s][A
10it [00:00, 44.85it/s][A
15it [00:00, 45.09it/s][A
20it [00:00, 45.18it/s][A
25it [00:00, 44.88it/s][A
30it [00:00, 44.78it/s][A
35it [00:00, 44.24it/s][A
40it [00:00, 44.13it/s][A
45it [00:01, 42.92it/s][A
50it [00:01, 43.56it/s][A
55it [00:01, 43.95it/s][A
60it [00:01, 44.36it/s][A
65it [00:01, 44.78it/s][A
70it [00:01, 45.03it/s][A
75it [00:01, 45.03it/s][A
80it [00:01, 45.28it/s][A
85it [00:01, 44.94it/s][A
90it [00:02, 45.11it/s][A
95it [00:02, 44.97it/s][A
100it [00:02, 45.06it/s][A
105it [00:02, 45.21it/s][A

Epoch: 373, Step: 100, Loss: 4.488582911491394



110it [00:02, 44.70it/s][A
115it [00:02, 45.04it/s][A
120it [00:02, 45.28it/s][A
125it [00:02, 45.00it/s][A
130it [00:02, 45.25it/s][A
135it [00:03, 44.59it/s][A
140it [00:03, 44.66it/s][A
145it [00:03, 44.92it/s][A
150it [00:03, 43.74it/s][A
155it [00:03, 44.19it/s][A
160it [00:03, 44.20it/s][A
165it [00:03, 44.42it/s][A
170it [00:03, 44.50it/s][A
175it [00:03, 43.29it/s][A
180it [00:04, 44.02it/s][A
185it [00:04, 44.53it/s][A
190it [00:04, 44.79it/s][A
195it [00:04, 44.98it/s][A
200it [00:04, 45.06it/s][A
205it [00:04, 45.18it/s][A

Epoch: 373, Step: 200, Loss: 4.506033501625061



210it [00:04, 44.65it/s][A
215it [00:04, 45.01it/s][A
220it [00:04, 45.22it/s][A
227it [00:05, 44.66it/s]
 75%|███████▍  | 373/500 [44:27<13:38,  6.44s/it]
0it [00:00, ?it/s][A
5it [00:00, 41.86it/s][A
10it [00:00, 43.86it/s][A
15it [00:00, 44.03it/s][A
20it [00:00, 44.67it/s][A
25it [00:00, 45.21it/s][A
30it [00:00, 43.37it/s][A
35it [00:00, 44.05it/s][A
40it [00:00, 44.65it/s][A
45it [00:01, 44.98it/s][A
50it [00:01, 44.75it/s][A
55it [00:01, 45.09it/s][A
60it [00:01, 45.24it/s][A
65it [00:01, 44.91it/s][A
70it [00:01, 45.23it/s][A
75it [00:01, 45.37it/s][A
80it [00:01, 45.15it/s][A
85it [00:01, 45.47it/s][A
90it [00:02, 44.53it/s][A
95it [00:02, 44.92it/s][A
100it [00:02, 45.13it/s][A
105it [00:02, 44.52it/s][A

Epoch: 374, Step: 100, Loss: 4.494222030639649



110it [00:02, 43.43it/s][A
115it [00:02, 44.01it/s][A
120it [00:02, 44.51it/s][A
125it [00:02, 44.91it/s][A
130it [00:02, 44.79it/s][A
135it [00:03, 45.13it/s][A
140it [00:03, 43.99it/s][A
145it [00:03, 44.12it/s][A
150it [00:03, 43.84it/s][A
155it [00:03, 43.48it/s][A
160it [00:03, 42.36it/s][A
165it [00:03, 43.07it/s][A
170it [00:03, 43.97it/s][A
175it [00:03, 44.03it/s][A
180it [00:04, 43.89it/s][A
185it [00:04, 42.92it/s][A
190it [00:04, 43.75it/s][A
195it [00:04, 44.39it/s][A
200it [00:04, 44.61it/s][A
205it [00:04, 43.23it/s][A

Epoch: 374, Step: 200, Loss: 4.504631910324097



210it [00:04, 42.86it/s][A
215it [00:04, 43.43it/s][A
220it [00:04, 44.22it/s][A
227it [00:05, 44.27it/s]
 75%|███████▍  | 374/500 [44:32<12:42,  6.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.14it/s][A
10it [00:00, 45.00it/s][A
15it [00:00, 42.37it/s][A
20it [00:00, 43.11it/s][A
25it [00:00, 43.57it/s][A
30it [00:00, 43.97it/s][A
35it [00:00, 44.06it/s][A
40it [00:00, 44.39it/s][A
45it [00:01, 44.67it/s][A
50it [00:01, 43.74it/s][A
55it [00:01, 43.81it/s][A
60it [00:01, 43.33it/s][A
65it [00:01, 42.56it/s][A
70it [00:01, 43.48it/s][A
75it [00:01, 43.82it/s][A
80it [00:01, 43.02it/s][A
85it [00:01, 43.90it/s][A
90it [00:02, 44.14it/s][A
95it [00:02, 44.58it/s][A
100it [00:02, 44.73it/s][A
105it [00:02, 44.66it/s][A

Epoch: 375, Step: 100, Loss: 4.491334233283997



110it [00:02, 44.42it/s][A
115it [00:02, 42.77it/s][A
120it [00:02, 43.73it/s][A
125it [00:02, 44.06it/s][A
130it [00:02, 43.43it/s][A
135it [00:03, 44.09it/s][A
140it [00:03, 44.63it/s][A
145it [00:03, 45.06it/s][A
150it [00:03, 45.33it/s][A
155it [00:03, 44.35it/s][A
160it [00:03, 44.96it/s][A
165it [00:03, 45.23it/s][A
170it [00:03, 45.35it/s][A
175it [00:03, 45.08it/s][A
180it [00:04, 45.40it/s][A
185it [00:04, 45.53it/s][A
190it [00:04, 45.64it/s][A
195it [00:04, 45.64it/s][A
200it [00:04, 45.55it/s][A
205it [00:04, 44.32it/s][A

Epoch: 375, Step: 200, Loss: 4.503873450756073



210it [00:04, 44.63it/s][A
215it [00:04, 45.05it/s][A
220it [00:04, 45.03it/s][A
227it [00:05, 44.37it/s]

0it [00:00, ?it/s][A
6it [00:00, 55.06it/s][A
12it [00:00, 56.67it/s][A
18it [00:00, 57.68it/s][A
24it [00:00, 58.55it/s][A
31it [00:00, 59.28it/s][A
38it [00:00, 59.62it/s][A
45it [00:00, 59.90it/s][A
52it [00:00, 60.10it/s][A
59it [00:00, 60.18it/s][A
66it [00:01, 60.05it/s][A
73it [00:01, 59.99it/s][A
79it [00:01, 59.84it/s][A
85it [00:01, 59.73it/s][A
91it [00:01, 56.80it/s][A
97it [00:01, 57.48it/s][A
104it [00:01, 58.42it/s][A
110it [00:01, 58.32it/s][A
117it [00:01, 58.94it/s][A
123it [00:02, 59.20it/s][A
130it [00:02, 59.59it/s][A
137it [00:02, 59.99it/s][A
143it [00:02, 59.78it/s][A
150it [00:02, 60.01it/s][A
156it [00:02, 57.50it/s][A
162it [00:02, 58.08it/s][A
169it [00:02, 58.76it/s][A
176it [00:02, 59.45it/s][A
182it [00:03, 57.46it/s][A
189it [00:03, 58.53it/s][A
196it [00:03, 59.10it/s][A
202it [00:03, 58.28it/s][A
208it [00:03, 5


Epoch: 375, Test Loss: 5.535205886230706, Test Perplexity: 254.50320932139522




0it [00:00, ?it/s][A
5it [00:00, 46.03it/s][A
10it [00:00, 45.35it/s][A
15it [00:00, 45.89it/s][A
20it [00:00, 46.09it/s][A
25it [00:00, 45.92it/s][A
30it [00:00, 45.55it/s][A
35it [00:00, 45.13it/s][A
40it [00:00, 45.40it/s][A
45it [00:00, 45.59it/s][A
50it [00:01, 45.67it/s][A
55it [00:01, 45.56it/s][A
60it [00:01, 45.29it/s][A
65it [00:01, 45.24it/s][A
70it [00:01, 45.64it/s][A
75it [00:01, 43.55it/s][A
80it [00:01, 41.98it/s][A
85it [00:01, 42.72it/s][A
90it [00:02, 43.43it/s][A
95it [00:02, 42.39it/s][A
100it [00:02, 42.96it/s][A
105it [00:02, 43.96it/s][A

Epoch: 376, Step: 100, Loss: 4.49174955368042



110it [00:02, 44.02it/s][A
115it [00:02, 44.74it/s][A
120it [00:02, 45.13it/s][A
125it [00:02, 44.79it/s][A
130it [00:02, 44.95it/s][A
135it [00:03, 44.80it/s][A
140it [00:03, 44.84it/s][A
145it [00:03, 44.73it/s][A
150it [00:03, 45.15it/s][A
155it [00:03, 44.99it/s][A
160it [00:03, 44.51it/s][A
165it [00:03, 45.01it/s][A
170it [00:03, 44.94it/s][A
175it [00:03, 44.98it/s][A
180it [00:04, 44.40it/s][A
185it [00:04, 44.44it/s][A
190it [00:04, 43.73it/s][A
195it [00:04, 44.21it/s][A
200it [00:04, 44.63it/s][A
205it [00:04, 44.68it/s][A

Epoch: 376, Step: 200, Loss: 4.502137939929963



210it [00:04, 44.99it/s][A
215it [00:04, 45.28it/s][A
220it [00:04, 43.65it/s][A
227it [00:05, 44.54it/s]
 75%|███████▌  | 376/500 [44:54<16:14,  7.86s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.19it/s][A
10it [00:00, 42.02it/s][A
15it [00:00, 42.35it/s][A
20it [00:00, 43.43it/s][A
25it [00:00, 44.03it/s][A
30it [00:00, 44.63it/s][A
35it [00:00, 44.72it/s][A
40it [00:00, 44.78it/s][A
45it [00:01, 44.92it/s][A
50it [00:01, 45.17it/s][A
55it [00:01, 45.11it/s][A
60it [00:01, 45.35it/s][A
65it [00:01, 45.54it/s][A
70it [00:01, 45.62it/s][A
75it [00:01, 44.29it/s][A
80it [00:01, 44.75it/s][A
85it [00:01, 44.90it/s][A
90it [00:02, 45.15it/s][A
95it [00:02, 45.20it/s][A
100it [00:02, 45.27it/s][A
105it [00:02, 44.95it/s][A

Epoch: 377, Step: 100, Loss: 4.4880074644088745



110it [00:02, 45.00it/s][A
115it [00:02, 45.12it/s][A
120it [00:02, 45.06it/s][A
125it [00:02, 43.85it/s][A
130it [00:02, 44.04it/s][A
135it [00:03, 44.49it/s][A
140it [00:03, 44.32it/s][A
145it [00:03, 44.59it/s][A
150it [00:03, 44.88it/s][A
155it [00:03, 45.04it/s][A
160it [00:03, 45.26it/s][A
165it [00:03, 43.44it/s][A
170it [00:03, 44.11it/s][A
175it [00:03, 44.40it/s][A
180it [00:04, 43.59it/s][A
185it [00:04, 44.32it/s][A
190it [00:04, 44.84it/s][A
195it [00:04, 43.48it/s][A
200it [00:04, 44.13it/s][A
205it [00:04, 44.48it/s][A

Epoch: 377, Step: 200, Loss: 4.50324821472168



210it [00:04, 44.60it/s][A
215it [00:04, 44.22it/s][A
220it [00:04, 44.36it/s][A
227it [00:05, 44.53it/s]
 75%|███████▌  | 377/500 [44:59<14:25,  7.03s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.10it/s][A
10it [00:00, 43.76it/s][A
15it [00:00, 44.81it/s][A
20it [00:00, 45.19it/s][A
25it [00:00, 44.03it/s][A
30it [00:00, 42.82it/s][A
35it [00:00, 43.67it/s][A
40it [00:00, 41.95it/s][A
45it [00:01, 43.35it/s][A
50it [00:01, 42.34it/s][A
55it [00:01, 43.36it/s][A
60it [00:01, 43.89it/s][A
65it [00:01, 44.71it/s][A
70it [00:01, 44.99it/s][A
75it [00:01, 45.11it/s][A
80it [00:01, 45.04it/s][A
85it [00:01, 45.26it/s][A
90it [00:02, 44.09it/s][A
95it [00:02, 44.57it/s][A
100it [00:02, 44.85it/s][A
105it [00:02, 45.06it/s][A

Epoch: 378, Step: 100, Loss: 4.498486104011536



110it [00:02, 44.95it/s][A
115it [00:02, 45.03it/s][A
120it [00:02, 44.95it/s][A
125it [00:02, 45.16it/s][A
130it [00:02, 45.38it/s][A
135it [00:03, 44.40it/s][A
140it [00:03, 44.48it/s][A
145it [00:03, 44.17it/s][A
150it [00:03, 44.41it/s][A
155it [00:03, 44.70it/s][A
160it [00:03, 45.09it/s][A
165it [00:03, 43.48it/s][A
170it [00:03, 44.35it/s][A
175it [00:03, 44.69it/s][A
180it [00:04, 44.84it/s][A
185it [00:04, 44.88it/s][A
190it [00:04, 44.59it/s][A
195it [00:04, 44.70it/s][A
200it [00:04, 44.83it/s][A
205it [00:04, 43.71it/s][A

Epoch: 378, Step: 200, Loss: 4.504779314994812



210it [00:04, 44.52it/s][A
215it [00:04, 44.74it/s][A
220it [00:04, 44.91it/s][A
227it [00:05, 44.43it/s]
 76%|███████▌  | 378/500 [45:04<13:07,  6.46s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.53it/s][A
10it [00:00, 44.74it/s][A
15it [00:00, 44.94it/s][A
20it [00:00, 44.81it/s][A
25it [00:00, 45.31it/s][A
30it [00:00, 45.29it/s][A
35it [00:00, 44.95it/s][A
40it [00:00, 45.26it/s][A
45it [00:00, 45.20it/s][A
50it [00:01, 45.20it/s][A
55it [00:01, 43.91it/s][A
60it [00:01, 44.03it/s][A
65it [00:01, 43.84it/s][A
70it [00:01, 43.71it/s][A
75it [00:01, 42.96it/s][A
80it [00:01, 43.88it/s][A
85it [00:01, 44.37it/s][A
90it [00:02, 44.77it/s][A
95it [00:02, 44.75it/s][A
100it [00:02, 45.13it/s][A
105it [00:02, 45.08it/s][A

Epoch: 379, Step: 100, Loss: 4.490567350387574



110it [00:02, 45.16it/s][A
115it [00:02, 45.32it/s][A
120it [00:02, 45.38it/s][A
125it [00:02, 45.43it/s][A
130it [00:02, 43.77it/s][A
135it [00:03, 43.92it/s][A
140it [00:03, 44.28it/s][A
145it [00:03, 44.60it/s][A
150it [00:03, 44.91it/s][A
155it [00:03, 45.10it/s][A
160it [00:03, 45.07it/s][A
165it [00:03, 45.28it/s][A
170it [00:03, 45.39it/s][A
175it [00:03, 45.21it/s][A
180it [00:04, 44.90it/s][A
185it [00:04, 44.65it/s][A
190it [00:04, 44.52it/s][A
195it [00:04, 44.67it/s][A
200it [00:04, 43.88it/s][A
205it [00:04, 44.27it/s][A

Epoch: 379, Step: 200, Loss: 4.503301510810852



210it [00:04, 44.18it/s][A
215it [00:04, 44.41it/s][A
220it [00:04, 44.83it/s][A
227it [00:05, 44.58it/s]
 76%|███████▌  | 379/500 [45:09<12:11,  6.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.76it/s][A
10it [00:00, 45.20it/s][A
15it [00:00, 45.11it/s][A
20it [00:00, 45.46it/s][A
25it [00:00, 45.61it/s][A
30it [00:00, 43.89it/s][A
35it [00:00, 44.40it/s][A
40it [00:00, 43.11it/s][A
45it [00:01, 43.15it/s][A
50it [00:01, 42.91it/s][A
55it [00:01, 43.68it/s][A
60it [00:01, 44.01it/s][A
65it [00:01, 44.45it/s][A
70it [00:01, 44.98it/s][A
75it [00:01, 44.98it/s][A
80it [00:01, 45.23it/s][A
85it [00:01, 44.18it/s][A
90it [00:02, 44.57it/s][A
95it [00:02, 44.73it/s][A
100it [00:02, 45.03it/s][A
105it [00:02, 45.06it/s][A

Epoch: 380, Step: 100, Loss: 4.486873950958252



110it [00:02, 45.04it/s][A
115it [00:02, 45.21it/s][A
120it [00:02, 45.12it/s][A
125it [00:02, 45.19it/s][A
130it [00:02, 44.09it/s][A
135it [00:03, 44.30it/s][A
140it [00:03, 44.83it/s][A
145it [00:03, 45.16it/s][A
150it [00:03, 45.47it/s][A
155it [00:03, 45.50it/s][A
160it [00:03, 45.46it/s][A
165it [00:03, 44.08it/s][A
170it [00:03, 43.28it/s][A
175it [00:03, 43.89it/s][A
180it [00:04, 44.13it/s][A
185it [00:04, 44.42it/s][A
190it [00:04, 44.94it/s][A
195it [00:04, 45.06it/s][A
200it [00:04, 44.91it/s][A
205it [00:04, 44.47it/s][A

Epoch: 380, Step: 200, Loss: 4.500242230892181



210it [00:04, 43.86it/s][A
215it [00:04, 44.13it/s][A
220it [00:04, 42.91it/s][A
227it [00:05, 44.40it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.10it/s][A
12it [00:00, 58.62it/s][A
18it [00:00, 56.25it/s][A
24it [00:00, 57.30it/s][A
30it [00:00, 57.76it/s][A
37it [00:00, 58.88it/s][A
44it [00:00, 59.48it/s][A
50it [00:00, 59.60it/s][A
56it [00:00, 59.63it/s][A
62it [00:01, 59.36it/s][A
68it [00:01, 59.29it/s][A
74it [00:01, 58.85it/s][A
80it [00:01, 57.83it/s][A
86it [00:01, 57.81it/s][A
93it [00:01, 58.85it/s][A
99it [00:01, 57.22it/s][A
105it [00:01, 57.82it/s][A
111it [00:01, 57.13it/s][A
117it [00:02, 57.52it/s][A
123it [00:02, 55.81it/s][A
129it [00:02, 56.91it/s][A
135it [00:02, 55.81it/s][A
141it [00:02, 55.27it/s][A
147it [00:02, 54.85it/s][A
153it [00:02, 56.00it/s][A
159it [00:02, 56.75it/s][A
165it [00:02, 57.64it/s][A
171it [00:02, 58.28it/s][A
177it [00:03, 58.43it/s][A
183it [00:03, 58.78it/s][A
189it [00:03, 57.51it/s][A
195it [00:03, 57


Epoch: 380, Test Loss: 5.537585198509027, Test Perplexity: 255.1405181647828




0it [00:00, ?it/s][A
5it [00:00, 45.42it/s][A
10it [00:00, 43.57it/s][A
15it [00:00, 41.92it/s][A
20it [00:00, 42.91it/s][A
25it [00:00, 43.48it/s][A
30it [00:00, 44.22it/s][A
35it [00:00, 44.58it/s][A
40it [00:00, 44.07it/s][A
45it [00:01, 43.78it/s][A
50it [00:01, 43.82it/s][A
55it [00:01, 44.28it/s][A
60it [00:01, 44.43it/s][A
65it [00:01, 44.63it/s][A
70it [00:01, 44.62it/s][A
75it [00:01, 44.86it/s][A
80it [00:01, 44.60it/s][A
85it [00:01, 44.14it/s][A
90it [00:02, 43.68it/s][A
95it [00:02, 41.79it/s][A
100it [00:02, 41.64it/s][A
105it [00:02, 42.13it/s][A

Epoch: 381, Step: 100, Loss: 4.484744434356689



110it [00:02, 42.96it/s][A
115it [00:02, 42.99it/s][A
120it [00:02, 43.63it/s][A
125it [00:02, 43.68it/s][A
130it [00:02, 43.67it/s][A
135it [00:03, 43.58it/s][A
140it [00:03, 43.78it/s][A
145it [00:03, 44.42it/s][A
150it [00:03, 44.64it/s][A
155it [00:03, 45.12it/s][A
160it [00:03, 45.38it/s][A
165it [00:03, 44.86it/s][A
170it [00:03, 44.97it/s][A
175it [00:03, 43.15it/s][A
180it [00:04, 43.79it/s][A
185it [00:04, 44.15it/s][A
190it [00:04, 44.60it/s][A
195it [00:04, 44.58it/s][A
200it [00:04, 44.62it/s][A
205it [00:04, 44.75it/s][A

Epoch: 381, Step: 200, Loss: 4.504325633049011



210it [00:04, 42.82it/s][A
215it [00:04, 43.75it/s][A
220it [00:05, 44.36it/s][A
227it [00:05, 43.93it/s]
 76%|███████▌  | 381/500 [45:30<15:39,  7.90s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.06it/s][A
10it [00:00, 41.86it/s][A
15it [00:00, 43.11it/s][A
20it [00:00, 44.12it/s][A
25it [00:00, 42.28it/s][A
30it [00:00, 43.30it/s][A
35it [00:00, 43.83it/s][A
40it [00:00, 44.32it/s][A
45it [00:01, 44.55it/s][A
50it [00:01, 44.59it/s][A
55it [00:01, 44.89it/s][A
60it [00:01, 44.88it/s][A
65it [00:01, 44.92it/s][A
70it [00:01, 44.97it/s][A
75it [00:01, 44.86it/s][A
80it [00:01, 44.84it/s][A
85it [00:01, 45.04it/s][A
90it [00:02, 44.74it/s][A
95it [00:02, 44.76it/s][A
100it [00:02, 45.05it/s][A
105it [00:02, 45.19it/s][A

Epoch: 382, Step: 100, Loss: 4.4887866067886355



110it [00:02, 45.41it/s][A
115it [00:02, 45.65it/s][A
120it [00:02, 45.72it/s][A
125it [00:02, 45.81it/s][A
130it [00:02, 45.90it/s][A
135it [00:03, 45.77it/s][A
140it [00:03, 44.84it/s][A
145it [00:03, 44.98it/s][A
150it [00:03, 45.30it/s][A
155it [00:03, 43.62it/s][A
160it [00:03, 44.23it/s][A
165it [00:03, 44.64it/s][A
170it [00:03, 44.80it/s][A
175it [00:03, 45.11it/s][A
180it [00:04, 44.28it/s][A
185it [00:04, 44.81it/s][A
190it [00:04, 45.01it/s][A
195it [00:04, 44.92it/s][A
200it [00:04, 45.18it/s][A
205it [00:04, 45.12it/s][A

Epoch: 382, Step: 200, Loss: 4.502141377925873



210it [00:04, 45.01it/s][A
215it [00:04, 45.10it/s][A
220it [00:04, 45.25it/s][A
227it [00:05, 44.77it/s]
 76%|███████▋  | 382/500 [45:35<13:51,  7.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.90it/s][A
10it [00:00, 45.21it/s][A
15it [00:00, 44.79it/s][A
20it [00:00, 45.12it/s][A
25it [00:00, 45.31it/s][A
30it [00:00, 45.52it/s][A
35it [00:00, 45.49it/s][A
40it [00:00, 45.54it/s][A
45it [00:00, 45.46it/s][A
50it [00:01, 45.46it/s][A
55it [00:01, 45.37it/s][A
60it [00:01, 45.36it/s][A
65it [00:01, 45.27it/s][A
70it [00:01, 45.34it/s][A
75it [00:01, 45.46it/s][A
80it [00:01, 45.43it/s][A
85it [00:01, 45.22it/s][A
90it [00:01, 44.94it/s][A
95it [00:02, 44.64it/s][A
100it [00:02, 44.37it/s][A
105it [00:02, 44.38it/s][A

Epoch: 383, Step: 100, Loss: 4.485252504348755



110it [00:02, 44.25it/s][A
115it [00:02, 44.55it/s][A
120it [00:02, 44.87it/s][A
125it [00:02, 44.74it/s][A
130it [00:02, 44.84it/s][A
135it [00:03, 44.57it/s][A
140it [00:03, 44.21it/s][A
145it [00:03, 44.23it/s][A
150it [00:03, 44.50it/s][A
155it [00:03, 43.29it/s][A
160it [00:03, 43.62it/s][A
165it [00:03, 43.02it/s][A
170it [00:03, 43.10it/s][A
175it [00:03, 43.55it/s][A
180it [00:04, 43.63it/s][A
185it [00:04, 43.50it/s][A
190it [00:04, 43.70it/s][A
195it [00:04, 44.01it/s][A
200it [00:04, 42.37it/s][A
205it [00:04, 43.38it/s][A

Epoch: 383, Step: 200, Loss: 4.503042933940887



210it [00:04, 43.72it/s][A
215it [00:04, 44.40it/s][A
220it [00:04, 44.63it/s][A
227it [00:05, 44.40it/s]
 77%|███████▋  | 383/500 [45:40<12:36,  6.47s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.53it/s][A
10it [00:00, 44.62it/s][A
15it [00:00, 44.72it/s][A
20it [00:00, 44.88it/s][A
25it [00:00, 44.93it/s][A
30it [00:00, 44.90it/s][A
35it [00:00, 44.70it/s][A
40it [00:00, 44.65it/s][A
45it [00:01, 44.61it/s][A
50it [00:01, 43.48it/s][A
55it [00:01, 43.99it/s][A
60it [00:01, 44.35it/s][A
65it [00:01, 43.47it/s][A
70it [00:01, 43.91it/s][A
75it [00:01, 43.34it/s][A
80it [00:01, 44.07it/s][A
85it [00:01, 43.75it/s][A
90it [00:02, 44.35it/s][A
95it [00:02, 44.20it/s][A
100it [00:02, 43.61it/s][A
105it [00:02, 44.09it/s][A

Epoch: 384, Step: 100, Loss: 4.484220080375671



110it [00:02, 44.20it/s][A
115it [00:02, 44.46it/s][A
120it [00:02, 44.48it/s][A
125it [00:02, 44.60it/s][A
130it [00:02, 43.12it/s][A
135it [00:03, 43.99it/s][A
140it [00:03, 44.12it/s][A
145it [00:03, 44.29it/s][A
150it [00:03, 44.73it/s][A
155it [00:03, 44.69it/s][A
160it [00:03, 44.91it/s][A
165it [00:03, 44.94it/s][A
170it [00:03, 45.12it/s][A
175it [00:03, 45.20it/s][A
180it [00:04, 45.41it/s][A
185it [00:04, 45.46it/s][A
190it [00:04, 45.49it/s][A
195it [00:04, 44.31it/s][A
200it [00:04, 44.72it/s][A
205it [00:04, 43.59it/s][A

Epoch: 384, Step: 200, Loss: 4.497804815769196



210it [00:04, 44.22it/s][A
215it [00:04, 44.71it/s][A
220it [00:04, 45.06it/s][A
227it [00:05, 44.45it/s]
 77%|███████▋  | 384/500 [45:45<11:43,  6.06s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.87it/s][A
10it [00:00, 45.57it/s][A
15it [00:00, 43.45it/s][A
20it [00:00, 43.54it/s][A
25it [00:00, 42.49it/s][A
30it [00:00, 43.48it/s][A
35it [00:00, 43.99it/s][A
40it [00:00, 44.53it/s][A
45it [00:01, 43.80it/s][A
50it [00:01, 44.19it/s][A
55it [00:01, 44.40it/s][A
60it [00:01, 44.15it/s][A
65it [00:01, 44.24it/s][A
70it [00:01, 44.67it/s][A
75it [00:01, 44.85it/s][A
80it [00:01, 44.78it/s][A
85it [00:01, 44.86it/s][A
90it [00:02, 45.24it/s][A
95it [00:02, 45.31it/s][A
100it [00:02, 45.33it/s][A
105it [00:02, 45.24it/s][A

Epoch: 385, Step: 100, Loss: 4.489241805076599



110it [00:02, 44.98it/s][A
115it [00:02, 44.63it/s][A
120it [00:02, 44.85it/s][A
125it [00:02, 44.85it/s][A
130it [00:02, 44.88it/s][A
135it [00:03, 45.06it/s][A
140it [00:03, 43.85it/s][A
145it [00:03, 42.70it/s][A
150it [00:03, 43.16it/s][A
155it [00:03, 43.74it/s][A
160it [00:03, 43.93it/s][A
165it [00:03, 44.31it/s][A
170it [00:03, 44.49it/s][A
175it [00:03, 44.79it/s][A
180it [00:04, 44.97it/s][A
185it [00:04, 43.70it/s][A
190it [00:04, 43.58it/s][A
195it [00:04, 43.66it/s][A
200it [00:04, 41.41it/s][A
205it [00:04, 41.32it/s][A

Epoch: 385, Step: 200, Loss: 4.5030556130409245



210it [00:04, 42.52it/s][A
215it [00:04, 43.23it/s][A
220it [00:04, 43.47it/s][A
227it [00:05, 44.08it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.99it/s][A
12it [00:00, 54.19it/s][A
18it [00:00, 55.89it/s][A
24it [00:00, 55.73it/s][A
30it [00:00, 54.96it/s][A
36it [00:00, 55.68it/s][A
42it [00:00, 56.78it/s][A
49it [00:00, 58.04it/s][A
55it [00:00, 56.06it/s][A
61it [00:01, 57.17it/s][A
68it [00:01, 58.26it/s][A
75it [00:01, 59.05it/s][A
82it [00:01, 59.63it/s][A
89it [00:01, 60.17it/s][A
96it [00:01, 60.56it/s][A
103it [00:01, 60.78it/s][A
110it [00:01, 60.75it/s][A
117it [00:01, 60.86it/s][A
124it [00:02, 60.83it/s][A
131it [00:02, 60.88it/s][A
138it [00:02, 60.88it/s][A
145it [00:02, 60.67it/s][A
152it [00:02, 60.65it/s][A
159it [00:02, 60.21it/s][A
166it [00:02, 58.33it/s][A
172it [00:02, 56.12it/s][A
178it [00:03, 55.55it/s][A
185it [00:03, 57.14it/s][A
192it [00:03, 58.52it/s][A
199it [00:03, 59.30it/s][A
205it [00:03, 58.84it/s][A
212it [00:03, 5


Epoch: 385, Test Loss: 5.539061795110288, Test Perplexity: 255.4627945467552




0it [00:00, ?it/s][A
5it [00:00, 45.05it/s][A
10it [00:00, 45.24it/s][A
15it [00:00, 45.02it/s][A
20it [00:00, 45.10it/s][A
25it [00:00, 45.13it/s][A
30it [00:00, 44.79it/s][A
35it [00:00, 44.23it/s][A
40it [00:00, 43.84it/s][A
45it [00:01, 43.18it/s][A
50it [00:01, 43.04it/s][A
55it [00:01, 43.78it/s][A
60it [00:01, 43.87it/s][A
65it [00:01, 44.34it/s][A
70it [00:01, 43.67it/s][A
75it [00:01, 44.16it/s][A
80it [00:01, 43.90it/s][A
85it [00:01, 44.09it/s][A
90it [00:02, 44.09it/s][A
95it [00:02, 44.37it/s][A
100it [00:02, 44.44it/s][A
105it [00:02, 44.78it/s][A

Epoch: 386, Step: 100, Loss: 4.49215751171112



110it [00:02, 44.61it/s][A
115it [00:02, 44.66it/s][A
120it [00:02, 44.90it/s][A
125it [00:02, 44.98it/s][A
130it [00:02, 45.04it/s][A
135it [00:03, 45.18it/s][A
140it [00:03, 45.19it/s][A
145it [00:03, 45.44it/s][A
150it [00:03, 45.40it/s][A
155it [00:03, 45.32it/s][A
160it [00:03, 45.49it/s][A
165it [00:03, 45.42it/s][A
170it [00:03, 45.31it/s][A
175it [00:03, 44.75it/s][A
180it [00:04, 44.84it/s][A
185it [00:04, 45.21it/s][A
190it [00:04, 43.67it/s][A
195it [00:04, 44.39it/s][A
200it [00:04, 44.67it/s][A
205it [00:04, 44.96it/s][A

Epoch: 386, Step: 200, Loss: 4.5017389392852785



210it [00:04, 44.89it/s][A
215it [00:04, 43.70it/s][A
220it [00:04, 43.51it/s][A
227it [00:05, 44.47it/s]
 77%|███████▋  | 386/500 [46:07<14:56,  7.87s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.98it/s][A
10it [00:00, 41.38it/s][A
15it [00:00, 42.93it/s][A
20it [00:00, 44.08it/s][A
25it [00:00, 44.81it/s][A
30it [00:00, 45.21it/s][A
35it [00:00, 45.01it/s][A
40it [00:00, 45.14it/s][A
45it [00:01, 45.30it/s][A
50it [00:01, 45.52it/s][A
55it [00:01, 45.66it/s][A
60it [00:01, 45.70it/s][A
65it [00:01, 45.83it/s][A
70it [00:01, 45.91it/s][A
75it [00:01, 45.65it/s][A
80it [00:01, 44.73it/s][A
85it [00:01, 44.94it/s][A
90it [00:01, 45.22it/s][A
95it [00:02, 45.58it/s][A
100it [00:02, 45.82it/s][A
105it [00:02, 45.79it/s][A

Epoch: 387, Step: 100, Loss: 4.495057420730591



110it [00:02, 44.44it/s][A
115it [00:02, 44.94it/s][A
120it [00:02, 45.31it/s][A
125it [00:02, 45.60it/s][A
130it [00:02, 45.52it/s][A
135it [00:02, 45.73it/s][A
140it [00:03, 45.63it/s][A
145it [00:03, 45.39it/s][A
150it [00:03, 45.27it/s][A
155it [00:03, 45.31it/s][A
160it [00:03, 43.88it/s][A
165it [00:03, 43.87it/s][A
170it [00:03, 42.85it/s][A
175it [00:03, 43.71it/s][A
180it [00:04, 43.42it/s][A
185it [00:04, 43.96it/s][A
190it [00:04, 44.31it/s][A
195it [00:04, 44.83it/s][A
200it [00:04, 43.96it/s][A
205it [00:04, 44.44it/s][A

Epoch: 387, Step: 200, Loss: 4.501720912456513



210it [00:04, 43.20it/s][A
215it [00:04, 42.06it/s][A
220it [00:04, 43.04it/s][A
227it [00:05, 44.60it/s]
 77%|███████▋  | 387/500 [46:12<13:14,  7.03s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.64it/s][A
10it [00:00, 45.93it/s][A
15it [00:00, 43.46it/s][A
20it [00:00, 44.35it/s][A
25it [00:00, 44.84it/s][A
30it [00:00, 44.96it/s][A
35it [00:00, 44.81it/s][A
40it [00:00, 44.15it/s][A
45it [00:01, 43.72it/s][A
50it [00:01, 42.80it/s][A
55it [00:01, 43.44it/s][A
60it [00:01, 41.92it/s][A
65it [00:01, 42.89it/s][A
70it [00:01, 43.59it/s][A
75it [00:01, 43.97it/s][A
80it [00:01, 44.35it/s][A
85it [00:01, 44.27it/s][A
90it [00:02, 43.65it/s][A
95it [00:02, 43.22it/s][A
100it [00:02, 43.86it/s][A
105it [00:02, 44.14it/s][A

Epoch: 388, Step: 100, Loss: 4.483191022872925



110it [00:02, 44.39it/s][A
115it [00:02, 44.46it/s][A
120it [00:02, 44.59it/s][A
125it [00:02, 43.92it/s][A
130it [00:02, 44.15it/s][A
135it [00:03, 44.40it/s][A
140it [00:03, 43.66it/s][A
145it [00:03, 44.01it/s][A
150it [00:03, 43.88it/s][A
155it [00:03, 43.96it/s][A
160it [00:03, 43.98it/s][A
165it [00:03, 44.02it/s][A
170it [00:03, 44.32it/s][A
175it [00:03, 44.32it/s][A
180it [00:04, 44.46it/s][A
185it [00:04, 44.56it/s][A
190it [00:04, 44.99it/s][A
195it [00:04, 45.03it/s][A
200it [00:04, 45.01it/s][A
205it [00:04, 44.93it/s][A

Epoch: 388, Step: 200, Loss: 4.500641477108002



210it [00:04, 44.99it/s][A
215it [00:04, 45.00it/s][A
220it [00:04, 45.21it/s][A
227it [00:05, 44.22it/s]
 78%|███████▊  | 388/500 [46:17<12:04,  6.47s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.67it/s][A
10it [00:00, 42.42it/s][A
15it [00:00, 43.99it/s][A
20it [00:00, 44.80it/s][A
25it [00:00, 44.94it/s][A
30it [00:00, 45.07it/s][A
35it [00:00, 45.22it/s][A
40it [00:00, 45.53it/s][A
45it [00:00, 45.62it/s][A
50it [00:01, 43.05it/s][A
55it [00:01, 43.76it/s][A
60it [00:01, 44.49it/s][A
65it [00:01, 44.85it/s][A
70it [00:01, 45.04it/s][A
75it [00:01, 44.66it/s][A
80it [00:01, 45.08it/s][A
85it [00:01, 45.56it/s][A
90it [00:02, 45.71it/s][A
95it [00:02, 45.55it/s][A
100it [00:02, 45.58it/s][A
105it [00:02, 45.51it/s][A

Epoch: 389, Step: 100, Loss: 4.491478714942932



110it [00:02, 45.13it/s][A
115it [00:02, 45.20it/s][A
120it [00:02, 44.99it/s][A
125it [00:02, 45.26it/s][A
130it [00:02, 45.23it/s][A
135it [00:03, 44.98it/s][A
140it [00:03, 45.17it/s][A
145it [00:03, 45.15it/s][A
150it [00:03, 45.14it/s][A
155it [00:03, 45.22it/s][A
160it [00:03, 45.25it/s][A
165it [00:03, 45.33it/s][A
170it [00:03, 45.36it/s][A
175it [00:03, 45.24it/s][A
180it [00:03, 45.07it/s][A
185it [00:04, 45.32it/s][A
190it [00:04, 45.07it/s][A
195it [00:04, 45.19it/s][A
200it [00:04, 45.27it/s][A
205it [00:04, 44.21it/s][A

Epoch: 389, Step: 200, Loss: 4.499024128913879



210it [00:04, 44.21it/s][A
215it [00:04, 44.69it/s][A
220it [00:04, 44.49it/s][A
227it [00:05, 44.94it/s]
 78%|███████▊  | 389/500 [46:22<11:10,  6.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.46it/s][A
10it [00:00, 45.65it/s][A
15it [00:00, 45.66it/s][A
20it [00:00, 45.55it/s][A
25it [00:00, 43.96it/s][A
30it [00:00, 44.64it/s][A
35it [00:00, 45.27it/s][A
40it [00:00, 44.57it/s][A
45it [00:00, 45.18it/s][A
50it [00:01, 45.08it/s][A
55it [00:01, 45.32it/s][A
60it [00:01, 45.26it/s][A
65it [00:01, 45.48it/s][A
70it [00:01, 45.59it/s][A
75it [00:01, 45.76it/s][A
80it [00:01, 46.07it/s][A
85it [00:01, 46.09it/s][A
90it [00:01, 46.20it/s][A
95it [00:02, 45.33it/s][A
100it [00:02, 45.15it/s][A
105it [00:02, 45.02it/s][A

Epoch: 390, Step: 100, Loss: 4.480316901206971



110it [00:02, 45.15it/s][A
115it [00:02, 44.85it/s][A
120it [00:02, 45.29it/s][A
125it [00:02, 43.77it/s][A
130it [00:02, 44.52it/s][A
135it [00:03, 43.57it/s][A
140it [00:03, 42.95it/s][A
145it [00:03, 43.75it/s][A
150it [00:03, 44.20it/s][A
155it [00:03, 44.08it/s][A
160it [00:03, 43.38it/s][A
165it [00:03, 43.70it/s][A
170it [00:03, 44.32it/s][A
175it [00:03, 44.84it/s][A
180it [00:04, 45.30it/s][A
185it [00:04, 45.42it/s][A
190it [00:04, 44.07it/s][A
195it [00:04, 44.19it/s][A
200it [00:04, 44.46it/s][A
205it [00:04, 44.79it/s][A

Epoch: 390, Step: 200, Loss: 4.502746224403381



210it [00:04, 44.84it/s][A
215it [00:04, 43.30it/s][A
220it [00:04, 42.46it/s][A
227it [00:05, 44.62it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.12it/s][A
12it [00:00, 58.48it/s][A
18it [00:00, 55.83it/s][A
25it [00:00, 58.05it/s][A
31it [00:00, 58.53it/s][A
38it [00:00, 59.22it/s][A
44it [00:00, 56.91it/s][A
50it [00:00, 57.46it/s][A
56it [00:00, 57.88it/s][A
63it [00:01, 58.77it/s][A
70it [00:01, 59.49it/s][A
76it [00:01, 59.24it/s][A
82it [00:01, 59.22it/s][A
89it [00:01, 59.62it/s][A
96it [00:01, 59.86it/s][A
102it [00:01, 59.77it/s][A
108it [00:01, 59.60it/s][A
114it [00:01, 56.89it/s][A
121it [00:02, 57.98it/s][A
128it [00:02, 58.68it/s][A
134it [00:02, 55.98it/s][A
140it [00:02, 56.62it/s][A
146it [00:02, 56.99it/s][A
152it [00:02, 57.40it/s][A
158it [00:02, 58.01it/s][A
165it [00:02, 58.77it/s][A
172it [00:02, 59.58it/s][A
178it [00:03, 59.37it/s][A
184it [00:03, 57.62it/s][A
190it [00:03, 57.87it/s][A
196it [00:03, 56.10it/s][A
202it [00:03, 5


Epoch: 390, Test Loss: 5.546969319722667, Test Perplexity: 257.50220326014926




0it [00:00, ?it/s][A
5it [00:00, 43.56it/s][A
10it [00:00, 44.22it/s][A
15it [00:00, 42.59it/s][A
20it [00:00, 43.89it/s][A
25it [00:00, 44.43it/s][A
30it [00:00, 44.76it/s][A
35it [00:00, 44.91it/s][A
40it [00:00, 44.85it/s][A
45it [00:01, 45.12it/s][A
50it [00:01, 43.97it/s][A
55it [00:01, 44.40it/s][A
60it [00:01, 43.55it/s][A
65it [00:01, 44.21it/s][A
70it [00:01, 44.54it/s][A
75it [00:01, 44.44it/s][A
80it [00:01, 44.69it/s][A
85it [00:01, 44.62it/s][A
90it [00:02, 44.78it/s][A
95it [00:02, 44.65it/s][A
100it [00:02, 44.93it/s][A
105it [00:02, 45.01it/s][A

Epoch: 391, Step: 100, Loss: 4.47916305065155



110it [00:02, 44.08it/s][A
115it [00:02, 44.37it/s][A
120it [00:02, 44.52it/s][A
125it [00:02, 44.73it/s][A
130it [00:02, 45.01it/s][A
135it [00:03, 44.13it/s][A
140it [00:03, 44.40it/s][A
145it [00:03, 44.75it/s][A
150it [00:03, 45.12it/s][A
155it [00:03, 45.29it/s][A
160it [00:03, 45.48it/s][A
165it [00:03, 45.29it/s][A
170it [00:03, 45.38it/s][A
175it [00:03, 45.48it/s][A
180it [00:04, 45.42it/s][A
185it [00:04, 45.38it/s][A
190it [00:04, 45.61it/s][A
195it [00:04, 45.15it/s][A
200it [00:04, 45.27it/s][A
205it [00:04, 44.02it/s][A

Epoch: 391, Step: 200, Loss: 4.497934050559998



210it [00:04, 44.12it/s][A
215it [00:04, 44.65it/s][A
220it [00:04, 44.86it/s][A
227it [00:05, 44.71it/s]
 78%|███████▊  | 391/500 [46:43<14:15,  7.85s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.32it/s][A
10it [00:00, 45.53it/s][A
15it [00:00, 45.52it/s][A
20it [00:00, 45.70it/s][A
25it [00:00, 45.55it/s][A
30it [00:00, 45.12it/s][A
35it [00:00, 45.23it/s][A
40it [00:00, 44.21it/s][A
45it [00:00, 44.70it/s][A
50it [00:01, 44.97it/s][A
55it [00:01, 45.26it/s][A
60it [00:01, 45.35it/s][A
65it [00:01, 45.28it/s][A
70it [00:01, 45.34it/s][A
75it [00:01, 45.28it/s][A
80it [00:01, 45.58it/s][A
85it [00:01, 45.80it/s][A
90it [00:01, 45.47it/s][A
95it [00:02, 45.50it/s][A
100it [00:02, 45.71it/s][A
105it [00:02, 45.98it/s][A

Epoch: 392, Step: 100, Loss: 4.493415327072143



110it [00:02, 45.57it/s][A
115it [00:02, 45.58it/s][A
120it [00:02, 45.26it/s][A
125it [00:02, 45.48it/s][A
130it [00:02, 45.74it/s][A
135it [00:02, 45.75it/s][A
140it [00:03, 45.65it/s][A
145it [00:03, 45.64it/s][A
150it [00:03, 45.72it/s][A
155it [00:03, 45.67it/s][A
160it [00:03, 45.61it/s][A
165it [00:03, 45.21it/s][A
170it [00:03, 45.35it/s][A
175it [00:03, 45.17it/s][A
180it [00:03, 45.37it/s][A
185it [00:04, 44.60it/s][A
190it [00:04, 44.36it/s][A
195it [00:04, 44.11it/s][A
200it [00:04, 43.67it/s][A
205it [00:04, 44.03it/s][A

Epoch: 392, Step: 200, Loss: 4.498863651752472



210it [00:04, 44.20it/s][A
215it [00:04, 44.43it/s][A
220it [00:04, 43.28it/s][A
227it [00:05, 45.02it/s]
 78%|███████▊  | 392/500 [46:48<12:36,  7.01s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.77it/s][A
10it [00:00, 43.14it/s][A
15it [00:00, 43.74it/s][A
20it [00:00, 44.35it/s][A
25it [00:00, 44.41it/s][A
30it [00:00, 42.91it/s][A
35it [00:00, 43.42it/s][A
40it [00:00, 43.46it/s][A
45it [00:01, 43.84it/s][A
50it [00:01, 44.39it/s][A
55it [00:01, 43.14it/s][A
60it [00:01, 43.89it/s][A
65it [00:01, 44.16it/s][A
70it [00:01, 44.54it/s][A
75it [00:01, 44.73it/s][A
80it [00:01, 45.03it/s][A
85it [00:01, 45.41it/s][A
90it [00:02, 44.99it/s][A
95it [00:02, 45.25it/s][A
100it [00:02, 45.18it/s][A
105it [00:02, 45.28it/s][A

Epoch: 393, Step: 100, Loss: 4.488501543998718



110it [00:02, 45.21it/s][A
115it [00:02, 44.22it/s][A
120it [00:02, 44.59it/s][A
125it [00:02, 43.74it/s][A
130it [00:02, 44.44it/s][A
135it [00:03, 44.45it/s][A
140it [00:03, 42.95it/s][A
145it [00:03, 43.56it/s][A
150it [00:03, 43.95it/s][A
155it [00:03, 42.71it/s][A
160it [00:03, 43.32it/s][A
165it [00:03, 43.75it/s][A
170it [00:03, 42.63it/s][A
175it [00:03, 43.01it/s][A
180it [00:04, 43.52it/s][A
185it [00:04, 44.06it/s][A
190it [00:04, 44.34it/s][A
195it [00:04, 44.56it/s][A
200it [00:04, 44.57it/s][A
205it [00:04, 44.77it/s][A

Epoch: 393, Step: 200, Loss: 4.5021949791908265



210it [00:04, 44.45it/s][A
215it [00:04, 44.66it/s][A
220it [00:04, 44.90it/s][A
227it [00:05, 44.18it/s]
 79%|███████▊  | 393/500 [46:53<11:29,  6.45s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.90it/s][A
10it [00:00, 45.38it/s][A
15it [00:00, 45.05it/s][A
20it [00:00, 45.18it/s][A
25it [00:00, 45.79it/s][A
30it [00:00, 45.91it/s][A
35it [00:00, 45.73it/s][A
40it [00:00, 45.89it/s][A
45it [00:00, 45.87it/s][A
50it [00:01, 45.96it/s][A
55it [00:01, 45.68it/s][A
60it [00:01, 44.78it/s][A
65it [00:01, 43.49it/s][A
70it [00:01, 44.10it/s][A
75it [00:01, 44.55it/s][A
80it [00:01, 43.68it/s][A
85it [00:01, 44.49it/s][A
90it [00:01, 45.19it/s][A
95it [00:02, 45.56it/s][A
100it [00:02, 45.75it/s][A
105it [00:02, 45.67it/s][A

Epoch: 394, Step: 100, Loss: 4.489362306594849



110it [00:02, 45.99it/s][A
115it [00:02, 46.02it/s][A
120it [00:02, 46.06it/s][A
125it [00:02, 46.35it/s][A
130it [00:02, 45.00it/s][A
135it [00:02, 45.58it/s][A
140it [00:03, 45.90it/s][A
145it [00:03, 45.53it/s][A
150it [00:03, 44.43it/s][A
155it [00:03, 45.15it/s][A
160it [00:03, 45.32it/s][A
165it [00:03, 45.37it/s][A
170it [00:03, 45.72it/s][A
175it [00:03, 46.01it/s][A
180it [00:03, 46.10it/s][A
185it [00:04, 46.36it/s][A
190it [00:04, 45.80it/s][A
195it [00:04, 46.02it/s][A
200it [00:04, 46.13it/s][A
205it [00:04, 46.04it/s][A

Epoch: 394, Step: 200, Loss: 4.499185523986816



210it [00:04, 44.32it/s][A
215it [00:04, 45.11it/s][A
220it [00:04, 44.11it/s][A
227it [00:05, 45.21it/s]
 79%|███████▉  | 394/500 [46:58<10:38,  6.02s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.46it/s][A
10it [00:00, 44.87it/s][A
15it [00:00, 44.52it/s][A
20it [00:00, 44.73it/s][A
25it [00:00, 45.28it/s][A
30it [00:00, 45.51it/s][A
35it [00:00, 45.49it/s][A
40it [00:00, 43.03it/s][A
45it [00:01, 43.49it/s][A
50it [00:01, 44.72it/s][A
55it [00:01, 44.82it/s][A
60it [00:01, 44.86it/s][A
65it [00:01, 44.87it/s][A
70it [00:01, 44.48it/s][A
75it [00:01, 44.30it/s][A
80it [00:01, 43.76it/s][A
85it [00:01, 44.06it/s][A
90it [00:02, 44.45it/s][A
95it [00:02, 43.52it/s][A
100it [00:02, 43.95it/s][A
105it [00:02, 44.25it/s][A

Epoch: 395, Step: 100, Loss: 4.480567922592163



110it [00:02, 44.47it/s][A
115it [00:02, 44.29it/s][A
120it [00:02, 44.80it/s][A
125it [00:02, 44.06it/s][A
130it [00:02, 42.06it/s][A
135it [00:03, 42.90it/s][A
140it [00:03, 43.47it/s][A
145it [00:03, 44.05it/s][A
150it [00:03, 44.07it/s][A
155it [00:03, 44.44it/s][A
160it [00:03, 43.38it/s][A
165it [00:03, 43.42it/s][A
170it [00:03, 43.07it/s][A
175it [00:03, 42.38it/s][A
180it [00:04, 42.44it/s][A
185it [00:04, 42.86it/s][A
190it [00:04, 43.48it/s][A
195it [00:04, 43.75it/s][A
200it [00:04, 44.27it/s][A
205it [00:04, 44.40it/s][A

Epoch: 395, Step: 200, Loss: 4.4981716346740725



210it [00:04, 44.27it/s][A
215it [00:04, 44.66it/s][A
220it [00:04, 45.05it/s][A
227it [00:05, 43.95it/s]

0it [00:00, ?it/s][A
6it [00:00, 56.53it/s][A
12it [00:00, 57.33it/s][A
19it [00:00, 58.79it/s][A
26it [00:00, 59.37it/s][A
32it [00:00, 57.80it/s][A
38it [00:00, 57.60it/s][A
44it [00:00, 58.25it/s][A
50it [00:00, 58.46it/s][A
56it [00:00, 58.85it/s][A
62it [00:01, 58.89it/s][A
68it [00:01, 58.59it/s][A
74it [00:01, 57.75it/s][A
80it [00:01, 57.98it/s][A
87it [00:01, 58.74it/s][A
93it [00:01, 59.00it/s][A
99it [00:01, 59.03it/s][A
105it [00:01, 58.86it/s][A
112it [00:01, 59.50it/s][A
118it [00:02, 59.36it/s][A
124it [00:02, 57.31it/s][A
130it [00:02, 58.04it/s][A
137it [00:02, 58.78it/s][A
143it [00:02, 57.25it/s][A
149it [00:02, 57.76it/s][A
156it [00:02, 58.73it/s][A
163it [00:02, 59.31it/s][A
169it [00:02, 57.52it/s][A
175it [00:03, 57.63it/s][A
181it [00:03, 58.28it/s][A
187it [00:03, 57.34it/s][A
194it [00:03, 58.48it/s][A
201it [00:03, 59


Epoch: 395, Test Loss: 5.5449814078230295, Test Perplexity: 256.95362536507366




0it [00:00, ?it/s][A
5it [00:00, 44.54it/s][A
10it [00:00, 44.70it/s][A
15it [00:00, 44.06it/s][A
20it [00:00, 44.65it/s][A
25it [00:00, 45.06it/s][A
30it [00:00, 45.05it/s][A
35it [00:00, 45.15it/s][A
40it [00:00, 45.39it/s][A
45it [00:00, 45.43it/s][A
50it [00:01, 45.45it/s][A
55it [00:01, 45.34it/s][A
60it [00:01, 45.43it/s][A
65it [00:01, 45.30it/s][A
70it [00:01, 45.32it/s][A
75it [00:01, 45.26it/s][A
80it [00:01, 45.35it/s][A
85it [00:01, 45.27it/s][A
90it [00:01, 44.91it/s][A
95it [00:02, 44.62it/s][A
100it [00:02, 44.78it/s][A
105it [00:02, 45.01it/s][A

Epoch: 396, Step: 100, Loss: 4.495177965164185



110it [00:02, 44.62it/s][A
115it [00:02, 44.84it/s][A
120it [00:02, 44.76it/s][A
125it [00:02, 43.47it/s][A
130it [00:02, 44.16it/s][A
135it [00:03, 44.34it/s][A
140it [00:03, 43.30it/s][A
145it [00:03, 44.11it/s][A
150it [00:03, 44.32it/s][A
155it [00:03, 44.75it/s][A
160it [00:03, 45.16it/s][A
165it [00:03, 45.25it/s][A
170it [00:03, 45.41it/s][A
175it [00:03, 44.39it/s][A
180it [00:04, 44.65it/s][A
185it [00:04, 44.19it/s][A
190it [00:04, 43.12it/s][A
195it [00:04, 42.13it/s][A
200it [00:04, 43.17it/s][A
205it [00:04, 43.84it/s][A

Epoch: 396, Step: 200, Loss: 4.4980477881431575



210it [00:04, 44.39it/s][A
215it [00:04, 44.69it/s][A
220it [00:04, 43.71it/s][A
227it [00:05, 44.53it/s]
 79%|███████▉  | 396/500 [47:19<13:36,  7.85s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.13it/s][A
10it [00:00, 46.03it/s][A
15it [00:00, 45.78it/s][A
20it [00:00, 45.78it/s][A
25it [00:00, 45.57it/s][A
30it [00:00, 45.74it/s][A
35it [00:00, 45.85it/s][A
40it [00:00, 45.55it/s][A
45it [00:00, 45.59it/s][A
50it [00:01, 45.69it/s][A
55it [00:01, 44.12it/s][A
60it [00:01, 44.14it/s][A
65it [00:01, 43.54it/s][A
70it [00:01, 43.25it/s][A
75it [00:01, 43.98it/s][A
80it [00:01, 43.38it/s][A
85it [00:01, 42.09it/s][A
90it [00:02, 43.14it/s][A
95it [00:02, 43.15it/s][A
100it [00:02, 43.79it/s][A
105it [00:02, 44.33it/s][A

Epoch: 397, Step: 100, Loss: 4.487199277877807



110it [00:02, 42.80it/s][A
115it [00:02, 43.58it/s][A
120it [00:02, 44.11it/s][A
125it [00:02, 44.15it/s][A
130it [00:02, 44.30it/s][A
135it [00:03, 42.10it/s][A
140it [00:03, 42.13it/s][A
145it [00:03, 42.04it/s][A
150it [00:03, 42.82it/s][A
155it [00:03, 43.20it/s][A
160it [00:03, 43.82it/s][A
165it [00:03, 44.37it/s][A
170it [00:03, 44.77it/s][A
175it [00:03, 44.93it/s][A
180it [00:04, 45.08it/s][A
185it [00:04, 45.30it/s][A
190it [00:04, 45.16it/s][A
195it [00:04, 45.01it/s][A
200it [00:04, 45.23it/s][A
205it [00:04, 45.30it/s][A

Epoch: 397, Step: 200, Loss: 4.4952148342132565



210it [00:04, 45.42it/s][A
215it [00:04, 44.37it/s][A
220it [00:04, 44.69it/s][A
227it [00:05, 44.11it/s]
 79%|███████▉  | 397/500 [47:24<12:05,  7.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.61it/s][A
10it [00:00, 45.34it/s][A
15it [00:00, 44.66it/s][A
20it [00:00, 42.33it/s][A
25it [00:00, 43.56it/s][A
30it [00:00, 44.21it/s][A
35it [00:00, 44.46it/s][A
40it [00:00, 44.70it/s][A
45it [00:01, 43.68it/s][A
50it [00:01, 44.21it/s][A
55it [00:01, 44.61it/s][A
60it [00:01, 44.56it/s][A
65it [00:01, 44.98it/s][A
70it [00:01, 45.36it/s][A
75it [00:01, 45.40it/s][A
80it [00:01, 45.58it/s][A
85it [00:01, 45.37it/s][A
90it [00:02, 45.42it/s][A
95it [00:02, 44.55it/s][A
100it [00:02, 44.90it/s][A
105it [00:02, 43.17it/s][A

Epoch: 398, Step: 100, Loss: 4.475404796600341



110it [00:02, 43.87it/s][A
115it [00:02, 44.46it/s][A
120it [00:02, 44.90it/s][A
125it [00:02, 44.39it/s][A
130it [00:02, 44.37it/s][A
135it [00:03, 44.78it/s][A
140it [00:03, 45.26it/s][A
145it [00:03, 45.73it/s][A
150it [00:03, 45.68it/s][A
155it [00:03, 45.33it/s][A
160it [00:03, 45.36it/s][A
165it [00:03, 43.35it/s][A
170it [00:03, 43.13it/s][A
175it [00:03, 43.96it/s][A
180it [00:04, 44.20it/s][A
185it [00:04, 44.95it/s][A
190it [00:04, 44.25it/s][A
195it [00:04, 44.87it/s][A
200it [00:04, 44.67it/s][A
205it [00:04, 45.23it/s][A

Epoch: 398, Step: 200, Loss: 4.495661268234253



210it [00:04, 45.34it/s][A
215it [00:04, 45.55it/s][A
220it [00:04, 45.56it/s][A
227it [00:05, 44.72it/s]
 80%|███████▉  | 398/500 [47:30<10:58,  6.45s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.11it/s][A
10it [00:00, 46.31it/s][A
15it [00:00, 46.12it/s][A
20it [00:00, 44.25it/s][A
25it [00:00, 45.11it/s][A
30it [00:00, 45.64it/s][A
35it [00:00, 45.85it/s][A
40it [00:00, 43.94it/s][A
45it [00:00, 44.79it/s][A
50it [00:01, 45.39it/s][A
55it [00:01, 45.52it/s][A
60it [00:01, 45.41it/s][A
65it [00:01, 45.46it/s][A
70it [00:01, 45.39it/s][A
75it [00:01, 45.26it/s][A
80it [00:01, 45.38it/s][A
85it [00:01, 42.90it/s][A
90it [00:02, 41.88it/s][A
95it [00:02, 41.80it/s][A
100it [00:02, 43.48it/s][A
105it [00:02, 43.55it/s][A

Epoch: 399, Step: 100, Loss: 4.489281406402588



110it [00:02, 44.34it/s][A
115it [00:02, 43.65it/s][A
120it [00:02, 44.20it/s][A
125it [00:02, 43.00it/s][A
130it [00:02, 43.61it/s][A
135it [00:03, 44.18it/s][A
140it [00:03, 44.48it/s][A
145it [00:03, 44.63it/s][A
150it [00:03, 44.44it/s][A
155it [00:03, 44.56it/s][A
160it [00:03, 43.25it/s][A
165it [00:03, 43.76it/s][A
170it [00:03, 43.74it/s][A
175it [00:03, 43.80it/s][A
180it [00:04, 44.42it/s][A
185it [00:04, 44.73it/s][A
190it [00:04, 44.13it/s][A
195it [00:04, 44.05it/s][A
200it [00:04, 44.01it/s][A
205it [00:04, 43.25it/s][A

Epoch: 399, Step: 200, Loss: 4.494995529651642



210it [00:04, 43.10it/s][A
215it [00:04, 43.51it/s][A
220it [00:04, 44.09it/s][A
227it [00:05, 44.23it/s]
 80%|███████▉  | 399/500 [47:35<10:11,  6.06s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.60it/s][A
10it [00:00, 42.83it/s][A
15it [00:00, 43.45it/s][A
20it [00:00, 44.30it/s][A
25it [00:00, 44.65it/s][A
30it [00:00, 44.73it/s][A
35it [00:00, 44.64it/s][A
40it [00:00, 44.51it/s][A
45it [00:01, 44.89it/s][A
50it [00:01, 45.00it/s][A
55it [00:01, 45.29it/s][A
60it [00:01, 45.47it/s][A
65it [00:01, 45.47it/s][A
70it [00:01, 45.37it/s][A
75it [00:01, 45.07it/s][A
80it [00:01, 45.08it/s][A
85it [00:01, 45.02it/s][A
90it [00:02, 45.29it/s][A
95it [00:02, 44.99it/s][A
100it [00:02, 45.20it/s][A
105it [00:02, 45.19it/s][A

Epoch: 400, Step: 100, Loss: 4.476699848175048



110it [00:02, 45.07it/s][A
115it [00:02, 45.19it/s][A
120it [00:02, 45.15it/s][A
125it [00:02, 43.62it/s][A
130it [00:02, 44.02it/s][A
135it [00:03, 44.54it/s][A
140it [00:03, 44.91it/s][A
145it [00:03, 45.01it/s][A
150it [00:03, 44.33it/s][A
155it [00:03, 44.64it/s][A
160it [00:03, 44.87it/s][A
165it [00:03, 44.94it/s][A
170it [00:03, 44.72it/s][A
175it [00:03, 44.55it/s][A
180it [00:04, 44.96it/s][A
185it [00:04, 45.01it/s][A
190it [00:04, 45.01it/s][A
195it [00:04, 45.06it/s][A
200it [00:04, 44.89it/s][A
205it [00:04, 44.85it/s][A

Epoch: 400, Step: 200, Loss: 4.4948880791664125



210it [00:04, 42.76it/s][A
215it [00:04, 42.40it/s][A
220it [00:04, 43.22it/s][A
227it [00:05, 44.61it/s]

0it [00:00, ?it/s][A
6it [00:00, 53.89it/s][A
12it [00:00, 57.02it/s][A
19it [00:00, 58.60it/s][A
25it [00:00, 59.03it/s][A
31it [00:00, 59.17it/s][A
37it [00:00, 59.27it/s][A
43it [00:00, 59.29it/s][A
50it [00:00, 59.62it/s][A
57it [00:00, 59.91it/s][A
64it [00:01, 60.10it/s][A
71it [00:01, 60.28it/s][A
78it [00:01, 60.46it/s][A
85it [00:01, 60.28it/s][A
92it [00:01, 58.78it/s][A
98it [00:01, 58.99it/s][A
105it [00:01, 59.42it/s][A
112it [00:01, 59.83it/s][A
119it [00:02, 59.96it/s][A
126it [00:02, 60.12it/s][A
133it [00:02, 60.36it/s][A
140it [00:02, 60.43it/s][A
147it [00:02, 59.70it/s][A
153it [00:02, 57.64it/s][A
159it [00:02, 58.28it/s][A
166it [00:02, 58.86it/s][A
173it [00:02, 59.43it/s][A
179it [00:03, 57.87it/s][A
186it [00:03, 58.79it/s][A
193it [00:03, 59.31it/s][A
199it [00:03, 59.47it/s][A
205it [00:03, 59.57it/s][A
212it [00:03, 5


Epoch: 400, Test Loss: 5.546793978406776, Test Perplexity: 257.50782432319215




0it [00:00, ?it/s][A
5it [00:00, 45.43it/s][A
10it [00:00, 45.70it/s][A
15it [00:00, 45.70it/s][A
20it [00:00, 43.12it/s][A
25it [00:00, 43.95it/s][A
30it [00:00, 44.35it/s][A
35it [00:00, 44.62it/s][A
40it [00:00, 45.19it/s][A
45it [00:01, 45.55it/s][A
50it [00:01, 45.61it/s][A
55it [00:01, 45.79it/s][A
60it [00:01, 45.71it/s][A
65it [00:01, 45.66it/s][A
70it [00:01, 45.10it/s][A
75it [00:01, 44.31it/s][A
80it [00:01, 43.30it/s][A
85it [00:01, 43.95it/s][A
90it [00:02, 43.30it/s][A
95it [00:02, 44.11it/s][A
100it [00:02, 43.22it/s][A
105it [00:02, 44.12it/s][A

Epoch: 401, Step: 100, Loss: 4.48377905368805



110it [00:02, 44.45it/s][A
115it [00:02, 43.77it/s][A
120it [00:02, 44.48it/s][A
125it [00:02, 44.90it/s][A
130it [00:02, 44.93it/s][A
135it [00:03, 45.27it/s][A
140it [00:03, 44.98it/s][A
145it [00:03, 45.16it/s][A
150it [00:03, 45.20it/s][A
155it [00:03, 44.94it/s][A
160it [00:03, 43.11it/s][A
165it [00:03, 43.64it/s][A
170it [00:03, 44.13it/s][A
175it [00:03, 44.49it/s][A
180it [00:04, 44.88it/s][A
185it [00:04, 43.45it/s][A
190it [00:04, 43.54it/s][A
195it [00:04, 44.31it/s][A
200it [00:04, 43.55it/s][A
205it [00:04, 43.59it/s][A

Epoch: 401, Step: 200, Loss: 4.495714671611786



210it [00:04, 41.82it/s][A
215it [00:04, 42.35it/s][A
220it [00:04, 41.83it/s][A
227it [00:05, 44.07it/s]
 80%|████████  | 401/500 [47:56<13:01,  7.90s/it]
0it [00:00, ?it/s][A
5it [00:00, 40.43it/s][A
10it [00:00, 39.81it/s][A
15it [00:00, 41.36it/s][A
20it [00:00, 41.84it/s][A
25it [00:00, 42.91it/s][A
30it [00:00, 43.28it/s][A
35it [00:00, 42.63it/s][A
40it [00:00, 42.52it/s][A
45it [00:01, 42.50it/s][A
50it [00:01, 43.05it/s][A
55it [00:01, 43.50it/s][A
60it [00:01, 43.85it/s][A
65it [00:01, 44.19it/s][A
70it [00:01, 44.70it/s][A
75it [00:01, 44.91it/s][A
80it [00:01, 44.92it/s][A
85it [00:01, 44.83it/s][A
90it [00:02, 45.15it/s][A
95it [00:02, 44.12it/s][A
100it [00:02, 44.33it/s][A
105it [00:02, 44.85it/s][A

Epoch: 402, Step: 100, Loss: 4.473542408943176



110it [00:02, 44.81it/s][A
115it [00:02, 45.09it/s][A
120it [00:02, 45.12it/s][A
125it [00:02, 44.04it/s][A
130it [00:02, 42.62it/s][A
135it [00:03, 43.68it/s][A
140it [00:03, 43.04it/s][A
145it [00:03, 44.10it/s][A
150it [00:03, 44.78it/s][A
155it [00:03, 45.37it/s][A
160it [00:03, 45.81it/s][A
165it [00:03, 46.07it/s][A
170it [00:03, 45.92it/s][A
175it [00:03, 45.89it/s][A
180it [00:04, 46.09it/s][A
185it [00:04, 44.98it/s][A
190it [00:04, 45.44it/s][A
195it [00:04, 45.61it/s][A
200it [00:04, 45.71it/s][A
205it [00:04, 45.75it/s][A

Epoch: 402, Step: 200, Loss: 4.49660950422287



210it [00:04, 45.13it/s][A
215it [00:04, 45.53it/s][A
220it [00:04, 45.89it/s][A
227it [00:05, 44.45it/s]
 80%|████████  | 402/500 [48:01<11:31,  7.06s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.43it/s][A
10it [00:00, 46.57it/s][A
15it [00:00, 46.50it/s][A
20it [00:00, 45.56it/s][A
25it [00:00, 45.97it/s][A
30it [00:00, 45.38it/s][A
35it [00:00, 45.78it/s][A
40it [00:00, 46.15it/s][A
45it [00:00, 46.07it/s][A
50it [00:01, 46.17it/s][A
55it [00:01, 46.41it/s][A
60it [00:01, 46.04it/s][A
65it [00:01, 46.25it/s][A
70it [00:01, 46.02it/s][A
75it [00:01, 44.00it/s][A
80it [00:01, 44.41it/s][A
85it [00:01, 43.89it/s][A
90it [00:01, 44.29it/s][A
95it [00:02, 44.93it/s][A
100it [00:02, 45.22it/s][A
105it [00:02, 44.57it/s][A

Epoch: 403, Step: 100, Loss: 4.477843747138977



110it [00:02, 43.70it/s][A
115it [00:02, 44.94it/s][A
120it [00:02, 44.67it/s][A
125it [00:02, 45.11it/s][A
130it [00:02, 45.50it/s][A
135it [00:02, 44.58it/s][A
140it [00:03, 44.64it/s][A
145it [00:03, 44.82it/s][A
150it [00:03, 44.60it/s][A
155it [00:03, 44.81it/s][A
160it [00:03, 45.05it/s][A
165it [00:03, 43.66it/s][A
170it [00:03, 44.08it/s][A
175it [00:03, 44.59it/s][A
180it [00:03, 44.96it/s][A
185it [00:04, 45.35it/s][A
190it [00:04, 45.11it/s][A
195it [00:04, 44.87it/s][A
200it [00:04, 45.03it/s][A
205it [00:04, 44.62it/s][A

Epoch: 403, Step: 200, Loss: 4.494780037403107



210it [00:04, 44.79it/s][A
215it [00:04, 45.02it/s][A
220it [00:04, 45.30it/s][A
227it [00:05, 44.99it/s]
 81%|████████  | 403/500 [48:06<10:26,  6.46s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.85it/s][A
10it [00:00, 43.35it/s][A
15it [00:00, 44.32it/s][A
20it [00:00, 45.00it/s][A
25it [00:00, 43.03it/s][A
30it [00:00, 43.93it/s][A
35it [00:00, 44.28it/s][A
40it [00:00, 44.64it/s][A
45it [00:01, 43.42it/s][A
50it [00:01, 43.31it/s][A
55it [00:01, 43.11it/s][A
60it [00:01, 43.21it/s][A
65it [00:01, 43.83it/s][A
70it [00:01, 43.68it/s][A
75it [00:01, 44.02it/s][A
80it [00:01, 43.72it/s][A
85it [00:01, 43.94it/s][A
90it [00:02, 44.26it/s][A
95it [00:02, 43.83it/s][A
100it [00:02, 42.82it/s][A
105it [00:02, 41.38it/s][A

Epoch: 404, Step: 100, Loss: 4.481105728149414



110it [00:02, 41.97it/s][A
115it [00:02, 42.79it/s][A
120it [00:02, 43.56it/s][A
125it [00:02, 44.12it/s][A
130it [00:02, 43.95it/s][A
135it [00:03, 44.21it/s][A
140it [00:03, 44.31it/s][A
145it [00:03, 43.96it/s][A
150it [00:03, 44.39it/s][A
155it [00:03, 42.34it/s][A
160it [00:03, 41.86it/s][A
165it [00:03, 42.76it/s][A
170it [00:03, 43.29it/s][A
175it [00:04, 43.19it/s][A
180it [00:04, 43.93it/s][A
185it [00:04, 44.31it/s][A
190it [00:04, 44.57it/s][A
195it [00:04, 44.84it/s][A
200it [00:04, 45.19it/s][A
205it [00:04, 44.98it/s][A

Epoch: 404, Step: 200, Loss: 4.494452095031738



210it [00:04, 45.11it/s][A
215it [00:04, 44.53it/s][A
220it [00:05, 44.88it/s][A
227it [00:05, 43.84it/s]
 81%|████████  | 404/500 [48:11<09:43,  6.07s/it]
0it [00:00, ?it/s][A
4it [00:00, 39.21it/s][A
9it [00:00, 42.95it/s][A
14it [00:00, 44.31it/s][A
19it [00:00, 44.49it/s][A
24it [00:00, 44.90it/s][A
29it [00:00, 45.15it/s][A
34it [00:00, 43.75it/s][A
39it [00:00, 44.45it/s][A
44it [00:00, 44.71it/s][A
49it [00:01, 43.51it/s][A
54it [00:01, 44.13it/s][A
59it [00:01, 44.45it/s][A
64it [00:01, 44.73it/s][A
69it [00:01, 45.10it/s][A
74it [00:01, 45.42it/s][A
79it [00:01, 44.15it/s][A
84it [00:01, 44.62it/s][A
89it [00:02, 44.54it/s][A
94it [00:02, 44.65it/s][A
99it [00:02, 44.90it/s][A
104it [00:02, 45.23it/s][A
109it [00:02, 45.42it/s][A

Epoch: 405, Step: 100, Loss: 4.483595671653748



114it [00:02, 45.47it/s][A
119it [00:02, 45.41it/s][A
124it [00:02, 45.32it/s][A
129it [00:02, 45.41it/s][A
134it [00:02, 45.02it/s][A
139it [00:03, 44.97it/s][A
144it [00:03, 44.91it/s][A
149it [00:03, 44.60it/s][A
154it [00:03, 44.78it/s][A
159it [00:03, 44.02it/s][A
164it [00:03, 44.37it/s][A
169it [00:03, 44.65it/s][A
174it [00:03, 43.42it/s][A
179it [00:04, 43.88it/s][A
184it [00:04, 42.93it/s][A
189it [00:04, 43.50it/s][A
194it [00:04, 43.81it/s][A
199it [00:04, 43.20it/s][A
204it [00:04, 43.87it/s][A
209it [00:04, 44.52it/s][A

Epoch: 405, Step: 200, Loss: 4.4928986978530885



214it [00:04, 44.53it/s][A
219it [00:04, 44.87it/s][A
227it [00:05, 44.37it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.78it/s][A
12it [00:00, 58.05it/s][A
19it [00:00, 59.12it/s][A
26it [00:00, 59.76it/s][A
33it [00:00, 60.02it/s][A
40it [00:00, 60.16it/s][A
47it [00:00, 60.00it/s][A
53it [00:00, 59.99it/s][A
59it [00:00, 59.54it/s][A
66it [00:01, 60.03it/s][A
73it [00:01, 60.22it/s][A
80it [00:01, 60.06it/s][A
87it [00:01, 58.95it/s][A
94it [00:01, 59.47it/s][A
100it [00:01, 59.53it/s][A
106it [00:01, 59.45it/s][A
112it [00:01, 59.54it/s][A
118it [00:01, 59.28it/s][A
125it [00:02, 59.69it/s][A
132it [00:02, 59.94it/s][A
138it [00:02, 59.56it/s][A
144it [00:02, 58.96it/s][A
151it [00:02, 59.43it/s][A
157it [00:02, 59.52it/s][A
163it [00:02, 57.04it/s][A
169it [00:02, 57.63it/s][A
175it [00:02, 58.22it/s][A
182it [00:03, 59.04it/s][A
189it [00:03, 59.66it/s][A
195it [00:03, 59.46it/s][A
201it [00:03, 59.16it/s][A
207it [00:03, 59.00it/s][A
213it [00:03, 


Epoch: 405, Test Loss: 5.552903098349246, Test Perplexity: 259.036312671922




0it [00:00, ?it/s][A
5it [00:00, 43.32it/s][A
10it [00:00, 44.66it/s][A
15it [00:00, 44.58it/s][A
20it [00:00, 44.85it/s][A
25it [00:00, 45.12it/s][A
30it [00:00, 45.12it/s][A
35it [00:00, 45.40it/s][A
40it [00:00, 43.94it/s][A
45it [00:01, 44.43it/s][A
50it [00:01, 44.13it/s][A
55it [00:01, 44.46it/s][A
60it [00:01, 44.65it/s][A
65it [00:01, 43.84it/s][A
70it [00:01, 44.44it/s][A
75it [00:01, 43.12it/s][A
80it [00:01, 43.95it/s][A
85it [00:01, 44.06it/s][A
90it [00:02, 44.16it/s][A
95it [00:02, 44.36it/s][A
100it [00:02, 44.73it/s][A
105it [00:02, 44.83it/s][A

Epoch: 406, Step: 100, Loss: 4.474714818000794



110it [00:02, 44.85it/s][A
115it [00:02, 45.08it/s][A
120it [00:02, 45.25it/s][A
125it [00:02, 44.79it/s][A
130it [00:02, 43.61it/s][A
135it [00:03, 43.90it/s][A
140it [00:03, 44.18it/s][A
145it [00:03, 44.68it/s][A
150it [00:03, 43.57it/s][A
155it [00:03, 43.84it/s][A
160it [00:03, 44.61it/s][A
165it [00:03, 45.25it/s][A
170it [00:03, 44.53it/s][A
175it [00:03, 44.24it/s][A
180it [00:04, 42.10it/s][A
185it [00:04, 42.76it/s][A
190it [00:04, 43.67it/s][A
195it [00:04, 44.51it/s][A
200it [00:04, 45.27it/s][A
205it [00:04, 45.36it/s][A

Epoch: 406, Step: 200, Loss: 4.495338590145111



210it [00:04, 45.38it/s][A
215it [00:04, 45.87it/s][A
220it [00:04, 46.09it/s][A
227it [00:05, 44.51it/s]
 81%|████████  | 406/500 [48:32<12:21,  7.88s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.21it/s][A
10it [00:00, 45.68it/s][A
15it [00:00, 45.63it/s][A
20it [00:00, 45.93it/s][A
25it [00:00, 45.97it/s][A
30it [00:00, 46.15it/s][A
35it [00:00, 46.29it/s][A
40it [00:00, 46.51it/s][A
45it [00:00, 46.31it/s][A
50it [00:01, 46.04it/s][A
55it [00:01, 45.98it/s][A
60it [00:01, 46.08it/s][A
65it [00:01, 45.90it/s][A
70it [00:01, 46.08it/s][A
75it [00:01, 45.92it/s][A
80it [00:01, 46.08it/s][A
85it [00:01, 46.30it/s][A
90it [00:01, 45.93it/s][A
95it [00:02, 45.67it/s][A
100it [00:02, 45.43it/s][A
105it [00:02, 45.45it/s][A

Epoch: 407, Step: 100, Loss: 4.48147485256195



110it [00:02, 45.47it/s][A
115it [00:02, 45.47it/s][A
120it [00:02, 45.55it/s][A
125it [00:02, 44.40it/s][A
130it [00:02, 44.88it/s][A
135it [00:02, 45.02it/s][A
140it [00:03, 43.45it/s][A
145it [00:03, 44.02it/s][A
150it [00:03, 45.13it/s][A
155it [00:03, 46.04it/s][A
160it [00:03, 46.49it/s][A
165it [00:03, 46.07it/s][A
170it [00:03, 45.75it/s][A
175it [00:03, 45.59it/s][A
180it [00:03, 45.55it/s][A
185it [00:04, 45.74it/s][A
190it [00:04, 45.90it/s][A
195it [00:04, 45.60it/s][A
200it [00:04, 45.61it/s][A
205it [00:04, 44.12it/s][A

Epoch: 407, Step: 200, Loss: 4.4933051896095275



210it [00:04, 43.62it/s][A
215it [00:04, 43.36it/s][A
220it [00:04, 43.93it/s][A
227it [00:05, 45.35it/s]
 81%|████████▏ | 407/500 [48:37<10:52,  7.02s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.35it/s][A
10it [00:00, 44.24it/s][A
15it [00:00, 44.87it/s][A
20it [00:00, 45.12it/s][A
25it [00:00, 45.27it/s][A
30it [00:00, 45.26it/s][A
35it [00:00, 45.31it/s][A
40it [00:00, 45.39it/s][A
45it [00:01, 44.45it/s][A
50it [00:01, 44.87it/s][A
55it [00:01, 45.24it/s][A
60it [00:01, 44.50it/s][A
65it [00:01, 44.77it/s][A
70it [00:01, 45.01it/s][A
75it [00:01, 45.16it/s][A
80it [00:01, 45.06it/s][A
85it [00:01, 44.97it/s][A
90it [00:02, 44.76it/s][A
95it [00:02, 44.93it/s][A
100it [00:02, 45.06it/s][A
105it [00:02, 44.87it/s][A

Epoch: 408, Step: 100, Loss: 4.477945489883423



110it [00:02, 44.85it/s][A
115it [00:02, 45.14it/s][A
120it [00:02, 45.27it/s][A
125it [00:02, 45.25it/s][A
130it [00:02, 44.88it/s][A
135it [00:03, 45.11it/s][A
140it [00:03, 44.93it/s][A
145it [00:03, 44.88it/s][A
150it [00:03, 44.84it/s][A
155it [00:03, 45.12it/s][A
160it [00:03, 45.25it/s][A
165it [00:03, 45.32it/s][A
170it [00:03, 45.13it/s][A
175it [00:03, 45.21it/s][A
180it [00:03, 45.02it/s][A
185it [00:04, 43.67it/s][A
190it [00:04, 43.40it/s][A
195it [00:04, 43.84it/s][A
200it [00:04, 44.46it/s][A
205it [00:04, 44.37it/s][A

Epoch: 408, Step: 200, Loss: 4.4922318625450135



210it [00:04, 44.22it/s][A
215it [00:04, 42.81it/s][A
220it [00:04, 42.80it/s][A
227it [00:05, 44.52it/s]
 82%|████████▏ | 408/500 [48:43<09:52,  6.45s/it]
0it [00:00, ?it/s][A
4it [00:00, 36.47it/s][A
9it [00:00, 39.61it/s][A
14it [00:00, 40.91it/s][A
19it [00:00, 41.66it/s][A
24it [00:00, 43.33it/s][A
29it [00:00, 44.11it/s][A
34it [00:00, 44.55it/s][A
39it [00:00, 44.83it/s][A
44it [00:01, 45.04it/s][A
49it [00:01, 45.29it/s][A
54it [00:01, 45.53it/s][A
59it [00:01, 45.63it/s][A
64it [00:01, 45.61it/s][A
69it [00:01, 45.63it/s][A
74it [00:01, 45.03it/s][A
79it [00:01, 44.86it/s][A
84it [00:01, 44.28it/s][A
89it [00:02, 44.57it/s][A
94it [00:02, 44.43it/s][A
99it [00:02, 44.62it/s][A
104it [00:02, 44.00it/s][A

Epoch: 409, Step: 100, Loss: 4.48396755695343



109it [00:02, 44.53it/s][A
114it [00:02, 44.94it/s][A
119it [00:02, 45.09it/s][A
124it [00:02, 45.19it/s][A
129it [00:02, 45.19it/s][A
134it [00:03, 44.99it/s][A
139it [00:03, 45.27it/s][A
144it [00:03, 45.13it/s][A
149it [00:03, 45.32it/s][A
154it [00:03, 45.13it/s][A
159it [00:03, 45.39it/s][A
164it [00:03, 45.46it/s][A
169it [00:03, 45.48it/s][A
174it [00:03, 43.96it/s][A
179it [00:04, 44.55it/s][A
184it [00:04, 44.99it/s][A
189it [00:04, 44.97it/s][A
194it [00:04, 45.16it/s][A
199it [00:04, 45.23it/s][A
204it [00:04, 45.40it/s][A
209it [00:04, 45.03it/s][A

Epoch: 409, Step: 200, Loss: 4.496553649902344



214it [00:04, 43.95it/s][A
219it [00:04, 42.19it/s][A
227it [00:05, 44.38it/s]
 82%|████████▏ | 409/500 [48:48<09:10,  6.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.26it/s][A
10it [00:00, 42.37it/s][A
15it [00:00, 43.51it/s][A
20it [00:00, 42.09it/s][A
25it [00:00, 42.87it/s][A
30it [00:00, 43.83it/s][A
35it [00:00, 44.53it/s][A
40it [00:00, 44.66it/s][A
45it [00:01, 45.02it/s][A
50it [00:01, 45.35it/s][A
55it [00:01, 45.59it/s][A
60it [00:01, 45.46it/s][A
65it [00:01, 45.72it/s][A
70it [00:01, 43.92it/s][A
75it [00:01, 44.44it/s][A
80it [00:01, 44.69it/s][A
85it [00:01, 44.87it/s][A
90it [00:02, 45.18it/s][A
95it [00:02, 45.09it/s][A
100it [00:02, 45.27it/s][A
105it [00:02, 45.41it/s][A

Epoch: 410, Step: 100, Loss: 4.477610840797424



110it [00:02, 45.35it/s][A
115it [00:02, 45.65it/s][A
120it [00:02, 45.53it/s][A
125it [00:02, 44.46it/s][A
130it [00:02, 44.87it/s][A
135it [00:03, 45.26it/s][A
140it [00:03, 45.48it/s][A
145it [00:03, 45.13it/s][A
150it [00:03, 43.72it/s][A
155it [00:03, 44.21it/s][A
160it [00:03, 43.22it/s][A
165it [00:03, 42.82it/s][A
170it [00:03, 43.20it/s][A
175it [00:03, 43.75it/s][A
180it [00:04, 42.86it/s][A
185it [00:04, 43.53it/s][A
190it [00:04, 44.13it/s][A
195it [00:04, 43.33it/s][A
200it [00:04, 43.65it/s][A
205it [00:04, 44.21it/s][A

Epoch: 410, Step: 200, Loss: 4.492251858711243



210it [00:04, 44.50it/s][A
215it [00:04, 44.51it/s][A
220it [00:04, 44.72it/s][A
227it [00:05, 44.29it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.29it/s][A
13it [00:00, 59.68it/s][A
19it [00:00, 59.35it/s][A
26it [00:00, 59.85it/s][A
32it [00:00, 59.48it/s][A
39it [00:00, 59.76it/s][A
46it [00:00, 60.09it/s][A
53it [00:00, 57.86it/s][A
59it [00:01, 58.41it/s][A
65it [00:01, 58.77it/s][A
72it [00:01, 59.53it/s][A
78it [00:01, 57.84it/s][A
84it [00:01, 57.75it/s][A
90it [00:01, 57.94it/s][A
96it [00:01, 58.48it/s][A
102it [00:01, 58.87it/s][A
109it [00:01, 59.49it/s][A
115it [00:01, 59.64it/s][A
122it [00:02, 60.14it/s][A
129it [00:02, 60.39it/s][A
136it [00:02, 60.54it/s][A
143it [00:02, 60.45it/s][A
150it [00:02, 60.59it/s][A
157it [00:02, 60.68it/s][A
164it [00:02, 60.69it/s][A
171it [00:02, 60.77it/s][A
178it [00:02, 60.84it/s][A
185it [00:03, 60.31it/s][A
192it [00:03, 60.46it/s][A
199it [00:03, 60.37it/s][A
206it [00:03, 60.35it/s][A
213it [00:03, 6


Epoch: 410, Test Loss: 5.5534420650197855, Test Perplexity: 259.13720814485725




0it [00:00, ?it/s][A
4it [00:00, 39.07it/s][A
8it [00:00, 39.23it/s][A
13it [00:00, 42.35it/s][A
18it [00:00, 43.71it/s][A
23it [00:00, 44.13it/s][A
28it [00:00, 44.99it/s][A
33it [00:00, 45.00it/s][A
38it [00:00, 45.31it/s][A
43it [00:00, 44.60it/s][A
48it [00:01, 44.50it/s][A
53it [00:01, 44.80it/s][A
58it [00:01, 45.11it/s][A
63it [00:01, 45.49it/s][A
68it [00:01, 44.38it/s][A
73it [00:01, 43.46it/s][A
78it [00:01, 43.88it/s][A
83it [00:01, 44.10it/s][A
88it [00:01, 44.07it/s][A
93it [00:02, 43.73it/s][A
98it [00:02, 43.81it/s][A
103it [00:02, 43.31it/s][A
108it [00:02, 43.96it/s][A

Epoch: 411, Step: 100, Loss: 4.4733587884902954



113it [00:02, 44.36it/s][A
118it [00:02, 44.78it/s][A
123it [00:02, 45.06it/s][A
128it [00:02, 45.41it/s][A
133it [00:02, 45.58it/s][A
138it [00:03, 45.89it/s][A
143it [00:03, 44.43it/s][A
148it [00:03, 43.71it/s][A
153it [00:03, 44.71it/s][A
158it [00:03, 44.40it/s][A
163it [00:03, 44.94it/s][A
168it [00:03, 45.23it/s][A
173it [00:03, 45.19it/s][A
178it [00:04, 45.43it/s][A
183it [00:04, 46.11it/s][A
188it [00:04, 46.31it/s][A
193it [00:04, 45.56it/s][A
198it [00:04, 43.46it/s][A
203it [00:04, 44.38it/s][A
208it [00:04, 44.61it/s][A

Epoch: 411, Step: 200, Loss: 4.49315699338913



213it [00:04, 44.50it/s][A
218it [00:04, 44.86it/s][A
227it [00:05, 44.54it/s]
 82%|████████▏ | 411/500 [49:09<11:38,  7.84s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.61it/s][A
10it [00:00, 45.87it/s][A
15it [00:00, 45.46it/s][A
20it [00:00, 45.16it/s][A
25it [00:00, 45.09it/s][A
30it [00:00, 45.21it/s][A
35it [00:00, 45.15it/s][A
40it [00:00, 45.34it/s][A
45it [00:00, 45.47it/s][A
50it [00:01, 45.75it/s][A
55it [00:01, 45.97it/s][A
60it [00:01, 45.25it/s][A
65it [00:01, 45.21it/s][A
70it [00:01, 43.94it/s][A
75it [00:01, 44.56it/s][A
80it [00:01, 44.37it/s][A
85it [00:01, 44.69it/s][A
90it [00:02, 43.69it/s][A
95it [00:02, 44.38it/s][A
100it [00:02, 42.74it/s][A
105it [00:02, 43.17it/s][A

Epoch: 412, Step: 100, Loss: 4.471353859901428



110it [00:02, 43.20it/s][A
115it [00:02, 43.57it/s][A
120it [00:02, 43.78it/s][A
125it [00:02, 44.50it/s][A
130it [00:02, 44.77it/s][A
135it [00:03, 44.98it/s][A
140it [00:03, 45.14it/s][A
145it [00:03, 45.01it/s][A
150it [00:03, 45.34it/s][A
155it [00:03, 45.49it/s][A
160it [00:03, 45.35it/s][A
165it [00:03, 45.25it/s][A
170it [00:03, 44.87it/s][A
175it [00:03, 45.08it/s][A
180it [00:04, 42.84it/s][A
185it [00:04, 42.90it/s][A
190it [00:04, 43.06it/s][A
195it [00:04, 43.77it/s][A
200it [00:04, 43.37it/s][A
205it [00:04, 43.73it/s][A

Epoch: 412, Step: 200, Loss: 4.491560218334198



210it [00:04, 42.76it/s][A
215it [00:04, 43.44it/s][A
220it [00:04, 43.85it/s][A
227it [00:05, 44.38it/s]
 82%|████████▏ | 412/500 [49:14<10:18,  7.03s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.37it/s][A
10it [00:00, 45.94it/s][A
15it [00:00, 45.90it/s][A
20it [00:00, 43.78it/s][A
25it [00:00, 42.67it/s][A
30it [00:00, 43.69it/s][A
35it [00:00, 44.46it/s][A
40it [00:00, 44.37it/s][A
45it [00:01, 44.64it/s][A
50it [00:01, 44.87it/s][A
55it [00:01, 44.92it/s][A
60it [00:01, 44.97it/s][A
65it [00:01, 45.06it/s][A
70it [00:01, 44.70it/s][A
75it [00:01, 43.74it/s][A
80it [00:01, 44.23it/s][A
85it [00:01, 44.48it/s][A
90it [00:02, 44.89it/s][A
95it [00:02, 45.12it/s][A
100it [00:02, 44.83it/s][A
105it [00:02, 44.96it/s][A

Epoch: 413, Step: 100, Loss: 4.4771092510223385



110it [00:02, 44.65it/s][A
115it [00:02, 44.97it/s][A
120it [00:02, 44.80it/s][A
125it [00:02, 44.52it/s][A
130it [00:02, 43.23it/s][A
135it [00:03, 43.78it/s][A
140it [00:03, 44.25it/s][A
145it [00:03, 44.25it/s][A
150it [00:03, 44.53it/s][A
155it [00:03, 44.41it/s][A
160it [00:03, 44.39it/s][A
165it [00:03, 44.50it/s][A
170it [00:03, 44.70it/s][A
175it [00:03, 45.01it/s][A
180it [00:04, 45.29it/s][A
185it [00:04, 45.42it/s][A
190it [00:04, 44.94it/s][A
195it [00:04, 43.81it/s][A
200it [00:04, 44.49it/s][A
205it [00:04, 44.48it/s][A

Epoch: 413, Step: 200, Loss: 4.492944309711456



210it [00:04, 43.36it/s][A
215it [00:04, 44.19it/s][A
220it [00:04, 44.77it/s][A
227it [00:05, 44.50it/s]
 83%|████████▎ | 413/500 [49:19<09:21,  6.45s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.48it/s][A
10it [00:00, 43.34it/s][A
15it [00:00, 43.77it/s][A
20it [00:00, 44.47it/s][A
25it [00:00, 44.53it/s][A
30it [00:00, 44.77it/s][A
35it [00:00, 43.20it/s][A
40it [00:00, 43.96it/s][A
45it [00:01, 42.74it/s][A
50it [00:01, 42.78it/s][A
55it [00:01, 43.69it/s][A
60it [00:01, 43.88it/s][A
65it [00:01, 42.71it/s][A
70it [00:01, 43.75it/s][A
75it [00:01, 44.55it/s][A
80it [00:01, 43.79it/s][A
85it [00:01, 44.26it/s][A
90it [00:02, 43.76it/s][A
95it [00:02, 44.23it/s][A
100it [00:02, 44.22it/s][A
105it [00:02, 44.53it/s][A

Epoch: 414, Step: 100, Loss: 4.478954911231995



110it [00:02, 44.78it/s][A
115it [00:02, 45.26it/s][A
120it [00:02, 42.68it/s][A
125it [00:02, 43.49it/s][A
130it [00:02, 42.88it/s][A
135it [00:03, 42.05it/s][A
140it [00:03, 42.08it/s][A
145it [00:03, 42.76it/s][A
150it [00:03, 43.70it/s][A
155it [00:03, 44.32it/s][A
160it [00:03, 44.68it/s][A
165it [00:03, 43.82it/s][A
170it [00:03, 44.54it/s][A
175it [00:04, 42.92it/s][A
180it [00:04, 43.79it/s][A
185it [00:04, 44.30it/s][A
190it [00:04, 44.67it/s][A
195it [00:04, 44.82it/s][A
200it [00:04, 45.06it/s][A
205it [00:04, 44.05it/s][A

Epoch: 414, Step: 200, Loss: 4.4904272389411926



210it [00:04, 44.10it/s][A
215it [00:04, 44.58it/s][A
220it [00:05, 44.92it/s][A
227it [00:05, 43.81it/s]
 83%|████████▎ | 414/500 [49:24<08:41,  6.07s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.29it/s][A
10it [00:00, 45.90it/s][A
15it [00:00, 42.97it/s][A
20it [00:00, 43.81it/s][A
25it [00:00, 44.52it/s][A
30it [00:00, 44.92it/s][A
35it [00:00, 45.23it/s][A
40it [00:00, 44.94it/s][A
45it [00:01, 45.38it/s][A
50it [00:01, 45.02it/s][A
55it [00:01, 45.12it/s][A
60it [00:01, 44.98it/s][A
65it [00:01, 44.88it/s][A
70it [00:01, 44.94it/s][A
75it [00:01, 44.90it/s][A
80it [00:01, 45.05it/s][A
85it [00:01, 45.02it/s][A
90it [00:02, 45.21it/s][A
95it [00:02, 45.02it/s][A
100it [00:02, 44.90it/s][A
105it [00:02, 44.29it/s][A

Epoch: 415, Step: 100, Loss: 4.480744080543518



110it [00:02, 44.66it/s][A
115it [00:02, 44.92it/s][A
120it [00:02, 43.27it/s][A
125it [00:02, 43.85it/s][A
130it [00:02, 44.21it/s][A
135it [00:03, 44.56it/s][A
140it [00:03, 44.56it/s][A
145it [00:03, 44.63it/s][A
150it [00:03, 43.37it/s][A
155it [00:03, 42.58it/s][A
160it [00:03, 42.48it/s][A
165it [00:03, 43.19it/s][A
170it [00:03, 43.57it/s][A
175it [00:03, 43.89it/s][A
180it [00:04, 44.32it/s][A
185it [00:04, 44.36it/s][A
190it [00:04, 44.35it/s][A
195it [00:04, 42.84it/s][A
200it [00:04, 42.96it/s][A
205it [00:04, 43.51it/s][A

Epoch: 415, Step: 200, Loss: 4.4921252727508545



210it [00:04, 42.82it/s][A
215it [00:04, 43.49it/s][A
220it [00:04, 43.75it/s][A
227it [00:05, 44.20it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.96it/s][A
12it [00:00, 56.97it/s][A
18it [00:00, 58.16it/s][A
25it [00:00, 59.38it/s][A
31it [00:00, 59.26it/s][A
38it [00:00, 59.83it/s][A
45it [00:00, 60.19it/s][A
52it [00:00, 58.04it/s][A
59it [00:01, 58.99it/s][A
65it [00:01, 59.26it/s][A
72it [00:01, 59.59it/s][A
79it [00:01, 59.88it/s][A
85it [00:01, 57.49it/s][A
92it [00:01, 58.48it/s][A
99it [00:01, 59.21it/s][A
105it [00:01, 58.31it/s][A
111it [00:01, 58.55it/s][A
117it [00:01, 57.80it/s][A
123it [00:02, 58.27it/s][A
129it [00:02, 57.38it/s][A
136it [00:02, 58.55it/s][A
143it [00:02, 59.33it/s][A
150it [00:02, 59.75it/s][A
157it [00:02, 60.01it/s][A
163it [00:02, 59.77it/s][A
170it [00:02, 60.25it/s][A
177it [00:02, 60.26it/s][A
184it [00:03, 60.01it/s][A
190it [00:03, 59.99it/s][A
196it [00:03, 57.58it/s][A
203it [00:03, 58.44it/s][A
209it [00:03, 5


Epoch: 415, Test Loss: 5.547631117127697, Test Perplexity: 257.7560731402095




0it [00:00, ?it/s][A
4it [00:00, 37.08it/s][A
9it [00:00, 41.40it/s][A
14it [00:00, 42.91it/s][A
19it [00:00, 44.01it/s][A
24it [00:00, 44.36it/s][A
29it [00:00, 44.65it/s][A
34it [00:00, 44.78it/s][A
39it [00:00, 44.10it/s][A
44it [00:01, 43.46it/s][A
49it [00:01, 44.16it/s][A
54it [00:01, 44.44it/s][A
59it [00:01, 44.53it/s][A
64it [00:01, 44.89it/s][A
69it [00:01, 45.12it/s][A
74it [00:01, 45.25it/s][A
79it [00:01, 45.24it/s][A
84it [00:01, 45.34it/s][A
89it [00:02, 45.38it/s][A
94it [00:02, 45.33it/s][A
99it [00:02, 45.44it/s][A
104it [00:02, 45.25it/s][A
109it [00:02, 45.31it/s][A

Epoch: 416, Step: 100, Loss: 4.4815764904022215



114it [00:02, 45.21it/s][A
119it [00:02, 43.96it/s][A
124it [00:02, 44.71it/s][A
129it [00:02, 44.88it/s][A
134it [00:03, 44.72it/s][A
139it [00:03, 44.84it/s][A
144it [00:03, 44.89it/s][A
149it [00:03, 45.09it/s][A
154it [00:03, 45.12it/s][A
159it [00:03, 45.05it/s][A
164it [00:03, 45.26it/s][A
169it [00:03, 45.40it/s][A
174it [00:03, 45.39it/s][A
179it [00:03, 45.38it/s][A
184it [00:04, 45.48it/s][A
189it [00:04, 45.38it/s][A
194it [00:04, 45.43it/s][A
199it [00:04, 45.42it/s][A
204it [00:04, 45.51it/s][A
209it [00:04, 45.00it/s]

Epoch: 416, Step: 200, Loss: 4.491225347518921


[A
214it [00:04, 43.79it/s][A
219it [00:04, 44.28it/s][A
227it [00:05, 44.77it/s]
 83%|████████▎ | 416/500 [49:45<10:59,  7.85s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.41it/s][A
10it [00:00, 44.90it/s][A
15it [00:00, 44.58it/s][A
20it [00:00, 44.93it/s][A
25it [00:00, 45.22it/s][A
30it [00:00, 44.91it/s][A
35it [00:00, 45.09it/s][A
40it [00:00, 45.07it/s][A
45it [00:00, 45.31it/s][A
50it [00:01, 45.28it/s][A
55it [00:01, 45.44it/s][A
60it [00:01, 45.68it/s][A
65it [00:01, 44.68it/s][A
70it [00:01, 44.87it/s][A
75it [00:01, 45.07it/s][A
80it [00:01, 43.86it/s][A
85it [00:01, 44.51it/s][A
90it [00:02, 44.83it/s][A
95it [00:02, 45.04it/s][A
100it [00:02, 45.08it/s][A
105it [00:02, 45.42it/s][A

Epoch: 417, Step: 100, Loss: 4.475233335494995



110it [00:02, 45.07it/s][A
115it [00:02, 45.21it/s][A
120it [00:02, 45.48it/s][A
125it [00:02, 45.56it/s][A
130it [00:02, 45.54it/s][A
135it [00:02, 45.69it/s][A
140it [00:03, 45.66it/s][A
145it [00:03, 45.76it/s][A
150it [00:03, 45.73it/s][A
155it [00:03, 45.87it/s][A
160it [00:03, 45.74it/s][A
165it [00:03, 45.50it/s][A
170it [00:03, 45.27it/s][A
175it [00:03, 45.16it/s][A
180it [00:03, 45.12it/s][A
185it [00:04, 44.99it/s][A
190it [00:04, 44.64it/s][A
195it [00:04, 44.53it/s][A
200it [00:04, 44.62it/s][A
205it [00:04, 44.77it/s][A

Epoch: 417, Step: 200, Loss: 4.488959610462189



210it [00:04, 43.71it/s][A
215it [00:04, 44.33it/s][A
220it [00:04, 44.57it/s][A
227it [00:05, 44.97it/s]
 83%|████████▎ | 417/500 [49:50<09:41,  7.01s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.80it/s][A
10it [00:00, 42.83it/s][A
15it [00:00, 44.06it/s][A
20it [00:00, 44.32it/s][A
25it [00:00, 44.55it/s][A
30it [00:00, 44.93it/s][A
35it [00:00, 44.76it/s][A
40it [00:00, 42.62it/s][A
45it [00:01, 43.38it/s][A
50it [00:01, 43.17it/s][A
55it [00:01, 42.88it/s][A
60it [00:01, 42.08it/s][A
65it [00:01, 42.16it/s][A
70it [00:01, 41.95it/s][A
75it [00:01, 42.95it/s][A
80it [00:01, 43.57it/s][A
85it [00:01, 44.07it/s][A
90it [00:02, 42.59it/s][A
95it [00:02, 43.28it/s][A
100it [00:02, 43.96it/s][A
105it [00:02, 44.40it/s][A

Epoch: 418, Step: 100, Loss: 4.477299709320068



110it [00:02, 44.68it/s][A
115it [00:02, 45.06it/s][A
120it [00:02, 45.18it/s][A
125it [00:02, 45.16it/s][A
130it [00:02, 43.26it/s][A
135it [00:03, 43.50it/s][A
140it [00:03, 43.99it/s][A
145it [00:03, 44.36it/s][A
150it [00:03, 44.41it/s][A
155it [00:03, 44.76it/s][A
160it [00:03, 44.03it/s][A
165it [00:03, 44.67it/s][A
170it [00:03, 44.93it/s][A
175it [00:03, 45.15it/s][A
180it [00:04, 43.47it/s][A
185it [00:04, 43.88it/s][A
190it [00:04, 44.08it/s][A
195it [00:04, 44.77it/s][A
200it [00:04, 45.03it/s][A
205it [00:04, 45.14it/s][A

Epoch: 418, Step: 200, Loss: 4.490208554267883



210it [00:04, 45.33it/s][A
215it [00:04, 45.19it/s][A
220it [00:04, 45.17it/s][A
227it [00:05, 44.09it/s]
 84%|████████▎ | 418/500 [49:55<08:49,  6.45s/it]
0it [00:00, ?it/s][A
4it [00:00, 38.14it/s][A
9it [00:00, 42.29it/s][A
14it [00:00, 43.09it/s][A
19it [00:00, 44.05it/s][A
24it [00:00, 44.82it/s][A
29it [00:00, 43.01it/s][A
34it [00:00, 42.70it/s][A
39it [00:00, 43.40it/s][A
44it [00:01, 44.06it/s][A
49it [00:01, 44.46it/s][A
54it [00:01, 44.61it/s][A
59it [00:01, 44.72it/s][A
64it [00:01, 44.72it/s][A
69it [00:01, 45.14it/s][A
74it [00:01, 45.06it/s][A
79it [00:01, 45.04it/s][A
84it [00:01, 45.11it/s][A
89it [00:02, 44.93it/s][A
94it [00:02, 45.08it/s][A
99it [00:02, 45.37it/s][A
104it [00:02, 45.57it/s][A

Epoch: 419, Step: 100, Loss: 4.481219778060913



109it [00:02, 43.54it/s][A
114it [00:02, 43.90it/s][A
119it [00:02, 44.50it/s][A
124it [00:02, 44.92it/s][A
129it [00:02, 44.83it/s][A
134it [00:03, 45.19it/s][A
139it [00:03, 45.32it/s][A
144it [00:03, 45.28it/s][A
149it [00:03, 45.55it/s][A
154it [00:03, 45.67it/s][A
159it [00:03, 44.79it/s][A
164it [00:03, 45.12it/s][A
169it [00:03, 44.74it/s][A
174it [00:03, 43.24it/s][A
179it [00:04, 43.54it/s][A
184it [00:04, 44.20it/s][A
189it [00:04, 44.73it/s][A
194it [00:04, 45.02it/s][A
199it [00:04, 45.26it/s][A
204it [00:04, 44.98it/s][A
209it [00:04, 45.06it/s][A

Epoch: 419, Step: 200, Loss: 4.488045485019684



214it [00:04, 44.66it/s][A
219it [00:04, 44.67it/s][A
227it [00:05, 44.45it/s]
 84%|████████▍ | 419/500 [50:00<08:10,  6.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.52it/s][A
10it [00:00, 41.84it/s][A
15it [00:00, 42.48it/s][A
20it [00:00, 41.57it/s][A
25it [00:00, 42.88it/s][A
30it [00:00, 43.86it/s][A
35it [00:00, 44.33it/s][A
40it [00:00, 44.59it/s][A
45it [00:01, 42.90it/s][A
50it [00:01, 42.16it/s][A
55it [00:01, 43.20it/s][A
60it [00:01, 43.02it/s][A
65it [00:01, 43.19it/s][A
70it [00:01, 43.79it/s][A
75it [00:01, 44.27it/s][A
80it [00:01, 43.84it/s][A
85it [00:01, 44.02it/s][A
90it [00:02, 44.14it/s][A
95it [00:02, 44.57it/s][A
100it [00:02, 44.36it/s][A
105it [00:02, 44.36it/s][A

Epoch: 420, Step: 100, Loss: 4.480389046669006



110it [00:02, 44.01it/s][A
115it [00:02, 43.50it/s][A
120it [00:02, 43.43it/s][A
125it [00:02, 43.89it/s][A
130it [00:02, 44.40it/s][A
135it [00:03, 44.24it/s][A
140it [00:03, 44.08it/s][A
145it [00:03, 44.28it/s][A
150it [00:03, 44.51it/s][A
155it [00:03, 44.39it/s][A
160it [00:03, 44.64it/s][A
165it [00:03, 44.65it/s][A
170it [00:03, 44.61it/s][A
175it [00:03, 44.93it/s][A
180it [00:04, 44.87it/s][A
185it [00:04, 43.40it/s][A
190it [00:04, 43.05it/s][A
195it [00:04, 43.72it/s][A
200it [00:04, 43.72it/s][A
205it [00:04, 42.98it/s][A

Epoch: 420, Step: 200, Loss: 4.489003410339356



210it [00:04, 43.53it/s][A
215it [00:04, 42.48it/s][A
220it [00:05, 41.79it/s][A
227it [00:05, 43.55it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.28it/s][A
13it [00:00, 60.18it/s][A
20it [00:00, 56.20it/s][A
27it [00:00, 57.92it/s][A
33it [00:00, 58.36it/s][A
40it [00:00, 59.16it/s][A
46it [00:00, 56.77it/s][A
52it [00:00, 57.66it/s][A
58it [00:01, 57.47it/s][A
65it [00:01, 58.88it/s][A
72it [00:01, 59.62it/s][A
79it [00:01, 60.12it/s][A
86it [00:01, 60.52it/s][A
93it [00:01, 60.39it/s][A
100it [00:01, 60.59it/s][A
107it [00:01, 60.83it/s][A
114it [00:01, 61.23it/s][A
121it [00:02, 61.32it/s][A
128it [00:02, 61.26it/s][A
135it [00:02, 61.44it/s][A
142it [00:02, 61.30it/s][A
149it [00:02, 61.21it/s][A
156it [00:02, 61.22it/s][A
163it [00:02, 60.94it/s][A
170it [00:02, 60.74it/s][A
177it [00:02, 59.39it/s][A
184it [00:03, 60.14it/s][A
191it [00:03, 60.43it/s][A
198it [00:03, 60.39it/s][A
205it [00:03, 60.87it/s][A
212it [00:03, 60.96it/s][A
219it [00:03, 


Epoch: 420, Test Loss: 5.546933822750304, Test Perplexity: 257.4422056067804




0it [00:00, ?it/s][A
5it [00:00, 44.56it/s][A
10it [00:00, 44.25it/s][A
15it [00:00, 44.36it/s][A
20it [00:00, 44.03it/s][A
25it [00:00, 44.25it/s][A
30it [00:00, 42.91it/s][A
35it [00:00, 43.26it/s][A
40it [00:00, 44.16it/s][A
45it [00:01, 44.66it/s][A
50it [00:01, 44.78it/s][A
55it [00:01, 45.18it/s][A
60it [00:01, 45.03it/s][A
65it [00:01, 45.27it/s][A
70it [00:01, 45.35it/s][A
75it [00:01, 45.47it/s][A
80it [00:01, 45.56it/s][A
85it [00:01, 45.51it/s][A
90it [00:02, 45.71it/s][A
95it [00:02, 45.67it/s][A
100it [00:02, 44.60it/s][A
105it [00:02, 45.04it/s][A

Epoch: 421, Step: 100, Loss: 4.485078043937683



110it [00:02, 44.90it/s][A
115it [00:02, 45.07it/s][A
120it [00:02, 45.26it/s][A
125it [00:02, 45.38it/s][A
130it [00:02, 44.09it/s][A
135it [00:03, 43.47it/s][A
140it [00:03, 44.18it/s][A
145it [00:03, 44.72it/s][A
150it [00:03, 44.94it/s][A
155it [00:03, 45.21it/s][A
160it [00:03, 44.48it/s][A
165it [00:03, 44.77it/s][A
170it [00:03, 44.83it/s][A
175it [00:03, 45.01it/s][A
180it [00:04, 45.12it/s][A
185it [00:04, 44.75it/s][A
190it [00:04, 44.95it/s][A
195it [00:04, 45.15it/s][A
200it [00:04, 45.20it/s][A
205it [00:04, 45.28it/s][A

Epoch: 421, Step: 200, Loss: 4.490923748016358



210it [00:04, 45.35it/s][A
215it [00:04, 45.36it/s][A
220it [00:04, 45.30it/s][A
227it [00:05, 44.76it/s]
 84%|████████▍ | 421/500 [50:22<10:22,  7.88s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.32it/s][A
10it [00:00, 45.63it/s][A
15it [00:00, 45.57it/s][A
20it [00:00, 45.50it/s][A
25it [00:00, 45.46it/s][A
30it [00:00, 45.60it/s][A
35it [00:00, 45.46it/s][A
40it [00:00, 45.50it/s][A
45it [00:00, 45.60it/s][A
50it [00:01, 45.47it/s][A
55it [00:01, 45.45it/s][A
60it [00:01, 45.53it/s][A
65it [00:01, 45.61it/s][A
70it [00:01, 45.75it/s][A
75it [00:01, 45.44it/s][A
80it [00:01, 45.54it/s][A
85it [00:01, 44.89it/s][A
90it [00:01, 45.33it/s][A
95it [00:02, 45.44it/s][A
100it [00:02, 45.34it/s][A
105it [00:02, 45.48it/s][A

Epoch: 422, Step: 100, Loss: 4.481542615890503



110it [00:02, 45.37it/s][A
115it [00:02, 45.49it/s][A
120it [00:02, 45.35it/s][A
125it [00:02, 45.32it/s][A
130it [00:02, 45.41it/s][A
135it [00:02, 45.39it/s][A
140it [00:03, 44.75it/s][A
145it [00:03, 44.63it/s][A
150it [00:03, 43.95it/s][A
155it [00:03, 43.90it/s][A
160it [00:03, 44.27it/s][A
165it [00:03, 44.31it/s][A
170it [00:03, 44.65it/s][A
175it [00:03, 45.04it/s][A
180it [00:03, 44.82it/s][A
185it [00:04, 44.30it/s][A
190it [00:04, 42.59it/s][A
195it [00:04, 42.69it/s][A
200it [00:04, 43.21it/s][A
205it [00:04, 43.46it/s][A

Epoch: 422, Step: 200, Loss: 4.490960257053375



210it [00:04, 43.06it/s][A
215it [00:04, 43.66it/s][A
220it [00:04, 43.15it/s][A
227it [00:05, 44.65it/s]
 84%|████████▍ | 422/500 [50:27<09:09,  7.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.99it/s][A
10it [00:00, 45.63it/s][A
15it [00:00, 42.38it/s][A
20it [00:00, 43.31it/s][A
25it [00:00, 42.77it/s][A
30it [00:00, 43.17it/s][A
35it [00:00, 43.62it/s][A
40it [00:00, 44.09it/s][A
45it [00:01, 44.73it/s][A
50it [00:01, 43.63it/s][A
55it [00:01, 43.55it/s][A
60it [00:01, 42.11it/s][A
65it [00:01, 42.95it/s][A
70it [00:01, 43.66it/s][A
75it [00:01, 41.99it/s][A
80it [00:01, 43.19it/s][A
85it [00:01, 42.05it/s][A
90it [00:02, 43.00it/s][A
95it [00:02, 43.57it/s][A
100it [00:02, 43.93it/s][A
105it [00:02, 43.94it/s][A

Epoch: 423, Step: 100, Loss: 4.476188931465149



110it [00:02, 44.02it/s][A
115it [00:02, 44.32it/s][A
120it [00:02, 44.64it/s][A
125it [00:02, 44.90it/s][A
130it [00:02, 45.06it/s][A
135it [00:03, 44.69it/s][A
140it [00:03, 44.85it/s][A
145it [00:03, 44.87it/s][A
150it [00:03, 45.05it/s][A
155it [00:03, 45.04it/s][A
160it [00:03, 45.08it/s][A
165it [00:03, 43.80it/s][A
170it [00:03, 44.29it/s][A
175it [00:03, 44.62it/s][A
180it [00:04, 44.80it/s][A
185it [00:04, 44.76it/s][A
190it [00:04, 44.95it/s][A
195it [00:04, 44.88it/s][A
200it [00:04, 43.37it/s][A
205it [00:04, 44.02it/s][A

Epoch: 423, Step: 200, Loss: 4.488735063076019



210it [00:04, 44.57it/s][A
215it [00:04, 44.50it/s][A
220it [00:04, 44.33it/s][A
227it [00:05, 43.99it/s]
 85%|████████▍ | 423/500 [50:32<08:18,  6.48s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.00it/s][A
10it [00:00, 41.64it/s][A
15it [00:00, 42.42it/s][A
20it [00:00, 43.58it/s][A
25it [00:00, 43.88it/s][A
30it [00:00, 42.38it/s][A
35it [00:00, 43.37it/s][A
40it [00:00, 44.01it/s][A
45it [00:01, 43.36it/s][A
50it [00:01, 43.64it/s][A
55it [00:01, 44.08it/s][A
60it [00:01, 44.46it/s][A
65it [00:01, 44.73it/s][A
70it [00:01, 43.67it/s][A
75it [00:01, 44.32it/s][A
80it [00:01, 44.10it/s][A
85it [00:01, 44.31it/s][A
90it [00:02, 44.60it/s][A
95it [00:02, 44.69it/s][A
100it [00:02, 44.71it/s][A
105it [00:02, 44.96it/s][A

Epoch: 424, Step: 100, Loss: 4.474762258529663



110it [00:02, 44.99it/s][A
115it [00:02, 45.16it/s][A
120it [00:02, 45.27it/s][A
125it [00:02, 45.46it/s][A
130it [00:02, 43.88it/s][A
135it [00:03, 44.28it/s][A
140it [00:03, 44.68it/s][A
145it [00:03, 44.98it/s][A
150it [00:03, 45.00it/s][A
155it [00:03, 45.02it/s][A
160it [00:03, 44.95it/s][A
165it [00:03, 45.12it/s][A
170it [00:03, 45.00it/s][A
175it [00:03, 44.89it/s][A
180it [00:04, 44.93it/s][A
185it [00:04, 44.94it/s][A
190it [00:04, 42.54it/s][A
195it [00:04, 43.16it/s][A
200it [00:04, 41.51it/s][A
205it [00:04, 42.49it/s][A

Epoch: 424, Step: 200, Loss: 4.489627053737641



210it [00:04, 43.05it/s][A
215it [00:04, 43.63it/s][A
220it [00:04, 44.18it/s][A
227it [00:05, 44.12it/s]
 85%|████████▍ | 424/500 [50:37<07:42,  6.08s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.17it/s][A
10it [00:00, 42.72it/s][A
15it [00:00, 42.19it/s][A
20it [00:00, 43.08it/s][A
25it [00:00, 43.48it/s][A
30it [00:00, 44.06it/s][A
35it [00:00, 44.42it/s][A
40it [00:00, 44.39it/s][A
45it [00:01, 44.59it/s][A
50it [00:01, 44.72it/s][A
55it [00:01, 44.98it/s][A
60it [00:01, 45.23it/s][A
65it [00:01, 45.55it/s][A
70it [00:01, 45.94it/s][A
75it [00:01, 46.16it/s][A
80it [00:01, 44.52it/s][A
85it [00:01, 45.25it/s][A
90it [00:02, 45.48it/s][A
95it [00:02, 45.59it/s][A
100it [00:02, 45.18it/s][A
105it [00:02, 45.41it/s][A

Epoch: 425, Step: 100, Loss: 4.475317602157593



110it [00:02, 45.70it/s][A
115it [00:02, 45.38it/s][A
120it [00:02, 45.59it/s][A
125it [00:02, 45.83it/s][A
130it [00:02, 43.78it/s][A
135it [00:03, 42.56it/s][A
140it [00:03, 43.23it/s][A
145it [00:03, 43.87it/s][A
150it [00:03, 43.31it/s][A
155it [00:03, 43.76it/s][A
160it [00:03, 43.01it/s][A
165it [00:03, 43.92it/s][A
170it [00:03, 44.32it/s][A
175it [00:03, 44.74it/s][A
180it [00:04, 43.14it/s][A
185it [00:04, 43.42it/s][A
190it [00:04, 44.34it/s][A
195it [00:04, 42.87it/s][A
200it [00:04, 43.88it/s][A
205it [00:04, 44.54it/s][A

Epoch: 425, Step: 200, Loss: 4.488271441459656



210it [00:04, 44.68it/s][A
215it [00:04, 45.01it/s][A
220it [00:04, 45.40it/s][A
227it [00:05, 44.42it/s]

0it [00:00, ?it/s][A
6it [00:00, 52.86it/s][A
13it [00:00, 57.34it/s][A
19it [00:00, 54.90it/s][A
26it [00:00, 57.38it/s][A
33it [00:00, 58.62it/s][A
40it [00:00, 59.55it/s][A
47it [00:00, 60.28it/s][A
54it [00:00, 60.33it/s][A
61it [00:01, 61.02it/s][A
68it [00:01, 61.63it/s][A
75it [00:01, 61.78it/s][A
82it [00:01, 61.32it/s][A
89it [00:01, 61.08it/s][A
96it [00:01, 60.84it/s][A
103it [00:01, 60.46it/s][A
110it [00:01, 60.63it/s][A
117it [00:01, 60.44it/s][A
124it [00:02, 60.10it/s][A
131it [00:02, 57.97it/s][A
138it [00:02, 58.77it/s][A
144it [00:02, 58.79it/s][A
150it [00:02, 56.67it/s][A
157it [00:02, 57.86it/s][A
164it [00:02, 58.58it/s][A
170it [00:02, 58.90it/s][A
176it [00:02, 58.96it/s][A
182it [00:03, 59.14it/s][A
188it [00:03, 59.35it/s][A
194it [00:03, 59.30it/s][A
201it [00:03, 59.74it/s][A
207it [00:03, 59.52it/s][A
214it [00:03, 


Epoch: 425, Test Loss: 5.556635165806883, Test Perplexity: 260.03647187037495




0it [00:00, ?it/s][A
5it [00:00, 44.85it/s][A
10it [00:00, 44.51it/s][A
15it [00:00, 44.88it/s][A
20it [00:00, 43.22it/s][A
25it [00:00, 43.72it/s][A
30it [00:00, 44.12it/s][A
35it [00:00, 44.70it/s][A
40it [00:00, 44.08it/s][A
45it [00:01, 44.80it/s][A
50it [00:01, 44.54it/s][A
55it [00:01, 44.98it/s][A
60it [00:01, 45.02it/s][A
65it [00:01, 45.19it/s][A
70it [00:01, 45.22it/s][A
75it [00:01, 43.76it/s][A
80it [00:01, 44.30it/s][A
85it [00:01, 44.22it/s][A
90it [00:02, 42.83it/s][A
95it [00:02, 42.27it/s][A
100it [00:02, 43.02it/s][A
105it [00:02, 43.57it/s][A

Epoch: 426, Step: 100, Loss: 4.475597643852234



110it [00:02, 44.02it/s][A
115it [00:02, 44.28it/s][A
120it [00:02, 44.81it/s][A
125it [00:02, 43.96it/s][A
130it [00:02, 44.31it/s][A
135it [00:03, 44.28it/s][A
140it [00:03, 44.25it/s][A
145it [00:03, 42.90it/s][A
150it [00:03, 43.50it/s][A
155it [00:03, 42.98it/s][A
160it [00:03, 43.76it/s][A
165it [00:03, 44.42it/s][A
170it [00:03, 43.17it/s][A
175it [00:03, 43.84it/s][A
180it [00:04, 42.67it/s][A
185it [00:04, 43.52it/s][A
190it [00:04, 43.97it/s][A
195it [00:04, 44.07it/s][A
200it [00:04, 44.49it/s][A
205it [00:04, 44.00it/s][A

Epoch: 426, Step: 200, Loss: 4.487919187545776



210it [00:04, 43.70it/s][A
215it [00:04, 44.35it/s][A
220it [00:04, 44.89it/s][A
227it [00:05, 44.06it/s]
 85%|████████▌ | 426/500 [50:58<09:43,  7.89s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.92it/s][A
10it [00:00, 46.13it/s][A
15it [00:00, 45.29it/s][A
20it [00:00, 42.74it/s][A
25it [00:00, 43.60it/s][A
30it [00:00, 44.42it/s][A
35it [00:00, 43.85it/s][A
40it [00:00, 42.43it/s][A
45it [00:01, 42.75it/s][A
50it [00:01, 41.15it/s][A
55it [00:01, 41.62it/s][A
60it [00:01, 41.06it/s][A
65it [00:01, 41.64it/s][A
70it [00:01, 42.75it/s][A
75it [00:01, 43.71it/s][A
80it [00:01, 44.15it/s][A
85it [00:01, 43.91it/s][A
90it [00:02, 44.10it/s][A
95it [00:02, 43.81it/s][A
100it [00:02, 43.77it/s][A
105it [00:02, 43.83it/s][A

Epoch: 427, Step: 100, Loss: 4.483131785392761



110it [00:02, 43.42it/s][A
115it [00:02, 42.80it/s][A
120it [00:02, 43.84it/s][A
125it [00:02, 44.21it/s][A
130it [00:02, 44.59it/s][A
135it [00:03, 44.80it/s][A
140it [00:03, 45.02it/s][A
145it [00:03, 45.26it/s][A
150it [00:03, 45.20it/s][A
155it [00:03, 45.23it/s][A
160it [00:03, 45.34it/s][A
165it [00:03, 45.46it/s][A
170it [00:03, 44.00it/s][A
175it [00:04, 43.18it/s][A
180it [00:04, 42.77it/s][A
185it [00:04, 43.56it/s][A
190it [00:04, 44.24it/s][A
195it [00:04, 44.54it/s][A
200it [00:04, 44.73it/s][A
205it [00:04, 43.71it/s][A

Epoch: 427, Step: 200, Loss: 4.489379558563233



210it [00:04, 43.35it/s][A
215it [00:04, 44.16it/s][A
220it [00:05, 43.11it/s][A
227it [00:05, 43.59it/s]
 85%|████████▌ | 427/500 [51:03<08:37,  7.08s/it]
0it [00:00, ?it/s][A
4it [00:00, 37.60it/s][A
9it [00:00, 41.73it/s][A
14it [00:00, 43.58it/s][A
19it [00:00, 44.00it/s][A
24it [00:00, 44.55it/s][A
29it [00:00, 44.68it/s][A
34it [00:00, 44.59it/s][A
39it [00:00, 43.34it/s][A
44it [00:01, 44.30it/s][A
49it [00:01, 44.71it/s][A
54it [00:01, 45.12it/s][A
59it [00:01, 45.00it/s][A
64it [00:01, 45.26it/s][A
69it [00:01, 45.48it/s][A
74it [00:01, 45.49it/s][A
79it [00:01, 45.08it/s][A
84it [00:01, 45.16it/s][A
89it [00:01, 44.89it/s][A
94it [00:02, 42.85it/s][A
99it [00:02, 43.11it/s][A
104it [00:02, 43.87it/s][A

Epoch: 428, Step: 100, Loss: 4.472510395050048



109it [00:02, 42.74it/s][A
114it [00:02, 43.68it/s][A
119it [00:02, 44.27it/s][A
124it [00:02, 42.88it/s][A
129it [00:02, 43.29it/s][A
134it [00:03, 43.84it/s][A
139it [00:03, 44.33it/s][A
144it [00:03, 43.01it/s][A
149it [00:03, 43.68it/s][A
154it [00:03, 42.87it/s][A
159it [00:03, 43.94it/s][A
164it [00:03, 43.20it/s][A
169it [00:03, 43.79it/s][A
174it [00:03, 43.76it/s][A
179it [00:04, 44.03it/s][A
184it [00:04, 43.36it/s][A
189it [00:04, 42.31it/s][A
194it [00:04, 43.11it/s][A
199it [00:04, 43.47it/s][A
204it [00:04, 43.34it/s][A
209it [00:04, 43.94it/s][A

Epoch: 428, Step: 200, Loss: 4.486864457130432



214it [00:04, 42.85it/s][A
219it [00:05, 42.34it/s][A
227it [00:05, 43.77it/s]
 86%|████████▌ | 428/500 [51:09<07:49,  6.51s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.90it/s][A
10it [00:00, 41.72it/s][A
15it [00:00, 43.46it/s][A
20it [00:00, 44.30it/s][A
25it [00:00, 44.91it/s][A
30it [00:00, 43.93it/s][A
35it [00:00, 42.94it/s][A
40it [00:00, 44.03it/s][A
45it [00:01, 44.34it/s][A
50it [00:01, 44.85it/s][A
55it [00:01, 45.11it/s][A
60it [00:01, 45.31it/s][A
65it [00:01, 45.54it/s][A
70it [00:01, 44.32it/s][A
75it [00:01, 44.61it/s][A
80it [00:01, 44.90it/s][A
85it [00:01, 45.29it/s][A
90it [00:02, 45.05it/s][A
95it [00:02, 45.33it/s][A
100it [00:02, 45.50it/s][A
105it [00:02, 45.56it/s][A

Epoch: 429, Step: 100, Loss: 4.474535994529724



110it [00:02, 43.60it/s][A
115it [00:02, 43.71it/s][A
120it [00:02, 44.47it/s][A
125it [00:02, 44.92it/s][A
130it [00:02, 45.14it/s][A
135it [00:03, 44.99it/s][A
140it [00:03, 45.26it/s][A
145it [00:03, 44.83it/s][A
150it [00:03, 45.12it/s][A
155it [00:03, 45.32it/s][A
160it [00:03, 44.54it/s][A
165it [00:03, 44.85it/s][A
170it [00:03, 45.31it/s][A
175it [00:03, 45.34it/s][A
180it [00:04, 44.16it/s][A
185it [00:04, 44.89it/s][A
190it [00:04, 45.27it/s][A
195it [00:04, 45.44it/s][A
200it [00:04, 45.64it/s][A
205it [00:04, 45.86it/s][A

Epoch: 429, Step: 200, Loss: 4.487856435775757



210it [00:04, 43.88it/s][A
215it [00:04, 44.71it/s][A
220it [00:04, 44.43it/s][A
227it [00:05, 44.72it/s]
 86%|████████▌ | 429/500 [51:14<07:11,  6.08s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.77it/s][A
10it [00:00, 45.82it/s][A
15it [00:00, 45.83it/s][A
20it [00:00, 45.47it/s][A
25it [00:00, 45.59it/s][A
30it [00:00, 44.23it/s][A
35it [00:00, 44.87it/s][A
40it [00:00, 45.37it/s][A
45it [00:00, 45.57it/s][A
50it [00:01, 45.56it/s][A
55it [00:01, 45.62it/s][A
60it [00:01, 45.85it/s][A
65it [00:01, 46.40it/s][A
70it [00:01, 46.05it/s][A
75it [00:01, 45.68it/s][A
80it [00:01, 45.49it/s][A
85it [00:01, 45.14it/s][A
90it [00:01, 44.89it/s][A
95it [00:02, 44.95it/s][A
100it [00:02, 45.25it/s][A
105it [00:02, 44.86it/s][A

Epoch: 430, Step: 100, Loss: 4.476613445281982



110it [00:02, 43.39it/s][A
115it [00:02, 43.31it/s][A
120it [00:02, 43.65it/s][A
125it [00:02, 44.18it/s][A
130it [00:02, 42.59it/s][A
135it [00:03, 43.39it/s][A
140it [00:03, 43.91it/s][A
145it [00:03, 44.15it/s][A
150it [00:03, 44.31it/s][A
155it [00:03, 43.58it/s][A
160it [00:03, 42.99it/s][A
165it [00:03, 43.16it/s][A
170it [00:03, 42.23it/s][A
175it [00:03, 42.96it/s][A
180it [00:04, 41.82it/s][A
185it [00:04, 42.75it/s][A
190it [00:04, 43.36it/s][A
195it [00:04, 43.63it/s][A
200it [00:04, 43.83it/s][A
205it [00:04, 44.04it/s][A

Epoch: 430, Step: 200, Loss: 4.486319494247437



210it [00:04, 42.79it/s][A
215it [00:04, 43.31it/s][A
220it [00:04, 43.90it/s][A
227it [00:05, 44.23it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.38it/s][A
12it [00:00, 57.69it/s][A
18it [00:00, 55.69it/s][A
24it [00:00, 55.54it/s][A
30it [00:00, 56.65it/s][A
36it [00:00, 55.10it/s][A
42it [00:00, 56.55it/s][A
48it [00:00, 55.13it/s][A
55it [00:00, 56.95it/s][A
62it [00:01, 58.06it/s][A
69it [00:01, 58.76it/s][A
75it [00:01, 58.99it/s][A
81it [00:01, 59.13it/s][A
87it [00:01, 59.19it/s][A
93it [00:01, 59.40it/s][A
100it [00:01, 59.71it/s][A
106it [00:01, 59.66it/s][A
112it [00:01, 59.70it/s][A
119it [00:02, 59.86it/s][A
125it [00:02, 59.56it/s][A
132it [00:02, 60.02it/s][A
138it [00:02, 59.79it/s][A
144it [00:02, 59.58it/s][A
151it [00:02, 59.79it/s][A
158it [00:02, 60.18it/s][A
165it [00:02, 60.11it/s][A
172it [00:02, 60.21it/s][A
179it [00:03, 60.32it/s][A
186it [00:03, 60.31it/s][A
193it [00:03, 60.01it/s][A
200it [00:03, 59.96it/s][A
206it [00:03, 5


Epoch: 430, Test Loss: 5.554919924795257, Test Perplexity: 259.5001711875015




0it [00:00, ?it/s][A
4it [00:00, 37.45it/s][A
8it [00:00, 36.78it/s][A
13it [00:00, 40.38it/s][A
18it [00:00, 40.32it/s][A
23it [00:00, 41.53it/s][A
28it [00:00, 42.96it/s][A
33it [00:00, 43.45it/s][A
38it [00:00, 44.33it/s][A
43it [00:01, 42.80it/s][A
48it [00:01, 43.70it/s][A
53it [00:01, 44.21it/s][A
58it [00:01, 44.56it/s][A
63it [00:01, 44.43it/s][A
68it [00:01, 44.76it/s][A
73it [00:01, 44.86it/s][A
78it [00:01, 45.16it/s][A
83it [00:01, 44.98it/s][A
88it [00:02, 45.11it/s][A
93it [00:02, 45.13it/s][A
98it [00:02, 45.20it/s][A
103it [00:02, 45.36it/s][A
108it [00:02, 45.57it/s][A

Epoch: 431, Step: 100, Loss: 4.480010499954224



113it [00:02, 45.21it/s][A
118it [00:02, 45.16it/s][A
123it [00:02, 45.34it/s][A
128it [00:02, 44.97it/s][A
133it [00:03, 45.15it/s][A
138it [00:03, 44.87it/s][A
143it [00:03, 44.91it/s][A
148it [00:03, 44.88it/s][A
153it [00:03, 44.89it/s][A
158it [00:03, 45.02it/s][A
163it [00:03, 44.59it/s][A
168it [00:03, 44.41it/s][A
173it [00:03, 44.26it/s][A
178it [00:04, 44.44it/s][A
183it [00:04, 44.88it/s][A
188it [00:04, 44.89it/s][A
193it [00:04, 43.84it/s][A
198it [00:04, 42.92it/s][A
203it [00:04, 43.15it/s][A
208it [00:04, 43.19it/s][A

Epoch: 431, Step: 200, Loss: 4.486105945110321



213it [00:04, 41.53it/s][A
218it [00:04, 41.73it/s][A
227it [00:05, 43.82it/s]
 86%|████████▌ | 431/500 [51:35<09:05,  7.91s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.50it/s][A
10it [00:00, 46.49it/s][A
15it [00:00, 45.62it/s][A
20it [00:00, 45.65it/s][A
25it [00:00, 45.68it/s][A
30it [00:00, 43.39it/s][A
35it [00:00, 43.48it/s][A
40it [00:00, 43.84it/s][A
45it [00:01, 44.45it/s][A
50it [00:01, 42.26it/s][A
55it [00:01, 43.22it/s][A
60it [00:01, 42.36it/s][A
65it [00:01, 43.27it/s][A
70it [00:01, 43.84it/s][A
75it [00:01, 44.40it/s][A
80it [00:01, 44.82it/s][A
85it [00:01, 45.03it/s][A
90it [00:02, 44.91it/s][A
95it [00:02, 44.98it/s][A
100it [00:02, 44.99it/s][A
105it [00:02, 45.22it/s][A

Epoch: 432, Step: 100, Loss: 4.474907994270325



110it [00:02, 43.66it/s][A
115it [00:02, 44.28it/s][A
120it [00:02, 44.54it/s][A
125it [00:02, 44.77it/s][A
130it [00:02, 44.92it/s][A
135it [00:03, 45.01it/s][A
140it [00:03, 44.63it/s][A
145it [00:03, 44.84it/s][A
150it [00:03, 44.78it/s][A
155it [00:03, 44.91it/s][A
160it [00:03, 45.28it/s][A
165it [00:03, 45.32it/s][A
170it [00:03, 45.40it/s][A
175it [00:03, 45.40it/s][A
180it [00:04, 45.35it/s][A
185it [00:04, 45.03it/s][A
190it [00:04, 44.92it/s][A
195it [00:04, 45.02it/s][A
200it [00:04, 43.96it/s][A
205it [00:04, 44.10it/s][A

Epoch: 432, Step: 200, Loss: 4.48755841255188



210it [00:04, 42.37it/s][A
215it [00:04, 42.87it/s][A
220it [00:04, 43.66it/s][A
227it [00:05, 44.43it/s]
 86%|████████▋ | 432/500 [51:40<08:00,  7.07s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.49it/s][A
10it [00:00, 44.76it/s][A
15it [00:00, 44.71it/s][A
20it [00:00, 45.05it/s][A
25it [00:00, 45.36it/s][A
30it [00:00, 45.40it/s][A
35it [00:00, 45.38it/s][A
40it [00:00, 45.29it/s][A
45it [00:00, 45.11it/s][A
50it [00:01, 44.77it/s][A
55it [00:01, 44.60it/s][A
60it [00:01, 44.19it/s][A
65it [00:01, 44.29it/s][A
70it [00:01, 44.09it/s][A
75it [00:01, 44.06it/s][A
80it [00:01, 44.62it/s][A
85it [00:01, 44.97it/s][A
90it [00:02, 45.16it/s][A
95it [00:02, 44.96it/s][A
100it [00:02, 45.13it/s][A
105it [00:02, 43.26it/s][A

Epoch: 433, Step: 100, Loss: 4.471693768501281



110it [00:02, 43.64it/s][A
115it [00:02, 43.44it/s][A
120it [00:02, 44.34it/s][A
125it [00:02, 44.76it/s][A
130it [00:02, 44.36it/s][A
135it [00:03, 44.68it/s][A
140it [00:03, 45.31it/s][A
145it [00:03, 45.69it/s][A
150it [00:03, 45.81it/s][A
155it [00:03, 46.15it/s][A
160it [00:03, 46.23it/s][A
165it [00:03, 46.50it/s][A
170it [00:03, 46.43it/s][A
175it [00:03, 46.41it/s][A
180it [00:03, 46.61it/s][A
185it [00:04, 45.17it/s][A
190it [00:04, 45.61it/s][A
195it [00:04, 45.12it/s][A
200it [00:04, 45.44it/s][A
205it [00:04, 44.68it/s][A

Epoch: 433, Step: 200, Loss: 4.483533160686493



210it [00:04, 45.22it/s][A
215it [00:04, 45.57it/s][A
220it [00:04, 43.85it/s][A
227it [00:05, 44.86it/s]
 87%|████████▋ | 433/500 [51:45<07:13,  6.47s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.66it/s][A
10it [00:00, 45.59it/s][A
15it [00:00, 45.43it/s][A
20it [00:00, 45.39it/s][A
25it [00:00, 43.63it/s][A
30it [00:00, 42.99it/s][A
35it [00:00, 43.38it/s][A
40it [00:00, 43.73it/s][A
45it [00:01, 44.12it/s][A
50it [00:01, 44.08it/s][A
55it [00:01, 42.87it/s][A
60it [00:01, 43.84it/s][A
65it [00:01, 44.44it/s][A
70it [00:01, 44.87it/s][A
75it [00:01, 45.24it/s][A
80it [00:01, 45.10it/s][A
85it [00:01, 45.13it/s][A
90it [00:02, 45.75it/s][A
95it [00:02, 46.13it/s][A
100it [00:02, 45.89it/s][A
105it [00:02, 45.38it/s][A

Epoch: 434, Step: 100, Loss: 4.46342438697815



110it [00:02, 45.48it/s][A
115it [00:02, 45.32it/s][A
120it [00:02, 45.40it/s][A
125it [00:02, 45.62it/s][A
130it [00:02, 45.75it/s][A
135it [00:03, 45.79it/s][A
140it [00:03, 45.73it/s][A
145it [00:03, 45.65it/s][A
150it [00:03, 45.52it/s][A
155it [00:03, 44.90it/s][A
160it [00:03, 44.88it/s][A
165it [00:03, 44.31it/s][A
170it [00:03, 43.75it/s][A
175it [00:03, 43.91it/s][A
180it [00:04, 44.27it/s][A
185it [00:04, 44.61it/s][A
190it [00:04, 44.34it/s][A
195it [00:04, 44.00it/s][A
200it [00:04, 44.34it/s][A
205it [00:04, 44.39it/s][A

Epoch: 434, Step: 200, Loss: 4.484824454784393



210it [00:04, 43.88it/s][A
215it [00:04, 43.46it/s][A
220it [00:04, 42.26it/s][A
227it [00:05, 44.45it/s]
 87%|████████▋ | 434/500 [51:50<06:40,  6.06s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.02it/s][A
10it [00:00, 45.44it/s][A
15it [00:00, 45.60it/s][A
20it [00:00, 45.56it/s][A
25it [00:00, 45.71it/s][A
30it [00:00, 45.81it/s][A
35it [00:00, 45.66it/s][A
40it [00:00, 44.93it/s][A
45it [00:00, 45.09it/s][A
50it [00:01, 45.12it/s][A
55it [00:01, 45.21it/s][A
60it [00:01, 45.26it/s][A
65it [00:01, 44.98it/s][A
70it [00:01, 44.91it/s][A
75it [00:01, 44.99it/s][A
80it [00:01, 44.93it/s][A
85it [00:01, 44.62it/s][A
90it [00:01, 44.84it/s][A
95it [00:02, 44.44it/s][A
100it [00:02, 42.95it/s][A
105it [00:02, 43.50it/s][A

Epoch: 435, Step: 100, Loss: 4.459908156394959



110it [00:02, 43.78it/s][A
115it [00:02, 43.97it/s][A
120it [00:02, 44.34it/s][A
125it [00:02, 44.75it/s][A
130it [00:02, 44.96it/s][A
135it [00:03, 45.06it/s][A
140it [00:03, 44.77it/s][A
145it [00:03, 44.31it/s][A
150it [00:03, 42.80it/s][A
155it [00:03, 41.52it/s][A
160it [00:03, 42.83it/s][A
165it [00:03, 43.44it/s][A
170it [00:03, 43.43it/s][A
175it [00:03, 43.80it/s][A
180it [00:04, 44.28it/s][A
185it [00:04, 44.46it/s][A
190it [00:04, 44.80it/s][A
195it [00:04, 45.09it/s][A
200it [00:04, 45.14it/s][A
205it [00:04, 45.36it/s][A

Epoch: 435, Step: 200, Loss: 4.48740315914154



210it [00:04, 45.16it/s][A
215it [00:04, 44.99it/s][A
220it [00:04, 45.05it/s][A
227it [00:05, 44.56it/s]

0it [00:00, ?it/s][A
6it [00:00, 56.81it/s][A
12it [00:00, 54.53it/s][A
18it [00:00, 56.81it/s][A
25it [00:00, 58.44it/s][A
32it [00:00, 59.30it/s][A
39it [00:00, 59.72it/s][A
46it [00:00, 60.00it/s][A
53it [00:00, 60.20it/s][A
60it [00:01, 60.09it/s][A
67it [00:01, 59.56it/s][A
73it [00:01, 59.56it/s][A
79it [00:01, 59.56it/s][A
85it [00:01, 57.24it/s][A
91it [00:01, 57.85it/s][A
98it [00:01, 58.71it/s][A
104it [00:01, 59.04it/s][A
110it [00:01, 59.22it/s][A
116it [00:01, 58.95it/s][A
122it [00:02, 57.72it/s][A
128it [00:02, 57.91it/s][A
134it [00:02, 56.79it/s][A
140it [00:02, 57.41it/s][A
146it [00:02, 57.94it/s][A
152it [00:02, 58.17it/s][A
158it [00:02, 55.84it/s][A
164it [00:02, 54.56it/s][A
170it [00:02, 55.97it/s][A
176it [00:03, 56.84it/s][A
182it [00:03, 56.55it/s][A
188it [00:03, 57.51it/s][A
194it [00:03, 55.20it/s][A
200it [00:03, 5


Epoch: 435, Test Loss: 5.5629353256699465, Test Perplexity: 261.66306804870226




0it [00:00, ?it/s][A
5it [00:00, 44.34it/s][A
10it [00:00, 45.04it/s][A
15it [00:00, 45.57it/s][A
20it [00:00, 45.73it/s][A
25it [00:00, 44.30it/s][A
30it [00:00, 42.76it/s][A
35it [00:00, 43.81it/s][A
40it [00:00, 44.02it/s][A
45it [00:01, 44.00it/s][A
50it [00:01, 44.35it/s][A
55it [00:01, 44.27it/s][A
60it [00:01, 44.73it/s][A
65it [00:01, 43.48it/s][A
70it [00:01, 43.78it/s][A
75it [00:01, 41.87it/s][A
80it [00:01, 42.63it/s][A
85it [00:01, 43.10it/s][A
90it [00:02, 43.34it/s][A
95it [00:02, 43.51it/s][A
100it [00:02, 44.07it/s][A
105it [00:02, 44.46it/s][A

Epoch: 436, Step: 100, Loss: 4.476354365348816



110it [00:02, 43.79it/s][A
115it [00:02, 43.93it/s][A
120it [00:02, 43.80it/s][A
125it [00:02, 43.99it/s][A
130it [00:02, 43.96it/s][A
135it [00:03, 44.22it/s][A
140it [00:03, 44.44it/s][A
145it [00:03, 44.77it/s][A
150it [00:03, 43.04it/s][A
155it [00:03, 43.70it/s][A
160it [00:03, 44.27it/s][A
165it [00:03, 44.44it/s][A
170it [00:03, 44.63it/s][A
175it [00:03, 44.84it/s][A
180it [00:04, 44.93it/s][A
185it [00:04, 45.05it/s][A
190it [00:04, 45.07it/s][A
195it [00:04, 45.18it/s][A
200it [00:04, 45.18it/s][A
205it [00:04, 45.25it/s][A

Epoch: 436, Step: 200, Loss: 4.48841236114502



210it [00:04, 44.83it/s][A
215it [00:04, 44.84it/s][A
220it [00:04, 44.85it/s][A
227it [00:05, 44.22it/s]
 87%|████████▋ | 436/500 [52:11<08:24,  7.88s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.63it/s][A
10it [00:00, 42.03it/s][A
15it [00:00, 43.39it/s][A
20it [00:00, 44.00it/s][A
25it [00:00, 43.95it/s][A
30it [00:00, 44.59it/s][A
35it [00:00, 44.72it/s][A
40it [00:00, 44.67it/s][A
45it [00:01, 43.44it/s][A
50it [00:01, 43.90it/s][A
55it [00:01, 44.31it/s][A
60it [00:01, 44.62it/s][A
65it [00:01, 45.00it/s][A
70it [00:01, 45.34it/s][A
75it [00:01, 45.40it/s][A
80it [00:01, 45.55it/s][A
85it [00:01, 45.68it/s][A
90it [00:02, 45.76it/s][A
95it [00:02, 45.72it/s][A
100it [00:02, 44.29it/s][A
105it [00:02, 44.13it/s][A

Epoch: 437, Step: 100, Loss: 4.477862710952759



110it [00:02, 44.32it/s][A
115it [00:02, 44.76it/s][A
120it [00:02, 43.39it/s][A
125it [00:02, 43.95it/s][A
130it [00:02, 44.30it/s][A
135it [00:03, 44.58it/s][A
140it [00:03, 44.58it/s][A
145it [00:03, 44.65it/s][A
150it [00:03, 43.80it/s][A
155it [00:03, 43.24it/s][A
160it [00:03, 43.84it/s][A
165it [00:03, 44.31it/s][A
170it [00:03, 44.45it/s][A
175it [00:03, 44.83it/s][A
180it [00:04, 45.22it/s][A
185it [00:04, 44.39it/s][A
190it [00:04, 44.79it/s][A
195it [00:04, 45.01it/s][A
200it [00:04, 45.45it/s][A
205it [00:04, 45.77it/s][A

Epoch: 437, Step: 200, Loss: 4.484333393573761



210it [00:04, 44.78it/s][A
215it [00:04, 45.08it/s][A
220it [00:04, 45.03it/s][A
227it [00:05, 44.57it/s]
 87%|████████▋ | 437/500 [52:17<07:23,  7.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.80it/s][A
10it [00:00, 44.92it/s][A
15it [00:00, 45.72it/s][A
20it [00:00, 44.09it/s][A
25it [00:00, 44.79it/s][A
30it [00:00, 45.40it/s][A
35it [00:00, 45.57it/s][A
40it [00:00, 43.63it/s][A
45it [00:01, 44.24it/s][A
50it [00:01, 44.31it/s][A
55it [00:01, 44.68it/s][A
60it [00:01, 44.92it/s][A
65it [00:01, 45.13it/s][A
70it [00:01, 45.22it/s][A
75it [00:01, 45.46it/s][A
80it [00:01, 45.75it/s][A
85it [00:01, 45.04it/s][A
90it [00:02, 45.19it/s][A
95it [00:02, 45.74it/s][A
100it [00:02, 45.13it/s][A
105it [00:02, 45.46it/s][A

Epoch: 438, Step: 100, Loss: 4.467139148712159



110it [00:02, 45.49it/s][A
115it [00:02, 42.91it/s][A
120it [00:02, 43.94it/s][A
125it [00:02, 41.70it/s][A
130it [00:02, 42.72it/s][A
135it [00:03, 43.56it/s][A
140it [00:03, 44.27it/s][A
145it [00:03, 45.25it/s][A
150it [00:03, 45.85it/s][A
155it [00:03, 45.45it/s][A
160it [00:03, 45.41it/s][A
165it [00:03, 44.94it/s][A
170it [00:03, 44.64it/s][A
175it [00:03, 44.51it/s][A
180it [00:04, 44.05it/s][A
185it [00:04, 42.36it/s][A
190it [00:04, 41.57it/s][A
195it [00:04, 42.45it/s][A
200it [00:04, 43.35it/s][A
205it [00:04, 43.87it/s][A

Epoch: 438, Step: 200, Loss: 4.4859258699417115



210it [00:04, 44.04it/s][A
215it [00:04, 44.61it/s][A
220it [00:04, 44.88it/s][A
227it [00:05, 44.33it/s]
 88%|████████▊ | 438/500 [52:22<06:41,  6.47s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.76it/s][A
10it [00:00, 45.21it/s][A
15it [00:00, 45.22it/s][A
20it [00:00, 44.96it/s][A
25it [00:00, 45.06it/s][A
30it [00:00, 45.23it/s][A
35it [00:00, 45.33it/s][A
40it [00:00, 45.39it/s][A
45it [00:00, 45.58it/s][A
50it [00:01, 45.54it/s][A
55it [00:01, 45.39it/s][A
60it [00:01, 45.29it/s][A
65it [00:01, 45.50it/s][A
70it [00:01, 43.77it/s][A
75it [00:01, 44.38it/s][A
80it [00:01, 44.82it/s][A
85it [00:01, 45.16it/s][A
90it [00:01, 45.45it/s][A
95it [00:02, 45.26it/s][A
100it [00:02, 45.15it/s][A
105it [00:02, 44.85it/s][A

Epoch: 439, Step: 100, Loss: 4.479688334465027



110it [00:02, 43.89it/s][A
115it [00:02, 43.88it/s][A
120it [00:02, 43.56it/s][A
125it [00:02, 44.05it/s][A
130it [00:02, 44.53it/s][A
135it [00:03, 44.70it/s][A
140it [00:03, 45.04it/s][A
145it [00:03, 45.24it/s][A
150it [00:03, 45.46it/s][A
155it [00:03, 45.48it/s][A
160it [00:03, 45.13it/s][A
165it [00:03, 44.73it/s][A
170it [00:03, 44.70it/s][A
175it [00:03, 44.84it/s][A
180it [00:04, 44.80it/s][A
185it [00:04, 45.04it/s][A
190it [00:04, 45.35it/s][A
195it [00:04, 45.49it/s][A
200it [00:04, 45.42it/s][A
205it [00:04, 45.36it/s][A

Epoch: 439, Step: 200, Loss: 4.484873921871185



210it [00:04, 43.04it/s][A
215it [00:04, 43.62it/s][A
220it [00:04, 44.14it/s][A
227it [00:05, 44.83it/s]
 88%|████████▊ | 439/500 [52:27<06:08,  6.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.64it/s][A
10it [00:00, 45.45it/s][A
15it [00:00, 45.32it/s][A
20it [00:00, 45.16it/s][A
25it [00:00, 45.08it/s][A
30it [00:00, 45.26it/s][A
35it [00:00, 44.88it/s][A
40it [00:00, 43.70it/s][A
45it [00:01, 44.27it/s][A
50it [00:01, 44.54it/s][A
55it [00:01, 42.90it/s][A
60it [00:01, 43.26it/s][A
65it [00:01, 43.01it/s][A
70it [00:01, 43.94it/s][A
75it [00:01, 43.76it/s][A
80it [00:01, 42.54it/s][A
85it [00:01, 43.43it/s][A
90it [00:02, 44.01it/s][A
95it [00:02, 44.34it/s][A
100it [00:02, 44.62it/s][A
105it [00:02, 44.55it/s][A

Epoch: 440, Step: 100, Loss: 4.469549374580383



110it [00:02, 44.61it/s][A
115it [00:02, 44.26it/s][A
120it [00:02, 44.53it/s][A
125it [00:02, 44.34it/s][A
130it [00:02, 44.74it/s][A
135it [00:03, 44.98it/s][A
140it [00:03, 45.09it/s][A
145it [00:03, 44.99it/s][A
150it [00:03, 45.15it/s][A
155it [00:03, 45.09it/s][A
160it [00:03, 45.05it/s][A
165it [00:03, 45.04it/s][A
170it [00:03, 45.21it/s][A
175it [00:03, 45.47it/s][A
180it [00:04, 45.03it/s][A
185it [00:04, 44.78it/s][A
190it [00:04, 45.09it/s][A
195it [00:04, 45.29it/s][A
200it [00:04, 45.28it/s][A
205it [00:04, 44.67it/s][A

Epoch: 440, Step: 200, Loss: 4.484913177490235



210it [00:04, 44.67it/s][A
215it [00:04, 44.91it/s][A
220it [00:04, 44.99it/s][A
227it [00:05, 44.54it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.15it/s][A
12it [00:00, 57.57it/s][A
18it [00:00, 58.47it/s][A
25it [00:00, 59.48it/s][A
32it [00:00, 59.90it/s][A
38it [00:00, 59.38it/s][A
45it [00:00, 59.85it/s][A
52it [00:00, 60.15it/s][A
59it [00:00, 60.34it/s][A
66it [00:01, 60.54it/s][A
73it [00:01, 60.78it/s][A
80it [00:01, 60.72it/s][A
87it [00:01, 60.78it/s][A
94it [00:01, 60.88it/s][A
101it [00:01, 60.59it/s][A
108it [00:01, 60.71it/s][A
115it [00:01, 60.43it/s][A
122it [00:02, 60.62it/s][A
129it [00:02, 60.03it/s][A
136it [00:02, 60.36it/s][A
143it [00:02, 60.40it/s][A
150it [00:02, 60.36it/s][A
157it [00:02, 60.52it/s][A
164it [00:02, 60.40it/s][A
171it [00:02, 60.48it/s][A
178it [00:02, 60.43it/s][A
185it [00:03, 60.12it/s][A
192it [00:03, 60.21it/s][A
199it [00:03, 60.19it/s][A
206it [00:03, 59.81it/s][A
212it [00:03, 59.47it/s][A
218it [00:03, 


Epoch: 440, Test Loss: 5.565468874777326, Test Perplexity: 262.35388742766764




0it [00:00, ?it/s][A
5it [00:00, 45.56it/s][A
10it [00:00, 45.14it/s][A
15it [00:00, 45.01it/s][A
20it [00:00, 42.41it/s][A
25it [00:00, 42.50it/s][A
30it [00:00, 43.22it/s][A
35it [00:00, 43.69it/s][A
40it [00:00, 43.87it/s][A
45it [00:01, 44.46it/s][A
50it [00:01, 44.41it/s][A
55it [00:01, 44.81it/s][A
60it [00:01, 45.08it/s][A
65it [00:01, 45.19it/s][A
70it [00:01, 44.99it/s][A
75it [00:01, 45.17it/s][A
80it [00:01, 45.03it/s][A
85it [00:01, 45.24it/s][A
90it [00:02, 45.41it/s][A
95it [00:02, 45.41it/s][A
100it [00:02, 45.60it/s][A
105it [00:02, 45.36it/s][A

Epoch: 441, Step: 100, Loss: 4.473382234573364



110it [00:02, 45.04it/s][A
115it [00:02, 45.01it/s][A
120it [00:02, 43.68it/s][A
125it [00:02, 44.19it/s][A
130it [00:02, 44.44it/s][A
135it [00:03, 44.78it/s][A
140it [00:03, 44.62it/s][A
145it [00:03, 43.57it/s][A
150it [00:03, 43.94it/s][A
155it [00:03, 43.96it/s][A
160it [00:03, 43.50it/s][A
165it [00:03, 43.87it/s][A
170it [00:03, 43.70it/s][A
175it [00:03, 43.79it/s][A
180it [00:04, 44.26it/s][A
185it [00:04, 44.29it/s][A
190it [00:04, 44.53it/s][A
195it [00:04, 44.88it/s][A
200it [00:04, 45.21it/s][A
205it [00:04, 45.17it/s][A

Epoch: 441, Step: 200, Loss: 4.484680795669556



210it [00:04, 45.10it/s][A
215it [00:04, 44.98it/s][A
220it [00:04, 45.37it/s][A
227it [00:05, 44.60it/s]
 88%|████████▊ | 441/500 [52:48<07:43,  7.85s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.51it/s][A
10it [00:00, 45.66it/s][A
15it [00:00, 45.72it/s][A
20it [00:00, 42.93it/s][A
25it [00:00, 43.36it/s][A
30it [00:00, 44.30it/s][A
35it [00:00, 44.81it/s][A
40it [00:00, 43.99it/s][A
45it [00:01, 44.00it/s][A
50it [00:01, 44.61it/s][A
55it [00:01, 44.95it/s][A
60it [00:01, 45.33it/s][A
65it [00:01, 45.43it/s][A
70it [00:01, 45.48it/s][A
75it [00:01, 44.46it/s][A
80it [00:01, 44.91it/s][A
85it [00:01, 43.65it/s][A
90it [00:02, 44.66it/s][A
95it [00:02, 45.17it/s][A
100it [00:02, 44.71it/s][A
105it [00:02, 45.18it/s][A

Epoch: 442, Step: 100, Loss: 4.48090964794159



110it [00:02, 45.45it/s][A
115it [00:02, 45.76it/s][A
120it [00:02, 45.67it/s][A
125it [00:02, 45.95it/s][A
130it [00:02, 44.84it/s][A
135it [00:03, 45.49it/s][A
140it [00:03, 45.36it/s][A
145it [00:03, 45.59it/s][A
150it [00:03, 45.87it/s][A
155it [00:03, 45.61it/s][A
160it [00:03, 45.39it/s][A
165it [00:03, 45.29it/s][A
170it [00:03, 45.27it/s][A
175it [00:03, 45.36it/s][A
180it [00:03, 45.39it/s][A
185it [00:04, 45.50it/s][A
190it [00:04, 45.50it/s][A
195it [00:04, 45.42it/s][A
200it [00:04, 45.53it/s][A
205it [00:04, 44.74it/s][A

Epoch: 442, Step: 200, Loss: 4.4856141304969785



210it [00:04, 44.68it/s][A
215it [00:04, 44.66it/s][A
220it [00:04, 44.93it/s][A
227it [00:05, 44.90it/s]
 88%|████████▊ | 442/500 [52:53<06:46,  7.01s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.21it/s][A
10it [00:00, 45.35it/s][A
15it [00:00, 45.58it/s][A
20it [00:00, 45.62it/s][A
25it [00:00, 45.51it/s][A
30it [00:00, 45.23it/s][A
35it [00:00, 44.83it/s][A
40it [00:00, 43.85it/s][A
45it [00:01, 43.79it/s][A
50it [00:01, 44.09it/s][A
55it [00:01, 44.33it/s][A
60it [00:01, 43.05it/s][A
65it [00:01, 43.85it/s][A
70it [00:01, 43.21it/s][A
75it [00:01, 43.56it/s][A
80it [00:01, 43.94it/s][A
85it [00:01, 44.07it/s][A
90it [00:02, 43.72it/s][A
95it [00:02, 43.94it/s][A
100it [00:02, 44.20it/s][A
105it [00:02, 44.45it/s][A

Epoch: 443, Step: 100, Loss: 4.471754765510559



110it [00:02, 43.13it/s][A
115it [00:02, 43.89it/s][A
120it [00:02, 43.65it/s][A
125it [00:02, 44.37it/s][A
130it [00:02, 44.49it/s][A
135it [00:03, 44.83it/s][A
140it [00:03, 45.19it/s][A
145it [00:03, 45.37it/s][A
150it [00:03, 45.70it/s][A
155it [00:03, 45.60it/s][A
160it [00:03, 45.66it/s][A
165it [00:03, 45.22it/s][A
170it [00:03, 45.07it/s][A
175it [00:03, 43.97it/s][A
180it [00:04, 44.43it/s][A
185it [00:04, 44.53it/s][A
190it [00:04, 44.74it/s][A
195it [00:04, 44.70it/s][A
200it [00:04, 43.47it/s][A
205it [00:04, 44.09it/s][A

Epoch: 443, Step: 200, Loss: 4.485600392818451



210it [00:04, 44.38it/s][A
215it [00:04, 44.70it/s][A
220it [00:04, 43.51it/s][A
227it [00:05, 44.37it/s]
 89%|████████▊ | 443/500 [52:58<06:07,  6.45s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.16it/s][A
10it [00:00, 45.03it/s][A
15it [00:00, 45.06it/s][A
20it [00:00, 45.27it/s][A
25it [00:00, 43.97it/s][A
30it [00:00, 44.53it/s][A
35it [00:00, 44.55it/s][A
40it [00:00, 45.06it/s][A
45it [00:01, 45.14it/s][A
50it [00:01, 45.32it/s][A
55it [00:01, 45.00it/s][A
60it [00:01, 44.90it/s][A
65it [00:01, 44.83it/s][A
70it [00:01, 44.82it/s][A
75it [00:01, 44.94it/s][A
80it [00:01, 44.02it/s][A
85it [00:01, 44.02it/s][A
90it [00:02, 44.40it/s][A
95it [00:02, 42.35it/s][A
100it [00:02, 43.08it/s][A
105it [00:02, 43.41it/s][A

Epoch: 444, Step: 100, Loss: 4.461988005638123



110it [00:02, 43.89it/s][A
115it [00:02, 44.28it/s][A
120it [00:02, 44.55it/s][A
125it [00:02, 43.29it/s][A
130it [00:02, 43.61it/s][A
135it [00:03, 43.90it/s][A
140it [00:03, 44.28it/s][A
145it [00:03, 44.76it/s][A
150it [00:03, 44.81it/s][A
155it [00:03, 44.27it/s][A
160it [00:03, 44.50it/s][A
165it [00:03, 44.84it/s][A
170it [00:03, 44.97it/s][A
175it [00:03, 44.86it/s][A
180it [00:04, 44.87it/s][A
185it [00:04, 44.88it/s][A
190it [00:04, 44.78it/s][A
195it [00:04, 44.85it/s][A
200it [00:04, 44.99it/s][A
205it [00:04, 44.94it/s][A

Epoch: 444, Step: 200, Loss: 4.482997918128968



210it [00:04, 44.28it/s][A
215it [00:04, 44.59it/s][A
220it [00:04, 43.34it/s][A
227it [00:05, 44.40it/s]
 89%|████████▉ | 444/500 [53:03<05:38,  6.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.36it/s][A
10it [00:00, 44.19it/s][A
15it [00:00, 43.66it/s][A
20it [00:00, 43.97it/s][A
25it [00:00, 44.76it/s][A
30it [00:00, 45.10it/s][A
35it [00:00, 45.05it/s][A
40it [00:00, 44.75it/s][A
45it [00:01, 44.78it/s][A
50it [00:01, 44.54it/s][A
55it [00:01, 44.10it/s][A
60it [00:01, 44.48it/s][A
65it [00:01, 44.69it/s][A
70it [00:01, 43.42it/s][A
75it [00:01, 43.99it/s][A
80it [00:01, 43.15it/s][A
85it [00:01, 43.33it/s][A
90it [00:02, 43.16it/s][A
95it [00:02, 43.44it/s][A
100it [00:02, 43.01it/s][A
105it [00:02, 43.77it/s][A

Epoch: 445, Step: 100, Loss: 4.464478664398193



110it [00:02, 44.00it/s][A
115it [00:02, 44.08it/s][A
120it [00:02, 44.46it/s][A
125it [00:02, 43.53it/s][A
130it [00:02, 44.00it/s][A
135it [00:03, 44.11it/s][A
140it [00:03, 43.94it/s][A
145it [00:03, 44.06it/s][A
150it [00:03, 41.93it/s][A
155it [00:03, 42.85it/s][A
160it [00:03, 43.63it/s][A
165it [00:03, 44.42it/s][A
170it [00:03, 44.79it/s][A
175it [00:03, 44.88it/s][A
180it [00:04, 45.15it/s][A
185it [00:04, 45.11it/s][A
190it [00:04, 45.13it/s][A
195it [00:04, 45.39it/s][A
200it [00:04, 45.19it/s][A
205it [00:04, 45.35it/s][A

Epoch: 445, Step: 200, Loss: 4.480171122550964



210it [00:04, 45.51it/s][A
215it [00:04, 45.40it/s][A
220it [00:04, 45.35it/s][A
227it [00:05, 44.24it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.54it/s][A
13it [00:00, 59.74it/s][A
19it [00:00, 59.24it/s][A
25it [00:00, 59.47it/s][A
32it [00:00, 59.98it/s][A
39it [00:00, 60.18it/s][A
46it [00:00, 60.35it/s][A
53it [00:00, 58.34it/s][A
59it [00:01, 56.84it/s][A
65it [00:01, 57.61it/s][A
72it [00:01, 58.70it/s][A
78it [00:01, 58.96it/s][A
85it [00:01, 59.58it/s][A
92it [00:01, 59.96it/s][A
99it [00:01, 60.27it/s][A
106it [00:01, 58.43it/s][A
113it [00:01, 59.10it/s][A
120it [00:02, 59.47it/s][A
126it [00:02, 56.87it/s][A
132it [00:02, 57.55it/s][A
138it [00:02, 58.09it/s][A
145it [00:02, 58.84it/s][A
152it [00:02, 59.43it/s][A
159it [00:02, 59.79it/s][A
165it [00:02, 59.39it/s][A
171it [00:02, 57.67it/s][A
177it [00:03, 57.79it/s][A
183it [00:03, 58.04it/s][A
189it [00:03, 56.11it/s][A
195it [00:03, 57.21it/s][A
202it [00:03, 58.26it/s][A
208it [00:03, 5


Epoch: 445, Test Loss: 5.561932326103589, Test Perplexity: 261.3865006962178




0it [00:00, ?it/s][A
4it [00:00, 35.34it/s][A
9it [00:00, 41.02it/s][A
14it [00:00, 43.14it/s][A
19it [00:00, 44.21it/s][A
24it [00:00, 44.58it/s][A
29it [00:00, 44.88it/s][A
34it [00:00, 45.39it/s][A
39it [00:00, 45.85it/s][A
44it [00:00, 46.02it/s][A
49it [00:01, 45.66it/s][A
54it [00:01, 45.76it/s][A
59it [00:01, 45.80it/s][A
64it [00:01, 46.04it/s][A
69it [00:01, 46.11it/s][A
74it [00:01, 45.79it/s][A
79it [00:01, 45.74it/s][A
84it [00:01, 45.40it/s][A
89it [00:01, 45.75it/s][A
94it [00:02, 44.59it/s][A
99it [00:02, 44.94it/s][A
104it [00:02, 45.25it/s][A

Epoch: 446, Step: 100, Loss: 4.466112790107727



109it [00:02, 43.77it/s][A
114it [00:02, 44.37it/s][A
119it [00:02, 44.97it/s][A
124it [00:02, 45.55it/s][A
129it [00:02, 45.49it/s][A
134it [00:02, 45.59it/s][A
139it [00:03, 45.72it/s][A
144it [00:03, 45.62it/s][A
149it [00:03, 45.57it/s][A
154it [00:03, 43.37it/s][A
159it [00:03, 43.86it/s][A
164it [00:03, 44.53it/s][A
169it [00:03, 45.07it/s][A
174it [00:03, 45.52it/s][A
179it [00:03, 45.74it/s][A
184it [00:04, 46.00it/s][A
189it [00:04, 46.09it/s][A
194it [00:04, 46.07it/s][A
199it [00:04, 45.93it/s][A
204it [00:04, 45.74it/s][A
209it [00:04, 45.94it/s][A

Epoch: 446, Step: 200, Loss: 4.484430558681488



214it [00:04, 45.56it/s][A
219it [00:04, 45.72it/s][A
227it [00:05, 45.28it/s]
 89%|████████▉ | 446/500 [53:24<07:03,  7.85s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.68it/s][A
10it [00:00, 45.03it/s][A
15it [00:00, 45.37it/s][A
20it [00:00, 45.26it/s][A
25it [00:00, 45.23it/s][A
30it [00:00, 43.80it/s][A
35it [00:00, 42.99it/s][A
40it [00:00, 43.94it/s][A
45it [00:01, 43.01it/s][A
50it [00:01, 43.75it/s][A
55it [00:01, 44.24it/s][A
60it [00:01, 43.21it/s][A
65it [00:01, 43.93it/s][A
70it [00:01, 42.20it/s][A
75it [00:01, 42.80it/s][A
80it [00:01, 43.52it/s][A
85it [00:01, 44.21it/s][A
90it [00:02, 43.90it/s][A
95it [00:02, 44.11it/s][A
100it [00:02, 43.34it/s][A
105it [00:02, 43.58it/s][A

Epoch: 447, Step: 100, Loss: 4.474744491577148



110it [00:02, 44.19it/s][A
115it [00:02, 44.85it/s][A
120it [00:02, 45.09it/s][A
125it [00:02, 45.29it/s][A
130it [00:02, 45.24it/s][A
135it [00:03, 45.01it/s][A
140it [00:03, 44.72it/s][A
145it [00:03, 44.50it/s][A
150it [00:03, 44.27it/s][A
155it [00:03, 44.40it/s][A
160it [00:03, 44.22it/s][A
165it [00:03, 44.58it/s][A
170it [00:03, 43.08it/s][A
175it [00:03, 43.08it/s][A
180it [00:04, 43.19it/s][A
185it [00:04, 43.84it/s][A
190it [00:04, 43.36it/s][A
195it [00:04, 43.88it/s][A
200it [00:04, 44.16it/s][A
205it [00:04, 44.36it/s][A

Epoch: 447, Step: 200, Loss: 4.4819806909561155



210it [00:04, 44.45it/s][A
215it [00:04, 44.38it/s][A
220it [00:04, 44.51it/s][A
227it [00:05, 44.05it/s]
 89%|████████▉ | 447/500 [53:29<06:13,  7.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.34it/s][A
10it [00:00, 43.50it/s][A
15it [00:00, 44.24it/s][A
20it [00:00, 44.34it/s][A
25it [00:00, 44.62it/s][A
30it [00:00, 43.77it/s][A
35it [00:00, 44.36it/s][A
40it [00:00, 44.69it/s][A
45it [00:01, 44.82it/s][A
50it [00:01, 45.09it/s][A
55it [00:01, 45.31it/s][A
60it [00:01, 45.28it/s][A
65it [00:01, 43.97it/s][A
70it [00:01, 44.61it/s][A
75it [00:01, 44.95it/s][A
80it [00:01, 45.03it/s][A
85it [00:01, 44.98it/s][A
90it [00:02, 45.19it/s][A
95it [00:02, 45.05it/s][A
100it [00:02, 45.12it/s][A
105it [00:02, 43.62it/s][A

Epoch: 448, Step: 100, Loss: 4.462672457695008



110it [00:02, 44.22it/s][A
115it [00:02, 44.32it/s][A
120it [00:02, 43.44it/s][A
125it [00:02, 44.21it/s][A
130it [00:02, 44.13it/s][A
135it [00:03, 44.65it/s][A
140it [00:03, 44.73it/s][A
145it [00:03, 44.63it/s][A
150it [00:03, 45.09it/s][A
155it [00:03, 45.04it/s][A
160it [00:03, 45.26it/s][A
165it [00:03, 45.49it/s][A
170it [00:03, 45.42it/s][A
175it [00:03, 44.15it/s][A
180it [00:04, 44.57it/s][A
185it [00:04, 44.41it/s][A
190it [00:04, 43.44it/s][A
195it [00:04, 42.41it/s][A
200it [00:04, 42.72it/s][A
205it [00:04, 43.42it/s][A

Epoch: 448, Step: 200, Loss: 4.482586085796356



210it [00:04, 43.72it/s][A
215it [00:04, 44.36it/s][A
220it [00:04, 44.52it/s][A
227it [00:05, 44.34it/s]
 90%|████████▉ | 448/500 [53:35<05:36,  6.47s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.58it/s][A
10it [00:00, 45.31it/s][A
15it [00:00, 45.23it/s][A
20it [00:00, 43.30it/s][A
25it [00:00, 44.15it/s][A
30it [00:00, 44.51it/s][A
35it [00:00, 44.75it/s][A
40it [00:00, 43.07it/s][A
45it [00:01, 44.01it/s][A
50it [00:01, 44.62it/s][A
55it [00:01, 44.77it/s][A
60it [00:01, 44.90it/s][A
65it [00:01, 44.92it/s][A
70it [00:01, 45.13it/s][A
75it [00:01, 45.28it/s][A
80it [00:01, 45.22it/s][A
85it [00:01, 45.20it/s][A
90it [00:02, 45.43it/s][A
95it [00:02, 45.70it/s][A
100it [00:02, 45.56it/s][A
105it [00:02, 45.72it/s][A

Epoch: 449, Step: 100, Loss: 4.461434082984924



110it [00:02, 45.20it/s][A
115it [00:02, 43.13it/s][A
120it [00:02, 43.71it/s][A
125it [00:02, 44.25it/s][A
130it [00:02, 44.51it/s][A
135it [00:03, 44.42it/s][A
140it [00:03, 43.45it/s][A
145it [00:03, 44.36it/s][A
150it [00:03, 44.89it/s][A
155it [00:03, 45.14it/s][A
160it [00:03, 43.53it/s][A
165it [00:03, 44.11it/s][A
170it [00:03, 44.44it/s][A
175it [00:03, 44.58it/s][A
180it [00:04, 44.81it/s][A
185it [00:04, 44.58it/s][A
190it [00:04, 42.16it/s][A
195it [00:04, 43.22it/s][A
200it [00:04, 43.85it/s][A
205it [00:04, 43.84it/s][A

Epoch: 449, Step: 200, Loss: 4.480646450519561



210it [00:04, 42.36it/s][A
215it [00:04, 42.21it/s][A
220it [00:04, 43.44it/s][A
227it [00:05, 44.17it/s]
 90%|████████▉ | 449/500 [53:40<05:09,  6.07s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.46it/s][A
10it [00:00, 43.52it/s][A
15it [00:00, 41.40it/s][A
20it [00:00, 42.61it/s][A
25it [00:00, 43.40it/s][A
30it [00:00, 43.83it/s][A
35it [00:00, 43.83it/s][A
40it [00:00, 43.53it/s][A
45it [00:01, 43.33it/s][A
50it [00:01, 41.80it/s][A
55it [00:01, 41.74it/s][A
60it [00:01, 42.59it/s][A
65it [00:01, 43.08it/s][A
70it [00:01, 43.93it/s][A
75it [00:01, 44.38it/s][A
80it [00:01, 44.82it/s][A
85it [00:01, 44.86it/s][A
90it [00:02, 44.35it/s][A
95it [00:02, 44.51it/s][A
100it [00:02, 44.42it/s][A
105it [00:02, 44.04it/s][A

Epoch: 450, Step: 100, Loss: 4.468640732765198



110it [00:02, 41.97it/s][A
115it [00:02, 43.07it/s][A
120it [00:02, 43.85it/s][A
125it [00:02, 43.86it/s][A
130it [00:02, 44.52it/s][A
135it [00:03, 43.47it/s][A
140it [00:03, 44.21it/s][A
145it [00:03, 44.38it/s][A
150it [00:03, 44.84it/s][A
155it [00:03, 44.92it/s][A
160it [00:03, 45.05it/s][A
165it [00:03, 45.16it/s][A
170it [00:03, 45.47it/s][A
175it [00:03, 45.37it/s][A
180it [00:04, 45.09it/s][A
185it [00:04, 45.32it/s][A
190it [00:04, 45.48it/s][A
195it [00:04, 45.47it/s][A
200it [00:04, 45.40it/s][A
205it [00:04, 45.28it/s][A

Epoch: 450, Step: 200, Loss: 4.479474239349365



210it [00:04, 45.16it/s][A
215it [00:04, 43.45it/s][A
220it [00:04, 43.96it/s][A
227it [00:05, 44.08it/s]

0it [00:00, ?it/s][A
6it [00:00, 57.67it/s][A
12it [00:00, 58.78it/s][A
18it [00:00, 58.97it/s][A
24it [00:00, 59.33it/s][A
30it [00:00, 59.41it/s][A
37it [00:00, 59.90it/s][A
44it [00:00, 60.15it/s][A
51it [00:00, 60.08it/s][A
58it [00:00, 57.81it/s][A
65it [00:01, 58.67it/s][A
72it [00:01, 59.19it/s][A
78it [00:01, 59.28it/s][A
84it [00:01, 59.39it/s][A
90it [00:01, 59.31it/s][A
96it [00:01, 59.44it/s][A
102it [00:01, 59.60it/s][A
109it [00:01, 59.91it/s][A
115it [00:01, 59.93it/s][A
122it [00:02, 60.30it/s][A
129it [00:02, 60.19it/s][A
136it [00:02, 60.23it/s][A
143it [00:02, 60.16it/s][A
150it [00:02, 60.23it/s][A
157it [00:02, 60.15it/s][A
164it [00:02, 59.92it/s][A
170it [00:02, 59.90it/s][A
176it [00:02, 59.87it/s][A
183it [00:03, 60.22it/s][A
190it [00:03, 60.40it/s][A
197it [00:03, 60.25it/s][A
204it [00:03, 60.30it/s][A
211it [00:03, 6


Epoch: 450, Test Loss: 5.565773735875669, Test Perplexity: 262.4318722790072




0it [00:00, ?it/s][A
5it [00:00, 45.94it/s][A
10it [00:00, 46.07it/s][A
15it [00:00, 46.01it/s][A
20it [00:00, 45.82it/s][A
25it [00:00, 45.13it/s][A
30it [00:00, 45.80it/s][A
35it [00:00, 46.19it/s][A
40it [00:00, 45.70it/s][A
45it [00:00, 44.92it/s][A
50it [00:01, 45.84it/s][A
55it [00:01, 46.11it/s][A
60it [00:01, 45.37it/s][A
65it [00:01, 45.16it/s][A
70it [00:01, 42.86it/s][A
75it [00:01, 42.82it/s][A
80it [00:01, 43.11it/s][A
85it [00:01, 43.35it/s][A
90it [00:02, 43.39it/s][A
95it [00:02, 43.71it/s][A
100it [00:02, 43.63it/s][A
105it [00:02, 44.64it/s][A

Epoch: 451, Step: 100, Loss: 4.465477485656738



110it [00:02, 44.83it/s][A
115it [00:02, 44.72it/s][A
120it [00:02, 44.70it/s][A
125it [00:02, 44.66it/s][A
130it [00:02, 43.58it/s][A
135it [00:03, 44.32it/s][A
140it [00:03, 44.31it/s][A
145it [00:03, 42.82it/s][A
150it [00:03, 43.66it/s][A
155it [00:03, 43.98it/s][A
160it [00:03, 44.40it/s][A
165it [00:03, 44.44it/s][A
170it [00:03, 43.27it/s][A
175it [00:03, 44.03it/s][A
180it [00:04, 43.04it/s][A
185it [00:04, 43.85it/s][A
190it [00:04, 44.49it/s][A
195it [00:04, 44.89it/s][A
200it [00:04, 45.10it/s][A
205it [00:04, 43.74it/s][A

Epoch: 451, Step: 200, Loss: 4.479903428554535



210it [00:04, 44.27it/s][A
215it [00:04, 44.48it/s][A
220it [00:04, 44.93it/s][A
227it [00:05, 44.44it/s]
 90%|█████████ | 451/500 [54:01<06:26,  7.88s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.33it/s][A
10it [00:00, 45.50it/s][A
15it [00:00, 45.75it/s][A
20it [00:00, 43.34it/s][A
25it [00:00, 44.35it/s][A
30it [00:00, 44.75it/s][A
35it [00:00, 43.67it/s][A
40it [00:00, 44.57it/s][A
45it [00:01, 44.76it/s][A
50it [00:01, 45.10it/s][A
55it [00:01, 45.53it/s][A
60it [00:01, 43.41it/s][A
65it [00:01, 44.21it/s][A
70it [00:01, 44.71it/s][A
75it [00:01, 43.27it/s][A
80it [00:01, 43.00it/s][A
85it [00:01, 43.60it/s][A
90it [00:02, 44.14it/s][A
95it [00:02, 44.61it/s][A
100it [00:02, 44.89it/s][A
105it [00:02, 45.07it/s][A

Epoch: 452, Step: 100, Loss: 4.470705542564392



110it [00:02, 45.27it/s][A
115it [00:02, 45.11it/s][A
120it [00:02, 44.31it/s][A
125it [00:02, 42.96it/s][A
130it [00:02, 41.43it/s][A
135it [00:03, 42.34it/s][A
140it [00:03, 41.32it/s][A
145it [00:03, 41.18it/s][A
150it [00:03, 41.09it/s][A
155it [00:03, 42.36it/s][A
160it [00:03, 43.29it/s][A
165it [00:03, 43.87it/s][A
170it [00:03, 44.37it/s][A
175it [00:03, 44.64it/s][A
180it [00:04, 42.77it/s][A
185it [00:04, 43.48it/s][A
190it [00:04, 44.02it/s][A
195it [00:04, 44.32it/s][A
200it [00:04, 44.58it/s][A
205it [00:04, 44.84it/s][A

Epoch: 452, Step: 200, Loss: 4.479015293121338



210it [00:04, 44.46it/s][A
215it [00:04, 43.27it/s][A
220it [00:05, 43.84it/s][A
227it [00:05, 43.83it/s]
 90%|█████████ | 452/500 [54:06<05:39,  7.07s/it]
0it [00:00, ?it/s][A
4it [00:00, 39.54it/s][A
9it [00:00, 40.89it/s][A
14it [00:00, 43.02it/s][A
19it [00:00, 44.09it/s][A
24it [00:00, 44.57it/s][A
29it [00:00, 44.59it/s][A
34it [00:00, 44.81it/s][A
39it [00:00, 42.97it/s][A
44it [00:01, 43.92it/s][A
49it [00:01, 44.38it/s][A
54it [00:01, 44.75it/s][A
59it [00:01, 45.09it/s][A
64it [00:01, 45.32it/s][A
69it [00:01, 45.31it/s][A
74it [00:01, 44.99it/s][A
79it [00:01, 44.85it/s][A
84it [00:01, 45.30it/s][A
89it [00:01, 45.44it/s][A
94it [00:02, 45.71it/s][A
99it [00:02, 45.72it/s][A
104it [00:02, 45.58it/s][A
109it [00:02, 45.66it/s][A

Epoch: 453, Step: 100, Loss: 4.466453552246094



114it [00:02, 45.23it/s][A
119it [00:02, 45.28it/s][A
124it [00:02, 45.44it/s][A
129it [00:02, 45.14it/s][A
134it [00:02, 45.30it/s][A
139it [00:03, 45.18it/s][A
144it [00:03, 43.39it/s][A
149it [00:03, 44.09it/s][A
154it [00:03, 44.60it/s][A
159it [00:03, 43.93it/s][A
164it [00:03, 44.47it/s][A
169it [00:03, 44.74it/s][A
174it [00:03, 44.97it/s][A
179it [00:04, 44.91it/s][A
184it [00:04, 45.11it/s][A
189it [00:04, 45.04it/s][A
194it [00:04, 44.83it/s][A
199it [00:04, 44.93it/s][A
204it [00:04, 43.79it/s][A

Epoch: 453, Step: 200, Loss: 4.48130202293396



209it [00:04, 44.05it/s][A
214it [00:04, 44.56it/s][A
219it [00:04, 44.80it/s][A
227it [00:05, 44.71it/s]
 91%|█████████ | 453/500 [54:11<05:04,  6.48s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.79it/s][A
10it [00:00, 45.94it/s][A
15it [00:00, 45.85it/s][A
20it [00:00, 45.21it/s][A
25it [00:00, 45.52it/s][A
30it [00:00, 45.75it/s][A
35it [00:00, 45.73it/s][A
40it [00:00, 45.59it/s][A
45it [00:00, 45.24it/s][A
50it [00:01, 45.40it/s][A
55it [00:01, 45.31it/s][A
60it [00:01, 45.24it/s][A
65it [00:01, 45.37it/s][A
70it [00:01, 45.34it/s][A
75it [00:01, 45.61it/s][A
80it [00:01, 45.32it/s][A
85it [00:01, 45.30it/s][A
90it [00:01, 45.51it/s][A
95it [00:02, 45.49it/s][A
100it [00:02, 45.09it/s][A
105it [00:02, 45.10it/s][A

Epoch: 454, Step: 100, Loss: 4.467569584846497



110it [00:02, 45.21it/s][A
115it [00:02, 45.49it/s][A
120it [00:02, 45.59it/s][A
125it [00:02, 45.73it/s][A
130it [00:02, 45.43it/s][A
135it [00:02, 44.87it/s][A
140it [00:03, 44.41it/s][A
145it [00:03, 44.31it/s][A
150it [00:03, 44.90it/s][A
155it [00:03, 43.99it/s][A
160it [00:03, 42.93it/s][A
165it [00:03, 42.88it/s][A
170it [00:03, 43.63it/s][A
175it [00:03, 43.84it/s][A
180it [00:04, 42.86it/s][A
185it [00:04, 42.17it/s][A
190it [00:04, 42.65it/s][A
195it [00:04, 43.22it/s][A
200it [00:04, 43.85it/s][A
205it [00:04, 41.88it/s][A

Epoch: 454, Step: 200, Loss: 4.479524457454682



210it [00:04, 42.97it/s][A
215it [00:04, 43.84it/s][A
220it [00:04, 41.88it/s][A
227it [00:05, 44.22it/s]
 91%|█████████ | 454/500 [54:16<04:39,  6.07s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.75it/s][A
10it [00:00, 41.52it/s][A
15it [00:00, 43.05it/s][A
20it [00:00, 44.01it/s][A
25it [00:00, 43.71it/s][A
30it [00:00, 42.21it/s][A
35it [00:00, 41.60it/s][A
40it [00:00, 42.51it/s][A
45it [00:01, 43.32it/s][A
50it [00:01, 43.97it/s][A
55it [00:01, 44.37it/s][A
60it [00:01, 43.25it/s][A
65it [00:01, 44.04it/s][A
70it [00:01, 43.08it/s][A
75it [00:01, 43.55it/s][A
80it [00:01, 43.78it/s][A
85it [00:01, 42.63it/s][A
90it [00:02, 43.35it/s][A
95it [00:02, 43.58it/s][A
100it [00:02, 43.76it/s][A
105it [00:02, 43.00it/s][A

Epoch: 455, Step: 100, Loss: 4.461678805351258



110it [00:02, 43.07it/s][A
115it [00:02, 43.43it/s][A
120it [00:02, 43.66it/s][A
125it [00:02, 44.18it/s][A
130it [00:02, 44.56it/s][A
135it [00:03, 44.46it/s][A
140it [00:03, 43.26it/s][A
145it [00:03, 44.21it/s][A
150it [00:03, 44.28it/s][A
155it [00:03, 44.88it/s][A
160it [00:03, 45.16it/s][A
165it [00:03, 45.53it/s][A
170it [00:03, 45.68it/s][A
175it [00:03, 45.61it/s][A
180it [00:04, 45.63it/s][A
185it [00:04, 45.48it/s][A
190it [00:04, 45.58it/s][A
195it [00:04, 45.52it/s][A
200it [00:04, 45.59it/s][A
205it [00:04, 44.32it/s][A

Epoch: 455, Step: 200, Loss: 4.483377046585083



210it [00:04, 44.81it/s][A
215it [00:04, 44.86it/s][A
220it [00:04, 44.58it/s][A
227it [00:05, 44.10it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.15it/s][A
12it [00:00, 59.35it/s][A
19it [00:00, 59.96it/s][A
26it [00:00, 60.41it/s][A
33it [00:00, 60.35it/s][A
40it [00:00, 60.49it/s][A
47it [00:00, 57.89it/s][A
54it [00:00, 58.76it/s][A
61it [00:01, 59.34it/s][A
68it [00:01, 59.77it/s][A
75it [00:01, 60.10it/s][A
82it [00:01, 60.41it/s][A
89it [00:01, 60.52it/s][A
96it [00:01, 60.16it/s][A
103it [00:01, 60.42it/s][A
110it [00:01, 60.51it/s][A
117it [00:01, 60.58it/s][A
124it [00:02, 60.72it/s][A
131it [00:02, 60.82it/s][A
138it [00:02, 60.68it/s][A
145it [00:02, 60.51it/s][A
152it [00:02, 60.47it/s][A
159it [00:02, 60.47it/s][A
166it [00:02, 60.80it/s][A
173it [00:02, 60.28it/s][A
180it [00:02, 60.45it/s][A
187it [00:03, 60.90it/s][A
194it [00:03, 61.10it/s][A
201it [00:03, 60.82it/s][A
208it [00:03, 60.93it/s][A
215it [00:03, 60.99it/s][A
222it [00:03, 


Epoch: 455, Test Loss: 5.57304000336191, Test Perplexity: 264.29510768155876




0it [00:00, ?it/s][A
5it [00:00, 45.49it/s][A
10it [00:00, 45.43it/s][A
15it [00:00, 43.87it/s][A
20it [00:00, 44.74it/s][A
25it [00:00, 45.12it/s][A
30it [00:00, 45.36it/s][A
35it [00:00, 43.87it/s][A
40it [00:00, 44.66it/s][A
45it [00:01, 44.78it/s][A
50it [00:01, 45.07it/s][A
55it [00:01, 44.99it/s][A
60it [00:01, 45.08it/s][A
65it [00:01, 45.37it/s][A
70it [00:01, 45.36it/s][A
75it [00:01, 43.38it/s][A
80it [00:01, 44.23it/s][A
85it [00:01, 44.44it/s][A
90it [00:02, 44.98it/s][A
95it [00:02, 44.80it/s][A
100it [00:02, 45.11it/s][A
105it [00:02, 45.35it/s][A

Epoch: 456, Step: 100, Loss: 4.4690622186660764



110it [00:02, 45.56it/s][A
115it [00:02, 45.59it/s][A
120it [00:02, 45.69it/s][A
125it [00:02, 45.92it/s][A
130it [00:02, 44.65it/s][A
135it [00:03, 44.32it/s][A
140it [00:03, 44.92it/s][A
145it [00:03, 45.30it/s][A
150it [00:03, 45.25it/s][A
155it [00:03, 45.27it/s][A
160it [00:03, 45.08it/s][A
165it [00:03, 45.22it/s][A
170it [00:03, 45.43it/s][A
175it [00:03, 45.51it/s][A
180it [00:04, 44.21it/s][A
185it [00:04, 44.79it/s][A
190it [00:04, 44.76it/s][A
195it [00:04, 44.91it/s][A
200it [00:04, 45.33it/s][A
205it [00:04, 45.56it/s][A

Epoch: 456, Step: 200, Loss: 4.4798226118087765



210it [00:04, 45.04it/s][A
215it [00:04, 44.85it/s][A
220it [00:04, 44.26it/s][A
227it [00:05, 44.93it/s]
 91%|█████████ | 456/500 [54:37<05:45,  7.85s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.68it/s][A
10it [00:00, 45.02it/s][A
15it [00:00, 45.30it/s][A
20it [00:00, 45.61it/s][A
25it [00:00, 45.63it/s][A
30it [00:00, 45.06it/s][A
35it [00:00, 44.83it/s][A
40it [00:00, 44.84it/s][A
45it [00:00, 44.92it/s][A
50it [00:01, 44.75it/s][A
55it [00:01, 43.29it/s][A
60it [00:01, 43.55it/s][A
65it [00:01, 43.72it/s][A
70it [00:01, 44.38it/s][A
75it [00:01, 44.75it/s][A
80it [00:01, 45.34it/s][A
85it [00:01, 45.47it/s][A
90it [00:02, 45.55it/s][A
95it [00:02, 45.73it/s][A
100it [00:02, 45.93it/s][A
105it [00:02, 43.91it/s][A

Epoch: 457, Step: 100, Loss: 4.462006087303162



110it [00:02, 44.02it/s][A
115it [00:02, 43.90it/s][A
120it [00:02, 44.73it/s][A
125it [00:02, 43.62it/s][A
130it [00:02, 44.12it/s][A
135it [00:03, 44.28it/s][A
140it [00:03, 44.46it/s][A
145it [00:03, 44.39it/s][A
150it [00:03, 43.73it/s][A
155it [00:03, 44.31it/s][A
160it [00:03, 44.71it/s][A
165it [00:03, 43.56it/s][A
170it [00:03, 44.07it/s][A
175it [00:03, 44.46it/s][A
180it [00:04, 44.59it/s][A
185it [00:04, 44.82it/s][A
190it [00:04, 43.26it/s][A
195it [00:04, 42.29it/s][A
200it [00:04, 42.12it/s][A
205it [00:04, 41.45it/s][A

Epoch: 457, Step: 200, Loss: 4.4788776206970216



210it [00:04, 42.14it/s][A
215it [00:04, 42.86it/s][A
220it [00:04, 43.32it/s][A
227it [00:05, 44.16it/s]
 91%|█████████▏| 457/500 [54:42<05:02,  7.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.98it/s][A
10it [00:00, 46.14it/s][A
15it [00:00, 45.44it/s][A
20it [00:00, 45.59it/s][A
25it [00:00, 45.69it/s][A
30it [00:00, 45.84it/s][A
35it [00:00, 45.87it/s][A
40it [00:00, 45.82it/s][A
45it [00:00, 46.00it/s][A
50it [00:01, 46.20it/s][A
55it [00:01, 45.96it/s][A
60it [00:01, 43.69it/s][A
65it [00:01, 42.69it/s][A
70it [00:01, 43.54it/s][A
75it [00:01, 43.91it/s][A
80it [00:01, 44.15it/s][A
85it [00:01, 42.76it/s][A
90it [00:02, 43.51it/s][A
95it [00:02, 44.22it/s][A
100it [00:02, 44.49it/s][A
105it [00:02, 44.62it/s][A

Epoch: 458, Step: 100, Loss: 4.468770008087159



110it [00:02, 44.80it/s][A
115it [00:02, 45.18it/s][A
120it [00:02, 44.38it/s][A
125it [00:02, 44.63it/s][A
130it [00:02, 44.93it/s][A
135it [00:03, 45.20it/s][A
140it [00:03, 45.46it/s][A
145it [00:03, 45.62it/s][A
150it [00:03, 45.74it/s][A
155it [00:03, 45.95it/s][A
160it [00:03, 45.87it/s][A
165it [00:03, 45.81it/s][A
170it [00:03, 45.35it/s][A
175it [00:03, 45.42it/s][A
180it [00:03, 45.50it/s][A
185it [00:04, 45.64it/s][A
190it [00:04, 45.60it/s][A
195it [00:04, 44.08it/s][A
200it [00:04, 44.78it/s][A
205it [00:04, 44.94it/s][A

Epoch: 458, Step: 200, Loss: 4.479075055122376



210it [00:04, 45.17it/s][A
215it [00:04, 44.12it/s][A
220it [00:04, 44.83it/s][A
227it [00:05, 44.86it/s]
 92%|█████████▏| 458/500 [54:48<04:30,  6.45s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.89it/s][A
10it [00:00, 45.74it/s][A
15it [00:00, 45.80it/s][A
20it [00:00, 45.72it/s][A
25it [00:00, 45.71it/s][A
30it [00:00, 45.83it/s][A
35it [00:00, 46.06it/s][A
40it [00:00, 45.94it/s][A
45it [00:00, 45.29it/s][A
50it [00:01, 44.11it/s][A
55it [00:01, 44.63it/s][A
60it [00:01, 44.88it/s][A
65it [00:01, 45.06it/s][A
70it [00:01, 45.30it/s][A
75it [00:01, 45.18it/s][A
80it [00:01, 45.35it/s][A
85it [00:01, 44.18it/s][A
90it [00:01, 44.44it/s][A
95it [00:02, 44.30it/s][A
100it [00:02, 44.12it/s][A
105it [00:02, 44.42it/s][A

Epoch: 459, Step: 100, Loss: 4.476419291496277



110it [00:02, 44.87it/s][A
115it [00:02, 45.28it/s][A
120it [00:02, 45.27it/s][A
125it [00:02, 45.35it/s][A
130it [00:02, 45.51it/s][A
135it [00:02, 45.73it/s][A
140it [00:03, 45.81it/s][A
145it [00:03, 45.88it/s][A
150it [00:03, 46.00it/s][A
155it [00:03, 46.06it/s][A
160it [00:03, 46.09it/s][A
165it [00:03, 46.02it/s][A
170it [00:03, 45.89it/s][A
175it [00:03, 45.85it/s][A
180it [00:03, 45.99it/s][A
185it [00:04, 45.74it/s][A
190it [00:04, 45.81it/s][A
195it [00:04, 46.01it/s][A
200it [00:04, 45.67it/s][A
205it [00:04, 45.50it/s][A

Epoch: 459, Step: 200, Loss: 4.481384222507477



210it [00:04, 45.16it/s][A
215it [00:04, 45.31it/s][A
220it [00:04, 43.49it/s][A
227it [00:05, 45.18it/s]
 92%|█████████▏| 459/500 [54:53<04:06,  6.02s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.07it/s][A
10it [00:00, 45.90it/s][A
15it [00:00, 45.23it/s][A
20it [00:00, 43.38it/s][A
25it [00:00, 42.00it/s][A
30it [00:00, 42.74it/s][A
35it [00:00, 43.58it/s][A
40it [00:00, 43.91it/s][A
45it [00:01, 44.41it/s][A
50it [00:01, 44.56it/s][A
55it [00:01, 44.97it/s][A
60it [00:01, 45.16it/s][A
65it [00:01, 45.07it/s][A
70it [00:01, 43.17it/s][A
75it [00:01, 43.55it/s][A
80it [00:01, 42.49it/s][A
85it [00:01, 43.49it/s][A
90it [00:02, 44.02it/s][A
95it [00:02, 44.58it/s][A
100it [00:02, 44.73it/s][A
105it [00:02, 45.03it/s][A

Epoch: 460, Step: 100, Loss: 4.468500895500183



110it [00:02, 44.64it/s][A
115it [00:02, 45.07it/s][A
120it [00:02, 45.42it/s][A
125it [00:02, 45.56it/s][A
130it [00:02, 43.91it/s][A
135it [00:03, 44.32it/s][A
140it [00:03, 44.29it/s][A
145it [00:03, 44.29it/s][A
150it [00:03, 44.62it/s][A
155it [00:03, 44.75it/s][A
160it [00:03, 44.57it/s][A
165it [00:03, 44.83it/s][A
170it [00:03, 44.64it/s][A
175it [00:03, 45.17it/s][A
180it [00:04, 45.23it/s][A
185it [00:04, 45.58it/s][A
190it [00:04, 45.88it/s][A
195it [00:04, 45.89it/s][A
200it [00:04, 44.49it/s][A
205it [00:04, 42.80it/s][A

Epoch: 460, Step: 200, Loss: 4.479074010848999



210it [00:04, 43.41it/s][A
215it [00:04, 44.09it/s][A
220it [00:04, 44.62it/s][A
227it [00:05, 44.46it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.58it/s][A
12it [00:00, 58.79it/s][A
18it [00:00, 55.11it/s][A
25it [00:00, 57.41it/s][A
31it [00:00, 55.50it/s][A
38it [00:00, 57.15it/s][A
45it [00:00, 58.66it/s][A
52it [00:00, 59.72it/s][A
59it [00:01, 60.32it/s][A
66it [00:01, 60.69it/s][A
73it [00:01, 59.46it/s][A
80it [00:01, 59.80it/s][A
87it [00:01, 60.15it/s][A
94it [00:01, 59.74it/s][A
101it [00:01, 60.01it/s][A
108it [00:01, 60.36it/s][A
115it [00:01, 60.73it/s][A
122it [00:02, 59.49it/s][A
128it [00:02, 59.58it/s][A
134it [00:02, 59.20it/s][A
141it [00:02, 59.70it/s][A
147it [00:02, 59.52it/s][A
153it [00:02, 59.48it/s][A
160it [00:02, 59.69it/s][A
166it [00:02, 59.53it/s][A
173it [00:02, 60.05it/s][A
180it [00:03, 59.90it/s][A
187it [00:03, 60.25it/s][A
194it [00:03, 59.56it/s][A
200it [00:03, 58.86it/s][A
206it [00:03, 58.26it/s][A
212it [00:03, 


Epoch: 460, Test Loss: 5.569769995553153, Test Perplexity: 263.5747432590271




0it [00:00, ?it/s][A
5it [00:00, 42.53it/s][A
10it [00:00, 44.23it/s][A
15it [00:00, 45.15it/s][A
20it [00:00, 45.53it/s][A
25it [00:00, 45.58it/s][A
30it [00:00, 43.86it/s][A
35it [00:00, 44.35it/s][A
40it [00:00, 44.87it/s][A
45it [00:01, 44.94it/s][A
50it [00:01, 45.08it/s][A
55it [00:01, 43.65it/s][A
60it [00:01, 44.42it/s][A
65it [00:01, 44.56it/s][A
70it [00:01, 44.95it/s][A
75it [00:01, 45.11it/s][A
80it [00:01, 44.98it/s][A
85it [00:01, 45.26it/s][A
90it [00:02, 45.03it/s][A
95it [00:02, 45.34it/s][A
100it [00:02, 45.56it/s][A
105it [00:02, 45.65it/s][A

Epoch: 461, Step: 100, Loss: 4.461325941085815



110it [00:02, 45.50it/s][A
115it [00:02, 45.72it/s][A
120it [00:02, 45.71it/s][A
125it [00:02, 45.44it/s][A
130it [00:02, 45.24it/s][A
135it [00:03, 44.55it/s][A
140it [00:03, 42.72it/s][A
145it [00:03, 43.44it/s][A
150it [00:03, 42.85it/s][A
155it [00:03, 43.42it/s][A
160it [00:03, 43.80it/s][A
165it [00:03, 44.01it/s][A
170it [00:03, 43.66it/s][A
175it [00:03, 43.86it/s][A
180it [00:04, 44.03it/s][A
185it [00:04, 44.22it/s][A
190it [00:04, 44.35it/s][A
195it [00:04, 44.27it/s][A
200it [00:04, 43.58it/s][A
205it [00:04, 44.19it/s][A

Epoch: 461, Step: 200, Loss: 4.476193554401398



210it [00:04, 44.40it/s][A
215it [00:04, 44.80it/s][A
220it [00:04, 45.13it/s][A
227it [00:05, 44.59it/s]
 92%|█████████▏| 461/500 [55:14<05:05,  7.84s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.56it/s][A
10it [00:00, 41.70it/s][A
15it [00:00, 43.53it/s][A
20it [00:00, 42.25it/s][A
25it [00:00, 42.88it/s][A
30it [00:00, 43.81it/s][A
35it [00:00, 44.34it/s][A
40it [00:00, 44.26it/s][A
45it [00:01, 44.89it/s][A
50it [00:01, 44.68it/s][A
55it [00:01, 45.02it/s][A
60it [00:01, 45.14it/s][A
65it [00:01, 44.99it/s][A
70it [00:01, 44.95it/s][A
75it [00:01, 43.49it/s][A
80it [00:01, 44.20it/s][A
85it [00:01, 44.61it/s][A
90it [00:02, 44.92it/s][A
95it [00:02, 45.22it/s][A
100it [00:02, 45.53it/s][A
105it [00:02, 45.64it/s][A

Epoch: 462, Step: 100, Loss: 4.461199502944947



110it [00:02, 45.20it/s][A
115it [00:02, 45.63it/s][A
120it [00:02, 45.65it/s][A
125it [00:02, 45.75it/s][A
130it [00:02, 45.61it/s][A
135it [00:03, 44.01it/s][A
140it [00:03, 44.70it/s][A
145it [00:03, 44.70it/s][A
150it [00:03, 44.65it/s][A
155it [00:03, 44.80it/s][A
160it [00:03, 44.81it/s][A
165it [00:03, 44.95it/s][A
170it [00:03, 45.16it/s][A
175it [00:03, 45.45it/s][A
180it [00:04, 45.60it/s][A
185it [00:04, 45.58it/s][A
190it [00:04, 45.27it/s][A
195it [00:04, 45.41it/s][A
200it [00:04, 45.35it/s][A
205it [00:04, 45.34it/s][A

Epoch: 462, Step: 200, Loss: 4.4781933736801145



210it [00:04, 45.34it/s][A
215it [00:04, 45.28it/s][A
220it [00:04, 45.52it/s][A
227it [00:05, 44.80it/s]
 92%|█████████▏| 462/500 [55:19<04:26,  7.01s/it]
0it [00:00, ?it/s][A
4it [00:00, 37.05it/s][A
9it [00:00, 42.30it/s][A
14it [00:00, 42.95it/s][A
19it [00:00, 44.15it/s][A
24it [00:00, 44.72it/s][A
29it [00:00, 45.04it/s][A
34it [00:00, 43.24it/s][A
39it [00:00, 44.18it/s][A
44it [00:01, 44.77it/s][A
49it [00:01, 43.20it/s][A
54it [00:01, 42.16it/s][A
59it [00:01, 43.08it/s][A
64it [00:01, 43.84it/s][A
69it [00:01, 42.92it/s][A
74it [00:01, 42.57it/s][A
79it [00:01, 43.48it/s][A
84it [00:01, 42.93it/s][A
89it [00:02, 43.86it/s][A
94it [00:02, 44.54it/s][A
99it [00:02, 44.82it/s][A
104it [00:02, 45.11it/s][A
109it [00:02, 45.46it/s][A

Epoch: 463, Step: 100, Loss: 4.465060567855835



114it [00:02, 45.33it/s][A
119it [00:02, 45.62it/s][A
124it [00:02, 45.68it/s][A
129it [00:02, 45.73it/s][A
134it [00:03, 45.86it/s][A
139it [00:03, 45.78it/s][A
144it [00:03, 44.57it/s][A
149it [00:03, 43.29it/s][A
154it [00:03, 44.00it/s][A
159it [00:03, 44.25it/s][A
164it [00:03, 44.63it/s][A
169it [00:03, 43.21it/s][A
174it [00:03, 43.69it/s][A
179it [00:04, 44.13it/s][A
184it [00:04, 43.72it/s][A
189it [00:04, 44.28it/s][A
194it [00:04, 44.24it/s][A
199it [00:04, 44.56it/s][A
204it [00:04, 44.37it/s][A
209it [00:04, 44.73it/s][A

Epoch: 463, Step: 200, Loss: 4.475691602230072



214it [00:04, 44.84it/s][A
219it [00:04, 45.04it/s][A
227it [00:05, 44.17it/s]
 93%|█████████▎| 463/500 [55:24<03:58,  6.45s/it]
0it [00:00, ?it/s][A
4it [00:00, 38.03it/s][A
9it [00:00, 40.70it/s][A
14it [00:00, 42.00it/s][A
19it [00:00, 41.13it/s][A
24it [00:00, 42.72it/s][A
29it [00:00, 43.80it/s][A
34it [00:00, 44.16it/s][A
39it [00:00, 44.77it/s][A
44it [00:01, 45.07it/s][A
49it [00:01, 43.97it/s][A
54it [00:01, 44.46it/s][A
59it [00:01, 44.92it/s][A
64it [00:01, 43.07it/s][A
69it [00:01, 43.76it/s][A
74it [00:01, 43.94it/s][A
79it [00:01, 44.64it/s][A
84it [00:01, 45.10it/s][A
89it [00:02, 44.95it/s][A
94it [00:02, 45.38it/s][A
99it [00:02, 45.51it/s][A
104it [00:02, 45.28it/s][A
109it [00:02, 45.22it/s][A

Epoch: 464, Step: 100, Loss: 4.464664616584778



114it [00:02, 45.39it/s][A
119it [00:02, 45.63it/s][A
124it [00:02, 44.19it/s][A
129it [00:02, 44.60it/s][A
134it [00:03, 44.98it/s][A
139it [00:03, 45.35it/s][A
144it [00:03, 45.43it/s][A
149it [00:03, 44.71it/s][A
154it [00:03, 44.96it/s][A
159it [00:03, 44.70it/s][A
164it [00:03, 45.14it/s][A
169it [00:03, 45.66it/s][A
174it [00:03, 45.89it/s][A
179it [00:04, 45.58it/s][A
184it [00:04, 45.90it/s][A
189it [00:04, 45.79it/s][A
194it [00:04, 45.26it/s][A
199it [00:04, 45.71it/s][A
204it [00:04, 44.23it/s][A

Epoch: 464, Step: 200, Loss: 4.47830331325531



209it [00:04, 44.55it/s][A
214it [00:04, 44.92it/s][A
219it [00:04, 44.77it/s][A
227it [00:05, 44.49it/s]
 93%|█████████▎| 464/500 [55:29<03:37,  6.05s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.45it/s][A
10it [00:00, 45.54it/s][A
15it [00:00, 45.10it/s][A
20it [00:00, 45.25it/s][A
25it [00:00, 44.99it/s][A
30it [00:00, 45.14it/s][A
35it [00:00, 45.16it/s][A
40it [00:00, 45.23it/s][A
45it [00:01, 44.27it/s][A
50it [00:01, 44.82it/s][A
55it [00:01, 44.79it/s][A
60it [00:01, 44.98it/s][A
65it [00:01, 45.05it/s][A
70it [00:01, 45.27it/s][A
75it [00:01, 45.60it/s][A
80it [00:01, 45.53it/s][A
85it [00:01, 45.74it/s][A
90it [00:01, 45.99it/s][A
95it [00:02, 44.63it/s][A
100it [00:02, 45.01it/s][A
105it [00:02, 44.93it/s][A

Epoch: 465, Step: 100, Loss: 4.46267689704895



110it [00:02, 43.68it/s][A
115it [00:02, 42.10it/s][A
120it [00:02, 43.10it/s][A
125it [00:02, 44.36it/s][A
130it [00:02, 44.88it/s][A
135it [00:03, 45.20it/s][A
140it [00:03, 45.57it/s][A
145it [00:03, 45.01it/s][A
150it [00:03, 45.21it/s][A
155it [00:03, 44.70it/s][A
160it [00:03, 45.32it/s][A
165it [00:03, 45.43it/s][A
170it [00:03, 45.61it/s][A
175it [00:03, 45.68it/s][A
180it [00:03, 45.71it/s][A
185it [00:04, 45.27it/s][A
190it [00:04, 45.32it/s][A
195it [00:04, 44.98it/s][A
200it [00:04, 43.49it/s][A
205it [00:04, 43.82it/s][A

Epoch: 465, Step: 200, Loss: 4.476056125164032



210it [00:04, 42.39it/s][A
215it [00:04, 43.28it/s][A
220it [00:04, 43.93it/s][A
227it [00:05, 44.77it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.93it/s][A
13it [00:00, 59.82it/s][A
19it [00:00, 59.54it/s][A
25it [00:00, 59.35it/s][A
32it [00:00, 59.97it/s][A
38it [00:00, 59.91it/s][A
44it [00:00, 59.15it/s][A
50it [00:00, 58.81it/s][A
56it [00:00, 58.89it/s][A
62it [00:01, 55.54it/s][A
68it [00:01, 56.58it/s][A
74it [00:01, 54.91it/s][A
80it [00:01, 55.80it/s][A
86it [00:01, 54.88it/s][A
92it [00:01, 53.12it/s][A
98it [00:01, 54.87it/s][A
104it [00:01, 56.20it/s][A
110it [00:01, 57.23it/s][A
116it [00:02, 55.73it/s][A
122it [00:02, 56.80it/s][A
128it [00:02, 57.70it/s][A
134it [00:02, 58.37it/s][A
140it [00:02, 58.84it/s][A
147it [00:02, 59.42it/s][A
153it [00:02, 59.46it/s][A
159it [00:02, 57.24it/s][A
165it [00:02, 57.94it/s][A
172it [00:02, 58.84it/s][A
179it [00:03, 59.43it/s][A
186it [00:03, 59.74it/s][A
192it [00:03, 57.34it/s][A
199it [00:03, 58


Epoch: 465, Test Loss: 5.575150943690946, Test Perplexity: 264.9817526444145




0it [00:00, ?it/s][A
5it [00:00, 45.88it/s][A
10it [00:00, 45.15it/s][A
15it [00:00, 45.53it/s][A
20it [00:00, 45.53it/s][A
25it [00:00, 45.42it/s][A
30it [00:00, 45.13it/s][A
35it [00:00, 45.05it/s][A
40it [00:00, 44.91it/s][A
45it [00:00, 44.83it/s][A
50it [00:01, 44.58it/s][A
55it [00:01, 44.97it/s][A
60it [00:01, 44.99it/s][A
65it [00:01, 43.46it/s][A
70it [00:01, 44.00it/s][A
75it [00:01, 42.62it/s][A
80it [00:01, 42.84it/s][A
85it [00:01, 43.30it/s][A
90it [00:02, 43.51it/s][A
95it [00:02, 43.76it/s][A
100it [00:02, 44.06it/s][A
105it [00:02, 44.23it/s][A

Epoch: 466, Step: 100, Loss: 4.467668118476868



110it [00:02, 44.12it/s][A
115it [00:02, 43.42it/s][A
120it [00:02, 43.92it/s][A
125it [00:02, 43.13it/s][A
130it [00:02, 42.33it/s][A
135it [00:03, 41.62it/s][A
140it [00:03, 42.71it/s][A
145it [00:03, 42.82it/s][A
150it [00:03, 43.26it/s][A
155it [00:03, 44.03it/s][A
160it [00:03, 44.40it/s][A
165it [00:03, 44.63it/s][A
170it [00:03, 44.93it/s][A
175it [00:03, 44.59it/s][A
180it [00:04, 44.48it/s][A
185it [00:04, 44.70it/s][A
190it [00:04, 44.74it/s][A
195it [00:04, 43.87it/s][A
200it [00:04, 43.23it/s][A
205it [00:04, 44.17it/s][A

Epoch: 466, Step: 200, Loss: 4.478398699760437



210it [00:04, 44.27it/s][A
215it [00:04, 44.56it/s][A
220it [00:04, 45.07it/s][A
227it [00:05, 44.15it/s]
 93%|█████████▎| 466/500 [55:50<04:27,  7.87s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.74it/s][A
10it [00:00, 41.61it/s][A
15it [00:00, 42.92it/s][A
20it [00:00, 42.42it/s][A
25it [00:00, 42.39it/s][A
30it [00:00, 43.32it/s][A
35it [00:00, 42.38it/s][A
40it [00:00, 43.56it/s][A
45it [00:01, 44.31it/s][A
50it [00:01, 44.73it/s][A
55it [00:01, 45.25it/s][A
60it [00:01, 44.58it/s][A
65it [00:01, 45.04it/s][A
70it [00:01, 45.14it/s][A
75it [00:01, 44.97it/s][A
80it [00:01, 45.04it/s][A
85it [00:01, 44.91it/s][A
90it [00:02, 44.90it/s][A
95it [00:02, 44.89it/s][A
100it [00:02, 45.01it/s][A
105it [00:02, 45.15it/s][A

Epoch: 467, Step: 100, Loss: 4.463285164833069



110it [00:02, 44.88it/s][A
115it [00:02, 44.95it/s][A
120it [00:02, 45.01it/s][A
125it [00:02, 44.82it/s][A
130it [00:02, 45.07it/s][A
135it [00:03, 45.31it/s][A
140it [00:03, 45.21it/s][A
145it [00:03, 45.13it/s][A
150it [00:03, 44.49it/s][A
155it [00:03, 44.95it/s][A
160it [00:03, 44.02it/s][A
165it [00:03, 44.34it/s][A
170it [00:03, 44.41it/s][A
175it [00:03, 44.88it/s][A
180it [00:04, 44.82it/s][A
185it [00:04, 44.91it/s][A
190it [00:04, 44.23it/s][A
195it [00:04, 44.52it/s][A
200it [00:04, 44.76it/s][A
205it [00:04, 44.72it/s][A

Epoch: 467, Step: 200, Loss: 4.4724959683418275



210it [00:04, 45.09it/s][A
215it [00:04, 44.65it/s][A
220it [00:04, 44.94it/s][A
227it [00:05, 44.56it/s]
 93%|█████████▎| 467/500 [55:55<03:52,  7.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.28it/s][A
10it [00:00, 43.09it/s][A
15it [00:00, 44.46it/s][A
20it [00:00, 45.08it/s][A
25it [00:00, 45.25it/s][A
30it [00:00, 45.50it/s][A
35it [00:00, 45.75it/s][A
40it [00:00, 45.83it/s][A
45it [00:01, 44.01it/s][A
50it [00:01, 44.77it/s][A
55it [00:01, 45.24it/s][A
60it [00:01, 45.32it/s][A
65it [00:01, 45.64it/s][A
70it [00:01, 44.68it/s][A
75it [00:01, 45.38it/s][A
80it [00:01, 45.52it/s][A
85it [00:01, 45.53it/s][A
90it [00:02, 44.39it/s][A
95it [00:02, 44.30it/s][A
100it [00:02, 44.24it/s][A
105it [00:02, 44.59it/s][A

Epoch: 468, Step: 100, Loss: 4.469289984703064



110it [00:02, 44.98it/s][A
115it [00:02, 43.58it/s][A
120it [00:02, 44.44it/s][A
125it [00:02, 44.46it/s][A
130it [00:02, 44.97it/s][A
135it [00:03, 44.89it/s][A
140it [00:03, 42.45it/s][A
145it [00:03, 43.15it/s][A
150it [00:03, 43.66it/s][A
155it [00:03, 44.18it/s][A
160it [00:03, 44.90it/s][A
165it [00:03, 45.40it/s][A
170it [00:03, 43.99it/s][A
175it [00:03, 43.31it/s][A
180it [00:04, 43.99it/s][A
185it [00:04, 44.43it/s][A
190it [00:04, 45.08it/s][A
195it [00:04, 45.48it/s][A
200it [00:04, 45.81it/s][A
205it [00:04, 44.36it/s][A

Epoch: 468, Step: 200, Loss: 4.4779409146308895



210it [00:04, 43.12it/s][A
215it [00:04, 43.62it/s][A
220it [00:04, 44.11it/s][A
227it [00:05, 44.46it/s]
 94%|█████████▎| 468/500 [56:00<03:26,  6.46s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.56it/s][A
10it [00:00, 44.81it/s][A
15it [00:00, 44.82it/s][A
20it [00:00, 44.68it/s][A
25it [00:00, 44.86it/s][A
30it [00:00, 45.23it/s][A
35it [00:00, 45.40it/s][A
40it [00:00, 45.43it/s][A
45it [00:00, 45.37it/s][A
50it [00:01, 45.39it/s][A
55it [00:01, 45.40it/s][A
60it [00:01, 45.19it/s][A
65it [00:01, 45.31it/s][A
70it [00:01, 45.40it/s][A
75it [00:01, 45.57it/s][A
80it [00:01, 45.89it/s][A
85it [00:01, 45.98it/s][A
90it [00:01, 46.01it/s][A
95it [00:02, 46.27it/s][A
100it [00:02, 46.29it/s][A
105it [00:02, 46.56it/s][A

Epoch: 469, Step: 100, Loss: 4.457584013938904



110it [00:02, 44.50it/s][A
115it [00:02, 45.16it/s][A
120it [00:02, 45.43it/s][A
125it [00:02, 45.65it/s][A
130it [00:02, 45.27it/s][A
135it [00:02, 45.47it/s][A
140it [00:03, 45.23it/s][A
145it [00:03, 45.49it/s][A
150it [00:03, 45.57it/s][A
155it [00:03, 45.23it/s][A
160it [00:03, 45.46it/s][A
165it [00:03, 45.51it/s][A
170it [00:03, 44.61it/s][A
175it [00:03, 44.58it/s][A
180it [00:03, 43.44it/s][A
185it [00:04, 44.19it/s][A
190it [00:04, 44.40it/s][A
195it [00:04, 44.57it/s][A
200it [00:04, 44.68it/s][A
205it [00:04, 44.88it/s][A

Epoch: 469, Step: 200, Loss: 4.475445868968964



210it [00:04, 43.15it/s][A
215it [00:04, 43.91it/s][A
220it [00:04, 44.00it/s][A
227it [00:05, 45.04it/s]
 94%|█████████▍| 469/500 [56:05<03:07,  6.03s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.37it/s][A
10it [00:00, 45.91it/s][A
15it [00:00, 45.26it/s][A
20it [00:00, 45.16it/s][A
25it [00:00, 44.94it/s][A
30it [00:00, 45.06it/s][A
35it [00:00, 45.04it/s][A
40it [00:00, 42.68it/s][A
45it [00:01, 43.41it/s][A
50it [00:01, 42.00it/s][A
55it [00:01, 42.84it/s][A
60it [00:01, 43.41it/s][A
65it [00:01, 44.23it/s][A
70it [00:01, 44.86it/s][A
75it [00:01, 44.90it/s][A
80it [00:01, 44.98it/s][A
85it [00:01, 45.03it/s][A
90it [00:02, 43.79it/s][A
95it [00:02, 43.02it/s][A
100it [00:02, 44.08it/s][A
105it [00:02, 43.07it/s][A

Epoch: 470, Step: 100, Loss: 4.468137068748474



110it [00:02, 43.54it/s][A
115it [00:02, 44.29it/s][A
120it [00:02, 44.47it/s][A
125it [00:02, 44.72it/s][A
130it [00:02, 44.69it/s][A
135it [00:03, 45.01it/s][A
140it [00:03, 45.46it/s][A
145it [00:03, 44.37it/s][A
150it [00:03, 43.56it/s][A
155it [00:03, 42.41it/s][A
160it [00:03, 43.36it/s][A
165it [00:03, 44.00it/s][A
170it [00:03, 44.72it/s][A
175it [00:03, 44.65it/s][A
180it [00:04, 44.59it/s][A
185it [00:04, 44.73it/s][A
190it [00:04, 44.78it/s][A
195it [00:04, 44.66it/s][A
200it [00:04, 44.03it/s][A
205it [00:04, 44.24it/s][A

Epoch: 470, Step: 200, Loss: 4.475642590522766



210it [00:04, 43.88it/s][A
215it [00:04, 44.29it/s][A
220it [00:04, 44.10it/s][A
227it [00:05, 44.20it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.41it/s][A
13it [00:00, 60.65it/s][A
20it [00:00, 61.10it/s][A
27it [00:00, 61.13it/s][A
34it [00:00, 60.99it/s][A
41it [00:00, 60.72it/s][A
48it [00:00, 59.81it/s][A
54it [00:00, 59.80it/s][A
60it [00:01, 57.24it/s][A
67it [00:01, 58.24it/s][A
74it [00:01, 59.03it/s][A
81it [00:01, 59.44it/s][A
87it [00:01, 59.43it/s][A
94it [00:01, 59.99it/s][A
100it [00:01, 59.79it/s][A
107it [00:01, 60.07it/s][A
114it [00:01, 60.05it/s][A
121it [00:02, 60.33it/s][A
128it [00:02, 60.38it/s][A
135it [00:02, 57.80it/s][A
142it [00:02, 58.65it/s][A
148it [00:02, 56.30it/s][A
155it [00:02, 57.49it/s][A
161it [00:02, 55.93it/s][A
167it [00:02, 56.96it/s][A
173it [00:02, 57.47it/s][A
179it [00:03, 57.95it/s][A
186it [00:03, 58.92it/s][A
193it [00:03, 59.40it/s][A
200it [00:03, 59.82it/s][A
207it [00:03, 60.15it/s][A
214it [00:03, 


Epoch: 470, Test Loss: 5.5751914252405586, Test Perplexity: 264.9807413231512




0it [00:00, ?it/s][A
5it [00:00, 43.67it/s][A
10it [00:00, 44.03it/s][A
15it [00:00, 44.69it/s][A
20it [00:00, 44.78it/s][A
25it [00:00, 45.15it/s][A
30it [00:00, 45.22it/s][A
35it [00:00, 45.31it/s][A
40it [00:00, 45.06it/s][A
45it [00:01, 45.16it/s][A
50it [00:01, 45.30it/s][A
55it [00:01, 45.21it/s][A
60it [00:01, 44.46it/s][A
65it [00:01, 44.98it/s][A
70it [00:01, 45.11it/s][A
75it [00:01, 45.62it/s][A
80it [00:01, 43.90it/s][A
85it [00:01, 44.40it/s][A
90it [00:02, 44.69it/s][A
95it [00:02, 43.15it/s][A
100it [00:02, 43.74it/s][A
105it [00:02, 44.25it/s][A

Epoch: 471, Step: 100, Loss: 4.45794460773468



110it [00:02, 43.02it/s][A
115it [00:02, 43.38it/s][A
120it [00:02, 44.22it/s][A
125it [00:02, 44.34it/s][A
130it [00:02, 44.71it/s][A
135it [00:03, 43.75it/s][A
140it [00:03, 44.06it/s][A
145it [00:03, 44.20it/s][A
150it [00:03, 44.62it/s][A
155it [00:03, 44.37it/s][A
160it [00:03, 44.80it/s][A
165it [00:03, 43.80it/s][A
170it [00:03, 44.02it/s][A
175it [00:03, 44.44it/s][A
180it [00:04, 44.86it/s][A
185it [00:04, 43.36it/s][A
190it [00:04, 42.46it/s][A
195it [00:04, 43.35it/s][A
200it [00:04, 43.16it/s][A
205it [00:04, 43.68it/s][A

Epoch: 471, Step: 200, Loss: 4.473240015506744



210it [00:04, 44.13it/s][A
215it [00:04, 44.48it/s][A
220it [00:04, 44.77it/s][A
227it [00:05, 44.24it/s]
 94%|█████████▍| 471/500 [56:27<03:48,  7.87s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.72it/s][A
10it [00:00, 45.22it/s][A
15it [00:00, 44.69it/s][A
20it [00:00, 45.30it/s][A
25it [00:00, 45.10it/s][A
30it [00:00, 45.28it/s][A
35it [00:00, 44.27it/s][A
40it [00:00, 44.65it/s][A
45it [00:01, 44.55it/s][A
50it [00:01, 43.18it/s][A
55it [00:01, 42.33it/s][A
60it [00:01, 43.55it/s][A
65it [00:01, 44.21it/s][A
70it [00:01, 44.56it/s][A
75it [00:01, 44.86it/s][A
80it [00:01, 45.18it/s][A
85it [00:01, 45.41it/s][A
90it [00:02, 45.67it/s][A
95it [00:02, 45.74it/s][A
100it [00:02, 44.47it/s][A
105it [00:02, 42.86it/s][A

Epoch: 472, Step: 100, Loss: 4.4576579141616826



110it [00:02, 42.61it/s][A
115it [00:02, 43.06it/s][A
120it [00:02, 44.00it/s][A
125it [00:02, 43.86it/s][A
130it [00:02, 43.87it/s][A
135it [00:03, 43.43it/s][A
140it [00:03, 43.74it/s][A
145it [00:03, 44.44it/s][A
150it [00:03, 44.90it/s][A
155it [00:03, 45.23it/s][A
160it [00:03, 45.67it/s][A
165it [00:03, 46.00it/s][A
170it [00:03, 45.71it/s][A
175it [00:03, 45.57it/s][A
180it [00:04, 44.93it/s][A
185it [00:04, 44.92it/s][A
190it [00:04, 43.56it/s][A
195it [00:04, 44.19it/s][A
200it [00:04, 44.92it/s][A
205it [00:04, 45.36it/s][A

Epoch: 472, Step: 200, Loss: 4.475089197158813



210it [00:04, 45.42it/s][A
215it [00:04, 44.54it/s][A
220it [00:04, 45.32it/s][A
227it [00:05, 44.54it/s]
 94%|█████████▍| 472/500 [56:32<03:17,  7.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.44it/s][A
10it [00:00, 41.22it/s][A
15it [00:00, 43.53it/s][A
20it [00:00, 44.65it/s][A
25it [00:00, 44.63it/s][A
30it [00:00, 43.48it/s][A
35it [00:00, 44.39it/s][A
40it [00:00, 43.52it/s][A
45it [00:01, 42.98it/s][A
50it [00:01, 41.39it/s][A
55it [00:01, 40.43it/s][A
60it [00:01, 39.64it/s][A
65it [00:01, 41.47it/s][A
70it [00:01, 42.91it/s][A
75it [00:01, 43.91it/s][A
80it [00:01, 44.74it/s][A
85it [00:01, 45.19it/s][A
90it [00:02, 45.39it/s][A
95it [00:02, 44.81it/s][A
100it [00:02, 44.92it/s][A
105it [00:02, 45.06it/s][A

Epoch: 473, Step: 100, Loss: 4.460339670181274



110it [00:02, 43.86it/s][A
115it [00:02, 44.60it/s][A
120it [00:02, 45.06it/s][A
125it [00:02, 45.49it/s][A
130it [00:02, 45.77it/s][A
135it [00:03, 45.79it/s][A
140it [00:03, 46.42it/s][A
145it [00:03, 46.93it/s][A
150it [00:03, 46.90it/s][A
155it [00:03, 46.70it/s][A
160it [00:03, 46.14it/s][A
165it [00:03, 46.15it/s][A
170it [00:03, 46.09it/s][A
175it [00:03, 46.14it/s][A
180it [00:04, 46.17it/s][A
185it [00:04, 46.22it/s][A
190it [00:04, 46.04it/s][A
195it [00:04, 45.76it/s][A
200it [00:04, 45.84it/s][A
205it [00:04, 44.45it/s][A

Epoch: 473, Step: 200, Loss: 4.473887281417847



210it [00:04, 42.96it/s][A
215it [00:04, 43.56it/s][A
220it [00:04, 44.08it/s][A
227it [00:05, 44.53it/s]
 95%|█████████▍| 473/500 [56:37<02:54,  6.46s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.20it/s][A
10it [00:00, 46.15it/s][A
15it [00:00, 45.89it/s][A
20it [00:00, 45.56it/s][A
25it [00:00, 45.49it/s][A
30it [00:00, 45.55it/s][A
35it [00:00, 45.13it/s][A
40it [00:00, 43.03it/s][A
45it [00:01, 43.17it/s][A
50it [00:01, 43.55it/s][A
55it [00:01, 44.28it/s][A
60it [00:01, 42.55it/s][A
65it [00:01, 42.74it/s][A
70it [00:01, 43.19it/s][A
75it [00:01, 42.94it/s][A
80it [00:01, 41.76it/s][A
85it [00:01, 42.18it/s][A
90it [00:02, 40.78it/s][A
95it [00:02, 41.10it/s][A
100it [00:02, 42.07it/s][A
105it [00:02, 42.75it/s][A

Epoch: 474, Step: 100, Loss: 4.460190114974975



110it [00:02, 43.43it/s][A
115it [00:02, 43.98it/s][A
120it [00:02, 44.30it/s][A
125it [00:02, 43.50it/s][A
130it [00:02, 44.35it/s][A
135it [00:03, 44.43it/s][A
140it [00:03, 44.81it/s][A
145it [00:03, 44.74it/s][A
150it [00:03, 45.16it/s][A
155it [00:03, 45.16it/s][A
160it [00:03, 45.42it/s][A
165it [00:03, 45.57it/s][A
170it [00:03, 44.38it/s][A
175it [00:03, 44.67it/s][A
180it [00:04, 44.88it/s][A
185it [00:04, 44.91it/s][A
190it [00:04, 45.09it/s][A
195it [00:04, 45.08it/s][A
200it [00:04, 45.11it/s][A
205it [00:04, 45.21it/s][A

Epoch: 474, Step: 200, Loss: 4.473670344352723



210it [00:04, 45.12it/s][A
215it [00:04, 45.18it/s][A
220it [00:04, 45.31it/s][A
227it [00:05, 44.15it/s]
 95%|█████████▍| 474/500 [56:42<02:37,  6.06s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.50it/s][A
10it [00:00, 45.46it/s][A
15it [00:00, 45.53it/s][A
20it [00:00, 45.20it/s][A
25it [00:00, 45.37it/s][A
30it [00:00, 45.54it/s][A
35it [00:00, 43.38it/s][A
40it [00:00, 43.71it/s][A
45it [00:01, 42.78it/s][A
50it [00:01, 43.60it/s][A
55it [00:01, 42.13it/s][A
60it [00:01, 41.69it/s][A
65it [00:01, 42.86it/s][A
70it [00:01, 42.00it/s][A
75it [00:01, 41.17it/s][A
80it [00:01, 42.53it/s][A
85it [00:01, 41.50it/s][A
90it [00:02, 42.49it/s][A
95it [00:02, 42.09it/s][A
100it [00:02, 41.52it/s][A
105it [00:02, 42.30it/s][A

Epoch: 475, Step: 100, Loss: 4.45554636001587



110it [00:02, 42.01it/s][A
115it [00:02, 43.12it/s][A
120it [00:02, 43.99it/s][A
125it [00:02, 44.42it/s][A
130it [00:03, 44.54it/s][A
135it [00:03, 44.84it/s][A
140it [00:03, 45.00it/s][A
145it [00:03, 43.93it/s][A
150it [00:03, 44.42it/s][A
155it [00:03, 44.69it/s][A
160it [00:03, 45.16it/s][A
165it [00:03, 44.96it/s][A
170it [00:03, 44.98it/s][A
175it [00:04, 45.26it/s][A
180it [00:04, 43.56it/s][A
185it [00:04, 42.74it/s][A
190it [00:04, 42.88it/s][A
195it [00:04, 43.67it/s][A
200it [00:04, 44.26it/s][A
205it [00:04, 44.86it/s][A

Epoch: 475, Step: 200, Loss: 4.473847115039826



210it [00:04, 45.10it/s][A
215it [00:04, 45.32it/s][A
220it [00:05, 45.43it/s][A
227it [00:05, 43.82it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.58it/s][A
13it [00:00, 60.20it/s][A
20it [00:00, 60.32it/s][A
27it [00:00, 57.11it/s][A
33it [00:00, 58.02it/s][A
40it [00:00, 58.95it/s][A
47it [00:00, 59.57it/s][A
54it [00:00, 60.02it/s][A
61it [00:01, 60.09it/s][A
68it [00:01, 60.45it/s][A
75it [00:01, 60.65it/s][A
82it [00:01, 60.77it/s][A
89it [00:01, 60.93it/s][A
96it [00:01, 60.74it/s][A
103it [00:01, 59.04it/s][A
110it [00:01, 59.56it/s][A
116it [00:01, 57.69it/s][A
122it [00:02, 56.70it/s][A
129it [00:02, 57.89it/s][A
135it [00:02, 58.29it/s][A
141it [00:02, 58.73it/s][A
148it [00:02, 59.28it/s][A
154it [00:02, 59.44it/s][A
161it [00:02, 59.81it/s][A
168it [00:02, 60.14it/s][A
175it [00:02, 60.37it/s][A
182it [00:03, 60.11it/s][A
189it [00:03, 60.23it/s][A
196it [00:03, 58.06it/s][A
202it [00:03, 58.50it/s][A
209it [00:03, 59.16it/s][A
215it [00:03, 


Epoch: 475, Test Loss: 5.575485213202719, Test Perplexity: 265.05156584556056




0it [00:00, ?it/s][A
5it [00:00, 41.24it/s][A
10it [00:00, 41.17it/s][A
15it [00:00, 42.99it/s][A
20it [00:00, 43.32it/s][A
25it [00:00, 44.00it/s][A
30it [00:00, 43.33it/s][A
35it [00:00, 43.88it/s][A
40it [00:00, 41.36it/s][A
45it [00:01, 42.75it/s][A
50it [00:01, 43.18it/s][A
55it [00:01, 43.58it/s][A
60it [00:01, 44.11it/s][A
65it [00:01, 44.81it/s][A
70it [00:01, 45.27it/s][A
75it [00:01, 45.24it/s][A
80it [00:01, 43.83it/s][A
85it [00:01, 44.56it/s][A
90it [00:02, 45.02it/s][A
95it [00:02, 43.84it/s][A
100it [00:02, 44.35it/s][A
105it [00:02, 44.57it/s][A

Epoch: 476, Step: 100, Loss: 4.465530910491943



110it [00:02, 44.58it/s][A
115it [00:02, 45.13it/s][A
120it [00:02, 45.22it/s][A
125it [00:02, 45.69it/s][A
130it [00:02, 45.90it/s][A
135it [00:03, 45.79it/s][A
140it [00:03, 45.75it/s][A
145it [00:03, 45.88it/s][A
150it [00:03, 45.36it/s][A
155it [00:03, 45.53it/s][A
160it [00:03, 45.60it/s][A
165it [00:03, 44.00it/s][A
170it [00:03, 44.64it/s][A
175it [00:03, 43.52it/s][A
180it [00:04, 44.25it/s][A
185it [00:04, 44.51it/s][A
190it [00:04, 45.00it/s][A
195it [00:04, 45.36it/s][A
200it [00:04, 45.56it/s][A
205it [00:04, 45.78it/s][A

Epoch: 476, Step: 200, Loss: 4.4765535259246825



210it [00:04, 45.86it/s][A
215it [00:04, 45.92it/s][A
220it [00:04, 46.07it/s][A
227it [00:05, 44.70it/s]
 95%|█████████▌| 476/500 [57:03<03:09,  7.88s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.73it/s][A
10it [00:00, 45.92it/s][A
15it [00:00, 45.96it/s][A
20it [00:00, 46.13it/s][A
25it [00:00, 46.29it/s][A
30it [00:00, 46.17it/s][A
35it [00:00, 44.71it/s][A
40it [00:00, 45.12it/s][A
45it [00:00, 45.30it/s][A
50it [00:01, 45.53it/s][A
55it [00:01, 45.41it/s][A
60it [00:01, 45.40it/s][A
65it [00:01, 45.21it/s][A
70it [00:01, 45.41it/s][A
75it [00:01, 45.43it/s][A
80it [00:01, 45.64it/s][A
85it [00:01, 45.88it/s][A
90it [00:01, 45.66it/s][A
95it [00:02, 45.49it/s][A
100it [00:02, 44.72it/s][A
105it [00:02, 45.09it/s][A

Epoch: 477, Step: 100, Loss: 4.4584370470047



110it [00:02, 44.45it/s][A
115it [00:02, 44.79it/s][A
120it [00:02, 45.01it/s][A
125it [00:02, 45.17it/s][A
130it [00:02, 45.52it/s][A
135it [00:02, 45.00it/s][A
140it [00:03, 44.57it/s][A
145it [00:03, 44.63it/s][A
150it [00:03, 43.11it/s][A
155it [00:03, 43.70it/s][A
160it [00:03, 42.76it/s][A
165it [00:03, 44.10it/s][A
170it [00:03, 44.52it/s][A
175it [00:03, 44.44it/s][A
180it [00:04, 44.64it/s][A
185it [00:04, 44.74it/s][A
190it [00:04, 42.78it/s][A
195it [00:04, 41.38it/s][A
200it [00:04, 42.40it/s][A
205it [00:04, 43.08it/s][A

Epoch: 477, Step: 200, Loss: 4.474624607563019



210it [00:04, 43.85it/s][A
215it [00:04, 44.30it/s][A
220it [00:04, 44.38it/s][A
227it [00:05, 44.68it/s]
 95%|█████████▌| 477/500 [57:08<02:41,  7.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.28it/s][A
10it [00:00, 44.59it/s][A
15it [00:00, 44.66it/s][A
20it [00:00, 45.13it/s][A
25it [00:00, 44.79it/s][A
30it [00:00, 45.41it/s][A
35it [00:00, 44.22it/s][A
40it [00:00, 44.92it/s][A
45it [00:01, 41.52it/s][A
50it [00:01, 41.78it/s][A
55it [00:01, 42.91it/s][A
60it [00:01, 41.60it/s][A
65it [00:01, 42.76it/s][A
70it [00:01, 42.36it/s][A
75it [00:01, 43.47it/s][A
80it [00:01, 43.09it/s][A
85it [00:01, 43.98it/s][A
90it [00:02, 44.48it/s][A
95it [00:02, 43.20it/s][A
100it [00:02, 44.02it/s][A
105it [00:02, 43.69it/s][A

Epoch: 478, Step: 100, Loss: 4.456802282333374



110it [00:02, 44.18it/s][A
115it [00:02, 44.56it/s][A
120it [00:02, 44.48it/s][A
125it [00:02, 43.90it/s][A
130it [00:02, 44.50it/s][A
135it [00:03, 44.62it/s][A
140it [00:03, 44.81it/s][A
145it [00:03, 44.85it/s][A
150it [00:03, 44.83it/s][A
155it [00:03, 44.74it/s][A
160it [00:03, 43.69it/s][A
165it [00:03, 44.38it/s][A
170it [00:03, 44.38it/s][A
175it [00:03, 44.37it/s][A
180it [00:04, 44.98it/s][A
185it [00:04, 44.90it/s][A
190it [00:04, 44.96it/s][A
195it [00:04, 45.24it/s][A
200it [00:04, 45.47it/s][A
205it [00:04, 44.40it/s][A

Epoch: 478, Step: 200, Loss: 4.474718523025513



210it [00:04, 44.75it/s][A
215it [00:04, 43.90it/s][A
220it [00:04, 44.55it/s][A
227it [00:05, 44.14it/s]
 96%|█████████▌| 478/500 [57:13<02:22,  6.47s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.87it/s][A
10it [00:00, 44.82it/s][A
15it [00:00, 45.50it/s][A
20it [00:00, 45.39it/s][A
25it [00:00, 45.36it/s][A
30it [00:00, 45.46it/s][A
35it [00:00, 42.73it/s][A
40it [00:00, 42.24it/s][A
45it [00:01, 43.44it/s][A
50it [00:01, 44.26it/s][A
55it [00:01, 44.90it/s][A
60it [00:01, 45.20it/s][A
65it [00:01, 45.33it/s][A
70it [00:01, 45.55it/s][A
75it [00:01, 45.85it/s][A
80it [00:01, 44.08it/s][A
85it [00:01, 44.30it/s][A
90it [00:02, 44.88it/s][A
95it [00:02, 45.15it/s][A
100it [00:02, 44.94it/s][A
105it [00:02, 45.02it/s][A

Epoch: 479, Step: 100, Loss: 4.462499976158142



110it [00:02, 45.11it/s][A
115it [00:02, 45.36it/s][A
120it [00:02, 44.43it/s][A
125it [00:02, 44.87it/s][A
130it [00:02, 44.88it/s][A
135it [00:03, 45.20it/s][A
140it [00:03, 45.16it/s][A
145it [00:03, 45.06it/s][A
150it [00:03, 45.19it/s][A
155it [00:03, 45.25it/s][A
160it [00:03, 45.19it/s][A
165it [00:03, 45.01it/s][A
170it [00:03, 45.06it/s][A
175it [00:03, 45.12it/s][A
180it [00:04, 45.29it/s][A
185it [00:04, 45.15it/s][A
190it [00:04, 44.51it/s][A
195it [00:04, 44.52it/s][A
200it [00:04, 44.60it/s][A
205it [00:04, 43.40it/s][A

Epoch: 479, Step: 200, Loss: 4.473098478317261



210it [00:04, 42.96it/s][A
215it [00:04, 43.71it/s][A
220it [00:04, 44.26it/s][A
227it [00:05, 44.64it/s]
 96%|█████████▌| 479/500 [57:18<02:07,  6.06s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.36it/s][A
10it [00:00, 40.90it/s][A
15it [00:00, 41.29it/s][A
20it [00:00, 39.96it/s][A
25it [00:00, 41.24it/s][A
30it [00:00, 42.38it/s][A
35it [00:00, 43.17it/s][A
40it [00:00, 44.03it/s][A
45it [00:01, 44.30it/s][A
50it [00:01, 44.34it/s][A
55it [00:01, 44.80it/s][A
60it [00:01, 44.71it/s][A
65it [00:01, 43.72it/s][A
70it [00:01, 44.50it/s][A
75it [00:01, 44.29it/s][A
80it [00:01, 44.83it/s][A
85it [00:01, 45.19it/s][A
90it [00:02, 45.46it/s][A
95it [00:02, 45.68it/s][A
100it [00:02, 45.73it/s][A
105it [00:02, 45.73it/s][A

Epoch: 480, Step: 100, Loss: 4.452611441612244



110it [00:02, 45.01it/s][A
115it [00:02, 45.11it/s][A
120it [00:02, 45.59it/s][A
125it [00:02, 45.43it/s][A
130it [00:02, 45.45it/s][A
135it [00:03, 45.32it/s][A
140it [00:03, 45.20it/s][A
145it [00:03, 45.16it/s][A
150it [00:03, 45.31it/s][A
155it [00:03, 45.35it/s][A
160it [00:03, 45.26it/s][A
165it [00:03, 45.25it/s][A
170it [00:03, 45.13it/s][A
175it [00:03, 44.69it/s][A
180it [00:04, 41.86it/s][A
185it [00:04, 43.18it/s][A
190it [00:04, 43.44it/s][A
195it [00:04, 44.35it/s][A
200it [00:04, 43.05it/s][A
205it [00:04, 44.08it/s][A

Epoch: 480, Step: 200, Loss: 4.473701176643371



210it [00:04, 44.28it/s][A
215it [00:04, 44.63it/s][A
220it [00:04, 44.66it/s][A
227it [00:05, 44.35it/s]

0it [00:00, ?it/s][A
6it [00:00, 59.20it/s][A
13it [00:00, 60.35it/s][A
20it [00:00, 57.59it/s][A
27it [00:00, 58.90it/s][A
34it [00:00, 59.68it/s][A
41it [00:00, 59.97it/s][A
48it [00:00, 59.84it/s][A
55it [00:00, 60.11it/s][A
62it [00:01, 60.18it/s][A
69it [00:01, 60.58it/s][A
76it [00:01, 60.65it/s][A
83it [00:01, 60.59it/s][A
90it [00:01, 60.67it/s][A
97it [00:01, 60.92it/s][A
104it [00:01, 60.99it/s][A
111it [00:01, 58.11it/s][A
117it [00:01, 56.74it/s][A
124it [00:02, 57.91it/s][A
131it [00:02, 58.87it/s][A
138it [00:02, 59.47it/s][A
144it [00:02, 58.98it/s][A
150it [00:02, 58.63it/s][A
157it [00:02, 59.45it/s][A
164it [00:02, 59.86it/s][A
171it [00:02, 60.24it/s][A
178it [00:02, 60.42it/s][A
185it [00:03, 60.44it/s][A
192it [00:03, 60.67it/s][A
199it [00:03, 60.88it/s][A
206it [00:03, 58.71it/s][A
213it [00:03, 59.36it/s][A
220it [00:03, 


Epoch: 480, Test Loss: 5.576732345249342, Test Perplexity: 265.36440229712065




0it [00:00, ?it/s][A
5it [00:00, 46.18it/s][A
10it [00:00, 46.04it/s][A
15it [00:00, 45.41it/s][A
20it [00:00, 43.22it/s][A
25it [00:00, 44.18it/s][A
30it [00:00, 43.37it/s][A
35it [00:00, 44.36it/s][A
40it [00:00, 44.77it/s][A
45it [00:01, 43.75it/s][A
50it [00:01, 43.40it/s][A
55it [00:01, 44.05it/s][A
60it [00:01, 44.85it/s][A
65it [00:01, 45.46it/s][A
70it [00:01, 45.85it/s][A
75it [00:01, 46.24it/s][A
80it [00:01, 45.86it/s][A
85it [00:01, 45.69it/s][A
90it [00:02, 45.89it/s][A
95it [00:02, 45.42it/s][A
100it [00:02, 45.69it/s][A
105it [00:02, 45.63it/s][A

Epoch: 481, Step: 100, Loss: 4.4663350820541385



110it [00:02, 45.48it/s][A
115it [00:02, 45.23it/s][A
120it [00:02, 45.38it/s][A
125it [00:02, 45.62it/s][A
130it [00:02, 45.83it/s][A
135it [00:02, 46.06it/s][A
140it [00:03, 46.07it/s][A
145it [00:03, 46.26it/s][A
150it [00:03, 46.26it/s][A
155it [00:03, 45.29it/s][A
160it [00:03, 45.42it/s][A
165it [00:03, 45.44it/s][A
170it [00:03, 46.04it/s][A
175it [00:03, 46.51it/s][A
180it [00:03, 44.76it/s][A
185it [00:04, 44.64it/s][A
190it [00:04, 44.83it/s][A
195it [00:04, 44.86it/s][A
200it [00:04, 44.90it/s][A
205it [00:04, 44.73it/s][A

Epoch: 481, Step: 200, Loss: 4.473732423782349



210it [00:04, 44.52it/s][A
215it [00:04, 43.37it/s][A
220it [00:04, 44.17it/s][A
227it [00:05, 45.06it/s]
 96%|█████████▌| 481/500 [57:40<02:29,  7.85s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.14it/s][A
10it [00:00, 44.40it/s][A
15it [00:00, 44.74it/s][A
20it [00:00, 44.97it/s][A
25it [00:00, 45.18it/s][A
30it [00:00, 44.97it/s][A
35it [00:00, 43.08it/s][A
40it [00:00, 42.89it/s][A
45it [00:01, 42.74it/s][A
50it [00:01, 43.16it/s][A
55it [00:01, 43.07it/s][A
60it [00:01, 43.26it/s][A
65it [00:01, 43.51it/s][A
70it [00:01, 43.80it/s][A
75it [00:01, 43.50it/s][A
80it [00:01, 43.90it/s][A
85it [00:01, 44.25it/s][A
90it [00:02, 44.20it/s][A
95it [00:02, 44.40it/s][A
100it [00:02, 44.29it/s][A
105it [00:02, 43.85it/s][A

Epoch: 482, Step: 100, Loss: 4.450573558807373



110it [00:02, 43.95it/s][A
115it [00:02, 44.51it/s][A
120it [00:02, 44.69it/s][A
125it [00:02, 44.85it/s][A
130it [00:02, 45.07it/s][A
135it [00:03, 45.13it/s][A
140it [00:03, 43.83it/s][A
145it [00:03, 44.41it/s][A
150it [00:03, 44.64it/s][A
155it [00:03, 43.26it/s][A
160it [00:03, 43.37it/s][A
165it [00:03, 44.08it/s][A
170it [00:03, 44.60it/s][A
175it [00:03, 44.38it/s][A
180it [00:04, 44.61it/s][A
185it [00:04, 44.89it/s][A
190it [00:04, 45.09it/s][A
195it [00:04, 45.23it/s][A
200it [00:04, 45.49it/s][A
205it [00:04, 45.49it/s][A

Epoch: 482, Step: 200, Loss: 4.472591879367829



210it [00:04, 45.45it/s][A
215it [00:04, 45.47it/s][A
220it [00:04, 45.36it/s][A
227it [00:05, 44.35it/s]
 96%|█████████▋| 482/500 [57:45<02:06,  7.03s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.18it/s][A
10it [00:00, 45.43it/s][A
15it [00:00, 45.23it/s][A
20it [00:00, 45.51it/s][A
25it [00:00, 44.07it/s][A
30it [00:00, 44.63it/s][A
35it [00:00, 44.81it/s][A
40it [00:00, 44.54it/s][A
45it [00:01, 44.59it/s][A
50it [00:01, 44.81it/s][A
55it [00:01, 45.09it/s][A
60it [00:01, 45.16it/s][A
65it [00:01, 45.30it/s][A
70it [00:01, 45.30it/s][A
75it [00:01, 43.56it/s][A
80it [00:01, 43.24it/s][A
85it [00:01, 43.87it/s][A
90it [00:02, 44.37it/s][A
95it [00:02, 44.25it/s][A
100it [00:02, 44.86it/s][A
105it [00:02, 44.68it/s][A

Epoch: 483, Step: 100, Loss: 4.4621474552154545



110it [00:02, 44.76it/s][A
115it [00:02, 44.95it/s][A
120it [00:02, 44.89it/s][A
125it [00:02, 45.10it/s][A
130it [00:02, 45.06it/s][A
135it [00:03, 45.28it/s][A
140it [00:03, 45.36it/s][A
145it [00:03, 45.48it/s][A
150it [00:03, 45.64it/s][A
155it [00:03, 45.91it/s][A
160it [00:03, 45.97it/s][A
165it [00:03, 46.01it/s][A
170it [00:03, 45.90it/s][A
175it [00:03, 45.96it/s][A
180it [00:03, 45.59it/s][A
185it [00:04, 45.44it/s][A
190it [00:04, 45.66it/s][A
195it [00:04, 45.17it/s][A
200it [00:04, 43.94it/s][A
205it [00:04, 44.43it/s][A

Epoch: 483, Step: 200, Loss: 4.470662980079651



210it [00:04, 44.72it/s][A
215it [00:04, 44.94it/s][A
220it [00:04, 45.35it/s][A
227it [00:05, 44.85it/s]
 97%|█████████▋| 483/500 [57:50<01:49,  6.44s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.03it/s][A
10it [00:00, 42.31it/s][A
15it [00:00, 43.51it/s][A
20it [00:00, 44.25it/s][A
25it [00:00, 44.70it/s][A
30it [00:00, 42.99it/s][A
35it [00:00, 43.95it/s][A
40it [00:00, 44.18it/s][A
45it [00:01, 44.22it/s][A
50it [00:01, 44.75it/s][A
55it [00:01, 43.97it/s][A
60it [00:01, 44.58it/s][A
65it [00:01, 44.74it/s][A
70it [00:01, 44.98it/s][A
75it [00:01, 45.34it/s][A
80it [00:01, 45.19it/s][A
85it [00:01, 45.34it/s][A
90it [00:02, 45.54it/s][A
95it [00:02, 45.77it/s][A
100it [00:02, 45.72it/s][A
105it [00:02, 45.48it/s][A

Epoch: 484, Step: 100, Loss: 4.459703660011291



110it [00:02, 44.99it/s][A
115it [00:02, 44.83it/s][A
120it [00:02, 43.74it/s][A
125it [00:02, 43.95it/s][A
130it [00:02, 44.13it/s][A
135it [00:03, 44.67it/s][A
140it [00:03, 45.20it/s][A
145it [00:03, 45.07it/s][A
150it [00:03, 43.86it/s][A
155it [00:03, 43.09it/s][A
160it [00:03, 43.05it/s][A
165it [00:03, 43.44it/s][A
170it [00:03, 44.33it/s][A
175it [00:03, 44.72it/s][A
180it [00:04, 45.03it/s][A
185it [00:04, 44.79it/s][A
190it [00:04, 44.72it/s][A
195it [00:04, 44.87it/s][A
200it [00:04, 45.19it/s][A
205it [00:04, 45.15it/s][A

Epoch: 484, Step: 200, Loss: 4.475312571525574



210it [00:04, 44.50it/s][A
215it [00:04, 45.03it/s][A
220it [00:04, 45.29it/s][A
227it [00:05, 44.60it/s]
 97%|█████████▋| 484/500 [57:55<01:36,  6.04s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.83it/s][A
10it [00:00, 45.08it/s][A
15it [00:00, 45.14it/s][A
20it [00:00, 45.11it/s][A
25it [00:00, 43.50it/s][A
30it [00:00, 44.31it/s][A
35it [00:00, 44.38it/s][A
40it [00:00, 44.61it/s][A
45it [00:01, 44.83it/s][A
50it [00:01, 45.18it/s][A
55it [00:01, 45.20it/s][A
60it [00:01, 43.79it/s][A
65it [00:01, 44.29it/s][A
70it [00:01, 44.45it/s][A
75it [00:01, 43.47it/s][A
80it [00:01, 44.21it/s][A
85it [00:01, 44.38it/s][A
90it [00:02, 44.87it/s][A
95it [00:02, 44.88it/s][A
100it [00:02, 45.26it/s][A
105it [00:02, 45.42it/s][A

Epoch: 485, Step: 100, Loss: 4.454377799034119



110it [00:02, 45.29it/s][A
115it [00:02, 45.34it/s][A
120it [00:02, 45.57it/s][A
125it [00:02, 45.69it/s][A
130it [00:02, 45.63it/s][A
135it [00:03, 45.83it/s][A
140it [00:03, 45.73it/s][A
145it [00:03, 45.28it/s][A
150it [00:03, 45.57it/s][A
155it [00:03, 45.57it/s][A
160it [00:03, 45.16it/s][A
165it [00:03, 45.19it/s][A
170it [00:03, 45.43it/s][A
175it [00:03, 45.67it/s][A
180it [00:04, 43.85it/s][A
185it [00:04, 44.12it/s][A
190it [00:04, 43.51it/s][A
195it [00:04, 44.33it/s][A
200it [00:04, 42.67it/s][A
205it [00:04, 43.16it/s][A

Epoch: 485, Step: 200, Loss: 4.471463329792023



210it [00:04, 43.61it/s][A
215it [00:04, 44.23it/s][A
220it [00:04, 44.56it/s][A
227it [00:05, 44.65it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.38it/s][A
13it [00:00, 60.05it/s][A
20it [00:00, 60.27it/s][A
27it [00:00, 60.26it/s][A
34it [00:00, 60.41it/s][A
41it [00:00, 60.72it/s][A
48it [00:00, 60.72it/s][A
55it [00:00, 60.34it/s][A
62it [00:01, 60.69it/s][A
69it [00:01, 60.71it/s][A
76it [00:01, 58.50it/s][A
83it [00:01, 59.23it/s][A
89it [00:01, 56.49it/s][A
96it [00:01, 58.00it/s][A
102it [00:01, 58.52it/s][A
109it [00:01, 59.40it/s][A
116it [00:01, 60.17it/s][A
123it [00:02, 60.39it/s][A
130it [00:02, 60.45it/s][A
137it [00:02, 60.38it/s][A
144it [00:02, 60.39it/s][A
151it [00:02, 60.19it/s][A
158it [00:02, 58.68it/s][A
164it [00:02, 57.31it/s][A
170it [00:02, 57.95it/s][A
176it [00:02, 58.36it/s][A
183it [00:03, 59.38it/s][A
190it [00:03, 59.96it/s][A
196it [00:03, 59.88it/s][A
202it [00:03, 57.51it/s][A
208it [00:03, 57.76it/s][A
214it [00:03, 


Epoch: 485, Test Loss: 5.578784082987294, Test Perplexity: 266.0780873269028




0it [00:00, ?it/s][A
5it [00:00, 45.33it/s][A
10it [00:00, 45.50it/s][A
15it [00:00, 45.14it/s][A
20it [00:00, 45.54it/s][A
25it [00:00, 45.52it/s][A
30it [00:00, 44.25it/s][A
35it [00:00, 43.15it/s][A
40it [00:00, 43.72it/s][A
45it [00:01, 44.08it/s][A
50it [00:01, 44.40it/s][A
55it [00:01, 44.62it/s][A
60it [00:01, 43.53it/s][A
65it [00:01, 44.27it/s][A
70it [00:01, 44.20it/s][A
75it [00:01, 44.50it/s][A
80it [00:01, 44.57it/s][A
85it [00:01, 44.83it/s][A
90it [00:02, 45.16it/s][A
95it [00:02, 45.34it/s][A
100it [00:02, 45.64it/s][A
105it [00:02, 45.99it/s][A

Epoch: 486, Step: 100, Loss: 4.455779361724853



110it [00:02, 46.00it/s][A
115it [00:02, 45.85it/s][A
120it [00:02, 45.57it/s][A
125it [00:02, 45.78it/s][A
130it [00:02, 46.01it/s][A
135it [00:02, 46.15it/s][A
140it [00:03, 44.31it/s][A
145it [00:03, 43.95it/s][A
150it [00:03, 44.55it/s][A
155it [00:03, 44.91it/s][A
160it [00:03, 44.74it/s][A
165it [00:03, 45.00it/s][A
170it [00:03, 44.77it/s][A
175it [00:03, 44.80it/s][A
180it [00:04, 45.09it/s][A
185it [00:04, 44.96it/s][A
190it [00:04, 45.12it/s][A
195it [00:04, 44.25it/s][A
200it [00:04, 43.42it/s][A
205it [00:04, 43.82it/s][A

Epoch: 486, Step: 200, Loss: 4.4706221771240235



210it [00:04, 43.83it/s][A
215it [00:04, 44.11it/s][A
220it [00:04, 44.67it/s][A
227it [00:05, 44.74it/s]
 97%|█████████▋| 486/500 [58:16<01:49,  7.81s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.10it/s][A
10it [00:00, 44.45it/s][A
15it [00:00, 44.95it/s][A
20it [00:00, 41.85it/s][A
25it [00:00, 42.48it/s][A
30it [00:00, 43.52it/s][A
35it [00:00, 44.11it/s][A
40it [00:00, 44.55it/s][A
45it [00:01, 44.89it/s][A
50it [00:01, 44.65it/s][A
55it [00:01, 45.07it/s][A
60it [00:01, 45.00it/s][A
65it [00:01, 45.22it/s][A
70it [00:01, 45.09it/s][A
75it [00:01, 44.94it/s][A
80it [00:01, 45.36it/s][A
85it [00:01, 45.05it/s][A
90it [00:02, 45.34it/s][A
95it [00:02, 45.26it/s][A
100it [00:02, 45.44it/s][A
105it [00:02, 45.43it/s][A

Epoch: 487, Step: 100, Loss: 4.452975440025329



110it [00:02, 45.55it/s][A
115it [00:02, 45.17it/s][A
120it [00:02, 45.56it/s][A
125it [00:02, 45.58it/s][A
130it [00:02, 43.59it/s][A
135it [00:03, 44.26it/s][A
140it [00:03, 44.71it/s][A
145it [00:03, 44.54it/s][A
150it [00:03, 44.70it/s][A
155it [00:03, 44.85it/s][A
160it [00:03, 44.78it/s][A
165it [00:03, 44.98it/s][A
170it [00:03, 45.04it/s][A
175it [00:03, 45.13it/s][A
180it [00:04, 45.17it/s][A
185it [00:04, 45.50it/s][A
190it [00:04, 45.30it/s][A
195it [00:04, 45.47it/s][A
200it [00:04, 45.39it/s][A
205it [00:04, 45.44it/s][A

Epoch: 487, Step: 200, Loss: 4.472095887660981



210it [00:04, 45.06it/s][A
215it [00:04, 44.44it/s][A
220it [00:04, 44.77it/s][A
227it [00:05, 44.77it/s]
 97%|█████████▋| 487/500 [58:21<01:30,  6.99s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.60it/s][A
10it [00:00, 44.58it/s][A
15it [00:00, 44.91it/s][A
20it [00:00, 45.54it/s][A
25it [00:00, 45.40it/s][A
30it [00:00, 45.58it/s][A
35it [00:00, 45.58it/s][A
40it [00:00, 45.60it/s][A
45it [00:00, 45.88it/s][A
50it [00:01, 46.09it/s][A
55it [00:01, 46.23it/s][A
60it [00:01, 46.19it/s][A
65it [00:01, 46.17it/s][A
70it [00:01, 46.14it/s][A
75it [00:01, 45.75it/s][A
80it [00:01, 45.22it/s][A
85it [00:01, 45.16it/s][A
90it [00:01, 45.13it/s][A
95it [00:02, 45.11it/s][A
100it [00:02, 45.00it/s][A
105it [00:02, 45.21it/s][A

Epoch: 488, Step: 100, Loss: 4.456747107505798



110it [00:02, 45.21it/s][A
115it [00:02, 45.07it/s][A
120it [00:02, 45.33it/s][A
125it [00:02, 45.19it/s][A
130it [00:02, 45.17it/s][A
135it [00:02, 45.05it/s][A
140it [00:03, 45.10it/s][A
145it [00:03, 45.14it/s][A
150it [00:03, 45.13it/s][A
155it [00:03, 43.31it/s][A
160it [00:03, 43.93it/s][A
165it [00:03, 44.47it/s][A
170it [00:03, 42.85it/s][A
175it [00:03, 43.36it/s][A
180it [00:04, 43.97it/s][A
185it [00:04, 44.76it/s][A
190it [00:04, 44.73it/s][A
195it [00:04, 44.71it/s][A
200it [00:04, 44.83it/s][A
205it [00:04, 45.08it/s][A

Epoch: 488, Step: 200, Loss: 4.4719871497154235



210it [00:04, 44.94it/s][A
215it [00:04, 44.73it/s][A
220it [00:04, 44.98it/s][A
227it [00:05, 44.96it/s]
 98%|█████████▊| 488/500 [58:26<01:16,  6.41s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.10it/s][A
10it [00:00, 44.43it/s][A
15it [00:00, 44.99it/s][A
20it [00:00, 43.73it/s][A
25it [00:00, 43.48it/s][A
30it [00:00, 44.12it/s][A
35it [00:00, 44.52it/s][A
40it [00:00, 43.70it/s][A
45it [00:01, 42.55it/s][A
50it [00:01, 42.74it/s][A
55it [00:01, 43.61it/s][A
60it [00:01, 44.05it/s][A
65it [00:01, 43.84it/s][A
70it [00:01, 43.73it/s][A
75it [00:01, 43.72it/s][A
80it [00:01, 42.12it/s][A
85it [00:01, 43.01it/s][A
90it [00:02, 43.64it/s][A
95it [00:02, 44.27it/s][A
100it [00:02, 44.15it/s][A
105it [00:02, 44.30it/s][A

Epoch: 489, Step: 100, Loss: 4.458519706726074



110it [00:02, 44.32it/s][A
115it [00:02, 44.74it/s][A
120it [00:02, 45.03it/s][A
125it [00:02, 43.71it/s][A
130it [00:02, 44.27it/s][A
135it [00:03, 44.86it/s][A
140it [00:03, 45.25it/s][A
145it [00:03, 45.33it/s][A
150it [00:03, 45.58it/s][A
155it [00:03, 45.53it/s][A
160it [00:03, 43.94it/s][A
165it [00:03, 44.33it/s][A
170it [00:03, 44.51it/s][A
175it [00:03, 44.37it/s][A
180it [00:04, 44.95it/s][A
185it [00:04, 44.75it/s][A
190it [00:04, 44.78it/s][A
195it [00:04, 44.86it/s][A
200it [00:04, 45.14it/s][A
205it [00:04, 44.69it/s][A

Epoch: 489, Step: 200, Loss: 4.472151205539704



210it [00:04, 44.80it/s][A
215it [00:04, 45.03it/s][A
220it [00:04, 45.19it/s][A
227it [00:05, 44.34it/s]
 98%|█████████▊| 489/500 [58:31<01:06,  6.02s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.70it/s][A
10it [00:00, 43.87it/s][A
15it [00:00, 44.03it/s][A
20it [00:00, 44.46it/s][A
25it [00:00, 44.96it/s][A
30it [00:00, 44.85it/s][A
35it [00:00, 45.10it/s][A
40it [00:00, 45.36it/s][A
45it [00:01, 45.34it/s][A
50it [00:01, 45.04it/s][A
55it [00:01, 45.36it/s][A
60it [00:01, 45.49it/s][A
65it [00:01, 45.64it/s][A
70it [00:01, 45.75it/s][A
75it [00:01, 45.91it/s][A
80it [00:01, 45.77it/s][A
85it [00:01, 45.83it/s][A
90it [00:01, 45.66it/s][A
95it [00:02, 45.65it/s][A
100it [00:02, 45.22it/s][A
105it [00:02, 45.01it/s][A

Epoch: 490, Step: 100, Loss: 4.457347874641418



110it [00:02, 44.92it/s][A
115it [00:02, 45.09it/s][A
120it [00:02, 45.14it/s][A
125it [00:02, 45.09it/s][A
130it [00:02, 45.04it/s][A
135it [00:02, 45.21it/s][A
140it [00:03, 45.49it/s][A
145it [00:03, 45.28it/s][A
150it [00:03, 45.37it/s][A
155it [00:03, 45.21it/s][A
160it [00:03, 44.98it/s][A
165it [00:03, 45.09it/s][A
170it [00:03, 45.37it/s][A
175it [00:03, 44.23it/s][A
180it [00:03, 44.56it/s][A
185it [00:04, 43.90it/s][A
190it [00:04, 44.66it/s][A
195it [00:04, 44.91it/s][A
200it [00:04, 45.34it/s][A
205it [00:04, 45.58it/s][A

Epoch: 490, Step: 200, Loss: 4.46976991891861



210it [00:04, 45.58it/s][A
215it [00:04, 45.57it/s][A
220it [00:04, 45.71it/s][A
227it [00:05, 45.18it/s]

0it [00:00, ?it/s][A
6it [00:00, 51.80it/s][A
13it [00:00, 56.91it/s][A
19it [00:00, 57.83it/s][A
25it [00:00, 58.53it/s][A
32it [00:00, 59.29it/s][A
39it [00:00, 59.95it/s][A
46it [00:00, 60.32it/s][A
53it [00:00, 60.34it/s][A
60it [00:01, 60.55it/s][A
67it [00:01, 60.85it/s][A
74it [00:01, 60.89it/s][A
81it [00:01, 60.21it/s][A
88it [00:01, 59.97it/s][A
95it [00:01, 60.28it/s][A
102it [00:01, 60.55it/s][A
109it [00:01, 60.55it/s][A
116it [00:01, 60.39it/s][A
123it [00:02, 60.33it/s][A
130it [00:02, 60.31it/s][A
137it [00:02, 58.66it/s][A
143it [00:02, 58.68it/s][A
150it [00:02, 59.47it/s][A
156it [00:02, 59.41it/s][A
163it [00:02, 59.64it/s][A
169it [00:02, 57.01it/s][A
175it [00:02, 57.71it/s][A
182it [00:03, 58.52it/s][A
188it [00:03, 58.93it/s][A
194it [00:03, 59.18it/s][A
201it [00:03, 59.78it/s][A
208it [00:03, 60.34it/s][A
215it [00:03, 


Epoch: 490, Test Loss: 5.580812337976065, Test Perplexity: 266.45982664120123




0it [00:00, ?it/s][A
5it [00:00, 44.19it/s][A
10it [00:00, 42.25it/s][A
15it [00:00, 43.63it/s][A
20it [00:00, 44.32it/s][A
25it [00:00, 42.47it/s][A
30it [00:00, 43.76it/s][A
35it [00:00, 44.18it/s][A
40it [00:00, 44.64it/s][A
45it [00:01, 44.87it/s][A
50it [00:01, 45.29it/s][A
55it [00:01, 45.51it/s][A
60it [00:01, 45.39it/s][A
65it [00:01, 45.55it/s][A
70it [00:01, 45.69it/s][A
75it [00:01, 45.73it/s][A
80it [00:01, 44.42it/s][A
85it [00:01, 44.52it/s][A
90it [00:02, 44.85it/s][A
95it [00:02, 44.03it/s][A
100it [00:02, 44.70it/s][A
105it [00:02, 45.16it/s][A

Epoch: 491, Step: 100, Loss: 4.457248420715332



110it [00:02, 44.90it/s][A
115it [00:02, 44.60it/s][A
120it [00:02, 44.81it/s][A
125it [00:02, 45.06it/s][A
130it [00:02, 44.69it/s][A
135it [00:03, 44.81it/s][A
140it [00:03, 44.89it/s][A
145it [00:03, 45.33it/s][A
150it [00:03, 45.48it/s][A
155it [00:03, 45.16it/s][A
160it [00:03, 45.37it/s][A
165it [00:03, 45.11it/s][A
170it [00:03, 45.06it/s][A
175it [00:03, 42.71it/s][A
180it [00:04, 42.73it/s][A
185it [00:04, 43.92it/s][A
190it [00:04, 43.95it/s][A
195it [00:04, 44.67it/s][A
200it [00:04, 44.73it/s][A
205it [00:04, 44.76it/s][A

Epoch: 491, Step: 200, Loss: 4.472320408821106



210it [00:04, 44.84it/s][A
215it [00:04, 44.43it/s][A
220it [00:04, 43.45it/s][A
227it [00:05, 44.42it/s]
 98%|█████████▊| 491/500 [58:52<01:10,  7.82s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.05it/s][A
10it [00:00, 45.48it/s][A
15it [00:00, 45.36it/s][A
20it [00:00, 45.37it/s][A
25it [00:00, 45.34it/s][A
30it [00:00, 45.44it/s][A
35it [00:00, 43.72it/s][A
40it [00:00, 44.58it/s][A
45it [00:01, 43.03it/s][A
50it [00:01, 43.86it/s][A
55it [00:01, 44.44it/s][A
60it [00:01, 44.87it/s][A
65it [00:01, 43.34it/s][A
70it [00:01, 43.65it/s][A
75it [00:01, 42.27it/s][A
80it [00:01, 43.34it/s][A
85it [00:01, 43.52it/s][A
90it [00:02, 44.25it/s][A
95it [00:02, 44.42it/s][A
100it [00:02, 43.16it/s][A
105it [00:02, 43.91it/s][A

Epoch: 492, Step: 100, Loss: 4.451677789688111



110it [00:02, 44.43it/s][A
115it [00:02, 44.85it/s][A
120it [00:02, 45.06it/s][A
125it [00:02, 45.39it/s][A
130it [00:02, 45.32it/s][A
135it [00:03, 45.51it/s][A
140it [00:03, 45.38it/s][A
145it [00:03, 44.23it/s][A
150it [00:03, 44.67it/s][A
155it [00:03, 44.97it/s][A
160it [00:03, 43.83it/s][A
165it [00:03, 44.49it/s][A
170it [00:03, 44.57it/s][A
175it [00:03, 44.78it/s][A
180it [00:04, 45.32it/s][A
185it [00:04, 45.63it/s][A
190it [00:04, 45.71it/s][A
195it [00:04, 45.53it/s][A
200it [00:04, 45.73it/s][A
205it [00:04, 45.74it/s][A

Epoch: 492, Step: 200, Loss: 4.471458241939545



210it [00:04, 44.14it/s][A
215it [00:04, 43.92it/s][A
220it [00:04, 44.50it/s][A
227it [00:05, 44.55it/s]
 98%|█████████▊| 492/500 [58:57<00:56,  7.00s/it]
0it [00:00, ?it/s][A
5it [00:00, 44.75it/s][A
10it [00:00, 45.48it/s][A
15it [00:00, 45.29it/s][A
20it [00:00, 43.01it/s][A
25it [00:00, 43.90it/s][A
30it [00:00, 44.55it/s][A
35it [00:00, 43.32it/s][A
40it [00:00, 44.09it/s][A
45it [00:01, 43.71it/s][A
50it [00:01, 44.36it/s][A
55it [00:01, 44.86it/s][A
60it [00:01, 44.82it/s][A
65it [00:01, 45.01it/s][A
70it [00:01, 45.07it/s][A
75it [00:01, 43.80it/s][A
80it [00:01, 43.14it/s][A
85it [00:01, 42.21it/s][A
90it [00:02, 43.15it/s][A
95it [00:02, 43.54it/s][A
100it [00:02, 44.00it/s][A
105it [00:02, 43.05it/s][A

Epoch: 493, Step: 100, Loss: 4.460159063339233



110it [00:02, 43.87it/s][A
115it [00:02, 44.63it/s][A
120it [00:02, 44.70it/s][A
125it [00:02, 43.16it/s][A
130it [00:02, 43.78it/s][A
135it [00:03, 44.34it/s][A
140it [00:03, 44.57it/s][A
145it [00:03, 44.94it/s][A
150it [00:03, 42.98it/s][A
155it [00:03, 43.80it/s][A
160it [00:03, 44.42it/s][A
165it [00:03, 43.96it/s][A
170it [00:03, 42.43it/s][A
175it [00:03, 43.31it/s][A
180it [00:04, 44.22it/s][A
185it [00:04, 44.64it/s][A
190it [00:04, 45.04it/s][A
195it [00:04, 45.07it/s][A
200it [00:04, 45.27it/s][A
205it [00:04, 45.43it/s][A

Epoch: 493, Step: 200, Loss: 4.472605266571045



210it [00:04, 44.92it/s][A
215it [00:04, 44.77it/s][A
220it [00:04, 42.66it/s][A
227it [00:05, 44.01it/s]
 99%|█████████▊| 493/500 [59:02<00:45,  6.45s/it]
0it [00:00, ?it/s][A
5it [00:00, 45.68it/s][A
10it [00:00, 45.26it/s][A
15it [00:00, 45.33it/s][A
20it [00:00, 45.17it/s][A
25it [00:00, 44.98it/s][A
30it [00:00, 45.27it/s][A
35it [00:00, 45.14it/s][A
40it [00:00, 44.97it/s][A
45it [00:00, 45.27it/s][A
50it [00:01, 44.98it/s][A
55it [00:01, 45.12it/s][A
60it [00:01, 45.39it/s][A
65it [00:01, 45.64it/s][A
70it [00:01, 45.24it/s][A
75it [00:01, 45.33it/s][A
80it [00:01, 43.17it/s][A
85it [00:01, 44.00it/s][A
90it [00:02, 44.70it/s][A
95it [00:02, 45.13it/s][A
100it [00:02, 43.88it/s][A
105it [00:02, 44.01it/s][A

Epoch: 494, Step: 100, Loss: 4.4572635793685915



110it [00:02, 44.62it/s][A
115it [00:02, 43.62it/s][A
120it [00:02, 44.18it/s][A
125it [00:02, 44.39it/s][A
130it [00:02, 44.82it/s][A
135it [00:03, 45.16it/s][A
140it [00:03, 45.52it/s][A
145it [00:03, 45.21it/s][A
150it [00:03, 45.44it/s][A
155it [00:03, 45.54it/s][A
160it [00:03, 45.69it/s][A
165it [00:03, 45.91it/s][A
170it [00:03, 46.02it/s][A
175it [00:03, 46.06it/s][A
180it [00:03, 45.95it/s][A
185it [00:04, 45.68it/s][A
190it [00:04, 45.60it/s][A
195it [00:04, 45.57it/s][A
200it [00:04, 45.27it/s][A
205it [00:04, 44.06it/s][A

Epoch: 494, Step: 200, Loss: 4.469430782794952



210it [00:04, 43.77it/s][A
215it [00:04, 44.47it/s][A
220it [00:04, 44.50it/s][A
227it [00:05, 44.90it/s]
 99%|█████████▉| 494/500 [59:07<00:36,  6.03s/it]
0it [00:00, ?it/s][A
5it [00:00, 41.10it/s][A
10it [00:00, 43.55it/s][A
15it [00:00, 43.93it/s][A
20it [00:00, 42.65it/s][A
25it [00:00, 43.95it/s][A
30it [00:00, 44.70it/s][A
35it [00:00, 45.21it/s][A
40it [00:00, 45.85it/s][A
45it [00:01, 46.09it/s][A
50it [00:01, 46.41it/s][A
55it [00:01, 44.38it/s][A
60it [00:01, 45.06it/s][A
65it [00:01, 45.62it/s][A
70it [00:01, 45.86it/s][A
75it [00:01, 46.06it/s][A
80it [00:01, 46.32it/s][A
85it [00:01, 46.42it/s][A
90it [00:01, 46.18it/s][A
95it [00:02, 46.18it/s][A
100it [00:02, 45.63it/s][A
105it [00:02, 45.86it/s][A

Epoch: 495, Step: 100, Loss: 4.453896675109863



110it [00:02, 44.71it/s][A
115it [00:02, 45.02it/s][A
120it [00:02, 44.19it/s][A
125it [00:02, 44.50it/s][A
130it [00:02, 44.74it/s][A
135it [00:02, 44.57it/s][A
140it [00:03, 44.96it/s][A
145it [00:03, 44.87it/s][A
150it [00:03, 45.16it/s][A
155it [00:03, 45.54it/s][A
160it [00:03, 45.84it/s][A
165it [00:03, 45.98it/s][A
170it [00:03, 44.29it/s][A
175it [00:03, 44.44it/s][A
180it [00:03, 44.63it/s][A
185it [00:04, 44.55it/s][A
190it [00:04, 44.90it/s][A
195it [00:04, 45.28it/s][A
200it [00:04, 45.16it/s][A
205it [00:04, 45.41it/s][A

Epoch: 495, Step: 200, Loss: 4.468382320404053



210it [00:04, 45.16it/s][A
215it [00:04, 44.97it/s][A
220it [00:04, 44.93it/s][A
227it [00:05, 45.04it/s]

0it [00:00, ?it/s][A
6it [00:00, 58.09it/s][A
12it [00:00, 59.04it/s][A
18it [00:00, 56.50it/s][A
25it [00:00, 58.87it/s][A
32it [00:00, 60.07it/s][A
39it [00:00, 61.02it/s][A
46it [00:00, 61.36it/s][A
53it [00:00, 59.10it/s][A
59it [00:01, 56.36it/s][A
65it [00:01, 57.21it/s][A
71it [00:01, 57.55it/s][A
77it [00:01, 58.15it/s][A
84it [00:01, 58.83it/s][A
90it [00:01, 59.10it/s][A
97it [00:01, 59.63it/s][A
104it [00:01, 60.00it/s][A
111it [00:01, 60.28it/s][A
118it [00:01, 60.36it/s][A
125it [00:02, 60.45it/s][A
132it [00:02, 60.49it/s][A
139it [00:02, 60.71it/s][A
146it [00:02, 60.57it/s][A
153it [00:02, 60.50it/s][A
160it [00:02, 60.76it/s][A
167it [00:02, 60.90it/s][A
174it [00:02, 60.90it/s][A
181it [00:03, 60.81it/s][A
188it [00:03, 60.79it/s][A
195it [00:03, 60.72it/s][A
202it [00:03, 60.73it/s][A
209it [00:03, 60.72it/s][A
216it [00:03, 6


Epoch: 495, Test Loss: 5.578588674527518, Test Perplexity: 265.88828945752255




0it [00:00, ?it/s][A
5it [00:00, 47.52it/s][A
10it [00:00, 47.12it/s][A
15it [00:00, 44.29it/s][A
20it [00:00, 42.89it/s][A
25it [00:00, 42.74it/s][A
30it [00:00, 42.40it/s][A
35it [00:00, 43.50it/s][A
40it [00:00, 42.58it/s][A
45it [00:01, 43.48it/s][A
50it [00:01, 44.25it/s][A
55it [00:01, 43.98it/s][A
60it [00:01, 44.53it/s][A
65it [00:01, 45.43it/s][A
70it [00:01, 45.87it/s][A
75it [00:01, 45.57it/s][A
80it [00:01, 45.47it/s][A
85it [00:01, 44.64it/s][A
90it [00:02, 45.30it/s][A
95it [00:02, 45.54it/s][A
100it [00:02, 45.90it/s][A
105it [00:02, 46.41it/s][A

Epoch: 496, Step: 100, Loss: 4.466658625602722



110it [00:02, 46.55it/s][A
115it [00:02, 46.83it/s][A
120it [00:02, 47.33it/s][A
125it [00:02, 47.57it/s][A
130it [00:02, 46.56it/s][A
135it [00:02, 46.91it/s][A
140it [00:03, 47.45it/s][A
145it [00:03, 47.07it/s][A
150it [00:03, 46.30it/s][A
155it [00:03, 45.62it/s][A
160it [00:03, 44.33it/s][A
165it [00:03, 44.18it/s][A
170it [00:03, 44.43it/s][A
175it [00:03, 43.53it/s][A
180it [00:03, 43.66it/s][A
185it [00:04, 42.03it/s][A
190it [00:04, 42.88it/s][A
195it [00:04, 43.29it/s][A
200it [00:04, 43.94it/s][A
205it [00:04, 44.62it/s][A

Epoch: 496, Step: 200, Loss: 4.472749693393707



210it [00:04, 44.52it/s][A
215it [00:04, 44.94it/s][A
220it [00:04, 45.03it/s][A
227it [00:05, 44.91it/s]
 99%|█████████▉| 496/500 [59:28<00:31,  7.79s/it]
0it [00:00, ?it/s][A
5it [00:00, 43.64it/s][A
10it [00:00, 44.98it/s][A
15it [00:00, 43.58it/s][A
20it [00:00, 44.63it/s][A
25it [00:00, 44.62it/s][A
30it [00:00, 44.95it/s][A
35it [00:00, 45.07it/s][A
40it [00:00, 45.41it/s][A
45it [00:00, 45.66it/s][A
50it [00:01, 45.87it/s][A
55it [00:01, 45.85it/s][A
60it [00:01, 45.86it/s][A
65it [00:01, 45.72it/s][A
70it [00:01, 45.58it/s][A
75it [00:01, 45.56it/s][A
80it [00:01, 45.42it/s][A
85it [00:01, 45.11it/s][A
90it [00:02, 43.66it/s][A
95it [00:02, 44.43it/s][A
100it [00:02, 44.58it/s][A
105it [00:02, 44.79it/s][A

Epoch: 497, Step: 100, Loss: 4.460387902259827



110it [00:02, 45.16it/s][A
115it [00:02, 45.28it/s][A
120it [00:02, 45.88it/s][A
125it [00:02, 45.89it/s][A
130it [00:02, 46.62it/s][A
135it [00:02, 47.09it/s][A
140it [00:03, 45.17it/s][A
145it [00:03, 44.13it/s][A
150it [00:03, 44.98it/s][A
155it [00:03, 45.89it/s][A
160it [00:03, 46.65it/s][A
165it [00:03, 47.05it/s][A
170it [00:03, 47.14it/s][A
175it [00:03, 47.57it/s][A
180it [00:03, 47.94it/s][A
185it [00:04, 45.53it/s][A
190it [00:04, 46.48it/s][A
195it [00:04, 47.05it/s][A
200it [00:04, 47.56it/s][A
205it [00:04, 47.84it/s][A

Epoch: 497, Step: 200, Loss: 4.470164425373078



210it [00:04, 47.94it/s][A
215it [00:04, 48.06it/s][A
220it [00:04, 48.12it/s][A
227it [00:04, 46.04it/s]
 99%|█████████▉| 497/500 [59:33<00:20,  6.93s/it]
0it [00:00, ?it/s][A
5it [00:00, 47.94it/s][A
10it [00:00, 47.98it/s][A
15it [00:00, 44.81it/s][A
20it [00:00, 46.16it/s][A
25it [00:00, 46.99it/s][A
30it [00:00, 47.31it/s][A
35it [00:00, 47.45it/s][A
40it [00:00, 47.62it/s][A
45it [00:00, 47.98it/s][A
50it [00:01, 47.84it/s][A
55it [00:01, 46.89it/s][A
60it [00:01, 46.61it/s][A
65it [00:01, 46.77it/s][A
70it [00:01, 47.08it/s][A
75it [00:01, 47.32it/s][A
80it [00:01, 47.42it/s][A
85it [00:01, 47.55it/s][A
90it [00:01, 45.97it/s][A
95it [00:02, 44.39it/s][A
100it [00:02, 44.72it/s][A
105it [00:02, 45.35it/s][A

Epoch: 498, Step: 100, Loss: 4.458018703460693



110it [00:02, 45.34it/s][A
115it [00:02, 44.54it/s][A
120it [00:02, 45.66it/s][A
125it [00:02, 46.34it/s][A
130it [00:02, 46.75it/s][A
135it [00:02, 46.49it/s][A
140it [00:03, 46.94it/s][A
145it [00:03, 46.98it/s][A
150it [00:03, 47.14it/s][A
155it [00:03, 45.47it/s][A
160it [00:03, 45.99it/s][A
165it [00:03, 45.37it/s][A
170it [00:03, 45.92it/s][A
175it [00:03, 46.59it/s][A
180it [00:03, 47.17it/s][A
185it [00:03, 47.43it/s][A
190it [00:04, 47.83it/s][A
195it [00:04, 48.02it/s][A
200it [00:04, 47.88it/s][A
205it [00:04, 47.53it/s][A

Epoch: 498, Step: 200, Loss: 4.469804124832153



210it [00:04, 47.67it/s][A
215it [00:04, 45.84it/s][A
220it [00:04, 46.69it/s][A
227it [00:04, 46.66it/s]
100%|█████████▉| 498/500 [59:38<00:12,  6.31s/it]
0it [00:00, ?it/s][A
5it [00:00, 46.84it/s][A
10it [00:00, 47.96it/s][A
15it [00:00, 48.29it/s][A
20it [00:00, 48.30it/s][A
25it [00:00, 48.35it/s][A
30it [00:00, 48.49it/s][A
35it [00:00, 48.54it/s][A
40it [00:00, 48.61it/s][A
45it [00:00, 48.59it/s][A
50it [00:01, 48.36it/s][A
55it [00:01, 47.78it/s][A
60it [00:01, 47.77it/s][A
65it [00:01, 47.95it/s][A
70it [00:01, 48.13it/s][A
75it [00:01, 46.14it/s][A
80it [00:01, 46.49it/s][A
85it [00:01, 46.88it/s][A
90it [00:01, 47.38it/s][A
95it [00:01, 47.62it/s][A
100it [00:02, 46.16it/s][A
105it [00:02, 45.61it/s][A

Epoch: 499, Step: 100, Loss: 4.457639832496643



110it [00:02, 46.14it/s][A
115it [00:02, 46.61it/s][A
120it [00:02, 44.77it/s][A
125it [00:02, 45.83it/s][A
130it [00:02, 46.70it/s][A
135it [00:02, 46.74it/s][A
140it [00:02, 46.49it/s][A
145it [00:03, 45.03it/s][A
150it [00:03, 45.88it/s][A
155it [00:03, 46.75it/s][A
160it [00:03, 45.34it/s][A
165it [00:03, 45.25it/s][A
170it [00:03, 46.17it/s][A
175it [00:03, 46.76it/s][A
180it [00:03, 47.26it/s][A
185it [00:03, 47.72it/s][A
190it [00:04, 48.07it/s][A
195it [00:04, 46.48it/s][A
200it [00:04, 46.85it/s][A
205it [00:04, 45.38it/s][A

Epoch: 499, Step: 200, Loss: 4.472324817180634



210it [00:04, 43.75it/s][A
215it [00:04, 45.02it/s][A
220it [00:04, 45.98it/s][A
227it [00:04, 46.74it/s]
100%|█████████▉| 499/500 [59:43<00:05,  5.88s/it]
0it [00:00, ?it/s][A
5it [00:00, 42.55it/s][A
10it [00:00, 45.74it/s][A
15it [00:00, 46.63it/s][A
20it [00:00, 47.15it/s][A
25it [00:00, 46.99it/s][A
30it [00:00, 47.26it/s][A
35it [00:00, 47.30it/s][A
40it [00:00, 47.29it/s][A
45it [00:00, 46.86it/s][A
50it [00:01, 46.65it/s][A
55it [00:01, 44.55it/s][A
60it [00:01, 45.68it/s][A
65it [00:01, 46.37it/s][A
70it [00:01, 46.96it/s][A
75it [00:01, 47.17it/s][A
80it [00:01, 47.58it/s][A
85it [00:01, 48.02it/s][A
90it [00:01, 47.73it/s][A
95it [00:02, 45.67it/s][A
100it [00:02, 46.29it/s][A
105it [00:02, 46.96it/s][A

Epoch: 500, Step: 100, Loss: 4.452858281135559



110it [00:02, 44.03it/s][A
115it [00:02, 45.03it/s][A
120it [00:02, 45.84it/s][A
125it [00:02, 45.92it/s][A
130it [00:02, 46.62it/s][A
135it [00:02, 46.29it/s][A
140it [00:03, 47.01it/s][A
145it [00:03, 47.26it/s][A
150it [00:03, 47.55it/s][A
155it [00:03, 47.49it/s][A
160it [00:03, 47.52it/s][A
165it [00:03, 47.68it/s][A
170it [00:03, 45.65it/s][A
175it [00:03, 46.05it/s][A
180it [00:03, 44.71it/s][A
185it [00:04, 43.13it/s][A
190it [00:04, 44.22it/s][A
195it [00:04, 45.26it/s][A
200it [00:04, 44.91it/s][A
205it [00:04, 44.60it/s][A

Epoch: 500, Step: 200, Loss: 4.47013290643692



210it [00:04, 43.40it/s][A
215it [00:04, 44.61it/s][A
220it [00:04, 45.23it/s][A
227it [00:04, 46.06it/s]

0it [00:00, ?it/s][A
7it [00:00, 60.68it/s][A
14it [00:00, 61.33it/s][A
21it [00:00, 60.29it/s][A
28it [00:00, 58.23it/s][A
35it [00:00, 59.81it/s][A
42it [00:00, 60.10it/s][A
49it [00:00, 58.35it/s][A
56it [00:00, 59.68it/s][A
63it [00:01, 60.64it/s][A
70it [00:01, 61.35it/s][A
77it [00:01, 61.86it/s][A
84it [00:01, 60.67it/s][A
91it [00:01, 59.14it/s][A
98it [00:01, 60.27it/s][A
105it [00:01, 61.08it/s][A
112it [00:01, 61.32it/s][A
119it [00:01, 61.56it/s][A
126it [00:02, 59.26it/s][A
132it [00:02, 58.08it/s][A
139it [00:02, 59.68it/s][A
146it [00:02, 60.76it/s][A
153it [00:02, 61.15it/s][A
160it [00:02, 58.93it/s][A
167it [00:02, 59.74it/s][A
174it [00:02, 60.64it/s][A
181it [00:03, 61.31it/s][A
188it [00:03, 61.61it/s][A
195it [00:03, 61.91it/s][A
202it [00:03, 62.28it/s][A
209it [00:03, 62.51it/s][A
216it [00:03, 62.69it/s][A
223it [00:03, 


Epoch: 500, Test Loss: 5.583091965373258, Test Perplexity: 267.0334675237999






## IMDB

In [None]:
# Build the single-head linear-attention text classifier for the IMDB run.
model = CLAWrapperTextClassification(
  in_features=512,
  out_features=512,
  chunk_size=64,
  seq_len=512,       # 512 / 64 = 8 chunks per sequence
  vocab_size=28996,  # presumably the tokenizer's vocabulary size -- TODO confirm
  num_classes=2,     # two output classes
)

  M = torch.tensor(torch.tril(torch.ones(chunk_size, chunk_size)))


In [None]:
# Train the IMDB classifier and evaluate it on the test split after each epoch.
epochs = 10
lr = 1e-3
log_every = 100  # print the running training loss every `log_every` steps
data_fields = ('input_ids', 'attention_mask', 'label')

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)


def _prepare_batch(batch, device, fields=data_fields, label_field='label'):
  """Stack the token-level fields of `batch` and move every field to `device`.

  Every field except `label_field` is passed to `torch.stack(..., dim=1)`, so
  it must be a sequence of equally-shaped tensors (presumably one tensor per
  token position, as produced by the loader's collate step -- TODO confirm).
  The label field is moved to `device` as-is.

  Returns:
    A new dict with one tensor per entry of `fields`, all on `device`.

  Raises:
    ValueError: if a non-label field cannot be stacked; the underlying torch
      error is attached as the cause.
  """
  prepared = {}
  for name in fields:
    value = batch[name]
    if name != label_field:
      try:
        value = torch.stack(value, dim=1)
      except (TypeError, RuntimeError) as err:
        raise ValueError(
          f"Could not stack batch field {name!r} "
          f"(type {type(value).__name__}) along dim=1"
        ) from err
    # as_tensor reuses an existing tensor instead of copy-constructing it,
    # which avoids the UserWarning raised by torch.tensor(<tensor>).
    prepared[name] = torch.as_tensor(value).to(device, non_blocking=True)
  return prepared


# NOTE(review): the logged running loss shows occasional very large spikes;
# consider gradient clipping or a smaller learning rate if this recurs.
for epoch in tqdm.tqdm(range(epochs)):
  # ---- training pass ----
  model.train()  # once per epoch; the evaluation pass below switches to eval()
  total_loss = 0.0
  # Wrap the loader (not enumerate) so tqdm can read its length and show a total.
  for step, batch in enumerate(tqdm.tqdm(imdb_train_loader)):
    batch = _prepare_batch(batch, device)

    y_pred = model(batch['input_ids'], attention_mask=batch['attention_mask'])
    loss = IMDBMetrics.loss(y_pred, batch['label'])

    total_loss += loss.item()

    loss.backward()
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)

    # Periodic loss updates on training: mean loss over the epoch so far.
    if (step + 1) % log_every == 0:
      avg_train_loss = total_loss / (step + 1)
      print(f"Epoch: {epoch + 1}, Step: {step + 1}, Loss: {avg_train_loss}")

  # ---- evaluation pass ----
  accuracies = []
  losses = []
  model.eval()
  with torch.no_grad():
    for batch in tqdm.tqdm(imdb_test_loader):
      batch = _prepare_batch(batch, device)
      y_pred = model(batch['input_ids'], attention_mask=batch['attention_mask'])

      loss = IMDBMetrics.loss(y_pred, batch['label'])
      accuracies.append(IMDBMetrics.accuracy(y_pred, batch['label']).item())
      losses.append(loss.item())

  # Unweighted means over batches (a smaller final batch counts as much as a full one).
  avg_loss = np.mean(losses)
  avg_accuracy = np.mean(accuracies)
  print()
  print(f"Epoch: {epoch + 1}, Test Loss: {avg_loss}, Test Accuracy: {avg_accuracy}")
  print()

  0%|          | 0/30 [00:00<?, ?it/s]
  batch = {k: torch.tensor(batch[k]).to(device, non_blocking=True) for k in data_fields}

1it [00:00,  9.44it/s][A
2it [00:00,  8.58it/s][A
3it [00:00,  8.26it/s][A
4it [00:00,  8.25it/s][A
5it [00:00,  8.19it/s][A
6it [00:00,  8.15it/s][A
7it [00:00,  8.06it/s][A
8it [00:00,  8.12it/s][A
9it [00:01,  8.11it/s][A
10it [00:01,  8.09it/s][A
11it [00:01,  8.09it/s][A
12it [00:01,  8.08it/s][A
13it [00:01,  8.07it/s][A
14it [00:01,  8.09it/s][A
15it [00:01,  8.07it/s][A
16it [00:01,  8.07it/s][A
17it [00:02,  8.06it/s][A
18it [00:02,  8.07it/s][A
19it [00:02,  8.06it/s][A
20it [00:02,  8.06it/s][A
21it [00:02,  8.06it/s][A
22it [00:02,  8.06it/s][A
23it [00:02,  8.05it/s][A
24it [00:02,  8.05it/s][A
25it [00:03,  8.06it/s][A
26it [00:03,  8.05it/s][A
27it [00:03,  8.05it/s][A
28it [00:03,  8.06it/s][A
29it [00:03,  8.06it/s][A
30it [00:03,  8.06it/s][A
31it [00:03,  8.06it/s][A
32it [00:03,  8.05it/s][A
33it [00:04,  8.

Epoch: 1, Step: 100, Loss: 73.28202268183232



102it [00:12,  8.05it/s][A
103it [00:12,  8.06it/s][A
104it [00:12,  8.05it/s][A
105it [00:13,  7.99it/s][A
106it [00:13,  8.06it/s][A
107it [00:13,  8.04it/s][A
108it [00:13,  8.05it/s][A
109it [00:13,  8.05it/s][A
110it [00:13,  7.99it/s][A
111it [00:13,  8.06it/s][A
112it [00:13,  8.05it/s][A
113it [00:14,  8.05it/s][A
114it [00:14,  8.04it/s][A
115it [00:14,  8.04it/s][A
116it [00:14,  8.04it/s][A
117it [00:14,  8.00it/s][A
118it [00:14,  8.06it/s][A
119it [00:14,  8.07it/s][A
120it [00:14,  8.05it/s][A
121it [00:15,  8.06it/s][A
122it [00:15,  8.05it/s][A
123it [00:15,  8.01it/s][A
124it [00:15,  8.06it/s][A
125it [00:15,  8.05it/s][A
126it [00:15,  8.05it/s][A
127it [00:15,  8.05it/s][A
128it [00:15,  8.05it/s][A
129it [00:16,  8.04it/s][A
130it [00:16,  8.04it/s][A
131it [00:16,  8.05it/s][A
132it [00:16,  8.05it/s][A
133it [00:16,  8.04it/s][A
134it [00:16,  8.05it/s][A
135it [00:16,  8.04it/s][A
136it [00:16,  8.05it/s][A
137it [00:17,  8.04

Epoch: 1, Step: 200, Loss: 37.073400209248064



202it [00:25,  8.04it/s][A
203it [00:25,  8.03it/s][A
204it [00:25,  8.04it/s][A
205it [00:25,  8.03it/s][A
206it [00:25,  8.03it/s][A
207it [00:25,  7.99it/s][A
208it [00:25,  8.01it/s][A
209it [00:25,  8.04it/s][A
210it [00:26,  8.04it/s][A
211it [00:26,  8.05it/s][A
212it [00:26,  8.00it/s][A
213it [00:26,  8.06it/s][A
214it [00:26,  8.05it/s][A
215it [00:26,  8.05it/s][A
216it [00:26,  8.04it/s][A
217it [00:26,  8.04it/s][A
218it [00:27,  8.04it/s][A
219it [00:27,  8.04it/s][A
220it [00:27,  8.04it/s][A
221it [00:27,  8.04it/s][A
222it [00:27,  8.05it/s][A
223it [00:27,  8.05it/s][A
224it [00:27,  8.05it/s][A
225it [00:27,  8.04it/s][A
226it [00:28,  8.03it/s][A
227it [00:28,  8.04it/s][A
228it [00:28,  8.03it/s][A
229it [00:28,  7.98it/s][A
230it [00:28,  8.03it/s][A
231it [00:28,  8.05it/s][A
232it [00:28,  8.04it/s][A
233it [00:28,  8.05it/s][A
234it [00:29,  8.03it/s][A
235it [00:29,  8.04it/s][A
236it [00:29,  8.03it/s][A
237it [00:29,  7.98

Epoch: 1, Step: 300, Loss: 24.938498672743638



302it [00:37,  8.03it/s][A
303it [00:37,  8.04it/s][A
304it [00:37,  8.03it/s][A
305it [00:37,  8.03it/s][A
306it [00:38,  8.02it/s][A
307it [00:38,  8.03it/s][A
308it [00:38,  8.02it/s][A
309it [00:38,  8.03it/s][A
310it [00:38,  8.01it/s][A
311it [00:38,  8.03it/s][A
312it [00:38,  8.02it/s][A
313it [00:38,  8.03it/s][A
314it [00:39,  8.03it/s][A
315it [00:39,  8.03it/s][A
316it [00:39,  8.02it/s][A
317it [00:39,  8.03it/s][A
318it [00:39,  8.02it/s][A
319it [00:39,  8.03it/s][A
320it [00:39,  8.03it/s][A
321it [00:39,  8.02it/s][A
322it [00:40,  8.03it/s][A
323it [00:40,  8.02it/s][A
324it [00:40,  8.03it/s][A
325it [00:40,  8.02it/s][A
326it [00:40,  8.03it/s][A
327it [00:40,  8.01it/s][A
328it [00:40,  8.04it/s][A
329it [00:40,  8.04it/s][A
330it [00:41,  8.03it/s][A
331it [00:41,  8.03it/s][A
332it [00:41,  8.02it/s][A
333it [00:41,  8.04it/s][A
334it [00:41,  8.04it/s][A
335it [00:41,  8.00it/s][A
336it [00:41,  8.04it/s][A
337it [00:41,  8.04


Epoch: 1, Test Loss: 0.5765035919216283, Test Accuracy: 0.7998721227621484




0it [00:00, ?it/s][A
1it [00:00,  9.80it/s][A
2it [00:00,  8.69it/s][A
3it [00:00,  8.37it/s][A
4it [00:00,  8.24it/s][A
5it [00:00,  8.15it/s][A
6it [00:00,  8.09it/s][A
7it [00:00,  8.09it/s][A
8it [00:00,  8.07it/s][A
9it [00:01,  8.06it/s][A
10it [00:01,  8.05it/s][A
11it [00:01,  8.03it/s][A
12it [00:01,  8.03it/s][A
13it [00:01,  8.03it/s][A
14it [00:01,  8.03it/s][A
15it [00:01,  8.03it/s][A
16it [00:01,  8.03it/s][A
17it [00:02,  8.03it/s][A
18it [00:02,  8.03it/s][A
19it [00:02,  8.04it/s][A
20it [00:02,  8.03it/s][A
21it [00:02,  7.98it/s][A
22it [00:02,  8.05it/s][A
23it [00:02,  8.04it/s][A
24it [00:02,  8.03it/s][A
25it [00:03,  8.03it/s][A
26it [00:03,  8.03it/s][A
27it [00:03,  8.02it/s][A
28it [00:03,  8.02it/s][A
29it [00:03,  7.97it/s][A
30it [00:03,  8.04it/s][A
31it [00:03,  8.04it/s][A
32it [00:04,  4.67it/s][A
33it [00:04,  5.35it/s][A
34it [00:04,  5.94it/s][A
35it [00:04,  6.44it/s][A
36it [00:04,  6.84it/s][A
37it [00:04,  

Epoch: 2, Step: 100, Loss: 0.5849859228730202



102it [00:12,  8.03it/s][A
103it [00:13,  8.02it/s][A
104it [00:13,  8.02it/s][A
105it [00:13,  8.01it/s][A
106it [00:13,  8.02it/s][A
107it [00:13,  8.01it/s][A
108it [00:13,  8.01it/s][A
109it [00:13,  8.02it/s][A
110it [00:13,  7.97it/s][A
111it [00:14,  8.04it/s][A
112it [00:14,  8.04it/s][A
113it [00:14,  8.03it/s][A
114it [00:14,  8.03it/s][A
115it [00:14,  8.02it/s][A
116it [00:14,  8.03it/s][A
117it [00:14,  8.03it/s][A
118it [00:14,  8.03it/s][A
119it [00:15,  8.03it/s][A
120it [00:15,  8.03it/s][A
121it [00:15,  8.02it/s][A
122it [00:15,  8.02it/s][A
123it [00:15,  8.03it/s][A
124it [00:15,  8.02it/s][A
125it [00:15,  8.01it/s][A
126it [00:15,  8.02it/s][A
127it [00:16,  8.01it/s][A
128it [00:16,  8.02it/s][A
129it [00:16,  8.02it/s][A
130it [00:16,  8.02it/s][A
131it [00:16,  8.03it/s][A
132it [00:16,  8.02it/s][A
133it [00:16,  8.03it/s][A
134it [00:16,  8.03it/s][A
135it [00:17,  8.03it/s][A
136it [00:17,  8.01it/s][A
137it [00:17,  8.01

Epoch: 2, Step: 200, Loss: 0.5347091387957335



202it [00:25,  8.02it/s][A
203it [00:25,  8.02it/s][A
204it [00:25,  8.03it/s][A
205it [00:25,  8.02it/s][A
206it [00:25,  8.02it/s][A
207it [00:26,  8.03it/s][A
208it [00:26,  8.03it/s][A
209it [00:26,  8.02it/s][A
210it [00:26,  8.03it/s][A
211it [00:26,  8.04it/s][A
212it [00:26,  8.03it/s][A
213it [00:26,  8.02it/s][A
214it [00:26,  8.03it/s][A
215it [00:27,  8.02it/s][A
216it [00:27,  8.03it/s][A
217it [00:27,  8.03it/s][A
218it [00:27,  8.03it/s][A
219it [00:27,  8.03it/s][A
220it [00:27,  7.98it/s][A
221it [00:27,  8.05it/s][A
222it [00:27,  8.04it/s][A
223it [00:28,  8.03it/s][A
224it [00:28,  8.03it/s][A
225it [00:28,  8.03it/s][A
226it [00:28,  8.03it/s][A
227it [00:28,  8.03it/s][A
228it [00:28,  8.03it/s][A
229it [00:28,  8.04it/s][A
230it [00:28,  8.03it/s][A
231it [00:29,  8.03it/s][A
232it [00:29,  8.03it/s][A
233it [00:29,  8.03it/s][A
234it [00:29,  8.03it/s][A
235it [00:29,  8.02it/s][A
236it [00:29,  8.01it/s][A
237it [00:29,  8.03

Epoch: 2, Step: 300, Loss: 0.509352769802014



302it [00:37,  8.04it/s][A
303it [00:38,  8.03it/s][A
304it [00:38,  8.04it/s][A
305it [00:38,  8.04it/s][A
306it [00:38,  8.04it/s][A
307it [00:38,  8.04it/s][A
308it [00:38,  8.04it/s][A
309it [00:38,  8.04it/s][A
310it [00:38,  8.03it/s][A
311it [00:39,  8.05it/s][A
312it [00:39,  8.03it/s][A
313it [00:39,  8.04it/s][A
314it [00:39,  8.03it/s][A
315it [00:39,  8.03it/s][A
316it [00:39,  8.04it/s][A
317it [00:39,  8.03it/s][A
318it [00:39,  8.04it/s][A
319it [00:40,  8.04it/s][A
320it [00:40,  8.03it/s][A
321it [00:40,  8.03it/s][A
322it [00:40,  8.04it/s][A
323it [00:40,  8.03it/s][A
324it [00:40,  8.03it/s][A
325it [00:40,  8.02it/s][A
326it [00:40,  8.04it/s][A
327it [00:41,  8.05it/s][A
328it [00:41,  8.01it/s][A
329it [00:41,  8.05it/s][A
330it [00:41,  8.04it/s][A
331it [00:41,  8.03it/s][A
332it [00:41,  8.05it/s][A
333it [00:41,  8.03it/s][A
334it [00:41,  8.04it/s][A
335it [00:42,  8.03it/s][A
336it [00:42,  8.03it/s][A
337it [00:42,  8.03


Epoch: 2, Test Loss: 1.1623218534395212, Test Accuracy: 0.7428628517233807




0it [00:00, ?it/s][A
1it [00:00,  9.61it/s][A
2it [00:00,  8.61it/s][A
3it [00:00,  8.34it/s][A
4it [00:00,  8.22it/s][A
5it [00:00,  8.09it/s][A
6it [00:00,  8.12it/s][A
7it [00:00,  8.09it/s][A
8it [00:00,  8.06it/s][A
9it [00:01,  8.05it/s][A
10it [00:01,  7.99it/s][A
11it [00:01,  8.06it/s][A
12it [00:01,  8.05it/s][A
13it [00:01,  8.05it/s][A
14it [00:01,  8.05it/s][A
15it [00:01,  8.04it/s][A
16it [00:01,  8.03it/s][A
17it [00:02,  8.03it/s][A
18it [00:02,  8.03it/s][A
19it [00:02,  8.04it/s][A
20it [00:02,  8.04it/s][A
21it [00:02,  8.03it/s][A
22it [00:02,  8.02it/s][A
23it [00:02,  8.03it/s][A
24it [00:02,  8.03it/s][A
25it [00:03,  8.03it/s][A
26it [00:03,  8.04it/s][A
27it [00:03,  8.03it/s][A
28it [00:03,  8.03it/s][A
29it [00:03,  8.03it/s][A
30it [00:03,  8.02it/s][A
31it [00:03,  8.03it/s][A
32it [00:03,  8.03it/s][A
33it [00:04,  8.01it/s][A
34it [00:04,  7.53it/s][A
35it [00:04,  7.67it/s][A
36it [00:04,  7.78it/s][A
37it [00:04,  

Epoch: 3, Step: 100, Loss: 0.45611252799630164



102it [00:12,  8.02it/s][A
103it [00:12,  8.02it/s][A
104it [00:12,  8.02it/s][A
105it [00:13,  8.02it/s][A
106it [00:13,  8.02it/s][A
107it [00:13,  8.02it/s][A
108it [00:13,  8.02it/s][A
109it [00:13,  8.02it/s][A
110it [00:13,  8.01it/s][A
111it [00:13,  8.03it/s][A
112it [00:13,  7.95it/s][A
113it [00:14,  7.99it/s][A
114it [00:14,  8.05it/s][A
115it [00:14,  8.04it/s][A
116it [00:14,  8.04it/s][A
117it [00:14,  8.04it/s][A
118it [00:14,  8.02it/s][A
119it [00:14,  8.03it/s][A
120it [00:14,  8.03it/s][A
121it [00:15,  8.01it/s][A
122it [00:15,  8.02it/s][A
123it [00:15,  8.02it/s][A
124it [00:15,  8.02it/s][A
125it [00:15,  8.02it/s][A
126it [00:15,  8.02it/s][A
127it [00:15,  8.02it/s][A
128it [00:15,  8.03it/s][A
129it [00:16,  8.01it/s][A
130it [00:16,  8.02it/s][A
131it [00:16,  8.01it/s][A
132it [00:16,  8.01it/s][A
133it [00:16,  8.02it/s][A
134it [00:16,  8.02it/s][A
135it [00:16,  8.02it/s][A
136it [00:16,  7.97it/s][A
137it [00:17,  8.03

Epoch: 3, Step: 200, Loss: 0.4378155267238617



202it [00:25,  8.02it/s][A
203it [00:25,  8.02it/s][A
204it [00:25,  8.02it/s][A
205it [00:25,  8.02it/s][A
206it [00:25,  8.02it/s][A
207it [00:25,  8.02it/s][A
208it [00:25,  8.01it/s][A
209it [00:26,  8.01it/s][A
210it [00:26,  8.01it/s][A
211it [00:26,  8.01it/s][A
212it [00:26,  8.02it/s][A
213it [00:26,  7.96it/s][A
214it [00:26,  8.02it/s][A
215it [00:26,  8.01it/s][A
216it [00:26,  8.02it/s][A
217it [00:27,  8.01it/s][A
218it [00:27,  8.02it/s][A
219it [00:27,  8.01it/s][A
220it [00:27,  7.99it/s][A
221it [00:27,  8.01it/s][A
222it [00:27,  8.00it/s][A
223it [00:27,  8.01it/s][A
224it [00:27,  8.01it/s][A
225it [00:28,  8.01it/s][A
226it [00:28,  8.01it/s][A
227it [00:28,  8.01it/s][A
228it [00:28,  8.01it/s][A
229it [00:28,  8.01it/s][A
230it [00:28,  8.01it/s][A
231it [00:28,  8.01it/s][A
232it [00:28,  8.01it/s][A
233it [00:29,  7.96it/s][A
234it [00:29,  8.03it/s][A
235it [00:29,  8.03it/s][A
236it [00:29,  8.03it/s][A
237it [00:29,  8.03

Epoch: 3, Step: 300, Loss: 0.4616526807347933



302it [00:37,  8.04it/s][A
303it [00:37,  8.04it/s][A
304it [00:37,  8.01it/s][A
305it [00:38,  8.02it/s][A
306it [00:38,  7.97it/s][A
307it [00:38,  7.98it/s][A
308it [00:38,  7.99it/s][A
309it [00:38,  8.05it/s][A
310it [00:38,  8.04it/s][A
311it [00:38,  8.03it/s][A
312it [00:38,  8.03it/s][A
313it [00:39,  8.02it/s][A
314it [00:39,  7.95it/s][A
315it [00:39,  8.05it/s][A
316it [00:39,  8.05it/s][A
317it [00:39,  8.04it/s][A
318it [00:39,  8.02it/s][A
319it [00:39,  8.02it/s][A
320it [00:39,  8.03it/s][A
321it [00:40,  8.02it/s][A
322it [00:40,  8.02it/s][A
323it [00:40,  8.02it/s][A
324it [00:40,  8.03it/s][A
325it [00:40,  8.02it/s][A
326it [00:40,  8.02it/s][A
327it [00:40,  8.02it/s][A
328it [00:40,  8.01it/s][A
329it [00:41,  8.00it/s][A
330it [00:41,  8.01it/s][A
331it [00:41,  8.01it/s][A
332it [00:41,  8.02it/s][A
333it [00:41,  8.01it/s][A
334it [00:41,  8.01it/s][A
335it [00:41,  8.01it/s][A
336it [00:41,  8.02it/s][A
337it [00:42,  8.02


Epoch: 3, Test Loss: 0.8504830091200826, Test Accuracy: 0.7630594628851127




0it [00:00, ?it/s][A
1it [00:00,  9.98it/s][A
2it [00:00,  8.78it/s][A
3it [00:00,  8.43it/s][A
4it [00:00,  8.27it/s][A
5it [00:00,  8.18it/s][A
6it [00:00,  8.13it/s][A
7it [00:00,  8.10it/s][A
8it [00:00,  8.08it/s][A
9it [00:01,  8.07it/s][A
10it [00:01,  8.05it/s][A
11it [00:01,  8.06it/s][A
12it [00:01,  8.05it/s][A
13it [00:01,  8.04it/s][A
14it [00:01,  8.04it/s][A
15it [00:01,  8.05it/s][A
16it [00:01,  8.04it/s][A
17it [00:02,  8.04it/s][A
18it [00:02,  8.03it/s][A
19it [00:02,  8.03it/s][A
20it [00:02,  8.03it/s][A
21it [00:02,  8.03it/s][A
22it [00:02,  8.03it/s][A
23it [00:02,  8.03it/s][A
24it [00:02,  8.03it/s][A
25it [00:03,  8.03it/s][A
26it [00:03,  8.04it/s][A
27it [00:03,  8.03it/s][A
28it [00:03,  8.03it/s][A
29it [00:03,  8.04it/s][A
30it [00:03,  8.03it/s][A
31it [00:03,  8.03it/s][A
32it [00:03,  8.04it/s][A
33it [00:04,  8.03it/s][A
34it [00:04,  8.03it/s][A
35it [00:04,  8.03it/s][A
36it [00:04,  8.03it/s][A
37it [00:04,  

Epoch: 4, Step: 100, Loss: 0.40482425920665266



102it [00:12,  8.03it/s][A
103it [00:12,  8.03it/s][A
104it [00:12,  8.03it/s][A
105it [00:13,  8.03it/s][A
106it [00:13,  8.03it/s][A
107it [00:13,  8.02it/s][A
108it [00:13,  8.03it/s][A
109it [00:13,  8.02it/s][A
110it [00:13,  8.02it/s][A
111it [00:13,  8.02it/s][A
112it [00:13,  8.02it/s][A
113it [00:14,  8.02it/s][A
114it [00:14,  8.02it/s][A
115it [00:14,  8.01it/s][A
116it [00:14,  8.02it/s][A
117it [00:14,  8.03it/s][A
118it [00:14,  7.95it/s][A
119it [00:14,  8.04it/s][A
120it [00:14,  8.02it/s][A
121it [00:15,  8.03it/s][A
122it [00:15,  8.03it/s][A
123it [00:15,  8.03it/s][A
124it [00:15,  8.02it/s][A
125it [00:15,  8.03it/s][A
126it [00:15,  8.03it/s][A
127it [00:15,  8.03it/s][A
128it [00:15,  8.03it/s][A
129it [00:16,  8.02it/s][A
130it [00:16,  8.02it/s][A
131it [00:16,  8.02it/s][A
132it [00:16,  8.02it/s][A
133it [00:16,  8.03it/s][A
134it [00:16,  8.03it/s][A
135it [00:16,  8.03it/s][A
136it [00:16,  8.02it/s][A
137it [00:17,  8.03

Epoch: 4, Step: 200, Loss: 0.3721511374413967



202it [00:25,  8.01it/s][A
203it [00:25,  8.02it/s][A
204it [00:25,  8.02it/s][A
205it [00:25,  8.01it/s][A
206it [00:25,  8.02it/s][A
207it [00:26,  8.01it/s][A
208it [00:26,  8.02it/s][A
209it [00:26,  8.02it/s][A
210it [00:26,  7.99it/s][A
211it [00:26,  8.02it/s][A
212it [00:26,  8.03it/s][A
213it [00:26,  8.03it/s][A
214it [00:26,  8.02it/s][A
215it [00:27,  8.01it/s][A
216it [00:27,  8.02it/s][A
217it [00:27,  8.01it/s][A
218it [00:27,  8.01it/s][A
219it [00:27,  8.01it/s][A
220it [00:27,  8.01it/s][A
221it [00:27,  8.02it/s][A
222it [00:27,  8.02it/s][A
223it [00:28,  8.01it/s][A
224it [00:28,  8.01it/s][A
225it [00:28,  8.01it/s][A
226it [00:28,  8.01it/s][A
227it [00:28,  8.01it/s][A
228it [00:28,  8.02it/s][A
229it [00:28,  8.02it/s][A
230it [00:28,  8.02it/s][A
231it [00:29,  8.02it/s][A
232it [00:29,  8.01it/s][A
233it [00:29,  8.02it/s][A
234it [00:29,  8.01it/s][A
235it [00:29,  8.01it/s][A
236it [00:29,  8.02it/s][A
237it [00:29,  8.02

Epoch: 4, Step: 300, Loss: 0.37302022342880564



302it [00:37,  8.02it/s][A
303it [00:38,  8.01it/s][A
304it [00:38,  8.02it/s][A
305it [00:38,  8.01it/s][A
306it [00:38,  8.02it/s][A
307it [00:38,  8.01it/s][A
308it [00:38,  7.99it/s][A
309it [00:38,  8.02it/s][A
310it [00:38,  8.02it/s][A
311it [00:39,  8.01it/s][A
312it [00:39,  8.02it/s][A
313it [00:39,  8.02it/s][A
314it [00:39,  8.02it/s][A
315it [00:39,  8.01it/s][A
316it [00:39,  8.01it/s][A
317it [00:39,  8.01it/s][A
318it [00:39,  8.01it/s][A
319it [00:40,  8.01it/s][A
320it [00:40,  8.02it/s][A
321it [00:40,  8.02it/s][A
322it [00:40,  8.01it/s][A
323it [00:40,  8.02it/s][A
324it [00:40,  7.97it/s][A
325it [00:40,  8.04it/s][A
326it [00:40,  8.04it/s][A
327it [00:41,  8.03it/s][A
328it [00:41,  8.03it/s][A
329it [00:41,  8.03it/s][A
330it [00:41,  8.03it/s][A
331it [00:41,  8.03it/s][A
332it [00:41,  8.03it/s][A
333it [00:41,  8.03it/s][A
334it [00:41,  8.04it/s][A
335it [00:42,  8.02it/s][A
336it [00:42,  8.02it/s][A
337it [00:42,  8.03


Epoch: 4, Test Loss: 1.3579058593039013, Test Accuracy: 0.8242087595907929




0it [00:00, ?it/s][A
2it [00:00,  8.95it/s][A
3it [00:00,  8.55it/s][A
4it [00:00,  8.35it/s][A
5it [00:00,  8.24it/s][A
6it [00:00,  8.17it/s][A
7it [00:00,  8.13it/s][A
8it [00:00,  8.10it/s][A
9it [00:01,  8.08it/s][A
10it [00:01,  8.07it/s][A
11it [00:01,  8.05it/s][A
12it [00:01,  8.05it/s][A
13it [00:01,  8.04it/s][A
14it [00:01,  8.04it/s][A
15it [00:01,  8.03it/s][A
16it [00:01,  8.04it/s][A
17it [00:02,  8.04it/s][A
18it [00:02,  8.03it/s][A
19it [00:02,  8.03it/s][A
20it [00:02,  8.03it/s][A
21it [00:02,  8.03it/s][A
22it [00:02,  8.03it/s][A
23it [00:02,  8.04it/s][A
24it [00:02,  8.03it/s][A
25it [00:03,  8.03it/s][A
26it [00:03,  8.04it/s][A
27it [00:03,  8.02it/s][A
28it [00:03,  8.04it/s][A
29it [00:03,  8.04it/s][A
30it [00:03,  8.04it/s][A
31it [00:03,  8.04it/s][A
32it [00:03,  8.04it/s][A
33it [00:04,  8.04it/s][A
34it [00:04,  8.04it/s][A
35it [00:04,  8.04it/s][A
36it [00:04,  8.03it/s][A
37it [00:04,  8.03it/s][A
38it [00:04, 

Epoch: 5, Step: 100, Loss: 1.4829219470918178



102it [00:12,  7.97it/s][A
103it [00:12,  8.05it/s][A
104it [00:12,  8.04it/s][A
105it [00:13,  8.04it/s][A
106it [00:13,  8.04it/s][A
107it [00:13,  8.03it/s][A
108it [00:13,  8.03it/s][A
109it [00:13,  8.02it/s][A
110it [00:13,  8.03it/s][A
111it [00:13,  8.03it/s][A
112it [00:13,  8.03it/s][A
113it [00:14,  8.03it/s][A
114it [00:14,  8.02it/s][A
115it [00:14,  8.03it/s][A
116it [00:14,  8.03it/s][A
117it [00:14,  8.03it/s][A
118it [00:14,  8.02it/s][A
119it [00:14,  8.03it/s][A
120it [00:14,  8.02it/s][A
121it [00:15,  8.02it/s][A
122it [00:15,  8.03it/s][A
123it [00:15,  8.03it/s][A
124it [00:15,  8.03it/s][A
125it [00:15,  8.03it/s][A
126it [00:15,  8.03it/s][A
127it [00:15,  8.02it/s][A
128it [00:15,  8.02it/s][A
129it [00:16,  8.02it/s][A
130it [00:16,  8.00it/s][A
131it [00:16,  8.02it/s][A
132it [00:16,  7.97it/s][A
133it [00:16,  7.99it/s][A
134it [00:16,  8.05it/s][A
135it [00:16,  8.04it/s][A
136it [00:16,  8.03it/s][A
137it [00:17,  8.03

Epoch: 5, Step: 200, Loss: 0.9916340604424476



202it [00:25,  8.03it/s][A
203it [00:25,  7.97it/s][A
204it [00:25,  8.03it/s][A
205it [00:25,  8.03it/s][A
206it [00:25,  8.02it/s][A
207it [00:25,  8.03it/s][A
208it [00:25,  8.01it/s][A
209it [00:26,  7.97it/s][A
210it [00:26,  8.04it/s][A
211it [00:26,  8.04it/s][A
212it [00:26,  8.03it/s][A
213it [00:26,  8.02it/s][A
214it [00:26,  8.02it/s][A
215it [00:26,  8.03it/s][A
216it [00:26,  8.01it/s][A
217it [00:27,  8.02it/s][A
218it [00:27,  8.02it/s][A
219it [00:27,  8.02it/s][A
220it [00:27,  8.02it/s][A
221it [00:27,  8.00it/s][A
222it [00:27,  8.01it/s][A
223it [00:27,  8.02it/s][A
224it [00:27,  8.01it/s][A
225it [00:28,  8.03it/s][A
226it [00:28,  8.03it/s][A
227it [00:28,  8.02it/s][A
228it [00:28,  8.02it/s][A
229it [00:28,  8.02it/s][A
230it [00:28,  8.01it/s][A
231it [00:28,  8.02it/s][A
232it [00:28,  8.01it/s][A
233it [00:29,  8.02it/s][A
234it [00:29,  8.02it/s][A
235it [00:29,  8.02it/s][A
236it [00:29,  8.00it/s][A
237it [00:29,  8.02

Epoch: 5, Step: 300, Loss: 0.7801235432426135



302it [00:37,  8.02it/s][A
303it [00:37,  7.97it/s][A
304it [00:37,  8.03it/s][A
305it [00:37,  8.03it/s][A
306it [00:38,  8.03it/s][A
307it [00:38,  8.03it/s][A
308it [00:38,  8.03it/s][A
309it [00:38,  8.03it/s][A
310it [00:38,  8.03it/s][A
311it [00:38,  8.00it/s][A
312it [00:38,  8.02it/s][A
313it [00:38,  8.03it/s][A
314it [00:39,  8.02it/s][A
315it [00:39,  8.01it/s][A
316it [00:39,  8.02it/s][A
317it [00:39,  8.01it/s][A
318it [00:39,  8.00it/s][A
319it [00:39,  8.01it/s][A
320it [00:39,  8.02it/s][A
321it [00:39,  7.97it/s][A
322it [00:40,  8.04it/s][A
323it [00:40,  8.03it/s][A
324it [00:40,  8.03it/s][A
325it [00:40,  8.03it/s][A
326it [00:40,  8.03it/s][A
327it [00:40,  8.03it/s][A
328it [00:40,  8.03it/s][A
329it [00:40,  8.03it/s][A
330it [00:41,  8.03it/s][A
331it [00:41,  8.03it/s][A
332it [00:41,  8.03it/s][A
333it [00:41,  8.03it/s][A
334it [00:41,  8.03it/s][A
335it [00:41,  8.02it/s][A
336it [00:41,  8.02it/s][A
337it [00:41,  8.03


Epoch: 5, Test Loss: 0.6455908578337001, Test Accuracy: 0.8349424553344317




0it [00:00, ?it/s][A
1it [00:00,  9.98it/s][A
2it [00:00,  8.75it/s][A
3it [00:00,  8.39it/s][A
4it [00:00,  8.23it/s][A
5it [00:00,  8.17it/s][A
6it [00:00,  8.11it/s][A
7it [00:00,  8.09it/s][A
8it [00:00,  8.07it/s][A
9it [00:01,  8.05it/s][A
10it [00:01,  7.99it/s][A
11it [00:01,  8.06it/s][A
12it [00:01,  8.04it/s][A
13it [00:01,  8.00it/s][A
14it [00:01,  8.05it/s][A
15it [00:01,  8.04it/s][A
16it [00:01,  8.04it/s][A
17it [00:02,  8.04it/s][A
18it [00:02,  8.04it/s][A
19it [00:02,  8.04it/s][A
20it [00:02,  8.03it/s][A
21it [00:02,  8.03it/s][A
22it [00:02,  8.03it/s][A
23it [00:02,  8.02it/s][A
24it [00:02,  8.03it/s][A
25it [00:03,  8.02it/s][A
26it [00:03,  8.03it/s][A
27it [00:03,  8.03it/s][A
28it [00:03,  8.03it/s][A
29it [00:03,  8.04it/s][A
30it [00:03,  8.04it/s][A
31it [00:03,  8.04it/s][A
32it [00:03,  8.04it/s][A
33it [00:04,  8.03it/s][A
34it [00:04,  8.03it/s][A
35it [00:04,  8.04it/s][A
36it [00:04,  8.03it/s][A
37it [00:04,  

Epoch: 6, Step: 100, Loss: 0.22662085792049766



102it [00:12,  8.04it/s][A
103it [00:12,  8.03it/s][A
104it [00:12,  8.03it/s][A
105it [00:13,  8.02it/s][A
106it [00:13,  8.04it/s][A
107it [00:13,  8.03it/s][A
108it [00:13,  8.04it/s][A
109it [00:13,  8.04it/s][A
110it [00:13,  8.04it/s][A
111it [00:13,  8.02it/s][A
112it [00:13,  8.03it/s][A
113it [00:14,  8.03it/s][A
114it [00:14,  8.04it/s][A
115it [00:14,  8.01it/s][A
116it [00:14,  8.04it/s][A
117it [00:14,  7.97it/s][A
118it [00:14,  8.05it/s][A
119it [00:14,  8.04it/s][A
120it [00:14,  8.04it/s][A
121it [00:15,  8.04it/s][A
122it [00:15,  8.03it/s][A
123it [00:15,  8.03it/s][A
124it [00:15,  8.03it/s][A
125it [00:15,  8.03it/s][A
126it [00:15,  8.03it/s][A
127it [00:15,  8.02it/s][A
128it [00:15,  8.03it/s][A
129it [00:16,  8.03it/s][A
130it [00:16,  8.02it/s][A
131it [00:16,  8.02it/s][A
132it [00:16,  8.02it/s][A
133it [00:16,  8.03it/s][A
134it [00:16,  8.03it/s][A
135it [00:16,  8.03it/s][A
136it [00:16,  8.02it/s][A
137it [00:17,  8.02

Epoch: 6, Step: 200, Loss: 0.434676751261577



202it [00:25,  8.03it/s][A
203it [00:25,  8.01it/s][A
204it [00:25,  8.03it/s][A
205it [00:25,  8.02it/s][A
206it [00:25,  8.03it/s][A
207it [00:26,  7.96it/s][A
208it [00:26,  8.01it/s][A
209it [00:26,  8.01it/s][A
210it [00:26,  8.02it/s][A
211it [00:26,  8.03it/s][A
212it [00:26,  8.03it/s][A
213it [00:26,  8.02it/s][A
214it [00:26,  8.02it/s][A
215it [00:27,  8.02it/s][A
216it [00:27,  8.03it/s][A
217it [00:27,  8.03it/s][A
218it [00:27,  8.03it/s][A
219it [00:27,  8.03it/s][A
220it [00:27,  7.97it/s][A
221it [00:27,  8.05it/s][A
222it [00:27,  8.04it/s][A
223it [00:28,  8.04it/s][A
224it [00:28,  8.02it/s][A
225it [00:28,  8.02it/s][A
226it [00:28,  8.02it/s][A
227it [00:28,  8.03it/s][A
228it [00:28,  8.02it/s][A
229it [00:28,  8.02it/s][A
230it [00:28,  8.02it/s][A
231it [00:29,  8.02it/s][A
232it [00:29,  8.02it/s][A
233it [00:29,  8.02it/s][A
234it [00:29,  8.03it/s][A
235it [00:29,  8.03it/s][A
236it [00:29,  7.98it/s][A
237it [00:29,  8.04

Epoch: 6, Step: 300, Loss: 867.5778635382342



302it [00:37,  8.04it/s][A
303it [00:38,  8.04it/s][A
304it [00:38,  8.04it/s][A
305it [00:38,  8.03it/s][A
306it [00:38,  8.04it/s][A
307it [00:38,  8.03it/s][A
308it [00:38,  8.05it/s][A
309it [00:38,  8.05it/s][A
310it [00:38,  8.04it/s][A
311it [00:39,  8.03it/s][A
312it [00:39,  8.04it/s][A
313it [00:39,  8.04it/s][A
314it [00:39,  8.04it/s][A
315it [00:39,  8.04it/s][A
316it [00:39,  8.05it/s][A
317it [00:39,  8.05it/s][A
318it [00:39,  8.04it/s][A
319it [00:40,  8.05it/s][A
320it [00:40,  8.04it/s][A
321it [00:40,  8.04it/s][A
322it [00:40,  8.05it/s][A
323it [00:40,  8.04it/s][A
324it [00:40,  8.04it/s][A
325it [00:40,  8.03it/s][A
326it [00:40,  8.03it/s][A
327it [00:41,  8.03it/s][A
328it [00:41,  8.03it/s][A
329it [00:41,  8.03it/s][A
330it [00:41,  8.02it/s][A
331it [00:41,  8.03it/s][A
332it [00:41,  8.03it/s][A
333it [00:41,  8.04it/s][A
334it [00:41,  8.01it/s][A
335it [00:42,  8.05it/s][A
336it [00:42,  8.05it/s][A
337it [00:42,  8.03


Epoch: 6, Test Loss: 3.4143905042077574, Test Accuracy: 0.8345668158872658




0it [00:00, ?it/s][A
2it [00:00,  8.96it/s][A
3it [00:00,  8.56it/s][A
4it [00:00,  8.37it/s][A
5it [00:00,  8.25it/s][A
6it [00:00,  8.17it/s][A
7it [00:00,  8.14it/s][A
8it [00:00,  8.11it/s][A
9it [00:01,  8.09it/s][A
10it [00:01,  8.08it/s][A
11it [00:01,  8.07it/s][A
12it [00:01,  8.06it/s][A
13it [00:01,  8.06it/s][A
14it [00:01,  8.06it/s][A
15it [00:01,  8.05it/s][A
16it [00:01,  8.06it/s][A
17it [00:02,  8.05it/s][A
18it [00:02,  8.04it/s][A
19it [00:02,  8.03it/s][A
20it [00:02,  8.04it/s][A
21it [00:02,  8.04it/s][A
22it [00:02,  8.02it/s][A
23it [00:02,  8.02it/s][A
24it [00:02,  8.03it/s][A
25it [00:03,  8.03it/s][A
26it [00:03,  8.03it/s][A
27it [00:03,  8.03it/s][A
28it [00:03,  8.03it/s][A
29it [00:03,  8.04it/s][A
30it [00:03,  8.03it/s][A
31it [00:03,  8.04it/s][A
32it [00:03,  8.04it/s][A
33it [00:04,  8.03it/s][A
34it [00:04,  8.03it/s][A
35it [00:04,  8.03it/s][A
36it [00:04,  8.01it/s][A
37it [00:04,  8.04it/s][A
38it [00:04, 

Epoch: 7, Step: 100, Loss: 1.212793165743351



102it [00:12,  8.02it/s][A
103it [00:12,  8.04it/s][A
104it [00:12,  8.03it/s][A
105it [00:13,  8.03it/s][A
106it [00:13,  8.03it/s][A
107it [00:13,  8.03it/s][A
108it [00:13,  8.04it/s][A
109it [00:13,  8.04it/s][A
110it [00:13,  8.03it/s][A
111it [00:13,  8.02it/s][A
112it [00:13,  8.03it/s][A
113it [00:14,  8.02it/s][A
114it [00:14,  8.03it/s][A
115it [00:14,  8.02it/s][A
116it [00:14,  8.00it/s][A
117it [00:14,  8.04it/s][A
118it [00:14,  8.03it/s][A
119it [00:14,  8.04it/s][A
120it [00:14,  8.03it/s][A
121it [00:15,  8.03it/s][A
122it [00:15,  8.03it/s][A
123it [00:15,  8.04it/s][A
124it [00:15,  8.03it/s][A
125it [00:15,  8.04it/s][A
126it [00:15,  8.04it/s][A
127it [00:15,  8.03it/s][A
128it [00:15,  8.00it/s][A
129it [00:16,  8.03it/s][A
130it [00:16,  8.03it/s][A
131it [00:16,  8.03it/s][A
132it [00:16,  7.97it/s][A
133it [00:16,  8.02it/s][A
134it [00:16,  8.00it/s][A
135it [00:16,  8.01it/s][A
136it [00:16,  8.00it/s][A
137it [00:17,  8.01

Epoch: 7, Step: 200, Loss: 0.9951608566753566



202it [00:25,  8.03it/s][A
203it [00:25,  8.03it/s][A
204it [00:25,  8.02it/s][A
205it [00:25,  8.02it/s][A
206it [00:25,  8.03it/s][A
207it [00:25,  8.02it/s][A
208it [00:25,  8.02it/s][A
209it [00:26,  8.02it/s][A
210it [00:26,  8.02it/s][A
211it [00:26,  8.01it/s][A
212it [00:26,  8.01it/s][A
213it [00:26,  8.01it/s][A
214it [00:26,  8.01it/s][A
215it [00:26,  8.02it/s][A
216it [00:26,  8.02it/s][A
217it [00:27,  8.02it/s][A
218it [00:27,  8.02it/s][A
219it [00:27,  8.02it/s][A
220it [00:27,  8.01it/s][A
221it [00:27,  8.01it/s][A
222it [00:27,  8.02it/s][A
223it [00:27,  8.02it/s][A
224it [00:27,  8.02it/s][A
225it [00:28,  8.02it/s][A
226it [00:28,  8.01it/s][A
227it [00:28,  8.01it/s][A
228it [00:28,  8.01it/s][A
229it [00:28,  8.01it/s][A
230it [00:28,  8.03it/s][A
231it [00:28,  8.02it/s][A
232it [00:28,  8.02it/s][A
233it [00:29,  8.02it/s][A
234it [00:29,  8.01it/s][A
235it [00:29,  8.02it/s][A
236it [00:29,  8.02it/s][A
237it [00:29,  8.00

Epoch: 7, Step: 300, Loss: 0.8472936454042792



302it [00:37,  8.02it/s][A
303it [00:37,  8.02it/s][A
304it [00:37,  8.02it/s][A
305it [00:37,  8.02it/s][A
306it [00:38,  8.02it/s][A
307it [00:38,  8.02it/s][A
308it [00:38,  8.02it/s][A
309it [00:38,  8.01it/s][A
310it [00:38,  8.02it/s][A
311it [00:38,  8.00it/s][A
312it [00:38,  8.01it/s][A
313it [00:38,  8.01it/s][A
314it [00:39,  8.02it/s][A
315it [00:39,  8.02it/s][A
316it [00:39,  8.02it/s][A
317it [00:39,  8.02it/s][A
318it [00:39,  8.02it/s][A
319it [00:39,  8.00it/s][A
320it [00:39,  8.03it/s][A
321it [00:39,  8.03it/s][A
322it [00:40,  8.02it/s][A
323it [00:40,  8.02it/s][A
324it [00:40,  8.02it/s][A
325it [00:40,  7.98it/s][A
326it [00:40,  8.04it/s][A
327it [00:40,  8.04it/s][A
328it [00:40,  8.03it/s][A
329it [00:40,  8.03it/s][A
330it [00:41,  8.02it/s][A
331it [00:41,  8.03it/s][A
332it [00:41,  8.02it/s][A
333it [00:41,  8.03it/s][A
334it [00:41,  8.03it/s][A
335it [00:41,  8.03it/s][A
336it [00:41,  8.03it/s][A
337it [00:41,  8.03


Epoch: 7, Test Loss: 0.9361355628275201, Test Accuracy: 0.8346787084398977




0it [00:00, ?it/s][A
1it [00:00,  9.92it/s][A
2it [00:00,  8.70it/s][A
3it [00:00,  8.41it/s][A
4it [00:00,  8.24it/s][A
5it [00:00,  8.17it/s][A
6it [00:00,  8.11it/s][A
7it [00:00,  8.08it/s][A
8it [00:00,  8.02it/s][A
9it [00:01,  8.07it/s][A
10it [00:01,  8.07it/s][A
11it [00:01,  8.05it/s][A
12it [00:01,  8.06it/s][A
13it [00:01,  8.04it/s][A
14it [00:01,  8.04it/s][A
15it [00:01,  8.04it/s][A
16it [00:01,  8.01it/s][A
17it [00:02,  8.05it/s][A
18it [00:02,  8.04it/s][A
19it [00:02,  8.04it/s][A
20it [00:02,  8.04it/s][A
21it [00:02,  8.04it/s][A
22it [00:02,  8.04it/s][A
23it [00:02,  8.04it/s][A
24it [00:02,  8.04it/s][A
25it [00:03,  8.04it/s][A
26it [00:03,  8.04it/s][A
27it [00:03,  8.03it/s][A
28it [00:03,  8.03it/s][A
29it [00:03,  8.03it/s][A
30it [00:03,  8.03it/s][A
31it [00:03,  8.03it/s][A
32it [00:03,  8.04it/s][A
33it [00:04,  8.03it/s][A
34it [00:04,  8.04it/s][A
35it [00:04,  8.04it/s][A
36it [00:04,  8.04it/s][A
37it [00:04,  

KeyboardInterrupt: 

# Quantization Test

In [None]:
import copy

# Evaluate a naively weight-quantized copy of the trained classifier on the
# IMDB test loader and report its mean loss and accuracy.

# Quantization settings, named so the report below can state what was tested.
quant_w_bit = 4
quant_group_size = 128

# Work on a deep copy so the full-precision `model` stays untouched.
model_quant_naive = copy.deepcopy(model)

# NOTE(review): the return value is ignored, so this presumably quantizes the
# weights of `model_quant_naive` in place -- confirm against the helper.
pseudo_quantize_model_weight(model_quant_naive, w_bit=quant_w_bit, q_group_size=quant_group_size)

# Batch fields consumed below; loop-invariant, so built once.
data_fields = ['input_ids', 'attention_mask', 'label']

accuracies = []
losses = []
model_quant_naive.eval()
with torch.no_grad():
  # Iterating the loader directly (no enumerate) lets tqdm pick up its length.
  for batch in tqdm.tqdm(imdb_test_loader):
    for k in data_fields:
      if k == 'label':
        continue
      # presumably the loader yields each token field as a sequence of
      # per-position tensors; stacking on dim=1 would then give one row per
      # example -- TODO confirm against the dataset/collate function.
      try:
        batch[k] = torch.stack(batch[k], dim=1)
      except (TypeError, RuntimeError) as err:
        raise ValueError(
          f"Could not stack batch field {k!r} "
          f"(got {type(batch[k]).__name__})"
        ) from err

    # as_tensor passes existing tensors through without the extra copy (and
    # the copy-construct UserWarning) that torch.tensor(tensor) produces.
    batch = {k: torch.as_tensor(batch[k]).to(device, non_blocking=True) for k in data_fields}
    y_pred = model_quant_naive(batch['input_ids'], attention_mask=batch['attention_mask'])

    loss = IMDBMetrics.loss(y_pred, batch['label'])
    accuracies.append(IMDBMetrics.accuracy(y_pred, batch['label']).item())
    losses.append(loss.item())

# Unweighted mean over batches (a smaller final batch counts the same as a
# full one).
avg_loss = np.mean(losses)
avg_accuracy = np.mean(accuracies)

# Label the result by quantization config rather than by the `epoch` variable
# left over from the training cell, which may be stale or undefined here.
print()
print(
  f"Quantized (w_bit={quant_w_bit}, q_group_size={quant_group_size}), "
  f"Test Loss: {avg_loss}, Test Accuracy: {avg_accuracy}"
)
print()

  batch = {k: torch.tensor(batch[k]).to(device, non_blocking=True) for k in data_fields}
391it [00:35, 10.97it/s]


Epoch: 8, Test Loss: 1.105022748870313, Test Accuracy: 0.827141943794992






# Misc