In [1]:
# Taken from https://medium.com/data-and-beyond/complete-guide-to-building-bert-model-from-sratch-3e6562228891
import os
from pathlib import Path
import torch
import re
import random
from tokenizers import BertWordPieceTokenizer
import tqdm
from torch.utils.data import Dataset, DataLoader
import itertools
import math
import torch.nn.functional as F
import numpy as np
from torch.optim import Adam
import datasets
import transformers
from transformers import BertTokenizer

In [2]:
MAX_LEN = 64

### loading all data into memory
corpus_movie_conv = './datasets/movie_conversations.txt'
corpus_movie_lines = './datasets/movie_lines.txt'
with open(corpus_movie_conv, 'r', encoding='iso-8859-1') as c:
    conv = c.readlines() 
with open(corpus_movie_lines, 'r', encoding='iso-8859-1') as l:
    lines = l.readlines()

In [3]:
### splitting text using special lines
lines_dic = {}
for line in lines:
    objects = line.split(" +++$+++ ")
    lines_dic[objects[0]] = objects[-1]
list(lines_dic.items())[0:4]

[('L1045', 'They do not!\n'),
 ('L1044', 'They do to!\n'),
 ('L985', 'I hope so.\n'),
 ('L984', 'She okay?\n')]

In [4]:
### generate question answer pairs
pairs = []
for con in conv:
    ids = eval(con.split(" +++$+++ ")[-1])
    for i in range(len(ids)):
        qa_pairs = []
        
        if i == len(ids) - 1:
            break

        first = lines_dic[ids[i]].strip()  
        second = lines_dic[ids[i+1]].strip() 

        qa_pairs.append(' '.join(first.split()[:MAX_LEN]))
        qa_pairs.append(' '.join(second.split()[:MAX_LEN]))
        pairs.append(qa_pairs)
print(len(pairs))
print(pairs[0:3])

221616
[['Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again.', "Well, I thought we'd start with pronunciation, if that's okay with you."], ["Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part. Please.'], ['Not the hacking and gagging and spitting part. Please.', "Okay... then how 'bout we try out some French cuisine. Saturday? Night?"]]


In [5]:
### save data as txt file
os.mkdir('./data')
text_data = []
file_count = 0

for sample in tqdm.tqdm([x[0] for x in pairs]):
    text_data.append(sample)

    # once we hit the 10K mark, save to file
    if len(text_data) == 10000:
        with open(f'./data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1

paths = [str(x) for x in Path('./data').glob('**/*.txt')]

100%|████████████████████████████████████████████████████████████████████████| 221616/221616 [00:00<00:00, 2307967.99it/s]


In [6]:
### training own tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)

tokenizer.train( 
    files=paths,
    vocab_size=30_000, 
    min_frequency=5,
    limit_alphabet=1000, 
    wordpieces_prefix='##',
    special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']
    )

os.mkdir('./bert-it-1')
tokenizer.save_model('./bert-it-1', 'bert-it')
tokenizer = BertTokenizer.from_pretrained('./bert-it-1/bert-it-vocab.txt', local_files_only=True)








In [7]:
class BERTDataset(Dataset):
    def __init__(self, data_pair, tokenizer, seq_len=64):

        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.corpus_lines = len(data_pair)
        self.lines = data_pair

    def __len__(self):
        return self.corpus_lines

    def __getitem__(self, item):

        # Step 1: get random sentence pair, either negative or positive (saved as is_next_label)
        t1, t2, is_next_label = self.get_sent(item)    

        # Step 2: replace random words in sentence with mask / random words
        t1_random, t1_label = self.random_word(t1)
        t2_random, t2_label = self.random_word(t2)

        # Step 3: Adding CLS and SEP tokens to the start and end of sentences
         # Adding PAD token for labels
        t1 = [self.tokenizer.vocab['[CLS]']] + t1_random + [self.tokenizer.vocab['[SEP]']]

        t2 = t2_random + [self.tokenizer.vocab['[SEP]']]
        t1_label = [self.tokenizer.vocab['[PAD]']] + t1_label + [self.tokenizer.vocab['[PAD]']]

        t2_label = t2_label + [self.tokenizer.vocab['[PAD]']]

        # Step 4: combine sentence 1 and 2 as one input
        # adding PAD tokens to make the sentence same length as seq_len
        segment_label = ([1 for _ in range(len(t1))] + [2 for _ in range(len(t2))])[:self.seq_len]
        bert_input = (t1 + t2)[:self.seq_len]
        bert_label = (t1_label + t2_label)[:self.seq_len]
        padding = [self.tokenizer.vocab['[PAD]'] for _ in range(self.seq_len - len(bert_input))]
        bert_input.extend(padding), bert_label.extend(padding), segment_label.extend(padding)

        output = {"bert_input": bert_input,
                  "bert_label": bert_label,
                  "segment_label": segment_label,
                  "is_next": is_next_label}

        return {key: torch.tensor(value) for key, value in output.items()}

    def random_word(self, sentence):
        tokens = sentence.split()
        output_label = []
        output = []

        # 15% of the tokens would be replaced
        for i, token in enumerate(tokens):
            prob = random.random()

            # remove cls and sep token
            token_id = self.tokenizer(token)['input_ids'][1:-1]
            
            if prob < 0.15:
                prob /= 0.15

                # 80% chance change token to mask token
                if prob < 0.8:
                    for i in range(len(token_id)):
                        output.append(self.tokenizer.vocab['[MASK]'])

                # 10% chance change token to random token
                elif prob < 0.9:
                    for i in range(len(token_id)):
                        output.append(random.randrange(len(self.tokenizer.vocab)))

                # 10% chance change token to current token
                else:
                    output.append(token_id)

                output_label.append(token_id)

            else:
                output.append(token_id)
                for i in range(len(token_id)):
                    output_label.append(0)

        # flattening
        output = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in output]))
        output_label = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in output_label]))
        assert len(output) == len(output_label)
        return output, output_label

    def get_sent(self, index):
        '''return random sentence pair'''
        t1, t2 = self.get_corpus_line(index)

        # negative or positive pair, for next sentence prediction
        if random.random() > 0.5:
            return t1, t2, 1
        else:
            return t1, self.get_random_line(), 0

    def get_corpus_line(self, item):
        '''return sentence pair'''
        return self.lines[item][0], self.lines[item][1]

    def get_random_line(self):
        '''return random single sentence'''
        return self.lines[random.randrange(len(self.lines))][1]

In [8]:
train_data = BERTDataset(
   pairs, seq_len=MAX_LEN, tokenizer=tokenizer)
train_loader = DataLoader(
   train_data, batch_size=32, shuffle=True, pin_memory=True)
print_sample = train_data[random.randrange(len(train_data))]
print(print_sample)
print(tokenizer.decode(print_sample['bert_input']))

{'bert_input': tensor([  1, 580,  17,   2,   3, 146,  15, 592,  17,   2,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0]), 'bert_label': tensor([  0,   0,   0,   0, 419,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0]), 'segment_label': tensor([1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'is_next': t

In [9]:
train_data[0]

{'bert_input': tensor([    1,   230,   184,   432,   208,  1712,    34,   529,   119, 13336,
             3,     3,     3,     3, 12824,   234,  1135,   160,  6991,  4996,
         19495,   313,  1993,  1023,    16,     3,   192,   150,  8110,    17,
           542,    17,     2,   303,    15,    48,   515,   184,    11,    43,
           672,   231, 15149,   295,  7732,   105,   242,    15,   270,   173,
            11,    58,   459,   231,   146,    17,     2,     0,     0,     0,
             0,     0,     0,     0]),
 'bert_label': tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
          8254, 10999,   179,  5189,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,   275,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0, 15149,   295,  7732,   105,   242,    15,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,

In [824]:
from torch import nn
class TokenTypelEmbeddings(nn.Module):
    def __init__(self, num_token_types = 3, embedding_dim = 768):
        super().__init__()
        self.token_type_embeddings = nn.Embedding(num_token_types, embedding_dim, padding_idx=0) # TokenizEr pad id

    def forward(self, input):
        return self.token_type_embeddings(input)

In [825]:
class WordEmbeddings(nn.Module):
    def __init__(self, vocab_size, embedding_dim = 768):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

    def forward(self, input):
        return self.word_embeddings(input)

In [826]:
class PositionalEmbeddings(nn.Module):
    def __init__(self, seq_len, embedding_dim = 768):
        super().__init__()
        self.seq_len = seq_len
        positional_embedding = torch.zeros((seq_len, embedding_dim))
        positional_embedding.require_grad = False
        
        for pos in range(seq_len):
            for i in range(0, embedding_dim, 2):
                positional_embedding[pos, i] = math.sin(pos / (10000 ** ((2 * i)/embedding_dim)))
                positional_embedding[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/embedding_dim)))
            
        self.positional_embeddings = positional_embedding.unsqueeze(0)

    def forward(self, input):
        return self.positional_embeddings
        

In [827]:
class BertEmbeddingLayer(nn.Module):
    
    def __init__(self, vocab_size, seq_len = 64, embedding_dim = 768, num_token_types = 3, dropout=0.1):
        super().__init__()
        self.positional_embeddings = PositionalEmbeddings(seq_len, embedding_dim)
        self.token_type_embeddings = TokenTypelEmbeddings(num_token_types, embedding_dim)
        self.word_embeddings = WordEmbeddings(vocab_size, embedding_dim)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, input_tokens, input_token_types):
        x = self.positional_embeddings(input_tokens) + self.word_embeddings(input_tokens) + self.token_type_embeddings(input_token_types)
        return self.dropout(x)
        

In [828]:
class Projection(nn.Module):
    def __init__(self, embedding_dim = 768):
        super().__init__()
        self.queries = nn.Linear(embedding_dim, embedding_dim)
        self.keys = nn.Linear(embedding_dim, embedding_dim)
        self.values = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, input_embeddings):
        return self.queries(input_embeddings), self.keys(input_embeddings), self.values(input_embeddings)

In [829]:
import math
class Attention(nn.Module):
    def __init__(self, num_attention_heads = 8, embedding_dim = 768, dropout=0.1):
        super().__init__()
        self.num_attention_heads = num_attention_heads
        self.dropout = torch.nn.Dropout(dropout)
        self.output_linear = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, q, k, v, padding_mask):
        bs, seq_len, embed_dim = q.shape
        num_elem_per_attn_head = embed_dim//self.num_attention_heads

        qr = q.view(bs, seq_len, self.num_attention_heads, -1).permute(0,2,1,3)
        vr = v.view(bs, seq_len, self.num_attention_heads, -1).permute(0,2,1,3)
        kr = k.view(bs, seq_len, self.num_attention_heads, -1).permute(0,2,1,3)

        attn_scores = torch.matmul(qr, kr.permute(0,1,3,2))/math.sqrt(num_elem_per_attn_head)
        #ATTN SCORES SHAPE torch.Size([16, 12, 64, 64]) , PADDING MASK SHAPE torch.Size([16, 1, 64, 64])
        attn_scores_masked = attn_scores.masked_fill(padding_mask == 0, -1e9)
        attn_scores_normalized = torch.nn.functional.softmax(attn_scores_masked, dim=-1)
        attn_scores_regularized = self.dropout(attn_scores_normalized)
                        
        attn_output = torch.matmul(attn_scores_regularized, vr).permute(0,2,1,3).contiguous().view(bs, seq_len, -1)
        return self.output_linear(attn_output)

In [830]:
class FullyConnected(nn.Module):
    def __init__(self, embedding_dim = 768, expansion_factor = 4, dropout = 0.1):
        super().__init__()
        intermediate_dim = embedding_dim * expansion_factor 
        self.fc1 = nn.Linear(embedding_dim, intermediate_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(intermediate_dim, embedding_dim)
        self.activation = nn.GELU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.activation(x)
        return x

In [831]:
class TransformerLayer(nn.Module):
    def __init__(self, embedding_dim = 768, num_attention_heads = 8, expansion_factor = 4, dropout = 0.1):
        super().__init__()
        self.projection = Projection(embedding_dim)
        self.attention = Attention(num_attention_heads, embedding_dim)
        self.layernorm1 = nn.LayerNorm(embedding_dim)
        self.fully_connected = FullyConnected(embedding_dim, expansion_factor)
        self.layernorm2 = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, padding_mask):
        queries, keys, values = self.projection(x)
        attention_ouptut = self.attention(queries, keys, values, padding_mask)
        attention_output_regularized = self.dropout(attention_ouptut)
        residual_1 = x + attention_output_regularized
        layer_norm_1 = self.layernorm1(residual_1)
        fully_connected_output = self.fully_connected(layer_norm_1)
        fully_connected_output_regularized = self.dropout(fully_connected_output)
        residual_2 = layer_norm_1 + fully_connected_output_regularized
        final_output = self.layernorm2(residual_2)
        return final_output

In [862]:
class TransformerBlock(nn.Module):
    def __init__(self, num_layers = 12, embedding_dim = 768, num_attention_heads = 12, expansion_factor = 4):
        super().__init__()
        self.layers = nn.ModuleList([
            TransformerLayer(embedding_dim, num_attention_heads, expansion_factor) for _ in range(num_layers)
                       ])
        
    def forward(self, x, padding_mask):
        for l in range(1, num_layers):
            x = self.layers[l](x, padding_mask)
        return x

In [863]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, seq_len = 64, num_token_types = 3, num_layers = 12, embedding_dim = 768, num_attention_heads = 12, expansion_factor = 4):
        super().__init__()
        self.embedding_layer = BertEmbeddingLayer(vocab_size, seq_len, embedding_dim, num_token_types)
        self.transformer_blocks = TransformerBlock(num_layers, embedding_dim, num_attention_heads, expansion_factor)

    def forward(self, inp_tokens, input_token_types):
        seq_len = inp_tokens.size(1)
        padding_mask = (inp_tokens > 0).unsqueeze(1).repeat(1, seq_len, 1).unsqueeze(1)
        embeddings = self.embedding_layer(inp_tokens, input_token_types)
        encoded_representation = self.transformer_blocks(embeddings, padding_mask)
        return encoded_representation


e = Encoder(tokenizer.vocab_size)
op = e(inp['bert_input'], inp['segment_label'])

In [882]:
class NextSentancePrediction(nn.Module):
    def __init__(self, embedding_dim = 768):
        super().__init__()
        self.linear_layer = nn.Linear(embedding_dim, 2)

    def forward(self, encoded_representation, target_labels):
        cls_token_representation = encoded_representation[:,0,:]
        logits = self.linear_layer(cls_token_representation)
        nsp_loss = torch.nn.CrossEntropyLoss()(logits, target_labels)
        return nsp_loss

class MaskedLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim = 768):
        super().__init__()
        self.linear_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, encoded_representation, target_labels):
        bs, seq_len , embedding_dim = encoded_representation.shape
        target_labels_flattened = target_labels.view(bs*seq_len)
        logits = self.linear_layer(encoded_representation)
        logits_flattenend = logits.view((bs*seq_len, -1))
        all_loss = nn.CrossEntropyLoss(ignore_index=0)(logits_flattenend, target_labels_flattened) #reduction='none', 
        #target_labels_flattened[target_labels_flattened > 0] = 1         
        #mlm_loss = torch.dot(target_labels_flattened.float(), all_loss) / torch.sum(target_labels_flattened)
        return all_loss #mlm_loss
        
        

In [883]:
vocab_size = 200
seq_len = 64
num_token_types = 3
num_layers = 12
embedding_dim = 768
num_attention_heads = 8
expansion_factor = 4
vocab_size = tokenizer.vocab_size
batch_size = 16


from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
train_data_iterator  = iter(train_dataloader)

inp = next(train_data_iterator)


class BertModel(nn.Module):
    def __init__(self, vocab_size, seq_len = 64, num_token_types=3, num_layers=12, embedding_dim = 768, num_attention_heads = 12, expansion_factor=4):
        super().__init__()
        self.encoder = Encoder(vocab_size, seq_len, num_token_types, num_layers, embedding_dim, num_attention_heads, expansion_factor)
        self.masked_language_model = MaskedLanguageModel(vocab_size, embedding_dim) 
        self.next_sentance_prediction = NextSentancePrediction(embedding_dim)

    def forward(self, input_tokens, input_token_types, target_mlm_labels, target_nsp_labels):
        encoded_representation = self.encoder(input_tokens, input_token_types)
        nsp_loss = self.next_sentance_prediction(encoded_representation, target_nsp_labels)
        mlm_loss = self.masked_language_model(encoded_representation, target_mlm_labels)
        total_loss = mlm_loss + nsp_loss
        return total_loss
        
loss = BertModel(tokenizer.vocab_size)(inp['bert_input'], inp['segment_label'], inp['bert_label'].clone(), inp['is_next'])
print(loss)

tensor(10.8747, grad_fn=<AddBackward0>)


In [899]:
macs = profile_macs( BertModel(tokenizer.vocab_size), (inp['bert_input'], inp['segment_label'], inp['bert_label'].clone(), inp['is_next']))

NameError: name 'profile_macs' is not defined

In [884]:
class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling'''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        "Step with the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Zero out the gradients by the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        return np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''

        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr

In [887]:
from itertools import cycle
class BERTTrainer:
    def __init__(
        self, 
        model, 
        train_dataloader, 
        test_dataloader=None, 
        lr= 1e-4,
        weight_decay=0.01,
        betas=(0.9, 0.999),
        warmup_steps=10000,
        log_freq=10,
        device='cuda',
        embedding_dim = 768
        ):

        self.device = device
        self.model = model
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(
            self.optim, embedding_dim, n_warmup_steps=warmup_steps
            )

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = torch.nn.NLLLoss(ignore_index=0)
        self.log_freq = log_freq
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
    
    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        
        avg_loss = 0.0
        total_correct = 0
        total_element = 0
        
        mode = "train" if train else "test"

        # progress bar
        data_iter = tqdm.tqdm(
            enumerate(data_loader),
            desc="EP_%s:%d" % (mode, epoch),
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )

        for i, data in data_iter:

            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            loss = self.model(data['bert_input'], data['segment_label'], data['bert_label'].clone(), data['is_next'])

            # 3. backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # next sentence prediction accuracy
           #correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
            #avg_loss += loss.item()
            #total_correct += correct
            #total_element += data["is_next"].nelement()

            post_fix = {
                "epoch": epoch,
                "iter": i,
              #  "avg_loss": avg_loss / (i + 1),
              #  "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))
        print(f"EP{epoch}, {mode}: \
            avg_loss={avg_loss / len(data_iter)}, \
            total_acc={total_correct * 100.0 / total_element}"
        ) 

In [888]:
'''test run'''

train_data = BERTDataset(
   pairs, seq_len=MAX_LEN, tokenizer=tokenizer)

train_loader = DataLoader(
   train_data, batch_size=32, shuffle=True, pin_memory=True)

bert_model =  BertModel(tokenizer.vocab_size)

bert_trainer = BERTTrainer(bert_model, train_loader, device='cpu')
epochs = 20

for epoch in range(epochs):
  bert_trainer.train(epoch)

Total Parameters: 117581226


EP_train:0:   0%|| 1/6926 [00:03<6:24:37,  3.33s/it]

{'epoch': 0, 'iter': 0, 'loss': 10.980493545532227}


EP_train:0:   0%|| 11/6926 [00:30<5:12:06,  2.71s/it]

{'epoch': 0, 'iter': 10, 'loss': 10.817377090454102}


EP_train:0:   0%|| 21/6926 [00:57<5:01:25,  2.62s/it]

{'epoch': 0, 'iter': 20, 'loss': 10.684929847717285}


EP_train:0:   0%|| 31/6926 [01:24<5:25:23,  2.83s/it]

{'epoch': 0, 'iter': 30, 'loss': 10.592150688171387}


EP_train:0:   1%|| 41/6926 [01:52<5:26:45,  2.85s/it]

{'epoch': 0, 'iter': 40, 'loss': 10.368402481079102}


EP_train:0:   1%|| 51/6926 [02:21<5:26:54,  2.85s/it]

{'epoch': 0, 'iter': 50, 'loss': 10.154437065124512}


EP_train:0:   1%|| 61/6926 [02:50<5:26:21,  2.85s/it]

{'epoch': 0, 'iter': 60, 'loss': 10.01576042175293}


EP_train:0:   1%|| 71/6926 [03:19<5:29:36,  2.88s/it]

{'epoch': 0, 'iter': 70, 'loss': 9.73730182647705}


EP_train:0:   1%|| 81/6926 [03:47<5:28:58,  2.88s/it]

{'epoch': 0, 'iter': 80, 'loss': 9.776338577270508}


EP_train:0:   1%|| 91/6926 [04:16<5:28:32,  2.88s/it]

{'epoch': 0, 'iter': 90, 'loss': 9.896708488464355}


EP_train:0:   1%|| 101/6926 [04:45<5:22:11,  2.83s/it]

{'epoch': 0, 'iter': 100, 'loss': 9.029985427856445}


EP_train:0:   2%|| 111/6926 [05:14<5:32:10,  2.92s/it]

{'epoch': 0, 'iter': 110, 'loss': 9.456299781799316}


EP_train:0:   2%|| 121/6926 [05:43<5:19:44,  2.82s/it]

{'epoch': 0, 'iter': 120, 'loss': 9.487717628479004}


EP_train:0:   2%|| 131/6926 [06:11<5:22:16,  2.85s/it]

{'epoch': 0, 'iter': 130, 'loss': 8.932494163513184}


EP_train:0:   2%|| 141/6926 [06:40<5:24:02,  2.87s/it]

{'epoch': 0, 'iter': 140, 'loss': 9.09089469909668}


EP_train:0:   2%|| 151/6926 [07:09<5:31:42,  2.94s/it]

{'epoch': 0, 'iter': 150, 'loss': 8.567710876464844}


EP_train:0:   2%|| 161/6926 [07:38<5:26:58,  2.90s/it]

{'epoch': 0, 'iter': 160, 'loss': 8.72232437133789}


EP_train:0:   2%|| 171/6926 [08:08<5:29:33,  2.93s/it]

{'epoch': 0, 'iter': 170, 'loss': 9.097349166870117}


EP_train:0:   3%|| 181/6926 [08:37<5:22:26,  2.87s/it]

{'epoch': 0, 'iter': 180, 'loss': 9.052787780761719}


EP_train:0:   3%|| 191/6926 [09:05<5:20:08,  2.85s/it]

{'epoch': 0, 'iter': 190, 'loss': 9.030463218688965}


EP_train:0:   3%|| 201/6926 [09:34<5:14:03,  2.80s/it]

{'epoch': 0, 'iter': 200, 'loss': 8.959490776062012}


EP_train:0:   3%|| 211/6926 [10:02<5:20:12,  2.86s/it]

{'epoch': 0, 'iter': 210, 'loss': 8.217765808105469}


EP_train:0:   3%|| 221/6926 [10:31<5:13:07,  2.80s/it]

{'epoch': 0, 'iter': 220, 'loss': 8.1987886428833}


EP_train:0:   3%|| 231/6926 [10:59<5:20:01,  2.87s/it]

{'epoch': 0, 'iter': 230, 'loss': 8.425214767456055}


EP_train:0:   3%|| 241/6926 [11:27<5:14:19,  2.82s/it]

{'epoch': 0, 'iter': 240, 'loss': 8.595732688903809}


EP_train:0:   4%|| 251/6926 [11:55<5:12:52,  2.81s/it]

{'epoch': 0, 'iter': 250, 'loss': 8.49946403503418}


EP_train:0:   4%|| 261/6926 [12:23<5:14:58,  2.84s/it]

{'epoch': 0, 'iter': 260, 'loss': 8.666421890258789}


EP_train:0:   4%|| 271/6926 [12:52<5:23:20,  2.92s/it]

{'epoch': 0, 'iter': 270, 'loss': 8.396100044250488}


EP_train:0:   4%|| 281/6926 [13:21<5:15:44,  2.85s/it]

{'epoch': 0, 'iter': 280, 'loss': 7.685707092285156}


EP_train:0:   4%|| 291/6926 [13:49<5:17:24,  2.87s/it]

{'epoch': 0, 'iter': 290, 'loss': 8.00890064239502}


EP_train:0:   4%|| 301/6926 [14:18<5:16:53,  2.87s/it]

{'epoch': 0, 'iter': 300, 'loss': 7.827650547027588}


EP_train:0:   4%|| 311/6926 [14:46<5:14:54,  2.86s/it]

{'epoch': 0, 'iter': 310, 'loss': 7.8784050941467285}


EP_train:0:   5%|| 321/6926 [15:15<5:10:45,  2.82s/it]

{'epoch': 0, 'iter': 320, 'loss': 7.988260269165039}


EP_train:0:   5%|| 331/6926 [15:43<5:10:43,  2.83s/it]

{'epoch': 0, 'iter': 330, 'loss': 8.360150337219238}


EP_train:0:   5%|| 341/6926 [16:11<5:09:33,  2.82s/it]

{'epoch': 0, 'iter': 340, 'loss': 8.207344055175781}


EP_train:0:   5%|| 351/6926 [16:40<5:11:24,  2.84s/it]

{'epoch': 0, 'iter': 350, 'loss': 7.680410385131836}


EP_train:0:   5%|| 361/6926 [17:08<5:09:22,  2.83s/it]

{'epoch': 0, 'iter': 360, 'loss': 7.701846599578857}


EP_train:0:   5%|| 371/6926 [17:36<5:05:54,  2.80s/it]

{'epoch': 0, 'iter': 370, 'loss': 7.728377819061279}


EP_train:0:   6%|| 381/6926 [18:06<5:11:37,  2.86s/it]

{'epoch': 0, 'iter': 380, 'loss': 7.9051337242126465}


EP_train:0:   6%|| 391/6926 [18:33<5:02:41,  2.78s/it]

{'epoch': 0, 'iter': 390, 'loss': 8.045838356018066}


EP_train:0:   6%|| 401/6926 [19:02<5:07:30,  2.83s/it]

{'epoch': 0, 'iter': 400, 'loss': 7.804863929748535}


EP_train:0:   6%|| 411/6926 [19:30<5:04:02,  2.80s/it]

{'epoch': 0, 'iter': 410, 'loss': 7.756747722625732}


EP_train:0:   6%|| 421/6926 [19:58<5:12:43,  2.88s/it]

{'epoch': 0, 'iter': 420, 'loss': 7.805475234985352}


EP_train:0:   6%|| 431/6926 [20:26<5:05:50,  2.83s/it]

{'epoch': 0, 'iter': 430, 'loss': 7.678016662597656}


EP_train:0:   6%|| 441/6926 [20:56<5:15:19,  2.92s/it]

{'epoch': 0, 'iter': 440, 'loss': 7.813820838928223}


EP_train:0:   7%|| 451/6926 [21:24<5:06:14,  2.84s/it]

{'epoch': 0, 'iter': 450, 'loss': 7.274168491363525}


EP_train:0:   7%|| 461/6926 [21:53<5:12:53,  2.90s/it]

{'epoch': 0, 'iter': 460, 'loss': 7.900283336639404}


EP_train:0:   7%|| 471/6926 [22:22<5:01:11,  2.80s/it]

{'epoch': 0, 'iter': 470, 'loss': 7.404370307922363}


EP_train:0:   7%|| 481/6926 [22:50<5:05:16,  2.84s/it]

{'epoch': 0, 'iter': 480, 'loss': 7.634117603302002}


EP_train:0:   7%|| 491/6926 [23:19<5:22:08,  3.00s/it]

{'epoch': 0, 'iter': 490, 'loss': 7.511850833892822}


EP_train:0:   7%|| 501/6926 [23:48<5:22:37,  3.01s/it]

{'epoch': 0, 'iter': 500, 'loss': 7.398242473602295}


EP_train:0:   7%|| 511/6926 [24:17<5:05:38,  2.86s/it]

{'epoch': 0, 'iter': 510, 'loss': 7.413008213043213}


EP_train:0:   8%|| 521/6926 [24:45<5:00:06,  2.81s/it]

{'epoch': 0, 'iter': 520, 'loss': 7.3771467208862305}


EP_train:0:   8%|| 531/6926 [25:14<4:59:57,  2.81s/it]

{'epoch': 0, 'iter': 530, 'loss': 7.497809410095215}


EP_train:0:   8%|| 541/6926 [25:41<4:56:29,  2.79s/it]

{'epoch': 0, 'iter': 540, 'loss': 7.1005401611328125}


EP_train:0:   8%|| 551/6926 [26:10<5:04:04,  2.86s/it]

{'epoch': 0, 'iter': 550, 'loss': 7.296873092651367}


EP_train:0:   8%|| 561/6926 [26:38<5:09:02,  2.91s/it]

{'epoch': 0, 'iter': 560, 'loss': 7.4314069747924805}


EP_train:0:   8%|| 571/6926 [27:06<5:01:31,  2.85s/it]

{'epoch': 0, 'iter': 570, 'loss': 7.780167102813721}


EP_train:0:   8%|| 581/6926 [27:35<4:57:14,  2.81s/it]

{'epoch': 0, 'iter': 580, 'loss': 7.273318290710449}


EP_train:0:   9%|| 591/6926 [28:04<5:15:19,  2.99s/it]

{'epoch': 0, 'iter': 590, 'loss': 7.442378997802734}


EP_train:0:   9%|| 601/6926 [28:32<4:55:32,  2.80s/it]

{'epoch': 0, 'iter': 600, 'loss': 7.357780933380127}


EP_train:0:   9%|| 611/6926 [29:02<5:32:33,  3.16s/it]

{'epoch': 0, 'iter': 610, 'loss': 7.144816875457764}


EP_train:0:   9%|| 621/6926 [29:28<4:34:24,  2.61s/it]

{'epoch': 0, 'iter': 620, 'loss': 7.337190628051758}


EP_train:0:   9%|| 631/6926 [29:54<4:36:32,  2.64s/it]

{'epoch': 0, 'iter': 630, 'loss': 7.205127239227295}


EP_train:0:   9%|| 641/6926 [30:20<4:36:39,  2.64s/it]

{'epoch': 0, 'iter': 640, 'loss': 7.419194221496582}


EP_train:0:   9%|| 651/6926 [30:45<4:25:00,  2.53s/it]

{'epoch': 0, 'iter': 650, 'loss': 7.352833271026611}


EP_train:0:  10%|| 661/6926 [31:11<4:30:08,  2.59s/it]

{'epoch': 0, 'iter': 660, 'loss': 7.170953273773193}


EP_train:0:  10%|| 671/6926 [31:38<4:37:13,  2.66s/it]

{'epoch': 0, 'iter': 670, 'loss': 6.945201873779297}


EP_train:0:  10%|| 681/6926 [32:04<4:30:28,  2.60s/it]

{'epoch': 0, 'iter': 680, 'loss': 7.025094985961914}


EP_train:0:  10%|| 691/6926 [32:29<4:22:24,  2.53s/it]

{'epoch': 0, 'iter': 690, 'loss': 7.467131614685059}


EP_train:0:  10%|| 701/6926 [32:55<4:24:58,  2.55s/it]

{'epoch': 0, 'iter': 700, 'loss': 6.85335111618042}


EP_train:0:  10%|| 711/6926 [33:20<4:25:40,  2.56s/it]

{'epoch': 0, 'iter': 710, 'loss': 6.850571632385254}


EP_train:0:  10%|| 721/6926 [33:46<4:22:14,  2.54s/it]

{'epoch': 0, 'iter': 720, 'loss': 6.895808696746826}


EP_train:0:  11%|| 731/6926 [34:11<4:21:02,  2.53s/it]

{'epoch': 0, 'iter': 730, 'loss': 7.2144389152526855}


EP_train:0:  11%|| 741/6926 [34:36<4:21:32,  2.54s/it]

{'epoch': 0, 'iter': 740, 'loss': 6.788376331329346}


EP_train:0:  11%|| 751/6926 [35:01<4:25:47,  2.58s/it]

{'epoch': 0, 'iter': 750, 'loss': 7.098903656005859}


EP_train:0:  11%|| 761/6926 [35:27<4:19:23,  2.52s/it]

{'epoch': 0, 'iter': 760, 'loss': 7.268545627593994}


EP_train:0:  11%|| 771/6926 [35:52<4:22:47,  2.56s/it]

{'epoch': 0, 'iter': 770, 'loss': 6.933078765869141}


EP_train:0:  11%|| 781/6926 [36:19<4:22:03,  2.56s/it]

{'epoch': 0, 'iter': 780, 'loss': 7.08230447769165}


EP_train:0:  11%|| 791/6926 [36:44<4:16:46,  2.51s/it]

{'epoch': 0, 'iter': 790, 'loss': 6.526589870452881}


EP_train:0:  12%|| 801/6926 [37:10<4:26:50,  2.61s/it]

{'epoch': 0, 'iter': 800, 'loss': 6.953088283538818}


EP_train:0:  12%|| 811/6926 [37:35<4:19:48,  2.55s/it]

{'epoch': 0, 'iter': 810, 'loss': 6.583212852478027}


EP_train:0:  12%|| 821/6926 [38:01<4:24:29,  2.60s/it]

{'epoch': 0, 'iter': 820, 'loss': 6.842612266540527}


EP_train:0:  12%|| 831/6926 [38:27<4:20:01,  2.56s/it]

{'epoch': 0, 'iter': 830, 'loss': 7.197347640991211}


EP_train:0:  12%|| 841/6926 [38:52<4:21:47,  2.58s/it]

{'epoch': 0, 'iter': 840, 'loss': 7.280065536499023}


EP_train:0:  12%|| 851/6926 [39:19<4:24:10,  2.61s/it]

{'epoch': 0, 'iter': 850, 'loss': 6.664368629455566}


EP_train:0:  12%|| 861/6926 [39:44<4:20:17,  2.57s/it]

{'epoch': 0, 'iter': 860, 'loss': 6.546392440795898}


EP_train:0:  13%|| 871/6926 [40:10<4:20:30,  2.58s/it]

{'epoch': 0, 'iter': 870, 'loss': 6.891797065734863}


EP_train:0:  13%|| 881/6926 [40:35<4:15:46,  2.54s/it]

{'epoch': 0, 'iter': 880, 'loss': 6.8766703605651855}


EP_train:0:  13%|| 891/6926 [41:01<4:21:29,  2.60s/it]

{'epoch': 0, 'iter': 890, 'loss': 6.92625617980957}


EP_train:0:  13%|| 901/6926 [41:27<4:16:07,  2.55s/it]

{'epoch': 0, 'iter': 900, 'loss': 6.715897083282471}


EP_train:0:  13%|| 911/6926 [41:52<4:18:13,  2.58s/it]

{'epoch': 0, 'iter': 910, 'loss': 7.081655502319336}


EP_train:0:  13%|| 921/6926 [42:18<4:14:13,  2.54s/it]

{'epoch': 0, 'iter': 920, 'loss': 6.758141040802002}


EP_train:0:  13%|| 931/6926 [42:44<4:18:40,  2.59s/it]

{'epoch': 0, 'iter': 930, 'loss': 6.787111759185791}


EP_train:0:  14%|| 941/6926 [43:10<4:16:03,  2.57s/it]

{'epoch': 0, 'iter': 940, 'loss': 6.588351726531982}


EP_train:0:  14%|| 951/6926 [43:35<4:11:37,  2.53s/it]

{'epoch': 0, 'iter': 950, 'loss': 6.7983808517456055}


EP_train:0:  14%|| 961/6926 [44:01<4:17:48,  2.59s/it]

{'epoch': 0, 'iter': 960, 'loss': 6.883325576782227}


EP_train:0:  14%|| 971/6926 [44:26<4:10:18,  2.52s/it]

{'epoch': 0, 'iter': 970, 'loss': 6.690804958343506}


EP_train:0:  14%|| 981/6926 [44:52<4:10:19,  2.53s/it]

{'epoch': 0, 'iter': 980, 'loss': 7.590610980987549}


EP_train:0:  14%|| 991/6926 [45:17<4:12:51,  2.56s/it]

{'epoch': 0, 'iter': 990, 'loss': 6.785765647888184}


EP_train:0:  14%|| 1001/6926 [45:43<4:12:55,  2.56s/it]

{'epoch': 0, 'iter': 1000, 'loss': 6.35586404800415}


EP_train:0:  15%|| 1011/6926 [46:08<4:09:52,  2.53s/it]

{'epoch': 0, 'iter': 1010, 'loss': 7.258150100708008}


EP_train:0:  15%|| 1021/6926 [46:34<4:10:24,  2.54s/it]

{'epoch': 0, 'iter': 1020, 'loss': 6.5650482177734375}


EP_train:0:  15%|| 1031/6926 [46:59<4:11:07,  2.56s/it]

{'epoch': 0, 'iter': 1030, 'loss': 6.647977828979492}


EP_train:0:  15%|| 1041/6926 [47:25<4:07:26,  2.52s/it]

{'epoch': 0, 'iter': 1040, 'loss': 6.508030414581299}


EP_train:0:  15%|| 1051/6926 [47:50<4:09:30,  2.55s/it]

{'epoch': 0, 'iter': 1050, 'loss': 6.63740873336792}


EP_train:0:  15%|| 1061/6926 [48:16<4:09:22,  2.55s/it]

{'epoch': 0, 'iter': 1060, 'loss': 6.542643070220947}


EP_train:0:  15%|| 1071/6926 [48:42<4:12:25,  2.59s/it]

{'epoch': 0, 'iter': 1070, 'loss': 6.640796661376953}


EP_train:0:  16%|| 1081/6926 [49:08<4:12:03,  2.59s/it]

{'epoch': 0, 'iter': 1080, 'loss': 6.852349281311035}


EP_train:0:  16%|| 1091/6926 [49:34<4:13:32,  2.61s/it]

{'epoch': 0, 'iter': 1090, 'loss': 6.909759998321533}


EP_train:0:  16%|| 1101/6926 [50:00<4:15:50,  2.64s/it]

{'epoch': 0, 'iter': 1100, 'loss': 6.436010837554932}


EP_train:0:  16%|| 1111/6926 [50:26<4:08:19,  2.56s/it]

{'epoch': 0, 'iter': 1110, 'loss': 6.221969127655029}


EP_train:0:  16%|| 1121/6926 [50:51<4:07:25,  2.56s/it]

{'epoch': 0, 'iter': 1120, 'loss': 6.578386306762695}


EP_train:0:  16%|| 1131/6926 [51:17<4:07:05,  2.56s/it]

{'epoch': 0, 'iter': 1130, 'loss': 7.1309919357299805}


EP_train:0:  16%|| 1141/6926 [51:43<4:10:27,  2.60s/it]

{'epoch': 0, 'iter': 1140, 'loss': 6.876181602478027}


EP_train:0:  17%|| 1151/6926 [52:09<4:19:45,  2.70s/it]

{'epoch': 0, 'iter': 1150, 'loss': 6.885873794555664}


EP_train:0:  17%|| 1161/6926 [52:36<4:11:23,  2.62s/it]

{'epoch': 0, 'iter': 1160, 'loss': 6.148808479309082}


EP_train:0:  17%|| 1171/6926 [53:02<4:14:36,  2.65s/it]

{'epoch': 0, 'iter': 1170, 'loss': 6.888280391693115}


EP_train:0:  17%|| 1181/6926 [53:28<4:09:22,  2.60s/it]

{'epoch': 0, 'iter': 1180, 'loss': 6.82539176940918}


EP_train:0:  17%|| 1191/6926 [53:54<4:07:47,  2.59s/it]

{'epoch': 0, 'iter': 1190, 'loss': 6.892240524291992}


EP_train:0:  17%|| 1201/6926 [54:20<4:02:15,  2.54s/it]

{'epoch': 0, 'iter': 1200, 'loss': 7.073554992675781}


EP_train:0:  17%|| 1211/6926 [54:45<4:01:32,  2.54s/it]

{'epoch': 0, 'iter': 1210, 'loss': 6.5708465576171875}


EP_train:0:  18%|| 1221/6926 [55:11<4:06:31,  2.59s/it]

{'epoch': 0, 'iter': 1220, 'loss': 7.080777168273926}


EP_train:0:  18%|| 1231/6926 [55:38<4:13:25,  2.67s/it]

{'epoch': 0, 'iter': 1230, 'loss': 6.93747091293335}


EP_train:0:  18%|| 1241/6926 [56:04<4:04:51,  2.58s/it]

{'epoch': 0, 'iter': 1240, 'loss': 6.9177775382995605}


EP_train:0:  18%|| 1251/6926 [56:30<4:06:21,  2.60s/it]

{'epoch': 0, 'iter': 1250, 'loss': 6.729647159576416}


EP_train:0:  18%|| 1261/6926 [56:55<4:03:50,  2.58s/it]

{'epoch': 0, 'iter': 1260, 'loss': 6.750103950500488}


EP_train:0:  18%|| 1271/6926 [57:22<4:05:34,  2.61s/it]

{'epoch': 0, 'iter': 1270, 'loss': 7.221626281738281}


EP_train:0:  18%|| 1281/6926 [57:48<4:10:56,  2.67s/it]

{'epoch': 0, 'iter': 1280, 'loss': 6.52549409866333}


EP_train:0:  19%|| 1291/6926 [58:15<4:05:49,  2.62s/it]

{'epoch': 0, 'iter': 1290, 'loss': 6.465312480926514}


EP_train:0:  19%|| 1301/6926 [58:41<4:06:42,  2.63s/it]

{'epoch': 0, 'iter': 1300, 'loss': 6.459814548492432}


EP_train:0:  19%|| 1311/6926 [59:07<4:08:34,  2.66s/it]

{'epoch': 0, 'iter': 1310, 'loss': 6.609257698059082}


EP_train:0:  19%|| 1321/6926 [59:33<4:05:47,  2.63s/it]

{'epoch': 0, 'iter': 1320, 'loss': 6.575453281402588}


EP_train:0:  19%|| 1331/6926 [59:59<4:05:27,  2.63s/it]

{'epoch': 0, 'iter': 1330, 'loss': 6.914897441864014}


EP_train:0:  19%|| 1341/6926 [1:00:36<6:03:24,  3.90s/it]

{'epoch': 0, 'iter': 1340, 'loss': 6.6813764572143555}


EP_train:0:  20%|| 1351/6926 [1:01:17<6:22:26,  4.12s/it]

{'epoch': 0, 'iter': 1350, 'loss': 6.240934371948242}


EP_train:0:  20%|| 1361/6926 [1:02:00<6:33:31,  4.24s/it]

{'epoch': 0, 'iter': 1360, 'loss': 6.725067138671875}


EP_train:0:  20%|| 1371/6926 [1:02:42<6:26:40,  4.18s/it]

{'epoch': 0, 'iter': 1370, 'loss': 6.918652534484863}


EP_train:0:  20%|| 1381/6926 [1:03:24<6:22:43,  4.14s/it]

{'epoch': 0, 'iter': 1380, 'loss': 6.899061679840088}


EP_train:0:  20%|| 1391/6926 [1:04:06<6:29:15,  4.22s/it]

{'epoch': 0, 'iter': 1390, 'loss': 6.415584087371826}


EP_train:0:  20%|| 1401/6926 [1:04:48<6:28:22,  4.22s/it]

{'epoch': 0, 'iter': 1400, 'loss': 7.158693790435791}


EP_train:0:  20%|| 1411/6926 [1:05:31<6:34:03,  4.29s/it]

{'epoch': 0, 'iter': 1410, 'loss': 6.698447227478027}


EP_train:0:  21%|| 1421/6926 [1:06:13<6:28:44,  4.24s/it]

{'epoch': 0, 'iter': 1420, 'loss': 6.199317932128906}


EP_train:0:  21%|| 1431/6926 [1:06:55<6:21:28,  4.17s/it]

{'epoch': 0, 'iter': 1430, 'loss': 6.702169418334961}


EP_train:0:  21%|| 1441/6926 [1:07:37<6:20:00,  4.16s/it]

{'epoch': 0, 'iter': 1440, 'loss': 6.3121771812438965}


EP_train:0:  21%|| 1451/6926 [1:08:17<6:04:58,  4.00s/it]

{'epoch': 0, 'iter': 1450, 'loss': 6.546307563781738}


EP_train:0:  21%|| 1461/6926 [1:08:58<6:08:26,  4.05s/it]

{'epoch': 0, 'iter': 1460, 'loss': 6.56934118270874}


EP_train:0:  21%|| 1471/6926 [1:09:38<6:06:59,  4.04s/it]

{'epoch': 0, 'iter': 1470, 'loss': 6.462996959686279}


EP_train:0:  21%|| 1481/6926 [1:10:19<6:09:11,  4.07s/it]

{'epoch': 0, 'iter': 1480, 'loss': 6.394591331481934}


EP_train:0:  22%|| 1491/6926 [1:11:00<6:12:33,  4.11s/it]

{'epoch': 0, 'iter': 1490, 'loss': 6.351978302001953}


EP_train:0:  22%|| 1501/6926 [1:11:41<6:10:52,  4.10s/it]

{'epoch': 0, 'iter': 1500, 'loss': 6.5469584465026855}


EP_train:0:  22%|| 1511/6926 [1:12:22<6:08:56,  4.09s/it]

{'epoch': 0, 'iter': 1510, 'loss': 6.572189807891846}


EP_train:0:  22%|| 1521/6926 [1:13:02<6:08:15,  4.09s/it]

{'epoch': 0, 'iter': 1520, 'loss': 6.57815408706665}


EP_train:0:  22%|| 1531/6926 [1:13:43<6:06:23,  4.07s/it]

{'epoch': 0, 'iter': 1530, 'loss': 6.90241813659668}


EP_train:0:  22%|| 1541/6926 [1:14:24<6:06:03,  4.08s/it]

{'epoch': 0, 'iter': 1540, 'loss': 6.481794834136963}


EP_train:0:  22%|| 1551/6926 [1:15:05<6:09:39,  4.13s/it]

{'epoch': 0, 'iter': 1550, 'loss': 7.228412628173828}


EP_train:0:  23%|| 1561/6926 [1:15:46<6:07:20,  4.11s/it]

{'epoch': 0, 'iter': 1560, 'loss': 6.8024702072143555}


EP_train:0:  23%|| 1571/6926 [1:16:27<6:06:30,  4.11s/it]

{'epoch': 0, 'iter': 1570, 'loss': 6.509736061096191}


EP_train:0:  23%|| 1581/6926 [1:17:08<6:07:29,  4.13s/it]

{'epoch': 0, 'iter': 1580, 'loss': 6.325521469116211}


EP_train:0:  23%|| 1591/6926 [1:17:49<6:04:35,  4.10s/it]

{'epoch': 0, 'iter': 1590, 'loss': 6.485474109649658}


EP_train:0:  23%|| 1601/6926 [1:18:30<6:02:18,  4.08s/it]

{'epoch': 0, 'iter': 1600, 'loss': 6.744209289550781}


EP_train:0:  23%|| 1611/6926 [1:19:10<6:03:37,  4.10s/it]

{'epoch': 0, 'iter': 1610, 'loss': 6.591963768005371}


EP_train:0:  23%|| 1621/6926 [1:19:51<5:58:14,  4.05s/it]

{'epoch': 0, 'iter': 1620, 'loss': 6.907036781311035}


EP_train:0:  24%|| 1631/6926 [1:20:32<5:59:55,  4.08s/it]

{'epoch': 0, 'iter': 1630, 'loss': 6.2539873123168945}


EP_train:0:  24%|| 1641/6926 [1:21:13<6:02:58,  4.12s/it]

{'epoch': 0, 'iter': 1640, 'loss': 6.5845746994018555}


EP_train:0:  24%|| 1651/6926 [1:21:54<5:58:32,  4.08s/it]

{'epoch': 0, 'iter': 1650, 'loss': 6.704841136932373}


EP_train:0:  24%|| 1661/6926 [1:22:35<5:59:34,  4.10s/it]

{'epoch': 0, 'iter': 1660, 'loss': 6.810030460357666}


EP_train:0:  24%|| 1671/6926 [1:23:16<5:57:03,  4.08s/it]

{'epoch': 0, 'iter': 1670, 'loss': 6.572900295257568}


EP_train:0:  24%|| 1681/6926 [1:23:58<6:06:18,  4.19s/it]

{'epoch': 0, 'iter': 1680, 'loss': 6.961022853851318}


EP_train:0:  24%|| 1691/6926 [1:24:38<5:57:41,  4.10s/it]

{'epoch': 0, 'iter': 1690, 'loss': 6.428500175476074}


EP_train:0:  25%|| 1701/6926 [1:25:19<5:48:51,  4.01s/it]

{'epoch': 0, 'iter': 1700, 'loss': 6.737051486968994}


EP_train:0:  25%|| 1711/6926 [1:26:00<5:55:46,  4.09s/it]

{'epoch': 0, 'iter': 1710, 'loss': 6.453733444213867}


EP_train:0:  25%|| 1721/6926 [1:26:41<5:55:02,  4.09s/it]

{'epoch': 0, 'iter': 1720, 'loss': 6.2866339683532715}


EP_train:0:  25%|| 1731/6926 [1:27:22<6:01:09,  4.17s/it]

{'epoch': 0, 'iter': 1730, 'loss': 6.787047386169434}


EP_train:0:  25%|| 1741/6926 [1:28:03<5:54:33,  4.10s/it]

{'epoch': 0, 'iter': 1740, 'loss': 6.7160563468933105}


EP_train:0:  25%|| 1751/6926 [1:28:45<5:51:57,  4.08s/it]

{'epoch': 0, 'iter': 1750, 'loss': 6.475584983825684}


EP_train:0:  25%|| 1761/6926 [1:29:25<5:54:15,  4.12s/it]

{'epoch': 0, 'iter': 1760, 'loss': 6.620995044708252}


EP_train:0:  26%|| 1771/6926 [1:30:07<5:52:12,  4.10s/it]

{'epoch': 0, 'iter': 1770, 'loss': 6.553742408752441}


EP_train:0:  26%|| 1781/6926 [1:30:48<5:55:02,  4.14s/it]

{'epoch': 0, 'iter': 1780, 'loss': 6.320147514343262}


EP_train:0:  26%|| 1791/6926 [1:31:29<5:52:20,  4.12s/it]

{'epoch': 0, 'iter': 1790, 'loss': 6.337746620178223}


EP_train:0:  26%|| 1801/6926 [1:32:11<5:51:39,  4.12s/it]

{'epoch': 0, 'iter': 1800, 'loss': 6.970925807952881}


EP_train:0:  26%|| 1811/6926 [1:32:52<5:51:40,  4.13s/it]

{'epoch': 0, 'iter': 1810, 'loss': 6.500648021697998}


EP_train:0:  26%|| 1821/6926 [1:33:34<5:51:35,  4.13s/it]

{'epoch': 0, 'iter': 1820, 'loss': 6.8618483543396}


EP_train:0:  26%|| 1831/6926 [1:34:15<5:45:32,  4.07s/it]

{'epoch': 0, 'iter': 1830, 'loss': 6.592775821685791}


EP_train:0:  27%|| 1841/6926 [1:34:57<5:53:47,  4.17s/it]

{'epoch': 0, 'iter': 1840, 'loss': 6.5312089920043945}


EP_train:0:  27%|| 1851/6926 [1:35:38<5:48:45,  4.12s/it]

{'epoch': 0, 'iter': 1850, 'loss': 7.18349027633667}


EP_train:0:  27%|| 1861/6926 [1:36:19<5:40:55,  4.04s/it]

{'epoch': 0, 'iter': 1860, 'loss': 6.716352939605713}


EP_train:0:  27%|| 1871/6926 [1:37:00<5:33:01,  3.95s/it]

{'epoch': 0, 'iter': 1870, 'loss': 6.668985843658447}


EP_train:0:  27%|| 1881/6926 [1:37:42<5:56:56,  4.25s/it]

{'epoch': 0, 'iter': 1880, 'loss': 6.053119659423828}


EP_train:0:  27%|| 1891/6926 [1:38:24<5:57:19,  4.26s/it]

{'epoch': 0, 'iter': 1890, 'loss': 6.923961639404297}


EP_train:0:  27%|| 1901/6926 [1:39:07<5:57:01,  4.26s/it]

{'epoch': 0, 'iter': 1900, 'loss': 6.641719341278076}


EP_train:0:  28%|| 1911/6926 [1:39:49<5:50:24,  4.19s/it]

{'epoch': 0, 'iter': 1910, 'loss': 6.75493860244751}


EP_train:0:  28%|| 1921/6926 [1:40:31<5:51:37,  4.22s/it]

{'epoch': 0, 'iter': 1920, 'loss': 6.277565956115723}


EP_train:0:  28%|| 1931/6926 [1:41:13<5:55:11,  4.27s/it]

{'epoch': 0, 'iter': 1930, 'loss': 6.3773088455200195}


EP_train:0:  28%|| 1941/6926 [1:41:56<5:57:59,  4.31s/it]

{'epoch': 0, 'iter': 1940, 'loss': 6.310199737548828}


EP_train:0:  28%|| 1951/6926 [1:42:38<5:50:41,  4.23s/it]

{'epoch': 0, 'iter': 1950, 'loss': 6.410434722900391}


EP_train:0:  28%|| 1961/6926 [1:43:20<5:48:06,  4.21s/it]

{'epoch': 0, 'iter': 1960, 'loss': 6.4456610679626465}


EP_train:0:  28%|| 1971/6926 [1:44:03<5:53:54,  4.29s/it]

{'epoch': 0, 'iter': 1970, 'loss': 6.592911243438721}


EP_train:0:  29%|| 1981/6926 [1:44:45<5:52:12,  4.27s/it]

{'epoch': 0, 'iter': 1980, 'loss': 6.866596698760986}


EP_train:0:  29%|| 1991/6926 [1:45:27<5:39:15,  4.12s/it]

{'epoch': 0, 'iter': 1990, 'loss': 7.001783847808838}


EP_train:0:  29%|| 2001/6926 [1:46:09<5:49:40,  4.26s/it]

{'epoch': 0, 'iter': 2000, 'loss': 7.071491718292236}


EP_train:0:  29%|| 2011/6926 [1:46:52<5:55:24,  4.34s/it]

{'epoch': 0, 'iter': 2010, 'loss': 6.530882358551025}


EP_train:0:  29%|| 2021/6926 [1:47:34<5:37:05,  4.12s/it]

{'epoch': 0, 'iter': 2020, 'loss': 6.354452610015869}


EP_train:0:  29%|| 2031/6926 [1:48:16<5:48:29,  4.27s/it]

{'epoch': 0, 'iter': 2030, 'loss': 6.628175258636475}


EP_train:0:  29%|| 2041/6926 [1:48:59<5:45:33,  4.24s/it]

{'epoch': 0, 'iter': 2040, 'loss': 6.646540641784668}


EP_train:0:  30%|| 2051/6926 [1:49:40<5:37:18,  4.15s/it]

{'epoch': 0, 'iter': 2050, 'loss': 6.595117092132568}


EP_train:0:  30%|| 2061/6926 [1:50:23<5:45:13,  4.26s/it]

{'epoch': 0, 'iter': 2060, 'loss': 6.740567207336426}


EP_train:0:  30%|| 2071/6926 [1:51:05<5:45:02,  4.26s/it]

{'epoch': 0, 'iter': 2070, 'loss': 6.535266876220703}


EP_train:0:  30%|| 2081/6926 [1:51:47<5:38:07,  4.19s/it]

{'epoch': 0, 'iter': 2080, 'loss': 6.217376232147217}


EP_train:0:  30%|| 2091/6926 [1:52:30<5:42:05,  4.25s/it]

{'epoch': 0, 'iter': 2090, 'loss': 6.1768317222595215}


EP_train:0:  30%|| 2101/6926 [1:53:12<5:45:01,  4.29s/it]

{'epoch': 0, 'iter': 2100, 'loss': 6.72865629196167}


EP_train:0:  30%|| 2111/6926 [1:53:55<5:38:50,  4.22s/it]

{'epoch': 0, 'iter': 2110, 'loss': 6.531556606292725}


EP_train:0:  31%|| 2121/6926 [1:54:38<5:42:22,  4.28s/it]

{'epoch': 0, 'iter': 2120, 'loss': 6.645843982696533}


EP_train:0:  31%|| 2131/6926 [1:55:20<5:44:34,  4.31s/it]

{'epoch': 0, 'iter': 2130, 'loss': 6.875978469848633}


EP_train:0:  31%|| 2141/6926 [1:56:02<5:40:43,  4.27s/it]

{'epoch': 0, 'iter': 2140, 'loss': 6.920370101928711}


EP_train:0:  31%|| 2151/6926 [1:56:45<5:39:40,  4.27s/it]

{'epoch': 0, 'iter': 2150, 'loss': 6.754677772521973}


EP_train:0:  31%|| 2161/6926 [1:57:27<5:37:51,  4.25s/it]

{'epoch': 0, 'iter': 2160, 'loss': 6.733028888702393}


EP_train:0:  31%|| 2171/6926 [1:58:10<5:36:44,  4.25s/it]

{'epoch': 0, 'iter': 2170, 'loss': 6.848942279815674}


EP_train:0:  31%|| 2181/6926 [1:58:53<5:33:34,  4.22s/it]

{'epoch': 0, 'iter': 2180, 'loss': 7.042013168334961}


EP_train:0:  32%|| 2191/6926 [1:59:35<5:25:10,  4.12s/it]

{'epoch': 0, 'iter': 2190, 'loss': 6.759464263916016}


EP_train:0:  32%|| 2201/6926 [2:00:16<5:29:41,  4.19s/it]

{'epoch': 0, 'iter': 2200, 'loss': 6.80509090423584}


EP_train:0:  32%|| 2211/6926 [2:00:58<5:31:04,  4.21s/it]

{'epoch': 0, 'iter': 2210, 'loss': 6.434140205383301}


EP_train:0:  32%|| 2221/6926 [2:01:41<5:31:47,  4.23s/it]

{'epoch': 0, 'iter': 2220, 'loss': 6.757564544677734}


EP_train:0:  32%|| 2231/6926 [2:02:23<5:25:56,  4.17s/it]

{'epoch': 0, 'iter': 2230, 'loss': 6.582906723022461}


EP_train:0:  32%|| 2241/6926 [2:03:05<5:19:51,  4.10s/it]

{'epoch': 0, 'iter': 2240, 'loss': 6.611074447631836}


EP_train:0:  33%|| 2251/6926 [2:03:47<5:26:36,  4.19s/it]

{'epoch': 0, 'iter': 2250, 'loss': 6.351900577545166}


EP_train:0:  33%|| 2261/6926 [2:04:29<5:26:01,  4.19s/it]

{'epoch': 0, 'iter': 2260, 'loss': 6.371326446533203}


EP_train:0:  33%|| 2271/6926 [2:05:11<5:28:03,  4.23s/it]

{'epoch': 0, 'iter': 2270, 'loss': 7.0168375968933105}


EP_train:0:  33%|| 2281/6926 [2:05:52<5:22:32,  4.17s/it]

{'epoch': 0, 'iter': 2280, 'loss': 6.7299933433532715}


EP_train:0:  33%|| 2291/6926 [2:06:35<5:24:36,  4.20s/it]

{'epoch': 0, 'iter': 2290, 'loss': 6.3592729568481445}


EP_train:0:  33%|| 2301/6926 [2:07:16<5:22:58,  4.19s/it]

{'epoch': 0, 'iter': 2300, 'loss': 6.472850322723389}


EP_train:0:  33%|| 2311/6926 [2:07:58<5:16:24,  4.11s/it]

{'epoch': 0, 'iter': 2310, 'loss': 6.8321380615234375}


EP_train:0:  34%|| 2321/6926 [2:08:39<5:20:25,  4.17s/it]

{'epoch': 0, 'iter': 2320, 'loss': 6.089073657989502}


EP_train:0:  34%|| 2331/6926 [2:09:22<5:27:00,  4.27s/it]

{'epoch': 0, 'iter': 2330, 'loss': 6.463432312011719}


EP_train:0:  34%|| 2341/6926 [2:10:04<5:26:17,  4.27s/it]

{'epoch': 0, 'iter': 2340, 'loss': 6.566033363342285}


EP_train:0:  34%|| 2351/6926 [2:10:47<5:20:08,  4.20s/it]

{'epoch': 0, 'iter': 2350, 'loss': 7.076557636260986}


EP_train:0:  34%|| 2361/6926 [2:11:28<5:17:47,  4.18s/it]

{'epoch': 0, 'iter': 2360, 'loss': 6.126839637756348}


EP_train:0:  34%|| 2371/6926 [2:12:10<5:13:29,  4.13s/it]

{'epoch': 0, 'iter': 2370, 'loss': 6.476442813873291}


EP_train:0:  34%|| 2381/6926 [2:12:52<5:15:42,  4.17s/it]

{'epoch': 0, 'iter': 2380, 'loss': 6.097146034240723}


EP_train:0:  35%|| 2391/6926 [2:13:34<5:20:12,  4.24s/it]

{'epoch': 0, 'iter': 2390, 'loss': 6.612268447875977}


EP_train:0:  35%|| 2401/6926 [2:14:17<5:18:26,  4.22s/it]

{'epoch': 0, 'iter': 2400, 'loss': 6.233328819274902}


EP_train:0:  35%|| 2411/6926 [2:15:00<5:25:47,  4.33s/it]

{'epoch': 0, 'iter': 2410, 'loss': 6.512572765350342}


EP_train:0:  35%|| 2421/6926 [2:15:42<5:17:14,  4.23s/it]

{'epoch': 0, 'iter': 2420, 'loss': 6.361563682556152}


EP_train:0:  35%|| 2431/6926 [2:16:24<5:19:58,  4.27s/it]

{'epoch': 0, 'iter': 2430, 'loss': 6.909940242767334}


EP_train:0:  35%|| 2441/6926 [2:17:07<5:18:57,  4.27s/it]

{'epoch': 0, 'iter': 2440, 'loss': 6.6474151611328125}


EP_train:0:  35%|| 2451/6926 [2:17:49<5:17:53,  4.26s/it]

{'epoch': 0, 'iter': 2450, 'loss': 6.611946105957031}


EP_train:0:  36%|| 2461/6926 [2:18:30<5:01:09,  4.05s/it]

{'epoch': 0, 'iter': 2460, 'loss': 6.680189609527588}


EP_train:0:  36%|| 2471/6926 [2:19:13<5:14:39,  4.24s/it]

{'epoch': 0, 'iter': 2470, 'loss': 6.311417102813721}


EP_train:0:  36%|| 2481/6926 [2:19:55<5:11:09,  4.20s/it]

{'epoch': 0, 'iter': 2480, 'loss': 6.776192665100098}


EP_train:0:  36%|| 2491/6926 [2:20:37<5:08:14,  4.17s/it]

{'epoch': 0, 'iter': 2490, 'loss': 6.3158745765686035}


EP_train:0:  36%|| 2501/6926 [2:21:19<5:09:39,  4.20s/it]

{'epoch': 0, 'iter': 2500, 'loss': 6.852359294891357}


EP_train:0:  36%|| 2511/6926 [2:22:01<5:04:18,  4.14s/it]

{'epoch': 0, 'iter': 2510, 'loss': 6.425663948059082}


EP_train:0:  36%|| 2521/6926 [2:22:44<5:08:35,  4.20s/it]

{'epoch': 0, 'iter': 2520, 'loss': 6.379358291625977}


EP_train:0:  37%|| 2531/6926 [2:23:26<5:03:43,  4.15s/it]

{'epoch': 0, 'iter': 2530, 'loss': 6.462167739868164}


EP_train:0:  37%|| 2541/6926 [2:24:07<5:03:25,  4.15s/it]

{'epoch': 0, 'iter': 2540, 'loss': 6.649077892303467}


EP_train:0:  37%|| 2551/6926 [2:24:51<5:10:48,  4.26s/it]

{'epoch': 0, 'iter': 2550, 'loss': 6.99330997467041}


EP_train:0:  37%|| 2561/6926 [2:25:31<4:47:22,  3.95s/it]

{'epoch': 0, 'iter': 2560, 'loss': 6.813333988189697}


EP_train:0:  37%|| 2571/6926 [2:26:11<4:54:56,  4.06s/it]

{'epoch': 0, 'iter': 2570, 'loss': 6.473178863525391}


EP_train:0:  37%|| 2581/6926 [2:26:51<4:50:53,  4.02s/it]

{'epoch': 0, 'iter': 2580, 'loss': 6.554617881774902}


EP_train:0:  37%|| 2591/6926 [2:27:31<4:45:48,  3.96s/it]

{'epoch': 0, 'iter': 2590, 'loss': 6.671253681182861}


EP_train:0:  38%|| 2601/6926 [2:28:11<4:47:28,  3.99s/it]

{'epoch': 0, 'iter': 2600, 'loss': 6.307362079620361}


EP_train:0:  38%|| 2611/6926 [2:28:52<4:52:24,  4.07s/it]

{'epoch': 0, 'iter': 2610, 'loss': 6.851354598999023}


EP_train:0:  38%|| 2621/6926 [2:29:33<4:52:52,  4.08s/it]

{'epoch': 0, 'iter': 2620, 'loss': 6.417898654937744}


EP_train:0:  38%|| 2631/6926 [2:30:13<4:46:42,  4.01s/it]

{'epoch': 0, 'iter': 2630, 'loss': 6.193329334259033}


EP_train:0:  38%|| 2641/6926 [2:30:53<4:47:38,  4.03s/it]

{'epoch': 0, 'iter': 2640, 'loss': 6.193037986755371}


EP_train:0:  38%|| 2651/6926 [2:31:33<4:43:58,  3.99s/it]

{'epoch': 0, 'iter': 2650, 'loss': 6.793360233306885}


EP_train:0:  38%|| 2661/6926 [2:32:14<4:47:41,  4.05s/it]

{'epoch': 0, 'iter': 2660, 'loss': 6.511561393737793}


EP_train:0:  39%|| 2671/6926 [2:32:54<4:37:51,  3.92s/it]

{'epoch': 0, 'iter': 2670, 'loss': 6.635089874267578}


EP_train:0:  39%|| 2681/6926 [2:33:34<4:45:13,  4.03s/it]

{'epoch': 0, 'iter': 2680, 'loss': 6.401362419128418}


EP_train:0:  39%|| 2691/6926 [2:34:15<4:45:50,  4.05s/it]

{'epoch': 0, 'iter': 2690, 'loss': 6.828788757324219}


EP_train:0:  39%|| 2701/6926 [2:34:55<4:50:07,  4.12s/it]

{'epoch': 0, 'iter': 2700, 'loss': 6.824191570281982}


EP_train:0:  39%|| 2711/6926 [2:35:36<4:49:32,  4.12s/it]

{'epoch': 0, 'iter': 2710, 'loss': 6.188155651092529}


EP_train:0:  39%|| 2721/6926 [2:36:16<4:38:04,  3.97s/it]

{'epoch': 0, 'iter': 2720, 'loss': 6.545516490936279}


EP_train:0:  39%|| 2731/6926 [2:36:56<4:44:48,  4.07s/it]

{'epoch': 0, 'iter': 2730, 'loss': 6.593472480773926}


EP_train:0:  40%|| 2741/6926 [2:37:37<4:43:04,  4.06s/it]

{'epoch': 0, 'iter': 2740, 'loss': 6.746532440185547}


EP_train:0:  40%|| 2751/6926 [2:38:17<4:42:16,  4.06s/it]

{'epoch': 0, 'iter': 2750, 'loss': 6.668141841888428}


EP_train:0:  40%|| 2761/6926 [2:38:57<4:39:52,  4.03s/it]

{'epoch': 0, 'iter': 2760, 'loss': 6.58176851272583}


EP_train:0:  40%|| 2771/6926 [2:39:38<4:44:58,  4.12s/it]

{'epoch': 0, 'iter': 2770, 'loss': 6.8581929206848145}


EP_train:0:  40%|| 2781/6926 [2:40:19<4:43:47,  4.11s/it]

{'epoch': 0, 'iter': 2780, 'loss': 6.788348197937012}


EP_train:0:  40%|| 2791/6926 [2:40:59<4:42:27,  4.10s/it]

{'epoch': 0, 'iter': 2790, 'loss': 6.372860431671143}


EP_train:0:  40%|| 2801/6926 [2:41:40<4:42:34,  4.11s/it]

{'epoch': 0, 'iter': 2800, 'loss': 6.623529434204102}


EP_train:0:  41%|| 2811/6926 [2:42:21<4:41:41,  4.11s/it]

{'epoch': 0, 'iter': 2810, 'loss': 6.773873805999756}


EP_train:0:  41%|| 2821/6926 [2:43:01<4:36:28,  4.04s/it]

{'epoch': 0, 'iter': 2820, 'loss': 6.652586936950684}


EP_train:0:  41%|| 2831/6926 [2:43:42<4:35:23,  4.03s/it]

{'epoch': 0, 'iter': 2830, 'loss': 6.955265045166016}


EP_train:0:  41%|| 2841/6926 [2:44:22<4:35:16,  4.04s/it]

{'epoch': 0, 'iter': 2840, 'loss': 6.774853706359863}


EP_train:0:  41%|| 2851/6926 [2:45:03<4:37:26,  4.09s/it]

{'epoch': 0, 'iter': 2850, 'loss': 6.4982709884643555}


EP_train:0:  41%|| 2861/6926 [2:45:44<4:34:23,  4.05s/it]

{'epoch': 0, 'iter': 2860, 'loss': 6.264394760131836}


EP_train:0:  41%|| 2871/6926 [2:46:24<4:34:40,  4.06s/it]

{'epoch': 0, 'iter': 2870, 'loss': 6.368768215179443}


EP_train:0:  42%|| 2881/6926 [2:47:04<4:28:03,  3.98s/it]

{'epoch': 0, 'iter': 2880, 'loss': 6.633325576782227}


EP_train:0:  42%|| 2891/6926 [2:47:46<4:39:49,  4.16s/it]

{'epoch': 0, 'iter': 2890, 'loss': 6.691486835479736}


EP_train:0:  42%|| 2901/6926 [2:48:27<4:36:19,  4.12s/it]

{'epoch': 0, 'iter': 2900, 'loss': 6.389592170715332}


EP_train:0:  42%|| 2911/6926 [2:49:07<4:31:56,  4.06s/it]

{'epoch': 0, 'iter': 2910, 'loss': 6.153446197509766}


EP_train:0:  42%|| 2921/6926 [2:49:48<4:34:18,  4.11s/it]

{'epoch': 0, 'iter': 2920, 'loss': 6.325006008148193}


EP_train:0:  42%|| 2931/6926 [2:50:29<4:32:28,  4.09s/it]

{'epoch': 0, 'iter': 2930, 'loss': 5.987374782562256}


EP_train:0:  42%|| 2941/6926 [2:51:10<4:29:06,  4.05s/it]

{'epoch': 0, 'iter': 2940, 'loss': 6.123432636260986}


EP_train:0:  43%|| 2951/6926 [2:51:51<4:33:29,  4.13s/it]

{'epoch': 0, 'iter': 2950, 'loss': 6.330215930938721}


EP_train:0:  43%|| 2961/6926 [2:52:32<4:33:25,  4.14s/it]

{'epoch': 0, 'iter': 2960, 'loss': 6.569223403930664}


EP_train:0:  43%|| 2971/6926 [2:53:13<4:31:19,  4.12s/it]

{'epoch': 0, 'iter': 2970, 'loss': 6.398012638092041}


EP_train:0:  43%|| 2981/6926 [2:53:54<4:29:04,  4.09s/it]

{'epoch': 0, 'iter': 2980, 'loss': 6.642857074737549}


EP_train:0:  43%|| 2991/6926 [2:54:35<4:28:19,  4.09s/it]

{'epoch': 0, 'iter': 2990, 'loss': 6.191871166229248}


EP_train:0:  43%|| 3001/6926 [2:55:16<4:26:56,  4.08s/it]

{'epoch': 0, 'iter': 3000, 'loss': 6.574568748474121}


EP_train:0:  43%|| 3011/6926 [2:55:57<4:30:26,  4.14s/it]

{'epoch': 0, 'iter': 3010, 'loss': 6.399039268493652}


EP_train:0:  44%|| 3021/6926 [2:56:39<4:28:58,  4.13s/it]

{'epoch': 0, 'iter': 3020, 'loss': 6.736607551574707}


EP_train:0:  44%|| 3031/6926 [2:57:19<4:26:25,  4.10s/it]

{'epoch': 0, 'iter': 3030, 'loss': 6.3453168869018555}


EP_train:0:  44%|| 3041/6926 [2:58:00<4:22:53,  4.06s/it]

{'epoch': 0, 'iter': 3040, 'loss': 6.8593316078186035}


EP_train:0:  44%|| 3051/6926 [2:58:41<4:29:30,  4.17s/it]

{'epoch': 0, 'iter': 3050, 'loss': 6.4188055992126465}


EP_train:0:  44%|| 3061/6926 [2:59:22<4:27:35,  4.15s/it]

{'epoch': 0, 'iter': 3060, 'loss': 7.054901123046875}


EP_train:0:  44%|| 3071/6926 [3:00:04<4:25:51,  4.14s/it]

{'epoch': 0, 'iter': 3070, 'loss': 6.468173503875732}


EP_train:0:  44%|| 3081/6926 [3:00:45<4:21:16,  4.08s/it]

{'epoch': 0, 'iter': 3080, 'loss': 6.41637659072876}


EP_train:0:  45%|| 3091/6926 [3:01:26<4:29:20,  4.21s/it]

{'epoch': 0, 'iter': 3090, 'loss': 6.188133716583252}


EP_train:0:  45%|| 3101/6926 [3:02:10<4:34:17,  4.30s/it]

{'epoch': 0, 'iter': 3100, 'loss': 6.380772590637207}


EP_train:0:  45%|| 3111/6926 [3:02:52<4:28:37,  4.22s/it]

{'epoch': 0, 'iter': 3110, 'loss': 6.37169885635376}


EP_train:0:  45%|| 3121/6926 [3:03:34<4:24:20,  4.17s/it]

{'epoch': 0, 'iter': 3120, 'loss': 6.41666841506958}


EP_train:0:  45%|| 3131/6926 [3:04:17<4:32:25,  4.31s/it]

{'epoch': 0, 'iter': 3130, 'loss': 6.46490478515625}


EP_train:0:  45%|| 3141/6926 [3:05:00<4:27:15,  4.24s/it]

{'epoch': 0, 'iter': 3140, 'loss': 6.6178507804870605}


EP_train:0:  45%|| 3151/6926 [3:05:42<4:23:41,  4.19s/it]

{'epoch': 0, 'iter': 3150, 'loss': 6.129010200500488}


EP_train:0:  46%|| 3161/6926 [3:06:24<4:26:56,  4.25s/it]

{'epoch': 0, 'iter': 3160, 'loss': 6.267211437225342}


EP_train:0:  46%|| 3171/6926 [3:07:06<4:26:13,  4.25s/it]

{'epoch': 0, 'iter': 3170, 'loss': 6.379340171813965}


EP_train:0:  46%|| 3181/6926 [3:07:49<4:25:57,  4.26s/it]

{'epoch': 0, 'iter': 3180, 'loss': 6.679071426391602}


EP_train:0:  46%|| 3191/6926 [3:08:31<4:19:51,  4.17s/it]

{'epoch': 0, 'iter': 3190, 'loss': 6.648940086364746}


EP_train:0:  46%|| 3201/6926 [3:09:14<4:24:07,  4.25s/it]

{'epoch': 0, 'iter': 3200, 'loss': 6.238692760467529}


EP_train:0:  46%|| 3211/6926 [3:09:56<4:21:32,  4.22s/it]

{'epoch': 0, 'iter': 3210, 'loss': 6.72393798828125}


EP_train:0:  47%|| 3221/6926 [3:10:38<4:20:53,  4.23s/it]

{'epoch': 0, 'iter': 3220, 'loss': 6.977975845336914}


EP_train:0:  47%|| 3231/6926 [3:11:21<4:27:33,  4.34s/it]

{'epoch': 0, 'iter': 3230, 'loss': 6.505522727966309}


EP_train:0:  47%|| 3241/6926 [3:12:03<4:19:17,  4.22s/it]

{'epoch': 0, 'iter': 3240, 'loss': 6.341251373291016}


EP_train:0:  47%|| 3251/6926 [3:12:46<4:21:18,  4.27s/it]

{'epoch': 0, 'iter': 3250, 'loss': 6.962259292602539}


EP_train:0:  47%|| 3261/6926 [3:13:28<4:21:00,  4.27s/it]

{'epoch': 0, 'iter': 3260, 'loss': 6.5994367599487305}


EP_train:0:  47%|| 3271/6926 [3:14:11<4:19:08,  4.25s/it]

{'epoch': 0, 'iter': 3270, 'loss': 6.5414838790893555}


EP_train:0:  47%|| 3281/6926 [3:14:53<4:17:26,  4.24s/it]

{'epoch': 0, 'iter': 3280, 'loss': 6.587400913238525}


EP_train:0:  48%|| 3291/6926 [3:15:35<4:17:15,  4.25s/it]

{'epoch': 0, 'iter': 3290, 'loss': 7.021934509277344}


EP_train:0:  48%|| 3301/6926 [3:16:17<4:18:10,  4.27s/it]

{'epoch': 0, 'iter': 3300, 'loss': 6.135509490966797}


EP_train:0:  48%|| 3311/6926 [3:16:59<4:15:27,  4.24s/it]

{'epoch': 0, 'iter': 3310, 'loss': 7.158744812011719}


EP_train:0:  48%|| 3321/6926 [3:17:42<4:10:01,  4.16s/it]

{'epoch': 0, 'iter': 3320, 'loss': 6.865646839141846}


EP_train:0:  48%|| 3331/6926 [3:18:24<4:15:33,  4.27s/it]

{'epoch': 0, 'iter': 3330, 'loss': 6.574974060058594}


EP_train:0:  48%|| 3341/6926 [3:19:07<4:14:08,  4.25s/it]

{'epoch': 0, 'iter': 3340, 'loss': 6.466408729553223}


EP_train:0:  48%|| 3351/6926 [3:19:50<4:12:58,  4.25s/it]

{'epoch': 0, 'iter': 3350, 'loss': 6.598042964935303}


EP_train:0:  49%|| 3361/6926 [3:20:32<4:13:18,  4.26s/it]

{'epoch': 0, 'iter': 3360, 'loss': 6.180822372436523}


EP_train:0:  49%|| 3371/6926 [3:21:15<4:14:51,  4.30s/it]

{'epoch': 0, 'iter': 3370, 'loss': 6.69344425201416}


EP_train:0:  49%|| 3381/6926 [3:21:57<4:04:35,  4.14s/it]

{'epoch': 0, 'iter': 3380, 'loss': 6.298222064971924}


EP_train:0:  49%|| 3391/6926 [3:22:39<4:09:16,  4.23s/it]

{'epoch': 0, 'iter': 3390, 'loss': 6.457891941070557}


EP_train:0:  49%|| 3401/6926 [3:23:23<4:22:01,  4.46s/it]

{'epoch': 0, 'iter': 3400, 'loss': 6.812070846557617}


EP_train:0:  49%|| 3411/6926 [3:24:06<4:10:42,  4.28s/it]

{'epoch': 0, 'iter': 3410, 'loss': 6.246264934539795}


EP_train:0:  49%|| 3421/6926 [3:24:48<4:08:56,  4.26s/it]

{'epoch': 0, 'iter': 3420, 'loss': 6.4958906173706055}


EP_train:0:  50%|| 3431/6926 [3:25:31<4:11:03,  4.31s/it]

{'epoch': 0, 'iter': 3430, 'loss': 6.45327615737915}


EP_train:0:  50%|| 3441/6926 [3:26:15<4:10:00,  4.30s/it]

{'epoch': 0, 'iter': 3440, 'loss': 6.498260974884033}


EP_train:0:  50%|| 3451/6926 [3:26:57<3:59:30,  4.14s/it]

{'epoch': 0, 'iter': 3450, 'loss': 6.004724979400635}


EP_train:0:  50%|| 3461/6926 [3:27:40<4:06:41,  4.27s/it]

{'epoch': 0, 'iter': 3460, 'loss': 6.682702541351318}


EP_train:0:  50%|| 3471/6926 [3:28:22<4:05:16,  4.26s/it]

{'epoch': 0, 'iter': 3470, 'loss': 6.271570205688477}


EP_train:0:  50%|| 3481/6926 [3:29:04<4:01:55,  4.21s/it]

{'epoch': 0, 'iter': 3480, 'loss': 6.787234306335449}


EP_train:0:  50%|| 3491/6926 [3:29:47<4:02:40,  4.24s/it]

{'epoch': 0, 'iter': 3490, 'loss': 6.820544719696045}


EP_train:0:  51%|| 3501/6926 [3:30:29<4:06:10,  4.31s/it]

{'epoch': 0, 'iter': 3500, 'loss': 6.575282573699951}


EP_train:0:  51%|| 3511/6926 [3:31:11<4:00:39,  4.23s/it]

{'epoch': 0, 'iter': 3510, 'loss': 6.763660907745361}


EP_train:0:  51%|| 3521/6926 [3:31:53<4:01:07,  4.25s/it]

{'epoch': 0, 'iter': 3520, 'loss': 6.607110500335693}


EP_train:0:  51%|| 3531/6926 [3:32:35<3:55:33,  4.16s/it]

{'epoch': 0, 'iter': 3530, 'loss': 6.596709251403809}


EP_train:0:  51%|| 3541/6926 [3:33:16<3:54:48,  4.16s/it]

{'epoch': 0, 'iter': 3540, 'loss': 6.643033981323242}


EP_train:0:  51%|| 3551/6926 [3:33:58<3:52:19,  4.13s/it]

{'epoch': 0, 'iter': 3550, 'loss': 6.415520191192627}


EP_train:0:  51%|| 3561/6926 [3:34:39<3:51:43,  4.13s/it]

{'epoch': 0, 'iter': 3560, 'loss': 6.553550720214844}


EP_train:0:  52%|| 3571/6926 [3:35:20<3:53:01,  4.17s/it]

{'epoch': 0, 'iter': 3570, 'loss': 6.559764862060547}


EP_train:0:  52%|| 3581/6926 [3:36:02<3:52:08,  4.16s/it]

{'epoch': 0, 'iter': 3580, 'loss': 6.699310779571533}


EP_train:0:  52%|| 3591/6926 [3:36:43<3:50:07,  4.14s/it]

{'epoch': 0, 'iter': 3590, 'loss': 6.106431484222412}


EP_train:0:  52%|| 3601/6926 [3:37:25<3:53:08,  4.21s/it]

{'epoch': 0, 'iter': 3600, 'loss': 6.493234634399414}


EP_train:0:  52%|| 3611/6926 [3:38:06<3:52:48,  4.21s/it]

{'epoch': 0, 'iter': 3610, 'loss': 6.563889503479004}


EP_train:0:  52%|| 3621/6926 [3:38:48<3:49:35,  4.17s/it]

{'epoch': 0, 'iter': 3620, 'loss': 6.632420539855957}


EP_train:0:  52%|| 3631/6926 [3:39:29<3:49:36,  4.18s/it]

{'epoch': 0, 'iter': 3630, 'loss': 6.442342758178711}


EP_train:0:  53%|| 3641/6926 [3:40:11<3:50:38,  4.21s/it]

{'epoch': 0, 'iter': 3640, 'loss': 6.686977386474609}


EP_train:0:  53%|| 3651/6926 [3:40:54<3:52:36,  4.26s/it]

{'epoch': 0, 'iter': 3650, 'loss': 6.923647403717041}


EP_train:0:  53%|| 3661/6926 [3:41:36<3:50:59,  4.24s/it]

{'epoch': 0, 'iter': 3660, 'loss': 6.374137878417969}


EP_train:0:  53%|| 3671/6926 [3:42:18<3:43:44,  4.12s/it]

{'epoch': 0, 'iter': 3670, 'loss': 6.799701690673828}


EP_train:0:  53%|| 3681/6926 [3:43:01<3:51:02,  4.27s/it]

{'epoch': 0, 'iter': 3680, 'loss': 6.294975280761719}


EP_train:0:  53%|| 3691/6926 [3:43:44<3:50:55,  4.28s/it]

{'epoch': 0, 'iter': 3690, 'loss': 6.264002799987793}


EP_train:0:  53%|| 3701/6926 [3:44:26<3:44:40,  4.18s/it]

{'epoch': 0, 'iter': 3700, 'loss': 6.357506275177002}


EP_train:0:  54%|| 3711/6926 [3:45:08<3:46:28,  4.23s/it]

{'epoch': 0, 'iter': 3710, 'loss': 6.873950004577637}


EP_train:0:  54%|| 3721/6926 [3:45:51<3:47:23,  4.26s/it]

{'epoch': 0, 'iter': 3720, 'loss': 6.395504474639893}


EP_train:0:  54%|| 3731/6926 [3:46:33<3:45:46,  4.24s/it]

{'epoch': 0, 'iter': 3730, 'loss': 6.178622722625732}


EP_train:0:  54%|| 3741/6926 [3:47:16<3:41:51,  4.18s/it]

{'epoch': 0, 'iter': 3740, 'loss': 5.935133934020996}


EP_train:0:  54%|| 3751/6926 [3:47:58<3:45:17,  4.26s/it]

{'epoch': 0, 'iter': 3750, 'loss': 6.4778733253479}


EP_train:0:  54%|| 3761/6926 [3:48:41<3:43:10,  4.23s/it]

{'epoch': 0, 'iter': 3760, 'loss': 6.658216953277588}


EP_train:0:  54%|| 3771/6926 [3:49:23<3:42:59,  4.24s/it]

{'epoch': 0, 'iter': 3770, 'loss': 6.718311309814453}


EP_train:0:  55%|| 3781/6926 [3:50:06<3:44:18,  4.28s/it]

{'epoch': 0, 'iter': 3780, 'loss': 5.885298252105713}


EP_train:0:  55%|| 3791/6926 [3:50:48<3:41:02,  4.23s/it]

{'epoch': 0, 'iter': 3790, 'loss': 6.629024982452393}


EP_train:0:  55%|| 3801/6926 [3:51:31<3:42:04,  4.26s/it]

{'epoch': 0, 'iter': 3800, 'loss': 6.472931861877441}


EP_train:0:  55%|| 3811/6926 [3:52:14<3:41:16,  4.26s/it]

{'epoch': 0, 'iter': 3810, 'loss': 6.604378700256348}


EP_train:0:  55%|| 3821/6926 [3:52:57<3:37:53,  4.21s/it]

{'epoch': 0, 'iter': 3820, 'loss': 5.891790390014648}


EP_train:0:  55%|| 3831/6926 [3:53:39<3:34:23,  4.16s/it]

{'epoch': 0, 'iter': 3830, 'loss': 6.651917934417725}


EP_train:0:  55%|| 3841/6926 [3:54:21<3:38:00,  4.24s/it]

{'epoch': 0, 'iter': 3840, 'loss': 6.486023426055908}


EP_train:0:  56%|| 3851/6926 [3:55:04<3:37:37,  4.25s/it]

{'epoch': 0, 'iter': 3850, 'loss': 6.464644908905029}


EP_train:0:  56%|| 3861/6926 [3:55:46<3:34:18,  4.20s/it]

{'epoch': 0, 'iter': 3860, 'loss': 6.177788734436035}


EP_train:0:  56%|| 3871/6926 [3:56:29<3:37:06,  4.26s/it]

{'epoch': 0, 'iter': 3870, 'loss': 6.377863883972168}


EP_train:0:  56%|| 3881/6926 [3:57:11<3:39:27,  4.32s/it]

{'epoch': 0, 'iter': 3880, 'loss': 6.779869079589844}


EP_train:0:  56%|| 3891/6926 [3:57:53<3:28:32,  4.12s/it]

{'epoch': 0, 'iter': 3890, 'loss': 6.676414489746094}


EP_train:0:  56%|| 3901/6926 [3:58:36<3:35:39,  4.28s/it]

{'epoch': 0, 'iter': 3900, 'loss': 6.30963659286499}


EP_train:0:  56%|| 3911/6926 [3:59:18<3:31:39,  4.21s/it]

{'epoch': 0, 'iter': 3910, 'loss': 6.132951736450195}


EP_train:0:  57%|| 3921/6926 [4:00:01<3:34:23,  4.28s/it]

{'epoch': 0, 'iter': 3920, 'loss': 6.605841636657715}


EP_train:0:  57%|| 3931/6926 [4:00:43<3:27:55,  4.17s/it]

{'epoch': 0, 'iter': 3930, 'loss': 6.348088264465332}


EP_train:0:  57%|| 3941/6926 [4:01:25<3:24:17,  4.11s/it]

{'epoch': 0, 'iter': 3940, 'loss': 6.11338996887207}


EP_train:0:  57%|| 3951/6926 [4:02:07<3:28:31,  4.21s/it]

{'epoch': 0, 'iter': 3950, 'loss': 6.451095104217529}


EP_train:0:  57%|| 3961/6926 [4:02:48<3:21:18,  4.07s/it]

{'epoch': 0, 'iter': 3960, 'loss': 5.986704349517822}


EP_train:0:  57%|| 3971/6926 [4:03:30<3:24:41,  4.16s/it]

{'epoch': 0, 'iter': 3970, 'loss': 6.3772873878479}


EP_train:0:  57%|| 3981/6926 [4:04:12<3:22:37,  4.13s/it]

{'epoch': 0, 'iter': 3980, 'loss': 6.7298126220703125}


EP_train:0:  58%|| 3991/6926 [4:04:53<3:22:07,  4.13s/it]

{'epoch': 0, 'iter': 3990, 'loss': 6.253079891204834}


EP_train:0:  58%|| 4001/6926 [4:05:35<3:25:19,  4.21s/it]

{'epoch': 0, 'iter': 4000, 'loss': 6.85193395614624}


EP_train:0:  58%|| 4011/6926 [4:06:18<3:27:45,  4.28s/it]

{'epoch': 0, 'iter': 4010, 'loss': 6.249364852905273}


EP_train:0:  58%|| 4021/6926 [4:06:59<3:15:36,  4.04s/it]

{'epoch': 0, 'iter': 4020, 'loss': 6.683557510375977}


EP_train:0:  58%|| 4031/6926 [4:07:41<3:19:34,  4.14s/it]

{'epoch': 0, 'iter': 4030, 'loss': 6.069925308227539}


EP_train:0:  58%|| 4041/6926 [4:08:22<3:19:13,  4.14s/it]

{'epoch': 0, 'iter': 4040, 'loss': 6.642129898071289}


EP_train:0:  58%|| 4051/6926 [4:09:04<3:22:09,  4.22s/it]

{'epoch': 0, 'iter': 4050, 'loss': 6.050846576690674}


EP_train:0:  59%|| 4061/6926 [4:09:46<3:20:52,  4.21s/it]

{'epoch': 0, 'iter': 4060, 'loss': 6.220083713531494}


EP_train:0:  59%|| 4071/6926 [4:10:27<3:16:35,  4.13s/it]

{'epoch': 0, 'iter': 4070, 'loss': 6.3281474113464355}


EP_train:0:  59%|| 4081/6926 [4:11:08<3:16:22,  4.14s/it]

{'epoch': 0, 'iter': 4080, 'loss': 5.973665237426758}


EP_train:0:  59%|| 4091/6926 [4:11:50<3:14:49,  4.12s/it]

{'epoch': 0, 'iter': 4090, 'loss': 6.001520156860352}


EP_train:0:  59%|| 4101/6926 [4:12:31<3:13:38,  4.11s/it]

{'epoch': 0, 'iter': 4100, 'loss': 6.603658199310303}


EP_train:0:  59%|| 4111/6926 [4:13:12<3:15:22,  4.16s/it]

{'epoch': 0, 'iter': 4110, 'loss': 6.515074253082275}


EP_train:0:  60%|| 4121/6926 [4:13:55<3:18:57,  4.26s/it]

{'epoch': 0, 'iter': 4120, 'loss': 6.522736072540283}


EP_train:0:  60%|| 4131/6926 [4:14:36<3:14:52,  4.18s/it]

{'epoch': 0, 'iter': 4130, 'loss': 6.347981929779053}


EP_train:0:  60%|| 4141/6926 [4:15:18<3:16:38,  4.24s/it]

{'epoch': 0, 'iter': 4140, 'loss': 6.04838228225708}


EP_train:0:  60%|| 4151/6926 [4:16:00<3:12:00,  4.15s/it]

{'epoch': 0, 'iter': 4150, 'loss': 6.344769477844238}


EP_train:0:  60%|| 4161/6926 [4:16:41<3:13:04,  4.19s/it]

{'epoch': 0, 'iter': 4160, 'loss': 6.723227500915527}


EP_train:0:  60%|| 4171/6926 [4:17:23<3:15:51,  4.27s/it]

{'epoch': 0, 'iter': 4170, 'loss': 6.824166774749756}


EP_train:0:  60%|| 4181/6926 [4:18:05<3:11:38,  4.19s/it]

{'epoch': 0, 'iter': 4180, 'loss': 6.35142183303833}


EP_train:0:  61%|| 4191/6926 [4:18:47<3:13:30,  4.25s/it]

{'epoch': 0, 'iter': 4190, 'loss': 6.413609504699707}


EP_train:0:  61%|| 4201/6926 [4:19:29<3:11:23,  4.21s/it]

{'epoch': 0, 'iter': 4200, 'loss': 6.287151336669922}


EP_train:0:  61%|| 4211/6926 [4:20:12<3:07:56,  4.15s/it]

{'epoch': 0, 'iter': 4210, 'loss': 6.457993030548096}


EP_train:0:  61%|| 4221/6926 [4:20:54<3:08:33,  4.18s/it]

{'epoch': 0, 'iter': 4220, 'loss': 6.425192356109619}


EP_train:0:  61%|| 4231/6926 [4:21:35<3:06:26,  4.15s/it]

{'epoch': 0, 'iter': 4230, 'loss': 6.404788017272949}


EP_train:0:  61%|| 4241/6926 [4:22:18<3:11:10,  4.27s/it]

{'epoch': 0, 'iter': 4240, 'loss': 6.813311576843262}


EP_train:0:  61%|| 4251/6926 [4:23:01<3:09:52,  4.26s/it]

{'epoch': 0, 'iter': 4250, 'loss': 6.481600284576416}


EP_train:0:  62%|| 4261/6926 [4:23:43<3:10:48,  4.30s/it]

{'epoch': 0, 'iter': 4260, 'loss': 6.144352436065674}


EP_train:0:  62%|| 4271/6926 [4:24:25<3:03:45,  4.15s/it]

{'epoch': 0, 'iter': 4270, 'loss': 5.754516124725342}


EP_train:0:  62%|| 4281/6926 [4:25:07<3:07:05,  4.24s/it]

{'epoch': 0, 'iter': 4280, 'loss': 6.180569171905518}


EP_train:0:  62%|| 4291/6926 [4:25:49<3:06:11,  4.24s/it]

{'epoch': 0, 'iter': 4290, 'loss': 6.619673728942871}


EP_train:0:  62%|| 4301/6926 [4:26:31<3:02:15,  4.17s/it]

{'epoch': 0, 'iter': 4300, 'loss': 6.444633960723877}


EP_train:0:  62%|| 4311/6926 [4:27:14<3:05:22,  4.25s/it]

{'epoch': 0, 'iter': 4310, 'loss': 6.718325614929199}


EP_train:0:  62%|| 4321/6926 [4:27:56<3:02:29,  4.20s/it]

{'epoch': 0, 'iter': 4320, 'loss': 6.618796348571777}


EP_train:0:  63%|| 4331/6926 [4:28:38<3:01:58,  4.21s/it]

{'epoch': 0, 'iter': 4330, 'loss': 6.3881001472473145}


EP_train:0:  63%|| 4341/6926 [4:29:20<2:59:21,  4.16s/it]

{'epoch': 0, 'iter': 4340, 'loss': 6.104949951171875}


EP_train:0:  63%|| 4351/6926 [4:30:02<3:02:40,  4.26s/it]

{'epoch': 0, 'iter': 4350, 'loss': 6.860218048095703}


EP_train:0:  63%|| 4361/6926 [4:30:45<3:02:03,  4.26s/it]

{'epoch': 0, 'iter': 4360, 'loss': 6.674605369567871}


EP_train:0:  63%|| 4371/6926 [4:31:27<3:02:15,  4.28s/it]

{'epoch': 0, 'iter': 4370, 'loss': 6.024733066558838}


EP_train:0:  63%|| 4381/6926 [4:32:10<3:01:06,  4.27s/it]

{'epoch': 0, 'iter': 4380, 'loss': 6.516086578369141}


EP_train:0:  63%|| 4391/6926 [4:32:51<2:55:57,  4.16s/it]

{'epoch': 0, 'iter': 4390, 'loss': 6.975014686584473}


EP_train:0:  64%|| 4401/6926 [4:33:33<2:52:53,  4.11s/it]

{'epoch': 0, 'iter': 4400, 'loss': 6.642664909362793}


EP_train:0:  64%|| 4411/6926 [4:34:15<2:57:00,  4.22s/it]

{'epoch': 0, 'iter': 4410, 'loss': 5.914079189300537}


EP_train:0:  64%|| 4421/6926 [4:34:57<2:53:37,  4.16s/it]

{'epoch': 0, 'iter': 4420, 'loss': 6.536232948303223}


EP_train:0:  64%|| 4431/6926 [4:35:39<2:53:54,  4.18s/it]

{'epoch': 0, 'iter': 4430, 'loss': 6.532085418701172}


EP_train:0:  64%|| 4441/6926 [4:36:22<2:54:39,  4.22s/it]

{'epoch': 0, 'iter': 4440, 'loss': 6.428864002227783}


EP_train:0:  64%|| 4451/6926 [4:37:04<2:51:07,  4.15s/it]

{'epoch': 0, 'iter': 4450, 'loss': 6.317174434661865}


EP_train:0:  64%|| 4461/6926 [4:37:46<2:49:44,  4.13s/it]

{'epoch': 0, 'iter': 4460, 'loss': 6.987366199493408}


EP_train:0:  65%|| 4471/6926 [4:38:28<2:52:52,  4.22s/it]

{'epoch': 0, 'iter': 4470, 'loss': 6.116776466369629}


EP_train:0:  65%|| 4481/6926 [4:39:10<2:50:11,  4.18s/it]

{'epoch': 0, 'iter': 4480, 'loss': 6.671289443969727}


EP_train:0:  65%|| 4491/6926 [4:39:51<2:51:15,  4.22s/it]

{'epoch': 0, 'iter': 4490, 'loss': 6.827786445617676}


EP_train:0:  65%|| 4501/6926 [4:40:34<2:49:50,  4.20s/it]

{'epoch': 0, 'iter': 4500, 'loss': 6.345432758331299}


EP_train:0:  65%|| 4511/6926 [4:41:16<2:51:25,  4.26s/it]

{'epoch': 0, 'iter': 4510, 'loss': 6.610424995422363}


EP_train:0:  65%|| 4521/6926 [4:41:58<2:46:39,  4.16s/it]

{'epoch': 0, 'iter': 4520, 'loss': 6.227712631225586}


EP_train:0:  65%|| 4531/6926 [4:42:40<2:49:35,  4.25s/it]

{'epoch': 0, 'iter': 4530, 'loss': 6.25495719909668}


EP_train:0:  66%|| 4541/6926 [4:43:23<2:52:52,  4.35s/it]

{'epoch': 0, 'iter': 4540, 'loss': 6.49331521987915}


EP_train:0:  66%|| 4551/6926 [4:44:06<2:51:03,  4.32s/it]

{'epoch': 0, 'iter': 4550, 'loss': 6.323776721954346}


EP_train:0:  66%|| 4561/6926 [4:44:47<2:43:41,  4.15s/it]

{'epoch': 0, 'iter': 4560, 'loss': 6.130199909210205}


EP_train:0:  66%|| 4571/6926 [4:45:29<2:42:28,  4.14s/it]

{'epoch': 0, 'iter': 4570, 'loss': 6.045596599578857}


EP_train:0:  66%|| 4581/6926 [4:46:11<2:43:56,  4.19s/it]

{'epoch': 0, 'iter': 4580, 'loss': 6.520138740539551}


EP_train:0:  66%|| 4591/6926 [4:46:52<2:38:17,  4.07s/it]

{'epoch': 0, 'iter': 4590, 'loss': 6.142780303955078}


EP_train:0:  66%|| 4601/6926 [4:47:34<2:43:16,  4.21s/it]

{'epoch': 0, 'iter': 4600, 'loss': 6.9373064041137695}


EP_train:0:  67%|| 4611/6926 [4:48:16<2:38:25,  4.11s/it]

{'epoch': 0, 'iter': 4610, 'loss': 7.005645751953125}


EP_train:0:  67%|| 4621/6926 [4:48:58<2:38:40,  4.13s/it]

{'epoch': 0, 'iter': 4620, 'loss': 6.2558369636535645}


EP_train:0:  67%|| 4631/6926 [4:49:40<2:41:39,  4.23s/it]

{'epoch': 0, 'iter': 4630, 'loss': 6.838585376739502}


EP_train:0:  67%|| 4641/6926 [4:50:22<2:40:00,  4.20s/it]

{'epoch': 0, 'iter': 4640, 'loss': 6.69957971572876}


EP_train:0:  67%|| 4651/6926 [4:51:04<2:40:09,  4.22s/it]

{'epoch': 0, 'iter': 4650, 'loss': 6.903911590576172}


EP_train:0:  67%|| 4661/6926 [4:51:47<2:40:25,  4.25s/it]

{'epoch': 0, 'iter': 4660, 'loss': 6.2249884605407715}


EP_train:0:  67%|| 4671/6926 [4:52:29<2:36:40,  4.17s/it]

{'epoch': 0, 'iter': 4670, 'loss': 6.4951558113098145}


EP_train:0:  68%|| 4681/6926 [4:53:12<2:40:52,  4.30s/it]

{'epoch': 0, 'iter': 4680, 'loss': 6.7635345458984375}


EP_train:0:  68%|| 4691/6926 [4:53:54<2:38:39,  4.26s/it]

{'epoch': 0, 'iter': 4690, 'loss': 6.686708450317383}


EP_train:0:  68%|| 4701/6926 [4:54:36<2:33:14,  4.13s/it]

{'epoch': 0, 'iter': 4700, 'loss': 6.530459880828857}


EP_train:0:  68%|| 4711/6926 [4:55:18<2:35:27,  4.21s/it]

{'epoch': 0, 'iter': 4710, 'loss': 6.656693935394287}


EP_train:0:  68%|| 4721/6926 [4:56:00<2:33:47,  4.18s/it]

{'epoch': 0, 'iter': 4720, 'loss': 6.399863243103027}


EP_train:0:  68%|| 4731/6926 [4:56:42<2:32:14,  4.16s/it]

{'epoch': 0, 'iter': 4730, 'loss': 6.5870361328125}


EP_train:0:  68%|| 4741/6926 [4:57:23<2:32:38,  4.19s/it]

{'epoch': 0, 'iter': 4740, 'loss': 6.318546295166016}


EP_train:0:  69%|| 4751/6926 [4:58:06<2:31:36,  4.18s/it]

{'epoch': 0, 'iter': 4750, 'loss': 6.426755428314209}


EP_train:0:  69%|| 4761/6926 [4:58:49<2:33:22,  4.25s/it]

{'epoch': 0, 'iter': 4760, 'loss': 6.22062349319458}


EP_train:0:  69%|| 4771/6926 [4:59:31<2:32:55,  4.26s/it]

{'epoch': 0, 'iter': 4770, 'loss': 6.3339338302612305}


EP_train:0:  69%|| 4781/6926 [5:00:13<2:27:19,  4.12s/it]

{'epoch': 0, 'iter': 4780, 'loss': 6.302624225616455}


EP_train:0:  69%|| 4791/6926 [5:00:57<2:38:08,  4.44s/it]

{'epoch': 0, 'iter': 4790, 'loss': 6.0085649490356445}


EP_train:0:  69%|| 4801/6926 [5:01:38<2:26:58,  4.15s/it]

{'epoch': 0, 'iter': 4800, 'loss': 6.758691787719727}


EP_train:0:  69%|| 4811/6926 [5:02:21<2:28:27,  4.21s/it]

{'epoch': 0, 'iter': 4810, 'loss': 6.017941951751709}


EP_train:0:  70%|| 4821/6926 [5:03:03<2:26:34,  4.18s/it]

{'epoch': 0, 'iter': 4820, 'loss': 6.413397312164307}


EP_train:0:  70%|| 4831/6926 [5:03:45<2:26:58,  4.21s/it]

{'epoch': 0, 'iter': 4830, 'loss': 6.665712356567383}


EP_train:0:  70%|| 4841/6926 [5:04:27<2:27:33,  4.25s/it]

{'epoch': 0, 'iter': 4840, 'loss': 6.754311561584473}


EP_train:0:  70%|| 4851/6926 [5:05:10<2:27:40,  4.27s/it]

{'epoch': 0, 'iter': 4850, 'loss': 6.893001556396484}


EP_train:0:  70%|| 4861/6926 [5:05:52<2:26:44,  4.26s/it]

{'epoch': 0, 'iter': 4860, 'loss': 6.36196231842041}


EP_train:0:  70%|| 4871/6926 [5:06:35<2:27:47,  4.32s/it]

{'epoch': 0, 'iter': 4870, 'loss': 6.60344123840332}


EP_train:0:  70%|| 4881/6926 [5:07:18<2:26:36,  4.30s/it]

{'epoch': 0, 'iter': 4880, 'loss': 6.134089946746826}


EP_train:0:  71%|| 4891/6926 [5:08:00<2:22:46,  4.21s/it]

{'epoch': 0, 'iter': 4890, 'loss': 6.531342029571533}


EP_train:0:  71%|| 4901/6926 [5:08:42<2:23:15,  4.24s/it]

{'epoch': 0, 'iter': 4900, 'loss': 5.9774603843688965}


EP_train:0:  71%|| 4911/6926 [5:09:25<2:20:24,  4.18s/it]

{'epoch': 0, 'iter': 4910, 'loss': 6.368823051452637}


EP_train:0:  71%|| 4921/6926 [5:10:07<2:21:40,  4.24s/it]

{'epoch': 0, 'iter': 4920, 'loss': 6.448410987854004}


EP_train:0:  71%|| 4931/6926 [5:10:50<2:22:07,  4.27s/it]

{'epoch': 0, 'iter': 4930, 'loss': 6.766286373138428}


EP_train:0:  71%|| 4941/6926 [5:11:32<2:19:57,  4.23s/it]

{'epoch': 0, 'iter': 4940, 'loss': 6.247921943664551}


EP_train:0:  71%|| 4951/6926 [5:12:16<2:22:42,  4.34s/it]

{'epoch': 0, 'iter': 4950, 'loss': 6.632506847381592}


EP_train:0:  72%|| 4961/6926 [5:12:57<2:17:46,  4.21s/it]

{'epoch': 0, 'iter': 4960, 'loss': 6.39146614074707}


EP_train:0:  72%|| 4971/6926 [5:13:40<2:18:49,  4.26s/it]

{'epoch': 0, 'iter': 4970, 'loss': 6.8190484046936035}


EP_train:0:  72%|| 4981/6926 [5:14:23<2:19:42,  4.31s/it]

{'epoch': 0, 'iter': 4980, 'loss': 6.291867256164551}


EP_train:0:  72%|| 4991/6926 [5:15:05<2:17:02,  4.25s/it]

{'epoch': 0, 'iter': 4990, 'loss': 6.65775203704834}


EP_train:0:  72%|| 5001/6926 [5:15:48<2:16:50,  4.27s/it]

{'epoch': 0, 'iter': 5000, 'loss': 6.750972270965576}


EP_train:0:  72%|| 5011/6926 [5:16:30<2:17:23,  4.30s/it]

{'epoch': 0, 'iter': 5010, 'loss': 6.582996368408203}


EP_train:0:  72%|| 5021/6926 [5:17:13<2:15:38,  4.27s/it]

{'epoch': 0, 'iter': 5020, 'loss': 6.308555603027344}


EP_train:0:  73%|| 5031/6926 [5:17:55<2:11:57,  4.18s/it]

{'epoch': 0, 'iter': 5030, 'loss': 6.186202526092529}


EP_train:0:  73%|| 5041/6926 [5:18:37<2:11:29,  4.19s/it]

{'epoch': 0, 'iter': 5040, 'loss': 6.6310601234436035}


EP_train:0:  73%|| 5051/6926 [5:19:19<2:11:40,  4.21s/it]

{'epoch': 0, 'iter': 5050, 'loss': 6.360294818878174}


EP_train:0:  73%|| 5061/6926 [5:20:01<2:11:31,  4.23s/it]

{'epoch': 0, 'iter': 5060, 'loss': 6.908473014831543}


EP_train:0:  73%|| 5071/6926 [5:20:43<2:12:00,  4.27s/it]

{'epoch': 0, 'iter': 5070, 'loss': 7.071699619293213}


EP_train:0:  73%|| 5081/6926 [5:21:26<2:10:42,  4.25s/it]

{'epoch': 0, 'iter': 5080, 'loss': 6.946551322937012}


EP_train:0:  74%|| 5091/6926 [5:22:08<2:08:52,  4.21s/it]

{'epoch': 0, 'iter': 5090, 'loss': 6.489626884460449}


EP_train:0:  74%|| 5101/6926 [5:22:51<2:08:55,  4.24s/it]

{'epoch': 0, 'iter': 5100, 'loss': 6.96905517578125}


EP_train:0:  74%|| 5111/6926 [5:23:34<2:09:25,  4.28s/it]

{'epoch': 0, 'iter': 5110, 'loss': 6.613163471221924}


EP_train:0:  74%|| 5121/6926 [5:24:17<2:08:41,  4.28s/it]

{'epoch': 0, 'iter': 5120, 'loss': 7.164847373962402}


EP_train:0:  74%|| 5131/6926 [5:24:59<2:07:40,  4.27s/it]

{'epoch': 0, 'iter': 5130, 'loss': 6.826591968536377}


EP_train:0:  74%|| 5141/6926 [5:25:42<2:06:18,  4.25s/it]

{'epoch': 0, 'iter': 5140, 'loss': 6.273380279541016}


EP_train:0:  74%|| 5151/6926 [5:26:24<2:07:03,  4.30s/it]

{'epoch': 0, 'iter': 5150, 'loss': 6.579494476318359}


EP_train:0:  75%|| 5161/6926 [5:27:06<2:04:50,  4.24s/it]

{'epoch': 0, 'iter': 5160, 'loss': 6.748312950134277}


EP_train:0:  75%|| 5171/6926 [5:27:49<2:02:30,  4.19s/it]

{'epoch': 0, 'iter': 5170, 'loss': 6.785363674163818}


EP_train:0:  75%|| 5181/6926 [5:28:31<2:03:31,  4.25s/it]

{'epoch': 0, 'iter': 5180, 'loss': 6.450759410858154}


EP_train:0:  75%|| 5191/6926 [5:29:14<2:04:16,  4.30s/it]

{'epoch': 0, 'iter': 5190, 'loss': 6.972221851348877}


EP_train:0:  75%|| 5201/6926 [5:29:56<2:00:56,  4.21s/it]

{'epoch': 0, 'iter': 5200, 'loss': 6.711421966552734}


EP_train:0:  75%|| 5211/6926 [5:30:38<2:00:35,  4.22s/it]

{'epoch': 0, 'iter': 5210, 'loss': 6.727732181549072}


EP_train:0:  75%|| 5221/6926 [5:31:20<1:58:59,  4.19s/it]

{'epoch': 0, 'iter': 5220, 'loss': 6.431234359741211}


EP_train:0:  76%|| 5231/6926 [5:32:02<2:01:59,  4.32s/it]

{'epoch': 0, 'iter': 5230, 'loss': 6.091779708862305}


EP_train:0:  76%|| 5241/6926 [5:32:44<1:57:33,  4.19s/it]

{'epoch': 0, 'iter': 5240, 'loss': 6.47713041305542}


EP_train:0:  76%|| 5251/6926 [5:33:27<2:00:13,  4.31s/it]

{'epoch': 0, 'iter': 5250, 'loss': 6.521236896514893}


EP_train:0:  76%|| 5261/6926 [5:34:09<1:56:44,  4.21s/it]

{'epoch': 0, 'iter': 5260, 'loss': 6.377130508422852}


EP_train:0:  76%|| 5271/6926 [5:34:52<1:57:29,  4.26s/it]

{'epoch': 0, 'iter': 5270, 'loss': 6.248863697052002}


EP_train:0:  76%|| 5281/6926 [5:35:35<1:56:22,  4.24s/it]

{'epoch': 0, 'iter': 5280, 'loss': 5.97696590423584}


EP_train:0:  76%|| 5291/6926 [5:36:16<1:49:40,  4.02s/it]

{'epoch': 0, 'iter': 5290, 'loss': 6.352745532989502}


EP_train:0:  77%|| 5301/6926 [5:36:59<1:55:04,  4.25s/it]

{'epoch': 0, 'iter': 5300, 'loss': 6.637901306152344}


EP_train:0:  77%|| 5311/6926 [5:37:42<1:54:27,  4.25s/it]

{'epoch': 0, 'iter': 5310, 'loss': 6.158117771148682}


EP_train:0:  77%|| 5321/6926 [5:38:24<1:53:48,  4.25s/it]

{'epoch': 0, 'iter': 5320, 'loss': 6.382528305053711}


EP_train:0:  77%|| 5331/6926 [5:39:07<1:52:23,  4.23s/it]

{'epoch': 0, 'iter': 5330, 'loss': 7.005890369415283}


EP_train:0:  77%|| 5341/6926 [5:39:50<1:53:56,  4.31s/it]

{'epoch': 0, 'iter': 5340, 'loss': 6.650279998779297}


EP_train:0:  77%|| 5351/6926 [5:40:32<1:52:13,  4.28s/it]

{'epoch': 0, 'iter': 5350, 'loss': 6.621920585632324}


EP_train:0:  77%|| 5361/6926 [5:41:14<1:50:25,  4.23s/it]

{'epoch': 0, 'iter': 5360, 'loss': 6.881036281585693}


EP_train:0:  78%|| 5371/6926 [5:41:57<1:51:45,  4.31s/it]

{'epoch': 0, 'iter': 5370, 'loss': 6.599723815917969}


EP_train:0:  78%|| 5381/6926 [5:42:40<1:49:51,  4.27s/it]

{'epoch': 0, 'iter': 5380, 'loss': 6.6075921058654785}


EP_train:0:  78%|| 5391/6926 [5:43:22<1:44:47,  4.10s/it]

{'epoch': 0, 'iter': 5390, 'loss': 6.140737056732178}


EP_train:0:  78%|| 5401/6926 [5:44:05<1:49:28,  4.31s/it]

{'epoch': 0, 'iter': 5400, 'loss': 6.546850204467773}


EP_train:0:  78%|| 5411/6926 [5:44:48<1:48:11,  4.29s/it]

{'epoch': 0, 'iter': 5410, 'loss': 6.598869323730469}


EP_train:0:  78%|| 5421/6926 [5:45:30<1:46:18,  4.24s/it]

{'epoch': 0, 'iter': 5420, 'loss': 6.266997337341309}


EP_train:0:  78%|| 5431/6926 [5:46:14<1:48:06,  4.34s/it]

{'epoch': 0, 'iter': 5430, 'loss': 6.553002834320068}


EP_train:0:  79%|| 5441/6926 [5:46:56<1:44:37,  4.23s/it]

{'epoch': 0, 'iter': 5440, 'loss': 6.437440872192383}


EP_train:0:  79%|| 5451/6926 [5:47:39<1:47:06,  4.36s/it]

{'epoch': 0, 'iter': 5450, 'loss': 6.124105930328369}


EP_train:0:  79%|| 5461/6926 [5:48:22<1:44:23,  4.28s/it]

{'epoch': 0, 'iter': 5460, 'loss': 6.952751636505127}


EP_train:0:  79%|| 5471/6926 [5:49:05<1:42:41,  4.23s/it]

{'epoch': 0, 'iter': 5470, 'loss': 6.314328193664551}


EP_train:0:  79%|| 5481/6926 [5:49:47<1:41:30,  4.21s/it]

{'epoch': 0, 'iter': 5480, 'loss': 6.430561065673828}


EP_train:0:  79%|| 5491/6926 [5:50:29<1:40:13,  4.19s/it]

{'epoch': 0, 'iter': 5490, 'loss': 6.281064510345459}


EP_train:0:  79%|| 5501/6926 [5:51:13<1:44:25,  4.40s/it]

{'epoch': 0, 'iter': 5500, 'loss': 6.582269668579102}


EP_train:0:  80%|| 5511/6926 [5:51:56<1:40:11,  4.25s/it]

{'epoch': 0, 'iter': 5510, 'loss': 6.926053047180176}


EP_train:0:  80%|| 5521/6926 [5:52:39<1:41:42,  4.34s/it]

{'epoch': 0, 'iter': 5520, 'loss': 6.448729991912842}


EP_train:0:  80%|| 5531/6926 [5:53:22<1:39:00,  4.26s/it]

{'epoch': 0, 'iter': 5530, 'loss': 6.319498062133789}


EP_train:0:  80%|| 5541/6926 [5:54:04<1:39:36,  4.32s/it]

{'epoch': 0, 'iter': 5540, 'loss': 6.943202495574951}


EP_train:0:  80%|| 5551/6926 [5:54:47<1:38:30,  4.30s/it]

{'epoch': 0, 'iter': 5550, 'loss': 6.278879642486572}


EP_train:0:  80%|| 5561/6926 [5:55:30<1:36:43,  4.25s/it]

{'epoch': 0, 'iter': 5560, 'loss': 7.069982528686523}


EP_train:0:  80%|| 5571/6926 [5:56:13<1:36:24,  4.27s/it]

{'epoch': 0, 'iter': 5570, 'loss': 6.623528003692627}


EP_train:0:  81%|| 5581/6926 [5:56:55<1:36:09,  4.29s/it]

{'epoch': 0, 'iter': 5580, 'loss': 6.351459503173828}


EP_train:0:  81%|| 5591/6926 [5:57:38<1:35:13,  4.28s/it]

{'epoch': 0, 'iter': 5590, 'loss': 6.698626518249512}


EP_train:0:  81%|| 5601/6926 [5:58:21<1:33:55,  4.25s/it]

{'epoch': 0, 'iter': 5600, 'loss': 5.857157230377197}


EP_train:0:  81%|| 5611/6926 [5:59:04<1:35:02,  4.34s/it]

{'epoch': 0, 'iter': 5610, 'loss': 6.852749347686768}


EP_train:0:  81%|| 5621/6926 [5:59:47<1:32:39,  4.26s/it]

{'epoch': 0, 'iter': 5620, 'loss': 7.158393859863281}


EP_train:0:  81%|| 5631/6926 [6:00:29<1:33:13,  4.32s/it]

{'epoch': 0, 'iter': 5630, 'loss': 7.203127861022949}


EP_train:0:  81%|| 5641/6926 [6:01:12<1:31:49,  4.29s/it]

{'epoch': 0, 'iter': 5640, 'loss': 6.357213973999023}


EP_train:0:  82%|| 5651/6926 [6:01:56<1:32:30,  4.35s/it]

{'epoch': 0, 'iter': 5650, 'loss': 6.443183898925781}


EP_train:0:  82%|| 5661/6926 [6:02:38<1:27:43,  4.16s/it]

{'epoch': 0, 'iter': 5660, 'loss': 6.336967468261719}


EP_train:0:  82%|| 5671/6926 [6:03:21<1:29:30,  4.28s/it]

{'epoch': 0, 'iter': 5670, 'loss': 6.575072288513184}


EP_train:0:  82%|| 5681/6926 [6:04:03<1:28:58,  4.29s/it]

{'epoch': 0, 'iter': 5680, 'loss': 5.9850311279296875}


EP_train:0:  82%|| 5691/6926 [6:04:45<1:25:34,  4.16s/it]

{'epoch': 0, 'iter': 5690, 'loss': 6.508340835571289}


EP_train:0:  82%|| 5701/6926 [6:05:27<1:25:42,  4.20s/it]

{'epoch': 0, 'iter': 5700, 'loss': 6.856263160705566}


EP_train:0:  82%|| 5711/6926 [6:06:09<1:26:12,  4.26s/it]

{'epoch': 0, 'iter': 5710, 'loss': 6.66064977645874}


EP_train:0:  83%|| 5721/6926 [6:06:51<1:25:11,  4.24s/it]

{'epoch': 0, 'iter': 5720, 'loss': 6.254003524780273}


EP_train:0:  83%|| 5731/6926 [6:07:34<1:23:49,  4.21s/it]

{'epoch': 0, 'iter': 5730, 'loss': 6.860940456390381}


EP_train:0:  83%|| 5741/6926 [6:08:16<1:23:25,  4.22s/it]

{'epoch': 0, 'iter': 5740, 'loss': 6.153923034667969}


EP_train:0:  83%|| 5751/6926 [6:08:58<1:22:51,  4.23s/it]

{'epoch': 0, 'iter': 5750, 'loss': 6.673842430114746}


EP_train:0:  83%|| 5761/6926 [6:09:41<1:21:46,  4.21s/it]

{'epoch': 0, 'iter': 5760, 'loss': 6.507546424865723}


EP_train:0:  83%|| 5771/6926 [6:10:23<1:20:27,  4.18s/it]

{'epoch': 0, 'iter': 5770, 'loss': 6.272229194641113}


EP_train:0:  83%|| 5781/6926 [6:11:05<1:20:27,  4.22s/it]

{'epoch': 0, 'iter': 5780, 'loss': 6.567092418670654}


EP_train:0:  84%|| 5791/6926 [6:11:48<1:19:38,  4.21s/it]

{'epoch': 0, 'iter': 5790, 'loss': 6.6238017082214355}


EP_train:0:  84%|| 5801/6926 [6:12:30<1:19:19,  4.23s/it]

{'epoch': 0, 'iter': 5800, 'loss': 6.199941158294678}


EP_train:0:  84%|| 5811/6926 [6:13:12<1:18:49,  4.24s/it]

{'epoch': 0, 'iter': 5810, 'loss': 5.882711410522461}


EP_train:0:  84%|| 5821/6926 [6:13:55<1:18:59,  4.29s/it]

{'epoch': 0, 'iter': 5820, 'loss': 6.953503608703613}


EP_train:0:  84%|| 5831/6926 [6:14:38<1:17:43,  4.26s/it]

{'epoch': 0, 'iter': 5830, 'loss': 6.75377082824707}


EP_train:0:  84%|| 5841/6926 [6:15:21<1:17:45,  4.30s/it]

{'epoch': 0, 'iter': 5840, 'loss': 6.431156635284424}


EP_train:0:  84%|| 5851/6926 [6:16:03<1:15:08,  4.19s/it]

{'epoch': 0, 'iter': 5850, 'loss': 7.009109973907471}


EP_train:0:  85%|| 5861/6926 [6:16:46<1:15:21,  4.25s/it]

{'epoch': 0, 'iter': 5860, 'loss': 6.4508562088012695}


EP_train:0:  85%|| 5871/6926 [6:17:29<1:15:23,  4.29s/it]

{'epoch': 0, 'iter': 5870, 'loss': 6.433557033538818}


EP_train:0:  85%|| 5881/6926 [6:18:13<1:14:28,  4.28s/it]

{'epoch': 0, 'iter': 5880, 'loss': 7.078103542327881}


EP_train:0:  85%|| 5891/6926 [6:18:56<1:13:22,  4.25s/it]

{'epoch': 0, 'iter': 5890, 'loss': 6.175863742828369}


EP_train:0:  85%|| 5901/6926 [6:19:38<1:11:04,  4.16s/it]

{'epoch': 0, 'iter': 5900, 'loss': 6.855776309967041}


EP_train:0:  85%|| 5911/6926 [6:20:21<1:12:49,  4.31s/it]

{'epoch': 0, 'iter': 5910, 'loss': 6.464379787445068}


EP_train:0:  85%|| 5921/6926 [6:21:03<1:11:17,  4.26s/it]

{'epoch': 0, 'iter': 5920, 'loss': 6.787362575531006}


EP_train:0:  86%|| 5931/6926 [6:21:46<1:12:40,  4.38s/it]

{'epoch': 0, 'iter': 5930, 'loss': 6.816312789916992}


EP_train:0:  86%|| 5941/6926 [6:22:30<1:09:48,  4.25s/it]

{'epoch': 0, 'iter': 5940, 'loss': 6.629576683044434}


EP_train:0:  86%|| 5951/6926 [6:23:12<1:08:25,  4.21s/it]

{'epoch': 0, 'iter': 5950, 'loss': 6.5645952224731445}


EP_train:0:  86%|| 5961/6926 [6:23:55<1:08:06,  4.24s/it]

{'epoch': 0, 'iter': 5960, 'loss': 6.218899726867676}


EP_train:0:  86%|| 5971/6926 [6:24:37<1:07:32,  4.24s/it]

{'epoch': 0, 'iter': 5970, 'loss': 6.9514312744140625}


EP_train:0:  86%|| 5981/6926 [6:25:20<1:07:51,  4.31s/it]

{'epoch': 0, 'iter': 5980, 'loss': 6.9852142333984375}


EP_train:0:  87%|| 5991/6926 [6:26:03<1:06:32,  4.27s/it]

{'epoch': 0, 'iter': 5990, 'loss': 6.765519618988037}


EP_train:0:  87%|| 6001/6926 [6:26:45<1:05:05,  4.22s/it]

{'epoch': 0, 'iter': 6000, 'loss': 6.6060471534729}


EP_train:0:  87%|| 6011/6926 [6:27:27<1:04:41,  4.24s/it]

{'epoch': 0, 'iter': 6010, 'loss': 5.700097560882568}


EP_train:0:  87%|| 6021/6926 [6:28:11<1:04:10,  4.26s/it]

{'epoch': 0, 'iter': 6020, 'loss': 6.116816997528076}


EP_train:0:  87%|| 6031/6926 [6:28:53<1:03:51,  4.28s/it]

{'epoch': 0, 'iter': 6030, 'loss': 6.564675807952881}


EP_train:0:  87%|| 6041/6926 [6:29:36<1:02:39,  4.25s/it]

{'epoch': 0, 'iter': 6040, 'loss': 6.50661563873291}


EP_train:0:  87%|| 6051/6926 [6:30:19<1:02:54,  4.31s/it]

{'epoch': 0, 'iter': 6050, 'loss': 7.089773654937744}


EP_train:0:  88%|| 6061/6926 [6:31:01<1:01:12,  4.25s/it]

{'epoch': 0, 'iter': 6060, 'loss': 6.372762203216553}


EP_train:0:  88%|| 6071/6926 [6:31:43<1:00:57,  4.28s/it]

{'epoch': 0, 'iter': 6070, 'loss': 6.21228551864624}


EP_train:0:  88%|| 6081/6926 [6:32:26<58:43,  4.17s/it]  

{'epoch': 0, 'iter': 6080, 'loss': 6.307191371917725}


EP_train:0:  88%|| 6091/6926 [6:33:08<59:30,  4.28s/it]

{'epoch': 0, 'iter': 6090, 'loss': 6.5861592292785645}


EP_train:0:  88%|| 6101/6926 [6:33:51<58:36,  4.26s/it]

{'epoch': 0, 'iter': 6100, 'loss': 6.012978553771973}


EP_train:0:  88%|| 6111/6926 [6:34:33<56:57,  4.19s/it]

{'epoch': 0, 'iter': 6110, 'loss': 6.205115795135498}


EP_train:0:  88%|| 6121/6926 [6:35:16<57:50,  4.31s/it]

{'epoch': 0, 'iter': 6120, 'loss': 6.289916515350342}


EP_train:0:  89%|| 6131/6926 [6:35:59<56:18,  4.25s/it]

{'epoch': 0, 'iter': 6130, 'loss': 6.207505226135254}


EP_train:0:  89%|| 6141/6926 [6:36:40<55:00,  4.20s/it]

{'epoch': 0, 'iter': 6140, 'loss': 6.6875457763671875}


EP_train:0:  89%|| 6151/6926 [6:37:24<55:04,  4.26s/it]

{'epoch': 0, 'iter': 6150, 'loss': 6.482076644897461}


EP_train:0:  89%|| 6161/6926 [6:38:06<54:07,  4.25s/it]

{'epoch': 0, 'iter': 6160, 'loss': 7.347252368927002}


EP_train:0:  89%|| 6171/6926 [6:38:49<53:05,  4.22s/it]

{'epoch': 0, 'iter': 6170, 'loss': 6.644051551818848}


EP_train:0:  89%|| 6181/6926 [6:39:32<53:56,  4.34s/it]

{'epoch': 0, 'iter': 6180, 'loss': 6.396815776824951}


EP_train:0:  89%|| 6191/6926 [6:40:15<52:43,  4.30s/it]

{'epoch': 0, 'iter': 6190, 'loss': 6.712141990661621}


EP_train:0:  90%|| 6201/6926 [6:40:58<52:10,  4.32s/it]

{'epoch': 0, 'iter': 6200, 'loss': 6.95470666885376}


EP_train:0:  90%|| 6211/6926 [6:41:40<50:05,  4.20s/it]

{'epoch': 0, 'iter': 6210, 'loss': 6.399807929992676}


EP_train:0:  90%|| 6221/6926 [6:42:24<50:57,  4.34s/it]

{'epoch': 0, 'iter': 6220, 'loss': 7.11556339263916}


EP_train:0:  90%|| 6231/6926 [6:43:06<49:09,  4.24s/it]

{'epoch': 0, 'iter': 6230, 'loss': 6.29572057723999}


EP_train:0:  90%|| 6241/6926 [6:43:50<49:05,  4.30s/it]

{'epoch': 0, 'iter': 6240, 'loss': 6.393573760986328}


EP_train:0:  90%|| 6251/6926 [6:44:33<48:25,  4.30s/it]

{'epoch': 0, 'iter': 6250, 'loss': 6.564793586730957}


EP_train:0:  90%|| 6261/6926 [6:45:16<47:54,  4.32s/it]

{'epoch': 0, 'iter': 6260, 'loss': 6.695060729980469}


EP_train:0:  91%|| 6271/6926 [6:45:59<46:25,  4.25s/it]

{'epoch': 0, 'iter': 6270, 'loss': 6.456600666046143}


EP_train:0:  91%|| 6281/6926 [6:46:41<45:46,  4.26s/it]

{'epoch': 0, 'iter': 6280, 'loss': 6.299896717071533}


EP_train:0:  91%|| 6291/6926 [6:47:24<45:24,  4.29s/it]

{'epoch': 0, 'iter': 6290, 'loss': 6.89680814743042}


EP_train:0:  91%|| 6301/6926 [6:48:07<43:52,  4.21s/it]

{'epoch': 0, 'iter': 6300, 'loss': 6.7527899742126465}


EP_train:0:  91%|| 6311/6926 [6:48:50<44:21,  4.33s/it]

{'epoch': 0, 'iter': 6310, 'loss': 6.681653022766113}


EP_train:0:  91%|| 6321/6926 [6:49:33<43:24,  4.31s/it]

{'epoch': 0, 'iter': 6320, 'loss': 6.689919948577881}


EP_train:0:  91%|| 6331/6926 [6:50:16<43:09,  4.35s/it]

{'epoch': 0, 'iter': 6330, 'loss': 6.524555206298828}


EP_train:0:  92%|| 6341/6926 [6:51:00<42:25,  4.35s/it]

{'epoch': 0, 'iter': 6340, 'loss': 7.0499067306518555}


EP_train:0:  92%|| 6351/6926 [6:51:44<41:55,  4.37s/it]

{'epoch': 0, 'iter': 6350, 'loss': 6.656920909881592}


EP_train:0:  92%|| 6361/6926 [6:52:27<40:30,  4.30s/it]

{'epoch': 0, 'iter': 6360, 'loss': 6.84847354888916}


EP_train:0:  92%|| 6371/6926 [6:53:10<39:50,  4.31s/it]

{'epoch': 0, 'iter': 6370, 'loss': 6.469264984130859}


EP_train:0:  92%|| 6381/6926 [6:53:52<38:40,  4.26s/it]

{'epoch': 0, 'iter': 6380, 'loss': 6.196928024291992}


EP_train:0:  92%|| 6391/6926 [6:54:35<38:07,  4.27s/it]

{'epoch': 0, 'iter': 6390, 'loss': 6.238376140594482}


EP_train:0:  92%|| 6401/6926 [6:55:17<37:10,  4.25s/it]

{'epoch': 0, 'iter': 6400, 'loss': 6.445345878601074}


EP_train:0:  93%|| 6411/6926 [6:56:01<36:42,  4.28s/it]

{'epoch': 0, 'iter': 6410, 'loss': 6.310887813568115}


EP_train:0:  93%|| 6421/6926 [6:56:44<36:15,  4.31s/it]

{'epoch': 0, 'iter': 6420, 'loss': 6.485260009765625}


EP_train:0:  93%|| 6431/6926 [6:57:28<35:46,  4.34s/it]

{'epoch': 0, 'iter': 6430, 'loss': 6.09249210357666}


EP_train:0:  93%|| 6441/6926 [6:58:11<34:44,  4.30s/it]

{'epoch': 0, 'iter': 6440, 'loss': 6.5648393630981445}


EP_train:0:  93%|| 6451/6926 [6:58:54<33:56,  4.29s/it]

{'epoch': 0, 'iter': 6450, 'loss': 6.716659069061279}


EP_train:0:  93%|| 6461/6926 [6:59:38<33:50,  4.37s/it]

{'epoch': 0, 'iter': 6460, 'loss': 6.404367446899414}


EP_train:0:  93%|| 6471/6926 [7:00:21<33:14,  4.38s/it]

{'epoch': 0, 'iter': 6470, 'loss': 6.395771503448486}


EP_train:0:  94%|| 6481/6926 [7:01:04<32:14,  4.35s/it]

{'epoch': 0, 'iter': 6480, 'loss': 6.526921272277832}


EP_train:0:  94%|| 6491/6926 [7:01:48<31:56,  4.41s/it]

{'epoch': 0, 'iter': 6490, 'loss': 7.021016597747803}


EP_train:0:  94%|| 6501/6926 [7:02:30<30:11,  4.26s/it]

{'epoch': 0, 'iter': 6500, 'loss': 6.2121405601501465}


EP_train:0:  94%|| 6511/6926 [7:03:13<29:00,  4.19s/it]

{'epoch': 0, 'iter': 6510, 'loss': 6.635948181152344}


EP_train:0:  94%|| 6521/6926 [7:03:55<28:45,  4.26s/it]

{'epoch': 0, 'iter': 6520, 'loss': 6.512888431549072}


EP_train:0:  94%|| 6531/6926 [7:04:38<27:46,  4.22s/it]

{'epoch': 0, 'iter': 6530, 'loss': 6.4810686111450195}


EP_train:0:  94%|| 6541/6926 [7:05:20<27:00,  4.21s/it]

{'epoch': 0, 'iter': 6540, 'loss': 6.515596389770508}


EP_train:0:  95%|| 6551/6926 [7:06:03<26:28,  4.24s/it]

{'epoch': 0, 'iter': 6550, 'loss': 6.665632247924805}


EP_train:0:  95%|| 6561/6926 [7:06:44<25:33,  4.20s/it]

{'epoch': 0, 'iter': 6560, 'loss': 6.738984107971191}


EP_train:0:  95%|| 6571/6926 [7:07:27<25:14,  4.27s/it]

{'epoch': 0, 'iter': 6570, 'loss': 6.716711044311523}


EP_train:0:  95%|| 6581/6926 [7:08:09<24:31,  4.27s/it]

{'epoch': 0, 'iter': 6580, 'loss': 7.010966777801514}


EP_train:0:  95%|| 6591/6926 [7:08:52<24:04,  4.31s/it]

{'epoch': 0, 'iter': 6590, 'loss': 6.577805995941162}


EP_train:0:  95%|| 6601/6926 [7:09:34<22:59,  4.24s/it]

{'epoch': 0, 'iter': 6600, 'loss': 6.315138816833496}


EP_train:0:  95%|| 6611/6926 [7:10:17<22:18,  4.25s/it]

{'epoch': 0, 'iter': 6610, 'loss': 6.769558429718018}


EP_train:0:  96%|| 6621/6926 [7:10:59<21:25,  4.21s/it]

{'epoch': 0, 'iter': 6620, 'loss': 6.792551517486572}


EP_train:0:  96%|| 6631/6926 [7:11:41<20:46,  4.23s/it]

{'epoch': 0, 'iter': 6630, 'loss': 6.579977035522461}


EP_train:0:  96%|| 6641/6926 [7:12:24<20:05,  4.23s/it]

{'epoch': 0, 'iter': 6640, 'loss': 6.768193244934082}


EP_train:0:  96%|| 6651/6926 [7:13:07<19:50,  4.33s/it]

{'epoch': 0, 'iter': 6650, 'loss': 6.149838447570801}


EP_train:0:  96%|| 6661/6926 [7:13:50<19:12,  4.35s/it]

{'epoch': 0, 'iter': 6660, 'loss': 6.493819713592529}


EP_train:0:  96%|| 6671/6926 [7:14:33<18:06,  4.26s/it]

{'epoch': 0, 'iter': 6670, 'loss': 6.433303356170654}


EP_train:0:  96%|| 6681/6926 [7:15:15<17:10,  4.21s/it]

{'epoch': 0, 'iter': 6680, 'loss': 6.179806232452393}


EP_train:0:  97%|| 6691/6926 [7:15:58<16:39,  4.25s/it]

{'epoch': 0, 'iter': 6690, 'loss': 6.959714412689209}


EP_train:0:  97%|| 6701/6926 [7:16:41<16:01,  4.27s/it]

{'epoch': 0, 'iter': 6700, 'loss': 6.883444309234619}


EP_train:0:  97%|| 6711/6926 [7:17:24<15:33,  4.34s/it]

{'epoch': 0, 'iter': 6710, 'loss': 6.70276403427124}


EP_train:0:  97%|| 6721/6926 [7:18:07<14:36,  4.27s/it]

{'epoch': 0, 'iter': 6720, 'loss': 6.851257801055908}


EP_train:0:  97%|| 6731/6926 [7:18:50<14:07,  4.35s/it]

{'epoch': 0, 'iter': 6730, 'loss': 6.346704483032227}


EP_train:0:  97%|| 6741/6926 [7:19:33<13:09,  4.27s/it]

{'epoch': 0, 'iter': 6740, 'loss': 6.769784450531006}


EP_train:0:  97%|| 6751/6926 [7:20:16<12:28,  4.28s/it]

{'epoch': 0, 'iter': 6750, 'loss': 6.161133289337158}


EP_train:0:  98%|| 6761/6926 [7:20:58<11:49,  4.30s/it]

{'epoch': 0, 'iter': 6760, 'loss': 6.662722110748291}


EP_train:0:  98%|| 6771/6926 [7:21:41<10:57,  4.24s/it]

{'epoch': 0, 'iter': 6770, 'loss': 6.769390106201172}


EP_train:0:  98%|| 6781/6926 [7:22:25<10:40,  4.42s/it]

{'epoch': 0, 'iter': 6780, 'loss': 7.022343158721924}


EP_train:0:  98%|| 6791/6926 [7:23:08<09:33,  4.25s/it]

{'epoch': 0, 'iter': 6790, 'loss': 6.332185745239258}


EP_train:0:  98%|| 6801/6926 [7:23:52<09:08,  4.39s/it]

{'epoch': 0, 'iter': 6800, 'loss': 6.359706401824951}


EP_train:0:  98%|| 6811/6926 [7:24:35<08:16,  4.32s/it]

{'epoch': 0, 'iter': 6810, 'loss': 7.4704132080078125}


EP_train:0:  98%|| 6821/6926 [7:25:18<07:34,  4.33s/it]

{'epoch': 0, 'iter': 6820, 'loss': 6.565096378326416}


EP_train:0:  99%|| 6831/6926 [7:26:01<06:49,  4.31s/it]

{'epoch': 0, 'iter': 6830, 'loss': 6.847146511077881}


EP_train:0:  99%|| 6841/6926 [7:26:44<06:05,  4.30s/it]

{'epoch': 0, 'iter': 6840, 'loss': 5.992539882659912}


EP_train:0:  99%|| 6851/6926 [7:27:28<05:26,  4.35s/it]

{'epoch': 0, 'iter': 6850, 'loss': 6.966829299926758}


EP_train:0:  99%|| 6861/6926 [7:28:10<04:37,  4.28s/it]

{'epoch': 0, 'iter': 6860, 'loss': 6.664290428161621}


EP_train:0:  99%|| 6871/6926 [7:28:54<03:58,  4.34s/it]

{'epoch': 0, 'iter': 6870, 'loss': 6.984534740447998}


EP_train:0:  99%|| 6881/6926 [7:29:37<03:15,  4.33s/it]

{'epoch': 0, 'iter': 6880, 'loss': 6.457298278808594}


EP_train:0:  99%|| 6891/6926 [7:30:20<02:33,  4.37s/it]

{'epoch': 0, 'iter': 6890, 'loss': 6.22625207901001}


EP_train:0: 100%|| 6901/6926 [7:31:03<01:46,  4.25s/it]

{'epoch': 0, 'iter': 6900, 'loss': 6.341737270355225}


EP_train:0: 100%|| 6911/6926 [7:31:46<01:01,  4.10s/it]

{'epoch': 0, 'iter': 6910, 'loss': 6.621090888977051}


EP_train:0: 100%|| 6921/6926 [7:32:29<00:21,  4.33s/it]

{'epoch': 0, 'iter': 6920, 'loss': 6.206551551818848}


EP_train:0: 100%|| 6926/6926 [7:32:48<00:00,  3.92s/it]


ZeroDivisionError: float division by zero

In [889]:
class Counter:

    def __init__(self, start: int = 0) -> None:
        self.counter = start

    def __next__(self) -> int:
        i = self.counter
        self.counter += 1
        return i

    def reset(self) -> None:
        self.counter = 0

In [890]:
c = Counter()

In [891]:
next(c)

0

In [892]:
next(c)

1

In [893]:
next(c)

2

In [894]:
reset(c)

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


Don't know how to reset  (c), please run `%reset?` for details


In [895]:
c


<__main__.Counter at 0x34fdec970>

In [896]:
next(c)

3

In [897]:
c.reset()

In [898]:
next(c)

0