In [None]:
import os, sys
import pickle   
from transformers import CamembertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split
from torch.nn import functional as F
import torch.nn as nn
from datasets import load_from_disk
from tqdm import tqdm
import time

# 1. Prepare Data :

In [2]:
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

In [3]:
class Tokenized_oscar_dataset(Dataset):
    def __init__(self, raw_tokenised_data):
        """
        Initialiser le dataset avec les données sous forme de dictionnaire.
        """
        self.raw_data = raw_tokenised_data  # Le dictionnaire avec 'input_ids', 'attention_mask', et 'labels'
        self.data = {
            "texts": self.raw_data['text'],
            "input_ids": self.raw_data['input_ids'],
            "attention_mask": self.raw_data['attention_mask'],
            "labels": self.raw_data['labels']
        }
        
    def __len__(self):
        """
        Retourne le nombre d'exemples dans le dataset.
        """
        return len(self.data['input_ids'])
    
    def __getitem__(self, idx):
        """
        Retourne un exemple individuel sous forme de dictionnaire.
        """
        return {
            'input_ids': torch.tensor(self.data['input_ids'][idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.data['attention_mask'][idx], dtype=torch.long),
            'labels': torch.tensor(self.data['labels'][idx], dtype=torch.long)
        }

    
def mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    """Prepare masked tokens for MLM."""
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # Replace 80% of masked tokens with [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Replace 10% of masked tokens with random words
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    return inputs, labels


def preprocess_function(texts, tokenizer, max_length=512):
    """
    Préparer les données pour l'entraînement MLM.
    :param texts: Liste de textes.
    :param tokenizer: Tokenizer de CamemBERT.
    :return: Dictionnaire avec 'input_ids', 'attention_mask', et 'labels'.
    """
    # Tokeniser les textes
    tokenized = tokenizer(texts, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]

    # Appliquer le masquage
    masked_inputs, labels = mask_tokens(input_ids, tokenizer)

    return { 
        "text": list(texts),
        "input_ids": masked_inputs.tolist(),
        "attention_mask": attention_mask.tolist(),
        "labels": labels.tolist()
    }

load the downlaoded Oscar :

In [None]:
dataset_path ="data/datasets/CamemBERT/data/CamemBERT/data/mini_oscar/mini_dataset.arrow"
mini_oscar_dataset = load_from_disk(dataset_path)
train_texts, val_texts = train_test_split(mini_oscar_dataset['text'], test_size=0.2, random_state=42)

In [None]:
train_data = preprocess_function(train_texts, tokenizer)
val_data = preprocess_function(val_texts, tokenizer)

In [None]:
import pickle

def save_tokenized_data(data, filename):
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "wb") as f:
        pickle.dump(data, f)

save_tokenized_data(train_data, "Noureddine/MLA-CamemBERT/data/Oscar/tokenized_train_data.pkl")
save_tokenized_data(val_data, "Noureddine/MLA-CamemBERT/data/Oscar/tokenized_val_data.pkl")

In [7]:
def load_tokenized_data(filename):
    with open(filename, "rb") as f:
        data = pickle.load(f)
    return data

loaded_train_data = load_tokenized_data("/home/amine/Noureddine/MLA-CamemBERT/data/Oscar/tokenized_train_data.pkl")
loaded_val_data = load_tokenized_data("/home/amine/Noureddine/MLA-CamemBERT/data/Oscar/tokenized_val_data.pkl")

# 2. Model :

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class CamembertConfig:
    def __init__(self):
        self.vocab_size = 32005
        self.hidden_size = 768
        self.num_hidden_layers = 12
        self.num_attention_heads = 12
        self.intermediate_size = 3072
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 514
        self.type_vocab_size = 1
        self.initializer_range = 0.02
        self.layer_norm_eps = 1e-5
        self.pad_token_id = 1
        self.head_type = "MLM"

class CamembertEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        input_shape = input_ids.size()
        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device).unsqueeze(0)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=input_ids.device)

        inputs_embeds = self.word_embeddings(input_ids)
        position_embeds = self.position_embeddings(position_ids)
        token_type_embeds = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + position_embeds + token_type_embeds
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)

        # Debug prints
        # print(f"Embeddings NaN: {torch.isnan(embeddings).any()}")

        return embeddings

class CamembertSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.dropout = nn.Dropout(0.2)  # Increased dropout rate

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        return x.view(new_x_shape).permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # Debug query, key, value
        # print(f"Query NaN: {torch.isnan(query_layer).any()}")
        # print(f"Key NaN: {torch.isnan(key_layer).any()}")
        # print(f"Value NaN: {torch.isnan(value_layer).any()}")

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores /= math.sqrt(self.attention_head_size)

        # Clamp scores to prevent overflow
        attention_scores = torch.clamp(attention_scores, min=-1e9, max=1e9)
        attention_probs = nn.functional.softmax(attention_scores, dim=-1) + 1e-9
        attention_probs = self.dropout(attention_probs)

        # Debug attention scores and probabilities
        # print(f"Attention Scores NaN Before Clamp: {torch.isnan(attention_scores).any()}")
        # print(f"Attention Probs NaN: {torch.isnan(attention_probs).any()}")

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        context_layer = context_layer.view(context_layer.size(0), -1, self.all_head_size)

        # Debug context layer
        # print(f"Context Layer NaN: {torch.isnan(context_layer).any()}")

        return context_layer



class CamembertFeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense_1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.activation = F.gelu if config.hidden_act == "gelu" else nn.ReLU()
        self.dense_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(0.2)  # Increased dropout rate

    def forward(self, hidden_states):
        intermediate_output = self.activation(self.dense_1(hidden_states))
        intermediate_output = torch.clamp(intermediate_output, min=-1e9, max=1e9)

        output = self.dense_2(intermediate_output)
        output = self.dropout(output)
        output = self.LayerNorm(output + hidden_states)

        # Debug intermediate and final outputs
        # print(f"Intermediate Output NaN: {torch.isnan(intermediate_output).any()}")
        # print(f"Final Output NaN: {torch.isnan(output).any()}")

        return output


class CamembertLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = CamembertSelfAttention(config)
        self.attention_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = CamembertFeedForward(config)

    def forward(self, hidden_states, attention_mask=None):
        attention_output = self.attention(hidden_states, attention_mask)
        hidden_states = self.attention_norm(hidden_states + attention_output)
        return self.feed_forward(hidden_states)

class CamembertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers = nn.ModuleList([CamembertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask=None):
        for i, layer in enumerate(self.layers):
            hidden_states = layer(hidden_states, attention_mask)

            # Debug prints for each layer
            # print(f"Layer {i} Hidden States NaN: {torch.isnan(hidden_states).any()}")

        return hidden_states

class CamembertLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, hidden_states):
        hidden_states = F.gelu(self.dense(hidden_states))
        hidden_states = self.layer_norm(hidden_states)
        logits = self.decoder(hidden_states)

        # Debug prints
        # print(f"Logits NaN: {torch.isnan(logits).any()}")

        return logits

class CamembertModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embeddings = CamembertEmbeddings(config)
        self.encoder = CamembertEncoder(config)
        self.head = CamembertLMHead(config) if config.head_type == "MLM" else None

    def forward(self, input_ids, attention_mask=None):
        embedded_input = self.embeddings(input_ids)

        if attention_mask is not None:
            attention_mask = (1.0 - attention_mask) * -float('inf')

        encoder_output = self.encoder(embedded_input, attention_mask)
        return self.head(encoder_output)


In [18]:
def reinitialize_weights(module):
    if isinstance(module, (nn.Linear, nn.Embedding)):
        module.reset_parameters()
    elif isinstance(module, nn.LayerNorm):
        module.reset_parameters()
    elif hasattr(module, "weight") and module.weight is not None:
        nn.init.xavier_uniform_(module.weight)  # Xavier initialization
    if hasattr(module, "bias") and module.bias is not None:
        nn.init.zeros_(module.bias)


In [19]:
from torchinfo import summary 

config = CamembertConfig()
model = CamembertModel(config)
# # Reinitialize weights
model.apply(reinitialize_weights)
print("Model weights reinitialized successfully.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Model moved to GPU successfully.")
summary(model)

Model weights reinitialized successfully.
Model moved to GPU successfully.


Layer (type:depth-idx)                             Param #
CamembertModel                                     --
├─CamembertEmbeddings: 1-1                         --
│    └─Embedding: 2-1                              24,579,840
│    └─Embedding: 2-2                              394,752
│    └─Embedding: 2-3                              768
│    └─LayerNorm: 2-4                              1,536
│    └─Dropout: 2-5                                --
├─CamembertEncoder: 1-2                            --
│    └─ModuleList: 2-6                             --
│    │    └─CamembertLayer: 3-1                    6,497,280
│    │    └─CamembertLayer: 3-2                    6,497,280
│    │    └─CamembertLayer: 3-3                    6,497,280
│    │    └─CamembertLayer: 3-4                    6,497,280
│    │    └─CamembertLayer: 3-5                    6,497,280
│    │    └─CamembertLayer: 3-6                    6,497,280
│    │    └─CamembertLayer: 3-7                    6,497,280
│    │    └

# 3. Train :

In [11]:
def save_checkpoint(epoch, model, optimizer, loss, save_dir):
    """
    Sauvegarder l'état actuel du modèle, optimiseur et perte.
    """
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": loss,
    }
    save_path = os.path.join(save_dir, f"checkpoint_epoch_{epoch}.pth")
    torch.save(checkpoint, save_path)
    print(f"Checkpoint saved at {save_path}")

In [12]:
batch_size = 32
train_dataset = Tokenized_oscar_dataset(loaded_train_data)
val_dataset = Tokenized_oscar_dataset(loaded_val_data)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [13]:
# 2. model :
#from transformers import CamembertForMaskedLM
#model = CamembertForMaskedLM.from_pretrained("camembert-base")
#model.apply(init_weights)
#print()

training loop :

In [None]:
import os
import time
import torch
import torch.nn.functional as F
from tqdm import tqdm
from transformers import get_scheduler

# Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = model.to(device)
print("Model moved to device.")

max_lr = 6e-4
min_lr = max_lr * 0.1  # 10 %
warmup_steps = 10
num_epochs = 10  # Total epochs
save_interval = 1  
eval_interval = 1  
learning_rate = 1e-4

# Optimizer and Scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_scheduler(
    "cosine", 
    optimizer=optimizer, 
    num_warmup_steps=warmup_steps, 
    num_training_steps=num_epochs * len(train_loader)
)

history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}

# Save Directory
save_dir = "models/Pretraining/model_checkpoints"
os.makedirs(save_dir, exist_ok=True)
print(f"Model checkpoints will be saved in: {save_dir}")

############################## Utility Functions ################################
def compute_accuracy(predictions, labels, ignore_index=-100):
    mask = labels != ignore_index
    correct = (predictions[mask] == labels[mask]).sum().item()
    total = mask.sum().item()
    return correct / total if total > 0 else 0.0

############################## Training Loop ################################
for epoch in range(num_epochs):
    print(f"\n--- Starting Epoch {epoch + 1}/{num_epochs} ---")
    model.train()  # Training mode
    train_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    t0 = time.time()

    # Training Phase
    with tqdm(total=len(train_loader), desc=f"Epoch {epoch + 1}/{num_epochs}") as pbar:
        for batch_index, batch in enumerate(train_loader):
            # Prepare data
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), ignore_index=-100)

            # Backward pass
            loss.backward()
            optimizer.step()
            scheduler.step()  # Update learning rate

            # Track Loss and Accuracy
            train_loss += loss.item()
            predictions = logits.argmax(dim=-1)
            correct_predictions += compute_accuracy(predictions, labels)
            pbar.set_postfix({'loss': loss.item()})
            pbar.update(1)

            # Print batch loss every 50 batches
            if batch_index % 50 == 0:
                print(f"  [Batch {batch_index}/{len(train_loader)}] Loss: {loss.item():.4f}")

    avg_train_loss = train_loss / len(train_loader)
    train_acc = correct_predictions / len(train_loader)

    t1 = time.time()
    print(f"Epoch {epoch + 1} Completed. Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_acc:.4f}, Time: {t1 - t0:.2f}s")

    history['train_loss'].append(avg_train_loss)
    history['train_acc'].append(train_acc)

    # Validation Phase
    if (epoch + 1) % eval_interval == 0 or (epoch + 1) == num_epochs:
        model.eval()
        val_loss = 0.0
        correct_val_predictions = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                logits = model(input_ids, attention_mask=attention_mask)
                loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), ignore_index=-100)
                val_loss += loss.item()

                # Accuracy
                predictions = logits.argmax(dim=-1)
                correct_val_predictions += compute_accuracy(predictions, labels)

            avg_val_loss = val_loss / len(val_loader)
            avg_val_acc = correct_val_predictions / len(val_loader)
            print(f"Epoch {epoch + 1}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {avg_val_acc:.4f}")
            history['val_loss'].append(avg_val_loss)
            history['val_acc'].append(avg_val_acc)

    # Save Model Checkpoints
    if (epoch + 1) % save_interval == 0 or (epoch + 1) == num_epochs:
        print(f"Saving checkpoint at epoch {epoch + 1}...")
        save_checkpoint(epoch + 1, model, optimizer, avg_train_loss, save_dir)
        print(f"Checkpoint saved for epoch {epoch + 1}.")
        
print("\nTraining completed.")


Using device: cuda
Model moved to device.
Model checkpoints will be saved in: models/Pretraining/model_checkpoints

--- Starting Epoch 1/10 ---


Epoch 1/10:   0%|          | 1/16913 [00:00<4:16:46,  1.10it/s, loss=10.5]

  [Batch 0/16913] Loss: 10.5191


Epoch 1/10:   0%|          | 51/16913 [00:44<4:07:49,  1.13it/s, loss=7.88]

  [Batch 50/16913] Loss: 7.8792


Epoch 1/10:   1%|          | 101/16913 [01:29<4:07:50,  1.13it/s, loss=7.79]

  [Batch 100/16913] Loss: 7.7887


Epoch 1/10:   1%|          | 151/16913 [02:13<4:07:01,  1.13it/s, loss=7.76]

  [Batch 150/16913] Loss: 7.7563


Epoch 1/10:   1%|          | 201/16913 [02:57<4:06:06,  1.13it/s, loss=7.71]

  [Batch 200/16913] Loss: 7.7143


Epoch 1/10:   1%|▏         | 251/16913 [03:41<4:05:34,  1.13it/s, loss=7.84]

  [Batch 250/16913] Loss: 7.8373


Epoch 1/10:   2%|▏         | 301/16913 [04:25<4:04:43,  1.13it/s, loss=7.88]

  [Batch 300/16913] Loss: 7.8757


Epoch 1/10:   2%|▏         | 351/16913 [05:10<4:03:48,  1.13it/s, loss=7.75]

  [Batch 350/16913] Loss: 7.7451


Epoch 1/10:   2%|▏         | 401/16913 [05:54<4:03:22,  1.13it/s, loss=7.73]

  [Batch 400/16913] Loss: 7.7329


Epoch 1/10:   3%|▎         | 451/16913 [06:38<4:02:47,  1.13it/s, loss=7.47]

  [Batch 450/16913] Loss: 7.4669


Epoch 1/10:   3%|▎         | 501/16913 [07:22<4:01:41,  1.13it/s, loss=7.55]

  [Batch 500/16913] Loss: 7.5526


Epoch 1/10:   3%|▎         | 551/16913 [08:06<4:01:13,  1.13it/s, loss=7.78]

  [Batch 550/16913] Loss: 7.7770


Epoch 1/10:   4%|▎         | 601/16913 [08:51<4:00:32,  1.13it/s, loss=7.51]

  [Batch 600/16913] Loss: 7.5147


Epoch 1/10:   4%|▍         | 651/16913 [09:35<3:59:48,  1.13it/s, loss=7.65]

  [Batch 650/16913] Loss: 7.6475


Epoch 1/10:   4%|▍         | 701/16913 [10:19<3:59:00,  1.13it/s, loss=7.59]

  [Batch 700/16913] Loss: 7.5917


Epoch 1/10:   4%|▍         | 751/16913 [11:03<3:58:10,  1.13it/s, loss=7.6] 

  [Batch 750/16913] Loss: 7.6000


Epoch 1/10:   5%|▍         | 801/16913 [11:48<3:57:34,  1.13it/s, loss=7.75]

  [Batch 800/16913] Loss: 7.7520


Epoch 1/10:   5%|▌         | 851/16913 [12:32<3:56:27,  1.13it/s, loss=7.81]

  [Batch 850/16913] Loss: 7.8076


Epoch 1/10:   5%|▌         | 901/16913 [13:16<3:55:48,  1.13it/s, loss=7.73]

  [Batch 900/16913] Loss: 7.7295


Epoch 1/10:   6%|▌         | 951/16913 [14:00<3:55:04,  1.13it/s, loss=7.8] 

  [Batch 950/16913] Loss: 7.7979


Epoch 1/10:   6%|▌         | 1001/16913 [14:44<3:54:34,  1.13it/s, loss=7.64]

  [Batch 1000/16913] Loss: 7.6419


Epoch 1/10:   6%|▌         | 1051/16913 [15:29<3:53:23,  1.13it/s, loss=7.71]

  [Batch 1050/16913] Loss: 7.7135


Epoch 1/10:   7%|▋         | 1101/16913 [16:13<3:53:11,  1.13it/s, loss=7.75]

  [Batch 1100/16913] Loss: 7.7492


Epoch 1/10:   7%|▋         | 1151/16913 [16:57<3:52:18,  1.13it/s, loss=7.53]

  [Batch 1150/16913] Loss: 7.5306


Epoch 1/10:   7%|▋         | 1201/16913 [17:41<3:51:41,  1.13it/s, loss=7.63]

  [Batch 1200/16913] Loss: 7.6318


Epoch 1/10:   7%|▋         | 1251/16913 [18:25<3:50:44,  1.13it/s, loss=7.61]

  [Batch 1250/16913] Loss: 7.6130


Epoch 1/10:   8%|▊         | 1301/16913 [19:10<3:50:10,  1.13it/s, loss=7.6] 

  [Batch 1300/16913] Loss: 7.6047


Epoch 1/10:   8%|▊         | 1351/16913 [19:54<3:49:12,  1.13it/s, loss=7.73]

  [Batch 1350/16913] Loss: 7.7264


Epoch 1/10:   8%|▊         | 1401/16913 [20:38<3:48:43,  1.13it/s, loss=7.7] 

  [Batch 1400/16913] Loss: 7.6972


Epoch 1/10:   9%|▊         | 1451/16913 [21:22<3:47:55,  1.13it/s, loss=7.69]

  [Batch 1450/16913] Loss: 7.6869


Epoch 1/10:   9%|▉         | 1501/16913 [22:06<3:46:50,  1.13it/s, loss=7.82]

  [Batch 1500/16913] Loss: 7.8170


Epoch 1/10:   9%|▉         | 1551/16913 [22:51<3:46:16,  1.13it/s, loss=7.73]

  [Batch 1550/16913] Loss: 7.7255


Epoch 1/10:   9%|▉         | 1601/16913 [23:35<3:45:46,  1.13it/s, loss=7.54]

  [Batch 1600/16913] Loss: 7.5448


Epoch 1/10:  10%|▉         | 1651/16913 [24:19<3:44:56,  1.13it/s, loss=7.74]

  [Batch 1650/16913] Loss: 7.7390


Epoch 1/10:  10%|█         | 1701/16913 [25:03<3:43:54,  1.13it/s, loss=7.6] 

  [Batch 1700/16913] Loss: 7.5998


Epoch 1/10:  10%|█         | 1751/16913 [25:47<3:43:32,  1.13it/s, loss=7.67]

  [Batch 1750/16913] Loss: 7.6661


Epoch 1/10:  11%|█         | 1801/16913 [26:32<3:42:50,  1.13it/s, loss=7.75]

  [Batch 1800/16913] Loss: 7.7478


Epoch 1/10:  11%|█         | 1851/16913 [27:16<3:41:47,  1.13it/s, loss=7.65]

  [Batch 1850/16913] Loss: 7.6471


Epoch 1/10:  11%|█         | 1901/16913 [28:00<3:41:03,  1.13it/s, loss=7.61]

  [Batch 1900/16913] Loss: 7.6147


Epoch 1/10:  12%|█▏        | 1951/16913 [28:44<3:40:24,  1.13it/s, loss=7.83]

  [Batch 1950/16913] Loss: 7.8344


Epoch 1/10:  12%|█▏        | 2001/16913 [29:29<3:39:36,  1.13it/s, loss=7.49]

  [Batch 2000/16913] Loss: 7.4878


Epoch 1/10:  12%|█▏        | 2051/16913 [30:13<3:39:01,  1.13it/s, loss=7.79]

  [Batch 2050/16913] Loss: 7.7938


Epoch 1/10:  12%|█▏        | 2101/16913 [30:57<3:38:14,  1.13it/s, loss=7.82]

  [Batch 2100/16913] Loss: 7.8152


Epoch 1/10:  13%|█▎        | 2151/16913 [31:41<3:37:25,  1.13it/s, loss=7.58]

  [Batch 2150/16913] Loss: 7.5846


Epoch 1/10:  13%|█▎        | 2201/16913 [32:25<3:36:33,  1.13it/s, loss=7.76]

  [Batch 2200/16913] Loss: 7.7592


Epoch 1/10:  13%|█▎        | 2251/16913 [33:09<3:36:00,  1.13it/s, loss=7.5] 

  [Batch 2250/16913] Loss: 7.5044


Epoch 1/10:  14%|█▎        | 2301/16913 [33:54<3:35:05,  1.13it/s, loss=7.54]

  [Batch 2300/16913] Loss: 7.5370


Epoch 1/10:  14%|█▍        | 2351/16913 [34:38<3:34:19,  1.13it/s, loss=7.67]

  [Batch 2350/16913] Loss: 7.6707


Epoch 1/10:  14%|█▍        | 2401/16913 [35:22<3:33:42,  1.13it/s, loss=7.71]

  [Batch 2400/16913] Loss: 7.7118


Epoch 1/10:  14%|█▍        | 2451/16913 [36:06<3:33:12,  1.13it/s, loss=7.78]

  [Batch 2450/16913] Loss: 7.7839


Epoch 1/10:  15%|█▍        | 2501/16913 [36:50<3:32:31,  1.13it/s, loss=7.63]

  [Batch 2500/16913] Loss: 7.6286


Epoch 1/10:  15%|█▌        | 2551/16913 [37:35<3:31:44,  1.13it/s, loss=7.64]

  [Batch 2550/16913] Loss: 7.6407


Epoch 1/10:  15%|█▌        | 2601/16913 [38:19<3:30:45,  1.13it/s, loss=7.78]

  [Batch 2600/16913] Loss: 7.7762


Epoch 1/10:  16%|█▌        | 2651/16913 [39:03<3:30:00,  1.13it/s, loss=7.5] 

  [Batch 2650/16913] Loss: 7.4997


Epoch 1/10:  16%|█▌        | 2701/16913 [39:47<3:29:31,  1.13it/s, loss=7.78]

  [Batch 2700/16913] Loss: 7.7755


Epoch 1/10:  16%|█▋        | 2751/16913 [40:31<3:28:26,  1.13it/s, loss=7.65]

  [Batch 2750/16913] Loss: 7.6462


Epoch 1/10:  17%|█▋        | 2801/16913 [41:16<3:28:00,  1.13it/s, loss=7.64]

  [Batch 2800/16913] Loss: 7.6366


Epoch 1/10:  17%|█▋        | 2851/16913 [42:00<3:27:22,  1.13it/s, loss=7.67]

  [Batch 2850/16913] Loss: 7.6734


Epoch 1/10:  17%|█▋        | 2901/16913 [42:44<3:26:43,  1.13it/s, loss=7.73]

  [Batch 2900/16913] Loss: 7.7296


Epoch 1/10:  17%|█▋        | 2951/16913 [43:28<3:26:01,  1.13it/s, loss=7.75]

  [Batch 2950/16913] Loss: 7.7516


Epoch 1/10:  18%|█▊        | 3001/16913 [44:13<3:25:05,  1.13it/s, loss=7.78]

  [Batch 3000/16913] Loss: 7.7792


Epoch 1/10:  18%|█▊        | 3051/16913 [44:57<3:24:18,  1.13it/s, loss=7.53]

  [Batch 3050/16913] Loss: 7.5283


Epoch 1/10:  18%|█▊        | 3101/16913 [45:41<3:23:40,  1.13it/s, loss=7.6] 

  [Batch 3100/16913] Loss: 7.6007


Epoch 1/10:  19%|█▊        | 3151/16913 [46:25<3:22:58,  1.13it/s, loss=7.63]

  [Batch 3150/16913] Loss: 7.6346


Epoch 1/10:  19%|█▉        | 3201/16913 [47:10<3:22:12,  1.13it/s, loss=7.86]

  [Batch 3200/16913] Loss: 7.8587


Epoch 1/10:  19%|█▉        | 3251/16913 [47:54<3:21:17,  1.13it/s, loss=7.76]

  [Batch 3250/16913] Loss: 7.7623


Epoch 1/10:  20%|█▉        | 3301/16913 [48:38<3:20:43,  1.13it/s, loss=7.6] 

  [Batch 3300/16913] Loss: 7.6021


Epoch 1/10:  20%|█▉        | 3351/16913 [49:22<3:19:40,  1.13it/s, loss=7.76]

  [Batch 3350/16913] Loss: 7.7600


Epoch 1/10:  20%|██        | 3401/16913 [50:06<3:18:56,  1.13it/s, loss=7.78]

  [Batch 3400/16913] Loss: 7.7833


Epoch 1/10:  20%|██        | 3451/16913 [50:51<3:18:20,  1.13it/s, loss=7.68]

  [Batch 3450/16913] Loss: 7.6802


Epoch 1/10:  21%|██        | 3501/16913 [51:35<3:17:43,  1.13it/s, loss=7.49]

  [Batch 3500/16913] Loss: 7.4880


Epoch 1/10:  21%|██        | 3551/16913 [52:19<3:16:53,  1.13it/s, loss=7.58]

  [Batch 3550/16913] Loss: 7.5816


Epoch 1/10:  21%|██▏       | 3601/16913 [53:03<3:16:08,  1.13it/s, loss=7.72]

  [Batch 3600/16913] Loss: 7.7212


Epoch 1/10:  22%|██▏       | 3651/16913 [53:47<3:15:18,  1.13it/s, loss=7.66]

  [Batch 3650/16913] Loss: 7.6647


Epoch 1/10:  22%|██▏       | 3701/16913 [54:32<3:14:47,  1.13it/s, loss=7.63]

  [Batch 3700/16913] Loss: 7.6310


Epoch 1/10:  22%|██▏       | 3751/16913 [55:16<3:14:00,  1.13it/s, loss=7.69]

  [Batch 3750/16913] Loss: 7.6938


Epoch 1/10:  22%|██▏       | 3801/16913 [56:00<3:13:26,  1.13it/s, loss=7.81]

  [Batch 3800/16913] Loss: 7.8101


Epoch 1/10:  23%|██▎       | 3851/16913 [56:44<3:12:20,  1.13it/s, loss=7.73]

  [Batch 3850/16913] Loss: 7.7286


Epoch 1/10:  23%|██▎       | 3901/16913 [57:28<3:11:56,  1.13it/s, loss=7.58]

  [Batch 3900/16913] Loss: 7.5842


Epoch 1/10:  23%|██▎       | 3951/16913 [58:13<3:11:05,  1.13it/s, loss=7.7] 

  [Batch 3950/16913] Loss: 7.6992


Epoch 1/10:  24%|██▎       | 4001/16913 [58:57<3:10:15,  1.13it/s, loss=7.68]

  [Batch 4000/16913] Loss: 7.6843


Epoch 1/10:  24%|██▍       | 4051/16913 [59:41<3:09:19,  1.13it/s, loss=7.59]

  [Batch 4050/16913] Loss: 7.5947


Epoch 1/10:  24%|██▍       | 4101/16913 [1:00:25<3:08:57,  1.13it/s, loss=7.64]

  [Batch 4100/16913] Loss: 7.6418


Epoch 1/10:  25%|██▍       | 4151/16913 [1:01:10<3:08:16,  1.13it/s, loss=7.62]

  [Batch 4150/16913] Loss: 7.6188


Epoch 1/10:  25%|██▍       | 4201/16913 [1:01:54<3:07:23,  1.13it/s, loss=7.71]

  [Batch 4200/16913] Loss: 7.7077


Epoch 1/10:  25%|██▌       | 4251/16913 [1:02:38<3:06:17,  1.13it/s, loss=7.61]

  [Batch 4250/16913] Loss: 7.6122


Epoch 1/10:  25%|██▌       | 4301/16913 [1:03:22<3:05:48,  1.13it/s, loss=7.63]

  [Batch 4300/16913] Loss: 7.6343


Epoch 1/10:  26%|██▌       | 4351/16913 [1:04:06<3:05:08,  1.13it/s, loss=7.87]

  [Batch 4350/16913] Loss: 7.8685


Epoch 1/10:  26%|██▌       | 4401/16913 [1:04:51<3:04:07,  1.13it/s, loss=7.59]

  [Batch 4400/16913] Loss: 7.5864


Epoch 1/10:  26%|██▋       | 4451/16913 [1:05:35<3:03:30,  1.13it/s, loss=7.61]

  [Batch 4450/16913] Loss: 7.6100


Epoch 1/10:  27%|██▋       | 4501/16913 [1:06:19<3:02:54,  1.13it/s, loss=7.58]

  [Batch 4500/16913] Loss: 7.5754


Epoch 1/10:  27%|██▋       | 4551/16913 [1:07:03<3:02:16,  1.13it/s, loss=7.72]

  [Batch 4550/16913] Loss: 7.7238


Epoch 1/10:  27%|██▋       | 4601/16913 [1:07:47<3:01:31,  1.13it/s, loss=7.6] 

  [Batch 4600/16913] Loss: 7.5968


Epoch 1/10:  27%|██▋       | 4651/16913 [1:08:32<3:00:40,  1.13it/s, loss=7.58]

  [Batch 4650/16913] Loss: 7.5818


Epoch 1/10:  28%|██▊       | 4701/16913 [1:09:16<2:59:51,  1.13it/s, loss=7.77]

  [Batch 4700/16913] Loss: 7.7672


Epoch 1/10:  28%|██▊       | 4751/16913 [1:10:00<2:59:20,  1.13it/s, loss=7.76]

  [Batch 4750/16913] Loss: 7.7563


Epoch 1/10:  28%|██▊       | 4801/16913 [1:10:44<2:58:32,  1.13it/s, loss=7.75]

  [Batch 4800/16913] Loss: 7.7482


Epoch 1/10:  29%|██▊       | 4851/16913 [1:11:29<2:57:47,  1.13it/s, loss=7.77]

  [Batch 4850/16913] Loss: 7.7714


Epoch 1/10:  29%|██▉       | 4901/16913 [1:12:13<2:57:01,  1.13it/s, loss=7.71]

  [Batch 4900/16913] Loss: 7.7146


Epoch 1/10:  29%|██▉       | 4951/16913 [1:12:57<2:56:16,  1.13it/s, loss=7.68]

  [Batch 4950/16913] Loss: 7.6780


Epoch 1/10:  30%|██▉       | 5001/16913 [1:13:41<2:55:32,  1.13it/s, loss=7.83]

  [Batch 5000/16913] Loss: 7.8272


Epoch 1/10:  30%|██▉       | 5051/16913 [1:14:25<2:54:54,  1.13it/s, loss=7.82]

  [Batch 5050/16913] Loss: 7.8187


Epoch 1/10:  30%|███       | 5101/16913 [1:15:10<2:54:01,  1.13it/s, loss=7.79]

  [Batch 5100/16913] Loss: 7.7935


Epoch 1/10:  30%|███       | 5151/16913 [1:15:54<2:53:31,  1.13it/s, loss=7.56]

  [Batch 5150/16913] Loss: 7.5604


Epoch 1/10:  31%|███       | 5201/16913 [1:16:38<2:52:42,  1.13it/s, loss=7.58]

  [Batch 5200/16913] Loss: 7.5757


Epoch 1/10:  31%|███       | 5251/16913 [1:17:22<2:51:49,  1.13it/s, loss=7.79]

  [Batch 5250/16913] Loss: 7.7866


Epoch 1/10:  31%|███▏      | 5301/16913 [1:18:06<2:50:52,  1.13it/s, loss=7.73]

  [Batch 5300/16913] Loss: 7.7251


Epoch 1/10:  32%|███▏      | 5351/16913 [1:18:51<2:50:19,  1.13it/s, loss=7.7] 

  [Batch 5350/16913] Loss: 7.7050


Epoch 1/10:  32%|███▏      | 5401/16913 [1:19:35<2:49:41,  1.13it/s, loss=7.74]

  [Batch 5400/16913] Loss: 7.7363


Epoch 1/10:  32%|███▏      | 5451/16913 [1:20:19<2:48:43,  1.13it/s, loss=7.59]

  [Batch 5450/16913] Loss: 7.5885


Epoch 1/10:  33%|███▎      | 5501/16913 [1:21:03<2:48:01,  1.13it/s, loss=7.59]

  [Batch 5500/16913] Loss: 7.5942


Epoch 1/10:  33%|███▎      | 5551/16913 [1:21:47<2:47:20,  1.13it/s, loss=7.77]

  [Batch 5550/16913] Loss: 7.7722


Epoch 1/10:  33%|███▎      | 5601/16913 [1:22:32<2:46:46,  1.13it/s, loss=7.72]

  [Batch 5600/16913] Loss: 7.7187


Epoch 1/10:  33%|███▎      | 5651/16913 [1:23:16<2:45:45,  1.13it/s, loss=7.69]

  [Batch 5650/16913] Loss: 7.6921


Epoch 1/10:  34%|███▎      | 5701/16913 [1:24:00<2:45:10,  1.13it/s, loss=7.64]

  [Batch 5700/16913] Loss: 7.6400


Epoch 1/10:  34%|███▍      | 5751/16913 [1:24:44<2:44:27,  1.13it/s, loss=7.71]

  [Batch 5750/16913] Loss: 7.7077


Epoch 1/10:  34%|███▍      | 5801/16913 [1:25:29<2:43:51,  1.13it/s, loss=7.63]

  [Batch 5800/16913] Loss: 7.6267


Epoch 1/10:  35%|███▍      | 5851/16913 [1:26:13<2:43:01,  1.13it/s, loss=7.8] 

  [Batch 5850/16913] Loss: 7.8036


Epoch 1/10:  35%|███▍      | 5901/16913 [1:26:57<2:42:09,  1.13it/s, loss=7.67]

  [Batch 5900/16913] Loss: 7.6687


Epoch 1/10:  35%|███▌      | 5951/16913 [1:27:41<2:41:34,  1.13it/s, loss=7.67]

  [Batch 5950/16913] Loss: 7.6663


Epoch 1/10:  35%|███▌      | 6001/16913 [1:28:25<2:40:49,  1.13it/s, loss=7.53]

  [Batch 6000/16913] Loss: 7.5312


Epoch 1/10:  36%|███▌      | 6051/16913 [1:29:10<2:39:57,  1.13it/s, loss=7.52]

  [Batch 6050/16913] Loss: 7.5193


Epoch 1/10:  36%|███▌      | 6101/16913 [1:29:54<2:39:27,  1.13it/s, loss=7.71]

  [Batch 6100/16913] Loss: 7.7136


Epoch 1/10:  36%|███▋      | 6151/16913 [1:30:38<2:38:34,  1.13it/s, loss=7.69]

  [Batch 6150/16913] Loss: 7.6853


Epoch 1/10:  37%|███▋      | 6201/16913 [1:31:22<2:37:55,  1.13it/s, loss=7.68]

  [Batch 6200/16913] Loss: 7.6804


Epoch 1/10:  37%|███▋      | 6251/16913 [1:32:06<2:37:08,  1.13it/s, loss=7.7] 

  [Batch 6250/16913] Loss: 7.6973


Epoch 1/10:  37%|███▋      | 6301/16913 [1:32:51<2:36:26,  1.13it/s, loss=7.7] 

  [Batch 6300/16913] Loss: 7.7021


Epoch 1/10:  38%|███▊      | 6351/16913 [1:33:35<2:35:39,  1.13it/s, loss=7.6] 

  [Batch 6350/16913] Loss: 7.5976


Epoch 1/10:  38%|███▊      | 6401/16913 [1:34:19<2:35:03,  1.13it/s, loss=7.55]

  [Batch 6400/16913] Loss: 7.5452


Epoch 1/10:  38%|███▊      | 6451/16913 [1:35:03<2:34:07,  1.13it/s, loss=7.68]

  [Batch 6450/16913] Loss: 7.6782


Epoch 1/10:  38%|███▊      | 6501/16913 [1:35:48<2:33:33,  1.13it/s, loss=7.67]

  [Batch 6500/16913] Loss: 7.6690


Epoch 1/10:  39%|███▊      | 6551/16913 [1:36:32<2:32:47,  1.13it/s, loss=7.64]

  [Batch 6550/16913] Loss: 7.6370


Epoch 1/10:  39%|███▉      | 6601/16913 [1:37:16<2:31:57,  1.13it/s, loss=7.79]

  [Batch 6600/16913] Loss: 7.7940


Epoch 1/10:  39%|███▉      | 6651/16913 [1:38:00<2:31:17,  1.13it/s, loss=7.59]

  [Batch 6650/16913] Loss: 7.5871


Epoch 1/10:  40%|███▉      | 6701/16913 [1:38:44<2:30:21,  1.13it/s, loss=7.62]

  [Batch 6700/16913] Loss: 7.6184


Epoch 1/10:  40%|███▉      | 6751/16913 [1:39:29<2:29:45,  1.13it/s, loss=7.78]

  [Batch 6750/16913] Loss: 7.7780


Epoch 1/10:  40%|████      | 6801/16913 [1:40:13<2:29:05,  1.13it/s, loss=7.74]

  [Batch 6800/16913] Loss: 7.7366


Epoch 1/10:  41%|████      | 6851/16913 [1:40:57<2:28:20,  1.13it/s, loss=7.67]

  [Batch 6850/16913] Loss: 7.6709


Epoch 1/10:  41%|████      | 6901/16913 [1:41:41<2:27:21,  1.13it/s, loss=7.52]

  [Batch 6900/16913] Loss: 7.5216


Epoch 1/10:  41%|████      | 6951/16913 [1:42:25<2:26:35,  1.13it/s, loss=7.71]

  [Batch 6950/16913] Loss: 7.7125


Epoch 1/10:  41%|████▏     | 7001/16913 [1:43:10<2:26:01,  1.13it/s, loss=7.68]

  [Batch 7000/16913] Loss: 7.6766


Epoch 1/10:  42%|████▏     | 7051/16913 [1:43:54<2:25:19,  1.13it/s, loss=7.77]

  [Batch 7050/16913] Loss: 7.7654


Epoch 1/10:  42%|████▏     | 7101/16913 [1:44:38<2:24:27,  1.13it/s, loss=7.65]

  [Batch 7100/16913] Loss: 7.6468


Epoch 1/10:  42%|████▏     | 7151/16913 [1:45:22<2:23:41,  1.13it/s, loss=7.81]

  [Batch 7150/16913] Loss: 7.8062


Epoch 1/10:  43%|████▎     | 7201/16913 [1:46:06<2:22:57,  1.13it/s, loss=7.69]

  [Batch 7200/16913] Loss: 7.6914


Epoch 1/10:  43%|████▎     | 7251/16913 [1:46:51<2:22:23,  1.13it/s, loss=7.6] 

  [Batch 7250/16913] Loss: 7.6021


Epoch 1/10:  43%|████▎     | 7301/16913 [1:47:35<2:21:36,  1.13it/s, loss=7.54]

  [Batch 7300/16913] Loss: 7.5357


Epoch 1/10:  43%|████▎     | 7351/16913 [1:48:19<2:20:57,  1.13it/s, loss=7.61]

  [Batch 7350/16913] Loss: 7.6096


Epoch 1/10:  44%|████▍     | 7401/16913 [1:49:03<2:20:15,  1.13it/s, loss=7.75]

  [Batch 7400/16913] Loss: 7.7534


Epoch 1/10:  44%|████▍     | 7451/16913 [1:49:47<2:19:28,  1.13it/s, loss=7.78]

  [Batch 7450/16913] Loss: 7.7809


Epoch 1/10:  44%|████▍     | 7501/16913 [1:50:32<2:18:30,  1.13it/s, loss=7.78]

  [Batch 7500/16913] Loss: 7.7755


Epoch 1/10:  45%|████▍     | 7551/16913 [1:51:16<2:18:00,  1.13it/s, loss=7.73]

  [Batch 7550/16913] Loss: 7.7280


Epoch 1/10:  45%|████▍     | 7601/16913 [1:52:00<2:17:12,  1.13it/s, loss=7.63]

  [Batch 7600/16913] Loss: 7.6325


Epoch 1/10:  45%|████▌     | 7651/16913 [1:52:44<2:16:38,  1.13it/s, loss=7.67]

  [Batch 7650/16913] Loss: 7.6744


Epoch 1/10:  46%|████▌     | 7701/16913 [1:53:28<2:15:45,  1.13it/s, loss=7.85]

  [Batch 7700/16913] Loss: 7.8536


Epoch 1/10:  46%|████▌     | 7751/16913 [1:54:13<2:15:10,  1.13it/s, loss=7.73]

  [Batch 7750/16913] Loss: 7.7255


Epoch 1/10:  46%|████▌     | 7801/16913 [1:54:57<2:14:14,  1.13it/s, loss=7.47]

  [Batch 7800/16913] Loss: 7.4722


Epoch 1/10:  46%|████▋     | 7851/16913 [1:55:41<2:13:38,  1.13it/s, loss=7.79]

  [Batch 7850/16913] Loss: 7.7938


Epoch 1/10:  47%|████▋     | 7901/16913 [1:56:25<2:12:42,  1.13it/s, loss=7.54]

  [Batch 7900/16913] Loss: 7.5354


Epoch 1/10:  47%|████▋     | 7951/16913 [1:57:09<2:12:00,  1.13it/s, loss=7.64]

  [Batch 7950/16913] Loss: 7.6450


Epoch 1/10:  47%|████▋     | 8001/16913 [1:57:54<2:11:14,  1.13it/s, loss=7.74]

  [Batch 8000/16913] Loss: 7.7416


Epoch 1/10:  48%|████▊     | 8051/16913 [1:58:38<2:10:38,  1.13it/s, loss=7.76]

  [Batch 8050/16913] Loss: 7.7584


Epoch 1/10:  48%|████▊     | 8101/16913 [1:59:22<2:09:43,  1.13it/s, loss=7.89]

  [Batch 8100/16913] Loss: 7.8937


Epoch 1/10:  48%|████▊     | 8151/16913 [2:00:06<2:09:01,  1.13it/s, loss=7.69]

  [Batch 8150/16913] Loss: 7.6869


Epoch 1/10:  48%|████▊     | 8201/16913 [2:00:50<2:08:28,  1.13it/s, loss=7.83]

  [Batch 8200/16913] Loss: 7.8275


Epoch 1/10:  49%|████▉     | 8251/16913 [2:01:35<2:07:41,  1.13it/s, loss=7.64]

  [Batch 8250/16913] Loss: 7.6444


Epoch 1/10:  49%|████▉     | 8301/16913 [2:02:19<2:07:01,  1.13it/s, loss=7.75]

  [Batch 8300/16913] Loss: 7.7520


Epoch 1/10:  49%|████▉     | 8351/16913 [2:03:03<2:06:07,  1.13it/s, loss=7.7] 

  [Batch 8350/16913] Loss: 7.6972


Epoch 1/10:  50%|████▉     | 8401/16913 [2:03:47<2:05:34,  1.13it/s, loss=7.5] 

  [Batch 8400/16913] Loss: 7.5040


Epoch 1/10:  50%|████▉     | 8451/16913 [2:04:32<2:04:44,  1.13it/s, loss=7.7] 

  [Batch 8450/16913] Loss: 7.7039


Epoch 1/10:  50%|█████     | 8501/16913 [2:05:16<2:03:59,  1.13it/s, loss=7.88]

  [Batch 8500/16913] Loss: 7.8833


Epoch 1/10:  51%|█████     | 8551/16913 [2:06:00<2:03:08,  1.13it/s, loss=7.72]

  [Batch 8550/16913] Loss: 7.7159


Epoch 1/10:  51%|█████     | 8601/16913 [2:06:44<2:02:25,  1.13it/s, loss=7.69]

  [Batch 8600/16913] Loss: 7.6914


Epoch 1/10:  51%|█████     | 8651/16913 [2:07:28<2:01:41,  1.13it/s, loss=7.91]

  [Batch 8650/16913] Loss: 7.9091


Epoch 1/10:  51%|█████▏    | 8701/16913 [2:08:13<2:01:06,  1.13it/s, loss=7.54]

  [Batch 8700/16913] Loss: 7.5447


Epoch 1/10:  52%|█████▏    | 8751/16913 [2:08:57<2:00:09,  1.13it/s, loss=7.57]

  [Batch 8750/16913] Loss: 7.5674


Epoch 1/10:  52%|█████▏    | 8801/16913 [2:09:41<1:59:27,  1.13it/s, loss=7.73]

  [Batch 8800/16913] Loss: 7.7335


Epoch 1/10:  52%|█████▏    | 8851/16913 [2:10:25<1:58:42,  1.13it/s, loss=7.67]

  [Batch 8850/16913] Loss: 7.6653


Epoch 1/10:  53%|█████▎    | 8901/16913 [2:11:09<1:58:06,  1.13it/s, loss=7.68]

  [Batch 8900/16913] Loss: 7.6839


Epoch 1/10:  53%|█████▎    | 8951/16913 [2:11:54<1:57:16,  1.13it/s, loss=7.67]

  [Batch 8950/16913] Loss: 7.6675


Epoch 1/10:  53%|█████▎    | 9001/16913 [2:12:38<1:56:35,  1.13it/s, loss=7.64]

  [Batch 9000/16913] Loss: 7.6353


Epoch 1/10:  54%|█████▎    | 9051/16913 [2:13:22<1:55:50,  1.13it/s, loss=7.61]

  [Batch 9050/16913] Loss: 7.6130


Epoch 1/10:  54%|█████▍    | 9101/16913 [2:14:06<1:55:02,  1.13it/s, loss=7.76]

  [Batch 9100/16913] Loss: 7.7607


Epoch 1/10:  54%|█████▍    | 9151/16913 [2:14:51<1:54:19,  1.13it/s, loss=7.72]

  [Batch 9150/16913] Loss: 7.7215


Epoch 1/10:  54%|█████▍    | 9201/16913 [2:15:35<1:53:42,  1.13it/s, loss=7.84]

  [Batch 9200/16913] Loss: 7.8353


Epoch 1/10:  55%|█████▍    | 9251/16913 [2:16:19<1:52:44,  1.13it/s, loss=7.73]

  [Batch 9250/16913] Loss: 7.7334


Epoch 1/10:  55%|█████▍    | 9301/16913 [2:17:03<1:52:16,  1.13it/s, loss=7.74]

  [Batch 9300/16913] Loss: 7.7442


Epoch 1/10:  55%|█████▌    | 9351/16913 [2:17:47<1:51:25,  1.13it/s, loss=7.82]

  [Batch 9350/16913] Loss: 7.8182


Epoch 1/10:  56%|█████▌    | 9401/16913 [2:18:32<1:50:38,  1.13it/s, loss=7.7] 

  [Batch 9400/16913] Loss: 7.7038


Epoch 1/10:  56%|█████▌    | 9451/16913 [2:19:16<1:49:52,  1.13it/s, loss=7.84]

  [Batch 9450/16913] Loss: 7.8403


Epoch 1/10:  56%|█████▌    | 9501/16913 [2:20:00<1:49:04,  1.13it/s, loss=7.94]

  [Batch 9500/16913] Loss: 7.9385


Epoch 1/10:  56%|█████▋    | 9551/16913 [2:20:44<1:48:29,  1.13it/s, loss=7.73]

  [Batch 9550/16913] Loss: 7.7281


Epoch 1/10:  57%|█████▋    | 9601/16913 [2:21:28<1:47:48,  1.13it/s, loss=7.6] 

  [Batch 9600/16913] Loss: 7.6038


Epoch 1/10:  57%|█████▋    | 9651/16913 [2:22:13<1:47:00,  1.13it/s, loss=7.67]

  [Batch 9650/16913] Loss: 7.6681


Epoch 1/10:  57%|█████▋    | 9701/16913 [2:22:57<1:46:18,  1.13it/s, loss=7.77]

  [Batch 9700/16913] Loss: 7.7701


Epoch 1/10:  58%|█████▊    | 9751/16913 [2:23:41<1:45:33,  1.13it/s, loss=7.72]

  [Batch 9750/16913] Loss: 7.7214


Epoch 1/10:  58%|█████▊    | 9801/16913 [2:24:25<1:44:54,  1.13it/s, loss=7.71]

  [Batch 9800/16913] Loss: 7.7053


Epoch 1/10:  58%|█████▊    | 9851/16913 [2:25:09<1:44:05,  1.13it/s, loss=7.54]

  [Batch 9850/16913] Loss: 7.5385


Epoch 1/10:  59%|█████▊    | 9901/16913 [2:25:54<1:43:14,  1.13it/s, loss=7.82]

  [Batch 9900/16913] Loss: 7.8178


Epoch 1/10:  59%|█████▉    | 9951/16913 [2:26:38<1:42:36,  1.13it/s, loss=7.56]

  [Batch 9950/16913] Loss: 7.5587


Epoch 1/10:  59%|█████▉    | 10001/16913 [2:27:22<1:41:55,  1.13it/s, loss=7.82]

  [Batch 10000/16913] Loss: 7.8178


Epoch 1/10:  59%|█████▉    | 10051/16913 [2:28:06<1:41:13,  1.13it/s, loss=7.62]

  [Batch 10050/16913] Loss: 7.6205


Epoch 1/10:  60%|█████▉    | 10101/16913 [2:28:51<1:40:29,  1.13it/s, loss=7.66]

  [Batch 10100/16913] Loss: 7.6632


Epoch 1/10:  60%|██████    | 10151/16913 [2:29:35<1:39:32,  1.13it/s, loss=7.73]

  [Batch 10150/16913] Loss: 7.7331


Epoch 1/10:  60%|██████    | 10201/16913 [2:30:19<1:38:49,  1.13it/s, loss=7.71]

  [Batch 10200/16913] Loss: 7.7149


Epoch 1/10:  61%|██████    | 10251/16913 [2:31:03<1:38:13,  1.13it/s, loss=7.59]

  [Batch 10250/16913] Loss: 7.5938


Epoch 1/10:  61%|██████    | 10301/16913 [2:31:47<1:37:29,  1.13it/s, loss=7.8] 

  [Batch 10300/16913] Loss: 7.8043


Epoch 1/10:  61%|██████    | 10351/16913 [2:32:32<1:36:39,  1.13it/s, loss=7.71]

  [Batch 10350/16913] Loss: 7.7145


Epoch 1/10:  61%|██████▏   | 10401/16913 [2:33:16<1:35:52,  1.13it/s, loss=7.56]

  [Batch 10400/16913] Loss: 7.5573


Epoch 1/10:  62%|██████▏   | 10451/16913 [2:34:00<1:35:17,  1.13it/s, loss=7.8] 

  [Batch 10450/16913] Loss: 7.7978


Epoch 1/10:  62%|██████▏   | 10501/16913 [2:34:44<1:34:34,  1.13it/s, loss=7.56]

  [Batch 10500/16913] Loss: 7.5630


Epoch 1/10:  62%|██████▏   | 10551/16913 [2:35:28<1:33:49,  1.13it/s, loss=7.63]

  [Batch 10550/16913] Loss: 7.6339


Epoch 1/10:  63%|██████▎   | 10601/16913 [2:36:13<1:32:52,  1.13it/s, loss=7.61]

  [Batch 10600/16913] Loss: 7.6105


Epoch 1/10:  63%|██████▎   | 10651/16913 [2:36:57<1:32:09,  1.13it/s, loss=7.65]

  [Batch 10650/16913] Loss: 7.6456


Epoch 1/10:  63%|██████▎   | 10701/16913 [2:37:41<1:31:32,  1.13it/s, loss=7.88]

  [Batch 10700/16913] Loss: 7.8830


Epoch 1/10:  64%|██████▎   | 10751/16913 [2:38:25<1:30:44,  1.13it/s, loss=7.72]

  [Batch 10750/16913] Loss: 7.7199


Epoch 1/10:  64%|██████▍   | 10801/16913 [2:39:09<1:30:00,  1.13it/s, loss=7.65]

  [Batch 10800/16913] Loss: 7.6485


Epoch 1/10:  64%|██████▍   | 10851/16913 [2:39:54<1:29:15,  1.13it/s, loss=7.47]

  [Batch 10850/16913] Loss: 7.4700


Epoch 1/10:  64%|██████▍   | 10901/16913 [2:40:38<1:28:37,  1.13it/s, loss=7.83]

  [Batch 10900/16913] Loss: 7.8324


Epoch 1/10:  65%|██████▍   | 10951/16913 [2:41:22<1:27:52,  1.13it/s, loss=7.78]

  [Batch 10950/16913] Loss: 7.7828


Epoch 1/10:  65%|██████▌   | 11001/16913 [2:42:06<1:27:08,  1.13it/s, loss=7.65]

  [Batch 11000/16913] Loss: 7.6489


Epoch 1/10:  65%|██████▌   | 11051/16913 [2:42:50<1:26:23,  1.13it/s, loss=7.77]

  [Batch 11050/16913] Loss: 7.7734


Epoch 1/10:  66%|██████▌   | 11101/16913 [2:43:35<1:25:35,  1.13it/s, loss=7.67]

  [Batch 11100/16913] Loss: 7.6705


Epoch 1/10:  66%|██████▌   | 11151/16913 [2:44:19<1:24:50,  1.13it/s, loss=7.53]

  [Batch 11150/16913] Loss: 7.5299


Epoch 1/10:  66%|██████▌   | 11201/16913 [2:45:03<1:24:13,  1.13it/s, loss=7.61]

  [Batch 11200/16913] Loss: 7.6088


Epoch 1/10:  67%|██████▋   | 11251/16913 [2:45:47<1:23:27,  1.13it/s, loss=7.6] 

  [Batch 11250/16913] Loss: 7.6009


Epoch 1/10:  67%|██████▋   | 11301/16913 [2:46:31<1:22:46,  1.13it/s, loss=7.53]

  [Batch 11300/16913] Loss: 7.5330


Epoch 1/10:  67%|██████▋   | 11351/16913 [2:47:16<1:21:59,  1.13it/s, loss=7.59]

  [Batch 11350/16913] Loss: 7.5873


Epoch 1/10:  67%|██████▋   | 11401/16913 [2:48:00<1:21:13,  1.13it/s, loss=7.82]

  [Batch 11400/16913] Loss: 7.8225


Epoch 1/10:  68%|██████▊   | 11451/16913 [2:48:44<1:20:35,  1.13it/s, loss=7.74]

  [Batch 11450/16913] Loss: 7.7371


Epoch 1/10:  68%|██████▊   | 11501/16913 [2:49:28<1:19:48,  1.13it/s, loss=7.52]

  [Batch 11500/16913] Loss: 7.5218


Epoch 1/10:  68%|██████▊   | 11551/16913 [2:50:13<1:19:04,  1.13it/s, loss=7.61]

  [Batch 11550/16913] Loss: 7.6085


Epoch 1/10:  69%|██████▊   | 11601/16913 [2:50:57<1:18:20,  1.13it/s, loss=7.56]

  [Batch 11600/16913] Loss: 7.5594


Epoch 1/10:  69%|██████▉   | 11651/16913 [2:51:41<1:17:38,  1.13it/s, loss=7.76]

  [Batch 11650/16913] Loss: 7.7588


Epoch 1/10:  69%|██████▉   | 11701/16913 [2:52:25<1:16:51,  1.13it/s, loss=7.67]

  [Batch 11700/16913] Loss: 7.6735


Epoch 1/10:  69%|██████▉   | 11751/16913 [2:53:10<1:16:08,  1.13it/s, loss=7.8] 

  [Batch 11750/16913] Loss: 7.8000


Epoch 1/10:  70%|██████▉   | 11801/16913 [2:53:54<1:15:20,  1.13it/s, loss=7.67]

  [Batch 11800/16913] Loss: 7.6701


Epoch 1/10:  70%|███████   | 11851/16913 [2:54:38<1:14:37,  1.13it/s, loss=7.52]

  [Batch 11850/16913] Loss: 7.5182


Epoch 1/10:  70%|███████   | 11901/16913 [2:55:22<1:13:45,  1.13it/s, loss=7.82]

  [Batch 11900/16913] Loss: 7.8170


Epoch 1/10:  71%|███████   | 11951/16913 [2:56:06<1:13:02,  1.13it/s, loss=7.61]

  [Batch 11950/16913] Loss: 7.6054


Epoch 1/10:  71%|███████   | 12001/16913 [2:56:51<1:12:18,  1.13it/s, loss=7.68]

  [Batch 12000/16913] Loss: 7.6771


Epoch 1/10:  71%|███████▏  | 12051/16913 [2:57:35<1:11:35,  1.13it/s, loss=7.71]

  [Batch 12050/16913] Loss: 7.7142


Epoch 1/10:  72%|███████▏  | 12101/16913 [2:58:19<1:10:51,  1.13it/s, loss=7.56]

  [Batch 12100/16913] Loss: 7.5554


Epoch 1/10:  72%|███████▏  | 12151/16913 [2:59:03<1:10:09,  1.13it/s, loss=7.68]

  [Batch 12150/16913] Loss: 7.6847


Epoch 1/10:  72%|███████▏  | 12201/16913 [2:59:47<1:09:22,  1.13it/s, loss=7.61]

  [Batch 12200/16913] Loss: 7.6100


Epoch 1/10:  72%|███████▏  | 12251/16913 [3:00:31<1:08:37,  1.13it/s, loss=7.63]

  [Batch 12250/16913] Loss: 7.6342


Epoch 1/10:  73%|███████▎  | 12301/16913 [3:01:16<1:07:56,  1.13it/s, loss=7.72]

  [Batch 12300/16913] Loss: 7.7152


Epoch 1/10:  73%|███████▎  | 12351/16913 [3:02:00<1:07:07,  1.13it/s, loss=7.46]

  [Batch 12350/16913] Loss: 7.4580


Epoch 1/10:  73%|███████▎  | 12401/16913 [3:02:44<1:06:26,  1.13it/s, loss=7.75]

  [Batch 12400/16913] Loss: 7.7495


Epoch 1/10:  74%|███████▎  | 12451/16913 [3:03:28<1:05:48,  1.13it/s, loss=7.8] 

  [Batch 12450/16913] Loss: 7.8049


Epoch 1/10:  74%|███████▍  | 12501/16913 [3:04:12<1:05:03,  1.13it/s, loss=7.72]

  [Batch 12500/16913] Loss: 7.7214


Epoch 1/10:  74%|███████▍  | 12551/16913 [3:04:57<1:04:16,  1.13it/s, loss=7.67]

  [Batch 12550/16913] Loss: 7.6698


Epoch 1/10:  75%|███████▍  | 12601/16913 [3:05:41<1:03:27,  1.13it/s, loss=7.81]

  [Batch 12600/16913] Loss: 7.8096


Epoch 1/10:  75%|███████▍  | 12651/16913 [3:06:25<1:02:45,  1.13it/s, loss=7.77]

  [Batch 12650/16913] Loss: 7.7693


Epoch 1/10:  75%|███████▌  | 12701/16913 [3:07:09<1:02:05,  1.13it/s, loss=7.58]

  [Batch 12700/16913] Loss: 7.5789


Epoch 1/10:  75%|███████▌  | 12751/16913 [3:07:53<1:01:19,  1.13it/s, loss=7.57]

  [Batch 12750/16913] Loss: 7.5709


Epoch 1/10:  76%|███████▌  | 12801/16913 [3:08:38<1:00:38,  1.13it/s, loss=7.57]

  [Batch 12800/16913] Loss: 7.5650


Epoch 1/10:  76%|███████▌  | 12851/16913 [3:09:22<59:48,  1.13it/s, loss=7.82]  

  [Batch 12850/16913] Loss: 7.8169


Epoch 1/10:  76%|███████▋  | 12901/16913 [3:10:06<59:04,  1.13it/s, loss=7.6] 

  [Batch 12900/16913] Loss: 7.6040


Epoch 1/10:  77%|███████▋  | 12951/16913 [3:10:50<58:24,  1.13it/s, loss=7.69]

  [Batch 12950/16913] Loss: 7.6940


Epoch 1/10:  77%|███████▋  | 13001/16913 [3:11:34<57:39,  1.13it/s, loss=7.6] 

  [Batch 13000/16913] Loss: 7.6043


Epoch 1/10:  77%|███████▋  | 13051/16913 [3:12:19<56:55,  1.13it/s, loss=7.42]

  [Batch 13050/16913] Loss: 7.4203


Epoch 1/10:  77%|███████▋  | 13101/16913 [3:13:03<56:11,  1.13it/s, loss=7.81]

  [Batch 13100/16913] Loss: 7.8086


Epoch 1/10:  78%|███████▊  | 13151/16913 [3:13:47<55:27,  1.13it/s, loss=7.7] 

  [Batch 13150/16913] Loss: 7.7030


Epoch 1/10:  78%|███████▊  | 13201/16913 [3:14:31<54:43,  1.13it/s, loss=7.73]

  [Batch 13200/16913] Loss: 7.7307


Epoch 1/10:  78%|███████▊  | 13251/16913 [3:15:15<53:54,  1.13it/s, loss=7.7] 

  [Batch 13250/16913] Loss: 7.7037


Epoch 1/10:  79%|███████▊  | 13301/16913 [3:16:00<53:15,  1.13it/s, loss=7.75]

  [Batch 13300/16913] Loss: 7.7463


Epoch 1/10:  79%|███████▉  | 13351/16913 [3:16:44<52:32,  1.13it/s, loss=7.7] 

  [Batch 13350/16913] Loss: 7.7041


Epoch 1/10:  79%|███████▉  | 13401/16913 [3:17:28<51:45,  1.13it/s, loss=7.69]

  [Batch 13400/16913] Loss: 7.6914


Epoch 1/10:  80%|███████▉  | 13451/16913 [3:18:12<51:04,  1.13it/s, loss=7.79]

  [Batch 13450/16913] Loss: 7.7949


Epoch 1/10:  80%|███████▉  | 13501/16913 [3:18:57<50:19,  1.13it/s, loss=7.69]

  [Batch 13500/16913] Loss: 7.6936


Epoch 1/10:  80%|████████  | 13551/16913 [3:19:41<49:36,  1.13it/s, loss=7.69]

  [Batch 13550/16913] Loss: 7.6868


Epoch 1/10:  80%|████████  | 13601/16913 [3:20:25<48:48,  1.13it/s, loss=7.7] 

  [Batch 13600/16913] Loss: 7.6959


Epoch 1/10:  81%|████████  | 13651/16913 [3:21:09<48:09,  1.13it/s, loss=7.77]

  [Batch 13650/16913] Loss: 7.7653


Epoch 1/10:  81%|████████  | 13701/16913 [3:21:53<47:22,  1.13it/s, loss=7.56]

  [Batch 13700/16913] Loss: 7.5588


Epoch 1/10:  81%|████████▏ | 13751/16913 [3:22:38<46:38,  1.13it/s, loss=7.74]

  [Batch 13750/16913] Loss: 7.7411


Epoch 1/10:  82%|████████▏ | 13801/16913 [3:23:22<45:49,  1.13it/s, loss=7.55]

  [Batch 13800/16913] Loss: 7.5499


Epoch 1/10:  82%|████████▏ | 13851/16913 [3:24:06<45:08,  1.13it/s, loss=7.61]

  [Batch 13850/16913] Loss: 7.6137


Epoch 1/10:  82%|████████▏ | 13901/16913 [3:24:50<44:24,  1.13it/s, loss=7.7] 

  [Batch 13900/16913] Loss: 7.7039


Epoch 1/10:  82%|████████▏ | 13951/16913 [3:25:35<43:39,  1.13it/s, loss=7.8] 

  [Batch 13950/16913] Loss: 7.8049


Epoch 1/10:  83%|████████▎ | 14001/16913 [3:26:19<42:52,  1.13it/s, loss=7.77]

  [Batch 14000/16913] Loss: 7.7673


Epoch 1/10:  83%|████████▎ | 14051/16913 [3:27:03<42:13,  1.13it/s, loss=7.74]

  [Batch 14050/16913] Loss: 7.7447


Epoch 1/10:  83%|████████▎ | 14101/16913 [3:27:47<41:30,  1.13it/s, loss=7.73]

  [Batch 14100/16913] Loss: 7.7317


Epoch 1/10:  84%|████████▎ | 14151/16913 [3:28:32<40:43,  1.13it/s, loss=7.75]

  [Batch 14150/16913] Loss: 7.7474


Epoch 1/10:  84%|████████▍ | 14201/16913 [3:29:16<40:00,  1.13it/s, loss=7.61]

  [Batch 14200/16913] Loss: 7.6118


Epoch 1/10:  84%|████████▍ | 14251/16913 [3:30:00<39:11,  1.13it/s, loss=7.73]

  [Batch 14250/16913] Loss: 7.7348


Epoch 1/10:  85%|████████▍ | 14301/16913 [3:30:44<38:31,  1.13it/s, loss=7.72]

  [Batch 14300/16913] Loss: 7.7201


Epoch 1/10:  85%|████████▍ | 14351/16913 [3:31:28<37:47,  1.13it/s, loss=7.79]

  [Batch 14350/16913] Loss: 7.7915


Epoch 1/10:  85%|████████▌ | 14401/16913 [3:32:13<37:02,  1.13it/s, loss=7.65]

  [Batch 14400/16913] Loss: 7.6516


Epoch 1/10:  85%|████████▌ | 14451/16913 [3:32:57<36:15,  1.13it/s, loss=7.53]

  [Batch 14450/16913] Loss: 7.5282


Epoch 1/10:  86%|████████▌ | 14501/16913 [3:33:41<35:31,  1.13it/s, loss=7.87]

  [Batch 14500/16913] Loss: 7.8724


Epoch 1/10:  86%|████████▌ | 14551/16913 [3:34:25<34:50,  1.13it/s, loss=7.82]

  [Batch 14550/16913] Loss: 7.8246


Epoch 1/10:  86%|████████▋ | 14601/16913 [3:35:10<34:05,  1.13it/s, loss=7.69]

  [Batch 14600/16913] Loss: 7.6853


Epoch 1/10:  87%|████████▋ | 14651/16913 [3:35:54<33:21,  1.13it/s, loss=7.63]

  [Batch 14650/16913] Loss: 7.6305


Epoch 1/10:  87%|████████▋ | 14701/16913 [3:36:38<32:37,  1.13it/s, loss=7.69]

  [Batch 14700/16913] Loss: 7.6885


Epoch 1/10:  87%|████████▋ | 14751/16913 [3:37:22<31:53,  1.13it/s, loss=7.57]

  [Batch 14750/16913] Loss: 7.5703


Epoch 1/10:  88%|████████▊ | 14801/16913 [3:38:07<31:10,  1.13it/s, loss=7.5] 

  [Batch 14800/16913] Loss: 7.4969


Epoch 1/10:  88%|████████▊ | 14851/16913 [3:38:51<30:22,  1.13it/s, loss=7.64]

  [Batch 14850/16913] Loss: 7.6388


Epoch 1/10:  88%|████████▊ | 14901/16913 [3:39:35<29:37,  1.13it/s, loss=7.78]

  [Batch 14900/16913] Loss: 7.7754


Epoch 1/10:  88%|████████▊ | 14951/16913 [3:40:19<28:56,  1.13it/s, loss=7.63]

  [Batch 14950/16913] Loss: 7.6274


Epoch 1/10:  89%|████████▊ | 15001/16913 [3:41:03<28:09,  1.13it/s, loss=7.58]

  [Batch 15000/16913] Loss: 7.5805


Epoch 2/10:   5%|▍         | 801/16913 [11:48<3:57:36,  1.13it/s, loss=7.81]4]

  [Batch 800/16913] Loss: 7.8083


Epoch 2/10:   5%|▌         | 851/16913 [12:32<3:56:55,  1.13it/s, loss=7.68]

  [Batch 850/16913] Loss: 7.6819


Epoch 2/10:   5%|▌         | 901/16913 [13:16<3:55:54,  1.13it/s, loss=7.86]

  [Batch 900/16913] Loss: 7.8632


Epoch 2/10:   6%|▌         | 951/16913 [14:01<3:55:00,  1.13it/s, loss=7.56]

  [Batch 950/16913] Loss: 7.5624


Epoch 2/10:   6%|▌         | 1001/16913 [14:45<3:54:46,  1.13it/s, loss=7.61]

  [Batch 1000/16913] Loss: 7.6138


Epoch 2/10:   6%|▌         | 1051/16913 [15:29<3:53:52,  1.13it/s, loss=7.64]

  [Batch 1050/16913] Loss: 7.6411


Epoch 2/10:   7%|▋         | 1101/16913 [16:13<3:53:15,  1.13it/s, loss=7.54]

  [Batch 1100/16913] Loss: 7.5429


Epoch 2/10:   7%|▋         | 1151/16913 [16:58<3:52:26,  1.13it/s, loss=7.7] 

  [Batch 1150/16913] Loss: 7.6951


Epoch 2/10:   7%|▋         | 1201/16913 [17:42<3:51:49,  1.13it/s, loss=7.86]

  [Batch 1200/16913] Loss: 7.8589


Epoch 2/10:   7%|▋         | 1251/16913 [18:26<3:50:57,  1.13it/s, loss=7.61]

  [Batch 1250/16913] Loss: 7.6102


Epoch 2/10:   8%|▊         | 1301/16913 [19:10<3:50:21,  1.13it/s, loss=7.68]

  [Batch 1300/16913] Loss: 7.6755


Epoch 2/10:   8%|▊         | 1351/16913 [19:55<3:49:33,  1.13it/s, loss=7.6] 

  [Batch 1350/16913] Loss: 7.6027


Epoch 2/10:   8%|▊         | 1401/16913 [20:39<3:48:28,  1.13it/s, loss=7.65]

  [Batch 1400/16913] Loss: 7.6545


Epoch 2/10:   9%|▊         | 1451/16913 [21:23<3:47:58,  1.13it/s, loss=7.81]

  [Batch 1450/16913] Loss: 7.8066


Epoch 2/10:   9%|▉         | 1501/16913 [22:07<3:46:56,  1.13it/s, loss=7.64]

  [Batch 1500/16913] Loss: 7.6437


Epoch 2/10:   9%|▉         | 1551/16913 [22:51<3:46:31,  1.13it/s, loss=7.66]

  [Batch 1550/16913] Loss: 7.6614


Epoch 2/10:   9%|▉         | 1601/16913 [23:36<3:45:34,  1.13it/s, loss=7.62]

  [Batch 1600/16913] Loss: 7.6157


Epoch 2/10:  10%|▉         | 1651/16913 [24:20<3:44:49,  1.13it/s, loss=7.9] 

  [Batch 1650/16913] Loss: 7.9033


Epoch 2/10:  10%|█         | 1701/16913 [25:04<3:44:26,  1.13it/s, loss=7.7] 

  [Batch 1700/16913] Loss: 7.6961


Epoch 2/10:  10%|█         | 1751/16913 [25:48<3:43:32,  1.13it/s, loss=7.86]

  [Batch 1750/16913] Loss: 7.8603


Epoch 2/10:  11%|█         | 1801/16913 [26:32<3:42:53,  1.13it/s, loss=7.74]

  [Batch 1800/16913] Loss: 7.7427


Epoch 2/10:  11%|█         | 1851/16913 [27:17<3:42:02,  1.13it/s, loss=7.62]

  [Batch 1850/16913] Loss: 7.6218


Epoch 2/10:  11%|█         | 1901/16913 [28:01<3:41:20,  1.13it/s, loss=7.62]

  [Batch 1900/16913] Loss: 7.6236


Epoch 2/10:  12%|█▏        | 1951/16913 [28:45<3:40:16,  1.13it/s, loss=7.59]

  [Batch 1950/16913] Loss: 7.5915


Epoch 2/10:  12%|█▏        | 2001/16913 [29:29<3:39:54,  1.13it/s, loss=7.95]

  [Batch 2000/16913] Loss: 7.9465


Epoch 2/10:  12%|█▏        | 2051/16913 [30:14<3:39:11,  1.13it/s, loss=7.75]

  [Batch 2050/16913] Loss: 7.7535


Epoch 2/10:  12%|█▏        | 2101/16913 [30:58<3:38:21,  1.13it/s, loss=7.69]

  [Batch 2100/16913] Loss: 7.6864


Epoch 2/10:  13%|█▎        | 2151/16913 [31:42<3:37:39,  1.13it/s, loss=7.72]

  [Batch 2150/16913] Loss: 7.7151


Epoch 2/10:  13%|█▎        | 2201/16913 [32:26<3:36:56,  1.13it/s, loss=7.66]

  [Batch 2200/16913] Loss: 7.6606


Epoch 2/10:  13%|█▎        | 2251/16913 [33:11<3:36:20,  1.13it/s, loss=7.67]

  [Batch 2250/16913] Loss: 7.6728


Epoch 2/10:  14%|█▎        | 2301/16913 [33:55<3:35:29,  1.13it/s, loss=7.58]

  [Batch 2300/16913] Loss: 7.5799


Epoch 2/10:  14%|█▍        | 2351/16913 [34:39<3:34:36,  1.13it/s, loss=7.89]

  [Batch 2350/16913] Loss: 7.8864


Epoch 2/10:  14%|█▍        | 2401/16913 [35:23<3:33:52,  1.13it/s, loss=7.77]

  [Batch 2400/16913] Loss: 7.7744


Epoch 2/10:  14%|█▍        | 2451/16913 [36:07<3:33:12,  1.13it/s, loss=7.78]

  [Batch 2450/16913] Loss: 7.7825


Epoch 2/10:  15%|█▍        | 2501/16913 [36:52<3:32:35,  1.13it/s, loss=7.66]

  [Batch 2500/16913] Loss: 7.6630


Epoch 2/10:  15%|█▌        | 2551/16913 [37:36<3:31:57,  1.13it/s, loss=7.82]

  [Batch 2550/16913] Loss: 7.8246


Epoch 2/10:  15%|█▌        | 2601/16913 [38:20<3:31:02,  1.13it/s, loss=7.82]

  [Batch 2600/16913] Loss: 7.8217


Epoch 2/10:  16%|█▌        | 2651/16913 [39:04<3:30:25,  1.13it/s, loss=7.67]

  [Batch 2650/16913] Loss: 7.6661


Epoch 2/10:  16%|█▌        | 2701/16913 [39:49<3:29:43,  1.13it/s, loss=7.81]

  [Batch 2700/16913] Loss: 7.8076


Epoch 2/10:  16%|█▋        | 2751/16913 [40:33<3:28:36,  1.13it/s, loss=7.55]

  [Batch 2750/16913] Loss: 7.5530


Epoch 2/10:  17%|█▋        | 2801/16913 [41:17<3:28:00,  1.13it/s, loss=7.58]

  [Batch 2800/16913] Loss: 7.5838


Epoch 2/10:  17%|█▋        | 2851/16913 [42:01<3:27:25,  1.13it/s, loss=7.67]

  [Batch 2850/16913] Loss: 7.6687


Epoch 2/10:  17%|█▋        | 2901/16913 [42:46<3:26:43,  1.13it/s, loss=7.65]

  [Batch 2900/16913] Loss: 7.6523


Epoch 2/10:  17%|█▋        | 2951/16913 [43:30<3:25:57,  1.13it/s, loss=7.57]

  [Batch 2950/16913] Loss: 7.5703


Epoch 2/10:  18%|█▊        | 3001/16913 [44:14<3:25:19,  1.13it/s, loss=7.87]

  [Batch 3000/16913] Loss: 7.8721


Epoch 2/10:  18%|█▊        | 3051/16913 [44:58<3:24:23,  1.13it/s, loss=7.78]

  [Batch 3050/16913] Loss: 7.7791


Epoch 2/10:  18%|█▊        | 3101/16913 [45:43<3:23:27,  1.13it/s, loss=7.67]

  [Batch 3100/16913] Loss: 7.6691


Epoch 2/10:  19%|█▊        | 3151/16913 [46:27<3:22:54,  1.13it/s, loss=7.67]

  [Batch 3150/16913] Loss: 7.6671


Epoch 2/10:  19%|█▉        | 3201/16913 [47:11<3:21:55,  1.13it/s, loss=7.68]

  [Batch 3200/16913] Loss: 7.6829


Epoch 2/10:  19%|█▉        | 3251/16913 [47:55<3:21:21,  1.13it/s, loss=7.77]

  [Batch 3250/16913] Loss: 7.7663


Epoch 2/10:  20%|█▉        | 3301/16913 [48:39<3:20:50,  1.13it/s, loss=7.63]

  [Batch 3300/16913] Loss: 7.6262


Epoch 2/10:  20%|█▉        | 3351/16913 [49:24<3:20:02,  1.13it/s, loss=7.78]

  [Batch 3350/16913] Loss: 7.7815


Epoch 2/10:  20%|██        | 3401/16913 [50:08<3:19:22,  1.13it/s, loss=7.75]

  [Batch 3400/16913] Loss: 7.7468


Epoch 2/10:  20%|██        | 3451/16913 [50:52<3:18:25,  1.13it/s, loss=7.74]

  [Batch 3450/16913] Loss: 7.7387


Epoch 2/10:  21%|██        | 3501/16913 [51:36<3:17:45,  1.13it/s, loss=7.85]

  [Batch 3500/16913] Loss: 7.8455


Epoch 2/10:  21%|██        | 3551/16913 [52:21<3:16:49,  1.13it/s, loss=7.66]

  [Batch 3550/16913] Loss: 7.6634


Epoch 2/10:  21%|██▏       | 3601/16913 [53:05<3:16:19,  1.13it/s, loss=7.63]

  [Batch 3600/16913] Loss: 7.6298


Epoch 2/10:  22%|██▏       | 3651/16913 [53:49<3:15:26,  1.13it/s, loss=7.67]

  [Batch 3650/16913] Loss: 7.6690


Epoch 2/10:  22%|██▏       | 3701/16913 [54:33<3:14:38,  1.13it/s, loss=7.76]

  [Batch 3700/16913] Loss: 7.7566


Epoch 2/10:  22%|██▏       | 3751/16913 [55:18<3:13:57,  1.13it/s, loss=7.69]

  [Batch 3750/16913] Loss: 7.6916


Epoch 2/10:  22%|██▏       | 3801/16913 [56:02<3:13:28,  1.13it/s, loss=7.7] 

  [Batch 3800/16913] Loss: 7.7011


Epoch 2/10:  23%|██▎       | 3851/16913 [56:46<3:12:32,  1.13it/s, loss=7.67]

  [Batch 3850/16913] Loss: 7.6680


Epoch 2/10:  23%|██▎       | 3901/16913 [57:30<3:11:54,  1.13it/s, loss=7.86]

  [Batch 3900/16913] Loss: 7.8583


Epoch 2/10:  23%|██▎       | 3951/16913 [58:15<3:11:02,  1.13it/s, loss=7.61]

  [Batch 3950/16913] Loss: 7.6060


Epoch 2/10:  24%|██▎       | 4001/16913 [58:59<3:10:23,  1.13it/s, loss=7.62]

  [Batch 4000/16913] Loss: 7.6198


Epoch 2/10:  24%|██▍       | 4051/16913 [59:43<3:09:38,  1.13it/s, loss=7.81]

  [Batch 4050/16913] Loss: 7.8110


Epoch 2/10:  24%|██▍       | 4101/16913 [1:00:27<3:09:05,  1.13it/s, loss=7.66]

  [Batch 4100/16913] Loss: 7.6626


Epoch 2/10:  25%|██▍       | 4151/16913 [1:01:12<3:08:08,  1.13it/s, loss=7.75]

  [Batch 4150/16913] Loss: 7.7504


Epoch 2/10:  25%|██▍       | 4201/16913 [1:01:56<3:07:16,  1.13it/s, loss=7.91]

  [Batch 4200/16913] Loss: 7.9119


Epoch 2/10:  47%|████▋     | 7951/16913 [1:57:12<2:12:12,  1.13it/s, loss=7.82]

  [Batch 7950/16913] Loss: 7.8153


Epoch 2/10:  47%|████▋     | 8001/16913 [1:57:56<2:11:23,  1.13it/s, loss=7.86]

  [Batch 8000/16913] Loss: 7.8558


Epoch 2/10:  48%|████▊     | 8051/16913 [1:58:41<2:10:36,  1.13it/s, loss=7.58]

  [Batch 8050/16913] Loss: 7.5844


Epoch 2/10:  48%|████▊     | 8101/16913 [1:59:25<2:09:57,  1.13it/s, loss=7.74]

  [Batch 8100/16913] Loss: 7.7352


Epoch 2/10:  48%|████▊     | 8151/16913 [2:00:09<2:09:12,  1.13it/s, loss=7.53]

  [Batch 8150/16913] Loss: 7.5284


Epoch 2/10:  48%|████▊     | 8201/16913 [2:00:53<2:08:28,  1.13it/s, loss=7.73]

  [Batch 8200/16913] Loss: 7.7326


Epoch 2/10:  49%|████▉     | 8251/16913 [2:01:38<2:07:34,  1.13it/s, loss=7.64]

  [Batch 8250/16913] Loss: 7.6422


Epoch 2/10:  49%|████▉     | 8301/16913 [2:02:22<2:06:59,  1.13it/s, loss=7.72]

  [Batch 8300/16913] Loss: 7.7191


Epoch 2/10:  49%|████▉     | 8351/16913 [2:03:06<2:06:14,  1.13it/s, loss=7.67]

  [Batch 8350/16913] Loss: 7.6727


Epoch 2/10:  50%|████▉     | 8401/16913 [2:03:50<2:05:22,  1.13it/s, loss=7.79]

  [Batch 8400/16913] Loss: 7.7939


Epoch 2/10:  50%|████▉     | 8451/16913 [2:04:34<2:04:35,  1.13it/s, loss=7.56]

  [Batch 8450/16913] Loss: 7.5568


Epoch 2/10:  50%|█████     | 8501/16913 [2:05:19<2:03:55,  1.13it/s, loss=7.73]

  [Batch 8500/16913] Loss: 7.7325


Epoch 2/10:  51%|█████     | 8551/16913 [2:06:03<2:03:10,  1.13it/s, loss=7.75]

  [Batch 8550/16913] Loss: 7.7464


Epoch 2/10:  51%|█████     | 8601/16913 [2:06:47<2:02:27,  1.13it/s, loss=7.74]

  [Batch 8600/16913] Loss: 7.7383


Epoch 2/10:  51%|█████     | 8651/16913 [2:07:31<2:01:49,  1.13it/s, loss=7.72]

  [Batch 8650/16913] Loss: 7.7218


Epoch 2/10:  51%|█████▏    | 8701/16913 [2:08:15<2:00:56,  1.13it/s, loss=7.53]

  [Batch 8700/16913] Loss: 7.5256


Epoch 2/10:  52%|█████▏    | 8751/16913 [2:09:00<2:00:11,  1.13it/s, loss=7.68]

  [Batch 8750/16913] Loss: 7.6832


Epoch 2/10:  52%|█████▏    | 8801/16913 [2:09:44<1:59:39,  1.13it/s, loss=7.78]

  [Batch 8800/16913] Loss: 7.7813


Epoch 2/10:  52%|█████▏    | 8851/16913 [2:10:28<1:58:43,  1.13it/s, loss=7.75]

  [Batch 8850/16913] Loss: 7.7514


Epoch 2/10:  53%|█████▎    | 8901/16913 [2:11:12<1:58:06,  1.13it/s, loss=7.78]

  [Batch 8900/16913] Loss: 7.7815


Epoch 2/10:  53%|█████▎    | 8951/16913 [2:11:56<1:57:23,  1.13it/s, loss=7.55]

  [Batch 8950/16913] Loss: 7.5512


Epoch 2/10:  53%|█████▎    | 9001/16913 [2:12:41<1:56:42,  1.13it/s, loss=7.93]

  [Batch 9000/16913] Loss: 7.9331


Epoch 2/10:  54%|█████▎    | 9051/16913 [2:13:25<1:55:48,  1.13it/s, loss=7.59]

  [Batch 9050/16913] Loss: 7.5860


Epoch 2/10:  54%|█████▍    | 9101/16913 [2:14:09<1:55:01,  1.13it/s, loss=7.63]

  [Batch 9100/16913] Loss: 7.6286


Epoch 2/10:  54%|█████▍    | 9151/16913 [2:14:53<1:54:26,  1.13it/s, loss=7.65]

  [Batch 9150/16913] Loss: 7.6492


Epoch 2/10:  54%|█████▍    | 9201/16913 [2:15:37<1:53:41,  1.13it/s, loss=7.51]

  [Batch 9200/16913] Loss: 7.5138


Epoch 2/10:  55%|█████▍    | 9251/16913 [2:16:22<1:52:57,  1.13it/s, loss=7.78]

  [Batch 9250/16913] Loss: 7.7763


Epoch 2/10:  55%|█████▍    | 9301/16913 [2:17:06<1:52:15,  1.13it/s, loss=7.64]

  [Batch 9300/16913] Loss: 7.6411


Epoch 2/10:  55%|█████▌    | 9351/16913 [2:17:50<1:51:31,  1.13it/s, loss=7.78]

  [Batch 9350/16913] Loss: 7.7834


Epoch 2/10:  56%|█████▌    | 9401/16913 [2:18:34<1:50:47,  1.13it/s, loss=7.71]

  [Batch 9400/16913] Loss: 7.7131


Epoch 2/10:  56%|█████▌    | 9451/16913 [2:19:19<1:50:02,  1.13it/s, loss=7.83]

  [Batch 9450/16913] Loss: 7.8285


Epoch 2/10:  56%|█████▌    | 9501/16913 [2:20:03<1:49:12,  1.13it/s, loss=7.72]

  [Batch 9500/16913] Loss: 7.7192


Epoch 2/10:  56%|█████▋    | 9551/16913 [2:20:47<1:48:33,  1.13it/s, loss=7.49]

  [Batch 9550/16913] Loss: 7.4870


Epoch 2/10:  57%|█████▋    | 9601/16913 [2:21:31<1:47:50,  1.13it/s, loss=7.72]

  [Batch 9600/16913] Loss: 7.7171


Epoch 2/10:  57%|█████▋    | 9651/16913 [2:22:15<1:46:59,  1.13it/s, loss=7.79]

  [Batch 9650/16913] Loss: 7.7936


Epoch 2/10:  57%|█████▋    | 9701/16913 [2:23:00<1:46:26,  1.13it/s, loss=7.65]

  [Batch 9700/16913] Loss: 7.6524


Epoch 2/10:  58%|█████▊    | 9751/16913 [2:23:44<1:45:28,  1.13it/s, loss=7.82]

  [Batch 9750/16913] Loss: 7.8209


Epoch 2/10:  58%|█████▊    | 9801/16913 [2:24:28<1:44:51,  1.13it/s, loss=7.63]

  [Batch 9800/16913] Loss: 7.6255


Epoch 2/10:  58%|█████▊    | 9851/16913 [2:25:12<1:44:08,  1.13it/s, loss=7.45]

  [Batch 9850/16913] Loss: 7.4524


Epoch 2/10:  59%|█████▊    | 9901/16913 [2:25:57<1:43:29,  1.13it/s, loss=7.6] 

  [Batch 9900/16913] Loss: 7.5969


Epoch 2/10:  59%|█████▉    | 9951/16913 [2:26:41<1:42:37,  1.13it/s, loss=7.73]

  [Batch 9950/16913] Loss: 7.7334


Epoch 2/10:  59%|█████▉    | 10001/16913 [2:27:25<1:42:00,  1.13it/s, loss=7.79]

  [Batch 10000/16913] Loss: 7.7864


Epoch 2/10:  59%|█████▉    | 10051/16913 [2:28:09<1:41:10,  1.13it/s, loss=7.41]

  [Batch 10050/16913] Loss: 7.4104


Epoch 2/10:  60%|█████▉    | 10101/16913 [2:28:53<1:40:23,  1.13it/s, loss=7.8] 

  [Batch 10100/16913] Loss: 7.8030


Epoch 2/10:  60%|██████    | 10151/16913 [2:29:38<1:39:41,  1.13it/s, loss=7.79]

  [Batch 10150/16913] Loss: 7.7906


Epoch 2/10:  60%|██████    | 10201/16913 [2:30:22<1:38:52,  1.13it/s, loss=7.58]

  [Batch 10200/16913] Loss: 7.5785


Epoch 2/10:  61%|██████    | 10251/16913 [2:31:06<1:38:14,  1.13it/s, loss=7.68]

  [Batch 10250/16913] Loss: 7.6838


Epoch 2/10:  61%|██████    | 10301/16913 [2:31:50<1:37:18,  1.13it/s, loss=7.55]

  [Batch 10300/16913] Loss: 7.5548


Epoch 2/10:  61%|██████    | 10351/16913 [2:32:35<1:36:45,  1.13it/s, loss=7.61]

  [Batch 10350/16913] Loss: 7.6080


Epoch 2/10:  61%|██████▏   | 10401/16913 [2:33:19<1:35:58,  1.13it/s, loss=7.64]

  [Batch 10400/16913] Loss: 7.6417


Epoch 2/10:  62%|██████▏   | 10451/16913 [2:34:03<1:35:17,  1.13it/s, loss=7.48]

  [Batch 10450/16913] Loss: 7.4821


Epoch 2/10:  62%|██████▏   | 10501/16913 [2:34:47<1:34:29,  1.13it/s, loss=7.6] 

  [Batch 10500/16913] Loss: 7.6000


Epoch 2/10:  62%|██████▏   | 10551/16913 [2:35:31<1:33:49,  1.13it/s, loss=7.55]

  [Batch 10550/16913] Loss: 7.5548


Epoch 2/10:  63%|██████▎   | 10601/16913 [2:36:16<1:33:05,  1.13it/s, loss=7.71]

  [Batch 10600/16913] Loss: 7.7135


Epoch 2/10:  63%|██████▎   | 10651/16913 [2:37:00<1:32:13,  1.13it/s, loss=7.7] 

  [Batch 10650/16913] Loss: 7.6954


Epoch 2/10:  63%|██████▎   | 10701/16913 [2:37:44<1:31:36,  1.13it/s, loss=7.77]

  [Batch 10700/16913] Loss: 7.7672


Epoch 2/10:  64%|██████▎   | 10751/16913 [2:38:28<1:30:44,  1.13it/s, loss=7.79]

  [Batch 10750/16913] Loss: 7.7853


Epoch 2/10:  64%|██████▍   | 10801/16913 [2:39:13<1:30:00,  1.13it/s, loss=7.64]

  [Batch 10800/16913] Loss: 7.6446


Epoch 2/10:  64%|██████▍   | 10851/16913 [2:39:57<1:29:16,  1.13it/s, loss=7.73]

  [Batch 10850/16913] Loss: 7.7345


Epoch 2/10:  64%|██████▍   | 10901/16913 [2:40:41<1:28:32,  1.13it/s, loss=7.61]

  [Batch 10900/16913] Loss: 7.6132


Epoch 2/10:  65%|██████▍   | 10951/16913 [2:41:25<1:27:52,  1.13it/s, loss=7.55]

  [Batch 10950/16913] Loss: 7.5466


Epoch 2/10:  65%|██████▌   | 11001/16913 [2:42:10<1:27:12,  1.13it/s, loss=7.77]

  [Batch 11000/16913] Loss: 7.7697


Epoch 2/10:  65%|██████▌   | 11051/16913 [2:42:54<1:26:25,  1.13it/s, loss=7.67]

  [Batch 11050/16913] Loss: 7.6750


Epoch 2/10:  66%|██████▌   | 11101/16913 [2:43:38<1:25:36,  1.13it/s, loss=7.83]

  [Batch 11100/16913] Loss: 7.8320


Epoch 2/10:  66%|██████▌   | 11151/16913 [2:44:22<1:24:53,  1.13it/s, loss=7.57]

  [Batch 11150/16913] Loss: 7.5714


Epoch 2/10:  66%|██████▌   | 11201/16913 [2:45:07<1:24:17,  1.13it/s, loss=7.74]

  [Batch 11200/16913] Loss: 7.7368


Epoch 2/10:  67%|██████▋   | 11251/16913 [2:45:51<1:23:30,  1.13it/s, loss=7.69]

  [Batch 11250/16913] Loss: 7.6942


Epoch 2/10:  67%|██████▋   | 11301/16913 [2:46:35<1:22:45,  1.13it/s, loss=7.66]

  [Batch 11300/16913] Loss: 7.6621


Epoch 2/10:  67%|██████▋   | 11351/16913 [2:47:19<1:22:00,  1.13it/s, loss=7.62]

  [Batch 11350/16913] Loss: 7.6220


Epoch 2/10:  67%|██████▋   | 11401/16913 [2:48:03<1:21:16,  1.13it/s, loss=7.73]

  [Batch 11400/16913] Loss: 7.7346


Epoch 2/10:  68%|██████▊   | 11451/16913 [2:48:48<1:20:23,  1.13it/s, loss=7.58]

  [Batch 11450/16913] Loss: 7.5810


Epoch 2/10:  68%|██████▊   | 11501/16913 [2:49:32<1:19:48,  1.13it/s, loss=7.54]

  [Batch 11500/16913] Loss: 7.5367


Epoch 2/10:  68%|██████▊   | 11551/16913 [2:50:16<1:19:03,  1.13it/s, loss=7.6] 

  [Batch 11550/16913] Loss: 7.5985


Epoch 2/10:  69%|██████▊   | 11601/16913 [2:51:00<1:18:14,  1.13it/s, loss=7.64]

  [Batch 11600/16913] Loss: 7.6434


Epoch 2/10:  69%|██████▉   | 11651/16913 [2:51:45<1:17:33,  1.13it/s, loss=7.6] 

  [Batch 11650/16913] Loss: 7.5994


Epoch 2/10:  69%|██████▉   | 11701/16913 [2:52:29<1:16:45,  1.13it/s, loss=7.76]

  [Batch 11700/16913] Loss: 7.7590


Epoch 2/10:  69%|██████▉   | 11751/16913 [2:53:13<1:16:00,  1.13it/s, loss=7.68]

  [Batch 11750/16913] Loss: 7.6774


Epoch 2/10:  70%|██████▉   | 11801/16913 [2:53:57<1:15:18,  1.13it/s, loss=7.58]

  [Batch 11800/16913] Loss: 7.5793


Epoch 2/10:  70%|███████   | 11851/16913 [2:54:41<1:14:39,  1.13it/s, loss=7.62]

  [Batch 11850/16913] Loss: 7.6168


Epoch 2/10:  70%|███████   | 11901/16913 [2:55:26<1:13:51,  1.13it/s, loss=7.77]

  [Batch 11900/16913] Loss: 7.7669


Epoch 2/10:  71%|███████   | 11951/16913 [2:56:10<1:13:10,  1.13it/s, loss=7.61]

  [Batch 11950/16913] Loss: 7.6088


Epoch 2/10:  71%|███████   | 12001/16913 [2:56:54<1:12:24,  1.13it/s, loss=7.59]

  [Batch 12000/16913] Loss: 7.5943


Epoch 2/10:  71%|███████▏  | 12051/16913 [2:57:38<1:11:40,  1.13it/s, loss=7.89]

  [Batch 12050/16913] Loss: 7.8948


Epoch 2/10:  72%|███████▏  | 12101/16913 [2:58:22<1:10:58,  1.13it/s, loss=7.76]

  [Batch 12100/16913] Loss: 7.7590


Epoch 2/10:  72%|███████▏  | 12151/16913 [2:59:07<1:10:10,  1.13it/s, loss=7.53]

  [Batch 12150/16913] Loss: 7.5338


Epoch 2/10:  72%|███████▏  | 12201/16913 [2:59:51<1:09:32,  1.13it/s, loss=7.72]

  [Batch 12200/16913] Loss: 7.7169


Epoch 2/10:  72%|███████▏  | 12251/16913 [3:00:35<1:08:43,  1.13it/s, loss=7.65]

  [Batch 12250/16913] Loss: 7.6548


Epoch 2/10:  73%|███████▎  | 12301/16913 [3:01:19<1:08:05,  1.13it/s, loss=7.84]

  [Batch 12300/16913] Loss: 7.8390


Epoch 2/10:  73%|███████▎  | 12351/16913 [3:02:04<1:07:16,  1.13it/s, loss=7.92]

  [Batch 12350/16913] Loss: 7.9192


Epoch 2/10:  73%|███████▎  | 12401/16913 [3:02:48<1:06:30,  1.13it/s, loss=7.69]

  [Batch 12400/16913] Loss: 7.6914


Epoch 2/10:  74%|███████▎  | 12451/16913 [3:03:32<1:05:46,  1.13it/s, loss=7.45]

  [Batch 12450/16913] Loss: 7.4507


Epoch 2/10:  74%|███████▍  | 12501/16913 [3:04:16<1:05:00,  1.13it/s, loss=7.62]

  [Batch 12500/16913] Loss: 7.6187


Epoch 2/10:  74%|███████▍  | 12551/16913 [3:05:01<1:04:14,  1.13it/s, loss=7.51]

  [Batch 12550/16913] Loss: 7.5090


Epoch 2/10:  75%|███████▍  | 12601/16913 [3:05:45<1:03:37,  1.13it/s, loss=7.67]

  [Batch 12600/16913] Loss: 7.6667


Epoch 2/10:  75%|███████▍  | 12651/16913 [3:06:29<1:02:51,  1.13it/s, loss=7.8] 

  [Batch 12650/16913] Loss: 7.7993


Epoch 2/10:  75%|███████▌  | 12701/16913 [3:07:13<1:02:04,  1.13it/s, loss=7.69]

  [Batch 12700/16913] Loss: 7.6895


Epoch 2/10:  75%|███████▌  | 12751/16913 [3:07:57<1:01:20,  1.13it/s, loss=7.61]

  [Batch 12750/16913] Loss: 7.6063


Epoch 2/10:  76%|███████▌  | 12801/16913 [3:08:42<1:00:40,  1.13it/s, loss=7.56]

  [Batch 12800/16913] Loss: 7.5603


Epoch 2/10:  76%|███████▌  | 12851/16913 [3:09:26<59:54,  1.13it/s, loss=7.66]  

  [Batch 12850/16913] Loss: 7.6570


Epoch 2/10:  76%|███████▋  | 12901/16913 [3:10:10<59:09,  1.13it/s, loss=7.72]

  [Batch 12900/16913] Loss: 7.7239


Epoch 2/10:  77%|███████▋  | 12951/16913 [3:10:54<58:28,  1.13it/s, loss=7.57]

  [Batch 12950/16913] Loss: 7.5702


Epoch 2/10:  77%|███████▋  | 13001/16913 [3:11:39<57:41,  1.13it/s, loss=7.69]

  [Batch 13000/16913] Loss: 7.6926


Epoch 2/10:  77%|███████▋  | 13051/16913 [3:12:23<56:53,  1.13it/s, loss=7.58]

  [Batch 13050/16913] Loss: 7.5786


Epoch 2/10:  77%|███████▋  | 13101/16913 [3:13:07<56:13,  1.13it/s, loss=7.46]

  [Batch 13100/16913] Loss: 7.4637


Epoch 2/10:  78%|███████▊  | 13151/16913 [3:13:51<55:29,  1.13it/s, loss=7.68]

  [Batch 13150/16913] Loss: 7.6827


Epoch 2/10:  78%|███████▊  | 13201/16913 [3:14:36<54:43,  1.13it/s, loss=7.42]

  [Batch 13200/16913] Loss: 7.4248


Epoch 2/10:  78%|███████▊  | 13251/16913 [3:15:20<54:01,  1.13it/s, loss=7.69]

  [Batch 13250/16913] Loss: 7.6936


Epoch 2/10:  79%|███████▊  | 13301/16913 [3:16:04<53:14,  1.13it/s, loss=7.71]

  [Batch 13300/16913] Loss: 7.7071


Epoch 2/10:  79%|███████▉  | 13351/16913 [3:16:48<52:27,  1.13it/s, loss=7.68]

  [Batch 13350/16913] Loss: 7.6796


Epoch 2/10:  79%|███████▉  | 13401/16913 [3:17:32<51:42,  1.13it/s, loss=7.7] 

  [Batch 13400/16913] Loss: 7.6957


Epoch 2/10:  80%|███████▉  | 13451/16913 [3:18:17<50:59,  1.13it/s, loss=7.76]

  [Batch 13450/16913] Loss: 7.7569


Epoch 2/10:  80%|███████▉  | 13501/16913 [3:19:01<50:14,  1.13it/s, loss=7.64]

  [Batch 13500/16913] Loss: 7.6413


Epoch 2/10:  80%|████████  | 13551/16913 [3:19:45<49:36,  1.13it/s, loss=7.7] 

  [Batch 13550/16913] Loss: 7.7013


Epoch 2/10:  80%|████████  | 13601/16913 [3:20:29<48:50,  1.13it/s, loss=7.74]

  [Batch 13600/16913] Loss: 7.7438


Epoch 2/10:  81%|████████  | 13651/16913 [3:21:14<48:05,  1.13it/s, loss=7.5] 

  [Batch 13650/16913] Loss: 7.5021


Epoch 2/10:  81%|████████  | 13701/16913 [3:21:58<47:21,  1.13it/s, loss=7.69]

  [Batch 13700/16913] Loss: 7.6946


Epoch 2/10:  81%|████████▏ | 13751/16913 [3:22:42<46:35,  1.13it/s, loss=7.61]

  [Batch 13750/16913] Loss: 7.6053


Epoch 2/10:  82%|████████▏ | 13801/16913 [3:23:26<45:53,  1.13it/s, loss=7.64]

  [Batch 13800/16913] Loss: 7.6371


Epoch 2/10:  82%|████████▏ | 13851/16913 [3:24:11<45:08,  1.13it/s, loss=7.78]

  [Batch 13850/16913] Loss: 7.7828


Epoch 2/10:  82%|████████▏ | 13901/16913 [3:24:55<44:24,  1.13it/s, loss=7.52]

  [Batch 13900/16913] Loss: 7.5214


Epoch 2/10:  82%|████████▏ | 13951/16913 [3:25:39<43:43,  1.13it/s, loss=7.64]

  [Batch 13950/16913] Loss: 7.6403


Epoch 2/10:  83%|████████▎ | 14001/16913 [3:26:23<42:52,  1.13it/s, loss=7.66]

  [Batch 14000/16913] Loss: 7.6642


Epoch 2/10:  83%|████████▎ | 14051/16913 [3:27:07<42:11,  1.13it/s, loss=7.53]

  [Batch 14050/16913] Loss: 7.5343


Epoch 2/10:  83%|████████▎ | 14101/16913 [3:27:52<41:28,  1.13it/s, loss=7.67]

  [Batch 14100/16913] Loss: 7.6742


Epoch 2/10:  84%|████████▎ | 14151/16913 [3:28:36<40:39,  1.13it/s, loss=7.68]

  [Batch 14150/16913] Loss: 7.6845


Epoch 2/10:  84%|████████▍ | 14201/16913 [3:29:20<39:59,  1.13it/s, loss=7.67]

  [Batch 14200/16913] Loss: 7.6656


Epoch 2/10:  84%|████████▍ | 14251/16913 [3:30:04<39:12,  1.13it/s, loss=7.51]

  [Batch 14250/16913] Loss: 7.5125


Epoch 2/10:  85%|████████▍ | 14301/16913 [3:30:49<38:31,  1.13it/s, loss=7.81]

  [Batch 14300/16913] Loss: 7.8077


Epoch 2/10:  85%|████████▍ | 14351/16913 [3:31:33<37:47,  1.13it/s, loss=7.57]

  [Batch 14350/16913] Loss: 7.5680


Epoch 2/10:  85%|████████▌ | 14401/16913 [3:32:17<37:01,  1.13it/s, loss=7.84]

  [Batch 14400/16913] Loss: 7.8412


Epoch 2/10:  85%|████████▌ | 14451/16913 [3:33:01<36:14,  1.13it/s, loss=7.5] 

  [Batch 14450/16913] Loss: 7.4957


Epoch 2/10:  86%|████████▌ | 14501/16913 [3:33:45<35:32,  1.13it/s, loss=7.49]

  [Batch 14500/16913] Loss: 7.4878


Epoch 2/10:  86%|████████▌ | 14551/16913 [3:34:30<34:49,  1.13it/s, loss=7.82]

  [Batch 14550/16913] Loss: 7.8213


Epoch 2/10:  86%|████████▋ | 14601/16913 [3:35:14<34:04,  1.13it/s, loss=7.53]

  [Batch 14600/16913] Loss: 7.5251


Epoch 2/10:  87%|████████▋ | 14651/16913 [3:35:58<33:22,  1.13it/s, loss=7.73]

  [Batch 14650/16913] Loss: 7.7305


Epoch 2/10:  87%|████████▋ | 14701/16913 [3:36:42<32:37,  1.13it/s, loss=7.68]

  [Batch 14700/16913] Loss: 7.6783


Epoch 2/10:  87%|████████▋ | 14751/16913 [3:37:27<31:51,  1.13it/s, loss=7.54]

  [Batch 14750/16913] Loss: 7.5395


Epoch 2/10:  88%|████████▊ | 14801/16913 [3:38:11<31:06,  1.13it/s, loss=7.62]

  [Batch 14800/16913] Loss: 7.6231


Epoch 2/10:  88%|████████▊ | 14851/16913 [3:38:55<30:22,  1.13it/s, loss=7.71]

  [Batch 14850/16913] Loss: 7.7105


Epoch 2/10:  88%|████████▊ | 14901/16913 [3:39:39<29:38,  1.13it/s, loss=7.49]

  [Batch 14900/16913] Loss: 7.4875


Epoch 2/10:  88%|████████▊ | 14951/16913 [3:40:23<28:53,  1.13it/s, loss=7.6] 

  [Batch 14950/16913] Loss: 7.5988


Epoch 2/10:  89%|████████▊ | 15001/16913 [3:41:08<28:11,  1.13it/s, loss=7.77]

  [Batch 15000/16913] Loss: 7.7691


Epoch 2/10:  89%|████████▉ | 15051/16913 [3:41:52<27:25,  1.13it/s, loss=7.87]

  [Batch 15050/16913] Loss: 7.8728


Epoch 2/10:  89%|████████▉ | 15101/16913 [3:42:36<26:44,  1.13it/s, loss=7.71]

  [Batch 15100/16913] Loss: 7.7134


Epoch 2/10:  90%|████████▉ | 15151/16913 [3:43:20<25:55,  1.13it/s, loss=7.62]

  [Batch 15150/16913] Loss: 7.6171


Epoch 2/10:  90%|████████▉ | 15201/16913 [3:44:05<25:14,  1.13it/s, loss=7.46]

  [Batch 15200/16913] Loss: 7.4579


Epoch 2/10:  90%|█████████ | 15251/16913 [3:44:49<24:29,  1.13it/s, loss=7.62]

  [Batch 15250/16913] Loss: 7.6241


Epoch 2/10:  90%|█████████ | 15301/16913 [3:45:33<23:47,  1.13it/s, loss=7.68]

  [Batch 15300/16913] Loss: 7.6816


Epoch 2/10:  91%|█████████ | 15351/16913 [3:46:17<23:01,  1.13it/s, loss=7.71]

  [Batch 15350/16913] Loss: 7.7132


Epoch 2/10:  91%|█████████ | 15401/16913 [3:47:01<22:18,  1.13it/s, loss=7.67]

  [Batch 15400/16913] Loss: 7.6704


Epoch 2/10:  91%|█████████▏| 15451/16913 [3:47:46<21:33,  1.13it/s, loss=7.74]

  [Batch 15450/16913] Loss: 7.7396


Epoch 2/10:  92%|█████████▏| 15501/16913 [3:48:30<20:49,  1.13it/s, loss=7.63]

  [Batch 15500/16913] Loss: 7.6267


Epoch 2/10:  92%|█████████▏| 15551/16913 [3:49:14<20:03,  1.13it/s, loss=7.72]

  [Batch 15550/16913] Loss: 7.7195


Epoch 2/10:  92%|█████████▏| 15601/16913 [3:49:58<19:20,  1.13it/s, loss=7.62]

  [Batch 15600/16913] Loss: 7.6182


Epoch 2/10:  93%|█████████▎| 15651/16913 [3:50:43<18:36,  1.13it/s, loss=7.62]

  [Batch 15650/16913] Loss: 7.6234


Epoch 2/10:  93%|█████████▎| 15701/16913 [3:51:27<17:51,  1.13it/s, loss=7.51]

  [Batch 15700/16913] Loss: 7.5097


Epoch 2/10:  93%|█████████▎| 15751/16913 [3:52:11<17:06,  1.13it/s, loss=7.77]

  [Batch 15750/16913] Loss: 7.7702


Epoch 2/10:  93%|█████████▎| 15801/16913 [3:52:55<16:22,  1.13it/s, loss=7.64]

  [Batch 15800/16913] Loss: 7.6371


Epoch 2/10:  94%|█████████▎| 15851/16913 [3:53:39<15:40,  1.13it/s, loss=7.72]

  [Batch 15850/16913] Loss: 7.7230


Epoch 2/10:  94%|█████████▍| 15901/16913 [3:54:24<14:54,  1.13it/s, loss=7.62]

  [Batch 15900/16913] Loss: 7.6159


Epoch 2/10:  94%|█████████▍| 15951/16913 [3:55:08<14:10,  1.13it/s, loss=7.74]

  [Batch 15950/16913] Loss: 7.7433


Epoch 2/10:  95%|█████████▍| 16001/16913 [3:55:52<13:26,  1.13it/s, loss=7.58]

  [Batch 16000/16913] Loss: 7.5819


Epoch 2/10:  95%|█████████▍| 16051/16913 [3:56:36<12:41,  1.13it/s, loss=7.67]

  [Batch 16050/16913] Loss: 7.6703


Epoch 2/10:  95%|█████████▌| 16101/16913 [3:57:20<11:57,  1.13it/s, loss=7.97]

  [Batch 16100/16913] Loss: 7.9717


Epoch 2/10:  95%|█████████▌| 16151/16913 [3:58:05<11:13,  1.13it/s, loss=7.52]

  [Batch 16150/16913] Loss: 7.5165


Epoch 2/10:  96%|█████████▌| 16201/16913 [3:58:49<10:29,  1.13it/s, loss=7.67]

  [Batch 16200/16913] Loss: 7.6707


Epoch 2/10:  96%|█████████▌| 16251/16913 [3:59:33<09:45,  1.13it/s, loss=7.7] 

  [Batch 16250/16913] Loss: 7.6994


Epoch 2/10:  96%|█████████▋| 16301/16913 [4:00:17<09:01,  1.13it/s, loss=7.79]

  [Batch 16300/16913] Loss: 7.7941


Epoch 2/10:  97%|█████████▋| 16351/16913 [4:01:02<08:17,  1.13it/s, loss=7.54]

  [Batch 16350/16913] Loss: 7.5440


Epoch 2/10:  97%|█████████▋| 16401/16913 [4:01:46<07:32,  1.13it/s, loss=7.53]

  [Batch 16400/16913] Loss: 7.5344


Epoch 2/10:  97%|█████████▋| 16451/16913 [4:02:30<06:49,  1.13it/s, loss=7.56]

  [Batch 16450/16913] Loss: 7.5619


Epoch 2/10:  98%|█████████▊| 16501/16913 [4:03:14<06:04,  1.13it/s, loss=7.58]

  [Batch 16500/16913] Loss: 7.5784


Epoch 2/10:  98%|█████████▊| 16551/16913 [4:03:58<05:20,  1.13it/s, loss=7.65]

  [Batch 16550/16913] Loss: 7.6458


Epoch 2/10:  98%|█████████▊| 16601/16913 [4:04:43<04:36,  1.13it/s, loss=7.67]

  [Batch 16600/16913] Loss: 7.6684


Epoch 2/10:  98%|█████████▊| 16651/16913 [4:05:27<03:51,  1.13it/s, loss=7.7] 

  [Batch 16650/16913] Loss: 7.7035


Epoch 2/10:  99%|█████████▊| 16701/16913 [4:06:11<03:07,  1.13it/s, loss=7.74]

  [Batch 16700/16913] Loss: 7.7396


Epoch 2/10:  99%|█████████▉| 16751/16913 [4:06:55<02:23,  1.13it/s, loss=7.66]

  [Batch 16750/16913] Loss: 7.6586


Epoch 2/10:  99%|█████████▉| 16801/16913 [4:07:40<01:39,  1.13it/s, loss=7.55]

  [Batch 16800/16913] Loss: 7.5473


Epoch 2/10: 100%|█████████▉| 16851/16913 [4:08:24<00:54,  1.13it/s, loss=7.63]

  [Batch 16850/16913] Loss: 7.6348


Epoch 2/10: 100%|█████████▉| 16901/16913 [4:09:08<00:10,  1.13it/s, loss=7.6] 

  [Batch 16900/16913] Loss: 7.5980


Epoch 2/10: 100%|██████████| 16913/16913 [4:09:18<00:00,  1.13it/s, loss=8.38]


Epoch 2 Completed. Training Loss: 7.6784, Training Accuracy: 0.0353, Time: 14958.52s
Epoch 2, Validation Loss: 7.6775, Validation Accuracy: 0.0352
Saving checkpoint at epoch 2...
Checkpoint saved at models/Pretraining/model_checkpoints/checkpoint_epoch_2.pth
Checkpoint saved for epoch 2.

--- Starting Epoch 3/10 ---


Epoch 3/10:   0%|          | 1/16913 [00:00<4:18:09,  1.09it/s, loss=7.79]

  [Batch 0/16913] Loss: 7.7860


Epoch 3/10:   0%|          | 51/16913 [00:45<4:08:33,  1.13it/s, loss=7.73]

  [Batch 50/16913] Loss: 7.7305


Epoch 3/10:   1%|          | 101/16913 [01:29<4:07:50,  1.13it/s, loss=7.6] 

  [Batch 100/16913] Loss: 7.5986


Epoch 3/10:   1%|          | 151/16913 [02:13<4:07:08,  1.13it/s, loss=7.66]

  [Batch 150/16913] Loss: 7.6617


Epoch 3/10:   1%|          | 201/16913 [02:57<4:06:24,  1.13it/s, loss=7.69]

  [Batch 200/16913] Loss: 7.6856


Epoch 3/10:   1%|▏         | 251/16913 [03:42<4:05:44,  1.13it/s, loss=7.67]

  [Batch 250/16913] Loss: 7.6723


Epoch 3/10:   2%|▏         | 301/16913 [04:26<4:04:54,  1.13it/s, loss=7.84]

  [Batch 300/16913] Loss: 7.8351


Epoch 3/10:   2%|▏         | 351/16913 [05:10<4:04:00,  1.13it/s, loss=7.92]

  [Batch 350/16913] Loss: 7.9157


Epoch 3/10:   2%|▏         | 401/16913 [05:54<4:03:38,  1.13it/s, loss=7.54]

  [Batch 400/16913] Loss: 7.5404


Epoch 3/10:   3%|▎         | 451/16913 [06:38<4:02:38,  1.13it/s, loss=7.77]

  [Batch 450/16913] Loss: 7.7726


Epoch 3/10:   3%|▎         | 501/16913 [07:23<4:01:52,  1.13it/s, loss=7.73]

  [Batch 500/16913] Loss: 7.7266


Epoch 3/10:   3%|▎         | 551/16913 [08:07<4:00:54,  1.13it/s, loss=7.61]

  [Batch 550/16913] Loss: 7.6104


Epoch 3/10:   4%|▎         | 601/16913 [08:51<4:00:19,  1.13it/s, loss=7.75]

  [Batch 600/16913] Loss: 7.7519


Epoch 3/10:   4%|▍         | 651/16913 [09:35<3:59:28,  1.13it/s, loss=7.51]

  [Batch 650/16913] Loss: 7.5095


Epoch 3/10:   4%|▍         | 701/16913 [10:19<3:58:49,  1.13it/s, loss=7.77]

  [Batch 700/16913] Loss: 7.7696


Epoch 3/10:   4%|▍         | 751/16913 [11:04<3:57:57,  1.13it/s, loss=7.6] 

  [Batch 750/16913] Loss: 7.5999


Epoch 3/10:   5%|▍         | 801/16913 [11:48<3:57:18,  1.13it/s, loss=7.8] 

  [Batch 800/16913] Loss: 7.8003


Epoch 3/10:   5%|▌         | 851/16913 [12:32<3:56:38,  1.13it/s, loss=7.58]

  [Batch 850/16913] Loss: 7.5769


Epoch 3/10:   5%|▌         | 901/16913 [13:16<3:55:47,  1.13it/s, loss=7.86]

  [Batch 900/16913] Loss: 7.8585


Epoch 3/10:   6%|▌         | 951/16913 [14:00<3:55:19,  1.13it/s, loss=7.61]

  [Batch 950/16913] Loss: 7.6124


Epoch 3/10:   6%|▌         | 1001/16913 [14:45<3:54:24,  1.13it/s, loss=7.59]

  [Batch 1000/16913] Loss: 7.5913


Epoch 3/10:   6%|▌         | 1051/16913 [15:29<3:53:54,  1.13it/s, loss=7.42]

  [Batch 1050/16913] Loss: 7.4190


Epoch 3/10:   7%|▋         | 1101/16913 [16:13<3:53:13,  1.13it/s, loss=7.5] 

  [Batch 1100/16913] Loss: 7.5004


Epoch 3/10:   7%|▋         | 1151/16913 [16:57<3:52:05,  1.13it/s, loss=7.72]

  [Batch 1150/16913] Loss: 7.7226


Epoch 3/10:   7%|▋         | 1201/16913 [17:42<3:51:28,  1.13it/s, loss=7.54]

  [Batch 1200/16913] Loss: 7.5440


Epoch 3/10:   7%|▋         | 1251/16913 [18:26<3:50:58,  1.13it/s, loss=7.7] 

  [Batch 1250/16913] Loss: 7.6964


Epoch 3/10:   8%|▊         | 1301/16913 [19:10<3:50:02,  1.13it/s, loss=7.77]

  [Batch 1300/16913] Loss: 7.7677


Epoch 3/10:   8%|▊         | 1351/16913 [19:54<3:49:19,  1.13it/s, loss=7.87]

  [Batch 1350/16913] Loss: 7.8717


Epoch 3/10:   8%|▊         | 1401/16913 [20:39<3:48:40,  1.13it/s, loss=7.73]

  [Batch 1400/16913] Loss: 7.7279


Epoch 3/10:   9%|▊         | 1451/16913 [21:23<3:47:57,  1.13it/s, loss=7.72]

  [Batch 1450/16913] Loss: 7.7189


Epoch 3/10:   9%|▉         | 1501/16913 [22:07<3:47:07,  1.13it/s, loss=7.62]

  [Batch 1500/16913] Loss: 7.6181


Epoch 3/10:   9%|▉         | 1551/16913 [22:51<3:46:13,  1.13it/s, loss=7.94]

  [Batch 1550/16913] Loss: 7.9415


Epoch 3/10:   9%|▉         | 1601/16913 [23:35<3:45:47,  1.13it/s, loss=7.54]

  [Batch 1600/16913] Loss: 7.5426


Epoch 3/10:  10%|▉         | 1651/16913 [24:20<3:45:04,  1.13it/s, loss=7.71]

  [Batch 1650/16913] Loss: 7.7144


Epoch 3/10:  10%|█         | 1701/16913 [25:04<3:44:19,  1.13it/s, loss=7.74]

  [Batch 1700/16913] Loss: 7.7385


Epoch 3/10:  10%|█         | 1751/16913 [25:48<3:43:41,  1.13it/s, loss=7.77]

  [Batch 1750/16913] Loss: 7.7673


Epoch 3/10:  43%|████▎     | 7201/16913 [1:46:09<2:23:03,  1.13it/s, loss=7.85]

  [Batch 7200/16913] Loss: 7.8467


Epoch 3/10:  43%|████▎     | 7251/16913 [1:46:53<2:22:26,  1.13it/s, loss=7.74]

  [Batch 7250/16913] Loss: 7.7401


Epoch 3/10:  43%|████▎     | 7301/16913 [1:47:37<2:21:33,  1.13it/s, loss=7.7] 

  [Batch 7300/16913] Loss: 7.7034


Epoch 3/10:  43%|████▎     | 7351/16913 [1:48:21<2:21:06,  1.13it/s, loss=7.62]

  [Batch 7350/16913] Loss: 7.6203


Epoch 3/10:  44%|████▍     | 7401/16913 [1:49:06<2:20:18,  1.13it/s, loss=7.65]

  [Batch 7400/16913] Loss: 7.6473


Epoch 3/10:  44%|████▍     | 7451/16913 [1:49:50<2:19:20,  1.13it/s, loss=7.65]

  [Batch 7450/16913] Loss: 7.6452


Epoch 3/10:  44%|████▍     | 7501/16913 [1:50:34<2:18:52,  1.13it/s, loss=7.58]

  [Batch 7500/16913] Loss: 7.5783


Epoch 3/10:  45%|████▍     | 7551/16913 [1:51:18<2:18:09,  1.13it/s, loss=7.75]

  [Batch 7550/16913] Loss: 7.7524


Epoch 3/10:  45%|████▍     | 7601/16913 [1:52:03<2:17:23,  1.13it/s, loss=7.74]

  [Batch 7600/16913] Loss: 7.7401


Epoch 3/10:  45%|████▌     | 7651/16913 [1:52:47<2:16:32,  1.13it/s, loss=7.65]

  [Batch 7650/16913] Loss: 7.6479


Epoch 3/10:  46%|████▌     | 7701/16913 [1:53:31<2:15:46,  1.13it/s, loss=7.77]

  [Batch 7700/16913] Loss: 7.7657


Epoch 3/10:  46%|████▌     | 7751/16913 [1:54:15<2:15:10,  1.13it/s, loss=7.68]

  [Batch 7750/16913] Loss: 7.6831


Epoch 3/10:  46%|████▌     | 7801/16913 [1:55:00<2:14:15,  1.13it/s, loss=7.85]

  [Batch 7800/16913] Loss: 7.8494


Epoch 3/10:  46%|████▋     | 7851/16913 [1:55:44<2:13:44,  1.13it/s, loss=7.63]

  [Batch 7850/16913] Loss: 7.6343


Epoch 3/10:  73%|███████▎  | 12351/16913 [3:02:05<1:07:13,  1.13it/s, loss=7.66]

  [Batch 12350/16913] Loss: 7.6618


Epoch 3/10:  73%|███████▎  | 12401/16913 [3:02:49<1:06:26,  1.13it/s, loss=7.6] 

  [Batch 12400/16913] Loss: 7.6020


Epoch 3/10:  74%|███████▎  | 12451/16913 [3:03:33<1:05:43,  1.13it/s, loss=7.46]

  [Batch 12450/16913] Loss: 7.4604


Epoch 3/10:  74%|███████▍  | 12501/16913 [3:04:17<1:05:03,  1.13it/s, loss=7.62]

  [Batch 12500/16913] Loss: 7.6151


Epoch 3/10:  74%|███████▍  | 12551/16913 [3:05:01<1:04:24,  1.13it/s, loss=7.6] 

  [Batch 12550/16913] Loss: 7.5972


Epoch 3/10:  75%|███████▍  | 12601/16913 [3:05:46<1:03:36,  1.13it/s, loss=7.75]

  [Batch 12600/16913] Loss: 7.7547


Epoch 3/10:  75%|███████▍  | 12651/16913 [3:06:30<1:02:53,  1.13it/s, loss=7.74]

  [Batch 12650/16913] Loss: 7.7417


Epoch 3/10:  75%|███████▌  | 12701/16913 [3:07:14<1:02:07,  1.13it/s, loss=7.71]

  [Batch 12700/16913] Loss: 7.7062


Epoch 3/10:  75%|███████▌  | 12751/16913 [3:07:58<1:01:25,  1.13it/s, loss=7.75]

  [Batch 12750/16913] Loss: 7.7502


Epoch 3/10:  76%|███████▌  | 12801/16913 [3:08:43<1:00:38,  1.13it/s, loss=7.75]

  [Batch 12800/16913] Loss: 7.7485


Epoch 3/10:  76%|███████▌  | 12851/16913 [3:09:27<59:56,  1.13it/s, loss=7.57]  

  [Batch 12850/16913] Loss: 7.5726


Epoch 3/10:  76%|███████▋  | 12901/16913 [3:10:11<59:05,  1.13it/s, loss=7.59]

  [Batch 12900/16913] Loss: 7.5896


Epoch 3/10:  77%|███████▋  | 12951/16913 [3:10:55<58:22,  1.13it/s, loss=7.85]

  [Batch 12950/16913] Loss: 7.8505


Epoch 3/10:  77%|███████▋  | 13001/16913 [3:11:40<57:41,  1.13it/s, loss=7.64]

  [Batch 13000/16913] Loss: 7.6425


Epoch 3/10:  77%|███████▋  | 13051/16913 [3:12:24<57:01,  1.13it/s, loss=7.6] 

  [Batch 13050/16913] Loss: 7.5999


Epoch 3/10:  77%|███████▋  | 13101/16913 [3:13:08<56:12,  1.13it/s, loss=7.67]

  [Batch 13100/16913] Loss: 7.6705


Epoch 3/10:  78%|███████▊  | 13151/16913 [3:13:52<55:27,  1.13it/s, loss=7.68]

  [Batch 13150/16913] Loss: 7.6804


Epoch 3/10:  78%|███████▊  | 13201/16913 [3:14:37<54:40,  1.13it/s, loss=7.55]

  [Batch 13200/16913] Loss: 7.5511


Epoch 3/10:  78%|███████▊  | 13251/16913 [3:15:21<53:56,  1.13it/s, loss=7.67]

  [Batch 13250/16913] Loss: 7.6673


Epoch 3/10:  79%|███████▊  | 13301/16913 [3:16:05<53:14,  1.13it/s, loss=7.73]

  [Batch 13300/16913] Loss: 7.7314


Epoch 3/10:  79%|███████▉  | 13351/16913 [3:16:49<52:33,  1.13it/s, loss=7.54]

  [Batch 13350/16913] Loss: 7.5433


Epoch 3/10:  79%|███████▉  | 13401/16913 [3:17:33<51:45,  1.13it/s, loss=7.63]

  [Batch 13400/16913] Loss: 7.6336


Epoch 3/10:  80%|███████▉  | 13451/16913 [3:18:18<51:02,  1.13it/s, loss=7.66]

  [Batch 13450/16913] Loss: 7.6572


Epoch 3/10:  80%|███████▉  | 13501/16913 [3:19:02<50:17,  1.13it/s, loss=7.54]

  [Batch 13500/16913] Loss: 7.5440


Epoch 3/10:  80%|████████  | 13551/16913 [3:19:46<49:29,  1.13it/s, loss=7.51]

  [Batch 13550/16913] Loss: 7.5135


Epoch 3/10:  80%|████████  | 13601/16913 [3:20:30<48:49,  1.13it/s, loss=7.84]

  [Batch 13600/16913] Loss: 7.8417


Epoch 3/10:  81%|████████  | 13651/16913 [3:21:15<48:04,  1.13it/s, loss=7.71]

  [Batch 13650/16913] Loss: 7.7052


Epoch 3/10:  81%|████████  | 13701/16913 [3:21:59<47:21,  1.13it/s, loss=7.54]

  [Batch 13700/16913] Loss: 7.5363


Epoch 3/10:  81%|████████▏ | 13751/16913 [3:22:43<46:36,  1.13it/s, loss=7.67]

  [Batch 13750/16913] Loss: 7.6742


Epoch 3/10:  82%|████████▏ | 13801/16913 [3:23:27<45:49,  1.13it/s, loss=7.46]

  [Batch 13800/16913] Loss: 7.4573


Epoch 3/10:  82%|████████▏ | 13851/16913 [3:24:11<45:05,  1.13it/s, loss=7.88]

  [Batch 13850/16913] Loss: 7.8844


Epoch 3/10:  82%|████████▏ | 13901/16913 [3:24:56<44:22,  1.13it/s, loss=7.43]

  [Batch 13900/16913] Loss: 7.4313


Epoch 3/10:  82%|████████▏ | 13951/16913 [3:25:40<43:40,  1.13it/s, loss=7.61]

  [Batch 13950/16913] Loss: 7.6059


Epoch 3/10:  83%|████████▎ | 14001/16913 [3:26:24<42:56,  1.13it/s, loss=7.51]

  [Batch 14000/16913] Loss: 7.5097


Epoch 3/10:  83%|████████▎ | 14051/16913 [3:27:08<42:08,  1.13it/s, loss=7.56]

  [Batch 14050/16913] Loss: 7.5601


Epoch 3/10:  83%|████████▎ | 14101/16913 [3:27:53<41:29,  1.13it/s, loss=7.7] 

  [Batch 14100/16913] Loss: 7.7012


Epoch 3/10:  84%|████████▎ | 14151/16913 [3:28:37<40:43,  1.13it/s, loss=7.53]

  [Batch 14150/16913] Loss: 7.5298


Epoch 3/10:  84%|████████▍ | 14201/16913 [3:29:21<40:00,  1.13it/s, loss=7.62]

  [Batch 14200/16913] Loss: 7.6188


Epoch 3/10:  84%|████████▍ | 14251/16913 [3:30:05<39:15,  1.13it/s, loss=7.67]

  [Batch 14250/16913] Loss: 7.6706


Epoch 3/10:  85%|████████▍ | 14301/16913 [3:30:49<38:33,  1.13it/s, loss=7.76]

  [Batch 14300/16913] Loss: 7.7650


Epoch 3/10:  85%|████████▍ | 14351/16913 [3:31:34<37:45,  1.13it/s, loss=7.73]

  [Batch 14350/16913] Loss: 7.7274


Epoch 3/10:  85%|████████▌ | 14401/16913 [3:32:18<37:03,  1.13it/s, loss=7.63]

  [Batch 14400/16913] Loss: 7.6254


Epoch 3/10:  85%|████████▌ | 14451/16913 [3:33:02<36:13,  1.13it/s, loss=7.65]

  [Batch 14450/16913] Loss: 7.6537


Epoch 3/10:  86%|████████▌ | 14501/16913 [3:33:46<35:34,  1.13it/s, loss=7.67]

  [Batch 14500/16913] Loss: 7.6685


Epoch 3/10:  86%|████████▌ | 14551/16913 [3:34:31<34:50,  1.13it/s, loss=7.61]

  [Batch 14550/16913] Loss: 7.6126


Epoch 3/10:  86%|████████▋ | 14601/16913 [3:35:15<34:07,  1.13it/s, loss=7.82]

  [Batch 14600/16913] Loss: 7.8176


Epoch 3/10:  87%|████████▋ | 14651/16913 [3:35:59<33:21,  1.13it/s, loss=7.69]

  [Batch 14650/16913] Loss: 7.6910


Epoch 3/10:  87%|████████▋ | 14701/16913 [3:36:43<32:38,  1.13it/s, loss=7.61]

  [Batch 14700/16913] Loss: 7.6125


Epoch 3/10:  87%|████████▋ | 14751/16913 [3:37:28<31:50,  1.13it/s, loss=7.67]

  [Batch 14750/16913] Loss: 7.6720


Epoch 3/10:  88%|████████▊ | 14801/16913 [3:38:12<31:10,  1.13it/s, loss=7.71]

  [Batch 14800/16913] Loss: 7.7150


Epoch 3/10:  88%|████████▊ | 14851/16913 [3:38:56<30:23,  1.13it/s, loss=7.7] 

  [Batch 14850/16913] Loss: 7.6977


Epoch 3/10:  88%|████████▊ | 14901/16913 [3:39:40<29:39,  1.13it/s, loss=7.66]

  [Batch 14900/16913] Loss: 7.6590


Epoch 3/10:  88%|████████▊ | 14951/16913 [3:40:25<28:56,  1.13it/s, loss=7.76]

  [Batch 14950/16913] Loss: 7.7571


Epoch 3/10:  89%|████████▊ | 15001/16913 [3:41:09<28:11,  1.13it/s, loss=7.64]

  [Batch 15000/16913] Loss: 7.6385


Epoch 3/10:  89%|████████▉ | 15051/16913 [3:41:53<27:28,  1.13it/s, loss=7.71]

  [Batch 15050/16913] Loss: 7.7076


Epoch 3/10:  89%|████████▉ | 15101/16913 [3:42:37<26:43,  1.13it/s, loss=7.65]

  [Batch 15100/16913] Loss: 7.6464


Epoch 3/10:  90%|████████▉ | 15151/16913 [3:43:22<25:59,  1.13it/s, loss=7.51]

  [Batch 15150/16913] Loss: 7.5141


Epoch 3/10:  90%|████████▉ | 15201/16913 [3:44:06<25:14,  1.13it/s, loss=7.76]

  [Batch 15200/16913] Loss: 7.7553


Epoch 3/10:  90%|█████████ | 15251/16913 [3:44:50<24:31,  1.13it/s, loss=7.76]

  [Batch 15250/16913] Loss: 7.7569


Epoch 3/10:  90%|█████████ | 15301/16913 [3:45:34<23:46,  1.13it/s, loss=7.75]

  [Batch 15300/16913] Loss: 7.7539


Epoch 3/10:  91%|█████████ | 15351/16913 [3:46:18<23:01,  1.13it/s, loss=7.85]

  [Batch 15350/16913] Loss: 7.8503


Epoch 3/10:  91%|█████████ | 15401/16913 [3:47:03<22:16,  1.13it/s, loss=7.59]

  [Batch 15400/16913] Loss: 7.5888


Epoch 3/10:  91%|█████████▏| 15451/16913 [3:47:47<21:34,  1.13it/s, loss=7.71]

  [Batch 15450/16913] Loss: 7.7143


Epoch 3/10:  92%|█████████▏| 15501/16913 [3:48:31<20:47,  1.13it/s, loss=7.62]

  [Batch 15500/16913] Loss: 7.6162


Epoch 3/10:  92%|█████████▏| 15551/16913 [3:49:15<20:02,  1.13it/s, loss=7.77]

  [Batch 15550/16913] Loss: 7.7744


Epoch 3/10:  92%|█████████▏| 15601/16913 [3:50:00<19:19,  1.13it/s, loss=7.62]

  [Batch 15600/16913] Loss: 7.6186


Epoch 3/10:  93%|█████████▎| 15651/16913 [3:50:44<18:36,  1.13it/s, loss=7.72]

  [Batch 15650/16913] Loss: 7.7180


Epoch 3/10:  93%|█████████▎| 15701/16913 [3:51:28<17:50,  1.13it/s, loss=7.56]

  [Batch 15700/16913] Loss: 7.5579


Epoch 3/10:  93%|█████████▎| 15751/16913 [3:52:12<17:06,  1.13it/s, loss=7.86]

  [Batch 15750/16913] Loss: 7.8614


Epoch 3/10:  93%|█████████▎| 15801/16913 [3:52:56<16:23,  1.13it/s, loss=7.75]

  [Batch 15800/16913] Loss: 7.7497


Epoch 3/10:  94%|█████████▎| 15851/16913 [3:53:41<15:39,  1.13it/s, loss=7.66]

  [Batch 15850/16913] Loss: 7.6638


Epoch 3/10:  94%|█████████▍| 15901/16913 [3:54:25<14:54,  1.13it/s, loss=7.45]

  [Batch 15900/16913] Loss: 7.4526


Epoch 3/10:  94%|█████████▍| 15930/16913 [3:54:50<14:30,  1.13it/s, loss=7.79]

In [15]:
print(len(train_loader))

16913
