In [2]:
# https://www.kaggle.com/datasets/dhruvildave/en-fr-translation-dataset

from typing import NamedTuple, Generator
import pandas as pd
import torch
import torch.nn.functional as F
from torch import Tensor, optim, nn
from tqdm import tqdm
from transformers import AutoTokenizer
from src.transformer import Transformer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
class PairedSentences(NamedTuple):
    fr: str
    en: str

class ListPairedSentences(NamedTuple):
    fr: list[str]
    en: list[str]

    def __getitem__(self, index: int) -> PairedSentences:
        return PairedSentences(self.fr[index], self.en[index])

class TrainingBatch(NamedTuple):
    input_ids: Tensor
    """In our case french"""
    
    encoder_mask: Tensor
    """basically padding tokens"""
    
    output_ids: Tensor
    """In our case english"""
    
    decoder_mask: Tensor
    """Padding tokens, don't forget to add causal mask during training"""

    def __repr__(self):
        return f"TrainingBatch(x.shape={self.input_ids.shape}, y.shape={self.output_ids.shape})"

In [14]:
class Processor:
    def __init__(self, sequence_length: int, tokenizer_name: str) -> None:
        self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self._seq_length = sequence_length
        
    @property
    def tokenizer(self) -> AutoTokenizer:
        return self._tokenizer

    @property
    def sequence_length(self) -> int:
        return self._seq_length

    @property
    def vocab_size(self) -> int:
        return self._tokenizer.vocab_size

    def tokenize(self, text: str, padding: str="max_length", truncation:bool=True, extra: int = 0):
        return self._tokenizer(
            text,
            return_tensors="pt",
            max_length=self._seq_length + extra,
            padding=padding,
            truncation=truncation)

    def decode(self, token_ids: Tensor, **kwargs) -> str:
        return self._tokenizer.decode(token_ids, **kwargs)

    def make_batch(self, paired_sentences: ListPairedSentences, dtype=torch.float32) -> TrainingBatch:
        # Tokenize each sentence in the 'fr' and 'en' lists
        fr_sentences = [self.tokenize(sentence) for sentence in paired_sentences.fr]
        en_sentences = [self.tokenize(sentence, extra=1) for sentence in paired_sentences.en]

        # Stack tokenized tensors for batching
        X_batch = torch.stack([x['input_ids'].squeeze(0) for x in fr_sentences])
        Y_batch = torch.stack([y['input_ids'].squeeze(0) for y in en_sentences])

        # Create encoder and decoder padding mask: 1 for real tokens, 0 for padding
        encoder_mask = torch.stack([x['attention_mask'].squeeze(0) for x in fr_sentences]) \
            .unsqueeze(1).unsqueeze(2)
        decoder_mask = torch.stack([y['attention_mask'].squeeze(0) for y in en_sentences]) \
            .unsqueeze(1).unsqueeze(2)

        return TrainingBatch(
            input_ids=X_batch,
            output_ids=Y_batch,
            encoder_mask=encoder_mask.to(dtype),
            decoder_mask=decoder_mask.to(dtype))

def get_first_masked_token(mask: torch.Tensor) -> Tensor:
    squeezed_mask = mask.squeeze(1).squeeze(1) # mask is shaped (bs, 1, 1, sequence_length)
    first_masked_indices = (squeezed_mask == 0).int().argmax(dim=1)
    first_masked_indices[squeezed_mask.sum(dim=1) == squeezed_mask.size(1)] = squeezed_mask.size(1)
    return first_masked_indices.to(dtype=torch.int32)

def mask_last_token(current_mask: torch.Tensor) -> torch.Tensor:
    first_masked_indices = get_first_masked_token(current_mask) # get the index of first masked token
    last_token_indices = torch.clamp(first_masked_indices - 1, min=0) # to avoid negative indices
    current_mask[torch.arange(current_mask.size(0)), 0, 0, last_token_indices] = 0 # set the last 1 token to 0
    return current_mask

In [15]:
def get_page(csv_path: str, page: int, rows_per_page: int):
    return pd.read_csv(csv_path, skiprows = 1 + page * rows_per_page, nrows=rows_per_page, header=None, names=["en", "fr"])

def make_generator(csv_path: str, rows_per_page: int) -> Generator[ListPairedSentences, None, None]:
    i = 0
    while True:
        page = get_page(csv_path, i, rows_per_page)
        fr_sentences = page["fr"].to_list()
        en_sentences = page["en"].to_list()
        yield ListPairedSentences(fr_sentences, en_sentences)
        i += 1

In [25]:
dtype=torch.bfloat16
csv_path = "archive/en-fr.csv"
processor = Processor(200, "bert-base-uncased")
model = Transformer(vocab_size=processor.vocab_size, max_sequence_len=processor.sequence_length, d_model=256).to(DEVICE).to(dtype)

# opt and loss
learning_rate = 1e-4
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss(ignore_index=processor.tokenizer.pad_token_id).to(DEVICE)

# training loop qty
num_epochs = 500
batch_size = 32
break_at = 10
# get_num_steps(csv_path, batch_size) running this function is too long ... just use the cached value
num_steps = 22520376 // batch_size



In [26]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    model.to(DEVICE)
    epoch_loss = 0.0
    loop_error = 0.0

    # Reset generator at the start of each epoch
    progress_bar = tqdm(make_generator(csv_path, batch_size), total=num_steps)
    progress_bar.set_description(f"Epoch {epoch + 1}")

    for step, raw_batch in enumerate(progress_bar, start=1):
        try:
            training_batch = processor.make_batch(raw_batch, dtype)        # Converts batch to `TrainingBatch` format
            input_ids = training_batch.input_ids.to(DEVICE)            # Encoder input (source sentence)
            target_ids = training_batch.output_ids.to(DEVICE)          # Decoder target sequence
            encoder_mask = training_batch.encoder_mask.to(DEVICE)   # Mask for encoder

            # adjust decoder input
            decoder_input_ids = target_ids[:, :-1]
            target_ids_flat = target_ids[:, 1:].contiguous().view(-1)

            # adjust decoder mask too
            decoder_mask = training_batch.decoder_mask[:, :, :, :-1].to(DEVICE)
            
            # We need to create a causal mask too
            seq_len = decoder_input_ids.shape[1]
            causal_mask = torch.tril(torch.ones((seq_len, seq_len))).to(DEVICE).to(dtype)
            
            # final decoder mask as prod of padding * causal
            final_decoder_mask = decoder_mask * causal_mask.unsqueeze(0)

        except Exception as e:
            print(f"Error in batch {step}: {e}")
            loop_error += 1
            continue
        
        # change learning rate
        # new_lr = 512 ** (-1.5) * (epoch * 1000 + step + 1) ** (-0.5)
        # for param_group in optimizer.param_groups:
        #     param_group['lr'] = new_lr

        # Forward pass
        optimizer.zero_grad()
        output_probs = model(
            input_ids,
            decoder_input_ids,
            encoder_mask=encoder_mask,
            decoder_mask=final_decoder_mask)

        # flatten target and outputprobs to compute cce loss
        output_probs_flat = output_probs.view(-1, output_probs.size(-1))

        # Calculate the loss
        loss = loss_fn(output_probs_flat, target_ids_flat)
        loss.backward()
        optimizer.step()

        # Track loss
        epoch_loss += loss.item()

        # Optionally, print progress
        progress_bar.set_postfix_str(f"lr : {learning_rate:.4f} ; "
                                     f"epoch loss : {epoch_loss / step:.4f} ; "
                                     f"loop error : {loop_error}")
        
        
        if step == break_at: # train only on a subset for now
            break

    # with torch.no_grad():
    #     sample_output = output_probs.argmax(dim=-1)
    #     print("Predicted tokens:", sample_output[0, :10].squeeze())  
    #     print("Target tokens:   ", target_ids[0, 1:11].squeeze())  

    # Print average loss per epoch
    print(f"Epoch [{epoch+1}/{num_epochs}] completed, Average Loss: {epoch_loss / num_steps:.4f}")

Epoch 1:   0%|          | 9/703761 [00:00<20:24:10,  9.58it/s, lr : 0.0001 ; epoch loss : 9.6438 ; loop error : 0.0] 


Epoch [1/500] completed, Average Loss: 0.0001


Epoch 2:   0%|          | 9/703761 [00:00<19:55:57,  9.81it/s, lr : 0.0001 ; epoch loss : 9.1250 ; loop error : 0.0]


Epoch [2/500] completed, Average Loss: 0.0001


Epoch 3:   0%|          | 9/703761 [00:00<20:11:07,  9.68it/s, lr : 0.0001 ; epoch loss : 8.8250 ; loop error : 0.0]


Epoch [3/500] completed, Average Loss: 0.0001


Epoch 4:   0%|          | 9/703761 [00:00<19:51:37,  9.84it/s, lr : 0.0001 ; epoch loss : 8.5312 ; loop error : 0.0]


Epoch [4/500] completed, Average Loss: 0.0001


Epoch 5:   0%|          | 9/703761 [00:00<19:29:02, 10.03it/s, lr : 0.0001 ; epoch loss : 8.2250 ; loop error : 0.0]


Epoch [5/500] completed, Average Loss: 0.0001


Epoch 6:   0%|          | 9/703761 [00:00<19:43:15,  9.91it/s, lr : 0.0001 ; epoch loss : 7.9406 ; loop error : 0.0]


Epoch [6/500] completed, Average Loss: 0.0001


Epoch 7:   0%|          | 9/703761 [00:00<18:46:35, 10.41it/s, lr : 0.0001 ; epoch loss : 7.6813 ; loop error : 0.0]


Epoch [7/500] completed, Average Loss: 0.0001


Epoch 8:   0%|          | 9/703761 [00:00<19:30:45, 10.02it/s, lr : 0.0001 ; epoch loss : 7.4375 ; loop error : 0.0]


Epoch [8/500] completed, Average Loss: 0.0001


Epoch 9:   0%|          | 9/703761 [00:00<19:55:42,  9.81it/s, lr : 0.0001 ; epoch loss : 7.2219 ; loop error : 0.0]


Epoch [9/500] completed, Average Loss: 0.0001


Epoch 10:   0%|          | 9/703761 [00:00<18:47:02, 10.41it/s, lr : 0.0001 ; epoch loss : 7.0187 ; loop error : 0.0]


Epoch [10/500] completed, Average Loss: 0.0001


Epoch 11:   0%|          | 9/703761 [00:00<19:25:28, 10.06it/s, lr : 0.0001 ; epoch loss : 6.8094 ; loop error : 0.0]


Epoch [11/500] completed, Average Loss: 0.0001


Epoch 12:   0%|          | 9/703761 [00:00<20:39:20,  9.46it/s, lr : 0.0001 ; epoch loss : 6.6219 ; loop error : 0.0]


Epoch [12/500] completed, Average Loss: 0.0001


Epoch 13:   0%|          | 9/703761 [00:00<19:08:35, 10.21it/s, lr : 0.0001 ; epoch loss : 6.4469 ; loop error : 0.0]


Epoch [13/500] completed, Average Loss: 0.0001


Epoch 14:   0%|          | 9/703761 [00:00<19:13:38, 10.17it/s, lr : 0.0001 ; epoch loss : 6.2781 ; loop error : 0.0]


Epoch [14/500] completed, Average Loss: 0.0001


Epoch 15:   0%|          | 9/703761 [00:00<19:59:08,  9.78it/s, lr : 0.0001 ; epoch loss : 6.1188 ; loop error : 0.0]


Epoch [15/500] completed, Average Loss: 0.0001


Epoch 16:   0%|          | 9/703761 [00:00<18:12:57, 10.73it/s, lr : 0.0001 ; epoch loss : 5.9875 ; loop error : 0.0]


Epoch [16/500] completed, Average Loss: 0.0001


Epoch 17:   0%|          | 9/703761 [00:00<18:46:07, 10.42it/s, lr : 0.0001 ; epoch loss : 5.8406 ; loop error : 0.0]


Epoch [17/500] completed, Average Loss: 0.0001


Epoch 18:   0%|          | 9/703761 [00:00<19:32:21, 10.00it/s, lr : 0.0001 ; epoch loss : 5.7094 ; loop error : 0.0]


Epoch [18/500] completed, Average Loss: 0.0001


Epoch 19:   0%|          | 9/703761 [00:00<19:06:57, 10.23it/s, lr : 0.0001 ; epoch loss : 5.6281 ; loop error : 0.0]


Epoch [19/500] completed, Average Loss: 0.0001


Epoch 20:   0%|          | 9/703761 [00:00<19:07:36, 10.22it/s, lr : 0.0001 ; epoch loss : 5.5500 ; loop error : 0.0]


Epoch [20/500] completed, Average Loss: 0.0001


Epoch 21:   0%|          | 9/703761 [00:00<19:58:18,  9.79it/s, lr : 0.0001 ; epoch loss : 5.4500 ; loop error : 0.0]


Epoch [21/500] completed, Average Loss: 0.0001


Epoch 22:   0%|          | 9/703761 [00:00<19:51:12,  9.85it/s, lr : 0.0001 ; epoch loss : 5.3844 ; loop error : 0.0]


Epoch [22/500] completed, Average Loss: 0.0001


Epoch 23:   0%|          | 9/703761 [00:00<19:15:40, 10.15it/s, lr : 0.0001 ; epoch loss : 5.2719 ; loop error : 0.0]


Epoch [23/500] completed, Average Loss: 0.0001


Epoch 24:   0%|          | 9/703761 [00:00<20:37:29,  9.48it/s, lr : 0.0001 ; epoch loss : 5.1813 ; loop error : 0.0]


Epoch [24/500] completed, Average Loss: 0.0001


Epoch 25:   0%|          | 9/703761 [00:00<18:46:35, 10.41it/s, lr : 0.0001 ; epoch loss : 5.1063 ; loop error : 0.0]


Epoch [25/500] completed, Average Loss: 0.0001


Epoch 26:   0%|          | 9/703761 [00:00<19:02:40, 10.26it/s, lr : 0.0001 ; epoch loss : 5.0531 ; loop error : 0.0]


Epoch [26/500] completed, Average Loss: 0.0001


Epoch 27:   0%|          | 9/703761 [00:00<19:20:22, 10.11it/s, lr : 0.0001 ; epoch loss : 4.9781 ; loop error : 0.0]


Epoch [27/500] completed, Average Loss: 0.0001


Epoch 28:   0%|          | 9/703761 [00:00<18:50:25, 10.38it/s, lr : 0.0001 ; epoch loss : 4.8953 ; loop error : 0.0]


Epoch [28/500] completed, Average Loss: 0.0001


Epoch 29:   0%|          | 9/703761 [00:00<16:55:34, 11.55it/s, lr : 0.0001 ; epoch loss : 4.8187 ; loop error : 0.0]


Epoch [29/500] completed, Average Loss: 0.0001


Epoch 30:   0%|          | 9/703761 [00:00<17:52:55, 10.93it/s, lr : 0.0001 ; epoch loss : 4.7562 ; loop error : 0.0]


Epoch [30/500] completed, Average Loss: 0.0001


Epoch 31:   0%|          | 9/703761 [00:00<19:30:05, 10.02it/s, lr : 0.0001 ; epoch loss : 4.7234 ; loop error : 0.0]


Epoch [31/500] completed, Average Loss: 0.0001


Epoch 32:   0%|          | 9/703761 [00:00<18:44:03, 10.43it/s, lr : 0.0001 ; epoch loss : 4.6531 ; loop error : 0.0]


Epoch [32/500] completed, Average Loss: 0.0001


Epoch 33:   0%|          | 9/703761 [00:00<19:39:07,  9.95it/s, lr : 0.0001 ; epoch loss : 4.6203 ; loop error : 0.0]


Epoch [33/500] completed, Average Loss: 0.0001


Epoch 34:   0%|          | 9/703761 [00:00<20:21:36,  9.60it/s, lr : 0.0001 ; epoch loss : 4.5891 ; loop error : 0.0]


Epoch [34/500] completed, Average Loss: 0.0001


Epoch 35:   0%|          | 9/703761 [00:00<21:01:45,  9.30it/s, lr : 0.0001 ; epoch loss : 4.5266 ; loop error : 0.0]


Epoch [35/500] completed, Average Loss: 0.0001


Epoch 36:   0%|          | 9/703761 [00:00<19:32:11, 10.01it/s, lr : 0.0001 ; epoch loss : 4.4859 ; loop error : 0.0]


Epoch [36/500] completed, Average Loss: 0.0001


Epoch 37:   0%|          | 9/703761 [00:00<19:31:43, 10.01it/s, lr : 0.0001 ; epoch loss : 4.4813 ; loop error : 0.0]


Epoch [37/500] completed, Average Loss: 0.0001


Epoch 38:   0%|          | 9/703761 [00:00<20:32:13,  9.52it/s, lr : 0.0001 ; epoch loss : 4.4078 ; loop error : 0.0]


Epoch [38/500] completed, Average Loss: 0.0001


Epoch 39:   0%|          | 9/703761 [00:00<19:57:49,  9.79it/s, lr : 0.0001 ; epoch loss : 4.3875 ; loop error : 0.0]


Epoch [39/500] completed, Average Loss: 0.0001


Epoch 40:   0%|          | 9/703761 [00:00<19:33:09, 10.00it/s, lr : 0.0001 ; epoch loss : 4.3438 ; loop error : 0.0]


Epoch [40/500] completed, Average Loss: 0.0001


Epoch 41:   0%|          | 9/703761 [00:00<19:57:33,  9.79it/s, lr : 0.0001 ; epoch loss : 4.3094 ; loop error : 0.0]


Epoch [41/500] completed, Average Loss: 0.0001


Epoch 42:   0%|          | 9/703761 [00:00<18:29:49, 10.57it/s, lr : 0.0001 ; epoch loss : 4.2781 ; loop error : 0.0]


Epoch [42/500] completed, Average Loss: 0.0001


Epoch 43:   0%|          | 9/703761 [00:00<18:46:48, 10.41it/s, lr : 0.0001 ; epoch loss : 4.2094 ; loop error : 0.0]


Epoch [43/500] completed, Average Loss: 0.0001


Epoch 44:   0%|          | 9/703761 [00:00<18:14:57, 10.71it/s, lr : 0.0001 ; epoch loss : 4.1641 ; loop error : 0.0]


Epoch [44/500] completed, Average Loss: 0.0001


Epoch 45:   0%|          | 9/703761 [00:00<18:31:17, 10.55it/s, lr : 0.0001 ; epoch loss : 4.1328 ; loop error : 0.0]


Epoch [45/500] completed, Average Loss: 0.0001


Epoch 46:   0%|          | 9/703761 [00:00<18:26:58, 10.60it/s, lr : 0.0001 ; epoch loss : 4.1250 ; loop error : 0.0]


Epoch [46/500] completed, Average Loss: 0.0001


Epoch 47:   0%|          | 9/703761 [00:00<19:12:09, 10.18it/s, lr : 0.0001 ; epoch loss : 4.1312 ; loop error : 0.0]


Epoch [47/500] completed, Average Loss: 0.0001


Epoch 48:   0%|          | 9/703761 [00:00<19:15:40, 10.15it/s, lr : 0.0001 ; epoch loss : 4.0406 ; loop error : 0.0]


Epoch [48/500] completed, Average Loss: 0.0001


Epoch 49:   0%|          | 9/703761 [00:00<19:22:25, 10.09it/s, lr : 0.0001 ; epoch loss : 4.0219 ; loop error : 0.0]


Epoch [49/500] completed, Average Loss: 0.0001


Epoch 50:   0%|          | 9/703761 [00:00<19:09:35, 10.20it/s, lr : 0.0001 ; epoch loss : 3.9609 ; loop error : 0.0]


Epoch [50/500] completed, Average Loss: 0.0001


Epoch 51:   0%|          | 9/703761 [00:00<19:17:17, 10.13it/s, lr : 0.0001 ; epoch loss : 3.9203 ; loop error : 0.0]


Epoch [51/500] completed, Average Loss: 0.0001


Epoch 52:   0%|          | 9/703761 [00:00<19:19:41, 10.11it/s, lr : 0.0001 ; epoch loss : 3.8953 ; loop error : 0.0]


Epoch [52/500] completed, Average Loss: 0.0001


Epoch 53:   0%|          | 9/703761 [00:00<19:17:36, 10.13it/s, lr : 0.0001 ; epoch loss : 3.8453 ; loop error : 0.0]


Epoch [53/500] completed, Average Loss: 0.0001


Epoch 54:   0%|          | 9/703761 [00:00<19:37:02,  9.96it/s, lr : 0.0001 ; epoch loss : 3.8188 ; loop error : 0.0]


Epoch [54/500] completed, Average Loss: 0.0001


Epoch 55:   0%|          | 9/703761 [00:00<18:56:17, 10.32it/s, lr : 0.0001 ; epoch loss : 3.8031 ; loop error : 0.0]


Epoch [55/500] completed, Average Loss: 0.0001


Epoch 56:   0%|          | 9/703761 [00:00<19:20:09, 10.11it/s, lr : 0.0001 ; epoch loss : 3.7500 ; loop error : 0.0]


Epoch [56/500] completed, Average Loss: 0.0001


Epoch 57:   0%|          | 9/703761 [00:00<18:34:43, 10.52it/s, lr : 0.0001 ; epoch loss : 3.7422 ; loop error : 0.0]


Epoch [57/500] completed, Average Loss: 0.0001


Epoch 58:   0%|          | 9/703761 [00:00<19:28:10, 10.04it/s, lr : 0.0001 ; epoch loss : 3.7516 ; loop error : 0.0]


Epoch [58/500] completed, Average Loss: 0.0001


Epoch 59:   0%|          | 9/703761 [00:00<17:33:41, 11.13it/s, lr : 0.0001 ; epoch loss : 3.7547 ; loop error : 0.0]


Epoch [59/500] completed, Average Loss: 0.0001


Epoch 60:   0%|          | 9/703761 [00:00<17:00:21, 11.50it/s, lr : 0.0001 ; epoch loss : 3.7391 ; loop error : 0.0]


Epoch [60/500] completed, Average Loss: 0.0001


Epoch 61:   0%|          | 9/703761 [00:00<18:37:16, 10.50it/s, lr : 0.0001 ; epoch loss : 3.6406 ; loop error : 0.0]


Epoch [61/500] completed, Average Loss: 0.0001


Epoch 62:   0%|          | 9/703761 [00:00<19:56:17,  9.80it/s, lr : 0.0001 ; epoch loss : 3.5703 ; loop error : 0.0]


Epoch [62/500] completed, Average Loss: 0.0001


Epoch 63:   0%|          | 9/703761 [00:00<19:32:26, 10.00it/s, lr : 0.0001 ; epoch loss : 3.5281 ; loop error : 0.0]


Epoch [63/500] completed, Average Loss: 0.0001


Epoch 64:   0%|          | 9/703761 [00:00<19:11:56, 10.18it/s, lr : 0.0001 ; epoch loss : 3.5063 ; loop error : 0.0]


Epoch [64/500] completed, Average Loss: 0.0000


Epoch 65:   0%|          | 9/703761 [00:00<18:25:40, 10.61it/s, lr : 0.0001 ; epoch loss : 3.5047 ; loop error : 0.0]


Epoch [65/500] completed, Average Loss: 0.0000


Epoch 66:   0%|          | 9/703761 [00:00<20:35:00,  9.50it/s, lr : 0.0001 ; epoch loss : 3.5297 ; loop error : 0.0]


Epoch [66/500] completed, Average Loss: 0.0001


Epoch 67:   0%|          | 9/703761 [00:00<18:23:05, 10.63it/s, lr : 0.0001 ; epoch loss : 3.5344 ; loop error : 0.0]


Epoch [67/500] completed, Average Loss: 0.0001


Epoch 68:   0%|          | 9/703761 [00:00<17:35:41, 11.11it/s, lr : 0.0001 ; epoch loss : 3.5047 ; loop error : 0.0]


Epoch [68/500] completed, Average Loss: 0.0000


Epoch 69:   0%|          | 9/703761 [00:00<18:09:59, 10.76it/s, lr : 0.0001 ; epoch loss : 3.4531 ; loop error : 0.0]


Epoch [69/500] completed, Average Loss: 0.0000


Epoch 70:   0%|          | 9/703761 [00:00<19:06:30, 10.23it/s, lr : 0.0001 ; epoch loss : 3.4000 ; loop error : 0.0]


Epoch [70/500] completed, Average Loss: 0.0000


Epoch 71:   0%|          | 9/703761 [00:00<18:37:43, 10.49it/s, lr : 0.0001 ; epoch loss : 3.3594 ; loop error : 0.0]


Epoch [71/500] completed, Average Loss: 0.0000


Epoch 72:   0%|          | 9/703761 [00:00<18:51:32, 10.37it/s, lr : 0.0001 ; epoch loss : 3.3438 ; loop error : 0.0]


Epoch [72/500] completed, Average Loss: 0.0000


Epoch 73:   0%|          | 9/703761 [00:00<20:21:51,  9.60it/s, lr : 0.0001 ; epoch loss : 3.3188 ; loop error : 0.0]


Epoch [73/500] completed, Average Loss: 0.0000


Epoch 74:   0%|          | 9/703761 [00:00<19:02:22, 10.27it/s, lr : 0.0001 ; epoch loss : 3.2828 ; loop error : 0.0]


Epoch [74/500] completed, Average Loss: 0.0000


Epoch 75:   0%|          | 9/703761 [00:00<20:01:32,  9.76it/s, lr : 0.0001 ; epoch loss : 3.2406 ; loop error : 0.0]


Epoch [75/500] completed, Average Loss: 0.0000


Epoch 76:   0%|          | 9/703761 [00:00<19:26:36, 10.05it/s, lr : 0.0001 ; epoch loss : 3.2031 ; loop error : 0.0]


Epoch [76/500] completed, Average Loss: 0.0000


Epoch 77:   0%|          | 9/703761 [00:00<20:32:30,  9.52it/s, lr : 0.0001 ; epoch loss : 3.1703 ; loop error : 0.0]


Epoch [77/500] completed, Average Loss: 0.0000


Epoch 78:   0%|          | 9/703761 [00:00<19:21:20, 10.10it/s, lr : 0.0001 ; epoch loss : 3.1516 ; loop error : 0.0]


Epoch [78/500] completed, Average Loss: 0.0000


Epoch 79:   0%|          | 9/703761 [00:00<19:37:27,  9.96it/s, lr : 0.0001 ; epoch loss : 3.1422 ; loop error : 0.0]


Epoch [79/500] completed, Average Loss: 0.0000


Epoch 80:   0%|          | 9/703761 [00:00<19:31:48, 10.01it/s, lr : 0.0001 ; epoch loss : 3.1250 ; loop error : 0.0]


Epoch [80/500] completed, Average Loss: 0.0000


Epoch 81:   0%|          | 9/703761 [00:00<19:28:01, 10.04it/s, lr : 0.0001 ; epoch loss : 3.1078 ; loop error : 0.0]


Epoch [81/500] completed, Average Loss: 0.0000


Epoch 82:   0%|          | 9/703761 [00:00<19:20:10, 10.11it/s, lr : 0.0001 ; epoch loss : 3.0922 ; loop error : 0.0]


Epoch [82/500] completed, Average Loss: 0.0000


Epoch 83:   0%|          | 9/703761 [00:00<20:51:11,  9.37it/s, lr : 0.0001 ; epoch loss : 3.0688 ; loop error : 0.0]


Epoch [83/500] completed, Average Loss: 0.0000


Epoch 84:   0%|          | 9/703761 [00:00<20:47:05,  9.41it/s, lr : 0.0001 ; epoch loss : 3.0484 ; loop error : 0.0]


Epoch [84/500] completed, Average Loss: 0.0000


Epoch 85:   0%|          | 9/703761 [00:00<16:59:21, 11.51it/s, lr : 0.0001 ; epoch loss : 3.0297 ; loop error : 0.0]


Epoch [85/500] completed, Average Loss: 0.0000


Epoch 86:   0%|          | 9/703761 [00:00<19:55:40,  9.81it/s, lr : 0.0001 ; epoch loss : 3.0156 ; loop error : 0.0]


Epoch [86/500] completed, Average Loss: 0.0000


Epoch 87:   0%|          | 9/703761 [00:00<19:07:47, 10.22it/s, lr : 0.0001 ; epoch loss : 2.9937 ; loop error : 0.0]


Epoch [87/500] completed, Average Loss: 0.0000


Epoch 88:   0%|          | 9/703761 [00:00<19:26:58, 10.05it/s, lr : 0.0001 ; epoch loss : 2.9813 ; loop error : 0.0]


Epoch [88/500] completed, Average Loss: 0.0000


Epoch 89:   0%|          | 9/703761 [00:00<18:39:10, 10.48it/s, lr : 0.0001 ; epoch loss : 2.9609 ; loop error : 0.0]


Epoch [89/500] completed, Average Loss: 0.0000


Epoch 90:   0%|          | 9/703761 [00:00<19:51:53,  9.84it/s, lr : 0.0001 ; epoch loss : 2.9453 ; loop error : 0.0]


Epoch [90/500] completed, Average Loss: 0.0000


Epoch 91:   0%|          | 9/703761 [00:00<19:12:24, 10.18it/s, lr : 0.0001 ; epoch loss : 2.9281 ; loop error : 0.0]


Epoch [91/500] completed, Average Loss: 0.0000


Epoch 92:   0%|          | 9/703761 [00:00<19:39:23,  9.95it/s, lr : 0.0001 ; epoch loss : 2.9156 ; loop error : 0.0]


Epoch [92/500] completed, Average Loss: 0.0000


Epoch 93:   0%|          | 9/703761 [00:00<19:06:27, 10.23it/s, lr : 0.0001 ; epoch loss : 2.9094 ; loop error : 0.0]


Epoch [93/500] completed, Average Loss: 0.0000


Epoch 94:   0%|          | 9/703761 [00:00<18:28:17, 10.58it/s, lr : 0.0001 ; epoch loss : 2.8953 ; loop error : 0.0]


Epoch [94/500] completed, Average Loss: 0.0000


Epoch 95:   0%|          | 9/703761 [00:00<20:24:22,  9.58it/s, lr : 0.0001 ; epoch loss : 2.8922 ; loop error : 0.0]


Epoch [95/500] completed, Average Loss: 0.0000


Epoch 96:   0%|          | 9/703761 [00:00<18:41:12, 10.46it/s, lr : 0.0001 ; epoch loss : 2.8813 ; loop error : 0.0]


Epoch [96/500] completed, Average Loss: 0.0000


Epoch 97:   0%|          | 9/703761 [00:00<20:30:25,  9.53it/s, lr : 0.0001 ; epoch loss : 2.8734 ; loop error : 0.0]


Epoch [97/500] completed, Average Loss: 0.0000


Epoch 98:   0%|          | 9/703761 [00:00<21:19:48,  9.16it/s, lr : 0.0001 ; epoch loss : 2.8625 ; loop error : 0.0]


Epoch [98/500] completed, Average Loss: 0.0000


Epoch 99:   0%|          | 9/703761 [00:00<18:51:18, 10.37it/s, lr : 0.0001 ; epoch loss : 2.8531 ; loop error : 0.0]


Epoch [99/500] completed, Average Loss: 0.0000


Epoch 100:   0%|          | 9/703761 [00:00<18:47:18, 10.40it/s, lr : 0.0001 ; epoch loss : 2.8453 ; loop error : 0.0]


Epoch [100/500] completed, Average Loss: 0.0000


Epoch 101:   0%|          | 9/703761 [00:00<19:01:20, 10.28it/s, lr : 0.0001 ; epoch loss : 2.8359 ; loop error : 0.0]


Epoch [101/500] completed, Average Loss: 0.0000


Epoch 102:   0%|          | 9/703761 [00:00<19:35:04,  9.98it/s, lr : 0.0001 ; epoch loss : 2.8266 ; loop error : 0.0]


Epoch [102/500] completed, Average Loss: 0.0000


Epoch 103:   0%|          | 9/703761 [00:00<19:53:34,  9.83it/s, lr : 0.0001 ; epoch loss : 2.8188 ; loop error : 0.0]


Epoch [103/500] completed, Average Loss: 0.0000


Epoch 104:   0%|          | 9/703761 [00:00<20:09:31,  9.70it/s, lr : 0.0001 ; epoch loss : 2.8094 ; loop error : 0.0]


Epoch [104/500] completed, Average Loss: 0.0000


Epoch 105:   0%|          | 9/703761 [00:00<19:47:57,  9.87it/s, lr : 0.0001 ; epoch loss : 2.8016 ; loop error : 0.0]


Epoch [105/500] completed, Average Loss: 0.0000


Epoch 106:   0%|          | 9/703761 [00:00<18:39:52, 10.47it/s, lr : 0.0001 ; epoch loss : 2.7953 ; loop error : 0.0]


Epoch [106/500] completed, Average Loss: 0.0000


Epoch 107:   0%|          | 9/703761 [00:00<20:13:15,  9.67it/s, lr : 0.0001 ; epoch loss : 2.7859 ; loop error : 0.0]


Epoch [107/500] completed, Average Loss: 0.0000


Epoch 108:   0%|          | 9/703761 [00:00<19:38:53,  9.95it/s, lr : 0.0001 ; epoch loss : 2.7812 ; loop error : 0.0]


Epoch [108/500] completed, Average Loss: 0.0000


Epoch 109:   0%|          | 9/703761 [00:00<20:07:06,  9.72it/s, lr : 0.0001 ; epoch loss : 2.7687 ; loop error : 0.0]


Epoch [109/500] completed, Average Loss: 0.0000


Epoch 110:   0%|          | 9/703761 [00:00<19:39:06,  9.95it/s, lr : 0.0001 ; epoch loss : 2.7625 ; loop error : 0.0]


Epoch [110/500] completed, Average Loss: 0.0000


Epoch 111:   0%|          | 9/703761 [00:00<20:12:28,  9.67it/s, lr : 0.0001 ; epoch loss : 2.7555 ; loop error : 0.0]


Epoch [111/500] completed, Average Loss: 0.0000


Epoch 112:   0%|          | 9/703761 [00:00<20:56:16,  9.34it/s, lr : 0.0001 ; epoch loss : 2.7484 ; loop error : 0.0]


Epoch [112/500] completed, Average Loss: 0.0000


Epoch 113:   0%|          | 9/703761 [00:00<20:02:29,  9.75it/s, lr : 0.0001 ; epoch loss : 2.7422 ; loop error : 0.0]


Epoch [113/500] completed, Average Loss: 0.0000


Epoch 114:   0%|          | 9/703761 [00:00<20:52:21,  9.37it/s, lr : 0.0001 ; epoch loss : 2.7336 ; loop error : 0.0]


Epoch [114/500] completed, Average Loss: 0.0000


Epoch 115:   0%|          | 9/703761 [00:00<20:13:41,  9.66it/s, lr : 0.0001 ; epoch loss : 2.7281 ; loop error : 0.0]


Epoch [115/500] completed, Average Loss: 0.0000


Epoch 116:   0%|          | 9/703761 [00:00<19:24:53, 10.07it/s, lr : 0.0001 ; epoch loss : 2.7203 ; loop error : 0.0]


Epoch [116/500] completed, Average Loss: 0.0000


Epoch 117:   0%|          | 9/703761 [00:00<18:43:35, 10.44it/s, lr : 0.0001 ; epoch loss : 2.7133 ; loop error : 0.0]


Epoch [117/500] completed, Average Loss: 0.0000


Epoch 118:   0%|          | 9/703761 [00:00<18:55:59, 10.33it/s, lr : 0.0001 ; epoch loss : 2.7047 ; loop error : 0.0]


Epoch [118/500] completed, Average Loss: 0.0000


Epoch 119:   0%|          | 9/703761 [00:00<21:13:36,  9.21it/s, lr : 0.0001 ; epoch loss : 2.6984 ; loop error : 0.0]


Epoch [119/500] completed, Average Loss: 0.0000


Epoch 120:   0%|          | 9/703761 [00:00<19:49:35,  9.86it/s, lr : 0.0001 ; epoch loss : 2.6898 ; loop error : 0.0]


Epoch [120/500] completed, Average Loss: 0.0000


Epoch 121:   0%|          | 9/703761 [00:00<19:50:55,  9.85it/s, lr : 0.0001 ; epoch loss : 2.6859 ; loop error : 0.0]


Epoch [121/500] completed, Average Loss: 0.0000


Epoch 122:   0%|          | 9/703761 [00:00<19:21:21, 10.10it/s, lr : 0.0001 ; epoch loss : 2.6766 ; loop error : 0.0]


Epoch [122/500] completed, Average Loss: 0.0000


Epoch 123:   0%|          | 9/703761 [00:00<19:51:04,  9.85it/s, lr : 0.0001 ; epoch loss : 2.6727 ; loop error : 0.0]


Epoch [123/500] completed, Average Loss: 0.0000


Epoch 124:   0%|          | 9/703761 [00:00<19:02:53, 10.26it/s, lr : 0.0001 ; epoch loss : 2.6680 ; loop error : 0.0]


Epoch [124/500] completed, Average Loss: 0.0000


Epoch 125:   0%|          | 9/703761 [00:00<19:45:17,  9.90it/s, lr : 0.0001 ; epoch loss : 2.6609 ; loop error : 0.0]


Epoch [125/500] completed, Average Loss: 0.0000


Epoch 126:   0%|          | 9/703761 [00:00<19:01:06, 10.28it/s, lr : 0.0001 ; epoch loss : 2.6555 ; loop error : 0.0]


Epoch [126/500] completed, Average Loss: 0.0000


Epoch 127:   0%|          | 9/703761 [00:00<19:51:12,  9.85it/s, lr : 0.0001 ; epoch loss : 2.6461 ; loop error : 0.0]


Epoch [127/500] completed, Average Loss: 0.0000


Epoch 128:   0%|          | 9/703761 [00:00<20:39:01,  9.47it/s, lr : 0.0001 ; epoch loss : 2.6391 ; loop error : 0.0]


Epoch [128/500] completed, Average Loss: 0.0000


Epoch 129:   0%|          | 9/703761 [00:00<20:04:24,  9.74it/s, lr : 0.0001 ; epoch loss : 2.6367 ; loop error : 0.0]


Epoch [129/500] completed, Average Loss: 0.0000


Epoch 130:   0%|          | 9/703761 [00:00<19:58:57,  9.78it/s, lr : 0.0001 ; epoch loss : 2.6273 ; loop error : 0.0]


Epoch [130/500] completed, Average Loss: 0.0000


Epoch 131:   0%|          | 9/703761 [00:00<19:40:49,  9.93it/s, lr : 0.0001 ; epoch loss : 2.6234 ; loop error : 0.0]


Epoch [131/500] completed, Average Loss: 0.0000


Epoch 132:   0%|          | 9/703761 [00:00<18:50:07, 10.38it/s, lr : 0.0001 ; epoch loss : 2.6141 ; loop error : 0.0]


Epoch [132/500] completed, Average Loss: 0.0000


Epoch 133:   0%|          | 9/703761 [00:00<19:35:01,  9.98it/s, lr : 0.0001 ; epoch loss : 2.6086 ; loop error : 0.0]


Epoch [133/500] completed, Average Loss: 0.0000


Epoch 134:   0%|          | 9/703761 [00:00<19:51:19,  9.85it/s, lr : 0.0001 ; epoch loss : 2.6047 ; loop error : 0.0]


Epoch [134/500] completed, Average Loss: 0.0000


Epoch 135:   0%|          | 9/703761 [00:00<18:18:50, 10.67it/s, lr : 0.0001 ; epoch loss : 2.6078 ; loop error : 0.0]


Epoch [135/500] completed, Average Loss: 0.0000


Epoch 136:   0%|          | 9/703761 [00:00<20:00:54,  9.77it/s, lr : 0.0001 ; epoch loss : 2.6008 ; loop error : 0.0]


Epoch [136/500] completed, Average Loss: 0.0000


Epoch 137:   0%|          | 9/703761 [00:00<19:44:56,  9.90it/s, lr : 0.0001 ; epoch loss : 2.6164 ; loop error : 0.0]


Epoch [137/500] completed, Average Loss: 0.0000


Epoch 138:   0%|          | 9/703761 [00:00<19:11:30, 10.19it/s, lr : 0.0001 ; epoch loss : 2.6742 ; loop error : 0.0]


Epoch [138/500] completed, Average Loss: 0.0000


Epoch 139:   0%|          | 9/703761 [00:00<20:02:24,  9.75it/s, lr : 0.0001 ; epoch loss : 2.7289 ; loop error : 0.0]


Epoch [139/500] completed, Average Loss: 0.0000


Epoch 140:   0%|          | 9/703761 [00:00<19:26:29, 10.06it/s, lr : 0.0001 ; epoch loss : 2.7492 ; loop error : 0.0]


Epoch [140/500] completed, Average Loss: 0.0000


Epoch 141:   0%|          | 9/703761 [00:00<19:26:28, 10.06it/s, lr : 0.0001 ; epoch loss : 2.7258 ; loop error : 0.0]


Epoch [141/500] completed, Average Loss: 0.0000


Epoch 142:   0%|          | 9/703761 [00:00<19:30:19, 10.02it/s, lr : 0.0001 ; epoch loss : 2.6898 ; loop error : 0.0]


Epoch [142/500] completed, Average Loss: 0.0000


Epoch 143:   0%|          | 9/703761 [00:00<19:26:26, 10.06it/s, lr : 0.0001 ; epoch loss : 2.6703 ; loop error : 0.0]


Epoch [143/500] completed, Average Loss: 0.0000


Epoch 144:   0%|          | 9/703761 [00:00<19:26:24, 10.06it/s, lr : 0.0001 ; epoch loss : 2.6297 ; loop error : 0.0]


Epoch [144/500] completed, Average Loss: 0.0000


Epoch 145:   0%|          | 9/703761 [00:00<19:51:43,  9.84it/s, lr : 0.0001 ; epoch loss : 2.5977 ; loop error : 0.0]


Epoch [145/500] completed, Average Loss: 0.0000


Epoch 146:   0%|          | 9/703761 [00:00<20:07:04,  9.72it/s, lr : 0.0001 ; epoch loss : 2.5625 ; loop error : 0.0]


Epoch [146/500] completed, Average Loss: 0.0000


Epoch 147:   0%|          | 9/703761 [00:00<19:10:43, 10.19it/s, lr : 0.0001 ; epoch loss : 2.5430 ; loop error : 0.0]


Epoch [147/500] completed, Average Loss: 0.0000


Epoch 148:   0%|          | 9/703761 [00:00<19:32:55, 10.00it/s, lr : 0.0001 ; epoch loss : 2.5195 ; loop error : 0.0]


Epoch [148/500] completed, Average Loss: 0.0000


Epoch 149:   0%|          | 9/703761 [00:00<18:46:30, 10.41it/s, lr : 0.0001 ; epoch loss : 2.5148 ; loop error : 0.0]


Epoch [149/500] completed, Average Loss: 0.0000


Epoch 150:   0%|          | 9/703761 [00:00<19:59:28,  9.78it/s, lr : 0.0001 ; epoch loss : 2.5094 ; loop error : 0.0]


Epoch [150/500] completed, Average Loss: 0.0000


Epoch 151:   0%|          | 9/703761 [00:00<19:46:16,  9.89it/s, lr : 0.0001 ; epoch loss : 2.5102 ; loop error : 0.0]


Epoch [151/500] completed, Average Loss: 0.0000


Epoch 152:   0%|          | 9/703761 [00:00<18:35:12, 10.52it/s, lr : 0.0001 ; epoch loss : 2.5070 ; loop error : 0.0]


Epoch [152/500] completed, Average Loss: 0.0000


Epoch 153:   0%|          | 9/703761 [00:00<18:03:12, 10.83it/s, lr : 0.0001 ; epoch loss : 2.4883 ; loop error : 0.0]


Epoch [153/500] completed, Average Loss: 0.0000


Epoch 154:   0%|          | 9/703761 [00:00<17:32:50, 11.14it/s, lr : 0.0001 ; epoch loss : 2.4797 ; loop error : 0.0]


Epoch [154/500] completed, Average Loss: 0.0000


Epoch 155:   0%|          | 9/703761 [00:00<18:38:36, 10.49it/s, lr : 0.0001 ; epoch loss : 2.4570 ; loop error : 0.0]


Epoch [155/500] completed, Average Loss: 0.0000


Epoch 156:   0%|          | 9/703761 [00:00<18:32:26, 10.54it/s, lr : 0.0001 ; epoch loss : 2.4594 ; loop error : 0.0]


Epoch [156/500] completed, Average Loss: 0.0000


Epoch 157:   0%|          | 9/703761 [00:00<17:56:38, 10.89it/s, lr : 0.0001 ; epoch loss : 2.4555 ; loop error : 0.0]


Epoch [157/500] completed, Average Loss: 0.0000


Epoch 158:   0%|          | 9/703761 [00:00<18:31:21, 10.55it/s, lr : 0.0001 ; epoch loss : 2.4227 ; loop error : 0.0]


Epoch [158/500] completed, Average Loss: 0.0000


Epoch 159:   0%|          | 9/703761 [00:00<18:19:17, 10.67it/s, lr : 0.0001 ; epoch loss : 2.4039 ; loop error : 0.0]


Epoch [159/500] completed, Average Loss: 0.0000


Epoch 160:   0%|          | 9/703761 [00:00<17:48:35, 10.98it/s, lr : 0.0001 ; epoch loss : 2.3992 ; loop error : 0.0]


Epoch [160/500] completed, Average Loss: 0.0000


Epoch 161:   0%|          | 9/703761 [00:00<19:38:50,  9.95it/s, lr : 0.0001 ; epoch loss : 2.3898 ; loop error : 0.0]


Epoch [161/500] completed, Average Loss: 0.0000


Epoch 162:   0%|          | 9/703761 [00:00<18:08:42, 10.77it/s, lr : 0.0001 ; epoch loss : 2.3813 ; loop error : 0.0]


Epoch [162/500] completed, Average Loss: 0.0000


Epoch 163:   0%|          | 9/703761 [00:00<17:54:15, 10.92it/s, lr : 0.0001 ; epoch loss : 2.3484 ; loop error : 0.0]


Epoch [163/500] completed, Average Loss: 0.0000


Epoch 164:   0%|          | 9/703761 [00:00<18:00:58, 10.85it/s, lr : 0.0001 ; epoch loss : 2.3391 ; loop error : 0.0]


Epoch [164/500] completed, Average Loss: 0.0000


Epoch 165:   0%|          | 9/703761 [00:00<18:48:34, 10.39it/s, lr : 0.0001 ; epoch loss : 2.3172 ; loop error : 0.0]


Epoch [165/500] completed, Average Loss: 0.0000


Epoch 166:   0%|          | 9/703761 [00:00<17:32:49, 11.14it/s, lr : 0.0001 ; epoch loss : 2.3047 ; loop error : 0.0]


Epoch [166/500] completed, Average Loss: 0.0000


Epoch 167:   0%|          | 9/703761 [00:00<19:08:39, 10.21it/s, lr : 0.0001 ; epoch loss : 2.3078 ; loop error : 0.0]


Epoch [167/500] completed, Average Loss: 0.0000


Epoch 168:   0%|          | 9/703761 [00:00<18:20:04, 10.66it/s, lr : 0.0001 ; epoch loss : 2.3047 ; loop error : 0.0]


Epoch [168/500] completed, Average Loss: 0.0000


Epoch 169:   0%|          | 9/703761 [00:00<18:43:46, 10.44it/s, lr : 0.0001 ; epoch loss : 2.3156 ; loop error : 0.0]


Epoch [169/500] completed, Average Loss: 0.0000


Epoch 170:   0%|          | 9/703761 [00:00<19:12:35, 10.18it/s, lr : 0.0001 ; epoch loss : 2.2992 ; loop error : 0.0]


Epoch [170/500] completed, Average Loss: 0.0000


Epoch 171:   0%|          | 9/703761 [00:00<18:48:45, 10.39it/s, lr : 0.0001 ; epoch loss : 2.2984 ; loop error : 0.0]


Epoch [171/500] completed, Average Loss: 0.0000


Epoch 172:   0%|          | 9/703761 [00:00<17:58:56, 10.87it/s, lr : 0.0001 ; epoch loss : 2.2852 ; loop error : 0.0]


Epoch [172/500] completed, Average Loss: 0.0000


Epoch 173:   0%|          | 9/703761 [00:00<18:28:12, 10.58it/s, lr : 0.0001 ; epoch loss : 2.2734 ; loop error : 0.0]


Epoch [173/500] completed, Average Loss: 0.0000


Epoch 174:   0%|          | 9/703761 [00:00<18:52:19, 10.36it/s, lr : 0.0001 ; epoch loss : 2.2570 ; loop error : 0.0]


Epoch [174/500] completed, Average Loss: 0.0000


Epoch 175:   0%|          | 9/703761 [00:00<18:50:12, 10.38it/s, lr : 0.0001 ; epoch loss : 2.2484 ; loop error : 0.0]


Epoch [175/500] completed, Average Loss: 0.0000


Epoch 176:   0%|          | 9/703761 [00:00<18:08:17, 10.78it/s, lr : 0.0001 ; epoch loss : 2.2289 ; loop error : 0.0]


Epoch [176/500] completed, Average Loss: 0.0000


Epoch 177:   0%|          | 9/703761 [00:00<18:05:45, 10.80it/s, lr : 0.0001 ; epoch loss : 2.1977 ; loop error : 0.0]


Epoch [177/500] completed, Average Loss: 0.0000


Epoch 178:   0%|          | 9/703761 [00:00<18:32:01, 10.55it/s, lr : 0.0001 ; epoch loss : 2.1922 ; loop error : 0.0]


Epoch [178/500] completed, Average Loss: 0.0000


Epoch 179:   0%|          | 9/703761 [00:00<17:57:51, 10.88it/s, lr : 0.0001 ; epoch loss : 2.1875 ; loop error : 0.0]


Epoch [179/500] completed, Average Loss: 0.0000


Epoch 180:   0%|          | 9/703761 [00:00<17:57:23, 10.89it/s, lr : 0.0001 ; epoch loss : 2.1711 ; loop error : 0.0]


Epoch [180/500] completed, Average Loss: 0.0000


Epoch 181:   0%|          | 9/703761 [00:00<18:26:12, 10.60it/s, lr : 0.0001 ; epoch loss : 2.1664 ; loop error : 0.0]


Epoch [181/500] completed, Average Loss: 0.0000


Epoch 182:   0%|          | 9/703761 [00:00<18:21:34, 10.65it/s, lr : 0.0001 ; epoch loss : 2.1703 ; loop error : 0.0]


Epoch [182/500] completed, Average Loss: 0.0000


Epoch 183:   0%|          | 9/703761 [00:00<19:21:34, 10.10it/s, lr : 0.0001 ; epoch loss : 2.1625 ; loop error : 0.0]


Epoch [183/500] completed, Average Loss: 0.0000


Epoch 184:   0%|          | 9/703761 [00:00<19:53:35,  9.83it/s, lr : 0.0001 ; epoch loss : 2.1477 ; loop error : 0.0]


Epoch [184/500] completed, Average Loss: 0.0000


Epoch 185:   0%|          | 9/703761 [00:00<19:22:37, 10.09it/s, lr : 0.0001 ; epoch loss : 2.1297 ; loop error : 0.0]


Epoch [185/500] completed, Average Loss: 0.0000


Epoch 186:   0%|          | 9/703761 [00:00<18:03:34, 10.82it/s, lr : 0.0001 ; epoch loss : 2.1172 ; loop error : 0.0]


Epoch [186/500] completed, Average Loss: 0.0000


Epoch 187:   0%|          | 9/703761 [00:00<17:48:58, 10.97it/s, lr : 0.0001 ; epoch loss : 2.0992 ; loop error : 0.0]


Epoch [187/500] completed, Average Loss: 0.0000


Epoch 188:   0%|          | 9/703761 [00:00<18:54:21, 10.34it/s, lr : 0.0001 ; epoch loss : 2.0922 ; loop error : 0.0]


Epoch [188/500] completed, Average Loss: 0.0000


Epoch 189:   0%|          | 9/703761 [00:00<18:44:20, 10.43it/s, lr : 0.0001 ; epoch loss : 2.1039 ; loop error : 0.0]


Epoch [189/500] completed, Average Loss: 0.0000


Epoch 190:   0%|          | 9/703761 [00:00<18:40:50, 10.46it/s, lr : 0.0001 ; epoch loss : 2.1070 ; loop error : 0.0]


Epoch [190/500] completed, Average Loss: 0.0000


Epoch 191:   0%|          | 9/703761 [00:00<18:28:56, 10.58it/s, lr : 0.0001 ; epoch loss : 2.0859 ; loop error : 0.0]


Epoch [191/500] completed, Average Loss: 0.0000


Epoch 192:   0%|          | 9/703761 [00:00<18:20:00, 10.66it/s, lr : 0.0001 ; epoch loss : 2.0523 ; loop error : 0.0]


Epoch [192/500] completed, Average Loss: 0.0000


Epoch 193:   0%|          | 9/703761 [00:00<17:51:40, 10.94it/s, lr : 0.0001 ; epoch loss : 2.0430 ; loop error : 0.0]


Epoch [193/500] completed, Average Loss: 0.0000


Epoch 194:   0%|          | 9/703761 [00:00<17:54:18, 10.92it/s, lr : 0.0001 ; epoch loss : 2.0344 ; loop error : 0.0]


Epoch [194/500] completed, Average Loss: 0.0000


Epoch 195:   0%|          | 9/703761 [00:00<17:27:31, 11.20it/s, lr : 0.0001 ; epoch loss : 2.0469 ; loop error : 0.0]


Epoch [195/500] completed, Average Loss: 0.0000


Epoch 196:   0%|          | 9/703761 [00:00<18:46:18, 10.41it/s, lr : 0.0001 ; epoch loss : 2.0617 ; loop error : 0.0]


Epoch [196/500] completed, Average Loss: 0.0000


Epoch 197:   0%|          | 9/703761 [00:00<17:54:03, 10.92it/s, lr : 0.0001 ; epoch loss : 2.0531 ; loop error : 0.0]


Epoch [197/500] completed, Average Loss: 0.0000


Epoch 198:   0%|          | 9/703761 [00:00<19:16:41, 10.14it/s, lr : 0.0001 ; epoch loss : 2.0445 ; loop error : 0.0]


Epoch [198/500] completed, Average Loss: 0.0000


Epoch 199:   0%|          | 9/703761 [00:00<17:54:41, 10.91it/s, lr : 0.0001 ; epoch loss : 2.0141 ; loop error : 0.0]


Epoch [199/500] completed, Average Loss: 0.0000


Epoch 200:   0%|          | 9/703761 [00:00<18:54:49, 10.34it/s, lr : 0.0001 ; epoch loss : 2.0219 ; loop error : 0.0]


Epoch [200/500] completed, Average Loss: 0.0000


Epoch 201:   0%|          | 9/703761 [00:00<18:58:24, 10.30it/s, lr : 0.0001 ; epoch loss : 2.0172 ; loop error : 0.0]


Epoch [201/500] completed, Average Loss: 0.0000


Epoch 202:   0%|          | 9/703761 [00:00<18:14:00, 10.72it/s, lr : 0.0001 ; epoch loss : 2.0148 ; loop error : 0.0]


Epoch [202/500] completed, Average Loss: 0.0000


Epoch 203:   0%|          | 9/703761 [00:00<18:12:08, 10.74it/s, lr : 0.0001 ; epoch loss : 2.0000 ; loop error : 0.0]


Epoch [203/500] completed, Average Loss: 0.0000


Epoch 204:   0%|          | 9/703761 [00:00<18:15:01, 10.71it/s, lr : 0.0001 ; epoch loss : 1.9766 ; loop error : 0.0]


Epoch [204/500] completed, Average Loss: 0.0000


Epoch 205:   0%|          | 9/703761 [00:00<18:00:13, 10.86it/s, lr : 0.0001 ; epoch loss : 1.9805 ; loop error : 0.0]


Epoch [205/500] completed, Average Loss: 0.0000


Epoch 206:   0%|          | 9/703761 [00:00<17:51:21, 10.95it/s, lr : 0.0001 ; epoch loss : 1.9641 ; loop error : 0.0]


Epoch [206/500] completed, Average Loss: 0.0000


Epoch 207:   0%|          | 9/703761 [00:00<19:19:59, 10.11it/s, lr : 0.0001 ; epoch loss : 1.9680 ; loop error : 0.0]


Epoch [207/500] completed, Average Loss: 0.0000


Epoch 208:   0%|          | 9/703761 [00:00<17:32:14, 11.15it/s, lr : 0.0001 ; epoch loss : 1.9633 ; loop error : 0.0]


Epoch [208/500] completed, Average Loss: 0.0000


Epoch 209:   0%|          | 9/703761 [00:00<18:16:09, 10.70it/s, lr : 0.0001 ; epoch loss : 1.9625 ; loop error : 0.0]


Epoch [209/500] completed, Average Loss: 0.0000


Epoch 210:   0%|          | 9/703761 [00:00<18:18:22, 10.68it/s, lr : 0.0001 ; epoch loss : 1.9508 ; loop error : 0.0]


Epoch [210/500] completed, Average Loss: 0.0000


Epoch 211:   0%|          | 9/703761 [00:00<16:43:18, 11.69it/s, lr : 0.0001 ; epoch loss : 1.9594 ; loop error : 0.0]


Epoch [211/500] completed, Average Loss: 0.0000


Epoch 212:   0%|          | 9/703761 [00:00<17:36:31, 11.10it/s, lr : 0.0001 ; epoch loss : 1.9734 ; loop error : 0.0]


Epoch [212/500] completed, Average Loss: 0.0000


Epoch 213:   0%|          | 9/703761 [00:00<18:02:41, 10.83it/s, lr : 0.0001 ; epoch loss : 1.9297 ; loop error : 0.0]


Epoch [213/500] completed, Average Loss: 0.0000


Epoch 214:   0%|          | 9/703761 [00:00<18:26:04, 10.60it/s, lr : 0.0001 ; epoch loss : 1.8898 ; loop error : 0.0]


Epoch [214/500] completed, Average Loss: 0.0000


Epoch 215:   0%|          | 9/703761 [00:00<19:00:35, 10.28it/s, lr : 0.0001 ; epoch loss : 1.8836 ; loop error : 0.0]


Epoch [215/500] completed, Average Loss: 0.0000


Epoch 216:   0%|          | 9/703761 [00:00<17:01:55, 11.48it/s, lr : 0.0001 ; epoch loss : 1.8781 ; loop error : 0.0]


Epoch [216/500] completed, Average Loss: 0.0000


Epoch 217:   0%|          | 9/703761 [00:00<16:57:36, 11.53it/s, lr : 0.0001 ; epoch loss : 1.8766 ; loop error : 0.0]


Epoch [217/500] completed, Average Loss: 0.0000


Epoch 218:   0%|          | 9/703761 [00:00<17:39:08, 11.07it/s, lr : 0.0001 ; epoch loss : 1.8570 ; loop error : 0.0]


Epoch [218/500] completed, Average Loss: 0.0000


Epoch 219:   0%|          | 9/703761 [00:00<16:49:31, 11.62it/s, lr : 0.0001 ; epoch loss : 1.8500 ; loop error : 0.0]


Epoch [219/500] completed, Average Loss: 0.0000


Epoch 220:   0%|          | 9/703761 [00:00<17:56:29, 10.90it/s, lr : 0.0001 ; epoch loss : 1.8398 ; loop error : 0.0]


Epoch [220/500] completed, Average Loss: 0.0000


Epoch 221:   0%|          | 9/703761 [00:00<18:55:05, 10.33it/s, lr : 0.0001 ; epoch loss : 1.8383 ; loop error : 0.0]


Epoch [221/500] completed, Average Loss: 0.0000


Epoch 222:   0%|          | 9/703761 [00:00<18:33:06, 10.54it/s, lr : 0.0001 ; epoch loss : 1.8211 ; loop error : 0.0]


Epoch [222/500] completed, Average Loss: 0.0000


Epoch 223:   0%|          | 9/703761 [00:00<18:29:26, 10.57it/s, lr : 0.0001 ; epoch loss : 1.8000 ; loop error : 0.0]


Epoch [223/500] completed, Average Loss: 0.0000


Epoch 224:   0%|          | 9/703761 [00:00<18:45:02, 10.43it/s, lr : 0.0001 ; epoch loss : 1.7977 ; loop error : 0.0]


Epoch [224/500] completed, Average Loss: 0.0000


Epoch 225:   0%|          | 9/703761 [00:00<18:26:20, 10.60it/s, lr : 0.0001 ; epoch loss : 1.7953 ; loop error : 0.0]


Epoch [225/500] completed, Average Loss: 0.0000


Epoch 226:   0%|          | 9/703761 [00:00<18:24:14, 10.62it/s, lr : 0.0001 ; epoch loss : 1.7945 ; loop error : 0.0]


Epoch [226/500] completed, Average Loss: 0.0000


Epoch 227:   0%|          | 9/703761 [00:00<18:36:54, 10.50it/s, lr : 0.0001 ; epoch loss : 1.7977 ; loop error : 0.0]


Epoch [227/500] completed, Average Loss: 0.0000


Epoch 228:   0%|          | 9/703761 [00:00<18:35:18, 10.52it/s, lr : 0.0001 ; epoch loss : 1.8117 ; loop error : 0.0]


Epoch [228/500] completed, Average Loss: 0.0000


Epoch 229:   0%|          | 9/703761 [00:00<18:51:10, 10.37it/s, lr : 0.0001 ; epoch loss : 1.8148 ; loop error : 0.0]


Epoch [229/500] completed, Average Loss: 0.0000


Epoch 230:   0%|          | 9/703761 [00:00<19:21:51, 10.10it/s, lr : 0.0001 ; epoch loss : 1.7898 ; loop error : 0.0]


Epoch [230/500] completed, Average Loss: 0.0000


Epoch 231:   0%|          | 9/703761 [00:00<18:28:00, 10.59it/s, lr : 0.0001 ; epoch loss : 1.7727 ; loop error : 0.0]


Epoch [231/500] completed, Average Loss: 0.0000


Epoch 232:   0%|          | 9/703761 [00:00<18:59:11, 10.30it/s, lr : 0.0001 ; epoch loss : 1.7789 ; loop error : 0.0]


Epoch [232/500] completed, Average Loss: 0.0000


Epoch 233:   0%|          | 9/703761 [00:00<18:26:06, 10.60it/s, lr : 0.0001 ; epoch loss : 1.7844 ; loop error : 0.0]


Epoch [233/500] completed, Average Loss: 0.0000


Epoch 234:   0%|          | 9/703761 [00:00<19:21:46, 10.10it/s, lr : 0.0001 ; epoch loss : 1.7773 ; loop error : 0.0]


Epoch [234/500] completed, Average Loss: 0.0000


Epoch 235:   0%|          | 9/703761 [00:00<18:51:07, 10.37it/s, lr : 0.0001 ; epoch loss : 1.7617 ; loop error : 0.0]


Epoch [235/500] completed, Average Loss: 0.0000


Epoch 236:   0%|          | 9/703761 [00:00<18:19:56, 10.66it/s, lr : 0.0001 ; epoch loss : 1.7477 ; loop error : 0.0]


Epoch [236/500] completed, Average Loss: 0.0000


Epoch 237:   0%|          | 9/703761 [00:00<18:05:03, 10.81it/s, lr : 0.0001 ; epoch loss : 1.7563 ; loop error : 0.0]


Epoch [237/500] completed, Average Loss: 0.0000


Epoch 238:   0%|          | 9/703761 [00:00<17:30:24, 11.17it/s, lr : 0.0001 ; epoch loss : 1.7641 ; loop error : 0.0]


Epoch [238/500] completed, Average Loss: 0.0000


Epoch 239:   0%|          | 9/703761 [00:00<18:19:59, 10.66it/s, lr : 0.0001 ; epoch loss : 1.7711 ; loop error : 0.0]


Epoch [239/500] completed, Average Loss: 0.0000


Epoch 240:   0%|          | 9/703761 [00:00<17:40:44, 11.06it/s, lr : 0.0001 ; epoch loss : 1.7922 ; loop error : 0.0]


Epoch [240/500] completed, Average Loss: 0.0000


Epoch 241:   0%|          | 9/703761 [00:00<18:13:09, 10.73it/s, lr : 0.0001 ; epoch loss : 1.7695 ; loop error : 0.0]


Epoch [241/500] completed, Average Loss: 0.0000


Epoch 242:   0%|          | 9/703761 [00:00<19:40:15,  9.94it/s, lr : 0.0001 ; epoch loss : 1.7812 ; loop error : 0.0]


Epoch [242/500] completed, Average Loss: 0.0000


Epoch 243:   0%|          | 9/703761 [00:00<19:57:04,  9.80it/s, lr : 0.0001 ; epoch loss : 1.7781 ; loop error : 0.0]


Epoch [243/500] completed, Average Loss: 0.0000


Epoch 244:   0%|          | 9/703761 [00:00<19:56:57,  9.80it/s, lr : 0.0001 ; epoch loss : 1.7711 ; loop error : 0.0]


Epoch [244/500] completed, Average Loss: 0.0000


Epoch 245:   0%|          | 9/703761 [00:00<18:25:58, 10.61it/s, lr : 0.0001 ; epoch loss : 1.7766 ; loop error : 0.0]


Epoch [245/500] completed, Average Loss: 0.0000


Epoch 246:   0%|          | 9/703761 [00:00<17:47:51, 10.98it/s, lr : 0.0001 ; epoch loss : 1.7750 ; loop error : 0.0]


Epoch [246/500] completed, Average Loss: 0.0000


Epoch 247:   0%|          | 9/703761 [00:00<18:29:40, 10.57it/s, lr : 0.0001 ; epoch loss : 1.7477 ; loop error : 0.0]


Epoch [247/500] completed, Average Loss: 0.0000


Epoch 248:   0%|          | 9/703761 [00:00<18:20:04, 10.66it/s, lr : 0.0001 ; epoch loss : 1.7328 ; loop error : 0.0]


Epoch [248/500] completed, Average Loss: 0.0000


Epoch 249:   0%|          | 9/703761 [00:00<17:48:14, 10.98it/s, lr : 0.0001 ; epoch loss : 1.6906 ; loop error : 0.0]


Epoch [249/500] completed, Average Loss: 0.0000


Epoch 250:   0%|          | 9/703761 [00:00<17:51:16, 10.95it/s, lr : 0.0001 ; epoch loss : 1.6562 ; loop error : 0.0]


Epoch [250/500] completed, Average Loss: 0.0000


Epoch 251:   0%|          | 9/703761 [00:00<18:18:12, 10.68it/s, lr : 0.0001 ; epoch loss : 1.6320 ; loop error : 0.0]


Epoch [251/500] completed, Average Loss: 0.0000


Epoch 252:   0%|          | 9/703761 [00:00<18:24:09, 10.62it/s, lr : 0.0001 ; epoch loss : 1.6211 ; loop error : 0.0]


Epoch [252/500] completed, Average Loss: 0.0000


Epoch 253:   0%|          | 9/703761 [00:00<18:23:37, 10.63it/s, lr : 0.0001 ; epoch loss : 1.6164 ; loop error : 0.0]


Epoch [253/500] completed, Average Loss: 0.0000


Epoch 254:   0%|          | 9/703761 [00:00<17:35:24, 11.11it/s, lr : 0.0001 ; epoch loss : 1.6133 ; loop error : 0.0]


Epoch [254/500] completed, Average Loss: 0.0000


Epoch 255:   0%|          | 9/703761 [00:00<17:05:40, 11.44it/s, lr : 0.0001 ; epoch loss : 1.6094 ; loop error : 0.0]


Epoch [255/500] completed, Average Loss: 0.0000


Epoch 256:   0%|          | 9/703761 [00:00<18:19:07, 10.67it/s, lr : 0.0001 ; epoch loss : 1.6070 ; loop error : 0.0]


Epoch [256/500] completed, Average Loss: 0.0000


Epoch 257:   0%|          | 9/703761 [00:00<17:40:24, 11.06it/s, lr : 0.0001 ; epoch loss : 1.6055 ; loop error : 0.0]


Epoch [257/500] completed, Average Loss: 0.0000


Epoch 258:   0%|          | 9/703761 [00:00<18:02:51, 10.83it/s, lr : 0.0001 ; epoch loss : 1.6047 ; loop error : 0.0]


Epoch [258/500] completed, Average Loss: 0.0000


Epoch 259:   0%|          | 9/703761 [00:00<18:00:22, 10.86it/s, lr : 0.0001 ; epoch loss : 1.6023 ; loop error : 0.0]


Epoch [259/500] completed, Average Loss: 0.0000


Epoch 260:   0%|          | 9/703761 [00:00<18:08:06, 10.78it/s, lr : 0.0001 ; epoch loss : 1.6000 ; loop error : 0.0]


Epoch [260/500] completed, Average Loss: 0.0000


Epoch 261:   0%|          | 9/703761 [00:00<18:11:50, 10.74it/s, lr : 0.0001 ; epoch loss : 1.5992 ; loop error : 0.0]


Epoch [261/500] completed, Average Loss: 0.0000


Epoch 262:   0%|          | 9/703761 [00:00<18:20:03, 10.66it/s, lr : 0.0001 ; epoch loss : 1.5977 ; loop error : 0.0]


Epoch [262/500] completed, Average Loss: 0.0000


Epoch 263:   0%|          | 9/703761 [00:00<18:26:22, 10.60it/s, lr : 0.0001 ; epoch loss : 1.5961 ; loop error : 0.0]


Epoch [263/500] completed, Average Loss: 0.0000


Epoch 264:   0%|          | 9/703761 [00:00<19:01:09, 10.28it/s, lr : 0.0001 ; epoch loss : 1.5938 ; loop error : 0.0]


Epoch [264/500] completed, Average Loss: 0.0000


Epoch 265:   0%|          | 9/703761 [00:00<18:53:13, 10.35it/s, lr : 0.0001 ; epoch loss : 1.5922 ; loop error : 0.0]


Epoch [265/500] completed, Average Loss: 0.0000


Epoch 266:   0%|          | 9/703761 [00:00<17:47:03, 10.99it/s, lr : 0.0001 ; epoch loss : 1.5906 ; loop error : 0.0]


Epoch [266/500] completed, Average Loss: 0.0000


Epoch 267:   0%|          | 9/703761 [00:00<18:19:34, 10.67it/s, lr : 0.0001 ; epoch loss : 1.5898 ; loop error : 0.0]


Epoch [267/500] completed, Average Loss: 0.0000


Epoch 268:   0%|          | 9/703761 [00:00<18:39:38, 10.48it/s, lr : 0.0001 ; epoch loss : 1.5898 ; loop error : 0.0]


Epoch [268/500] completed, Average Loss: 0.0000


Epoch 269:   0%|          | 9/703761 [00:00<18:21:22, 10.65it/s, lr : 0.0001 ; epoch loss : 1.5875 ; loop error : 0.0]


Epoch [269/500] completed, Average Loss: 0.0000


Epoch 270:   0%|          | 9/703761 [00:00<18:22:40, 10.64it/s, lr : 0.0001 ; epoch loss : 1.5867 ; loop error : 0.0]


Epoch [270/500] completed, Average Loss: 0.0000


Epoch 271:   0%|          | 9/703761 [00:00<17:56:32, 10.90it/s, lr : 0.0001 ; epoch loss : 1.5844 ; loop error : 0.0]


Epoch [271/500] completed, Average Loss: 0.0000


Epoch 272:   0%|          | 9/703761 [00:00<18:51:24, 10.37it/s, lr : 0.0001 ; epoch loss : 1.5836 ; loop error : 0.0]


Epoch [272/500] completed, Average Loss: 0.0000


Epoch 273:   0%|          | 9/703761 [00:00<18:31:22, 10.55it/s, lr : 0.0001 ; epoch loss : 1.5820 ; loop error : 0.0]


Epoch [273/500] completed, Average Loss: 0.0000


Epoch 274:   0%|          | 9/703761 [00:00<17:44:14, 11.02it/s, lr : 0.0001 ; epoch loss : 1.5820 ; loop error : 0.0]


Epoch [274/500] completed, Average Loss: 0.0000


Epoch 275:   0%|          | 9/703761 [00:00<17:13:44, 11.35it/s, lr : 0.0001 ; epoch loss : 1.5813 ; loop error : 0.0]


Epoch [275/500] completed, Average Loss: 0.0000


Epoch 276:   0%|          | 9/703761 [00:00<18:08:09, 10.78it/s, lr : 0.0001 ; epoch loss : 1.5781 ; loop error : 0.0]


Epoch [276/500] completed, Average Loss: 0.0000


Epoch 277:   0%|          | 9/703761 [00:00<17:36:15, 11.10it/s, lr : 0.0001 ; epoch loss : 1.5773 ; loop error : 0.0]


Epoch [277/500] completed, Average Loss: 0.0000


Epoch 278:   0%|          | 9/703761 [00:00<19:14:05, 10.16it/s, lr : 0.0001 ; epoch loss : 1.5766 ; loop error : 0.0]


Epoch [278/500] completed, Average Loss: 0.0000


Epoch 279:   0%|          | 9/703761 [00:00<19:01:13, 10.28it/s, lr : 0.0001 ; epoch loss : 1.5766 ; loop error : 0.0]


Epoch [279/500] completed, Average Loss: 0.0000


Epoch 280:   0%|          | 9/703761 [00:00<19:44:34,  9.90it/s, lr : 0.0001 ; epoch loss : 1.5750 ; loop error : 0.0]


Epoch [280/500] completed, Average Loss: 0.0000


Epoch 281:   0%|          | 9/703761 [00:00<18:19:43, 10.67it/s, lr : 0.0001 ; epoch loss : 1.5750 ; loop error : 0.0]


Epoch [281/500] completed, Average Loss: 0.0000


Epoch 282:   0%|          | 9/703761 [00:00<18:47:09, 10.41it/s, lr : 0.0001 ; epoch loss : 1.5750 ; loop error : 0.0]


Epoch [282/500] completed, Average Loss: 0.0000


Epoch 283:   0%|          | 9/703761 [00:00<18:38:07, 10.49it/s, lr : 0.0001 ; epoch loss : 1.5734 ; loop error : 0.0]


Epoch [283/500] completed, Average Loss: 0.0000


Epoch 284:   0%|          | 9/703761 [00:00<17:58:41, 10.87it/s, lr : 0.0001 ; epoch loss : 1.5711 ; loop error : 0.0]


Epoch [284/500] completed, Average Loss: 0.0000


Epoch 285:   0%|          | 9/703761 [00:00<19:57:02,  9.80it/s, lr : 0.0001 ; epoch loss : 1.5703 ; loop error : 0.0]


Epoch [285/500] completed, Average Loss: 0.0000


Epoch 286:   0%|          | 9/703761 [00:00<17:53:07, 10.93it/s, lr : 0.0001 ; epoch loss : 1.5688 ; loop error : 0.0]


Epoch [286/500] completed, Average Loss: 0.0000


Epoch 287:   0%|          | 9/703761 [00:00<18:32:17, 10.55it/s, lr : 0.0001 ; epoch loss : 1.5680 ; loop error : 0.0]


Epoch [287/500] completed, Average Loss: 0.0000


Epoch 288:   0%|          | 9/703761 [00:00<19:24:16, 10.07it/s, lr : 0.0001 ; epoch loss : 1.5680 ; loop error : 0.0]


Epoch [288/500] completed, Average Loss: 0.0000


Epoch 289:   0%|          | 9/703761 [00:00<17:56:52, 10.89it/s, lr : 0.0001 ; epoch loss : 1.5680 ; loop error : 0.0]


Epoch [289/500] completed, Average Loss: 0.0000


Epoch 290:   0%|          | 9/703761 [00:00<17:57:13, 10.89it/s, lr : 0.0001 ; epoch loss : 1.5672 ; loop error : 0.0]


Epoch [290/500] completed, Average Loss: 0.0000


Epoch 291:   0%|          | 9/703761 [00:00<18:38:31, 10.49it/s, lr : 0.0001 ; epoch loss : 1.5664 ; loop error : 0.0]


Epoch [291/500] completed, Average Loss: 0.0000


Epoch 292:   0%|          | 9/703761 [00:00<18:32:44, 10.54it/s, lr : 0.0001 ; epoch loss : 1.5641 ; loop error : 0.0]


Epoch [292/500] completed, Average Loss: 0.0000


Epoch 293:   0%|          | 9/703761 [00:00<19:50:18,  9.85it/s, lr : 0.0001 ; epoch loss : 1.5633 ; loop error : 0.0]


Epoch [293/500] completed, Average Loss: 0.0000


Epoch 294:   0%|          | 9/703761 [00:00<19:09:21, 10.21it/s, lr : 0.0001 ; epoch loss : 1.5625 ; loop error : 0.0]


Epoch [294/500] completed, Average Loss: 0.0000


Epoch 295:   0%|          | 9/703761 [00:00<16:46:07, 11.66it/s, lr : 0.0001 ; epoch loss : 1.5052 ; loop error : 0.0]


KeyboardInterrupt: 

In [27]:
@torch.no_grad()
def infer(model: nn.Module, 
          tokenerizer: Processor, 
          french_sentence: str, 
          max_length: int | None = None, 
          skip_special_tokens: bool = True) -> str:
    # first model as eval (we don't train here)
    model.eval()
    sequence_length = max_length if max_length else tokenerizer.sequence_length

    # Tokenize the input sequence in french
    tokens = tokenerizer.tokenize(french_sentence)

    # create the encoder input and mask
    encoder_ids = tokens["input_ids"].int().to(DEVICE)
    encoder_mask = tokens["attention_mask"].unsqueeze(0).unsqueeze(0).to(DEVICE)

    # initialize decoder mask and input
    decoder_mask = torch.zeros((1, 1, 1, tokenerizer.sequence_length)).to(DEVICE)
    decoder_mask[0, 0, 0, 0] = 1 # unmask the start of sequence token
    target_ids = torch.zeros((1, tokenerizer.sequence_length)).int().to(DEVICE)
    target_ids[0, 0] = tokenerizer.tokenizer.cls_token_id # start of sequence

    # loop to generate output ids
    for idx in range(1, sequence_length):
        output_probs: Tensor = model(
            encoder_ids,
            target_ids,
            encoder_mask=encoder_mask,
            decoder_mask=None)

        # select next token ID with the highest probability
        next_token_id = output_probs.argmax(dim=-1)[0, idx]
        target_ids[0, idx] = next_token_id
        decoder_mask[0, 0, 0, idx] = 1 # unmask the generated token

        # early stop when encounter sep_token_id
        if next_token_id.item() == tokenerizer.tokenizer.sep_token_id:
            break

    return tokenerizer.decode(target_ids[0], skip_special_tokens=skip_special_tokens)

In [37]:
sentence = make_generator(csv_path, 1)
print(next(sentence))
infer(model, processor, "Wow what the actual fuck ?")

ListPairedSentences(fr=['Il a transformé notre vie | Il a transformé la société | Son fonctionnement | La technologie, moteur du changement Accueil | Concepts | Enseignants | Recherche | Aperçu | Collaborateurs | Web HHCC | Ressources | Commentaires Musée virtuel du Canada'], en=['Changing Lives | Changing Society | How It Works | Technology Drives Change Home | Concepts | Teachers | Search | Overview | Credits | HHCC Web | Reference | Feedback Virtual Museum of Canada Home Page'])


'.......................................................................................................................................................................................................'