# Bytenet
Implementation of the paper Neural Machine Translation in Linear Time (https://arxiv.org/pdf/1610.10099). The Bynet is a CNN based Encoder/Decoder Model used here for Sequence to Sequence Translation. The model is trained on a part of the WMT2014 english to german dataset.

### Data Preprocessing
Data is loaded and tokenized in this step.

In [1]:
# Define Imports
import json

import requests
import torch
import pickle
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from datasets import load_dataset
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from transformers import AutoTokenizer


In [2]:
import requests


class WMT19JSONLoader:
    def __init__(self, file_path, source_lang='de', target_lang='en', max_length=128):
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.max_length = max_length
        # self.tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
        self.file_path = file_path
        self.tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

    def load_json_data(self, file_path):
        """
        Function that loads the downloaded JSON file

        :param file_path:
        :return:
        """
        loaded_data = []
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                try:
                    loaded_data.append(json.loads(line.strip()))
                except json.JSONDecodeError as e:
                    print(f"Error when line is decoded: {e}")
        return loaded_data

    def convert_to_tensor(self, src, trg):
        """
        Checks if source and target are tensor
        If both are not tensor, they are converted to tensors

        :param src:
        :param trg:
        :return:
        """
        if not torch.is_tensor(src):
            src = torch.Tensor(src)
        if not torch.is_tensor(trg):
            trg = torch.tensor(trg, dtype=torch.int32)
        return src, trg

    def extract_source_target(self, load_data):
        """
        Function that extracts out of the downloaded JSON the
        german rows as source and the english rows as targets

        :param load_data:
        :param source_lang:
        :param target_lang:
        :return:
        """
        source_texts = []
        target_texts = []
        for item in load_data:
            if ('row' in item and 'translation' in item['row'] and
                    self.source_lang in item['row']['translation'] and
                    self.target_lang in item['row']['translation']):
                source_texts.append(item['row']['translation'][self.source_lang])
                target_texts.append(item['row']['translation'][self.target_lang])
        return source_texts, target_texts

    def tokenize_texts(self, texts):
        """
        Function for tokenizing the text data
        Uses BERT-Tokenizer as tokenizer model

        :param texts:
        :return:
        """
        tokenized_texts = []
        for text in texts:
            tokens = self.tokenizer(text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")
            
            tokenized_texts.append(tokens['input_ids'].squeeze())
        return tokenized_texts

    def load_and_tokenize(self, json_file_path):
        """
        Function that does the load json data
        and the tokenizing process

        :param json_file_path:
        """
        loaded_data = self.load_json_data(json_file_path)

        source_texts, target_texts = self.extract_source_target(loaded_data)

        # The tokenized source and targets
        # self.tokenizer is a object of type transformers from the Bert model
        # padding="max_length": is used to fill sequence to maximal length
        # truncation = True: Means that the sequence is cutted, if longer than max_length
        # return_tensors="pt": Means that a pytorch tensor is returned
        # the source text is tokenized into smaller elements
        tokenized_source_texts = self.tokenize_texts(source_texts)

        # the target text is tokenized into smaller elements
        tokenized_target_texts = self.tokenize_texts(target_texts)

        #TODO: evetually squeeze as in WMTLoader

        return tokenized_source_texts, tokenized_target_texts


def download_data(offset, length):
    """
    Method for downloading the dataset as JSON
    F.e. if the first 10 rows have to be downloaded, offset has to
    be 0 and length has to be 10

    :param offset: The offset used in the url
    :param length: The length of the selected number of rows in the dataset
    :return:
    """
    url = f"https://datasets-server.huggingface.co/rows?dataset=wmt%2Fwmt19&config=de-en&split=train&offset={offset}&length={length}"
    query_parameters = {"downloadformat": "json"}
    response = requests.get(url, params=query_parameters)
    if response.status_code == 200:
        loaded_data = response.json()
        print(f"Downloading dataset-offset: {offset}")
        return loaded_data['rows']
    else:
        print(f"Error while downloading data: {response.status_code}")
        return []


def save_data_to_json(load_data, file_path):
    """
    Writes data into the JSON object

    :param load_data: The data that has to be writen into file
    :param file_path: The file path where the file has to be saved
    """
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'a', encoding='utf-8') as f:
        for item in load_data:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')


def download_batch_and_save(offset, length, output_file):
    """
    Downloads and saves the batch

    :param offset: The offset which is currently used to download
    :param length: The length is defined with 100
    :param output_file: The name of the file to be saved
    """
    loaded_data = download_data(offset, length)
    save_data_to_json(loaded_data, output_file)


def download_entire_de_en_dataset(batch_size, output_dir, num_workers):
    """
    Downloads the entire WMT19 dataset. Uses a ThreadPoolExecutor for
    faster download of the dataset.

    :param batch_size:
    :param output_dir:
    :param num_workers:
    """
    offset = 0
    output_file = os.path.join(output_dir, 'wmt_19_de_en.json')
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        while True:
            futures.append(executor.submit(download_batch_and_save, offset, batch_size, output_file))
            offset += batch_size
            # if offset >= 34800000:
            # This controls how much of the dataset is actually downloaded
            if offset >= 40000:
                break

        for future in as_completed(futures):
            future.result()


### ByteNet Model
The following Cells implement the necessary parts of the ByteNet model. The model is made up of a number of sets of, each of which contains a number of residual blocks that apply LayerNorm and 1D Convolutions, which is further masked for the Decoder part of the Network.

In [3]:
# Imports for ByteNet
import torch
from torch.utils.data import Dataset
from torch import nn
from torch.nn import init
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
from data.data_loader import WMTLoader, WMT19JSONLoader, download_entire_de_en_dataset


In [4]:
class ResidualBlockReLu(nn.Module):
    """
    Implementation of residual Layer for Bytenet machine translation task.

    :param d: The number of input features.
    :param dilation: The initial dilation rate for the convolution layers.
    """

    def __init__(self, d, dilation, k, decoder=False):
        super(ResidualBlockReLu, self).__init__()
        self.decoder = decoder
        self.layer_norm1 = nn.LayerNorm(128)
        self.reLu1 = nn.ReLU()
        # 2*d -> d
        self.conv1 = nn.Conv1d(d * 2, d, 1)
        self.layer_norm2 = nn.LayerNorm(128)
        self.reLu2 = nn.ReLU()
        # Masked kernel size is k
        # Dilation only used for masked convolution
        # d -> d
        if decoder:
            # Masked convolution basically means all padding on left side
            self.receptive_field = (k - 1) * dilation
            self.conv2 = nn.Conv1d(d, d, k, dilation=dilation)
        else:
            # Padding still needed to keep the size of the input and output the same
            padding = (k - 1) * dilation // 2
            if padding > 0:
                self.conv2 = nn.Conv1d(d, d, k, dilation=dilation, padding=padding)
            else:
                self.conv2 = nn.Conv1d(d, d, k, dilation=dilation)
        self.layer_norm3 = nn.LayerNorm(128)
        self.reLu3 = nn.ReLU()
        # d -> 2*d
        self.conv3 = nn.Conv1d(d, d * 2, 1)

    def forward(self, x):
        residual = x
        x = self.layer_norm1(x)
        x = self.reLu1(x)
        x = self.conv1(x)
        x = self.layer_norm2(x)
        x = self.reLu2(x)
        # When Decoder is used, the convolution is causal
        if self.decoder and self.receptive_field > 0:
            x = F.pad(x, (self.receptive_field, 0))
        x = self.conv2(x)
        x = self.layer_norm3(x)
        x = self.reLu3(x)
        x = self.conv3(x)
        # Add back the residual
        x += residual
        return x


In [5]:
class BytenetEncoder(nn.Module):
    """
    Implementation of the ByteNet Encoder. Default Parameters are set to the ones used in the paper.
    
    :param kernel_size: The kernel size for the unmasked (padded) convolution in the residual block.
    :param max_dilation_rate: The maximum dilation rate for the convolution layers.
    :param masked_kernel_size: The kernel size for the masked convolution in the residual block (only interesting for decoder).
    :param num_sets: The number of sets of residual blocks.
    :param set_size: The number of residual blocks in each set.
    :param hidden_channels: The number of hidden channels in the model.
    """
    def __init__(self, kernel_size=3, max_dilation_rate=16, masked_kernel_size=3, num_sets=6, set_size=5,
                 hidden_channels=800, emb_size = 1600):
        super(BytenetEncoder, self).__init__()
        self.num_channels = hidden_channels
        self.kernel_size = kernel_size
        self.layers = nn.Sequential()
        # 128 is size of tokenizer
        # input of shape [batch_size, 128, 128] as [batch_size, tokens, embedding_size]
        self.layers.append(nn.Conv1d(in_channels=emb_size, out_channels=hidden_channels * 2, kernel_size=1))
        # From the Paper:
        # Model has a series of residual blocks of increased dilation rate
        # With unmasked convolutions for the encoder
        for _ in range(num_sets):
            dilation_rate = 1
            for _ in range(set_size):
                # Dilation rate does not exceed a given maximum
                # Example from the paper: 16
                self.layers.append(ResidualBlockReLu(hidden_channels,
                                                     dilation_rate if dilation_rate <= max_dilation_rate else max_dilation_rate,
                                                     masked_kernel_size))
                                # Dilation Rate doubles each layer (starting out at 1)
                dilation_rate = dilation_rate * 2

            # "the network applies one more convolution"
        # Note: The output of the residual layers is 2*input_features, however the output of the final convolutions is not specified in the paper
        # Experimentation needed if it should be 2*input_features or input_features
        self.encoder_out_conv = nn.Conv1d(in_channels=hidden_channels * 2, out_channels=2 * hidden_channels, kernel_size=1)
        # "and ReLU"
        # Not sure if these last 2 layers should be in encoder or just decoder
        # self.layers.append(nn.ReLU())
        # "followed by a convolution"
        # self.layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size))
        # "and a final softmax layer" (probably not for encoder, however paper does not specify)
        # self.layers.append(nn.Softmax(dim=1))

    def forward(self, x):
        # Temporary
        x = x.float()
        for layer in self.layers:
            x = layer(x)
        x = self.encoder_out_conv(x)
        return x


In [6]:
class BytenetDecoder(nn.Module):
    """
    Implementation of the ByteNet Decoder. Default Parameters are set to the ones used in the paper.
    
    :param kernel_size: The kernel size for the unmasked (padded) convolution in the residual block (not important for decoder).
    :param max_dilation_rate: The maximum dilation rate for the convolution layers.
    :param masked_kernel_size: The kernel size for the masked convolution in the residual block.
    :param num_sets: The number of sets of residual blocks.
    :param set_size: The number of residual blocks in each set.
    :param hidden_channels: The number of hidden channels in the model.
    """
    def __init__(self, kernel_size=3, max_dilation_rate=16, masked_kernel_size=3, num_sets=6, set_size=5,
                 hidden_channels=800, output_channels=384):
        super(BytenetDecoder, self).__init__()
        self.num_channels = hidden_channels
        self.kernel_size = kernel_size
        self.layers = nn.Sequential()
        # From the Paper:
        # Model has a series of residual blocks of increased dilation rate
        # With masekd convolution for decoder
        for _ in range(num_sets):
            dilation_rate = 1
            for _ in range(set_size):
                # Dilation Rate doubles each layer (starting out at 1)
                # 1, 2, 4, 8, 16
                # Dilation rate does not exceed a given maximum
                # Example from the paper: 16
                self.layers.append(ResidualBlockReLu(hidden_channels,
                                                     dilation_rate if dilation_rate <= max_dilation_rate else max_dilation_rate,
                                                     masked_kernel_size, decoder=True))
                dilation_rate = dilation_rate * 2

        # "the network applies one more convolution"
        # Note: The output of the residual layers is 2*input_features, however the output of the final convolutions is not specified in the paper
        # Experimentation needed if it should be 2*input_features or input_features
        self.layers.append(nn.Conv1d(hidden_channels * 2, hidden_channels, 1))
        # "and ReLU"
        self.layers.append(nn.ReLU())
        # "followed by a convolution"
        self.layers.append(nn.Conv1d(hidden_channels, output_channels, 1))
        # "and a final softmax layer"
        # self.layers.append(nn.LogSoftmax(dim=-1))

        # self.layers.append(nn.Softmax(dim=1))

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x


In [7]:
class EncoderDecoderStacking(nn.Module):
    """
    Stacks the encoder and decoder for the ByteNet model.
    This means passing the output of the encoder as input to the decoder.
    
    :param kernel_size: The kernel size for the unmasked (padded) convolution in the residual block (for Encoder).
    :param max_dilation_rate: The maximum dilation rate for the convolution layers.
    :param masked_kernel_size: The kernel size for the masked convolution in the residual block (for Decoder).
    :param num_sets: The number of sets of residual blocks.
    :param set_size: The number of residual blocks in each set.
    :param hidden_channels: The number of hidden channels in the model.
    :param output_channels: The number of output channels in the model (vocab size).

    :return x: The output of the decoder.
    """

    def __init__(self, kernel_size=3, max_dilation_rate=16, masked_kernel_size=3, n_sets=6, blocks_per_set=5,
                 hidden_channels=800, output_channels = 384, emb_size= 1600):
        super(EncoderDecoderStacking, self).__init__()
        self.embed = nn.Embedding(vocab_size, emb_size)
        self.encoder = BytenetEncoder(kernel_size=kernel_size, max_dilation_rate=max_dilation_rate,
                                      masked_kernel_size=masked_kernel_size, num_sets=n_sets, set_size=blocks_per_set,
                                      hidden_channels=hidden_channels, emb_size=emb_size)
        self.decoder = BytenetDecoder(kernel_size=kernel_size, max_dilation_rate=max_dilation_rate,
                                      masked_kernel_size=masked_kernel_size, num_sets=n_sets, set_size=blocks_per_set,
                                      hidden_channels=hidden_channels, output_channels=output_channels)

    def forward(self, x):
        # This permutation is needed for embeddings in pytorch with 1d convolutions
        embed_x = self.embed(x).permute(0, 2, 1)
        x = self.encoder(embed_x)
        x = self.decoder(x)
        return x


In [8]:
class InputEmbeddingTensor:
    """
    Class which enables the embedding of tokens.

    :param vocab_size: The size of the vocabulary as int.
    :param embed_size: The size of the embedding units as int.
    """

    def __init__(self, vocab_size, embed_size):
        super(InputEmbeddingTensor, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        # This is the actual lookup table.
        # A lookup table is an array of data that maps input values to output values
        self.lookup_table_non_zero = nn.Embedding(vocab_size - 1, embed_size)
        init.xavier_uniform_(self.lookup_table_non_zero.weight)

    def embed(self, in_values):
        """
        In this method the first n tokens are embedded via look-up table.
        The n tokens serve as targets for the predictions.

        :param in_values: The train input values from batch, more exact: the tokens
        :return: A embedded tensor of size n × 2d where d is the number of inner
                channels in the network
        """
        lookup_table_zero = torch.zeros(1, self.embed_size).to(in_values.device)
        # Here the both look up tables are combined. The rows with the zeros and the rows
        # with values from the actual lookup table are combined therefore
        lookup_table = torch.cat((lookup_table_zero, self.lookup_table_non_zero.weight.to(device)),
                                 0)  # Move to the same device as inputs
        # Next the input ids are embedded into the lookup table, which means that each id has it own
        # embedding-vector, f.e:
        # id: 5 => [1,5,4]; id:7 => [3,2,9]
        # The input ids are the tokens
        # If a token sequence of 5;7 is used, the resulting matrix is:
        # [1,5,4],[3,2,9]
        return F.embedding(in_values, lookup_table).to(in_values.device)


### Training
The following cells implement the training of the ByteNet model. The model is trained on a part of the WMT2014 english to german dataset.

In [9]:
# Load the data
device = 'cuda' if torch.cuda.is_available() else 'cpu'
cache_dir = 'F:/wmt19_cache'
#    wmt_loader = WMTLoader(split="train", cache_dir=cache_dir)
# Number of workers provides parallel loading
num_workers = 4
#    data_load = DataLoader(wmt_loader, batch_size=32, collate_fn=wmt_loader.collate_fn, num_workers=num_workers)
#    temp = data_load
#
# for batch in wmt_loader:
#     src_batch, tgt_batch = batch
#     break

batch_size = 100
# change as needed
output_dir = 'F:\\wmt19_json'
download_entire_de_en_dataset(batch_size, output_dir, 4)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')
wmt_json_loader = WMT19JSONLoader(output_dir)


Downloading dataset-offset: 100
Downloading dataset-offset: 0
Downloading dataset-offset: 200
Downloading dataset-offset: 300
Using device: cuda


In [10]:
print(wmt_json_loader.tokenizer.get_vocab())

{'<pad>': 0, '</s>': 1, '<unk>': 2, '\x00': 3, '\x01': 4, '\x02': 5, '\x03': 6, '\x04': 7, '\x05': 8, '\x06': 9, '\x07': 10, '\x08': 11, '\t': 12, '\n': 13, '\x0b': 14, '\x0c': 15, '\r': 16, '\x0e': 17, '\x0f': 18, '\x10': 19, '\x11': 20, '\x12': 21, '\x13': 22, '\x14': 23, '\x15': 24, '\x16': 25, '\x17': 26, '\x18': 27, '\x19': 28, '\x1a': 29, '\x1b': 30, '\x1c': 31, '\x1d': 32, '\x1e': 33, '\x1f': 34, ' ': 35, '!': 36, '"': 37, '#': 38, '$': 39, '%': 40, '&': 41, "'": 42, '(': 43, ')': 44, '*': 45, '+': 46, ',': 47, '-': 48, '.': 49, '/': 50, '0': 51, '1': 52, '2': 53, '3': 54, '4': 55, '5': 56, '6': 57, '7': 58, '8': 59, '9': 60, ':': 61, ';': 62, '<': 63, '=': 64, '>': 65, '?': 66, '@': 67, 'A': 68, 'B': 69, 'C': 70, 'D': 71, 'E': 72, 'F': 73, 'G': 74, 'H': 75, 'I': 76, 'J': 77, 'K': 78, 'L': 79, 'M': 80, 'N': 81, 'O': 82, 'P': 83, 'Q': 84, 'R': 85, 'S': 86, 'T': 87, 'U': 88, 'V': 89, 'W': 90, 'X': 91, 'Y': 92, 'Z': 93, '[': 94, '\\': 95, ']': 96, '^': 97, '_': 98, '`': 99, 'a': 10

In [11]:
# HYPERPARAMETERS
num_sets = 3
set_size = 5
embed_size = 1600 # Paper
batch_size = 64

In [12]:
class TranslationDataset(Dataset):
    def __init__(self, source_texts, target_texts):
        self.source_texts = source_texts
        self.target_texts = target_texts

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, idx):
        return self.source_texts[idx], self.target_texts[idx]


In [13]:
cache_dir = 'F:/wmt19_cache'
# wmt_loader = WMTLoader(split="train", cache_dir=cache_dir)
# index = 0
# source, target = wmt_loader[index]
# print("Source:", source)
# print("Target:", target)

# use drive in which to save dataset in cache
tokenized_source_texts, tokenized_target_texts = wmt_json_loader.load_and_tokenize(
    'F:\\wmt19_json\\wmt_19_de_en.json')
src = tokenized_source_texts
trgt = tokenized_target_texts
vocab_size = len(wmt_json_loader.tokenizer.get_vocab())
print(f"Vocabulary size: {vocab_size}")

Error when line is decoded: Expecting ',' delimiter: line 1 column 1109 (char 1108)
Error when line is decoded: Expecting ':' delimiter: line 1 column 401 (char 400)
Error when line is decoded: Expecting ',' delimiter: line 1 column 614 (char 613)
Error when line is decoded: Extra data: line 1 column 489 (char 488)
Error when line is decoded: Expecting ',' delimiter: line 1 column 462 (char 461)
Error when line is decoded: Expecting ':' delimiter: line 1 column 784 (char 783)
Error when line is decoded: Expecting ',' delimiter: line 1 column 186 (char 185)
Error when line is decoded: Expecting value: line 1 column 1 (char 0)
Error when line is decoded: Expecting ',' delimiter: line 1 column 390 (char 389)
Error when line is decoded: Expecting ':' delimiter: line 1 column 274 (char 273)
Error when line is decoded: Expecting ',' delimiter: line 1 column 227 (char 226)
Error when line is decoded: Extra data: line 1 column 314 (char 313)
Error when line is decoded: Expecting value: line 1 

In [14]:
print(src[:1])

[tensor([ 90, 108, 104, 103, 104, 117, 100, 120, 105, 113, 100, 107, 112, 104,
         35, 103, 104, 117,  35,  86, 108, 119, 125, 120, 113, 106, 118, 115,
        104, 117, 108, 114, 103, 104,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0])]


In [15]:
print(trgt[:500])

[tensor([ 85, 104, 118, 120, 112, 115, 119, 108, 114, 113,  35, 114, 105,  35,
        119, 107, 104,  35, 118, 104, 118, 118, 108, 114, 113,   1,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]), tensor([ 76,  35, 103, 104, 102, 111, 100, 117, 104,  35, 117, 104, 118, 120,
        112, 104, 103,  35, 119, 107, 104,  35, 118, 104, 118, 118, 108, 114,
        113,  35, 114, 105,  35, 119, 107, 104,  35,  72, 120, 117, 114, 115,
        104, 100, 113,  35,  83, 100, 117, 

In [16]:
translation_dataset = TranslationDataset(tokenized_source_texts, tokenized_target_texts)
dataset_size = len(translation_dataset)
train_size = int(0.8 * dataset_size)
test_size = dataset_size - train_size
train_dataset, test_dataset = torch.utils.data.random_split(translation_dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [17]:
criterion = torch.nn.CrossEntropyLoss()
inputEmbedding = InputEmbeddingTensor(vocab_size, embed_size)
# size and all params according to the paper, reduce for performance
encoder_decoder = EncoderDecoderStacking(n_sets=3, blocks_per_set=5, output_channels=vocab_size,emb_size=embed_size).to(
    device)

# Define a loss function and an optimizer
# When changing Loss function, make sure to check if the decoder should have the softmax layer, and adjust that
optimizer = torch.optim.Adam(encoder_decoder.parameters(), lr=0.0003)  #  Paper: 0.0003
# Number of epochs
num_epochs = 3


In [18]:
 # Train the model loop
 for epoch in range(1):
    for i, (inputs, targets) in tqdm(enumerate(train_loader), total=len(train_loader)):
        # Move data to the appropriate device
        # inputs = inputEmbedding.embed(inputs.to(device))  # Add batch dimension
        inputs = inputs.to(device)  # Add batch
        targets = targets.to(device)  # Add batch

        outputs = encoder_decoder(inputs.to(device))
        # Compute loss
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print loss every 100 steps
        if i % 25 == 0:
            tqdm.write(
            f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item()}')

  0%|          | 1/18411 [00:00<4:42:46,  1.09it/s]

Epoch [1/1], Step [1/18411], Loss: 5.923763275146484


  0%|          | 26/18411 [00:15<3:01:21,  1.69it/s]

Epoch [1/1], Step [26/18411], Loss: 2.747082471847534


  0%|          | 51/18411 [00:30<3:01:23,  1.69it/s]

Epoch [1/1], Step [51/18411], Loss: 2.7317094802856445


  0%|          | 76/18411 [00:44<3:01:31,  1.68it/s]

Epoch [1/1], Step [76/18411], Loss: 2.5172066688537598


  1%|          | 101/18411 [00:59<3:01:33,  1.68it/s]

Epoch [1/1], Step [101/18411], Loss: 2.671386241912842


  1%|          | 126/18411 [01:13<2:46:32,  1.83it/s]

Epoch [1/1], Step [126/18411], Loss: 2.6472041606903076


  1%|          | 151/18411 [01:27<2:46:17,  1.83it/s]

Epoch [1/1], Step [151/18411], Loss: 2.480778455734253


  1%|          | 176/18411 [01:40<2:45:58,  1.83it/s]

Epoch [1/1], Step [176/18411], Loss: 2.647597074508667


  1%|          | 201/18411 [01:54<2:45:50,  1.83it/s]

Epoch [1/1], Step [201/18411], Loss: 2.5525224208831787


  1%|          | 226/18411 [02:07<2:49:08,  1.79it/s]

Epoch [1/1], Step [226/18411], Loss: 2.499941110610962


  1%|▏         | 251/18411 [02:21<2:45:47,  1.83it/s]

Epoch [1/1], Step [251/18411], Loss: 2.643735885620117


  1%|▏         | 276/18411 [02:34<2:45:17,  1.83it/s]

Epoch [1/1], Step [276/18411], Loss: 2.3605775833129883


  2%|▏         | 301/18411 [02:49<2:59:47,  1.68it/s]

Epoch [1/1], Step [301/18411], Loss: 2.3085131645202637


  2%|▏         | 326/18411 [03:03<3:00:01,  1.67it/s]

Epoch [1/1], Step [326/18411], Loss: 2.4324731826782227


  2%|▏         | 351/18411 [03:18<2:58:59,  1.68it/s]

Epoch [1/1], Step [351/18411], Loss: 2.3233516216278076


  2%|▏         | 376/18411 [03:33<2:58:13,  1.69it/s]

Epoch [1/1], Step [376/18411], Loss: 2.6174752712249756


  2%|▏         | 401/18411 [03:47<2:59:10,  1.68it/s]

Epoch [1/1], Step [401/18411], Loss: 2.561753511428833


  2%|▏         | 426/18411 [04:02<2:55:44,  1.71it/s]

Epoch [1/1], Step [426/18411], Loss: 2.4332828521728516


  2%|▏         | 451/18411 [04:16<2:50:30,  1.76it/s]

Epoch [1/1], Step [451/18411], Loss: 2.639113664627075


  3%|▎         | 476/18411 [04:31<2:57:58,  1.68it/s]

Epoch [1/1], Step [476/18411], Loss: 2.461958646774292


  3%|▎         | 501/18411 [04:46<2:57:39,  1.68it/s]

Epoch [1/1], Step [501/18411], Loss: 2.5045981407165527


  3%|▎         | 526/18411 [05:00<2:58:22,  1.67it/s]

Epoch [1/1], Step [526/18411], Loss: 2.499451160430908


  3%|▎         | 551/18411 [05:15<2:58:31,  1.67it/s]

Epoch [1/1], Step [551/18411], Loss: 2.3568837642669678


  3%|▎         | 576/18411 [05:30<2:57:23,  1.68it/s]

Epoch [1/1], Step [576/18411], Loss: 2.387563467025757


  3%|▎         | 601/18411 [05:45<2:55:30,  1.69it/s]

Epoch [1/1], Step [601/18411], Loss: 2.5116448402404785


  3%|▎         | 626/18411 [05:59<2:55:48,  1.69it/s]

Epoch [1/1], Step [626/18411], Loss: 2.3793277740478516


  4%|▎         | 651/18411 [06:13<2:49:31,  1.75it/s]

Epoch [1/1], Step [651/18411], Loss: 2.575197696685791


  4%|▎         | 676/18411 [06:27<2:42:36,  1.82it/s]

Epoch [1/1], Step [676/18411], Loss: 2.216629981994629


  4%|▍         | 701/18411 [06:41<2:45:32,  1.78it/s]

Epoch [1/1], Step [701/18411], Loss: 2.246415615081787


  4%|▍         | 726/18411 [06:55<2:47:58,  1.75it/s]

Epoch [1/1], Step [726/18411], Loss: 2.511845827102661


  4%|▍         | 751/18411 [07:08<2:44:01,  1.79it/s]

Epoch [1/1], Step [751/18411], Loss: 2.3514697551727295


  4%|▍         | 776/18411 [07:22<2:45:00,  1.78it/s]

Epoch [1/1], Step [776/18411], Loss: 2.4163200855255127


  4%|▍         | 801/18411 [07:36<2:50:43,  1.72it/s]

Epoch [1/1], Step [801/18411], Loss: 2.4375596046447754


  4%|▍         | 826/18411 [07:50<2:41:37,  1.81it/s]

Epoch [1/1], Step [826/18411], Loss: 2.332367420196533


  5%|▍         | 851/18411 [08:04<2:43:10,  1.79it/s]

Epoch [1/1], Step [851/18411], Loss: 2.381592273712158


  5%|▍         | 876/18411 [08:18<2:40:34,  1.82it/s]

Epoch [1/1], Step [876/18411], Loss: 2.3763983249664307


  5%|▍         | 901/18411 [08:31<2:40:20,  1.82it/s]

Epoch [1/1], Step [901/18411], Loss: 2.587261915206909


  5%|▌         | 926/18411 [08:45<2:42:16,  1.80it/s]

Epoch [1/1], Step [926/18411], Loss: 2.3743815422058105


  5%|▌         | 951/18411 [08:59<2:49:15,  1.72it/s]

Epoch [1/1], Step [951/18411], Loss: 2.3194074630737305


  5%|▌         | 976/18411 [09:13<2:43:09,  1.78it/s]

Epoch [1/1], Step [976/18411], Loss: 2.442049980163574


  5%|▌         | 1001/18411 [09:27<2:46:14,  1.75it/s]

Epoch [1/1], Step [1001/18411], Loss: 2.4790656566619873


  6%|▌         | 1026/18411 [09:41<2:47:16,  1.73it/s]

Epoch [1/1], Step [1026/18411], Loss: 2.3556997776031494


  6%|▌         | 1051/18411 [09:56<2:50:11,  1.70it/s]

Epoch [1/1], Step [1051/18411], Loss: 2.4562721252441406


  6%|▌         | 1076/18411 [10:10<2:43:07,  1.77it/s]

Epoch [1/1], Step [1076/18411], Loss: 2.3321778774261475


  6%|▌         | 1101/18411 [10:24<2:45:05,  1.75it/s]

Epoch [1/1], Step [1101/18411], Loss: 2.400568723678589


  6%|▌         | 1126/18411 [10:38<2:42:39,  1.77it/s]

Epoch [1/1], Step [1126/18411], Loss: 2.384310007095337


  6%|▋         | 1151/18411 [10:52<2:38:36,  1.81it/s]

Epoch [1/1], Step [1151/18411], Loss: 2.378422737121582


  6%|▋         | 1176/18411 [11:06<2:43:05,  1.76it/s]

Epoch [1/1], Step [1176/18411], Loss: 2.342219591140747


  7%|▋         | 1201/18411 [11:20<2:45:47,  1.73it/s]

Epoch [1/1], Step [1201/18411], Loss: 2.332374095916748


  7%|▋         | 1226/18411 [11:33<2:38:42,  1.80it/s]

Epoch [1/1], Step [1226/18411], Loss: 2.329695224761963


  7%|▋         | 1251/18411 [11:47<2:40:00,  1.79it/s]

Epoch [1/1], Step [1251/18411], Loss: 2.5368282794952393


  7%|▋         | 1276/18411 [12:02<2:47:38,  1.70it/s]

Epoch [1/1], Step [1276/18411], Loss: 2.399980068206787


  7%|▋         | 1301/18411 [12:16<2:43:03,  1.75it/s]

Epoch [1/1], Step [1301/18411], Loss: 2.334949493408203


  7%|▋         | 1326/18411 [12:30<2:38:19,  1.80it/s]

Epoch [1/1], Step [1326/18411], Loss: 2.3547780513763428


  7%|▋         | 1351/18411 [12:44<2:49:24,  1.68it/s]

Epoch [1/1], Step [1351/18411], Loss: 2.5083167552948


  7%|▋         | 1376/18411 [12:58<2:39:53,  1.78it/s]

Epoch [1/1], Step [1376/18411], Loss: 2.38899564743042


  8%|▊         | 1401/18411 [13:12<2:43:21,  1.74it/s]

Epoch [1/1], Step [1401/18411], Loss: 2.321938991546631


  8%|▊         | 1426/18411 [13:25<2:34:33,  1.83it/s]

Epoch [1/1], Step [1426/18411], Loss: 2.398192882537842


  8%|▊         | 1451/18411 [13:39<2:34:23,  1.83it/s]

Epoch [1/1], Step [1451/18411], Loss: 2.5418567657470703


  8%|▊         | 1476/18411 [13:52<2:33:58,  1.83it/s]

Epoch [1/1], Step [1476/18411], Loss: 2.4120569229125977


  8%|▊         | 1501/18411 [14:06<2:34:02,  1.83it/s]

Epoch [1/1], Step [1501/18411], Loss: 2.4061696529388428


  8%|▊         | 1526/18411 [14:20<2:42:40,  1.73it/s]

Epoch [1/1], Step [1526/18411], Loss: 2.305490732192993


  8%|▊         | 1551/18411 [14:34<2:34:59,  1.81it/s]

Epoch [1/1], Step [1551/18411], Loss: 2.496818780899048


  9%|▊         | 1576/18411 [14:48<2:33:00,  1.83it/s]

Epoch [1/1], Step [1576/18411], Loss: 2.199361801147461


  9%|▊         | 1601/18411 [15:01<2:32:54,  1.83it/s]

Epoch [1/1], Step [1601/18411], Loss: 2.3733959197998047


  9%|▉         | 1626/18411 [15:15<2:32:34,  1.83it/s]

Epoch [1/1], Step [1626/18411], Loss: 2.412484884262085


  9%|▉         | 1651/18411 [15:28<2:39:24,  1.75it/s]

Epoch [1/1], Step [1651/18411], Loss: 2.3769032955169678


  9%|▉         | 1676/18411 [15:42<2:32:24,  1.83it/s]

Epoch [1/1], Step [1676/18411], Loss: 2.316049814224243


  9%|▉         | 1701/18411 [15:56<2:32:47,  1.82it/s]

Epoch [1/1], Step [1701/18411], Loss: 2.2732865810394287


  9%|▉         | 1726/18411 [16:09<2:31:44,  1.83it/s]

Epoch [1/1], Step [1726/18411], Loss: 2.1561203002929688


 10%|▉         | 1751/18411 [16:23<2:31:30,  1.83it/s]

Epoch [1/1], Step [1751/18411], Loss: 2.2484138011932373


 10%|▉         | 1776/18411 [16:36<2:31:13,  1.83it/s]

Epoch [1/1], Step [1776/18411], Loss: 2.3937935829162598


 10%|▉         | 1801/18411 [16:50<2:31:10,  1.83it/s]

Epoch [1/1], Step [1801/18411], Loss: 2.432997226715088


 10%|▉         | 1826/18411 [17:03<2:31:58,  1.82it/s]

Epoch [1/1], Step [1826/18411], Loss: 2.2034530639648438


 10%|█         | 1851/18411 [17:17<2:30:49,  1.83it/s]

Epoch [1/1], Step [1851/18411], Loss: 2.151792526245117


 10%|█         | 1876/18411 [17:31<2:39:13,  1.73it/s]

Epoch [1/1], Step [1876/18411], Loss: 2.2072603702545166


 10%|█         | 1901/18411 [17:45<2:31:17,  1.82it/s]

Epoch [1/1], Step [1901/18411], Loss: 2.4005420207977295


 10%|█         | 1926/18411 [17:58<2:29:54,  1.83it/s]

Epoch [1/1], Step [1926/18411], Loss: 2.309964418411255


 11%|█         | 1951/18411 [18:12<2:33:17,  1.79it/s]

Epoch [1/1], Step [1951/18411], Loss: 2.2647526264190674


 11%|█         | 1976/18411 [18:25<2:37:58,  1.73it/s]

Epoch [1/1], Step [1976/18411], Loss: 2.3270046710968018


 11%|█         | 2001/18411 [18:39<2:29:41,  1.83it/s]

Epoch [1/1], Step [2001/18411], Loss: 2.1612632274627686


 11%|█         | 2026/18411 [18:53<2:34:38,  1.77it/s]

Epoch [1/1], Step [2026/18411], Loss: 2.2185962200164795


 11%|█         | 2051/18411 [19:07<2:38:07,  1.72it/s]

Epoch [1/1], Step [2051/18411], Loss: 2.2663192749023438


 11%|█▏        | 2076/18411 [19:21<2:32:27,  1.79it/s]

Epoch [1/1], Step [2076/18411], Loss: 2.4309380054473877


 11%|█▏        | 2101/18411 [19:35<2:32:01,  1.79it/s]

Epoch [1/1], Step [2101/18411], Loss: 2.357823371887207


 12%|█▏        | 2126/18411 [19:49<2:28:08,  1.83it/s]

Epoch [1/1], Step [2126/18411], Loss: 2.0237832069396973


 12%|█▏        | 2151/18411 [20:02<2:28:13,  1.83it/s]

Epoch [1/1], Step [2151/18411], Loss: 2.214075803756714


 12%|█▏        | 2176/18411 [20:16<2:28:17,  1.82it/s]

Epoch [1/1], Step [2176/18411], Loss: 2.376330614089966


 12%|█▏        | 2201/18411 [20:30<2:29:24,  1.81it/s]

Epoch [1/1], Step [2201/18411], Loss: 2.132284164428711


 12%|█▏        | 2226/18411 [20:43<2:27:34,  1.83it/s]

Epoch [1/1], Step [2226/18411], Loss: 2.3407204151153564


 12%|█▏        | 2251/18411 [20:57<2:27:04,  1.83it/s]

Epoch [1/1], Step [2251/18411], Loss: 2.2361741065979004


 12%|█▏        | 2276/18411 [21:10<2:27:04,  1.83it/s]

Epoch [1/1], Step [2276/18411], Loss: 2.2474606037139893


 12%|█▏        | 2301/18411 [21:24<2:28:49,  1.80it/s]

Epoch [1/1], Step [2301/18411], Loss: 2.0706958770751953


 13%|█▎        | 2326/18411 [21:38<2:26:28,  1.83it/s]

Epoch [1/1], Step [2326/18411], Loss: 2.340144634246826


 13%|█▎        | 2351/18411 [21:52<2:34:17,  1.73it/s]

Epoch [1/1], Step [2351/18411], Loss: 2.3625776767730713


 13%|█▎        | 2376/18411 [22:06<2:29:13,  1.79it/s]

Epoch [1/1], Step [2376/18411], Loss: 2.2606968879699707


 13%|█▎        | 2401/18411 [22:19<2:29:34,  1.78it/s]

Epoch [1/1], Step [2401/18411], Loss: 2.0902724266052246


 13%|█▎        | 2426/18411 [22:33<2:26:42,  1.82it/s]

Epoch [1/1], Step [2426/18411], Loss: 2.2556633949279785


 13%|█▎        | 2451/18411 [22:47<2:25:46,  1.82it/s]

Epoch [1/1], Step [2451/18411], Loss: 2.1987993717193604


 13%|█▎        | 2476/18411 [23:01<2:25:31,  1.82it/s]

Epoch [1/1], Step [2476/18411], Loss: 2.096522331237793


 14%|█▎        | 2501/18411 [23:15<2:33:20,  1.73it/s]

Epoch [1/1], Step [2501/18411], Loss: 2.086077928543091


 14%|█▎        | 2526/18411 [23:29<2:25:17,  1.82it/s]

Epoch [1/1], Step [2526/18411], Loss: 1.977664828300476


 14%|█▍        | 2551/18411 [23:42<2:24:30,  1.83it/s]

Epoch [1/1], Step [2551/18411], Loss: 2.2083611488342285


 14%|█▍        | 2576/18411 [23:56<2:29:26,  1.77it/s]

Epoch [1/1], Step [2576/18411], Loss: 2.149646759033203


 14%|█▍        | 2601/18411 [24:10<2:24:01,  1.83it/s]

Epoch [1/1], Step [2601/18411], Loss: 2.3141660690307617


 14%|█▍        | 2626/18411 [24:23<2:30:10,  1.75it/s]

Epoch [1/1], Step [2626/18411], Loss: 2.0532305240631104


 14%|█▍        | 2651/18411 [24:37<2:28:55,  1.76it/s]

Epoch [1/1], Step [2651/18411], Loss: 2.259791612625122


 15%|█▍        | 2676/18411 [24:51<2:23:21,  1.83it/s]

Epoch [1/1], Step [2676/18411], Loss: 2.1601245403289795


 15%|█▍        | 2701/18411 [25:04<2:24:21,  1.81it/s]

Epoch [1/1], Step [2701/18411], Loss: 1.9560511112213135


 15%|█▍        | 2726/18411 [25:18<2:23:46,  1.82it/s]

Epoch [1/1], Step [2726/18411], Loss: 2.219264030456543


 15%|█▍        | 2751/18411 [25:32<2:22:58,  1.83it/s]

Epoch [1/1], Step [2751/18411], Loss: 2.057793378829956


 15%|█▌        | 2776/18411 [25:46<2:22:33,  1.83it/s]

Epoch [1/1], Step [2776/18411], Loss: 2.1320016384124756


 15%|█▌        | 2801/18411 [26:00<2:30:14,  1.73it/s]

Epoch [1/1], Step [2801/18411], Loss: 2.1052327156066895


 15%|█▌        | 2826/18411 [26:13<2:22:07,  1.83it/s]

Epoch [1/1], Step [2826/18411], Loss: 2.1620142459869385


 15%|█▌        | 2851/18411 [26:27<2:21:51,  1.83it/s]

Epoch [1/1], Step [2851/18411], Loss: 1.976346492767334


 16%|█▌        | 2876/18411 [26:40<2:21:25,  1.83it/s]

Epoch [1/1], Step [2876/18411], Loss: 2.0502567291259766


 16%|█▌        | 2901/18411 [26:55<2:24:21,  1.79it/s]

Epoch [1/1], Step [2901/18411], Loss: 1.8943650722503662


 16%|█▌        | 2926/18411 [27:08<2:21:26,  1.82it/s]

Epoch [1/1], Step [2926/18411], Loss: 1.9528934955596924


 16%|█▌        | 2951/18411 [27:22<2:21:16,  1.82it/s]

Epoch [1/1], Step [2951/18411], Loss: 2.127845287322998


 16%|█▌        | 2976/18411 [27:35<2:20:58,  1.82it/s]

Epoch [1/1], Step [2976/18411], Loss: 1.9252105951309204


 16%|█▋        | 3001/18411 [27:49<2:20:47,  1.82it/s]

Epoch [1/1], Step [3001/18411], Loss: 2.117168426513672


 16%|█▋        | 3026/18411 [28:03<2:21:17,  1.81it/s]

Epoch [1/1], Step [3026/18411], Loss: 2.222215414047241


 17%|█▋        | 3051/18411 [28:17<2:28:01,  1.73it/s]

Epoch [1/1], Step [3051/18411], Loss: 2.0682241916656494


 17%|█▋        | 3076/18411 [28:31<2:27:48,  1.73it/s]

Epoch [1/1], Step [3076/18411], Loss: 1.987437129020691


 17%|█▋        | 3101/18411 [28:45<2:27:18,  1.73it/s]

Epoch [1/1], Step [3101/18411], Loss: 2.0144083499908447


 17%|█▋        | 3126/18411 [28:59<2:25:07,  1.76it/s]

Epoch [1/1], Step [3126/18411], Loss: 1.863360047340393


 17%|█▋        | 3151/18411 [29:13<2:27:05,  1.73it/s]

Epoch [1/1], Step [3151/18411], Loss: 1.8451663255691528


 17%|█▋        | 3176/18411 [29:28<2:21:44,  1.79it/s]

Epoch [1/1], Step [3176/18411], Loss: 1.8633301258087158


 17%|█▋        | 3201/18411 [29:41<2:26:24,  1.73it/s]

Epoch [1/1], Step [3201/18411], Loss: 2.184072494506836


 18%|█▊        | 3226/18411 [29:56<2:26:20,  1.73it/s]

Epoch [1/1], Step [3226/18411], Loss: 2.185262441635132


 18%|█▊        | 3251/18411 [30:10<2:26:06,  1.73it/s]

Epoch [1/1], Step [3251/18411], Loss: 1.956963062286377


 18%|█▊        | 3276/18411 [30:24<2:18:01,  1.83it/s]

Epoch [1/1], Step [3276/18411], Loss: 1.9974966049194336


 18%|█▊        | 3301/18411 [30:37<2:17:42,  1.83it/s]

Epoch [1/1], Step [3301/18411], Loss: 2.175577402114868


 18%|█▊        | 3326/18411 [30:51<2:24:28,  1.74it/s]

Epoch [1/1], Step [3326/18411], Loss: 1.9743428230285645


 18%|█▊        | 3351/18411 [31:05<2:25:10,  1.73it/s]

Epoch [1/1], Step [3351/18411], Loss: 2.219001054763794


 18%|█▊        | 3376/18411 [31:19<2:16:50,  1.83it/s]

Epoch [1/1], Step [3376/18411], Loss: 1.9151073694229126


 18%|█▊        | 3401/18411 [31:33<2:24:38,  1.73it/s]

Epoch [1/1], Step [3401/18411], Loss: 1.6999503374099731


 19%|█▊        | 3426/18411 [31:48<2:24:24,  1.73it/s]

Epoch [1/1], Step [3426/18411], Loss: 1.8099604845046997


 19%|█▊        | 3451/18411 [32:02<2:23:59,  1.73it/s]

Epoch [1/1], Step [3451/18411], Loss: 1.8789745569229126


 19%|█▉        | 3476/18411 [32:16<2:23:39,  1.73it/s]

Epoch [1/1], Step [3476/18411], Loss: 1.9815152883529663


 19%|█▉        | 3501/18411 [32:30<2:16:48,  1.82it/s]

Epoch [1/1], Step [3501/18411], Loss: 1.9756468534469604


 19%|█▉        | 3526/18411 [32:44<2:17:11,  1.81it/s]

Epoch [1/1], Step [3526/18411], Loss: 1.9493268728256226


 19%|█▉        | 3551/18411 [32:58<2:22:30,  1.74it/s]

Epoch [1/1], Step [3551/18411], Loss: 1.7912874221801758


 19%|█▉        | 3576/18411 [33:12<2:22:38,  1.73it/s]

Epoch [1/1], Step [3576/18411], Loss: 1.7988535165786743


 20%|█▉        | 3601/18411 [33:25<2:16:36,  1.81it/s]

Epoch [1/1], Step [3601/18411], Loss: 2.0498242378234863


 20%|█▉        | 3626/18411 [33:40<2:22:47,  1.73it/s]

Epoch [1/1], Step [3626/18411], Loss: 1.7832220792770386


 20%|█▉        | 3651/18411 [33:53<2:15:27,  1.82it/s]

Epoch [1/1], Step [3651/18411], Loss: 1.8717049360275269


 20%|█▉        | 3676/18411 [34:07<2:14:47,  1.82it/s]

Epoch [1/1], Step [3676/18411], Loss: 2.0915567874908447


 20%|██        | 3701/18411 [34:21<2:14:02,  1.83it/s]

Epoch [1/1], Step [3701/18411], Loss: 1.7808892726898193


 20%|██        | 3726/18411 [34:34<2:21:00,  1.74it/s]

Epoch [1/1], Step [3726/18411], Loss: 2.168504476547241


 20%|██        | 3751/18411 [34:49<2:21:12,  1.73it/s]

Epoch [1/1], Step [3751/18411], Loss: 1.8735857009887695


 21%|██        | 3776/18411 [35:03<2:20:52,  1.73it/s]

Epoch [1/1], Step [3776/18411], Loss: 2.0299723148345947


 21%|██        | 3801/18411 [35:17<2:20:50,  1.73it/s]

Epoch [1/1], Step [3801/18411], Loss: 1.9459038972854614


 21%|██        | 3826/18411 [35:31<2:13:14,  1.82it/s]

Epoch [1/1], Step [3826/18411], Loss: 1.842344045639038


 21%|██        | 3851/18411 [35:44<2:12:55,  1.83it/s]

Epoch [1/1], Step [3851/18411], Loss: 1.7119808197021484


 21%|██        | 3876/18411 [35:58<2:12:47,  1.82it/s]

Epoch [1/1], Step [3876/18411], Loss: 1.9075595140457153


 21%|██        | 3901/18411 [36:12<2:17:13,  1.76it/s]

Epoch [1/1], Step [3901/18411], Loss: 1.9913164377212524


 21%|██▏       | 3926/18411 [36:25<2:12:10,  1.83it/s]

Epoch [1/1], Step [3926/18411], Loss: 2.0242254734039307


 21%|██▏       | 3951/18411 [36:40<2:19:16,  1.73it/s]

Epoch [1/1], Step [3951/18411], Loss: 1.8381426334381104


 22%|██▏       | 3976/18411 [36:53<2:11:32,  1.83it/s]

Epoch [1/1], Step [3976/18411], Loss: 1.891054630279541


 22%|██▏       | 4001/18411 [37:07<2:18:54,  1.73it/s]

Epoch [1/1], Step [4001/18411], Loss: 1.7893949747085571


 22%|██▏       | 4026/18411 [37:21<2:11:14,  1.83it/s]

Epoch [1/1], Step [4026/18411], Loss: 1.781872034072876


 22%|██▏       | 4051/18411 [37:34<2:12:28,  1.81it/s]

Epoch [1/1], Step [4051/18411], Loss: 1.9259885549545288


 22%|██▏       | 4076/18411 [37:48<2:10:45,  1.83it/s]

Epoch [1/1], Step [4076/18411], Loss: 1.7448179721832275


 22%|██▏       | 4101/18411 [38:02<2:11:21,  1.82it/s]

Epoch [1/1], Step [4101/18411], Loss: 1.7274302244186401


 22%|██▏       | 4126/18411 [38:16<2:17:42,  1.73it/s]

Epoch [1/1], Step [4126/18411], Loss: 1.7262283563613892


 23%|██▎       | 4151/18411 [38:30<2:14:15,  1.77it/s]

Epoch [1/1], Step [4151/18411], Loss: 1.8195489645004272


 23%|██▎       | 4176/18411 [38:44<2:09:45,  1.83it/s]

Epoch [1/1], Step [4176/18411], Loss: 1.663744568824768


 23%|██▎       | 4201/18411 [38:58<2:17:53,  1.72it/s]

Epoch [1/1], Step [4201/18411], Loss: 1.7009767293930054


 23%|██▎       | 4226/18411 [39:12<2:09:23,  1.83it/s]

Epoch [1/1], Step [4226/18411], Loss: 1.630694031715393


 23%|██▎       | 4251/18411 [39:25<2:08:48,  1.83it/s]

Epoch [1/1], Step [4251/18411], Loss: 1.7185617685317993


 23%|██▎       | 4276/18411 [39:39<2:15:02,  1.74it/s]

Epoch [1/1], Step [4276/18411], Loss: 1.6982958316802979


 23%|██▎       | 4301/18411 [39:53<2:15:58,  1.73it/s]

Epoch [1/1], Step [4301/18411], Loss: 1.5896100997924805


 23%|██▎       | 4326/18411 [40:07<2:09:38,  1.81it/s]

Epoch [1/1], Step [4326/18411], Loss: 1.7129002809524536


 24%|██▎       | 4351/18411 [40:21<2:08:23,  1.83it/s]

Epoch [1/1], Step [4351/18411], Loss: 1.7349963188171387


 24%|██▍       | 4376/18411 [40:35<2:14:47,  1.74it/s]

Epoch [1/1], Step [4376/18411], Loss: 2.1030337810516357


 24%|██▍       | 4401/18411 [40:48<2:07:30,  1.83it/s]

Epoch [1/1], Step [4401/18411], Loss: 1.7439745664596558


 24%|██▍       | 4426/18411 [41:02<2:14:12,  1.74it/s]

Epoch [1/1], Step [4426/18411], Loss: 1.787444829940796


 24%|██▍       | 4451/18411 [41:16<2:08:02,  1.82it/s]

Epoch [1/1], Step [4451/18411], Loss: 1.5983877182006836


 24%|██▍       | 4476/18411 [41:30<2:07:19,  1.82it/s]

Epoch [1/1], Step [4476/18411], Loss: 1.729569911956787


 24%|██▍       | 4501/18411 [41:44<2:13:19,  1.74it/s]

Epoch [1/1], Step [4501/18411], Loss: 1.8671914339065552


 25%|██▍       | 4526/18411 [41:58<2:13:57,  1.73it/s]

Epoch [1/1], Step [4526/18411], Loss: 1.6523369550704956


 25%|██▍       | 4551/18411 [42:12<2:06:19,  1.83it/s]

Epoch [1/1], Step [4551/18411], Loss: 1.786057949066162


 25%|██▍       | 4576/18411 [42:26<2:09:45,  1.78it/s]

Epoch [1/1], Step [4576/18411], Loss: 1.5741921663284302


 25%|██▍       | 4601/18411 [42:40<2:08:58,  1.78it/s]

Epoch [1/1], Step [4601/18411], Loss: 1.6956710815429688


 25%|██▌       | 4626/18411 [42:54<2:07:55,  1.80it/s]

Epoch [1/1], Step [4626/18411], Loss: 1.518837571144104


 25%|██▌       | 4651/18411 [43:08<2:08:40,  1.78it/s]

Epoch [1/1], Step [4651/18411], Loss: 1.8305805921554565


 25%|██▌       | 4676/18411 [43:22<2:05:26,  1.82it/s]

Epoch [1/1], Step [4676/18411], Loss: 1.6954272985458374


 26%|██▌       | 4701/18411 [43:35<2:09:06,  1.77it/s]

Epoch [1/1], Step [4701/18411], Loss: 1.572740912437439


 26%|██▌       | 4726/18411 [43:49<2:09:50,  1.76it/s]

Epoch [1/1], Step [4726/18411], Loss: 1.7551435232162476


 26%|██▌       | 4751/18411 [44:03<2:04:58,  1.82it/s]

Epoch [1/1], Step [4751/18411], Loss: 1.7335880994796753


 26%|██▌       | 4776/18411 [44:16<2:04:34,  1.82it/s]

Epoch [1/1], Step [4776/18411], Loss: 1.4519269466400146


 26%|██▌       | 4801/18411 [44:30<2:05:33,  1.81it/s]

Epoch [1/1], Step [4801/18411], Loss: 1.68831205368042


 26%|██▌       | 4826/18411 [44:44<2:04:31,  1.82it/s]

Epoch [1/1], Step [4826/18411], Loss: 1.928688883781433


 26%|██▋       | 4851/18411 [44:58<2:10:11,  1.74it/s]

Epoch [1/1], Step [4851/18411], Loss: 1.531905174255371


 26%|██▋       | 4876/18411 [45:12<2:03:25,  1.83it/s]

Epoch [1/1], Step [4876/18411], Loss: 1.5658639669418335


 27%|██▋       | 4901/18411 [45:25<2:09:13,  1.74it/s]

Epoch [1/1], Step [4901/18411], Loss: 1.7406339645385742


 27%|██▋       | 4926/18411 [45:40<2:09:11,  1.74it/s]

Epoch [1/1], Step [4926/18411], Loss: 1.7565667629241943


 27%|██▋       | 4951/18411 [45:54<2:08:37,  1.74it/s]

Epoch [1/1], Step [4951/18411], Loss: 1.6691784858703613


 27%|██▋       | 4976/18411 [46:08<2:09:47,  1.73it/s]

Epoch [1/1], Step [4976/18411], Loss: 1.716597080230713


 27%|██▋       | 5001/18411 [46:22<2:09:16,  1.73it/s]

Epoch [1/1], Step [5001/18411], Loss: 1.7121502161026


 27%|██▋       | 5026/18411 [46:37<2:09:14,  1.73it/s]

Epoch [1/1], Step [5026/18411], Loss: 1.4408347606658936


 27%|██▋       | 5051/18411 [46:51<2:09:04,  1.73it/s]

Epoch [1/1], Step [5051/18411], Loss: 1.685478687286377


 28%|██▊       | 5076/18411 [47:05<2:08:45,  1.73it/s]

Epoch [1/1], Step [5076/18411], Loss: 1.7363418340682983


 28%|██▊       | 5101/18411 [47:20<2:05:32,  1.77it/s]

Epoch [1/1], Step [5101/18411], Loss: 1.6691275835037231


 28%|██▊       | 5126/18411 [47:33<2:01:45,  1.82it/s]

Epoch [1/1], Step [5126/18411], Loss: 1.3536968231201172


 28%|██▊       | 5151/18411 [47:47<2:01:22,  1.82it/s]

Epoch [1/1], Step [5151/18411], Loss: 1.4935821294784546


 28%|██▊       | 5176/18411 [48:01<2:07:52,  1.72it/s]

Epoch [1/1], Step [5176/18411], Loss: 1.4341446161270142


 28%|██▊       | 5201/18411 [48:15<2:07:33,  1.73it/s]

Epoch [1/1], Step [5201/18411], Loss: 1.567084789276123


 28%|██▊       | 5226/18411 [48:29<2:05:42,  1.75it/s]

Epoch [1/1], Step [5226/18411], Loss: 1.6658672094345093


 29%|██▊       | 5251/18411 [48:43<2:01:24,  1.81it/s]

Epoch [1/1], Step [5251/18411], Loss: 1.7035268545150757


 29%|██▊       | 5276/18411 [48:57<2:06:30,  1.73it/s]

Epoch [1/1], Step [5276/18411], Loss: 1.7759658098220825


 29%|██▉       | 5301/18411 [49:11<2:04:16,  1.76it/s]

Epoch [1/1], Step [5301/18411], Loss: 1.5765515565872192


 29%|██▉       | 5326/18411 [49:25<1:59:54,  1.82it/s]

Epoch [1/1], Step [5326/18411], Loss: 1.5279083251953125


 29%|██▉       | 5351/18411 [49:39<1:59:31,  1.82it/s]

Epoch [1/1], Step [5351/18411], Loss: 1.4860436916351318


 29%|██▉       | 5376/18411 [49:52<1:59:08,  1.82it/s]

Epoch [1/1], Step [5376/18411], Loss: 1.5381016731262207


 29%|██▉       | 5401/18411 [50:06<2:04:05,  1.75it/s]

Epoch [1/1], Step [5401/18411], Loss: 1.6273390054702759


 29%|██▉       | 5426/18411 [50:21<2:07:00,  1.70it/s]

Epoch [1/1], Step [5426/18411], Loss: 1.6194418668746948


 30%|██▉       | 5451/18411 [50:35<2:05:10,  1.73it/s]

Epoch [1/1], Step [5451/18411], Loss: 1.634330153465271


 30%|██▉       | 5476/18411 [50:51<2:21:36,  1.52it/s]

Epoch [1/1], Step [5476/18411], Loss: 1.8238306045532227


 30%|██▉       | 5501/18411 [51:05<1:57:55,  1.82it/s]

Epoch [1/1], Step [5501/18411], Loss: 1.6153923273086548


 30%|███       | 5526/18411 [51:19<2:04:37,  1.72it/s]

Epoch [1/1], Step [5526/18411], Loss: 1.5079514980316162


 30%|███       | 5551/18411 [51:33<2:03:05,  1.74it/s]

Epoch [1/1], Step [5551/18411], Loss: 1.6877772808074951


 30%|███       | 5576/18411 [51:47<1:57:15,  1.82it/s]

Epoch [1/1], Step [5576/18411], Loss: 1.7173566818237305


 30%|███       | 5601/18411 [52:00<2:00:40,  1.77it/s]

Epoch [1/1], Step [5601/18411], Loss: 1.6906319856643677


 31%|███       | 5626/18411 [52:14<1:58:37,  1.80it/s]

Epoch [1/1], Step [5626/18411], Loss: 1.6454144716262817


 31%|███       | 5651/18411 [52:28<2:03:12,  1.73it/s]

Epoch [1/1], Step [5651/18411], Loss: 1.8651561737060547


 31%|███       | 5676/18411 [52:42<1:57:11,  1.81it/s]

Epoch [1/1], Step [5676/18411], Loss: 1.689879298210144


 31%|███       | 5701/18411 [52:56<1:56:12,  1.82it/s]

Epoch [1/1], Step [5701/18411], Loss: 1.8560482263565063


 31%|███       | 5726/18411 [53:10<1:59:20,  1.77it/s]

Epoch [1/1], Step [5726/18411], Loss: 1.490485429763794


 31%|███       | 5751/18411 [53:24<1:59:58,  1.76it/s]

Epoch [1/1], Step [5751/18411], Loss: 1.514358401298523


 31%|███▏      | 5776/18411 [53:38<1:58:36,  1.78it/s]

Epoch [1/1], Step [5776/18411], Loss: 1.5978283882141113


 32%|███▏      | 5801/18411 [53:51<1:57:18,  1.79it/s]

Epoch [1/1], Step [5801/18411], Loss: 1.7264866828918457


 32%|███▏      | 5826/18411 [54:05<1:57:26,  1.79it/s]

Epoch [1/1], Step [5826/18411], Loss: 1.5752999782562256


 32%|███▏      | 5851/18411 [54:19<1:54:33,  1.83it/s]

Epoch [1/1], Step [5851/18411], Loss: 1.5605195760726929


 32%|███▏      | 5876/18411 [54:33<1:55:53,  1.80it/s]

Epoch [1/1], Step [5876/18411], Loss: 1.4155477285385132


 32%|███▏      | 5901/18411 [54:47<2:00:10,  1.74it/s]

Epoch [1/1], Step [5901/18411], Loss: 1.4649120569229126


 32%|███▏      | 5926/18411 [55:01<1:54:06,  1.82it/s]

Epoch [1/1], Step [5926/18411], Loss: 1.491693377494812


 32%|███▏      | 5951/18411 [55:15<1:59:51,  1.73it/s]

Epoch [1/1], Step [5951/18411], Loss: 1.445594072341919


 32%|███▏      | 5976/18411 [55:28<1:53:09,  1.83it/s]

Epoch [1/1], Step [5976/18411], Loss: 1.6965978145599365


 33%|███▎      | 6001/18411 [55:42<1:53:21,  1.82it/s]

Epoch [1/1], Step [6001/18411], Loss: 1.3062732219696045


 33%|███▎      | 6026/18411 [55:55<1:52:57,  1.83it/s]

Epoch [1/1], Step [6026/18411], Loss: 1.6382101774215698


 33%|███▎      | 6051/18411 [56:09<1:52:51,  1.83it/s]

Epoch [1/1], Step [6051/18411], Loss: 1.5189719200134277


 33%|███▎      | 6076/18411 [56:23<1:54:51,  1.79it/s]

Epoch [1/1], Step [6076/18411], Loss: 1.5978960990905762


 33%|███▎      | 6101/18411 [56:37<1:55:47,  1.77it/s]

Epoch [1/1], Step [6101/18411], Loss: 1.5374795198440552


 33%|███▎      | 6126/18411 [56:50<1:55:46,  1.77it/s]

Epoch [1/1], Step [6126/18411], Loss: 1.4634836912155151


 33%|███▎      | 6151/18411 [57:05<1:58:11,  1.73it/s]

Epoch [1/1], Step [6151/18411], Loss: 1.4525070190429688


 34%|███▎      | 6176/18411 [57:18<1:51:45,  1.82it/s]

Epoch [1/1], Step [6176/18411], Loss: 1.6387927532196045


 34%|███▎      | 6201/18411 [57:32<1:51:55,  1.82it/s]

Epoch [1/1], Step [6201/18411], Loss: 1.5095691680908203


 34%|███▍      | 6226/18411 [57:46<1:57:21,  1.73it/s]

Epoch [1/1], Step [6226/18411], Loss: 1.6288237571716309


 34%|███▍      | 6251/18411 [58:00<1:50:59,  1.83it/s]

Epoch [1/1], Step [6251/18411], Loss: 1.5635703802108765


 34%|███▍      | 6276/18411 [58:14<1:52:25,  1.80it/s]

Epoch [1/1], Step [6276/18411], Loss: 1.2826642990112305


 34%|███▍      | 6301/18411 [58:27<1:50:36,  1.82it/s]

Epoch [1/1], Step [6301/18411], Loss: 1.4928513765335083


 34%|███▍      | 6326/18411 [58:41<1:50:18,  1.83it/s]

Epoch [1/1], Step [6326/18411], Loss: 1.5524675846099854


 34%|███▍      | 6351/18411 [58:54<1:50:30,  1.82it/s]

Epoch [1/1], Step [6351/18411], Loss: 1.4543945789337158


 35%|███▍      | 6376/18411 [59:08<1:49:44,  1.83it/s]

Epoch [1/1], Step [6376/18411], Loss: 1.2192491292953491


 35%|███▍      | 6401/18411 [59:22<1:51:33,  1.79it/s]

Epoch [1/1], Step [6401/18411], Loss: 1.4042856693267822


 35%|███▍      | 6426/18411 [59:35<1:49:54,  1.82it/s]

Epoch [1/1], Step [6426/18411], Loss: 1.5687335729599


 35%|███▌      | 6451/18411 [59:49<1:54:09,  1.75it/s]

Epoch [1/1], Step [6451/18411], Loss: 1.4626656770706177


 35%|███▌      | 6476/18411 [1:00:03<1:55:01,  1.73it/s]

Epoch [1/1], Step [6476/18411], Loss: 1.7944276332855225


 35%|███▌      | 6501/18411 [1:00:18<1:54:47,  1.73it/s]

Epoch [1/1], Step [6501/18411], Loss: 1.5365537405014038


 35%|███▌      | 6526/18411 [1:00:32<1:54:32,  1.73it/s]

Epoch [1/1], Step [6526/18411], Loss: 1.398723840713501


 36%|███▌      | 6551/18411 [1:00:46<1:54:24,  1.73it/s]

Epoch [1/1], Step [6551/18411], Loss: 1.6295033693313599


 36%|███▌      | 6576/18411 [1:01:00<1:49:04,  1.81it/s]

Epoch [1/1], Step [6576/18411], Loss: 1.3425956964492798


 36%|███▌      | 6601/18411 [1:01:14<1:52:01,  1.76it/s]

Epoch [1/1], Step [6601/18411], Loss: 1.238918423652649


 36%|███▌      | 6626/18411 [1:01:28<1:52:58,  1.74it/s]

Epoch [1/1], Step [6626/18411], Loss: 1.6451623439788818


 36%|███▌      | 6651/18411 [1:01:42<1:49:58,  1.78it/s]

Epoch [1/1], Step [6651/18411], Loss: 1.4057708978652954


 36%|███▋      | 6676/18411 [1:01:56<1:50:29,  1.77it/s]

Epoch [1/1], Step [6676/18411], Loss: 1.628606915473938


 36%|███▋      | 6701/18411 [1:02:10<1:50:28,  1.77it/s]

Epoch [1/1], Step [6701/18411], Loss: 1.250679612159729


 37%|███▋      | 6726/18411 [1:02:24<1:47:03,  1.82it/s]

Epoch [1/1], Step [6726/18411], Loss: 1.5524333715438843


 37%|███▋      | 6751/18411 [1:02:38<1:50:22,  1.76it/s]

Epoch [1/1], Step [6751/18411], Loss: 1.5324403047561646


 37%|███▋      | 6776/18411 [1:02:52<1:46:44,  1.82it/s]

Epoch [1/1], Step [6776/18411], Loss: 1.3593963384628296


 37%|███▋      | 6801/18411 [1:03:06<1:51:48,  1.73it/s]

Epoch [1/1], Step [6801/18411], Loss: 1.2104750871658325


 37%|███▋      | 6826/18411 [1:03:20<1:48:24,  1.78it/s]

Epoch [1/1], Step [6826/18411], Loss: 1.4331170320510864


 37%|███▋      | 6851/18411 [1:03:34<1:45:12,  1.83it/s]

Epoch [1/1], Step [6851/18411], Loss: 1.1717991828918457


 37%|███▋      | 6876/18411 [1:03:48<1:48:54,  1.77it/s]

Epoch [1/1], Step [6876/18411], Loss: 1.5854456424713135


 37%|███▋      | 6901/18411 [1:04:01<1:48:59,  1.76it/s]

Epoch [1/1], Step [6901/18411], Loss: 1.4900563955307007


 38%|███▊      | 6926/18411 [1:04:15<1:44:30,  1.83it/s]

Epoch [1/1], Step [6926/18411], Loss: 1.1820507049560547


 38%|███▊      | 6951/18411 [1:04:28<1:44:15,  1.83it/s]

Epoch [1/1], Step [6951/18411], Loss: 1.4960981607437134


 38%|███▊      | 6976/18411 [1:04:42<1:44:04,  1.83it/s]

Epoch [1/1], Step [6976/18411], Loss: 1.4505174160003662


 38%|███▊      | 7001/18411 [1:04:56<1:49:51,  1.73it/s]

Epoch [1/1], Step [7001/18411], Loss: 1.4330251216888428


 38%|███▊      | 7026/18411 [1:05:10<1:49:43,  1.73it/s]

Epoch [1/1], Step [7026/18411], Loss: 1.3600112199783325


 38%|███▊      | 7051/18411 [1:05:24<1:49:11,  1.73it/s]

Epoch [1/1], Step [7051/18411], Loss: 1.3776745796203613


 38%|███▊      | 7076/18411 [1:05:38<1:48:16,  1.74it/s]

Epoch [1/1], Step [7076/18411], Loss: 1.4032278060913086


 39%|███▊      | 7101/18411 [1:05:53<1:48:55,  1.73it/s]

Epoch [1/1], Step [7101/18411], Loss: 1.5339857339859009


 39%|███▊      | 7126/18411 [1:06:07<1:48:50,  1.73it/s]

Epoch [1/1], Step [7126/18411], Loss: 1.3903785943984985


 39%|███▉      | 7151/18411 [1:06:21<1:43:20,  1.82it/s]

Epoch [1/1], Step [7151/18411], Loss: 1.0807887315750122


 39%|███▉      | 7176/18411 [1:06:34<1:42:14,  1.83it/s]

Epoch [1/1], Step [7176/18411], Loss: 1.6117677688598633


 39%|███▉      | 7201/18411 [1:06:48<1:48:05,  1.73it/s]

Epoch [1/1], Step [7201/18411], Loss: 1.4227429628372192


 39%|███▉      | 7226/18411 [1:07:02<1:42:01,  1.83it/s]

Epoch [1/1], Step [7226/18411], Loss: 1.5198951959609985


 39%|███▉      | 7251/18411 [1:07:16<1:44:37,  1.78it/s]

Epoch [1/1], Step [7251/18411], Loss: 1.5031284093856812


 40%|███▉      | 7276/18411 [1:07:30<1:41:36,  1.83it/s]

Epoch [1/1], Step [7276/18411], Loss: 1.3477476835250854


 40%|███▉      | 7301/18411 [1:07:44<1:41:25,  1.83it/s]

Epoch [1/1], Step [7301/18411], Loss: 1.6680216789245605


 40%|███▉      | 7326/18411 [1:07:57<1:42:56,  1.79it/s]

Epoch [1/1], Step [7326/18411], Loss: 1.5817396640777588


 40%|███▉      | 7351/18411 [1:08:11<1:42:01,  1.81it/s]

Epoch [1/1], Step [7351/18411], Loss: 1.3776670694351196


 40%|████      | 7376/18411 [1:08:25<1:43:10,  1.78it/s]

Epoch [1/1], Step [7376/18411], Loss: 1.3064781427383423


 40%|████      | 7401/18411 [1:08:39<1:40:14,  1.83it/s]

Epoch [1/1], Step [7401/18411], Loss: 1.4331532716751099


 40%|████      | 7426/18411 [1:08:52<1:42:19,  1.79it/s]

Epoch [1/1], Step [7426/18411], Loss: 1.4119383096694946


 40%|████      | 7451/18411 [1:09:06<1:41:36,  1.80it/s]

Epoch [1/1], Step [7451/18411], Loss: 1.5577834844589233


 41%|████      | 7476/18411 [1:09:20<1:43:50,  1.76it/s]

Epoch [1/1], Step [7476/18411], Loss: 1.1532905101776123


 41%|████      | 7501/18411 [1:09:34<1:45:06,  1.73it/s]

Epoch [1/1], Step [7501/18411], Loss: 1.511551022529602


 41%|████      | 7526/18411 [1:09:48<1:45:02,  1.73it/s]

Epoch [1/1], Step [7526/18411], Loss: 1.4256017208099365


 41%|████      | 7551/18411 [1:10:03<1:44:35,  1.73it/s]

Epoch [1/1], Step [7551/18411], Loss: 1.383466124534607


 41%|████      | 7576/18411 [1:10:16<1:38:38,  1.83it/s]

Epoch [1/1], Step [7576/18411], Loss: 1.5117963552474976


 41%|████▏     | 7601/18411 [1:10:30<1:38:25,  1.83it/s]

Epoch [1/1], Step [7601/18411], Loss: 1.3726428747177124


 41%|████▏     | 7626/18411 [1:10:43<1:38:07,  1.83it/s]

Epoch [1/1], Step [7626/18411], Loss: 1.3156884908676147


 42%|████▏     | 7651/18411 [1:10:57<1:37:51,  1.83it/s]

Epoch [1/1], Step [7651/18411], Loss: 1.3152762651443481


 42%|████▏     | 7676/18411 [1:11:11<1:39:40,  1.80it/s]

Epoch [1/1], Step [7676/18411], Loss: 1.6884987354278564


 42%|████▏     | 7701/18411 [1:11:25<1:41:44,  1.75it/s]

Epoch [1/1], Step [7701/18411], Loss: 1.1208381652832031


 42%|████▏     | 7726/18411 [1:11:38<1:37:06,  1.83it/s]

Epoch [1/1], Step [7726/18411], Loss: 1.3013094663619995


 42%|████▏     | 7751/18411 [1:11:52<1:42:41,  1.73it/s]

Epoch [1/1], Step [7751/18411], Loss: 1.2731584310531616


 42%|████▏     | 7776/18411 [1:12:07<1:42:30,  1.73it/s]

Epoch [1/1], Step [7776/18411], Loss: 1.2536534070968628


 42%|████▏     | 7801/18411 [1:12:21<1:41:30,  1.74it/s]

Epoch [1/1], Step [7801/18411], Loss: 1.3139828443527222


 43%|████▎     | 7826/18411 [1:12:34<1:37:22,  1.81it/s]

Epoch [1/1], Step [7826/18411], Loss: 1.2383630275726318


 43%|████▎     | 7851/18411 [1:12:48<1:36:20,  1.83it/s]

Epoch [1/1], Step [7851/18411], Loss: 1.2755337953567505


 43%|████▎     | 7876/18411 [1:13:02<1:36:24,  1.82it/s]

Epoch [1/1], Step [7876/18411], Loss: 1.3680475950241089


 43%|████▎     | 7901/18411 [1:13:16<1:36:55,  1.81it/s]

Epoch [1/1], Step [7901/18411], Loss: 1.268527865409851


 43%|████▎     | 7926/18411 [1:13:30<1:35:29,  1.83it/s]

Epoch [1/1], Step [7926/18411], Loss: 1.386985182762146


 43%|████▎     | 7951/18411 [1:13:43<1:35:07,  1.83it/s]

Epoch [1/1], Step [7951/18411], Loss: 1.4581211805343628


 43%|████▎     | 7976/18411 [1:13:57<1:34:52,  1.83it/s]

Epoch [1/1], Step [7976/18411], Loss: 1.3590811491012573


 43%|████▎     | 8001/18411 [1:14:10<1:34:36,  1.83it/s]

Epoch [1/1], Step [8001/18411], Loss: 1.0623769760131836


 44%|████▎     | 8026/18411 [1:14:24<1:38:36,  1.76it/s]

Epoch [1/1], Step [8026/18411], Loss: 1.3438650369644165


 44%|████▎     | 8051/18411 [1:14:37<1:37:07,  1.78it/s]

Epoch [1/1], Step [8051/18411], Loss: 1.1564738750457764


 44%|████▍     | 8076/18411 [1:14:51<1:35:24,  1.81it/s]

Epoch [1/1], Step [8076/18411], Loss: 1.4804092645645142


 44%|████▍     | 8101/18411 [1:15:05<1:39:24,  1.73it/s]

Epoch [1/1], Step [8101/18411], Loss: 1.5392193794250488


 44%|████▍     | 8126/18411 [1:15:19<1:33:49,  1.83it/s]

Epoch [1/1], Step [8126/18411], Loss: 1.0293961763381958


 44%|████▍     | 8151/18411 [1:15:33<1:37:08,  1.76it/s]

Epoch [1/1], Step [8151/18411], Loss: 1.1544824838638306


 44%|████▍     | 8176/18411 [1:15:47<1:34:22,  1.81it/s]

Epoch [1/1], Step [8176/18411], Loss: 1.2340534925460815


 45%|████▍     | 8201/18411 [1:16:01<1:36:19,  1.77it/s]

Epoch [1/1], Step [8201/18411], Loss: 1.2151862382888794


 45%|████▍     | 8226/18411 [1:16:14<1:32:34,  1.83it/s]

Epoch [1/1], Step [8226/18411], Loss: 1.2628196477890015


 45%|████▍     | 8251/18411 [1:16:28<1:32:20,  1.83it/s]

Epoch [1/1], Step [8251/18411], Loss: 1.270963191986084


 45%|████▍     | 8276/18411 [1:16:41<1:34:36,  1.79it/s]

Epoch [1/1], Step [8276/18411], Loss: 1.3455710411071777


 45%|████▌     | 8301/18411 [1:16:55<1:32:02,  1.83it/s]

Epoch [1/1], Step [8301/18411], Loss: 1.2911356687545776


 45%|████▌     | 8326/18411 [1:17:09<1:37:12,  1.73it/s]

Epoch [1/1], Step [8326/18411], Loss: 1.2700779438018799


 45%|████▌     | 8351/18411 [1:17:23<1:31:30,  1.83it/s]

Epoch [1/1], Step [8351/18411], Loss: 1.2409077882766724


 45%|████▌     | 8376/18411 [1:17:36<1:31:12,  1.83it/s]

Epoch [1/1], Step [8376/18411], Loss: 1.4586353302001953


 46%|████▌     | 8401/18411 [1:17:50<1:36:27,  1.73it/s]

Epoch [1/1], Step [8401/18411], Loss: 1.2445693016052246


 46%|████▌     | 8426/18411 [1:18:05<1:34:40,  1.76it/s]

Epoch [1/1], Step [8426/18411], Loss: 1.4322741031646729


 46%|████▌     | 8451/18411 [1:18:19<1:35:53,  1.73it/s]

Epoch [1/1], Step [8451/18411], Loss: 1.2925877571105957


 46%|████▌     | 8476/18411 [1:18:33<1:35:47,  1.73it/s]

Epoch [1/1], Step [8476/18411], Loss: 0.9636564254760742


 46%|████▌     | 8501/18411 [1:18:47<1:33:03,  1.77it/s]

Epoch [1/1], Step [8501/18411], Loss: 1.441537857055664


 46%|████▋     | 8526/18411 [1:19:01<1:29:57,  1.83it/s]

Epoch [1/1], Step [8526/18411], Loss: 1.2944988012313843


 46%|████▋     | 8551/18411 [1:19:14<1:30:22,  1.82it/s]

Epoch [1/1], Step [8551/18411], Loss: 1.4667567014694214


 47%|████▋     | 8576/18411 [1:19:28<1:34:54,  1.73it/s]

Epoch [1/1], Step [8576/18411], Loss: 1.242253065109253


 47%|████▋     | 8601/18411 [1:19:43<1:34:46,  1.73it/s]

Epoch [1/1], Step [8601/18411], Loss: 1.1578675508499146


 47%|████▋     | 8626/18411 [1:19:57<1:29:40,  1.82it/s]

Epoch [1/1], Step [8626/18411], Loss: 1.2691891193389893


 47%|████▋     | 8651/18411 [1:20:10<1:31:41,  1.77it/s]

Epoch [1/1], Step [8651/18411], Loss: 1.3183313608169556


 47%|████▋     | 8676/18411 [1:20:24<1:33:37,  1.73it/s]

Epoch [1/1], Step [8676/18411], Loss: 1.1361109018325806


 47%|████▋     | 8701/18411 [1:20:39<1:34:02,  1.72it/s]

Epoch [1/1], Step [8701/18411], Loss: 1.1191238164901733


 47%|████▋     | 8726/18411 [1:20:53<1:31:05,  1.77it/s]

Epoch [1/1], Step [8726/18411], Loss: 1.380029559135437


 48%|████▊     | 8751/18411 [1:21:07<1:29:16,  1.80it/s]

Epoch [1/1], Step [8751/18411], Loss: 1.2191193103790283


 48%|████▊     | 8776/18411 [1:21:20<1:28:17,  1.82it/s]

Epoch [1/1], Step [8776/18411], Loss: 0.939292311668396


 48%|████▊     | 8801/18411 [1:21:34<1:28:28,  1.81it/s]

Epoch [1/1], Step [8801/18411], Loss: 1.1129425764083862


 48%|████▊     | 8826/18411 [1:21:48<1:33:06,  1.72it/s]

Epoch [1/1], Step [8826/18411], Loss: 1.2016087770462036


 48%|████▊     | 8851/18411 [1:22:03<1:31:39,  1.74it/s]

Epoch [1/1], Step [8851/18411], Loss: 0.9878941178321838


 48%|████▊     | 8876/18411 [1:22:16<1:27:27,  1.82it/s]

Epoch [1/1], Step [8876/18411], Loss: 1.1372157335281372


 48%|████▊     | 8901/18411 [1:22:30<1:27:10,  1.82it/s]

Epoch [1/1], Step [8901/18411], Loss: 1.2146023511886597


 48%|████▊     | 8926/18411 [1:22:44<1:31:20,  1.73it/s]

Epoch [1/1], Step [8926/18411], Loss: 1.0751603841781616


 49%|████▊     | 8951/18411 [1:22:58<1:26:07,  1.83it/s]

Epoch [1/1], Step [8951/18411], Loss: 1.2434719800949097


 49%|████▉     | 8976/18411 [1:23:12<1:31:03,  1.73it/s]

Epoch [1/1], Step [8976/18411], Loss: 1.0328649282455444


 49%|████▉     | 9001/18411 [1:23:26<1:29:27,  1.75it/s]

Epoch [1/1], Step [9001/18411], Loss: 1.195564866065979


 49%|████▉     | 9026/18411 [1:23:40<1:30:27,  1.73it/s]

Epoch [1/1], Step [9026/18411], Loss: 1.1721755266189575


 49%|████▉     | 9051/18411 [1:23:54<1:26:44,  1.80it/s]

Epoch [1/1], Step [9051/18411], Loss: 1.20404851436615


 49%|████▉     | 9076/18411 [1:24:08<1:25:13,  1.83it/s]

Epoch [1/1], Step [9076/18411], Loss: 0.9288732409477234


 49%|████▉     | 9101/18411 [1:24:22<1:24:43,  1.83it/s]

Epoch [1/1], Step [9101/18411], Loss: 1.091562271118164


 50%|████▉     | 9126/18411 [1:24:36<1:28:47,  1.74it/s]

Epoch [1/1], Step [9126/18411], Loss: 1.2910583019256592


 50%|████▉     | 9151/18411 [1:24:49<1:24:12,  1.83it/s]

Epoch [1/1], Step [9151/18411], Loss: 1.1706329584121704


 50%|████▉     | 9176/18411 [1:25:03<1:27:30,  1.76it/s]

Epoch [1/1], Step [9176/18411], Loss: 0.9561737775802612


 50%|████▉     | 9201/18411 [1:25:17<1:28:35,  1.73it/s]

Epoch [1/1], Step [9201/18411], Loss: 1.0797415971755981


 50%|█████     | 9226/18411 [1:25:31<1:23:42,  1.83it/s]

Epoch [1/1], Step [9226/18411], Loss: 1.0610122680664062


 50%|█████     | 9251/18411 [1:25:44<1:23:42,  1.82it/s]

Epoch [1/1], Step [9251/18411], Loss: 1.381881594657898


 50%|█████     | 9276/18411 [1:25:58<1:25:47,  1.77it/s]

Epoch [1/1], Step [9276/18411], Loss: 1.0501781702041626


 51%|█████     | 9301/18411 [1:26:13<1:27:45,  1.73it/s]

Epoch [1/1], Step [9301/18411], Loss: 1.1161388158798218


 51%|█████     | 9326/18411 [1:26:27<1:27:28,  1.73it/s]

Epoch [1/1], Step [9326/18411], Loss: 1.1004501581192017


 51%|█████     | 9351/18411 [1:26:41<1:23:26,  1.81it/s]

Epoch [1/1], Step [9351/18411], Loss: 1.2069251537322998


 51%|█████     | 9376/18411 [1:26:54<1:22:37,  1.82it/s]

Epoch [1/1], Step [9376/18411], Loss: 1.0330718755722046


 51%|█████     | 9401/18411 [1:27:08<1:24:25,  1.78it/s]

Epoch [1/1], Step [9401/18411], Loss: 1.0475965738296509


 51%|█████     | 9426/18411 [1:27:22<1:23:15,  1.80it/s]

Epoch [1/1], Step [9426/18411], Loss: 1.226586937904358


 51%|█████▏    | 9451/18411 [1:27:36<1:23:55,  1.78it/s]

Epoch [1/1], Step [9451/18411], Loss: 1.0374358892440796


 51%|█████▏    | 9476/18411 [1:27:50<1:23:17,  1.79it/s]

Epoch [1/1], Step [9476/18411], Loss: 1.1759958267211914


 52%|█████▏    | 9501/18411 [1:28:03<1:21:07,  1.83it/s]

Epoch [1/1], Step [9501/18411], Loss: 1.069788932800293


 52%|█████▏    | 9526/18411 [1:28:17<1:22:03,  1.80it/s]

Epoch [1/1], Step [9526/18411], Loss: 1.0584412813186646


 52%|█████▏    | 9551/18411 [1:28:31<1:20:48,  1.83it/s]

Epoch [1/1], Step [9551/18411], Loss: 0.9933385252952576


 52%|█████▏    | 9576/18411 [1:28:45<1:23:11,  1.77it/s]

Epoch [1/1], Step [9576/18411], Loss: 1.074298620223999


 52%|█████▏    | 9601/18411 [1:28:59<1:22:00,  1.79it/s]

Epoch [1/1], Step [9601/18411], Loss: 1.0309866666793823


 52%|█████▏    | 9626/18411 [1:29:12<1:21:03,  1.81it/s]

Epoch [1/1], Step [9626/18411], Loss: 1.0777465105056763


 52%|█████▏    | 9651/18411 [1:29:26<1:20:08,  1.82it/s]

Epoch [1/1], Step [9651/18411], Loss: 1.138648271560669


 53%|█████▎    | 9676/18411 [1:29:40<1:20:14,  1.81it/s]

Epoch [1/1], Step [9676/18411], Loss: 0.8398889899253845


 53%|█████▎    | 9701/18411 [1:29:54<1:22:54,  1.75it/s]

Epoch [1/1], Step [9701/18411], Loss: 0.6904557943344116


 53%|█████▎    | 9726/18411 [1:30:07<1:19:05,  1.83it/s]

Epoch [1/1], Step [9726/18411], Loss: 0.9464301466941833


 53%|█████▎    | 9751/18411 [1:30:21<1:20:01,  1.80it/s]

Epoch [1/1], Step [9751/18411], Loss: 1.1317353248596191


 53%|█████▎    | 9776/18411 [1:30:35<1:19:46,  1.80it/s]

Epoch [1/1], Step [9776/18411], Loss: 1.2075152397155762


 53%|█████▎    | 9801/18411 [1:30:48<1:18:56,  1.82it/s]

Epoch [1/1], Step [9801/18411], Loss: 1.1944007873535156


 53%|█████▎    | 9826/18411 [1:31:02<1:18:19,  1.83it/s]

Epoch [1/1], Step [9826/18411], Loss: 0.8759249448776245


 54%|█████▎    | 9851/18411 [1:31:16<1:21:36,  1.75it/s]

Epoch [1/1], Step [9851/18411], Loss: 1.282027006149292


 54%|█████▎    | 9876/18411 [1:31:30<1:19:42,  1.78it/s]

Epoch [1/1], Step [9876/18411], Loss: 0.9276576638221741


 54%|█████▍    | 9901/18411 [1:31:43<1:17:34,  1.83it/s]

Epoch [1/1], Step [9901/18411], Loss: 1.0971182584762573


 54%|█████▍    | 9926/18411 [1:31:57<1:21:16,  1.74it/s]

Epoch [1/1], Step [9926/18411], Loss: 1.1109776496887207


 54%|█████▍    | 9951/18411 [1:32:11<1:20:20,  1.75it/s]

Epoch [1/1], Step [9951/18411], Loss: 0.9259126782417297


 54%|█████▍    | 9976/18411 [1:32:26<1:21:15,  1.73it/s]

Epoch [1/1], Step [9976/18411], Loss: 0.92336505651474


 54%|█████▍    | 10001/18411 [1:32:40<1:17:57,  1.80it/s]

Epoch [1/1], Step [10001/18411], Loss: 0.8067215085029602


 54%|█████▍    | 10026/18411 [1:32:53<1:17:22,  1.81it/s]

Epoch [1/1], Step [10026/18411], Loss: 0.8977952003479004


 55%|█████▍    | 10051/18411 [1:33:07<1:18:51,  1.77it/s]

Epoch [1/1], Step [10051/18411], Loss: 1.2365418672561646


 55%|█████▍    | 10076/18411 [1:33:21<1:15:59,  1.83it/s]

Epoch [1/1], Step [10076/18411], Loss: 0.919282853603363


 55%|█████▍    | 10101/18411 [1:33:35<1:19:18,  1.75it/s]

Epoch [1/1], Step [10101/18411], Loss: 1.0161352157592773


 55%|█████▍    | 10126/18411 [1:33:48<1:15:36,  1.83it/s]

Epoch [1/1], Step [10126/18411], Loss: 1.0650736093521118


 55%|█████▌    | 10151/18411 [1:34:02<1:15:14,  1.83it/s]

Epoch [1/1], Step [10151/18411], Loss: 0.9669404029846191


 55%|█████▌    | 10176/18411 [1:34:16<1:16:01,  1.81it/s]

Epoch [1/1], Step [10176/18411], Loss: 0.9059196710586548


 55%|█████▌    | 10201/18411 [1:34:29<1:18:53,  1.73it/s]

Epoch [1/1], Step [10201/18411], Loss: 1.0375210046768188


 56%|█████▌    | 10226/18411 [1:34:43<1:14:35,  1.83it/s]

Epoch [1/1], Step [10226/18411], Loss: 0.8962680697441101


 56%|█████▌    | 10251/18411 [1:34:57<1:16:25,  1.78it/s]

Epoch [1/1], Step [10251/18411], Loss: 1.0593687295913696


 56%|█████▌    | 10276/18411 [1:35:11<1:14:17,  1.83it/s]

Epoch [1/1], Step [10276/18411], Loss: 0.8625479340553284


 56%|█████▌    | 10301/18411 [1:35:25<1:14:50,  1.81it/s]

Epoch [1/1], Step [10301/18411], Loss: 1.1526802778244019


 56%|█████▌    | 10326/18411 [1:35:38<1:13:38,  1.83it/s]

Epoch [1/1], Step [10326/18411], Loss: 0.9492884278297424


 56%|█████▌    | 10351/18411 [1:35:52<1:16:59,  1.74it/s]

Epoch [1/1], Step [10351/18411], Loss: 0.815232515335083


 56%|█████▋    | 10376/18411 [1:36:06<1:15:43,  1.77it/s]

Epoch [1/1], Step [10376/18411], Loss: 0.9142078757286072


 56%|█████▋    | 10401/18411 [1:36:20<1:16:14,  1.75it/s]

Epoch [1/1], Step [10401/18411], Loss: 1.193684697151184


 57%|█████▋    | 10426/18411 [1:36:34<1:13:04,  1.82it/s]

Epoch [1/1], Step [10426/18411], Loss: 1.0577856302261353


 57%|█████▋    | 10451/18411 [1:36:47<1:13:44,  1.80it/s]

Epoch [1/1], Step [10451/18411], Loss: 1.0433598756790161


 57%|█████▋    | 10476/18411 [1:37:01<1:15:06,  1.76it/s]

Epoch [1/1], Step [10476/18411], Loss: 0.9257679581642151


 57%|█████▋    | 10501/18411 [1:37:15<1:16:19,  1.73it/s]

Epoch [1/1], Step [10501/18411], Loss: 1.025530457496643


 57%|█████▋    | 10526/18411 [1:37:29<1:11:55,  1.83it/s]

Epoch [1/1], Step [10526/18411], Loss: 1.0055030584335327


 57%|█████▋    | 10551/18411 [1:37:42<1:11:52,  1.82it/s]

Epoch [1/1], Step [10551/18411], Loss: 1.1234527826309204


 57%|█████▋    | 10576/18411 [1:37:56<1:12:50,  1.79it/s]

Epoch [1/1], Step [10576/18411], Loss: 0.9069619178771973


 58%|█████▊    | 10601/18411 [1:38:10<1:12:03,  1.81it/s]

Epoch [1/1], Step [10601/18411], Loss: 1.0257165431976318


 58%|█████▊    | 10626/18411 [1:38:24<1:15:08,  1.73it/s]

Epoch [1/1], Step [10626/18411], Loss: 1.0319300889968872


 58%|█████▊    | 10651/18411 [1:38:38<1:14:40,  1.73it/s]

Epoch [1/1], Step [10651/18411], Loss: 1.0178370475769043


 58%|█████▊    | 10676/18411 [1:38:52<1:12:41,  1.77it/s]

Epoch [1/1], Step [10676/18411], Loss: 1.0719130039215088


 58%|█████▊    | 10701/18411 [1:39:06<1:11:33,  1.80it/s]

Epoch [1/1], Step [10701/18411], Loss: 0.88516765832901


 58%|█████▊    | 10726/18411 [1:39:20<1:12:53,  1.76it/s]

Epoch [1/1], Step [10726/18411], Loss: 1.052710771560669


 58%|█████▊    | 10751/18411 [1:39:34<1:10:08,  1.82it/s]

Epoch [1/1], Step [10751/18411], Loss: 1.1113046407699585


 59%|█████▊    | 10776/18411 [1:39:48<1:09:29,  1.83it/s]

Epoch [1/1], Step [10776/18411], Loss: 1.152193546295166


 59%|█████▊    | 10801/18411 [1:40:01<1:10:30,  1.80it/s]

Epoch [1/1], Step [10801/18411], Loss: 0.978086531162262


 59%|█████▉    | 10826/18411 [1:40:15<1:10:11,  1.80it/s]

Epoch [1/1], Step [10826/18411], Loss: 1.0261173248291016


 59%|█████▉    | 10851/18411 [1:40:29<1:09:32,  1.81it/s]

Epoch [1/1], Step [10851/18411], Loss: 0.9792565703392029


 59%|█████▉    | 10876/18411 [1:40:43<1:09:05,  1.82it/s]

Epoch [1/1], Step [10876/18411], Loss: 1.0571564435958862


 59%|█████▉    | 10901/18411 [1:40:56<1:09:17,  1.81it/s]

Epoch [1/1], Step [10901/18411], Loss: 0.968112051486969


 59%|█████▉    | 10926/18411 [1:41:10<1:08:31,  1.82it/s]

Epoch [1/1], Step [10926/18411], Loss: 1.0862921476364136


 59%|█████▉    | 10951/18411 [1:41:24<1:07:54,  1.83it/s]

Epoch [1/1], Step [10951/18411], Loss: 0.940789520740509


 60%|█████▉    | 10976/18411 [1:41:37<1:07:41,  1.83it/s]

Epoch [1/1], Step [10976/18411], Loss: 0.7975807189941406


 60%|█████▉    | 11001/18411 [1:41:51<1:07:58,  1.82it/s]

Epoch [1/1], Step [11001/18411], Loss: 0.8506855964660645


 60%|█████▉    | 11026/18411 [1:42:05<1:11:17,  1.73it/s]

Epoch [1/1], Step [11026/18411], Loss: 1.0150864124298096


 60%|██████    | 11051/18411 [1:42:19<1:11:06,  1.72it/s]

Epoch [1/1], Step [11051/18411], Loss: 0.8932255506515503


 60%|██████    | 11076/18411 [1:42:33<1:08:11,  1.79it/s]

Epoch [1/1], Step [11076/18411], Loss: 0.9119385480880737


 60%|██████    | 11101/18411 [1:42:47<1:07:32,  1.80it/s]

Epoch [1/1], Step [11101/18411], Loss: 0.7536417245864868


 60%|██████    | 11126/18411 [1:43:01<1:06:13,  1.83it/s]

Epoch [1/1], Step [11126/18411], Loss: 0.7568557858467102


 61%|██████    | 11151/18411 [1:43:14<1:07:05,  1.80it/s]

Epoch [1/1], Step [11151/18411], Loss: 0.7836523056030273


 61%|██████    | 11176/18411 [1:43:29<1:08:42,  1.76it/s]

Epoch [1/1], Step [11176/18411], Loss: 0.8970882892608643


 61%|██████    | 11201/18411 [1:43:42<1:06:26,  1.81it/s]

Epoch [1/1], Step [11201/18411], Loss: 0.9764691591262817


 61%|██████    | 11226/18411 [1:43:56<1:06:25,  1.80it/s]

Epoch [1/1], Step [11226/18411], Loss: 0.76230388879776


 61%|██████    | 11251/18411 [1:44:10<1:05:19,  1.83it/s]

Epoch [1/1], Step [11251/18411], Loss: 0.8021557331085205


 61%|██████    | 11276/18411 [1:44:24<1:05:07,  1.83it/s]

Epoch [1/1], Step [11276/18411], Loss: 1.0712175369262695


 61%|██████▏   | 11301/18411 [1:44:37<1:04:50,  1.83it/s]

Epoch [1/1], Step [11301/18411], Loss: 0.7712180018424988


 62%|██████▏   | 11326/18411 [1:44:51<1:05:17,  1.81it/s]

Epoch [1/1], Step [11326/18411], Loss: 1.0351349115371704


 62%|██████▏   | 11351/18411 [1:45:05<1:08:01,  1.73it/s]

Epoch [1/1], Step [11351/18411], Loss: 1.0377403497695923


 62%|██████▏   | 11376/18411 [1:45:19<1:07:00,  1.75it/s]

Epoch [1/1], Step [11376/18411], Loss: 0.8564895391464233


 62%|██████▏   | 11401/18411 [1:45:33<1:04:41,  1.81it/s]

Epoch [1/1], Step [11401/18411], Loss: 1.1533570289611816


 62%|██████▏   | 11426/18411 [1:45:47<1:03:38,  1.83it/s]

Epoch [1/1], Step [11426/18411], Loss: 0.843757152557373


 62%|██████▏   | 11451/18411 [1:46:01<1:04:53,  1.79it/s]

Epoch [1/1], Step [11451/18411], Loss: 0.7737345695495605


 62%|██████▏   | 11476/18411 [1:46:14<1:04:29,  1.79it/s]

Epoch [1/1], Step [11476/18411], Loss: 0.9254058003425598


 62%|██████▏   | 11501/18411 [1:46:28<1:03:02,  1.83it/s]

Epoch [1/1], Step [11501/18411], Loss: 0.9483506679534912


 63%|██████▎   | 11526/18411 [1:46:42<1:03:09,  1.82it/s]

Epoch [1/1], Step [11526/18411], Loss: 0.6813968420028687


 63%|██████▎   | 11551/18411 [1:46:55<1:02:49,  1.82it/s]

Epoch [1/1], Step [11551/18411], Loss: 1.0127073526382446


 63%|██████▎   | 11576/18411 [1:47:09<1:04:36,  1.76it/s]

Epoch [1/1], Step [11576/18411], Loss: 0.9122501611709595


 63%|██████▎   | 11601/18411 [1:47:23<1:04:32,  1.76it/s]

Epoch [1/1], Step [11601/18411], Loss: 0.7725614905357361


 63%|██████▎   | 11626/18411 [1:47:36<1:02:34,  1.81it/s]

Epoch [1/1], Step [11626/18411], Loss: 0.8178500533103943


 63%|██████▎   | 11651/18411 [1:47:50<1:03:11,  1.78it/s]

Epoch [1/1], Step [11651/18411], Loss: 0.9460147619247437


 63%|██████▎   | 11676/18411 [1:48:04<1:03:48,  1.76it/s]

Epoch [1/1], Step [11676/18411], Loss: 0.9122444987297058


 64%|██████▎   | 11701/18411 [1:48:18<1:01:18,  1.82it/s]

Epoch [1/1], Step [11701/18411], Loss: 0.6653228998184204


 64%|██████▎   | 11726/18411 [1:48:32<1:01:07,  1.82it/s]

Epoch [1/1], Step [11726/18411], Loss: 0.7034371495246887


 64%|██████▍   | 11751/18411 [1:48:45<1:00:44,  1.83it/s]

Epoch [1/1], Step [11751/18411], Loss: 0.7854418754577637


 64%|██████▍   | 11776/18411 [1:48:59<1:04:04,  1.73it/s]

Epoch [1/1], Step [11776/18411], Loss: 0.7789792418479919


 64%|██████▍   | 11801/18411 [1:49:13<1:01:51,  1.78it/s]

Epoch [1/1], Step [11801/18411], Loss: 0.8012067079544067


 64%|██████▍   | 11826/18411 [1:49:27<1:03:33,  1.73it/s]

Epoch [1/1], Step [11826/18411], Loss: 0.7243439555168152


 64%|██████▍   | 11851/18411 [1:49:41<59:48,  1.83it/s]  

Epoch [1/1], Step [11851/18411], Loss: 0.8860514163970947


 65%|██████▍   | 11876/18411 [1:49:54<59:35,  1.83it/s]  

Epoch [1/1], Step [11876/18411], Loss: 1.0290262699127197


 65%|██████▍   | 11901/18411 [1:50:09<1:02:52,  1.73it/s]

Epoch [1/1], Step [11901/18411], Loss: 0.799151599407196


 65%|██████▍   | 11926/18411 [1:50:22<59:16,  1.82it/s]  

Epoch [1/1], Step [11926/18411], Loss: 0.6544020771980286


 65%|██████▍   | 11951/18411 [1:50:36<1:02:15,  1.73it/s]

Epoch [1/1], Step [11951/18411], Loss: 0.9186322093009949


 65%|██████▌   | 11976/18411 [1:50:51<1:02:08,  1.73it/s]

Epoch [1/1], Step [11976/18411], Loss: 0.8120425343513489


 65%|██████▌   | 12001/18411 [1:51:04<58:28,  1.83it/s]  

Epoch [1/1], Step [12001/18411], Loss: 0.8131251335144043


 65%|██████▌   | 12026/18411 [1:51:18<58:25,  1.82it/s]  

Epoch [1/1], Step [12026/18411], Loss: 0.9636958837509155


 65%|██████▌   | 12051/18411 [1:51:32<1:01:23,  1.73it/s]

Epoch [1/1], Step [12051/18411], Loss: 0.7326427698135376


 66%|██████▌   | 12076/18411 [1:51:46<57:45,  1.83it/s]  

Epoch [1/1], Step [12076/18411], Loss: 0.8094152808189392


 66%|██████▌   | 12101/18411 [1:51:59<57:28,  1.83it/s]

Epoch [1/1], Step [12101/18411], Loss: 0.9209199547767639


 66%|██████▌   | 12126/18411 [1:52:13<57:20,  1.83it/s]  

Epoch [1/1], Step [12126/18411], Loss: 0.8921829462051392


 66%|██████▌   | 12151/18411 [1:52:27<57:26,  1.82it/s]

Epoch [1/1], Step [12151/18411], Loss: 0.800449550151825


 66%|██████▌   | 12176/18411 [1:52:40<56:52,  1.83it/s]

Epoch [1/1], Step [12176/18411], Loss: 0.8523589968681335


 66%|██████▋   | 12201/18411 [1:52:54<56:39,  1.83it/s]

Epoch [1/1], Step [12201/18411], Loss: 0.9299212694168091


 66%|██████▋   | 12226/18411 [1:53:07<56:25,  1.83it/s]

Epoch [1/1], Step [12226/18411], Loss: 0.9275651574134827


 67%|██████▋   | 12251/18411 [1:53:21<56:16,  1.82it/s]

Epoch [1/1], Step [12251/18411], Loss: 0.9023810029029846


 67%|██████▋   | 12276/18411 [1:53:34<56:21,  1.81it/s]

Epoch [1/1], Step [12276/18411], Loss: 0.9784999489784241


 67%|██████▋   | 12301/18411 [1:53:48<55:42,  1.83it/s]

Epoch [1/1], Step [12301/18411], Loss: 0.8945502042770386


 67%|██████▋   | 12326/18411 [1:54:01<55:45,  1.82it/s]

Epoch [1/1], Step [12326/18411], Loss: 0.8042857050895691


 67%|██████▋   | 12351/18411 [1:54:15<55:17,  1.83it/s]

Epoch [1/1], Step [12351/18411], Loss: 0.4948694705963135


 67%|██████▋   | 12376/18411 [1:54:28<54:59,  1.83it/s]

Epoch [1/1], Step [12376/18411], Loss: 0.5260342955589294


 67%|██████▋   | 12401/18411 [1:54:42<54:46,  1.83it/s]

Epoch [1/1], Step [12401/18411], Loss: 0.9062896370887756


 67%|██████▋   | 12426/18411 [1:54:55<54:42,  1.82it/s]

Epoch [1/1], Step [12426/18411], Loss: 0.8939958214759827


 68%|██████▊   | 12451/18411 [1:55:09<54:18,  1.83it/s]

Epoch [1/1], Step [12451/18411], Loss: 0.6490846872329712


 68%|██████▊   | 12476/18411 [1:55:23<54:28,  1.82it/s]

Epoch [1/1], Step [12476/18411], Loss: 0.7282984256744385


 68%|██████▊   | 12501/18411 [1:55:36<54:41,  1.80it/s]

Epoch [1/1], Step [12501/18411], Loss: 0.7508687973022461


 68%|██████▊   | 12526/18411 [1:55:50<53:37,  1.83it/s]

Epoch [1/1], Step [12526/18411], Loss: 0.7942105531692505


 68%|██████▊   | 12551/18411 [1:56:03<53:29,  1.83it/s]

Epoch [1/1], Step [12551/18411], Loss: 0.5933367013931274


 68%|██████▊   | 12576/18411 [1:56:17<53:10,  1.83it/s]

Epoch [1/1], Step [12576/18411], Loss: 0.971579909324646


 68%|██████▊   | 12601/18411 [1:56:30<52:59,  1.83it/s]

Epoch [1/1], Step [12601/18411], Loss: 0.7341657876968384


 69%|██████▊   | 12626/18411 [1:56:44<53:02,  1.82it/s]

Epoch [1/1], Step [12626/18411], Loss: 0.7840709090232849


 69%|██████▊   | 12651/18411 [1:56:58<52:34,  1.83it/s]

Epoch [1/1], Step [12651/18411], Loss: 0.942905604839325


 69%|██████▉   | 12676/18411 [1:57:11<52:15,  1.83it/s]

Epoch [1/1], Step [12676/18411], Loss: 0.846564531326294


 69%|██████▉   | 12701/18411 [1:57:25<52:13,  1.82it/s]

Epoch [1/1], Step [12701/18411], Loss: 0.7966850399971008


 69%|██████▉   | 12726/18411 [1:57:38<52:41,  1.80it/s]

Epoch [1/1], Step [12726/18411], Loss: 0.5900649428367615


 69%|██████▉   | 12751/18411 [1:57:52<51:44,  1.82it/s]

Epoch [1/1], Step [12751/18411], Loss: 0.8846209049224854


 69%|██████▉   | 12776/18411 [1:58:05<51:42,  1.82it/s]

Epoch [1/1], Step [12776/18411], Loss: 0.5926759243011475


 70%|██████▉   | 12801/18411 [1:58:19<51:52,  1.80it/s]

Epoch [1/1], Step [12801/18411], Loss: 0.7426736354827881


 70%|██████▉   | 12826/18411 [1:58:33<51:41,  1.80it/s]

Epoch [1/1], Step [12826/18411], Loss: 0.7999347448348999


 70%|██████▉   | 12851/18411 [1:58:46<50:50,  1.82it/s]

Epoch [1/1], Step [12851/18411], Loss: 0.5379071831703186


 70%|██████▉   | 12876/18411 [1:59:00<50:30,  1.83it/s]

Epoch [1/1], Step [12876/18411], Loss: 0.6966395378112793


 70%|███████   | 12901/18411 [1:59:13<50:08,  1.83it/s]

Epoch [1/1], Step [12901/18411], Loss: 0.6384490728378296


 70%|███████   | 12926/18411 [1:59:27<50:42,  1.80it/s]

Epoch [1/1], Step [12926/18411], Loss: 0.7684404850006104


 70%|███████   | 12951/18411 [1:59:40<49:49,  1.83it/s]

Epoch [1/1], Step [12951/18411], Loss: 0.7067668437957764


 70%|███████   | 12976/18411 [1:59:54<49:40,  1.82it/s]

Epoch [1/1], Step [12976/18411], Loss: 0.5473188161849976


 71%|███████   | 13001/18411 [2:00:08<49:17,  1.83it/s]

Epoch [1/1], Step [13001/18411], Loss: 0.7405473589897156


 71%|███████   | 13026/18411 [2:00:21<49:06,  1.83it/s]

Epoch [1/1], Step [13026/18411], Loss: 1.068695068359375


 71%|███████   | 13051/18411 [2:00:35<48:50,  1.83it/s]

Epoch [1/1], Step [13051/18411], Loss: 0.7788395285606384


 71%|███████   | 13076/18411 [2:00:48<48:36,  1.83it/s]

Epoch [1/1], Step [13076/18411], Loss: 0.8483514189720154


 71%|███████   | 13101/18411 [2:01:02<48:59,  1.81it/s]

Epoch [1/1], Step [13101/18411], Loss: 0.706569254398346


 71%|███████▏  | 13126/18411 [2:01:16<50:32,  1.74it/s]

Epoch [1/1], Step [13126/18411], Loss: 0.7032701373100281


 71%|███████▏  | 13151/18411 [2:01:30<50:46,  1.73it/s]

Epoch [1/1], Step [13151/18411], Loss: 0.9048652648925781


 72%|███████▏  | 13176/18411 [2:01:44<47:43,  1.83it/s]

Epoch [1/1], Step [13176/18411], Loss: 0.7859429717063904


 72%|███████▏  | 13201/18411 [2:01:57<47:31,  1.83it/s]

Epoch [1/1], Step [13201/18411], Loss: 0.6168420910835266


 72%|███████▏  | 13226/18411 [2:02:11<47:15,  1.83it/s]

Epoch [1/1], Step [13226/18411], Loss: 0.6689770221710205


 72%|███████▏  | 13251/18411 [2:02:24<47:00,  1.83it/s]

Epoch [1/1], Step [13251/18411], Loss: 0.7568662762641907


 72%|███████▏  | 13276/18411 [2:02:38<46:49,  1.83it/s]

Epoch [1/1], Step [13276/18411], Loss: 0.7529901266098022


 72%|███████▏  | 13301/18411 [2:02:51<46:40,  1.82it/s]

Epoch [1/1], Step [13301/18411], Loss: 0.6216385960578918


 72%|███████▏  | 13326/18411 [2:03:05<46:23,  1.83it/s]

Epoch [1/1], Step [13326/18411], Loss: 0.7842172384262085


 73%|███████▎  | 13351/18411 [2:03:18<46:10,  1.83it/s]

Epoch [1/1], Step [13351/18411], Loss: 0.739949643611908


 73%|███████▎  | 13376/18411 [2:03:32<47:06,  1.78it/s]

Epoch [1/1], Step [13376/18411], Loss: 0.5417389273643494


 73%|███████▎  | 13401/18411 [2:03:46<46:30,  1.80it/s]

Epoch [1/1], Step [13401/18411], Loss: 0.7265773415565491


 73%|███████▎  | 13426/18411 [2:04:00<46:17,  1.79it/s]

Epoch [1/1], Step [13426/18411], Loss: 0.794725775718689


 73%|███████▎  | 13451/18411 [2:04:13<45:15,  1.83it/s]

Epoch [1/1], Step [13451/18411], Loss: 0.5658774971961975


 73%|███████▎  | 13476/18411 [2:04:27<45:00,  1.83it/s]

Epoch [1/1], Step [13476/18411], Loss: 0.5054638385772705


 73%|███████▎  | 13501/18411 [2:04:40<44:46,  1.83it/s]

Epoch [1/1], Step [13501/18411], Loss: 0.6559200882911682


 73%|███████▎  | 13526/18411 [2:04:54<44:48,  1.82it/s]

Epoch [1/1], Step [13526/18411], Loss: 0.6749403476715088


 74%|███████▎  | 13551/18411 [2:05:07<44:16,  1.83it/s]

Epoch [1/1], Step [13551/18411], Loss: 0.706325113773346


 74%|███████▎  | 13576/18411 [2:05:21<44:07,  1.83it/s]

Epoch [1/1], Step [13576/18411], Loss: 0.4623773396015167


 74%|███████▍  | 13601/18411 [2:05:35<44:27,  1.80it/s]

Epoch [1/1], Step [13601/18411], Loss: 0.7418451309204102


 74%|███████▍  | 13626/18411 [2:05:48<43:54,  1.82it/s]

Epoch [1/1], Step [13626/18411], Loss: 0.6907400488853455


 74%|███████▍  | 13651/18411 [2:06:02<43:29,  1.82it/s]

Epoch [1/1], Step [13651/18411], Loss: 0.5822035670280457


 74%|███████▍  | 13676/18411 [2:06:15<43:10,  1.83it/s]

Epoch [1/1], Step [13676/18411], Loss: 0.7938037514686584


 74%|███████▍  | 13701/18411 [2:06:29<42:56,  1.83it/s]

Epoch [1/1], Step [13701/18411], Loss: 0.6775826215744019


 75%|███████▍  | 13726/18411 [2:06:42<43:05,  1.81it/s]

Epoch [1/1], Step [13726/18411], Loss: 0.6600261330604553


 75%|███████▍  | 13751/18411 [2:06:56<42:30,  1.83it/s]

Epoch [1/1], Step [13751/18411], Loss: 0.5847415924072266


 75%|███████▍  | 13776/18411 [2:07:09<42:33,  1.82it/s]

Epoch [1/1], Step [13776/18411], Loss: 0.6718093752861023


 75%|███████▍  | 13801/18411 [2:07:23<42:01,  1.83it/s]

Epoch [1/1], Step [13801/18411], Loss: 0.5693637728691101


 75%|███████▌  | 13826/18411 [2:07:37<41:48,  1.83it/s]

Epoch [1/1], Step [13826/18411], Loss: 0.6067602634429932


 75%|███████▌  | 13851/18411 [2:07:50<41:37,  1.83it/s]

Epoch [1/1], Step [13851/18411], Loss: 0.7149626612663269


 75%|███████▌  | 13876/18411 [2:08:04<42:14,  1.79it/s]

Epoch [1/1], Step [13876/18411], Loss: 0.7703264951705933


 76%|███████▌  | 13901/18411 [2:08:17<41:06,  1.83it/s]

Epoch [1/1], Step [13901/18411], Loss: 0.5577892661094666


 76%|███████▌  | 13926/18411 [2:08:31<40:53,  1.83it/s]

Epoch [1/1], Step [13926/18411], Loss: 0.8010144233703613


 76%|███████▌  | 13951/18411 [2:08:44<40:40,  1.83it/s]

Epoch [1/1], Step [13951/18411], Loss: 0.6337377429008484


 76%|███████▌  | 13976/18411 [2:08:58<40:27,  1.83it/s]

Epoch [1/1], Step [13976/18411], Loss: 0.5885214805603027


 76%|███████▌  | 14001/18411 [2:09:11<40:11,  1.83it/s]

Epoch [1/1], Step [14001/18411], Loss: 0.7636903524398804


 76%|███████▌  | 14026/18411 [2:09:25<40:03,  1.82it/s]

Epoch [1/1], Step [14026/18411], Loss: 0.5876750946044922


 76%|███████▋  | 14051/18411 [2:09:39<42:02,  1.73it/s]

Epoch [1/1], Step [14051/18411], Loss: 0.6914359331130981


 76%|███████▋  | 14076/18411 [2:09:53<40:30,  1.78it/s]

Epoch [1/1], Step [14076/18411], Loss: 0.6971803307533264


 77%|███████▋  | 14101/18411 [2:10:06<39:17,  1.83it/s]

Epoch [1/1], Step [14101/18411], Loss: 0.6741093993186951


 77%|███████▋  | 14126/18411 [2:10:20<39:02,  1.83it/s]

Epoch [1/1], Step [14126/18411], Loss: 0.7186462879180908


 77%|███████▋  | 14151/18411 [2:10:33<38:48,  1.83it/s]

Epoch [1/1], Step [14151/18411], Loss: 0.4515366852283478


 77%|███████▋  | 14176/18411 [2:10:47<38:38,  1.83it/s]

Epoch [1/1], Step [14176/18411], Loss: 0.8679170608520508


 77%|███████▋  | 14201/18411 [2:11:01<38:22,  1.83it/s]

Epoch [1/1], Step [14201/18411], Loss: 0.5744237899780273


 77%|███████▋  | 14226/18411 [2:11:14<38:41,  1.80it/s]

Epoch [1/1], Step [14226/18411], Loss: 0.7627114653587341


 77%|███████▋  | 14251/18411 [2:11:28<38:12,  1.81it/s]

Epoch [1/1], Step [14251/18411], Loss: 0.7113087177276611


 78%|███████▊  | 14276/18411 [2:11:41<37:52,  1.82it/s]

Epoch [1/1], Step [14276/18411], Loss: 0.5238170027732849


 78%|███████▊  | 14301/18411 [2:11:55<37:40,  1.82it/s]

Epoch [1/1], Step [14301/18411], Loss: 0.4769074022769928


 78%|███████▊  | 14326/18411 [2:12:09<37:13,  1.83it/s]

Epoch [1/1], Step [14326/18411], Loss: 0.6375439763069153


 78%|███████▊  | 14351/18411 [2:12:22<37:45,  1.79it/s]

Epoch [1/1], Step [14351/18411], Loss: 0.6752530336380005


 78%|███████▊  | 14376/18411 [2:12:36<36:57,  1.82it/s]

Epoch [1/1], Step [14376/18411], Loss: 0.6421542167663574


 78%|███████▊  | 14401/18411 [2:12:49<36:49,  1.81it/s]

Epoch [1/1], Step [14401/18411], Loss: 0.5200881958007812


 78%|███████▊  | 14426/18411 [2:13:03<36:17,  1.83it/s]

Epoch [1/1], Step [14426/18411], Loss: 0.6643665432929993


 78%|███████▊  | 14451/18411 [2:13:17<36:12,  1.82it/s]

Epoch [1/1], Step [14451/18411], Loss: 0.7370418906211853


 79%|███████▊  | 14476/18411 [2:13:30<36:04,  1.82it/s]

Epoch [1/1], Step [14476/18411], Loss: 0.7322944402694702


 79%|███████▉  | 14501/18411 [2:13:44<35:43,  1.82it/s]

Epoch [1/1], Step [14501/18411], Loss: 0.6077839136123657


 79%|███████▉  | 14526/18411 [2:13:57<35:49,  1.81it/s]

Epoch [1/1], Step [14526/18411], Loss: 0.48758238554000854


 79%|███████▉  | 14551/18411 [2:14:11<37:01,  1.74it/s]

Epoch [1/1], Step [14551/18411], Loss: 0.7018321752548218


 79%|███████▉  | 14576/18411 [2:14:25<34:58,  1.83it/s]

Epoch [1/1], Step [14576/18411], Loss: 0.7897995114326477


 79%|███████▉  | 14601/18411 [2:14:39<34:44,  1.83it/s]

Epoch [1/1], Step [14601/18411], Loss: 0.4923868477344513


 79%|███████▉  | 14626/18411 [2:14:52<34:31,  1.83it/s]

Epoch [1/1], Step [14626/18411], Loss: 0.7045316100120544


 80%|███████▉  | 14651/18411 [2:15:06<34:29,  1.82it/s]

Epoch [1/1], Step [14651/18411], Loss: 0.5714495778083801


 80%|███████▉  | 14676/18411 [2:15:20<34:42,  1.79it/s]

Epoch [1/1], Step [14676/18411], Loss: 0.6839393973350525


 80%|███████▉  | 14701/18411 [2:15:33<34:00,  1.82it/s]

Epoch [1/1], Step [14701/18411], Loss: 0.43003976345062256


 80%|███████▉  | 14726/18411 [2:15:47<33:37,  1.83it/s]

Epoch [1/1], Step [14726/18411], Loss: 0.6345674991607666


 80%|████████  | 14751/18411 [2:16:00<33:20,  1.83it/s]

Epoch [1/1], Step [14751/18411], Loss: 0.7752732634544373


 80%|████████  | 14776/18411 [2:16:14<33:30,  1.81it/s]

Epoch [1/1], Step [14776/18411], Loss: 0.5552656054496765


 80%|████████  | 14801/18411 [2:16:27<32:53,  1.83it/s]

Epoch [1/1], Step [14801/18411], Loss: 0.545242428779602


 81%|████████  | 14826/18411 [2:16:41<32:38,  1.83it/s]

Epoch [1/1], Step [14826/18411], Loss: 0.3978725075721741


 81%|████████  | 14851/18411 [2:16:54<32:27,  1.83it/s]

Epoch [1/1], Step [14851/18411], Loss: 0.7159889340400696


 81%|████████  | 14876/18411 [2:17:08<32:45,  1.80it/s]

Epoch [1/1], Step [14876/18411], Loss: 0.7993839979171753


 81%|████████  | 14901/18411 [2:17:22<31:59,  1.83it/s]

Epoch [1/1], Step [14901/18411], Loss: 0.6405995488166809


 81%|████████  | 14926/18411 [2:17:35<31:45,  1.83it/s]

Epoch [1/1], Step [14926/18411], Loss: 0.6747318506240845


 81%|████████  | 14951/18411 [2:17:49<31:42,  1.82it/s]

Epoch [1/1], Step [14951/18411], Loss: 0.5095676183700562


 81%|████████▏ | 14976/18411 [2:18:02<31:19,  1.83it/s]

Epoch [1/1], Step [14976/18411], Loss: 0.5397247672080994


 81%|████████▏ | 15001/18411 [2:18:16<31:05,  1.83it/s]

Epoch [1/1], Step [15001/18411], Loss: 0.4905610680580139


 82%|████████▏ | 15026/18411 [2:18:29<30:51,  1.83it/s]

Epoch [1/1], Step [15026/18411], Loss: 0.5948610305786133


 82%|████████▏ | 15051/18411 [2:18:43<30:37,  1.83it/s]

Epoch [1/1], Step [15051/18411], Loss: 0.7542818784713745


 82%|████████▏ | 15076/18411 [2:18:56<30:24,  1.83it/s]

Epoch [1/1], Step [15076/18411], Loss: 0.5687036514282227


 82%|████████▏ | 15101/18411 [2:19:10<30:10,  1.83it/s]

Epoch [1/1], Step [15101/18411], Loss: 0.6161296963691711


 82%|████████▏ | 15126/18411 [2:19:24<29:56,  1.83it/s]

Epoch [1/1], Step [15126/18411], Loss: 0.5486301779747009


 82%|████████▏ | 15151/18411 [2:19:37<29:41,  1.83it/s]

Epoch [1/1], Step [15151/18411], Loss: 0.5002360343933105


 82%|████████▏ | 15176/18411 [2:19:51<29:29,  1.83it/s]

Epoch [1/1], Step [15176/18411], Loss: 0.5918630957603455


 83%|████████▎ | 15201/18411 [2:20:04<29:15,  1.83it/s]

Epoch [1/1], Step [15201/18411], Loss: 0.6928386092185974


 83%|████████▎ | 15226/18411 [2:20:18<29:01,  1.83it/s]

Epoch [1/1], Step [15226/18411], Loss: 0.5154706835746765


 83%|████████▎ | 15251/18411 [2:20:31<28:49,  1.83it/s]

Epoch [1/1], Step [15251/18411], Loss: 0.4372379779815674


 83%|████████▎ | 15276/18411 [2:20:45<28:41,  1.82it/s]

Epoch [1/1], Step [15276/18411], Loss: 0.684462308883667


 83%|████████▎ | 15301/18411 [2:20:58<28:21,  1.83it/s]

Epoch [1/1], Step [15301/18411], Loss: 0.48973405361175537


 83%|████████▎ | 15326/18411 [2:21:12<28:06,  1.83it/s]

Epoch [1/1], Step [15326/18411], Loss: 0.6463407278060913


 83%|████████▎ | 15351/18411 [2:21:25<27:57,  1.82it/s]

Epoch [1/1], Step [15351/18411], Loss: 0.6861053705215454


 84%|████████▎ | 15376/18411 [2:21:39<27:39,  1.83it/s]

Epoch [1/1], Step [15376/18411], Loss: 0.597206711769104


 84%|████████▎ | 15401/18411 [2:21:52<27:27,  1.83it/s]

Epoch [1/1], Step [15401/18411], Loss: 0.6263267397880554


 84%|████████▍ | 15426/18411 [2:22:06<27:37,  1.80it/s]

Epoch [1/1], Step [15426/18411], Loss: 0.5706032514572144


 84%|████████▍ | 15451/18411 [2:22:20<28:16,  1.74it/s]

Epoch [1/1], Step [15451/18411], Loss: 0.6328779458999634


 84%|████████▍ | 15476/18411 [2:22:34<26:53,  1.82it/s]

Epoch [1/1], Step [15476/18411], Loss: 0.5554161667823792


 84%|████████▍ | 15501/18411 [2:22:48<26:34,  1.83it/s]

Epoch [1/1], Step [15501/18411], Loss: 0.6238630414009094


 84%|████████▍ | 15526/18411 [2:23:01<26:25,  1.82it/s]

Epoch [1/1], Step [15526/18411], Loss: 0.801811695098877


 84%|████████▍ | 15551/18411 [2:23:15<26:14,  1.82it/s]

Epoch [1/1], Step [15551/18411], Loss: 0.5483880639076233


 85%|████████▍ | 15576/18411 [2:23:28<25:56,  1.82it/s]

Epoch [1/1], Step [15576/18411], Loss: 0.6036145091056824


 85%|████████▍ | 15601/18411 [2:23:42<25:40,  1.82it/s]

Epoch [1/1], Step [15601/18411], Loss: 0.5174009799957275


 85%|████████▍ | 15626/18411 [2:23:55<25:27,  1.82it/s]

Epoch [1/1], Step [15626/18411], Loss: 0.4644168019294739


 85%|████████▌ | 15651/18411 [2:24:09<25:11,  1.83it/s]

Epoch [1/1], Step [15651/18411], Loss: 0.6420789361000061


 85%|████████▌ | 15676/18411 [2:24:23<24:59,  1.82it/s]

Epoch [1/1], Step [15676/18411], Loss: 0.5514038801193237


 85%|████████▌ | 15701/18411 [2:24:36<25:49,  1.75it/s]

Epoch [1/1], Step [15701/18411], Loss: 0.5980334281921387


 85%|████████▌ | 15726/18411 [2:24:50<25:38,  1.74it/s]

Epoch [1/1], Step [15726/18411], Loss: 0.6076196432113647


 86%|████████▌ | 15751/18411 [2:25:04<24:19,  1.82it/s]

Epoch [1/1], Step [15751/18411], Loss: 0.5141921043395996


 86%|████████▌ | 15776/18411 [2:25:18<24:04,  1.82it/s]

Epoch [1/1], Step [15776/18411], Loss: 0.5081605315208435


 86%|████████▌ | 15801/18411 [2:25:31<23:57,  1.82it/s]

Epoch [1/1], Step [15801/18411], Loss: 0.4920215308666229


 86%|████████▌ | 15826/18411 [2:25:45<23:40,  1.82it/s]

Epoch [1/1], Step [15826/18411], Loss: 0.45416325330734253


 86%|████████▌ | 15851/18411 [2:25:58<23:28,  1.82it/s]

Epoch [1/1], Step [15851/18411], Loss: 0.5633214712142944


 86%|████████▌ | 15876/18411 [2:26:12<23:10,  1.82it/s]

Epoch [1/1], Step [15876/18411], Loss: 0.5097110271453857


 86%|████████▋ | 15901/18411 [2:26:26<22:59,  1.82it/s]

Epoch [1/1], Step [15901/18411], Loss: 0.5909535884857178


 87%|████████▋ | 15926/18411 [2:26:39<22:42,  1.82it/s]

Epoch [1/1], Step [15926/18411], Loss: 0.5969482660293579


 87%|████████▋ | 15951/18411 [2:26:53<22:28,  1.82it/s]

Epoch [1/1], Step [15951/18411], Loss: 0.6003382802009583


 87%|████████▋ | 15976/18411 [2:27:06<23:30,  1.73it/s]

Epoch [1/1], Step [15976/18411], Loss: 0.6937961578369141


 87%|████████▋ | 16001/18411 [2:27:20<21:59,  1.83it/s]

Epoch [1/1], Step [16001/18411], Loss: 0.6423401236534119


 87%|████████▋ | 16026/18411 [2:27:34<21:46,  1.83it/s]

Epoch [1/1], Step [16026/18411], Loss: 0.4902324676513672


 87%|████████▋ | 16051/18411 [2:27:48<21:34,  1.82it/s]

Epoch [1/1], Step [16051/18411], Loss: 0.5188096761703491


 87%|████████▋ | 16076/18411 [2:28:01<21:21,  1.82it/s]

Epoch [1/1], Step [16076/18411], Loss: 0.6114518046379089


 87%|████████▋ | 16101/18411 [2:28:15<21:07,  1.82it/s]

Epoch [1/1], Step [16101/18411], Loss: 0.5823948383331299


 88%|████████▊ | 16126/18411 [2:28:28<20:53,  1.82it/s]

Epoch [1/1], Step [16126/18411], Loss: 0.7778500318527222


 88%|████████▊ | 16151/18411 [2:28:42<20:40,  1.82it/s]

Epoch [1/1], Step [16151/18411], Loss: 0.5605342984199524


 88%|████████▊ | 16176/18411 [2:28:55<20:26,  1.82it/s]

Epoch [1/1], Step [16176/18411], Loss: 0.5010561943054199


 88%|████████▊ | 16201/18411 [2:29:09<20:12,  1.82it/s]

Epoch [1/1], Step [16201/18411], Loss: 0.4717017412185669


 88%|████████▊ | 16226/18411 [2:29:23<20:15,  1.80it/s]

Epoch [1/1], Step [16226/18411], Loss: 0.5746870636940002


 88%|████████▊ | 16251/18411 [2:29:36<19:46,  1.82it/s]

Epoch [1/1], Step [16251/18411], Loss: 0.5389412641525269


 88%|████████▊ | 16276/18411 [2:29:50<19:31,  1.82it/s]

Epoch [1/1], Step [16276/18411], Loss: 0.6200768947601318


 89%|████████▊ | 16301/18411 [2:30:03<19:17,  1.82it/s]

Epoch [1/1], Step [16301/18411], Loss: 0.3664115071296692


 89%|████████▊ | 16326/18411 [2:30:17<19:03,  1.82it/s]

Epoch [1/1], Step [16326/18411], Loss: 0.5062811374664307


 89%|████████▉ | 16351/18411 [2:30:31<18:50,  1.82it/s]

Epoch [1/1], Step [16351/18411], Loss: 0.5469502806663513


 89%|████████▉ | 16376/18411 [2:30:44<18:38,  1.82it/s]

Epoch [1/1], Step [16376/18411], Loss: 0.4887562692165375


 89%|████████▉ | 16401/18411 [2:30:58<18:21,  1.82it/s]

Epoch [1/1], Step [16401/18411], Loss: 0.5066556334495544


 89%|████████▉ | 16426/18411 [2:31:11<18:09,  1.82it/s]

Epoch [1/1], Step [16426/18411], Loss: 0.5614338517189026


 89%|████████▉ | 16451/18411 [2:31:25<18:11,  1.80it/s]

Epoch [1/1], Step [16451/18411], Loss: 0.5759936571121216


 89%|████████▉ | 16476/18411 [2:31:39<17:39,  1.83it/s]

Epoch [1/1], Step [16476/18411], Loss: 0.5258963704109192


 90%|████████▉ | 16501/18411 [2:31:52<17:27,  1.82it/s]

Epoch [1/1], Step [16501/18411], Loss: 0.5379680395126343


 90%|████████▉ | 16526/18411 [2:32:06<17:14,  1.82it/s]

Epoch [1/1], Step [16526/18411], Loss: 0.4874667227268219


 90%|████████▉ | 16551/18411 [2:32:20<17:00,  1.82it/s]

Epoch [1/1], Step [16551/18411], Loss: 0.6115305423736572


 90%|█████████ | 16576/18411 [2:32:33<16:58,  1.80it/s]

Epoch [1/1], Step [16576/18411], Loss: 0.4982704520225525


 90%|█████████ | 16601/18411 [2:32:47<16:37,  1.82it/s]

Epoch [1/1], Step [16601/18411], Loss: 0.3680025041103363


 90%|█████████ | 16626/18411 [2:33:00<16:18,  1.82it/s]

Epoch [1/1], Step [16626/18411], Loss: 0.47640350461006165


 90%|█████████ | 16651/18411 [2:33:14<16:06,  1.82it/s]

Epoch [1/1], Step [16651/18411], Loss: 0.4903671443462372


 91%|█████████ | 16676/18411 [2:33:28<15:52,  1.82it/s]

Epoch [1/1], Step [16676/18411], Loss: 0.4452381730079651


 91%|█████████ | 16701/18411 [2:33:41<15:37,  1.82it/s]

Epoch [1/1], Step [16701/18411], Loss: 0.5140186548233032


 91%|█████████ | 16726/18411 [2:33:55<15:32,  1.81it/s]

Epoch [1/1], Step [16726/18411], Loss: 0.5078063607215881


 91%|█████████ | 16751/18411 [2:34:08<15:10,  1.82it/s]

Epoch [1/1], Step [16751/18411], Loss: 0.5337066054344177


 91%|█████████ | 16776/18411 [2:34:22<14:59,  1.82it/s]

Epoch [1/1], Step [16776/18411], Loss: 0.4558230936527252


 91%|█████████▏| 16801/18411 [2:34:35<14:42,  1.82it/s]

Epoch [1/1], Step [16801/18411], Loss: 0.5873456001281738


 91%|█████████▏| 16826/18411 [2:34:49<14:34,  1.81it/s]

Epoch [1/1], Step [16826/18411], Loss: 0.4868232309818268


 92%|█████████▏| 16851/18411 [2:35:03<14:17,  1.82it/s]

Epoch [1/1], Step [16851/18411], Loss: 0.44296565651893616


 92%|█████████▏| 16876/18411 [2:35:16<14:02,  1.82it/s]

Epoch [1/1], Step [16876/18411], Loss: 0.42814648151397705


 92%|█████████▏| 16901/18411 [2:35:30<13:49,  1.82it/s]

Epoch [1/1], Step [16901/18411], Loss: 0.510318398475647


 92%|█████████▏| 16926/18411 [2:35:43<13:34,  1.82it/s]

Epoch [1/1], Step [16926/18411], Loss: 0.4186801612377167


 92%|█████████▏| 16951/18411 [2:35:57<13:24,  1.81it/s]

Epoch [1/1], Step [16951/18411], Loss: 0.49233385920524597


 92%|█████████▏| 16976/18411 [2:36:11<13:07,  1.82it/s]

Epoch [1/1], Step [16976/18411], Loss: 0.5307062268257141


 92%|█████████▏| 17001/18411 [2:36:24<12:57,  1.81it/s]

Epoch [1/1], Step [17001/18411], Loss: 0.48579245805740356


 92%|█████████▏| 17026/18411 [2:36:38<12:40,  1.82it/s]

Epoch [1/1], Step [17026/18411], Loss: 0.6753261089324951


 93%|█████████▎| 17051/18411 [2:36:52<12:25,  1.82it/s]

Epoch [1/1], Step [17051/18411], Loss: 0.5422696471214294


 93%|█████████▎| 17076/18411 [2:37:05<12:25,  1.79it/s]

Epoch [1/1], Step [17076/18411], Loss: 0.5659211874008179


 93%|█████████▎| 17101/18411 [2:37:19<11:58,  1.82it/s]

Epoch [1/1], Step [17101/18411], Loss: 0.5374500751495361


 93%|█████████▎| 17126/18411 [2:37:32<11:43,  1.83it/s]

Epoch [1/1], Step [17126/18411], Loss: 0.5238627195358276


 93%|█████████▎| 17151/18411 [2:37:46<11:35,  1.81it/s]

Epoch [1/1], Step [17151/18411], Loss: 0.4811370074748993


 93%|█████████▎| 17176/18411 [2:38:00<11:55,  1.73it/s]

Epoch [1/1], Step [17176/18411], Loss: 0.4575045108795166


 93%|█████████▎| 17201/18411 [2:38:14<11:05,  1.82it/s]

Epoch [1/1], Step [17201/18411], Loss: 0.3474278748035431


 94%|█████████▎| 17226/18411 [2:38:28<10:52,  1.82it/s]

Epoch [1/1], Step [17226/18411], Loss: 0.42400094866752625


 94%|█████████▎| 17251/18411 [2:38:42<10:48,  1.79it/s]

Epoch [1/1], Step [17251/18411], Loss: 0.40238991379737854


 94%|█████████▍| 17276/18411 [2:38:56<10:24,  1.82it/s]

Epoch [1/1], Step [17276/18411], Loss: 0.5837234854698181


 94%|█████████▍| 17301/18411 [2:39:09<10:07,  1.83it/s]

Epoch [1/1], Step [17301/18411], Loss: 0.4206671714782715


 94%|█████████▍| 17326/18411 [2:39:23<10:26,  1.73it/s]

Epoch [1/1], Step [17326/18411], Loss: 0.3907450735569


 94%|█████████▍| 17351/18411 [2:39:37<09:56,  1.78it/s]

Epoch [1/1], Step [17351/18411], Loss: 0.546348512172699


 94%|█████████▍| 17376/18411 [2:39:51<09:28,  1.82it/s]

Epoch [1/1], Step [17376/18411], Loss: 0.4088731110095978


 95%|█████████▍| 17401/18411 [2:40:05<09:40,  1.74it/s]

Epoch [1/1], Step [17401/18411], Loss: 0.42063748836517334


 95%|█████████▍| 17426/18411 [2:40:18<09:12,  1.78it/s]

Epoch [1/1], Step [17426/18411], Loss: 0.4300004243850708


 95%|█████████▍| 17451/18411 [2:40:32<08:45,  1.83it/s]

Epoch [1/1], Step [17451/18411], Loss: 0.46231070160865784


 95%|█████████▍| 17476/18411 [2:40:46<08:31,  1.83it/s]

Epoch [1/1], Step [17476/18411], Loss: 0.3340107500553131


 95%|█████████▌| 17501/18411 [2:40:59<08:21,  1.82it/s]

Epoch [1/1], Step [17501/18411], Loss: 0.45983096957206726


 95%|█████████▌| 17526/18411 [2:41:13<08:06,  1.82it/s]

Epoch [1/1], Step [17526/18411], Loss: 0.3390153646469116


 95%|█████████▌| 17551/18411 [2:41:27<07:49,  1.83it/s]

Epoch [1/1], Step [17551/18411], Loss: 0.6015251874923706


 95%|█████████▌| 17576/18411 [2:41:40<07:36,  1.83it/s]

Epoch [1/1], Step [17576/18411], Loss: 0.7637864351272583


 96%|█████████▌| 17601/18411 [2:41:54<07:22,  1.83it/s]

Epoch [1/1], Step [17601/18411], Loss: 0.48015347123146057


 96%|█████████▌| 17626/18411 [2:42:07<07:08,  1.83it/s]

Epoch [1/1], Step [17626/18411], Loss: 0.4779093861579895


 96%|█████████▌| 17651/18411 [2:42:21<06:54,  1.83it/s]

Epoch [1/1], Step [17651/18411], Loss: 0.46565109491348267


 96%|█████████▌| 17676/18411 [2:42:34<06:40,  1.83it/s]

Epoch [1/1], Step [17676/18411], Loss: 0.4436546266078949


 96%|█████████▌| 17701/18411 [2:42:48<06:27,  1.83it/s]

Epoch [1/1], Step [17701/18411], Loss: 0.7213284373283386


 96%|█████████▋| 17726/18411 [2:43:01<06:13,  1.83it/s]

Epoch [1/1], Step [17726/18411], Loss: 0.3647501766681671


 96%|█████████▋| 17751/18411 [2:43:15<06:00,  1.83it/s]

Epoch [1/1], Step [17751/18411], Loss: 0.4722226858139038


 97%|█████████▋| 17776/18411 [2:43:28<05:46,  1.83it/s]

Epoch [1/1], Step [17776/18411], Loss: 0.5298581123352051


 97%|█████████▋| 17801/18411 [2:43:42<05:52,  1.73it/s]

Epoch [1/1], Step [17801/18411], Loss: 0.5703009366989136


 97%|█████████▋| 17826/18411 [2:43:56<05:22,  1.81it/s]

Epoch [1/1], Step [17826/18411], Loss: 0.4448164701461792


 97%|█████████▋| 17851/18411 [2:44:10<05:23,  1.73it/s]

Epoch [1/1], Step [17851/18411], Loss: 0.47591257095336914


 97%|█████████▋| 17876/18411 [2:44:23<04:57,  1.80it/s]

Epoch [1/1], Step [17876/18411], Loss: 0.3171903192996979


 97%|█████████▋| 17901/18411 [2:44:37<04:39,  1.83it/s]

Epoch [1/1], Step [17901/18411], Loss: 0.45273056626319885


 97%|█████████▋| 17926/18411 [2:44:51<04:40,  1.73it/s]

Epoch [1/1], Step [17926/18411], Loss: 0.39849546551704407


 98%|█████████▊| 17951/18411 [2:45:06<04:25,  1.73it/s]

Epoch [1/1], Step [17951/18411], Loss: 0.41121527552604675


 98%|█████████▊| 17976/18411 [2:45:19<03:58,  1.83it/s]

Epoch [1/1], Step [17976/18411], Loss: 0.6657997965812683


 98%|█████████▊| 18001/18411 [2:45:33<03:56,  1.73it/s]

Epoch [1/1], Step [18001/18411], Loss: 0.6297090649604797


 98%|█████████▊| 18026/18411 [2:45:47<03:31,  1.82it/s]

Epoch [1/1], Step [18026/18411], Loss: 0.30980184674263


 98%|█████████▊| 18051/18411 [2:46:01<03:27,  1.73it/s]

Epoch [1/1], Step [18051/18411], Loss: 0.2902379631996155


 98%|█████████▊| 18076/18411 [2:46:15<03:10,  1.76it/s]

Epoch [1/1], Step [18076/18411], Loss: 0.5450282692909241


 98%|█████████▊| 18101/18411 [2:46:29<02:58,  1.73it/s]

Epoch [1/1], Step [18101/18411], Loss: 0.3808218538761139


 98%|█████████▊| 18126/18411 [2:46:43<02:42,  1.75it/s]

Epoch [1/1], Step [18126/18411], Loss: 0.45109111070632935


 99%|█████████▊| 18151/18411 [2:46:57<02:27,  1.77it/s]

Epoch [1/1], Step [18151/18411], Loss: 0.379652738571167


 99%|█████████▊| 18176/18411 [2:47:12<02:17,  1.71it/s]

Epoch [1/1], Step [18176/18411], Loss: 0.468416303396225


 99%|█████████▉| 18201/18411 [2:47:26<02:01,  1.73it/s]

Epoch [1/1], Step [18201/18411], Loss: 0.40785351395606995


 99%|█████████▉| 18226/18411 [2:47:40<01:43,  1.80it/s]

Epoch [1/1], Step [18226/18411], Loss: 0.45526397228240967


 99%|█████████▉| 18251/18411 [2:47:53<01:28,  1.82it/s]

Epoch [1/1], Step [18251/18411], Loss: 0.3623613119125366


 99%|█████████▉| 18276/18411 [2:48:07<01:15,  1.80it/s]

Epoch [1/1], Step [18276/18411], Loss: 0.4161825478076935


 99%|█████████▉| 18301/18411 [2:48:21<01:01,  1.78it/s]

Epoch [1/1], Step [18301/18411], Loss: 0.3143357038497925


100%|█████████▉| 18326/18411 [2:48:35<00:47,  1.80it/s]

Epoch [1/1], Step [18326/18411], Loss: 0.39217978715896606


100%|█████████▉| 18351/18411 [2:48:49<00:33,  1.78it/s]

Epoch [1/1], Step [18351/18411], Loss: 0.3902491331100464


100%|█████████▉| 18376/18411 [2:49:03<00:19,  1.82it/s]

Epoch [1/1], Step [18376/18411], Loss: 0.42874348163604736


100%|█████████▉| 18401/18411 [2:49:16<00:05,  1.82it/s]

Epoch [1/1], Step [18401/18411], Loss: 0.48884114623069763


100%|██████████| 18411/18411 [2:49:22<00:00,  1.81it/s]


In [19]:
torch.save(encoder_decoder.state_dict(), 'model_state2.pth')


In [38]:
# TODO: I have no idea what I'm doing or if this is correct
def translate(to_translate, model, loader):
    model.eval()
    inp = loader.tokenize_texts([to_translate])[0].unsqueeze(0).to(device)
    with torch.no_grad():
        out = model(inp)
    print(out.shape)

    out = torch.argmax(out.squeeze(0), dim=0)
    token_ids = out.tolist()
    translated_texts = loader.tokenizer.decode(token_ids)

    print(f"Translated text: {translated_texts}")


In [54]:
encoder_decoder.eval()
text = ["Wir brauchen einen neuen Datensatz."]
print(f"Translating: {text}")
translate(text, encoder_decoder, wmt_json_loader)


Translating: ['Wir brauchen ein neues Auto.']
torch.Size([1, 384, 128])
Translated text: We weed ao mevantieo to<pad>datte<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
