In [1]:
# Notebook housekeeping: autosave every 300 seconds.
%autosave 300
# Reload edited project modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# NOTE(review): reloading the extension right after loading it looks redundant.
%reload_ext autoreload
# Turn off jedi-based tab completion.
%config Completer.use_jedi = False

Autosaving every 300 seconds


In [2]:
import os

# Switch the working directory to the project root so that the relative paths
# used later in the notebook (e.g. the saved tokenizer files) resolve there.
# NOTE(review): this is a hard-coded absolute path tied to one user's compute
# instance; it must be edited before the notebook can run anywhere else.
os.chdir(
    "/mnt/batch/tasks/shared/LS_root/mounts/clusters/copilot-model-run/code/Users/Soutrik.Chowdhury/Abi_GenAI_Sessions"
)
print(os.getcwd())

/mnt/batch/tasks/shared/LS_root/mounts/clusters/copilot-model-run/code/Users/Soutrik.Chowdhury/Abi_GenAI_Sessions


In [3]:
# model import
from src.encoder_decoder_transformer import build_transformer, casual_mask

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset,random_split
import numpy as np
import matplotlib.pyplot as plt
from torch.nn import functional as F

# HuggingFace libraries
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

# typing
from typing import Any

# Library for progress bars in loops
from tqdm import tqdm

In [4]:
class Config:
    """Hyperparameters and file settings shared by the cells of this notebook.

    All values are class attributes, so they can be read from the class or from
    an instance. Fields marked "not referenced" are not used by any code visible
    in this notebook.
    """

    # Forwarded to build_transformer as `N`
    # (presumably the number of encoder/decoder blocks - TODO confirm in src).
    N = 2
    # Forwarded to build_transformer as `d_model`.
    d_model = 256
    # Forwarded to build_transformer as `d_ff`.
    d_ff = 2048
    # Forwarded to build_transformer as `h`.
    h = 8
    # NOTE(review): not referenced - get_model never forwards a dropout value.
    dropout_rate = 0.1
    # Batch size of the training DataLoader (validation uses batch_size=1).
    batch_size = 32
    # Number of passes over the training DataLoader in train_model.
    num_epochs = 5
    # Learning rate handed to the Adam optimizer.
    lr = 10**-4
    # Fixed length of every encoder/decoder/label tensor; also the decoding cap.
    seq_len = 350
    # Source language: key into pair["translation"] and part of the dataset config name.
    lang_src = "en"
    # Target language: key into pair["translation"] and part of the dataset config name.
    lang_tgt = "it"
    # Not referenced in this notebook (no checkpoint is written).
    model_folder = "weights"
    # Not referenced in this notebook.
    model_basename = "enmodel_"
    # Not referenced in this notebook.
    preload = None
    # Filename template; build_tokenizer formats it with the language code.
    tokenizer_file = "tokenizer_{0}.json"
    # NOTE(review): not referenced - tokenizers are saved relative to the CWD.
    token_folder = "./tokenizers"

In [5]:
# Single settings object passed to the helper functions below.
config = Config()

In [6]:
def get_all_sentences(ds, lang):
    """Lazily yield the `lang` side of every translation pair in `ds`.

    Args:
        ds: Iterable of records shaped like ``{"translation": {lang: text, ...}}``.
        lang: Language key to pull out of each record's "translation" mapping.

    Yields:
        The text stored under ``record["translation"][lang]``, in dataset order.
    """
    yield from (record["translation"][lang] for record in ds)

##### __Tokenization__
- **Tokenization**: Each token in a sentence is mapped to a unique integer ID based on the vocabulary created during the training of the tokenizer.
- **Vocabulary Mapping**: The integer ID represents a specific word in the vocabulary.
- **Special Tokens in Transformers**:
  - **[UNK]**: Identifies unknown words in a sequence.
  - **[PAD]**: Used for padding to ensure all sequences in a batch have the same length; attention masks are used to ignore these tokens during training.
  - **[SOS]**: Signals the Start of Sentence.
  - **[EOS]**: Signals the End of Sentence.

In [7]:
# Defining Tokenizer
def build_tokenizer(config, ds, lang, force_download=True):
    """Train (or load) a word-level tokenizer for one language of the dataset.

    Args:
        config: Object exposing ``tokenizer_file``, a filename template that is
            formatted with ``lang`` to obtain the tokenizer path.
        ds: Iterable of translation records used as the training corpus.
        lang: Language key whose sentences are tokenized.
        force_download: When True (default) the tokenizer is always retrained
            from ``ds`` and the saved file is overwritten. When False a saved
            tokenizer is loaded if its file exists; if it does not exist yet,
            the tokenizer is trained and saved instead of failing.

    Returns:
        The trained or loaded ``Tokenizer``.
    """
    import os  # local import keeps this cell independent of earlier cells

    # File path of the tokenizer for this language
    tokenizer_path = str(config.tokenizer_file.format(lang))

    # Re-use a previously saved tokenizer only when the caller allows it and
    # the file is actually there.
    if not force_download and os.path.exists(tokenizer_path):
        return Tokenizer.from_file(tokenizer_path)

    # New word-level tokenizer; words outside the vocabulary map to [UNK]
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    # Text is split into tokens on whitespace
    tokenizer.pre_tokenizer = Whitespace()

    # Word-level training strategy: reserve the special tokens and keep only
    # words that occur at least twice
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2
    )

    # Train on every sentence of the requested language, then persist the result
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
    tokenizer.save(tokenizer_path)
    return tokenizer

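Sequence layouts (each padded with [PAD] up to `seq_len`): encoder input = [SOS] -> [tokens] -> [EOS] -> [PAD]; decoder input = [SOS] -> [tokens] -> [PAD]; label = [tokens] -> [EOS] -> [PAD]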

In [8]:
class BilingualDataset(Dataset):
    """Map-style dataset that turns translation pairs into fixed-length tensors.

    Every item is tokenized, wrapped in special tokens and padded to
    ``seq_len``:

    * encoder input: ``[SOS] src tokens [EOS] [PAD]...``
    * decoder input: ``[SOS] tgt tokens [PAD]...``
    * label:         ``tgt tokens [EOS] [PAD]...``

    NOTE(review): the special-token ids are looked up in the *target*
    tokenizer and reused for the encoder input, so both tokenizers are assumed
    to assign identical ids to [SOS]/[EOS]/[PAD].
    """

    def __init__(
        self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len
    ) -> None:
        """Store the raw pairs, the tokenizers and the special-token tensors.

        Args:
            ds: Indexable collection of ``{"translation": {lang: text}}`` records.
            tokenizer_src: Tokenizer for the source language.
            tokenizer_tgt: Tokenizer for the target language.
            src_lang: Key of the source text inside ``"translation"``.
            tgt_lang: Key of the target text inside ``"translation"``.
            seq_len: Length every returned sequence tensor is padded to.
        """
        super().__init__()

        self.seq_len = seq_len
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Special tokens as one-element int64 tensors (target tokenizer ids)
        self.sos_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64
        )
        self.eos_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64
        )
        self.pad_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64
        )
        # Plain-int copy of the padding id, used to build padding runs cheaply
        self._pad_id = int(self.pad_token.item())

    # Total number of translation pairs in the dataset
    def __len__(self):
        return len(self.ds)

    def _padding(self, count):
        """Return a 1-D int64 tensor holding ``count`` padding ids."""
        # torch.full allocates the run in one call instead of converting a
        # Python list of one-element tensors item by item.
        return torch.full((count,), self._pad_id, dtype=torch.int64)

    # Using the index to retrieve source and target texts
    def __getitem__(self, index: Any) -> Any:
        """Build the model inputs, masks and label for the pair at ``index``.

        Returns:
            Dict with ``encoder_input``, ``decoder_input`` and ``label`` (each
            of length ``seq_len``), ``encoder_mask`` of shape
            ``(1, 1, seq_len)``, ``decoder_mask`` (padding mask combined with
            ``casual_mask``) and the raw ``src_text`` / ``tgt_text`` strings.

        Raises:
            ValueError: If either sentence does not fit into ``seq_len`` once
                its special tokens are added.
        """
        src_target_pair = self.ds[index]
        src_text = src_target_pair["translation"][self.src_lang]
        tgt_text = src_target_pair["translation"][self.tgt_lang]

        # Tokenizing source and target texts
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # Padding needed on each side. The encoder sequence carries both [SOS]
        # and [EOS]; decoder input and label each carry only one special token.
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        # A negative count means the sentence cannot fit into `seq_len`
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")

        src_ids = torch.tensor(enc_input_tokens, dtype=torch.int64)
        tgt_ids = torch.tensor(dec_input_tokens, dtype=torch.int64)
        dec_padding = self._padding(dec_num_padding_tokens)

        # [SOS] + source tokens + [EOS] + padding
        encoder_input = torch.cat(
            [
                self.sos_token,
                src_ids,
                self.eos_token,
                self._padding(enc_num_padding_tokens),
            ]
        )

        # [SOS] + target tokens + padding
        decoder_input = torch.cat([self.sos_token, tgt_ids, dec_padding])

        # Target tokens + [EOS] + padding: the decoder input shifted left by one
        label = torch.cat([tgt_ids, self.eos_token, dec_padding])

        # Ensuring that the length of each tensor above is equal to 'seq_len'
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,
            "decoder_input": decoder_input,
            # 1 where the position holds a real token, 0 where it is padding
            "encoder_mask": (encoder_input != self.pad_token)
            .unsqueeze(0)
            .unsqueeze(0)
            .int(),
            # Padding mask combined with the mask returned by casual_mask
            "decoder_mask": (decoder_input != self.pad_token)
            .unsqueeze(0)
            .unsqueeze(0)
            .int()
            & casual_mask(decoder_input.size(0)),
            "label": label,
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

##### __DATASET PREPARATION__

In [9]:
# Load the "train" split of the `opus_books` dataset for the language pair
# named in the config (dataset config name "<lang_src>-<lang_tgt>").
ds_raw = load_dataset(
    "opus_books",
    f"{config.lang_src}-{config.lang_tgt}",
    split="train",
    
)

In [10]:
# Building the tokenizers for the source and target languages.
# NOTE: `force_download` is left at its default (True), so both tokenizers are
# retrained from `ds_raw` and re-saved every time this cell runs.
tokenizer_src = build_tokenizer(config, ds_raw, config.lang_src)
tokenizer_tgt = build_tokenizer(config, ds_raw, config.lang_tgt)

In [11]:
# Splitting the dataset for training and validation.
# Only 30% of the pairs are used for training; the remaining 70% form the
# validation set (see the sizes printed in the next cell).
train_ds_size = int(0.3 * len(ds_raw))  # 30% for training
val_ds_size = len(ds_raw) - train_ds_size  # remaining 70% for validation
# A seeded generator makes the random split reproducible across kernel restarts
train_ds_raw, val_ds_raw = random_split(
    ds_raw,
    [train_ds_size, val_ds_size],
    generator=torch.Generator().manual_seed(42),
)

In [12]:
# Sanity check: number of pairs in the training and validation subsets.
len(train_ds_raw),len(val_ds_raw)

(9699, 22633)

In [13]:
# Wrapping both raw splits in the BilingualDataset class defined above, so that
# every item comes out as padded tensors plus attention masks.
train_ds = BilingualDataset(
    train_ds_raw,
    tokenizer_src,
    tokenizer_tgt,
    config.lang_src,
    config.lang_tgt,
    config.seq_len,
)
val_ds = BilingualDataset(
    val_ds_raw,
    tokenizer_src,
    tokenizer_tgt,
    config.lang_src,
    config.lang_tgt,
    config.seq_len,
)

In [14]:
# Fetch the first processed training example to inspect its structure.
trainds_sample_op = next(iter(train_ds))

In [15]:
# Fields returned by BilingualDataset.__getitem__.
trainds_sample_op.keys()

dict_keys(['encoder_input', 'decoder_input', 'encoder_mask', 'decoder_mask', 'label', 'src_text', 'tgt_text'])

In [16]:
# Dataloaders are used to iterate over the dataset in batches during training and validation.
# The validation loader uses batch_size=1 because run_validation asserts that
# every batch holds exactly one example.
train_dataloader = DataLoader(
    train_ds, batch_size=config.batch_size, shuffle=True
) 
val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

In [17]:
# Print the tensor shapes and raw texts of the first training batch only;
# the `break` stops the loop after one iteration.
for data in train_dataloader:
    print("Encoder input shape:", data["encoder_input"].shape)
    print("Decoder input shape:", data["decoder_input"].shape)
    print("Encoder mask shape:", data["encoder_mask"].shape)
    print("Decoder mask shape:", data["decoder_mask"].shape)
    print("Label shape:", data["label"].shape)
    print("Source text:", data["src_text"])
    print("Target text:", data["tgt_text"])
    break

Encoder input shape: torch.Size([32, 350])
Decoder input shape: torch.Size([32, 350])
Encoder mask shape: torch.Size([32, 1, 1, 350])
Decoder mask shape: torch.Size([32, 1, 350, 350])
Label shape: torch.Size([32, 350])
Source text: ['The enigma then was explained: this affable and kind little widow was no great dame; but a dependant like myself.', "Yashvin's face wore the expression it had when he was losing at cards.", '"Is she going by herself?" asked the porter\'s wife.', 'While still in the hall he heard her retreating footsteps, and knew that she had been waiting and listening for him, but had now gone back to the drawing-room.', 'The clock struck ten.', '"Mr. Brocklehurst, I believe I intimated in the letter which I wrote to you three weeks ago, that this little girl has not quite the character and disposition I could wish: should you admit her into Lowood school, I should be glad if the superintendent and teachers were requested to keep a strict eye on her, and, above all, to gu

In [18]:
def get_model(config, vocab_src_len, vocab_tgt_len):
    """Create the transformer for the given vocabulary sizes.

    Args:
        config: Settings object providing ``seq_len``, ``d_model``, ``N``,
            ``d_ff`` and ``h``.
        vocab_src_len: Size of the source-language vocabulary.
        vocab_tgt_len: Size of the target-language vocabulary.

    Returns:
        Whatever ``build_transformer`` returns for these settings.
    """
    # The same sequence length is used on the source and the target side
    transformer_kwargs = {
        "src_vocab_size": vocab_src_len,
        "tgt_vocab_size": vocab_tgt_len,
        "src_seq_len": config.seq_len,
        "tgt_seq_len": config.seq_len,
        "d_model": config.d_model,
        "N": config.N,
        "d_ff": config.d_ff,
        "h": config.h,
    }
    return build_transformer(**transformer_kwargs)

In [19]:
# Define function to obtain the most probable next token
def greedy_decode(
    model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device
):
    """Generate a target sequence by always picking the most likely next token.

    Args:
        model: Object exposing ``encode``, ``decode`` and ``project``.
        source: Tensor with the source sequence (batch size 1).
        source_mask: Mask for the source sequence.
        tokenizer_src: Source language tokenizer (unused; kept so the
            signature stays stable for callers).
        tokenizer_tgt: Target language tokenizer; supplies the [SOS]/[EOS] ids.
        max_len: Maximum length of the output sequence, [SOS] included.
        device: Device to run the computations on.

    Returns:
        1-D tensor of generated token ids. It starts with [SOS] and ends with
        [EOS] unless ``max_len`` was reached first.
    """
    # Ids of the start and end markers in the target vocabulary
    sos_idx = tokenizer_tgt.token_to_id("[SOS]")
    eos_idx = tokenizer_tgt.token_to_id("[EOS]")

    # The encoder runs once; its output is reused at every decoding step
    encoder_output = model.encode(source, source_mask)
    # The decoder input starts as a single [SOS] token with the source's dtype
    decoder_input = torch.full((1, 1), sos_idx, dtype=source.dtype, device=device)

    # `<` rather than `==` so that a max_len below 1 also terminates
    while decoder_input.size(1) < max_len:
        # Mask sized to the tokens generated so far
        decoder_mask = (
            casual_mask(decoder_input.size(1)).type_as(source_mask).to(device)
        )

        # Decoder output for the current prefix
        out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

        # Project only the last position to get the scores for the next token
        prob = model.project(out[:, -1])

        # Highest-scoring token; .item() is read once and reused below
        _, next_word = torch.max(prob, dim=1)
        next_id = next_word.item()
        decoder_input = torch.cat(
            [
                decoder_input,
                torch.full((1, 1), next_id, dtype=source.dtype, device=device),
            ],
            dim=1,
        )

        # Stop as soon as the model emits the End of Sentence token
        if next_id == eos_idx:
            break

    return decoder_input.squeeze(0)  # Sequence of tokens generated by the decoder

In [20]:
def run_validation(
    model,
    validation_ds,
    tokenizer_src,
    tokenizer_tgt,
    max_len,
    device,
    print_msg,
    num_examples=2,
):
    """Decode a few validation examples and report them next to the references.

    Args:
        model: Transformer model
        validation_ds: Iterable of validation batches (batch size must be 1)
        tokenizer_src: Source language tokenizer
        tokenizer_tgt: Target language tokenizer
        max_len: Maximum length of the output sequence
        device: Device to run the computations
        print_msg: Callable that receives each report line
        num_examples: Number of examples after which the loop stops
    """
    model.eval()  # Evaluation mode for the whole pass

    # Gradients are not needed while decoding
    with torch.no_grad():
        # `seen` counts the batches processed so far, starting at 1
        for seen, batch in enumerate(validation_ds, start=1):
            src_ids = batch["encoder_input"].to(device)
            src_mask = batch["encoder_mask"].to(device)

            # Greedy decoding below handles exactly one sentence at a time
            assert src_ids.size(0) == 1, "Batch size must be 1 for validation."

            # Token ids predicted for this source sentence
            predicted_ids = greedy_decode(
                model,
                src_ids,
                src_mask,
                tokenizer_src,
                tokenizer_tgt,
                max_len,
                device,
            )

            # Raw texts from the batch and the decoded prediction
            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]  # Reference translation
            model_out_text = tokenizer_tgt.decode(
                predicted_ids.detach().cpu().numpy()
            )

            print_msg(f"SOURCE: {source_text}")
            print_msg(f"TARGET: {target_text}")
            print_msg(f"PREDICTED: {model_out_text}")

            # Stop once the requested number of examples has been reported
            if seen == num_examples:
                break

In [21]:
def train_model(config, train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt):
    """Train the Transformer and print sample translations after every epoch.

    Args:
        config: Settings object providing ``lr``, ``num_epochs``, ``seq_len``
            and the fields read by ``get_model``.
        train_dataloader: Training dataloader
        val_dataloader: Validation dataloader (batch size 1)
        tokenizer_src: Source language tokenizer
        tokenizer_tgt: Target language tokenizer

    Returns:
        The trained model, so the caller can keep using or saving it.
    """

    # Use the GPU when one is available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device {device}")

    # The projection layer emits one score per target-vocabulary entry
    tgt_vocab_size = tokenizer_tgt.get_vocab_size()

    # Building the model with 'get_model' and moving it to the chosen device
    model = get_model(config, tokenizer_src.get_vocab_size(), tgt_vocab_size).to(
        device
    )

    # Adam optimizer with the learning rate from the config plus an epsilon value
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, eps=1e-9)

    # First epoch of the loop below
    initial_epoch = 0

    # Cross-entropy with label smoothing. Padding positions are ignored; the
    # labels are target-language ids, so the padding id must come from the
    # target tokenizer.
    loss_fn = nn.CrossEntropyLoss(
        ignore_index=tokenizer_tgt.token_to_id("[PAD]"), label_smoothing=0.1
    ).to(device)

    # Iterating over each epoch from 'initial_epoch' up to the configured number
    for epoch in range(initial_epoch, config.num_epochs):
        # CUDA housekeeping only makes sense (and only works) on a CUDA device
        if device.type == "cuda":
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        print(epoch)
        model.train()
        # tqdm wraps the dataloader to display a progress bar
        batch_iterator = tqdm(train_dataloader, desc=f"Processing epoch {epoch:02d}")

        for batch in batch_iterator:

            # Moving inputs, masks and labels onto the device
            encoder_input = batch["encoder_input"].to(device)
            decoder_input = batch["decoder_input"].to(device)
            encoder_mask = batch["encoder_mask"].to(device)
            decoder_mask = batch["decoder_mask"].to(device)
            label = batch["label"].to(device)

            # Forward pass: encoder -> decoder -> projection onto the vocabulary
            encoder_output = model.encode(encoder_input, encoder_mask)
            decoder_output = model.decode(
                encoder_output, encoder_mask, decoder_input, decoder_mask
            )
            proj_output = model.project(decoder_output)

            # Flatten to (tokens, vocab) vs (tokens,) for the loss
            loss = loss_fn(proj_output.view(-1, tgt_vocab_size), label.view(-1))

            # Showing the current loss in the progress bar
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Backpropagation, parameter update, then gradient reset
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Sample translations at the end of each epoch, written through the
        # progress bar so the output does not break its rendering
        run_validation(
            model,
            val_dataloader,
            tokenizer_src,
            tokenizer_tgt,
            config.seq_len,
            device,
            batch_iterator.write,
        )

    return model

In [22]:
# Run the full training loop; sample translations are printed after each epoch.
train_model(config, train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)

Using device cuda
0


Processing epoch 00: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 304/304 [22:41<00:00,  4.48s/it, loss=6.810]


SOURCE: "Not to advertise: and to trust this quest of a situation to me. I'll find you one in time."
TARGET: — Di non fare annunzi nei giornali, di affidare a me l'incarico di trovarvi una situazione; a suo tempo ve ne procurerò una.
PREDICTED: — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — —
SOURCE: All these last days Dolly had been alone with her children.
TARGET: Tutti quei giorni Dolly era stata sola coi bambini.
PREDICTED: — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — —
1


Processing epoch 01: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 304/304 [22:40<00:00,  4.48s/it, loss=6.323]


SOURCE: At that hour most of the others were sewing likewise; but one class still stood round Miss Scatcherd's chair reading, and as all was quiet, the subject of their lessons could be heard, together with the manner in which each girl acquitted herself, and the animadversions or commendations of Miss Scatcherd on the performance.
TARGET: Quasi tutte cucivano in quell'ora, eccetto alcune alunne che leggevano a voce alta attorno alla sedia della signorina Scatcherd.
PREDICTED: — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — 

Processing epoch 02: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 304/304 [22:56<00:00,  4.53s/it, loss=6.443]


SOURCE: He has again and again explained that it is not himself, but his office he wishes to mate. He has told me I am formed for labour--not for love: which is true, no doubt.
TARGET: Mi ha ripetuto che non era per sé che prendeva moglie, ma per adempiere la sua missione, che ero adattata al lavoro e non all'amore.
PREDICTED: — E non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non

Processing epoch 03: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 304/304 [23:22<00:00,  4.61s/it, loss=6.120]


SOURCE: The hostess was a short, fair, round-faced woman, beaming with smiles and dimples.
TARGET: La padrona era una donna dal viso tondo, bionda e non alta, tutta splendente di fossette e sorrisi.
PREDICTED: Il suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo ’ era .
SOURCE: She is a splendid woman.
TARGET: È una carissima donna.
PREDICTED: Ma non era un ’ era .
4


Processing epoch 04: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 304/304 [22:32<00:00,  4.45s/it, loss=6.097]


SOURCE: "While something in me," he went on, "is acutely sensible to her charms, something else is as deeply impressed with her defects: they are such that she could sympathise in nothing I aspired to--co-operate in nothing I undertook.
TARGET: — Se vi è qualcosa in me, — riprese, — che subisce fascino della sua attrattiva, qualcos'altro invece è urtato dai suoi difetti; ella non capirebbe aspirazioni, non potrebbe aiutarmi nelle mie imprese,
PREDICTED: — Non è un ’ è un ’ è un ’ è un ’ è di , — disse che non è un ’ è un ’ è di .
SOURCE: A Russian nursemaid was feeding the child and evidently herself eating also.
TARGET: Dava da mangiare alla bambina, e visibilmente mangiava lei stessa insieme alla piccola, una ragazza russa che faceva il servizio nella camera della bambina.
PREDICTED: Il suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo suo 

Reference:
* https://medium.com/ai-in-plain-english/building-and-training-a-transformer-from-scratch-fdbf3db00df4