model.py


In [1]:
from torch.nn.modules.activation import MultiheadAttention
import torch
import torch.nn as nn
import math


## Coding our own layer normalisation, as the built-in one doesn't allow bias = False
class LayerNormalization(nn.Module):
    """Normalise the last dimension of the input, then apply a scalar gain and shift.

    The gain (``alpha``) and shift (``bias``) are each a single learnable value
    shared by every feature, not one value per feature.

    Args:
        eps: Small constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, eps:float = 10**-6)-> None:
        super().__init__()
        self.eps = eps
        # One-element learnable tensors: multiplicative gain and additive shift.
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` computed over the last axis."""
        # keepdim=True keeps a trailing axis of size 1 so the statistics broadcast against x.
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias


## FEED FORWARD NETWORK - using squeeze and expand method
class FeedForwardBlock(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Size of the input and output feature dimension.
        d_ff: Size of the hidden feature dimension.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model : int, d_ff : int, dropout:float)-> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)   # first projection (weights + bias)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)   # projection back to d_model

    def forward(self, x):
        """Map ``[..., d_model]`` to ``[..., d_ff]`` and back to ``[..., d_model]``."""
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

## for converting inputs to dimensional embedding prepared to go in encoder or decoder.
class InputEmbeddings(nn.Module):
    """Token-id to vector lookup, scaled by ``sqrt(d_model)``.

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of rows in the embedding table.
    """

    def __init__(self, d_model:int, vocab_size:int)-> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Look up ids ``[batch, seq]`` and return ``[batch, seq, d_model]`` scaled vectors."""
        # The sqrt(d_model) factor follows the scaling used in the Transformer paper.
        scale = math.sqrt(self.d_model)
        return self.embedding(x) * scale

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a sequence of embeddings.

    A ``(1, seq_len, d_model)`` table is built once in the constructor and stored
    as the non-trainable buffer ``pe``.

    Args:
        d_model: Embedding size (the sine/cosine interleaving below needs it to be even).
        seq_len: Maximum sequence length the table covers.
        dropout: Dropout probability applied after the encoding is added.
    """

    def __init__(self, d_model:int, seq_len:int, dropout:float )-> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Column vector of positions: (seq_len, 1).
        positions = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # One inverse frequency per sin/cos pair, computed in log space: 10000^(-2i/d_model).
        inv_freq = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq

        table = torch.zeros(seq_len, d_model)
        table[:, 0::2] = torch.sin(angles)   # even feature indices
        table[:, 1::2] = torch.cos(angles)   # odd feature indices

        # Leading batch axis for broadcasting; a buffer is saved in the state dict
        # but is not returned by parameters().
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):
        """Add the first ``x.shape[1]`` rows of the table to ``x`` and apply dropout."""
        length = x.shape[1]
        encoding = self.pe[:, :length, :].requires_grad_(False)
        return self.dropout(x + encoding)


class ResidualConnection(nn.Module):
    """Pre-norm skip connection: ``x + dropout(sublayer(norm(x)))``.

    Args:
        dropout: Dropout probability applied to the sublayer output before it is added back.
    """

    def __init__(self, dropout:float)-> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        """Normalise ``x``, run ``sublayer`` on it, then add the result back onto ``x``.

        ``sublayer`` is any callable taking the normalised tensor and returning a
        tensor that can be added to ``x``.
        """
        normalised = self.norm(x)
        branch = self.dropout(sublayer(normalised))
        return x + branch


## MULTI HEAD ATTENTION part which we can use for BOTH ENCODER and DECODER
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention, usable for self- and cross-attention.

    Args:
        d_model: Embedding size; must be divisible by ``h``.
        h: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model:int, h:int, dropout:float)-> None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h  # features seen by each head
        # Bias-free projections for queries, keys, values and the output.
        self.w_q = nn.Linear(d_model, d_model, bias=False)
        self.w_k = nn.Linear(d_model, d_model, bias=False)
        self.w_v = nn.Linear(d_model, d_model, bias=False)
        self.w_o = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout:nn.Dropout):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

        Positions where ``mask == 0`` receive a score of -1e4 before the softmax.
        ``mask`` and ``dropout`` may each be ``None`` to skip that step.

        Returns:
            ``(output, weights)`` where ``weights`` are the post-softmax (and
            post-dropout) attention weights, kept for visualisation.
        """
        head_dim = query.shape[-1]
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(head_dim)
        if mask is not None:
            # Large negative score -> (near) zero weight after the softmax.
            scores.masked_fill_(mask == 0, -1e4)
        weights = scores.softmax(dim=-1)
        if dropout is not None:
            weights = dropout(weights)
        return weights @ value, weights

    def _split_heads(self, projected):
        """Reshape ``[batch, seq, d_model]`` into ``[batch, h, seq, d_k]``."""
        batch, length = projected.shape[0], projected.shape[1]
        return projected.view(batch, length, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k , v , mask):
        """Project the inputs, attend per head, merge the heads, apply the output projection.

        The attention weights of the latest call are kept on ``self.attention_scores``.
        """
        query = self._split_heads(self.w_q(q))
        key = self._split_heads(self.w_k(k))
        value = self._split_heads(self.w_v(v))

        x, self.attention_scores = MultiHeadAttentionBlock.attention(
            query, key, value, mask, self.dropout
        )

        # [batch, h, seq, d_k] -> [batch, seq, h, d_k] -> [batch, seq, d_model]
        batch = x.shape[0]
        merged = x.transpose(1, 2).contiguous().view(batch, -1, self.h * self.d_k)
        return self.w_o(merged)

## a single encoder block
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual connection.

    Args:
        self_attention_block: Attention module called with the same tensor as query, key and value.
        feed_forward_block: Feed-forward module applied after attention.
        dropout: Dropout probability used by both residual connections.
    """

    def __init__(self, self_attention_block:MultiHeadAttentionBlock, feed_forward_block:FeedForwardBlock , dropout:float)-> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0 wraps the attention sublayer, index 1 the feed-forward sublayer.
        self.residual_connection = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        """Run both sublayers on ``x``; ``src_mask`` is handed to the attention call."""
        attention_skip, feed_forward_skip = self.residual_connection

        def self_attend(normed):
            # In the encoder, query, key and value are all the same tensor.
            return self.self_attention_block(normed, normed, normed, src_mask)

        x = attention_skip(x, self_attend)
        return feed_forward_skip(x, self.feed_forward_block)

## Actual encoder
class Encoder(nn.Module):
    """Applies a list of encoder layers in order, then a final layer normalisation.

    Args:
        layers: Modules called as ``layer(x, mask)``.
    """

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        """Feed ``x`` through every layer with the same ``mask`` and normalise the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)


class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, feed-forward.

    Each sublayer is wrapped in its own residual connection.

    Args:
        self_attention_block: Attention over the decoder's own sequence.
        cross_attention_block: Attention whose keys and values are the encoder output.
        feed_forward_block: Feed-forward module applied last.
        dropout: Dropout probability used by the three residual connections.
    """

    def __init__(self, self_attention_block:MultiHeadAttentionBlock, cross_attention_block : MultiHeadAttentionBlock, feed_forward_block:FeedForwardBlock, dropout:float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0: self-attention, 1: cross-attention, 2: feed-forward.
        self.residual_connection = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Run the three sublayers in order and return the updated decoder states."""
        self_skip, cross_skip, feed_forward_skip = self.residual_connection

        def self_attend(normed):
            # The encoder output is not used here; only the target mask applies.
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def cross_attend(normed):
            # Query comes from the decoder; keys and values come from the encoder.
            return self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)

        x = self_skip(x, self_attend)
        x = cross_skip(x, cross_attend)
        return feed_forward_skip(x, self.feed_forward_block)

class Decoder(nn.Module):
    """Applies a list of decoder layers in order, then a final layer normalisation.

    Args:
        layers: Modules called as ``layer(x, encoder_output, src_mask, tgt_mask)``.
    """

    def __init__(self, layers : nn.ModuleList)-> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed ``x`` through every layer with the same encoder output and masks."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)


## Projects the final embedding into vocabulary space, i.e. scores which word is most likely to come next
class ProjectionLayer(nn.Module):
    """Linear map from model features to log-probabilities over the vocabulary.

    Args:
        d_model: Size of the incoming feature dimension.
        vocab_size: Number of output classes (vocabulary entries).
    """

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> torch.Tensor:
        """Return log-softmax scores over the last axis.

        Maps ``[..., d_model]`` to ``[..., vocab_size]``. The return annotation
        used to be ``None`` even though a tensor is returned.
        """
        return torch.log_softmax(self.proj(x), dim = -1)


class Transformer(nn.Module):
    """Encoder-decoder model assembled from pre-built components.

    The three stages are exposed separately (``encode``, ``decode``, ``project``)
    so that inference can encode the source once and then decode repeatedly.

    Args:
        encoder: Stack of encoder layers.
        decoder: Stack of decoder layers.
        src_embed: Embedding for source token ids.
        tgt_embed: Embedding for target token ids.
        src_pos: Positional encoding for the source sequence.
        tgt_pos: Positional encoding for the target sequence.
        projection_layer: Maps decoder features to vocabulary log-probabilities.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed : InputEmbeddings, tgt_embed : InputEmbeddings, src_pos : PositionalEncoding, tgt_pos : PositionalEncoding, projection_layer : ProjectionLayer  )-> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed the source ids, add positions and run the encoder.

        Returns a ``[batch, seq_length, d_model]`` tensor.
        """
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)

    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor )-> torch.Tensor:
        """Embed the target ids, add positions and run the decoder against ``encoder_output``.

        Returns a ``[batch, seq_length, d_model]`` tensor. The return annotation
        used to be ``None`` even though the decoder output is returned.
        """
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Map decoder features to ``[batch, seq_length, vocab_size]`` log-probabilities."""
        return self.projection_layer(x)

def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_length: int, tgt_seq_length: int, d_model: int = 512, N:int=6, h:int=8, dropout:float = 0.1, d_ff:int=256):
    """Assemble a Transformer whose encoder and decoder stacks share weights in a mirrored order.

    Only ``N // 2`` distinct encoder blocks and ``N // 2`` distinct decoder blocks
    are created. Each stack runs them forwards and then backwards
    (``b1, b2, ..., bk, bk, ..., b2, b1``), so every block is applied twice. For
    the default ``N = 6`` this gives six layers from three sets of weights.

    The previous implementation unpacked the blocks into exactly three names and
    therefore failed for any ``N`` outside 6-7; the mirrored list is now built
    for every ``N >= 2``.

    Args:
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary.
        src_seq_length: Maximum source length covered by the positional table.
        tgt_seq_length: Maximum target length covered by the positional table.
        d_model: Embedding size.
        N: Nominal number of layers per stack; the effective depth is ``2 * (N // 2)``.
        h: Number of attention heads.
        dropout: Dropout probability used throughout.
        d_ff: Hidden size of the feed-forward blocks.

    Returns:
        The initialised ``Transformer``.

    Raises:
        ValueError: If ``N < 2``, i.e. no block would be created.
    """
    num_unique_blocks = N // 2
    if num_unique_blocks < 1:
        raise ValueError(f"N must be at least 2 to build a mirrored stack, got {N}")

    # create embedding layers
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    # create positional encoding layers
    src_pos = PositionalEncoding(d_model, src_seq_length, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_length, dropout)

    # create the distinct encoder blocks
    encoder_blocks = []
    for _ in range(num_unique_blocks):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_blocks.append(EncoderBlock(encoder_self_attention_block, feed_forward_block, dropout))

    # create the distinct decoder blocks
    decoder_blocks = []
    for _ in range(num_unique_blocks):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_blocks.append(
            DecoderBlock(decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
        )

    # Mirror each stack so every block object (and its weights) appears twice.
    encoder = Encoder(nn.ModuleList(encoder_blocks + encoder_blocks[::-1]))
    decoder = Decoder(nn.ModuleList(decoder_blocks + decoder_blocks[::-1]))

    # create the projection layer
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    # create the transformer
    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Re-initialise every tensor with more than one dimension from N(0, 0.02^2);
    # one-dimensional parameters such as biases and norm scalars keep their defaults.
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.normal_(p, std = 0.02)

    n_param = sum(p.numel() for p in transformer.parameters())
    print("Total Parameters:", n_param)

    return transformer

config.py


In [2]:
from pathlib import Path

def get_config():
    """Return a fresh dict holding the default training configuration.

    A new dict is built on every call, so callers may modify the result freely.
    """
    config = dict(
        batch_size=2048,
        num_epochs=20,
        lr=10**-4,
        seq_len=130,
        d_model=512,
        lang_src="en",
        lang_tgt="fr",
        model_folder="weights",
        model_basename="tmodel_",
        preload=True,
        tokenizer_file="tokenizer_{0}.json",  # {0} is filled with a language code
        experiment_name="runs/tmodel",
        scheduler=None,
    )
    return config

def get_weights_file_path(config, epoch: str):
    """Build the checkpoint path ``<model_folder>/<model_basename><epoch>.pt``.

    Args:
        config: Mapping with the keys ``"model_folder"`` and ``"model_basename"``.
        epoch: Epoch label appended to the base name.

    Returns:
        The path as a string, relative to the current directory.
    """
    folder = config["model_folder"]
    filename = f"{config['model_basename']}{epoch}.pt"
    return str(Path('.').joinpath(folder, filename))

dataset.py


In [3]:
import torch
import torch.nn
from torch.utils.data import Dataset

## convert from one language to another
class BillingualDataset(Dataset):
    """Translation dataset yielding un-padded token tensors, masks and raw text.

    Each item of ``ds`` is read as ``item['translation'][lang]``. No padding is
    added here, so sequences come back at their natural length.

    Args:
        ds: Indexable collection of translation pairs.
        tokenizer_src: Tokenizer for the source language.
        tokenizer_tgt: Tokenizer for the target language; also supplies the
            [SOS], [EOS] and [PAD] ids used for both languages.
        src_lang: Key of the source language inside ``'translation'``.
        tgt_lang: Key of the target language inside ``'translation'``.
        seq_len: Maximum allowed length once the special tokens are counted.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
        self.seq_len = seq_len
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        def special(token):
            # One-element int64 tensor so it can be concatenated with token tensors.
            return torch.tensor([tokenizer_tgt.token_to_id(token)], dtype=torch.int64)

        self.sos_token = special("[SOS]")
        self.eos_token = special("[EOS]")
        self.pad_token = special("[PAD]")

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        """Tokenise pair ``idx`` and build encoder input, decoder input, label and masks.

        ``decoder_input`` is ``[SOS] + target`` and ``label`` is ``target + [EOS]``,
        so position i of the label is the token that follows position i of the
        decoder input. That shift allows all positions to be trained in parallel.

        Raises:
            ValueError: If either sentence does not fit in ``seq_len`` once its
                special tokens are counted.
        """
        pair = self.ds[idx]['translation']
        src_text = pair[self.src_lang]
        tgt_text = pair[self.tgt_lang]

        src_ids = self.tokenizer_src.encode(src_text).ids
        tgt_ids = self.tokenizer_tgt.encode(tgt_text).ids

        # The encoder side carries [SOS] and [EOS]; the decoder side one special token.
        too_long_src = len(src_ids) + 2 > self.seq_len
        too_long_tgt = len(tgt_ids) + 1 > self.seq_len
        if too_long_src or too_long_tgt:
            raise ValueError("Sentence too long")

        src_tensor = torch.tensor(src_ids, dtype=torch.int64)
        tgt_tensor = torch.tensor(tgt_ids, dtype=torch.int64)

        encoder_input = torch.cat((self.sos_token, src_tensor, self.eos_token), dim=0)
        decoder_input = torch.cat((self.sos_token, tgt_tensor), dim=0)
        label = torch.cat((tgt_tensor, self.eos_token), dim=0)

        # 1 where the token is not padding, 0 where it is.
        encoder_keep = (encoder_input != self.pad_token)[None, None, :].int()   # (1, 1, len)
        decoder_keep = (decoder_input != self.pad_token).unsqueeze(0).int()     # (1, len)
        # AND-ing with the causal mask additionally hides future positions,
        # giving (1, len, len). Example for 3 real tokens followed by 1 pad:
        #   1 0 0 0
        #   1 1 0 0
        #   1 1 1 0
        #   1 1 1 0
        decoder_mask = decoder_keep & causal_mask(decoder_input.size(0))

        return {
            "encoder_input" : encoder_input,
            "decoder_input" : decoder_input,
            "encoder_str_length" : len(src_ids),
            "decoder_str_length" : len(tgt_ids),
            "encoder_mask" : encoder_keep,
            "decoder_mask" : decoder_mask,
            "label" : label,
            "src_text" : src_text,
            "tgt_text" : tgt_text,
        }

def causal_mask(size):
    """Return a ``(1, size, size)`` boolean mask that is True on and below the diagonal.

    Entry ``[0, i, j]`` is True when ``j <= i``, so a position may attend to
    itself and to earlier positions but not to later ones.
    """
    lower_triangle = torch.tril(torch.ones((1, size, size)))
    return lower_triangle != 0


train.py


In [4]:
# from model import build_transformer
# from dataset import BillingualDataset, causal_mask
# from config import get_config, get_weights_file_path


!pip install torchtext
!pip3 install datasets
!pip3 install tokenizers

from torchtext import datasets as datasets
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import LambdaLR

import warnings
from tqdm import tqdm
import os
from pathlib import Path


# hugging face datasets and tokenizer
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

!pip3 install torchmetrics
import torchmetrics

from torch.utils.tensorboard import SummaryWriter

## Greedy decoding: the encoder output is computed once and reused for every generated token
def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Generate a target sequence by always taking the highest-scoring next token.

    Decoding starts from [SOS] and stops once [EOS] has been produced or the
    sequence holds ``max_len`` tokens. The ``torch.empty(1, 1)`` buffers mean
    exactly one sequence is decoded per call.

    Args:
        model: Object exposing ``encode``, ``decode`` and ``project``.
        source: Source token ids; also fixes the dtype of the generated ids.
        source_mask: Mask passed to the encoder and to cross-attention.
        tokenizer_src: Unused; kept for interface compatibility.
        tokenizer_tgt: Supplies the [SOS] and [EOS] ids.
        max_len: Maximum number of tokens in the result, [SOS] included.
        device: Device on which the decoder input is kept.

    Returns:
        1-D tensor of generated ids, starting with [SOS].
    """
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # The encoder runs once; its output is reused at every decoding step.
    encoder_output = model.encode(source, source_mask)

    decoder_input = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)

    while decoder_input.size(1) != max_len:
        # Causal mask over the tokens generated so far.
        step_mask = causal_mask(decoder_input.size(1)).type_as(source_mask).to(device)

        out = model.decode(encoder_output, source_mask, decoder_input, step_mask)

        # Only the last position is needed to choose the next token.
        prob = model.project(out[:, -1])
        next_word = torch.max(prob, dim=1)[1]

        appended = torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)
        decoder_input = torch.cat([decoder_input, appended], dim=1)

        if next_word == eos_idx:
            break

    return decoder_input.squeeze(0)


def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_step, writer, num_examples= 2):
    """Greedy-decode a few validation samples, print them and log text metrics.

    Args:
        model: Model to evaluate; it is switched to eval mode and left there.
        validation_ds: Iterable of batches of size 1 with the keys
            ``encoder_input``, ``encoder_mask``, ``src_text`` and ``tgt_text``.
        tokenizer_src: Source tokenizer, forwarded to ``greedy_decode``.
        tokenizer_tgt: Target tokenizer, used to decode predicted ids into text.
        max_len: Maximum generated length, forwarded to ``greedy_decode``.
        device: Device the batch tensors are moved to.
        print_msg: Callable used for printing.
        global_step: Step value attached to the logged scalars.
        writer: TensorBoard-style writer; metrics are skipped when it is falsy.
        num_examples: Number of validation samples to decode.

    Raises:
        AssertionError: If a batch holds more than one sample.
    """
    # Local import: the module's import block does not bring shutil into scope.
    import shutil

    model.eval()
    count = 0

    source_texts = []
    expected = []
    predicted = []

    # Separator width. This replaces a shell call to `stty size` guarded by a bare
    # `except:`, which also swallowed KeyboardInterrupt/SystemExit. The fallback
    # keeps the previous default of 80 columns when no terminal is attached.
    console_width = shutil.get_terminal_size(fallback=(80, 24)).columns

    with torch.no_grad():
        for batch in validation_ds:
            count += 1
            encoder_input = batch['encoder_input'].to(device) # [B, seq_len]
            encoder_mask = batch['encoder_mask'].to(device) # [B, 1, 1, seq_len]

            # greedy_decode handles a single sequence at a time
            assert encoder_input.size(0) == 1, "batch size must be 1 for validation"

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)

            # print the source, the reference and the model output
            print_msg('-'* console_width)
            print_msg(f"{f'SOURCE: ':>12}{source_text}")
            print_msg(f"{f'TARGET: ':>12}{target_text}")
            print_msg(f"{f'PREDICTED: ':>12}{model_out_text}")

            if count == num_examples:
                print_msg('-'*console_width)
                break

    if writer:
        # character error rate
        metric = torchmetrics.CharErrorRate()
        cer = metric(predicted, expected)
        writer.add_scalar('validation_cer', cer, global_step)
        writer.flush()

        # word error rate
        metric = torchmetrics.WordErrorRate()
        wer = metric(predicted, expected)
        writer.add_scalar('validation_wer', wer, global_step)
        writer.flush()

        # BLEU score
        metric = torchmetrics.BLEUScore()
        bleu = metric(predicted, expected)
        writer.add_scalar('validation BLEU', bleu, global_step)
        writer.flush()

def get_all_sentences(ds, lang):
    """Lazily yield the ``lang`` sentence of every translation pair in ``ds``."""
    yield from (row['translation'][lang] for row in ds)

def get_or_build_tokenizer(config, ds, lang):
    """Load the word-level tokenizer for ``lang`` from disk, training and saving it first if absent.

    The file name comes from ``config['tokenizer_file']`` formatted with ``lang``.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Setup adapted from https://huggingface.co/docs/tokenizers/quicktour
    tokenizer = Tokenizer(WordLevel(unk_token = "[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    # min_frequency = 2: a word must occur at least twice to enter the vocabulary.
    trainer = WordLevelTrainer(
        special_tokens = ["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
        min_frequency = 2,
    )
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer = trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer
  



In [5]:
def get_ds(config):
    """Load opus_books, build the tokenizers and return train/validation loaders.

    The dataset only ships a train split, so it is divided 90/10 here. Training
    pairs are sorted by source length and filtered by character length; the
    validation split is used unfiltered.

    Returns:
        ``(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)``.
    """
    src_lang = config['lang_src']
    tgt_lang = config['lang_tgt']

    ds_raw = load_dataset('opus_books', f"{src_lang}-{tgt_lang}", split = 'train')
    print("dataset_size" , len(ds_raw))

    tokenizer_src = get_or_build_tokenizer(config, ds_raw, src_lang)
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, tgt_lang)

    # 90% for training, the remainder for validation.
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    def keep(pair):
        # Lengths are counted in characters, not tokens.
        src_chars = len(pair[src_lang])
        tgt_chars = len(pair[tgt_lang])
        return (
            3 < src_chars < 150
            and 3 < tgt_chars < 150
            and src_chars + 10 > tgt_chars
        )

    by_source_length = sorted(train_ds_raw, key = lambda row: len(row["translation"][src_lang]))
    filtered_sorted_train_ds = [row for row in by_source_length if keep(row['translation'])]

    train_ds = BillingualDataset(filtered_sorted_train_ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, config['seq_len'])
    val_ds = BillingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, config['seq_len'])

    def longest_token_counts(rows):
        """Return the largest source and target token counts found in ``rows``."""
        longest_src = 0
        longest_tgt = 0
        for row in rows:
            src_ids = tokenizer_src.encode(row['translation'][src_lang]).ids
            tgt_ids = tokenizer_tgt.encode(row['translation'][tgt_lang]).ids
            longest_src = max(longest_src, len(src_ids))
            longest_tgt = max(longest_tgt, len(tgt_ids))
        return longest_src, longest_tgt

    max_len_src, max_len_tgt = longest_token_counts(ds_raw)
    max_len_src_filtered, max_len_tgt_filtered = longest_token_counts(filtered_sorted_train_ds)

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')

    print(f'Max length of filtered source sentence: {max_len_src_filtered}')
    print(f'Max length of filterd target sentence: {max_len_tgt_filtered}')

    print("length of train dataset" , len(train_ds))
    print("length of validation dataset" , len(val_ds))

    # Training batches are padded per batch by collate_fn; validation uses single items.
    train_dataloader = DataLoader(train_ds, batch_size = config['batch_size'], shuffle = True, collate_fn = collate_fn)
    val_dataloader = DataLoader(val_ds, batch_size = 1, shuffle = True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt



def collate_fn(batch):
    """Pad every sample to the longest sequence in the batch and stack the results.

    Both widths are the longest raw token count in the batch plus 2. The [PAD]
    ids are read from the module-level ``tokenizer_src`` and ``tokenizer_tgt``.

    Args:
        batch: List of dicts as produced by ``BillingualDataset.__getitem__``.

    Returns:
        Dict with stacked ``encoder_input``, ``decoder_input``, ``encoder_mask``,
        ``decoder_mask`` and ``label`` tensors plus ``src_text`` and ``tgt_text`` lists.
    """
    encoder_width = max(sample['encoder_str_length'] for sample in batch) + 2
    decoder_width = max(sample['decoder_str_length'] for sample in batch) + 2

    pad_token_encoder = torch.tensor([tokenizer_src.token_to_id("[PAD]")], dtype = torch.int64)
    pad_token_decoder = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype = torch.int64)

    def right_pad(sequence, width, pad_token):
        # Append as many pad ids as needed to reach `width`.
        filler = torch.tensor([pad_token] * (width - len(sequence)), dtype = torch.int64)
        return torch.cat([sequence, filler], dim = 0)

    encoder_inputs, decoder_inputs, labels = [], [], []
    encoder_masks, decoder_masks = [], []
    src_texts, tgt_texts = [], []

    for sample in batch:
        encoder_input = right_pad(sample['encoder_input'], encoder_width, pad_token_encoder)
        decoder_input = right_pad(sample['decoder_input'], decoder_width, pad_token_decoder)
        label = right_pad(sample['label'], decoder_width, pad_token_decoder)

        # Masks are rebuilt here because the per-item masks cover un-padded sequences.
        encoder_keep = (encoder_input != pad_token_encoder).unsqueeze(0).unsqueeze(0).int()
        decoder_keep = (decoder_input != pad_token_decoder).unsqueeze(0).int()

        encoder_inputs.append(encoder_input)
        decoder_inputs.append(decoder_input)
        encoder_masks.append(encoder_keep)
        decoder_masks.append(decoder_keep & causal_mask(decoder_width))
        labels.append(label)
        src_texts.append(sample["src_text"])
        tgt_texts.append(sample['tgt_text'])

    return {
        "encoder_input": torch.vstack(encoder_inputs),
        "decoder_input": torch.vstack(decoder_inputs),
        "encoder_mask": torch.vstack(encoder_masks),
        "decoder_mask": torch.vstack(decoder_masks),
        "label" : torch.vstack(labels),
        "src_text" : src_texts,
        "tgt_text": tgt_texts
    }


def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build the transformer for the given vocabulary sizes.

    ``config['seq_len']`` is used as the maximum length for both the source and
    the target side, and ``config['d_model']`` as the embedding size.
    """
    seq_len = config['seq_len']
    return build_transformer(
        vocab_src_len,
        vocab_tgt_len,
        seq_len,
        seq_len,
        d_model = config['d_model'],
    )

In [6]:
# from config import get_config

# Start from the default configuration and override a few values for this run.
cfg = get_config()
cfg['batch_size'] = 8
cfg['preload'] = None
cfg['num_epochs'] = 10

# from train import train_model

# NOTE(review): this call only constructs an autocast object and discards it; it is
# never entered with `with`, so on its own it presumably does not enable mixed
# precision. Confirm that the training loop wraps the forward pass in an autocast context.
torch.cuda.amp.autocast(enabled=True)
## in PyTorch Lightning, check if the above is already enabled when precision is set to FP16

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: ", device)

## make sure the weights folder exists
Path(cfg['model_folder']).mkdir(parents = True, exist_ok = True)

# tokenizer_src and tokenizer_tgt become module-level names here; collate_fn reads them as globals.
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(cfg)


# Model sized to the two vocabularies, moved to the selected device.
model = get_model(cfg, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)

# tensorboard
writer = SummaryWriter(cfg['experiment_name'])

optimizer = torch.optim.Adam(model.parameters(), lr = cfg['lr'] , eps = 1e-9)
## Adam adapts the step size per parameter, so weights tied to rarely seen words can take larger steps


Using device:  cuda
dataset_size 127085
Max length of source sentence: 471
Max length of target sentence: 482
Max length of filtered source sentence: 45
Max length of filterd target sentence: 48
length of train dataset 60888
length of validation dataset 12709
Total Parameters: 57124690


In [7]:
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    groups = iter(optimizer.param_groups)
    try:
        first_group = next(groups)
    except StopIteration:
        return None
    return first_group['lr']

In [8]:
# Settings for the one-cycle learning-rate schedule.
# NOTE(review): cfg['num_epochs'] is set to 10 in an earlier cell, but EPOCHS
# below is a separate value -- confirm which one is intended.
EPOCHS = 30                              # number of epochs
STEPS_PER_EPOCH = len(train_dataloader)  # batches per epoch
MAX_LR = 10**-3                          # peak learning rate

In [9]:
# One-cycle learning-rate schedule, stepped once per optimizer step.
# Total length is EPOCHS * STEPS_PER_EPOCH steps.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=MAX_LR,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    # Fraction of the cycle spent increasing the lr: 10% of the run, or half
    # of it when training for a single epoch.
    pct_start=0.5 if EPOCHS == 1 else 1 / 10,
    anneal_strategy="linear",
    three_phase=True,
    div_factor=10,        # initial lr = max_lr / div_factor
    final_div_factor=10,  # minimum lr = initial lr / final_div_factor
)

In [None]:


## Mixed-precision training loop: one pass over train_dataloader per epoch,
## followed by validation and a checkpoint that replaces the previous epoch's.

## if the user has specified a model to preload before training , load it
initial_epoch = 0
global_step = 0
## NOTE: preloading is currently disabled. The snippet below refers to `config`,
## while this notebook's configuration dict is named `cfg`.
# if config['preload']:
#     model_filename = get_weights_file_path(config, config['preload'])
#     print(f'Preloading model {model_filename}')
#     state = torch.load(model_filename)
#     model.load_state_dict(state['model_state_dict'])
#     initial_epoch = state['epoch'] + 1
#     optimizer.load_state_dict(state['optimizer_state_dict'])       ## important to store optimiser for Adam as all weights have different lr
#     global_step = state['global_step']
#     print("preloaded")

## The logits are scored over the target vocabulary, so the padding id to
## ignore must come from the target tokenizer (was tokenizer_src).
loss_fn = nn.CrossEntropyLoss(ignore_index = tokenizer_tgt.token_to_id('[PAD]'), label_smoothing= 0.1)

scaler = torch.cuda.amp.GradScaler()
lr = [0.0]  ## history of scheduler.get_last_lr() after every step

for epoch in range(initial_epoch, EPOCHS):
    ## Per-step losses for this epoch, kept as plain Python floats so the
    ## autograd graph of every step is not kept alive; loss_sum gives the
    ## running mean without re-stacking the whole history on each step.
    loss_acc = []
    loss_sum = 0.0

    model.train()
    batch_iterator = tqdm(train_dataloader, desc = f"Processing Epoch {epoch:02d}")

    for batch in batch_iterator:
        torch.cuda.empty_cache()

        encoder_input = batch['encoder_input'].to(device) # [B, seq_len]
        decoder_input = batch['decoder_input'].to(device) # [B, seq_len]
        ## The masks get one extra dimension at index 1 here; presumably they
        ## end up as [B, 1, 1, seq_len] and [B, 1, seq_len, seq_len] -- TODO
        ## confirm against the collate function.
        encoder_mask = batch['encoder_mask'].to(device).unsqueeze(1)
        decoder_mask = batch['decoder_mask'].to(device).unsqueeze(1)
        label = batch['label'].to(device) ## [B, seq_len]

        ## run the tensors through the encoder, decoder and projection layer
        with torch.autocast(device_type = 'cuda', dtype = torch.float16 ):
            encoder_output = model.encode(encoder_input, encoder_mask)  # [B, seq_len, d_model]
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
            proj_output = model.project(decoder_output) # [B, seq_len, Vocab_size]

            ## compare the output with the label using cross entropy
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))

        ## read the loss once and reuse it for the progress bar and the log
        loss_value = loss.item()
        loss_acc.append(loss_value)
        loss_sum += loss_value
        batch_iterator.set_postfix(
            {"loss_acc": f"{loss_sum / len(loss_acc):6.3f}",
                "loss": f"{loss_value:6.3f}", "lr" : f"{get_lr(optimizer)}"
            })

        ## log the loss
        writer.add_scalar('train_loss', loss_value, global_step)
        writer.flush()

        ## backpropagate the scaled loss
        scaler.scale(loss).backward()

        ## update the weights; GradScaler skips the optimizer step when it
        ## finds inf/NaN gradients and then lowers its scale, so a drop in
        ## scale means the step was skipped and the scheduler must not advance
        scale = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        skip_lr_sched = (scale > scaler.get_scale())
        if not skip_lr_sched:
            scheduler.step()
        lr.append(scheduler.get_last_lr())
        optimizer.zero_grad(set_to_none = True)

        global_step += 1


    ## run validation at the end of every epoch
    run_validation(model,val_dataloader, tokenizer_src, tokenizer_tgt, cfg['seq_len'], device, lambda msg : batch_iterator.write(msg) , global_step, writer)

    ## save this epoch's checkpoint first, so a valid checkpoint always exists on disk
    model_filename = get_weights_file_path(cfg, f"{epoch:02d}")
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'global_step': global_step
    }, model_filename )

    ## then remove the previous epoch's file; a missing file must not abort training
    if epoch > 0:
        prev_model_filename  = get_weights_file_path(cfg, f"{epoch - 1:02d}")
        if os.path.exists(prev_model_filename):
            os.remove(prev_model_filename)


Processing Epoch 00: 100%|██████████| 7611/7611 [17:14<00:00,  7.36it/s, loss_acc=4.823, loss=4.476, lr=0.00039989488437281004]


--------------------------------------------------------------------------------
    SOURCE: Unable to comprehend the captain's resistance, he hastened to say to him,−−
    TARGET: Ne pouvant s’expliquer la résistance du capitaine, il se hâta de lui dire :
 PREDICTED: Le capitaine voulut lui comprendre le capitaine , il se hâta de lui dire :
--------------------------------------------------------------------------------
    SOURCE: The elder one, whom you have seen (and whom I cannot hate, whilst I abhor all his kindred, because he has some grains of affection in his feeble mind, shown in the continued interest he takes in his wretched sister, and also in a dog-like attachment he once bore me), will probably be in the same state one day.
    TARGET: L'aîné, que vous avez vu (et que je ne puis pas haïr, bien que je déteste toute sa famille, parce que cet esprit faible a montré, par son continuel intérêt pour sa malheureuse soeur, qu'il y avait en lui quelque peu d'affection, et parce q

Processing Epoch 01: 100%|██████████| 7611/7611 [19:14<00:00,  6.59it/s, loss_acc=3.801, loss=3.257, lr=0.0006997897687456201]


--------------------------------------------------------------------------------
    SOURCE: « Si cependant vous avez la bonté de me mettre au courant de vos investigations, continua-t-il, je serai heureux de vous preter mon concours dans la limite de mes moyens.
    TARGET: "If you will let me know how your investigations go," he continued, "I shall be happy to give you any help I can.
 PREDICTED: " If you have done my of my , I ' s , I ," he continued , " I am , " I am in my of my ."
--------------------------------------------------------------------------------
    SOURCE: "No doubt it is a geyser, like those in Iceland."
    TARGET: --Eh! sans doute, geyser, riposte mon oncle, un geyser pareil à ceux de l'Islande!»
 PREDICTED: -- Non , c ' est un homme , comme ceux dans l ' Islande .
--------------------------------------------------------------------------------


Processing Epoch 02: 100%|██████████| 7611/7611 [19:10<00:00,  6.62it/s, loss_acc=3.460, loss=3.657, lr=0.000999645234758234]


--------------------------------------------------------------------------------
    SOURCE: The fair Amanda reflected for a while.
    TARGET: La belle Amanda réfléchit un peu.
 PREDICTED: L ’ Amanda songea pour un temps .
--------------------------------------------------------------------------------
    SOURCE: But his thoughts had never turned in that direction, and, moreover, he had not the least inclination for riotous living.
    TARGET: Il n'y avait pas pensé, parce que sa chair était morte, et qu'il ne se sentait plus le moindre appétit de débauche.
 PREDICTED: Mais sa pensée n ' avait pas dans cette direction , et , et , il n ' avait pas le moins de .
--------------------------------------------------------------------------------


Processing Epoch 03: 100%|██████████| 7611/7611 [19:06<00:00,  6.64it/s, loss_acc=3.113, loss=3.385, lr=0.0007004598808689559]


--------------------------------------------------------------------------------
    SOURCE: A noise aroused him; someone was knocking at the door, trying to open it.
    TARGET: Un bruit le réveilla, on frappait a la porte, on essayait d'ouvrir.
 PREDICTED: Un bruit le fit ; quelqu ’ un s ’ en allait , en s ’ en .
--------------------------------------------------------------------------------
    SOURCE: When they reached the shop, everyone was ready: Grivet and Olivier, the witnesses of Therese, were there, along with Suzanne, who looked at the bride as little girls look at dolls they have just dressed up.
    TARGET: Lorsqu'ils arrivèrent à la boutique, tout le monde était prêt: il y avait là Grivet et Olivier, témoins de Thérèse, et Suzanne qui regardait la mariée comme les petites filles regardent les poupées qu'elles viennent d'habiller.
 PREDICTED: Quand ils furent prêts à la boutique ; chacun était prêt à Olivier et Olivier , la témoins , il y avait des filles , qui regardait 

Processing Epoch 04: 100%|██████████| 7611/7611 [19:48<00:00,  6.40it/s, loss_acc=2.633, loss=3.261, lr=0.00040060441485634203]


--------------------------------------------------------------------------------
    SOURCE: « Voila mon dernier anneau, cria-t-il, tout est complet maintenant. »
    TARGET: "The last link," he cried, exultantly. "My case is complete."
 PREDICTED: " That last the last ring ," he cried , " is his astonishment ."
--------------------------------------------------------------------------------
    SOURCE: CHAPTER 16
    TARGET: CHAPITRE XVI.
 PREDICTED: CHAPITRE XVI
--------------------------------------------------------------------------------


Processing Epoch 05: 100%|██████████| 7611/7611 [19:55<00:00,  6.37it/s, loss_acc=2.229, loss=2.265, lr=0.0001007489488437281]


--------------------------------------------------------------------------------
    SOURCE: If your feelings are still what they were last April, tell me so at once. _My_ affections and wishes are unchanged, but one word from you will silence me on this subject for ever."
    TARGET: Les miens n’ont pas varié, non plus que le reve que j’avais formé alors. Mais un mot de vous suffira pour m’imposer silence a jamais.
 PREDICTED: Si vos sentiments sont encore si elles me le dire , si je suis incapable de me rendre la route , mais un mot de silence pour vous , je vous réponds sur ce sujet .
--------------------------------------------------------------------------------
    SOURCE: "So did I, madam, and I am excessively disappointed.
    TARGET: -- Moi aussi, madame, et vous me voyez très désappointé.
 PREDICTED: -- Je m ' a dit , madame , et je suis un peu longue .
--------------------------------------------------------------------------------


Processing Epoch 06: 100%|██████████| 7611/7611 [19:41<00:00,  6.44it/s, loss_acc=1.992, loss=2.046, lr=9.626086004434347e-05]


--------------------------------------------------------------------------------
    SOURCE: Observing Conseil, I discovered that, just barely, the gallant lad had fallen under the general influence.
    TARGET: En observant Conseil, je constatai que ce brave garçon subissait tant soit peu l'influence générale.
 PREDICTED: Je , Conseil , à peine , à peine , que le brave garçon était tombé sous l ' influence général .
--------------------------------------------------------------------------------
    SOURCE: "I tell you that she can't see you."
    TARGET: -- Je n'y peux rien, s'écria la femme d'un ton rude, je vous répète qu'elle ne peut vous voir.
 PREDICTED: -- Je vous dis qu ' elle ne peut pas vous voir .
--------------------------------------------------------------------------------


Processing Epoch 07: 100%|██████████| 7611/7611 [19:46<00:00,  6.42it/s, loss_acc=1.936, loss=1.878, lr=9.25123586894041e-05]


--------------------------------------------------------------------------------
    SOURCE: Among some unimportant papers he found the following letter, that which he had sought at the risk of his life:
    TARGET: Au milieu de quelques papiers sans importance, il trouva la lettre suivante: c'était celle qu'il était allé chercher au risque de sa vie:
 PREDICTED: Quelques papiers , après avoir trouvé la lettre , ce qui était essayé de courir au risque de sa vie :
--------------------------------------------------------------------------------
    SOURCE: Thereupon her son had a nervous attack, and threatened to fall ill, if she did not give way to his whim.
    TARGET: Son fils eut une crise de nerfs, il la menaça de tomber malade, si elle ne cédait pas à son caprice.
 PREDICTED: Là - dessus son fils avait une crise nerveuse , et il menaçait de ne pas se laisser agir franchement , de sa fantaisie .
--------------------------------------------------------------------------------


Processing Epoch 08: 100%|██████████| 7611/7611 [19:46<00:00,  6.42it/s, loss_acc=1.892, loss=1.719, lr=8.876336462923932e-05]


--------------------------------------------------------------------------------
    SOURCE: Mrs. Fairfax turned out to be what she appeared, a placid-tempered, kind-natured woman, of competent education and average intelligence.
    TARGET: Mme Fairfax était en effet ce qu'elle m'avait paru tout d'abord, une femme douce, complaisante, suffisamment instruite, et d'une intelligence ordinaire.
 PREDICTED: Mme Fairfax se tourna vers cette éducation de l ' éducation tranquille et capables de l ' éducation de capables , de l ' intelligence et de toute intelligence .
--------------------------------------------------------------------------------
    SOURCE: The host drew back and burst into tears.
    TARGET: L'hôte recula d'un pas et se mit à fondre en larmes.
 PREDICTED: L ' hôte se recula , et éclata en sanglots .
--------------------------------------------------------------------------------


Processing Epoch 09: 100%|██████████| 7611/7611 [18:47<00:00,  6.75it/s, loss_acc=1.856, loss=2.037, lr=8.501486327429995e-05]


--------------------------------------------------------------------------------
    SOURCE: I thought I recognised you at street-corners, and I ran after all the carriages through whose windows I saw a shawl fluttering, a veil like yours."
    TARGET: J’ai cru vous reconnaître au coin des rues; et je courais après tous les fiacres où flottait à la portière un châle, un voile pareil au vôtre...
 PREDICTED: Je croyais que vous dans les coin de la rue ; et je courus de toute la voiture des voitures au un voile , bien tourner la vôtre .
--------------------------------------------------------------------------------
    SOURCE: ÉTIENNE had at last descended from the platform and entered the Voreux; he spoke to men whom he met, asking if there was work to be had, but all shook their heads, telling him to wait for the captain.
    TARGET: Étienne, descendu enfin du terri, venait d'entrer au Voreux; et les hommes auxquels il s'adressait, demandant s'il y avait du travail, hochaient la tete, 

Processing Epoch 10: 100%|██████████| 7611/7611 [18:49<00:00,  6.74it/s, loss_acc=1.826, loss=1.917, lr=8.126636191936059e-05]


--------------------------------------------------------------------------------
    SOURCE: The porter said, 'Yes, madam'; and the constable began not to like it, and would have persuaded the mercer to dismiss him, and let me go, since, as he said, he owned I was not the person.
    TARGET: Le commissionnaire dit: «Oui, madame»; et la chose commença de déplaire au commissaire qui s'efforça de persuader au mercier de me congédier et de me laisser aller, puisque, ainsi qu'il disait, il convenait que je n'étais point la personne.
 PREDICTED: Le portier , madame . Et le commissaire se mit à commissaire en commissaire , et ne l ' eût pas dit le mercier , car je fus reçue , et me , car il ne le but , car il ne pouvait .
--------------------------------------------------------------------------------
    SOURCE: "Are you a Christian?"
    TARGET: -- Êtes-vous chrétien?
 PREDICTED: -- Vous êtes un chrétien ?
--------------------------------------------------------------------------------


Processing Epoch 11: 100%|██████████| 7611/7611 [18:45<00:00,  6.76it/s, loss_acc=1.802, loss=1.787, lr=7.751835326964662e-05]


--------------------------------------------------------------------------------
    SOURCE: "Listen!" he said to her; and she shuddered at the sound of that fatal voice which she had not heard for a long time.
    TARGET: « Écoute », lui dit-il, et elle frémit au son de cette voix funeste qu’elle n’avait pas entendue depuis longtemps. Il continua.
 PREDICTED: « Écoutez , lui dit - il , et elle frissonnait au son de cette voix fatal qui ne l ' avait pas entendu pendant longtemps .
--------------------------------------------------------------------------------
    SOURCE: At eight o'clock Justin came to fetch him to shut up the shop.
    TARGET: À huit heures, Justin venait le chercher pour fermer la pharmacie.
 PREDICTED: À huit heures , Justin venait l ’ aller chercher pour qu ’ il fermait la boutique .
--------------------------------------------------------------------------------


Processing Epoch 12: 100%|██████████| 7611/7611 [18:50<00:00,  6.74it/s, loss_acc=1.779, loss=1.795, lr=7.376985191470725e-05]


--------------------------------------------------------------------------------
    SOURCE: "That was what I wished you to think."
    TARGET: – C’est ce que je désirais vous faire croire.
 PREDICTED: -- C ' est ce que je voulais vous croire .
--------------------------------------------------------------------------------
    SOURCE: Eight o'clock struck.
    TARGET: Huit heures sonnaient.
 PREDICTED: Huit heures sonnerent .
--------------------------------------------------------------------------------


Processing Epoch 13: 100%|██████████| 7611/7611 [18:41<00:00,  6.79it/s, loss_acc=1.760, loss=1.979, lr=7.002184326499329e-05]


--------------------------------------------------------------------------------
    SOURCE: "Yes, but water decomposed into its primitive elements," replied Cyrus Harding, "and decomposed doubtless, by electricity, which will then have become a powerful and manageable force, for all great discoveries, by some inexplicable laws, appear to agree and become complete at the same time.
    TARGET: -- Oui, mais l'eau décomposée en ses éléments constitutifs, répondit Cyrus Smith, et décomposée, sans doute, par l'électricité, qui sera devenue alors une force puissante et maniable, car toutes les grandes découvertes, par une loi inexplicable, semblent concorder et se compléter au même moment.
 PREDICTED: -- Oui , mais dans ses éléments Bunzen , répondit Cyrus Smith , et il aura été par l ' électricité qui se tout à fait pour un grand fracas .
--------------------------------------------------------------------------------
    SOURCE: En effet, j’ai vu deux de ses espions particuliers, de moi b

Processing Epoch 14: 100%|██████████| 7611/7611 [19:03<00:00,  6.66it/s, loss_acc=1.743, loss=1.720, lr=6.627383461527933e-05]


--------------------------------------------------------------------------------
    SOURCE: "By God! I tell you you shall drink a glass in here; I'll break the jaws of the first man who looks askance at me!"
    TARGET: —Nom de Dieu! je te dis que tu vas boire une chope la-dedans, je casse la gueule au premier qui me regarde de travers!
 PREDICTED: — Nom de Dieu ! je te dis que tu vas boire un verre , moi , je les mâchoires de l ' air qui me de travers .
--------------------------------------------------------------------------------
    SOURCE: Perhaps other creeks also ran towards the west, but they could not be seen.
    TARGET: Peut-être d'autres creeks couraient-ils vers l'ouest, mais rien ne permettait de le constater.
 PREDICTED: Peut - etre aussi , étant - elle aussi ah ! mais ils ne pouvaient voir de l ' ouest .
--------------------------------------------------------------------------------


Processing Epoch 15:  66%|██████▌   | 5004/7611 [12:10<06:26,  6.74it/s, loss_acc=1.715, loss=1.729, lr=6.38088303725399e-05] 