<a href="https://colab.research.google.com/github/s-pike3/Projects_In_AI-ML/blob/main/Hw3_part3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from torch.utils.data.dataloader import default_collate
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn import tree
import matplotlib.pyplot as plt
import math
import torch
from torch import nn
from torch.nn import functional as F
!pip install datasets
from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers import normalizers, pre_tokenizers, Tokenizer, models, trainers
import time
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.nn.utils.rnn import pack_padded_sequence



### Data Preparation


TranslationDataset stores instance of English-French translation set and associated tokenizers.

In [2]:
from torch.utils.data import Dataset
from tokenizers.processors import TemplateProcessing
class TranslationDataset(Dataset):
    def __init__(self, data, eng_tokenizer=None,fr_tokenizer=None):
        super(TranslationDataset, self).__init__()
        self.data = data
        self.values = []
        self.labels = []
        self.set_values_labels()
        self.eng_tokenizer = None
        if eng_tokenizer is None:
          self.eng_tokenizer = Tokenizer(models.BPE())
          self.eng_tokenizer.enable_padding(pad_id=0, pad_token="[PAD]")
          trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "[BOS]", "[EOS]"]) #  Adding [BOS] and [EOS] here
          self.eng_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
          self.eng_tokenizer.post_processor = TemplateProcessing(
              single="[BOS] $A [EOS]",
              special_tokens=[("[BOS]", 1), ("[EOS]", 2)],
          )
          self.eng_tokenizer.train_from_iterator(self.values_iterator(), length=len(self.values))
        else:
          self.eng_tokenizer = eng_tokenizer

        self_fr_tokenizer = None
        if fr_tokenizer is None:
          self.fr_tokenizer = Tokenizer(models.BPE())
          trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "[BOS]", "[EOS]"]) #  Adding [BOS] and [EOS] here
          self.fr_tokenizer.enable_padding(pad_id=0, pad_token="[PAD]")
          self.fr_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
          self.fr_tokenizer.post_processor = TemplateProcessing(
              single="[BOS] $A [EOS]",
              special_tokens=[("[BOS]", 1), ("[EOS]", 2)],
          )
          self.fr_tokenizer.train_from_iterator(self.labels_iterator(), length=len(self.labels))
        else:
          self.fr_tokenizer = fr_tokenizer
        self.fr_max_len = 0
        self.set_max_len()


    def set_max_len(self):
      for i in range(len(self.values)):
        seq_len = len(self.fr_tokenizer.encode(self.labels[i]).ids)
        if(seq_len > self.fr_max_len):
          self.fr_max_len = seq_len
    def get_fr_max_len(self):
      return self.fr_max_len

    def __len__(self):
        return len(self.values)  # number of samples in the dataset

    def __getitem__(self, index):
        return torch.tensor(self.eng_tokenizer.encode(self.values[index]).ids), \
            torch.tensor(self.fr_tokenizer.encode(self.labels[index]).ids)

    def set_values_labels(self):
       for i in range(len(self.data)):
        self.values.append(self.data[i]['translation']['en'])
        self.labels.append(self.data[i]['translation']['fr'])

    def get_values(self):
      return self.values

    def get_labels(self):
      return self.labels

    def values_iterator(self):
      for i in range(len(self.values)):
          yield self.values[i]

    def labels_iterator(self):
      for i in range(len(self.labels)):
          yield self.labels[i]

    def get_eng_tokenizer(self):
      return self.eng_tokenizer

    def get_fr_tokenizer(self):
      return self.fr_tokenizer

Load Data

In [3]:
books = load_dataset("opus_books", "en-fr")
tmp = books["train"].train_test_split(test_size=0.1)
books1 = tmp["test"]
books1 = books1.train_test_split(test_size=0.2)
train = books1["train"]
tmp = books1["test"].train_test_split(test_size=0.5)
val = tmp["train"]
test = tmp["test"]
len(train),len(val),len(test)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


(10167, 1271, 1271)

In [4]:
train_iter = TranslationDataset(train)

In [5]:
#Validation and test set uses tokenizers trained from the training set
val_iter = TranslationDataset(val, eng_tokenizer=train_iter.get_eng_tokenizer(),fr_tokenizer=train_iter.get_fr_tokenizer())
test_iter = TranslationDataset(test,eng_tokenizer=train_iter.get_eng_tokenizer(),fr_tokenizer=train_iter.get_fr_tokenizer())

## Transformer

Adapted from the following tutorials:

Etienne, B. (2024, March 19). A complete guide to write your own transformers. Medium. https://medium.com/data-science/a-complete-guide-to-write-your-own-transformers-29e23f371ddd

Sayed, E. (2024, June 11). Building a transformer from scratch: A step-by-step guide. Medium. https://medium.com/@sayedebad.777/building-a-transformer-from-scratch-a-step-by-step-guide-a3df0aeb7c9a



In [16]:
class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_dim=256, num_heads=2):
        """
        input_dim: Dimensionality of the input.
        num_heads: The number of attention heads to split the input into.
        """
        super(MultiHeadAttention, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.Wv = nn.Linear(hidden_dim, hidden_dim, bias=False) # the Value part
        self.Wk = nn.Linear(hidden_dim, hidden_dim, bias=False) # the Key part
        self.Wq = nn.Linear(hidden_dim, hidden_dim, bias=False) # the Query part
        self.Wo = nn.Linear(hidden_dim, hidden_dim, bias=False) # the output layer


    def scaled_dot_product_attention(self,query,key,value,attention_mask=None,key_padding_mask=None):
        """
        query : tensor of shape (batch_size, num_heads, query_sequence_length, hidden_dim//num_heads)
        key : tensor of shape (batch_size, num_heads, key_sequence_length, hidden_dim//num_heads)
        value : tensor of shape (batch_size, num_heads, key_sequence_length, hidden_dim//num_heads)
        attention_mask : tensor of shape (query_sequence_length, key_sequence_length)
        key_padding_mask : tensor of shape (sequence_length, key_sequence_length)

        """
        d_k = query.size(-1)
        tgt_len, src_len = query.size(-2), key.size(-2)
        attention_scores = torch.div(torch.matmul(query,key.transpose(-2,-1)),np.sqrt(d_k))

        # Attention mask here
        if attention_mask is not None:
          attention_mask = attention_mask.unsqueeze(0)
          attention_scores = attention_scores + attention_mask

        # Key mask here
        if key_padding_mask is not None:
            key_padding_mask = key_padding_mask.unsqueeze(1).unsqueeze(2) # Broadcast over batch size, num heads
            attention_scores = attention_scores + key_padding_mask


        attention_scores = torch.softmax(attention_scores, dim=-1)
        output = torch.matmul(attention_scores, value) # (batch_size, num_heads, sequence_length, hidden_dim)

        return output, attention_scores


    def forward(
            self,
            q,
            k,
            v,
            attention_mask=None,
            key_padding_mask=None):
        """
        q : tensor of shape (batch_size, query_sequence_length, hidden_dim)
        k : tensor of shape (batch_size, key_sequence_length, hidden_dim)
        v : tensor of shape (batch_size, key_sequence_length, hidden_dim)
        attention_mask : tensor of shape (query_sequence_length, key_sequence_length)
        key_padding_mask : tensor of shape (sequence_length, key_sequence_length)

        """
        dk = q.size(-1)
        q = self.Wq(q)
        k = self.Wk(k)
        v = self.Wv(v)

        q =  q.view(q.shape[0], q.shape[1], self.num_heads, dk // self.num_heads).transpose(1, 2)
        k = k.view(k.shape[0], k.shape[1], self.num_heads, dk // self.num_heads).transpose(1, 2)
        v = v.view(v.shape[0], v.shape[1], self.num_heads, dk // self.num_heads).transpose(1, 2)

        attn_values, attn_weights  = self.scaled_dot_product_attention(
            query=q,
            key=k,
            value=v,
            attention_mask=attention_mask,
            key_padding_mask=key_padding_mask,
        )

        batch_size, num_heads, seq_length, head_hidden_dim = attn_values.size()
        combined_attn = attn_values.transpose(1, 2).contiguous().view(batch_size, seq_length, self.num_heads * head_hidden_dim)
        output = self.Wo(combined_attn)

        self.attention_weigths = attn_weights

        return output

In [17]:
# Taken from https://pytorch.org/tutorials/beginner/transformer_tutorial.html#define-the-model
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)

        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_len, embedding_dim]``
        """
        x = x + self.pe[:, :x.size(1), :]
        return x

In [18]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model: int, d_ff: int):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [19]:
class ResidualConnection(nn.Module):

        def __init__(self, n_dim: int, dropout: float) -> None:
            super().__init__()
            self.dropout = nn.Dropout(dropout)
            self.norm = nn.LayerNorm(n_dim)

        def forward(self, x, sublayer):
            return x + self.dropout(sublayer(self.norm(x)))

In [20]:
class EncoderBlock(nn.Module):
    def __init__(self, n_dim: int, dropout: float, n_heads: int):
        super().__init__()
        self.mha = MultiHeadAttention(hidden_dim=n_dim, num_heads=n_heads)
        self.ff = PositionWiseFeedForward(n_dim, n_dim)
        self.residual_connections = nn.ModuleList([ResidualConnection(n_dim, dropout) for _ in range(2)])

    def forward(self, x, src_mask=None):
        x = self.residual_connections[0](x, lambda x: self.mha(x, x, x,  key_padding_mask=src_mask))
        x = self.residual_connections[1](x, self.ff)
        return x

In [21]:
class Encoder(nn.Module):

    def __init__(self,
            vocab_size: int,
            n_dim: int,
            dropout: float,
            n_encoder_blocks: int,
            n_heads: int):

        super().__init__()
        self.n_dim = n_dim

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=n_dim
        )
        self.positional_encoding = PositionalEncoding(
            d_model=n_dim,
            dropout=dropout
        )
        self.layers = nn.ModuleList([
            EncoderBlock(n_dim, dropout, n_heads) for _ in range(n_encoder_blocks)
        ])
        self.norm = nn.LayerNorm(n_dim)

    def forward(self, x, padding_mask=None):
        x = self.embedding(x) * math.sqrt(self.n_dim)
        x = self.positional_encoding(x)
        for layer in self.layers:
            x = layer(x, padding_mask)
        return x



In [22]:
class DecoderBlock(nn.Module):

    def __init__(self, n_dim: int, dropout: float, n_heads: int):
        super().__init__()
        self.mha = MultiHeadAttention(hidden_dim=n_dim, num_heads=n_heads)
        self.cross_mha = MultiHeadAttention(hidden_dim=n_dim, num_heads=n_heads)
        self.ff = PositionWiseFeedForward(n_dim, n_dim)
        self.residual_connections = nn.ModuleList([ResidualConnection(n_dim, dropout) for _ in range(3)])

    def forward(self, x, encoder_output, tgt_mask=None, tgt_padding_mask=None, memory_padding_mask=None):
        x = self.residual_connections[0](x, lambda x: self.mha(x, x, x, attention_mask=tgt_mask, key_padding_mask=tgt_padding_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_mha(x, encoder_output, encoder_output, attention_mask=None, key_padding_mask=memory_padding_mask))
        x = self.residual_connections[2](x, self.ff)
        return x

class Decoder(nn.Module):

    def __init__(self, vocab_size: int, n_dim: int, dropout: float, n_decoder_blocks: int, n_heads: int):
        super().__init__()

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=n_dim,
            padding_idx=0
        )
        self.positional_encoding = PositionalEncoding(
            d_model=n_dim,
            dropout=dropout
        )

        self.layers = nn.ModuleList([
            DecoderBlock(n_dim, dropout, n_heads) for _ in range(n_decoder_blocks)
        ])


    def forward(self, tgt, memory, tgt_mask=None, tgt_padding_mask=None, memory_padding_mask=None):
        x = self.embedding(tgt)
        x = self.positional_encoding(x)

        for layer in self.layers:
            x = layer(
                x,
                memory,
                tgt_mask=tgt_mask,
                tgt_padding_mask=tgt_padding_mask,
                memory_padding_mask=memory_padding_mask)
        return x

In [23]:
def generate_square_subsequent_mask(size: int):
      """Generate a triangular (size, size) mask. From PyTorch docs."""
      mask = (1 - torch.triu(torch.ones(size, size), diagonal=1)).bool()
      mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
      return mask

In [76]:
class Transformer(nn.Module):
    def __init__(self, vocab_size: int, model_dim: int, dropout: float, n_encoder_layers: int, n_decoder_layers: int, n_heads: int):
        super(Transformer, self).__init__()
        self.vocab_size = vocab_size
        self.model_dim = model_dim
        self.dropout = dropout
        self.n_encoder_layers = n_encoder_layers
        self.n_decoder_layers = n_decoder_layers
        self.n_heads = n_heads
        self.PAD_IDX = 0

        self.encoder = Encoder(
            self.vocab_size, self.model_dim, self.dropout, self.n_encoder_layers, self.n_heads)
        self.decoder = Decoder(
            self.vocab_size, self.model_dim, self.dropout, self.n_decoder_layers, self.n_heads)
        self.fc = nn.Linear(self.model_dim, self.vocab_size)


    @staticmethod
    def generate_square_subsequent_mask(size: int):
            """Generate a triangular (size, size) mask. From PyTorch docs."""
            mask = (1 - torch.triu(torch.ones(size, size), diagonal=1)).bool()
            mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
            return mask


    def encode(
            self,
            x: torch.Tensor,
        ) -> torch.Tensor:
        """
        Input
            x: (B, S) with elements in (0, C) where C is num_classes
        Output
            (B, S, E) embedding
        """
        mask = (x == self.PAD_IDX).float()
        encoder_padding_mask = mask.masked_fill(mask == 1, float('-inf'))

        # (B, S, E)
        encoder_output = self.encoder(
            x,
            padding_mask=encoder_padding_mask
        )

        return encoder_output, encoder_padding_mask


    def decode(
            self,
            tgt: torch.Tensor,
            memory: torch.Tensor,
            memory_padding_mask=None
        ) -> torch.Tensor:
        """
        B = Batch size
        S = Source sequence length
        L = Target sequence length
        E = Model dimension

        Input
            encoded_x: (B, S, E)
            y: (B, L) with elements in (0, C) where C is num_classes
        Output
            (B, L, C) logits
        """

        mask = (tgt == self.PAD_IDX).float()
        tgt_padding_mask = mask.masked_fill(mask == 1, float('-inf'))


        decoder_output = self.decoder(
            tgt=tgt,
            memory=memory,
            tgt_mask=self.generate_square_subsequent_mask(tgt.size(1)),
            tgt_padding_mask=tgt_padding_mask,
            memory_padding_mask=memory_padding_mask,
        )
        output = self.fc(decoder_output)  # shape (B, L, C)

        return output



    def forward(
            self,
            x: torch.Tensor,
            y: torch.Tensor,
        ) -> torch.Tensor:
        """
        Input
            x: (B, Sx) with elements in (0, C) where C is num_classes
            y: (B, Sy) with elements in (0, C) where C is num_classes
        Output
            (B, L, C) logits
        """

        # Encoder output shape (B, S, E)
        encoder_output, encoder_padding_mask = self.encode(x)

        # Decoder output shape (B, L, C)
        decoder_output = self.decode(
            tgt=y,
            memory=encoder_output,
            memory_padding_mask=encoder_padding_mask
        )

        return decoder_output

#### Training Transformer

In [27]:
#https://medium.com/data-science/a-complete-guide-to-write-your-own-transformers-29e23f371ddd

from tqdm import tqdm
PAD_IDX = 0
SOS_IDX = 1
EOS_IDX = 2

def train(model, optimizer, loader, loss_fn, epoch):
    model.train()
    losses = 0
    acc = 0
    history_loss = []
    history_acc = []
    print('1')

    with tqdm(loader, position=0, leave=True) as tepoch:
        for x, y in tepoch:
            tepoch.set_description(f"Epoch {epoch}")

            optimizer.zero_grad()

            logits = model(x, y[:, :-1])
            loss = loss_fn(logits.contiguous().view(-1, model.vocab_size), y[:, 1:].contiguous().view(-1))

            loss.backward()

            optimizer.step()
            losses += loss.item()

            preds = logits.argmax(dim=-1)
            masked_pred = preds * (y[:, 1:]!=PAD_IDX)
            accuracy = (masked_pred == y[:, 1:]).float().mean()
            acc += accuracy.item()

            history_loss.append(loss.item())
            history_acc.append(accuracy.item())
            tepoch.set_postfix(loss=loss.item(), accuracy=100. * accuracy.item())

    return losses / len(list(loader)), acc / len(list(loader)), history_loss, history_acc


def evaluate(model, loader, loss_fn):
    model.eval()
    losses = 0
    acc = 0
    history_loss = []
    history_acc = []

    for x, y in tqdm(loader, position=0, leave=True):

        logits = model(x, y[:, :-1])

        loss = loss_fn(logits.contiguous().view(-1, model.vocab_size), y[:, 1:].contiguous().view(-1))
        losses += loss.item()

        preds = logits.argmax(dim=-1)
        masked_pred = preds * (y[:, 1:]!=PAD_IDX)
        accuracy = (masked_pred == y[:, 1:]).float().mean()
        acc += accuracy.item()

        history_loss.append(loss.item())
        history_acc.append(accuracy.item())

    return losses / len(list(loader)), acc / len(list(loader)), history_loss, history_acc

In [6]:
def collate_fn(batch):
    """
    This function pads inputs with PAD_IDX to have batches of equal length
    """
    src_batch, tgt_batch = [], []

    for src_sample, tgt_sample in batch:
        src_batch.append(src_sample)
        tgt_batch.append(tgt_sample)

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX, batch_first=True)
    return src_batch, tgt_batch

In [28]:
#https://medium.com/data-science/a-complete-guide-to-write-your-own-transformers-29e23f371ddd
# Model definition
model = Transformer(vocab_size=train_iter.get_fr_tokenizer().get_vocab_size(),
    model_dim=128,
    dropout=0.1,
    n_encoder_layers=2,
    n_decoder_layers=2,
    n_heads=2)

# Instantiate datasets
dataloader_train = DataLoader(train_iter, batch_size=8, collate_fn=collate_fn)
dataloader_val = DataLoader(val_iter, batch_size=8, collate_fn=collate_fn)

for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

# Define loss function : we ignore logits which are padding tokens
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)

# Save history to dictionnary
history = {
    'train_loss': [],
    'eval_loss': [],
    'train_acc': [],
    'eval_acc': []
}

# Main loop
for epoch in range(1, 5):
    start_time = time.time()

    train_loss, train_acc, hist_loss, hist_acc = train(model, optimizer, dataloader_train, loss_fn, epoch)

    history['train_loss'] += hist_loss
    history['train_acc'] += hist_acc
    end_time = time.time()
    val_loss, val_acc, hist_loss, hist_acc = evaluate(model, dataloader_val, loss_fn)
    history['eval_loss'] += hist_loss
    history['eval_acc'] += hist_acc
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Train acc: {train_acc:.3f}, Val loss: {val_loss:.3f}, Val acc: {val_acc:.3f} "f"Epoch time = {(end_time - start_time):.3f}s"))


1


Epoch 1: 100%|██████████| 1271/1271 [15:24<00:00,  1.37it/s, accuracy=73.2, loss=5.76]
100%|██████████| 159/159 [00:56<00:00,  2.83it/s]


Epoch: 1, Train loss: 6.099, Train acc: 0.600, Val loss: 5.806, Val acc: 0.627 Epoch time = 931.291s
1


Epoch 2: 100%|██████████| 1271/1271 [15:39<00:00,  1.35it/s, accuracy=75, loss=5.32]
100%|██████████| 159/159 [00:46<00:00,  3.42it/s]


Epoch: 2, Train loss: 5.186, Train acc: 0.632, Val loss: 5.512, Val acc: 0.644 Epoch time = 945.293s
1


Epoch 3: 100%|██████████| 1271/1271 [15:47<00:00,  1.34it/s, accuracy=76, loss=4.98]
100%|██████████| 159/159 [00:46<00:00,  3.39it/s]


Epoch: 3, Train loss: 4.733, Train acc: 0.653, Val loss: 5.513, Val acc: 0.650 Epoch time = 953.533s
1


Epoch 4: 100%|██████████| 1271/1271 [15:40<00:00,  1.35it/s, accuracy=76.4, loss=4.73]
100%|██████████| 159/159 [00:47<00:00,  3.37it/s]


Epoch: 4, Train loss: 4.422, Train acc: 0.672, Val loss: 5.442, Val acc: 0.655 Epoch time = 946.265s


### Translation

In [29]:
class Translator(nn.Module):
    def __init__(self, transformer, eng_tokenizer, fr_tokenizer):
        super(Translator, self).__init__()
        self.transformer = transformer
        self.eng_tokenizer = eng_tokenizer
        self.fr_tokenizer = fr_tokenizer

    def __call__(self, sentence, max_length=None, pad=False):

        x = torch.tensor(self.eng_tokenizer.encode(sentence).ids).reshape(1,-1)
        encoder_output, mask = model.encode(x) # (B, S, E)
        if not max_length:
            max_length = x.size(1)

        outputs = torch.ones((x.size()[0], max_length)).type_as(x).long() * SOS_IDX

        for step in range(1, max_length):
            y = outputs[:, :step]
            probs = self.transformer.decode(y, encoder_output)
            output = torch.argmax(probs, dim=-1)

            if output[:, -1].detach().numpy() in (EOS_IDX, SOS_IDX):
                break
            outputs[:, step] = output[:, -1]

        translation = ""
        for i in range(len(outputs[0])):
            translation += self.fr_tokenizer.id_to_token(outputs[0][i])
            translation += " "

        return translation

translator = Translator(model,train_iter.get_eng_tokenizer(), train_iter.get_fr_tokenizer())

In [30]:
sentence = "I am tired"
out = translator(sentence)
out

'" Je suis . " '

In [33]:
from nltk.translate.bleu_score import sentence_bleu
true = val_iter.get_values()
scores = []
for i in range(len(true)):
  guess = [translator(true[i])]
  scores.append(sentence_bleu([true[i]], guess))
  if(i == 500):
    break
mean_score = np.mean(scores)
print(mean_score)

0.0


We see from above that the model translates "I am tired" to "Je suis." (I am.). Thus, although it is not able to translate accurately, it demonstrates some basic understanding. The bleu score indicates that the transformer model has very low performance, although this is likely due to a lack of training time.

Although I was not able to compare my transformer with my encoder-decoder, it seems probable that the encoder decoder model would train faster due to decreased model complexity but would be less accurate because it is a less sophisticated model.

## Encoder Decoder

In [7]:
def scaled_dot_product_attention(self,query,key,value):
        d_k = query.size(-1)
        tgt_len, src_len = query.size(-2), key.size(-2)
        attention_scores = torch.div(torch.matmul(query,key.transpose(-2,-1)),np.sqrt(d_k))

        attention_scores = torch.softmax(attention_scores, dim=-1)
        output = torch.matmul(attention_scores, value) # (batch_size, num_heads, sequence_length, hidden_dim)

        return output, attention_scores

In [9]:
class EncoderDecoder(nn.Module):
  def __init__(self, input_size, output_size, hidden_size, encoder):
    super().__init__()
    self.hidden_size = hidden_size
    self.encoder = encoder
    self.decoder = nn.LSTM(hidden_size,output_size)
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, input_tensor, target_tensor):

    encoder_output, encoder_hidden = self.encoder(input_tensor)
    decoder_output, = self.decoder(encoder_output)
    output = self.fc(decoder_output)
    return output


In [8]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = num_hiddens
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, self.hidden_size,batch_first=True)


    def forward(self, X):
        embs = self.embedding(X.t())
        context, attn_weights = scaled_dot_product_attention(embs,embs,embs)
        h_0 = torch.zeros(1, embs.size(0), self.hidden_size)
        outputs, state = self.rnn(embs,h_0)
        return outputs, state

### Works Cited

Etienne, B. (2024, March 19). A complete guide to write your own transformers. Medium. https://medium.com/data-science/a-complete-guide-to-write-your-own-transformers-29e23f371ddd


lhoestq. (2021, February 15). NLP dataset for Byteleveltokenizer training. Hugging Face Forums. https://discuss.huggingface.co/t/nlp-dataset-for-byteleveltokenizer-training/3653


lianghsun. (2022, August 22). Add Bos and Eos when encoding a sentence. Hugging Face Forums. https://discuss.huggingface.co/t/add-bos-and-eos-when-encoding-a-sentence/21833/2


huggingface. (n.d.). Quicktour. Tokenizers. https://huggingface.co/docs/tokenizers/python/latest/quicktour.html#post-processing

Sayed, E. (2024, June 11). Building a transformer from scratch: A step-by-step guide. Medium. https://medium.com/@sayedebad.777/building-a-transformer-from-scratch-a-step-by-step-guide-a3df0aeb7c9a


Zhang, A., Lipton, Z., Li, M., & Smola, A. (2023). Dive into Deep Learning. Cambridge University Press.
