In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

In [2]:
import math
# from datasets import load_dataset
# from tokenizers import Tokenizer
# from tokenizers.models import WordLevel
# from tokenizers.trainers import WordLevelTrainer
# from tokenizers.pre_tokenizers import Whitespace

In [3]:
from pathlib import Path

In [4]:
from typing import Any

In [5]:
from tqdm import tqdm

In [6]:
import warnings

In [7]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
class InputEmbeddings(nn.Module):

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        # (batch, seq_len) --> (batch, seq_len, d_model)
        # Multiply by sqrt(d_model) to scale the embeddings according to the paper
        return self.embedding(x) * math.sqrt(self.d_model)

In [9]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        # Create a matrix of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)
        # Create a vector of shape (seq_len)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        # Create a vector of shape (d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # (d_model / 2)
        # Apply sine to even indices
        pe[:, 0::2] = torch.sin(position * div_term) # sin(position * (10000 ** (2i / d_model))
        # Apply cosine to odd indices
        pe[:, 1::2] = torch.cos(position * div_term) # cos(position * (10000 ** (2i / d_model))
        # Add a batch dimension to the positional encoding
        pe = pe.unsqueeze(0) # (1, seq_len, d_model)
        # Register the positional encoding as a buffer
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False) # (batch, seq_len, d_model)
        return self.dropout(x)

In [10]:
class LayerNormalization(nn.Module):
    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features)) # alpha is a learnable parameter
        self.bias = nn.Parameter(torch.zeros(features)) # bias is a learnable parameter

    def forward(self, x):
        # x: (batch, seq_len, hidden_size)
         # Keep the dimension for broadcasting
        mean = x.mean(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # Keep the dimension for broadcasting
        std = x.std(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # eps is to prevent dividing by zero or when std is very small
        return self.alpha * (x - mean) / (std + self.eps) + self.bias

In [11]:
class FeedForwardBlock(nn.Module):

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # w1 and b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # w2 and b2

    def forward(self, x):
        # (batch, seq_len, d_model) --> (batch, seq_len, d_ff) --> (batch, seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

In [12]:
class MultiHeadAttentionBlock(nn.Module):

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model # Embedding vector size
        self.h = h # Number of heads
        # Make sure d_model is divisible by h
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h # Dimension of vector seen by each head
        self.w_q = nn.Linear(d_model, d_model, bias=False) # Wq
        self.w_k = nn.Linear(d_model, d_model, bias=False) # Wk
        self.w_v = nn.Linear(d_model, d_model, bias=False) # Wv
        self.w_o = nn.Linear(d_model, d_model, bias=False) # Wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]
        # Just apply the formula from the paper
        # (batch, h, seq_len, d_k) --> (batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Write a very low value (indicating -inf) to the positions where mask == 0
            attention_scores.masked_fill_(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim=-1) # (batch, h, seq_len, seq_len) # Apply softmax
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        # (batch, h, seq_len, seq_len) --> (batch, h, seq_len, d_k)
        # return attention scores which can be used for visualization
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        query = self.w_q(q) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)

        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        # Calculate attention
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # Multiply by Wo
        # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        return self.w_o(x)

In [13]:
class ResidualConnection(nn.Module):

        def __init__(self, features: int, dropout: float) -> None:
            super().__init__()
            self.dropout = nn.Dropout(dropout)
            self.norm = LayerNormalization(features)

        def forward(self, x, sublayer):
            return x + self.dropout(sublayer(self.norm(x)))

In [14]:
class EncoderBlock(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x

class Encoder(nn.Module):

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [15]:
class DecoderBlock(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x

class Decoder(nn.Module):

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)

In [16]:
class ProjectionLayer(nn.Module):

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> None:
        # (batch, seq_len, d_model) --> (batch, seq_len, vocab_size)
        return self.proj(x)

In [17]:
class Transformer(nn.Module):

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        # (batch, seq_len, d_model)
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)

    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        # (batch, seq_len, d_model)
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        # (batch, seq_len, vocab_size)
        return self.projection_layer(x)

In [18]:
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int=512, N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048) -> Transformer:
    # Create the embedding layers
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    # Create the positional encoding layers
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # Create the encoder blocks
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock(d_model, encoder_self_attention_block, feed_forward_block, dropout)
        encoder_blocks.append(encoder_block)

    # Create the decoder blocks
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_block = DecoderBlock(d_model, decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
        decoder_blocks.append(decoder_block)

    # Create the encoder and decoder
    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))

    # Create the projection layer
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    # Create the transformer
    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Initialize the parameters
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return transformer

In [19]:
transformer = build_transformer(100, 100, 10, 10)

In [20]:
src = torch.randint(0,100, (32,10))

In [21]:
src.shape

torch.Size([32, 10])

In [22]:
src_mask = torch.randint(0,2, (10,10))

In [23]:
src_mask.shape

torch.Size([10, 10])

In [24]:
encoder_output = transformer.encode(src, src_mask)

In [25]:
tgt_mask = torch.randint(0,2, (10, 10))

In [26]:
tgt = torch.randint(0,100, (32, 10))

In [27]:
transformer.decode(encoder_output, src_mask, tgt, tgt_mask).shape

torch.Size([32, 10, 512])

In [28]:
MAX_LEN = 10
def get_config():
    return {
        "batch_size": 64,
        "num_epochs": 100,
        "lr": 10**-4,
        "seq_len": MAX_LEN,
        "d_model": 128,
        "datasource": 'opus_books',
        "lang_src": "en",
        "lang_tgt": "it",
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": "latest",
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel"
    }

In [29]:
config = get_config()

In [30]:
def causal_mask(size):
    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
    return mask == 0

In [31]:
def get_model(config, vocab_src_len, vocab_tgt_len):
    model = build_transformer(vocab_src_len, vocab_tgt_len, config["seq_len"], config['seq_len'], d_model=config['d_model'], N=3, h = 4, d_ff=256)
    return model

In [32]:
from zipfile import ZipFile

with ZipFile("data (1).zip", 'r') as zObject:
    zObject.extractall(path="")

In [33]:
SOS_token = 0
EOS_token = 1
PAD_token = 2
class Lang:
  def __init__(self, name):
    self.name = name
    self.word2index = {}
    self.word2count = {}
    self.index2word = {0:"[SOS]", 1:"[EOS]",2:"[PAD]"}
    self.n_words = 3
  def addSentence(self, sentence):
    for word in sentence.split(' '):
      self.addWord(word)
  def addWord(self, word):
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.word2count[word] = 1
      self.index2word[self.n_words] = word
      self.n_words += 1
    else:
      self.word2count[word] += 1

In [34]:
def unicodeToAscii(s):
  return ''.join(
      c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn'
  )

def normalizeString(s):
  s = unicodeToAscii(s.lower().strip())
  s = re.sub(r"([.!?])", r" \1", s)
  s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
  return s.strip()

In [35]:
def readLangs(lang1, lang2, reverse=False):
  print("Reading Lines......")

  lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').read().strip().split('\n')

  pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

  if reverse:
    pairs = [list(reversed(p)) for p in pairs]
    input_lang = Lang(lang2)
    output_lang = Lang(lang1)
  else:
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

  return input_lang, output_lang, pairs

In [36]:
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
  return len(p[0].split(' ')) < MAX_LEN-1 and len(p[1].split(' ')) < MAX_LEN-1 and p[1].startswith(eng_prefixes)

def filterPairs(pairs):
  return [p for p in pairs if filterPair(p)]

In [37]:
def prepareData(lang1, lang2, reverse = False):
  input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
  print("Read %s sentence pairs" %len(pairs))
  pairs = filterPairs(pairs)
  print("Trimmed to %s sentence pairs" % len(pairs))
  print("Counting words....")
  for pair in pairs:
    input_lang.addSentence(pair[0])
    output_lang.addSentence(pair[1])
  print("Counted Words")
  print(input_lang.name, input_lang.n_words)
  print(output_lang.name, output_lang.n_words)
  return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData('eng','fra',True)

Reading Lines......
Read 135842 sentence pairs
Trimmed to 10551 sentence pairs
Counting words....
Counted Words
fra 4343
eng 2802


In [38]:
print(random.choice(pairs))

['nous sommes reconnaissants', 'we re grateful']


In [39]:
def indexesFromSentence(lang, sentence):
  return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
  indexes = indexesFromSentence(lang, sentence)
  indexes.append(EOS_token)
  return torch.tensor(indexes, dtype=torch.long, device=device).view(1,-1)

def tensorsFromPair(pair):
  input_tensor = tensorFromSentence(input_lang, pair[0])
  target_tensor = tensorFromSentence(output_lang, pair[1])
  return (input_tensor, target_tensor)

In [40]:
def get_dataloader(batch_size):
  input_lang, output_lang, pairs = prepareData('eng', 'fra', True)

  n = len(pairs)
  input_ids = np.zeros((n,MAX_LEN), dtype=np.int32)
  target_ids = np.zeros((n,MAX_LEN), dtype=np.int32)

  for idx, (inp, tgt) in enumerate(pairs):
    inp_ids = indexesFromSentence(input_lang, inp)
    tgt_ids = indexesFromSentence(output_lang, tgt)
    inp_ids.append(EOS_token)
    tgt_ids.append(EOS_token)
    input_ids[idx, :len(inp_ids)] = inp_ids
    target_ids[idx, :len(tgt_ids)] = tgt_ids

  train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                             torch.LongTensor(target_ids).to(device))

  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
  return input_lang, output_lang, train_dataloader


In [41]:
class BilingualDatasetNew(Dataset):
    def __init__(self, pairs, input_lang, output_lang, seq_len=MAX_LEN):
        super().__init__()
        self.seq_len = seq_len
        self.pairs = pairs
        self.input_lang = input_lang
        self.output_lang = output_lang
        self.src_lang = input_lang.name
        self.tgt_lang = output_lang.name
        self.sos_token = torch.tensor([SOS_token], dtype=torch.int64)
        self.eos_token = torch.tensor([EOS_token], dtype=torch.int64)
        self.pad_token = torch.tensor([PAD_token], dtype=torch.int64)
    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        src_target_pair = self.pairs[idx]
        src_text = src_target_pair[0]
        tgt_text = src_target_pair[1]

        enc_input_tokens = indexesFromSentence(self.input_lang, src_text)
        dec_input_tokens = indexesFromSentence(self.output_lang, tgt_text)

        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        if enc_num_padding_tokens<0 or dec_num_padding_tokens <0:
            raise ValueError("Sentence is too long")

        encoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(enc_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token]*enc_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
        )
        decoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                torch.tensor([self.pad_token]*dec_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
        )
        label = torch.cat(
            [
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token]*dec_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
        )
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,
            "decoder_input": decoder_input,
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)),
            "label":label,
            "src_text":src_text,
            "tgt_text":tgt_text,
        }

In [42]:
def get_ds(config):
    input_lang, output_lang, pairs = prepareData('eng','fra',True)
    ds_raw = pairs

    train_ds_size = int(0.9*len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])
    train_ds = BilingualDatasetNew(train_ds_raw, input_lang, output_lang)
    val_ds = BilingualDatasetNew(val_ds_raw, input_lang, output_lang)
    max_len_src = 0
    max_len_tgt = 0
    for item in ds_raw:
        src_ids = indexesFromSentence(input_lang, item[0])
        tgt_ids = indexesFromSentence(output_lang, item[1])
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')


    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, input_lang, output_lang


In [43]:
train_dataloader, val_dataloader, input_lang, output_lang =  get_ds(config)

Reading Lines......
Read 135842 sentence pairs
Trimmed to 10551 sentence pairs
Counting words....
Counted Words
fra 4343
eng 2802
Max length of source sentence: 8
Max length of target sentence: 8


In [45]:
for batch in train_dataloader:
    x = batch
    break

In [44]:
def train_epoch(data_loader, model, optimizer, criterion):
    total_loss = 0
    for batch in data_loader:
        encoder_input = batch['encoder_input'].to(device)
        decoder_input = batch['decoder_input'].to(device)
        encoder_mask = batch['encoder_mask'].to(device)
        decoder_mask = batch['decoder_mask'].to(device)

        encoder_output = model.encode(encoder_input, encoder_mask)
        decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
        proj_output = model.project(decoder_output)
        label = batch['label'].to(device)
        loss = criterion(proj_output.view(-1, output_lang.n_words), label.view(-1))
        loss.backward()

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        total_loss += loss.item()
    return total_loss / len(data_loader)

In [46]:
def train_model(config):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using Device:", device)
    device = torch.device(device)
    # Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)
    train_dataloader, val_dataloader, input_lang, output_lang = get_ds(config)
    model = get_model(config, input_lang.n_words, output_lang.n_words).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr = config['lr'], eps=1e-9)
    initial_epoch = 0

    loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_token, label_smoothing=0.1).to(device)
    for epoch in range(initial_epoch, config['num_epochs']):
        torch.cuda.empty_cache()
        model.train()
        loss = train_epoch(train_dataloader, model, optimizer, loss_fn)
        if epoch % 5 == 0:
          print(f"epoch - {epoch} : loss - {loss}")
    return model

In [47]:
model = train_model(config)

Using Device: cuda
Reading Lines......
Read 135842 sentence pairs
Trimmed to 10551 sentence pairs
Counting words....
Counted Words
fra 4343
eng 2802
Max length of source sentence: 8
Max length of target sentence: 8
epoch - 0 : loss - 6.606975225794235
epoch - 5 : loss - 3.570637584532667
epoch - 10 : loss - 3.0673806651326636
epoch - 15 : loss - 2.73547807315852
epoch - 20 : loss - 2.475146324042506
epoch - 25 : loss - 2.2681292367461543
epoch - 30 : loss - 2.09466687784899
epoch - 35 : loss - 1.951891962313812
epoch - 40 : loss - 1.8353458994987026
epoch - 45 : loss - 1.7392991645224143
epoch - 50 : loss - 1.657767894284037
epoch - 55 : loss - 1.5955496506402957
epoch - 60 : loss - 1.540169155037643
epoch - 65 : loss - 1.4934901087076071
epoch - 70 : loss - 1.456233496633952
epoch - 75 : loss - 1.4253623093534635
epoch - 80 : loss - 1.4012750987238531
epoch - 85 : loss - 1.379709851821797
epoch - 90 : loss - 1.3615811359162298
epoch - 95 : loss - 1.3484423424573553


Decoder(
  (layers): ModuleList(
    (0-2): 3 x DecoderBlock(
      (self_attention_block): MultiHeadAttentionBlock(
        (w_q): Linear(in_features=128, out_features=128, bias=False)
        (w_k): Linear(in_features=128, out_features=128, bias=False)
        (w_v): Linear(in_features=128, out_features=128, bias=False)
        (w_o): Linear(in_features=128, out_features=128, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (cross_attention_block): MultiHeadAttentionBlock(
        (w_q): Linear(in_features=128, out_features=128, bias=False)
        (w_k): Linear(in_features=128, out_features=128, bias=False)
        (w_v): Linear(in_features=128, out_features=128, bias=False)
        (w_o): Linear(in_features=128, out_features=128, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward_block): FeedForwardBlock(
        (linear_1): Linear(in_features=128, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=

In [62]:
def greedy_decode(model, source, input_lang, output_lang, max_len=MAX_LEN, device=device):
    with torch.no_grad():
        sos_idx = SOS_token
        eos_idx = EOS_token
        src_text = source
        enc_input_tokens = indexesFromSentence(input_lang, src_text)
        enc_num_padding_tokens = max_len - len(enc_input_tokens) - 2
        encoder_input = torch.cat(
                [
                    torch.tensor([sos_idx], dtype=torch.int64),
                    torch.tensor(enc_input_tokens, dtype=torch.int64),
                    torch.tensor([eos_idx], dtype=torch.int64),
                    torch.tensor([PAD_token]*enc_num_padding_tokens, dtype=torch.int64),
                ],
                dim=0,
            ).to(device)
        src_mask = (encoder_input != PAD_token).unsqueeze(0).unsqueeze(0).int().to(device)
        encoder_output = model.encode(encoder_input, src_mask)
        decoder_input = torch.empty(1,1).fill_(sos_idx).type_as(encoder_input).to(device)
        while True:
            if decoder_input.size(1) == max_len:
                break
            decoder_mask = causal_mask(decoder_input.size(1)).type_as(src_mask).to(device)
            out = model.decode(encoder_output, src_mask, decoder_input, decoder_mask)
            prob = model.project(out[:,-1])
            _, next_word = torch.max(prob, dim=1)
            decoder_input = torch.cat(
                [decoder_input, torch.empty(1,1).type_as(encoder_input).fill_(next_word.item()).to(device)],dim=1
            ).to(device)
            if next_word==eos_idx:
                break
        words=[]
        for idx in decoder_input.squeeze(0):
          idx = idx.item()
          words.append(output_lang.index2word[idx])
    return words


In [84]:
greedy_decode(model, pairs[4567][0], input_lang, output_lang)[1:-2]

['you', 'are', 'such', 'a', 'liar']

In [79]:
pairs[4567][1]

'you are such a liar !'

In [68]:
input_lang.name

'fra'

In [82]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [83]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

def calculate_bleu(reference, candidate):
    reference = [word_tokenize(sentence.lower()) for sentence in reference]
    candidate = word_tokenize(candidate.lower())

    smoothing_function = SmoothingFunction().method1  # You can choose different smoothing methods

    return sentence_bleu(reference, candidate, smoothing_function=smoothing_function)

# Example usage:
reference_sentence = ["The cat is sitting on the mat"]
candidate_sentence = "The cat is on the mat"

bleu_score = calculate_bleu(reference_sentence, candidate_sentence)
print(f"BLEU Score: {bleu_score}")

BLEU Score: 0.28764198060873264


In [94]:
from tqdm import tqdm
def bleu_metric(model, n=10000):
    bleu=0
    for i in tqdm(range(n)):
        pair = random.choice(pairs)
        output_words = greedy_decode(model, pair[0], input_lang, output_lang)
        output_sentence = ' '.join(output_words[1:-2])
        reference_sentence = [pair[1]]
        candidate_sentence = output_sentence
        bleu+= calculate_bleu(reference_sentence, candidate_sentence)
    return bleu/n

In [95]:
bleu_metric(model)

100%|██████████| 10000/10000 [06:05<00:00, 27.32it/s]


0.5238303726987809