<a href="https://colab.research.google.com/github/rupeshsah2038/misc/blob/main/transformer_nmt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Transformer-based NMT, part 1 #
# Based on: https://pytorch.org/tutorials/beginner/transformer_tutorial.html #
# Based on: https://github.com/bentrevett/pytorch-seq2seq/blob/master/6%20-%20Attention%20is%20All%20You%20Need.ipynb #
# Based on: https://github.com/aladdinpersson/Machine-Learning-Collection/blob/master/ML/Pytorch/more_advanced/seq2seq_transformer/seq2seq_transformer.py #
# This notebook uses nn.Transformer directly. #
# Next, let's break it down into nn.TransformerEncoder and nn.TransformerDecoder. #

In [None]:
# imports
import torch
import torch.nn as nn
import torch.optim as optim
# import torch.nn.functional as F

import spacy
import math
import random

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import BucketIterator, Field
from tqdm import tqdm

# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Tokenization
def tokenize_data(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list.

    Punctuation is not separated from the word it is attached to.
    """
    tokens = text.split()
    return tokens

# Fields
# Source and target text go through the same preprocessing, so both fields
# are built from one shared set of options.
_field_options = dict(
    tokenize=tokenize_data,   # whitespace tokenizer defined above
    use_vocab=True,
    lower=True,               # lowercase every token
    init_token='<sos>',       # prepended to each sequence
    eos_token='<eos>',        # appended to each sequence
    unk_token='<unk>',
)
src_f = Field(**_field_options)
trg_f = Field(**_field_options)

In [None]:
# Data
# '.de' files are parsed with src_f and '.en' files with trg_f.
train, dev, test = Multi30k.splits(
    fields=(src_f, trg_f),
    exts=('.de', '.en'),
)

# Quick sanity check on the split sizes and on one tokenized example.
print(f"Number of train examples: {len(train.examples)}")
print(f"Number of valid examples: {len(dev.examples)}")
print(f"Number of test examples: {len(test.examples)}")
print(vars(train.examples[0]))

# device: use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

Number of train examples: 29000
Number of valid examples: 1014
Number of test examples: 1000
{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche.'], 'trg': ['two', 'young,', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes.']}
cuda


In [None]:
# Vocabulary and Batching
# Both vocabularies are built from the training split only; min_freq=2 is the
# frequency threshold passed to build_vocab.
for vocab_field in (src_f, trg_f):
    vocab_field.build_vocab(train, min_freq=2)
print(len(src_f.vocab))
print(len(trg_f.vocab))

# One iterator per split, all sharing the same batch size and device.
batch_size = 32
train_iter, dev_iter, test_iter = BucketIterator.splits(
    (train, dev, test),
    device=device,
    batch_size=batch_size,
)

9597
7704


In [None]:
# Model
class Transformer_NMT(nn.Module):
    """Sequence-to-sequence translation model wrapping ``nn.Transformer``.

    Token embeddings and learned positional embeddings are summed (with
    dropout) for both source and target, passed through ``nn.Transformer``
    and projected onto the target vocabulary.

    Args:
        embedding_dim: size of the token/position embeddings (``d_model``).
        src_vocab_size: number of entries in the source vocabulary.
        trg_vocab_size: number of entries in the target vocabulary.
        n_heads: number of attention heads.
        n_layers: number of encoder layers and of decoder layers.
        src_pad_idx: index of the padding token in the source vocabulary.
        ff_dim: hidden size of the feed-forward sub-layers.
        max_len: number of positions in the positional embedding tables;
            longer sequences are rejected in ``forward``.
        dropout: dropout probability used on the embeddings and inside the
            transformer.
        device: stored as ``self.device`` for callers; tensors created in
            ``forward`` follow the device of the input instead.
    """

    def __init__(self, embedding_dim, src_vocab_size, trg_vocab_size, n_heads, n_layers, src_pad_idx, ff_dim, max_len, dropout, device):
        super(Transformer_NMT, self).__init__()
        self.src_tok_embedding = nn.Embedding(src_vocab_size, embedding_dim)
        self.src_pos_embedding = nn.Embedding(max_len, embedding_dim)
        self.trg_tok_embedding = nn.Embedding(trg_vocab_size, embedding_dim)
        self.trg_pos_embedding = nn.Embedding(max_len, embedding_dim)
        self.device = device

        self.transformer = nn.Transformer(
            d_model = embedding_dim,
            nhead = n_heads,
            num_encoder_layers = n_layers,
            num_decoder_layers = n_layers,
            dim_feedforward = ff_dim,
            dropout = dropout,
            )

        # output of transformer model is: [target_seq_length, batch_size, hid_dim=embedding_dim]
        self.fc_out = nn.Linear(embedding_dim, trg_vocab_size)
        # we are transforming it to get: [target_seq_length, batch_size, output_dim=trg_vocab_size]

        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx # this is to tell the model which tokens in src should be ignored (as it is a pad token)

    def make_src_mask(self, src):
        """Return a BoolTensor of shape [batch_size, src_seq_len], True at pad positions.

        The comparison result lives on the same device as ``src``.
        """
        # transpose because the key padding mask is batch-first while src is sequence-first
        return src.transpose(0, 1) == self.src_pad_idx

    def forward(self, src, trg):
        """Compute target-vocabulary logits.

        Args:
            src: LongTensor of token indices, shape [src_seq_len, batch_size].
            trg: LongTensor of token indices, shape [trg_seq_len, batch_size].

        Returns:
            Tensor of shape [trg_seq_len, batch_size, trg_vocab_size].

        Raises:
            ValueError: if either sequence is longer than the positional
                embedding table (``max_len``).
        """
        src_seq_len, src_batch = src.shape
        trg_seq_len, trg_batch = trg.shape

        # Fail early with a readable message instead of an embedding index error.
        max_len = self.src_pos_embedding.num_embeddings
        if src_seq_len > max_len or trg_seq_len > max_len:
            raise ValueError(
                f"sequence length (src={src_seq_len}, trg={trg_seq_len}) exceeds max_len={max_len}"
            )

        # Build helper tensors directly on the input's device so the module
        # keeps working after it has been moved with .to()/.cpu()/.cuda().
        device = src.device
        src_positions = (
            torch.arange(src_seq_len, device=device).unsqueeze(1).expand(src_seq_len, src_batch)
        ) # one position index per time step, repeated across the batch
        trg_positions = (
            torch.arange(trg_seq_len, device=device).unsqueeze(1).expand(trg_seq_len, trg_batch)
        )

        src_embedding = self.dropout(self.src_tok_embedding(src) + self.src_pos_embedding(src_positions))
        trg_embedding = self.dropout(self.trg_tok_embedding(trg) + self.trg_pos_embedding(trg_positions))

        src_pad_mask = self.make_src_mask(src)
        # causal mask: position t of the target may only attend to positions <= t
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_len).to(device)

        output = self.transformer(
            src = src_embedding,
            tgt = trg_embedding,
            src_key_padding_mask = src_pad_mask,
            # the encoder output has the same padding layout as src, so the
            # decoder's cross-attention must ignore those positions as well
            memory_key_padding_mask = src_pad_mask,
            tgt_mask = trg_mask,
        )
        output = self.fc_out(output)

        return output

In [None]:
# model parameters
# sizes and indices derived from the vocabularies
src_vocab_size = len(src_f.vocab)
trg_vocab_size = len(trg_f.vocab)
src_pad_idx = src_f.vocab.stoi["<pad>"]

# architecture hyper-parameters
embedding_dim = 256
n_heads = 4
n_layers = 2
ff_dim = 512
max_len = 100
dropout = 0.1

# build the network, then move its parameters to the selected device
model = Transformer_NMT(
    embedding_dim=embedding_dim,
    src_vocab_size=src_vocab_size,
    trg_vocab_size=trg_vocab_size,
    n_heads=n_heads,
    n_layers=n_layers,
    src_pad_idx=src_pad_idx,
    ff_dim=ff_dim,
    max_len=max_len,
    dropout=dropout,
    device=device,
)
model = model.to(device)
print(model)

Transformer_NMT(
  (src_tok_embedding): Embedding(9597, 256)
  (src_pos_embedding): Embedding(100, 256)
  (trg_tok_embedding): Embedding(7704, 256)
  (trg_pos_embedding): Embedding(100, 256)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): M

In [None]:
# optimizer and loss criterion
trg_pad_idx = trg_f.vocab.stoi["<pad>"]
# target positions holding the pad index are ignored by the loss
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

NameError: ignored

In [None]:
# train function
# Runs `epoch` passes over the training data, evaluating on the dev split
# after each pass. `train_losses` / `eval_losses` collect every per-batch loss
# as a plain Python float; the figures printed for an epoch are averaged over
# that epoch's batches only.
epoch = 5
train_losses = []
eval_losses = []

for i in range(epoch):
    # training
    model.train()
    epoch_train_losses = []
    for batch in tqdm(train_iter, total=len(train_iter), desc="Training Progress"):
        in_src = batch.src.to(device)
        out_trg = batch.trg.to(device)

        optimizer.zero_grad()

        # teacher forcing: feed the target without its last token and
        # predict the target shifted by one position
        output = model(in_src, out_trg[:-1, :]) # trg_len, batch_size, trg_vocab_size
        output = output.reshape(-1, output.shape[2]) # trg_len*batch_size, trg_vocab_size
        out_trg = out_trg[1:].reshape(-1)

        train_loss = criterion(output, out_trg)
        train_loss.backward()
        optimizer.step()

        # .item() stores a float, so the list holds no reference to the autograd graph
        epoch_train_losses.append(train_loss.item())

    # eval
    model.eval()
    epoch_eval_losses = []
    with torch.no_grad(): # no gradients are needed for evaluation
        for batch in tqdm(dev_iter, total=len(dev_iter), desc="Evaluation Progress"):
            in_src = batch.src.to(device)
            out_trg = batch.trg.to(device)

            output = model(in_src, out_trg[:-1, :]) # trg_len, batch_size, trg_vocab_size
            output = output.reshape(-1, output.shape[2]) # trg_len*batch_size, trg_vocab_size
            out_trg = out_trg[1:].reshape(-1)

            eval_loss = criterion(output, out_trg)
            epoch_eval_losses.append(eval_loss.item())

    train_losses.extend(epoch_train_losses)
    eval_losses.extend(epoch_eval_losses)

    # average over this epoch only, so earlier epochs do not dilute the figures
    mean_train_loss = sum(epoch_train_losses) / len(epoch_train_losses)
    mean_eval_loss = sum(epoch_eval_losses) / len(epoch_eval_losses)

    print(f'Epoch: {i+1}/{epoch}')
    print(f'Training Loss: {mean_train_loss:,.3f}\tEvaluation Loss: {mean_eval_loss:,.3f}')
    print(f'Training PPL: {math.exp(mean_train_loss):,.3f}\tEvaluation PPL: {math.exp(mean_eval_loss):,.3f}')

Training Progress: 100%|██████████| 907/907 [00:34<00:00, 26.37it/s]
Evaluation Progress: 100%|██████████| 32/32 [00:00<00:00, 83.30it/s]


Epoch: 1/5
Training Loss: 2.428	Evaluation Loss: 2.323
Training PPL: 11.336	Evaluation PPL: 10.207


Training Progress: 100%|██████████| 907/907 [00:34<00:00, 26.63it/s]
Evaluation Progress: 100%|██████████| 32/32 [00:00<00:00, 78.56it/s]


Epoch: 2/5
Training Loss: 2.379	Evaluation Loss: 2.294
Training PPL: 10.796	Evaluation PPL: 9.910


Training Progress: 100%|██████████| 907/907 [00:33<00:00, 26.95it/s]
Evaluation Progress: 100%|██████████| 32/32 [00:00<00:00, 78.46it/s]


Epoch: 3/5
Training Loss: 2.335	Evaluation Loss: 2.271
Training PPL: 10.325	Evaluation PPL: 9.690


Training Progress: 100%|██████████| 907/907 [00:33<00:00, 26.97it/s]
Evaluation Progress: 100%|██████████| 32/32 [00:00<00:00, 77.46it/s]


Epoch: 4/5
Training Loss: 2.291	Evaluation Loss: 2.252
Training PPL: 9.887	Evaluation PPL: 9.502


Training Progress: 100%|██████████| 907/907 [00:33<00:00, 26.95it/s]
Evaluation Progress: 100%|██████████| 32/32 [00:00<00:00, 75.94it/s]

Epoch: 5/5
Training Loss: 2.250	Evaluation Loss: 2.229
Training PPL: 9.488	Evaluation PPL: 9.293



