# Introduction

Find small single text files for **Language Modeling** experiements here ⬇️

https://www.kaggle.com/datasets/devicharith/language-translation-englishfrench

This is an end-to-end runnable notebook that clones the repository and starts the training. Ideal for uploading on cloud machines and start training. This notebooks uses a much smaller verision of the original Transformer architecture.

In [1]:
!git clone https://github.com/sovit-123/attention_is_all_you_need.git
%cd attention_is_all_you_need
!git checkout pre_norm
!pip install .

Cloning into 'attention_is_all_you_need'...
remote: Enumerating objects: 334, done.[K
remote: Counting objects: 100% (52/52), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 334 (delta 24), reused 37 (delta 15), pack-reused 282[K
Receiving objects: 100% (334/334), 7.32 MiB | 12.82 MiB/s, done.
Resolving deltas: 100% (199/199), done.
/kaggle/working/attention_is_all_you_need
Branch 'pre_norm' set up to track remote branch 'pre_norm' from 'origin'.
Switched to a new branch 'pre_norm'
Processing /kaggle/working/attention_is_all_you_need
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: attention
  Building wheel for attention (setup.py) ... [?25l- done
[?25h  Created wheel for attention: filename=attention-1.0-py3-none-any.whl size=7505 sha256=c8b6b094450e7dc6942501caa0918e33c0484e6d16165ce6404fecabaca042fb
  Stored in directory: /tmp/pip-ephem-wheel-cache-w3u_zocf/wheels/21/f5/d1/4f7cca7147429101613

In [2]:
!pip install -U install portalocker
!pip install torchtext
!pip install scikit-learn
!pip install pandas
!pip install spacy

Collecting install
  Downloading install-1.3.5-py3-none-any.whl (3.2 kB)
Collecting portalocker
  Obtaining dependency information for portalocker from https://files.pythonhosted.org/packages/17/9e/87671efcca80ba6203811540ed1f9c0462c1609d2281d7b7f53cef05da3d/portalocker-2.8.2-py3-none-any.whl.metadata
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker, install
Successfully installed install-1.3.5 portalocker-2.8.2


In [3]:
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [4]:
%cd examples

/kaggle/working/attention_is_all_you_need/examples


In [5]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from timeit import default_timer as timer
from attention import transformer
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd



In [6]:
# Set seed.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

In [7]:
SRC_LANGUAGE = 'fr'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}

In [8]:
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='fr_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')

In [9]:
csv = pd.read_csv(
    'data/english_french/eng_-french.csv', 
    usecols=['English words/sentences', 'French words/sentences']
)
csv.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [10]:
train_csv, test_csv = train_test_split(csv, test_size=0.1)

In [11]:
print(len(train_csv))
print(len(test_csv))

158058
17563


In [12]:
train_csv.head()

Unnamed: 0,English words/sentences,French words/sentences
158383,They kept him waiting outside for a long time.,Ils le firent poireauter dehors.
146722,How much money did you spend on your car?,Combien d'argent avez-vous dépensé pour votre ...
120085,I heard it from a reliable source.,Je l'ai entendu d'une source fiable.
152460,My parents met each other in the mountains.,Mes parents se sont rencontrés dans les montag...
63136,My teacher drove me home.,Mon professeur m'a reconduit chez moi.


In [13]:
test_csv.head()

Unnamed: 0,English words/sentences,French words/sentences
2785,Take a seat.,Prends place !
29880,I wish Tom was here.,J'aimerais que Tom soit là.
53776,How did the audition go?,Comment s'est passée l'audition ?
154386,I've no friend to talk to about my problems.,Je n'ai pas d'ami avec lequel je puisse m'entr...
149823,I really like this skirt. Can I try it on?,"J'aime beaucoup cette jupe, puis-je l'essayer ?"


In [14]:
print(train_csv['French words/sentences'].iloc[0])
print(train_csv['English words/sentences'].iloc[0])

Ils le firent poireauter dehors.
They kept him waiting outside for a long time.


In [15]:
class TranslationDataset(Dataset):
    def __init__(self, csv):
        self.csv = csv
        
    def __len__(self):
        return len(self.csv)
    
    def __getitem__(self, idx):
        return(
            self.csv['French words/sentences'].iloc[idx],
            self.csv['English words/sentences'].iloc[idx]
        )

In [16]:
train_dataset = TranslationDataset(train_csv)
valid_dataset = TranslationDataset(test_csv)

In [17]:
iterator = iter(train_dataset)
print(next(iterator))

('Ils le firent poireauter dehors.', 'They kept him waiting outside for a long time.')


In [18]:
# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(
        yield_tokens(train_dataset, ln),
        min_freq=1,
        specials=special_symbols,
        special_first=True,
    )

# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found.
# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    vocab_transform[ln].set_default_index(UNK_IDX)

In [19]:
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 256
NHEAD = 2
FFN_HID_DIM = 512
BATCH_SIZE = 512
MAX_LEN = 256
NUM_ENCODER_LAYERS = 2
DEVICE = 'cuda'
NUM_EPOCHS = 200
DROPOUT = 0.1
# DEVICE = 'cpu'

In [20]:
# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX, batch_first=True)
    return src_batch, tgt_batch

In [21]:
model = transformer.Transformer(
    embed_dim=EMB_SIZE,
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    seq_len=MAX_LEN,
    num_layers=NUM_ENCODER_LAYERS,
    n_heads=NHEAD,
    device=DEVICE,
    dropout=DROPOUT
).to(DEVICE)

# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")
print(model)

17,176,605 total parameters.
17,176,605 training parameters.
Transformer(
  (encoder): TransformerEncoder(
    (embedding): Embedding(
      (embed): Embedding(25319, 256)
    )
    (positional_encoding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0-1): 2 x TransformerBlock(
        (attention): MultiHeadAttention(
          (q): Linear(in_features=128, out_features=128, bias=True)
          (k): Linear(in_features=128, out_features=128, bias=True)
          (v): Linear(in_features=128, out_features=128, bias=True)
          (out): Linear(in_features=256, out_features=256, bias=True)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ffn): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
       

In [22]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [23]:
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
def train_epoch(model, optimizer):
    model.train()
    losses = 0

#     for src, tgt in tqdm(train_dataloader, total=len(list(train_dataloader))):
    for src, tgt in train_dataloader:
        # print(" ".join(vocab_transform[SRC_LANGUAGE].lookup_tokens(list(src[0].cpu().numpy()))).replace("<bos>", "").replace("<eos>", ""))
        # print(" ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt[0].cpu().numpy()))).replace("<bos>", "").replace("<eos>", ""))
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)
        
        tgt_input = tgt[:, :-1]

        logits = model(src, tgt_input)

        optimizer.zero_grad()

        tgt_out = tgt[:, 1:]
        loss = loss_fn(logits.view(-1, TGT_VOCAB_SIZE), tgt_out.contiguous().view(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


val_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
def evaluate(model):
    model.eval()
    losses = 0

#     for src, tgt in tqdm(val_dataloader, total=len(list(val_dataloader))):
    for src, tgt in val_dataloader:
        # print(" ".join(vocab_transform[SRC_LANGUAGE].lookup_tokens(list(src[0].cpu().numpy()))).replace("<bos>", "").replace("<eos>", ""))
        # print(" ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt[0].cpu().numpy()))).replace("<bos>", "").replace("<eos>", ""))
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)
        
        tgt_input = tgt[:, :-1]
        
        logits = model(src, tgt_input)

        tgt_out = tgt[:, 1:]
        loss = loss_fn(logits.view(-1, TGT_VOCAB_SIZE), tgt_out.contiguous().view(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

In [24]:
for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(model, optimizer)
    end_time = timer()
    val_loss = evaluate(model)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

Epoch: 1, Train loss: 5.221, Val loss: 4.123, Epoch time = 95.262s
Epoch: 2, Train loss: 3.869, Val loss: 3.527, Epoch time = 94.060s
Epoch: 3, Train loss: 3.440, Val loss: 3.187, Epoch time = 94.182s
Epoch: 4, Train loss: 3.155, Val loss: 2.937, Epoch time = 94.173s
Epoch: 5, Train loss: 2.934, Val loss: 2.742, Epoch time = 94.708s
Epoch: 6, Train loss: 2.753, Val loss: 2.581, Epoch time = 94.172s
Epoch: 7, Train loss: 2.600, Val loss: 2.439, Epoch time = 94.447s
Epoch: 8, Train loss: 2.469, Val loss: 2.320, Epoch time = 94.226s
Epoch: 9, Train loss: 2.354, Val loss: 2.224, Epoch time = 94.056s
Epoch: 10, Train loss: 2.256, Val loss: 2.136, Epoch time = 94.714s
Epoch: 11, Train loss: 2.167, Val loss: 2.063, Epoch time = 94.657s
Epoch: 12, Train loss: 2.091, Val loss: 2.004, Epoch time = 94.165s
Epoch: 13, Train loss: 2.021, Val loss: 1.946, Epoch time = 94.138s
Epoch: 14, Train loss: 1.959, Val loss: 1.893, Epoch time = 94.118s
Epoch: 15, Train loss: 1.902, Val loss: 1.848, Epoch time

In [25]:
import os
os.makedirs('outputs/translation_custom_dataloader', exist_ok=True)
torch.save(model, 'outputs/translation_custom_dataloader/model.pth')

## Inference

In [26]:
import torch

from attention.transformer import TransformerDecoder, TransformerEncoder

In [27]:
model = torch.load('outputs/translation_custom_dataloader/model.pth')

In [28]:
def make_tgt_mask(tgt, pad_token_id=1):
    """
    :param tgt: Target sequence.
    Returns:
        tgt_mask: Target mask.
    """
    batch_size = tgt.shape[0]
    device = tgt.device

    # Same as src_mask but we additionally want to mask tokens from looking forward into the future tokens
    # Note: wherever the mask value is true we want to attend to that token, otherwise we mask (ignore) it.
    sequence_length = tgt.shape[1]  # trg_token_ids shape = (B, T) where T max trg token-sequence length
    trg_padding_mask = (tgt != pad_token_id).view(batch_size, 1, 1, -1)  # shape = (B, 1, 1, T)
    trg_no_look_forward_mask = torch.triu(torch.ones((1, 1, sequence_length, sequence_length), device=device) == 1).transpose(2, 3)

    # logic AND operation (both padding mask and no-look-forward must be true to attend to a certain target token)
    tgt_mask = trg_padding_mask & trg_no_look_forward_mask  # final shape = (B, 1, T, T)
    return tgt_mask
    
def make_src_mask(src, pad_token_id=1):
    """
    :param src: Source sequence.

    Returns:
        src_mask: Source mask.
    """
    batch_size = src.shape[0]

    # src_mask shape = (B, 1, 1, S) check out attention function in transformer_model.py where masks are applied
    # src_mask only masks pad tokens as we want to ignore their representations (no information in there...)
    src_mask = (src != pad_token_id).view(batch_size, 1, 1, -1)
    return src_mask


In [29]:
decoder = TransformerDecoder(
            TGT_VOCAB_SIZE,
            EMB_SIZE,
            MAX_LEN,
            NUM_ENCODER_LAYERS,
            expansion_factor=4,
            n_heads=NHEAD
        ).to(DEVICE).eval()

In [30]:
decoder.load_state_dict(model.decoder.state_dict())

<All keys matched successfully>

In [31]:
encoder = TransformerEncoder(
            MAX_LEN,
            SRC_VOCAB_SIZE,
            EMB_SIZE,
            NUM_ENCODER_LAYERS,
            expansion_factor=4,
            n_heads=NHEAD
        ).to(DEVICE).eval()

In [32]:
model.eval()

Transformer(
  (encoder): TransformerEncoder(
    (embedding): Embedding(
      (embed): Embedding(25319, 256)
    )
    (positional_encoding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0-1): 2 x TransformerBlock(
        (attention): MultiHeadAttention(
          (q): Linear(in_features=128, out_features=128, bias=True)
          (k): Linear(in_features=128, out_features=128, bias=True)
          (v): Linear(in_features=128, out_features=128, bias=True)
          (out): Linear(in_features=256, out_features=256, bias=True)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ffn): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
        )
        (dropout1): Dropout(p=0.1, inplace=False)
        

In [33]:
def decode(src, tgt):
    """
    :param src: Encoder input
    :param tgt: Decoder input

    Returns:
        out_labels: Final prediction sequence
    """
    tgt_mask = make_tgt_mask(tgt).to(DEVICE)
    src_mask = make_src_mask(src).to(DEVICE)
    enc_out = encoder(src)
    out_labels = []
    batch_size, seq_len = src.shape[0], src.shape[1]
    out = tgt
    with torch.no_grad():
        for i in range(seq_len):
            if i != 0:
                tgt = torch.tensor(out_labels, dtype=torch.long).unsqueeze(0).to(DEVICE)
                # print(tgt)
                out = decoder(torch.tensor(tgt).to(DEVICE), enc_out, src_mask, tgt_mask)
            else:
                out = decoder(out, enc_out, src_mask, tgt_mask)
            out = out.reshape(-1, out.shape[-1])
            num_of_trg_tokens = len(tgt[0])
            out = out[num_of_trg_tokens-1::num_of_trg_tokens]
            out = torch.argmax(out, dim=-1)
            out_labels.append(out.item())
            out = torch.unsqueeze(out, 0)
        return out_labels

In [34]:
# Full-stops are important for the model to perform well.
src_sentence = "Bonjour, comment vas-tu?"
start_symbol = BOS_IDX
src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
num_tokens = src.shape[0]
src = src.to(DEVICE)
ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
out = decode(torch.ravel(src).unsqueeze(0), ys)
print(" ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(out))).replace("<bos>", "").replace("<eos>", ""))

Do you have to come .  . 


  out = decoder(torch.tensor(tgt).to(DEVICE), enc_out, src_mask, tgt_mask)
