In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
from tqdm import tqdm
import numpy as np
from datetime import datetime

## data imports
from sklearn.model_selection import train_test_split

## torch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

## transformer related imports
from transformers import AutoTokenizer
from transformers_fmt.model_blocks.transformer import Transformers

## constants
from constants import ROOT_DIR, DEVICE, LOGS_DIR
from utils.logging import logs

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
keys = torch.randint(1, 10, (2, 3, 12))
keys

tensor([[[6, 9, 5, 3, 1, 5, 5, 6, 7, 4, 7, 9],
         [3, 8, 6, 3, 9, 1, 3, 2, 7, 2, 2, 7],
         [5, 4, 8, 9, 9, 7, 1, 2, 4, 4, 8, 1]],

        [[1, 2, 1, 1, 9, 7, 1, 9, 8, 4, 4, 4],
         [3, 3, 3, 4, 4, 9, 7, 2, 8, 1, 9, 3],
         [6, 3, 8, 4, 8, 3, 4, 6, 6, 2, 1, 3]]])

In [3]:
splitted_tensor = torch.stack(keys.split(3, dim = 2), dim=0)

In [5]:
torch.concat(splitted_tensor.split(split_size=1, dim = 0), dim=3).squeeze()

tensor([[[6, 9, 5, 3, 1, 5, 5, 6, 7, 4, 7, 9],
         [3, 8, 6, 3, 9, 1, 3, 2, 7, 2, 2, 7],
         [5, 4, 8, 9, 9, 7, 1, 2, 4, 4, 8, 1]],

        [[1, 2, 1, 1, 9, 7, 1, 9, 8, 4, 4, 4],
         [3, 3, 3, 4, 4, 9, 7, 2, 8, 1, 9, 3],
         [6, 3, 8, 4, 8, 3, 4, 6, 6, 2, 1, 3]]])

In [6]:
max_seq_len = 20
logs_dir = os.path.join(LOGS_DIR, f'transformer_{datetime.strftime(datetime.now(), "%Y-%m-%d_%H:%M:%S")}')
print(f'Logs dir for current run: {logs_dir}')
writer = SummaryWriter(log_dir=logs_dir)

Logs dir for current run: /Users/adityarustagi/Documents/self-implementations/logs/transformer_2023-08-13_16:53:47


# Model

### Internal Blocks

In [7]:
###################################### SELF IMPLEMENTATIONS ######################################

class ScaledDotProductAttention(nn.Module) :

    def __init__(self, 
                 n_heads: int = 8,
                 d_model: int = 512,
                 mask: bool = False,
                 device: str = 'cuda',
        ) -> None :

        """
        Args:
            n_heads (int): Number of heads in the multi head attention. Defualts to 8
            d_model (int, optional): Dimension of the input. Defaults to 512.
            mask (bool, optional): Whether to apply masking. Defaults to False
        """

        super(ScaledDotProductAttention, self).__init__()
        self.mask = mask
        self.d_k = int(d_model/n_heads)
        self.device = device

    def forward(self,
                key : torch.Tensor,
                query : torch.Tensor,
                value : torch.Tensor,
                encoder_mask : torch.Tensor = None
        ) -> torch.Tensor :

        """
        Calculate scaler dot product of key, query and values as described in https://arxiv.org/pdf/1706.03762.pdf

        Args:
            key (torch.Tensor): Key tensor. Shape = (n_heads, batch_size, seq_len, d_model/n_heads)
            query (torch.Tensor): Query tensor. Shape = (n_heads, batch_size, seq_len, d_model/n_heads)
            value (torch.Tensor): Value tensor. Shape = (n_heads, batch_size, seq_len, d_model/n_heads)
            encoder_mask (torch.Tensor): Mask applied to attention to make softmax scores of [PAD] token 0. Defaults to None

        Returns:
            value_with_attention: Value with attention applied. Shape = (n_heads, batch_size, seq_len, d_model/n_heads)
        """

        # assert key.size() == query.size() == value.size(), "Key, query and value must have same shape"

        batch_size, seq_len = key.size(1), key.size(2)
        attention_scores = torch.matmul(query, key.transpose(2, 3))/torch.sqrt(torch.tensor(self.d_k))

        if encoder_mask is not None:
            attention_scores = torch.masked_fill(
                                    attention_scores, 
                                    encoder_mask.unsqueeze(1) == False,
                                    float('-inf')
                                )
        
        if self.mask :
            seq_len_enc = attention_scores.size(2)
            seq_len_dec = attention_scores.size(3)
            attention_scores = torch.masked_fill(
                                    attention_scores, 
                                    torch.tril(torch.ones(seq_len_enc, seq_len_dec)).to(self.device) == False, 
                                    float('-inf')
                                )

        attention_scores = torch.softmax(attention_scores, dim = 3)
            
        value_with_attention = torch.matmul(attention_scores, value)

        return value_with_attention, attention_scores

In [8]:
sdpa = ScaledDotProductAttention(
    d_model = 10,
    mask=False,
    device = 'cpu'
)

In [9]:
key = torch.rand((2, 3, 4, 5))
query = torch.rand((2, 3, 4, 5))
value = torch.rand((2, 3, 4, 5))

In [10]:
encoder_mask = torch.tensor(
    [
        [1, 1, 0, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 1]
    ]
)# encoder_mask = torch.randint(0, 2, (2, 3, 4, 4))


In [40]:
encoder_mask.type()

'torch.LongTensor'

In [12]:
value_with_attention, attention_scores=sdpa(key, query, value, encoder_mask)

In [13]:
attention_scores

tensor([[[[0.3770, 0.6230, 0.0000, 0.0000],
          [0.3160, 0.6840, 0.0000, 0.0000],
          [0.3861, 0.6139, 0.0000, 0.0000],
          [0.5343, 0.4657, 0.0000, 0.0000]],

         [[0.2710, 0.4216, 0.3073, 0.0000],
          [0.3710, 0.3755, 0.2536, 0.0000],
          [0.2850, 0.4102, 0.3048, 0.0000],
          [0.2552, 0.4704, 0.2744, 0.0000]],

         [[0.2327, 0.1993, 0.0953, 0.4727],
          [0.3091, 0.2113, 0.1598, 0.3198],
          [0.2925, 0.1383, 0.1051, 0.4641],
          [0.2077, 0.1533, 0.1367, 0.5023]]],


        [[[0.6433, 0.3567, 0.0000, 0.0000],
          [0.5522, 0.4478, 0.0000, 0.0000],
          [0.5094, 0.4906, 0.0000, 0.0000],
          [0.4833, 0.5167, 0.0000, 0.0000]],

         [[0.2996, 0.2577, 0.4427, 0.0000],
          [0.3451, 0.3384, 0.3165, 0.0000],
          [0.3068, 0.3937, 0.2995, 0.0000],
          [0.3383, 0.4003, 0.2615, 0.0000]],

         [[0.2144, 0.2596, 0.3001, 0.2259],
          [0.2686, 0.2147, 0.2485, 0.2682],
          [0.2888, 0

In [14]:
###################################### MULTI HEADED ATTENTION ######################################

class MultiHeadAttention(nn.Module) :
    
    def __init__(self, 
                 n_head: int = 8, 
                 d_model: int = 512, 
                 dropout: float = 0.1, 
                 mask: bool = False,
                 self_attention: bool = True,
                 device: str = 'cuda',
        ) :

        """
        Args:
            n_head (int): Number of heads. Defaults to 8.
            d_model (int): Dimension of input. Defaults to 512.
            dropout (float): Dropout rate. Defaults to 0.1.
            mask (bool): Whether to mask the attention. Defaults to False.
            self_attention (bool): Whether to use self attention. Defaults to True.
        """

        super(MultiHeadAttention, self).__init__()
        
        self.n_head = n_head
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        self.self_attention = self_attention

        self.d_k = self.d_v = d_model // n_head
        self.w_qs = nn.Linear(d_model, n_head * self.d_k)
        self.w_ks = nn.Linear(d_model, n_head * self.d_k)
        self.w_vs = nn.Linear(d_model, n_head * self.d_v)

        self.attention = ScaledDotProductAttention(n_head, d_model, mask, device = device)

        self.mha_linear = nn.Linear(d_model, d_model)

        nn.init.normal_(self.w_qs.weight, mean = 0, std = np.sqrt(2.0 / (d_model + self.d_k)))
        nn.init.normal_(self.w_ks.weight, mean = 0, std = np.sqrt(2.0 / (d_model + self.d_k)))
        nn.init.normal_(self.w_vs.weight, mean = 0, std = np.sqrt(2.0 / (d_model + self.d_v)))

    def forward(self, x, encoder_mask = None, q = None) :

        """
        Implementation of multi head attention layer.

        Args:
            x (torch.Tensor): Padded input with the shaep batch_len, seq_len, d_model
            q (torch.Tensor): Query with the shape batch_size, seq_len, d_model. Defaults to None.
        
        Returns:
            torch.Tensor: Values with multiheadattention applied. Shape = (batch_size, seq_len, d_model)
        
        Raises:
            ValueError: If mode is cross attention and query passed in forward is None.
            ValueError: If mode is cross attention and shape of query is not same as input coming from encoder.
        
        References:
            https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/MultiHeadAttention.py
            https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Transformer.py
            https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/PositionalEncoding.py
        """
         
        if not self.self_attention:
            if q is None :
                raise ValueError("q is required for cross attention")
            # elif x.size() != q.size() :
            #     raise ValueError("q and X must have same size")
        else :
            q = x

        key = F.gelu(self.w_ks(x))
        query = F.gelu(self.w_qs(q))
        value = F.gelu(self.w_vs(x))

        ## keeping n_heads as major dimension
        key = torch.stack(key.split(self.d_k, dim = 2), dim=0)
        query = torch.stack(query.split(self.d_k, dim = 2), dim=0)#query.view(-1, query.size(0), query.size(1), self.d_k)
        value = torch.stack(value.split(self.d_k, dim = 2), dim=0)#value.view(-1, value.size(0), value.size(1), self.d_v)

        value, attention = self.attention(key, query, value, encoder_mask)

        value = torch.concat(value.split(split_size=1, dim = 0), dim=3).squeeze()

        value = self.dropout(F.gelu(self.mha_linear(value)))

        return value, attention

In [15]:
###################################### ADD LAYER NORMALIZATION ######################################

class AddLayerNormalization(nn.Module) :

    def __init__(self, d_model) :
        
        super().__init__()

        self.layer_norm = nn.LayerNorm([d_model])

    def forward(self, x, mha_output) :
        
        return self.layer_norm(x + mha_output)

In [16]:
###################################### POINT-WISE FEED FORWARD ######################################

class PointWiseFeedforward(nn.Module) :

    def __init__(self, 
                 d_ff: int = 2048, 
                 d_model: int = 512
    ) -> None :
        
        """
        Args:
            d_ff (int): Intermediate size of the feedforward layer.
            d_model (int):  Size of the embeddings.
        """
        
        super(PointWiseFeedforward, self).__init__()

        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x) :
        
        linear1_output = self.linear1(x)
        linear2_output = self.linear2(F.gelu(linear1_output))

        return linear2_output
        

### Single Transformer Layer

In [17]:
###################################### SINGLE ENCODER LAYER ######################################

class EncoderLayer(nn.Module) :

    def __init__(self,
                 n_heads: int = 8,
                 d_model: int = 512,
                 d_ff: int = 2048,
                 device: str = 'cuda'
        ) -> None :
        
        super(EncoderLayer, self).__init__()
        
        self.mha = MultiHeadAttention(n_heads, d_model, device = device)
        self.layer_norm = AddLayerNormalization(d_model)
        self.pff = PointWiseFeedforward(d_ff, d_model)
        self.layer_norm2 = AddLayerNormalization(d_model)

    def forward(self, x, encoder_mask) :

        mha_output, mha_attention_scores = self.mha(x, encoder_mask=encoder_mask)
        # logs(f"mha_output shape: {mha_output.shape}")
        norm_output1 = self.layer_norm(x, mha_output)
        # logs(f"norm_output1 shape: {norm_output1.shape}")

        pff_output = self.pff(norm_output1)
        # logs(f"pff_output shape: {pff_output.shape}")
        norm_output2 = self.layer_norm2(norm_output1, pff_output)
        # logs(f"norm_output2 shape: {norm_output2.shape}")

        return norm_output2, mha_attention_scores

In [18]:
###################################### SINGLE DECODER LAYER ######################################

class DecoderLayer(nn.Module) :

    def __init__(self, 
                 n_heads,
                 d_model,
                 d_ff, 
                 device
    ) -> None :

        super(DecoderLayer, self).__init__()

        self.mha = MultiHeadAttention(n_heads, d_model, mask = True, device = device)
        self.cross_mha = MultiHeadAttention(n_heads, d_model, self_attention=False, device = device)
        self.layer_norm1 = AddLayerNormalization(d_model)
        self.layer_norm2 = AddLayerNormalization(d_model)
        self.layer_norm3 = AddLayerNormalization(d_model)
        self.pff = PointWiseFeedforward(d_ff, d_model)

    def forward(self, x, encoder_out, encoder_mask) :
        ## passing encoder output to all decoder layers : to be discussed with Deepak
        decoder_query, _ = self.mha(x)
        norm_decoder_query = self.layer_norm1(x, decoder_query)

        x, _ = self.cross_mha(encoder_out, q = norm_decoder_query, encoder_mask=encoder_mask)
        norm_cross_x = self.layer_norm2(norm_decoder_query, x)

        x = self.pff(norm_cross_x)
        norm_decoder_output = self.layer_norm3(norm_cross_x, x)

        return norm_decoder_output
    


### Transformer

In [19]:
###################################### POSITION EMBEDDING ######################################

class PositionEmbedding(nn.Module) :

    def __init__(self,
        max_seq_len: int = 128, 
        d_model: int = 512,
        dropout: int = 0.1,
        device: str = 'cuda'
    ) :

        super(PositionEmbedding, self).__init__()

        self.embedding = torch.zeros(max_seq_len, d_model).to(device)
        self.dropout = nn.Dropout(dropout)
        
        for i in range(max_seq_len) :
            self.embedding[i, 0::2] = torch.sin((i/1000**(2*torch.arange(d_model)[::2]/d_model)))
            self.embedding[i, 1::2] = torch.cos((i/1000**(2*torch.arange(d_model)[1::2]/d_model)))

    def forward(self, x) :

        embedding = torch.repeat_interleave(self.embedding.unsqueeze(0), x.size(0), 0)

        return self.dropout(x + embedding[:, :x.size(1), :])

In [20]:
###################################### ENCODER ######################################

class Encoder(nn.Module) :

    def __init__(self,
                 n_layer: int = 6,
                 n_heads: int = 8,
                 d_model: int = 512,
                 d_ff: int = 2048,
                 device: str = 'cuda'
    ) :
        super(Encoder, self).__init__()
        
        self.encoder = nn.ModuleDict({
            f'encoder_layer_{i}' : 
            (
                EncoderLayer(
                    n_heads,
                    d_model,
                    d_ff,
                    device = device
                )
            ) for i in range(n_layer)
            })

    def forward(self, x, encoder_mask) :
        # logs(f'input size : {x.size()}')
        for name, layer in self.encoder.items() :
            x, attention_scores = layer(x, encoder_mask)
            # logs(f'{name} output size : {x.size()}')
        return x, attention_scores

In [21]:
###################################### DECODER ######################################

class Decoder(nn.Module) :

    def __init__(self,
                 n_layer: int = 6,
                 n_heads: int = 8,
                 d_model: int = 512,
                 d_ff: int = 2048,
                 device: str = 'cuda'
    ) -> None :

        super(Decoder, self).__init__()

        self.decoder = nn.ModuleDict({
            f'decoder_layer_{i}' :
            (
                DecoderLayer(
                    n_heads,
                    d_model,
                    d_ff,
                    device = device
                )
            ) for i in range(n_layer)
        })


    def forward(self, x, encoder_out, encoder_mask) :

        for name, layer in self.decoder.items() :
            x = layer(x, encoder_out, encoder_mask)
        return x

In [88]:
##################################### TRANSFORMER_FMT ######################################

class Transformers(nn.Module) :

    def __init__(self,
                 n_layer,
                 n_heads,
                 d_model,
                 d_ff,
                 max_seq_len,
                 vocab_size,
                 device
        ) -> None :

        super(Transformers, self).__init__()

        vocab_size = vocab_size + 2

        self.encoder = Encoder(n_layer, n_heads, d_model, d_ff, device=device)
        self.decoder = Decoder(n_layer, n_heads, d_model, d_ff, device=device)
        self.positonal_embedding = PositionEmbedding(max_seq_len, d_model, device=device)

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.logit_layer = nn.Linear(d_model, vocab_size)

        self.max_seq_len = max_seq_len


    def encoder_pass(self, x, encoder_mask) :
        
        x = self.embedding(x)
        x = self.positonal_embedding(x)
        x = self.encoder(x, encoder_mask)

        return x
    

    def decoder_pass(self, encoder_output, input_ids, encoder_mask) :

        x = self.embedding(input_ids)
        x = self.positonal_embedding(x)
        x = self.decoder(x, encoder_output, encoder_mask)

        next_token_logits = F.relu(self.logit_layer(x))
        next_token_logits = next_token_logits.reshape(-1, next_token_logits.size(2))

        return F.log_softmax(next_token_logits, dim=1)

    def forward(self, encoder_inp, decoder_inp, encoder_mask) :

        enc_output, attention_scores = self.encoder_pass(encoder_inp, encoder_mask)
        output = self.decoder_pass(enc_output, decoder_inp, encoder_mask)

        return attention_scores, output

# Dataset

### Prepare Tokenizer

In [89]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenizer.add_special_tokens({
    'bos_token' : '[BOS]',
    'eos_token' : '[EOS]'
})

2

In [90]:
def get_data(file_location, chunksize=1000, n_chunks = 100) :

    data = []


    for i, items in enumerate(pd.read_csv(file_location, chunksize=chunksize)) :

        data.append(items)
        if i == n_chunks - 1 :
            break

    data = pd.concat(data)
    data.index = range(len(data))

    return data

In [91]:
data = get_data(file_location=os.path.join(ROOT_DIR, 'data/en_fr_100K.csv'), n_chunks = 2)

In [92]:
def remove_long_sentence(max_seq_len, data):
    """
    Remove sentences that are longer than max_seq_len
    """
    data['en_sentence_length'] = data['en'].apply(lambda x : len(x.split()) if type(x) == str else max_seq_len + 1)
    data['fr_sentence_length'] = data['fr'].apply(lambda x : len(x.split()) if type(x) == str else max_seq_len + 1)

    data = data.drop(
        data[(data['en_sentence_length'] > max_seq_len) | (data['fr_sentence_length'] > max_seq_len)].index
    )

    data.index = range(len(data))
    
    return data

In [93]:
data = remove_long_sentence(max_seq_len, data)

In [94]:
data

Unnamed: 0,en,fr,en_sentence_length,fr_sentence_length
0,Site map,Plan du site,2,3
1,Feedback,Rétroaction,1,1
2,Credits,Crédits,1,1
3,Français,English,1,1
4,What is light ?,Qu’est-ce que la lumière?,4,4
...,...,...,...,...
988,Bubble Are we almost there?,Bulle Quand est-ce qu’on arrive?,5,5
989,Bubble I’m hungry.,Bulle J’ai faim.,3,3
990,Sound Car Image 3 Mother Stop pestering your...,Bruit Voiture Image 3 Mère Laissez votre père ...,10,9
991,We’re almost there… aren’t we?,Nous ne sommes plus très loin (en regardant so...,5,12


In [95]:
train_data, val_test_data = train_test_split(data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)

train_data.index = range(len(train_data))
val_data.index = range(len(val_data))
test_data.index = range(len(test_data))

### Pytorch Dataset

In [96]:
class Data(Dataset) :

    def __init__(self, data) :
        self.data = data

    def __len__(self) :
        return len(self.data)
    
    def __getitem__(self, index) -> any:
        row = self.data.loc[index]
        return {'en' : row['en'], 'fr' : row['fr']}

In [97]:
train_dataset = Data(train_data)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

val_dataset = Data(val_data)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True)

test_dataset = Data(test_data)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=True)

data_loaders = {
    'train': train_dataloader,
    'val': val_dataloader,
    'test': test_dataloader
}

# Modeling

### Model

In [98]:
model = Transformers(
    n_layer = 6,
    n_heads = 8,
    d_model = 512,
    d_ff = 2048,
    max_seq_len = 512,
    vocab_size = tokenizer.vocab_size,
    device = DEVICE
).to(DEVICE)

In [99]:
def preprocess_batch(batch, type = 'input') :

    ## Append the BOS and EOS token based on wether the batch is the encoder input, decoder input(output shifted left)
    ## or the label (output shifted right)
    if type == 'input' :
        input_token_ids = [
            torch.cat(
                (torch.tensor([tokenizer.bos_token_id]), torch.tensor(inp), torch.tensor([tokenizer.eos_token_id])),
            ) for inp in batch['input_ids']
        ]

    elif type == 'output' :
        input_token_ids = [
            torch.cat(
                (torch.tensor([tokenizer.bos_token_id]), torch.tensor(inp)),
            ) for inp in batch['input_ids']
        ]

    elif type == 'label' :
        
        input_token_ids = [
            torch.cat(
                (torch.tensor(inp), torch.tensor([tokenizer.eos_token_id])),
            ) for inp in batch['input_ids']
        ]

    ## pad the token to the maxiumum sentence length
    input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value = tokenizer.pad_token_id, )

    return input_token_ids

# def collate_fn(samples):
    
#     eng_samples = [items['en'] for items in samples]
#     fr_samples = [items['fr'] for items in samples]

#     batch = {}

#     for language, sample in {'en' : eng_samples, 'fr' : fr_samples}.items() :

#         sample = tokenizer.batch_encode_plus(sample)
#         batch[language] = preprocess_batch(sample)

#     # samples['fr'] = tokenizer.batch_encode_plus(samples['fr'])
#     return batch  

### Train / val loop

In [118]:
from tqdm import tqdm
debug = False

def train_model(model, data_loader, optimizer, criterion, device, epoch, mode = 'train') :

    EPOCH_LOSS = 0

    assert mode in ['train', 'val'], 'Mode should be either "train" or "val"'

    if mode == 'train' :
        model.train()
    elif mode == 'val' :
        model.eval()

    for i, rows in tqdm(enumerate(data_loader[mode]), total = len(data_loader[mode])) :

        # try :

        ## preprocess batch for training
        en_token_ids = tokenizer.batch_encode_plus(rows['en'], add_special_tokens = False)
        fr_token_ids = tokenizer.batch_encode_plus(rows['fr'], add_special_tokens = False)
        encoder_inp = preprocess_batch(en_token_ids, type='input').to(device)
        # print(encoder_inp.size())
        logs(f'encoder_inp size : {encoder_inp.size()}', debug)
        decoder_inp = preprocess_batch(fr_token_ids, type='output').to(device)
        # print(decoder_inp.size())
        logs(f'decoder_inp size : {decoder_inp.size()}', debug)
        label = preprocess_batch(fr_token_ids, type='label').to(device)
        # print(label.size())

        ## encoder mask
        enc_mask = encoder_inp != tokenizer.pad_token_id
        enc_mask = enc_mask.long()

        dec_mask = decoder_inp != tokenizer.pad_token_id
        dec_mask = dec_mask.long()
        # print(enc_mask.size())
        
        ## forward pass through the model
        attention_scores, output = model(encoder_inp, decoder_inp, enc_mask)
        # print(output.size())
        # print(label.size())
        ## calculate loss
        loss = criterion(output, label.reshape(-1)) * dec_mask.reshape(-1).float()
        loss = loss.sum() / dec_mask.reshape(-1).float().sum()

        ## optimize model
        if mode == 'train' :
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        ## accumulate loss
        EPOCH_LOSS += loss.item()

        writer.add_scalar(f'{mode} loss', EPOCH_LOSS/(i + 1), epoch * len(data_loader[mode]) + i)

        # except Exception as e :
        #     print(f'{epoch}_{i} | Exception : {e}')
        
        torch.cuda.empty_cache()

    return EPOCH_LOSS / len(data_loader[mode])

### Optimizer and Loss Function

In [119]:
criterion = nn.CrossEntropyLoss(reduction='none')
optimizer = optim.Adam(model.parameters(), lr = 1e-4)

### Model training

In [120]:
least_val_loss = 1000
MODEL_DIR = os.path.join(ROOT_DIR, 'model_weights')

In [121]:
start_epochs = 0
end_epochs = 100

for epochs in range(start_epochs, end_epochs) :

    train_loss = train_model(model, data_loaders, optimizer, criterion, DEVICE, epochs, mode = 'train')
    # break
    val_loss = train_model(model, data_loaders, optimizer, criterion, DEVICE, epochs, mode = 'val')
    
    if val_loss < least_val_loss :
        try :
            least_val_loss = val_loss
            torch.save(
                model.state_dict(), 
                os.path.join(MODEL_DIR, f'transformer_ep-{epochs + 1}_val-loss-{val_loss:.4f}.pt')
            )
        except Exception as e :
            print(f'{epochs} | Problem in saving model\nException : {e}')

    print(f'Epochs : {epochs + 1} | Train Loss : {train_loss:.4f} | Val Loss : {val_loss:.4f}')
    print('----------------------------------------------------')

100%|██████████| 100/100 [06:42<00:00,  4.03s/it]
100%|██████████| 13/13 [00:18<00:00,  1.41s/it]


0 | Problem in saving model
Exception : [Errno 2] No such file or directory: '/Users/adityarustagi/Documents/self-implementations/model_weights/transformer_ep-1_val-loss-8.5930.pt'
Epochs : 1 | Train Loss : 9.5370 | Val Loss : 8.5930
----------------------------------------------------


 58%|█████▊    | 58/100 [04:12<03:02,  4.35s/it]


KeyboardInterrupt: 

### Model Testing

In [None]:
def test_model(model, valid_loader, device, tokenizer, max_seq_len) :

    EPOCH_LOSS = 0

    model.eval()

    for i, rows in enumerate(valid_loader) :

        ## preprocess batch for training
        en_token_ids = tokenizer.batch_encode_plus(rows['en'], add_special_tokens = False)
        fr_token_ids = tokenizer.batch_encode_plus(rows['fr'], add_special_tokens = False)
        encoder_inp = preprocess_batch(en_token_ids, type='input').to(device)
        decoder_inp = preprocess_batch(fr_token_ids, type='output').to(device)
        label = preprocess_batch(fr_token_ids, type='label').to(device)
        

        ## encode the input
        enc_output, attention_scores = model.encoder_pass(encoder_inp)

        input_tokens = generate(model, enc_output, tokenizer, max_seq_len)

In [52]:
debug = True
def generate(model, enc_output, tokenizer, max_seq_len, device):
    
    input_ids = torch.tensor([[tokenizer.bos_token_id] for _ in range(enc_output.size(0))]).to(device)

    unfinished_sequences = torch.ones(input_ids.size(0), 1).to(device)
    eos_token_id_tensor = torch.tensor([tokenizer.eos_token_id]).to(device)
    # logs('unfinished_sequences size: {}'.format(unfinished_sequences.size()), debug)

    sentence_length = input_ids.size(1)

    while sentence_length <= max_seq_len :

        x = model.embedding(input_ids)
        x = model.positonal_embedding(x)
        x = model.decoder(x, enc_output, encoder_mask = None)
        next_token_logits = x[:, -1, :]
        
        logs(f'next_token_logits size: {next_token_logits.size()}', debug)
        # print(next_token_logits)
        next_token_logits = F.softmax(next_token_logits, dim=1)
        next_token_indices = torch.argmax(next_token_logits, dim = 1)
        print(next_token_indices)

        logs(f'next_token_indices post softmax size: {next_token_indices.size()}', debug)

        logs(f'next_token_indices * unfinished_sequences size: {(next_token_indices.unsqueeze(1) * unfinished_sequences).size()}', debug)

        logs(f'tokenizer.pad_token_id * (1 - unfinished_sequences) size: {(tokenizer.pad_token_id * (1 - unfinished_sequences)).size()}', debug)

        next_token_indices = (
            next_token_indices.unsqueeze(1) * unfinished_sequences + tokenizer.pad_token_id * (1 - unfinished_sequences)
        )
        unfinished_sequences = unfinished_sequences.mul(
            next_token_indices.tile(
                eos_token_id_tensor.shape[0]
            ).ne(eos_token_id_tensor).prod(dim = 0)
        )

        logs(unfinished_sequences, debug)

        if unfinished_sequences.max() == 0 :
            break

        # print(input_ids.size())
        # print(next_token_indices.size())

        input_ids = torch.cat(
            (
                input_ids, 
                next_token_indices
            ), 
            dim = 1).long()

        sentence_length += 1
    
        print(input_ids.size(1))
        print('-'*30)

    return input_ids


In [2]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenizer.add_special_tokens({
    'bos_token' : '[BOS]',
    'eos_token' : '[EOS]'
})

2

In [3]:
tokenizer.add_bos_token = True

In [6]:
test_sentence = ['my name is kong', 'this is aditya rustagi', 'i like to eat ice cream']
test_sentence_tokens = tokenizer.batch_encode_plus(test_sentence, padding=True, add_special_tokens = False)
# encoder_inp = preprocess_batch(test_sentence_tokens, type='input').to(DEVICE)

In [7]:
tokenizer.batch_decode(test_sentence_tokens['input_ids'])

['my name is kong [PAD] [PAD] [PAD] [PAD]',
 'this is aditya rustagi',
 'i like to eat ice cream [PAD] [PAD]']

In [69]:
test_sentence_tokens['token_type_ids']

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [70]:
test_sentence_tokens['attention_mask']

[[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]

In [None]:
enc_output, attention_scores = model.encoder_pass(encoder_inp, encoder_mask = None)
output_ids = generate(model, enc_output, tokenizer, max_seq_len, DEVICE)

In [55]:
tokenizer.batch_decode(output_ids)

['[BOS] Į Ι ő Ι Έ ι ǹ É ɨ œ " * ω ǫ [unused2] ª ş £ " *',
 "[BOS] > l λ ũ l λ Į Ι Į Ι â [unused88] [unused58] ǫ '. ņ đ ƒ Φ",
 '[BOS] Į Ι [unused83] Μ [unused83] Ι µ\'¥\'¥\'¥\'¥\'Ð " * ω']

In [67]:
logits = torch.tensor([[[0.1, 0.3, 0.6],  # Sample 1, class probabilities
                       [0.8, 0.1, 0.1],
                       [0.2, 0.5, 0.3]],
                      
                      [[0.5, 0.2, 0.3],  # Sample 2, class probabilities
                       [0.2, 0.6, 0.2],
                       [0.3, 0.4, 0.3]]])

In [68]:
logits.size()

torch.Size([2, 3, 3])

In [86]:
a = torch.rand((2, 5, 10))
mask = torch.randint(0, 2, (2, 5))
b = torch.randint(0, 10, (10, 1))
print(mask)
print(mask.reshape(-1))
print(criterion(a.reshape(-1, a.size(2)), b.squeeze()) * mask.reshape(-1))

tensor([[0, 1, 0, 0, 0],
        [1, 1, 1, 1, 0]])
tensor([0, 1, 0, 0, 0, 1, 1, 1, 1, 0])
tensor([0.0000, 2.0416, 0.0000, 0.0000, 0.0000, 2.2514, 2.5425, 2.0280, 2.6332,
        0.0000])


torch.Size([2, 1])