In [1]:
import torch

import torch.nn as nn

import math

import warnings

from tqdm import tqdm

from typing import Any







import torch

import torch.nn as nn

from torch.utils.data import DataLoader, Dataset, random_split

from tqdm import tqdm

from pathlib import Path

from tokenizers import Tokenizer

from tokenizers.models import WordLevel

from tokenizers.trainers import WordLevelTrainer

from tokenizers.pre_tokenizers import Whitespace

import os

from torch.utils.tensorboard import SummaryWriter

import transformers

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Shell escape: install `gdown` into the active conda environment (-y auto-confirms).
# NOTE(review): `gdown` is not imported anywhere in the visible cells — confirm it is still needed.
!conda install -y gdown

  pid, fd = os.forkpty()


Retrieving notices: ...working... done
Channels:
 - rapidsai
 - nvidia
 - nodefaults
 - conda-forge
 - defaults
 - pytorch
Platform: linux-64
Collecting package metadata (repodata.json): \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | done
Solving environment: - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - gdown


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-24.9.2               |  py310hff52083_0 

In [3]:
# Shell escape: install the Hugging Face `transformers` package with pip.
# NOTE(review): the first cell already runs `import transformers`, so on a fresh
# kernel this install executes *after* the import that needs it; consider moving
# it above the imports (and pinning a version) — confirm the intended order.
!pip install transformers




In [4]:
# Turn on log propagation for the Transformers library loggers.
# This is a logging setting only; nothing in this cell configures tokenizer parallelism.
transformers.utils.logging.enable_propagation()

**Embedding**

In [5]:
class Embedding(nn.Module):
    """Token-id to vector lookup whose output is multiplied by sqrt(d_model).

    Args:
        d_model: width of each embedding vector.
        vocab_size: number of rows in the lookup table.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        # Lookup table with one d_model-wide row per vocabulary id.
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Return the embeddings of the ids in `x`, scaled by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        vectors = self.embedding(x)
        return vectors * scale

In [6]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to the input, then applies dropout.

    The table is precomputed once for `seq_len` positions and stored as the
    non-trainable buffer `pe` of shape (1, seq_len, d_model). Even feature
    indices hold sines and odd feature indices hold cosines.

    Args:
        d_model: size of the feature (last) dimension. Odd values are supported;
            the last column then only has a sine component.
        seq_len: number of positions to precompute. Inputs whose second
            dimension exceeds this cannot be broadcast against the table.
        dropout: dropout probability applied after the addition.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(self.seq_len, self.d_model)

        # Column vector of positions so it broadcasts against the frequency row.
        positions = torch.arange(0, self.seq_len, dtype=torch.float).unsqueeze(1)
        div_term = 10000 ** (torch.arange(0, self.d_model, 2) / d_model)
        angles = positions / div_term  # (seq_len, ceil(d_model / 2))

        # Even feature indices: sine.
        pe[:, 0::2] = torch.sin(angles)
        # Odd feature indices: cosine. With an odd d_model there is one fewer odd
        # column than even columns, so trim the angles to match.
        pe[:, 1::2] = torch.cos(angles[:, : self.d_model // 2])

        # Leading axis of size 1 so the table broadcasts over the batch.
        pe = pe.unsqueeze(0)

        # A buffer moves with the module (device, state_dict) but is never trained.
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Add the encodings for the first `x.shape[1]` positions and apply dropout."""
        # Buffers do not require grad, so the sliced table is already a constant.
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)

# **MultiHead Attention**

In [7]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: feature size of the inputs and outputs; must be divisible by `h`.
        h: number of attention heads.
        dropout: dropout probability applied to the attention weights.

    After each `forward` call the attention weights of that call are kept on
    the module as `self.attention_scores`.
    """

    def __init__(self, d_model: int, h: int, dropout: float):
        super().__init__()
        self.d_model = d_model
        self.h = h

        assert d_model % h == 0, "Dimensions is not divisible by number of heads"

        # Per-head feature size.
        self.d_k = self.d_model // self.h

        # Query, key and value projections.
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)

        # Output projection applied after the heads are concatenated.
        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Args:
            query, key, value: tensors whose last dimension is the per-head size.
            mask: optional tensor broadcastable to the score matrix; positions
                where `mask == 0` are hidden. `None` disables masking.
            dropout: optional dropout module applied to the attention weights.

        Returns:
            A tuple `(output, attention_weights)`.
        """
        d_k = query.shape[-1]

        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)

        # Hidden positions get a large negative score so softmax sends them to ~0.
        if mask is not None:
            attention_scores.masked_fill_(mask == 0, -1e9)

        # Softmax and dropout must run whether or not a mask was supplied;
        # otherwise unmasked calls would multiply raw scores by `value`.
        attention_scores = attention_scores.softmax(dim=-1)
        if dropout is not None:
            attention_scores = dropout(attention_scores)

        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        """Project q/k/v, attend per head, merge the heads and apply `w_o`.

        `q`, `k` and `v` are 3-D tensors whose last dimension is `d_model`
        (the code treats dim 0 as batch and dim 1 as sequence position).
        """
        query = self.w_q(q)
        key = self.w_k(k)
        value = self.w_v(v)

        # (batch, seq, d_model) -> (batch, h, seq, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        x, self.attention_scores = MultiHeadAttention.attention(query, key, value, mask, self.dropout)

        # (batch, h, seq, d_k) -> (batch, seq, h * d_k): concatenate the heads.
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        return self.w_o(x)

**Residual Connections**

In [8]:
class ResidualConnection(nn.Module):
    """Skip connection around a sublayer: `x + dropout(norm(sublayer(x)))`.

    NOTE(review): the normalization is applied to the sublayer *output*, not to
    the input `x` — confirm this ordering is intended.

    Args:
        dropout: dropout probability applied to the normalized sublayer output.
    """

    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        """Run `sublayer` on `x`, normalize and drop out its result, then add `x` back."""
        branch = sublayer(x)
        branch = self.norm(branch)
        branch = self.dropout(branch)
        return x + branch

**Feed Forward Network**

`d_ff = 2048`, the inner feed-forward dimension used in the paper "Attention Is All You Need".

In [9]:
class LayerNormalization(nn.Module):
    """Normalizes over the last dimension with one learnable scale and one learnable shift.

    Args:
        eps: constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, eps: float = 10**-6) -> None:
        super().__init__()
        self.eps = eps
        # Single-element scale (starts at 1) and shift (starts at 0), both trainable.
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Return `alpha * (x - mean) / (std + eps) + bias`, with statistics over the last dim."""
        # keepdim=True so both statistics broadcast back against x.
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)

        centered = x - mean
        scaled = self.alpha * centered
        normalized = scaled / (std + self.eps)
        return normalized + self.bias

In [10]:


class FeedForwardBlock(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output feature size.
        d_ff: hidden feature size between the two linear layers.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)   # expands d_model -> d_ff
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)   # projects d_ff -> d_model

    def forward(self, x):
        """Apply the expansion, ReLU, dropout and projection in that order."""
        hidden = self.linear_1(x)
        hidden = torch.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

**Encoder Block**

In [11]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each inside a residual connection.

    Args:
        self_attention_block: attention module called with the same tensor as
            query, key and value.
        feed_forward_block: feed-forward module applied after attention.
        dropout: dropout probability passed to both residual connections.
    """

    def __init__(self, self_attention_block: MultiHeadAttention, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0 wraps self-attention, index 1 wraps the feed-forward block.
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        """Run both sublayers on `x`; `src_mask` is forwarded to self-attention."""
        attention_residual, feed_forward_residual = self.residual_connections

        def self_attend(hidden):
            return self.self_attention_block(hidden, hidden, hidden, src_mask)

        x = attention_residual(x, self_attend)
        return feed_forward_residual(x, self.feed_forward_block)

**Complete Encoder**

In [12]:
class Encoder(nn.Module):
    """Applies a stack of encoder layers in order, then a final layer normalization.

    Args:
        layers: the modules to apply; each is called as `layer(x, mask)`.
    """

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        """Feed `x` through every layer with the same `mask` and normalize the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

In [13]:










class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention and feed-forward, each in a residual connection.

    Args:
        self_attention_block: attention over the decoder's own input.
        cross_attention_block: attention whose keys and values are the encoder output.
        feed_forward_block: feed-forward module applied last.
        dropout: dropout probability passed to the three residual connections.
    """

    def __init__(self, self_attention_block: MultiHeadAttention, cross_attention_block: MultiHeadAttention, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0: self-attention, index 1: cross-attention, index 2: feed-forward.
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Run the three sublayers on `x`.

        `tgt_mask` is used for self-attention and `src_mask` for cross-attention.
        """
        self_residual, cross_residual, feed_forward_residual = self.residual_connections

        def self_attend(hidden):
            return self.self_attention_block(hidden, hidden, hidden, tgt_mask)

        def cross_attend(hidden):
            # Queries come from the decoder; keys and values from the encoder output.
            return self.cross_attention_block(hidden, encoder_output, encoder_output, src_mask)

        x = self_residual(x, self_attend)
        x = cross_residual(x, cross_attend)
        return feed_forward_residual(x, self.feed_forward_block)

In [14]:




class Decoder(nn.Module):
    """Applies a stack of decoder layers in order, then a final layer normalization.

    Args:
        layers: the modules to apply; each is called as
            `layer(x, encoder_output, src_mask, tgt_mask)`.
    """

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed `x` through every layer with the same encoder output and masks, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

**Linear Layer**

In [15]:
class ProjectionLayer(nn.Module):
    """Linear map from `d_model` features to `vocab_size` scores, followed by log-softmax.

    Args:
        d_model: size of the incoming feature dimension.
        vocab_size: size of the output dimension.
    """

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return log-probabilities over the last dimension."""
        logits = self.proj(x)
        return logits.log_softmax(dim=-1)

# **Transformer**

In [16]:
class Transformer(nn.Module):
    """Container wiring embeddings, positional encodings, encoder, decoder and projection.

    There is no `forward`; callers use `encode`, `decode` and `project` separately.

    Args:
        encoder: module called as `encoder(src, src_mask)`.
        decoder: module called as `decoder(tgt, encoder_output, src_mask, tgt_mask)`.
        src_embed / tgt_embed: embedding modules for source and target ids.
        src_pos / tgt_pos: positional-encoding modules for source and target.
        projection_layer: module that maps decoder output to vocabulary scores.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: Embedding, tgt_embed: Embedding, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        # Registration order determines the order of `parameters()`; keep it stable.
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed `src`, add positional encodings and run the encoder with `src_mask`."""
        embedded = self.src_embed(src)
        positioned = self.src_pos(embedded)
        return self.encoder(positioned, src_mask)

    def decode(self, encoder_output, src_mask, tgt, tgt_mask):
        """Embed `tgt`, add positional encodings and run the decoder.

        The decoder receives the encoder output plus both masks.
        """
        embedded = self.tgt_embed(tgt)
        positioned = self.tgt_pos(embedded)
        return self.decoder(positioned, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Apply the projection layer to `x`."""
        return self.projection_layer(x)

In [17]:
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int = 512, N: int = 6, h: int = 8, dropout: float = 0.1, d_ff: int = 2048) -> Transformer:
    """Assemble a `Transformer` and Xavier-initialize its weight matrices.

    Args:
        src_vocab_size / tgt_vocab_size: sizes of the source and target embedding tables.
        src_seq_len / tgt_seq_len: number of positions precomputed by each positional encoding.
        d_model: feature size used throughout the model.
        N: number of encoder blocks and of decoder blocks.
        h: number of attention heads per attention module.
        dropout: dropout probability passed to every sub-module.
        d_ff: hidden size of each feed-forward block.

    Returns:
        The assembled model; every parameter with more than one dimension has
        been re-initialized with `nn.init.xavier_uniform_`.
    """
    # Modules are created in a fixed order so seeded runs draw the same random numbers.
    src_embed = Embedding(d_model, src_vocab_size)
    tgt_embed = Embedding(d_model, tgt_vocab_size)

    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    encoder_blocks = [
        EncoderBlock(
            MultiHeadAttention(d_model, h, dropout),   # self-attention
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    decoder_blocks = [
        DecoderBlock(
            MultiHeadAttention(d_model, h, dropout),   # self-attention
            MultiHeadAttention(d_model, h, dropout),   # cross-attention
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))

    # Maps decoder output onto the target vocabulary.
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Only matrices are re-initialized; 1-D parameters (biases, norm scale/shift) are left alone.
    for weight in (p for p in transformer.parameters() if p.dim() > 1):
        nn.init.xavier_uniform_(weight)

    return transformer


In [18]:


def build_tokenizer(config, lang):
    """Load the pretrained tokenizer for `lang` and register the extra special tokens.

    Args:
        config: mapping with the keys 'lang_src' and 'lang_tgt'.
        lang: language code; must equal one of those two config values.

    Returns:
        The tokenizer with '[SOS]' and '[EOS]' added as additional special
        tokens and '[PAD]' set as the pad token.

    Raises:
        ValueError: if `lang` matches neither configured language.
    """
    if lang == config['lang_src']:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    elif lang == config['lang_tgt']:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
    else:
        raise ValueError("Unsupported language")

    # Adding special tokens
    special_tokens = {'additional_special_tokens': ['[SOS]', '[EOS]'], 'pad_token': '[PAD]'}
    tokenizer.add_special_tokens(special_tokens)

    # `tokenizer.vocab_size` is the base vocabulary only and does not change when
    # tokens are added; `len(tokenizer)` includes them. Anything sized from this
    # tokenizer (e.g. an embedding table) should use `len(tokenizer)` as well.
    print(f"Tokenizer vocab size after adding special tokens: {len(tokenizer)}")
    return tokenizer


def casual_mask(size):
    """Build a (size, size) boolean causal mask for the decoder.

    Entry [i, j] is True when position i may attend to position j (j <= i) and
    False for future positions. This matches the attention code in this file,
    which hides every position where the mask equals 0.
    """
    return torch.tril(torch.ones(size, size)).type(torch.bool)


def train_model(config, df):
    """Train the translation model on `df` for `config['num_epochs']` epochs.

    Args:
        config: mapping providing at least 'lr' and 'num_epochs', plus whatever
            `get_ds` and `get_model` read from it.
        df: data passed through to `get_ds`.

    Returns:
        None. The model is trained in place and not returned or saved here.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config, df)

    # len(tokenizer) counts added special tokens as well, so every id the
    # tokenizers can emit has a row in the embedding and projection tables.
    src_vocab_size = len(tokenizer_src)
    tgt_vocab_size = len(tokenizer_tgt)

    # The model already builds correctly sized, initialized embeddings. They are
    # not replaced after `.to(device)`, which would leave fresh CPU tensors
    # inside a model that lives on `device`.
    model = get_model(config, src_vocab_size, tgt_vocab_size).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)
    # Labels are target-language ids, so padding is identified with the target tokenizer.
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.pad_token_id, label_smoothing=0.1).to(device)

    for epoch in range(config['num_epochs']):
        model.train()
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
            optimizer.zero_grad()
            encoder_input, decoder_input = batch['encoder_input'].to(device), batch['decoder_input'].to(device)
            encoder_mask, decoder_mask = batch['encoder_mask'].to(device), batch['decoder_mask'].to(device)
            label = batch['label'].to(device)

            encoder_output = model.encode(encoder_input, encoder_mask)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
            proj_output = model.project(decoder_output)

            # Flatten to (tokens, vocab) using the projection's own output width.
            loss = loss_fn(proj_output.view(-1, proj_output.size(-1)), label.view(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            optimizer.step()


In [19]:
from transformers import AutoTokenizer

def build_tokenizer(config, lang):
    """Load the pretrained tokenizer that corresponds to `lang`.

    Args:
        config: mapping with the keys 'lang_src' and 'lang_tgt'.
        lang: language code; must equal one of those two config values.

    Returns:
        The tokenizer loaded with `AutoTokenizer.from_pretrained`.

    Raises:
        ValueError: if `lang` matches neither configured language.

    NOTE(review): this redefinition replaces the earlier `build_tokenizer` and
    does not register '[SOS]' / '[EOS]', which other cells look up with
    `convert_tokens_to_ids` — confirm that is intended.
    """
    if lang == config['lang_src']:
        checkpoint = "bert-base-uncased"  # English tokenizer
    elif lang == config['lang_tgt']:
        checkpoint = "bert-base-german-cased"  # German tokenizer
    else:
        raise ValueError("Unsupported language")
    return AutoTokenizer.from_pretrained(checkpoint)




In [20]:
# def casual_mask(size):

#         # Creating a square matrix of dimensions 'size x size' filled with ones

#         mask = torch.triu(torch.ones(1, size, size), diagonal = 1).type(torch.int)

#         return mask == 0

def casual_mask(size):
    """Return a boolean causal mask of shape (1, size, size).

    Entry [0, i, j] is True when j <= i (current and earlier positions) and
    False for future positions.
    """
    # The lower triangle (diagonal included) marks the positions that stay visible.
    visible = torch.ones(1, size, size).tril()
    return visible.type(torch.bool)


In [21]:


class BilingualDataset(Dataset):
    """Dataset of tokenized sentence pairs for the encoder/decoder model.

    Args:
        df: DataFrame with one column per language, named `src_lang` and `tgt_lang`.
        tokenizer_src / tokenizer_tgt: tokenizers exposing `encode`,
            `convert_tokens_to_ids` and `pad_token_id`.
        src_lang / tgt_lang: column names of the source and target text.
        seq_len: fixed length every sequence is truncated or padded to.
    """

    def __init__(self, df, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        self.df = df
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.seq_len = seq_len
        self.sos_token_id = tokenizer_tgt.convert_tokens_to_ids("[SOS]")
        self.eos_token_id = tokenizer_tgt.convert_tokens_to_ids("[EOS]")
        self.pad_token_id = tokenizer_tgt.pad_token_id
        # Source padding is identified with the source tokenizer's own pad id.
        self.src_pad_token_id = tokenizer_src.pad_token_id

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the tensors and raw texts for row `index`.

        Returns a dict with:
            encoder_input: (seq_len,) source ids.
            decoder_input: (seq_len,) target ids shifted right, starting with [SOS].
            encoder_mask:  (1, 1, seq_len) int mask, 1 where the source is not padding.
            decoder_mask:  (1, seq_len, seq_len) int mask, 1 where the key position
                           is not padding and is not after the query position.
            label:         (seq_len,) target ids.
            src_text / tgt_text: the raw strings.
        """
        row = self.df.iloc[index]
        src_text = row[self.src_lang]
        tgt_text = row[self.tgt_lang]

        # Tokenize source and target texts to exactly seq_len ids each.
        enc_input_tokens = self.tokenizer_src.encode(src_text, truncation=True, padding="max_length", max_length=self.seq_len)
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text, truncation=True, padding="max_length", max_length=self.seq_len)

        encoder_input = torch.tensor(enc_input_tokens, dtype=torch.long)
        # Shift right: [SOS] goes in front and the last id is dropped to keep the length.
        decoder_input = torch.tensor([self.sos_token_id] + dec_input_tokens[:-1], dtype=torch.long)
        label = torch.tensor(dec_input_tokens, dtype=torch.long)

        encoder_mask = (encoder_input != self.src_pad_token_id).unsqueeze(0).unsqueeze(0).int()

        # A padding mask alone would let each position attend to later target
        # tokens during training, so it is combined with a lower-triangular
        # causal mask.
        steps = decoder_input.size(0)
        not_padding = (decoder_input != self.pad_token_id).unsqueeze(0).unsqueeze(0)  # (1, 1, S)
        causal = torch.tril(torch.ones(1, steps, steps)).type(torch.bool)            # (1, S, S)
        decoder_mask = (not_padding & causal).int()

        return {
            'encoder_input': encoder_input,
            'decoder_input': decoder_input,
            'encoder_mask': encoder_mask,
            'decoder_mask': decoder_mask,
            'label': label,
            'src_text': src_text,
            'tgt_text': tgt_text
        }




# class BilingualDataset(Dataset):
#     def __init__(self, df, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
#         self.df = df
#         self.tokenizer_src = tokenizer_src
#         self.tokenizer_tgt = tokenizer_tgt
#         self.src_lang = src_lang
#         self.tgt_lang = tgt_lang
#         self.seq_len = seq_len
#         self.sos_token_id = self.tokenizer_tgt.convert_tokens_to_ids("[SOS]")
#         self.eos_token_id = self.tokenizer_tgt.convert_tokens_to_ids("[EOS]")
#         self.pad_token_id = self.tokenizer_tgt.pad_token_id

#     def __getitem__(self, index):
#         src_text = self.df.iloc[index][self.src_lang]
#         tgt_text = self.df.iloc[index][self.tgt_lang]

#         # Tokenize source and target texts
#         enc_input_tokens = self.tokenizer_src.encode(src_text, truncation=True, padding="max_length", max_length=self.seq_len)
#         dec_input_tokens = self.tokenizer_tgt.encode(tgt_text, truncation=True, padding="max_length", max_length=self.seq_len)

#         # Convert to tensors
#         encoder_input = torch.tensor(enc_input_tokens, dtype=torch.long)
#         decoder_input = torch.tensor([self.sos_token_id] + dec_input_tokens[:-1], dtype=torch.long)  # Add [SOS] at the beginning
#         label = torch.tensor(dec_input_tokens + [self.eos_token_id], dtype=torch.long)  # Add [EOS] at the end

#         return {
#             'encoder_input': encoder_input,
#             'decoder_input': decoder_input,
#             'encoder_mask': (encoder_input != self.pad_token_id).unsqueeze(0).unsqueeze(0).int(),
#             'decoder_mask': (decoder_input != self.pad_token_id).unsqueeze(0).unsqueeze(0).int(),
#             'label': label,
#             'src_text': src_text,
#             'tgt_text': tgt_text
#         }


In [22]:


def get_dataloaders(config, df):
    """Build the tokenizers and the train/validation data loaders.

    The first 90% of the rows of `df` (in their existing order) form the
    training set and the remainder the validation set.

    Args:
        config: mapping with 'lang_src', 'lang_tgt', 'seq_len' and 'batch_size'.
        df: DataFrame holding the sentence pairs.

    Returns:
        `(train_loader, val_loader, tokenizer_src, tokenizer_tgt)`; the
        validation loader uses a batch size of 1.
    """
    tokenizer_src = build_tokenizer(config, config['lang_src'])
    tokenizer_tgt = build_tokenizer(config, config['lang_tgt'])

    def make_dataset(frame):
        # Both splits share the tokenizers, language columns and sequence length.
        return BilingualDataset(frame, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])

    train_size = int(0.9 * len(df))
    train_ds = make_dataset(df[:train_size])
    val_ds = make_dataset(df[train_size:])

    train_loader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_loader, val_loader, tokenizer_src, tokenizer_tgt


In [23]:
def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build the Transformer for the given vocabulary sizes.

    Args:
        config: mapping with 'seq_len' (used for both source and target) and 'd_model'.
        vocab_src_len: size of the source embedding table.
        vocab_tgt_len: size of the target embedding table and output projection.

    Returns:
        The model returned by `build_transformer`.
    """
    # The embedding tables are sized from the vocabulary arguments at build time.
    # There is no `resize_token_embeddings` on this model and no tokenizer in
    # scope here, so no resizing step is performed.
    model = build_transformer(vocab_src_len, vocab_tgt_len, config['seq_len'], config['seq_len'], config['d_model'])
    return model


In [24]:


def get_config():
    """Return a fresh dict with the training, language and file-location settings."""
    training = {
        'batch_size': 6,
        'num_epochs': 20,
        'lr': 10**-4,
        'seq_len': 350,
        'd_model': 512,  # Embedding width, 512 as in "Attention Is All You Need".
    }
    languages = {
        'lang_src': 'en',
        'lang_tgt': 'de',
    }
    files = {
        'model_folder': '/kaggle/working/weights',
        'model_basename': '/kaggle/working/tmodel_',
        'preload': None,
        'tokenizer_file': 'tokenizer_{0}.json',
        'experiment_name': 'runs/tmodel',
    }
    # Merged in this order so the key order matches the flat layout.
    return {**training, **languages, **files}





# Function to construct the path for saving and retrieving model weights

def get_weights_file_path(config, epoch: str):
    """Build the path of the weights file for `epoch` inside the model folder.

    Args:
        config: mapping with 'model_folder' and 'model_basename'.
        epoch: suffix inserted between the base name and the '.pt' extension.

    Returns:
        The path as a string: `<model_folder>/<basename><epoch>.pt`.
    """
    model_folder = config['model_folder']
    model_filename = Path(f"{config['model_basename']}{epoch}.pt")

    # Joining an absolute path onto another path discards everything before it,
    # so an absolute base name would bypass `model_folder` entirely. Keep only
    # its final component in that case; relative base names are joined as given.
    if model_filename.is_absolute():
        model_filename = Path(model_filename.name)

    return str(Path('.') / model_folder / model_filename)


def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Generate target ids one at a time, always taking the highest-scoring token.

    Generation starts from the '[SOS]' id and stops once the sequence holds
    `max_len` ids or the '[EOS]' id has just been produced.

    Args:
        model: object exposing `encode`, `decode` and `project`.
        source / source_mask: encoder input and its mask for a single example.
        tokenizer_src: unused here.
        tokenizer_tgt: used to look up the '[SOS]' and '[EOS]' ids.
        max_len: maximum number of ids in the returned sequence.
        device: device on which the generated sequence and masks are placed.

    Returns:
        1-D tensor of generated ids, including the leading '[SOS]'.
    """
    sos_idx = tokenizer_tgt.convert_tokens_to_ids('[SOS]')
    eos_idx = tokenizer_tgt.convert_tokens_to_ids('[EOS]')

    # The source is encoded once and reused at every step.
    encoder_output = model.encode(source, source_mask)
    generated = torch.full((1, 1), sos_idx, dtype=torch.long).to(device)

    while generated.size(1) < max_len:
        step_mask = casual_mask(generated.size(1)).to(device)
        decoded = model.decode(encoder_output, source_mask, generated, step_mask)

        # Only the last position is scored to pick the next token.
        scores = model.project(decoded[:, -1])
        next_word = torch.max(scores, dim=1)[1]
        generated = torch.cat([generated, next_word.unsqueeze(0)], dim=1)

        if next_word.item() == eos_idx:
            break

    return generated.squeeze(0)


def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_state, writer, num_examples=2):
    """Decode a few validation examples greedily and print source, target and prediction.

    Args:
        model: model to evaluate; it is switched to eval mode and left there.
        validation_ds: iterable of batches with 'encoder_input', 'encoder_mask',
            'src_text' and 'tgt_text'; every batch must have batch size 1.
        tokenizer_src / tokenizer_tgt: passed to `greedy_decode`; the target
            tokenizer also decodes the predicted ids to text.
        max_len: maximum generated length, passed to `greedy_decode`.
        device: device the batch tensors are moved to.
        print_msg: callable that receives each output line.
        global_state, writer: accepted but not used in this function.
        num_examples: stop after this many batches have been processed.
    """
    model.eval()
    console_width = 80  # Width of the separator line.

    with torch.no_grad():  # No gradients are needed for evaluation.
        for count, batch in enumerate(validation_ds, start=1):
            encoder_input = batch['encoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)

            assert encoder_input.size(0) == 1, 'Batch size must be 1 for validation.'

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch['src_text'][0]
            target_text = batch['tgt_text'][0]  # Reference translation.
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            report = (
                '-'*console_width,
                f'SOURCE: {source_text}',
                f'TARGET: {target_text}',
                f'PREDICTED: {model_out_text}',
            )
            for line in report:
                print_msg(line)

            # Stop once the requested number of examples has been shown.
            if count == num_examples:
                break

import pandas as pd



file_path = '/kaggle/input/less-train/less-train.csv'

# Read the CSV, keep at most the first 100000 rows, then drop every row that
# has a missing value in the 'en' or 'de' column.
df = (
    pd.read_csv(file_path)
    .iloc[:100000, :]
    .dropna(subset=['en', 'de'])
)


In [25]:
def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build the Transformer for the given vocabulary sizes.

    `config['seq_len']` is used for both the source and the target sequence
    length, and `config['d_model']` for the model width.
    """
    seq_len = config['seq_len']
    return build_transformer(vocab_src_len, vocab_tgt_len, seq_len, seq_len, config['d_model'])

In [26]:

def get_ds(config, df):
    """Return `(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)`.

    Thin wrapper that delegates to `get_dataloaders` with the same arguments.
    """
    train_loader, val_loader, src_tokenizer, tgt_tokenizer = get_dataloaders(config, df)
    return (train_loader, val_loader, src_tokenizer, tgt_tokenizer)

# Training function that now accepts `df` for training data
def train_model(config, df):
    """Train the translation model on ``df`` and save a checkpoint after every epoch.

    Args:
        config: Settings mapping. This function reads ``'model_folder'``,
            ``'experiment_name'``, ``'lr'``, ``'preload'``, ``'num_epochs'`` and
            ``'seq_len'``; the mapping is also forwarded to ``get_ds``,
            ``get_model`` and ``get_weights_file_path``.
        df: Training data, forwarded unchanged to ``get_ds``.

    Returns:
        None. Progress is reported through the tqdm bar, the 'train loss'
        TensorBoard scalar and one checkpoint file per epoch.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device {device}")

    # Creating model directory to store weights
    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)

    # Retrieving dataloaders and tokenizers for source and target languages
    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config, df)

    # Building the model and moving it to the selected device
    model = get_model(config, tokenizer_src.vocab_size, tokenizer_tgt.vocab_size).to(device)

    # Setting up the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    # Initializing epoch and global step variables
    initial_epoch = 0
    global_step = 0

    # Checking if there is a pre-trained model to load
    if config['preload']:
        model_filename = get_weights_file_path(config, config['preload'])
        print(f'Preloading model {model_filename}')
        # map_location lets a checkpoint saved on a GPU be resumed on a
        # CPU-only machine (and vice versa) instead of failing to deserialize.
        state = torch.load(model_filename, map_location=device)

        # Restore saved states; resume from the epoch after the saved one
        initial_epoch = state['epoch'] + 1
        model.load_state_dict(state['model_state_dict'])
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']

    # The loss is computed on target-side labels, so the padding id to ignore
    # is looked up in the target tokenizer (the logits are also sized by
    # tokenizer_tgt.vocab_size below).
    # NOTE(review): assumes the dataset pads `label` with the target
    # tokenizer's '[PAD]' id -- confirm against the dataset class.
    pad_id = tokenizer_tgt.convert_tokens_to_ids('[PAD]')
    loss_fn = nn.CrossEntropyLoss(ignore_index=pad_id, label_smoothing=0.1).to(device)

    # Tensorboard setup; the writer is closed in `finally` so the event file is
    # flushed and released even if training is interrupted.
    writer = SummaryWriter(config['experiment_name'])
    try:
        for epoch in range(initial_epoch, config['num_epochs']):
            # run_validation is called at the end of every epoch, so training
            # mode is re-enabled once here rather than on every batch.
            model.train()
            batch_iterator = tqdm(train_dataloader, desc=f'Processing epoch {epoch:02d}')
            for batch in batch_iterator:
                # Move data to the training device
                encoder_input = batch['encoder_input'].to(device)
                decoder_input = batch['decoder_input'].to(device)
                encoder_mask = batch['encoder_mask'].to(device)
                decoder_mask = batch['decoder_mask'].to(device)
                label = batch['label'].to(device)

                # Forward pass
                encoder_output = model.encode(encoder_input, encoder_mask)
                decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
                proj_output = model.project(decoder_output)

                # Flatten logits and labels so every position is one sample
                loss = loss_fn(proj_output.view(-1, tokenizer_tgt.vocab_size), label.view(-1))

                # Read the scalar once; it is used for both the bar and the log
                loss_value = loss.item()
                batch_iterator.set_postfix({"loss": f"{loss_value:6.3f}"})
                writer.add_scalar('train loss', loss_value, global_step)
                writer.flush()

                # Backpropagation and parameter update
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            # Run validation at the end of each epoch; messages go through the
            # progress bar's write() so they do not corrupt the bar output.
            run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device,
                           batch_iterator.write, global_step, writer)

            # Save model checkpoint
            model_filename = get_weights_file_path(config, f'{epoch:02d}')
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step
            }, model_filename)
            print(f"Checkpoint saved at {model_filename}")
    finally:
        writer.close()


In [27]:
# Silence all warnings from this point on
warnings.filterwarnings(action='ignore')

# Fetch the configuration used by the training utilities
config = get_config()

# Training is currently disabled; uncomment the next line to start it
# train_model(config, df)

In [28]:
# writer = SummaryWriter(config['experiment_name'])
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# # model_filename = get_weights_file_path(config, config['preload'])
# state = torch.load("tmodel_00.pt")
# global_step = state['global_step']
# train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config, df)
# model = get_model(config, tokenizer_src.vocab_size, tokenizer_tgt.vocab_size).to(device)
# print()
# run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device,
#                        lambda msg: batch_iterator.write(msg), global_step, writer)

In [29]:
# # Tokenizer function modified for DataFrame usage

# def build_tokenizer(config, df, lang):

#     tokenizer_path = Path(config['tokenizer_file'].format(lang))



#     if not Path.exists(tokenizer_path):

#         tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))

#         tokenizer.pre_tokenizer = Whitespace()



#         # Training the tokenizer on sentences from the DataFrame

#         trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)

#         tokenizer.train_from_iterator(df[lang].tolist(), trainer=trainer)

#         tokenizer.save(str(tokenizer_path))

#     else:

#         tokenizer = Tokenizer.from_file(str(tokenizer_path))

#     return tokenizer





# def get_ds(config, df):

#     # Using the DataFrame `df` instead of loading from Hugging Face

#     tokenizer_src = build_tokenizer(config, df, config['lang_src'])

#     tokenizer_tgt = build_tokenizer(config, df, config['lang_tgt'])



#     # Splitting the dataset for training and validation

#     train_ds_size = int(0.9 * len(df))

#     train_df = df[:train_ds_size]

#     val_df = df[train_ds_size:]



#     # Creating dataset instances for training and validation

#     train_ds = BilingualDataset(train_df, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])

#     val_ds = BilingualDataset(val_df, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])



#     # Dataloaders

#     train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)

#     val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)



#     return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt





# class BilingualDataset(Dataset):

#     def __init__(self, df, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len) -> None:

#         super().__init__()

#         self.seq_len = seq_len

#         self.df = df

#         self.tokenizer_src = tokenizer_src

#         self.tokenizer_tgt = tokenizer_tgt

#         self.src_lang = src_lang

#         self.tgt_lang = tgt_lang

#         self.sos_token = torch.tensor([tokenizer_tgt.convert_tokens_to_ids("[SOS]")], dtype=torch.int64)

#         self.eos_token = torch.tensor([tokenizer_tgt.convert_tokens_to_ids("[EOS]")], dtype=torch.int64)

#         self.pad_token = torch.tensor([tokenizer_tgt.convert_tokens_to_ids("[PAD]")], dtype=torch.int64)



#     def __len__(self):

#         return len(self.df)



#     def __getitem__(self, index: Any) -> Any:

#         src_text = self.df.iloc[index][self.src_lang]

#         tgt_text = self.df.iloc[index][self.tgt_lang]



#         # Tokenizing

#         enc_input_tokens = self.tokenizer_src.encode(src_text).ids

#         dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids



#         enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2

#         dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1



#         if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:

#             raise ValueError('Sentence is too long')



#         encoder_input = torch.cat(

#             [self.sos_token, torch.tensor(enc_input_tokens, dtype=torch.int64), self.eos_token,

#              torch.tensor([self.pad_token] * enc_num_padding_tokens, dtype=torch.int64)]

#         )

#         decoder_input = torch.cat(

#             [self.sos_token, torch.tensor(dec_input_tokens, dtype=torch.int64),

#              torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64)]

#         )

#         label = torch.cat(

#             [torch.tensor(dec_input_tokens, dtype=torch.int64), self.eos_token,

#              torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64)]

#         )



#         assert encoder_input.size(0) == self.seq_len

#         assert decoder_input.size(0) == self.seq_len

#         assert label.size(0) == self.seq_len



#         return {

#             'encoder_input': encoder_input,

#             'decoder_input': decoder_input,

#             'encoder_mask': (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),

#             'decoder_mask': (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int() & casual_mask(decoder_input.size(0)),

#             'label': label,

#             'src_text': src_text,

#             'tgt_text': tgt_text

#         }



# # Now pass `df` as an argument to `get_ds` in the `train_model` function:

# train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config, df)



# # Iterating through dataset to extract the original sentence and its translation

# def get_all_sentences(ds, lang):

#     for pair in ds:

#         yield pair['translation'][lang]





















# def casual_mask(size):

#         # Creating a square matrix of dimensions 'size x size' filled with ones

#         mask = torch.triu(torch.ones(1, size, size), diagonal = 1).type(torch.int)

#         return mask == 0



























# def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):

#     # Retrieving the indices from the start and end of sequences of the target tokens

#     sos_idx = tokenizer_tgt.convert_tokens_to_ids('[SOS]')

#     eos_idx = tokenizer_tgt.convert_tokens_to_ids('[EOS]')



#     # Computing the output of the encoder for the source sequence

#     encoder_output = model.encode(source, source_mask)

#     # Initializing the decoder input with the Start of Sentence token

#     decoder_input = torch.empty(1,1).fill_(sos_idx).type_as(source).to(device)



#     # Looping until the 'max_len', maximum length, is reached

#     while True:

#         if decoder_input.size(1) == max_len:

#             break



#         # Building a mask for the decoder input

#         decoder_mask = casual_mask(decoder_input.size(1)).type_as(source_mask).to(device)



#         # Calculating the output of the decoder

#         out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)



#         # Applying the projection layer to get the probabilities for the next token

#         prob = model.project(out[:, -1])



#         # Selecting token with the highest probability

#         _, next_word = torch.max(prob, dim=1)

#         decoder_input = torch.cat([decoder_input, torch.empty(1,1). type_as(source).fill_(next_word.item()).to(device)], dim=1)



#         # If the next token is an End of Sentence token, we finish the loop

#         if next_word == eos_idx:

#             break



#     return decoder_input.squeeze(0) # Sequence of tokens generated by the decoder













# def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_state, writer, num_examples=2):

#     model.eval() # Setting model to evaluation mode

#     count = 0 # Initializing counter to keep track of how many examples have been processed



#     console_width = 80 # Fixed width for printed messages



#     # Creating evaluation loop

#     with torch.no_grad(): # Ensuring that no gradients are computed during this process

#         for batch in validation_ds:

#             count += 1

#             encoder_input = batch['encoder_input'].to(device)

#             encoder_mask = batch['encoder_mask'].to(device)



#             # Ensuring that the batch_size of the validation set is 1

#             assert encoder_input.size(0) ==  1, 'Batch size must be 1 for validation.'



#             # Applying the 'greedy_decode' function to get the model's output for the source text of the input batch

#             model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)



#             # Retrieving source and target texts from the batch

#             source_text = batch['src_text'][0]

#             target_text = batch['tgt_text'][0] # True translation

#             model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy()) # Decoded, human-readable model output



#             # Printing results

#             print_msg('-'*console_width)

#             print_msg(f'SOURCE: {source_text}')

#             print_msg(f'TARGET: {target_text}')

#             print_msg(f'PREDICTED: {model_out_text}')



#             # After two examples, we break the loop

#             if count == num_examples:

#                 break









# # We pass as parameters the config dictionary and the vocabulary sizes of the source and target languages

# def get_model(config, vocab_src_len, vocab_tgt_len):



#     # Loading model using the 'build_transformer' function.

#     # We will use the lengths of the source language and target language vocabularies, the 'seq_len', and the dimensionality of the embeddings

#     model = build_transformer(vocab_src_len, vocab_tgt_len, config['seq_len'], config['seq_len'], config['d_model'])

#     return model









# def get_config():

#     return{

#         'batch_size': 8,

#         'num_epochs': 20,

#         'lr': 10**-4,

#         'seq_len': 350,

#         'd_model': 512, # Dimensions of the embeddings in the Transformer. 512 like in the "Attention Is All You Need" paper.

#         'lang_src': 'en',

#         'lang_tgt': 'it',

#         'model_folder': 'weights',

#         'model_basename': 'tmodel_',

#         'preload': None,

#         'tokenizer_file': 'tokenizer_{0}.json',

#         'experiment_name': 'runs/tmodel'

#     }





# # Function to construct the path for saving and retrieving model weights

# def get_weights_file_path(config, epoch: str):

#     model_folder = config['model_folder'] # Extracting model folder from the config

#     model_basename = config['model_basename'] # Extracting the base name for model files

#     model_filename = f"{model_basename}{epoch}.pt" # Building filename

#     return str(Path('.')/ model_folder/ model_filename) # Combining current directory, the model folder, and the model filename













# def train_model(config):

#     # Setting up device to run on GPU to train faster

#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#     print(f"Using device {device}")



#     # Creating model directory to store weights

#     Path(config['model_folder']).mkdir(parents=True, exist_ok=True)



#     # Retrieving dataloaders and tokenizers for source and target languages using the 'get_ds' function

#     train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)



#     # Initializing model on the GPU using the 'get_model' function

#     model = get_model(config,tokenizer_src.vocab_size, tokenizer_tgt.vocab_size).to(device)



#     # Tensorboard

#     writer = SummaryWriter(config['experiment_name'])



#     # Setting up the Adam optimizer with the specified learning rate from the

#     # 'config' dictionary plus an epsilon value

#     optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps = 1e-9)



#     # Initializing epoch and global step variables

#     initial_epoch = 0

#     global_step = 0



#     # Checking if there is a pre-trained model to load

#     # If true, loads it

#     if config['preload']:

#         model_filename = get_weights_file_path(config, config['preload'])

#         print(f'Preloading model {model_filename}')

#         state = torch.load(model_filename) # Loading model



#         # Sets epoch to the saved in the state plus one, to resume from where it stopped

#         initial_epoch = state['epoch'] + 1

#         # Loading the optimizer state from the saved model

#         optimizer.load_state_dict(state['optimizer_state_dict'])

#         # Loading the global step state from the saved model

#         global_step = state['global_step']



#     # Initializing CrossEntropyLoss function for training

#     # We ignore padding tokens when computing loss, as they are not relevant for the learning process

#     # We also apply label_smoothing to prevent overfitting

#     loss_fn = nn.CrossEntropyLoss(ignore_index = tokenizer_src.convert_tokens_to_ids('[PAD]'), label_smoothing = 0.1).to(device)



#     # Initializing training loop



#     # Iterating over each epoch from the 'initial_epoch' variable up to

#     # the number of epochs informed in the config

#     for epoch in range(initial_epoch, config['num_epochs']):



#         # Initializing an iterator over the training dataloader

#         # We also use tqdm to display a progress bar

#         batch_iterator = tqdm(train_dataloader, desc = f'Processing epoch {epoch:02d}')



#         # For each batch...

#         for batch in batch_iterator:

#             model.train() # Train the model



#             # Loading input data and masks onto the GPU

#             encoder_input = batch['encoder_input'].to(device)

#             decoder_input = batch['decoder_input'].to(device)

#             encoder_mask = batch['encoder_mask'].to(device)

#             decoder_mask = batch['decoder_mask'].to(device)



#             # Running tensors through the Transformer

#             encoder_output = model.encode(encoder_input, encoder_mask)

#             decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)

#             proj_output = model.project(decoder_output)



#             # Loading the target labels onto the GPU

#             label = batch['label'].to(device)



#             # Computing loss between model's output and true labels

#             loss = loss_fn(proj_output.view(-1, tokenizer_tgt.vocab_size), label.view(-1))



#             # Updating progress bar

#             batch_iterator.set_postfix({f"loss": f"{loss.item():6.3f}"})



#             writer.add_scalar('train loss', loss.item(), global_step)

#             writer.flush()



#             # Performing backpropagation

#             loss.backward()



#             # Updating parameters based on the gradients

#             optimizer.step()



#             # Clearing the gradients to prepare for the next batch

#             optimizer.zero_grad()



#             global_step += 1 # Updating global step count



#         # We run the 'run_validation' function at the end of each epoch

#         # to evaluate model performance

#         run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg), global_step, writer)



#         # Saving model

#         model_filename = get_weights_file_path(config, f'{epoch:02d}')

#         # Writing current model state to the 'model_filename'

#         torch.save({

#             'epoch': epoch, # Current epoch

#             'model_state_dict': model.state_dict(),# Current model state

#             'optimizer_state_dict': optimizer.state_dict(), # Current optimizer state

#             'global_step': global_step # Current global step

#         }, model_filename)

In [30]:


# if __name__ == '__main__':

#     warnings.filterwarnings('ignore') # Filtering warnings

#     config = get_config() # Retrieving config settings

#     train_model(config)