In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text \
    import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence \
    import pad_sequences
import numpy as np
import random



In [2]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/Colab Notebooks')

Mounted at /content/drive


In [3]:
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
EPOCHS = 20
BATCH_SIZE = 128
MAX_WORDS = 10000
READ_LINES = 60000
LAYER_SIZE = 256
EMBEDDING_WIDTH = 128
TEST_PERCENT = 0.2
SAMPLE_SIZE = 20
OOV_WORD = 'UNK'
PAD_INDEX = 0
OOV_INDEX = 1
START_INDEX = MAX_WORDS - 2
STOP_INDEX = MAX_WORDS - 1
MAX_LENGTH = 60
SRC_DEST_FILE_NAME = 'fra.txt'


In [4]:
# Function to read file.
def read_file_combined(file_name, max_len):
    file = open(file_name, 'r', encoding='utf-8')
    src_word_sequences = []
    dest_word_sequences = []
    for i, line in enumerate(file):
        if i == READ_LINES:
            break
        pair = line.split('\t')
        word_sequence = text_to_word_sequence(pair[1])
        src_word_sequence = word_sequence[0:max_len]
        src_word_sequences.append(src_word_sequence)
        word_sequence = text_to_word_sequence(pair[0])
        dest_word_sequence = word_sequence[0:max_len]
        dest_word_sequences.append(dest_word_sequence)
    file.close()
    return src_word_sequences, dest_word_sequences


In [5]:
# Functions to tokenize and un-tokenize sequences.
def tokenize(sequences):
    # "MAX_WORDS-2" used to reserve two indices
    # for START and STOP.
    tokenizer = Tokenizer(num_words=MAX_WORDS-2,
                          oov_token=OOV_WORD)
    tokenizer.fit_on_texts(sequences)
    token_sequences = tokenizer.texts_to_sequences(sequences)
    return tokenizer, token_sequences

def tokens_to_words(tokenizer, seq):
    word_seq = []
    for index in seq:
        if index == PAD_INDEX:
            word_seq.append('PAD')
        elif index == OOV_INDEX:
            word_seq.append(OOV_WORD)
        elif index == START_INDEX:
            word_seq.append('START')
        elif index == STOP_INDEX:
            word_seq.append('STOP')
        else:
            word_seq.append(tokenizer.sequences_to_texts(
                [[index]])[0])
    print(word_seq)


In [6]:
# Read file and tokenize.
src_seq, dest_seq = read_file_combined(SRC_DEST_FILE_NAME,
                                       MAX_LENGTH)
src_tokenizer, src_token_seq = tokenize(src_seq)
dest_tokenizer, dest_token_seq = tokenize(dest_seq)


In [7]:
# Prepare training data.
dest_target_token_seq = [x + [STOP_INDEX] for x in dest_token_seq]
dest_input_token_seq = [[START_INDEX] + x for x in
                        dest_target_token_seq]
src_input_data = pad_sequences(src_token_seq)
dest_input_data = pad_sequences(dest_input_token_seq,
                                padding='post')
dest_target_data = pad_sequences(
    dest_target_token_seq, padding='post', maxlen
    = len(dest_input_data[0]))

# Convert to same precision as model.
src_input_data = src_input_data.astype(np.int64)
dest_input_data = dest_input_data.astype(np.int64)
dest_target_data = dest_target_data.astype(np.int64)


In [8]:
# Split into training and test set.
rows = len(src_input_data[:,0])
all_indices = list(range(rows))
test_rows = int(rows * TEST_PERCENT)
test_indices = random.sample(all_indices, test_rows)
train_indices = [x for x in all_indices if x not in test_indices]

train_src_input_data = src_input_data[train_indices]
train_dest_input_data = dest_input_data[train_indices]
train_dest_target_data = dest_target_data[train_indices]

test_src_input_data = src_input_data[test_indices]
test_dest_input_data = dest_input_data[test_indices]
test_dest_target_data = dest_target_data[test_indices]

# Create a sample of the test set that we will inspect in detail.
test_indices = list(range(test_rows))
sample_indices = random.sample(test_indices, SAMPLE_SIZE)
sample_input_data = test_src_input_data[sample_indices]
sample_dest_input_data = test_dest_input_data[sample_indices]
sample_dest_target_data = test_dest_target_data[sample_indices]

# Create Dataset objects.
trainset = TensorDataset(torch.from_numpy(train_src_input_data),
                         torch.from_numpy(train_dest_input_data),
                         torch.from_numpy(train_dest_target_data))
testset = TensorDataset(torch.from_numpy(test_src_input_data),
                         torch.from_numpy(test_dest_input_data),
                         torch.from_numpy(test_dest_target_data))


In [9]:
# Embedding Layer
class InputEmbedding(nn.Module):

    def __init__(self,d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self,input_text):
        return self.embedding(input_text) * math.sqrt(self.d_model)

In [None]:
embedding = InputEmbedding(d_model, src_vocab_size)
sample_input_embedding = embedding(torch.tensor(sample_input_data))
sample_input_embedding.shape

torch.Size([20, 14, 256])

In [10]:
# Postitional Encoding Layer
class Pos_Enc(nn.Module):

    def __init__(self, p):
        super().__init__()
        self.dropout = nn.Dropout(p = 0.1)


    def get_angles(self,len_seq, d_model):

        """
        Input
        x  : input samples with shape (#samples, len_seq, len_emb = d_model)

        """
        # Initialize the parameters
        angles = torch.zeros((len_seq, d_model // 2))

        for pos in range(len_seq):
            for i in range(d_model//2):
                angles[pos,i] = pos/(10000**(2*i/d_model))
        return(angles)

    def add_pos_enc(self,x):
        """
        This function will calculate the positional encodings for a given input x
        Input
        x   : input sequences with shape (#samples, len_seq, len_emb)

        Output
        pos_encoding (tensor): denoting the position of words in the sequence; shape = (1, len_seq, len_emb)

        """

        # Initialize variables
        len_seq = x.shape[1]
        d_model = x.shape[2]
        pos_encoding = torch.zeros((1,len_seq, d_model))
        # Calculate the angles
        angles = self.get_angles(len_seq,d_model)
        # we would need a tensor of 1, len_seq, d_model) first for loop is len_seq
        for pos in range(len_seq):
            for i in range(angles.shape[1]):
                pos_encoding[:,pos, 0::2] = torch.sin(angles[pos,:])
                pos_encoding[:,pos, 1::2] = torch.cos(angles[pos,:])

        # register pos_encoding as a buffer in the modul
        self.register_buffer('pe', pos_encoding)
        pos_encoding.requires_grad = False # not trainable

        # Add the positional Encodings to the input
        x = x + pos_encoding

        # Apply Dropout and return
        return(self.dropout(x))


#### Example: Positional Encoding

In [11]:
d_model = 16  # Size of the model
pos_enc = Pos_Enc(0.1)

    # Create a sample input tensor
    # Let's say we have 2 samples with a sequence length of 5
sample_input = torch.zeros((2, 5,d_model))  # Shape: (#samples, len_seq)
    # Get the angles
angles = pos_enc.get_angles(5,d_model)

    # Print the angles
print("Computed angles:", angles.shape) # these angles are for 5 positions
position_enc = pos_enc.add_pos_enc(sample_input)
print("Computed positional encodings:", position_enc.shape) # these angles are for 5 positions

Computed angles: torch.Size([5, 8])
Computed positional encodings: torch.Size([2, 5, 16])


### Feed Forward Layer

this will be a feed forward of two layers. it takes the input with the same dims as input, then runs through a d_model number of parameters. then runs it again through the len_emb neurons to return the same shape of output.

In [12]:
class FeedForward(nn.Module):

    def __init__(self, d_model,d_ff,p):
        super().__init__()
        self.d_model = d_model
        self.layer1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(p=0.1)
        self.layer2 = nn.Linear(d_ff,d_model)

    def forward(self, x):

        """
        This class defines the feed forward layer of the transformer
        - A dropout layer will be applied between the two neural layers
        """

        return self.layer2(self.dropout(self.layer1(x))) # shape: (#samples, len_seq, d_model)


#### Example FeedForward Layer

In [None]:
# Example parameters
d_model = 512  # Dimensionality of the input (hidden state size)
d_ff = 2048    # Dimensionality of the feedforward layer
p = 0.1        # Dropout probability

# Create an instance of the FeedForward layer
feed_forward_layer = FeedForward(d_model, d_ff, p)

# Example input: shape (batch_size, len_seq, d_model)
batch_size = 3
len_seq = 4
input_tensor = torch.randn(batch_size, len_seq, d_model)

# Pass the input through the feedforward layer
output_tensor = feed_forward_layer(input_tensor)
output_tensor.shape

torch.Size([3, 4, 512])

### Define the maskes

#### Padding mask

In [13]:
def create_padding_mask(matrix, num_heads):
    """
    Creates a matrix mask for the padding cells

    Arguments:
        matrix -- (n, m) tensor
        num_heads -- int, number of attention heads

    Returns:
        mask -- (n, num_heads, 1, m) binary tensor
    """
    # Check if each row is all zeros (padding rows)
    zero_rows = (matrix == 0).float()  # (n, m)

    # Expand dimensions to match the shape needed for multi-head attention
    padded_matrix_1 = zero_rows.unsqueeze(1)  # (n, 1, m)

    # Repeat the mask across the number of attention heads
    padded_matrix_1 = padded_matrix_1.repeat(1, num_heads, 1)  # (n, num_heads, m)

    # Add a dimension for the attention dimension (as per multi-head attention)
    final_mask = padded_matrix_1.unsqueeze(2)  # (n, num_heads, 1, m)

    return final_mask


In [14]:
padding_mask_trial = create_padding_mask(torch.tensor(sample_input_data),2)
padding_mask_trial.shape

torch.Size([20, 2, 1, 14])

#### Look-ahead mask

In [15]:
def create_look_ahead_mask(dim):
    """
    Creates a look-ahead mask for the decoder in transformer models.
    At each iteration of the decoder making predictions, this function masks the proceeding words
    to prevent the decoder from "seeing" future tokens.

    Arguments:
        dim -- int, the length of the sequence (or the dimensionality of the input)

    Returns:
        mask -- (1, 1, dim, dim) tensor, where the upper triangular part is 0 and the lower part is 1
    """
    # Create a mask that keeps the main diagonal and all sub-diagonals, sets all super-diagonals to zero.
    mask = torch.tril(torch.ones(dim, dim))  # Lower triangular matrix of ones

    # Add an extra dimension to match the required shape (1, 1, dim, dim)
    mask = mask.unsqueeze(0).unsqueeze(0)  # (1, 1, dim, dim)

    return mask


In [16]:
sample_dest_input_data[0]

array([9998,   10,   25,  166, 9999,    0,    0,    0,    0])

In [17]:
create_look_ahead_mask(14)

tensor([[[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])

For the cross attention, say we have a sequence of 3 words and th rest is padded to reach the length 14. so (1,14) mapped to embeddings to get the shape (1,14,20) with len_emb = 20. Assuming the output encoder has shape (1,9,20), then after the first MHA, when we are combining the input of the decoder and the output of the encoder, we need a different mask. The attention parameters will be of size (1,14,9). This attention matrix shows how much attention should each of the 14 entries in the input decoder pay to each of the 9 indexes of the output encoder. So, we need a padding mask such that the attention weights for padded indexes of input decoder return to zero. So the rows of attention matrix corresponding to the padded indexes in the input decoder must be returned to zero.

In [19]:
# Initialize the cross_mha_mask tensor with zeros (same shape as the original code)
def create_cross_attention_mask(dec_pad_mask,len_enc_output):
    num_samples, num_heads, _, len_dec_output = dec_pad_mask.shape
    cross_mha_mask = torch.zeros((num_samples, num_heads, len_dec_output, len_enc_output))

    # Iterate through the third dimension (length of the decoder input
    for i in range(cross_mha_mask.shape[2]):
    # Repeat the dec_pad_mask across the appropriate dimension (axis=2 in the original code)
        cross_mha_mask[:, :, i, :] = dec_pad_mask[:, :, :, i].repeat(1, 1, len_enc_output)
    return(cross_mha_mask)


In [None]:
create_cross_attention_mask(dec_padding_mask,output_encoder.shape[1])[0][0]

### Multi-Head Attention


In [20]:
class MultiHeadAttn(nn.Module):

    def __init__(self,num_heads, d_model, p):
        super().__init__()

        self.num_heads = num_heads
        self.head_params = d_model//num_heads

        self.q_linear = nn.Linear(d_model, d_model)# mapping to the same number of features
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.w_o      = nn.Linear(d_model, d_model)

        self.dropout  = nn.Dropout(p = 0.1)

        # Check if the number of len_emb (d_model) is divisable by num_heads
        assert d_model%num_heads ==0, "d_model is not divisable by the number of heads"


    def forward(self, q, k, v, masking=None):
        """
        Inputs
        q     : the tensor to be mapped to the Query
        k     : the tensor to be mapped to the Key
        v     : the tensor to be mapped to the Value
        """
        batch_size, len_seq, _ = q.shape

        # Define Query, Key, Value
        Query = self.q_linear(q)
        Key  = self.k_linear(k)
        Value = self.v_linear(v)

        # Reshape Query, Key, and Value
        Query_reshaped = Query.view(batch_size, len_seq, num_heads, self.head_params).transpose(1,2) #shape = (#samples, num_heads, len_seq, head_params)
        Key_reshaped = Key.view(Key.shape[0], Key.shape[1], num_heads, self.head_params).transpose(1,2)
        Value_reshaped = Value.view(Value.shape[0], Value.shape[1], num_heads, self.head_params).transpose(1,2) # shape = //

        # Dot-Product Query and Key
        dotqk = torch.matmul(Query_reshaped, Key_reshaped.transpose(-1,-2))/(self.head_params**0.5) # shape = (#samples, num_heads, len_seq, len_seq)
        # Apply the mask if givne
        if masking is not None:
            dotqk += ((masking)* (-1e9))

        # Apply dropout
        dotqk = self.dropout(dotqk)

        # Apply the Softmax to the attention weights
        attention_scores = torch.softmax(dotqk, dim = -1)

        # Multiply to the value matrix
        result = torch.matmul(attention_scores, Value_reshaped) # shape = (#samples, num_heads, len_seq, head_params)
        # Concatenate the heads
        result_reshaped = result.view(batch_size, len_seq, self.num_heads*self.head_params) #shape = (#samples, len_seq, d_model)

        # Apply through the last linear layer
        result_final = self.w_o(result_reshaped) #shape = (#samples, len_seq, d_model)

        return result_final

#### Example Multi-head attention

In [None]:
# Parameters
batch_size = 64
seq_length = 10
d_model = 256  # Embedding size
num_heads = 8
dropout_prob = 0.1

# Instantiate the MultiHeadAttn class
multi_head_attention = MultiHeadAttn(num_heads, d_model, dropout_prob)

# Create random input tensors for queries, keys, and values
q = torch.rand(batch_size, seq_length, d_model)  # Shape: (64, 10, 256) #(#samples, len_seq, #len_emb)
k = torch.rand(batch_size, seq_length, d_model)  # Shape: (64, 10, 256)
v = torch.rand(batch_size, seq_length, d_model)  # Shape: (64, 10, 256)

# Optional masking (for example, padding mask)
masking = create_padding_mask(torch.tensor(sample_input_data),num_heads)
# Forward pass
dotqk = multi_head_attention(sample_input_embedding, sample_input_embedding, sample_input_embedding, masking)

# Check the output shape
print(output.shape)  # Should print: torch.Size([64, 10, 256])

torch.Size([20, 14, 256])


In [None]:
dotqk[1][0][0]

tensor([ 0.1190,  0.1190,  0.1190,  0.1190,  0.1190,  0.1190,  0.1190,  0.1190,
        -0.0147,  0.0795,  0.0811,  0.7231, -0.5528,  0.2886],
       grad_fn=<SelectBackward0>)

In [None]:
masking[1][0][0] * -1e9

tensor([-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
        -1.0000e+09, -1.0000e+09, -1.0000e+09, -0.0000e+00, -0.0000e+00,
        -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00])

In [None]:
torch.softmax(dotqk[1][0][0] + masking[1][0][0] * -1e9, dim = -1)

tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1383,
        0.1520, 0.1522, 0.2893, 0.0808, 0.1873], grad_fn=<SoftmaxBackward0>)

But if we use a dropout, then the attention scores might be changed:

In [None]:
print(masking[1][0])
print(attention_scores[1][0][0])
print(sample_input_data[1])

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.]])
tensor([0.0000, 0.1302, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1363,
        0.1329, 0.1186, 0.1464, 0.1107, 0.2250], grad_fn=<SelectBackward0>)
[   0    0    0    0    0    0    0    0 8070    9 1728    7 1099  494]


### Encoder Layer

In [21]:
class Encoder(nn.Module):
    def __init__(self, heads, d_model, num_iter,p, vocab_size, d_ff):
        super().__init__()
        self.heads = heads
        self.d_model = d_model
        self.num_iter = num_iter


        self.embedding = InputEmbedding(d_model, vocab_size) # will be mapped to d_model with vocab_size number of words
        self.pos_enc   = Pos_Enc(p)
        self.mha       = MultiHeadAttn(num_heads, d_model, p)
        self.ffn       = FeedForward(d_model,d_ff,p)

        self.dropout1  = nn.Dropout(p)
        self.dropout2  = nn.Dropout(p)

        self.norm1     = nn.LayerNorm(d_model)
        self.norm2     = nn.LayerNorm(d_model)

    def forward(self, x, pad_mask):
        """
        Enocder layer of the transformer. This layer will receive the editted text, runs it through the embedding layer, positional encoding
        then the encoder layer for num_iter times

        Input
        x : samples with size (batch_size, len_seq)

        """
        # Run throguh the embedding layer
        x = self.embedding(x)
        # Run through the positional encoding layer
        x = self.pos_enc.add_pos_enc(x)
        # Run through the encoder
        for _ in range(self.num_iter):

            # Run through a mha and dropout
            attention_res = self.dropout1(self.mha(x,x,x, pad_mask))
            # Add and Normalize
            normalize_attn_scores = self.norm1(attention_res + x)
            # Run through the feed forward NN and dropout
            res_ffn = self.dropout2(self.ffn(normalize_attn_scores))
            # Add and Normalize
            x = self.norm2(res_ffn + normalize_attn_scores)

        return(x)


#### Example Encoder

In [22]:
# Example parameters
seq_length = 14
vocab_size = MAX_WORDS
d_model = 60
num_heads =10
d_ff = 60
num_iter = 6
dropout_rate = 0.1
pad_mask = None  # You can specify a padding mask if needed

# Instantiate the Encoder
encoder = Encoder(heads=num_heads, d_model=d_model, num_iter=num_iter, p=dropout_rate, vocab_size=vocab_size, d_ff=d_ff)

# Forward pass through the Encoder
output_encoder = encoder(torch.tensor(sample_input_data), pad_mask = None)
output_encoder.shape

torch.Size([20, 14, 60])

### Define the Decoder layer

In [23]:
class Decoder(nn.Module):

    def __init__(self, num_heads, p, d_ff, d_model, num_iter,vocab_size):
        super().__init__()
        self.embedding = InputEmbedding(d_model, vocab_size) # will be mapped to d_model with vocab_size number of words
        self.pos_enc   = Pos_Enc(p)
        self.mha1 = MultiHeadAttn(num_heads, d_model, p)
        self.mha2 = MultiHeadAttn(num_heads, d_model, p)
        self.ffn = FeedForward(d_model,d_ff,p)

        self.drop1 = nn.Dropout(p)
        self.drop2 = nn.Dropout(p)
        self.drop3 = nn.Dropout(p)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        self.num_iter = num_iter


    def forward(self, x, enc_output, pad_dec_mask, look_ahead_mask):
        # Run through an Embedding layer
        x = self.embedding(x)
        # Run through positional encoding layer
        x = self.pos_enc.add_pos_enc(x)
        for _ in range(self.num_iter):

            # Run the input decdoer through a MHA and add dropout
            res_attn1 = self.drop1(self.mha1(x,x,x,look_ahead_mask))
            # Add and Normalize
            normalized_res_attn1 = self.norm1(res_attn1 + x)
            # Run through the second MHA and add dropout
            res_attn2 = self.drop2(self.mha2(normalized_res_attn1, enc_output, enc_output,pad_dec_mask))
            # Add and Normalize
            normalized_attn2 = self.norm2(res_attn2 + normalized_res_attn1)
            # Run throught the Feed Forward NN and apply dropout
            res_ffn = self.drop3(self.ffn(normalized_attn2))
            # Add and Normalize
            normalized_res_ffn = self.norm3(res_ffn + normalized_attn2)
            x = normalized_res_ffn

        return(normalized_res_attn1)


#### Example Decoder

In [25]:
# Example parameters
seq_length = 14
vocab_size = MAX_WORDS
d_model = 60
num_heads =10
d_ff = 60
num_iter = 6
dropout_rate = 0.1
pad_dec_mask = None  # You can specify a padding mask if needed
look_ahead_mask1 = create_look_ahead_mask(9)
# Instantiate the Encoder
decoder = Decoder(num_heads=num_heads, p=dropout_rate,  d_ff=d_ff, d_model=d_model, num_iter=num_iter, vocab_size=vocab_size)

# Forward pass through the Encoder
output_decoder = decoder(torch.tensor(sample_dest_input_data), output_encoder, pad_dec_mask = None, look_ahead_mask=look_ahead_mask1)
output_decoder.shape # what is this? these are probability? for 9 words of the length of the output which is not more than that.

torch.Size([20, 9, 60])

### Define the Transformer

In [26]:
class Transformer(nn.Module):

    def __init__(self, num_heads, p, d_ff, d_model, num_iter,src_vocab_size, tgt_vocab_size):
        super().__init__()
        self.encoder = Encoder(num_heads, d_model, num_iter,p, src_vocab_size, d_ff)
        self.decdoer = Decoder(num_heads, p, d_ff, d_model, num_iter,tgt_vocab_size)
        self.linear  = nn.Linear(d_model, tgt_vocab_size)

    def forward(self,input_enc, input_dec, enc_padding_mask, dec_padding_mask, look_ahead_mask):
        """
        outputs will be in logits --> use the correct loss function

        """
        # First run through the encoder that is already initialized
        enc_output = self.encoder(input_enc, enc_padding_mask)
        # Run through the decoder
        dec_output = self.decdoer(input_dec, enc_output, dec_padding_mask, look_ahead_mask)
        # Run thourgh the final layer
        final_output = self.linear(dec_output)

        return(final_output)


In [27]:
import torch

class CustomSchedule:
    """
    Custom learning rate schedule that implements the learning rate function
    described in the original Transformer paper. The learning rate is increased
    linearly for the first `warmup_steps` training steps, and then decreased
    proportionally to the inverse square root of the step number.

    Args:
        d_model (int): the dimensionality of the model.
        warmup_steps (int): the number of steps taken to increase the learning rate.
    """

    def __init__(self, d_model, warmup_steps=4000):
        self.d_model = float(d_model)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """
        Returns the learning rate at the given step.

        Args:
            step (int): the current training step.

        Returns:
            The learning rate at the given step.
        """
        step = float(step)
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)

        return (self.d_model ** -0.5) * min(arg1, arg2)

# Example usage:
if __name__ == "__main__":
    d_model = 512  # Example model dimension
    warmup_steps = 4000
    lr_schedule = CustomSchedule(d_model, warmup_steps)

    for step in range(1, 10001):
        learning_rate = lr_schedule(step)
        print(f"Step {step}: Learning Rate = {learning_rate:.6f}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Step 5001: Learning Rate = 0.000625
Step 5002: Learning Rate = 0.000625
Step 5003: Learning Rate = 0.000625
Step 5004: Learning Rate = 0.000625
Step 5005: Learning Rate = 0.000625
Step 5006: Learning Rate = 0.000625
Step 5007: Learning Rate = 0.000625
Step 5008: Learning Rate = 0.000625
Step 5009: Learning Rate = 0.000624
Step 5010: Learning Rate = 0.000624
Step 5011: Learning Rate = 0.000624
Step 5012: Learning Rate = 0.000624
Step 5013: Learning Rate = 0.000624
Step 5014: Learning Rate = 0.000624
Step 5015: Learning Rate = 0.000624
Step 5016: Learning Rate = 0.000624
Step 5017: Learning Rate = 0.000624
Step 5018: Learning Rate = 0.000624
Step 5019: Learning Rate = 0.000624
Step 5020: Learning Rate = 0.000624
Step 5021: Learning Rate = 0.000624
Step 5022: Learning Rate = 0.000624
Step 5023: Learning Rate = 0.000624
Step 5024: Learning Rate = 0.000624
Step 5025: Learning Rate = 0.000623
Step 5026: Learning Rate = 0.000623

In [28]:
num_heads = 10
p = 0.1
d_ff = 60
d_model= 60
num_iter= 1
src_vocab_size = MAX_WORDS
tgt_vocab_size = MAX_WORDS
# Define the masking
enc_padding_mask = None
dec_padding_mask = None
look_ahead_mask  = None
transformer = Transformer(num_heads, p, d_ff, d_model, num_iter,src_vocab_size, tgt_vocab_size)

# Loss functions and optimizer.
transformer_optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001)
loss_function = nn.CrossEntropyLoss()

trainloader = DataLoader(dataset=trainset, batch_size=BATCH_SIZE, shuffle=True)
testloader = DataLoader(dataset=testset, batch_size=BATCH_SIZE, shuffle=False)


In [29]:
output_transformer = transformer(torch.tensor(sample_input_data), torch.tensor(sample_dest_input_data), enc_padding_mask, dec_padding_mask, look_ahead_mask)
output_transformer.shape #each row is a probability distribution showing the most likely word to be chosen

torch.Size([20, 9, 10000])

In [30]:
# Train the model
for i in range(EPOCHS):
    transformer.train()
    train_loss = 0.0
    train_correct = 0
    train_batches = 0
    train_elems = 0

    for src_input, tgt_input, tgt_output in trainloader:

        # Zero the parameter gradiets
        transformer_optimizer.zero_grad()
        # Define the masks
        enc_padding_mask = create_padding_mask(src_input, num_heads)
        dec_padding_mask = create_cross_attention_mask(create_padding_mask(tgt_input,num_heads), src_input.shape[1])
        look_ahead_mask = create_look_ahead_mask(tgt_input.shape[1])#training
        # Run a prediction on the data
        output_transformer = transformer(src_input, tgt_input, enc_padding_mask, dec_padding_mask, look_ahead_mask)
        # calculate the loss
        loss = loss_function(output_transformer.view(-1, MAX_WORDS), tgt_output.view(-1)) #shape: (batch_size * len_seq,MAX_WORDS) vs (batch_size*len_seq)
        # Accumulate metrics
        _, indices = torch.max(output_transformer.data, 2)
        train_correct += (indices == tgt_output).sum().item()
        train_elems += indices.numel()
        train_batches +=  1
        train_loss += loss.item()
        # Backward pass and update.
        loss.backward()
        transformer_optimizer.step()
        train_loss = train_loss / train_batches
        train_acc = train_correct / train_elems

    # Evaluate the model on the test dataset.
    transformer.eval() # Set model in inference mode.
    test_loss = 0.0
    test_correct = 0
    test_batches = 0
    test_elems = 0
    for src_inputs, dest_inputs, dest_targets in testloader:
        # Define the padding masks
        enc_padding_mask = create_padding_mask(src_inputs, num_heads)
        dec_padding_mask = create_cross_attention_mask(create_padding_mask(dest_inputs,num_heads), src_inputs.shape[1])
        look_ahead_mask = create_look_ahead_mask(dest_inputs.shape[1]) #len_seq - Evaluation
        # Make a prediction
        outputs = transformer(src_inputs, dest_inputs, enc_padding_mask, dec_padding_mask, look_ahead_mask)
        loss = loss_function(outputs.view(-1, MAX_WORDS), dest_targets.view(-1))
        _, indices = torch.max(outputs, 2)
        test_correct += (indices == dest_targets).sum().item()
        test_elems += indices.numel()
        test_batches +=  1
        test_loss += loss.item()

    test_loss = test_loss / test_batches
    test_acc = test_correct / test_elems
    print(f'Epoch {i+1}/{EPOCHS} loss: {train_loss:.4f} - acc: {train_acc:0.4f} - val_loss: {test_loss:.4f} - val_acc: {test_acc:0.4f}')

    # Loop through a each sample in the sample dataset
    for sample_enc_input, sample_dec_target in zip(sample_input_data,
                                             sample_dest_target_data):
        # Convert to torch tensor
        sample_enc_input = torch.tensor(np.reshape(sample_enc_input,(1,-1)))
        sample_dec_target = torch.tensor(np.reshape(sample_dec_target,(1,-1)))

        # Define the padding masks
        enc_padding_mask = create_padding_mask(sample_enc_input, num_heads)
        #dec_padding_mask = create_padding_mask(tgt_input)
        # Initialize the input decoder
        input_dec = np.reshape(START_INDEX,(1,-1))
        produced_string = ''
        pred_seq = []
        # we have no words predicted. we loop to predict 60 words unless the stop word is predicted.
        for j in range(10):#MAX_LENGTH):

            # Predict next word
            outputs = transformer(sample_enc_input, torch.tensor(input_dec),
                                  enc_padding_mask, dec_padding_mask = None, look_ahead_mask= None)
            # Find the index of the most probable word and apprend it
            next_index = outputs[:,-1,:].argmax().numpy()
            input_dec = np.reshape(np.append(input_dec, next_index),(1,-1))
            # Stop the algorithm if the STOP token is predicted
            if next_index == STOP_INDEX:
                break

        # Convert the predicted indexes to words
        print(f"Source Sent:")
        tokens_to_words(src_tokenizer, sample_enc_input.tolist()[0])
        print(f"Target Sent:")
        tokens_to_words(dest_tokenizer, sample_dec_target.tolist()[0])
        print(f"Predicted Sent:")
        tokens_to_words(dest_tokenizer, input_dec.tolist()[0])
        print('\n\n')


Epoch 1/20 loss: 0.0057 - acc: 0.5760 - val_loss: 2.1380 - val_acc: 0.6613
Source Sent:
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'je', 'ne', 'suis', 'pas', 'blessé']
Target Sent:
["i'm", 'not', 'hurt', 'STOP', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
Predicted Sent:
['START', "it's", 'do', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']



Source Sent:
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'elle', 'lui', 'donna', 'la', 'voiture']
Target Sent:
['she', 'gave', 'him', 'the', 'car', 'STOP', 'PAD', 'PAD', 'PAD']
Predicted Sent:
['START', "it's", 'do', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']



Source Sent:
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'choisissez', 'quelque', 'chose']
Target Sent:
['choose', 'something', 'STOP', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
Predicted Sent:
['START', "it's", 'do', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']



Source Sent:
['PAD', 'PAD'

In [1]:
torch.save(transformer.state_dict(), 'model_weights.pth')

NameError: name 'torch' is not defined

In [None]:
model.load_state_dict(torch.load('model_weights.pth'))

work on the custom schedule learning rate and the line where we collapse the outputs. check the code for paddings in pytorch.