**This project is an attempt at creating a transformer from scratch using the multi-head attention mechanism, which computes the attention between each pair of positions in a sequence. It consists of multiple 'attention heads' that capture different aspects of the input sequence. This is a step ahead of simply using position within text to decipher meaning and importance.
Transformers alleviate common issues in RNNs, such as weakness on long-range dependencies and vanishing or exploding gradients (which either stall the decrease in the model's loss or make the loss increase suddenly), and more.**

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim


In [4]:
import math

In [5]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected by a linear layer of width
    ``d_model``, split into ``num_heads`` heads of ``d_model // num_heads``
    features, attended per head, merged back together and passed through a
    final linear layer.

    Arguments -
    - d_model: size of the last dimension of the inputs and of the output
    - num_heads: number of attention heads; must divide ``d_model``
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        # older spelling of the attribute, kept so existing readers do not break
        self.num_head = num_heads
        self.head_dim = d_model // num_heads

        # linear transformations for queries, keys and values for all heads
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        # linear transformation for the concatenated outputs of all heads
        self.fc_out = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, query, key, value, mask):
        """Calculate scaled dot-product attention.

        Arguments -
        - query, key, value: tensors whose last two dimensions are
          (sequence_length, head_dim)
        - mask: ``None`` or a tensor broadcastable to the attention logits.
          Non-zero / ``True`` entries mark positions that must NOT be
          attended to (they receive a large negative logit).

        returns:
        - (output, attention_weights)
        """
        # similarity of every query position with every key position
        scores = torch.matmul(query, key.transpose(-2, -1))

        # scale by sqrt(d_k) so the logits do not grow with the head size
        scores = scores / (query.size(-1) ** 0.5)

        # push blocked positions towards -inf so softmax gives them ~0 weight
        if mask is not None:
            scores = scores + (mask * -1e9)

        # normalise over the key dimension
        attention_weights = torch.nn.functional.softmax(scores, dim=-1)

        # weighted sum of the values
        output = torch.matmul(attention_weights, value)
        return output, attention_weights

    def split_heads(self, x, batch_size):
        """Reshape (batch_size, seq_len, d_model) into (batch_size, num_heads, seq_len, head_dim)."""
        x = x.view(batch_size, -1, self.num_heads, self.head_dim)
        return x.permute(0, 2, 1, 3)

    def combine_heads(self, x, batch_size):
        """Inverse of ``split_heads``: back to (batch_size, seq_len, d_model)."""
        # permute yields a non-contiguous tensor, so make it contiguous before view
        return x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)

    def forward(self, query, key, value, mask=None):
        """Run multi-head attention.

        Arguments -
        - query: tensor of shape (batch_size, query_len, d_model)
        - key, value: tensors of shape (batch_size, key_len, d_model)
        - mask: optional mask, see ``scaled_dot_product_attention``

        returns:
        - output: tensor of shape (batch_size, query_len, d_model)
        - attention_weights: tensor of shape
          (batch_size, num_heads, query_len, key_len)
        """
        batch_size = query.size(0)

        # linearly project queries, keys and values, then split into heads
        query = self.split_heads(self.wq(query), batch_size)
        key = self.split_heads(self.wk(key), batch_size)
        value = self.split_heads(self.wv(value), batch_size)

        # attention is computed for all heads at once
        output, attention_weights = self.scaled_dot_product_attention(query, key, value, mask)

        # merge the heads and apply the output projection
        output = self.fc_out(self.combine_heads(output, batch_size))

        return output, attention_weights








In [6]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    Arguments -
    - d_model: size of the last dimension of the embeddings
    - max_len: number of positions for which the encoding is precomputed
    """

    def __init__(self, d_model, max_len=512):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model  # dimensionality of the positional encoding
        self.max_len = max_len  # number of positions that are precomputed

        # create positional encoding matrix
        pe = self.create_positional_encoding(max_len, d_model)

        # register the positional encoding as a buffer (not a learnable parameter)
        self.register_buffer('pe', pe)

    def create_positional_encoding(self, max_len, d_model):
        """Build the encoding table of shape (1, max_len, d_model).

        Even feature indices hold sin(position * div_term) and odd feature
        indices hold cos(position * div_term).
        """
        # positions 0 .. max_len - 1 as a column vector
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)

        # one frequency per pair of (sin, cos) columns
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # for an odd d_model there is one cosine column fewer than sine columns
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # add a batch dimension so the table broadcasts over the batch
        return pe.unsqueeze(0)

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, sequence, feature); its sequence length
        must not exceed ``max_len``.
        """
        return x + self.pe[:, :x.size(1)]



In [7]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension of its input.

    Arguments -
    - d_model: input and output feature size
    - d_ff: feature size of the hidden layer
    """

    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()

        # first fully connected layer: d_model -> d_ff
        self.fc1 = nn.Linear(d_model, d_ff)

        # second fully connected layer: d_ff -> d_model
        self.fc2 = nn.Linear(d_ff, d_model)

        # activation function ReLU (Rectified Linear Unit)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return fc2(relu(fc1(x))); the output has the same shape as ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)



In [8]:
class EncoderLayer(nn.Module):
    """A single encoder layer: self-attention followed by a feed-forward sublayer.

    Each sublayer is wrapped as ``norm(x + dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        """
        Initialize a single encoder layer in the transformer model

        Arguments -
        - d_model: the dimensionality of the model's hidden states
        - num_heads: the number of attention heads of multi-head attention
        - d_ff: the dimensionality of the feedforward sublayer
        - dropout: the dropout rate applied to each sublayer's output

        """
        super(EncoderLayer, self).__init__()

        # multi head self attention layer
        # (MultiHeadAttention takes no dropout argument; dropout is applied here instead)
        self.self_attention = MultiHeadAttention(d_model, num_heads)

        # position wise feed forward layer
        self.feedforward = PositionWiseFeedForward(d_model, d_ff)

        # layer normalization, one per sublayer
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # dropout shared by both sublayers
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """
        Forward pass through the encoder layer.

        Arguments -
        - x: input tensor whose last dimension is d_model
        - mask: optional attention mask handed unchanged to the
          self-attention sublayer; ``None`` disables masking

        returns:
        - x: output tensor with the same shape as the input
        """
        # multi head self attention (the attention weights are not needed here)
        attention_output, _ = self.self_attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attention_output))

        # position wise feedforward
        feedforward_output = self.feedforward(x)
        x = self.norm2(x + self.dropout(feedforward_output))

        return x

In [9]:
class DecoderLayer(nn.Module):
    """A single decoder layer: self-attention, cross-attention, feed-forward.

    Each sublayer is wrapped as ``norm(x + dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, d_ff, num_heads, dropout):
        """
        Initialize a single decoder layer in the transformer model

        Arguments (note that d_ff comes before num_heads; passing them by
        keyword avoids mixing the two up) -
        - d_model: the dimensionality of the model's hidden states
        - d_ff: the dimensionality of the feedforward sublayer
        - num_heads: the number of attention heads of multi-head attention
        - dropout: the dropout rate applied to each sublayer's output

        """
        super(DecoderLayer, self).__init__()

        # multi head self attention layer
        self.self_attn = MultiHeadAttention(d_model, num_heads)

        # multi head cross attention layer (for encoder decoder attention)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)

        # position wise feedforward
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)

        # layer normalization, one per sublayer
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        # dropout layer shared by all three sublayers
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """
        Forward pass through the decoder layer.

        arguments:
        - x: The input tensor representing the target sequence.
        - enc_output: The output tensor from the encoder, used as keys and
                      values of the encoder-decoder attention.
        - src_mask: The source sequence mask, handed to the encoder-decoder
                    attention (intended to hide padded source positions).
        - tgt_mask: The target sequence mask, handed to the self-attention
                    (intended to hide padding and later target positions).

        returns:
        - x: The output tensor of the decoder layer.
        """
        # Self-Attention Layer (the attention modules return (output, weights))
        self_attn_output, _ = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attn_output))

        # Encoder-Decoder Attention Layer: queries from the decoder,
        # keys and values from the encoder output
        cross_attn_output, _ = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_attn_output))

        # Position-wise FeedForward Layer
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))

        return x




In [10]:
class TransformerEncoder(nn.Module):
    """Stack of encoder layers applied one after another."""

    def __init__(self, num_layers, d_model, num_heads, d_ff, dropout):
        """
        Build ``num_layers`` identically configured encoder layers.

        arguments -
        - num_layers: number of encoder layers to stack
        - d_model: the dimensionality of the model's hidden states
        - num_heads: the number of attention heads for multi-head attention
        - d_ff: dimensionality of the feedforward sublayer
        - dropout: the dropout rate to apply within the encoder layers
        """
        super().__init__()

        # grow the stack one layer at a time
        stack = nn.ModuleList()
        for _ in range(num_layers):
            stack.append(EncoderLayer(d_model, num_heads, d_ff, dropout))
        self.encoder_layers = stack

    def forward(self, x, src_mask):
        """
        Forward pass through the transformer encoder.

        arguments:
        - x: the input tensor representing the source sequence
        - src_mask: the source sequence mask, handed to every layer

        returns:
        - the output of the last encoder layer (``x`` itself when the
          stack is empty)
        """
        hidden = x

        # each layer consumes the previous layer's output
        for layer in self.encoder_layers:
            hidden = layer(hidden, src_mask)

        return hidden

In [11]:
class TransformerDecoder(nn.Module):
    """Stack of decoder layers applied one after another."""

    def __init__(self, num_layers, d_model, num_heads, d_ff, dropout):
        """
        Initialize the Transformer decoder by stacking multiple decoder layers

        arguments -
        - num_layers: number of decoder layers to stack
        - d_model: the dimensionality of the model's hidden states
        - num_heads: the number of attention heads for multi-head attention
        - d_ff: dimensionality of the feedforward sublayer
        - dropout: the dropout rate to apply within the decoder layers
        """
        super(TransformerDecoder, self).__init__()

        # stack multiple decoder layers; keyword arguments are used because
        # DecoderLayer declares d_ff before num_heads
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model=d_model, num_heads=num_heads, d_ff=d_ff, dropout=dropout)
            for _ in range(num_layers)
        ])

    def forward(self, x, src_mask, enc_output, tgt_mmask):
        """
        Forward pass through the transformer decoder.

        arguments:
        - x: the input tensor representing the target sequence
        - src_mask: the source sequence mask (used by cross-attention)
        - enc_output: the output tensor of the encoder
        - tgt_mmask: the target sequence mask (used by self-attention)

        returns:
        - the output of the last decoder layer (``x`` itself when the
          stack is empty)
        """
        dec_output = x

        # iterate through decoder layers; each layer expects
        # (x, enc_output, src_mask, tgt_mask) in that order
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mmask)
        return dec_output


In [12]:
class Transformer(nn.Module):
    """Encoder-decoder wrapper that builds the attention masks from token ids.

    Mask convention: ``True`` marks a position that must NOT be attended to.
    This is the convention of the attention code in this file, which adds a
    large negative value to the logits wherever the mask is non-zero.
    """

    def __init__(self, encoder, decoder, src_pad_idx, tgt_pad_idx):
        """ initialize the Transformer model with an encoder and decoder

        arguments -
        - encoder: the transformer encoder, called as encoder(src, src_mask)
        - decoder: the transformer decoder, called as
                   decoder(tgt, src_mask, enc_output, tgt_mask)
        - src_pad_idx: the padding index for the source sequence (integer value representing the padding token index in the source sequence)
                        - in nlp tasks sequences have variable lengths and padding is added to make all sequences in the batch have the same length
        - tgt_pad_idx: the padding index for the target sequence - in sequence to sequence tasks such as machine translation the target sequences also need to have the same length
                        - padding tokens are added to the end of the shorter target sequences to ensure consistent batch processing
        """

        super(Transformer, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.tgt_pad_idx = tgt_pad_idx

    def forward(self, src, tgt):
        """Forward pass through the Transformer model.

        arguments -
        - src: source token ids, shape (batch_size, src_len)
        - tgt: target token ids, shape (batch_size, tgt_len)

        returns:
        - whatever the decoder returns
        """
        # make_tgt_mask returns both masks, so there is no need to build
        # the source mask a second time
        tgt_mask, src_mask = self.make_tgt_mask(tgt, src)

        enc_output = self.encoder(src, src_mask)
        # argument order follows TransformerDecoder.forward:
        # (x, src_mask, enc_output, tgt_mask)
        output = self.decoder(tgt, src_mask, enc_output, tgt_mask)

        return output

    def make_src_mask(self, src):
        """Return a (batch_size, 1, 1, src_len) bool mask, True at padding tokens."""
        return (src == self.src_pad_idx).unsqueeze(1).unsqueeze(2)

    def make_tgt_mask(self, tgt, src):
        """Create the masks for self attention and cross attention in the decoder.

        returns:
        - tgt_mask: (batch_size, 1, tgt_len, tgt_len) bool mask, True where
          the key is a padding token or lies after the query position
        - src_mask: same as ``make_src_mask(src)``
        """
        tgt_pad_mask = (tgt == self.tgt_pad_idx).unsqueeze(1).unsqueeze(2)
        # the causal mask is created on the default device; move it next to tgt
        future_mask = self.subsequent_mask(tgt.size(-1)).to(tgt.device)
        # a position is blocked if it is padding OR in the future
        tgt_mask = tgt_pad_mask | future_mask
        src_mask = self.make_src_mask(src)
        return tgt_mask, src_mask

    def subsequent_mask(self, size):
        """Return a (1, size, size) bool mask, True strictly above the diagonal (future tokens)."""
        return torch.triu(torch.ones(1, size, size), diagonal=1).bool()


**Note about Padding Tokens - During the training and inference phases, the Transformer model should not pay attention to padding tokens because they do not carry meaningful information. The padding indices are used to create masks that prevent the model's attention mechanisms (self-attention and cross-attention) from attending to the padding tokens.
In the make_src_mask and make_tgt_mask methods of the above Transformer class, these indices are used to create binary masks that identify the positions of padding tokens; the masks are then applied in the attention mechanisms to ensure that the model doesn't attend to padding tokens when calculating attention weights.**