In [None]:
##################################################  Transformer Architecture  ####################################################



'''

This notebook is a pytorch implementation of transformer architecture from scratch. This implementation is based on the
architecture presented in the paper 'Attention is all you need' by Google.

The Transformer architecture consists of an encoder and decoder.

Encoder :

An Encoder consists of a stack of encoder layers.
The number of encoder layers is a design choice. In this notebook, we implement a 6-layer Encoder. Each encoder layer consists of
two sublayers.

Sublayer 1 - Self Attention mechanism
Sublayer 2 - FeedForward Neural Network

Sublayer 1 Input - Word embedding + Positional embedding, Sublayer 1 Output - MultiHeadAttention (Sublayer 1 Input)

Sublayer 2 Input - Normalised Residual Added Sublayer1 Output, Sublayer 2 Output - FeedForward(Relu(FeedForward(Sublayer 2 Input)))

Decoder :

A Decoder consists of a stack of decoder layers.
The number of decoder layers is identical to the number of encoder layers. Each decoder layer consists of three sublayers.
The output from the last layer of the encoder is taken as the key and value matrix input for cross attention layer of the decoder.

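Sublayer 1 - Self Attention mechanism
Sublayer 2 - FeedForward Neural Network
Sublayer 3 - Cross Attention Mechanism

Note: this is the order used by this notebook's DecoderLayer. In the paper, the cross attention sublayer
comes before the feed-forward network (self attention -> cross attention -> feed forward).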

Sublayer 1 Input - Word embedding + Positional embedding, Sublayer 1 Output - MultiHeadAttention (Sublayer 1 Input)

Sublayer 2 Input - Normalised Residual Added Sublayer1 Output, Sublayer 2 Output - FeedForward(Relu(FeedForward(Sublayer 2 Input)))

Sublayer 3 Input - Normalized Residual added Sublayer 2 Output, Sublayer 3 Output - CrossAttention (Sublayer 3 Input)

'''


#######################################################  End of Comments  ########################################################

"\n\nThis notebook is a pytorch implementation of transformer architecture from scratch. This implementation is based on the\narchitecture presented in the paper 'Attention is all you need' by Google.\n\nThe Transformer architecture consists of an encoder and decoder. An Encoder consists of a stack of encoder layers.\nThere are 6 encoder layers under Encoder.Each encoder layer consists two sublayers\n\nSubLayer 1 - contains Attention mechanism\nSubLayer 2 - contains FeedForward Neural Network\n\n\n\n"

In [None]:
import torch
import torch.nn as nn
import math

In [None]:
#Defining all the configs needed for the architecture to be implemented

#Seed for PyTorch's global random number generator. Weight initialisation, dropout and the
#random demo tokens all draw from it, so seeding here makes a Restart & Run All reproducible.
seed = 42
torch.manual_seed(seed)

#Shape of the demo inputs
batch_size = 1      #number of sequences per batch
vocab_size = 100    #number of distinct token ids
seq_len = 3         #number of tokens per sequence

#Model width
embed_dim = 512     #dimension of the token embedding
ffn_dim = 2048      #hidden dimension of the feedforward network
n_heads = 8         #number of parallel attention heads (embed_dim must be divisible by n_heads)

#Model depth and regularisation
n_layers = 6        #number of encoder layers and of decoder layers
dropout_prob = 0.1  #probability passed to every nn.Dropout

In [None]:
class Embedding(nn.Module):

  """
  Learned lookup table that maps token ids to dense vectors

  Attributes
  -----------
    embed (nn.Embedding): Lookup table with vocab_size rows, each of width embed_dim

  Methods
  --------
    forward(self,X)
    Returns the embedding vectors for the token ids in X

  """

  def __init__(self,embed_dim,vocab_size):

    """
    Creates the lookup table

    Parameters
    -----------
      embed_dim (int): Dimension of each token embedding
      vocab_size (int): Number of distinct token ids

    """

    super().__init__()

    self.embed = nn.Embedding(num_embeddings=vocab_size,embedding_dim=embed_dim)

  def forward(self,X):

    """
    Looks up the embedding vector of every token id

    Parameters
    -----------
      X (tensor): integer token ids

    Returns
    --------
      (tensor): embeddings of the tokens; the shape of X with a trailing embed_dim axis appended

    """

    return self.embed(X)

In [None]:
class PositionalEmbedding(nn.Module):

  """
  A class for sinusoidal positional embedding

  Attributes
  -----------
    seq_len (int): Maximum number of input tokens the table covers
    embed_dim (int): Dimension of the token embedding
    pos_embed (tensor): Buffer of shape (1,seq_len,embed_dim) holding the positional table

  Methods
  --------
    forward(self,word_embeddings)
    Returns the sum of token and positional embeddings

  """

  def  __init__(self,seq_len,embed_dim):

    """
    Initializes the positional embedding object

    Parameters
    -----------
      seq_len (int): Maximum number of input tokens
      embed_dim (int): Dimension of the token embedding

    """

    super().__init__()

    self.seq_len = seq_len
    self.embed_dim = embed_dim

    #For position 'pos' and each even dimension index 'j' (0,2,4,...):
    #  pos_embed[pos,j]   = sin(pos / 10000**(j/embed_dim))
    #  pos_embed[pos,j+1] = cos(pos / 10000**(j/embed_dim))
    #A sin/cos pair shares one frequency. 'j' is already the dimension index (the paper's '2i'),
    #so the exponent is j/embed_dim and must not be doubled again.

    positions = torch.arange(seq_len,dtype=torch.float32).unsqueeze(1)      #(seq_len,1)
    even_dims = torch.arange(0,embed_dim,2,dtype=torch.float32)             #(ceil(embed_dim/2),)
    inv_freq = torch.exp(-math.log(10000.0) * even_dims / embed_dim)        #1/10000**(j/embed_dim)
    angles = positions * inv_freq                                           #(seq_len,ceil(embed_dim/2))

    pos_embed = torch.zeros(seq_len,embed_dim)
    pos_embed[:,0::2] = torch.sin(angles)
    #With an odd embed_dim there is one cos column fewer than sin columns
    pos_embed[:,1::2] = torch.cos(angles[:,:embed_dim // 2])

    #Leading axis of size 1 lets the table broadcast over the batch dimension
    pos_embed = pos_embed.unsqueeze(0)

    #register_buffer of pytorch is used to save the parameters that do not need gradient updates during back propogation.
    #Anyhow these parameters still need to be stored and loaded into state_dict
    #https://discuss.pytorch.org/t/what-is-the-difference-between-register-buffer-and-register-parameter-of-nn-module/32723

    self.register_buffer('pos_embed',pos_embed)

  def forward(self,word_embeddings):

    """
      Returns the token embeddings with the positional embeddings added

      Parameters
      -----------
      word_embeddings (tensor): embeddings of the tokens, with the token axis second to last
                                and embed_dim last

      Returns
      --------
      sum_embeddings (tensor): token embeddings + positional embeddings

    """

    #Only the first n_tokens rows are used so inputs shorter than seq_len are accepted
    n_tokens = word_embeddings.size(-2)

    sum_embeddings = word_embeddings+self.pos_embed[:,:n_tokens]

    return sum_embeddings


In [None]:
class MultiHeadAttention(nn.Module):

  """
  A class for multiheadattention mechanism

  Attributes
  ----------
    seq_len (int):  Number of input tokens (stored for reference; shapes are read from the inputs)
    embed_dim (int):  Dimension of the token embedding
    n_heads (int):  Number of parallel attention heads
    single_head_dim (int):  Dimension of each head (embed_dim // n_heads)
    dropout (Dropout): Dropout applied to the output of the attention sublayer
    W_q, W_k, W_v (Linear): Query, key and value projections
    W_o (Linear): Output projection applied to the concatenated heads

  Methods
  --------
    split(self,inp_tensors)
    Returns the split heads of attention layer

    scaled_dot_product(self,Q,K,single_head_dim,mask=None)
    Returns the softmax of dot product of query and key vectors scaled by sqrt(single_head_dim)

    combine_heads(self,split_context,seq_len,n_heads,single_head_dim)
    Returns the concatenated parallel heads of attention layer

    forward(self,A,B,C,mask=None)
    Returns the context vector

  """

  def __init__(self,seq_len,embed_dim,n_heads,dropout_prob):

    """
    Initializes the multihead attention object

    Parameters
    -----------
      seq_len (int): Number of input tokens
      embed_dim (int): Dimension of the token embedding
      n_heads (int): Number of parallel attention heads
      dropout_prob (float): Probability of neurons to be dropped

    Raises
    -------
      ValueError: if embed_dim is not divisible by n_heads

    """

    super().__init__()

    #A truncated head dimension would make the head split fail later with an opaque shape error
    if embed_dim % n_heads != 0:
      raise ValueError('embed_dim ('+str(embed_dim)+') must be divisible by n_heads ('+str(n_heads)+')')

    self.seq_len = seq_len
    self.embed_dim = embed_dim
    self.n_heads = n_heads
    self.single_head_dim = embed_dim // n_heads
    self.dropout = nn.Dropout(dropout_prob)

    self.W_q = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
    self.W_k = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
    self.W_v = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
    self.W_o = nn.Linear(self.embed_dim, self.embed_dim, bias=False)

  def split(self,inp_tensors):

    """
    Returns the split heads of attention layer

    Parameters
    -----------
      inp_tensors (tensor): projected query, key or value of shape (...,n_tokens,embed_dim)

    Returns
    --------
      split_tensors (tensor): tensor of shape (...,n_heads,n_tokens,single_head_dim)

    """

    *leading_dims,n_tokens,_ = inp_tensors.shape

    split_tensors = inp_tensors.reshape(*leading_dims,n_tokens,self.n_heads,self.single_head_dim)

    #Heads are moved in front of the token axis so that the matmul in scaled_dot_product
    #compares tokens with tokens inside each head, instead of heads with heads inside each token
    return split_tensors.transpose(-3,-2)

  def scaled_dot_product(self,Q,K,single_head_dim,mask=None):

    """
    Returns the softmax of dot product of query and key vectors scaled by sqrt(single_head_dim)

    Parameters
    -----------
      Q (tensor): query heads of shape (...,n_heads,n_query_tokens,single_head_dim)
      K (tensor): key heads of shape (...,n_heads,n_key_tokens,single_head_dim)
      single_head_dim (int): dimension of each head vector
      mask (tensor, optional): broadcastable to (...,n_heads,n_query_tokens,n_key_tokens);
                               positions where mask == 0 receive zero attention weight

    Returns
    --------
      attention_score (tensor): attention weights of shape (...,n_heads,n_query_tokens,n_key_tokens)

    """

    product = torch.matmul(Q,K.transpose(-2,-1))

    scaled_product = product/math.sqrt(single_head_dim)

    if mask is not None:
      #-inf before the softmax becomes an attention weight of exactly zero
      scaled_product = scaled_product.masked_fill(mask == 0,float('-inf'))

    attention_score = nn.functional.softmax(scaled_product,dim=-1)

    return attention_score

  def combine_heads(self,split_context,seq_len,n_heads,single_head_dim):

    """
    Returns the concatenated context vector

    Parameters
    -----------
      split_context (tensor): per-head context of shape (...,n_heads,n_tokens,single_head_dim)
      seq_len (int): kept for backward compatibility; the token count is read from split_context
      n_heads (int): Number of parallel attention heads
      single_head_dim (int):  dimension of each head vector

    Returns
    --------
      context (tensor): concatenated heads of shape (...,n_tokens,n_heads * single_head_dim)

    """

    #Undo the transpose done in split, then flatten the (n_heads,single_head_dim) axes
    merged = split_context.transpose(-3,-2)

    context = merged.reshape(*merged.shape[:-2],n_heads * single_head_dim)

    return context

  def forward(self,A,B,C,mask=None):

    """
      Returns context

      Parameters
      -----------
      A (tensor): source of the queries, shape (...,n_query_tokens,embed_dim)
      B (tensor): source of the keys, shape (...,n_key_tokens,embed_dim)
      C (tensor): source of the values, shape (...,n_key_tokens,embed_dim)
      mask (tensor, optional): forwarded to scaled_dot_product

      Returns
      --------
      context (tensor): context vector of shape (...,n_query_tokens,embed_dim)

    """

    Q = self.split(self.W_q(A))
    K = self.split(self.W_k(B))
    V = self.split(self.W_v(C))

    attention_score = self.scaled_dot_product(Q,K,self.single_head_dim,mask)

    split_context = attention_score @ V

    concatenated = self.combine_heads(split_context,A.size(-2),self.n_heads,self.single_head_dim)

    #W_o mixes the information of the concatenated heads (MultiHead = Concat(heads) W_o)
    context = self.dropout(self.W_o(concatenated))

    return context







In [None]:
class FeedForward(nn.Module):

  """
  A class for feedforward mechanism

  Attributes
  ----------
    embed_dim (int):  Dimension of the token embedding
    ffn_dim (int):  Dimension of the feedforward network
    dropout (Dropout): Dropout layer applied to the network output
    ffn (Sequential): Linear(embed_dim,ffn_dim) -> ReLU -> Linear(ffn_dim,embed_dim)

  Methods
  --------
    forward(self,norm_inp)
    Returns the feedforward network output

  """

  def __init__(self,embed_dim,ffn_dim,dropout_prob):

    """
    Initializes the feedforward object

    Parameters
    -----------
      embed_dim (int): Dimension of the token embedding
      ffn_dim (int): Dimension of the feedforward network
      dropout_prob (float): Probability of neurons to be dropped

    """

    super().__init__()

    self.embed_dim = embed_dim
    self.ffn_dim = ffn_dim
    self.dropout = nn.Dropout(dropout_prob)
    self.ffn = nn.Sequential(nn.Linear(self.embed_dim,self.ffn_dim),
                             nn.ReLU(),
                             nn.Linear(self.ffn_dim,self.embed_dim))



  def forward(self,norm_inp):

    """
      Returns feedforward output vector

      Parameters
      -----------
      norm_inp (tensor): normalised input from multihead attention sublayer

      Returns
      --------
      ffn_out (tensor): feedforward output vector, same shape as norm_inp

    """

    #The opening quotes of the docstring above must sit at the same indentation as this
    #statement, otherwise Python rejects the method with an IndentationError
    ffn_out = self.dropout(self.ffn(norm_inp))

    return ffn_out






In [None]:
class EncoderLayer(nn.Module):

  """
  A class for encoder layer

  Attributes
  ----------
    dropout (Dropout): instance of Dropout layer applied to each sublayer output before the residual add
    attention (MultiHeadAttention):  self attention sublayer
    norm (LayerNorm):  layer normalisation after the attention sublayer
    feedforward (FeedForward): feedforward sublayer
    ffn_norm (LayerNorm):  layer normalisation after the feedforward sublayer

  Methods
  --------
    forward(self,X)
    Returns the encoder layer output

  """

  def __init__(self,n_layers,n_heads,seq_len,embed_dim,ffn_dim,dropout_prob):

    """
    Initializes the encoder layer

    Parameters
    -----------
      n_layers (int): not used by a single layer; kept so existing callers keep working
      n_heads (int): Number of parallel attention heads
      seq_len (int): Number of input tokens
      embed_dim (int): Dimension of the token embedding
      ffn_dim (int): Dimension of the feedforward network
      dropout_prob (float): Probability of neurons to be dropped

    """

    super().__init__()

    self.dropout = nn.Dropout(dropout_prob)
    self.attention = MultiHeadAttention(seq_len,embed_dim,n_heads,dropout_prob)
    self.norm = nn.LayerNorm(embed_dim)
    self.feedforward = FeedForward(embed_dim,ffn_dim,dropout_prob)
    #Each sublayer gets its own LayerNorm so the two normalisations learn separate gain and bias
    self.ffn_norm = nn.LayerNorm(embed_dim)

  def forward(self,X):

    """
    Returns the encoder layer output

    Parameters
    -----------
    X (tensor): input of the layer - word embeddings (token embeddings + pos embeddings) for the
                first layer, the previous layer's output otherwise

    Returns
    --------
    enclayer_output (tensor): Output of encoder layer of dimension (batch_size,seq_len,embed_dim)

    """

    #NOTE(review): attention and feedforward also apply dropout to their own outputs, so each
    #sublayer output passes through dropout twice; kept as in the original implementation

    #Sublayer 1: self attention, residual connection, layer normalisation
    Z = self.dropout(self.attention(X,X,X))
    normalized_context = self.norm(Z+X)

    #Sublayer 2: feedforward network, residual connection, layer normalisation
    ffn_output = self.feedforward(normalized_context)
    enclayer_output = self.ffn_norm(self.dropout(ffn_output)+normalized_context)

    return enclayer_output






In [None]:
class DecoderLayer(nn.Module):

  """
  A class for decoder layer

  Attributes
  ----------
    dropout (Dropout): instance of Dropout layer applied to each sublayer output before the residual add
    attention (MultiHeadAttention):  self attention sublayer over the decoder input
    norm (LayerNorm):  layer normalisation after the self attention sublayer
    feedforward (FeedForward): feedforward sublayer
    ffn_norm (LayerNorm):  layer normalisation after the feedforward sublayer
    cross_attn (MultiHeadAttention):  cross attention sublayer over the encoder output
    cross_norm (LayerNorm):  layer normalisation after the cross attention sublayer

  Methods
  --------
    forward(self,Y,enc_out)
    Returns the decoder layer output

  """


  def __init__(self,n_layers,n_heads,seq_len,embed_dim,ffn_dim,dropout_prob):

    """
    Initializes the decoder layer

    Parameters
    -----------
      n_layers (int): not used by a single layer; kept so existing callers keep working
      n_heads (int): Number of parallel attention heads
      seq_len (int): Number of input tokens
      embed_dim (int): Dimension of the token embedding
      ffn_dim (int): Dimension of the feedforward network
      dropout_prob (float): Probability of neurons to be dropped

    """

    super().__init__()

    self.dropout = nn.Dropout(dropout_prob)
    self.attention = MultiHeadAttention(seq_len,embed_dim,n_heads,dropout_prob)
    self.norm = nn.LayerNorm(embed_dim)
    self.feedforward = FeedForward(embed_dim,ffn_dim,dropout_prob)
    self.cross_attn = MultiHeadAttention(seq_len,embed_dim,n_heads,dropout_prob)
    #Each sublayer gets its own LayerNorm so the normalisations learn separate gain and bias
    self.ffn_norm = nn.LayerNorm(embed_dim)
    self.cross_norm = nn.LayerNorm(embed_dim)

  def forward(self,Y,enc_out):

    """
    Returns the decoder layer output

    Parameters
    -----------
    Y (tensor): input of the layer - word embeddings of decoder input tokens (token embeddings +
                pos embeddings) for the first layer, the previous layer's output otherwise
    enc_out (tensor): output from last layer of encoder

    Returns
    --------
    declayer_output (tensor): Output of decoder layer of dimension (batch_size,seq_len,embed_dim)

    """

    # Self attention: query, key and value all come from the decoder input.
    # NOTE(review): no causal (look-ahead) mask is applied, so every position can attend to later positions.
    Z = self.dropout(self.attention(Y,Y,Y))
    normalized_context = self.norm(Z+Y)

    # NOTE(review): the paper places cross attention before the feedforward network; the order
    # self attention -> feedforward -> cross attention is kept from the original implementation.
    ffn_output = self.feedforward(normalized_context)
    normalized_ffn_output = self.ffn_norm(self.dropout(ffn_output)+normalized_context)

    # Normalised Feedforward network output is used to compute the Query vector by multiplying with Query matrix
    # Key and Value vector are obtained by multiplying the output of encoder output with the Key and Value matrix of decoder
    # Cross attention - attention between query matrix generated from decoder input and key and value matrix generated from encoder output

    Z_cross = self.cross_attn(normalized_ffn_output,enc_out,enc_out)

    declayer_output = self.cross_norm(self.dropout(Z_cross)+normalized_ffn_output)

    return declayer_output





In [None]:
class Encoder(nn.Module):

  """
  A Class for Encoder block

  Attributes
  -----------
    word_embedding (Embedding): instance of Class Embedding
    pos_embedding (PositionalEmbedding): instance of Class PositionalEmbedding
    dropout (Dropout):  instance of Dropout layer
    layers (ModuleList): Module list containing instances of Class EncoderLayer

  Methods
  --------
    forward(enc_tokens)
    Returns the output of encoder block

  """

  def __init__(self,n_layers,n_heads,embed_dim,seq_len,vocab_size,dropout_prob,ffn_dim):

    """
    Initializes the encoder block

    Parameters
    -----------
      n_layers (int): Number of encoder layers in the stack
      n_heads (int): Number of parallel attention heads
      embed_dim (int): Dimension of the token embedding
      seq_len (int): Number of input tokens
      vocab_size (int): Size of the vocabulary
      dropout_prob (float): Probability of neurons to be dropped
      ffn_dim (int): Dimension of the feedforward network

    """

    super().__init__()

    self.word_embedding = Embedding(embed_dim,vocab_size)
    self.pos_embedding = PositionalEmbedding(seq_len,embed_dim)
    self.dropout = nn.Dropout(dropout_prob)

    self.layers = nn.ModuleList([EncoderLayer(n_layers,n_heads,seq_len,embed_dim,ffn_dim,dropout_prob) for _ in range(n_layers)])


  def forward(self,enc_tokens):

    """
    Returns the encoder output

    Parameters
    -----------
      enc_tokens (tensor):  input tokens for the encoder block

    Returns
    --------
      X (tensor): returns the output of the encoder block of dimension (batch_size,seq_len,embed_dim)

    """

    word_embedding = self.word_embedding(enc_tokens)

    #pos_embedding returns token embeddings + positional embeddings already, so the token
    #embeddings must not be added a second time here
    X = self.dropout(self.pos_embedding(word_embedding))

    #The output of each layer is the input of the next one
    for layer in self.layers:

      X = layer(X)

    return X


In [None]:
class Decoder(nn.Module):

  """
  A Class for Decoder block

  Attributes
  -----------
    word_embedding (Embedding): instance of Class Embedding
    pos_embedding (PositionalEmbedding): instance of Class PositionalEmbedding
    dropout (Dropout):  instance of Dropout layer
    layers (ModuleList): Module list containing instances of Class DecoderLayer

  Methods
  --------
    forward(dec_tokens,enc_out)
    Returns the output of decoder block

  """
  def __init__(self,n_layers,n_heads,embed_dim,seq_len,vocab_size,dropout_prob,ffn_dim):

    """
    Initializes the decoder block

    Parameters
    -----------
      n_layers (int): Number of decoder layers in the stack
      n_heads (int): Number of parallel attention heads
      embed_dim (int): Dimension of the token embedding
      seq_len (int): Number of input tokens
      vocab_size (int): Size of the vocabulary
      dropout_prob (float): Probability of neurons to be dropped
      ffn_dim (int): Dimension of the feedforward network

    """

    super().__init__()

    self.word_embedding = Embedding(embed_dim,vocab_size)
    self.pos_embedding = PositionalEmbedding(seq_len,embed_dim)
    self.dropout = nn.Dropout(dropout_prob)
    self.layers = nn.ModuleList([DecoderLayer(n_layers,n_heads,seq_len,embed_dim,ffn_dim,dropout_prob) for _ in range(n_layers)])

  def forward(self,dec_tokens,enc_out):

    """
    Returns the decoder output

    Parameters
    -----------
      dec_tokens (tensor):  input tokens for the decoder block
      enc_out (tensor): Output of encoder block of dimension (batch_size,seq_len,embed_dim)

    Returns
    --------
      Y (tensor): returns the output of the decoder block of dimension (batch_size,seq_len,embed_dim)

    """


    word_embedding = self.word_embedding(dec_tokens)

    #pos_embedding returns token embeddings + positional embeddings already, so the token
    #embeddings must not be added a second time here
    Y = self.dropout(self.pos_embedding(word_embedding))

    #Every decoder layer receives the same encoder output together with the previous layer's output
    for layer in self.layers:

      Y = layer(Y,enc_out)

    return Y



In [None]:
class Transformer(nn.Module):

  """
  A Class for transformer containing encoder and decoder

  Attributes
  -----------
    encoder (Encoder):  instance of Class Encoder
    decoder (Decoder):  instance of Class Decoder

  Methods
  --------
    forward(enc_tokens,dec_tokens)
    Returns the output of the decoder

  """

  def __init__(self,n_layers,n_heads,embed_dim,seq_len,vocab_size,dropout_prob,ffn_dim):

    """
    Initializes the encoder and the decoder with the same hyperparameters

    Parameters
    -----------
      n_layers (int): Number of layers in the encoder and in the decoder
      n_heads (int): Number of parallel attention heads
      embed_dim (int): Dimension of the token embedding
      seq_len (int): Number of input tokens
      vocab_size (int): Size of the vocabulary
      dropout_prob (float): Probability of neurons to be dropped
      ffn_dim (int): Dimension of the feedforward network

    """

    super().__init__()

    self.encoder = Encoder(n_layers,n_heads,embed_dim,seq_len,vocab_size,dropout_prob,ffn_dim)
    self.decoder = Decoder(n_layers,n_heads,embed_dim,seq_len,vocab_size,dropout_prob,ffn_dim)

  def forward(self,enc_tokens,dec_tokens):

    """
    Returns the output of the decoder

    Parameters
    -----------
      enc_tokens (tensor):  Input tokens of encoder
      dec_tokens (tensor):  Input tokens of decoder

    Returns
    --------
      dec_out (tensor): Output of decoder

    """

    #The inputs are used as local values only; storing them on the module would keep the
    #tensors of the last call alive for no purpose

    enc_out = self.encoder(enc_tokens)

    dec_out = self.decoder(dec_tokens,enc_out)

    return dec_out



In [None]:
# Build the full encoder-decoder model from the configuration values defined above.
model = Transformer(
  n_layers=n_layers,
  n_heads=n_heads,
  embed_dim=embed_dim,
  seq_len=seq_len,
  vocab_size=vocab_size,
  dropout_prob=dropout_prob,
  ffn_dim=ffn_dim,
)

POS embedding shape before Unsqueeze :  torch.Size([3, 512])
POS embedding shape after Unsqueeze :  torch.Size([1, 3, 512])
POS embedding shape before Unsqueeze :  torch.Size([3, 512])
POS embedding shape after Unsqueeze :  torch.Size([1, 3, 512])


In [None]:
# Bare last expression: the notebook displays the repr of the assembled Transformer.
model

Transformer(
  (encoder): Encoder(
    (word_embedding): Embedding(
      (embed): Embedding(100, 512)
    )
    (pos_embedding): PositionalEmbedding()
    (dropout): Dropout(p=0.1, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (dropout): Dropout(p=0.1, inplace=False)
        (attention): MultiHeadAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (W_q): Linear(in_features=512, out_features=512, bias=False)
          (W_k): Linear(in_features=512, out_features=512, bias=False)
          (W_v): Linear(in_features=512, out_features=512, bias=False)
          (W_o): Linear(in_features=512, out_features=512, bias=False)
        )
        (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (feedforward): FeedForward(
          (dropout): Dropout(p=0.1, inplace=False)
          (ffn): Sequential(
            (0): Linear(in_features=512, out_features=2048, bias=True)
            (1): ReLU()
            (2): Linear(in_fea

In [None]:
# Random token ids in [1, vocab_size) stand in for real encoder and decoder inputs.
enc_inp = torch.randint(low=1,high=vocab_size,size=(batch_size,seq_len))
dec_inp = torch.randint(low=1,high=vocab_size,size=(batch_size,seq_len))

# One forward pass through the whole model: encoder tokens and decoder tokens in, decoder output out.
dec_out = model(enc_inp,dec_inp)

print(dec_out)
print(dec_out.shape)

torch.Size([1, 3, 512])
torch.Size([1, 3, 512])
Encoder Layer 1 Input : 

 tensor([[[ 1.3754,  0.4682,  1.6899,  ...,  1.9252,  5.5338, -3.2870],
         [ 4.0158, -1.3317, -3.0309,  ...,  2.6946, -1.2556,  0.6965],
         [ 0.9449, -0.0000,  2.8819,  ...,  0.0000,  0.0000,  0.4235]]],
       grad_fn=<MulBackward0>)
Sublayer 1 Output : 

 tensor([[ 0.9896,  0.8935,  0.0000,  ...,  0.7971, -0.0000,  0.4307],
        [ 0.3550, -0.0000, -0.5838,  ..., -1.5766,  0.4577, -0.7899],
        [ 0.6246, -0.7355, -0.0000,  ..., -0.3283,  0.6464, -0.1441]],
       grad_fn=<MulBackward0>)
Sublayer 2 Output : 

 tensor([[[-0.0000, -0.3016,  0.2931,  ..., -0.4519,  0.0657,  0.0000],
         [ 0.0098,  0.1366,  0.3545,  ...,  0.2329, -0.0000,  0.0265],
         [-0.0483,  0.5696,  0.1927,  ...,  0.1937,  0.0196, -0.0542]]],
       grad_fn=<MulBackward0>)
torch.Size([1, 3, 512])
Encode Layer 1 Output : 

 tensor([[[ 0.8448,  0.1068,  0.8785,  ...,  0.5096,  2.2277, -1.3180],
         [ 1.6998, -0.5

In [None]:
#model.__doc__
#help(model)