<a href="https://colab.research.google.com/github/shusank8/Transformers/blob/main/Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Notebook sanity-check cell: prints a fixed greeting and nothing else.
print("Transformers.... Excited")

Transformers.... Excited


In [5]:
# imports
import torch
import torch.nn as nn
import math

In [6]:
# global variables
# Hyper-parameters read as module-level names by the classes defined below.
embdim = 32  # width of every embedding vector / model dimension
vocab_size = 32  # number of rows in the token-embedding table
block_size = 32  # number of positions in the precomputed positional table
dropout = 0.2  # probability handed to nn.Dropout
eps = 1e-5  # added to the variance in LayerNormalization before the square root
no_of_heads = 4  # number of attention heads the model dimension is split into
# Per-head width. Integer division: embdim should be a multiple of no_of_heads,
# otherwise no_of_heads * hdim != embdim and the head reshape cannot work.
hdim = embdim//no_of_heads

In [7]:
class InputEmbeddings(nn.Module):
  """Token-embedding lookup table.

  Wraps a single ``nn.Embedding`` whose size is taken from the module-level
  ``vocab_size`` (number of rows) and ``embdim`` (vector width) settings.
  """

  def __init__(self):
    super(InputEmbeddings, self).__init__()
    self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embdim)

  def forward(self, x):
    """Return the embedding vector for every index contained in ``x``."""
    embedded = self.embeddings(x)
    return embedded


In [8]:
class PositionalEmbeddings(nn.Module):
  """Fixed sinusoidal position table with dropout.

  A ``(1, block_size, embdim)`` table is built once and stored as the
  non-trainable buffer ``pe``: even columns hold ``sin(pos * freq)`` and odd
  columns hold ``cos(pos * freq)``.

  NOTE(review): ``forward`` returns only the (dropped-out) slice of the table;
  the values of ``x`` are not added to it. Confirm that the caller performs
  the addition with the token embeddings.
  """

  def __init__(self):
    super(PositionalEmbeddings, self).__init__()
    self.dropout = nn.Dropout(dropout)

    # One row per position, as a column vector so it broadcasts against freqs.
    positions = torch.arange(0, block_size, dtype=torch.float).unsqueeze(1)
    # One frequency per (sin, cos) column pair: 10000^(-2i / embdim).
    exponent_scale = -math.log(10000.0) / embdim
    freqs = torch.exp(torch.arange(0, embdim, 2).float() * exponent_scale)

    table = torch.zeros(block_size, embdim)
    table[:, 0::2] = torch.sin(positions * freqs)
    table[:, 1::2] = torch.cos(positions * freqs)

    # Leading axis of size 1 so the table broadcasts over a batch dimension.
    self.register_buffer('pe', table.unsqueeze(0))

  def forward(self, x):
    """Return the first ``x.shape[1]`` rows of the table, after dropout.

    Only the second dimension of ``x`` is read; its values are not used.
    """
    seq_len = x.shape[1]
    return self.dropout(self.pe[:, :seq_len, :])

In [9]:
class LayerNormalization(nn.Module):
  """Layer normalization over the last dimension with learnable scale/shift.

  Computes ``alpha * (x - mean) / sqrt(var + eps) + bias`` where ``mean`` and
  ``var`` are taken over the last dimension of ``x``.

  Args:
    features: size of the last (normalized) dimension. ``None`` (default)
      uses the module-level ``embdim``.
    epsilon: constant added to the variance for numerical stability. ``None``
      (default) uses the module-level ``eps``.
  """

  def __init__(self, features=None, epsilon=None):
    # nn.Module must be initialised before any parameter is assigned to self.
    super().__init__()
    if features is None:
      features = embdim
    self.eps = eps if epsilon is None else epsilon
    # nn.Parameter (singular) registers the tensors as trainable weights.
    self.alpha = nn.Parameter(torch.ones(features))
    self.bias = nn.Parameter(torch.zeros(features))

  def forward(self, x):
    """Normalize ``x`` along its last dimension and apply scale and shift."""
    xmean = x.mean(dim=-1, keepdim=True)
    # Biased (population) variance, as in torch.nn.LayerNorm; the unbiased
    # estimator would also yield NaN when the last dimension has size 1.
    xvar = x.var(dim=-1, keepdim=True, unbiased=False)
    return self.alpha * (x - xmean) / torch.sqrt(xvar + self.eps) + self.bias



In [10]:
class FeedForward(nn.Module):
  """Two-layer MLP applied to the last dimension.

  Pipeline: ``Linear(embdim -> 3*embdim)``, ``ReLU``,
  ``Linear(3*embdim -> embdim)``, ``Dropout(dropout)``. Sizes and the dropout
  probability come from the module-level settings.
  """

  def __init__(self):
    super(FeedForward, self).__init__()
    hidden = 3 * embdim
    stages = [
        nn.Linear(embdim, hidden),
        nn.ReLU(),
        nn.Linear(hidden, embdim),
        nn.Dropout(dropout),
    ]
    self.m = nn.Sequential(*stages)

  def forward(self, x):
    """Run ``x`` through the MLP and return the result."""
    return self.m(x)


In [11]:
class MultiHeadAttentionBlock(nn.Module):
  """Multi-head scaled dot-product attention.

  Inputs are projected with separate linear layers, split into heads,
  attended per head, merged back and passed through an output projection.

  Args:
    d_model: model width. ``None`` (default) uses the module-level ``embdim``.
    heads: number of attention heads. ``None`` (default) uses the
      module-level ``no_of_heads``.
    dropout_p: dropout probability applied to the attention weights. ``None``
      (default) uses the module-level ``dropout``.

  Raises:
    ValueError: if ``d_model`` is not divisible by ``heads``.
  """

  def __init__(self, d_model=None, heads=None, dropout_p=None):
    # nn.Module must be initialised before sub-modules are assigned to self.
    super().__init__()
    self.d_model = embdim if d_model is None else d_model
    self.heads = no_of_heads if heads is None else heads
    if self.d_model % self.heads != 0:
      raise ValueError("d_model must be divisible by the number of heads")
    self.head_dim = self.d_model // self.heads

    self.q = nn.Linear(self.d_model, self.d_model)
    self.k = nn.Linear(self.d_model, self.d_model)
    self.v = nn.Linear(self.d_model, self.d_model)
    self.proj = nn.Linear(self.d_model, self.d_model)
    # Stored on self so forward() can use it and train()/eval() toggles it.
    self.dropout = nn.Dropout(dropout if dropout_p is None else dropout_p)

  @staticmethod
  def attention(query, key, value, mask, dropout):
    """Scaled dot-product attention.

    Args:
      query, key, value: tensors whose last two dimensions are
        (sequence, head_dim).
      mask: ``None`` or a tensor broadcastable to the score matrix; positions
        where ``mask == 0`` receive zero attention weight.
      dropout: ``None`` or a callable applied to the attention weights.

    Returns:
      ``(weights @ value, weights)``.
    """
    head_dim = query.shape[-1]
    attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(head_dim)
    if mask is not None:
      # masked_fill is out-of-place, so its result has to be kept.
      attention_scores = attention_scores.masked_fill(mask == 0, float("-inf"))
    attention_scores = attention_scores.softmax(dim=-1)
    if dropout is not None:
      attention_scores = dropout(attention_scores)
    return (attention_scores @ value), attention_scores

  def _split_heads(self, t):
    """Reshape (B, T, C) into (B, heads, T, head_dim)."""
    return t.view(t.shape[0], t.shape[1], self.heads, self.head_dim).transpose(1, 2)

  def forward(self, query, key, val, mask):
    """Attend from ``query`` positions over ``key``/``val`` positions.

    For self-attention all three inputs are the same tensor; for
    cross-attention ``key`` and ``val`` come from another sequence.

    Args:
      query, key, val: tensors of shape (B, T, C) with C equal to the model
        width; ``key`` and ``val`` may have a different T than ``query``.
      mask: ``None`` or a tensor broadcastable to (B, heads, T_query, T_key).

    Returns:
      Tensor of shape (B, T_query, C).
    """
    q = self._split_heads(self.q(query))
    k = self._split_heads(self.k(key))
    v = self._split_heads(self.v(val))

    x, _ = MultiHeadAttentionBlock.attention(q, k, v, mask, self.dropout)
    # (B, heads, T, head_dim) -> (B, T, C): merge the heads back together.
    x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.d_model)
    return self.proj(x)


In [12]:
class ResidualConnection(nn.Module):
  """Residual wrapper computing ``x + dropout(sublayer(norm(x)))``.

  The input is normalized before it is handed to the sub-layer; the sub-layer
  output goes through dropout and is then added to the un-normalized input.
  """

  def __init__(self):
    super(ResidualConnection, self).__init__()
    self.dropout = nn.Dropout(dropout)
    self.norm = LayerNormalization()

  def forward(self, x, sublayer):
    """Apply ``sublayer`` (a callable taking one tensor) around a skip path."""
    normed = self.norm(x)
    branch = self.dropout(sublayer(normed))
    return x + branch

In [15]:
class EncoderBlock(nn.Module):
  """One encoder layer: self-attention then feed-forward, each residual-wrapped.

  Args:
    s_attn: attention module called as ``s_attn(q, k, v, mask)``.
    ffwd: feed-forward module called with a single tensor.
  """

  def __init__(self, s_attn, ffwd):
    super(EncoderBlock, self).__init__()
    self.selfattn = s_attn
    self.ffwd = ffwd
    wrappers = [ResidualConnection(), ResidualConnection()]
    self.residual_connections = nn.ModuleList(wrappers)

  def forward(self, x, src_mask):
    """Run ``x`` through both sub-layers, passing ``src_mask`` to attention."""
    attn_wrapper, ffwd_wrapper = self.residual_connections

    def attend(normed):
      # Self-attention: the same tensor serves as query, key and value.
      return self.selfattn(normed, normed, normed, src_mask)

    x = attn_wrapper(x, attend)
    return ffwd_wrapper(x, self.ffwd)

In [None]:
class Encoder(nn.Module):
  """Stack of encoder layers followed by a final layer normalization.

  Args:
    layers: the layers to apply in order; each is called as
      ``layer(x, mask)``. May be an ``nn.ModuleList`` (or any ``nn.Module``
      container) or a plain iterable of modules.
  """

  def __init__(self, layers):
    super().__init__()
    # A plain list/tuple stored on self is not registered with nn.Module, so
    # its parameters would be missing from .parameters(), .to() and
    # state_dict(). Wrap anything that is not already a module container.
    if not isinstance(layers, nn.Module):
      layers = nn.ModuleList(layers)
    self.layers = layers
    self.norm = LayerNormalization()

  def forward(self, x, mask):
    """Apply every layer to ``x`` with ``mask``, then normalize the result."""
    for layer in self.layers:
      x = layer(x, mask)
    return self.norm(x)

In [None]:
class DecoderBlock(nn.Module):
  def __init__(self, selfattn, crossattn, ffwd):
    super().__init__()
    self.selfattn = selfattn
    self.crossattn = crossattn
    self.ffwd = ffwd
    self.residual_connections = nn.ModuleList([ResidualConnection() for _ in range(3)])

  def forward(self, x, encoder_output, src_mask, tgt_mask):

