In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math

torch.manual_seed(1)

<torch._C.Generator at 0x7f78e8095d90>

#Word Embedding:

In [3]:
class Embedder(nn.Module):
    """Token-embedding lookup table.

    Wraps a single ``nn.Embedding`` that maps integer token ids in
    ``[0, vocab_size)`` to dense vectors of width ``model_dim``.
    """

    def __init__(self, vocab_size, model_dim):
        super(Embedder, self).__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=model_dim)

    def forward(self, X):
        """Return the embedding vectors for the integer index tensor ``X``.

        The result has the shape of ``X`` with one extra trailing dimension
        of size ``model_dim``.
        """
        embedded = self.embed(X)
        return embedded

#Positional Encoding:

In [4]:
class PositionalEncoder(nn.Module):
    """Adds fixed sinusoidal positional encodings to a batch of embeddings.

    The encoding table is computed once at construction time and stored as a
    (non-trainable) buffer named ``pe`` of shape ``(1, max_seq_len, model_dim)``:

        pe[pos, 2k]     = sin(pos / 10000 ** (2k / model_dim))
        pe[pos, 2k + 1] = cos(pos / 10000 ** (2k / model_dim))

    Args:
        model_dim: width of the embedding vectors (even or odd).
        max_seq_len: longest sequence length that can be encoded.
    """

    def __init__(self, model_dim, max_seq_len = 80):
        super().__init__()

        self.model_dim = model_dim

        # Column vector of positions: (max_seq_len, 1).
        positions = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
        # One frequency per (sin, cos) pair. The loop index already steps by
        # two, so the exponent is i / model_dim (== 2k / model_dim), and the
        # sin and cos of a pair share the same frequency.
        inv_freq = torch.exp(
            torch.arange(0, model_dim, 2, dtype=torch.float32)
            * (-math.log(10000.0) / model_dim)
        )
        angles = positions * inv_freq  # (max_seq_len, ceil(model_dim / 2))

        pe = torch.zeros(max_seq_len, model_dim)
        pe[:, 0::2] = torch.sin(angles)
        # For odd model_dim there is one fewer cos column than sin columns.
        pe[:, 1::2] = torch.cos(angles[:, : model_dim // 2])

        # Leading batch dimension so the table broadcasts over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, X):
        """Return ``X`` plus the positional encodings for its sequence length.

        Args:
            X: tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``model_dim``.

        Raises:
            ValueError: if the sequence is longer than ``max_seq_len``.
        """
        seq_len = X.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {max_len}"
            )

        # Buffers never require grad, so the slice can be added directly
        # (no per-call copy through torch.tensor(...)).
        return X + self.pe[:, :seq_len]


# Backward-compatible alias for the original (misspelled) class name.
PositonalEncoder = PositionalEncoder
        
                                     

#Multi-headed attention

In [5]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Projects queries, keys and values with bias-free linear layers, splits the
    projections into ``heads`` heads of width ``model_dim // heads``, runs the
    module-level ``attention`` function on them, then concatenates the heads
    and applies a final bias-free output projection.

    Args:
        heads: number of attention heads; must divide ``model_dim``.
        model_dim: width of the input and output vectors.
        dropout: dropout probability of the ``nn.Dropout`` handed to
            ``attention``.

    Raises:
        ValueError: if ``model_dim`` is not divisible by ``heads``.
    """

    def __init__(self, heads, model_dim, dropout = 0.1):
        super().__init__()

        if model_dim % heads != 0:
            raise ValueError(
                f"model_dim ({model_dim}) must be divisible by heads ({heads})"
            )

        self.model_dim = model_dim
        self.d_k = model_dim // heads
        self.h = heads

        self.q_linear = nn.Linear(model_dim, model_dim, bias=False)
        self.k_linear = nn.Linear(model_dim, model_dim, bias=False)
        self.v_linear = nn.Linear(model_dim, model_dim, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(model_dim, model_dim, bias=False)

    def forward(self, q, k, v):
        """Attend from ``q`` over ``k``/``v`` and return the projected result.

        Inputs are indexed as (batch, sequence, model_dim); the output has
        the batch and sequence sizes of ``q`` and last dimension ``model_dim``.
        """
        bs = q.size(0)

        # Project, split the last dimension into (heads, d_k), then move the
        # head axis in front of the sequence axis: (bs, h, seq, d_k).
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)

        scores = attention(q, k, v, self.d_k, self.dropout)

        # Concatenate the heads back into model_dim and apply the output projection.
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.model_dim)
        return self.out(concat)


# Backward-compatible alias for the class name originally used in this cell.
MultiHeadedAttention = MultiHeadAttention


#Feed forward sublayer

In [6]:
class FeedForward(nn.Module):
    """Position-wise feed-forward sublayer: Linear -> ReLU -> Dropout -> Linear.

    Args:
        model_dim: width of the input and output vectors.
        d_ff: width of the hidden layer (2048 by default).
        dropout: dropout probability applied to the hidden activations.
    """

    def __init__(self, model_dim, d_ff=2048, dropout = 0.1):
        super(FeedForward, self).__init__()
        self.linear_1 = nn.Linear(in_features=model_dim, out_features=d_ff)
        self.dropout = nn.Dropout(p=dropout)
        self.linear_2 = nn.Linear(in_features=d_ff, out_features=model_dim)

    def forward(self, x):
        """Expand ``x`` to ``d_ff``, apply ReLU and dropout, project back."""
        hidden = self.linear_1(x)
        activated = F.relu(hidden)
        return self.linear_2(self.dropout(activated))

#Normalization

In [7]:
class Norm(nn.Module):
    """Normalisation over the last dimension with a learnable gain and shift.

    Each vector along the last axis is centred on its mean, divided by its
    standard deviation (``Tensor.std`` defaults) plus ``eps``, then scaled by
    ``alpha`` and offset by ``bias``.

    Args:
        model_dim: size of the last dimension of the inputs.
        eps: constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, model_dim, eps = 1e-6):
        super(Norm, self).__init__()

        self.size = model_dim
        self.eps = eps
        # Learnable calibration: gain starts at one, shift starts at zero.
        self.alpha = nn.Parameter(torch.ones(model_dim))
        self.bias = nn.Parameter(torch.zeros(model_dim))

    def forward(self, x):
        """Return the normalised, rescaled and shifted version of ``x``."""
        mean = x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        centred = x - mean
        return self.alpha * centred / spread + self.bias

#Encoder layer

In [8]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sublayer is applied to a normalised copy of its input and the
    (dropped-out) result is added back to the un-normalised input.

    Args:
        model_dim: width of the input and output vectors.
        heads: number of attention heads.
        dropout: dropout probability used for the residual branches and
            forwarded to the attention and feed-forward sublayers.
    """

    def __init__(self, model_dim, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(model_dim)
        self.norm_2 = Norm(model_dim)
        # The attention class in this notebook is defined under the name
        # MultiHeadedAttention; the previous reference to MultiHeadAttention
        # raised NameError.
        self.attn = MultiHeadedAttention(heads, model_dim, dropout=dropout)
        self.ff = FeedForward(model_dim, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``x`` after the self-attention and feed-forward sublayers."""
        # Self-attention: query, key and value are all the normalised input.
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2))
        # Feed-forward on the normalised attention output.
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x

#Decoder layer

In [9]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, encoder attention, feed-forward.

    Each sublayer is applied to a normalised copy of its input and the
    (dropped-out) result is added back to the un-normalised input.

    Args:
        model_dim: width of the input and output vectors.
        heads: number of attention heads.
        dropout: dropout probability used for the residual branches and
            forwarded to the attention and feed-forward sublayers.
    """

    def __init__(self, model_dim,heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(model_dim)
        self.norm_2 = Norm(model_dim)
        self.norm_3 = Norm(model_dim)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        # The attention class in this notebook is defined under the name
        # MultiHeadedAttention; the previous reference to MultiHeadAttention
        # raised NameError.
        self.attn_1 = MultiHeadedAttention(heads, model_dim, dropout=dropout)
        self.attn_2 = MultiHeadedAttention(heads, model_dim, dropout=dropout)
        self.ff = FeedForward(model_dim, dropout=dropout)

    def forward(self, x, e_outputs):
        """Return ``x`` after attending to itself and to ``e_outputs``.

        Args:
            x: decoder-side input.
            e_outputs: encoder output used as key and value in the second
                attention sublayer.
        """
        # NOTE(review): no mask is passed to the self-attention, so every
        # target position can attend to later positions. Autoregressive
        # training normally needs a causal mask here - TODO confirm intent.
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2))
        # Queries come from the decoder, keys/values from the encoder output.
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x

In [10]:
def attention(q, k, v, d_k, dropout=None, mask=None):
    """Scaled dot-product attention.

    Computes ``softmax(q @ k^T / sqrt(d_k)) @ v`` over the last two
    dimensions; any leading dimensions are handled by ``torch.matmul``.

    Args:
        q: query tensor.
        k: key tensor; its last dimension must match that of ``q``.
        v: value tensor; its second-to-last dimension must match that of ``k``.
        d_k: scaling dimension; the raw scores are divided by ``sqrt(d_k)``.
        dropout: optional callable (e.g. an ``nn.Dropout`` module) applied to
            the attention weights. Previously accepted but silently ignored.
        mask: optional tensor broadcastable to the score matrix; positions
            where ``mask == 0`` receive (effectively) zero attention weight.

    Returns:
        The attention-weighted combination of ``v``.
    """
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        # A large negative score becomes a zero weight after the softmax.
        scores = scores.masked_fill(mask == 0, -1e9)

    weights = F.softmax(scores, dim=-1)

    if dropout is not None:
        weights = dropout(weights)

    return torch.matmul(weights, v)

In [11]:
class Encoder(nn.Module):
    """Source-side stack: embedding, positional encoding, one encoder layer, norm.

    Args:
        vocab_size: number of rows in the source embedding table.
        model_dim: width of the embedding / hidden vectors.
        heads: number of attention heads in the encoder layer.
    """

    def __init__(self, vocab_size, model_dim,heads):
        super().__init__()

        self.embed = Embedder(vocab_size, model_dim)
        # The positional-encoding class in this notebook is defined under the
        # name PositonalEncoder; the previous reference to PositionalEncoder
        # raised NameError.
        self.pe = PositonalEncoder(model_dim)
        self.encode = EncoderLayer(model_dim, heads)
        self.norm = Norm(model_dim)

    def forward(self, src):
        """Encode the source token-index tensor ``src`` and return the normalised output."""
        x = self.embed(src)
        x = self.pe(x)
        x = self.encode(x)
        return self.norm(x)
    

In [12]:
class Decoder(nn.Module):
    """Target-side stack: embedding, positional encoding, one decoder layer, norm.

    Args:
        vocab_size: number of rows in the target embedding table.
        model_dim: width of the embedding / hidden vectors.
        heads: number of attention heads in the decoder layer.
    """

    def __init__(self, vocab_size, model_dim,heads):
        super().__init__()
        self.embed = Embedder(vocab_size, model_dim)
        # The positional-encoding class in this notebook is defined under the
        # name PositonalEncoder; the previous reference to PositionalEncoder
        # raised NameError.
        self.pe = PositonalEncoder(model_dim)
        self.decode = DecoderLayer(model_dim, heads)
        self.norm = Norm(model_dim)

    def forward(self, trg, e_outputs):
        """Decode ``trg`` against the encoder output ``e_outputs``.

        Args:
            trg: target token-index tensor.
            e_outputs: encoder output passed through to the decoder layer.
        """
        x = self.embed(trg)
        x = self.pe(x)
        x = self.decode(x, e_outputs)
        return self.norm(x)

In [13]:
class Transformer(nn.Module):
    """Encoder-decoder model with a linear projection onto the target vocabulary.

    Args:
        src_vocab: source vocabulary size, passed to ``Encoder``.
        trg_vocab: target vocabulary size, passed to ``Decoder`` and used as
            the width of the output projection.
        model_dim: hidden width shared by encoder, decoder and projection.
        heads: number of attention heads, passed to encoder and decoder.
    """

    def __init__(self, src_vocab, trg_vocab, model_dim,heads):
        super(Transformer, self).__init__()
        self.encoder = Encoder(src_vocab, model_dim, heads)
        self.decoder = Decoder(trg_vocab, model_dim, heads)
        self.out = nn.Linear(in_features=model_dim, out_features=trg_vocab)

    def forward(self, src, trg):
        """Encode ``src``, decode ``trg`` against it, and project to vocabulary scores."""
        memory = self.encoder(src)
        decoded = self.decoder(trg, memory)
        return self.out(decoded)