Self-Attention

In [2]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import math

def scaled_dot_product(q,k,v,mask=None):
    d_k=q.size()[-1]
    scaled=torch.matmul(q,k.transpose(-1,-2))/math.sqrt(d_k)
    if mask is not None:
        scaled+=mask
    attention=F.softmax(scaled,dim=-1)
    values=torch.matmul(attention,v)
    return attention,values

class MultiheadAttention(nn.Module):
    def __init__(self,input_dim,d_model,num_heads):
        super.__init__()
        self.input_dim=input_dim
        self.d_model=d_model
        self.num_heads=num_heads
        self.head_dim=self.d_model//num_heads
        self.qkv_layer=nn.Linear(input_dim,3*d_model)
        self.linear_layer=nn.Linear(d_model*d_model)
    
    def forward(self,x):
        batch_size,sequence_length,input_dim=x.size()
        qkv=self.qkv_layer(x)
        qkv=qkv.reshape(batch_size,self.num_heads,sequence_length,3*self.head_dim)
        q,k,v=qkv.chunk(3,dim=-1)
        attention,values=scaled_dot_product(q,k,v)
        



Positional Encoding

In [7]:
class PositionalEncoding(nn.Module):
    def __init__(self,max_sequence_length,d_model):
        super().__init__()
        self.max_sequence_length=max_sequence_length
        self.d_model=d_model
    def forward(self):
        even_i=torch.arange(0,self.max_sequence_length,2).float()
        denominator=torch.pow(10000,even_i/self.d_model)
        pos=torch.arange(self.max_sequence_length,dtype=torch.float).reshape(self.max_sequence_length,1)
        even_PE=torch.sin(pos/denominator)
        odd_PE=torch.cos(pos/denominator)
        stack=torch.stack([even_PE,odd_PE],dim=2)
        PE=torch.flatten(stack,start_dim=1,end_dim=2)
        return PE
pe=PositionalEncoding(6,10)
pe.forward()


        

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.1578,  0.9875,  0.0251,  0.9997],
        [ 0.9093, -0.4161,  0.3117,  0.9502,  0.0502,  0.9987],
        [ 0.1411, -0.9900,  0.4578,  0.8891,  0.0753,  0.9972],
        [-0.7568, -0.6536,  0.5923,  0.8057,  0.1003,  0.9950],
        [-0.9589,  0.2837,  0.7121,  0.7021,  0.1253,  0.9921]])

Layer Normalization

In [None]:
import torch
from torch import nn

class LayerNormalization(nn.Module):
    def __init__(self,parameter_shape,eps=1e-5):
        super().__init__()
        self.parameter_shape=parameter_shape
        self.eps=eps
        self.gamma=nn.Parameter(torch.ones(parameter_shape))
        self.beta=nn.Parameter(torch.zeros(parameter_shape))
    def forward(self,inputs):
        dims=[-(i+1) for i in range(len(self.parameter_shape))]
        mean=inputs.mean(dim=dims,keepdim=True)
        var=((inputs-mean)**2).mean(dim=dims,keepdim=True)
        std=(var+self.eps).sqrt()
        y=(inputs-mean)/std
        out=self.gamma*y+self.beta
        return out