In [1]:
import torch
import numpy as np
import torch.nn as nn

In [2]:
## parameters to define first

# X  : input matrix, shape (seq_len, dmodel)
# Wq : trainable query projection, shape (dmodel, dk)
# Wk : trainable key projection, shape (dmodel, dk)
# Wv : trainable value projection, shape (dmodel, dv), with dv = dk
# Wo : trainable output projection, shape (dk*h, dmodel), where h is the number of heads
# Q = X @ Wq, shape (seq_len, dk)
# K = X @ Wk, shape (seq_len, dk)
# V = X @ Wv, shape (seq_len, dk), since dv = dk
# A = (Q @ K.t()) / sqrt(dk), the scaled attention scores, shape (seq_len, seq_len)
# Masking: entries of A above the diagonal (future positions) are set to -inf, shape (seq_len, seq_len)
# then Softmax(A), taken row-wise, shape (seq_len, seq_len)
# second to last: Softmax(A) @ V, shape (seq_len, dk) for each head
# lastly: Concat(head_1, ..., head_h) @ Wo, shape (seq_len, dmodel), where each head is Softmax(A) @ V

In [3]:
# Toy input for the sentence "Dream big and work for it":
# one row per token, three feature values per row.
X = torch.tensor(
    [
        [0.72, 0.45, 0.31],  # Dream
        [0.75, 0.20, 0.55],  # big
        [0.30, 0.80, 0.40],  # and
        [0.85, 0.35, 0.60],  # work
        [0.55, 0.15, 0.75],  # for
        [0.25, 0.20, 0.85],  # it
    ]
)

In [4]:
## Implementation of causal scaled dot-product attention as an nn.Module class





class CausalAttentionSingleHead(nn.Module):
    """Single-head causal (masked) scaled dot-product self-attention.

    The input is projected into query, key and value spaces of width ``dk``.
    Scores ``Q @ K^T / sqrt(dk)`` are masked so that position ``i`` can only
    attend to positions ``<= i``, normalised with a softmax, passed through
    dropout and finally used to mix the value vectors.

    Args:
        dk: Width of the query/key/value projections.
        dmodel: Size of the last dimension of the input.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, dk, dmodel, dropout):
        super().__init__()
        self.dk = dk
        self.dmodel = dmodel
        # Creation order (query, value, key) is kept so that a fixed RNG seed
        # yields the same initial weights as before.
        self.weight_query = nn.Linear(self.dmodel, self.dk)
        self.weight_value = nn.Linear(self.dmodel, self.dk)
        self.weight_key = nn.Linear(self.dmodel, self.dk)
        # Softmax over the key axis (last dimension of the score matrix).
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, X):
        """Apply causal self-attention to ``X``.

        Args:
            X: Tensor of shape ``(seq_len, dmodel)``; any number of leading
                batch dimensions ``(..., seq_len, dmodel)`` is also accepted.

        Returns:
            A tuple ``(context, weights)`` where ``context`` has shape
            ``(..., seq_len, dk)`` and ``weights`` has shape
            ``(..., seq_len, seq_len)``. ``weights`` is returned *after*
            dropout, so in training mode its rows need not sum to 1.
        """
        # Query, key and value projections.
        Q = self.weight_query(X)
        K = self.weight_key(X)
        V = self.weight_value(X)

        # Scaled scores. transpose(-2, -1) (rather than .t()) keeps this valid
        # for batched input as well as for a plain 2-D matrix.
        attn_score = (Q @ K.transpose(-2, -1)) / (self.dk ** 0.5)

        # Causal mask: True strictly above the diagonal, i.e. future positions.
        # It is created on X's device so the module also works on GPU, and it
        # broadcasts over any leading batch dimensions.
        seq_len = X.shape[-2]
        mask = torch.triu(
            torch.ones((seq_len, seq_len), device=X.device), diagonal=1
        ).bool()
        masked_attn_score = attn_score.masked_fill(mask, -torch.inf)

        # Softmax turns the -inf entries into exact zeros; dropout follows.
        A = self.dropout(self.softmax(masked_attn_score))

        # Weighted sum of the value vectors.
        return A @ V, A
    





In [5]:
# Demo head: 2-dimensional projections, dropout probability 0.2,
# input width taken from the number of columns of X.
obj = CausalAttentionSingleHead(dk=2, dmodel=X.shape[1], dropout=0.2)

In [6]:
# Call the module itself rather than .forward(): nn.Module.__call__ runs any
# registered hooks, which a direct .forward() call silently skips.
obj(X)

(tensor([[ 0.0000,  0.0000],
         [ 0.0816,  0.0048],
         [ 0.0771,  0.1133],
         [ 0.0877,  0.0613],
         [ 0.1039,  0.0662],
         [ 0.1934, -0.0301]], grad_fn=<MmBackward0>),
 tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.6362, 0.6138, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4394, 0.4178, 0.3928, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.3122, 0.2878, 0.3227, 0.0000, 0.0000],
         [0.2620, 0.0000, 0.2415, 0.2601, 0.2344, 0.0000],
         [0.2204, 0.2119, 0.0000, 0.2202, 0.1996, 0.1876]],
        grad_fn=<MulBackward0>))

In [5]:
import torch
# Toy input for the sentence "Dream big and work for it":
# one row per token, three feature values per row.
X = torch.tensor(
    [
        [0.72, 0.45, 0.31],  # Dream
        [0.75, 0.20, 0.55],  # big
        [0.30, 0.80, 0.40],  # and
        [0.85, 0.35, 0.60],  # work
        [0.55, 0.15, 0.75],  # for
        [0.25, 0.20, 0.85],  # it
    ]
)

In [6]:
from model.attention import CausalAttentionSingleHead

In [10]:
# dmodel is read from X (3 columns here) instead of being hard-coded, so the
# layer stays in sync with the input if the toy embeddings change.
causal_attn = CausalAttentionSingleHead(dk=512, dmodel=X.shape[1], dropout=0.2)

In [11]:
# Call the module itself rather than .forward() so that nn.Module.__call__
# can run registered hooks.
# NOTE(review): assumes model.attention.CausalAttentionSingleHead is an
# nn.Module — confirm against that module.
causal_attn(X)

tensor([[ 0.4875,  0.7571,  0.9558,  ...,  0.1489, -0.1114, -0.3259],
        [ 0.6144,  0.8338,  0.9190,  ...,  0.0772, -0.1810, -0.2799],
        [ 0.4871,  0.7891,  0.9095,  ...,  0.1710, -0.0326, -0.4410],
        [ 0.4393,  0.6424,  0.6818,  ...,  0.1050, -0.0481, -0.3064],
        [ 0.5082,  0.7065,  0.6984,  ...,  0.0807, -0.0764, -0.3162],
        [ 0.2883,  0.4394,  0.4239,  ...,  0.0748,  0.0014, -0.2629]],
       grad_fn=<MmBackward0>)

## Masked Multi-Head Attention Implementation

In [1]:
import torch
import torch.nn as nn
import numpy as np
import math


In [None]:
class MultiHeadAttention(nn.Module):
    """Masked (causal) multi-head self-attention.

    The input is projected to queries, keys and values of width ``dmodel``,
    split into ``h`` heads of width ``dk = dmodel // h``, attended per head
    with a causal mask, concatenated again and passed through the output
    projection ``w_o``.

    Args:
        h: Number of attention heads; must divide ``dmodel``.
        dmodel: Size of the last dimension of the input and of the output.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        AssertionError: If ``dmodel`` is not divisible by ``h``.
    """

    def __init__(self, h, dmodel, dropout):
        super().__init__()
        self.h = h
        self.dmodel = dmodel
        assert self.dmodel % self.h == 0, "head and dmodel configuration failed"
        self.dk = self.dmodel // self.h
        self.weight_query = nn.Linear(dmodel, dmodel)
        self.weight_key = nn.Linear(dmodel, dmodel)
        self.weight_value = nn.Linear(dmodel, dmodel)
        self.w_o = nn.Linear(dmodel, dmodel)
        # Softmax over the key axis (last dimension of the score tensor).
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, T):
        """Reshape ``(..., seq_len, dmodel)`` into ``(..., h, seq_len, dk)``."""
        return T.reshape(*T.shape[:-1], self.h, self.dk).transpose(-3, -2)

    def forward(self, X):
        """Apply masked multi-head self-attention to ``X``.

        Args:
            X: Tensor of shape ``(seq_len, dmodel)``; any number of leading
                batch dimensions ``(..., seq_len, dmodel)`` is also accepted.

        Returns:
            Tensor with the same shape as ``X``.
        """
        seq_len = X.shape[-2]

        # Project, then split the feature axis into h heads of width dk.
        Q_head = self._split_heads(self.weight_query(X))
        K_head = self._split_heads(self.weight_key(X))
        V_head = self._split_heads(self.weight_value(X))

        # Scaled dot-product scores, shape (..., h, seq_len, seq_len).
        attn_score = torch.matmul(Q_head, K_head.transpose(-1, -2)) / math.sqrt(self.dk)

        # Causal mask: True strictly above the diagonal, i.e. future positions.
        # It is created on X's device so the module also works on GPU, and it
        # broadcasts over the head and batch dimensions.
        mask = torch.triu(
            torch.ones(seq_len, seq_len, device=X.device), diagonal=1
        ).bool()
        attn_score_masked = attn_score.masked_fill(mask, -torch.inf)

        # Softmax turns the -inf entries into exact zeros; dropout follows.
        attn_weights = self.softmax(attn_score_masked)
        attn_weights_with_dropout = self.dropout(attn_weights)

        # Weighted sum of the values, shape (..., h, seq_len, dk).
        A = torch.matmul(attn_weights_with_dropout, V_head)

        # Move the head axis back next to dk and merge them: (..., seq_len, dmodel).
        concate_heads = A.transpose(-3, -2).reshape(*X.shape[:-1], self.dmodel)
        return self.w_o(concate_heads)
        







In [18]:
# 4 heads over a 512-wide model, i.e. 512 // 4 = 128 features per head.
MHA = MultiHeadAttention(
    h=4,
    dmodel=512,
    dropout=0.2,
)

In [19]:
# Random stand-in input. Note: `sentences` is used as the first dimension of X,
# which MultiHeadAttention.forward treats as the sequence length.
sentences, features = 10, 512

X = torch.randn(sentences, features)


In [20]:
# Sanity check of the input dimensions before feeding X to the attention layer.
X.size()

torch.Size([10, 512])

In [22]:
# Call the module itself rather than .forward(): nn.Module.__call__ runs any
# registered hooks, which a direct .forward() call silently skips.
MHA(X).shape

torch.Size([10, 512])