In [1]:
import numpy 
import torch
import torch.nn.functional as F
import torch.nn as nn

In [20]:
def scaled_dot_attention(Q,K,V,mask):
    """Scaled dot-product attention: ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

    Works for plain 2-D matrices as well as inputs carrying extra leading
    (batch / head) dimensions, because every step operates on the last two
    axes only.

    Args:
        Q: queries, shape ``(..., L_q, d_k)``.
        K: keys, shape ``(..., L_k, d_k)``.
        V: values, shape ``(..., L_k, d_v)``.
        mask: ``None`` or a tensor broadcastable to the ``(..., L_q, L_k)``
            score matrix. Positions where ``mask == 0`` are excluded.

    Returns:
        Tensor of shape ``(..., L_q, d_v)``: the attention-weighted values.
    """
    d_k = K.size(-1)
    # Swap only the last two axes so leading batch dimensions are preserved.
    scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_k ** 0.5)
    if mask is not None:
        # A large negative score becomes a weight of ~0 after the softmax.
        scores = scores.masked_fill(mask == 0, -1e9)
    # Normalise over the key axis (the last one). Using dim=1 is only
    # correct for 2-D inputs and silently wrong for batched ones.
    weights = F.softmax(scores, dim=-1)
    # matmul (unlike mm) also accepts batched operands.
    return torch.matmul(weights, V)

In [21]:
# Toy example: a single query row, key row and value row, each with five features.
Q = torch.tensor([[13.0, 424.0, 424.0, 254.0, 2.0]])
K = torch.tensor([[13.0, 214.0, 14.0, 864.0, 32.0]])
V = torch.tensor([[64.0, 43.0, 54.0, 1.0, 4.0]])
# One mask entry per (query row, key row) pair; all ones means nothing is masked.
mask = torch.ones(Q.size(0), K.size(0))

In [22]:
# Run attention on the toy inputs; the all-ones mask leaves every score untouched.
attention_score = scaled_dot_attention(Q,K,V,mask)

In [23]:
# Q and K each hold one row, so the score matrix is 1x1, its softmax is 1,
# and the attention output is simply V.
attention_score

tensor([[64., 43., 54.,  1.,  4.]])

In [24]:
class AttentionHead(nn.Module):
    """A single self-attention head with its own Q/K/V projections.

    Args:
        embedding_dim: size of the last dimension of the input.
        head_dim: size of the last dimension of the projected queries,
            keys and values (and therefore of the output).
        mask: when truthy, a causal (lower-triangular) mask is applied so a
            position only attends to itself and earlier positions. The mask
            is derived from the input's own sequence length on every call
            instead of from notebook-level variables, so the class works on
            a fresh kernel and for any sequence length.
    """

    def __init__(self,embedding_dim,head_dim,mask=False):
        super(AttentionHead,self).__init__()
        self.Q = nn.Linear(embedding_dim,head_dim)
        self.K = nn.Linear(embedding_dim,head_dim)
        self.V = nn.Linear(embedding_dim,head_dim)
        # Only remember *whether* to mask; the mask tensor depends on the
        # sequence length, which is unknown until forward() sees the input.
        self.causal = bool(mask)

    def forward(self,x):
        """Project ``x`` to queries, keys and values and attend over it.

        Args:
            x: tensor whose last dimension is ``embedding_dim`` and whose
                second-to-last dimension is the sequence axis.

        Returns:
            The attention output; its last dimension is ``head_dim``.
        """
        q = self.Q(x)
        k = self.K(x)
        v = self.V(x)
        attn_mask = None
        if self.causal:
            seq_len = x.size(-2)
            # Ones on and below the diagonal: query i may look at keys 0..i.
            attn_mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device))
        return scaled_dot_attention(q,k,v,attn_mask)

In [34]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention built from independent ``AttentionHead``s.

    Each head projects the input down to ``embedding_dim // num_heads``
    features; the head outputs are concatenated along the last dimension
    and mixed by the output projection ``Wo``.

    Args:
        embedding_dim: size of the last dimension of the input and output.
        num_heads: number of attention heads; must be positive and divide
            ``embedding_dim`` exactly.

    Raises:
        ValueError: if ``num_heads`` is not positive or does not divide
            ``embedding_dim``.
    """

    def __init__(self,embedding_dim,num_heads):
        super(MultiHeadAttention,self).__init__()
        # Fail early: otherwise the concatenated head outputs would not
        # match Wo's input width and forward() would die with an opaque
        # shape-mismatch error.
        if num_heads <= 0 or embedding_dim % num_heads != 0:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must be divisible by "
                f"a positive num_heads ({num_heads})"
            )
        self.head_dim = embedding_dim//num_heads
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.heads = nn.ModuleList([AttentionHead(self.embedding_dim,self.head_dim) for _ in range(num_heads)])
        self.Wo = nn.Linear(embedding_dim,embedding_dim)

    def forward(self,x,mask=None):
        """Apply every head to ``x`` and merge the results.

        Args:
            x: tensor whose last dimension is ``embedding_dim``.
            mask: optional attention mask shared by all heads; positions
                where ``mask == 0`` are excluded. When ``None`` each head
                runs with its own default behaviour.

        Returns:
            Tensor whose last dimension is ``embedding_dim``.
        """
        if mask is None:
            head_outputs = [head(x) for head in self.heads]
        else:
            # Honour the caller-supplied mask (it used to be silently
            # ignored) by running attention on each head's projections.
            head_outputs = [
                scaled_dot_attention(head.Q(x), head.K(x), head.V(x), mask)
                for head in self.heads
            ]
        concatenated = torch.cat(head_outputs,dim=-1)
        return self.Wo(concatenated)

In [35]:
class FeedForward(nn.Module):
    """Feed-forward sub-layer with a residual connection and layer norm.

    Computes ``LayerNorm(x + Dropout(fc2(GELU(fc1(x)))))``.

    Args:
        d_model: size of the last dimension of the input and the output.
        dff: width of the hidden layer between the two linear maps.
        dropout: drop probability applied to the second linear map's output.
    """

    def __init__(self, d_model=512, dff=2048, dropout=0.3):
        super().__init__()
        # Expand to the hidden width, then project back to the model width.
        self.fc1 = nn.Linear(in_features=d_model, out_features=dff)
        self.fc2 = nn.Linear(in_features=dff, out_features=d_model)
        self.dropout = nn.Dropout(p=dropout)
        self.layernorm = nn.LayerNorm(normalized_shape=d_model)

    def forward(self, x):
        """Return the normalised sum of ``x`` and its feed-forward update."""
        hidden = F.gelu(self.fc1(x))
        update = self.dropout(self.fc2(hidden))
        # Residual connection first, normalisation second.
        return self.layernorm(x + update)

In [9]:
from transformers import AutoTokenizer
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show
from IPython.display import display, HTML

In [10]:
# Sentence whose attention pattern will be visualised.
text = "Cat Chased the quick mouse and jumped over the brown fox which was sleeping on the floor"

# Tokenizer and bertviz's BertModel are loaded from the same checkpoint name.
model_ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = BertModel.from_pretrained(model_ckpt)

# Neuron view for layer 0 / head 0; the returned object's `.data` is written
# to an HTML file in a later cell.
vis = show(
    model,
    "bert",
    tokenizer,
    text,
    display_mode='dark',
    layer=0,
    head=0,
    html_action='return',
)

  state_dict = torch.load(resolved_archive_file, map_location='cpu')


In [27]:
def positional_encoding(embedding,pos):
    """Sinusoidal positional encoding for a single position ``pos``.

    For feature index ``k`` of a ``d``-dimensional embedding, with
    ``i = k // 2``:

        even k:  sin(pos / 10000 ** (2 * i / d))
        odd  k:  cos(pos / 10000 ** (2 * i / d))

    so each (even, odd) pair of features shares one frequency.

    Args:
        embedding: tensor whose last dimension is the feature dimension;
            only its shape, dtype and device are used.
        pos: scalar position (number) to encode.

    Returns:
        A new tensor with the same shape, dtype and device as ``embedding``
        whose last dimension holds the encoding for ``pos`` (repeated
        across any leading dimensions).
    """
    embedding_dim = embedding.size(-1)
    # Work in float64 on the CPU for accuracy; cast/move once at the end.
    index = torch.arange(embedding_dim, dtype=torch.float64)
    is_odd = index % 2
    # (index - is_odd) is 2*i for both members of a pair, giving them the
    # same exponent 2*i / d.
    inv_wavelength = torch.pow(10000.0, -(index - is_odd) / embedding_dim)
    angle = pos * inv_wavelength
    encoding = torch.where(is_odd == 0, torch.sin(angle), torch.cos(angle))
    encoding = encoding.to(device=embedding.device, dtype=embedding.dtype)
    # Fill the feature (last) axis and repeat over the leading axes; clone
    # so the caller gets an ordinary, writable tensor.
    return encoding.expand_as(embedding).clone()

In [28]:
# Save the visualisation HTML to disk. The encoding is given explicitly so
# the file is written the same way on every platform instead of using the
# locale's default encoding.
with open("vis.html", "w", encoding="utf-8") as f:
    f.write(vis.data)

print("Visualization saved as 'vis.html'. Open this file in a browser to view.")

Visualization saved as 'vis.html'. Open this file in a browser to view.
