In [73]:
!pip install transformers --quiet

In [74]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from transformers import AutoConfig, AutoTokenizer

# Load the pretrained tokenizer once all imports are done.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [75]:
# Inspect the raw tokenizer output (input_ids, token_type_ids, attention_mask)
# as PyTorch tensors, without special tokens added.
tokenizer(
  'Football is the best sport in the world. brazil is the most popular national team '
  'whereas Real Madrid is the most successfull club.',
  return_tensors='pt',
  add_special_tokens=False,
)

{'input_ids': tensor([[ 2289,  1110,  1103,  1436,  4799,  1107,  1103,  1362,   119, 12418,
          5303,  1233,  1110,  1103,  1211,  1927,  1569,  1264,  6142,  5230,
          6331,  1110,  1103,  1211,  2265,  1233,  1526,   119]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}

In [76]:
# Fetch the published BERT hyper-parameters (hidden_size, num_attention_heads, vocab_size, ...).
# NOTE(review): this is the 'bert-base-uncased' config, whereas the tokenizer used in this
# notebook is loaded from 'bert-base-cased' -- config.vocab_size therefore describes a
# different vocabulary than the one producing the token ids; confirm this is intended.
config=AutoConfig.from_pretrained('bert-base-uncased')
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.29.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [77]:
# One trainable hidden_size-dimensional vector per token id.  The table is sized from the
# tokenizer that actually produces the ids (rather than config.vocab_size, which comes from
# a different checkpoint's vocabulary) so every id the tokenizer can emit has a row.
embedding=nn.Embedding(len(tokenizer),config.hidden_size)

In [78]:
# Sample sentence that is embedded and pushed through the attention layers.
text=(
  'Football is the best sport in the world. '
  'brazil is the most popular national team whereas Real Madrid '
  'is the most successfull club .'
)

In [79]:
# Token ids (no special tokens) -> embedding vectors.  The same tensor is later passed
# as query, key and value.
qkv=embedding(
  tokenizer(text,add_special_tokens=False,return_tensors='pt')['input_ids']
)
qkv.shape

torch.Size([1, 28, 768])

In [80]:
def compute_dot_product(query,key,value,mask=None):
  """Scaled dot-product attention over the last two axes.

  Scores are ``query @ key^T / sqrt(d_k)`` with ``d_k = key.shape[-1]``; a
  softmax over the last axis turns them into weights which are then used to
  average ``value``.  ``torch.matmul`` treats every leading axis as a batch
  axis, so the second-to-last axis is the one attention mixes over: callers
  that want attention across tokens must place the sequence axis there
  (e.g. ``(batch, heads, seq, head_dim)``).

  Args:
    query: tensor of shape ``(..., L_q, d_k)``.
    key: tensor of shape ``(..., L_k, d_k)``.
    value: tensor of shape ``(..., L_k, d_v)``.
    mask: optional tensor broadcastable to ``(..., L_q, L_k)``.  Positions
      where it equals 0 receive zero attention weight.  A query row that is
      masked everywhere produces NaN because its softmax has no finite entry.

  Returns:
    Tensor of shape ``(..., L_q, d_v)``.
  """
  scale=key.size(-1)**0.5 # sqrt(d_k) as a plain Python float
  scores=torch.matmul(query,key.transpose(-2,-1))/scale # (..., L_q, L_k)
  if mask is not None:
    scores=scores.masked_fill(mask==0,float('-inf'))
  weights=torch.softmax(scores,dim=-1) # each query row sums to 1
  return torch.matmul(weights,value) # (..., L_q, d_v)

In [81]:
class SelfAttention(nn.Module):
  """Per-head scaled dot-product attention over the sequence axis.

  The embedding axis of each input is split into ``num_heads`` chunks of
  ``head_dim`` features, every chunk is passed through the q/k/v projection,
  and attention is computed independently for each head across the tokens.

  NOTE(review): ``wq``/``wk``/``wv`` map ``head_dim -> head_dim`` and are
  therefore shared by all heads; the more common formulation projects the
  full ``embedd_dim`` before splitting.  Kept as in the original design.

  Args:
    embedd_dim: size of the embedding axis of the inputs.
    num_heads: number of attention heads; must divide ``embedd_dim``.

  Raises:
    ValueError: if ``embedd_dim`` is not divisible by ``num_heads``.
  """
  def __init__(self,embedd_dim,num_heads):
    super().__init__()
    if embedd_dim%num_heads!=0:
      raise ValueError(f'embedd_dim ({embedd_dim}) must be divisible by num_heads ({num_heads})')
    self.num_heads=num_heads # e.g. 12
    self.head_dim=embedd_dim//num_heads # e.g. 768//12 = 64
    self.wq=nn.Linear(self.head_dim,self.head_dim)
    self.wk=nn.Linear(self.head_dim,self.head_dim)
    self.wv=nn.Linear(self.head_dim,self.head_dim)

  def _split_heads(self,x):
    """Reshape (batch, seq, embedd_dim) into (batch, seq, num_heads, head_dim)."""
    return x.reshape(x.size(0),-1,self.num_heads,self.head_dim)

  def forward(self,query,key,value):
    """Return the attended values with shape (batch, seq, num_heads, head_dim)."""
    # Project per head, then move the head axis in front of the sequence axis so
    # that the last two axes are (seq, head_dim).  The attention helper mixes over
    # the second-to-last axis; without this transpose it would attend across the
    # heads of a single token instead of across the tokens of the sentence.
    Q=self.wq(self._split_heads(query)).transpose(1,2) # batch x heads x seq x head_dim
    K=self.wk(self._split_heads(key)).transpose(1,2) # batch x heads x seq x head_dim
    V=self.wv(self._split_heads(value)).transpose(1,2) # batch x heads x seq x head_dim
    attended=compute_dot_product(Q,K,V) # batch x heads x seq x head_dim
    # Back to the layout callers expect; contiguous so a later .view() can merge the heads.
    return attended.transpose(1,2).contiguous() # batch x seq x heads x head_dim

In [82]:
class MultiHeadAttention(nn.Module):
  """Multi-head attention followed by a linear output projection.

  Args:
    embdd_dim: size of the embedding axis of the inputs and of the output.
    num_heads: number of heads handed to ``SelfAttention``.
  """
  def __init__(self,embdd_dim,num_heads):
    super().__init__()
    self.embdd_dim=embdd_dim
    self.att=SelfAttention(embdd_dim,num_heads)
    self.fc=nn.Linear(embdd_dim,embdd_dim)

  def forward(self,query,key,value):
    """Attend, merge the heads and project; returns (batch, seq, embdd_dim)."""
    attention=self.att(query,key,value) # batch x seq x heads x head_dim
    # Merge the head axis back into a single embedding axis.
    attention=attention.reshape(query.size(0),-1,self.embdd_dim) # batch x seq x embdd_dim
    # Return the projected tensor; previously the projection was computed and then dropped.
    return self.fc(attention)

In [83]:
# Smoke test of a single attention layer, sized from config rather than literal 768 / 12.
MultiHeadAttention(config.hidden_size,config.num_attention_heads)(qkv,qkv,qkv).shape

torch.Size([1, 28, 768])

In [84]:
class FeedForward(nn.Module):
  """Two-layer position-wise MLP: expand to twice the width, GELU, project back.

  Args:
    embedd_dim: size of the last axis of the input and of the output.
  """
  def __init__(self,embedd_dim):
    super().__init__()
    hidden_dim=2*embedd_dim
    self.fc1=nn.Linear(embedd_dim,hidden_dim)
    self.fc2=nn.Linear(hidden_dim,embedd_dim)

  def forward(self,x):
    """Apply fc1 -> GELU -> fc2 along the last axis of ``x``."""
    return self.fc2(F.gelu(self.fc1(x)))

In [85]:
class ResidualBlock(nn.Module):
  """Wraps a callable ``fn`` with a skip connection: ``forward(x) = fn(x) + x``.

  Args:
    fn: module or callable whose output has a shape that can be added to its input.
  """
  def __init__(self,fn):
    super().__init__()
    self.fn=fn

  def forward(self,x):
    """Return ``fn(x) + x`` as a new tensor."""
    # Out-of-place add on purpose: an in-place ``+=`` on fn's output would
    # overwrite the caller's tensor whenever fn returns its input unchanged, and
    # would invalidate activations that autograd saved for fn's backward pass.
    return self.fn(x)+x

In [86]:
class TransformerEncoderBlock(nn.Module):
  """One encoder layer: self-attention and feed-forward sub-layers.

  Each sub-layer output goes through dropout, is added to the sub-layer input
  and the sum is layer-normalised (normalisation after the residual add).

  Args:
    emb_size: width of the token representations.
    num_heads: number of attention heads.
    drop_p: dropout probability applied to both sub-layer outputs.
  """
  def __init__(self,emb_size,num_heads,drop_p=0.3):
    super().__init__()
    self.attention=MultiHeadAttention(emb_size,num_heads)
    self.norm1=nn.LayerNorm(emb_size)
    self.feed_forward=FeedForward(emb_size)
    self.norm2=nn.LayerNorm(emb_size)
    self.dropout=nn.Dropout(drop_p)

  def forward(self,x):
    """Run both sub-layers on ``x``; the output has the same shape as ``x``."""
    # Self-attention sub-layer: x acts as query, key and value.
    attended=self.norm1(x+self.dropout(self.attention(x,x,x)))
    # Position-wise feed-forward sub-layer.
    return self.norm2(attended+self.dropout(self.feed_forward(attended)))

In [87]:
class TransformerEncoder(nn.Sequential):
  """``depth`` TransformerEncoderBlock layers applied one after another."""
  def __init__(self, depth,emb_size, num_heads):
    layers=[]
    for _ in range(depth):
      layers.append(TransformerEncoderBlock(emb_size, num_heads))
    super().__init__(*layers)

In [88]:
# Six stacked encoder blocks, with width and head count taken from the BERT config.
Encoder=TransformerEncoder(
  depth=6,
  emb_size=config.hidden_size,
  num_heads=config.num_attention_heads,
)

In [89]:
# Forward pass of the embedded sentence through the encoder stack.
# NOTE(review): no .eval() call or RNG seed is visible in this notebook, so dropout is
# presumably active and the printed values will differ between runs -- confirm.
Encoder(qkv)

tensor([[[ 0.3486,  0.4549, -1.4903,  ..., -0.9687,  0.1448, -1.1709],
         [ 0.9671, -0.3488,  0.0258,  ..., -0.6450, -0.5667, -0.2769],
         [-0.4156,  0.2014, -1.2503,  ..., -0.3972, -0.7647, -0.8837],
         ...,
         [ 0.2798, -2.4494,  0.6453,  ..., -0.3680,  0.2216, -2.2665],
         [ 0.5365, -1.9726,  0.4352,  ..., -0.6160, -0.2940, -1.2681],
         [-0.7834, -0.1028, -1.5574,  ..., -0.5986, -2.5298,  0.5569]]],
       grad_fn=<NativeLayerNormBackward0>)