In [57]:
from collections import Counter
import numpy as np
import torch
import torch.nn as nn

In [58]:
sentences = [
    "Ruban is working on BERT models.",
    "This sentence contains multiple types of punctuation.",
    "How can BERT be fine-tuned for specific tasks?",
    "What is the current project you are working on?",
    "How does BERT handle different sentence structures?",
    "Ruban is learning deep learning techniques.",
    "BERT is a powerful model for NLP tasks.",
    "Understanding attention mechanisms is crucial."
]

# Lower-case and whitespace-split every sentence. Punctuation is not stripped,
# so e.g. "tasks?" and "tasks." end up as two distinct vocabulary entries.
tokens = [text.lower().split() for text in sentences]
tokens_flat = [word for sentence_tokens in tokens for word in sentence_tokens]

# Word ids start at 2 and follow descending corpus frequency (ties keep
# first-seen order); ids 0 and 1 are reserved for the special tokens below.
vocab = Counter(tokens_flat)
vocab = {
    word: token_id
    for token_id, (word, _) in enumerate(vocab.most_common(), start=2)
}
vocab.update({'[PAD]': 0, '[UNK]': 1})

In [59]:
def tokenize_sentence(sentence, vocab):
    """Convert a sentence into a list of vocabulary ids.

    The sentence is lower-cased and split on whitespace; each resulting word
    is looked up in ``vocab``.

    Args:
        sentence: Text to tokenize.
        vocab: Mapping from word to integer id. Must contain the key
            ``'[UNK]'`` whenever the sentence yields at least one word.

    Returns:
        A list with one id per whitespace-separated word; words missing from
        ``vocab`` are mapped to ``vocab['[UNK]']``.
    """
    token_ids = []
    for word in sentence.lower().split():
        # The fallback id is resolved per word, so an empty sentence never
        # touches the '[UNK]' entry.
        token_ids.append(vocab.get(word, vocab['[UNK]']))
    return token_ids

# Map every sentence to ids, then right-pad with the [PAD] id so all rows
# share the length of the longest sentence and can be stacked into one tensor.
tokenized_sentences = [tokenize_sentence(text, vocab) for text in sentences]
max_len = max(map(len, tokenized_sentences))
padded_sentences = [
    ids + [vocab['[PAD]']] * (max_len - len(ids))
    for ids in tokenized_sentences
]

input_ids = torch.tensor(padded_sentences, dtype=torch.long)
print("Tokenized and Padded Sentences:\n", input_ids)

Tokenized and Padded Sentences:
 tensor([[ 4,  2,  5, 10,  3, 11,  0,  0,  0],
        [12,  6, 13, 14, 15, 16, 17,  0,  0],
        [ 7, 18,  3, 19, 20,  8, 21, 22,  0],
        [23,  2, 24, 25, 26, 27, 28,  5, 29],
        [ 7, 30,  3, 31, 32,  6, 33,  0,  0],
        [ 4,  2,  9, 34,  9, 35,  0,  0,  0],
        [ 3,  2, 36, 37, 38,  8, 39, 40,  0],
        [41, 42, 43,  2, 44,  0,  0,  0,  0]])


In [60]:
def get_positional_encoding(seq_len, model_dim):
    """Build a sinusoidal positional-encoding table.

    Column ``j`` of the table uses the angle
    ``position / 10000 ** (2 * (j // 2) / model_dim)``; even columns hold the
    sine of that angle and odd columns hold the cosine.

    Args:
        seq_len: Number of positions (rows) to generate.
        model_dim: Encoding width (columns).

    Returns:
        A ``torch.float32`` tensor of shape ``(seq_len, model_dim)``.
    """
    positions = np.arange(seq_len)[:, None]   # column vector of positions
    columns = np.arange(model_dim)[None, :]   # row vector of column indices

    # Each sin/cos column pair shares one frequency, hence the ``columns // 2``.
    exponents = (2 * (columns // 2)) / np.float32(model_dim)
    angles = positions * (1 / np.power(10000, exponents))

    even_cols = slice(0, None, 2)
    odd_cols = slice(1, None, 2)
    angles[:, even_cols] = np.sin(angles[:, even_cols])
    angles[:, odd_cols] = np.cos(angles[:, odd_cols])

    return torch.tensor(angles, dtype=torch.float32)

# Hidden width shared by the embeddings and every encoder layer below.
model_dim = 768
# One encoding row per position of the padded batch.
positional_encoding = get_positional_encoding(seq_len=max_len, model_dim=model_dim)
print("Positional Encoding Shape:", positional_encoding.shape)

Positional Encoding Shape: torch.Size([9, 768])


In [61]:
# The vocabulary already includes the [PAD] and [UNK] entries, so its length
# is the number of rows the embedding table needs.
vocab_size = len(vocab)
embedding_dim = model_dim

# Standalone embedding table, separate from the one BERTModel creates internally.
embedding_layer = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=embedding_dim,
)
embedded = embedding_layer(input_ids)
print("Embedded Input Shape:", embedded.shape)

Embedded Input Shape: torch.Size([8, 9, 768])


In [62]:
class BERTSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values with three linear
    layers, split into ``num_heads`` heads of width ``model_dim // num_heads``,
    attended per head, re-merged and passed through a final linear layer.

    Args:
        model_dim: Width of the input and output features.
        num_heads: Number of attention heads; must divide ``model_dim``.
    """

    def __init__(self, model_dim, num_heads):
        super(BERTSelfAttention, self).__init__()
        self.num_heads = num_heads
        self.model_dim = model_dim

        assert model_dim % num_heads == 0, "model_dim must be divisible by num_heads"
        self.depth = model_dim // num_heads

        self.query = nn.Linear(model_dim, model_dim)
        self.key = nn.Linear(model_dim, model_dim)
        self.value = nn.Linear(model_dim, model_dim)
        self.out = nn.Linear(model_dim, model_dim)

    def split_heads(self, x, batch_size):
        """Reshape ``(batch, seq, model_dim)`` to ``(batch, heads, seq, depth)``."""
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)

    def forward(self, x, attention_mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch, seq, model_dim)``.
            attention_mask: Optional tensor of shape ``(batch, seq)``. Non-zero
                entries mark positions that may be attended to; zero entries
                (e.g. padding) are excluded as keys. ``None`` (the default)
                attends to every position, matching the previous behaviour.

        Returns:
            Tensor of shape ``(batch, seq, model_dim)``.
        """
        batch_size = x.size(0)

        query = self.split_heads(self.query(x), batch_size)
        key = self.split_heads(self.key(x), batch_size)
        value = self.split_heads(self.value(x), batch_size)

        # Scale by a plain Python float: no tensor is allocated per call and
        # the expression does not depend on which device ``x`` lives on.
        score = torch.matmul(query, key.transpose(-2, -1)) / (self.depth ** 0.5)

        if attention_mask is not None:
            # (batch, seq) -> (batch, 1, 1, seq): broadcast over heads and
            # query positions so each masked *key* column is suppressed.
            keep = attention_mask[:, None, None, :].to(torch.bool)
            # Use the dtype's most negative finite value rather than -inf so a
            # fully masked row yields a uniform distribution instead of NaN.
            score = score.masked_fill(~keep, torch.finfo(score.dtype).min)

        attention_weights = torch.nn.functional.softmax(score, dim=-1)

        attention_output = torch.matmul(attention_weights, value)
        # (batch, heads, seq, depth) -> (batch, seq, heads * depth)
        attention_output = attention_output.permute(0, 2, 1, 3).contiguous()
        attention_output = attention_output.view(batch_size, -1, self.model_dim)

        output = self.out(attention_output)
        return output

class BERTLayer(nn.Module):
    """One encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers has dropout applied to its output, is added
    back to its input (residual connection) and is then layer-normalised.

    Args:
        model_dim: Width of the input and output features.
        num_heads: Number of attention heads.
        ff_hidden_dim: Hidden width of the feed-forward network.
        dropout: Dropout probability used on both sub-layer outputs.
    """

    def __init__(self, model_dim, num_heads, ff_hidden_dim, dropout=0.1):
        super().__init__()
        # Sub-modules are created in a fixed order so that parameter
        # initialisation consumes the random stream deterministically.
        self.attention = BERTSelfAttention(model_dim, num_heads)
        self.layernorm1 = nn.LayerNorm(model_dim)
        expand = nn.Linear(model_dim, ff_hidden_dim)
        project_back = nn.Linear(ff_hidden_dim, model_dim)
        self.ff = nn.Sequential(expand, nn.ReLU(), project_back)
        self.layernorm2 = nn.LayerNorm(model_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run the block on ``x`` and return a tensor of the same shape."""
        # Attention sub-layer with residual connection.
        attended = self.dropout(self.attention(x))
        normed_attention = self.layernorm1(x + attended)

        # Feed-forward sub-layer with residual connection.
        transformed = self.dropout(self.ff(normed_attention))
        return self.layernorm2(normed_attention + transformed)

class BERTModel(nn.Module):
    """Token embedding plus sinusoidal positions followed by a stack of BERTLayer blocks.

    Args:
        vocab_size: Number of rows in the token-embedding table.
        model_dim: Width of the embeddings and of every layer.
        num_heads: Attention heads per layer.
        num_layers: Number of stacked ``BERTLayer`` blocks.
        ff_hidden_dim: Hidden width of each layer's feed-forward network.
        max_len: Longest sequence length the positional table covers.
    """

    def __init__(self, vocab_size, model_dim, num_heads, num_layers, ff_hidden_dim, max_len):
        super(BERTModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, model_dim)
        # Registered as a buffer so ``model.to(device)`` / ``.half()`` move it
        # together with the parameters. ``persistent=False`` keeps it out of
        # the state_dict, so checkpoints look exactly as they did when this
        # was a plain attribute.
        self.register_buffer(
            "positional_encoding",
            get_positional_encoding(max_len, model_dim),
            persistent=False,
        )
        self.layers = nn.ModuleList([
            BERTLayer(model_dim, num_heads, ff_hidden_dim) for _ in range(num_layers)
        ])

    def forward(self, input_ids):
        """Encode a batch of token ids.

        Args:
            input_ids: Integer tensor whose last dimension is the sequence
                length (at most ``max_len``).

        Returns:
            Tensor with the shape of ``input_ids`` plus a trailing
            ``model_dim`` dimension.

        Raises:
            ValueError: If the sequence is longer than the positional table.
        """
        seq_len = input_ids.size(-1)
        table_len = self.positional_encoding.size(0)
        if seq_len > table_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len {table_len}"
            )

        # Slice the table so inputs shorter than ``max_len`` are accepted too.
        x = self.embedding(input_ids) + self.positional_encoding[:seq_len]
        for layer in self.layers:
            x = layer(x)
        return x

# 12 layers with 12 heads each and a 3072-wide feed-forward network.
# No .eval() call is made here, so the layers' dropout stays active for this pass.
model = BERTModel(
    vocab_size,
    model_dim,
    num_heads=12,
    num_layers=12,
    ff_hidden_dim=3072,
    max_len=max_len,
)
output = model(input_ids)
print("Final Output Shape:", output.shape)

Final Output Shape: torch.Size([8, 9, 768])


In [63]:
# Trace the forward pass one layer at a time. Start from the model's own
# embedding table and positional encoding (not the standalone `embedding_layer`
# created earlier, which has unrelated random weights) so the printed
# activations belong to the same network that produced `output`.
x = model.embedding(input_ids) + model.positional_encoding
for i, layer in enumerate(model.layers):
    x = layer(x)
    # Show only the first sentence of the batch to keep the output readable.
    print(f"Output after layer {i+1}:", x[0])

Output after layer 1: tensor([[ 0.3898,  0.5658, -0.4369,  ..., -0.1406, -0.7689,  0.0873],
        [ 1.1101,  0.1013,  0.2335,  ...,  0.5518, -1.4077, -0.3422],
        [-0.0110, -0.8820,  0.5176,  ...,  1.0359, -0.3034,  1.1285],
        ...,
        [-1.2283,  1.4392, -0.2876,  ...,  0.7713, -0.1409,  2.4968],
        [-0.4554,  1.2451,  0.1158,  ...,  0.6447, -0.0852,  2.4578],
        [-0.1555,  0.5083,  0.5224,  ...,  0.5783, -0.0377,  2.3370]],
       grad_fn=<SelectBackward0>)
Output after layer 2: tensor([[ 0.4991,  0.9400, -0.4286,  ..., -0.1892, -0.6020, -0.3038],
        [ 1.5099,  0.2616, -0.0982,  ...,  0.5521, -1.6127, -0.6876],
        [ 0.2074, -0.3140,  0.4856,  ...,  0.8999, -0.9438,  0.5205],
        ...,
        [-1.0810,  1.7156, -0.2261,  ...,  0.7791, -0.5467,  2.0269],
        [-0.4311,  1.4270,  0.1953,  ...,  0.7361, -0.5516,  2.2859],
        [-0.1050,  0.8034,  0.6049,  ...,  0.6172, -0.5052,  1.8046]],
       grad_fn=<SelectBackward0>)
Output after layer 3