### Single Head Attention

In [1]:
import torch 
import torch.nn as nn

In [2]:
import torch.optim as optim

In [3]:
torch.manual_seed(0) # for reproducibility

<torch._C.Generator at 0x21da279a530>

In [4]:
# Defining parameters for the transformer model

n_embed = 120
n_layers = 8
n_heads = 8
head_size = n_embed // n_heads
batch_size = 16  # Batch size for training
block_size = 256  # Context size for the model
dropout = 0.2  # Dropout rate for regularization
vocab_size = 8000

learning_rate = 3e-4 
epochs = 10

In [5]:
## Train Data ( Copied from 2. Data Preparation )
from torch.utils.data import Dataset, DataLoader
import numpy as np
token_id_bin_path = "../../data/processed/Initial/initial_token_ids.bin"
token_ids = np.fromfile(token_id_bin_path, dtype=np.uint16)
class TokenDataset(Dataset):
    def __init__(self, token_ids, block_size ):
        self.block_size = block_size   
        self.data = np.array(token_ids, dtype=np.uint16)  # our data is going to be an np array ( for easy slicing )

    def __len__(self):
        return (self.data).shape[0] - self.block_size   

    def __getitem__(self, idx):
        X = torch.tensor(self.data[idx:idx+self.block_size], dtype=torch.long)
        y = torch.tensor(self.data[idx+1 : idx+self.block_size + 1], dtype = torch.long)

        return X, y

## Lets just quickly split the dataset  -- first 80% be train data
split_idx = int(0.8 * len(token_ids))
train_token_ids = token_ids[:split_idx]
val_token_ids = token_ids[split_idx:]

len(train_token_ids), len(val_token_ids)
token_dataset = TokenDataset(train_token_ids, block_size)
trainloader = DataLoader(token_dataset, batch_size = 32, shuffle=True, drop_last = True)


We know that for each token it consist a embedding vector of dimension n_embed.

i.e.. Ei vector of size n_embed*1                ,where i runs to block_size

And for each head there is a query matrix and key matrix of size head_size*n_embed. 
Which is applied to same x for self-head attention

and Qi = Wq * Ei = head_size*1 for each block_size and batch_size

It can be represented as Linear(n_embed,head_size)

How much each query vector attends to key vector is represented from dot product of Ki.Qi at each cell of matrix of size TxT

this is represented by 
Attend = query @ key

and the x is represented with the down projection to the dimension of head_size which is concatenated later
Vi = Wv * Ei

output from single head = attend @ Vi

In [6]:
class SingleHeadAttention(nn.Module):

    def __init__(self, n_embed, head_size):
        super().__init__()

        self.n_embed = n_embed
        self.head_size = head_size
        self.key = nn.Linear(n_embed, head_size)
        self.query = nn.Linear(n_embed, head_size)
        self.value = nn.Linear(n_embed, head_size)
        self.register_buffer('trill', torch.tril(torch.ones(block_size, block_size)))  # Lower triangular matrix for masking
 

    def forward(self, x):
        B,T,C = x.shape  # B is Batch_size, T is Block_size, C is n_embed
        # x is a shape of Batch_size x Block_size x n_embed
        key= self.key(x)        # B,T,H = head_size
        query = self.query(x)   # B,T,H = head_size

        # B,T,H @ B,H,T
        attend = query @ key.transpose(-2, -1)  # B,T,T

        attend = attend / (self.head_size ** 0.5)  #  Scaled Dot-Product Attention Attention(Q,K,V)=softmax(QK^T/sqrt(d_k))V

        # trill = torch.tril(torch.ones(attend.shape[-1], attend.shape[-1]))  # Lower triangular matrix of block_size

        attend = attend.masked_fill(self.trill[:T, :T] == 0, float('-inf'))  # Masking future tokens

        attend = torch.softmax(attend, dim=-1) # Column-wise softmax IG

        value = self.value(x) # B,T,H  

        out = attend @ value  # B,T,H

        return out




In [7]:
class MultiHeadAttention(nn.Module):
    def __init__(self,n_embed, n_heads):
        super().__init__()
        self.n_embed = n_embed
        self.n_heads = n_heads
        self.head_size = n_embed // n_heads

        self.heads = nn.ModuleList([SingleHeadAttention(n_embed, self.head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):

        out = torch.cat([head(x) for head in self.heads], dim=-1)  # Concatenate outputs from all heads
        out = self.proj(out)
        out = self.dropout(out)
        return out

    

In [8]:
class feed_forward(nn.Module):

    # Multi-layer perceptron (MLP) for feed-forward network in transformer
    
    def __init__(self, n_embed):
        super().__init__()
        self.network = nn.Sequential(
        nn.Linear(n_embed, 4 * n_embed),  # Up-projection min of 4* n_embed from the paper Attention Is All You Need
        nn.ReLU(),
        nn.Linear(4 * n_embed, n_embed),  # Down-projection back to n_embed
        nn.Dropout(dropout)
        )
    
        
    def forward(self, x):

        return self.network(x)
    

In [9]:
#Single Bloack of the Transformer
class Block(nn.Module):

    def __init__(self, n_embed, n_heads):
        super().__init__()

        self.attention = MultiHeadAttention(n_embed, n_heads)
        self.feed_forward = feed_forward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):

        # Creating a residual connection around the attention layer

        x = x + self.attention(self.ln1(x))  # Layer normalization before attention
        x = x + self.feed_forward(self.ln2(x))
        return x

In [None]:
class Transformer(nn.Module):

    def __init__(self):
        super().__init__()
        self.block_size = block_size

        self.token_embedding = nn.Embedding(vocab_size, n_embed)  # Token embeddings
        self.position_embedding = nn.Embedding(block_size, n_embed)
 
        self.block = nn.ModuleList([Block(n_embed, n_heads) for _ in range(n_layers)])
        self.block = nn.Sequential(*self.block) # Sequentially stacking the blocks

        self.layer_norm = nn.LayerNorm(n_embed)  # Final layer normalization

        self.linear = nn.Linear(n_embed, vocab_size)  # Output layer for vocabulary size

    def forward(self, x,target=None):
        x = x.long() ## Ensuring the x is of type long for embedding lookup    
        B, T = x.size()
        positions = torch.arange(0, T, device=x.device)
        # Get embeddings
        token_emb = self.token_embedding(x)  # (B, T, n_embed)
        pos_emb = self.position_embedding(positions)  # (T, n_embed)
        x = token_emb + pos_emb
        x = self.block(x)

        x = self.layer_norm(x)  # Final layer normalization
        logits =self.linear(x)  # Output layer to get logits for vocabulary size. B,T,V
        if target is not None:
            # Reshape for loss calculation
            logits_flat = logits.view(-1, logits.size(-1))  # Reshape logits to (B*T, V)
            target_flat = target.view(-1)  # Reshape target to (B*T)
            loss = nn.CrossEntropyLoss()(logits_flat, target_flat)
            return logits, loss  # Return logits, not softmax probabilities
        else: 
            return logits, None  # Return logits, not softmax probabilities
    
    
    def generate(self, idx, max_new_tokens):
        # idx = Batch_size x Block_size ... B,T
        self.eval()  # Set to evaluation mode
        with torch.no_grad():  # No need to compute gradients during generation
            for _ in range(max_new_tokens):
                # Crop idx to the last block_size tokens if it gets too long
                # This is the correct line to ensure 'idx_cond' always has a length up to 'block_size'
                idx_to_process = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
                
                # Get predictions
                logits, _ = self(idx_to_process)  # Pass the correctly shaped 'idx_to_process' to the forward pass
                
                # Focus only on the last time step
                logits = logits[:, -1, :]  # becomes (B, vocab_size)
                
                # Apply softmax to get probabilities
                probs = torch.softmax(logits, dim=-1)  # (B, vocab_size)
                
                # Sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                
                # Append sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
                
        return idx
        

In [22]:
model = Transformer()
model.to('cuda' if torch.cuda.is_available() else 'cpu')


Transformer(
  (token_embedding): Embedding(8000, 120)
  (position_embedding): Embedding(256, 120)
  (block): Sequential(
    (0): Block(
      (attention): MultiHeadAttention(
        (heads): ModuleList(
          (0-7): 8 x SingleHeadAttention(
            (key): Linear(in_features=120, out_features=15, bias=True)
            (query): Linear(in_features=120, out_features=15, bias=True)
            (value): Linear(in_features=120, out_features=15, bias=True)
          )
        )
        (proj): Linear(in_features=120, out_features=120, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (feed_forward): feed_forward(
        (network): Sequential(
          (0): Linear(in_features=120, out_features=480, bias=True)
          (1): ReLU()
          (2): Linear(in_features=480, out_features=120, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((120,), e

In [13]:
from torchinfo import summary

summary(model)

Layer (type:depth-idx)                                  Param #
Transformer                                             --
├─Embedding: 1-1                                        960,000
├─Embedding: 1-2                                        30,720
├─Sequential: 1-3                                       --
│    └─Block: 2-1                                       --
│    │    └─MultiHeadAttention: 3-1                     58,080
│    │    └─feed_forward: 3-2                           115,800
│    │    └─LayerNorm: 3-3                              240
│    │    └─LayerNorm: 3-4                              240
│    └─Block: 2-2                                       --
│    │    └─MultiHeadAttention: 3-5                     58,080
│    │    └─feed_forward: 3-6                           115,800
│    │    └─LayerNorm: 3-7                              240
│    │    └─LayerNorm: 3-8                              240
│    └─Block: 2-3                                       --
│    │    └─MultiHea

In [14]:
len(trainloader)

1464

In [33]:
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
device = 'cuda' if torch.cuda.is_available() else 'cpu'


    # every once in a while evaluate the loss on train and val sets
    # if iter % eval_interval == 0 or iter == max_iters - 1:
    #     losses = estimate_loss()
    #     print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # # sample a batch of data
    # xb, yb = get_batch('train')

    # # evaluate the loss
    # logits, loss = model(xb, yb)
    # optimizer.zero_grad(set_to_none=True)
    # loss.backward()
    # optimizer.step()
for epoch in range(1):
    for batch_idx, (X, y) in enumerate(trainloader):
        X, y = X.to(device, dtype=torch.long), y.to(device, dtype=torch.long)
        logits, loss = model(X, y) # Forward pass 
        optimizer.zero_grad(set_to_none=True)

        loss.backward() 
        optimizer.step()
        if batch_idx % 100 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Batch [{batch_idx+1}/{len(trainloader)}], Loss: {loss.item():.4f}")




Epoch [1/10], Batch [1/1464], Loss: 0.0391


KeyboardInterrupt: 

In [59]:
model_path = "../../models/initial_model_10E.pth"
torch.save(model.state_dict(), model_path)

##### Loading the model and teting the generation code: 

In [23]:
# Load the model 
model.eval()
model_path = "../../models/initial_model_10E.pth"
state_dict = torch.load(model_path, weights_only=True)

model.load_state_dict(state_dict)

<All keys matched successfully>

In [35]:
device = 'cuda' if torch.cuda.is_available() else 'cpu' 
 ## Generation using the 'generate function' 
model.generate(idx=torch.zeros(1, block_size, dtype=torch.long).to(device), max_new_tokens=40)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)