# Transformer Implementation Plan

## Progress Overview

| Day | Task Description                                              | Status |
|-----|---------------------------------------------------------------|--------|
| 1   | Implement a simple attention layer mechanism                  | ✅ Done |
| 2   | Add a feed forward layer after the self attention layer       | ✅ Done |
| 3   | Implement multi-head attention                                | ⏳ To Do |
| 4   | Add positional encoding                                       | ⏳ To Do |

In [14]:
'''
This is my desperate attempt to implement transformers from scratch in tensorflow.
'''
#imports
import torch
from fastai.text.all import *


In [15]:
'''
Data - we'll use the most famous Shakespeare dataset from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

'''
with open('/Users/zwicky/Desktop/learn_langchain/langchain/learn_transformer/data/input.txt', 'r') as file:
    content = file.read()

split_ratio = 0.9  # 90% for training, 10% for validation
split_index = int(len(content) * split_ratio)

train_content = content[:split_index]
val_content = content[split_index:]

# Write the training and validation sets to separate files
with open('train.txt', 'w') as train_file:
    train_file.write(train_content)

with open('val.txt', 'w') as val_file:
    val_file.write(val_content)


In [3]:
#load the train and val dataset 
lines = []
with open('/Users/zwicky/Desktop/learn_langchain/langchain/learn_transformer/train.txt', 'r') as f:
    lines.extend(f.readlines())
with open('/Users/zwicky/Desktop/learn_langchain/langchain/learn_transformer/val.txt', 'r') as f:
    lines.extend(f.readlines())

# Clean up lines (remove newline characters and join)
text = ' . '.join([l.strip() for l in lines])

tokens = text.split(' ')

# Create a vocabulary of unique tokens
vocab = sorted(list(set(tokens)))

# Create a word-to-index mapping
word2idx = {word: idx for idx, word in enumerate(vocab)}

print("\nFirst 10 tokens:")
print(tokens[:10])
print("\nVocabulary size:")
print(len(vocab))
print("\nFirst 10 items in word2idx:")
print(list(word2idx.items())[:10])


First 10 tokens:
['First', 'Citizen:', '.', 'Before', 'we', 'proceed', 'any', 'further,', 'hear', 'me']

Vocabulary size:
25673

First 10 items in word2idx:
[('', 0), ('&C:', 1), ('&c.', 2), ("'", 3), ("'?", 4), ("'A", 5), ("'Alas,", 6), ("'Alas,'", 7), ("'Alla", 8), ("'An", 9)]


In [16]:
token_sequences = [(tokens[i : i + 3], tokens[i + 3]) for i in range(0, len(tokens) - 4, 4)]

nums_sequences = [(torch.tensor([word2idx[token] for token in input_tokens]), word2idx[target_token])
                  for input_tokens, target_token in token_sequences]


print("\nFirst 5 numerical sequences (input tensors and target indices):")
print(nums_sequences[:5])

# You can check the total number of sequences created
print(f"\nTotal number of sequences created: {len(nums_sequences)}")


First 5 numerical sequences (input tensors and target indices):
[(tensor([1583,  995,  273]), 640), (tensor([24646, 18491,  5162]), 11880), (tensor([12903, 15684, 21347]), 273), (tensor([  0, 273, 405]), 273), (tensor([ 3656, 21347,   273]), 0)]

Total number of sequences created: 62472


In [17]:
nums_sequences[0]

(tensor([1583,  995,  273]), 640)

In [18]:
bs = 64
# Calculate the split point
cut = int(len(nums_sequences) * 0.8)

# Split the sequences into training and validation sets
train_data = nums_sequences[:cut]
valid_data = nums_sequences[cut:]

print(f"Total sequences: {len(nums_sequences)}")
print(f"Training sequences: {len(train_data)}")
print(f"Validation sequences: {len(valid_data)}")

Total sequences: 62472
Training sequences: 49977
Validation sequences: 12495


In [19]:
trainloader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
validloader = torch.utils.data.DataLoader(valid_data, batch_size=64, shuffle=False)

In [20]:
class SelfAttention(torch.nn.Module):
    def __init__(self, d_in, d_qk, d_v=None):
        '''
        Create the Q and K matrices and V matrix 
        If the input tensor is (batch_size, sequence_length, feature_dimension)
        d_in: integer, feature dimension of the input (depends on the model architecture, e.g., embedding size)
        d_qk: integer, dimension of the query and key vectors (depends on the model architecture), ideally can be different but for this implementation we will use the same dimension for both. usually < d_in 
        d_v: integer, dimension of the value vector (if not given, it will be the same as d_qk)
        The scaling factor of -0.5 is equivalent to dividing the dot product by the square root of the dimension of the query and key vectors.
        '''
        super().__init__()
        self.q = torch.nn.Linear(d_in, d_qk, bias=False)
        self.k = torch.nn.Linear(d_in, d_qk, bias=False)
        if d_v is None:
            d_v = d_qk
        self.v = torch.nn.Linear(d_in, d_v, bias=False)
        # self.out = torch.nn.Linear(d_v, d_in, bias=False)
        # print(self.out.shape)
        self.scale = d_qk ** -0.5  
        self.softmax = torch.nn.Softmax(dim=-1)

    def forward(self, x):
        '''
        Forward pass of the self-attention layer.
        x: input tensor of shape (batch_size, seq_len, emb_size)
        Returns: output tensor of shape (batch_size, seq_len, d_v)

        attention vector size is (batch_size, seq_len, seq_len)
        '''
        Q, K, V = self.q(x), self.k(x), self.v(x)
        # print(f"Q shape: {Q.shape}, K shape: {K.shape}, V shape: {V.shape}")
        x = self.scale * torch.matmul(Q, K.transpose(-2, -1))
        # print(f"Attention scores shape: {x.shape}")
        x = self.softmax(x)
        # print(f"Softmax output shape: {x.shape}")
        x = torch.matmul(x, V)
        # print(f"Output shape after attention: {x.shape}")
        return x

        

In [21]:
class nn(torch.nn.Module):
    def __init__(self, d_in, d_ff):
        super().__init__()
        self.lin1 = torch.nn.Linear(d_in, d_ff, bias=False)
        self.lin2 = torch.nn.Linear(d_ff, d_in, bias=False)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        return self.relu(self.lin2(self.lin1(x)))

In [None]:
class Module_simple(torch.nn.Module):
    def __init__(self, vocab_size, d_model, d_qk, d_ff):
        '''
        Embedding layer - maps each token to a vector of size d_model. Shape - (vocab_size, d_model)
        Self attention layer - Calculates attention scores for the input sequence. Shape - (seq_len, d_model)
        Output layer - Maps the output of the self-attention layer to the vocabulary size. Shape - (d_model, vocab_size)

        Input → Embedding → Q/K/V → Self-Attention → FFN → Output

        '''
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, d_model) 
        self.SelfAttention = SelfAttention(d_model, d_qk) 
        self.nn = nn(d_model, d_ff) 
        self.out = torch.nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embedding(x)  
        x = self.SelfAttention(x) 
        # print(f"Shape after self-attention: {x.shape}")
        x = self.nn(x)
        x = x.mean(1)   
        # print(f"Shape after mean: {x.shape}")
        x = self.out(x) 
        # print(f"Shape after output layer: {x.shape}")
        return x


In [11]:
model = Module_simple(len(vocab), 64, 128)
#pass one batch through the model using trainloader
data_batch, labels_batch = next(iter(trainloader))
out = model(data_batch)
print(out.shape)

torch.Size([64, 25673])


In [12]:
#train model using learner 
dls = DataLoaders(trainloader, validloader)
learn = Learner(dls, model, loss_func=CrossEntropyLossFlat(), metrics=accuracy)
# learn.lr_find()  #doesn't work in my notebook env

In [13]:
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,accuracy,time
0,7.048087,7.23858,0.176391,00:11
1,6.373477,6.980987,0.173349,00:07
2,5.836378,6.955272,0.17567,00:08
3,5.183416,7.009202,0.17615,00:07
4,4.491425,7.064063,0.172229,00:07


## Multi head attention 

In [41]:
class MultiheadAttention(torch.nn.Module):
    def __init__(self, d_in, d_qk, n_heads, d_v=None):
        super().__init__()
        if d_v is None:
            d_v = d_qk
        self.n_heads = n_heads
        self.mattn = torch.nn.ModuleList([
            SelfAttention(d_in, d_qk) for _ in range(n_heads)
        ])
        self.out = torch.nn.Linear(d_v * n_heads, d_in, bias=False)
        

    def forward(self, x):
        '''
        Out dimension - (batch_size, seq_len, d_model)
        '''
        heads = [attn(x) for attn in self.mattn]
        x = torch.cat(heads, dim=-1) 
        x = self.out(x)
        return x

In [42]:
class module_multihead(torch.nn.Module):
    def __init__(self, d_model, n_heads, vocab_sz, d_ff, d_qk):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_sz, d_model)
        self.attention = MultiheadAttention(d_model, d_qk, n_heads)
        self.nn = nn(d_model, d_ff) 
        self.out = torch.nn.Linear(d_model, vocab_sz)

    def forward(self, x):
        x = self.embedding(x)
        x = self.attention(x)  #n_head * (batch_size, seq_len, d_model)
        x = self.nn(x)
        x = x.mean(1)
        x = self.out(x)
        return x
        

In [43]:
model = module_multihead(64, 8, len(vocab), 32, 64)
dls = DataLoaders(trainloader, validloader)
learn = Learner(dls, model, loss_func=CrossEntropyLossFlat(), metrics=accuracy)


In [44]:
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,accuracy,time
0,6.768919,6.933014,0.187595,00:13
1,6.306395,6.810514,0.187835,00:13
2,6.022042,6.802017,0.190956,00:13
3,5.566193,6.84636,0.194798,00:13
4,5.275839,6.91343,0.191677,00:13


In [30]:
'''
Credits 
https://colab.research.google.com/drive/1rjYhOhAmEyi1u2uz2s9XABjDFphTOFvY#scrollTo=c49ffa82
'''

'\nCredits \nhttps://colab.research.google.com/drive/1rjYhOhAmEyi1u2uz2s9XABjDFphTOFvY#scrollTo=c49ffa82\n'