# Step 1 - Tokenization

In [3]:
import torch 
import torch.nn as nn
import torch.optim as optim
import math

In [4]:
def tokenize(text, vocab):
    """Convert a sentence into a list of vocabulary IDs.

    Args:
        text: String to tokenize; it is split on whitespace into words.
        vocab: Mapping from word to integer ID. ``vocab["<UNK>"]`` is looked
            up for every word, so the key has to exist whenever ``text``
            contains at least one word.

    Returns:
        A list with one ID per word, in order. Words that are not in
        ``vocab`` are mapped to the ID stored under "<UNK>".
    """
    token_ids = []
    for word in text.split():
        # The fallback ID is fetched eagerly on each iteration, exactly as
        # a default argument to dict.get would be.
        unknown_id = vocab["<UNK>"]
        token_ids.append(vocab.get(word, unknown_id))
    return token_ids

# text.split() - splits the sentence into words (on whitespace)
# vocab - a dictionary in which each known word is assigned a unique number (its ID)
# vocab.get(word, vocab["<UNK>"]) - returns the word's ID; if the word has no ID assigned, it returns the ID of the "<UNK>" (unknown) token instead

# Step 2 - Embedding Layers

In [5]:
class Embedding(nn.Module):
    """Thin wrapper around ``nn.Embedding`` that turns token IDs into vectors."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the lookup table.

        Args:
            vocab_size: Number of rows in the table (one per token ID).
            embedding_dim: Length of the vector stored for each token ID.
        """
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )

    def forward(self, x):
        """Look up the vector for every token ID in ``x``.

        Args:
            x: Integer tensor of token IDs.

        Returns:
            Tensor with the shape of ``x`` plus a trailing ``embedding_dim`` axis.
        """
        token_vectors = self.embedding(x)
        return token_vectors
    
# we need to convert the integer IDs that we assigned to the words above into vectors, so the model can actually learn their meanings
# nn.Embedding(vocab_size, embedding_dim) - this code creates a table where each word ID (the unique number we assigned) maps to a vector
# embedding_dim - defines the length of each word's vector
# For example, words with similar meanings such as "happy" and "joy" should end up with similar vectors once the model has been trained.

# Step 3 - Positional Encoding

In [6]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a sequence of embeddings.

    A table of sine/cosine values is computed once in the constructor and
    stored as the buffer ``pe`` with shape ``(max_seq_len, 1, embedding_dim)``.
    Even feature columns hold ``sin(position * freq)`` and odd feature columns
    hold ``cos(position * freq)``.
    """

    def __init__(self, embedding_dim, max_seq_len = 5000):
        """Pre-compute the positional table.

        Args:
            embedding_dim: Size of the feature axis; may be even or odd.
            max_seq_len: Number of positions to pre-compute.
        """
        super(PositionalEncoding, self).__init__()
        self.embedding_dim = embedding_dim
        pe = torch.zeros(max_seq_len, embedding_dim)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        # One frequency per even feature index: 10000^(-i / embedding_dim).
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # When embedding_dim is odd there is one fewer odd column than even
        # columns, so drop the surplus frequency instead of failing on a
        # shape mismatch. For even embedding_dim the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])
        # Insert a singleton axis at dim 1 so the table broadcasts over it.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the positional table to ``x``.

        Args:
            x: Tensor whose first axis indexes sequence positions and whose
                last axis has size ``embedding_dim``; the buffer's singleton
                middle axis broadcasts against ``x``. Only ``max_seq_len``
                positions are available in the table.

        Returns:
            ``x`` plus the first ``x.size(0)`` rows of the positional table.
        """
        return x + self.pe[:x.size(0), :]
    
# the whole point of positional encoding is that the model has no built-in sense of word order, hence we need to encode it
# embedding_dim - it matches the vector size from the embedding layer
# we introduce math (sine and cosine waves of different frequencies) to create patterns, i.e., position 1 gets one pattern and position 2 gets another.

# Step 4 - Self Attention

In [28]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected into queries, keys and values by three separate
    linear layers; every position then receives a softmax-weighted mix of
    all value vectors.
    """

    def __init__(self, embedding_dim):
        """Create the three ``embedding_dim -> embedding_dim`` projections.

        Args:
            embedding_dim: Size of the last axis of the input and the output.
        """
        super(SelfAttention, self).__init__()
        self.query = nn.Linear(embedding_dim, embedding_dim)
        self.key = nn.Linear(embedding_dim, embedding_dim)
        self.value = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, x):
        """Apply self-attention over the second-to-last axis of ``x``.

        Args:
            x: Tensor of shape ``(..., seq_len, embedding_dim)`` with any
                number of leading batch axes (including none).

        Returns:
            Tensor with the same shape as ``x``.
        """
        queries = self.query(x)
        keys = self.key(x)
        values = self.value(x)
        d_k = queries.size(-1)
        # matmul with transpose(-2, -1) gives the same result as bmm with
        # transpose(1, 2) on 3-D input, but also accepts unbatched or
        # extra-batched tensors. Dividing by sqrt(d_k) keeps the dot
        # products from growing with the vector length.
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(d_k)
        attention_weights = torch.softmax(scores, dim=-1)
        attended_values = torch.matmul(attention_weights, values)
        return attended_values
    
# Self-attention mainly helps the model focus on the important words
# For example, take the sentence 'The cat sat on the mat'
# The model would be able to learn that "sat" relates to "cat" more than to "mat"

# query, key and value - they are three learned linear transformations of the input vectors.
# query - what do I care about; key - what is available; value - what do I take
# scores - measures how much each word relates to every other word, using the dot product of their query and key vectors (scaled by sqrt(d_k)).
# attention_weights - softmax then turns the scores into probabilities (telling us how much weight each word gets compared to the others).

# Step 5 - Transformer Block

In [29]:
class TransformerBlock(nn.Module):
    """Self-attention followed by a two-layer feed-forward network.

    Each sub-layer is wrapped in a residual connection and followed by a
    ``LayerNorm``.
    """

    def __init__(self, embedding_dim, hidden_dim):
        """Build the sub-layers.

        Args:
            embedding_dim: Size of the feature axis entering and leaving the block.
            hidden_dim: Width of the feed-forward network's inner layer.
        """
        super().__init__()
        self.attention = SelfAttention(embedding_dim)
        expand = nn.Linear(embedding_dim, hidden_dim)
        contract = nn.Linear(hidden_dim, embedding_dim)
        self.feed_forward = nn.Sequential(expand, nn.ReLU(), contract)
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.norm2 = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        """Run attention and the feed-forward network, each with residual + norm.

        Args:
            x: Input tensor whose last axis has size ``embedding_dim``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Residual connection around attention, then normalise.
        after_attention = self.norm1(x + self.attention(x))
        # Residual connection around the feed-forward network, then normalise.
        after_ffn = self.norm2(after_attention + self.feed_forward(after_attention))
        return after_ffn
    
# Once we have created self-attention, just that one layer isn't enough
# Hence, we wrap it in a block together with a small neural network, so the model can learn richer representations.
# attention is what we created earlier using the self-attention mechanism
# feed_forward is a small neural network where each word's vector is processed further, for better understanding or guessing of the next words with better probabilities.
# norm1, norm2 - normalize the numbers so that they are on the same scale.
# x + attended - residual connection (the input is added back to the attention output)

# Step 6 - Full Language Model

In [30]:
class SimpleLLM(nn.Module):
    """Minimal language model: embedding, positional encoding, transformer blocks, output projection."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        """Assemble the model.

        Args:
            vocab_size: Number of token IDs; also the size of the output axis.
            embedding_dim: Size of the token vectors used throughout the model.
            hidden_dim: Inner width of each block's feed-forward network.
            num_layers: Number of ``TransformerBlock`` instances to stack.
        """
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(embedding_dim)
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerBlock(embedding_dim, hidden_dim))
        self.transformer_blocks = nn.Sequential(*blocks)
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Compute per-position scores over the vocabulary.

        Args:
            x: Integer tensor of token IDs; this notebook passes shape
                ``(1, seq_len)``.

        Returns:
            Tensor of scores whose last axis has size ``vocab_size``.
        """
        embedded = self.embedding(x)
        # The positional encoding indexes positions along axis 0, so swap
        # the first two axes before the call and swap them back afterwards.
        seq_first = embedded.transpose(0, 1)
        seq_first = self.positional_encoding(seq_first)
        batch_first = seq_first.transpose(0, 1)
        hidden = self.transformer_blocks(batch_first)
        return self.output(hidden)
    
# we created a full system incorporating everything we did
# num_layers - the number of transformer blocks stacked in the model; more layers give the model more capacity -> usually better outputs
# output - turns the final vectors into one score per vocabulary word, from which the predicted word is picked as the final answer for the human

# Training the model

In [31]:
# Fix the RNG seed before the model is built so that the weight
# initialisation, and therefore the losses printed below, are the same on
# every "Restart Kernel -> Run All".
SEED = 0
torch.manual_seed(SEED)

# Toy vocabulary: every known word gets a unique ID; "<UNK>" is the fallback.
vocab = {"hello": 0, "world": 1, "how": 2, "are": 3, "you": 4, "<UNK>": 5}
vocab_size = len(vocab)
embedding_dim = 16
hidden_dim = 32
num_layers = 2

model = SimpleLLM(vocab_size, embedding_dim, hidden_dim, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

data = ["hello world how are you", "how are you hello world"]
tokenized_data = [tokenize(sentence, vocab) for sentence in data]

for epoch in range(100):
    for sentence in tokenized_data:
        # Next-word prediction: for every prefix sentence[:i] the target is
        # the word that follows it, sentence[i].
        for i in range(1, len(sentence)):
            input_seq = torch.tensor(sentence[:i]).unsqueeze(0)  # shape (1, i)
            target = torch.tensor(sentence[i]).unsqueeze(0)      # shape (1,)
            optimizer.zero_grad()
            output = model(input_seq)
            # Only the scores at the last position are compared with the target.
            loss = criterion(output[:, -1, :], target)
            loss.backward()
            optimizer.step()
    if epoch % 10 == 0:
        # Note: this is the loss of the last training example in the epoch,
        # not an average over the epoch.
        print(f"Epoch {epoch}, Loss: {loss.item()}")

Epoch 0, Loss: 1.9223988056182861
Epoch 10, Loss: 0.4437415897846222
Epoch 20, Loss: 0.19538508355617523
Epoch 30, Loss: 0.11235272139310837
Epoch 40, Loss: 0.07333862036466599
Epoch 50, Loss: 0.05164773762226105
Epoch 60, Loss: 0.03821663558483124
Epoch 70, Loss: 0.02933884970843792
Epoch 80, Loss: 0.023173348978161812
Epoch 90, Loss: 0.01872985064983368


In [32]:
input_text = "hello world how"
input_tokens = tokenize(input_text, vocab)
input_tensor = torch.tensor(input_tokens).unsqueeze(0)  # add a batch axis: (1, seq_len)

# Reverse lookup table (ID -> word). setdefault keeps the first word seen
# for an ID, which matches what list(vocab.values()).index(...) would find.
id_to_word = {}
for word, token_id in vocab.items():
    id_to_word.setdefault(token_id, word)

# No gradients are needed for prediction, so skip autograd bookkeeping.
with torch.no_grad():
    output = model(input_tensor)

# The scores at the last position are the model's guess for the next word.
predicted_token = torch.argmax(output[:, -1, :]).item()
print(f"Input: {input_text}, Predicted: {id_to_word[predicted_token]}")

Input: hello world how, Predicted: are
