![GPT](images/gpt-arch-1.png)

In [None]:
import torch
import torch.nn as nn
from modules import TransformerBlock, LayerNorm

class GPTModel(nn.Module):
    """GPT-style language model.

    Pipeline: token + positional embeddings -> dropout -> a stack of
    ``TransformerBlock`` modules -> final ``LayerNorm`` -> linear projection
    onto the vocabulary (no bias).

    ``config`` is a mapping that must provide ``vocab_size``,
    ``context_length``, ``emb_dim``, ``drop_rate`` and ``number_of_layers``.
    The same mapping is passed unchanged to every ``TransformerBlock``.
    """

    def __init__(self, config):
        super().__init__()
        # One learned vector per vocabulary entry and one per position.
        self.tok_emb = nn.Embedding(config["vocab_size"], config["emb_dim"])
        self.pos_emb = nn.Embedding(config["context_length"], config["emb_dim"])
        self.drop_emb = nn.Dropout(config["drop_rate"])
        self.transformerlayers = nn.Sequential(
            *[TransformerBlock(config) for _ in range(config["number_of_layers"])]
        )

        self.finalnorm = LayerNorm(config["emb_dim"])
        # Separate output projection: its weights are NOT tied to ``tok_emb``.
        self.out_head = nn.Linear(config["emb_dim"], config["vocab_size"], bias=False)

    def forward(self, input_idx):
        """Compute vocabulary logits for every position of every sequence.

        Args:
            input_idx: 2-D integer tensor of token ids, ``(batch, seq_len)``.

        Returns:
            Logits tensor whose last dimension is ``vocab_size``
            (``(batch, seq_len, vocab_size)`` as long as the transformer
            blocks preserve the input shape).

        Raises:
            ValueError: if ``seq_len`` is larger than the number of positions
                the positional embedding table was built for.
        """
        # Unpacking also enforces that the input is exactly two-dimensional.
        _, seq_len = input_idx.shape

        # Fail early with a readable message instead of an opaque
        # out-of-range lookup inside the positional embedding.
        max_positions = self.pos_emb.num_embeddings
        if seq_len > max_positions:
            raise ValueError(
                f"Input sequence length {seq_len} exceeds the model's "
                f"context length {max_positions}"
            )

        # Position ids are created on the input's device so the model works
        # unchanged on CPU and GPU.
        positions = torch.arange(seq_len, device=input_idx.device)
        # (seq_len, emb_dim) broadcasts over the batch dimension.
        x = self.tok_emb(input_idx) + self.pos_emb(positions)
        x = self.drop_emb(x)
        x = self.transformerlayers(x)
        x = self.finalnorm(x)
        logits = self.out_head(x)
        return logits


Weight Tying 

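Weight tying is a technique for reducing a model's memory footprint and computation: the token-embedding weights are reused as the weights of the output layer. The original GPT-2 architecture uses it, which is why GPT-2 has 124M parameters; with separate weights for the two layers (as in the model defined here) the count would have been 163M.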

However, modern LLM architectures typically use separate weights for the embedding and output layers, because this gives better model training and performance.

In [None]:
# Hyperparameters read by GPTModel (the same dict is also forwarded to each
# transformer block).
GPT_CONFIG = {
    "vocab_size": 50257,      # number of distinct token ids
    "context_length": 1024,   # maximum number of positions per sequence
    "emb_dim": 768,           # width of every embedding vector
    "num_heads": 12,          # attention heads per block
    "number_of_layers": 12,   # how many transformer blocks are stacked
    "drop_rate": 0.1,         # dropout probability
    "qkv_bias": False         # whether query/key/value projections use a bias
}

model = GPTModel(GPT_CONFIG)

# Random token ids standing in for a tokenized batch: 2 sequences, 10 tokens each.
input_ids = torch.randint(low=0, high=GPT_CONFIG["vocab_size"], size=(2, 10))
print(input_ids)

# Shape check only, so skip building the autograd graph.
with torch.no_grad():
    logits = model(input_ids)

# Count every element of every parameter tensor registered on the model.
total_params = sum(param.numel() for param in model.parameters())

print(f"Input shape: {input_ids.shape}")
print(f"Output shape: {logits.shape}")
print(f"\nModel has {total_params:,} parameters")


tensor([[34725, 40702, 48254, 29693, 29342,  8794, 17406,  3440, 29527, 31767],
        [24866, 40190, 14001, 34217,  3826, 17485, 22542, 25847,  3891,  4278]])
Input shape: torch.Size([2, 10])
Output shape: torch.Size([2, 10, 50257])

Model has 163,009,536 parameters


**Generating Text**

![](images/gpt-arch-2.png)

We will implement Greedy Decoding, which is the most straightforward way an LLM generates text. It predicts one token at a time by always choosing the single most likely next word.

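There are other text-generation algorithms that introduce creativity by modifying the softmax step (for example, temperature scaling) and sampling from the resulting distribution instead of always taking the most likely token.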

In [11]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` one token at a time.

    Args:
        model: callable mapping a ``(batch, n_tokens)`` tensor of token ids
            to logits of shape ``(batch, n_tokens, vocab_size)``.
        idx: ``(batch, n_tokens)`` tensor of token ids forming the prompt.
        max_new_tokens: number of decoding steps to run.
        context_size: at most this many trailing tokens are fed to the model
            at each step.

    Returns:
        Tensor of shape ``(batch, n_tokens + max_new_tokens)``: the prompt
        followed by the generated token ids. ``idx`` itself is not modified.
    """
    tokens = idx
    for _step in range(max_new_tokens):
        # Keep only the most recent tokens so the model never sees more
        # positions than it supports.
        window = tokens[:, -context_size:]

        # Inference only: no gradients needed. Only the last position's
        # logits predict the *next* token, so
        # (batch, n_tokens, vocab_size) -> (batch, vocab_size).
        with torch.no_grad():
            next_token_logits = model(window)[:, -1, :]

        # Turn logits into probabilities, then take the single most likely
        # token id per sequence; keepdim gives shape (batch, 1).
        next_token_probs = next_token_logits.softmax(dim=-1)
        next_token = next_token_probs.argmax(dim=-1, keepdim=True)

        # Append the chosen token so it becomes part of the next step's context.
        tokens = torch.cat((tokens, next_token), dim=-1)

    return tokens

In [20]:
# Smoke-test greedy generation end to end. A tokenizer is needed to convert
# between text and token ids.
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)

# Wrapping the id list in an outer list adds the batch dimension: (1, n_tokens).
encoded_tensor = torch.tensor([encoded])
print("encoded_tensor.shape:", encoded_tensor.shape)

# Switch to evaluation mode so dropout is disabled during generation.
model.eval()

out = generate_text_simple(
    model,
    encoded_tensor,
    max_new_tokens=6,
    context_size=GPT_CONFIG["context_length"],
)

print("Output:", out)
print("Output length:", out.shape[1])

# Drop the batch dimension and map the ids back to text.
decoded_text = tokenizer.decode(out[0].tolist())
print(decoded_text)

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])
Output: tensor([[15496,    11,   314,   716, 23515, 21842, 12754, 31479, 17159,  8621]])
Output length: 10
Hello, I amulus Dowacularï¿½ Along Sep
