In [2]:
GPT_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False,
}

In [10]:
import torch

class MyTransformerBlock(torch.nn.Module):
    def __init__(self, config: dict):
        super().__init__()
        
        self.norm1 = torch.nn.LayerNorm(config["emb_dim"])
        
        self.attention = torch.nn.MultiheadAttention(embed_dim=config["emb_dim"],
                                                     num_heads=config["n_heads"],
                                                     dropout=config["drop_rate"],
                                                     bias=config["qkv_bias"],
                                                     add_bias_kv=config["qkv_bias"],
                                                     batch_first=True
                                                     )
        
        self.dropout = torch.nn.Dropout(config["drop_rate"])
        self.norm2 = torch.nn.LayerNorm(config["emb_dim"])
        
        self.feedforward = torch.nn.Sequential(
            torch.nn.Linear(config["emb_dim"], 4 * config["emb_dim"]),
            torch.nn.GELU(),
            torch.nn.Linear(4 * config["emb_dim"], config["emb_dim"]),
        )

    def forward(self, x):
        bs, seq, emb = x.shape

        x_shortcut = x
        x = self.norm1(x)

        mask = torch.nn.Transformer.generate_square_subsequent_mask(seq)
        attn_output, attn_output_weights = self.attention(query=x, key=x, value=x, is_causal=True, attn_mask=mask)
        
        x = self.dropout(attn_output)

        x = x + x_shortcut
        x_shortcut = x

        x = self.norm2(x)
        x = self.feedforward(x)
        x = self.dropout(x)
        
        return x + x_shortcut


In [11]:
torch.manual_seed(123)
x = torch.rand(2, 4, 768)
block = MyTransformerBlock(GPT_CONFIG)
output = block(x)
print("Input Shape:", x.shape)
print("Output Shape:", output.shape)

Input Shape: torch.Size([2, 4, 768])
Output Shape: torch.Size([2, 4, 768])


## GPT2 From Scratch

* GPT-2 small = 124 million params
* GPT-2 Medium = 1024-dimensional embeddings, 24 transformer blocks, 16-multi-head attention heads
* GPT-2 large = 1280-dimensional embeddings, 36 transformer blocks, 20-multi-head attention heads
* GPT-2 XL = 1600-dimensional embeddings, 48 transformer blocks, 25-multi-head attention heads

In [12]:

class GPTModel(torch.nn.Module):
    def __init__(self, config: dict):
        super().__init__()

        self.token_embedding = torch.nn.Embedding(config['vocab_size'], config["emb_dim"])
        self.position_embedding = torch.nn.Embedding(config['context_length'], config["emb_dim"])
        self.drop_emb = torch.nn.Dropout(config["drop_rate"])

        self.transformer_blocks = torch.nn.Sequential(
            *[MyTransformerBlock(config) for _ in range(config["n_layers"])]
        )

        self.layer_norm = torch.nn.LayerNorm(config["emb_dim"])
        self.out_head = torch.nn.Linear(config["emb_dim"], config["vocab_size"], bias=False)

    def forward(self, x):
        #print(x.shape)
        
        batch_size, seq_len = x.shape   #[BS, CONTEXT_LENGTH]

        token_embeds = self.token_embedding(x)
        position_embeds = self.position_embedding(torch.arange(seq_len, device=x.device))

        x = token_embeds + position_embeds
        x = self.drop_emb(x)
        x = self.transformer_blocks(x)
        x = self.layer_norm(x)
        logits = self.out_head(x)
        return logits

In [5]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
tokenizer

<Encoding 'gpt2'>

In [6]:
input_batch = torch.tensor([ 
                tokenizer.encode("Every effort moves you"),
                tokenizer.encode("Every day holds a"),
              ])
input_batch, input_batch.shape

(tensor([[6109, 3626, 6100,  345],
         [6109, 1110, 6622,  257]]),
 torch.Size([2, 4]))

In [39]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG)
output = model(input_batch)
print("Input Shape:", x.shape)
print("Output Shape:", output.shape)

Input Shape: torch.Size([2, 4, 768])
Output Shape: torch.Size([2, 4, 50257])


* GPT-2 model outputs [batch_size, num_token, vocab_size]

In [24]:
from torchinfo import summary
model = GPTModel(GPT_CONFIG)
summary(model, input_data=input_batch)

torch.Size([2, 4])


Layer (type:depth-idx)                   Output Shape              Param #
GPTModel                                 [2, 4, 50257]             --
├─Embedding: 1-1                         [2, 4, 768]               38,597,376
├─Embedding: 1-2                         [4, 768]                  786,432
├─Dropout: 1-3                           [2, 4, 768]               --
├─Sequential: 1-4                        [2, 4, 768]               --
│    └─MyTransformerBlock: 2-1           [2, 4, 768]               --
│    │    └─LayerNorm: 3-1               [2, 4, 768]               1,536
│    │    └─MultiheadAttention: 3-2      [2, 4, 768]               2,359,296
│    │    └─Dropout: 3-3                 [2, 4, 768]               --
│    │    └─LayerNorm: 3-4               [2, 4, 768]               1,536
│    │    └─Sequential: 3-5              [2, 4, 768]               4,722,432
│    │    └─Dropout: 3-6                 [2, 4, 768]               --
│    └─MyTransformerBlock: 2-2           [2, 4, 768]

In [25]:
total_params = sum(p.numel() for p in model.parameters())
print("Total Params:", total_params)

Total Params: 163000320


* 163 million params
* The original GPT-2 architecture is 124 million parameters and reuses the weights from the token embedding layer in its output layer.

In [27]:
print("Token Embedding Layer Shape:", model.token_embedding.weight.shape)
print("Output Layer Shape:", model.out_head.weight.shape)

Token Embedding Layer Shape: torch.Size([50257, 768])
Output Layer Shape: torch.Size([50257, 768])


In [30]:
163000320 - sum(param.numel() for param in model.out_head.parameters())

124402944

* the original 124-million params
* However, using separate token embedding and output layers results in better training and model performance, hence we use separate layers in out GPT implementation. The same is true for modern LLMs.

### Let's compute the memory requirement

In [None]:
total_size_bytes = total_params * 4   # assuming each parameter is 32-bit float, hence 4 bytes
total_size_mb = total_size_bytes / (1024 * 1024) # KB to MB
print("Total Size of the model (MB):", total_size_mb)

Total Size of the model (MB): 621.796875


### Generating Text

In [None]:
def generate_text_sample(model, idx, max_new_tokens, context_size):
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # last vector
        probas = torch.softmax(logits, dim=-1)
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim=1)
    return idx

In [None]:
start_context = "Do you know anything"
encoded = tokenizer.encode(start_context)
print("Encoded:", encoded)

encoded_tensor = torch.tensor(encoded).unsqueeze(0)
print(encoded_tensor, encoded_tensor.shape)

Encoded: [5211, 345, 760, 1997]
tensor([[5211,  345,  760, 1997]]) torch.Size([1, 4])


In [None]:
torch.tensor(encoded).unsqueeze(0).shape

In [59]:
model.eval()
out = generate_text_sample(model, encoded_tensor, max_new_tokens=6, context_size=GPT_CONFIG["context_length"])

print("Ouput:", out)
print("Decoded Text:", tokenizer.decode(out.squeeze().tolist()))

tensor([[5211,  345,  760, 1997]])
tensor([[ 5211,   345,   760,  1997, 36255]])
tensor([[ 5211,   345,   760,  1997, 36255, 39648]])
tensor([[ 5211,   345,   760,  1997, 36255, 39648, 10355]])
tensor([[ 5211,   345,   760,  1997, 36255, 39648, 10355, 17283]])
tensor([[ 5211,   345,   760,  1997, 36255, 39648, 10355, 17283, 33705]])
Ouput: tensor([[ 5211,   345,   760,  1997, 36255, 39648, 10355, 17283, 33705, 45834]])
Decoded Text: Do you know anything HEL Baalalties TC img stagnation
