# Exercise 4.1
## Number of parameters in the feed-forward and attention modules of a Transformer block

In [1]:
from gpt import TransformerBlock

# Hyperparameters for the 124M GPT configuration.
# Key meanings follow how the model classes defined later in this notebook
# read them; the imported `gpt` module is assumed to read them the same way.
GPT_CONFIG_124M = {
    "vocab_size": 50_257,      # rows of the token-embedding table
    "context_length": 1_024,   # rows of the positional-embedding table
    "emb_dim": 768,            # embedding width
    "n_heads": 12,             # attention heads per transformer block
    "n_layers": 12,            # number of stacked transformer blocks
    "drop_rate": 0.1,          # one global dropout rate for the whole model
    "qkv_bias": False,         # forwarded to the attention module's qkv_bias
}

In [None]:
# Build a single transformer block from the 124M configuration so that its
# `att` and `ff` sub-modules can be inspected in the cells below.
transformer_block = TransformerBlock(GPT_CONFIG_124M)

In [2]:
# Add up the element counts of every parameter tensor owned by the block's
# attention sub-module (`att`).
total_params = sum(
    tensor.numel() for tensor in transformer_block.att.parameters()
)
print(f"total parameters in attention layers in a single transformer block is {total_params}")

2360064


In [3]:
# Add up the element counts of every parameter tensor owned by the block's
# feed-forward sub-module (`ff`).
total_params = sum(
    tensor.numel() for tensor in transformer_block.ff.parameters()
)
print(f"total parameters in ff layers in a single transformer block is {total_params}")

total parameters in ff layers is 4722432


# Exercise 4.2
## Initializing larger GPT models

In [10]:
from gpt import GPTModel
print("Okay now we initialize GPT 2 Medium")

# Medium variant: compared with the 124M configuration, only the embedding
# width, head count and layer count grow; the other settings are unchanged.
GPT_CONFIG_GPT2_M = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=1024,
    n_heads=16,
    n_layers=24,
    drop_rate=0.1,
    qkv_bias=False,
)

GPT2_M = GPTModel(GPT_CONFIG_GPT2_M)

# Count the elements of every parameter tensor in the whole model.
total_params = sum(tensor.numel() for tensor in GPT2_M.parameters())
print(f"total parameters GPT2 Medium is {total_params}")

# Take the output head's parameters out of the tally and report again.
total_params -= sum(tensor.numel() for tensor in GPT2_M.out_head.parameters())
print(f"total parameters GPT2 Medium is {total_params} after removing params from out_head")

Okay now we initialize GPT 2 Medium
total parameters GPT2 Medium is 406212608
total parameters GPT2 Medium is 354749440 after removing params from out_head


In [12]:
print("Okay now we initialize GPT 2 Large")

# Large variant: compared with the 124M configuration, only the embedding
# width, head count and layer count grow; the other settings are unchanged.
GPT_CONFIG_GPT2_L = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=1280,
    n_heads=20,
    n_layers=36,
    drop_rate=0.1,
    qkv_bias=False,
)

GPT2_L = GPTModel(GPT_CONFIG_GPT2_L)

# Count the elements of every parameter tensor in the whole model.
total_params = sum(tensor.numel() for tensor in GPT2_L.parameters())
print(f"total parameters GPT2 Large is {total_params}")

# Take the output head's parameters out of the tally and report again.
total_params -= sum(tensor.numel() for tensor in GPT2_L.out_head.parameters())
print(f"total parameters GPT2 Large is {total_params} after removing params from out_head")


Okay now we initialize GPT 2 Large
total parameters GPT2 Large is 838220800
total parameters GPT2 Large is 773891840 after removing params from out_head


In [14]:
print("Okay now we initialize GPT 2 X-Large")

# X-Large variant: compared with the 124M configuration, only the embedding
# width, head count and layer count grow; the other settings are unchanged.
GPT_CONFIG_GPT2_XL = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 1600,
    "n_heads": 25,
    "n_layers": 48,
    "drop_rate": 0.1,
    "qkv_bias": False
}

GPT2_XL = GPTModel(GPT_CONFIG_GPT2_XL)

# Count the elements of every parameter tensor in the whole model.
total_params = sum(p.numel() for p in GPT2_XL.parameters())
print(f'total parameters GPT2 XLarge is {total_params}')

# Take the output head's parameters out of the tally and report again.
# The label previously read "GPT2 Medium" (copy-paste leftover); this is the
# X-Large model.
total_params = total_params - sum(p.numel() for p in GPT2_XL.out_head.parameters())
print(f'total parameters GPT2 XLarge is {total_params} after removing params from out_head')

# Disabled size estimate, kept as the author left it. It uses the reduced
# count above and 4 bytes per parameter (presumably float32 -- confirm), and
# the divisor converts bytes to GiB.
# total_size_bytes = total_params * 4
# total_size_gb = total_size_bytes / (1024 * 1024*1024)
# print(f"Total size of the model: {total_size_gb:.2f} GB")


Okay now we initialize GPT 2 X-Large
total parameters GPT2 XLarge is 1637792000
total parameters GPT2 Medium is 1557380800 after removing params from out_head
Total size of the model: 5.80 MB


# Exercise 4.3
## Using separate dropout parameters

In [15]:
# At the beginning of this chapter, we defined a global drop_rate setting in the
# GPT_CONFIG_124M dictionary to set the dropout rate in various places throughout the
# GPTModel architecture. Change the code to specify a separate dropout value for the
# various dropout layers throughout the model architecture. (Hint: there are three
# distinct places where we used dropout layers: the embedding layer, shortcut layer, and
# multi-head attention module.)

# 124M configuration with the single "drop_rate" replaced by one dropout
# rate per location, as read by the classes defined below.
GPT_CONFIG_124M = {
    "vocab_size": 50_257,      # rows of the token-embedding table
    "context_length": 1_024,   # rows of the positional-embedding table
    "emb_dim": 768,            # embedding width
    "n_heads": 12,             # attention heads per transformer block
    "n_layers": 12,            # number of stacked transformer blocks
    "drop_emb": 0.1,           # dropout applied to the summed embeddings
    "drop_shortcut": 0.2,      # dropout applied before each residual add
    "drop_attn": 0.3,          # dropout rate handed to MultiHeadAttention
    "qkv_bias": False,         # forwarded to MultiHeadAttention's qkv_bias
}

from gpt import MultiHeadAttention, LayerNorm, FeedForward
from torch import nn as nn
import torch

class TransformerBlock(nn.Module):
    """Transformer block with separately configurable dropout rates.

    Two residual sub-layers run in sequence: multi-head attention, then a
    feed-forward network. Each sub-layer's input is normalized first, and its
    output passes through the shortcut dropout before the residual is added
    back.

    Config keys read here: "emb_dim", "context_length", "n_heads",
    "drop_attn", "qkv_bias" and "drop_shortcut". The whole ``cfg`` is also
    handed to ``FeedForward``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]

        # The attention module receives its own dropout rate ("drop_attn").
        self.att = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_attn"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        # A single dropout module is reused on both residual branches.
        self.drop_shortcut = nn.Dropout(cfg["drop_shortcut"])

    def forward(self, x):
        """Run both residual sub-layers on ``x`` and return the result.

        Per the original shape note, the attention output is
        [batch_size, num_tokens, emb_size].
        """
        # Attention branch: normalize -> attend -> dropout -> add input back.
        residual = x
        x = self.drop_shortcut(self.att(self.norm1(x))) + residual

        # Feed-forward branch follows the same pattern.
        residual = x
        x = self.drop_shortcut(self.ff(self.norm2(x))) + residual

        return x

class GPTModel(nn.Module):
    """GPT model with a dedicated dropout rate for the embedding stage.

    Pipeline: token + positional embeddings -> embedding dropout -> a stack
    of ``cfg["n_layers"]`` TransformerBlock modules -> final LayerNorm ->
    bias-free linear head producing ``cfg["vocab_size"]`` logits per token.

    Config keys read here: "vocab_size", "context_length", "emb_dim",
    "drop_emb" and "n_layers". ``cfg`` is also passed to every
    TransformerBlock.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg["vocab_size"]
        emb_dim = cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_emb"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Return the output head's logits for a 2-D batch of token indices.

        ``in_idx`` must unpack into exactly two dimensions
        (batch_size, num_tokens).
        """
        _, num_tokens = in_idx.shape

        # Position ids 0..num_tokens-1, created on the same device as the input.
        positions = torch.arange(num_tokens, device=in_idx.device)

        # Shape [batch_size, num_tokens, emb_size]
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)



### End of Assignment