In [32]:
import torch 
import torch.nn as nn
import tiktoken

In [33]:
# Exercise 4.1 Number of parameters in feed forward and attention modules
# Calculate and compare the number of parameters that are contained in the feed forward module 
# and those that are contained in the multi-head attention module.

In [34]:
from gpt_model import TransformerBlock

GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [35]:
block = TransformerBlock(GPT_CONFIG_124M)
print(block)

TransformerBlock(
  (norm1): LayerNorm()
  (att): MultiHeadAttention(
    (W_query): Linear(in_features=768, out_features=768, bias=False)
    (W_key): Linear(in_features=768, out_features=768, bias=False)
    (W_value): Linear(in_features=768, out_features=768, bias=False)
    (out_proj): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (norm2): LayerNorm()
  (feedforward): FeedForward(
    (layers): Sequential(
      (0): Linear(in_features=768, out_features=3072, bias=True)
      (1): GELU()
      (2): Linear(in_features=3072, out_features=768, bias=True)
    )
  )
)


In [36]:
# total parameters in the feed forward mod

total_params = sum(p.numel() for p in block.feedforward.parameters())
print(f"Total number of parameters for feed forward module: {total_params :,}")

Total number of parameters for feed forward module: 4,722,432


The calculations for  within the feed forward module:
- 1st Linear layer: 768 inputs × 4×768 outputs + 4×768 bias units = 2,362,368
- 2nd Linear layer: 4×768 inputs × 768 outputs + 768 bias units = 2,360,064
- Total: 1st Linear layer + 2nd Linear layer = 2,362,368 + 2,360,064 = 4,722,432

In [37]:
#Paramters in the attention mod
total_params = sum(p.numel() for p in block.att.parameters())
print(f"Total number of parameters in attention module: {total_params:,}")

Total number of parameters in attention module: 2,360,064


Calculations for the attention mod

- W_query: 768 inputs × 768 outputs = 589,824
- W_key: 768 inputs × 768 outputs = 589,824
- W_value: 768 inputs × 768 outputs = 589,824
- out_proj: 768 inputs × 768 outputs + 768 bias units = 590,592
- Total: W_query + W_key + W_value + out_proj = 3×589,824 + 590,592 = 2,360,064

In [38]:
#4.2 Initializing GPT models

GPT2-small 
emb_dim = 768
n_layers = 12
n_heads = 12

GPT2-medium
emb_dim = 1024
n_layers = 24
n_heads = 16

GPT2- Large:
emb_dim = 1280
n_layers = 24
n_heads = 16

GPT2-XL
emb_dim = 1600
n_layers = 12
n_heads = 12




In [39]:
GPT_CONFIG_SMALL = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

GPT_CONFIG_MED = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 1024,
    "n_heads": 16,
    "n_layers": 24,
    "drop_rate": 0.1,
    "qkv_bias": False
}

GPT_CONFIG_LG = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 1280,
    "n_heads": 20,
    "n_layers": 36,
    "drop_rate": 0.1,
    "qkv_bias": False
}

GPT_CONFIG_XL = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 1600,
    "n_heads": 25,
    "n_layers": 48,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [40]:
configs = {
    "medium": GPT_CONFIG_MED,
    "xl" : GPT_CONFIG_XL
}

In [43]:
def calculate_size(model): # based on chapter code
    
    total_params = total_params = sum(p.numel() for p in block.att.parameters())
    print(f"Total number of parameters in model: {total_params:,}")

In [44]:
from gpt_model import GPTModel

for model_abbrev in ("medium", "xl"):
    model_name = f"gpt2-{model_abbrev}"
    config = configs[model_abbrev]
    model = GPTModel(config)
    print(f"\n\n{model_name.upper()}:")
    calculate_size(model)



GPT2-MEDIUM:
Total number of parameters in model: 2,360,064


GPT2-XL:
Total number of parameters in model: 2,360,064
