# 1. GPT Parameter Count

In [22]:
from transformers import GPT2Model
model = GPT2Model.from_pretrained('gpt2')

# Function to count the total parameters for the model
def count_model_parameters(model, is_human: bool):
    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return f"{params / 1e6:.2f}M" if is_human else params

In [2]:
print(model)

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)


## Word Embedding Matrix

Let $x\in\mathbb{R}^{V\times P}$ be the input sequence to GPT2. The word embedding function is written as
$$\text{Word Embedding}(x)=Wx, \ W \in \mathbb{R}^{E\times V}$$

The number of parameters for the word embedding layer is $V\times E = 50257\cdot 768 = 38,597,376$

In [3]:
# Verify the parameter count
print("Word Embedding Parameter Count: ", count_model_parameters(model.wte, is_human=False))

Word Embedding Parameter Count:  38597376


## Positional Embedding Matrix

Let $x\in\mathbb{R}^{E \times P}$, represent the input sequence after the going through the word embedding function. We can write the positional encoding function as,
$$\text{Word Positional Embedding}(x)=W_p + x, \ W_p\in\mathbb{R}^{E\times P}$$
The number of parameters for the word embedding layer is $E\times P=768\cdot 1024=786,432$

In [4]:
# Verify the parameter count
print("Positional Embedding Parameter Count: ", count_model_parameters(model.wpe, is_human=False))

Positional Embedding Parameter Count:  786432


# Multi-Head Attention

Let $x\in\mathbb{R}^{E\times P}$ be the input sequence after going through the word embedding and positional embedding matrix, then the we can write the attention mechanism as
$$\text{Attention}(x) = (W_{V}x + b_V)\text{Softmax}\left(\frac{(W_Qx + b_Q)(W_Kx + b_K)^\top}{\sqrt{E}}\right) \ W_{K, Q, V} \in \mathbb{R}^{(E/H)\times E}, \ b_{Q, K, V} \in \mathbb{R}^{E/H}$$
$$\text{Multi Head Attention}(x) = W_O \ \text{concat}(\text{head}_1,\dots, \text{head}_H) + b_O, \ W_O \in \mathbb{R}^{E\times E}, b_O \in \mathbb{R}^E$$
The number of parameter for MHA is $4 \times E^2 + 4\times E$

In [5]:
# Verify parameter count
count_attn_params = lambda E: (4 * E) * (E + 1)
print("Estimated Multi Head Attention Parameter Count: ", count_attn_params(E=768))
print("Multi Head Attention Parameter Count: ", count_model_parameters(model.h[0].attn, is_human=False))

Estimated Multi Head Attention Parameter Count:  2362368
Multi Head Attention Parameter Count:  2362368


## MLP

Let $x\in\mathbb{R}^{E\times P}$ be the input after going through MHA. We can write the MLP layer as,
$$\text{MLP}(x)=W_{\text{Out}}\text{GELU}(W_{\text{In}}x + b_{\text{In}}) + b_{\text{Out}}, \ W_{\text{In}} \in \mathbb{R}^{4E \times E}, b_{\text{In}} \in \mathbb{R}^{4E}, W_{\text{Out}} \in \mathbb{R}^{E \times 4E}, b_{\text{Out}} \in \mathbb{R}^E$$
The number parameters for the MLP layer is $8 \times E^2 + 5\times E = 4,718,592$

In [6]:
# Verify parameter count
count_mlp_params = lambda E: 8*E**2 + 5*E
print("Estimated MLP Layer Parameter Count: ", count_mlp_params(E=768))
print("MLP Layer Parameter Count: ", count_model_parameters(model.h[0].mlp, is_human=False))

Estimated MLP Layer Parameter Count:  4722432
MLP Layer Parameter Count:  4722432


## LayerNorm

The LayerNorm module is explicitly written as,
$$\text{LayerNorm}(x)=\frac{x-\text{E}[x]}{\sqrt{\text{Var}[x]+\epsilon}}*\gamma + \beta, x, \gamma, \beta \in \mathbb{R}^{E\times P}$$
Since $x$ is the input variable and $\epsilon$ is a stabilizing parameter, then the learnable parameters are $\gamma,\beta$. The number of parameters for a LayerNorm module is $2\times E = 1,536$

In [7]:
# Verify parameter count
print("LayerNorm Module Parameter Count: ", count_model_parameters(model.h[0].ln_1, is_human=False))

LayerNorm Module Parameter Count:  1536


## Total Parameter Count for GPT2

A transformer block consists of MHA, MLP, and 2 LayerNorms (each before MHA/MLP), then the number of parameters for a transformers is equivalent to $(4 \times E^2 + 4\times E) + (8\times E^2 + 5\times E) + 2\times (2\times E)= 7,087,872$. Since there are $L=12$ transformers block, the final parameter count for GPT2 is equivalent to
$$V\times E + E\times P + L[(4 \times E^2 + 4\times E) + (8\times E^2 + 5\times E) + 2\times (2\times E)] + 2\times E \\
=\ 38,597,376 + 786,432 + 12*7,087,872 + 1,536 = 124,439,808$$

Based on the given formula, the parameter size of GPT2 scales quadratically with the $E$.

In [8]:
count_mlp_params(E=768) + count_attn_params(E=768) + 4*768

7087872

In [9]:
# Total number of parameters in GPT2
def count_gpt_params(V, P, E, L, is_human: bool): 
    est_params = V * E + P * E + L * (count_attn_params(E) + count_mlp_params(E) + 4*E) + 2*E
    return f"{est_params/1e6:.2f}M" if is_human else est_params

print("Estimated GPT2 Parameters", count_gpt_params(50257, 1024, 768, 12, False))
print("Total GPT2 Parameters: ", count_model_parameters(model, is_human=False))

Estimated GPT2 Parameters 124439808
Total GPT2 Parameters:  124439808


## Estimation of GPT Sizes


In [10]:
# GPT2 Medium
# E = 1024, L = 24
print("Estimated GPT2-Medium Params: ", count_gpt_params(50257, 1024, 1024, 16, True))

#GPT2 Large
# E = 1280, L = 36
print("Estimated GPT2-Large Params: ", count_gpt_params(50257, 1024, 1280, 36, True))

#GPT2 XL
print("Estimated GPT2-XL Params: ", count_gpt_params(50257, 1024, 1600, 48, True))


Estimated GPT2-Medium Params:  254.05M
Estimated GPT2-Large Params:  774.03M
Estimated GPT2-XL Params:  1557.61M


# Modern Model Study

In [11]:
from transformers import AutoModel, AutoTokenizer

gemma4b = AutoModel.from_pretrained("google/gemma-3-4b-pt")
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-4b-pt")

Loading checkpoint shards: 100%|██████████| 2/2 [00:36<00:00, 18.41s/it]


In [3]:
print(gemma4b)

Gemma3Model(
  (vision_tower): SiglipVisionModel(
    (vision_model): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(4096, 1152)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-26): 27 x SiglipEncoderLayer(
            (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (self_attn): SiglipAttention(
              (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
            )
            (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (mlp): SiglipMLP(
              (activation_fn): 

In [24]:
count_model_parameters(gemma4b.language_model.norm, is_human=False)

2560

In [23]:
gemma4b.language_model.norm

Gemma3RMSNorm((2560,), eps=1e-06)

In [None]:
sample_sentence = "Hello, my name is John! Nice to meet you."
tokenized_sentence = tokenizer(sample_sentence, return_tensors='pt')
embedded_sentence = gemma4b.language_model.embed_tokens(tokenized_sentence.input_ids)