### Model input

In [2]:
from transformers import AutoModelForCausalLM
import torch

model_name = "Qwen/Qwen3-8B"

# Load the released checkpoint and count every parameter tensor it contains.
# transformers warns that `torch_dtype` is deprecated in favour of `dtype`,
# so the newer keyword is used here.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",
    device_map="auto"
)
n = sum(p.numel() for p in model.parameters())
print(f"HF total: {n:,}")  # 8,190,735,360 (~8.19B)


  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 10.12it/s]

HF total: 8,190,735,360





### Component-wise calculation

In [None]:
from transformers import AutoConfig, AutoModelForCausalLM

# ---------------------------
# Qwen3-8B config
# ---------------------------
# Hardcoded architecture hyperparameters fed to the closed-form count below.
# The same fields are read from AutoConfig and printed as "Config check"
# further down in this cell.
V   = 151_936     # vocab_size
E   = 4_096       # hidden_size
I   = 12_288      # intermediate_size (SwiGLU)
L   = 36          # num_hidden_layers
H   = 32          # num_attention_heads
H_kv= 8           # num_key_value_heads (GQA)
D   = 128         # head_dim (here E == H * D: 32 * 128 = 4_096)
TIE = False       # tie_word_embeddings (Qwen3-8B untied)

# ---------------------------
# 1) Hardcoded formula counts
# ---------------------------
def hardcoded_per_layer_counts(V, E, I, L, H, H_kv, D, TIE):
    """Closed-form parameter counts for a Qwen3-style dense decoder.

    Args:
        V: vocabulary size.
        E: hidden size.
        I: MLP intermediate size (SwiGLU).
        L: number of decoder layers.
        H: number of query heads.
        H_kv: number of key/value heads (GQA).
        D: per-head dimension.
        TIE: whether the LM head shares the token-embedding matrix.

    Returns:
        dict of int counts: per-layer components ("attn", "mlp", "rms", "qkn",
        "per_layer_total"), "blocks_total" for all L layers, "token_emb",
        "lm_head", "final_norm", and "grand_total" for the whole model.
    """
    q_dim  = H * D      # width of the concatenated query heads
    kv_dim = H_kv * D   # width of the concatenated key (or value) heads

    # Attention (no biases): q(E->H*D) + k(E->Hkv*D) + v(E->Hkv*D) + o(H*D->E).
    # Sizing q/o by H*D instead of E keeps the count right when head_dim is
    # not E // H; for H*D == E this reduces to (2 + 2*H_kv/H) * E^2.
    attn = (E * q_dim) + (E * kv_dim) + (E * kv_dim) + (q_dim * E)
    # MLP (SwiGLU, no biases): gate(E->I) + up(E->I) + down(I->E)
    mlp  = (E * I) + (E * I) + (I * E)                            # = 3 * E * I
    # RMSNorms per layer: input + post (weights only)
    rms  = 2 * E
    # QK-Norm per layer: q_norm(D) + k_norm(D)
    qkn  = 2 * D

    per_layer  = attn + mlp + rms + qkn
    blocks     = L * per_layer
    token_emb  = V * E
    lm_head    = 0 if TIE else (E * V)  # a tied head adds no parameters
    # Final RMSNorm after the last block: E weights that belong to no layer.
    # Without it the total falls exactly E short of the real model.
    final_norm = E

    return {
        "attn": attn,
        "mlp": mlp,
        "rms": rms,
        "qkn": qkn,
        "per_layer_total": per_layer,
        "blocks_total": blocks,
        "token_emb": token_emb,
        "lm_head": lm_head,
        "final_norm": final_norm,
        "grand_total": token_emb + lm_head + blocks + final_norm,
    }

# Evaluate the closed-form counts once; `hard` is reused by the printouts and
# the model-vs-formula comparisons later in this cell.
hard = hardcoded_per_layer_counts(V, E, I, L, H, H_kv, D, TIE)

def fmt(n: int) -> str:
    """Render an integer with comma thousands separators (e.g. 4096 -> '4,096')."""
    return format(n, ",")

print("=== Hardcoded (formula) per-layer counts ===")
# (printed label, key in `hard`) pairs, in display order.
for row_label, hard_key in (
    ("attn :", "attn"),
    ("mlp  :", "mlp"),
    ("rms  :", "rms"),
    ("qkn  :", "qkn"),
    ("SUM  :", "per_layer_total"),
):
    print(row_label, fmt(hard[hard_key]))
print()

# ----------------------------------------
# 2) Build model from config (no weights)
# ----------------------------------------
# Only the config is fetched; the module tree is then instantiated from it,
# so no checkpoint shards are downloaded.
cfg = AutoConfig.from_pretrained("Qwen/Qwen3-8B")
model = AutoModelForCausalLM.from_config(cfg)   # no checkpoint download

# Quick sanity that config matches what we hardcoded
cfg_vals = dict(
    V=cfg.vocab_size,
    E=cfg.hidden_size,
    I=cfg.intermediate_size,
    L=cfg.num_hidden_layers,
    H=cfg.num_attention_heads,
    # Without a GQA field every query head has its own KV head, so fall back
    # to the config's own head count rather than the hardcoded H.
    H_kv=getattr(cfg, "num_key_value_heads", cfg.num_attention_heads),
    D=getattr(cfg, "head_dim", cfg.hidden_size // cfg.num_attention_heads),
    TIE=getattr(cfg, "tie_word_embeddings", False),
)
print("Config check:", cfg_vals)

# Actually compare against the hardcoded constants instead of relying on the
# reader to eyeball the printed dict.
hardcoded_vals = dict(V=V, E=E, I=I, L=L, H=H, H_kv=H_kv, D=D, TIE=TIE)
cfg_mismatches = {
    key: (hardcoded_vals[key], cfg_val)
    for key, cfg_val in cfg_vals.items()
    if hardcoded_vals[key] != cfg_val
}
for key, (hardcoded_val, cfg_val) in cfg_mismatches.items():
    print(f"MISMATCH {key}: hardcoded={hardcoded_val}  config={cfg_val}")
print()

# ---------------------------
# 3) Count from the built model
# ---------------------------
def count_params(module) -> int:
    """Total number of elements across the module's parameters that require grad."""
    total = 0
    for param in module.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def per_layer_model_counts(model):
    """Count parameters of each decoder block in `model.model.layers`.

    Args:
        model: a causal-LM wrapper exposing `model.model.layers`, where each
            block has `self_attn` (q/k/v/o projections, optionally
            `q_norm`/`k_norm`), `mlp` (gate/up/down projections),
            `input_layernorm` and `post_attention_layernorm`.

    Returns:
        A list with one dict per block, in layer order, with keys "layer"
        (index), "attn", "mlp", "rms", "qkn" and "sum" (total of the four
        component counts).
    """
    def _count_optional(submodule) -> int:
        # QK-Norm may be missing (or set to None) on other architectures.
        return count_params(submodule) if submodule is not None else 0

    per_layer = []
    for i, block in enumerate(model.model.layers):
        sa  = block.self_attn
        mlp = block.mlp

        # Attention projections: q, k, v and the output projection.
        attn = sum(
            count_params(proj)
            for proj in (sa.q_proj, sa.k_proj, sa.v_proj, sa.o_proj)
        )

        # QK-Norm: counted only when the attention module carries it.
        qkn = (
            _count_optional(getattr(sa, "q_norm", None))
            + _count_optional(getattr(sa, "k_norm", None))
        )

        # RMSNorms: pre-attention and post-attention.
        rms = (
            count_params(block.input_layernorm)
            + count_params(block.post_attention_layernorm)
        )

        # MLP (SwiGLU): gate, up and down projections.
        mlp_total = sum(
            count_params(proj)
            for proj in (mlp.gate_proj, mlp.up_proj, mlp.down_proj)
        )

        d = {
            "layer": i,
            "attn": attn,
            "mlp": mlp_total,
            "rms": rms,
            "qkn": qkn,
        }
        d["sum"] = d["attn"] + d["mlp"] + d["rms"] + d["qkn"]
        per_layer.append(d)

    return per_layer

pl = per_layer_model_counts(model)

# Print a few layers and compare vs formula
print("=== Per-layer counts from built model (no weights) ===")
# Pick first, second and last layer from the layers actually built (not the
# hardcoded L), dropping indices that do not exist in a very shallow model.
for i in sorted({0, 1, len(pl) - 1} & set(range(len(pl)))):
    d = pl[i]
    print(f"[layer {i}]  attn={fmt(d['attn'])}  mlp={fmt(d['mlp'])}  rms={fmt(d['rms'])}  qkn={fmt(d['qkn'])}  sum={fmt(d['sum'])}")

# Collapse each component to the set of distinct values seen across layers;
# a single-element set means every layer has the same count.
all_attn = {d["attn"] for d in pl}
all_mlp  = {d["mlp"]  for d in pl}
all_rms  = {d["rms"]  for d in pl}
all_qkn  = {d["qkn"]  for d in pl}
all_sum  = {d["sum"]  for d in pl}
print("\nDistinct values across layers:")
print("attn:", {fmt(x) for x in all_attn})
print("mlp :", {fmt(x) for x in all_mlp})
print("rms :", {fmt(x) for x in all_rms})
print("qkn :", {fmt(x) for x in all_qkn})
print("sum :", {fmt(x) for x in all_sum})
print()

# ---------------------------
# 4) Side-by-side comparison
# ---------------------------
def check(label, got, expect):
    """Print one comparison row: model count, expected count, and OK or the signed difference."""
    if got == expect:
        verdict = "OK"
    else:
        verdict = f"DIFF ({fmt(got - expect)})"
    print(f"{label:18s} model={fmt(got)}  | expect={fmt(expect)}  -> {verdict}")

print("=== Comparison (per-layer) ===")
# For each component, take one value from its cross-layer set and compare it
# with the corresponding formula entry in `hard`.
for row_label, observed_values, hard_key in (
    ("attn", all_attn, "attn"),
    ("mlp",  all_mlp,  "mlp"),
    ("rms",  all_rms,  "rms"),
    ("qkn",  all_qkn,  "qkn"),
    ("sum",  all_sum,  "per_layer_total"),
):
    check(row_label, next(iter(observed_values)), hard[hard_key])

# Totals: the decoder blocks plus every parameter that lives outside them.
blocks_model = sum(d["sum"] for d in pl)
token_emb_model = count_params(model.model.embed_tokens)
# A tied head reuses the embedding matrix, so it adds no parameters of its own.
head_is_tied = model.lm_head.weight is model.model.embed_tokens.weight
lm_head_model   = 0 if head_is_tied else count_params(model.lm_head)
# Final RMSNorm after the last block; leaving it out makes the sum fall
# hidden_size short of the real parameter count.
final_norm_model = count_params(model.model.norm)
grand_model = token_emb_model + lm_head_model + blocks_model + final_norm_model

# Expected total is assembled from its components so that the final norm
# (E weights) is always included.
final_norm_expect = hard.get("final_norm", E)
grand_expect = hard["token_emb"] + hard["lm_head"] + hard["blocks_total"] + final_norm_expect

print("\n=== Totals ===")
check("blocks total", blocks_model, hard["blocks_total"])
check("embed_tokens", token_emb_model, hard["token_emb"])
check("lm_head",      lm_head_model,   hard["lm_head"])
check("final norm",   final_norm_model, final_norm_expect)
check("GRAND TOTAL",  grand_model,     grand_expect)
# Cross-check against a plain sum over every parameter, which catches any
# component missing from the breakdown above.
check("all parameters", sum(p.numel() for p in model.parameters()), grand_expect)


=== Hardcoded (formula) per-layer counts ===
attn : 41,943,040
mlp  : 150,994,944
rms  : 8,192
qkn  : 256
SUM  : 192,946,432

Config check: {'V': 151936, 'E': 4096, 'I': 12288, 'L': 36, 'H': 32, 'H_kv': 8, 'D': 128, 'TIE': False}

=== Per-layer counts from built model (no weights) ===
[layer 0]  attn=41,943,040  mlp=150,994,944  rms=8,192  qkn=256  sum=192,946,432
[layer 1]  attn=41,943,040  mlp=150,994,944  rms=8,192  qkn=256  sum=192,946,432
[layer 35]  attn=41,943,040  mlp=150,994,944  rms=8,192  qkn=256  sum=192,946,432

Distinct values across layers:
attn: {'41,943,040'}
mlp : {'150,994,944'}
rms : {'8,192'}
qkn : {'256'}
sum : {'192,946,432'}

=== Comparison (per-layer) ===
attn               model=41,943,040  | expect=41,943,040  -> OK
mlp                model=150,994,944  | expect=150,994,944  -> OK
rms                model=8,192  | expect=8,192  -> OK
qkn                model=256  | expect=256  -> OK
sum                model=192,946,432  | expect=192,946,432  -> OK

=== Totals