# Which parts of the models have the most parameters?

In [1]:
import os

from transformers import AutoModelForCausalLM

In [2]:
# Location of the Llama-3 8B checkpoint directory.
# Set the LLAMA3_8B_PATH environment variable to point at a local copy on
# another machine; when it is unset the original hard-coded path is used.
model_id = os.environ.get(
    "LLAMA3_8B_PATH",
    "/home/stefanwebb/models/llm/meta_llama3-8b",
)

# Load the causal-LM checkpoint with every weight placed on the CUDA device.
# torch_dtype="auto" leaves the dtype choice to transformers rather than
# forcing one here.
# NOTE(review): only parameter counts are read from this model in the visible
# cells, so a GPU may not be strictly required - confirm before changing
# device_map.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='cuda',
    torch_dtype="auto"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# Print the fully-qualified name of every parameter, one per line, to see
# which substrings can be used to group them.
for param_name, _tensor in base_model.named_parameters():
    print(param_name)

model.embed_tokens.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.v_proj.weight
model.layers.0.self_attn.o_proj.weight
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.up_proj.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.1.self_attn.q_proj.weight
model.layers.1.self_attn.k_proj.weight
model.layers.1.self_attn.v_proj.weight
model.layers.1.self_attn.o_proj.weight
model.layers.1.mlp.gate_proj.weight
model.layers.1.mlp.up_proj.weight
model.layers.1.mlp.down_proj.weight
model.layers.1.input_layernorm.weight
model.layers.1.post_attention_layernorm.weight
model.layers.2.self_attn.q_proj.weight
model.layers.2.self_attn.k_proj.weight
model.layers.2.self_attn.v_proj.weight
model.layers.2.self_attn.o_proj.weight
model.layers.2.mlp.gate_proj.weight
model.layers.2.mlp.up_proj.weight
model.layers.2.mlp.down_proj.weight
model.layers.2.inp

In [5]:
"""
Use these substrings to group weights
    embed_tokens
    layernorm
    self_attn
    mlp
    [anything else]
"""
from collections import defaultdict
import torch
grouped_params = defaultdict(int)

for name, param in base_model.named_parameters():
    if 'embed_tokens' in name:
        grouped_params['embed_tokens'] += torch.numel(param)
    elif 'layernorm' in name:
        grouped_params['layernorm'] += torch.numel(param)
    elif 'self_attn' in name:
        grouped_params['self_attn'] += torch.numel(param)
    elif 'mlp' in name:
        grouped_params['mlp'] += torch.numel(param)
    else:
        print(name)
        grouped_params['etc'] += torch.numel(param)

model.norm.weight
lm_head.weight


In [6]:
# Display the raw element count accumulated for each parameter group.
grouped_params

defaultdict(int,
            {'embed_tokens': 525336576,
             'self_attn': 1342177280,
             'mlp': 5637144576,
             'layernorm': 262144,
             'etc': 525340672})

In [8]:
# Grand total of elements across all groups.
total_params = sum(group_count for group_count in grouped_params.values())
print(total_params)

8030261248


In [9]:
# Report each group's share of the total as a percentage, rounded to two
# decimal places.
for group_name, group_count in grouped_params.items():
    share_pct = group_count / total_params * 100
    print(group_name, round(share_pct, 2))

embed_tokens 6.54
self_attn 16.71
mlp 70.2
layernorm 0.0
etc 6.54


In [10]:
# Sanity check: the model's own parameter count next to the manual tally.
model_reported_total = base_model.num_parameters()
print(model_reported_total, total_params)

8030261248 8030261248
