In [1]:
import torch
from transformers import GPT2LMHeadModel,GPT2Tokenizer

In [2]:
device="cuda" if torch.cuda.is_available() else "cpu"
model=GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer=GPT2Tokenizer.from_pretrained("gpt2")


In [3]:
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [4]:
config=model.config

print("Layers:", config.n_layer)
print("Heads:", config.n_head)
print("Hidden size:", config.n_embd)
print("FFN size:", 4 * config.n_embd)

Layers: 12
Heads: 12
Hidden size: 768
FFN size: 3072


In [5]:
total = 0
bias_count = 0

for name, param in model.named_parameters():
    total += param.numel()
    if "bias" in name:
        bias_count += param.numel()

print("Total parameters:", total)
print("Bias parameters:", bias_count)
print("Bias percentage:", bias_count / total * 100)


Total parameters: 124439808
Bias parameters: 102144
Bias percentage: 0.08208305818022477


In [6]:
from collections import defaultdict

param_groups = defaultdict(list)

for name, param in model.named_parameters():
    if "attn" in name:
        param_groups["attention"].append((name, param))
    elif "mlp" in name:
        param_groups["ffn"].append((name, param))
    elif "ln" in name:
        param_groups["layernorm"].append((name, param))
    elif "embed" in name or "wte" in name or "wpe" in name:
        param_groups["embedding"].append((name, param))
    else:
        param_groups["other"].append((name, param))

for k in param_groups:
    print(k, len(param_groups[k]))


embedding 2
layernorm 50
attention 48
ffn 48


### Dummy Training Batch

In [7]:
text = "The transformer architecture is"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model(**inputs, labels=inputs["input_ids"])
loss = outputs.loss
loss.backward()


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


### Identify Parameters NOT Contributing to Gradient Descent

In [8]:
dead_params = []
tiny_grad_params = []

THRESH = 1e-6

for name, param in model.named_parameters():
    if param.grad is None:
        dead_params.append(name)
    else:
        gnorm = param.grad.norm().item()
        if gnorm < THRESH:
            tiny_grad_params.append((name, gnorm))

print("No gradient at all:", len(dead_params))
print("Near-zero gradient:", len(tiny_grad_params))


No gradient at all: 0
Near-zero gradient: 0


### Attention Headâ€“Level Gradient Analysis (Very Important)

In [9]:
def analyze_attention_heads(layer_idx):
    block = model.transformer.h[layer_idx]
    Wqkv = block.attn.c_attn.weight  # [768, 3*768]
    grad = Wqkv.grad

    d_model = config.n_embd
    n_heads = config.n_head
    head_dim = d_model // n_heads

    grad = grad.view(d_model, 3, n_heads, head_dim)

    head_norms = grad.norm(dim=(0, 3))  # (3, n_heads)

    return head_norms.mean(dim=0)  # average QKV

for i in range(config.n_layer):
    norms = analyze_attention_heads(i)
    print(f"Layer {i} head grad norms:", norms)


Layer 0 head grad norms: tensor([0.5569, 0.3553, 0.3728, 0.3998, 0.3836, 0.4077, 0.4608, 0.4338, 0.6736,
        0.4773, 0.3687, 0.3877], device='cuda:0')
Layer 1 head grad norms: tensor([0.1802, 0.2405, 0.1527, 0.1698, 0.1875, 0.3017, 0.3475, 0.4338, 0.3099,
        0.1980, 0.1850, 0.3803], device='cuda:0')
Layer 2 head grad norms: tensor([0.1841, 0.5805, 0.1350, 0.1870, 0.1795, 0.2435, 0.2480, 0.2832, 0.1293,
        0.2410, 0.2017, 0.1819], device='cuda:0')
Layer 3 head grad norms: tensor([0.1662, 0.1242, 0.3059, 0.1461, 0.1999, 0.2719, 0.2028, 0.1585, 0.2294,
        0.1064, 0.2707, 0.1696], device='cuda:0')
Layer 4 head grad norms: tensor([0.1371, 0.1102, 0.1738, 0.2548, 0.3914, 0.1502, 0.2171, 0.1568, 0.3115,
        0.1308, 0.2506, 0.1038], device='cuda:0')
Layer 5 head grad norms: tensor([0.2028, 0.1640, 0.1957, 0.2187, 0.2672, 0.2162, 0.1278, 0.1977, 0.1511,
        0.2598, 0.3465, 0.3506], device='cuda:0')
Layer 6 head grad norms: tensor([0.1687, 0.1632, 0.2366, 0.2068, 0.303

### Detect Dead Neurons in FFN

In [10]:
def analyze_ffn_neurons(layer_idx):
    block = model.transformer.h[layer_idx]
    W1 = block.mlp.c_fc.weight.grad  # [3072, 768]

    neuron_norms = W1.norm(dim=1)  # per neuron
    return neuron_norms

dead_neurons = []

for i in range(config.n_layer):
    norms = analyze_ffn_neurons(i)
    dead = (norms < 1e-6).sum().item()
    print(f"Layer {i} dead FFN neurons:", dead)


Layer 0 dead FFN neurons: 0
Layer 1 dead FFN neurons: 0
Layer 2 dead FFN neurons: 0
Layer 3 dead FFN neurons: 0
Layer 4 dead FFN neurons: 0
Layer 5 dead FFN neurons: 0
Layer 6 dead FFN neurons: 0
Layer 7 dead FFN neurons: 0
Layer 8 dead FFN neurons: 0
Layer 9 dead FFN neurons: 0
Layer 10 dead FFN neurons: 0
Layer 11 dead FFN neurons: 0


### Layer-Level Redundancy Detection

In [11]:
def layer_effectiveness(layer_idx, x):
    block = model.transformer.h[layer_idx]
    with torch.no_grad():
        y = block(x)[0]
    return (y - x).norm().item()

x = model.transformer.wte(inputs["input_ids"]).to(device)

for i in range(config.n_layer):
    print(f"Layer {i} delta norm:", layer_effectiveness(i, x))


Layer 0 delta norm: 123.80655670166016
Layer 1 delta norm: 286.76934814453125
Layer 2 delta norm: 843.882568359375
Layer 3 delta norm: 217.49032592773438
Layer 4 delta norm: 231.7343292236328
Layer 5 delta norm: 331.9229736328125
Layer 6 delta norm: 225.51446533203125
Layer 7 delta norm: 197.25697326660156
Layer 8 delta norm: 192.83938598632812
Layer 9 delta norm: 284.3265075683594
Layer 10 delta norm: 385.94677734375
Layer 11 delta norm: 3905.130615234375
