In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
device = 'cuda'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "openai-community/gpt2-medium"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Force float16 instead of BF16
    device_map="auto"           # Auto-detect the best device
)

In [3]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=4096, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [4]:
sd_hf = model.state_dict()
for k, v in sd_hf.items():
  print(k, v.shape)

transformer.wte.weight torch.Size([50257, 1024])
transformer.wpe.weight torch.Size([1024, 1024])
transformer.h.0.ln_1.weight torch.Size([1024])
transformer.h.0.ln_1.bias torch.Size([1024])
transformer.h.0.attn.c_attn.weight torch.Size([1024, 3072])
transformer.h.0.attn.c_attn.bias torch.Size([3072])
transformer.h.0.attn.c_proj.weight torch.Size([1024, 1024])
transformer.h.0.attn.c_proj.bias torch.Size([1024])
transformer.h.0.ln_2.weight torch.Size([1024])
transformer.h.0.ln_2.bias torch.Size([1024])
transformer.h.0.mlp.c_fc.weight torch.Size([1024, 4096])
transformer.h.0.mlp.c_fc.bias torch.Size([4096])
transformer.h.0.mlp.c_proj.weight torch.Size([4096, 1024])
transformer.h.0.mlp.c_proj.bias torch.Size([1024])
transformer.h.1.ln_1.weight torch.Size([1024])
transformer.h.1.ln_1.bias torch.Size([1024])
transformer.h.1.attn.c_attn.weight torch.Size([1024, 3072])
transformer.h.1.attn.c_attn.bias torch.Size([3072])
transformer.h.1.attn.c_proj.weight torch.Size([1024, 1024])
transformer.h.1

In [5]:
# # Sample prompt
# prompt = "The "
# inputs = tokenizer(prompt, return_tensors="pt").to(device)

# # Generate output text
# with torch.no_grad():
#     generated_ids = model.generate(
#         inputs.input_ids,
#         max_length=10,  # Adjust length as needed
#         temperature=0.7,  # Adjust for diversity (lower = more deterministic)
#         top_k=50,  # Consider top-k sampling
#         top_p=0.95,  # Nucleus sampling
#         do_sample=True  # Enables sampling instead of greedy decoding
#     )

# # Decode generated text
# generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# print("Generated text:", generated_text)


In [6]:
from hooks import *
remove_all_forward_hooks(model)
register_all_forward_hooks(model)

batch_size = 16
total_samples = 1024
num_batches = total_samples // batch_size

prompt = "The future of AI is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    for _ in range(num_batches):
        outputs = model(**inputs)
compute_importance_scores(model)

In [7]:
# Print importance scores for each registered module
for name, module in model.named_modules():
    if hasattr(module, "importance_scores"):
        print(f"Layer {module.__class__.__name__}: P{name} : importance scores:", module.importance_scores.shape)

Layer LayerNorm: Ptransformer.h.0.ln_1 : importance scores: torch.Size([1024])
Layer Conv1D: Ptransformer.h.0.attn.c_proj : importance scores: torch.Size([16])
Layer LayerNorm: Ptransformer.h.0.ln_2 : importance scores: torch.Size([1024])
Layer Conv1D: Ptransformer.h.0.mlp.c_fc : importance scores: torch.Size([4096])
Layer LayerNorm: Ptransformer.h.1.ln_1 : importance scores: torch.Size([1024])
Layer Conv1D: Ptransformer.h.1.attn.c_proj : importance scores: torch.Size([16])
Layer LayerNorm: Ptransformer.h.1.ln_2 : importance scores: torch.Size([1024])
Layer Conv1D: Ptransformer.h.1.mlp.c_fc : importance scores: torch.Size([4096])
Layer LayerNorm: Ptransformer.h.2.ln_1 : importance scores: torch.Size([1024])
Layer Conv1D: Ptransformer.h.2.attn.c_proj : importance scores: torch.Size([16])
Layer LayerNorm: Ptransformer.h.2.ln_2 : importance scores: torch.Size([1024])
Layer Conv1D: Ptransformer.h.2.mlp.c_fc : importance scores: torch.Size([4096])
Layer LayerNorm: Ptransformer.h.3.ln_1 : im

In [8]:
# i = 0
# for module in model.modules():
#     if hasattr(module, "importance_buffer") and module.importance_buffer:
#         # print(module.importance_buffer)
#         print(module.__class__.__name__)
#         print(len(module.importance_buffer))
#         print(module.importance_buffer[0].shape)
#         print('============')
        
#         print([module.importance_buffer[i].shape for i in range(len(module.importance_buffer))])
#         print('============')
#         i += 1
#         if i>10:
#             break
#         # all_outputs = torch.cat(module.importance_buffer, dim=0)  # Concatenate over batch dimension

#         # # Compute norm-based importance
#         # importance = all_outputs.norm(p=2, dim=0).mean(dim=0)

#         # module.importance_scores = importance
#         # del module.importance_buffer

In [9]:
import pruning 
import utils
import importlib
importlib.reload(utils) 
importlib.reload(pruning) 


<module 'pruning' from '/users/eleves-a/2021/ahmad.rammal/Desktop/pruned_GPT/pruning.py'>

In [10]:
print(model.transformer.h[0].mlp.c_fc.weight.shape)
print(model.transformer.h[0].mlp.c_proj.weight.shape)

torch.Size([1024, 4096])
torch.Size([4096, 1024])


In [11]:
print(model.transformer.h[1].mlp.c_fc.weight.shape)
print(model.transformer.h[1].mlp.c_proj.weight.shape)

torch.Size([1024, 4096])
torch.Size([4096, 1024])


In [12]:
pruning.prune_mlp(model, 2)

Pruning MLP layer to 2048 neurons
Conv1D(nf=4096, nx=1024)
Conv1D(nf=1024, nx=4096)
Pruning MLP layer to 2048 neurons
Conv1D(nf=4096, nx=1024)
Conv1D(nf=1024, nx=4096)
Pruning MLP layer to 2048 neurons
Conv1D(nf=4096, nx=1024)
Conv1D(nf=1024, nx=4096)
Pruning MLP layer to 2048 neurons
Conv1D(nf=4096, nx=1024)
Conv1D(nf=1024, nx=4096)
Pruning MLP layer to 2048 neurons
Conv1D(nf=4096, nx=1024)
Conv1D(nf=1024, nx=4096)
Pruning MLP layer to 2048 neurons
Conv1D(nf=4096, nx=1024)
Conv1D(nf=1024, nx=4096)
Pruning MLP layer to 2048 neurons
Conv1D(nf=4096, nx=1024)
Conv1D(nf=1024, nx=4096)
Pruning MLP layer to 2048 neurons
Conv1D(nf=4096, nx=1024)
Conv1D(nf=1024, nx=4096)
Pruning MLP layer to 2048 neurons
Conv1D(nf=4096, nx=1024)
Conv1D(nf=1024, nx=4096)
Pruning MLP layer to 2048 neurons
Conv1D(nf=4096, nx=1024)
Conv1D(nf=1024, nx=4096)
Pruning MLP layer to 2048 neurons
Conv1D(nf=4096, nx=1024)
Conv1D(nf=1024, nx=4096)
Pruning MLP layer to 2048 neurons
Conv1D(nf=4096, nx=1024)
Conv1D(nf=1024, n

In [13]:
from utils import *
model_size(model)

Corrected Total Parameters: 254,110,720


In [14]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear(in_features=1024, out_features=2048, bias=True)
          (c_proj): Linear(in_features=2048, out_features=1024, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=502

In [15]:
layer = model.transformer.h[0].mlp.c_fc
idx = torch.tensor([0, 1]).to('cuda')
dim = 1
num_neurons = idx.size(0)
if dim == 0:
    new_layer = nn.Linear(num_neurons, layer.out_features, bias=layer.bias is not None).to(device)
    new_layer.weight.data = layer.weight.data[:, idx]
    if layer.bias is not None:
        new_layer.bias.data = layer.bias.data[idx]

elif dim == 1:
    new_layer = nn.Linear(layer.nx, num_neurons, bias=layer.bias is not None).to(device)

AttributeError: 'Linear' object has no attribute 'nx'

In [None]:
new_layer.weight.data = layer.weight.data[idx, :]

# else:
#     raise ValueError("Invalid dimension")

In [None]:
if layer.bias is not None:
    new_layer.bias.data = layer.bias.data[idx]

In [None]:
new_layer.weight.data.shape, new_layer.bias.data.shape

(torch.Size([2, 4096]), torch.Size([2]))

In [None]:
layer.weight.data.shape, layer.bias.data.shape

(torch.Size([1024, 4096]), torch.Size([4096]))