In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
device = 'cuda'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "openai-community/gpt2-medium"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Force float16 instead of BF16
    device_map="auto"           # Auto-detect the best device
)

In [3]:
model.transformer.h[0].attn.num_heads

16

In [4]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=4096, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [5]:
sd_hf = model.state_dict()
for k, v in sd_hf.items():
  print(k, v.shape)

transformer.wte.weight torch.Size([50257, 1024])
transformer.wpe.weight torch.Size([1024, 1024])
transformer.h.0.ln_1.weight torch.Size([1024])
transformer.h.0.ln_1.bias torch.Size([1024])
transformer.h.0.attn.c_attn.weight torch.Size([1024, 3072])
transformer.h.0.attn.c_attn.bias torch.Size([3072])
transformer.h.0.attn.c_proj.weight torch.Size([1024, 1024])
transformer.h.0.attn.c_proj.bias torch.Size([1024])
transformer.h.0.ln_2.weight torch.Size([1024])
transformer.h.0.ln_2.bias torch.Size([1024])
transformer.h.0.mlp.c_fc.weight torch.Size([1024, 4096])
transformer.h.0.mlp.c_fc.bias torch.Size([4096])
transformer.h.0.mlp.c_proj.weight torch.Size([4096, 1024])
transformer.h.0.mlp.c_proj.bias torch.Size([1024])
transformer.h.1.ln_1.weight torch.Size([1024])
transformer.h.1.ln_1.bias torch.Size([1024])
transformer.h.1.attn.c_attn.weight torch.Size([1024, 3072])
transformer.h.1.attn.c_attn.bias torch.Size([3072])
transformer.h.1.attn.c_proj.weight torch.Size([1024, 1024])
transformer.h.1

In [6]:
from hooks import *
remove_all_forward_hooks(model)
register_all_forward_hooks(model)

batch_size = 16
total_samples = 1024
num_batches = total_samples // batch_size

prompt = "The future of AI is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    for _ in range(num_batches):
        outputs = model(**inputs)
compute_importance_scores(model)

In [7]:
# Print importance scores for each registered module
for name, module in model.named_modules():
    if hasattr(module, "importance_scores"):
        print(f"Layer {module.__class__.__name__}: P{name} : importance scores:", module.importance_scores.shape)

Layer LayerNorm: Ptransformer.h.0.ln_1 : importance scores: torch.Size([1024])
Layer Conv1D: Ptransformer.h.0.attn.c_proj : importance scores: torch.Size([16])
Layer LayerNorm: Ptransformer.h.0.ln_2 : importance scores: torch.Size([1024])
Layer Conv1D: Ptransformer.h.0.mlp.c_fc : importance scores: torch.Size([4096])
Layer LayerNorm: Ptransformer.h.1.ln_1 : importance scores: torch.Size([1024])
Layer Conv1D: Ptransformer.h.1.attn.c_proj : importance scores: torch.Size([16])
Layer LayerNorm: Ptransformer.h.1.ln_2 : importance scores: torch.Size([1024])
Layer Conv1D: Ptransformer.h.1.mlp.c_fc : importance scores: torch.Size([4096])
Layer LayerNorm: Ptransformer.h.2.ln_1 : importance scores: torch.Size([1024])
Layer Conv1D: Ptransformer.h.2.attn.c_proj : importance scores: torch.Size([16])
Layer LayerNorm: Ptransformer.h.2.ln_2 : importance scores: torch.Size([1024])
Layer Conv1D: Ptransformer.h.2.mlp.c_fc : importance scores: torch.Size([4096])
Layer LayerNorm: Ptransformer.h.3.ln_1 : im

In [8]:
# i = 0
# for module in model.modules():
#     if hasattr(module, "importance_buffer") and module.importance_buffer:
#         # print(module.importance_buffer)
#         print(module.__class__.__name__)
#         print(len(module.importance_buffer))
#         print(module.importance_buffer[0].shape)
#         print('============')
        
#         print([module.importance_buffer[i].shape for i in range(len(module.importance_buffer))])
#         print('============')
#         i += 1
#         if i>10:
#             break
#         # all_outputs = torch.cat(module.importance_buffer, dim=0)  # Concatenate over batch dimension

#         # # Compute norm-based importance
#         # importance = all_outputs.norm(p=2, dim=0).mean(dim=0)

#         # module.importance_scores = importance
#         # del module.importance_buffer

In [None]:
import pruning 
import pruning_utils
import importlib
importlib.reload(pruning_utils) 
importlib.reload(pruning) 


<module 'pruning' from '/users/eleves-a/2021/ahmad.rammal/Desktop/pruned_GPT/pruning.py'>

In [10]:
pruning.prune_mlp(model, 2048)
pruning.prune_heads(model, 12)
pruning.prune_embeddings(model, 768)

In [None]:
from pruning_utils import *
model_size(model)

Corrected Total Parameters: 171,720,960


In [12]:
# Sample prompt
prompt = "The little girl was"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
remove_all_forward_hooks(model)

# Generate output text
with torch.no_grad():
    generated_ids = model.generate(
        inputs.input_ids,
        max_length=100,  # Adjust length as needed
        temperature=0.7,  # Adjust for diversity (lower = more deterministic)
        top_k=50,  # Consider top-k sampling
        top_p=0.95,  # Nucleus sampling
        do_sample=True  # Enables sampling instead of greedy decoding
    )

# Decode generated text
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Generated text:", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated text: The little girl was Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope Telescope


In [13]:
model.transformer.h[0].attn.num_heads

12

In [14]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=2048, nx=768)
          (c_proj): Conv1D(nf=768, nx=2048)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [13]:
from datasets import load_dataset

dataset = load_dataset("stas/openwebtext-10k", trust_remote_code=True)

# # Check the size
# print(dataset)

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 10000
    })
})

In [3]:
dataset['train'][0]

{'text': "A magazine supplement with an image of Adolf Hitler and the title 'The Unreadable Book' is pictured in Berlin. No law bans “Mein Kampf” in Germany, but the government of Bavaria, holds the copyright and guards it ferociously. (Thomas Peter/REUTERS)\n\nThe city that was the center of Adolf Hitler’s empire is littered with reminders of the Nazi past, from the bullet holes that pit the fronts of many buildings to the hulking Luftwaffe headquarters that now house the Finance Ministry.\n\nWhat it doesn’t have, nor has it since 1945, are copies of Hitler’s autobiography and political manifesto, “Mein Kampf,” in its bookstores. The latest attempt to publish excerpts fizzled this week after the Bavarian government challenged it in court, although an expurgated copy appeared at newspaper kiosks around the country.\n\nBut in Germany — where keeping a tight lid on Hitler’s writings has become a rich tradition in itself — attitudes toward his book are slowly changing, and fewer people ar

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Tokenization function with truncation
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

In [None]:
num_tokens = sum(len(tokenized_datasets['train'][i]['input_ids']) for i in range(10000))
# for i in range(len(tokenized_datasets)):
#     if len(tokenized_datasets['train']['input_ids'][i])>1024:
#         print(i)
num_tokens

In [1]:
from transformers import GPT2TokenizerFast
import torch
from torch.utils.data import DataLoader, Dataset

class GPT2ChunkedDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=1024):
        self.tokenizer = tokenizer
        
        # 1. Concatenate all texts into one long string
        full_text = " ".join(texts)
        
        # 2. Tokenize the entire text corpus
        tokenized = tokenizer(full_text, return_tensors="pt")["input_ids"].squeeze(0)
        
        # 3. Split into fixed-length chunks of `max_length`
        self.chunks = [tokenized[i:i + max_length] for i in range(0, len(tokenized), max_length)]
        
        # 4. Drop the last chunk if it's too short (optional)
        if len(self.chunks[-1]) < max_length:
            self.chunks.pop()

    def __len__(self):
        return len(self.chunks)

    def __getitem__(self, idx):
        return {"input_ids": self.chunks[idx]}

# Load GPT-2 tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset

dataset = load_dataset("stas/openwebtext-10k", trust_remote_code=True)

# Example calibration text dataset
# calib_texts = [
#     "This is the first calibration text. It is meant to help with quantization.",
#     "Another example of calibration data. It should be long enough to form multiple chunks.",
#     "We concatenate all text, tokenize it, and split it into 1024-token chunks for GPT-2."
# ]

# Create dataset and DataLoader
calib_dataset = GPT2ChunkedDataset(dataset['train']['text'], tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (11241954 > 1024). Running this sequence through the model will result in indexing errors


In [9]:
tokenizer.padding_side

'right'

In [6]:
len(calib_dataset)

10978

In [9]:
calib_dataloader = DataLoader(calib_dataset, batch_size=8, shuffle=False)

In [10]:
len(calib_dataset)

5

In [12]:
calib_dataset[0]

{'input_ids': tensor([ 1212,   318,   262,   717, 36537,  2420,    13,   632,   318,  4001])}