In [3]:
import transformers
import torch

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

In [5]:
print(model.modules)

<bound method Module.modules of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)>


In [None]:
from transformers.pytorch_utils import Conv1D
from dataclasses import dataclass
import torch.nn as nn
from transformers import AutoConfig
import torch.autograd
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention

@dataclass
class quantConfig():
    W_bit: int = 8  # 4-bit is too aggressive for PTQ, use 8-bit
    A_bit: int = 16
    KV_bit: int = 14
    A_layerwise: bool = False
    W_layerwise: bool = True
    A_quant_method: str = "symmetric"
    layerwise: bool = False
    gradclip: tuple = None  # None means no gradient clipping during training

class SymQuantization(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, num_bits, layerwise: bool = False, clip_val: tuple = None):
        ctx.save_for_backward(input)
        ctx.clip_val = clip_val 
        
        if layerwise:
            max_val = torch.max(torch.abs(input))
        else:
            if input.ndimension() <= 3:
                # weight & hidden layer
                max_val = (
                    torch.max(torch.abs(input), dim=-1, keepdim=True)[0]
                    .expand_as(input)
                    .detach()
                )
            elif input.ndimension() == 4:
                # TODO: attention score matrix, calculate alpha / beta per head
                tmp = input.view(input.shape[0], input.shape[1], -1)
                max_val = (
                    torch.max(torch.abs(tmp), dim=-1, keepdim=True)[0]
                    .unsqueeze(-1)
                    .expand_as(input)
                    .detach()
                )
            else:
                raise ValueError

        # quantize and dequantize the input
        # we add a small epsilon to avoid division by zero
        alpha = max_val / ((2**(num_bits - 1) - 1) + 1e-6)
        X_q = torch.round(input / alpha) * alpha

        return X_q

    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        clip_val = ctx.clip_val

        # clips the output (effectively STE)
        grad_input = grad_output.clone()
        if clip_val is not None:
            grad_input[input > clip_val[1]] = 0
            grad_input[input < clip_val[0]] = 0
        return grad_input, None, None, None


class QuantLinear(nn.Module):
    def __init__(self, layer, quant_config: quantConfig = quantConfig()):
        super().__init__()
        self.weight = nn.Parameter(layer.weight.data.clone())
        self.bias = nn.Parameter(layer.bias.data.clone()) if layer.bias is not None else None
        self.quantConfig = quant_config
        self.quantFunc = SymQuantization.apply
        self.is_conv1d = isinstance(layer, Conv1D)

    def forward(self, x):

        weight = self.weight
        if self.quantConfig.W_bit and self.quantConfig.W_bit < 32:
            weight = self.quantFunc(self.weight, self.quantConfig.W_bit, 
                                       self.quantConfig.W_layerwise, self.quantConfig.gradclip)

        act = x
        if self.quantConfig.A_bit and self.quantConfig.A_bit < 32:
            act = self.quantFunc(x, self.quantConfig.A_bit, 
                                    self.quantConfig.A_layerwise, self.quantConfig.gradclip)

        if self.is_conv1d:
            out = act @ weight
            if self.bias is not None:
                out = out + self.bias
            return out
        else:
            return nn.functional.linear(act, weight, self.bias)


def quantize_model(model, quant_config=quantConfig(), skip_layers=None):
    """
    Quantize model layers.
    
    Args:
        skip_layers: list of layer names to skip (e.g., ['lm_head'] to preserve output quality)
    """
    if skip_layers is None:
        skip_layers = ['lm_head']  # Don't quantize the output projection by default
    
    for name, module in model.named_modules():
        if isinstance(module, GPT2Attention):
            print((module.c_attn.weight.shape))
    
    replacements = []
    for name, module in model.named_modules():
        for child_name, child in module.named_children():
            # Skip layers that shouldn't be quantized
            full_name = f"{name}.{child_name}" if name else child_name
            if any(skip in full_name for skip in skip_layers):
                print(f"Skipping quantization for: {full_name}")
                continue
            if isinstance(child, (Conv1D, nn.Linear)) and not isinstance(child, QuantLinear):
                replacements.append((module, child_name, child))
    
    for parent, child_name, child in replacements:
        setattr(parent, child_name, QuantLinear(child, quant_config))

    return model

model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", attn_implementation="sdpa")
config = AutoConfig.from_pretrained("openai-community/gpt2")
print("num heads", config.num_attention_heads)
model = quantize_model(model)

tokenizer.pad_token = tokenizer.eos_token
model_inputs = tokenizer(["The secret to baking a good cake is ", "What is the meaning of life? "], return_tensors="pt", padding=True).to(model.device)
print(model_inputs.input_ids.shape)
generated_ids = model.generate(**model_inputs, max_length=30, do_sample = False)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[1]

loading configuration file config.json from cache at /Users/pranavponnusamy/.cache/huggingface/hub/models--openai-community--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transform

num heads 12
torch.Size([768, 2304])
torch.Size([768, 2304])
torch.Size([768, 2304])
torch.Size([768, 2304])
torch.Size([768, 2304])
torch.Size([768, 2304])
torch.Size([768, 2304])
torch.Size([768, 2304])
torch.Size([768, 2304])
torch.Size([768, 2304])
torch.Size([768, 2304])
torch.Size([768, 2304])
torch.Size([2, 9])


'What is the meaning of life? ,,,,,.............,,,'