In [10]:
%env CUDA_VISIBLE_DEVICES=0
%env TOKENIZERS_PARALLELISM="True"

env: CUDA_VISIBLE_DEVICES=0
env: TOKENIZERS_PARALLELISM="True"


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

# Checkpoint to quantize, and the tokenizer that belongs to it.
model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit GPTQ configuration. The dataset name and the tokenizer are handed to
# the config together (presumably used as calibration data -- confirm in the
# GPTQConfig documentation).
gptq_config = GPTQConfig(
    bits=4,
    dataset="wikitext2",
    tokenizer=tokenizer,
)

In [6]:
# Passing the GPTQ config to from_pretrained quantizes the model while it is
# loaded; the recorded progress bar shows the 12 decoder layer blocks being
# processed (about 1 min 19 s in that run).
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=gptq_config, device_map="auto"
)

Quantizing model.decoder.layers blocks : 100%|██████████| 12/12 [01:19<00:00,  6.65s/it]


In [17]:
# Display the module tree of the quantized model. In the recorded output the
# attention projections and fc1/fc2 show up as QuantLinear, while lm_head is
# still a regular Linear layer.
quantized_model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): QuantLinear()
            (out_proj): QuantLinear()
            (q_proj): QuantLinear()
            (v_proj): QuantLinear()
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): QuantLinear()
          (fc2): QuantLinear()
        )
      )
    )
  )
  (lm_head): Linear(in_features=768, out_features=50272, bias=False)
)

In [21]:
# Inspect the tensors registered as buffers on one quantized projection:
# k_proj in the self-attention of the first decoder layer.
first_k_proj = quantized_model.model.decoder.layers[0].self_attn.k_proj
layer_buffers = first_k_proj._buffers

print(layer_buffers.keys())
for buffer_name, buffer in layer_buffers.items():
    # One header line per buffer, followed by its shape and dtype.
    print(f"--- {buffer_name}")
    print(buffer.shape)
    print(buffer.dtype)
    # Spacer between entries.
    print("\n")

odict_keys(['qweight', 'qzeros', 'scales', 'g_idx', 'bias'])
--- qweight
torch.Size([96, 768])
torch.int32


--- qzeros
torch.Size([6, 96])
torch.int32


--- scales
torch.Size([6, 768])
torch.float16


--- g_idx
torch.Size([768])
torch.int32


--- bias
torch.Size([768])
torch.float16




In [7]:
# Single target directory for both the model and the tokenizer files.
save_dir = "opt-125m-gptq"

# Move the model to CPU before writing it out.
quantized_model.to("cpu")
# save_pretrained() also persists the quantization settings: the saved
# config.json carries a "quantization_config" section.
quantized_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('opt-125m-gptq/tokenizer_config.json',
 'opt-125m-gptq/special_tokens_map.json',
 'opt-125m-gptq/vocab.json',
 'opt-125m-gptq/merges.txt',
 'opt-125m-gptq/added_tokens.json',
 'opt-125m-gptq/tokenizer.json')

In [22]:
# List the files that were written to the save directory.
!ls opt-125m-gptq

config.json		model.safetensors	 tokenizer.json
generation_config.json	special_tokens_map.json  vocab.json
merges.txt		tokenizer_config.json


In [8]:
# Print the saved model config; it includes the "quantization_config" section
# with the GPTQ settings (bits, dataset, group_size, ...).
!cat opt-125m-gptq/config.json

{
  "_name_or_path": "facebook/opt-125m",
  "_remove_final_layer_norm": false,
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "OPTForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "do_layer_norm_before": true,
  "dropout": 0.1,
  "enable_bias": true,
  "eos_token_id": 2,
  "ffn_dim": 3072,
  "hidden_size": 768,
  "init_std": 0.02,
  "layer_norm_elementwise_affine": true,
  "layerdrop": 0.0,
  "max_position_embeddings": 2048,
  "model_type": "opt",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "prefix": "</s>",
  "quantization_config": {
    "batch_size": 1,
    "bits": 4,
    "block_name_to_quantize": null,
    "cache_block_outputs": true,
    "damp_percent": 0.1,
    "dataset": "wikitext2",
    "desc_act": false,
    "exllama_config": {
      "version": 1
    },
    "group_size": 128,
    "max_input_length": null,
    "model_seqlen": null,
    "module_name_preceding_first_block": null,
    "mod

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
