In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.gptq import GPTQQuantizer, load_quantized_model
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)


In [1]:

quantizer = GPTQQuantizer(bits=4, dataset="c4", block_name_to_quantize = "model.decoder.layers", model_seqlen = 2048)
quantized_model = quantizer.quantize_model(model, tokenizer)

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)okenizer_config.json: 100%|██████████| 685/685 [00:00<00:00, 1.77MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 651/651 [00:00<00:00, 1.67MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 8.91MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 60.2MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 441/441 [00:00<00:00, 1.03MB/s]
Downloading pytorch_model.bin: 100%|██████████| 251M/251M [00:11<00:00, 21.5MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 137/137 [00:00<00:00, 118kB/s]
Downloading readme: 100%|██████████| 2.38k/2.38k [00:00<00:00, 5.77MB/s]
Repo card metadata block was not found. Setting CardData to empty.
Downloading data: 100%|██████████| 319M/319M [02:20<00:00, 2.27MB/s]
Downloading data files: 100%|██████████| 1/1 [02:20<00:00, 140.92s/it]
Extracting data files: 100%|██████████| 1/1 [00:09<00:00,  9.47s/it]

In [4]:
save_folder = "./gptq_quantized_model"

In [2]:
quantizer.save(quantized_model,save_folder)

In [5]:
# Load Quantized Model
# The model needs to be initialized using empty weights, with weights loaded as a next step.
from accelerate import init_empty_weights
with init_empty_weights():
    empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
empty_model.tie_weights()
quantized_model = load_quantized_model(empty_model, 
                                       save_folder=save_folder, 
                                       device_map="auto",
                                       disable_exllama=False # Uncomment to load quantized model with exllama for faster inference
                                       )

In [8]:
prompt = "Quantized models are "
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

output = quantized_model(**inputs)

In [11]:
output

CausalLMOutputWithPast(loss=None, logits=tensor([[[-6.9492, -6.9336,  9.6875,  ..., -6.9844, -6.8555, -6.8867],
         [-0.2993, -0.3054,  2.1973,  ..., -0.2917, -0.3350, -0.6909],
         [-6.3711, -6.3750, -1.6055,  ..., -6.4258, -6.1914, -6.5391],
         [-8.8047, -8.8125,  1.7188,  ..., -8.8516, -8.7344, -9.0156],
         [-9.3203, -9.3203, -1.2900,  ..., -9.4062, -9.2188, -9.4375],
         [-7.3633, -7.3398, -2.6582,  ..., -7.4180, -7.3555, -7.4336]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[ 1.3691e+00, -5.0488e-01,  1.5781e+00,  ...,  9.2969e-01,
            2.1973e-01,  5.5713e-01],
          [ 1.9678e+00, -2.1179e-02,  6.3477e-03,  ...,  2.6934e+00,
            1.4600e+00,  4.7412e-01],
          [ 6.8604e-01, -2.4292e-01, -1.6479e-03,  ...,  1.3779e+00,
           -7.4170e-01,  9.0820e-01],
          [ 2.2754e+00, -3.9844e-01,  7.5928e-01,  ...,  6.8799e-01,
           -1.2852e+00,  1.0186e+00],
        