In [1]:
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset

# Load the supervised fine-tuning split of UltraChat; a small sample of it is
# used further down as calibration text for quantization.
dataset = load_dataset(
    path="HuggingFaceH4/ultrachat_200k",
    split="train_sft",
)

In [2]:
# Tunable constants for building the calibration set.
MAX_SEQ_LEN = 2048             # token cap applied when the texts are tokenized
NUM_CALIBRATION_SAMPLES = 128  # number of conversations sampled for calibration
SEED = 42                      # fixed so the sampled subset is reproducible

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Shuffle with an explicit seed: an unseeded shuffle picks a different subset on
# every run, so the calibration data (and anything derived from it) would not be
# reproducible after a kernel restart.
ds = dataset.shuffle(seed=SEED).select(range(NUM_CALIBRATION_SAMPLES))


def preprocess(example):
    """Render one chat record as a single string.

    Args:
        example: A dataset row; its "messages" field is passed to the
            tokenizer's chat template.

    Returns:
        dict: ``{"text": ...}`` holding the templated conversation as plain
        text (``tokenize=False``, so no token ids are produced here).
    """
    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}


# Adds a "text" column alongside the existing columns.
ds = ds.map(preprocess)

Map: 100%|██████████| 128/128 [00:00<00:00, 1166.95 examples/s]


In [3]:
# `model_id` and `tokenizer` are already defined by the previous cell, and
# AutoTokenizer returns a fast tokenizer by default, so they are reused here
# instead of instantiating the identical tokenizer a second time.

# Tokenize every calibration text without padding, truncating to MAX_SEQ_LEN
# tokens. The resulting list is what is later passed to `model.quantize`.
examples = [
    tokenizer(example["text"], padding=False, max_length=MAX_SEQ_LEN, truncation=True)
    for example in ds
]

In [4]:
# GPTQ settings used for the AutoGPTQ quantization run below.
quantize_config = BaseQuantizeConfig(
    # Quantize the model to 4 bits.
    bits=4,
    # 128 is the recommended group size.
    group_size=128,
    # Setting this to False can significantly speed up inference, at the cost of
    # slightly worse perplexity.
    desc_act=False,
    # Base name of the object written when save_pretrained is called.
    model_file_base_name="model",
)

In [5]:
# Load the unquantized checkpoint together with the quantization settings;
# device placement is left to device_map="auto".
model = AutoGPTQForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    quantize_config=quantize_config,
    device_map="auto",
)

In [6]:
# Run GPTQ over the model using the tokenized calibration examples
# (progress is logged layer by layer).
model.quantize(examples=examples)

INFO - Start quantizing layer 1/22
INFO - Quantizing self_attn.k_proj in layer 1/22...
INFO - Quantizing self_attn.v_proj in layer 1/22...
INFO - Quantizing self_attn.q_proj in layer 1/22...
INFO - Quantizing self_attn.o_proj in layer 1/22...
INFO - Quantizing mlp.up_proj in layer 1/22...
INFO - Quantizing mlp.gate_proj in layer 1/22...
INFO - Quantizing mlp.down_proj in layer 1/22...
INFO - Start quantizing layer 2/22
INFO - Quantizing self_attn.k_proj in layer 2/22...
INFO - Quantizing self_attn.v_proj in layer 2/22...
INFO - Quantizing self_attn.q_proj in layer 2/22...
INFO - Quantizing self_attn.o_proj in layer 2/22...
INFO - Quantizing mlp.up_proj in layer 2/22...
INFO - Quantizing mlp.gate_proj in layer 2/22...
INFO - Quantizing mlp.down_proj in layer 2/22...
INFO - Start quantizing layer 3/22
INFO - Quantizing self_attn.k_proj in layer 3/22...
INFO - Quantizing self_attn.v_proj in layer 3/22...
INFO - Quantizing self_attn.q_proj in layer 3/22...
INFO - Quantizing self_attn.o_pro

In [19]:
# Inspect the naming fields stored on the quantization config.
# NOTE: a cell only echoes its last expression, so `model_name_or_path` is
# evaluated here but its value is never displayed; only `model_file_base_name`
# appears in the output.
quantize_config.model_name_or_path
quantize_config.model_file_base_name

'gptq_model-4bit-128g'

In [9]:
# Write the quantized model and its tokenizer into the same directory so the
# directory can be loaded on its own later.
save_dir = "./tinyllama-gptq"

model.save_pretrained(save_dir=save_dir)
# Kept as the last expression so the list of written tokenizer files is shown.
tokenizer.save_pretrained(save_directory=save_dir)



('./tinyllama-gptq/tokenizer_config.json',
 './tinyllama-gptq/special_tokens_map.json',
 './tinyllama-gptq/tokenizer.model',
 './tinyllama-gptq/added_tokens.json',
 './tinyllama-gptq/tokenizer.json')

In [12]:
# Reload the saved GPTQ checkpoint with the Marlin kernel enabled
# (the weights are repacked for Marlin during loading, as the log shows).
marlin_model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path=save_dir,
    device_map="auto",
    use_marlin=True,
)

INFO - The layer lm_head is not quantized.


Repacking weights to be compatible with Marlin kernel...: 100%|██████████| 314/314 [01:16<00:00,  4.10it/s]
INFO - Disabling fused attention and mlp injection because Marlin kernel is used
The safetensors archive passed at ./tinyllama-gptq/autogptq_model.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.


In [13]:
# Reload the same saved checkpoint with the Marlin kernel disabled, giving a
# second copy of the model loaded without Marlin.
gptq_model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path=save_dir,
    device_map="auto",
    use_marlin=False,
)

INFO - The layer lm_head is not quantized.
Skipping module injection for FusedLlamaMLPForQuantizedModel as currently not supported with use_triton=False.


In [17]:
# NOTE: `model_id` must still name the TinyLlama checkpoint here, because
# `marlin_model` was loaded from the TinyLlama save directory.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse EOS as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id

text = "In a galaxy far, far away"
inputs = tokenizer(text, return_tensors="pt").to("cuda")

print("--- marlin:")
# Pass the pad token to generate() explicitly: setting it on the tokenizer alone
# is not seen by generate(), which otherwise warns and falls back to EOS itself.
out = marlin_model.generate(
    **inputs,
    max_new_tokens=20,
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.decode(out[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- marlin:
In a galaxy far, far away, the Jedi Knights were a group of highly trained and skilled warriors who fought against


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

# Second experiment: quantize OPT-125m through transformers' GPTQ integration.
model_id = "facebook/opt-125m"

# 4-bit weights, group size 128, "wikitext2" passed as the `dataset` argument,
# and desc_act turned off.
quantization_config = GPTQConfig(bits=4, group_size=128, dataset="wikitext2", desc_act=False)

In [3]:
# Tokenizer for the checkpoint named by `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Supplying the GPTQ config makes from_pretrained quantize the model as part of
# loading (see the "Quantizing ... blocks" progress bar in the output).
quant_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
    quantization_config=quantization_config,
)

Quantizing model.decoder.layers blocks : 100%|██████████| 12/12 [01:49<00:00,  9.17s/it]


In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

text = "In a galaxy far, far away"
# Move the encoded prompt to GPU 0.
inputs = tokenizer(text, return_tensors="pt").to(0)

# Generate with the OPT model quantized in the previous cell. The earlier
# version called `marlin_model`, which belongs to the TinyLlama experiment (and
# does not exist on a fresh kernel), so it did not match this tokenizer.
print("--- gptq (transformers)")
out = quant_model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(out[0], skip_special_tokens=True))

In a galaxy far, far away, there is no way to stop the spread of the virus.

The virus is spreading,


In [6]:
# Persist the quantized OPT model and its tokenizer side by side.
quant_model.save_pretrained(save_directory="./opt-quantized")
tokenizer.save_pretrained(save_directory="./opt-quantized")

# Also dump the GPTQ settings into the same directory as quantize_config.json.
quantization_config.to_json_file(json_file_path="./opt-quantized/quantize_config.json")

#### Restart The Runtime

In [11]:
from transformers import AutoModelForCausalLM
# Load the already-quantized checkpoint back from disk.
model_id = "./opt-quantized"
quant_model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto')

# Show which buffers the first decoder layer's k_proj carries after reloading.
first_k_proj = quant_model.model.decoder.layers[0].self_attn.k_proj
first_k_proj._buffers.keys()

odict_keys(['qweight', 'qzeros', 'scales', 'g_idx', 'bias'])

In [1]:
from auto_gptq import AutoGPTQForCausalLM
# Directory that the transformers-quantized OPT model was saved to.
model_id = "./opt-quantized"

# Try loading that checkpoint through AutoGPTQ with the Marlin kernel.
# NOTE(review): the recorded run of this cell ends in an OSError saying that
# autogptq_model.safetensors does not contain valid metadata, so this load
# path is currently failing for checkpoints saved this way.
marlin_model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path=model_id,
    use_marlin=True,
)

INFO - The layer lm_head is not quantized.
Repacking weights to be compatible with Marlin kernel...: 100%|██████████| 140/140 [00:18<00:00,  7.52it/s]
INFO - Disabling fused attention and mlp injection because Marlin kernel is used


OSError: The safetensors archive passed at ./opt-quantized/autogptq_model.safetensors does not contain the valid metadata. Make sure you save your model with the `save_pretrained` method.