In [1]:
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset

dataset = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")

In [4]:
MAX_SEQ_LEN = 512

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)

ds = dataset.shuffle().select(range(32))

def preprocess(example):
    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}

ds = ds.map(preprocess)

Map: 100%|██████████| 32/32 [00:00<00:00, 463.65 examples/s]


In [5]:
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

examples = [
    tokenizer(
        example["text"], padding=False, max_length=MAX_SEQ_LEN, truncation=True,
    ) for example in ds
]

In [6]:
quantize_config = BaseQuantizeConfig(
    bits=4,                         # quantize model to 4-bit
    group_size=128,                 # it is recommended to set the value to 128
    desc_act=False,                 # set to False can significantly speed up inference but the perplexity may slightly bad
    model_file_base_name="model"    # name of the object when we call save_pretrained
)

In [7]:
model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config, device_map="auto")

In [8]:
model.quantize(examples)

INFO - Start quantizing layer 1/22
INFO - Quantizing self_attn.k_proj in layer 1/22...
INFO - Quantizing self_attn.v_proj in layer 1/22...
INFO - Quantizing self_attn.q_proj in layer 1/22...
INFO - Quantizing self_attn.o_proj in layer 1/22...
INFO - Quantizing mlp.up_proj in layer 1/22...
INFO - Quantizing mlp.gate_proj in layer 1/22...
INFO - Quantizing mlp.down_proj in layer 1/22...
INFO - Start quantizing layer 2/22
INFO - Quantizing self_attn.k_proj in layer 2/22...
INFO - Quantizing self_attn.v_proj in layer 2/22...
INFO - Quantizing self_attn.q_proj in layer 2/22...
INFO - Quantizing self_attn.o_proj in layer 2/22...
INFO - Quantizing mlp.up_proj in layer 2/22...
INFO - Quantizing mlp.gate_proj in layer 2/22...
INFO - Quantizing mlp.down_proj in layer 2/22...
INFO - Start quantizing layer 3/22
INFO - Quantizing self_attn.k_proj in layer 3/22...
INFO - Quantizing self_attn.v_proj in layer 3/22...
INFO - Quantizing self_attn.q_proj in layer 3/22...
INFO - Quantizing self_attn.o_pro

In [10]:
save_dir = "./tinyllama-gptq"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)



('./tinyllama-gptq/tokenizer_config.json',
 './tinyllama-gptq/special_tokens_map.json',
 './tinyllama-gptq/tokenizer.model',
 './tinyllama-gptq/added_tokens.json',
 './tinyllama-gptq/tokenizer.json')

In [11]:
marlin_model = AutoGPTQForCausalLM.from_quantized(save_dir, use_marlin=True, device_map="auto")

INFO - The layer lm_head is not quantized.
Repacking weights to be compatible with Marlin kernel...: 100%|██████████| 314/314 [01:24<00:00,  3.71it/s]
INFO - Disabling fused attention and mlp injection because Marlin kernel is used
The safetensors archive passed at ./tinyllama-gptq/autogptq_model.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.


In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id

text = "In a galaxy far, far away"
inputs = tokenizer(text, return_tensors="pt").to("cuda")

print("--- marlin:")
out = marlin_model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(out[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- marlin:
In a galaxy far, far away, the Jedi are a peaceful order of beings who live in harmony with the Force. However, in the face of a powerful and malevolent force, the Jedi are forced to fight for their survival. In this game


In [15]:
marlin_model.save_pretrained("./tinyllama-marlin")
tokenizer.save_pretrained("./tinyllama-marlin")



('./tinyllama-marlin/tokenizer_config.json',
 './tinyllama-marlin/special_tokens_map.json',
 './tinyllama-marlin/tokenizer.model',
 './tinyllama-marlin/added_tokens.json',
 './tinyllama-marlin/tokenizer.json')

In [16]:
marlin_model_reloaded_cache = AutoGPTQForCausalLM.from_quantized("./tinyllama-gptq", use_marlin=True, device_map="auto")
marlin_model_reloaded_serialized = AutoGPTQForCausalLM.from_quantized("./tinyllama-marlin", use_marlin=True, device_map="auto")

INFO - The layer lm_head is not quantized.
Overriding QuantLinear layers to use Marlin's QuantLinear...: 100%|██████████| 314/314 [00:28<00:00, 11.17it/s]
INFO - Disabling fused attention and mlp injection because Marlin kernel is used
The safetensors archive passed at ./tinyllama-gptq/autogptq_model.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.
INFO - The layer lm_head is not quantized.
Overriding QuantLinear layers to use Marlin's QuantLinear...: 100%|██████████| 314/314 [00:26<00:00, 12.07it/s]
INFO - Disabling fused attention and mlp injection because Marlin kernel is used


In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id

text = "In a galaxy far, far away"
inputs = tokenizer(text, return_tensors="pt").to("cuda")

print("--- marlin cached:")
out = marlin_model_reloaded_cache.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(out[0], skip_special_tokens=True))

print("--- marlin serialized:")
out = marlin_model_reloaded_serialized.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(out[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- marlin cached:


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In a galaxy far, far away, the Jedi are a peaceful order of beings who live in harmony with the Force. However, in the face of a powerful and malevolent force, the Jedi are forced to fight for their survival. In this game
--- marlin serialized:
In a galaxy far, far away, the Jedi are a peaceful order of beings who live in harmony with the Force. However, in the face of a powerful and malevolent force, the Jedi are forced to fight for their survival. In this game
