### Example Loading Model From HF Hub As Marlin

In [2]:
from auto_gptq import AutoGPTQForCausalLM

# Hugging Face Hub id of the GPTQ checkpoint to load.
model_id = "TheBloke/Llama-2-7B-chat-GPTQ"

# use_marlin=True asks for the Marlin kernel; the log output of this cell shows
# the QuantLinear layers being overridden with Marlin's QuantLinear on load.
model = AutoGPTQForCausalLM.from_quantized(
    model_id,
    device_map="auto",
    use_marlin=True,
)

  from .autonotebook import tqdm as notebook_tqdm
INFO - The layer lm_head is not quantized.
Overriding QuantLinear layers to use Marlin's QuantLinear...: 100%|██████████| 454/454 [00:45<00:00, 10.00it/s]
INFO - Disabling fused attention and mlp injection because Marlin kernel is used
The safetensors archive passed at /home/robertgshaw/.cache/huggingface/assets/autogptq/TheBloke/Llama-2-7B-chat-GPTQ/autogptq_model.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.


In [3]:
from transformers import AutoTokenizer

# Load the tokenizer published under the same `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

text = "In a galaxy far, far away"

# Encode the prompt as PyTorch tensors, then move them to the "cuda" device.
encoded_prompt = tokenizer(text, return_tensors="pt")
inputs = encoded_prompt.to("cuda")

print("--- marlin:")
out = model.generate(**inputs, max_new_tokens=50)

# Decode the first returned sequence, omitting special tokens from the text.
completion = tokenizer.decode(out[0], skip_special_tokens=True)
print(completion)

--- marlin:
In a galaxy far, far away, a group of rebels are on a mission to save the galaxy from the evil Empire.
The rebels are led by a young farm boy named Luke Skywalker, who is still learning the ways of the Force. Alongside


In [None]:
# Directory that receives the serialized model and tokenizer; the post-restart
# cell reloads the model from this same path.
save_dir = "./llama-marlin"
# Save the model loaded with use_marlin=True.
model.save_pretrained(save_dir)
# Save the tokenizer alongside it. This is the cell's last expression, so its
# return value is what Jupyter displays as the cell output.
tokenizer.save_pretrained(save_dir)

#### Restart the runtime, then reload the model from the HF Hub id and from the saved `./llama-marlin` directory

In [1]:
from auto_gptq import AutoGPTQForCausalLM

model_id = "TheBloke/Llama-2-7B-chat-GPTQ"
save_dir = "./llama-marlin"

# Both loads use identical options; only the source they read from differs.
marlin_load_kwargs = {"device_map": "auto", "use_marlin": True}

# 1) Load by Hub id (the log output of this cell references a file under the
#    local Hugging Face cache for this load).
model_reloaded_cached = AutoGPTQForCausalLM.from_quantized(
    model_id, **marlin_load_kwargs
)

# 2) Load from the local directory named by `save_dir`.
model_reloaded_serialized = AutoGPTQForCausalLM.from_quantized(
    save_dir, **marlin_load_kwargs
)

  from .autonotebook import tqdm as notebook_tqdm
INFO - The layer lm_head is not quantized.
Overriding QuantLinear layers to use Marlin's QuantLinear...: 100%|██████████| 454/454 [00:46<00:00,  9.74it/s]
INFO - Disabling fused attention and mlp injection because Marlin kernel is used
The safetensors archive passed at /home/robertgshaw/.cache/huggingface/assets/autogptq/TheBloke/Llama-2-7B-chat-GPTQ/autogptq_model.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.
INFO - The layer lm_head is not quantized.
Overriding QuantLinear layers to use Marlin's QuantLinear...: 100%|██████████| 454/454 [00:46<00:00,  9.66it/s]
INFO - Disabling fused attention and mlp injection because Marlin kernel is used


In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

text = "In a galaxy far, far away"
inputs = tokenizer(text, return_tensors="pt").to("cuda")

# Feed the same prompt to both reloaded models so their completions can be
# compared side by side in the cell output.
reloaded_models = (
    ("--- marlin cached:", model_reloaded_cached),
    ("--- marlin serialized:", model_reloaded_serialized),
)

for banner, reloaded_model in reloaded_models:
    print(banner)
    out = reloaded_model.generate(**inputs, max_new_tokens=50)
    # Decode the first returned sequence, omitting special tokens.
    print(tokenizer.decode(out[0], skip_special_tokens=True))

--- marlin cached:
In a galaxy far, far away, a group of rebels are on a mission to save the galaxy from the evil Empire.
The rebels are led by a young farm boy named Luke Skywalker, who is still learning the ways of the Force. Alongside
--- marlin serialized:
In a galaxy far, far away, a group of rebels are on a mission to save the galaxy from the evil Empire.
The rebels are led by a young farm boy named Luke Skywalker, who is still learning the ways of the Force. Alongside
