In [1]:
import logging
import os

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer, TextGenerationPipeline, AutoModelForCausalLM, GPTQConfig

## Hugging Face transformers also supports GPTQ quantization directly via GPTQConfig, e.g.:
# model_id = "facebook/opt-125m"

# tokenizer = AutoTokenizer.from_pretrained(model_id)

# quantization_config = GPTQConfig(bits=4, dataset = "c4", tokenizer=tokenizer)

# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=quantization_config)

# Configure root logging at INFO level with timestamped records; the
# auto_gptq loggers report quantization progress at this level.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)

# Model locations.
#   pretrained_model_dir: directory of the un-quantized base checkpoint
#                         (read by the tokenizer and by from_pretrained).
#   quantized_model_dir:  directory the quantized checkpoint is saved to and
#                         later loaded from.
# The defaults are the original machine-specific paths. Set the
# PRETRAINED_MODEL_DIR / QUANTIZED_MODEL_DIR environment variables to run the
# script on another machine without editing it; unset or empty variables fall
# back to the defaults.
pretrained_model_dir = (
    os.environ.get("PRETRAINED_MODEL_DIR")
    or "/mnt/sdc/yuzhao/model/llm/llama/llama2-7b-chat-hf"
)
quantized_model_dir = (
    os.environ.get("QUANTIZED_MODEL_DIR")
    or "/mnt/nas1/models/llama/quantized_models/llama2-7b-chat-hf"
)

# Tokenizer of the base checkpoint (fast implementation requested); it is
# reused further down for the inference prompts.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)

# Calibration data passed to model.quantize(): a list holding one tokenized sentence.
calibration_text = (
    "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
)
examples = [tokenizer(calibration_text)]

# Quantization settings handed to AutoGPTQForCausalLM.from_pretrained.
quantize_config = BaseQuantizeConfig(
    # quantize the model to 4-bit
    bits=4,
    # 128 is the recommended group size
    group_size=128,
    # False can significantly speed up inference, at the cost of slightly worse perplexity
    desc_act=False,
)

# Load the un-quantized model together with the quantization settings.
# By default the model is always loaded into CPU memory.
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)

# Quantize the model using the calibration examples. `examples` must be a list
# of dicts whose keys can only be "input_ids" and "attention_mask".
model.quantize(examples)

# Save the quantized model using the library's default checkpoint format.
model.save_quantized(quantized_model_dir)

# Save the quantized model again, explicitly as safetensors, into the same directory.
# NOTE(review): the checkpoint is written twice to one directory; the format of
# the first write depends on the installed auto_gptq default - confirm both
# saves are really needed.
model.save_quantized(quantized_model_dir, use_safetensors=True)

# push quantized model to Hugging Face Hub.
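# To use use_auth_token=True, log in first via `huggingface-cli login`,
# or pass an explicit token with: use_auth_token="hf_xxxxxxx"
# NOTE: a Hub repo id has the form "<user>/<name>"; quantized_model_dir is an absolute
# path in this script, so only its last path component is used for the name.
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir.rstrip('/').rsplit('/', 1)[-1]}"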
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)

# alternatively you can save and push at the same time
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir.rstrip('/').rsplit('/', 1)[-1]}"  # last path component only; an absolute path is not a valid repo id
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)

# Load the quantized model onto the first GPU. The directory was saved with
# use_safetensors=True earlier in this script, so request that format
# explicitly rather than relying on the installed auto_gptq version's default.
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", use_safetensors=True)

# download quantized model from Hugging Face Hub and load to the first GPU
# model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)

# Inference with model.generate: tokenize the prompt as PyTorch tensors, move
# them to the model's device, generate, then decode the first returned sequence.
prompt_inputs = tokenizer("auto_gptq is", return_tensors="pt").to(model.device)
generated_ids = model.generate(**prompt_inputs)
print(tokenizer.decode(generated_ids[0]))

# Alternatively, run the same model through a transformers pipeline.
# transformers logs a "The model 'LlamaGPTQForCausalLM' is not supported ..."
# message for the AutoGPTQ wrapper class (it appears in the recorded cell
# output); text is still generated.
pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
print(pipeline("auto-gptq is")[0]["generated_text"])

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [01:12<00:00, 36.36s/it]
2023-11-09 07:21:25 INFO [auto_gptq.modeling._base] Start quantizing layer 1/32
2023-11-09 07:21:27 INFO [auto_gptq.modeling._base] Quantizing self_attn.k_proj in layer 1/32...
2023-11-09 07:21:28 INFO [auto_gptq.quantization.gptq] duration: 1.7066783905029297
2023-11-09 07:21:28 INFO [auto_gptq.quantization.gptq] avg loss: 3.834463119506836
2023-11-09 07:21:28 INFO [auto_gptq.modeling._base] Quantizing self_attn.v_proj in layer 1/32...
2023-11-09 07:21:30 INFO [auto_gptq.quantization.gptq] duration: 1.2426633834838867
2023-11-09 07:21:30 INFO [auto_gptq.quantization.gptq] avg loss: 0.20677444338798523
2023-11-09 07:21:30 INFO [auto_gptq.modeling._base] Quantizing self_attn.q_proj in layer 1/32...
2023-11-09 07:21:31 INFO [auto_gptq.quantization.gptq] duration: 1.235335350036621
2023-11-09 07:21:31 INFO [auto_gptq.quantization.gptq] avg loss: 4.246157646179199
2023

<s> auto_gptq is a Python package for generating text with the GPT-Q model


The model 'LlamaGPTQForCausalLM' is not supported for . Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaFo

auto-gptq is a powerful tool for generating and optimizing GPT-based models,
