In [None]:
!pip install auto-gptq torch

In [1]:
# Select model.
#   Identifier of the checkpoint to quantize. The tokenizer is loaded from the
#   same ID, and the output directory name is derived from its last path component.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

In [2]:
# Select dataset for calibration.
# Choosing something as close as possible to your target use case is best.
# Here, we selected a generic chat dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"

# Maximum number of tokens kept per calibration sample (longer samples are truncated).
# Best practice is to use 4096, but 2048 can be good enough.
MAX_SEQ_LEN = 4096

# Number of calibration samples selected from the shuffled dataset.
# 1024 samples are usually good enough; moving to 2048 can sometimes help.
NUM_SAMPLES = 2048

In [3]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Pre-process your dataset.
# It's a good idea to use the chat template.
def preprocess(example):
    """Render one chat sample as a single string.

    Applies the chat template of the notebook-level ``tokenizer`` to
    ``example["messages"]`` with ``tokenize=False``, so the result is text
    rather than token ids.

    ``tokenizer`` is looked up when the function is called, so it must be
    defined before ``preprocess`` is first run.

    Returns:
        A dict with a single ``"text"`` key holding the rendered conversation.
    """
    rendered = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": rendered}

# Fixed seed so that the same calibration subset is drawn on every
# "Restart & Run All"; an unseeded shuffle picks different samples each run.
SHUFFLE_SEED = 42

dataset = load_dataset(DATASET_ID, split="train_sft")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select NUM_SAMPLES conversations from the shuffled dataset and render each
# one into a "text" column using the chat template.
ds = dataset.shuffle(seed=SHUFFLE_SEED).select(range(NUM_SAMPLES))
ds = ds.map(preprocess)

# BE CAREFUL WITH THE TOKENIZER
#   apply_chat_template already adds the bos_token,
#   so we set add_special_tokens to False.
# Samples are left unpadded and truncated to at most MAX_SEQ_LEN tokens.
examples = [
    tokenizer(
        example["text"],
        padding=False, max_length=MAX_SEQ_LEN, truncation=True, add_special_tokens=False
    ) for example in ds
]

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 2038/2038 [00:00<00:00, 4754.15 examples/s]


In [4]:
# Confirm the input tokens look right.
# Note: it's important that the chat template be applied.
# Note: careful that you don't end up with 2 <|begin_of_text|> tokens!
first_ids = examples[0]["input_ids"]

# Turn the "no double BOS" warning into a check instead of relying on reading
# the printout. If the tokenizer defines no BOS token, the count is 0 and this passes.
leading_bos = list(first_ids[:2]).count(tokenizer.bos_token_id)
assert leading_bos <= 1, (
    "Sample starts with two BOS tokens: the chat template already adds one, "
    "so tokenize with add_special_tokens=False."
)

print(tokenizer.decode(first_ids))

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Create a modern drama, written in third-person omniscient point of view, that delves into the emotional complexity of a couple's experience in the aftermath of infidelity. Explore themes such as trust, forgiveness, jealousy, and deceit in a way that captivates the audience and leaves them questioning the boundaries of love and loyalty. The drama should be set in a realistic environment and include dynamic, multi-dimensional characters who evolve throughout the course of the story.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

As soon as the secret was out, nothing was quite the same.

It started with a gut feeling. Alice couldn't shake the sense that something was off between her and her husband, Simon. He worked late and came home exhausted, often falling asleep as soon as his head hit the pillow. But that wasn't what bothered her. It was the distance between them—the way he seemed to be holding something back, even 

In [5]:
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# Setup quantization arguments.
#   The configuration below uses 4-bit weights.
#   NOTE(review): the original notes say both that 8 bits is also supported and
#   that only 4 bit is supported -- confirm which one holds for your setup.
#   With 8 bits, it's best to set group_size=-1 (channelwise) and desc_act=False.
quantize_config = BaseQuantizeConfig(
    # Weight precision in bits.
    bits=4,
    # Group size 128 is typically the best spot for accuracy / performance.
    group_size=128,
    # Activation reordering (act-order) will help accuracy.
    desc_act=True,
    # Per the original note: set to True to reduce memory consumption during
    # quantization at the expense of runtime (TODO confirm against AutoGPTQ docs).
    true_sequential=False,
    # Base name of the model.safetensors file written when we call save_pretrained.
    model_file_base_name="model",
)

# Load model.
model = AutoGPTQForCausalLM.from_pretrained(MODEL_ID, quantize_config, device_map="auto")

CUDA extension not installed.
CUDA extension not installed.
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.86it/s]


In [6]:
# Apply the GPTQ algorithm.
#   Takes the tokenized calibration `examples` as input and logs progress
#   per layer and per projection module while it runs.
#   NOTE(review): the stored output of this cell ends in KeyboardInterrupt;
#   re-run it to completion before saving the model.
model.quantize(examples)

INFO - Start quantizing layer 1/32
INFO - Quantizing self_attn.k_proj in layer 1/32...
INFO - Quantizing self_attn.v_proj in layer 1/32...
INFO - Quantizing self_attn.q_proj in layer 1/32...
INFO - Quantizing self_attn.o_proj in layer 1/32...
INFO - Quantizing mlp.up_proj in layer 1/32...
INFO - Quantizing mlp.gate_proj in layer 1/32...
INFO - Quantizing mlp.down_proj in layer 1/32...
INFO - Start quantizing layer 2/32


KeyboardInterrupt: 

In [10]:
# Name the output directory after the model: the part of MODEL_ID that follows
# the last "/" (or all of MODEL_ID when there is no "/"), plus a "-gptq" suffix.
gptq_save_dir = MODEL_ID.rpartition("/")[2] + "-gptq"
print("Saving gptq model to " + gptq_save_dir)

# Write the model and its tokenizer side by side into the same directory.
# The tokenizer call stays last so its return value is shown as the cell output.
model.save_pretrained(gptq_save_dir)
tokenizer.save_pretrained(gptq_save_dir)



Saving gptq model to Meta-Llama-3-8B-Instruct-gptq


('Meta-Llama-3-8B-Instruct-gptq/tokenizer_config.json',
 'Meta-Llama-3-8B-Instruct-gptq/special_tokens_map.json',
 'Meta-Llama-3-8B-Instruct-gptq/tokenizer.json')