# Install packages and libraries

In [2]:
!pip install autoawq transformers

Collecting autoawq
  Downloading autoawq-0.1.0-cp310-cp310-manylinux2014_x86_64.whl (17.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m98.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers>=0.12.1 (from autoawq)
  Downloading tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m106.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate (from autoawq)
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece (from autoawq)
  Downloading sentencepiece-0.1.99-cp310-cp

# Quantize model

In [3]:
import torch
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Source checkpoint and the directory the quantized copy is written to
model_path = 'facebook/opt-125m'
quant_path = 'opt-125m-awq'

# Quantization settings handed to `model.quantize` further down
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
}

# Tokenizer first, then the model in float16
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

load_options = dict(
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = AutoAWQForCausalLM.from_pretrained(model_path, **load_options)

Downloading (…)okenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

Downloading (…)c6eeba5dd6/README.md:   0%|          | 0.00/7.10k [00:00<?, ?B/s]

Downloading (…)6eeba5dd6/LICENSE.md:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

Downloading (…)a5dd6/.gitattributes:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

In [6]:
# Quantize the model loaded above using `quant_config`. `tokenizer` is the one
# created next to `model` in the load cell, so it is not loaded a second time
# here.
model.quantize(
    tokenizer,
    quant_config=quant_config,
    calib_data="pileval",
)

# Write the quantized model and the tokenizer files into the same directory
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

Downloading readme:   0%|          | 0.00/167 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating validation split: 0 examples [00:00, ? examples/s]

AWQ: 100%|██████████| 12/12 [02:11<00:00, 10.97s/it]


('opt-125m-awq/tokenizer_config.json',
 'opt-125m-awq/special_tokens_map.json',
 'opt-125m-awq/vocab.json',
 'opt-125m-awq/merges.txt',
 'opt-125m-awq/added_tokens.json',
 'opt-125m-awq/tokenizer.json')

# Inference

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer

# Directory the quantized model and tokenizer are loaded from
quant_path = 'opt-125m-awq'

# Tokenizer, plus the streamer that is handed to `generate` below
tokenizer = AutoTokenizer.from_pretrained(
    quant_path,
    trust_remote_code=True,
)
streamer = TextStreamer(
    tokenizer,
    skip_special_tokens=True,
)

# Quantized model, loaded with `fuse_layers` switched on
model = AutoAWQForCausalLM.from_quantized(
    quant_path,
    fuse_layers=True,
)

In [None]:
# Convert prompt to tokens
# Chat-style prompt; `{prompt}` is the placeholder filled in via `str.format`.
prompt_template = (
    "A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions."
    "\n\n"
    "USER: {prompt}\n"
    "ASSISTANT:"
)

# Fill the template with the user message and encode it as PyTorch tensors
filled_prompt = prompt_template.format(prompt="How are you today?")
encoded = tokenizer(filled_prompt, return_tensors='pt')

# Move the input ids onto the CUDA device
tokens = encoded.input_ids.cuda()

# Generate at most 512 new tokens, passing `streamer` through to `generate`
generation_output = model.generate(
    tokens,
    streamer=streamer,
    max_new_tokens=512,
)