In [1]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Hub id of the model to quantize, and the relative directory for the result.
model_name_or_path = "facebook/opt-6.7b"
quant_model_dir = "models/opt-6.7b-awq"

# Settings handed to model.quantize(); the same values are mirrored into the
# transformers AwqConfig in a later cell.
quant_config = dict(
    zero_point=True,
    q_group_size=128,
    w_bit=4,
    version="GEMM",
)

In [2]:
# Load the pretrained model through AutoAWQ, then the matching tokenizer.
model = AutoAWQForCausalLM.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)

Request 407f21ca-61a2-41f6-bc1f-f8e242fc0d1b: HEAD https://hf-mirror.com/facebook/opt-6.7b/resolve/main/config.json (authenticated: False)
Request 2206588b-3fea-4033-a780-b24641ad0580: GET https://hf-mirror.com/api/models/facebook/opt-6.7b/revision/main (authenticated: False)


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Request f5083f65-c668-4fc9-b3ab-8d72f64d0f70: HEAD https://hf-mirror.com/facebook/opt-6.7b/resolve/main/tokenizer_config.json (authenticated: False)


In [3]:
# Quantize the model using the settings in quant_config; the tokenizer is
# passed along as the first argument.
# NOTE(review): the cell's log shows requests for the
# mit-han-lab/pile-val-backup dataset -- presumably calibration data; confirm.
model.quantize(tokenizer, quant_config=quant_config)

Request 9327239f-a50e-4da8-afa1-7efa2356c385: GET https://hf-mirror.com/api/datasets/mit-han-lab/pile-val-backup (authenticated: False)
Request 39a13ae1-de79-4264-aa79-19fbfe01e9b4: GET https://hf-mirror.com/api/datasets/mit-han-lab/pile-val-backup (authenticated: False)
Repo card metadata block was not found. Setting CardData to empty.
Request 33a614f5-6bbd-4be0-a724-480254c05804: GET https://hf-mirror.com/api/datasets/mit-han-lab/pile-val-backup/revision/2f5e46ae6a69cf0dce4b12f78241c408936ca0e4 (authenticated: False)
Request 98306d54-e848-45f3-8f97-cbb61546dbd5: POST https://hf-mirror.com/api/datasets/mit-han-lab/pile-val-backup/paths-info/2f5e46ae6a69cf0dce4b12f78241c408936ca0e4 (authenticated: False)
Request aa68acda-3cb5-440d-88b9-59173ead4d2d: GET https://hf-mirror.com/api/datasets/mit-han-lab/pile-val-backup/tree/2f5e46ae6a69cf0dce4b12f78241c408936ca0e4/data?recursive=False&expand=False (authenticated: False)
Request ec906b79-af9e-49e8-93a9-12863d2a45a3: GET https://hf-mirror.co

In [4]:
# Echo the quantization settings; as the cell's last expression the dict is displayed.
quant_config

{'zero_point': True, 'q_group_size': 128, 'w_bit': 4, 'version': 'GEMM'}

In [5]:
from transformers import AwqConfig, AutoConfig

# Re-express the AutoAWQ settings as a transformers AwqConfig so the saved
# config is compatible with the transformers integration.
awq_config = AwqConfig(
    bits=quant_config["w_bit"],
    group_size=quant_config["q_group_size"],
    zero_point=quant_config["zero_point"],
    version=quant_config["version"].lower(),
)

# The pretrained transformers model is stored on the wrapper's `model`
# attribute, and its config needs to be given a plain dict.
quantization_config = awq_config.to_dict()
model.model.config.quantization_config = quantization_config

In [6]:
import os
from datetime import datetime

# Capture the current local date and time; it names this export's directory.
current_dateTime = datetime.now()

# Compact yyyyMMddHHmmss stamp, e.g. 20240328185853.
formatted_dateTime = current_dateTime.strftime("%Y%m%d%H%M%S")

# Root directory for saved models; a missing variable raises KeyError here.
model_save_path = os.environ["MODEL_SAVE_PATH"]

# Build the target with os.path.join so the path is correct whether or not
# MODEL_SAVE_PATH ends with a separator (plain string concatenation fused the
# root with "models" when it did not).
# NOTE: this rebinds quant_model_dir, which the later reload cell reads.
# Re-running this cell on its own builds on the previous run's path instead of
# the value from the config cell, so re-run the config cell first.
quant_model_dir = os.path.join(
    model_save_path, "models", quant_model_dir, formatted_dateTime
)

# Save the quantized model weights.
model.save_quantized(quant_model_dir)
# Save the tokenizer into the same directory.
tokenizer.save_pretrained(quant_model_dir)
print(f"模型已保存:{quant_model_dir}")

模型已保存:/root/autodl-tmp/models/models/opt-6.7b-awq/20240328185853


In [7]:
# Call eval() on the quantized model; as the cell's last expression, the
# returned object's repr (the module tree) is displayed.
model.eval()

OptAWQForCausalLM(
  (model): OPTForCausalLM(
    (model): OPTModel(
      (decoder): OPTDecoder(
        (embed_tokens): Embedding(50272, 4096, padding_idx=1)
        (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
        (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (layers): ModuleList(
          (0-31): 32 x OPTDecoderLayer(
            (self_attn): OPTAttention(
              (k_proj): WQLinear_GEMM(in_features=4096, out_features=4096, bias=True, w_bit=4, group_size=128)
              (v_proj): WQLinear_GEMM(in_features=4096, out_features=4096, bias=True, w_bit=4, group_size=128)
              (q_proj): WQLinear_GEMM(in_features=4096, out_features=4096, bias=True, w_bit=4, group_size=128)
              (out_proj): WQLinear_GEMM(in_features=4096, out_features=4096, bias=True, w_bit=4, group_size=128)
            )
            (activation_fn): ReLU()
            (self_attn_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affin

In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Reload the saved artifacts through plain transformers. From here on `model`
# refers to the transformers model rather than the AutoAWQ wrapper.
tokenizer = AutoTokenizer.from_pretrained(quant_model_dir)
model = AutoModelForCausalLM.from_pretrained(
    quant_model_dir,
    device_map="cuda",
)
model = model.to(0)

In [9]:
def generate_text(text, max_new_tokens=64):
    """Generate text from a prompt with the notebook-level ``model``.

    Relies on the module-level ``tokenizer`` and ``model`` bound by the
    loading cell.

    Args:
        text: Prompt string to encode and feed to ``model.generate``.
        max_new_tokens: Value forwarded to ``model.generate``; defaults to 64,
            the limit that was previously hard-coded.

    Returns:
        The first sequence returned by ``model.generate``, decoded with
        special tokens skipped.
    """
    # Move the encoded prompt to wherever the model lives instead of assuming
    # device index 0.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(out[0], skip_special_tokens=True)

In [10]:
# First sample prompt: generate from it and print the decoded text.
prompt = "Merry Christmas! I'm glad to"
result = generate_text(prompt)
print(result)

Merry Christmas! I'm glad to hear you're still enjoying and learning about the game, it's a great way to spend your time :)   It's going to be a bit of a wait before the next chapter, but you can try some of the new factions and events! /u/R2_D2 /u/woutme


In [11]:
# Second sample prompt: generate from it and print the decoded text.
prompt = "The woman worked as a"
result = generate_text(prompt)
print(result)

The woman worked as a nurse at a hospital between 1994 and 2006, when she was arrested for a sexual assault on a patient.

The man, in the case filed Thursday, worked as a nurse at the hospital from 1989 to 1992.

The police said he had had sexual relationships with about nine other patients at the hospital.

