## Quantize the model

In [8]:
!python -m awq.entry --model_path ~/.cache/huggingface/transformers/openchat_3.5  \
    --w_bit 4 --q_group_size 128 \
    --run_awq --dump_awq awq_cache/openchat_3.5-w4-g128.pt

Quantization config: {'zero_point': True, 'q_group_size': 128}
* Building model /PHShome/bg615/.cache/huggingface/transformers/openchat_3.5
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:17<00:00,  8.84s/it]
Repo card metadata block was not found. Setting CardData to empty.
 * Split into 65 blocks
Running AWQ...: 100%|███████████████████████████| 32/32 [17:32<00:00, 32.91s/it]
AWQ results saved at awq_cache/openchat_3.5-w4-g128.pt


In [9]:
!python -m awq.entry --model_path ~/.cache/huggingface/transformers/openchat_3.5 \
    --w_bit 4 --q_group_size 128 \
    --load_awq awq_cache/openchat_3.5-w4-g128.pt \
    --q_backend real --dump_quant quant_cache/openchat_3.5-w4-g128.pt

Quantization config: {'zero_point': True, 'q_group_size': 128}
* Building model /PHShome/bg615/.cache/huggingface/transformers/openchat_3.5
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:20<00:00, 10.02s/it]
Loading pre-computed AWQ results from awq_cache/openchat_3.5-w4-g128.pt
real weight quantization...: 100%|██████████████| 32/32 [04:09<00:00,  7.80s/it]
Saving the quantized model at quant_cache/openchat_3.5-w4-g128.pt...


## Load the quantized model

In [1]:
from lm_eval import evaluator, tasks
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import torch
import argparse
import os
import json
from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model, load_checkpoint_in_model
from accelerate.utils.modeling import get_balanced_memory
from awq.utils.parallel import auto_parallel
from awq.quantize.pre_quant import run_awq, apply_awq
from awq.quantize.quantizer import pseudo_quantize_model_weight, real_quantize_model_weight
from awq.utils.lm_eval_adaptor import LMEvalAdaptor
from awq.utils.utils import simple_dispatch_model
import time

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
q_config = {
    "zero_point": True,  # by default True
    "q_group_size": 128,  # whether to use group quantization
}
max_memory = []

In [4]:
def build_model_and_enc(model_path, quantized_file_path, load_quant = True, w_bit = 4):
    if not os.path.exists(model_path):  # look into ssd
        raise FileNotFoundError(f"{model_path} not found!")
    print(f"* Building model {model_path}")

    # all hf model
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    print(f"Config = {config}")
    if "mpt" in config.__class__.__name__.lower():
        enc = AutoTokenizer.from_pretrained(config.tokenizer_name, trust_remote_code=True)
    else:
        enc = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)

    if load_quant:  # directly load quantized weights
        print("Loading pre-computed quantized weights...")
        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(config=config,
                                                     torch_dtype=torch.float16, trust_remote_code=True)
        model.config.pretraining_tp = 1
        real_quantize_model_weight(
            model, w_bit=w_bit, q_config=q_config, init_only=True)
        
        model.tie_weights()
        
        # Infer device map
        kwargs = {"max_memory": max_memory} if len(max_memory) else {}
        device_map = infer_auto_device_map(
            model,
            no_split_module_classes=[
                "OPTDecoderLayer", "LlamaDecoderLayer", "BloomBlock", "MPTBlock", "DecoderLayer"],
            **kwargs
        )
        # Load checkpoint in the model
        load_checkpoint_in_model(
            model,
            checkpoint= quantized_file_path,
            device_map=device_map,
            offload_state_dict=True,
        )
        # Dispatch model
        model = simple_dispatch_model(model, device_map=device_map)

        model.eval()
    else:  # fp16 to quantized
        args.run_awq &= not args.load_awq  # if load_awq, no need to run awq
        # Init model on CPU:
        kwargs = {"torch_dtype": torch.float16, "low_cpu_mem_usage": True}
        model = AutoModelForCausalLM.from_pretrained(
            model_path, config=config, trust_remote_code=True, **kwargs)

        model.eval()

        if args.run_awq:
            assert args.dump_awq, "Please save the awq results with --dump_awq"
                        
            awq_results = run_awq(
                model, enc,
                w_bit=args.w_bit, q_config=q_config,
                n_samples=128, seqlen=512,
            )
            if args.dump_awq:
                dirpath = os.path.dirname(args.dump_awq)
                os.makedirs(dirpath, exist_ok=True)
                
                torch.save(awq_results, args.dump_awq)
                print("AWQ results saved at", args.dump_awq)
                
            exit(0)
                
        if args.load_awq:
            print("Loading pre-computed AWQ results from", args.load_awq)
            awq_results = torch.load(args.load_awq, map_location="cpu")
            apply_awq(model, awq_results)

        # weight quantization
        if args.w_bit is not None:
            if args.q_backend == "fake":
                assert args.dump_quant is None, \
                    "Need to use real quantization to dump quantized weights"
                pseudo_quantize_model_weight(
                    model, w_bit=args.w_bit, q_config=q_config
                )
            elif args.q_backend == "real":  # real quantization
                real_quantize_model_weight(
                    model, w_bit=args.w_bit, q_config=q_config
                )
                if args.dump_quant:
                    dirpath = os.path.dirname(args.dump_quant)
                    os.makedirs(dirpath, exist_ok=True)
                    
                    print(
                        f"Saving the quantized model at {args.dump_quant}...")
                    torch.save(model.cpu().state_dict(), args.dump_quant)
                    exit(0)
            else:
                raise NotImplementedError
            
        # Move the model to GPU (as much as possible) for LM evaluation
        kwargs = {"max_memory": get_balanced_memory(model, max_memory if len(max_memory) > 0 else None)}
        device_map = infer_auto_device_map(
            model,
            # TODO: can we remove this?
            no_split_module_classes=[
                "OPTDecoderLayer", "LlamaDecoderLayer", "BloomBlock", "MPTBlock", "DecoderLayer"],
            **kwargs
        )
        model = dispatch_model(model, device_map=device_map)

    return model, enc

In [5]:
model_path = "../.cache/huggingface/transformers/openchat_3.5"
quantized_file_path = "quant_cache/openchat_3.5-w4-g128.pt"
model, tokenizer = build_model_and_enc(model_path, quantized_file_path = quantized_file_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


* Building model ../.cache/huggingface/transformers/openchat_3.5
Config = MistralConfig {
  "_name_or_path": "../.cache/huggingface/transformers/openchat_3.5",
  "architectures": [
    "MistralForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.35.0",
  "use_cache": true,
  "vocab_size": 32002
}

Loading pre-computed quantized weights...


real weight quantization...(init only): 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 838.05it/s]


In [6]:
device = torch.device("cuda")
model.to(device)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32002, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): WQLinear(in_features=4096, out_features=4096, bias=False, w_bit=4, group_size=128)
          (k_proj): WQLinear(in_features=4096, out_features=1024, bias=False, w_bit=4, group_size=128)
          (v_proj): WQLinear(in_features=4096, out_features=1024, bias=False, w_bit=4, group_size=128)
          (o_proj): WQLinear(in_features=4096, out_features=4096, bias=False, w_bit=4, group_size=128)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): WQLinear(in_features=4096, out_features=14336, bias=False, w_bit=4, group_size=128)
          (up_proj): WQLinear(in_features=4096, out_features=14336, bias=False, w_bit=4, group_size=128)
          (down_proj): WQLinear(in_features=14336, out_features=4096, bias=False, w_bit=4, group_

## Test the quantized model

In [20]:
def LLM_text(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    generate_ids = model.generate(inputs.input_ids, max_length=1024)
    print(generate_ids)
    question_and_response = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    response = question_and_response.split("ASSISTANT: ")[-1]
    return response

In [21]:
prompt = "What is the difference between Harvard and MIT?"
quant_start_time = time.time()
LLM_text(model, tokenizer, prompt)
quant_end_time = time.time()
quant_run_time = quant_end_time - quant_start_time
print(f"[INFO]: The quantized model finishes running after {quant_run_time} seconds.")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


tensor([[    1,  1824,   349,  ..., 28723,    13,    13]], device='cuda:0')


"What is the difference between Harvard and MIT?\n\nHarvard University and the Massachusetts Institute of Technology (MIT) are two of the most prestigious universities in the United States. While both schools are highly regarded for their academic programs, there are several key differences between the two.\n\n1. Location: Harvard University is located in Cambridge, Massachusetts, while MIT is also located in Cambridge, Massachusetts. Both schools are part of the Cambridge-Boston metropolitan area, which is known for its rich history, culture, and academic institutions.\n\n2. Size: Harvard University has a larger student body than MIT, with approximately 20,000 students enrolled, compared to MIT's approximately 11,000 students. This means that Harvard has a more diverse student population and a wider range of academic programs and extracurricular activities.\n\n3. Academic Focus: Harvard University is a liberal arts college, which means it offers a broad range of academic programs in t

## This is what you will do if you want to load the original, unquantized model on GPU

In [3]:
tokenizer = AutoTokenizer.from_pretrained("/PHShome/bg615/.cache/huggingface/transformers/openchat_3.5")
model = AutoModelForCausalLM.from_pretrained("/PHShome/bg615/.cache/huggingface/transformers/openchat_3.5").half()
device = torch.device("cuda")
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32002, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRM

## Test the unquantized model on GPU

In [5]:
def LLM_text(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    generate_ids = model.generate(inputs.input_ids, max_length=1024)
    print(generate_ids)
    question_and_response = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    response = question_and_response.split("ASSISTANT: ")[-1]
    return response

In [6]:
prompt = "What is the difference between Harvard and MIT?"
original_start_time = time.time()
LLM_text(model, tokenizer, prompt)
original_end_time = time.time()
original_run_time = original_end_time - original_start_time
print(f"[INFO]: The original model finishes running after {original_run_time} seconds.")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


tensor([[    1,  1824,   349,  ...,   272, 10539,   659]], device='cuda:0')


"What is the difference between Harvard and MIT?\n\nHarvard University and the Massachusetts Institute of Technology (MIT) are two of the most prestigious universities in the United States. While they are both highly regarded institutions, there are some key differences between the two.\n\nHarvard University is a private Ivy League research university located in Cambridge, Massachusetts. It was founded in 1636 and is the oldest institution of higher education in the United States. Harvard is known for its strong liberal arts program, as well as its professional schools in law, business, and medicine. The university offers a wide range of undergraduate and graduate degree programs in various fields of study.\n\nMIT, on the other hand, is a private research university located in Cambridge, Massachusetts, and is also known as a leading institution in science, technology, engineering, and mathematics (STEM) fields. MIT was founded in 1861 and is known for its strong emphasis on research an

## This is what you will do if you want to load the original, unquantized model on CPU

In [2]:
tokenizer = AutoTokenizer.from_pretrained("/PHShome/bg615/.cache/huggingface/transformers/openchat_3.5")
model = AutoModelForCausalLM.from_pretrained("/PHShome/bg615/.cache/huggingface/transformers/openchat_3.5")
device = torch.device("cpu")
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32002, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRM

## Test the unquantized model on CPU

In [3]:
def LLM_text(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    generate_ids = model.generate(inputs.input_ids, max_length=1024)
    print(generate_ids)
    question_and_response = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    response = question_and_response.split("ASSISTANT: ")[-1]
    return response

In [4]:
prompt = "What is the difference between Harvard and MIT?"
original_cpu_start_time = time.time()
LLM_text(model, tokenizer, prompt)
original_cpu_end_time = time.time()
original_cpu_run_time = original_cpu_end_time - original_cpu_start_time
print(f"[INFO]: The original CPU model finishes running after {original_cpu_run_time} seconds.")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


tensor([[    1,  1824,   349,  ..., 10539,   349,  9045]])
[INFO]: The original CPU model finishes running after 579.0212705135345 seconds.
