# Effects of Quantization on Text Generation

This notebook is designed to compare the responses of the **Llama 3.1 8B Instruct** model across different quantization configurations using a set of questions selected from the ArenaHard dataset. Specifically, we will examine and contrast the model's performance under the following quantization settings:

- 16-bit (BF16)
- 8-bit (W8A8KV8)
- 4-bit (W4A4KV4)

## 0. To be able to follow the notebook, please first go through the installation steps in the README.md.

### 0.1. Import necessary modules

In [None]:
import os
import sys
import time
from logging import Logger

import torch
import transformers
from transformers import LlamaTokenizerFast

from eval_utils.main import ptq_model
from eval_utils.modeling_llama import LlamaForCausalLM
from utils import utils
from utils.process_args import process_args_ptq

# Shared notebook configuration used by every cell below.
log: Logger = utils.get_logger("spinquant")
MODEL_ID: str = "meta-llama/Llama-3.1-8B-Instruct"
# Prefer the HF_TOKEN environment variable so the secret never has to be pasted
# into (and accidentally committed with) the notebook. If the variable is not
# set, replace the placeholder below with your Hugging Face token.
HF_TOKEN: str = os.environ.get("HF_TOKEN", "<YOUR HF_TOKEN>")
# Upper bound on the number of tokens generated per answer.
MAX_NEW_TOKENS: int = 2000
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 1. Implement some helper functions

###  1.1. Define questions from [ArenaHard](https://huggingface.co/spaces/lmarena-ai/arena-hard-browser)

In [None]:
# Questions fed to every model variant; generate_responses() iterates over this
# list in order. NOTE(review): the wording (typos included) is presumably
# copied verbatim from ArenaHard — confirm before "fixing" any prompt, since
# changing the text changes the model input.
prompts: list[str] = [
    "I have a dataset which contains a list of 2D images, given a new image, how to find the closest image in the dataset",
    "I have black and white images with 1 pixel width white horizonal lines going through the image. How to detect the lines and remove it?",
    "Given a binary array ‘nums’, you are required to find the maximum length of a contiguous subarray that contains an equal number of 0s and 1s.",
    "Proof that Q(sqrt(-11)) is a principal ideal domain",
    "Suppose you an architect of ad network platform that have a task to build a system for optimization of landing page (financial offers, like selling debit cards and getting comissions from it). You have a traffic flow (TF), conversions (CV), pay per click rates (CZ) or pay per offers (PA). Give outline and a concept code for such a system maximizing revenue. Apply thomson samling method (or similar optimal) to get fastest and accurate results from AB testing.",
    "Can you market size revenue that can earned by UK Management Consultancy by advising and implementing FinTech solutions to Capital Markets clients",
    "Show me how to make 1$ using 19 coins",
    "How DO i perform continuous delta hedging with a neural network in python"
]

### 1.2. Define a function to quantize Llama 3.1 8B Instruct
This function will be used to get the quantized 8-bit and 4-bit versions.

In [None]:
def quantize(args):
    """Load the model named in ``args`` and apply post-training quantization.

    Args:
        args: Command-line style argument list. It is installed as
            ``sys.argv`` for the duration of the call, so ``args[0]`` fills
            the program-name slot and the rest are flag/value entries.

    Returns:
        A ``(model, tokenizer)`` tuple: the model returned by ``ptq_model``
        and the matching ``LlamaTokenizerFast``.
    """
    # process_args_ptq() takes no parameters, so the arguments are handed over
    # through sys.argv (presumably it parses sys.argv — confirm in
    # utils.process_args). Restore the original value afterwards so the
    # notebook kernel's own argv is not clobbered permanently.
    saved_argv = sys.argv
    sys.argv = list(args)
    try:
        return _quantize_from_argv()
    finally:
        sys.argv = saved_argv


def _quantize_from_argv():
    """Parse the current ``sys.argv``, then load, quantize and return (model, tokenizer)."""
    model_args, training_args, ptq_args = process_args_ptq()
    print(model_args)
    print(training_args)
    print(ptq_args)

    # Use one credential for config, weights and tokenizer: an explicitly
    # supplied access token wins, otherwise fall back to the notebook-level
    # HF_TOKEN. Previously the tokenizer alone used model_args.access_token.
    hf_token = model_args.access_token or HF_TOKEN

    config = transformers.AutoConfig.from_pretrained(
        model_args.input_model, token=hf_token
    )

    # If the checkpoint ties lm_head to the input embeddings, untie them in the
    # config and remember to copy the embedding weights into lm_head after
    # loading (presumably so both can be quantized independently — confirm).
    process_word_embeddings = False
    if config.tie_word_embeddings:
        config.tie_word_embeddings = False
        process_word_embeddings = True

    dtype = torch.bfloat16 if training_args.bf16 else torch.float16
    model = LlamaForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_args.input_model,
        config=config,
        torch_dtype=dtype,
        token=hf_token,
    )
    if process_word_embeddings:
        model.lm_head.weight.data = model.model.embed_tokens.weight.data.clone()
    model.cuda()

    model = ptq_model(ptq_args, model, model_args)
    model.seqlen = training_args.model_max_length

    log.info("Model PTQ completed %s", model)
    log.info("Start to load tokenizer...")

    tokenizer = LlamaTokenizerFast.from_pretrained(
        pretrained_model_name_or_path=model_args.input_model,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=True,
        add_eos_token=False,
        add_bos_token=False,
        token=hf_token,
    )
    log.info("Complete tokenizer loading...")
    # Callers that want KV caching must request it explicitly in generate().
    model.config.use_cache = False
    return model, tokenizer
    

### 1.3. Implement a function to generate a response for each question

In [None]:
def generate_responses(model, tokenizer, questions=None):
    """Generate, stream and collect one greedy answer per question.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``. When it defines a chat
            template, each question is wrapped as a single user turn.
        questions: Optional iterable of question strings. Defaults to the
            notebook-level ``prompts`` list.

    Returns:
        A list with the raw ``model.generate`` output for each question, in
        input order.
    """
    if questions is None:
        questions = prompts

    responses = []
    for prompt in questions:
        if tokenizer.chat_template is None:
            input_ids = tokenizer(prompt, return_tensors="pt").to(model.device).input_ids
        else:
            messages = [{"role": "user", "content": prompt}]
            # add_generation_prompt appends the assistant header so the model
            # answers the user turn instead of continuing it.
            input_ids = tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_tensors="pt"
            ).to(model.device)
        # skip_prompt keeps the echoed question out of the streamed answer.
        streamer = transformers.TextStreamer(tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True)
        print(f"Question:\n{prompt}\n")
        # The same helper is used for the BF16 baseline, so the banner does
        # not claim the model is quantized.
        print("Generated answer:\n")
        start_generation_time = time.time()

        output = model.generate(input_ids, max_new_tokens=MAX_NEW_TOKENS, use_cache=True, streamer=streamer, do_sample=False)
        generation_time = time.time() - start_generation_time
        responses.append(output)
        # The returned sequence starts with the prompt tokens; subtract them so
        # only newly generated tokens are reported.
        generated_tokens = output.size(1) - input_ids.size(1)
        print(f"\nTokens generated: {generated_tokens}.")
        print(f"Time taken to generate the response: {generation_time:.2f} seconds.")
    return responses

## 2. Load 16-bit version

Note: Loading all three models at once might not be possible due to VRAM limitations. In that case, load each model and generate its responses separately.

In [None]:
# BF16 baseline: fetch the config, load the weights, then build the tokenizer.

config = transformers.AutoConfig.from_pretrained(MODEL_ID, token=HF_TOKEN)

# Load the checkpoint in bfloat16 first, then move it to the selected device.
llama_16bit_model = LlamaForCausalLM.from_pretrained(
    pretrained_model_name_or_path=MODEL_ID,
    token=HF_TOKEN,
    torch_dtype=torch.bfloat16,
    config=config,
)
llama_16bit_model = llama_16bit_model.to(device)

# Tokenizer settings mirror the ones used inside quantize(); no BOS/EOS tokens
# are added automatically.
llama_16bit_tokenizer = LlamaTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=MODEL_ID,
    token=HF_TOKEN,
    use_fast=True,
    padding_side="right",
    model_max_length=2048,
    add_bos_token=False,
    add_eos_token=False,
)

### 2.1. Generate Responses

In [None]:
# Run every prompt through the BF16 baseline. Answers are streamed to stdout
# and the generate() outputs are collected in a list.
responses_16bit = generate_responses(llama_16bit_model, llama_16bit_tokenizer)

## 3. Load the 16-bit model and quantize it to lower-bit variants

### 3.1. Quantize it to 8-bit and generate responses

In [None]:
# Command-line style arguments for the 8-bit (W8A8KV8) configuration.
# quantize() installs this list as sys.argv, so the first entry only fills the
# program-name slot; the remaining entries are flags and flag/value pairs.
# NOTE(review): w/a/k/v presumably stand for weights, activations, keys and
# values (cf. "W8A8KV8") — confirm against utils.process_args.
args_8bit = [
    "2_eval_ptq.sh",
    "--input_model", MODEL_ID,
    "--do_train", "False",
    "--do_eval", "True",
    "--per_device_eval_batch_size", "4",
    "--model_max_length", "2048",
    "--fp16", "False",
    "--bf16", "True",
    "--save_safetensors", "False",
    "--w_bits", "8",
    "--a_bits", "8",
    "--k_bits", "8",
    "--v_bits", "8",
    "--w_clip",
    "--a_asym",
    "--k_asym",
    "--v_asym",
    "--k_groupsize", "128",
    "--v_groupsize", "128",
    "--rotate",
    # Relative path: resolved against the notebook's working directory.
    "--optimized_rotation_path", "rotation_matrices/llama-3.1-8b-instruct/8-bit/R.bin",
]

# Load the checkpoint and quantize it with the settings above.
llama_8bit_model, llama_8bit_tokenizer = quantize(args_8bit)

In [None]:
# Run every prompt through the 8-bit model. Answers are streamed to stdout and
# the generate() outputs are collected in a list.
responses_8bit = generate_responses(llama_8bit_model, llama_8bit_tokenizer)

### 3.2. Quantize it to 4-bit and generate responses

In [None]:
# Command-line style arguments for the 4-bit (W4A4KV4) configuration.
# quantize() installs this list as sys.argv, so the first entry only fills the
# program-name slot; the remaining entries are flags and flag/value pairs.
# NOTE(review): w/a/k/v presumably stand for weights, activations, keys and
# values (cf. "W4A4KV4") — confirm against utils.process_args.
args_4bit = [
    "2_eval_ptq.sh",
    "--input_model", MODEL_ID,
    "--do_train", "False",
    "--do_eval", "True",
    "--per_device_eval_batch_size", "4",
    "--model_max_length", "2048",
    "--fp16", "False",
    "--bf16", "True",
    "--save_safetensors", "False",
    "--w_bits", "4",
    "--a_bits", "4",
    "--k_bits", "4",
    "--v_bits", "4",
    "--w_clip",
    "--a_asym",
    "--k_asym",
    "--v_asym",
    "--k_groupsize", "128",
    "--v_groupsize", "128",
    "--rotate",
    # Relative path: resolved against the notebook's working directory.
    "--optimized_rotation_path", "rotation_matrices/llama-3.1-8b-instruct/4-bit/R.bin",
]

# Load the checkpoint and quantize it with the settings above.
llama_4bit_model, llama_4bit_tokenizer = quantize(args_4bit)

In [None]:
# Run every prompt through the 4-bit model. Answers are streamed to stdout and
# the generate() outputs are collected in a list.
responses_4bit = generate_responses(llama_4bit_model, llama_4bit_tokenizer)