In [3]:
import torch
import torch.nn as nn
import numpy as np

### QLoRA (NF4 + Double Quantisation)

In [None]:
# NF4 uses a lookup table: every weight is replaced by the nearest value in the table

def nf4_lookuptable(weight):
    """Return the index of the lookup-table level that is closest to ``weight``.

    The 16 levels are scanned in order and the one with the smallest absolute
    difference to ``weight`` wins. Because the comparison is ``<=``, a tie is
    resolved in favour of the later level; in particular a value equidistant
    from the two zero entries (indices 7 and 8) maps to index 8.

    The index, not the level itself, is returned so the caller can store it
    compactly and look the value up again later. If no level ever compares
    ``<=`` the running minimum (e.g. ``weight`` is NaN) the result is -1.
    """
    # NOTE(review): indices 7 and 8 are both zero (-0.0 and 0.0), so only 15 distinct
    # levels are available. The reference NF4 codebook appears to have a single zero and
    # an extra level near 0.7229568 between 0.562617 and 1.0 - confirm, and if changed,
    # update the module-level `nf4_table` in the same edit so the indices keep their meaning.
    levels = (
        -1.0, -0.6961928, -0.52507305, -0.3949175,
        -0.28444138, -0.18477343, -0.09105027, -0.0,
        0.0, 0.0795803, 0.1609302, 0.24611232,
        0.33791524, 0.44070983, 0.562617, 1.0,
    )

    closest_index, smallest_gap = -1, float("inf")
    for position, level in enumerate(levels):
        gap = abs(level - weight)          # distance between this level and the weight
        if gap <= smallest_gap:            # `<=` keeps the later level on ties
            closest_index, smallest_gap = position, gap
    return closest_index

In [None]:
# Returns the quantized matrix given a pre-trained / normal weight matrix

def find_nf4(weight_matrix):
    """Quantise a 2-D weight matrix to lookup-table indices.

    Every element ``weight_matrix[r][c]`` is passed to ``nf4_lookuptable`` and
    replaced by the index it returns. The result is a list of rows (plain
    Python lists). The column count is taken from the first row, so all rows
    are read with that width.
    """
    return [
        [
            nf4_lookuptable(weight_matrix[row][col])    # index of the nearest table level
            for col in range(len(weight_matrix[0]))
        ]
        for row in range(len(weight_matrix))
    ]

In [None]:
# Use a small random weight matrix (uniform in [-1, 1)) so the effect of quantisation is easy to inspect.
# The values come from a dedicated generator with a fixed seed, so every run of the notebook
# produces the same matrix and torch's global random state is left untouched.

SAMPLE_SEED = 0
sample_weight_matrix = (
    torch.rand(
        size=(5, 5),
        dtype=torch.float32,
        generator=torch.Generator().manual_seed(SAMPLE_SEED),
    )
    * 2
    - 1    # rescale from [0, 1) to [-1, 1)
)
sample_weight_matrix

tensor([[ 0.3733, -0.5848, -0.9635,  0.6661, -0.9457],
        [-0.1754,  0.7782,  0.6323,  0.5214, -0.9463],
        [ 0.9396,  0.1672,  0.2616, -0.2405,  0.8747],
        [-0.2828,  0.1751, -0.0074, -0.8564,  0.5614],
        [-0.7298,  0.6567,  0.6597, -0.5610, -0.3041]])

In [8]:
# Module-level copy of the 16-level lookup table; used to turn stored indices back into values.
# It must stay identical to the table inside `nf4_lookuptable`, because the indices that
# function returns are positions in this list.
# NOTE(review): indices 7 and 8 are both zero (-0.0 and 0.0), so one of the 16 codes is
# redundant. The reference NF4 codebook appears to have a single zero and a level near
# 0.7229568 between 0.562617 and 1.0 - confirm, and change both copies together.
nf4_table = [
    -1.0, -0.6961928, -0.52507305, -0.3949175,       # indices 0-3
    -0.28444138, -0.18477343, -0.09105027, -0.0,     # indices 4-7
    0.0, 0.0795803, 0.1609302, 0.24611232,           # indices 8-11
    0.33791524, 0.44070983, 0.562617, 1.0,           # indices 12-15
]

In [None]:
# Quantise the sample matrix: every weight is replaced by the index (0-15) of its nearest lookup-table value,
# so each entry needs only 4 bits of information. These indices are what would be kept in memory; the actual
# weight is recovered later by reading the lookup table at the stored index.
# NOTE(review): the tensor built here uses torch's default integer dtype, not a packed 4-bit format.

quantised_weight_matrix = torch.tensor(find_nf4(sample_weight_matrix))
quantised_weight_matrix

tensor([[12,  2,  0, 14,  0],
        [ 5, 14, 14, 14,  0],
        [15, 10, 11,  4, 15],
        [ 4, 10,  8,  0, 14],
        [ 1, 14, 14,  2,  4]])

In [None]:
# Dequantise: replace every stored index with the lookup-table value it points to.
# Indexing a tensor of the table values with the index matrix performs the whole lookup in one
# step, yields a float tensor with the same shape as the index matrix, and leaves no loop
# temporaries behind in the notebook namespace.

quantized_weights = torch.tensor(nf4_table)[quantised_weight_matrix]
quantized_weights


tensor([[ 0.3379, -0.5251, -1.0000,  0.5626, -1.0000],
        [-0.1848,  0.5626,  0.5626,  0.5626, -1.0000],
        [ 1.0000,  0.1609,  0.2461, -0.2844,  1.0000],
        [-0.2844,  0.1609,  0.0000, -1.0000,  0.5626],
        [-0.6962,  0.5626,  0.5626, -0.5251, -0.2844]])

In [None]:
# Quantisation error (the information lost by NF4): original weight minus its nearest lookup-table value, element-wise

sample_weight_matrix - quantized_weights

tensor([[ 0.0353, -0.0597,  0.0365,  0.1035,  0.0543],
        [ 0.0093,  0.2156,  0.0697, -0.0412,  0.0537],
        [-0.0604,  0.0063,  0.0155,  0.0440, -0.1253],
        [ 0.0016,  0.0142, -0.0074,  0.1436, -0.0012],
        [-0.0336,  0.0940,  0.0971, -0.0360, -0.0197]])

### GPTQ (Post-Training Quantisation for Generative Pre-trained Transformers)

In [None]:
! pip install auto-gptq transformers accelerate

In [None]:
# Quantise a small model with the auto-gptq library and compare one layer before and after.
# GPTQ tries to minimise the difference between the original and the quantised layer outputs.
# It searches for quantisation parameters (such as the scaling factor and offset) that give the least output error
# on the example inputs, and those parameters are then used for all later inputs.

import torch
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "facebook/opt-125m"    # Hugging Face model id of the base model

# Tokenizer and the unquantised reference model (kept so its weights can be compared below)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Quantisation settings
quantize_config = BaseQuantizeConfig(
    bits=4,                     # Store each weight in 4 bits
    group_size=128,             # presumably weights share quantisation parameters in groups of 128 - confirm in the auto-gptq docs
    desc_act=False,             # NOTE(review): activation-order option left off; confirm its meaning in the auto-gptq docs
)

# Load a second copy of the base model wrapped for GPTQ, using the quantisation parameters defined above
# NOTE(review): trust_remote_code=True permits running code shipped with the model repository; confirm it is needed here
quant_model = AutoGPTQForCausalLM.from_pretrained(
    model_name,
    quantize_config = quantize_config,
    trust_remote_code = True,
)

# A sample input GPTQ uses to find the quantisation parameters
# NOTE(review): a single short sentence is a very small calibration set
example_inputs = tokenizer("Hello, this is a test input for GPTQ.", return_tensors="pt")

# Quantise the model
quant_model.quantize([example_inputs])

orig_weight = model.model.decoder.layers[0].self_attn.q_proj.weight             # Original unquantised weight: layer 0 attention query projection
quant_layer = quant_model.model.model.decoder.layers[0].self_attn.q_proj        # The same layer taken from the quantised model

print("Original Weight (first 5x5 block):")
print(orig_weight[:5, :5].to(torch.float32))
# Header only: the quantised values themselves are printed by the next cell
print("\nQuantized Weight (first 5x5 block):")

In [None]:
# Print the top-left 5x5 block of the quantised layer's `qweight` tensor.
# These are the stored integers as-is, not yet split into individual 4-bit values.

print(quant_layer.qweight[:5, :5])

In [None]:
# PyTorch has no 4-bit integer dtype, so the GPTQ layer keeps its 4-bit codes packed eight to a
# word inside an int32 tensor (`qweight`). To see the individual codes, every word is shifted
# right by 0, 4, ..., 28 bits and masked with 0xF.
#
# Layout used by auto-gptq when packing: `qweight` has shape (in_features // 8, out_features) and
# nibble j of `qweight[r, c]` is the code for input feature r * 8 + j and output feature c.
# The unpacked array is therefore (in_features, out_features) and is transposed at the end so
# that it lines up with `orig_weight`, which is (out_features, in_features).
#
# The codes are unsigned (0..15). The real weight is scale * (code - zero_point) using the
# layer's scales and zero points, so the codes are not reinterpreted as signed numbers here.

import torch
import numpy as np

GPTQ_BITS = 4
CODES_PER_WORD = 32 // GPTQ_BITS

packed_int32_tensor = quant_layer.qweight.cpu().numpy()
orig_shape = orig_weight.shape

# One shift amount per nibble, broadcast over a new middle axis: (words, 8, out_features)
shifts = (np.arange(CODES_PER_WORD, dtype=np.int32) * GPTQ_BITS)[None, :, None]
nibbles = (packed_int32_tensor[:, None, :] >> shifts) & ((1 << GPTQ_BITS) - 1)

# Merge (word, nibble) into the input-feature axis: row index becomes word * 8 + nibble
unpacked_4bit = nibbles.reshape(-1, packed_int32_tensor.shape[1]).astype(np.uint8)

unpacked_weight_matrix = unpacked_4bit.T
assert unpacked_weight_matrix.shape == tuple(orig_shape), (
    f"unpacked shape {unpacked_weight_matrix.shape} does not match original weight shape {tuple(orig_shape)}"
)
print("Unpacked 4-bit quantized weights (first 5x5 block):")
print(unpacked_weight_matrix[:5, :5])

### AWQ (Activation-aware Weight Quantisation)

In [None]:
# Like GPTQ, but AWQ additionally looks at which weights are most sensitive to the activations,
# i.e. which weights influence the output most for typical inputs; those weights are treated differently from the rest.
# Note: this cell rebinds `model_name`, `tokenizer` and `model` from the GPTQ section above.

from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "OriDragon2000/Qwen1.5-1.8B-awq-w4-g128"               # Qwen model that has already been quantised with AWQ

# Running AWQ quantisation itself needs hardware that was not available, so an already-quantised AWQ model is loaded for inference instead

# Tokenizer and model for Qwen
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Sample input text, tokenised and moved to the device the model was placed on
input_text = "What is the capital of France?"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Generate up to 50 new tokens for the prompt and print the decoded text
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Show the stored `qweight` tensor of the first layer's MLP down-projection (integers as stored, not yet unpacked)
print((model.model.layers[0].mlp.down_proj.qweight))

In [None]:
# As with GPTQ, the 4-bit values are stored packed eight to a word inside an int32 tensor, so
# they have to be split out manually: each word is shifted right by 0, 4, ..., 28 bits and
# masked with 0xF. The unpacking is done with vectorised numpy operations instead of one
# Python iteration per 4-bit value; the resulting array is the same.
#
# NOTE(review): two assumptions are carried over unchanged and should be confirmed against the
# AWQ kernel this checkpoint uses:
#   * nibbles >= 8 are reinterpreted as negative (two's complement); AWQ presumably stores
#     unsigned codes with separate zero points.
#   * the flat nibble stream is reshaped straight to (out_features, in_features); the packed
#     layout may use a different axis and nibble order, in which case rows/columns are scrambled.
import torch
import numpy as np

quant_layer = model.model.layers[0].mlp.down_proj

packed_int32_tensor = quant_layer.qweight.cpu().numpy()
orig_shape = quant_layer.out_features, quant_layer.in_features
packed_int32_flat = packed_int32_tensor.flatten()

# Column j holds nibble j of every word: shape (number of words, 8)
shifts = np.arange(8, dtype=np.int32) * 4
nibbles = (packed_int32_flat[:, None] >> shifts) & 0xF

# Map 8..15 to -8..-1, then lay the nibbles out word by word (nibble 0 first)
unpacked_4bit = np.where(nibbles >= 8, nibbles - 16, nibbles).astype(np.int8).reshape(-1)

unpacked_weight_matrix = unpacked_4bit.reshape(orig_shape)

print("Unpacked 4-bit quantized weights (5x5 block):")
print(unpacked_weight_matrix[:5, :5])