# Lab 6: Quantization & Optimization - SOLUTIONS

**Module 6 - Advanced Optimization Techniques**

In [None]:
import torch
import gc
import time
import math
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Model identifier passed to every `from_pretrained` call in this notebook.
MODEL_NAME = "microsoft/phi-2"

def get_memory_usage():
    """Report the CUDA memory currently allocated by torch, in units of 1e9 bytes.

    Returns:
        A dict with a single ``"allocated"`` key. The value is ``0`` when CUDA
        is not available.
    """
    if not torch.cuda.is_available():
        return {"allocated": 0}
    allocated_gb = torch.cuda.memory_allocated() / 1e9
    return {"allocated": allocated_gb}

def clear_memory():
    """Run the garbage collector, then empty torch's CUDA cache if CUDA is available."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

## Exercise 1: Understanding Quantization - SOLUTION

In [None]:
def manual_quantize_int8(tensor: torch.Tensor) -> tuple:
    """Symmetrically quantize ``tensor`` to int8 with one per-tensor scale.

    The scale maps the largest absolute value in ``tensor`` onto 127.

    Args:
        tensor: Values to quantize.

    Returns:
        A ``(quantized, scale)`` pair. ``quantized`` is an int8 tensor with
        values in [-127, 127]; ``scale`` is a 0-dim tensor such that
        ``quantized.float() * scale`` approximates ``tensor``.
    """
    abs_max = torch.max(torch.abs(tensor))
    scale = abs_max / 127.0
    # An all-zero tensor yields scale == 0; dividing by it would produce NaNs,
    # and casting NaN to int8 is undefined. Zeros are exact under any non-zero
    # scale, so fall back to 1.
    if scale == 0:
        scale = torch.ones_like(scale)
    # Clamp before the cast so a value rounded just past the range cannot wrap.
    quantized = torch.round(tensor / scale).clamp(-127, 127).to(torch.int8)
    return quantized, scale

def manual_dequantize(quantized: torch.Tensor, scale: float) -> torch.Tensor:
    """Convert quantized codes back to float32 by multiplying them by ``scale``."""
    as_float = quantized.to(torch.float32)
    return as_float * scale

def analyze_quantization_error(original: torch.Tensor, bits: int = 8):
    """Measure the round-trip error of symmetric ``bits``-bit quantization.

    The tensor is scaled so its largest absolute value maps to
    ``2 ** (bits - 1) - 1``, rounded, clamped to that range, and scaled back.

    Args:
        original: Floating-point tensor to analyze.
        bits: Bit width of the signed quantization grid; must be at least 2.

    Returns:
        A dict with ``"bits"``, ``"mse"`` (mean squared reconstruction error)
        and ``"relative_error"`` (RMSE divided by the standard deviation of
        ``original``). When the standard deviation is not positive, the
        relative error is ``0.0`` if the reconstruction is exact and ``inf``
        otherwise.

    Raises:
        ValueError: If ``bits`` is smaller than 2, which leaves no non-zero
            level on a symmetric signed grid.
    """
    if bits < 2:
        raise ValueError(f"bits must be >= 2 for symmetric quantization, got {bits}")

    max_val = 2 ** (bits - 1) - 1
    abs_max = torch.max(torch.abs(original))
    scale = abs_max / max_val
    # All-zero input gives scale == 0 and would turn every value into NaN;
    # zeros are exact under any non-zero scale, so use 1.
    if scale == 0:
        scale = torch.ones_like(scale)

    quantized = torch.round(original / scale).clamp(-max_val, max_val)
    reconstructed = quantized * scale

    mse = torch.mean((original - reconstructed) ** 2)
    rmse = mse.sqrt()
    std = torch.std(original)
    if std > 0:
        relative_error = (rmse / std).item()
    else:
        # Constant (or single-element) input: the ratio is undefined, so report
        # an exact reconstruction as 0 and anything else as unbounded.
        relative_error = 0.0 if rmse == 0 else float("inf")

    return {"bits": bits, "mse": mse.item(), "relative_error": relative_error}

# Demo: compare round-trip quantization error on random weights at 8, 4 and 2 bits.
weights = torch.randn(1000, 1000)
for bits in (8, 4, 2):
    # One report line per bit width, using the metrics dict returned above.
    result = analyze_quantization_error(weights, bits)
    print(f"{bits}-bit: MSE={result['mse']:.6f}, Relative Error={result['relative_error']:.4%}")

## Exercise 2: INT8 Quantization - SOLUTION

In [None]:
def load_model_fp16():
    """Load ``MODEL_NAME`` as a causal LM with ``torch.float16`` weights.

    Calls ``clear_memory()`` first, then loads with ``device_map="auto"`` and
    ``trust_remote_code=True``.

    Returns:
        The model object returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    clear_memory()
    load_kwargs = {
        "torch_dtype": torch.float16,
        "device_map": "auto",
        "trust_remote_code": True,
    }
    return AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)

def load_model_int8():
    """Load ``MODEL_NAME`` as a causal LM with 8-bit bitsandbytes quantization.

    Calls ``clear_memory()`` first. The quantization config enables
    ``load_in_8bit`` with ``llm_int8_threshold=6.0``.

    Returns:
        The model object returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    clear_memory()
    int8_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)
    load_kwargs = {
        "quantization_config": int8_config,
        "device_map": "auto",
        "trust_remote_code": True,
    }
    return AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)

# Status message only: this cell defines the loaders but does not call them.
print("INT8 loading functions ready (requires GPU)")

## Exercise 3: NF4 Quantization - SOLUTION

In [None]:
def load_model_nf4():
    """Load ``MODEL_NAME`` as a causal LM with 4-bit NF4 bitsandbytes quantization.

    Calls ``clear_memory()`` first. The quantization config uses the ``"nf4"``
    quant type, ``torch.float16`` as compute dtype, and double quantization.

    Returns:
        The model object returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    clear_memory()
    nf4_settings = {
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.float16,
        "bnb_4bit_use_double_quant": True,
    }
    nf4_config = BitsAndBytesConfig(**nf4_settings)
    return AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=nf4_config,
        device_map="auto",
        trust_remote_code=True,
    )

def benchmark_generation(model, tokenizer, prompts: list, max_tokens: int = 50):
    """Measure generation throughput of ``model`` over ``prompts``.

    Each prompt is tokenized, moved to ``model.device`` and generated on its
    own with ``do_sample=False``. Only the ``generate`` call is timed.

    Args:
        model: Object exposing ``eval()``, ``device`` and ``generate(...)``.
        tokenizer: Callable returning an object with ``.to(device)`` and
            ``.input_ids`` that can be unpacked into ``generate``.
        prompts: Prompt strings to benchmark.
        max_tokens: Value passed as ``max_new_tokens`` for every prompt.

    Returns:
        A dict with ``"tokens_per_sec"`` and ``"total_tokens"``.
        ``tokens_per_sec`` is ``0.0`` when no time was measured (for example
        when ``prompts`` is empty) instead of raising ``ZeroDivisionError``.
    """
    model.eval()
    total_tokens = 0
    total_time = 0.0

    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        # perf_counter is monotonic and high resolution; time.time() can jump
        # with system clock adjustments and is coarse on some platforms.
        start = time.perf_counter()
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=max_tokens, do_sample=False)
        total_time += time.perf_counter() - start
        # Subtract the prompt length so only newly generated positions count.
        total_tokens += outputs.shape[1] - inputs.input_ids.shape[1]

    tokens_per_sec = total_tokens / total_time if total_time > 0 else 0.0
    return {"tokens_per_sec": tokens_per_sec, "total_tokens": total_tokens}

# Status message only: this cell defines the NF4 loader and benchmark but does not call them.
print("NF4 functions ready")

## Exercise 4: QLoRA - SOLUTION

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

def setup_qlora_model():
    """Load ``MODEL_NAME`` in 4-bit NF4 and wrap it with LoRA adapters.

    Steps: free cached memory, load the quantized base model, run
    ``prepare_model_for_kbit_training`` on it, then attach adapters with
    ``get_peft_model``.

    Returns:
        The PEFT-wrapped model returned by ``get_peft_model``.
    """
    # Match the other loaders in this notebook, which free memory before loading.
    clear_memory()

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, quantization_config=bnb_config, device_map="auto", trust_remote_code=True
    )
    model = prepare_model_for_kbit_training(model)

    # NOTE(review): the transformers Phi implementation appears to name the
    # attention output projection `dense` rather than `o_proj`, so `o_proj`
    # alone would leave that layer without an adapter. Both names are listed so
    # either naming scheme is covered -- confirm against model.named_modules().
    attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj", "dense"]
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=attention_projections,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    return model

def analyze_qlora_efficiency(model):
    """Print total and trainable parameter counts for ``model``.

    Args:
        model: Object exposing ``parameters()`` whose items provide
            ``numel()`` and ``requires_grad``.

    Returns:
        None. One summary line is printed; the trainable percentage is
        reported as 0 when the model has no parameters.
    """
    # NOTE(review): for quantized layers numel() presumably reflects the packed
    # storage size rather than the logical weight count -- confirm before
    # quoting the percentage.
    total = 0
    trainable = 0
    # Single pass: count every parameter once and tally the trainable subset.
    for param in model.parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count

    # Guard the parameter-free case instead of raising ZeroDivisionError.
    trainable_pct = 100 * trainable / total if total else 0.0
    print(f"Total: {total:,} | Trainable: {trainable:,} | Trainable %: {trainable_pct:.4f}%")

# Status message only: this cell defines the QLoRA helpers but does not call them.
print("QLoRA setup ready (requires GPU)")

## Checkpoint

Lab 6 complete! **Next:** Lab 7 - Ethical AI & Guardrails