In [11]:
!pip install -q transformers datasets peft accelerate hf_transfer evaluate rouge_score
!pip install bitsandbytes -U



| **Scenario** | **Name / Description**                         | **Precision / Quantization**                                     | **LoRA Applied** | **LoRA State** | **Notes**                                                                                |
| ------------ | ---------------------------------------------- | ---------------------------------------------------------------- | ---------------- | -------------- | ---------------------------------------------------------------------------------------- |
| **1**        | **Baseline**                  | `bfloat16`                                      | ❌ No             | N/A            | Standard Llama-3.2-1B-Instruct model (unquantized); used as the pure baseline for comparison. |
| **2**        | **Baseline + Untrained LoRA** |`bfloat16`                                      | ✅ Yes            | Untrained      | LoRA adapters added but not trained; tests effect of LoRA initialization on output.      |
| **3**        | **Quantized Baseline (4-bit)**                 | 4-bit quantization (NF4 + double quantization, bfloat16 compute) | ❌ No             | N/A            | Evaluates performance and accuracy impact of quantization alone.                         |
| **4**        | **Quantized + Untrained LoRA**                 | 4-bit quantization                                               | ✅ Yes            | Untrained      | Combines quantization and LoRA adapters without training to assess combined overhead.    |


In [12]:
import os
import re
import json
import torch
import evaluate
from tqdm import tqdm
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)
from peft import LoraConfig, get_peft_model

In [13]:
# Start from an empty CUDA caching allocator, then switch off tokenizer-level
# parallelism through the environment variable read by the tokenizers library.
torch.cuda.empty_cache()
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# LOAD & PREPARE DATA

In [14]:
# Pull every split of the knkarthick/samsum dataset; the full train split is kept as-is.
dataset = load_dataset("knkarthick/samsum")
train_data = dataset["train"]

# Held-out splits: shuffle with a fixed seed, then keep the first 200 rows of each.
test_data, val_data = (
    dataset[split_name].shuffle(seed=42).select(range(200))
    for split_name in ("test", "validation")
)

def save_jsonl(data, filename):
    """Write every item of `data` to `filename` as JSON Lines.

    Args:
        data: Iterable of JSON-serialisable records.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, "w") as out_file:
        # One compact JSON document per line, each terminated by a newline.
        out_file.writelines(json.dumps(record) + "\n" for record in data)

# Round-trip the validation subset through a JSONL file on disk, then read it
# back with the "json" loader (whose single default split is named "train").
save_jsonl(data=val_data, filename="validation_data.jsonl")

val_data = load_dataset("json", data_files="validation_data.jsonl", split="train")

def format_prompt(example):
    """Build the summarisation prompt for one dataset row.

    The section headers use the same "## Dialogue:" / "## Summary:" spelling as
    the prompts built in `evaluate_model`, so both prompt sources agree.

    Args:
        example: Mapping with a "dialogue" entry holding the conversation text.

    Returns:
        The prompt string, ending right after the "## Summary:" header line.
    """
    return f"## Dialogue:\n{example['dialogue']}\n## Summary:\n"

# Store the formatted prompt of every validation row under the "text" key.
val_data = val_data.map(lambda row: dict(text=format_prompt(row)))


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

# TOKENIZER

In [15]:
# Checkpoint shared by the tokenizer and every model load in this notebook.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Batched generation setup: pad on the left and reuse the EOS token as the pad token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# EVALUATION FUNCTION

In [16]:
def calculate_rouge(generated_texts, true_summary):
    """Score generated texts against reference summaries with the `evaluate` ROUGE metric.

    Args:
        generated_texts: Model outputs, passed to the metric as `predictions`.
        true_summary: Reference summaries, passed to the metric as `references`.

    Returns:
        Whatever the metric's `compute` call returns; `evaluate_model` reads the
        "rouge1", "rouge2", "rougeL" and "rougeLsum" entries from it.
    """
    metric = evaluate.load("rouge")
    return metric.compute(predictions=generated_texts, references=true_summary)

# Main evaluation function
def evaluate_model(pipe, system_prompt: str = None, num_samples: int = None, batch_size: int = 8, max_new_tokens: int = 256):
    """Generate summaries for the validation data and report ROUGE scores.

    Reads the module-level `val_data` (rows with "dialogue" and "summary") and
    the module-level `tokenizer` (used only to count generated tokens).

    Args:
        pipe: Text-generation pipeline; called with a list of prompt strings.
        system_prompt: Optional text placed before the dialogue, separated by a
            blank line. Omitted when None or empty.
        num_samples: Evaluate only the first `num_samples` rows of `val_data`;
            None evaluates all rows.
        batch_size: Number of prompts sent to `pipe` per call.
        max_new_tokens: Generation budget per sample; also the threshold used
            to count a response as truncated.

    Returns:
        The result of `calculate_rouge` for all generated summaries; its
        "rouge1", "rouge2", "rougeL" and "rougeLsum" entries are printed.
    """
    # Local import: tqdm.auto picks the notebook-friendly progress bar.
    from tqdm.auto import tqdm

    # Select subset if specified
    eval_data = val_data if num_samples is None else val_data.select(range(num_samples))

    # Prepare prompts and reference summaries
    prompt_prefix = f"{system_prompt}\n\n" if system_prompt else ""
    prompts = [f"{prompt_prefix}## Dialogue:\n{ex['dialogue']}\n## Summary:\n" for ex in eval_data]
    gt_answers = [ex["summary"] for ex in eval_data]

    total_samples = len(prompts)
    print(f"Running evaluation on {total_samples} samples...")

    generated_answers = []
    truncated_count = 0  # Responses that used up the whole token budget

    # The context manager closes the bar even if generation raises.
    with tqdm(total=total_samples, desc="Evaluating", leave=True, position=0) as pbar:
        for start in range(0, total_samples, batch_size):
            batch_prompts = prompts[start:start + batch_size]

            outputs = pipe(
                batch_prompts,
                max_new_tokens=max_new_tokens,
                batch_size=batch_size,
                return_full_text=False,
            )

            for output in outputs:
                generated_only = output[0]["generated_text"]
                generated_answers.append(generated_only)

                # Count only the generated text: encode() adds special tokens by
                # default, which would inflate the length. Re-tokenising decoded
                # text is still an approximation of the true generated length.
                n_tokens = len(tokenizer.encode(generated_only, add_special_tokens=False))
                if n_tokens >= max_new_tokens:
                    truncated_count += 1

            pbar.update(len(outputs))
            pbar.set_postfix({"truncated": truncated_count})

    rouge = calculate_rouge(generated_answers, gt_answers)
    # Outer double quotes keep these f-strings valid on Python versions before 3.12.
    print(f"rouge1: {rouge['rouge1']:.2f}")
    print(f"rouge2: {rouge['rouge2']:.2f}")
    print(f"rougeL: {rouge['rougeL']:.2f}")
    print(f"rougeLsum: {rouge['rougeLsum']:.2f}")
    return rouge

# LORA CONFIG

In [17]:
# LoRA settings shared by both adapter scenarios: adapters are attached to the
# modules named q_proj and v_proj of a causal language model.
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)


# QUANTIZATION CONFIG

In [18]:
# bitsandbytes settings for the quantized scenarios: 4-bit NF4 weights with
# double quantization, computing in bfloat16.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# SCENARIO 1: Baseline (bfloat16, unquantized)

In [19]:
# Load the unquantized checkpoint in bfloat16; this is the reference baseline.
base_fp = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16, device_map="auto")

# Greedy decoding (do_sample=False). No temperature is passed: it only applies
# when sampling, so combining it with do_sample=False had no effect.
pipe1 = pipeline("text-generation", model=base_fp, tokenizer=tokenizer, device_map="auto", do_sample=False)

rouge = evaluate_model(pipe=pipe1, num_samples=100)
torch.cuda.empty_cache()

Device set to use cuda:0


Running evaluation on 100 samples...


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


rouge1: 0.14
rouge2: 0.05
rougeL: 0.11
rougeLsum: 0.12


# SCENARIO 2: Baseline + Untrained LoRA (bfloat16, unquantized)

In [20]:
# NOTE: get_peft_model injects the adapters into `base_fp` itself (PEFT modifies
# the base model in place), so after this cell `base_fp` is no longer the plain
# baseline. Re-run the Scenario 1 cell first if this cell has to be re-executed.
base_fp_lora = get_peft_model(base_fp, lora_cfg)
# Inference mode, so the adapters' lora_dropout is not applied while generating.
base_fp_lora.eval()

# Greedy decoding; temperature is omitted because it only applies when sampling.
pipe2 = pipeline(
    "text-generation",
    model=base_fp_lora,
    tokenizer=tokenizer,
    device_map="auto",
    do_sample=False,
)

# Keep the scores so the scenarios can be compared later in the notebook.
rouge_fp_lora = evaluate_model(pipe=pipe2, num_samples=100)
torch.cuda.empty_cache()

Device set to use cuda:0


Running evaluation on 100 samples...


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

rouge1: 0.14
rouge2: 0.05
rougeL: 0.11
rougeLsum: 0.12


# SCENARIO 3: Quantized Baseline (4-bit)

In [21]:
# Load the same checkpoint with the 4-bit bitsandbytes configuration.
# NOTE: models and pipelines from earlier cells are still referenced, and
# torch.cuda.empty_cache() only releases cached memory that is no longer in use.
base_4bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_cfg, device_map="auto")

# Greedy decoding; temperature is omitted because it only applies when sampling.
pipe3 = pipeline(
    "text-generation",
    model=base_4bit,
    tokenizer=tokenizer,
    device_map="auto",
    do_sample=False,
)

# Keep the scores so the scenarios can be compared later in the notebook.
rouge_4bit = evaluate_model(pipe3, num_samples=100)
torch.cuda.empty_cache()

Device set to use cuda:0


Running evaluation on 100 samples...


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

rouge1: 0.16
rouge2: 0.06
rougeL: 0.12
rougeLsum: 0.13


# SCENARIO 4: Quantized + Untrained LoRA

In [22]:
# NOTE: get_peft_model injects the adapters into `base_4bit` itself (PEFT
# modifies the base model in place), so after this cell `base_4bit` is no longer
# the plain quantized baseline. Re-run the Scenario 3 cell before re-executing.
base_4bit_lora = get_peft_model(base_4bit, lora_cfg)
# Inference mode, so the adapters' lora_dropout is not applied while generating.
base_4bit_lora.eval()

# Greedy decoding, configured like the other scenarios. The previous
# temperature=0.0 differed from their 0.1; neither value has any effect with
# do_sample=False, so the argument is dropped.
pipe4 = pipeline(
    "text-generation",
    model=base_4bit_lora,
    tokenizer=tokenizer,
    device_map="auto",
    do_sample=False,
)

# Keep the scores so the scenarios can be compared later in the notebook.
rouge_4bit_lora = evaluate_model(pipe4, num_samples=100)
torch.cuda.empty_cache()

Device set to use cuda:0


Running evaluation on 100 samples...


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

rouge1: 0.16
rouge2: 0.06
rougeL: 0.12
rougeLsum: 0.13
