In [1]:
!pip install -q transformers datasets peft accelerate hf_transfer evaluate rouge_score
!pip install -q bitsandbytes -U

| **Scenario** | **Name / Description**                         | **Precision / Quantization**                                     | **LoRA Applied** | **LoRA State** | **Notes**                                                                                |
| ------------ | ---------------------------------------------- | ---------------------------------------------------------------- | ---------------- | -------------- | ---------------------------------------------------------------------------------------- |
| **1**        | **Baseline**                  | `bfloat16`                                      | ❌ No             | N/A            | Standard Llama-3.2-1B-Instruct model; used as the pure baseline for comparison.          |
| **2**        | **Baseline + Untrained LoRA** | `bfloat16`                                      | ✅ Yes            | Untrained      | LoRA adapters added but not trained; tests effect of LoRA initialization on output.      |
| **3**        | **Quantized Baseline (4-bit)**                 | 4-bit quantization (NF4 + double quantization, bfloat16 compute) | ❌ No             | N/A            | Evaluates performance and accuracy impact of quantization alone.                         |
| **4**        | **Quantized + Untrained LoRA**                 | 4-bit quantization (same NF4 config as Scenario 3)               | ✅ Yes            | Untrained      | Combines quantization and LoRA adapters without training to assess combined overhead.    |


In [2]:
import os
import re
import json
import torch
import evaluate
from tqdm import tqdm
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)
from peft import LoraConfig, get_peft_model

In [3]:
# Set the TOKENIZERS_PARALLELISM switch to "false" for this process before any
# tokenizer is created.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# Ask PyTorch to release any cached CUDA memory before the models are loaded.
torch.cuda.empty_cache()

# LOAD & PREPARE DATA

In [4]:
# Fetch all splits of the SAMSum dataset from the Hub.
dataset = load_dataset("knkarthick/samsum")
train_data = dataset["train"]

# Take 200 examples from each held-out split after a seeded shuffle, so the
# same subset is selected on every run.
test_data, val_data = (
    dataset[split_name].shuffle(seed=42).select(range(200))
    for split_name in ("test", "validation")
)


In [None]:

def save_jsonl(data, filename):
    """Write every sample in ``data`` to ``filename`` as one JSON object per line.

    Args:
        data: Iterable of JSON-serializable samples.
        filename: Destination path; the file is opened in "w" mode, so any
            existing content is replaced.
    """
    with open(filename, "w") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in data)

# Persist the sampled validation subset to disk in JSON Lines format.
save_jsonl(val_data, "validation_data.jsonl")


In [None]:

# Reload the validation subset from the JSONL file written earlier; the loaded
# rows are read from the "train" key of the returned object.
val_data = load_dataset("json", data_files="validation_data.jsonl")["train"]

def format_prompt(example):
    """Render one example as a dialogue block followed by an empty summary header.

    Args:
        example: Mapping with a 'dialogue' key.

    Returns:
        str: Prompt text that ends with the "## Summary:" header and a newline.
    """
    # The header is written "## Summary:" (with a space) so it matches the
    # "## Dialogue:" header in this string and the one produced by
    # build_summary_prompt.
    return f"## Dialogue:\n{example['dialogue']}\n## Summary:\n"

# Add a "text" column containing the formatted prompt for each validation example.
val_data = val_data.map(lambda ex: {"text": format_prompt(ex)})

## Helper Methods for Evaluation

In [30]:
# Instruction text passed as `task_instruction` to the evaluation helpers;
# build_summary_prompt places it in front of the dialogue block.
task_instruction = (
    "You are a helpful assistant who writes concise, factual summaries of conversations. "
    "Summarize the following conversation into a single sentence."
)

def build_summary_prompt(sample: dict, with_answer: bool = False, task_instruction: str = None) -> str:
    """
    Build a formatted summarization prompt from a single dataset sample.

    Missing keys and keys whose value is ``None`` are both treated as empty
    strings, and a ``task_instruction`` that is ``None``, empty, or only
    whitespace adds no prefix.

    Args:
        sample (dict): Dictionary containing 'dialogue' and 'summary' keys.
        with_answer (bool, optional): If True, append the ground truth summary
            after the summary header (for training). Defaults to False.
        task_instruction (str, optional): Instruction text to place before the
            dialogue block (e.g. "Summarize this conversation...").

    Returns:
        str: Prompt of the form
            "[<instruction>\\n\\n]## Dialogue:\\n<dialogue>\\n## Summary:[ <summary>]".
    """
    # dict.get's default only applies to *missing* keys; `or ""` additionally
    # covers keys that are present but hold None, which would otherwise make
    # .strip() raise AttributeError.
    dialogue = (sample.get("dialogue") or "").strip()
    summary = (sample.get("summary") or "").strip()

    # Strip before testing so a whitespace-only instruction does not leave a
    # stray blank-line prefix in the prompt.
    instruction = (task_instruction or "").strip()
    instruction_part = f"{instruction}\n\n" if instruction else ""

    prompt = f"{instruction_part}## Dialogue:\n{dialogue}\n## Summary:"

    # Append the reference summary only when requested and non-empty.
    if with_answer and summary:
        prompt += f" {summary}"

    return prompt


# TOKENIZER

In [6]:
# Checkpoint identifier used for the tokenizer and for the models loaded below.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token for padding (presumably the checkpoint ships without a
# dedicated pad token — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left; intended for the batched generation done by the evaluation
# helper, so that each prompt ends right where generation starts.
tokenizer.padding_side = "left"

# EVALUATION FUNCTION

In [38]:
def process_batch_predictions(
    pipe,
    samples,
    num_samples=None,
    task_instruction=None,
    batch_size=8,
    max_new_tokens=256,
):
    """
    Generate one summary per sample by sending prompts through `pipe` in batches.

    Args:
        pipe: Callable invoked as
            ``pipe(list_of_prompts, max_new_tokens=..., batch_size=..., return_full_text=False)``.
            For every prompt it must yield a list whose first element is a dict
            with a "generated_text" key.
        samples: Sized collection of samples accepted by build_summary_prompt.
            If it exposes ``.select`` (as Hugging Face datasets do), that is
            used for subsetting; otherwise it is indexed by position.
        num_samples (int, optional): Evaluate only the first `num_samples`
            samples. Ignored when None or not smaller than ``len(samples)``.
        task_instruction (str, optional): Forwarded to build_summary_prompt.
        batch_size (int): Number of prompts sent to `pipe` per call.
        max_new_tokens (int): Generation length limit forwarded to `pipe`.

    Returns:
        list[str]: Stripped generated texts, in the same order as the samples.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    # tqdm.auto picks a notebook-friendly progress bar when one is available.
    from tqdm.auto import tqdm

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Apply num_samples limit safely
    if num_samples is not None and num_samples < len(samples):
        if hasattr(samples, "select"):
            eval_data = samples.select(range(num_samples))
        else:
            eval_data = [samples[i] for i in range(num_samples)]
        print(f"Evaluating a subset of {num_samples} samples...")
    else:
        eval_data = samples
        print(f"Evaluating all {len(samples)} samples...")

    # Build every prompt once; a plain list is sliced per batch, so no column
    # has to be re-read from a dataset object on each iteration.
    prompts = [
        build_summary_prompt(ex, task_instruction=task_instruction, with_answer=False)
        for ex in eval_data
    ]

    predictions = []
    total_samples = len(prompts)

    # The context manager closes the progress bar even if generation raises.
    with tqdm(total=total_samples, desc="Generating summaries", leave=True, position=0) as pbar:
        for start in range(0, total_samples, batch_size):
            batch_prompts = prompts[start:start + batch_size]

            outputs = pipe(
                batch_prompts,
                max_new_tokens=max_new_tokens,
                batch_size=batch_size,
                return_full_text=False,
            )

            for output in outputs:
                # Each per-prompt result is a list of candidates; keep the first.
                predictions.append(output[0]["generated_text"].strip())
                pbar.update(1)

    return predictions

def compute_rouge_scores(predictions, samples):
    """
    Compute and print ROUGE-1/2/L for `predictions` against the samples' summaries.

    Args:
        predictions (list[str]): Generated summaries, ordered like `samples`.
        samples: Iterable of mappings with a 'summary' key. It may be longer
            than `predictions` (e.g. when only the first `num_samples` samples
            were generated); in that case only the leading references are used.

    Returns:
        The object returned by the ROUGE metric's ``compute``; it is indexed
        here with the keys 'rouge1', 'rouge2' and 'rougeL'.

    Raises:
        ValueError: If there are more predictions than samples.
    """
    print("\nComputing ROUGE scores...")

    references = [ex["summary"] for ex in samples]
    if len(predictions) > len(references):
        raise ValueError(
            f"Got {len(predictions)} predictions but only {len(references)} reference summaries."
        )
    if len(predictions) < len(references):
        # process_batch_predictions(num_samples=k) generates for the FIRST k
        # samples, so pair the predictions with the first k references instead
        # of handing the metric lists of different lengths.
        print(f"Scoring {len(predictions)} predictions against the first {len(predictions)} references.")
        references = references[:len(predictions)]

    rouge = evaluate.load("rouge")
    rouge_scores = rouge.compute(predictions=predictions, references=references)
    print(
        f"\nROUGE scores on validation set:\n"
        f"Rouge-1: {rouge_scores['rouge1']:.2%}\n"
        f"Rouge-2: {rouge_scores['rouge2']:.2%}\n"
        f"Rouge-L: {rouge_scores['rougeL']:.2%}"
    )
    return rouge_scores


# SCENARIO 1: Baseline (BF16 Precision)

In [39]:
# Scenario 1 model: the checkpoint loaded in bfloat16 with automatic device placement.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    dtype=torch.bfloat16,
)

# Greedy decoding (do_sample=False) keeps the evaluation deterministic.
# No `temperature` is passed: it only applies when sampling is enabled, so
# combining it with do_sample=False is contradictory and has no effect.
pipe1 = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    device_map="auto",
    do_sample=False,
)

Device set to use cuda:0


In [40]:
# `task_instruction` is the module-level instruction defined next to
# build_summary_prompt; it is reused here rather than redefined so that every
# scenario is prompted with exactly the same text.
print("Scenario 1 - Evaluate baseline model")
predictions = process_batch_predictions(
    pipe=pipe1,
    samples=val_data,
    task_instruction=task_instruction,
)
rouge_scores = compute_rouge_scores(predictions, val_data)
torch.cuda.empty_cache()

Scenario 1 - Evaluate baseline model
Evaluating all 200 samples...


Generating summaries:   0%|          | 0/200 [00:00<?, ?it/s]


Computing ROUGE scores...

ROUGE scores on validation set:
Rouge-1: 34.67%
Rouge-2: 12.83%
Rouge-L: 26.95%


# SCENARIO 2: Baseline + Untrained LoRA (BF16 Precision)

### LoRA Config

In [41]:
# LoRA adapter settings shared by the "untrained LoRA" scenarios (2 and 4).
lora_cfg = LoraConfig(
    r=8,  # adapter rank
    lora_alpha=16,  # LoRA scaling parameter
    target_modules=["q_proj", "v_proj"],  # adapters are attached to modules with these names
    lora_dropout=0.1,  # dropout probability inside the adapter path
    bias="none",
    task_type="CAUSAL_LM"
)


### Evaluate 

In [42]:
# NOTE: get_peft_model injects the adapter layers into `base_model` in place,
# so after this cell `base_model` / `pipe1` also carry the (untrained) adapters.
base_fp_lora = get_peft_model(base_model, lora_cfg)
# Put the wrapped model in eval mode so the adapter dropout (lora_dropout=0.1)
# is disabled during generation.
base_fp_lora.eval()

# Greedy decoding; `temperature` is omitted because it only applies when sampling.
pipe2 = pipeline(
    "text-generation",
    model=base_fp_lora,
    tokenizer=tokenizer,
    device_map="auto",
    do_sample=False,
)
print("Scenario 2 - Evaluate baseline model with LoRA Adapters")
predictions = process_batch_predictions(
    pipe=pipe2,
    samples=val_data,
    task_instruction=task_instruction,
)
rouge_scores = compute_rouge_scores(predictions, val_data)
torch.cuda.empty_cache()

Device set to use cuda:0


Scenario 2 - Evaluate baseline model with LoRA Adapters
Evaluating all 200 samples...


Generating summaries:   0%|          | 0/200 [00:00<?, ?it/s]


Computing ROUGE scores...

ROUGE scores on validation set:
Rouge-1: 34.67%
Rouge-2: 12.83%
Rouge-L: 26.95%


# SCENARIO 3: Quantized Baseline (4-bit)

### Quantization Config

In [43]:
# Quantization settings for the 4-bit scenarios (3 and 4): NF4 quantization
# type, double quantization enabled, bfloat16 as the compute dtype.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

### Evaluate

In [44]:
# Scenario 3 model: the same checkpoint loaded with the 4-bit quantization config.
base_4bit = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_cfg,
    device_map="auto",
)

# Greedy decoding; `temperature` is omitted because it only applies when sampling.
pipe3 = pipeline(
    "text-generation",
    model=base_4bit,
    tokenizer=tokenizer,
    device_map="auto",
    do_sample=False,
)

print("Scenario 3 - Evaluate Quantized Model")
predictions = process_batch_predictions(
    pipe=pipe3,
    samples=val_data,
    task_instruction=task_instruction,
)
rouge_scores = compute_rouge_scores(predictions, val_data)
torch.cuda.empty_cache()

Device set to use cuda:0


Scenario 3 - Evaluate Quantized Model
Evaluating all 200 samples...


Generating summaries:   0%|          | 0/200 [00:00<?, ?it/s]


Computing ROUGE scores...

ROUGE scores on validation set:
Rouge-1: 32.25%
Rouge-2: 11.94%
Rouge-L: 24.73%


# SCENARIO 4: Quantized + Untrained LoRA

In [45]:
# NOTE: get_peft_model injects the adapter layers into `base_4bit` in place,
# so after this cell `base_4bit` / `pipe3` also carry the (untrained) adapters.
base_4bit_lora = get_peft_model(base_4bit, lora_cfg)
# Put the wrapped model in eval mode so the adapter dropout (lora_dropout=0.1)
# is disabled during generation.
base_4bit_lora.eval()

# Greedy decoding; `temperature` is omitted because it only applies when sampling.
pipe4 = pipeline(
    "text-generation",
    model=base_4bit_lora,
    tokenizer=tokenizer,
    device_map="auto",
    do_sample=False,
)
print("Scenario 4 - Evaluate Quantized Model with LoRA Adapters")
predictions = process_batch_predictions(
    pipe=pipe4,
    samples=val_data,
    task_instruction=task_instruction,
)
rouge_scores = compute_rouge_scores(predictions, val_data)
torch.cuda.empty_cache()

Device set to use cuda:0


Scenario 4 - Evaluate Quantized Model with LoRA Adapters
Evaluating all 200 samples...


Generating summaries:   0%|          | 0/200 [00:00<?, ?it/s]


Computing ROUGE scores...

ROUGE scores on validation set:
Rouge-1: 32.25%
Rouge-2: 11.94%
Rouge-L: 24.73%
