# Dialogue Summarization: Baseline vs System Prompt

This notebook compares Llama's performance on dialogue summarization with and without a system prompt. We'll evaluate using ROUGE scores to measure summary quality.


In [1]:
! pip install -q torch datasets peft huggingface_hub evaluate rouge_score hf_transfer

In [16]:
import os
import warnings
import torch
import evaluate
from tqdm.auto import tqdm
from typing import Optional
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline
)
from huggingface_hub import login
from datasets import Dataset, load_dataset
import evaluate
from tqdm.auto import tqdm
from pprint import pprint

warnings.filterwarnings("ignore")

## 2. Authentication and Configuration

Before we begin, we need to authenticate with HuggingFace (to download models and upload results). Note that this notebook only runs inference and evaluation; it does not train a model, so Weights & Biases tracking is not needed here.

**Important:** Make sure you have a `.env` file with:
- `HF_TOKEN`: Your HuggingFace access token
- `HF_USERNAME`: Your HuggingFace username

We'll also load our training configuration from `config.json`, which contains all hyperparameters and settings.


## Load Data from HuggingFace

In [3]:
# Evaluation settings: a fixed seed and subset size mean every run scores the same examples.
SEED = 42
EVAL_SUBSET_SIZE = 200


def _fixed_subset(split, size=EVAL_SUBSET_SIZE, seed=SEED):
    """Shuffle `split` with a fixed seed and keep at most `size` examples."""
    return split.shuffle(seed=seed).select(range(min(size, len(split))))


dataset = load_dataset("knkarthick/samsum")
train_data = dataset["train"]

test_data = _fixed_subset(dataset["test"])
val_data = _fixed_subset(dataset["validation"])

## Helper Methods for Evaluation

In [4]:
def build_summary_prompt(sample: dict, with_answer: bool = False, task_instruction: Optional[str] = None) -> str:
    """
    Build a formatted summarization prompt from a single dataset sample.

    The prompt has the shape::

        <task_instruction>        (only when a non-blank instruction is given)

        ## Dialogue:
        <dialogue>
        ## Summary:[ <summary>]   (summary only when with_answer=True)

    Args:
        sample (dict): Mapping with 'dialogue' and 'summary' keys. Missing keys
            and None values are both treated as empty strings.
        with_answer (bool, optional): If True, append the ground truth summary
            (for training). Defaults to False.
        task_instruction (str, optional): Instruction to prepend
            (e.g. "Summarize this conversation..."). None, empty, or
            whitespace-only values add nothing to the prompt.

    Returns:
        str: Formatted prompt string ready for model input.
    """
    # `dict.get(key, "")` still returns None when the key exists with a None
    # value, so fall back with `or ""` before stripping.
    dialogue = (sample.get("dialogue") or "").strip()
    summary = (sample.get("summary") or "").strip()

    # Strip first, then test: a whitespace-only instruction must not leave a
    # stray blank prefix at the top of the prompt.
    instruction = (task_instruction or "").strip()
    instruction_part = f"{instruction}\n\n" if instruction else ""

    prompt = f"{instruction_part}## Dialogue:\n{dialogue}\n## Summary:"

    # Add ground truth summary if needed (for training)
    if with_answer and summary:
        prompt += f" {summary}"

    return prompt


In [6]:
# Instruction text that build_summary_prompt places ahead of the dialogue.
task_instruction = "".join(
    [
        "You are a helpful assistant who writes concise, factual summaries of conversations. ",
        "Summarize the following conversation into a single sentence.",
    ]
)

# Preview the exact prompt text built for the first validation example.
sample = val_data[0]
preview_prompt = build_summary_prompt(sample, task_instruction=task_instruction)
print(preview_prompt)

You are a helpful assistant who writes concise, factual summaries of conversations. Summarize the following conversation into a single sentence.

## Dialogue:
Victoria: God I'm really broke, I spent way to much this month 😫
Victoria: At least we get paid soon..
Magda: Yeah, don't remind me, I know the feeling
Magda: I just paid my car insurance, I feel robbed 😂
Victoria: Thankfully mine is paid for the rest of the year 🙏
Magda: 👌
## Summary:


# Evaluation Function

In [15]:
def process_batch_predictions(pipe, samples, num_samples=None, task_instruction=None, batch_size=8, max_new_tokens=256):
    """
    Generate one summary per sample, calling the pipeline on batches of prompts.

    Args:
        pipe: Callable invoked as
            ``pipe(list_of_prompts, max_new_tokens=..., batch_size=..., return_full_text=False)``.
            For each prompt it must return a list whose first item is a dict
            with a ``"generated_text"`` key.
        samples: Collection of examples supporting ``len()`` and iteration; it
            must also support ``.select(indices)`` when ``num_samples`` is
            smaller than its length.
        num_samples (int, optional): Evaluate only the first ``num_samples``
            examples. ``None`` (default) evaluates everything.
        task_instruction (str, optional): Instruction forwarded to
            ``build_summary_prompt``.
        batch_size (int): Number of prompts sent to ``pipe`` per call. Must be >= 1.
        max_new_tokens (int): Generation length limit forwarded to ``pipe``.

    Returns:
        list[str]: Stripped generated texts, in the same order as the evaluated samples.

    Raises:
        ValueError: If ``batch_size`` is less than 1 or ``num_samples`` is negative.
    """
    # Fail fast: a zero step makes range() raise an unrelated error, and a
    # negative step would silently produce no predictions at all.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if num_samples is not None and num_samples < 0:
        raise ValueError(f"num_samples must be non-negative, got {num_samples}")

    # Apply num_samples limit safely
    if num_samples is not None and num_samples < len(samples):
        eval_data = samples.select(range(num_samples))
        print(f"Evaluating a subset of {num_samples} samples...")
    else:
        eval_data = samples
        print(f"Evaluating all {len(samples)} samples...")

    prompts = [
        build_summary_prompt(ex, task_instruction=task_instruction, with_answer=False)
        for ex in eval_data
    ]

    predictions = []
    total_samples = len(prompts)

    # The context manager closes the progress bar even if generation raises.
    with tqdm(total=total_samples, desc="Generating summaries", leave=True, position=0) as pbar:
        for start in range(0, total_samples, batch_size):
            # Slice the plain list directly; no need to wrap the prompts in a
            # Dataset and re-read the column on every iteration.
            batch_prompts = prompts[start:start + batch_size]

            outputs = pipe(
                batch_prompts,
                max_new_tokens=max_new_tokens,
                batch_size=batch_size,
                return_full_text=False,
            )

            # For list input the pipeline returns one list of candidates per
            # prompt; keep the first candidate of each.
            predictions.extend(output[0]["generated_text"].strip() for output in outputs)
            pbar.update(len(batch_prompts))

    return predictions

def compute_rouge_scores(predictions, samples, split_name="validation"):
    """
    Compute and print ROUGE-1, ROUGE-2 and ROUGE-L for generated summaries.

    Args:
        predictions (list[str]): Generated summaries, one per sample.
        samples: Iterable of examples with a ``"summary"`` key holding the
            reference summary. Must be in the same order and of the same
            length as ``predictions``.
        split_name (str): Label used in the printed report. Defaults to
            ``"validation"``.

    Returns:
        The result of the ``rouge`` metric's ``compute`` call; the
        ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` entries are printed as
        percentages.

    Raises:
        ValueError: If the number of predictions differs from the number of samples.
    """
    references = [ex["summary"] for ex in samples]

    # Check alignment before loading the metric: if predictions were generated
    # for only a subset (e.g. via num_samples), scoring against the full split
    # would be meaningless.
    if len(predictions) != len(references):
        raise ValueError(
            f"Got {len(predictions)} predictions but {len(references)} reference summaries; "
            "pass the same samples that were used to generate the predictions."
        )

    print("\nComputing ROUGE scores...")
    rouge = evaluate.load("rouge")
    rouge_scores = rouge.compute(predictions=predictions, references=references)
    print(
        f"\nROUGE scores on {split_name} set:\n"
        f"Rouge-1: {rouge_scores['rouge1']:.2%}\n"
        f"Rouge-2: {rouge_scores['rouge2']:.2%}\n"
        f"Rouge-L: {rouge_scores['rougeL']:.2%}"
    )
    return rouge_scores


## Load the Model

In [8]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Batched generation needs a padding token; reuse EOS for it. Left padding keeps
# each prompt adjacent to the tokens generated after it.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    dtype=torch.bfloat16,
)

pipe = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    device_map="auto",
    # Greedy decoding for deterministic summaries. `temperature` is only used
    # when sampling, so it is deliberately not set alongside do_sample=False.
    do_sample=False,
)

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

## Evaluation with Task Instruction

Now we evaluate the same model with a prompt that provides specific (and better) instructions for summarization. This helps the model understand the task better and should improve performance.


In [10]:
# Instruction text placed ahead of each dialogue by build_summary_prompt.
task_instruction = (
    "You are a helpful assistant who writes concise, factual summaries of conversations. "
    "Summarize the following conversation into a single sentence."
)

# Generate a summary for every example in val_data, then score against the references.
generation_kwargs = dict(pipe=pipe, samples=val_data, task_instruction=task_instruction)
predictions = process_batch_predictions(**generation_kwargs)
rouge_scores = compute_rouge_scores(predictions=predictions, samples=val_data)

# Release GPU memory cached by PyTorch once generation is finished.
torch.cuda.empty_cache()

Evaluating all 200 samples...


Generating summaries:   0%|          | 0/200 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Computing ROUGE scores...
ROUGE SCORES
rouge1: 35.10%
rouge2: 12.99%
rougeL: 27.18%


## Example Generation

Let's look at a specific example to see how the model performs with and without the system prompt.


In [11]:
# Pick a sample
sample = val_data[0]
pprint(sample)

{'dialogue': "Victoria: God I'm really broke, I spent way to much this month "
             '😫\n'
             'Victoria: At least we get paid soon..\n'
             "Magda: Yeah, don't remind me, I know the feeling\n"
             'Magda: I just paid my car insurance, I feel robbed 😂\n'
             'Victoria: Thankfully mine is paid for the rest of the year 🙏\n'
             'Magda: 👌',
 'id': '13829853',
 'summary': 'Magda and Victoria feel broke. '}


In [12]:
# Summary with  task instruction
print("-"*60)
sample_with_task_instruction = build_summary_prompt(sample, task_instruction=task_instruction, with_answer=False)
pprint(f"sample_with_task_instruction=\n{sample_with_task_instruction}")
response = pipe(sample_with_task_instruction, max_new_tokens=256, return_full_text=False)
print("-"*60)
print("Generated Summary:")
print(response[0]['generated_text'])
print("-"*60)

------------------------------------------------------------
('sample_with_task_instruction=\n'
 'You are a helpful assistant who writes concise, factual summaries of '
 'conversations. Summarize the following conversation into a single sentence.\n'
 '\n'
 '## Dialogue:\n'
 "Victoria: God I'm really broke, I spent way to much this month 😫\n"
 'Victoria: At least we get paid soon..\n'
 "Magda: Yeah, don't remind me, I know the feeling\n"
 'Magda: I just paid my car insurance, I feel robbed 😂\n'
 'Victoria: Thankfully mine is paid for the rest of the year 🙏\n'
 'Magda: 👌\n'
 '## Summary:')
------------------------------------------------------------
Generated Summary:
 Victoria and Magda discuss their financial struggles, with Victoria expressing frustration about overspending and Magda joking about being "robbed" of their car insurance payments.
------------------------------------------------------------
