In [22]:
!pip install transformers sentencepiece torch evaluate bert-score scikit-learn rouge-score peft datasets accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [23]:
# Imports and config
import os
import re
from tqdm.auto import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import evaluate

## Load model and tokenizer
We'll use the small FLAN-T5 model to keep things light.
- Tokenizer converts text ↔ tokens
- Model generates outputs given the tokens

In [24]:
MODEL_NAME = "google-t5/t5-small"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print("device:", DEVICE)

device: cuda


In [25]:
print("Loading model and tokenizer... This may take a minute")
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()

Loading model and tokenizer... This may take a minute


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [26]:
print(model.config )

T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "dtype": "float32",
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_bea

In [27]:
print(f"Hidden size (d_model): {model.config.d_model}")
print(f"Encoder layers: {model.config.num_layers}")
print(f"Decoder layers: {model.config.num_decoder_layers}")

print(f"Number of attention heads: {model.config.num_heads}")
print(f"Key-value dimension per head: {model.config.d_kv}")
print(f"Total Q/K/V dimension: {model.config.num_heads * model.config.d_kv}")

Hidden size (d_model): 512
Encoder layers: 6
Decoder layers: 6
Number of attention heads: 8
Key-value dimension per head: 64
Total Q/K/V dimension: 512


In [28]:
# See all parameter names
for name, param in model.named_parameters():
    if 'SelfAttention' in name and 'q' in name:
        print(f"{name}: {param.shape}")


encoder.block.0.layer.0.SelfAttention.q.weight: torch.Size([512, 512])
encoder.block.1.layer.0.SelfAttention.q.weight: torch.Size([512, 512])
encoder.block.2.layer.0.SelfAttention.q.weight: torch.Size([512, 512])
encoder.block.3.layer.0.SelfAttention.q.weight: torch.Size([512, 512])
encoder.block.4.layer.0.SelfAttention.q.weight: torch.Size([512, 512])
encoder.block.5.layer.0.SelfAttention.q.weight: torch.Size([512, 512])
decoder.block.0.layer.0.SelfAttention.q.weight: torch.Size([512, 512])
decoder.block.1.layer.0.SelfAttention.q.weight: torch.Size([512, 512])
decoder.block.2.layer.0.SelfAttention.q.weight: torch.Size([512, 512])
decoder.block.3.layer.0.SelfAttention.q.weight: torch.Size([512, 512])
decoder.block.4.layer.0.SelfAttention.q.weight: torch.Size([512, 512])
decoder.block.5.layer.0.SelfAttention.q.weight: torch.Size([512, 512])


In [29]:
# Total parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")  # 76,961,152

# trainable parameters
trainable = sum(p.numel() for p in model.parameters()
                  if p.requires_grad)

print(f"trainable parameters: {trainable:,}")  # ~6,144,512

Total parameters: 60,506,624
trainable parameters: 60,506,624


In [30]:
# Check a specific attention layer
encoder_attn = model.encoder.block[0].layer[0].SelfAttention

print("Query weight shape:", encoder_attn.q.weight.shape)  # (384, 512)
print("Key weight shape:", encoder_attn.k.weight.shape)    # (384, 512)
print("Value weight shape:", encoder_attn.v.weight.shape)  # (384, 512)
print("Output weight shape:", encoder_attn.o.weight.shape) # (384, 512)


Query weight shape: torch.Size([512, 512])
Key weight shape: torch.Size([512, 512])
Value weight shape: torch.Size([512, 512])
Output weight shape: torch.Size([512, 512])


- Loads SST-2 and SAMSum from Hugging Face datasets.
- Runs zero-shot classification on SST-2 using google/flan-t5-small (prompting the model to return exactly one label).
- Runs zero-shot summarization on SAMSum (prompting the model for 1–2 sentence summaries).
- Evaluates classification (accuracy) and summarization (ROUGE).
- Uses small subsets by default so that we can iterate quickly on CPU/GPU.

In [31]:

max_examples = 200
# Generation settings
GEN_KWARGS_CLASS = {
    "max_length": 16,
    "num_beams": 5,
    "early_stopping": True,
    "do_sample": False,
    "temperature": 0.0,
}

GEN_KWARGS_SUM = {
    "max_length": 120,
    "num_beams": 4,
    "early_stopping": True,
    "do_sample": False,
    "temperature": 0.0,
}

In [32]:
# Utility: normalize model-generated text
import unicodedata

def normalize_text(s: str):
    if s is None:
        return ""
    s = s.strip().lower()
    # normalize unicode
    s = unicodedata.normalize("NFKD", s)
    # remove punctuation except spaces
    s = re.sub(r"[^\w\s]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s


## Zero-shot classification (SST-2 style)
FLAN-T5 understands instructions. For SST-2, prompting with `sst2: <text>` often produces `positive` or `negative`.
We'll write a tiny helper to classify one or more texts.

In [34]:
# Updated zero-shot SST-2 classification for base T5 using loss-based label scoring (avoids mapping warnings)
from datasets import load_dataset
from tqdm.auto import tqdm
import torch

def zero_shot_sst2_classify(ds, labels=["positive", "negative"], prompt_style="review"):
    """
    Classify sentiment via per-label loss (teacher-forced decoding) instead of free-form generation.
    This is more reliable for the non-instruction-tuned base T5 which may echo the input.

    Args:
        ds: HF dataset with fields 'sentence' and 'label'
        labels: candidate sentiment labels (strings)
        prompt_style: 'review' or 'instruct' (choose different prompt wording)
    Returns:
        dict with sentence, preds, trues, accuracy
    """
    true_labels = ["negative" if row["label"] == 0 else "positive" for row in ds]
    preds = []
    sentences = []

    for ex in tqdm(ds, desc="SST-2 zero-shot (loss scoring)"):
        text = ex["sentence"].strip()
        if prompt_style == "review":
            # Minimal pattern anchor
            prompt = f"review: {text}\nSentiment:"  # Keeps prompt short
        else:
            prompt = (
                "Classify the sentiment of the following review as one of: "
                + ", ".join(labels)
                + f".\n\nReview: {text}\nSentiment:"  # Slightly more instructive
            )
        enc = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256).to(DEVICE)

        # Score each label with teacher forcing
        label_losses = {}
        for lab in labels:
            lab_ids = tokenizer(lab, return_tensors="pt").input_ids.to(DEVICE)
            with torch.no_grad():
                out = model(input_ids=enc.input_ids, attention_mask=enc.attention_mask, labels=lab_ids)
            label_losses[lab] = out.loss.item()
        pred = min(label_losses.items(), key=lambda kv: kv[1])[0]
        preds.append(pred)
        sentences.append(text)

    acc = sum(p == t for p, t in zip(preds, true_labels)) / len(preds)
    print(f"SST-2 zero-shot (loss scoring) accuracy on {len(preds)} examples: {acc:.4f}")
    return {"sentence": sentences, "preds": preds, "trues": true_labels, "accuracy": acc}

# Reload (or reuse) validation slice
from datasets import load_dataset
val_ds = load_dataset("glue", "sst2", split="validation")
if max_examples:
    val_ds = val_ds.select(range(min(len(val_ds), max_examples)))

sst2_res = zero_shot_sst2_classify(val_ds, labels=["positive", "negative"], prompt_style="review")


SST-2 zero-shot (loss scoring): 100%|██████████| 200/200 [00:12<00:00, 15.67it/s]

SST-2 zero-shot (loss scoring) accuracy on 200 examples: 0.5150





In [38]:
# Show a few classification examples
for i in range(20):
    print(i, "sentence: ", sst2_res["sentence"][i], "pred:", sst2_res["preds"][i], "true:", sst2_res["trues"][i])


0 sentence:  it 's a charming and often affecting journey . pred: negative true: positive
1 sentence:  unflinchingly bleak and desperate pred: negative true: negative
2 sentence:  allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . pred: negative true: positive
3 sentence:  the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . pred: negative true: positive
4 sentence:  it 's slow -- very , very slow . pred: negative true: negative
5 sentence:  although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women . pred: negative true: positive
6 sentence:  a sometimes tedious film . pred: negative true: negative
7 sentence:  or doing last year 's taxes with your ex-wife . pred: negative true: negative
8 sentence:  you do n't have to know about music to appreciate the film 's easygoing blend of comedy and romance . pred: negative t

## Classification evaluation (accuracy, precision/recall/f1, confusion matrix, CI)

In [36]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import numpy as np
from math import sqrt
import random

# preds & trues should be lists from your notebook (sst2_res["preds"], sst2_res["trues"])
preds = sst2_res["preds"]
trues = sst2_res["trues"]

# Basic metrics
acc = accuracy_score(trues, preds)
precision, recall, f1, support = precision_recall_fscore_support(trues, preds, labels=["negative", "positive"], average=None)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(trues, preds, average="macro")

print("Accuracy: {:.4f}".format(acc))
print("Macro Precision: {:.4f}, Macro Recall: {:.4f}, Macro F1: {:.4f}".format(macro_precision, macro_recall, macro_f1))
print("\nPer-class (label order = ['negative','positive']):")
for lbl, p, r, f, s in zip(["negative","positive"], precision, recall, f1, support):
    print(f"  {lbl}: precision={p:.3f}, recall={r:.3f}, f1={f:.3f}, support={s}")

print("\n\nClassification report:")
print(classification_report(trues, preds, digits=4))

# Confusion matrix
cm = confusion_matrix(trues, preds, labels=["negative", "positive"])
print("Confusion matrix (rows=true, cols=pred):")
print(cm)

# Bootstrapped 95% CI for accuracy
def bootstrap_confidence_interval(preds, trues, metric_fn, n_boot=1000, alpha=0.05, seed=42):
    rng = random.Random(seed)
    n = len(preds)
    stats = []
    for _ in range(n_boot):
        idxs = [rng.randrange(n) for _ in range(n)]
        p_sample = [preds[i] for i in idxs]
        t_sample = [trues[i] for i in idxs]
        stats.append(metric_fn(t_sample, p_sample))
    stats = np.array(stats)
    lo = np.percentile(stats, 100 * (alpha / 2))
    hi = np.percentile(stats, 100 * (1 - alpha / 2))
    return lo, hi

acc_lo, acc_hi = bootstrap_confidence_interval(preds, trues, lambda y_true, y_pred: accuracy_score(y_true, y_pred), n_boot=1000)
print(f"Accuracy 95% CI (bootstrap): [{acc_lo:.4f}, {acc_hi:.4f}]")


Accuracy: 0.5150
Macro Precision: 0.5885, Macro Recall: 0.5103, Macro F1: 0.3737

Per-class (label order = ['negative','positive']):
  negative: precision=0.510, recall=0.980, f1=0.671, support=101
  positive: precision=0.667, recall=0.040, f1=0.076, support=99


Classification report:
              precision    recall  f1-score   support

    negative     0.5103    0.9802    0.6712       101
    positive     0.6667    0.0404    0.0762        99

    accuracy                         0.5150       200
   macro avg     0.5885    0.5103    0.3737       200
weighted avg     0.5877    0.5150    0.3767       200

Confusion matrix (rows=true, cols=pred):
[[99  2]
 [95  4]]
Accuracy 95% CI (bootstrap): [0.4449, 0.5750]
Accuracy 95% CI (bootstrap): [0.4449, 0.5750]


## Zero-shot summarization
For summarization, prefix the input with `summarize:` and provide the content (e.g., a short dialogue).

In [37]:
# Cell: Zero-shot summarization on SAMSum

def zero_shot_samsum_summarization(ds_samsum, summary_sentences=(1,2)):


    preds = []
    refs = []

    for ex in tqdm(ds_samsum, desc="SAMSum zero-shot"):
        convo = ex["dialogue"]
        prompt = (
            f"Summarize the following conversation in {summary_sentences[0]}-{summary_sentences[1]} sentences:\n\n"
            + convo
            + "\n\nSummary:"
        )
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(DEVICE)
        out = model.generate(**inputs, **GEN_KWARGS_SUM)
        summary = tokenizer.decode(out[0], skip_special_tokens=True)
        preds.append(summary.strip())
        refs.append(ex["summary"].strip())

    return {"preds": preds, "refs": refs}


ds_samsum = load_dataset("knkarthick/samsum", split="test")
if max_examples:
        ds_samsum = ds_samsum.select(range(min(len(ds_samsum), max_examples)))

samsum_res = zero_shot_samsum_summarization(ds_samsum, summary_sentences=(1,2))



SAMSum zero-shot: 100%|██████████| 200/200 [05:01<00:00,  1.51s/it]
SAMSum zero-shot: 100%|██████████| 200/200 [05:01<00:00,  1.51s/it]


In [None]:
# Show a few summarization examples
for i in range(20):
    print("REF:", samsum_res["refs"][i])
    print("PRED:", samsum_res["preds"][i])
    print("-" * 60)

REF: Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
PRED: Larry called Hannah last time they were at the park together. Hannah doesn't know Larry well.
------------------------------------------------------------
REF: Eric and Rob are going to watch a stand-up on youtube.
PRED: Eric and Rob are watching a stand-up on YouTube.
------------------------------------------------------------
REF: Lenny can't decide which trousers to buy. Bob advised Lenny on that topic. Lenny goes with Bob's advice to pick the trousers that are of best quality.
PRED: Bob sends Lenny photos of his trousers. Lenny will buy the first pair or the third pair.
------------------------------------------------------------
REF: Emma will be home soon and she will let Will know.
PRED: Will is going to pick Emma up. Emma will be home soon.
------------------------------------------------------------
REF: Jane is in Warsaw. Ollie and Jane has a party. Jane lost her calendar. They wil

## Summarization evaluation (ROUGE + BERTScore + bootstrap CI)

In [39]:
import evaluate
import numpy as np
import random

# preds and refs from your notebook: samsum_res["preds"], samsum_res["refs"]
preds = samsum_res["preds"]
refs  = samsum_res["refs"]

# ROUGE
rouge = evaluate.load("rouge")
rouge_res = rouge.compute(predictions=preds, references=refs)
print("ROUGE results (medians / f1 where available):")
for k, v in rouge_res.items():
    # evaluate returns e.g. {'rouge1': 0.4, 'rouge2': 0.2, 'rougeL': 0.37}
    print(f"  {k}: {v:.4f}")

# BERTScore
bertscore = evaluate.load("bertscore")
bs_res = bertscore.compute(predictions=preds, references=refs, lang="en", model_type="microsoft/deberta-xlarge-mnli")  # model_type optional
print("\nBERTScore (mean):")
print(f"  precision: {np.mean(bs_res['precision']):.4f}")
print(f"  recall:    {np.mean(bs_res['recall']):.4f}")
print(f"  f1:        {np.mean(bs_res['f1']):.4f}")

# Bootstrapped CI for ROUGE-1 F1
def rouge1_f1(preds_subset, refs_subset):
    r = evaluate.load("rouge")
    res = r.compute(predictions=preds_subset, references=refs_subset)
    return res["rouge1"]

def bootstrap_rouge(preds, refs, n_boot=1000, alpha=0.05, seed=42):
    rng = random.Random(seed)
    n = len(preds)
    stats = []
    for _ in range(n_boot):
        idxs = [rng.randrange(n) for _ in range(n)]
        p_sample = [preds[i] for i in idxs]
        r_sample = [refs[i] for i in idxs]
        stats.append(rouge1_f1(p_sample, r_sample))
    stats = np.array(stats)
    lo = np.percentile(stats, 100 * (alpha/2))
    hi = np.percentile(stats, 100 * (1-alpha/2))
    return lo, hi

r1_lo, r1_hi = bootstrap_rouge(preds, refs, n_boot=100)  # reduce n_boot for speed on Colab
print(f"\nROUGE-1 F1 95% CI (bootstrap, n_boot=100): [{r1_lo:.4f}, {r1_hi:.4f}]")


Downloading builder script: 6.14kB [00:00, 6.67MB/s]



ROUGE results (medians / f1 where available):
  rouge1: 0.2649
  rouge2: 0.0706
  rougeL: 0.2005
  rougeLsum: 0.2004


OutOfMemoryError: CUDA out of memory. Tried to allocate 30.00 MiB. GPU 0 has a total capacity of 3.68 GiB of which 27.00 MiB is free. Including non-PyTorch memory, this process has 3.63 GiB memory in use. Of the allocated memory 3.50 GiB is allocated by PyTorch, and 39.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Save predictions to disk for later analysis
import json



def save_outputs(df, dir, file_name):
    os.makedirs(dir, exist_ok=True)
    with open(os.path.join(dir, file_name), "w", encoding="utf-8") as f:
        json.dump(df, f, ensure_ascii=False, indent=2)

def format_sst2_readable(res):
    lines = [
        f"{i}\tPRED={p}\tTRUE={t}\tSENT={s}"
        for i, (s, p, t) in enumerate(zip(res["sentence"], res["preds"], res["trues"]))
    ]
    return lines

def format_samsum_readable(res):
    lines = [
        f"{i}\nREF: {r}\nPRED: {p}\n" + "-"*60
        for i, (r, p) in enumerate(zip(res["refs"], res["preds"]))
    ]
    return lines

def write_text(lines, dir, file_name):
    os.makedirs(dir, exist_ok=True)
    with open(os.path.join(dir, file_name), "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

# Create readable text versions
sst2_readable = format_sst2_readable(sst2_res)
samsum_readable = format_samsum_readable(samsum_res)

# Write text files
write_text(sst2_readable, "outputs", "sst2_preds-zeroshot.txt")
write_text(samsum_readable, "outputs", "samsum_preds-zeroshot.txt")

# Write JSON files
save_outputs(sst2_res, "outputs", "sst2_preds-zeroshot.json")
save_outputs(samsum_res, "outputs", "samsum_preds-zeroshot.json")

print("Saved readable .txt and JSON files in ./outputs/")


Saved readable .txt and JSON files in ./outputs/


## Zero-shot with base google-t5/t5-small (non-FLAN)


The base T5 model is not instruction tuned like FLAN, so free-form prompts (e.g. "sst2: <text>") often just echo the input. For classification we score candidate labels via decoder loss; for summarization we can still use the classic `summarize:` prefix. This section loads the base checkpoint separately to avoid mixing weights.




In [40]:
# Load base T5 (google-t5/t5-small) for separate zero-shot experiments
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

base_model_name = "google-t5/t5-small"
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)
base_device = 'cuda' if torch.cuda.is_available() else 'cpu'
base_model.to(base_device)
print("Loaded base T5:", base_model_name, "on", base_device)

Loaded base T5: google-t5/t5-small on cuda


In [41]:
# Zero-shot sentiment classification via label scoring on base T5
import math
LABELS = ["positive","negative"]

def score_labels_base_t5(texts):
    if isinstance(texts,str):
        texts=[texts]
    results=[]
    for sent in texts:
        # Prompt: keep simple, base T5 trained on span corruption so rely on pattern matching
        prompt = f"review: {sent}\nSentiment:"  # short anchor words
        enc = base_tokenizer(prompt, return_tensors='pt').to(base_device)
        label_losses = {}
        for lab in LABELS:
            lab_ids = base_tokenizer(lab, return_tensors='pt').input_ids.to(base_device)
            # Teacher forcing: encoder input + decoder labels -> loss
            with torch.no_grad():
                out = base_model(input_ids=enc.input_ids,
                                  attention_mask=enc.attention_mask,
                                  labels=lab_ids)
            label_losses[lab] = out.loss.item()
        # Pick lower loss
        pred = min(label_losses.items(), key=lambda kv: kv[1])[0]
        results.append({"text": sent, "pred": pred, "losses": label_losses})
    return results

sample_reviews = [
    "I absolutely loved this movie. It was fantastic!",
    "The plot was predictable and the acting was bad.",
    "Not great, not terrible."]
for r in score_labels_base_t5(sample_reviews):
    print(r)

{'text': 'I absolutely loved this movie. It was fantastic!', 'pred': 'negative', 'losses': {'positive': 11.420465469360352, 'negative': 10.36074447631836}}
{'text': 'The plot was predictable and the acting was bad.', 'pred': 'negative', 'losses': {'positive': 7.862970352172852, 'negative': 7.759611129760742}}
{'text': 'Not great, not terrible.', 'pred': 'negative', 'losses': {'positive': 6.351655006408691, 'negative': 5.998634338378906}}


In [21]:
# Zero-shot summarization with base T5 (classic summarize: prefix)

def summarize_base_t5(text, max_new_tokens=60):
    prompt = "summarize: " + text
    enc = base_tokenizer(prompt, return_tensors='pt').to(base_device)
    with torch.no_grad():
        gen = base_model.generate(**enc, max_new_tokens=max_new_tokens, num_beams=4)
    return base_tokenizer.decode(gen[0], skip_special_tokens=True)

dialogue = (
    "John: Let's meet at 5 pm.\n"
    "Jane: Can we do 6 pm instead?\n"
    "John: Sure. See you then."
)
print("Summary (base T5):", summarize_base_t5(dialogue))

Summary (base T5): John: Let's meet at 5 pm. Jane: Can we do 6 pm instead?
