Setup

In [16]:
!pip install -q transformers peft datasets accelerate evaluate rouge-score sentencepiece

Pick your base model


In [17]:
def is_t5_checkpoint(model_name):
    """Return True when ``model_name`` names a T5 checkpoint.

    Accepts both a bare id ("t5-base") and a namespaced Hub id
    ("google-t5/t5-base"): the id matches if either the full string or its
    last path component starts with "t5" (the component check is
    case-insensitive).
    """
    last_component = model_name.rstrip("/").rsplit("/", 1)[-1]
    return model_name.startswith("t5") or last_component.lower().startswith("t5")


BASE_MODEL = "facebook/bart-large-cnn"  # or "sshleifer/distilbart-cnn-12-6" or "t5-base"
# T5 inputs get a "summarize: " task prefix when the source prompt is built.
IS_T5 = is_t5_checkpoint(BASE_MODEL)
ADAPTER_OUT = "/content/softprompt_adapter"   # where the learned soft prompt will be saved

Load your training data

In [18]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Location of the CSV holding one row per category.
csv_path = "category_summaries.csv"

# Read the file and shuffle it with a fixed seed so the split is reproducible.
df = pd.read_csv(csv_path).sample(frac=1, random_state=42).reset_index(drop=True)

# Quick 90/10 random split; swap in your own split if you have predefined ones.
cut = int(len(df)*0.9)
df_train = df.iloc[:cut].copy()
df_valid = df.iloc[cut:].copy()

# Wrap both frames as Hugging Face datasets under the usual split names.
raw = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (("train", df_train), ("validation", df_valid))
})

# Cell output: (number of training rows, number of validation rows).
len(raw["train"]), len(raw["validation"])


(4, 1)

Tokenize + build inputs

In [19]:
from transformers import AutoTokenizer
# Tokenizer matching the chosen base checkpoint; the fast implementation is requested.
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    use_fast=True,
)

# Token budgets applied when tokenizing (both calls use truncation=True).
MAX_INPUT = 768   # source side; adjust to fit your facts length
# NOTE(review): INSTR asks for a summary under 200 words, but a 74-token cap is
# likely much shorter than 200 words — confirm the intended target budget.
MAX_TARGET = 74   # target side

# Fixed instruction placed at the start of every source prompt.
INSTR = (
    "Write a concise, buyer-friendly summary for the product category below. "
    "Include the TOP 3 products and key differences, list the most common complaints, "
    "and mention the WORST product and why to avoid it. Keep it under 200 words."
)

def build_source(category, facts_text):
    """Assemble the source prompt for one category.

    The prompt is the shared instruction, a blank line, a ``Category:`` line and
    then the facts text. When ``IS_T5`` is set, the whole prompt is prefixed
    with ``"summarize: "``.
    """
    task_prefix = "summarize: " if IS_T5 else ""
    return f"{task_prefix}{INSTR}\n\nCategory: {category}\n{facts_text}"

def preprocess(batch):
    """Tokenize one batch of rows into model inputs plus label ids.

    Args:
        batch: Mapping of column name to a list of values (the form passed by
            ``Dataset.map(..., batched=True)``). Reads ``category`` and
            ``article``; reads ``reference`` when that column is present.

    Returns:
        The tokenizer output for the built source prompts (truncated to
        ``MAX_INPUT`` tokens) with an added ``labels`` entry holding the target
        token ids (truncated to ``MAX_TARGET`` tokens).
    """
    sources = [build_source(c, f) for c, f in zip(batch["category"], batch["article"])]
    model_inputs = tokenizer(sources, max_length=MAX_INPUT, truncation=True)

    # Check if 'reference' column exists for labels. If not, use 'article' as a placeholder.
    # This is a temporary fix to make the code runnable.
    # For actual summarization training, ensure your dataset has a 'reference' column with target summaries.
    target_column = "reference"
    if target_column not in batch:
        print(f"Warning: '{target_column}' column not found for target summaries. Using 'article' as a placeholder for labels. Please ensure your dataset has a column with target summaries (e.g., 'reference').")
        target_column = "article" # Fallback to article to make code runnable, but this isn't true summarization

    # `text_target=` tokenizes the texts as targets; it replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=batch[target_column], max_length=MAX_TARGET, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches. The original columns are removed so that
# only what `preprocess` returns is kept.
tokenized = raw.map(
    preprocess,
    batched=True,
    remove_columns=raw["train"].column_names,
)
tokenized

Map:   0%|          | 0/4 [00:00<?, ? examples/s]





Map:   0%|          | 0/1 [00:00<?, ? examples/s]



DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1
    })
})

Build a soft prompt adapter (PEFT Prompt Tuning)

In [20]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from peft import get_peft_model, PromptTuningConfig, TaskType

# Prompt-tuning configuration for a seq2seq model.
peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    tokenizer_name_or_path=BASE_MODEL,
    num_virtual_tokens=40,                # try 20–80
)

# Load the base checkpoint and build the collator around it.
base = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=base)

# Wrap the base model with the prompt-tuning adapter and report parameter
# counts; only the soft prompt params should be trainable.
model = get_peft_model(base, peft_config)
model.print_trainable_parameters()

trainable params: 81,920 || all params: 406,372,352 || trainable%: 0.0202


Train

In [21]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np
import evaluate

rouge = evaluate.load("rouge")


def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            be a tuple, in which case its first element is used.

    Returns:
        Dict mapping each key reported by ``rouge.compute`` to its value
        rounded to four decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 is an ignore/padding marker, not a vocabulary id, so it cannot be
    # decoded. Replace it with the pad token in BOTH arrays: the original only
    # masked the labels, leaving predictions able to break batch_decode.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # All ROUGE variants are returned (not only ROUGE-L) so they can be shown on a dashboard.
    return {k: round(v, 4) for k, v in result.items()}

import torch

# Training configuration for the soft-prompt run.
args = Seq2SeqTrainingArguments(
    output_dir="/content/softprompt_ckpt",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-3,      # soft prompts like higher LR than full FT
    num_train_epochs=3,
    logging_steps=50,
    # `eval_steps` only takes effect with a step-based evaluation strategy;
    # without this, evaluation (and compute_metrics) never runs.
    # NOTE: this argument is named `evaluation_strategy` in older transformers releases.
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    predict_with_generate=True,
    generation_max_length=MAX_TARGET,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing when the arguments are constructed.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

import inspect

# Newer transformers releases take the tokenizer via `processing_class` and
# deprecate the `tokenizer` keyword. Choose whichever name the installed
# Seq2SeqTrainer accepts so the cell runs on either side of that change.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Seq2SeqTrainer.__init__).parameters
    else "tokenizer"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Run the training loop with the trainer built above.
trainer.train()

# Write the PEFT-wrapped model (the small soft-prompt adapter) and the
# tokenizer files side by side into ADAPTER_OUT.
model.save_pretrained(ADAPTER_OUT)
tokenizer.save_pretrained(ADAPTER_OUT)
print(f"Adapter saved to: {ADAPTER_OUT}")

  trainer = Seq2SeqTrainer(


Step,Training Loss


Adapter saved to: /content/softprompt_adapter


Use the soft-prompt adapter for inference

In [22]:
from peft import PeftModel
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Reload the tokenizer and a fresh copy of the base checkpoint, then attach the
# adapter saved in ADAPTER_OUT. Note that `base` and `model` are rebound here.
tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
base = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
model = PeftModel.from_pretrained(base, ADAPTER_OUT)

# Summarization pipeline around the adapter-equipped model.
pipe = pipeline(
    task="summarization",
    model=model,
    tokenizer=tok,
    device_map=None,
)

def generate_summary(category, facts_text):
    """Generate a summary for one category with the adapter-equipped pipeline.

    The source prompt is built with ``build_source`` and decoded using beam
    search (4 beams, no sampling), with output length between half of
    ``MAX_TARGET`` and ``MAX_TARGET``. Returns the stripped summary text of the
    first result.

    NOTE(review): unlike ``preprocess``, no truncation argument is passed here,
    so long inputs are not cut to ``MAX_INPUT`` — confirm this is intended.
    """
    results = pipe(
        build_source(category, facts_text),
        max_length=MAX_TARGET,
        min_length=int(MAX_TARGET*0.5),
        do_sample=False,
        num_beams=4,
    )
    summary_text = results[0]["summary_text"]
    return summary_text.strip()

# Smoke test with placeholder facts.
print(generate_summary("Batteries & Household Power", "Top products:\n- 1. ...\nWorst product: ..."))

Device set to use cpu


Write a concise, buyer-friendly summary for the product category below. Include the TOP 3 products and key differences, list the most common complaints, and mention the WORST product and why to avoid it. Keep it under 200 words.


Save the soft-prompt adapter

In [23]:
# Save the adapter and tokenizer into ADAPTER_OUT, then list what was written.
# ADAPTER_OUT is deliberately NOT reassigned here: it is defined once in the
# config cell, and a second hard-coded copy could drift from the directory the
# adapter was loaded from.
import os, glob

model.save_pretrained(ADAPTER_OUT)            # model = your PEFT-wrapped model
tokenizer.save_pretrained(ADAPTER_OUT)

# Sorted so the listing is stable from run to run.
print("Adapter files:", sorted(glob.glob(os.path.join(ADAPTER_OUT, "*"))))


Adapter files: ['/content/softprompt_adapter/adapter_config.json', '/content/softprompt_adapter/tokenizer.json', '/content/softprompt_adapter/vocab.json', '/content/softprompt_adapter/tokenizer_config.json', '/content/softprompt_adapter/adapter_model.safetensors', '/content/softprompt_adapter/merges.txt', '/content/softprompt_adapter/special_tokens_map.json', '/content/softprompt_adapter/README.md']


Download

In [24]:
!zip -rq /content/softprompt_adapter.zip /content/softprompt_adapter
from google.colab import files
files.download("/content/softprompt_adapter.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>