# SummarizeME


## Imports

In [None]:
!pip -q install "transformers>=4.43.0" "datasets" "peft>=0.11.0" "accelerate" "evaluate" "rouge-score" "numpy" "scikit-learn" "bitsandbytes" "torch"

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [None]:
import os, json, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer)
from peft import LoraConfig, get_peft_model, PeftModel
import evaluate
import torch
from datetime import datetime

## Config

In [None]:
# Hub checkpoint used for both the tokenizer and the base seq2seq model.
MODEL_ID       = "google/flan-t5-base"
# Trainer output directory (checkpoints and logs).
OUTDIR         = "outputs/flan_t5_base_samsum_lora"
# Where the trained LoRA adapter and tokenizer are saved.
ADAPTER_DIR    = f"{OUTDIR}/adapter"
# Where the merged (base + adapter) model and tokenizer are saved.
MERGED_DIR     = "saved_model/flan_t5_base_samsum_merged"
MAX_SRC        = 1024   # max number of input tokens; longer inputs are truncated
MAX_TGT        = 256    # max number of target tokens; also the generation cap during eval
# Used for both `seed` and `data_seed` in the training arguments.
SEED           = 42
# Number of passes over the training split.
EPOCHS         = 2
LEARNING_RATE  = 5e-5
# Per-device micro-batch sizes for training and evaluation.
BATCH_TRAIN    = 2
BATCH_EVAL     = 2
# Micro-batches accumulated per optimizer step
# (nominal effective train batch per device = BATCH_TRAIN * GRAD_ACCUM = 8).
GRAD_ACCUM     = 4
# Mixed-precision switch consumed by the training arguments.
FP16           = True

In [None]:
# Create the output locations up front so later save calls cannot fail on a missing directory.
for _dir in (OUTDIR, "saved_model"):
    os.makedirs(_dir, exist_ok=True)

# Improve matmul perf on NVIDIA (allow TF32 kernels for CUDA matmuls).
torch.backends.cuda.matmul.allow_tf32 = True

_device_name = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", _device_name)

Device: cuda


## Text Normalization

Cleans and standardizes raw text strings before tokenization. Especially useful for dialogue datasets like SAMSum where extra spaces, tabs, or line breaks may appear in transcripts.


1. Handles empty or None strings safely.
2. Replaces non-breaking spaces (Unicode \u00A0) with normal spaces.
3. Collapses multiple consecutive spaces or tabs into a single space.
4. Removes redundant whitespace before line breaks.
5. Reduces 3+ consecutive newlines to just 2 (preserving paragraph breaks).
6. Strips leading and trailing spaces/newlines.

In [None]:
def normalize(s: str) -> str:
    """Clean up whitespace in a raw text string before tokenization.

    Steps, in order:
      1. Empty strings and ``None`` are returned unchanged.
      2. Non-breaking spaces (U+00A0) become regular spaces.
      3. Runs of spaces/tabs collapse to a single space.
      4. Whitespace immediately before a line break is removed
         (this also turns ``\\r\\n`` into ``\\n``).
      5. Three or more consecutive newlines are reduced to two, so
         paragraph breaks survive.
      6. Leading/trailing whitespace is stripped.

    Args:
        s: The text to normalize (may be empty or ``None``).

    Returns:
        The normalized text, or ``s`` itself when it is falsy.
    """
    if not s:
        return s
    s = s.replace("\u00A0", " ")
    s = re.sub(r"[ \t]+", " ", s)
    # Only strip *non-newline* whitespace before a line break. The previous
    # pattern (r"\s+\n") also swallowed the newlines themselves, so every
    # blank line was collapsed and the paragraph-preserving rule below could
    # never trigger.
    s = re.sub(r"[^\S\n]+\n", "\n", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()

## Loading Dataset

In [None]:
# Load the SAMSum dialogue-summarization dataset (train/validation/test splits,
# each with 'id', 'dialogue' and 'summary' columns).
ds  = load_dataset("knkarthick/samsum")
# Tokenizer for the base checkpoint; use_fast=True requests the fast tokenizer implementation.
tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

# Show the splits, their columns and row counts.
print(ds)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/14731 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14731
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
})


## Preprocess Dataset

In [None]:
def preprocess(batch):
    """Tokenize one batch of SAMSum rows for seq2seq training.

    Each dialogue is prefixed with the instruction "Summarize the dialogue:",
    normalized, and tokenized as the model input (truncated to ``MAX_SRC``
    tokens). Each summary is normalized and tokenized as the target (truncated
    to ``MAX_TGT`` tokens); its token ids are stored under ``"labels"``.

    Args:
        batch: A batched mapping with ``"dialogue"`` and ``"summary"`` lists,
            as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the inputs with an added ``"labels"`` entry.
    """
    # `d or ""` / `s or ""`: a missing cell would otherwise be rendered as the
    # literal text "None" in the prompt, or crash the tokenizer for targets.
    sources = [normalize(f"Summarize the dialogue:\n{d or ''}") for d in batch["dialogue"]]
    targets = [normalize(s or "") for s in batch["summary"]]
    model_inputs = tok(sources, max_length=MAX_SRC, truncation=True)
    # `text_target=` replaces the deprecated `tok.as_target_tokenizer()` context manager.
    labels = tok(text_target=targets, max_length=MAX_TGT, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# The raw text columns are replaced by token ids, so drop them once tokenized.
cols = ["dialogue", "summary"]
train_ds, val_ds = (
    ds[split_name].map(preprocess, batched=True, remove_columns=cols)
    for split_name in ("train", "validation")
)
print("Train size:", len(train_ds), ", Val size:", len(val_ds))

Map:   0%|          | 0/14731 [00:00<?, ? examples/s]



Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Train size: 14731 , Val size: 818


## Base Model + LoRA


In [None]:
# Pull the pretrained weights/config for MODEL_ID. This is the model that
# get_peft_model() later wraps with the LoRA adapter.
base = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

LoRA Config:

1. r=8: the rank (size) of the low-rank matrices inserted into target layers.
2. lora_alpha=32: scaling factor applied to the LoRA update (effective scale = lora_alpha / r = 4).
3. lora_dropout=0.1: dropout on the LoRA path to reduce overfitting.
4. bias="none": No added bias.
5. task_type="SEQ_2_SEQ_LM": tells PEFT we’re fine-tuning a seq2seq LM.
6. target_modules=["q","v"]: for T5, these are the query and value projection submodules in attention. LoRA adapters are injected only there (the key and output projections are left untouched).


In [None]:
# LoRA hyper-parameters for the seq2seq model.
lcfg = LoraConfig(
    task_type="SEQ_2_SEQ_LM",     # fine-tuning a seq2seq language model
    target_modules=["q","v"],     # attention query/value projections
    r=8,                          # rank of the low-rank matrices
    lora_alpha=32,                # scaling factor
    lora_dropout=0.1,             # dropout on the LoRA path
    bias="none",
)

rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.

    Returns:
        A dict of the ROUGE scores (rouge1/2/L/Lsum) scaled to 0-100 and
        rounded to two decimals.
    """
    pred_ids, label_ids = eval_pred
    pad_id = tok.pad_token_id

    def _to_text(token_ids):
        # -100 is a padding/ignore marker, not a real token id; swap it for
        # the pad token so the tokenizer can decode the array.
        cleaned = np.where(token_ids != -100, token_ids, pad_id)
        return tok.batch_decode(cleaned, skip_special_tokens=True)

    pred_texts = _to_text(pred_ids)
    ref_texts = _to_text(label_ids)
    raw_scores = rouge.compute(predictions=pred_texts, references=ref_texts, use_stemmer=True)

    result = {}
    for metric_name, value in raw_scores.items():
        result[metric_name] = round(value*100, 2)
    return result


Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Wrap the base model with LoRA adapters as described by lcfg; `model` is what gets trained and saved.
model = get_peft_model(base, lcfg)

In [None]:
# Batches tokenized examples for seq2seq training.
# - padding="longest": pad each batch to its longest sequence rather than a fixed length.
# - label_pad_token_id=-100: labels are padded with -100, the same marker that
#   compute_metrics swaps back to the pad token before decoding.
collator = DataCollatorForSeq2Seq(
    tokenizer=tok,
    model=model,
    label_pad_token_id=-100,
    padding="longest"
)

## Model Training

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Mixed-precision selection.
# The run recorded in this notebook trained with fp16 and logged a training loss
# of 0.0, an empty validation loss and identical ROUGE for both epochs, i.e. the
# adapter was not learning. That pattern is consistent with T5's known numerical
# overflow under fp16. FP16 is therefore treated as "mixed precision requested":
# bf16 is used when the GPU supports it natively (compute capability >= 8.0),
# otherwise training falls back to full fp32 rather than fp16.
use_bf16 = (
    bool(FP16)
    and torch.cuda.is_available()
    and torch.cuda.get_device_capability()[0] >= 8
)

args = Seq2SeqTrainingArguments(
    output_dir=OUTDIR,                    # where to write checkpoints & logs
    seed=SEED, data_seed=SEED,            # make shuffling/splits reproducible
    num_train_epochs=EPOCHS,              # how many passes over the train set
    learning_rate=LEARNING_RATE,          # AdamW LR for all trainable params
    per_device_train_batch_size=BATCH_TRAIN,  # micro-batch size per GPU for TRAIN
    per_device_eval_batch_size=BATCH_EVAL,    # micro-batch size per GPU for EVAL
    gradient_accumulation_steps=GRAD_ACCUM,   # accumulate this many steps before optimizer.step()
    auto_find_batch_size=True,            # if OOM at startup, halve batch until it fits
    eval_strategy="epoch",                # run eval at the end of each epoch
    save_strategy="epoch",                # save a checkpoint each epoch
    predict_with_generate=True,           # during eval, actually generate summaries (not just logits)
    generation_max_length=MAX_TGT,        # cap length of generated summaries for eval
    generation_num_beams=4,               # beam size used for eval generation
    load_best_model_at_end=True,          # after training, reload the best checkpoint (by metric below)
    metric_for_best_model="eval_rougeL",  # choose the "best" checkpoint using ROUGE-L
    greater_is_better=True,               # higher ROUGE-L = better
    bf16=use_bf16,                        # bf16 mixed precision where natively supported
    fp16=False,                           # fp16 is disabled on purpose: T5 overflows to NaN/0 loss
    logging_strategy="steps",             # emit logs every N steps (see below)
    logging_steps=50,                     # how often to log training loss, LR, etc.
    logging_first_step=True,              # also log at the first training step
    save_total_limit=2,                   # keep only the last 2 checkpoints to save disk
    report_to="none",                     # disable W&B/TensorBoard auto-logging
)


trainer = Seq2SeqTrainer(
    model=model, args=args,
    train_dataset=train_ds, eval_dataset=val_ds,
    tokenizer=tok, data_collator=collator,
    compute_metrics=compute_metrics,
)

train_result = trainer.train()

print("Training complete at:", datetime.now().isoformat())


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.0,,50.9,27.18,42.54,42.57
2,0.0,,50.9,27.18,42.54,42.57


Training complete at: 2025-10-27T18:46:22.251165


## Save LoRA Adapter

In [None]:

# Persist the trained PEFT model together with the tokenizer in ADAPTER_DIR,
# so the directory can be loaded later with PeftModel.from_pretrained.
os.makedirs(ADAPTER_DIR, exist_ok=True)
model.save_pretrained(ADAPTER_DIR)
tok.save_pretrained(ADAPTER_DIR)
print("Saved LoRA adapter to:", ADAPTER_DIR)


Saved LoRA adapter to: outputs/flan_t5_base_samsum_lora/adapter


## Save Merged Model

In [None]:
from peft import PeftModel

os.makedirs(MERGED_DIR, exist_ok=True)
# Start from a fresh copy of the base checkpoint, attach the adapter saved in
# ADAPTER_DIR, then merge the adapter into the base model and drop the PEFT wrapper.
base_for_merge = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
peft_model = PeftModel.from_pretrained(base_for_merge, ADAPTER_DIR)
merged = peft_model.merge_and_unload()
# Save the merged model and its tokenizer side by side so MERGED_DIR is self-contained.
merged.save_pretrained(MERGED_DIR)
tok.save_pretrained(MERGED_DIR)
print("Merged model saved to:", MERGED_DIR)


Merged model saved to: saved_model/flan_t5_base_samsum_merged


## Push Model to HuggingFace

In [None]:
# Interactive Hugging Face login widget; the repo creation and uploads below
# run against the account that is logged in here.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import create_repo, upload_folder

HF_USER = "akankshashah"

REPO_MERGED = f"{HF_USER}/flan-t5-base-samsum-merged"
REPO_ADAPT  = f"{HF_USER}/flan-t5-base-samsum-lora"

# exist_ok=True makes the cell re-runnable; both repos are created private.
create_repo(REPO_MERGED, private=True, exist_ok=True)
create_repo(REPO_ADAPT,  private=True, exist_ok=True)

# Model cards.
# - encoding="utf-8": the text contains a non-ASCII "·", which can fail or be
#   mis-encoded when the platform default encoding is not UTF-8.
# - The leading YAML front matter is the repo-card metadata the Hub expects;
#   without it the upload reports "empty or missing yaml metadata in repo card".
with open(f"{MERGED_DIR}/README.md", "w", encoding="utf-8") as f:
    f.write("---\n"
            "base_model: google/flan-t5-base\n"
            "datasets:\n"
            "- knkarthick/samsum\n"
            "library_name: transformers\n"
            "pipeline_tag: summarization\n"
            "tags:\n"
            "- summarization\n"
            "- lora\n"
            "---\n\n"
            "# FLAN-T5-Base fine-tuned on SAMSum (merged)\n\n"
            "**Task:** Abstractive summarization\n\n"
            "Base: google/flan-t5-base · Finetuning: LoRA on SAMSum · This repo contains the merged weights.\n")

with open(f"{ADAPTER_DIR}/README.md", "w", encoding="utf-8") as f:
    f.write("---\n"
            "base_model: google/flan-t5-base\n"
            "datasets:\n"
            "- knkarthick/samsum\n"
            "library_name: peft\n"
            "pipeline_tag: summarization\n"
            "tags:\n"
            "- summarization\n"
            "- lora\n"
            "---\n\n"
            "# LoRA adapter for FLAN-T5-Base on SAMSum\n\n"
            "Attach this adapter to google/flan-t5-base at load time.\n")

# Upload each local folder (weights, tokenizer files and README) to its repo.
upload_folder(repo_id=REPO_MERGED, folder_path=MERGED_DIR,
              commit_message="Add merged FLAN-T5 SAMSum model")
upload_folder(repo_id=REPO_ADAPT,  folder_path=ADAPTER_DIR,
              commit_message="Add LoRA adapter for FLAN-T5 SAMSum")

print("Pushed:\n - https://huggingface.co/" + REPO_MERGED +
      "\n - https://huggingface.co/" + REPO_ADAPT)


- empty or missing yaml metadata in repo card


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...amsum_merged/spiece.model: 100%|##########|  792kB /  792kB            

  ..._merged/model.safetensors:   3%|2         | 25.1MB /  990MB            

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...lora/adapter/spiece.model: 100%|##########|  792kB /  792kB            

  ...adapter_model.safetensors:  31%|###       | 1.10MB / 3.56MB            

Pushed:
 - https://huggingface.co/akankshashah/flan-t5-base-samsum-merged
 - https://huggingface.co/akankshashah/flan-t5-base-samsum-lora


## Model Evaluation

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Evaluate the merged model saved in MERGED_DIR on the SAMSum test split.
model_path = MERGED_DIR
mtok = AutoTokenizer.from_pretrained(model_path)
mdl  = AutoModelForSeq2SeqLM.from_pretrained(model_path).eval().to("cuda" if torch.cuda.is_available() else "cpu")

ds_test = load_dataset("knkarthick/samsum")["test"]
rouge_eval = evaluate.load("rouge")

preds, refs = [], []
for ex in ds_test:
    # Build the prompt exactly the way preprocess() does for training
    # (same instruction prefix, same normalize() pass) so the model sees the
    # input format it was fine-tuned on.
    prompt = normalize(f"Summarize the dialogue:\n{ex['dialogue'] or ''}")
    inp = mtok(prompt, return_tensors="pt", max_length=MAX_SRC, truncation=True)
    inp = {k: v.to(mdl.device) for k, v in inp.items()}
    out = mdl.generate(**inp, max_new_tokens=200, num_beams=4, no_repeat_ngram_size=3,
                       length_penalty=2.0, early_stopping=True)
    preds.append(mtok.decode(out[0], skip_special_tokens=True))
    # References get the same whitespace cleanup as the training targets.
    refs.append(normalize(ex["summary"] or ""))

final_scores = rouge_eval.compute(predictions=preds, references=refs, use_stemmer=True)
# float(...) keeps the printed dict readable (plain numbers instead of np.float64(...)).
print({k: round(float(v) * 100, 2) for k, v in final_scores.items()})


Downloading builder script: 0.00B [00:00, ?B/s]

{'rouge1': np.float64(50.68), 'rouge2': np.float64(25.77), 'rougeL': np.float64(41.57), 'rougeLsum': np.float64(41.58)}


## Inference

In [None]:
# Hub repo of the merged model pushed earlier in this notebook. That repo was
# created with private=True, so loading it needs an authenticated session.
MODEL_ID_HF = "akankshashah/flan-t5-base-samsum-merged"

# NOTE: this rebinds the notebook-level `tok` (and `mdl`) to the Hub copies.
tok = AutoTokenizer.from_pretrained(MODEL_ID_HF)
mdl = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID_HF).to("cuda" if torch.cuda.is_available() else "cpu")

def summarize(text, min_new_tokens = 80, max_new_tokens=350):
    """Generate a summary of ``text`` with the loaded model.

    The text is wrapped in an instruction prompt, tokenized (truncated to
    1024 tokens) and decoded with deterministic 4-beam search.

    Args:
        text: The passage to summarize.
        min_new_tokens: Minimum number of tokens to generate.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The decoded summary, or an empty string when ``text`` is empty or
        whitespace-only.
    """
    # Nothing to summarize: without this guard the model would still be
    # forced to emit at least `min_new_tokens` tokens from the prompt alone.
    if not text or not text.strip():
        return ""
    prompt = (
        "Write a concise, factual summary using ONLY information stated in the text. "
        "If something is not stated, omit it. Do not speculate.\n\nText:\n" + text
    )
    enc = tok(prompt, return_tensors="pt", truncation=True, max_length=1024).to(mdl.device)
    out = mdl.generate(**enc, num_beams=4, do_sample=False,
                       no_repeat_ngram_size=4, length_penalty=1.0,
                       max_new_tokens=max_new_tokens,
                       min_new_tokens=min_new_tokens,
                       renormalize_logits=True,
                       early_stopping=True)
    return tok.decode(out[0], skip_special_tokens=True)

In [None]:
text = "Vicky Kaushal born 16 May 1988 is an Indian actor who works in Hindi films. He is the recipient of several accolades, including a National Film Award and three Filmfare Awards, and has appeared in Forbes India's Celebrity 100 list of 2019.  After graduating with an engineering degree from Rajiv Gandhi Institute of Technology, Kaushal began his career by assisting Anurag Kashyap in the crime drama Gangs of Wasseypur (2012) and played minor roles in films. His first leading role was in the independent drama Masaan (2015), following which he starred in Kashyap's psychological thriller Raman Raghav 2.0 (2016). Kaushal gained wider recognition in 2018 with supporting roles in the top-grossing dramas Raazi and Sanju, winning the Filmfare Award for Best Supporting Actor for the latter.  His role as an army officer in the 2019 war film Uri: The Surgical Strike established Kaushal as a leading actor and won him the National Film Award for Best Actor. He earned further praise for his portrayal of Udham Singh in the biopic Sardar Udham (2021), winning the Filmfare Critics Award for Best Actor, and had commercial success in 2023 in the romantic comedy Zara Hatke Zara Bachke, the biopic Sam Bahadur and the comedy-drama Dunki. The last of these won him another Filmfare Award for Best Supporting Actor. The 2025 historical action film Chhaava, in which he portrayed Sambhaji, emerged as his highest-grossing release."
# Summarize the sample passage with the default generation lengths and show the result.
summary = summarize(text)

print("Summary:\n", summary)

Summary:
 Actor Vicky Kaushal has been nominated for the Filmfare Award for Best Supporting Actor for his role as an army officer in the film Uri: The Surgical Strike. He also won the FilmFare Critics Award for best actor for his portrayal of Udham Singh in the biopic Sardar Udhham (2021). He has also been nomined for the National Film Award for the best supporting actor for the film Chhaava.
