In [1]:
# ===========================
# 📌 Install Dependencies
# ===========================
!pip install transformers datasets rouge-score

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━

In [2]:
# ===========================
# 📌 Load Pretrained T5 Model (Baseline)
# ===========================
model_name = "t5-small"  # You can also try "t5-base" or "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model_before = T5ForConditionalGeneration.from_pretrained(model_name).to(device)  # Keep a copy of the base model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [3]:
# ===========================
# 📌 Load and Prepare XSum Dataset
# ===========================
dataset = load_dataset("xsum")
train_dataset = dataset["train"].select(range(2000))  # Small subset for faster training
val_dataset = dataset["validation"].select(range(500))

# Preprocessing function
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)

    labels = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=150)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize datasets
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

# Ensure dataset only has required features
tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_val.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

README.md:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

xsum.py:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

The repository for xsum contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/xsum.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


(…)SUM-EMNLP18-Summary-Data-Original.tar.gz:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.72M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [4]:
# ===========================
# 📌 Fine-Tuning T5 on XSum
# ===========================
model_after = T5ForConditionalGeneration.from_pretrained(model_name).to(device)  # Separate model instance

training_args = TrainingArguments(
    output_dir="./t5_summarization",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    fp16=True if torch.cuda.is_available() else False
)

trainer = Trainer(
    model=model_after,  # Fine-tuning a new model, keeping model_before unchanged
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

trainer.train()

# Save fine-tuned model properly
trainer.model.save_pretrained("./fine_tuned_t5_xsum")
tokenizer.save_pretrained("./fine_tuned_t5_xsum")

print("\n========== Fine-Tuning Completed ==========\n")


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkanamarlapudi21bec18[0m ([33mkanamarlapudi21bec18-iiitkottayam[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.7005,0.552603
2,0.586,0.542247
3,0.5798,0.53927
4,0.5358,0.537782
5,0.58,0.537592






In [5]:
# ===========================
# 📌 Load Fine-Tuned Model Separately
# ===========================
model_after = T5ForConditionalGeneration.from_pretrained("./fine_tuned_t5_xsum").to(device)

In [6]:
# ===========================
# 📌 Functions to Generate Summaries
# ===========================

# Function for the base (pretrained) model
def generate_summary_before(article, max_input_length=512, max_output_tokens=150):
    inputs = tokenizer("summarize: " + article, return_tensors="pt", truncation=True, max_length=max_input_length).to(device)

    with torch.no_grad():  # Disable gradient calculation for inference
        output = model_before.generate(
            **inputs,
            max_new_tokens=max_output_tokens,
            no_repeat_ngram_size=2,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id  # 🚀 Fix padding issue
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Function for the fine-tuned model
def generate_summary_after(article, max_input_length=512, max_output_tokens=150):
    inputs = tokenizer("summarize: " + article, return_tensors="pt", truncation=True, max_length=max_input_length).to(device)

    with torch.no_grad():
        output = model_after.generate(
            **inputs,
            max_new_tokens=max_output_tokens,
            no_repeat_ngram_size=2,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id  # 🚀 Fix padding issue
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [7]:

# ===========================
# 📌 Compare Summaries Before and After Fine-Tuning
# ===========================
test_samples = dataset["test"].select(range(5))  # Pick some test examples

for i, example in enumerate(test_samples):
    article = example["document"]
    reference_summary = example["summary"]

    summary_before = generate_summary_before(article)
    summary_after = generate_summary_after(article)

    print(f"\n========== Example {i+1} ==========")
    print("\n--- Original Article ---\n", article[:500], "...")
    print("\n--- Reference Summary ---\n", reference_summary)
    print("\n--- T5 Generated Summary (Before Fine-Tuning) ---\n", summary_before)
    print("\n--- T5 Generated Summary (After Fine-Tuning) ---\n", summary_after)






--- Original Article ---
 Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.
Workers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.
The Welsh Government said more people than ever were getting help to address housing problems.
Changes to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.
Prison Link C ...

--- Reference Summary ---
 There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.

--- T5 Generated Summary (Before Fine-Tuning) ---
 prison link Cymru says some ex-offenders were living rough for up to a year. the charity says it is cheaper than jailing homeless repeat offenders if they are given priority - but not men's' the housing act in Wales was introduced in 2015 removing the right for prison leavers to be given pr

In [10]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [22]:
import evaluate  # New library for evaluation metrics

# Load ROUGE metric
rouge = evaluate.load("rouge")

def compute_rouge_scores(reference_summaries, generated_summaries):
    scores = rouge.compute(predictions=generated_summaries, references=reference_summaries)
    # Access scores directly, removing the .mid attribute
    return {
        "ROUGE-1": scores["rouge1"],
        "ROUGE-2": scores["rouge2"],
        "ROUGE-L": scores["rougeL"]
    }
# ===========================
# Summaries & Computed ROUGE Scores
# ===========================

test_samples = dataset["test"].select(range(400))  # Use 400 test samples for evaluation

# Store summaries
reference_summaries = []
generated_summaries_before = []
generated_summaries_after = []

for example in test_samples:
    article = example["document"]
    reference_summary = example["summary"]

    # Generate summaries
    summary_before = generate_summary_before(article)
    summary_after = generate_summary_after(article)

    # Store results
    reference_summaries.append(reference_summary)
    generated_summaries_before.append(summary_before)
    generated_summaries_after.append(summary_after)

# Compute ROUGE scores
rouge_scores_before = compute_rouge_scores(reference_summaries, generated_summaries_before)
rouge_scores_after = compute_rouge_scores(reference_summaries, generated_summaries_after)

# ===========================
# 📌 Print Results
# ===========================
print("\n========== ROUGE Score Comparison ==========\n")
print("🚀 ROUGE Scores (Before Fine-Tuning):")
for key, value in rouge_scores_before.items():
    print(f"  {key}: {value:.4f}")

print("\n✅ ROUGE Scores (After Fine-Tuning):")
for key, value in rouge_scores_after.items():
    print(f"  {key}: {value:.4f}")

# ===========================
# 📌 Optional: Show Percentage Improvement
# ===========================
print("\n📊 Percentage Improvement After Fine-Tuning:")
for key in rouge_scores_before.keys():
    improvement = ((rouge_scores_after[key] - rouge_scores_before[key]) / rouge_scores_before[key]) * 100
    print(f"  {key}: {improvement:.2f}% improvement")




🚀 ROUGE Scores (Before Fine-Tuning):
  ROUGE-1: 0.1680
  ROUGE-2: 0.0226
  ROUGE-L: 0.1147

✅ ROUGE Scores (After Fine-Tuning):
  ROUGE-1: 0.2573
  ROUGE-2: 0.0618
  ROUGE-L: 0.1996

📊 Percentage Improvement After Fine-Tuning:
  ROUGE-1: 53.11% improvement
  ROUGE-2: 173.35% improvement
  ROUGE-L: 73.98% improvement


In [21]:
custom_text = input("Enter your text: ")

baseline_summary = generate_summary_before(custom_text)
fine_tuned_summary = generate_summary_after(custom_text)

print("\n========== Model Comparison ==========")
print("\n--- Original Input ---\n", custom_text)
print("\n--- T5 Generated Summary (Before Fine-Tuning) ---\n", baseline_summary)
print("\n--- T5 Generated Summary (After Fine-Tuning) ---\n", fine_tuned_summary)


Enter your text: There seems to be no end in sight to the agonising wait of family members of eight persons trapped under the debris about 14 km inside the Srisailam Left Bank Canal (SLBC) tunnel from February 22 morning as the debris and mud removal work to trace those trapped is stretching on, despite the conveyor belt becoming functional on Tuesday evening. The task of cutting the tunnel boring machine (TBM) which was damaged badly in the roof collapse, shifting of that debris and removal of mud in the last 20 metres stretch of the tunnel excavated so far remains highly challenging, though the conveyor belt has been restored after 10 days of hard work, a senior official involved in the rescue work said.  “The conveyor belt is not a panacea to all issues inside the tunnel, it may only step up the mud shifting work after testing. Originally, the conveyor belt was linked and synchronised with the functioning of the TBM and it would shift the excavated material, soil mixed with small st