## Download Dataset

In [12]:
import kagglehub

# Fetch the latest version of the CNN/DailyMail summarization dataset via
# kagglehub and keep the directory it reports in `path`.
path = kagglehub.dataset_download(
    "gowrishankarp/newspaper-text-summarization-cnn-dailymail"
)

# Echo the resolved location of the dataset files.
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/newspaper-text-summarization-cnn-dailymail


## Show Dataset Directory

In [13]:
# List the files in the downloaded dataset directory
!ls  /kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail

test.csv  train.csv  validation.csv


## Install and Import Required Libraries

In [14]:
!pip install transformers datasets accelerate -U
!pip install peft bitsandbytes rouge

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.57.1-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m90.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.2
    Uninstalling tokenizers-0.21.2:
      Succes

## Load Dataset

In [15]:
# Build the CSV locations from `path` (the directory returned by
# kagglehub.dataset_download) instead of hard-coding the /kaggle/input mount
# point, so the notebook also runs where the dataset lands somewhere else.
data_dir = f"{path}/cnn_dailymail"

# One CSV per split: train.csv, validation.csv and test.csv.
dataset = load_dataset(
    'csv',
    data_files={
        split: f"{data_dir}/{split}.csv"
        for split in ("train", "validation", "test")
    }
)

## T5-Models

![T5-Models](https://cdn.analyticsvidhya.com/wp-content/uploads/2024/05/Screenshot-319.png)

## Load the pre-trained T5 model and tokenizer

In [20]:
# Checkpoint shared by the tokenizer and the model.
model_name = "t5-small"

# Load both pieces from the same checkpoint: tokenizer first, then the
# conditional-generation model.
tokenizer, model = (
    T5Tokenizer.from_pretrained(model_name),
    T5ForConditionalGeneration.from_pretrained(model_name),
)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

## Low-Rank Adaptation (LoRA)

In [21]:
# LoRA adapter settings for the sequence-to-sequence model.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],   # modules that receive adapters
    r=8,                         # adapter rank
    lora_alpha=32,               # LoRA scaling factor
    lora_dropout=0.1,            # dropout applied inside the adapters
    bias="none",                 # leave bias terms untouched
)

# Wrap the base model with the adapters, then report how many parameters
# remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850


## Preprocessing the Data

In [22]:
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: Batch mapping with an 'article' and a 'highlights' column,
            each a list of strings (the layout `Dataset.map(batched=True)`
            passes in).

    Returns:
        The tokenized articles (padded/truncated to 384 tokens) with an extra
        "labels" entry holding the summary token ids (padded/truncated to 128
        tokens). Padding positions in the labels are replaced by -100 so the
        loss skips them.
    """
    inputs = list(examples['article'])
    model_inputs = tokenizer(inputs, max_length=384, truncation=True, padding="max_length")

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager and tokenizes the summaries as decoder targets.
    labels = tokenizer(
        text_target=list(examples['highlights']),
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # -100 is the label value the cross-entropy loss ignores; leaving the pad
    # id in place would also train the model to predict padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize every split of `dataset`; batched=True hands preprocess_function
# whole batches (lists of column values) rather than single rows.
encoded_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

## Prepare Dataset

In [23]:
# Shuffle each encoded split with the same fixed seed so the order is
# reproducible between runs.
train_dataset, val_dataset, test_dataset = (
    encoded_dataset[split_name].shuffle(seed=42)
    for split_name in ("train", "validation", "test")
)

## Training the Model

In [4]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    # --- output, logging and checkpoints ---
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=3,
    report_to="none",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=3e-4,
    weight_decay=0.01,
    per_device_train_batch_size=6,     # safe batch size for P100 16GB
    per_device_eval_batch_size=6,
    gradient_accumulation_steps=1,
    # NOTE(review): T5 checkpoints are reported to be numerically unstable
    # under fp16 — watch for NaN/zero loss and confirm this setting.
    fp16=True,                         # use mixed precision
    # --- evaluation ---
    eval_strategy="epoch",             # evaluate at end of each epoch
)

# Trainer wired to the LoRA-wrapped model and the shuffled train/validation
# splits.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

NameError: name 'TrainingArguments' is not defined

## Start Training

In [3]:
# Run the fine-tuning loop configured on `trainer`; the returned training
# summary is displayed as the cell output.
trainer.train()

NameError: name 'trainer' is not defined

## Evaluate on Validation Set

In [2]:
# Evaluate the current model on the dataset passed to the Trainer as
# `eval_dataset`; the metrics dict is displayed as the cell output.
trainer.evaluate()

NameError: name 'trainer' is not defined

## Generate Summaries

In [None]:
# Pick the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch to inference mode so dropout (including the LoRA dropout) is off
# during generation, even when this cell runs straight after training.
model.eval()

def generate_summary(example):
    """Generate a summary for a single dataset row.

    Args:
        example: Row mapping with an "article" string.

    Returns:
        dict with one key, "summary", holding the decoded model output with
        special tokens stripped.
    """
    # Encode the article (truncated to 384 tokens) and move it to the model's device.
    input_ids = tokenizer.encode(
        example["article"], return_tensors="pt", max_length=384, truncation=True
    ).to(device)
    # Pass the ids by keyword: `model` is wrapped by get_peft_model, and the
    # PEFT seq2seq wrapper's generate() takes keyword arguments only, so a
    # positional tensor raises TypeError.
    output = model.generate(input_ids=input_ids, max_length=128)
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    return {"summary": summary}

# Run generate_summary over the test split one row at a time
# (batched=False); the "summary" values are read from `summaries` afterwards.
summaries = test_dataset.map(generate_summary, batched=False)

## Display Examples

In [None]:
# Show the first three test articles next to their reference and generated
# summaries.
for idx in range(3):
    source_row = test_dataset[idx]
    print("Article:", source_row["article"])
    print("\nReference Summary:", source_row["highlights"])
    print("\nGenerated Summary:", summaries[idx]["summary"])
    print("\n")

## Evaluate with ROUGE

In [None]:
from rouge import Rouge

def calculate_rouge(reference_list, generated_list):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F-scores over a corpus.

    Args:
        reference_list: Reference summaries, one string per example (a single
            string is treated as a one-example corpus).
        generated_list: Generated summaries aligned index-by-index with
            `reference_list`.

    Returns:
        Tuple `(rouge_1, rouge_2, rouge_l)` of mean F-scores. A pair in which
        either text has no scorable content counts as 0.0 for every metric,
        and an empty corpus yields `(0.0, 0.0, 0.0)`.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    # Accept a single pair of strings as well as lists.
    if isinstance(generated_list, str):
        generated_list, reference_list = [generated_list], [reference_list]

    references = list(reference_list)
    candidates = list(generated_list)
    if len(references) != len(candidates):
        raise ValueError(
            "reference_list and generated_list must have the same length "
            f"(got {len(references)} and {len(candidates)})"
        )

    total = len(references)
    if total == 0:
        return 0.0, 0.0, 0.0

    def _has_content(text):
        # The scorer rejects texts that are empty or made only of periods
        # ("Hypothesis is empty."), so those pairs are scored as zero here
        # instead of aborting the whole evaluation.
        return isinstance(text, str) and text.replace(".", "").strip() != ""

    scorable = [
        (candidate, reference)
        for candidate, reference in zip(candidates, references)
        if _has_content(candidate) and _has_content(reference)
    ]
    if not scorable:
        return 0.0, 0.0, 0.0

    hyps, refs = zip(*scorable)
    rouge = Rouge()
    scores = rouge.get_scores(list(hyps), list(refs))

    # Divide by the full corpus size so skipped pairs pull the mean down
    # rather than being silently ignored.
    rouge_1 = sum(score['rouge-1']['f'] for score in scores) / total
    rouge_2 = sum(score['rouge-2']['f'] for score in scores) / total
    rouge_l = sum(score['rouge-l']['f'] for score in scores) / total
    return rouge_1, rouge_2, rouge_l

# Pull the reference and generated texts out of their datasets, in matching
# row order.
reference_summaries = list(map(lambda row: row["highlights"], test_dataset))
generated_summaries = list(map(lambda row: row["summary"], summaries))

# Corpus-level mean F-scores for the three ROUGE variants.
rouge_1, rouge_2, rouge_l = calculate_rouge(
    reference_summaries,
    generated_summaries,
)

# Report the averages.
print("Average ROUGE-1:", rouge_1)
print("Average ROUGE-2:", rouge_2)
print("Average ROUGE-L:", rouge_l)