In [1]:
!pip install transformers datasets rouge_score nltk -q

In [2]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # For more detailed CUDA error tracking

from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
import nltk
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# nltk.word_tokenize (used for BLEU) needs the Punkt sentence-tokenizer data.
# Recent NLTK releases look it up under "punkt_tab" rather than "punkt", so
# request both names to keep the notebook runnable across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Instantiate the tokenizer and the seq2seq model from the same pretrained checkpoint
t5 = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(t5)
model = T5ForConditionalGeneration.from_pretrained(t5)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
# Load all splits of XSum and display the resulting DatasetDict
dataset = load_dataset(path="xsum")
dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [5]:
# Select the test split; the evaluation DataLoader further down is built from it
test = dataset["test"]
test

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 11334
})

In [6]:
# Set device and enable DataParallel for multiple GPUs
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Wrap at most once: re-running this cell must not nest DataParallel wrappers,
# otherwise `model.module` would itself be a DataParallel (which has no
# `.generate`) instead of the underlying T5 model.
# NOTE(review): generation in this notebook is invoked on the unwrapped module,
# which presumably bypasses DataParallel's multi-GPU forward — confirm whether
# the additional GPUs are actually used.
if torch.cuda.device_count() > 1 and not isinstance(model, torch.nn.DataParallel):
    print(f"Using {torch.cuda.device_count()} GPUs.")
    model = torch.nn.DataParallel(model)
model.to(device)
device

Using 2 GPUs.


device(type='cuda', index=0)

In [7]:
# Function to generate summaries using T5 with error handling
def generate_summary_batch(texts, max_input_length=512, max_length=60, num_beams=5):
    """Summarize a batch of documents with the notebook-level T5 model.

    Relies on the module-level `model`, `tokenizer` and `device` objects.

    Args:
        texts: Iterable of document strings to summarize.
        max_input_length: Token limit applied when truncating each input.
        max_length: Upper bound on the generated sequence length.
        num_beams: Beam width used for beam search.

    Returns:
        A list with one decoded summary per input text. If generation raises
        a RuntimeError the error is printed and a list of empty strings of the
        same length is returned, so the caller's evaluation loop keeps running.
    """
    texts = list(texts)
    if not texts:
        # Nothing to tokenize or generate for an empty batch.
        return []

    try:
        # DataParallel exposes the wrapped model as `.module`; a model that was
        # never wrapped (single GPU or CPU) has no such attribute, so fall back
        # to the model itself instead of raising AttributeError.
        generator = getattr(model, "module", model)

        inputs = tokenizer(
            ["summarize: " + text for text in texts],
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_input_length,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate summaries
        summaries = generator.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=num_beams,
            length_penalty=2.0,
            early_stopping=True
        )
        return tokenizer.batch_decode(summaries, skip_special_tokens=True)
    except RuntimeError as e:
        print(f"Error during generation: {e}")
        return [""] * len(texts)  # Return empty summaries in case of error

In [8]:
# Function to compute ROUGE scores
def compute_rouge(predictions, references):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F-measures over prediction/reference pairs.

    Args:
        predictions: Sequence of generated summaries.
        references: Sequence of reference summaries, aligned with `predictions`.

    Returns:
        Dict mapping "rouge1", "rouge2" and "rougeL" to the mean F-measure.
        All three are 0.0 when there are no pairs to score.

    Raises:
        ValueError: If the two sequences differ in length (zip would otherwise
            silently drop the unmatched tail and skew the averages).
    """
    metric_names = ("rouge1", "rouge2", "rougeL")
    predictions = list(predictions)
    references = list(references)

    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length "
            f"(got {len(predictions)} and {len(references)})"
        )
    if not predictions:
        # Avoid dividing by zero when nothing was generated.
        return {name: 0.0 for name in metric_names}

    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    rouge_results = {name: [] for name in metric_names}
    for pred, ref in zip(predictions, references):
        # RougeScorer.score takes the reference (target) first, then the prediction.
        scores = scorer.score(ref, pred)
        for name in metric_names:
            rouge_results[name].append(scores[name].fmeasure)
    return {metric: sum(values) / len(values) for metric, values in rouge_results.items()}

In [9]:
# Function to compute BLEU score
def compute_bleu(predictions, references):
    """Corpus-level BLEU of `predictions`, each scored against a single reference.

    Both sides are lower-cased and split with nltk.word_tokenize before being
    handed to corpus_bleu.
    """
    def _tokenize(text):
        return nltk.word_tokenize(text.lower())

    hypotheses = list(map(_tokenize, predictions))
    # corpus_bleu expects a list of reference lists per hypothesis; here each has one entry.
    reference_sets = [[_tokenize(reference)] for reference in references]
    return corpus_bleu(reference_sets, hypotheses)

In [10]:
# Create DataLoader for batching
def create_dataloader(dataset, batch_size=8):
    """Build a DataLoader whose batches are (documents, summaries) pairs of lists.

    Each example is expected to expose "document" and "summary" keys.
    """
    def collate_fn(batch):
        documents, summaries = [], []
        for record in batch:
            documents.append(record["document"])
            summaries.append(record["summary"])
        return documents, summaries

    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)
    return loader

In [11]:
# Evaluation batch size and the loader over the test split
batch_size = 8
dataloader = create_dataloader(test, batch_size=batch_size)

# Accumulators for model outputs and their reference summaries
all_generated_summaries, all_references = [], []

In [12]:
# Ensure model is in evaluation mode
model.eval()

# Start from empty accumulators so that re-running this cell does not append a
# second copy of every summary and reference to the lists.
all_generated_summaries = []
all_references = []

for texts, references in tqdm(dataloader, desc="Evaluating"):
    all_generated_summaries.extend(generate_summary_batch(texts))
    all_references.extend(references)

# The metric functions pair predictions with references positionally.
assert len(all_generated_summaries) == len(all_references), (
    "predictions and references are misaligned"
)

Evaluating: 100%|██████████| 1417/1417 [1:37:04<00:00,  4.11s/it]


In [13]:
# Score the generated summaries against the references: ROUGE first, then BLEU
rouge_scores = compute_rouge(
    predictions=all_generated_summaries,
    references=all_references,
)
bleu_score = compute_bleu(
    predictions=all_generated_summaries,
    references=all_references,
)

In [15]:
# Report each averaged ROUGE metric, then the corpus BLEU, to four decimals
print("ROUGE Scores:")
for metric in rouge_scores:
    score = rouge_scores[metric]
    print(f"{metric}: {score:.4f}")
print(f"BLEU Score: {bleu_score:.4f}")

ROUGE Scores:
rouge1: 0.2050
rouge2: 0.0309
rougeL: 0.1385
BLEU Score: 0.0129
