In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE: versions are unpinned, so reported scores may drift between runs made
# at different times — pin them once a known-good set is established.
%pip install -q transformers datasets rouge_score nltk

In [2]:
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
import nltk
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# nltk.word_tokenize (used by compute_bleu) needs the Punkt tokenizer data.
# Older NLTK releases load the 'punkt' package, while NLTK 3.9+ looks for
# 'punkt_tab' instead; fetch both so the notebook runs on either.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the pretrained BART model and tokenizer
# `bart` is the checkpoint identifier; the same value is passed to both loaders
# so the model weights and the tokenizer come from one checkpoint.
bart = "facebook/bart-large-cnn"
model = BartForConditionalGeneration.from_pretrained(bart)
tokenizer = BartTokenizer.from_pretrained(bart)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [4]:
# Load CNN/DailyMail dataset
# "3.0.0" selects the dataset configuration. No split is requested, so the call
# returns a DatasetDict holding train, validation and test; only the test split
# is used by the cells below.
dataset = load_dataset("cnn_dailymail", "3.0.0")
# Bare last expression: displays each split's features and row count.
dataset

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [5]:
# Keep only the held-out test split; every later cell evaluates on `test`.
test = dataset["test"]
# Bare last expression: displays the split's features and row count.
test

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 11490
})

In [6]:
# Function to generate summaries using BART
def generate_summary_batch(texts, max_input_length=1024, max_length=150,
                           num_beams=5, length_penalty=2.0, early_stopping=True):
    """Summarize a batch of articles with the module-level ``model``/``tokenizer``.

    Args:
        texts: List of article strings. A single string is treated as a batch
            of one.
        max_input_length: Token limit applied by the tokenizer; longer articles
            are truncated to this length.
        max_length: ``max_length`` forwarded to ``model.generate`` (cap on the
            generated sequence length).
        num_beams: Beam width forwarded to ``model.generate``.
        length_penalty: Length penalty forwarded to ``model.generate``.
        early_stopping: ``early_stopping`` flag forwarded to ``model.generate``.

    Returns:
        List of decoded summary strings with special tokens removed. An empty
        batch yields an empty list without touching the model.
    """
    if isinstance(texts, str):
        texts = [texts]
    # Nothing to encode: skip the tokenizer/model round trip entirely.
    if len(texts) == 0:
        return []

    inputs = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_input_length,
    )
    # Tensors must live on the same device as the model weights.
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    # The attention mask tells generate() which positions are padding.
    summaries = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_beams=num_beams,
        length_penalty=length_penalty,
        early_stopping=early_stopping,
    )
    return tokenizer.batch_decode(summaries, skip_special_tokens=True)

In [7]:
# Function to compute ROUGE scores
def compute_rouge(predictions, references):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F-measures over paired summaries.

    Args:
        predictions: Generated summaries (strings).
        references: Reference summaries, aligned index-by-index with
            ``predictions``.

    Returns:
        Dict mapping "rouge1", "rouge2" and "rougeL" to the mean F-measure
        across all pairs. Every value is 0.0 when there are no pairs.

    Raises:
        ValueError: If the two inputs have different lengths (pairs would be
            silently dropped otherwise).
    """
    predictions = list(predictions)
    references = list(references)
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length "
            f"({len(predictions)} != {len(references)})"
        )

    metrics = ["rouge1", "rouge2", "rougeL"]
    # No pairs means nothing to average; avoid dividing by zero below.
    if not predictions:
        return {metric: 0.0 for metric in metrics}

    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
    rouge_results = {metric: [] for metric in metrics}
    for pred, ref in zip(predictions, references):
        # The reference is passed first, the prediction second.
        scores = scorer.score(ref, pred)
        for metric in metrics:
            rouge_results[metric].append(scores[metric].fmeasure)
    return {metric: sum(values) / len(values) for metric, values in rouge_results.items()}

In [8]:
# Function to compute BLEU score
def compute_bleu(predictions, references):
    """Compute corpus-level BLEU for generated summaries.

    Both sides are lower-cased and tokenized with ``nltk.word_tokenize``; each
    prediction is scored against exactly one reference using ``corpus_bleu``
    with its default settings.

    Args:
        predictions: Generated summaries (strings).
        references: Reference summaries, aligned index-by-index with
            ``predictions``.

    Returns:
        The value returned by ``corpus_bleu``, or 0.0 when there are no pairs.

    Raises:
        ValueError: If the two inputs have different lengths.
    """
    predictions = list(predictions)
    references = list(references)
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length "
            f"({len(predictions)} != {len(references)})"
        )
    # An empty corpus has no n-gram statistics to score.
    if not predictions:
        return 0.0

    pred_tokens = [nltk.word_tokenize(pred.lower()) for pred in predictions]
    # corpus_bleu takes a list of references per hypothesis, hence the extra nesting.
    ref_tokens = [[nltk.word_tokenize(ref.lower())] for ref in references]
    return corpus_bleu(ref_tokens, pred_tokens)

In [9]:
# Create DataLoader for batching
def create_dataloader(dataset, batch_size=8):
    """Wrap ``dataset`` in a DataLoader that yields ``(articles, highlights)``.

    Each batch is a pair of plain Python lists: the "article" field of every
    example in the batch, and the matching "highlights" field, in the same
    order.
    """
    def _split_columns(examples):
        articles = []
        for row in examples:
            articles.append(row["article"])
        highlights = []
        for row in examples:
            highlights.append(row["highlights"])
        return articles, highlights

    loader = DataLoader(dataset, collate_fn=_split_columns, batch_size=batch_size)
    return loader

In [10]:
# Batch size for evaluation, and the loader over the test split built from it
batch_size = 8
dataloader = create_dataloader(test, batch_size=batch_size)

# Accumulators filled by the evaluation loop: generated summaries and their
# reference highlights, appended in the same order.
all_generated_summaries, all_references = [], []

In [11]:
# Choose the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Place the model on that device and put it in evaluation mode
model.to(device)
model.eval()
device

device(type='cuda')

In [12]:
# Evaluate in batches
# Start from empty accumulators every time this cell runs. Without the reset,
# re-running the cell (e.g. after an interrupted pass) would append to whatever
# a previous run left behind and duplicate examples in the scored set.
all_generated_summaries = []
all_references = []

for texts, references in tqdm(dataloader, desc="Evaluating"):
    all_generated_summaries.extend(generate_summary_batch(texts))
    all_references.extend(references)

# Scoring pairs the two lists index-by-index, so they must line up exactly.
assert len(all_generated_summaries) == len(all_references), (
    "number of generated summaries does not match number of references"
)

Evaluating: 100%|██████████| 1437/1437 [3:55:19<00:00,  9.83s/it]  


In [15]:
# Score the generated summaries against the reference highlights
bleu_score = compute_bleu(
    predictions=all_generated_summaries,
    references=all_references,
)
rouge_scores = compute_rouge(
    predictions=all_generated_summaries,
    references=all_references,
)

In [16]:
# Print the results
# rouge_scores maps each ROUGE metric name to the mean F-measure returned by
# compute_rouge; every value is shown with four decimal places.
print("ROUGE Scores:")
for metric, score in rouge_scores.items():
    print(f"{metric}: {score:.4f}")

# bleu_score is the single corpus-level value returned by compute_bleu.
print(f"BLEU Score: {bleu_score:.4f}")

ROUGE Scores:
rouge1: 0.4391
rouge2: 0.2095
rougeL: 0.3051
BLEU Score: 0.1627
