#### Evaluation

In [2]:
from datasets import load_dataset, load_metric
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data_path = '/DATA/pranta_2411ai09/DialogueSummarization/data/samsum_dataset'
from datasets import load_from_disk
dataset_samsum = load_from_disk(data_path)

In [4]:
dataset_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [5]:
# LOAD THE TRAINED MODEL FOR EVALUATION
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
model = AutoModelForSeq2SeqLM.from_pretrained('./pegasus-samsum')
tokenizer = AutoTokenizer.from_pretrained('./pegasus-samsum')

In [None]:
import torch

batch_size = 8
test_dataset = dataset_samsum['test']
num_samples = len(test_dataset)
generated_summaries = []
reference_summaries = []

for start_idx in range(0, num_samples, batch_size):
    end_idx = min(start_idx + batch_size, num_samples)
    batch_dialogues = test_dataset[start_idx:end_idx]['dialogue']
    inputs = tokenizer(batch_dialogues, return_tensors='pt', truncation=True, padding=True, max_length=1024)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        num_beams=4,
        length_penalty=1.4,
        max_length=100,
        min_length=25,
        early_stopping=True
    )
    batch_summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    generated_summaries.extend(batch_summaries)
    reference_summaries.extend(test_dataset[start_idx:end_idx]['summary'])


In [1]:
# Compute ROUGE scores
rouge = load_metric('rouge')
results = rouge.compute(predictions=generated_summaries, references=reference_summaries)

# Extract F1 scores for each ROUGE metric
rouge_names = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
rouge_dict = {rn: results[rn].mid.fmeasure for rn in rouge_names}

# Display as DataFrame
pd.DataFrame(rouge_dict, index=['pegasus'])


NameError: name 'load_metric' is not defined