In [1]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [3]:
!pip install matplotlib

import tensorflow as tf
import torch

from transformers import pipeline, set_seed

import matplotlib.pyplot as plt

import pandas as pd
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

Collecting matplotlib
  Using cached matplotlib-3.7.1-cp39-cp39-win_amd64.whl (7.6 MB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.1.0-cp39-cp39-win_amd64.whl (429 kB)
                                              0.0/429.4 kB ? eta -:--:--
     --                                      30.7/429.4 kB 1.4 MB/s eta 0:00:01
     ---------                              112.6/429.4 kB 1.1 MB/s eta 0:00:01
     ------------------                     204.8/429.4 kB 1.4 MB/s eta 0:00:01
     ----------------------------           327.7/429.4 kB 1.7 MB/s eta 0:00:01
     -------------------------------------- 429.4/429.4 kB 1.9 MB/s eta 0:00:00
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.40.0-cp39-cp39-win_amd64.whl (2.0 MB)
                                              0.0/2.0 MB ? eta -:--:--
     ---------                                0.5/2.

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jambh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## [The CNN/DailyMail Dataset](https://huggingface.co/datasets/cnn_dailymail)

##  An important aspect of the dataset is that the summaries are abstractive and not extractive, which means that they consist of new sentences instead of simple excerpts.

Extractive Summarization: the extractive approach selects the most important phrases and lines from the documents. It then combines all the important lines to create the summary. So, in this case, every line and word of the summary actually belongs to the original document which is summarized.

Abstractive Summarization: The abstractive approach uses new phrases and terms that are different from the original document, keeping the meaning the same, just like how humans do in summarization. So, it is much harder than the extractive approach.

In [4]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", version="3.0.0")

print(f"Features in cnn_dailymail : {dataset['train'].column_names}")

Found cached dataset cnn_dailymail (C:/Users/jambh/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
100%|██████████| 3/3 [00:00<00:00,  6.35it/s]

Features in cnn_dailymail : ['article', 'highlights', 'id']





In [5]:
sample = dataset["train"][1]
print(f"""
Article (excerpt of 500 characters, total length: {len(sample["article"])}):
""")
print(sample["article"][:500])
print(f'\nSummary (length: {len(sample["highlights"])}):')
print(sample["highlights"])


Article (excerpt of 500 characters, total length: 4051):

Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most s

Summary (length: 281):
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .


-------------------

## Text Summarization Pipelines


In [6]:
sample_text = dataset["train"][1]["article"][:1000]

# We'll collect the generated summaries of each model in a dictionary
summaries = {}

### Summarization Baseline
The function baseline_summary_three_sent(text) takes a text as input and returns a summary of the text consisting of the first three sentences. It uses the sent_tokenize function from the NLTK library to split the text into sentences. The sentences are then joined together using line breaks to create the summary.

In [7]:
def baseline_summary_three_sent(text):
    return "\n".join(sent_tokenize(text)[:3])

In [8]:
summaries['baseline'] = baseline_summary_three_sent(sample_text)

summaries['baseline']

'Editor\'s note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events.\nHere, Soledad O\'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial.\nMIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor."'

# huggingface pipeline

The pipelines are a great and easy way to use models for inference.

Stating ”summarization”: will return a `SummarizationPipeline`

”text-generation”: will return a TextGenerationPipeline

-----------------------

# GPT-2

We can use GPT-2 it to generate summaries by simply appending “TL;DR” at the end of the input text.

The expression “TL;DR” (too long; didn’t read) is often used on platforms like
Reddit to indicate a short version of a long post. We will start our
summarization experiment by re-creating the procedure of the original paper
with the pipeline() function from Transformers

We create a text generation pipeline and load the GPT-2 model:

In [9]:
from transformers import pipeline, set_seed

set_seed(42)

pipe = pipeline('text-generation', model = 'gpt2-medium' )

gpt2_query = sample_text + "\nTL;DR:\n"

pipe_out = pipe(gpt2_query, max_length = 512, clean_up_tokenization_spaces = True)

# This passes the gpt2_query to the text generation pipeline (pipe) to generate text. 
# The max_length parameter specifies the maximum length of the generated text in tokens. 
# The clean_up_tokenization_spaces parameter ensures that the generated text has proper spacing.





Downloading model.safetensors: 100%|██████████| 1.52G/1.52G [04:18<00:00, 5.88MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)neration_config.json: 100%|██████████| 124/124 [00:00<00:00, 122kB/s]
Downloading (…)olve/main/vocab.json: 1.04MB [00:00, 1.65MB/s]
Downloading (…)olve/main/merges.txt: 456kB [00:00, 1.04MB/s]
Downloading (…)/main/tokenizer.json: 1.36MB [00:00, 5.01MB/s]
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [10]:
pipe_out

[{'generated_text': 'Editor\'s note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O\'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most severe mental illnesses are incarcerated until they\'re ready to appear in court. Most often, they face drug charges or charges of assaulting an officer --charges that Judge Steven Leifman says are usually "avoidable felonies." He says the arrests often result from confrontations with police. Mentally ill people often won\'t do what they\'re told when police arrive on the scene -- confrontation seems to exacerbate their illness and they become more paranoid, delusional, and

In [11]:
pipe_out[0]["generated_text"][len(gpt2_query) :]

'(A quick note about "forgotten floors": When you write this article, remember that we were there ourselves, and, thanks in part to our research and knowledge as CNN\'s special correspondent for Miami-Dade County, we know how tough that floor is.) In 2008, CNN traveled to Miami to investigate the conditions facing some mentally ill inmates in state prisons. The report was based on our interview with inmates in two jails -- at Miami Dade and Florida Department of Juvenile Justice in Tallahassee. One of those jails, the Tallahassee County jail, is the largest in the state, holding many seriously mentally ill prisoners. The other jail, the Miami-Dade County jail, is smaller and less numerous, but it\'s still a major part of the Florida prison system. In 2010, Leifman announced a $200 million overhaul of Miami-Dade county custody facilities that included cutting the prison population and freeing up space for inmate families and local jails. It\'s also now a safe place for many mentally ill

In [12]:
summaries['gpt2'] = "\n".join(sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query) :]))

# T5

T5 (Text-To-Text Transfer Transformer) is a transformer model that is trained in an end-to-end manner with text as input and modified text as output, in contrast to BERT-style models that can only output either a class label or a span of the input. This text-to-text formatting makes the T5 model fit for multiple NLP tasks like Summarization, Question-Answering, Machine Translation, and Classification problems.

How T5 is different from BERT?
Both T5 and BERT are trained with MLM (Masked Language Model) approach. 

What is MLM?

The MLM is a fill-in-the-blank task, where the model masks part of the input text and tries to predict what that masked word should be.

Example:

“I like to eat peanut butter and <MASK> sandwiches,”
“I like to eat peanut butter and jelly sandwiches,”


The only difference is that T5 replaces multiple consecutive tokens with the single Mask Keyword, unlike, BERT which uses Mask token for each word. This illustration is shown below.


### T5 expects a prefix before the input text to understand the task given by the user. For example,

- “summarize:” for the summarization, 
- “cola sentence:” for the classification, 
- “translate English to Spanish:” for the machine translation, etc., 


--------------


But here in this case, I can directly load T5 for summarization with the pipeline() function, which also takes care of formatting the inputs in the text-to-text format so we don’t
need to prepend them with "summarize":


In [13]:
pipe = pipeline('summarization', model = 't5-small' )

pipe_out = pipe(sample_text)

Downloading (…)lve/main/config.json: 1.21kB [00:00, 1.12MB/s]
Downloading model.safetensors: 100%|██████████| 242M/242M [00:40<00:00, 6.00MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 73.2kB/s]
Downloading (…)okenizer_config.json: 2.32kB [00:00, ?B/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 944kB/s]
Downloading (…)/main/tokenizer.json: 1.39MB [00:00, 13.2MB/s]


In [14]:
pipe_out

[{'summary_text': "inmates with the most severe mental illnesses are incarcerated until they're ready to appear in court . most often, they face drug charges or charges of assaulting an officer . mentally ill people become more paranoid, delusional, and less likely to follow dir ."}]

In [15]:
summaries['t5'] = 'n'.join(sent_tokenize(pipe_out[0]['summary_text']))

# BART

BART is a denoising autoencoder for pretraining sequence-to-sequence models. It is trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text. It uses a standard Transformer-based neural machine translation architecture. 

That means, It uses a standard seq2seq/NMT architecture with a bidirectional encoder (like BERT) and a left-to-right decoder (like GPT). This means the encoder's attention mask is fully visible, like BERT, and the decoder's attention mask is causal, like GPT2.


This means that a fine-tuned BART model can take a text sequence (for example, English) as input and produce a different text sequence at the output (for example, French). 

This type of model is relevant for machine translation, question-answering , text summarization, or sequence classification (categorizing input text sentences or tokens). 

Another task is sentence entailment which, given two or more sentences, evaluates whether the sentences are logical extensions or are logically related to a given statement.

In [16]:
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)


Downloading (…)lve/main/config.json: 1.58kB [00:00, 1.05MB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.63G/1.63G [04:35<00:00, 5.90MB/s]
Downloading (…)neration_config.json: 100%|██████████| 363/363 [00:00<00:00, 235kB/s]
Downloading (…)olve/main/vocab.json: 899kB [00:00, 8.76MB/s]
Downloading (…)olve/main/merges.txt: 456kB [00:00, 1.13MB/s]
Downloading (…)/main/tokenizer.json: 1.36MB [00:00, 9.09MB/s]


In [17]:
pipe_out

[{'summary_text': 'Miami-Dade pretrial detention facility is dubbed the "forgotten floor" Here, inmates with the most severe mental illnesses are incarcerated. Most often, they face drug charges or charges of assaulting an officer. Judge Steven Leifman says the arrests often result from confrontations with police.'}]

In [18]:
summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

In [19]:
summaries["bart"]

'Miami-Dade pretrial detention facility is dubbed the "forgotten floor" Here, inmates with the most severe mental illnesses are incarcerated.\nMost often, they face drug charges or charges of assaulting an officer.\nJudge Steven Leifman says the arrests often result from confrontations with police.'

# PEGASUS

The PEGASUS model’s pre-training task is very similar to summarization, i.e. important sentences are removed and masked from an input document and are later generated together as one output sequence from the remaining sentences, which is fairly similar to a summary. In PEGASUS, several whole sentences are removed from documents during pre-training, and the model is tasked with recovering them. The Input for such pre-training is a document with missing sentences, while the output consists of the missing sentences being concatenated together. The advantage of this self-supervision is that you can create as many examples as there are documents without any human intervention, which often becomes a bottleneck problem in purely supervised systems.

In [20]:
pipe = pipeline('summarization', model="google/pegasus-cnn_dailymail"  )

pipe_out = pipe(sample_text)

Downloading (…)lve/main/config.json: 1.12kB [00:00, ?B/s]
Downloading pytorch_model.bin: 100%|██████████| 2.28G/2.28G [06:32<00:00, 5.80MB/s]
Downloading (…)neration_config.json: 100%|██████████| 280/280 [00:00<00:00, 258kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 88.0/88.0 [00:00<00:00, 52.2kB/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 1.91M/1.91M [00:01<00:00, 1.82MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 65.0/65.0 [00:00<00:00, 58.6kB/s]


In [21]:
pipe_out

[{'summary_text': 'Mentally ill inmates are housed on the "forgotten floor" of a Miami jail .<n>Judge Steven Leifman says the charges are usually "avoidable felonies"<n>He says the arrests often result from confrontations with police .<n>Mentally ill people often won\'t do what they\'re told when police arrive on the scene .'}]

In [22]:
summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

In [23]:
## Comparing Different Summaries

In [24]:
print("GROUND TRUTH")

print(dataset['train'][1]['highlights'])


for model_name in summaries:
    print(model_name.upper())
    print(summaries[model_name])
    

GROUND TRUTH
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .
BASELINE
Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events.
Here, Soledad O'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial.
MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor."
GPT2
(A quick note about "forgotten floors": When you write this article, remember that we were there ourselves, and, thanks in part to our research and knowledge as CNN's special correspondent for Miami-Dade County, we know 


# SacreBLEU

The bleu_metric object is an instance of the Metric class, and works like an
aggregator: you can add single instances with add() or whole batches via
add_batch(). Once you have added all the samples you need to evaluate, you
then call compute() and the metric is calculated. This returns a dictionary with
several values, such as the precision for each n-gram, the length penalty, as
well as the final BLEU score. Let’s look at the example from before:

In [25]:

from datasets import load_metric

bleu_metric = load_metric("sacrebleu")

  bleu_metric = load_metric("sacrebleu")
Downloading builder script: 7.65kB [00:00, 5.09MB/s]                   


In [26]:
bleu_metric.add(prediction = [summaries["pegasus"]], reference = [dataset['train'][1]['highlights'] ])

results = bleu_metric.compute(smooth_method = 'floor', smooth_value = 0 )
# Instead of assigning zero precision to unmatched n-grams, 
# the smooth method adds a small value (usually 1) to the numerator and denominator
# of the precision calculation for all n-grams. This way, it ensures that even if there are no 
# exact matches, there is still a non-zero precision value, preventing overly harsh penalties.

results['precision'] = [np.round(p , 2) for p in results['precisions'] ] # round of values

pd.DataFrame.from_dict(results, orient = 'index', columns = ['Value'] )

Unnamed: 0,Value
score,18.73841
counts,"[27, 14, 10, 6]"
totals,"[67, 66, 65, 64]"
precisions,"[40.298507462686565, 21.21212121212121, 15.384..."
bp,1.0
sys_len,67
ref_len,57
precision,"[40.3, 21.21, 15.38, 9.38]"


# ROUGE

# ROUGE vs BLEU

Bleu measures precision: how much the words (and/or n-grams) in the machine generated summaries appeared in the human reference summaries.

Rouge measures recall: how much the words (and/or n-grams) in the human reference summaries appeared in the machine generated summaries.

### Interpretation of Rouge Score

ROUGE-n recall=40% means that 40% of the n-grams in the reference summary are also present in the generated summary.

--------

The ROUGE score was specifically developed for applications like
summarization where high recall is more important than just precision.5 

The approach is very similar to the BLEU score in that we look at different n-grams
and compare their occurrences in the generated text and the reference texts.


The difference is that with ROUGE we check how many n-grams in the
reference text also occur in the generated text. For BLEU we looked at how
many n-grams in the generated text appear in the reference

In [27]:
rouge_metric = load_metric('rouge')

Downloading builder script: 5.65kB [00:00, 5.65MB/s]                   


## ROUGE-N

With ROUGE-N, the N represents the n-gram that we are using. For ROUGE-1 we would be measuring the match-rate of unigrams between our model output and reference.

ROUGE-2 and ROUGE-3 would use bigrams and trigrams respectively.


## ROUGE-L

ROUGE-L measures the longest common subsequence (LCS) between our model output and reference. All this means is that we count the longest sequence of tokens that is shared between both:


In the HF Datasets implementation, two variations of ROUGE are
calculated: one calculates the score per sentence and averages it for the
summaries (ROUGE-L), and the other calculates it directly over the whole
summary (ROUGE-Lsum).


In [28]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

reference = dataset['train'][1]['highlights']

records = []

for model_name in summaries:
    rouge_metric.add(prediction = summaries[model_name], reference = reference )
# This line adds the current model's summary (summaries[model_name]) and the reference summary 
# (reference) to the rouge_metric object. It prepares the data for ROUGE score calculation.

    score = rouge_metric.compute()
    rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )
# It creates a dictionary called rouge_dict which maps each ROUGE score name to its corresponding
# F-measure value from the computed scores.
    print('rouge_dict ', rouge_dict )
    records.append(rouge_dict)

pd.DataFrame.from_records(records, index = summaries.keys() )

rouge_dict  {'rouge1': 0.365079365079365, 'rouge2': 0.14516129032258066, 'rougeL': 0.20634920634920634, 'rougeLsum': 0.2857142857142857}
rouge_dict  {'rouge1': 0.19727891156462585, 'rouge2': 0.03424657534246575, 'rougeL': 0.108843537414966, 'rougeLsum': 0.17006802721088435}
rouge_dict  {'rouge1': 0.1758241758241758, 'rouge2': 0.0, 'rougeL': 0.13186813186813187, 'rougeLsum': 0.15384615384615383}
rouge_dict  {'rouge1': 0.3655913978494624, 'rouge2': 0.13186813186813184, 'rougeL': 0.2150537634408602, 'rougeLsum': 0.3225806451612903}
rouge_dict  {'rouge1': 0.5, 'rouge2': 0.24489795918367346, 'rougeL': 0.36000000000000004, 'rougeLsum': 0.46}


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.365079,0.145161,0.206349,0.285714
gpt2,0.197279,0.034247,0.108844,0.170068
t5,0.175824,0.0,0.131868,0.153846
bart,0.365591,0.131868,0.215054,0.322581
pegasus,0.5,0.244898,0.36,0.46


# Evaluationg on the TEST set of the CNN/DailyMail Dataset

In [29]:
def calculate_metric_on_baseline_test_ds(dataset, metric, column_text = 'article', column_summary = 'highlights' ):
""""
   This function calculates a specified metric on a baseline test dataset for a Natural Language 
   Processing (NLP) task.
    It assumes the task is a text summarization task, where the goal is to generate a summary 

    Parameters:
    dataset (pandas.DataFrame): The test dataset. It should contain a column for the text and a 
    column for the true summary.metric 
    (datasets.Metric): The metric to calculate. This should be a 
    metric object from the Hugging Face datasets library. 
    column_text (str, optional): The name of the column in the dataset that contains the text.
    Defaults to 'article'.
    column_summary (str, optional): The name of the column in the dataset that contains the 
    true summary. Defaults to 'highlights'.

    Returns:
    score (float): The calculated score of the metric on the test dataset.
    """
    summaries = [baseline_summary_three_sent(text) for text in dataset[column_text] ]

    metric.add_batch(predictions = summaries, references = dataset[column_summary] )

    score = metric.compute()
    return score

In [30]:
test_sampled = dataset['train'].shuffle(seed = 42).select(range(1000))

score = calculate_metric_on_baseline_test_ds(test_sampled, rouge_metric )

rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame.from_dict(rouge_dict, orient = 'index' , columns = ['baseline'] ).T

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.253995,0.100642,0.165754,0.231571


## Strategy to calculate the ROUGE Metric on test dataset for the other Models

In [31]:
from tqdm import tqdm
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements.
    
    Generator function to yield successive batch-sized chunks from list_of_elements.

    Parameters:
    list_of_elements (list): List of elements to be divided into chunks.
    batch_size (int): The size of each chunk.

    Yields:
    list: Batch-sized chunk from list_of_elements.
    
    """
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, 
                               batch_size=16, device=device, 
                               column_text="article", 
                               column_summary="highlights"):
    """
    Function to calculate a specified metric on a test dataset for a Natural Language Processing (NLP) task.
    It assumes the task is a text summarization task, where the goal is to generate a summary from a text.

    Parameters:
    dataset (pandas.DataFrame): The test dataset. It should contain a column for the text and a column for the true summary.
    metric (datasets.Metric): The metric to calculate. This should be a metric object from the Hugging Face datasets library.
    model (transformers.PreTrainedModel): The transformer model to use for text generation.
    tokenizer (transformers.PreTrainedTokenizer): The tokenizer corresponding to the model.
    batch_size (int, optional): The size of the batches to use for processing. Defaults to 16.
    device (str, optional): The device to run the model on. Defaults to the output of torch.cuda.is_available().
    column_text (str, optional): The name of the column in the dataset that contains the text. Defaults to 'article'.
    column_summary (str, optional): The name of the column in the dataset that contains the true summary. Defaults to 'highlights'.

    Returns:
    score (float): The calculated score of the metric on the test dataset.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):
        
        inputs = tokenizer(article_batch, max_length=1024,  truncation=True, 
                        padding="max_length", return_tensors="pt")
        
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device), 
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''
        
        # Finally, we decode the generated texts, 
        # replace the <n> token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
                                clean_up_tokenization_spaces=True) 
               for s in summaries]      
        
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        
        
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
        
    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

In [None]:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_ckpt = "google/pegasus-cnn_dailymail"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

score = calculate_metric_on_test_ds(test_sampled, rouge_metric, 
                                   model_pegasus, tokenizer, batch_size=8)

rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

# At the end, we compute and return the ROUGE scores.
pd.DataFrame(rouge_dict, index=["pegasus"])

  2%|▏         | 2/125 [1:13:53<74:58:33, 2194.42s/it]