In [1]:
import pandas as pd

In [2]:
from datasets import load_dataset

# Select the dataset configuration by name ("3.0.0"). Passing it as a
# `version=` keyword does not select a named configuration; the captured
# output below shows it building a custom "default" configuration instead.
dataset = load_dataset("cnn_dailymail", "3.0.0")
print(f"Features: {dataset['train'].column_names}")

Using custom data configuration default
Found cached dataset cnn_dailymail (/home/reese/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

Features: ['article', 'highlights', 'id']


In [3]:
# Inspect one training example: the first 500 characters of the article,
# followed by its full reference summary.
sample = dataset["train"][1]
article_text = sample["article"]
summary_text = sample["highlights"]

print(f"""
Article (excerpt of 500 characters, total length: {len(article_text)}): """)
print(article_text[:500])
print(f'\nSummary (length: {len(summary_text)}):')
print(summary_text)


Article (excerpt of 500 characters, total length: 4051): 
Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most s

Summary (length: 281):
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .


In [5]:
from datasets import load_metric
# Names of the ROUGE variants whose scores are extracted after evaluation.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

# Metric object; the evaluation function feeds it batches via `add_batch`
# and reads the final result with `compute`.
rouge_metric = load_metric("rouge")

# Reference summary of the training example at index 1.
reference = dataset["train"][1]["highlights"]
# Empty list, presumably for collecting per-model score rows - it is not
# used in the cells shown here (TODO confirm it is still needed).
records = []

In [6]:
# def evaluate_summaries_baseline(dataset, metric, column_text="article",
#     column_summary="highlights"):
#     summaries = [three_sentence_summary(text) for text in dataset[column_text]]
#     metric.add_batch(predictions=summaries, references=dataset[column_summary])
#     score = metric.compute() 
#     return score

In [8]:
# Evaluation subset: shuffle the test split with a fixed seed, then keep the
# first 1000 examples.
test_sampled = (
    dataset["test"]
    .shuffle(seed=42)
    .select(range(1000))
)
# score = evaluate_summaries_baseline(test_sampled, rouge_metric) 
# rouge_dict = dict((rn, score[rn].mid.fmeasure) 
#                   for rn in rouge_names) 
# pd.DataFrame.from_dict(rouge_dict, orient="index", columns=["baseline"]).T

Loading cached shuffled indices for dataset at /home/reese/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de/cache-888acb9a2eb72e89.arrow


In [9]:
from tqdm import tqdm
import torch

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def chunks(list_of_elements, batch_size):
    """Yield consecutive slices of ``list_of_elements``.

    Each slice holds at most ``batch_size`` items; the last one may be
    shorter. An empty input yields nothing. A ``batch_size`` of zero makes
    ``range`` raise ``ValueError`` once iteration starts.
    """
    starts = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[start:start + batch_size] for start in starts)

def evaluate_summaries_pegasus(dataset, metric, model, tokenizer, batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    """Generate a summary for every text in ``dataset`` and score them with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` must be sliceable sequences.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Model exposing ``generate``; the inputs are moved to ``device``
            before the call, the model itself is not moved here.
        tokenizer: Callable tokenizer that also provides ``decode``.
        batch_size: Number of texts processed per generation call.
        device: Device the input tensors are moved to.
        column_text: Column holding the texts to summarize.
        column_summary: Column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))
    batch_pairs = zip(article_batches, target_batches)

    for article_batch, target_batch in tqdm(batch_pairs, total=len(article_batches)):
        # Truncate to 1024 tokens and pad every sequence to that same length.
        encoded = tokenizer(article_batch, max_length=1024, truncation=True,
                            padding="max_length", return_tensors="pt")
        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)

        generated = model.generate(input_ids=input_ids,
                                   attention_mask=attention_mask,
                                   length_penalty=0.8, num_beams=8, max_length=128)

        predictions = []
        for token_ids in generated:
            decoded = tokenizer.decode(token_ids, skip_special_tokens=True,
                                       clean_up_tokenization_spaces=True)
            # Replace the literal "<n>" marker left in the decoded text with a space.
            predictions.append(decoded.replace("<n>", " "))

        metric.add_batch(predictions=predictions, references=target_batch)

    return metric.compute()


In [10]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the tokenizer and model for the checkpoint and move the model to `device`.
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
model = model.to(device)

# Score the sampled test examples, then keep the mid F-measure of each ROUGE
# variant and show them as a one-row table.
score = evaluate_summaries_pegasus(test_sampled, rouge_metric, model, tokenizer,
                                   batch_size=8)
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
pd.DataFrame(rouge_dict, index=["pegasus"])

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

  0%|          | 0/125 [00:00<?, ?it/s]2023-04-02 14:34:20.029323: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-02 14:34:21.155062: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda-11.2/lib64:/usr/local/cuda-11.2/extras/CUPTI/lib64
2023-04-02 14:34:21.155125: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda-11.2/l

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.434779,0.216666,0.31278,0.374219


In [14]:
# custom example
text = """
    A major challenge for scaling machine learning is training models to perform
    tasks that are very difficult or time-consuming for humans to evaluate. We present
    progress on this problem on the task of abstractive summarization of entire fiction
    novels. Our method combines learning from human feedback with recursive
    task decomposition: we use models trained on smaller parts of the task to assist
    humans in giving feedback on the broader task. We collect a large volume of
    demonstrations and comparisons from human labelers, and fine-tune GPT-3 using
    behavioral cloning and reward modeling to do summarization recursively. At
    inference time, the model first summarizes small sections of the book and then
    recursively summarizes these summaries to produce a summary of the entire book.
    Our human labelers are able to supervise and evaluate the models quickly, despite
    not having read the entire books themselves. Our resulting model generates sensible
    summaries of entire books, even matching the quality of human-written summaries
    in a few cases (∼ 5% of books). We achieve state-of-the-art results on the recent
    BookSum dataset for book-length summarization. A zero-shot question-answering
    model using these summaries achieves competitive results on the challenging
    NarrativeQA benchmark for answering questions about books and movie scripts.
    We release datasets of samples from our model.2
"""

# Load the tokenizer and model from the checkpoint so the steps below use
# freshly created objects.
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

# Tokenize once, with the tokenizer created above. (An earlier identical call
# placed before the reload was removed: its result was always overwritten here.)
tokenized_input = tokenizer(text, max_length=1024,  truncation=True,
                            padding="max_length", return_tensors="pt")

# Same generation settings as in the batch evaluation function.
summary = model.generate(input_ids=tokenized_input["input_ids"].to(device),
                    attention_mask=tokenized_input["attention_mask"].to(device),
                    length_penalty=0.8, num_beams=8, max_length=128)

# Raw generated token ids (one row, since a single text was passed in).
print(summary)

decoded_summary = tokenizer.decode(summary[0], skip_special_tokens=True,
                        clean_up_tokenization_spaces=True)

# Replace the literal "<n>" marker left in the decoded text with a space.
decoded_summary = decoded_summary.replace("<n>", " ")

tensor([[    0,   184,   799,  1974,   124,   109,  1778,   113,  7093,  5551,
          5906,  6520,  3884,   113,   954,  5394,  8810,   110,   107,   106,
           284,   207,  1581,  2492,   124,  1934,   972,   113,   109,  1778,
           112,  1595,  4095,   115,  1132,  2230,   124,   109,  7792,  1778,
           110,   107,   106,   284,  1433,   449,   121,  1313,   121,   544,
           121,  3904,   602,   124,   109,   909,  2459, 53231, 20886,   118,
           410,   121, 11412,  5906,  6520,  3884,   110,   107,     1]],
       device='cuda:0')


'We present progress on the task of abstractive summarization of entire fiction novels. We use models trained on smaller parts of the task to assist humans in giving feedback on the broader task. We achieve state-of-the-art results on the recent BookSum dataset for book-length summarization.'

In [15]:
# Print the decoded summary as plain text (no surrounding quotes).
print(decoded_summary)

We present progress on the task of abstractive summarization of entire fiction novels. We use models trained on smaller parts of the task to assist humans in giving feedback on the broader task. We achieve state-of-the-art results on the recent BookSum dataset for book-length summarization.
