In [1]:
import json
import torch
import transformers

import tensorflow_datasets as tfds

from rouge_score import rouge_scorer

In [2]:
# Get the CNN/DailyMail dataset
ds,info = tfds.load("cnn_dailymail", split = "test", with_info = True)

INFO:absl:No config specified, defaulting to first: cnn_dailymail/plain_text
INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset cnn_dailymail (/home/tanmay/tensorflow_datasets/cnn_dailymail/plain_text/0.0.2)
INFO:absl:Constructing tf.data.Dataset for split test, from /home/tanmay/tensorflow_datasets/cnn_dailymail/plain_text/0.0.2


In [3]:
def get_rouge(model, tokenizer, key, ds, batch_size = 64, min_length = 210, max_length = 500, device = "cpu", epochs = 1):
    '''Calculates the rouge score of a model on the given dataset

    args
    model: The model to be tested
    tokenizer: tokenizer for the model to be tested
    key: The rouge score we want i.e rouge-1,rouge-2,rouge-L etc.
    ds: dataset from tensorflow.datasets
    batch_size: size of batch to be extracted
    min_length: Minimum length of the output summary
    max_length: Maximum length of the output summary
    device: cuda or cpu

    returns:
    precision: ratio of number of overlapping words in output and reference summary to number of words in output summary
    recall: ratio of number of overlapping words in output and reference summary to number of words in reference summar
    fmeasure: harmonic mean of precision and recall
    '''
    precision = 0.0
    recall = 0.0
    f1 = 0.0
    total_count = 0
    epoch = 0
    key = key
    device = device
    ds_batched = ds.batch(batch_size)
    scorer = rouge_scorer.RougeScorer([key])
    if(device=="cuda"):
        model.cuda()
    print("Starting......")
    for batch in tfds.as_numpy(ds_batched):
        if(epoch==epochs):
          break
        texts,summaries = batch["article"],batch["highlights"]
        step = 0
        for text,summary in zip(texts,summaries):
          preprocessed_txt = str(text).strip().replace("\n","")
          t5_prep = "summarize: "+preprocessed_txt
          tokenized_text = tokenizer.encode(t5_prep,max_length = len(t5_prep),return_tensors = "pt").to(device)
          summary_ids = model.generate(tokenized_text,num_beams = 4,
                                              no_repeat_ngram_size = 2,
                                              min_length = min_length,
                                              max_length = max_length,
                                              early_stopping = True)
          output = tokenizer.decode(summary_ids[0].to(device), skip_special_tokens = True)
          if(step%10==0):
            print("Step: ",step)
          step += 1
          scores = scorer.score(str(summary),output)
          precision += scores[key].precision
          recall+= scores[key].recall
          f1 += scores[key].fmeasure
        total_count += len(texts)
        print("Average score after, ", total_count, "epochs")
        print("Precision: ",precision/total_count)
        print("Recall: ",recall/total_count)
        print("fmeasure ",f1/total_count)
        print(scores)
        epoch += 1

In [4]:
# On T5-Base
model = transformers.T5ForConditionalGeneration.from_pretrained('t5-base')
tokenizer = transformers.T5Tokenizer.from_pretrained('t5-base')

In [5]:
get_rouge(model, tokenizer, "rouge1", ds, batch_size = 128, device = "cuda") 

Starting......
Step:  0
Step:  10
Step:  20
Step:  30
Step:  40
Step:  50
Step:  60
Step:  70
Step:  80
Step:  90
Step:  100
Step:  110
Step:  120
Average score after,  128 epochs
Precision:  0.2891179901212153
Recall:  0.47454527065282587
fmeasure  0.3462267430901061
{'rouge1': Score(precision=0.32051282051282054, recall=0.6578947368421053, fmeasure=0.4310344827586207)}


In [6]:
# Get summary for exemplar
text2 = "According to the complaint, since 2013 Ang applied for and received $5 million in federal grant money for his work at the University of Arkansas. The U.S. Attorney’s Office said the investigation started when a university employee examined a hard drive in the library’s lost-and-found, trying to find out who owned the device and found emails from Ang. Mr. Ang, 63, was the director of the University of Arkansas’s High Density Electronics Center, which was founded with Defense Department funds. The center made technology for use in the International Space Station, Mr. Ang said in an interview with an electrical engineering trade publication. Mr. Ang’s work also involved power grid security research, said Todd Shields, a dean at Arkansas. He said the university was asked a few months ago to give the federal government information about faculty travel to China but did not know which agency made the request."

In [7]:
preprocessed_txt = text2.strip().replace("\n","")
t5_prep = "summarize: " + preprocessed_txt
device = "cuda"
tokenized_text = tokenizer.encode(t5_prep,return_tensors = "pt").to(device)
summary_ids = model.generate(tokenized_text,num_beams = 4,
                                    no_repeat_ngram_size = 2,
                                    min_length = 30,
                                    max_length = 100,
                                    early_stopping = True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens = True)

In [8]:
text2

'According to the complaint, since 2013 Ang applied for and received $5 million in federal grant money for his work at the University of Arkansas. The U.S. Attorney’s Office said the investigation started when a university employee examined a hard drive in the library’s lost-and-found, trying to find out who owned the device and found emails from Ang. Mr. Ang, 63, was the director of the University of Arkansas’s High Density Electronics Center, which was founded with Defense Department funds. The center made technology for use in the International Space Station, Mr. Ang said in an interview with an electrical engineering trade publication. Mr. Ang’s work also involved power grid security research, said Todd Shields, a dean at Arkansas. He said the university was asked a few months ago to give the federal government information about faculty travel to China but did not know which agency made the request.'

In [9]:
summary

"since 2013 Ang applied for and received $5 million in federal grant money for his work at the University of Arkansas. the investigation started when a university employee examined shard drive in the library's lost-and-found, trying to find out who owned the device and found emails from the director of the university of arkansas."

In [10]:
# On T5-11B
model = transformers.BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = transformers.BartTokenizer.from_pretrained('facebook/bart-large-cnn')

In [11]:
get_rouge(model, tokenizer, "rouge1", ds, batch_size = 128, device = "cuda") 

Starting......
Step:  0
Step:  10
Step:  20
Step:  30
Step:  40
Step:  50
Step:  60
Step:  70
Step:  80
Step:  90
Step:  100
Step:  110
Step:  120
Average score after,  128 epochs
Precision:  0.19631271260743996
Recall:  0.6045632142160139
fmeasure  0.28874747453289307
{'rouge1': Score(precision=0.16993464052287582, recall=0.6842105263157895, fmeasure=0.27225130890052357)}


In [12]:
preprocessed_txt = text2.strip().replace("\n","")
t5_prep = "summarize: " + preprocessed_txt
device = "cuda"
tokenized_text = tokenizer.encode(t5_prep,return_tensors = "pt").to(device)
summary_ids = model.generate(tokenized_text,num_beams = 4,
                                    no_repeat_ngram_size = 2,
                                    min_length = 30,
                                    max_length = 100,
                                    early_stopping = True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens = True)

In [13]:
text2

'According to the complaint, since 2013 Ang applied for and received $5 million in federal grant money for his work at the University of Arkansas. The U.S. Attorney’s Office said the investigation started when a university employee examined a hard drive in the library’s lost-and-found, trying to find out who owned the device and found emails from Ang. Mr. Ang, 63, was the director of the University of Arkansas’s High Density Electronics Center, which was founded with Defense Department funds. The center made technology for use in the International Space Station, Mr. Ang said in an interview with an electrical engineering trade publication. Mr. Ang’s work also involved power grid security research, said Todd Shields, a dean at Arkansas. He said the university was asked a few months ago to give the federal government information about faculty travel to China but did not know which agency made the request.'

In [14]:
summary

"The U.S. Attorney's Office said the investigation started when a university employee examined a hard drive in the library’s lost-and-found, trying to find out who owned the device. Mr. Ang, 63, was the director of the High Density Electronics Center, which was founded with Defense Department funds."