In [3]:
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

In [None]:
# Mount Google Drive at /content/drive; the CSV inputs and the model output
# directory used in later cells are all addressed under /content/drive/MyDrive.
from google.colab import drive 
drive.mount('/content/drive')

In [1]:
# Install the third-party packages used by this notebook (-q keeps pip's output quiet).
# NOTE(review): versions are not pinned, so a fresh runtime may resolve different releases.
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [None]:
# Upgrade accelerate, then uninstall and reinstall transformers and accelerate together.
# NOTE(review): presumably a workaround for a version mismatch between the two
# packages — confirm it is still needed, and consider pinning versions.
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

In [None]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

# Download NLTK's "punkt" tokenizer data (presumably for sent_tokenize, imported above — confirm).
nltk.download("punkt")

In [None]:
# Name of the pretrained checkpoint that is fine-tuned in this notebook.
model_ckpt = "google/pegasus-cnn_dailymail"

# The tokenizer and the seq2seq model are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
# Move the weights to the device selected at the top of the notebook.
model_pegasus = model_pegasus.to(device)

In [None]:
# Read the full-text CSV from Drive; index_col=False stops pandas from using
# the first column as the index.
# NOTE(review): `df` is not referenced again in the visible cells — confirm it is still needed.
df = pd.read_csv('/content/drive/MyDrive/FullText.csv', index_col = False)

In [None]:
# Load the pre-tokenized training / testing / validation splits from Drive.
# NOTE(review): if list-valued columns (input_ids, attention_mask, labels) were
# written to CSV as text, they are read back as strings and would need parsing
# before being handed to the Trainer — confirm against the CSV contents.
final_train = pd.read_csv("/content/drive/MyDrive/BBC_News_Tokenized/training_tokenized.csv", index_col = False)
final_test = pd.read_csv("/content/drive/MyDrive/BBC_News_Tokenized/testing_tokenized.csv", index_col = False)
final_validate = pd.read_csv("/content/drive/MyDrive/BBC_News_Tokenized/validation_tokenized.csv", index_col = False)

### Conversion to Dictionaries and finally Datasets for passing into LLM Trainer

In [None]:
# Column-wise dictionary for the training split: each selected DataFrame
# column becomes a plain Python list under the same key.
traind = {
    column: final_train[column].to_list()
    for column in ('id', 'dialogue', 'summary',
                   'input_ids', 'attention_mask', 'labels')
}

In [None]:
# Column-wise dictionary for the test split: each selected DataFrame
# column becomes a plain Python list under the same key.
testd = {
    column: final_test[column].to_list()
    for column in ('id', 'dialogue', 'summary',
                   'input_ids', 'attention_mask', 'labels')
}

In [None]:
# Column-wise dictionary for the validation split: each selected DataFrame
# column becomes a plain Python list under the same key.
vald = {
    column: final_validate[column].to_list()
    for column in ('id', 'dialogue', 'summary',
                   'input_ids', 'attention_mask', 'labels')
}


In [None]:
from datasets import Dataset

# Wrap each column dictionary in a Dataset object (train, test, validation order).
traindj, testj, valdj = (
    Dataset.from_dict(columns) for columns in (traind, testd, vald)
)

# **Model Training**

In [None]:
from transformers import DataCollatorForSeq2Seq, Trainer, TrainingArguments

# Collator that batches examples for the seq2seq model; it is given both the
# tokenizer and the model instance.
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model_pegasus,
)


In [None]:
# Create the Drive directory that is passed to TrainingArguments as output_dir
# (-p: no error if it already exists).
!mkdir -p "/content/drive/MyDrive/News_Summarizer_PegasusModelParams"

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration. With a per-device batch size of 1 and 16 gradient
# accumulation steps, each optimizer step covers 16 examples per device.
trainer_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/News_Summarizer_PegasusModelParams',
    num_train_epochs=10,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    weight_decay=0.01,
    logging_steps=10,
    # Evaluate on the validation set every 500 steps.
    evaluation_strategy='steps',
    eval_steps=500,
    # NOTE(review): presumably set very high to avoid intermediate checkpoints — confirm.
    save_steps=1e6,
)

In [None]:
# Assemble the Trainer: model, training arguments, tokenizer, collator, and
# the train / validation datasets.
trainer = Trainer(
    model=model_pegasus,
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=traindj,
    eval_dataset=valdj,
)

In [None]:
# Run fine-tuning with the Trainer configured from trainer_args.
trainer.train()

In [None]:
# Save the model to the "pegasus-news" folder inside the Drive output directory.
model_pegasus.save_pretrained("/content/drive/MyDrive/News_Summarizer_PegasusModelParams/pegasus-news")

In [None]:
# Save the tokenizer next to the model, in its own "Pegasus-News-Tokenizer" folder.
tokenizer.save_pretrained("/content/drive/MyDrive/News_Summarizer_PegasusModelParams/Pegasus-News-Tokenizer")

# **Evaluation**

ROUGE - Recall-Oriented Understudy for Gisting Evaluation
It reports an F1 score based on the n-grams shared by the generated summary and the reference summary. Recall is the number of shared n-grams divided by the total number of n-grams in the reference summary; precision is the number of shared n-grams divided by the total number of n-grams in the generated summary.

 * For Rouge 1-> n = 1, i.e. Unigrams
 * For Rouge 2-> n = 2, i.e. Bigrams
 * For Rouge L -> uses the longest common subsequence (LCS) instead of fixed-length n-grams
 * For Rouge L Sum -> Computed over whole summary and not individual sentences

In [None]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily yield consecutive slices of ``list_of_elements``.

    Each slice holds at most ``batch_size`` items; the last one may be
    shorter. Works on anything that supports ``len`` and slicing.
    """
    chunk_starts = range(0, len(list_of_elements), batch_size)
    yield from (
        list_of_elements[start : start + batch_size] for start in chunk_starts
    )



def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    """Generate summaries for ``dataset`` batch by batch and score them with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` are sliced into batches.
        metric: Object providing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Model providing ``generate``; the input tensors are moved to
            ``device`` before the call.
        tokenizer: Callable tokenizer that also provides ``decode``.
        batch_size: Number of examples per generation batch.
        device: Device the tokenized inputs are moved to.
        column_text: Name of the column holding the source texts.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        The value returned by ``metric.compute()`` after all batches were added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                           padding="max_length", return_tensors="pt")

        # Per the original author's note, the length penalty is meant to keep
        # the model from generating sequences that are too long.
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        # Decode the generated token ids back into text.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True)
                             for s in summaries]

        # Replace the "<n>" newline marker emitted by Pegasus with a space.
        # The previous code replaced the empty string "", which inserts a
        # space between every character and corrupts the predictions.
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    # Finally compute and return the scores accumulated over all batches.
    score = metric.compute()
    return score

In [None]:

# ROUGE variants that are pulled out of the metric result for reporting.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# NOTE(review): load_metric is reportedly deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version.
rouge_metric = load_metric('rouge')


In [None]:
# Score the fine-tuned model on the first 10 test examples, two per batch.
score = calculate_metric_on_test_ds(
    testj[0:10],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size=2,
    column_text='dialogue',
    column_summary='summary',
)

# Keep only the mid F-measure of each ROUGE variant.
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}

pd.DataFrame(rouge_dict, index=['pegasus'])

In [None]:
# Load the saved tokenizer from local disk.
# NOTE(review): this reads from "/content/tokenizer", whereas the save cell
# writes the tokenizer to a Drive folder — confirm the files are copied there.

tokenizer = AutoTokenizer.from_pretrained("/content/tokenizer")

In [None]:
def trunc_func(summ):
  """Cut ``summ`` off after its last full stop so it ends on a complete sentence.

  Args:
    summ: The generated summary text.

  Returns:
    ``summ`` up to and including the last ``'.'``. The text is returned
    unchanged when it is empty, contains no ``'.'``, or its only ``'.'`` is
    the very first character.
  """
  # rfind returns -1 when there is no '.', and is safe on an empty string;
  # the previous manual backward scan indexed summ[-1] and raised IndexError.
  last_period = summ.rfind('.')
  if last_period > 0:
    return summ[:last_period + 1]
  return summ

In [None]:

# Prediction: summarize one test example and show it next to its reference.

# Generation settings forwarded to the summarization pipeline.
gen_kwargs = dict(length_penalty=0.8, num_beams=8, max_length=1024)

sample_text = testj[2]["dialogue"]
reference = testj[2]["summary"]

# NOTE(review): "pegasus-news" is a relative name, whereas the save cell wrote
# the model under the Drive output directory — confirm this resolves correctly.
pipe = pipeline(
    "summarization",
    model="pegasus-news",
    tokenizer=tokenizer,
)

print("Dialogue:", sample_text, sep="\n")

print("\nReference Summary:", reference, sep="\n")

print("\nModel Summary:")
# Trim the generated text back to its last complete sentence before printing.
print(
    trunc_func(
        pipe(sample_text, **gen_kwargs)[0]["summary_text"]
    )
)