<a href="https://colab.research.google.com/github/rahulgundala007/NLP_text_summarization/blob/main/Pegasus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [3]:
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

# Download the Punkt sentence-tokenizer data for nltk (sent_tokenize is imported above).
# NOTE(review): recent NLTK releases may look for the "punkt_tab" resource instead —
# confirm against the installed NLTK version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint identifier shared by the tokenizer and the model.
model_ckpt = "google/pegasus-cnn_dailymail"

# Load the tokenizer and the seq2seq model from the same checkpoint,
# then move the model onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
model_pegasus = model_pegasus.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Yield consecutive slices of ``list_of_elements`` with at most ``batch_size`` items each.

    Slices come out in order; the last one is shorter whenever the number of
    elements is not a multiple of ``batch_size``.
    """
    total = len(list_of_elements)
    chunk_starts = range(0, total, batch_size)
    yield from (list_of_elements[start : start + batch_size] for start in chunk_starts)


In [6]:
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    """Generate summaries for ``dataset`` batch by batch and score them with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` are sliced into batches of ``batch_size``.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer used to encode the source texts and decode the
            generated token ids.
        batch_size: Number of examples summarised per ``generate`` call.
        device: Device the tokenized inputs are moved to before generation.
        column_text: Name of the column holding the source documents.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        The value returned by ``metric.compute()`` after every batch was added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                           padding="max_length", return_tensors="pt")

        # Beam search with 8 beams and at most 128 generated tokens. The length
        # penalty is used to keep the model from generating overly long sequences.
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        # Decode the generated ids back to text, dropping special tokens.
        decoded_summaries = [
            tokenizer.decode(s, skip_special_tokens=True,
                             clean_up_tokenization_spaces=True)
            for s in summaries
        ]

        # Replace the "<n>" newline token with a space before scoring.
        # The previous `replace("", " ")` had lost the token and inserted a space
        # between every single character, which corrupts the predictions.
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    # Finally compute and return the scores accumulated over all batches.
    score = metric.compute()
    return score

In [7]:
# Load the Terms-of-Service summarisation dataset by its repository id.
dataset_ToS = load_dataset("Rahulgundala007/Tos_Dataset")
# Bare expression so the notebook displays the available splits and columns.
dataset_ToS


DatasetDict({
    train: Dataset({
        features: ['Service_Name', 'Section Title', 'ToS_Detail', 'ToS_Summary'],
        num_rows: 69
    })
})

In [7]:
# Row count of every split, in the order the DatasetDict stores them.
split_lengths = [len(split_data) for split_data in dataset_ToS.values()]
train_columns = dataset_ToS['train'].column_names

print(f"Split lengths: {split_lengths}")
print(f"Features: {train_columns}")

# Show one raw training example: the full clause followed by its reference summary.
example_row = dataset_ToS["train"][1]
print("\nToS_Detail:", example_row["ToS_Detail"], sep="\n")
print("\nSummary:", example_row["ToS_Summary"], sep="\n")


Split lengths: [69]
Features: ['Service_Name', 'Section Title', 'ToS_Detail', 'ToS_Summary']

ToS_Detail:
Google LLC, based in the State of Delaware, USA, provides Google services and outlines the legal framework and operating laws applicable to the user

Summary:
Identification of Google LLC as the service provider and the legal basis of its operation


In [8]:
# Baseline before any fine-tuning: summarise the first ToS clause with the stock checkpoint.
first_clause = dataset_ToS['train'][0]['ToS_Detail']

pipe = pipeline(task='summarization', model=model_ckpt)
pipe_out = pipe(first_clause)

print(pipe_out)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Your max_length is set to 128, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)


[{'summary_text': 'These Terms of Service include what you can expect from Google as you use their services, what Google expects from you, the rights to the content you find in Google services, and the legal rights you have in case of problems or disagreements.'}]


In [9]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of ToS clauses and their reference summaries for seq2seq training.

    Args:
        example_batch: Mapping with a ``'ToS_Detail'`` column (source texts) and a
            ``'ToS_Summary'`` column (target texts).

    Returns:
        dict with ``input_ids`` and ``attention_mask`` for the sources (truncated to
        1024 tokens) and ``labels`` holding the token ids of the summaries
        (truncated to 128 tokens).
    """
    input_encodings = tokenizer(example_batch['ToS_Detail'], max_length=1024, truncation=True)

    # `text_target=` is the supported way to tokenize labels; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    target_encodings = tokenizer(text_target=example_batch['ToS_Summary'],
                                 max_length=128, truncation=True)

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

# Apply the tokenization to every split in batches; the new columns are added to the dataset.
dataset_ToS_pt = dataset_ToS.map(convert_examples_to_features, batched=True)

In [10]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to the Trainer below; it is built from the tokenizer and the model
# so that batches of the variable-length tokenized examples can be assembled.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [11]:
# Inspect one processed training example: the original text columns plus the
# tokenized fields added by convert_examples_to_features.
dataset_ToS_pt['train'][0]

{'Service_Name': 'Google',
 'Section Title': 'What’s covered in these terms',
 'ToS_Detail': 'These Terms of Service include what you can expect from Google as you use their services, what Google expects from you, the rights to the content you find in Google services, and the legal rights you have in case of problems or disagreements',
 'ToS_Summary': "An overview of user expectations, user responsibilities, content rights, and legal recourses covered by Google's Terms of Service.",
 'input_ids': [507,
  7787,
  113,
  1255,
  444,
  180,
  119,
  137,
  1337,
  135,
  1058,
  130,
  119,
  207,
  153,
  318,
  108,
  180,
  1058,
  10118,
  135,
  119,
  108,
  109,
  1420,
  112,
  109,
  601,
  119,
  258,
  115,
  1058,
  318,
  108,
  111,
  109,
  1165,
  1420,
  119,
  133,
  115,
  437,
  113,
  743,
  132,
  36325,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


In [12]:
pip install transformers[torch]



In [13]:
pip install accelerate -U




In [14]:
from transformers import TrainingArguments, Trainer

# The training split holds only 69 examples, so with a per-device batch of 1 and
# 16 gradient-accumulation steps a single epoch is just a handful of optimizer steps.
trainer_args = TrainingArguments(
    output_dir='pegasus-ToS', num_train_epochs=1,
    # The previous 500-step warm-up was far longer than the whole run, which keeps the
    # learning rate close to zero for every update; no warm-up is used instead.
    warmup_steps=0,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    # Step-based evaluation is not requested: the dataset has no evaluation split and
    # none is passed to the Trainer (newer transformers releases reject that combination).
    # save_steps is larger than the run, so no intermediate checkpoints are written;
    # it is given as an int because values >= 1 are interpreted as step counts.
    save_steps=1_000_000,
    gradient_accumulation_steps=16
)

In [15]:
# Build the Trainer on the only available split ("train"), using the seq2seq collator
# to assemble batches from the tokenized examples.
trainer = Trainer(model=model_pegasus, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_ToS_pt["train"])

# The former `dataloader_config = DataLoaderConfiguration(...)` statement was removed:
# `DataLoaderConfiguration` is never imported in this notebook, so the line raised a
# NameError on a fresh kernel, and its result was not passed to the Trainer.


In [None]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()

In [8]:
# Write the model to the local directory "pegasus-ToS-model".
# NOTE(review): this cell's execution count is lower than the training cell's —
# make sure it is run after trainer.train() in the same session, otherwise the
# checkpoint that gets saved is not the fine-tuned one.
model_pegasus.save_pretrained("pegasus-ToS-model")


Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


In [9]:
# Write the tokenizer files to their own local directory ("tokenizer/"),
# separate from the model directory.
tokenizer.save_pretrained("tokenizer")


('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')