In [1]:
#!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q



In [2]:
#!pip install --upgrade accelerate
#!pip uninstall -y transformers accelerate
#!pip install transformers accelerate

Defaulting to user installation because normal site-packages is not writeable




Found existing installation: transformers 4.39.2
Uninstalling transformers-4.39.2:
  Successfully uninstalled transformers-4.39.2
Found existing installation: accelerate 0.28.0
Uninstalling accelerate-0.28.0:
  Successfully uninstalled accelerate-0.28.0
Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Using cached transformers-4.39.2-py3-none-any.whl (8.8 MB)
Collecting accelerate
  Using cached accelerate-0.28.0-py3-none-any.whl (290 kB)
Installing collected packages: accelerate, transformers
Successfully installed accelerate-0.28.0 transformers-4.39.2




In [3]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
# Fetch the NLTK "punkt" data package (a no-op when it is already up to date).
# NOTE(review): presumably needed by `sent_tokenize`, which is imported above — confirm it is still used.
nltk.download("punkt")

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gorantla.krishna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset,load_from_disk,load_metric

In [5]:
import torch

In [6]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [7]:
# Checkpoint identifier (a string, not a model object); it is passed to both
# `from_pretrained` calls below to load the tokenizer and the model weights.
model = "google/pegasus-cnn_dailymail"

In [8]:
# Tokenizer that belongs to the checkpoint named in `model`.
tokenizer = AutoTokenizer.from_pretrained(model)

In [9]:
# Load the seq2seq model weights for the checkpoint and move them to `device`.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model).to(device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Training of the model (fine-tuning)
# Inference (after loading the pretrained model we only run prediction)

In [11]:
# Load the "samsum" dataset; it comes with train, test and validation splits,
# each holding `id`, `dialogue` and `summary` columns.
dataset = load_dataset("samsum")

In [12]:
# Show the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [13]:
# Index the row first and the column second: this reads one example instead of
# pulling the whole `dialogue` column just to take its first entry.
print(dataset["train"][0]["dialogue"])

Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)


In [14]:
# Index the row first and the column second: this reads one example instead of
# pulling the whole `summary` column just to take its first entry.
print(dataset["train"][0]["summary"])

Amanda baked cookies and will bring Jerry some tomorrow.


### Goal: summarize each conversation (dialogue) into a short summary

In [15]:
# Number of examples in every split, in the order the splits are stored.
split_lengths = list(map(len, dataset.values()))

In [16]:
# Display the per-split example counts computed above.
split_lengths

[14732, 819, 818]

In [17]:
# Column names of the training split.
dataset["train"].column_names

['id', 'dialogue', 'summary']

In [18]:
def convert_examples_to_features(data_in_batch):
    """Tokenize a batch of dialogues and summaries into model features.

    Args:
        data_in_batch: Mapping with a "dialogue" and a "summary" entry, each
            holding the texts of one batch.

    Returns:
        dict with "input_ids" and "attention_mask" taken from the tokenized
        dialogues (truncated to 1024 tokens) and "labels" holding the token
        ids of the summaries (truncated to 128 tokens).
    """
    dialogue_tokens = tokenizer(data_in_batch["dialogue"], max_length=1024, truncation=True)
    summary_tokens = tokenizer(data_in_batch["summary"], max_length=128, truncation=True)

    # Encoder-side fields come from the dialogue, the labels from the summary.
    features = {field: dialogue_tokens[field] for field in ("input_ids", "attention_mask")}
    features["labels"] = summary_tokens["input_ids"]
    return features

In [19]:
# Tokenize every split batch-wise; the keys returned by
# `convert_examples_to_features` are added as columns next to the original ones.
dataset_encoded = dataset.map(convert_examples_to_features,batched=True)

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

In [20]:
# Inspect the first encoded training example: the original columns plus
# `input_ids`, `attention_mask` and `labels`.
dataset_encoded["train"][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.',
 'input_ids': [12195,
  151,
  125,
  7091,
  3659,
  107,
  842,
  119,
  245,
  181,
  152,
  10508,
  151,
  7435,
  147,
  12195,
  151,
  125,
  131,
  267,
  650,
  119,
  3469,
  29344,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [12195, 7091, 3659, 111, 138, 650, 10508, 181, 3469, 107, 1]}

In [21]:
from transformers import TrainingArguments, Trainer

In [22]:
# FineTuning the model with new data

# Hyper-parameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="pegasus-samsum",
    # Optimisation schedule.
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # One example per device step, accumulated over 16 steps per update.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    # Logging, evaluation and checkpoint cadence (all counted in steps).
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1e6,
)

In [23]:
from  transformers import DataCollatorForSeq2Seq
# Collator built from the tokenizer and the model; it is handed to the Trainer
# below as `data_collator`.
# NOTE(review): presumably pads inputs and labels per batch — see the DataCollatorForSeq2Seq docs.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer,model=model_pegasus)

In [26]:
# Fine-tune on the *train* split. Training on the test split would leak the very
# examples that the ROUGE evaluation further down is computed on.
# For a quick smoke run, pass a small subset of the train split instead, e.g.
# dataset_encoded["train"].select(range(1000)).
trainer = Trainer(model = model_pegasus,
                  args = training_args,
                  tokenizer=tokenizer,
                  data_collator = seq2seq_data_collator,
                  train_dataset = dataset_encoded["train"],
                  eval_dataset = dataset_encoded["validation"]
                  )

In [None]:
# Run the fine-tuning loop with the trainer configured above.
trainer.train()

In [None]:
# Evaluation of the model
# Helper functions for batched evaluation

def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily cut ``list_of_elements`` into consecutive slices of ``batch_size`` items.

    The final slice is shorter when the length is not a multiple of
    ``batch_size``; an empty input yields nothing.
    """
    chunk_starts = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[start:start + batch_size] for start in chunk_starts)



def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    """Summarise ``dataset[column_text]`` in batches and score the result.

    Args:
        dataset: Mapping from column name to a sliceable sequence of strings.
        metric: Object providing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Object providing ``generate``. Only the tokenized inputs are
            moved to ``device`` here; the model itself is not moved.
        tokenizer: Callable that encodes a batch of texts; its ``decode``
            method turns generated ids back into text.
        batch_size: Number of examples per generation batch.
        device: Device the input tensors are moved to.
        column_text: Key of the source texts in ``dataset``.
        column_summary: Key of the reference summaries in ``dataset``.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024,  truncation=True,
                        padding="max_length", return_tensors="pt")

        # Beam search with 8 beams, capped at 128 tokens; length_penalty=0.8 is
        # intended to discourage overly long summaries.
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device),
                         length_penalty=0.8, num_beams=8, max_length=128)

        # Decode the generated ids back into text.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
               for s in summaries]

        # PEGASUS marks line breaks with the literal "<n>" token; turn it into a
        # space. (Replacing the empty string "" instead would insert a space
        # between every character and corrupt the predictions being scored.)
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    # Finally compute and return the scores accumulated over all batches.
    score = metric.compute()
    return score



In [None]:
# ROUGE variants that are pulled out of the score for the results table.
rouge_names = ["rouge1","rouge2","rougeL","rougeLsum"]
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in favour of the
# separate `evaluate` package — confirm against the installed version before upgrading.
rouge_metric = load_metric("rouge")

In [None]:
# Score the first ten test dialogues with the trainer's model, two at a time.
score = calculate_metric_on_test_ds(
    dataset["test"][0:10],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size=2,
    column_text="dialogue",
    column_summary="summary",
)

# Keep only the mid-point F-measure of every ROUGE variant.
rouge_dict = {name: score[name].mid.fmeasure for name in rouge_names}

pd.DataFrame(rouge_dict, index=["pegasus"])

In [None]:
# Save the model
# Writes the model object that was handed to the Trainer into the relative
# directory "pegasus-samsum-model" (resolved against the current working directory).

model_pegasus.save_pretrained("pegasus-samsum-model")

In [None]:
# Save the Tokenizer
# Written to the relative directory "samsum-tokenizer" (resolved against the
# current working directory).
tokenizer.save_pretrained("samsum-tokenizer")

In [None]:
# Load the Tokenizer
# Read from the same relative directory the tokenizer was saved to; the absolute
# "/content/..." path only matches when the working directory is /content.
tokenizer = AutoTokenizer.from_pretrained("samsum-tokenizer")

In [None]:
# Load the model
# Read from the same relative directory the model was saved to; the absolute
# "/content/..." path only matches when the working directory is /content.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained("pegasus-samsum-model").to(device)

In [None]:
# Second training dialogue, used as the demo input below. Indexing the row
# first reads one example instead of the whole `dialogue` column.
sample_text = dataset["train"][1]["dialogue"]

In [None]:
# Show the dialogue that is summarised in the cells below.
print(sample_text)

In [None]:
# Wrap the reloaded model and tokenizer in a summarization pipeline.
summary_pipeline = pipeline("summarization",model=model_pegasus,tokenizer=tokenizer)

In [None]:
# Generation settings; the same values are used inside `calculate_metric_on_test_ds`.
gen_kwargs = {"length_penalty":0.8,"num_beams":8,"max_length":128}

In [None]:
# Summarise the sample dialogue with the generation settings defined above.
summary_pipeline(sample_text,**gen_kwargs)