# Text Generation

In [1]:
%%capture
!pip install transformers[torch]
!pip install sentencepiece
!pip install datasets
!pip install absl
!pip install rouge_score
!pip install protobuf==3.20
!pip install py7zr
!pip install acclerate -U

In [2]:
# device = "cuda" if torch.cuda.is_available() else "cpu"
device= "cuda"

In [11]:
from datasets import load_dataset


In [3]:
dataset = load_dataset("cnn_dailymail", version="3.0.0")
print(f"Features: {dataset['train'].column_names}")

Features: ['article', 'highlights', 'id']


In [4]:
from datasets import list_metrics, load_metric
rouge_metric = load_metric("rouge", cache_dir=None)


  rouge_metric = load_metric("rouge", cache_dir=None)


In [5]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

In [6]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])


def evaluate_summaries_baseline(dataset, metric,column_text="article",column_summary="highlights"):
    summaries = [three_sentence_summary(text) for text in dataset[column_text]]
    metric.add_batch(predictions=summaries, references=dataset[column_summary])
    score = metric.compute()
    return score

In [7]:
test_sampled = dataset["test"].shuffle(seed=42).select(range(100))

score = evaluate_summaries_baseline(test_sampled, rouge_metric)
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame.from_dict(rouge_dict, orient="index", columns= ["baseline"]).T


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.387685,0.165051,0.246191,0.350974


In [8]:
from tqdm import tqdm
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i : i + n]


def evaluate_summaries_pegasus(dataset, metric, model, tokenizer, batch_size=16, device=device,
                               column_text="article", column_summary="highlights"):
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))
    for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
        dct = tokenizer.batch_encode_plus(article_batch, max_length=1024, truncation=True,
                                          padding="max_length", return_tensors="pt")
        summaries = model.generate(input_ids=dct["input_ids"].to(device), length_penalty=0.8,
                                   attention_mask=dct["attention_mask"].to(device), num_beams=8, max_length=128)
        dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in summaries]
        dec = [d.replace("<n>", " ") for d in dec]
        metric.add_batch(predictions=dec, references=target_batch)
    score = metric.compute()
    return score

In [9]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

model_name = "google/pegasus-cnn_dailymail"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = (PegasusForConditionalGeneration.from_pretrained(model_name).to(device))

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
score = evaluate_summaries_pegasus(test_sampled, rouge_metric, model, tokenizer, batch_size=8, device=device)
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 13/13 [02:40<00:00, 12.32s/it]


In [13]:
pd.DataFrame.from_dict(rouge_dict, orient="index", columns= ["pegasus"]).T

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.42561,0.202191,0.299855,0.361003


In [14]:
rouge_dict

{'rouge1': 0.42560994242964456,
 'rouge2': 0.20219092151645182,
 'rougeL': 0.2998548474626779,
 'rougeLsum': 0.36100320427452914}

Evaluate PEGASUS output with Samsum summaries

In [12]:
dataset_samsum = load_dataset("samsum")
split_lengths = [len(dataset_samsum[split]) for split in dataset_samsum]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")
print("\nDialogue:")
print(dataset_samsum["test"][0]["dialogue"])
print("\nSummary:")
print(dataset_samsum["test"][0]["summary"])

Split lengths: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']

Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [12]:
score = evaluate_summaries_pegasus(dataset_samsum["test"], rouge_metric, model,
                                   tokenizer, column_text="dialogue",
                                   column_summary="summary", batch_size=8)

rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus"])

100%|██████████| 103/103 [19:45<00:00, 11.51s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.295961,0.08775,0.229215,0.229354


In [13]:
rouge_dict

{'rouge1': 0.2959613863327386,
 'rouge2': 0.08774986612703142,
 'rougeL': 0.22921461510956825,
 'rougeLsum': 0.2293541143118824}

Fine tuning of PEGASUS on samsum dataset

In [13]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["dialogue"], max_length=1024,
                                truncation=True)

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch["summary"], max_length=128,
                                     truncation=True)

    return {"input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]}

In [14]:
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features,
                                       batched=True)
columns = ["input_ids", "labels", "attention_mask"]
dataset_samsum_pt.set_format(type="torch", columns=columns)

In [15]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='pegasus-samsum', num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10, push_to_hub=True,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16)

In [18]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["train"],
                  eval_dataset=dataset_samsum_pt["validation"])

Cloning https://huggingface.co/pradeepiisc/pegasus-samsum into local empty directory.


In [20]:
trainer.train()




Step,Training Loss,Validation Loss
500,1.6312,1.485915


TrainOutput(global_step=920, training_loss=1.8192964564199032, metrics={'train_runtime': 3058.192, 'train_samples_per_second': 4.817, 'train_steps_per_second': 0.301, 'total_flos': 5526698901602304.0, 'train_loss': 1.8192964564199032, 'epoch': 1.0})

In [21]:
score = evaluate_summaries_pegasus(
    dataset_samsum["test"], rouge_metric, trainer.model, tokenizer,
    batch_size=2, column_text="dialogue", column_summary="summary")

rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=[f"pegasus"])

100%|██████████| 410/410 [15:25<00:00,  2.26s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.428544,0.198675,0.33893,0.339131


There is a considerable improvement from the baseline scores above where we used PEGASUS model without fine-tuning. Fine-Tuning it on the contextual dataset lead to an imrovement in all the rouge metrics.

In [22]:
trainer.push_to_hub("Training Complete!")

Upload file pytorch_model.bin:   0%|          | 1.00/2.13G [00:00<?, ?B/s]

Upload file runs/Aug04_13-24-22_5379fb7c03bd/events.out.tfevents.1691155516.5379fb7c03bd.8049.0:   0%|        …

Upload file spiece.model:   0%|          | 1.00/1.82M [00:00<?, ?B/s]

Upload file training_args.bin:   0%|          | 1.00/3.87k [00:00<?, ?B/s]

To https://huggingface.co/pradeepiisc/pegasus-samsum
   6f71d6b..4c6a3d2  main -> main

   6f71d6b..4c6a3d2  main -> main

To https://huggingface.co/pradeepiisc/pegasus-samsum
   4c6a3d2..00c113d  main -> main

   4c6a3d2..00c113d  main -> main



'https://huggingface.co/pradeepiisc/pegasus-samsum/commit/4c6a3d29bf3dccdf5529299cc323b6d1cb99a6cf'

Seq2Seq​Trainin⁠g​Arguments and Seq2SeqTrainer counterparts to TrainingArguments and Trainer
can also be used with predict_with_generate=True to evaluate during the training only


**END**