In [1]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
import torch
from tqdm import tqdm

from nltk.tokenize import sent_tokenize

In [3]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [4]:
# Hugging Face Hub checkpoint id; the tokenizer and the model below are both
# loaded from this same id.
model_cpkt = 'google/pegasus-cnn_dailymail'
tokenizer = AutoTokenizer.from_pretrained(model_cpkt)
# Loaded through the generic seq2seq auto class; the load log reports
# PegasusForConditionalGeneration as the resolved model class.
# NOTE(review): the load log also warns that the encoder/decoder
# `embed_positions` weights were newly initialized. Presumably these are
# Pegasus' fixed sinusoidal position embeddings that are not stored in the
# checkpoint -- confirm before relying on un-fine-tuned outputs.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_cpkt)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Load the SAMSum training CSV into a pandas DataFrame.
# The path is written with a forward slash: the backslash form contained the
# invalid escape sequence "\s" (a SyntaxWarning on recent Python versions) and
# only resolved on Windows. Forward slashes work on Windows, Linux and macOS.
dataset = pd.read_csv('summarizer-data/samsum-train.csv')

In [6]:
# Preview the first rows of the CSV-backed frame; the rendered columns are
# 'Unnamed: 0', 'id', 'dialogue' and 'summary'.
dataset.head()

Unnamed: 0,id,dialogue,summary
0,13818513,Amanda: I baked cookies. Do you want some?\r\...,Amanda baked cookies and will bring Jerry some...
1,13728867,Olivia: Who are you voting for in this electio...,Olivia and Olivier are voting for liberals in ...
2,13681000,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...",Kim may try the pomodoro technique recommended...
3,13730747,"Edward: Rachel, I think I'm in ove with Bella....",Edward thinks he is in love with Bella. Rachel...
4,13728094,Sam: hey overheard rick say something\r\nSam:...,"Sam is confused, because he overheard Rick com..."


In [7]:
# Show the raw dialogue text stored at row label 98 (single label-based lookup
# instead of chained indexing).
dataset.loc[98, "dialogue"]

"O'Neill: Is everything ok?\nO'Neill: I didn't hear back from you\nO'Neill: <file_gif>\nTed: Hey\nTed: I have been really busy today\nTed: Sorry..\nTed: Yes everything is fine ;)\nTed: I'll send you a photo later on :)\nO'Neill: Great!! 👏"

In [8]:
# Load the SAMSum dataset that was previously saved to local disk.
# The path is written with a forward slash: the backslash form contained the
# invalid escape sequence "\s" (a SyntaxWarning on recent Python versions) and
# only resolved on Windows. Forward slashes work on Windows, Linux and macOS.
dataset_samsum = load_from_disk('summarizer-data/samsum_dataset')

In [9]:
# Display the loaded DatasetDict: its splits, their features and row counts.
dataset_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [10]:
# print() (rather than a bare expression) so the newline-separated turns of the
# dialogue render on separate lines instead of as an escaped string repr.
print(dataset_samsum["test"][1]["dialogue"])

Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)


In [11]:
# Number of examples in each split, in the DatasetDict's own split order.
split_lengths = [len(split_ds) for split_ds in dataset_samsum.values()]
split_lengths

[14732, 819, 818]

In [12]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of examples into encoder inputs and decoder labels.

    Relies on the notebook-level ``tokenizer``.

    Args:
        example_batch: Mapping with a 'dialogue' entry and a 'summary' entry,
            each holding the texts of one batch (the shape a batched
            ``Dataset.map`` call passes in).

    Returns:
        dict with three keys:
            'input_ids' / 'attention_mask': tokenized dialogues, truncated to
                at most 1024 tokens.
            'labels': token ids of the summaries, truncated to at most 128
                tokens.
        No padding argument is passed to the tokenizer here.
    """
    dialogue_encodings = tokenizer(
        example_batch['dialogue'], truncation=True, max_length=1024
    )

    # `text_target=` tokenizes the summaries as targets. It replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager, and a
    # separate call keeps the summaries' own max_length of 128.
    summary_encodings = tokenizer(
        text_target=example_batch['summary'], truncation=True, max_length=128
    )

    return {
        'input_ids': dialogue_encodings['input_ids'],
        'attention_mask': dialogue_encodings['attention_mask'],
        'labels': summary_encodings['input_ids'],
    }
    

In [13]:
# Tokenize every split; batched=True hands the function whole batches of
# examples rather than one example at a time.
dataset_samsum_pt = dataset_samsum.map(
    function=convert_examples_to_features,
    batched=True,
)

Map: 100%|██████████| 818/818 [00:00<00:00, 2411.88 examples/s]


In [14]:
# Inspect the tokenized train split: the displayed features show the original
# columns plus 'input_ids', 'attention_mask' and 'labels'.
dataset_samsum_pt['train']

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})

In [15]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches the tokenized examples for the Trainer; it is given
# both the tokenizer and the Pegasus model.
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model_pegasus,
)

In [16]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration for the Pegasus run.
training_args = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=1,
    warmup_steps=500,
    # Batch size 1 per device with 16 accumulation steps: one optimizer update
    # per 16 examples.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    weight_decay=0.1,
    logging_steps=10,
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument.
    eval_strategy='steps',
    eval_steps=500,
    # Step counts must be integers (floats are reserved for ratios in [0, 1)).
    # One million steps is far more than a single epoch here, which in effect
    # disables intermediate checkpoint saves.
    save_steps=1_000_000,
)



In [17]:
# Build the Trainer for fine-tuning Pegasus on SAMSum.
trainer = Trainer(
    model=model_pegasus,
    args=training_args,
    # `processing_class` replaces the deprecated `tokenizer` keyword of
    # Trainer.__init__ (the old keyword emits a FutureWarning).
    processing_class=tokenizer,
    data_collator=seq2seq_data_collator,
    # Train on the train split. Fitting on 'test' leaks the held-out data into
    # the model, and its 819 rows yield only ~51 optimizer steps, so
    # warmup_steps=500 / eval_steps=500 would never be reached. For a quick
    # smoke run, take a small subset of the train split instead.
    train_dataset=dataset_samsum_pt['train'],
    eval_dataset=dataset_samsum_pt['validation'],
)

  trainer = Trainer(
