<a href="https://colab.research.google.com/github/research-clone/notebook_tutorials/blob/main/Paraphrasing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's runtime dependencies with shell-level pip.
# NOTE(review): no versions are pinned, so behaviour depends on whatever pip
# resolves on the day the notebook is run.

# Provides load_dataset, used to fetch the paraphrase corpus.
!pip install datasets
# Provides the Pegasus model/tokenizer and the Seq2Seq trainer classes imported later.
!pip install transformers[torch]
# Presumably required by PegasusTokenizer — TODO confirm.
!pip install SentencePiece
# -U upgrades any preinstalled copy; presumably required by the Trainer — TODO confirm.
!pip install accelerate -U
# Provides evaluate.load, used to load the ROUGE metric.
!pip install evaluate
# Presumably the scoring backend behind evaluate's "rouge" metric — TODO confirm.
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=bd74997a81f5c29539825a4c66cc2ee4c2f7205cedbec0ec8d6ca9a7ec04da74
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
from datasets import load_dataset

# Load the "humarin/chatgpt-paraphrases" dataset.
# Later cells read its 'train' split and the 'text' / 'paraphrases' columns.
dataset = load_dataset("humarin/chatgpt-paraphrases")



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
import torch, nltk
import evaluate
import numpy as np
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, DataCollatorWithPadding, Trainer
# nltk.sent_tokenize (used by compute_metrics for ROUGE-Lsum) needs the Punkt
# sentence-splitting data. Recent NLTK releases look it up under the name
# "punkt_tab", older ones under "punkt". Pip installs are unpinned here, so
# request both; otherwise the first evaluation pass can fail with a
# LookupError only after a full training epoch has already been spent.
# NOTE(review): on an NLTK version that does not know one of these ids the
# quiet download is expected to report failure rather than raise — confirm.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource, quiet=True)

True

In [None]:
# Checkpoint to fine-tune; both the tokenizer and the weights come from it.
model_name = 'tuner007/pegasus_paraphrase'

# Run on the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Load the pretrained weights, then move the model onto the chosen device.
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)

In [None]:
# Pull the two columns of the train split that the following cells work with:
# the source sentences ('text') and their paraphrase entries ('paraphrases').
inputs = dataset['train']['text']
labels = dataset['train']['paraphrases']

In [None]:
# Chop the first 50,000 source sentences into consecutive slices of 8.
input_batches = []
for start in range(0, 50000, 8):
    input_batches.append(inputs[start:start + 8])

In [None]:
# Build target batches that line up with input_batches: for every slice of 8
# examples out of the first 50,000, keep the first paraphrase of each example.
#
# Two defects of the previous loop are fixed here:
#   * the inner loop variable was also called `labels`, which overwrote the
#     module-level `labels` column after the first slice;
#   * each finished batch was never appended, so labels_batches stayed empty.
#
# NOTE(review): this assumes every entry of `labels` is a sequence of
# paraphrase strings. If the column is stored as a stringified list, `[0]`
# yields a single character — verify against the loaded dataset.
labels_batches = []
for start in range(0, 50000, 8):
    first_paraphrases = [paraphrases[0] for paraphrases in labels[start:start + 8]]
    labels_batches.append(first_paraphrases)

In [None]:
# Tokenize every batch up front. Each call truncates over-long sequences, pads
# the batch to its longest member and returns PyTorch tensors ("pt").
# input_batches / labels_batches are the slices built in the preceding cells.
# NOTE(review): neither result appears to be consumed by the Trainer cells
# visible in this notebook — confirm before relying on them.
input_ids = [tokenizer(batch, truncation=True, padding='longest', return_tensors="pt") for batch in input_batches]
labels_ids = [tokenizer(batch, truncation=True, padding='longest', return_tensors="pt") for batch in labels_batches]

In [None]:
from torch.utils.data import Dataset, DataLoader

class Paraphrasing_Dataset(Dataset):
    """Map-style dataset pairing each source sentence with its first paraphrase.

    Items are tokenized lazily in ``__getitem__`` and returned as plain dicts
    (the tokenizer's output for the source text plus a ``labels`` entry for the
    target), which is the per-example format a seq2seq data collator pads.

    Args:
        dataset: mapping with a ``'text'`` column (source sentences) and a
            ``'paraphrases'`` column (one entry of candidate paraphrases per
            sentence).
        tokenizer: callable accepting ``(text, text_target=..., truncation=...)``
            and returning a mapping of encoded features.
    """

    def __init__(self, dataset, tokenizer):
        self.tokenizer = tokenizer
        self.inputs = dataset['text']
        self.labels = [self._first_paraphrase(entry) for entry in dataset['paraphrases']]

    @staticmethod
    def _first_paraphrase(entry):
        """Return the first paraphrase stored in ``entry``.

        ``entry`` is normally a sequence of strings. It may also be the textual
        form of such a list (e.g. ``"['a', 'b']"``) — presumably the case when
        the column was loaded from a CSV file; verify against the dataset. In
        that form, plain ``entry[0]`` would silently yield the character ``'['``.
        """
        if not isinstance(entry, str):
            return entry[0]

        import ast  # local import: only needed for stringified entries

        try:
            parsed = ast.literal_eval(entry)
        except (ValueError, SyntaxError):
            # Not a Python literal: treat the string itself as the paraphrase.
            return entry
        if isinstance(parsed, (list, tuple)) and parsed:
            return parsed[0]
        return entry

    def __len__(self):
        # Previously read a non-existent `self.input_ids` attribute.
        return len(self.inputs)

    def __getitem__(self, index):
        """Tokenize example ``index`` and return its features as a dict."""
        # Previously read a non-existent `self.input` attribute and returned raw
        # strings, which a padding collator cannot batch.
        encoded = self.tokenizer(
            self.inputs[index],
            text_target=self.labels[index],
            truncation=True,
        )
        # Return a plain dict regardless of the tokenizer's own mapping type.
        return dict(encoded)

In [None]:
# Carve both subsets out of the 'train' split: rows 0-49,999 for training and
# the next 5,000 rows (50,000-54,999) for validation.
# NOTE(review): Paraphrasing_Dataset indexes the slice by column name
# ('text', 'paraphrases'), so the slice is assumed to be a mapping of columns — confirm.
train_set = Paraphrasing_Dataset(dataset['train'][:50000], tokenizer)
valid_set = Paraphrasing_Dataset(dataset['train'][50000:55000], tokenizer)

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

In [None]:
# Batch collator handed to the trainer below; it is built from the same
# tokenizer and model that are being fine-tuned.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# ROUGE metric object; compute_metrics calls metric.compute(...) on decoded text.
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Decode generated and reference token ids and score them with ROUGE.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also arrive as a tuple whose first element holds the ids.

    Returns:
        The result of ``metric.compute`` on the decoded predictions and
        references (stemming enabled).
    """
    preds, labels = eval_preds
    # Keep only the token ids if extra outputs were bundled alongside them.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks positions to ignore and is not a real token id, so it cannot be
    # decoded. Swap it for the pad token in BOTH arrays: predictions can carry
    # it too once batches of different lengths have been concatenated.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

In [None]:
# Hyperparameters for the fine-tuning run.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; installs are unpinned, so confirm the installed version
# still accepts this keyword.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",          # where the run's outputs are written
    evaluation_strategy="epoch",     # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,              # cap on how many checkpoints are kept
    num_train_epochs=2,
    predict_with_generate=True       # evaluate on generated sequences, which compute_metrics decodes
)

In [None]:
# Wire the model, arguments, datasets, collator and metric function together.
# train_set / valid_set are the Paraphrasing_Dataset instances created earlier.
# NOTE(review): recent transformers releases deprecate the `tokenizer=` keyword
# in favour of `processing_class=` — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=valid_set,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Start fine-tuning with the configuration above (2 epochs, evaluation after each).
trainer.train()