# Set up the Training Infrastructure

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# -p keeps this cell re-runnable: plain `mkdir` fails once the directory exists.
!mkdir -p data

In [3]:
 !curl -L https://www.dropbox.com/s/xgvawan1vpuels9/train_light.csv?dl=1 -o data/train.csv -s
 !curl -L https://www.dropbox.com/s/fhon3pf1gf3dog6/val.csv?dl=1 -o data/val.csv -s
 !curl -L https://www.dropbox.com/s/yz5abappvpinxyr/test.csv?dl=1 -o data/test.csv -s
 !curl -L https://www.dropbox.com/s/1n76cka49r5ctje/train.csv?dl=1 -o data/train_long.csv -s

In [4]:
!pip install torch==1.4.0 -q
!pip install transformers==2.9.0 -q
!pip install pytorch_lightning==0.7.5 -q

[K     |████████████████████████████████| 753.4MB 21kB/s 
[31mERROR: torchvision 0.7.0+cu101 has requirement torch==1.6.0, but you'll have torch 1.4.0 which is incompatible.[0m
[K     |████████████████████████████████| 645kB 9.2MB/s 
[K     |████████████████████████████████| 890kB 24.2MB/s 
[K     |████████████████████████████████| 3.8MB 45.2MB/s 
[K     |████████████████████████████████| 1.1MB 56.7MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 235kB 8.9MB/s 
[K     |████████████████████████████████| 829kB 15.1MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone


In [5]:
# @title Output
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


INFO:transformers.file_utils:PyTorch version 1.4.0 available.
INFO:transformers.file_utils:TensorFlow version 2.3.0 available.


In [6]:
def set_seed(seed):
  """Seed the `random`, NumPy and PyTorch generators with the same value.

  When CUDA is available, every CUDA device generator is seeded as well.
  """
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [7]:
# -p keeps this cell re-runnable: plain `mkdir` fails once the directory exists.
!mkdir -p output

## Model

We'll be using [pytorch-lightning](https://github.com/PytorchLightning/pytorch-lightning) for training. Most of the code below is adapted from [HuggingFace](https://github.com/huggingface/transformers/blob/master/examples/lightning_base.py).

Definition of hyperparameters and other run settings.

In [8]:
# Hyperparameters and run settings, kept as a plain dict keyed by option name.
args_dict = {
    # Paths
    "data_dir": "data",  # path for data files
    "output_dir": "output",  # path to save the checkpoints
    # Pretrained model / tokenizer identifiers
    "model_name_or_path": 't5-small',
    "tokenizer_name_or_path": 't5-small',
    "max_seq_length": 512,
    # Optimizer and schedule
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    # Batching and training length
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 16,
    "n_gpu": 1,
    # "early_stop_callback": False,
    # Mixed precision
    "fp_16": False,  # if you want to enable 16-bit training then install apex and set this to true
    "opt_level": 'O1',  # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "max_grad_norm": 1.0,  # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    "seed": 42,
}

In [9]:
# @title class T5FineTuner(pl.LightningModule)
class T5FineTuner(pl.LightningModule):
  """LightningModule wrapping a pretrained T5 model and its tokenizer for fine-tuning.

  `hparams` must expose the attributes read by the methods below:
  `model_name_or_path`, `tokenizer_name_or_path`, `learning_rate`, `weight_decay`,
  `adam_epsilon`, `warmup_steps`, `train_batch_size`, `eval_batch_size`,
  `num_train_epochs`, `gradient_accumulation_steps` and `n_gpu`. It is also passed
  on to `get_dataset`, which reads further attributes from it.
  """

  def __init__(self, hparams):
    super(T5FineTuner, self).__init__()
    self.hparams = hparams

    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

  def is_logger(self):
    """Always returns True; `LoggingCallback` checks this before reporting metrics."""
    return True

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
  ):
    """Delegate to the wrapped T5 model, passing every argument through unchanged."""
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        lm_labels=lm_labels,
    )

  def _step(self, batch):
    """Run one forward pass on `batch` and return the first element of the model output as the loss.

    `batch` must provide "source_ids", "source_mask", "target_ids" and "target_mask".
    """
    # Clone before masking: the boolean-mask assignment below writes in place, and
    # without the copy it would overwrite the pad ids stored in batch["target_ids"].
    lm_labels = batch["target_ids"].clone()
    # Padding positions are relabelled -100; presumably the loss skips that value
    # (TODO confirm against the pinned transformers version).
    lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        lm_labels=lm_labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]

    return loss

  def training_step(self, batch, batch_idx):
    """Compute the loss for one training batch and expose it under "train_loss" for logging."""
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}

  def training_epoch_end(self, outputs):
    """Average the per-step "loss" values collected over the epoch."""
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def validation_step(self, batch, batch_idx):
    """Compute the loss for one validation batch."""
    loss = self._step(batch)
    return {"val_loss": loss}

  def validation_epoch_end(self, outputs):
    """Average the per-batch "val_loss" values and report the mean under "val_loss"."""
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    """Create the AdamW optimizer with two parameter groups.

    Parameters whose name contains "bias" or "LayerNorm.weight" get a weight decay
    of 0.0; all others use `hparams.weight_decay`. The optimizer is also stored on
    `self.opt` because `train_dataloader` attaches the learning-rate schedule to it.
    """

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]

  def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None,using_native_amp=None):
    """Apply one optimizer update, clear the gradients, then advance the LR scheduler."""
    if self.trainer.use_tpu:
      # NOTE(review): `xm` is not imported anywhere in the visible notebook, so this
      # branch would raise NameError on TPU — confirm where it is meant to come from.
      xm.optimizer_step(optimizer)
    else:
      optimizer.step()
    optimizer.zero_grad()
    # `self.lr_scheduler` only exists after `train_dataloader` has been called.
    self.lr_scheduler.step()

  def get_tqdm_dict(self):
    """Return the running average loss (3 decimals) and the last learning rate for display."""
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    """Build the shuffled training DataLoader and create the linear warmup schedule.

    The schedule is attached to `self.opt`, so `configure_optimizers` must have run first.
    """
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    # Total scheduler steps: batches per epoch (per device), divided by the
    # accumulation factor, times the number of epochs.
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    """Build the validation DataLoader (no shuffling)."""
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [10]:
# @title logger
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  """Callback that writes `trainer.callback_metrics` to the module-level `logger`.

  The "log" and "progress_bar" entries are never reported. After testing, the
  same lines are also written to `<output_dir>/test_results.txt`.
  """

  def on_validation_end(self, trainer, pl_module):
    """Log every reportable metric, sorted by name, after validation."""
    logger.info("***** Validation results *****")
    if not pl_module.is_logger():
      return

    metrics = trainer.callback_metrics
    for name in sorted(metrics):
      if name in ["log", "progress_bar"]:
        continue
      logger.info("{} = {}\n".format(name, str(metrics[name])))

  def on_test_end(self, trainer, pl_module):
    """Log every reportable metric after testing and save the same lines to a text file."""
    logger.info("***** Test results *****")
    if not pl_module.is_logger():
      return

    metrics = trainer.callback_metrics
    results_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
    with open(results_path, "w") as writer:
      for name in sorted(metrics):
        if name in ["log", "progress_bar"]:
          continue
        line = "{} = {}\n".format(name, str(metrics[name]))
        logger.info(line)
        writer.write(line)

## Paraphrase Generation finetuned on PAWS Dataset

In [11]:
# CSV paths of two of the downloaded splits; `val_data` is not read in this cell.
train_data = 'data/train.csv'
val_data = 'data/val.csv'
train = pd.read_csv(train_data)
# Keep only the first 25 rows of this in-memory frame. This does not shrink the
# training set: the training dataloader re-reads the CSV through `get_dataset`.
train = train[:25]
print(train.head())

                                           sentence1                                          sentence2
0  They were there to enjoy us and they were ther...  They were there for us to enjoy and they were ...
1  After the end of the war in June 1902 , Higgin...  In August , after the end of the war in June 1...
2  From the merger of the Four Rivers Council and...  Shawnee Trails Council was formed from the mer...
3  The group toured extensively and became famous...  The group toured extensively and was famous in...
4  Kathy and her husband Pete Beale ( Peter Dean ...  Kathy and her husband Peter Dean ( Pete Beale ...


### Prepare Dataset

In [12]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')

INFO:filelock:Lock 140232797761096 acquired on /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f.lock
INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpkbjvos_m


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…

INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model in cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f
INFO:transformers.file_utils:creating metadata file for /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f
INFO:filelock:Lock 140232797761096 released on /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f.lock
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model from cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f




In [13]:
# @title ParaphraseDataset(Dataset)
class ParaphraseDataset(Dataset):
    """Sentence pairs read from `<data_dir>/<type_path>.csv`, tokenized once at construction.

    The "sentence1" column is the source (prefixed with "paraphrase: ") and the
    "sentence2" column is the target; " </s>" is appended to both before they are
    encoded with `tokenizer.batch_encode_plus`.
    """

    def __init__(self, tokenizer, data_dir, type_path, max_len=256, truncation=True):
        # NOTE(review): `truncation` is accepted but never read inside this class.
        self.tokenizer = tokenizer
        self.max_len = max_len

        self.source_column = "sentence1"
        self.target_column = "sentence2"

        self.path = os.path.join(data_dir, type_path + '.csv')
        self.data = pd.read_csv(self.path)

        # Filled by _build(): one encoding per CSV row, in row order.
        self.inputs = []
        self.targets = []
        self._build()

    def __len__(self):
        """Number of encoded examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the squeezed ids and attention masks of example `index` as a dict."""
        source = self.inputs[index]
        target = self.targets[index]
        return {
            "source_ids": source["input_ids"].squeeze(),
            "source_mask": source["attention_mask"].squeeze(),
            "target_ids": target["input_ids"].squeeze(),
            "target_mask": target["attention_mask"].squeeze(),
        }

    def _encode(self, text):
        """Encode one string as a single-item batch padded to `max_len`, returned as "pt" tensors."""
        return self.tokenizer.batch_encode_plus(
            [text], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt"
        )

    def _build(self):
        """Encode every row of the CSV into `self.inputs` / `self.targets`."""
        for row in range(len(self.data)):
            source_text = self.data.loc[row, self.source_column]
            target_text = self.data.loc[row, self.target_column]

            prompt = "paraphrase: " + source_text + ' </s>'
            reference = target_text + " </s>"

            encoded_prompt = self._encode(prompt)
            encoded_reference = self._encode(reference)

            self.inputs.append(encoded_prompt)
            self.targets.append(encoded_reference)
    

In [14]:
dataset = ParaphraseDataset(tokenizer, data_dir='data', type_path='val')
len(dataset)

3536

In [15]:
data = dataset[100]
print(tokenizer.decode(data['source_ids']))
print(tokenizer.decode(data['target_ids']))

paraphrase: It was credited to Lennon - McCartney, but John Lennon often stated that he wrote it.
It was credited to Lennon -- McCartney, but John Lennon often stated that he wrote it.


### Train

In [16]:
args = argparse.Namespace(**args_dict)

In [17]:
# Checkpoints go to the output directory, monitored on "val_loss" (lower is better), save_top_k=5.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=5
)

# Keyword arguments for `pl.Trainer`, derived from the hyperparameters in `args`.
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    # early_stop_callback=False,
    # Honor the `fp_16` hyperparameter instead of always forcing 32-bit; with the
    # default `fp_16=False` this still evaluates to 32.
    precision=16 if args.fp_16 else 32,
    # amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
)

In [18]:
def get_dataset(tokenizer, type_path, args):
  """Build the `ParaphraseDataset` for the split named `type_path`.

  The data directory and maximum sequence length are read from `args.data_dir`
  and `args.max_seq_length`.
  """
  dataset_kwargs = dict(
      tokenizer=tokenizer,
      data_dir=args.data_dir,
      type_path=type_path,
      max_len=args.max_seq_length,
  )
  return ParaphraseDataset(**dataset_kwargs)

**Initialize model**

In [19]:
model = T5FineTuner(args)

INFO:filelock:Lock 140232797551528 acquired on /root/.cache/torch/transformers/26561bc9e840d8945f475d0d4c4b9df32025eadd79894b867b570cb1d09e67a9.3817cc1260a6b941b17af62b4f2a942b9825f209d8e2eed99e79e96f85f59aab.lock
INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmp0xitbpot


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1197.0, style=ProgressStyle(description…

INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json in cache at /root/.cache/torch/transformers/26561bc9e840d8945f475d0d4c4b9df32025eadd79894b867b570cb1d09e67a9.3817cc1260a6b941b17af62b4f2a942b9825f209d8e2eed99e79e96f85f59aab
INFO:transformers.file_utils:creating metadata file for /root/.cache/torch/transformers/26561bc9e840d8945f475d0d4c4b9df32025eadd79894b867b570cb1d09e67a9.3817cc1260a6b941b17af62b4f2a942b9825f209d8e2eed99e79e96f85f59aab
INFO:filelock:Lock 140232797551528 released on /root/.cache/torch/transformers/26561bc9e840d8945f475d0d4c4b9df32025eadd79894b867b570cb1d09e67a9.3817cc1260a6b941b17af62b4f2a942b9825f209d8e2eed99e79e96f85f59aab.lock
INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json from cache at /root/.cache/torch/transformers/26561bc9e840d8945f475d0d4c4b9df32025eadd79894b867b570cb1d09e67a9.3817cc1260a6b941b17af62b4f2a942b982




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242065649.0, style=ProgressStyle(descri…

INFO:transformers.file_utils:storing https://cdn.huggingface.co/t5-small-pytorch_model.bin in cache at /root/.cache/torch/transformers/9b662cba85524bef76fff5eb77d767407ac36f3fe492869331c011efd2b3a082.388aab7f5c8ed273dc71eb98334d76a3caf1b3280b476c1c77fba861c65445f3
INFO:transformers.file_utils:creating metadata file for /root/.cache/torch/transformers/9b662cba85524bef76fff5eb77d767407ac36f3fe492869331c011efd2b3a082.388aab7f5c8ed273dc71eb98334d76a3caf1b3280b476c1c77fba861c65445f3
INFO:filelock:Lock 140232797551528 released on /root/.cache/torch/transformers/9b662cba85524bef76fff5eb77d767407ac36f3fe492869331c011efd2b3a082.388aab7f5c8ed273dc71eb98334d76a3caf1b3280b476c1c77fba861c65445f3.lock
INFO:transformers.modeling_utils:loading weights file https://cdn.huggingface.co/t5-small-pytorch_model.bin from cache at /root/.cache/torch/transformers/9b662cba85524bef76fff5eb77d767407ac36f3fe492869331c011efd2b3a082.388aab7f5c8ed273dc71eb98334d76a3caf1b3280b476c1c77fba861c65445f3





INFO:transformers.modeling_utils:Weights of T5ForConditionalGeneration not initialized from pretrained model: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model from cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f


**Initialize trainer**

In [20]:
trainer = pl.Trainer(**train_params)

INFO:lightning:GPU available: True, used: True
INFO:lightning:CUDA_VISIBLE_DEVICES: [0]


**start fine-tuning**

In [None]:
trainer.fit(model)

INFO:lightning:
    | Name                                                                  | Type                       | Params
-----------------------------------------------------------------------------------------------------------------
0   | model                                                                 | T5ForConditionalGeneration | 60 M  
1   | model.shared                                                          | Embedding                  | 16 M  
2   | model.encoder                                                         | T5Stack                    | 35 M  
3   | model.encoder.block                                                   | ModuleList                 | 18 M  
4   | model.encoder.block.0                                                 | T5Block                    | 3 M   
5   | model.encoder.block.0.layer                                           | ModuleList                 | 3 M   
6   | model.encoder.block.0.layer.0                                     

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

In [None]:
# -p keeps this cell re-runnable: plain `mkdir` fails once the directory exists.
!mkdir -p t5_base_paraphrase

In [None]:
## save the model this way so next time you can load it using T5ForConditionalGeneration.from_pretrained
model.model.save_pretrained('t5_base_paraphrase/')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!cp -r t5_base_paraphrase drive/My\ Drive/

In [None]:
print('Saved model.')

### Eval

For inference we will use the `generate` method with greedy decoding with max length 2.

In [None]:
# import textwrap
# from tqdm.auto import tqdm
# from sklearn import metrics

Let's visualize a few predictions on the test dataset.

In [None]:
# # dataset = ImdbDataset(tokenizer, 'aclImdb', 'test',  max_len=512)
# dataset = ParaphraseDataset(tokenizer, '/content/drive/My Drive/t5/data', 'test',  max_len=512)
# loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
# model.model.eval()
# outputs = []
# targets = []
# for batch in tqdm(loader):
#   outs = model.model.generate(input_ids=batch['source_ids'].cuda(), 
#                               attention_mask=batch['source_mask'].cuda(), 
#                               max_length=2)

#   dec = [tokenizer.decode(ids) for ids in outs]
#   target = [tokenizer.decode(ids) for ids in batch["target_ids"]]
  
#   outputs.extend(dec)
#   targets.extend(target)

# `it` was never defined in this notebook, so build the batch iterator here from
# the `dataset` created earlier, using the loader settings of the commented-out
# cell above (batch_size=32, shuffle=True).
it = iter(DataLoader(dataset, batch_size=32, shuffle=True))
batch = next(it)
batch["source_ids"].shape

Now predict on the entire test dataset.

In [None]:
# loader = DataLoader(dataset, batch_size=32, num_workers=4)
# model.model.eval()
# outputs = []
# targets = []
# for batch in tqdm(loader):
#   outs = model.model.generate(input_ids=batch['source_ids'].cuda(), 
#                               attention_mask=batch['source_mask'].cuda(), 
#                               max_length=2)

#   dec = [tokenizer.decode(ids) for ids in outs]
#   target = [tokenizer.decode(ids) for ids in batch["target_ids"]]
  
#   outputs.extend(dec)
#   targets.extend(target)

Let's check if the model generates any invalid text

This is great! Our model hasn't generated any invalid predictions. Let's calculate accuracy and other metrics.

In [None]:
# metrics.accuracy_score(targets, outputs)

In [None]:
# print(metrics.classification_report(targets, outputs))

In [None]:
# pip freeze > requirements_t5_finetuning_seb_force_old_versions.txt

# Inference

In [None]:
!pip install torch==1.4.0 -q
!pip install transformers==2.9.0 -q
!pip install pytorch_lightning==0.7.5 -q

In [None]:
# import torch
# from transformers import T5ForConditionalGeneration,T5Tokenizer
 
# def set_seed(seed):
#   torch.manual_seed(seed)
#   if torch.cuda.is_available():
#     torch.cuda.manual_seed_all(seed)

# set_seed(42)

# ### To Do: GET THE MODEL HERE - PRELOAD WITH GDOWN OR SO


# model = T5ForConditionalGeneration.from_pretrained('t5_paws')
# tokenizer = T5Tokenizer.from_pretrained('t5-base')

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print ("device ",device)
# model = model.to(device)

# sentence = "The sun was shining that day."


# text =  "paraphrase: " + sentence + " </s>"


# max_len = 256

# encoding = tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
# input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)


# # set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
# beam_outputs = model.generate(
#     input_ids=input_ids, attention_mask=attention_masks,
#     do_sample=True,
#     max_length=256,
#     top_k=120,
#     top_p=0.98,
#     early_stopping=True,
#     num_return_sequences=10
# )

# print ("\nOriginal Phrase: ")
# print (sentence)
# print ("\n")
# print ("Paraphrased Phrase: ")
# final_outputs =[]
# for beam_output in beam_outputs:
#     sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
#     if sent.lower() != sentence.lower() and sent not in final_outputs:
#         final_outputs.append(sent)

# for i, final_output in enumerate(final_outputs):
#     print("{}: {}".format(i, final_output))
# device  cpu