# T5 - Fine tuning for Paraphrasing

We adapted the code from https://colab.research.google.com/github/google-research/text-to-text-transfer-transformer/blob/master/notebooks/t5-trivia.ipynb#scrollTo=zrtR2urJV3ST for paraphrasing. The original notebook is a tutorial by the T5 authors on fine-tuning pre-trained T5 for closed-book question answering.

This notebook is intended to be run in Google Colab with a TPU runtime.

In [None]:
# Colab-only setup: announce progress, select TensorFlow 2.x through the
# Colab line magic, and quietly install the `t5` package with pip.
print("Installing dependencies...")
%tensorflow_version 2.x
!pip install -q t5

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

# Root GCS location for this run; the raw data lives in a sub-directory of it.
BASE_DIR = "gs://t5_renny_new" #@param { type: "string" }
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
DATA_DIR = os.path.join(BASE_DIR, "data_paraphrase")
# When True, the TPU/GCS setup below is performed.
ON_CLOUD = True


if ON_CLOUD:
  print("Setting up GCS access...")
  import tensorflow_gcs_config
  from google.colab import auth
  TPU_TOPOLOGY = "2x2"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise a regular exception (not a bare BaseException) so ordinary
    # `except Exception` handlers still see it, and keep the original cause.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  # Set credentials for GCS reading/writing from Colab and TPU.
  auth.authenticate_user()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# The rest of the notebook uses the TF1-style API (tf.logging, etc.).
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  # Raise the root Python logger to INFO, and stop TensorFlow's own logger
  # from handing its records up to ancestor loggers.
  py_logging.getLogger().setLevel('INFO')
  tf.get_logger().propagate = False

@contextmanager
def tf_verbosity_level(level):
  """Temporarily set the TensorFlow logging verbosity.

  Args:
    level: verbosity to apply inside the `with` block (passed straight to
      `tf.logging.set_verbosity`).

  The verbosity that was active on entry is restored on exit, including when
  the body of the `with` block raises.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Without `finally`, an exception in the body would leave the temporary
    # verbosity in place for the rest of the session.
    tf.logging.set_verbosity(og_level)

In [None]:
import pickle
def dump_pickle(filename,obj):
    """Pickle `obj` and write the bytes to `filename` through tf.io.gfile.

    Args:
        filename: destination path, opened with `tf.io.gfile.GFile` in 'wb' mode.
        obj: any picklable Python object.
    """
    serialized = pickle.dumps(obj)
    with tf.io.gfile.GFile(filename,'wb') as sink:
        sink.write(serialized)

In [None]:
# Raw text file name for each split (resolved against DATA_DIR when read).
PR_SPLIT_FNAMES = dict(
    train="mscoco_train.txt",
    validation="mscoco_val.txt",
)
# TSV file for each split, stored under DATA_DIR.
pr_tsv_path = dict(
    train=os.path.join(DATA_DIR, "mscoco_train.tsv"),
    validation=os.path.join(DATA_DIR, "mscoco_val.tsv"),
)
# JSON file holding the per-split example counts.
pr_counts_path = os.path.join(DATA_DIR, "mscoco-counts.json")
# Prefix (trailing slash included) for the pickled predictions.
generated_text_path = os.path.join(DATA_DIR,'generated_text/')



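Upload the raw text files (`mscoco_train.txt` and `mscoco_val.txt`) to `DATA_DIR` on GCS before running the next cell.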


## Convert text files to TSV

In [None]:
import gzip
import json

# Public directory of Natural Questions data on GCS.
#NQ_JSONL_DIR = "gs://natural_questions/v1.0-simplified/"

def nq_jsonl_to_tsv(in_fname, out_fname):
  """Convert a paraphrase text file into a two-column TSV file.

  Each input line is expected to look like `original>>>>>>paraphrase`. Lines
  equal to the `<|end of text|>` marker and blank lines are skipped. Lines
  without the `>>>>>>` delimiter are skipped with a warning instead of
  aborting the whole conversion.

  Args:
    in_fname: path of the input text file (opened with tf.io.gfile.GFile).
    out_fname: path of the TSV file to write, one `original\tparaphrase`
      row per example.

  Returns:
    The number of examples written to `out_fname`.
  """
  delimiter = ">>>>>>"
  end_marker = "<|end of text|>"

  def clean(field):
    # A literal tab inside a sentence would add a third column and break
    # the two-field TSV rows, so replace it with a space.
    return field.replace("\t", " ")

  count = 0
  with tf.io.gfile.GFile(in_fname, "r") as infile,\
       tf.io.gfile.GFile(out_fname, "w") as outfile:
    for line_no, line in enumerate(infile, start=1):
      # Drop the line terminator (handles both "\n" and "\r\n").
      line = line.rstrip("\r\n")
      if not line or line == end_marker:
        continue
      fields = line.split(delimiter)
      if len(fields) < 2:
        tf.logging.warn(
            "Skipping malformed line %d in %s: missing '%s' delimiter.",
            line_no, in_fname, delimiter)
        continue

      # Write this line as <original>\t<paraphrase>
      outfile.write("%s\t%s\n" % (clean(fields[0]), clean(fields[1])))
      count += 1
      tf.logging.log_every_n(
          tf.logging.INFO,
          "Wrote %d examples to %s." % (count, out_fname),
          1000)
  return count

if tf.io.gfile.exists(pr_counts_path):
  # Use cached data and counts: the counts file is only written after the
  # TSVs have been generated.
  tf.logging.info("Loading NQ from cache.")
  with tf.io.gfile.GFile(pr_counts_path) as counts_file:
    num_nq_examples = json.load(counts_file)
else:
  # Create TSVs and get counts.
  tf.logging.info("Generating Ori-Para TSVs.")
  num_nq_examples = {}
  for split, fname in PR_SPLIT_FNAMES.items():
    print(split)
    print(fname)
    num_nq_examples[split] = nq_jsonl_to_tsv(
        os.path.join(DATA_DIR, fname), pr_tsv_path[split])
  # Close the file explicitly so the counts are flushed before later cells
  # (or a later run) read them back.
  with tf.io.gfile.GFile(pr_counts_path, "w") as counts_file:
    json.dump(num_nq_examples, counts_file)

## Function to load the TSV data as a tf.data.Dataset in TensorFlow.

In [8]:
def pr_dataset_fn(split, shuffle_files=False):
  """Load one split's TSV file as a dataset of sentence-pair dicts.

  Args:
    split: key into `pr_tsv_path` selecting which TSV file to read.
    shuffle_files: accepted but ignored.

  Returns:
    A dataset whose elements are {"original": ..., "paraphrase": ...} dicts.
  """
  # Every split is a single file, so there is nothing to shuffle.
  del shuffle_files

  column_names = ["original", "paraphrase"]
  # Parses one "<original>\t<paraphrase>" line into two string fields.
  parse_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["", ""],
      field_delim="\t",
      use_quote_delim=False)

  def to_named_fields(*fields):
    return dict(zip(column_names, fields))

  lines = tf.data.TextLineDataset(pr_tsv_path[split])
  parsed = lines.map(
      parse_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return parsed.map(to_named_fields)

# Sanity check: convert the first five raw validation examples to NumPy and
# print them as {"original": ..., "paraphrase": ...} dicts.
print("A few raw validation examples...")
for ex in tfds.as_numpy(pr_dataset_fn("validation").take(5)):
  print(ex)

A few raw validation examples...
{'original': b'the living room of a home with a large sectional couch', 'paraphrase': b'a very net living room with a big  peace couch under a large rug'}
{'original': b'a very net living room with a big  peace couch under a large rug', 'paraphrase': b'the living room of a home with a large sectional couch'}
{'original': b'a l shaped couch in the living room', 'paraphrase': b'a large sectional sofa is sitting to the far left of a television screen'}
{'original': b'a large sectional sofa is sitting to the far left of a television screen', 'paraphrase': b'a l shaped couch in the living room'}
{'original': b'a man wearing headphones while chewing on an ink pen', 'paraphrase': b'a man wearing headphones with a toothbrush in his mouth'}


In [None]:
def sentence_preprocessor(ds):
  """Map {"original", "paraphrase"} examples to {"inputs", "targets"}.

  Both sentences are normalized. "inputs" is the normalized original sentence
  prefixed with "original sentence: ", and "targets" is the normalized
  paraphrase.

  Args:
    ds: dataset of dicts with string fields "original" and "paraphrase".

  Returns:
    The mapped dataset of {"inputs": ..., "targets": ...} dicts.
  """
  input_prefix = "original sentence: "

  def _clean(raw):
    """Lowercase a string tensor and strip an enclosing pair of single quotes.

    The pattern is greedy, so it spans from the first to the last single
    quote and keeps the text in between.
    """
    lowered = tf.strings.lower(raw)
    return tf.strings.regex_replace(lowered, "'(.*)'", r"\1")

  def _as_model_example(example):
    source = _clean(example["original"])
    target = _clean(example["paraphrase"])
    return {
        "inputs": tf.strings.join([input_prefix, source]),
        "targets": target,
    }

  return ds.map(
      _as_model_example,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

## Register task to registry

In [None]:
# Remove any existing "paraphrasing" entry before adding it again
# (presumably so this cell can be re-executed; confirm against the t5 API).
t5.data.TaskRegistry.remove("paraphrasing")
t5.data.TaskRegistry.add(
    "paraphrasing",
    # Splits served by the dataset function below.
    splits=["train", "validation"],
    # Returns the raw tf.data.Dataset for a split.
    dataset_fn=pr_dataset_fn,
    # Turns raw examples into {"inputs", "targets"} text.
    text_preprocessor=[sentence_preprocessor],
    # Lowercases text before metrics are computed.
    postprocess_fn=t5.data.postprocessors.lower_text,
    # Accuracy is the evaluation metric.
    metric_fns=[t5.evaluation.metrics.accuracy],
    # Optional per-split example counts; helps for mixing and auto-caching.
    num_input_examples=num_nq_examples,
)

In [14]:
# Look up the registered task and build its validation split, with inputs and
# targets each given a sequence length of 128.
pr_task = t5.data.TaskRegistry.get("paraphrasing")
ds = pr_task.get_dataset(split="validation", sequence_length={"inputs": 128, "targets": 128})
print("A few preprocessed validation examples...")
# Convert the first five preprocessed examples to NumPy and print them.
for ex in tfds.as_numpy(ds.take(5)):
  print(ex)

A few preprocessed validation examples...
{'inputs_plaintext': b'original sentence: a woman wearing a multi colored  striped sweater holds her arms up triumphantly as a kite flies high in the sky', 'inputs': array([  926,  7142,    10,     3,     9,  2335,  5119,     3,     9,
        1249, 11999,     3, 27465, 19469,  4532,   160,  6026,    95,
       20020,   288,   120,    38,     3,     9,  3650,    15,     3,
          89,  4664,   306,    16,     8,  5796,     1]), 'targets_plaintext': b'a woman flying a kite in the blue cloud filled sky', 'targets': array([   3,    9, 2335, 7070,    3,    9, 3650,   15,   16,    8, 1692,
       3126, 3353, 5796,    1])}
{'inputs_plaintext': b'original sentence: a large area rug lines the floor of a room', 'inputs': array([ 926, 7142,   10,    3,    9,  508,  616, 9787, 2356,    8, 1501,
         13,    3,    9,  562,    1]), 'targets_plaintext': b'a room with a red and white carpet some blue drums and a piano', 'targets': array([   3,    9,  562

In [None]:
#Sentences generated from test split of tags
# These template-style movie-preference sentences are the inputs that get
# paraphrased after fine-tuning.
# NOTE(review): original_7 contains "movies movies"; original_17 and
# original_20 use a typographic apostrophe (’) while the others use ASCII (').
# Confirm whether these are intentional before changing the strings.

original_1 = "You like robbery movies, especially if they are keri russell." 
original_2 = "You don't like pop culture references movies, unless they are heartfelt." 
original_3 = "You don't like ghost story movies, especially if they are not inappropriate music." 
original_4 = "You like bollywood movies, especially if they are not fighting the system." 
original_5 = "You like nasa movies if they are identity theft."
original_6 = "You don't like pixar animation movies, especially if they are not halloween theme."
original_7 = "You like modern fantasy movies movies, unless they are intelligent thriller."
original_8 = "You don't like scandal movies if they are artificial human."
original_9 = "You like beautiful cinematography movies, especially if they are vistavision."
original_10 = "You don't like tarantino movies, unless they are interesting concept."
original_11 = "You like anarchy movies, especially if they are not neo-noir."

original_12 = "You don't like heartwarming movies if they are action thriller."
original_13 = "You don't like colourful movies if they are female power."
original_14 = "You don't like british comedy movies, unless they are fast-paced."
original_15 = "You don't like action thriller movies if they are espionage."
original_16 = "You like clever plot movies if they are dream within a dream."
original_17 = "You don’t like romantic comedy movies, especially if they are idiotic."
original_18 = "You like brilliant movies, especially if they are not poor script."
original_19 = "You like musical movies if they are hillarious."
original_20 = "You don’t like personality disorder movies, especially if they are no chemistry."
# Free-form sentences that do not follow the movie-preference template.
fun1 = "The birds are flying in the blue sky"
fun2 = "I hear a very lovely piano sound from the next room"
fun3 = "My dog is chasing neighbor's cat"
fun4 = "The wolf (Canis lupus) is a large canine native to Eurasia and North America"
fun5 = "SpaceX's Crew Dragon and Falcon 9 make their first crewed launch for NASA"

# All sentences, in the order their predictions are later read back and
# paired with the model outputs.
test_data = [original_1,original_2,original_3,original_4,original_5,original_6,original_7,original_8,original_9,original_10,original_11,
             original_12,original_13,original_14,original_15,original_16,original_17,original_18,original_19,original_20, fun1, fun2, fun3,fun4,fun5]


## Fine-tune on the training data and generate paraphrases for the test sentences

In [None]:
#MODELS_DIR = os.path.join(BASE_DIR, "models_paraphrase")
MODEL_SIZE = "3B" #@param["small", "base", "large", "3B", "11B"]
# Public GCS path for T5 pre-trained model checkpoints
BASE_PRETRAINED_DIR = "gs://t5-data/pretrained_models"
PRETRAINED_DIR = os.path.join(BASE_PRETRAINED_DIR, MODEL_SIZE)

# One independent fine-tuning run (starting from the pre-trained checkpoint,
# in its own model directory) is made for each step count in this list.
FINETUNE_STEPS_LIST = [15,200,1000,1500]
temp = 1 #must be 0 if beam size = >1
beam_size = 1
# Enforce the constraint above up front, before any expensive fine-tuning.
if beam_size > 1 and temp != 0:
  raise ValueError(
      "temp must be 0 when beam_size > 1 (got temp=%s, beam_size=%s)."
      % (temp, beam_size))

# These checks depend only on MODEL_SIZE, so run them once rather than on
# every loop iteration.
if ON_CLOUD and MODEL_SIZE == "3B":
  tf.logging.warn(
      "The `3B` model is too large to use with the 5GB GCS free tier. "
      "Make sure you have at least 25GB on GCS before continuing."
  )
elif ON_CLOUD and MODEL_SIZE == "11B":
  raise ValueError(
      "The `11B` parameter is too large to fine-tune on the `v2-8` TPU "
      "provided by Colab. Please comment out this Error if you're running "
      "on a larger TPU."
  )

# Set parallelism and batch size to fit on v2-8 TPU (if possible).
# Limit number of checkpoints to fit within 5GB (if possible).
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]

# Maps model_name -> {input sentence -> generated paraphrase}.
predictions = {}
for FINETUNE_STEPS in FINETUNE_STEPS_LIST:
  #Run_name
  model_name = MODEL_SIZE+"_metr_acc"+"_"+PR_SPLIT_FNAMES["train"].split(".")[0]+"_temp"+str(temp)+"_beam"+str(beam_size)+"_"+str(FINETUNE_STEPS)
  MODELS_DIR = os.path.join(BASE_DIR, model_name)
  MODEL_DIR = os.path.join(MODELS_DIR, MODEL_SIZE)

  predictions[model_name] = {}

  tf.io.gfile.makedirs(MODEL_DIR)
  # The models from our paper are based on the Mesh Tensorflow Transformer.
  model = t5.models.MtfModel(
      model_dir=MODEL_DIR,
      tpu=TPU_ADDRESS,
      tpu_topology=TPU_TOPOLOGY,
      model_parallelism=model_parallelism,
      batch_size=train_batch_size,
      sequence_length={"inputs": 128, "targets": 128},
      learning_rate_schedule=0.003,
      save_checkpoints_steps=1000,
      keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
      iterations_per_loop=100,
  )

  print("##############################Fine tuning for model {} ################################".format(model_name))
  #fine tune model
  model.finetune(
      mixture_or_task_name="paraphrasing",
      pretrained_model_dir=PRETRAINED_DIR,
      #pretrained_checkpoint_step = 1000120,
      finetune_steps=FINETUNE_STEPS
  )


  print("##############################Prediction for model {} ################################".format(model_name))
  #Predict based on test data and save to pickle
  now = time.time()
  # Write out the test sentences to a text file; the timestamp keeps the
  # file names of separate runs apart.
  predict_inputs_path = os.path.join(MODEL_DIR, model_name+"_predict_inputs_%d.txt" % now)
  predict_outputs_path = os.path.join(MODEL_DIR, model_name+ "_predict_outputs_%d.txt" % now)
  # Manually apply preprocessing: lowercase the sentence and prepend the
  # same lowercase "original sentence: " prefix that sentence_preprocessor
  # uses for training, so training and inference inputs match.
  with tf.io.gfile.GFile(predict_inputs_path, "w") as f:
    for q in test_data:
      f.write("original sentence: %s\n" % q.lower())

  # Ignore any logging so that we only see the model's paraphrases.
  with tf_verbosity_level('ERROR'):
    model.batch_size = 8  # Min size for small model on v2-8 with parallelism 1.
    model.predict(
      input_file=predict_inputs_path,
      output_file=predict_outputs_path,
      # Decoding settings come from `beam_size` and `temp` defined above.
      # NOTE(review): with beam_size 1, temperature 0 presumably picks the
      # most probable token and a non-zero temperature samples; confirm
      # against the t5 documentation.
      beam_size=beam_size,
      temperature=float(temp),
    )


  # The output filename will have the checkpoint appended so we glob to get
  # the latest.
  prediction_files = sorted(tf.io.gfile.glob(predict_outputs_path + "*"))
  print("\nPredictions using checkpoint %s:\n" % prediction_files[-1].split("-")[-1])
  with tf.io.gfile.GFile(prediction_files[-1]) as f:
    # Output lines are paired with the inputs by position; each `a` keeps
    # its trailing newline.
    for q, a in zip(test_data, f):
      if q:
        predictions[model_name][q] = a
        print("original: " + q)
        print("paraphrase: " + a)
        print()

# saving all generated paraphrase in pickle file, with model name as key.
# The file name is derived from the decoding settings so it stays accurate if
# they change (beam_size=1, temp=1 gives "..._beam1_temp1_mscoco_only.pkl").
dump_pickle(generated_text_path+"predictions_ordered_beam%s_temp%s_mscoco_only.pkl" % (beam_size, temp),predictions)

