# Generate Paraphrases from Fine-Tuned T5 Models

We adapted the code from https://colab.research.google.com/github/google-research/text-to-text-transfer-transformer/blob/master/notebooks/t5-trivia.ipynb#scrollTo=zrtR2urJV3ST for paraphrasing. The original notebook is a tutorial by the T5 authors on fine-tuning a pre-trained T5 model for closed-book question answering.

This notebook is to be run in Google Colab using TPU.

In [None]:
print("Installing dependencies...")
%tensorflow_version 2.x
!pip install -q t5

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

# Root GCS bucket/directory that holds the fine-tuned models and the data.
BASE_DIR = "gs://t5_renny_new" #@param { type: "string" }
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
DATA_DIR = os.path.join(BASE_DIR, "data_paraphrase")
ON_CLOUD = True


if ON_CLOUD:
  print("Setting up GCS access...")
  # These packages are only imported when running on Colab with a TPU runtime.
  import tensorflow_gcs_config
  from google.colab import auth
  TPU_TOPOLOGY = "2x2"
  try:
    # Keep the try body minimal: only TPU detection is guarded here.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  except ValueError as err:
    # Raise a regular exception (not BaseException) and keep the original
    # cause attached so the traceback shows why TPU detection failed.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  TPU_ADDRESS = tpu.get_master()
  print('Running on TPU:', TPU_ADDRESS)
  # Set credentials for GCS reading/writing from Colab and TPU.
  auth.authenticate_user()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  # Stop TensorFlow's logger from forwarding records to the root logger,
  # then set the root logger's level to INFO.
  tf_logger = tf.get_logger()
  tf_logger.propagate = False
  py_logging.getLogger().setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily set TensorFlow's logging verbosity inside a `with` block.

  Args:
    level: Verbosity passed to `tf.logging.set_verbosity` while the block runs.

  Yields:
    None. The verbosity that was active on entry is restored on exit.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Restore the previous level even if the body of the `with` block raises,
    # so a failed call does not leave logging silenced for later cells.
    tf.logging.set_verbosity(og_level)

In [None]:
# Movie-preference sentences to paraphrase. The first four are exposed as
# Colab form fields through the `#@param` annotations.
original_1 = "You like robbery movies, especially if they are keri russell." #@param {type:"string"}
original_2 = "You don't like pop culture references movies, unless they are heartfelt." #@param {type:"string"}
original_3 = "You don't like ghost story movies, especially if they are not inappropriate music." #@param {type:"string"}
original_4 = "You like bollywood movies, especially if they are not fighting the system." #@param {type:"string"}
original_5 = "You like nasa movies if they are identity theft."
original_6 = "You don't like pixar animation movies, especially if they are not halloween theme."
original_7 = "You like modern fantasy movies movies, unless they are intelligent thriller."
original_8 = "You don't like scandal movies if they are artificial human."
original_9 = "You like beautiful cinematography movies, especially if they are vistavision."
original_10 = "You don't like tarantino movies, unless they are interesting concept."
original_11 = "You like anarchy movies, especially if they are not neo-noir."

original_12 = "You don't like heartwarming movies if they are action thriller."
original_13 = "You don't like colourful movies if they are female power."
original_14 = "You don't like british comedy movies, unless they are fast-paced."
original_15 = "You don't like action thriller movies if they are espionage."
original_16 = "You like clever plot movies if they are dream within a dream."
original_17 = "You don’t like romantic comedy movies, especially if they are idiotic."
original_18 = "You like brilliant movies, especially if they are not poor script."
original_19 = "You like musical movies if they are hillarious."
original_20 = "You don’t like personality disorder movies, especially if they are no chemistry."

# General sentences that are not about movie preferences.
fun1 = "The birds are flying in the blue sky"
fun2 = "I hear a very lovely piano sound from the next room"
fun3 = "My dog is chasing neighbor's cat"
fun4 = "The wolf (Canis lupus) is a large canine native to Eurasia and North America"
fun5 = "SpaceX's Crew Dragon and Falcon 9 make their first crewed launch for NASA"

# All inputs in a fixed order: the twenty movie sentences first, then the
# five general ones.
test_data = [
    original_1, original_2, original_3, original_4, original_5,
    original_6, original_7, original_8, original_9, original_10,
    original_11, original_12, original_13, original_14, original_15,
    original_16, original_17, original_18, original_19, original_20,
] + [
    fun1, fun2, fun3, fun4, fun5,
]



In [None]:
MODEL_SIZE = "3B"

# Names of the fine-tuned models (sub-directories of BASE_DIR) to generate
# paraphrases with.
list_tuned_model_name = [
    "3B_mixed_allMscoco_50kTemplate_ordered_train_temp1_beam1_50",
    "3B_mixed_allMscoco_50kTemplate_ordered_train_temp1_beam1_100",
    "3B_mixed_allMscoco_50kTemplate_ordered_train_temp1_beam1_150",
    "3B_mixed_allMscoco_50kTemplate_ordered_train_temp1_beam1_200",
    "3B_mixed_allMscoco_50kTemplate_ordered_train_temp1_beam1_250",
]

# Decoding settings shared by every model; they do not change per iteration.
temp = 1
beam = 1

# Set parallelism and batch size to fit on v2-8 TPU (if possible).
# Limit number of checkpoints to fit within 5GB (if possible).
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]

# Maps model name -> {original sentence -> generated paraphrase}.
predictions = {}
for tuned_model_name in list_tuned_model_name:
  TUNED_MODELS_DIR = os.path.join(BASE_DIR, tuned_model_name)
  LOAD_MODEL_DIR = os.path.join(TUNED_MODELS_DIR, MODEL_SIZE)

  model = t5.models.MtfModel(
      model_dir=LOAD_MODEL_DIR,
      tpu=TPU_ADDRESS,
      tpu_topology=TPU_TOPOLOGY,
      model_parallelism=model_parallelism,
      batch_size=train_batch_size,
      sequence_length={"inputs": 128, "targets": 128},
      learning_rate_schedule=0.003,
      save_checkpoints_steps=1000,
      keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
      iterations_per_loop=100,
  )

  print("##############################Prediction for model {} ################################".format(tuned_model_name))
  # Timestamp used to give this run's input/output files unique names.
  now = time.time()

  # Inputs and outputs live in the same scratch directory. Building it once
  # keeps the two paths consistent (the output path previously used the
  # misspelt directory "playaroud/").
  scratch_dir = os.path.join(LOAD_MODEL_DIR, "playaround/")
  predict_inputs_path = os.path.join(scratch_dir, tuned_model_name + "_predict_inputs_%d.txt" % now)
  predict_outputs_path = os.path.join(scratch_dir, tuned_model_name + "_predict_outputs_%d.txt" % now)

  # Write the test sentences to a text file, one per line, manually applying
  # the preprocessing: lower-case and prepend "Original sentence: ".
  with tf.io.gfile.GFile(predict_inputs_path, "w") as f:
    for q in test_data:
      f.write("Original sentence: %s\n" % q.lower())

  # Ignore any logging so that we only see the model's paraphrases.
  with tf_verbosity_level('ERROR'):
    model.batch_size = 8  # Min size for small model on v2-8 with parallelism 1.
    model.predict(
        input_file=predict_inputs_path,
        output_file=predict_outputs_path,
        # NOTE(review): with beam_size=1 and temperature=1.0 decoding is
        # presumably sampled rather than greedy -- confirm against the t5 docs.
        beam_size=beam,
        temperature=float(temp),
    )

  predictions[tuned_model_name] = {}
  # The output filename will have the checkpoint appended so we glob to get
  # the latest.
  prediction_files = sorted(tf.io.gfile.glob(predict_outputs_path + "*"))
  if not prediction_files:
    # Fail with a clear message instead of an IndexError on the lookup below.
    raise FileNotFoundError(
        "No prediction output found matching %s*" % predict_outputs_path)
  print("\nPredictions using checkpoint %s:\n" % prediction_files[-1].split("-")[-1])
  with tf.io.gfile.GFile(prediction_files[-1]) as f:
    # Output lines are paired with the inputs by position. Each line is
    # stored exactly as read from the file.
    for q, a in zip(test_data, f):
      if q:
        predictions[tuned_model_name][q] = a
        print("original: " + q)
        print("paraphrase: " + a)
        print()




In [None]:
import os
import pickle
def dump_pickle(filename,obj):
    """Pickle `obj` and write it to `filename` through `tf.io.gfile.GFile`.

    Args:
        filename: Destination path, opened in binary write mode.
        obj: Object to serialise with `pickle.dump`.
    """
    with tf.io.gfile.GFile(filename, mode='wb') as sink:
        pickle.dump(obj, sink)

In [None]:
# Directory under DATA_DIR where the generated paraphrases are stored.
generated_text_path = os.path.join(DATA_DIR, 'generated_text/', 'playaround/')
# Save the predictions of all models as one pickle file in that directory.
dump_pickle(
    os.path.join(generated_text_path, "predictions_small_steps_ordered_beam1_temp1_second.pkl"),
    predictions,
)

In [None]:
# NOTE(review): this looks like a scratch cell -- it replaces `predictions`
# with a single placeholder entry, discarding any results held in memory.
predictions = {"3B_mixed_allMscoco_50kTemplate_ordered_train_temp1_beam1_50": 1}