# T5 Fine_Tuning

In this notebook we fine-tune different models on the datasets we have already processed.

## NOTEBOOK SETTINGS

We recommend using the "High-RAM" setting for this notebook.
You can change this in the Colab menu: `Runtime > Change runtime type`


We start by setting up the environment: connecting Colab to the Google Cloud Storage (GCS) bucket and configuring everything for the TPU. (This Colab uses the TPU and High-RAM settings.)

In [None]:
import os
# presumably switches off Colab's ephemeral-credential auth flow — TODO confirm
os.environ['USE_AUTH_EPHEM'] = '0'

from google.colab import auth
# Interactive Google sign-in for this Colab session.
auth.authenticate_user()

#@title ## Set Your GCS credential
project_id = 'thesis-acr3' #@param {type:"string"}
bucket_name = 'automatic-code-review-3' #@param {type:"string"}

# Point the gcloud CLI at the project selected in the form above.
!gcloud config set project {project_id}

# NOTE(review): t5 is installed twice with different pins (0.9.0 here, then
# `-U t5==0.9.2` further down), so the later command decides the final version.
# NOTE(review): tensorflow-text is pinned to a 2.8 release candidate while
# tensorflow/keras are pinned to 2.7.0 — verify that pip does not replace the
# TensorFlow pin when resolving it.
# NOTE(review): mesh is installed from the repository head without a pin.
!pip3 install --upgrade pip
!pip3 install t5==0.9.0
!pip3 install tensorflow==2.7.0
!pip3 install keras==2.7.0
!pip3 install gin-config
!pip install git+https://github.com/tensorflow/mesh.git
!pip install -qU t5==0.9.2
!pip install -q tensorflow-text==2.8.0rc0

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

tf.flags.DEFINE_string('f','','')

#Set the base dir(Google cloud bucket)
BASE_DIR = "gs://" + bucket_name

# Fail early when the bucket name form field was left empty.
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
ON_CLOUD = True


if ON_CLOUD:
  import tensorflow_gcs_config
  from google.colab import auth
  # Set credentials for GCS reading/writing from Colab and TPU.
  # TPU_TOPOLOGY and TPU_ADDRESS are consumed later when the MtfModel is built.
  TPU_TOPOLOGY = "2x2"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise a regular Exception subclass instead of BaseException: BaseException
    # is reserved for interpreter-level exits and bypasses `except Exception`.
    # The original ValueError is kept as the cause for easier debugging.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  auth.authenticate_user()
  # NOTE(review): eager execution is enabled here and TF2 behaviour is disabled
  # right after this block; the statement order is kept exactly as it was.
  tf.compat.v1.enable_eager_execution(config=None, device_policy=None, execution_mode=None)
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# Disable TF2 behaviours for the rest of the notebook.
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  # Keep TensorFlow's log records from also being handled by the root logger.
  tf.get_logger().propagate = False
  # Let INFO-level (and above) records through the root Python logger.
  py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily set the TensorFlow logging verbosity inside a ``with`` block.

  Args:
    level: verbosity value passed to ``tf.logging.set_verbosity``.

  Yields:
    Nothing; the body of the ``with`` statement runs at the requested level.

  The verbosity that was active on entry is restored on exit, including when
  the body raises, so a failing cell cannot leave the notebook stuck at the
  temporary level.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Always put the previous verbosity back, even on error.
    tf.logging.set_verbosity(og_level)

We specify the paths and the sizes of all our datasets to later build our tasks.

In [None]:
## CLASSIFIER
nq_tsv_path_classifier = {
    "train":      'gs://' + bucket_name + 'dataset/fine-tuning/classifier/train.tsv',
    "validation": 'gs://' + bucket_name + 'dataset/fine-tuning/classifier/val.tsv',
    "test":       'gs://' + bucket_name + 'dataset/fine-tuning/classifier/test.tsv'

}

!gsutil cp {nq_tsv_path_classifier["train"]} ./train.tsv
!gsutil cp {nq_tsv_path_classifier["validation"]} ./val.tsv
!gsutil cp {nq_tsv_path_classifier["test"]} ./test.tsv

data_train = len([line for line in open('./train.tsv', 'r')])
data_val = len([line for line in open('./val.tsv', 'r')])
data_test = len([line for line in open('./test.tsv', 'r')])

num_nq_examples_classifier = dict(train=data_train, validation=data_val, test=data_test)

In [None]:
# Show the per-split example counts as the cell output.
num_nq_examples_classifier

We specify the model and vocab paths of the previously trained SentencePiece tokenizer model in the GCS bucket.

In [None]:
from t5.data import postprocessors as t5_postprocessors
from t5.seqio import Feature,SentencePieceVocabulary

# Locations of the SentencePiece tokenizer artefacts inside the bucket.
vocab_model_path = f'gs://{bucket_name}/tokenizer/TokenizerModel.model'
vocab_path = f'gs://{bucket_name}/tokenizer/TokenizerModel.vocab'

# Short aliases for the T5 registries/classes.
TfdsTask = t5.data.TfdsTask
TaskRegistry = t5.data.TaskRegistry


def get_default_vocabulary():
  """Build a fresh SentencePieceVocabulary backed by `vocab_model_path`.

  Returns:
    A new `SentencePieceVocabulary` instance on every call.
  """
  # Second positional argument is presumably the number of extra (sentinel)
  # ids — TODO confirm against the SentencePieceVocabulary signature.
  return SentencePieceVocabulary(vocab_model_path, 100)


# Feature spec shared by the tasks registered in this notebook: both sides use
# the same tokenizer and get an EOS token appended; "inputs" is optional.
DEFAULT_OUTPUT_FEATURES = dict(
    inputs=Feature(
        vocabulary=get_default_vocabulary(), add_eos=True, required=False),
    targets=Feature(
        vocabulary=get_default_vocabulary(), add_eos=True),
)

# Setting up the classifier task


## TASK : CLASSIFIER
- task name = `classifier`
- task prefix = `classifier: `

In [None]:
def nq_dataset_classifier(split, shuffle_files=True):
  """Load one split of the classifier TSV as a dataset of string pairs.

  Args:
    split: key into `nq_tsv_path_classifier` selecting the TSV to read.
    shuffle_files: accepted for interface compatibility and ignored, because
      every split is stored in a single file.

  Returns:
    A `tf.data` dataset whose elements are dicts with the keys "input" and
    "output", one element per line of the TSV.
  """
  # Single file per split, so there is nothing to shuffle at file level.
  del shuffle_files

  column_names = ["input", "output"]

  # Tab-separated, two columns, quotes treated as ordinary characters.
  # NOTE(review): `record_defaults` supplies default *values*, so an empty
  # field would presumably be filled with the literal text "string" — confirm
  # this is intended.
  parse_tsv_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string","string"],
      field_delim="\t",
      use_quote_delim=False)

  def _as_example(*columns):
    # Pair each parsed column with its name.
    return dict(zip(column_names, columns))

  raw_lines = tf.data.TextLineDataset(nq_tsv_path_classifier[split])
  parsed_rows = raw_lines.map(
      parse_tsv_line,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return parsed_rows.map(_as_example)

# Sanity check: print the first two raw rows of the validation and train splits.
for banner, split_name in (
    ("A few raw validation examples...", "validation"),
    ("A few raw training examples...", "train"),
):
  print(banner)
  for ex in tfds.as_numpy(nq_dataset_classifier(split_name).take(2)):
    print(ex)

def classifier_preprocessing(ds):
  """Turn raw {"input", "output"} rows into text-to-text examples.

  Args:
    ds: dataset of dicts with string entries under "input" and "output".

  Returns:
    A dataset of dicts with "inputs" (the input text prefixed with
    'classifier: ') and "targets" (the output text).
  """
  def _to_text_to_text(ex):
    prefixed_input = 'classifier: ' + ex['input']
    return {
        'inputs': tf.strings.join([prefixed_input], separator=' '),
        'targets': tf.strings.join([ex['output']], separator=' '),
    }

  return ds.map(
      _to_text_to_text,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  
# Drop any earlier registration first so the cell can be re-run
# (presumably a no-op when the task is not registered yet — TODO confirm).
t5.data.TaskRegistry.remove('classifier')
t5.data.TaskRegistry.add(
    "classifier",
    # Supplies the raw examples for a given split name.
    dataset_fn=nq_dataset_classifier,
    # "test" has to be declared as well: the evaluation cells further down call
    # model.eval(..., split="test") on this task, and both the TSV path table
    # and the example counts already contain a "test" entry.
    splits=["train", "validation", "test"],
    text_preprocessor=[classifier_preprocessing],
    output_features = DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples=num_nq_examples_classifier
)

# Fetch the task registered as "classifier" and materialise its train split
# with the same sequence lengths that are later given to the model.
nq_task = t5.data.TaskRegistry.get("classifier")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
# Sanity check: print the first three preprocessed examples.
print("A few preprocessed training examples...")
for ex in tfds.as_numpy(ds.take(3)):
  print(ex)

# Setting up fine tuning tasks

In [None]:
def _rate_num_input_examples(task):
  if "train" in task.splits:
    return float(task.num_input_examples("train"))
  elif "validation" in task.splits:
    return float(task.num_input_examples("validation"))
  elif "test" in task.splits:
    return float(task.num_input_examples("test"))
  else:
    raise ValueError("Task %s does not have a train or validation split." % (task.name))

In [None]:
## CLASSIFIER
# Remove any earlier registration so the cell can be re-run, then register a
# mixture named "classifier" that contains only the "classifier" task.
t5.data.MixtureRegistry.remove("classifier")
t5.data.MixtureRegistry.add(
    "classifier",
    ["classifier"],
    # Rate function: the example count of the task's first available split.
    default_rate=_rate_num_input_examples
)

Here we need to specify:
- whether we want to fine-tune a pre-trained model or train from scratch (and the path of the pre-trained model, if needed)
- the dataset we want to use: the new, larger dataset or the one by Tufano et al. (ICSE 2021)
- the downstream task

In [None]:
# our T5 selected architecture
MODEL_SIZE = "small"

#@title Select fine-tuning with or without pre-training
fine_tuning = "fine-tuning_with_pre-training/" #@param ["fine-tuning_with_pre-training/", "fine-tuning_without_pre-training/"]

if fine_tuning == "fine-tuning_with_pre-training/":
  # Specify the pre-trained dir which must contain the pre-trained models, the operative_config.gin file and the checkpoint file as well
  # NOTE: PRETRAINED_DIR is only bound in this branch; it is read only by the
  # fine-tuning branch of the training cell.
  PRETRAINED_DIR= 'gs://' + bucket_name + '/model_dumps/pre-training/'

############ output path ############
task_to_train = 'classifier'
# NOTE(review): `fine_tuning` already ends with "/", so the resulting path
# contains "//" before the task name — confirm this is the intended prefix
# before changing it, since existing checkpoints may live there.
MODEL_DIR = f'gs://{bucket_name}/model_dumps/{fine_tuning}/{task_to_train}'

# Per-size settings, unpacked in the order
# (model_parallelism, train_batch_size, keep_checkpoint_max) and selected by
# MODEL_SIZE.
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 128, 200),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]


We set the selected learning rate scheduler

In [None]:
from mesh_tensorflow.transformer.learning_rate_schedules import slanted_triangular 

from mesh_tensorflow.transformer.learning_rate_schedules import truncated_rsqrt
 
from tensorflow.keras.optimizers.schedules import PolynomialDecay

from t5 import models

# Parameters of the polynomial-decay option: decay from the starting rate to
# the end rate over `decay_steps` steps.
starter_learning_rate = 0.05
end_learning_rate = 0.001
decay_steps = 10000

learning_rate_fn = PolynomialDecay(
    starter_learning_rate,
    decay_steps,
    end_learning_rate,
    power=0.5)

#@title Select a learning rate scheduler
learning_rate_scheduler_picker = "slanted" #@param ["slanted", "isr", "polynomial", "constant"]

# Each option selects the schedule handed to the model and the matching gin
# configuration file stored under /utils/ in the bucket.
if learning_rate_scheduler_picker == "slanted":
  selected_learning_rate_scheduler = slanted_triangular
  PATH_GIN_FILE = 'gs://' + bucket_name + '/utils/operative_config_slanted.gin'
elif learning_rate_scheduler_picker == "isr":
  selected_learning_rate_scheduler = truncated_rsqrt
  PATH_GIN_FILE = 'gs://' + bucket_name + '/utils/operative_config_isr.gin'
elif learning_rate_scheduler_picker == "polynomial":
  selected_learning_rate_scheduler = learning_rate_fn
  PATH_GIN_FILE = 'gs://' + bucket_name + '/utils/operative_config_polynomial.gin'
elif learning_rate_scheduler_picker == "constant":
  selected_learning_rate_scheduler = 0.001
  PATH_GIN_FILE = 'gs://' + bucket_name + '/utils/operative_config_constant.gin'
else:
  # Fail loudly: otherwise the two names above stay undefined on a fresh
  # kernel, or silently keep the values of a previous run of this cell.
  raise ValueError(
      "Unknown learning rate scheduler %r; expected one of "
      "'slanted', 'isr', 'polynomial', 'constant'." % (learning_rate_scheduler_picker,))

# NOTE(review): the form title below repeats the scheduler selector's title;
# the field it labels is the number of training/fine-tuning steps.
#@title Select a learning rate scheduler
number_of_steps = 800000 #@param {type:"integer"}

# Step offset added to `number_of_steps` when the gin total is patched and
# when the list of checkpoints is built; non-zero only with pre-training.
# NOTE(review): 200000 is presumably the number of steps of the pre-trained
# checkpoint — confirm against the pre-training run.
pretraining_steps = 0
if fine_tuning == "fine-tuning_with_pre-training/":
  pretraining_steps = 200000

# Make sure the output location exists before the model writes to it.
tf.io.gfile.makedirs(MODEL_DIR)

# Model wrapper used below for training, evaluation, prediction and scoring.
model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    learning_rate_schedule = selected_learning_rate_scheduler,
    # Same lengths as used when previewing the preprocessed dataset.
    sequence_length={"inputs": 512, "targets": 512},
    # The checkpoint list built for prediction uses the same 10000-step stride.
    save_checkpoints_steps=10000,
    keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
    iterations_per_loop=100,
)

# Fetch the gin file matching the selected scheduler as ./config.gin.
!gsutil cp {PATH_GIN_FILE}  ./config.gin

# For the slanted-triangular schedule, rewrite ./config.gin in place so the
# schedule matches this run's step counts.
if learning_rate_scheduler_picker == "slanted":
  # Read the whole file first; it is reopened for writing right below.
  with open("./config.gin") as gin_file:
    gin_lines = gin_file.readlines()

  # NOTE(review): patching is positional — the lines at 0-based index 196 and
  # 197 are overwritten whatever they contain, so this depends on the exact
  # layout of operative_config_slanted.gin. Verify when that file changes.
  # `with` guarantees the file is flushed and closed even if a write fails.
  with open("./config.gin", "w+") as f:
    for i, original_line in enumerate(gin_lines):
      if i == 196 and fine_tuning == "fine-tuning_without_pre-training/":
        # Training from scratch: the schedule starts at step 0.
        line = "slanted_triangular.start_step = 0\n"
      elif i == 197:
        # Total steps = this run's steps plus the pre-training offset.
        line = "slanted_triangular.total_train_steps = " + str(number_of_steps + pretraining_steps) + '\n'
      else:
        line = original_line
      f.write(line)

# Start Training

In [None]:
import gin

# Both modes load ./config.gin inside an unlocked gin scope and then run for
# `number_of_steps` steps; they differ only in the model entry point used.
with gin.unlock_config():
  gin.parse_config_file("./config.gin")

  if fine_tuning != "fine-tuning_without_pre-training/":
    # PRETRAINED: continue from the checkpoint stored in PRETRAINED_DIR.
    #RUN FINE-TUNING
    model.finetune(
        mixture_or_task_name=task_to_train,
        pretrained_model_dir=PRETRAINED_DIR,
        finetune_steps=number_of_steps,
    )
  else:
    # NON PRETRAINED: train the task from scratch.
    TRAIN_STEPS = number_of_steps
    model.train(task_to_train, steps=number_of_steps)

# Evaluation

Evaluate the model checkpoint(s) on the validation set

In [None]:
# Use a larger batch size for evaluation, which requires less memory.
# This cell is used to get the inputs and targets, hence can be stopped as soon
# as these files are created.
# NOTE: this changes the batch size on the shared `model` object; the scoring
# cell sets it back to `train_batch_size`.
model.batch_size = 1024
model.eval(
    mixture_or_task_name=task_to_train,
    # -1 will evaluate the last checkpoint, you can also provide
    # a list of checkpoints with the following format : [10000, 20000, 30000]
    checkpoint_steps=-1,
    split="validation"
    )

In [None]:
# Checkpoint steps to run prediction on: from the starting step up to and
# including the final step, in strides of 10000 (the checkpoint-saving interval).
checkpoints = list(range(pretraining_steps, pretraining_steps + number_of_steps + 10000, 10000))

In [None]:
# Files under the validation evaluation folder: the inputs file is the one
# the evaluation cell is run to create; predictions are written next to it.
folder = MODEL_DIR + "/validation_eval"
input_file = f"{folder}/{task_to_train}_inputs"
output_file = f"{folder}/{task_to_train}_predictions"

# Generate predictions for every checkpoint step listed in `checkpoints`.
# beam_size=1 with temperature=0.0 is presumably plain greedy decoding —
# confirm against the MtfModel.predict documentation.
model.predict(input_file=input_file, 
              output_file=output_file,
              checkpoint_steps=checkpoints,
              beam_size=1, 
              temperature=0.0, 
              keep_top_k=-1, 
              vocabulary=get_default_vocabulary())

Evaluate the best model checkpoint on the test set

In [None]:
#@title: Select the best checkpoint
# Step number of the checkpoint used by the test-set evaluation, prediction
# and scoring cells below; set it after inspecting the validation results.
best_checkpoint = 200000 #@param {type:"integer"}

In [None]:
# Use a larger batch size for evaluation, which requires less memory.
# This cell is used to get the inputs and targets, hence can be stopped as soon
# as these files are created.
# Only the checkpoint selected in `best_checkpoint` is evaluated, on the test split.
model.batch_size = 1024
model.eval(
    mixture_or_task_name=task_to_train,
    checkpoint_steps=best_checkpoint,
    split="test"
    )

In [None]:
# Same layout as for validation, but under the test evaluation folder.
folder = MODEL_DIR + "/test_eval"
input_file = f"{folder}/{task_to_train}_inputs"
output_file = f"{folder}/{task_to_train}_predictions"

# Generate test-set predictions with the selected checkpoint only; the scoring
# cell reads them back from "<output_file>-<best_checkpoint>".
model.predict(input_file=input_file, 
              output_file=output_file,
              checkpoint_steps=best_checkpoint,
              beam_size=1, 
              temperature=0.0, 
              keep_top_k=-1, 
              vocabulary=get_default_vocabulary())

# Confidence score

Using the `model.score()` function we evaluate the model confidence about the generated predictions (given the input).

In [None]:
import math

# Go back to the training batch size (the evaluation cells raised it to 1024).
model.batch_size = train_batch_size

# Score the test inputs against the predictions produced by `best_checkpoint`.
folder =  MODEL_DIR + "/test_eval"
input_file = f"{folder}/{task_to_train}_inputs"
prediction_file = f"{folder}/{task_to_train}_predictions-{best_checkpoint}"
# Prefix of the scores output; the next cell reads it with ".scores" appended.
score_file = f"{folder}/with_score/{task_to_train}_scores"

model.score(inputs=input_file,
            targets=prediction_file,
            scores_file=score_file,
            checkpoint_steps=best_checkpoint,
            vocabulary=get_default_vocabulary())


In [None]:
# Convert every score into a confidence value and store one value per line in
# confidence_scores.txt next to the test predictions.
with tf.io.gfile.GFile(score_file + ".scores", "r") as scores_in, \
     tf.io.gfile.GFile(f"{folder}/confidence_scores.txt", "w") as confidences_out:
  # The first whitespace-separated field of each line is exponentiated
  # (presumably it is a log-likelihood — TODO confirm). All values are computed
  # before anything is written.
  confidence_score = [
      math.exp(float(row.split()[0])) for row in scores_in.readlines()
  ]

  for value in confidence_score:
    confidences_out.write(str(value) + '\n')