## BERT run model script

The purpose of this script is to train and test the BERT models. This script is designed to be used in Google Colaboratory in the sense that:

- it assumes data will be loaded from both Google Drive and Google Storage Buckets
- it assumes the script will be executed on a Colaboratory GPU

One training epoch on the legislative data (with batch size 25) takes about 40 seconds on a Colaboratory GPU.

In [None]:
# Import dependencies

!pip install bert-tensorflow
import pandas as pd
import tensorflow as tf
import math
import tensorflow_hub as hub
import sys
import bert
import numpy as np
from bert import run_classifier
from bert import optimization
from datetime import datetime
from google.colab import drive
from sklearn.metrics import confusion_matrix

In [None]:
# Mount Google Drive
# The drive is mounted under /content/gdrive; the local modules imported in the
# next cell are looked up beneath that mount point.
# force_remount=True requests a remount even if the drive is already mounted.

drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Import local modules - if taken to production, these would be properly packaged...
# beam_search and model are expected as .py files in the Drive root; that folder is
# put first on sys.path so the two imports below resolve to those files.

sys.path.insert(0, '/content/gdrive/My Drive/') # Set path to Google Drive location (location of beam_search and model modules)
import beam_search
import model

In [None]:
# Check GPU runtime

# Stop right away when the runtime has no GPU attached; otherwise report it.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

# Authenticate the Colab session with the user's Google account
# (the data and output locations used later are gs:// paths).
from google.colab import auth
auth.authenticate_user()

In [None]:
# Model parameters

# Encoder (BERT loaded from tfhub)
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1" # This is a path to an uncased (all lowercase) version of BERT

# Decoder
# This dict is handed to the local `model` module (DecoderStack, decode,
# _get_symbols_to_logits_fn). create_model reads "initializer_gain" and
# "label_smoothing" from it directly; predict reads "hidden_size",
# "num_hidden_layers", "beam_size" and "alpha". The remaining keys are
# presumably consumed inside `model` - verify against that module.
params = {
    "num_hidden_layers":6,
    # NOTE(review): hidden_size=6 is very small for a transformer decoder - confirm it is intended.
    "hidden_size":6,
    "num_heads":3,
    "filter_size":512,
    "relu_dropout":0.1,
    "allow_ffn_pad":True,
    "layer_postprocess_dropout":0.1,
    "attention_dropout":0.1,
    "initializer_gain":1.0,
    "label_smoothing":0.1,
    "beam_size":4,
    "alpha":1,
    "bucket":'BUCKET_NAME', # Set Google Storage Bucket name
    "task_data_dir":'gs://BUCKET_NAME/', # Set Google Storage Bucket name
    # The '{}' placeholder is filled with params['bucket'] when OUTPUT_DIR is built.
    "output_dir":'gs://{}/XXXXX/' # Set Google Storage Bucket output directory
}

# Compute train and warmup steps from batch size
# These transfer learning hyperparameters are taken from the 'BERT fine-tuning' notebook
# (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 25
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 10
# Warmup is a period of time where the learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
# Both values are passed to tf.estimator.RunConfig when the Estimator is built.
SAVE_CHECKPOINTS_STEPS = 1000
SAVE_SUMMARY_STEPS = 500
MAX_SEQ_LENGTH = 128 # This should match the max seq length set when creating the input file

# Paths of the TFRecord files read by file_based_input_fn_builder.
TRAIN_FILE = '' # Set data path
VALIDATION_FILE = '' # Set data path
TEST_FILE = '' # Set data path

NUM_TRAINING_EXAMPLES = 750 # Set number of training examples

# Total optimizer steps = examples / batch size * epochs, truncated to an int
# (750 / 25 * 10 = 300 with the values above); warmup covers the first
# WARMUP_PROPORTION of those steps.
num_train_steps = int(NUM_TRAINING_EXAMPLES/ BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [None]:
# Set data input and output directory (these are taken from params)

TASK_DATA_DIR = params['task_data_dir']
print('***** Task data directory: {} *****'.format(TASK_DATA_DIR))
# IPython shell escape: list the task data directory as a quick check that it is reachable.
!gsutil ls $TASK_DATA_DIR

BUCKET = params['bucket']
# Fail fast when the bucket name was left empty.
assert BUCKET, 'Must specify an existing GCS bucket name'
# params['output_dir'] contains a '{}' placeholder that receives the bucket name.
OUTPUT_DIR = params['output_dir'].format(BUCKET)
# Make sure the output directory exists before the Estimator writes to it.
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

In [None]:
# Adapted from https://github.com/tensorflow/models/tree/master/official/transformer
# Define the transformer encoder/decoder model

def create_model(is_predicting, train, input_ids, input_mask, segment_ids, labels, r_id):
  """Builds the BERT encoder plus transformer decoder graph.

  The BERT module loaded from TF Hub encodes the inputs; its token-level
  "sequence_output" is fed to the decoder defined in the local `model` module.

  Args:
    is_predicting: If true, decode with `predict` instead of computing a loss.
    train: Flag forwarded to `model.DecoderStack` and `model.decode`.
    input_ids: BERT input ids.
    input_mask: BERT input mask; also used to build the attention padding bias.
    segment_ids: BERT segment ids.
    labels: Target label ids; only read when not predicting.
    r_id: Accepted to keep the call signature uniform; not used in this function.

  Returns:
    When `is_predicting` is true, whatever `predict` returns. Otherwise the
    tuple `(loss, predictions, xentropy)`.
  """
  encoder = hub.Module(BERT_MODEL_HUB, trainable=True)
  encoder_outputs = encoder(
      inputs={
          "input_ids": input_ids,
          "input_mask": input_mask,
          "segment_ids": segment_ids,
      },
      signature="tokens",
      as_dict=True)

  # "sequence_output" is the per-token output; "pooled_output" would be the
  # whole-sentence one used for plain classification.
  token_states = encoder_outputs["sequence_output"]

  scope_initializer = tf.variance_scaling_initializer(
      params["initializer_gain"], mode="fan_avg", distribution="uniform")

  with tf.variable_scope("Decoder_transformer_loss", initializer=scope_initializer):
    decoder_embeddings = model.DecoderEmbeddings()
    decoder_stack = model.DecoderStack(params, train)
    attention_bias = model.get_padding_bias(input_mask)

    # Prediction path: beam-search decoding, no loss.
    if is_predicting:
      return predict(token_states, attention_bias, decoder_embeddings, decoder_stack, params)

    # Training / evaluation path: teacher-forced decode against the labels.
    logits = model.decode(labels, token_states, attention_bias, decoder_embeddings, decoder_stack, train, params)
    # The literal 6 is presumably the label vocabulary size - verify against
    # model.padded_cross_entropy_loss.
    xentropy, weights = model.padded_cross_entropy_loss(
        logits, labels, params["label_smoothing"], 6)
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
    predictions = tf.argmax(logits, axis=-1)
    return (loss, predictions, xentropy)

In [None]:
# Adapted from https://github.com/tensorflow/models/tree/master/official/transformer
# Prediction time function (implements beam search decoding strategy)

def predict(encoder_outputs, encoder_decoder_attention_bias, decoder_embeddings, decoder_stack, params):
    """Decode with beam search and return the best sequence per batch element.

    Args:
      encoder_outputs: Encoder output tensor; dimension 0 is used as the batch
        size and dimension 1 as the maximum decode length.
      encoder_decoder_attention_bias: Attention bias stored in the decoding cache.
      decoder_embeddings: Decoder embedding object from the local `model` module.
      decoder_stack: Decoder stack object from the local `model` module.
      params: Hyperparameter dict; "hidden_size", "num_hidden_layers",
        "beam_size" and "alpha" are read here.

    Returns:
      A dict with "outputs" (top beam ids, first position dropped) and
      "scores" (score of the top beam).
    """
    encoder_shape = tf.shape(encoder_outputs)
    batch_size = encoder_shape[0]
    # Decode for as many steps as the encoder has positions.
    max_decode_length = encoder_shape[1]

    symbols_to_logits_fn = model._get_symbols_to_logits_fn(
        max_decode_length, decoder_embeddings, decoder_stack, params)

    # Every beam starts from id 0.
    initial_ids = tf.zeros([batch_size], dtype=tf.int32)

    # Per-layer cache of decoder attention keys/values, initially empty along
    # the sequence axis.
    hidden_size = params["hidden_size"]
    decode_cache = {}
    for layer in range(params["num_hidden_layers"]):
        decode_cache["layer_%d" % layer] = {
            "k": tf.zeros([batch_size, 0, hidden_size]),
            "v": tf.zeros([batch_size, 0, hidden_size]),
        }

    # The encoder side is carried in the same cache.
    decode_cache["encoder_outputs"] = encoder_outputs
    decode_cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

    # vocab_size is 6, matching the number of labels.
    # NOTE(review): an earlier comment here said "EOS is 4" while eos_id=6 is
    # passed; with vocab_size=6 an id of 6 looks out of range - confirm which
    # value is intended.
    decoded_ids, scores = beam_search.sequence_beam_search(
        symbols_to_logits_fn=symbols_to_logits_fn,
        initial_ids=initial_ids,
        initial_cache=decode_cache,
        vocab_size=6,
        beam_size=params["beam_size"],
        alpha=params["alpha"],
        max_decode_length=max_decode_length,
        eos_id=6,
        decoder_embeddings=decoder_embeddings,
        decoder_stack=decoder_stack)

    # Keep beam 0 for each batch element and drop the initial id.
    return {"outputs": decoded_ids[:, 0, 1:], "scores": scores[:, 0]}

In [None]:
# Adapted from https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb
# model_fn_builder actually creates the model function
# using the passed parameters for num_labels, learning_rate, etc.

def model_fn_builder(learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns a `model_fn` closure for `tf.estimator.Estimator`.

  Args:
    learning_rate: Learning rate passed to `bert.optimization.create_optimizer`.
    num_train_steps: Total number of training steps passed to the optimizer.
    num_warmup_steps: Number of warmup steps passed to the optimizer.

  Returns:
    A `model_fn(features, mode, params)` callable.
  """
  def model_fn(features, mode, params):
    """Builds the `EstimatorSpec` for TRAIN, EVAL or PREDICT.

    Args:
      features: Dict of input tensors with keys "input_ids", "input_mask",
        "segment_ids", "label_ids" and, optionally, "r_id".
      mode: A `tf.estimator.ModeKeys` value.
      params: Estimator params; required by the Estimator calling convention
        but not read here.

    Returns:
      A `tf.estimator.EstimatorSpec` matching `mode`.
    """
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    labels = features["label_ids"]
    # "r_id" is optional. Only a missing key is tolerated; any other error
    # should surface instead of being swallowed by a bare except.
    try:
      r_id = features["r_id"]
    except KeyError:
      r_id = []

    # PREDICT: beam-search decoding, no loss.
    if mode == tf.estimator.ModeKeys.PREDICT:
      preds = create_model(
        True, False, input_ids, input_mask, segment_ids, labels, r_id)

      predictions = {
          'true_labels':labels,
          'predicted_labels': preds["outputs"],
          'r_id': r_id
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # TRAIN and EVAL share the loss graph, but the train flag must be on only
    # while training: it is forwarded to model.DecoderStack / model.decode
    # (presumably toggling dropout), so evaluation must not run with it set.
    train = (mode == tf.estimator.ModeKeys.TRAIN)
    loss, _, _ = create_model(
      False, train, input_ids, input_mask, segment_ids, labels, r_id)

    if train:
      # The optimizer is only needed (and only built) for training.
      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
      return tf.estimator.EstimatorSpec(mode=mode,
        loss=loss,
        train_op=train_op)

    # EVAL: the original referenced an undefined `eval_metrics` here. Report
    # the mean loss over the evaluation batches as an explicit metric.
    eval_metric_ops = {"eval_loss": tf.metrics.mean(loss)}
    return tf.estimator.EstimatorSpec(mode=mode,
      loss=loss,
      eval_metric_ops=eval_metric_ops)

  # Return the actual model function in the closure
  return model_fn

In [None]:
# Adapted from https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb

def file_based_input_fn_builder(input_file, seq_length, is_training,
                                drop_remainder):
  """Builds an `input_fn` closure that reads examples from a TFRecord file.

  Args:
    input_file: Path of the TFRecord file to read.
    seq_length: Length of the per-token features in each record.
    is_training: If true, the dataset is repeated and shuffled.
    drop_remainder: Forwarded to `map_and_batch`.

  Returns:
    An `input_fn(params)` callable; `params["batch_size"]` sets the batch size.
  """

  def _int64_feature(shape):
    # Every feature in the records is a fixed-length int64 list.
    return tf.FixedLenFeature(shape, tf.int64)

  name_to_features = {
      "input_ids": _int64_feature([seq_length]),
      "input_mask": _int64_feature([seq_length]),
      "segment_ids": _int64_feature([seq_length]),
      "label_ids": _int64_feature([seq_length]),
      "r_id": _int64_feature([1]),
      "is_real_example": _int64_feature([]),
  }

  def _decode_record(record, name_to_features):
    """Parses one serialized record and casts its int64 tensors to int32."""
    parsed = tf.parse_single_example(record, name_to_features)

    # tf.Example stores integers as int64; downstream code works with int32.
    for key in list(parsed.keys()):
      if parsed[key].dtype == tf.int64:
        parsed[key] = tf.to_int32(parsed[key])

    return parsed

  def input_fn(params):
    """Returns the batched dataset for the Estimator."""
    batch_size = params["batch_size"]

    dataset = tf.data.TFRecordDataset(input_file)
    # Training repeats indefinitely and shuffles; evaluation and prediction
    # read the file once, in order.
    if is_training:
      dataset = dataset.repeat().shuffle(buffer_size=100)

    return dataset.apply(
        tf.contrib.data.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            drop_remainder=drop_remainder))

  return input_fn

In [None]:
# Adapted from https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb
# Build the Estimator: the model function, the output directory, and how often
# checkpoints and summaries are saved.

model_fn = model_fn_builder(LEARNING_RATE, num_train_steps, num_warmup_steps)

run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    save_summary_steps=SAVE_SUMMARY_STEPS)

# The params dict is what input_fn later receives; it carries the batch size.
estimator = tf.estimator.Estimator(
    model_fn,
    config=run_config,
    params=dict(batch_size=BATCH_SIZE))

In [None]:
# Adapted from https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb
# Create an input function for training. drop_remainder would be True when using
# TPUs; it is left False here.

# TRAIN_FILE is the training data path set in the model-parameters cell. The
# previously referenced LEG_TRAIN_FILE is not defined anywhere in this notebook
# and raised a NameError.
train_input_fn = file_based_input_fn_builder(
    input_file=TRAIN_FILE,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

In [None]:
# Run training and report the wall-clock time it took

print('Beginning Training!')
current_time = datetime.now()
# Training stops once the global step reaches num_train_steps.
estimator.train(train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

In [None]:
# Create an input function for testing. drop_remainder = True for using TPUs.

pred_input_fn = file_based_input_fn_builder(
    TEST_FILE,
    MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

# The result is consumed by iterating over it in the next cell.
predictions = estimator.predict(input_fn=pred_input_fn)

In [None]:
# Run predictions and collect, per example, the true labels, the predicted
# labels and the record id

true_labels, predicted_labels, r_ids = [], [], []

# Each item is a dict with the keys produced by model_fn in PREDICT mode.
for prediction in predictions:
  r_ids.append(prediction['r_id'])
  true_labels.append(prediction['true_labels'])
  predicted_labels.append(prediction['predicted_labels'])

In [None]:
# Run the evaluation

# Get rid of padding (truncate preds by input length)
def remove_padding(true_labels, predicted_labels):
  """Trims each label/prediction pair to the span before the end marker.

  For every example the end position is the index of the largest value in the
  true label array. Both the true and the predicted sequence are cut to
  positions 1 (inclusive) through that index (exclusive).

  Args:
    true_labels: Sequence of 1-D arrays of true label ids.
    predicted_labels: Sequence of 1-D arrays of predicted label ids, in the
      same order as `true_labels`.

  Returns:
    A `(labels, preds)` tuple of lists holding the trimmed arrays.
  """
  trimmed_labels, trimmed_preds = [], []
  for idx, label_seq in enumerate(true_labels):
    end = label_seq.argmax(axis=0)
    trimmed_labels.append(label_seq[1:end])
    trimmed_preds.append(predicted_labels[idx][1:end])
  return trimmed_labels, trimmed_preds

non_pad_labels, non_pad_preds = remove_padding(true_labels, predicted_labels)

# Raw accuracy: share of non-padding positions whose predicted label equals
# the true label.
correct = 0
total = 0
for label_seq, pred_seq in zip(non_pad_labels, non_pad_preds):
  for j, label in enumerate(label_seq):
    total += 1
    # pred_seq is indexed rather than zipped so that a prediction shorter
    # than its label sequence still raises instead of being silently skipped.
    if label == pred_seq[j]:
      correct += 1

# Without this guard an empty evaluation set fails with an opaque
# ZeroDivisionError on the next line.
if total == 0:
  raise ValueError('No non-padding labels to evaluate; check TEST_FILE and the predictions')
print(correct/total)

# Flatten all examples into one array each for the confusion matrix.
labels_concat = np.concatenate(non_pad_labels)
preds_concat = np.concatenate(non_pad_preds)

# Confusion matrix (left as the last expression so the notebook displays it)
confusion_matrix(labels_concat, preds_concat)

In [None]:
# Export results

import pickle

# Each r_id is read from a length-1 feature; keep only its single element.
r_ids_nums = [r_id[0] for r_id in r_ids]

results = {
    'true_labels': non_pad_labels,
    'predicted_labels': non_pad_preds,
    'r_ids': r_ids_nums,
}

results_file = "" # Set results file

# Serialize the results dict with the highest pickle protocol available.
with open(results_file, 'wb') as f:
    pickle.dump(results, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Download results
# Hands the pickle file written in the previous cell to google.colab's
# files.download.

from google.colab import files
files.download(results_file)