In [0]:
import os
import tensorflow as tf
import json

# Fail fast when the notebook is not attached to a Colab TPU runtime.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

# `pprint` is needed by the optional device listing below; it was previously
# used without being imported, which raised NameError when the box was ticked.
import pprint

from google.colab import auth

auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print_tpu_devices = False  #@param {type:"boolean"}
  if print_tpu_devices:
    print('TPU devices:')
    pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  # The JSON file is read from /content/adc.json (presumably written by
  # `auth.authenticate_user()` above -- TODO confirm).
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

TPU address is grpc://10.33.17.106:8470


In [0]:
import sys

!test -d improv || git clone https://github.com/rikhuijzer/improv improv
if not 'improv' in sys.path:
  sys.path += ['improv']

## Define params

In [0]:
from src.config import Params
from src.utils import get_project_root
from pathlib import Path
from datetime import datetime
# Name of the task; also used as the data sub-directory and in the output path.
task_name = 'askubuntu'
# Available pretrained model checkpoints:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT base model
bert_model = 'uncased_L-12_H-768_A-12'
bert_pretrained_dir = 'gs://cloud-tpu-checkpoints/bert/' + bert_model
epochs = 2  #@param {type:"integer"}
# Alternative (disabled) naming scheme based on the epoch count:
# dir_suffix = '_epochs_base' #@param {type:"string"}
# output_dir_name = str(epochs) + dir_suffix
# Timestamped output directory name, e.g. '2018-11-25 10:42:04'.
# strftime is used instead of slicing str(datetime.now()): str() omits the
# fractional part when microsecond == 0, so a fixed [:-7] slice would then
# cut into the time itself.
output_dir_name = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
bucket = 'benchmark-tpu-bucket' #@param {type:"string"}
assert bucket, 'Must specify an existing GCS bucket name'

# Collect every setting for the run in a single Params object. Paths to the
# pretrained BERT files and to the output directory are GCS URLs.
params = Params(
  data_dir=get_project_root() / 'data' / task_name,
  bert_config_file=os.path.join(bert_pretrained_dir, 'bert_config.json'),
  task_name=task_name,
  vocab_file=os.path.join(bert_pretrained_dir, 'vocab.txt'),
  output_dir='gs://{}/bert/models/{}/{}'.format(bucket, task_name, output_dir_name),
  init_checkpoint=os.path.join(bert_pretrained_dir, 'bert_model.ckpt'),
  # Lower-casing must match the checkpoint: only the 'uncased_*' models use it.
  do_lower_case=bert_model.startswith('uncased'),
  max_seq_length=128,
  do_train_eval=False,
  do_train=True,
  do_eval=True,
  do_predict=True,
  train_batch_size=32,
  eval_batch_size=8,
  predict_batch_size=8,
  learning_rate=5e-5,
  num_train_epochs=epochs,
  warmup_proportion=0.1,
  # Very large step interval (kept as an int: this is a step count, and the
  # float literal 1e10 ended up in the run config as 10000000000.0).
  # Original author's note: only for pre-training I guess.
  save_checkpoints_steps=10 ** 10,
  iterations_per_loop=1,
  use_tpu=True,
  tpu_name='grpc://' + os.environ['COLAB_TPU_ADDR'],
  tpu_zone=None,
  gcp_project=None,
  master=None,
  num_tpu_cores=8
)

# Make sure the output directory exists before anything writes to it.
tf.gfile.MakeDirs(params.output_dir)

## Last preparations

In [0]:
import tensorflow as tf

from src.config import get_debug_params
from src.my_classifier import (
    get_model_and_estimator, evaluate, train, train_eval, predict
)

# Build the model function and the estimator from `params`; the log output
# below this cell shows the resulting estimator/TPU configuration.
model_fn, estimator = get_model_and_estimator(params)

INFO:tensorflow:Using config: {'_model_dir': 'gs://benchmark-tpu-bucket/bert/models/askubuntu/2018-11-25 10:42:04', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 10000000000.0, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      value: "10.33.17.106:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f5f90c7b3c8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': b'grpc://10.33.17.106:8470', '_evaluation_master': b'grpc://10.33.17.106:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1

## Train Eval loop

In [0]:
# Combined train/eval loop; only runs when `do_train_eval` is enabled in params.
# The TPU has to be restarted each epoch because TPUs do not provide summary statistics.
if params.do_train_eval:
  train_eval(params, estimator)

## Train

In [0]:
from tensorflow.python.training import training_util
from tensorflow.python.training.basic_session_run_hooks import SecondOrStepTimer
from tensorflow.python.training.session_run_hook import SessionRunHook, SessionRunArgs

class DevHook(SessionRunHook):
    """Session hook that writes run metadata for traced steps into the model directory.

    Based on TensorFlow's ProfilerHook
    (source: https://stackoverflow.com/questions/45719176).

    On the first step, and afterwards whenever the timer fires, the step is run
    with FULL_TRACE and the collected run metadata is written with a FileWriter
    to ``<output_dir>/hook_data``. A constant test scalar is written on every
    step to check that data shows up in TensorBoard.

    Args:
        save_steps: trace every N steps (forwarded to SecondOrStepTimer).
        save_secs: trace every N seconds (forwarded to SecondOrStepTimer).
        output_dir: directory under which the ``hook_data`` folder is created.
    """
    def __init__(self,
                 save_steps=None,
                 save_secs=None,
                 output_dir=""):
        self._output_tag = "step-{}"
        self._output_dir = output_dir
        self._timer = SecondOrStepTimer(
            every_secs=save_secs, every_steps=save_steps)
        # Populated in begin(); initialised here so end() is always safe.
        self._writer = None
        self._next_step = None
        self._request_summary = False
        self._global_step_tensor = None

    def begin(self):
        """Look up the global step and open the event writer.

        Raises:
            RuntimeError: if no global step tensor exists in the graph.
        """
        self._next_step = None
        self._global_step_tensor = training_util.get_global_step()
        # Validate before opening the writer so a failure does not leak it.
        if self._global_step_tensor is None:
            raise RuntimeError("Global step should be created to use DevHook.")

        tf.logging.info('creating file in: {}'.format(self._output_dir))
        self._writer = tf.summary.FileWriter(self._output_dir + '/hook_data',
                                             tf.get_default_graph())

    def before_run(self, run_context):
        """Request the global step and, for traced steps, a full trace."""
        tf.logging.info('before_run is called.')
        # Always trace the very first step; afterwards defer to the timer.
        self._request_summary = (
                self._next_step is None or
                self._timer.should_trigger_for_step(self._next_step)
        )
        requests = {"global_step": self._global_step_tensor}
        opts = (tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                if self._request_summary else None)
        return SessionRunArgs(requests, options=opts)

    def after_run(self, run_context, run_values):
        """Write the run metadata of traced steps and a test scalar."""
        tf.logging.info('after_run is called.')

        # The fetched value was read before the train op ran, hence "stale".
        stale_global_step = run_values.results["global_step"]
        global_step = stale_global_step + 1
        if self._request_summary:
            global_step = run_context.session.run(self._global_step_tensor)
            # Without this the timer never records a trigger and therefore
            # fires on every step, ignoring save_steps / save_secs.
            self._timer.update_last_triggered_step(global_step)
            self._writer.add_run_metadata(
                run_values.run_metadata, self._output_tag.format(global_step))

        # Test for TensorBoard.
        # The graph is finalized by the time hooks run, so ops may not be added
        # here (tf.summary.scalar / tf.contrib.summary.scalar raise
        # "Graph is finalized and cannot be modified."). Build the summary
        # protobuf directly instead, which does not touch the graph.
        test_summary = tf.Summary(value=[
            tf.Summary.Value(tag="my_contrib_loss", simple_value=-1)])
        self._writer.add_summary(test_summary, global_step)

        if self._request_summary:
            self._writer.flush()
        self._next_step = global_step + 1

    def end(self, session):
        """Close the event writer, if begin() managed to open one."""
        if self._writer is not None:
            self._writer.close()

# Trace every step and write the data next to the model output.
dev_hook = DevHook(save_steps=1, output_dir=params.output_dir)

In [0]:
# Run training when enabled in params; `dev_hook` is handed to `train`
# (presumably registered as a training hook -- confirm in src.my_classifier).
if params.do_train:
  train(params, estimator, dev_hook)

INFO:tensorflow:Writing example 0 of 96
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: train-0
INFO:tensorflow:tokens: [CLS] no matter what i do u ##bu ##nt ##u 15 . 04 does not recognize hp laser ##jet 102 ##0 [SEP]
INFO:tensorflow:input_ids: 101 2053 3043 2054 1045 2079 1057 8569 3372 2226 2321 1012 5840 2515 2025 6807 6522 9138 15759 9402 2692 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

RuntimeError: ignored

## Eval

In [0]:
# Run evaluation with the same estimator when enabled in params.
if params.do_eval:
  evaluate(params, estimator)

## Predict

In [0]:
# note that predictions are non-deterministic
# NOTE(review): unlike train/evaluate, `predict` is called without the
# estimator -- presumably it builds its own from `params`; confirm.
if params.do_predict:
  predict(params)