## Google authentication

In [1]:
import os
import tensorflow as tf
import json
from google.colab import auth
# Authenticate this Colab runtime with the user's Google account.
auth.authenticate_user()

# The presence of COLAB_TPU_ADDR in the environment is used as the signal that
# a TPU is attached to this runtime.
use_tpu = os.environ.get('COLAB_TPU_ADDR') is not None
if not use_tpu:
  # No TPU: insist on the first GPU being available instead.
  device_name = tf.test.gpu_device_name()
  if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
  else:
    raise SystemError('GPU device not found')
else:
  TPU_ADDRESS = ''.join(['grpc://', os.environ['COLAB_TPU_ADDR']])
  print('TPU address is', TPU_ADDRESS)

  with tf.Session(TPU_ADDRESS) as session:
    # Read the credentials file and upload its contents to the TPU so that
    # every later session on this TPU can access GCS.
    # NOTE(review): /content/adc.json is presumably written by
    # auth.authenticate_user() above — confirm.
    with open('/content/adc.json', 'r') as f:
      auth_info = json.load(f)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)

TPU address is grpc://10.105.239.186:8470


## Download *improv* project

In [2]:
import sys

!test -d improv || git clone https://github.com/rikhuijzer/improv improv
if not 'improv' in sys.path:
  sys.path += ['improv']

Cloning into 'improv'...
remote: Enumerating objects: 381, done.[K
remote: Counting objects: 100% (381/381), done.[K
remote: Compressing objects: 100% (242/242), done.[K
remote: Total 381 (delta 234), reused 276 (delta 133), pack-reused 0[K
Receiving objects: 100% (381/381), 2.74 MiB | 9.37 MiB/s, done.
Resolving deltas: 100% (234/234), done.


## Pre-trained BERT model


In [0]:
# GCS bucket that receives this notebook's model output (Colab form field).
bucket = 'benchmark-tpu-bucket' #@param {type:"string"}
assert bucket, 'Must specify an existing GCS bucket name'
bucket_models_dir = 'gs://{}/bert/models'.format(bucket)

# Human-readable model choice (Colab form field) and its translation into the
# directory name used under the public checkpoint bucket.
bert_model = 'BERT-Base, Uncased' #@param ['BERT-Base, Uncased', 'BERT-Large, Uncased']
bert_model_map = dict([
    ('BERT-Base, Uncased', 'uncased_L-12_H-768_A-12'),
    ('BERT-Large, Uncased', 'uncased_L-24_H-1024_A-16'),
])
bert_model_mapped = bert_model_map[bert_model]

# Location of the pre-trained checkpoint, config and vocabulary files.
bert_pretrained_dir = ''.join(['gs://cloud-tpu-checkpoints/bert/', bert_model_mapped])

## Define params

In [4]:
!pip install rasa_nlu

Collecting rasa_nlu
[?25l  Downloading https://files.pythonhosted.org/packages/a5/d2/2e6a081f3d222df01a3d941d7029e52c20619d53d557e721f096962d7293/rasa_nlu-0.13.8-py2.py3-none-any.whl (145kB)
[K    100% |████████████████████████████████| 153kB 5.3MB/s 
Collecting klein (from rasa_nlu)
  Downloading https://files.pythonhosted.org/packages/8a/6b/adc97a7bb3fb781fdd9e49177ad873c1479f87b9745271cbeda81cbb9cc8/klein-17.10.0-py2.py3-none-any.whl
Collecting gevent (from rasa_nlu)
[?25l  Downloading https://files.pythonhosted.org/packages/0b/e5/8bbad57fa8a565e04c696e3413d4051cc3cbb40d04c5d6ad9808ba991d5c/gevent-1.3.7-cp36-cp36m-manylinux1_x86_64.whl (4.5MB)
[K    100% |████████████████████████████████| 4.5MB 2.0MB/s 
Collecting coloredlogs (from rasa_nlu)
[?25l  Downloading https://files.pythonhosted.org/packages/08/0f/7877fc42fff0b9d70b6442df62d53b3868d3a6ad1b876bdb54335b30ff23/coloredlogs-10.0-py2.py3-none-any.whl (47kB)
[K    100% |████████████████████████████████| 51kB 16.4MB/s 
[?25hC

In [0]:
from src.config import HParams
from src.utils import get_project_root
from pathlib import Path
from datetime import datetime, timedelta

task_name = 'askubuntu'
epochs = 30  #@param {type:"integer"}
# Second-precision timestamp used as this run's sub-directory name. strftime
# is used rather than slicing str(datetime): dropping the last 7 characters
# mangles the name whenever the microsecond field happens to be exactly 0,
# because str() then omits the fractional part altogether.
# NOTE(review): the +1 hour offset looks like a manual shift from the runtime's
# clock to local time — confirm.
output_dir_name = (datetime.now() + timedelta(hours=1)).strftime('%Y-%m-%d %H:%M:%S')
tpu_name = 'grpc://' + os.environ['COLAB_TPU_ADDR'] if use_tpu else ''

hparams = HParams(
  data_dir=get_project_root() / 'data' / task_name,
  bert_config_file=os.path.join(bert_pretrained_dir, 'bert_config.json'),
  task_name=task_name,
  vocab_file=os.path.join(bert_pretrained_dir, 'vocab.txt'),
  output_dir=bucket_models_dir + '/{}/{}'.format(task_name, output_dir_name),
  init_checkpoint=os.path.join(bert_pretrained_dir, 'bert_model.ckpt'),
  # Test the checkpoint directory name ('uncased_L-...'), not the display name
  # ('BERT-Base, Uncased'): the display name never starts with 'uncased', which
  # silently disabled lower-casing for the uncased models.
  do_lower_case=bert_model_mapped.startswith('uncased'),
  max_seq_length=128,
  do_train_eval=False,
  do_train=True,
  do_eval=False,
  do_predict=False,
  train_batch_size=32,
  eval_batch_size=8,
  predict_batch_size=8,
  learning_rate=5e-5,
  num_train_epochs=epochs,
  warmup_proportion=0.1,
  save_checkpoints_steps=1000,
  iterations_per_loop=-1,  # placeholder; replaced in a later cell once the training set size is known
  use_tpu=use_tpu,
  tpu_name=tpu_name,
  tpu_zone=None,
  gcp_project=None,
  master=None,
  num_tpu_cores=8
)

# Create the run's output directory up front.
tf.gfile.MakeDirs(hparams.output_dir)

In [6]:
# update iterations per loop
!pip install rasa_nlu==0.13.8
from src.my_estimator import get_examples, SetType

# Colab form field: number of epochs' worth of batches folded into one loop.
checkpoint_about_every_n_epochs = 60 #@param {type:"integer"}

# Count the training examples of the task's TSV file to derive how many
# batches make up one epoch.
data_filename = hparams.data_dir.joinpath(hparams.task_name + '.tsv')
train_examples = get_examples(data_filename, SetType.train)
n_train = len(train_examples)
batches_per_epoch = float(n_train) / float(hparams.train_batch_size)

# Replace the placeholder iterations_per_loop with the truncated batch count.
iterations_per_loop = int(batches_per_epoch * checkpoint_about_every_n_epochs)
hparams = hparams._replace(iterations_per_loop=iterations_per_loop)



## New train/eval loop, followed by one predict run

In [7]:
from src.my_estimator import train_and_evaluate

# Hand the configured hyperparameters to the project's train_and_evaluate entry
# point (implemented in src.my_estimator, not shown in this notebook).
train_and_evaluate(hparams)

INFO:tensorflow:train_batch_size=32  eval_batch_size=8  max_steps=30
INFO:tensorflow:Using config: {'_model_dir': 'gs://benchmark-tpu-bucket/bert/models/askubuntu/2018-11-28 13:32:08', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      value: "10.105.239.186:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc611cb1b38>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': b'grpc://10.105.239.186:8470', '_evaluation_master': b'grpc://10.105.239.186:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_w

In [8]:
from src.my_estimator import predict

# Run the project's predict implementation; the result is bound to `_` so the
# cell does not echo the returned value.
_ = predict(hparams)

INFO:tensorflow:Using config: {'_model_dir': 'gs://benchmark-tpu-bucket/bert/models/askubuntu/2018-11-28 13:32:08', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      value: "10.105.239.186:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc6119e0d30>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': b'grpc://10.105.239.186:8470', '_evaluation_master': b'grpc://10.105.239.186:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=99, 

## Train Eval loop

In [0]:
# forced to restart TPU each epoch since TPU's do not provide summary statistics
# NOTE(review): `train_eval` and `estimator` are not defined or imported in any
# visible cell; the call is only skipped because do_train_eval is False — confirm
# before enabling.
if hparams.do_train_eval:
  train_eval(hparams, estimator)

In [0]:
from typing import List, Iterable

def predict(hparams: HParams) -> List[str]:
    """Predict labels for the test split of the task's data file and print the micro F1.

    The examples are written to ``predict.tf_record`` inside ``hparams.output_dir``,
    fed to an estimator built with ``use_tpu=False``, and the raw results are
    converted to labels with ``convert_result_pred``.

    Args:
        hparams: Run configuration; ``data_dir``, ``task_name``, ``output_dir``,
            ``max_seq_length`` and ``predict_batch_size`` are read here.

    Returns:
        The value produced by ``convert_result_pred`` (annotated as a list of
        label strings).

    NOTE(review): ``get_examples``, ``SetType``, ``get_unique_intents``,
    ``get_tokenizer`` and ``get_intents`` are taken from the notebook's global
    scope; only the first two are imported in a visible cell — confirm the rest.
    """
    from src.my_classifier import get_model_fn_and_estimator, file_based_input_fn_builder
    from src.run_classifier import file_based_convert_examples_to_features
    import os
    import numpy as np
    from src.utils import convert_result_pred, get_rounded_f1

    data_filename = hparams.data_dir / (hparams.task_name + '.tsv')
    params = hparams._replace(use_tpu=False)  # BERT code warns against using TPU for predictions.
    # Only the estimator is needed here; the model_fn half of the pair is unused.
    _, estimator = get_model_fn_and_estimator(params)

    predict_examples = get_examples(data_filename, SetType.test)
    predict_file = os.path.join(params.output_dir, "predict.tf_record")
    file_based_convert_examples_to_features(predict_examples, get_unique_intents(data_filename),
                                            params.max_seq_length, get_tokenizer(params),
                                            predict_file)

    tf.logging.info("***** Running prediction*****")
    tf.logging.info("  Num examples = %d", len(predict_examples))
    tf.logging.info("  Batch size = %d", params.predict_batch_size)

    # use_tpu was forced to False above, so the remainder batch is never dropped.
    predict_drop_remainder = params.use_tpu
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=params.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)

    result: Iterable[np.ndarray] = estimator.predict(input_fn=predict_input_fn)
    # NOTE(review): features are built with get_unique_intents() but decoded with
    # get_intents(); if the two lists differ, label_list[max_class] maps to the
    # wrong label — confirm.
    label_list = get_intents(data_filename)  # used for label_list[max_class] this might be wrong
    y_pred = convert_result_pred(result, label_list)
    # Score against the same file the examples were read from, instead of a
    # hard-coded 'askubuntu.tsv', so other task names are scored correctly.
    print('f1 score: {}'.format(get_rounded_f1(data_filename, y_pred, average='micro')))
    return y_pred

## Train

In [0]:
from tensorflow.python.training import training_util
from tensorflow.python.training.basic_session_run_hooks import SecondOrStepTimer
from tensorflow.python.training.session_run_hook import SessionRunHook, SessionRunArgs

class DevHook(SessionRunHook):
    """Session hook, based on ProfilerHook, that writes run metadata into the model directory.

    Source: https://stackoverflow.com/questions/45719176

    On steps selected by the timer, the run is executed with FULL_TRACE options
    and the resulting run metadata is added to a summary FileWriter located at
    ``<output_dir>/hook_data`` under the tag ``step-<global_step>``.

    Args:
        save_steps: Passed to SecondOrStepTimer as ``every_steps``.
        save_secs: Passed to SecondOrStepTimer as ``every_secs``.
        output_dir: Directory under which ``hook_data`` is written.
    """
    def __init__(self,
                 save_steps=None,
                 save_secs=None,
                 output_dir=""):
        self._output_tag = "step-{}"
        self._output_dir = output_dir
        # Created in begin(); kept as None until then so end() is always safe.
        self._writer = None
        self._timer = SecondOrStepTimer(
            every_secs=save_secs, every_steps=save_steps)

    def begin(self):
        """Look up the global step and open the summary writer.

        Raises:
            RuntimeError: If no global step tensor exists in the graph.
        """
        self._next_step = None
        self._global_step_tensor = training_util.get_global_step()
        # Validate before opening the writer so a failure leaves no open file.
        if self._global_step_tensor is None:
            raise RuntimeError("Global step should be created to use DevHook.")

        tf.logging.info('creating file in: {}'.format(self._output_dir))
        self._writer = tf.summary.FileWriter(self._output_dir + '/hook_data',
                                             tf.get_default_graph())

    def before_run(self, run_context):
        """Request the global step, plus a full trace when the timer says so."""
        tf.logging.info('before_run is called.')
        # Always trace the very first run; afterwards defer to the timer.
        self._request_summary = (
                self._next_step is None or
                self._timer.should_trigger_for_step(self._next_step)
        )
        requests = {"global_step": self._global_step_tensor}
        opts = (tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                if self._request_summary else None)
        return SessionRunArgs(requests, options=opts)

    def after_run(self, run_context, run_values):
        """Write the collected run metadata and schedule the next trace."""
        tf.logging.info('after_run is called.')

        stale_global_step = run_values.results["global_step"]
        if self._next_step is None:
            # Seed the timer on the first run so it does not fire again until
            # the configured number of steps/seconds has passed.
            self._timer.update_last_triggered_step(stale_global_step)
        global_step = stale_global_step + 1
        if self._request_summary:
            global_step = run_context.session.run(self._global_step_tensor)
            # Record the trigger; without this the timer reports "never
            # triggered" forever and every step is traced regardless of
            # save_steps / save_secs.
            self._timer.update_last_triggered_step(global_step)
            self._writer.add_run_metadata(
                run_values.run_metadata, self._output_tag.format(global_step))
            self._writer.flush()
        self._next_step = global_step + 1

    def end(self, session):
        """Close the summary writer if begin() managed to open one."""
        if self._writer is not None:
            self._writer.close()

# Hook instance configured with save_steps=1, writing under this run's output directory.
dev_hook = DevHook(save_steps=1, output_dir=hparams.output_dir)

In [0]:
# Wall-clock start marker; a later cell subtracts it from datetime.now() to
# report the elapsed time.
training_start_time = datetime.now()

In [0]:
# The standalone train call is switched off: the guard is hard-coded to False
# and the original condition is kept in the trailing comment.
# NOTE(review): `train` and `estimator` are not defined or imported in any
# visible cell — confirm before re-enabling.
if False:  # hparams.do_train:
  train(hparams, estimator, dev_hook)

In [14]:
# Report the wall-clock time elapsed since training_start_time was recorded.
# NOTE(review): the train call in the preceding cell is disabled (`if False`),
# so this currently measures only the gap between the two cells.
duration = str(datetime.now() - training_start_time)
print('Training took {}.'.format(duration))

Training took 0:00:00.023600.


## Eval

In [0]:
# Optional standalone evaluation, gated on hparams.do_eval.
# NOTE(review): `evaluate` and `estimator` are not defined or imported in any
# visible cell — confirm before enabling.
if hparams.do_eval:
  evaluate(hparams, estimator)

## Predict

In [0]:
# note that predictions are non-deterministic
# Optional standalone prediction, gated on hparams.do_predict; the return value
# of predict() is not captured here.
if hparams.do_predict:
  predict(hparams)