## Google authentication

## Download *improv* project

In [10]:
import sys

!test -d improv || git clone https://github.com/rikhuijzer/improv improv
if not 'improv' in sys.path:
  sys.path += ['improv']
%cd improv
!git log -n 1 --format='commit %H'
%cd ..

/content/improv
commit d3f7706088a0a3bb0653cb4660af44277ac27904
/content


In [11]:
!test -d nlu_datasets || git clone https://github.com/rikhuijzer/nlu_datasets nlu_datasets
if not 'nlu_datasets' in sys.path:
  sys.path += ['nlu_datasets']
%cd nlu_datasets
!git log -n 1 --format='commit %H'
%cd ..

/content/nlu_datasets
commit 4a1a9dfc3d6b27400704c022fb59247f806fc4e3
/content


In [12]:
# Install rasa_nlu with pip via the shell; no version is pinned.
# NOTE(review): rasa_nlu is not imported directly by the visible cells --
# presumably a dependency of the improv package; confirm and pin a version.
!pip install rasa_nlu



## BERT pre-trained model


In [0]:
# GCS bucket under which model output directories are created.
bucket = 'benchmark-tpu-bucket' #@param {type:"string"}
assert bucket, 'Must specify an existing GCS bucket name'
bucket_models_dir = f'gs://{bucket}/bert/models'

# Human-readable model label chosen in the Colab form.
bert_model = 'BERT-Large, Uncased' #@param ['BERT-Base, Uncased', 'BERT-Large, Uncased']
# Form label -> checkpoint directory name.
bert_model_map = dict([
    ('BERT-Base, Uncased', 'uncased_L-12_H-768_A-12'),
    ('BERT-Large, Uncased', 'uncased_L-24_H-1024_A-16'),
])
bert_model_mapped = bert_model_map[bert_model]

# Location of the pre-trained checkpoint, config and vocabulary files.
bert_pretrained_dir = 'gs://cloud-tpu-checkpoints/bert/{}'.format(bert_model_mapped)

## Define params

In [0]:
# Build the HParams object for one run and create its output directories.
#
# Requires names defined by other cells, which must be run first:
#   - `use_tpu` from the authentication cell;
#   - `bucket_models_dir`, `bert_model_mapped` and `bert_pretrained_dir`
#     from the BERT pre-trained cell.
import os
from datetime import datetime, timedelta
from pathlib import Path

# Imported here because this cell calls os.* and tf.gfile.* itself; it
# previously relied on another cell having imported them into the kernel.
import tensorflow as tf

from improv.config import HParams
from improv.utils import get_project_root

corpus = 'chatbot'  #@param ['chatbot', 'askubuntu', 'webapplications']
task = 'ner' #@param ['ner', 'intent', 'ner_intent']
num_train_steps = 250  #@param {type:"integer"}
# NOTE(review): this value is not passed to HParams below, which hard-codes
# train_batch_size=16 -- confirm which of the two is intended.
train_batch_size = "32"  # do not change (remain consistent over datasets)
new_dir = True  #@param {type:"boolean"}
# With new_dir unchecked the name from an earlier execution of this cell is
# reused. On a fresh kernel there is no earlier name, so one is generated
# anyway instead of failing later with a NameError.
if new_dir or 'output_dir_name' not in globals():
  # strftime always yields second precision. The former str(...)[:-7] slice
  # assumed a ".ffffff" suffix and cut into the timestamp whenever the
  # microsecond field happened to be 0.
  # The one-hour offset is presumably a UTC -> local time shift -- TODO confirm.
  output_dir_name = (datetime.now() + timedelta(hours=1)).strftime('%Y-%m-%d %H:%M:%S')
tpu_name = 'grpc://' + os.environ['COLAB_TPU_ADDR'] if use_tpu else ''
do_train = True  #@param {type:"boolean"}
do_eval = True  #@param {type:"boolean"}
do_predict = True  #@param {type:"boolean"}

h_params = HParams(
  data_dir=Path('nlu_datasets') / 'generated' / corpus / task,
  bert_config_file=os.path.join(bert_pretrained_dir, 'bert_config.json'),
  task_name=corpus,
  vocab_file=os.path.join(bert_pretrained_dir, 'vocab.txt'),
  output_dir=bucket_models_dir + '/{}/{}'.format(corpus, output_dir_name),
  local_dir=str(get_project_root() / 'tmp' / corpus / output_dir_name),
  init_checkpoint=os.path.join(bert_pretrained_dir, 'bert_model.ckpt'),
  # The form label (`bert_model`, e.g. 'BERT-Large, Uncased') never starts
  # with 'uncased', so testing it always gave False. The checkpoint directory
  # name (`bert_model_mapped`, e.g. 'uncased_L-24_H-1024_A-16') carries the prefix.
  do_lower_case=bert_model_mapped.startswith('uncased'),
  max_seq_length=128,
  do_train_eval=False,
  do_train=do_train,
  do_eval=do_eval,
  do_predict=do_predict,
  train_batch_size=16, # do not change, this eases comparison
  eval_batch_size=8,
  predict_batch_size=8,
  learning_rate=5e-5,
  num_train_steps=num_train_steps,
  warmup_proportion=0.1,
  save_checkpoints_steps=50,  # ignored by TPU
  save_summary_steps=50,  # ignored by TPU
  iterations_per_loop=1000,  # ignored
  use_tpu=use_tpu,
  tpu_name=tpu_name,
  tpu_zone=None,
  gcp_project=None,
  master=None,
  num_tpu_cores=8
)

# Create the remote (GCS) and local output directories for this run.
tf.gfile.MakeDirs(h_params.output_dir)
tf.gfile.MakeDirs(h_params.local_dir)

In [9]:
import os
import tensorflow as tf
import json
from google.colab import auth

# Authenticate the Colab user.
# presumably this is what produces /content/adc.json, read below -- TODO confirm
auth.authenticate_user()

# The presence of COLAB_TPU_ADDR is taken as the signal that a TPU is available.
use_tpu = 'COLAB_TPU_ADDR' in os.environ
if not use_tpu:
  # No TPU: require the first GPU device and abort otherwise.
  device_name = tf.test.gpu_device_name()
  if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
  print('Found GPU at: {}'.format(device_name))
else:
  TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
  print('TPU address is {}'.format(TPU_ADDRESS))

  with tf.Session(TPU_ADDRESS) as session:
    # Read the credentials file and hand its contents to the TPU session.
    with open('/content/adc.json') as adc_file:
      auth_info = json.loads(adc_file.read())
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
    # Now credentials are set for all future sessions on this TPU.

TPU address is grpc://10.94.204.178:8470


## Main


In [15]:
from improv.kyzhouhzau_ner import (
    evaluate_pred_result,
    main,
    run,
)

# Call run() with the hyperparameters built in the "Define params" cell.
# The return value is kept because the evaluation cell passes it to
# evaluate_pred_result.
result = run(h_params)

INFO:tensorflow:Using config: {'_model_dir': 'gs://benchmark-tpu-bucket/bert/models/chatbot/2018-12-20 13:24:23', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 50, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      value: "10.94.204.178:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fcea4861d68>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': b'grpc://10.94.204.178:8470', '_evaluation_master': b'grpc://10.94.204.178:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_s

In [16]:
# Pass the hyperparameters and the value returned by run() to
# evaluate_pred_result. The recorded output of this cell lists, per example,
# the tokens (`text`) with their `true` and `pred` label sequences.
evaluate_pred_result(h_params, result)

text: ['i', 'want', 'to', 'go', 'marienplatz']
true: ['O', 'O', 'O', 'O', 'B-StationDest']
pred: ['O', 'O', 'O', 'O', 'B-StationStart']


text: ['when', 'is', 'the', 'next', 'train', 'in', 'muncher', 'freiheit', '?']
true: ['O', 'O', 'O', 'B-Criterion', 'B-Vehicle', 'O', 'B-StationStart', 'I-StationStart', 'O']
pred: ['O', 'O', 'O', 'B-Criterion', 'B-Vehicle', 'O', 'B-StationStart', 'X', 'X']


text: ['when', 'does', 'the', 'next', 'u', '-', 'bahn', 'leaves', 'from', 'garching', 'forschungszentrum', '?']
true: ['O', 'O', 'O', 'B-Criterion', 'B-Vehicle', 'I-Vehicle', 'I-Vehicle', 'O', 'O', 'B-StationStart', 'I-StationStart', 'O']
pred: ['O', 'O', 'O', 'B-Criterion', 'B-Vehicle', 'I-Vehicle', 'I-Vehicle', 'O', 'O', 'B-StationStart', 'X', 'X']


text: ['from', 'olympia', 'einkaufszentrum', 'to', 'hauptbahnhof']
true: ['O', 'B-StationStart', 'I-StationStart', 'O', 'B-StationDest']
pred: ['O', 'B-StationStart', 'I-StationStart', 'X', 'X']


text: ['when', 'is', 'the', 'next', 'train', 'from