## Google authentication

In [1]:
import os
import tensorflow as tf
import json
from google.colab import auth
# Authenticate this Colab runtime; the TPU branch below reads the credentials
# file from /content/adc.json afterwards.
auth.authenticate_user()

# A TPU runtime is recognised by the COLAB_TPU_ADDR environment variable;
# without it, a GPU at /device:GPU:0 is required.
use_tpu = 'COLAB_TPU_ADDR' in os.environ
if not use_tpu:
  device_name = tf.test.gpu_device_name()
  if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
  print('Found GPU at: {}'.format(device_name))
else:
  TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
  print('TPU address is', TPU_ADDRESS)

  with tf.Session(TPU_ADDRESS) as session:
    # Read the credentials file and hand its contents to the TPU so that
    # it can access GCS.
    with open('/content/adc.json', 'r') as f:
      auth_info = json.load(f)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
    # From here on the credentials are set for all future sessions on this TPU.

TPU address is grpc://10.56.21.154:8470


## Download *improv* project

In [2]:
import sys

!test -d improv || git clone https://github.com/rikhuijzer/improv improv
if not 'improv' in sys.path:
  sys.path += ['improv']
%cd improv
!git log -n 1 --format='commit %H'
%cd ..

Cloning into 'improv'...
remote: Enumerating objects: 330, done.[K
remote: Counting objects: 100% (330/330), done.[K
remote: Compressing objects: 100% (229/229), done.[K
remote: Total 805 (delta 218), reused 206 (delta 101), pack-reused 475[K
Receiving objects: 100% (805/805), 6.84 MiB | 7.60 MiB/s, done.
Resolving deltas: 100% (527/527), done.
/content/improv
commit ef5f98762bd4ed2af7f9f412627f3f76987a7d24
/content


In [3]:
!test -d nlu_datasets || git clone https://github.com/rikhuijzer/nlu_datasets nlu_datasets
if not 'nlu_datasets' in sys.path:
  sys.path += ['nlu_datasets']
%cd nlu_datasets
!git log -n 1 --format='commit %H'
%cd ..

Cloning into 'nlu_datasets'...
remote: Enumerating objects: 408, done.[K
remote: Counting objects: 100% (408/408), done.[K
remote: Compressing objects: 100% (259/259), done.[K
remote: Total 408 (delta 203), reused 343 (delta 141), pack-reused 0[K
Receiving objects: 100% (408/408), 1.22 MiB | 8.65 MiB/s, done.
Resolving deltas: 100% (203/203), done.
/content/nlu_datasets
commit 12e6fe28bf4ad4ba9ebc17f6a1c053ff9de62b5e
/content


In [4]:
!pip install rasa_nlu

Collecting rasa_nlu
[?25l  Downloading https://files.pythonhosted.org/packages/a5/d2/2e6a081f3d222df01a3d941d7029e52c20619d53d557e721f096962d7293/rasa_nlu-0.13.8-py2.py3-none-any.whl (145kB)
[K    100% |████████████████████████████████| 153kB 8.0MB/s 
[?25hCollecting typing (from rasa_nlu)
  Downloading https://files.pythonhosted.org/packages/4a/bd/eee1157fc2d8514970b345d69cb9975dcd1e42cd7e61146ed841f6e68309/typing-3.6.6-py3-none-any.whl
Collecting coloredlogs (from rasa_nlu)
[?25l  Downloading https://files.pythonhosted.org/packages/08/0f/7877fc42fff0b9d70b6442df62d53b3868d3a6ad1b876bdb54335b30ff23/coloredlogs-10.0-py2.py3-none-any.whl (47kB)
[K    100% |████████████████████████████████| 51kB 17.6MB/s 
Collecting simplejson (from rasa_nlu)
[?25l  Downloading https://files.pythonhosted.org/packages/e3/24/c35fb1c1c315fc0fffe61ea00d3f88e85469004713dab488dee4f35b0aff/simplejson-3.16.0.tar.gz (81kB)
[K    100% |████████████████████████████████| 81kB 23.9MB/s 
[?25hCollecting klein 

## Pre-trained BERT model


In [0]:
# Colab form fields: the GCS bucket that receives model output, and the
# pre-trained BERT variant to start from.
bucket = 'benchmark-tpu-bucket' #@param {type:"string"}
assert bucket, 'Must specify an existing GCS bucket name'
bert_model = 'BERT-Large, Uncased' #@param ['BERT-Base, Uncased', 'BERT-Large, Uncased', 'BERT-Large, Cased', 'BERT-Large, Multilingual Cased']

# Form label -> checkpoint directory name.
# The last two entries are unavailable in cloud-tpu-checkpoints; supporting
# them needs a rewrite that pulls them via the Google storage APIs.
# NOTE(review): the 'BERT-Large, Multilingual Cased' label maps to an
# L-12/H-768 directory name - confirm which model is intended.
bert_model_map = dict([
    ('BERT-Base, Uncased', 'uncased_L-12_H-768_A-12'),
    ('BERT-Large, Uncased', 'uncased_L-24_H-1024_A-16'),
    ('BERT-Large, Cased', 'cased_L-24_H-1024_A-16'),
    ('BERT-Large, Multilingual Cased', 'multi_cased_L-12_H-768_A-12'),
])
bert_model_mapped = bert_model_map[bert_model]

# Where fine-tuned models are written, and where the pre-trained files live.
bucket_models_dir = 'gs://{}/bert/models'.format(bucket)
bert_pretrained_dir = 'gs://cloud-tpu-checkpoints/bert/' + bert_model_mapped

## Define params

In [0]:
from improv.config import HParams
from improv.utils import get_project_root
from pathlib import Path
from datetime import datetime, timedelta

# Run settings exposed as Colab form fields.
corpus = 'webapplications'  #@param ['chatbot', 'askubuntu', 'webapplications', 'snips2017']
task = 'ner' #@param ['ner', 'intent', 'ner_intent']
num_train_steps = 1000  #@param {type:"integer"}
train_batch_size = 8  #@param {type:"integer"}
new_dir = True  #@param {type:"boolean"}
if new_dir:
  # Timestamped run directory name such as '2019-01-04 14:22:46'.
  # strftime is used instead of slicing str(datetime): str() drops the
  # fractional part when microsecond == 0, so a fixed [:-7] slice would then
  # cut into the time itself.
  # NOTE(review): the +1 hour presumably shifts the runtime clock to the
  # author's local time - confirm.
  output_dir_name = (datetime.now() + timedelta(hours=1)).strftime('%Y-%m-%d %H:%M:%S')
elif 'output_dir_name' not in globals():
  # Reusing a directory only works when an earlier run in this kernel created
  # one; fail here with an explanation rather than later at the HParams call.
  raise NameError(
      'new_dir is False but output_dir_name is not defined in this kernel; '
      'run once with new_dir=True or set output_dir_name manually')
tpu_name = 'grpc://' + os.environ['COLAB_TPU_ADDR'] if use_tpu else ''
do_train = True  #@param {type:"boolean"}
do_eval = True  #@param {type:"boolean"}
do_predict = True  #@param {type:"boolean"}

h_params = HParams(
  data_dir=Path('nlu_datasets') / 'generated' / corpus / task,
  bert_config_file=os.path.join(bert_pretrained_dir, 'bert_config.json'),
  task=task,
  task_name=corpus,
  vocab_file=os.path.join(bert_pretrained_dir, 'vocab.txt'),
  output_dir=bucket_models_dir + '/{}/{}'.format(corpus, output_dir_name),
  local_dir=str(get_project_root() / 'tmp' / corpus / output_dir_name),
  init_checkpoint=os.path.join(bert_pretrained_dir, 'bert_model.ckpt'),
  # Decide on the checkpoint directory name ('uncased_...' / 'cased_...' /
  # 'multi_cased_...'). The form label ('BERT-..., Uncased') never starts
  # with 'uncased', so testing it would always yield False.
  do_lower_case=bert_model_mapped.startswith('uncased'),
  max_seq_length=128,
  do_train_eval=False,
  do_train=do_train,
  do_eval=do_eval,
  do_predict=do_predict,
  train_batch_size=train_batch_size,
  eval_batch_size=8,
  predict_batch_size=8,
  learning_rate=5e-5,
  num_train_steps=num_train_steps,
  warmup_proportion=0.1,
  # NOTE(review): the "ignored by TPUEstimator" remarks below are the original
  # author's; verify them against the run config that run() logs.
  save_checkpoints_steps=50,  # ignored by TPUEstimator
  save_summary_steps=50,  # ignored by TPUEstimator
  iterations_per_loop=1000,  # ignored by TPUEstimator
  use_tpu=use_tpu,
  tpu_name=tpu_name,
  tpu_zone=None,
  gcp_project=None,
  master=None,
  num_tpu_cores=8
)

# Create the GCS output directory and the local working directory up front.
tf.gfile.MakeDirs(h_params.output_dir)
tf.gfile.MakeDirs(h_params.local_dir)

## Main


In [7]:
from improv.kyzhouhzau_ner import main, run

# Execute the pipeline configured by h_params and keep its return value for
# the evaluation cell that follows.
# NOTE(review): presumably run() trains / evaluates / predicts according to
# the do_train / do_eval / do_predict flags and returns the predictions -
# confirm in improv.kyzhouhzau_ner.
result = run(h_params)

INFO:tensorflow:Using config: {'_model_dir': 'gs://benchmark-tpu-bucket/bert/models/webapplications/2019-01-04 14:22:46', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 50, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      value: "10.56.21.154:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0e203dc9e8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': b'grpc://10.56.21.154:8470', '_evaluation_master': b'grpc://10.56.21.154:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, 

In [8]:
# in need of refactoring into improv
# Score `result` from the run cell: weighted F1 for the intent task,
# otherwise NER evaluation followed by printing the stored scores.
if h_params.task == 'intent':
  from improv.utils import convert_result_pred
  from sklearn.metrics import f1_score
  from improv.my_classifier import get_unique_intents, get_data_filename, get_intents

  def print_intent_f1(h_params: HParams, result):
    """Print the weighted F1 score of the predicted intents.

    Args:
      h_params: Run parameters; used to locate the data file via
        get_data_filename.
      result: Iterable of predictions, each indexable by 'probabilities'
        with one score per intent.
        NOTE(review): assumes the scores are ordered like the list returned
        by get_unique_intents - confirm.
    """
    intents = get_unique_intents(get_data_filename(h_params))

    y_pred = []
    for pred in result:
      probs = pred['probabilities']
      # Index of the first maximal score, found in a single pass. This avoids
      # recomputing max(probs) for every element and shadowing the builtin id.
      best_index = max(range(len(probs)), key=lambda i: probs[i])
      y_pred.append(intents[best_index])

    y_true = get_intents(get_data_filename(h_params), training=False)
    score = f1_score(y_true, y_pred, average='weighted')
    score = round(score, 3)
    print('intents weighted f1: {}'.format(score))

  print_intent_f1(h_params, result)
else:
  from improv.kyzhouhzau_ner import evaluate_ner_pred_result

  evaluate_ner_pred_result(h_params, result)

  from pathlib import Path
  from improv.evaluate import print_scores

  # NOTE(review): presumably results.txt was written by
  # evaluate_ner_pred_result above - confirm.
  print_scores(Path(h_params.local_dir) / 'results.txt')

text: ['alternative', 'to', 'facebook']
true: ['O', 'O', 'B-WebService']
pred: ['O', 'O', 'B-WebService']


text: ['how', 'do', 'i', 'delete', 'my', 'facebook', 'account', '?']
true: ['O', 'O', 'O', 'O', 'O', 'B-WebService', 'O', 'O']
pred: ['O', 'O', 'O', 'O', 'X', 'O', 'B-WebService', 'O']


text: ['are', 'there', 'any', 'good', 'pandora', 'alternatives', 'with', 'general', 'availability', 'outside', 'the', 'us', '?']
true: ['O', 'O', 'O', 'O', 'B-WebService', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
pred: ['O', 'O', 'O', 'O', 'B-WebService', 'O', 'O', 'O', 'O', 'O', '[SEP]', 'O', 'O']


text: ['is', 'it', 'possible', 'to', 'export', 'my', 'data', 'from', 'trello', 'to', 'back', 'it', 'up', '?']
true: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-WebService', 'O', 'O', 'O', 'O', 'O']
pred: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-WebService', 'X', 'O', 'O', 'O', 'O']


text: ['is', 'there', 'an', 'online', 'alternative', 'to', 'igoogle']
true: ['O', 'O', 'O', 'O', 'O', 'O', 'B-WebSer

  'recall', 'true', average, warn_for)
