<a href="https://colab.research.google.com/github/rikhuijzer/improv/blob/master/Refactored_Google_BERT_Sentence_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import os
import tensorflow as tf
import json

# Fail fast with a readable message when the runtime has no TPU attached.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

# pprint is needed by the optional device listing below; without this import
# enabling `print_tpu_devices` raised NameError.
import pprint

from google.colab import auth

auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print_tpu_devices = False  #@param {type:"boolean"}
  if print_tpu_devices:
    print('TPU devices:')
    pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  # NOTE(review): /content/adc.json is presumably written by
  # auth.authenticate_user() above -- confirm.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.
  # Now credentials are set for all future sessions on this TPU.

TPU address is grpc://10.45.28.202:8470


In [0]:
import sys

!test -d improv || git clone https://github.com/rikhuijzer/improv improv
if not 'improv' in sys.path:
  sys.path += ['improv']

## Define params

In [0]:
from src.config import Params
from src.utils import get_project_root
from pathlib import Path

task_name = 'askubuntu'
# Available pretrained model checkpoints:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT base model
bert_model = 'uncased_L-12_H-768_A-12'
bert_pretrained_dir = 'gs://cloud-tpu-checkpoints/bert/' + bert_model
output_dir_name = '60_epochs_large' #@param {type:"string"}
bucket = 'benchmark-tpu-bucket' #@param {type:"string"}
assert bucket, 'Must specify an existing GCS bucket name'

# All run settings live in one Params object. The pretrained config, vocab and
# checkpoint are read from `bert_pretrained_dir`; results are written below
# gs://<bucket>/bert/models/<task_name>/<output_dir_name>.
params = Params(
  data_dir=get_project_root() / 'data' / task_name,
  bert_config_file=os.path.join(bert_pretrained_dir, 'bert_config.json'),
  task_name=task_name,
  vocab_file=os.path.join(bert_pretrained_dir, 'vocab.txt'),
  output_dir='gs://{}/bert/models/{}/{}'.format(bucket, task_name, output_dir_name),
  init_checkpoint=os.path.join(bert_pretrained_dir, 'bert_model.ckpt'),
  # Lower-casing has to match the checkpoint: only 'uncased_*' models use it.
  do_lower_case=bert_model.startswith('uncased'),
  max_seq_length=128,
  do_train=True,
  do_eval=False,
  do_predict=True,
  train_batch_size=32,
  eval_batch_size=8,
  predict_batch_size=8,
  learning_rate=5e-5,
  num_train_epochs=60,
  warmup_proportion=0.1,
  # A step count, so it is passed as an int (1e10 is a float). The value is
  # chosen to be far larger than any step count this run is expected to reach.
  save_checkpoints_steps=10 ** 10,
  iterations_per_loop=1000,
  use_tpu=True,
  tpu_name='grpc://' + os.environ['COLAB_TPU_ADDR'],
  tpu_zone=None,
  gcp_project=None,
  master=None,
  num_tpu_cores=8
)

tf.gfile.MakeDirs(params.output_dir)

## Last preparations

In [13]:
import tensorflow as tf

import src.tokenization as tokenization
from src.config import get_debug_params
from src.my_classifier import IntentProcessor, get_model_and_estimator, evaluate, train, predict

# Make sure the model output directory exists.
tf.gfile.MakeDirs(str(params.output_dir))

# Tokenizer configured from the vocabulary file and casing chosen in `params`.
tokenizer = tokenization.FullTokenizer(
  vocab_file=str(params.vocab_file),
  do_lower_case=params.do_lower_case,
)

# The processor is pointed at the task's data directory before the estimator
# is built from it.
processor = IntentProcessor()
processor.data_dir = params.data_dir

model_fn, estimator = get_model_and_estimator(params, processor)



INFO:tensorflow:Using config: {'_model_dir': 'gs://benchmark-tpu-bucket/bert/models/askubuntu/60_epochs_large', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 10000000000.0, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      value: "10.45.28.202:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3429e86048>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': b'grpc://10.45.28.202:8470', '_evaluation_master': b'grpc://10.45.28.202:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000,

## Train and eval

In [14]:
# Run each enabled stage in order: training first, then evaluation.
for enabled, stage in ((params.do_train, train), (params.do_eval, evaluate)):
  if enabled:
    stage(params, processor, tokenizer, estimator)

INFO:tensorflow:Writing example 0 of 96
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: train-0
INFO:tensorflow:tokens: [CLS] no matter what i do u ##bu ##nt ##u 15 . 04 does not recognize hp laser ##jet 102 ##0 [SEP]
INFO:tensorflow:input_ids: 101 2053 3043 2054 1045 2079 1057 8569 3372 2226 2321 1012 5840 2515 2025 6807 6522 9138 15759 9402 2692 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

## Predict

In [15]:
# note that predictions are non-deterministic
# The estimator is rebuilt with use_tpu switched off before predicting.
params = params._replace(use_tpu=False)
model_fn, estimator = get_model_and_estimator(params, processor)
if params.do_predict:
  # Imported before predicting so a missing dependency fails before the
  # prediction run rather than after it.
  from sklearn.metrics import f1_score

  y_pred = predict(params, processor, tokenizer, estimator)

  # Gold labels are taken from the second tab-separated column of test.tsv.
  # Blank lines (e.g. a trailing empty line) are skipped instead of raising
  # IndexError, and the file is decoded as UTF-8 regardless of locale.
  test_path = params.data_dir / "test.tsv"
  with open(str(test_path), 'r', encoding='utf-8') as f:
    y_true = [row.rstrip('\n').split('\t')[1] for row in f if row.strip()]

  score = round(f1_score(y_true, y_pred, average='micro'), 3)
  print('f1 score: {}'.format(score))

INFO:tensorflow:Using config: {'_model_dir': 'gs://benchmark-tpu-bucket/bert/models/askubuntu/60_epochs_large', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 10000000000.0, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      value: "10.45.28.202:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3429e89668>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': b'grpc://10.45.28.202:8470', '_evaluation_master': b'grpc://10.45.28.202:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000,