<a href="https://colab.research.google.com/github/somilasthana/deeplearningtutorial/blob/master/BERT_NLU_101.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Using BERT TF hub

In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

W0530 15:15:05.254896 139819035359104 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [4]:
!pip install bert-tensorflow # Need Bert Libraries 

Collecting bert-tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/a6/66/7eb4e8b6ea35b7cc54c322c816f976167a43019750279a8473d355800a93/bert_tensorflow-1.0.1-py2.py3-none-any.whl (67kB)
[K     |████████████████████████████████| 71kB 2.7MB/s 
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.1


In [0]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

In [0]:
OUTPUT_DIR = './'

In [0]:
from tensorflow import keras

import os
import re

In [0]:
# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
  data = {}
  data["sentence"] = []
  data["sentiment"] = []
  
  for file_path in os.listdir(directory):
    with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
      data["sentence"].append(f.read())
      data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
  return pd.DataFrame.from_dict(data)

In [0]:
# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

In [0]:
# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
  dataset = tf.keras.utils.get_file(
      
      fname="aclImdb.tar.gz", 
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
      extract=True
  )
  
  train_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                       "aclImdb", "train"))
  test_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                      "aclImdb", "test"))
  
  return train_df, test_df

In [11]:
train, test = download_and_load_datasets()

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [12]:
train.shape, test.shape

((25000, 3), (25000, 3))

In [0]:
train = train.sample(5000)
test = test.sample(5000)

In [14]:
train.columns

Index(['sentence', 'sentiment', 'polarity'], dtype='object')

In [15]:
test.columns

Index(['sentence', 'sentiment', 'polarity'], dtype='object')

In [0]:
DATA_COLUMN = 'sentence'
LABEL_COLUMN = 'polarity'
label_list=[0,1]

In [17]:
train.head()

Unnamed: 0,sentence,sentiment,polarity
3719,"This is a very grim, hard hitting, even brutal...",10,1
6475,"There are bad movies, then there are the movie...",3,0
22815,Not only did they get the characters all wrong...,1,0
1088,In a far away Galaxy is a planet called Ceta. ...,1,0
4613,Joseph Brady and Clarence Doolittle are two sa...,9,1


Transform our data into a format BERT understands.

In [0]:
train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(
    
    guid=None,
    text_a=x[DATA_COLUMN],
    text_b=None, # useful in case text_a and text_b has a relation like translation
    label=x[LABEL_COLUMN]

), axis=1)

In [0]:
test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(
    
    guid=None,
    text_a=x[DATA_COLUMN],
    text_b=None,
    label=x[LABEL_COLUMN]
    
), axis =1)

Next, we need to preprocess our data so that it matches the data BERT was trained on.

Lowercase our text (if we're using a BERT lowercase model)

Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])

Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])

Map our words to indexes using a vocab file that BERT provides

Add special "CLS" and "SEP" tokens (see the readme)

Append "index" and "segment" tokens to each input (see the BERT paper)

In [0]:
# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

In [0]:
def create_tokenizer_from_hub_module():
  with tf.Graph().as_default():
    bert_module = hub.Module(BERT_MODEL_HUB)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([
          tokenization_info["vocab_file"],
          tokenization_info["do_lower_case"]
      ])
  return bert.tokenization.FullTokenizer(
      
      vocab_file=vocab_file,
      do_lower_case=do_lower_case
  
  )

In [22]:
tokenizer = create_tokenizer_from_hub_module()

Instructions for updating:
Colocations handled automatically by placer.


W0530 15:18:04.480695 139819035359104 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0530 15:18:07.188583 139819035359104 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [23]:
tokenizer.tokenize("Trying POC with BERT technique")

['trying', 'po', '##c', 'with', 'bert', 'technique']

Using our tokenizer, we'll call run_classifier.convert_examples_to_features on our InputExamples to convert them into features BERT understands.

In [0]:
MAX_SEQ_LENGTH = 128

In [0]:
train_features = bert.run_classifier.convert_examples_to_features(
    train_InputExamples,
    label_list,
    MAX_SEQ_LENGTH,
    tokenizer
)

In [0]:
test_features = bert.run_classifier.convert_examples_to_features(
    test_InputExamples,
    label_list,
    MAX_SEQ_LENGTH,
    tokenizer
)

In [27]:
train_features[0]

<bert.run_classifier.InputFeatures at 0x7f299bfeecc0>

Creating a Model

First, it loads the BERT tf hub module

Next, it creates a single new layer that will be trained to adapt BERT to our sentiment task

In [0]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels, num_labels):
  
  bert_module = hub.Module(BERT_MODEL_HUB, trainable=True)
  
  bert_inputs = dict(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids)
  
  bert_outputs = bert_module(inputs=bert_inputs, signature="tokens", as_dict=True)
  
  # Use "pooled_output" for classification tasks on an entire sentence.
  
  output_layer = bert_outputs["pooled_output"]
  
  hidden_size = output_layer.shape[-1].value
  
  # Create our own layer to tune for politeness data.
  output_weights = tf.get_variable(
      "output_weights",
      [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02)
  )
  
  output_bias = tf.get_variable(
      "output_bias",
      [num_labels],
      initializer=tf.zeros_initializer()
  )
  
  with tf.variable_scope("loss"):
    
    # Dropout helps prevent overfitting
    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
    
    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    
    # Convert labels into one-hot encoding
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    
    predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
    
    if is_predicting:
      return (predicted_labels, log_probs)
    
    per_example_loss = -tf.reduce_sum(one_hot_labels*log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)
    
  

Next we'll wrap our model function in a model_fn_builder function that adapts our model to work for training, evaluation, and prediction.



In [0]:
def model_fn_builder(num_labels, learning_rate, num_train_steps, num_warmup_steps):
  
  """Returns `model_fn` closure for TPUEstimator."""
  def model_fn(features, labels, mode, params):
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    
    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)
    
    # TRAIN and EVAL
    
    if not is_predicting:
      (loss, predicted_labels, log_probs) = create_model(is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)
      
      train_ops = bert.optimization.create_optimizer(loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
      
      # Calculate evaluation metrics
      
      def metrics_fn(label_ids, predicted_labels):
        accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
        f1_score = tf.contrib.metrics.f1_score(label_ids, predicted_labels)
        auc = tf.metrics.auc(label_ids, predicted_labels)
        recall = tf.metrics.recall(label_ids, predicted_labels)
        precision = tf.metrics.precision(label_ids, predicted_labels)  
        true_pos = tf.metrics.true_positives(label_ids, predicted_labels)
        true_neg = tf.metrics.true_negatives(label_ids, predicted_labels)
        false_pos = tf.metrics.false_positives(label_ids, predicted_labels)  
        false_neg = tf.metrics.false_negatives(label_ids, predicted_labels)
        return {
            "eval_accuracy": accuracy,
            "f1_score": f1_score,
            "auc": auc,
            "precision": precision,
            "recall": recall,
            "true_positives": true_pos,
            "true_negatives": true_neg,
            "false_positives": false_pos,
            "false_negatives": false_neg
        }
      
      eval_metrics = metrics_fn(label_ids, predicted_labels)
      
      if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_ops)
      
      else:
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics)
    else:
      (predicted_labels, log_probs) = create_model(is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)
      
      predictions = {
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      
      return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    
  return model_fn

In [0]:
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAINING_EPOCHS = 3.0

# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1

# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

In [0]:
num_train_steps = int(len(train_features) / BATCH_SIZE*NUM_TRAINING_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [0]:
# Specify outpit directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
    
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS

)

In [0]:
model_fn = model_fn_builder(
    num_labels=len(label_list),
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps
)

In [110]:
estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

INFO:tensorflow:Using config: {'_model_dir': './', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f2980fa1be0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


I0530 16:04:40.363842 139819035359104 estimator.py:201] Using config: {'_model_dir': './', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f2980fa1be0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [0]:
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False
)

In [112]:
print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

Beginning Training!
INFO:tensorflow:Skipping training since max_steps has already saved.


I0530 16:04:45.108958 139819035359104 estimator.py:351] Skipping training since max_steps has already saved.


Training took time  0:00:00.015986


In [0]:
test_input_fn = bert.run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False
)

In [101]:
estimator.evaluate(input_fn=test_input_fn, steps=None)

INFO:tensorflow:Calling model_fn.


I0530 16:01:31.523900 139819035359104 estimator.py:1111] Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0530 16:01:36.056493 139819035359104 saver.py:1483] Saver not created because there are no variables in the graph to restore
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Done calling model_fn.


I0530 16:01:47.750455 139819035359104 estimator.py:1113] Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2019-05-30T16:01:47Z


I0530 16:01:47.784786 139819035359104 evaluation.py:257] Starting evaluation at 2019-05-30T16:01:47Z


INFO:tensorflow:Graph was finalized.


I0530 16:01:49.666142 139819035359104 monitored_session.py:222] Graph was finalized.


Instructions for updating:
Use standard file APIs to check for files with this prefix.


W0530 16:01:49.679651 139819035359104 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.


INFO:tensorflow:Restoring parameters from ./model.ckpt-468


I0530 16:01:49.687999 139819035359104 saver.py:1270] Restoring parameters from ./model.ckpt-468


INFO:tensorflow:Running local_init_op.


I0530 16:01:52.337591 139819035359104 session_manager.py:491] Running local_init_op.


INFO:tensorflow:Done running local_init_op.


I0530 16:01:52.637682 139819035359104 session_manager.py:493] Done running local_init_op.


INFO:tensorflow:Finished evaluation at 2019-05-30-16:03:15


I0530 16:03:15.424422 139819035359104 evaluation.py:277] Finished evaluation at 2019-05-30-16:03:15


INFO:tensorflow:Saving dict for global step 468: auc = 0.8671559, eval_accuracy = 0.867, f1_score = 0.8649197, false_negatives = 389.0, false_positives = 276.0, global_step = 468, loss = 0.5121848, precision = 0.88523906, recall = 0.84551233, true_negatives = 2206.0, true_positives = 2129.0


I0530 16:03:15.427202 139819035359104 estimator.py:1979] Saving dict for global step 468: auc = 0.8671559, eval_accuracy = 0.867, f1_score = 0.8649197, false_negatives = 389.0, false_positives = 276.0, global_step = 468, loss = 0.5121848, precision = 0.88523906, recall = 0.84551233, true_negatives = 2206.0, true_positives = 2129.0


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 468: ./model.ckpt-468


I0530 16:03:19.396086 139819035359104 estimator.py:2039] Saving 'checkpoint_path' summary for global step 468: ./model.ckpt-468


{'auc': 0.8671559,
 'eval_accuracy': 0.867,
 'f1_score': 0.8649197,
 'false_negatives': 389.0,
 'false_positives': 276.0,
 'global_step': 468,
 'loss': 0.5121848,
 'precision': 0.88523906,
 'recall': 0.84551233,
 'true_negatives': 2206.0,
 'true_positives': 2129.0}

In [0]:
def get_predictions(in_sentences):
  labels=["Negative", "Positive"]
  input_examples = [bert.run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0) for x in in_sentences]
  input_features = bert.run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = bert.run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
  predictions = estimator.predict(predict_input_fn)
  return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]


In [0]:
pred_sentences = [
  "That movie was absolutely awful",
  "The acting was a bit lacking",
  "The film was creative and surprising",
  "Absolutely fantastic!"
]

In [0]:
predictions = get_predictions(pred_sentences)


In [114]:
predictions

[('That movie was absolutely awful',
  array([-2.2854893e-03, -6.0822968e+00], dtype=float32),
  'Negative'),
 ('The acting was a bit lacking',
  array([-0.03562292, -3.3525236 ], dtype=float32),
  'Negative'),
 ('The film was creative and surprising',
  array([-6.1076026e+00, -2.2283979e-03], dtype=float32),
  'Positive'),
 ('Absolutely fantastic!',
  array([-5.652477e+00, -3.515019e-03], dtype=float32),
  'Positive')]