https://towardsdatascience.com/bert-in-keras-with-tensorflow-hub-76bcbc9417b

https://github.com/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb

In [1]:
!pip install tensorflow_hub

Collecting tensorflow_hub
[?25l  Downloading https://files.pythonhosted.org/packages/00/0e/a91780d07592b1abf9c91344ce459472cc19db3b67fdf3a61dca6ebb2f5c/tensorflow_hub-0.7.0-py2.py3-none-any.whl (89kB)
[K     |████████████████████████████████| 92kB 4.0MB/s eta 0:00:011
Installing collected packages: tensorflow-hub
Successfully installed tensorflow-hub-0.7.0
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
!pip install bert-tensorflow

Collecting bert-tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/a6/66/7eb4e8b6ea35b7cc54c322c816f976167a43019750279a8473d355800a93/bert_tensorflow-1.0.1-py2.py3-none-any.whl (67kB)
[K     |████████████████████████████████| 71kB 3.8MB/s eta 0:00:011
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.1
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_hub as hub




In [4]:
BERT_URL = 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'
bert_module = hub.Module(BERT_URL)

In [5]:
from bert import tokenization
import bert

# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Get the vocab file and casing info from the Hub module."""
  with tf.Graph().as_default():
    bert_module = hub.Module(BERT_MODEL_HUB)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
      
  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore








In [6]:
tokenizer.tokenize("This here's an example of using the BERT tokenizer")

['this',
 'here',
 "'",
 's',
 'an',
 'example',
 'of',
 'using',
 'the',
 'bert',
 'token',
 '##izer']

In [7]:
def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
    tokens = ['[CLS]']
    tokens.extend(tokenizer.tokenize(sentence))
    if len(tokens) > max_seq_len-1:
        tokens = tokens[:max_seq_len-1]
    tokens.append('[SEP]')
    
    segment_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    #Zero Mask till seq_length
    zero_mask = [0] * (max_seq_len-len(tokens))
    input_ids.extend(zero_mask)
    input_mask.extend(zero_mask)
    segment_ids.extend(zero_mask)
    
    return input_ids, input_mask, segment_ids

def convert_sentences_to_features(sentences, tokenizer, max_seq_len=20):
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    
    for sentence in sentences:
        input_ids, input_mask, segment_ids = convert_sentence_to_features(sentence, tokenizer, max_seq_len)
        all_input_ids.append(input_ids)
        all_input_mask.append(input_mask)
        all_segment_ids.append(segment_ids)
    
    return all_input_ids, all_input_mask, all_segment_ids


In [8]:
shakespeare_url = "https://homl.info/shakespeare" # shortcut URL
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare


In [9]:
shakespeare_text.replace('.', )

TypeError: replace() takes at least 2 arguments (1 given)

In [None]:
tokens = tokenizer.tokenize(shakespeare_text)
encoded = tokenizer.convert_tokens_to_ids(tokens)

In [29]:
tokens

['first',
 'citizen',
 ':',
 'before',
 'we',
 'proceed',
 'any',
 'further',
 ',',
 'hear',
 'me',
 'speak',
 '.',
 'all',
 ':',
 'speak',
 ',',
 'speak',
 '.',
 'first',
 'citizen',
 ':',
 'you',
 'are',
 'all',
 'resolved',
 'rather',
 'to',
 'die',
 'than',
 'to',
 'fa',
 '##mis',
 '##h',
 '?',
 'all',
 ':',
 'resolved',
 '.',
 'resolved',
 '.',
 'first',
 'citizen',
 ':',
 'first',
 ',',
 'you',
 'know',
 'cai',
 '##us',
 'marc',
 '##ius',
 'is',
 'chief',
 'enemy',
 'to',
 'the',
 'people',
 '.',
 'all',
 ':',
 'we',
 'know',
 "'",
 't',
 ',',
 'we',
 'know',
 "'",
 't',
 '.',
 'first',
 'citizen',
 ':',
 'let',
 'us',
 'kill',
 'him',
 ',',
 'and',
 'we',
 "'",
 'll',
 'have',
 'corn',
 'at',
 'our',
 'own',
 'price',
 '.',
 'is',
 "'",
 't',
 'a',
 'verdict',
 '?',
 'all',
 ':',
 'no',
 'more',
 'talking',
 'on',
 "'",
 't',
 ';',
 'let',
 'it',
 'be',
 'done',
 ':',
 'away',
 ',',
 'away',
 '!',
 'second',
 'citizen',
 ':',
 'one',
 'word',
 ',',
 'good',
 'citizens',
 '.',
 '

In [28]:
encoded

[2034,
 6926,
 1024,
 2077,
 2057,
 10838,
 2151,
 2582,
 1010,
 2963,
 2033,
 3713,
 1012,
 2035,
 1024,
 3713,
 1010,
 3713,
 1012,
 2034,
 6926,
 1024,
 2017,
 2024,
 2035,
 10395,
 2738,
 2000,
 3280,
 2084,
 2000,
 6904,
 15630,
 2232,
 1029,
 2035,
 1024,
 10395,
 1012,
 10395,
 1012,
 2034,
 6926,
 1024,
 2034,
 1010,
 2017,
 2113,
 29080,
 2271,
 7871,
 4173,
 2003,
 2708,
 4099,
 2000,
 1996,
 2111,
 1012,
 2035,
 1024,
 2057,
 2113,
 1005,
 1056,
 1010,
 2057,
 2113,
 1005,
 1056,
 1012,
 2034,
 6926,
 1024,
 2292,
 2149,
 3102,
 2032,
 1010,
 1998,
 2057,
 1005,
 2222,
 2031,
 9781,
 2012,
 2256,
 2219,
 3976,
 1012,
 2003,
 1005,
 1056,
 1037,
 14392,
 1029,
 2035,
 1024,
 2053,
 2062,
 3331,
 2006,
 1005,
 1056,
 1025,
 2292,
 2009,
 2022,
 2589,
 1024,
 2185,
 1010,
 2185,
 999,
 2117,
 6926,
 1024,
 2028,
 2773,
 1010,
 2204,
 4480,
 1012,
 2034,
 6926,
 1024,
 2057,
 2024,
 14729,
 3532,
 4480,
 1010,
 1996,
 10717,
 3619,
 2204,
 1012,
 2054,
 3691,
 14175,
 20175,
 20

In [23]:
tokenizer.convert_tokens_to_ids(shakespeare_text.replace("\n", " "))

KeyError: 'F'

In [11]:
sentences = shakespeare_text.split('.')

In [14]:
input_ids_vals, input_mask_vals, segment_ids_vals = convert_sentences_to_features(sentences, tokenizer, 20)

In [19]:
input_ids_vals[0]

[101,
 2034,
 6926,
 1024,
 2077,
 2057,
 10838,
 2151,
 2582,
 1010,
 2963,
 2033,
 3713,
 102,
 0,
 0,
 0,
 0,
 0,
 0]

In [None]:
# Build model
in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
bert_inputs = [in_id, in_mask, in_segment]

# Instantiate the custom Bert Layer defined above
bert_output = BertLayer(n_fine_tune_layers=10)(bert_inputs)

# Build the rest of the classifier 
dense = tf.keras.layers.Dense(256, activation='relu')(bert_output)
pred = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(
    [train_input_ids, train_input_masks, train_segment_ids], 
    train_labels,
    validation_data=([test_input_ids, test_input_masks, test_segment_ids], test_labels),
    epochs=1,
    batch_size=32
)

In [10]:
import tensorflow_hub as hub
import tensorflow as tf

BERT_URL = 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'
module = hub.Module(BERT_URL)
sess = tf.Session()
sess.run(tf.global_variables_initializer())

input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None])
input_mask = tf.placeholder(dtype=tf.int32, shape=[None, None])
segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None])

bert_inputs = dict(
    input_ids=input_ids,
    input_mask=input_mask,
    segment_ids=segment_ids)

bert_outputs = module(bert_inputs, signature="tokens", as_dict=True)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [11]:
sentences = ['New Delhi is the capital of India', 'The capital of India is Delhi']
input_ids_vals, input_mask_vals, segment_ids_vals = convert_sentences_to_features(sentences, tokenizer, 20)
out = sess.run(bert_outputs, feed_dict={input_ids: input_ids_vals, input_mask: input_mask_vals, segment_ids: segment_ids_vals})

#out has two keys `dict_keys(['sequence_output', 'pooled_output'])`
sentences = ['I prefer Python over Java', 'I like coding in Python', 'coding is fun']
input_ids_vals, input_mask_vals, segment_ids_vals = convert_sentences_to_features(sentences, tokenizer, 20)

out = sess.run(bert_outputs, feed_dict={input_ids: input_ids_vals, input_mask: input_mask_vals, segment_ids: segment_ids_vals})


In [13]:
out['pooled_output']

array([[-0.6547959 ,  0.41325322,  0.99984545, ...,  0.9999431 ,
        -0.6780196 ,  0.97909325],
       [-0.75220275,  0.46669203,  0.9998463 , ...,  0.9999476 ,
        -0.29017213,  0.9752825 ],
       [-0.69471467,  0.4643009 ,  0.9999368 , ...,  0.9999857 ,
        -0.8504318 ,  0.99613315]], dtype=float32)

In [37]:
out['sequence_output'][1].shape

(20, 768)