https://medium.com/@vineet.mundhra/loading-bert-with-tensorflow-hub-7f5a1c722565

https://towardsdatascience.com/bert-in-keras-with-tensorflow-hub-76bcbc9417b

https://github.com/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb

In [1]:
!pip install tensorflow_hub

Collecting tensorflow_hub
[?25l  Downloading https://files.pythonhosted.org/packages/00/0e/a91780d07592b1abf9c91344ce459472cc19db3b67fdf3a61dca6ebb2f5c/tensorflow_hub-0.7.0-py2.py3-none-any.whl (89kB)
[K     |████████████████████████████████| 92kB 1.2MB/s eta 0:00:011
Installing collected packages: tensorflow-hub
Successfully installed tensorflow-hub-0.7.0
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [77]:
!pip install tensorflow_datasets

Collecting tensorflow_datasets
[?25l  Downloading https://files.pythonhosted.org/packages/88/b9/74c219b0310b3df0ac60c4948c4191b9377b6b746615b176819533096fb5/tensorflow_datasets-2.0.0-py3-none-any.whl (3.1MB)
[K     |████████████████████████████████| 3.1MB 3.9MB/s eta 0:00:01
Collecting dill
[?25l  Downloading https://files.pythonhosted.org/packages/c7/11/345f3173809cea7f1a193bfbf02403fff250a3360e0e118a1630985e547d/dill-0.3.1.1.tar.gz (151kB)
[K     |████████████████████████████████| 153kB 45.5MB/s eta 0:00:01
Collecting tqdm
[?25l  Downloading https://files.pythonhosted.org/packages/cd/80/5bb262050dd2f30f8819626b7c92339708fe2ed7bd5554c8193b4487b367/tqdm-4.42.1-py2.py3-none-any.whl (59kB)
[K     |████████████████████████████████| 61kB 12.4MB/s eta 0:00:01
Collecting future
[?25l  Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)
[K     |████████████████████████████████| 829kB 45.6MB

In [10]:
import numpy as np

In [11]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_hub as hub
tf.enable_eager_execution()

In [14]:
shakespeare_url = "https://homl.info/shakespeare" # shortcut URL
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

In [15]:
text = shakespeare_text.split()[:1000]

In [5]:
len(text)

1000

# Fit tokenizer needed to one-hot encode labels

In [43]:
tokenizer = keras.preprocessing.text.Tokenizer(filters="")
tokenizer.fit_on_texts([text])

In [44]:
max_id = len(tokenizer.word_index) 

In [16]:
bytes_vocab = list(set([bytes(v, 'utf-8') for v in text]))

In [40]:
def get_index(x):
    return bytes_vocab.index(x)

In [41]:
t = tf.constant(np.array([b'Citizen:', b'Before', b'we', b'proceed', b'any', b'further,',
       b'hear', b'me', b'speak.', b'All:']))

In [42]:
lookup_dict = {tf.constant(w):  get_index(w) for w in bytes_vocab}

In [43]:
def get_lookup(x):
    return lookup_dict[x]

In [44]:
tf.map_fn(get_lookup, t)

KeyError: <tf.Tensor: id=1684, shape=(), dtype=string, numpy=b'Citizen:'>

In [6]:
embed = hub.load("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1")
embeddings = embed(text[:1000])

In [104]:
dataset = tf.data.Dataset.from_tensor_slices(text)

In [89]:
for d in dataset.take(1):
    print(d)

tf.Tensor(b'First', shape=(), dtype=string)


In [None]:
table = tf.contrib.lookup.index_table_from_tensor(
    mapping=bytes_vocab, default_value=0)


In [None]:
table.

In [126]:
n_steps = 10
window_length = n_steps + 1 # target = input shifted 1 character ahead
dataset = dataset.repeat().window(window_length, shift=1, drop_remainder=True)

In [127]:
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [128]:
dataset = dataset.map(lambda windows: (windows[:-1], windows[1:]))

In [129]:
for d in dataset.take(1):
    print(d)

(<tf.Tensor: id=278803, shape=(10,), dtype=string, numpy=
array([b'First', b'Citizen:', b'Before', b'we', b'proceed', b'any',
       b'further,', b'hear', b'me', b'speak.'], dtype=object)>, <tf.Tensor: id=278804, shape=(10,), dtype=string, numpy=
array([b'Citizen:', b'Before', b'we', b'proceed', b'any', b'further,',
       b'hear', b'me', b'speak.', b'All:'], dtype=object)>)


In [130]:
dataset = dataset.map(
    lambda X_batch, Y_batch: (embed(X_batch), tf.one_hot(table.lookup(Y_batch), 
                     len(bytes_vocab), dtype=tf.int8)))

In [131]:
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)

In [None]:
for d in dataset.take(1):
    print(d)

# 1. One hot encode Y
# 2. Encode stop words, Comas, etc.


In [14]:
for d in dataset.take(1):
    print(d)

(<tf.Tensor: id=250, shape=(32, 10, 50), dtype=float32, numpy=
array([[[ 1.58649027e-01,  1.28589392e-01, -1.29613027e-01, ...,
         -1.01859465e-01, -3.05732191e-01, -1.98210493e-01],
        [-2.56005861e-02, -5.26675545e-02,  3.39039713e-02, ...,
         -5.28172441e-02, -1.56363156e-02, -4.26248722e-02],
        [ 8.79037604e-02,  1.23868331e-01,  1.46265309e-02, ...,
          5.23213763e-03,  2.99760997e-02,  4.18323092e-02],
        ...,
        [-1.87684625e-01,  1.11483440e-01, -6.98339492e-02, ...,
         -8.72121304e-02, -8.81422907e-02, -1.21769913e-01],
        [-2.02914149e-01, -1.01657093e-01,  1.79136079e-02, ...,
         -3.63735557e-02, -5.38155087e-04,  9.52688009e-02],
        [-1.33177489e-01,  9.86592174e-02, -4.51362878e-01, ...,
         -1.23879559e-01, -5.18544391e-02,  6.37197196e-02]],

       [[-2.51807515e-02,  3.18714857e-01,  6.77748024e-03, ...,
          1.11583963e-01, -3.34233195e-01, -4.43047024e-02],
        [ 3.23567688e-01, -1.01629056e-0

In [38]:
dataset = dataset.map(lambda (X, y): (embed(X), embed(y)))

SyntaxError: invalid syntax (<ipython-input-38-62e6d67f2ffd>, line 1)

In [None]:
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True,
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])
#model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
#history = model.fit(dataset, steps_per_epoch=train_size // batch_size, epochs=4)
#model.save("model.h5")

In [None]:
import tensorflow_hub as hub

model = keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1",
                   dtype=tf.string, input_shape=[], output_shape=[50]),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])