In [13]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context='talk', style='ticks',
        color_codes=True, rc={'legend.frameon': False})

%matplotlib inline

In [14]:
from bert.dataset import create_masked_input_dataset

In [15]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  2


In [16]:
encoded_data = create_masked_input_dataset(
    language_model_path='uniparc_5M.model',
    sequence_path='sequences_train_subset.txt',
    max_sequence_length=512,
    batch_size=20,
    buffer_size=1024,
    vocab_size=32000,
    mask_index=4,
    vocab_start=5,
    fix_sequence_length=True
)

In [17]:
print(next(iter(encoded_data)))
masked_seqs, true_values = next(iter(encoded_data))

(<tf.Tensor: id=322742, shape=(20, 512), dtype=int32, numpy=
array([[  351,     4, 24101, ...,     0,     0,     0],
       [   23,  4362,   320, ...,     0,     0,     0],
       [    4,   102,     4, ...,     0,     0,     0],
       ...,
       [   23,     9,    10, ...,     0,     0,     0],
       [   23,  3808,    11, ...,     0,     0,     0],
       [   23,    39,   909, ...,     0,     0,     0]], dtype=int32)>, <tf.Tensor: id=322743, shape=(20, 512), dtype=int32, numpy=
array([[   0,  162,  966, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       [4963,    0,  152, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,   11, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0]], dtype=int32)>)


## BERT layers development

In [18]:
from bert.layers import (PositionEmbedding, Attention, Transformer, TokenEmbedding, Bias,
                         gelu, masked_sparse_cross_entropy_loss, BERTLearningRateScheduler)

MAX_ENCODED_LENGTH = 512

embedding = PositionEmbedding(MAX_ENCODED_LENGTH + 1, MAX_ENCODED_LENGTH, mask_zero=True)
assert np.all(embedding(masked_seqs)[0, 2, :] == embedding(masked_seqs)[5, 2, :])

inputs = embedding(masked_seqs)
out = Attention(8, name='test')(inputs)
out.shape

transformer = Transformer(8, 0.1, name='test')
out = transformer(inputs)
out2 = transformer(out)

In [19]:
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import layers
from bert.layers import initializer

embedding_dimension = 4
vocab_size = 32000
model_dimension = 16
num_transformer_layers = 2


mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():

    inputs = layers.Input(shape=(MAX_ENCODED_LENGTH,), dtype=tf.int32, batch_size=None)

    token_embedding_layer = TokenEmbedding(
        vocab_size, embedding_dimension, embeddings_initializer=initializer(), mask_zero=True)
    token_embeddings = token_embedding_layer(inputs)
    position_embeddings = PositionEmbedding(
        MAX_ENCODED_LENGTH + 1, embedding_dimension, embeddings_initializer=initializer(),
        mask_zero=True)(inputs)

    embeddings = layers.Add()([token_embeddings, position_embeddings])
    embeddings = layers.Dense(model_dimension)(embeddings)

    transformer = Transformer(4)
    for i in range(num_transformer_layers):
        embeddings = transformer(embeddings)

    out = layers.Dense(embedding_dimension, activation=gelu, kernel_initializer=initializer())(embeddings)
    out = token_embedding_layer(out, transpose=True)
    out = Bias()(out)

    model = tf.keras.Model([inputs], [out], name='model')
    model.summary()
    
    true_labels = layers.Input(shape=(None,), dtype=tf.int32, batch_size=None)
    model.compile(loss=masked_sparse_cross_entropy_loss, target_tensors=true_labels,
                  optimizer=tfa.optimizers.AdamW(weight_decay=0.01, learning_rate=1E-3))

    model.fit(encoded_data, epochs=5,
#              callbacks=[BERTLearningRateScheduler(initial_learning_rate=1E-3)], 
              verbose=1)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
token_embedding_2 (TokenEmbeddi multiple             128000      input_5[0][0]                    
                                                                 dense_9[0][0]                    
__________________________________________________________________________________________________
position_embedding_4 (PositionE (None, 512, 4)       2052        input_5[0][0]                    
__________________________________________________________________________________________________
add_2 (Add)                     (None, 512, 4)       0           token_embedding_2[0][0]      

OperatorNotAllowedInGraphError: using a `tf.Tensor` as a Python `bool` is not allowed in Graph execution. Use Eager execution or decorate this function with @tf.function.

In [9]:
# model.save('test_model.h5')

In [10]:
model = tf.keras.models.load_model(
    'test_model.h5',
    custom_objects={
        'PositionEmbedding': PositionEmbedding,
        'TokenEmbedding': TokenEmbedding,
        'Attention': Attention,
        'Transformer': Transformer,
        'Bias': Bias,
        'gelu': gelu,
    }, compile=False)

true_labels = layers.Input(shape=(None,), dtype=tf.int32, batch_size=None)
model.compile(loss=masked_sparse_cross_entropy_loss, target_tensors=true_labels,
              optimizer=tfa.optimizers.AdamW(weight_decay=0.01, learning_rate=1E-3))

In [11]:
# eval_data = valid_data.map(sp_encode_tf, num_parallel_calls=tf.data.experimental.AUTOTUNE).padded_batch(60, padded_shapes=([512],))
# eval_encoded = next(iter(eval_data))

In [12]:
bert_predict = model.predict(encoded_data.take(3), verbose=1)

      3/Unknown - 6s 2s/step

In [13]:
bert_predict.shape

(60, 512, 32000)