In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context='talk', style='ticks',
        color_codes=True, rc={'legend.frameon': False})

%matplotlib inline

In [2]:
from bert.dataset import create_masked_input_dataset

In [3]:
# Build the masked-input dataset from the training subset. The arguments are
# grouped by what they describe: input files, vocabulary/sequence geometry,
# masking indices, and batching.
masked_dataset = create_masked_input_dataset(
    sequence_path='sequences_train_subset.txt',
    language_model_path='uniparc_5M.model',
    vocab_size=32000,
    max_sequence_length=512,
    fix_sequence_length=True,
    mask_index=4,
    vocab_start=5,
    batch_size=20,
    buffer_size=1024,
)

# Only the first 5 batches are kept so the remaining cells run quickly.
encoded_data = masked_dataset.take(5)

In [4]:
# Fetch exactly one batch and reuse it for both the printout and the
# variables used by later cells. Building two separate iterators pulls two
# batches from the pipeline, and the printed one is not guaranteed to be the
# one stored below (e.g. if the pipeline shuffles or masks randomly —
# TODO confirm against create_masked_input_dataset).
first_batch = next(iter(encoded_data))
print(first_batch)

# Each dataset element is a pair: (masked token ids, true values).
masked_seqs, true_values = first_batch

(<tf.Tensor: id=32049, shape=(20, 512), dtype=int32, numpy=
array([[   23,    27,  1023, ...,     0,     0,     0],
       [  492, 13564,     5, ...,     0,     0,     0],
       [  422,   184,    12, ...,     0,     0,     0],
       ...,
       [ 5429,  1302,  2380, ...,     0,     0,     0],
       [ 9009,  1642,    67, ...,     0,     0,     0],
       [    4,  1371,     6, ...,     0,     0,     0]], dtype=int32)>, <tf.Tensor: id=32050, shape=(20, 512), dtype=int32, numpy=
array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [23,  0,  0, ...,  0,  0,  0]], dtype=int32)>)


## BERT layers development

In [5]:
from bert.layers import (PositionEmbedding, Attention, Transformer, TokenEmbedding, Bias,
                         gelu, masked_sparse_cross_entropy_loss, BERTLearningRateScheduler)

MAX_ENCODED_LENGTH = 512

# Eager smoke tests of the custom layers on the batch sampled above.
embedding = PositionEmbedding(MAX_ENCODED_LENGTH + 1, MAX_ENCODED_LENGTH, mask_zero=True)

# Embed the batch once and reuse the result; previously the layer was re-run
# three times on the same input.
inputs = embedding(masked_seqs)

# Rows 0 and 5 are different sequences, yet they must receive the same
# embedding vector at position 2.
assert np.all(inputs[0, 2, :] == inputs[5, 2, :])

# Check that Attention accepts the embedded batch. The result is overwritten
# just below. A bare `out.shape` in the middle of a cell displays nothing, so
# that dead expression was dropped.
out = Attention(8, name='test')(inputs)

# Apply a single Transformer instance twice in a row to check that its output
# can be fed back in as input.
transformer = Transformer(8, 0.1, name='test')
out = transformer(inputs)
out2 = transformer(out)

In [6]:
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import layers
from bert.layers import initializer

# Hyperparameters for a deliberately tiny model used to exercise the layers.
embedding_dimension = 4
vocab_size = 32000
model_dimension = 16
num_transformer_layers = 2

# Integer token ids, one fixed-length row per sequence.
inputs = layers.Input(
    batch_size=None,
    shape=(MAX_ENCODED_LENGTH,),
    dtype=tf.int32,
)

# The token embedding layer is kept in a variable because it is called a
# second time near the end of this cell with transpose=True.
token_embedding_layer = TokenEmbedding(
    vocab_size,
    embedding_dimension,
    embeddings_initializer=initializer(),
    mask_zero=True,
)
token_embeddings = token_embedding_layer(inputs)

position_embeddings = PositionEmbedding(
    MAX_ENCODED_LENGTH + 1,
    embedding_dimension,
    embeddings_initializer=initializer(),
    mask_zero=True,
)(inputs)

# Sum the two embeddings, then project up to the transformer width.
embeddings = layers.Add()([token_embeddings, position_embeddings])
embeddings = layers.Dense(units=model_dimension)(embeddings)

# NOTE(review): one Transformer instance is applied num_transformer_layers
# times, so every pass reuses the same weights. If independent layers are
# intended, construct a new Transformer inside the loop — confirm the intent.
transformer = Transformer(4)
for i in range(num_transformer_layers):
    embeddings = transformer(embeddings)

# Output head: project back down to the embedding width, call the token
# embedding layer in transpose mode (presumably mapping back onto the
# vocabulary — TODO confirm), and finally add a bias.
out = layers.Dense(
    embedding_dimension,
    activation=gelu,
    kernel_initializer=initializer(),
)(embeddings)
out = token_embedding_layer(out, transpose=True)
out = Bias()(out)

model = tf.keras.Model(inputs=[inputs], outputs=[out], name='model')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
token_embedding (TokenEmbedding multiple             128000      input_1[0][0]                    
                                                                 dense_3[0][0]                    
__________________________________________________________________________________________________
position_embedding_1 (PositionE (None, 512, 4)       2052        input_1[0][0]                    
__________________________________________________________________________________________________
add (Add)                       (None, 512, 4)       0           token_embedding[0][0]        

In [7]:
# %pdb on  # uncomment to drop into the debugger when a cell raises

# Placeholder for the integer labels, handed to Keras via `target_tensors`.
true_labels = layers.Input(batch_size=None, dtype=tf.int32, shape=(None,))

# Masked cross-entropy loss, optimized with AdamW (decoupled weight decay).
model.compile(
    optimizer=tfa.optimizers.AdamW(learning_rate=1E-3, weight_decay=0.01),
    loss=masked_sparse_cross_entropy_loss,
    target_tensors=true_labels,
)

In [8]:
# Train for a single epoch over the sampled batches; the learning rate is
# driven by the BERT scheduler callback, starting from 1E-3.
model.fit(
    encoded_data,
    epochs=1,
    verbose=1,
    callbacks=[BERTLearningRateScheduler(initial_learning_rate=1E-3)],
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




<tensorflow.python.keras.callbacks.History at 0x13a4ef450>

In [9]:
# Persist the trained model to an HDF5 file in the working directory; the
# next cell reloads it to check that the custom layers round-trip.
model.save('test_model.h5')

In [10]:
# Reload the saved model. The custom layers and the activation function have
# to be supplied by name so Keras can deserialize them. compile=False skips
# restoring the training configuration, so the model is compiled again below.
model = tf.keras.models.load_model(
    'test_model.h5',
    compile=False,
    custom_objects=dict(
        PositionEmbedding=PositionEmbedding,
        TokenEmbedding=TokenEmbedding,
        Attention=Attention,
        Transformer=Transformer,
        Bias=Bias,
        gelu=gelu,
    ),
)

# Same compile settings as before saving: a label placeholder passed via
# `target_tensors`, the masked cross-entropy loss, and AdamW.
true_labels = layers.Input(batch_size=None, dtype=tf.int32, shape=(None,))
model.compile(
    optimizer=tfa.optimizers.AdamW(learning_rate=1E-3, weight_decay=0.01),
    loss=masked_sparse_cross_entropy_loss,
    target_tensors=true_labels,
)

In [11]:
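# Disabled evaluation pipeline, kept for reference only: `valid_data` and
# `sp_encode_tf` are not defined in any cell shown here, so it will not run as-is.
# eval_data = valid_data.map(sp_encode_tf, num_parallel_calls=tf.data.experimental.AUTOTUNE).padded_batch(60, padded_shapes=([512],))
# eval_encoded = next(iter(eval_data))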

In [12]:
# Run inference on the first 3 batches only. The recorded output has shape
# (60, 512, 32000) — close to a billion values held in memory at once — so
# keep the number of batches small.
bert_predict = model.predict(encoded_data.take(3), verbose=1)

      3/Unknown - 6s 2s/step

In [13]:
# Show the shape of the stacked predictions; presumably
# (sequences, positions, vocabulary) given 3 batches of 20 — TODO confirm.
bert_predict.shape

(60, 512, 32000)