In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn/matplotlib theme for every figure in this notebook:
# 'talk' context, 'ticks' style, short colour codes enabled, legends drawn
# without a frame.
sns.set(
    context='talk',
    style='ticks',
    color_codes=True,
    rc={'legend.frameon': False},
)

%matplotlib inline

In [2]:
import tensorflow as tf
# Report how many GPUs TensorFlow can see before building anything.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [3]:
# Shared configuration for the data pipeline and the model.
# vocab_size is passed to create_masked_input_dataset and to TokenEmbedding.
vocab_size = 8000
# max_seq_len is used as max_sequence_length for the datasets and as the
# length of both Keras Input tensors.
max_seq_len = 512

In [4]:
from bert.dataset import create_masked_input_dataset

# Keyword arguments shared by the training and validation pipelines, so the
# two datasets cannot silently drift apart; only the sequence file differs.
dataset_kwargs = dict(
    language_model_path='sentencepiece_models/uniparc_10M_8000.model',
    max_sequence_length=max_seq_len,
    batch_size=20,
    buffer_size=1024,
    vocab_size=vocab_size,
    mask_index=4,
    vocab_start=5,
    fix_sequence_length=True)

training_data = create_masked_input_dataset(
    sequence_path='/projects/bpms/pstjohn/uniparc/sequences_train.txt',
    **dataset_kwargs)

# tf.data transformations return a NEW dataset rather than modifying the
# receiver, so the result has to be assigned back. Without the assignment the
# repeat()/prefetch() calls are no-ops as far as model.fit is concerned.
training_data = training_data.repeat().prefetch(tf.data.experimental.AUTOTUNE)

valid_data = create_masked_input_dataset(
    sequence_path='/projects/bpms/pstjohn/uniparc/sequences_valid.txt',
    **dataset_kwargs)

# Same reasoning as above: keep the prefetched dataset, not the original one.
valid_data = valid_data.prefetch(tf.data.experimental.AUTOTUNE)

# Display the final validation dataset (shapes and dtypes) as the cell output.
valid_data

<PrefetchDataset shapes: (((None, 512), (None, 512)), (None, 512, 1)), types: ((tf.int32, tf.bool), tf.int32)>

In [5]:
# Pull a single batch from the training pipeline for inspection.
# NOTE: `input_mask` is rebound to a Keras Input tensor when the model is
# built further down, so this batch value is only valid until then.
first_batch = next(iter(training_data))
(masked_seqs, input_mask), true_values = first_batch

In [6]:
# Sanity check: largest integer id in the sampled batch
# (presumably expected to stay below vocab_size — confirm).
masked_seqs.numpy().max()

7994

In [7]:
# Shape of the target tensor for the sampled batch.
true_values.shape

TensorShape([20, 512, 1])

## BERT layers development

In [8]:
from tensorflow.keras import layers
from bert.layers import (PositionEmbedding, Attention, Transformer, TokenEmbedding, Bias,
                         Projection, gelu, masked_sparse_cross_entropy_loss, InverseSquareRootSchedule,
                         initializer)

# ---- Hyperparameters for a small, development-sized model ----
# vocab_size / max_seq_len repeat the values used to build the datasets and
# must stay in sync with them.
vocab_size = 8000
max_seq_len = 512
embedding_dimension = 32
model_dimension = 64
transformer_ff_dimension = model_dimension * 4  # 4x model width -> 256
num_attention_heads = model_dimension // 16  # 64 // 16 -> 4 heads
num_transformer_layers = 4

dropout_rate = 0.1

# Fixed learning rate for the optimizer; no Horovod / GPU-count scaling is
# applied anywhere in this notebook.
# NOTE(review): warmup_updates is not referenced by any other visible cell.
learning_rate = 1E-4
warmup_updates = 3000


# ---- Model inputs ----
# Two inputs of length max_seq_len: int32 ids and a boolean mask
# (presumably marking the positions to predict — confirm against bert.dataset).
inputs = layers.Input(shape=(max_seq_len,), dtype=tf.int32, batch_size=None)
input_mask = layers.Input(shape=(max_seq_len,), dtype=tf.bool, batch_size=None)

# ---- Embeddings ----
# Token and position embeddings are both computed from the id tensor at
# embedding_dimension. The token embedding layer object is kept because it is
# called a second time in the output head below.
token_embedding_layer = TokenEmbedding(
    vocab_size, embedding_dimension, embeddings_initializer=initializer(), mask_zero=True)
token_embeddings = token_embedding_layer(inputs)
position_embeddings = PositionEmbedding(
    max_seq_len + 1, embedding_dimension, embeddings_initializer=initializer(),
    mask_zero=True)(inputs)

# Sum the two embeddings, then apply Projection (presumably mapping
# embedding_dimension -> model_dimension — confirm in bert.layers).
embeddings = layers.Add()([token_embeddings, position_embeddings])
embeddings = Projection(model_dimension, use_residual=False)(embeddings)

# ---- Transformer stack ----
# A single Transformer instance is called num_transformer_layers times, so
# every pass reuses the same layer object and therefore the same weights.
transformer = Transformer(num_attention_heads, transformer_ff_dimension, dropout=dropout_rate)
for i in range(num_transformer_layers):
    embeddings = transformer(embeddings)

# ---- Output head ----
# Dense back to embedding_dimension, then the token embedding layer is called
# again with transpose=True (presumably projecting onto the vocabulary with
# the tied embedding matrix — confirm), followed by Bias (which also receives
# input_mask) and a Softmax, so the model outputs probabilities, not logits.
out = layers.Dense(embedding_dimension, activation=gelu, kernel_initializer=initializer())(embeddings)
out = token_embedding_layer(out, transpose=True)
out = Bias()([out, input_mask])
out = layers.Softmax()(out)

model = tf.keras.Model([inputs, input_mask], [out], name='model')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
token_embedding (TokenEmbedding multiple             256000      input_1[0][0]                    
                                                                 dense[0][0]                      
__________________________________________________________________________________________________
position_embedding (PositionEmb (None, 512, 32)      16416       input_1[0][0]                    
__________________________________________________________________________________________________
add (Add)                       (None, 512, 32)      0           token_embedding[0][0]        

In [9]:
# Development check: apply each attention head of the shared transformer to
# the current embeddings and join the per-head outputs along the last axis.
tf.concat(
    [head(embeddings) for head in transformer.attention_heads],
    axis=-1,
)

<tf.Tensor 'concat:0' shape=(None, 512, 64) dtype=float32>

In [10]:
# import tensorflow_addons as tfa

# Plain Adam with a fixed learning rate; no distributed (Horovod) optimizer
# wrapper is applied in this cell.
opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# The model ends in a Softmax layer, so the loss is left at its default
# from_logits=False and consumes probabilities.
model.compile(
    optimizer=opt,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

# Disabled alternative, kept for reference. It uses from_logits=True, which
# expects raw logits rather than the Softmax output the model produces now.
# model.compile(
#     loss=tf.keras.losses.SparseCategoricalCrossentropy(
#         from_logits=True, reduction=tf.keras.losses.Reduction.NONE),
#     metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
#     optimizer=opt)

In [11]:
# Short smoke-test run: 3 epochs of 100 training batches each, with
# 10 validation batches evaluated per epoch.
model.fit(
    training_data,
    epochs=3,
    steps_per_epoch=100,
    validation_data=valid_data,
    validation_steps=10,
    verbose=1,
)

Train for 100 steps, validate for 10 steps
Epoch 1/3


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fc569635c50>

In [None]:
# Write the trained model to 'test_model.h5' in the current working
# directory (the .h5 suffix selects Keras' HDF5 format).
model.save('test_model.h5')

In [None]:
# Reload the saved model to check that it round-trips through HDF5.
# Keras can only rebuild custom layers and activations that are listed in
# `custom_objects`. `Projection` is used when the model is built, so it must
# be registered here as well; without it deserialization fails on the
# unknown layer class.
# NOTE(review): if bert.layers.initializer() returns a custom initializer
# class, that class would also need an entry here — confirm.
model = tf.keras.models.load_model(
    'test_model.h5',
    custom_objects={
        'PositionEmbedding': PositionEmbedding,
        'TokenEmbedding': TokenEmbedding,
        'Attention': Attention,
        'Transformer': Transformer,
        'Projection': Projection,
        'Bias': Bias,
        'gelu': gelu,
    })

# true_labels = layers.Input(shape=(None,), dtype=tf.int32, batch_size=None)
# model.compile(loss=masked_sparse_cross_entropy_loss, target_tensors=true_labels,
#               optimizer=tfa.optimizers.AdamW(weight_decay=0.01, learning_rate=1E-3))

In [None]:
# eval_data = valid_data.map(sp_encode_tf, num_parallel_calls=tf.data.experimental.AUTOTUNE).padded_batch(60, padded_shapes=([512],))
# eval_encoded = next(iter(eval_data))

In [None]:
# `encoded_data` is not defined by any cell of this notebook, so the original
# call raised NameError on a fresh kernel. Predict on three batches of the
# validation dataset instead. Its elements are ((ids, mask), targets) tuples,
# and predict() only consumes the inputs part.
# NOTE(review): swap in a different dataset here if another evaluation set
# was intended.
bert_predict = model.predict(valid_data.take(3), verbose=1)

In [None]:
# Inspect the shape of the array returned by model.predict.
bert_predict.shape