In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa

# Query the physical GPU devices TensorFlow can see and report how many there are.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [2]:
from bert.dataset import create_masked_input_dataset

In [3]:
vocab_size = 22
max_seq_len = 256

from bert.dataset import create_masked_input_dataset

# Masked training batches read from the training-split sequence file.
training_data = create_masked_input_dataset(
    sequence_path='../uniparc_data/sequences_train.txt',
    max_sequence_length=max_seq_len,
    batch_size=4,
    masking_freq=.05)

# Dataset transformations return a *new* dataset rather than modifying the
# receiver, so the result must be assigned back. Without the assignment the
# repeat/prefetch pipeline is silently discarded and `training_data` stays
# finite and un-prefetched.
training_data = training_data.repeat().prefetch(tf.data.experimental.AUTOTUNE)

# Masked validation batches, built with the same settings as the training set.
valid_data = create_masked_input_dataset(
    sequence_path='../uniparc_data/sequences_valid.txt',
    max_sequence_length=max_seq_len,
    batch_size=4,
    masking_freq=.05)

In [6]:
# Pull a single (masked inputs, target labels) batch from the validation set.
valid_batch = next(iter(valid_data))
masked_seqs, true_values = valid_batch

In [7]:
# Select the input tokens at every position whose target label is non-zero.
label_mask = true_values != 0
masked_seqs[label_mask]

<tf.Tensor: id=72448, shape=(39,), dtype=int32, numpy=
array([10,  1,  1,  1,  1, 20,  1,  1,  1, 21, 16,  1,  1,  1,  1,  5,  1,
        1,  1, 21,  1,  1, 19,  1,  1,  1,  1,  1,  1, 14,  1,  1,  1,  1,
        1,  6,  1,  1, 15], dtype=int32)>

## BERT layers

In [8]:
from tensorflow.keras import layers

from bert.layers import (PositionEmbedding, Attention, Transformer, TokenEmbedding, Bias,
                         gelu, initializer, Projection)

In [None]:
class RelativeAttention(Attention):
    """Attention layer that additionally records a maximum relative position.

    Args:
        units: forwarded unchanged to ``Attention.__init__``.
        num_heads: forwarded unchanged to ``Attention.__init__``.
        max_relative_position: stored on the instance as
            ``self.max_relative_position``; not otherwise used by this class yet.
        **kwargs: extra keyword arguments forwarded to ``Attention.__init__``.
    """

    def __init__(self, units, num_heads, max_relative_position, **kwargs):
        self.max_relative_position = max_relative_position
        # The one-argument form `super(RelativeAttention)` creates an *unbound*
        # super object, so `.__init__(...)` would re-initialise that super
        # object (raising TypeError) instead of running Attention.__init__.
        # The zero-argument form binds to `self` correctly.
        super().__init__(units, num_heads, **kwargs)


In [10]:
# Model hyperparameters.
embedding_dimension = 128
max_embedding_sequence_length = 1024
model_dimension = 128
transformer_dimension = 4 * model_dimension
# One attention head per 64 model dimensions (2 heads for model_dimension = 128).
num_attention_heads = model_dimension // 64
num_transformer_layers = 1
dropout_rate = 0.

# Integer token ids; batch size and sequence length are both left unspecified.
inputs = layers.Input(shape=(None,), dtype=tf.int32, batch_size=None)

# Keep a handle on the token embedding layer: it is called a second time below
# (with transpose=True) to produce the model output.
token_embedding_layer = TokenEmbedding(
    vocab_size, embedding_dimension, embeddings_initializer=initializer(),
    mask_zero=True)
token_embeddings = token_embedding_layer(inputs)
# Position embedding table sized for max_embedding_sequence_length + 1 entries.
position_embeddings = PositionEmbedding(
    max_embedding_sequence_length + 1, embedding_dimension,
    embeddings_initializer=initializer(),
    mask_zero=True)(inputs)

# Sum token and position embeddings, then project to the model dimension.
embeddings = layers.Add()([token_embeddings, position_embeddings])
embeddings = Projection(model_dimension, dropout_rate,
                        use_residual=False)(embeddings)

# NOTE(review): a single Transformer instance is created here and then called
# once per loop iteration, so raising num_transformer_layers above 1 would
# re-apply the same layer object (presumably sharing its weights) rather than
# stacking independent layers -- confirm this is intended.
transformer = Transformer(num_attention_heads, transformer_dimension,
                          dropout=dropout_rate)

for i in range(num_transformer_layers):
    embeddings = transformer(embeddings)

# Output head: dense layer with gelu activation back to embedding_dimension,
# then the token embedding layer applied with transpose=True, then a bias.
# Presumably the transposed call reuses the embedding matrix to score each
# vocabulary entry -- TODO confirm in bert.layers.
out = layers.Dense(embedding_dimension, activation=gelu,
                   kernel_initializer=initializer())(embeddings)
out = token_embedding_layer(out, transpose=True)
out = Bias()(out)

model = tf.keras.Model(inputs, out, name='model')

In [11]:
model(masked_seqs).shape

TensorShape([4, 256, 22])

In [15]:
from bert.optimizers import ECE, masked_sparse_categorical_crossentropy, BertLinearSchedule
    
# Symbolic input that carries the integer target labels (variable length).
true_labels = tf.keras.layers.Input(
    batch_size=None, shape=(None,), dtype=tf.int32)

# AdamW optimizer from TensorFlow Addons.
opt = tfa.optimizers.AdamW(weight_decay=0.001, learning_rate=1E-4)

# Compile with the project's masked loss and the ECE metric; the label
# placeholder above is supplied as the target tensor.
model.compile(
    optimizer=opt,
    loss=masked_sparse_categorical_crossentropy,
    metrics=[ECE],
    target_tensors=true_labels,
    experimental_run_tf_function=True)

In [16]:
# Schedule callback from bert.optimizers, built with the same positional arguments.
lr_callback = BertLinearSchedule(1E-4, 1000, int(1E7))

# Train for 5 epochs of 1000 steps each, validating on 100 batches per epoch.
model.fit(
    training_data,
    validation_data=valid_data,
    epochs=5,
    steps_per_epoch=1000,
    validation_steps=100,
    verbose=1,
    callbacks=[lr_callback])

Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5

KeyboardInterrupt: 