In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Configure seaborn's global plotting defaults for the whole notebook.
sns.set(
    context='talk',
    style='ticks',
    color_codes=True,
    rc={'legend.frameon': False},
)

%matplotlib inline

In [2]:
from bert.dataset import create_masked_input_dataset

In [3]:
import tensorflow as tf
# Report how many GPU devices TensorFlow can see.
print(
    "Num GPUs Available: ",
    len(tf.config.experimental.list_physical_devices('GPU')),
)

Num GPUs Available:  1


In [4]:
# Input files: the SentencePiece model loaded below and the text file of
# sequences (one per line) read by the dataset pipeline.
language_model_path = 'uniparc_5M.model'
sequence_path = 'sequences_train_subset.txt'

# Sequence length cap and batching/shuffling parameters for the pipeline.
max_sequence_length = 512
batch_size = 20
buffer_size = 1024

# Token-id layout used by the masking function: random replacement ids are
# drawn from [vocab_start, vocab_size) and masked positions get mask_index.
vocab_size = 32000
mask_index = 4
vocab_start = 5

# When True, batches are padded to max_sequence_length.
fix_sequence_length = True

In [5]:
import numpy as np
import tensorflow as tf
import sentencepiece as spm

# Load the SentencePiece model from `language_model_path`; the encode/decode
# helpers below all go through this shared processor instance.
sp = spm.SentencePieceProcessor()
sp.Load(language_model_path)

def sp_encode(line_tensor):
    """Encode one line of text into SentencePiece ids with sampled segmentation.

    The line is tokenised with `sp.SampleEncodeAsIds` (nbest_size=-1,
    alpha=0.5). If the result has more than `max_sequence_length` ids, a
    randomly positioned contiguous window of exactly `max_sequence_length`
    ids is returned instead.

    Parameters
    ----------
    line_tensor:
        Eager tensor holding the raw line; its `.numpy()` value is encoded.

    Returns
    -------
    numpy.ndarray
        Array of token ids, at most `max_sequence_length` long.
    """
    token_ids = np.asarray(
        sp.SampleEncodeAsIds(line_tensor.numpy(), nbest_size=-1, alpha=0.5))

    overflow = len(token_ids) - max_sequence_length
    if overflow <= 0:
        return token_ids

    # Too long: pick a random start so the window still fits inside the array.
    start = np.random.randint(low=0, high=overflow + 1)
    return token_ids[start:(start + max_sequence_length)]

def sp_decode(line_tensor):
    """Decode a tensor of SentencePiece ids back into text via `sp.DecodeIds`."""
    token_ids = line_tensor.numpy().tolist()
    return sp.DecodeIds(token_ids)

def sp_encode_tf(line_tensor):
    """Wrap `sp_encode` in `tf.py_function` so it can be used in `Dataset.map`.

    The output signature is declared as a one-element list of int32 tensors.
    """
    return tf.py_function(
        func=sp_encode,
        inp=[line_tensor],
        Tout=[tf.int32,])

def mask_input(input_tensor):
    """ Randomly mask the input tensor according to the formula prescribed by BERT.

    Roughly 15% of the input tokens are selected. Of the selected tokens,
    about 80% receive the [MASK] token (`mask_index`), about 10% are replaced
    by a random token id drawn from [`vocab_start`, `vocab_size`), and the
    remaining ~10% are left unchanged.

    Parameters
    ----------
    input_tensor:
        Integer tensor of token ids. Its static `.shape` is used to build the
        random tensors, so it must be fully defined (it is when this function
        runs eagerly through `tf.py_function`, as in `mask_input_tf`).

    Returns
    -------
    masked_tensor:
        Tensor shaped like `input_tensor`, with the selected positions
        replaced by [MASK] / random ids as described above.
    input_mask:
        Boolean tensor shaped like `input_tensor`; True where a position was
        selected for prediction.
    true_tensor:
        Tensor shaped like `input_tensor` holding the original ids at the
        selected positions and zero (the pad value) everywhere else.
    """
    shape = input_tensor.shape
    select_fraction = 0.15

    # A single uniform score per position decides both whether the position
    # is selected and which of the three treatments it receives.
    mask_score = tf.random.uniform(shape, maxval=1, dtype=tf.float32)
    input_mask = mask_score < select_fraction

    # Lowest 80% of the selected range: replace with the [MASK] token
    mask_mask = mask_score <= select_fraction * 0.8

    # Top 10% of the selected range: replace with a random token
    # (scores between the two thresholds keep their original token)
    mask_random = (mask_score >= select_fraction * 0.9) & input_mask

    # Tensors to replace with where input is masked or randomized
    mask_value_tensor = tf.ones(shape, dtype=tf.int32) * mask_index
    random_value_tensor = tf.random.uniform(
        shape, minval=vocab_start, maxval=vocab_size, dtype=tf.int32)
    pad_value_tensor = tf.zeros(shape, dtype=tf.int32)

    # Use the replacements to mask the input tensor
    masked_tensor = tf.where(mask_mask, mask_value_tensor, input_tensor)
    masked_tensor = tf.where(mask_random, random_value_tensor, masked_tensor)

    # Set true values to zero (pad value) where not masked
    true_tensor = tf.where(input_mask, input_tensor, pad_value_tensor)

    return masked_tensor, input_mask, true_tensor

def mask_input_tf(input_tensor):
    """Run `mask_input` through `tf.py_function` and regroup its three outputs.

    Returns a `((masked_ids, selection_mask), true_ids)` pair, i.e. an
    (inputs, target) structure, with dtypes (int32, bool) and int32.
    """
    masked_ids, selection_mask, true_ids = tf.py_function(
        func=mask_input,
        inp=[input_tensor],
        Tout=[tf.int32, tf.bool, tf.int32])
    return (masked_ids, selection_mask), true_ids


# Read the raw sequences, one per line of the text file.
valid_data = tf.data.TextLineDataset(sequence_path)

# Tokenise each line, then apply the BERT-style masking.
encoded_data = (
    valid_data
    .map(sp_encode_tf, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .map(mask_input_tf, num_parallel_calls=tf.data.experimental.AUTOTUNE)
)


# Padded length of every batch dimension: max_sequence_length when
# fix_sequence_length is set, otherwise -1.
tf_seq_len = max_sequence_length if fix_sequence_length else -1

encoded_data = (
    encoded_data
    .shuffle(buffer_size=buffer_size)
    .padded_batch(
        batch_size,
        padded_shapes=(([tf_seq_len], [tf_seq_len]), [tf_seq_len]))
)

print(next(iter(encoded_data)))

((<tf.Tensor: id=35051, shape=(20, 512), dtype=int32, numpy=
array([[ 6206,   151,  2087, ...,     0,     0,     0],
       [  243,   158,  2257, ...,     0,     0,     0],
       [  243,     4,     6, ...,     0,     0,     0],
       ...,
       [ 6206, 21019, 21502, ...,     0,     0,     0],
       [    4,     5,  5035, ...,     0,     0,     0],
       [  328,    14,   104, ...,     0,     0,     0]], dtype=int32)>, <tf.Tensor: id=35052, shape=(20, 512), dtype=bool, numpy=
array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False,  True, False, ..., False, False, False],
       ...,
       [ True, False, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])>), <tf.Tensor: id=35053, shape=(20, 512), dtype=int32, numpy=
array([[   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
      

In [6]:
# Pull a single batch and unpack it into the two model inputs and the targets.
# NOTE(review): `input_mask` is rebound to a Keras Input in the model-building
# cell further down, so this batch value is only valid until that cell runs.
(masked_seqs, input_mask), true_values = next(iter(encoded_data))

## BERT layers development

In [7]:
from bert.layers import (PositionEmbedding, Attention, Transformer, TokenEmbedding, Bias,
                         gelu, masked_sparse_cross_entropy_loss, BERTLearningRateScheduler)

# Sequence length used for the position embedding and the model inputs below.
MAX_ENCODED_LENGTH = 512

# Sanity check: the embedding at position 2 must be identical for two
# different rows (0 and 5) of the batch.
embedding = PositionEmbedding(MAX_ENCODED_LENGTH + 1, MAX_ENCODED_LENGTH, mask_zero=True)
assert np.all(embedding(masked_seqs)[0, 2, :] == embedding(masked_seqs)[5, 2, :])

# Smoke-test a standalone Attention layer on the embedded batch.
inputs = embedding(masked_seqs)
out = Attention(8, name='test')(inputs)
# NOTE(review): this is not the last expression of the cell, so nothing is displayed.
out.shape

# Smoke-test a Transformer block; the same instance is applied twice in a row.
transformer = Transformer(8, 0.1, name='test')
out = transformer(inputs)
out2 = transformer(out)

In [13]:
import tensorflow.keras.backend as K

class Bias(tf.keras.layers.Layer):
    """ Final bias layer added to logits prior to softmax scoring.

    The layer is called on a two-element list ``[logits, mask]``. It adds a
    trainable, zero-initialised bias vector to ``logits`` and reports the
    second input as its output mask, replacing whatever Keras mask the
    upstream layers attached.

    The base class is referenced as ``tf.keras.layers.Layer`` so this cell
    does not depend on the ``layers`` alias, which is only imported in a
    later cell.
    """

    def build(self, input_shape):
        # `input_shape` holds one shape per input; the bias has one entry per
        # element of the last axis of the first input (the logits).
        self.bias = self.add_weight(name='classifier_bias',
                                    dtype=K.floatx(),
                                    shape=[input_shape[0][-1]],
                                    initializer=tf.zeros_initializer())

    def call(self, inputs):
        # Only the first input is transformed; the second is used by compute_mask.
        return tf.nn.bias_add(inputs[0], self.bias)

    def compute_mask(self, inputs, mask=None):
        # Ignore the incoming Keras mask and return the second input instead.
        return inputs[1]

In [20]:
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import layers
from bert.layers import initializer

# Model hyperparameters for this test model.
# NOTE(review): `vocab_size` re-declares the value already set in the config cell.
embedding_dimension = 4
vocab_size = 32000
model_dimension = 16
num_transformer_layers = 2


# mirrored_strategy = tf.distribute.MirroredStrategy()
# with mirrored_strategy.scope():

# Two model inputs: the masked token ids and the boolean selection mask.
# NOTE(review): this rebinds `input_mask`, which earlier held a batch tensor.
inputs = layers.Input(shape=(MAX_ENCODED_LENGTH,), dtype=tf.int32, batch_size=None)
input_mask = layers.Input(shape=(MAX_ENCODED_LENGTH,), dtype=tf.bool, batch_size=None)

# Token and position embeddings are both computed from the token ids and summed.
token_embedding_layer = TokenEmbedding(
    vocab_size, embedding_dimension, embeddings_initializer=initializer(), mask_zero=True)
token_embeddings = token_embedding_layer(inputs)
position_embeddings = PositionEmbedding(
    MAX_ENCODED_LENGTH + 1, embedding_dimension, embeddings_initializer=initializer(),
    mask_zero=True)(inputs)

# Project the summed embeddings from embedding_dimension up to model_dimension.
embeddings = layers.Add()([token_embeddings, position_embeddings])
embeddings = layers.Dense(model_dimension)(embeddings)

# A single Transformer instance is applied num_transformer_layers times.
# NOTE(review): reusing one instance presumably shares weights across passes — confirm this is intended.
transformer = Transformer(4)
for i in range(num_transformer_layers):
    embeddings = transformer(embeddings)

# Project back to embedding_dimension, then reuse the token embedding layer
# with transpose=True to produce the outputs, and add the final bias.
out = layers.Dense(embedding_dimension, activation=gelu, kernel_initializer=initializer())(embeddings)
out = token_embedding_layer(out, transpose=True)
out = Bias()([out, input_mask])

model = tf.keras.Model([inputs, input_mask], [out], name='model')
model.summary()

# Targets are supplied through an explicit integer placeholder of variable length.
true_labels = layers.Input(shape=(None,), dtype=tf.int32, batch_size=None)
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    target_tensors=true_labels,
    optimizer=tfa.optimizers.AdamW(weight_decay=0.01, learning_rate=1E-3))

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           [(None, 512)]        0                                            
__________________________________________________________________________________________________
token_embedding_5 (TokenEmbeddi multiple             128000      input_15[0][0]                   
                                                                 dense_13[0][0]                   
__________________________________________________________________________________________________
position_embedding_6 (PositionE (None, 512, 4)       2052        input_15[0][0]                   
__________________________________________________________________________________________________
add_5 (Add)                     (None, 512, 4)       0           token_embedding_5[0][0]      

In [21]:
# Train on the masked-sequence dataset for five epochs
# (the learning-rate scheduler callback is currently disabled).
model.fit(
    encoded_data,
    epochs=5,
    # callbacks=[BERTLearningRateScheduler(initial_learning_rate=1E-3)],
    verbose=1,
)

Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f8d956c4b10>

In [None]:
# model.save('test_model.h5')

In [None]:
# Reload the model from disk, registering the custom layers/activation by name.
# NOTE(review): this expects 'test_model.h5' to exist already; the save call in
# the preceding cell is commented out.
model = tf.keras.models.load_model(
    'test_model.h5',
    custom_objects=dict(
        PositionEmbedding=PositionEmbedding,
        TokenEmbedding=TokenEmbedding,
        Attention=Attention,
        Transformer=Transformer,
        Bias=Bias,
        gelu=gelu,
    ),
    compile=False,
)

# The model was loaded uncompiled; compile it here with the masked loss.
true_labels = layers.Input(shape=(None,), dtype=tf.int32, batch_size=None)
model.compile(
    loss=masked_sparse_cross_entropy_loss,
    target_tensors=true_labels,
    optimizer=tfa.optimizers.AdamW(weight_decay=0.01, learning_rate=1E-3),
)

In [None]:
# eval_data = valid_data.map(sp_encode_tf, num_parallel_calls=tf.data.experimental.AUTOTUNE).padded_batch(60, padded_shapes=([512],))
# eval_encoded = next(iter(eval_data))

In [None]:
# Run inference on the first three batches of the masked dataset.
bert_predict = model.predict(encoded_data.take(3), verbose=1)

In [None]:
# Inspect the shape of the prediction output.
bert_predict.shape