In [9]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn styling for every figure in this notebook: 'talk' context,
# 'ticks' style, short color codes enabled, and legend frames turned off.
sns.set(context='talk', style='ticks',
        color_codes=True, rc={'legend.frameon': False})

%matplotlib inline

In [10]:
import tensorflow as tf

# Report how many physical GPU devices TensorFlow detects.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [11]:
from tensorflow.keras import layers

from bert.dataset import create_masked_input_dataset
from bert.layers import (PositionEmbedding, Attention, Transformer, TokenEmbedding, Bias,
                         gelu, masked_sparse_cross_entropy_loss, InverseSquareRootSchedule,
                         initializer, Projection)

In [22]:
vocab_size = 22
max_seq_len = 1024
batchSize = 10

# One-letter codes of the 20 residues this notebook can encode.
_AMINO_ACIDS = ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K',
                'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V',
                'W', 'Y')

# Residue -> token id, starting at 2 (ids 0 and 1 are not assigned to any
# residue here). Built once instead of on every encode() call.
_TOKEN_IDS = {residue: index + 2 for index, residue in enumerate(_AMINO_ACIDS)}


def encode(line_tensor):
    """Convert one sequence line into an int32 array of token ids.

    Args:
        line_tensor: object whose ``.numpy()`` returns the UTF-8 encoded
            sequence as bytes (e.g. an eager string tensor).

    Returns:
        1-D ``np.ndarray`` of dtype int32 with one id per residue. Lines
        longer than ``max_seq_len`` are cropped to a random contiguous window
        of exactly ``max_seq_len`` residues.

    Raises:
        KeyError: if the line contains a character outside ``_AMINO_ACIDS``.
    """
    line = line_tensor.numpy().decode('utf8')

    # Random crop: any window start from 0 to len(line) - max_seq_len
    # (inclusive) is equally likely.
    if len(line) > max_seq_len:
        offset = np.random.randint(
            low=0, high=len(line) - max_seq_len + 1)
        line = line[offset:(offset + max_seq_len)]

    # Explicit int32 matches the Tout declared in encode_tf and keeps an empty
    # line from producing a float64 array (NumPy's default for an empty list).
    return np.asarray([_TOKEN_IDS[item] for item in line], dtype=np.int32)

def encode_tf(line_tensor):
    """Wrap the Python-level ``encode`` so it can be used inside a tf.data pipeline.

    Returns whatever ``tf.py_function`` yields for a one-element ``Tout``
    list of ``tf.int32``.
    """
    output_types = [tf.int32]
    encoded = tf.py_function(encode, inp=[line_tensor], Tout=output_types)
    return encoded

def _masked_dataset(sequence_path):
    """Build the repeated, prefetched masked-input dataset for one sequence file.

    Args:
        sequence_path: path of the text file handed to
            ``create_masked_input_dataset``.

    Returns:
        The dataset from ``create_masked_input_dataset`` with ``.repeat()``
        and ``.prefetch(AUTOTUNE)`` applied.
    """
    # NOTE(review): encode() assigns ids 2..21, so mask_index=4 coincides with
    # the id of 'D' and vocab_start=5 skips the ids of A/C/D -- confirm this
    # is intended against create_masked_input_dataset.
    dataset = create_masked_input_dataset(
        encode_fn=encode_tf,
        sequence_path=sequence_path,
        max_sequence_length=max_seq_len,
        batch_size=batchSize,
        buffer_size=1024,
        vocab_size=vocab_size,
        mask_index=4,
        vocab_start=5,
        fix_sequence_length=False)

    # tf.data transformations return a NEW dataset; the result has to be kept,
    # otherwise repeat/prefetch silently have no effect.
    return dataset.repeat().prefetch(tf.data.experimental.AUTOTUNE)


training_data = _masked_dataset(
    '/projects/bpms/pstjohn/uniparc/sequences_train.txt')

valid_data = _masked_dataset(
    '/projects/bpms/pstjohn/uniparc/sequences_valid.txt')

# Last expression of the cell: display the resulting dataset spec.
valid_data

<PrefetchDataset shapes: (((None, None), (None, None)), (None, None, 1)), types: ((tf.int32, tf.bool), tf.int32)>

## BERT layers

In [47]:
# Assemble a small BERT-style encoder as a Keras functional model and print
# its summary.

# Hyperparameters.
embedding_dimension = 128
model_dimension = 128
transformer_dimension = 4 * model_dimension  # 512
num_attention_heads = model_dimension // 64  # one head per 64 dimensions -> 2
num_transformer_layers = 1

dropout_rate = 0.

# Two fixed-length inputs: integer token ids and a boolean mask.
inputs = layers.Input(shape=(max_seq_len,), dtype=tf.int32, batch_size=None)
input_mask = layers.Input(shape=(max_seq_len,), dtype=tf.bool, batch_size=None)

# The token-embedding layer is kept in a variable because it is called a
# second time below with transpose=True.
token_embedding_layer = TokenEmbedding(
    vocab_size, embedding_dimension, embeddings_initializer=initializer(), mask_zero=True)
token_embeddings = token_embedding_layer(inputs)
# NOTE(review): the table has max_seq_len + 1 rows, presumably to reserve one
# row for the masked (zero) position -- confirm in bert.layers.
position_embeddings = PositionEmbedding(
    max_seq_len + 1, embedding_dimension, embeddings_initializer=initializer(),
    mask_zero=True)(inputs)

# Token and position embeddings are summed element-wise. With the Projection
# below commented out, nothing maps embedding_dimension to model_dimension
# (both are 128 here).
embeddings = layers.Add()([token_embeddings, position_embeddings])
# embeddings = Projection(model_dimension, use_residual=False)(embeddings)

# A single Transformer instance is created outside the loop, so every
# iteration re-applies the same layer object (weights are shared across
# iterations). With num_transformer_layers = 1 it is applied once.
transformer = Transformer(num_attention_heads, transformer_dimension, dropout=dropout_rate)
for i in range(num_transformer_layers):
    embeddings = transformer(embeddings)

# Output head: Dense + gelu, then the token-embedding layer called again with
# transpose=True (presumably projecting back onto the vocabulary with the same
# embedding weights -- confirm in bert.layers), then Bias, which also receives
# the boolean input mask.
out = layers.Dense(embedding_dimension, activation=gelu, kernel_initializer=initializer())(embeddings)
out = token_embedding_layer(out, transpose=True)
out = Bias()([out, input_mask])
# Softmax is left disabled: the compile cell uses from_logits=True.
#out = layers.Softmax()(out)

model = tf.keras.Model([inputs, input_mask], [out], name='model')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 1024)]       0                                            
__________________________________________________________________________________________________
token_embedding_5 (TokenEmbeddi multiple             2816        input_11[0][0]                   
                                                                 dense_5[0][0]                    
__________________________________________________________________________________________________
position_embedding_5 (PositionE (None, None, 128)    131200      input_11[0][0]                   
__________________________________________________________________________________________________
add_5 (Add)                     (None, 1024, 128)    0           token_embedding_5[0][0]      

In [64]:
# Run the model eagerly on one batch. NOTE: masked_seqs / input_masks come
# from the batch-fetch cell (In [48]) that appears further down in this
# notebook; on a fresh kernel that cell must be run first. This also rebinds
# `out`, which the model-building cell used for the symbolic output tensor.
out = model([masked_seqs, input_masks])

In [67]:
# Inspect the shape of the label tensor of the sampled batch.
true_values.shape

TensorShape([10, 1024, 1])

In [None]:
# (unfinished scratch cell -- it held only the incomplete expression `tf.`,
# which is a SyntaxError under Restart & Run All)

In [66]:
# Coordinates of the non-zero entries of true_values; the recorded output is
# identical to the explicit `true_values != 0` form used in the next cell.
tf.where(true_values)

<tf.Tensor: id=2863632, shape=(475, 3), dtype=int64, numpy=
array([[  0,   9,   0],
       [  0,  10,   0],
       [  0,  34,   0],
       ...,
       [  9, 494,   0],
       [  9, 502,   0],
       [  9, 505,   0]])>

In [63]:
# Coordinates of every entry of true_values that differs from 0.
tf.where(true_values != 0)

<tf.Tensor: id=2863327, shape=(475, 3), dtype=int64, numpy=
array([[  0,   9,   0],
       [  0,  10,   0],
       [  0,  34,   0],
       ...,
       [  9, 494,   0],
       [  9, 502,   0],
       [  9, 505,   0]])>

In [48]:
# Pull one batch from the training pipeline: ((token ids, boolean mask), labels).
# iter() builds a fresh iterator on every run of this cell.
(masked_seqs, input_masks), true_values = next(iter(training_data))

In [51]:
# Keep only the token ids at positions where input_masks is True, giving one
# variable-length row per sequence. In the recorded output most of these are
# id 4, the mask_index passed to create_masked_input_dataset.
tf.ragged.boolean_mask(masked_seqs, input_masks)

<tf.RaggedTensor [[4, 4, 4, 4, 4, 4, 10, 2, 4, 4, 4, 4, 4, 4, 4, 11, 4, 4, 4, 4, 4, 4, 18, 4, 4, 4, 4, 4, 4, 4, 4, 7, 4, 4, 4, 4, 4, 9, 4, 4], [4, 4, 4, 9, 19, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 13, 4, 4, 4, 4, 4, 4, 4, 4, 21, 4, 7, 4, 4, 4, 4, 4, 4, 4, 11, 13, 4, 13, 4, 12, 4, 4, 4, 4, 4, 4, 4, 4, 4, 17, 4, 4, 4], [17, 4, 7, 4, 4, 10, 4, 4, 4, 4, 4, 4, 4, 14, 4, 11, 4, 4, 4, 15, 4, 4, 4, 4, 4, 4, 4, 4, 4, 21, 4, 4, 9, 5, 4, 17, 4, 4, 4, 4], [4, 4, 17, 4, 4, 4, 4, 4, 12, 4, 4, 5, 4, 4, 4, 13, 4, 4, 4, 4, 4, 4, 5, 4, 8, 4, 4, 13, 4, 4, 4, 4, 4, 19, 4, 4, 4, 4, 10, 4, 4, 4, 4, 6, 4, 4, 4, 4, 20, 4, 4, 4, 4, 4, 4, 4, 4, 20, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 16, 10, 4, 18, 4, 4, 4, 4, 11, 4, 4, 4, 7, 4, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12, 4, 4, 4, 21, 4, 4, 4, 4, 4, 4, 4, 4, 18, 4, 11, 4, 4, 4, 7, 4, 18, 4, 12, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 13, 2, 4, 9, 4, 4, 4, 4, 9, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4, 4, 4, 4], [4, 4, 4, 4, 4, 9, 4, 4, 4, 4, 4, 4], [4, 20, 

In [44]:
# Display the current value bound to `out` (a symbolic Keras tensor in the
# recorded output).
out

<tf.Tensor 'bias_4/Identity:0' shape=(None, 1024, 22) dtype=float32>

In [42]:
from tensorflow.python.keras.metrics import MeanMetricWrapper
from tensorflow.python.keras.losses import LossFunctionWrapper
from tensorflow.python.keras.utils import losses_utils

class MaskedSparseCategoricalCrossentropy(LossFunctionWrapper):
    """Keras loss object wrapping ``masked_sparse_cross_entropy_loss``.

    The previous body was a copy of Keras' ``MeanAbsoluteError``: it referred
    to ``MeanAbsoluteError`` and ``mean_absolute_error``, neither of which is
    defined in this notebook, so instantiating the class raised ``NameError``.

    Args:
        reduction: reduction applied to the per-sample losses.
        name: name of the loss instance.
    """

    def __init__(self,
                 reduction=losses_utils.ReductionV2.AUTO,
                 name='masked_sparse_categorical_crossentropy'):
        # NOTE(review): assumes masked_sparse_cross_entropy_loss (imported
        # from bert.layers) has the usual (y_true, y_pred) loss signature --
        # confirm before using this class in model.compile.
        super(MaskedSparseCategoricalCrossentropy, self).__init__(
            masked_sparse_cross_entropy_loss, name=name, reduction=reduction)


def exponentiated_sparse_categorical_crossentropy(y_true, y_pred, **kwargs):
    """Return exp() of the sparse categorical cross-entropy of the inputs.

    Any extra keyword arguments are passed straight through to
    ``tf.losses.sparse_categorical_crossentropy``.
    """
    cross_entropy = tf.losses.sparse_categorical_crossentropy(
        y_true, y_pred, **kwargs)
    return tf.exp(cross_entropy)

class ExponentiatedSparseCategoricalCrossentropy(MeanMetricWrapper):
    """Metric built on ``MeanMetricWrapper`` around
    ``exponentiated_sparse_categorical_crossentropy``."""

    def __init__(self,
                 name='exponentiated_sparse_categorical_crossentropy',
                 dtype=None,
                 from_logits=False,
                 axis=-1):
        # dtype, from_logits and axis are handed to the wrapper as keyword
        # arguments, exactly as given by the caller.
        wrapper_kwargs = dict(dtype=dtype, from_logits=from_logits, axis=axis)
        super(ExponentiatedSparseCategoricalCrossentropy, self).__init__(
            exponentiated_sparse_categorical_crossentropy,
            name,
            **wrapper_kwargs)

In [40]:
# Optimizer, loss, metrics and callbacks for training.
learning_rate = 1E-4
warmup_updates = 300

# Plain Adam optimizer. NOTE(review): an earlier comment here mentioned
# Horovod's DistributedOptimizer, but no Horovod wrapper is applied in this cell.
opt = tf.optimizers.Adam(learning_rate=learning_rate)

# The model emits logits (its Softmax layer is commented out), hence
# from_logits=True for both the loss and the custom metric.
# NOTE(review): this is the stock, unmasked cross-entropy; whether positions
# whose label is 0 are excluded depends on mask propagation inside
# bert.layers -- confirm, or use masked_sparse_cross_entropy_loss.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(),
             ExponentiatedSparseCategoricalCrossentropy(from_logits=True)],
    optimizer=opt)

# Learning-rate schedule callback configured with the same base rate as the
# optimizer and a 300-update warmup (schedule semantics live in bert.layers).
callbacks = [
    InverseSquareRootSchedule(learning_rate=learning_rate, warmup_updates=warmup_updates),
]


In [41]:
# Train for 3 epochs of 1000 steps, validating on 10 batches per epoch.
# NOTE: in the recorded run this call stopped with a _SymbolicException
# ("Inputs to eager execution function cannot be Keras symbolic tensors")
# naming the boolean mask Input.
model.fit(training_data, steps_per_epoch=1000, epochs=3, verbose=1,
          validation_data=valid_data, validation_steps=10,
          callbacks=callbacks)

Train for 1000 steps, validate for 10 steps
Epoch 1/3


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


   1/1000 [..............................] - ETA: 24:55

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


_SymbolicException: Inputs to eager execution function cannot be Keras symbolic tensors, but found [<tf.Tensor 'input_8:0' shape=(None, 1024) dtype=bool>]

In [None]:
# Pull one validation batch. NOTE: this rebinds the global name `input_mask`,
# which the model-building cell used for the symbolic Keras Input, to a
# concrete batch tensor (the training-batch cell uses `input_masks` instead).
(masked_seqs, input_mask), true_values = next(iter(valid_data))

In [None]:
# Forward pass on the validation batch. Despite the name, the model returns
# logits (its Softmax layer is commented out), not class ids.
predicted_classes = model([masked_seqs, input_mask])

In [None]:
# Inspect the Keras mask attached to the model output (private attribute).
predicted_classes._keras_mask

In [None]:
# Cross-entropy from logits, summed along axis 1 of the result (one total per
# sequence, given model outputs shaped (batch, sequence, vocab)).
tf.reduce_sum(tf.losses.sparse_categorical_crossentropy(true_values, predicted_classes, from_logits=True), 1)

In [None]:
# Shape check of the outputs selected by the boolean mask.
# NOTE(review): predicted_classes holds logits, so the log itself is NaN
# wherever a logit is negative; only the shape is meaningful here.
tf.math.log(predicted_classes[input_mask]).shape

In [None]:
# Log-probability the model assigns to the true token at every position:
# one-hot labels pick the matching entry of the log-softmax over the vocab axis.
# - squeeze only the trailing label axis, so a batch of size 1 keeps its
#   batch dimension;
# - log_softmax replaces log(softmax(x)), which underflows to log(0) for very
#   negative logits.
tf.reduce_sum((tf.one_hot(tf.squeeze(true_values, axis=-1), vocab_size) *
               tf.nn.log_softmax(predicted_classes, axis=-1)), axis=-1)

In [None]:
# Save the model to an HDF5 file (relative path, i.e. the current working
# directory).
model.save('test_model.h5')

In [None]:
# Replace `model` with a checkpoint loaded from disk. The custom layers and
# the gelu activation from bert.layers are registered through custom_objects
# so Keras can rebuild them by name.
# NOTE: this rebinds `model`; everything below uses the loaded checkpoint,
# not the model built earlier in this notebook.
model = tf.keras.models.load_model(
    '/scratch/pstjohn/albert_debug_checkpoints/ckpt_6.h5',
    custom_objects={
        'PositionEmbedding': PositionEmbedding,
        'TokenEmbedding': TokenEmbedding,
        'Attention': Attention,
        'Transformer': Transformer,
        'Projection': Projection,
        'Bias': Bias,
        'gelu': gelu,
    })

# true_labels = layers.Input(shape=(None,), dtype=tf.int32, batch_size=None)
# model.compile(loss=masked_sparse_cross_entropy_loss, target_tensors=true_labels,
#               optimizer=tfa.optimizers.AdamW(weight_decay=0.01, learning_rate=1E-3))

In [None]:
# Continue training the loaded checkpoint: 3 epochs of 100 steps, validating
# on 10 batches per epoch.
# NOTE(review): the compile call in the checkpoint-loading cell is commented
# out, so this relies on the checkpoint restoring its own compile state --
# confirm.
model.fit(training_data, steps_per_epoch=100, epochs=3, verbose=1,
          validation_data=valid_data, validation_steps=10)

In [None]:
# eval_data = valid_data.map(sp_encode_tf, num_parallel_calls=tf.data.experimental.AUTOTUNE).padded_batch(60, padded_shapes=([512],))
# eval_encoded = next(iter(eval_data))

In [None]:
# Predict on the first 3 elements of `encoded_data`.
# NOTE(review): `encoded_data` is not defined in the visible part of this
# notebook (the eval-data cell just above is commented out), so this fails on
# a fresh kernel unless it is created elsewhere.
bert_predict = model.predict(encoded_data.take(3), verbose=1)

In [None]:
# Inspect the shape of the prediction result.
bert_predict.shape