In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context='talk', style='ticks',
        color_codes=True, rc={'legend.frameon': False})

%matplotlib inline

In [2]:
import tensorflow as tf
# Record the TensorFlow version and the number of visible GPUs alongside the
# notebook outputs, so later results can be tied to the environment.
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

2.0.0
Num GPUs Available:  1


In [36]:
def create_masked_input_dataset(sequence_path,
                                max_sequence_length=512,
                                limit=None,
                                batch_size=20,
                                buffer_size=1024,
                                vocab_size=32000,
                                mask_index=4,
                                vocab_start=5,
                                fix_sequence_length=False,
                                masking_freq=.15,
                                mask_token_freq=.8,
                                mask_random_freq=.1,
                                filter_bzux=True,
                                shard_num_workers=None,
                                shard_worker_index=None):
    """Build a batched tf.data pipeline of BERT-style masked sequences.

    Each line of ``sequence_path`` is treated as one sequence of single-letter
    residue codes. Lines are encoded to integer ids, randomly masked, and
    padded into batches of ``(masked_tensor, true_tensor)`` pairs.

    Parameters
    ----------
    sequence_path : path(s) passed to ``tf.data.TextLineDataset``.
    max_sequence_length : longer lines are randomly cropped to this length.
    limit : if truthy, only the first ``limit`` lines are used.
    batch_size : number of sequences per padded batch.
    buffer_size : currently unused (shuffling is disabled); kept so existing
        callers keep working.
    vocab_size : exclusive upper bound for random replacement tokens.
    mask_index : id written where the [MASK] token is applied.
    vocab_start : inclusive lower bound for random replacement tokens.
    fix_sequence_length : pad every batch to ``max_sequence_length`` instead
        of to the longest sequence in the batch.
    masking_freq : fraction of tokens selected as prediction targets.
    mask_token_freq : fraction of selected tokens replaced by ``mask_index``.
    mask_random_freq : fraction of selected tokens replaced by a random id.
    filter_bzux : drop lines containing any of the letters B, Z, U or X.
    shard_num_workers, shard_worker_index : if ``shard_num_workers`` is
        truthy, the dataset is sharded with these arguments.

    Returns
    -------
    A ``tf.data.Dataset`` of ``(masked_tensor, true_tensor)`` int32 batches.

    NOTE(review): the built-in residue encoding uses ids 2..23, so the default
    ``mask_index=4`` / ``vocab_start=5`` / ``vocab_size=32000`` do not match it;
    the notebook passes ``mask_index=1, vocab_start=2`` explicitly. A letter
    outside the vocabulary raises ``KeyError`` inside ``encode``.
    """

    # Built once here rather than on every `encode` call. Ids 0 and 1 are left
    # free (0 is used as the pad / "not a target" value below).
    vocab = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K',
             'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V',
             'W', 'Y', 'U', 'O']
    replacement_dict = {key: i + 2 for i, key in enumerate(vocab)}

    def encode(line_tensor):
        """Decode one text line and map its characters to int32 token ids."""
        line = line_tensor.numpy().decode('utf8')

        # Take a random window when the line is longer than allowed.
        if len(line) > max_sequence_length:
            offset = np.random.randint(
                low=0, high=len(line) - max_sequence_length + 1)
            line = line[offset:(offset + max_sequence_length)]

        # An explicit dtype keeps empty lines from producing a float64 array,
        # which would not match the int32 output declared in `encode_tf`.
        return np.asarray([replacement_dict[item] for item in line],
                          dtype=np.int32)

    def encode_tf(line_tensor):
        """Graph-mode wrapper around the eager `encode` function."""
        return tf.py_function(encode, inp=[line_tensor], Tout=[tf.int32,])

    def mask_input(input_tensor):
        """Randomly mask one encoded sequence following the BERT recipe.

        A fraction ``masking_freq`` of the tokens is selected as prediction
        targets. Of those, ``mask_token_freq`` receive ``mask_index``,
        ``mask_random_freq`` receive a random id drawn from
        ``[vocab_start, vocab_size)``, and the rest are left unchanged.

        Returns
        -------
        masked_tensor : (seq_length,)
            Input with the selected positions masked or randomized.
        true_tensor : (seq_length,)
            Original ids at the selected positions, zero everywhere else.
        """

        mask_score = tf.random.uniform(input_tensor.shape, maxval=1, dtype=tf.float32)
        input_mask = mask_score < masking_freq

        # Lowest scores among the selected positions get the [MASK] token.
        # Thresholds scale with `masking_freq` (previously hard-coded to 0.15).
        mask_mask = mask_score <= masking_freq * mask_token_freq

        # Highest scores among the selected positions get a random token.
        mask_random = (mask_score >= masking_freq * (1. - mask_random_freq)) & input_mask

        # Tensors to replace with where input is masked or randomized
        mask_value_tensor = tf.ones(input_tensor.shape, dtype=tf.int32) * mask_index
        random_value_tensor = tf.random.uniform(
            input_tensor.shape, minval=vocab_start, maxval=vocab_size, dtype=tf.int32)
        pad_value_tensor = tf.zeros(input_tensor.shape, dtype=tf.int32)

        # Use the replacements to mask the input tensor
        masked_tensor = tf.where(mask_mask, mask_value_tensor, input_tensor)
        masked_tensor = tf.where(mask_random, random_value_tensor, masked_tensor)

        # Set true values to zero (pad value) where not selected
        true_tensor = tf.where(input_mask, input_tensor, pad_value_tensor)

        return masked_tensor, true_tensor

    def mask_input_tf(input_tensor):
        """Graph-mode wrapper around the eager `mask_input` function."""
        a, c = tf.py_function(mask_input, inp=[input_tensor],
                              Tout=[tf.int32, tf.int32])
        return (a, c)

    dataset = tf.data.TextLineDataset(sequence_path)
    if limit:
        dataset = dataset.take(limit)

    if shard_num_workers:
        dataset = dataset.shard(shard_num_workers, shard_worker_index)

    if filter_bzux:
        bzux_filter = lambda string: tf.math.logical_not(
            tf.strings.regex_full_match(string, '.*[BZUX].*'))
        dataset = dataset.filter(bzux_filter)

    encoded_data = dataset\
        .map(encode_tf, num_parallel_calls=tf.data.experimental.AUTOTUNE)\
        .map(mask_input_tf, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # -1 pads each batch to its longest sequence; otherwise pad to a fixed size.
    tf_seq_len = max_sequence_length if fix_sequence_length else -1

    # Shuffling is currently disabled, so `buffer_size` is not used:
    #        .shuffle(buffer_size=buffer_size)
    encoded_data = encoded_data\
        .padded_batch(batch_size, padded_shapes=(
            ([tf_seq_len], [tf_seq_len])))

    return encoded_data

In [37]:
max_seq_len = 1024
vocab_size = 22

# Settings shared by the training and validation pipelines.
dataset_kwargs = dict(
    max_sequence_length=max_seq_len,
    batch_size=10,
    buffer_size=1024,
    vocab_size=vocab_size,
    mask_index=1,
    vocab_start=2,
    fix_sequence_length=True)

training_data = create_masked_input_dataset(
    sequence_path='/projects/bpms/pstjohn/uniparc/sequences_train.txt',
    **dataset_kwargs)

# tf.data transformations return a NEW dataset and leave the original
# untouched, so the result has to be assigned back; otherwise the training
# data is neither repeated nor prefetched.
training_data = training_data.repeat().prefetch(tf.data.experimental.AUTOTUNE)

valid_data = create_masked_input_dataset(
    sequence_path='/projects/bpms/pstjohn/uniparc/sequences_valid.txt',
    **dataset_kwargs)

valid_data = valid_data.prefetch(tf.data.experimental.AUTOTUNE)

# Display the final dataset spec as the cell output.
valid_data

<PrefetchDataset shapes: ((None, 1024), (None, 1024)), types: (tf.int32, tf.int32)>

In [38]:
# Pull a single validation batch to use as a fixture for the interactive
# checks in the following cells. Masking is random, so each run differs.
masked_seqs, true_values = next(iter(valid_data))

In [39]:
true_values.shape

TensorShape([10, 1024])

## BERT layers development

In [40]:
from tensorflow.keras import layers
import tensorflow.keras.backend as K

class Bias(layers.Layer):
    """Trainable bias vector added to the logits prior to softmax scoring.

    The layer holds one zero-initialized weight per entry of the last input
    axis and adds it to the incoming tensor. It does nothing else: no
    masking of prediction targets happens in this class.
    """

    def build(self, input_shape):
        # One bias term for each entry along the final (logit) axis.
        num_outputs = input_shape[-1]
        self.bias = self.add_weight(
            name='classifier_bias',
            shape=[num_outputs],
            dtype=K.floatx(),
            initializer=tf.zeros_initializer())

    def call(self, inputs):
        return tf.nn.bias_add(inputs, self.bias)

In [41]:
from bert.layers import (PositionEmbedding, Attention, Transformer, TokenEmbedding,
                         Projection, gelu, InverseSquareRootSchedule,
                         initializer)

# `vocab_size` and `max_seq_len` are taken from the dataset setup cell above:
# vocab_size = 22
# max_seq_len = 512
embedding_dimension = 32
model_dimension = 64
transformer_ff_dimension = model_dimension * 4
num_attention_heads = model_dimension // 16
num_transformer_layers = 4

dropout_rate = 0.1

# NOTE(review): these two values are not used in this cell; `learning_rate`
# is reassigned in the compile cell further down, and no warmup schedule is
# built in the visible cells. The original Horovod remark looks stale.
learning_rate = 1E-4
warmup_updates = 3000


# Integer token ids; id 0 is treated as padding by the embedding layers
# (mask_zero=True).
inputs = layers.Input(shape=(max_seq_len,), dtype=tf.int32, batch_size=None)

# The token embedding layer is kept in a variable because it is called a
# second time below (transpose=True) to produce the output scores.
token_embedding_layer = TokenEmbedding(
    vocab_size, embedding_dimension, embeddings_initializer=initializer(), mask_zero=True)
token_embeddings = token_embedding_layer(inputs)
position_embeddings = PositionEmbedding(
    max_seq_len + 1, embedding_dimension, embeddings_initializer=initializer(),
    mask_zero=True)(inputs)

# Sum token and position embeddings, then project from the embedding width
# up to the model width.
embeddings = layers.Add()([token_embeddings, position_embeddings])
embeddings = Projection(model_dimension, use_residual=False)(embeddings)

# A single Transformer instance is applied repeatedly, so every pass reuses
# the same layer object (and therefore the same weights).
# NOTE(review): presumably intentional ALBERT-style sharing -- confirm.
transformer = Transformer(num_attention_heads, transformer_ff_dimension, dropout=dropout_rate)
for i in range(num_transformer_layers):
    embeddings = transformer(embeddings)

# Project back to the embedding width, then reuse the token embedding layer
# in transposed mode to score every vocabulary entry; Bias adds a per-class
# offset. The summary output reports 22 scores per position.
out = layers.Dense(embedding_dimension, activation=gelu, kernel_initializer=initializer())(embeddings)
out = token_embedding_layer(out, transpose=True)
out = Bias()(out)

model = tf.keras.Model(inputs, out, name='model')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 1024)]       0                                            
__________________________________________________________________________________________________
token_embedding_2 (TokenEmbeddi multiple             704         input_3[0][0]                    
                                                                 dense_2[0][0]                    
__________________________________________________________________________________________________
position_embedding_2 (PositionE (None, None, 32)     32800       input_3[0][0]                    
__________________________________________________________________________________________________
add_2 (Add)                     (None, 1024, 32)     0           token_embedding_2[0][0]      

In [42]:
# Rebinds `out` from the symbolic Keras output used to build the model to
# the eager model output for one validation batch.
out = model(masked_seqs)

In [43]:
out.shape

TensorShape([10, 1024, 22])

In [44]:
# Aliases matching the (y_true, y_pred) argument names of the loss function,
# used for the interactive check of the loss below.
y_true = true_values
y_pred = out

In [45]:
def masked_sparse_categorical_crossentropy(y_true, y_pred):
    """Mean sparse categorical cross-entropy over the non-zero targets only.

    Parameters
    ----------
    y_true : integer targets; entries equal to 0 are ignored.
    y_pred : unnormalized scores (logits) with one trailing class axis.

    Returns
    -------
    Scalar tensor with the mean per-token loss over the positions where
    ``y_true != 0``, or 0 when no such position exists.
    """

    masked_entries = tf.not_equal(y_true, 0)
    y_true_mask = tf.boolean_mask(y_true, masked_entries)
    y_pred_mask = tf.boolean_mask(y_pred, masked_entries)

    token_losses = tf.losses.sparse_categorical_crossentropy(
        y_true_mask, y_pred_mask, from_logits=True)

    # A plain reduce_mean over zero selected tokens is NaN, which would
    # poison the weights during training; divide_no_nan yields 0 instead and
    # is otherwise the same sum / count.
    num_selected = tf.cast(tf.size(token_losses), token_losses.dtype)
    return tf.math.divide_no_nan(tf.reduce_sum(token_losses), num_selected)

In [46]:
masked_sparse_categorical_crossentropy(y_true, y_pred)

<tf.Tensor: id=146408, shape=(), dtype=float32, numpy=3.092189>

In [47]:
def ECE(y_true, y_pred):
    """Exponentiated masked cross-entropy, usable as a Keras metric.

    Takes the same (y_true, y_pred) arguments as
    `masked_sparse_categorical_crossentropy` and returns exp() of its value.
    """
    cross_entropy = masked_sparse_categorical_crossentropy(y_true, y_pred)
    return tf.exp(cross_entropy)

In [48]:
# from tensorflow.python.keras.losses import LossFunctionWrapper
# from tensorflow.python.keras.utils import losses_utils

# class MaskedSparseCategoricalCrossentropy(LossFunctionWrapper):
#     def __init__(self,
#                  axis=-1,
#                  reduction=losses_utils.ReductionV2.AUTO,
#                  name='masked_sparse_categorical_crossentropy'):
        
#         super(MaskedSparseCategoricalCrossentropy, self).__init__(
#             masked_sparse_categorical_crossentropy, reduction=reduction, name=name, axis=axis)      

In [49]:
# from tensorflow.python.keras.metrics import MeanMetricWrapper

# def exponentiated_sparse_categorical_crossentropy(*args, **kwargs):
#     return tf.exp(tf.losses.sparse_categorical_crossentropy(*args, **kwargs))

# class ExponentiatedSparseCategoricalCrossentropy(MeanMetricWrapper):
#     def __init__(self,
#                  name='exponentiated_sparse_categorical_crossentropy',
#                  dtype=None,
#                  from_logits=False,
#                  axis=-1):
        
#         super(ExponentiatedSparseCategoricalCrossentropy, self).__init__(
#             exponentiated_sparse_categorical_crossentropy,
#             name,
#             dtype=dtype,
#             from_logits=from_logits,
#             axis=axis)

In [50]:
# import tensorflow_addons as tfa
# Overrides the learning rate set in the model-construction cell.
learning_rate = 1E-3

# Plain Adam optimizer. (An earlier comment mentioned Horovod's
# DistributedOptimizer, but nothing in this cell wraps the optimizer.)
opt = tf.optimizers.Adam(learning_rate=learning_rate)
#true_labels = layers.Input(shape=(max_seq_len,), dtype=tf.int32, batch_size=None)

# Train on the masked cross-entropy and report its exponential (ECE).
model.compile(
    loss=masked_sparse_categorical_crossentropy,
    metrics=[ECE],
#    target_tensors=true_labels,
#     metrics=[tf.keras.metrics.SparseCategoricalAccuracy(),
#              ExponentiatedSparseCategoricalCrossentropy(from_logits=True)],
    optimizer=opt)

# model.compile(
#     loss=tf.keras.losses.SparseCategoricalCrossentropy(
#         from_logits=True, reduction=tf.keras.losses.Reduction.NONE),
#     metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
#     optimizer=opt)

In [51]:
# Short smoke-test run: 3 epochs of 100 steps, i.e. 300 training batches.
# NOTE(review): `training_data` must supply at least that many batches
# (or be repeated) -- confirm. The recorded run was interrupted manually.
model.fit(training_data, steps_per_epoch=100, epochs=3, verbose=1,
          validation_data=valid_data, validation_steps=10)

Train for 100 steps, validate for 10 steps
Epoch 1/3


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




KeyboardInterrupt: 

In [None]:
# NOTE(review): presumably converts a cross-entropy value copied from a
# training log into its exponential (the ECE metric); the constant is not
# produced by any visible cell -- confirm its origin.
np.exp(2.6585)

In [None]:
# The dataset yields (masked_seqs, true_values) pairs, not a nested
# ((masked_seqs, input_mask), true_values) structure, so unpack two items.
masked_seqs, true_values = next(iter(valid_data))

# Targets are zero everywhere except at the positions selected for
# prediction, so the boolean input mask can be recovered from them.
input_mask = tf.not_equal(true_values, 0)

In [None]:
# The model defined above has a single input (the masked token ids), so it
# is called with that tensor alone rather than a [tokens, mask] list.
# The result holds unnormalized scores (logits), not probabilities.
predicted_classes = model(masked_seqs)

In [None]:
predicted_classes._keras_mask

In [None]:
# Per-sequence summed cross-entropy over ALL positions.
# NOTE(review): unlike masked_sparse_categorical_crossentropy, this also
# scores positions whose target is 0 (not selected / padding) -- confirm
# that is intended for this check.
tf.reduce_sum(tf.losses.sparse_categorical_crossentropy(true_values, predicted_classes, from_logits=True), 1)

In [None]:
# Shape check of the outputs selected by the boolean mask.
# NOTE(review): the loss in this notebook treats model outputs as logits
# (from_logits=True); taking log() of raw logits is only meaningful if
# `predicted_classes` holds probabilities -- confirm.
tf.math.log(predicted_classes[input_mask]).shape

In [None]:
# Manual per-position log-likelihood of the true class: one-hot targets
# times log-softmax of the outputs, summed over the class axis.
# NOTE(review): tf.squeeze would also drop a batch axis of size 1 -- confirm
# the targets never have one here.
tf.reduce_sum((tf.one_hot(tf.squeeze(true_values), vocab_size) * 
               tf.math.log(tf.nn.softmax(predicted_classes))), axis=-1)

In [None]:
model.save('test_model.h5')

In [None]:
# Replace `model` with a checkpoint saved by a separate training run. The
# custom layers and activation must be listed so Keras can deserialize them.
# NOTE(review): if the checkpoint was saved with a custom loss or metric
# (e.g. masked_sparse_categorical_crossentropy / ECE), those presumably need
# to be added to `custom_objects` as well -- confirm against the script that
# wrote the checkpoint.
model = tf.keras.models.load_model(
    '/scratch/pstjohn/albert_debug_checkpoints/ckpt_6.h5',
    custom_objects={
        'PositionEmbedding': PositionEmbedding,
        'TokenEmbedding': TokenEmbedding,
        'Attention': Attention,
        'Transformer': Transformer,
        'Projection': Projection,
        'Bias': Bias,
        'gelu': gelu,
    })

# true_labels = layers.Input(shape=(None,), dtype=tf.int32, batch_size=None)
# model.compile(loss=masked_sparse_cross_entropy_loss, target_tensors=true_labels,
#               optimizer=tfa.optimizers.AdamW(weight_decay=0.01, learning_rate=1E-3))

In [None]:
# Continue training the model loaded from the checkpoint above.
# NOTE(review): the compile call in the preceding cell is commented out, so
# this relies on the loss/optimizer state stored in the checkpoint -- confirm.
model.fit(training_data, steps_per_epoch=100, epochs=3, verbose=1,
          validation_data=valid_data, validation_steps=10)

In [None]:
# eval_data = valid_data.map(sp_encode_tf, num_parallel_calls=tf.data.experimental.AUTOTUNE).padded_batch(60, padded_shapes=([512],))
# eval_encoded = next(iter(eval_data))

In [None]:
# NOTE(review): `encoded_data` is not defined at notebook scope in any
# visible cell (it is a local variable inside create_masked_input_dataset),
# so this fails on a fresh kernel. Presumably a dataset such as `valid_data`
# was intended -- confirm.
bert_predict = model.predict(encoded_data.take(3), verbose=1)

In [None]:
bert_predict.shape