In [42]:
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [43]:
from bert.dataset import create_masked_input_dataset

In [44]:
vocab_size = 22
max_seq_len = 256

from bert.dataset import create_masked_input_dataset

training_data = create_masked_input_dataset(
    sequence_path='../uniparc_data/sequences_train.txt',
    max_sequence_length=max_seq_len,
    batch_size=4,
    masking_freq=.05)

training_data.repeat().prefetch(tf.data.experimental.AUTOTUNE)

valid_data = create_masked_input_dataset(
    sequence_path='../uniparc_data/sequences_valid.txt',
    max_sequence_length=max_seq_len,
    batch_size=4,
    masking_freq=.05)

In [45]:
masked_seqs, true_values = next(iter(valid_data))

In [46]:
masked_seqs[true_values != 0]

<tf.Tensor: id=111797, shape=(34,), dtype=int32, numpy=
array([ 1,  1,  1,  1,  1,  1,  1,  1,  7,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1, 18,  1,  1,  1,  1,  2,  1, 14,  1,  1, 19,  1,  6, 10],
      dtype=int32)>

## BERT layers

In [47]:
from tensorflow.keras import layers

from bert.layers import (Attention, Transformer,
                         gelu, initializer, Projection, DenseNoMask)

In [48]:
embedding_dimension = 128
max_embedding_sequence_length = 1024
model_dimension = 128
transformer_dimension = 4 * model_dimension
num_attention_heads = model_dimension // 64
num_transformer_layers = 1
dropout_rate = 0.

inputs = layers.Input(shape=(None,), dtype=tf.int32, batch_size=None)

# Amino-acid level embeddings
embeddings = layers.Embedding(
    vocab_size, model_dimension, embeddings_initializer=initializer(),
    mask_zero=True)(inputs)

# Initialize transformer, use ALBERT-style weight sharing
transformer = Transformer(num_attention_heads, transformer_dimension,
                          attention_type='relative', max_relative_position=10,
                          dropout=dropout_rate)

# Stack transformers together
for i in range(num_transformer_layers):
    embeddings = transformer(embeddings)

# Project back to original embedding dimension
out = DenseNoMask(vocab_size, activation=gelu,
                  kernel_initializer=initializer())(embeddings)

model = tf.keras.Model(inputs, out, name='model')
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_5 (Embedding)      (None, None, 128)         2816      
_________________________________________________________________
transformer_3 (Transformer)  (None, None, 128)         199616    
_________________________________________________________________
dense_no_mask_3 (DenseNoMask (None, None, 22)          2838      
Total params: 205,270
Trainable params: 205,270
Non-trainable params: 0
_________________________________________________________________


In [49]:
model(masked_seqs).shape

TensorShape([4, 256, 22])

tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2

In [56]:
tf.optimizers.Optimizer

tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2

In [65]:
import re

class AdamWeightDecayOptimizer(tf.compat.v1.train.Optimizer):
  """A basic Adam optimizer that includes "correct" L2 weight decay."""

  def __init__(self,
               learning_rate,
               weight_decay_rate=0.0,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=1e-6,
               exclude_from_weight_decay=None,
               name="AdamWeightDecayOptimizer"):
    """Constructs a AdamWeightDecayOptimizer."""
    super(AdamWeightDecayOptimizer, self).__init__(False, name)

    self.learning_rate = learning_rate
    self.weight_decay_rate = weight_decay_rate
    self.beta_1 = beta_1
    self.beta_2 = beta_2
    self.epsilon = epsilon
    self.exclude_from_weight_decay = exclude_from_weight_decay

  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """See base class."""
    assignments = []
    for (grad, param) in grads_and_vars:
      if grad is None or param is None:
        continue

      param_name = self._get_variable_name(param.name)

      m = tf.get_variable(
          name=param_name + "/adam_m",
          shape=param.shape.as_list(),
          dtype=tf.float32,
          trainable=False,
          initializer=tf.zeros_initializer())
      v = tf.get_variable(
          name=param_name + "/adam_v",
          shape=param.shape.as_list(),
          dtype=tf.float32,
          trainable=False,
          initializer=tf.zeros_initializer())

      # Standard Adam update.
      next_m = (
          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
      next_v = (
          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
                                                    tf.square(grad)))

      update = next_m / (tf.sqrt(next_v) + self.epsilon)

      # Just adding the square of the weights to the loss function is *not*
      # the correct way of using L2 regularization/weight decay with Adam,
      # since that will interact with the m and v parameters in strange ways.
      #
      # Instead we want ot decay the weights in a manner that doesn't interact
      # with the m/v parameters. This is equivalent to adding the square
      # of the weights to the loss with plain (non-momentum) SGD.
      if self._do_use_weight_decay(param_name):
        update += self.weight_decay_rate * param

      update_with_lr = self.learning_rate * update

      next_param = param - update_with_lr

      assignments.extend(
          [param.assign(next_param),
           m.assign(next_m),
           v.assign(next_v)])
    return tf.group(*assignments, name=name)

  def _do_use_weight_decay(self, param_name):
    """Whether to use L2 weight decay for `param_name`."""
    if not self.weight_decay_rate:
      return False
    if self.exclude_from_weight_decay:
      for r in self.exclude_from_weight_decay:
        if re.search(r, param_name) is not None:
          return False
    return True

  def _get_variable_name(self, param_name):
    """Get the variable name from the tensor name."""
    m = re.match("^(.*):\\d+$", param_name)
    if m is not None:
      param_name = m.group(1)
    return param_name

In [66]:
  optimizer = AdamWeightDecayOptimizer(
      learning_rate=1E-4,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

In [67]:
from bert.optimizers import ECE, masked_sparse_categorical_crossentropy, BertLinearSchedule
    
# opt = tfa.optimizers.AdamW(learning_rate=1E-4, weight_decay=0.001)

true_labels = tf.keras.layers.Input(
    shape=(None,), dtype=tf.int32, batch_size=None)

model.compile(
    target_tensors=true_labels,    
    loss=masked_sparse_categorical_crossentropy,
    metrics=[ECE],
    optimizer=optimizer,
    experimental_run_tf_function=True)

In [68]:
model.fit(training_data, steps_per_epoch=1000, epochs=5,
          verbose=1, validation_data=valid_data, validation_steps=100,
          callbacks=[])

Epoch 1/5


AttributeError: module 'tensorflow' has no attribute 'get_variable'