In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa

In [2]:
# Discover the visible GPUs and switch each one to on-demand memory
# allocation, so TensorFlow does not reserve all device memory at start-up.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
num_gpus = len(gpu_devices)
print("Num GPUs Available: ", num_gpus)

# Memory growth has to be configured per physical device.
for gpu in gpu_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

Num GPUs Available:  1


In [3]:
from bert.dataset import create_masked_input_dataset

In [4]:
# Shell escape: record the driver version, GPU model and current memory /
# utilization figures alongside the run.
!nvidia-smi

Fri Jan 17 10:50:18 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.26       Driver Version: 430.26       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Quadro GV100        Off  | 00000000:37:00.0 Off |                  Off |
| 31%   43C    P0    36W / 250W |    119MiB / 32508MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|    0  

## BERT layers

In [5]:
from tensorflow.keras import layers

from bert.layers import (PositionEmbedding, Transformer, TokenEmbedding, Bias,
                         gelu, initializer, Projection)

In [18]:
# --- Hyperparameters --------------------------------------------------------
# Number of distinct token ids handled by the embedding layer.
vocab_size = 22
# Width of the token embedding table; smaller than the model width, so the
# embeddings are projected up to `model_dimension` below.
embedding_dimension = 32
model_dimension = 128
# Four times the model width. Passed as the second positional argument of
# Transformer -- presumably the feed-forward inner size; TODO confirm against
# bert.layers.
transformer_dimension = 4 * model_dimension
# Evaluates to 2 for model_dimension = 128 (presumably 64 dims per head --
# TODO confirm).
num_attention_heads = model_dimension // 64
# Number of times the transformer block is applied (see the loop below).
num_transformer_layers = 4
# 0.0 is forwarded to both Projection and Transformer.
dropout_rate = 0.

# --- Graph construction -----------------------------------------------------
# Integer token ids; both the batch size and the sequence length are left
# unspecified.
inputs = layers.Input(shape=(None,), dtype=tf.int32, batch_size=None)

# The same embedding layer instance is used twice: here to embed the inputs,
# and again near the end with `transpose=True` to produce the outputs.
# NOTE(review): mask_zero=True presumably treats token id 0 as padding, as in
# keras.layers.Embedding -- confirm against bert.layers.TokenEmbedding.
token_embedding_layer = TokenEmbedding(
    vocab_size, embedding_dimension, embeddings_initializer=initializer(),
    mask_zero=True)

embeddings = token_embedding_layer(inputs)

# Project from embedding_dimension up to model_dimension.
embeddings = Projection(model_dimension, dropout_rate,
                        use_residual=False)(embeddings)

# A single Transformer layer object, configured for relative attention.
transformer = Transformer(num_attention_heads, transformer_dimension,
                          dropout=dropout_rate, attention_type='relative',
                          max_relative_position=64)

# NOTE(review): the *same* `transformer` instance is called on every pass, so
# (assuming it is a standard Keras layer) all num_transformer_layers passes
# share one set of weights. This looks like deliberate ALBERT-style parameter
# sharing -- confirm; independent layers would need one instance per pass.
for i in range(num_transformer_layers):
    embeddings = transformer(embeddings)

# Output head: map back down to embedding_dimension with a GELU dense layer,
# then reuse the token embedding layer in transposed mode and add a bias.
# NOTE(review): the transposed call presumably yields per-token scores over
# the vocabulary (tied input/output embeddings) -- confirm.
out = layers.Dense(embedding_dimension, activation=gelu,
                   kernel_initializer=initializer())(embeddings)
out = token_embedding_layer(out, transpose=True)
out = Bias()(out)

model = tf.keras.Model(inputs, out, name='model')

In [19]:
from bert.optimizers import ECE, masked_sparse_categorical_crossentropy, BertLinearSchedule
    
# Single source for the 1E-4 value used by both the optimizer and the
# schedule callback.
learning_rate = 1E-4

# AdamW with weight decay switched off.
opt = tfa.optimizers.AdamW(weight_decay=0.0, learning_rate=learning_rate)

# Placeholder for the integer target ids; batch size and sequence length are
# left unspecified, matching the model input.
true_labels = tf.keras.layers.Input(
    batch_size=None,
    shape=(None,),
    dtype=tf.int32)

model.compile(
    optimizer=opt,
    loss=masked_sparse_categorical_crossentropy,
    metrics=[ECE],
    target_tensors=true_labels,
    experimental_run_tf_function=True)

# NOTE(review): the positional arguments are presumably (peak learning rate,
# warm-up steps, total steps) -- confirm against bert.optimizers.
lr_schedule = BertLinearSchedule(learning_rate, 100, 10_000_000)
callbacks = [lr_schedule]

In [20]:
from bert.dataset import create_masked_input_dataset

# Settings shared by the training and validation pipelines of the
# short-sequence (128 token) run.
dataset_kwargs = dict(
    max_sequence_length=128,
    batch_size=64,
    masking_freq=.05,
)

# Each pipeline is repeated indefinitely and prefetched with an autotuned
# buffer size.
training_data = (
    create_masked_input_dataset(
        sequence_path='../uniparc_data/sequences_train.txt', **dataset_kwargs)
    .repeat()
    .prefetch(tf.data.experimental.AUTOTUNE)
)

valid_data = (
    create_masked_input_dataset(
        sequence_path='../uniparc_data/sequences_valid.txt', **dataset_kwargs)
    .repeat()
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [21]:
# Train on the short-sequence data: 5 epochs of 1000 steps, each followed by
# 100 validation steps. The datasets repeat indefinitely, so the step counts
# are what bound each epoch.
model.fit(
    x=training_data,
    validation_data=valid_data,
    epochs=5,
    steps_per_epoch=1000,
    validation_steps=100,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5
 179/1000 [====>.........................] - ETA: 1:39 - loss: 2.7610 - ECE: 15.8247

KeyboardInterrupt: 

In [10]:
# Settings shared by the training and validation pipelines of the
# long-sequence (512 token) run; the batch size is reduced to 4.
large_dataset_kwargs = dict(
    max_sequence_length=512,
    batch_size=4,
    masking_freq=.05,
)

# Each pipeline is repeated indefinitely and prefetched with an autotuned
# buffer size.
training_data_large = (
    create_masked_input_dataset(
        sequence_path='../uniparc_data/sequences_train.txt',
        **large_dataset_kwargs)
    .repeat()
    .prefetch(tf.data.experimental.AUTOTUNE)
)

valid_data_large = (
    create_masked_input_dataset(
        sequence_path='../uniparc_data/sequences_valid.txt',
        **large_dataset_kwargs)
    .repeat()
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [11]:
# Train the same model on the long-sequence data: 5 epochs of 1000 steps,
# each followed by 100 validation steps.
model.fit(
    x=training_data_large,
    validation_data=valid_data_large,
    epochs=5,
    steps_per_epoch=1000,
    validation_steps=100,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/5

KeyboardInterrupt: 