In [1]:
import numpy as np
import tensorflow as tf

In [2]:
# Uncomment to log the device each op is placed on:
# tf.debugging.set_log_device_placement(True)

# List the GPUs TensorFlow can see and switch on memory growth for each one.
tf_experimental_config = tf.config.experimental
gpu_devices = tf_experimental_config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# NOTE(review): memory growth presumably has to be set before the GPUs are
# first used by TensorFlow -- confirm this cell always runs first.
for gpu in gpu_devices:
    tf_experimental_config.set_memory_growth(gpu, True)

Num GPUs Available:  1


In [3]:
# Shell escape: print the NVIDIA driver/GPU status table (driver version, memory usage, processes).
!nvidia-smi

Wed Feb  5 13:39:38 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.26       Driver Version: 430.26       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Quadro GV100        Off  | 00000000:37:00.0 Off |                  Off |
| 42%   51C    P0    86W / 250W |    234MiB / 32508MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|    0  

In [4]:
from bert.model import create_albert_model

# Single source for the model width; the transformer dimension (4x) and the
# number of attention heads (width // 64) are derived from it.
hidden_size = 768

albert_config = dict(
    model_dimension=hidden_size,
    transformer_dimension=hidden_size * 4,   # 3072
    num_attention_heads=hidden_size // 64,   # 12
    num_transformer_layers=24,
    vocab_size=24,
    dropout_rate=0.1,
    max_relative_position=128,
    final_layernorm=False,
)

# Build the ALBERT model from the configuration above and print its layer table.
model = create_albert_model(**albert_config)

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 768)         18432     
_________________________________________________________________
transformer (Transformer)    (None, None, 768)         6513728   
_________________________________________________________________
transformer_1 (Transformer)  (None, None, 768)         6513728   
_________________________________________________________________
transformer_2 (Transformer)  (None, None, 768)         6513728   
_________________________________________________________________
transformer_3 (Transformer)  (None, None, 768)         6513728   
_________________________________________________________________
transformer_4 (Transformer)  (None, None, 768)         651372

In [5]:
from bert.losses import ECE, masked_sparse_categorical_crossentropy
from bert.optimization import create_optimizer, WarmUp

In [6]:
# Learning-rate schedule: polynomial decay from 1e-4 down to 0 over 400k steps.
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1E-4,
    decay_steps=400000,
    end_learning_rate=0.0)

# Wrap the decay schedule in a 10k-step warm-up.
learning_rate_fn_warmup = WarmUp(initial_learning_rate=1E-4,
                                 decay_schedule_fn=learning_rate_fn,
                                 warmup_steps=10000)

# Fix: the optimizer used to receive the constant 1E-4, so the warm-up/decay
# schedule built above was never applied. Pass the schedule itself instead.
# NOTE(review): assumes bert.optimization.WarmUp is a
# tf.keras.optimizers.schedules.LearningRateSchedule -- confirm.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate_fn_warmup,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-6)

In [7]:
# Configure the model for training: masked sparse categorical cross-entropy
# as the loss, ECE as the only tracked metric, and the optimizer defined
# in the earlier cell.
model.compile(optimizer=optimizer,
              loss=masked_sparse_categorical_crossentropy,
              metrics=[ECE])

In [8]:
# TensorBoard logging, written once per epoch. Histogram, graph, profiling
# and embedding logging are all turned off (frequency 0 / False).
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='../uniparc_checkpoints/tblogs/test',
    update_freq='epoch',
    histogram_freq=0,
    embeddings_freq=0,
    profile_batch=0,
    write_graph=False,
)

# Checkpointing is currently disabled; add this entry to the list to enable it:
#    tf.keras.callbacks.ModelCheckpoint(filepath='jupyter_test_checkpoints/tfckpt', save_weights_only=True),
callbacks = [tensorboard_callback]

### Parameters that worked on a 32 GB card:
* batch size of 64 with sequence length 128
* batch size of 24 with sequence length 256
* batch size of 8 with sequence length 512
* batch size of 3 with sequence length 1024

In [11]:
from bert.dataset import create_masked_input_dataset

# Build both input pipelines under the CPU device scope.
with tf.device('/CPU:0'):

    # Training stream: repeated indefinitely and prefetched, so the number of
    # batches per epoch is governed by `steps_per_epoch` in model.fit.
    training_data = (
        create_masked_input_dataset(
            sequence_path='../uniparc_data/train_uniref100.txt.gz',
            max_sequence_length=1024,
            fix_sequence_length=True,
            batch_size=3)
        .repeat()
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

    # NOTE(review): unlike the training set, fix_sequence_length is not passed
    # here -- confirm that this difference is intentional.
    # Fix: the repeat/prefetch step used to sit outside the `with` block; it now
    # lives inside the same CPU device scope as the training pipeline.
    valid_data = (
        create_masked_input_dataset(
            sequence_path='../uniparc_data/dev_uniref50.txt.gz',
            max_sequence_length=1024,
            batch_size=3)
        .repeat()
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

In [12]:
# Short training run: 5 epochs of 50 training steps, each validated on 10
# steps, with progress output and the callbacks defined in the earlier cell.
model.fit(
    training_data,
    epochs=5,
    steps_per_epoch=50,
    validation_data=valid_data,
    validation_steps=10,
    callbacks=callbacks,
    verbose=1,
)

Train for 50 steps, validate for 10 steps
Epoch 1/5

KeyboardInterrupt: 

In [None]:
# model = create_albert_model(model_dimension=512,
#                             transformer_dimension=512 * 4,
#                             num_attention_heads=512 // 64,
#                             num_transformer_layers=6,
#                             vocab_size=24,
#                             dropout_rate=0.,
#                             max_relative_position=128,
#                             weight_share=False)