In [1]:
# Snapshot of the GPUs on this node (model, memory in use, compute mode) taken before TensorFlow is imported.
!nvidia-smi

Thu May 28 14:04:08 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.116.00   Driver Version: 418.116.00   CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000004:04:00.0 Off |                    0 |
| N/A   41C    P0    54W / 300W |      0MiB / 16130MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000004:05:00.0 Off |                    0 |
| N/A   45C    P0    38W / 300W |     10MiB / 16130MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000035:03:00.0 Off |                    0 |
| N/A   

In [2]:
import os
import sys
sys.path.append('..')

import numpy as np
import tensorflow as tf

gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if not gpu_devices:
    # Fail with an actionable message instead of a bare IndexError on gpu_devices[0].
    raise RuntimeError(
        'No GPU is visible to TensorFlow; this notebook pins training to the first GPU.')

# Expose only the first GPU to TensorFlow and let its memory allocation grow on
# demand. Both settings have to be applied before the GPU is initialized, so
# this must stay ahead of any op or model creation.
tf.config.set_visible_devices(gpu_devices[0], 'GPU')
tf.config.experimental.set_memory_growth(gpu_devices[0], True)

# Install 'mixed_float16' as the global Keras dtype policy (experimental API in
# TF 2.1) so that layers created after this point pick it up.
from tensorflow.keras.mixed_precision import experimental as mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)

In [3]:
# Record the TensorFlow version this notebook ran against.
tf.__version__

'2.1.0'

In [4]:
from bert.model import create_model
from bert.losses import ECE, masked_sparse_categorical_crossentropy, masked_sparse_categorical_accuracy
from bert.optimization import WarmUp

In [5]:
import tensorflow_addons.optimizers as tfa_optimizers

In [None]:
# Scratch lookup of the stock Keras loss class (cell was never executed); the
# model compiled below uses masked_sparse_categorical_crossentropy from bert.losses instead.
tf.keras.losses.SparseCategoricalCrossentropy

In [6]:
# Hyper-parameters, named once so the warm-up and decay stages cannot drift apart.
PEAK_LEARNING_RATE = 1E-4
WARMUP_STEPS = 1000
DECAY_STEPS = 10000
MODEL_DIMENSION = 768

strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
# Alternative: tf.distribute.MirroredStrategy(). Note that the setup cell only
# makes the first GPU visible, so that restriction would need lifting as well.

# Everything that may create variables (optimizer and model) is built inside
# the strategy scope so the strategy controls their placement.
with strategy.scope():
    # Polynomial decay from the peak rate down to zero over DECAY_STEPS ...
    decay_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=PEAK_LEARNING_RATE,
        decay_steps=DECAY_STEPS,
        end_learning_rate=0.0)

    # ... wrapped by the project's WarmUp schedule for the first WARMUP_STEPS steps.
    lr_schedule = WarmUp(
        initial_learning_rate=PEAK_LEARNING_RATE,
        decay_schedule_fn=decay_schedule,
        warmup_steps=WARMUP_STEPS)

    # LAMB with decoupled weight decay; variables whose names match
    # 'layer_norm' or 'bias' are excluded from the decay.
    optimizer = tfa_optimizers.LAMB(
        learning_rate=lr_schedule,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=['layer_norm', 'bias'])

    model = create_model(model_dimension=MODEL_DIMENSION,
                         transformer_dimension=MODEL_DIMENSION * 4,
                         num_attention_heads=MODEL_DIMENSION // 64,  # 12 heads
                         num_transformer_layers=12,
                         vocab_size=24,
                         dropout_rate=0.,
                         max_relative_position=128,
                         max_sequence_length=1024,
                         attention_type='absolute')

    model.compile(
        loss=masked_sparse_categorical_crossentropy,
        metrics=[ECE, masked_sparse_categorical_accuracy],
        optimizer=optimizer)

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 768)    18432       input_1[0][0]                    
__________________________________________________________________________________________________
position_embedding (PositionEmb (None, None, 768)    787200      input_1[0][0]                    
__________________________________________________________________________________________________
add (Add)                       (None, None, 768)    0           embedding[0][0]                  
                                                                 position_embedding[0][0]     

In [31]:
import os
from bert.dataset import create_masked_input_dataset

# Root directory of the pre-split sequence shards. Defaults to the original
# cluster location; set UNIREF100_DATA_PATH to run against a copy elsewhere.
data_path = os.environ.get(
    'UNIREF100_DATA_PATH', '/gpfs/alpine/proj-shared/bie108/split_uniref100')

# Glob for the gzipped training shards, shared by both datasets below.
train_glob = os.path.join(
    data_path, 'train_uniref100_split/train_100_*.txt.gz')

# Small, fixed-length batches (4 sequences of 16 tokens, half of them masked).
training_data = create_masked_input_dataset(
    sequence_path=train_glob,
    max_sequence_length=16,
    fix_sequence_length=True,
    batch_size=4,
    shard_num_workers=12,
    shard_worker_index=0,
    masking_freq=.5)

# NOTE(review): validation reads the same training glob as above, so the
# validation metrics are not measured on held-out data. Point this at the
# validation split once its location is confirmed.
valid_data = create_masked_input_dataset(
    sequence_path=train_glob,
    max_sequence_length=512,
    batch_size=16)

In [55]:
def _random_token_rows():
    """Yield each row of a freshly drawn (1, 32) array of ints in [0, 50)."""
    yield from np.random.randint(50, size=(1, 32))


# from_generator re-invokes the callable on every pass, so repeat() keeps
# producing new random rows; batch(10) stacks ten of them per element.
ds = tf.data.Dataset.from_generator(
    _random_token_rows,
    output_types=tf.int32,
    output_shapes=(32,))
ds = ds.repeat()
ds = ds.batch(10)

list(ds.take(1))

[<tf.Tensor: shape=(10, 32), dtype=int32, numpy=
 array([[30, 38, 18, 29,  4,  7, 25, 18, 29,  5,  1, 44, 31, 29, 41, 48,
         31, 11, 49, 48, 37, 15, 23, 19,  0, 11, 16, 24, 40,  8, 13, 14],
        [ 4, 37,  6, 34, 27, 24, 39, 21, 49, 29, 49, 39, 43, 20,  7, 32,
         39,  9, 26,  4, 39, 40, 20,  3, 33,  6,  9, 28, 38, 33, 17,  5],
        [ 6, 10, 40, 38, 41, 36, 19, 24, 33, 44, 32,  6, 25, 20, 19, 32,
         45,  7, 19, 26,  2, 26, 23, 11, 46, 38, 12,  9, 29, 29, 48, 18],
        [31, 38, 32, 25, 21, 44, 37, 16,  2, 10, 25, 46, 25, 42, 25, 37,
         38, 18, 38, 24, 12, 21, 15, 21, 40,  0, 41, 23, 20, 49, 48, 44],
        [23,  8, 25, 18, 15, 28, 18,  0, 35, 17, 42, 29, 34, 12, 12, 15,
         24, 17, 14, 17,  7, 14, 19, 49, 38, 42, 37, 16, 44, 21, 30,  9],
        [43, 48, 24, 17, 39, 21, 18, 47, 44, 15, 10,  2, 33,  2, 10, 14,
         20, 35, 16, 45,  9, 37, 42, 44,  4, 20, 35, 12, 13,  0,  6, 21],
        [12,  1, 26, 30, 10, 26, 15, 23, 19, 15, 41,  6, 21, 27, 25, 

In [40]:
# Hand-written 4 x 16 grid of integer ids used as the model input.
input_rows = [
    [17, 16,  1,  8, 19,  1, 17,  6,  5,  9, 18,  3, 20, 16, 19, 10],
    [ 6, 15,  1,  1,  1,  3, 23,  5, 10, 23,  4, 15, 10, 20, 13,  1],
    [ 5, 17,  3, 18, 17,  3,  3, 23, 15,  8,  2, 23, 23,  4, 18,  5],
    [ 3, 16,  1,  1, 23,  8,  7,  3,  5, 15, 20, 16, 14, 10,  1,  9],
]

# Matching 4 x 16 target grid: zero everywhere except at a few positions.
# NOTE(review): presumably the non-zero entries are the labels at masked positions - confirm.
target_rows = [
    [ 0,  0,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
    [ 0, 15,  0,  0,  0,  0, 15,  0,  0, 17,  0,  0,  0,  0,  0,  0],
    [ 0,  0,  0,  0,  0,  0,  0, 11,  0,  0,  0,  9,  4,  0,  0,  0],
    [ 0,  0,  0,  0, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
]

# Each grid becomes a single-element dataset holding the whole batch.
inputs = tf.data.Dataset.from_tensors(input_rows)
targets = tf.data.Dataset.from_tensors(target_rows)

# Zip so that every element is an (inputs, targets) pair.
dataset = tf.data.Dataset.zip((inputs, targets))

In [41]:
# Materialize the first (inputs, targets) element to check the zipped pair by eye.
first_element = dataset.take(1)
list(first_element)

[(<tf.Tensor: shape=(4, 16), dtype=int32, numpy=
  array([[17, 16,  1,  8, 19,  1, 17,  6,  5,  9, 18,  3, 20, 16, 19, 10],
         [ 6, 15,  1,  1,  1,  3, 23,  5, 10, 23,  4, 15, 10, 20, 13,  1],
         [ 5, 17,  3, 18, 17,  3,  3, 23, 15,  8,  2, 23, 23,  4, 18,  5],
         [ 3, 16,  1,  1, 23,  8,  7,  3,  5, 15, 20, 16, 14, 10,  1,  9]],
        dtype=int32)>,
  <tf.Tensor: shape=(4, 16), dtype=int32, numpy=
  array([[ 0,  0,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [ 0, 15,  0,  0,  0,  0, 15,  0,  0, 17,  0,  0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0,  0,  0, 11,  0,  0,  0,  9,  4,  0,  0,  0],
         [ 0,  0,  0,  0, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
        dtype=int32)>)]

In [8]:
# Short training run: 5 epochs of 50 steps, each followed by 10 validation steps.
model.fit(
    training_data,
    steps_per_epoch=50,
    epochs=5,
    verbose=1,
    validation_data=valid_data,
    validation_steps=10,
)

Train for 50 steps, validate for 10 steps
Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ffe385707f0>