In [1]:
# Shell out to nvidia-smi to record the driver/CUDA versions and the state of
# each GPU on the node at the time this notebook was run.
!nvidia-smi

Sun Jun 28 20:02:42 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.116.00   Driver Version: 418.116.00   CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000004:04:00.0 Off |                    0 |
| N/A   38C    P0    37W / 300W |     10MiB / 16130MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000004:05:00.0 Off |                    0 |
| N/A   40C    P0    37W / 300W |     10MiB / 16130MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000035:03:00.0 Off |                    0 |
| N/A   

In [2]:
import os
import sys
sys.path.append('..')

import numpy as np
import tensorflow as tf

# Pin this kernel to a single GPU (the first one TensorFlow enumerates) and
# enable memory growth on it so memory is allocated on demand rather than
# reserved up front.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if not gpu_devices:
    # Without this guard the indexing below fails with a bare IndexError that
    # gives no hint about the actual problem.
    raise RuntimeError(
        'No GPU is visible to TensorFlow; this notebook expects at least one '
        '(compare with the nvidia-smi output above).')

tf.config.set_visible_devices(gpu_devices[0], 'GPU')
tf.config.experimental.set_memory_growth(gpu_devices[0], True)

# Install mixed_float16 as the global Keras dtype policy. This runs before any
# model is created in the cells below.
from tensorflow.keras.mixed_precision import experimental as mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)

In [3]:
# Display the TensorFlow version this notebook was executed with.
tf.__version__

'2.1.0'

In [4]:
from bert.dataset import create_masked_input_dataset

# Both splits sit under the same root directory; each is addressed by a glob
# pattern over its .txt files.
split_root = '/ccs/home/pstjohn/project_work/split_uniref100/'
train_glob = os.path.join(split_root, 'train_uniref100_split/train_100_*.txt')
valid_glob = os.path.join(split_root, 'dev_uniref50_split/dev_50_*.txt')

# Keyword arguments passed identically to the training and validation datasets.
shared_options = dict(
    max_sequence_length=512,
    batch_size=8,
    fix_sequence_length=True,
    sequence_compression=None,
    filter_bzux=False,
)

# Build both input pipelines under an explicit CPU device scope.
with tf.device('/CPU:0'):
    # Only the training dataset is given explicit buffer sizes; the validation
    # dataset uses whatever defaults create_masked_input_dataset defines.
    training_data = create_masked_input_dataset(
        sequence_path=train_glob,
        file_buffer_size=2048,
        buffer_size=10000,
        **shared_options)

    valid_data = create_masked_input_dataset(
        sequence_path=valid_glob,
        **shared_options)

In [5]:
from bert.model import create_model

# Base width of the model; transformer_dimension and num_attention_heads
# below are both derived from it.
modelDimension = 768

model = create_model(
    # Sizes derived from the base width.
    model_dimension=modelDimension,
    transformer_dimension=4 * modelDimension,
    num_attention_heads=modelDimension // 64,  # 768 // 64 == 12
    num_transformer_layers=12,
    # Attention configuration.
    attention_type='relative',
    max_relative_position=64,
    max_sequence_length=512,
    # Remaining settings.
    vocab_size=24,
    dropout_rate=0.0,
)

In [6]:
# Restore model weights from the most recent checkpoint in this run directory.
checkpoint_dir = '/gpfs/alpine/scratch/pstjohn/bie108/uniparc_checkpoints/12_layer_relative_adam_20200625_highlr.184212/'
checkpoint = tf.train.latest_checkpoint(checkpoint_dir)

# latest_checkpoint returns None when the directory holds no checkpoint; fail
# here with the directory name instead of inside load_weights with an
# unrelated-looking error.
if checkpoint is None:
    raise FileNotFoundError(
        'No TensorFlow checkpoint found in {!r}'.format(checkpoint_dir))

# Kept as the cell's last expression so the returned load-status object is
# displayed.
model.load_weights(checkpoint)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fffb86b2828>

In [10]:
from bert.optimization import WarmUp

# Value handed to both schedules below as their initial_learning_rate.
base_learning_rate = 1E-4

# Polynomial decay from the base rate down to 0.0 over 125000 steps.
decay_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=base_learning_rate,
    decay_steps=125000,
    end_learning_rate=0.0)

# Wrap the decay in the project's WarmUp schedule (warmup_steps=3125); how
# WarmUp blends the two phases is defined in bert.optimization.
lr_schedule = WarmUp(
    initial_learning_rate=base_learning_rate,
    decay_schedule_fn=decay_schedule,
    warmup_steps=3125)

# Adam driven by the combined warm-up + decay schedule.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=lr_schedule, beta_1=0.9, beta_2=0.999, epsilon=1e-6)

In [11]:
from bert.losses import (ECE, masked_sparse_categorical_crossentropy,
                         masked_sparse_categorical_accuracy)

# Attach the optimizer, the masked cross-entropy loss and the two tracked
# metrics to the model.
model.compile(optimizer=optimizer,
              loss=masked_sparse_categorical_crossentropy,
              metrics=[ECE, masked_sparse_categorical_accuracy])

In [12]:
# Inspect the optimizer's weight list right after compile; the recorded output
# is an empty list.
# NOTE(review): presumably empty because no training step has run yet and the
# checkpoint restore above did not populate optimizer state — confirm.
optimizer.get_weights()

[]