In [None]:
# IPython shell escape: print the NVIDIA driver/GPU status table for this node
# before TensorFlow is configured.
!nvidia-smi

In [None]:
import os
import sys
sys.path.append('..')

import numpy as np
import tensorflow as tf

# Ask TensorFlow which physical GPUs it can see. The non-experimental
# `tf.config.list_physical_devices` is used for consistency with the
# non-experimental `tf.config.set_visible_devices` call below.
gpu_devices = tf.config.list_physical_devices('GPU')

if gpu_devices:
    # Expose only the first GPU to this process, and let TensorFlow grow its
    # allocation on that device on demand instead of reserving it all up front.
    tf.config.set_visible_devices(gpu_devices[0], 'GPU')
    tf.config.experimental.set_memory_growth(gpu_devices[0], True)
else:
    # Indexing an empty device list would stop the notebook with a bare
    # IndexError; report the situation explicitly and carry on instead.
    print('WARNING: TensorFlow found no GPU; continuing without GPU configuration.')

# Enable the 'mixed_float16' Keras policy globally for layers created after
# this point.
# NOTE(review): the `experimental` mixed-precision namespace is deprecated in
# later TensorFlow releases in favour of
# `tf.keras.mixed_precision.set_global_policy` — confirm against the TF version
# this notebook is pinned to before migrating.
from tensorflow.keras.mixed_precision import experimental as mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)

In [None]:
# Display the version string of the TensorFlow build loaded in this kernel.
tf.__version__

In [None]:
from bert.model import create_albert_model
from bert.losses import ECE, masked_sparse_categorical_crossentropy, masked_sparse_categorical_accuracy
from bert.optimization import WarmUp

In [None]:
import tensorflow_addons.optimizers as tfa_optimizers

In [None]:
# Build the distribution strategy first so that every object that may own
# variables (learning-rate schedule, optimizer, model) is created inside its
# scope, as recommended for tf.distribute strategies.
# Single-device alternative:
#   strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
strategy = tf.distribute.MirroredStrategy()

# Learning rate shared by the warm-up wrapper and the decay schedule.
peak_learning_rate = 1E-4

with strategy.scope():
    # Polynomial decay from the peak learning rate down to 0 over 10000 steps.
    lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=peak_learning_rate,
        decay_steps=10000,
        end_learning_rate=0.0)

    # Wrap the decay schedule with the project's WarmUp schedule.
    # NOTE(review): presumably ramps up to `initial_learning_rate` during the
    # first 1000 steps and then defers to `decay_schedule_fn` — confirm in
    # bert.optimization.
    lr_schedule = WarmUp(
        initial_learning_rate=peak_learning_rate,
        decay_schedule_fn=lr_schedule,
        warmup_steps=1000)

    # LAMB optimizer from TensorFlow Addons; variables whose names match
    # 'layer_norm' or 'bias' are excluded from weight decay.
    optimizer = tfa_optimizers.LAMB(
        learning_rate=lr_schedule,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=['layer_norm', 'bias'])

    # ALBERT model: 768-wide, 12 transformer layers, 64 dimensions per
    # attention head (768 // 64 heads), feed-forward width of 4x the model
    # dimension, dropout disabled.
    model = create_albert_model(model_dimension=768,
                                transformer_dimension=768 * 4,
                                num_attention_heads=768 // 64,
                                num_transformer_layers=12,
                                vocab_size=24,
                                dropout_rate=0.,
                                max_relative_position=128)

    # Masked loss and metrics come from the project's bert.losses module.
    model.compile(
        loss=masked_sparse_categorical_crossentropy,
        metrics=[ECE, masked_sparse_categorical_accuracy],
        optimizer=optimizer)


model.summary()

In [None]:
import os
from bert.dataset import create_masked_input_dataset
# Root directory of the pre-split UniRef100 sequence files.
data_path = '/gpfs/alpine/proj-shared/bie108/split_uniref100'

# Glob matching the gzipped sequence shards; both datasets below read from it.
sequence_glob = os.path.join(
    data_path, 'train_uniref100_split/train_100_*.txt.gz')

# Training pipeline: sequences capped at 512 with fixed length enabled,
# batches of 16, taking shard 0 of 12.
# (An earlier variant wrapped this in `with tf.device('/CPU:0'):`; it is
# left disabled.)
training_data = create_masked_input_dataset(
    sequence_path=sequence_glob,
    max_sequence_length=512,
    fix_sequence_length=True,
    batch_size=16,
    shard_num_workers=12,
    shard_worker_index=0)

# Validation pipeline: same length cap and batch size, no sharding arguments
# and no `fix_sequence_length`.
# NOTE(review): this reads the same `train_100_*` glob as `training_data`, so
# validation metrics are measured on training files — confirm whether a
# dedicated validation split should be used here.
valid_data = create_masked_input_dataset(
    sequence_path=sequence_glob,
    max_sequence_length=512,
    batch_size=16)

In [None]:
# Draw a single (inputs, targets) batch from the training pipeline for the
# spot check in the next cell.
training_iterator = iter(training_data)
inputs, targets = next(training_iterator)

In [None]:
# Run the model on the sampled inputs, then score its outputs against the
# sampled targets with the project's masked accuracy function.
batch_outputs = model(inputs)
masked_sparse_categorical_accuracy(targets, batch_outputs)

In [None]:
# Short training run: 5 epochs of 50 steps each, evaluating 10 validation
# batches per epoch, with progress output enabled.
model.fit(
    training_data,
    steps_per_epoch=50,
    epochs=5,
    verbose=1,
    validation_data=valid_data,
    validation_steps=10,
)

In [None]:
# Take one batch from the training pipeline and run the model's predict path
# on its first element.
# NOTE(review): by name, `masked` looks like the masked model input and `true`
# the unmasked labels — confirm against create_masked_input_dataset.
prediction_batch = next(iter(training_data))
masked, true = prediction_batch
predictions = model.predict(masked)

In [None]:
# Display index 0 along the second axis for every entry of the `masked` batch.
# NOTE(review): assumes `masked` is a (batch, position) token array, which
# would make this the first token of each sequence — confirm.
masked[:, 0]