In [None]:
import numpy as np
import tensorflow as tf

In [None]:
# Log the device every op is placed on, so CPU/GPU placement shows up in the
# cell output.
tf.debugging.set_log_device_placement(True)

# List the physical GPUs visible to TensorFlow and report how many were found.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Enable memory growth on each GPU that was found.
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, enable=True)

In [None]:
# Shell escape: run the `nvidia-smi` command-line tool and show its output in the cell.
!nvidia-smi

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.models import load_model

from bert.layers import (Attention, Transformer,
                         gelu, initializer, Projection, DenseNoMask)

from bert.losses import masked_sparse_categorical_crossentropy, ECE
from bert.optimization import WarmUp

In [7]:
from bert.model import create_albert_model
# Build the model with `create_albert_model`. The transformer dimension is
# 4x the model dimension (512), and the head count is 512 // 64 = 8.
model = create_albert_model(
    vocab_size=24,
    model_dimension=512,
    transformer_dimension=4 * 512,
    num_attention_heads=512 // 64,
    num_transformer_layers=6,
    max_relative_position=128,
    dropout_rate=0.0,
    weight_share=False,
)

# Print the Keras summary of the freshly built model.
model.summary()

In [8]:
from bert.losses import ECE, masked_sparse_categorical_crossentropy
from bert.optimization import create_optimizer, WarmUp

In [9]:
# Base schedule: polynomial decay from 1e-4 down to 0.0 over 1,000,000 steps.
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1e-4, decay_steps=1_000_000, end_learning_rate=0.0)

# Wrap the decay schedule in the project's WarmUp schedule (10,000 warmup
# steps, same initial learning rate as the decay schedule).
learning_rate_fn_warmup = WarmUp(
    initial_learning_rate=1e-4,
    decay_schedule_fn=learning_rate_fn,
    warmup_steps=10_000,
)

# Adam optimizer driven by the warmup-wrapped schedule.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate_fn_warmup, beta_1=0.9, beta_2=0.999, epsilon=1e-6)

In [10]:
# Compile the model with the optimizer built above, the project's masked
# sparse categorical cross-entropy loss, and ECE as the only metric.
model.compile(optimizer=optimizer,
              loss=masked_sparse_categorical_crossentropy,
              metrics=[ECE])

Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0


In [11]:
# callbacks = [
#     tf.keras.callbacks.ModelCheckpoint(filepath='jupyter_test_checkpoints/tfckpt', save_weights_only=True),
#     tf.keras.callbacks.TensorBoard(
#         log_dir='../uniparc_checkpoints/tblogs/test',
#         histogram_freq=0,
#         write_graph=False,
#         profile_batch=0,
#         update_freq='epoch',
#         embeddings_freq=0)]    

In [12]:
from bert.dataset import create_masked_input_dataset

# Build both input pipelines under an explicit CPU device scope. The training
# and validation pipelines are now constructed the same way: previously the
# validation `.repeat().prefetch()` step was dedented out of the `with` block
# while the identical training step was inside it.
with tf.device('/CPU:0'):

    # Training pipeline: repeats indefinitely (epoch length is set by
    # `steps_per_epoch` in `fit`) and prefetches with an auto-tuned buffer.
    training_data = create_masked_input_dataset(
        sequence_path='../uniparc_data/train_uniref100.txt.gz',
        max_sequence_length=12,
        fix_sequence_length=True,
        batch_size=5)

    training_data = training_data.repeat().prefetch(tf.data.experimental.AUTOTUNE)

    # NOTE(review): unlike the training set, this call does not pass
    # fix_sequence_length=True -- confirm that is intentional.
    valid_data = create_masked_input_dataset(
        sequence_path='../uniparc_data/dev_uniref50.txt.gz',
        max_sequence_length=12,
        batch_size=5)

    valid_data = valid_data.repeat().prefetch(tf.data.experimental.AUTOTUNE)

Executing op Range in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AddV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op HashTableV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op LookupTableImportV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Reshape in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorSliceDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AnonymousRandomSeedGenerator in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ShuffleDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FilterDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Execu

In [None]:
%pdb
model.fit(training_data, steps_per_epoch=500, epochs=5,
          verbose=1, validation_data=valid_data, validation_steps=10)

Automatic pdb calling has been turned ON
Executing op DatasetCardinality in device /job:localhost/replica:0/task:0/device:CPU:0
Train for 500 steps, validate for 10 steps
Epoch 1/5
Executing op OptimizeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AnonymousIteratorV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MakeIterator in device /job:localhost/replica:0/task:0/device:CPU:0
  1/500 [..............................] - ETA: 19:19

SystemError: <built-in function len> returned a result with an error set

> [0;32m/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/backprop.py[0m(598)[0;36m_aggregate_grads[0;34m()[0m
[0;32m    596 [0;31m  [0;32massert[0m [0mgradients[0m[0;34m,[0m [0;34m"No gradients to aggregate"[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    597 [0;31m[0;34m[0m[0m
[0m[0;32m--> 598 [0;31m  [0;32mif[0m [0mlen[0m[0;34m([0m[0mgradients[0m[0;34m)[0m [0;34m==[0m [0;36m1[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    599 [0;31m    [0;32mreturn[0m [0mgradients[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    600 [0;31m  [0;32mif[0m [0mall[0m[0;34m([0m[0misinstance[0m[0;34m([0m[0mg[0m[0;34m,[0m [0mops[0m[0;34m.[0m[0mTensor[0m[0;34m)[0m [0;32mfor[0m [0mg[0m [0;32min[0m [0mgradients[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  gradients


[<tensorflow.python.framework.indexed_slices.IndexedSlices object at 0x7fa68ca5c550>, <tensorflow.python.framework.indexed_slices.IndexedSlices object at 0x7fa68c524b00>, <tensorflow.python.framework.indexed_slices.IndexedSlices object at 0x7fa68bf86940>, <tensorflow.python.framework.indexed_slices.IndexedSlices object at 0x7fa68ba4de10>, <tensorflow.python.framework.indexed_slices.IndexedSlices object at 0x7fa6380bbda0>]


ipdb>  len(gradients)


5


ipdb>  all(isinstance(g, ops.Tensor) for g in gradients)


False


In [None]:
# Restore weights from the checkpoint directory.
# The original call passed the directory itself; TensorFlow-format checkpoints
# are addressed by a prefix inside the directory (the commented-out
# ModelCheckpoint callback in this notebook writes to
# 'jupyter_test_checkpoints/tfckpt'), so resolve the newest prefix first.
# If no checkpoint state is found, fall back to the original path unchanged.
# NOTE(review): the following cell rebuilds `model`, which discards the weights
# loaded here -- confirm the intended cell order.
checkpoint_dir = 'jupyter_test_checkpoints/'
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
model.load_weights(
    latest_checkpoint if latest_checkpoint is not None else checkpoint_dir)

In [None]:
# Rebuild the model from scratch with the same arguments as the earlier
# `create_albert_model` call in this notebook (relies on that cell's import).
model = create_albert_model(
    vocab_size=24,
    model_dimension=512,
    transformer_dimension=4 * 512,
    num_attention_heads=512 // 64,
    num_transformer_layers=6,
    max_relative_position=128,
    dropout_rate=0.0,
    weight_share=False,
)