In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf
import os

In [None]:
# Load MNIST together with its metadata object. as_supervised=True is requested
# so that elements can be consumed as (image, label) pairs by `scale`.
datasets, info = tfds.load(name='mnist', as_supervised=True, with_info=True)
mnist_train = datasets['train']
mnist_test = datasets['test']

In [None]:
# Create the distribution strategy that the model is later built under and that
# determines the global batch size.
strategy = tf.distribute.MirroredStrategy()
# Report how many replicas the strategy keeps in sync.
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


Number of devices: 1


In [None]:
# Input-pipeline settings.
BUFFER_SIZE = 10000  # buffer size handed to Dataset.shuffle for the training set
BATCH_SIZE_PER_REPLICA = 64  # batch size allotted to each replica (not a replica count)
# Global batch size: the per-replica batch multiplied by the number of replicas.
BATCH_SIZE = strategy.num_replicas_in_sync * BATCH_SIZE_PER_REPLICA

# Split sizes as reported by the dataset metadata.
num_test_examples = info.splits['test'].num_examples
num_train_examples = info.splits['train'].num_examples

In [None]:
def scale(image, label):
  """Convert `image` to float32 and divide it by 255; pass `label` through.

  Args:
    image: Value to normalize (presumably 8-bit pixel data, in which case the
      result lies in [0, 1] -- confirm against the dataset).
    label: Returned unchanged.

  Returns:
    A `(scaled_image, label)` tuple.
  """
  scaled_image = tf.cast(image, tf.float32) / 255
  return scaled_image, label

In [None]:
# Training pipeline: normalize, cache the normalized elements, shuffle, then batch.
train_dataset = (
    mnist_train
    .map(scale)
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)
# Evaluation pipeline: normalize and batch only (no caching or shuffling).
eval_dataset = (
    mnist_test
    .map(scale)
    .batch(BATCH_SIZE)
)

In [None]:
# Build and compile the model inside the distribution strategy's scope.
with strategy.scope():
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)))
  model.add(tf.keras.layers.MaxPooling2D())
  model.add(tf.keras.layers.Flatten())
  model.add(tf.keras.layers.Dense(64, activation='relu'))
  # The final layer has no activation: it emits raw logits, which is why the
  # loss below is configured with from_logits=True.
  model.add(tf.keras.layers.Dense(10))

  model.compile(
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
      optimizer=tf.keras.optimizers.Adam(),
      metrics=['accuracy'],
  )

In [None]:
# Set the location where each checkpoint is saved.
checkpoint_dir = './training_checkpoints'
# Checkpoint file prefix handed to the ModelCheckpoint callback. The literal
# `{epoch}` is left as a placeholder (presumably filled with the epoch number;
# the directory listing after training shows ckpt_1 ... ckpt_10).
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

In [None]:
# Callback that displays the learning rate after every epoch.
class LR(tf.keras.callbacks.Callback):
  """Print the optimizer's current learning rate at the end of each epoch."""

  def on_epoch_end(self, epoch, logs=None):
    """Report the learning rate of the model this callback is attached to.

    Args:
      epoch: Zero-based epoch index supplied by Keras; printed as `epoch + 1`.
      logs: Metrics dict supplied by Keras; unused here.
    """
    # Read the model Keras attached to this callback (`self.model`) instead of
    # the notebook-global `model`, so the callback reports the model actually
    # being trained and does not depend on hidden notebook state.
    # `learning_rate` is the canonical attribute name; `lr` is a legacy alias.
    learning_rate = self.model.optimizer.learning_rate
    print('\nLearning rate for epoch {} is {}'.format(epoch + 1, learning_rate.numpy()))

# Callbacks passed to model.fit.
callbacks = [
    # Write training logs for TensorBoard under ./logs.
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
    # Save the model weights only (not the full model) using checkpoint_prefix.
    tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
                                       save_weights_only=True),
    # Print the learning rate at the end of each epoch.
    LR()
]

In [None]:
# Train for 10 epochs on the training pipeline, running the callbacks list on each epoch.
model.fit(train_dataset, epochs=10, callbacks=callbacks)

Epoch 1/10





Learning rate for epoch 1 is 0.0010000000474974513
Epoch 2/10

Learning rate for epoch 2 is 0.0010000000474974513
Epoch 3/10

Learning rate for epoch 3 is 0.0010000000474974513
Epoch 4/10

Learning rate for epoch 4 is 0.0010000000474974513
Epoch 5/10

Learning rate for epoch 5 is 0.0010000000474974513
Epoch 6/10

Learning rate for epoch 6 is 0.0010000000474974513
Epoch 7/10

Learning rate for epoch 7 is 0.0010000000474974513
Epoch 8/10

Learning rate for epoch 8 is 0.0010000000474974513
Epoch 9/10

Learning rate for epoch 9 is 0.0010000000474974513
Epoch 10/10

Learning rate for epoch 10 is 0.0010000000474974513


<tensorflow.python.keras.callbacks.History at 0x7f6c425867f0>

In [None]:
# List the checkpoint directory to confirm where the results were saved.
!ls {checkpoint_dir}

checkpoint		     ckpt_5.data-00000-of-00001
ckpt_10.data-00000-of-00001  ckpt_5.index
ckpt_10.index		     ckpt_6.data-00000-of-00001
ckpt_1.data-00000-of-00001   ckpt_6.index
ckpt_1.index		     ckpt_7.data-00000-of-00001
ckpt_2.data-00000-of-00001   ckpt_7.index
ckpt_2.index		     ckpt_8.data-00000-of-00001
ckpt_3.data-00000-of-00001   ckpt_8.index
ckpt_3.index		     ckpt_9.data-00000-of-00001
ckpt_4.data-00000-of-00001   ckpt_9.index
ckpt_4.index


In [None]:
# Restore the weights from the most recent checkpoint found in checkpoint_dir.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
# Evaluate on the test pipeline; the result is unpacked as (loss, accuracy),
# matching the loss and the single 'accuracy' metric configured in compile().
eval_loss, eval_acc = model.evaluate(eval_dataset)
print('loss: {}, Accuracy: {}'.format(eval_loss, eval_acc))

loss: 0.05265086144208908, Accuracy: 0.9853000044822693
