In [None]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import numpy as np
import matplotlib.pylab as plt
import os

In [None]:
# Load the CIFAR-10 train/test splits through the Keras datasets helper.
train_split, test_split = datasets.cifar10.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split

# Rescale pixel values so they lie between 0 and 1.
train_images = train_images / 255.0
test_images = test_images / 255.0

In [None]:
# Human-readable class names, listed alphabetically so that a name's position
# in the list is its label index. https://www.cs.toronto.edu/~kriz/cifar.html
CLASS_NAMES = [
    'airplane',    # 0
    'automobile',  # 1
    'bird',        # 2
    'cat',         # 3
    'deer',        # 4
    'dog',         # 5
    'frog',        # 6
    'horse',       # 7
    'ship',        # 8
    'truck',       # 9
]

Let's hold out the first 500 test images as the validation data and keep the rest as the test data.

In [None]:
# Number of leading test samples held out for validation. Keeping the split
# point in one named constant guarantees the validation and test slices stay
# disjoint and contiguous if the size is ever changed.
VALIDATION_SIZE = 500

# First VALIDATION_SIZE test samples -> validation set; everything after -> test set.
validation_dataset = tf.data.Dataset.from_tensor_slices(
    (test_images[:VALIDATION_SIZE], test_labels[:VALIDATION_SIZE]))
test_dataset = tf.data.Dataset.from_tensor_slices(
    (test_images[VALIDATION_SIZE:], test_labels[VALIDATION_SIZE:]))

The validation and test datasets are now built. Next, we will use all of the training data for training.

In [None]:
# Wrap the raw NumPy training images and labels as a tf.data dataset whose
# elements are (image, label) pairs.
train_pairs = (train_images, train_labels)
train_dataset = tf.data.Dataset.from_tensor_slices(train_pairs)

In [None]:
# https://www.tensorflow.org/api_docs/python/tf/data/Dataset#transformations_2
# Read the sample count straight from the NumPy array that backs the dataset.
# Iterating the whole dataset just to count its elements copies every image
# into a Python list, and would report the number of *batches* rather than
# samples if this cell were re-run after `train_dataset` has been batched.
train_dataset_size = train_images.shape[0]
print('Training data sample size: ', train_dataset_size)

Training data sample size:  50000


In [None]:
TRAIN_BATCH_SIZE = 200

# Shuffle with a 50000-element buffer (the training-set size printed earlier),
# then cut into fixed-size batches; drop_remainder discards any short final batch.
shuffled_train = train_dataset.shuffle(50000)
train_dataset = shuffled_train.batch(TRAIN_BATCH_SIZE, drop_remainder=True)

In [None]:
# Serve both evaluation splits in batches of 500 samples; the 500-sample
# validation split therefore fits in a single batch.
test_dataset = test_dataset.batch(500)
validation_dataset = validation_dataset.batch(500)

In [None]:
# Validation steps = validation data size // validation batch size.
VALIDATION_STEPS = 1

# Number of full training batches consumed in one epoch.
STEPS_PER_EPOCH = train_dataset_size // TRAIN_BATCH_SIZE

In [None]:
# Small CNN for 32x32 RGB inputs: two conv + max-pool stages, a dense hidden
# layer, and a 10-way softmax output layer named 'custom_class'.
conv_options = dict(
    kernel_size=(3, 3),
    activation='relu',
    kernel_initializer='glorot_uniform',
    padding='same',
)

model = models.Sequential([
    layers.Conv2D(32, input_shape=(32, 32, 3), **conv_options),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Conv2D(64, **conv_options),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Flatten(),
    layers.Dense(256, activation='relu', kernel_initializer='glorot_uniform'),
    layers.Dense(10, activation='softmax', name='custom_class'),
])
model.build([None, 32, 32, 3])

In [None]:
# SGD with momentum; the sparse cross-entropy loss takes integer class labels
# rather than one-hot vectors.
sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
model.compile(
    optimizer=sgd_optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

Let's define some aliases for the file paths used to save model checkpoints.

In [None]:
# Directory for the per-epoch checkpoints. The `{epoch}` placeholder in the
# name template is filled in with the epoch number when a checkpoint is saved.
checkpoint_dir = './cifar10_training_checkpoints'
checkpoint_name_template = "ckpt_{epoch}"
checkpoint_prefix = os.path.join(checkpoint_dir, checkpoint_name_template)
print(checkpoint_prefix)

./cifar10_training_checkpoints/ckpt_{epoch}


See https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint for details of function signature for `ModelCheckpoint`.

In [None]:
# Checkpoint callback that writes to the `ckpt_{epoch}` path template.
# `save_best_only` is not set here, so a checkpoint is written after every epoch.
myCheckPoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    monitor='val_accuracy',
    mode='auto',
)

In [None]:
# Callbacks passed to model.fit: just the every-epoch checkpoint writer.
myCallbacks = [myCheckPoint]

In [None]:
# Train for 12 epochs, running one validation step after each epoch and
# invoking the checkpoint callback.
training_run = model.fit(
    train_dataset,
    epochs=12,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=validation_dataset,
    validation_steps=VALIDATION_STEPS,
    callbacks=myCallbacks,
)

# Keep only the dictionary of per-epoch metric values.
hist = training_run.history

Epoch 1/12
INFO:tensorflow:Assets written to: ./cifar10_training_checkpoints/ckpt_1/assets
Epoch 2/12
INFO:tensorflow:Assets written to: ./cifar10_training_checkpoints/ckpt_2/assets
Epoch 3/12
INFO:tensorflow:Assets written to: ./cifar10_training_checkpoints/ckpt_3/assets
Epoch 4/12
INFO:tensorflow:Assets written to: ./cifar10_training_checkpoints/ckpt_4/assets
Epoch 5/12
INFO:tensorflow:Assets written to: ./cifar10_training_checkpoints/ckpt_5/assets
Epoch 6/12
INFO:tensorflow:Assets written to: ./cifar10_training_checkpoints/ckpt_6/assets
Epoch 7/12
INFO:tensorflow:Assets written to: ./cifar10_training_checkpoints/ckpt_7/assets
Epoch 8/12
INFO:tensorflow:Assets written to: ./cifar10_training_checkpoints/ckpt_8/assets
Epoch 9/12
INFO:tensorflow:Assets written to: ./cifar10_training_checkpoints/ckpt_9/assets
Epoch 10/12
INFO:tensorflow:Assets written to: ./cifar10_training_checkpoints/ckpt_10/assets
Epoch 11/12
INFO:tensorflow:Assets written to: ./cifar10_training_checkpoints/ckpt_11/as

In [None]:
# `hist` is the `.history` attribute of the object returned by model.fit;
# display its Python type.
type(hist)

dict

In [None]:
# Validation accuracy recorded during training, one entry per epoch.
hist['val_accuracy']

[0.47200000286102295,
 0.5680000185966492,
 0.6000000238418579,
 0.5899999737739563,
 0.6119999885559082,
 0.6019999980926514,
 0.6100000143051147,
 0.6380000114440918,
 0.6100000143051147,
 0.5699999928474426,
 0.5619999766349792,
 0.5960000157356262]

In [None]:
# Locate the epoch with the highest validation accuracy in a single pass.
# `max` returns the first maximal element, so ties resolve to the earliest epoch.
val_accuracy_by_epoch = hist['val_accuracy']
max_index, max_value = max(enumerate(val_accuracy_by_epoch), key=lambda pair: pair[1])

# `max_index` is zero-based; epochs are reported starting from 1.
print('Best epoch: ', max_index + 1)

Best epoch:  8


Epoch 8 yielded the model with the highest validation accuracy. Now take a look at the checkpoint directory:

In [None]:
# List the checkpoint directory, oldest entry first (-rt), to see what was saved during training.
!ls -lrt ./cifar10_training_checkpoints

total 48
drwxr-xr-x 4 root root 4096 Jan 14 01:56 ckpt_1
drwxr-xr-x 4 root root 4096 Jan 14 01:56 ckpt_2
drwxr-xr-x 4 root root 4096 Jan 14 01:56 ckpt_3
drwxr-xr-x 4 root root 4096 Jan 14 01:57 ckpt_4
drwxr-xr-x 4 root root 4096 Jan 14 01:57 ckpt_5
drwxr-xr-x 4 root root 4096 Jan 14 01:57 ckpt_6
drwxr-xr-x 4 root root 4096 Jan 14 01:57 ckpt_7
drwxr-xr-x 4 root root 4096 Jan 14 01:57 ckpt_8
drwxr-xr-x 4 root root 4096 Jan 14 01:57 ckpt_9
drwxr-xr-x 4 root root 4096 Jan 14 01:57 ckpt_10
drwxr-xr-x 4 root root 4096 Jan 14 01:57 ckpt_11
drwxr-xr-x 4 root root 4096 Jan 14 01:57 ckpt_12


In [None]:
!ls -lrt ./cifar10_training_checkpoints/ckpt_7

total 136
drwxr-xr-x 2 root root   4096 Jan 14 01:57 variables
drwxr-xr-x 2 root root   4096 Jan 14 01:57 assets
-rw-r--r-- 1 root root 127273 Jan 14 01:57 saved_model.pb


In [None]:
# Separate directory for the run that keeps only improving checkpoints; the
# `{epoch}` placeholder in the name template is filled in at save time.
best_only_checkpoint_dir = './best_only_cifar10_training_checkpoints'
best_only_name_template = "ckpt_{epoch}"
best_only_checkpoint_prefix = os.path.join(best_only_checkpoint_dir, best_only_name_template)
print(best_only_checkpoint_prefix)

./best_only_cifar10_training_checkpoints/ckpt_{epoch}


In [None]:
# Checkpoint callback that monitors `val_accuracy` and, with
# save_best_only=True, skips epochs that do not improve on the best value so far.
bestCheckPoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=best_only_checkpoint_prefix,
    monitor='val_accuracy',
    mode='auto',
    save_best_only=True,
)

In [None]:
# Callbacks for the second run: just the best-only checkpoint writer.
bestCallbacks = [bestCheckPoint]

In [None]:
# Run another 12 epochs with the best-only checkpoint callback.
# NOTE(review): `model` is the same instance already trained by the earlier
# fit call, so this run appears to continue from those weights rather than
# starting from scratch - confirm that is intended.
best_training_run = model.fit(
    train_dataset,
    epochs=12,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=validation_dataset,
    validation_steps=VALIDATION_STEPS,
    callbacks=bestCallbacks,
)

# Keep only the dictionary of per-epoch metric values.
best_hist = best_training_run.history

Epoch 1/12
INFO:tensorflow:Assets written to: ./best_only_cifar10_training_checkpoints/ckpt_1/assets
Epoch 2/12
Epoch 3/12
INFO:tensorflow:Assets written to: ./best_only_cifar10_training_checkpoints/ckpt_3/assets
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
INFO:tensorflow:Assets written to: ./best_only_cifar10_training_checkpoints/ckpt_9/assets
Epoch 10/12
Epoch 11/12
Epoch 12/12


Let's take a look at the checkpoint directory for the run where you set `save_best_only` to `True`:

In [None]:
# List the best-only checkpoint directory, oldest entry first (-rt), to see which epochs were saved.
!ls -lrt ./best_only_cifar10_training_checkpoints

total 12
drwxr-xr-x 4 root root 4096 Jan 14 01:57 ckpt_1
drwxr-xr-x 4 root root 4096 Jan 14 01:58 ckpt_3
drwxr-xr-x 4 root root 4096 Jan 14 01:58 ckpt_9


Not all checkpoints are saved. This is because, after the first epoch, the `save_best_only` option saves a checkpoint only when the metric of your choice improves on the best value seen so far. You can see how many times the model was saved from the directory listing above. Each save marks a new best validation accuracy (`val_accuracy`), so the model's validation accuracy is highest at the last save. Therefore the last checkpoint saved is the best model.