# CIFAR-10 image classification with TensorBoard

- Source: https://github.com/aws/amazon-sagemaker-examples/tree/master/sagemaker-python-sdk/tensorboard_keras/source_dir

## Set up the environment

In [1]:
import os
import sagemaker
from sagemaker import get_execution_role
import tensorflow as tf
from tensorflow import keras

# Session object used below to look up the default bucket and, later, the AWS region.
sagemaker_session = sagemaker.Session()

# Role handed to the estimators created later in this notebook.
role = get_execution_role()

# Everything this notebook stores in S3 (the TFRecord datasets and the
# TensorBoard logs) lives under s3://<bucket>/<prefix>/.
bucket = sagemaker_session.default_bucket()
prefix = "tensorboard_keras_cifar10"
# This URI is passed to the training script as the "tf-logs-path" hyperparameter
# and later given to `tensorboard --logdir`, so both sides use the same location.
tensorflow_logs_path = "s3://{}/{}/logs".format(bucket, prefix)

# Record the bucket and library versions alongside the notebook outputs.
print("Bucket: {}".format(bucket))
print("SageMaker ver: " + sagemaker.__version__)
print("Tensorflow ver: " + tf.__version__)

Bucket: sagemaker-ap-northeast-2-889750940888
SageMaker ver: 2.68.0
Tensorflow ver: 2.4.3


## Download the CIFAR-10 dataset

In [2]:
from tensorflow.keras.datasets import cifar10

# load_data() yields a (train, test) pair, each of which is an (images, labels) pair.
train_split, test_split = cifar10.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

# Quick sanity check of what was loaded.
n_train, n_test = x_train.shape[0], x_test.shape[0]
print("x_train shape:", x_train.shape)
print(n_train, "train samples")
print(n_test, "test samples")

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
x_train shape: (50000, 32, 32, 3)
50000 train samples
10000 test samples


Converting dataset into TFRecord files to allow us to use SageMaker Pipe Mode.

In [3]:
import os

# Local staging folders; their layout (data/train, data/validation) is what gets
# synced to S3 later. exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("./data/validation", exist_ok=True)
os.makedirs("./data/train", exist_ok=True)


def write_tfrecords(x, y, filename):
    """Serialize paired images and labels into a single TFRecord file.

    Each (image, label) pair becomes one ``tf.train.Example`` with two features:
    ``"image"`` holds the bytes returned by ``image.tobytes()`` and ``"label"``
    holds the label in an int64 list.

    Args:
        x: Iterable of images; every item must provide ``tobytes()``.
        y: Iterable of labels, paired positionally with ``x`` through ``zip``
            (surplus items in the longer iterable are ignored).
        filename: Path of the TFRecord file to create.
    """
    # The context manager guarantees the writer is closed when the loop finishes
    # or raises, instead of leaving the file to be finalised by garbage collection.
    with tf.io.TFRecordWriter(filename) as writer:
        for image, label in zip(x, y):
            example = tf.train.Example(
                features=tf.train.Features(
                    feature={
                        "image": tf.train.Feature(
                            bytes_list=tf.train.BytesList(value=[image.tobytes()])
                        ),
                        "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[label])),
                    }
                )
            )
            writer.write(example.SerializeToString())


# The CIFAR-10 test split is written out as the validation set.
write_tfrecords(x=x_test, y=y_test, filename="./data/validation/validation.tfrecords")

# The training split goes into its own file under ./data/train.
write_tfrecords(x=x_train, y=y_train, filename="./data/train/train.tfrecords")

## Uploading the data to S3

In [4]:
# S3 prefixes for the two input channels. They mirror the local ./data/train and
# ./data/validation folders, which are synced to s3://<bucket>/<prefix>/ in the next cell.
train_location = 's3://{}/{}/train'.format(bucket, prefix)
validation_location = 's3://{}/{}/validation'.format(bucket, prefix)

In [5]:
# Copy the local ./data tree (train/ and validation/ subfolders) to s3://<bucket>/<prefix>/.
!aws s3 sync ./data s3://{bucket}/{prefix}

upload: data/validation/validation.tfrecords to s3://sagemaker-ap-northeast-2-889750940888/tensorboard_keras_cifar10/validation/validation.tfrecords
upload: data/train/train.tfrecords to s3://sagemaker-ap-northeast-2-889750940888/tensorboard_keras_cifar10/train/train.tfrecords


In [6]:
# Confirm the training TFRecord file is present under the train prefix.
!aws s3 ls {train_location} --recursive

2021-12-22 01:04:36  156300000 tensorboard_keras_cifar10/train/train.tfrecords


In [7]:
# Confirm the validation TFRecord file is present under the validation prefix.
!aws s3 ls {validation_location} --recursive

2021-12-22 01:04:36   31260000 tensorboard_keras_cifar10/validation/validation.tfrecords


## Local mode training

In [8]:
# Regular expressions applied to the training log to extract metrics. Each
# pattern targets the Keras progress line
#   "... - 12s 61ms/step - loss: L - accuracy: A - val_loss: VL - val_accuracy: VA"
# and captures exactly one number from it.
#
# Backslashes are doubled throughout because these are ordinary (non-raw) string
# literals: "\\d" and "\\." yield the regex tokens \d and \. without relying on
# Python passing an unrecognised escape sequence through (which emits a warning).
keras_metric_definition = [
    {"Name": "train:loss", "Regex": ".*loss: ([0-9\\.]+) - accuracy: [0-9\\.]+.*"},
    {"Name": "train:accuracy", "Regex": ".*loss: [0-9\\.]+ - accuracy: ([0-9\\.]+).*"},
    {
        "Name": "validation:accuracy",
        "Regex": ".*step - loss: [0-9\\.]+ - accuracy: [0-9\\.]+ - val_loss: [0-9\\.]+ - val_accuracy: ([0-9\\.]+).*",
    },
    {
        "Name": "validation:loss",
        "Regex": ".*step - loss: [0-9\\.]+ - accuracy: [0-9\\.]+ - val_loss: ([0-9\\.]+) - val_accuracy: [0-9\\.]+.*",
    },
    {
        # Captures the integer in front of "ms/step" or "us/step", i.e. the
        # per-step time in milli- or microseconds despite the metric's name.
        "Name": "sec/steps",
        "Regex": ".* (\\d+)[mu]s/step - loss: [0-9\\.]+ - accuracy: [0-9\\.]+ - val_loss: [0-9\\.]+ - val_accuracy: [0-9\\.]+",
    },
]

In [9]:
from sagemaker.tensorflow import TensorFlow

# Hyperparameters forwarded to the entry-point script. "tf-logs-path" tells the
# script where to write TensorBoard event files (the S3 URI built in the setup cell).
hyperparameters = {"epochs": 2, "batch-size": 256, "tf-logs-path": tensorflow_logs_path}

# One input channel per dataset split, each pointing at its S3 prefix.
inputs = {"train": train_location, "validation": validation_location}

# 'local' selects SageMaker local mode for this smoke test — presumably the
# training container runs on this notebook instance; confirm against the SDK docs.
instance_type = 'local'

estimator = TensorFlow(
    base_job_name="tensorboard-example",
    entry_point="tensorboard_keras_cifar10.py",
    source_dir="source_dir",
    role=role,
    # Version requested for the training job; note it differs from the
    # TensorFlow version printed by the setup cell for this kernel.
    framework_version="2.2.0",
    py_version="py37",
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type=instance_type,
    metric_definitions=keras_metric_definition,
    # Pipe mode is the reason the dataset was converted to TFRecord files earlier.
    input_mode="Pipe",
)

# Block until the training job finishes so its logs appear in the cell output.
estimator.fit(inputs, wait=True)

Creating fo1f85hu2q-algo-1-4yc1s ... 
Creating fo1f85hu2q-algo-1-4yc1s ... done
Attaching to fo1f85hu2q-algo-1-4yc1s
[36mfo1f85hu2q-algo-1-4yc1s |[0m 2021-12-22 01:05:26.169000: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:425] Initializing the SageMaker Profiler.
[36mfo1f85hu2q-algo-1-4yc1s |[0m 2021-12-22 01:05:26.169144: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:106] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
[36mfo1f85hu2q-algo-1-4yc1s |[0m 2021-12-22 01:05:26.190062: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:425] Initializing the SageMaker Profiler.
[36mfo1f85hu2q-algo-1-4yc1s |[0m 2021-12-22 01:05:27,610 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training
[36mfo1f85hu2q-algo-1-4yc1s |[0m 2021-12-22 01:05:27,618 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36mfo1f

## Hyperparameter tuning

In [10]:
# Hyperparameters that stay fixed across all tuning jobs. "batch-size" is left
# out here because the tuner supplies it (together with "learning-rate" and
# "optimizer") from the ranges defined in the next cell.
shared_hyperparameters = {"epochs": 6, "tf-logs-path": tensorflow_logs_path}

# Same entry point and container settings as the local-mode estimator, but with
# a different job-name prefix and a remote instance type instead of 'local'.
estimator = TensorFlow(
    base_job_name="tensorboard-example-hpo",
    entry_point="tensorboard_keras_cifar10.py",
    source_dir="source_dir",
    role=role,
    framework_version="2.2.0",
    py_version="py37",
    hyperparameters=shared_hyperparameters,
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    metric_definitions=keras_metric_definition,
    input_mode="Pipe",
)

In [11]:
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

# Search space explored by the tuner; the keys are the hyperparameter names
# passed to the entry-point script.
hyperparameter_ranges = {
    "learning-rate": ContinuousParameter(0.00001, 0.001),
    "batch-size": CategoricalParameter([64, 128]),
    "optimizer": CategoricalParameter(["sgd", "adam", "rmsprop"]),
}

# Must match one of the "Name" entries in keras_metric_definition.
objective_metric_name = "validation:accuracy"

# Same channels as the local-mode run; redefined so this cell does not depend
# on the local-mode cell having been executed.
inputs = {"train": train_location, "validation": validation_location}

tuner = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions=keras_metric_definition,
    objective_type="Maximize",
    # At most 4 training jobs in total, no more than 2 running at the same time.
    max_jobs=4,
    max_parallel_jobs=2,
    early_stopping_type="Auto",
    base_tuning_job_name="remote-hpo",
)

# wait=False: start the tuning job without blocking the notebook on its completion.
tuner.fit(inputs, wait=False)

## Tensorboard

In [12]:
# List the TensorBoard event files that training has written under the logs prefix.
!aws s3 ls {tensorflow_logs_path} --recursive

2021-12-22 01:05:34          0 tensorboard_keras_cifar10/logs/
2021-12-22 01:05:34          0 tensorboard_keras_cifar10/logs/tensorboard-example-2021-12-22-01-04-38-203/
2021-12-22 01:09:20      75489 tensorboard_keras_cifar10/logs/tensorboard-example-2021-12-22-01-04-38-203/events.out.tfevents.1640135133.85a17b79ad69.27.5.v2
2021-12-22 01:09:08        372 tensorboard_keras_cifar10/logs/tensorboard-example-2021-12-22-01-04-38-203/events.out.tfevents.1640135134.85a17b79ad69.27.1009.v2
2021-12-22 01:05:35          0 tensorboard_keras_cifar10/logs/tensorboard-example-2021-12-22-01-04-38-203/train/
2021-12-22 01:09:08     179889 tensorboard_keras_cifar10/logs/tensorboard-example-2021-12-22-01-04-38-203/train/events.out.tfevents.1640135134.85a17b79ad69.27.1023.v2
2021-12-22 01:05:46         40 tensorboard_keras_cifar10/logs/tensorboard-example-2021-12-22-01-04-38-203/train/events.out.tfevents.1640135145.85a17b79ad69.profile-empty
2021-12-22 01:05:46          0 tensorboard_keras_cifar10/logs

In [15]:
aws_region = sagemaker_session.boto_region_name
!AWS_REGION={aws_region}
# !echo tensorboard --logdir {tensorflow_logs_path}
!tensorboard --logdir {tensorflow_logs_path}

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.4.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C


TensorBoard URL: https://YOUR-NOTEBOOK-INSTANCE-NAME.notebook.YOUR-REGION.sagemaker.aws/proxy/6006/  
Example: https://sinjoonk-p2.notebook.us-east-1.sagemaker.aws/proxy/6006/

In [16]:
# Display the S3 URI that was passed to `tensorboard --logdir`.
tensorflow_logs_path

's3://sagemaker-ap-northeast-2-889750940888/tensorboard_keras_cifar10/logs'