# CIFAR-10 image classification with TensorBoard

- Source: https://github.com/aws/amazon-sagemaker-examples/tree/master/sagemaker-python-sdk/tensorboard_keras/source_dir

## Set up the environment

In [1]:
import os
import sagemaker
from sagemaker import get_execution_role
import tensorflow as tf
from tensorflow import keras

# SageMaker session and the execution role that is passed to the estimators below.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# Everything this notebook stores in S3 lives under <default bucket>/<prefix>.
bucket = sagemaker_session.default_bucket()
prefix = "tensorboard_keras_cifar10"

# S3 location passed to the training script as the "tf-logs-path"
# hyperparameter and later given to TensorBoard as its --logdir.
tensorflow_logs_path = f"s3://{bucket}/{prefix}/logs"

print(f"Bucket: {bucket}")
print(f"SageMaker ver: {sagemaker.__version__}")
print(f"Tensorflow ver: {tf.__version__}")

Bucket: sagemaker-us-east-1-889750940888
SageMaker ver: 2.81.0
Tensorflow ver: 2.6.2


## Download the CIFAR-10 dataset

In [2]:
from tensorflow.keras.datasets import cifar10

# load_data() yields a (train, test) pair, each an (images, labels) tuple.
train_split, test_split = cifar10.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

# Quick sanity summary of what was loaded.
print("x_train shape:", x_train.shape)
for images, caption in ((x_train, "train samples"), (x_test, "test samples")):
    print(images.shape[0], caption)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
x_train shape: (50000, 32, 32, 3)
50000 train samples
10000 test samples


Convert the dataset into TFRecord files so that the training jobs can read it with SageMaker Pipe mode.

In [3]:
import os

# Local output folders for the TFRecord files, one per input channel.
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
for data_dir in ("./data/validation", "./data/train"):
    os.makedirs(data_dir, exist_ok=True)


def write_tfrecords(x, y, filename):
    """Serialize image/label pairs into a single TFRecord file.

    Each pair becomes one ``tf.train.Example`` with two features: ``"image"``
    (the raw bytes returned by ``image.tobytes()``) and ``"label"`` (an int64
    list holding the label).

    Args:
        x: Iterable of images; every item must provide ``tobytes()``.
        y: Iterable of labels, paired with ``x`` by position via ``zip``.
        filename: Path of the TFRecord file to create.
    """
    # The context manager closes the writer even if serialization fails, so
    # buffered records are flushed to disk before the file is used elsewhere.
    with tf.io.TFRecordWriter(filename) as writer:
        for image, label in zip(x, y):
            features = {
                "image": tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[image.tobytes()])
                ),
                "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[label])),
            }
            example = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(example.SerializeToString())


# One TFRecord file per channel: the test split becomes the validation
# channel, followed by the training split.
for images, labels, tfrecord_path in (
    (x_test, y_test, "./data/validation/validation.tfrecords"),
    (x_train, y_train, "./data/train/train.tfrecords"),
):
    write_tfrecords(images, labels, tfrecord_path)

## Upload the data to S3

In [4]:
# S3 prefixes of the two input channels, both under the notebook's bucket/prefix.
train_location = f"s3://{bucket}/{prefix}/train"
validation_location = f"s3://{bucket}/{prefix}/validation"

In [5]:
# Mirror the local ./data tree (train/ and validation/ TFRecord files) to s3://<bucket>/<prefix>.
!aws s3 sync ./data s3://{bucket}/{prefix}

upload: data/validation/validation.tfrecords to s3://sagemaker-us-east-1-889750940888/tensorboard_keras_cifar10/validation/validation.tfrecords
upload: data/train/train.tfrecords to s3://sagemaker-us-east-1-889750940888/tensorboard_keras_cifar10/train/train.tfrecords


In [6]:
# Confirm the training TFRecord file is present under the train channel prefix.
!aws s3 ls {train_location} --recursive

2022-03-31 06:34:23  156300000 tensorboard_keras_cifar10/train/train.tfrecords


In [7]:
# Confirm the validation TFRecord file is present under the validation channel prefix.
!aws s3 ls {validation_location} --recursive

2022-03-31 06:34:23   31260000 tensorboard_keras_cifar10/validation/validation.tfrecords


## Local mode training

In [9]:
# Metric extraction rules passed to the estimators/tuner as metric_definitions.
# Each regex is written against a Keras progress line of the form
# "... loss: X - accuracy: Y - val_loss: Z - val_accuracy: W"; its single
# capture group is the value reported under the metric name.
_metric_regexes = (
    ("train:loss", r".*loss: ([0-9\.]+) - accuracy: [0-9\.]+.*"),
    ("train:accuracy", r".*loss: [0-9\.]+ - accuracy: ([0-9\.]+).*"),
    (
        "validation:accuracy",
        r".*step - loss: [0-9\.]+ - accuracy: [0-9\.]+ - val_loss: [0-9\.]+ - val_accuracy: ([0-9\.]+).*",
    ),
)
keras_metric_definition = [
    {"Name": metric_name, "Regex": pattern} for metric_name, pattern in _metric_regexes
]

In [15]:
from sagemaker.tensorflow import TensorFlow

# A short 2-epoch run for the local-mode check.
hyperparameters = {
    "epochs": 2,
    "batch-size": 256,
    "tf-logs-path": tensorflow_logs_path,
}

# Channel name -> S3 prefix holding that channel's TFRecord file.
inputs = {
    "train": train_location,
    "validation": validation_location,
}

# "local" selects SageMaker local mode: the training container is started on
# this machine rather than on a managed training instance.
instance_type = "local"

estimator = TensorFlow(
    entry_point="tensorboard_keras_cifar10.py",
    source_dir="source_dir",
    base_job_name="tensorboard-example",
    role=role,
    framework_version="2.2.0",
    py_version="py37",
    instance_type=instance_type,
    instance_count=1,
    input_mode="Pipe",
    hyperparameters=hyperparameters,
    metric_definitions=keras_metric_definition,
)

# Wait for the job to finish so its log is streamed into the cell output.
estimator.fit(inputs, wait=True)

Creating 5j7xsmbzwt-algo-1-srffm ... 
Creating 5j7xsmbzwt-algo-1-srffm ... done
Attaching to 5j7xsmbzwt-algo-1-srffm
[36m5j7xsmbzwt-algo-1-srffm |[0m 2022-03-31 06:41:24.278129: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:425] Initializing the SageMaker Profiler.
[36m5j7xsmbzwt-algo-1-srffm |[0m 2022-03-31 06:41:24.278282: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:106] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
[36m5j7xsmbzwt-algo-1-srffm |[0m 2022-03-31 06:41:24.299803: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:425] Initializing the SageMaker Profiler.
[36m5j7xsmbzwt-algo-1-srffm |[0m 2022-03-31 06:41:25,774 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training
[36m5j7xsmbzwt-algo-1-srffm |[0m 2022-03-31 06:41:25,782 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36m5j7x

## Hyperparameter tuning

In [18]:
# Hyperparameters held fixed for every tuning trial; the values being tuned
# are supplied separately through the tuner's hyperparameter ranges.
shared_hyperparameters = {
    "epochs": 6,
    "tf-logs-path": tensorflow_logs_path,
}

# Same entry point and source_dir as the local-mode estimator, but targeting a
# managed GPU instance (CPU alternative: instance_type="ml.m5.xlarge").
estimator = TensorFlow(
    entry_point="tensorboard_keras_cifar10.py",
    source_dir="source_dir",
    base_job_name="tensorboard-example-hpo",
    role=role,
    framework_version="2.2.0",
    py_version="py37",
    instance_type="ml.p3.2xlarge",
    instance_count=1,
    input_mode="Pipe",
    hyperparameters=shared_hyperparameters,
    metric_definitions=keras_metric_definition,
)

In [19]:
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

# Search space explored by the tuning job.
hyperparameter_ranges = {
    "learning-rate": ContinuousParameter(1e-5, 1e-3),
    "batch-size": CategoricalParameter([64, 128]),
    "optimizer": CategoricalParameter(["sgd", "adam", "rmsprop"]),
}

# Must equal one of the "Name" entries in keras_metric_definition.
objective_metric_name = "validation:accuracy"

# Channel name -> S3 prefix holding that channel's TFRecord file.
inputs = {
    "train": train_location,
    "validation": validation_location,
}

tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=keras_metric_definition,
    objective_type="Maximize",
    max_jobs=4,
    max_parallel_jobs=2,
    early_stopping_type="Auto",
    base_tuning_job_name="remote-hpo",
)

# wait=False returns immediately instead of blocking until the tuning job ends.
tuner.fit(inputs, wait=False)

## TensorBoard

In [20]:
# List the objects currently stored under the shared TensorBoard log prefix.
!aws s3 ls {tensorflow_logs_path} --recursive

2021-12-09 04:51:59          0 tensorboard_keras_cifar10/logs/
2021-12-09 05:06:03          0 tensorboard_keras_cifar10/logs/remote-hpo-211209-0500-001-1765c511/
2021-12-09 05:11:59      65424 tensorboard_keras_cifar10/logs/remote-hpo-211209-0500-001-1765c511/events.out.tfevents.1639026363.ip-10-2-196-180.ec2.internal.38.5.v2
2021-12-09 05:11:30        364 tensorboard_keras_cifar10/logs/remote-hpo-211209-0500-001-1765c511/events.out.tfevents.1639026365.ip-10-2-196-180.ec2.internal.38.999.v2
2021-12-09 05:06:07          0 tensorboard_keras_cifar10/logs/remote-hpo-211209-0500-001-1765c511/train/
2021-12-09 05:11:30     367201 tensorboard_keras_cifar10/logs/remote-hpo-211209-0500-001-1765c511/train/events.out.tfevents.1639026366.ip-10-2-196-180.ec2.internal.38.1013.v2
2021-12-09 05:06:13         40 tensorboard_keras_cifar10/logs/remote-hpo-211209-0500-001-1765c511/train/events.out.tfevents.1639026372.ip-10-2-196-180.ec2.internal.profile-empty
2021-12-09 05:06:13          0 tensorboard_ker

In [25]:
# Show which TensorBoard version is installed in this kernel's environment.
!tensorboard --version

2.6.0


In [24]:
aws_region = sagemaker_session.boto_region_name
!AWS_REGION={aws_region}
# !echo tensorboard --logdir {tensorflow_logs_path}
!tensorboard --logdir {tensorflow_logs_path}

Exception in thread Reloader:
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/tensorboard/backend/event_processing/data_ingester.py", line 98, in _reload
    self._multiplexer.AddRunsFromDirectory(path, name)
  File "/home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/tensorboard/backend/event_processing/plugin_event_multiplexer.py", line 198, in AddRunsFromDirectory
    for subdir in io_wrapper.GetLogdirSubdirectories(path):
  File "/home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/tensorboard/backend/event_processing/io_wrapper.py", line 202, in GetLogdirSubdirectories
    if not tf.io.gfile.exists(pat

TensorBoard URL: https://YOUR-NOTEBOOK-INSTANCE-NAME.notebook.YOUR-REGION.sagemaker.aws/proxy/6006/ 

https://sinjoonk-p2.notebook.us-east-1.sagemaker.aws/proxy/6006/

In [22]:
# Display the S3 log location that is passed to `tensorboard --logdir`.
tensorflow_logs_path

's3://sagemaker-us-east-1-889750940888/tensorboard_keras_cifar10/logs'