## Setup

In [1]:
import os
import time

import h5py
import numpy as np
import pandas as pd
import sagemaker

from tensorflow import logging
from keras.datasets import fashion_mnist
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# `logging` here is TensorFlow's logging module (imported above), not the stdlib one:
# only show TensorFlow messages at ERROR level to keep cell output readable.
logging.set_verbosity(logging.ERROR)

Using TensorFlow backend.


## Download and store data locally

In [15]:
# Fetch Fashion-MNIST through Keras (downloaded on first use).
# load_data() returns two (images, labels) pairs; the second pair is the
# 10k-image set (the t10k-* files), which this notebook uses for validation.
train_split, val_split = fashion_mnist.load_data()
X_train, Y_train = train_split
X_val, Y_val = val_split

Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz


In [24]:
# Local staging directory for the HDF5 files, created under the current working directory.
data_dir = os.path.join(os.getcwd(), 'data')
os.makedirs(data_dir, exist_ok=True)

# One HDF5 file per split; each entry lists its (dataset name, array) pairs in write order.
hdf5_layout = [
    ('train.hdf5', [('X_train', X_train), ('Y_train', Y_train)]),
    ('val.hdf5', [('X_val', X_val), ('Y_val', Y_val)]),
]

for file_name, datasets in hdf5_layout:
    # Opened in 'w' mode, so re-running the cell rewrites the file from scratch.
    with h5py.File(os.path.join(data_dir, file_name), 'w') as hf:
        for dataset_name, array in datasets:
            hf.create_dataset(dataset_name, data=array)

## Upload data to S3

In [25]:
# Key prefix under which this notebook stores its data in S3.
prefix = 'keras-cnn-fashion-mnist'

# SageMaker session used for uploads and endpoint management, and the
# execution role handed to the estimators defined later in the notebook.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

In [27]:
# Upload the two HDF5 files to S3 and keep the returned s3:// URIs for training.
# The local paths are built from data_dir (where the files were written) rather
# than a cwd-relative 'data/...' string, so the upload still finds them if the
# working directory changes between cells. The S3 keys are unchanged because
# upload_data names each object after the file's base name.
s3_data_prefix = prefix + '/data'
training_input_path   = sess.upload_data(os.path.join(data_dir, 'train.hdf5'), key_prefix=s3_data_prefix)
validation_input_path = sess.upload_data(os.path.join(data_dir, 'val.hdf5'), key_prefix=s3_data_prefix)

print(training_input_path)
print(validation_input_path)

s3://sagemaker-us-west-2-406755861890/keras-cnn-fashion-mnist/data/train.hdf5
s3://sagemaker-us-west-2-406755861890/keras-cnn-fashion-mnist/data/val.hdf5


## Train locally to test training script

In [34]:
# local directory for model checkpoints
os.makedirs("./models/checkpoints", exist_ok = True)

# Estimator settings for a quick smoke test of the training script:
# instance type 'local' runs the job in a container on this machine, and a
# single epoch keeps the run short.
local_estimator_args = dict(
    entry_point='keras_cnn_fashion_mnist_aws.py',
    role=role,
    train_volume_size=1,
    train_instance_count=1,
    train_instance_type='local',
    hyperparameters={'epochs': 1, 'batch_size': 128},
    framework_version='1.13',
    py_version='py3',
    script_mode=True,
)

# sagemaker estimator
tf_estimator = TensorFlow(**local_estimator_args)

In [36]:
# Run the training script with the estimator configured above; each dict key
# names an input channel and maps it to the S3 URI returned by the upload step.
# NOTE(review): the recorded run below trained one epoch and then failed inside
# the entry-point script with "NameError: name 'tf' is not defined" at its
# save_keras_model call - the script presumably lacks `import tensorflow as tf`;
# fix the script before re-running this cell.
tf_estimator.fit({'training': training_input_path, 'validation': validation_input_path})

Creating tmp3qjiyi_x_algo-1-qancy_1 ... 
[1BAttaching to tmp3qjiyi_x_algo-1-qancy_12mdone[0m
[36malgo-1-qancy_1  |[0m 2019-09-01 06:25:16,713 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training
[36malgo-1-qancy_1  |[0m 2019-09-01 06:25:16,721 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-qancy_1  |[0m 2019-09-01 06:25:16,907 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-qancy_1  |[0m 2019-09-01 06:25:16,931 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-qancy_1  |[0m 2019-09-01 06:25:16,955 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-qancy_1  |[0m 2019-09-01 06:25:16,971 sagemaker-containers INFO     Invoking user script
[36malgo-1-qancy_1  |[0m 
[36malgo-1-qancy_1  |[0m Training Env:
[36malgo-1-qancy_1  |[0m 
[36malgo-1-qancy_1  |[0m {
[36malgo-1-qancy_1  |[0

[36malgo-1-qancy_1  |[0m Train on 60000 samples, validate on 10000 samples
[36malgo-1-qancy_1  |[0m Epoch 1/1
[36malgo-1-qancy_1  |[0m 
[36malgo-1-qancy_1  |[0m Epoch 00001: val_acc improved from -inf to 0.89390, saving model to /opt/ml/model/fashion-mnist-model.hdf5
[36malgo-1-qancy_1  |[0m best_val_acc: 0.8939
[36malgo-1-qancy_1  |[0m Traceback (most recent call last):
[36malgo-1-qancy_1  |[0m   File "keras_cnn_fashion_mnist_aws.py", line 236, in <module>
[36malgo-1-qancy_1  |[0m     tf.contrib.saved_model.save_keras_model(model, args.model_dir)
[36malgo-1-qancy_1  |[0m NameError: name 'tf' is not defined
[36malgo-1-qancy_1  |[0m 2019-09-01 06:29:17,611 sagemaker-containers ERROR    ExecuteUserScriptError:
[36malgo-1-qancy_1  |[0m Command "/usr/local/bin/python3.6 keras_cnn_fashion_mnist_aws.py --batch_size 128 --epochs 1 --model_dir s3://sagemaker-us-west-2-406755861890/tensorflow-training-2019-09-01-06-10-33-668/model"
[36mtmp3qjiyi_x_algo-1-qancy_1 exited wi

RuntimeError: Failed to run: ['docker-compose', '-f', '/tmp/tmp3qjiyi_x/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1

## Train on spot instances

In [8]:
# local directory for model checkpoints (on the machine running this notebook)
os.makedirs("./models/checkpoints", exist_ok=True)

# S3 location SageMaker copies checkpoints to while the job runs.
# Without checkpoint_s3_uri no checkpoint configuration is sent with the
# training job, so checkpoint_local_path on its own has no effect and a spot
# interruption would restart training from scratch.
checkpoint_s3_uri = 's3://{}/{}/checkpoints'.format(sess.default_bucket(), prefix)

# sagemaker estimator: managed spot training on a single ml.m5.xlarge instance
tf_estimator = TensorFlow(entry_point='keras_cnn_fashion_mnist.py',
                          role=role,
                          train_volume_size=1,
                          train_instance_count=1,
                          train_instance_type='ml.m5.xlarge',
                          train_use_spot_instances=True,
                          train_max_wait=86400,  # seconds (24 h)
                          checkpoint_s3_uri=checkpoint_s3_uri,
                          # NOTE(review): this is the path inside the training container that
                          # gets synced to checkpoint_s3_uri - confirm the training script
                          # writes its checkpoints here (SageMaker's default is /opt/ml/checkpoints).
                          checkpoint_local_path='models/checkpoints',
                          framework_version='1.13',
                          py_version='py3',
                          script_mode=True
                         )

In [9]:
# Launch the training job with the estimator configured above, feeding the
# uploaded S3 files as the 'training' and 'validation' input channels.
# NOTE(review): the recorded run below ended in ExecuteUserScriptError; the
# captured log is truncated, so the root cause is not visible in this notebook.
tf_estimator.fit({'training': training_input_path, 'validation': validation_input_path})

2019-09-01 05:06:57 Starting - Starting the training job...
2019-09-01 05:07:00 Starting - Launching requested ML instances......
2019-09-01 05:08:04 Starting - Preparing the instances for training...
2019-09-01 05:08:45 Downloading - Downloading input data...
2019-09-01 05:09:26 Training - Training image download completed. Training in progress.
2019-09-01 05:09:26 Uploading - Uploading generated training model.
[31m2019-09-01 05:09:23,123 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[31m2019-09-01 05:09:23,129 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-01 05:09:23,396 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-01 05:09:24,060 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-01 05:09:24,076 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-01 05:09

UnexpectedStatusException: Error for Training job tensorflow-training-2019-09-01-05-06-56-221: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/usr/local/bin/python3.6 keras_cnn_fashion_mnist.py --model_dir s3://sagemaker-us-west-2-406755861890/tensorflow-training-2019-09-01-05-06-56-221/model"

## Automatic tuning

In [6]:
def _conv_search_space(layer_idx, min_channels, max_channels):
    """Build the tuning ranges for one convolutional layer.

    Keys follow the pattern 'conv<layer_idx>_<setting>'. Only the channel
    range differs between layers; pad, filter, stride and pool share the
    same integer ranges.
    """
    key = 'conv{}_'.format(layer_idx)
    return {key + 'pad': IntegerParameter(1, 3),
            key + 'channels': IntegerParameter(min_channels, max_channels),
            key + 'filter': IntegerParameter(2, 4),
            key + 'stride': IntegerParameter(1, 3),
            key + 'pool': IntegerParameter(1, 3),
           }


# hyperparameter spaces: three conv layers with increasing channel ranges,
# followed by two fully connected layers
conv0_hps = _conv_search_space(0, 24, 32)
conv1_hps = _conv_search_space(1, 48, 64)
conv2_hps = _conv_search_space(2, 96, 128)
fc0_hps = {'fc0_neurons': IntegerParameter(200, 300)}
fc1_hps = {'fc1_neurons': IntegerParameter(200, 300)}

# merge the per-layer spaces into the single dict the tuner expects
hyperparameter_ranges = {}
for layer_hps in (conv0_hps, conv1_hps, conv2_hps, fc0_hps, fc1_hps):
    hyperparameter_ranges.update(layer_hps)

# objective and metric: the regex captures the number that follows
# 'best_val_acc: ' in the training log, and the tuner maximizes it
objective_metric_name = 'val_acc'
objective_type = 'Maximize'
metric_definitions = [{'Name': 'val_acc',
                       'Regex': 'best_val_acc: ([0-9\\.]+)'}]

# tuner: at most 10 training jobs in total, 2 running at a time
tuner = HyperparameterTuner(estimator=tf_estimator,
                            objective_metric_name=objective_metric_name,
                            hyperparameter_ranges=hyperparameter_ranges,
                            metric_definitions=metric_definitions,
                            objective_type=objective_type,
                            max_jobs=10,
                            max_parallel_jobs=2)

In [30]:
# Start the hyperparameter tuning job on the uploaded training/validation data.
tuner.fit({'training': training_input_path, 'validation': validation_input_path})

# fit() returns once the tuning job has been launched. Block here until the job
# finishes so that a top-to-bottom run does not reach the deployment step
# before any training job has completed.
tuner.wait()

## Deploy best model

In [None]:
# Endpoint name with a UTC timestamp suffix (second resolution), so each
# deployment gets its own name. `time` is imported in the notebook's import cell.
tf_endpoint_name = 'keras-tf-fmnist-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# Deploy the model from the tuner's best training job to a real-time endpoint:
# one ml.c5.large instance with an ml.eia1.medium Elastic Inference accelerator attached.
tf_predictor = tuner.deploy(initial_instance_count=1,
                            instance_type='ml.c5.large',
                            accelerator_type='ml.eia1.medium',
                            endpoint_name=tf_endpoint_name)

## Cleanup

In [None]:
# Delete the endpoint created in the deploy cell so it stops incurring charges.
# NOTE(review): this call targets the endpoint by name only - confirm whether the
# associated endpoint configuration and model should be removed as well.
sess.delete_endpoint(endpoint_name=tf_endpoint_name) 