## Setup

In [None]:
import numpy as np
import h5py
import os
import sagemaker

from keras.datasets import fashion_mnist

import matplotlib.pyplot as plt
%matplotlib inline

# Ignore warnings of category FutureWarning so they do not clutter cell output
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# SageMaker variables
# Session object; used later in this notebook to upload data and delete the endpoint
sess = sagemaker.Session()
# Execution role; passed to the training estimator as `role`
role = sagemaker.get_execution_role()

## Download and store data locally

In [None]:
# Fetch the Fashion-MNIST training and validation splits through Keras
(X_train, Y_train), (X_val, Y_val) = fashion_mnist.load_data()

# Make sure the local output folder exists (no error if it is already there)
os.makedirs("./data", exist_ok=True)

# Describe what goes where: one HDF5 file per split, one dataset per array
h5_layout = (
    ('./data/train.h5', (('X_train', X_train), ('Y_train', Y_train))),
    ('./data/val.h5', (('X_val', X_val), ('Y_val', Y_val))),
)

# Write every file in turn; each file is opened in 'w' mode, so an existing
# file at the same path is overwritten
for h5_path, named_arrays in h5_layout:
    with h5py.File(h5_path, 'w') as hf:
        for dataset_name, array in named_arrays:
            hf.create_dataset(dataset_name, data=array)

## Upload data to S3

In [None]:
# Common key prefix under which both splits are uploaded
prefix = 'keras-cnn-fashion-mnist'
train_key_prefix = '/'.join([prefix, 'train'])
val_key_prefix = '/'.join([prefix, 'val'])

# Upload each HDF5 file; the value returned by upload_data is kept as the
# input path handed to the tuner later on
training_input_path = sess.upload_data('data/train.h5', key_prefix=train_key_prefix)
validation_input_path = sess.upload_data('data/val.h5', key_prefix=val_key_prefix)

# Show both locations, one per line
print(training_input_path, validation_input_path, sep='\n')

## Train on spot instances using automatic tuning

In [None]:
# Build the TensorFlow estimator (managed spot training) and the hyperparameter
# tuner that searches the CNN architecture parameters.
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# local directory for model checkpoints
os.makedirs("./models/checkpoints", exist_ok = True)

# fixed (non-tuned) hyperparameters
# ml.m5.xlarge is a CPU-only instance type, so no GPUs are available to the script.
gpu_count = 0
# NOTE(review): batch_size was referenced but never defined in this notebook;
# 128 is a placeholder -- confirm against what keras_cnn_fashion_mnist.py expects.
batch_size = 128
fixed_hps = {'epochs': 100, 'gpus': gpu_count, 'batch_size': batch_size}

# sagemaker estimator
# train_max_wait (spot wait time + run time) must be >= train_max_run, so
# train_max_run is set explicitly instead of relying on the SDK default.
# NOTE(review): 600 s keeps the original wait budget; raise both values together
# if 100 epochs do not fit in that window.
# NOTE(review): checkpoint_local_path is set without a checkpoint_s3_uri --
# confirm whether spot checkpointing is actually wired up in the entry script.
tf_estimator = TensorFlow(entry_point='keras_cnn_fashion_mnist.py',
                          role=role,
                          train_volume_size=1,
                          train_instance_count=1,
                          train_instance_type='ml.m5.xlarge',
                          train_use_spot_instances=True,
                          train_max_run=600,
                          train_max_wait=600,
                          checkpoint_local_path='models/checkpoints',
                          framework_version='1.13',
                          py_version='py3',
                          script_mode=True,
                          hyperparameters=fixed_hps
                         )

# hyperparameter spaces
# Every value must be a parameter-range object; the single-choice activations
# are therefore wrapped in CategoricalParameter rather than given as bare lists.
conv0_hps = {'conv0_pad': IntegerParameter(1, 3),
             'conv0_channels': IntegerParameter(24, 32),
             'conv0_filter': IntegerParameter(2, 4),
             'conv0_stride': IntegerParameter(1, 3),
             'conv0_pool': IntegerParameter(1, 3),
             'conv0_activation': CategoricalParameter(['relu']),
            }
conv1_hps = {'conv1_pad': IntegerParameter(1, 3),
             'conv1_channels': IntegerParameter(48, 64),
             'conv1_filter': IntegerParameter(2, 4),
             'conv1_stride': IntegerParameter(1, 3),
             'conv1_pool': IntegerParameter(1, 3),
             'conv1_activation': CategoricalParameter(['relu']),
            }
conv2_hps = {'conv2_pad': IntegerParameter(1, 3),
             'conv2_channels': IntegerParameter(96, 128),
             'conv2_filter': IntegerParameter(2, 4),
             'conv2_stride': IntegerParameter(1, 3),
             'conv2_pool': IntegerParameter(1, 3),
             'conv2_activation': CategoricalParameter(['relu']),
            }
fc0_hps = {'fc0_neurons': IntegerParameter(200, 300),
           'fc0_activation': CategoricalParameter(['relu'])
          }
fc1_hps = {'fc1_neurons': IntegerParameter(200, 300),
           'fc1_activation': CategoricalParameter(['relu'])
          }

# merge the per-layer search spaces into the single dict given to the tuner
hps = {**conv0_hps, **conv1_hps, **conv2_hps, **fc0_hps, **fc1_hps}

# objective and metric
# The regex captures the number that follows 'best_val_acc: ' in the job logs.
objective_metric_name = 'val_acc'
objective_type = 'Maximize'
metric_definitions = [{'Name': 'val_acc',
                       'Regex': 'best_val_acc: ([0-9\\.]+)'}]

# tuner: at most 10 training jobs in total, 2 running at the same time
tuner = HyperparameterTuner(tf_estimator,
                            objective_metric_name,
                            hps,
                            metric_definitions,
                            max_jobs=10,
                            max_parallel_jobs=2,
                            objective_type=objective_type)

In [None]:
# Launch the tuning job, passing the uploaded S3 paths under the
# 'training' and 'validation' keys
tuner.fit({'training': training_input_path, 'validation': validation_input_path})

## Deploy best model

In [None]:
import time

# Deploying needs the best training job of a finished tuning job, so block here
# until the latest tuning job has completed; otherwise Run All would try to
# deploy while the jobs are still in progress.
tuner.wait()

# Timestamped (UTC) endpoint name so repeated runs do not collide
tf_endpoint_name = 'keras-tf-fmnist-'+time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# Deploy the tuner's best model to a single instance with an attached accelerator
tf_predictor = tuner.deploy(initial_instance_count=1,
                            instance_type='ml.c5.large',
                            accelerator_type='ml.eia1.medium',
                            endpoint_name=tf_endpoint_name)

## Cleanup

In [None]:
# Delete the endpoint created in the deploy step, identified by its name
sess.delete_endpoint(endpoint_name=tf_endpoint_name) 