## Setup

In [1]:
import os
import time

import h5py
import numpy as np
import pandas as pd
import sagemaker

from keras.datasets import fashion_mnist
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from tensorflow import logging

# only surface TensorFlow log messages at ERROR severity, keeping cell output readable
logging.set_verbosity(logging.ERROR)

Using TensorFlow backend.


## Download and store data locally

In [None]:
# fetch Fashion-MNIST through Keras; the loader returns a (train, validation)
# pair, each of which is an (images, labels) pair
train_split, val_split = fashion_mnist.load_data()
X_train, Y_train = train_split
X_val, Y_val = val_split

In [None]:
# create directory for data (no error if it already exists, so the cell is re-runnable)
data_dir = './data'
os.makedirs(data_dir, exist_ok=True)

# Store each split in its own HDF5 file.
# The file names use the '.hdf5' extension because the upload step reads
# 'data/train.hdf5' and 'data/val.hdf5'; writing '.h5' here would make the
# upload fail on a fresh run because those files would not exist.
with h5py.File(os.path.join(data_dir, 'train.hdf5'), 'w') as hf:
    hf.create_dataset('X_train', data=X_train)
    hf.create_dataset('Y_train', data=Y_train)

with h5py.File(os.path.join(data_dir, 'val.hdf5'), 'w') as hf:
    hf.create_dataset('X_val', data=X_val)
    hf.create_dataset('Y_val', data=Y_val)

## Upload data to S3

In [2]:
# key prefix under which this project's S3 objects are stored
prefix = 'keras-cnn-fashion-mnist'

# SageMaker session (used below for uploads and endpoint cleanup) and the
# execution role handed to the estimators
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

In [4]:
# both files are uploaded under the same '<prefix>/data' key prefix
data_key_prefix = prefix + '/data'

training_input_path = sess.upload_data('data/train.hdf5', key_prefix=data_key_prefix)
validation_input_path = sess.upload_data('data/val.hdf5', key_prefix=data_key_prefix)

# show where each file ended up
for uploaded_path in (training_input_path, validation_input_path):
    print(uploaded_path)

s3://sagemaker-us-west-2-406755861890/keras-cnn-fashion-mnist/train/train.hdf5
s3://sagemaker-us-west-2-406755861890/keras-cnn-fashion-mnist/val/val.hdf5


## Train locally to test training script

In [None]:
# local directory for model checkpoints
os.makedirs("./models/checkpoints", exist_ok=True)

# settings for a quick smoke test of the training script in local mode
model_dir = '/opt/ml/model'
train_instance_type = 'local'
hyperparameters = {'epochs': 1, 'batch_size': 128}

# sagemaker estimator
# - train_instance_type and model_dir are passed from the variables above;
#   previously they were assigned but never used.
# - The managed-spot options (train_use_spot_instances / train_max_wait) are
#   left out: spot capacity is a feature of hosted training jobs and has no
#   meaning for a container run in local mode.
# NOTE(review): this cell only builds the estimator; nothing is trained until
# tf_estimator.fit(...) is called with the input channels.
tf_estimator = TensorFlow(entry_point='keras_cnn_fashion_mnist.py',
                          role=role,
                          train_volume_size=1,
                          train_instance_count=1,
                          train_instance_type=train_instance_type,
                          model_dir=model_dir,
                          hyperparameters=hyperparameters,
                          checkpoint_local_path='models/checkpoints',
                          framework_version='1.13',
                          py_version='py3',
                          script_mode=True
                         )

## Train on spot instances

In [8]:
# make sure the checkpoint directory exists on this machine
os.makedirs("./models/checkpoints", exist_ok=True)

# Estimator settings for a managed-spot training job, collected in one place.
# NOTE(review): no checkpoint_s3_uri is passed here - confirm that checkpoints
# written under checkpoint_local_path are actually persisted across spot
# interruptions.
spot_estimator_args = dict(
    entry_point='keras_cnn_fashion_mnist.py',
    role=role,
    train_volume_size=1,
    train_instance_count=1,
    train_instance_type='ml.m5.xlarge',
    train_use_spot_instances=True,
    train_max_wait=86400,  # seconds (24 hours)
    checkpoint_local_path='models/checkpoints',
    framework_version='1.13',
    py_version='py3',
    script_mode=True,
)

# sagemaker estimator
tf_estimator = TensorFlow(**spot_estimator_args)

In [9]:
# channel name -> S3 location of the file uploaded for that channel
estimator_inputs = {
    'training': training_input_path,
    'validation': validation_input_path,
}

# launch the training job with both channels
tf_estimator.fit(estimator_inputs)

2019-09-01 05:06:57 Starting - Starting the training job...
2019-09-01 05:07:00 Starting - Launching requested ML instances......
2019-09-01 05:08:04 Starting - Preparing the instances for training...
2019-09-01 05:08:45 Downloading - Downloading input data...
2019-09-01 05:09:26 Training - Training image download completed. Training in progress.
2019-09-01 05:09:26 Uploading - Uploading generated training model.
[31m2019-09-01 05:09:23,123 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[31m2019-09-01 05:09:23,129 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-01 05:09:23,396 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-01 05:09:24,060 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-01 05:09:24,076 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-01 05:09

UnexpectedStatusException: Error for Training job tensorflow-training-2019-09-01-05-06-56-221: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/usr/local/bin/python3.6 keras_cnn_fashion_mnist.py --model_dir s3://sagemaker-us-west-2-406755861890/tensorflow-training-2019-09-01-05-06-56-221/model"

## Automatic tuning

In [6]:
def _conv_layer_ranges(layer_index, min_channels, max_channels):
    """Return the search ranges for one convolutional layer.

    Every conv layer shares the same pad/filter/stride/pool ranges; only the
    channel range differs, so it is the only thing passed in.
    Keys are named 'conv<layer_index>_<setting>'.
    """
    layer = 'conv{}'.format(layer_index)
    return {
        layer + '_pad': IntegerParameter(1, 3),
        layer + '_channels': IntegerParameter(min_channels, max_channels),
        layer + '_filter': IntegerParameter(2, 4),
        layer + '_stride': IntegerParameter(1, 3),
        layer + '_pool': IntegerParameter(1, 3),
    }


# hyperparameter spaces
conv0_hps = _conv_layer_ranges(0, 24, 32)
conv1_hps = _conv_layer_ranges(1, 48, 64)
conv2_hps = _conv_layer_ranges(2, 96, 128)
fc0_hps = {'fc0_neurons': IntegerParameter(200, 300)}
fc1_hps = {'fc1_neurons': IntegerParameter(200, 300)}

# merge the per-layer spaces into the single mapping the tuner expects
hyperparameter_ranges = {}
for layer_hps in (conv0_hps, conv1_hps, conv2_hps, fc0_hps, fc1_hps):
    hyperparameter_ranges.update(layer_hps)

# objective and metric: the metric value is captured from the training output
# by the regex below
objective_metric_name = 'val_acc'
objective_type = 'Maximize'
metric_definitions = [
    {
        'Name': 'val_acc',
        'Regex': 'best_val_acc: ([0-9\\.]+)',
    },
]

# tuner: 10 training jobs in total, at most 2 running at the same time
tuner = HyperparameterTuner(
    estimator=tf_estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=10,
    max_parallel_jobs=2,
)

In [30]:
# channel name -> S3 location of the file uploaded for that channel
tuning_inputs = {
    'training': training_input_path,
    'validation': validation_input_path,
}

# start the hyperparameter tuning job with both channels
tuner.fit(tuning_inputs)

## Deploy best model

In [None]:
# a UTC timestamp suffix gives each deployment its own endpoint name
# (`time` is imported with the other imports in the first cell)
tf_endpoint_name = 'keras-tf-fmnist-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# deploy from the tuner onto a single CPU instance with an attached
# Elastic Inference accelerator
tf_predictor = tuner.deploy(initial_instance_count=1,
                            instance_type='ml.c5.large',
                            accelerator_type='ml.eia1.medium',
                            endpoint_name=tf_endpoint_name)

## Cleanup

In [None]:
# remove the endpoint created in the deploy step once it is no longer needed
sess.delete_endpoint(tf_endpoint_name)