## Setup

In [5]:
import os
import time

import numpy as np
import pandas as pd
import h5py
import sagemaker

from tensorflow import logging
from keras.datasets import fashion_mnist
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# only show TensorFlow log messages at ERROR level to keep cell output readable
logging.set_verbosity(logging.ERROR)

Using TensorFlow backend.


## Download and store data locally

In [32]:
# fetch Fashion-MNIST through Keras; the loader hands back a training pair and a validation pair
train_split, val_split = fashion_mnist.load_data()
X_train, Y_train = train_split
X_val, Y_val = val_split

Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz


In [35]:
# absolute path of the local folder that holds the exported data files
working_dir = os.getcwd()
data_dir = os.path.join(working_dir, 'data')

# create the folder if needed; re-running the cell is harmless
os.makedirs(data_dir, exist_ok=True)

In [36]:
# Write each split to its own HDF5 file inside `data_dir`.
# Outer keys are file names, inner keys are the dataset names stored in that file.
hdf5_contents = {
    'train.hdf5': {'X_train': X_train, 'Y_train': Y_train},
    'val.hdf5': {'X_val': X_val, 'Y_val': Y_val},
}

for file_name, arrays in hdf5_contents.items():
    # mode 'w' creates the file, overwriting any previous export
    with h5py.File(os.path.join(data_dir, file_name), 'w') as store:
        for dataset_name, values in arrays.items():
            store.create_dataset(dataset_name, data=values)

## Upload data to S3

In [37]:
# key prefix under which this project's objects are stored in S3
prefix = 'sagemaker-fashion-mnist'

# SageMaker session plus the execution role that is passed to the estimators below
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

In [38]:
# Upload the HDF5 files written to `data_dir` to S3 under '<prefix>/data'.
# The local paths are built from `data_dir` rather than a hard-coded relative 'data/...',
# so the upload always reads the files stored earlier, even if the working directory changes.
s3_data_prefix = prefix + '/data'
training_input_path = sess.upload_data(os.path.join(data_dir, 'train.hdf5'), key_prefix=s3_data_prefix)
validation_input_path = sess.upload_data(os.path.join(data_dir, 'val.hdf5'), key_prefix=s3_data_prefix)

# show the resulting S3 URIs; they are passed to `fit` as input channels
print(training_input_path)
print(validation_input_path)

s3://sagemaker-us-west-2-406755861890/sagemaker-fashion-mnist/data/train.hdf5
s3://sagemaker-us-west-2-406755861890/sagemaker-fashion-mnist/data/val.hdf5


## Train locally to test training script

In [39]:
# folder on this machine for model checkpoints
os.makedirs("./models/checkpoints", exist_ok=True)

# estimator running in SageMaker local mode ('local' instance type) with a single epoch,
# which is enough to check that the training script runs end to end
tf_estimator = TensorFlow(
    entry_point='keras_cnn_fashion_mnist_aws.py',
    role=role,
    framework_version='1.13',
    py_version='py3',
    script_mode=True,
    train_instance_type='local',
    train_instance_count=1,
    train_volume_size=1,
    hyperparameters=dict(epochs=1, batch_size=128),
)

In [40]:
# input channels: channel name -> S3 URI of the uploaded HDF5 file
input_channels = {
    'training': training_input_path,
    'validation': validation_input_path,
}
tf_estimator.fit(input_channels)

Creating tmpvw47vf9y_algo-1-1fir0_1 ... 
[1BAttaching to tmpvw47vf9y_algo-1-1fir0_12mdone[0m
[36malgo-1-1fir0_1  |[0m 2019-09-03 18:25:45,277 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training
[36malgo-1-1fir0_1  |[0m 2019-09-03 18:25:45,284 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-1fir0_1  |[0m 2019-09-03 18:25:45,485 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-1fir0_1  |[0m 2019-09-03 18:25:45,508 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-1fir0_1  |[0m 2019-09-03 18:25:45,530 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-1fir0_1  |[0m 2019-09-03 18:25:45,547 sagemaker-containers INFO     Invoking user script
[36malgo-1-1fir0_1  |[0m 
[36malgo-1-1fir0_1  |[0m Training Env:
[36malgo-1-1fir0_1  |[0m 
[36malgo-1-1fir0_1  |[0m {
[36malgo-1-1fir0_1  |[0

In [42]:
# NOTE(review): interactive API exploration that prints the full Session help text;
# consider removing this cell (and its output) before sharing the notebook
help(sess)

Help on Session in module sagemaker.session object:

class Session(builtins.object)
 |  Manage interactions with the Amazon SageMaker APIs and any other AWS services needed.
 |  
 |  This class provides convenient methods for manipulating entities and resources that Amazon
 |  SageMaker uses, such as training jobs, endpoints, and input datasets in S3.
 |  
 |  AWS service calls are delegated to an underlying Boto3 session, which by default
 |  is initialized using the AWS configuration chain. When you make an Amazon SageMaker API call
 |  that accesses an S3 bucket location and one is not specified, the ``Session`` creates a default
 |  bucket based on a naming convention which includes the current AWS account ID.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, boto_session=None, sagemaker_client=None, sagemaker_runtime_client=None)
 |      Initialize a SageMaker ``Session``.
 |      
 |      Args:
 |          boto_session (boto3.session.Session): The underlying Boto3 session wh

## Train on spot instances

In [21]:
# sagemaker estimator for managed spot training

# `checkpoint_dir` was not defined anywhere in this notebook, so this cell raised a
# NameError on a fresh kernel. /opt/ml/checkpoints is SageMaker's default in-container
# checkpoint location.
# NOTE(review): confirm the training script writes its checkpoints to this path.
checkpoint_dir = '/opt/ml/checkpoints'

# S3 location the local checkpoint folder is synced with; without it no checkpoint
# configuration is sent with the job and an interrupted spot job cannot resume.
checkpoint_s3_uri = 's3://{}/{}/checkpoints'.format(sess.default_bucket(), prefix)

tf_estimator = TensorFlow(entry_point='keras_cnn_fashion_mnist_aws.py',
                          role=role,
                          train_volume_size=1,
                          train_instance_count=1,
                          train_instance_type='ml.m5.xlarge',
                          train_use_spot_instances=True,
                          train_max_wait=86400,  # seconds; upper bound on waiting for spot capacity plus training
                          checkpoint_s3_uri=checkpoint_s3_uri,
                          checkpoint_local_path=checkpoint_dir,
                          framework_version='1.13',
                          py_version='py3',
                          script_mode=True
                         )

In [None]:
# input channels: channel name -> S3 URI of the uploaded HDF5 file
input_channels = {
    'training': training_input_path,
    'validation': validation_input_path,
}
tf_estimator.fit(input_channels)

2019-09-03 07:22:28 Starting - Starting the training job...
2019-09-03 07:22:29 Starting - Launching requested ML instances......
2019-09-03 07:23:33 Starting - Preparing the instances for training...
2019-09-03 07:24:21 Downloading - Downloading input data...
2019-09-03 07:24:53 Training - Training image download completed. Training in progress..
[31m2019-09-03 07:24:55,994 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[31m2019-09-03 07:24:56,001 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-03 07:24:56,274 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-03 07:24:56,292 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-03 07:24:56,308 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-03 07:24:56,320 sagemaker-containers INFO     Invoking user script
[0m
[3

## Automatic tuning

In [18]:
# hyperparameter spaces
conv0_hps = {'conv0_pad': IntegerParameter(1, 3),
             'conv0_channels': IntegerParameter(24, 32),
             'conv0_filter': IntegerParameter(2, 4),
             'conv0_stride': IntegerParameter(1, 3),
             'conv0_pool': IntegerParameter(1, 3),
            }
conv1_hps = {'conv1_pad': IntegerParameter(1, 3),
             'conv1_channels': IntegerParameter(48, 64),
             'conv1_filter': IntegerParameter(2, 4),
             'conv1_stride': IntegerParameter(1, 3),
             'conv1_pool': IntegerParameter(1, 3),
            }
conv2_hps = {'conv2_pad': IntegerParameter(1, 3),
             'conv2_channels': IntegerParameter(96, 128),
             'conv2_filter': IntegerParameter(2, 4),
             'conv2_stride': IntegerParameter(1, 3),
             'conv2_pool': IntegerParameter(1, 3),
            }
fc0_hps = {'fc0_neurons': IntegerParameter(200, 300)}
fc1_hps = {'fc1_neurons': IntegerParameter(200, 300)}

hyperparameter_ranges = {**conv0_hps, **conv1_hps, **conv2_hps, **fc0_hps, **fc1_hps}

# objective and metric
objective_metric_name = 'val_acc'
objective_type = 'Maximize'
metric_definitions = [{'Name': 'val_acc',
                       'Regex': 'best_val_acc: ([0-9\\.]+)'}]

# tuner
tuner = HyperparameterTuner(tf_estimator,
                            objective_metric_name,
                            hyperparameter_ranges,
                            metric_definitions,
                            max_jobs=10,
                            max_parallel_jobs=2,
                            objective_type=objective_type)

In [30]:
# input channels: channel name -> S3 URI of the uploaded HDF5 file
input_channels = {
    'training': training_input_path,
    'validation': validation_input_path,
}
tuner.fit(input_channels)

## Deploy best model

In [None]:
# `tuner.fit` only launches the tuning job, so block until it has finished;
# deploying needs a completed best training job.
tuner.wait()

# timestamped (UTC) endpoint name so repeated runs do not collide
tf_endpoint_name = 'keras-tf-fmnist-'+time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# deploy the best model found by the tuner to a real-time endpoint
tf_predictor = tuner.deploy(initial_instance_count=1,
                            instance_type='ml.c5.large',
                            accelerator_type='ml.eia1.medium',
                            endpoint_name=tf_endpoint_name)

## Cleanup

In [None]:
# delete the endpoint created in the deploy step
sess.delete_endpoint(endpoint_name=tf_endpoint_name)