## Setup

In [2]:
import numpy as np
import pandas as pd
import h5py
import os
import sagemaker
import boto3
import botocore

from keras.datasets import fashion_mnist
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner


## Download and store data locally

In [3]:
# load data
(X_train, Y_train), (X_val, Y_val) = fashion_mnist.load_data()

In [4]:
# create directory for data
os.makedirs("./data", exist_ok = True)

# store in hdf5 files
with h5py.File('./data/train.h5', 'w') as hf:
    hf.create_dataset('X_train', data=X_train)
    hf.create_dataset('Y_train', data=Y_train)

with h5py.File('./data/val.h5', 'w') as hf:
    hf.create_dataset('X_val', data=X_val)
    hf.create_dataset('Y_val', data=Y_val)

## Train locally

In [8]:
# run script in shell for one epoch
!python keras_cnn_fashion_mnist_local.py --epochs 1

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


2019-09-08 15:50:59.381549: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
  model = Model(input=X_input, outputs=X, name='FashionMNISTModel')
Model: "FashionMNISTModel"
_________________________________________________________________
Layer (type)       

## Upload data to s3 bucket

In [2]:
sess = sagemaker.Session()
role_name = '<your IAM role name>'
bucket_name = '<your s3 bucket name>'

In [3]:
sess = sagemaker.Session()
role_name = 'arn:aws:iam::406755861890:role/service-role/AmazonSageMaker-ExecutionRole-20190827T145350'
bucket_name = 'sagemaker-fashion-mnist'

In [4]:
training_input_path   = sess.upload_data('data/train.hdf5', bucket=bucket_name, key_prefix='data')
validation_input_path = sess.upload_data('data/val.hdf5', bucket=bucket_name, key_prefix='data')

print(training_input_path)
print(validation_input_path)

s3://sagemaker-fashion-mnist/data/train.hdf5
s3://sagemaker-fashion-mnist/data/val.hdf5


## Train in the cloud with SageMaker

In [5]:
# store model artifacts
sm_output_dir = os.path.join(os.getcwd(), 'sagemaker_output')
os.makedirs(sm_output_dir, exist_ok=True)

# upload folders to s3 
sm_output_path = sess.upload_data('sagemaker_output/', bucket=bucket_name, key_prefix='sm-output')

print(sm_output_path)

s3://sagemaker-fashion-mnist/sm-output


In [8]:
# objective and metric
metric_definitions = [ {'Name': 'acc',
                       'Regex': 'acc: ([0-9\\.]+)'},
                       {'Name': 'val_acc',
                       'Regex': 'val_acc: ([0-9\\.]+)'}]


hyperparameters = {'epochs': 100, 'batch-size': 128}

# sagemaker estimator
tf_estimator = TensorFlow(entry_point='train_script_local.py', 
                          role=role_name,
                          train_volume_size=1,
                          train_instance_count=1, 
                          train_instance_type='ml.p3.2xlarge',
                          train_use_spot_instances=True,
                          train_max_wait=86400,
                          model_dir=sm_output_path,
                          framework_version='1.13', 
                          py_version='py3',
                          script_mode=True,
                          hyperparameters=hyperparameters,
                          metric_definitions=metric_definitions
                         )

# train estimator
tf_estimator.fit({'training': training_input_path, 'validation': validation_input_path})

ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: The account-level service limit 'ml.p3.2xlarge for spot training job usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please contact AWS support to request an increase for this limit.

## Download training job output from s3

## Inspect training job results