## A quick guide to saving costs with Spot instances on Amazon SageMaker

Amazon SageMaker offers managed spot training, which lets you use Amazon EC2 Spot instances for Amazon SageMaker training jobs. This means you can save significantly on training workloads without having to set up and manage Spot instances yourself! Amazon SageMaker automatically provisions Spot instances for you. If a Spot instance is reclaimed, Amazon SageMaker automatically resumes training once capacity becomes available!


In [None]:
import os
import numpy as np
import time
import sys
import sagemaker
import boto3
import matplotlib.pyplot as plt
import pandas as pd

from sagemaker.session import s3_input
from sagemaker.debugger import TensorBoardOutputConfig
from sagemaker.tensorflow import TensorFlow

# A single boto3 session backs both the low-level SageMaker client (`sm`)
# and the SageMaker SDK session used by the rest of the notebook.
sess = boto3.Session()
sm = sess.client(service_name='sagemaker')

# Role returned by the SDK; it is handed to the estimators as `role`.
role = sagemaker.get_execution_role()

sagemaker_session = sagemaker.Session(boto_session=sess)

Download the CIFAR-10 dataset and upload it to Amazon S3

In [None]:
!python generate_cifar10_tfrecords.py --data-dir cifar10
datasets = sagemaker_session.upload_data(path='cifar10', key_prefix='datasets/cifar10-dataset')

In [None]:
# Default bucket of the SageMaker session; job output is written beneath it.
bucket_name = sagemaker_session.default_bucket()

# Folder names for job artifacts and datasets.
job_folder = 'jobs'
dataset_folder = 'datasets'

# One input location per channel, each a sub-path of the uploaded dataset.
train_path, val_path, eval_path = (
    f'{datasets}/{split}' for split in ('train', 'validation', 'eval')
)

In [None]:
# Hyperparameters for the first training run; the estimator receives this
# dict through its `hyperparameters` argument.
hyperparams = {
    'epochs': 10,
    'learning-rate': 0.01,
    'batch-size': 256,
    'weight-decay': 2e-4,
    'momentum': 0.9,
    'optimizer': 'adam',
    'model-type': 'custom',
}

In [None]:
# Everything this job writes goes under the `jobs` prefix of the default bucket.
output_path = f's3://{bucket_name}/jobs'
# The UTC timestamp suffix gives every run a distinct job name.
job_name = f'tensorflow-spot-{time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())}'
tb_config = TensorBoardOutputConfig(output_path)

tf_estimator = TensorFlow(
    # Training code and framework.
    entry_point='cifar10-training-sagemaker.py',
    source_dir='code',
    framework_version='1.15',
    py_version='py3',
    script_mode=True,
    hyperparameters=hyperparams,
    # Identity and session.
    role=role,
    sagemaker_session=sagemaker_session,
    # Compute.
    train_instance_count=1,
    train_instance_type='ml.p3.2xlarge',
    # Managed spot training: max_wait (7200) is twice max_run (3600).
    # NOTE(review): the SDK documents both values as seconds, with max_wait
    # including time spent waiting for Spot capacity - confirm for the SDK
    # version in use.
    train_use_spot_instances=True,
    train_max_wait=7200,
    train_max_run=3600,
    # S3 locations: outputs, uploaded code, per-job checkpoints, TensorBoard.
    output_path=f'{output_path}/',
    code_location=output_path,
    checkpoint_s3_uri=f'{output_path}/{job_name}/checkpoints',
    tensorboard_output_config=tb_config,
)

In [None]:
# Start the training job under the generated job name, with one input per
# channel, and block (wait=True) until the call returns.
tf_estimator.fit(
    {
        'training': train_path,
        'validation': val_path,
        'eval': eval_path,
    },
    wait=True,
    job_name=job_name,
)

To verify that your training script resumes properly from a checkpoint, start a new job and pass it the checkpoint path of the previous job via `checkpoint_s3_uri`:

`checkpoint_s3_uri = tf_estimator.checkpoint_s3_uri`

In [None]:
# Hyperparameters for the resume-test run: 20 epochs, other values as listed.
hyperparams = {
    'epochs': 20,
    'learning-rate': 0.01,
    'batch-size': 256,
    'weight-decay': 2e-4,
    'momentum': 0.9,
    'optimizer': 'adam',
    'model-type': 'custom',
}

# Same output prefix as before; a fresh UTC-timestamped name for the new job.
output_path = f's3://{bucket_name}/jobs'
job_name = f'tensorflow-spot-{time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())}'

tf_estimator_test = TensorFlow(
    # Training code and framework.
    entry_point='cifar10-training-sagemaker.py',
    source_dir='code',
    framework_version='1.15',
    py_version='py3',
    script_mode=True,
    hyperparameters=hyperparams,
    # Identity and session.
    role=role,
    sagemaker_session=sagemaker_session,
    # Compute.
    train_instance_count=1,
    train_instance_type='ml.p3.2xlarge',
    # Managed spot training: max_wait (7200) is twice max_run (3600).
    train_use_spot_instances=True,
    train_max_wait=7200,
    train_max_run=3600,
    # S3 locations. The checkpoint URI is taken from the previous estimator
    # so this job points at the earlier job's checkpoints. No TensorBoard
    # output config is passed for this run.
    output_path=f'{output_path}/',
    code_location=output_path,
    checkpoint_s3_uri=tf_estimator.checkpoint_s3_uri,
)

# Start the resume-test job with the same three input channels and block
# (wait=True) until the call returns.
tf_estimator_test.fit(
    {
        'training': train_path,
        'validation': val_path,
        'eval': eval_path,
    },
    wait=True,
    job_name=job_name,
)