# ***Demo: TensorFlow + Amazon SageMaker***
### [Tutorial: Train and tune a deep learning model at scale with Amazon SageMaker](https://aws.amazon.com/getting-started/hands-on/train-tune-deep-learning-model-amazon-sagemaker/)

#### 1. Train model on a single GPU
#### 2. Large-scale hyperparameter optimization
#### 3. Model hosting

In [1]:
import time, os, sys
import sagemaker, boto3
import numpy as np
import pandas as pd

# AWS handles shared by the rest of the notebook. Everything hangs off one
# boto3 session so the low-level client and the SageMaker SDK session agree.
sess = boto3.Session()
sagemaker_session = sagemaker.Session(boto_session=sess)
bucket = sagemaker_session.default_bucket()
sm = sess.client('sagemaker')
role = sagemaker.get_execution_role()

# The dataset is read from a fixed prefix inside the session's default bucket.
# An upload call was kept here commented out; note that it targets a different
# key prefix ('datasets/cifar10-dataset-test') than the one read below:
#   sagemaker_session.upload_data(path='cifar10', key_prefix='datasets/cifar10-dataset-test')
datasets = f's3://{bucket}/datasets/cifar10-dataset'

In [2]:
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent

# Experiment names are fixed, so a plain `create` fails as soon as the notebook
# is run a second time. Look the experiment up first and only create it when
# SageMaker reports that it does not exist, which keeps this cell re-runnable.
EXPERIMENT_NAME = "gtc-sagemaker-training-experiment-2"

try:
    training_experiment = Experiment.load(
        experiment_name=EXPERIMENT_NAME,
        sagemaker_boto_client=sm,
    )
except sm.exceptions.ResourceNotFound:
    training_experiment = Experiment.create(
        experiment_name=EXPERIMENT_NAME,
        description="Experiment to track cifar10 training trials",
        sagemaker_boto_client=sm,
    )

In [3]:
# The trial name is fixed, so a plain `create` fails on a second run of the
# notebook. Reuse the existing trial when there is one; create it otherwise.
TRIAL_NAME = 'gtc-sagemaker-single-gpu-training'

try:
    single_gpu_trial = Trial.load(
        trial_name=TRIAL_NAME,
        sagemaker_boto_client=sm,
    )
except sm.exceptions.ResourceNotFound:
    single_gpu_trial = Trial.create(
        trial_name=TRIAL_NAME,
        experiment_name=training_experiment.experiment_name,
        sagemaker_boto_client=sm,
    )

# Passed to `fit(experiment_config=...)` so the training job is recorded as a
# trial component of the trial above.
trial_comp_name = 'gtc-single-gpu-training-job'
experiment_config = {"ExperimentName": training_experiment.experiment_name,
                     "TrialName": single_gpu_trial.trial_name,
                     "TrialComponentDisplayName": trial_comp_name}

In [4]:
from sagemaker.tensorflow import TensorFlow

# Fixed hyperparameters handed to the training script for the single-GPU run.
hyperparams = {
    'epochs': 15,
    'learning-rate': 0.01,
    'batch-size': 256,
    'weight-decay': 2e-4,
    'momentum': 0.9,
    'optimizer': 'adam',
}

# Job artifacts and the packaged source code both go under <default bucket>/jobs.
bucket_name = sagemaker_session.default_bucket()
output_path = f's3://{bucket_name}/jobs'

# Regex applied to the training log to extract the validation accuracy metric.
metric_definitions = [{'Name': 'val_acc', 'Regex': 'val_acc: ([0-9\\.]+)'}]

# Estimator settings are collected first, then expanded into the constructor.
single_gpu_estimator_args = dict(
    entry_point='cifar10-training-sagemaker.py',
    source_dir='code',
    output_path=f'{output_path}/',
    code_location=output_path,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.p3.2xlarge',  # alternative: 'ml.g4dn.xlarge'
    framework_version='1.15.2',
    py_version='py3',
    script_mode=True,
    metric_definitions=metric_definitions,
    sagemaker_session=sagemaker_session,
    hyperparameters=hyperparams,
)
tf_estimator = TensorFlow(**single_gpu_estimator_args)

# All three input channels point at the same S3 dataset prefix.
single_gpu_channels = {channel: datasets for channel in ('training', 'validation', 'eval')}

# A UTC timestamp suffix gives each run a distinct job name.
job_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
job_name = f'tensorflow-single-gpu-{job_timestamp}'

tf_estimator.fit(single_gpu_channels,
                 job_name=job_name,
                 experiment_config=experiment_config)

INFO:sagemaker:Creating training-job with name: tensorflow-single-gpu-2020-09-27-05-19-06


2020-09-27 05:19:07 Starting - Starting the training job...
2020-09-27 05:19:09 Starting - Launching requested ML instances......
2020-09-27 05:20:10 Starting - Preparing the instances for training......
2020-09-27 05:21:36 Downloading - Downloading input data
2020-09-27 05:21:36 Training - Downloading the training image......
[0m
[34m2020-09-27 05:22:31,762 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2020-09-27 05:22:32,479 sagemaker-containers INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "eval": "/opt/ml/input/data/eval",
        "training": "/opt/ml/input/data/training",
        "validation": "/opt/ml/input/data/validation"
    },
    "current_host": "algo-1",
    "framework_module": "sagemaker_tensorflow_container.training:main",
    "hosts": [
        "algo-1"
    ],
    "hyperparameters": {
        "momentum": 0.9,
        "ba

In [5]:
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# Search space explored by the tuning job.
hyperparameter_ranges = {
    'epochs': IntegerParameter(5, 30),
    'learning-rate': ContinuousParameter(0.001, 0.1, scaling_type='Logarithmic'),
    'batch-size': CategoricalParameter(['128', '256', '512']),
    'momentum': ContinuousParameter(0.9, 0.99),
    'optimizer': CategoricalParameter(['sgd', 'adam']),
}

# The tuner maximizes the validation accuracy scraped from the training log.
objective_metric_name = 'val_acc'
objective_type = 'Maximize'
metric_definitions = [{'Name': 'val_acc', 'Regex': 'val_acc: ([0-9\\.]+)'}]

# Base estimator for every tuning trial; no fixed hyperparameters are set here.
hpo_estimator_args = dict(
    entry_point='cifar10-training-sagemaker.py',
    source_dir='code',
    output_path=f'{output_path}/',
    code_location=output_path,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.g4dn.xlarge',
    framework_version='1.15',
    py_version='py3',
    script_mode=True,
    metric_definitions=metric_definitions,
    sagemaker_session=sagemaker_session,
)
tf_estimator = TensorFlow(**hpo_estimator_args)

# 16 training jobs in total, at most 8 running at the same time.
tuner = HyperparameterTuner(
    estimator=tf_estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    max_jobs=16,
    max_parallel_jobs=8,
    objective_type=objective_type,
)

# All three input channels point at the same S3 dataset prefix.
hpo_channels = {channel: datasets for channel in ('training', 'validation', 'eval')}

# A UTC timestamp suffix gives each tuning run a distinct job name.
hpo_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
job_name = f'tf-hpo-{hpo_timestamp}'

tuner.fit(hpo_channels, job_name=job_name)

INFO:root:_TuningJob.start_new!!!
INFO:sagemaker:Creating hyperparameter tuning job with name: tf-hpo-2020-09-27-05-40-51


In [17]:
# The recorded output of `tuner.fit` stops right after the tuning job is
# created, so on a top-to-bottom run this cell would presumably be reached
# while tuning is still in progress. Block until the tuning job has finished
# so that `deploy` has a best training job to host.
tuner.wait()

# Host the best model found by the tuner behind a single-instance endpoint.
tuner_predictor = tuner.deploy(initial_instance_count = 1,
                               instance_type          = 'ml.g4dn.xlarge')

2020-09-27 06:04:49 Starting - Preparing the instances for training
2020-09-27 06:04:49 Downloading - Downloading input data
2020-09-27 06:04:49 Training - Training image download completed. Training in progress.
2020-09-27 06:04:49 Uploading - Uploading generated training model
[0m
[34m2020-09-27 05:57:21,738 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2020-09-27 05:57:21,739 sagemaker-training-toolkit INFO     Failed to parse hyperparameter _tuning_objective_metric value val_acc to Json.[0m
[34mReturning the value itself[0m
[34m2020-09-27 05:57:21,761 sagemaker_tensorflow_container.training INFO     Appending the training job name to model_dir: s3://sagemaker-us-west-2-453691756499/jobs/tf-hpo-2020-09-27-05-40-51/model/tf-hpo-2020-09-27-05-40-51-016-28d3def2/model[0m
[34m2020-09-27 05:57:36,183 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/usr/bin/python3 -m pip install 

INFO:sagemaker:Creating model with name: tf-hpo-2020-09-27-05-40-51-016-28d3def2
INFO:sagemaker:Creating endpoint with name tf-hpo-2020-09-27-05-40-51-016-28d3def2


-------------!

In [27]:
import numpy as np
import pandas as pd

# Send the raw image bytes as-is: no client-side serializer, image content type.
tuner_predictor.content_type = 'application/x-image'
tuner_predictor.serializer = None

# Class names, in the order the ten output probabilities are paired with them below.
labels = ['airplane','automobile','bird','cat','deer','dog','frog','horse','ship','truck']
df = pd.DataFrame({'labels': labels})

# Sample image; it can be fetched with:
#   curl -O https://s3.amazonaws.com/model-server/inputs/tabby.jpg
file_name = 'tabby.jpg'
with open(file_name, 'rb') as image_file:
    img = image_file.read()
payload = bytearray(img)

# Query the endpoint and line the returned probabilities up with the labels.
response = tuner_predictor.predict(data=payload)
probabilities = response['predictions'][0]
df['probabilities'] = probabilities

print(f'Prediction:{labels[np.argmax(probabilities)]}')
display(df)

# NOTE(review): no visible cell deletes the endpoint created by tuner.deploy;
# consider calling tuner_predictor.delete_endpoint() once finished — confirm.

Prediction:cat


Unnamed: 0,labels,probabilities
0,airplane,4e-05
1,automobile,1e-06
2,bird,0.014029
3,cat,0.97792
4,deer,0.000136
5,dog,6.7e-05
6,frog,0.007755
7,horse,2.7e-05
8,ship,2.1e-05
9,truck,3e-06
