# ***Demo: Amazon SageMaker Script Mode and Custom Container***

In [2]:
!pip install sagemaker boto3 awscli --upgrade -q
!pip install ipywidgets -q
# !wget https://raw.githubusercontent.com/awslabs/amazon-sagemaker-examples/master/advanced_functionality/tensorflow_bring_your_own/utils/generate_cifar10_tfrecords.py
# !python generate_cifar10_tfrecords.py --data-dir cifar10

/bin/bash: switchml: line 1: syntax error: unexpected end of file
/bin/bash: error importing function definition for `switchml'
/bin/bash: _moduleraw: line 1: syntax error: unexpected end of file
/bin/bash: error importing function definition for `_moduleraw'
/bin/bash: switchml: line 1: syntax error: unexpected end of file
/bin/bash: error importing function definition for `switchml'
/bin/bash: _moduleraw: line 1: syntax error: unexpected end of file
/bin/bash: error importing function definition for `_moduleraw'


**Step 1:** Import essential packages, start a SageMaker session, and specify the bucket name you created in the prerequisites section of this workshop.

In [4]:
import os
import boto3
import time
import numpy as np
import sagemaker

# AWS sessions: a plain boto3 session and the SageMaker SDK session.
sess = boto3.Session()
sagemaker_session = sagemaker.Session()

# IAM role passed to the estimators as `role`. Set the SAGEMAKER_ROLE
# environment variable to run this notebook in a different account; when it is
# unset, the original workshop role name is used.
role = os.environ.get('SAGEMAKER_ROLE', 'AmazonSageMaker-ExecutionRole-20200615T222811')

# Default SageMaker bucket for this session, plus the folder names used to
# build the S3 paths for job artifacts and datasets.
bucket_name = sagemaker_session.default_bucket()
jobs_folder = 'jobs'
dataset_folder = 'datasets'

#### Upload dataset if it doesn't exist

In [5]:
# S3 key prefix shared by the optional upload and by the URI used for training,
# so data uploaded from this cell lands exactly where the training jobs read it.
dataset_prefix = f'{dataset_folder}/cifar10-tfrecords'

# First run only: uncomment to upload the local `cifar10` directory.
# datasets = sagemaker_session.upload_data(path='cifar10', key_prefix=dataset_prefix)
# datasets

# If the dataset already exists in the bucket, build its S3 URI directly.
datasets = f's3://{bucket_name}/{dataset_prefix}'
datasets

's3://sagemaker-us-west-2-453691756499/datasets/cifar10-tfrecords'

# SageMaker training with script mode

![](./sm_script_mode.png)

In [6]:
# Training-job name: fixed prefix followed by a UTC timestamp.
job_name = 'tf-single-gpu-' + time.strftime("%Y-%m-%d-%H-%M-%S-%j", time.gmtime())

# S3 prefix handed to the estimator for job output and uploaded code.
output_path = 's3://{}/{}'.format(bucket_name, jobs_folder)

# Metric definition: a name plus the regex whose capture group is the value.
metric_definitions = [
    {
        'Name': 'Validation Accuracy',
        'Regex': 'Validation Accuracy: ([0-9\\.]+)',
    },
]

# Hyperparameters handed to the estimator.
hyperparameters = {
    'epochs': 50,
    'learning-rate': 0.01,
    'momentum': 0.95,
    'weight-decay': 2e-4,
    'optimizer': 'adam',
    'batch-size': 256,
    'model-type': 'custom',
}


In [7]:
from sagemaker.debugger import TensorBoardOutputConfig

# TensorBoard output goes under the `tensorboard_logs` prefix of the bucket.
tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path='s3://{}/tensorboard_logs'.format(bucket_name),
)

In [8]:
from sagemaker.tensorflow import TensorFlow

# Script-mode estimator: runs `code/cifar10-tf2.py` on a single ml.p3.2xlarge
# instance, using the job configuration defined in the preceding cells.
estimator = TensorFlow(
    entry_point='cifar10-tf2.py',
    source_dir='code',
    role=role,
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    framework_version='2.4',
    py_version='py37',
    output_path=f'{output_path}/',
    code_location=output_path,
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    tensorboard_output_config=tensorboard_output_config,
)

**Step 4:** Specify dataset locations in Amazon S3 and then call the fit function.

In [12]:
# Each input channel maps to the sub-prefix of the same name under `datasets`.
# wait=True blocks the cell until the training job finishes.
estimator.fit(
    {channel: f'{datasets}/{channel}' for channel in ('train', 'validation', 'eval')},
    job_name=job_name,
    wait=True,
)

2022-04-21 09:06:59 Starting - Starting the training job...
2022-04-21 09:07:25 Starting - Preparing the instances for trainingProfilerReport-1650532018: InProgress
.........
2022-04-21 09:08:56 Downloading - Downloading input data
2022-04-21 09:08:56 Training - Downloading the training image........................
2022-04-21 09:12:44 Training - Training image download completed. Training in progress.[34m2022-04-21 09:12:45.802735: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2022-04-21 09:12:45.807749: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2022-04-21 09:12:45.899884: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0[0m
[34m2022-04-21 09:12:45.992015: W tensorflow/core/profiler/internal/smprofile

# SageMaker training with custom container

![](sm_custom_container.png)

In [25]:
# SageMaker client, current region, and the caller's AWS account ID.
client = boto3.client('sagemaker')
region = boto3.Session().region_name
account = boto3.client('sts').get_caller_identity().get('Account')

# ECR repository name and tag of the custom training image.
repo_name = 'custom-dl-containers'
image_tag = 'sm-gpu'

# Full ECR image URI: <account>.dkr.ecr.<region>.amazonaws.com/<repo>:<tag>
image = f'{account}.dkr.ecr.{region}.amazonaws.com/{repo_name}:{image_tag}'

In [None]:
from sagemaker.estimator import Estimator

# Training-job name: fixed prefix followed by a UTC timestamp.
job_name = 'tf-single-gpu-custom-' + time.strftime("%Y-%m-%d-%H-%M-%S-%j", time.gmtime())

# Generic estimator that runs the custom container image on a single
# ml.p3.2xlarge instance; the `test_acc` metric is extracted with the regex below.
estimator = Estimator(
    image_uri=image,
    role=role,
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    metric_definitions=[
        {'Name': 'test_acc', 'Regex': 'test_acc:([0-9\\.]+)'},
    ],
)

# No input channels are passed to this job.
estimator.fit(job_name=job_name)

2022-04-21 17:24:23 Starting - Starting the training job...
2022-04-21 17:24:48 Starting - Preparing the instances for trainingProfilerReport-1650561862: InProgress
.........