# ***Demo: ModelParallel training with Amazon SageMaker***

In [1]:
# One-time environment setup, left commented out so that a plain "Run All" does
# not reinstall packages or regenerate the dataset. Uncomment on first use to:
#   1. upgrade the `sagemaker` SDK and install `ipywidgets`,
#   2. download the `generate_cifar10_tfrecords.py` helper script,
#   3. run that script with `--data-dir cifar10`
#      (presumably writes the dataset under ./cifar10 -- confirm in the script).
# !pip install sagemaker --upgrade -q
# !pip install ipywidgets -q
# !wget https://raw.githubusercontent.com/awslabs/amazon-sagemaker-examples/master/advanced_functionality/tensorflow_bring_your_own/utils/generate_cifar10_tfrecords.py
# !python generate_cifar10_tfrecords.py --data-dir cifar10

**Step 1:** Import the essential packages, start a SageMaker session, and specify the bucket name you created in the prerequisites section of this workshop.

In [2]:
import os
import boto3
import time
import numpy as np
import sagemaker

# Low-level boto3 session and a client for the SageMaker service API.
sess = boto3.Session()
sm = sess.client('sagemaker')

# SageMaker SDK session and the execution role used when launching jobs.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Default bucket of the SDK session; job artifacts and datasets live under
# the two key prefixes below.
bucket_name = sagemaker_session.default_bucket()
jobs_folder, dataset_folder = 'jobs', 'datasets'

#### Upload dataset if it doesn't exist

In [3]:
# First run only: uncomment to upload the local `cifar10` directory to S3.
# datasets = sagemaker_session.upload_data(path='cifar10', key_prefix=f'{dataset_folder}/cifar10-dataset')
# datasets

# Dataset already uploaded: rebuild its S3 URI from the bucket and prefix.
datasets = 's3://{}/{}/cifar10-dataset'.format(bucket_name, dataset_folder)
datasets

's3://sagemaker-us-west-2-453691756499/datasets/cifar10-dataset'

In [4]:
# Training cluster: instance type and number of instances.
instance_type = 'ml.p3.16xlarge'
instance_count = 1

# Settings for the SageMaker model-parallel library.
smp_parameters = {
    "microbatches": 2,
    "partitions": 2,
    "pipeline": "interleaved",
    "optimize": "speed",
}

# MPI launcher settings; the trailing space in the options string is kept
# exactly as in the original configuration.
mpi_settings = {
    "enabled": True,
    "processes_per_host": 2,
    "custom_mpi_options": "-verbose -x orte_base_help_aggregate=0 ",
}

# Distribution configuration handed to the estimator.
distribution = {
    "smdistributed": {
        "modelparallel": {
            "enabled": True,
            "parameters": smp_parameters,
        },
    },
    "mpi": mpi_settings,
}

**Step 2:** Specify hyperparameters, instance type and number of instances to distribute training to. 

In [5]:
# Instance type with dots replaced by dashes, for use inside the job name.
instance_type_name = instance_type.replace('.', '-')

# Job name: fixed prefix, instance type, then a UTC timestamp that ends with
# the day of the year (%j).
job_name = 'tf-modelparallel-{}-{}'.format(
    instance_type_name,
    time.strftime("%Y-%m-%d-%H-%M-%S-%j", time.gmtime()),
)

# S3 prefix under which job output is stored.
output_path = 's3://{}/{}'.format(bucket_name, jobs_folder)

# Metric scraped from the training logs: the regex captures the number that
# follows "Validation Accuracy: ".
metric_definitions = [
    {
        'Name': 'Validation Accuracy',
        'Regex': 'Validation Accuracy: ([0-9\\.]+)',
    },
]

# Hyperparameters forwarded to the training job.
# NOTE(review): 'momentum' is set alongside optimizer 'adam' -- confirm that
# the training script actually uses it for this optimizer.
hyperparameters = {
    'epochs': 50,
    'learning-rate': 0.01,
    'momentum': 0.95,
    'weight-decay': 2e-4,
    'optimizer': 'adam',
    'batch-size': 256,
    'model-type': 'custom',
}


In [6]:
from sagemaker.tensorflow import TensorFlow

# TensorFlow estimator for the model-parallel training job: it runs the
# entry-point script from the local `code` directory with the distribution,
# hyperparameters and metric definitions configured in earlier cells.
smdp_estimator = TensorFlow(
    entry_point='cifar10-tf2-smmodelparallel.py',
    source_dir='code',
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    framework_version='2.4',
    py_version='py37',
    distribution=distribution,
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    # Code and job output share the same S3 prefix; the output location
    # carries a trailing slash.
    code_location=output_path,
    output_path=f'{output_path}/',
)

**Step 4:** Specify dataset locations in Amazon S3 and then call the fit function.

In [7]:
# All three input channels point at the same S3 dataset prefix.
# wait=True blocks the cell until the training job finishes.
smdp_estimator.fit(
    dict.fromkeys(('train', 'validation', 'eval'), datasets),
    job_name=job_name,
    wait=True,
)

2021-04-18 18:58:01 Starting - Starting the training job...
2021-04-18 18:58:23 Starting - Launching requested ML instancesProfilerReport-1618772280: InProgress
.........
2021-04-18 18:59:44 Starting - Preparing the instances for training............
2021-04-18 19:01:50 Downloading - Downloading input data
2021-04-18 19:01:50 Training - Downloading the training image............
2021-04-18 19:03:47 Training - Training image download completed. Training in progress.[34m2021-04-18 19:03:47.592437: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2021-04-18 19:03:47.597867: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2021-04-18 19:03:47.692157: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0[0m
[34m2021-04-1