# Distributed training with Amazon SageMaker from Amazon SageMaker Studio Lab

In [None]:
!pip install sagemaker --upgrade -q
!pip install ipywidgets -q
!pip install tensorflow -q

If using temporary credentials, copy the output of `aws sts get-session-token --duration-seconds <DURATION>` and assign it to the `credentials` variable below.

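If you're using long-term credentials, you can skip the credential assignments in this cell, but `sagemaker_role` must still be set because later cells use it.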

In [None]:
# `os` is needed to export the credentials; it is imported here because this
# cell runs before the notebook's main import cell.
import os

# Paste the JSON printed by `aws sts get-session-token` here (temporary
# credentials). Leave the "<...>" placeholders untouched when relying on
# long-term credentials that are already configured.
credentials = {
    "Credentials": {
        "AccessKeyId": "<AWS_ACCESS_KEY_ID>",
        "SecretAccessKey": "<AWS_SECRET_ACCESS_KEY>",
        "SessionToken": "<AWS_SESSION_TOKEN>",
    }
}

# Export each credential under its AWS environment variable name.
for aws_env_var, credential_key in (
    ("AWS_ACCESS_KEY_ID", "AccessKeyId"),
    ("AWS_SECRET_ACCESS_KEY", "SecretAccessKey"),
    ("AWS_SESSION_TOKEN", "SessionToken"),
):
    credential_value = credentials["Credentials"][credential_key]
    # An unfilled "<...>" placeholder is not a real credential; exporting it
    # would override whatever credentials are already configured.
    if credential_value.startswith("<") and credential_value.endswith(">"):
        continue
    os.environ[aws_env_var] = credential_value

# IAM role ARN used later as the estimator's execution role; replace
# <ACCOUNT_ID> with your AWS account id.
sagemaker_role = 'arn:aws:iam::<ACCOUNT_ID>:role/SageMakerStudioLabRole'

**Step 1:** Import essential packages, start a SageMaker session and specify the bucket name you created in the prerequisites section of this workshop.

In [None]:
import boto3
import time
import numpy as np
import sagemaker

# Key prefixes inside the bucket for training-job artifacts and datasets.
dataset_folder = 'datasets'
jobs_folder = 'jobs'

# Plain boto3 session and a SageMaker client created from it.
sess = boto3.Session()
sm = sess.client('sagemaker')

# SageMaker SDK session, plus the IAM role defined in the credentials cell.
sagemaker_session = sagemaker.Session()
role = sagemaker_role

# Bucket reported by the SDK session's default_bucket().
bucket_name = sagemaker_session.default_bucket()

#### Upload dataset if it doesn't exist

In [None]:
# First run only: uncomment the next two lines to upload the local `cifar10`
# path to the bucket under `<dataset_folder>/cifar10-tfrecords`.
# datasets = sagemaker_session.upload_data(path='cifar10', key_prefix=f'{dataset_folder}/cifar10-tfrecords')
# datasets

# If the dataset already exists in S3, point directly at its prefix.
datasets = f's3://{bucket_name}/{dataset_folder}/cifar10-tfrecords'
# Bare expression on the last line so the notebook displays the S3 URI.
datasets

**Step 3:** In the following cells we create a SageMaker estimator, providing it with all the information it needs to launch instances and execute training on those instances.

We set the `distribution` argument to enable SageMaker's distributed data parallel library (SMDataParallel).

In the TensorFlow estimator call, we specify the training script under `entry_point` and the directory that holds it and its dependencies (`code`) under `source_dir`. SageMaker automatically copies these files into a TensorFlow container behind the scenes, where they are executed on the training instances.

In [None]:
# Training hardware: how many instances, and of which type.
instance_count = 1
instance_type = 'ml.p3.16xlarge'  # 8 NVIDIA V100 GPUs

# Distribution settings for the estimator: switch on the `dataparallel`
# option of `smdistributed`.
distribution = {
    'smdistributed': {
        'dataparallel': {'enabled': True},
    },
}

**Step 2:** Specify hyperparameters, instance type and number of instances to distribute training to. 

In [None]:
# Hyperparameters handed to the estimator.
hyperparameters = {
    'epochs': 50,
    'learning-rate': 0.001,
    'momentum': 0.95,
    'weight-decay': 2e-4,
    'optimizer': 'adam',
    'batch-size': 512,
    'model-type': 'custom',
}

# One metric definition: its display name and the regex whose capture group
# holds the numeric value.
metric_definitions = [
    {
        'Name': 'Validation Accuracy',
        'Regex': 'Validation Accuracy: ([0-9\\.]+)',
    },
]

# Job name = fixed prefix + instance type (dots replaced by dashes) +
# instance count + a UTC timestamp.
instance_type_count_name = f'{instance_type.replace(".","-")}-x-{str(instance_count)}'
job_name = f'tf-dataparallel-{instance_type_count_name}-{time.strftime("%Y-%m-%d-%H-%M-%S-%j", time.gmtime())}'

# S3 location under which the jobs' outputs are stored.
output_path = f's3://{bucket_name}/{jobs_folder}'


In [None]:
from sagemaker.debugger import TensorBoardOutputConfig

# TensorBoard output is configured to go to the `tensorboard_logs` prefix of
# the bucket.
tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path=f's3://{bucket_name}/tensorboard_logs',
)

In [None]:
from sagemaker.tensorflow import TensorFlow

# Build the TensorFlow estimator from the settings defined in earlier cells.
smdp_estimator = TensorFlow(
    # Training code: script name and the directory that contains it.
    entry_point='cifar10-tf2-smdataparallel.py',
    source_dir='code',
    # S3 locations for job output and for the uploaded code.
    output_path=output_path + '/',
    code_location=output_path,
    # Execution role and training hardware.
    role=role,
    instance_count=instance_count,
    instance_type=instance_type,
    # Framework and Python versions for the training container.
    framework_version='2.4.1',
    py_version='py37',
    # Metrics, hyperparameters, distribution and TensorBoard settings.
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    distribution=distribution,
    tensorboard_output_config=tensorboard_output_config,
)

**Step 4:** Specify dataset locations in Amazon S3 and then call the fit function.

In [None]:
# All three input channels read from the same S3 dataset prefix.
input_channels = {
    channel: datasets for channel in ('train', 'validation', 'eval')
}

# Launch the training job under the generated name; wait=True blocks the
# cell until fit() returns.
smdp_estimator.fit(input_channels, job_name=job_name, wait=True)

## Clean up

In [None]:
# Remove everything this notebook stored under the jobs and datasets prefixes
# of the bucket.
# NOTE(review): the `tensorboard_logs/` prefix configured for the estimator is
# not touched by this cleanup -- confirm whether it should be removed as well.
s3 = boto3.resource('s3')
s3_bucket = s3.Bucket(bucket_name)
s3_bucket.objects.filter(Prefix=f"{jobs_folder}/").delete()
s3_bucket.objects.filter(Prefix=f"{dataset_folder}/").delete()

# The bucket variable in this notebook is `bucket_name`; the previous
# reference to an undefined `bucket` raised NameError after the deletions.
print(f"\nDeleted contents of {bucket_name}/{jobs_folder}")
print(f"\nDeleted contents of {bucket_name}/{dataset_folder}")

###### 