# Create a Training Pipeline with the Step Functions Data Science SDK

![Step Functions SageMaker Pipeline](img/stepfunctions_graph.png)

In [1]:
from botocore.exceptions import ClientError

import os
import sagemaker
import logging
import boto3
import sagemaker
import pandas as pd

sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name='sagemaker', region_name=region)

In [2]:
import stepfunctions
import logging
from stepfunctions.template.pipeline import TrainingPipeline

stepfunctions.set_stream_logger(level=logging.INFO)

# Create an IAM Execution Role for Step Functions
We need a StepFunctionsWorkflowExecutionRole so that you can create and execute workflows in Step Functions.

In [3]:
iam = boto3.Session().client(service_name='iam', region_name=region)
sts = boto3.Session().client(service_name='sts', region_name=region)
sfn = boto3.Session().client(service_name='stepfunctions', region_name=region)

In [4]:
stepfunction_role_name = 'DSOAWS_StepFunctionsExecutionRole'

### Create an AssumeRolePolicyDocument

In [5]:
assume_role_policy_doc = {
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "states.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
} 

### Create `DSOAWS_StepFunctionsExecutionRole`

In [6]:
import json
import time
try:
    iam.create_role(
        RoleName=stepfunction_role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
        Description='DSOAWS Step Function Workflow Execution Role'
    )
    time.sleep(10)
    print("Role created.")
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Role already exists. This is OK.")
    else:
        print("Unexpected error: %s" % e)
        

Role created.


### Get the Role ARN

In [7]:
stepfunction_role = iam.get_role(RoleName=stepfunction_role_name)
stepfunction_role_arn = stepfunction_role['Role']['Arn']
print(stepfunction_role_arn)

arn:aws:iam::085964654406:role/DSOAWS_StepFunctionsExecutionRole


# Add a Policy to the Role

## Define permissions

In [8]:
stepfunction_permissions = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "sagemaker:CreateTransformJob",
                "sagemaker:DescribeTransformJob",
                "sagemaker:StopTransformJob",
                "sagemaker:CreateTrainingJob",
                "sagemaker:DescribeTrainingJob",
                "sagemaker:StopTrainingJob",
                "sagemaker:CreateHyperParameterTuningJob",
                "sagemaker:DescribeHyperParameterTuningJob",
                "sagemaker:StopHyperParameterTuningJob",
                "sagemaker:CreateModel",
                "sagemaker:CreateEndpointConfig",
                "sagemaker:CreateEndpoint",
                "sagemaker:DeleteEndpointConfig",
                "sagemaker:DeleteEndpoint",
                "sagemaker:UpdateEndpoint",
                "sagemaker:CreateProcessingJob",
                "sagemaker:DescribeProcessingJob",
                "sagemaker:ListProcessingJobs",
                "sagemaker:StopProcessingJob",                
                "sagemaker:ListTags",
                "lambda:InvokeFunction",
                "sqs:SendMessage",
                "sns:Publish",
                "ecs:RunTask",
                "ecs:StopTask",
                "ecs:DescribeTasks",
                "dynamodb:GetItem",
                "dynamodb:PutItem",
                "dynamodb:UpdateItem",
                "dynamodb:DeleteItem",
                "batch:SubmitJob",
                "batch:DescribeJobs",
                "batch:TerminateJob",
                "glue:StartJobRun",
                "glue:GetJobRun",
                "glue:GetJobRuns",
                "glue:BatchStopJobRun"
            ],
            "Resource": "*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "iam:PassRole"
            ],
            "Resource": "*",
            "Condition": {
                "StringEquals": {
                    "iam:PassedToService": "sagemaker.amazonaws.com"
                }
            }
        },
        {
            "Effect": "Allow",
            "Action": [
                "events:PutTargets",
                "events:PutRule",
                "events:DescribeRule"
            ],
            "Resource": [
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTrainingJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTransformJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTuningJobsRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForECSTaskRule",
                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForBatchJobsRule",
            ]
        }
    ]
}

## Turn into Policy Object

In [9]:
stepfunction_policy_name = 'DSOAWS_StepFunctionsWorkflowExecutionPolicy'

In [10]:
account_id = sts.get_caller_identity()['Account']

In [11]:
import time
try:
    stepfunction_policy = iam.create_policy(
      PolicyName=stepfunction_policy_name,
      PolicyDocument=json.dumps(stepfunction_permissions)
    )
    stepfunction_policy_arn = f'arn:aws:iam::{account_id}:policy/{stepfunction_policy_name}'
    print("Policy created.")
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Policy already exists.  Updating policy...")
        stepfunction_policy_arn = f'arn:aws:iam::{account_id}:policy/{stepfunction_policy_name}'
        try:
            stepfunction_policy = iam.create_policy_version(
                PolicyArn=stepfunction_policy_arn,
                PolicyDocument=json.dumps(stepfunction_permissions),
                SetAsDefault=True)
            print('Policy updated.')
        except:
            print('** Policy cannot have more than 5 versions.  This is likely OK.')
    else:
        print("Unexpected error: %s" % e)

Policy created.


In [12]:
print(stepfunction_policy_arn)

arn:aws:iam::085964654406:policy/DSOAWS_StepFunctionsWorkflowExecutionPolicy


## Attach Policy To Step Function Workflow Execution Role

In [13]:
import time
try:
    response = iam.attach_role_policy(
        PolicyArn=stepfunction_policy_arn,
        RoleName=stepfunction_role_name
    )
    print("Done.")
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Policy is already attached. This is OK.")
    else:
        print("Unexpected error: %s" % e)

Done.


In [14]:
import time
try:
    response = iam.attach_role_policy(
        PolicyArn='arn:aws:iam::aws:policy/service-role/AWSLambdaRole',
        RoleName=stepfunction_role_name
    )
    print("Done.")
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Policy is already attached. This is OK.")
    else:
        print("Unexpected error: %s" % e)

Done.


In [15]:
import time
try:
    response = iam.attach_role_policy(
        PolicyArn='arn:aws:iam::aws:policy/CloudWatchEventsFullAccess',
        RoleName=stepfunction_role_name
    )
    print("Done.")
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Policy is already attached. This is OK.")
    else:
        print("Unexpected error: %s" % e)

Done.


# Setup Processing Step

# Upload the Processing Script to S3 for the Pipeline to Consume

In [16]:
!pygmentize ./preprocess-scikit-text-to-bert.py

[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mmodel_selection[39;49;00m [34mimport[39;49;00m train_test_split
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mutils[39;49;00m [34mimport[39;49;00m resample
[34mimport[39;49;00m [04m[36mfunctools[39;49;00m
[34mimport[39;49;00m [04m[36mmultiprocessing[39;49;00m

[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mfrom[39;49;00m [04m[36mdatetime[39;49;00m [34mimport[39;49;00m datetime
[34mimport[39;49;00m [04m[36msubprocess[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
subprocess.check_call([sys.executable, [33m'[39;49;00m[33m-m[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mpip[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33minstall[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mtensorflow==2.1.0[39;49;00m[33m'[39;49;00m])
[34mimport[39;49;00m [04m[36mtensorf

    num_cpus = multiprocessing.cpu_count()
    [36mprint[39;49;00m([33m'[39;49;00m[33mnum_cpus [39;49;00m[33m{}[39;49;00m[33m'[39;49;00m.format(num_cpus))

    p = multiprocessing.Pool(num_cpus)
    p.map(transform_tsv_to_tfrecord, input_files)

    [36mprint[39;49;00m([33m'[39;49;00m[33mListing contents of [39;49;00m[33m{}[39;49;00m[33m'[39;49;00m.format(args.output_data))
    dirs_output = os.listdir(args.output_data)
    [34mfor[39;49;00m file [35min[39;49;00m dirs_output:
        [36mprint[39;49;00m(file)

    [36mprint[39;49;00m([33m'[39;49;00m[33mListing contents of [39;49;00m[33m{}[39;49;00m[33m'[39;49;00m.format(train_data))
    dirs_output = os.listdir(train_data)
    [34mfor[39;49;00m file [35min[39;49;00m dirs_output:
        [36mprint[39;49;00m(file)

    [36mprint[39;49;00m([33m'[39;49;00m[33mListing contents of [39;49;00m[33m{}[39;49;00m[33m'[39;49;00m.format(validation_data))
    dirs_output = os.listdi

In [17]:
import time
processing_code_s3_prefix = 'pipeline_sklearn_processing/{}/code'.format(int(time.time()))

In [18]:
input_code = sess.upload_data(
    './preprocess-scikit-text-to-bert.py',
    bucket=bucket,
    key_prefix=processing_code_s3_prefix,
)

In [19]:
%store processing_code_s3_prefix

Stored 'processing_code_s3_prefix' (str)


In [20]:
print(processing_code_s3_prefix)

pipeline_sklearn_processing/1601151884/code


# Set the Processing Hyper-Parameters

In [21]:
max_seq_length=64
train_split_percentage=0.90
validation_split_percentage=0.05
test_split_percentage=0.05
balance_dataset=True
processing_instance_count=1
processing_instance_type='ml.c5.2xlarge'

# Specify the Raw Inputs S3 Location

In [22]:
raw_input_data_s3_uri = 's3://{}/amazon-reviews-pds/tsv/'.format(bucket)
print(raw_input_data_s3_uri)

s3://sagemaker-us-west-2-085964654406/amazon-reviews-pds/tsv/


In [23]:
!aws s3 ls $raw_input_data_s3_uri

2020-09-26 17:43:25 1294879074 amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tsv.gz
2020-09-26 16:39:04   18997559 amazon_reviews_us_Digital_Software_v1_00.tsv.gz
2020-09-26 16:39:08   27442648 amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz


In [24]:
from sagemaker.sklearn.processing import SKLearnProcessor

processor = SKLearnProcessor(framework_version='0.20.0',
                             role=role,
                             instance_type=processing_instance_type,
                             instance_count=processing_instance_count,
                             max_runtime_in_seconds=7200)

# Setup Training Step

# Show Training Script

In [25]:
!pygmentize src/tf_bert_reviews.py

[34mimport[39;49;00m [04m[36mtime[39;49;00m
[34mimport[39;49;00m [04m[36mrandom[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mfrom[39;49;00m [04m[36mglob[39;49;00m [34mimport[39;49;00m glob
[34mimport[39;49;00m [04m[36mpprint[39;49;00m
[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m
[34mimport[39;49;00m [04m[36msubprocess[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mtensorflow[39;49;00m [34mas[39;49;00m [04m[36mtf[39;49;00m
[37m#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])[39;49;00m
subprocess.check_call([sys.executable, [33m'[39;49;00m[33m-m[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mpip[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33minstall[39;49;00m[33m'[39;49;00m, [33m'[3

    transformer_fine_tuned_model_path = os.path.join(local_model_dir, [33m'[39;49;00m[33mtransformers/fine-tuned/[39;49;00m[33m'[39;49;00m)
    os.makedirs(transformer_fine_tuned_model_path, exist_ok=[34mTrue[39;49;00m)

    [37m# SavedModel Output[39;49;00m
    tensorflow_saved_model_path = os.path.join(local_model_dir, [33m'[39;49;00m[33mtensorflow/saved_model/0[39;49;00m[33m'[39;49;00m)
    os.makedirs(tensorflow_saved_model_path, exist_ok=[34mTrue[39;49;00m)

    [37m# Tensorboard Logs [39;49;00m
    tensorboard_logs_path = os.path.join(local_model_dir, [33m'[39;49;00m[33mtensorboard/[39;49;00m[33m'[39;49;00m)
    os.makedirs(tensorboard_logs_path, exist_ok=[34mTrue[39;49;00m)

    [37m# Commented out due to incompatibility with transformers library (possibly)[39;49;00m
    [37m# Set the global precision mixed_precision policy to "mixed_float16"    [39;49;00m
[37m#    mixed_precision_policy = 'mixed_float16'[39;49;00m
[37m#    print(

# Setup Training Hyper-Parameters
Note that `max_seq_length` is re-used from the processing hyper-parameters above

In [26]:
epochs=3
learning_rate=0.00001
epsilon=0.00000001
train_batch_size=128
validation_batch_size=128
test_batch_size=128
train_steps_per_epoch=100
validation_steps=100
test_steps=100
train_instance_count=1
train_instance_type='ml.c5.9xlarge'
train_volume_size=1024
use_xla=True
use_amp=True
freeze_bert_layer=False
enable_sagemaker_debugger=False
enable_checkpointing=False
enable_tensorboard=False
input_mode='Pipe'
run_validation=True
run_test=True
run_sample_predictions=True
deploy_instance_count=1
deploy_instance_type='ml.m5.4xlarge'
#deploy_instance_type='ml.m5.large' # bur

# Setup Metrics To Track Model Performance

In [27]:
metrics_definitions = [
     {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\.]+)'},
     {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\.]+)'},
     {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\.]+)'},
     {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\.]+)'},
]

# Setup Estimator

In [28]:
from sagemaker.tensorflow import TensorFlow

estimator = TensorFlow(entry_point='tf_bert_reviews.py',
                       source_dir='src',
                       role=role,
                       instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distibution strategy will fail the job due to no data available
                       instance_type=train_instance_type,
                       volume_size=train_volume_size,                       
                       py_version='py3',
                       framework_version='2.1.0',
                       hyperparameters={'epochs': epochs,
                                        'learning_rate': learning_rate,
                                        'epsilon': epsilon,
                                        'train_batch_size': train_batch_size,
                                        'validation_batch_size': validation_batch_size,
                                        'test_batch_size': test_batch_size,                                             
                                        'train_steps_per_epoch': train_steps_per_epoch,
                                        'validation_steps': validation_steps,
                                        'test_steps': test_steps,
                                        'use_xla': use_xla,
                                        'use_amp': use_amp,                                             
                                        'max_seq_length': max_seq_length,
                                        'freeze_bert_layer': freeze_bert_layer,
                                        'enable_sagemaker_debugger': enable_sagemaker_debugger,
                                        'enable_checkpointing': enable_checkpointing,
                                        'enable_tensorboard': enable_tensorboard,                                        
                                        'run_validation': run_validation,
                                        'run_test': run_test,
                                        'run_sample_predictions': run_sample_predictions},
                       input_mode=input_mode,
                       metric_definitions=metrics_definitions,
#                       max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute
                      )

# Setup Pipeline with the Step Functions SDK

A typical task for a data scientist is to train a model and deploy that model to an endpoint. Without the Step Functions SDK, this is a four step process on SageMaker that includes the following.

1. Training the model
2. Creating the model on SageMaker
3. Creating an endpoint configuration
4. Deploying the trained model to the configured endpoint

The Step Functions SDK provides the [TrainingPipeline](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/pipelines.html#stepfunctions.template.pipeline.train.TrainingPipeline) API to simplify this procedure. The following configures `pipeline` with the necessary parameters to define a training pipeline.

In [29]:
import time
timestamp = int(time.time())

pipeline_name = 'bert-pipeline-{}'.format(timestamp)

print('Pipeline name {}'.format(pipeline_name))

Pipeline name bert-pipeline-1601151889


In [30]:
from __future__ import absolute_import

from sagemaker.utils import base_name_from_image
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.inputs import TrainingInput

from stepfunctions.steps import (
    TrainingStep, 
    TransformStep, 
    ModelStep, 
    EndpointConfigStep, 
    EndpointStep, 
    Chain, 
    Fail, 
    Catch,
    ProcessingStep
)
from stepfunctions.workflow import Workflow
from stepfunctions.template.pipeline.common import WorkflowTemplate
from stepfunctions.template.pipeline.common import StepId

class TrainingPipelineWithDifferentDeployInstanceTypeAndProcessingJob(WorkflowTemplate):

    """
    Creates a standard training pipeline with the following steps in order:
        1. Train estimator
        2. Create estimator model
        3. Endpoint configuration
        4. Deploy model
    """

    __allowed_kwargs = ('pipeline_name',)
    
    def __init__(self, 
                 processor,
                 raw_input_data_s3_uri,
                 train_split_percentage,
                 validation_split_percentage,
                 test_split_percentage,
                 max_seq_length,
                 balance_dataset,
                 estimator, 
                 role, 
                 bucket,                  
                 client, 
                 deploy_instance_count, 
                 deploy_instance_type, 
                 **kwargs):
        """
        Args:
            estimator (sagemaker.estimator.EstimatorBase): The estimator to use for training. Can be a BYO estimator, Framework estimator or Amazon algorithm estimator.
            role (str): An AWS IAM role (either name or full Amazon Resource Name (ARN)). This role is used to create, manage, and execute the Step Functions workflows.
            inputs: Information about the training data. Please refer to the `fit()` method of the associated estimator, as this can take any of the following forms:
                * (str) - The S3 location where training data is saved.
                * (dict[str, str] or dict[str, `sagemaker.session.s3_input`]) - If using multiple channels for training data, you can specify a dict mapping channel names to strings or `sagemaker.session.s3_input` objects.
                * (`sagemaker.session.s3_input`) - Channel configuration for S3 data sources that can provide additional information about the training dataset. See `sagemaker.session.s3_input` for full details.
                * (`sagemaker.amazon.amazon_estimator.RecordSet`) - A collection of Amazon `Record` objects serialized and stored in S3. For use with an estimator for an Amazon algorithm.
                * (list[`sagemaker.amazon.amazon_estimator.RecordSet`]) - A list of `sagemaker.amazon.amazon_estimator.RecordSet` objects, where each instance is a different channel of training data.
            bucket (str): S3 bucket under which the output artifacts from the training job will be stored. The parent path used is built using the format: ``s3://{bucket}/{pipeline_name}/models/{job_name}/``. In this format, `pipeline_name` refers to the keyword argument provided for TrainingPipeline. If a `pipeline_name` argument was not provided, one is auto-generated by the pipeline as `training-pipeline-<timestamp>`. Also, in the format, `job_name` refers to the job name provided when calling the :meth:`TrainingPipeline.run()` method.
            client (SFN.Client, optional): boto3 client to use for creating and interacting with the training pipeline in Step Functions. (default: None)
        Keyword Args:
            pipeline_name (str, optional): Name of the pipeline. This name will be used to name jobs (if not provided when calling execute()), models, endpoints, and S3 objects created by the pipeline. If a `pipeline_name` argument was not provided, one is auto-generated by the pipeline as `training-pipeline-<timestamp>`. (default:None)
        """
        self.processor = processor 
        self.raw_input_data_s3_uri = raw_input_data_s3_uri
        self.train_split_percentage = train_split_percentage
        self.validation_split_percentage = validation_split_percentage
        self.test_split_percentage = test_split_percentage
        self.max_seq_length = max_seq_length
        self.balance_dataset = balance_dataset
        self.estimator = estimator
        self.role = role        
        self.bucket = bucket
        self.deploy_instance_count = deploy_instance_count
        self.deploy_instance_type = deploy_instance_type

        for key in self.__class__.__allowed_kwargs:
            setattr(self, key, kwargs.pop(key, None))

        if not self.pipeline_name:
            self.__pipeline_name_unique = True
            self.pipeline_name = 'training-pipeline-{date}'.format(date=self._generate_timestamp())

        self.definition = self.build_workflow_definition()
        self.input_template = self._extract_input_template(self.definition)

        workflow = Workflow(name=self.pipeline_name, 
                            definition=self.definition, 
                            role=role, 
                            format_json=True, 
                            client=client)

        super(TrainingPipelineWithDifferentDeployInstanceTypeAndProcessingJob, self).__init__(s3_bucket=bucket, 
                                                                                              workflow=workflow, 
                                                                                              role=role, 
                                                                                              client=client)
    
    def build_workflow_definition(self):
        """
        Build the workflow definition for the training pipeline with all the states involved.
        Returns:
            :class:`~stepfunctions.steps.states.Chain`: Workflow definition as a chain of states involved in the the training pipeline.
        """

        processing_inputs=[
                ProcessingInput(
                    input_name='raw_input',
                    source=raw_input_data_s3_uri,
                    destination='/opt/ml/processing/input/data/',
                    s3_data_distribution_type='ShardedByS3Key'
                ),
                ProcessingInput(
                    input_name='code',            
                    source=input_code,
                    destination='/opt/ml/processing/input/code',
                )
        ]

        processed_train_data_s3_uri = 's3://{}/{}/processing/output/bert-train'.format(self.bucket, self.pipeline_name)        
        processed_validation_data_s3_uri = 's3://{}/{}/processing/output/bert-validation'.format(self.bucket, self.pipeline_name)        
        processed_test_data_s3_uri = 's3://{}/{}/processing/output/bert-test'.format(self.bucket, self.pipeline_name)
         
        processing_outputs=[
                ProcessingOutput(s3_upload_mode='EndOfJob',
                                 output_name='bert-train',
                                 source='/opt/ml/processing/output/bert/train',
                                 destination=processed_train_data_s3_uri
                                ),
                ProcessingOutput(s3_upload_mode='EndOfJob',
                                 output_name='bert-validation',
                                 source='/opt/ml/processing/output/bert/validation',
                                 destination=processed_validation_data_s3_uri
                                ),
                ProcessingOutput(s3_upload_mode='EndOfJob',
                                 output_name='bert-test',
                                 source='/opt/ml/processing/output/bert/test',
                                 destination=processed_test_data_s3_uri
                                ),
        ]        

        processing_step = ProcessingStep(
            'Processing Job', # StepId.ProcessingJob.value?
            processor=self.processor,
            job_name=self.pipeline_name,
            inputs=processing_inputs,
            outputs=processing_outputs,
            # experiment_config=experiment_config,
            container_arguments=['--train-split-percentage', str(self.train_split_percentage),
                                 '--validation-split-percentage', str(self.validation_split_percentage),
                                 '--test-split-percentage', str(self.test_split_percentage),
                                 '--max-seq-length', str(self.max_seq_length),
                                 '--balance-dataset', str(self.balance_dataset)],
            container_entrypoint=['python3', '/opt/ml/processing/input/code/preprocess-scikit-text-to-bert.py'],
        )        

        s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution='ShardedByS3Key')
        s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution='ShardedByS3Key')
        s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution='ShardedByS3Key')

        training_step = TrainingStep(
            StepId.Train.value,
            estimator=self.estimator,
            job_name=self.pipeline_name + '/estimator-source',
            data={
                'train': s3_input_train_data,
                'validation': s3_input_validation_data,
                'test': s3_input_test_data
            },
        )

        model = self.estimator.create_model()
        model_step = ModelStep(
            StepId.CreateModel.value,
            instance_type=deploy_instance_type,
            model=model,
            model_name=self.pipeline_name
        )

        endpoint_config_step = EndpointConfigStep(
            StepId.ConfigureEndpoint.value,
            endpoint_config_name=self.pipeline_name,
            model_name=self.pipeline_name,
            initial_instance_count=self.deploy_instance_count,
            instance_type=self.deploy_instance_type
        )
        
        deploy_step = EndpointStep(
            StepId.Deploy.value,
            endpoint_name=self.pipeline_name,
            endpoint_config_name=self.pipeline_name,
        )

        return Chain([
            processing_step, 
            training_step, 
            model_step, 
            endpoint_config_step, 
            deploy_step
        ])
    
    def execute(self, job_name=None, hyperparameters=None):
        """
        Run the training pipeline.
        
        Args:
            job_name (str, optional): Name for the training job. If one is not provided, a job name will be auto-generated. (default: None)
            hyperparameters (dict, optional): Hyperparameters for the estimator training. (default: None)
        
        Returns:
            :py:class:`~stepfunctions.workflow.Execution`: Running instance of the training pipeline.
        """
        inputs = self.input_template.copy()
        
        if hyperparameters is not None:
            inputs[StepId.Train.value]['HyperParameters'] = {
                k: str(v) for k, v in hyperparameters.items()
            }
        
        if job_name is None:
            job_name = '{base_name}-{timestamp}'.format(base_name='training-pipeline', timestamp=self._generate_timestamp())
            
        print(inputs)
        
        # Configure training and model
        inputs[StepId.Train.value]['TrainingJobName'] = 'estimator-' + job_name
        inputs[StepId.Train.value]['OutputDataConfig']['S3OutputPath'] = 's3://{s3_bucket}/{pipeline_name}/models'.format(
            s3_bucket=self.s3_bucket,
            pipeline_name=self.workflow.name
        )
        inputs[StepId.CreateModel.value]['ModelName'] = job_name

        # Configure endpoint
        inputs[StepId.ConfigureEndpoint.value]['EndpointConfigName'] = job_name
        for variant in inputs[StepId.ConfigureEndpoint.value]['ProductionVariants']:
            variant['ModelName'] = job_name
        inputs[StepId.Deploy.value]['EndpointConfigName'] = job_name
        inputs[StepId.Deploy.value]['EndpointName'] = job_name
        
        # Configure the path to model artifact
        inputs[StepId.CreateModel.value]['PrimaryContainer']['ModelDataUrl'] = '{s3_uri}/{job}/output/model.tar.gz'.format(
            s3_uri=inputs[StepId.Train.value]['OutputDataConfig']['S3OutputPath'],
            job=inputs[StepId.Train.value]['TrainingJobName']
        )
        
        return self.workflow.execute(inputs=inputs, name=job_name)

In [31]:
# Note:  If you see an error about 'TensorFlowModel' object has no attribute 'image', you are using SageMaker SDK 1.x
#        The Data Science SDK only supports 1.x at this time.
#        Waiting on this:  https://github.com/aws/aws-step-functions-data-science-sdk-python/issues/69

pipeline = TrainingPipelineWithDifferentDeployInstanceTypeAndProcessingJob(
    processor=processor,
    raw_input_data_s3_uri=raw_input_data_s3_uri,
    train_split_percentage=train_split_percentage,
    validation_split_percentage=validation_split_percentage,
    test_split_percentage=test_split_percentage,
    max_seq_length=max_seq_length,
    balance_dataset=balance_dataset,
    estimator=estimator,
    role=stepfunction_role_arn,
    bucket=bucket,
    client=sfn,
    deploy_instance_count=deploy_instance_count,
    deploy_instance_type=deploy_instance_type,    
)

# Visualize the pipeline

You can now view the workflow definition, and also visualize it as a graph. This workflow and graph represent your training pipeline. 

## View the workflow definition

In [32]:
print(pipeline.workflow.definition.to_json(pretty=True))

{
    "StartAt": "Processing Job",
    "States": {
        "Processing Job": {
            "Resource": "arn:aws:states:::sagemaker:createProcessingJob.sync",
            "Parameters": {
                "ProcessingJobName.$": "$$.Execution.Input['Processing Job'].ProcessingJobName",
                "ProcessingInputs.$": "$$.Execution.Input['Processing Job'].ProcessingInputs",
                "ProcessingOutputConfig.$": "$$.Execution.Input['Processing Job'].ProcessingOutputConfig",
                "AppSpecification.$": "$$.Execution.Input['Processing Job'].AppSpecification",
                "RoleArn.$": "$$.Execution.Input['Processing Job'].RoleArn",
                "ProcessingResources.$": "$$.Execution.Input['Processing Job'].ProcessingResources",
                "StoppingCondition.$": "$$.Execution.Input['Processing Job'].StoppingCondition"
            },
            "Type": "Task",
            "Next": "Training"
        },
        "Training": {
            "Resource": "arn:aws:states

## Visualize the workflow graph
## *Note: This only renders in Jupyter. NOT in JupyterLab.*

In [33]:
pipeline.render_graph()

## You should see a graph like this:
 
<img src="img/pipeline_created.png" width="70%" align="left">

## Create and execute the pipeline on AWS Step Functions

Create the pipeline in AWS Step Functions with [create](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/workflow.html#stepfunctions.workflow.Workflow.create).

In [34]:
# Sleeping to wait for role and policy creations
import time
time.sleep(10)

pipeline.create()

[32m[INFO] Workflow created successfully on AWS Step Functions.[0m


'arn:aws:states:us-west-2:085964654406:stateMachine:training-pipeline-2020-09-26-20-24-49'

Run the workflow with [execute](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/workflow.html#stepfunctions.workflow.Workflow.execute). A link will be provided after the following cell is executed. Following this link, you can monitor your pipeline execution on Step Functions' console.

In [35]:
execution = pipeline.execute(job_name=None,
                             hyperparameters=None)

{'Processing Job': {'ProcessingJobName': 'training-pipeline-2020-09-26-20-24-49', 'ProcessingInputs': [{'InputName': 'raw_input', 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-085964654406/amazon-reviews-pds/tsv/', 'LocalPath': '/opt/ml/processing/input/data/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'ShardedByS3Key', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-085964654406/pipeline_sklearn_processing/1601151884/code/preprocess-scikit-text-to-bert.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}], 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'bert-train', 'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-085964654406/training-pipeline-2020-09-26-20-24-49/processing/output/bert-train', 'LocalPath': '/opt/ml/processing/output/bert/train', 'S3UploadMode': 'EndOfJob'}}

In [36]:
stepfunction_arn = 'arn:aws:states:{}:{}:stateMachine:{}'.format(region, account_id, pipeline.pipeline_name)
print(stepfunction_arn)

arn:aws:states:us-west-2:085964654406:stateMachine:training-pipeline-2020-09-26-20-24-49


In [37]:
%store stepfunction_arn

Stored 'stepfunction_arn' (str)


In [38]:
stepfunction_name = pipeline.pipeline_name
print(stepfunction_name)

training-pipeline-2020-09-26-20-24-49


In [39]:
%store stepfunction_name

Stored 'stepfunction_name' (str)


## Check Pipeline Progress
_Note: This only renders in Jupyter at the moment - not in JupyterLab.  This is changing soon._

In [40]:
execution.render_progress()

## You should see a graph like this:

<img src="img/pipeline_executed.png" width="90%" align="left">

In [41]:
%%time

import time

events = execution.list_events()

while len(events) <= 5:
    print('Number of events:  {}'.format(len(events)))
    time.sleep(30)
    events = execution.list_events()

print('Number of events:  {}'.format(len(events)))

Number of events:  4
Number of events:  5
Number of events:  5
Number of events:  5
Number of events:  5
Number of events:  5
Number of events:  5
Number of events:  5
Number of events:  5
Number of events:  5
Number of events:  5
Number of events:  5
Number of events:  5
Number of events:  11
CPU times: user 72.2 ms, sys: 0 ns, total: 72.2 ms
Wall time: 6min 30s


In [42]:
execution.render_progress()

# _Wait for ^^ Number of Events ^^ to Reach At Least 6_

In [43]:
import json

processing_job_name = json.loads(events[5]['taskSucceededEventDetails']['output'])['ProcessingJobName']
print('Processing Job Name: {}'.format(processing_job_name))

print('')

processing_job_outputs = json.loads(events[5]['taskSucceededEventDetails']['output'])['ProcessingOutputConfig']['Outputs']

for output in processing_job_outputs:
    if output['OutputName'] == 'bert-train':
        train_data_s3_uri = output['S3Output']['S3Uri']
    if output['OutputName'] == 'bert-validation':
        validation_data_s3_uri = output['S3Output']['S3Uri']
    if output['OutputName'] == 'bert-test':
        test_data_s3_uri = output['S3Output']['S3Uri']

print('Processed Data Bert Train S3 URI: {}'.format(train_data_s3_uri))
print('Processed Data Bert Validation S3 URI: {}'.format(validation_data_s3_uri))
print('Processed Data Bert Test S3 URI: {}'.format(test_data_s3_uri))

Processing Job Name: training-pipeline-2020-09-26-20-24-49

Processed Data Bert Train S3 URI: s3://sagemaker-us-west-2-085964654406/training-pipeline-2020-09-26-20-24-49/processing/output/bert-train
Processed Data Bert Validation S3 URI: s3://sagemaker-us-west-2-085964654406/training-pipeline-2020-09-26-20-24-49/processing/output/bert-validation
Processed Data Bert Test S3 URI: s3://sagemaker-us-west-2-085964654406/training-pipeline-2020-09-26-20-24-49/processing/output/bert-test


In [44]:
from sagemaker.s3 import S3Downloader
print(S3Downloader.list(train_data_s3_uri))

from sagemaker.s3 import S3Downloader
print(S3Downloader.list(validation_data_s3_uri))

from sagemaker.s3 import S3Downloader
print(S3Downloader.list(test_data_s3_uri))

['s3://sagemaker-us-west-2-085964654406/training-pipeline-2020-09-26-20-24-49/processing/output/bert-train/part-algo-1-amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tfrecord', 's3://sagemaker-us-west-2-085964654406/training-pipeline-2020-09-26-20-24-49/processing/output/bert-train/part-algo-1-amazon_reviews_us_Digital_Software_v1_00.tfrecord', 's3://sagemaker-us-west-2-085964654406/training-pipeline-2020-09-26-20-24-49/processing/output/bert-train/part-algo-1-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord']
['s3://sagemaker-us-west-2-085964654406/training-pipeline-2020-09-26-20-24-49/processing/output/bert-validation/part-algo-1-amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tfrecord', 's3://sagemaker-us-west-2-085964654406/training-pipeline-2020-09-26-20-24-49/processing/output/bert-validation/part-algo-1-amazon_reviews_us_Digital_Software_v1_00.tfrecord', 's3://sagemaker-us-west-2-085964654406/training-pipeline-2020-09-26-20-24-49/processing/output/bert-validation/part-algo-1-

In [45]:
%%time

import time

events = execution.list_events()

while len(events) <= 11:
    print('Number of events:  {}'.format(len(events)))
    time.sleep(30)
    events = execution.list_events()

print('Number of events:  {}'.format(len(events)))

Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of events:  11
Number of 

In [46]:
execution.render_progress()

In [47]:
import json

training_job_name = json.loads(events[11]['taskSucceededEventDetails']['output'])['TrainingJobName']
print('Training Job Name: {}'.format(training_job_name))

print('')

trained_model_s3_uri = json.loads(events[11]['taskSucceededEventDetails']['output'])['ModelArtifacts']['S3ModelArtifacts']
print('Trained Model S3 URI: {}'.format(trained_model_s3_uri))

Training Job Name: estimator-training-pipeline-2020-09-26-20-25-01

Trained Model S3 URI: s3://sagemaker-us-west-2-085964654406/training-pipeline-2020-09-26-20-24-49/models/estimator-training-pipeline-2020-09-26-20-25-01/output/model.tar.gz


In [48]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}">Training Job</a></b>'.format(region, training_job_name)))


# Copy the Model from S3

In [49]:
!aws s3 cp $trained_model_s3_uri ./model.tar.gz

download: s3://sagemaker-us-west-2-085964654406/training-pipeline-2020-09-26-20-24-49/models/estimator-training-pipeline-2020-09-26-20-25-01/output/model.tar.gz to ./model.tar.gz


In [50]:
!mkdir -p ./model/
!tar -xvzf ./model.tar.gz -C ./model/

tensorboard/
metrics/
metrics/confusion_matrix.png
transformers/
transformers/fine-tuned/
transformers/fine-tuned/config.json
transformers/fine-tuned/tf_model.h5
code/
code/inference.py
tensorflow/
tensorflow/saved_model/
tensorflow/saved_model/0/
tensorflow/saved_model/0/saved_model.pb
tensorflow/saved_model/0/variables/
tensorflow/saved_model/0/variables/variables.data-00000-of-00001
tensorflow/saved_model/0/variables/variables.index
tensorflow/saved_model/0/assets/


# Show the Model Prediction Signature

In [51]:
!saved_model_cli show --all --dir ./model/tensorflow/saved_model/0/

2020-09-26 21:06:24.082302: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.0/lib64:/usr/local/cuda-10.0/extras/CUPTI/lib64:/usr/local/cuda-10.0/lib:/usr/local/cuda-10.0/efa/lib:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/:
2020-09-26 21:06:24.082375: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.0/lib64:/usr/local/cuda-10.0/extras/CUPTI/lib64:/usr/local/cuda-10.0/lib:/usr/local/cuda-10.0/ef

In [52]:
%%time

import time

events = execution.list_events()

while len(events) <= 24:
    print('Number of events:  {}'.format(len(events)))
    time.sleep(30)
    events = execution.list_events()

print('Number of events:  {}'.format(len(events)))

Number of events:  29
CPU times: user 2.14 ms, sys: 4.38 ms, total: 6.52 ms
Wall time: 115 ms


In [53]:
execution.render_progress()

# _Wait for ^^ Number of Events ^^ to Reach At Least 19_

In [54]:
import json

step_functions_pipeline_endpoint_name = json.loads(events[24]['taskScheduledEventDetails']['parameters'])['EndpointName']

print('Endpoint Name: {}'.format(step_functions_pipeline_endpoint_name))

Endpoint Name: training-pipeline-2020-09-26-20-25-01


In [55]:
%%time

import time

events = execution.list_events()

while len(events) <= 27:
    print('Number of events:  {}'.format(len(events)))
    time.sleep(30)
    events = execution.list_events()    

print('Number of events:  {}'.format(len(events)))

Number of events:  29
CPU times: user 4.64 ms, sys: 70 µs, total: 4.71 ms
Wall time: 72.3 ms


In [56]:
execution.render_progress()

# _Wait for ^^ Number of Events ^^ to Reach At Least 22_

In [57]:
step_functions_pipeline_endpoint_arn = json.loads(events[27]['stateExitedEventDetails']['output'])['EndpointArn']

print('Endpoint ARN: {}'.format(step_functions_pipeline_endpoint_arn))

Endpoint ARN: arn:aws:sagemaker:us-west-2:085964654406:endpoint/training-pipeline-2020-09-26-20-25-01


In [58]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}">SageMaker REST Endpoint</a></b>'.format(region, step_functions_pipeline_endpoint_name)))


# Pass Variables to the Next Notebooks(s)

In [59]:
print(step_functions_pipeline_endpoint_name)

training-pipeline-2020-09-26-20-25-01


In [60]:
%store step_functions_pipeline_endpoint_name

Stored 'step_functions_pipeline_endpoint_name' (str)


In [61]:
%store

Stored variables and their in-db values:
auto_ml_job_name                                      -> 'automl-dm-26-16-00-25'
autopilot_endpoint_name                               -> 'automl-dm-ep-26-16-21-49'
autopilot_train_s3_uri                                -> 's3://sagemaker-us-west-2-085964654406/data/amazon
balance_dataset                                       -> True
experiment_name                                       -> 'Amazon-Customer-Reviews-BERT-Experiment-160114585
firehose_arn                                          -> 'arn:aws:firehose:us-west-2:085964654406:deliverys
firehose_name                                         -> 'dsoaws-kinesis-data-firehose'
iam_kinesis_role_name                                 -> 'DSOAWS_Kinesis'
iam_kinesis_role_passed                               -> True
iam_lambda_role_name                                  -> 'DSOAWS_Lambda'
iam_lambda_role_passed                                -> True
iam_role_kinesis_arn                             

In [None]:
%%javascript
Jupyter.notebook.save_checkpoint();
Jupyter.notebook.session.delete();