# (1) SageMaker Processing w/ SageMaker Python SDK
- Source: https://github.com/mullue/sm-tf2/blob/master/tf-2-workflow.ipynb
- Kernel: `conda_tensorflow2_p36`

In [1]:
import os
import sagemaker
import tensorflow as tf
from time import gmtime, strftime 




In [2]:
# SageMaker session and its default S3 bucket (used by the uploads below).
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Local working directories: data/ plus train/, test/ and raw/ beneath it.
_cwd = os.getcwd()
data_dir = os.path.join(_cwd, 'data')
train_dir = os.path.join(_cwd, 'data/train')
test_dir = os.path.join(_cwd, 'data/test')
raw_dir = os.path.join(_cwd, 'data/raw')

# Create every directory; exist_ok makes the cell safe to re-run.
for _local_dir in (data_dir, train_dir, test_dir, raw_dir):
    os.makedirs(_local_dir, exist_ok=True)

In [3]:
import numpy as np
from tensorflow.keras.datasets import boston_housing  # public Keras API path
from sklearn.preprocessing import StandardScaler

# Load the Boston housing dataset as (features, labels) pairs for train and test.
(x_train, y_train), (x_test, y_test) = boston_housing.load_data()

# Features are saved under data/raw (the input of the processing job);
# labels are saved directly under data/train and data/test.
np.save(os.path.join(raw_dir, 'x_train.npy'), x_train)
np.save(os.path.join(raw_dir, 'x_test.npy'), x_test)
np.save(os.path.join(train_dir, 'y_train.npy'), y_train)
np.save(os.path.join(test_dir, 'y_test.npy'), y_test)

s3_prefix = 'sm-processing-demo'

rawdata_s3_prefix = '{}/data/raw'.format(s3_prefix)
train_s3_prefix = '{}/data/train'.format(s3_prefix)
test_s3_prefix = '{}/data/test'.format(s3_prefix)

# Upload the whole raw directory, and the two label files individually.
raw_s3 = sess.upload_data(path=raw_dir, key_prefix=rawdata_s3_prefix)
# The returned S3 URIs get their own names so the y_train / y_test arrays
# loaded above are not overwritten by strings.
y_train_s3 = sess.upload_data(path=os.path.join(train_dir, 'y_train.npy'), key_prefix=train_s3_prefix)
y_test_s3 = sess.upload_data(path=os.path.join(test_dir, 'y_test.npy'), key_prefix=test_s3_prefix)

print('raw data location: {}'.format(raw_s3))

raw data location: s3://sagemaker-us-east-1-889750940888/sm-processing-demo/data/raw


In [4]:
!aws s3 ls s3://{bucket}/{s3_prefix} --recursive

2022-01-06 07:04:19        846 sm-processing-demo-06-07-04-17/input/code/preprocessing.py
2022-01-06 11:03:06        846 sm-processing-demo/code/preprocessing.py
2022-01-06 12:15:36      10736 sm-processing-demo/data/raw/x_test.npy
2022-01-06 12:15:36      42144 sm-processing-demo/data/raw/x_train.npy
2022-01-06 11:15:47      10736 sm-processing-demo/data/test/x_test.npy
2022-01-06 12:15:36        944 sm-processing-demo/data/test/y_test.npy
2022-01-06 12:15:36       3360 sm-processing-demo/data/train/y_train.npy
2022-01-06 06:00:00        846 sm-processing-democode/preprocessing.py


In [5]:
%%writefile preprocessing.py

import glob
import numpy as np
import os
from sklearn.preprocessing import StandardScaler

if __name__=='__main__':
    
    input_files = glob.glob('{}/*.npy'.format('/opt/ml/processing/input'))
    print('\nINPUT FILE LIST: \n{}\n'.format(input_files))
    
    scaler = StandardScaler()
    
    for file in input_files:
        raw = np.load(file)
        transformed = scaler.fit_transform(raw)
        
        if 'train' in file:
            output_path = os.path.join('/opt/ml/processing/train', 'x_train.npy')
          
            np.save(output_path, transformed)
            print('SAVED TRANSFORMED TRAINING DATA FILE\n')
        else:
            output_path = os.path.join('/opt/ml/processing/test', 'x_test.npy')
            
            np.save(output_path, transformed)
            print('SAVED TRANSFORMED TEST DATA FILE\n')

Overwriting preprocessing.py


In [6]:
# from sagemaker import get_execution_role
# from sagemaker.sklearn.processing import SKLearnProcessor

# sklearn_processor = SKLearnProcessor(framework_version='0.20.0',
#                                      role=get_execution_role(),
#                                      instance_type='ml.m5.xlarge',
#                                      instance_count=2)

In [7]:
# from sagemaker.processing import ProcessingInput, ProcessingOutput
# from time import gmtime, strftime 

# processing_job_name = "sm-processing-demo-{}".format(strftime("%d-%H-%M-%S", gmtime()))
# output_destination = 's3://{}/{}/data'.format(bucket, s3_prefix)

# sklearn_processor.run(code='preprocessing.py',
#                       job_name=processing_job_name,
#                       inputs=[ProcessingInput(
#                         source=raw_s3,
#                         destination='/opt/ml/processing/input',
#                         s3_data_distribution_type='ShardedByS3Key')],
#                       outputs=[ProcessingOutput(output_name='train',
#                                                 destination='{}/train'.format(output_destination),
#                                                 source='/opt/ml/processing/train'),
#                                ProcessingOutput(output_name='test',
#                                                 destination='{}/test'.format(output_destination),
#                                                 source='/opt/ml/processing/test')])

# preprocessing_job_description = sklearn_processor.jobs[-1].describe()

# (2) SageMaker Processing w/ boto3
- Reference: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.create_processing_job

In [8]:
# Stage the processing script in S3 under "<s3_prefix>/code" and display the
# URI that upload_data returns.
code_key_prefix = '{}/code'.format(s3_prefix)
preprocessing_code = sess.upload_data(path='./preprocessing.py', key_prefix=code_key_prefix)
preprocessing_code

's3://sagemaker-us-east-1-889750940888/sm-processing-demo/code/preprocessing.py'

In [9]:
# S3 locations of the script, the raw input, and the two processed outputs,
# all built from the same bucket/prefix base.
s3_base = 's3://{}/{}'.format(bucket, s3_prefix)
code_location = s3_base + '/code'
input_data = s3_base + '/data/raw'
output_train = s3_base + '/data/train'
output_test = s3_base + '/data/test'

print('\n'.join([code_location, input_data, output_train, output_test]))

s3://sagemaker-us-east-1-889750940888/sm-processing-demo/code
s3://sagemaker-us-east-1-889750940888/sm-processing-demo/data/raw
s3://sagemaker-us-east-1-889750940888/sm-processing-demo/data/train
s3://sagemaker-us-east-1-889750940888/sm-processing-demo/data/test


In [10]:
!aws s3 ls {code_location} --recursive

2022-01-06 12:15:37        846 sm-processing-demo/code/preprocessing.py


In [11]:
import boto3

# boto3 SageMaker client; the processing-job and pipeline-execution calls
# further down go through this client.
sm_client = boto3.client('sagemaker')

In [12]:
# Processing container image URI.
# If you are not using the SageMaker Python SDK, hard-code the URI instead.
# Registry paths are documented per region, e.g. for ap-northeast-2:
# https://docs.aws.amazon.com/sagemaker/latest/dg/ecr-ap-northeast-2.html

from sagemaker import image_uris
# Resolve the image for the session's own region (the same region the default
# bucket belongs to) rather than a hard-coded one.
container_image = image_uris.retrieve(
    framework='sklearn',
    region=sess.boto_region_name,
    version='0.20.0',
    image_scope='training',
)
container_image

'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3'

In [38]:
!aws s3 ls {output_test} --recursive

2022-01-06 12:27:56      10736 sm-processing-demo/data/test/x_test.npy
2022-01-06 12:15:36        944 sm-processing-demo/data/test/y_test.npy


In [40]:
# Other Parameters
# Job name gets a UTC day-hour-minute-second suffix.
processingjob_name = "sm-processing-demo-{}".format(strftime("%d-%H-%M-%S", gmtime()))

# Input channel for the script: copied to every instance (FullyReplicated).
param_input_code = {
    'InputName': 'code',
    'S3Input': {
        'S3Uri': code_location,
        'LocalPath': '/opt/ml/processing/input/code',
        'S3DataType': 'S3Prefix',
        'S3InputMode': 'File',
        'S3DataDistributionType': 'FullyReplicated',
        'S3CompressionType': 'None'
    }
}

# Input channel for the raw data: objects are split across instances by S3 key.
param_input_data = {
    'InputName': 'input',
    'S3Input': {
        'S3Uri': input_data,
        'LocalPath': '/opt/ml/processing/input',
        'S3DataType': 'S3Prefix',
        'S3InputMode': 'File',
        'S3DataDistributionType': 'ShardedByS3Key',
        'S3CompressionType': 'None'
    }
}

# Output channels: local paths match the directories preprocessing.py writes to.
param_output_train = {
    'OutputName': 'output_train',
    'S3Output': {
        'S3Uri': output_train,
        'LocalPath': '/opt/ml/processing/train',
        'S3UploadMode': 'EndOfJob'
    }
}

param_output_test = {
    'OutputName': 'output_test',
    'S3Output': {
        'S3Uri': output_test,
        'LocalPath': '/opt/ml/processing/test',
        'S3UploadMode': 'EndOfJob'
    }
}

# Resolve the execution role at runtime instead of embedding an
# account-specific ARN in the notebook. Outside a SageMaker-managed
# environment, set `role` to your own role ARN string instead.
role = sagemaker.get_execution_role()

In [41]:
# Compute resources for the job.
cluster_config = {
    'InstanceCount': 2,
    'InstanceType': 'ml.t3.medium',
    'VolumeSizeInGB': 5,
}

# Container image and the command it runs: the script from the 'code' input.
app_specification = {
    'ImageUri': container_image,
    'ContainerEntrypoint': ['python3', '/opt/ml/processing/input/code/preprocessing.py'],
}

# Submit the processing job through the boto3 client.
response = sm_client.create_processing_job(
    ProcessingJobName=processingjob_name,
    ProcessingInputs=[param_input_code, param_input_data],
    ProcessingOutputConfig={'Outputs': [param_output_train, param_output_test]},
    ProcessingResources={'ClusterConfig': cluster_config},
    StoppingCondition={'MaxRuntimeInSeconds': 3600},
    AppSpecification=app_specification,
    RoleArn=role,
)

In [42]:
# The job name is the part of the returned ARN after the '/'.
submitted_job_name = response['ProcessingJobArn'].split('/')[1]

# Fetch and display the job's current description.
processing_job_status = sm_client.describe_processing_job(ProcessingJobName=submitted_job_name)
processing_job_status

{'ProcessingInputs': [{'InputName': 'code',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-889750940888/sm-processing-demo/code',
    'LocalPath': '/opt/ml/processing/input/code',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'input',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-889750940888/sm-processing-demo/data/raw',
    'LocalPath': '/opt/ml/processing/input',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'ShardedByS3Key',
    'S3CompressionType': 'None'}}],
 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'output_train',
    'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-889750940888/sm-processing-demo/data/train',
     'LocalPath': '/opt/ml/processing/train',
     'S3UploadMode': 'EndOfJob'},
    'AppManaged': False},
   {'OutputName': 'output_test',
    'S3Output': {

# (3) SageMaker Processing w/ SM Pipelines
- Source: https://github.com/gonsoomoon-ml/SageMaker-Pipelines-Step-By-Step/blob/main/scratch/3.1.Preprocess-Pipeline.ipynb

In [46]:
# # input, output S3 paths
# code_location = 's3://{}/{}/code'.format(bucket, s3_prefix)
input_data = 's3://{}/{}/data/raw'.format(bucket, s3_prefix)
input_data
# output_train = 's3://{}/{}/data/train'.format(bucket, s3_prefix)
# output_test = 's3://{}/{}/data/test'.format(bucket, s3_prefix)

!aws s3 ls {output_train} --recursive

2022-01-06 12:15:36       3360 sm-processing-demo/data/train/y_train.npy


In [17]:
from sagemaker.workflow.parameters import ParameterInteger, ParameterString

# Pipeline parameters with their defaults: instance count and type for the
# processing step, and the S3 location of the raw input data.
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.xlarge")
processing_input_data = ParameterString(name="ProcessingInputData", default_value=input_data)

In [18]:
from sagemaker.sklearn.processing import SKLearnProcessor

framework_version = "0.23-1"

# Instance type and count are passed as pipeline parameters, not fixed values.
processor_settings = dict(
    framework_version=framework_version,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name="sm-processing-demo",
    role=role,
)
sklearn_processor = SKLearnProcessor(**processor_settings)

print("input_data: \n", input_data)

input_data: 
 s3://sagemaker-us-east-1-889750940888/sm-processing-demo/data/raw


In [48]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep


# Processing step that runs preprocessing.py on the raw data.
# The input source is the ProcessingInputData pipeline parameter (not the
# plain `input_data` string), so a value supplied when an execution is started
# actually changes what the step reads.
step_process = ProcessingStep(
    name="SMProcessDemo",
    processor=sklearn_processor,
    inputs=[
        ProcessingInput(source=processing_input_data,
                        destination='/opt/ml/processing/input'),
    ],
    # Output sources match the directories preprocessing.py writes to.
    outputs=[ProcessingOutput(output_name="train",
                              source='/opt/ml/processing/train',
                              destination=output_train),
             ProcessingOutput(output_name="test",
                              source='/opt/ml/processing/test',
                              destination=output_test)],
    code='preprocessing.py'
)

In [49]:
from sagemaker.workflow.pipeline import Pipeline

pipeline_name = 'sagemaker-pipeline-demo'

# Parameters exposed by the pipeline, in the order they are declared to it.
pipeline_parameters = [
    processing_instance_type,
    processing_instance_count,
    processing_input_data,
]

# Single-step pipeline: only the processing step.
pipeline = Pipeline(name=pipeline_name, parameters=pipeline_parameters, steps=[step_process])

In [50]:
import json

# Render the pipeline definition (a JSON string) and parse it into a dict
# so the notebook can display it.
definition_json = pipeline.definition()
definition = json.loads(definition_json)
definition

{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'ProcessingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'ProcessingInputData',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-us-east-1-889750940888/sm-processing-demo/data/raw'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'SMProcessDemo',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': {'Get': 'Parameters.ProcessingInstanceType'},
      'InstanceCount': {'Get': 'Parameters.ProcessingInstanceCount'},
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3',
     'ContainerEntrypoint': ['python3',
      '/opt/ml/processing/input/code/pre

In [51]:
# Submit the pipeline definition to SageMaker under `role`.
# NOTE(review): "upsert" presumably creates the pipeline or updates an
# existing one of the same name — confirm against the SDK docs.
pipeline.upsert(role_arn=role)

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:889750940888:pipeline/sagemaker-pipeline-demo',
 'ResponseMetadata': {'RequestId': '62e365c4-cd3e-4af3-ab63-167a5a7d43df',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '62e365c4-cd3e-4af3-ab63-167a5a7d43df',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '91',
   'date': 'Thu, 06 Jan 2022 13:26:37 GMT'},
  'RetryAttempts': 0}}

In [52]:
# Start an execution; no parameter values are passed here.
execution = pipeline.start()

In [53]:
# Block until the execution finishes, then show its description.
# describe() has to be the last expression of the cell for the notebook to
# display it, and calling it after wait() reports the finished state.
execution.wait()
execution.describe()

In [54]:
# Show the steps of this execution (name, start/end time, status, metadata).
execution.list_steps()

[{'StepName': 'SMProcessDemo',
  'StartTime': datetime.datetime(2022, 1, 6, 13, 26, 39, 931000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2022, 1, 6, 13, 30, 55, 172000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'AttemptCount': 0,
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:889750940888:processing-job/pipelines-dw8wqbx201u4-smprocessdemo-xnsnheajek'}}}]

## Execute SM Pipelines via boto3

In [26]:
# Start an execution of the same pipeline through boto3, overriding the
# pipeline parameters. The pipeline name and input location reuse the
# variables defined above instead of repeating account-specific literals.
pipeline_execution_response = sm_client.start_pipeline_execution(
    PipelineName=pipeline_name,
    PipelineExecutionDisplayName='execution-via-boto3',
    # Parameter values are passed as strings, including the integer count.
    PipelineParameters=[
        {'Name': 'ProcessingInstanceType', 'Value': 'ml.c5.xlarge'},
        {'Name': 'ProcessingInstanceCount', 'Value': '2'},
        {'Name': 'ProcessingInputData', 'Value': input_data},
    ],
    # Optional arguments not used here:
    #   PipelineExecutionDescription='string',
    #   ClientRequestToken='string',
    #   ParallelismConfiguration={'MaxParallelExecutionSteps': 123},
)

In [27]:
# Display the raw response returned by start_pipeline_execution.
pipeline_execution_response

{'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:889750940888:pipeline/sagemaker-pipeline-demo/execution/ccojvcn2s7n0',
 'ResponseMetadata': {'RequestId': '0992c1f3-6bc8-49a9-8915-d6c997fd5af5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '0992c1f3-6bc8-49a9-8915-d6c997fd5af5',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '123',
   'date': 'Thu, 06 Jan 2022 12:20:40 GMT'},
  'RetryAttempts': 0}}