# Customer container를 활용한 SageMaker Processing

References
- Run Scripts with Your Own Processing Container: https://docs.aws.amazon.com/sagemaker/latest/dg/processing-container-run-scripts.html

## Input data 준비

In [13]:
import sagemaker
import os

In [14]:
sess = sagemaker.Session()
bucket = sess.default_bucket() 

data_dir = os.path.join(os.getcwd(), 'data')
os.makedirs(data_dir, exist_ok=True)

train_dir = os.path.join(os.getcwd(), 'data/train')
os.makedirs(train_dir, exist_ok=True)

test_dir = os.path.join(os.getcwd(), 'data/test')
os.makedirs(test_dir, exist_ok=True)

raw_dir = os.path.join(os.getcwd(), 'data/raw')
os.makedirs(raw_dir, exist_ok=True)

In [26]:
import numpy as np
from tensorflow.python.keras.datasets import boston_housing
from sklearn.preprocessing import StandardScaler

(x_train, y_train), (x_test, y_test) = boston_housing.load_data()

np.save(os.path.join(raw_dir, 'x_train.npy'), x_train)
np.save(os.path.join(raw_dir, 'x_test.npy'), x_test)
np.save(os.path.join(train_dir, 'y_train.npy'), y_train)
np.save(os.path.join(test_dir, 'y_test.npy'), y_test)

s3_prefix = 'sm-processing-demo'

rawdata_s3_prefix = '{}/data/raw'.format(s3_prefix)
train_s3_prefix = '{}/data/train'.format(s3_prefix)
test_s3_prefix = '{}/data/test'.format(s3_prefix)

raw_s3 = sess.upload_data(path='./data/raw/', key_prefix=rawdata_s3_prefix)
y_train = sess.upload_data(path='./data/train/y_train.npy', key_prefix=train_s3_prefix)
y_test = sess.upload_data(path='./data/test/y_test.npy', key_prefix=test_s3_prefix)

print('raw data location: {}'.format(raw_s3))

raw data location: s3://sagemaker-us-east-1-889750940888/sm-processing-demo/data/raw


In [27]:
# !aws s3 rm s3://{bucket}/{s3_prefix} --recursive
!aws s3 ls s3://{bucket}/{s3_prefix} --recursive

2022-04-29 02:50:07        846 sm-processing-demo-29-02-50-06/input/code/preprocessing.py
2022-04-29 02:51:08        846 sm-processing-demo-29-02-51-07/input/code/preprocessing.py
2022-04-29 02:52:30      10736 sm-processing-demo/data/raw/x_test.npy
2022-04-29 02:52:30      42144 sm-processing-demo/data/raw/x_train.npy
2022-04-29 02:52:30        944 sm-processing-demo/data/test/y_test.npy
2022-04-29 02:52:30       3360 sm-processing-demo/data/train/y_train.npy


## Processing code 작성

In [12]:
%%writefile code/preprocessing.py

import glob
import numpy as np
import os
from sklearn.preprocessing import StandardScaler

if __name__=='__main__':
    
    input_files = glob.glob('{}/*.npy'.format('/opt/ml/processing/input'))
    print('\nINPUT FILE LIST: \n{}\n'.format(input_files))
    
    scaler = StandardScaler()
    
    for file in input_files:
        raw = np.load(file)
        transformed = scaler.fit_transform(raw)
        
        if 'train' in file:
            output_path = os.path.join('/opt/ml/processing/train', 'x_train.npy')
          
            np.save(output_path, transformed)
            print('SAVED TRANSFORMED TRAINING DATA FILE\n')
        else:
            output_path = os.path.join('/opt/ml/processing/test', 'x_test.npy')
            
            np.save(output_path, transformed)
            print('SAVED TRANSFORMED TEST DATA FILE\n')

Writing code/preprocessing.py


## Dockerfile 작성

In [1]:
%%writefile Dockerfile

FROM nvcr.io/nvidia/pytorch:21.08-py3

RUN pip3 install onnx pandas
ENV PYTHONUNBUFFERED=TRUE

ENTRYPOINT ["python3"]

Overwriting Dockerfile


## Build & Push

In [5]:
import boto3

account_id = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.Session().region_name
ecr_repository = 'sagemaker-processing-container'
tag = ':latest'
processing_repository_uri = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(account_id, region, ecr_repository + tag)

In [7]:
# Create ECR repository and push docker image
!docker build -t $ecr_repository .
!aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com
!aws ecr create-repository --repository-name $ecr_repository
!docker tag {ecr_repository + tag} $processing_repository_uri
!docker push $processing_repository_uri

Sending build context to Docker daemon   12.8kB
Step 1/4 : FROM nvcr.io/nvidia/pytorch:21.08-py3
21.08-py3: Pulling from nvidia/pytorch

[1B32c2132b: Pulling fs layer 
[1Bfc91ca4c: Pulling fs layer 
[1Bbfe29823: Pulling fs layer 
[1Bbb0f48c6: Pulling fs layer 
[1B937ae0b1: Pulling fs layer 
[1B47dbb869: Pulling fs layer 
[1B9a515d38: Pulling fs layer 
[1Bbefddb18: Pulling fs layer 
[1Ba5bdde0b: Pulling fs layer 
[1B32b6dcb0: Pulling fs layer 
[1Bb39618ed: Pulling fs layer 
[1B5b7dac39: Pulling fs layer 
[1B46f1ce67: Pulling fs layer 
[1B46b2b0ee: Pulling fs layer 
[10B7dbb869: Waiting fs layer 
[1B010c3f61: Pulling fs layer 
[11Ba515d38: Waiting fs layer 
[1B7fe2ac6f: Pulling fs layer 
[12Befddb18: Waiting fs layer 
[1Baf4d5a99: Pulling fs layer 
[1Baee79aa7: Pulling fs layer 
[1B9b496fe3: Pulling fs layer 
[1B23103b6c: Pulling fs layer 
[1Bff55d023: Pulling fs layer 
[1Bedee2aea: Pulling fs layer 
[1B59107317: Pulling fs layer 
[1Bbe386e50: Pulling fs layer 

## SageMaker Processing w/ SageMaker Python SDK

### Local mode for debugging

In [10]:
role = sagemaker.get_execution_role()
instance_type = 'local'

In [11]:
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

script_processor_local = ScriptProcessor(command=['python3'],
                image_uri=processing_repository_uri,
                role=role,
                instance_count=1,
                instance_type=instance_type)

In [28]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from time import gmtime, strftime 

processing_job_name = "sm-processing-demo-{}".format(strftime("%d-%H-%M-%S", gmtime()))
output_destination = 's3://{}/{}/data'.format(bucket, s3_prefix)

script_processor_local.run(code='code/preprocessing.py',
                      job_name=processing_job_name,
                      inputs=[ProcessingInput(
                        source=raw_s3,
                        destination='/opt/ml/processing/input',
                      # s3_data_distribution_type='ShardedByS3Key')], # ShardedByS3Key is not currently supported in Local Mode
                        s3_data_distribution_type='FullyReplicated')],
                      outputs=[ProcessingOutput(output_name='train',
                                                destination='{}/train'.format(output_destination),
                                                source='/opt/ml/processing/train'),
                               ProcessingOutput(output_name='test',
                                                destination='{}/test'.format(output_destination),
                                                source='/opt/ml/processing/test')])

preprocessing_job_description = script_processor_local.jobs[-1].describe()


Job Name:  sm-processing-demo-29-02-52-39
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-889750940888/sm-processing-demo/data/raw', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-889750940888/sm-processing-demo-29-02-52-39/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-889750940888/sm-processing-demo/data/train', 'LocalPath': '/opt/ml/processing/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'test', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-

### SageMaker processing

In [30]:
instance_type = 'ml.m5.xlarge'

In [31]:
script_processor = ScriptProcessor(command=['python3'],
                image_uri=processing_repository_uri,
                role=role,
                instance_count=2,
                instance_type=instance_type)

In [32]:
script_processor.run(code='code/preprocessing.py',
                      job_name=processing_job_name,
                      inputs=[ProcessingInput(
                        source=raw_s3,
                        destination='/opt/ml/processing/input',
                        s3_data_distribution_type='ShardedByS3Key')], # ShardedByS3Key is not currently supported in Local Mode
#                         s3_data_distribution_type='FullyReplicated')],
                      outputs=[ProcessingOutput(output_name='train',
                                                destination='{}/train'.format(output_destination),
                                                source='/opt/ml/processing/train'),
                               ProcessingOutput(output_name='test',
                                                destination='{}/test'.format(output_destination),
                                                source='/opt/ml/processing/test')])

preprocessing_job_description = script_processor.jobs[-1].describe()


Job Name:  sm-processing-demo-29-02-52-39
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-889750940888/sm-processing-demo/data/raw', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'ShardedByS3Key', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-889750940888/sm-processing-demo-29-02-52-39/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-889750940888/sm-processing-demo/data/train', 'LocalPath': '/opt/ml/processing/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'test', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1

## SageMaker Pipelines

In [41]:
# # input, output S3 paths
# code_location = 's3://{}/{}/code'.format(bucket, s3_prefix)
input_data = 's3://{}/{}/data/raw'.format(bucket, s3_prefix)

output_train = 's3://{}/{}/data/train'.format(bucket, s3_prefix)
output_test = 's3://{}/{}/data/test'.format(bucket, s3_prefix)

In [37]:
!aws s3 ls {code_location} --recursive


Parameter validation failed:
Invalid bucket name "{code_location}": Bucket name must match the regex "^[a-zA-Z0-9.\-_]{1,255}$" or be an ARN matching the regex "^arn:(aws).*:(s3|s3-object-lambda):[a-z\-0-9]*:[0-9]{12}:accesspoint[/:][a-zA-Z0-9\-.]{1,63}$|^arn:(aws).*:s3-outposts:[a-z\-0-9]+:[0-9]{12}:outpost[/:][a-zA-Z0-9\-]{1,63}[/:]accesspoint[/:][a-zA-Z0-9\-]{1,63}$"


In [38]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)

processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1
)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.m5.xlarge"
)

processing_input_data = ParameterString(
    name="ProcessingInputData",
    default_value=input_data,
)

In [39]:
from sagemaker.processing import ScriptProcessor

script_processor = ScriptProcessor(command=['python3'],
                image_uri=processing_repository_uri,
                role=role,
                instance_type=processing_instance_type,
                instance_count=processing_instance_count)

In [42]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

step_process = ProcessingStep(
    name="SMProcessDemo",
    processor=script_processor,
    inputs=[
        ProcessingInput(source=input_data,
                        destination='/opt/ml/processing/input'),
    ],
    outputs=[ProcessingOutput(output_name="train",
                              source='/opt/ml/processing/train',
                              destination=output_train),
             ProcessingOutput(output_name="test",
                              source='/opt/ml/processing/test',
                              destination=output_test)],
    code='code/preprocessing.py'
)

In [43]:
from sagemaker.workflow.pipeline import Pipeline

pipeline_name = 'sagemaker-pipeline-byoc-demo'
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        processing_instance_type, 
        processing_instance_count,
        processing_input_data,
    ],
    steps=[step_process],
)

In [44]:
import json

definition = json.loads(pipeline.definition())
definition

{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'ProcessingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'ProcessingInputData',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-us-east-1-889750940888/sm-processing-demo/data/raw'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'SMProcessDemo',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': {'Get': 'Parameters.ProcessingInstanceType'},
      'InstanceCount': {'Get': 'Parameters.ProcessingInstanceCount'},
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '889750940888.dkr.ecr.us-east-1.amazonaws.com/sagemaker-processing-container:latest',
     'ContainerEntrypoint': ['python3',
      '/opt/ml/processing/input/code/pre

In [45]:
pipeline.upsert(role_arn=role)

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:889750940888:pipeline/sagemaker-pipeline-byoc-demo',
 'ResponseMetadata': {'RequestId': '8a0067b5-473c-46c9-accd-c132ef37aedd',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '8a0067b5-473c-46c9-accd-c132ef37aedd',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '96',
   'date': 'Fri, 29 Apr 2022 03:08:18 GMT'},
  'RetryAttempts': 0}}

In [52]:
execution = pipeline.start(
    parameters=dict(
        ProcessingInstanceType='ml.c5.xlarge',
        ProcessingInstanceCount='2',
    )
)

In [None]:
execution.describe()
execution.wait()

In [None]:
execution.list_steps()