In [1]:
%store -r
%store

Stored variables and their in-db values:
bucket                     -> 'sagemaker-ap-northeast-2-242201274000'
cat_data_uri               -> 's3://sagemaker-ap-northeast-2-242201274000/sagema
dog_data_uri               -> 's3://sagemaker-ap-northeast-2-242201274000/sagema
input_data_uri             -> 's3://sagemaker-ap-northeast-2-242201274000/sagema
project_prefix             -> 'sagemaker-catvsdog-pipeline-base'


In [2]:
import boto3
import sagemaker
import pandas as pd
from IPython.display import display as dp

region = boto3.Session().region_name
sagemaker_session = sagemaker.session.Session()
role = sagemaker.get_execution_role()
print('SG Role: ', role)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
SG Role:  arn:aws:iam::242201274000:role/service-role/AmazonSageMaker-ExecutionRole-20240923T230631


In [3]:
# 내 로컬에서 원본 데이터 확인
import os

local_data_dir = '../data/cvd_data'
local_cat_dir = f'{local_data_dir}/cat'
local_dog_dir = f'{local_data_dir}/dog'

print("Total Cat Images: ", len(os.listdir(local_cat_dir)))
print("Total Dog Images: ", len(os.listdir(local_dog_dir)))

Total Cat Images:  1000
Total Dog Images:  1000


In [4]:
# 전처리에 사용되는 스크립트
preprocessing_code = 'src/cvd_preprocessing.py'
%store preprocessing_code
%store local_cat_dir
%store local_dog_dir

Stored 'preprocessing_code' (str)
Stored 'local_cat_dir' (str)
Stored 'local_dog_dir' (str)


In [5]:
base_output_dir = 'opt/ml/processing/output' 

# 도커 컨테이너의 입력 폴더와 비슷한 환경 기술
base_preproc_input_dir = 'opt/ml/processing/input'
os.makedirs(base_preproc_input_dir, exist_ok=True)

# 출력 훈련 폴더를 기술 합니다.
base_preproc_output_train_dir = 'opt/ml/processing/output/train/'
os.makedirs(base_preproc_output_train_dir, exist_ok=True)

# 출력 테스트 폴더를 기술 합니다.
base_preproc_output_test_dir = 'opt/ml/processing/output/test/'
os.makedirs(base_preproc_output_test_dir, exist_ok=True)


split_rate = 0.9

In [6]:
! cp -r {local_cat_dir} {base_preproc_input_dir}
! cp -r {local_dog_dir} {base_preproc_input_dir}

In [7]:
! python {preprocessing_code} --base_preproc_input_dir {base_preproc_input_dir} \
                              --base_output_dir {base_output_dir} \
                              --split_rate {split_rate}

#### Argument Info ####
args.base_output_dir: opt/ml/processing/output
args.base_preproc_input_dir: opt/ml/processing/input
args.split_rate: 0.9
Dataset Split Complete!
Before: Cat: 1000, Dog: 1000
After: <Train> Cat: 993, Dog: 994
        <Test> Cat: 193, Dog: 194


In [8]:
# Pipeline Parameter 정의
from sagemaker.workflow.parameters import ParameterInteger, ParameterString

processing_instance_count = ParameterInteger(
    name='ProcessingInstanceCount',
    default_value=1
)

processing_instance_type = ParameterString(
    name='ProcessingInstanceType',
    default_value="ml.m5.xlarge"
)

input_data = ParameterString(
    name='InputData',
    default_value=input_data_uri
)

In [9]:
# Pipeline Processor 정의

In [10]:
# from sagemaker.pytorch.processing import PyTorchProcessor

# pytorch_processor = PyTorchProcessor(
#     framework_version='1.8',
#     role=role,
#     instance_type=processing_instance_type,
#     instance_count=processing_instance_count,
#     base_job_name='CvD-PyTorch-Preprocessing'
# )

In [11]:
from sagemaker.sklearn.processing import SKLearnProcessor

framework_version = '1.0-1'

sklearn_processor = SKLearnProcessor(
    framework_version=framework_version,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name='sklearn-CvD-process',
    role=role
)

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is interpreted in pipeline execution time only. As the function needs to evaluate the argument value in SDK compile time, the default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


In [12]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

step_process = ProcessingStep(
    name='CvD-Basic-Process',
    processor=sklearn_processor,
    inputs = [ProcessingInput(source=input_data_uri, destination='/opt/ml/processing/input')],
    outputs = [ProcessingOutput(output_name='train', source='/opt/ml/processing/output/train'),
              ProcessingOutput(output_name='test', source='/opt/ml/processing/output/test')],
    job_arguments=["--split_rate", f"{split_rate}"],
    code = preprocessing_code
)

In [13]:
from sagemaker.workflow.pipeline import Pipeline

pipeline_name = project_prefix
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        processing_instance_type,
        processing_instance_count,
        input_data
    ],
    steps=[step_process]
)

In [14]:
import json

definition = json.loads(pipeline.definition())
definition



{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'ProcessingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'InputData',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-ap-northeast-2-242201274000/sagemaker-catvsdog-pipeline-base/input'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'CvD-Basic-Process',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': {'Get': 'Parameters.ProcessingInstanceType'},
      'InstanceCount': {'Get': 'Parameters.ProcessingInstanceCount'},
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '366743142698.dkr.ecr.ap-northeast-2.amazonaws.com/sagemaker-scikit-learn:1.0-1-cpu-py3',
     'ContainerArguments': ['--split_rate', '0.9'],
     'Containe

In [15]:
# Pipeline 등록, 실행
pipeline.upsert(role_arn=role)
execution = pipeline.start()



In [16]:
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:pipeline/sagemaker-catvsdog-pipeline-base',
 'PipelineExecutionArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:pipeline/sagemaker-catvsdog-pipeline-base/execution/oplfrhj1u8qa',
 'PipelineExecutionDisplayName': 'execution-1731398902199',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2024, 11, 12, 8, 8, 22, 143000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 11, 12, 8, 8, 22, 143000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:user-profile/d-l0dltcg6j4kj/default-20240923T230629',
  'UserProfileName': 'default-20240923T230629',
  'DomainId': 'd-l0dltcg6j4kj',
  'IamIdentity': {'Arn': 'arn:aws:sts::242201274000:assumed-role/AmazonSageMaker-ExecutionRole-20240923T230631/SageMaker',
   'PrincipalId': 'AROATQZCSE2IHHIBWPD6G:SageMaker'}},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:

In [17]:
execution.wait()

In [18]:
execution.list_steps()

[{'StepName': 'CvD-Basic-Process',
  'StartTime': datetime.datetime(2024, 11, 12, 8, 8, 23, 82000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 11, 12, 8, 10, 57, 179000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:processing-job/pipelines-oplfrhj1u8qa-CvD-Basic-Process-ApF0W168nu'}},
  'AttemptCount': 1}]

In [23]:
# 전처리 후 결과 파일 경로 추출
import boto3
from src.my_sg_utils import get_proc_artifact


client = boto3.client("sagemaker")

train_preproc_dir_artifact = get_proc_artifact(execution, client, kind=0)
test_preproc_dir_artifact = get_proc_artifact(execution, client, kind=1)

print('output-train: ', train_preproc_dir_artifact)
print('output-test : ', test_preproc_dir_artifact)

output-train:  s3://sagemaker-ap-northeast-2-242201274000/sagemaker-catvsdog-pipeline-base/oplfrhj1u8qa/CvD-Basic-Process/output/train
output-test :  s3://sagemaker-ap-northeast-2-242201274000/sagemaker-catvsdog-pipeline-base/oplfrhj1u8qa/CvD-Basic-Process/output/test


In [24]:
%store train_preproc_dir_artifact
%store test_preproc_dir_artifact

Stored 'train_preproc_dir_artifact' (str)
Stored 'test_preproc_dir_artifact' (str)
