In [1]:
import os
import boto3
import time
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import hvplot.pandas
from PIL import Image

import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession

# Create a SageMaker session and resolve the execution role and default S3
# bucket; later cells reference `role` and `default_bucket`.
sagemaker_session = sagemaker.session.Session()
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()
# No interpolation is needed here, so this is a plain string literal.
model_package_group_name = "CatBreedModelPackage"

# Also publish the bucket name through the process environment.
os.environ["default_bucket"] = default_bucket

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
# Local working directory for the cat-breed files (no error if it already exists).
os.makedirs("cat-breed", exist_ok=True)

# S3 prefix under the default bucket that is used as the pipeline's input data.
input_data_uri = f"s3://{default_bucket}/cat-breed/"
print(input_data_uri)

s3://sagemaker-ap-northeast-2-242201274000/cat-breed/


In [6]:
import sagemaker
import sagemaker.workflow as workflow

from sagemaker.workflow.parameters import (
    ParameterString,
    ParameterInteger
)
from sagemaker.workflow.steps import CacheConfig

# Pipeline parameters and their default values.

# S3 location of the input data; defaults to the URI built earlier in the notebook.
input_data = ParameterString(name="InputData", default_value=input_data_uri)

# Number of instances requested for the processing job.
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)

# Instance type requested for the processing job.
processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.large")

In [29]:
# Script that is submitted as the processing step's code.
preprocessing_code = "cat-breed/preprocessing.py"

# Arguments for running the preprocessing script locally.
local_input_dir, local_output_dir = "data/", "opt/ml/processing/"
local_split_rate = 0.1

In [30]:
# ! python {preprocessing_code} --base_preproc_input_dir {local_input_dir} \
#                               --base_preproc_output_dir {local_output_dir} \
#                               --test_split_rate {local_split_rate}

#### Argument Info ####
args.base_preproc_input_dir: data/
args.base_preproc_output_dir: opt/ml/processing/
args.test_split_rate: 0.1
img_path lenth: 200
Split Done!
train-Bombay Images: 90
train-Burmese Images: 90
test-Bombay Images: 10
test-Burmese Images: 10


In [38]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# Version tag of the scikit-learn processing container.
framework_version = "0.23-1"

# Processor whose instance type and count are taken from the pipeline parameters.
sklearn_processor = SKLearnProcessor(
    role=role,
    framework_version=framework_version,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    base_job_name="sklearn-cat-breed-process",
)
print('role:', role)

# Processing step: runs the preprocessing script with the input data mapped to
# /opt/ml/processing/input and declares the "train" and "test" output directories.
step_process = ProcessingStep(
    name="CatBreedProcess",
    processor=sklearn_processor,
    code=preprocessing_code,
    inputs=[
        ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
    ],
    outputs=[
        ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
        ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
    ],
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


role: arn:aws:iam::242201274000:role/service-role/AmazonSageMaker-ExecutionRole-20240923T230631


In [39]:
from sagemaker.workflow.pipeline import Pipeline

# The pipeline is named after the project prefix.
project_prefix = "Cat-Breed-Demo"
pipeline_name = project_prefix

# Single-step pipeline: the processing step plus the three parameters it uses.
pipeline = Pipeline(
    name=pipeline_name,
    steps=[step_process],
    parameters=[processing_instance_type, processing_instance_count, input_data],
)

In [40]:
import json

# Render the pipeline definition as JSON text, parse it into Python objects,
# and display the result as the cell output.
definition_json = pipeline.definition()
definition = json.loads(definition_json)
definition



{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.large'},
  {'Name': 'ProcessingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'InputData',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-ap-northeast-2-242201274000/cat-breed/'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'CatBreedProcess',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': {'Get': 'Parameters.ProcessingInstanceType'},
      'InstanceCount': {'Get': 'Parameters.ProcessingInstanceCount'},
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '366743142698.dkr.ecr.ap-northeast-2.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3',
     'ContainerEntrypoint': ['python3',
      '/opt/ml/processing/input/code/preprocessing.py']}

In [41]:
# Upsert the pipeline using the execution role, then start an execution and
# keep its handle for the inspection cells that follow.
pipeline.upsert(role_arn=role)
execution = pipeline.start()



In [42]:
# Check what describe() returns; the recorded output shows it is a dict.
type(execution.describe())

dict

In [43]:
# Print every field of the execution description, numbered in iteration order.
execution_details = execution.describe()
for idx, (key, item) in enumerate(execution_details.items()):
    print(f"{idx}. key: {key}\n   value: \n{item}\n\n")

0. key: PipelineArn
   value: 
arn:aws:sagemaker:ap-northeast-2:242201274000:pipeline/Cat-Breed-Demo


1. key: PipelineExecutionArn
   value: 
arn:aws:sagemaker:ap-northeast-2:242201274000:pipeline/Cat-Breed-Demo/execution/yexuildyuhgd


2. key: PipelineExecutionDisplayName
   value: 
execution-1731902518028


3. key: PipelineExecutionStatus
   value: 
Executing


4. key: PipelineExperimentConfig
   value: 
{'ExperimentName': 'cat-breed-demo', 'TrialName': 'yexuildyuhgd'}


5. key: CreationTime
   value: 
2024-11-18 04:01:57.963000+00:00


6. key: LastModifiedTime
   value: 
2024-11-18 04:01:57.963000+00:00


7. key: CreatedBy
   value: 
{'UserProfileArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:user-profile/d-l0dltcg6j4kj/default-20240923T230629', 'UserProfileName': 'default-20240923T230629', 'DomainId': 'd-l0dltcg6j4kj', 'IamIdentity': {'Arn': 'arn:aws:sts::242201274000:assumed-role/AmazonSageMaker-ExecutionRole-20240923T230631/SageMaker', 'PrincipalId': 'AROATQZCSE2IHHIBWPD6G

In [44]:
# Wait on the pipeline execution before inspecting its steps
# (presumably blocks until the execution finishes -- confirm in the SDK docs).
execution.wait()

In [45]:
# List the steps of this execution; the recorded output shows each entry's
# name, start/end time, status, and job metadata.
execution.list_steps()

[{'StepName': 'CatBreedProcess',
  'StartTime': datetime.datetime(2024, 11, 18, 4, 1, 59, 995000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 11, 18, 4, 4, 32, 585000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:processing-job/pipelines-yexuildyuhgd-CatBreedProcess-6SRoLJzoBB'}},
  'AttemptCount': 1}]

In [46]:
def get_proc_artifact(execution, client, kind=0):
    '''
    Return the S3 URI of one output of the pipeline execution's processing job.

    Args:
        execution: Pipeline execution object; only ``list_steps()`` is called.
        client: SageMaker client; only ``describe_processing_job()`` is called.
        kind: Which output to return.
            int -- positional index into the job's outputs
                   (in this notebook: 0 --> train, 1 --> test).
            str -- the output's ``OutputName`` (e.g. "train" or "test").

    Returns:
        The ``S3Uri`` string of the selected processing output.

    Raises:
        LookupError: if no step carries processing-job metadata, if no output
            is named ``kind``, or (as ``IndexError``) if an integer ``kind``
            is out of range.
    '''
    steps = execution.list_steps()

    # Index -1 is the first step that was executed, so scan from the end and
    # take the first step that actually ran a processing job rather than
    # assuming the last entry is one.
    proc_arn = next(
        (
            step['Metadata']['ProcessingJob']['Arn']
            for step in reversed(steps)
            if 'ProcessingJob' in (step.get('Metadata') or {})
        ),
        None,
    )
    if proc_arn is None:
        raise LookupError("no processing step found in the pipeline execution")

    # The job name is the last path component of the ARN.
    proc_job_name = proc_arn.split('/')[-1]

    response = client.describe_processing_job(ProcessingJobName=proc_job_name)
    outputs = response['ProcessingOutputConfig']['Outputs']

    if isinstance(kind, str):
        # Select by name so the result does not depend on declaration order.
        selected = next(
            (output for output in outputs if output.get('OutputName') == kind),
            None,
        )
        if selected is None:
            raise LookupError(f"processing job has no output named {kind!r}")
    else:
        selected = outputs[kind]

    return selected['S3Output']['S3Uri']



In [47]:
import boto3
# SageMaker client used to describe the processing job.
client = boto3.client("sagemaker")

# Output index 0 is the train split and index 1 is the test split.
train_preproc_dir_artifact, test_preproc_dir_artifact = (
    get_proc_artifact(execution, client, kind=output_index)
    for output_index in (0, 1)
)

print("train_preproc_dir_artifact: \n", train_preproc_dir_artifact)
print("test_preproc_dir_artifact: \n", test_preproc_dir_artifact)

train_preproc_dir_artifact: 
 s3://sagemaker-ap-northeast-2-242201274000/Cat-Breed-Demo/yexuildyuhgd/CatBreedProcess/output/train
test_preproc_dir_artifact: 
 s3://sagemaker-ap-northeast-2-242201274000/Cat-Breed-Demo/yexuildyuhgd/CatBreedProcess/output/test


In [49]:
# Persist these values with the %store magic so they survive the kernel
# (presumably restored by a follow-up notebook via `%store -r` -- confirm).
# The recorded output shows each one was stored as a str.
%store train_preproc_dir_artifact
%store test_preproc_dir_artifact
%store role 
%store default_bucket
%store model_package_group_name
%store project_prefix
%store input_data_uri

Stored 'train_preproc_dir_artifact' (str)
Stored 'test_preproc_dir_artifact' (str)
Stored 'role' (str)
Stored 'default_bucket' (str)
Stored 'model_package_group_name' (str)
Stored 'project_prefix' (str)
Stored 'input_data_uri' (str)
