In [82]:
import boto3
import sagemaker
import os

# Default SageMaker session and the execution role used for everything below.
sagemaker_session = sagemaker.session.Session()
role = sagemaker.get_execution_role()

# Persist the project prefix in the IPython store before restoring the rest.
project_prefix = "Kefico-Anomaly-Detection"
%store project_prefix
    
# Restore every stored variable into this kernel, then list the store contents.
# NOTE(review): later cells use names that no visible cell assigns (e.g. `bucket`,
# `train_preproc_dir_artifact`, `train_model_artifact`); presumably they are
# restored here after being stored by earlier notebooks - confirm.
%store -r
%store

Stored 'project_prefix' (str)
Stored variables and their in-db values:
bucket                                 -> 'sagemaker-ap-northeast-2-242201274000'
cat_data_uri                           -> 's3://sagemaker-ap-northeast-2-242201274000/sagema
controller_name                        -> 'V2LC'
dog_data_uri                           -> 's3://sagemaker-ap-northeast-2-242201274000/sagema
image_uri                              -> '763104351884.dkr.ecr.ap-northeast-2.amazonaws.com
input_data_uri                         -> 's3://sagemaker-ap-northeast-2-242201274000/sagema
local_cat_dir                          -> '../data/cvd_data/cat'
local_dog_dir                          -> '../data/cvd_data/dog'
main_test_name                         -> 'HEV_P2_ACOverLoad_IG1_1'
preprocessing_code                     -> 'src/ad_parquet_preprocessing.py'
project_prefix                         -> 'Kefico-Anomaly-Detection'
sub_test_name                          -> 'Severe1_Above2.83s_1'
test_preproc_dir_a

In [83]:
# Path of the evaluation script, relative to the notebook's working directory.
evaluating_code = "src/ad_evaluating.py"

In [84]:
# Disabled: uncommenting the next line runs the evaluation script through the notebook's shell.
# ! python {evaluating_code}

In [85]:
# Create the pipeline parameters
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString)

# Pipeline parameter: instance type for the evaluation job (default ml.m4.4xlarge).
evaluating_instance_type = ParameterString(name="EvaluatingInstanceType", default_value="ml.m4.4xlarge")

# Pipeline parameter: number of instances for the evaluation job (default 1).
evaluating_instance_count = ParameterInteger(name="EvaluatingInstanceCount", default_value=1)

In [86]:
# Upper bound on job runtime, in seconds (one hour).
max_run = 60 * 60

# max_wait is only set when spot instances are requested; otherwise it stays None.
use_spot_instances = False
max_wait = 60 * 60 if use_spot_instances else None

# Choose the session: any value other than 'local' / 'local_gpu' uses a regular
# SageMaker session; the two local values switch to a LocalSession with local code.
instance_type = 'sagemaker'
if instance_type not in ('local', 'local_gpu'):
    sagemaker_session = sagemaker.session.Session()
else:
    from sagemaker.local import LocalSession
    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

In [94]:
from sagemaker.pytorch import PyTorch

# Job outputs and the uploaded source code share one S3 prefix.
estimator_output_path = f"s3://{bucket}/{project_prefix}/training_jobs"

# Container image is given explicitly (instead of framework_version / py_version).
evaluating_image_uri = "763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/pytorch-training:2.4.0-cpu-py311-ubuntu22.04-sagemaker"

# Hyperparameters passed to the entry-point script.
evaluating_hyperparameters = {"thres": 1.0, "batch_size": 1024}

# PyTorch estimator that runs src/ad_evaluating.py on the parameterized instances.
estimator = PyTorch(
    entry_point="ad_evaluating.py",
    source_dir="src",
    image_uri=evaluating_image_uri,
    role=role,
    sagemaker_session=sagemaker_session,
    instance_type=evaluating_instance_type,
    instance_count=evaluating_instance_count,
    hyperparameters=evaluating_hyperparameters,
    output_path=estimator_output_path,
    code_location=estimator_output_path,
    max_run=max_run,
    use_spot_instances=use_spot_instances,
    max_wait=max_wait,
)

In [88]:
# Show the two upstream artifacts, one per line.
print(train_preproc_dir_artifact, train_model_artifact, sep="\n")

s3://sagemaker-ap-northeast-2-242201274000/Kefico-Anomaly-Detection/85nzargh0c8j/AD-Demo-Basic-Process/output/train
s3://sagemaker-ap-northeast-2-242201274000/Kefico-Anomaly-Detection/training_jobs/pipelines-7x8zsnx15dc6-Kefico-AD-Basic-Trai-OXAk7EvaE6/output/model.tar.gz


In [89]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep


# Pipeline step that runs the evaluation estimator on the preprocessed data.
# The step is named for what it does (evaluation) so that its jobs are not
# confused with the training step that produced the model artifact.
# NOTE(review): `train_model_artifact` is printed earlier but never passed to this
# step; if ad_evaluating.py needs the trained model, it must be wired in as an
# additional input channel - confirm against the script.
step_evaluation = TrainingStep(
    name="Kefico-AD-Basic-Evaluation",
    estimator=estimator,
    inputs={
        # Delivered to the job as the "train" input channel.
        "train": TrainingInput(s3_data=train_preproc_dir_artifact),
    },
)

In [90]:
from sagemaker.workflow.pipeline import Pipeline

# The pipeline is named after the project prefix.
pipeline_name = project_prefix

# Parameters the pipeline exposes and the steps it runs.
pipeline_parameters = [evaluating_instance_type, evaluating_instance_count]
pipeline_steps = [step_evaluation]

pipeline = Pipeline(name=pipeline_name, parameters=pipeline_parameters, steps=pipeline_steps)

In [91]:
import json

# Parse the pipeline's JSON definition into a dict and display it.
raw_definition = pipeline.definition()
definition = json.loads(raw_definition)
definition



{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'EvaluatingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m4.4xlarge'},
  {'Name': 'EvaluatingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'Kefico-AD-Basic-Evaluation',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': {'Get': 'Parameters.EvaluatingInstanceType'},
      'InstanceCount': {'Get': 'Parameters.EvaluatingInstanceCount'},
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/pytorch-training:2.4.0-cpu-py311-ubuntu22.04-sagemaker',
     'ContainerEntrypoint': ['python3',
      '/opt/ml/processing/input/code/ad_evaluating.py']},
    'RoleArn': 'arn:aws:iam::242201274000:role/service-role/AmazonSageMaker-ExecutionRole-

In [92]:
# Create or update the pipeline under the execution role, then start a run.
pipeline.upsert(role_arn=role)
execution = pipeline.start()

# Display the execution's metadata (ARNs, status, timestamps).
execution.describe()



{'PipelineArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:pipeline/Kefico-Anomaly-Detection',
 'PipelineExecutionArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:pipeline/Kefico-Anomaly-Detection/execution/ncnh41u98bd0',
 'PipelineExecutionDisplayName': 'execution-1731587482346',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2024, 11, 14, 12, 31, 22, 282000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 11, 14, 12, 31, 22, 282000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:user-profile/d-l0dltcg6j4kj/default-20240923T230629',
  'UserProfileName': 'default-20240923T230629',
  'DomainId': 'd-l0dltcg6j4kj',
  'IamIdentity': {'Arn': 'arn:aws:sts::242201274000:assumed-role/AmazonSageMaker-ExecutionRole-20240923T230631/SageMaker',
   'PrincipalId': 'AROATQZCSE2IHHIBWPD6G:SageMaker'}},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:user-profile

In [93]:
# Block until the pipeline execution finishes. If waiting fails (e.g. the
# execution ends in "Failed"), print each failed step's reason before re-raising,
# so the cause is visible next to the error instead of only the waiter message.
try:
    execution.wait()
except Exception:
    failed_steps = [s for s in execution.list_steps() if s.get("StepStatus") == "Failed"]
    for failed_step in failed_steps:
        print(f"{failed_step.get('StepName')}: {failed_step.get('FailureReason', 'no failure reason reported')}")
    raise

WaiterError: Waiter PipelineExecutionComplete failed: Waiter encountered a terminal failure state: For expression "PipelineExecutionStatus" we matched expected path: "Failed"

In [None]:
# List the steps of the execution; useful for inspecting why the run above failed.
execution.list_steps()