In [45]:
# 1. Load libraries & restore stored variables
import boto3
import sagemaker
import os

# Create a SageMaker session and look up the execution role. `role` is later
# passed to the estimator and to pipeline.upsert(); the session is re-created
# in a later cell before it is handed to the estimator.
sagemaker_session = sagemaker.session.Session()
role = sagemaker.get_execution_role()

# Persist the project prefix with the %store magic.
project_prefix = "Kefico-Anomaly-Detection"
%store project_prefix
    
# Restore every variable previously saved with %store, then list what is stored.
# Names such as `bucket` used further down are not assigned in the visible cells;
# presumably they come from this restore -- verify against the upstream notebook.
%store -r
%store

Stored 'project_prefix' (str)
Stored variables and their in-db values:
bucket                                 -> 'sagemaker-ap-northeast-2-242201274000'
cat_data_uri                           -> 's3://sagemaker-ap-northeast-2-242201274000/sagema
controller_name                        -> 'V2LC'
dog_data_uri                           -> 's3://sagemaker-ap-northeast-2-242201274000/sagema
input_data_uri                         -> 's3://sagemaker-ap-northeast-2-242201274000/sagema
local_cat_dir                          -> '../data/cvd_data/cat'
local_dog_dir                          -> '../data/cvd_data/dog'
main_test_name                         -> 'HEV_P2_ACOverLoad_IG1_1'
preprocessing_code                     -> 'src/ad_parquet_preprocessing.py'
project_prefix                         -> 'Kefico-Anomaly-Detection'
sub_test_name                          -> 'Severe1_Above2.83s_1'
test_preproc_dir_artifact              -> 's3://sagemaker-ap-northeast-2-242201274000/sagema
train_preproc_dir_

In [46]:
# Script path interpolated into the (commented-out) local `python {train_code}` run below.
# NOTE(review): the estimator further down uses entry_point='ad_training.py' with
# source_dir='src', not this file -- confirm which script is the intended one.
train_code = "src/ad_train_and_evaluate.py"

In [47]:
# ! python {train_code} --learning_rate 1e-4\
#                       --num_epochs 2

In [48]:
# Create the pipeline parameters
from sagemaker.workflow.parameters import ParameterInteger, ParameterString

# Exposed in the pipeline definition as the string parameter 'TrainingInstanceType'.
training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m4.4xlarge",
)

# Exposed in the pipeline definition as the integer parameter 'TrainingInstanceCount'.
training_instance_count = ParameterInteger(
    name="TrainingInstanceCount",
    default_value=1,
)

In [49]:
# Runtime limit handed to the estimator as max_run (3600; the pipeline
# definition reports it as MaxRuntimeInSeconds).
max_run = 1 * 60 * 60

# Spot training is switched off here; max_wait only gets a value when it is on.
use_spot_instances = False
max_wait = 1 * 60 * 60 if use_spot_instances else None

# Session selection: 'local' / 'local_gpu' build a LocalSession with local_code
# enabled, any other value builds a regular SageMaker session.
instance_type = 'sagemaker'
if instance_type not in ('local', 'local_gpu'):
    sagemaker_session = sagemaker.session.Session()
else:
    from sagemaker.local import LocalSession
    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

In [50]:
from sagemaker.pytorch import PyTorch

# One S3 prefix, used as both output_path and code_location of the estimator.
estimator_output_path = f"s3://{bucket}/{project_prefix}/training_jobs"

estimator = PyTorch(
    # --- container and code ---
    # framework_version / py_version stay commented out; the training image is
    # given explicitly through image_uri instead.
    # framework_version='2.2.0',
    # py_version='py310',
    image_uri="763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/pytorch-training:2.4.0-cpu-py311-ubuntu22.04-sagemaker",
    source_dir="src",
    entry_point="ad_training.py",
    hyperparameters={"num_epochs": 2, "batch_size": 1024},
    # --- S3 locations ---
    output_path=estimator_output_path,
    code_location=estimator_output_path,
    # --- compute (type and count are pipeline parameters) ---
    instance_type=training_instance_type,
    instance_count=training_instance_count,
    max_run=max_run,
    use_spot_instances=use_spot_instances,
    max_wait=max_wait,
    # --- identity / session ---
    role=role,
    sagemaker_session=sagemaker_session,
)

In [51]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

# Training step with a single "train" input channel.
# `train_preproc_dir_artifact` is not assigned in the visible cells; presumably
# it is restored by `%store -r` in the first cell -- verify.
step_train = TrainingStep(
    name="Kefico-AD-Basic-Train",
    estimator=estimator,
    inputs=dict(train=TrainingInput(s3_data=train_preproc_dir_artifact)),
)

In [52]:
from sagemaker.workflow.pipeline import Pipeline

# The pipeline takes the project prefix as its name, declares the two
# instance parameters, and contains only the training step.
pipeline_name = project_prefix
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[training_instance_type, training_instance_count],
    steps=[step_train],
)

In [53]:
import json

# Parse the pipeline definition (a JSON string) into a dict and show it as the cell output.
definition = json.loads(pipeline.definition())
definition



{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m4.4xlarge'},
  {'Name': 'TrainingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'Kefico-AD-Basic-Train',
   'Type': 'Training',
   'Arguments': {'AlgorithmSpecification': {'TrainingInputMode': 'File',
     'TrainingImage': '763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/pytorch-training:2.4.0-cpu-py311-ubuntu22.04-sagemaker'},
    'OutputDataConfig': {'S3OutputPath': 's3://sagemaker-ap-northeast-2-242201274000/Kefico-Anomaly-Detection/training_jobs'},
    'StoppingCondition': {'MaxRuntimeInSeconds': 3600},
    'ResourceConfig': {'VolumeSizeInGB': 30,
     'InstanceCount': {'Get': 'Parameters.TrainingInstanceCount'},
     'InstanceType': {'Get': 'Parameters.TrainingInstanceType'}},


In [54]:
# Register the pipeline under the execution role, then start an execution.
# `execution` is the handle the following cells call describe/wait/list_steps on.
pipeline.upsert(role_arn=role)
execution = pipeline.start()



In [55]:
# Show the execution's description (ARNs, status, timestamps, creator) as the cell output.
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:pipeline/Kefico-Anomaly-Detection',
 'PipelineExecutionArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:pipeline/Kefico-Anomaly-Detection/execution/7x8zsnx15dc6',
 'PipelineExecutionDisplayName': 'execution-1731583018181',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2024, 11, 14, 11, 16, 58, 121000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 11, 14, 11, 16, 58, 121000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:user-profile/d-l0dltcg6j4kj/default-20240923T230629',
  'UserProfileName': 'default-20240923T230629',
  'DomainId': 'd-l0dltcg6j4kj',
  'IamIdentity': {'Arn': 'arn:aws:sts::242201274000:assumed-role/AmazonSageMaker-ExecutionRole-20240923T230631/SageMaker',
   'PrincipalId': 'AROATQZCSE2IHHIBWPD6G:SageMaker'}},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:user-profile

In [56]:
# Wait for the pipeline execution before inspecting its steps in the next cell.
execution.wait()

In [57]:
# List the execution's steps (name, start/end time, status, job metadata) as the cell output.
execution.list_steps()

[{'StepName': 'Kefico-AD-Basic-Train',
  'StartTime': datetime.datetime(2024, 11, 14, 11, 16, 58, 783000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 11, 14, 11, 20, 55, 144000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'TrainingJob': {'Arn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:training-job/pipelines-7x8zsnx15dc6-Kefico-AD-Basic-Trai-OXAk7EvaE6'}},
  'AttemptCount': 1}]

In [62]:
def get_train_artifact(execution, client, job_type,  kind=0):
    '''
    Return the S3 URI of the model artifact produced by a pipeline training step.

    The step is found by looking for `job_type` in each step's 'Metadata'
    dict instead of assuming it sits at index 0 of `list_steps()`, so the
    lookup also works when the pipeline contains other steps. If several
    steps carry `job_type`, the first one in the returned list is used.

    Args:
        execution: pipeline execution object; only `list_steps()` is called on it.
        client: SageMaker client; only `describe_training_job()` is called on it.
        job_type: key to look for in a step's 'Metadata' (e.g. 'TrainingJob').
        kind: unused; accepted only so existing calls that pass it keep working.

    Returns:
        The 'S3ModelArtifacts' value from the training job description.

    Raises:
        KeyError: if no step exposes `job_type` in its metadata.
    '''
    response = execution.list_steps()
    # Echo the raw step list, as the original implementation did.
    print(response)

    # `Metadata` may be absent or empty on some steps, hence the `or {}` guard.
    job_arn = next(
        (
            step['Metadata'][job_type]['Arn']
            for step in response
            if job_type in (step.get('Metadata') or {})
        ),
        None,
    )
    if job_arn is None:
        raise KeyError(f"no pipeline step has '{job_type}' metadata")

    # The job name is the last path segment of the ARN.
    train_job_name = job_arn.split('/')[-1]
    description = client.describe_training_job(TrainingJobName=train_job_name)
    train_model_artifact = description['ModelArtifacts']['S3ModelArtifacts']

    return train_model_artifact

# SageMaker API client used to describe the training job.
client = boto3.client("sagemaker")

# Resolve the model artifact URI from the pipeline's training step and print it.
train_model_artifact = get_train_artifact(
    execution=execution,
    client=client,
    job_type="TrainingJob",
    kind=0,
)
print('train_model_artifact: ', train_model_artifact)

[{'StepName': 'Kefico-AD-Basic-Train', 'StartTime': datetime.datetime(2024, 11, 14, 11, 16, 58, 783000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 11, 14, 11, 20, 55, 144000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'TrainingJob': {'Arn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:training-job/pipelines-7x8zsnx15dc6-Kefico-AD-Basic-Trai-OXAk7EvaE6'}}, 'AttemptCount': 1}]
train_model_artifact:  s3://sagemaker-ap-northeast-2-242201274000/Kefico-Anomaly-Detection/training_jobs/pipelines-7x8zsnx15dc6-Kefico-AD-Basic-Trai-OXAk7EvaE6/output/model.tar.gz


In [63]:
# Read the training image URI back from the estimator and print it.
image_uri = estimator.image_uri
print("image_uri: \n", image_uri)

image_uri: 
 763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/pytorch-training:2.4.0-cpu-py311-ubuntu22.04-sagemaker


In [64]:
# Persist the model artifact URI and the image URI so they can be restored with `%store -r`.
%store train_model_artifact
%store image_uri

Stored 'train_model_artifact' (str)
Stored 'image_uri' (str)
