In [99]:
# 1. Load libraries and restore stored variables
import boto3
import sagemaker
import os

# Default SageMaker session and the IAM execution role; the role is later passed
# to the estimator and to pipeline.upsert().
sagemaker_session = sagemaker.session.Session()
role = sagemaker.get_execution_role()

# `%store -r` restores every variable persisted with %store (presumably by an
# earlier notebook in this project); the bare `%store` then lists them.
%store -r
%store

Stored variables and their in-db values:
bucket                                 -> 'sagemaker-ap-northeast-2-242201274000'
cat_data_uri                           -> 's3://sagemaker-ap-northeast-2-242201274000/sagema
dog_data_uri                           -> 's3://sagemaker-ap-northeast-2-242201274000/sagema
input_data_uri                         -> 's3://sagemaker-ap-northeast-2-242201274000/sagema
local_cat_dir                          -> '../data/cvd_data/cat'
local_dog_dir                          -> '../data/cvd_data/dog'
preprocessing_code                     -> 'src/cvd_preprocessing.py'
project_prefix                         -> 'sagemaker-catvsdog-pipeline-base'
test_preproc_dir_artifact              -> 's3://sagemaker-ap-northeast-2-242201274000/sagema
train_preproc_dir_artifact             -> 's3://sagemaker-ap-northeast-2-242201274000/sagema


In [100]:
# Path of the training script; the estimator refers to the same file through
# entry_point='cvd_training.py' and source_dir='src'.
train_code = 'src/cvd_training.py'

In [101]:
# ! python {train_code} --epoch 3

In [102]:
# Create the pipeline parameters
from sagemaker.workflow.parameters import ParameterInteger, ParameterString

# Instance type used by the training step.
training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge",
)

# Number of instances used by the training step.
training_instance_count = ParameterInteger(
    name="TrainingInstanceCount",
    default_value=1,
)

# S3 location of the input data; defaults to the URI restored via %store.
input_data = ParameterString(
    name="InputData",
    default_value=input_data_uri,
)

In [103]:
# Display the restored input data URI (used as the InputData parameter default).
input_data_uri

's3://sagemaker-ap-northeast-2-242201274000/sagemaker-catvsdog-pipeline-base/input'

In [104]:
# Upper bound on training runtime, in seconds (one hour).
max_run = 60 * 60

# max_wait is only given a value (one hour, in seconds) when spot instances are
# requested; otherwise it stays None.
use_spot_instances = False
max_wait = 60 * 60 if use_spot_instances else None

In [105]:
# Session selector: set instance_type to 'local' or 'local_gpu' to get a
# LocalSession; any other value (such as 'sagemaker') yields a regular Session.
instance_type = 'sagemaker'
if instance_type not in ('local', 'local_gpu'):
    sagemaker_session = sagemaker.session.Session()
else:
    from sagemaker.local import LocalSession

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

In [106]:
# Build the estimator
from sagemaker.pytorch import PyTorch

# Passed as both output_path and code_location below.
estimator_output_path = f"s3://{bucket}/{project_prefix}/training_jobs"

estimator = PyTorch(
    # The training container is pinned through image_uri rather than through
    # framework_version / py_version.
    image_uri="763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/pytorch-training:2.4.0-cpu-py311-ubuntu22.04-sagemaker",
    source_dir="src",
    entry_point="cvd_training.py",
    hyperparameters={"epochs": 5},
    role=role,
    sagemaker_session=sagemaker_session,
    # Instance settings come from the pipeline parameters.
    instance_type=training_instance_type,
    instance_count=training_instance_count,
    output_path=estimator_output_path,
    code_location=estimator_output_path,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
)

In [107]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

# Single training step; its "train" channel points at train_preproc_dir_artifact.
step_train = TrainingStep(
    name="CvD-Basic-Train_v1",
    estimator=estimator,
    inputs={"train": TrainingInput(s3_data=train_preproc_dir_artifact)},
)

In [108]:
from sagemaker.workflow.pipeline import Pipeline

# The pipeline is named after the project prefix and contains only the training step.
pipeline_name = project_prefix
pipeline = Pipeline(
    name=pipeline_name,
    steps=[step_train],
    parameters=[training_instance_type, training_instance_count, input_data],
)

In [109]:
import json

# pipeline.definition() is parsed with json.loads so the cell displays the
# pipeline definition as a Python object.
definition = json.loads(pipeline.definition())
definition



{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'TrainingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'InputData',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-ap-northeast-2-242201274000/sagemaker-catvsdog-pipeline-base/input'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'CvD-Basic-Train_v1',
   'Type': 'Training',
   'Arguments': {'AlgorithmSpecification': {'TrainingInputMode': 'File',
     'TrainingImage': '763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/pytorch-training:2.4.0-cpu-py311-ubuntu22.04-sagemaker'},
    'OutputDataConfig': {'S3OutputPath': 's3://sagemaker-ap-northeast-2-242201274000/sagemaker-catvsdog-pipeline-base/training_jobs'},
    'StoppingCondition': {'MaxRuntimeInSeconds': 3600},
    'ResourceConfig': {'

In [110]:
# Register the pipeline under the execution role (upsert), then start a run;
# `execution` is the handle used by the cells that follow.
pipeline.upsert(role_arn=role)
execution = pipeline.start()



In [111]:
# Show the current description of the pipeline execution (status, ARNs, timestamps).
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:pipeline/sagemaker-catvsdog-pipeline-base',
 'PipelineExecutionArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:pipeline/sagemaker-catvsdog-pipeline-base/execution/13cwhypkvokr',
 'PipelineExecutionDisplayName': 'execution-1731399178441',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2024, 11, 12, 8, 12, 58, 385000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 11, 12, 8, 12, 58, 385000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:user-profile/d-l0dltcg6j4kj/default-20240923T230629',
  'UserProfileName': 'default-20240923T230629',
  'DomainId': 'd-l0dltcg6j4kj',
  'IamIdentity': {'Arn': 'arn:aws:sts::242201274000:assumed-role/AmazonSageMaker-ExecutionRole-20240923T230631/SageMaker',
   'PrincipalId': 'AROATQZCSE2IHHIBWPD6G:SageMaker'}},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:ap-northeast-2:24220127400

In [None]:
# Wait for the pipeline execution before reading its results in later cells.
execution.wait()

In [None]:
# List the steps of the execution; get_train_artifact reads the job ARN from this list.
execution.list_steps()

In [None]:
def get_train_artifact(execution, client, job_type,  kind=0):
    '''
    Return the S3 URI of the model artifact produced by a pipeline execution.

    The steps returned by ``execution.list_steps()`` are searched for the first
    one whose ``Metadata`` contains ``job_type``; the job name is the last
    '/'-separated segment of that step's ARN and is passed to
    ``client.describe_training_job``.

    Args:
        execution: object exposing ``list_steps()`` that returns a list of step
            dicts (e.g. the handle returned by ``pipeline.start()``).
        client: object exposing ``describe_training_job(TrainingJobName=...)``
            (e.g. ``boto3.client("sagemaker")``).
        job_type: metadata key identifying the step, e.g. ``'TrainingJob'``.
        kind: unused; kept so existing calls that pass it keep working.

    Returns:
        The value of ``response['ModelArtifacts']['S3ModelArtifacts']``.

    Raises:
        KeyError: if no step carries ``job_type`` metadata (for example the
            step has not started yet).
    '''
    steps = execution.list_steps()

    # Do not assume the wanted step is the first entry: look for the step that
    # actually carries the requested job metadata.
    job_arn = next(
        (
            step['Metadata'][job_type]['Arn']
            for step in steps
            if job_type in (step.get('Metadata') or {})
        ),
        None,
    )
    if job_arn is None:
        raise KeyError(f"no pipeline step with '{job_type}' metadata was found")

    train_job_name = job_arn.split('/')[-1]
    response = client.describe_training_job(TrainingJobName=train_job_name)
    return response['ModelArtifacts']['S3ModelArtifacts']

# Create the SageMaker client before using it: `client` is otherwise only
# assigned in a later cell, which makes this call fail with NameError on a
# fresh kernel (Restart & Run All).
client = boto3.client("sagemaker")
train_model_artifact = get_train_artifact(execution, client,job_type='TrainingJob', kind=0)

In [None]:
import boto3
# SageMaker API client passed to get_train_artifact for describe_training_job.
client = boto3.client("sagemaker")

# Look up the model artifact URI of the training step and print it.
train_model_artifact = get_train_artifact(execution, client,job_type='TrainingJob', kind=0)
print(" train_model_artifact: \n", train_model_artifact)

In [None]:
# NOTE(review): this cell redefines get_train_artifact, which is already defined
# in an earlier cell; keep a single definition placed before its first use.
def get_train_artifact(execution, client, job_type,  kind=0):
    '''
    Return the S3 URI of the model artifact produced by a pipeline execution.

    The steps returned by ``execution.list_steps()`` are searched for the first
    one whose ``Metadata`` contains ``job_type``; the job name is the last
    '/'-separated segment of that step's ARN and is passed to
    ``client.describe_training_job``.

    Args:
        execution: object exposing ``list_steps()`` that returns a list of step
            dicts (e.g. the handle returned by ``pipeline.start()``).
        client: object exposing ``describe_training_job(TrainingJobName=...)``
            (e.g. ``boto3.client("sagemaker")``).
        job_type: metadata key identifying the step, e.g. ``'TrainingJob'``.
        kind: unused; kept so existing calls that pass it keep working.

    Returns:
        The value of ``response['ModelArtifacts']['S3ModelArtifacts']``.

    Raises:
        KeyError: if no step carries ``job_type`` metadata (for example the
            step has not started yet).
    '''
    listed_steps = execution.list_steps()

    # Pick the step that actually carries the requested job metadata instead of
    # assuming it is the first entry of the list.
    matching_arns = [
        step['Metadata'][job_type]['Arn']
        for step in listed_steps
        if job_type in (step.get('Metadata') or {})
    ]
    if not matching_arns:
        raise KeyError(f"no pipeline step with '{job_type}' metadata was found")

    train_job_name = matching_arns[0].split('/')[-1]
    response = client.describe_training_job(TrainingJobName=train_job_name)
    train_model_artifact = response['ModelArtifacts']['S3ModelArtifacts']

    return train_model_artifact

In [None]:
# Read back the container image URI configured on the estimator and print it.
image_uri = estimator.image_uri
# help(estimator)
print("image_uri: \n", image_uri)