In [1]:
%store -r
%store

Stored variables and their in-db values:
bucket                                 -> 'sagemaker-ap-northeast-2-242201274000'
controller_name                        -> 'V2LC'
default_bucket                         -> 'sagemaker-ap-northeast-2-242201274000'
image_uri                              -> '763104351884.dkr.ecr.ap-northeast-2.amazonaws.com
input_data_uri                         -> 's3://sagemaker-ap-northeast-2-242201274000/cat-br
main_test_name                         -> 'HEV_P2_ACOverLoad_IG1_1'
model_package_group_name               -> 'CatBreedModelPackage'
preprocessing_code                     -> 'src/ad_parquet_preprocessing.py'
project_prefix                         -> 'Cat-Breed-Demo'
role                                   -> 'arn:aws:iam::242201274000:role/service-role/Amazo
sub_test_name                          -> 'Severe1_Above2.83s_1'
test_preproc_dir_artifact              -> 's3://sagemaker-ap-northeast-2-242201274000/Cat-Br
train_model_artifact                   ->

In [2]:
import sagemaker
import sagemaker.workflow as workflow

from sagemaker.workflow.parameters import ParameterString, ParameterInteger

training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.large"
)

training_instance_count = ParameterInteger(
    name="TrainingInstanceCount",
    default_value=1
)

In [5]:
from sagemaker import image_uris, model_uris, script_uris

model_id, model_version = "pytorch-ic-resnet18", "2.0.0"

# JumpStart 용 기본 모델의 S3 Uri 검색
base_model_uri = model_uris.retrieve(
    model_id=model_id,
    model_version=model_version,
    model_scope="training"
)

# 학습 스크립트 검색
training_script_uri = script_uris.retrieve(
    model_id=model_id,
    model_version=model_version,
    script_scope="training"
)

# 학습 Docker Image 검색
model_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    image_scope="training",
    model_id=model_id,
    model_version=model_version,
    instance_type=training_instance_type
)

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is interpreted in pipeline execution time only. As the function needs to evaluate the argument value in SDK compile time, the default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


In [6]:
print(type(base_model_uri), base_model_uri)

<class 'str'> s3://jumpstart-cache-prod-ap-northeast-2/pytorch-training/v2.0.0/train-pytorch-ic-resnet18.tar.gz


In [7]:
print(type(training_script_uri), training_script_uri)

<class 'str'> s3://jumpstart-cache-prod-ap-northeast-2/source-directory-tarballs/pytorch/transfer_learning/ic/v2.0.0/sourcedir.tar.gz


In [8]:
print(type(model_image_uri), model_image_uri)

<class 'str'> 763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/pytorch-training:1.10.0-cpu-py38


In [12]:
model_path = f"s3://{default_bucket}/CatBreedTrain"

from sagemaker.estimator import Estimator
from sagemaker import hyperparameters

# JumpStart 용 기본 변수 가져오기
default_hyperparameters = hyperparameters.retrieve_default(
    model_id=model_id,
    model_version=model_version
)

print(default_hyperparameters)

{'epochs': '3', 'adam-learning-rate': '0.05', 'batch-size': '4'}


In [15]:
# hyperparameter는 바꿔줄 수 있음
default_hyperparameters['epochs'] = 5
default_hyperparameters['batch-size'] = 8
default_hyperparameters['adam-learning-rate'] = "0.001"
default_hyperparameters['reinitialize-top-layer'] = "auto"
default_hyperparameters['train-only-yop-layer'] = "True"

print(default_hyperparameters)

{'epochs': 5, 'adam-learning-rate': '0.001', 'batch-size': 8, 'reinitialize-top-layer': 'auto', 'train-only-yop-layer': 'True'}


In [16]:
# Estimator 생성
estimator = Estimator(
    image_uri=model_image_uri,
    source_dir=training_script_uri,
    model_uri=base_model_uri,
    entry_point="transfer_learning.py",
    role=sagemaker.session.Session().get_caller_identity_arn(),
    hyperparameters=default_hyperparameters,
    instance_count=training_instance_count,
    instance_type=training_instance_type,
    enable_network_isolation=True,
    output_path=model_path,
    disable_profiler=True,
    metric_definitions=[{'Name': 'train:Loss', 'Regex': 'train Loss: (\S+)'},
                       {'Name': 'train:Acc', 'Regex': 'train Loss: \S+ Acc: (\S+)'},
                       {'Name': 'test:Loss', 'Regex': 'test Loss: (\S+)'},
                       {'Name': 'test:Acc', 'Regex': 'test Loss: \S+ Acc: (\S+)'},]
)

In [19]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

step_train = TrainingStep(
    name="CatBreedTrain",
    estimator=estimator,
    inputs = {
        'training': train_preproc_dir_artifact
    }
)

In [20]:
from sagemaker.workflow.pipeline import Pipeline

project_prefix = "Cat-Breed-Demo"

pipeline_name = project_prefix
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        training_instance_type, 
        training_instance_count,
    ],
    steps=[step_train],
)

In [21]:
import json

definition = json.loads(pipeline.definition())
definition



{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.large'},
  {'Name': 'TrainingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'CatBreedTrain',
   'Type': 'Training',
   'Arguments': {'AlgorithmSpecification': {'TrainingInputMode': 'File',
     'TrainingImage': '763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/pytorch-training:1.10.0-cpu-py38',
     'MetricDefinitions': [{'Name': 'train:Loss',
       'Regex': 'train Loss: (\\S+)'},
      {'Name': 'train:Acc', 'Regex': 'train Loss: \\S+ Acc: (\\S+)'},
      {'Name': 'test:Loss', 'Regex': 'test Loss: (\\S+)'},
      {'Name': 'test:Acc', 'Regex': 'test Loss: \\S+ Acc: (\\S+)'}]},
    'OutputDataConfig': {'S3OutputPath': 's3://sagemaker-ap-northeast-2-242201274000/CatBreedTrain'},
    

In [22]:
pipeline.upsert(role_arn=role)
execution = pipeline.start()



In [23]:
type(execution.describe())

dict

In [24]:
for idx, (key, item) in enumerate(execution.describe().items()):
    print(f"{idx}. key: {key}\n   value: \n{item}\n\n")

0. key: PipelineArn
   value: 
arn:aws:sagemaker:ap-northeast-2:242201274000:pipeline/Cat-Breed-Demo


1. key: PipelineExecutionArn
   value: 
arn:aws:sagemaker:ap-northeast-2:242201274000:pipeline/Cat-Breed-Demo/execution/i6gc1y5pctlm


2. key: PipelineExecutionDisplayName
   value: 
execution-1731904279773


3. key: PipelineExecutionStatus
   value: 
Executing


4. key: PipelineExperimentConfig
   value: 
{'ExperimentName': 'cat-breed-demo', 'TrialName': 'i6gc1y5pctlm'}


5. key: CreationTime
   value: 
2024-11-18 04:31:19.670000+00:00


6. key: LastModifiedTime
   value: 
2024-11-18 04:31:19.670000+00:00


7. key: CreatedBy
   value: 
{'UserProfileArn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:user-profile/d-l0dltcg6j4kj/default-20240923T230629', 'UserProfileName': 'default-20240923T230629', 'DomainId': 'd-l0dltcg6j4kj', 'IamIdentity': {'Arn': 'arn:aws:sts::242201274000:assumed-role/AmazonSageMaker-ExecutionRole-20240923T230631/SageMaker', 'PrincipalId': 'AROATQZCSE2IHHIBWPD6G

In [26]:
execution.wait()

In [27]:
execution.list_steps()

[{'StepName': 'CatBreedTrain',
  'StartTime': datetime.datetime(2024, 11, 18, 4, 31, 20, 926000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 11, 18, 4, 35, 20, 219000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'TrainingJob': {'Arn': 'arn:aws:sagemaker:ap-northeast-2:242201274000:training-job/pipelines-i6gc1y5pctlm-CatBreedTrain-wioBAR6TLe'}},
  'AttemptCount': 1}]

In [50]:
import boto3

def get_train_artifact(execution, client, job_type,  kind=0):
    '''
    kind: 0 --> train
    kind: 2 --> test
    '''
    response = execution.list_steps()
    # print("response: ", response[0]['Metadata'][job_type]['Arn'])
    proc_arn = response[0]['Metadata'][job_type]['Arn']
    train_job_name = proc_arn.split('/')[-1]
    # print("train_job_name: ", train_job_name)
    response = client.describe_training_job(TrainingJobName = train_job_name)
    # print("\nresponse: ", type(response))
    # for idx, (key, value) in enumerate(response.items()):
    #     print(f"{idx}.key: {key}\nvalue: {value}\n")
    train_model_artifact = response['ModelArtifacts']['S3ModelArtifacts']    
    
    return train_model_artifact
    
client = boto3.client("sagemaker")

train_model_artifact = get_train_artifact(execution, client, job_type='TrainingJob', kind=0)

In [51]:
train_model_artifact

's3://sagemaker-ap-northeast-2-242201274000/CatBreedTrain/pipelines-i6gc1y5pctlm-CatBreedTrain-wioBAR6TLe/output/model.tar.gz'

In [53]:
%store train_model_artifact
%store base_model_uri
%store training_script_uri
%store model_image_uri
%store model_id
%store model_version

# %store
# %store
# %store
# %store
# %store
# %store

Stored 'train_model_artifact' (str)
Stored 'base_model_uri' (str)
Stored 'training_script_uri' (str)
Stored 'model_image_uri' (str)
Stored 'model_id' (str)
Stored 'model_version' (str)
