In [14]:
import os

import sagemaker
from sagemaker import ScriptProcessor, ModelMetrics, MetricsSource, TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor
from sagemaker.sklearn import SKLearnProcessor, SKLearn
from sagemaker.model import Model
from sagemaker.workflow.condition_step import JsonGet, ConditionStep
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.parameters import ParameterInteger, ParameterString, ParameterFloat
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.dataset_definition.inputs import (
    AthenaDatasetDefinition,
    DatasetDefinition,
)
from datetime import datetime
import time
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_experiment_config import PipelineExperimentConfig
from sagemaker.workflow.execution_variables import ExecutionVariables

In [15]:
import boto3
import sagemaker


def get_environment(project_name, ssm_params):
    sm = boto3.client("sagemaker")
    ssm = boto3.client("ssm")

    r = sm.describe_domain(
            DomainId=sm.describe_project(
                ProjectName=project_name
                )["CreatedBy"]["DomainId"]
        )
    del r["ResponseMetadata"]
    del r["CreationTime"]
    del r["LastModifiedTime"]
    r = {**r, **r["DefaultUserSettings"]}
    del r["DefaultUserSettings"]

    i = {
        **r,
        **{t["Key"]:t["Value"]
            for t in sm.list_tags(ResourceArn=r["DomainArn"])["Tags"]
            if t["Key"] in ["EnvironmentName", "EnvironmentType"]}
    }

    for p in ssm_params:
        try:
            i[p["VariableName"]] = ssm.get_parameter(Name=f"{i['EnvironmentName']}-{i['EnvironmentType']}-{p['ParameterName']}")["Parameter"]["Value"]
        except:
            i[p["VariableName"]] = ""

    return i


def get_session(region, default_bucket):
    """Gets the sagemaker session based on the region.

    Args:
        region: the aws region to start the session
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        sagemaker.session.Session instance
    """

    boto_session = boto3.Session(region_name=region)

    sagemaker_client = boto_session.client("sagemaker")
    runtime_client = boto_session.client("sagemaker-runtime")
    print(f"Sarah: ml_pipelines > utiles > environments.py > sagemaker.__version__ is {sagemaker.__version__}")
    return sagemaker.session.Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_runtime_client=runtime_client,
        default_bucket=default_bucket,
    ), sagemaker_client


def environment_data(project_name):
    # Dynamically load environmental SSM parameters - provide the list of the variables to load from SSM parameter store
    ssm_parameters = [
        {"VariableName": "DataBucketName", "ParameterName": "data-bucket-name"},
        {"VariableName": "ModelBucketName", "ParameterName": "model-bucket-name"},
        {"VariableName": "S3KmsKeyId", "ParameterName": "kms-s3-key-arn"},
        {"VariableName": "EbsKmsKeyArn", "ParameterName": "kms-ebs-key-arn"},
        {"VariableName": "TrustedDefaultKinesisAccount", "ParameterName": "TrustedDefaultKinesisAccount"},
    ]
    env_data = get_environment(project_name=project_name, ssm_params=ssm_parameters)
    env_data["ProcessingRole"] = env_data["ExecutionRole"]
    env_data["TrainingRole"] = env_data["ExecutionRole"]
    
    return env_data


In [16]:
import os
import json



def list_files(startpath):
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))


def get_pipeline(
        region,
        project_name=None,
        source_scripts_path="./",
        model_package_group_name="AbalonePackageGroup",
        pipeline_name="AbalonePipeline",
        base_job_prefix="Abalone",
        revision="no-revision-provided",):
    """Gets a SageMaker ML Pipeline instance working with on abalone data.

    Args:
        region: AWS region to create and run the pipeline.
        @todo arg. definitions

    Returns:
        an instance of a pipeline
    """

    # get env data
    env_data = environment_data(project_name)
#     print(f"Sarah: Environment data:\n{json.dumps(env_data, indent=2)}")

    sagemaker_session, sagemaker_client = get_session(region, env_data["DataBucketName"])
    default_bucket = sagemaker_session.default_bucket()
    base_dir = os.getcwd()
    print(f"Sarah: Creating the pipeline '{pipeline_name}':")
    print(f"sarah: Parameters:{region}\n{env_data['SecurityGroups']}\n{env_data['SubnetIds']}\n{env_data['ProcessingRole']}\n\
    {env_data['TrainingRole']}\n{env_data['DataBucketName']}\n{env_data['ModelBucketName']}\n{model_package_group_name}\n\
    {pipeline_name}\n{base_job_prefix}")

    pipeline = standard_model_pipeline(
        base_job_prefix=base_job_prefix,
        default_bucket=default_bucket,
        env_data=env_data,
        model_package_group_name=model_package_group_name,
        pipeline_name=pipeline_name,
        region=region,
        sagemaker_session=sagemaker_session,
        base_dir=base_dir,
        source_scripts_path=source_scripts_path,
        project=project_name,
        revision=revision)
    return pipeline




In [17]:
def standard_model_pipeline(base_job_prefix, default_bucket, env_data, model_package_group_name, pipeline_name,
                            region, sagemaker_session, base_dir, source_scripts_path, project="standard_model",
                            revision="none", purpose="p1033"):
    # parameters for pipeline execution
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1) # used in sample application
    processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.xlarge") # used in sample application
    training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge") # used in sample application
    training_instance_count = "1"
    inference_instance_type = ParameterString(name="InferenceInstanceType", default_value="ml.m5.large")
    hpo_tuner_instance_type = ParameterString(name="HPOTunerScriptInstanceType", default_value="ml.t3.medium")
    model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval") # used in sample application
    role = "arn:aws:iam::370702650160:role/sm-mlops-env-EnvironmentI-SageMakerPipelineExecuti-1AWTL5A5UKOHN"
    # role = "arn:aws:iam::370702650160:role/sm-mlops-env-EnvironmentIAM-SageMakerExecutionRole-14AU65MVMBUGO"
    #role=$SAGEMAKER_PIPELINE_ROLE_ARN

#     default_bucket_data = ParameterString(name="DefaultS3BucketData", default_value="mlops-dev-370702650160-eu-north-1-data")
#     default_bucket_models = ParameterString(name="DefaultS3BucketModels", default_value="mlops-dev-370702650160-eu-north-1-models")
    default_bucket_data = ParameterString(name="DefaultS3BucketData", default_value=env_data['DataBucketName'])
    default_bucket_models = ParameterString(name="DefaultS3BucketModels", default_value=env_data['ModelBucketName'])

    baseline_model_objective_value = ParameterFloat(name='BaselineModelObjectiveValue', default_value=0.6)

#     bucket_prefix_data = ParameterString(name="S3BucketPrefixData", default_value="lifecycle/30d/customerone-dev-branch/") # lifecycle/30d/${SAGEMAKER_PROJECT_NAME}/
#     bucket_prefix_models = ParameterString(name="S3BucketPrefixModels", default_value="lifecycle/max/customerone-dev-branch/") # lifecycle/max/${SAGEMAKER_PROJECT_NAME}/


#     image_uri = "370702650160.dkr.ecr.eu-north-1.amazonaws.com/sagemaker-cross-validation-pipeline:0.1"
    image_uri = "813736554012.dkr.ecr.eu-north-1.amazonaws.com/engineering-custom-images:crossvalidation"
    framework_version = "0.23-1"
    baseline_model_objective_value = 0.6


#     # Variables / Constants used throughout the pipeline
#     model_package_group_name="customerone-dev-branch-p-lwkq81p5gxnk"
#     s3_bucket_base_path=f"s3://{default_bucket_data.default_value}/{bucket_prefix_data.default_value}"
#     # s3_bucket_base_path_train = f"{s3_bucket_base_path}/train"
#     # s3_bucket_base_path_test = f"{s3_bucket_base_path}/test"
#     s3_bucket_base_path_train = f"{s3_bucket_base_path}train"
#     s3_bucket_base_path_test = f"{s3_bucket_base_path}test"
#     s3_bucket_base_path_evaluation = f"{s3_bucket_base_path}/evaluation"
#     s3_bucket_base_path_jobinfo = f"{s3_bucket_base_path}/jobinfo"
#     s3_bucket_base_path_output = f"{s3_bucket_base_path}/output"

    
    
    model_approval_status, processing_instance_count, processing_instance_type, training_instance_type, training_instance_count, hpo_tuner_instance_type = sagemaker_pipeline_parameters(data_bucket=default_bucket)
    # TODO: Sarah what are the following parameters? How do I set them dynamically? Shouldn't they go to the sagemaker_pipeline_parameters() method too?
    database = ParameterString(name="DataBase", default_value="ml-test-datasets_rl")  # customerone_mock_data_rl
    table = ParameterString(name="AbaloneTable", default_value="ml_master") # ??????
    filter = ParameterString(name="FilterRings", default_value="disabled")
    time_path = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    trigger_id = ParameterString(name="TriggerID", default_value="0000000000") #from codebuild - use CODEBUILD_BUILD_ID env variable parsed after ":" The CodeBuild ID of the build (for example, codebuild-demo-project:b1e6661e-e4f2-4156-9ab9-82a19EXAMPLE).
    nowgmt = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
    execution_time = ParameterString(name="ExecutionTime", default_value=nowgmt)
    framework_version = "0.23-1"

    network_config = NetworkConfig(
        enable_network_isolation=False,
        security_group_ids=env_data["SecurityGroups"],
        subnets=env_data["SubnetIds"],
        encrypt_inter_container_traffic=True)

    model_name = "xsell_cust_voice_to_fixed"
    data_base_path = "s3://{}/lifecycle/60d/{}/{}/{}/{}/output/training".format(env_data["DataBucketName"], project, revision, time_path, purpose)
    # s3_bucket_base_path = f"s3://{default_bucket_data.default_value}/{data_base_path}",
    s3_buckets = {'default_bucket_data': default_bucket_data,
                  'default_bucket_models': default_bucket_models,
                  'bucket_prefix_data': data_base_path,
                  'bucket_prefix_models': "s3://{}/lifecycle/max/{}/{}/{}/{}/output/training".format(env_data["ModelBucketName"], project, revision, model_name, time_path),
                  'evaluation_path': "s3://{}/lifecycle/max/{}/{}/{}/{}/output/evaluation".format(env_data["ModelBucketName"], project, revision, model_name, time_path),
                  's3_bucket_base_path': data_base_path,
                  's3_bucket_base_path_train': f"{data_base_path}/train", 
                  's3_bucket_base_path_test': f"{data_base_path}/test",
                  's3_bucket_base_path_evaluation': f"{data_base_path}/evaluation",
                  's3_bucket_base_path_jobinfo': f"{data_base_path}/jobinfo",
                  's3_bucket_base_path_output': f"{data_base_path}/output",
                  'code_path': f"s3://{env_data['DataBucketName']}/lifecycle/max/{project}/{revision}/input/source_scripts/preprocessing",
                 }
    
    print("\n\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ S3 paths:")
    print(f"s3_buckets={s3_buckets}")
    print(" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ END \n\n")
    step_process, preprocessing_script = preprocessing(
                                                    s3_buckets,
                                                    base_job_prefix=base_job_prefix,
                                                    env_data=env_data,
                                                    network_config=network_config,
                                                    processing_instance_count=processing_instance_count,
                                                    processing_instance_type=processing_instance_type,
                                                    sagemaker_session=sagemaker_session,
                                                    source_scripts_path=source_scripts_path,
                                                    snapshot_path="{}/data-snapshot/".format(s3_buckets['bucket_prefix_data']),
                                                    # training_path="{}/processed/training".format(s3_buckets['bucket_prefix_data']),
                                                    # # validation_path="{}/processed/validation".format(s3_buckets['bucket_prefix_data']),
                                                    # test_path="{}/processed/test".format(s3_buckets['bucket_prefix_data']),
                                                    training_path="{}/train".format(s3_buckets['bucket_prefix_data']),
                                                    # validation_path="{}/processed/validation".format(s3_buckets['bucket_prefix_data']),
                                                    test_path="{}/test".format(s3_buckets['bucket_prefix_data']),
                                                    database=database,
                                                    table=table,
                                                    filter=filter,
                                                    execution_time=execution_time,
                                                    framework_version=framework_version,
                                                )
    
    print("\n\n88888888888")
    print(step_process.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"])

    
    # Specify the model path where you want to save the models from training:
#     model_path = "s3://{}/lifecycle/max/{}/{}/{}/{}/output/training".format(env_data["ModelBucketName"], project, revision, model_name, time_path)
#     evaluation_path = "s3://{}/lifecycle/max/{}/{}/{}/{}/output/evaluation".format(env_data["ModelBucketName"], project, revision, model_name, time_path)
    step_model_selection, step_cv_train_hpo, sklearn_estimator, evaluation_report = lightgbm_training_tasks(s3_buckets, base_job_prefix=base_job_prefix,
                                                    env_data=env_data,
                                                    image_uri=image_uri,
                                                    network_config=network_config,
                                                    sagemaker_session=sagemaker_session,
                                                    training_instance_type=training_instance_type,
                                                    training_instance_count=training_instance_count,
#                                                     model_path=model_path,
#                                                     data_base_path=data_base_path,
#                                                     evaluation_path=evaluation_path,
                                                    hpo_tuner_instance_type=hpo_tuner_instance_type,
                                                    region=region,
                                                    framework_version=framework_version,
                                                    source_scripts_path=source_scripts_path,
                                                    )

    postprocessing_script = "{}/postprocessing/postprocess.py".format(source_scripts_path)
    step_cond = lgbm_model_register_tasks(s3_buckets,
                                          evaluation_report,
                                          sklearn_estimator,
                                          step_model_selection,
                                          step_cv_train_hpo,
                                          model_approval_status,
                                          baseline_model_objective_value,
                                          # model_metrics,
                                          sagemaker_session,
                                          model_package_group_name,
                                          network_config,
                                          env_data,
                                          # step_eval,
                                          preprocessing_script,
                                          postprocessing_script,
                                          revision)


    pipeline_name = f"CrossValidationTrainingPipeline"
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_count,
            processing_instance_type,
            training_instance_type,
            inference_instance_type,
            hpo_tuner_instance_type,
            model_approval_status,
            role,
            default_bucket_data,
            baseline_model_objective_value,
#             bucket_prefix_data,
            image_uri,
            execution_time,
            database,
            table,
            filter,
        ],    
        pipeline_experiment_config=PipelineExperimentConfig(
          ExecutionVariables.PIPELINE_NAME,
          ExecutionVariables.PIPELINE_EXECUTION_ID),
#         steps=[step_process, step_cv_train_hpo, step_cond,],
         steps=[step_process, step_cv_train_hpo, step_cond],
    )
    return pipeline



In [18]:
def preprocessing(s3_buckets, base_job_prefix, env_data, network_config, processing_instance_count, processing_instance_type,
                  sagemaker_session, source_scripts_path, snapshot_path, training_path, test_path,
                  database, table, filter, execution_time, framework_version):

    print("SARAH: standard_model_pipeline > preprocessing starts")
    preprocessing_script = "{}/preprocessing/preprocess.py".format(source_scripts_path)
    # ## 1- processing step for feature engineering Step
    sklearn_processor = SKLearnProcessor(
        framework_version=framework_version,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-c1-xsell--preprocess_kfold",
        sagemaker_session=sagemaker_session,
        role=env_data["ProcessingRole"],
        network_config=network_config,
        volume_kms_key=env_data["EbsKmsKeyArn"],
        output_kms_key=env_data["S3KmsKeyId"]
    )

    step_process = ProcessingStep(
        name="PreprocessC1XsellData",
        processor=sklearn_processor,
        inputs=[ProcessingInput(source=f'{source_scripts_path}/preprocessing/utils/',
                                destination="/opt/ml/processing/input/code/utils/")],
        outputs=[
            ProcessingOutput(
                output_name="train",
                source="/opt/ml/processing/train",
                destination=training_path
            ),
            ProcessingOutput(output_name="test",
                             source="/opt/ml/processing/test",
                             destination=test_path
                             )
        ],
        code=preprocessing_script,
        job_arguments=[
            "--context", "training",
            "--executiontime", execution_time,
            "--database", database,
            "--table", table,
            "--filter", filter,
            "--ref_period", "None",  # it is only required for inference pipeline
            "--ref_date", "None",
        ],
    )
    print("SARAH: standard_model_pipeline > preprocessing ends!")
    return step_process, preprocessing_script  # TODO: SARAH: why should we return preprocessing_script??!??!



In [19]:
def lightgbm_training_tasks(s3_buckets, base_job_prefix, env_data, image_uri, network_config, sagemaker_session,
                            training_instance_type, training_instance_count, 
                            hpo_tuner_instance_type, region, framework_version, source_scripts_path):


    cross_validation_with_hpo_script = "{}/preprocessing/cross_validation_with_hpo.py".format(source_scripts_path)
#     s3_bucket_base_path_jobinfo = f"{data_base_path}/jobinfo"
# #     s3_bucket_base_path_train = f"{data_base_path}train"
# #     s3_bucket_base_path_test = f"{data_base_path}test"
#     s3_bucket_base_path_output = f"{data_base_path}/output"
#     s3_bucket_base_path_test = "s3://mlops-dev-370702650160-eu-north-1-data/lifecycle/60d/customerone-dev-branch/d583b4e/2022_08_28_13_19_09/p1033/output/training/processed/test"
#     s3_bucket_base_path_train = "s3://mlops-dev-370702650160-eu-north-1-data/lifecycle/60d/customerone-dev-branch/d583b4e/2022_08_28_13_19_09/p1033/output/training/processed/training"

#     print(f"\n\ndata_base_path={data_base_path}")
#     print(f"\n\n********* s3_bucket_base_path_train={s3_bucket_base_path_train}\n\n")
#     print(f"s3_bucket_base_path_test={s3_bucket_base_path_test}")
#     print(F"s3_bucket_base_path_output={s3_bucket_base_path_output}")
#     print(f"s3_bucket_base_path_jobinfo = {s3_bucket_base_path_jobinfo}")


    # ## 2- Cross Validation Model Training Step
#     print(f"network_config={network_config}")
    print(f"\n\nenv_data={env_data}")


    evaluation_report = PropertyFile(name="EvaluationReport", output_name="evaluation", path="evaluation.json")
    jobinfo = PropertyFile(name="JobInfo", output_name="jobinfo", path="jobinfo.json")

    script_tuner = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=hpo_tuner_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/KFoldCrossValidationHyperParameterTuner",
        role=env_data["TrainingRole"],
        sagemaker_session=sagemaker_session,
        volume_kms_key=env_data["EbsKmsKeyArn"],
        output_kms_key=env_data["S3KmsKeyId"],
        network_config=network_config,
    )
    
    k = "3"
    max_jobs = "3"
    max_parallel_jobs = "1"
    # C1 parameters
    preprocessing_categorical_encoder_min_samples_leaf = "100"
    preprocessing_categorical_encoder_smoothing = "1.0"
    over_sampler_sampling_strategy = "0.5"
    estimator_num_leaves = "20"
    estimator_min_child_samples = "100"
    estimator_max_depth = "5"
    estimator_learning_rate = "0.2"
    estimator_n_estimators = "50"



    step_cv_train_hpo = ProcessingStep(
        name="HyperParameterTuningStep",
        processor=script_tuner,
        code=cross_validation_with_hpo_script,
        outputs=[
            ProcessingOutput(output_name="evaluation",
                             source="/opt/ml/processing/evaluation",
                             destination=s3_buckets['s3_bucket_base_path_evaluation']), # s3_bucket_base_path_evaluation
            ProcessingOutput(output_name="jobinfo",
                             source="/opt/ml/processing/jobinfo",
                             destination=s3_buckets['s3_bucket_base_path_jobinfo'])
        ],
        job_arguments=["-k", k,
                       "--image-uri", image_uri,
                       "--train", s3_buckets['s3_bucket_base_path_train'],
                       "--test", s3_buckets['s3_bucket_base_path_test'],
                       "--instance-type", training_instance_type,
                       "--instance-count", "1",
                       "--output-path", s3_buckets['s3_bucket_base_path_output'],
                       "--max-jobs", "3",
                       "--max-parallel-jobs", max_parallel_jobs,
                       "--region", str(region),
                       "--subnets", env_data["SubnetIds"][0],
                       "--security_group_ids", env_data["SecurityGroups"][0],
                       "--preprocessing_categorical_encoder_min_samples_leaf", preprocessing_categorical_encoder_min_samples_leaf,
                       "--preprocessing_categorical_encoder_smoothing", preprocessing_categorical_encoder_smoothing,
                       "--over_sampler_sampling_strategy", over_sampler_sampling_strategy,
                       "--estimator_learning_rate", estimator_learning_rate,
                       ],
        property_files=[evaluation_report],
        depends_on=['PreprocessC1XsellData']
    )
    
#     model_bucket = "mlops-dev-370702650160-eu-north-1-models"
#     project_name = "customerone-dev-branch"
# #     code_path=f"s3://${model_bucket}/lifecycle/max/${project_name}/${SOURCE_HEADHASH}/input/source_scripts"
#     code_path = "s3://mlops-dev-370702650160-eu-north-1-models/lifecycle/max/customerone-dev-branch/8575b89a/input/source_scripts"
#     "s3://***/lifecycle/max/customerone-dev-branch/eaebec9/input/source_scripts/preprocessing/backup_cross_validation_with_hpo.py"
    # ## 3- Model Selection Step
    sklearn_estimator = SKLearn("scikit_learn_iris.py",
#                                 image_uri=image_uri,
                                framework_version=framework_version,
                                instance_type=training_instance_type,
                                py_version='py3',
                                source_dir='../source_scripts/preprocessing', #'../customized_containers', #"/opt/ml/code/",
                                output_path=s3_buckets['s3_bucket_base_path_output'],
                                role=env_data["TrainingRole"],
                                subnets=[env_data["SubnetIds"][0]],
                                security_group_ids=[env_data["SecurityGroups"][0]],
                                ) 
    
    step_model_selection = TrainingStep(
        name="ModelSelectionStep",
        estimator=sklearn_estimator,
        inputs={
            "train": TrainingInput(
                s3_data=f"{s3_buckets['s3_bucket_base_path_train']}/all",#f'{step_process.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]}/all' ,  
                content_type="text/csv"
            ),
            "jobinfo": TrainingInput(
                s3_data=f"{s3_buckets['s3_bucket_base_path_jobinfo']}",
                content_type="application/json"
            )
        }
    )
    
    
    print("SARAH: lightgbm_training_tasks > step_model_selection is created")
    return step_model_selection, step_cv_train_hpo, sklearn_estimator, evaluation_report

#         inputs=[ProcessingInput(source=f'{source_scripts_path}/preprocessing/utils/',
#                                 destination="/opt/ml/processing/input/code/utils/")],

In [20]:
os.getcwd()

'/root/sagemaker-customerone2-p-vrs1c6dm1yir-model-build-train/notebooks'

In [21]:
def lgbm_model_register_tasks(s3_buckets, evaluation_report, sklearn_estimator, step_model_selection, step_cv_train_hpo,
                              model_approval_status, baseline_model_objective_value, sagemaker_session, model_package_group_name, network_config,
                              env_data, preprocessing_script, postprocessing_script, revision):
    
    print("\n\n*********** register_model() *****")
#     print(f"sklearn_estimator.image_uri = {sklearn_estimator.image_uri}")
#     print(f"step_model_selection.properties.ModelArtifacts.S3ModelArtifacts = {step_model_selection.properties.ModelArtifacts.S3ModelArtifacts}")
    print("9999999999")
    print(step_cv_train_hpo.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"])
          
    model = Model(
        image_uri=sklearn_estimator.image_uri,
        model_data=step_model_selection.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=env_data['TrainingRole'],
    )

    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_cv_train_hpo.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )

    step_register_model = RegisterModel(
        name="RegisterModelStep",
        estimator=sklearn_estimator,
        model_data=step_model_selection.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )
    # Condition Step
    cond_gte = ConditionGreaterThanOrEqualTo(
        left=JsonGet(
            step=step_cv_train_hpo,
            property_file=evaluation_report,
            json_path="multiclass_classification_metrics.accuracy.value",
        ),
        right=baseline_model_objective_value,
    )

    step_cond = ConditionStep(
        name="ModelEvaluationStep",
        conditions=[cond_gte],
        if_steps=[step_model_selection, step_register_model],
        else_steps=[],
    )


    return step_cond


In [22]:
def sagemaker_pipeline_parameters(data_bucket):
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    training_instance_count = "1"
    processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.xlarge")
    training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
    model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")
    hpo_tuner_instance_type = ParameterString(name="HPOTunerScriptInstanceType", default_value="ml.t3.medium")

    return model_approval_status, processing_instance_count, processing_instance_type, training_instance_type, training_instance_count, hpo_tuner_instance_type


In [23]:
pipeline_cvu=get_pipeline(
        region='eu-north-1',
        project_name='customerone-dev-branch',
        source_scripts_path="/root/sagemaker-customerone2-p-vrs1c6dm1yir-model-build-train/source_scripts/",
        model_package_group_name="cvm",
        pipeline_name="debugging-cv2",
        base_job_prefix="CVM",
        revision="no-revision-provided",)

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it.


Sarah: ml_pipelines > utiles > environments.py > sagemaker.__version__ is 2.107.0
Sarah: Creating the pipeline 'debugging-cv2':
sarah: Parameters:eu-north-1
['sg-041054ee4500f96f6']
['subnet-0724be5e7071e7070', 'subnet-01def51ffe7467c71']
arn:aws:iam::370702650160:role/sm-mlops-env-EnvironmentIAM-SageMakerExecutionRole-14AU65MVMBUGO
    arn:aws:iam::370702650160:role/sm-mlops-env-EnvironmentIAM-SageMakerExecutionRole-14AU65MVMBUGO
mlops-dev-370702650160-eu-north-1-data
mlops-dev-370702650160-eu-north-1-models
cvm
    debugging-cv2
CVM


 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ S3 paths:
s3_buckets={'default_bucket_data': ParameterString(name='DefaultS3BucketData', parameter_type=<ParameterTypeEnum.STRING: 'String'>, default_value='mlops-dev-370702650160-eu-north-1-data'), 'default_bucket_models': ParameterString(name='DefaultS3BucketModels', parameter_type=<ParameterTypeEnum.STRING: 'String'>, default_value='mlops-dev-370702650160-eu-north-1-models'), 'bucket_prefix_data': 's3://mlops-dev-370702

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it.


s3://mlops-dev-370702650160-eu-north-1-data/lifecycle/60d/customerone-dev-branch/no-revision-provided/2022_09_26_12_04_10/p1033/output/training/train


env_data={'DomainArn': 'arn:aws:sagemaker:eu-north-1:370702650160:domain/d-tdizim9qnor9', 'DomainId': 'd-tdizim9qnor9', 'DomainName': 'mlops-dev-eu-north-1-sagemaker-domain', 'HomeEfsFileSystemId': 'fs-03fc3d37f8623fea2', 'Status': 'InService', 'AuthMode': 'IAM', 'AppNetworkAccessType': 'VpcOnly', 'SubnetIds': ['subnet-0724be5e7071e7070', 'subnet-01def51ffe7467c71'], 'Url': 'https://d-tdizim9qnor9.studio.eu-north-1.sagemaker.aws', 'VpcId': 'vpc-0459a28f3637e285c', 'KmsKeyId': 'f4664542-0f2e-42ca-b51f-2bec0ad62278', 'ExecutionRole': 'arn:aws:iam::370702650160:role/sm-mlops-env-EnvironmentIAM-SageMakerExecutionRole-14AU65MVMBUGO', 'SecurityGroups': ['sg-041054ee4500f96f6'], 'JupyterServerAppSettings': {'DefaultResourceSpec': {'SageMakerImageArn': 'arn:aws:sagemaker:eu-north-1:243637512696:image/jupyter-server-3', 'InstanceType': 'system',

The class JsonGet has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [24]:
import json 
parsed = json.loads(pipeline_cvu.definition())
print(json.dumps(parsed, indent=2, sort_keys=True))

# Or:
# pipeline_cvu.definition()

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


{
  "Metadata": {},
  "Parameters": [
    {
      "DefaultValue": 1,
      "Name": "ProcessingInstanceCount",
      "Type": "Integer"
    },
    {
      "DefaultValue": "ml.m5.xlarge",
      "Name": "ProcessingInstanceType",
      "Type": "String"
    },
    {
      "DefaultValue": "ml.m5.xlarge",
      "Name": "TrainingInstanceType",
      "Type": "String"
    },
    {
      "DefaultValue": "ml.m5.large",
      "Name": "InferenceInstanceType",
      "Type": "String"
    },
    {
      "DefaultValue": "ml.t3.medium",
      "Name": "HPOTunerScriptInstanceType",
      "Type": "String"
    },
    {
      "DefaultValue": "PendingManualApproval",
      "Name": "ModelApprovalStatus",
      "Type": "String"
    },
    {
      "DefaultValue": "mlops-dev-370702650160-eu-north-1-data",
      "Name": "DefaultS3BucketData",
      "Type": "String"
    },
    {
      "DefaultValue": "2022-09-26 12:04:10",
      "Name": "ExecutionTime",
      "Type": "String"
    },
    {
      "DefaultValue": "ml-te

In [25]:
upsert_response = pipeline_cvu.upsert(
            role_arn="arn:aws:iam::370702650160:role/sm-mlops-env-EnvironmentI-SageMakerPipelineExecuti-1AWTL5A5UKOHN"
        )

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


In [26]:
upsert_response

{'PipelineArn': 'arn:aws:sagemaker:eu-north-1:370702650160:pipeline/crossvalidationtrainingpipeline',
 'ResponseMetadata': {'RequestId': '94959150-9632-4ba7-8ea5-72ca51067513',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '94959150-9632-4ba7-8ea5-72ca51067513',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '100',
   'date': 'Mon, 26 Sep 2022 12:04:13 GMT'},
  'RetryAttempts': 0}}

In [27]:
pipeline_cvu.start()

_PipelineExecution(arn='arn:aws:sagemaker:eu-north-1:370702650160:pipeline/crossvalidationtrainingpipeline/execution/fx3a9wagkh4m', sagemaker_session=<sagemaker.session.Session object at 0x7f9404409750>)