In [94]:
import boto3
import sagemaker 
import json
from time import gmtime, strftime, sleep
from IPython.display import HTML

In [None]:
# Restore every variable previously persisted with %store into this kernel
%store -r 

# List the stored variables and their values
%store

Stored variables and their in-db values:
baseline_s3_url                         -> 's3://sagemaker-us-east-1-906545278380/from-idea-t
bucket_name                             -> 'sagemaker-us-east-1-906545278380'
bucket_prefix                           -> 'from-idea-to-prod/xgboost'
dataset_feature_group_name              -> 'from-idea-to-prod-12-06-30-53'
dataset_file_local_path                 -> 'data/bank-additional/bank-additional-full.csv'
domain_id                               -> 'd-igloxuzrs3z2'
evaluation_s3_url                       -> 's3://sagemaker-us-east-1-906545278380/from-idea-t
experiment_name                         -> 'from-idea-to-prod-experiment-11-21-33-07'
feature_store_bucket_prefix             -> 'from-idea-to-prod/feature-store'
initialized                             -> True
input_s3_url                            -> 's3://sagemaker-us-east-1-906545278380/from-idea-t
mlflow_arn                              -> 'arn:aws:sagemaker:us-east-1:906545278380:mlflow

In [3]:
# Reload imported modules automatically before executing code, so edits to local
# modules (such as the pipeline.py written further below) are picked up
%load_ext autoreload
%autoreload 2

In [4]:
# SageMaker API client used below for project, model package group, and tagging calls
sm = boto3.client("sagemaker")

## Create an MLOps project

#### Set up the GitHub connection

In [62]:
# Set this variable to the ARN of the code connection you created
code_connection_arn = "<SET TO THE ARN OF THE CREATED CODE CONNECTION>"

# Fail fast with a readable message while the placeholder is still in place
if not code_connection_arn.startswith("arn:"):
    raise ValueError(
        "Set code_connection_arn to the ARN of your code connection before running this cell"
    )

In [63]:
# Service Catalog client used to look up the SageMaker project template
sc = boto3.client("servicecatalog")

# Provider name and exact product name of the project template to search for
# (sc_provider_name is not referenced by the cells shown in this notebook)
sc_provider_name = "Amazon SageMaker"
sc_product_name = "MLOps template for model building, training, and deployment with third-party Git repositories using CodePipeline"

In [64]:
# Find the Service Catalog product(s) whose name exactly matches the SageMaker project template.
# The full-text search can also return partial matches, so filter on the exact name.
search_response = sc.search_products(
    Filters={'FullTextSearch': [sc_product_name]},
)

p_ids = []
for product_summary in search_response['ProductViewSummaries']:
    if product_summary["Name"] == sc_product_name:
        p_ids.append(product_summary['ProductId'])

In [65]:
# Display the matching product ids
p_ids

['prod-cwbsact3annui']

In [66]:
# If you get any exception from this code, go to the Option 2 and create a project in Studio UI
# Exactly one matching product is expected; anything else is treated as an error.
match_count = len(p_ids)

if match_count == 0:
    raise Exception("No Amazon SageMaker ML Ops products found!")
if match_count > 1:
    raise Exception("Too many matching Amazon SageMaker ML Ops products found!")

product_id = p_ids[0]
print(f"ML Ops product id: {product_id}")

ML Ops product id: prod-cwbsact3annui


In [67]:
# Show the short description of the selected project template
sc.describe_product(Id=product_id)['ProductViewSummary']['ShortDescription']

'Use this template to automate the entire model lifecycle that includes both model buidling and deployment workflows. Ideally suited for continuous integration and continuous deployment (CI/CD) of ML models. Process data, extract features, train and test models, and register them in the model registry. Attach your own Git repository to the project for checking in and managing code versions. Kick off the model deployment workflow by approving the model registered in the model registry for deployment either manually or automatically. You can customize the seed code and the configuration files to suit your requirements. AWS CodePipeline is used to orchestrate the model deployment.\n\nModel building pipeline: SageMaker Pipelines\nCode repository: Third party Git\nOrchestration: AWS CodePipeline\n'

In [68]:
# Get the latest template version.
# Choose the most recently created DEFAULT artifact: ordering by the 'Name' string is
# lexicographic and would rank e.g. 'v10.0' below 'v2.0'.
default_artifacts = [
    artifact
    for artifact in sc.list_provisioning_artifacts(ProductId=product_id)['ProvisioningArtifactDetails']
    if artifact.get('Guidance') == 'DEFAULT'
]

if not default_artifacts:
    raise Exception(f"No DEFAULT provisioning artifact found for product {product_id}")

provisioning_artifact_id = max(
    default_artifacts,
    key=lambda artifact: artifact['CreatedTime'],
)['Id']

In [69]:
# Display the selected provisioning artifact (template version) id
provisioning_artifact_id

'pa-jt7oyaklc3eym'

In [70]:
# Show the details (name, description, template URL, status) of the selected template version
sc.describe_provisioning_artifact(ProductId=product_id, ProvisioningArtifactId=provisioning_artifact_id)

{'ProvisioningArtifactDetail': {'Id': 'pa-jt7oyaklc3eym',
  'Name': 'v2.0',
  'Description': 'Adding error handling for access denied for seed code checkin',
  'Type': 'CLOUD_FORMATION_TEMPLATE',
  'CreatedTime': datetime.datetime(2024, 12, 10, 5, 6, 31, tzinfo=tzlocal()),
  'Active': True,
  'Guidance': 'DEFAULT'},
 'Info': {'TemplateUrl': 'https://s3.us-east-1.amazonaws.com/ciclo-us-east-1-prod-product-templates/model_build_deploy_toolchain_3p_git_template.yml'},
 'Status': 'AVAILABLE',
 'ResponseMetadata': {'RequestId': '1166418b-fb61-4e1b-b6a4-9cb05bd4b008',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1166418b-fb61-4e1b-b6a4-9cb05bd4b008',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '414',
   'date': 'Fri, 14 Feb 2025 11:02:36 GMT'},
  'RetryAttempts': 0}}

In [75]:
# Build a unique project name from the current UTC time (month-day-hour-minute-second)
project_name = f"mlops-{strftime('%m-%d-%H-%M-%S', gmtime())}"

In [76]:
# Branch name for the Model Building and Training Code Repository
model_build_code_repository_branch = 'main'
# Full repository name of the Model Building and Training Code Repository, which would be username/reponame or organizationname/reponame
model_build_code_repository_full_name = "<ENTER YOUR FULL GITHUB REPO FOR MODEL BUILD NAME HERE>" # e.g. username/reponame

# Branch name for the Model Deployment Code Repository
model_deploy_code_repository_branch = 'main'
# Full repository name of the Model Deployment Code Repository, which would be username/reponame or organizationname/reponame
model_deploy_code_repository_full_name = "<ENTER YOUR FULL GITHUB REPO FOR MODEL DEPLOY NAME HERE>" # e.g. username/reponame

# Fail fast while a placeholder is still in place; later cells split the build
# repository name on '/' and therefore need the owner/repo form
for _repo_full_name in (model_build_code_repository_full_name, model_deploy_code_repository_full_name):
    if _repo_full_name.startswith("<") or _repo_full_name.count("/") != 1:
        raise ValueError(
            f"Repository name {_repo_full_name!r} must be set in the form username/reponame"
        )

In [77]:
# Provisioning parameters passed to the project template, as a list of Key/Value pairs
project_parameters = [
    {'Key': parameter_key, 'Value': parameter_value}
    for parameter_key, parameter_value in (
        ('ModelBuildCodeRepositoryBranch', model_build_code_repository_branch),
        ('ModelBuildCodeRepositoryFullname', model_build_code_repository_full_name),
        ('ModelDeployCodeRepositoryBranch', model_deploy_code_repository_branch),
        ('ModelDeployCodeRepositoryFullname', model_deploy_code_repository_full_name),
        ('CodeConnectionArn', code_connection_arn),
    )
]

In [91]:
# Echo the project name, the template name, and the parameters before creating the project
print(f'''Creating a {project_name} using {sc_product_name} with the following parameters:
{json.dumps(project_parameters, indent=2)}
''')

Creating a mlops-02-14-11-03-33 using MLOps template for model building, training, and deployment with third-party Git repositories using CodePipeline with the following parameters:
[
  {
    "Key": "ModelBuildCodeRepositoryBranch",
    "Value": "main"
  },
  {
    "Key": "ModelBuildCodeRepositoryFullname",
    "Value": "yevgeniyilyin/sagemaker-ai-model-build-2"
  },
  {
    "Key": "ModelDeployCodeRepositoryBranch",
    "Value": "main"
  },
  {
    "Key": "ModelDeployCodeRepositoryFullname",
    "Value": "yevgeniyilyin/sagemaker-ai-model-deploy-2"
  },
  {
    "Key": "CodeConnectionArn",
    "Value": "arn:aws:codeconnections:us-east-1:906545278380:connection/f76f091f-f02a-4390-8abd-184d1735ca3a"
  }
]



In [78]:
# Create the SageMaker project from the selected Service Catalog product and template version
r = sm.create_project(
    ProjectName=project_name,
    ProjectDescription="Model build and deploy project",
    ServiceCatalogProvisioningDetails={
        'ProductId': product_id,
        'ProvisioningArtifactId': provisioning_artifact_id,
        'ProvisioningParameters': project_parameters
    },
)

print(r)
# Keep the generated project id; later cells use it to build folder names and tags
project_id = r["ProjectId"]

{'ProjectArn': 'arn:aws:sagemaker:us-east-1:906545278380:project/mlops-02-14-11-03-33', 'ProjectId': 'p-ca7phmcraepa', 'ResponseMetadata': {'RequestId': '954c8ede-e181-4959-8ed0-f04946e002fb', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '954c8ede-e181-4959-8ed0-f04946e002fb', 'content-type': 'application/x-amz-json-1.1', 'content-length': '115', 'date': 'Fri, 14 Feb 2025 11:03:38 GMT'}, 'RetryAttempts': 0}}


In [84]:
# Project creation takes about 3-5 min.
# Poll every 10 seconds, give up after 15 minutes, and report the real outcome
# instead of printing "completed" when the creation actually failed.
poll_interval_seconds = 10
max_polls = 90

for _ in range(max_polls):
    project_status = sm.describe_project(ProjectName=project_name)['ProjectStatus']
    if project_status in ('CreateCompleted', 'CreateFailed'):
        break
    print("Waiting for project creation completion")
    sleep(poll_interval_seconds)
else:
    # The loop ran out of attempts without reaching a terminal status
    raise TimeoutError(
        f"MLOps project {project_name} is still in status {project_status} "
        f"after {max_polls * poll_interval_seconds} seconds"
    )

if project_status == 'CreateFailed':
    raise RuntimeError(f"MLOps project {project_name} creation failed")

print(f"MLOps project {project_name} creation completed")

MLOps project mlops-02-14-11-03-33 creation completed


In [85]:
# Stop the notebook here unless the project reached the CreateCompleted status
assert sm.describe_project(ProjectName=project_name)['ProjectStatus'] == 'CreateCompleted', 'Project status must be CreateCompleted!'

## Configure the MLOps project

In [122]:
# Sanity check: print the values the following cells depend on. A NameError means at
# least one of them is not defined in this kernel and must be set in the next cell.
try:
    print(project_name)
    print(project_id)
    print(model_build_code_repository_full_name)
    print(code_connection_arn)
except NameError:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("Set the code_connection_arn, project_name, and repository full name in the following code cell")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

mlops-02-14-11-03-33
p-ca7phmcraepa
yevgeniyilyin/sagemaker-ai-model-build-2
arn:aws:codeconnections:us-east-1:906545278380:connection/f76f091f-f02a-4390-8abd-184d1735ca3a


In [187]:
# project_name = "<SET TO THE NAME OF THE CREATED PROJECT>" # Keep commented out if you used option 1 to create a project
# model_build_code_repository_full_name = "<SET TO THE FULL NAME OF MODEL BUILD REPO>" # Keep commented out if you used option 1 to create a project

# Look up the project's id and ARN from its name
r = sm.describe_project(ProjectName=project_name)
project_id = r['ProjectId']
project_arn = r['ProjectArn']
# Part after the '/' in "owner/repo" (not referenced by the other cells shown in this notebook)
repository_name = model_build_code_repository_full_name.split('/')[1]
# Local folder layout for the cloned model build repository: <project name>/sagemaker-<project id>-modelbuild
git_folder = project_name
project_folder = f'sagemaker-{project_id}-modelbuild'
project_path = f'{git_folder}/{project_folder}'

# Persist the project identifiers with %store so they can be restored with %store -r
%store project_name
%store project_arn
%store project_id

print(f"Project path: {project_path}")

Stored 'project_name' (str)
Stored 'project_arn' (str)
Stored 'project_id' (str)
Project path: mlops-02-14-11-03-33/sagemaker-p-ca7phmcraepa-modelbuild


### Explore the project in the Studio UI

In [105]:
# Render a clickable link to the project page in the Studio UI
project_url = f"https://studio-{domain_id}.studio.{region}.sagemaker.aws/projects/{project_name}/"
display(
    HTML(f'<b>See <a target="top" href="{project_url}">the project</a> in the Studio UI</b>')
)

### Clone the project seed code to the JupyterLab file system

In [158]:
def copy_output(text):
    """Render ``text`` in a ``<pre>`` block with a "Copy" button.

    The text is HTML-escaped for display and handed to the clipboard call as a
    JSON-encoded JavaScript string literal. Text containing double quotes,
    backticks, backslashes, ``${`` or HTML markup therefore neither breaks the
    ``onclick`` attribute nor changes what is copied.

    Args:
        text: the text (for example a block of shell commands) to show and copy.

    Returns:
        An ``IPython.display.HTML`` object; display it as the last expression of a cell.
    """
    import html  # local import: html is not among the notebook's top-level imports

    # Shown verbatim inside <pre>; only markup characters need escaping there
    pre_text = html.escape(text, quote=False)
    # json.dumps produces a valid JS string literal (quotes, backslashes, newlines escaped);
    # html.escape then protects that literal's double quotes inside the attribute value
    js_literal = html.escape(json.dumps(text), quote=True)

    return HTML(f'''
        <div style="position: relative;">
            <pre>{pre_text}</pre>
            <button onclick="navigator.clipboard.writeText({js_literal})"
                    style="position: absolute; top: 5px; right: 5px;">
                Copy
            </button>
        </div>
    ''')
    
# Shell commands to run in a terminal: configure git credentials, create the target
# folder, and clone the model build repository through the code connection.
# The clone URL is assembled from the connection ARN: field 4 (split on ':') is the
# account id and the last '/'-separated segment is the connection id.
# NOTE(review): credential.helper is set twice with --global, so the second value
# presumably replaces the cache helper — confirm this is intended.
cmd = f'''
git config --global credential.UseHttpPath true
git config --global credential.helper 'cache --timeout=720000'
git config --global credential.helper '!aws codecommit credential-helper $@'

mkdir -p $HOME/{project_path}

git clone https://codeconnections.{region}.amazonaws.com/git-http/{code_connection_arn.split(':')[4]}/{region}/{code_connection_arn.split(':')[-1].split('/')[-1]}/{model_build_code_repository_full_name}.git  $HOME/{project_path}
'''

# Show the commands with a Copy button
copy_output(cmd)

### 2. Replace pipeline construction code

In [161]:
# see the workshop folder name
!pwd

/home/sagemaker-user/amazon-sagemaker-from-idea-to-production


In [162]:
# The shell cells below address the workshop folder as ~/{workshop_folder}. If your local
# workshop folder is different, set workshop_folder to its path relative to your home directory.
workshop_folder = "amazon-sagemaker-from-idea-to-production"

In [194]:
# Prepare the cloned repository: keep the seed files under "-original" names, rename the
# seed pipeline folder, then copy in the workshop's pipeline steps and configuration.
# NOTE: the mv commands are not idempotent — once a source has been renamed, re-running
# this cell reports "cannot stat" for it.
!mkdir -p ~/{workshop_folder}/pipelines
!mv ~/{project_path}/codebuild-buildspec.yml ~/{project_path}/codebuild-buildspec-original.yml
!mv ~/{project_path}/setup.py ~/{project_path}/setup-original.py
!mv ~/{project_path}/pipelines/abalone ~/{project_path}/pipelines/fromideatoprod
!mv ~/{project_path}/pipelines/fromideatoprod/pipeline.py ~/{project_path}/pipelines/fromideatoprod/pipeline-original.py
# cp without -r copies files only; sub-directories such as __pycache__ are skipped with a warning
!cp ~/{workshop_folder}/pipeline_steps/* ~/{project_path}/pipelines/
!cp ~/{workshop_folder}/pipeline_steps/* ~/{workshop_folder}/pipelines/
!cp ~/{workshop_folder}/requirements.txt ~/{project_path}
!cp ~/{workshop_folder}/config.yaml ~/{project_path}

mv: cannot stat '/home/sagemaker-user/mlops-02-14-11-03-33/sagemaker-p-ca7phmcraepa-modelbuild/pipelines/abalone': No such file or directory
cp: -r not specified; omitting directory '/home/sagemaker-user/amazon-sagemaker-from-idea-to-production/pipeline_steps/__pycache__'
cp: -r not specified; omitting directory '/home/sagemaker-user/amazon-sagemaker-from-idea-to-production/pipeline_steps/__pycache__'


In [195]:
%%writefile pipeline.py

import pandas as pd
import json
import boto3
import pathlib
import io
import os
import sagemaker
import mlflow
from time import gmtime, strftime, sleep
from sagemaker.deserializers import CSVDeserializer
from sagemaker.serializers import CSVSerializer

from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import (
    ProcessingInput, 
    ProcessingOutput, 
    ScriptProcessor
)
from sagemaker.inputs import TrainingInput

from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import (
    ProcessingStep, 
    TrainingStep, 
    CreateModelStep,
    CacheConfig
)
from sagemaker.workflow.check_job_config import CheckJobConfig
from sagemaker.workflow.parameters import (
    ParameterInteger, 
    ParameterFloat, 
    ParameterString, 
    ParameterBoolean
)
from sagemaker.workflow.quality_check_step import (
    DataQualityCheckConfig,
    ModelQualityCheckConfig,
    QualityCheckStep,
)
from sagemaker.workflow.clarify_check_step import (
    ModelBiasCheckConfig, 
    ClarifyCheckStep, 
    ModelExplainabilityCheckConfig
)
from sagemaker import Model
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.conditions import (
    ConditionGreaterThan,
    ConditionGreaterThanOrEqualTo
)
from sagemaker.workflow.parallelism_config import ParallelismConfiguration
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import (
    Join,
    JsonGet
)
from sagemaker.workflow.lambda_step import (
    LambdaStep,
    LambdaOutput,
    LambdaOutputTypeEnum,
)
from sagemaker.lambda_helper import Lambda

from sagemaker.model_metrics import (
    MetricsSource, 
    ModelMetrics, 
    FileSource
)
from sagemaker.drift_check_baselines import DriftCheckBaselines
from sagemaker.workflow.pipeline_definition_config import PipelineDefinitionConfig 
from sagemaker.image_uris import retrieve
from sagemaker.workflow.function_step import step
from sagemaker.workflow.step_outputs import get_step
from sagemaker.model_monitor import DatasetFormat, model_monitoring

from pipelines.preprocess import preprocess
from pipelines.evaluate import evaluate
from pipelines.register import register
from pipelines.extract import prepare_datasets

def get_sagemaker_client(region):
    """Create a boto3 SageMaker client for the given region.

    Args:
        region: the aws region the client is bound to

    Returns:
        boto3 "sagemaker" client
    """
    regional_session = boto3.Session(region_name=region)
    return regional_session.client("sagemaker")

def get_pipeline_session(region, bucket_name):
    """Build a PipelineSession for a region and an artifact bucket.

    Args:
        region: the aws region used for the underlying boto3 session
        bucket_name: the bucket passed to the session as its default bucket

    Returns:
        PipelineSession instance
    """
    regional_session = boto3.Session(region_name=region)

    return PipelineSession(
        boto_session=regional_session,
        sagemaker_client=regional_session.client("sagemaker"),
        default_bucket=bucket_name,
    )

def get_pipeline_custom_tags(new_tags, region, sagemaker_project_name=None):
    """Append the tags of a SageMaker project to ``new_tags``.

    Best effort: any error while looking up the project or its tags is printed
    and the list is returned as it is at that point.

    Args:
        new_tags: list of tags to extend; it is modified in place
        region: the aws region of the project
        sagemaker_project_name: name of the project whose tags are read

    Returns:
        the same ``new_tags`` list
    """
    try:
        print(f"Getting project tags for {sagemaker_project_name}")

        client = get_sagemaker_client(region)
        described_project = client.describe_project(ProjectName=sagemaker_project_name)
        tags_on_project = client.list_tags(ResourceArn=described_project['ProjectArn'])['Tags']

        print(f"Project tags: {tags_on_project}")

        new_tags.extend(tags_on_project)

    except Exception as e:
        print(f"Error getting project tags: {e}")

    return new_tags
    
def get_pipeline(
    region,
    sagemaker_project_id=None,
    sagemaker_project_name=None,
    role=None,
    bucket_name=None,
    bucket_prefix="from-idea-to-prod/xgboost",
    input_s3_url=None,
    feature_group_name=None,
    model_package_group_name="from-idea-to-prod-model-group",
    pipeline_name_prefix="from-idea-to-prod-pipeline",
    process_instance_type="ml.m5.large",
    train_instance_type="ml.m5.xlarge",
    test_score_threshold=0.70,
    tracking_server_arn=None,
):
    """Gets a SageMaker ML Pipeline instance.

    The pipeline prepares the datasets, trains an XGBoost model, evaluates it and,
    when the test AUC reaches the threshold, registers the model; otherwise the
    execution is failed.

    Args:
        region: the aws region for the session and the XGBoost image lookup
        sagemaker_project_id: appended to ``pipeline_name_prefix`` to form the pipeline name
        sagemaker_project_name: accepted for interface compatibility; not used in this function
        role: execution role ARN; defaults to the execution role of the session
        bucket_name: bucket for the artifacts; defaults to the session's default bucket
        bucket_prefix: key prefix under the bucket for all pipeline outputs
        input_s3_url: S3 url of the input dataset; when set, the ``preprocess`` step is used
        feature_group_name: feature group to read from when ``input_s3_url`` is not set
        model_package_group_name: default model package group for registration
        pipeline_name_prefix: prefix of the pipeline name
        process_instance_type: default instance type for the function steps
        train_instance_type: default instance type for the training step
        test_score_threshold: default minimal AUC on the test dataset
        tracking_server_arn: MLflow tracking server ARN; when None, the first server
            with status 'Created' is used

    Returns:
        an instance of a pipeline, or None when neither ``feature_group_name`` nor
        ``input_s3_url`` is given or when no tracking server can be found
    """
    if feature_group_name is None and input_s3_url is None:
        print("One of feature_group_name or input_s3_url must be provided. Exiting...")
        return None

    session = get_pipeline_session(region, bucket_name)
    sm = session.sagemaker_client

    # Fall back to the session's default bucket so the S3 urls below never contain "None"
    if bucket_name is None:
        bucket_name = session.default_bucket()

    if role is None:
        role = sagemaker.session.get_execution_role(session)

    print(f"sagemaker version: {sagemaker.__version__}")
    print(f"Execution role: {role}")
    print(f"Input S3 URL: {input_s3_url}")
    print(f"Feature group: {feature_group_name}")
    print(f"Model package group: {model_package_group_name}")
    print(f"Pipeline name prefix: {pipeline_name_prefix}")
    print(f"Tracking server ARN: {tracking_server_arn}")

    pipeline_name = f"{pipeline_name_prefix}-{sagemaker_project_id}"
    # The experiment name passed to the steps is the pipeline name
    experiment_name = pipeline_name

    output_s3_prefix = f"s3://{bucket_name}/{bucket_prefix}"
    # Set the output S3 url for model artifact
    output_s3_url = f"{output_s3_prefix}/output"
    # Set the output S3 url for feature store query results
    output_query_location = f'{output_s3_prefix}/offline-store/query_results'

    xgboost_image_uri = sagemaker.image_uris.retrieve(
            "xgboost", 
            region=region,
            version="1.5-1"
    )

    # If no tracking server ARN, try to find an active MLflow server
    if tracking_server_arn is None:
        r = sm.list_mlflow_tracking_servers(
            TrackingServerStatus='Created',
        )['TrackingServerSummaries']

        if len(r) < 1:
            print("You don't have any running MLflow servers. Exiting...")
            return None
        else:
            tracking_server_arn = r[0]['TrackingServerArn']
            print(f"Use the tracking server ARN:{tracking_server_arn}")

    # Parameters for pipeline execution

    # Set processing instance type
    process_instance_type_param = ParameterString(
        name="ProcessingInstanceType",
        default_value=process_instance_type,
    )

    # Set training instance type
    train_instance_type_param = ParameterString(
        name="TrainingInstanceType",
        default_value=train_instance_type,
    )

    # Set model approval param
    model_approval_status_param = ParameterString(
        name="ModelApprovalStatus",
        default_value="PendingManualApproval"
    )

    # Minimal threshold for model performance on the test dataset
    test_score_threshold_param = ParameterFloat(
        name="TestScoreThreshold", 
        default_value=test_score_threshold
    )

    # S3 url for the input dataset (the string "None" stands in for a missing value)
    input_s3_url_param = ParameterString(
        name="InputDataUrl",
        default_value=input_s3_url if input_s3_url else "None",
    )

    # Feature group name for the input featureset (the string "None" stands in for a missing value)
    feature_group_name_param = ParameterString(
        name="FeatureGroupName",
        default_value=feature_group_name if feature_group_name else "None",
    )

    # Model package group name
    model_package_group_name_param = ParameterString(
        name="ModelPackageGroupName",
        default_value=model_package_group_name,
    )

    # MLflow tracking server ARN
    tracking_server_arn_param = ParameterString(
        name="TrackingServerARN",
        default_value=tracking_server_arn,
    )

    # Define step cache config
    cache_config = CacheConfig(
        enable_caching=True,
        expire_after="P30d" # 30-day
    )

    # Construct the pipeline

    # Get datasets: preprocess the S3 input when input_s3_url is given,
    # otherwise extract the datasets from the feature group
    step_get_datasets = step(
            preprocess, 
            role=role,
            instance_type=process_instance_type_param,
            name=f"preprocess",
            keep_alive_period_in_seconds=3600,
    )(
        input_data_s3_path=input_s3_url_param,
        output_s3_prefix=output_s3_prefix,
        tracking_server_arn=tracking_server_arn_param,
        experiment_name=experiment_name,
        pipeline_run_name=ExecutionVariables.PIPELINE_EXECUTION_ID,
    ) if input_s3_url else step(
        prepare_datasets, 
        role=role,
        instance_type=process_instance_type_param,
        name=f"extract-featureset",
        keep_alive_period_in_seconds=3600,
    )(
        feature_group_name=feature_group_name_param,
        output_s3_prefix=output_s3_prefix,
        query_output_s3_path=output_query_location,
        tracking_server_arn=tracking_server_arn_param,
        experiment_name=experiment_name,
        pipeline_run_name=ExecutionVariables.PIPELINE_EXECUTION_ID,
    )

    # Instantiate an XGBoost estimator object
    estimator = sagemaker.estimator.Estimator(
        image_uri=xgboost_image_uri,
        role=role, 
        instance_type=train_instance_type_param,
        instance_count=1,
        output_path=output_s3_url,
        sagemaker_session=session,
        base_job_name=f"{pipeline_name}-train"
    )

    # Define algorithm hyperparameters
    estimator.set_hyperparameters(
        num_round=100, # the number of rounds to run the training
        max_depth=3, # maximum depth of a tree
        eta=0.5, # step size shrinkage used in updates to prevent overfitting
        alpha=2.5, # L1 regularization term on weights
        objective="binary:logistic",
        eval_metric="auc", # evaluation metrics for validation data
        subsample=0.8, # subsample ratio of the training instance
        colsample_bytree=0.8, # subsample ratio of columns when constructing each tree
        min_child_weight=3, # minimum sum of instance weight (hessian) needed in a child
        early_stopping_rounds=10, # the model trains until the validation score stops improving
        verbosity=1, # verbosity of printing messages
    )

    # train step
    step_train = TrainingStep(
        name=f"train",
        step_args=estimator.fit(
            {
                "train": TrainingInput(
                    step_get_datasets['train_data'],
                    content_type="text/csv",
                ),
                "validation": TrainingInput(
                    step_get_datasets['validation_data'],
                    content_type="text/csv",
                ),
            }
        ),
        cache_config=cache_config,
    )

    # Evaluation step
    step_evaluate = step(
        evaluate,
        role=role,
        instance_type=process_instance_type_param,
        name=f"evaluate",
        keep_alive_period_in_seconds=3600,
    )(
        test_x_data_s3_path=step_get_datasets['test_x_data'],
        test_y_data_s3_path=step_get_datasets['test_y_data'],
        model_s3_path=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        output_s3_prefix=output_s3_prefix,
        tracking_server_arn=tracking_server_arn_param,
        experiment_name=step_get_datasets['experiment_name'],
        pipeline_run_id=step_get_datasets['pipeline_run_id'],
    )

    # register model step
    step_register = step(
        register,
        role=role,
        instance_type=process_instance_type_param,
        name=f"register",
        keep_alive_period_in_seconds=3600,
    )(
        training_job_name=step_train.properties.TrainingJobName,
        model_package_group_name=model_package_group_name_param,
        model_approval_status=model_approval_status_param,
        evaluation_result=step_evaluate['evaluation_result'],
        output_s3_prefix=output_s3_url,
        tracking_server_arn=tracking_server_arn_param,
        experiment_name=step_get_datasets['experiment_name'],
        pipeline_run_id=step_get_datasets['pipeline_run_id'],
    )

    # fail the pipeline execution step
    step_fail = FailStep(
        name=f"fail",
        error_message=Join(on=" ", values=["Execution failed due to AUC Score < ", test_score_threshold_param]),
    )

    # condition to check in the condition step
    condition_gte = ConditionGreaterThanOrEqualTo(
            left=step_evaluate['evaluation_result']['classification_metrics']['auc_score']['value'],  
            right=test_score_threshold_param,
    )

    # conditional register step: register when the condition holds, fail otherwise
    step_conditional_register = ConditionStep(
        name=f"check-metrics",
        conditions=[condition_gte],
        if_steps=[step_register],
        else_steps=[step_fail],
    )

    # Create a pipeline object; only the condition step is listed explicitly,
    # the other steps are referenced through it
    pipeline = Pipeline(
        name=f"{pipeline_name}",
        parameters=[
            input_s3_url_param,
            feature_group_name_param,
            process_instance_type_param,
            train_instance_type_param,
            model_approval_status_param,
            test_score_threshold_param,
            model_package_group_name_param,
            tracking_server_arn_param,
        ],
        steps=[step_conditional_register],
        pipeline_definition_config=PipelineDefinitionConfig(use_custom_job_prefix=True)
    )

    return pipeline

Overwriting pipeline.py


In [196]:
# Copy the pipeline.py written above into the pipeline package of the cloned repository
!cp ~/{workshop_folder}/pipeline.py ~/{project_path}/pipelines/fromideatoprod/

In [197]:
from pipeline import get_pipeline

In [None]:
# Build the pipeline object locally to check the construction code before pushing it.
# If you created a feature store in the notebook 3, you can set the feature_group_name parameter instead of input_s3_url to take the data from the feature store
p = get_pipeline(
    region=region,
    sagemaker_project_id=project_id,
    sagemaker_project_name=project_name,
    role=sm_role,
    bucket_name=bucket_name,
    bucket_prefix=bucket_prefix,
    input_s3_url=input_s3_url,
    # feature_group_name=dataset_feature_group_name,
    model_package_group_name=model_package_group_name,
    pipeline_name_prefix=pipeline_name,
    process_instance_type="ml.m5.large",
    train_instance_type="ml.m5.xlarge",
    test_score_threshold=0.70,
    tracking_server_arn=mlflow_arn,
)

In [None]:
# Generate and display the pipeline definition to check that the pipeline can be built
p.definition()

In [None]:
# Create the pipeline in SageMaker, or update it if it already exists
p.upsert(role_arn=sm_role)

In [201]:
from IPython.display import HTML

# Render a clickable link to the pipeline graph in the Studio UI
created_pipeline_name = p.describe()['PipelineName']
pipeline_graph_url = f"https://studio-{domain_id}.studio.{region}.sagemaker.aws/pipelines/{created_pipeline_name}/graph"
display(
    HTML(f'<b>See <a target="top" href="{pipeline_graph_url}">the pipeline</a> in the Studio UI</b>')
)

#### Control project ownership with resource tags

In [202]:
# Look up the model package group. boto3 reports a missing group by raising a
# ClientError, so catch it instead of relying on an absent ARN in the response.
try:
    model_package_group_arn = sm.describe_model_package_group(
        ModelPackageGroupName=model_package_group_name
    ).get("ModelPackageGroupArn")
except sm.exceptions.ClientError as e:
    print(f"Cannot describe the model package group {model_package_group_name}: {e}")
    model_package_group_arn = None

if model_package_group_arn:
    # The last segment of the project ARN is the project name
    project_name_tag = project_arn.split("/")[-1]
    print(f"Adding tags {project_name_tag} and {project_id} for model package group {model_package_group_arn}")
    r = sm.add_tags(
        ResourceArn=model_package_group_arn,
        Tags=[
            {
                'Key': 'sagemaker:project-name',
                'Value': project_name_tag
            },
            {
                'Key': 'sagemaker:project-id',
                'Value': project_id
            },
        ]
    )
    print(r)
else:
    print(f"The model package group {model_package_group_name} doesn't exist")

# Show the resulting tags; there is nothing to list when the group was not found
sm.list_tags(ResourceArn=model_package_group_arn)["Tags"] if model_package_group_arn else []

Adding tags mlops-02-14-11-03-33 and p-ca7phmcraepa for model package group arn:aws:sagemaker:us-east-1:906545278380:model-package-group/from-idea-to-prod-pipeline-model-12-06-30-22
{'Tags': [{'Key': 'sagemaker:project-name', 'Value': 'mlops-02-14-11-03-33'}, {'Key': 'sagemaker:project-id', 'Value': 'p-ca7phmcraepa'}], 'ResponseMetadata': {'RequestId': 'be9be1bf-da7a-4cdf-944f-0ae71254b32a', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'be9be1bf-da7a-4cdf-944f-0ae71254b32a', 'content-type': 'application/x-amz-json-1.1', 'content-length': '130', 'date': 'Sun, 16 Feb 2025 21:18:54 GMT'}, 'RetryAttempts': 0}}


[{'Key': 'sagemaker:project-name', 'Value': 'mlops-02-14-11-03-33'},
 {'Key': 'sagemaker:project-id', 'Value': 'p-ca7phmcraepa'}]

### 3. Modify the build specification file

In [203]:
# Print the values to put into the build specification in the next cell.
# If a name used in the first message is undefined (NameError), fall back to the
# message without the feature group name.
try:
    print(f"""
        INPUT-S3-URL: {input_s3_url}
        FEATURE-GROUP-NAME: {dataset_feature_group_name}
        MODEL-PACKAGE-GROUP-NAME: {project_name}-{project_id}
        PIPELINE-NAME-PREFIX: {pipeline_name}
        ROLE: {sm_role}
        TRACKING-SERVER-ARN: {mlflow_arn}
        """)
except NameError:
    print(f"""
        Dataset feature group name is not defined, use input_s3_url instead:
        ********************************************************************
        
        INPUT-S3-URL: {input_s3_url}
        MODEL-PACKAGE-GROUP-NAME: {project_name}-{project_id}
        PIPELINE-NAME-PREFIX: {pipeline_name}
        ROLE: {sm_role}
        TRACKING-SERVER-ARN: {mlflow_arn}
        """)


        INPUT-S3-URL: s3://sagemaker-us-east-1-906545278380/from-idea-to-prod/xgboost/input/bank-additional-full.csv
        FEATURE-GROUP-NAME: from-idea-to-prod-12-06-30-53
        MODEL-PACKAGE-GROUP-NAME: mlops-02-14-11-03-33-p-ca7phmcraepa
        PIPELINE-NAME-PREFIX: from-idea-to-prod-pipeline-12-06-30-22
        ROLE: arn:aws:iam::906545278380:role/mlops-workshop-domain-SageMakerExecutionRole-rIgas55nwmQD
        TRACKING-SERVER-ARN: arn:aws:sagemaker:us-east-1:906545278380:mlflow-tracking-server/test-mlflow
        


In [204]:
%%writefile codebuild-buildspec.yml

# Build specification: installs the project package and creates/updates and runs the SageMaker pipeline.
# Replace the hard-coded input_s3_url, pipeline_name_prefix, role, and tracking_server_arn values in
# --kwargs with the values printed by the previous notebook cell.
version: 0.2

phases:
  install:
    runtime-versions:
      python: 3.10
    commands:
      - pip install --upgrade --force-reinstall . "awscli>1.20.30"
      - pip install --upgrade mlflow sagemaker-mlflow s3fs xgboost

  build:
    commands:
      - export SAGEMAKER_USER_CONFIG_OVERRIDE="./config.yaml"
      - export PYTHONUNBUFFERED=TRUE
      # <project name>-<project id>, used below as the model package group name
      - export SAGEMAKER_PROJECT_NAME_ID="${SAGEMAKER_PROJECT_NAME}-${SAGEMAKER_PROJECT_ID}"
      - |
        run-pipeline --module-name pipelines.fromideatoprod.pipeline \
          --role-arn $SAGEMAKER_PIPELINE_ROLE_ARN \
          --tags "[{\"Key\":\"sagemaker:project-name\",\"Value\":\"${SAGEMAKER_PROJECT_NAME}\"}, {\"Key\":\"sagemaker:project-id\", \"Value\":\"${SAGEMAKER_PROJECT_ID}\"}]" \
          --kwargs "{ \
                \"input_s3_url\":\"s3://sagemaker-us-east-1-906545278380/from-idea-to-prod/xgboost/input/bank-additional-full.csv\", \
                \"model_package_group_name\":\"${SAGEMAKER_PROJECT_NAME_ID}\",\
                \"pipeline_name_prefix\":\"from-idea-to-prod-pipeline-12-06-30-22\",\
                \"role\":\"arn:aws:iam::906545278380:role/mlops-workshop-domain-SageMakerExecutionRole-rIgas55nwmQD\",\
                \"tracking_server_arn\":\"arn:aws:sagemaker:us-east-1:906545278380:mlflow-tracking-server/test-mlflow\", \
                \"region\":\"${AWS_REGION}\", \
                \"sagemaker_project_name\":\"${SAGEMAKER_PROJECT_NAME}\",\
                \"sagemaker_project_id\":\"${SAGEMAKER_PROJECT_ID}\",\
                \"bucket_name\":\"${ARTIFACT_BUCKET}\"\
                    }"
      - echo "Create/update of the SageMaker Pipeline and a pipeline execution completed."

Overwriting codebuild-buildspec.yml


In [205]:
# Copy the build specification written above into the cloned repository
!cp ~/{workshop_folder}/codebuild-buildspec.yml ~/{project_path}/codebuild-buildspec.yml

In [206]:
%%writefile setup.py
import os
import setuptools


# Package metadata is read from pipelines/__version__.py by executing it into `about`
about = {}
here = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(here, "pipelines", "__version__.py")) as f:
    exec(f.read(), about)


# Resolve README.md next to this file so the build does not depend on the working directory
with open(os.path.join(here, "README.md"), "r") as f:
    readme = f.read()


required_packages = ["sagemaker"]
extras = {
    "test": [
        "black",
        "coverage",
        "flake8",
        "mock",
        "pydocstyle",
        "pytest",
        "pytest-cov",
        "sagemaker",
        "tox",
    ]
}
setuptools.setup(
    name=about["__title__"],
    description=about["__description__"],
    version=about["__version__"],
    author=about["__author__"],
    # Read the value from the metadata instead of passing the literal key name in a list
    author_email=about["__author_email__"],
    long_description=readme,
    long_description_content_type="text/markdown",
    url=about["__url__"],
    license=about["__license__"],
    packages=setuptools.find_packages(),
    include_package_data=True,
    python_requires=">=3.6",
    install_requires=required_packages,
    extras_require=extras,
    # Console commands installed with the package; run-pipeline is called by the build specification
    entry_points={
        "console_scripts": [
            "get-pipeline-definition=pipelines.get_pipeline_definition:main",
            "run-pipeline=pipelines.run_pipeline:main",
        ]
    },
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Natural Language :: English",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
    ],
)

Overwriting setup.py


In [207]:
# Copy the setup.py written above into the cloned repository
!cp ~/{workshop_folder}/setup.py ~/{project_path}/setup.py

---

## Run the CI/CD for the model building pipeline

In [188]:
# Shell commands to run in a terminal: set the git identity (replace the example
# e-mail and name with your own), then commit and push all changes in the cloned repository
cmd = f'''
cd ~/{project_path}

git config --global user.email "you@example.com"
git config --global user.name "Your Name"
  
git add -A
git commit -am "customize project"
git push
'''

# Show the commands with a Copy button
copy_output(cmd)

In [190]:
# Render a clickable link to the pipeline executions in the Studio UI
executed_pipeline_name = p.describe()['PipelineName']
pipeline_executions_url = f"https://studio-{domain_id}.studio.{region}.sagemaker.aws/pipelines/{executed_pipeline_name}/executions/"
display(
    HTML(f'<b>See <a target="top" href="{pipeline_executions_url}">the pipeline executions</a> in the Studio UI</b>')
)

## View the model package in the model registry

In [191]:
# Render a clickable link to the registered model versions in the Studio UI
model_versions_url = f"https://studio-{domain_id}.studio.{region}.sagemaker.aws/models/registered-models/{project_name}-{project_id}/versions"
display(
    HTML(f'<b>See <a target="top" href="{model_versions_url}">the model package versions</a> in the Studio UI</b>')
)