In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import sagemaker
import json
import boto3
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split
from sklearn import datasets

# Low-level SageMaker API client, used for the calls the SDK session does not wrap
sm_client = boto3.client(service_name='sagemaker')

In [3]:
# Retrieve the default bucket, region and execution role of this session
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name
print(region)
role = get_execution_role()

prefix='datasets/iris'
project_name = "iris-multi-13"

# Validate the project name BEFORE it is used in any API call, so a bad name
# fails here with a clear message instead of an opaque service error.
assert len(project_name) <= 15, (
    "the project name should not have more than 15 chars (got %d: %r)"
    % (len(project_name), project_name)
)

# Look up the id of the existing SageMaker project; it is used later to tag the pipeline
project_id = sm_client.describe_project(ProjectName=project_name)['ProjectId']

# The model package group shares its name with the project
model_package_group_name = project_name
print("Model package group name: %s" % model_package_group_name)

us-east-1
Model package group name: iris-multi-13


## Let's load the dataset, take a look and then prepare the training pipeline

In [4]:
# Load the iris dataset and put the numeric class id in the first column
iris = datasets.load_iris()
dataset = np.column_stack((iris.target, iris.data))

column_names = ['iris_id', *iris.feature_names]
df = pd.DataFrame(data=dataset, columns=column_names)


def _species_name(class_id):
    """Translate a numeric iris class id into its species name."""
    if class_id == 0:
        return 'setosa'
    if class_id == 1:
        return 'versicolor'
    return 'virginica'


# Human-readable label next to the numeric id, for inspection only
df['species'] = df['iris_id'].map(_species_name)

df.head()

Unnamed: 0,iris_id,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,0.0,5.1,3.5,1.4,0.2,setosa
1,0.0,4.9,3.0,1.4,0.2,setosa
2,0.0,4.7,3.2,1.3,0.2,setosa
3,0.0,4.6,3.1,1.5,0.2,setosa
4,0.0,5.0,3.6,1.4,0.2,setosa


### We need to upload the dataset to an S3 bucket

In [5]:
# Features and labels as plain arrays
X, y = iris.data, iris.target

# Stratified split with a fixed seed so the two files are reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

# Write header-less CSV files with the label in the first column
yX_train = np.column_stack((y_train, X_train))
yX_test = np.column_stack((y_test, X_test))
for csv_name, rows in (("iris_train.csv", yX_train), ("iris_test.csv", yX_test)):
    np.savetxt(csv_name, rows, delimiter=",", fmt='%0.3f')

# Upload the dataset to an S3 bucket
data_key_prefix = '%s/data' % prefix
input_train = sagemaker_session.upload_data(path='iris_train.csv', key_prefix=data_key_prefix)
input_test = sagemaker_session.upload_data(path='iris_test.csv', key_prefix=data_key_prefix)
print(input_train)
print(input_test)

s3://sagemaker-us-east-1-256537223841/datasets/iris/data/iris_train.csv
s3://sagemaker-us-east-1-256537223841/datasets/iris/data/iris_test.csv


### Creating the ML Pipeline

#### Pipeline input parameters

In [5]:
from sagemaker.workflow.parameters import ParameterInteger, ParameterString

# Compute resources for the training job (overridable per pipeline execution)
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.c4.2xlarge")
training_instance_count = ParameterInteger(name="TrainingInstanceCount", default_value=1)

# S3 locations of the train/test CSV files; they default to the files uploaded above
input_train_data = ParameterString(name="InputDataTrain", default_value=input_train)
input_test_data = ParameterString(name="InputDataTest", default_value=input_test)

In [6]:
from sagemaker.estimator import Estimator
import time

# The timestamp makes the model output path (and later the pipeline name) unique per run
ts = time.strftime('%Y-%m-%d-%H-%M-%S')
model_path = "s3://{}/iris-{}".format(bucket, ts)

# Resolve the XGBoost container image for this region.
# The parameter's default value is used because a concrete instance type is passed here.
image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.0-1",
    py_version="py3",
    instance_type=training_instance_type.default_value,
)

#### Estimator that will run the training process

In [7]:

# Estimator describing the training job; instance type/count come from pipeline parameters
xgb_train = Estimator(
    role=role,
    image_uri=image_uri,
    instance_count=training_instance_count,
    instance_type=training_instance_type,
    output_path=model_path,
    sagemaker_session=sagemaker_session,
)

# XGBoost hyperparameters; num_class is derived from the labels actually present in y
xgb_hyperparameters = {
    "objective": 'multi:softmax',
    "num_class": len(np.unique(y)),
    "num_round": 30,
    "eta": 0.1,
    "max_depth": 10,
    "min_child_weight": 6,
    "gamma": 4,
    "alpha": 10,
    "silent": 0,
}
xgb_train.set_hyperparameters(**xgb_hyperparameters)

### Now, let's create the steps. First, the training step

In [8]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

# One CSV channel per dataset; the S3 locations are resolved from pipeline parameters
train_channel = TrainingInput(s3_data=input_train_data, content_type="text/csv")
validation_channel = TrainingInput(s3_data=input_test_data, content_type="text/csv")

step_train = TrainingStep(
    name="IrisTrain",
    estimator=xgb_train,
    inputs={"train": train_channel, "validation": validation_channel},
)

### Then, the Register step that will add a new version to the Model Registry

In [9]:
from sagemaker.workflow.step_collections import RegisterModel

# Artifact produced by the training step; referencing its property links the two steps
trained_model_artifact = step_train.properties.ModelArtifacts.S3ModelArtifacts

# NOTE: model_approval_status is not available as arg in service dsl currently
step_register = RegisterModel(
    name="IrisRegisterModel",
    estimator=xgb_train,
    model_data=trained_model_artifact,
    model_package_group_name=model_package_group_name,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
)

### Now, we can create the pipeline

In [10]:
from botocore.exceptions import ClientError, ValidationError
from sagemaker.workflow.pipeline import Pipeline

# NOTE:
# condition steps have issues in service so we go straight to step_register
pipeline_name = "IrisTrainRegister-%s" % ts
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        training_instance_type,
        training_instance_count,
        input_train_data,
        input_test_data
    ],
    steps=[step_train, step_register],
    sagemaker_session=sagemaker_session,
)

# Create the pipeline; if one with this name already exists, reuse it instead of failing.
try:
    response = pipeline.create(role_arn=role)
except ClientError as e:
    error = e.response["Error"]
    # The original check only matched the code "ValidationError"; SageMaker validation
    # failures are normally reported as "ValidationException", so accept both.
    is_validation_error = error["Code"] in ("ValidationError", "ValidationException")
    # NOTE(review): the "already exists" wording is believed to be the newer service
    # message for duplicate pipeline names -- confirm against the current API behaviour.
    is_duplicate_name = (
        "Pipeline names must be unique within" in error["Message"]
        or "already exists" in error["Message"]
    )
    if not (is_validation_error and is_duplicate_name):
        raise
    print(error["Message"])
    response = pipeline.describe()

# Tag the pipeline with the project name and id
pipeline_arn = response["PipelineArn"]
sm_client.add_tags(
    ResourceArn=pipeline_arn,
    Tags=[
        {'Key': 'sagemaker:project-name', 'Value': project_name },
        {'Key': 'sagemaker:project-id', 'Value': project_id }
    ]
)
print(pipeline_arn)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


arn:aws:sagemaker:us-east-1:256537223841:pipeline/iristrainregister-2023-02-09-11-14-37


### And then, run it

In [11]:
# Start one execution of the pipeline, overriding the instance-count parameter
start_response = pipeline.start(parameters={
    "TrainingInstanceCount": "1"
})

pipeline_execution_arn = start_response.arn
print(pipeline_execution_arn)

# Poll until the execution reaches a final state. 'Stopping' is still transitional,
# so keep waiting through it instead of reporting it as the final status.
in_progress_statuses = ('Executing', 'Stopping')
while True:
    resp = sm_client.describe_pipeline_execution(PipelineExecutionArn=pipeline_execution_arn)
    execution_status = resp['PipelineExecutionStatus']
    if execution_status not in in_progress_statuses:
        break
    print('Running...' if execution_status == 'Executing' else 'Stopping...')
    time.sleep(15)

print(execution_status, pipeline_execution_arn)

# Fail loudly on anything but success: otherwise the following cells would go on to
# approve whatever model package is currently the latest, i.e. one from an older run.
if execution_status != 'Succeeded':
    raise RuntimeError(
        "Pipeline execution %s ended with status %s: %s"
        % (pipeline_execution_arn, execution_status, resp.get('FailureReason', 'no failure reason reported'))
    )

arn:aws:sagemaker:us-east-1:256537223841:pipeline/iristrainregister-2023-02-09-11-14-37/execution/36tfortavqnw
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Running...
Succeeded arn:aws:sagemaker:us-east-1:256537223841:pipeline/iristrainregister-2023-02-09-11-14-37/execution/36tfortavqnw


### Finally, approve the model to kick off the deployment process

In [12]:
# List the packages of the group, newest first, and select the latest one.
# The ordering is requested from the service explicitly: the response is paginated,
# so sorting only the first page locally could miss the newest package when the
# group holds more versions than fit on one page.
packages = sm_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    SortBy='CreationTime',
    SortOrder='Descending',
)['ModelPackageSummaryList']
# Cheap local re-sort kept as a safety net; it is a no-op when the service order is honoured
packages = sorted(packages, key=lambda x: x['CreationTime'], reverse=True)

if not packages:
    raise RuntimeError(
        "No model packages found in group %r; did the pipeline register a model?"
        % model_package_group_name
    )

latest_model_package_arn = packages[0]['ModelPackageArn']

In [13]:
# Mark the newest model package as approved
approval_request = {
    "ModelPackageArn": latest_model_package_arn,
    "ModelApprovalStatus": "Approved",
}
model_package_update_response = sm_client.update_model_package(**approval_request)

## Done! :) Let's open the CodePipeline console and get some popcorn to watch the deployment