---
## Contents

1. [Libraries and Installations](#Libraries-and-Installations)
1. [Configs](#Configs)
1. [Batch transform](#Batch-Transform)
    1. [Fit the train data](#Batch-transform-FIT)
    1. [Transform the train data](#Batch-transform-TRANSFORM)
    1. [Transform the test data](#Batch-Transform-TRANSFORM-test)
    1. [Upload processed data to s3](#upload-processed-train-test)
1. [Train and Track](#Train-Track)
    1. [Pre-Processing Tracker](#Pre-Processing-Tracker)
    1. [Create Experiment](#Create-Experiment)
	1. [Hyper-Parameter Train](#Train)
	1. [List Train Jobs](#List-Train-Jobs)
	1. [Best Training Job](#Best-Training-Job)
	1. [Lineage](#Lineage)
1. [Pipeline](#Pipeline)
    1. [Preprocess_model](#Preprocess_model)
    1. [Inference model](#Inference_model)
1. [Deploy endpoint for the best training-job / trial component](#Deploy-endpoint)
1. [Make a request to our pipeline endpoint](#Request-Endpoint)
1. [Delete Endpoint](#Delete-Endpoint)
1. [Delete Experiments](#Delete-Experiments)
---

## 1. Libraries and Installations <a class="anchor" id="Libraries-and-Installations"></a>

In [1]:
!pip install sagemaker-experiments
!pip install s3fs
!pip install matplotlib
!pip install seaborn
!pip install shap
!pip install smdebug

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow_p36/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow_p36/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow_p36/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow_p36/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow_p36/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow_p36/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
from io import StringIO
import numpy as np
import os
import pandas as pd
import boto3
import time
import s3fs
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
import re
import shap
from scipy import stats
import copy

In [3]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.analytics import ExperimentAnalytics

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

from sagemaker.sklearn.estimator import SKLearn
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig, CollectionConfig
from sagemaker.estimator import Estimator
from sagemaker.session import s3_input
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import Session

from smdebug.trials import create_trial

## 2. Configs <a class="anchor" id="Configs"></a>

In [4]:
now = datetime.now()

current_time = now.strftime("%Y-%m-%d--%H-%M-%S")
# dd/mm/YY H:M:S format
print("current_time:", current_time)

current_time: 2020-08-21--09-17-34


In [5]:
sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

# S3 prefix
bucket = 'ta-sagemaker-experiments'
prefix = 'Scikit-pre-model-Inference-Pipelines'

#dataset = 's3://ta-sagemaker-experiments/housing/datasets/housing_dataset.csv'
train_data= 's3://ta-sagemaker-experiments/housing/input-datasets/train_data_without_header.csv'
test_data= 's3://ta-sagemaker-experiments/housing/input-datasets/test_data_without_header.csv'

account_id_bucket='sagemaker-us-east-1-171774164293'

FRAMEWORK_VERSION = "0.23-1"

base_job_name = f"Track-custom-XGB-{current_time}"

output_data_prefix = f'housing/datasets/output/{base_job_name}'
data_output_path = f's3://{bucket}/{output_data_prefix}'

debug_prefix = f'housing/jobs/debug/{base_job_name}'
debug_path = f's3://{bucket}/{debug_prefix}'


experiment_name_prefix = "xgboost-custom-track2"

In [6]:
script_path = 'preprocessor.py'
dependancy_path = 'preprocessor_dependencies.py'

## 3. Batch transform  <a class="anchor" id="Batch-Transform"></a>

### 3.1. Fit the train data  <a class="anchor" id="Batch-transform-FIT"></a>

In [7]:
sklearn_preprocessor = SKLearn(
    entry_point=script_path,
    role=role,
    framework_version=FRAMEWORK_VERSION,
    train_instance_type="ml.m5.xlarge",
    train_use_spot_instances=True,
    train_max_run = 600,
    train_max_wait = 1200,
    dependencies=[dependancy_path],
    sagemaker_session=sagemaker_session
    
)

In [8]:
sklearn_preprocessor.fit(
    inputs={'train': train_data},
    job_name=base_job_name
)

INFO:sagemaker:Creating training-job with name: Track-custom-XGB-2020-08-21--09-17-34


2020-08-21 09:17:37 Starting - Starting the training job...
2020-08-21 09:17:42 Starting - Launching requested ML instances.........
2020-08-21 09:19:25 Starting - Preparing the instances for training......
2020-08-21 09:20:26 Downloading - Downloading input data...
2020-08-21 09:20:48 Training - Downloading the training image..[34m2020-08-21 09:21:19,341 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-08-21 09:21:19,343 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-08-21 09:21:19,352 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-08-21 09:21:19,700 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-08-21 09:21:19,711 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-08-21 09:21:19,721 sagemaker-training-toolkit INFO     No GPUs detected (norm

### 3.2. Transform the train data  <a class="anchor" id="Batch-transform-TRANSFORM"></a>

In [9]:
transformer = sklearn_preprocessor.transformer(
    instance_count=1, 
    instance_type='ml.m5.xlarge',
    assemble_with = 'Line',
    accept = 'text/csv',
    output_path=data_output_path
)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2020-08-21-09-21-49-994


In [10]:
transformer.transform(
    data=train_data, 
    content_type="text/csv",
    job_name=base_job_name+'-train'
)
print("Waiting for transform job: " + transformer.latest_transform_job.job_name)
transformer.wait()

INFO:sagemaker:Creating transform job with name: Track-custom-XGB-2020-08-21--09-17-34-train


Waiting for transform job: Track-custom-XGB-2020-08-21--09-17-34-train
.............................[32m2020-08-21T09:26:27.849:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[34m2020-08-21 09:26:24,591 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-08-21 09:26:24,593 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-08-21 09:26:24,594 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[35m2020-08-21 09:26:24,591 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2020-08-21 09:26:24,593 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2020-08-21 09:26:24,594 INFO - sagemaker-con

In [11]:
preprocessed_train_data = transformer.output_path

In [12]:
preprocessed_train_data

's3://ta-sagemaker-experiments/housing/datasets/output/Track-custom-XGB-2020-08-21--09-17-34'

### 3.3. Transform the test data <a class="anchor" id="Batch-Transform-TRANSFORM-test"></a>

In [13]:
transformer.transform(
    data=test_data, 
    content_type="text/csv",
    job_name=base_job_name+'-test'
)
print("Waiting for transform job: " + transformer.latest_transform_job.job_name)
transformer.wait()

INFO:sagemaker:Creating transform job with name: Track-custom-XGB-2020-08-21--09-17-34-test


Waiting for transform job: Track-custom-XGB-2020-08-21--09-17-34-test
................................[34m2020-08-21 09:32:11,999 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2020-08-21 09:32:11,999 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-08-21 09:32:12,001 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2020-08-21 09:32:12,002 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

   

In [14]:
preprocessed_test_data = transformer.output_path

In [15]:
f'{output_data_prefix}'

'housing/datasets/output/Track-custom-XGB-2020-08-21--09-17-34'

### 3.4. Upload processed data to s3  <a class="anchor" id="upload-processed-train-test"></a>

In [16]:
client = boto3.client('s3') 
obj = client.get_object(Bucket=bucket, Key=f'{output_data_prefix}/train_data_without_header.csv.out')
body = obj['Body']
csv_string = body.read().decode('utf-8')
processed_train_data = pd.read_csv(StringIO(csv_string))

In [17]:
train_file = 'processed_train_data.csv'
processed_train_data.to_csv(train_file,index=False,header=False)
with open(train_file,'rb') as data:
    boto3.Session().resource('s3').Bucket(bucket).upload_fileobj(data,os.path.join(output_data_prefix,'processed-train',train_file))

In [18]:
obj = client.get_object(Bucket=bucket, Key=f'{output_data_prefix}/test_data_without_header.csv.out')
body = obj['Body']
csv_string = body.read().decode('utf-8')
processed_test_data = pd.read_csv(StringIO(csv_string))

test_file = 'processed_test_data.csv'
processed_test_data.to_csv(test_file,index=False,header=False)
with open(test_file,'rb') as data:
    boto3.Session().resource('s3').Bucket(bucket).upload_fileobj(data,os.path.join(output_data_prefix,'processed-test',test_file))

## 4. Train and Track  <a class="anchor" id="Train-Track"></a>

In [19]:
sess = boto3.Session()
sm = sess.client('sagemaker')
role = get_execution_role()
region = "us-east-1"

In [20]:
container = get_image_uri(region, "xgboost", repo_version="0.90-2")

	get_image_uri(region, 'xgboost', '1.0-1').


In [21]:
save_interval = 2

In [22]:
prefix

'Scikit-pre-model-Inference-Pipelines'

In [23]:
content_type = "text/csv"
train_input = s3_input(f"s3://{bucket}/{output_data_prefix}/processed-train/processed_train_data.csv", content_type=content_type)
validation_input = s3_input(f"s3://{bucket}/{output_data_prefix}/processed-test/processed_test_data.csv", content_type=content_type)



In [24]:
train_df = pd.read_csv(f"s3://{bucket}/{output_data_prefix}/processed-train/processed_train_data.csv")

### 4.1. Pre-Processing Tracker <a class="anchor" id="Pre-Processing-Tracker"></a>

In [25]:
with Tracker.create(display_name="Pre-Processing", sagemaker_boto_client=sm) as tracker:
    tracker.log_parameters({
        "Num_Imputer" : "SimpleImputer",
        "Num_Norm"    : "StandardScaler",
        "Cat_Norm"    : "SimpleImputer",
        "Cat_Convert" : "OneHotEncoder",
        "No_of_rows"  : str(len(train_df))
    })
    # we can log the s3 uri to the dataset we just uploaded
    tracker.log_input(name="custom-xgboost-track", media_type="s3/uri", value=f"s3://{bucket}/{output_data_prefix}/processed-train/processed_train_data.csv")

### 4.2. Create Experiment <a class="anchor" id="Create-Experiment"></a>

In [26]:
xgboost_experiment = Experiment.create(
    experiment_name=f"{experiment_name_prefix}-{int(time.time())}", 
    description="custom-xgboost-track", 
    sagemaker_boto_client=sm)
print(xgboost_experiment)

Experiment(sagemaker_boto_client=<botocore.client.SageMaker object at 0x7f20d46009e8>,experiment_name='xgboost-custom-track2-1598002374',description='custom-xgboost-track',tags=None,experiment_arn='arn:aws:sagemaker:us-east-1:171774164293:experiment/xgboost-custom-track2-1598002374',response_metadata={'RequestId': 'bdfc3a21-3370-4978-9ded-e83e88642923', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'bdfc3a21-3370-4978-9ded-e83e88642923', 'content-type': 'application/x-amz-json-1.1', 'content-length': '104', 'date': 'Fri, 21 Aug 2020 09:32:53 GMT'}, 'RetryAttempts': 0})


In [27]:
trial_name_map = {}

In [28]:
trial_component = tracker.trial_component

In [29]:
debug_path

's3://ta-sagemaker-experiments/housing/jobs/debug/Track-custom-XGB-2020-08-21--09-17-34'

In [30]:
FRAMEWORK_VERSION = "0.23-1"
script_path="train.py"
source_dir="train"

In [31]:
container

'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:0.90-2-cpu-py3'

### 4.3. Train <a class="anchor" id="Train"></a>

In [32]:
for i, eta in enumerate([0.3, 0.4]):
    for j, max_depth in enumerate([2, 4]):
        base_name = f"xbg-eta-{str(eta).replace('.', '-')}-max-depth-{str(max_depth).replace('.', '-')}"
        time_val = f"time-{int(time.time())}"
        trial_name = base_name + "-" + time_val
        xgb_trial = Trial.create(
            trial_name=trial_name, 
            experiment_name=xgboost_experiment.experiment_name,
            sagemaker_boto_client=sm,
        )
        trial_name_map[(eta,max_depth)] = trial_name

        # associate the proprocessing trial component with the current trial
        xgb_trial.add_trial_component(trial_component)

        xgboost_estimator = SKLearn(
            entry_point=script_path,
            source_dir=source_dir,
            framework_version=FRAMEWORK_VERSION,
            role=role,
            base_job_name=trial_name,
            train_instance_count=1,
            train_instance_type='ml.m5.xlarge',
            hyperparameters={
                "max_depth": 6,
                "eta": 0.2,
                "gamma": 4,
                "min_child_weight": 6,
                "subsample": 0.7,
                "eval_metric":"rmse",
                "objective": "reg:squarederror",
                "n_estimators" : 51
            },
            metric_definitions=[
                {'Name':'crossvalidation:mae', 'Regex':'CV MAE=(.*?);'},
                {'Name':'crossvalidation:mse', 'Regex':'CV MSE=(.*?);'},
                {'Name':'train:mae', 'Regex':'Train MAE=(.*?);'},
                {'Name':'train:mse', 'Regex':'Train MSE=(.*?);'},
                {'Name':'validation:mae', 'Regex':'Validation MAE=(.*?);'},
                {'Name':'validation:mse', 'Regex':'Validation MSE=(.*?);'},
            ],
            enable_sagemaker_metrics=False,
            train_use_spot_instances=True,
            train_max_run = 600,
            train_max_wait = 1200,
        )

        xgb_training_job_name = "xgb-training-job-{}".format(int(time.time()))

        xgboost_estimator.fit(
            {"train": train_input, "validation": validation_input},
            # This is a fire and forget event. By setting wait=False, you submit the job to run in the background.
            # Amazon SageMaker starts one training job and release control to next cells in the notebook.
            # Follow this notebook to see status of the training job.
            experiment_config={
                "TrialName": xgb_trial.trial_name,
                "TrialComponentDisplayName": "Training",
            },
            wait=False
        )

        # give it a while before dispatching the next training job
        time.sleep(4)

INFO:sagemaker:Creating training-job with name: xbg-eta-0-3-max-depth-2-time-1598002374-2020-08-21-09-32-54-740
INFO:sagemaker:Creating training-job with name: xbg-eta-0-3-max-depth-4-time-1598002379-2020-08-21-09-32-59-286
INFO:sagemaker:Creating training-job with name: xbg-eta-0-4-max-depth-2-time-1598002383-2020-08-21-09-33-03-866
INFO:sagemaker:Creating training-job with name: xbg-eta-0-4-max-depth-4-time-1598002388-2020-08-21-09-33-08-367


In [33]:
xgboost_estimator

<sagemaker.sklearn.estimator.SKLearn at 0x7f20d44fb390>

### 4.4. List Train Jobs <a class="anchor" id="List-Train-Jobs"></a>

In [34]:
search_expression = {
    "Filters":[
        {
            "Name": "DisplayName",
            "Operator": "Equals",
            "Value": "Training",
        }
    ],
}

In [35]:
xgboost_experiment.experiment_name

'xgboost-custom-track2-1598002374'

In [36]:
trial_component_analytics = ExperimentAnalytics(
    sagemaker_session=Session(sess, sm), 
    experiment_name=xgboost_experiment.experiment_name,
    search_expression=search_expression,
    sort_by="metrics.validation:mse.max",
    sort_order="Descending",
    metric_names=['train:mse', 'validation:mse', 'crossvalidation:mse'],
    parameter_names=['max_depth', 'eta', 'silent', 'gamma']
)

In [37]:
trial_component_analytics.dataframe()

Unnamed: 0,TrialComponentName,DisplayName,SourceArn,eta,gamma,max_depth,train:mse - Min,train:mse - Max,train:mse - Avg,train:mse - StdDev,...,crossvalidation:mse - Last,crossvalidation:mse - Count,train - MediaType,train - Value,validation - MediaType,validation - Value,SageMaker.DebugHookOutput - MediaType,SageMaker.DebugHookOutput - Value,SageMaker.ModelArtifact - MediaType,SageMaker.ModelArtifact - Value
0,xbg-eta-0-4-max-depth-4-time-1598002388-2020-0...,Training,arn:aws:sagemaker:us-east-1:171774164293:train...,0.2,4.0,6.0,1546476000.0,1546476000.0,1546476000.0,0.0,...,3049686000.0,3,text/csv,s3://ta-sagemaker-experiments/housing/datasets...,text/csv,s3://ta-sagemaker-experiments/housing/datasets...,,s3://sagemaker-us-east-1-171774164293/,,s3://sagemaker-us-east-1-171774164293/xbg-eta-...
1,xbg-eta-0-4-max-depth-2-time-1598002383-2020-0...,Training,arn:aws:sagemaker:us-east-1:171774164293:train...,0.2,4.0,6.0,1546476000.0,1546476000.0,1546476000.0,0.0,...,3049686000.0,3,text/csv,s3://ta-sagemaker-experiments/housing/datasets...,text/csv,s3://ta-sagemaker-experiments/housing/datasets...,,s3://sagemaker-us-east-1-171774164293/,,s3://sagemaker-us-east-1-171774164293/xbg-eta-...
2,xbg-eta-0-3-max-depth-2-time-1598002374-2020-0...,Training,arn:aws:sagemaker:us-east-1:171774164293:train...,0.2,4.0,6.0,1546476000.0,1546476000.0,1546476000.0,0.0,...,3049686000.0,3,text/csv,s3://ta-sagemaker-experiments/housing/datasets...,text/csv,s3://ta-sagemaker-experiments/housing/datasets...,,s3://sagemaker-us-east-1-171774164293/,,s3://sagemaker-us-east-1-171774164293/xbg-eta-...
3,xbg-eta-0-3-max-depth-4-time-1598002379-2020-0...,Training,arn:aws:sagemaker:us-east-1:171774164293:train...,0.2,4.0,6.0,1546476000.0,1546476000.0,1546476000.0,0.0,...,3049686000.0,3,text/csv,s3://ta-sagemaker-experiments/housing/datasets...,text/csv,s3://ta-sagemaker-experiments/housing/datasets...,,s3://sagemaker-us-east-1-171774164293/,,s3://sagemaker-us-east-1-171774164293/xbg-eta-...


### 4.5. Best Training Job <a class="anchor" id="Best-Training-Job"></a>

In [38]:
result_df = trial_component_analytics.dataframe(force_refresh=True).sort_values(["validation:mse - Avg", "crossvalidation:mse - Avg"]).reset_index(drop=True)

In [39]:
best_trial_component_name = result_df.iloc[0]['TrialComponentName']
best_trial_component = TrialComponent.load(best_trial_component_name)

In [40]:
print(best_trial_component.parameters['max_depth'])
print(best_trial_component.parameters['eta'])
print(best_trial_component.parameters['min_child_weight'])

6.0
0.2
6.0


In [41]:
best_trial_component.source.source_arn.split("/")[-1]

'xbg-eta-0-4-max-depth-4-time-1598002388-2020-08-21-09-33-08-367'

### 4.6. Lineage <a class="anchor" id="Lineage"></a>

In [42]:
trial_name_map

{(0.3, 2): 'xbg-eta-0-3-max-depth-2-time-1598002374',
 (0.3, 4): 'xbg-eta-0-3-max-depth-4-time-1598002379',
 (0.4, 2): 'xbg-eta-0-4-max-depth-2-time-1598002383',
 (0.4, 4): 'xbg-eta-0-4-max-depth-4-time-1598002388'}

In [43]:
lineage_table = ExperimentAnalytics(
    sagemaker_session=Session(sess, sm), 
    search_expression={
        "Filters":[{
            "Name": "Parents.TrialName",
            "Operator": "Equals",
            "Value": trial_name_map[(0.4, 4)]
        }]
    },
    sort_by="CreationTime",
    sort_order="Ascending",
)

lineage_table.dataframe()

Unnamed: 0,TrialComponentName,DisplayName,Cat_Convert,Cat_Norm,No_of_rows,Num_Imputer,Num_Norm,custom-xgboost-track - MediaType,custom-xgboost-track - Value,SourceArn,...,crossvalidation:mse - Last,crossvalidation:mse - Count,train - MediaType,train - Value,validation - MediaType,validation - Value,SageMaker.DebugHookOutput - MediaType,SageMaker.DebugHookOutput - Value,SageMaker.ModelArtifact - MediaType,SageMaker.ModelArtifact - Value
0,TrialComponent-2020-08-21-093253-znks,Pre-Processing,OneHotEncoder,SimpleImputer,16553.0,SimpleImputer,StandardScaler,s3/uri,s3://ta-sagemaker-experiments/housing/datasets...,,...,,,,,,,,,,
1,xbg-eta-0-4-max-depth-4-time-1598002388-2020-0...,Training,,,,,,,,arn:aws:sagemaker:us-east-1:171774164293:train...,...,3049686000.0,3.0,text/csv,s3://ta-sagemaker-experiments/housing/datasets...,text/csv,s3://ta-sagemaker-experiments/housing/datasets...,,s3://sagemaker-us-east-1-171774164293/,,s3://sagemaker-us-east-1-171774164293/xbg-eta-...


In [44]:
best_trial_component.source.source_arn.split("/")[-1]

'xbg-eta-0-4-max-depth-4-time-1598002388-2020-08-21-09-33-08-367'

In [45]:
f'{base_job_name}-Endpoint'

'Track-custom-XGB-2020-08-21--09-17-34-Endpoint'

## 5. Pipeline <a class="anchor" id="pipeline"></a>

In [46]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
import boto3
from time import gmtime, strftime
from sagemaker.estimator import Estimator
from sagemaker import PipelineModel

timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

In [47]:
base_job_name

'Track-custom-XGB-2020-08-21--09-17-34'

### 5.1 Preprocess_model <a class="anchor" id="preprocess_model"></a>

In [48]:
#proprocess_estimator = Estimator.attach("Track-Debug-SHAP-XGB-2020-08-14--14-10-47")#preprocessing job id

proprocess_estimator = Estimator.attach(base_job_name)
proprocess_model = proprocess_estimator.create_model(
   
    env=dict(
        SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT="text/csv",
        SAGEMAKER_PROGRAM='preprocessor.py',
        SAGEMAKER_REGION='us-east-1',
        SAGEMAKER_SUBMIT_DIRECTORY=f's3://{account_id_bucket}/'+base_job_name+'/source/sourcedir.tar.gz'
    )
)
##bucket and job name



2020-08-21 09:21:32 Starting - Preparing the instances for training
2020-08-21 09:21:32 Downloading - Downloading input data
2020-08-21 09:21:32 Training - Training image download completed. Training in progress.
2020-08-21 09:21:32 Uploading - Uploading generated training model
2020-08-21 09:21:32 Completed - Training job completed[34m2020-08-21 09:21:19,341 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-08-21 09:21:19,343 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-08-21 09:21:19,352 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-08-21 09:21:19,700 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-08-21 09:21:19,711 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-08-21 09:21:19,721 sagemaker-training-toolkit INFO     No GPUs detected (

### 5.2 Inference model <a class="anchor" id="Inference model"></a>

In [49]:

inference_estimator = Estimator.attach(best_trial_component.source.source_arn.split("/")[-1]) ##best job id
inference_model = inference_estimator.create_model(
   
    env=dict(
        SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT="text/csv",
       
        SAGEMAKER_PROGRAM='train.py',
        SAGEMAKER_REGION='us-east-1',
        SAGEMAKER_SUBMIT_DIRECTORY=f's3://{account_id_bucket}/'+best_trial_component.source.source_arn.split("/")[-1]+'/source/sourcedir.tar.gz'
    )
)



2020-08-21 09:37:13 Starting - Preparing the instances for training
2020-08-21 09:37:13 Downloading - Downloading input data
2020-08-21 09:37:13 Training - Training image download completed. Training in progress.
2020-08-21 09:37:13 Uploading - Uploading generated training model
2020-08-21 09:37:13 Completed - Training job completed[34m2020-08-21 09:36:38,503 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-08-21 09:36:38,505 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-08-21 09:36:38,515 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-08-21 09:36:52,388 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
[34mCollecting xgboost==1.1.1
  Downloading xgboost-1.1.1-py3-none-manylinux2010_x86_64.whl (127.6 MB)[0m
[34mInstalling collecte

## 6. Deploy endpoint for the best training-job / trial component <a class="anchor" id="Deploy-endpoint"></a>

In [50]:
bucket =  'ta-sagemaker-experiments'
#print("Bucket: {}".format(bucket))
prefix = 'ModelMonitor'

data_capture_prefix = '{}/datacapture'.format(prefix)
s3_capture_upload_path = 's3://{}/{}'.format(bucket, data_capture_prefix)

from sagemaker.model_monitor import DataCaptureConfig

data_capture_config = DataCaptureConfig(
                        enable_capture=True,
                        sampling_percentage=100,
                        destination_s3_uri=s3_capture_upload_path)

model_name = f'{base_job_name}-Model'
endpoint_name = f'{base_job_name}-custom-xgb-Endpoint'

custom_xgboost = PipelineModel(
    name=model_name, 
    role=role, 
    models=[proprocess_model, inference_model],
    sagemaker_session=sagemaker_session
)

predictor = custom_xgboost.deploy(initial_instance_count=1,
                         instance_type='ml.m5.xlarge',
                         endpoint_name=endpoint_name,
                         data_capture_config=data_capture_config)

INFO:sagemaker:Creating model with name: Track-custom-XGB-2020-08-21--09-17-34-Model
INFO:sagemaker:Creating endpoint with name Track-custom-XGB-2020-08-21--09-17-34-custom-xgb-Endpoint


---------------!

## 7. Make a request to our pipeline endpoint <a class="anchor" id="Request-Endpoint"></a>

In [51]:
from sagemaker.predictor import json_serializer, csv_serializer, json_deserializer, RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON
payload = """-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY"""

predictor = RealTimePredictor(
    endpoint=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=csv_serializer,
    content_type=CONTENT_TYPE_CSV,
    accept=CONTENT_TYPE_CSV
)

In [52]:
predictor.predict(payload)

b'402227.47\n415057.06\n'

## 8. Delete Endpoint  <a class="anchor" id="Delete-Endpoint"></a>

In [53]:

endpoint_name=endpoint_name
sm_client = sagemaker_session.boto_session.client('sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': 'b8f706bf-4002-463b-be34-40dde20c29fc',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'b8f706bf-4002-463b-be34-40dde20c29fc',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Fri, 21 Aug 2020 09:51:00 GMT'},
  'RetryAttempts': 0}}

## 9. Delete Experiments <a class="anchor" id="Delete-Experiments"></a>

In [None]:
def cleanup(experiment):
    for trial_summary in experiment.list_trials():
        trial = Trial.load(sagemaker_boto_client=sm, trial_name=trial_summary.trial_name)
        for trial_component_summary in trial.list_trial_components():
            tc = TrialComponent.load(
                sagemaker_boto_client=sm,
                trial_component_name=trial_component_summary.trial_component_name)
            trial.remove_trial_component(tc)
            try:
                # comment out to keep trial components
                tc.delete()
            except:
                # tc is associated with another trial
                continue
            # to prevent throttling
            time.sleep(.5)
        trial.delete()
    experiment.delete()

cleanup(xgboost_experiment)