In [1]:
#!pip install sagemaker_datawrangler

Collecting sagemaker_datawrangler
  Downloading sagemaker_datawrangler-0.4.3-py3-none-any.whl.metadata (667 bytes)
Collecting sagemaker-data-insights==0.4.0 (from sagemaker_datawrangler)
  Downloading sagemaker_data_insights-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting ipywidgets<8.0.0 (from sagemaker_datawrangler)
  Downloading ipywidgets-7.8.1-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting sagemaker-scikit-learn-extension==2.5.0 (from sagemaker-data-insights==0.4.0->sagemaker_datawrangler)
  Downloading sagemaker-scikit-learn-extension-2.5.0.tar.gz (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.7/70.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting widgetsnbextension~=3.6.6 (from ipywidgets<8.0.0->sagemaker_datawrangler)
  Downloading widgetsnbextension-3.6.6-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting jupyterlab-widgets<3,>=1.0.0 (from ipywidgets<8.0.0->sagemaker_datawrangler)
 

In [1]:
#import sagemaker_datawrangler           # For interactive data prep widget
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime,strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker                                 

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
bucket = sagemaker.Session().default_bucket()
prefix = 'mlops/activity1'


In [3]:
!wget https://raw.githubusercontent.com/manifoldailearning/mlops-with-aws-datascientists/main/Section-13-Feature-Engineering/Dataset/bank-additional-full.csv

--2024-02-22 21:04:24--  https://raw.githubusercontent.com/manifoldailearning/mlops-with-aws-datascientists/main/Section-13-Feature-Engineering/Dataset/bank-additional-full.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5146674 (4.9M) [text/plain]
Saving to: ‘bank-additional-full.csv.2’


2024-02-22 21:04:24 (229 MB/s) - ‘bank-additional-full.csv.2’ saved [5146674/5146674]



In [4]:
from sagemaker import Session, get_execution_role
import boto3

In [5]:
sess = Session()
input_source = sess.upload_data('./bank-additional-full.csv', bucket = bucket, key_prefix=f'{prefix}/input_data')

In [6]:
input_source

's3://sagemaker-us-east-1-021332873423/mlops/activity1/input_data/bank-additional-full.csv'

In [7]:
role = get_execution_role()
role

'arn:aws:iam::021332873423:role/service-role/AmazonSageMaker-ExecutionRole-20240220T142919'

In [8]:
!wget https://raw.githubusercontent.com/manifoldailearning/mlops-with-aws-datascientists/main/Section-13-Feature-Engineering/feature-engg-script.py


--2024-02-22 21:04:29--  https://raw.githubusercontent.com/manifoldailearning/mlops-with-aws-datascientists/main/Section-13-Feature-Engineering/feature-engg-script.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2292 (2.2K) [text/plain]
Saving to: ‘feature-engg-script.py.2’


2024-02-22 21:04:29 (40.7 MB/s) - ‘feature-engg-script.py.2’ saved [2292/2292]



In [9]:
train_path = f"s3://{bucket}/{prefix}/train"
validation_path = f"s3://{bucket}/{prefix}/validation"
test_path = f"s3://{bucket}/{prefix}/test"

In [10]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput



In [12]:
sklearn_processor = SKLearnProcessor(
        role = role,
        framework_version="0.23-1",
        instance_type="ml.t3.large",
        instance_count=1, 
        base_job_name='mlops-sklearnprocessing'
    )

In [17]:
sklearn_processor.run(
    code = 'feature-engg-script.py',
    inputs = [
        ProcessingInput(
        source = input_source,
        destination= "/opt/ml/processing/input",
        s3_input_mode="File",
        s3_data_distribution_type="ShardedByS3Key")
    ],
    
    outputs = [
        ProcessingOutput(
        output_name='train_data',
        source="/opt/ml/processing/output/train",
        destination = train_path),
        
        ProcessingOutput(
        output_name="validation_data",
        source="/opt/ml/processing/output/validation",
        destination=validation_path),
        
        ProcessingOutput(
        output_name="test_data",
        source="/opt/ml/processing/output/test",
        destination=test_path),


    ]
)

INFO:sagemaker:Creating processing-job with name mlops-sklearnprocessing-2024-02-22-21-19-10-125


.................................................[34m## Processing completed. Exiting.[0m



In [18]:
!aws s3 ls $train_path/

2024-02-22 21:27:25    3545009 train_script.csv


In [19]:
!aws s3 ls $test_path/


2024-02-22 21:27:25     498229 test_script_x.csv
2024-02-22 21:27:25       8238 test_script_y.csv


## Model Training and Testing

In [1]:
from sagemaker import Session
import sagemaker
import boto3
import re
import numpy as np
import pandas as pd
import os
from sagemaker import get_execution_role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [6]:
role = get_execution_role()

bucket = Session().default_bucket()
prefix = 'mlops/activity1'
sess = Session()

train_path = f"s3://{bucket}/{prefix}/train"
validation_path = f"s3://{bucket}/{prefix}/validation"
test_path = f"s3://{bucket}/{prefix}/test"

In [7]:
container = sagemaker.image_uris.retrieve(region=boto3.Session().region_name, framework='xgboost', version='latest')


In [None]:
#s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')
#s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')
  output_path='s3://{}/{}/output'.format(bucket, prefix),


In [8]:
s3_input_train = sagemaker.inputs.TrainingInput(s3_data= f"s3://{bucket}/{prefix}/train",content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data= f"s3://{bucket}/{prefix}/train",content_type='csv')
                                                

In [9]:
xgb = sagemaker.estimator.Estimator(container,
                                   role = role,
                                   instance_count = 1,
                                   instance_type= 'ml.m4.xlarge',
                                   output_path= f"s3://{bucket}/{prefix}",
                                   sagemaker_session = sess)

In [10]:
xgb.set_hyperparameters(max_depth = 5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        num_round=100)


In [11]:
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation}) 


INFO:sagemaker:Creating training-job with name: xgboost-2024-02-22-22-25-53-161


2024-02-22 22:25:53 Starting - Starting the training job...
2024-02-22 22:26:07 Starting - Preparing the instances for training.........
2024-02-22 22:27:26 Downloading - Downloading input data...
2024-02-22 22:28:17 Downloading - Downloading the training image...
2024-02-22 22:28:42 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2024-02-22:22:28:57:INFO] Running standalone xgboost training.[0m
[34m[2024-02-22:22:28:57:INFO] File size need to be processed in the node: 6.76mb. Available memory size in the node: 8534.27mb[0m
[34m[2024-02-22:22:28:57:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:28:57] S3DistributionType set as FullyReplicated[0m
[34m[22:28:57] 28831x59 matrix with 1701029 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-02-22:22:28:57:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:28:57] S3DistributionType set as FullyReplicated[0m
[34m

[34m[22:29:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 20 extra nodes, 18 pruned nodes, max_depth=5[0m
[34m[59]#011train-error:0.097673#011validation-error:0.097673[0m
[34m[22:29:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 12 pruned nodes, max_depth=5[0m
[34m[60]#011train-error:0.097569#011validation-error:0.097569[0m
[34m[22:29:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 26 pruned nodes, max_depth=4[0m
[34m[61]#011train-error:0.097465#011validation-error:0.097465[0m
[34m[22:29:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 24 extra nodes, 16 pruned nodes, max_depth=5[0m
[34m[62]#011train-error:0.097395#011validation-error:0.097395[0m
[34m[22:29:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 24 pruned nodes, max_depth=5[0m
[34m[63]#011train-error:0.097395#011validation-error:0.097395[0m
[34m[22:29:01] src/tree/updater_prune.cc:74: tree pruni

In [12]:
xgb_predictor = xgb.deploy(initial_instance_count=1,
                           instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: xgboost-2024-02-22-22-31-15-208
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-02-22-22-31-15-208
INFO:sagemaker:Creating endpoint with name xgboost-2024-02-22-22-31-15-208


-----!

In [13]:
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()

In [14]:
test_data_x = pd.read_csv(os.path.join(test_path, 'test_script_x.csv'),header = None)
test_data_y = pd.read_csv(os.path.join(test_path, 'test_script_y.csv'),header = None)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [15]:

def predict(data, predictor, rows=500 ):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')
predictions = predict(test_data_x, xgb_predictor)

In [16]:
pd.crosstab(index=test_data_y[0], columns=np.round(predictions), rownames=['actuals'], colnames=['predictions'])


predictions,0.0,1.0
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3584,51
1,383,101


In [17]:
xgb_predictor.delete_endpoint(delete_endpoint_config=True)

INFO:sagemaker:Deleting endpoint configuration with name: xgboost-2024-02-22-22-31-15-208
INFO:sagemaker:Deleting endpoint with name: xgboost-2024-02-22-22-31-15-208


## Model Deployment

In [18]:
import boto3

client = boto3.client(service_name="sagemaker")
runtime = boto3.client(service_name='sagemaker-runtime')
print(client,runtime)

<botocore.client.SageMaker object at 0x7f1f5f3e8250> <botocore.client.SageMakerRuntime object at 0x7f1f5f3e9870>


In [20]:
model_artifacts = xgb.model_data
model_artifacts

's3://sagemaker-us-east-1-021332873423/mlops/activity1/xgboost-2024-02-22-22-25-53-161/output/model.tar.gz'

In [26]:
from time import gmtime, strftime

model_name = "xgboost-serverless"+ strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Model name: "+ model_name)

Model name: xgboost-serverless2024-02-22-22-49-13


In [27]:
byo_container_env_vars = {"SAGEMAKER_CONTAINER_LOG_LEVEL": "20", "SOME_ENV_VAR": "myEnvVar"}


In [28]:
create_model_response = client.create_model(
    ModelName=model_name,
    Containers=[
        {
            "Image": container,
            "Mode": "SingleModel",
            "ModelDataUrl": model_artifacts,
            "Environment": byo_container_env_vars,
        }
    ],
    ExecutionRoleArn=role,
)

print("Model Arn: " + create_model_response["ModelArn"])

Model Arn: arn:aws:sagemaker:us-east-1:021332873423:model/xgboost-serverless2024-02-22-22-49-13


In [31]:
xgboost_epc_name = "mlops-serverless-epc" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName=xgboost_epc_name,
    ProductionVariants=[
        {
            "VariantName": "byoVariant",
            "ModelName": model_name,
            "ServerlessConfig": {
                "MemorySizeInMB": 3072,
                "MaxConcurrency": 1,
            },
        },
    ],
)

print("Endpoint Configuration Arn: " + endpoint_config_response["EndpointConfigArn"])

Endpoint Configuration Arn: arn:aws:sagemaker:us-east-1:021332873423:endpoint-config/mlops-serverless-epc2024-02-22-22-52-23


In [32]:
endpoint_name = "xgboost-serverless-ep" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

create_endpoint_response = client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=xgboost_epc_name,
)

print("Endpoint Arn: " + create_endpoint_response["EndpointArn"])

Endpoint Arn: arn:aws:sagemaker:us-east-1:021332873423:endpoint/xgboost-serverless-ep2024-02-22-22-52-24


In [33]:
# wait for endpoint to reach a terminal state (InService) using describe endpoint
import time

describe_endpoint_response = client.describe_endpoint(EndpointName=endpoint_name)

while describe_endpoint_response["EndpointStatus"] == "Creating":
    describe_endpoint_response = client.describe_endpoint(EndpointName=endpoint_name)
    print(describe_endpoint_response["EndpointStatus"])
    time.sleep(15)

describe_endpoint_response

{'EndpointName': 'xgboost-serverless-ep2024-02-22-22-52-24',
 'EndpointArn': 'arn:aws:sagemaker:us-east-1:021332873423:endpoint/xgboost-serverless-ep2024-02-22-22-52-24',
 'EndpointConfigName': 'mlops-serverless-epc2024-02-22-22-52-23',
 'ProductionVariants': [{'VariantName': 'byoVariant',
   'DeployedImages': [{'SpecifiedImage': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
     'ResolvedImage': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost@sha256:0c8f830ac408e6dee08445fb60392e9c3f05f790a4b3c07ec22327c08f75bcbf',
     'ResolutionTime': datetime.datetime(2024, 2, 22, 22, 52, 25, 545000, tzinfo=tzlocal())}],
   'CurrentWeight': 1.0,
   'DesiredWeight': 1.0,
   'CurrentInstanceCount': 0,
   'CurrentServerlessConfig': {'MemorySizeInMB': 3072, 'MaxConcurrency': 1}}],
 'EndpointStatus': 'InService',
 'CreationTime': datetime.datetime(2024, 2, 22, 22, 52, 24, 657000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 2, 22, 22, 54, 18, 771000, tzinfo=tzl

In [34]:
# Endpoint invocation
payload = b"3., 999.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 1.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,   0., 0.,   0.,   0.,   0.,   0.,   1.,   0.,   1.,   0.,   0.,   1., 0.,   0.,   1.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   1., 0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,   0., 0.,   1.,   0."

response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=payload,
    ContentType="text/csv",
)

print(response["Body"].read().decode())

0.07072833180427551


In [35]:
client.delete_model(ModelName=model_name)
client.delete_endpoint_config(EndpointConfigName=xgboost_epc_name)
client.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': 'e2eb7d63-bf0e-4075-9f64-ecbfdc28d0c6',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e2eb7d63-bf0e-4075-9f64-ecbfdc28d0c6',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Thu, 22 Feb 2024 22:57:10 GMT'},
  'RetryAttempts': 0}}

# Automatic Model Tuning

In [36]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
hyperparameter_ranges = {'eta': ContinuousParameter(0, 1),
                            'min_child_weight': ContinuousParameter(1, 10),
                            'alpha': ContinuousParameter(0, 2),
                            'max_depth': IntegerParameter(1, 10)}
objective_metric_name = 'validation:auc'

In [37]:
tuner = HyperparameterTuner(xgb,
                            objective_metric_name,
                            hyperparameter_ranges,
                            max_jobs=20,
                            max_parallel_jobs=3)

In [None]:
tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})


INFO:sagemaker:Creating hyperparameter tuning job with name: xgboost-240222-2340


.........................................................................................................

In [None]:
tuner.best_training_job()
