In [12]:
# Upload to S3 Bucket
!wget https://raw.githubusercontent.com/manifoldailearning/mlops-with-aws-datascientists/main/Section-13-Feature-Engineering/Dataset/bank-additional-full.csv --no-check-certificate
from sagemaker import Session
import sagemaker
bucket=sagemaker.Session().default_bucket()
prefix = 'mlops/activity-3'

sess = Session()
input_source = sess.upload_data('./bank-additional-full.csv', bucket=bucket, key_prefix=f'{prefix}/input_data')
input_source

--2025-04-02 09:46:21--  https://raw.githubusercontent.com/manifoldailearning/mlops-with-aws-datascientists/main/Section-13-Feature-Engineering/Dataset/bank-additional-full.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 5146674 (4.9M) [text/plain]
Saving to: ‘bank-additional-full.csv.1’


2025-04-02 09:46:21 (99.8 MB/s) - ‘bank-additional-full.csv.1’ saved [5146674/5146674]



's3://sagemaker-us-west-2-975050337104/mlops/activity-3/input_data/bank-additional-full.csv'

In [13]:
# Define IAM role
import boto3
import re
from sagemaker import get_execution_role

# `role` is reused further down: it is handed to the XGBoost Estimator and passed as
# ExecutionRoleArn when the serverless model is created.
role = get_execution_role()

In [14]:
!wget https://raw.githubusercontent.com/manifoldailearning/mlops-with-aws-datascientists/main/Section-13-Feature-Engineering/feature-engg-script.py --no-check-certificate

--2025-04-02 09:47:08--  https://raw.githubusercontent.com/manifoldailearning/mlops-with-aws-datascientists/main/Section-13-Feature-Engineering/feature-engg-script.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 2292 (2.2K) [text/plain]
Saving to: ‘feature-engg-script.py’


2025-04-02 09:47:09 (27.6 MB/s) - ‘feature-engg-script.py’ saved [2292/2292]



In [15]:
# S3 prefixes for the three dataset splits, all under s3://<bucket>/<prefix>/.
train_path, validation_path, test_path = (
    f"s3://{bucket}/{prefix}/{split}" for split in ("train", "validation", "test")
)

In [18]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker import get_execution_role


# Single-instance scikit-learn processor that runs the downloaded feature-engineering script.
sklearn_processor = SKLearnProcessor(
    base_job_name='mlops-sklearnprocessing',
    framework_version="0.23-1",
    instance_count=1,
    instance_type="ml.m5.large",
    role=get_execution_role(),
)

sklearn_processor.run(
    code='feature-engg-script.py',
    # No script arguments are passed (they would go in as arguments=['arg1', 'arg2']).
    inputs=[
        ProcessingInput(
            source=input_source,
            destination="/opt/ml/processing/input",
            s3_input_mode="File",
            s3_data_distribution_type="ShardedByS3Key",
        )
    ],
    # One output per split: a container directory mapped to that split's S3 destination.
    outputs=[
        ProcessingOutput(
            output_name=f"{split}_data",
            source=f"/opt/ml/processing/output/{split}",
            destination=s3_uri,
        )
        for split, s3_uri in (
            ("train", train_path),
            ("validation", validation_path),
            ("test", test_path),
        )
    ],
)


...........[34m## Processing completed. Exiting.[0m



In [20]:
!aws s3 ls $train_path/

2025-04-02 09:56:46    3545009 train_script.csv


In [21]:
!aws s3 ls $test_path/

2025-04-02 09:56:47     498229 test_script_x.csv
2025-04-02 09:56:47       8238 test_script_y.csv


In [22]:
# Use the previously prepared data
from sagemaker import Session
import sagemaker
import boto3
import re
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import os

# Rebuild the handles the training section needs: role, bucket, prefix, session and split prefixes.
role = get_execution_role()

prefix = 'mlops/activity-3'
bucket = sagemaker.Session().default_bucket()
sess = Session()

train_path, validation_path, test_path = (
    f"s3://{bucket}/{prefix}/{split}" for split in ("train", "validation", "test")
)

In [23]:
# Look up the XGBoost image URI for the region of the current boto3 session.
# NOTE(review): version='latest' is unpinned; consider pinning an explicit version
# after confirming the hyperparameters used below are still accepted.
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='latest',
)

In [25]:
# Training and validation channels, both declared as CSV.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}/train'.format(bucket, prefix),
    content_type='csv',
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}/validation/'.format(bucket, prefix),
    content_type='csv',
)

In [26]:
sess = sagemaker.Session()

# Estimator for the XGBoost image; model artifacts go under <prefix>/output.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sess,
)

# Binary classification with 100 boosting rounds.
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    num_round=100,
)

# Launch the training job with the train and validation channels.
xgb.fit(inputs={'train': s3_input_train, 'validation': s3_input_validation})

2025-04-02 10:34:41 Starting - Starting the training job...
2025-04-02 10:34:55 Starting - Preparing the instances for training.
2025-04-02 10:35:20 Downloading - Downloading input data.
2025-04-02 10:35:50 Downloading - Downloading the training image.
2025-04-02 10:36:46 Training - Training image download completed. Training in progress..
2025-04-02 10:37:12 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2025-04-02:10:37:01:INFO] Running standalone xgboost training.[0m
[34m[2025-04-02:10:37:01:INFO] File size need to be processed in the node: 4.35mb. Available memory size in the node: 8562.36mb[0m
[34m[2025-04-02:10:37:01:INFO] Determined delimiter of CSV input is ','[0m
[34m[10:37:01] S3DistributionType set as FullyReplicated[0m
[34m[10:37:02] 28831x59 matrix with 1701029 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2025-04-02:10:37:02:INFO] Determined delimiter of CSV input is ','[0m
[34m[10:37:

In [27]:
# Deploy the trained estimator to a one-instance endpoint (deleted again further down).
xgb_predictor = xgb.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

------!

In [28]:
# Replace the predictor's serializer with the SDK's CSVSerializer.
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()

In [30]:
!aws s3 ls $test_path/

2025-04-02 09:56:47     498229 test_script_x.csv
2025-04-02 09:56:47       8238 test_script_y.csv


In [37]:
import os
# Read the test features and labels straight from S3 (the files have no header row).
# S3 URIs always use '/', so join with an f-string; os.path.join would insert the
# platform's path separator instead.
test_data_x = pd.read_csv(f"{test_path}/test_script_x.csv", header=None)
test_data_y = pd.read_csv(f"{test_path}/test_script_y.csv", header=None)

In [38]:
import numpy as np
def predict(data, predictor, rows=500):
    """Score ``data`` through ``predictor`` in batches and return the scores as a 1-D array.

    Args:
        data: 2-D table of feature rows, either a pandas DataFrame or a NumPy array.
        predictor: object whose ``predict(batch)`` returns UTF-8 encoded bytes holding
            that batch's scores separated by commas and/or newlines.
        rows: maximum number of rows sent per ``predict`` call.

    Returns:
        numpy.ndarray of floats with the scores in input order. Empty input yields
        an empty array and makes no ``predict`` call.

    Raises:
        ValueError: if ``rows`` is smaller than 1.
    """
    rows = int(rows)
    if rows < 1:
        raise ValueError("rows must be a positive integer")

    total_rows = data.shape[0]
    scores = []
    for start in range(0, total_rows, rows):
        stop = start + rows
        # Slice positionally rather than calling np.array_split on a DataFrame, which
        # emits a FutureWarning (its tail, `return bound(*args, **kwds)`, shows up in
        # this cell's recorded output).
        batch = data.iloc[start:stop] if hasattr(data, 'iloc') else data[start:stop]
        body = predictor.predict(batch).decode('utf-8')
        # Accept both "a,b,c" and one-score-per-line responses; skip empty tokens.
        scores.extend(
            float(token) for token in body.replace('\n', ',').split(',') if token.strip()
        )

    return np.array(scores, dtype=float)
# Score the test features through the endpoint deployed from `xgb`.
predictions = predict(test_data_x, xgb_predictor)

  return bound(*args, **kwds)


In [39]:
# Cross-tabulate the actual labels against the predictions rounded with np.round.
pd.crosstab(
    index=test_data_y[0],
    columns=np.round(predictions),
    rownames=['actuals'],
    colnames=['predictions'],
)

predictions,0.0,1.0
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3584,51
1,383,101


In [40]:
# Remove the endpoint behind `xgb_predictor`, asking for its endpoint config to be deleted as well.
xgb_predictor.delete_endpoint(delete_endpoint_config=True)

In [41]:
import boto3

# `client` creates/describes/deletes models, endpoint configs and endpoints below;
# `runtime` is used to invoke the endpoint.
client = boto3.client("sagemaker")
runtime = boto3.client("sagemaker-runtime")

In [42]:
# Location of the trained estimator's model data; passed as ModelDataUrl to create_model below.
model_artifacts = xgb.model_data
model_artifacts

's3://sagemaker-us-west-2-975050337104/mlops/activity-3/output/xgboost-2025-04-02-10-34-40-847/output/model.tar.gz'

In [44]:
from time import gmtime, strftime

# Timestamped name for the model registered with SageMaker.
model_name = "xgboost-serverless" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(f"Model name: {model_name}")

# dummy environment variables
byo_container_env_vars = dict(SAGEMAKER_CONTAINER_LOG_LEVEL="20", SOME_ENV_VAR="myEnvVar")

# Register a single-container model from the training image and the trained artifacts.
create_model_response = client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    Containers=[
        dict(
            Image=container,
            Mode="SingleModel",
            ModelDataUrl=model_artifacts,
            Environment=byo_container_env_vars,
        )
    ],
)

print(f"Model Arn: {create_model_response['ModelArn']}")

Model name: xgboost-serverless2025-04-02-11-07-32
Model Arn: arn:aws:sagemaker:us-west-2:975050337104:model/xgboost-serverless2025-04-02-11-07-32


In [47]:
# Timestamped endpoint-config name with one serverless variant (3072 MB, concurrency 1).
xgboost_epc_name = "mlops-serverless-epc" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName=xgboost_epc_name,
    ProductionVariants=[
        dict(
            VariantName="byoVariant",
            ModelName=model_name,
            ServerlessConfig=dict(MemorySizeInMB=3072, MaxConcurrency=1),
        ),
    ],
)

print(f"Endpoint Configuration Arn: {endpoint_config_response['EndpointConfigArn']}")

Endpoint Configuration Arn: arn:aws:sagemaker:us-west-2:975050337104:endpoint-config/mlops-serverless-epc2025-04-02-11-09-40


In [48]:
# Create the endpoint from the serverless endpoint configuration.
endpoint_name = "xgboost-serverless-ep" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

create_endpoint_response = client.create_endpoint(
    EndpointConfigName=xgboost_epc_name,
    EndpointName=endpoint_name,
)

print(f"Endpoint Arn: {create_endpoint_response['EndpointArn']}")

Endpoint Arn: arn:aws:sagemaker:us-west-2:975050337104:endpoint/xgboost-serverless-ep2025-04-02-11-09-42


In [49]:
# Poll describe_endpoint until the endpoint leaves "Creating", then fail loudly
# unless it ended up InService (otherwise the invocation below would fail with a
# less helpful error).
import time

while True:
    describe_endpoint_response = client.describe_endpoint(EndpointName=endpoint_name)
    endpoint_status = describe_endpoint_response["EndpointStatus"]
    print(endpoint_status)
    if endpoint_status != "Creating":
        break
    time.sleep(15)  # only wait when another poll is actually needed

if endpoint_status != "InService":
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {endpoint_status}: "
        f"{describe_endpoint_response.get('FailureReason', 'no failure reason reported')}"
    )

describe_endpoint_response

Creating
Creating
Creating
Creating
Creating
Creating
Creating
InService


{'EndpointName': 'xgboost-serverless-ep2025-04-02-11-09-42',
 'EndpointArn': 'arn:aws:sagemaker:us-west-2:975050337104:endpoint/xgboost-serverless-ep2025-04-02-11-09-42',
 'EndpointConfigName': 'mlops-serverless-epc2025-04-02-11-09-40',
 'ProductionVariants': [{'VariantName': 'byoVariant',
   'DeployedImages': [{'SpecifiedImage': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
     'ResolvedImage': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost@sha256:0c8f830ac408e6dee08445fb60392e9c3f05f790a4b3c07ec22327c08f75bcbf',
     'ResolutionTime': datetime.datetime(2025, 4, 2, 11, 9, 44, 920000, tzinfo=tzlocal())}],
   'CurrentWeight': 1.0,
   'DesiredWeight': 1.0,
   'CurrentInstanceCount': 0,
   'CurrentServerlessConfig': {'MemorySizeInMB': 3072, 'MaxConcurrency': 1}}],
 'EndpointStatus': 'InService',
 'CreationTime': datetime.datetime(2025, 4, 2, 11, 9, 43, 480000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2025, 4, 2, 11, 11, 37, 213000, tzinfo=tzlocal(

In [50]:
# Endpoint invocation
# A single comma-separated row of numeric feature values, sent as raw bytes.
payload = b"3., 999.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 1.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,   0., 0.,   0.,   0.,   0.,   0.,   1.,   0.,   1.,   0.,   0.,   1., 0.,   0.,   1.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   1., 0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,   0., 0.,   1.,   0."

# Call the serverless endpoint through the runtime client, declaring the body as CSV.
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=payload,
    ContentType="text/csv",
)

# Read the response body and decode it to text before printing.
print(response["Body"].read().decode())

0.07072833180427551


In [51]:
# Tear down in dependency order: the endpoint first, then the endpoint config it
# uses, then the model that config references. Deleting the config while the
# endpoint is still live goes against the SageMaker API guidance.
client.delete_endpoint(EndpointName=endpoint_name)
client.delete_endpoint_config(EndpointConfigName=xgboost_epc_name)
client.delete_model(ModelName=model_name)

{'ResponseMetadata': {'RequestId': '8f46e806-e503-4057-b07b-e78201c73706',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '8f46e806-e503-4057-b07b-e78201c73706',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Wed, 02 Apr 2025 11:14:31 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}

In [52]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
# Search space for the tuning job and the metric it optimises.
hyperparameter_ranges = dict(
    eta=ContinuousParameter(min_value=0, max_value=1),
    min_child_weight=ContinuousParameter(min_value=1, max_value=10),
    alpha=ContinuousParameter(min_value=0, max_value=2),
    max_depth=IntegerParameter(min_value=1, max_value=10),
)
objective_metric_name = 'validation:auc'

In [53]:
# Tuner around the `xgb` estimator: up to 20 training jobs, 3 at a time.
tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=3,
)

In [54]:
# Start the tuning job on the same train/validation channels used for `xgb`.
tuner.fit(inputs={'train': s3_input_train, 'validation': s3_input_validation})

..............................................................................................................!


In [55]:
# Report the status of the tuning job that `tuner` launched most recently.
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)['HyperParameterTuningJobStatus']

'Completed'

In [56]:
# Deploy from the tuner to a one-instance endpoint (deleted again at the end).
tuner_predictor = tuner.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)


2025-04-02 11:27:14 Starting - Found matching resource for reuse
2025-04-02 11:27:14 Downloading - Downloading the training image
2025-04-02 11:27:14 Training - Training image download completed. Training in progress.
2025-04-02 11:27:14 Uploading - Uploading generated training model
2025-04-02 11:27:14 Completed - Resource reused by training job: xgboost-250402-1121-010-e75e9990


------!

In [57]:
# Replace the tuned predictor's serializer with the SDK's CSVSerializer.
tuner_predictor.serializer = sagemaker.serializers.CSVSerializer()

In [58]:
# Re-read the test features and labels from S3 (the files have no header row).
# S3 URIs always use '/', so join with an f-string; os.path.join would insert the
# platform's path separator instead.
test_data_x = pd.read_csv(f"{test_path}/test_script_x.csv", header=None)
test_data_y = pd.read_csv(f"{test_path}/test_script_y.csv", header=None)

In [62]:
# Score the same test features through the endpoint deployed from the tuner
# (this overwrites the earlier `predictions`).
predictions = predict(test_data_x, tuner_predictor)

In [63]:
# Cross-tabulate the actual labels against the tuned model's rounded predictions.
pd.crosstab(
    index=test_data_y[0],
    columns=np.round(predictions),
    rownames=['actuals'],
    colnames=['predictions'],
)

predictions,0.0,1.0
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3588,47
1,377,107


In [64]:
# Remove the endpoint behind `tuner_predictor`, asking for its endpoint config to be deleted as well.
tuner_predictor.delete_endpoint(delete_endpoint_config=True)