In [1]:
import numpy as np
import pandas as pd

import boto3
import re

import sagemaker
from sagemaker import get_execution_role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


#### Upload RecordIO data to S3

In [2]:
### Import s3 bucket name as environment variable

import os

# Read ./.env with plain Python instead of `!cat`: the cell no longer depends on
# IPython shell capture, and a missing file raises FileNotFoundError here rather
# than the shell's error text being parsed as if it were a variable.
with open('./.env') as env_file:
    env_vars = env_file.read().splitlines()

for var in env_vars:
    entry = var.strip()
    # Blank lines and '#' comments carry no KEY=VALUE pair.
    if not entry or entry.startswith('#'):
        continue
    # Split on the first '=' only, so a value may itself contain '='.
    key, separator, value = entry.partition('=')
    if not separator:
        raise ValueError(f'Malformed line in ./.env (expected KEY=VALUE): {entry!r}')
    os.environ[key.strip()] = value.strip()

In [3]:
# Initializing s3 bucket and s3 locations

# Read the bucket name from the process environment; raises KeyError if
# BUCKET_NAME has not been set.
bucket_name = os.environ['BUCKET_NAME']

In [4]:
# Object key of the RecordIO training file inside the bucket
training_file_key = 'biketrain/bike_train_numeric_columns.recordio'

# Full S3 URIs built from the bucket name: the training file and the model output prefix
s3_training_file_location = f's3://{bucket_name}/{training_file_key}'
s3_model_output_location = f's3://{bucket_name}/biketrain/model'

In [7]:
# Upload a local file to S3

def write_to_s3(filename,bucket,key):
    """Upload the local file `filename` to the S3 object `key` in `bucket`.

    The file is opened in binary mode and streamed through the S3 resource
    API of a freshly created boto3 session. Returns whatever
    `upload_fileobj` returns.
    """
    with open(filename, mode='rb') as payload:
        s3_resource = boto3.Session().resource('s3')
        destination = s3_resource.Bucket(bucket).Object(key)
        outcome = destination.upload_fileobj(payload)
    return outcome

In [8]:
# Upload the local RecordIO file to the training key in the bucket
write_to_s3(
    filename='bike_train_numeric_columns.recordio',
    bucket=bucket_name,
    key=training_file_key,
)

### Train the algorithm using its Docker image

In [9]:
# Name used as the training job prefix and in the checkpoint path
job_name = 'pca-biketrain-v1'

# Spot-training switch; max_wait and the checkpoint location are only set when it is on
use_spot_instances = True
max_run = 3600

if use_spot_instances:
    max_wait = 3600
else:
    max_wait = None

# NOTE(review): this prefix is 'bikerental' while the training data and model
# paths use 'biketrain' — confirm this is intentional.
checkpoint_s3_uri = (
    f's3://{bucket_name}/bikerental/checkpoints/{job_name}'
    if use_spot_instances
    else None
)


In [11]:
# SageMaker session; later cells read its region and pass it to the Estimator.
sess = sagemaker.Session()

In [12]:
# Execution role returned by get_execution_role(); passed to the Estimator below.
role = get_execution_role()

In [17]:
# SDK 2: look up the PCA container image for the session's region via image_uris.retrieve

container = sagemaker.image_uris.retrieve(
    framework='pca',
    region=sess.boto_region_name,
)

print(f'Using pca Container {container}')

Using pca Container 382416733822.dkr.ecr.us-east-1.amazonaws.com/pca:1


#### Build Model

In [19]:
# Configure the training job (SDK 2.0 Estimator):
#   - algorithm container image and execution role
#   - number and type of training instances
#   - S3 location where the trained model artifacts are stored
#   - base_job_name gives the training job a readable name prefix
#   - spot-instance settings (run/wait limits and checkpoint location)
# Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html

estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name=job_name,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri,
)

In [20]:
# Specify the hyperparameters appropriate for the training algorithm:
# 4 input features projected onto 2 components.

estimator.set_hyperparameters(**{
    'feature_dim': 4,
    'num_components': 2,
    'subtract_mean': False,
    'algorithm_mode': 'regular',
    'mini_batch_size': 200,
})

In [21]:
# Display the hyperparameters currently set on the estimator
estimator.hyperparameters()

{'feature_dim': 4,
 'num_components': 2,
 'subtract_mean': False,
 'algorithm_mode': 'regular',
 'mini_batch_size': 200}

#### Train the model

In [22]:
# Train the PCA estimator; only a 'train' channel is supplied, pointing at the
# uploaded RecordIO training file

estimator.fit({'train':s3_training_file_location})

INFO:sagemaker:Creating training-job with name: pca-biketrain-v1-2024-05-12-00-10-41-672


2024-05-12 00:10:41 Starting - Starting the training job...
2024-05-12 00:10:57 Starting - Preparing the instances for training...
2024-05-12 00:11:41 Downloading - Downloading the training image.........
2024-05-12 00:13:07 Training - Training image download completed. Training in progress.
2024-05-12 00:13:07 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
[34m[05/12/2024 00:13:00 INFO 139783073036096] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'algorithm_mode': 'regular', 'subtract_mean': 'true', 'extra_components': '-1', 'force_dense': 'true', 'epochs': 1, '_log_level': 'info', '_kvstore': 'dist_sync', '_num_kv_servers': 'auto', '_num_gpus': 'auto'}[0m
[34m[05/12/2024 00:13:00 INFO 139783073036096] Merging with provided configuration from /opt/m


2024-05-12 00:13:18 Completed - Training job completed
Training seconds: 112
Billable seconds: 48
Managed Spot Training savings: 57.1%


#### Deploy model

In [26]:
# Deploy the trained model to a single-instance endpoint named after job_name
predictor = estimator.deploy(
    endpoint_name=job_name,
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: pca-biketrain-v1-2024-05-12-00-14-44-592
INFO:sagemaker:Creating endpoint-config with name pca-biketrain-v1
INFO:sagemaker:Creating endpoint with name pca-biketrain-v1


------!

#### Run Predictions

In [29]:
# SDK 2.0 serializers: requests are sent as CSV, and the JSON response
# from PCA is decoded with the JSON deserializer
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

predictor.serializer, predictor.deserializer = CSVSerializer(), JSONDeserializer()

In [30]:
# Project a single four-feature row onto the two components
predictor.predict([[0.11876090844342149,0.1737361053905541,0.9776050390159696,0.0]])

{'projections': [{'projection': [-0.3131008744239807, -0.9453434944152832]}]}

In [31]:
# Project another single row, given with six-decimal values
predictor.predict([[0.110467,0.166986,0.979751,0.0]])

{'projections': [{'projection': [-0.32269519567489624, -0.9424563646316528]}]}

In [32]:
# Project a batch of five rows in a single request
predictor.predict([
    [0.11876091, 0.17373611, 0.97760504, 0.0],
    [0.11046696, 0.16698636, 0.9797513, 0.0],
    [0.11046696, 0.16698636, 0.9797513, 0.0],
    [0.12779176, 0.1869474, 0.97402255, 0.0],
    [0.12779176, 0.1869474, 0.97402255, 0.0],
])

{'projections': [{'projection': [-0.3131008446216583, -0.9453434944152832]},
  {'projection': [-0.32269516587257385, -0.9424567222595215]},
  {'projection': [-0.32269516587257385, -0.9424567222595215]},
  {'projection': [-0.2985489070415497, -0.9494175910949707]},
  {'projection': [-0.2985489070415497, -0.9494175910949707]}]}