In [1]:
import numpy as np
import pandas as pd

# Define IAM role
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

## Upload Data to S3

In [2]:
bucket_name = 'robiny-ml-sagemaker'
training_file_key = 'biketrain/bike_train_numeric_columns.recordio'

s3_model_output_location = r's3://{0}/biketrain/model'.format(bucket_name)
s3_training_file_location = r's3://{0}/{1}'.format(bucket_name,training_file_key)

In [3]:
print(s3_model_output_location)
print(s3_training_file_location)

s3://robiny-ml-sagemaker/biketrain/model
s3://robiny-ml-sagemaker/biketrain/bike_train_numeric_columns.recordio


In [4]:
# Write and Reading from S3 is just as easy
# files are referred as objects in S3.  
# file name is referred as key name in S3
# Files stored in S3 are automatically replicated across 3 different availability zones 
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    with open(filename,'rb') as f: # Read in binary mode
        return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)

In [5]:
write_to_s3('rob-bike_train_numeric_columns.recordio', bucket_name, training_file_key)

## Training Algorithm Docker Image
### AWS Maintains a separate image for every region and algorithm

In [6]:
# # Registry Path for algorithms provided by SageMaker
# #  https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
# containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/pca:latest',
#               'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/pca:latest',
#               'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/pca:latest',
#               'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/pca:latest'}

# We no longer have to maintain a mapping of container images by region
# Simply use the convenience method provided by sagemaker
# https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
from sagemaker.amazon.amazon_estimator import get_image_uri
image_name = get_image_uri(boto3.Session().region_name, 'pca')

In [7]:
image_name

'632365934929.dkr.ecr.us-west-1.amazonaws.com/pca:1'

In [8]:
role = get_execution_role()

In [9]:
# This role contains the permissions needed to train, deploy models
# SageMaker Service is trusted to assume this role
print(role)

arn:aws:iam::501191679407:role/service-role/AmazonSageMaker-ExecutionRole-20200530T140508


## Build Model

In [10]:
sess = sagemaker.Session()

In [11]:
# Access appropriate algorithm container image
#  Specify how many instances to use for distributed training and what type of machine to use
#  Finally, specify where the trained model artifacts needs to be stored
#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
#    Optionally, give a name to the training job using base_job_name

estimator = sagemaker.estimator.Estimator(image_name,
                                       role, 
                                       train_instance_count=1, 
                                       train_instance_type='ml.m4.xlarge',
                                       output_path=s3_model_output_location,
                                       sagemaker_session=sess,
                                       base_job_name ='pca-biketrain')

In [12]:
# Specify hyper parameters that appropriate for the training algorithm
estimator.set_hyperparameters(feature_dim=4, # No. of Input features
                        num_components=3, # No. of prinicipal components to compute
                        subtract_mean=False, # Indicated whether the data should be unbiased both during training
                        algorithm_mode='regular', # Mode of computing the principal components
                        mini_batch_size=200) # No. of rows in a mini-batch

In [13]:
estimator.hyperparameters()

{'feature_dim': 4,
 'num_components': 3,
 'subtract_mean': False,
 'algorithm_mode': 'regular',
 'mini_batch_size': 200}

### Train the model

In [14]:
# XGBoost supports "train", "validation" channels
# Reference: Supported channels by algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
estimator.fit({'train':s3_training_file_location}) ## file is recordio format

2020-06-03 06:02:23 Starting - Starting the training job...
2020-06-03 06:02:26 Starting - Launching requested ML instances......
2020-06-03 06:03:34 Starting - Preparing the instances for training......
2020-06-03 06:04:41 Downloading - Downloading input data...
2020-06-03 06:05:16 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[06/03/2020 06:05:38 INFO 140659398784832] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'_num_gpus': u'auto', u'_log_level': u'info', u'subtract_mean': u'true', u'force_dense': u'true', u'epochs': 1, u'algorithm_mode': u'regular', u'extra_components': u'-1', u'_kvstore': u'dist_sync', u'_num_kv_servers': u'auto'}[0m
[34m[06/03/2020 06:05:38 INFO 140659398784832] Reading provided configuration from /opt/ml/input/config/hyperparameters.json: {u'algorithm_mode': u'regular', u'f


2020-06-03 06:05:48 Uploading - Uploading generated training model
2020-06-03 06:05:48 Completed - Training job completed
[34m#metrics {"Metrics": {"finalize.time": {"count": 1, "max": 8.761882781982422, "sum": 8.761882781982422, "min": 8.761882781982422}}, "EndTime": 1591164340.254043, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "PCA"}, "StartTime": 1591164340.244517}
[0m
[34m[06/03/2020 06:05:40 INFO 140659398784832] Test data is not provided.[0m
[34m#metrics {"Metrics": {"totaltime": {"count": 1, "max": 1474.484920501709, "sum": 1474.484920501709, "min": 1474.484920501709}, "setuptime": {"count": 1, "max": 552.9389381408691, "sum": 552.9389381408691, "min": 552.9389381408691}}, "EndTime": 1591164340.254719, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "PCA"}, "StartTime": 1591164340.254107}
[0m
Training seconds: 67
Billable seconds: 67


## Deploy Model

In [16]:
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m4.xlarge',
                             endpoint_name = 'pca-biketrain')

Using already existing model: pca-biketrain-2020-06-03-06-02-22-955


---------------!

## Run Predictions

In [17]:
from sagemaker.predictor import csv_serializer, json_deserializer

predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = json_deserializer

In [18]:
predictor.predict([[-1.333660693,-1.092736969,0.993213054,1.567753667]])

{'projections': [{'projection': [1.6828124523162842,
    0.45077428221702576,
    -1.8276870250701904]}]}

## Summary

1. Ensure Training, Test and Validation data are in S3 Bucket
2. Select Algorithm Container Registry Path - Path varies by region
3. Configure Estimator for training - Specify Algorithm container, instance count, instance type, model output location
4. Specify algorithm specific hyper parameters
5. Train model
6. Deploy model - Specify instance count, instance type and endpoint name
7. Run Predictions

## PCA Amazon SageMaker Prediction Invocation

In [19]:
# Acquire a realtime endpoint
endpoint_name = 'pca-biketrain'
predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

In [20]:
from sagemaker.predictor import csv_serializer, json_deserializer

predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = json_deserializer

In [21]:
# We are not going to use numeric features: 'temp','atemp','humidity','windspeed'
# Instead, we are going to use new components (aka features) generated by PCA for model training and testing
columns = ['count', 'season', 'holiday', 'workingday', 'weather','year', 'month', 'day', 'dayofweek','hour']

# PCA Training
colums_for_pca = ['temp','atemp','humidity','windspeed']

In [23]:
df_train = pd.read_csv('rob-train_normalized.csv')
display(df_train.head())

df_test = pd.read_csv('rob-test_normalized.csv')
display(df_test.head())

Unnamed: 0,count,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2.833213,1,0,0,1,-1.333661,-1.092737,0.993213,-1.567754,2011,1,1,5,0
1,3.713572,1,0,0,1,-1.438907,-1.182421,0.941249,-1.567754,2011,1,1,5,1
2,3.496508,1,0,0,1,-1.438907,-1.182421,0.941249,-1.567754,2011,1,1,5,2
3,2.639057,1,0,0,1,-1.333661,-1.092737,0.68143,-1.567754,2011,1,1,5,3
4,0.693147,1,0,0,1,-1.333661,-1.092737,0.68143,-1.567754,2011,1,1,5,4


Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,1,0,1,1,-1.228414,-1.450292,-0.305883,1.617227,2011,1,20,3,0
1,1,0,1,1,-1.228414,-1.182421,-0.305883,-1.567754,2011,1,20,3,1
2,1,0,1,1,-1.228414,-1.182421,-0.305883,-1.567754,2011,1,20,3,2
3,1,0,1,1,-1.228414,-1.271515,-0.305883,-0.22023,2011,1,20,3,3
4,1,0,1,1,-1.228414,-1.271515,-0.305883,-0.22023,2011,1,20,3,4


In [24]:
df_train[colums_for_pca].head()

Unnamed: 0,temp,atemp,humidity,windspeed
0,-1.333661,-1.092737,0.993213,-1.567754
1,-1.438907,-1.182421,0.941249,-1.567754
2,-1.438907,-1.182421,0.941249,-1.567754
3,-1.333661,-1.092737,0.68143,-1.567754
4,-1.333661,-1.092737,0.68143,-1.567754


In [25]:
test = df_train[colums_for_pca].head().as_matrix()

  if __name__ == '__main__':


In [26]:
result = predictor.predict(test)
print(result)

{'projections': [{'projection': [-0.5232375860214233, -1.7736060619354248, -1.7270781993865967]}, {'projection': [-0.5697786808013916, -1.7339260578155518, -1.8610637187957764]}, {'projection': [-0.5697786808013916, -1.7339260578155518, -1.8610637187957764]}, {'projection': [-0.7436795234680176, -1.554117202758789, -1.7062432765960693]}, {'projection': [-0.7436795234680176, -1.554117202758789, -1.7062432765960693]}]}


In [27]:
l = [values['projection'] for values in result['projections']]
print(l)

[[-0.5232375860214233, -1.7736060619354248, -1.7270781993865967], [-0.5697786808013916, -1.7339260578155518, -1.8610637187957764], [-0.5697786808013916, -1.7339260578155518, -1.8610637187957764], [-0.7436795234680176, -1.554117202758789, -1.7062432765960693], [-0.7436795234680176, -1.554117202758789, -1.7062432765960693]]


In [28]:
df_temp = pd.DataFrame(l)
df_temp

Unnamed: 0,0,1,2
0,-0.523238,-1.773606,-1.727078
1,-0.569779,-1.733926,-1.861064
2,-0.569779,-1.733926,-1.861064
3,-0.74368,-1.554117,-1.706243
4,-0.74368,-1.554117,-1.706243


In [29]:
# For large number of predictions, we can split the input data and
# Query the prediction service.
# array_split is convenient to specify how many splits are needed
def get_projection(arr_features):
    projections = []
    for arr in np.array_split(arr_features,100):        
        if arr.shape[0] > 0:
            print (arr.shape)
            result = predictor.predict(arr)
            projections += [values['projection'] for values in result['projections']]
    return projections

In [30]:
def replace_features(predictor, df, colums_for_pca):
    
    arr_features = df[colums_for_pca].as_matrix()
    
    projections = get_projection(arr_features)
    df_projection = pd.DataFrame(projections)
    
    tcols = []
    # New column names
    for i in range(df_projection.shape[1]):       
        tcols.append('component_' + str(i))
    
    df_projection.columns = tcols
    print ('components:',tcols)
    
    
    for col in df_projection.columns:
        df[col] = df_projection[col]
    
    df.drop(colums_for_pca, inplace=True, axis=1)
    
    return tcols

In [31]:
##
df_train.head(2)

Unnamed: 0,count,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2.833213,1,0,0,1,-1.333661,-1.092737,0.993213,-1.567754,2011,1,1,5,0
1,3.713572,1,0,0,1,-1.438907,-1.182421,0.941249,-1.567754,2011,1,1,5,1


In [32]:
##
new_cols = replace_features(predictor, df_train, colums_for_pca)

  app.launch_new_instance()


(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
components: ['component_0', 'component_1', 'component_2']


In [33]:
replace_features(predictor, df_test, colums_for_pca)

(65, 4)
(65, 4)


  app.launch_new_instance()


(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(64, 4)
(64, 4)
(64, 4)
(64, 4)
(64, 4)
(64, 4)
(64, 4)
components: ['component_0', 'component_1', 'component_2']


['component_0', 'component_1', 'component_2']

In [34]:
## 

for col in new_cols:
    columns.append(col)
columns

['count',
 'season',
 'holiday',
 'workingday',
 'weather',
 'year',
 'month',
 'day',
 'dayofweek',
 'hour',
 'component_0',
 'component_1',
 'component_2']


## Training, Validation and Test Set
### Target Variable as first column followed by input features
### Training, Validation files do not have a column header

In [35]:
# Training = 70% of the data
# Validation = 30% of the data
# Randomize the datset
np.random.seed(5)
l = list(df_train.index)
np.random.shuffle(l)
df_train = df_train.iloc[l]

In [36]:
rows = df_train.shape[0]
train = int(.7 * rows)
test = int(.3 * rows)
print(rows, train, test)

10886 7620 3265


In [37]:
# Write Training Set
df_train[:train].to_csv('bike_train_pca_cloud.csv',
                        index=False,
                        header=False,
                        columns=columns)
# Write Validation Set
df_train[train:].to_csv('bike_validation_pca_cloud.csv',
                        index=False,
                        header=False,
                        columns=columns)

# Test Data has only input features
df_test.to_csv('bike_test_pca_cloud.csv',
               index=False)

In [38]:
# Write Column List
with open('bike_train_column_list_pca_cloud.txt','w') as f:
    f.write(','.join(columns))