In [29]:
# This example shows how to invoke SageMaker Endpoint from outside of AWS environment using SageMaker SDK
# Endpoint: XGBoost - Kaggle Bike Rental - Regressor Trained in XGBoost Lectures
# Makesure Endpoint is deployed before running this example
# 
# Reference:
#  https://github.com/awslabs/amazon-sagemaker-examples

# NOTE: SDK now requires additional permissions DescribeEndpoint, DescribeEndpointConfig in-addition to InvokeEndpoint
#   Please update SageMakerInvokeEndpoint permissions to reflect this policy document:
#   Logon with my_admin account and update permissions (IAM->Policies->SageMakerInvokeEndpoint->Edit Policy)
#   
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "VisualEditor0",
            "Effect": "Allow",
            "Action": [
                "sagemaker:DescribeEndpointConfig",
                "sagemaker:DescribeEndpoint",
                "sagemaker:InvokeEndpoint"
            ],
            "Resource": "*"
        }
    ]
}

{'Version': '2012-10-17',
 'Statement': [{'Sid': 'VisualEditor0',
   'Effect': 'Allow',
   'Action': ['sagemaker:DescribeEndpointConfig',
    'sagemaker:DescribeEndpoint',
    'sagemaker:InvokeEndpoint'],
   'Resource': '*'}]}

In [30]:
import boto3
import sagemaker
import math
import dateutil
import numpy as np
from sagemaker.predictor import csv_serializer, json_deserializer, json_serializer, csv_deserializer

In [31]:
# Establish a session with AWS
# Specify credentials and region to be used for this session.
# We will use a ml_user_predict credentials that has limited privileges
# boto_session = boto3.Session(profile_name='ml_user_predict',region_name='us-east-1')
boto_session = boto3.Session(
    aws_access_key_id='AKIA4DNPQJWROKTKHBXF',
    aws_secret_access_key='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'
    
)

In [32]:
sess = sagemaker.Session(boto_session=boto_session)

In [33]:
# Get RealTimePredictor using SageMaker SDK
# Specify Your Endpoint Name
endpoint_name = 'bike-rental-v1'

predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name,
                                                 sagemaker_session=sess)

In [34]:
# We are sending data for inference in CSV format
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = None

In [35]:
#datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
# Actual=562
sample_one = '2012-12-19 17:00:00,4,0,1,1,16.4,20.455,50,26.0027'
# Actual=569
sample_two = '2012-12-19 18:00:00,4,0,1,1,15.58,19.695,50,23.9994'
# Actual=4
sample_three = '2012-12-10 01:00:00,4,0,1,2,14.76,18.94,100,0'

In [36]:
# Try invoking predict method
# It won't work...because we had transformed the original data to replace date time with a set of categorical features
predictor.predict(sample_one)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (415) from model with message "Loading csv data failed with Exception, please ensure data is in csv format:
 <type 'exceptions.ValueError'>
 invalid literal for float(): 2012-12-19 17:00:00". See https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logEventViewer:group=/aws/sagemaker/Endpoints/bike-rental-v1 in account 831981112738 for more information.

In [37]:
# Raw Data Structure: 
# datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count

# Model expects data in this format (it was trained with these features):
# season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour

def transform_data(data):
    features = data.split(',')
    
    # Extract year, month, day, dayofweek, hour
    dt = dateutil.parser.parse(features[0])

    features.append(str(dt.year))
    features.append(str(dt.month))
    features.append(str(dt.day))
    features.append(str(dt.weekday()))
    features.append(str(dt.hour))
    
    # Return the transformed data. skip datetime field
    return ','.join(features[1:])

In [38]:
print('Raw Data:\n',sample_one)
print('Transformed Data:\n',transform_data(sample_one))

Raw Data:
 2012-12-19 17:00:00,4,0,1,1,16.4,20.455,50,26.0027
Transformed Data:
 4,0,1,1,16.4,20.455,50,26.0027,2012,12,19,2,17


In [39]:
# Let's invoke prediction now
# Actual=562
predictor.predict(transform_data(sample_one))

b'573.628295898'

In [40]:
# Actual=569
predictor.predict(transform_data(sample_two))

b'547.521606445'

In [41]:
# Actual=4
predictor.predict(transform_data(sample_three))

b'10.4238166809'

In [42]:
# Actual Count is 562...but predicted is 6.36.

# Model was trained with log1p(count)
# So, we need to apply inverse transformation to get the actual count
# Predicted Count looks much better now
result = predictor.predict(transform_data(sample_one))
result = result.decode("utf-8")
print ('Predicted Count', math.expm1(float(result)))

Predicted Count 1.3292405212588473e+249


In [43]:
result

'573.628295898'

In [44]:
result = predictor.predict([transform_data(sample_one), transform_data(sample_two)])

In [45]:
result.decode("utf-8")

'573.628295898,547.521606445'

In [46]:
# Batch Prediction
# Transform data and invoke prediction in specified batch sizes
def run_predictions(data, batch_size):
    predictions = []
    
    transformed_data = [transform_data(row.strip()) for row in data]
    
    for i in range(0, len(data), batch_size):
        
        print(i,i+batch_size)
        
        result = predictor.predict(transformed_data[i : i + batch_size])
        
        result = result.decode("utf-8")
        result = result.split(',')
        
        predictions += [np.exp(float(r)) for r in result]
                
    return predictions

In [47]:
run_predictions([sample_one,sample_two,sample_three],10)

0 10


[1.3292405212588473e+249, 6.103970160235504e+237, 33651.626876408816]

In [48]:
# Run a batch prediction on Test.CSV File
# Read the file content
data = []
with open('test.csv','r') as f:
    # skip header
    f.readline()
    # Read remaining lines
    data = f.readlines()

In [49]:
len(data)

6493

In [50]:
%%time
predictions = run_predictions(data,100)

0 100
100 200
200 300
300 400
400 500
500 600
600 700
700 800
800 900
900 1000
1000 1100
1100 1200
1200 1300
1300 1400
1400 1500
1500 1600
1600 1700
1700 1800
1800 1900
1900 2000
2000 2100
2100 2200
2200 2300
2300 2400
2400 2500
2500 2600
2600 2700
2700 2800
2800 2900
2900 3000
3000 3100
3100 3200
3200 3300
3300 3400
3400 3500
3500 3600
3600 3700
3700 3800
3800 3900




3900 4000
4000 4100
4100 4200
4200 4300
4300 4400
4400 4500
4500 4600
4600 4700
4700 4800
4800 4900
4900 5000
5000 5100
5100 5200
5200 5300
5300 5400
5400 5500
5500 5600
5600 5700
5700 5800
5800 5900
5900 6000
6000 6100
6100 6200
6200 6300
6300 6400
6400 6500
CPU times: user 587 ms, sys: 19.4 ms, total: 606 ms
Wall time: 20.6 s


In [51]:
len(predictions),len(data)

(6493, 6493)