### Invoke SageMaker Endpoint to predict

In [2]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

import boto3
import re
from sagemaker import get_execution_role
import sagemaker

# SDK 2 serializers and deserializers
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

Matplotlib is building the font cache; this may take a moment.


In [3]:
# SageMaker SDK v2: RealTimePredictor was renamed to Predictor.

# Name of the endpoint to call. The endpoint must already exist;
# nothing in this cell creates or deploys it.
endpoint_name = 'xgboost-bikerental-v1'

# Client-side handle pointing at that existing endpoint.
predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)

In [4]:
# Serialize request payloads (e.g. numpy arrays) as CSV text before sending them.
predictor.serializer = CSVSerializer()

In [5]:
# Load the rows to be scored; the path is relative to the notebook's working directory.
df_all = pd.read_csv('bike_test_rev3.csv')

In [6]:
df_all.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,3,0
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,1
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,2
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,3
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,4


In [7]:
df_all.columns[:]

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek',
       'hour'],
      dtype='object')

In [8]:
df_all.columns[1:]

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek', 'hour'],
      dtype='object')

In [26]:
# The endpoint is sent a plain numeric array, so drop every non-feature column.
# Columns are dropped by name rather than with a positional columns[1:] slice:
# a later cell adds the predicted 'count' column to df_all, and a positional
# slice would then leak that column into the features if this cell is re-run.
# errors='ignore' lets the same line work before 'count' exists.
arr_test = df_all.drop(columns=['datetime', 'count'], errors='ignore').values

In [27]:
df_all[df_all.columns[1:]].values

array([[ 1.,  0.,  1., ..., 20.,  3.,  0.],
       [ 1.,  0.,  1., ..., 20.,  3.,  1.],
       [ 1.,  0.,  1., ..., 20.,  3.,  2.],
       ...,
       [ 1.,  0.,  1., ..., 31.,  0., 21.],
       [ 1.,  0.,  1., ..., 31.,  0., 22.],
       [ 1.,  0.,  1., ..., 31.,  0., 23.]])

In [28]:
type(arr_test)

numpy.ndarray

In [29]:
arr_test.shape

(6493, 13)

In [30]:
arr_test[:5]

array([[1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.13650e+01, 5.60000e+01, 2.60027e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.36350e+01, 5.60000e+01, 0.00000e+00, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 1.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.36350e+01, 5.60000e+01, 0.00000e+00, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 2.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.28800e+01, 5.60000e+01, 1.10014e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 3.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.28800e+01, 5.60000e+01, 1.10014e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 4.00000e+00]])

In [31]:
# Display arrays with 2 digits of precision; this changes printing only, not the data.
np.set_printoptions(precision=2)

In [32]:
arr_test[:5]

array([[1.00e+00, 0.00e+00, 1.00e+00, 1.00e+00, 1.07e+01, 1.14e+01,
        5.60e+01, 2.60e+01, 2.01e+03, 1.00e+00, 2.00e+01, 3.00e+00,
        0.00e+00],
       [1.00e+00, 0.00e+00, 1.00e+00, 1.00e+00, 1.07e+01, 1.36e+01,
        5.60e+01, 0.00e+00, 2.01e+03, 1.00e+00, 2.00e+01, 3.00e+00,
        1.00e+00],
       [1.00e+00, 0.00e+00, 1.00e+00, 1.00e+00, 1.07e+01, 1.36e+01,
        5.60e+01, 0.00e+00, 2.01e+03, 1.00e+00, 2.00e+01, 3.00e+00,
        2.00e+00],
       [1.00e+00, 0.00e+00, 1.00e+00, 1.00e+00, 1.07e+01, 1.29e+01,
        5.60e+01, 1.10e+01, 2.01e+03, 1.00e+00, 2.00e+01, 3.00e+00,
        3.00e+00],
       [1.00e+00, 0.00e+00, 1.00e+00, 1.00e+00, 1.07e+01, 1.29e+01,
        5.60e+01, 1.10e+01, 2.01e+03, 1.00e+00, 2.00e+01, 3.00e+00,
        4.00e+00]])

In [35]:
np.set_printoptions(precision=8)  # restore NumPy's default display precision (8)

In [36]:
arr_test[:5]

array([[1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.13650e+01, 5.60000e+01, 2.60027e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.36350e+01, 5.60000e+01, 0.00000e+00, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 1.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.36350e+01, 5.60000e+01, 0.00000e+00, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 2.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.28800e+01, 5.60000e+01, 1.10014e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 3.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.28800e+01, 5.60000e+01, 1.10014e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 4.00000e+00]])

In [39]:
# Smoke test: score only the first two rows before sending the whole dataset.
# No deserializer is set on the predictor, so the response comes back as raw bytes.
result =  predictor.predict(arr_test[:2])

In [40]:
arr_test[:2]

array([[1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.13650e+01, 5.60000e+01, 2.60027e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.36350e+01, 5.60000e+01, 0.00000e+00, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 1.00000e+00]])

In [41]:
result

b'2.332122325897217\n1.9005593061447144\n'

In [42]:
arr_test.shape

(6493, 13)

### Split the input data into chunks
There are thousands of rows in this data set that need inference.

When communicating over the internet, it is a good idea to split the data into chunks to prevent payload-size and timeout errors.

In [45]:
# For a large number of predictions, split the input into batches and query
# the prediction service once per batch.
# np.array_split takes the number of batches and tolerates uneven division.

# The response is split with a regular expression because XGBoost 1.2-2
# returns predicted values with inconsistent delimiters (comma, newline or both).

# The pattern matches one or more characters that cannot be part of a float
# literal. Sign characters and the exponent marker are deliberately kept:
# splitting on everything except digits and '.' would strip the minus sign
# from negative predictions and cut scientific notation such as 1.5e-05
# into two separate (wrong) values.

pattern = r'[^0-9eE.+\-]+'

predictions = []
for arr in np.array_split(arr_test, 10):
    result = predictor.predict(arr)
    result = re.split(pattern, result.decode("utf-8"))

    print(arr.shape)
    # Leading/trailing delimiters produce empty tokens; skip them.
    predictions += [float(r) for r in result if r != ""]

# Each input row must yield exactly one prediction; fail here with a clear
# message instead of later when the values are attached to the DataFrame.
assert len(predictions) == len(arr_test), (
    f"expected {len(arr_test)} predictions, got {len(predictions)}"
)

(650, 13)
(650, 13)
(650, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)


In [46]:
len(predictions)

6493

In [47]:
np.expm1(predictions)

array([  9.29977784,   5.68963495,   4.11209937, ..., 133.68598006,
        92.35797767,  52.00191752])

In [48]:
# Apply expm1 (the inverse of log1p) to the raw predictions and store them as 'count'.
# NOTE(review): presumably the model was trained on log1p(count) - confirm against the training notebook.
df_all['count'] = np.expm1(predictions)

In [49]:
df_all.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour,count
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,3,0,9.299778
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,1,5.689635
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,2,4.112099
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,3,1.822707
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,4,1.604999


In [52]:
# Save only the timestamp and the predicted count; index=False leaves out the row index.
df_all[['datetime','count']].to_csv('predicted_count_cloud.csv',index=False)

In [53]:
# Delete the endpoint to prevent unnecessary charges
predictor.delete_endpoint()