In [118]:
%matplotlib inline
import numpy as np
import pandas as pd
import os
import sys
import math
import matplotlib.pyplot as plt

XGBoost Prediction Invocation

Invoke the SageMaker prediction service

In [119]:
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

In [120]:
# Acquire a handle to the real-time endpoint by name.
# NOTE(review): assumes an endpoint with this name is already deployed and in
# service — confirm before running.
# NOTE(review): RealTimePredictor(endpoint=...) is the SageMaker Python SDK v1
# spelling; SDK v2 renamed it to Predictor(endpoint_name=...). Verify which SDK
# version this notebook runs against.
endpoint_name = 'xgboost-bikeshare-v2'
predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

In [121]:
# We need to specify the format in which the test data is supplied to the endpoint.

# NOTE(review): json_deserializer is imported but not used in the cells shown here.
from sagemaker.predictor import csv_serializer, json_deserializer

predictor.content_type = 'text/csv' # Always specify content type as 'text/csv' to avoid model data type error.
predictor.serializer = csv_serializer  # requests are sent using the CSV serializer
predictor.deserializer = None  # no deserializer: predict() returns the raw response bytes

In [122]:
# Load the test set from a CSV file in the notebook's working directory.
df_test = pd.read_csv('bike_test.csv')

In [123]:
# Preview the first few rows to confirm the file was parsed as expected.
df_test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,3,0
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,1
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,2
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,3
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,4


In [124]:
# Every column except the first one (datetime); these are the columns that are
# converted to an array and sent to the endpoint below.
df_test.columns[1:]

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek', 'hour'],
      dtype='object')

In [125]:
# For prediction we need to pass an array to the predictor;
# a NumPy array or a list of values both work.
# Skip the first column (datetime) and keep every remaining column as a feature.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0,
# so select the columns positionally and take .values, which produces the same
# ndarray on both old and new pandas releases.
test_arr = df_test.iloc[:, 1:].values

  app.launch_new_instance()


In [126]:
# Check the array dimensions: (number of test rows, number of feature columns).
test_arr.shape

(6493, 13)

In [127]:
# Inspect the first five rows of the array that will be sent to the endpoint.
test_arr[:5]

array([[1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.13650e+01, 5.60000e+01, 2.60027e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.36350e+01, 5.60000e+01, 0.00000e+00, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 1.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.36350e+01, 5.60000e+01, 0.00000e+00, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 2.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.28800e+01, 5.60000e+01, 1.10014e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 3.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.28800e+01, 5.60000e+01, 1.10014e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 4.00000e+00]])

In [128]:
# Smoke test: send only the first two rows before scoring the full test set.
result = predictor.predict(test_arr[:2])

In [129]:
# Show the raw response returned by the endpoint for the two-row request.
result

b'2.23719191551,1.81957924366'

In [130]:
# Re-check the total number of rows before splitting the array into batches.
test_arr.shape

(6493, 13)

In [131]:
# Score the test set in small batches instead of one large request: this keeps
# the load on the endpoint low, which makes it easier to stay within free-tier
# limits while experimenting.
# np.array_split only needs the number of chunks we want.
predictions = []
for arr in np.array_split(test_arr, 10):
    # predict() returns a byte string of comma-separated values; decode it as
    # UTF-8 text and split it into one string per prediction.
    result = predictor.predict(arr).decode('utf-8').split(',')
    print(arr.shape)
    predictions.extend(float(r) for r in result)

(650, 13)
(650, 13)
(650, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)


In [132]:
# Sanity check: the number of predictions should equal the number of test rows.
len(predictions)

6493

In [134]:
# Predictions are expressed as log1p(count), so they need to be converted back
# to actual counts using np.expm1 (the inverse of np.log1p).
np.expm1(predictions)

array([  8.36699102,   5.16926215,   3.6755613 , ..., 133.05179581,
       100.92993831,  51.28660088])

In [135]:
# Store the back-transformed predictions on the test frame as a 'count' column.
df_test['count'] = np.expm1(predictions)

In [136]:
# Confirm the new 'count' column appears alongside the original columns.
df_test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour,count
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,3,0,8.366991
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,1,5.169262
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,2,3.675561
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,3,2.189196
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,4,2.168817


In [137]:
# Write only the datetime and predicted count columns; index=False leaves the
# DataFrame's row index out of the file.
df_test[['datetime', 'count']].to_csv('bikeshare_predictedcount_cloud.csv', index=False)