## Train a model with bike rental data using XGBoost algorithm
###  Model is trained with SageMaker's XGBoost algorithm


In [1]:
## Libraries
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

### Invoke SageMaker predictive service

In [2]:
import boto3
import re
from sagemaker import get_execution_role
import sagemaker
# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

## Upload data to S3

In [4]:
# Specify your bucket name
bucket_name = 'robiny-ml-sagemaker'

training_folder = r'rob-bikerental/training/'
validation_folder = r'rob-bikerental/validation/'
test_folder = r'rob-bikerental/test/'

s3_model_output_location = r's3://{0}/rob-bikerental/model'.format(bucket_name)
s3_training_file_location = r's3://{0}/{1}'.format(bucket_name, training_folder)
s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name, validation_folder)
s3_test_file_location = r's3://{0}/{1}'.format(bucket_name, test_folder)

In [5]:
print(s3_model_output_location)
print(s3_training_file_location)
print(s3_validation_file_location)
print(s3_test_file_location)

s3://robiny-ml-sagemaker/rob-bikerental/model
s3://robiny-ml-sagemaker/rob-bikerental/training/
s3://robiny-ml-sagemaker/rob-bikerental/validation/
s3://robiny-ml-sagemaker/rob-bikerental/test/


### Write and Read file from S3

In [6]:
# Write and Reading from S3 is just as easy
# files are referred as objects in S3.  
# file name is referred as key name in S3

# File stored in S3 is automatically replicated across 3 different availability zones 
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    with open(filename,'rb') as f: # Read in binary mode
        return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)

write_to_s3('rob_bike_train.csv', 
            bucket_name,
            training_folder + 'rob_bike_train.csv')

write_to_s3('rob_bike_validation.csv',
            bucket_name,
            validation_folder + 'rob_bike_validation.csv')

write_to_s3('rob_bike_test.csv',
            bucket_name,
            test_folder + 'rob_bike_test.csv')

## Training Algorithm Docker Image
### SageMaker maintains a separate image for algorithm and region
https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html

In [7]:
# Establish a session with AWS
sess = sagemaker.Session()
role = get_execution_role()

# This role contains the permissions needed to train, deploy models
# SageMaker Service is trusted to assume this role
print(role)

arn:aws:iam::501191679407:role/service-role/AmazonSageMaker-ExecutionRole-20200530T140508


In [8]:
# Sagemaker API now maintains the algorithm container mapping for us
# Specify the region, algorithm and version
container = sagemaker.amazon.amazon_estimator.get_image_uri(
    sess.boto_region_name,
    "xgboost", 
    "latest")

print('Using SageMaker XGBoost container:\n{} ({})'.format(container, sess.boto_region_name))

	get_image_uri(region, 'xgboost', '0.90-1').


Using SageMaker XGBoost container:
632365934929.dkr.ecr.us-west-1.amazonaws.com/xgboost:latest (us-west-1)


## Build the model

In [9]:
# Configure the training job
# Specify type and number of instances to use
# S3 location where final artifacts needs to be stored

#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html

estimator = sagemaker.estimator.Estimator(
    container,
    role, 
    train_instance_count=1, 
    train_instance_type='ml.m4.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name ='xgboost-bikerental')

In [10]:
# Specify hyper parameters that appropriate for the training algorithm
# XGBoost Training Parameter Reference: 
#   https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

# max_depth=5,eta=0.1,subsample=0.7,num_round=150
estimator.set_hyperparameters(max_depth=5,
                              objective="reg:linear",
                              eta=0.1,
                              num_round=150)

### Specify Training Data Location and Optionally, Validation Data Location

In [11]:
# content type can be libsvm or csv for XGBoost
training_input_config = sagemaker.session.s3_input(
    s3_data=s3_training_file_location,
    content_type='csv',
    s3_data_type='S3Prefix')

validation_input_config = sagemaker.session.s3_input(
    s3_data=s3_validation_file_location,
    content_type='csv',
    s3_data_type='S3Prefix'
)

data_channels = {'train': training_input_config, 'validation': validation_input_config}

print(training_input_config.config)
print(validation_input_config.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://robiny-ml-sagemaker/rob-bikerental/training/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://robiny-ml-sagemaker/rob-bikerental/validation/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


## Train the model

In [12]:
# XGBoost supports "train", "validation" channels
# Reference: Supported channels by algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
estimator.fit(data_channels)

2020-06-02 19:34:17 Starting - Starting the training job...
2020-06-02 19:34:19 Starting - Launching requested ML instances......
2020-06-02 19:35:30 Starting - Preparing the instances for training......
2020-06-02 19:36:31 Downloading - Downloading input data...
2020-06-02 19:37:18 Training - Training image download completed. Training in progress.
2020-06-02 19:37:18 Uploading - Uploading generated training model.[34mArguments: train[0m
[34m[2020-06-02:19:37:13:INFO] Running standalone xgboost training.[0m
[34m[2020-06-02:19:37:13:INFO] File size need to be processed in the node: 0.65mb. Available memory size in the node: 8453.14mb[0m
[34m[2020-06-02:19:37:13:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:37:13] S3DistributionType set as FullyReplicated[0m
[34m[19:37:13] 7620x13 matrix with 99060 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-06-02:19:37:13:INFO] Determined delimiter of CSV input is ','[0m
[34


2020-06-02 19:37:25 Completed - Training job completed
Training seconds: 54
Billable seconds: 54


## Deploy the model

In [13]:
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m4.xlarge',
                             endpoint_name = 'xgboost-bikerental')

---------------!

## Run Predictions

In [14]:
from sagemaker.predictor import csv_serializer, json_deserializer

predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = None

In [15]:
predictor.predict([[3,0,1,2,28.7,33.335,79,12.998,2011,7,7,3]])

b'3.77382278442'

## Summary
1. Ensure Training, Test and Validation data are in S3 Bucket
2. Select Algorithm Container Registry Path - Path varies by region
3. Configure Estimator for training - Specify Algorithm container, instance count, instance type, model output location
4. Specify algorithm specific hyper parameters
5. Train model
6. Deploy model - Specify instance count, instance type and endpoint name
7. Run Predictions

<h1>XGBoost Amazon SageMaker Prediction </h1>


In [17]:
# Acquire a realtime endpoint
endpoint_name = 'xgboost-bikerental'
predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

from sagemaker.predictor import csv_serializer, json_deserializer

predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = None

In [21]:
df_all = pd.read_csv('rob_bike_test.csv')
display(df_all.head())
print(df_all.shape)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,3,0
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,1
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,2
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,3
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,4


(6493, 14)


In [24]:
## check columns header
print(df_all.columns.tolist())
print(df_all.columns[1:])

['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek', 'hour']
Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek', 'hour'],
      dtype='object')


In [26]:
# Need to pass an array to the prediction
# can pass a numpy array or a list of values [[19,1],[20,1]]
arr_test = df_all[df_all.columns[1:]].values
print(arr_test)

print("\nType of data", type(arr_test))
print("\nShape of data", arr_test.shape)

[[ 1.  0.  1. ... 20.  3.  0.]
 [ 1.  0.  1. ... 20.  3.  1.]
 [ 1.  0.  1. ... 20.  3.  2.]
 ...
 [ 1.  0.  1. ... 31.  0. 21.]
 [ 1.  0.  1. ... 31.  0. 22.]
 [ 1.  0.  1. ... 31.  0. 23.]]

Type of data <class 'numpy.ndarray'>

Shape of data (6493, 13)


In [27]:
arr_test[:5]

array([[1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.13650e+01, 5.60000e+01, 2.60027e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.36350e+01, 5.60000e+01, 0.00000e+00, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 1.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.36350e+01, 5.60000e+01, 0.00000e+00, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 2.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.28800e+01, 5.60000e+01, 1.10014e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 3.00000e+00],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 1.06600e+01,
        1.28800e+01, 5.60000e+01, 1.10014e+01, 2.01100e+03, 1.00000e+00,
        2.00000e+01, 3.00000e+00, 4.00000e+00]])

In [28]:
## Prediction original test dataset
result = predictor.predict(arr_test[:2])
result

b'2.3321223259,1.90055930614'

In [29]:
# For large number of predictions, we can split the input data and
# Query the prediction service.
# array_split is convenient to specify how many splits are needed
predictions = []
for arr in np.array_split(arr_test,10):
    result = predictor.predict(arr)
    result = result.decode("utf-8")
    result = result.split(',')
    print (arr.shape)
    predictions += [float(r) for r in result]

(650, 13)
(650, 13)
(650, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)


In [30]:
## Inverse the log transofrmation to get actual bike rental count
np.expm1(predictions)

df_all['count'] = np.expm1(predictions)
df_all.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour,count
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,3,0,9.299778
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,1,5.689635
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,2,4.112099
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,3,1.822707
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,4,1.604999


In [31]:
df_all[['datetime','count']].to_csv('predicted_count_cloud.csv',index=False)

In [33]:
check = pd.read_csv("predicted_count_cloud.csv")
check.head()

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,9.299778
1,2011-01-20 01:00:00,5.689635
2,2011-01-20 02:00:00,4.112099
3,2011-01-20 03:00:00,1.822707
4,2011-01-20 04:00:00,1.604999
