### DeepAR Model - Bike Rental Training

Note: This data is not a true time series, because it contains many gaps.

We have data only for the first 20 days of each month, and the model needs to predict the rentals for the remaining days of the month. The dataset covers two years of data. DeepAR shines with a true multiple-time-series dataset.

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import time
import datetime

import boto3
import sagemaker
from sagemaker import get_execution_role

In [5]:
### Import s3 bucket name as environment variable

import os

# Read ./.env as KEY=VALUE lines and export each pair into os.environ.
# - Blank lines, '#' comment lines and lines without '=' are skipped instead
#   of raising on tuple unpacking.
# - Only the first '=' separates key from value, so values that themselves
#   contain '=' are kept intact.
with open('./.env') as env_file:
    for raw_line in env_file:
        entry = raw_line.strip()
        if not entry or entry.startswith('#') or '=' not in entry:
            continue
        key, value = entry.split('=', 1)
        os.environ[key] = value

In [6]:
# Use a different job name for each modelling choice so that training jobs
# for the two variants are easy to tell apart.

with_categories = False
base_job_name = (
    'deepar-biketrain-with-categories'
    if with_categories
    else 'deepar-biketrain-no-categories'
)

In [7]:
# The bucket name is read from the environment; everything for this project
# is stored under `prefix` inside that bucket.

bucket = os.environ['BUCKET_NAME']
prefix = 'deepar/bikerental'

# Each variant gets its own data folder, which allows multiple training and
# test files for model development and testing.
# Note: these paths carry no "s3://" scheme; it is prepended where they are used.

s3_data_path = (
    f"{bucket}/{prefix}/data_with_categories"
    if with_categories
    else f"{bucket}/{prefix}/data"
)

s3_output_path = f"{bucket}/{prefix}/output"


In [9]:
#s3_data_path, s3_output_path

In [10]:
# Helper for uploading a local file to an S3 bucket

def write_to_s3(filename, bucket, key):
    """Upload a local file to S3.

    Args:
        filename: Path of the local file; it is opened in binary mode.
        bucket: Name of the destination S3 bucket.
        key: Object key to write inside the bucket.

    Returns:
        The return value of ``upload_fileobj`` for the target object.
    """
    with open(filename, 'rb') as stream:
        s3 = boto3.Session().resource('s3')
        destination = s3.Bucket(bucket).Object(key)
        return destination.upload_fileobj(stream)

In [11]:
# Upload the training and test files to S3.
# Object keys are built from `prefix` rather than a repeated literal, so the
# upload location always agrees with the data paths derived from the same
# `prefix` when the training channels are configured.

if with_categories:
    write_to_s3('train_with_categories.json', bucket, f'{prefix}/data_with_categories/train/train_with_categories.json')
    write_to_s3('test_with_categories.json', bucket, f'{prefix}/data_with_categories/test/test_with_categories.json')
else:
    write_to_s3('train.json', bucket, f'{prefix}/data/train/train.json')
    write_to_s3('test.json', bucket, f'{prefix}/data/test/test.json')

In [13]:
# Use spot instances for training

use_spot_instances = True
max_run = 3600

# max_wait is only meaningful for spot training and must not be smaller than
# max_run, so it is derived from max_run instead of repeating the literal.
max_wait = max_run if use_spot_instances else None

job_name = base_job_name

# Spot jobs can be interrupted; give them a checkpoint location so training
# state can be saved. On-demand jobs get no checkpoint location.
checkpoint_s3_uri = (
    f's3://{bucket}/{prefix}/checkpoints/{job_name}'
    if use_spot_instances
    else None
)

#print(f'Checkpoint uri: {checkpoint_s3_uri}')

In [39]:
# Establish a SageMaker session and look up the notebook's execution role.
# `sess` is used later for the region lookup, the Estimator and the endpoint;
# `role` is passed to the Estimator and to the endpoint creation call.

sess = sagemaker.Session()
role = get_execution_role()

In [16]:
# This role contains the permissions needed to train and deploy models.
# The SageMaker service is trusted to assume this role.

#print(role)

In [21]:
# SDK v2 uses image_uris.retrieve to look up the algorithm's container image

container = sagemaker.image_uris.retrieve(
    framework="forecasting-deepar",
    region=sess.boto_region_name,
)

print(f'Using DeepAR container {container}')

Using DeepAR container 522234722520.dkr.ecr.us-east-1.amazonaws.com/forecasting-deepar:1


In [23]:
# The data consists of hourly observations.
freq = "H"

# Forecast horizon: 12 days of data, expressed in hours.
prediction_length = 12 * 24

# AWS recommends a context length equal to the prediction length, so the model
# looks back over the same span that it has to predict.
context_length = prediction_length


In [40]:
# Configure the training job: container image, IAM role, hardware, output
# location, and the spot-training settings chosen earlier.

estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=f's3://{s3_output_path}',
    sagemaker_session=sess,
    base_job_name=job_name,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri,
)

In [41]:
# Display the frequency and window lengths that go into the hyperparameters.
freq, context_length, prediction_length

('H', 288, 288)

In [42]:
# DeepAR hyperparameters; every value is passed as a string.
# Cardinality is inferred from the data ("auto") only when categories are in
# use; otherwise it is sent as an empty string.

hyperparameters = dict(
    time_freq=freq,
    epochs="400",
    early_stopping_patience="10",
    mini_batch_size="64",
    learning_rate="5E-4",
    context_length=str(context_length),
    prediction_length=str(prediction_length),
    cardinality="auto" if with_categories else '',
)

In [43]:
# Display the hyperparameter dict before it is attached to the estimator.
hyperparameters

{'time_freq': 'H',
 'epochs': '400',
 'early_stopping_patience': '10',
 'mini_batch_size': '64',
 'learning_rate': '5E-4',
 'context_length': '288',
 'prediction_length': '288',
 'cardinality': ''}

In [44]:
# Attach the hyperparameters to the estimator, one keyword argument per dict entry.
estimator.set_hyperparameters(**hyperparameters)

In [45]:
# Data channels for the train and test files.
# Both URIs end in "/" so each one names exactly its own folder; without the
# trailing slash the URI acts as a key-name prefix and would also match
# sibling keys that merely start with "test".

data_channels = {
    "train": f"s3://{s3_data_path}/train/",
    "test": f"s3://{s3_data_path}/test/",
}

In [32]:
#data_channels

In [46]:
# Fit the model: start the training job using the "train" and "test" channels
# defined in `data_channels`. The job's log is shown in the cell output.

estimator.fit(inputs=data_channels)

INFO:sagemaker:Creating training-job with name: deepar-biketrain-no-categories-2024-06-06-13-20-47-915


2024-06-06 13:20:48 Starting - Starting the training job...
2024-06-06 13:21:03 Starting - Preparing the instances for training...
2024-06-06 13:21:36 Downloading - Downloading input data...
2024-06-06 13:21:56 Downloading - Downloading the training image.........
2024-06-06 13:23:42 Training - Training image download completed. Training in progress...[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34mRunning custom environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
[34m[06/06/2024 13:23:54 INFO 139771206125376] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'_kvstore': 'auto', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_tuning_objective_metric': '', 'cardinality': 'auto', 'dropout_rate': '0.10', 'early_stopping_patience': '', 'embedding_dimension': '10', 'learning_rate': '0.001', 'likelihood': 'student-t

[34m[06/06/2024 13:24:46 INFO 139771206125376] Epoch[1] Batch[5] avg_epoch_loss=3.497752[0m
[34m[06/06/2024 13:24:46 INFO 139771206125376] #quality_metric: host=algo-1, epoch=1, batch=5 train loss <loss>=3.4977524280548096[0m
[34m[06/06/2024 13:24:46 INFO 139771206125376] Epoch[1] Batch [5]#011Speed: 37.83 samples/sec#011loss=3.497752[0m
[34m[06/06/2024 13:24:52 INFO 139771206125376] processed a total of 617 examples[0m
[34m#metrics {"StartTime": 1717680274.1669016, "EndTime": 1717680292.9312084, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 18764.24527168274, "count": 1, "min": 18764.24527168274, "max": 18764.24527168274}}}[0m
[34m[06/06/2024 13:24:52 INFO 139771206125376] #throughput_metric: host=algo-1, train throughput=32.88120734561044 records/second[0m
[34m[06/06/2024 13:24:52 INFO 139771206125376] #progress_metric: host=algo-1, completed 0.5 % of epochs[0m
[34m[06/06/2024 13:24:52 INFO 1397

[34m[06/06/2024 13:26:25 INFO 139771206125376] Epoch[6] Batch[5] avg_epoch_loss=3.043598[0m
[34m[06/06/2024 13:26:25 INFO 139771206125376] #quality_metric: host=algo-1, epoch=6, batch=5 train loss <loss>=3.043597936630249[0m
[34m[06/06/2024 13:26:25 INFO 139771206125376] Epoch[6] Batch [5]#011Speed: 37.70 samples/sec#011loss=3.043598[0m
[34m[06/06/2024 13:26:34 INFO 139771206125376] Epoch[6] Batch[10] avg_epoch_loss=2.923261[0m
[34m[06/06/2024 13:26:34 INFO 139771206125376] #quality_metric: host=algo-1, epoch=6, batch=10 train loss <loss>=2.7788577556610106[0m
[34m[06/06/2024 13:26:34 INFO 139771206125376] Epoch[6] Batch [10]#011Speed: 37.31 samples/sec#011loss=2.778858[0m
[34m[06/06/2024 13:26:34 INFO 139771206125376] processed a total of 650 examples[0m
[34m#metrics {"StartTime": 1717680373.7430818, "EndTime": 1717680394.2405653, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 20497.43127822876, "

[34m[06/06/2024 13:28:06 INFO 139771206125376] Epoch[11] Batch[5] avg_epoch_loss=2.812002[0m
[34m[06/06/2024 13:28:06 INFO 139771206125376] #quality_metric: host=algo-1, epoch=11, batch=5 train loss <loss>=2.812001665433248[0m
[34m[06/06/2024 13:28:06 INFO 139771206125376] Epoch[11] Batch [5]#011Speed: 34.96 samples/sec#011loss=2.812002[0m
[34m[06/06/2024 13:28:15 INFO 139771206125376] Epoch[11] Batch[10] avg_epoch_loss=2.873223[0m
[34m[06/06/2024 13:28:15 INFO 139771206125376] #quality_metric: host=algo-1, epoch=11, batch=10 train loss <loss>=2.94668869972229[0m
[34m[06/06/2024 13:28:15 INFO 139771206125376] Epoch[11] Batch [10]#011Speed: 37.13 samples/sec#011loss=2.946689[0m
[34m[06/06/2024 13:28:15 INFO 139771206125376] processed a total of 644 examples[0m
[34m#metrics {"StartTime": 1717680473.9963236, "EndTime": 1717680495.1907887, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 21194.4122314453

[34m[06/06/2024 13:29:46 INFO 139771206125376] Epoch[16] Batch[5] avg_epoch_loss=2.781484[0m
[34m[06/06/2024 13:29:46 INFO 139771206125376] #quality_metric: host=algo-1, epoch=16, batch=5 train loss <loss>=2.7814835707346597[0m
[34m[06/06/2024 13:29:46 INFO 139771206125376] Epoch[16] Batch [5]#011Speed: 37.85 samples/sec#011loss=2.781484[0m
[34m[06/06/2024 13:29:53 INFO 139771206125376] processed a total of 617 examples[0m
[34m#metrics {"StartTime": 1717680574.4997854, "EndTime": 1717680593.227391, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 18727.54955291748, "count": 1, "min": 18727.54955291748, "max": 18727.54955291748}}}[0m
[34m[06/06/2024 13:29:53 INFO 139771206125376] #throughput_metric: host=algo-1, train throughput=32.945942966930666 records/second[0m
[34m[06/06/2024 13:29:53 INFO 139771206125376] #progress_metric: host=algo-1, completed 4.25 % of epochs[0m
[34m[06/06/2024 13:29:53 INFO 

[34m[06/06/2024 13:31:23 INFO 139771206125376] Epoch[21] Batch[5] avg_epoch_loss=2.703329[0m
[34m[06/06/2024 13:31:23 INFO 139771206125376] #quality_metric: host=algo-1, epoch=21, batch=5 train loss <loss>=2.70332940419515[0m
[34m[06/06/2024 13:31:23 INFO 139771206125376] Epoch[21] Batch [5]#011Speed: 37.70 samples/sec#011loss=2.703329[0m
[34m[06/06/2024 13:31:32 INFO 139771206125376] Epoch[21] Batch[10] avg_epoch_loss=2.776821[0m
[34m[06/06/2024 13:31:32 INFO 139771206125376] #quality_metric: host=algo-1, epoch=21, batch=10 train loss <loss>=2.8650105953216554[0m
[34m[06/06/2024 13:31:32 INFO 139771206125376] Epoch[21] Batch [10]#011Speed: 36.88 samples/sec#011loss=2.865011[0m
[34m[06/06/2024 13:31:32 INFO 139771206125376] processed a total of 669 examples[0m
[34m#metrics {"StartTime": 1717680672.0763466, "EndTime": 1717680692.6257646, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 20549.362182617

[34m[06/06/2024 13:33:04 INFO 139771206125376] Epoch[26] Batch[5] avg_epoch_loss=2.628748[0m
[34m[06/06/2024 13:33:04 INFO 139771206125376] #quality_metric: host=algo-1, epoch=26, batch=5 train loss <loss>=2.628748138745626[0m
[34m[06/06/2024 13:33:04 INFO 139771206125376] Epoch[26] Batch [5]#011Speed: 35.97 samples/sec#011loss=2.628748[0m
[34m[06/06/2024 13:33:12 INFO 139771206125376] Epoch[26] Batch[10] avg_epoch_loss=2.614921[0m
[34m[06/06/2024 13:33:12 INFO 139771206125376] #quality_metric: host=algo-1, epoch=26, batch=10 train loss <loss>=2.598328161239624[0m
[34m[06/06/2024 13:33:12 INFO 139771206125376] Epoch[26] Batch [10]#011Speed: 37.47 samples/sec#011loss=2.598328[0m
[34m[06/06/2024 13:33:12 INFO 139771206125376] processed a total of 651 examples[0m
[34m#metrics {"StartTime": 1717680771.7960544, "EndTime": 1717680792.6719594, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 20875.478982925

[34m[06/06/2024 13:34:45 INFO 139771206125376] Epoch[31] Batch[5] avg_epoch_loss=2.643259[0m
[34m[06/06/2024 13:34:45 INFO 139771206125376] #quality_metric: host=algo-1, epoch=31, batch=5 train loss <loss>=2.643259366353353[0m
[34m[06/06/2024 13:34:45 INFO 139771206125376] Epoch[31] Batch [5]#011Speed: 37.70 samples/sec#011loss=2.643259[0m
[34m[06/06/2024 13:34:54 INFO 139771206125376] Epoch[31] Batch[10] avg_epoch_loss=2.620497[0m
[34m[06/06/2024 13:34:54 INFO 139771206125376] #quality_metric: host=algo-1, epoch=31, batch=10 train loss <loss>=2.5931821346282957[0m
[34m[06/06/2024 13:34:54 INFO 139771206125376] Epoch[31] Batch [10]#011Speed: 37.16 samples/sec#011loss=2.593182[0m
[34m[06/06/2024 13:34:54 INFO 139771206125376] processed a total of 653 examples[0m
[34m#metrics {"StartTime": 1717680873.6475687, "EndTime": 1717680894.1842113, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 20536.36622428

[34m[06/06/2024 13:36:25 INFO 139771206125376] Epoch[36] Batch[5] avg_epoch_loss=2.637874[0m
[34m[06/06/2024 13:36:25 INFO 139771206125376] #quality_metric: host=algo-1, epoch=36, batch=5 train loss <loss>=2.637873967488607[0m
[34m[06/06/2024 13:36:25 INFO 139771206125376] Epoch[36] Batch [5]#011Speed: 37.73 samples/sec#011loss=2.637874[0m
[34m[06/06/2024 13:36:32 INFO 139771206125376] processed a total of 616 examples[0m
[34m#metrics {"StartTime": 1717680973.5530176, "EndTime": 1717680992.311981, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 18758.907079696655, "count": 1, "min": 18758.907079696655, "max": 18758.907079696655}}}[0m
[34m[06/06/2024 13:36:32 INFO 139771206125376] #throughput_metric: host=algo-1, train throughput=32.83756409396397 records/second[0m
[34m[06/06/2024 13:36:32 INFO 139771206125376] #progress_metric: host=algo-1, completed 9.25 % of epochs[0m
[34m[06/06/2024 13:36:32 INFO

[34m[06/06/2024 13:38:21 INFO 139771206125376] Epoch[42] Batch[5] avg_epoch_loss=2.610039[0m
[34m[06/06/2024 13:38:21 INFO 139771206125376] #quality_metric: host=algo-1, epoch=42, batch=5 train loss <loss>=2.6100385586420694[0m
[34m[06/06/2024 13:38:21 INFO 139771206125376] Epoch[42] Batch [5]#011Speed: 37.79 samples/sec#011loss=2.610039[0m
[34m[06/06/2024 13:38:28 INFO 139771206125376] processed a total of 621 examples[0m
[34m#metrics {"StartTime": 1717681089.744831, "EndTime": 1717681108.52467, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 18779.327154159546, "count": 1, "min": 18779.327154159546, "max": 18779.327154159546}}}[0m
[34m[06/06/2024 13:38:28 INFO 139771206125376] #throughput_metric: host=algo-1, train throughput=33.0680964699801 records/second[0m
[34m[06/06/2024 13:38:28 INFO 139771206125376] #progress_metric: host=algo-1, completed 10.75 % of epochs[0m
[34m[06/06/2024 13:38:28 INFO 


2024-06-06 13:39:55 Uploading - Uploading generated training model
2024-06-06 13:40:08 Completed - Training job completed
Training seconds: 1112
Billable seconds: 500
Managed Spot Training savings: 55.0%


In [47]:
# Capture the name of the training job that was just run.
# NOTE(review): this rebinds `job_name`, which earlier held the base job name
# used for the estimator and the checkpoint URI. Re-running those earlier
# cells after this one would pick up the full job name instead; consider a
# separate variable for the trained job's name.
job_name = estimator.latest_training_job.name

#### Create endpoint using jobname

In [48]:
# Show which training job the endpoint will be created from.
print(f'job name: {job_name}')

job name: deepar-biketrain-no-categories-2024-06-06-13-20-47-915


In [49]:
# Create an endpoint for real-time predictions from the finished training job.
# The call is given the training job name, the DeepAR container image and the
# execution role; its return value is kept as `endpoint_name`.
# NOTE(review): the endpoint instance type is chosen from `use_spot_instances`,
# which is a training-time setting. Confirm this coupling is intended.

endpoint_name = sess.endpoint_from_job(
    job_name = job_name,
    initial_instance_count = 1,
    instance_type = 'ml.m5.large' if use_spot_instances else 'ml.m5.xlarge',
    image_uri = container, 
    role = role
)

INFO:sagemaker:Creating model with name: deepar-biketrain-no-categories-2024-06-06-13-20-47-915
INFO:sagemaker:Creating endpoint-config with name deepar-biketrain-no-categories-2024-06-06-13-20-47-915
INFO:sagemaker:Creating endpoint with name deepar-biketrain-no-categories-2024-06-06-13-20-47-915


----------!