## DeepAR Model - Predict Bike Rental with Dynamic Features

Note: This dataset is not a true time series, as there are a lot of gaps.

We have data only for the first 20 days of each month, and the model needs to predict the rentals for 
the remaining days of the month. The dataset consists of two years' data. DeepAR will shine with a true multiple-time-series dataset like the electricity example given below.

In [2]:
import time
import numpy as np
import pandas as pd
import json
import time
import datetime
import matplotlib.pyplot as plt

import boto3
import sagemaker
from sagemaker import get_execution_role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
# Toggle read later when the hyperparameters are built (it selects the
# `cardinality` setting).
with_categories = False

# Name prefix handed to the estimator; also used to build the checkpoint path.
base_job_name = 'deepar-biketrain-with-dynamic-feat'

In [4]:
### Import s3 bucket name as environment variable

import os

# Read the KEY=VALUE pairs stored in the local .env file. Plain file I/O is
# used instead of a `!cat` shell escape so the cell does not depend on IPython
# or an external binary, and a missing file raises FileNotFoundError directly.
with open('./.env') as env_file:
    env_vars = env_file.read().splitlines()

for line_number, var in enumerate(env_vars, start=1):
    var = var.strip()
    # Blank lines and '#' comments carry no assignment.
    if not var or var.startswith('#'):
        continue
    # Split on the FIRST '=' only: the value itself may contain '='.
    key, separator, value = var.partition('=')
    if not separator:
        # Report the line number rather than the content, which may be secret.
        raise ValueError(
            'Malformed entry on line {} of ./.env: expected KEY=VALUE'.format(line_number)
        )
    os.environ[key.strip()] = value.strip()

In [6]:
# Bucket name comes from the environment (BUCKET_NAME); the prefix is the
# "folder" inside that bucket under which this notebook keeps its objects.
bucket = os.environ['BUCKET_NAME']
prefix = 'deepar/bikerental'

# Scheme-less locations of the form "<bucket>/<prefix>/..."; the cells that
# consume them prepend "s3://" themselves.
s3_data_path = f"{bucket}/{prefix}/data_dynamic"
s3_output_path = f"{bucket}/{prefix}/output"

In [8]:
#s3_data_path, s3_output_path

In [12]:
# In S3, the object key plays the role of the file name.

def write_to_s3(filename, bucket, key):
    """Upload the local file `filename` to the S3 object `key` in `bucket`.

    The file is opened in binary mode and streamed with `upload_fileobj`;
    whatever that call returns is passed back to the caller.
    """
    with open(filename, 'rb') as source:
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        return target.upload_fileobj(source)

In [14]:
# Upload the training and test files to S3.
# The keys are built from `prefix` (rather than repeating 'deepar/bikerental')
# so the upload location cannot drift away from `s3_data_path` if the prefix
# is ever changed.

write_to_s3('train_dynamic_feat.json', bucket, f'{prefix}/data_dynamic/train/train_dynamic_feat.json')
write_to_s3('test_dynamic_feat.json', bucket, f'{prefix}/data_dynamic/test/test_dynamic_feat.json')

In [15]:
# Spot-instance settings for the training job.
# max_run / max_wait are time limits (seconds, per the SageMaker Estimator API).

use_spot_instances = True
max_run = 3600

# No training job exists yet, so the base name stands in for the job name.
job_name = base_job_name

if use_spot_instances:
    # Spot training gets a wait limit and a checkpoint location.
    max_wait = 3600
    checkpoint_s3_uri = f's3://{bucket}/{prefix}/checkpoints/{job_name}'
else:
    max_wait = None
    checkpoint_s3_uri = None



In [17]:
#print(checkpoint_s3_uri)

In [18]:
# Establish a Session with AWS

# `sess` is reused below for the region lookup, the estimator and the endpoint.
sess = sagemaker.Session()
# Execution role passed to the estimator and to the endpoint creation call.
role = get_execution_role()

In [19]:
# Display the session object to confirm it was created.
sess

<sagemaker.session.Session at 0x7f4db5596230>

In [21]:
#role

In [23]:
# Look up the DeepAR algorithm image for the region this session runs in.

container = sagemaker.image_uris.retrieve(
    framework="forecasting-deepar",
    region=sess.boto_region_name,
)

In [24]:
# Show the resolved image URI.
print(container)

522234722520.dkr.ecr.us-east-1.amazonaws.com/forecasting-deepar:1


In [25]:
# The time series contains hourly data.
freq = 'H'

# Forecast horizon: twelve days of hourly steps (12 days x 24 hours).
prediction_length = 12 * 24

# Context length is how far into the past the model looks when predicting.
# AWS recommends using the same value as the prediction length.
context_length = prediction_length

In [26]:
# Sanity check: number of hourly steps in the context window.
print(context_length)

288


In [27]:
# Configure the training job.
#
# NOTE: the keyword must be spelled exactly `output_path`. A misspelled keyword
# (e.g. `outut_path`) does not set the estimator's output location, so the
# model artifacts would not be written under `s3_output_path`.

estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path='s3://' + s3_output_path,   # where the model artifacts are written
    sagemaker_session=sess,
    base_job_name=base_job_name,
    # Spot-training settings come from the spot-instance configuration cell.
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri
)

In [28]:
# Display the time-related settings that go into the hyperparameters.
freq, context_length, prediction_length

('H', 288, 288)

In [29]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/deepar_hyperparameters.html

In [31]:
# DeepAR hyperparameters; every value is passed as a string.
hyperparameters = dict(
    time_freq=freq,
    epochs="400",
    early_stopping_patience="10",
    mini_batch_size="64",
    learning_rate="5E-4",
    context_length=str(context_length),
    prediction_length=str(prediction_length),
    cardinality='',
)

# Cardinality is only inferred ("auto") when categories are enabled.
if with_categories:
    hyperparameters["cardinality"] = "auto"

In [32]:
# Display the final hyperparameter dictionary before applying it.
hyperparameters

{'time_freq': 'H',
 'epochs': '400',
 'early_stopping_patience': '10',
 'mini_batch_size': '64',
 'learning_rate': '5E-4',
 'context_length': '288',
 'prediction_length': '288',
 'cardinality': ''}

In [33]:
# Apply every entry of the dictionary to the estimator as a keyword argument.
estimator.set_hyperparameters(**hyperparameters)

In [35]:
# Train and test data locations in S3.
# Each channel must point at its own prefix: the test file is uploaded under
# ".../data_dynamic/test/", so the "test" channel must not reuse "train/".

data_channels = {
    "train": "s3://{}/train/".format(s3_data_path),
    "test": "s3://{}/test/".format(s3_data_path)
}

In [37]:
#data_channels

In [38]:
# Fit the model, passing the train/test channel locations as inputs.
# The training job's log is streamed into this cell's output.

estimator.fit(inputs=data_channels)

INFO:sagemaker:Creating training-job with name: deepar-biketrain-with-dynamic-feat-2024-06-11-19-51-55-174


2024-06-11 19:51:55 Starting - Starting the training job...
2024-06-11 19:52:12 Starting - Preparing the instances for training...
2024-06-11 19:52:41 Downloading - Downloading input data...
2024-06-11 19:53:02 Downloading - Downloading the training image..................
2024-06-11 19:56:18 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34mRunning custom environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
[34m[06/11/2024 19:56:30 INFO 140205716232000] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'_kvstore': 'auto', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_tuning_objective_metric': '', 'cardinality': 'auto', 'dropout_rate': '0.10', 'early_stopping_patience': '', 'embedding_dimension': '10', 'learning_rate': '0.001', 'likelihood': 's

[34m[06/11/2024 19:57:33 INFO 140205716232000] Epoch[1] Batch[0] avg_epoch_loss=3.643927[0m
[34m[06/11/2024 19:57:33 INFO 140205716232000] #quality_metric: host=algo-1, epoch=1, batch=0 train loss <loss>=3.6439270973205566[0m
[34m[06/11/2024 19:57:43 INFO 140205716232000] Epoch[1] Batch[5] avg_epoch_loss=3.555964[0m
[34m[06/11/2024 19:57:43 INFO 140205716232000] #quality_metric: host=algo-1, epoch=1, batch=5 train loss <loss>=3.55596387386322[0m
[34m[06/11/2024 19:57:43 INFO 140205716232000] Epoch[1] Batch [5]#011Speed: 31.22 samples/sec#011loss=3.555964[0m
[34m[06/11/2024 19:57:51 INFO 140205716232000] processed a total of 639 examples[0m
[34m#metrics {"StartTime": 1718135840.2092967, "EndTime": 1718135871.8831902, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 31673.832654953003, "count": 1, "min": 31673.832654953003, "max": 31673.832654953003}}}[0m
[34m[06/11/2024 19:57:51 INFO 140205716232000] 

[32m#metrics {"StartTime": 1718136754.7023, "EndTime": 1718136767.49206, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"get_graph.time": {"sum": 12786.656141281128, "count": 1, "min": 12786.656141281128, "max": 12786.656141281128}}}[0m
[32m[06/11/2024 20:12:47 INFO 139746470463296] Number of GPUs being used: 0[0m
[32m[06/11/2024 20:12:50 INFO 139746470463296] #memory_usage::<model> = 279 mb[0m
[32m#metrics {"StartTime": 1718136767.492153, "EndTime": 1718136770.7370257, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"initialize.time": {"sum": 16034.56997871399, "count": 1, "min": 16034.56997871399, "max": 16034.56997871399}}}[0m
[32m[20:13:03] /opt/brazil-pkg-cache/packages/AIAlgorithmsMXNet/AIAlgorithmsMXNet-1.3.x_Cuda_11.1.x.406.0/AL2_x86_64/generic-flavor/src/src/operator/nn/mkldnn/mkldnn_base.cc:74: Allocate 10240 bytes with malloc directly[0m
[32m[06/11/2024 20:13:05 IN

[32m[06/11/2024 20:15:24 INFO 139746470463296] Epoch[4] Batch[0] avg_epoch_loss=3.655554[0m
[32m[06/11/2024 20:15:24 INFO 139746470463296] #quality_metric: host=algo-1, epoch=4, batch=0 train loss <loss>=3.6555542945861816[0m
[32m[06/11/2024 20:15:35 INFO 139746470463296] Epoch[4] Batch[5] avg_epoch_loss=3.529871[0m
[32m[06/11/2024 20:15:35 INFO 139746470463296] #quality_metric: host=algo-1, epoch=4, batch=5 train loss <loss>=3.5298712650934854[0m
[32m[06/11/2024 20:15:35 INFO 139746470463296] Epoch[4] Batch [5]#011Speed: 30.70 samples/sec#011loss=3.529871[0m
[32m[06/11/2024 20:15:43 INFO 139746470463296] processed a total of 601 examples[0m
[32m#metrics {"StartTime": 1718136911.2053545, "EndTime": 1718136943.3886774, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 32183.262825012207, "count": 1, "min": 32183.262825012207, "max": 32183.262825012207}}}[0m
[32m[06/11/2024 20:15:43 INFO 139746470463296

[32m[06/11/2024 20:17:51 INFO 139746470463296] processed a total of 619 examples[0m
[32m#metrics {"StartTime": 1718137040.4291031, "EndTime": 1718137071.9346907, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 31505.51986694336, "count": 1, "min": 31505.51986694336, "max": 31505.51986694336}}}[0m
[32m[06/11/2024 20:17:51 INFO 139746470463296] #throughput_metric: host=algo-1, train throughput=19.647283119530126 records/second[0m
[32m[06/11/2024 20:17:51 INFO 139746470463296] #progress_metric: host=algo-1, completed 2.25 % of epochs[0m
[32m[06/11/2024 20:17:51 INFO 139746470463296] #quality_metric: host=algo-1, epoch=8, train loss <loss>=3.1502495288848875[0m
[32m[06/11/2024 20:17:51 INFO 139746470463296] best epoch loss so far[0m
[32m[06/11/2024 20:17:52 INFO 139746470463296] Saved checkpoint to "/opt/ml/model/state_83716e4e-4e36-4ba8-b670-e2293dd6093a-0000.params"[0m
[32m#metrics {"StartTime": 17181

[32m[06/11/2024 20:20:40 INFO 139746470463296] processed a total of 632 examples[0m
[32m#metrics {"StartTime": 1718137208.5077076, "EndTime": 1718137240.6076372, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 32099.8694896698, "count": 1, "min": 32099.8694896698, "max": 32099.8694896698}}}[0m
[32m[06/11/2024 20:20:40 INFO 139746470463296] #throughput_metric: host=algo-1, train throughput=19.68848630107231 records/second[0m
[32m[06/11/2024 20:20:40 INFO 139746470463296] #progress_metric: host=algo-1, completed 3.5 % of epochs[0m
[32m[06/11/2024 20:20:40 INFO 139746470463296] #quality_metric: host=algo-1, epoch=13, train loss <loss>=2.935241937637329[0m
[32m[06/11/2024 20:20:40 INFO 139746470463296] loss did not improve[0m
[32m[06/11/2024 20:20:54 INFO 139746470463296] Epoch[14] Batch[0] avg_epoch_loss=2.918228[0m
[32m[06/11/2024 20:20:54 INFO 139746470463296] #quality_metric: host=algo-1, epoch=14, 

[32m[06/11/2024 20:23:36 INFO 139746470463296] Epoch[19] Batch[0] avg_epoch_loss=2.824089[0m
[32m[06/11/2024 20:23:36 INFO 139746470463296] #quality_metric: host=algo-1, epoch=19, batch=0 train loss <loss>=2.8240888118743896[0m
[32m[06/11/2024 20:23:47 INFO 139746470463296] Epoch[19] Batch[5] avg_epoch_loss=2.878533[0m
[32m[06/11/2024 20:23:47 INFO 139746470463296] #quality_metric: host=algo-1, epoch=19, batch=5 train loss <loss>=2.8785329262415567[0m
[32m[06/11/2024 20:23:47 INFO 139746470463296] Epoch[19] Batch [5]#011Speed: 30.70 samples/sec#011loss=2.878533[0m
[32m[06/11/2024 20:23:55 INFO 139746470463296] processed a total of 602 examples[0m
[32m#metrics {"StartTime": 1718137403.1354418, "EndTime": 1718137435.5566168, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 32421.11325263977, "count": 1, "min": 32421.11325263977, "max": 32421.11325263977}}}[0m
[32m[06/11/2024 20:23:55 INFO 1397464704632

[32m[06/11/2024 20:26:10 INFO 139746470463296] processed a total of 637 examples[0m
[32m#metrics {"StartTime": 1718137538.1383197, "EndTime": 1718137570.6082559, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 32469.874143600464, "count": 1, "min": 32469.874143600464, "max": 32469.874143600464}}}[0m
[32m[06/11/2024 20:26:10 INFO 139746470463296] #throughput_metric: host=algo-1, train throughput=19.6181211066826 records/second[0m
[32m[06/11/2024 20:26:10 INFO 139746470463296] #progress_metric: host=algo-1, completed 6.0 % of epochs[0m
[32m[06/11/2024 20:26:10 INFO 139746470463296] #quality_metric: host=algo-1, epoch=23, train loss <loss>=2.798051381111145[0m
[32m[06/11/2024 20:26:10 INFO 139746470463296] loss did not improve[0m
[32m[06/11/2024 20:26:23 INFO 139746470463296] Epoch[24] Batch[0] avg_epoch_loss=2.841030[0m
[32m[06/11/2024 20:26:23 INFO 139746470463296] #quality_metric: host=algo-1, epoch

[32m[06/11/2024 20:28:38 INFO 139746470463296] Epoch[28] Batch[0] avg_epoch_loss=2.629813[0m
[32m[06/11/2024 20:28:38 INFO 139746470463296] #quality_metric: host=algo-1, epoch=28, batch=0 train loss <loss>=2.6298134326934814[0m
[32m[06/11/2024 20:28:48 INFO 139746470463296] Epoch[28] Batch[5] avg_epoch_loss=2.749303[0m
[32m[06/11/2024 20:28:48 INFO 139746470463296] #quality_metric: host=algo-1, epoch=28, batch=5 train loss <loss>=2.749303142229716[0m
[32m[06/11/2024 20:28:48 INFO 139746470463296] Epoch[28] Batch [5]#011Speed: 31.27 samples/sec#011loss=2.749303[0m
[32m[06/11/2024 20:28:59 INFO 139746470463296] Epoch[28] Batch[10] avg_epoch_loss=2.730471[0m
[32m[06/11/2024 20:28:59 INFO 139746470463296] #quality_metric: host=algo-1, epoch=28, batch=10 train loss <loss>=2.7078726291656494[0m
[32m[06/11/2024 20:28:59 INFO 139746470463296] Epoch[28] Batch [10]#011Speed: 29.39 samples/sec#011loss=2.707873[0m
[32m[06/11/2024 20:28:59 INFO 139746470463296] processed a total of

[32m[06/11/2024 20:31:43 INFO 139746470463296] Epoch[33] Batch[10] avg_epoch_loss=2.663760[0m
[32m[06/11/2024 20:31:43 INFO 139746470463296] #quality_metric: host=algo-1, epoch=33, batch=10 train loss <loss>=2.6385024547576905[0m
[32m[06/11/2024 20:31:43 INFO 139746470463296] Epoch[33] Batch [10]#011Speed: 28.86 samples/sec#011loss=2.638502[0m
[32m[06/11/2024 20:31:43 INFO 139746470463296] processed a total of 677 examples[0m
[32m#metrics {"StartTime": 1718137869.8538618, "EndTime": 1718137903.6050038, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 33750.55003166199, "count": 1, "min": 33750.55003166199, "max": 33750.55003166199}}}[0m
[32m[06/11/2024 20:31:43 INFO 139746470463296] #throughput_metric: host=algo-1, train throughput=20.058871848808327 records/second[0m
[32m[06/11/2024 20:31:43 INFO 139746470463296] #progress_metric: host=algo-1, completed 8.5 % of epochs[0m
[32m[06/11/2024 20:31:43 IN

[32m[06/11/2024 20:34:09 INFO 139746470463296] Epoch[38] Batch[0] avg_epoch_loss=2.686535[0m
[32m[06/11/2024 20:34:09 INFO 139746470463296] #quality_metric: host=algo-1, epoch=38, batch=0 train loss <loss>=2.6865346431732178[0m
[32m[06/11/2024 20:34:19 INFO 139746470463296] Epoch[38] Batch[5] avg_epoch_loss=2.707425[0m
[32m[06/11/2024 20:34:19 INFO 139746470463296] #quality_metric: host=algo-1, epoch=38, batch=5 train loss <loss>=2.707425316174825[0m
[32m[06/11/2024 20:34:19 INFO 139746470463296] Epoch[38] Batch [5]#011Speed: 32.04 samples/sec#011loss=2.707425[0m
[32m[06/11/2024 20:34:28 INFO 139746470463296] processed a total of 627 examples[0m
[32m#metrics {"StartTime": 1718138036.8139436, "EndTime": 1718138068.0113568, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 31196.917057037354, "count": 1, "min": 31196.917057037354, "max": 31196.917057037354}}}[0m
[32m[06/11/2024 20:34:28 INFO 13974647046

[32m[06/11/2024 20:37:08 INFO 139746470463296] Epoch[43] Batch[5] avg_epoch_loss=2.708628[0m
[32m[06/11/2024 20:37:08 INFO 139746470463296] #quality_metric: host=algo-1, epoch=43, batch=5 train loss <loss>=2.7086277405420938[0m
[32m[06/11/2024 20:37:08 INFO 139746470463296] Epoch[43] Batch [5]#011Speed: 29.56 samples/sec#011loss=2.708628[0m
[32m[06/11/2024 20:37:18 INFO 139746470463296] Epoch[43] Batch[10] avg_epoch_loss=2.747630[0m
[32m[06/11/2024 20:37:18 INFO 139746470463296] #quality_metric: host=algo-1, epoch=43, batch=10 train loss <loss>=2.794431734085083[0m
[32m[06/11/2024 20:37:18 INFO 139746470463296] Epoch[43] Batch [10]#011Speed: 29.40 samples/sec#011loss=2.794432[0m
[32m[06/11/2024 20:37:18 INFO 139746470463296] processed a total of 656 examples[0m
[32m#metrics {"StartTime": 1718138203.940493, "EndTime": 1718138238.904068, "Dimensions": {"Algorithm": "AWS/DeepAR", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 34962.9299640655

In [39]:
# Replace the placeholder (base_job_name) with the name of the training job
# the estimator ran most recently.
job_name = estimator.latest_training_job.name

In [40]:
# Show the training job name; the endpoint is created from this job.
print('job name: {}'.format(job_name))

job name: deepar-biketrain-with-dynamic-feat-2024-06-11-19-51-55-174


In [41]:
# Create an endpoint for real-time predictions from the finished training job.

endpoint_name = sess.endpoint_from_job(
    job_name=job_name,
    image_uri=container,
    role=role,
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: deepar-biketrain-with-dynamic-feat-2024-06-11-19-51-55-174
INFO:sagemaker:Creating endpoint-config with name deepar-biketrain-with-dynamic-feat-2024-06-11-19-51-55-174
INFO:sagemaker:Creating endpoint with name deepar-biketrain-with-dynamic-feat-2024-06-11-19-51-55-174


-----------!