In [1]:
import boto3
import s3fs
import sagemaker
from sagemaker import get_execution_role
import time
import pandas as pd
import matplotlib.pyplot as plt
from helper import *
from datetime import timedelta

  from pandas.util.testing import assert_frame_equal


In [2]:
# session, role, bucket
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()             # IAM role to use by SageMaker

s3_bucket = sagemaker_session.default_bucket()
s3_prefix = 'MLEND-Capstone-Project'    

region = sagemaker_session.boto_region_name

s3_data_path = "s3://{}/{}/data_hyper_param".format(s3_bucket, s3_prefix)
s3_output_path = "s3://{}/{}/output_hyper_param".format(s3_bucket, s3_prefix)

image_name = sagemaker.amazon.amazon_estimator.get_image_uri(region, "forecasting-deepar", "latest")

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [3]:
stock_hyper_param_data = pd.read_csv('stock_indicator_data.csv',parse_dates=True, index_col=[0,1])
get_target_distribution(stock_hyper_param_data)

-1:  363122
 0:  357809
 1:  456392


In [4]:
tickers = get_sp500_tickers()

freq = 'D'

# we predict for 1 day
prediction_length = 1

# we use 50 days as context length, this is the number of state updates accomplished before making predictions
context_length = 50

end_training = pd.Timestamp('2018-12-31', freq=freq)

timeseries = []
    
for ID,ticker in list(enumerate(tickers)):
    ticker = stock_hyper_param_data.loc[(slice(None), ticker), :]
    if ticker.index[0][0]<end_training:
        timeseries.append(ticker)

In [5]:
tickers = []
for ts in timeseries:
    tickers.append(ts.index[1][1])
cat = {}
for ticker in enumerate(tickers):
    cat[ticker[1]] = ticker[0]


In [6]:
dynamic_feat = ['Adj Close','Volume','PC1','PC2','PC3','PC4','PC5','PC6']
training_data = [
    {
        "start": str(ts.index[0][0]),
        "target": ts['target'][ts.index[0][0]:end_training].tolist(), # We use -1, because pandas indexing includes the upper bound 
        "cat" : cat[ts.index[1][1]],
        "dynamic_feat": ts[dynamic_feat][ts.index[0][0]:end_training].values.T.tolist()
        
    }
    for ts in timeseries
]
print(len(training_data))

491


In [8]:
num_test_windows = 10

test_data = [
    {
        "start": str(ts.index[0][0]),
        "target": ts['target'][ts.index[0][0]:end_training + timedelta(days=(2*k * prediction_length))].tolist(),
        "cat" : cat[ts.index[1][1]], # input stock ticker id
        "dynamic_feat": ts[dynamic_feat][ts.index[0][0]:end_training + timedelta(days=(2*k * prediction_length))].values.T.tolist()
    }
    for k in range(1, num_test_windows + 1) 
    for ts in timeseries
]
print(len(test_data))



4910


In [9]:
%%time
write_json_dataset("train_hyper_param.json", training_data)
write_json_dataset("test_hyper_param.json", test_data)

CPU times: user 1min 8s, sys: 3.72 s, total: 1min 12s
Wall time: 1min 15s


In [10]:
copy_to_s3("train_hyper_param.json", s3_data_path + "/train/train.json", s3_bucket)
copy_to_s3("train_hyper_param.json", s3_data_path + "/test/test.json", s3_bucket)

File s3://sagemaker-us-east-2-017500148529/s3://sagemaker-us-east-2-017500148529/MLEND-Capstone-Project/data_hyper_param/train/train.json already exists.
Set override to upload anyway.

File s3://sagemaker-us-east-2-017500148529/s3://sagemaker-us-east-2-017500148529/MLEND-Capstone-Project/data_hyper_param/test/test.json already exists.
Set override to upload anyway.



In [23]:
s3filesystem = s3fs.S3FileSystem()
with s3filesystem.open(s3_data_path + "/train/train.json", 'rb') as fp:
    print(fp.readline().decode("utf-8")[:10000] + "...")

{"start": "2010-03-16 00:00:00", "target": [1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 0, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, -1, -1, 1, 1, 0, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 0, 0, 0, -1,

In [12]:
estimator_hyper_param = sagemaker.estimator.Estimator(
    sagemaker_session=sagemaker_session,
    image_name=image_name,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.2xlarge',
    base_job_name='MLEND-Capstone-Project',
    output_path=s3_output_path
)




In [12]:
hyperparameters = {
    "time_freq": freq,
    "epochs": "100",
    "early_stopping_patience": "40",
    "mini_batch_size": "64",
    "learning_rate": "5E-4",
    "context_length": str(context_length),
    "prediction_length": str(prediction_length),
    "num_cells": 50,
    "num_layers":2,
    "dropout_rate":0.1,
    "num_dynamic_feat": 'auto',
}
estimator_hyper_param.set_hyperparameters(**hyperparameters)



In [44]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

estimator_hyper_param_tuner = HyperparameterTuner(estimator = estimator_hyper_param, # The estimator object to use as the basis for the training jobs.
                                               objective_metric_name = 'test:RMSE', # The metric used to compare trained models.
                                               objective_type = 'Minimize', # Whether we wish to minimize or maximize the metric.
                                               max_jobs = 30, # The total number of models to train
                                               max_parallel_jobs = 3, # The number of models to train in parallel
                                               hyperparameter_ranges = {
                                                    'epochs': IntegerParameter(50, 200),
                                                    'context_length': IntegerParameter(10, 100),
                                                    'mini_batch_size': IntegerParameter(32, 256),
                                                    'learning_rate': ContinuousParameter("1E-5", "1E-3"),
                                                    'num_cells': IntegerParameter(30, 200),
                                                    'dropout_rate': ContinuousParameter(0,0.2),
                                                    'num_layers': IntegerParameter(1,3)
                                               })

In [46]:
%%time
data_channels = {
    "train": s3_data_path + "/train/train.json",
    "test": s3_data_path + "/test/test.json"
}

#estimator_hyper_param_tuner.fit(inputs=data_channels, wait=True)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.06 µs


In [13]:
estimator_hyper_param = sagemaker.estimator.Estimator(
    sagemaker_session=sagemaker_session,
    image_name=image_name,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.2xlarge',
    base_job_name='MLEND-Capstone-Project',
    output_path=s3_output_path
)



In [15]:
hyperparameters = {
    "time_freq": freq,
    "epochs": "96",
    "early_stopping_patience": "40",
    "mini_batch_size": "212",
    "learning_rate": "4.177378470748047e-05",
    "context_length": "94",
    "prediction_length": "1",
    "num_cells": 115,
    "num_layers":3,
    "dropout_rate":0.04030803446099004,
    "num_dynamic_feat": 'auto',
}
estimator_hyper_param.set_hyperparameters(**hyperparameters)



In [16]:
%%time
data_channels = {
    "train": s3_data_path + "/train/train.json",
    "test": s3_data_path + "/test/test.json"
}
estimator_hyper_param.fit(inputs=data_channels, wait=True)



2020-06-09 01:43:48 Starting - Starting the training job...
2020-06-09 01:43:50 Starting - Launching requested ML instances.........
2020-06-09 01:45:22 Starting - Preparing the instances for training......
2020-06-09 01:46:25 Downloading - Downloading input data...
2020-06-09 01:47:08 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[06/09/2020 01:47:10 INFO 139663651043136] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'num_dynamic_feat': u'auto', u'dropout_rate': u'0.10', u'mini_batch_size': u'128', u'test_quantiles': u'[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'num_eval_samples': u'100', u'learning_rate': u'0.001', u'num_cells': u'40', u'num_layers': u'2', u'embedding_dimension': u'10', u'_kvstore': u'auto', u'_num_kv_servers': u'auto', u'cardinality': u'auto', u'likelihood': u'student-t', u'early_sto

In [16]:
#hyper_param_predictor = estimator_hyper_param.deploy(
#    initial_instance_count=1,
#    instance_type='ml.m4.xlarge',
#    content_type="application/json")

In [23]:
hyper_param_predictor = sagemaker.predictor.RealTimePredictor(endpoint='hyper-param-endpoint')

In [24]:
get_stock_prediction('F', '2019-01-02',stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['F'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Volume,PC1,PC2,PC3,PC4,PC5,PC6,target,prediction
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-01-02,F,0.00187,0.055249,-1.929559,0.396906,-1.196386,-0.85156,-0.963647,0.654602,1,1


In [51]:
date_index = pd.read_csv('test_date_index.csv')
date_index = date_index.values.reshape(252).tolist()

#acc = {}

In [None]:
acc['PEP'] = get_prediction_accuracy('PEP', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['PEP'])
acc['IBM'] = get_prediction_accuracy('IBM', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['IBM'])
acc['PXD'] = get_prediction_accuracy('PXD', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['PXD'])
acc['VLO'] = get_prediction_accuracy('VLO', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['VLO'])
acc['KMX'] = get_prediction_accuracy('KMX', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['KMX'])
acc['YUM'] = get_prediction_accuracy('YUM', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['YUM'])
acc['AIG'] = get_prediction_accuracy('AIG', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['AIG'])


In [84]:
#acc['A'] = get_prediction_accuracy('A', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['A'])
#acc['F'] = get_prediction_accuracy('F', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['F'])
#acc['GE'] = get_prediction_accuracy('GE', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['GE'])
#acc['DAL'] = get_prediction_accuracy('DAL', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['DAL'])
#acc['UAL'] = get_prediction_accuracy('UAL', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['UAL'])
#acc['ABC'] = get_prediction_accuracy('ABC', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['ABC'])
#acc['CAT'] = get_prediction_accuracy('CAT', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['CAT'])
#acc['DE'] = get_prediction_accuracy('DE', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['DE'])
#acc['D'] = get_prediction_accuracy('D', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['D'])
acc['FB'] = get_prediction_accuracy('FB', date_index, stock_hyper_param_data,hyper_param_predictor, dynamic_feat, cat['FB'])



In [88]:
np.array(list(acc.values())).mean()

0.6693121693121693

In [87]:
acc

{'A': 0.6388888888888888,
 'F': 0.6785714285714286,
 'GE': 0.5595238095238095,
 'DAL': 0.7063492063492064,
 'UAL': 0.7380952380952381,
 'ABC': 0.6428571428571429,
 'CAT': 0.6626984126984127,
 'DE': 0.6309523809523809,
 'D': 0.7658730158730159}