In [2]:
import boto3
import s3fs
import sagemaker
from sagemaker import get_execution_role
import time
import pandas as pd
import matplotlib.pyplot as plt
from helper import *


In [3]:
# SageMaker session, IAM execution role, and the S3 locations used by this notebook.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()  # IAM role SageMaker runs under

s3_bucket = sagemaker_session.default_bucket()
s3_prefix = 'MLEND-Capstone-Project'
region = sagemaker_session.boto_region_name

# All project artifacts live under s3://<bucket>/<prefix>/.
s3_root = "s3://" + s3_bucket + "/" + s3_prefix
s3_data_path = s3_root + "/data"        # training / test JSON files
s3_output_path = s3_root + "/output"    # passed to the estimator as output_path

# Image URI of the built-in "forecasting-deepar" algorithm for this region.
image_name = sagemaker.amazon.amazon_estimator.get_image_uri(
    region, "forecasting-deepar", "latest"
)

In [4]:
# Load the preprocessed stock data. The first two CSV columns become a
# two-level index; later cells read level 0 as the date and level 1 as the ticker.
stock_data_preprocessed = pd.read_csv(
    'stock_data_preprocessed.csv',
    index_col=[0, 1],
    parse_dates=True,
)

# Helper from `helper`; the recorded output lists a row count per target value (-1, 0, 1).
get_target_distribution(stock_data_preprocessed)

-1:  1109910
 0:  1087758
 1:  1400007


In [48]:
tickers = get_sp500_tickers()

# Series frequency handed to DeepAR as "time_freq" ('D' = daily).
# NOTE(review): the recorded outputs show trading days only (e.g. 2019-01-04 is
# followed by 2019-01-07), while the target lists are built without gaps for the
# missing calendar days -- confirm a daily frequency is the intended alignment.
freq = 'D'

# We predict one step (one day) ahead.
prediction_length = 1

# We use 50 days as context length: the number of state updates the model
# performs before it starts predicting.
context_length = 50

# Last date that belongs to the training split. The frequency is attached so the
# timestamp carries its step size; keep it unless every use of `end_training`
# does explicit offset arithmetic.
end_training = pd.Timestamp('2018-12-31', freq=freq)

# Collect one DataFrame per ticker, keeping only tickers whose history starts
# before the training cutoff.
timeseries = []
for symbol in tickers:
    ticker_frame = stock_data_preprocessed.loc[(slice(None), symbol), :]
    if ticker_frame.empty:
        # No rows for this ticker: index[0] below would raise IndexError.
        continue
    first_date = ticker_frame.index[0][0]
    if first_date < end_training:
        timeseries.append(ticker_frame)

In [49]:
def _training_record(ts, end):
    """Build one DeepAR training record from a single ticker's DataFrame.

    The record covers the ticker's first date up to `end`; the slices below
    use `end` itself as the upper bound, without subtracting a step.
    """
    first_date = ts.index[0][0]
    return {
        "start": str(first_date),
        "target": ts['target'][first_date:end].tolist(),
        # One inner list per feature column (values transposed to feature-major).
        "dynamic_feat": ts[['Adj Close', 'Volume']][first_date:end].values.T.tolist(),
    }


training_data = [_training_record(ts, end_training) for ts in timeseries]
print(len(training_data))

491


In [50]:
num_test_windows = 10

# One step of the series as an explicit offset. Adding a bare integer to a
# Timestamp (the previous `end_training + n`) is deprecated in pandas and
# raises TypeError in newer releases; `n * offset` is the supported spelling.
step = pd.tseries.frequencies.to_offset(freq)

# Rolling test windows: window k ends 2 * k * prediction_length steps after the
# training cutoff, and every ticker contributes one record per window.
test_data = []
for k in range(1, num_test_windows + 1):
    window_end = end_training + (2 * k * prediction_length) * step
    for ts in timeseries:
        series_start = ts.index[0][0]
        test_data.append({
            "start": str(series_start),
            "target": ts['target'][series_start:window_end].tolist(),
            # One inner list per feature column (values transposed to feature-major).
            "dynamic_feat": ts[['Adj Close', 'Volume']][series_start:window_end].values.T.tolist(),
        })
print(len(test_data))



4910


In [51]:
%%time
# Write both splits to local JSON files via the `helper` function, train first.
for filename, records in (("train.json", training_data), ("test.json", test_data)):
    write_json_dataset(filename, records)

CPU times: user 18.8 s, sys: 440 ms, total: 19.2 s
Wall time: 22.1 s


In [52]:
%%time
# Upload each local split to <s3_data_path>/<split>/<split>.json.
# NOTE(review): the recorded output of this cell reports both files as
# "already exists ... Set override to upload anyway", so the freshly written
# JSON files were NOT uploaded on that run. The message also shows the bucket
# prefix twice; that presumably comes from the helper's message formatting --
# verify in helper.py.
for split in ("train", "test"):
    local_file = split + ".json"
    copy_to_s3(local_file, s3_data_path + "/" + split + "/" + local_file, s3_bucket)

File s3://sagemaker-us-east-2-017500148529/s3://sagemaker-us-east-2-017500148529/MLEND-Capstone-Project/data/train/train.json already exists.
Set override to upload anyway.

File s3://sagemaker-us-east-2-017500148529/s3://sagemaker-us-east-2-017500148529/MLEND-Capstone-Project/data/test/test.json already exists.
Set override to upload anyway.

CPU times: user 76.4 ms, sys: 0 ns, total: 76.4 ms
Wall time: 162 ms


In [53]:
# Sanity check: read the first record of the uploaded training file back from
# S3 and print its first 100 characters.
train_json_path = s3_data_path + "/train/train.json"
s3filesystem = s3fs.S3FileSystem()
with s3filesystem.open(train_json_path, 'rb') as fp:
    first_record = fp.readline().decode("utf-8")
    print(first_record[:100] + "...")

{"start": "2010-01-04 00:00:00", "target": [-1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...


In [54]:
# Generic estimator wrapping the DeepAR container image selected earlier.
estimator = sagemaker.estimator.Estimator(
    image_name=image_name,
    role=role,
    sagemaker_session=sagemaker_session,
    # Training infrastructure: a single ml.c4.2xlarge instance.
    train_instance_type='ml.c4.2xlarge',
    train_instance_count=1,
    # Job naming and where results are written in S3.
    base_job_name='MLEND-Capstone-Project',
    output_path=s3_output_path,
)

In [55]:
# DeepAR hyperparameters. Apart from `freq`, every value is passed as a string.
hyperparameters = dict(
    time_freq=freq,
    epochs="100",
    early_stopping_patience="40",
    mini_batch_size="64",
    learning_rate="5E-4",
    # Window sizes come from the configuration cell.
    context_length=str(context_length),
    prediction_length=str(prediction_length),
    # 'auto' rather than an explicit count of dynamic_feat rows.
    num_dynamic_feat='auto',
)

In [56]:
# Attach the hyperparameter dict to the estimator; each key is passed as a keyword argument.
estimator.set_hyperparameters(**hyperparameters)

In [57]:
%%time
# Map each input channel name to its JSON file under s3_data_path:
#   <s3_data_path>/<channel>/<channel>.json
data_channels = {
    channel: s3_data_path + "/" + channel + "/" + channel + ".json"
    for channel in ("train", "test")
}

# Training is disabled in this saved run; uncomment to launch a training job.
#estimator.fit(inputs=data_channels, wait=True)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.3 µs


In [58]:
# Endpoint deployment, left disabled in this saved run.
# NOTE(review): `estimator_indicator` is not defined in the visible cells (the
# estimator built above is named `estimator`), and later cells use `predictor`,
# not `predictor_indicator` -- reconcile the names before re-enabling.
#predictor_indicator = estimator_indicator.deploy(
#    initial_instance_count=1,
#    instance_type='ml.m4.xlarge',
#    content_type="application/json")

In [5]:
# Name of an already-deployed endpoint; this cell attaches to it instead of deploying.
# NOTE(review): the name is hard-coded, so this cell only works while that
# endpoint exists in the current account/region.
ENDPOINT_NAME = 'MLEND-Capstone-Project-2020-06-03-22-25-41-743'
predictor = sagemaker.predictor.RealTimePredictor(endpoint=ENDPOINT_NAME)

In [8]:
# Single-day prediction for one ticker via the `helper` function. The list names
# the dynamic-feature columns, the same two used to build the datasets.
feature_columns = ['Adj Close', 'Volume']
get_stock_prediction('AAPL', '2019-01-02', stock_data_preprocessed, predictor, feature_columns)

  date_start = date_pred-50
  "target": pred_df['target'][date_start:date_pred-1].tolist(),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  result_df['prediction'] = pred


Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Volume,target,prediction
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-02,AAPL,0.03987,0.043087,-1,-1


In [7]:
# Dates to evaluate, read from CSV and flattened to a plain list.
# ravel() gives the same result as the previous reshape(252) when the file
# holds 252 values, and keeps working when the number of dates differs.
date_index = pd.read_csv('test_date_index.csv')
date_index = date_index.values.ravel().tolist()

# Helper from `helper`; the recorded output is a tuple of an accuracy value and
# a per-date table of target vs. prediction.
get_prediction_accuracy('A', date_index, stock_data_preprocessed, predictor, ['Adj Close', 'Volume'])

  date_start = date_pred-50
  "target": pred_df['target'][date_start:date_pred-1].tolist(),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  result_df['prediction'] = pred


(0.7301587301587301,             target  prediction
 2019-01-02      -1          -1
 2019-01-03       1          -1
 2019-01-04       1           1
 2019-01-07       1           1
 2019-01-08       1           1
 2019-01-09       1           1
 2019-01-10       1           1
 2019-01-11       0           1
 2019-01-14       1           0
 2019-01-15       1           1
 2019-01-16       1           1
 2019-01-17       1           1
 2019-01-18       1           1
 2019-01-22       1           1
 2019-01-23       1           1
 2019-01-24       1           1
 2019-01-25       1           1
 2019-01-28       1           1
 2019-01-29       1           1
 2019-01-30       0           1
 2019-01-31       0           0
 2019-02-01       0           0
 2019-02-04       0           0
 2019-02-05       0           0
 2019-02-06       1           0
 2019-02-07       1           1
 2019-02-08       1           1
 2019-02-11       1           1
 2019-02-12       0           1
 2019-02-13       1 