In [1]:
import boto3
import s3fs
import sagemaker
from sagemaker import get_execution_role
import time
import pandas as pd
import matplotlib.pyplot as plt
from helper import *

  from pandas.util.testing import assert_frame_equal


In [2]:
# session, role, bucket
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()             # IAM role to use by SageMaker

s3_bucket = sagemaker_session.default_bucket()
s3_prefix = 'MLEND-Capstone-Project'    

region = sagemaker_session.boto_region_name

s3_data_path = "s3://{}/{}/data_indicator".format(s3_bucket, s3_prefix)
s3_output_path = "s3://{}/{}/output_indicator".format(s3_bucket, s3_prefix)

image_name = sagemaker.amazon.amazon_estimator.get_image_uri(region, "forecasting-deepar", "latest")

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [3]:
stock_indicator_data = pd.read_csv('stock_data.csv',parse_dates=True, index_col=[0,1])
get_target_distribution(stock_indicator_data)


-1:  390748
 0:  389090
 1:  397485


In [4]:
tickers = get_sp500_tickers()


In [5]:

freq = 'D'

# we predict for 1 day
prediction_length = 1

# we use 50 days as context length, this is the number of state updates accomplished before making predictions
context_length = 50

end_training = pd.Timestamp('2018-12-31', freq=freq)

timeseries = []
    
for ID,ticker in list(enumerate(tickers)):
    ticker = stock_indicator_data.loc[(slice(None), ticker), :]
    if ticker.index[0][0]<end_training:
        timeseries.append(ticker)

In [25]:
dynamic_feat = ['Adj Close','Volume','PC1','PC2','PC3','PC4','PC5','PC6']
training_data = [
    {
            "start": str(ts.index[0][0]),
            "target": ts['target'][ts.index[0][0]:end_training].tolist(), # We use -1, because pandas indexing includes the upper bound 
            "dynamic_feat": ts[dynamic_feat][ts.index[0][0]:end_training].values.T.tolist()
    }
    for ts in timeseries
]
print(len(training_data))

491


In [26]:
num_test_windows = 10

test_data = [
    {
        "start": str(ts.index[0][0]),
        "target": ts['target'][ts.index[0][0]:end_training + timedelta(days=(2*k * prediction_length))].tolist(),
        "dynamic_feat": ts[dynamic_feat][ts.index[0][0]:end_training + timedelta(days=(2*k * prediction_length))].values.T.tolist()
    }
    for k in range(1, num_test_windows + 1) 
    for ts in timeseries
]
print(len(test_data))



4910


In [27]:
%%time
write_json_dataset("train_indicator.json", training_data)
write_json_dataset("test_indicator.json", test_data)

CPU times: user 1min 7s, sys: 1.33 s, total: 1min 8s
Wall time: 1min 12s


In [31]:
copy_to_s3("train_indicator.json", s3_data_path + "/train/train.json", s3_bucket)
copy_to_s3("test_indicator.json", s3_data_path + "/test/test.json", s3_bucket)

Uploading file to s3://sagemaker-us-east-2-017500148529/MLEND-Capstone-Project/data_indicator/train/train.json
Uploading file to s3://sagemaker-us-east-2-017500148529/MLEND-Capstone-Project/data_indicator/test/test.json


In [32]:
s3filesystem = s3fs.S3FileSystem()
with s3filesystem.open(s3_data_path + "/train/train.json", 'rb') as fp:
    print(fp.readline().decode("utf-8")[:100] + "...")

{"start": "2010-03-16 00:00:00", "target": [1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, -1, 1, 1, 1, 1, 1...


In [35]:
estimator_indicator = sagemaker.estimator.Estimator(
    sagemaker_session=sagemaker_session,
    image_name=image_name,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.2xlarge',
    base_job_name='deep-ar-indicators-1',
    output_path=s3_output_path
)




In [36]:
hyperparameters = {
    "time_freq": freq,
    "epochs": "100",
    "early_stopping_patience": "40",
    "mini_batch_size": "64",
    "learning_rate": "5E-4",
    "dropout_rate": 0.04030803446099004,
    "context_length": str(context_length),
    "prediction_length": str(prediction_length),
    "num_dynamic_feat": 'auto',
}
estimator_indicator.set_hyperparameters(**hyperparameters)



In [6]:
%%time
data_channels = {
    "train": s3_data_path + "/train/train.json",
    "test": s3_data_path + "/test/test.json"
}

estimator_indicator.fit(inputs=data_channels, wait=True)


CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 21.2 µs


In [38]:
predictor_indicator = estimator_indicator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    content_type="application/json")



-------------!

In [56]:
indicator_predictor = sagemaker.predictor.RealTimePredictor(endpoint='deep-ar-indicator-endpoint')

In [75]:
dynamic_feat = ['Adj Close','Volume','PC1','PC2','PC3','PC4','PC5','PC6']
date = '2019-01-02'
ticker = 'AAPL'
df = stock_indicator_data
predictor = indicator_predictor
def get_dynamic_feat_prediction(ticker,date,df,predictor,dynamic_feat):

    date_pred = pd.Timestamp(date, freq='D')
    date_start = date_pred-timedelta(days=50)
    pred_df = stock_indicator_data.loc[(slice(str(date_start),str(date_pred)), ticker), :]
    result_df = pred_df.loc[(slice(str(date_pred),str(date_pred)), ticker), :]
    pred = {
            "start": str(date_pred),
            "target": pred_df['target'][date_start:date_pred-timedelta(days=1)].tolist(),
            "dynamic_feat": pred_df[dynamic_feat][date_start:date_pred].values.T.tolist()
        }

    req = encode_request(instance=pred, num_samples=50, quantiles=['0.1', '0.5', '0.9'])
    res = indicator_predictor.predict(req)
    prediction_data = json.loads(res.decode('utf-8'))
    pred = round(prediction_data['predictions'][0]['quantiles']['0.5'][0])
    result_df['prediction'] = pred


    return result_df

get_dynamic_feat_prediction(ticker,date,df,predictor,dynamic_feat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Volume,PC1,PC2,PC3,PC4,PC5,PC6,target,prediction
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-01-02,AAPL,0.03987,0.043087,0.07433,0.285545,0.364432,0.436737,0.372038,0.292267,-1,-1


In [76]:
date_index = pd.read_csv('test_date_index.csv')
date_index = date_index.values.reshape(252).tolist()

def get_dynamic_feat_accuracy(ticker):
    i = 0
    target = []
    prediction = []
    df = stock_indicator_data
    for date in date_index:
        target.append(get_dynamic_feat_prediction(ticker, date,df,indicator_predictor,dynamic_feat)['target'].values[0])
        prediction.append(int(get_dynamic_feat_prediction(ticker, date,df,indicator_predictor,dynamic_feat)['prediction'].values[0]))

    return accuracy_score(target, prediction)

In [77]:
get_dynamic_feat_accuracy(ticker='AAPL')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.746031746031746

In [43]:
acc = {}

In [7]:
for ticker in tickers:
    acc[ticker] = get_dynamic_feat_accuracy(ticker)

In [71]:
np.array(list(acc.values())).mean()

0.6612781323480014

In [8]:
acc

NameError: name 'acc' is not defined

In [89]:
acc['A'] = get_dynamic_feat_accuracy('A')
acc['F'] = get_dynamic_feat_accuracy('F')
acc['GE'] = get_dynamic_feat_accuracy('GE')
acc['DAL'] = get_dynamic_feat_accuracy('DAL')
acc['UAL'] = get_dynamic_feat_accuracy('UAL')
acc['ABC'] = get_dynamic_feat_accuracy('ABC')
acc['CAT'] = get_dynamic_feat_accuracy('CAT')
acc['DE'] = get_dynamic_feat_accuracy('DE')
acc['D'] = get_dynamic_feat_accuracy('D')
acc['PEP'] = get_dynamic_feat_accuracy('PEP')
acc['IBM'] = get_dynamic_feat_accuracy('IBM')
acc['PXD'] = get_dynamic_feat_accuracy('PXD')
acc['VLO'] = get_dynamic_feat_accuracy('VLO')
acc['YUM'] = get_dynamic_feat_accuracy('YUM')
acc['AIG'] = get_dynamic_feat_accuracy('AIG')
acc['BWA'] = get_dynamic_feat_accuracy('BWA')
acc['HLT'] = get_dynamic_feat_accuracy('HLT')
acc['INTU'] = get_dynamic_feat_accuracy('INTU')
acc['L'] = get_dynamic_feat_accuracy('L')
acc['ZTS'] = get_dynamic_feat_accuracy('ZTS')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [90]:
np.array(list(acc.values())).mean()

0.746626984126984