In [1]:
import boto3
import s3fs
import sagemaker
from sagemaker import get_execution_role
import time
import pandas as pd
import matplotlib.pyplot as plt
from helper import *

  from pandas.util.testing import assert_frame_equal


In [2]:
# session, role, bucket
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()             # IAM role to use by SageMaker

s3_bucket = sagemaker_session.default_bucket()
s3_prefix = 'MLEND-Capstone-Project'    

region = sagemaker_session.boto_region_name

s3_data_path = "s3://{}/{}/data_indicator".format(s3_bucket, s3_prefix)
s3_output_path = "s3://{}/{}/output_indicator".format(s3_bucket, s3_prefix)

image_name = sagemaker.amazon.amazon_estimator.get_image_uri(region, "forecasting-deepar", "latest")

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [3]:
stock_indicator_data = pd.read_csv('stock_data.csv',parse_dates=True, index_col=[0,1])
get_target_distribution(stock_indicator_data)


-1:  390748
 0:  389090
 1:  397485


In [4]:
tickers = get_sp500_tickers()


In [5]:

freq = 'D'

# we predict for 1 day
prediction_length = 1

# we use 50 days as context length, this is the number of state updates accomplished before making predictions
context_length = 50

end_training = pd.Timestamp('2018-12-31', freq=freq)

timeseries = []
    
for ID,ticker in list(enumerate(tickers)):
    ticker = stock_indicator_data.loc[(slice(None), ticker), :]
    if ticker.index[0][0]<end_training:
        timeseries.append(ticker)

In [25]:
dynamic_feat = ['Adj Close','Volume','PC1','PC2','PC3','PC4','PC5','PC6']
training_data = [
    {
            "start": str(ts.index[0][0]),
            "target": ts['target'][ts.index[0][0]:end_training].tolist(), # We use -1, because pandas indexing includes the upper bound 
            "dynamic_feat": ts[dynamic_feat][ts.index[0][0]:end_training].values.T.tolist()
    }
    for ts in timeseries
]
print(len(training_data))

491


In [26]:
num_test_windows = 10

test_data = [
    {
        "start": str(ts.index[0][0]),
        "target": ts['target'][ts.index[0][0]:end_training + timedelta(days=(2*k * prediction_length))].tolist(),
        "dynamic_feat": ts[dynamic_feat][ts.index[0][0]:end_training + timedelta(days=(2*k * prediction_length))].values.T.tolist()
    }
    for k in range(1, num_test_windows + 1) 
    for ts in timeseries
]
print(len(test_data))



4910


In [27]:
%%time
write_json_dataset("train_indicator.json", training_data)
write_json_dataset("test_indicator.json", test_data)

CPU times: user 1min 7s, sys: 1.33 s, total: 1min 8s
Wall time: 1min 12s


In [31]:
copy_to_s3("train_indicator.json", s3_data_path + "/train/train.json", s3_bucket)
copy_to_s3("test_indicator.json", s3_data_path + "/test/test.json", s3_bucket)

Uploading file to s3://sagemaker-us-east-2-017500148529/MLEND-Capstone-Project/data_indicator/train/train.json
Uploading file to s3://sagemaker-us-east-2-017500148529/MLEND-Capstone-Project/data_indicator/test/test.json


In [32]:
s3filesystem = s3fs.S3FileSystem()
with s3filesystem.open(s3_data_path + "/train/train.json", 'rb') as fp:
    print(fp.readline().decode("utf-8")[:100] + "...")

{"start": "2010-03-16 00:00:00", "target": [1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, -1, 1, 1, 1, 1, 1...


In [35]:
estimator_indicator = sagemaker.estimator.Estimator(
    sagemaker_session=sagemaker_session,
    image_name=image_name,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.2xlarge',
    base_job_name='deep-ar-indicators-1',
    output_path=s3_output_path
)




In [36]:
hyperparameters = {
    "time_freq": freq,
    "epochs": "100",
    "early_stopping_patience": "40",
    "mini_batch_size": "64",
    "learning_rate": "5E-4",
    "dropout_rate": 0.04030803446099004,
    "context_length": str(context_length),
    "prediction_length": str(prediction_length),
    "num_dynamic_feat": 'auto',
}
estimator_indicator.set_hyperparameters(**hyperparameters)



In [37]:
%%time
data_channels = {
    "train": s3_data_path + "/train/train.json",
    "test": s3_data_path + "/test/test.json"
}

estimator_indicator.fit(inputs=data_channels, wait=True)




2020-06-12 01:14:23 Starting - Starting the training job...
2020-06-12 01:14:25 Starting - Launching requested ML instances......
2020-06-12 01:15:29 Starting - Preparing the instances for training...
2020-06-12 01:16:09 Downloading - Downloading input data......
2020-06-12 01:16:57 Training - Downloading the training image[34mArguments: train[0m
[34m[06/12/2020 01:17:13 INFO 140292449302336] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'num_dynamic_feat': u'auto', u'dropout_rate': u'0.10', u'mini_batch_size': u'128', u'test_quantiles': u'[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'num_eval_samples': u'100', u'learning_rate': u'0.001', u'num_cells': u'40', u'num_layers': u'2', u'embedding_dimension': u'10', u'_kvstore': u'auto', u'_num_kv_servers': u'auto', u'cardinality': u'auto', u'likelihood': u'student-t', u'early_stopping_patience': u''}[0m
[3

In [38]:
predictor_indicator = estimator_indicator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    content_type="application/json")



-------------!

In [56]:
indicator_predictor = sagemaker.predictor.RealTimePredictor(endpoint='deep-ar-indicator-endpoint')

In [75]:
dynamic_feat = ['Adj Close','Volume','PC1','PC2','PC3','PC4','PC5','PC6']
date = '2019-01-02'
ticker = 'AAPL'
df = stock_indicator_data
predictor = indicator_predictor
def get_dynamic_feat_prediction(ticker,date,df,predictor,dynamic_feat):

    date_pred = pd.Timestamp(date, freq='D')
    date_start = date_pred-timedelta(days=50)
    pred_df = stock_indicator_data.loc[(slice(str(date_start),str(date_pred)), ticker), :]
    result_df = pred_df.loc[(slice(str(date_pred),str(date_pred)), ticker), :]
    pred = {
            "start": str(date_pred),
            "target": pred_df['target'][date_start:date_pred-timedelta(days=1)].tolist(),
            "dynamic_feat": pred_df[dynamic_feat][date_start:date_pred].values.T.tolist()
        }

    req = encode_request(instance=pred, num_samples=50, quantiles=['0.1', '0.5', '0.9'])
    res = indicator_predictor.predict(req)
    prediction_data = json.loads(res.decode('utf-8'))
    pred = round(prediction_data['predictions'][0]['quantiles']['0.5'][0])
    result_df['prediction'] = pred


    return result_df

get_dynamic_feat_prediction(ticker,date,df,predictor,dynamic_feat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Volume,PC1,PC2,PC3,PC4,PC5,PC6,target,prediction
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-01-02,AAPL,0.03987,0.043087,0.07433,0.285545,0.364432,0.436737,0.372038,0.292267,-1,-1


In [76]:
date_index = pd.read_csv('test_date_index.csv')
date_index = date_index.values.reshape(252).tolist()

def get_dynamic_feat_accuracy(ticker):
    i = 0
    target = []
    prediction = []
    df = stock_indicator_data
    for date in date_index:
        target.append(get_dynamic_feat_prediction(ticker, date,df,indicator_predictor,dynamic_feat)['target'].values[0])
        prediction.append(int(get_dynamic_feat_prediction(ticker, date,df,indicator_predictor,dynamic_feat)['prediction'].values[0]))

    return accuracy_score(target, prediction)

In [77]:
get_dynamic_feat_accuracy(ticker='AAPL')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.746031746031746

In [43]:
acc = {}

In [70]:
for ticker in tickers:
    acc[ticker] = get_dynamic_feat_accuracy(ticker)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


KeyError: 'HWM'

In [71]:
np.array(list(acc.values())).mean()

0.6612781323480014

In [72]:
acc

{'A': 0.6730769230769231,
 'AAL': 0.6346153846153846,
 'AAP': 0.7403846153846154,
 'AAPL': 0.6826923076923077,
 'ABBV': 0.7307692307692307,
 'ABC': 0.6153846153846154,
 'ABMD': 0.7307692307692307,
 'ABT': 0.7019230769230769,
 'ACN': 0.5769230769230769,
 'ADBE': 0.5961538461538461,
 'ADI': 0.5673076923076923,
 'ADM': 0.6442307692307693,
 'ADP': 0.7019230769230769,
 'ADS': 0.6634615384615384,
 'ADSK': 0.6346153846153846,
 'AEE': 0.6826923076923077,
 'AEP': 0.6826923076923077,
 'AES': 0.6346153846153846,
 'AFL': 0.6730769230769231,
 'AIG': 0.6826923076923077,
 'AIV': 0.6346153846153846,
 'AIZ': 0.6923076923076923,
 'AJG': 0.6923076923076923,
 'AKAM': 0.7307692307692307,
 'ALB': 0.7211538461538461,
 'ALGN': 0.625,
 'ALK': 0.6538461538461539,
 'ALL': 0.7211538461538461,
 'ALLE': 0.6826923076923077,
 'ALXN': 0.7403846153846154,
 'AMAT': 0.4807692307692308,
 'AMCR': 0.7019230769230769,
 'AMD': 0.6538461538461539,
 'AME': 0.6730769230769231,
 'AMGN': 0.7019230769230769,
 'AMP': 0.6153846153846

In [89]:
acc['A'] = get_dynamic_feat_accuracy('A')
acc['F'] = get_dynamic_feat_accuracy('F')
acc['GE'] = get_dynamic_feat_accuracy('GE')
acc['DAL'] = get_dynamic_feat_accuracy('DAL')
acc['UAL'] = get_dynamic_feat_accuracy('UAL')
acc['ABC'] = get_dynamic_feat_accuracy('ABC')
acc['CAT'] = get_dynamic_feat_accuracy('CAT')
acc['DE'] = get_dynamic_feat_accuracy('DE')
acc['D'] = get_dynamic_feat_accuracy('D')
acc['PEP'] = get_dynamic_feat_accuracy('PEP')
acc['IBM'] = get_dynamic_feat_accuracy('IBM')
acc['PXD'] = get_dynamic_feat_accuracy('PXD')
acc['VLO'] = get_dynamic_feat_accuracy('VLO')
acc['YUM'] = get_dynamic_feat_accuracy('YUM')
acc['AIG'] = get_dynamic_feat_accuracy('AIG')
acc['BWA'] = get_dynamic_feat_accuracy('BWA')
acc['HLT'] = get_dynamic_feat_accuracy('HLT')
acc['INTU'] = get_dynamic_feat_accuracy('INTU')
acc['L'] = get_dynamic_feat_accuracy('L')
acc['ZTS'] = get_dynamic_feat_accuracy('ZTS')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [90]:
np.array(list(acc.values())).mean()

0.746626984126984

In [91]:
acc

{'A': 0.7301587301587301,
 'F': 0.7063492063492064,
 'GE': 0.7301587301587301,
 'DAL': 0.7182539682539683,
 'UAL': 0.7777777777777778,
 'ABC': 0.7142857142857143,
 'CAT': 0.7341269841269841,
 'DE': 0.7301587301587301,
 'D': 0.7619047619047619,
 'PEP': 0.7857142857142857,
 'IBM': 0.7579365079365079,
 'PXD': 0.75,
 'VLO': 0.7103174603174603,
 'YUM': 0.8253968253968254,
 'AIG': 0.7182539682539683,
 'BWA': 0.7896825396825397,
 'HLT': 0.7420634920634921,
 'INTU': 0.7301587301587301,
 'L': 0.7738095238095238,
 'ZTS': 0.746031746031746}

In [62]:
test_df = pd.read_csv('stock_test_data.csv',parse_dates=True, index_col=[0,1])

In [63]:
test_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Volume,PC1,PC2,PC3,PC4,PC5,PC6,target
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-02,A,0.020762,0.004206,-0.923785,2.71684,-3.987023,-1.518074,-0.115581,0.38385,0
2020-01-02,AAL,0.0067,0.019235,-0.923785,2.71684,-3.987023,-1.518074,-0.115581,0.38385,-1
2020-01-02,AAP,0.038836,0.002817,-0.923785,2.71684,-3.987023,-1.518074,-0.115581,0.38385,-1
2020-01-02,AAPL,0.073563,0.10099,-0.923785,2.71684,-3.987023,-1.518074,-0.115581,0.38385,1
2020-01-02,ABBV,0.02109,0.016814,-0.923785,2.71684,-3.987023,-1.518074,-0.115581,0.38385,0


In [64]:
dynamic_feat = ['Adj Close','Volume','PC1','PC2','PC3','PC4','PC5','PC6']
date = '2020-04-01'
ticker = 'AAPL'
df = test_df
predictor = indicator_predictor
def get_dynamic_feat_prediction(ticker,date,df,predictor,dynamic_feat):

    date_pred = pd.Timestamp(date, freq='D')
    date_start = date_pred-timedelta(days=50)
    pred_df = test_df.loc[(slice(str(date_start),str(date_pred)), ticker), :]
    result_df = pred_df.loc[(slice(str(date_pred),str(date_pred)), ticker), :]
    pred = {
            "start": str(date_pred),
            "target": pred_df['target'][date_start:date_pred-timedelta(days=1)].tolist(),
            "dynamic_feat": pred_df[dynamic_feat][date_start:date_pred].values.T.tolist()
        }

    req = encode_request(instance=pred, num_samples=50, quantiles=['0.1', '0.5', '0.9'])
    res = indicator_predictor.predict(req)
    prediction_data = json.loads(res.decode('utf-8'))
    pred = round(prediction_data['predictions'][0]['quantiles']['0.5'][0])
    result_df['prediction'] = pred


    return result_df

get_dynamic_feat_prediction(ticker,date,df,predictor,dynamic_feat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Volume,PC1,PC2,PC3,PC4,PC5,PC6,target,prediction
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-04-01,AAPL,0.059051,0.131357,4.21148,0.224202,-0.21806,-1.052568,0.15746,-0.346668,1,-1


In [68]:
date_index = pd.read_csv('final_test_date_index.csv')
date_index = date_index.values.reshape(104).tolist()
dynamic_feat = ['Adj Close','Volume','PC1','PC2','PC3','PC4','PC5','PC6']
df = test_df
target = []
prediction = []
predictor = indicator_predictor
acc = {}
for ticker in tickers:
    print(ticker)
    for date in date_index:
        target.append(get_dynamic_feat_prediction(ticker, date,df,indicator_predictor,dynamic_feat)['target'].values[0])
        prediction.append(get_dynamic_feat_prediction(ticker, date,df,indicator_predictor,dynamic_feat)['prediction'].values[0])
        acc[ticker] = accuracy_score(target, prediction)


A


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


AAL
AAP
AAPL
ABBV
ABC
ABMD
ABT
ACN
ADBE
ADI
ADM
ADP
ADS
ADSK
AEE
AEP
AES
AFL
AIG
AIV
AIZ
AJG
AKAM
ALB
ALGN
ALK
ALL
ALLE
ALXN
AMAT
AMCR
AMD
AME
AMGN
AMP
AMT
AMZN
ANET
ANSS
ANTM
AON
AOS
APA
APD
APH
APTV
ARE
ATO
ATVI
AVB
AVGO
AVY
AWK
AXP
AZO
BA
BAC
BAX
BBY
BDX
BEN
BIIB
BK
BKNG
BKR
BLK
BLL
BMY
BR
BSX
BWA
BXP
C
CAG
CAH
CAT
CB
CBOE
CBRE
CCI
CCL
CDNS
CDW
CE
CERN
CF
CFG
CHD
CHRW
CHTR
CI
CINF
CL
CLX
CMA
CMCSA
CME
CMG
CMI
CMS
CNC
CNP
COF
COG
COO
COP
COST
COTY
CPB
CPRT
CRM
CSCO
CSX
CTAS
CTL
CTSH
CTVA
CTXS
CVS
CVX
CXO
D
DAL
DD
DE
DFS
DG
DGX
DHI
DHR
DIS
DISCA
DISCK
DISH
DLR
DLTR
DOV
DOW
DRE
DRI
DTE
DUK
DVA
DVN
DXC
EA
EBAY
ECL
ED
EFX
EIX
EL
EMN
EMR
EOG
EQIX
EQR
ES
ESS
ETFC
ETN
ETR
EVRG
EW
EXC
EXPD
EXPE
EXR
F
FANG
FAST
FB
FBHS
FCX
FDX
FE
FFIV
FIS
FISV
FITB
FLIR
FLS
FLT
FMC
FOX
FOXA
FRC
FRT
FTI
FTNT
FTV
GD
GE
GILD
GIS
GL
GLW
GM
GOOG
GOOGL
GPC
GPN
GPS
GRMN
GS
GWW
HAL
HAS
HBAN
HBI
HCA
HD
HES
HFC
HIG
HII
HLT
HOG
HOLX
HON
HPE
HPQ
HRB
HRL
HSIC
HST
HSY
HUM
HWM


KeyError: 'HWM'

In [69]:
acc

{'A': 0.6730769230769231,
 'AAL': 0.6538461538461539,
 'AAP': 0.6826923076923077,
 'AAPL': 0.6826923076923077,
 'ABBV': 0.6923076923076923,
 'ABC': 0.6794871794871795,
 'ABMD': 0.6868131868131868,
 'ABT': 0.6887019230769231,
 'ACN': 0.6762820512820513,
 'ADBE': 0.6682692307692307,
 'ADI': 0.6590909090909091,
 'ADM': 0.6578525641025641,
 'ADP': 0.6612426035502958,
 'ADS': 0.6614010989010989,
 'ADSK': 0.6596153846153846,
 'AEE': 0.6610576923076923,
 'AEP': 0.6623303167420814,
 'AES': 0.6607905982905983,
 'AFL': 0.6614372469635628,
 'AIG': 0.6625,
 'AIV': 0.6611721611721612,
 'AIZ': 0.6625874125874126,
 'AJG': 0.6638795986622074,
 'AKAM': 0.6666666666666666,
 'ALB': 0.6688461538461539,
 'ALGN': 0.6671597633136095,
 'ALK': 0.6666666666666666,
 'ALL': 0.6686126373626373,
 'ALLE': 0.6690981432360743,
 'ALXN': 0.6714743589743589,
 'AMAT': 0.6653225806451613,
 'AMCR': 0.6664663461538461,
 'AMD': 0.666083916083916,
 'AME': 0.666289592760181,
 'AMGN': 0.6673076923076923,
 'AMP': 0.66586538461538