In [12]:
import boto3
import s3fs
import sagemaker
import bs4 as bs
import pickle
import requests
from sagemaker import get_execution_role
import time
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [4]:
# session, role, bucket
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()             # IAM role to use by SageMaker

s3_bucket = sagemaker_session.default_bucket()
s3_prefix = 'MLEND-Capstone-Project'    

region = sagemaker_session.boto_region_name

s3_data_path = "s3://{}/{}/data_indicator".format(s3_bucket, s3_prefix)
s3_output_path = "s3://{}/{}/output_indicator".format(s3_bucket, s3_prefix)

image_name = sagemaker.amazon.amazon_estimator.get_image_uri(region, "forecasting-deepar", "latest")

In [5]:
stock_indicator_data = pd.read_csv('stock_indicator_data.csv',parse_dates=True, index_col=[0,1])

In [14]:
def get_sp500_tickers():
    resp = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')

    soup = bs.BeautifulSoup(resp.text, "lxml")
    table = soup.find('table', {'class':'wikitable sortable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text
        ticker = ticker[:-1]
        tickers.append(ticker)

    
    tickers.sort()
    tickers.remove('BF.B')
    tickers.remove('BRK.B')
    tickers.remove('CARR')
    tickers.remove('DPZ')
    tickers.remove('DXCM')
    tickers.remove('OTIS')
    tickers.remove('WST')
    
    return tickers
tickers = get_sp500_tickers()

# we use 2 hour frequency for the time series

freq = 'D'

# we predict for 1 day
prediction_length = 1

# we use 50 days as context length, this is the number of state updates accomplished before making predictions
context_length = 50

end_training = pd.Timestamp('2018-12-31', freq=freq)

timeseries = []
    
for ID,ticker in list(enumerate(tickers)):
    ticker = stock_indicator_data.loc[(slice(None), ticker), :]
    if ticker.index[0][0]<end_training:
        timeseries.append(ticker)

        
    

In [19]:
def write_json_dataset(filename, data): 
    with open(filename, 'wb') as f:
        # for each of our times series, there is one JSON line
        for d in data:
            json_line = json.dumps(d) + '\n'
            json_line = json_line.encode('utf-8')
            f.write(json_line) 
            
def copy_to_s3(local_file, s3_path, override=False):
    assert s3_path.startswith('s3://')
    split = s3_path.split('/')
    bucket = split[2]
    path = '/'.join(split[3:])
    buk = s3.Bucket(bucket)
    
    if len(list(buk.objects.filter(Prefix=path))) > 0:
        if not override:
            print('File s3://{}/{} already exists.\nSet override to upload anyway.\n'.format(s3_bucket, s3_path))
            return
        else:
            print('Overwriting existing file')
    with open(local_file, 'rb') as data:
        print('Uploading file to {}'.format(s3_path))
        buk.put_object(Key=path, Body=data)

In [89]:
training_data = [
    {
            "start": str(ts.index[0][0]),
            "target": ts['target'][ts.index[0][0]:end_training].tolist(), # We use -1, because pandas indexing includes the upper bound 
            "dynamic_feat": ts[['Adj Close','Volume','PC1','PC2','PC3','PC4','PC5','PC6']][ts.index[0][0]:end_training].values.T.tolist()
    }
    for ts in timeseries
]
print(len(training_data))

491


In [90]:
#training_data[0]
#test_data[0]['dynamic_feat']

In [91]:
num_test_windows = 10

test_data = [
    {
        "start": str(ts.index[0][0]),
        "target": ts['target'][ts.index[0][0]:end_training + (2*k * prediction_length)].tolist(),
        "dynamic_feat": ts[['Adj Close','Volume','PC1','PC2','PC3','PC4','PC5','PC6']][ts.index[0][0]:end_training + (2*k * prediction_length)].values.T.tolist()
    }
    for k in range(1, num_test_windows + 1) 
    for ts in timeseries
]
print(len(test_data))



4910


In [92]:
%%time
write_json_dataset("train_indicator.json", training_data)
write_json_dataset("test_indicator.json", test_data)

CPU times: user 1min 9s, sys: 1.35 s, total: 1min 11s
Wall time: 1min 15s


In [93]:
s3 = boto3.resource('s3')

In [94]:
copy_to_s3("train_indicator.json", s3_data_path + "/train/train.json")
copy_to_s3("test_indicator.json", s3_data_path + "/test/test.json")

Uploading file to s3://sagemaker-us-east-2-017500148529/MLEND-Capstone-Project/data_indicator/train/train.json
Uploading file to s3://sagemaker-us-east-2-017500148529/MLEND-Capstone-Project/data_indicator/test/test.json


In [95]:
s3filesystem = s3fs.S3FileSystem()
with s3filesystem.open(s3_data_path + "/train/train.json", 'rb') as fp:
    print(fp.readline().decode("utf-8")[:100] + "...")

{"start": "2010-03-16 00:00:00", "target": [1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, -1, 1, 1, 1, 1, 1...


In [96]:
estimator_indicator = sagemaker.estimator.Estimator(
    sagemaker_session=sagemaker_session,
    image_name=image_name,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.2xlarge',
    base_job_name='MLEND-Capstone-Project',
    output_path=s3_output_path
)

In [97]:
hyperparameters = {
    "time_freq": freq,
    "epochs": "100",
    "early_stopping_patience": "40",
    "mini_batch_size": "64",
    "learning_rate": "5E-4",
    "context_length": str(context_length),
    "prediction_length": str(prediction_length),
    "num_dynamic_feat": 'auto',
}
estimator_indicator.set_hyperparameters(**hyperparameters)

In [98]:
%%time
data_channels = {
    "train": s3_data_path + "/train/train.json",
    "test": s3_data_path + "/test/test.json"
}

estimator_indicator.fit(inputs=data_channels, wait=True)

2020-06-03 17:01:46 Starting - Starting the training job...
2020-06-03 17:01:49 Starting - Launching requested ML instances......
2020-06-03 17:02:50 Starting - Preparing the instances for training...
2020-06-03 17:03:46 Downloading - Downloading input data......
2020-06-03 17:04:42 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[06/03/2020 17:04:45 INFO 140291508827968] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'num_dynamic_feat': u'auto', u'dropout_rate': u'0.10', u'mini_batch_size': u'128', u'test_quantiles': u'[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'num_eval_samples': u'100', u'learning_rate': u'0.001', u'num_cells': u'40', u'num_layers': u'2', u'embedding_dimension': u'10', u'_kvstore': u'auto', u'_num_kv_servers': u'auto', u'cardinality': u'auto', u'likelihood': u'student-t', u'early_stopp

In [120]:
#predictor_indicator = estimator_indicator.deploy(
#    initial_instance_count=1,
#    instance_type='ml.m4.xlarge',
#    content_type="application/json")



In [118]:
#predictor_indicator = sagemaker.predictor.RealTimePredictor(endpoint='MLEND-Capstone-Project-2020-06-03-14-40-11-261')

In [116]:
def encode_request(instance, num_samples, quantiles):
        configuration = {
            "num_samples": num_samples,
            "output_types": ["quantiles"],
            "quantiles": quantiles
        }
        
        http_request_data = {
            "instances": [instance],
            "configuration": configuration
        }
        
        return json.dumps(http_request_data).encode('utf-8')


In [121]:
stock_indicator_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Volume,PC1,PC2,PC3,PC4,PC5,PC6,target
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010-03-16,A,0.005604,0.003822,-0.752058,-1.622765,1.379154,0.829721,0.766182,-0.299314,1
2010-03-16,AAL,0.00188,0.006881,-1.155141,-1.345847,0.508184,0.746179,1.022953,-0.38404,-1
2010-03-16,AAP,0.010572,0.000981,-0.637966,-0.966827,0.851218,0.315813,0.04765,-0.163081,1
2010-03-16,AAPL,0.007148,0.129969,-0.50301,-1.684244,1.363548,0.603027,0.750602,-0.149463,0
2010-03-16,ABC,0.006258,0.004898,-1.058094,-0.997907,0.891929,0.373091,-0.156564,-0.012736,1


In [122]:
def get_stock_prediction(ticker,date):
    date_pred = pd.Timestamp(date, freq='D')
    date_start = date_pred-50
    #
    pred_df = stock_indicator_data.loc[(slice(str(date_start),str(date_pred)), ticker), :]
    result_df = pred_df.loc[(slice(str(date_pred),str(date_pred)), ticker), :]

    pred = {
            "start": str(date_pred),
            "target": pred_df['target'][date_start:date_pred-1].tolist(),
            "dynamic_feat": pred_df[['Adj Close','Volume','PC1','PC2','PC3','PC4','PC5','PC6']][date_start:date_pred].values.T.tolist()
           }

    req = encode_request(instance=pred, num_samples=50, quantiles=['0.1', '0.5', '0.9'])
    res = predictor_indicator.predict(req)

    prediction_data = json.loads(res.decode('utf-8'))
    pred = round(prediction_data['predictions'][0]['quantiles']['0.5'][0])
    result_df['prediction'] = pred
    return result_df
    

In [123]:
get_stock_prediction('AAPL', '2019-01-23')

  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Volume,PC1,PC2,PC3,PC4,PC5,PC6,target,prediction
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-01-23,AAPL,0.03886,0.026907,-0.847888,4.83929,2.19196,-3.383544,1.655766,-0.505774,1,1


In [124]:
date_index = pd.read_csv('test_date_index.csv')
date_index = date_index.values.reshape(252).tolist()

def get_prediction_accuracy(ticker, date_range):
    ticker = str(ticker)
    i = 0
    target = []
    prediction = []

    for date in date_range:
        target.append(get_stock_prediction(ticker, date)['target'].values[0])
        prediction.append(int(get_stock_prediction(ticker, date)['prediction'].values[0]))
    target = list(np.array(target).reshape(252))
    prediction = list(np.array(prediction).reshape(252))
    data = {'target': list(target), 'prediction': list(prediction)}
    prediction_df = pd.DataFrame(data=data,index=date_index, columns=['target','prediction'])
    
    return accuracy_score(target, prediction), prediction_df

##### 

In [None]:
get_prediction_accuracy('AAPL', date_index)