In [58]:
import sys

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from yahoo_fin import stock_info as si
from collections import deque

import os
import numpy as np
import pandas as pd
import random
from fredapi import Fred
import datetime

import torch

In [59]:
import os
import time
from tensorflow.keras.layers import LSTM

### data window

# Number of consecutive rows that make up one input sequence
N_STEPS = 50
# How many rows ahead the target lies (1 would be the very next row)
LOOKUP_STEP = 15

### dataset preparation switches

# Scale the feature columns (and with them the target price)
SCALE = False
# Shuffle the samples
SHUFFLE = True
# Split training/testing sets by date instead of randomly
SPLIT_BY_DATE = True
# Test ratio, e.g. 0.2 would be 20%
TEST_SIZE = 0.1

# Short tags for the switches above; they are embedded in `model_name`
shuffle_str = f"sh-{int(SHUFFLE)}"
scale_str = f"sc-{int(SCALE)}"
split_by_date_str = f"sbd-{int(SPLIT_BY_DATE)}"

# Columns fed to the model. Only the closing price is active; candidates that
# were tried and are currently disabled:
#   "volume", "open", "high", "low", "ema100",
#   "ma7", "ma21", "ma100", "ma50",
#   "26ema", "12ema", "MACD", "ema", "momentum",
#   plus the extra `series_ids` columns
FEATURE_COLUMNS = ["close"]

# Today's date, used in file and model names
date_now = time.strftime("%Y-%m-%d")

### model parameters

# Number of recurrent layers
N_LAYERS = 2
# Recurrent cell class
CELL = LSTM
# Units per layer
UNITS = 256
# Dropout rate
DROPOUT = 0.4
# Whether to use bidirectional RNNs
BIDIRECTIONAL = False

### training parameters

# Loss name ("mae" is the alternative that was tried)
LOSS = "huber_loss"
OPTIMIZER = "adam"
BATCH_SIZE = 64
EPOCHS = 700

# Ticker symbol handed to yahoo_fin when loading prices
ticker = "^GSPC"
ticker_data_filename = os.path.join("data", f"{ticker}_{date_now}")

# Model name to save under; it encodes the parameters so runs stay distinguishable
model_name = (
    f"{date_now}_{ticker}-{shuffle_str}-{scale_str}-{split_by_date_str}-"
    f"{LOSS}-{OPTIMIZER}-{CELL.__name__}-seq-{N_STEPS}-step-{LOOKUP_STEP}-layers-{N_LAYERS}-units-{UNITS}"
) + ("-b" if BIDIRECTIONAL else "")

In [60]:
# Seed Python's, NumPy's and TensorFlow's random generators with one shared
# value so that rerunning the notebook gives the same results
_SEED = 314
random.seed(_SEED)
np.random.seed(_SEED)
tf.random.set_seed(_SEED)

In [61]:
def get_technical_indicators(dataset):
    """Return a copy of `dataset` extended with indicators derived from `close`.

    Added columns (in this order):
        ma7, ma21, ma100, ma50: simple rolling means of `close`.
        26ema, 12ema: exponentially weighted means of `close` (span 26 / 12).
        MACD: `12ema - 26ema`.
        ema: exponentially weighted mean of `close` with `com=0.5`.
        momentum: `close - 1` (see the review note in the body).

    Empty-string cells are converted to NaN and every row containing a NaN is
    dropped afterwards; in particular this removes the leading rows for which
    the 100-row rolling window is not yet full.

    Params:
        dataset (pd.DataFrame): frame with a numeric `close` column. The frame
            passed in is left untouched.
    Returns:
        pd.DataFrame: a new frame with the indicator columns and no NaN rows.
    """
    # Work on a copy so the caller's frame is not modified as a side effect
    enriched = dataset.copy()
    close = enriched['close']

    # Simple moving averages (order kept so the column layout stays the same)
    for window in (7, 21, 100, 50):
        enriched[f'ma{window}'] = close.rolling(window=window).mean()

    # MACD: difference between the fast and the slow exponential average
    enriched['26ema'] = close.ewm(span=26).mean()
    enriched['12ema'] = close.ewm(span=12).mean()
    enriched['MACD'] = enriched['12ema'] - enriched['26ema']

    # Bollinger Bands were sketched here but never enabled:
    # enriched['20sd'] = <20-row rolling std>
    # enriched['upper_band'] = enriched['ma21'] + (enriched['20sd']*2)
    # enriched['lower_band'] = enriched['ma21'] - (enriched['20sd']*2)

    # Exponential moving average
    enriched['ema'] = close.ewm(com=0.5).mean()

    # NOTE(review): this is just `close - 1`, not a difference against an
    # earlier price. Kept as-is because existing outputs contain these values;
    # confirm the intended definition before using the column as a feature.
    enriched['momentum'] = close - 1

    # Treat empty strings as missing, then keep only complete rows
    return enriched.replace('', np.nan).dropna()

def shuffle_in_unison(a, b):
    """Shuffle `a` and `b` in place using one and the same permutation.

    The global NumPy random state is captured before `a` is shuffled and
    restored before `b` is shuffled, so both shuffles draw the same random
    sequence. Nothing is returned; both arguments are modified in place
    (a NumPy slice passed in is a view, so its parent array is reordered too).

    Params:
        a, b: arrays of equal length.
    Raises:
        ValueError: if the lengths differ, because the two shuffles would then
            no longer correspond to each other.
    """
    if len(a) != len(b):
        raise ValueError(
            f"cannot shuffle in unison: lengths differ ({len(a)} != {len(b)})"
        )
    state = np.random.get_state()
    np.random.shuffle(a)
    # Rewind the generator so `b` receives exactly the permutation `a` got
    np.random.set_state(state)
    np.random.shuffle(b)

In [62]:
# This cell and the ones after it are the unrolled body of what used to be
#   load_data(ticker, n_steps, scale, shuffle, lookup_step, split_by_date,
#             test_size, feature_columns)
# Its parameters are bound here as plain variables. They are taken from the
# configuration constants so that changing a constant actually takes effect
# (previously shuffle, split_by_date and test_size were hard-coded to
# True / True / 0.2 regardless of the configuration).
#
# Loads data from Yahoo Finance, then scales, shuffles and splits it.
# Params:
#     ticker (str/pd.DataFrame): the ticker to load (e.g. AAPL, TSLA) or an
#         already loaded dataframe
#     n_steps (int): the historical sequence length (i.e. window size) used to predict
#     scale (bool): whether to scale prices from 0 to 1
#     shuffle (bool): whether to shuffle the dataset (both training & testing)
#     lookup_step (int): how many rows ahead the value to predict lies
#     split_by_date (bool): whether to split training/testing by date; False
#         splits the dataset randomly
#     test_size (float): ratio of test data, only used by the random split
#     feature_columns (list): the columns fed into the model
n_steps = N_STEPS
scale = SCALE
shuffle = SHUFFLE
lookup_step = LOOKUP_STEP
split_by_date = SPLIT_BY_DATE
test_size = TEST_SIZE
feature_columns = FEATURE_COLUMNS

if isinstance(ticker, str):
    # A symbol: download its history with yahoo_fin, starting in 2000.
    # Full signature for reference:
    #   get_data(ticker, start_date=None, end_date=None, index_as_date=True, interval="1d")
    df = si.get_data(ticker, start_date='2000-01-01')
elif isinstance(ticker, pd.DataFrame):
    # Already loaded, use it directly
    df = ticker
else:
    raise TypeError("ticker can be either a str or a `pd.DataFrame` instances")

In [63]:
# Extend the price frame with the technical-indicator columns.
# Note: rerunning this cell recomputes the indicators on the already trimmed
# frame; rerun from the loading cell instead.
df = get_technical_indicators(df)
# Disabled experiment: join macro series fetched from FRED
# df_fred = get_fred()
# df = pd.concat([df_fred , df], axis=1)

# Empty strings count as missing values; keep only complete rows
df = df.replace('', np.nan).dropna()

In [64]:
# `result` gathers everything the data-preparation cells produce.
# Its first entry is a snapshot of the dataframe as it is at this point,
# taken before the cells below add helper columns and drop rows.
result = {'df': df.copy()}

In [65]:
# Every requested feature has to be a column of the dataframe
for col in feature_columns:
    assert col in df.columns, f"'{col}' does not exist in the dataframe."

# Expose the index as a regular `date` column (unless one is already there)
if "date" not in df.columns:
    df["date"] = df.index

if scale:
    # Rescale each feature column to the 0..1 range with its own scaler and
    # keep the fitted scalers in `result`
    column_scaler = {}
    for column in feature_columns:
        scaler = preprocessing.MinMaxScaler()
        # the scaler expects a 2-D input, hence the single-column reshape
        column_values = df[column].values.reshape(-1, 1)
        df[column] = scaler.fit_transform(column_values)
        column_scaler[column] = scaler
    result["column_scaler"] = column_scaler

# Target (label): the close price `lookup_step` rows later; the final
# `lookup_step` rows therefore get NaN here
df['future'] = df['close'].shift(-lookup_step)

In [66]:
# Peek at the raw rows (features followed by the date) that the windows are built from
df[[*feature_columns, "date"]].values

array([[1399.050048828125, Timestamp('2000-05-24 00:00:00')],
       [1381.52001953125, Timestamp('2000-05-25 00:00:00')],
       [1378.02001953125, Timestamp('2000-05-26 00:00:00')],
       ...,
       [4701.4599609375, Timestamp('2021-11-24 00:00:00')],
       [4594.6201171875, Timestamp('2021-11-26 00:00:00')],
       [4655.27001953125, Timestamp('2021-11-29 00:00:00')]], dtype=object)

In [67]:
# The final `lookup_step` rows have NaN in the `future` column; set their
# feature values aside before those rows are dropped.
# Note: this cell is not idempotent - `df` is trimmed in place below, so a
# second run would pick up different tail rows.
last_sequence = np.array(df[feature_columns].tail(lookup_step))

# Drop the rows that contain NaNs
df.dropna(inplace=True)

# Slide a window of `n_steps` rows over the data. As soon as the window is
# full, every row yields one sample: [window of (features + date), target of
# the window's last row].
sequence_data = []
sequences = deque(maxlen=n_steps)
window_rows = df[feature_columns + ["date"]].values
targets = df['future'].values

for row_idx in range(len(targets)):
    sequences.append(window_rows[row_idx])
    if len(sequences) != n_steps:
        continue
    sequence_data.append([np.array(sequences), targets[row_idx]])

In [68]:
# `last_sequence` = feature part of the final window (`n_steps` rows) followed
# by the `lookup_step` rows set aside earlier, e.g. 50 + 10 = 60 rows for
# n_steps=50 and lookup_step=10. It is used to predict prices that lie beyond
# the dataset.
n_features = len(feature_columns)
window_features = [row[:n_features] for row in sequences]
last_sequence = np.array(window_features + list(last_sequence)).astype(np.float32)
result['last_sequence'] = last_sequence

# Separate the collected samples into inputs (X) and targets (y)
X = [seq for seq, _ in sequence_data]
y = [target for _, target in sequence_data]

In [69]:
# Turn the sample lists into numpy arrays
X, y = np.array(X), np.array(y)

In [70]:
# import datetime
#
# # train_samples = int((1 - test_size) * len(X))
# train_samples = len(X) - (datetime.date.today() - datetime.date(2021, 1, 1)).days
# print(len(X))
# train_samples


In [71]:
# import datetime

# print((datetime.date.today() - datetime.date(2021, 1, 1)).days)

# train_samples = (datetime.date.today() - datetime.date(2021, 1, 1)).days
# train_samples

In [72]:
if not split_by_date:
    # Random split of the samples
    random_split = train_test_split(X, y, test_size=test_size, shuffle=shuffle)
    result["X_train"], result["X_test"], result["y_train"], result["y_test"] = random_split
else:
    # Chronological split: the test set gets as many trailing samples as the
    # snapshot dataframe has rows dated 2021-01-01 or later (`test_size` is
    # not used on this path)
    # train_samples = int((1 - test_size) * len(X))
    train_samples = len(X) - len(result['df'].loc['2021-01-01':])
    result.update(
        X_train=X[:train_samples],
        y_train=y[:train_samples],
        X_test=X[train_samples:],
        y_test=y[train_samples:],
    )
    if shuffle:
        # Shuffle each subset on its own. The shuffle works in place and the
        # subsets are slices of X / y, so X and y get reordered as well.
        for subset in ("train", "test"):
            shuffle_in_unison(result[f"X_{subset}"], result[f"y_{subset}"])

In [79]:
# Debug output of the full sample array. The windows do not appear in
# chronological order here: the train/test subsets are slices (views) of X and
# were shuffled in place by the split cell.
print(X)

[[[1436.1099853515625 Timestamp('2007-03-23 00:00:00')]
  [1437.5 Timestamp('2007-03-26 00:00:00')]
  [1428.6099853515625 Timestamp('2007-03-27 00:00:00')]
  ...
  [1530.6199951171875 Timestamp('2007-05-31 00:00:00')]
  [1536.3399658203125 Timestamp('2007-06-01 00:00:00')]
  [1539.1800537109375 Timestamp('2007-06-04 00:00:00')]]

 [[887.3400268554688 Timestamp('2003-01-23 00:00:00')]
  [861.4000244140625 Timestamp('2003-01-24 00:00:00')]
  [847.47998046875 Timestamp('2003-01-27 00:00:00')]
  ...
  [858.47998046875 Timestamp('2003-04-01 00:00:00')]
  [880.9000244140625 Timestamp('2003-04-02 00:00:00')]
  [876.4500122070312 Timestamp('2003-04-03 00:00:00')]]

 [[1313.0 Timestamp('2006-09-12 00:00:00')]
  [1318.0699462890625 Timestamp('2006-09-13 00:00:00')]
  [1316.280029296875 Timestamp('2006-09-14 00:00:00')]
  ...
  [1399.760009765625 Timestamp('2006-11-16 00:00:00')]
  [1401.199951171875 Timestamp('2006-11-17 00:00:00')]
  [1400.5 Timestamp('2006-11-20 00:00:00')]]

 ...

 [[4204.109

In [73]:
# result["X_train"]

In [74]:
# The date of each test sample is the last column of its window's last row.
# Note: this cell must run exactly once per split - afterwards the date column
# is gone from X_test and this lookup would read prices instead.
dates = result["X_test"][:, -1, -1]

# Pull the matching rows from the snapshot dataframe, keeping only the first
# occurrence of every date
test_rows = result["df"].loc[dates]
result["test_df"] = test_rows[~test_rows.index.duplicated(keep='first')]

# Strip the date column from the model inputs and convert them to float32
n_features = len(feature_columns)
for split_key in ("X_train", "X_test"):
    result[split_key] = result[split_key][:, :, :n_features].astype(np.float32)

In [80]:
# Rows of the snapshot dataframe dated 2021-01-01 or later; the split cell uses
# the length of this same slice as the size of the test set
result['df'].loc['2021-01-01':]

Unnamed: 0,open,high,low,close,adjclose,volume,ticker,ma7,ma21,ma100,ma50,26ema,12ema,MACD,ema,momentum
2021-01-04,3764.610107,3769.989990,3662.709961,3700.649902,3700.649902,5006680000,^GSPC,3720.604318,3698.381441,3497.669902,3590.778994,3676.304286,3710.918349,34.614063,3716.220706,3699.649902
2021-01-05,3698.020020,3737.830078,3695.070068,3726.860107,3726.860107,4582620000,^GSPC,3725.868617,3701.245257,3501.135002,3596.246396,3680.049162,3713.370928,33.321766,3723.313640,3725.860107
2021-01-06,3712.199951,3783.040039,3705.340088,3748.139893,3748.139893,6049970000,^GSPC,3732.308594,3703.579532,3504.882102,3601.901396,3685.092920,3718.719999,33.627079,3739.864475,3747.139893
2021-01-07,3764.709961,3811.550049,3764.709961,3803.790039,3803.790039,5080870000,^GSPC,3742.084298,3708.904774,3509.191501,3609.957798,3693.885299,3731.807698,37.922399,3782.481518,3802.790039
2021-01-08,3815.050049,3826.689941,3783.600098,3824.679932,3824.679932,4764180000,^GSPC,3756.032854,3714.734770,3513.618401,3618.637798,3703.573790,3746.095734,42.521943,3810.613794,3823.679932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-11-22,4712.000000,4743.830078,4682.169922,4682.939941,4682.939941,3206280000,^GSPC,4691.522810,4652.419968,4471.426069,4515.723359,4624.065500,4674.788751,50.723251,4688.152938,4681.939941
2021-11-23,4678.479980,4699.390137,4652.660156,4690.700195,4690.700195,3428780000,^GSPC,4692.644252,4658.335217,4474.809673,4520.676367,4629.001403,4677.236665,48.235262,4689.851110,4689.700195
2021-11-24,4675.779785,4702.870117,4659.890137,4701.459961,4701.459961,2464040000,^GSPC,4695.309989,4664.367118,4478.388872,4525.091562,4634.368704,4680.963326,46.594622,4697.590344,4700.459961
2021-11-26,4664.629883,4664.629883,4585.430176,4594.620117,4594.620117,2676740000,^GSPC,4680.127162,4666.411877,4480.753774,4527.508965,4631.424364,4667.679756,36.255392,4628.943526,4593.620117


In [81]:
# Write the test-set rows to an Excel workbook in the working directory
test_df_path = "test_df_2.xlsx"
result["test_df"].to_excel(test_df_path)