In [1]:
data_root = "../../data/"
#stats stuff
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf

# ML stuff
import numpy as np
from numpy.fft import *
import torch
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
import pandas as pd
import lightgbm as lgb

# DL stuff
from torch.autograd import Variable
from fastprogress import master_bar, progress_bar
import torch
import torch.nn as nn
from torch.utils.data import Dataset


# plotting
import matplotlib.pyplot as plt
import seaborn as sns


# basic stuff
import datetime
import requests
import io
from collections import Counter



## Data loading and preprocessing functions

In [22]:
# set index as datetime
def date_index_nasdaq(nasdaq):
    nasdaq_c = nasdaq.copy()
    dates = pd.to_datetime(nasdaq_c.Date)
    nasdaq_c.set_index(dates, inplace=True)
    # set date as index
    nasdaq_c.drop("Date", axis=1, inplace=True)
    nasdaq_c = nasdaq_c["2012-05-18":]
    return nasdaq_c

############## REINDEX FUNCTION AND PREPARE_STOCK FUNCTION ARE PRETTY MUCH SAME, HOWEVER, I PREFER THE PRIOR ##################
# for ARIMA or some shit    
def reindex(df):
    return df.reindex(pd.date_range(df.index[0], df.index[-1])).fillna(method="ffill")

# for prepare_stock
def date_range_df(start, end, column_name = "Time"):
    date_range = pd.date_range(start, end)
    df = pd.DataFrame(date_range, columns = [column_name])
    df.set_index(column_name, inplace=True)
    return df

# merging with date range df
def prepare_stock(nasdaq, start, end, stock_name="AAPL", drop=True):
    nasdaq = nasdaq.loc[nasdaq["Name"]==stock_name]
    dates = date_range_df(start, end)
    new_nasdaq = dates.merge(nasdaq, how="left", left_index=True, right_index=True)
    if drop:
        new_nasdaq.dropna(inplace=True)
    return new_nasdaq
#############################################################################################################################

# create features volatility, volume, adj close
def get_features(nasdaq):
    #rename Adj Close
    nasdaq.rename(columns={"Adj Close":"Adj_Close"}, inplace=True)
    nasdaq["log_Volatility"] = np.log(nasdaq.High - nasdaq.Low + 1)
    nasdaq["log_Volume"] = np.log(nasdaq.Volume + 1) 
    nasdaq["log_Adj_Close"] = np.log(nasdaq["Adj_Close"] + 1)
    # nasdaq["log_Adj_Close_diff"] = nasdaq["log_Adj_Close"].diff()
    nasdaq.drop(columns = ["Low", "High", "Close", "Open", "Name", "Volume"], inplace=True)
    # nasdaq.dropna(inplace = True)
    return nasdaq

# this will return feature engineered stock dataframe
def get_stock(nasdaq, stock_name="AAPL"):
    nasdaq_c = date_index_nasdaq(nasdaq)
    stock = prepare_stock(nasdaq_c, nasdaq_c.index[0], nasdaq_c.index[-1], stock_name)
    stock = get_features(stock)
    stock.fillna("ffill", inplace=True)
    return stock

# plot heatmap for top stocks
def plot_attribute(nasdaq, using,feature="log_Adj_Close"):
    stocks = pd.DataFrame()
    for name in using:
        stocks[name] = get_stock(nasdaq, name)[feature]
    stocks.dropna(inplace=True)
    stocks.plot()
    plt.show()

####### In the 2 functions below, we are adding weekday however ###########
####### prob we could have done this in like get_stock or something #######
# the main difference between the two is , the prior is just adding weekday at the end,
# whereas the latter function is adding it to every stock
def get_train_df(nasdaq, using, features):
    df_features_arr = reindex(get_stock(nasdaq, using[0])).to_numpy().T
    for name in using[1:]:
        adding = reindex(get_stock(nasdaq, name)).to_numpy().T
        df_features_arr = np.concatenate([df_features_arr, adding])
    df_features_arr = df_features_arr.T

    ## df_features = pd.DataFrame(data=df_features_arr, columns=pd.MultiIndex.from_tuples(zip(col_one, col_two)))
    
    # making columns
    # features must not include weekday here
    if "weekday" in features:
        features.remove("weekday")
    col_one = []
    for element in using:
        for i in range(len(features)):
            col_one.append(element)
    col_two = list(features)*len(using)
    print(len(col_one), len(col_two))
    # scaling 
    scaler = MinMaxScaler((-1, 1))
    scaled = scaler.fit_transform(df_features_arr)
    df_features = pd.DataFrame(data=scaled, columns=pd.MultiIndex.from_tuples(zip(col_one, col_two)))

    df_features.index = pd.date_range("2012-05-18", "2021-09-10")

    day_of_week = np.array(list(map(lambda date: date.weekday(), df_features.index)))
    day_of_week = day_of_week.reshape(-1, 1)
    day_of_week = pd.Series(data=scaler.fit_transform(day_of_week).reshape(-1,), index = df_features.index)
    df_features["weekday"] = day_of_week
    if "weekday" not in features:
        features.append("weekday")

    return df_features, features


# for feeding into network
def get_train_arr(nasdaq, using, features):
    df_features_arr = []
    for name in using:
        arr = reindex(get_stock(nasdaq, name)).to_numpy()
        # scaling for each column, for each stock_df in nasdaq
        scaler = MinMaxScaler(feature_range=(-1, 1))
        arr_scaled = scaler.fit_transform(arr)    

        # adding day of week
        day_of_week = np.array(list(map(lambda date: date.weekday(), pd.date_range("2012-05-18", "2021-09-10"))))
        day_of_week = day_of_week.reshape(-1, 1)
        day_of_week = scaler.fit_transform(day_of_week)
      
        arr_scaled = np.concatenate([arr_scaled, day_of_week], axis=1)

        df_features_arr.append(arr_scaled)


    df_features_arr = np.array(df_features_arr)
    if "weekday" not in features:
        features.append("weekday")
    df_features_arr = df_features_arr.reshape(-1, len(features), 7)

    return df_features_arr, features


def sliding_windows_mutli_features(data, seq_length, target_cols_ids):
    x = []
    y = []

    for i in range((data.shape[0])-seq_length-1):
        #change here after finishing feature engineering process
        _x = data[i:(i+seq_length), :] 
        _y = data[i+seq_length, target_cols_ids] ## column 1 contains the labbel(log_Adj_Close)
        x.append(_x)
        y.append(_y)

    return np.array(x), np.array(y)


In [23]:
#################### LOAD DATA ######################

nasdaq = pd.read_csv(data_root + "NASDAQ_100_Data_From_2010.csv", sep="\t")

features = ['Adj_Close', 'log_Volatility', 'log_Volume', 'log_Adj_Close']
using = ['FB', 'TSLA', 'AAPL', 'AMZN', 'NVDA', 'MSFT', 'GOOGL']
# AAPL(Apple), MSFT(Microsoft), GOOGL(Google), AMZN(Amazon), TSLA(Tesla), FB(Facebook), NVDA(Nvidia)

In [51]:
# this part is sketchy
df, features = get_train_df(nasdaq, using, features)

adj_close_cols_ids = []
log_adj_close_cols_ids = []
volatility_cols_ids = []
volume_cols_ids = []
weekday_col_id = []
count = 0
for col in df.columns:
    if col[1] == "Adj_Close":
        df.drop(col, axis=1, inplace=True)
        count -= 1
    if col[1] == "log_Adj_Close":
        adj_close_cols_ids.append(count)
    if col[1] == "log_Volume":
        volume_cols_ids.append(count)
    if col[1] == "log_Volatility":
        volatility_cols_ids.append(count)
    if col[0] == "weekday":
        weekday_col_id.append(count)
    count += 1
df = df.to_numpy()
x, y = sliding_windows_mutli_features(df, 30, adj_close_cols_ids)

x.shape, y.shape


28 28


((3372, 30, 22), (3372, 7))

In [52]:
# train test split (70:30)
train_size = int(len(y)*0.80)
test_size = len(y) - train_size

dataX = Variable(torch.Tensor(np.array(x)))
dataY = Variable(torch.Tensor(np.array(y)))

trainX = Variable(torch.Tensor(np.array(x[0:train_size])))
trainY = Variable(torch.Tensor(np.array(y[0:train_size])))

testX = Variable(torch.Tensor(np.array(x[train_size:len(x)])))
testY = Variable(torch.Tensor(np.array(y[train_size:len(y)])))

print("train shape is:",trainX.size())
print("train label shape is:",trainY.size())
print("test shape is:",testX.size())
print("test label shape is:",testY.size())

train shape is: torch.Size([2697, 30, 22])
train label shape is: torch.Size([2697, 7])
test shape is: torch.Size([675, 30, 22])
test label shape is: torch.Size([675, 7])


In [66]:
device = torch.device('cpu')

class LSTM2(nn.Module):

    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(LSTM2, self).__init__()
        
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        #self.seq_length = seq_length

        
        # what does the batch_first do
        self.LSTM2 = nn.LSTM(\
            input_size=input_size, 
            hidden_size=hidden_size,
            num_layers=num_layers, 
            batch_first=True,
            dropout = 0.25)
        
        # Linear(in_features, out_features)
        self.fc1 = nn.Linear(hidden_size, 256)                                                                                                                                                                                                                           
        self.bn1 = nn.BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.dp1 = nn.Dropout(0.25)

        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.dp2 = nn.Dropout(0.2)

        self.fc3 = nn.Linear(128, 7)
        self.relu = nn.ReLU()

    def forward(self, x):
        h_1 = Variable(torch.zeros(
            self.num_layers, x.size(0), self.hidden_size).to(device))
        
        c_1 = Variable(torch.zeros(
            self.num_layers, x.size(0), self.hidden_size).to(device))
        
        # Propagate input through LSTM
        _, (hn, cn) = self.LSTM2(x, (h_1, c_1))
        y = hn.view(-1, self.hidden_size)

        final_state = hn.view(self.num_layers, x.size(0), self.hidden_size)[-1]

        x0 = self.fc1(final_state)
        x0 = self.bn1(x0)
        x0 = self.dp1(x0)
        x0 = self.relu(x0)

        x0 = self.fc2(x0)
        x0 = self.bn2(x0)
        x0 = self.dp2(x0)

        x0 = self.relu(x0)
        
        out = self.fc3(x0)                                         
        # out = self.dropout(out)
       
        return out

def init_weights(model):
    for name, param in model.named_parameters():
        nn.init.uniform_(param.data, -0.88, 0.08)

# create a nn class (just-for-fun choice :-) 
class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self,yhat,y):
        return torch.sqrt(self.mse(yhat,y))

In [67]:
###### Parameters #######
num_epochs = 700
learning_rate = 1e-3
input_size = 22 # feature nums
hidden_size = 512
num_layers = 2
num_classes = 7 # because we are using 7 stocks
#########################

In [68]:
trainX.shape

torch.Size([2697, 30, 22])

In [69]:
############################################################################################
############################################################################################
############################### ONLY RUN FOR TRAINING ######################################
############################################################################################
############################################################################################
best_val_loss = 100 
### Init Model
lstm = LSTM2(num_classes, input_size, hidden_size, num_layers)
lstm.to(device)
lstm.apply(init_weights)

### Set Criterion Optimizer and scheduler
criterion = torch.nn.MSELoss().to(device) 
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate, weight_decay=1e-5)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=100, factor=0.5, min_lr=1e-7, eps=1e-08)

#optimizer = torch.optim.SGD(lstm.parameters(), lr=learning_rate)

# Train model
for epoch in progress_bar(range(num_epochs)):
    lstm.train()
    outputs= lstm(trainX.to(device))
    optimizer.zero_grad()
    torch.nn.utils.clip_grad_norm_(lstm.parameters(),1)

    # obtain loss func
    loss = criterion(outputs, trainY.to(device))
    loss.backward()

    scheduler.step(loss)
    optimizer.step()

    #evaluate on test
    lstm.eval()
    valid = lstm(testX.to(device))
    vall_loss = criterion(valid, testY.to(device))

    scheduler.step(vall_loss)

    if vall_loss.cpu().item() < best_val_loss:
         torch.save(lstm.state_dict(), 'best_model.pt')
         print("saved best model epoch:",epoch,"val loss is:",vall_loss.cpu().item())
         best_val_loss = vall_loss.cpu().item()

    if epoch % 50 == 0:
        print(f"Epoch: {epoch}, loss: {loss.cpu().item()}, valid loss:{vall_loss.cpu().item()}")




Epoch: 0, loss: 244.97987365722656, valid loss:256754.15625
Epoch: 50, loss: 137.4081573486328, valid loss:619.5578002929688
Epoch: 100, loss: 81.82657623291016, valid loss:353.4774475097656
Epoch: 150, loss: 51.77170944213867, valid loss:220.6194610595703
Epoch: 200, loss: 33.650917053222656, valid loss:142.38873291015625
saved best model epoch: 243 val loss is: 99.82648468017578
saved best model epoch: 244 val loss is: 99.05354309082031
saved best model epoch: 245 val loss is: 98.2851333618164
saved best model epoch: 246 val loss is: 98.07386779785156
saved best model epoch: 247 val loss is: 97.12197875976562
saved best model epoch: 248 val loss is: 96.6666259765625
saved best model epoch: 249 val loss is: 95.70043182373047
saved best model epoch: 250 val loss is: 95.17382049560547
Epoch: 250, loss: 23.385440826416016, valid loss:95.17382049560547
saved best model epoch: 251 val loss is: 94.39668273925781
saved best model epoch: 252 val loss is: 93.59685516357422
saved best model epo

In [30]:
testX_c = testX.cpu().detach().numpy().copy()

# testX_drop_Volatility = np.delete(testX_c, [0, 3, 6, 9, 12, 15, 18], 2)
# testX_drop_Volume = np.delete(testX_c, [1, 4, 7, 10, 13, 16, 19], 2)
# testX_drop_Adj_Close = np.delete(testX_c, target_cols_ids, 2)
testX_drop_Volatility = testX_c.copy()
testX_drop_Volatility[:,:,volatility_cols_ids] = 0.
testX_drop_Volume = testX_c.copy()
testX_drop_Volume[:,:,volume_cols_ids] = 0.
testX_drop_Adj_Close = testX_c.copy()
testX_drop_Adj_Close[:, :, adj_close_cols_ids] = 0.

testX_drop_Volatility = Variable(torch.Tensor(testX_drop_Volatility))
testX_drop_Volume = Variable(torch.Tensor(testX_drop_Volume))
testX_drop_Adj_Close = Variable(torch.Tensor(testX_drop_Adj_Close))