# Predicting the direction of the Stock Market

In [18]:
# Import statements
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, roc_auc_score, multilabel_confusion_matrix
from tensorflow.keras import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.models import load_model
%matplotlib inline

In [104]:
class ModelLoader:
    def __init__(self, ticker):
        self.model_name = ticker + '-model'
        self.create_model(ticker)
        print(self.model.summary())
    
    def create_model(self, ticker):
        filename = ''
        for file in os.listdir('../../data'):
            if file.split('_')[0] == ticker:
                filename = file
        filename = '../../data/' + filename
        _, self.train, self.test = self.prep_data(filename)
        self.batches = 32
        neurons = 32
        nb_epochs = 100
        train_trimmed = self.train[len(self.train)%self.batches:]
        test_trimmed = self.test[:-(len(self.test) % self.batches)] if len(self.test) % self.batches != 0 else self.test
        for file in os.listdir('../models/'):
            if file.split('-')[0] == ticker:
                self.model = load_model('../models/' + file)
                return
        self.compile_model()
        self.fit_lstm(train_trimmed, nb_epochs, neurons)
        self.model.save('../models/' + self.model_name + '.h5')
        return
        
    def retrain(self, batches = 32, neurons = 32, nb_epochs = 100):
        self.batches = batches
        train_trimmed = self.train[len(self.train)%self.batches:]
        self.model = None
        self.fit_lstm(train_trimmed, nb_epochs, neurons)
        print(self.model.summary())
        self.model.save('../models/' + self.model_name + '.h5')
    
    def evaluate(self, batches = 32):
        self.model.reset_states()
        train_trimmed = self.train[len(self.train)%batches:]
        test_trimmed = self.test[:-(len(self.test) % batches)] if len(self.test) % batches != 0 else self.test
        # forecast the training set to set state
        train_reshaped = train_trimmed[:,:-5].reshape(len(train_trimmed), 1, -1)
        self.model.predict(train_reshaped, batch_size=batches, verbose=0)
        # now forecast the test set
        test_reshaped = test_trimmed[:,:-5].reshape(len(test_trimmed), 1, -1)
        ytrue = test_trimmed[:,-5:]
        yhat = self.model.predict(test_reshaped, batch_size=batches, verbose=0)
        labels = 'horrid poor neutral good great'.split()
        pred_class = np.argmax(yhat, axis=1)
        preds = np.zeros((len(pred_class), 5))
        preds[np.arange(pred_class.size), pred_class] = 1
        print(classification_report(ytrue, preds, target_names=labels))

    def fit_lstm(self, train, nb_epoch, neurons, label_size=5):
        X, y = train[:,:-label_size], train[:,-label_size:]
        X = X.reshape(X.shape[0], 1, X.shape[1])
        # prepare a model
        if not self.model or isinstance(self.model, Sequential):
            self.compile_model(neurons)
        
        for i in range(nb_epoch):
            if not i % 10:
                print('%d/%d' % (i+1, nb_epoch), end='')
            else:
                print('.', end='')
            self.model.fit(X, y, epochs=1, batch_size=self.batches, verbose=0, shuffle=False)
            self.model.reset_states()
        print()
    
    def predict(self):
        self.model.reset_states()
        test_trimmed = self.test[(len(self.test) % self.batches):] if len(self.test) % self.batches != 0 else self.test
        test_reshaped = test_trimmed[:,:-5].reshape(len(test_trimmed), 1, -1)
        yhat = self.model.predict(test_reshaped, batch_size=self.batches)
        print('five days from %s probabilities:' % (self.data.columns[-1]))
        print('fall:%.3f, dip:%.3f, stay:%.3f, rise:%.3f, rocket:%.3f'
              %(yhat[-1,0], yhat[-1,1], yhat[-1,2], yhat[-1,3], yhat[-1,4]))
    
    def compile_model(self, neurons = 32, label_size=5):
        batch_size = self.batches
        X= self.train[:,:-label_size]
        X = X.reshape(X.shape[0], 1, X.shape[1])
        
        self.model = Sequential()
        self.model.add(LSTM(neurons,
                       batch_input_shape=(batch_size, X.shape[1], X.shape[2]),
                       stateful=True,
                       return_sequences=True,
                       dropout=.2))
        self.model.add(LSTM(neurons,
                       batch_input_shape=(batch_size, X.shape[1], X.shape[2]),
                       stateful=True,
                       dropout=.2))
        self.model.add(Dense(5, activation='softmax'))
        self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.Precision(),
                                                                                       tf.keras.metrics.Recall(),
                                                                                       'AUC'])
    
    def read_data(self, filename):
        # read in the data
        data = pd.read_csv(filename)
        data = data.rename(str.lower, axis=1)
        data = data.rename(mapper={'adj close': 'adj_close'}, axis=1)
        data.date = pd.to_datetime(data.date)
        data = data.set_index('date')
        return data

    # helper function for one of our technical indicators
    def _calc_rsi(self, df, periods = 14, ema = True):
        """
        Returns a pd.Series with the relative strength index.
        """
        close_delta = df.adj_close.diff()

        # Make two series: one for lower closes and one for higher closes
        up = close_delta.clip(lower=0)
        down = -1 * close_delta.clip(upper=0)

        if ema == True:
            # Use exponential moving average
            ma_up = up.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
            ma_down = down.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
        else:
            # Use simple moving average
            ma_up = up.rolling(window = periods, adjust=False).mean()
            ma_down = down.rolling(window = periods, adjust=False).mean()

        rsi = ma_up / ma_down
        rsi = 100 - (100/(1 + rsi))
        return rsi
    
    def add_features(self, df):
        # accumulation/Distribution line
        mult = ((df.close - df.low) - (df.high - df.close)) / (df.high - df.low)
        MFVolume = mult * df.volume
        accum_dist_indicator = MFVolume.cumsum()
        ret_df = pd.concat([df, accum_dist_indicator], axis=1)
        ret_df = ret_df.rename(mapper={0:'accum_dist_indicator'}, axis=1)

        #MACD
        EMA_12 = df.adj_close.ewm(span=12, adjust=False).mean()
        EMA_26 = df.adj_close.ewm(span=26, adjust=False).mean()
        macd = EMA_12 - EMA_26
        signal = macd.ewm(span=9, adjust=False).mean()
        ret_df = pd.concat([ret_df, macd.rename('macd'), signal.rename('signal_macd')], axis=1)

        #RSI
        rsi = self._calc_rsi(df)
        ret_df = pd.concat([ret_df, rsi.rename('rsi')], axis=1)

        return ret_df

    def _class_separation(self, x):
        if x < -.05:
            return -2
        elif x < -.005:
            return -1
        elif x > .05:
            return 2
        elif x > .005:
            return 1
        else:
            return 0

    def prep_data(self, filename, lookback=14):
        temp = np.array([0,1,2,-1,-2]).reshape(-1,1)
        # read in the data
        data = self.read_data(filename)
        self.data = data
        # add technical indicators as features
        data = self.add_features(data)
        # frame as an RNN problem
        data = self.series_to_supervised(data, lookback, 5, data.columns, data.index, [4])
        # only keep the predictive columns I care about
        data = data.drop(data.columns[-5:-1], axis=1)
        labels = (data['adj_close(t+4)'] - data['adj_close(t-1)']) / data['adj_close(t-1)']
        labels = labels.apply(self._class_separation)
        data = data.drop(data.columns[-1], axis=1)
        data = pd.concat([data, labels.rename('labels')], axis=1)
        data_values = data.values
        train, test = self.split_data(data_values, .2)
        scaler, train_scaled, test_scaled = self.scale(train[:,:-1], test[:,:-1])
        ohe = OneHotEncoder(sparse=False).fit(temp)
        train_scaled = np.append(train_scaled, ohe.transform(train[:,-1].reshape((-1,1))), axis=1)
        test_scaled = np.append(test_scaled, ohe.transform(test[:,-1].reshape((-1,1))), axis=1)
        return scaler, train_scaled, test_scaled
    
    def series_to_supervised(self, data, n_in=1, n_out=1, col_names = [], indicies = [], preds = [], dropnan=True):
        '''
        Convert a time series to a supervised learning dataset
        Args:
            data -> time series to convert as a list or numpy array
            n_in -> number of lag observations as input (X)
            n_out -> number of observations as output (y)
            col_names -> names of the columns
            indicies -> list of the indicies
            preds -> list of column indicies to determine which variables to predict
            dropnan -> flag of whether to drop the rows with NaN
        Returns:
            Pandas DataFrame of series framed for supervised learning
        '''
        n_vars = 1 if type(data) is list else data.shape[1]
        df = pd.DataFrame(data)
        cols, names = list(), list()
        # input sequence (t-n, ... t-1)
        for i in range(n_in, 0, -1):
            cols.append(df.shift(i))
            names += [('%s(t-%d)' % (col_names[j], i)) for j in range(n_vars)]
        # forecast sequence
        for i in range(0, n_out):
            cols.append(df[col_names[preds]].shift(-i))
            if i==0:
                names += [('%s(t)' % (col_names[j])) for j in preds]
            else:
                names += [('%s(t+%d)' % (col_names[j], i)) for j in preds]
        # putting it together
        agg = pd.concat(cols, axis=1)
        agg.columns = names
        agg.index = indicies
        if dropnan:
            agg.dropna(inplace=True)
        return agg

    def scale(self, in_train, in_test, with_labels=False):
        '''
        Rescales the train and test sets
        Args:
            train -> numpy array of the training data
            test -> numpy array of the test data
            with_labels -> if set to true will cut off last column before scaling,
                reattached after scaling
        Returns:
            scaler -> the scaler object for transforming
            train_scaled -> a rescaled version of the train data
            test_scaled -> a rescaled version of the test data
        '''
        train = in_train
        test = in_test
        if with_labels:
            train_labels = train[:,-1]
            train = train[:,:-1]
            test_labels = test[:,-1]
            test = test[:,:-1]
        # scale train and test to [-1,1]
        scaler = MinMaxScaler(feature_range=(-1,1))
        scaler = scaler.fit(train)
        # transform train
        train = train.reshape(train.shape[0], train.shape[1])
        train_scaled = scaler.transform(train)
        # transform test
        test = test.reshape(test.shape[0], test.shape[1])
        test_scaled = scaler.transform(test)
        if with_labels:
            train_scaled.append(train_labels, axis=1)
            test_scaled.append(test_labels, axis=1)
        return scaler, train_scaled, test_scaled

    def split_data(self, data, test_percent):
        '''
        Splits the data by percentage amount
        Returns: train, test
        '''
        split_val = int(len(data) * (1 - test_percent))
        train, test = data[:split_val], data[split_val:]
        return train, test

In [105]:
my_model = ModelLoader('SPY')

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_8 (LSTM)               (32, 1, 32)               22144     
                                                                 
 lstm_9 (LSTM)               (32, 32)                  8320      
                                                                 
 dense_4 (Dense)             (32, 5)                   165       
                                                                 
Total params: 30,629
Trainable params: 30,629
Non-trainable params: 0
_________________________________________________________________
None


In [76]:
my_model.evaluate()

              precision    recall  f1-score   support

      horrid       0.00      0.00      0.00        42
        poor       0.24      0.23      0.24       349
     neutral       0.28      0.56      0.38       351
        good       0.46      0.27      0.34       700
       great       0.05      0.03      0.04        30

   micro avg       0.32      0.32      0.32      1472
   macro avg       0.21      0.22      0.20      1472
weighted avg       0.35      0.32      0.31      1472
 samples avg       0.32      0.32      0.32      1472



In [91]:
my_model.retrain()

1/100.........11/100.........21/100.........31/100.........41/100.........51/100.........61/100.........71/100.........81/100.........91/100.........
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_8 (LSTM)               (32, 1, 32)               22144     
                                                                 
 lstm_9 (LSTM)               (32, 32)                  8320      
                                                                 
 dense_4 (Dense)             (32, 5)                   165       
                                                                 
Total params: 30,629
Trainable params: 30,629
Non-trainable params: 0
_________________________________________________________________
None


In [106]:
my_model.predict()

five days from volume probabilities:
fall:0.172, dip:0.149, stay:0.299, rise:0.137, rocket:0.243
