# LSTM Model

This notebook is used to develop an LSTM model for predicting Dow Jones stocks.  

We will start with the Walmart stock data as an initial test.

### Packages

First, we load important packages.

In [None]:
#import some useful packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

### Data

Now, we can load the data.

In [None]:
#load Walmart Stock Data
filepath = os.path.join('..', 'Resources', 'WMT.csv')
df = pd.read_csv(filepath)
df.head()

We get rid of columns we do not need and set the index as the date column.

In [None]:
#drop unnecessary columns
#name the columns with the columns= keyword: passing the axis positionally (drop(labels, 1))
#is rejected by pandas 2.x, where every argument after labels is keyword-only
df.drop(columns = ['volume', 'unadjustedVolume', 'change', 'changePercent', 'vwap', 'label', 'changeOverTime'],
        inplace = True)
df.head()

In [None]:
#set index
df.set_index('date', inplace = True)
df.head()

In [None]:
#plot close price
df.plot(y='close')

### Create Train and Test Split Data

We need to create a train/test split.  To do so, we will assume we feed sequences of some length and predict at some point in the future.

In [None]:
#save data as a matrix
data = df.values
data.shape

In [None]:
#save sequence length and time in the future
#we will start with 30 days and 5 days in the future (about 1 month and 1 week)
seq_length = 30
fut_point = 5
#number of features = number of columns kept in the data matrix
#(derived from the data so it cannot drift from the dataframe if the dropped columns change)
features = data.shape[1]

In [None]:
#build the X data: one window of seq_length consecutive rows per start index
#the final (seq_length + fut_point) rows never start a window, so each window has a later target row
n_windows = len(data) - seq_length - fut_point
X = np.array([data[start: start + seq_length] for start in range(n_windows)])
X.shape

In [None]:
#get Y data (last column of the data matrix, fut_point rows after the last row of each window)
#window i covers rows i .. i + seq_length - 1, so its target row is i + seq_length - 1 + fut_point;
#starting the slice at (seq_length + fut_point) would target one row further than intended
#the slice is trimmed to len(X) so that X and y stay aligned sample for sample
y = data[(seq_length + fut_point - 1):, -1][:len(X)]
y.shape

In [None]:
#train/test split of 0.85/0.15 (rows before last_row train, rows from last_row on test)
train_split = 0.85
last_row = int(train_split * X.shape[0])
X_train, X_test = X[:last_row], X[last_row:]
y_train, y_test = y[:last_row], y[last_row:]
#show the four shapes in the order X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

### Scale Data
We scale the data using the MinMaxScaler fit for the training data.

In [None]:
#instantiate scalers
X_scaler = MinMaxScaler(feature_range = (-1, 1))
y_scaler = MinMaxScaler(feature_range = (-1, 1))

In [None]:
#reshape data so it can be fit: flatten the windows to 2D (rows x features) and the targets to one column
#the feature count is read from the data instead of being hard-coded to 4
n_features = X_train.shape[-1]
X_train_reshaped = np.reshape(X_train, (-1, n_features))
X_test_reshaped = np.reshape(X_test, (-1, n_features))
y_train_reshaped = np.reshape(y_train, (-1, 1))
y_test_reshaped = np.reshape(y_test, (-1, 1))

In [None]:
#fit scalers
X_scaler.fit(X_train_reshaped)
y_scaler.fit(y_train_reshaped)

In [None]:
#apply the fitted scalers, then restore the original array shapes
X_train_scaled = X_scaler.transform(X_train_reshaped).reshape(X_train.shape)
X_test_scaled = X_scaler.transform(X_test_reshaped).reshape(X_test.shape)
y_train_scaled = y_scaler.transform(y_train_reshaped).reshape(y_train.shape[0])
y_test_scaled = y_scaler.transform(y_test_reshaped).reshape(y_test.shape[0])
X_train_scaled.shape

### LSTM model
Now, we build a basic LSTM network model.

We build several LSTM layers together, adding Dropout layers and a few dense layers to summarize.

In [None]:
#import layers
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM

In [None]:
#create an LSTM model
model = Sequential()

#add first LSTM layer and dropout layer
model.add(LSTM(256, return_sequences = True, input_shape = (seq_length, features)))
model.add(Dropout(0.2))

#add second LSTM layer and dropout layer
model.add(LSTM(256, return_sequences = False))
model.add(Dropout(0.2))

#add a ReLU layer
model.add(Dense(32, activation = 'relu'))

#add a final layer (single linear output: this is a regression model)
model.add(Dense(1, activation = 'linear'))

#compile model
#classification accuracy is meaningless for a continuous target, so track mean absolute error instead
#a metric is kept so model.evaluate still returns a list whose element 0 is the loss
model.compile(loss = 'mse', optimizer = 'adam', metrics = ['mae'])

model.summary()

In [None]:
#fit model
model.fit(X_train_scaled, y_train_scaled, epochs = 100, batch_size = 64, validation_split = 0.15, verbose = 1)

In [None]:
model.save('first_model.h5')

### Load Model and Make Predictions

We load the saved model from disk.

In [None]:
model = load_model('first_model.h5')
model.summary()

Now we get the scores and root mean square errors.

In [None]:
#score models
import math
train_score = model.evaluate(X_train_scaled, y_train_scaled, verbose = 0)
test_score = model.evaluate(X_test_scaled, y_test_scaled, verbose = 0)
train_rmse = math.sqrt(train_score[0])
test_rmse = math.sqrt(test_score[0])
print(f"Training Set- Score: {train_score[0]}, RMSE: {train_rmse}")
print(f"Test Set- Score: {test_score[0]}, RMSE: {test_rmse}")

Now, we make predictions.

In [None]:
#evaluate model on training set and test set
y_train_preds_scaled = model.predict(X_train_scaled)
y_test_preds_scaled = model.predict(X_test_scaled)
y_train_preds_scaled.shape

### Plot results
We now wish to visualize our results.

First, we need to denormalize.

In [None]:
#rescale results
y_train_preds_denormed = y_scaler.inverse_transform(y_train_preds_scaled)
y_test_preds_denormed = y_scaler.inverse_transform(y_test_preds_scaled)

Now, we can reshape to the same shape as the training and test sets.

In [None]:
#reshape results for plotting
y_train_preds = np.reshape(y_train_preds_denormed, y_train.shape[0])
y_test_preds = np.reshape(y_test_preds_denormed, y_test.shape[0])
y_train_preds.shape

Now, we can plot the results.

In [None]:
#create x arrays (just day indices)
days1 = np.arange(len(y_train))
days2 = np.arange(len(y_train), len(y_train) + len(y_test))

In [None]:
#plot
fig, ax = plt.subplots()
ax.plot(days1, y_train, 'b', label = 'Training Set Actual')
ax.plot(days1, y_train_preds, 'r', label = 'Training Set Predictions')
ax.plot(days2, y_test, 'k', label = 'Test Set Actual')
ax.plot(days2, y_test_preds, 'g', label = 'Test Set Predictions')
ax.legend()
ax.set_title('Walmart Stock Predictions')
ax.set_xlabel('Day Index')
ax.set_ylabel('Closing Price')
plt.show()

We can also plot only the test set.

In [None]:
#plot test set only
fig2, ax2 = plt.subplots()
ax2.plot(days2, y_test, 'k', label = 'Test Set Actual')
ax2.plot(days2, y_test_preds, 'g', label = 'Test Set Predictions')
ax2.legend()
ax2.set_title('Walmart Test Set Predictions')
ax2.set_xlabel('Day Index')
ax2.set_ylabel('Closing Price')
plt.show()

### Functions for Arbitrary Size, Number of Neurons, and Future Time Point
We wish to create functions that allow for some more arbitrary settings.

In [None]:
#function to create training and test data from a dataframe
def train_test_splitter(df, seq_length, fut_point, train_split):
    """Build sliding-window samples from ``df`` and split them in row order.

    Each sample is a window of ``seq_length`` consecutive rows (all columns).
    Its target is the value of the target column exactly ``fut_point`` rows
    after the last row of the window.

    The target column is ``'close'`` when the dataframe has a column with that
    name; otherwise the last column is used. Selecting by name keeps the target
    on the closing price when extra feature columns follow it.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns only; rows are used in their existing order.
    seq_length : int
        Number of rows per input window (>= 1).
    fut_point : int
        Number of rows between the end of a window and its target (>= 1).
    train_split : float
        Fraction (between 0 and 1) of the samples placed in the training set.

    Returns
    -------
    X_train, X_test : numpy.ndarray
        Windows, shaped (samples, seq_length, features) when at least one
        window exists.
    y_train, y_test : numpy.ndarray
        Targets aligned with the windows, shaped (samples,).

    Raises
    ------
    ValueError
        If ``seq_length`` or ``fut_point`` is smaller than 1.
    """
    if seq_length < 1 or fut_point < 1:
        raise ValueError("seq_length and fut_point must both be at least 1")

    #save data as a matrix
    data = df.values

    #locate the target column: the closing price when present, otherwise the last column
    if 'close' in df.columns:
        target_col = df.columns.get_loc('close')
    else:
        target_col = data.shape[1] - 1

    #window i covers rows i .. i + seq_length - 1 and its target is row i + seq_length - 1 + fut_point,
    #so the last usable start index is len(data) - seq_length - fut_point
    n_samples = max(len(data) - seq_length - fut_point + 1, 0)

    #get X data (one window per start index)
    X = np.array([data[start: start + seq_length] for start in range(n_samples)])

    #get y data (target column, aligned so that y[i] belongs to window i)
    y = data[(seq_length + fut_point - 1):, target_col]

    #create train/test splits using chosen training split (between 0 and 1)
    last_row = int(train_split * X.shape[0])
    X_train = X[:last_row]
    X_test = X[last_row:]
    y_train = y[:last_row]
    y_test = y[last_row:]
    return X_train, X_test, y_train, y_test

In [None]:
#test function with 180 days sequence and 80 days future point
X_train2, X_test2, y_train2, y_test2 = train_test_splitter(df, 180, 80, 0.85)

In [None]:
#function to create scaled data and scalers
def create_scalers_and_normalize(X_train, X_test, y_train, y_test):
    """Scale features and targets to (-1, 1) with scalers fit on the training data only.

    X_train and X_test are indexed as 3D arrays (the feature count is read
    from axis 2); y_train and y_test are reshaped to a single column for
    scaling and returned as 1D arrays.

    Returns the four scaled arrays (same shapes as the inputs) followed by
    the fitted X scaler and y scaler.
    """
    n_features = X_train.shape[2]

    #scalers need 2D input: flatten windows to (rows, features) and targets to one column
    X_train_2d = np.reshape(X_train, (-1, n_features))
    X_test_2d = np.reshape(X_test, (-1, n_features))
    y_train_2d = np.reshape(y_train, (-1, 1))
    y_test_2d = np.reshape(y_test, (-1, 1))

    #fit on the training portion only, so the test data does not influence the scaling
    X_scaler = MinMaxScaler(feature_range = (-1, 1)).fit(X_train_2d)
    y_scaler = MinMaxScaler(feature_range = (-1, 1)).fit(y_train_2d)

    #transform, then restore the original shapes
    X_train_scaled = X_scaler.transform(X_train_2d).reshape(X_train.shape)
    X_test_scaled = X_scaler.transform(X_test_2d).reshape(X_test.shape)
    y_train_scaled = y_scaler.transform(y_train_2d).reshape(y_train.shape[0])
    y_test_scaled = y_scaler.transform(y_test_2d).reshape(y_test.shape[0])

    return X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled, X_scaler, y_scaler

In [None]:
X_train_scaled2, X_test_scaled2, y_train_scaled2, y_test_scaled2, X_scaler2, y_scaler2 = create_scalers_and_normalize(
    X_train2, X_test2, y_train2, y_test2)

In [None]:
#create an LSTM model with different neuron sizes
def create_generic_LSTM_model(neurons, dropout, seq_length, features):
    """Build and compile a two-layer LSTM regression model.

    Parameters
    ----------
    neurons : sequence of three ints
        Units for the first LSTM layer, the second LSTM layer, and the
        ReLU dense layer, in that order.
    dropout : float
        Dropout rate applied after each LSTM layer.
    seq_length : int
        Number of time steps per input window.
    features : int
        Number of features per time step.

    Returns
    -------
    The compiled Sequential model (MSE loss, Adam optimizer, MAE metric).
    """
    #create an LSTM model
    model = Sequential()

    #add first LSTM layer and dropout layer
    #the dropout rate comes from the argument; a hard-coded 0.2 here would make dropout tuning a no-op
    model.add(LSTM(neurons[0], return_sequences = True, input_shape = (seq_length, features)))
    model.add(Dropout(dropout))

    #add second LSTM layer and dropout layer
    model.add(LSTM(neurons[1], return_sequences = False))
    model.add(Dropout(dropout))

    #add a ReLU layer
    model.add(Dense(neurons[2], activation = 'relu'))

    #add a final layer (single linear output: this is a regression model)
    model.add(Dense(1, activation = 'linear'))

    #compile model
    #classification accuracy is meaningless for a continuous target, so track mean absolute error instead
    #a metric is kept so model.evaluate still returns a list whose element 0 is the loss
    model.compile(loss = 'mse', optimizer = 'adam', metrics = ['mae'])

    return model

In [None]:
#test function
new_model = create_generic_LSTM_model([256, 256, 32], 0.2, 180, 4)

In [None]:
#save model
new_model.save('second_model.h5')

In [None]:
#load model
new_model = load_model('second_model.h5')

In [None]:
#function to evaluate score and return predictions for a given model path
import math
def make_preds(model_path, X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled, y_scaler):
    """Load a saved model, print its scores, and return de-normalized predictions.

    The model at ``model_path`` is evaluated on the scaled training and test
    data. Element 0 of each evaluation result is printed as the score,
    together with its square root (labelled RMSE). Predictions are mapped
    back through ``y_scaler`` and flattened to 1D.

    Returns (y_train_preds, y_test_preds, train_score, test_score), where the
    scores are the raw results of ``model.evaluate``.
    """
    model = load_model(model_path)

    #score the model on both sets before printing anything
    train_score = model.evaluate(X_train_scaled, y_train_scaled, verbose = 0)
    test_score = model.evaluate(X_test_scaled, y_test_scaled, verbose = 0)
    train_rmse, test_rmse = math.sqrt(train_score[0]), math.sqrt(test_score[0])
    print(f"Training Set- Score: {train_score[0]}, RMSE: {train_rmse}")
    print(f"Test Set- Score: {test_score[0]}, RMSE: {test_rmse}")

    #predict on the scaled inputs
    train_raw = model.predict(X_train_scaled)
    test_raw = model.predict(X_test_scaled)

    #undo the target scaling and flatten to one value per sample for plotting
    y_train_preds = np.reshape(y_scaler.inverse_transform(train_raw), len(y_train_scaled))
    y_test_preds = np.reshape(y_scaler.inverse_transform(test_raw), len(y_test_scaled))

    return y_train_preds, y_test_preds, train_score, test_score

In [None]:
#test function
y_train_preds, y_test_preds, train_score, test_score =  make_preds('first_model.h5', X_train_scaled, 
                                                                   X_test_scaled, y_train_scaled, y_test_scaled, 
                                                                   y_scaler)

In [None]:
#function to create, compile, fit a model, and make predictions
def fit_generic_LSTM_model(df, seq_length, fut_point, train_split, neurons, dropout, epochs, batch_size, 
                           validation_split, model_path):
    """Run the full pipeline: split, scale, build, fit, save, and predict.

    The dataframe is windowed and split with ``train_test_splitter``, scaled
    with ``create_scalers_and_normalize``, and used to fit a model from
    ``create_generic_LSTM_model``. The fitted model is saved to
    ``model_path`` and then scored through ``make_preds``, which reloads it
    from that path.

    Returns (y_train, y_test, y_train_preds, y_test_preds, train_score,
    test_score); the targets are the unscaled values from the split.
    """
    #windowed, unscaled data
    X_train, X_test, y_train, y_test = train_test_splitter(df, seq_length, fut_point, train_split)

    #feature count comes from the last axis of the windows
    n_features = X_train.shape[2]

    #scaled data; only the y scaler is needed later, to de-normalize predictions
    scaled = create_scalers_and_normalize(X_train, X_test, y_train, y_test)
    X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled, X_scaler, y_scaler = scaled

    #build and fit
    model = create_generic_LSTM_model(neurons, dropout, seq_length, n_features)
    model.fit(X_train_scaled, y_train_scaled, epochs = epochs, batch_size = batch_size,
              validation_split = validation_split, verbose = 1)

    #persist the fitted model, then score it and predict via the saved file
    model.save(model_path)
    results = make_preds(model_path, X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled, y_scaler)
    y_train_preds, y_test_preds, train_score, test_score = results

    #return necessary variables to create predictions
    return y_train, y_test, y_train_preds, y_test_preds, train_score, test_score

In [None]:
#test function: 180-day sequence, 80-day future point
seq_length, fut_point = 180, 80
train_split, validation_split = 0.85, 0.15
neurons = [256, 256, 32]
dropout = 0.2
epochs, batch_size = 100, 64
model_path = 'third_model.h5'
y_train3, y_test3, y_train_preds3, y_test_preds3, train_score3, test_score3 = fit_generic_LSTM_model(
    df, seq_length, fut_point, train_split, neurons, dropout, epochs, batch_size, validation_split, model_path)

In [None]:
#function to plot results
def make_results_plot(y_train, y_test, y_train_preds, y_test_preds, title = 'Walmart Stock Predictions'):
    """Plot actual versus predicted values for the training and test sets.

    The x axis is a running sample index: the training samples come first and
    the test samples continue from where the training samples end.

    Parameters
    ----------
    y_train, y_test : array-like
        Actual target values for the training and test sets.
    y_train_preds, y_test_preds : array-like
        Predicted values, the same lengths as ``y_train`` and ``y_test``.
    title : str, optional
        Figure title. Defaults to the Walmart title so existing calls are
        unchanged; pass another title when plotting a different stock.
    """
    #create x arrays (just day indices); the test indices continue after the training indices
    days1 = np.arange(len(y_train))
    days2 = np.arange(len(y_train), len(y_train) + len(y_test))

    #plot
    fig, ax = plt.subplots()
    ax.plot(days1, y_train, 'b', label = 'Training Set Actual')
    ax.plot(days1, y_train_preds, 'r', label = 'Training Set Predictions')
    ax.plot(days2, y_test, 'k', label = 'Test Set Actual')
    ax.plot(days2, y_test_preds, 'g', label = 'Test Set Predictions')
    ax.legend()
    ax.set_title(title)
    ax.set_xlabel('Day Index')
    ax.set_ylabel('Closing Price')
    plt.show()

In [None]:
#test function
y_train_preds3, y_test_preds3, train_score3, test_score3 = make_preds('third_model.h5', 
                                                                                         X_train_scaled2, 
                                                                                         X_test_scaled2, 
                                                                                         y_train_scaled2, 
                                                                                         y_test_scaled2, y_scaler2)
make_results_plot(y_train2, y_test2, y_train_preds3, y_test_preds3)

### Time Interval Testing

One day future point, 20 days past information.

In [None]:
#test function: 20-day sequence, 1-day future point
seq_length, fut_point = 20, 1
train_split, validation_split = 0.85, 0.15
neurons = [256, 256, 32]
dropout = 0.2
epochs, batch_size = 100, 64
model_path = 'first_one_day_model.h5'
y_train5, y_test5, y_train_preds5, y_test_preds5, train_score5, test_score5 = fit_generic_LSTM_model(
    df, seq_length, fut_point, train_split, neurons, dropout, epochs, batch_size, validation_split, model_path)

In [None]:
#plot
make_results_plot(y_train5, y_test5, y_train_preds5, y_test_preds5)

Now, we try even lengths:  20 days and 20 days.

In [None]:
#test function: 20-day sequence, 20-day future point
seq_length, fut_point = 20, 20
train_split, validation_split = 0.85, 0.15
neurons = [256, 256, 32]
dropout = 0.2
epochs, batch_size = 100, 64
model_path = 'four_week_model.h5'
y_train6, y_test6, y_train_preds6, y_test_preds6, train_score6, test_score6 = fit_generic_LSTM_model(
    df, seq_length, fut_point, train_split, neurons, dropout, epochs, batch_size, validation_split, model_path)

In [None]:
#plot
make_results_plot(y_train6, y_test6, y_train_preds6, y_test_preds6)

In [None]:
#predict one day ahead with last week's data (5-day sequence, 1-day future point)
seq_length, fut_point = 5, 1
train_split, validation_split = 0.85, 0.15
neurons = [256, 256, 32]
dropout = 0.2
epochs, batch_size = 100, 64
model_path = 'one_day_model.h5'
y_train4, y_test4, y_train_preds4, y_test_preds4, train_score4, test_score4 = fit_generic_LSTM_model(
    df, seq_length, fut_point, train_split, neurons, dropout, epochs, batch_size, validation_split, model_path)

In [None]:
#plot
make_results_plot(y_train4, y_test4, y_train_preds4, y_test_preds4)

### Try different numbers of features

We will read in the dataframe again and consider a couple of additional features (volume traded and vwap).

We compare the results against the first model.

In [None]:
#load Walmart Stock Data
filepath = os.path.join('..', 'Resources', 'WMT.csv')
new_df = pd.read_csv(filepath)

#drop unnecessary columns (volume and vwap are kept this time)
#name the columns with the columns= keyword: passing the axis positionally (drop(labels, 1))
#is rejected by pandas 2.x, where every argument after labels is keyword-only
new_df.drop(columns = ['unadjustedVolume', 'change', 'changePercent', 'label', 'changeOverTime'], inplace = True)

#set index
new_df.set_index('date', inplace = True)
new_df.head()

We will first run the functions for 30 days sequence and 5 days of future point for the old number of features.

In [None]:
#model for old number of features (30-day sequence, 5-day future point, original dataframe)
seq_length, fut_point = 30, 5
train_split, validation_split = 0.85, 0.15
neurons = [256, 256, 32]
dropout = 0.2
epochs, batch_size = 100, 64
model_path = 'more_features.h5'
y_train3, y_test3, y_train_preds3, y_test_preds3, train_score3, test_score3 = fit_generic_LSTM_model(
    df, seq_length, fut_point, train_split, neurons, dropout, epochs, batch_size, validation_split, model_path)

In [None]:
#plot
make_results_plot(y_train3, y_test3, y_train_preds3, y_test_preds3)

In [None]:
#model for new number of features (30-day sequence, 5-day future point, dataframe with extra columns)
seq_length, fut_point = 30, 5
train_split, validation_split = 0.85, 0.15
neurons = [256, 256, 32]
dropout = 0.2
epochs, batch_size = 100, 64
model_path = 'more_features_real_long.h5'
y_train3, y_test3, y_train_preds3, y_test_preds3, train_score3, test_score3 = fit_generic_LSTM_model(
    new_df, seq_length, fut_point, train_split, neurons, dropout, epochs, batch_size, validation_split, model_path)

In [None]:
#plot
make_results_plot(y_train3, y_test3, y_train_preds3, y_test_preds3)

In [None]:
#test with 5 days sequence 1 day future point (dataframe with extra columns)
seq_length, fut_point = 5, 1
train_split, validation_split = 0.85, 0.15
neurons = [256, 256, 32]
dropout = 0.2
epochs, batch_size = 100, 64
model_path = 'more_features_real.h5'
y_train3, y_test3, y_train_preds3, y_test_preds3, train_score3, test_score3 = fit_generic_LSTM_model(
    new_df, seq_length, fut_point, train_split, neurons, dropout, epochs, batch_size, validation_split, model_path)

In [None]:
#plot
make_results_plot(y_train3, y_test3, y_train_preds3, y_test_preds3)

### Hyperparameter Tuning
Now, we attempt to tune hyperparameters of the model.

First, we will look at number of neurons.  For all of these first trials, we will use a sequence length of thirty 
and a future point of five.

In [None]:
#look at number of neurons
#use training score as metric (should really only score on test set when done.)
#set up parameters
seq_length = 30
fut_point = 5
train_split = 0.85
epochs = 100
batch_size = 64
validation_split = 0.15
dropout = 0.2
model_path = 'dummy_path.h5'

#set up the layer-size combinations to try: [first LSTM, second LSTM, dense]
neuron_lengths = [[256, 256, 32], [256, 256, 16], [128, 128, 32], [128, 128, 16], [64, 64, 32], [64, 64, 16]]

#create lists to store results
#labels are kept in their own list so the neurons configuration name is not overwritten with strings
neuron_labels = []
train_scores = []

#iterate
for neuron_length in neuron_lengths:
    neuron_labels.append(f"{neuron_length}")

    train, test, train_preds, test_preds, train_score, test_score = fit_generic_LSTM_model(
        df, seq_length, fut_point, train_split, neuron_length, dropout, epochs, batch_size,
        validation_split, model_path)

    #the score is a [loss, metric] list; keep only the loss so the column is numeric and can be plotted
    train_scores.append(train_score[0])

#create dataframe
results = pd.DataFrame({'Neuron Lengths': neuron_labels, 'Train Scores': train_scores})

results.plot.bar(x = 'Neuron Lengths', y = 'Train Scores')

In [None]:
results

In [None]:
#train model for 128, 128, 16 and visualize
seq_length, fut_point = 30, 5
train_split, validation_split = 0.85, 0.15
neurons = [128, 128, 16]
dropout = 0.2
epochs, batch_size = 100, 64
model_path = 'low_neurons.h5'
y_train3, y_test3, y_train_preds3, y_test_preds3, train_score3, test_score3 = fit_generic_LSTM_model(
    df, seq_length, fut_point, train_split, neurons, dropout, epochs, batch_size, validation_split, model_path)

In [None]:
#plot
make_results_plot(y_train3, y_test3, y_train_preds3, y_test_preds3)

It seems there is a bit of randomness in the training scores (likely due to different starting weights and which variables are dropped in dropout layers during fitting).  They seem to score about the same.  As a simpler network seems to work, we will use it (128, 128, 16).

#### Number of Epochs

We will look at how the number of epochs affects our error.  The inbuilt fit function in keras has a way of doing this.


In [None]:
def see_history(df, seq_length, fut_point, train_split, neurons, dropout, epochs, batch_size, 
                           validation_split, model_path):
    """Fit a model and return the training history object produced by ``model.fit``.

    Follows the same split, scale, build, and fit steps as
    ``fit_generic_LSTM_model`` and saves the fitted model to ``model_path``,
    but returns the fit history instead of predictions.
    """
    #windowed, unscaled data
    X_train, X_test, y_train, y_test = train_test_splitter(df, seq_length, fut_point, train_split)

    #feature count comes from the last axis of the windows
    n_features = X_train.shape[2]

    #scaled data; only the scaled training arrays are used for fitting
    scaled = create_scalers_and_normalize(X_train, X_test, y_train, y_test)
    X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled, X_scaler, y_scaler = scaled

    #build the model and fit it, keeping the history that fit returns
    model = create_generic_LSTM_model(neurons, dropout, seq_length, n_features)
    history = model.fit(X_train_scaled, y_train_scaled, epochs = epochs, batch_size = batch_size,
                        validation_split = validation_split, verbose = 1)

    #save model
    model.save(model_path)

    return history

In [None]:
#record the loss history over 200 epochs for the (128, 128, 16) network
seq_length, fut_point = 30, 5
train_split, validation_split = 0.85, 0.15
neurons = [128, 128, 16]
dropout = 0.2
epochs, batch_size = 200, 64
model_path = 'epoch_test.h5'
history = see_history(df, seq_length, fut_point, train_split, neurons, dropout, epochs, batch_size,
                      validation_split, model_path)

In [None]:
#loss history by epoch
history.history

We can look at the loss history by reading it into a dataframe.

In [None]:
#read into dataframe
history_df = pd.DataFrame(history.history)

In [None]:
#look at columns
history_df.head()

In [None]:
history_df.plot(y = ['val_loss', 'loss'], title = 'Loss History by Epoch')

The loss in general seems to decrease a great deal, but the validation loss is very noisy after about 100 epochs.  We will use 100 epochs going forward.

#### Dropout Amount

Now, we consider the dropout amount (the fraction of a layer's units that are randomly dropped at each training step).

In [None]:
#look at dropout
#use training score as metric (should really only score on test set when done.)
#set up parameters
seq_length = 30
fut_point = 5
train_split = 0.85
neurons = [128, 128, 16]
epochs = 100
batch_size = 64
validation_split = 0.15
model_path = 'dummy_path.h5'

#set up the dropout rates to try
dropout_list = [0.2, 0.3, 0.4, 0.5, 0.6]

#create lists to store results
dropouts = []
train_scores = []

#iterate
for dropout in dropout_list:
    dropouts.append(dropout)

    #pass neurons (set above); neuron_length is a loop variable left over from the neuron-size cell
    #and would silently train a different architecture, or fail on a fresh kernel
    train, test, train_preds, test_preds, train_score, test_score = fit_generic_LSTM_model(
        df, seq_length, fut_point, train_split, neurons, dropout, epochs, batch_size,
        validation_split, model_path)

    #keep only the loss (element 0 of the score)
    train_scores.append(train_score[0])

#create dataframe
results = pd.DataFrame({'Dropouts': dropouts, 'Train Scores': train_scores})

results.plot(x = 'Dropouts', y = 'Train Scores')

In [None]:
#plot the training score recorded for each dropout value
results.plot(x = 'Dropouts', y = 'Train Scores')

We see that the dropout value of 0.30 is best.

#### Training Split

We try several different train/test split sizes.

In [None]:
#compare several train/test split fractions
#use training score as metric (should really only score on test set when done.)

#hyperparameters held fixed while the split fraction varies
seq_length, fut_point = 30, 5
dropout = 0.3
neurons = [128, 128, 16]
epochs, batch_size = 100, 64
validation_split = 0.15
model_path = 'dummy_path.h5'

#train fractions to try
split_list = [0.75, 0.8, 0.85, 0.9]

#parallel lists: each split fraction tried and the training score it produced
train_splits, train_scores = [], []

#fit one model per split fraction
for train_split in split_list:
    train_splits.append(train_split)

    train, test, train_preds, test_preds, train_score, test_score = fit_generic_LSTM_model(
        df, seq_length, fut_point, train_split, neurons, dropout,
        epochs, batch_size, validation_split, model_path)

    #keep the first entry of the returned training score
    train_scores.append(train_score[0])

#collect the results into a dataframe for plotting
results = pd.DataFrame({'Train_Test_Splits': train_splits, 'Train Scores': train_scores})

In [None]:
#bar chart of the training score recorded for each train/test split
results.plot.bar(x = 'Train_Test_Splits', y = 'Train Scores', color = 'orange')

Due to the way train/test split works with this time series data, we will actually take a look at a graph for this one after training a model.  It may be that training/test performance is different.

In [None]:
#fit a single model using a 0.75 train/test split
#sequence window and how far ahead to predict
seq_length, fut_point = 30, 5
train_split = 0.75
#network and training settings
neurons = [128, 128, 16]
dropout = 0.3
epochs, batch_size = 100, 64
validation_split = 0.15
#file name handed to the fit function as model_path
model_path = 'three_quarters_split.h5'

(y_train3, y_test3, y_train_preds3,
 y_test_preds3, train_score3, test_score3) = fit_generic_LSTM_model(
    df, seq_length, fut_point, train_split, neurons, dropout,
    epochs, batch_size, validation_split, model_path)

In [None]:
#pass the actual and predicted train/test values from the fit above to make_results_plot
make_results_plot(y_train3, y_test3, y_train_preds3, y_test_preds3)

We see that a 0.75 train/test split actually does not do well as the scale is wrong for the test set.  We will still use a split of 0.85 going forward.

We will now try several different validation set splits.

In [None]:
#look at validation split
#use training score as metric (should really only score on test set when done.)
#set up parameters (everything except validation_split is held fixed)
seq_length = 30
fut_point = 5
dropout = 0.3
neurons = [128, 128, 16]
epochs = 100
batch_size = 64
train_split = 0.85
model_path = 'dummy_path.h5'

#validation split fractions to compare
split_list = [0.1, 0.15, 0.2, 0.25]

#create lists to store results
validation_splits = []
train_scores = []

#iterate, fitting one model per validation split
for validation_split in split_list:
    validation_splits.append(validation_split)

    train, test, train_preds, test_preds, train_score, test_score = fit_generic_LSTM_model(
        df, seq_length, fut_point, train_split, neurons, dropout,
        epochs, batch_size, validation_split, model_path)

    #keep the first entry of the returned training score
    train_scores.append(train_score[0])

#create dataframe keyed by the validation splits tested in this cell
#(train_splits belongs to the earlier train/test split experiment and held the wrong values)
results = pd.DataFrame({'Validation Split': validation_splits, 'Train Scores': train_scores})

In [None]:
#rebuild the results table from the validation splits tested, then draw it as a bar chart
results = pd.DataFrame({
    'Validation Split': validation_splits,
    'Train Scores': train_scores,
})
results.plot.bar(x = 'Validation Split', y = 'Train Scores', color = 'green')

We will use a validation split of 0.1.

#### Batch Size
Finally, we will try a few different batch sizes.

In [None]:
#look at batch size
#use training score as metric (should really only score on test set when done.)
#set up parameters (everything except batch_size is held fixed)
seq_length = 30
fut_point = 5
dropout = 0.3
neurons = [128, 128, 16]
epochs = 100
validation_split = 0.1
train_split = 0.85
model_path = 'dummy_path.h5'

#batch sizes to compare
sizes = [16, 32, 64]

#create lists to store results
batch_sizes = []
train_scores = []

#iterate, fitting one model per batch size
for batch_size in sizes:
    batch_sizes.append(batch_size)

    train, test, train_preds, test_preds, train_score, test_score = fit_generic_LSTM_model(
        df, seq_length, fut_point, train_split, neurons, dropout,
        epochs, batch_size, validation_split, model_path)

    #keep the first entry of the returned training score
    train_scores.append(train_score[0])

#create dataframe keyed by the batch sizes tested in this cell
#(train_splits belongs to the earlier train/test split experiment; it holds four split
#fractions, which neither match the three scores collected here nor describe batch sizes)
results = pd.DataFrame({'Batch Size': batch_sizes, 'Train Scores': train_scores})

In [None]:
#rebuild the results table from the batch sizes tested, then plot score against batch size
results = pd.DataFrame({
    'Batch Size': batch_sizes,
    'Train Scores': train_scores,
})
results.plot(x = 'Batch Size', y = 'Train Scores', color = 'black')

We will use a batch size of 32.

### Re-visiting Time Windows

With selected hyperparameters, we can now look at and graph results for different sequence lengths and future points.

We begin with our standard 30 day sequence length and a 5 day future point.

In [None]:
#train a model with the selected hyperparameters (30 day sequence, 5 day future point)
#sequence window and how far ahead to predict
seq_length, fut_point = 30, 5
train_split = 0.85
#network and training settings
neurons = [128, 128, 16]
dropout = 0.3
epochs, batch_size = 100, 32
validation_split = 0.1
#file name handed to the fit function as model_path
model_path = 'final_model.h5'

(y_train3, y_test3, y_train_preds3,
 y_test_preds3, train_score3, test_score3) = fit_generic_LSTM_model(
    df, seq_length, fut_point, train_split, neurons, dropout,
    epochs, batch_size, validation_split, model_path)

In [None]:
#pass the actual and predicted train/test values from the fit above to make_results_plot
make_results_plot(y_train3, y_test3, y_train_preds3, y_test_preds3)

This seems to do well enough for now.

Now, we try a 5-day sequence to predict 1 day ahead.

In [None]:
#fit a model on 5 day sequences predicting 1 day ahead
#sequence window and how far ahead to predict
seq_length, fut_point = 5, 1
train_split = 0.85
#network and training settings
neurons = [128, 128, 16]
dropout = 0.3
epochs, batch_size = 100, 32
validation_split = 0.1
#file name handed to the fit function as model_path
model_path = 'final_model_short.h5'

(y_train3, y_test3, y_train_preds3,
 y_test_preds3, train_score3, test_score3) = fit_generic_LSTM_model(
    df, seq_length, fut_point, train_split, neurons, dropout,
    epochs, batch_size, validation_split, model_path)

In [None]:
#pass the actual and predicted train/test values from the fit above to make_results_plot
make_results_plot(y_train3, y_test3, y_train_preds3, y_test_preds3)

This also now has the range in a good place.

We now use 30 days to predict 30 days out.

In [None]:
#fit a model on 30 day sequences predicting 30 days ahead
#sequence window and how far ahead to predict
seq_length, fut_point = 30, 30
train_split = 0.85
#network and training settings
neurons = [128, 128, 16]
dropout = 0.3
epochs, batch_size = 100, 32
validation_split = 0.1
#file name handed to the fit function as model_path
model_path = 'final_model_months.h5'

(y_train3, y_test3, y_train_preds3,
 y_test_preds3, train_score3, test_score3) = fit_generic_LSTM_model(
    df, seq_length, fut_point, train_split, neurons, dropout,
    epochs, batch_size, validation_split, model_path)

In [None]:
#pass the actual and predicted train/test values from the fit above to make_results_plot
make_results_plot(y_train3, y_test3, y_train_preds3, y_test_preds3)

It is encouraging that this is not completely bad.

Now, for a very long comparison, we use a 180-day sequence to predict 80 days into the future.

In [None]:
#fit a model on 180 day sequences predicting 80 days ahead
#sequence window and how far ahead to predict
seq_length, fut_point = 180, 80
train_split = 0.85
#network and training settings
neurons = [128, 128, 16]
dropout = 0.3
epochs, batch_size = 100, 32
validation_split = 0.1
#file name handed to the fit function as model_path
model_path = 'final_model_long.h5'

(y_train3, y_test3, y_train_preds3,
 y_test_preds3, train_score3, test_score3) = fit_generic_LSTM_model(
    df, seq_length, fut_point, train_split, neurons, dropout,
    epochs, batch_size, validation_split, model_path)

In [None]:
#pass the actual and predicted train/test values from the fit above to make_results_plot
make_results_plot(y_train3, y_test3, y_train_preds3, y_test_preds3)

This is problematic.

### Generic Stock

There are actually 30 Dow Jones Stocks.  We need a function to produce a model for a generic filepath.


In [None]:
#function to produce predictions for a generic filepath
def generic_stock_predictions(filepath, stock_name, seq_length, fut_point):
    """Read one stock's CSV, drop the unused columns, and fit a model on what remains.

    Parameters
    ----------
    filepath : path of the CSV file to read. It must contain a 'date' column
        (used as the index) and every column listed in `unused_columns` below.
    stock_name : used only to build the model path '<stock_name>_model.h5'.
    seq_length, fut_point : forwarded unchanged to fit_generic_LSTM_model.

    Returns
    -------
    tuple
        The six values returned by fit_generic_LSTM_model, in order:
        (y_train, y_test, y_train_preds, y_test_preds, train_score, test_score).

    Raises
    ------
    KeyError
        If the CSV lacks 'date' or any of the columns to drop.
    """

    #define variables (fixed hyperparameters for every stock)
    train_split = 0.85
    neurons = [128, 128, 16]
    epochs = 100
    batch_size = 32
    validation_split = 0.1
    dropout = 0.3

    #define model path
    model_path = stock_name + '_model.h5'

    #columns that are not fed to the model
    unused_columns = ['volume', 'unadjustedVolume', 'change', 'changePercent', 'vwap', 'label',
                      'changeOverTime']

    #read in data frame, drop unnecessary columns and index by date
    #`columns=` is spelled out because passing the axis positionally (`drop(cols, 1)`)
    #is rejected by pandas 2.0 and later
    df = pd.read_csv(filepath).drop(columns = unused_columns).set_index('date')

    #fit model
    y_train, y_test, y_train_preds, y_test_preds, train_score, test_score = fit_generic_LSTM_model(
        df, seq_length, fut_point, train_split, neurons, dropout,
        epochs, batch_size, validation_split, model_path)

    #return
    return y_train, y_test, y_train_preds, y_test_preds, train_score, test_score

In [None]:
#try generic_stock_predictions on the Walmart file (seq_length 30, fut_point 5)
filepath = os.path.join('..', 'Resources', 'WMT.csv')
(y_train4, y_test4, y_train_preds4,
 y_test_preds4, train_score4, test_score4) = generic_stock_predictions(filepath, 'WMT', 30, 5)

In [None]:
#pass the values returned by the generic_stock_predictions test call to make_results_plot
make_results_plot(y_train4, y_test4, y_train_preds4, y_test_preds4)

We wish to have a function that looks at predicted profitability on the last day.

In [None]:
#predicted profitability function
def predicted_profit(filepath, stock_name, seq_length, fut_point):
    """Fit a model for one stock and compare actual and predicted percentage change.

    All four arguments are forwarded to generic_stock_predictions. The change is
    measured from the test value `fut_point` entries before the last one, both to
    the last actual test value and to the last test prediction.

    Returns a dict with the keys 'Stock', 'Start Close', 'End Close',
    'Predicted Close', 'Actual Profit' and 'Predicted Profit'; the two profit
    entries are percentages of the start value.
    """

    #fit model; only the test-set actuals and predictions are needed here
    _, y_test, _, y_test_preds, _, _ = generic_stock_predictions(
        filepath, stock_name, seq_length, fut_point)

    #reference value, final actual value and final predicted value
    start_close = y_test[-1-fut_point]
    end_close, pred_close = y_test[-1], y_test_preds[-1]

    def percent_change(close):
        #percentage change of `close` relative to the start value
        return (close - start_close)*100/start_close

    #build the output record (key order matches the column order callers will see)
    return {
        'Stock': stock_name,
        'Start Close': start_close,
        'End Close': end_close,
        'Predicted Close': pred_close,
        'Actual Profit': percent_change(end_close),
        'Predicted Profit': percent_change(pred_close),
    }
    

In [None]:
#try predicted_profit on the Walmart file (seq_length 30, fut_point 5) and display the returned dictionary
filepath = os.path.join('..', 'Resources', 'WMT.csv')
dictionary = predicted_profit(filepath, 'WMT', 30, 5)
dictionary

### Variation

There is variation observed in these results.  As a quick test, let us look at the results of doing this 10 times with the profitability function.

In [None]:
#run predicted_profit 10 times on the same file, collecting one record per run
dicts = []
for i in range(10):
    print(f"Iteration {i}")
    dictionary = predicted_profit(filepath, 'WMT', 30, 5)
    #tag the record with its run number before storing it
    dictionary.update({'Iteration': i})
    dicts.append(dictionary)

#turn the collected records into a dataframe
variation_df = pd.DataFrame(dicts)

#display it
variation_df

In [None]:
#summary statistics of the 'Predicted Close' column across the repeated runs
variation_df['Predicted Close'].describe()

There is variation in the results.