In [None]:
import pandas as pd

# Preprocessing cell: load the raw S&P 500 CSV, drop the columns the LSTM does
# not use, fill the missing change_percent value, rewrite the dates as
# YYYYMMDD and save the result to SP500Processed.csv.

# file import
file = 'SP500.csv'
data = pd.read_csv(file)

# quick look at the raw data: first rows, summary statistics, nulls per column
print(data.head())
print(data.describe())
print(data.isnull().sum())

# deleting columns not needed for lstm (one non-mutating call instead of two
# in-place drops)
data = data.drop(columns = ["volume", "avg_vol_20d"])

# set null value for change_percent to 0.57
# Only this column is filled: a blanket data.fillna(0.57) would also overwrite
# a missing price or date with 0.57 without any warning.
data["change_percent"] = data["change_percent"].fillna(0.57)

# reset date format from YYYY-MM-DD to YYYYMMDD for easier input later on
# NOTE: a column of YYYYMMDD digits is inferred as an integer by pd.read_csv,
# so readers of the saved file must parse it with format="%Y%m%d".
data["date"] = pd.to_datetime(data["date"]).dt.strftime("%Y%m%d")

# check
print(data.head())

# save
data.to_csv('SP500Processed.csv', index = False)

         date   open   high    low  close  volume  change_percent  avg_vol_20d
0  1927-12-30  17.66  17.66  17.66  17.66       0             NaN          NaN
1  1928-01-03  17.76  17.76  17.76  17.76       0            0.57          NaN
2  1928-01-04  17.72  17.72  17.72  17.72       0           -0.23          NaN
3  1928-01-05  17.55  17.55  17.55  17.55       0           -0.96          NaN
4  1928-01-06  17.66  17.66  17.66  17.66       0            0.63          NaN
               open          high           low         close        volume  \
count  24187.000000  24187.000000  24187.000000  24187.000000  2.418700e+04   
mean     621.549940    625.206228    617.672947    621.682056  8.840179e+08   
std     1001.744483   1007.314809    995.813028   1001.942570  1.585401e+09   
min        4.400000      4.400000      4.400000      4.400000  0.000000e+00   
25%       24.605000     24.605000     24.605000     24.605000  1.505000e+06   
50%      102.000000    102.710000    101.180000    1

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

# Training / prediction cell: load the preprocessed prices, build sliding
# windows over the standardized OHLC columns, fit a two-layer LSTM that
# predicts the close, then plot predicted close against the true close.

# The "date" column was saved as YYYYMMDD. Left to itself read_csv infers it as
# int64, and pd.to_datetime would then interpret those integers as nanoseconds
# since 1970, so read it as text and parse it with an explicit format.
df = pd.read_csv("SP500Processed.csv", dtype = {"date": str})
df["date"] = pd.to_datetime(df["date"], format = "%Y%m%d")

# define parsed dates as dates_train (taken from this cell's own frame rather
# than from a variable left behind by another cell)
dates_train = df["date"]

# change open, high, low, close type to float for normalization
# (selected by name; the previous positional slice [1:4] left out "close")
ohlc = ["open", "high", "low", "close"]
trainingdf = df[ohlc].astype(float)

# index of the column the model is trained to predict
target_col = ohlc.index("close")

# initialization for X, y
X_train = []
y_train = []

# normalization
# NOTE(review): the scaler is fitted on every row, including the rows that
# validation_split later holds out.
s = StandardScaler()
s = s.fit(trainingdf)
scaled = s.transform(trainingdf)
print(scaled)

# days in the future to predict
futuredays = 1

# days in the past to use for prediction
pastdays = 24

# Build the windows: each sample is the previous `pastdays` rows of all
# features, and its target is the close `futuredays` rows after the window
# start. The upper bound keeps the target index inside the data for any
# horizon (for futuredays = 1 it equals len(scaled)).
for i in range(pastdays, len(scaled) - futuredays + 1):
    X_train.append(scaled[i - pastdays:i, :])
    y_train.append(scaled[i + futuredays - 1:i + futuredays, target_col])

X_train, y_train = np.array(X_train), np.array(y_train)

# defining two layer lstm model
model = Sequential()
model.add(LSTM(units = 64, activation = "relu", input_shape = (X_train.shape[1], X_train.shape[2]), return_sequences = True))
model.add(LSTM(units = 32, activation = "relu", return_sequences = False))
model.add(Dropout(0.05))
model.add(Dense(y_train.shape[1]))

# compiling model
model.compile(optimizer = "adam", loss = "mse")
model.summary()

history = model.fit(X_train, y_train, epochs = 10, batch_size = 16 , validation_split = 0.25, verbose = 1)

# forecasting: number of days is read from the user
futuredays = int(input())
# X_train[-0:] would select every window, and more days than windows cannot be
# served, so reject both before predicting.
if not 1 <= futuredays <= len(X_train):
    raise ValueError(f"number of forecast days must be between 1 and {len(X_train)}, got {futuredays}")

# NOTE(review): the predictions below are one-step outputs for the last
# `futuredays` training windows, labelled with calendar days starting at the
# last observed date. This is not a recursive out-of-sample forecast.
forecast_period = pd.date_range(dates_train.iloc[-1], periods = futuredays, freq = "1d").tolist()
prediction = model.predict(X_train[-futuredays:])

# inverse transformation for rescale: the scaler expects one column per
# feature, so repeat the prediction across all columns and read back the close
shapematch = np.repeat(prediction, trainingdf.shape[1], axis = -1)
y_pred = s.inverse_transform(shapematch)[:, target_col]

forecast_date = [time_i.date() for time_i in forecast_period]

# predicted close per forecast date
df_forecast = pd.DataFrame({'date':np.array(forecast_date), 'close':y_pred})
df_forecast['date']=pd.to_datetime(df_forecast['date'])

# true close from the cutoff date onwards; .copy() so df itself is not touched
original = df[['date', 'close']].copy()
original = original.loc[original['date'] >= '2024-5-1']

# predicted vs. true close
sns.lineplot(x='date', y='close', data=df_forecast, label='Prediction', color='orange')
sns.lineplot(x='date', y='close', data=original, label='True', color='blue')
plt.legend()
plt.title('S&P 500 Prediction Graph')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.show()

[[-0.60285076 -0.60314688 -0.6025482 ]
 [-0.60275093 -0.60304761 -0.60244778]
 [-0.60279086 -0.60308732 -0.60248795]
 ...
 [ 4.54356817  4.55336142  4.54020021]
 [ 4.54213025  4.51687753  4.50923984]
 [ 4.52032793  4.51032572  4.45353576]]
Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_16 (LSTM)              (None, 24, 64)            17408     
                                                                 
 lstm_17 (LSTM)              (None, 32)                12416     
                                                                 
 dropout_8 (Dropout)         (None, 32)                0         
                                                                 
 dense_8 (Dense)             (None, 1)                 33        
                                                                 
Total params: 29857 (116.63 KB)
Trainable params: 29857 (116.63 KB)
Non-traina

In [None]:
# plot for validation/training loss graph
# `history` is the object returned by model.fit; its .history dict holds the
# per-epoch 'loss' and 'val_loss' values.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
# call show(): a bare `plt.show` only references the function and never runs it
plt.show()

In [None]:
# write the trained model to disk (HDF5 file in the working directory)
model_path = 'lstm.h5'
model.save(model_path)