# Predicting the closing stock price of the S&P 500:

In [None]:
import pandas as pd

# Load the S&P 500 price history from the Kaggle input directory.
data = pd.read_csv("../input/sp500-20162021/SP500.csv")
# Reverse the row order (presumably the CSV is stored newest-first -- confirm
# against the file) and renumber the rows 0..n-1. drop=True discards the old
# index instead of keeping it as a spurious 'index' column.
data = data.iloc[::-1].reset_index(drop=True)
# Show the first rows of the re-ordered frame as a sanity check.
data.head()

In [None]:
# Print a summary of the frame (column names, dtypes and non-null counts)
# to check what was loaded before any modelling.
data.info()

In [None]:
import matplotlib.pyplot as plt

# Plot the whole closing-price series to get a first look at the trend.
plt.figure(figsize=(16, 6))
plt.title('Close Price History')
# NOTE: the series is plotted against the integer row index created by
# reset_index, so the x-axis shows row positions rather than calendar dates.
plt.plot(data['Close'])
plt.xlabel('Date', fontsize=18)
# The S&P 500 is quoted in US dollars; this matches the label used by the
# prediction plot at the end of the notebook.
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.show()

In [None]:
# Number of consecutive past closing prices used as input for one prediction
pre_days = 60

# From here on only the 'Close' column is needed
data = data.filter(items=['Close'])
# Raw closing prices as a 2-D numpy array with a single column
dataset = data.to_numpy()

# Chronological split: first 70% for training, next 20% for validation,
# whatever remains for testing
total_rows = len(dataset)
training_data_len = int(total_rows * 0.7)
validation_data_len = int(total_rows * 0.2)
testing_data_len = total_rows - (training_data_len + validation_data_len)

for caption, row_count in (
    ("The number of trainning dataset: ", training_data_len),
    ("The number of validation dataset: ", validation_data_len),
    ("The number of testing dataset: ", testing_data_len),
):
    print(caption, row_count)

In [None]:
import numpy as np
# Build the training samples from the first `training_data_len` rows.
# The prices are still raw here; scaling is done in the next cell.
train_data = dataset[:training_data_len, :]

# Sample k takes rows k .. k+pre_days-1 as the input window and row
# k+pre_days as the value to predict.
train_window_starts = range(len(train_data) - pre_days)
x_train = np.array(
    [train_data[k:k + pre_days, 0] for k in train_window_starts]
)
y_train = np.array(
    [train_data[k + pre_days, 0] for k in train_window_starts]
)

In [None]:
# Scale the data
from sklearn.preprocessing import MinMaxScaler

# Input scaler, fitted on the training windows only. MinMaxScaler works
# column by column, so each position inside the window gets its own min/max.
scaler_x = MinMaxScaler(feature_range=(0, 1))
x_train_norm = scaler_x.fit_transform(x_train)
input_sc = scaler_x  # fitted scaler, reused for the validation and test inputs

# Target scaler: the scaler expects 2-D input, so turn the targets into a
# single column first.
y_train = y_train.reshape(-1, 1)
scaler_y = MinMaxScaler(feature_range=(0, 1))
y_train_norm = scaler_y.fit_transform(y_train)
output_sc = scaler_y  # fitted scaler, also used to undo the scaling of predictions

# Append a trailing feature axis: (samples, window) -> (samples, window, 1)
x_train_norm = x_train_norm.reshape(x_train_norm.shape + (1,))
print("The shape of input data: ", x_train_norm.shape)

In [None]:
# Build the validation samples. The slice starts `pre_days` rows before the
# validation range so that the first validation target already has a full
# input window. The prices are still raw here; scaling is done in the next cell.
val_data = dataset[training_data_len - pre_days:training_data_len + validation_data_len, :]

# Sample k takes rows k .. k+pre_days-1 as the input window and row
# k+pre_days as the value to predict.
val_window_starts = range(len(val_data) - pre_days)
x_val = np.array(
    [val_data[k:k + pre_days, 0] for k in val_window_starts]
)
y_val = np.array(
    [val_data[k + pre_days, 0] for k in val_window_starts]
)

In [None]:
# Scale the validation targets with the scaler fitted on the training targets
# (the scaler expects a 2-D column, hence the reshape).
y_val = y_val.reshape(-1, 1)
y_val_norm = output_sc.transform(y_val)

# Scale the validation windows with the scaler fitted on the training windows,
# then append a trailing feature axis: (samples, window) -> (samples, window, 1)
x_val_norm = input_sc.transform(x_val)
x_val_norm = x_val_norm.reshape(x_val_norm.shape + (1,))
print("The shape of validation data: ", x_val_norm.shape)

In [None]:
# Prepare the test dataset and its labels.
# The slice starts `pre_days` rows before the test range so that the first
# test target already has a full input window. The prices are still raw here;
# scaling is done in the next cell.
test_data = dataset[training_data_len + validation_data_len - pre_days:, :]

# Sample k takes rows k .. k+pre_days-1 as the input window and row
# k+pre_days as the value to predict.
test_window_starts = range(len(test_data) - pre_days)
x_test = np.array(
    [test_data[k:k + pre_days, 0] for k in test_window_starts]
)
y_test = np.array(
    [test_data[k + pre_days, 0] for k in test_window_starts]
)

In [None]:
# Scale the test targets with the scaler fitted on the training targets
# (the scaler expects a 2-D column, hence the reshape).
y_test = y_test.reshape(-1, 1)
y_test_norm = output_sc.transform(y_test)

# Scale the test windows with the scaler fitted on the training windows,
# then append a trailing feature axis: (samples, window) -> (samples, window, 1)
x_test_norm = input_sc.transform(x_test)
x_test_norm = x_test_norm.reshape(x_test_norm.shape + (1,))
print("The shape of testing data: ", x_test_norm.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM

# LSTM regressor: two stacked LSTM layers followed by two dense layers.
# The first LSTM returns the full sequence so the second one can consume it;
# the final Dense(1) emits the single (scaled) next closing price.
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(x_train.shape[1], 1)),
    LSTM(64, return_sequences=False),
    Dense(25),
    Dense(1),
])

model.summary()
print("\n")

# Mean squared error on the scaled targets, optimised with Adam
model.compile(loss='mean_squared_error', optimizer='adam')

# Fit on the scaled training windows; the validation set is only evaluated
# after each epoch and is not used for weight updates.
model.fit(
    x_train_norm,
    y_train_norm,
    validation_data=(x_val_norm, y_val_norm),
    epochs=20,
    batch_size=16,
)

In [None]:
# Predict on the training windows and map the outputs back to price units
predictions_train = output_sc.inverse_transform(model.predict(x_train_norm))

# Root mean squared error against the true prices. y_train is the (n, 1)
# column produced by the earlier reshape, so the subtraction is element-wise.
train_residuals = predictions_train - y_train
rmse = np.sqrt(np.mean(np.square(train_residuals)))
print("root mean squred error of trainning data: ", rmse)

In [None]:
# Predict on the validation windows and map the outputs back to price units
predictions_val = output_sc.inverse_transform(model.predict(x_val_norm))

# Root mean squared error against the true prices. y_val is the (n, 1)
# column produced by the earlier reshape, so the subtraction is element-wise.
val_residuals = predictions_val - y_val
rmse = np.sqrt(np.mean(np.square(val_residuals)))
print("root mean squred error of validation data: ", rmse)

In [None]:
# Predict on the test windows and map the outputs back to price units
predictions_test = output_sc.inverse_transform(model.predict(x_test_norm))

# Root mean squared error against the true prices. y_test is the (n, 1)
# column produced by the earlier reshape, so the subtraction is element-wise.
test_residuals = predictions_test - y_test
rmse = np.sqrt(np.mean(np.square(test_residuals)))
print("root mean squred error of testing data: ", rmse)

In [None]:
# Split the closing prices into the same three chronological ranges that
# were used to build the training, validation and test samples.
train = data[:training_data_len]
# .copy() gives independent frames, so adding a column below does not write
# into a slice of `data` (which triggers pandas' SettingWithCopyWarning).
valid = data[training_data_len:training_data_len + validation_data_len].copy()
test = data[training_data_len + validation_data_len:].copy()
# The predictions are (n, 1) arrays; flatten them to one value per row.
valid['Predictions'] = predictions_val.ravel()
test['Predictions'] = predictions_test.ravel()

# Visualize the data
plt.figure(figsize=(16, 6))
plt.title('Close Price ML Predicton of S&P500')
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
# Five lines are drawn. Each one carries its own label so the legend cannot
# drift out of step with the plotted lines (in particular, the actual test
# prices and the test predictions get separate entries).
plt.plot(train['Close'], label='Train')
plt.plot(valid['Close'], label='Val')
plt.plot(valid['Predictions'], label='val_predication')
plt.plot(test['Close'], label='Test')
plt.plot(test['Predictions'], label='test_prediction')
plt.legend(loc='lower right')
plt.show()