# This program uses an artificial recurrent neural network called Long Short Term Memory (LSTM) to predict the closing stock price of a company (User Specified) using the past 5 years of stock prices.

# It also predicts the closing price for the next day, using the most recent 60 days of closing prices and the model trained earlier in this program.

In [None]:
import os
# Removes tensorflow cuda warning
# This warning is only applicable if user
# has an NVidia graphics card. but does not
# change the output or functionality of the code
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import math
import datetime
from sklearn.preprocessing import MinMaxScaler # Used for scaling data from 0-1 for normalization
from tensorflow.python.keras import Sequential # Turns a model into a Sequential object for training methods
from tensorflow.python.keras.layers import Dense, LSTM # Used for adding layers to the Sequential Object
import pandas_datareader as web # Used for pulling stock data from yahoo finance
# Used for checking user input of stock choice with try-except block to catch invalid inputs
from pandas_datareader._utils import RemoteDataError
import numpy as np # Used for arranging and reshaping data to fit LSTM model
import pandas as pd # Used to create a dataframe of data in order to filter data
import matplotlib.pyplot as plt # Used for visualizations of the data


# Global presentation settings for the rest of the notebook:
#  - every matplotlib figure uses the 'fivethirtyeight' style sheet
#  - pandas' chained-assignment warning is switched off (None), so adding
#    a column to a sliced dataframe later on does not print a warning
plt.style.use('fivethirtyeight')
pd.options.mode.chained_assignment = None

In [None]:
# Date range for the download: today, back to the same calendar day five
# years earlier.
current_date = datetime.date.today()
try:
    past_date = current_date.replace(year=current_date.year - 5)
except ValueError:
    # Only reachable on Feb 29: a year five years before a leap year is
    # never itself a leap year, so that date does not exist. Fall back to
    # Feb 28 instead of crashing.
    past_date = current_date.replace(year=current_date.year - 5, day=28)

# Ask for a stock symbol and download its price history from yahoo finance.
# Repeats until a download succeeds.
while True:
    # Surrounding whitespace is stripped because the symbol is later used
    # to build folder and file names.
    stock = input("""Note: Stock data is pulled from yahoo finance.
Available stock choices can be found on finance.yahoo.com\n
Enter stock symbol for the stock you'd like to see data on: 
(Ex: GOOG for Google, AAPL for Apple): """).strip().upper()

    # An empty answer can never be a valid symbol; ask again without
    # making a network request.
    if not stock:
        print("Stock choice invalid. Please try again.")
        continue

    # Only the download is guarded, so unrelated errors are not mistaken
    # for an invalid symbol.
    try:
        df = web.DataReader(stock, data_source='yahoo', start=past_date, end=current_date)
    except RemoteDataError:
        print("Stock choice invalid. Please try again.")
    else:
        break

In [None]:
# Show the unfiltered table that was downloaded, then chart its 'Close'
# column as a line graph.
print("\nThe raw dataset from yahoo finance contains the following: ")
display(df)

print(f"\n\n\n Here is the raw Close Price History for {stock}: ")

close_series = df['Close']
axis_font = 18

plt.figure(figsize=(16, 8))
plt.plot(close_series)
plt.title(f'{stock} Close Price History')
plt.ylabel('Close Price USD ($)', fontsize=axis_font)
plt.xlabel('Date', fontsize=axis_font)
plt.show()


In [None]:
# Keep only the closing prices; the rest of the notebook works from this
# single-column dataframe.
data = df.filter(['Close'])

print(f"\nThe filtered raw dataset for close prices is:")
display(data)
print("\n\n")

# Export copy: add a 1-based 'PK' row number and put it ahead of 'Close'.
# The dataframe index is left in place, so to_csv writes it out as well.
data_csv = data.copy()
data_csv['PK'] = range(1, len(data_csv) + 1)
data_csv = data_csv[['PK', 'Close']]

# makedirs also creates the parent 'data' folder when it is missing, and
# exist_ok=True makes the call a no-op when the stock folder already exists
# (no separate isdir check needed).
os.makedirs(f"data/{stock}", exist_ok=True)

# Write the raw closing prices to data/<stock>/<stock>_raw.csv
data_csv.to_csv(f"data/{stock}/{stock}_raw.csv")


# Closing prices as a 2-D numpy array (one column)
dataset = data.values

# Number of leading rows used for training: 85% of the data, rounded up
# with math.ceil so the result is a whole number usable as a slice bound.
training_data_len = math.ceil(len(dataset) * .85)

In [None]:
# Min-max normalise the closing prices into the range 0-1. The fitted
# scaler is kept so predictions can be converted back to prices later.
# NOTE(review): the scaler is fitted on the whole series, including the
# rows that are later held out for testing.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit(dataset).transform(dataset)
scaled_data

In [None]:
# Training portion of the scaled prices: every row before the cutoff
train_data = scaled_data[:training_data_len, :]

# Build sliding windows over the training rows.
# For each position from 60 onward:
#   x_train gets the 60 scaled values immediately before that position
#   y_train gets the scaled value at that position (the target)
# Both are taken from the actual scaled prices.
window = 60
window_ends = range(window, len(train_data))

x_train = [train_data[end - window:end, 0] for end in window_ends]
y_train = [train_data[end, 0] for end in window_ends]

In [None]:
# Turn the two python lists into numpy arrays so they can be reshaped
# and passed to the model
x_train, y_train = np.array(x_train), np.array(y_train)

In [None]:
# Add a trailing axis of length 1 so the 2-D array of windows becomes the
# 3-D array (samples, values per window, 1) expected by the LSTM layers
sample_count = x_train.shape[0]
window_len = x_train.shape[1]
x_train = np.reshape(x_train, (sample_count, window_len, 1))
x_train.shape

In [None]:
# Build the network as a Sequential stack of four layers, added in order:
#   1. LSTM with 50 units that returns the full sequence, so the next LSTM
#      layer receives one output per time step. Its input shape is
#      (values per window, 1); x_train.shape[1] is used rather than a
#      hard-coded 60 so it always matches the training windows.
#   2. LSTM with 50 units that returns only its final output
#   3. Dense layer with 25 units
#   4. Dense layer with 1 unit - the single predicted (scaled) close price
model = Sequential()
for layer in (
    LSTM(50, return_sequences=True, input_shape=(x_train.shape[1], 1)),
    LSTM(50, return_sequences=False),
    Dense(25),
    Dense(1),
):
    model.add(layer)

In [None]:
# Configure training: mean squared error is the loss being minimised and
# 'adam' is the optimizer that updates the weights.
# The accuracy figures reported later are computed separately from the
# predictions.
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
print("\n\nTraining model... Please wait...\n")

# Fit the model on the training windows and their targets.
# batch_size is the number of samples processed per weight update and
# epochs is the number of full passes over the training data.
# Both can be tuned; they trade accuracy against runtime.
model.fit(x_train, y_train, epochs=40, batch_size=64)

In [None]:
# Scaled rows for testing. The slice starts 60 rows before the training
# cutoff so the first held-out day still has a full 60-value window.
test_data = scaled_data[training_data_len - 60:, :]

# Actual (unscaled) closing prices for the held-out days - these are what
# the predictions get compared against
y_test = dataset[training_data_len:, :]

# One window of the 60 preceding scaled values per held-out day
x_test = [test_data[stop - 60:stop, 0] for stop in range(60, len(test_data))]
    


In [None]:
# Convert the list of x_test windows into a single numpy array
# so that it has a .shape and can be reshaped for the model
x_test = np.array(x_test)

In [None]:
# Add a trailing axis of length 1: 2-D (samples, values per window)
# becomes 3-D (samples, values per window, 1) for the LSTM model
test_samples = x_test.shape[0]
test_window_len = x_test.shape[1]
x_test = np.reshape(x_test, (test_samples, test_window_len, 1))

In [None]:
# Run the trained model on the held-out windows. The outputs are on the
# scaler's 0-1 scale.
scaled_predictions = model.predict(x_test)

# Map the outputs back through the scaler so they are expressed
# as real prices again
predictions = scaler.inverse_transform(scaled_predictions)

In [None]:
# Difference between the actual and predicted close price for each
# held-out day
errors = y_test - predictions

# Root mean squared error (RMSE), rounded to 2 decimal places
# lower number = better fit
rmse = round(np.sqrt(np.mean(errors ** 2)), 2)

# Root mean squared percentage error: the same calculation on the errors
# relative to the actual price, expressed as a percentage and rounded
# to 2 decimal places
relative_errors = errors / y_test
rmspe = round(np.sqrt(np.mean(np.square(relative_errors), axis=0))[0] * 100, 2)

print(f"\n\n The rmse(root mean square error) for the {stock} predictions is: ${rmse}")
print(f"\n The rmspe(root mean square percent error) for the {stock} predictions is: {rmspe}%")
print(f"\n This means the average difference between the actual close prices" 
f" and the prediction prices from the model is: \n${rmse} or {rmspe}%")

In [None]:
# Closing prices up to the training cutoff
training_data = data[:training_data_len]
# Closing prices after the cutoff. .copy() gives these rows their own
# dataframe, so adding a column below does not write into a slice of `data`.
prediction_data = data[training_data_len:].copy()
# Add the un-scaled model predictions next to the actual close prices
prediction_data['Predictions'] = predictions

print(f"\n\n The prediction data set:")
display(prediction_data)
print("\n\n")

# Folder for the saved graphs. makedirs also creates the parent 'models'
# folder when it is missing, and exist_ok=True makes the call a no-op when
# the stock folder already exists.
os.makedirs(f"models/{stock}", exist_ok=True)


print(f"\n\nHere are the graphs created using the predictions generated for {stock}: ")
# Line graph of the training prices followed by the actual and predicted
# prices for the held-out days.
# This gives a wide view of the overall stock trend with the predictions
# shown against the actual close prices at the end.
plt.figure(figsize=(16,8))
plt.title(f'{stock} Predictions Model With Training')
plt.xlabel('Year', fontsize=18)
plt.ylabel('Close Price ($)', fontsize=18)
# Three lines, in legend order: training close, actual close, predictions
plt.plot(training_data['Close'])
plt.plot(prediction_data[['Close', 'Predictions']])
plt.legend(['Training', 'Real Values', 'Predictions'], loc='lower right')
# Save before show() so the image is written from the finished figure
plt.savefig(f'models/{stock}/{stock}_predictions_with_training.png')
plt.show()


In [None]:
print("\n\n")

# Close-up line graph: only the held-out days, actual close prices
# against the model's predictions, without the training data.
# The image is saved into the stock's models folder.
actual_vs_predicted = prediction_data[['Close', 'Predictions']]
label_size = 18

plt.figure(figsize=(16,8))
plt.plot(actual_vs_predicted)
plt.legend(['Real Values', 'Predictions'], loc='lower right')
plt.title(f'{stock} Predictions Model Without Training')
plt.ylabel('Close Price ($)', fontsize=label_size)
plt.xlabel('Date', fontsize=label_size)
plt.savefig(f'models/{stock}/{stock}_predictions_without_training.png')

In [None]:
print("\n\n")
# Histogram with one series per column of prediction_data (actual close
# prices and predictions), with the bin edges chosen automatically.
# Comparing the two series shows how closely the predictions follow the
# actual closes across price ranges.
# The image is saved into the stock's models folder.
plt.hist(prediction_data, bins='auto')
plt.legend(['Actual','Predictions'], loc='upper right')
plt.ylabel('Number of Actual Closes at price')
plt.xlabel('Price')
plt.title(f"Histogram of {stock} prices")
plt.savefig(f'models/{stock}/{stock}_predictions_histogram.png')

In [None]:
print("\n\n")
# Scatterplot with one point per held-out day:
# x = actual close price, y = predicted price.
# The more tightly the points cluster together, the closer the
# predictions are to the actual prices.
# The image is saved into the stock's models folder.
actual_close = prediction_data['Close']
predicted_close = prediction_data['Predictions']

plt.scatter(actual_close, predicted_close)
plt.ylabel('Predictions ($)')
plt.xlabel('Close ($)')
plt.title(f'Scatterplot of {stock} prices')
plt.savefig(f'models/{stock}/{stock}_predictions_scatterplot.png')

In [None]:
# Saves closing and prediction prices to a csv file

# Export copy: add a 1-based 'PK' row number and put it first.
# 'Predictions' is kept in the selection - without it the file would hold
# only the actual close prices.
# The dataframe index is left in place, so to_csv writes it out as well.
data_csv = prediction_data.copy()
data_csv['PK'] = range(1, len(data_csv) + 1)
data_csv = data_csv[['PK', 'Close', 'Predictions']]

# Make sure the target folder exists even if this cell is run on its own
os.makedirs(f"data/{stock}", exist_ok=True)

# Write to data/<stock>/<stock>_predictions.csv
data_csv.to_csv(f"data/{stock}/{stock}_predictions.csv")


In [None]:
# This section uses the trained model and fitted scaler to produce
# a predicted close price for the day after the downloaded data ends.
# The data is downloaded again for the same symbol and date range.
future_price = web.DataReader(stock, data_source='yahoo', start=past_date, end=current_date)

future_df = future_price.filter(['Close'])

# Most recent 60 closing prices as a 2-D array (one column)
previous_60_days = future_df[-60:].values

# Scale with the already-fitted scaler (transform, not fit_transform) so
# the values are on the same scale the model was trained on
previous_60_days_scaled = scaler.transform(previous_60_days)

# A single sample for the model: shape (1, number of values, 1)
x_future = np.reshape(
    previous_60_days_scaled, (1, previous_60_days_scaled.shape[0], 1)
)

# Predict the price for next day
predicted_price = model.predict(x_future)

# Undo the scaling and pull the single value out of the 2-D result
predicted_price = scaler.inverse_transform(predicted_price)
predicted_price = predicted_price[0][0]

print(f"\n\n The prediction data set is:")
display(prediction_data)

# Date shown in the message: the next calendar day. timedelta rolls over
# month and year boundaries, whereas replace(day=day + 1) raises ValueError
# on the last day of a month. Weekends and market holidays are not skipped.
next_day = current_date + datetime.timedelta(days=1)

# print predicted price for next day
print(f"\nAnd the predicted price for {stock} stock" 
f" on {next_day} is: ${predicted_price}")