# Setup

---

The idea for this project came from a [Medium post](https://medium.com/@dmytrosazonov/how-to-predict-stock-market-using-google-tensorflow-and-lstm-neural-network-81ccc41a22a8). We improve on the post's code by adding new models and new analyses. To do so, the group relied on optimizations suggested by [ChatGPT](https://chat.openai.com/) and on [Aurélien Géron's public notebooks](https://github.com/ageron/handson-ml2/blob/master/15_processing_sequences_using_rnns_and_cnns.ipynb).


In [None]:
import datetime as dt
import time as tm

# AI
import keras

# Graphics library
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

# Data preparation
from yahoo_fin import stock_info as yf

from utils import (
    last_time_step_mse,
    plot_learning_curves,
    plot_multiple_forecasts,
    plot_series,
)

## Data

--- 

#### Load Data From Yahoo API

In [None]:
# Seed TensorFlow's global random generator.
tf.random.set_seed(42)

ANALYZED_YEARS = 5 # number of years of price history to analyze
STOCK = "GOOGL" # ticker of the stock to analyze
INTERVAL = "1d" # sampling interval requested from the data source
N_STEPS = 22  # window length: 22 business days per month
EPOCHS = 50 # number of training epochs

In [None]:
# Fetch the price history for STOCK: the range ends today and starts
# 365 * ANALYZED_YEARS days earlier, sampled at INTERVAL.
history_span = dt.timedelta(days=365 * ANALYZED_YEARS)
first_day = (dt.date.today() - history_span).strftime("%Y-%m-%d")
last_day = tm.strftime("%Y-%m-%d")

init_df = yf.get_data(
    STOCK, start_date=first_day, end_date=last_day, interval=INTERVAL
)

# Keep an untouched copy of the download on disk.
init_df.to_csv(f"./data/raw/{STOCK}_{ANALYZED_YEARS}Y_{INTERVAL}.csv")

# Preview the first five rows.
init_df.head()

In [None]:
# Keep only the closing price; the other downloaded columns are not used.
unused_columns = ["open", "high", "low", "adjclose", "ticker", "volume"]
init_df = init_df.drop(columns=unused_columns)

# Copy the index into a regular "date" column, then renumber the rows.
init_df["date"] = init_df.index
init_df = init_df.reset_index(drop=True)

# Rescale the closing prices to the 0-1 range.
# NOTE(review): the scaler is fitted on the whole series, before the
# train/test split performed later in this file, so the test prices
# influence the scaling.
scaler = MinMaxScaler()
close_column = init_df["close"].values.reshape(-1, 1)
init_df["close_norm"] = scaler.fit_transform(close_column)

# Save the processed table.
init_df.to_csv(f"./data/processed/{STOCK}_{ANALYZED_YEARS}Y_{INTERVAL}.csv")

# Save the normalized closing prices on their own (np.save appends ".npy").
np.save(
    f"./data/processed/{STOCK}_{ANALYZED_YEARS}Y_{INTERVAL}_norm",
    init_df["close_norm"],
)

# Preview the first five rows.
init_df.head()


#### Visualization 

In [None]:
# Line chart of the closing price over time.
plt.style.use(style="ggplot")
fig, ax = plt.subplots(figsize=(16, 10))
ax.plot(init_df["date"], init_df["close"], label=f"Actual price for {STOCK}")
ax.set_xlabel("days")
ax.set_ylabel("price")
ax.legend()
plt.show()

#### Data split

In [None]:
# Load the normalized closing prices saved by the preprocessing step.
data = np.load(f"./data/processed/{STOCK}_{ANALYZED_YEARS}Y_{INTERVAL}_norm.npy")

# Cut the series into consecutive, non-overlapping windows with shape
# (n_windows, N_STEPS, 1). The leading samples that do not fill a whole
# window are discarded.
n_windows, leftover = divmod(data.size, N_STEPS)
data = data[leftover:].reshape(n_windows, N_STEPS, 1)

# The first 80% of the windows are used for training, the rest for testing.
# Inputs are the first N_STEPS - 1 values of each window; the target is the
# window's last value.
split_at = int(data.shape[0] * 0.8)
inputs, targets = data[:, : N_STEPS - 1], data[:, -1]
X_train, y_train = inputs[:split_at], targets[:split_at]
X_test, y_test = inputs[split_at:], targets[split_at:]

## Models

---

#### Naive Forecasting

In [None]:
# Naive baseline: predict that the target equals the last observed value of
# each test window.
y_pred = X_test[:, -1]

# Mean squared error of the baseline. Keras documents the signature as
# (y_true, y_pred), so the targets go first; the resulting value is the same
# either way because the squared difference is symmetric.
np.mean(keras.losses.mean_squared_error(y_test, y_pred))

#### Linear Regression

In [None]:
# Seed NumPy and TensorFlow before building the model.
np.random.seed(42)
tf.random.set_seed(42)

# Linear regression expressed as a Keras model: the input window is flattened
# and fed to a single Dense unit.
linear_layers = [
    keras.layers.Flatten(input_shape=[N_STEPS - 1, 1]),
    keras.layers.Dense(1),
]
model = keras.models.Sequential(linear_layers)

# Adam optimizer with mean squared error as the loss.
model.compile(optimizer="adam", loss="mse")

# Train on the training windows; the test windows double as validation data.
history = model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
)

# Persist the trained model.
model.save(f"./models/LR_{STOCK}_{ANALYZED_YEARS}Y_{INTERVAL}-{N_STEPS}timeSteps")

In [None]:
# Loss of the trained model on the test windows.
model.evaluate(X_test, y_test)

# Training loss against validation loss, per epoch.
training_log = history.history
plot_learning_curves(training_log["loss"], training_log["val_loss"])
plt.show()

In [None]:
# Predict the target of every test window.
y_pred = model.predict(X_test)

# Compare target and prediction for the first test window, converted back to
# the original price scale. The scaler was fitted on a single column, so each
# array is passed to it as a column vector.
first_window = scaler.inverse_transform(X_test[0])[:, 0]
first_target = scaler.inverse_transform(y_test[:1])
first_forecast = scaler.inverse_transform(y_pred[:1])
plot_series(first_window, first_target, first_forecast, n_steps=N_STEPS)
plt.show()

#### Deep RNN

In [None]:
# Deep RNN
# Seed NumPy and TensorFlow as the other model cells in this file do, so
# that training this model is reproducible as well.
np.random.seed(42)
tf.random.set_seed(42)

# Two stacked SimpleRNN layers of 20 units each. The first returns its full
# output sequence so the second can consume it; the second returns only its
# last output, which the Dense layer maps to a single predicted value.
model = keras.models.Sequential(
    [
        keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),
        keras.layers.SimpleRNN(20),
        keras.layers.Dense(1),
    ]
)

# Adam optimizer with mean squared error as the loss.
model.compile(loss="mse", optimizer="adam")

# Train on the training windows; the test windows double as validation data.
history = model.fit(X_train, y_train, epochs=EPOCHS, validation_data=(X_test, y_test))

# Persist the trained model.
model.save(f"./models/RNN_{STOCK}_{ANALYZED_YEARS}Y_{INTERVAL}-{N_STEPS}timeSteps")


In [None]:
# Training loss against validation loss, per epoch.
training_log = history.history
plot_learning_curves(training_log["loss"], training_log["val_loss"])
plt.show()

In [None]:
# Predict the target of every test window.
y_pred = model.predict(X_test)

# Compare target and prediction for the first test window, converted back to
# the original price scale. The scaler was fitted on a single column, so each
# array is passed to it as a column vector.
first_window = scaler.inverse_transform(X_test[0])[:, 0]
first_target = scaler.inverse_transform(y_test[:1])
first_forecast = scaler.inverse_transform(y_pred[:1])
plot_series(first_window, first_target, first_forecast, n_steps=N_STEPS)
plt.show()

### Forecast several time steps ahead

---

To forecast several time steps ahead, we need a different way of preparing the data and more robust models with memory cells. Here we compare an LSTM model with a GRU model.

In [None]:
# Forecasting several steps ahead.
# NOTE: N_STEPS is rebound here; cells above that read it will see the new
# value if they are re-run after this one.
N_STEPS = 30  # number of time steps per window
FORECAST_DAYS = 5  # number of days to forecast

data = np.load(f"./data/processed/{STOCK}_{ANALYZED_YEARS}Y_{INTERVAL}_norm.npy")

# Cut the series into non-overlapping windows with shape
# (n_windows, N_STEPS, 1), discarding the leading samples that do not fill a
# whole window.
n_windows, leftover = divmod(data.size, N_STEPS)
data = data[leftover:].reshape(n_windows, N_STEPS, 1)

# The model sees the first N_STEPS - FORECAST_DAYS values of each window.
input_len = N_STEPS - FORECAST_DAYS
split_at = int(data.shape[0] * 0.8)  # 80% train / 20% test

X_train = data[:split_at, :input_len]
X_test = data[split_at:, :input_len]

# Targets: for every input time step t, the FORECAST_DAYS values that follow
# it in the window (t + 1 ... t + FORECAST_DAYS), one per channel of the last
# axis.
Y = np.stack(
    [
        data[:, shift : shift + input_len, 0]
        for shift in range(1, FORECAST_DAYS + 1)
    ],
    axis=-1,
)

y_train = Y[:split_at]
y_test = Y[split_at:]


#### LSTMs

In [None]:
# Seed NumPy and TensorFlow before building the model.
np.random.seed(42)
tf.random.set_seed(42)

# LSTM model for multi-day forecasting: two stacked LSTM layers of 20 units,
# both returning their full output sequence, followed by a Dense layer applied
# at every time step that outputs FORECAST_DAYS values.
model = keras.models.Sequential(
    [
        keras.layers.LSTM(20, return_sequences=True, input_shape=[None, 1]),
        keras.layers.LSTM(20, return_sequences=True),
        keras.layers.TimeDistributed(keras.layers.Dense(FORECAST_DAYS)),
    ]
)

# The loss is the MSE over all time steps; last_time_step_mse is tracked as
# an additional metric.
model.compile(loss="mse", optimizer="adam", metrics=[last_time_step_mse])

# Train on the training windows; the test windows double as validation data.
history = model.fit(X_train, y_train, epochs=EPOCHS, validation_data=(X_test, y_test))

# Persist the trained model, as is done for the other models in this file.
model.save(
    f"./models/LSTM_{STOCK}_{ANALYZED_YEARS}Y_{INTERVAL}-{FORECAST_DAYS}daysPrediction"
)


In [None]:
# Loss and metric of the trained model on the test windows.
model.evaluate(X_test, y_test)

# Training loss against validation loss, per epoch.
training_log = history.history
plot_learning_curves(training_log["loss"], training_log["val_loss"])
plt.show()

In [None]:
# NOTE(review): nothing in this cell appears to draw from NumPy's random
# generator; the seed call is kept from the original cell.
np.random.seed(43)

# Predict on the test windows only (the last 20% of `data`, the same split
# used for X_test / y_test) so the comparison is made on windows the model
# was not trained on.
test_start = int(data.shape[0] * 0.8)
X_new, Y_new = (
    data[test_start:, : N_STEPS - FORECAST_DAYS, :],  # model inputs
    data[test_start:, N_STEPS - FORECAST_DAYS :, :],  # the FORECAST_DAYS actual values
)

# Keep only the output of the last input time step: its FORECAST_DAYS values
# are the forecast for the days that follow the input window.
Y_pred = model.predict(X_new)[:, -1][..., np.newaxis]
plot_multiple_forecasts(X_new, Y_new, Y_pred, scaler)
plt.show()


#### GRUs

In [None]:
# Seed NumPy and TensorFlow before building the model.
np.random.seed(42)
tf.random.set_seed(42)

# GRU model for multi-day forecasting: two stacked GRU layers of 20 units,
# both returning their full output sequence, followed by a Dense layer applied
# at every time step that outputs FORECAST_DAYS values.
model = keras.models.Sequential(
    [
        keras.layers.GRU(20, return_sequences=True, input_shape=[None, 1]),
        keras.layers.GRU(20, return_sequences=True),
        keras.layers.TimeDistributed(keras.layers.Dense(FORECAST_DAYS)),
    ]
)

# The loss is the MSE over all time steps; last_time_step_mse is tracked as
# an additional metric.
model.compile(loss="mse", optimizer="adam", metrics=[last_time_step_mse])

# Train on the training windows; the test windows double as validation data.
history = model.fit(X_train, y_train, epochs=EPOCHS, validation_data=(X_test, y_test))

# Persist the trained model. The horizon in the file name follows
# FORECAST_DAYS instead of a hard-coded 5, so the name stays correct if the
# horizon is changed.
model.save(
    f"./models/GRU_{STOCK}_{ANALYZED_YEARS}Y_{INTERVAL}-{FORECAST_DAYS}daysPrediction"
)

In [None]:
# Loss and metric of the trained model on the test windows.
model.evaluate(X_test, y_test)

# Training loss against validation loss, per epoch.
training_log = history.history
plot_learning_curves(training_log["loss"], training_log["val_loss"])
plt.show()

In [None]:
# NOTE(review): nothing in this cell appears to draw from NumPy's random
# generator; the seed call is kept from the original cell.
np.random.seed(43)

# Predict on the test windows only (the last 20% of `data`, the same split
# used for X_test / y_test) so the comparison is made on windows the model
# was not trained on.
test_start = int(data.shape[0] * 0.8)
X_new, Y_new = (
    data[test_start:, : N_STEPS - FORECAST_DAYS, :],  # model inputs
    data[test_start:, N_STEPS - FORECAST_DAYS :, :],  # the FORECAST_DAYS actual values
)

# Keep only the output of the last input time step: its FORECAST_DAYS values
# are the forecast for the days that follow the input window.
Y_pred = model.predict(X_new)[:, -1][..., np.newaxis]
plot_multiple_forecasts(X_new, Y_new, Y_pred, scaler)
plt.show()

### Conclusion

--- 
As we can see, some of the tested models didn't work as expected. The Naive Forecasting and Linear Regression models didn't work well. This happens because the stock market is a non-linear system, and these models take into account only the linear relation between the variables and the target. On the other hand, the LSTM and GRU models performed better and can be used to forecast several time steps ahead, but they are nonetheless still not good enough. Finally, the Deep RNN is the one that came closest to the real values. This happens because RNNs are able to learn the non-linear relations between the variables and the target and therefore make a better prediction.