In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential, save_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [None]:
# Notebook configuration: every tunable value used by the cells below.
# NOTE(review): no random seed is configured in the cells visible here, so
# results will differ between runs unless one is set elsewhere.

# --- Data ---
DATA_FILE = "data_GOOGL_2010-2024.csv"
SEQUENCE_INTERVAL = 15  # number of consecutive rows in each input window
TEST_SIZE = 0.2  # fraction of windows passed to train_test_split as test_size

# --- Network ---
LSTM_UNITS = 50
DENSE_UNITS = 50
DROPOUT = 0.2  # rate given to every Dropout layer

# --- Training loop ---
EPOCHS = 50
BATCH_SIZE = 32

# --- Callback settings (learning-rate reduction and early stopping) ---
REDUCE_LR_FACTOR = 0.5
REDUCE_LR_PATIENCE = 10
MIN_LEARNING_RATE = 1e-6  # passed as min_lr to ReduceLROnPlateau
EARLY_STOPPING_PATIENCE = 5


In [None]:
# Read the CSV and index the frame by its parsed "Date" column.
# Built as one chain so the cell yields the same frame every time it is re-run.
df = (
    pd.read_csv(DATA_FILE)
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .set_index("Date")
)

In [None]:
# Min-max scale the five model features into [0, 1].
# 'Close' is deliberately first: later cells read column 0 as the target.
# NOTE(review): the scaler is fit on every row, including rows that later land
# in the test split, so the held-out period influences the scaling.
feature_columns = ['Close', 'High', 'Low', 'Open', 'Volume']
scaler = MinMaxScaler(feature_range=(0, 1))
df_scaled = scaler.fit(df[feature_columns]).transform(df[feature_columns])


In [None]:
# Build supervised samples from the scaled matrix:
#   X[k] = SEQUENCE_INTERVAL consecutive rows starting at row k (all features)
#   y[k] = column 0 of the row immediately after that window
window_starts = range(len(df_scaled) - SEQUENCE_INTERVAL)
X = np.array([df_scaled[start:start + SEQUENCE_INTERVAL] for start in window_starts])
y = np.array([df_scaled[start + SEQUENCE_INTERVAL, 0] for start in window_starts])

In [None]:
# Chronological train/test split.
# The windows are ordered in time and neighbouring windows share most of their
# rows, so a shuffled split would scatter near-duplicates of every test window
# into the training set and make the test error look better than real
# forecasting performance. With shuffle=False the final TEST_SIZE fraction of
# windows (the most recent period) is held out instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, shuffle=False)

print(len(X_train), len(X_test))

# Overlay the distributions of the scaled values in both splits. Because the
# split follows time, the two histograms may differ visibly if the series trends.
plt.figure(figsize=(10,4))
plt.hist(X_train.flatten(), bins=50, alpha=0.5, label="Treino")
plt.hist(X_test.flatten(), bins=50, alpha=0.5, label="Validação")
plt.legend()
plt.title("Distribuição dos Dados")
plt.show()



In [None]:
# Stacked-LSTM regressor: three LSTM layers, each followed by dropout, then a
# Dense head that ends in a single output unit. Trained with Adam on MSE.
window_length, feature_count = X_train.shape[1], X_train.shape[2]

model = Sequential()
model.add(LSTM(units=LSTM_UNITS, return_sequences=True, input_shape=(window_length, feature_count)))
model.add(Dropout(DROPOUT))
# The middle LSTM keeps the full sequence so the next LSTM can consume it.
model.add(LSTM(units=LSTM_UNITS, return_sequences=True))
model.add(Dropout(DROPOUT))
# The last LSTM emits only its final output (return_sequences=False).
model.add(LSTM(units=LSTM_UNITS, return_sequences=False))
model.add(Dropout(DROPOUT))
# NOTE(review): no activation is passed to this Dense layer; if Keras' default
# (linear) applies, the two Dense layers together are a single linear map —
# confirm this is intended.
model.add(Dense(units=DENSE_UNITS))
model.add(Dense(units=1))

model.compile(optimizer="adam", loss="mean_squared_error")
model.summary()

In [None]:
# Training callbacks, both watching val_loss:
#   - ReduceLROnPlateau multiplies the learning rate by REDUCE_LR_FACTOR after
#     REDUCE_LR_PATIENCE epochs without improvement, never going below MIN_LEARNING_RATE.
#   - EarlyStopping ends training after EARLY_STOPPING_PATIENCE epochs without
#     improvement and restores the best weights seen.
# Each callback must read its *own* patience constant (they are easy to swap).
# NOTE(review): if EARLY_STOPPING_PATIENCE is smaller than REDUCE_LR_PATIENCE in
# the config cell, training may stop before the learning rate is ever reduced;
# retune the values there if that is not the intent.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=REDUCE_LR_FACTOR,
    patience=REDUCE_LR_PATIENCE,
    min_lr=MIN_LEARNING_RATE,
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=EARLY_STOPPING_PATIENCE,
    restore_best_weights=True,
)

# NOTE(review): the test split doubles as the validation set here, and the same
# arrays are scored again in the evaluation cell, so those metrics are not
# computed on data the training procedure never saw.
history = model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_test, y_test),
    callbacks=[reduce_lr, early_stopping],
)

# Learning curves: training loss vs. validation loss per epoch.
plt.plot(history.history['loss'], label="Loss Treino")
plt.plot(history.history['val_loss'], label="Loss Validação")
plt.legend()
plt.show()

In [None]:
def _close_to_price(scaled_close):
    """Map scaled Close values back to the original price scale.

    The scaler was fit on several columns with Close first, so the values are
    placed in column 0 of a zero-padded matrix of the width the scaler expects,
    inverse-transformed, and column 0 is returned. The padding width is read
    from the fitted scaler rather than hard-coded, so the helper keeps working
    if the feature list changes.

    Parameters:
        scaled_close: array-like of scaled Close values, shape (n,) or (n, 1).

    Returns:
        1-D NumPy array of n values on the original Close scale.
    """
    scaled_close = np.asarray(scaled_close).reshape(-1, 1)
    padding = np.zeros((scaled_close.shape[0], scaler.n_features_in_ - 1))
    return scaler.inverse_transform(np.column_stack((scaled_close, padding)))[:, 0]


# Predict on the held-out windows and bring predictions and targets back to price units.
y_pred = _close_to_price(model.predict(X_test))
y_test_real = _close_to_price(y_test)

# Calculating the difference between real and predicted values
difference = y_test_real - y_pred

# Displaying the difference for each entry
for entry_number, (real, predicted, gap) in enumerate(zip(y_test_real, y_pred, difference), start=1):
    print(f"Entry {entry_number}: Real = {real}, Predicted = {predicted}, Difference = {gap}")

# Aggregate error metrics, in the same units as Close.
mae = mean_absolute_error(y_test_real, y_pred)
rmse = np.sqrt(mean_squared_error(y_test_real, y_pred))

print(f"MAE: {mae}")
print(f"RMSE: {rmse}")

In [None]:
# Persist the trained network to disk in the native .keras format.
model_path = "google_prediction_model.keras"
save_model(model, model_path)