In [13]:
# Carrega as variáveis de embiente usadas pelo notebook. Estas variáveis devem ser criadas no arquivo .env, localizada no mesmo diretório do notebook
# As seguintes variáveis devem ser criadas:
# - export STR_CON=<string de conexão com o banco de dados relacional>
# - export RAW_DATA_PATH=<caminho onde os arquivos contendo o histórico de preço de cada ativo será armazenado>
# - export DATASET_PATH=<caminho onde os arquivos contendo o histórico de preço e os indicadores técnicos serão armazenados>
# - export MODELS_PATH=<caminnho onde os modelos treinados serão armazenados>
# - export TRAIN_DATASET=<caminho onde os datasets estandarizados e formatados para o treinamento serão armazenados>
%load_ext dotenv
%dotenv

# Garante que os scripts Python sejam recarregados em cada execução do notebook.
%load_ext autoreload
%autoreload 2

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import pandas as pd
import numpy as np
from data_preparation import PreProcess

preprocess = PreProcess()
df_train = preprocess.read_dataset_from_parquet("data/train_dataset/indicadores_com_sinais/train_data.parquet")
df_test = preprocess.read_dataset_from_parquet("data/train_dataset/indicadores_com_sinais/test_data.parquet")

X_Train = np.array([val[0] for val in df_train.values])
X_Test = np.array([val[0] for val in df_test.values])

In [15]:
X_Train.shape

(56503, 45, 25)

In [18]:
from keras.layers import Input, LSTM, RepeatVector, Dense, LeakyReLU, TimeDistributed
from keras.models import Model, Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import plot_model


input_data = Input((X_Train.shape[1], X_Train.shape[2]) )
# Encoder
encoder = LSTM(32, input_shape=X_Train.shape, return_sequences=True)(input_data)
encoder = LSTM(16, input_shape=X_Train.shape)(encoder)
# _encoder = Sequential([
#     LSTM(50, input_shape=X_Train[0].shape)
# ])

# _decoder = Sequential([
#     RepeatVector(X_Train.shape[1]),
#     LSTM(50, return_sequences=True),
#     TimeDistributed(Dense(X_Train.shape[2]))])

bridge = RepeatVector(X_Train.shape[1])(encoder)

# Decoder
decoder = LSTM(16, input_shape=X_Train.shape, return_sequences=True)(bridge)
decoder = LSTM(32, input_shape=X_Train.shape, return_sequences=True)(decoder)
decoder = TimeDistributed(Dense(X_Train.shape[2]))(decoder)

autoencoder = Model(input_data, decoder)
# model = Sequential([_encoder, _decoder])
# model.compile(optimizer='adam', loss='mse')

autoencoder.summary()
plot_model(autoencoder, 'autoencoder_compress.png', show_shapes=True)
# plot_model(_decoder, 'autoencoder_decompress.png', show_shapes=True)

autoencoder.compile(loss='mae', 
                    optimizer='adam')

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 45, 25)]          0         
                                                                 
 lstm_16 (LSTM)              (None, 45, 32)            7424      
                                                                 
 lstm_17 (LSTM)              (None, 16)                3136      
                                                                 
 repeat_vector_4 (RepeatVect  (None, 45, 16)           0         
 or)                                                             
                                                                 
 lstm_18 (LSTM)              (None, 45, 16)            2112      
                                                                 
 lstm_19 (LSTM)              (None, 45, 32)            6272      
                                                           

In [19]:
batch_size = 32
epochs = 200

cp_early_stopping = EarlyStopping(monitor='val_loss', 
                               patience=10, 
                               min_delta=0.001, 
                               mode='auto')
                               
cp_model_checkpoint = ModelCheckpoint("data/tf_models/autoencoder/autoencoder_{epoch:02d}-{val_loss:.4f}.h5",
                                monitor=f'val_loss',
                                save_best_only=True,
                                verbose=1,
                                mode="min")

autoencoder.fit(X_Train, X_Train, batch_size=batch_size, epochs=epochs, validation_data=(X_Test, X_Test), callbacks=[cp_early_stopping, cp_model_checkpoint])

Epoch 1/200


2022-12-28 20:33:47.739544: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-28 20:33:48.409247: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-28 20:33:48.592203: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-28 20:33:48.783600: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-28 20:33:48.980701: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-28 20:33:49.255774: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-28 20:33:49.571272: I tensorflow/core/grappler/optimizers/cust



2022-12-28 20:38:11.674035: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-28 20:38:11.897352: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-28 20:38:12.088611: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-28 20:38:12.271321: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-28 20:38:12.440171: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.



Epoch 1: val_loss improved from inf to 0.08010, saving model to data/tf_models/autoencoder/autoencoder_01-0.0801.h5
Epoch 2/200
Epoch 2: val_loss improved from 0.08010 to 0.06559, saving model to data/tf_models/autoencoder/autoencoder_02-0.0656.h5
Epoch 3/200
Epoch 3: val_loss improved from 0.06559 to 0.05938, saving model to data/tf_models/autoencoder/autoencoder_03-0.0594.h5
Epoch 4/200
Epoch 4: val_loss improved from 0.05938 to 0.05889, saving model to data/tf_models/autoencoder/autoencoder_04-0.0589.h5
Epoch 5/200
Epoch 5: val_loss improved from 0.05889 to 0.05381, saving model to data/tf_models/autoencoder/autoencoder_05-0.0538.h5
Epoch 6/200
Epoch 6: val_loss improved from 0.05381 to 0.04981, saving model to data/tf_models/autoencoder/autoencoder_06-0.0498.h5
Epoch 7/200
Epoch 7: val_loss improved from 0.04981 to 0.04871, saving model to data/tf_models/autoencoder/autoencoder_07-0.0487.h5
Epoch 8/200
Epoch 8: val_loss improved from 0.04871 to 0.04774, saving model to data/tf_mod

<keras.callbacks.History at 0x1b4236670>

In [25]:
from keras.models import load_model

model = load_model("data/tf_models/autoencoder/autoencoder_43-0.0376.h5")

model.layers[:3]

latent_layer = Model()
# loss = 0.0792
# X_Test_repr = _encoder.predict(X_Test)
# X_Train_repr = _encoder.predict(X_Train)
# X_Train_repr.shape, X_Test_repr.shape

[<keras.engine.input_layer.InputLayer at 0x158677a90>,
 <keras.layers.rnn.lstm.LSTM at 0x158677fd0>,
 <keras.layers.rnn.lstm.LSTM at 0x147eb54c0>]

In [None]:
from sklearn.linear_model import LogisticRegressionCV

logit = LogisticRegressionCV(cv=10, max_iter=500, n_jobs=-1)
logit.fit(X_Train_repr, df_train['label'].values)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

Y_test_predict = logit.predict(X_Test_repr)
print(accuracy_score(df_test['label'].values, Y_test_predict))
print(confusion_matrix(df_test['label'].values, Y_test_predict))
print(precision_score(df_test['label'].values, Y_test_predict))
print(recall_score(df_test['label'].values, Y_test_predict))
print(f1_score(df_test['label'].values, Y_test_predict))

0.6305157874092959
[[6343   70]
 [3698   87]]
0.554140127388535
0.02298546895640687
0.04414003044140031
