# Sequence Model Research

The scope of this notebook is to assess and train different sequence models given the training data generated.

Training data is generated based on financial time series data labeled with potential profits using a buy-sell system.

The goal is to create a sequence model that can choose favourable stock charts equal to or better than a human can via traditional technical analysis.

## Import Libraries and Data

In [4]:
import os
import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import matplotlib.pyplot as plt

# List available devices
devices = tf.config.list_physical_devices('GPU')
print("GPUs available: ", devices)

# Confirm TensorFlow is using the GPU
if devices:
    print("TensorFlow is using the GPU")
else:
    print("TensorFlow is not using the GPU")

# Define the data directory relative to the script location
data_dir = 'data'

# Define the file paths
sequences_path = os.path.join(data_dir, 'sequences.npy')
labels_path = os.path.join(data_dir, 'labels.npy')
metadata_path = os.path.join(data_dir, 'metadata.npy')

# Load the data
try:
    data_sequences = np.load(sequences_path)
    data_labels = np.load(labels_path)
    data_metadata = np.load(metadata_path)

    # Inspect the shape of the loaded data
    print(f'Sequences shape: {data_sequences.shape}')
    print(f'Labels shape: {data_labels.shape}')
    print(f'Metadata shape: {data_metadata.shape}')

    print(f'Sequences shape: {data_sequences}')
    print(f'Labels shape: {data_labels}')
    print(f'Metadata shape: {data_metadata}')

except FileNotFoundError as e:
    print(f"Error loading files: {e}")



2024-06-07 15:40:59.325636: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-06-07 15:40:59.520414: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-06-07 15:40:59.520459: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


GPUs available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow is using the GPU
Sequences shape: (126071, 252, 15)
Labels shape: (126071,)
Metadata shape: (126071, 2)
Sequences shape: [[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  ...
  [2.34932089e-14 2.33649087e-14 2.45510655e-14 ...            nan
   5.62408901e-15 0.00000000e+00]
  [2.31486709e-14 2.39308785e-14 2.34916386e-14 ...            nan
   5.58780764e-15 0.00000000e+00]
  [2.25672614e-14 2.31299789e-14 2.39983216e-14 ...            nan
   5.67213783e-15 0.00000000e+00]]

 [[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  

## Data Preprocessing

In [6]:
# Shuffle the data
indices = np.arange(data_sequences.shape[0])
np.random.shuffle(indices)

data_sequences = data_sequences[indices]
data_labels = data_labels[indices]
data_metadata = data_metadata[indices]

# Define the threshold for considering a buy
threshold = 0.1  # Example threshold value

# Transform labels to binary (1 for buy, 0 for no buy)
binary_labels = (data_labels >= threshold).astype(int)

# Define the proportions for the splits
train_size = 0.7  # 70%
validation_size = 0.15  # 15%
test_size = 0.15  # 15%

# Calculate the number of samples for each set
num_samples = data_sequences.shape[0]
train_end = int(num_samples * train_size)
validation_end = int(num_samples * (train_size + validation_size))

# Split the data
X_train = data_sequences[:train_end]
y_train = binary_labels[:train_end]
profits_train = data_labels[:train_end]

X_val = data_sequences[train_end:validation_end]
y_val = binary_labels[train_end:validation_end]
profits_val = data_labels[train_end:validation_end]

X_test = data_sequences[validation_end:]
y_test = binary_labels[validation_end:]
profits_test = data_labels[validation_end:]

# Convert numpy arrays to pandas DataFrames for easier handling of NaNs
X_train_df = pd.DataFrame(X_train.reshape(-1, X_train.shape[-1]))
X_val_df = pd.DataFrame(X_val.reshape(-1, X_val.shape[-1]))
X_test_df = pd.DataFrame(X_test.reshape(-1, X_test.shape[-1]))

# Replace NaNs with 0
X_train_df.fillna(0, inplace=True)
X_val_df.fillna(0, inplace=True)
X_test_df.fillna(0, inplace=True)

# Convert DataFrames back to numpy arrays
X_train = X_train_df.values.reshape(-1, X_train.shape[1], X_train.shape[2])
X_val = X_val_df.values.reshape(-1, X_val.shape[1], X_val.shape[2])
X_test = X_test_df.values.reshape(-1, X_test.shape[1], X_test.shape[2])

# Verify there are no NaNs left
assert not np.isnan(X_train).any(), "NaNs found in X_train"
assert not np.isnan(X_val).any(), "NaNs found in X_val"
assert not np.isnan(X_test).any(), "NaNs found in X_test"

print("NaNs replaced with 0 successfully.")

# Inspect the shape of the splits
print(f'Training set shape: {X_train.shape}, {y_train.shape}')
print(f'Validation set shape: {X_val.shape}, {y_val.shape}')
print(f'Test set shape: {X_test.shape}, {y_test.shape}')

print(np.min(data_sequences))

NaNs replaced with 0 successfully.
Training set shape: (88249, 252, 15), (88249,)
Validation set shape: (18911, 252, 15), (18911,)
Test set shape: (18911, 252, 15), (18911,)
0.0


## Model

In [None]:
# Build the LSTM model
model = Sequential()

# Input layer
model.add(Input(shape=(X_train.shape[1], X_train.shape[2])))

# First LSTM layer (return_sequences=True to pass sequences to the next LSTM layer)
model.add(LSTM(100, return_sequences=True))

# Dropout for regularization
model.add(Dropout(0.2))

# Second LSTM layer (final LSTM layer, return_sequences=False to pass a single vector)
model.add(LSTM(50, return_sequences=False))

# Dropout for regularization
model.add(Dropout(0.2))

# Dense layer with 25 units
model.add(Dense(25, activation='relu'))

# Output layer with 1 unit for binary classification (sigmoid activation)
model.add(Dense(1, activation='sigmoid'))

# Compile the model using the Adam optimizer with a lower learning rate and gradient clipping
optimizer = Adam(learning_rate=1e-4, clipnorm=1.0)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Train the model with validation data
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=20)

# Plot training & validation loss values
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Plot training & validation accuracy values
plt.figure(figsize=(10, 6))
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Predicting and evaluating the model
train_predict = model.predict(X_train)
val_predict = model.predict(X_val)
test_predict = model.predict(X_test)

# Convert probabilities to binary predictions
train_predict = (train_predict > 0.5).astype(int)
val_predict = (val_predict > 0.5).astype(int)
test_predict = (test_predict > 0.5).astype(int)

# Calculate accuracy
train_accuracy = np.mean(train_predict == y_train)
val_accuracy = np.mean(val_predict == y_val)
test_accuracy = np.mean(test_predict == y_test)

print(f'Train Accuracy: {train_accuracy * 100:.2f}%')
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')



KeyboardInterrupt: 

In [None]:

if np.isnan(X_train).any() or np.isnan(y_train).any():
    print("NaNs found in training data!")
if np.isnan(X_val).any() or np.isnan(y_val).any():
    print("NaNs found in validation data!")
if np.isnan(X_test).any() or np.isnan(y_test).any():
    print("NaNs found in test data!")

X_train.shape

out = data_sequences[2,:,:]
np.save("out.npy", out)


In [None]:
np.min(data_sequences)
np.argwhere(np.isnan(data_sequences))