# Sequence Model Research

The scope of this notebook is to assess and train different sequence models given the training data generated.

Training data is generated based on financial time series data labeled with potential profits using a buy-sell system.

The goal is to create a sequence model that can choose favourable stock charts equal to or better than a human can via traditional technical analysis.

## Import Libraries and Data

In [4]:
import os
import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# List available devices
devices = tf.config.list_physical_devices('GPU')
print("GPUs available: ", devices)

# Confirm TensorFlow is using the GPU
if devices:
    print("\nTensorFlow is using the GPU\n")
else:
    print("\nTensorFlow is not using the GPU\n")

# Define the data directory relative to the script location
data_dir = 'data'

# Define the file paths
sequences_path = os.path.join(data_dir, 'sequences.npy')
labels_path = os.path.join(data_dir, 'labels.npy')
metadata_path = os.path.join(data_dir, 'metadata.npy')

# Load the data
try:
    data_sequences = np.load(sequences_path)
    data_labels = np.load(labels_path)
    data_metadata = np.load(metadata_path)

    # Inspect the shape of the loaded data
    print(f'Sequences shape: {data_sequences.shape}')
    print(f'Labels shape: {data_labels.shape}')
    print(f'Metadata shape: {data_metadata.shape}')

except FileNotFoundError as e:
    print(f"Error loading files: {e}")

# Define relevant columns and indices for normalization
relevant_columns = [
    'Open', 'High', 'Low', 'Close', 'Volume', 'Turnover', 'Consol_Detected',
    'Consol_Len_Bars', 'Consol_Depth_Percent', 'Close_21_bar_ema',
    'Close_50_bar_sma', 'Close_150_bar_sma', 'Close_200_bar_sma',
    'RSL', 'RSL_NH'
]

price_columns_indices = [0, 1, 2, 3, 9, 10, 11, 12]  # Indices of price-related columns in the sequence data

# Map indices to column names
price_columns = [relevant_columns[i] for i in price_columns_indices]

print("\nPrice-related columns:")
for index, column in zip(price_columns_indices, price_columns):
    print(f"Index: {index}, Column: {column}")

GPUs available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

TensorFlow is using the GPU

Sequences shape: (126133, 252, 15)
Labels shape: (126133,)
Metadata shape: (126133, 2)

Price-related columns:
Index: 0, Column: Open
Index: 1, Column: High
Index: 2, Column: Low
Index: 3, Column: Close
Index: 9, Column: Close_21_bar_ema
Index: 10, Column: Close_50_bar_sma
Index: 11, Column: Close_150_bar_sma
Index: 12, Column: Close_200_bar_sma


## Data Preprocessing

### NaN Removal

In [5]:
# replace all nan with 0. These nans are due to the moving averages having insufficient data to compute anything leaving blank inputs
# check if nans exist

# Dictionary to map variable names to their corresponding data arrays
data_dict = {
    'data_sequences': data_sequences,
    'data_labels': data_labels,
}

# Using a dictionary to iterate over variables
for var_name, data in data_dict.items():
    num_nans = np.sum(np.isnan(data))
    print(f"NaNs in {var_name}: {num_nans}")

    # Remove NaNs
    if num_nans > 0:
        data_dict[var_name][:] = np.nan_to_num(data)
        num_nans = np.sum(np.isnan(data))
        print(f"NaNs remaining in {var_name} after removal: {num_nans}")

print(f"Data Seq Min: {np.min(data_sequences[:,:,price_columns_indices])}")
print(f"Data Seq Max: {np.max(data_sequences[:,:,price_columns_indices])}")

NaNs in data_sequences: 4995309
NaNs remaining in data_sequences after removal: 0
NaNs in data_labels: 0
Data Seq Min: 0.0
Data Seq Max: 468223510183936.0


### Corrupted sequence removal

99% of stocks I buy will be below 1000, with a few above 1000, although they are important.

I also noticed quite a few training examples have weird price data, which I filter out below.

I noticed with thresholds above 3e3, the max is the threshold, which is very suspect.

The loss of training examples is insignificant, and the result is better normalization of the data and obviously no corrupted sequences.

In [7]:
# Map indices to column names
price_columns = [relevant_columns[i] for i in price_columns_indices]

# Set the threshold for abnormal values
threshold = 3.0e3

# Detect all sequences with abnormally large price data
abnormal_sequences = []

# Iterate through each sequence to check for abnormal values
for sequence_index in range(data_sequences.shape[0]):
    # Extract price-related columns for the current sequence
    price_data = data_sequences[sequence_index, :, price_columns_indices]
    
    # Check if any value in the price_data exceeds the threshold
    if np.any(price_data > threshold):
        abnormal_sequences.append(sequence_index)

# Print the indices of the abnormal sequences
print(f"Abnormal Sequence Count: {len(abnormal_sequences)}")
print(f"Indices of abnormal sequences: {abnormal_sequences}")

# Create a mask for sequences that are not abnormal
mask = np.ones(data_sequences.shape[0], dtype=bool)
mask[abnormal_sequences] = False

# Filter out abnormal sequences from data_sequences and data_labels
filtered_data_sequences = data_sequences[mask]
filtered_data_labels = data_labels[mask]

# Print the shape of the filtered data
print(f"Filtered data_sequences shape: {filtered_data_sequences.shape}")
print(f"Filtered data_labels shape: {filtered_data_labels.shape}")

print(f"Data Seq Min: {np.min(filtered_data_sequences[:,:,price_columns_indices])}")
print(f"Data Seq Max: {np.max(filtered_data_sequences[:,:,price_columns_indices])}")

Abnormal Sequence Count: 1335
Indices of abnormal sequences: [1495, 1496, 1497, 1498, 1499, 1500, 1501, 1523, 1524, 1525, 1526, 1527, 1528, 1552, 1553, 1554, 1555, 2113, 2202, 2203, 2204, 2205, 2206, 2207, 2208, 2209, 2210, 2211, 2212, 2213, 2214, 2215, 2216, 2217, 2218, 2610, 2611, 2612, 2613, 2614, 3269, 3407, 3408, 3409, 3410, 3411, 3412, 3413, 3414, 3415, 3416, 3417, 3418, 3419, 3420, 3421, 3422, 3423, 3424, 3425, 3687, 4769, 4770, 4771, 4772, 5940, 5941, 5942, 5943, 5944, 6386, 6387, 6390, 6496, 6497, 6498, 6499, 6500, 6501, 6502, 6503, 6767, 6768, 7924, 7925, 7926, 7927, 8028, 8283, 8284, 8285, 8286, 8287, 8464, 8465, 8466, 8467, 8468, 8469, 8470, 8471, 8646, 8647, 8648, 8649, 8650, 8651, 8652, 8653, 9262, 9263, 9264, 9265, 9369, 9370, 9371, 9372, 9373, 9374, 9375, 9376, 9377, 9885, 9886, 9887, 9888, 9889, 9890, 9891, 9892, 9893, 9894, 9895, 9896, 9897, 9898, 9899, 9900, 9901, 9902, 9903, 9904, 9905, 9906, 9907, 11473, 11474, 11475, 11476, 11477, 11478, 11479, 11480, 11481, 11482

### Normalization of Training Data

In [8]:
# Normalize the price-related features together
price_scaler = MinMaxScaler()

# Reshape the price-related features to fit the scaler
original_shape = filtered_data_sequences[:, :, price_columns_indices].shape
reshaped_data = filtered_data_sequences[:, :, price_columns_indices].reshape(-1, len(price_columns_indices))

# Fit and transform the price-related features
normalized_price_data = price_scaler.fit_transform(reshaped_data)

# Reshape back to the original shape
normalized_price_data = normalized_price_data.reshape(original_shape)

# Replace the original price-related features with the normalized ones
filtered_data_sequences[:, :, price_columns_indices] = normalized_price_data

# Normalize the remaining features individually
num_sequences, num_timesteps, num_features = filtered_data_sequences.shape
for feature_index in range(num_features):
    if feature_index not in price_columns_indices:
        # Initialize a new scaler for each feature
        feature_scaler = MinMaxScaler()
        
        # Extract the feature data
        feature_data = filtered_data_sequences[:, :, feature_index].reshape(-1, 1)
        
        # Fit and transform the scaler
        normalized_feature_data = feature_scaler.fit_transform(feature_data)
        
        # Reshape back to the original shape
        normalized_feature_data = normalized_feature_data.reshape(num_sequences, num_timesteps)
        
        # Replace the original feature with the normalized one
        filtered_data_sequences[:, :, feature_index] = normalized_feature_data

# Print normalized data sequences to check
print(f"Normalized Data Seq Min: {np.min(filtered_data_sequences)}")
print(f"Normalized Data Seq Max: {np.max(filtered_data_sequences)}")

# Scale the labels
label_scaler = MinMaxScaler(feature_range=(0,1))

# Reshape the labels to 2D array
filtered_data_labels = filtered_data_labels.reshape(-1, 1)

# Fit and transform the labels
filtered_data_labels = label_scaler.fit_transform(filtered_data_labels)

# Reshape the labels back to their original shape if needed
filtered_data_labels = filtered_data_labels.reshape(-1)

print(f"Normalized Data Labels Min: {np.min(filtered_data_labels)}")
print(f"Normalized Data Labels Max: {np.max(filtered_data_labels)}")

Normalized Data Seq Min: 0.0
Normalized Data Seq Max: 1.0
Normalized Data Labels Min: 0.0
Normalized Data Labels Max: 1.0000000000000002


## Model

In [9]:
# Check if TensorFlow is using GPU
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Hyperparameters
test_size = 0.2
val_size = 0.2
lstm_units = 50
dropout_rate = 0.2
epochs = 100
batch_size = 32
patience = 10

# Assume filtered_data_sequences and filtered_data_labels are already normalized
print("Shape of filtered_data_sequences:", filtered_data_sequences.shape)
print("Shape of filtered_data_labels:", filtered_data_labels.shape)

# Split data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(filtered_data_sequences, filtered_data_labels, test_size=test_size + val_size, random_state=42)
val_ratio = val_size / (test_size + val_size)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=val_ratio, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_val:", y_val.shape)
print("Shape of y_test:", y_test.shape)

# Build the LSTM model
model = Sequential()
model.add(Input(shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(lstm_units, activation='relu', return_sequences=True))
model.add(Dropout(dropout_rate))
model.add(LSTM(lstm_units, activation='relu'))
model.add(Dropout(dropout_rate))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Print model summary
model.summary()

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val), callbacks=[early_stopping])

# Evaluate the model on the test set
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'Test MSE: {mse}')
print(f'Test MAE: {mae}')

# Plot training and validation loss
plt.figure(figsize=(12, 6))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Plot true vs predicted values
plt.figure(figsize=(12, 6))
plt.plot(y_test, label='True Values')
plt.plot(y_pred, label='Predicted Values')
plt.title('True vs Predicted Values')
plt.xlabel('Samples')
plt.ylabel('Value')
plt.legend()
plt.show()


Shape of filtered_data_sequences: (124798, 252, 15)
Shape of filtered_data_labels: (124798,)
Shape of X_train: (74878, 252, 15)
Shape of X_val: (24960, 252, 15)
Shape of X_test: (24960, 252, 15)
Shape of y_train: (74878,)
Shape of y_val: (24960,)
Shape of y_test: (24960,)


: 