# Sequence Model Research

The scope of this notebook is to assess and train different sequence models given the training data generated.

Training data is generated based on financial time series data labeled with potential profits using a buy-sell system.

The goal is to create a sequence model that can choose favourable stock charts equal to or better than a human can via traditional technical analysis.

## Import Libraries and Data

In [1]:
import os
import numpy as np

# Define the data directory relative to the script location
data_dir = 'data'

# Define the file paths
sequences_path = os.path.join(data_dir, 'sequences.npy')
labels_path = os.path.join(data_dir, 'labels.npy')
metadata_path = os.path.join(data_dir, 'metadata.npy')

# Load the data
try:
    data_sequences = np.load(sequences_path)
    data_labels = np.load(labels_path)
    data_metadata = np.load(metadata_path)

    # Number of examples to select
    num_examples = 115000

    # Generate a random permutation of indices
    indices = np.random.permutation(len(data_sequences))

    # Select the first `num_examples` indices
    selected_indices = indices[:num_examples]

    # Use the selected indices to create the random subset
    data_sequences = data_sequences[selected_indices, :, :]
    data_labels = data_labels[selected_indices]
    data_metadata = data_metadata[selected_indices]

    # Inspect the shape and size of the loaded data before slicing
    print(f'Loaded sequences shape: {data_sequences.shape}')
    print(f'Loaded sequences size: {data_sequences.size}')
    print(f'Loaded labels shape: {data_labels.shape}')
    print(f'Loaded metadata shape: {data_metadata.shape}')

except FileNotFoundError as e:
    print(f"Error loading files: {e}")
except ValueError as e:
    print(f"Value error: {e}")


Loaded sequences shape: (115000, 63, 12)
Loaded sequences size: 86940000
Loaded labels shape: (115000,)
Loaded metadata shape: (115000, 2)


## Data Preprocessing

### NaN anf INF Removal

In [2]:
import numpy as np

# Dictionary to map variable names to their corresponding data arrays
data_dict = {
    'data_sequences': data_sequences,
    'data_labels': data_labels,
}

# Using a dictionary to iterate over variables
for var_name, data in data_dict.items():
    num_nans = np.sum(np.isnan(data))
    num_infs = np.sum(np.isinf(data))
    print(f"NaNs in {var_name}: {num_nans}")
    print(f"Infs in {var_name}: {num_infs}")

    # Remove NaNs and Infs
    if num_nans > 0 or num_infs > 0:
        data_dict[var_name][:] = np.nan_to_num(data, nan=0.0, posinf=0.0, neginf=0.0)
        num_nans_after = np.sum(np.isnan(data))
        num_infs_after = np.sum(np.isinf(data))
        print(f"NaNs remaining in {var_name} after removal: {num_nans_after}")
        print(f"Infs remaining in {var_name} after removal: {num_infs_after}")

print("NaN and Inf removal completed.")

NaNs in data_sequences: 905259
Infs in data_sequences: 37
NaNs remaining in data_sequences after removal: 0
Infs remaining in data_sequences after removal: 0
NaNs in data_labels: 0
Infs in data_labels: 0
NaN and Inf removal completed.


### Corrupted sequence removal

99% of stocks I buy will be below 1000, with a few above 1000, although they are important.

I also noticed quite a few training examples have weird price data, which I filter out below.

I noticed with thresholds above 3e3, the max is the threshold, which is very suspect.

The loss of training examples is insignificant, and the result is better normalization of the data and obviously no corrupted sequences.

#### Feature Stats

In [3]:
import numpy as np
import pandas as pd

def print_feature_stats(data_sequences, feature_names):
    print("Feature Statistics:")
    print("-" * 50)

    for i, feature_name in enumerate(feature_names):
        feature_data = data_sequences[:, :, i].flatten()
        
        stats = {
            "Mean": np.mean(feature_data),
            "Median": np.median(feature_data),
            "Std Dev": np.std(feature_data),
            "Min": np.min(feature_data),
            "Max": np.max(feature_data),
            "25th Percentile": np.percentile(feature_data, 25),
            "75th Percentile": np.percentile(feature_data, 75),
            "Skewness": pd.Series(feature_data).skew(),
            "Kurtosis": pd.Series(feature_data).kurtosis(),
            "Zero Count": np.sum(feature_data == 0),
            "Zero Percentage": np.mean(feature_data == 0) * 100
        }
        
        print(f"Feature: {feature_name}")
        for stat_name, stat_value in stats.items():
            print(f"  {stat_name}: {stat_value:.4f}")
        print("-" * 50)

# List of feature names
feature_names = [
    'Consol_Len_Bars', 'Consol_Depth_Percent',
    'Distance_to_21EMA', 'Distance_to_50SMA', 'Distance_to_200SMA', 
    'RSL_NH_Count', 'RSL_Slope', 'Up_Down_Days', 
    'Stage 2', 'UpDownVolumeRatio', 'ATR', '%B'
]

# Call the function to print statistics
print_feature_stats(data_sequences, feature_names)

# Additional overall statistics
print("Overall Dataset Statistics:")
print(f"Total number of sequences: {data_sequences.shape[0]}")
print(f"Sequence length: {data_sequences.shape[1]}")
print(f"Number of features: {data_sequences.shape[2]}")
print(f"Total number of data points: {data_sequences.size}")
print(f"Memory usage: {data_sequences.nbytes / (1024 * 1024):.2f} MB")

Feature Statistics:
--------------------------------------------------
Feature: Consol_Len_Bars
  Mean: 69.4247
  Median: 27.0000
  Std Dev: 121.0637
  Min: 0.0000
  Max: 3142.0000
  25th Percentile: 6.0000
  75th Percentile: 77.0000
  Skewness: 4.4888
  Kurtosis: 38.1585
  Zero Count: 1417116.0000
  Zero Percentage: 19.5599
--------------------------------------------------
Feature: Consol_Depth_Percent
  Mean: 16.4847
  Median: 15.6965
  Std Dev: 11.7504
  Min: 0.0000
  Max: 54.4286
  25th Percentile: 7.3892
  75th Percentile: 25.9770
  Skewness: 0.2046
  Kurtosis: -0.8865
  Zero Count: 1341459.0000
  Zero Percentage: 18.5157
--------------------------------------------------
Feature: Distance_to_21EMA
  Mean: 0.8472
  Median: 0.8899
  Std Dev: 5.5330
  Min: -99.9886
  Max: 712.1856
  25th Percentile: -1.2164
  75th Percentile: 3.0212
  Skewness: 1.0784
  Kurtosis: 96.7708
  Zero Count: 17893.0000
  Zero Percentage: 0.2470
--------------------------------------------------
Feature: D

### Normalization of Training Data

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Define the indices based on the provided feature names
feature_names = [
    'Consol_Len_Bars', 'Consol_Depth_Percent',
    'Distance_to_21EMA', 'Distance_to_50SMA', 'Distance_to_200SMA', 
    'RSL_NH_Count', 'RSL_Slope', 'Up_Down_Days', 
    'Stage 2', 'UpDownVolumeRatio', 'ATR', '%B'
]

feature_indices = {name: idx for idx, name in enumerate(feature_names)}

# Function to remove outliers and cap values
def preprocess_data(sequences, labels):
    # Reshape sequences to 2D array for easier processing (flatten the timesteps)
    num_sequences, num_timesteps, num_features = sequences.shape
    sequences_reshaped = sequences.reshape(-1, num_features)
    
    # Create a mask to filter out invalid sequences
    valid_mask = (
        (sequences_reshaped[:, feature_indices['Distance_to_21EMA']] <= 100) &
        (sequences_reshaped[:, feature_indices['Distance_to_50SMA']] <= 200) &
        (sequences_reshaped[:, feature_indices['Distance_to_200SMA']] <= 500)
    )
    
    # Reshape the valid_mask to match the original sequence shape
    valid_mask_reshaped = valid_mask.reshape(num_sequences, num_timesteps)
    
    # Filter out sequences with any invalid timesteps
    valid_sequences_mask = valid_mask_reshaped.all(axis=1)
    filtered_sequences = sequences[valid_sequences_mask]
    filtered_labels = labels[valid_sequences_mask]
    
    # Cap 'UpDownVolumeRatio' at 10
    filtered_sequences[:, :, feature_indices['UpDownVolumeRatio']] = np.minimum(
        filtered_sequences[:, :, feature_indices['UpDownVolumeRatio']], 10
    )
    
    # Normalize the features using Z-score normalization
    scaler = StandardScaler()
    
    # Flatten the sequence again for normalization
    filtered_sequences_reshaped = filtered_sequences.reshape(-1, num_features)
    
    # Normalize
    normalized_data_reshaped = scaler.fit_transform(filtered_sequences_reshaped)
    
    # Reshape back to the original 3D shape
    normalized_data = normalized_data_reshaped.reshape(filtered_sequences.shape)
    
    return normalized_data, filtered_labels

# Function to print feature statistics
def print_feature_stats(data_sequences, feature_names):
    print("Feature Statistics:")
    print("-" * 50)

    for i, feature_name in enumerate(feature_names):
        feature_data = data_sequences[:, :, i].flatten()
        
        stats = {
            "Mean": np.mean(feature_data),
            "Median": np.median(feature_data),
            "Std Dev": np.std(feature_data),
            "Min": np.min(feature_data),
            "Max": np.max(feature_data),
            "25th Percentile": np.percentile(feature_data, 25),
            "75th Percentile": np.percentile(feature_data, 75),
            "Skewness": pd.Series(feature_data).skew(),
            "Kurtosis": pd.Series(feature_data).kurtosis(),
            "Zero Count": np.sum(feature_data == 0),
            "Zero Percentage": np.mean(feature_data == 0) * 100
        }
        
        print(f"Feature: {feature_name}")
        for stat_name, stat_value in stats.items():
            print(f"  {stat_name}: {stat_value:.4f}")
        print("-" * 50)

# Example usage with data_sequences and data_labels
# Assuming data_sequences is loaded and has shape (115000, 63, 12)

# Process the data
normalized_data, processed_labels = preprocess_data(data_sequences, data_labels)




#### Stats again

In [5]:
# Print statistics
print_feature_stats(normalized_data, feature_names)

# Additional overall statistics
print("Overall Dataset Statistics:")
print(f"Total number of sequences: {data_sequences.shape[0]}")
print(f"Sequence length: {data_sequences.shape[1]}")
print(f"Number of features: {data_sequences.shape[2]}")
print(f"Total number of data points: {data_sequences.size}")
print(f"Memory usage: {data_sequences.nbytes / (1024 * 1024):.2f} MB")

Feature Statistics:
--------------------------------------------------
Feature: Consol_Len_Bars
  Mean: 0.0000
  Median: -0.3507
  Std Dev: 1.0000
  Min: -0.5737
  Max: 25.3726
  25th Percentile: -0.5241
  75th Percentile: 0.0622
  Skewness: 4.4875
  Kurtosis: 38.1379
  Zero Count: 0.0000
  Zero Percentage: 0.0000
--------------------------------------------------
Feature: Consol_Depth_Percent
  Mean: 0.0000
  Median: -0.0672
  Std Dev: 1.0000
  Min: -1.4035
  Max: 3.2297
  25th Percentile: -0.7736
  75th Percentile: 0.8075
  Skewness: 0.2047
  Kurtosis: -0.8855
  Zero Count: 0.0000
  Zero Percentage: 0.0000
--------------------------------------------------
Feature: Distance_to_21EMA
  Mean: -0.0000
  Median: 0.0097
  Std Dev: 1.0000
  Min: -18.5580
  Max: 18.0991
  25th Percentile: -0.3778
  75th Percentile: 0.4016
  Skewness: -0.2241
  Kurtosis: 14.0967
  Zero Count: 0.0000
  Zero Percentage: 0.0000
--------------------------------------------------
Feature: Distance_to_50SMA
  Mean

#### Label Engineering

In [6]:
# Function to print statistics of the labels
def print_label_stats(labels, label_name="Labels"):
    stats = {
        "Mean": np.mean(labels),
        "Median": np.median(labels),
        "Std Dev": np.std(labels),
        "Min": np.min(labels),
        "Max": np.max(labels),
        "25th Percentile": np.percentile(labels, 25),
        "75th Percentile": np.percentile(labels, 75),
        "Zero Count": np.sum(labels == 0),
        "Zero Percentage": np.mean(labels == 0) * 100
    }
    
    print(f"Statistics for {label_name}:")
    print("-" * 50)
    for stat_name, stat_value in stats.items():
        print(f"  {stat_name}: {stat_value:.4f}")
    print("-" * 50)

# Assuming `normalized_data` and `processed_labels` are already available from the preprocessing step

# Print statistics for the original labels
print_label_stats(processed_labels, label_name="Original Labels")

# Convert processed_labels to binary labels: profit (1) or not profit (0)
binary_labels = (processed_labels > 0).astype(int)

# Print statistics for the binary labels
print_label_stats(binary_labels, label_name="Binary Labels")

Statistics for Original Labels:
--------------------------------------------------
  Mean: 0.0284
  Median: -0.0209
  Std Dev: 0.2813
  Min: -0.9115
  Max: 15.0370
  25th Percentile: -0.0730
  75th Percentile: 0.0591
  Zero Count: 106.0000
  Zero Percentage: 0.0922
--------------------------------------------------
Statistics for Binary Labels:
--------------------------------------------------
  Mean: 0.4094
  Median: 0.0000
  Std Dev: 0.4917
  Min: 0.0000
  Max: 1.0000
  25th Percentile: 0.0000
  75th Percentile: 1.0000
  Zero Count: 67864.0000
  Zero Percentage: 59.0563
--------------------------------------------------


## Model Exploration

#### Simple RNN

In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.utils import class_weight

# Compute class weights
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(binary_labels), y=binary_labels)
class_weights_dict = {i : class_weights[i] for i in range(len(class_weights))}
print("Class weights:", class_weights_dict)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(normalized_data, binary_labels, test_size=0.2, random_state=42)

# Print the shapes of the variables
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_val: {X_val.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_val: {y_val.shape}")

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(normalized_data.shape[1], normalized_data.shape[2])),
    tf.keras.layers.LSTM(64, return_sequences=False),  # LSTM layer with 64 units
    tf.keras.layers.Dense(32, activation='relu'),     # Dense layer with 32 units
    tf.keras.layers.Dense(1, activation='sigmoid')    # Output layer for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with class weights
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val), class_weight=class_weights_dict)

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {loss}")
print(f"Validation Accuracy: {accuracy}")

# Plot training & validation accuracy values
plt.figure(figsize=(14, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

# Plot training & validation loss values
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.show()

2024-06-27 13:14:03.730246: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-27 13:14:03.756993: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Class weights: {0: 0.8466491807143699, 1: 1.221190223166844}
Shape of X_train: (91931, 63, 12)
Shape of X_val: (22983, 63, 12)
Shape of y_train: (91931,)
Shape of y_val: (22983,)


2024-06-27 13:14:05.177597: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-06-27 13:14:05.410425: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/20
[1m2873/2873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 14ms/step - accuracy: 0.5269 - loss: 0.6908 - val_accuracy: 0.5394 - val_loss: 0.6872
Epoch 2/20
[1m2873/2873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 14ms/step - accuracy: 0.5422 - loss: 0.6879 - val_accuracy: 0.5674 - val_loss: 0.6814
Epoch 3/20
[1m 581/2873[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m29s[0m 13ms/step - accuracy: 0.5520 - loss: 0.6856

#### Add drop out and and early stopping

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.utils import class_weight

# Compute class weights
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(binary_labels), y=binary_labels)
class_weights_dict = {i : class_weights[i] for i in range(len(class_weights))}
print("Class weights:", class_weights_dict)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(normalized_data, binary_labels, test_size=0.2, random_state=42)

# Define the model with Dropout and Early Stopping
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(normalized_data.shape[1], normalized_data.shape[2])),
    tf.keras.layers.LSTM(64, return_sequences=False),
    tf.keras.layers.Dropout(0.5),  # Add Dropout with 50% rate
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define EarlyStopping callback
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with class weights and early stopping
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val), class_weight=class_weights_dict, callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {loss}")
print(f"Validation Accuracy: {accuracy}")

# Plot training & validation accuracy values
plt.figure(figsize=(14, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

# Plot training & validation loss values
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.show()


#### Custom Loss

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.utils import class_weight

def custom_loss(y_true, y_pred):
    # Binary Cross-Entropy Loss
    bce = tf.keras.losses.binary_crossentropy(y_true, y_pred)

    # Calculate the number of trades (predicted positive class)
    num_trades = tf.reduce_sum(y_pred)

    # Penalize for not taking trades
    trade_penalty = tf.maximum(0.0, 1.0 - num_trades / tf.cast(tf.shape(y_true)[0], tf.float32))

    # Combine the losses
    loss = bce + trade_penalty
    return loss


# Assuming binary_labels and normalized_data are already defined
# Compute class weights
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(binary_labels), y=binary_labels)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}
print("Class weights:", class_weights_dict)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(normalized_data, binary_labels, test_size=0.2, random_state=42)

# Define the model with Dropout and Early Stopping
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(normalized_data.shape[1], normalized_data.shape[2])),
    tf.keras.layers.LSTM(64, return_sequences=False),
    tf.keras.layers.Dropout(0.5),  # Add Dropout with 50% rate
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model with the custom loss function
model.compile(optimizer='adam', loss=custom_loss, metrics=['accuracy'])

# Define EarlyStopping callback
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with class weights and early stopping
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val), class_weight=class_weights_dict, callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {loss}")
print(f"Validation Accuracy: {accuracy}")

# Plot training & validation accuracy values
plt.figure(figsize=(14, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

# Plot training & validation loss values
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.show()