# Sequence Model Research

This notebook trains and evaluates different sequence models on the generated training data.

Training data is generated based on financial time series data labeled with potential profits using a buy-sell system.

The goal is to create a sequence model that can pick favourable stock charts as well as, or better than, a human using traditional technical analysis.

## Import Libraries and Data

In [None]:
import os
import numpy as np

# ---- Configuration ---------------------------------------------------------
# Data directory. This is a relative path, so it is resolved against the
# current working directory of the kernel running the notebook.
data_dir = 'data'

# Input files produced by the training-data generation step
sequences_path = os.path.join(data_dir, 'sequences.npy')
labels_path = os.path.join(data_dir, 'labels.npy')
metadata_path = os.path.join(data_dir, 'metadata.npy')

# Maximum number of examples to sample. If the files hold fewer examples,
# all of them are used.
num_examples = 115000

# Only the most recent `window_length` timesteps of each sequence are kept
window_length = 84

# Define relevant columns and indices for normalization
# (presumably listed in the same order as the last axis of the sequence array)
relevant_columns = [
    'Open', 'High', 'Low', 'Close', 'Volume', 'Turnover', 'Consol_Detected',
    'Consol_Len_Bars', 'Consol_Depth_Percent', 'Close_21_bar_ema',
    'Close_50_bar_sma', 'Close_150_bar_sma', 'Close_200_bar_sma',
    'RSL', 'RSL_NH'
]

price_columns_indices = [0, 1, 2, 3, 9, 10, 11, 12]  # Indices of price-related columns in the sequence data

# ---- Load ------------------------------------------------------------------
# The error is reported and then re-raised: nothing below can run without the
# data, and continuing would either fail later with an unrelated NameError or
# silently reuse stale variables from a previous notebook run.
try:
    data_sequences = np.load(sequences_path)
    data_labels = np.load(labels_path)
    data_metadata = np.load(metadata_path)
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    raise
except ValueError as e:
    print(f"Value error: {e}")
    raise

# ---- Random subset ---------------------------------------------------------
# NOTE: the permutation is unseeded, so a different subset is drawn each run.
indices = np.random.permutation(len(data_sequences))

# Select the first `num_examples` indices (fewer if the data set is smaller)
selected_indices = indices[:num_examples]

# Apply the same selection to all three arrays so they stay aligned, and keep
# only the last `window_length` timesteps of each sequence.
data_sequences = data_sequences[selected_indices, -window_length:, :]
data_labels = data_labels[selected_indices]
data_metadata = data_metadata[selected_indices]

# Inspect the shape and size of the data after subsetting and slicing
print(f'Loaded sequences shape: {data_sequences.shape}')
print(f'Loaded sequences size: {data_sequences.size}')
print(f'Loaded labels shape: {data_labels.shape}')
print(f'Loaded metadata shape: {data_metadata.shape}')

# Expected element count for the subset actually kept. A mismatch with the
# loaded size above means the stored sequences are shorter than
# `window_length` or have a different number of features.
expected_total_size = len(selected_indices) * window_length * len(relevant_columns)
print(f'Expected total size: {expected_total_size}')

# Map indices to column names
price_columns = [relevant_columns[i] for i in price_columns_indices]

print("\nPrice-related columns:")
for index, column in zip(price_columns_indices, price_columns):
    print(f"Index: {index}, Column: {column}")


## Data Preprocessing

### NaN Removal

In [None]:
# Moving averages cannot be computed until enough bars are available, which
# leaves NaN inputs in the generated data. Count the NaNs in each array and
# overwrite them with 0.

# Arrays to check, keyed by the name used in the printed report
data_dict = {
    'data_sequences': data_sequences,
    'data_labels': data_labels,
}

for var_name in data_dict:
    array = data_dict[var_name]

    nan_count = np.isnan(array).sum()
    print(f"NaNs in {var_name}: {nan_count}")

    # Nothing to clean for this array
    if nan_count == 0:
        continue

    # Write through a slice so the original array object is modified in place
    # and the module-level variable sees the cleaned values.
    # np.nan_to_num also replaces +/-inf with very large finite numbers.
    array[:] = np.nan_to_num(array)

    nan_count = np.isnan(array).sum()
    print(f"NaNs remaining in {var_name} after removal: {nan_count}")

# Range of the price-related columns after cleaning
price_values = data_sequences[:, :, price_columns_indices]
print(f"Data Seq Min: {price_values.min()}")
print(f"Data Seq Max: {price_values.max()}")


### Corrupted sequence removal

About 99% of the stocks I buy are priced below 1000. A few are priced above 1000, and although they are rare, they are important.

I also noticed quite a few training examples have weird price data, which I filter out below.

I noticed that with any threshold above 3e3, the maximum remaining price is equal to the threshold itself, which is very suspect.

The number of training examples lost is insignificant, and in return the data normalizes better and no longer contains corrupted sequences.

In [None]:
# Set the threshold for abnormal values based on domain knowledge:
# any price-related value above this marks the whole sequence as corrupted.
threshold = 3.0e3

# Flag every sequence that has at least one price-related value above the
# threshold. Checking one column at a time works on views of the array, so no
# large temporary copy of the price columns is needed.
is_abnormal = np.zeros(data_sequences.shape[0], dtype=bool)
for column_index in price_columns_indices:
    is_abnormal |= (data_sequences[:, :, column_index] > threshold).any(axis=1)

# Indices of the abnormal sequences (as a plain list for readable printing)
abnormal_sequences = np.flatnonzero(is_abnormal).tolist()

print(f"Abnormal Sequence Count: {len(abnormal_sequences)}")
print(f"Indices of abnormal sequences: {abnormal_sequences}")

# Mask that is True for the sequences to keep
mask = ~is_abnormal

# Filter all three arrays with the same mask so they stay aligned
filtered_data_sequences = data_sequences[mask]
filtered_data_labels = data_labels[mask]
filtered_data_metadata = data_metadata[mask]

# The following cells read `data_sequences` / `data_labels`, so rebind those
# names to the filtered arrays; otherwise the filter would never take effect.
data_sequences = filtered_data_sequences
data_labels = filtered_data_labels
data_metadata = filtered_data_metadata

# Print the shape of the filtered data
print(f"Filtered data_sequences shape: {filtered_data_sequences.shape}")
print(f"Filtered data_labels shape: {filtered_data_labels.shape}")

print(f"Data Seq Min: {np.min(filtered_data_sequences[:,:,price_columns_indices])}")
print(f"Data Seq Max: {np.max(filtered_data_sequences[:,:,price_columns_indices])}")


### Normalization of Training Data

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# NOTE: this cell modifies `data_sequences` and rebinds `data_labels`, so it
# must be run exactly once after the data has been loaded and cleaned.

# Column groups, as indices into the last axis of `data_sequences`.
# NOTE: this rebinds `price_columns_indices`, which the loading cell defined
# as a wider list that also contained the moving-average columns.
price_columns_indices = [0, 1, 2, 3]    # Open, High, Low, Close
ma_columns_indices = [9, 10, 11, 12]    # 21-bar EMA, 50/150/200-bar SMA
close_column_index = 3

# Extract data shapes
num_sequences, num_timesteps, num_features = data_sequences.shape

# Small constant that keeps log() finite for zero prices
epsilon = 1e-8

# --- Moving averages -> fractional distance of Close from each average -----
# This has to use the RAW Close, so it is computed before the log transform
# below overwrites the price columns.
close_price_data = data_sequences[:, :, [close_column_index]]  # (N, T, 1) copy
ma_data = data_sequences[:, :, ma_columns_indices]             # (N, T, 4) copy

# A moving average of exactly 0 is a missing value (NaNs were replaced with 0
# earlier because there was not enough history). Leave those cells at 0
# instead of dividing by ~0, which would produce huge artificial outliers.
has_ma = ma_data != 0
percentage_away_from_close = np.zeros_like(ma_data)
np.divide(close_price_data - ma_data, ma_data,
          out=percentage_away_from_close, where=has_ma)

# --- Prices -> log prices ----------------------------------------------------
price_data = data_sequences[:, :, price_columns_indices]
log_price_data = np.log(price_data + epsilon)

# Replace the original price-related features with the log values
data_sequences[:, :, price_columns_indices] = log_price_data

# Replace the original moving average features with the percentage away values
data_sequences[:, :, ma_columns_indices] = percentage_away_from_close

# Replace any NaN / infinite values produced above (e.g. log of a negative
# price) with zero
data_sequences = np.nan_to_num(data_sequences, nan=0.0, posinf=0.0, neginf=0.0)

# Clip extreme moving-average distances to avoid large outliers. Only these
# columns are clipped: they are not min-max scaled below, and clipping the
# whole array would flatten large-magnitude features such as Volume and
# Turnover to the clip bound.
data_sequences[:, :, ma_columns_indices] = np.clip(
    data_sequences[:, :, ma_columns_indices], -1e3, 1e3
)

# --- Min-max scaling ---------------------------------------------------------
# One scaler handles all price columns, but MinMaxScaler still scales each
# column with its own min and max.
price_scaler = MinMaxScaler(feature_range=(-1, 1))

# Flatten (sequence, timestep) into rows so the scaler sees one row per bar
original_shape = data_sequences[:, :, price_columns_indices].shape
reshaped_data = data_sequences[:, :, price_columns_indices].reshape(-1, len(price_columns_indices))

# Fit and transform the price-related features
normalized_price_data = price_scaler.fit_transform(reshaped_data)

# Reshape back to the original shape
normalized_price_data = normalized_price_data.reshape(original_shape)

# Replace the original price-related features with the normalized ones
data_sequences[:, :, price_columns_indices] = normalized_price_data

# Normalize the remaining features individually to [0, 1]. The price columns
# were scaled above and the moving-average columns are already relative
# values, so both groups are skipped.
already_handled = set(price_columns_indices) | set(ma_columns_indices)
for feature_index in range(num_features):
    if feature_index in already_handled:
        continue

    # Initialize a new scaler for each feature
    feature_scaler = MinMaxScaler()

    # Extract the feature data as a single column
    feature_data = data_sequences[:, :, feature_index].reshape(-1, 1)

    # Fit and transform the scaler
    normalized_feature_data = feature_scaler.fit_transform(feature_data)

    # Reshape back and replace the original feature with the normalized one
    data_sequences[:, :, feature_index] = normalized_feature_data.reshape(num_sequences, num_timesteps)

# Print normalized data sequences to check
print(f"Normalized Data Seq Min: {np.min(data_sequences)}")
print(f"Normalized Data Seq Max: {np.max(data_sequences)}")

# --- Labels ------------------------------------------------------------------
# Make the labels a binary decision, rather than a profit
min_profit = 0.1  # implies a good decision is a breakout that produces more than min_profit (*100 for percent, 0.2 = 20%)

data_labels = (data_labels > min_profit).astype(int)

print(f"Data Labels for > {min_profit*100}% Min: {np.min(data_labels)}")
print(f"Data Labels for > {min_profit*100}% Max: {np.max(data_labels)}")

# Count how many labels are 1 and how many are 0
num_ones = np.sum(data_labels)
num_zeros = len(data_labels) - num_ones

print(f"Number of labels that are 1: {num_ones}")
print(f"Number of labels that are 0: {num_zeros}")
print(f"Probability of randomly selecting a stock making {min_profit*100}% is {num_ones/(num_ones+num_zeros)*100}%")


## Model -> Hyperparameter Tuning