**Running this notebook takes several hours and a large amount of RAM. <br> As a result, file references and function calls are commented out.
<br> <br>
The code is provided for clarity and understanding.** <br>

# Import Libraries

In [None]:
# Install and Import Libraries
%%capture
!pip install pmdarima

import cv2
import os
import pprint
import pickle
import random
import numpy as np
import pmdarima as pm
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from google.colab.patches import cv2_imshow

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense, BatchNormalization
from keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import load_model


from keras.utils import to_categorical
from tensorflow.keras.losses import categorical_crossentropy

In [None]:
# Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Data (Constituents Dataset)

This code segment loads the time series and image encodings generated in the previous notebook **Thesis_Images**. <br> In the thesis paper, these datasets are referred to as the **Constituents Dataset**.

In [None]:
# Raw Time Series
# with open('/content/drive/MyDrive/20230425_raw_time_series_arrays.pkl', 'rb') as f:
#    raw_time_series_arrays = pickle.load(f)

# Candlestick images
# with open('/content/drive/MyDrive/20230425_candlestick_arrays.pkl', 'rb') as f:
#    candlestick_arrays = pickle.load(f)

# MTF images
# with open('/content/drive/MyDrive/20230425_mtf_arrays.pkl', 'rb') as f:
#    mtf_arrays = pickle.load(f)

# GAF images
# with open('/content/drive/MyDrive/20230425_gaf_arrays.pkl', 'rb') as f:
#    gaf_arrays = pickle.load(f)

# Labels
# with open('/content/drive/MyDrive/20230425_labels.pkl', 'rb') as f:
#    labels = pickle.load(f)

# Years
# with open('/content/drive/MyDrive/20230425_max_years.pkl', 'rb') as f:
#    max_years = pickle.load(f)

# Split Datasets (Train, Validation, Test)

This code segment splits the data into Train, Validation and Test Sets as defined in the thesis paper.

In [None]:
# The six lists are parallel (one entry per sample), so they must all have the same length.
assert (
    len(raw_time_series_arrays)
    == len(candlestick_arrays)
    == len(mtf_arrays)
    == len(gaf_arrays)
    == len(labels)
    == len(max_years)
)

# Rolling evaluation windows: an inclusive (first, last) range of training years,
# followed by a single validation year and a single test year.
periods = [
    {'train': (2013, 2015), 'validation': 2016, 'test': 2017},
    {'train': (2014, 2016), 'validation': 2017, 'test': 2018},
    {'train': (2015, 2017), 'validation': 2018, 'test': 2019},
    {'train': (2016, 2018), 'validation': 2019, 'test': 2020},
    {'train': (2017, 2019), 'validation': 2020, 'test': 2021},
    {'train': (2018, 2020), 'validation': 2021, 'test': 2022},
]

# Parallel lists: data_list_names[k] is the dictionary key under which data_lists[k] is stored.
data_lists = [
    raw_time_series_arrays,
    candlestick_arrays,
    mtf_arrays,
    gaf_arrays,
    labels,
    max_years
]

data_list_names = [
    'raw_time_series',
    'candlestick',
    'mtf',
    'gaf',
    'labels',
    'max_years'
]

# data_splits[period_index][split_name][data_name] -> list of the samples whose
# entry in max_years falls into that split of that period.
data_splits = {}

for i, period in enumerate(periods):
    first_train_year, last_train_year = period['train']

    # Years belonging to each split of the current period (training range is inclusive).
    years_by_split = {
        'train': set(range(first_train_year, last_train_year + 1)),
        'validation': {period['validation']},
        'test': {period['test']},
    }

    data_splits[i] = {}
    for split_name, split_years in years_by_split.items():
        # Positions of the samples assigned to this split, in their original order.
        split_indices = [j for j, year in enumerate(max_years) if year in split_years]

        # Apply the same positions to every parallel list so the samples stay aligned.
        data_splits[i][split_name] = {
            name: [data_list[j] for j in split_indices]
            for name, data_list in zip(data_list_names, data_lists)
        }

# Fit Models and Predict (Constituents Dataset)

The following code segments implement the models for prediction as defined in the thesis paper. <br> Subsequently, they are used to predict on the **Constituents Dataset**.

In [None]:
# Reshape Data for CNN input
# Target shape of a single sample for every encoding that is fed to a CNN.
# Keys that are absent here ('labels', 'max_years') are carried over unchanged.
CNN_INPUT_SHAPES = {
    'raw_time_series': (20, 1, 1),
    'candlestick': (20, 20, 1),
    'mtf': (20, 20, 1),
    'gaf': (20, 20, 1),
}

# Mirrors the nesting of data_splits: period -> split -> data name -> list of samples.
data_splits_reshaped = {
    period_key: {
        split_key: {
            method_key: (
                [array.reshape(CNN_INPUT_SHAPES[method_key]) for array in method_data]
                if method_key in CNN_INPUT_SHAPES
                else method_data
            )
            for method_key, method_data in split_data.items()
        }
        for split_key, split_data in period_data.items()
    }
    for period_key, period_data in data_splits.items()
}

**Convolutional Neural Network (CNN)**

In [None]:
def _build_cnn(input_shape, kernel_size, pool_size, learning_rate=1e-5):
    """Build and compile the CNN architecture shared by all encodings.

    The network consists of three convolutional stages with 256, 512 and 1024
    filters. Each stage stacks two Conv2D + BatchNormalization pairs, followed
    by max-pooling and 30% dropout. The classifier head is two 8192-unit dense
    layers (each with BatchNormalization and 50% dropout) and a 2-unit softmax.

    Args:
        input_shape: Shape of one input sample, passed to the first Conv2D layer.
        kernel_size: Kernel size used by every Conv2D layer.
        pool_size: Pool size used by every MaxPooling2D layer.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss,
        accuracy metric).
    """
    model = Sequential()

    is_first_layer = True
    for filters in (256, 512, 1024):
        for _ in range(2):
            # Only the very first layer of the network declares the input shape.
            extra_kwargs = {'input_shape': input_shape} if is_first_layer else {}
            is_first_layer = False
            model.add(Conv2D(filters, kernel_size, activation='relu', padding='same', **extra_kwargs))
            model.add(BatchNormalization())
        model.add(MaxPooling2D(pool_size=pool_size))
        model.add(Dropout(0.3))

    model.add(Flatten())
    for _ in range(2):
        model.add(Dense(8192, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    optimizer = Adam(learning_rate=learning_rate) # tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

def create_raw_model():
    """Create the CNN for raw time series samples of shape (20, 1, 1).

    Kernels of size (3, 1) and pools of size (2, 1) only act along the first
    (length-20) axis, because the second axis has size 1.

    Returns:
        The compiled Sequential model.
    """
    return _build_cnn(input_shape=(20, 1, 1), kernel_size=(3, 1), pool_size=(2, 1))

def create_image_model():
    """Create the CNN for image encodings of shape (20, 20, 1).

    Uses (3, 3) kernels and (2, 2) pooling; otherwise the architecture is
    identical to the one returned by create_raw_model.

    Returns:
        The compiled Sequential model.
    """
    return _build_cnn(input_shape=(20, 20, 1), kernel_size=(3, 3), pool_size=(2, 2))

In [None]:
def train_predict_evaluate(X_train, X_validation, X_test, y_train, y_validation, y_test, method,
                           batch_size=32, epochs=10, patience=3, checkpoint_path='best_model.h5'):
    """Train a fresh CNN for one encoding and score it on the test split.

    Args:
        X_train, X_validation, X_test: Model inputs for the three splits.
        y_train, y_validation, y_test: One-hot encoded targets for the three
            splits (y_test is converted back to class indices with argmax).
        method: Name of the encoding. 'raw_time_series' selects
            create_raw_model(); any other value selects create_image_model().
        batch_size: Batch size passed to model.fit.
        epochs: Maximum number of training epochs.
        patience: Number of epochs without a val_loss improvement after which
            training is stopped early.
        checkpoint_path: File the best model (lowest val_loss) is written to.
            The default is shared by every call, so successive calls overwrite
            each other's checkpoint; pass a distinct path to keep them apart.

    Returns:
        Tuple (loss, accuracy, model): test loss reported by model.evaluate,
        test accuracy computed from the arg-max predictions, and the trained
        model.
    """
    if method == 'raw_time_series':
        model = create_raw_model()
    else:
        model = create_image_model()

    # NOTE(review): depending on the Keras version, restore_best_weights may only
    # take effect when early stopping actually triggers - confirm for the
    # version in use if the best (not the last) epoch must be returned.
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
    # Keep a copy on disk of the model with the lowest validation loss seen so far.
    model_checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True)

    model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_validation, y_validation),
        callbacks=[early_stopping, model_checkpoint],
    )

    predictions = model.predict(X_test)
    predicted_labels = np.argmax(predictions, axis=1)
    true_labels = np.argmax(y_test, axis=1)  # Convert the one-hot encoded y_test to label indices

    # evaluate() returns [loss, *metrics]; only the loss is taken from it.
    loss = model.evaluate(X_test, y_test, verbose=0)[0]
    accuracy = accuracy_score(true_labels, predicted_labels)

    print(f"Test loss for {method}: {loss}")
    print(f"Test accuracy for {method}: {accuracy}")
    return loss, accuracy, model

# Encodings for which a CNN is trained; each name is a key of the per-split data dictionaries.
methods = ['raw_time_series', 'candlestick', 'mtf', 'gaf']

def process_period_cnn(period_data):
    """Train and evaluate one CNN per encoding for a single period.

    Args:
        period_data: Dictionary with the keys 'train', 'validation' and
            'test'. Each maps to a dictionary holding one list of samples per
            encoding in `methods` plus the class labels under 'labels'.

    Returns:
        Tuple (period_results, period_models_fitted):
        period_results maps each encoding to {'loss': ..., 'accuracy': ...};
        period_models_fitted maps each encoding to its trained model.
    """
    # The labels do not depend on the encoding, so they are one-hot encoded once.
    # num_classes is pinned to 2 (the size of the models' softmax output):
    # otherwise to_categorical infers the width from the largest label present
    # and a split that happens to contain a single class gets the wrong shape.
    y_train, y_validation, y_test = (
        to_categorical(np.array(period_data[split]['labels']), num_classes=2)
        for split in ('train', 'validation', 'test')
    )

    period_results = {}
    period_models_fitted = {}

    for method in methods:
        X_train = np.array(period_data['train'][method])
        X_validation = np.array(period_data['validation'][method])
        X_test = np.array(period_data['test'][method])

        loss, accuracy, model = train_predict_evaluate(X_train, X_validation, X_test, y_train, y_validation, y_test, method)

        period_results[method] = {'loss': loss, 'accuracy': accuracy}
        period_models_fitted[method] = model

    return period_results, period_models_fitted

# period_results, period_models_fitted = process_period_cnn(data_splits_reshaped[5])

# for name, model in period_models_fitted.items():
    # model.save(f'/content/drive/My Drive/X_{name}_model_period_6.h5')

**Simple Moving Average (SMA)**

In [None]:
def average_calculation(raw_time_series_arrays, labels):
    """Score the simple-moving-average baseline on a list of 20-step windows.

    For every window the mean of all 20 elements is computed; the predicted
    label is 1 when the last element is strictly greater than that mean and
    0 otherwise.

    Args:
        raw_time_series_arrays: Iterable of windows, each of length 20.
        labels: True labels, one per window, in the same order.

    Returns:
        Tuple (average_values, average_labels, accuracy): the per-window
        means, the per-window predicted labels, and the accuracy of those
        predictions against `labels`.

    Raises:
        ValueError: If a window does not contain exactly 20 elements.
    """
    def mean_and_signal(window):
        # Validate first so that a malformed window is reported immediately.
        if len(window) != 20:
            raise ValueError("Each array in the input list must contain exactly 20 elements.")

        mean_value = sum(window) / len(window)
        signal = 1 if window[-1] > mean_value else 0
        return mean_value, signal

    scored_windows = [mean_and_signal(window) for window in raw_time_series_arrays]
    average_values = [mean_value for mean_value, _ in scored_windows]
    average_labels = [signal for _, signal in scored_windows]

    # Share of windows whose predicted label matches the true label.
    accuracy = accuracy_score(labels, average_labels)

    return average_values, average_labels, accuracy

def process_periods_avg(data_splits_reshaped):
    """Run the moving-average baseline on the test split of every period.

    Args:
        data_splits_reshaped: Dictionary keyed by period. Each value must have
            a 'test' entry containing 'raw_time_series' and 'labels'.

    Returns:
        Dictionary keyed by 'period_<period key>_test'. Each value holds the
        'average_values', 'average_labels' and 'accuracy' returned by
        average_calculation for that period's test split.
    """
    def summarise(test_split):
        values, predicted, score = average_calculation(
            test_split['raw_time_series'], test_split['labels']
        )
        return {
            'average_values': values,
            'average_labels': predicted,
            'accuracy': score,
        }

    return {
        f'period_{period_key}_test': summarise(period_value['test'])
        for period_key, period_value in data_splits_reshaped.items()
    }

# all_periods_average_data = process_periods_avg(data_splits_reshaped)

# for period, period_data in all_periods_average_data.items():
    # print(f"Accuracy for {period}: {period_data['accuracy']}")

**Random Prediction (RND)**

In [None]:
def process_periods_rnd(data_splits_reshaped, num_iterations=5):
    """Score a random-guess baseline on the test split of every period.

    For each period the NumPy global random state is re-seeded with 0, so
    every period starts from the same random stream. In each iteration a
    vector of random 0/1 labels and, independently of it, a matrix of random
    class probabilities (softmax of uniform noise) are drawn. Accuracy is
    computed from the random labels and the categorical cross-entropy loss
    from the random probabilities, i.e. the two metrics stem from separate
    draws.

    Args:
        data_splits_reshaped: Dictionary keyed by period. Each value must have
            a 'test' entry containing the true binary labels under 'labels'.
        num_iterations: Number of random draws per period.

    Returns:
        Dictionary mapping period key -> 'iteration_<k>' (k starting at 1) ->
        {'random_labels', 'random_probabilities', 'accuracy', 'loss'}.
        Periods only appear when at least one iteration was run.
    """
    period_dict = {}

    for period_key, period_value in data_splits_reshaped.items():
        np.random.seed(0)  # Set the random seed
        labels = period_value['test']['labels']
        # Pin the width to the two classes of the random probabilities; without
        # num_classes the width is inferred from the largest label present, and a
        # single-class test split would no longer line up with the (n, 2) predictions.
        labels_one_hot = to_categorical(labels, num_classes=2)
        sample_count = len(labels)

        for i in range(num_iterations):
            # Draw order matters for reproducibility: labels first, then probabilities.
            random_labels = np.random.randint(0, 2, size=(sample_count,))
            random_probabilities = tf.nn.softmax(np.random.rand(sample_count, 2)).numpy()

            # Accuracy of the random labels against the true labels
            accuracy = accuracy_score(labels, random_labels)

            # Mean categorical cross-entropy of the random probabilities
            loss = categorical_crossentropy(labels_one_hot, random_probabilities).numpy().mean()

            period_dict.setdefault(period_key, {})[f'iteration_{i+1}'] = {
                'random_labels': random_labels,
                'random_probabilities': random_probabilities,
                'accuracy': accuracy,
                'loss': loss
            }

    return period_dict

# all_periods_random_data = process_periods_rnd(data_splits_reshaped)

# for period, period_data in all_periods_random_data.items():
#    for iteration, iteration_data in period_data.items():
#        print(f"Loss for {period} during {iteration}: {iteration_data['loss']}")
#        print(f"Accuracy for {period} during {iteration}: {iteration_data['accuracy']}")