## Import Libraries

In [None]:
# Install and Import Libraries
%%capture
!pip install yfinance
!pip install pyts
!pip install mplfinance
!pip install opencv-python-headless
!pip installpandas_market_calendars
!pip install tqdm
from keras.models import load_model
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import mplfinance as mpf
import datetime
import logging
import warnings
import pyts
import pickle
import pywt
import gc
import time
import cv2
from google.colab.patches import cv2_imshow
from io import BytesIO
from pyts.image import MarkovTransitionField
from pyts.image import GramianAngularField
from pyts.image import RecurrencePlot
from scipy.signal import spectrogram
from skimage.measure import block_reduce
from tqdm import tqdm
import os
from collections import defaultdict
from keras.losses import categorical_crossentropy
import warnings
from keras.utils import to_categorical

# Silence two known, noisy warnings so they do not flood the cell outputs.
# `message` is a regular expression matched against the start of the warning text.
# NOTE(review): the first message presumably comes from the quantile binning used
# by the MTF encoding further down - confirm.
warnings.filterwarnings("ignore", message="Some quantiles are equal.")
# Deprecation notice about the frame.append method.
warnings.filterwarnings('ignore', message='The frame.append method is deprecated')

In [None]:
# Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Generate Time Series (RAW)

The following code segments generate the S&P 500 index ETF time series windows and labels. <br> In the thesis paper, this data is referred to as the `Index Dataset`.

**Query Index ETF Data**

In [None]:
## Get $VOO Data
# One pair of bounds drives both the download and the business-day calendar,
# so the two can never drift apart.
# NOTE(review): yfinance documents `end` as exclusive while pd.date_range includes it,
# so the final calendar day is expected to be forward-filled - confirm.
INDEX_START_DATE = '2016-12-05'
INDEX_END_DATE = '2022-12-30'

ticker_obj = yf.Ticker('VOO')
sp500_index_data = ticker_obj.history(start=INDEX_START_DATE, end=INDEX_END_DATE)

# Move the date index into a regular 'Date' column
sp500_index_data = sp500_index_data.reset_index()

# Extract the year from the 'Date' column and create a new 'Year' column
sp500_index_data['Year'] = sp500_index_data['Date'].dt.year

# Remove the time-of-day (and timezone, if any) but keep a datetime64 dtype.
# Converting to plain `datetime.date` objects would leave an object-dtype index
# that has to be aligned against the DatetimeIndex calendar below.
trade_dates = pd.to_datetime(sp500_index_data['Date'])
if trade_dates.dt.tz is not None:
    trade_dates = trade_dates.dt.tz_localize(None)
sp500_index_data['Date'] = trade_dates.dt.normalize()

# Select only the desired columns
desired_columns = ["Date", "Open", "High", "Low", "Close", "Year"]
sp500_index_data = sp500_index_data[desired_columns]

## Fill missing trading holidays with previous available day's data

# Every weekday between the bounds; 'B' is for business days (Monday-Friday)
date_range = pd.date_range(start=INDEX_START_DATE, end=INDEX_END_DATE, freq='B')

# Align the prices to the weekday calendar, forward filling the missing days.
# NOTE(review): 'Year' is forward-filled together with the prices, so a holiday row
# carries the year of the previous trading day (e.g. a 1 January holiday keeps the
# prior year). Left unchanged here because it affects the yearly split below.
sp500_index_data = (
    sp500_index_data
    .set_index('Date')
    .reindex(date_range, method='ffill')
    .reset_index()
    .rename(columns={'index': 'Date'})
)

sp500_index_data

Unnamed: 0,Date,Open,High,Low,Close,Year
0,2016-12-05,180.158830,180.745612,179.963236,180.398880,2016
1,2016-12-06,180.630017,181.038990,180.158817,181.030090,2016
2,2016-12-07,180.878972,183.430579,180.745619,183.323883,2016
3,2016-12-08,183.368296,184.275140,183.101576,183.803940,2016
4,2016-12-09,184.052921,184.950865,184.017351,184.924194,2016
...,...,...,...,...,...,...
1580,2022-12-26,347.499208,350.506844,345.925700,350.427155,2022
1581,2022-12-27,350.317617,350.596467,347.419546,349.032898,2022
1582,2022-12-28,348.983122,350.755822,344.481637,344.750549,2022
1583,2022-12-29,347.479304,351.682001,347.041106,350.865356,2022


**Raw Time Series (RAW)**

In [None]:
def create_windows_and_labels(df, window_size, threshold=0.5):
    """Cut a daily price frame into Monday-anchored windows with next-week labels.

    The frame is sorted by 'Date'. Every row whose date is a Monday starts a
    candidate window of `window_size` consecutive rows. The target is the
    'Close' five rows after the window's last row (row `i + window_size + 4`).

    A candidate is discarded when
      * the target row lies beyond the end of the frame,
      * all 'Close' values inside the window are identical, or
      * any absolute day-over-day 'Close' change inside the window exceeds
        `threshold` (expressed as a fraction, 0.5 == 50 %).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'Date', 'Close', 'Year', 'Open', 'High', 'Low';
        'Date' values must provide `.weekday()`.
    window_size : int
        Number of rows per window.
    threshold : float, default 0.5
        Maximum tolerated absolute fractional change between consecutive closes.

    Returns
    -------
    windows : list of pandas.DataFrame
        One frame per kept window, columns in the order
        ['Date', 'Close', 'Year', 'Open', 'High', 'Low'].
    labels : list of int
        1 if the target close is above the window's last close, else 0.
    returns : list of float
        (target close - last close) / last close.
    max_years : list
        Largest 'Year' value found inside each window.
    """
    df = df.sort_values(by=['Date'])
    window_columns = ['Date', 'Close', 'Year', 'Open', 'High', 'Low']
    n_rows = len(df)

    windows = []
    labels = []
    returns = []
    max_years = []

    for i in range(n_rows - window_size):
        # Only rows that fall on a Monday may start a window.
        if df['Date'].iloc[i].weekday() != 0:
            continue

        # The target row must exist, otherwise there is nothing to label with.
        next_week_index = i + window_size + 4
        if next_week_index >= n_rows:
            continue

        # Independent copy: the caller's frame is never touched and no temporary
        # column has to be added to (and later removed from) a slice.
        window = df.iloc[i:i + window_size][window_columns].copy()
        closes = window['Close']

        # A completely flat window carries no information.
        if closes.nunique() <= 1:
            continue

        # Reject windows with an implausibly large one-day move; the leading NaN
        # produced by pct_change compares as False and is therefore ignored.
        if (closes.pct_change().abs() > threshold).any():
            continue

        last_close = closes.iloc[-1]
        next_week_close = df['Close'].iloc[next_week_index]

        windows.append(window)
        labels.append(1 if next_week_close > last_close else 0)
        returns.append((next_week_close - last_close) / last_close)
        max_years.append(window['Year'].max())

    return windows, labels, returns, max_years

# Number of consecutive business-day rows per window.
window_size = 20

(sp500_windows,
 sp500_labels,
 sp500_returns,
 sp500_max_years) = create_windows_and_labels(sp500_index_data, window_size)

# Raw (RAW) representation: just the closing prices of every window as a NumPy array.
sp500_raw_time_series_arrays = [frame['Close'].values for frame in sp500_windows]

## Encode Time Series (RAW) as Images

The following code segments encode the previously created raw time series (RAW) windows as images.

**Candlestick Charts (CND)**

In [None]:
def custom_greyscale_conversion(img):
    """Convert a colour chart image to greyscale while keeping red and green apart.

    The image is read in OpenCV's channel order (index 0 = blue, 1 = green,
    2 = red). After the standard greyscale conversion, pixels that are
    predominantly red are brightened by a factor of 1.5 and pixels that are
    predominantly green are darkened by a factor of 0.5.

    Parameters
    ----------
    img : numpy.ndarray
        Colour image with at least three channels on the last axis.

    Returns
    -------
    numpy.ndarray
        2-D uint8 greyscale image with values in [0, 255].
    """
    red_channel = img[:, :, 2]
    green_channel = img[:, :, 1]
    blue_channel = img[:, :, 0]

    # Convert the image to greyscale using OpenCV's built-in function
    grey_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Masks for pixels dominated by red respectively green (mutually exclusive)
    red_mask = (red_channel > green_channel) & (blue_channel < red_channel)
    green_mask = (green_channel > red_channel) & (blue_channel < green_channel)

    # Scale in floating point. Writing `grey * 1.5` straight back into a uint8
    # array wraps every value above 255 before the clip can act on it.
    # NOTE(review): if the `Thesis_Models` notebook builds its training images with
    # the wrapping variant, apply the same fix there to keep both pipelines aligned.
    custom_grey_img = grey_img.astype(np.float32)
    custom_grey_img[red_mask] *= 1.5
    custom_grey_img[green_mask] *= 0.5

    # Clip to the valid 8-bit range, then convert back to uint8
    custom_grey_img = np.clip(custom_grey_img, 0, 255)

    return custom_grey_img.astype(np.uint8)

def max_pooling(img, pool_size):
    """Down-sample `img` by taking the maximum of each `pool_size` x `pool_size` block."""
    pooling_window = (pool_size, pool_size)
    return block_reduce(img, block_size=pooling_window, func=np.max)

def generate_candlestick_array(window, target_size=(20, 20), pool_size=2):
    """Render one price window as a small greyscale candlestick image.

    Steps: plot the window as a candlestick chart without axes, save the figure
    to an in-memory PNG, decode it with OpenCV, convert it with
    `custom_greyscale_conversion`, max-pool it and finally resize it.

    Parameters
    ----------
    window : pandas.DataFrame
        Rows of one window; plotted with mplfinance, so it must hold the
        'Open', 'High', 'Low' and 'Close' columns.
    target_size : tuple of int, default (20, 20)
        Output size handed to `cv2.resize`.
    pool_size : int, default 2
        Edge length of the max-pooling block applied before resizing.

    Returns
    -------
    numpy.ndarray
        Greyscale image resized to `target_size`.
    """
    # mplfinance needs a datetime index. A synthetic daily index is used instead of
    # the real dates, so every window is drawn with evenly spaced candles.
    num_rows = window.shape[0]
    temp_window = window.copy()
    temp_window.index = pd.date_range(start='2000-01-01', periods=num_rows, freq='D')

    fig, _ = mpf.plot(temp_window, type='candle', style='charles', returnfig=True, axisoff=True)

    try:
        with BytesIO() as buf:
            fig.savefig(buf, format='png', bbox_inches='tight')
            png_bytes = buf.getvalue()
    finally:
        # Always release the figure, even when saving fails, to avoid piling up
        # open figures while looping over many windows.
        plt.close(fig)

    # IMREAD_UNCHANGED keeps the PNG exactly as it was written
    img = cv2.imdecode(np.frombuffer(png_bytes, np.uint8), cv2.IMREAD_UNCHANGED)

    grey_img = custom_greyscale_conversion(img)

    pooled_grey_img = max_pooling(grey_img, pool_size)

    resized_grey_img = cv2.resize(pooled_grey_img, target_size, interpolation=cv2.INTER_AREA)

    return resized_grey_img

# One candlestick image per window, in the same order as `sp500_windows`.
sp500_candlestick_arrays = []

for window in sp500_windows:
    sp500_candlestick_arrays.append(generate_candlestick_array(window))

**Markov Transition Fields (MTF)**

In [None]:
def generate_mtf_arrays(windows, n_bins=5, size=(20, 20)):
    """Encode the 'Close' series of every window as a Markov Transition Field image.

    Parameters
    ----------
    windows : iterable of pandas.DataFrame
        Each frame must hold a 'Close' column.
    n_bins : int, default 5
        Number of bins passed to `MarkovTransitionField`.
    size : tuple of int, default (20, 20)
        Output size handed to `cv2.resize`.

    Returns
    -------
    list of numpy.ndarray
        One uint8 greyscale image per window, scaled to [0, 255].
    """
    images = []
    mtf = MarkovTransitionField(n_bins=n_bins)

    for window in windows:
        # Extract the 'Close' column from the window
        close_prices = window['Close'].values

        # Compute the MTF
        mtf_image = mtf.fit_transform([close_prices])[0]

        # Min-max normalise to [0, 1]. A constant image has no spread; it is mapped
        # to all zeros instead of dividing by zero and casting NaN to uint8.
        lowest = mtf_image.min()
        spread = mtf_image.max() - lowest
        if spread > 0:
            mtf_image_normalized = (mtf_image - lowest) / spread
        else:
            mtf_image_normalized = np.zeros_like(mtf_image, dtype=float)

        # Convert the normalized MTF image to a grayscale image
        mtf_image_grayscale = (mtf_image_normalized * 255).astype(np.uint8)

        # Resize the MTF image
        mtf_image_resized = cv2.resize(mtf_image_grayscale, size, interpolation=cv2.INTER_LINEAR)

        images.append(mtf_image_resized)

    return images

sp500_mtf_arrays = generate_mtf_arrays(sp500_windows)

**Gramian Angular Fields (GAF)**

In [None]:
def generate_gaf_difference_arrays(windows, method='difference', size=(20, 20)):
    """Encode the 'Close' series of every window as a Gramian Angular Field image.

    Parameters
    ----------
    windows : iterable of pandas.DataFrame
        Each frame must hold a 'Close' column.
    method : str, default 'difference'
        Forwarded to `GramianAngularField(method=...)`.
    size : tuple of int, default (20, 20)
        Output size handed to `cv2.resize`.

    Returns
    -------
    list of numpy.ndarray
        One uint8 greyscale image per window, scaled to [0, 255].
    """
    arrays = []
    gaf = GramianAngularField(method=method)

    for window in windows:
        close_prices = window['Close'].values
        gaf_image = gaf.fit_transform([close_prices])[0]

        # Min-max normalise to [0, 1]. A constant image has no spread; it is mapped
        # to all zeros instead of dividing by zero and casting NaN to uint8.
        lowest = gaf_image.min()
        spread = gaf_image.max() - lowest
        if spread > 0:
            gaf_image_normalized = (gaf_image - lowest) / spread
        else:
            gaf_image_normalized = np.zeros_like(gaf_image, dtype=float)

        # Convert the normalized GAF image to a greyscale image
        gaf_image_grayscale = (gaf_image_normalized * 255).astype(np.uint8)

        # Resize the greyscale GAF image
        gaf_image_resized = cv2.resize(gaf_image_grayscale, size, interpolation=cv2.INTER_LINEAR)

        arrays.append(gaf_image_resized)

    return arrays

sp500_gaf_arrays = generate_gaf_difference_arrays(sp500_windows)

**Split Data**

In [None]:
# Years that receive their own bucket. Windows whose latest year lies before the
# first bucket year are folded into that first bucket.
split_years = list(range(2017, 2023))

# Per-window series, keyed by the name they are stored under in every year bucket.
series_by_name = {
    'labels': sp500_labels,
    'weekly_returns': sp500_returns,
    'raw_time_series': sp500_raw_time_series_arrays,
    'candlestick': sp500_candlestick_arrays,
    'mtf': sp500_mtf_arrays,
    'gaf': sp500_gaf_arrays,
}

# zip() stops silently at the shortest input, which would hide a misaligned
# encoding step; make the mismatch loud instead.
series_lengths = {name: len(values) for name, values in series_by_name.items()}
series_lengths['max_years'] = len(sp500_max_years)
if len(set(series_lengths.values())) > 1:
    raise ValueError(f"Per-window series differ in length: {series_lengths}")

# Initialize the main dictionary with one empty list per series and year
sp500_dict = {
    bucket_year: {name: [] for name in series_by_name}
    for bucket_year in split_years
}

# Iterate through all the arrays simultaneously
for max_year, *window_values in zip(sp500_max_years, *series_by_name.values()):
    # Determine the year to assign the data to
    year = max(int(max_year), split_years[0])

    # An unexpected year must not silently fall into the bucket of the
    # previously processed window.
    if year not in sp500_dict:
        raise ValueError(f"Window with max year {max_year} is outside the split years {split_years}")

    # Add the data to the corresponding year in the dictionary
    for name, value in zip(series_by_name, window_values):
        sp500_dict[year][name].append(value)

**Reshape Data**

In [None]:
# Target shape of a single sample for every representation that is fed to a model.
# 'labels' and 'weekly_returns' are absent on purpose: they are passed through as-is.
sample_shapes = {
    'raw_time_series': (20, 1, 1),
    'candlestick': (20, 20, 1),
    'mtf': (20, 20, 1),
    'gaf': (20, 20, 1),
}

sp500_dict_reshaped = {}

for year, data in sp500_dict.items():
    sp500_dict_reshaped[year] = {
        key: (
            [array.reshape(*sample_shapes[key]) for array in array_data]
            if key in sample_shapes
            else array_data
        )
        for key, array_data in data.items()
    }

## Load Fitted Models and Predict (Index Dataset)

The following code segments load the models fitted on the `Constituents Dataset` in the previous notebook `Thesis_Models`. <br> These are used to generate predictions on the `Index Dataset`.
<br> <br> The models that achieved the minimum, median, and maximum prediction accuracy on the `Constituents Dataset` <br> have been loaded and tested on the `Index Dataset`. The code output of the following cells shows the results of the maximum-accuracy models.

**Raw Time Series (RAW)**

In [None]:
## Load models fitted on the Constituents Dataset
# One saved model per evaluation year (period 1 -> 2017, ..., period 6 -> 2022).
RAW_model_2017 = load_model('/content/drive/My Drive/X2_raw_time_series_model_period_1.h5')
RAW_model_2018 = load_model('/content/drive/My Drive/X4_raw_time_series_model_period_2.h5')
RAW_model_2019 = load_model('/content/drive/My Drive/X3_raw_time_series_model_period_3.h5')
RAW_model_2020 = load_model('/content/drive/My Drive/X4_raw_time_series_model_period_4.h5')
RAW_model_2021 = load_model('/content/drive/My Drive/X2_raw_time_series_model_period_5.h5')
RAW_model_2022 = load_model('/content/drive/My Drive/X2_raw_time_series_model_period_6.h5')

# List of models and corresponding years
models = [(RAW_model_2017, 2017), (RAW_model_2018, 2018), (RAW_model_2019, 2019),
          (RAW_model_2020, 2020), (RAW_model_2021, 2021), (RAW_model_2022, 2022)]

## Use fitted models for predictions on the Index Dataset
# Iterate through each model and year
for model, year in models:
    # Get raw time series data and labels
    raw_time_series_data = np.array(sp500_dict_reshaped[year]['raw_time_series'])
    actual_labels = sp500_dict_reshaped[year]['labels']

    # One-hot encode the labels. The width is pinned to two classes: without it
    # to_categorical infers the width from the largest label present, so a year
    # containing only 0-labels would produce a single column.
    actual_labels_one_hot = to_categorical(actual_labels, num_classes=2)

    # Predict the probabilities
    predicted_probabilities = model.predict(raw_time_series_data)

    # Get the class with the highest probability
    predicted_labels = np.argmax(predicted_probabilities, axis=-1)

    # Calculate accuracy
    accuracy = accuracy_score(actual_labels, predicted_labels)

    # Evaluate the loss; element 0 of the evaluate() result is used as the loss
    # (presumably the models were compiled with at least one extra metric - confirm).
    loss = model.evaluate(raw_time_series_data, actual_labels_one_hot, verbose=0)[0]

    print(f"Raw Time Series Model for year {year} Loss: {loss}")
    print(f"Raw Time Series Model for year {year} Accuracy: {accuracy}")
    print(f"Raw Time Series Model for year {year} Labels: {', '.join(map(str, predicted_labels))}")

Raw Time Series Model for year 2017 Loss: 0.6566095352172852
Raw Time Series Model for year 2017 Accuracy: 0.6792452830188679
Raw Time Series Model for year 2017 Labels: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
Raw Time Series Model for year 2018 Loss: 0.6826632618904114
Raw Time Series Model for year 2018 Accuracy: 0.5769230769230769
Raw Time Series Model for year 2018 Labels: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
Raw Time Series Model for year 2019 Loss: 0.6624692678451538
Raw Time Series Model for year 2019 Accuracy: 0.6346153846153846
Raw Time Series Model for year 2019 Labels: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
Raw Time Series Model 







Raw Time Series Model for year 2021 Loss: 0.669948935508728
Raw Time Series Model for year 2021 Accuracy: 0.6538461538461539
Raw Time Series Model for year 2021 Labels: 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1








Raw Time Series Model for year 2022 Loss: 0.684952974319458
Raw Time Series Model for year 2022 Accuracy: 0.5490196078431373
Raw Time Series Model for year 2022 Labels: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0


**Candlestick Charts (CND)**

In [None]:
## Load models fitted on the Constituents Dataset
# One saved model per evaluation year (period 1 -> 2017, ..., period 6 -> 2022).
CND_model_2017 = load_model('/content/drive/My Drive/X5_candlestick_model_period_1.h5')
CND_model_2018 = load_model('/content/drive/My Drive/X3_candlestick_model_period_2.h5')
CND_model_2019 = load_model('/content/drive/My Drive/X1_candlestick_model_period_3.h5')
CND_model_2020 = load_model('/content/drive/My Drive/X2_candlestick_model_period_4.h5')
CND_model_2021 = load_model('/content/drive/My Drive/X3_candlestick_model_period_5.h5')
CND_model_2022 = load_model('/content/drive/My Drive/X3_candlestick_model_period_6.h5')

# List of models and corresponding years
models = [(CND_model_2017, 2017), (CND_model_2018, 2018), (CND_model_2019, 2019),
          (CND_model_2020, 2020), (CND_model_2021, 2021), (CND_model_2022, 2022)]

## Use fitted models for predictions on the Index Dataset
# Iterate through each model and year
for model, year in models:
    # Get candlestick data and labels
    candlestick_data = np.array(sp500_dict_reshaped[year]['candlestick'])
    actual_labels = sp500_dict_reshaped[year]['labels']

    # One-hot encode the labels. The width is pinned to two classes: without it
    # to_categorical infers the width from the largest label present, so a year
    # containing only 0-labels would produce a single column.
    actual_labels_one_hot = to_categorical(actual_labels, num_classes=2)

    # Predict the probabilities
    predicted_probabilities = model.predict(candlestick_data)

    # Get the class with the highest probability
    predicted_labels = np.argmax(predicted_probabilities, axis=-1)

    # Calculate accuracy
    accuracy = accuracy_score(actual_labels, predicted_labels)

    # Evaluate the loss; element 0 of the evaluate() result is used as the loss
    # (presumably the models were compiled with at least one extra metric - confirm).
    loss = model.evaluate(candlestick_data, actual_labels_one_hot, verbose=0)[0]

    print(f"Candlestick Model for year {year} Loss: {loss}")
    print(f"Candlestick Model for year {year} Accuracy: {accuracy}")
    print(f"Candlestick Model for year {year} Labels: {', '.join(map(str, predicted_labels))}")

Candlestick Model for year 2017 Loss: 0.6585046052932739
Candlestick Model for year 2017 Accuracy: 0.5849056603773585
Candlestick Model for year 2017 Labels: 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0
Candlestick Model for year 2018 Loss: 0.7596337795257568
Candlestick Model for year 2018 Accuracy: 0.5576923076923077
Candlestick Model for year 2018 Labels: 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1
Candlestick Model for year 2019 Loss: 0.7013486623764038
Candlestick Model for year 2019 Accuracy: 0.6346153846153846
Candlestick Model for year 2019 Labels: 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0
Candlestick Model for year 2020 Loss: 0.667930006980896
Ca

**Markov Transition Fields (MTF)**

In [None]:
# Load models fitted on the Constituents Dataset
# One saved model per evaluation year (period 1 -> 2017, ..., period 6 -> 2022).
MTF_model_2017 = load_model('/content/drive/My Drive/X5_mtf_model_period_1.h5')
MTF_model_2018 = load_model('/content/drive/My Drive/X3_mtf_model_period_2.h5')
MTF_model_2019 = load_model('/content/drive/My Drive/X5_mtf_model_period_3.h5')
MTF_model_2020 = load_model('/content/drive/My Drive/X3_mtf_model_period_4.h5')
MTF_model_2021 = load_model('/content/drive/My Drive/X2_mtf_model_period_5.h5')
MTF_model_2022 = load_model('/content/drive/My Drive/X1_mtf_model_period_6.h5')

# List of models and corresponding years
models = [(MTF_model_2017, 2017), (MTF_model_2018, 2018), (MTF_model_2019, 2019),
          (MTF_model_2020, 2020), (MTF_model_2021, 2021), (MTF_model_2022, 2022)]

## Use fitted models for predictions on the Index Dataset
# Iterate through each model and year
for model, year in models:
    # Get MTF data and labels
    mtf_data = np.array(sp500_dict_reshaped[year]['mtf'])
    actual_labels = sp500_dict_reshaped[year]['labels']

    # One-hot encode the labels. The width is pinned to two classes: without it
    # to_categorical infers the width from the largest label present, so a year
    # containing only 0-labels would produce a single column.
    actual_labels_one_hot = to_categorical(actual_labels, num_classes=2)

    # Predict the probabilities
    predicted_probabilities = model.predict(mtf_data)

    # Get the class with the highest probability
    predicted_labels = np.argmax(predicted_probabilities, axis=-1)

    # Calculate accuracy
    accuracy = accuracy_score(actual_labels, predicted_labels)

    # Evaluate the loss; element 0 of the evaluate() result is used as the loss
    # (presumably the models were compiled with at least one extra metric - confirm).
    loss = model.evaluate(mtf_data, actual_labels_one_hot, verbose=0)[0]

    print(f"MTF Model for year {year} Loss: {loss}")
    print(f"MTF Model for year {year} Accuracy: {accuracy}")
    print(f"MTF Model for year {year} Labels: {', '.join(map(str, predicted_labels))}")

MTF Model for year 2017 Loss: 0.6509992480278015
MTF Model for year 2017 Accuracy: 0.6226415094339622
MTF Model for year 2017 Labels: 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1
MTF Model for year 2018 Loss: 0.7190297245979309
MTF Model for year 2018 Accuracy: 0.5576923076923077
MTF Model for year 2018 Labels: 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1
MTF Model for year 2019 Loss: 0.7530075907707214
MTF Model for year 2019 Accuracy: 0.5384615384615384
MTF Model for year 2019 Labels: 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1
MTF Model for year 2020 Loss: 0.6557144522666931
MTF Model for year 2020 Accuracy: 0.6037735849056604
MTF Model for year 2020 Labe

**Gramian Angular Fields (GAF)**

In [None]:
## Load models fitted on the Constituents Dataset
# One saved model per evaluation year (period 1 -> 2017, ..., period 6 -> 2022).
GAF_model_2017 = load_model('/content/drive/My Drive/X2_gaf_model_period_1.h5')
GAF_model_2018 = load_model('/content/drive/My Drive/X5_gaf_model_period_2.h5')
GAF_model_2019 = load_model('/content/drive/My Drive/X1_gaf_model_period_3.h5')
GAF_model_2020 = load_model('/content/drive/My Drive/X1_gaf_model_period_4.h5')
GAF_model_2021 = load_model('/content/drive/My Drive/X1_gaf_model_period_5.h5')
GAF_model_2022 = load_model('/content/drive/My Drive/X5_gaf_model_period_6.h5')

# List of models and corresponding years
models = [(GAF_model_2017, 2017), (GAF_model_2018, 2018), (GAF_model_2019, 2019),
          (GAF_model_2020, 2020), (GAF_model_2021, 2021), (GAF_model_2022, 2022)]

## Use fitted models for predictions on the Index Dataset
# Iterate through each model and year
for model, year in models:
    # Get GAF data and labels
    gaf_data = np.array(sp500_dict_reshaped[year]['gaf'])
    actual_labels = sp500_dict_reshaped[year]['labels']

    # One-hot encode the labels. The width is pinned to two classes: without it
    # to_categorical infers the width from the largest label present, so a year
    # containing only 0-labels would produce a single column.
    actual_labels_one_hot = to_categorical(actual_labels, num_classes=2)

    # Predict the probabilities
    predicted_probabilities = model.predict(gaf_data)

    # Get the class with the highest probability
    predicted_labels = np.argmax(predicted_probabilities, axis=-1)

    # Calculate accuracy
    accuracy = accuracy_score(actual_labels, predicted_labels)

    # Evaluate the loss; element 0 of the evaluate() result is used as the loss
    # (presumably the models were compiled with at least one extra metric - confirm).
    loss = model.evaluate(gaf_data, actual_labels_one_hot, verbose=0)[0]

    print(f"GAF Model for year {year} Loss: {loss}")
    print(f"GAF Model for year {year} Accuracy: {accuracy}")
    print(f"GAF Model for year {year} Labels: {', '.join(map(str, predicted_labels))}")

GAF Model for year 2017 Loss: 0.7184849977493286
GAF Model for year 2017 Accuracy: 0.6037735849056604
GAF Model for year 2017 Labels: 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1
GAF Model for year 2018 Loss: 0.7542997598648071
GAF Model for year 2018 Accuracy: 0.5192307692307693
GAF Model for year 2018 Labels: 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0
GAF Model for year 2019 Loss: 0.811617910861969
GAF Model for year 2019 Accuracy: 0.5384615384615384
GAF Model for year 2019 Labels: 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
GAF Model for year 2020 Loss: 0.6849138736724854
GAF Model for year 2020 Accuracy: 0.6792452830188679
GAF Model for year 2020 Label

**Simple Moving Average (SMA)**

In [None]:
## Simple moving average calculation on the Index Dataset
def average_calculation(raw_time_series_arrays, labels):
    """Moving-average baseline: predict 1 when a window closes above its own mean.

    Parameters
    ----------
    raw_time_series_arrays : iterable of numpy.ndarray
        One array per window; the first axis must have length 20.
    labels : sequence
        True labels, scored against the baseline predictions.

    Returns
    -------
    average_values : list
        Mean over the first axis of every window.
    average_labels : list of int
        1 if the window's last element exceeds its mean, otherwise 0.
    accuracy : float
        `accuracy_score(labels, average_labels)`.

    Raises
    ------
    ValueError
        If a window does not hold exactly 20 elements along its first axis.
    """
    average_values, average_labels = [], []

    for series in raw_time_series_arrays:
        if series.shape[0] != 20:
            raise ValueError("Each array in the input list must contain exactly 20 elements.")

        # Mean across the 20 time steps of the window
        window_mean = np.mean(series, axis=0)
        average_values.append(window_mean)

        # Compare the final time step against the window mean
        closes_above_mean = series[-1] > window_mean
        average_labels.append(1 if closes_above_mean else 0)

    # Score the baseline for the whole period
    accuracy = accuracy_score(labels, average_labels)

    return average_values, average_labels, accuracy

def process_periods_avg(sp500_dict_reshaped):
    """Run the moving-average baseline for every year from 2017 through 2022.

    Parameters
    ----------
    sp500_dict_reshaped : dict
        Maps each year to a dict holding at least 'raw_time_series' and 'labels'.

    Returns
    -------
    dict
        Maps each year to a dict with the keys 'average_values',
        'average_labels' and 'accuracy'.
    """
    def summarise(year):
        # Raw windows and true labels of the requested year
        year_series = np.array(sp500_dict_reshaped[year]['raw_time_series'])
        year_labels = sp500_dict_reshaped[year]['labels']

        means, predictions, score = average_calculation(year_series, year_labels)
        return {
            'average_values': means,
            'average_labels': predictions,
            'accuracy': score,
        }

    return {year: summarise(year) for year in range(2017, 2023)}

all_periods_average_data = process_periods_avg(sp500_dict_reshaped)

# Print accuracy and predicted labels of the baseline, one year after the other.
for year in all_periods_average_data:
    summary = all_periods_average_data[year]
    predicted_text = ', '.join(map(str, summary['average_labels']))
    print(f"Year {year} Average-based Model Accuracy: {summary['accuracy']}")
    print(f"Year {year} Average-based Model Labels: {predicted_text}")

Year 2017 Average-based Model Accuracy: 0.5660377358490566
Year 2017 Average-based Model Labels: 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
Year 2018 Average-based Model Accuracy: 0.46153846153846156
Year 2018 Average-based Model Labels: 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0
Year 2019 Average-based Model Accuracy: 0.6923076923076923
Year 2019 Average-based Model Labels: 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
Year 2020 Average-based Model Accuracy: 0.5471698113207547
Year 2020 Average-based Model Labels: 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,

**Random Prediction (RND)**

In [None]:
## Random predictions on the Index Dataset
# Initialize random seed for reproducibility
np.random.seed(0)

for year in range(2017, 2023):
    # Get the labels
    actual_labels = sp500_dict[year]['labels']

    # One-hot encode with a fixed width of two classes so the targets always line
    # up with the two-column random predictions, even if a year only holds 0-labels.
    actual_labels_categorical = to_categorical(actual_labels, num_classes=2)

    # Generate 5 random predictions
    for i in range(5):
        # Generate random probabilities
        random_predictions = np.random.rand(len(actual_labels), 2)  # Binary classification
        random_predictions = random_predictions / np.sum(random_predictions, axis=1, keepdims=True)  # Normalize to make probabilities sum to 1

        # Calculate accuracy
        predicted_labels = np.argmax(random_predictions, axis=-1)
        accuracy = accuracy_score(actual_labels, predicted_labels)

        # Calculate loss (mean categorical cross-entropy over the year's windows)
        loss = categorical_crossentropy(actual_labels_categorical, random_predictions).numpy().mean()

        print(f"Year {year} - Random guess {i+1} Loss: {loss}, Accuracy: {accuracy}, Labels: {', '.join(map(str, predicted_labels))}")

Year 2017 - Random guess 1 Loss: 1.0483529853814881, Accuracy: 0.41509433962264153, Labels: 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1
Year 2017 - Random guess 2 Loss: 0.8783584660819423, Accuracy: 0.39622641509433965, Labels: 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0
Year 2017 - Random guess 3 Loss: 1.0794107494114413, Accuracy: 0.4339622641509434, Labels: 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1
Year 2017 - Random guess 4 Loss: 0.7002639746410375, Accuracy: 0.6415094339622641, Labels: 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0
Ye