# Baseline Features

The baseline features include: `Timestamp`, `Open`, `High`, `Low`, `Close`, `Volume`.
- Working with the full index dataset (7.0M entries) can lead to excessive training times.
- We'll restrict our dataset to the most recent `100,000` data points, which are likely more relevant for future predictions.
- Note that a `training period` of only two months may not capture all relevant trends, despite containing substantial data.

## Target Variable

- We define our prediction variable `signal` using the `Close` price relative to the Bull Market Support Band (the lower of 20-week SMA and 21-week EMA).
- If the closing price is above the Bull Market Support Band, it signals a bull market; otherwise (at or below the band), it signals a bear market.
- The trading strategy assigns signal value = 1 (buy) in bull markets and signal value = 0 (sell) in bear markets.
- The window values for both moving averages are configurable parameters. Both are arbitrary choices that can affect the results; ideally, an optimisation study should be carried out to find the optimum values.


In [None]:
import os
import pandas as pd

# Show floats with two decimal places when frames are displayed.
pd.set_option("display.precision", 2)

# Processed dataset directory and the file name of each split.
PROCESSED_PATH = "data/processed/"
TRAIN_DATA_NAME = "train_data.csv"  # 60%
VALIDATION_DATA_NAME = "validation_data.csv"  # 20%
TEST_DATA_NAME = "test_data.csv"  # 20%

# Only the training and validation splits are resolved and loaded here.
TRAIN_DATA_FILE, VALIDATION_DATA_FILE = (
    os.path.join(PROCESSED_PATH, split_name)
    for split_name in (TRAIN_DATA_NAME, VALIDATION_DATA_NAME)
)

# Load each split and use its "Timestamp" column as the index.
train_data = pd.read_csv(TRAIN_DATA_FILE).set_index("Timestamp")
valid_data = pd.read_csv(VALIDATION_DATA_FILE).set_index("Timestamp")

In [None]:
import numpy as np


def generate_bull_bear_signals(
    price_data, verbose=True, sma_window=201600, ema_span=211680
):
    """Add Bull Market Support Band columns and a binary ``signal`` column.

    ``price_data`` is modified in place: the columns ``SMA_20W``, ``EMA_21W``,
    ``Bull_Support_Band`` and ``signal`` are added (or overwritten). The same
    object is also returned.

    Args:
        price_data: DataFrame with a ``Close`` column. Rows are assumed to be
            in chronological order at one-minute resolution -- TODO confirm
            against the data loader.
        verbose: When True, display the value counts of ``signal``.
        sma_window: Number of rows in the simple-moving-average window.
            The default of 201,600 equals 20 weeks of one-minute rows
            (20 * 7 * 24 * 60).
        ema_span: Span, in rows, of the exponential moving average.
            The default of 211,680 equals 21 weeks of one-minute rows
            (21 * 7 * 24 * 60).

    Returns:
        The same ``price_data`` object, with the new columns attached.

    Note:
        The output column names stay ``SMA_20W`` / ``EMA_21W`` even when
        non-default windows are passed, so downstream code keeps working.
    """
    close = price_data["Close"]

    # min_periods=1 means the averages are defined from the very first row;
    # until `sma_window` rows are available the SMA is simply the mean of all
    # rows seen so far.
    price_data["SMA_20W"] = close.rolling(
        window=sma_window, min_periods=1, center=False
    ).mean()

    # adjust=False selects the recursive EMA form, seeded with the first Close.
    price_data["EMA_21W"] = close.ewm(
        span=ema_span, min_periods=1, adjust=False
    ).mean()

    # Bull Market Support Band: the lower of the two indicators, row by row.
    price_data["Bull_Support_Band"] = price_data[["SMA_20W", "EMA_21W"]].min(axis=1)

    # 1.0 (buy) only when Close is strictly above the band; a Close equal to
    # the band (always the case for the first row) yields 0.0 (sell).
    is_bull = price_data["Close"] > price_data["Bull_Support_Band"]
    price_data["signal"] = np.where(is_bull, 1.0, 0.0)

    if verbose:
        # `display` is expected to be supplied by the notebook environment.
        display(price_data["signal"].value_counts())

    return price_data


# Label copies of both splits so the loaded frames are left unmodified.
# The function returns the frame it was given, so its result can be bound directly.
train_data_with_signals = generate_bull_bear_signals(train_data.copy(), verbose=True)

valid_data_with_signals = valid_data.copy()
generate_bull_bear_signals(valid_data_with_signals, verbose=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def plot_target_correlation(df, target="demand", figsize=(12, 1), return_corr=False):
    """Plot (or return) the correlation of every numeric column with ``target``.

    Args:
        df: DataFrame holding the features and the target column.
        target: Name of the column the other columns are correlated against.
        figsize: Matplotlib figure size used for the single-row heatmap.
        return_corr: When True, skip plotting and return the correlations.

    Returns:
        A Series of correlations indexed by column name (``target`` itself
        excluded) when ``return_corr`` is True; otherwise None, after drawing
        the heatmap on a new matplotlib figure.

    Raises:
        KeyError: If ``target`` is not a numeric (or boolean) column of ``df``.
    """
    # Correlate numeric/bool columns only: DataFrame.corr() raises on
    # non-numeric columns in pandas versions where numeric_only defaults to False.
    numeric_df = df.select_dtypes(include=["number", "bool"])
    if target not in numeric_df.columns:
        raise KeyError(f"Target column '{target}' is missing or not numeric.")

    corr = numeric_df.corr()[target].drop(target, errors="ignore")

    if return_corr:
        return corr

    # Single-row heatmap: one cell per feature, colour scale centred on zero
    # and clipped to +/-0.3.
    plt.figure(figsize=figsize)
    sns.heatmap(
        corr.to_frame().T,
        annot=True,
        cmap="coolwarm",
        center=0,
        vmin=-0.3,
        vmax=0.3,
        cbar=False,
        linewidths=1,
    )
    plt.title(f"Feature Correlation with {target}")
    plt.subplots_adjust(bottom=0.3, top=0.8)

In [None]:
# Correlation of each training column with the generated "signal" target.
plot_target_correlation(
    train_data_with_signals,
    target="signal",
    figsize=(7, 0.5),
)

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go


def plot_vertical_subplots(
    df: pd.DataFrame,
    columns: list,
    title: str = "",
    line_widths: list = None,
    size: list = None,
    default_lw: float = 1.0,
    max_datapoints: int = None,
):
    """Plot each requested column as its own row of a shared-x Plotly figure.

    Args:
        df: Data to plot; its index is used as the x axis of every subplot.
        columns: Column names to plot, one subplot row each, top to bottom.
            Names not present in ``df`` are skipped with a warning and their
            row is left empty.
        title: Figure title; when empty, a smaller top margin is used.
        line_widths: One line width per entry of ``columns``. When omitted,
            or when its length does not match ``columns``, ``default_lw`` is
            used for every trace.
        size: ``[height, width]`` of the figure. Defaults to ``[600, 1000]``.
        default_lw: Fallback line width.
        max_datapoints: When positive and smaller than ``len(df)``, only the
            first ``max_datapoints`` rows are plotted. A non-positive value
            prints a warning and plots all rows.

    Raises:
        ValueError: If ``columns`` is empty.
    """
    nplots = len(columns)
    if nplots == 0:
        raise ValueError("`columns` must contain at least one column name.")

    # A None sentinel replaces the former mutable list default.
    if size is None:
        size = [600, 1000]
    height, width = size[0], size[1]

    # Optionally truncate the data to its first `max_datapoints` rows.
    if max_datapoints is not None and 0 < max_datapoints < len(df):
        df_to_plot = df.head(max_datapoints)
        print(f"Info: Plotting the first {max_datapoints} data points.")
    else:
        df_to_plot = df
        if max_datapoints is not None and max_datapoints <= 0:
            print(
                f"Warning: Invalid max_datapoints value ({max_datapoints}). Plotting all points."
            )

    fig = make_subplots(rows=nplots, cols=1, shared_xaxes=True, subplot_titles=columns)

    # Resolve one line width per subplot, falling back to the default width.
    if line_widths is not None and len(line_widths) == nplots:
        current_widths = line_widths
    else:
        if line_widths is not None:
            print(
                f"Warning: Length of line_widths ({len(line_widths)}) does not match number of plots ({nplots}). Using default width {default_lw}."
            )
        current_widths = [default_lw] * nplots

    print(f"Plotting {len(df_to_plot)} data points for each of the {nplots} plots...")

    for i, col_name in enumerate(columns):
        if col_name not in df_to_plot.columns:
            print(
                f"Warning: Column '{col_name}' not found in DataFrame. Skipping plot."
            )
            continue

        fig.add_trace(
            go.Scattergl(
                x=df_to_plot.index,
                y=df_to_plot[col_name],
                mode="lines",
                name=col_name,
                line=dict(width=current_widths[i]),
                # Only the first requested column gets a legend entry.
                showlegend=(i == 0),
            ),
            row=i + 1,
            col=1,
        )

    fig.update_layout(
        height=height,
        width=width,
        template="plotly_white",
        title_text=title,
        margin=dict(l=50, r=50, t=80 if title else 50, b=40),
        hovermode="x unified",
    )
    fig.update_xaxes(rangeslider_visible=False)
    print("Displaying plot...")
    fig.show(
        config={
            "scrollZoom": True,
            "displayModeBar": True,
        }
    )

In [None]:
# Plot at most the first 800,000 training rows: price on top, signal below.
df_shortened = train_data_with_signals.iloc[:800000]

plot_vertical_subplots(
    df_shortened,
    ["Close", "signal"],
    title="Closing Price and Signal Fluctuation in Training Data",
    line_widths=[2, 0.4],
    size=[500, 1000],
)