# Feature Engineering



- `Moving Average`: This indicator reveals price trends by smoothing out market fluctuations, effectively filtering out short-term noise from price data.
- `Momentum (MOM)`: This metric quantifies the speed (rate of change) of a security's price or volume, computed as the difference between the current value and the value n periods ago, essentially measuring how rapidly prices are changing in a given direction.
- `Rate Of Change (ROC)`: This momentum oscillator calculates the percentage difference between current price and the price n periods ago. Assets displaying **elevated ROC values** typically signal overbought conditions, while **depressed ROC readings** often indicate oversold market states.
- `Relative Strength Index (RSI)`: A momentum measurement that evaluates the speed and magnitude of recent price movements to identify potential overbought or oversold conditions. It operates on a scale of 0-100, with **readings above 70 suggesting an overbought asset** and **readings below 30 indicating an undervalued, oversold condition**.
- `Stochastic Oscillator (%K and %D)`: This momentum tool compares a security's closing price to its price range over a specific timeframe. The %K line represents the faster signal, while %D, a 3-period moving average of %K, serves as the slower, more smoothed indicator.

In [None]:
import os
import pandas as pd

# Show floats with two decimals when frames are rendered.
pd.set_option("display.precision", 2)

# Location and file names of the pre-split datasets.
PROCESSED_PATH = "data/processed/"
TRAIN_DATA_NAME = "train_data.csv"  # 60%
VALIDATION_DATA_NAME = "validation_data.csv"  # 20%
TEST_DATA_NAME = "test_data.csv"  # 20%

TRAIN_DATA_FILE = os.path.join(PROCESSED_PATH, TRAIN_DATA_NAME)
VALIDATION_DATA_FILE = os.path.join(PROCESSED_PATH, VALIDATION_DATA_NAME)

# Load each split and index it by its "Timestamp" column.
train_data = pd.read_csv(TRAIN_DATA_FILE).set_index("Timestamp")
valid_data = pd.read_csv(VALIDATION_DATA_FILE).set_index("Timestamp")

In [None]:
# Keep only the 100,000 most recent rows (the end of each frame).
train_data = train_data.iloc[-100000:]
valid_data = valid_data.iloc[-100000:]

In [None]:
import numpy as np


def generate_bull_bear_signals(df, verbose=True):
    """Add bull-market-support-band columns and a binary ``signal`` to ``df``.

    The frame is modified in place; nothing is returned. Four columns are
    written, in this order: ``SMA_20W``, ``EMA_21W``, ``Bull_Support_Band``
    and ``signal``.

    Args:
        df: DataFrame with a ``Close`` column. The window sizes assume one
            row per minute -- TODO confirm against the data source.
        verbose: When true, show the value counts of ``signal`` via
            ``display`` (expected to be supplied by the notebook runtime).
    """
    minutes_per_week = 7 * 24 * 60
    sma_window = 20 * minutes_per_week  # 20 weeks -> 201,600 minutes
    ema_span = 21 * minutes_per_week  # 21 weeks -> 211,680 minutes

    close = df["Close"]

    # min_periods=1 lets both averages produce values before a full window
    # of history is available.
    df["SMA_20W"] = close.rolling(window=sma_window, min_periods=1).mean()
    df["EMA_21W"] = close.ewm(span=ema_span, min_periods=1, adjust=False).mean()

    # The support band is the lower of the two averages on each row.
    df["Bull_Support_Band"] = df[["SMA_20W", "EMA_21W"]].min(axis=1)

    # 1.0 (buy) while Close is strictly above the band, 0.0 (sell) otherwise.
    above_band = df["Close"] > df["Bull_Support_Band"]
    df["signal"] = np.where(above_band, 1.0, 0.0)

    if not verbose:
        return
    display(df["signal"].value_counts())


# Label the training frame in place and show the signal class balance.
generate_bull_bear_signals(df=train_data, verbose=True)

In [None]:
def calculate_simple_moving_average(df, period):
    """Return the ``period``-row simple moving average of ``df["Close"]``.

    Rows without a full window of ``period`` observations are NaN. The
    result is named ``SMA_<period>``.
    """
    window = df["Close"].rolling(period, min_periods=period)
    return window.mean().rename(f"SMA_{period}")


def calculate_exponential_moving_average(df, period):
    """Return the exponential moving average of ``df["Close"]``.

    Uses ``span=period`` and leaves the first ``period - 1`` rows as NaN.
    The result is named ``EMA_<period>``.
    """
    weighted = df["Close"].ewm(span=period, min_periods=period)
    return weighted.mean().rename(f"EMA_{period}")


def calculate_momentum(df, period):
    """Return ``Close`` minus ``Close`` from ``period`` rows earlier.

    The first ``period`` rows are NaN. The result is named
    ``Momentum_<period>``.
    """
    price_change = df["Close"].diff(period)
    return price_change.rename(f"Momentum_{period}")


def calculate_rate_of_change(df, period):
    """Return the percentage change of ``Close`` versus ``period`` rows ago.

    Computed as ``(close - lagged) / lagged * 100``; the first ``period``
    rows are NaN. The result is named ``ROC_<period>``.
    """
    close = df["Close"]
    lagged = close.shift(period)
    percent_change = close.sub(lagged).div(lagged).mul(100)
    return percent_change.rename(f"ROC_{period}")


def calculate_relative_strength_index(df, period):
    """Return Wilder's Relative Strength Index for every row of ``df``.

    The average gain and average loss are seeded with the simple mean of the
    first ``period`` price changes and then updated with Wilder's recursion
    ``avg = (avg * (period - 1) + current) / period``. RSI is
    ``100 - 100 / (1 + avg_gain / avg_loss)``, and is 100 whenever the
    average loss is zero.

    Args:
        df: DataFrame with a ``Close`` column (assumed free of NaN).
        period: Number of price changes used for the smoothing.

    Returns:
        A Series named ``RSI_<period>`` that shares ``df``'s index, so it can
        be assigned straight to a column. The first ``period`` rows are NaN;
        the whole Series is NaN when ``df`` has ``period`` rows or fewer.
    """
    close = df["Close"]
    rsi = pd.Series(float("nan"), index=close.index, name=f"RSI_{period}")

    # `period` price changes (period + 1 prices) are needed for the seed.
    if len(close) <= period:
        return rsi

    delta = close.diff()
    gains = delta.clip(lower=0)
    losses = delta.clip(upper=0).abs()

    def _wilder_average(changes):
        # Row `period` holds the seed (mean of the first `period` changes,
        # skipping the leading NaN from diff). With adjust=False and
        # alpha=1/period, ewm then applies exactly Wilder's recursion.
        smoothed = changes.iloc[period:].copy()
        smoothed.iloc[0] = changes.iloc[1 : period + 1].mean()
        return smoothed.ewm(alpha=1 / period, adjust=False).mean()

    avg_gain = _wilder_average(gains)
    avg_loss = _wilder_average(losses)

    values = 100 - (100 / (1 + avg_gain / avg_loss))
    # No losses in the smoothed window: define RSI as 100 (also covers 0/0).
    values = values.mask(avg_loss == 0, 100.0)

    # Positional write avoids any index alignment surprises.
    rsi.iloc[period:] = values.to_numpy()
    return rsi


def calculate_stochastic_oscillator(df, period, k_or_d="k"):
    """Return the stochastic oscillator %K or %D line.

    %K is ``(Close - lowest Low) / (highest High - lowest Low) * 100`` over a
    rolling window of ``period`` rows. %D is the 3-row rolling mean of %K.

    Args:
        df: DataFrame with ``Close``, ``High`` and ``Low`` columns.
        period: Rolling window length for the high/low range.
        k_or_d: ``"k"`` (case-insensitive) selects %K; any other value
            selects %D.

    Returns:
        A Series named ``Stochastic_K_<period>`` or ``Stochastic_D_<period>``.
    """
    lowest_low = df["Low"].rolling(period).min()
    highest_high = df["High"].rolling(period).max()
    fast_line = ((df["Close"] - lowest_low) / (highest_high - lowest_low)) * 100

    if k_or_d.lower() != "k":
        slow_line = fast_line.rolling(3).mean()
        return pd.Series(slow_line, name=f"Stochastic_D_{period}")
    return pd.Series(fast_line, name=f"Stochastic_K_{period}")

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go


def plot_line(df, columns, title="", secondary_y=None, size=(350, 1000)):
    """Draw the given columns of ``df`` as line traces and show the figure.

    Args:
        df: DataFrame whose index supplies the x values.
        columns: Column names to plot, one trace per column.
        title: Figure title.
        secondary_y: Optional sequence parallel to ``columns``; each entry is
            passed as that trace's ``secondary_y`` flag. When falsy, a plain
            single-axis figure is used.
        size: ``(height, width)`` of the figure in pixels. The default is an
            immutable tuple so it cannot be shared and mutated across calls.
    """
    if secondary_y:
        fig = make_subplots(specs=[[{"secondary_y": True}]])
    else:
        fig = go.Figure()

    for position, column in enumerate(columns):
        trace = go.Scatter(
            x=df.index,
            y=df[column],
            mode="lines",
            name=column,
            line=dict(width=2.0),
        )
        if secondary_y:
            fig.add_trace(trace, secondary_y=secondary_y[position])
        else:
            fig.add_trace(trace)

    fig.update_layout(
        height=size[0],
        width=size[1],
        template="plotly_white",
        title=title,
        margin=dict(l=50, r=80, t=50, b=40),
    )
    fig.show()

In [None]:
def calculate_technical_indicators(df, plot_data=True, plot_period=None):
    """Add the technical-indicator columns to ``df`` and optionally plot them.

    For each indicator family, three columns are written to ``df`` in place,
    one per look-back of 10, 30 and 200 rows. Families are processed in this
    order: simple MA, exponential MA, momentum, rate of change, RSI,
    stochastic %K, stochastic %D.

    Args:
        df: DataFrame holding the price columns the indicator functions read.
        plot_data: When true, draw one line chart per indicator family.
        plot_period: Optional ``df.loc`` selector restricting the plotted
            rows; ``None`` plots every row.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    lookbacks = (10, 30, 200)

    # (chart title, column-name template, function computing one column)
    families = [
        ("Simple Moving Average", "MA{}", calculate_simple_moving_average),
        ("Exponential Moving Average", "EMA{}", calculate_exponential_moving_average),
        ("Momentum", "MOM{}", calculate_momentum),
        ("Rate of Change", "ROC{}", calculate_rate_of_change),
        ("Relative Strength Index", "RSI{}", calculate_relative_strength_index),
        (
            "Stochastic Oscillator (Fast)",
            "STOCH_K_{}",
            lambda frame, n: calculate_stochastic_oscillator(frame, n, "k"),
        ),
        (
            "Stochastic Oscillator (Slow)",
            "STOCH_D_{}",
            lambda frame, n: calculate_stochastic_oscillator(frame, n, "d"),
        ),
    ]

    indicator_groups = {}
    for family_title, name_template, compute in families:
        family_columns = []
        for lookback in lookbacks:
            column = name_template.format(lookback)
            df[column] = compute(df, lookback)
            family_columns.append(column)
        indicator_groups[family_title] = family_columns

    if not plot_data:
        return df

    plot_period_data = df.loc[plot_period] if plot_period is not None else df
    for title, indicators in indicator_groups.items():
        plot_line(
            plot_period_data[indicators],
            indicators,
            title=f"{title} (periods=10,30,200)",
        )

    return df

In [None]:
# Adds the indicator columns to train_data in place and plots each group;
# the returned frame becomes the cell output.
calculate_technical_indicators(df=train_data)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def plot_target_correlation(
    df, target="demand", figsize=(20, 6), return_corr=False, fontsize=10
):
    """Show (or return) the correlation of every column with ``target``.

    Args:
        df: DataFrame whose pairwise column correlations are computed with
            ``df.corr()``.
        target: Column to correlate against; its own entry is dropped.
        figsize: Matplotlib figure size for the heatmap.
        return_corr: When true, skip plotting and return the unsorted
            correlation Series instead.
        fontsize: Base font size for annotations, ticks and the colorbar;
            the title uses ``fontsize + 4``.

    Returns:
        The correlation Series when ``return_corr`` is true, otherwise
        ``None`` after drawing a single-row heatmap sorted from the highest
        to the lowest correlation.
    """
    target_corr = df.corr()[target].drop(target, errors="ignore")
    if return_corr:
        return target_corr

    ranked = target_corr.sort_values(ascending=False)
    single_row = ranked.to_frame().T
    palette = sns.diverging_palette(220, 10, as_cmap=True)

    plt.figure(figsize=figsize)
    heatmap_options = dict(
        annot=True,
        cmap=palette,
        center=0,
        vmin=-1,
        vmax=1,
        cbar=True,
        linewidths=1,
        annot_kws={"size": fontsize},
    )
    ax = sns.heatmap(single_row, **heatmap_options)

    plt.title(f"Feature Correlation with {target}", fontsize=fontsize + 4)
    plt.xticks(rotation=45, ha="right", fontsize=fontsize)
    plt.yticks(fontsize=fontsize)
    plt.subplots_adjust(bottom=0.3)

    # Style the colorbar attached to the heatmap.
    colorbar = ax.collections[0].colorbar
    colorbar.ax.tick_params(labelsize=fontsize)
    colorbar.set_label("Correlation Coefficient", fontsize=fontsize)

    plt.tight_layout()
    # The current figure is fetched but not returned.
    plt.gcf()

In [None]:
# Heatmap of how each column of train_data correlates with the "signal" label.
plot_target_correlation(train_data, target="signal")