# Libraries

In [None]:
import glob
from collections.abc import Sequence

import numpy as np
import pandas as pd

# Preprocessing

In [None]:
# Read every raw end-of-day parquet file and stack them into a single frame.
raw_files = glob.glob("../data/raw/*_eod.parquet")
dfs = list(map(pd.read_parquet, raw_files))
df = pd.concat(dfs, ignore_index=True)

# Keep only the columns used in the rest of the notebook, in a fixed order.
eod_columns = ['date', 'symbol', 'open', 'high', 'low', 'close', 'volume', 'adj_close']
df = df[eod_columns]
df.head()

In [None]:
# Row and column counts of the combined raw data.
df.shape

In [None]:
# Parse the date column as timezone-aware (UTC) timestamps.
df["date"] = pd.to_datetime(df["date"], utc=True)

# Order rows by symbol, then chronologically, and renumber the index.
sorted_df = df.sort_values(by=['symbol', 'date'])
df = sorted_df.reset_index(drop=True)

# Persist the combined frame so later cells can reload it.
df.to_parquet("../data/processed/combined_eod.parquet", index=False)
df.head()

In [None]:
# Binary target: 1 if next close > current close, else 0.
# The next close is kept as its own Series: comparing NaN with a number yields
# False, so after the int cast the last row of each symbol would look like a
# "down" day and a plain dropna() could no longer identify it.
next_close = df.groupby("symbol")["close"].shift(-1)
df["target"] = (next_close > df["close"]).astype(int)
df = df[next_close.notna()].dropna()  # Drop rows without future data (and rows with any other NaN)
df["target"].value_counts()  / len(df) # Check balance

# Feature Engineering

In [None]:
# Select the AAPL rows and work on an independent copy, so the feature columns
# added below never touch the combined frame.
is_aapl = df["symbol"].eq("AAPL")
aapl = df.loc[is_aapl]
aapl_df = aapl.copy()
aapl_df.head()

## Return, ranges

In [None]:
# Daily high-low range, open-to-close move and close-to-close simple return.
aapl_df['high_low'] = aapl_df['high'].sub(aapl_df['low'])
aapl_close = aapl_df['close']
aapl_df['close_open'] = aapl_close.sub(aapl_df['open'])
aapl_df['return'] = aapl_close.pct_change()
aapl_df.head()

## Lagged features

In [None]:
# Lagged returns: row t receives the return observed `day` rows earlier.
lag_days = [1, 2, 5, 10]

for day in lag_days:
    lag_column = f'return_lag_{day}'
    aapl_df[lag_column] = aapl_df['return'].shift(periods=day)

aapl_df.head()

## Rolling features

In [None]:
# Rolling mean and standard deviation of the return over each window size.
rolling_windows = [5, 10]

for window in rolling_windows:
    rolled_return = aapl_df['return'].rolling(window)
    aapl_df[f'return_roll_mean_{window}'] = rolled_return.mean()
    aapl_df[f'return_roll_std_{window}'] = rolled_return.std()

aapl_df.head()

## Time features

In [None]:
# Calendar features derived from the date column through one shared accessor.
aapl_dates = aapl_df['date'].dt
aapl_df['day_of_week'] = aapl_dates.dayofweek
aapl_df['month'] = aapl_dates.month
aapl_df['day_of_month'] = aapl_dates.day
aapl_df['quarter'] = aapl_dates.quarter
aapl_df['is_quarter_end'] = aapl_dates.is_quarter_end.astype(int)
aapl_df.head()

## Technical features

In [None]:
# Simple moving average (SMA)
aapl_df['sma_10'] = aapl_df['close'].rolling(window=10, min_periods=10).mean()
aapl_df['sma_20'] = aapl_df['close'].rolling(window=20, min_periods=20).mean()

# Relative Strength Index (RSI)
window = 14
delta = aapl_df['close'].diff()

# Keep gains/losses as Series on aapl_df's own index. aapl_df still carries the
# row labels of the combined frame, so a Series rebuilt from a bare numpy array
# (fresh 0..n-1 index) would be aligned to the wrong rows on assignment.
# A NaN delta (first row) fails both comparisons and therefore counts as 0.
gain = delta.where(delta > 0, 0.0)
loss = (-delta).where(delta < 0, 0.0)

avg_gain = gain.rolling(window=window, min_periods=window).mean()
avg_loss = loss.rolling(window=window, min_periods=window).mean()

# A window without any loss gives RS = NaN (and so RSI = NaN) instead of a
# division by zero.
rs = avg_gain / (avg_loss.replace(0, np.nan))
aapl_df['rsi_14'] = 100 - (100 / (1 + rs))

# --- EMA ---
aapl_df['ema_12'] = aapl_df['close'].ewm(span=12, adjust=False).mean()
aapl_df['ema_26'] = aapl_df['close'].ewm(span=26, adjust=False).mean()

# --- MACD and Signal Line ---
aapl_df['macd'] = aapl_df['ema_12'] - aapl_df['ema_26']
aapl_df['macd_signal'] = aapl_df['macd'].ewm(span=9, adjust=False).mean()
aapl_df.head()

In [None]:
# 1) Skip the first rows, where the rolling/lag features are still warming up.
W_DROP = 20
print(aapl_df.shape)
aapl_df_handled_nan = aapl_df.iloc[W_DROP:].copy()
print(aapl_df_handled_nan.shape)

# 2) Remove every row that still contains a NaN, then renumber the index.
rows_without_nan = aapl_df_handled_nan.dropna()
aapl_df_handled_nan = rows_without_nan.reset_index(drop=True)
print(aapl_df_handled_nan.shape)
print(aapl_df_handled_nan.isna().sum())

In [None]:
def create_target(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create the target variable indicating whether the next day's closing
    price is higher than the current day's.

    Rows are sorted by symbol and date, and the "next" close is looked up
    within each symbol. The last row of every symbol has no next close and is
    removed: comparing NaN with a number yields False, so without this explicit
    filter that row would silently be labelled 0.

    Args:
        df (pd.DataFrame): Input DataFrame with 'symbol', 'date' and 'close'
            columns. The caller's frame is not modified.

    Returns:
        pd.DataFrame: Sorted copy with an added 'target' column
            (1 if next close > current close, else 0). The last row of each
            symbol and any row containing NaN are dropped, and the index is
            renumbered from 0.

    Examples:
        >>> df = pd.DataFrame({
        ...     'symbol': ['AAPL', 'AAPL'],
        ...     'date': pd.to_datetime(['2020-01-01', '2020-01-02']),
        ...     'close': [100, 105]
        ... })
        >>> result = create_target(df)
        >>> result['target'].tolist()
        [1]
    """
    df = df.sort_values(["symbol", "date"]).reset_index(drop=True)

    # Keep the shifted close separately so rows without a future close can
    # still be identified after the boolean comparison.
    next_close = df.groupby("symbol")["close"].shift(-1)
    df["target"] = (next_close > df["close"]).astype(int)

    df = df[next_close.notna()].dropna()
    # Renumber so positional helpers downstream see a gap-free 0..n-1 index.
    return df.reset_index(drop=True)


def create_range_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add price-range and simple-return columns.

    The columns are written onto the frame that is passed in, and that same
    frame is returned.

    Args:
        df (pd.DataFrame): Input DataFrame with 'high', 'low', 'close' and
            'open' columns.

    Returns:
        pd.DataFrame: The input DataFrame with three added columns:
            'high_low' (high minus low), 'close_open' (close minus open) and
            'return' (row-over-row percentage change of close).

    Examples:
        >>> df = pd.DataFrame({
        ...     'high': [110, 115],
        ...     'low': [100, 105],
        ...     'close': [105, 110],
        ...     'open': [100, 105]
        ... })
        >>> result = create_range_features(df)
        >>> result[['high_low', 'close_open']].iloc[0].tolist()
        [10, 5]
    """
    df["high_low"] = df["high"].sub(df["low"])

    close = df["close"]
    df["close_open"] = close.sub(df["open"])
    # The first row has no previous close, so its return is NaN.
    df["return"] = close.pct_change()
    return df


def create_lag_features(df: pd.DataFrame, lag_days: list[int]) -> pd.DataFrame:
    """
    Add one lagged copy of the 'return' column per requested lag.

    The columns are written onto the frame that is passed in, and that same
    frame is returned.

    Args:
        df (pd.DataFrame): Input DataFrame with a 'return' column.
        lag_days (List[int]): Lags, in rows, to generate (e.g., [1, 2, 5]).

    Returns:
        pd.DataFrame: The input DataFrame with a 'return_lag_<n>' column for
            every n in lag_days; the first n rows of each column are NaN.

    Examples:
        >>> df = pd.DataFrame({'return': [0.01, 0.02, 0.03]})
        >>> result = create_lag_features(df, [1])
        >>> result['return_lag_1'].tolist()
        [nan, 0.01, 0.02]
    """
    for day in lag_days:
        column = f"return_lag_{day}"
        # A positive shift moves values down, so each row sees an earlier return.
        df[column] = df["return"].shift(periods=day)
    return df


def create_rolling_features(
    df: pd.DataFrame, rolling_windows: list[int]
) -> pd.DataFrame:
    """
    Add rolling mean and standard deviation of the 'return' column.

    The columns are written onto the frame that is passed in, and that same
    frame is returned.

    Args:
        df (pd.DataFrame): Input DataFrame with a 'return' column.
        rolling_windows (List[int]): Window sizes, in rows (e.g., [5, 10]).

    Returns:
        pd.DataFrame: The input DataFrame with 'return_roll_mean_<w>' and
            'return_roll_std_<w>' columns for every w in rolling_windows.

    Examples:
        >>> df = pd.DataFrame({'return': [0.01, 0.02, 0.03, 0.04, 0.05]})
        >>> result = create_rolling_features(df, [3])
        >>> round(float(result['return_roll_mean_3'].iloc[2]), 4)
        0.02
    """
    for window in rolling_windows:
        rolled = df["return"].rolling(window)
        df[f"return_roll_mean_{window}"] = rolled.mean()
        df[f"return_roll_std_{window}"] = rolled.std()
    return df


def create_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add calendar features derived from the 'date' column.

    The columns are written onto the frame that is passed in, and that same
    frame is returned.

    Args:
        df (pd.DataFrame): Input DataFrame with a datetime 'date' column.

    Returns:
        pd.DataFrame: The input DataFrame with 'day_of_week' (Monday = 0),
            'month', 'day_of_month', 'quarter' and 'is_quarter_end' (0/1).

    Examples:
        >>> df = pd.DataFrame({'date': pd.to_datetime(['2020-01-01'])})
        >>> result = create_time_features(df)
        >>> result[['day_of_week', 'month']].iloc[0].tolist()
        [2, 1]
    """
    dates = df["date"].dt
    df["day_of_week"] = dates.dayofweek
    df["month"] = dates.month
    df["day_of_month"] = dates.day
    df["quarter"] = dates.quarter
    # Stored as 0/1 rather than bool.
    df["is_quarter_end"] = dates.is_quarter_end.astype(int)
    return df


def create_sma_features(
    df: pd.DataFrame, windows: Sequence[int] = (10, 20)
) -> pd.DataFrame:
    """
    Creates Simple Moving Average (SMA) features of the close price.

    The columns are written onto the frame that is passed in, and that same
    frame is returned.

    Args:
        df (pd.DataFrame): Input DataFrame with close.
        windows (Sequence[int], optional): Window sizes, in rows.
            Defaults to (10, 20), which yields 'sma_10' and 'sma_20'.

    Returns:
        pd.DataFrame: DataFrame with one added 'sma_<w>' column per window.
            A value is NaN until a full window of closes is available.

    Examples:
        >>> df = pd.DataFrame({'close': [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110]})
        >>> result = create_sma_features(df)
        >>> float(result['sma_10'].iloc[9])
        104.5
    """
    for window in windows:
        # min_periods == window: no partial averages during warm-up.
        df[f"sma_{window}"] = (
            df["close"].rolling(window=window, min_periods=window).mean()
        )
    return df


def create_rsi_features(df: pd.DataFrame, window: int = 14) -> pd.DataFrame:
    """
    Creates Relative Strength Index (RSI) feature from the close price.

    Average gain and average loss are simple rolling means of the positive and
    negative close-to-close changes. The column is written onto the frame that
    is passed in, and that same frame is returned.

    Args:
        df (pd.DataFrame): Input DataFrame with close. Any index is supported;
            values stay aligned with their original rows.
        window (int, optional): Window size for RSI calculation. Defaults to 14.

    Returns:
        pd.DataFrame: DataFrame with an added 'rsi_<window>' column
            ('rsi_14' by default). The value is NaN until a full window is
            available and for windows that contain no loss at all.

    Examples:
        >>> df = pd.DataFrame({'close': [1.0, 2.0, 1.0, 2.0]})
        >>> result = create_rsi_features(df, window=2)
        >>> result['rsi_2'].tolist()
        [nan, nan, 50.0, 50.0]
    """
    delta = df["close"].diff()

    # Stay on df's own index: a Series rebuilt from a bare numpy array gets a
    # fresh 0..n-1 index and would be misaligned on assignment whenever df is
    # not indexed 0..n-1. A NaN delta (first row) fails both comparisons and
    # therefore counts as 0 on both sides.
    gain = delta.where(delta > 0, 0.0)
    loss = (-delta).where(delta < 0, 0.0)

    avg_gain = gain.rolling(window=window, min_periods=window).mean()
    avg_loss = loss.rolling(window=window, min_periods=window).mean()

    # A zero average loss becomes NaN so the ratio is undefined rather than a
    # division by zero.
    rs = avg_gain / avg_loss.replace(0, np.nan)
    df[f"rsi_{window}"] = 100 - (100 / (1 + rs))
    return df


def create_ema_macd_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add exponential moving averages of the close and the MACD indicator.

    The columns are written onto the frame that is passed in, and that same
    frame is returned.

    Args:
        df (pd.DataFrame): Input DataFrame with a 'close' column.

    Returns:
        pd.DataFrame: The input DataFrame with 'ema_12' and 'ema_26'
            (span 12 / span 26 EMAs of close), 'macd' (ema_12 minus ema_26)
            and 'macd_signal' (span 9 EMA of macd).

    Examples:
        >>> df = pd.DataFrame({'close': [10.0, 361.0]})
        >>> result = create_ema_macd_features(df)
        >>> result['macd'].round(6).tolist()
        [0.0, 28.0]
    """
    close = df["close"]

    # adjust=False selects the recursive EMA form, seeded with the first value.
    df["ema_12"] = close.ewm(span=12, adjust=False).mean()
    df["ema_26"] = close.ewm(span=26, adjust=False).mean()

    macd_line = df["ema_12"].sub(df["ema_26"])
    df["macd"] = macd_line
    df["macd_signal"] = macd_line.ewm(span=9, adjust=False).mean()
    return df


def _build_symbol_features(
    group: pd.DataFrame,
    lag_days: Sequence[int],
    rolling_windows: Sequence[int],
    warmup_rows: int,
) -> pd.DataFrame:
    """Add every feature to one symbol's rows and drop that symbol's warm-up rows."""
    # A fresh 0..n-1 index gives the helpers an independent frame to write to
    # and keeps positional computations aligned with the rows.
    group = group.reset_index(drop=True)
    group = create_range_features(group)
    group = create_lag_features(group, lag_days)
    group = create_rolling_features(group, rolling_windows)
    group = create_time_features(group)
    group = create_sma_features(group)
    group = create_rsi_features(group)
    group = create_ema_macd_features(group)
    return group.iloc[warmup_rows:]


def create_features(
    df: pd.DataFrame,
    lag_days: Sequence[int] = (1, 2, 5, 10),
    rolling_windows: Sequence[int] = (5, 10),
    warmup_rows: int = 20,
) -> pd.DataFrame:
    """
    Applies all feature creation functions to the DataFrame and handles NaNs.

    The target is created first; every other feature is then computed
    separately for each symbol, so returns, lags, rolling statistics and
    moving averages never mix prices of two different symbols. The warm-up
    rows are likewise dropped per symbol.

    Args:
        df (pd.DataFrame): Input DataFrame with required columns
            ('symbol', 'date', 'open', 'high', 'low', 'close').
        lag_days (Sequence[int], optional): Lag days for lagged features.
            Defaults to (1, 2, 5, 10).
        rolling_windows (Sequence[int], optional): Rolling window sizes.
            Defaults to (5, 10).
        warmup_rows (int, optional): Number of leading rows removed from each
            symbol because rolling/lag features are not yet complete there.
            Defaults to 20.

    Returns:
        pd.DataFrame: DataFrame with all features added,
                        the warm-up rows of each symbol dropped,
                        NaNs removed and the index renumbered from 0.
    """
    df = create_target(df)

    # One frame per symbol, in order of first appearance. An input without any
    # rows has no groups; it is passed through as a single empty frame so the
    # feature columns are still created.
    symbol_frames = [frame for _, frame in df.groupby("symbol", sort=False)] or [df]
    featured = [
        _build_symbol_features(frame, lag_days, rolling_windows, warmup_rows)
        for frame in symbol_frames
    ]
    df = pd.concat(featured, ignore_index=True)

    # drop any leftover NaNs
    return df.dropna().reset_index(drop=True)

In [None]:
# Sanity check: the function pipeline should reproduce the manually built AAPL frame.
# NOTE(review): `aapl` already carries the 'target' column from the target cell
# above and create_features recomputes it, so the comparison is only meaningful
# while both paths label and drop the same rows.
aapl_df_to_compare_sanity = create_features(aapl.copy())
aapl_df_to_compare_sanity.equals(aapl_df_handled_nan)

In [None]:
# Reload the combined end-of-day data written earlier in this notebook.
combined_eod_path = "../data/processed/combined_eod.parquet"
df = pd.read_parquet(combined_eod_path)

# Build the features for the full set (before any splitting), working on a
# copy so the freshly loaded frame stays as read.
df_input = df.copy()
df_features = create_features(df_input)
df_features.shape

In [None]:
# Write the engineered feature table to disk; the index is not stored.
df_features.to_parquet("../data/feature/stock_eod_features.parquet", index=False)