# In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib

# In [4]:
# ---------------------------
# Paths
# ---------------------------
# Input directory: `engineer_features` reads `<ticker>.parquet` from here.
# Both paths are relative, so they resolve against the current working directory.
UNIFIED_DIR = Path("../../data/unified/")
# Output directory for the engineered feature tables and the fitted scalers.
OUT_DIR = Path("../../data/features/")
# Runs at import time: create the output directory (and any missing parents);
# an already existing directory is not an error.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------------------------
# Technical indicator functions
# ---------------------------
def compute_ema(series, span):
    """Return the exponential moving average of ``series``.

    The average is computed with pandas' exponentially weighted window in
    its recursive form (``adjust=False``), with the decay given by ``span``.

    Args:
        series: pandas Series of values to smooth.
        span: Span of the exponential window, forwarded to ``Series.ewm``.

    Returns:
        Series of smoothed values produced by ``Series.ewm(...).mean()``.
    """
    weighted_window = series.ewm(span=span, adjust=False)
    return weighted_window.mean()


def compute_rsi(series, window=14):
    """Compute a Relative Strength Index from simple rolling averages.

    Average gain and average loss are plain rolling means of the positive
    and negative one-step changes (not Wilder's exponential smoothing).

    Args:
        series: pandas Series of values (e.g. closing prices).
        window: Number of one-step changes averaged per RSI value.

    Returns:
        Series of ``100 - 100 / (1 + avg_gain / avg_loss)``. The first
        ``window`` entries are NaN because the rolling mean needs a full
        window of changes. A window with no losses evaluates to 100; a
        window with no change at all is 0/0 and therefore NaN.
    """
    change = series.diff()
    ups = change.clip(lower=0)
    downs = change.clip(upper=0)

    avg_gain = ups.rolling(window).mean()
    # Downward moves are <= 0, so negate their mean to get a loss magnitude.
    avg_loss = -downs.rolling(window).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))


def compute_macd(series, fast_span=12, slow_span=26):
    """Return the MACD line: fast EMA minus slow EMA of ``series``.

    Args:
        series: pandas Series of values (e.g. closing prices).
        fast_span: EMA span of the fast leg. Defaults to 12.
        slow_span: EMA span of the slow leg. Defaults to 26.

    Returns:
        Series holding ``compute_ema(series, fast_span)`` minus
        ``compute_ema(series, slow_span)``. With the default spans this is
        the same 12/26 difference the function always produced.
    """
    fast_ema = compute_ema(series, fast_span)
    slow_ema = compute_ema(series, slow_span)
    return fast_ema - slow_ema

# ---------------------------
# Feature engineering
# ---------------------------
def _add_technical_indicators(df):
    """Add EMA, RSI, MACD, log-return and 20-row volatility columns to ``df`` in place."""
    close = df["close"]

    for span in (34, 89, 200):
        df[f"ema_{span}"] = compute_ema(close, span)

    df["rsi_14"] = compute_rsi(close)
    df["macd"] = compute_macd(close)

    df["log_return"] = np.log(close).diff()
    df["vol_20"] = df["log_return"].rolling(20).std()


def _add_lag_features(df, lags):
    """Add ``close_lag_<n>`` and ``ret_lag_<n>`` columns to ``df`` in place for each lag."""
    for lag in lags:
        df[f"close_lag_{lag}"] = df["close"].shift(lag)
        df[f"ret_lag_{lag}"] = df["log_return"].shift(lag)


def engineer_features(ticker: str, train_frac: float = 1.0) -> pd.DataFrame:
    """Build, scale and persist the feature table for one ticker.

    Reads ``UNIFIED_DIR/<ticker>.parquet``, adds technical indicators and
    lagged close/return columns, drops every row containing a NaN,
    standardises the numeric columns and writes both the fitted scaler
    (``<ticker>_scaler.pkl``) and the feature table
    (``<ticker>_features.parquet``) to ``OUT_DIR``.

    Args:
        ticker: File stem of the input parquet; also used to name the outputs.
        train_frac: Fraction, in ``(0, 1]``, of the leading rows used to fit
            the scaler. The default ``1.0`` fits on every row, which is the
            historical behaviour but lets statistics of later rows influence
            the scaling of earlier ones. Pass the training share of the
            downstream split to avoid that look-ahead. This assumes the
            stored rows are in chronological order -- TODO confirm against
            the unified data.

    Returns:
        The scaled feature DataFrame that was written to disk.

    Raises:
        ValueError: If ``train_frac`` is outside ``(0, 1]`` or if no rows
            remain after dropping NaNs.
    """
    if not 0.0 < train_frac <= 1.0:
        raise ValueError(f"train_frac must be in (0, 1], got {train_frac!r}")

    df = pd.read_parquet(UNIFIED_DIR / f"{ticker}.parquet")

    # ---- Technicals ----
    _add_technical_indicators(df)

    # ---- Lag features ----
    _add_lag_features(df, (1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144))

    # ---- Drop incomplete rows ----
    # Rolling windows and shifts leave NaNs in the warm-up rows (the longest
    # lag is 144 steps); any row with a NaN in any column is removed.
    df = df.dropna()
    if df.empty:
        raise ValueError(
            f"No complete rows left for {ticker!r} after dropping NaNs; "
            "the input is too short for the longest lag/rolling window."
        )

    # ---- Scaling ----
    # Every float64/int64 column except ``next_close`` is standardised.
    # Index.difference returns the names sorted, which fixes the column
    # order the saved scaler expects.
    numeric_features = (
        df
        .select_dtypes(include=["float64", "int64"])
        .columns
        .difference(["next_close"])
    )

    # Fit on the leading ``train_frac`` share only, then transform all rows.
    # With train_frac == 1.0 this equals a plain fit_transform on the frame.
    n_fit = max(1, int(len(df) * train_frac))
    scaler = StandardScaler()
    scaler.fit(df[numeric_features].iloc[:n_fit])
    df[numeric_features] = scaler.transform(df[numeric_features])

    joblib.dump(
        scaler,
        OUT_DIR / f"{ticker}_scaler.pkl"
    )

    # ---- Save ----
    df.to_parquet(
        OUT_DIR / f"{ticker}_features.parquet",
        engine="pyarrow"
    )

    return df


# ---------------------------
# Script entry point
# ---------------------------
# When the file is executed directly (not imported), build and save the
# features for the hard-coded "BTC" ticker; no command-line arguments are read.
if __name__ == "__main__":
    engineer_features("BTC")