## This code combines the price dataset with the economic indicators dataset and the Fed speech embeddings dataset

### Dataset unifying and merging

In [1]:
from pathlib import Path 
import pandas as pd 
import numpy as np

In [3]:
# ---------------------------
# Paths
# ---------------------------
# All paths are relative, so they resolve against the current working
# directory of the process that runs this code.

# Template for a ticker's price CSV; "{ticker}" is filled in with str.format
# by build_unified_dataset.
PRICE_CSV = Path("../../data/raw/{ticker}.csv")
# Directory scanned for "*.csv" economic indicator files.
ECO_DIR = Path("../../data/raw/economic/")
# Directory scanned for "*_embeddings.npz" Fed embedding archives.
FED_EMB_DIR = Path("../../data/processed/daily_embeddings/")
# Directory that receives the merged "{ticker}.parquet" output.
OUT_DIR = Path("../../data/unified/")

# Create the output directory (and any missing parents) up front so the
# parquet write cannot fail on a missing folder; a no-op if it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------------------------
# Helper loaders
# ---------------------------
def load_economic_indicators(eco_dir=None) -> pd.DataFrame:
    """Load every economic-indicator CSV into one date-indexed frame.

    Each ``*.csv`` file in the directory is read with its ``date`` column
    parsed and used as the index. The files' remaining columns are placed
    side by side on the union of all dates, and gaps are forward-filled so
    that every row carries the latest value seen at or before its date.

    Args:
        eco_dir: Directory to scan. Defaults to the module-level ``ECO_DIR``.

    Returns:
        A DataFrame indexed by ascending ``date``, or an empty DataFrame when
        the directory contains no CSV files.
    """
    eco_dir = ECO_DIR if eco_dir is None else Path(eco_dir)

    # glob() yields paths in filesystem order, which varies between machines;
    # sorting makes the resulting column order reproducible.
    frames = [
        pd.read_csv(csv_path, parse_dates=["date"]).set_index("date").sort_index()
        for csv_path in sorted(eco_dir.glob("*.csv"))
    ]

    if not frames:
        return pd.DataFrame()

    eco = pd.concat(frames, axis=1)
    # ffill() is positional, so make sure the union index is chronological
    # before filling; otherwise values could be carried from the wrong rows.
    return eco.sort_index().ffill()

def load_fed_embeddings(emb_dir=None) -> pd.DataFrame:
    """Load the Fed embedding archives into one date-indexed frame.

    Every ``*_embeddings.npz`` file in the directory must contain a ``date``
    entry and an ``embedding`` entry. Each archive becomes one row whose
    columns ``fed_emb_0``, ``fed_emb_1``, ... hold the elements obtained by
    iterating over ``embedding`` (assumed to be a 1-D vector -- TODO confirm
    against the code that writes these archives).

    Args:
        emb_dir: Directory to scan. Defaults to the module-level
            ``FED_EMB_DIR``.

    Returns:
        A DataFrame indexed by ascending ``date``, or an empty DataFrame when
        no archive is found.
    """
    emb_dir = FED_EMB_DIR if emb_dir is None else Path(emb_dir)

    records = []
    # Sorted so the files are always processed in the same order.
    for npz_path in sorted(emb_dir.glob("*_embeddings.npz")):
        # np.load on a .npz returns an NpzFile that keeps the file open; the
        # context manager closes it once both arrays have been read.
        with np.load(npz_path) as data:
            date = pd.to_datetime(data["date"])
            emb = data["embedding"]

        record = {"date": date}
        record.update({f"fed_emb_{i}": v for i, v in enumerate(emb)})
        records.append(record)

    if not records:
        return pd.DataFrame()

    return pd.DataFrame(records).set_index("date").sort_index()

# ---------------------------
# Main builder
# ---------------------------
def _align_to_dates(frame, dates):
    """Return *frame* on *dates*, each date taking the latest row at or before it."""
    if frame.empty:
        # Nothing to align; the caller joins the empty frame as-is.
        return frame
    frame = frame.sort_index()
    # reindex(method=...) needs unique labels; keep the last row per date.
    frame = frame[~frame.index.duplicated(keep="last")]
    # Fill gaps inside the frame first, then pick the most recent row for
    # every target date. Only past rows are used, never future ones.
    return frame.ffill().reindex(dates.unique(), method="ffill")


def build_unified_dataset(ticker: str):
    """Merge prices, economic indicators and Fed embeddings for one ticker.

    The price CSV for *ticker* defines the date range. Economic indicators
    and Fed embeddings are aligned onto those dates "as of": each price date
    receives the most recent observation dated on or before it, so values
    published on days without a price row are not lost. Only these feature
    columns are forward-filled (never backward-filled); price columns are
    left exactly as read.

    A ``next_close`` label (the following row's ``close``) is added, and rows
    without a label -- such as the final row -- are dropped. The result is
    written to ``OUT_DIR / "<ticker>.parquet"``.

    Args:
        ticker: Symbol substituted into the ``PRICE_CSV`` path template.

    Returns:
        The merged, labelled DataFrame that was written to disk.
    """
    # ---- Prices ----
    price_path = Path(str(PRICE_CSV).format(ticker=ticker))
    prices = pd.read_csv(price_path, parse_dates=["date"])
    prices = prices.set_index("date").sort_index()

    # ---- Economic indicators ----
    eco = _align_to_dates(load_economic_indicators(), prices.index)

    # ---- Fed embeddings ----
    fed = _align_to_dates(load_fed_embeddings(), prices.index)

    # ---- Merge left to keep exactly the price dataset's date range ----
    df = prices.join(eco, how="left")
    df = df.join(fed, how="left")

    # ---- Label ----
    df["next_close"] = df["close"].shift(-1)
    df = df.dropna(subset=["next_close"])

    # ---- Save ----
    out_path = OUT_DIR / f"{ticker}.parquet"
    df.to_parquet(out_path, engine="pyarrow")

    return df


# ---------------------------
# CLI
# ---------------------------
# Build the unified dataset for the hard-coded "BTC" ticker when this code is
# executed as the main module (it does not run on import).
if __name__ == "__main__":
    build_unified_dataset("BTC")