In [None]:
# --- data_prep.py ---
# Prepare LONG and SHORT trade datasets for deep learning.
#
# Design:
# - Outer split is **time-based**:
#     * sort by timestamp
#     * earliest ~80% = dev (train+valid), latest ~20% = test
# - Inner split within dev is **random**:
#     * train ≈ 60% of all data
#     * valid ≈ 20% of all data
#     * test ≈ 20% of all data (strictly later in time)
#
# - LONG and SHORT are split separately, but:
#     * a **single unified scaler** is fitted on (LONG_train ∪ SHORT_train)
#     * the same scaler is applied to all splits of both directions
#
# Outputs (per direction):
#   prepared/
#       long_train.npz / long_valid.npz / long_test.npz
#       short_train.npz / short_valid.npz / short_test.npz
#
# Each NPZ contains:
#   X_series:   (N, 160, 18)   # time-series features (standardized)
#   X_snapshot: (N, 17)        # snapshot features (standardized)
#   y:          (N,)           # net PnL in USD (float32)
#   sample_id, timestamp, direction, exit_reason: string arrays
#
# Also saves:
#   prepared/scaler_unified.json

from pathlib import Path
import json
import numpy as np
import pandas as pd

# -------------------- Global constants --------------------

SEED = 42  # base seed for reproducibility (each direction's split offsets it)

# Window length (time steps) and feature channels must match the logger in NinjaTrader.
T = 160  # time steps per sample window
F = 18   # time-series feature channels
S = 17   # snapshot features

# CSV inputs (exported by your NinjaTrader strategy)
DATA_DIR = Path("./data")  # change if needed
LONG_CSV  = DATA_DIR / "STDL_LongLog_20_row_sample.csv"
SHORT_CSV = DATA_DIR / "STDL_ShortLog_20_row_sample.csv"

# Output directory for prepared datasets (created eagerly so later saves cannot
# fail on a missing folder)
OUT_DIR = Path("./prepared")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Base names for the 18 time-series feature channels (must match CSV header).
# Each base expands to T columns named "<base>_1" .. "<base>_T".
SERIES_BASES = [
    "series_log_ret_1",
    "series_tr_range_pct",
    "series_vol_rel20",
    "series_atr14_pct",
    "series_dist_rma1_atr",
    "series_dist_rma2_atr",
    "series_rma1_slope_norm",
    "series_rma2_slope_norm",
    "series_dist_st_upper_atr",
    "series_dist_st_lower_atr",
    "series_st_bandwidth_atr",
    "series_rsi14_scaled",
    "series_macd_hist",
    "series_body_to_range",
    "series_dist_ema20_atr",
    "series_dist_ema50_atr",
    "series_ema20_slope_norm",
    "series_ema50_slope_norm",
]

# Snapshot feature names (order matters; will be standardized per-feature)
SNAPSHOT_COLS = [
    "order_offset_ticks", "minutes_from_open",
    "dow_mon", "dow_tue", "dow_wed", "dow_thu", "dow_fri",
    "vol_regime20", "ret_since_session_open_pct",
    "bars_since_last_st_flip", "st_flips_today",
    "htf15_dist_ema50_atr", "htf15_dist_ema200_atr", "htf15_rsi14_scaled",
    "htf60_dist_ema50_atr", "htf60_dist_ema200_atr", "htf60_rsi14_scaled",
]

# Meta and label columns (kept for reference; labels used for y)
META_COLS = [
    "sample_id", "timestamp", "instrument", "session_date", "direction"
]
LABEL_COLS = [
    "filled", "exit_reason", "entry_price", "exit_price", "quantity",
    "pnl_ticks_gross", "pnl_usd_gross", "pnl_ticks_net", "pnl_usd_net"
]

# Fail fast if the declared dimensions drift from the column lists above:
# F and S are stored in the scaler JSON, so a mismatch would silently describe
# tensors that do not have those widths.
if len(SERIES_BASES) != F:
    raise ValueError(
        f"F={F} but SERIES_BASES lists {len(SERIES_BASES)} channels."
    )
if len(SNAPSHOT_COLS) != S:
    raise ValueError(
        f"S={S} but SNAPSHOT_COLS lists {len(SNAPSHOT_COLS)} features."
    )

print("Paths set. Ready to load CSVs.")


# -------------------- Helper functions --------------------

def build_series_columns():
    """
    List every time-series column name, grouped channel by channel.

    For each base name in SERIES_BASES (in list order) the names
    "<base>_1" .. "<base>_T" are produced, so the result holds
    len(SERIES_BASES) * T entries: all steps of the first channel, then all
    steps of the second channel, and so on.
    """
    return [
        f"{base}_{step}"
        for base in SERIES_BASES
        for step in range(1, T + 1)
    ]


def load_direction_csv(csv_path: Path) -> pd.DataFrame:
    """
    Load a single-direction CSV (already filtered to LONG or SHORT by NinjaTrader).

    Steps:
    - read the file, asking pandas to parse "timestamp" and "session_date" as dates
    - verify that every meta, snapshot, label and time-series column is present
    - sort by "timestamp" ascending and renumber the index from 0

    The sort is stable, so rows that share a timestamp keep their order from
    the file. The time-based split indexes rows by position, which makes a
    deterministic tie order part of reproducibility.

    Args:
        csv_path: path of the CSV to read.

    Returns:
        The loaded DataFrame, oldest row first, with a fresh 0..N-1 index.

    Raises:
        ValueError: if required columns are missing. The message lists at most
            the first 10 missing names.
    """
    df = pd.read_csv(
        csv_path,
        low_memory=False,
        parse_dates=["timestamp", "session_date"],
    )

    # Same reporting order as the column groups: meta, snapshot, labels, series.
    required = META_COLS + SNAPSHOT_COLS + LABEL_COLS + build_series_columns()
    present = set(df.columns)
    missing = [col for col in required if col not in present]

    if missing:
        raise ValueError(
            f"Missing columns in {csv_path}: "
            f"{missing[:10]}{'...' if len(missing) > 10 else ''}"
        )

    # Chronological ascending; mergesort is stable, so ties keep file order.
    return df.sort_values("timestamp", kind="mergesort").reset_index(drop=True)


def dataframe_to_arrays(df: pd.DataFrame):
    """
    Convert a loaded trade DataFrame into model inputs, labels and metadata.

    Returns a 4-tuple:
       X_series   float64 array, shape (N, T, F) - one (N, T) slab per base
                  name in SERIES_BASES, stacked along the last axis
       X_snapshot float64 array, shape (N, S) - the SNAPSHOT_COLS columns
       y          float64 array, shape (N,) - the "pnl_usd_net" column
       meta       dict of string arrays: sample_id, timestamp, direction,
                  exit_reason
    """
    def _float_block(columns):
        # Pull the given columns out as a float64 matrix.
        return df[columns].to_numpy(dtype=np.float64)

    def _text(series):
        # Stringify a column so it can be stored as a plain string array.
        return series.astype(str).to_numpy()

    # One (N, T) matrix per channel, joined on a new trailing channel axis.
    per_channel = [
        _float_block([f"{base}_{step}" for step in range(1, T + 1)])
        for base in SERIES_BASES
    ]
    X_series = np.stack(per_channel, axis=2)          # (N, T, F)

    X_snapshot = _float_block(SNAPSHOT_COLS)          # (N, S)
    y = df["pnl_usd_net"].to_numpy(dtype=np.float64)  # (N,)

    # "direction" falls back to a column of "NA" when the frame lacks it.
    direction = df.get("direction", pd.Series(["NA"] * len(df)))
    meta = {
        "sample_id":  _text(df["sample_id"]),
        "timestamp":  _text(df["timestamp"]),
        "direction":  _text(direction),
        "exit_reason": _text(df["exit_reason"]),
    }
    return X_series, X_snapshot, y, meta


def time_then_random_split_indices(
    N: int,
    train_ratio_total: float = 0.6,
    valid_ratio_total: float = 0.2,
    seed: int = 42,
):
    """
    Time-based outer split + random inner split.

    The data is assumed to be sorted by time ascending (index 0 is the oldest
    sample, N-1 the newest).

    - dev_frac = train_ratio_total + valid_ratio_total (e.g. 0.8)
    - dev part  = the first round(N * dev_frac) indices, clamped to [2, N]
    - test part = every index after the dev part (may be empty for small N)
    - inside dev, a seeded random permutation is cut so that train receives
      about train_ratio_total / dev_frac of the dev samples, with at least
      one sample each for train and valid

    Args:
        N: total number of samples; must be at least 2.
        train_ratio_total: train share of the whole dataset.
        valid_ratio_total: validation share of the whole dataset.
        seed: seed for the permutation of the dev indices.

    Returns:
        train_idx, valid_idx, test_idx (each a 1D int array of indices).
        train_idx and valid_idx are in permuted order; test_idx is ascending.

    Raises:
        ValueError: if dev_frac is not strictly between 0 and 1, or if N < 2.
    """
    dev_frac = train_ratio_total + valid_ratio_total
    if dev_frac <= 0.0 or dev_frac >= 1.0:
        raise ValueError("dev_frac must be between 0 and 1 (exclusive).")

    # dev is forced to hold 2 samples below, so a smaller N would produce
    # indices that point past the end of the data.
    if N < 2:
        raise ValueError(
            f"Need at least 2 samples to build train and valid splits, got N={N}."
        )

    dev_size = int(round(N * dev_frac))
    dev_size = max(min(dev_size, N), 2)  # at least 2 samples in dev

    test_idx = np.arange(dev_size, N)  # strictly later in time

    # Random split within dev (the earlier period, indices 0..dev_size-1)
    rng = np.random.default_rng(seed)
    perm = rng.permutation(dev_size)

    # Train share within dev
    train_frac_dev = train_ratio_total / dev_frac
    n_train = int(round(dev_size * train_frac_dev))
    n_train = max(min(n_train, dev_size - 1), 1)  # at least 1 train, 1 valid

    train_idx = perm[:n_train]
    valid_idx = perm[n_train:]

    return train_idx, valid_idx, test_idx


def fit_scalers(X_series_train: np.ndarray, X_snapshot_train: np.ndarray):
    """
    Derive z-score parameters from the training arrays.

    Series statistics are reduced over axes 0 and 1 (samples and time), giving
    one mean/std per channel. Snapshot statistics are reduced over axis 0,
    giving one mean/std per feature. Any std below 1e-8 is replaced by 1.0 so
    that a later division leaves (near-)constant features unscaled instead of
    blowing up.

    Returns:
        mu_series, std_series, mu_snap, std_snap
    """
    def _moments(values, axes):
        # Mean and floor-guarded standard deviation over the given axes.
        center = values.mean(axis=axes)
        spread = values.std(axis=axes)
        return center, np.where(spread < 1e-8, 1.0, spread)

    mu_series, std_series = _moments(X_series_train, (0, 1))  # (F,), (F,)
    mu_snap, std_snap = _moments(X_snapshot_train, 0)         # (S,), (S,)

    return mu_series, std_series, mu_snap, std_snap


def apply_scalers(
    X_series: np.ndarray,
    X_snapshot: np.ndarray,
    mu_series,
    std_series,
    mu_snap,
    std_snap,
):
    """
    Standardize series and snapshot arrays with precomputed mean/std vectors.

    The series parameters are reshaped to (1, 1, -1) and the snapshot
    parameters to (1, -1), so they broadcast along the last axis of their
    respective input. Both results are returned as float32.
    """
    series_shift = mu_series.reshape(1, 1, -1)
    series_scale = std_series.reshape(1, 1, -1)
    snap_shift = mu_snap.reshape(1, -1)
    snap_scale = std_snap.reshape(1, -1)

    scaled_series = (X_series - series_shift) / series_scale
    scaled_snapshot = (X_snapshot - snap_shift) / snap_scale
    return scaled_series.astype(np.float32), scaled_snapshot.astype(np.float32)


def save_npz(path: Path, X_series, X_snapshot, y, meta: dict):
    """
    Write one dataset split to a compressed NPZ archive.

    Stored entries: X_series and X_snapshot as given, y cast to float32, and
    the four metadata arrays sample_id, timestamp, direction and exit_reason
    taken from `meta`.
    """
    payload = {
        "X_series": X_series,
        "X_snapshot": X_snapshot,
        "y": y.astype(np.float32),
    }
    # Copy over only the metadata fields that belong in the archive.
    for field in ("sample_id", "timestamp", "direction", "exit_reason"):
        payload[field] = meta[field]

    np.savez_compressed(path, **payload)


def save_scaler_json(path: Path, mu_series, std_series, mu_snap, std_snap):
    """
    Persist the scaler parameters as indented JSON.

    The file records the dimensions T, F and S, the channel and snapshot
    column names, and the four mean/std vectors as plain lists, so the same
    standardization can be reproduced at inference.
    """
    payload = {
        "T": T,
        "F": F,
        "S": S,
        "series_bases": SERIES_BASES,
        "snapshot_cols": SNAPSHOT_COLS,
    }
    # Arrays are appended in a fixed order and converted to JSON-friendly lists.
    vectors = (
        ("mu_series", mu_series),
        ("std_series", std_series),
        ("mu_snapshot", mu_snap),
        ("std_snapshot", std_snap),
    )
    payload.update((key, vec.tolist()) for key, vec in vectors)

    path.write_text(json.dumps(payload, indent=2))


# -------------------- Load LONG and SHORT raw CSVs --------------------

# LONG: read the CSV, then unpack it into tensors, labels and metadata.
print("\n=== Loading LONG CSV ===")
df_long = load_direction_csv(LONG_CSV)
n_rows_long = len(df_long)
print(f"Loaded LONG: {n_rows_long:,} rows")
arrays_long = dataframe_to_arrays(df_long)
X_series_L, X_snap_L, y_L, meta_L = arrays_long
print("LONG raw shapes:", *(arr.shape for arr in arrays_long[:3]))

# SHORT: same steps on the short-side log.
print("\n=== Loading SHORT CSV ===")
df_short = load_direction_csv(SHORT_CSV)
n_rows_short = len(df_short)
print(f"Loaded SHORT: {n_rows_short:,} rows")
arrays_short = dataframe_to_arrays(df_short)
X_series_S, X_snap_S, y_S, meta_S = arrays_short
print("SHORT raw shapes:", *(arr.shape for arr in arrays_short[:3]))


# -------------------- Time-based split for each direction --------------------

# Both directions use the same 60/20/20 proportions; only the seed differs.
split_ratios = {"train_ratio_total": 0.6, "valid_ratio_total": 0.2}

# LONG
idx_tr_L, idx_va_L, idx_te_L = time_then_random_split_indices(
    len(df_long), seed=SEED + 1, **split_ratios
)
long_counts = (len(idx_tr_L), len(idx_va_L), len(idx_te_L))
print(
    f"\nLONG split -> "
    f"train: {long_counts[0]}, valid: {long_counts[1]}, test: {long_counts[2]}"
)

# SHORT
idx_tr_S, idx_va_S, idx_te_S = time_then_random_split_indices(
    len(df_short), seed=SEED + 2, **split_ratios
)
short_counts = (len(idx_tr_S), len(idx_va_S), len(idx_te_S))
print(
    f"SHORT split -> "
    f"train: {short_counts[0]}, valid: {short_counts[1]}, test: {short_counts[2]}"
)


# -------------------- Fit a unified scaler on LONG_train ∪ SHORT_train --------------------

# Pool only the training rows of both directions (np.concatenate joins along
# axis 0 by default), so validation and test data never influence the scaler.
X_series_train_all = np.concatenate((X_series_L[idx_tr_L], X_series_S[idx_tr_S]))
X_snap_train_all = np.concatenate((X_snap_L[idx_tr_L], X_snap_S[idx_tr_S]))

mu_series, sd_series, mu_snap, sd_snap = fit_scalers(
    X_series_train=X_series_train_all,
    X_snapshot_train=X_snap_train_all,
)

print("\nUnified scaler fitted on LONG_train ∪ SHORT_train.")
print("Series mean shape:", mu_series.shape, "std shape:", sd_series.shape)
print("Snapshot mean shape:", mu_snap.shape, "std shape:", sd_snap.shape)

# -------------------- Apply unified scaler and save NPZs --------------------

# The same four parameter vectors are used for every split of both directions.
scaler_params = (mu_series, sd_series, mu_snap, sd_snap)


def _meta_rows(meta, idx):
    """Select the rows `idx` from every metadata array in `meta`."""
    return {key: values[idx] for key, values in meta.items()}


# LONG: standardize each split, then write it out.
Xs_tr_L, Xp_tr_L = apply_scalers(
    X_series_L[idx_tr_L], X_snap_L[idx_tr_L], *scaler_params
)
Xs_va_L, Xp_va_L = apply_scalers(
    X_series_L[idx_va_L], X_snap_L[idx_va_L], *scaler_params
)
Xs_te_L, Xp_te_L = apply_scalers(
    X_series_L[idx_te_L], X_snap_L[idx_te_L], *scaler_params
)

save_npz(
    OUT_DIR / "long_train.npz",
    Xs_tr_L, Xp_tr_L, y_L[idx_tr_L], _meta_rows(meta_L, idx_tr_L),
)
save_npz(
    OUT_DIR / "long_valid.npz",
    Xs_va_L, Xp_va_L, y_L[idx_va_L], _meta_rows(meta_L, idx_va_L),
)
save_npz(
    OUT_DIR / "long_test.npz",
    Xs_te_L, Xp_te_L, y_L[idx_te_L], _meta_rows(meta_L, idx_te_L),
)

print("Saved LONG datasets.")

# SHORT: identical treatment with the short-side arrays and indices.
Xs_tr_S, Xp_tr_S = apply_scalers(
    X_series_S[idx_tr_S], X_snap_S[idx_tr_S], *scaler_params
)
Xs_va_S, Xp_va_S = apply_scalers(
    X_series_S[idx_va_S], X_snap_S[idx_va_S], *scaler_params
)
Xs_te_S, Xp_te_S = apply_scalers(
    X_series_S[idx_te_S], X_snap_S[idx_te_S], *scaler_params
)

save_npz(
    OUT_DIR / "short_train.npz",
    Xs_tr_S, Xp_tr_S, y_S[idx_tr_S], _meta_rows(meta_S, idx_tr_S),
)
save_npz(
    OUT_DIR / "short_valid.npz",
    Xs_va_S, Xp_va_S, y_S[idx_va_S], _meta_rows(meta_S, idx_va_S),
)
save_npz(
    OUT_DIR / "short_test.npz",
    Xs_te_S, Xp_te_S, y_S[idx_te_S], _meta_rows(meta_S, idx_te_S),
)

print("Saved SHORT datasets.")

# Save unified scaler
save_scaler_json(OUT_DIR / "scaler_unified.json", *scaler_params)
print("Saved unified scaler to scaler_unified.json")


# -------------------- Quick sanity checks --------------------

def npz_info(path: Path):
    """
    Print a one-line summary of a prepared NPZ file.

    The line shows the file name, the shapes of X_series, X_snapshot and y,
    and the mean and standard deviation of y formatted to four decimals.
    """
    # Read what is needed while the archive is open; the values stay usable
    # after it is closed.
    with np.load(path) as archive:
        series_shape = archive["X_series"].shape
        snapshot_shape = archive["X_snapshot"].shape
        labels = archive["y"]

    shapes = f"X_series={series_shape}, X_snapshot={snapshot_shape}, y={labels.shape}, "
    stats = f"y_mean={labels.mean():.4f}, y_std={labels.std():.4f}"
    print(f"{path.name}: " + shapes + stats)


print("\n=== Sanity check on prepared NPZ files ===")
# Visit the six archives in the order long train/valid/test, then short
# train/valid/test.
for p in (
    OUT_DIR / f"{side}_{part}.npz"
    for side in ("long", "short")
    for part in ("train", "valid", "test")
):
    npz_info(p)

print(
    "\nDone. Datasets are ready with a **single unified scaler**. "
    "You can now train the unified LONG+SHORT classifier notebook on these NPZ files."
)


Paths set. Ready to load CSVs.

=== Loading LONG CSV ===
Loaded LONG: 2,308 rows
LONG raw shapes: (2308, 160, 18) (2308, 17) (2308,)

=== Loading SHORT CSV ===
Loaded SHORT: 2,317 rows
SHORT raw shapes: (2317, 160, 18) (2317, 17) (2317,)

LONG split -> train: 1384, valid: 462, test: 462
SHORT split -> train: 1390, valid: 464, test: 463

Unified scaler fitted on LONG_train ∪ SHORT_train.
Series mean shape: (18,) std shape: (18,)
Snapshot mean shape: (17,) std shape: (17,)
Saved LONG datasets.
Saved SHORT datasets.
Saved unified scaler to scaler_unified.json

=== Sanity check on prepared NPZ files ===
long_train.npz: X_series=(1384, 160, 18), X_snapshot=(1384, 17), y=(1384,), y_mean=1.8388, y_std=134.9398
long_valid.npz: X_series=(462, 160, 18), X_snapshot=(462, 17), y=(462,), y_mean=10.3098, y_std=146.8674
long_test.npz: X_series=(462, 160, 18), X_snapshot=(462, 17), y=(462,), y_mean=9.3164, y_std=175.2634
short_train.npz: X_series=(1390, 160, 18), X_snapshot=(1390, 17), y=(1390,), y_me