In [1]:
 # =========================
# 0) SETUP — Imports & GPU
# =========================
import os, math, warnings
import numpy as np
import pandas as pd
from typing import Tuple, List, Dict, Optional

# Silence benign runtime warnings from comparisons on NaNs
warnings.filterwarnings("ignore", category=RuntimeWarning)

# GPU (CuPy) optional
# `xp` is the array module used by later cells: CuPy when everything in the
# try-block succeeds, NumPy otherwise. `GPU_AVAILABLE` records which one it is.
# Note that ANY exception raised inside the try-block — including a failure
# while querying or printing device properties — triggers the NumPy fallback.
try:
    import cupy as cp
    GPU_AVAILABLE = True
    xp = cp
    n_gpus = cp.cuda.runtime.getDeviceCount()
    print(f"CuPy active — {n_gpus} GPU(s)")
    for i in range(n_gpus):
        props = cp.cuda.runtime.getDeviceProperties(i)
        # 'name' is decoded from bytes; 'totalGlobalMem' is reported in GB (bytes / 1e9).
        print(f"GPU {i}: {props['name'].decode()} | {(props['totalGlobalMem']/1e9):.1f} GB")
except Exception:
    GPU_AVAILABLE = False
    xp = np
    print("CuPy not available — using CPU (NumPy)")

# ADF test (optional)
# When statsmodels cannot be imported, `adfuller` is set to None so callers
# can test for it instead of failing at import time.
try:
    from statsmodels.tsa.stattools import adfuller
except Exception:
    adfuller = None

CuPy active — 2 GPU(s)
GPU 0: Tesla T4 | 15.8 GB
GPU 1: Tesla T4 | 15.8 GB


In [2]:
import kagglehub

# Download latest version
# `path` is whatever kagglehub.dataset_download returns; it is printed below
# as the location of the dataset files.
path = kagglehub.dataset_download("arashnic/stock-data-intraday-minute-bar")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/stock-data-intraday-minute-bar


In [3]:
# 1) LOAD & NORMALIZE SPXUSD minute CSV
# ======================================
# Build the CSV location from the dataset directory returned by kagglehub
# (`path`, defined in the download cell) instead of hard-coding the Kaggle
# mount point, so the notebook also runs where the dataset is cached elsewhere.
FILE = os.path.join(
    path,
    "pyfinancialdata/data/stocks/histdata/SPXUSD/DAT_ASCII_SPXUSD_M1_2018.csv",
)

# The file is semicolon-separated with no header row.
df = pd.read_csv(FILE, sep=';', header=None,
                 names=['datetime','open','high','low','close','volume'])

# Unparseable timestamps become NaT (and are dropped below); parsed values are UTC-aware.
df['timestamp'] = pd.to_datetime(df['datetime'], format='%Y%m%d %H%M%S',
                                 errors='coerce', utc=True)
# Non-numeric volume is coerced to NaN and then filled with 0.
df['volume'] = pd.to_numeric(df['volume'], errors='coerce').fillna(0.0)
df['symbol'] = 'SPXUSD'

# `raw` is the cleaned, time-sorted frame every later cell works from:
# rows missing a timestamp or any OHLC value are removed.
raw = (df.dropna(subset=['timestamp','open','high','low','close'])
       .sort_values('timestamp')
       .reset_index(drop=True)[['timestamp','open','high','low','close','volume','symbol']])

print(raw.dtypes)
print("range & shape:", raw['timestamp'].min(), raw['timestamp'].max(), raw.shape)

timestamp    datetime64[ns, UTC]
open                     float64
high                     float64
low                      float64
close                    float64
volume                     int64
symbol                    object
dtype: object
range & shape: 2018-01-01 16:30:00+00:00 2018-12-31 16:13:00+00:00 (310381, 7)


In [4]:
# 2) SAVE to Feather
# ======================
# Persist the normalized minute bars; the target path is built once and reused
# for both the write and the confirmation message.
OUT_DIR = "/kaggle/working"
raw_feather_path = f"{OUT_DIR}/spxusd_m1_2018.feather"
raw.to_feather(raw_feather_path)
print("Saved:", raw_feather_path)

Saved: /kaggle/working/spxusd_m1_2018.feather


In [5]:
# 3.1) VOLUME BARS
# ======================
def _agg_rows_to_bar(rows: pd.DataFrame) -> pd.Series:
    o = rows['open'].iloc[0]
    h = rows['high'].max()
    l = rows['low'].min()
    c = rows['close'].iloc[-1]
    v = rows['volume'].sum()
    ts = rows['timestamp'].iloc[-1]
    return pd.Series({'timestamp': ts, 'open': o, 'high': h, 'low': l, 'close': c, 'volume': v})

def make_volume_bars(df: pd.DataFrame, threshold: float) -> pd.DataFrame:
    """Aggregate rows into bars that each accumulate at least `threshold` volume.

    Rows are consumed in their existing order. A bar is closed as soon as the
    running volume reaches `threshold`; any trailing rows form a final
    (possibly under-filled) bar.

    A row whose volume is zero or missing (NaN/None) counts as 1.0 unit, so a
    feed without volume information degrades to fixed-row-count bars. Earlier,
    a NaN volume was added as-is, which turned the running sum into NaN and
    prevented every later bar from closing.

    Parameters
    ----------
    df : frame with the columns required by `_agg_rows_to_bar`.
    threshold : cumulative volume that closes a bar.

    Returns
    -------
    DataFrame with one row per bar (columns as produced by `_agg_rows_to_bar`);
    empty when `df` is empty.
    """
    vols = df['volume'].to_numpy(dtype=float)
    # Zero / missing volume -> one placeholder unit.
    vols = np.where(np.isnan(vols) | (vols == 0.0), 1.0, vols)

    bars = []
    running, start = 0.0, 0
    # Walk plain floats and aggregate positional slices of `df`, instead of
    # materialising a Series per row and rebuilding a DataFrame per bar.
    for pos, vol in enumerate(vols.tolist()):
        running += vol
        if running >= threshold:
            bars.append(_agg_rows_to_bar(df.iloc[start:pos + 1]))
            running, start = 0.0, pos + 1
    if start < len(df):
        bars.append(_agg_rows_to_bar(df.iloc[start:]))
    return pd.DataFrame(bars)

# Bar size = VOLUME_BAR_MULTIPLE x the median non-zero minute volume.
VOLUME_BAR_MULTIPLE = 2000
_nonzero_median = raw['volume'].replace(0, np.nan).median(skipna=True)
# When no row has non-zero volume the median is NaN. The earlier expression
# `max(1.0, nan or 1.0)` returned 1.0 only because max() keeps its first
# argument when every comparison with NaN is False; make the fallback explicit.
median_vol = max(1.0, float(_nonzero_median)) if pd.notna(_nonzero_median) else 1.0
vol_bars = make_volume_bars(raw, threshold=VOLUME_BAR_MULTIPLE * median_vol)
print("volume bars shape:", vol_bars.shape)
print(vol_bars.head())

volume bars shape: (156, 6)
                  timestamp     open     high      low    close  volume
0 2018-01-04 04:58:00+00:00  2668.00  2715.75  2668.00  2715.50       0
1 2018-01-08 13:02:00+00:00  2715.25  2747.50  2713.00  2744.00       0
2 2018-01-10 20:03:00+00:00  2744.13  2759.75  2736.50  2753.50       0
3 2018-01-14 23:57:00+00:00  2753.25  2797.00  2747.75  2794.75       0
4 2018-01-17 06:13:00+00:00  2794.88  2808.25  2769.25  2791.00       0


In [6]:
# ======================
# 3.2) DOLLAR BARS
# ======================
def make_dollar_bars(df: pd.DataFrame, threshold: float) -> pd.DataFrame:
    """Aggregate rows into bars that each accumulate at least `threshold` traded value.

    The traded value of a row is `close * volume`. A row whose volume is zero
    or missing (NaN/None) counts as 1.0 unit of volume; previously a NaN volume
    poisoned the running sum so that no later bar could close.

    Rows with a NaN close are not handled here and would still make the
    running sum NaN — drop them beforehand.

    Parameters
    ----------
    df : frame with the columns required by `_agg_rows_to_bar`.
    threshold : cumulative `close * volume` that closes a bar.

    Returns
    -------
    DataFrame with one row per bar; trailing rows form a final, possibly
    under-filled bar. Empty when `df` is empty.
    """
    vols = df['volume'].to_numpy(dtype=float)
    # Zero / missing volume -> one placeholder unit.
    vols = np.where(np.isnan(vols) | (vols == 0.0), 1.0, vols)
    dollars = df['close'].to_numpy(dtype=float) * vols

    bars = []
    running, start = 0.0, 0
    # Iterate plain floats and aggregate positional slices of `df` rather than
    # rebuilding a DataFrame from per-row Series for every bar.
    for pos, value in enumerate(dollars.tolist()):
        running += value
        if running >= threshold:
            bars.append(_agg_rows_to_bar(df.iloc[start:pos + 1]))
            running, start = 0.0, pos + 1
    if start < len(df):
        bars.append(_agg_rows_to_bar(df.iloc[start:]))
    return pd.DataFrame(bars)

# Dollar threshold = median close x median volume unit x 5000
# (`median_vol` comes from the volume-bars cell).
median_close = float(raw['close'].median())
dollar_threshold = median_close * median_vol * 5000
dollar_bars = make_dollar_bars(raw, threshold=dollar_threshold)
print("dollar bars shape:", dollar_bars.shape)
print(dollar_bars.head())

dollar bars shape: (63, 6)
                  timestamp     open     high     low    close  volume
0 2018-01-09 20:59:00+00:00  2668.00  2759.75  2668.0  2749.00       0
1 2018-01-17 05:25:00+00:00  2748.88  2808.25  2736.5  2787.50       0
2 2018-01-23 16:57:00+00:00  2787.38  2844.50  2780.5  2838.00       0
3 2018-01-30 03:54:00+00:00  2837.88  2878.25  2825.5  2842.50       0
4 2018-02-04 23:03:00+00:00  2842.25  2845.50  2733.0  2746.88       0


In [7]:
# 3.3) INFORMATION (VOLATILITY) BARS (GPU)
# ===========================================
def make_volatility_bars(df: pd.DataFrame, sigma_thresh: float = 0.9, lookback: int = 60) -> pd.DataFrame:
    """Close a bar whenever recent realised volatility reaches `sigma_thresh`.

    Log returns are tracked between consecutive rows of the bar currently being
    built (the first row of a bar contributes no return). Once at least
    `lookback` returns exist, the standard deviation of the latest `lookback`
    returns is scaled by sqrt(252*24*60) and compared with `sigma_thresh`;
    reaching it closes the bar and resets both the rows and the returns.

    The standard deviation is computed with the module-level `xp`
    (CuPy or NumPy), once per row.

    Returns
    -------
    DataFrame with one row per bar; leftover rows form a final bar.
    """
    eps = 1e-12                              # keeps the log ratio finite at zero prices
    annualizer = math.sqrt(252*24*60)        # annualized
    bars = []
    pending = []                             # rows of the bar under construction
    log_rets = []                            # returns observed inside that bar

    for _, current in df.iterrows():
        if pending:
            last_close = pending[-1]['close']
            log_rets.append(math.log((current['close'] + eps)/(last_close + eps)))
        pending.append(current)

        if len(log_rets) < lookback:
            continue

        window = xp.array(log_rets[-lookback:], dtype=float)
        sigma = float(xp.std(window)) * annualizer
        if sigma >= sigma_thresh:
            bars.append(_agg_rows_to_bar(pd.DataFrame(pending)))
            pending, log_rets = [], []

    if pending:
        bars.append(_agg_rows_to_bar(pd.DataFrame(pending)))
    return pd.DataFrame(bars)

# Build volatility-triggered bars from the minute data: a bar closes when the
# scaled std of the last 60 in-bar log returns reaches 0.9.
info_bars = make_volatility_bars(raw, sigma_thresh=0.9, lookback=60)
print("info bars shape:", info_bars.shape)
print(info_bars.head())

info bars shape: (17, 6)
                  timestamp     open     high      low    close  volume
0 2018-02-05 15:12:00+00:00  2668.00  2878.25  2635.75  2669.75       0
1 2018-02-05 16:13:00+00:00  2671.13  2696.50  2604.50  2620.75       0
2 2018-02-05 20:58:00+00:00  2619.75  2634.00  2542.00  2551.25       0
3 2018-02-06 07:26:00+00:00  2551.50  2644.25  2529.00  2602.25       0
4 2018-02-06 09:32:00+00:00  2602.50  2614.38  2573.00  2614.25       0


In [8]:
# 4) TRIPLE-BARRIER LABELING (final fixed)
# ==============================================
def get_daily_vol(close: pd.Series, span: int = 50, day_minutes: int = 1440) -> pd.Series:
    """Exponentially weighted volatility of `day_minutes`-row simple returns.

    The lag is applied with `Series.shift`, i.e. it is counted in ROWS, not in
    clock time: it corresponds to `day_minutes` minutes only when `close` has
    one row per minute with no gaps.

    Parameters
    ----------
    close : price series.
    span : span of the exponentially weighted standard deviation.
    day_minutes : number of rows between the two prices of each return.

    Returns
    -------
    Series aligned with `close`; leading entries are NaN until returns exist.
    """
    lagged = close.shift(day_minutes)
    simple_ret = close.div(lagged).sub(1)
    return simple_ret.ewm(span=span).std()

def get_events(close: pd.Series, daily_vol: pd.Series, pt_sl: Tuple[float,float],
               min_ret: float, num_minutes: int) -> pd.DataFrame:
    """Build the event table: one event per timestamp whose volatility target exceeds `min_ret`.

    Parameters
    ----------
    close, pt_sl : accepted for signature compatibility; not used here.
    daily_vol : per-timestamp volatility estimate, used as the target width.
    min_ret : events with a target at or below this value (or NaN) are dropped.
    num_minutes : horizon added to each event start to get the vertical barrier.

    Returns
    -------
    DataFrame indexed by event start with columns
    't1' (start + `num_minutes` minutes) and 'trgt' (the volatility target).
    """
    above_floor = daily_vol > min_ret
    trgt = daily_vol.copy()[above_floor].dropna()
    horizon = pd.Timedelta(minutes=num_minutes)
    return pd.DataFrame({'t1': trgt.index + horizon, 'trgt': trgt})

def apply_pt_sl_on_t1(close: pd.Series, events: pd.DataFrame, pt_sl: Tuple[float,float]) -> pd.DataFrame:
    """Find the first barrier touched by each event.

    For every event start `t0` present in `close.index`, the upper/lower
    barriers are `close[t0] * (1 ± multiplier * trgt)`; a multiplier <= 0
    disables that barrier. The price path from `t0` to the event's vertical
    barrier `t1` (inclusive, NaNs ignored) is scanned and `t1` is replaced by
    the earliest of: first upper touch, first lower touch, vertical barrier.

    Parameters
    ----------
    close : price series indexed by timestamp.
    events : frame indexed by event start with columns 't1' and 'trgt'.
    pt_sl : (profit-taking multiplier, stop-loss multiplier).

    Returns
    -------
    DataFrame indexed like `events` with columns
    't1', 'trgt', 'up_price', 'dn_price'. The last three are always present
    (NaN for events whose start is not in `close.index`, whose 't1' is left
    unchanged), so the result has a stable schema even for empty `events`.
    """
    out = events[['t1']].copy()
    # Create the output columns up front: previously they only appeared once
    # the loop body ran, so an empty event table produced a frame without them.
    for col in ('trgt', 'up_price', 'dn_price'):
        out[col] = np.nan

    pt, sl = pt_sl
    for t0, e in events.iterrows():
        if t0 not in close.index:
            continue
        p0 = close.loc[t0]
        up = p0 * (1 + pt*e['trgt']) if pt > 0 else np.nan
        dn = p0 * (1 - sl*e['trgt']) if sl > 0 else np.nan
        path = close.loc[t0:e['t1']].dropna()

        # .index.min() of an empty selection is NaT, i.e. "barrier never touched".
        hit_up = path[path >= up].index.min() if pd.notna(up) and len(path) else pd.NaT
        hit_dn = path[path <= dn].index.min() if pd.notna(dn) and len(path) else pd.NaT
        touches = [x for x in (hit_up, hit_dn, e['t1']) if pd.notna(x)]

        # With no touch and a missing vertical barrier there is nothing to
        # pick; keep NaT rather than failing on min([]).
        out.loc[t0, 't1'] = min(touches) if touches else pd.NaT
        out.loc[t0, 'trgt'] = e['trgt']
        out.loc[t0, 'up_price'] = up
        out.loc[t0, 'dn_price'] = dn
    return out

def get_bins(close: pd.Series, events: pd.DataFrame, pt_sl: Tuple[float,float]) -> pd.DataFrame:
    """Label each event with its realised return and triple-barrier outcome.

    The exit time of every event is the first barrier touch computed by
    `apply_pt_sl_on_t1`. When that exit time is not itself a timestamp of
    `close` (the vertical barrier fell into a data gap), the last bar at or
    before it is used as the exit bar. Previously such events were given a
    fabricated return of 0.0.

    Labels: +1 if the exit price is at/above the upper barrier, -1 if at/below
    the lower barrier, otherwise 0. Events that cannot be priced at all
    (missing exit time, start not in `close`, or no bar between start and exit)
    get ret = 0.0 and bin = 0.

    Returns
    -------
    DataFrame indexed like `events` with columns 'ret', 'bin', 'trgt', 't1'
    ('t1' is the barrier-touch time as returned by `apply_pt_sl_on_t1`).
    """
    ev = apply_pt_sl_on_t1(close, events, pt_sl)
    # Guarantee the columns read below exist even if no event was processed.
    ev = ev.reindex(columns=['t1', 'trgt', 'up_price', 'dn_price'])

    index = close.index
    can_asof = index.is_monotonic_increasing   # searchsorted needs a sorted index

    rets, labels = [], []
    for t0, t1, up, dn in zip(ev.index, ev['t1'], ev['up_price'], ev['dn_price']):
        exit_ts = None
        if pd.notna(t1) and t0 in index:
            if t1 in index:
                exit_ts = t1
            elif can_asof:
                # Last bar at or before t1 (t1 lies in a gap of the price series).
                pos = index.searchsorted(t1, side='right') - 1
                if pos >= 0 and index[pos] >= t0:
                    exit_ts = index[pos]

        if exit_ts is None:
            rets.append(0.0)
            labels.append(0.0)
            continue

        val = close.loc[exit_ts]
        rets.append((val / close.loc[t0]) - 1)
        if pd.notna(up) and not pd.isna(val) and val >= up:
            labels.append(1.0)
        elif pd.notna(dn) and not pd.isna(val) and val <= dn:
            labels.append(-1.0)
        else:
            labels.append(0.0)

    # Build the frame once instead of growing it cell by cell with .loc.
    out = pd.DataFrame({'ret': rets, 'bin': labels}, index=ev.index)
    return out.join(ev[['trgt','t1']])

# Apply on SPXUSD close series
PT_SL = (1, 1)             # profit-taking / stop-loss multipliers of the target
HORIZON_MINUTES = 720      # 12h

close = raw.set_index(pd.DatetimeIndex(raw['timestamp']))['close'].sort_index()
dvol = get_daily_vol(close, span=50, day_minutes=1440)
# Only timestamps whose volatility target exceeds the median become events.
min_ret = float(dvol.median())
events = get_events(close, dvol, pt_sl=PT_SL, min_ret=min_ret, num_minutes=HORIZON_MINUTES)
bins = get_bins(close, events, pt_sl=PT_SL)

print(bins['bin'].value_counts(dropna=False))
print(bins.head())

bin
 1.0    76457
-1.0    73413
 0.0     4600
Name: count, dtype: int64
                                ret  bin      trgt                        t1
timestamp                                                                   
2018-01-03 11:48:00+00:00  0.002308  1.0  0.002146 2018-01-03 15:46:00+00:00
2018-01-03 11:49:00+00:00  0.002032  1.0  0.001921 2018-01-03 15:38:00+00:00
2018-01-03 11:50:00+00:00  0.001847  1.0  0.001760 2018-01-03 15:36:00+00:00
2018-01-03 11:51:00+00:00  0.001662  1.0  0.001591 2018-01-03 12:51:00+00:00
2018-01-03 11:52:00+00:00  0.001478  1.0  0.001477 2018-01-03 12:49:00+00:00


In [9]:
# 4.5) PRIMARY MODEL → produces p̂_i = P(y=+1 | X)
#     (simple baseline using raw OHLCV features for demo)
# ===========================================================
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# align features (X) with label timestamps (bins.index)
raw_idx = raw.set_index('timestamp').sort_index()
bins = bins.sort_index()

# Features are the raw OHLCV values of the bar at each event start.
X = raw_idx.loc[bins.index, ['open','high','low','close','volume']]
y = bins['bin'].fillna(0)           # true labels in {-1,0,+1}
# Both -1 and 0 map to class 0, so the model separates "up" from "not up".
y_model = y.replace(-1, 0)          # convert to {0,1} for logistic regression

print("Shapes:", X.shape, y_model.shape, "index_equal:", X.index.equals(y_model.index))

# shuffle=False keeps row order: the first 70% of rows train, the last 30% test.
# NOTE(review): training events whose t1 extends past the split are not purged
# here, so their label windows can overlap the test period — confirm this is
# acceptable for the demo.
X_train, X_test, y_train, y_test = train_test_split(X, y_model, test_size=0.3, shuffle=False)

primary = LogisticRegression(max_iter=1000)
primary.fit(X_train, y_train)

# p̂_i (probability of UP = class 1)
primary_probs = pd.Series(primary.predict_proba(X_test)[:,1], index=y_test.index)
print("primary_probs:", primary_probs.shape)

Shapes: (154470, 5) (154470,) index_equal: True
primary_probs: (46341,)


In [10]:
# 5) META-LABELING
# =========================
def meta_label(primary_probs: pd.Series, primary_labels: pd.Series, prob_thresh: float = 0.5) -> pd.Series:
    """Return 1 where the primary model's up/not-up call matches the realised label, else 0.

    Parameters
    ----------
    primary_probs : predicted probability of the "up" class.
    primary_labels : realised labels; values > 0 count as "up", everything else as "not up".
    prob_thresh : probability at or above which the model is taken to predict "up".

    Both series must carry identical index labels (they are compared with `==`).
    """
    actual_up = primary_labels.gt(0).astype(int)          # +1 -> 1, else 0
    predicted_up = primary_probs.ge(prob_thresh).astype(int)
    agrees = actual_up == predicted_up
    return agrees.astype(int)

# Meta-labels are computed on the test window only: the realised labels `y`
# are restricted to the timestamps for which primary_probs exists.
meta = meta_label(primary_probs, y.loc[primary_probs.index], prob_thresh=0.5)
print("meta distribution:\n", meta.value_counts(normalize=True))

meta distribution:
 1    0.511599
0    0.488401
Name: proportion, dtype: float64


In [11]:
# Map each event [t0, t1] to integer indices over close_index (inclusive)
def _events_to_spans(events: pd.DataFrame, close_index: pd.DatetimeIndex):
    t0 = events.index
    t1 = events['t1'].reindex(t0)
    s = np.searchsorted(close_index.values, t0.values)
    e = np.searchsorted(close_index.values, t1.values, side='right') - 1  # inclusive
    # clip to index range & drop invalid
    valid = (s >= 0) & (e >= s) & (e < len(close_index))
    s, e = s[valid], e[valid]
    idx = t0.values[valid]
    lengths = (e - s + 1).astype(np.int64)
    inv_len = 1.0 / lengths
    return idx, s, e, lengths, inv_len


In [12]:
# Build an array of concurrency counts for a set of events using prefix-sum (O(T + |set|))
def _concurrency_from_spans(T: int, starts: np.ndarray, ends: np.ndarray) -> np.ndarray:
    diff = np.zeros(T + 1, dtype=np.int32)
    np.add.at(diff, starts, 1)
    np.add.at(diff, ends + 1, -1)
    w = np.cumsum(diff[:-1])  # shape (T,)
    return w


In [13]:
def _mean_uniqueness_for_set(T, starts, ends):
    """Average uniqueness of each span within the given set of spans.

    For every bar t, b[t] = 1 / (number of spans covering t), or 0 where no
    span is active. A span's uniqueness is the mean of b over its bars, obtained
    from prefix sums of b.

    Returns
    -------
    (u, w, b, csb, lengths): per-span uniqueness, per-bar concurrency,
    per-bar 1/concurrency, prefix sums of b, and per-span bar counts.
    """
    w = _concurrency_from_spans(T, starts, ends)

    # 1 / concurrency on covered bars; uncovered bars stay at 0 so nothing is
    # ever divided by zero.
    covered = w > 0
    b = np.zeros_like(w, dtype=float)
    b[covered] = 1.0 / w[covered]

    # Prefix sums turn "sum of b over [start, end]" into two lookups.
    csb = np.cumsum(b)
    before_start = np.where(starts > 0, csb[starts - 1], 0.0)
    span_total = csb[ends] - before_start

    lengths = (ends - starts + 1).astype(np.int64)
    u = span_total / lengths
    return u, w, b, csb, lengths


In [14]:
def sequential_bootstrap_fast(events: pd.DataFrame,
                              close_index: pd.DatetimeIndex,
                              size: int) -> List[pd.Timestamp]:
    """Greedily pick up to `size` events so that the selected set's average uniqueness stays high.

    Selection rule, as implemented below:
      * the first pick is the event with the longest span (all candidates have
        uniqueness 1 against an empty set, so length is the tie-breaker);
      * every later pick is the not-yet-selected event that maximises
        (total_metric + adj + u_c) / (k + 1), i.e. the average mean-uniqueness
        of the selected set after adding that candidate.

    Each iteration rebuilds the concurrency array and its prefix sums over all
    `T` bars from the currently selected spans, then scores every remaining
    candidate with prefix-sum lookups.

    Parameters
    ----------
    events : frame indexed by event start with a 't1' column
        (converted to bar spans by `_events_to_spans`).
    close_index : timestamps of the price bars.
    size : maximum number of events to select.

    Returns
    -------
    List of the selected events' identifiers — the first element of
    `_events_to_spans`' return value — in selection order; empty when no event
    maps to a valid span.

    NOTE(review): the choice is a deterministic argmax with no random draws,
    which differs from a probability-weighted sequential bootstrap — confirm
    the name matches the intent.
    """
    # ---------- preprocess once ----------
    idx, s_all, e_all, lengths_all, invlen_all = _events_to_spans(events, close_index)
    T = len(close_index)
    N = len(idx)
    if N == 0:
        return []

    # containers for the selected set (by integer positions)
    chosen_mask = np.zeros(N, dtype=bool)
    chosen = []              # list of timestamps to return
    chosen_starts = []
    chosen_ends   = []
    chosen_invlen = []

    # maintain current totals
    total_metric = 0.0       # = sum of mean uniqueness over selected events
    k = 0                    # |S|

    # Per-iteration arrays are rebuilt from the chosen spans with prefix sums.
    while k < size and k < N:
        # ---------- build per-iteration arrays from current chosen set ----------
        if k == 0:
            # no selected events yet
            w = np.zeros(T, dtype=int)
            b = np.zeros(T, dtype=float)
            csA = csWD = None    # will not be used until k>=1
            total_metric = 0.0
            # for wlen (sum of 1/len_i over active events), zero array
            wlen = np.zeros(T, dtype=float)
        else:
            # concurrency counts for selected
            sel_s = np.asarray(chosen_starts)
            sel_e = np.asarray(chosen_ends)
            w = _concurrency_from_spans(T, sel_s, sel_e)

            # wlen[t] = sum of 1/len_i over active selected events
            diff = np.zeros(T + 1, dtype=float)
            np.add.at(diff, sel_s, chosen_invlen)
            np.add.at(diff, sel_e + 1, -np.array(chosen_invlen))
            wlen = np.cumsum(diff[:-1])

            # b = 1/w on active bars
            b = np.zeros(T, dtype=float)
            nz = w > 0
            b[nz] = 1.0 / w[nz]
            # a = 1/(w+1) everywhere: per-bar uniqueness once one more event is added
            a = 1.0 / (w + 1.0)
            d = np.zeros(T, dtype=float)           # d = a - b where w>0, else 0
            d[nz] = a[nz] - b[nz]

            # prefix sums for fast segment queries
            csA  = np.cumsum(a)                    # for candidate's own uniqueness
            csWD = np.cumsum(wlen * d)             # for selected set adjustment

            # recompute current total_metric (= sum of mean uniqueness of selected)
            # vectorized using prefix sums of b
            csB = np.cumsum(b)
            segB = csB[sel_e] - np.where(sel_s > 0, csB[sel_s - 1], 0.0)
            total_metric = float((segB * chosen_invlen).sum())

        # ---------- evaluate candidates in O(1) each ----------
        not_sel = ~chosen_mask
        cand_idx = np.nonzero(not_sel)[0]
        if len(cand_idx) == 0:
            break

        if k == 0:
            # first pick: every candidate has uniqueness 1.0 against an empty
            # set (w=0 → a=1), so the tie is broken by taking the longest span.
            j = cand_idx[np.argmax(lengths_all[cand_idx])]
            best = j
        else:
            # compute per-candidate scores
            s_c = s_all[cand_idx]
            e_c = e_all[cand_idx]
            len_c = lengths_all[cand_idx]
            invlen_c = 1.0 / len_c

            # candidate's own sum of a over its range
            sumA = csA[e_c] - np.where(s_c > 0, csA[s_c - 1], 0.0)
            u_c = sumA * invlen_c

            # adjustment to selected totals over overlap range: sum(wlen * d)
            adj = csWD[e_c] - np.where(s_c > 0, csWD[s_c - 1], 0.0)

            # new average if we add candidate: (total_metric + adj + u_c) / (k+1)
            score = (total_metric + adj + u_c) / (k + 1.0)

            best = cand_idx[np.argmax(score)]

        # ---------- commit best candidate ----------
        chosen_mask[best] = True
        chosen.append(idx[best])
        chosen_starts.append(s_all[best])
        chosen_ends.append(e_all[best])
        chosen_invlen.append(1.0 / lengths_all[best])
        k += 1

    # return the identifiers of the chosen events in selection order
    return list(chosen)


In [15]:
# Precompute average uniqueness for all events (optional, for reporting)
# Spans are positions into close.index; u_all holds one uniqueness value per
# event that maps to a valid span.
idx, s_all, e_all, lengths_all, invlen_all = _events_to_spans(events, close.index)
u_all, *_ = _mean_uniqueness_for_set(len(close.index), s_all, e_all)
print("Avg uniqueness (all events):", float(np.mean(u_all)))

# Fast sequential bootstrap
# Select at most 200 events (fewer if there are fewer events).
chosen = sequential_bootstrap_fast(events, close.index, size=min(200, len(events)))
print("Sequential bootstrap picked:", len(chosen))


Avg uniqueness (all events): 0.0034299971931661143
Sequential bootstrap picked: 200


Both numbers are consistent with what densely overlapping, minute-level labels should produce.

Let’s unpack those results clearly so you understand what those two numbers *mean* and why they matter:

---

## ✅ 1️⃣ “Average uniqueness (all events): 0.0034…”

This tells you:

> On average, each event (trade or label) overlaps **heavily** with many others.

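Uniqueness of event $i$ is the mean, over the bars it spans, of the inverse concurrency: $u_i = \operatorname{mean}_{t \in i}\left(1 / c_t\right)$, where $c_t$ is the number of events active at bar $t$.

So if `avg uniqueness ≈ 0.0034`, then as a rough estimate:

$$
\text{typical concurrency} \approx \frac{1}{0.0034} \approx 294
$$

That means roughly **~294 events** are active at a typical timestamp. (Because $1/\bar{u}$ is a harmonic-style average of $c_t$, it is at most the arithmetic average concurrency — treat it as an order-of-magnitude figure.)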

🧠 **Interpretation:**

* The dataset is very dense — events overlap a lot in time.
* This is *normal* in minute-level or tick data (many trades are active simultaneously).
* High concurrency = low uniqueness → potential label leakage if you sample randomly.

---

## ✅ 2️⃣ “Sequential bootstrap picked: 200”

That means the **greedy uniqueness-maximizing sampler** chose **200 events** (out of 154,470) that are:

* Selected one at a time so that each pick keeps the set's average uniqueness as high as possible (a greedy heuristic, not a guaranteed global optimum),
* Typically **spread out** in time,
* And thus give your model a **lower-correlation** training subset.

Each event selected contributes **maximum marginal uniqueness** to the group — i.e., it adds new, non-redundant information rather than reinforcing already overlapping ones.

---

## 🧩 How to Think About It Visually

Imagine your timeline looks like this:

```
|--- Event 1 ---|
    |--- Event 2 ---|
        |--- Event 3 ---|
                        |--- Event 4 ---|
```

* If you train on 1, 2, 3 → they overlap → model sees almost the same info three times.
* Sequential bootstrap instead might pick 1 and 4 → **minimal overlap** → more diverse signal.

So the algorithm has **filtered out redundancy**, keeping the “clean” signals.

---

## 💡 Practical Use Now

You can use those `chosen` timestamps to **subset your labels and features** before training:

```python
chosen_events = events.loc[chosen]
X_train = X.loc[chosen_events.index]
y_train = y.loc[chosen_events.index]
```

This gives you a **high-quality training sample**:

* Labels overlap far less than in the full event set (the greedy selection reduces overlap; it does not guarantee zero overlap).
* Features can now be **fractionally differentiated (FFD)** for stationarity.
* You avoid the double trap of temporal dependence **in both rows and columns**.

---

## 🔁 Recap of What You’ve Achieved So Far

| Step                                         | Purpose                                     | Outcome                        |
| -------------------------------------------- | ------------------------------------------- | ------------------------------ |
| **Concurrency**                              | Count how many events overlap at each time  | Baseline overlap map           |
| **Uniqueness**                               | Measure independence per event              | Uniqueness scores              |
| **Sequential Bootstrap (fast)**              | Select subset maximizing average uniqueness | 200 clean, independent samples |
| **→ Next: Fractional Differentiation (FFD)** | Make features stationary but memory-rich    | Ready for modeling             |

---

## ⚙️ Next Move

Now that your **labels** (rows) are independent,
the next step is to make your **features** stationary:

```python
ffd_close = fracdiff_ffd_fast(close, d=0.4, thres=1e-4)
print(adf_stat(ffd_close))
```

Then train on:

```python
X_ffd = X.assign(close_ffd=ffd_close)
X_train = X_ffd.loc[chosen_events.index]
y_train = y.loc[chosen_events.index]
```


 

In [16]:
# ============================================================
# 7) FRACTIONAL DIFFERENTIATION (FFD) + ADF — FAST (No Python loops)
# ============================================================
def get_weights_ffd(d: float, thres: float = 1e-5, max_size: int = 10000) -> np.ndarray:
    """
    Weights of a fixed-width fractional difference of order `d`.

    Starting from w_0 = 1, each weight follows w_k = -w_{k-1} * (d - k + 1) / k.
    Generation stops at the first weight whose magnitude is below `thres`
    (that weight is discarded) or once `max_size` weights exist.

    Returns weights oldest->newest (length = window size), so the last entry
    is w_0 = 1 and applies to the most recent observation.
    """
    newest_first = [1.0]
    for k in range(1, max_size):
        candidate = -newest_first[-1] * (d - k + 1) / k
        if abs(candidate) < thres:
            break
        newest_first.append(candidate)
    # Oldest to newest
    return np.array(newest_first[::-1], dtype=float)


def fracdiff_ffd_fast(series: pd.Series, d: float, thres: float = 1e-5) -> pd.Series:
    """
    FAST fixed-width fractional differencing using vectorized correlation.
    - Works on GPU if CuPy is available (xp = cp).
    - Handles NaNs: any window containing NaN → output NaN for that position.
    - A series shorter than the weight window (including an empty series)
      yields an all-NaN result instead of failing: with fewer points than
      weights there is no complete window, and 'valid'-mode correlation would
      otherwise return an array that does not fit the output slice.

    Parameters
    ----------
    series : values to difference; the index is preserved.
    d : fractional differencing order passed to `get_weights_ffd`.
    thres : weight cut-off passed to `get_weights_ffd`.

    Returns
    -------
    Series aligned with `series`; the first (window - 1) entries are NaN.
    """
    # 1) Weights (CPU) → move to xp
    w = get_weights_ffd(d, thres=thres)
    width = int(len(w))
    if width == 0 or len(series) < width:
        return pd.Series(np.full(series.shape, np.nan), index=series.index)

    w_xp = xp.asarray(w, dtype=float)

    # 2) Data & mask to xp
    vals_np = series.values.astype(float)
    mask_np = ~np.isnan(vals_np)  # True where valid
    vals_filled_np = np.where(mask_np, vals_np, 0.0)

    vals = xp.asarray(vals_filled_np, dtype=float)
    mask = xp.asarray(mask_np.astype(float))  # 1.0 valid, 0.0 NaN

    # 3) Vectorized "sliding dot": use correlate so we don't need to flip weights
    #    correlate(vals, w)[i] = sum(vals[i:i+width] * w)
    num = xp.correlate(vals, w_xp, mode='valid')              # shape: (n - width + 1,)

    # 4) Validity via mask: count of valid points in each window
    ones = xp.ones(width, dtype=float)
    cnt = xp.correlate(mask, ones, mode='valid')              # number of non-NaN per window

    # 5) Stitch back into full-length array and null-out invalid windows
    out = xp.full(vals.shape, xp.nan, dtype=float)
    valid_slice = slice(width - 1, len(vals))                 # positions aligned to end of each window
    # Only keep windows with full validity (cnt == width)
    valid_windows = cnt == float(width)
    out[valid_slice] = xp.where(valid_windows, num, xp.nan)

    # 6) Back to CPU for pandas
    out_cpu = xp.asnumpy(out) if GPU_AVAILABLE else out
    return pd.Series(out_cpu, index=series.index)


def adf_stat(series: pd.Series, maxlag: Optional[int] = None) -> Dict[str, float]:
    """
    Augmented Dickey–Fuller test wrapper.

    NaNs are dropped before testing and the lag order is chosen with
    autolag="AIC". Returns {"adf_stat": ..., "pvalue": ...}; both values are
    NaN when `adfuller` is unavailable (None) or fewer than 40 non-NaN
    observations remain.
    """
    not_computed = {"adf_stat": np.nan, "pvalue": np.nan}
    if adfuller is None:
        return not_computed

    observed = series.dropna()
    if len(observed) < 40:
        return not_computed

    result = adfuller(observed, maxlag=maxlag, autolag="AIC")
    # The first two entries of the result are the test statistic and its p-value.
    return {"adf_stat": float(result[0]), "pvalue": float(result[1])}


# =======================
# Apply FFD + ADF (FAST)
# =======================
# Fractionally difference the close series with order 0.4 and weight cut-off 1e-4.
ffd = fracdiff_ffd_fast(close, d=0.4, thres=1e-4)

# The FFD series has fewer non-NaN points because the first (window - 1)
# positions have no complete weight window.
print(f"Non-NaN raw: {close.dropna().shape[0]} | Non-NaN FFD: {ffd.dropna().shape[0]}")
print("ADF raw:", adf_stat(close))
print("ADF ffd:", adf_stat(ffd))


Non-NaN raw: 310381 | Non-NaN FFD: 309197
ADF raw: {'adf_stat': -1.7753650647015773, 'pvalue': 0.39275900851346823}
ADF ffd: {'adf_stat': -5.283443060842624, 'pvalue': 5.925045819499616e-06}


The ADF results above show that fractional differencing with `d = 0.4` is enough to make the close series test as stationary.

Let’s unpack what’s going on here so you understand it at a deep, quantitative level.

---

## ✅ 1️⃣ What Happened

You started with:

```
ADF raw: p-value = 0.3927  →  non-stationary
```

Then after fractional differentiation (`d = 0.4`):

```
ADF ffd: p-value = 0.0000059  →  stationary ✅
```

That means — in statistical terms —
you effectively **“differentiated” the series just enough** to make it stationary,
but **not as aggressively** as a full first difference (which would remove all memory).

So yes, you can think of this as **“differentiating the price series once — but fractionally.”**
It’s the middle ground between:

| Method                | Differencing Order | Memory Retention | Stationarity     |
| :-------------------- | :----------------- | :--------------- | :--------------- |
| Raw prices            | d = 0.0            | High             | ❌ Non-stationary |
| First difference      | d = 1.0            | None             | ✅ Stationary     |
| Fractional difference | d = 0.4            | Partial          | ✅ Stationary     |

---

## 🧠 2️⃣ Why We Don’t “Just Differentiate Once”

When you difference once (`d=1`):

* You lose all long-term information (trends, mean reversion structure).
* Your model can only learn from very short-term fluctuations (like noise).
* Signal-to-noise ratio drops dramatically.

Fractional differentiation (here `d=0.4`) gives the **best of both worlds**:

* Stationarity (confirmed by the very low p-value)
* Preserved memory (unlike full differencing)

In other words, the series was differenced with a fractional order of 0.4 — well short of a full first difference (`d = 1`) —
which is enough to stabilize the series without destroying the signal.

---

## ⚙️ 3️⃣ What the Numbers Tell Us

Let’s interpret them quantitatively:

| Metric              |    Value | Interpretation                                          |
| :------------------ | -------: | :------------------------------------------------------ |
| **ADF raw stat**    |   -1.775 | Too close to 0 → non-stationary                         |
| **ADF raw p-value** |   0.3928 | > 0.05 → fails stationarity test                        |
| **ADF ffd stat**    |   -5.283 | Far below critical value (~-2.86) → strongly stationary |
| **ADF ffd p-value** | 5.9×10⁻⁶ | ≪ 0.05 → reject unit-root hypothesis ✅                  |

So the fractional differentiation **successfully transformed** your price series from a random walk into a *mean-reverting* stationary series.

---

## 🧩 4️⃣ Visual Intuition

Here’s how the transformations compare conceptually:

```
Raw prices (non-stationary)
   ↑
   |          ↗ long upward drift ↗
   |
   +-------------------------------------------------------> time

First difference (d = 1.0)
   ↑
   |  fluctuates around 0, but pure noise
   +-------------------------------------------------------> time

Fractional diff (d = 0.4)
   ↑
   |  mean-reverting & smoother, still tracks long swings
   +-------------------------------------------------------> time
```

You “tempered” the drift without erasing the useful low-frequency structure.

---

## 🧭 5️⃣ Why This Matters for You

Now your **features (columns)** are statistically valid for model training:

* They don’t have spurious correlations due to non-stationarity.
* You can safely use them in regression, logistic, or ML pipelines.
* You can generate lagged versions, returns, or volatility proxies based on `ffd`.

Combined with your **Sequential Bootstrap** (which handled independence in labels),
you now have a dataset that’s:

* **Row-wise independent**
* **Column-wise stationary**

That’s the holy grail of quantitative modeling.

---

## ✅ 6️⃣ TL;DR — What You Achieved

| Step                                 | Goal                                         | Result                                 |
| ------------------------------------ | -------------------------------------------- | -------------------------------------- |
| Fractional Differentiation (`d=0.4`) | Remove non-stationarity                      | ✅ Achieved                             |
| ADF test                             | Verify statistical stationarity              | ✅ p ≈ 0.0000059                        |
| Data retention                       | Avoid information loss                       | ✅ 99.6% of data kept                   |
| Effect                               | You “differentiated once — but fractionally” | ✅ Stationary, memory-preserving series |

---

 

The three series compared in this section are:

* raw prices
* first differences
* fractional differences (d = 0.4)

 


In [17]:
# 8) SAVE ARTIFACTS & QUICK SUMMARY
# ======================================
OUT = "/kaggle/working"

# Bar tables go to Feather, the label table to CSV (index kept).
for bar_frame, bar_path in (
    (vol_bars, f"{OUT}/volume_bars.feather"),
    (dollar_bars, f"{OUT}/dollar_bars.feather"),
    (info_bars, f"{OUT}/info_bars.feather"),
):
    bar_frame.to_feather(bar_path)
bins.to_csv(f"{OUT}/triple_barrier_labels.csv", index=True)

print("=== OUTPUTS SAVED TO", OUT, "===")
print("\nLabel distribution:")
print(bins['bin'].value_counts(normalize=True, dropna=False))
print("\nBars shapes:")
print("Volume:", vol_bars.shape, "Dollar:", dollar_bars.shape, "Info:", info_bars.shape)

=== OUTPUTS SAVED TO /kaggle/working ===

Label distribution:
bin
 1.0    0.494963
-1.0    0.475257
 0.0    0.029779
Name: proportion, dtype: float64

Bars shapes:
Volume: (156, 6) Dollar: (63, 6) Info: (17, 6)


In [18]:
# ======================================
# 9) OPTIONAL — AUTO MARKDOWN REPORT
# ======================================
# p-values use 3 significant digits (`:.3g`): with a fixed `:.4f` a very small
# p-value such as 5.9e-06 was written as "0.0000", which reads as exactly zero.
summary_md = f"""
# Project 1 — SPXUSD (GPU-aware)

**Bars:**
- Volume: {vol_bars.shape[0]}
- Dollar: {dollar_bars.shape[0]}
- Info (vol): {info_bars.shape[0]}

**Triple-Barrier label distribution:**
{bins['bin'].value_counts(normalize=True).to_string()}

**ADF:**
- Raw p-value: {adf_stat(close)['pvalue']:.3g}
- FFD p-value: {adf_stat(ffd)['pvalue']:.3g}

**Primary → Meta:**
- Primary model: LogisticRegression
- Meta (share of 1s): {meta.mean():.4f}

**GPU:**
- Enabled: {'Yes' if GPU_AVAILABLE else 'No'}
- Backend: {'CuPy' if GPU_AVAILABLE else 'NumPy'}
"""

# The report contains non-ASCII characters (—, →), so write it as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open(f"{OUT}/project1_summary.md", "w", encoding="utf-8") as f:
    f.write(summary_md)

print("Markdown summary:", f"{OUT}/project1_summary.md")

Markdown summary: /kaggle/working/project1_summary.md
