
# Transformer — 15‑min Direction Using Close, Order Book Depth, and Metrics (OI RSI16)

**Inputs (exact schemas):**
- **klines**: `open_time, open, high, low, close, volume, close_time, quote_volume, count, taker_buy_volume, taker_buy_quote_volume, ignore`
- **bookdepth**: `timestamp, percentage, depth, notional`
- **metrics**: `create_time, symbol, sum_open_interest, sum_open_interest_value, count_toptrader_long_short_ratio, sum_toptrader_long_short_ratio, count_long_short_ratio, sum_taker_long_short_vol_ratio`

**What we use:**
- 15‑min **close** from `klines`
- `bookdepth` aggregated to 15‑min: **bd_notional_sum**, **bd_depth_sum**, **bd_notional_max** (proxy for large resting orders)
- `metrics` resampled to 15‑min + **OI RSI(16)** computed from `sum_open_interest`

**Model:** Transformer encoder → binary P(up): the probability that the close *N* bars ahead is above the current close.

**Memory:** `bookdepth` streamed in chunks (`pyarrow.ParquetFile.iter_batches`) and cached per file.


## 0) Pin dependencies (run ONCE), then Runtime → Restart

In [None]:
# One-time environment pin: uninstall the listed packages, install fixed versions,
# print the versions that import, then terminate this process (see heading: restart afterwards).
%pip -q uninstall -y torch torchvision torchaudio pyarrow
%pip -q install pandas==2.2.2 pyarrow==16.1.0 fastparquet==2024.5.0
%pip -q install --index-url https://download.pytorch.org/whl/cu126 torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0
import os, pandas, pyarrow, torch
# Sanity check: the pinned packages import and report their versions.
print('OK:', pandas.__version__, pyarrow.__version__, torch.__version__)
# Send signal 9 to the current process so no previously loaded package versions stay in memory.
os.kill(os.getpid(), 9)

## 1) Setup & Mount Drive

In [None]:
import os, sys, json, gc, math, pickle
from pathlib import Path
import numpy as np
import pandas as pd
from google.colab import drive
# Mount Google Drive; every path below lives under the mounted drive.
drive.mount('/content/drive')
DATA_ROOT = Path('/content/drive/MyDrive/binance_data')      # raw parquet inputs (searched recursively)
CACHE_DIR = Path('/content/drive/MyDrive/binance_cache')     # per-file aggregated caches
ARTIFACT_DIR = Path('/content/drive/MyDrive/binance_models') # checkpoints, scaler, feature list
CACHE_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)
# Fail fast when the input directory is missing.
assert DATA_ROOT.exists(), f'Missing data root: {DATA_ROOT}'
# NOTE(review): BAR_FREQ is not referenced elsewhere in this view; the helpers hard-code '15min'.
BAR_FREQ = '15min'

## 2) Discover klines, bookDepth, metrics

In [None]:
def build_manifest(root: Path) -> dict:
    """Group every ``*.parquet`` file under ``root`` by dataset kind.

    The kind is decided from the lower-cased file name. Rules are tried in
    order and the first match wins: ``bookdepth``/``orderbook`` -> bookdepth,
    ``metric`` -> metrics, ``kline`` -> klines. Files matching no rule are
    ignored.

    Returns:
        Dict with keys ``klines``, ``bookdepth``, ``metrics`` mapping to
        sorted lists of path strings.
    """
    rules = (
        ('bookdepth', ('bookdepth', 'orderbook')),
        ('metrics', ('metric',)),
        ('klines', ('kline',)),
    )
    found = {'klines': [], 'bookdepth': [], 'metrics': []}
    for parquet_path in root.rglob('*.parquet'):
        lowered = parquet_path.name.lower()
        kind = next(
            (label for label, needles in rules if any(s in lowered for s in needles)),
            None,
        )
        if kind is not None:
            found[kind].append(str(parquet_path))
    return {kind: sorted(paths) for kind, paths in found.items()}

# Scan the data root once and report how many files were found per dataset kind.
manifest = build_manifest(DATA_ROOT)
for k,v in manifest.items():
    print(f"{k}: {len(v)} files")

## 3) Helpers — robust time handling & streaming

In [None]:
import pyarrow.parquet as pq

def _cache_path(kind: str, src_path: str) -> Path:
    """Return the cache file location inside ``CACHE_DIR`` for a source parquet file.

    The cache name is the source file name with ``.parquet`` replaced by
    ``.<kind>.15m.parquet``.

    NOTE(review): only the file name is used, so two source files with the
    same name in different directories map to the same cache entry.
    """
    source_name = Path(src_path).name
    cached_name = source_name.replace('.parquet', f'.{kind}.15m.parquet')
    return CACHE_DIR / cached_name

def to_utc_index(df, time_col: str, reconstruct_from_close=False, bar_freq='15min'):
    """Return a copy of ``df`` indexed by ``time_col`` as a sorted, unique UTC DatetimeIndex.

    Args:
        df: Input frame; it is not modified.
        time_col: Column holding the timestamps to index by.
        reconstruct_from_close: When True and ``time_col == 'open_time'``,
            ``open_time`` is rebuilt from ``close_time`` instead of being
            trusted as stored.
        bar_freq: Bar length used for the reconstruction and for snapping the
            rebuilt open time onto the bar grid.

    Returns:
        Frame indexed by ``time_col`` (tz-aware UTC), sorted ascending, with
        unparseable and duplicate timestamps dropped.

    Raises:
        ValueError: reconstruction requested but ``close_time`` is missing.
        TypeError: the resulting index is not a DatetimeIndex.

    NOTE(review): ``pd.to_datetime`` reads plain integers as nanoseconds since
    the epoch; confirm the parquet files store real timestamps (or strings)
    rather than millisecond integers.
    """
    df = df.copy()
    if reconstruct_from_close and time_col == 'open_time':
        if 'close_time' not in df.columns:
            raise ValueError('close_time required to reconstruct open_time')
        close_ts = pd.to_datetime(df['close_time'], errors='coerce', utc=True)
        # close_time is commonly "bar end minus 1 ms"; subtracting the bar length
        # then lands 1 ms before the true open. Rounding up to the bar grid gives
        # the exact open in that case and is a no-op when close_time is already
        # on the grid, so the index can be joined with other grid-aligned frames.
        df['open_time'] = (close_ts - pd.Timedelta(bar_freq)).dt.ceil(bar_freq)
    # utc=True treats naive values as UTC and converts aware values to UTC in one step.
    df[time_col] = pd.to_datetime(df[time_col], errors='coerce', utc=True)
    df = df.dropna(subset=[time_col]).sort_values(time_col).drop_duplicates(time_col)
    df = df.set_index(time_col)
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError(f'Index is not DatetimeIndex after setting {time_col}')
    return df

def load_concat(files, columns):
    """Read the given columns from each parquet file and stack the rows.

    Loading is best-effort: a file that fails to load is reported on stdout
    and skipped. Returns an empty DataFrame when nothing could be loaded.
    """
    loaded = []
    for path in files:
        try:
            frame = pd.read_parquet(path, columns=columns, engine='pyarrow')
        except Exception as err:
            print('Failed to load', path, err)
        else:
            loaded.append(frame)
    if not loaded:
        return pd.DataFrame()
    return pd.concat(loaded, ignore_index=True)

def stream_parquet_batches(path, columns, batch_size=200_000):
    """Lazily yield the selected columns of a parquet file as pandas DataFrames.

    Each yielded frame is one record batch produced by ``iter_batches`` with
    the requested ``batch_size``.
    """
    parquet_file = pq.ParquetFile(path)
    record_batches = parquet_file.iter_batches(batch_size=batch_size, columns=columns)
    yield from (record_batch.to_pandas() for record_batch in record_batches)


## 4) Stream & cache bookDepth → 15m features

In [None]:
def aggregate_bookdepth_file_streaming(path):
    """Aggregate one bookdepth parquet file into 15-minute features, batch by batch.

    For every 15-minute bin (timestamps floored to '15min', UTC) the result holds:
      - ``bd_notional_sum``: sum of ``notional``
      - ``bd_depth_sum``:    sum of ``depth``
      - ``bd_notional_max``: maximum ``notional``

    Batches are aggregated independently and then merged. The merge uses sum
    for the sum columns and max for the max column, so a bin whose rows are
    split across two batches still gets the true maximum rather than the sum
    of the per-batch maxima. Bins with no valid values stay NaN.

    Returns:
        float32 DataFrame indexed by bin start (index name ``time``), or an
        empty DataFrame when the file yields no usable rows.
    """
    cols = ['timestamp', 'percentage', 'depth', 'notional']
    partials = []
    for chunk in stream_parquet_batches(path, columns=cols, batch_size=200_000):
        if chunk.empty or any(c not in chunk.columns for c in cols):
            continue
        chunk['timestamp'] = pd.to_datetime(chunk['timestamp'], utc=True, errors='coerce')
        chunk = chunk.dropna(subset=['timestamp']).set_index('timestamp')
        # 'percentage' is read as part of the schema but is not used by any
        # feature, so only the aggregated columns are converted.
        for c in ('depth', 'notional'):
            chunk[c] = pd.to_numeric(chunk[c], errors='coerce').astype('float32')
        bins = chunk.index.floor('15min')
        out = pd.DataFrame({
            'bd_notional_sum': chunk['notional'].groupby(bins).sum(min_count=1).astype('float32'),
            'bd_depth_sum':    chunk['depth'].groupby(bins).sum(min_count=1).astype('float32'),
            'bd_notional_max': chunk['notional'].groupby(bins).max().astype('float32'),
        })
        out.index.name = 'time'
        partials.append(out)
    if not partials:
        return pd.DataFrame()
    # Merge the per-batch partial aggregates with the operator that matches each column.
    grouped = pd.concat(partials).groupby(level=0, sort=True)
    return pd.DataFrame({
        'bd_notional_sum': grouped['bd_notional_sum'].sum(min_count=1),
        'bd_depth_sum':    grouped['bd_depth_sum'].sum(min_count=1),
        'bd_notional_max': grouped['bd_notional_max'].max(),
    })

def process_bookdepth(files):
    """Build the 15-minute bookdepth feature frame for a list of source files.

    Each file is aggregated once and its result cached as parquet; later
    calls read the cache instead of re-streaming the source. Per-file results
    are then merged on the bin timestamp.

    Columns whose name ends in ``_max`` are merged with max and all other
    columns with sum, so a bin that appears in more than one file keeps a
    true maximum instead of the sum of per-file maxima.

    Returns:
        DataFrame indexed by bin start, or an empty DataFrame when no file
        produced any rows.
    """
    frames = []
    for src in files:
        cpath = _cache_path('bookdepth', src)
        if cpath.exists():
            out = pd.read_parquet(cpath, engine='pyarrow')
        else:
            out = aggregate_bookdepth_file_streaming(src)
            # Only non-empty results are cached, so an empty file is retried next run.
            if out is not None and not out.empty:
                out = out.sort_index()
                out.to_parquet(cpath)
        if out is not None and not out.empty:
            frames.append(out)
    if not frames:
        return pd.DataFrame()
    merged = pd.concat(frames)
    how = {col: ('max' if str(col).endswith('_max') else 'sum') for col in merged.columns}
    return merged.groupby(level=0, sort=True).agg(how)

## 5) Load klines (close), metrics (15m + OI RSI16)

In [None]:
# --- klines: only the close price is kept, indexed by bar open time (UTC) ---
kl_cols = ['open_time','close_time','close']
kl_raw = load_concat(manifest['klines'], columns=kl_cols)
print('klines raw:', kl_raw.shape)
# open_time is rebuilt from close_time inside to_utc_index (reconstruct_from_close=True).
kl_df = to_utc_index(kl_raw, time_col='open_time', reconstruct_from_close=True)
kl_df = kl_df[['close']].astype('float32')
print('klines aligned:', kl_df.shape)

# --- metrics: indexed by create_time, then put on a 15-minute grid ---
mt_cols = ['create_time','symbol','sum_open_interest','sum_open_interest_value',
           'count_toptrader_long_short_ratio','sum_toptrader_long_short_ratio',
           'count_long_short_ratio','sum_taker_long_short_vol_ratio']
mt_raw = load_concat(manifest['metrics'], columns=mt_cols)
# NOTE(review): 'symbol' is loaded but never used to filter; to_utc_index drops rows
# that share a create_time, so this presumably assumes a single symbol — confirm.
mt_df  = to_utc_index(mt_raw, time_col='create_time') if not mt_raw.empty else pd.DataFrame()
if not mt_df.empty:
    # Numeric columns only; resample to 15 minutes and forward-fill onto the grid.
    mt_15 = mt_df[['sum_open_interest','sum_open_interest_value',
                   'count_toptrader_long_short_ratio','sum_toptrader_long_short_ratio',
                   'count_long_short_ratio','sum_taker_long_short_vol_ratio']].astype('float32').resample('15min').ffill()
else:
    mt_15 = pd.DataFrame()
print('metrics 15m:', mt_15.shape)

def rsi_series(x: pd.Series, length=16):
    """Relative Strength Index of ``x`` using exponential smoothing with alpha = 1/length.

    Gains and losses are the positive and negative parts of the first
    difference. A small constant (1e-9) in the denominator avoids division by
    zero, so a series with no losses approaches 100 and a series with no
    gains yields 0. The first element is NaN because it has no difference.
    """
    change = x.diff()
    smoothing = {'alpha': 1/length, 'adjust': False}
    avg_gain = change.clip(lower=0).ewm(**smoothing).mean()
    avg_loss = (-change.clip(upper=0)).ewm(**smoothing).mean()
    strength = avg_gain / (avg_loss + 1e-9)
    return 100 - (100 / (1 + strength))

# Add the open-interest RSI(16) column; fall back to a NaN / empty column when
# the metrics frame is missing the source column or has no rows at all.
if not mt_15.empty and 'sum_open_interest' in mt_15.columns:
    mt_15['oi_rsi16'] = rsi_series(mt_15['sum_open_interest'].astype('float32'), length=16).astype('float32')
else:
    if not mt_15.empty:
        mt_15['oi_rsi16'] = np.nan
    else:
        # NOTE(review): this frame is empty, so safe_join later skips it and the
        # metrics columns listed in feat_cols will be absent from `base`.
        mt_15 = pd.DataFrame({'oi_rsi16': []})

## 6) Build bookDepth 15m aggregates (streamed)

In [None]:
# Aggregate (or load cached) 15-minute bookdepth features for every discovered file.
bd_15 = process_bookdepth(manifest['bookdepth'])
print('bookdepth15:', bd_15.shape)
bd_15.tail()

## 7) Merge unified 15m frame

In [None]:
# Start from the klines frame; its index defines the rows of the unified frame.
base = kl_df.copy()
def safe_join(left, right):
    """Left-join ``right`` onto ``left`` by index; return ``left`` unchanged when ``right`` is None or empty."""
    if right is None or right.empty:
        return left
    return left.join(right, how='left')

# Attach metrics and bookdepth features to the kline rows (left joins on the time index).
base = safe_join(base, mt_15)
base = safe_join(base, bd_15)
# Treat +/-inf as missing, carry the last known value forward, then drop rows that
# are still incomplete (e.g. leading rows before the first metrics/bookdepth value).
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated in pandas 2.x.
base = base.sort_index().replace([np.inf, -np.inf], np.nan).ffill().dropna()
print('Unified base shape:', base.shape)
base.tail()

## 8) Features & labels (binary up/down in next N bars)

In [None]:
N_HORIZON = 16      # label horizon, in bars
NEUTRAL_EPS = 0.0   # forward return must exceed this to count as "up"

feat_cols = ['close','sum_open_interest','sum_open_interest_value',
             'count_toptrader_long_short_ratio','sum_toptrader_long_short_ratio',
             'count_long_short_ratio','sum_taker_long_short_vol_ratio',
             'oi_rsi16','bd_notional_sum','bd_depth_sum','bd_notional_max']

# X keeps every feature row, including the newest bars that cannot be labelled yet.
X = base[feat_cols].copy()
X['close_ret_1'] = X['close'].pct_change().fillna(0).astype('float32')
future_close = base['close'].shift(-N_HORIZON)
fwd_ret = (future_close - base['close']) / base['close']
# The last N_HORIZON bars have no future close, so fwd_ret is NaN there. A plain
# `fwd_ret > eps` would turn those NaNs into False and label the rows "down";
# keep them as NaN instead so dropna() removes the unlabelled rows.
y = (fwd_ret > NEUTRAL_EPS).astype('float32').where(fwd_ret.notna())
df = X.copy(); df['target'] = y
df = df.dropna()
df['target'] = df['target'].astype('int64')
print('Final dataset:', df.shape)
df.tail()

## 9) Time split & sequence windows

In [None]:
from sklearn.preprocessing import StandardScaler
LOOKBACK = 128  # number of past bars in each input window
STRIDE = 1      # step between consecutive window end positions
n = len(df)
# Chronological 70% / 15% / 15% split (no shuffling).
i_tr = int(n*0.7); i_va = int(n*0.85)
train_df = df.iloc[:i_tr]; val_df = df.iloc[i_tr:i_va]; test_df = df.iloc[i_va:]
# NOTE(review): targets look N_HORIZON bars ahead, so labels of the last rows of each
# split are computed from closes that fall in the following split — confirm whether a
# gap between splits is wanted.
features = [c for c in df.columns if c != 'target']
# The scaler is fitted on the training rows only and reused for val/test.
scaler = StandardScaler().fit(train_df[features])

def make_sequences(frame):
    """Turn a split frame into (windows, labels) arrays for the model.

    Features are standardised with the already-fitted ``scaler``. For each
    position ``t`` in ``range(LOOKBACK, len(frame), STRIDE)`` the window is
    rows ``t-LOOKBACK .. t-1`` and the label is ``target`` at row ``t``.

    Returns:
        Tuple ``(X_seq, y_seq)``: a float32 array stacked as
        (num_windows, LOOKBACK, num_features) and an array of labels.
    """
    scaled = scaler.transform(frame[features]).astype('float32')
    labels = frame['target'].astype('int64').values
    ends = range(LOOKBACK, len(frame), STRIDE)
    windows = [scaled[end - LOOKBACK:end, :] for end in ends]
    targets = [labels[end] for end in ends]
    return np.stack(windows), np.array(targets)

# Build windowed arrays per split; windows never cross a split boundary because
# each split frame is processed on its own.
Xtr, Ytr = make_sequences(train_df)
Xva, Yva = make_sequences(val_df)
Xte, Yte = make_sequences(test_df)
print('Seq shapes:', Xtr.shape, Xva.shape, Xte.shape)

## 10) Transformer — train

In [None]:
import torch, torch.nn as nn, math
from torch.utils.data import TensorDataset, DataLoader
# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a (batch, seq, d_model) tensor.

    Even feature indices receive sine terms and odd indices cosine terms.
    The table is stored as the buffer ``pe`` with shape (1, max_len, d_model).

    Args:
        d_model: Feature width of the input; odd values are supported.
        max_len: Longest sequence length the table covers.
    """

    def __init__(self, d_model, max_len=10000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the cosine block to the number of odd slots.
        pe[:, 1::2] = torch.cos(angles)[:, : d_model // 2]
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Raises:
            ValueError: the sequence is longer than the precomputed table.
        """
        L = x.size(1)
        if L > self.pe.size(1):
            raise ValueError(
                f'Sequence length {L} exceeds positional table length {self.pe.size(1)}'
            )
        return x + self.pe[:, :L, :]

class TimeSeriesTransformer(nn.Module):
    """Transformer encoder that maps a feature window to one logit per sample.

    Pipeline: linear projection to ``d_model`` -> positional encoding ->
    stacked encoder layers (batch-first) -> LayerNorm + linear head applied
    to the final time step.
    """

    def __init__(self, num_features, d_model=64, nhead=4, num_layers=3, dim_feedforward=128, dropout=0.1):
        super().__init__()
        self.input_proj = nn.Linear(num_features, d_model)
        self.posenc = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.head = nn.Sequential(nn.LayerNorm(d_model), nn.Linear(d_model, 1))

    def forward(self, x):
        """Return logits of shape (batch,) for input of shape (batch, seq, num_features)."""
        hidden = self.encoder(self.posenc(self.input_proj(x)))
        last_step = hidden[:, -1, :]
        return self.head(last_step).squeeze(-1)

# Instantiate the model with default hyper-parameters and wrap the windowed arrays as datasets.
num_features = len(features)
model = TimeSeriesTransformer(num_features=num_features).to(device)
train_ds = TensorDataset(torch.from_numpy(Xtr), torch.from_numpy(Ytr))
val_ds   = TensorDataset(torch.from_numpy(Xva), torch.from_numpy(Yva))
test_ds  = TensorDataset(torch.from_numpy(Xte), torch.from_numpy(Yte))
# Only the training loader shuffles; drop_last discards the final incomplete training batch.
train_loader = DataLoader(train_ds, batch_size=256, shuffle=True, drop_last=True)
val_loader   = DataLoader(val_ds, batch_size=512, shuffle=False)
# The model outputs raw logits, hence BCEWithLogitsLoss (sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss(); optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)

def evaluate(loader):
    """Return ``(mean_loss, accuracy)`` of the global ``model`` over ``loader``.

    The model is switched to eval mode and gradients are disabled. The loss
    is weighted by batch size; a sample counts as predicted "up" when its
    sigmoid probability exceeds 0.5.
    """
    model.eval()
    total_loss = 0
    hits = 0
    seen = 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.float().to(device)
            logits = model(inputs)
            batch_size = len(inputs)
            total_loss += criterion(logits, labels).item() * batch_size
            predicted = (torch.sigmoid(logits) > 0.5).long()
            hits += (predicted.cpu() == labels.long().cpu()).sum().item()
            seen += batch_size
    return total_loss / seen, hits / seen

from pathlib import Path
ARTIFACT_DIR = Path('/content/drive/MyDrive/binance_models')
ARTIFACT_DIR.mkdir(exist_ok=True, parents=True)
BEST_PATH = ARTIFACT_DIR/'transformer_close_depth_metrics_clean.pth'
best_val = float('inf')
for ep in range(1, 13):
    model.train()
    for xb,yb in train_loader:
        xb = xb.to(device); yb = yb.float().to(device)
        optimizer.zero_grad(); logits = model(xb); loss = criterion(logits, yb)
        loss.backward(); optimizer.step()
    vloss, vacc = evaluate(val_loader)
    print(f'Epoch {ep:02d}  val_loss={vloss:.4f}  val_acc={vacc:.3f}')
    # Keep only the checkpoint with the lowest validation loss seen so far.
    if vloss < best_val:
        best_val = vloss
        # Record the settings actually used (LOOKBACK / N_HORIZON) rather than
        # literals, so the checkpoint stays truthful if those constants change.
        torch.save({'model_state': model.state_dict(), 'features': features, 'config': {'lookback': LOOKBACK, 'horizon': N_HORIZON}}, BEST_PATH)
        print('  ✓ saved best')


## 11) Evaluate on test & save artifacts

In [None]:
from torch.utils.data import DataLoader
# Score the checkpoint that was saved (lowest validation loss), not whatever
# weights the final training epoch left in memory; that checkpoint is the model
# the inference helper loads.
if BEST_PATH.exists():
    best_ckpt = torch.load(BEST_PATH, map_location=device)
    model.load_state_dict(best_ckpt['model_state'])
tloss, tacc = evaluate(DataLoader(test_ds, batch_size=512, shuffle=False))
print('Test  loss:', round(tloss,4), ' acc:', round(tacc,3))

import joblib, json
# Persist the fitted scaler and the feature order next to the model checkpoint.
joblib.dump(scaler, '/content/drive/MyDrive/binance_models/scaler_close_depth_metrics_clean.gz')
with open('/content/drive/MyDrive/binance_models/features_close_depth_metrics_clean.json','w') as f:
    json.dump(features, f)
print('Artifacts saved to /content/drive/MyDrive/binance_models')


## 12) Inference helper — latest window → P(up)

In [None]:
import torch, json
from pathlib import Path
def predict_latest_prob_up(model_path='/content/drive/MyDrive/binance_models/transformer_close_depth_metrics_clean.pth'):
    """Return P(up) for the most recent feature window using the saved checkpoint.

    The checkpoint supplies the feature order and lookback length; the scaler
    saved next to it standardises the window. The network is rebuilt locally
    with the same layer names and default sizes as the training model so the
    saved state dict loads.

    Args:
        model_path: Path of the checkpoint written during training.

    Returns:
        Sigmoid probability (float) for the latest window.

    Raises:
        ValueError: fewer than ``lookback`` feature rows are available.
    """
    import math
    import joblib
    import torch.nn as nn

    ckpt = torch.load(model_path, map_location='cpu')
    feats = ckpt['features']; lookback = ckpt['config']['lookback']
    # Use the feature frame `X` rather than `df`: inference needs no label, and
    # `X` is not subject to the label-based row filtering applied to `df`.
    window = X[feats].iloc[-lookback:]
    if len(window) < lookback:
        raise ValueError(f'Need {lookback} rows for inference, found {len(window)}')
    scaler = joblib.load('/content/drive/MyDrive/binance_models/scaler_close_depth_metrics_clean.gz')
    Xw = scaler.transform(window).astype('float32')
    xb = torch.from_numpy(Xw).unsqueeze(0)  # add the batch dimension

    class PositionalEncoding(nn.Module):
        """Sinusoidal position table stored as buffer ``pe`` (matches the training model)."""

        def __init__(self, d_model, max_len=10000):
            super().__init__()
            pe = torch.zeros(max_len, d_model)
            position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
            div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
            pe[:, 0::2] = torch.sin(position * div_term)
            pe[:, 1::2] = torch.cos(position * div_term)
            self.register_buffer('pe', pe.unsqueeze(0))

        def forward(self, x):
            L = x.size(1)
            return x + self.pe[:, :L, :]

    class TimeSeriesTransformer(nn.Module):
        """Local copy of the training architecture; attribute names must match the state dict."""

        def __init__(self, num_features, d_model=64, nhead=4, num_layers=3, dim_feedforward=128, dropout=0.1):
            super().__init__()
            self.input_proj = nn.Linear(num_features, d_model)
            self.posenc = PositionalEncoding(d_model)
            enc = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)
            self.encoder = nn.TransformerEncoder(enc, num_layers=num_layers)
            self.head = nn.Sequential(nn.LayerNorm(d_model), nn.Linear(d_model, 1))

        def forward(self, x):
            z = self.input_proj(x); z = self.posenc(z); z = self.encoder(z)
            return self.head(z[:, -1, :]).squeeze(-1)

    m = TimeSeriesTransformer(num_features=len(feats))
    m.load_state_dict(ckpt['model_state']); m.eval()
    with torch.no_grad():
        p_up = torch.sigmoid(m(xb)).item()
    return p_up

# Run the helper once with the default checkpoint path and show the probability to 4 decimals.
print('P(up) next N bars:', round(predict_latest_prob_up(), 4))
