# Colab: Binance Futures Direction Model (LightGBM, Memory‑Efficient)

**Goals**
- Be *reliably memory‑safe* on Colab.
- Stream heavy datasets (trades/aggTrades/bookDepth) **one file at a time**: resample each file to **15m bars** and cache the result before merging.
- Load only needed columns and downcast to **float32**.
- Train a calibrated **LightGBM** classifier for next‑`n` direction.

**Workflow**
1) Mount Drive & discover `/content/drive/MyDrive/binance_data`.
2) Process **klines** as the base 15m frame.
3) For heavy sources: **process file-by-file → resample(15m) → cache parquet** under `/content/drive/MyDrive/binance_cache`.
4) Merge small cached frames and train.

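If RAM is tight, switch the heavy sources off with the `USE_*` flags in section 3 and re-enable them one at a time. The per-file caches persist across runs, so a file that has already been aggregated is loaded from its cache instead of being reprocessed.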


## 0) Setup & Mount Google Drive

In [None]:
%pip -q install lightgbm==4.3.0 pyarrow==16.1.0 fastparquet==2024.5.0
import os, sys, json, gc, pickle
import numpy as np
import pandas as pd
from pathlib import Path
from google.colab import drive
# Mount Google Drive at /content/drive; the data, cache and model folders
# used by the later cells all live under this mount point.
drive.mount('/content/drive')
print('Drive mounted.')

## 1) Configure paths & discover Parquet files

In [None]:
# Folder that is scanned recursively for source *.parquet files.
DATA_ROOT = Path('/content/drive/MyDrive/binance_data')
# Folder that receives the per-file 15m aggregates written by the aggregate_* helpers.
CACHE_DIR = Path('/content/drive/MyDrive/binance_cache')
CACHE_DIR.mkdir(parents=True, exist_ok=True)
# Stop here if the data folder is missing instead of silently building an empty manifest.
assert DATA_ROOT.exists(), f'Data root not found: {DATA_ROOT}'

def build_manifest(root: Path) -> dict:
    """Group every ``*.parquet`` file below *root* by dataset kind.

    The kind is decided from the lower-cased file name only (directories are
    ignored).  Rules are tried top to bottom and the first match wins, so the
    specialised kline flavours are tested before plain ``kline`` and
    ``aggtrade`` before ``trade``.

    Returns a dict mapping each kind to a list of path strings; files that
    match no rule are collected under ``'other'``.
    """
    # (bucket, predicate on the lower-cased file name) in priority order.
    rules = (
        ('markpriceklines',    lambda n: 'markprice' in n and 'kline' in n),
        ('indexpriceklines',   lambda n: 'indexprice' in n and 'kline' in n),
        ('premiumindexklines', lambda n: 'premiumindex' in n and 'kline' in n),
        ('klines',             lambda n: 'kline' in n),
        ('aggtrades',          lambda n: 'aggtrade' in n),
        ('bookdepth',          lambda n: 'bookdepth' in n or 'orderbook' in n),
        ('metrics',            lambda n: 'metrics' in n or 'open_interest' in n),
        ('trades',             lambda n: 'trade' in n),
    )
    buckets = {kind: [] for kind in (
        'klines', 'markpriceklines', 'indexpriceklines', 'premiumindexklines',
        'metrics', 'trades', 'aggtrades', 'bookdepth', 'other')}
    for path in root.rglob('*.parquet'):
        lowered = path.name.lower()
        kind = next((bucket for bucket, matches in rules if matches(lowered)), 'other')
        buckets[kind].append(str(path))
    return buckets

# Scan DATA_ROOT once and print how many files landed in each bucket.
manifest = build_manifest(DATA_ROOT)
for k,v in manifest.items():
    print(f"{k}: {len(v)} files")

## 2) Memory‑safe loaders, robust time handling, and per‑file caching

In [None]:
# 15-minute bars: the frequency used by every resample() call in this notebook
# and by pd.Timedelta() when an open time is rebuilt from a close time.
# Spelled '15min' because the legacy 'T' minute alias is deprecated in
# pandas >= 2.2; 'min' is accepted by older and newer versions alike.
BAR_FREQ = '15min'

def _cache_path(kind: str, src_path: str) -> Path:
    """Return the cache location under CACHE_DIR for one source parquet file.

    Only the file name of *src_path* is used (its directories are dropped), and
    every ``.parquet`` occurrence in that name becomes ``.<kind>.15m.parquet``.
    Two source files with the same name in different folders therefore map to
    the same cache file.
    """
    source_name = Path(src_path).name
    cached_name = source_name.replace('.parquet', f'.{kind}.15m.parquet')
    return CACHE_DIR.joinpath(cached_name)

def detect_time_col(df, candidates):
    """Return the first name in *candidates* that is a column of *df*, or None if none is."""
    return next((name for name in candidates if name in df.columns), None)

def to_utc_index(df, preferred_time_cols=('open_time','time','timestamp','create_time','transact_time','close_time'), reconstruct_from_close=False):
    """Return *df* indexed by a sorted, unique, UTC-aware timestamp.

    The first name in *preferred_time_cols* that is a column of *df* is parsed
    with ``pd.to_datetime(..., utc=True, errors='coerce')`` and becomes the
    index.  Rows whose timestamp cannot be parsed are dropped, rows are sorted
    by time, and when several rows share a timestamp only the first one (in
    original row order) is kept.

    The caller's frame is never modified: a new frame is returned.

    Parameters
    ----------
    df : DataFrame to index.  An empty frame is returned as-is.
    preferred_time_cols : candidate time column names, in priority order.
    reconstruct_from_close : when True and the only usable column is
        ``close_time`` (or none was found), the index is built as
        ``close_time - BAR_FREQ`` and named ``open_time``; ``close_time`` stays
        in the result as an ordinary column.

    Raises
    ------
    ValueError : reconstruction was requested but ``close_time`` is missing.
    AssertionError : no candidate time column is present.

    NOTE(review): ``pd.to_datetime`` reads *integer* columns as nanoseconds
    since the epoch.  If the parquet files store epoch milliseconds as ints the
    resulting dates will be wrong - confirm the stored dtype.
    NOTE(review): the reconstruction assumes close_time is exactly one bar after
    open_time - verify against the data.
    """
    if df.empty:
        return df
    tcol = next((c for c in preferred_time_cols if c in df.columns), None)
    if reconstruct_from_close and (tcol is None or tcol == 'close_time'):
        if 'close_time' not in df.columns:
            raise ValueError('Need close_time to reconstruct open_time.')
        close_ts = pd.to_datetime(df['close_time'], utc=True, errors='coerce')
        stamps = close_ts - pd.Timedelta(BAR_FREQ)
        tcol = 'open_time'
        # drop() returns a new frame, so the caller's data is left untouched.
        out = df.drop(columns=['open_time'], errors='ignore')
    elif tcol is None:
        raise AssertionError(f'Expected one of {preferred_time_cols}, got {list(df.columns)[:10]} ...')
    else:
        stamps = pd.to_datetime(df[tcol], utc=True, errors='coerce')
        out = df.drop(columns=[tcol])
    # utc=True above makes the timestamps tz-aware UTC, so no further
    # tz_localize / tz_convert step is needed.
    out.index = pd.DatetimeIndex(stamps, name=tcol)
    out = out[out.index.notna()]
    # A stable sort makes "first row wins" deterministic for duplicate timestamps.
    out = out.sort_index(kind='stable')
    return out[~out.index.duplicated(keep='first')]

def load_concat(files, columns=None):
    """Read each parquet file in *files* and stack the results row-wise.

    When *columns* is non-empty only those columns are requested.  A file that
    fails to load is reported with a printed message and skipped.  Returns an
    empty DataFrame when no file could be read; otherwise the concatenation
    with a fresh 0..n-1 index.
    """
    read_kwargs = {'columns': columns} if columns else {}
    frames = []
    for path in files:
        try:
            frames.append(pd.read_parquet(path, **read_kwargs))
        except Exception as e:
            print('Failed to load', path, e)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

def _safe_numeric(df, cols):
    """Convert the listed columns of *df* to numbers in place and return *df*.

    Names in *cols* that are not columns of *df* are ignored; values that
    cannot be parsed become NaN (``errors='coerce'``).
    """
    present = [name for name in cols if name in df.columns]
    for name in present:
        df[name] = pd.to_numeric(df[name], errors='coerce')
    return df

def _to_bool_series(s):
    """Coerce a Series of mixed truthy/falsy markers to a plain bool Series.

    A Series that already has dtype ``bool`` is returned unchanged.  Otherwise
    ``True``, ``1``, ``'1'``, ``'true'`` and ``'True'`` map to True and every
    other value (including missing values) maps to False.
    """
    if s.dtype == 'bool':
        return s
    truthy = {True, 'true', 'True', 1, '1'}
    return s.map(lambda value: value in truthy).astype(bool)

def aggregate_trades_files(files):
    """Aggregate raw trade files into BAR_FREQ bars, one file at a time.

    For each file: if its cache (``_cache_path('trades', f)``) exists it is
    loaded; otherwise the file is read, resampled, cast to float32, written to
    the cache and released before the next file is touched.

    Every trade row is kept.  Several trades can share one timestamp, so rows
    are NOT de-duplicated on time - which is why the time index is built here
    rather than through ``to_utc_index``.  Caches written by an earlier version
    that dropped such rows under-count volume; delete them to rebuild.

    Returns a frame indexed by bar start with columns ``trades_count``,
    ``qty_sum``, ``dollar_sum``, ``vwap``, ``buy_qty``, ``sell_qty`` and
    ``ofi``, or an empty DataFrame when nothing was aggregated.  Sums are added
    across files; ``vwap`` and ``ofi`` are ratios, so they are recomputed from
    the combined sums (``vwap`` is NaN for bars with no traded quantity).

    Raises AssertionError when a file has neither a ``time`` nor a
    ``timestamp`` column.
    """
    wanted = ['price', 'qty', 'quote_qty', 'time', 'is_buyer_maker']
    time_cols = ('time', 'timestamp')
    outs = []
    for f in files:
        cpath = _cache_path('trades', f)
        if cpath.exists():
            outs.append(pd.read_parquet(cpath))
            continue
        try:
            df = pd.read_parquet(f, columns=wanted)
        except Exception as err:
            # The column-limited read fails when the schema differs; retry with
            # all columns (a genuine I/O error will surface from this second read).
            print('Column-limited read failed for', f, '-', err, '- reading all columns')
            df = pd.read_parquet(f)
        if df.empty:
            continue
        tcol = detect_time_col(df, time_cols)
        if tcol is None:
            raise AssertionError(f'Expected one of {time_cols}, got {list(df.columns)[:10]} ...')
        missing = [c for c in ('price', 'qty') if c not in df.columns]
        if missing:
            print('Skipping', f, '- missing columns:', missing)
            continue
        df[tcol] = pd.to_datetime(df[tcol], utc=True, errors='coerce')
        df = df.dropna(subset=[tcol]).set_index(tcol).sort_index()
        if df.empty:
            continue
        df = _safe_numeric(df, ['price', 'qty', 'quote_qty'])
        df['dollar'] = (df['price'] * df['qty']).fillna(0)
        g = df.resample(BAR_FREQ)
        out = pd.DataFrame({
            'trades_count': g.size(),
            'qty_sum':      g['qty'].sum(min_count=1),
            'dollar_sum':   g['dollar'].sum(min_count=1),
        })
        traded = out['qty_sum']
        # dollar_sum / qty_sum is the volume-weighted price; vectorised instead
        # of a Python-level apply per bar.
        out['vwap'] = out['dollar_sum'] / traded.where(traded > 0)
        if 'is_buyer_maker' in df.columns:
            # Plain ndarray mask: avoids index alignment on duplicate timestamps.
            maker = _to_bool_series(df['is_buyer_maker']).to_numpy()
            # is_buyer_maker False rows feed buy_qty, True rows feed sell_qty.
            out['buy_qty']  = df['qty'].where(~maker, 0.0).resample(BAR_FREQ).sum()
            out['sell_qty'] = df['qty'].where(maker, 0.0).resample(BAR_FREQ).sum()
            out['ofi']      = (out['buy_qty'] - out['sell_qty']) / (out['qty_sum'] + 1e-9)
        else:
            out['buy_qty'] = 0.0; out['sell_qty'] = 0.0; out['ofi'] = 0.0
        out = out.astype('float32')
        out.to_parquet(cpath)
        outs.append(out)
        del df, out; gc.collect()
    if not outs:
        return pd.DataFrame()
    df_all = pd.concat(outs).sort_index()
    merged = df_all.groupby(df_all.index).sum()
    # Counts and sums add up across files; ratios do not, so rebuild them.
    qty = merged['qty_sum']
    merged['vwap'] = merged['dollar_sum'] / qty.where(qty > 0)
    merged['ofi'] = (merged['buy_qty'] - merged['sell_qty']) / (qty + 1e-9)
    return merged

def aggregate_aggtrades_files(files):
    """Aggregate aggTrades files into BAR_FREQ bars, one file at a time.

    For each file: if its cache (``_cache_path('aggtrades', f)``) exists it is
    loaded; otherwise the file is read, resampled, cast to float32, written to
    the cache and released before the next file is touched.

    Every row is kept.  Several rows can share one timestamp, so rows are NOT
    de-duplicated on time - which is why the time index is built here rather
    than through ``to_utc_index``.  Caches written by an earlier version that
    dropped such rows under-count volume; delete them to rebuild.

    Returns a frame indexed by bar start with columns ``agg_count``,
    ``agg_qty_sum``, ``agg_dollar_sum`` and ``agg_vwap``, or an empty DataFrame
    when nothing was aggregated.  Sums are added across files; ``agg_vwap`` is
    a ratio, so it is recomputed from the combined sums (NaN for bars with no
    traded quantity).

    Raises AssertionError when a file has none of the ``transact_time``,
    ``time`` or ``timestamp`` columns.
    """
    wanted = ['price', 'quantity', 'transact_time']
    time_cols = ('transact_time', 'time', 'timestamp')
    outs = []
    for f in files:
        cpath = _cache_path('aggtrades', f)
        if cpath.exists():
            outs.append(pd.read_parquet(cpath))
            continue
        try:
            df = pd.read_parquet(f, columns=wanted)
        except Exception as err:
            # The column-limited read fails when the schema differs; retry with
            # all columns (a genuine I/O error will surface from this second read).
            print('Column-limited read failed for', f, '-', err, '- reading all columns')
            df = pd.read_parquet(f)
        if df.empty:
            continue
        tcol = detect_time_col(df, time_cols)
        if tcol is None:
            raise AssertionError(f'Expected one of {time_cols}, got {list(df.columns)[:10]} ...')
        df = df.rename(columns={'quantity': 'qty'})
        missing = [c for c in ('price', 'qty') if c not in df.columns]
        if missing:
            print('Skipping', f, '- missing columns:', missing)
            continue
        df[tcol] = pd.to_datetime(df[tcol], utc=True, errors='coerce')
        df = df.dropna(subset=[tcol]).set_index(tcol).sort_index()
        if df.empty:
            continue
        df = _safe_numeric(df, ['price', 'qty'])
        df['dollar'] = (df['price'] * df['qty']).fillna(0)
        g = df.resample(BAR_FREQ)
        out = pd.DataFrame({
            'agg_count':      g.size(),
            'agg_qty_sum':    g['qty'].sum(min_count=1),
            'agg_dollar_sum': g['dollar'].sum(min_count=1),
        })
        traded = out['agg_qty_sum']
        # dollar sum / quantity sum is the volume-weighted price; vectorised
        # instead of a Python-level apply per bar.
        out['agg_vwap'] = out['agg_dollar_sum'] / traded.where(traded > 0)
        out = out.astype('float32')
        out.to_parquet(cpath)
        outs.append(out)
        del df, out; gc.collect()
    if not outs:
        return pd.DataFrame()
    df_all = pd.concat(outs).sort_index()
    merged = df_all.groupby(df_all.index).sum()
    # Counts and sums add up across files; the VWAP ratio does not, so rebuild it.
    qty = merged['agg_qty_sum']
    merged['agg_vwap'] = merged['agg_dollar_sum'] / qty.where(qty > 0)
    return merged

def aggregate_bookdepth_files(files):
    """Aggregate bookDepth files into BAR_FREQ bars, one file at a time.

    For each file: if its cache (``_cache_path('bookdepth', f)``) exists it is
    loaded; otherwise the file is read, resampled, cast to float32, written to
    the cache and released before the next file is touched.

    Every row is kept.  Several rows can share one timestamp, so rows are NOT
    de-duplicated on time - which is why the time index is built here rather
    than through ``to_utc_index``.  Caches written by an earlier version that
    kept a single row per timestamp hold smaller sums; delete them to rebuild.

    Returns a frame indexed by bar start with columns ``bd_notional_sum`` and
    ``bd_depth_sum`` (sums over all rows in the bar, added across files), or an
    empty DataFrame when nothing was aggregated.

    Raises AssertionError when a file has neither a ``timestamp`` nor a
    ``time`` column.
    """
    wanted = ['timestamp', 'percentage', 'depth', 'notional']
    time_cols = ('timestamp', 'time')
    outs = []
    for f in files:
        cpath = _cache_path('bookdepth', f)
        if cpath.exists():
            outs.append(pd.read_parquet(cpath))
            continue
        try:
            df = pd.read_parquet(f, columns=wanted)
        except Exception as err:
            # The column-limited read fails when the schema differs; retry with
            # all columns (a genuine I/O error will surface from this second read).
            print('Column-limited read failed for', f, '-', err, '- reading all columns')
            df = pd.read_parquet(f)
        if df.empty:
            continue
        tcol = detect_time_col(df, time_cols)
        if tcol is None:
            raise AssertionError(f'Expected one of {time_cols}, got {list(df.columns)[:10]} ...')
        missing = [c for c in ('depth', 'notional') if c not in df.columns]
        if missing:
            print('Skipping', f, '- missing columns:', missing)
            continue
        df[tcol] = pd.to_datetime(df[tcol], utc=True, errors='coerce')
        df = df.dropna(subset=[tcol]).set_index(tcol).sort_index()
        if df.empty:
            continue
        df = _safe_numeric(df, ['percentage', 'depth', 'notional'])
        g = df.resample(BAR_FREQ)
        out = pd.DataFrame({
            'bd_notional_sum': g['notional'].sum(min_count=1),
            'bd_depth_sum':    g['depth'].sum(min_count=1)
        }).astype('float32')
        out.to_parquet(cpath)
        outs.append(out)
        del df, out; gc.collect()
    if not outs:
        return pd.DataFrame()
    df_all = pd.concat(outs).sort_index()
    return df_all.groupby(df_all.index).sum()


## 3) Load datasets with memory toggles (safe defaults)

In [None]:
# Toggle heavy sources off initially; enable one-by-one after caches are built
# Each flag gates one data source in the loading code below; a disabled source
# is replaced by an empty DataFrame and simply not joined later.
USE_KLINES = True               # must stay True: klines are the base frame
USE_MARK_INDEX_PREMIUM = True
USE_METRICS = True
USE_TRADES = False      # set True after first light run
USE_AGGTRADES = False   # set True after first light run
USE_BOOKDEPTH = False   # set True after first light run

# Base klines
# kl_df becomes the 15m base frame that every other source is left-joined onto.
if USE_KLINES:
    kl_raw = load_concat(manifest['klines'], columns=['open_time','close_time','open','high','low','close','volume'])
    print('klines raw:', kl_raw.shape)
    # Index by open_time; if only close_time is usable, open_time is rebuilt as close_time - BAR_FREQ.
    kl_df = to_utc_index(kl_raw, preferred_time_cols=('open_time','time','timestamp','close_time'), reconstruct_from_close=True)
    keep_cols = [c for c in ['open','high','low','close','volume'] if c in kl_df.columns]
    # volume is optional, the four price columns are not.
    assert len(keep_cols) >= 4, 'Expected at least open/high/low/close in klines.'
    kl_df = kl_df[keep_cols].astype('float32')
    print('klines aligned:', kl_df.shape)
else:
    raise RuntimeError('Klines are required as the base.')

# Mark/Index/Premium (bar → small)
# Three auxiliary kline sets, each loaded, time-indexed and reduced to float32 OHLCV columns.
if USE_MARK_INDEX_PREMIUM:
    mk_df = to_utc_index(load_concat(manifest['markpriceklines'], columns=['open_time','close_time','open','high','low','close','volume']))
    ix_df = to_utc_index(load_concat(manifest['indexpriceklines'], columns=['open_time','close_time','open','high','low','close','volume']))
    pr_df = to_utc_index(load_concat(manifest['premiumindexklines'], columns=['open_time','close_time','open','high','low','close','volume']))
    def sel_bars(df):
        # Keep whichever of the OHLCV columns exist; an empty frame passes through unchanged.
        return df[[c for c in ['open','high','low','close','volume'] if c in df.columns]].astype('float32') if not df.empty else df
    mark_k, index_k, prem_k = sel_bars(mk_df), sel_bars(ix_df), sel_bars(pr_df)
else:
    # Empty frames keep the later `.empty` checks working when the source is disabled.
    mark_k = index_k = prem_k = pd.DataFrame()

# Metrics (sparse → ffill 15m)
# mt_15 holds the metrics columns on the BAR_FREQ grid, or is empty when disabled / no data.
if USE_METRICS:
    mt_raw = load_concat(manifest['metrics'], columns=['create_time','sum_open_interest','sum_open_interest_value','sum_toptrader_long_short_ratio','sum_taker_long_short_vol_ratio'])
    mt_df  = to_utc_index(mt_raw, preferred_time_cols=('create_time','time','timestamp'))
    if not mt_df.empty:
        metrics_cols = [c for c in ['sum_open_interest','sum_open_interest_value','sum_toptrader_long_short_ratio','sum_taker_long_short_vol_ratio'] if c in mt_df.columns]
        # Resample onto the bar grid, carrying the last known value forward.
        mt_15 = mt_df[metrics_cols].astype('float32').resample(BAR_FREQ).ffill()
    else:
        mt_15 = pd.DataFrame()
else:
    mt_15 = pd.DataFrame()

# Heavy sources: stream → cache → merge
# Each aggregator runs only when its flag is on AND the manifest lists files for it;
# otherwise an empty frame stands in so the merge step can skip the source.
tr_15 = aggregate_trades_files(manifest['trades'])   if USE_TRADES    and manifest['trades']    else pd.DataFrame()
ag_15 = aggregate_aggtrades_files(manifest['aggtrades']) if USE_AGGTRADES and manifest['aggtrades'] else pd.DataFrame()
bd_15 = aggregate_bookdepth_files(manifest['bookdepth']) if USE_BOOKDEPTH and manifest['bookdepth'] else pd.DataFrame()

# Shape summary of everything loaded in this cell.
print('Shapes — kl:', kl_df.shape,
      'mark:', getattr(mark_k,'shape',()), 'index:', getattr(index_k,'shape',()), 'prem:', getattr(prem_k,'shape',()))
print('Shapes — metrics15:', mt_15.shape, 'trades15:', tr_15.shape, 'aggtrades15:', ag_15.shape, 'bookdepth15:', bd_15.shape)

## 4) Merge into unified 15m base

In [None]:
# Start from the kline frame; every other source is left-joined onto its index.
base = kl_df.copy()

def safe_join(left, right):
    """Left-join *right* onto *left* by index; return *left* unchanged when *right* is empty."""
    return left.join(right, how='left') if not right.empty else left

# Only the close of each auxiliary kline set is kept, under a distinct name.
# The .empty guards are required: selecting ['close'] from an empty frame would fail.
if not mark_k.empty:  base = safe_join(base, mark_k[['close']].rename(columns={'close':'mark_close'}))
if not index_k.empty: base = safe_join(base, index_k[['close']].rename(columns={'close':'index_close'}))
if not prem_k.empty:  base = safe_join(base, prem_k[['close']].rename(columns={'close':'premium_close'}))
base = safe_join(base, mt_15)
base = safe_join(base, tr_15)
base = safe_join(base, ag_15)
base = safe_join(base, bd_15)

# Infinities become NaN, gaps are filled from the previous bar only (no look-ahead),
# and whatever is still missing at the start becomes 0.
# .ffill() replaces the deprecated fillna(method='ffill').
base = base.sort_index().replace([np.inf,-np.inf], np.nan).ffill().fillna(0)
print('Unified base shape:', base.shape)
base.tail()

## 5) Triple‑barrier labels (next n bars)

In [None]:
# Label horizon: how many future bars each label looks at.
N_HORIZON = 16   # next n=16 bars (~4h on 15m)
# Upper / lower barrier widths as multiples of ATR, and the ATR smoothing span.
K_UP = 1.5; K_DN = 1.5; ATR_LEN = 14

def compute_atr(df: pd.DataFrame, atr_len=14):
    """Return the exponentially smoothed true range of *df*.

    The true range of a bar is the largest of ``|high - low|``,
    ``|high - previous close|`` and ``|low - previous close|`` (the first bar
    has no previous close, so only ``|high - low|`` counts).  Smoothing is an
    EWM mean with ``span=atr_len`` and ``adjust=False``.
    """
    prev_close = df['close'].shift()
    candidates = pd.concat(
        [
            (df['high'] - df['low']).abs(),
            (df['high'] - prev_close).abs(),
            (df['low'] - prev_close).abs(),
        ],
        axis=1,
    )
    true_range = candidates.max(axis=1)
    return true_range.ewm(span=atr_len, adjust=False).mean()

def triple_barrier_labels(df, n=16, k_up=1.5, k_dn=1.5, atr_len=14):
    """Label each bar by which price barrier is touched first in the next *n* bars.

    For bar ``i`` the barriers are ``close * (1 + k_up * ATR / close)`` and
    ``close * (1 - k_dn * ATR / close)``.  Looking at bars ``i+1 .. i+n``:

    * 1 - a high reaches the upper barrier strictly before any low reaches the lower one
    * 0 - a low reaches the lower barrier strictly before any high reaches the upper one
    * 2 - neither barrier is touched, or both are first touched in the same bar

    The final *n* bars do not have a full look-ahead window and are left at 2;
    for them 2 is a placeholder, not an observed outcome.

    Returns an int8 Series named ``'label'`` aligned with ``df.index``.
    """
    atr = compute_atr(df, atr_len=atr_len)
    # .bfill() replaces the deprecated fillna(method='bfill').
    up_m = (k_up * atr / df['close']).bfill().to_numpy()
    dn_m = (k_dn * atr / df['close']).bfill().to_numpy()
    price = df['close'].to_numpy()
    hi, lo = df['high'].to_numpy(), df['low'].to_numpy()
    total = len(df)
    y = np.full(total, 2, dtype=np.int8)
    for i in range(max(0, total - n)):
        up = price[i] * (1 + up_m[i])
        dn = price[i] * (1 - dn_m[i])
        hit_up = np.flatnonzero(hi[i+1:i+n+1] >= up)
        hit_dn = np.flatnonzero(lo[i+1:i+n+1] <= dn)
        # n is past the end of the window, so it stands for "never touched".
        first_up = hit_up[0] if hit_up.size else n
        first_dn = hit_dn[0] if hit_dn.size else n
        if first_up < first_dn:
            y[i] = 1
        elif first_dn < first_up:
            y[i] = 0
    return pd.Series(y, index=df.index, name='label')

# Label every bar of the unified frame, then show the class balance (0, 1, 2).
labels = triple_barrier_labels(base, n=N_HORIZON, k_up=K_UP, k_dn=K_DN, atr_len=ATR_LEN)
labels.value_counts()

## 6) Lightweight features (float32)

In [None]:
def engineer_features(df: pd.DataFrame):
    """Build the float32 feature matrix from the unified bar frame.

    Always produced: close returns over 1, 2, 4, 8, 16, 32 and 64 bars,
    ATR(14) relative to close, EMA(8) - EMA(21) of close, and an
    EWM-based RSI(14).  Further columns are added only when their source
    column exists in *df* (mark/index spreads, premium change, open-interest
    and taker-ratio changes, order-flow imbalance, trade count, VWAP spread,
    book-depth notional).

    Missing and infinite values are filled from the PREVIOUS bar and then with
    0.  Back-filling is deliberately avoided: it would copy values from future
    bars into earlier rows, which is look-ahead in a forecasting model.
    """
    close = df['close']
    X = pd.DataFrame(index=df.index)
    X['ret_1'] = close.pct_change()
    for lag in [2,4,8,16,32,64]:
        X[f'ret_{lag}'] = close.pct_change(lag)
    X['atr14p'] = compute_atr(df, atr_len=14) / close
    X['ema_diff'] = close.ewm(span=8).mean() - close.ewm(span=21).mean()
    delta = close.diff()
    gains = delta.clip(lower=0)
    losses = -delta.clip(upper=0)
    # 1e-9 keeps the ratio finite when there were no down moves.
    rs = gains.ewm(span=14).mean() / (losses.ewm(span=14).mean() + 1e-9)
    X['rsi14'] = 100 - 100/(1+rs)
    if 'mark_close' in df.columns:  X['mark_spread_p']  = (df['mark_close']  - close)/close
    if 'index_close' in df.columns: X['index_spread_p'] = (df['index_close'] - close)/close
    if 'premium_close' in df.columns: X['premium_chg'] = df['premium_close'].pct_change()
    if 'sum_open_interest' in df.columns: X['oi_chg_p'] = df['sum_open_interest'].pct_change().fillna(0)
    if 'sum_taker_long_short_vol_ratio' in df.columns: X['taker_ls_chg_p'] = df['sum_taker_long_short_vol_ratio'].pct_change().fillna(0)
    if 'ofi' in df.columns: X['ofi'] = df['ofi']
    if 'trades_count' in df.columns: X['trades_count'] = df['trades_count']
    if 'vwap' in df.columns: X['vwap_close_spread'] = (df['vwap'] - close)/close
    if 'bd_notional_sum' in df.columns: X['bd_notional_sum'] = df['bd_notional_sum']
    X = X.replace([np.inf,-np.inf], np.nan).ffill().fillna(0)
    return X.astype('float32')

# Build the feature matrix, confirm its shape and dtype, then free temporaries.
X = engineer_features(base)
print('X shape:', X.shape, 'dtype:', X.dtypes.iloc[0])
gc.collect(); X.head()

## 7) Train LightGBM (early stopping) + isotonic calibration

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss
import lightgbm as lgb

# Time-based holdout for calibration
# The last N_HORIZON rows only carry placeholder labels (their look-ahead window
# is incomplete), so they are excluded from both training and the holdout.
n = len(X)
n_labeled = n - N_HORIZON
cal_size = max(int(n_labeled * 0.15), 500)
split_idx = n_labeled - cal_size
# Labels look N_HORIZON bars ahead, so the last training labels would otherwise
# be computed from bars inside the holdout; leave a purge gap of that length.
train_end = split_idx - N_HORIZON
if train_end <= 0:
    raise ValueError(
        f'Not enough rows ({n}) for a {cal_size}-row holdout plus a {N_HORIZON}-bar purge gap.')
X_tr, y_tr = X.iloc[:train_end], labels.iloc[:train_end]
X_cal, y_cal = X.iloc[split_idx:n_labeled], labels.iloc[split_idx:n_labeled]

# Encode labels to 0..k-1 using the classes seen in training.
le = LabelEncoder(); y_tr_enc = le.fit_transform(y_tr.values); y_cal_enc = le.transform(y_cal.values)

clf = lgb.LGBMClassifier(
    objective='multiclass', num_class=3,
    learning_rate=0.03, n_estimators=4000,  # allow many trees w/ early stopping
    num_leaves=64, subsample=0.8, colsample_bytree=0.8,
    random_state=42, n_jobs=-1
)
clf.fit(
    X_tr, y_tr_enc,
    eval_set=[(X_cal, y_cal_enc)],
    eval_metric='multi_logloss',
    callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)]
)

# Isotonic calibration on tail holdout
# NOTE(review): the same holdout drives early stopping, fits the calibrator and
# is scored below, so the printed Brier score is optimistic. Carve out a separate
# final test slice if an unbiased number is needed.
# NOTE(review): cv='prefit' is reported as deprecated in recent scikit-learn
# releases - confirm against the installed version.
cal = CalibratedClassifierCV(clf, method='isotonic', cv='prefit')
cal.fit(X_cal, y_cal_enc)

probs_cal = cal.predict_proba(X_cal)
# Probability column j corresponds to encoded class j, so the UP column and the
# UP truth vector must both be derived from the encoder position of label 1.
_classes = list(le.classes_)
if 1 in _classes:
    up_idx = _classes.index(1)
    print('Calibration Brier (UP on holdout):', brier_score_loss((y_cal_enc==up_idx).astype(int), probs_cal[:, up_idx]))
else:
    up_idx = None
    print('Calibration Brier (UP on holdout): n/a (no UP labels in the training split)')
gc.collect()

## 8) Save artifacts & predict() helper

In [None]:
# Persist everything needed to score new rows: the calibrated model, the label
# encoder and the ordered feature names.
ARTIFACT_DIR = Path('/content/drive/MyDrive/binance_models')
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# The two pickles should only ever be loaded back from this trusted folder.
with open(ARTIFACT_DIR/'lgbm_calibrated.pkl', 'wb') as f:
    pickle.dump(cal, f)
with open(ARTIFACT_DIR/'label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)
with open(ARTIFACT_DIR/'feature_columns.json', 'w') as f:
    json.dump(list(X.columns), f)
print('Saved artifacts to', ARTIFACT_DIR)

def predict_live(latest_row: pd.Series, calibrated_model, label_encoder, feature_columns):
    """Score one feature row and return named class probabilities.

    Parameters
    ----------
    latest_row : Series holding at least the entries named in *feature_columns*.
    calibrated_model : object with ``predict_proba``; its output columns are
        taken to follow the order of ``label_encoder.classes_``.
    label_encoder : object exposing ``classes_`` (the original labels 0/1/2).
    feature_columns : ordered feature names used at training time.

    Returns
    -------
    dict with keys ``'P_down'`` (label 0), ``'P_up'`` (label 1) and
    ``'P_neutral'`` (label 2); a label the encoder never saw maps to NaN.

    The row is passed as a one-row float32 DataFrame with named columns rather
    than a bare array, so a model fitted on a DataFrame receives the same
    feature names and dtype it was trained with.
    """
    cols = list(feature_columns)
    values = latest_row[cols].astype('float32').to_numpy()
    Xr = pd.DataFrame([values], columns=cols)
    proba = calibrated_model.predict_proba(Xr)[0]
    classes = list(label_encoder.classes_)
    out = {}
    for cname, cid in [('P_down',0), ('P_up',1), ('P_neutral',2)]:
        out[cname] = float(proba[classes.index(cid)]) if cid in classes else np.nan
    return out

# Smoke test: score the most recent feature row with the model trained above.
print('Latest probabilities (current row):', predict_live(X.iloc[-1], cal, le, X.columns))