# Colab: Binance Futures Direction Model (LightGBM Only, Full Data)

**What this notebook does**
- Mounts **Google Drive**; scans `/content/drive/MyDrive/binance_data`
- Loads ALL folders you provided (klines, mark/index/premium klines, metrics, trades, aggTrades, bookDepth)
- Robust timestamp parsing:
  - Uses common time columns per dataset
  - Reconstructs `open_time` from `close_time` for klines if needed
- Aggregates non-bar datasets to **15m**
- Builds a unified features frame
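- Labels each bar with the **triple-barrier** method (UP/DOWN/NEUTRAL): the class is decided by whichever ATR-scaled barrier is touched first within the next `n` bars, and is NEUTRAL when neither is touched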
- Trains **LightGBM** (multiclass) and calibrates its probabilities with **isotonic regression** on a time-ordered holdout
- Saves model + feature schema to Drive and provides a **predict()** stub for live usage

> Adjust `DATA_ROOT`, `BAR_FREQ`, and labeling params to your liking.

## 0) Setup & Mount Google Drive

In [None]:
%pip -q install lightgbm==4.3.0 pyarrow==16.1.0 fastparquet==2024.5.0
import os, sys, json, math, gc, pickle, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from google.colab import drive
# Mount Google Drive into the Colab runtime at /content/drive.
drive.mount('/content/drive')
print('Drive mounted.')

## 1) Configure paths & discover Parquet files

In [None]:
from pathlib import Path

# === Adjust if your data is elsewhere in Drive ===
# Root folder that is scanned recursively for *.parquet files.
DATA_ROOT = Path('/content/drive/MyDrive/binance_data')
# Stop right away with a readable message when the folder does not exist.
assert DATA_ROOT.exists(), f'Data root not found: {DATA_ROOT}'

_MANIFEST_CATEGORIES = (
    'klines', 'markpriceklines', 'indexpriceklines', 'premiumindexklines',
    'metrics', 'trades', 'aggtrades', 'bookdepth', 'other',
)


def _classify_dataset(text: str) -> str:
    """Map a lower-cased file name or folder path to a manifest category."""
    # Order matters: the specialised kline flavours and 'aggtrade' must be
    # tested before the generic 'kline' / 'trade' substrings they contain.
    if 'kline' in text:
        if 'markprice' in text:
            return 'markpriceklines'
        if 'indexprice' in text:
            return 'indexpriceklines'
        if 'premiumindex' in text:
            return 'premiumindexklines'
        return 'klines'
    if 'aggtrade' in text:
        return 'aggtrades'
    if 'bookdepth' in text or 'orderbook' in text:
        return 'bookdepth'
    if 'metrics' in text or 'open_interest' in text:
        return 'metrics'
    if 'trade' in text:
        return 'trades'
    return 'other'


def build_manifest(root: Path) -> dict:
    """Group every ``*.parquet`` file below *root* by dataset type.

    A file is classified from its own name first. When the name carries no
    dataset keyword (for example ``klines/BTCUSDT-15m-2024-01.parquet``), the
    folder path relative to *root* is consulted instead, so data organised as
    one folder per dataset is still recognised rather than falling into
    ``'other'``.

    Args:
        root: Directory that is searched recursively.

    Returns:
        Dict mapping each category name to a list of file paths (as strings),
        in sorted path order. Every category key is always present.
    """
    manifest = {category: [] for category in _MANIFEST_CATEGORIES}
    # sorted() makes the per-category file order reproducible between runs.
    for p in sorted(root.rglob('*.parquet')):
        category = _classify_dataset(p.name.lower())
        if category == 'other':
            # Only folders below root are considered, so the name of the data
            # root itself can never influence the classification.
            folders = p.parent.relative_to(root).as_posix().lower()
            category = _classify_dataset(folders)
        manifest[category].append(str(p))
    return manifest

# Scan the data root once, then report how many files each category received.
manifest = build_manifest(DATA_ROOT)
for category, paths in manifest.items():
    print(f"{category}: {len(paths)} files")


## 2) Load helpers, robust time handling, and 15m aggregation

In [None]:
# --- Global bar frequency ---
# Spelled '15min' instead of '15T': the single-letter 'T' minute alias is
# deprecated in recent pandas releases, while 'min' is accepted everywhere
# BAR_FREQ is used (resample() and pd.Timedelta()).
BAR_FREQ = '15min'  # 15-minute bars

def load_concat(files):
    """Read each parquet path in *files* and stack the results row-wise.

    A file that cannot be read is reported on stdout and skipped. Returns an
    empty DataFrame when no file could be loaded, otherwise the concatenation
    with a fresh 0..n-1 index.
    """
    frames = []
    for path in files:
        try:
            frame = pd.read_parquet(path)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print('Failed to load', path, e)
        else:
            frames.append(frame)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

def detect_time_col(df, candidates):
    """Return the first name in *candidates* that is a column of *df*, else None."""
    return next((c for c in candidates if c in df.columns), None)


def _parse_utc_timestamps(values):
    """Convert a timestamp column to tz-aware UTC datetimes (unparseable -> NaT).

    Numeric columns are treated as Unix epochs. Their unit is inferred from
    the typical magnitude, because ``pd.to_datetime`` would otherwise read
    plain integers as nanoseconds and place millisecond data in 1970.
    Non-numeric columns (strings, datetimes) are parsed as they are.
    """
    if pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values):
        magnitude = values.abs().median()
        # Present-day epochs are ~1e9 s, ~1e12 ms, ~1e15 us and ~1e18 ns.
        if pd.isna(magnitude) or magnitude >= 1e17:
            unit = 'ns'
        elif magnitude >= 1e14:
            unit = 'us'
        elif magnitude >= 1e11:
            unit = 'ms'
        else:
            unit = 's'
        return pd.to_datetime(values, unit=unit, utc=True, errors='coerce')
    return pd.to_datetime(values, utc=True, errors='coerce')


def to_utc_index(
    df,
    preferred_time_cols=('open_time','time','timestamp','create_time','transact_time','close_time'),
    reconstruct_from_close=False,
    bar_freq=None
):
    """Return *df* indexed by a sorted, de-duplicated UTC DatetimeIndex.

    The first column of *preferred_time_cols* found in *df* is parsed as the
    timestamp. Rows whose timestamp cannot be parsed are dropped, and rows
    sharing a timestamp are collapsed to the first one in input order. The
    caller's frame is never modified.

    Args:
        df: Input frame. An empty frame is returned unchanged.
        preferred_time_cols: Candidate time column names, in priority order.
        reconstruct_from_close: When True and the only usable time column is
            ``close_time``, build ``open_time = close_time - bar_freq`` and
            index by that instead.
        bar_freq: Bar length used for the reconstruction; defaults to the
            module-level ``BAR_FREQ``.

    Raises:
        ValueError: Reconstruction was requested but ``close_time`` is absent.
        AssertionError: None of *preferred_time_cols* exists in *df*.
    """
    if df.empty:
        return df

    tcol = detect_time_col(df, preferred_time_cols)

    # Special case: klines may only have close_time
    if reconstruct_from_close and (tcol is None or tcol == 'close_time'):
        if 'close_time' not in df.columns:
            raise ValueError("Need close_time to reconstruct open_time.")
        step = pd.Timedelta(BAR_FREQ if bar_freq is None else bar_freq)
        close_ts = _parse_utc_timestamps(df['close_time'])
        # NOTE(review): exchange close_time is commonly "next open - 1 ms";
        # rounding up to the bar grid puts the rebuilt open_time on the same
        # grid as the resampled datasets, and leaves aligned values untouched.
        parsed = (close_ts - step).dt.ceil(step)
        tcol = 'open_time'
    elif tcol is None:
        raise AssertionError(f"Expected one of {preferred_time_cols}, got {list(df.columns)[:10]} ...")
    else:
        parsed = _parse_utc_timestamps(df[tcol])

    # assign() builds a new frame, so the caller's column keeps its raw values.
    df = df.assign(**{tcol: parsed})
    df = (
        df.dropna(subset=[tcol])
          .sort_values(tcol, kind='stable')  # stable: ties keep input order
          .drop_duplicates(tcol)
    )
    # Parsing with utc=True already yields a UTC-aware index.
    return df.set_index(tcol)

# --- Aggregators to 15m ---
def _to_bool_series(s):
    """Robust conversion of is_buyer_maker to bool."""
    if s.dtype == 'bool':
        return s
    mapping_true = {True, 'true', 'True', 1, '1'}
    mapping_false = {False, 'false', 'False', 0, '0'}
    return s.map(lambda x: True if x in mapping_true else (False if x in mapping_false else False)).astype(bool)

def agg_trades_15m(df):
    # trades: id, price, qty, quote_qty, time, is_buyer_maker
    if df.empty: return df
    df = df.copy()
    for col in ['price','qty','quote_qty']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    if 'is_buyer_maker' in df.columns:
        df['is_buyer_maker'] = _to_bool_series(df['is_buyer_maker'])
    df['dollar'] = (df.get('price',0) * df.get('qty',0)).fillna(0)
    g = df.resample(BAR_FREQ)
    out = pd.DataFrame({
        'trades_count': g.size(),
        'qty_sum': g['qty'].sum(min_count=1),
        'dollar_sum': g['dollar'].sum(min_count=1),
        'vwap': g.apply(lambda x: (x['price']*x['qty']).sum()/max(x['qty'].sum(),1e-9) if 'price' in x and 'qty' in x else np.nan)
    })
    if 'is_buyer_maker' in df.columns:
        def side_sum(x, side_bool):
            if len(x)==0: return 0.0
            return x.loc[x['is_buyer_maker']==side_bool, 'qty'].sum()
        out['buy_qty'] = g.apply(lambda x: side_sum(x, False))
        out['sell_qty'] = g.apply(lambda x: side_sum(x, True))
        out['ofi'] = (out['buy_qty'] - out['sell_qty']) / (out['qty_sum'] + 1e-9)
    else:
        out['buy_qty'] = 0.0; out['sell_qty'] = 0.0; out['ofi'] = 0.0
    return out

def agg_aggtrades_15m(df, freq=None):
    """Aggregate aggTrades rows into fixed-width bars.

    Expects a DatetimeIndex plus ``price`` and either ``qty`` or ``quantity``
    (the latter is renamed to ``qty``); the caller's frame is not modified.

    Args:
        df: aggTrades frame. An empty frame is returned unchanged.
        freq: Bar width for ``resample``; defaults to the module-level
            ``BAR_FREQ``.

    Returns:
        One row per bar with ``agg_count``, ``agg_qty_sum``,
        ``agg_dollar_sum`` and ``agg_vwap``. Bars without rows have a zero
        count and NaN in the other columns.
    """
    # aggtrades: agg_trade_id, price, quantity, ..., is_buyer_maker
    if df.empty:
        return df
    freq = BAR_FREQ if freq is None else freq
    # rename() returns a new frame, so the conversions below stay local.
    df = df.rename(columns={'quantity': 'qty'})
    for col in ('price', 'qty'):
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df['dollar'] = (df.get('price', 0) * df.get('qty', 0)).fillna(0)

    bins = df.resample(freq)
    qty_sum = bins['qty'].sum(min_count=1)
    dollar_sum = bins['dollar'].sum(min_count=1)
    out = pd.DataFrame({
        'agg_count': bins.size(),
        'agg_qty_sum': qty_sum,
        'agg_dollar_sum': dollar_sum,
    })
    if 'price' in df.columns:
        # NaN (not 0) when a bar has no volume, so gaps are not read as prices.
        out['agg_vwap'] = (dollar_sum / qty_sum).where(qty_sum > 0)
    else:
        out['agg_vwap'] = np.nan
    return out

def agg_bookdepth_15m(df, freq=None):
    """Sum book-depth snapshot rows into fixed-width bars.

    Expects a DatetimeIndex. ``notional`` and ``depth`` are summed per bar
    when present; a missing column is skipped instead of raising.

    Args:
        df: bookDepth frame. An empty frame is returned unchanged.
        freq: Bar width for ``resample``; defaults to the module-level
            ``BAR_FREQ``.

    Returns:
        One row per bar with ``bd_notional_sum`` and/or ``bd_depth_sum``
        (NaN for bars without rows). The result has no columns when neither
        source column exists.
    """
    # bookdepth: timestamp, percentage, depth, notional (pre-aggregated snapshots)
    if df.empty:
        return df
    freq = BAR_FREQ if freq is None else freq
    df = df.copy()
    for col in ('percentage', 'depth', 'notional'):
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    bins = df.resample(freq)
    # Same presence check as the numeric coercion above, so a frame lacking
    # one of the columns yields a partial result rather than a KeyError.
    return pd.DataFrame({
        f'bd_{col}_sum': bins[col].sum(min_count=1)
        for col in ('notional', 'depth')
        if col in df.columns
    })


## 3) Load datasets and align to 15m

In [None]:
# --- klines (base OHLCV 15m) ---
kl_raw = load_concat(manifest['klines'])
print('klines raw:', kl_raw.shape)
# Index by bar open time; when close_time is the only timestamp available,
# open_time is rebuilt from it.
kl_df = to_utc_index(
    kl_raw,
    reconstruct_from_close=True,
    preferred_time_cols=('open_time','time','timestamp','close_time'),
)
# Keep whichever OHLCV columns are present, in canonical order.
keep_cols = [col for col in ['open','high','low','close','volume'] if col in kl_df.columns]
assert len(keep_cols) >= 4, 'Expected at least open/high/low/close in klines.'
kl_df = kl_df[keep_cols].astype('float32')
print('klines aligned:', kl_df.shape)

# --- mark/index/premium klines ---
# Each auxiliary kline family is loaded and time-indexed the same way.
mk_df, ix_df, pr_df = (
    to_utc_index(load_concat(manifest[key]))
    for key in ('markpriceklines', 'indexpriceklines', 'premiumindexklines')
)
def sel_bars(df):
    """Keep the OHLCV columns present in *df*, cast to float32.

    An empty frame is handed back untouched.
    """
    if df.empty:
        return df
    present = [name for name in ['open','high','low','close','volume'] if name in df.columns]
    return df[present].astype('float32')
# Reduce each auxiliary frame to its float32 OHLCV columns.
mark_k, index_k, prem_k = (sel_bars(frame) for frame in (mk_df, ix_df, pr_df))
print('mark/index/premium:', mark_k.shape, index_k.shape, prem_k.shape)

# --- metrics (ffill to 15m) ---
mt_df = to_utc_index(
    load_concat(manifest['metrics']),
    preferred_time_cols=('create_time','time','timestamp'),
)
if mt_df.empty:
    mt_15 = pd.DataFrame()
else:
    # Only the metric columns that exist are kept, then forward-filled onto
    # the bar grid.
    metrics_cols = [
        c for c in [
            'sum_open_interest',
            'sum_open_interest_value',
            'sum_toptrader_long_short_ratio',
            'sum_taker_long_short_vol_ratio',
        ]
        if c in mt_df.columns
    ]
    mt_15 = mt_df[metrics_cols].astype('float32').resample(BAR_FREQ).ffill()
print('metrics 15m:', mt_15.shape)

# --- trades & aggTrades (aggregate to 15m) ---
# NOTE(review): to_utc_index keeps one row per timestamp, so trades that share
# an identical timestamp are collapsed before aggregation - confirm that this
# is acceptable for tick data.
tr_df = to_utc_index(load_concat(manifest['trades']), preferred_time_cols=('time','timestamp'))
ag_df = to_utc_index(load_concat(manifest['aggtrades']), preferred_time_cols=('transact_time','time','timestamp'))
tr_15 = pd.DataFrame() if tr_df.empty else agg_trades_15m(tr_df)
ag_15 = pd.DataFrame() if ag_df.empty else agg_aggtrades_15m(ag_df)
print('trades15 / aggtrades15:', tr_15.shape, ag_15.shape)

# --- bookDepth (aggregate to 15m) ---
# NOTE(review): to_utc_index keeps one row per timestamp; if a snapshot holds
# several rows (one per percentage level) only the first survives - confirm.
bd_df = to_utc_index(load_concat(manifest['bookdepth']), preferred_time_cols=('timestamp','time'))
bd_15 = pd.DataFrame() if bd_df.empty else agg_bookdepth_15m(bd_df)
print('bookdepth15:', bd_15.shape)

## 4) Merge into unified 15m feature base

In [None]:
# The kline frame is the spine: every other dataset is left-joined onto its index.
base = kl_df.copy()

def safe_join(left, right):
    """Left-join *right* onto *left* by index; an empty *right* leaves *left* as is."""
    return left.join(right, how='left') if not right.empty else left

# spreads and premiums
# The emptiness checks stay here because selecting ['close'] on an empty
# frame would raise before safe_join could skip it.
if not mark_k.empty:
    base = safe_join(base, mark_k[['close']].rename(columns={'close':'mark_close'}))
if not index_k.empty:
    base = safe_join(base, index_k[['close']].rename(columns={'close':'index_close'}))
if not prem_k.empty:
    base = safe_join(base, prem_k[['close']].rename(columns={'close':'premium_close'}))

# metrics
base = safe_join(base, mt_15)

# order flow & depth
base = safe_join(base, tr_15)
base = safe_join(base, ag_15)
base = safe_join(base, bd_15)

# Gaps are filled from the past only (.ffill() replaces the deprecated
# fillna(method='ffill')); anything before the first observation becomes 0.
base = base.sort_index().replace([np.inf,-np.inf], np.nan).ffill().fillna(0)
print('Unified feature base shape:', base.shape)
base.tail()

## 5) Triple-barrier labels (next n periods)

In [None]:
# Label params (tune as desired)
N_HORIZON = 16   # predict direction within next n bars (4 hours on 15m)
K_UP = 1.5       # upper barrier distance, in multiples of ATR
K_DN = 1.5       # lower barrier distance, in multiples of ATR
ATR_LEN = 14     # EWM span used when computing the ATR

def compute_atr(df: pd.DataFrame, atr_len=14):
    """Return an exponentially smoothed average true range.

    The true range of a bar is the largest of high-low, |high - previous
    close| and |low - previous close|; it is then smoothed with an EWM of
    span *atr_len* (``adjust=False``). *df* needs ``high``, ``low`` and
    ``close`` columns.
    """
    prev_close = df['close'].shift()
    candidates = pd.concat(
        [
            (df['high'] - df['low']).abs(),
            (df['high'] - prev_close).abs(),
            (df['low'] - prev_close).abs(),
        ],
        axis=1,
    )
    # On the first bar the previous close is missing; max() skips NaN, so the
    # true range there is simply high - low.
    true_range = candidates.max(axis=1)
    return true_range.ewm(span=atr_len, adjust=False).mean()

def triple_barrier_labels(df, n=16, k_up=1.5, k_dn=1.5, atr_len=14):
    """Label every bar by which ATR-scaled barrier is touched first.

    For bar ``i`` the barriers sit at ``close * (1 +/- k * ATR / close)``.
    The highs and lows of the following *n* bars are scanned:

    * 1 (UP): the upper barrier is reached strictly before the lower one;
    * 0 (DOWN): the lower barrier is reached strictly before the upper one;
    * 2 (NEUTRAL): neither is reached, or both are first reached in the same
      bar.

    The last *n* bars have no complete look-ahead window and are always 2.

    Args:
        df: Frame with ``high``, ``low`` and ``close`` columns.
        n: Look-ahead horizon in bars.
        k_up: Upper barrier distance in ATR multiples.
        k_dn: Lower barrier distance in ATR multiples.
        atr_len: EWM span passed to ``compute_atr``.

    Returns:
        int8 Series named ``'label'`` on *df*'s index.
    """
    atr = compute_atr(df, atr_len=atr_len)
    price = df['close'].values
    # ATR as a fraction of price, computed once and shared by both barriers.
    # .bfill() replaces the deprecated fillna(method='bfill').
    atr_frac = atr / df['close']
    up_mult = (k_up * atr_frac).bfill().values
    dn_mult = (k_dn * atr_frac).bfill().values
    y = np.full(len(df), 2, dtype=np.int8)  # 1=UP,0=DOWN,2=NEUTRAL
    highs = df['high'].values
    lows = df['low'].values
    L = len(df)
    for i in range(L - n):
        p0 = price[i]
        up = p0 * (1 + up_mult[i])
        dn = p0 * (1 - dn_mult[i])
        # Offsets (within the next n bars) at which each barrier is touched.
        hit_up = np.where(highs[i+1:i+n+1] >= up)[0]
        hit_dn = np.where(lows[i+1:i+n+1] <= dn)[0]
        if hit_up.size and (not hit_dn.size or hit_up[0] < hit_dn[0]):
            y[i] = 1
        elif hit_dn.size and (not hit_up.size or hit_dn[0] < hit_up[0]):
            y[i] = 0
        # otherwise the bar keeps its NEUTRAL default
    labels = pd.Series(y, index=df.index, name='label')
    labels.iloc[-n:] = 2
    return labels

# Label the unified frame with the parameters above, then show the class balance.
labels = triple_barrier_labels(base, n=N_HORIZON, k_up=K_UP, k_dn=K_DN, atr_len=ATR_LEN)
labels.value_counts()

## 6) Feature engineering (rich set)

In [None]:
def engineer_features(df: pd.DataFrame):
    """Build the model feature matrix from the unified bar frame.

    Requires ``high``, ``low`` and ``close``. Features derived from the
    optional columns (``mark_close``, ``index_close``, ``premium_close``,
    ``sum_open_interest``, ``sum_taker_long_short_vol_ratio``, ``ofi``,
    ``trades_count``, ``vwap``, ``bd_notional_sum``, ``volume``) are added
    only when the column exists.

    Missing values are filled from the past (forward fill) and then with 0,
    so warm-up rows never borrow values from later bars.

    Returns:
        DataFrame on *df*'s index containing no NaN or infinite values.
    """
    X = pd.DataFrame(index=df.index)
    close = df['close']
    # Price returns
    X['ret_1'] = close.pct_change()
    for lag in [2,4,8,16,32,64,96]:
        X[f'ret_{lag}'] = close.pct_change(lag)
    # Volatility
    atr14 = compute_atr(df, atr_len=14)
    X['atr14p'] = (atr14/close).replace([np.inf,-np.inf], np.nan)
    X['rv_24'] = np.log(close).diff().rolling(96).std()  # 96 bars = 24 h at 15m
    # EMAs
    ema8 = close.ewm(span=8).mean(); ema21 = close.ewm(span=21).mean()
    X['ema_diff'] = ema8 - ema21
    # RSI
    delta = close.diff(); up = delta.clip(lower=0); down = -delta.clip(upper=0)
    rs = up.ewm(span=14).mean() / (down.ewm(span=14).mean() + 1e-9)
    X['rsi14'] = 100 - 100/(1+rs)
    # Spreads / premiums
    if 'mark_close' in df.columns: X['mark_spread_p'] = (df['mark_close'] - close)/close
    if 'index_close' in df.columns: X['index_spread_p'] = (df['index_close'] - close)/close
    if 'premium_close' in df.columns: X['premium_chg'] = df['premium_close'].pct_change()
    # Metrics (changes)
    if 'sum_open_interest' in df.columns: X['oi_chg_p'] = df['sum_open_interest'].pct_change().fillna(0)
    if 'sum_taker_long_short_vol_ratio' in df.columns: X['taker_ls_chg_p'] = df['sum_taker_long_short_vol_ratio'].pct_change().fillna(0)
    # Order flow
    if 'ofi' in df.columns: X['ofi'] = df['ofi']
    if 'trades_count' in df.columns: X['trades_count'] = df['trades_count']
    if 'vwap' in df.columns: X['vwap_close_spread'] = (df['vwap'] - close)/close
    # Depth
    if 'bd_notional_sum' in df.columns: X['bd_notional_sum'] = df['bd_notional_sum']
    # Volume-scaled features
    X['vol'] = df['volume'] if 'volume' in df.columns else 0
    X['vol_med_24'] = X['vol'].rolling(96).median().replace(0,np.nan)
    X['vol_surge'] = X['vol']/(X['vol_med_24']+1e-9)
    # Clean: forward fill only. A back-fill here would copy later values into
    # the warm-up rows, i.e. leak future information into the features.
    X = X.replace([np.inf,-np.inf], np.nan).ffill().fillna(0)
    return X

# Build the feature matrix from the unified frame and preview its first rows.
X = engineer_features(base)
print('Feature matrix shape:', X.shape)
X.head()

## 7) Train LightGBM and calibrate (time-based split for calibration)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss, log_loss
import lightgbm as lgb

# Reserve the last ~15% for calibration (time-based holdout)
# The final N_HORIZON rows have no complete look-ahead window and were
# force-labelled NEUTRAL, so they are kept out of training and calibration.
n = len(X) - N_HORIZON
cal_size = max(int(n * 0.15), 500)
split_idx = n - cal_size
# Purge gap: labels of the N_HORIZON rows just before the split are derived
# from bars inside the calibration window, so those rows are not trained on.
train_end = split_idx - N_HORIZON
if train_end <= 0:
    raise ValueError(
        f'Not enough rows ({len(X)}) for a {cal_size}-row calibration holdout '
        f'plus a {N_HORIZON}-row purge gap.'
    )

X_tr, y_tr = X.iloc[:train_end], labels.iloc[:train_end]
X_cal, y_cal = X.iloc[split_idx:n], labels.iloc[split_idx:n]

le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr.values)
y_cal_enc = le.transform(y_cal.values)

params = dict(objective='multiclass', num_class=3, learning_rate=0.03, n_estimators=1200,
              num_leaves=96, subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1)
clf = lgb.LGBMClassifier(**params)
# LightGBM 4.x no longer accepts fit(verbose=...); per-iteration logging is
# controlled through the log_evaluation callback (period=0 keeps it silent).
clf.fit(
    X_tr, y_tr_enc,
    eval_set=[(X_cal, y_cal_enc)],
    eval_metric='multi_logloss',
    callbacks=[lgb.log_evaluation(period=0)],
)

# Isotonic calibration on tail holdout
# NOTE(review): cv='prefit' is reported as deprecated in recent scikit-learn
# releases - confirm against the installed version before upgrading.
cal = CalibratedClassifierCV(clf, method='isotonic', cv='prefit')
cal.fit(X_cal, y_cal_enc)

probs_cal = cal.predict_proba(X_cal)
up_idx = list(le.classes_).index(1) if 1 in le.classes_ else 0
# y_cal_enc holds encoded positions, so the UP indicator is compared with
# up_idx (the encoded position of label 1), not with the raw label value.
print('Calibration Brier (UP on holdout):', brier_score_loss((y_cal_enc==up_idx).astype(int), probs_cal[:, up_idx]))

## 8) Save artifacts to Drive & predict() stub

In [None]:
from pathlib import Path
# Everything needed for live scoring is written next to each other on Drive.
ARTIFACT_DIR = Path('/content/drive/MyDrive/binance_models')
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# Calibrated model and label encoder are pickled ...
for artifact_name, artifact in (('lgbm_calibrated.pkl', cal), ('label_encoder.pkl', le)):
    with open(ARTIFACT_DIR/artifact_name, 'wb') as f:
        pickle.dump(artifact, f)
# ... and the feature order is stored as a JSON list.
with open(ARTIFACT_DIR/'feature_columns.json', 'w') as f:
    json.dump(list(X.columns), f)
print('Saved artifacts to', ARTIFACT_DIR)

def predict_live(latest_row: pd.Series, calibrated_model, label_encoder, feature_columns):
    """Score a single feature row and return named class probabilities.

    Args:
        latest_row: Series holding at least every name in *feature_columns*.
        calibrated_model: Object exposing ``predict_proba``.
        label_encoder: Object whose ``classes_`` lists the original labels in
            the column order of ``predict_proba``.
        feature_columns: Feature names in the order used for training.

    Returns:
        Dict with ``P_down`` (label 0), ``P_up`` (label 1) and ``P_neutral``
        (label 2); a label absent from ``classes_`` maps to NaN.
    """
    cols = list(feature_columns)
    # A one-row DataFrame carries the feature names along, so a model fitted
    # on a named frame receives matching input instead of a bare array.
    Xr = pd.DataFrame([latest_row[cols].to_numpy()], columns=cols, dtype=float)
    proba = calibrated_model.predict_proba(Xr)[0]
    classes = list(label_encoder.classes_)
    return {
        cname: float(proba[classes.index(cid)]) if cid in classes else np.nan
        for cname, cid in (('P_down', 0), ('P_up', 1), ('P_neutral', 2))
    }

# Smoke test: score the most recent feature row with the objects built above.
res = predict_live(X.iloc[-1], cal, le, X.columns)
print('Latest probabilities:', res)