# Colab: Binance Futures Turning-Point Pipeline

**What this notebook does**
- Mounts **Google Drive** and points to `MyDrive/binance_data` (adjustable)
- Recursively **lists subfolders/files** (tree view) and builds a **manifest** of Parquet files
- Loads and concatenates the **klines** Parquet files into one time-indexed dataframe (assumed to be 15m bars; other endpoints are only listed in the manifest)
- Creates **triple-barrier labels** (volatility-adaptive)
- Engineers basic features
- Trains a **LightGBM** baseline with **probability calibration**
- Provides a **predict_live()** stub for live usage

> Run this in **Google Colab**. If your Drive layout is different, edit `DATA_ROOT`.


## 0) Setup & Mount Google Drive

In [None]:
%pip -q install lightgbm==4.3.0 pyarrow==16.1.0 fastparquet==2024.5.0
import os, sys, json, math, gc, pickle, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from google.colab import drive
# Mount Google Drive at /content/drive; the data root configured below lives under it.
drive.mount('/content/drive')
print('Drive mounted.')

## 1) Configure path & list folder structure

In [None]:
from pathlib import Path

# Root folder that the directory listing and the Parquet manifest below scan.
DATA_ROOT = Path('/content/drive/MyDrive/binance_data')  # adjust if needed
# Stop here with a readable message if the folder is not where we expect it.
assert DATA_ROOT.exists(), f'Data root not found: {DATA_ROOT}'

def list_dir_structure(root: Path, only_dirs=False, max_depth=10, _depth=0):
    """Print an indented tree of ``root``, listing folders before files.

    Args:
        root: Directory whose children are printed.
        only_dirs: When true, files are left out of the listing.
        max_depth: Deepest nesting level that is still printed (root level is 0).
        _depth: Current nesting level; used by the recursion, callers leave it at 0.

    Entries are ordered folders-first, then case-insensitively by name.  A
    directory that cannot be listed is reported on stdout and skipped.
    """
    if _depth > max_depth:
        return

    def _folders_first(path):
        # False sorts before True, so directories come ahead of files.
        return (not path.is_dir(), path.name.lower())

    try:
        children = sorted(root.iterdir(), key=_folders_first)
    except Exception as e:
        print('Error listing', root, e)
        return

    indent = ' ' * (_depth*2)
    for child in children:
        is_folder = child.is_dir()
        if only_dirs and not is_folder:
            continue
        marker = '📂' if is_folder else '📄'
        print(indent + f"{marker} {child.name}")
        if is_folder:
            list_dir_structure(child, only_dirs=only_dirs, max_depth=max_depth, _depth=_depth+1)

# Print the layout twice: folders only for a quick overview, then folders and files.
print('Root:', DATA_ROOT)
print('\n--- DIRECTORY TREE (folders only) ---')
list_dir_structure(DATA_ROOT, only_dirs=True)
print('\n--- DIRECTORY TREE (folders & files) ---')
list_dir_structure(DATA_ROOT, only_dirs=False)

## 2) Build manifest of Parquet files

In [None]:
from typing import Dict, List
from pathlib import Path

def build_manifest(root: Path) -> Dict[str, List[str]]:
    """Index every ``*.parquet`` file under ``root`` by Binance endpoint.

    Classification looks only at the lower-cased *file name* (not at parent
    folder names).  The first matching rule wins, so the more specific kline
    flavours are tested before plain ``kline`` and ``aggtrade`` before
    ``trade``.

    Args:
        root: Directory searched recursively.

    Returns:
        Dict mapping each category (``klines``, ``markpriceklines``,
        ``indexpriceklines``, ``premiumindexklines``, ``trades``,
        ``aggtrades``, ``bookdepth``, ``metrics``, ``other``) to a list of
        file paths as strings.  Paths are sorted so the manifest, and anything
        that depends on file order downstream, is reproducible across runs.
    """
    manifest: Dict[str, List[str]] = {
        'klines': [], 'markpriceklines': [], 'indexpriceklines': [], 'premiumindexklines': [],
        'trades': [], 'aggtrades': [], 'bookdepth': [], 'metrics': [], 'other': []
    }
    # rglob yields entries in filesystem order, which is not stable; sort it.
    for p in sorted(root.rglob('*.parquet')):
        name = p.name.lower()
        is_kline = 'kline' in name  # also covers 'klines'
        if is_kline and 'markprice' in name:
            category = 'markpriceklines'
        elif is_kline and 'indexprice' in name:
            category = 'indexpriceklines'
        elif is_kline and 'premiumindex' in name:
            category = 'premiumindexklines'
        elif is_kline:
            category = 'klines'
        elif 'aggtrade' in name:
            category = 'aggtrades'
        elif 'bookdepth' in name or 'orderbook' in name:
            category = 'bookdepth'
        elif 'metrics' in name or 'open_interest' in name:
            category = 'metrics'
        elif 'trade' in name:
            category = 'trades'
        else:
            category = 'other'
        manifest[category].append(str(p))
    return manifest

# Index every Parquet file under DATA_ROOT and report the file count per category.
manifest = build_manifest(DATA_ROOT)
for k,v in manifest.items():
    print(f"{k}: {len(v)} files")
# Bare expression so the notebook shows the full manifest as the cell output.
manifest

## 3) Load & concatenate klines (15m)

In [None]:
import pyarrow.parquet as pq

def load_concat_parquet(files):
    """Read each Parquet file and stack the results into one dataframe.

    Loading is best-effort: a file that cannot be read is reported on stdout
    and skipped, and the remaining files are still concatenated.

    Args:
        files: Iterable of Parquet paths.

    Returns:
        The row-wise concatenation of all readable files with a fresh
        0..N-1 index.

    Raises:
        RuntimeError: If not a single file could be loaded.
    """
    loaded = []
    for path in files:
        try:
            frame = pd.read_parquet(path)
        except Exception as err:
            print('Failed to load', path, err)
        else:
            loaded.append(frame)
    if len(loaded) == 0:
        raise RuntimeError('No parquet files loaded')
    return pd.concat(loaded, ignore_index=True)

# Load every kline file from the manifest into a single frame.
kl_files = manifest['klines']
assert len(kl_files) > 0, 'No kline parquet files found. Ensure filenames include kline/klines.'
kl_df = load_concat_parquet(kl_files)
print('klines shape:', kl_df.shape)
print('Columns:', kl_df.columns.tolist())

# The bar timestamp may be stored under either name.
time_col = 'open_time' if 'open_time' in kl_df.columns else ('time' if 'time' in kl_df.columns else None)
assert time_col is not None, 'Expected open_time or time column in klines parquet.'

raw_time = kl_df[time_col]
if pd.api.types.is_numeric_dtype(raw_time):
    # pd.to_datetime treats bare numbers as *nanoseconds* since the epoch, which
    # would map millisecond epochs (the usual exchange export format) to 1970.
    # Infer the unit from the magnitude instead: present-day epochs are ~1e9 s,
    # ~1e12 ms, ~1e15 us and ~1e18 ns.
    # NOTE(review): heuristic — confirm the unit against your Parquet export.
    magnitude = float(raw_time.abs().max())
    if magnitude < 1e11:
        epoch_unit = 's'
    elif magnitude < 1e14:
        epoch_unit = 'ms'
    elif magnitude < 1e17:
        epoch_unit = 'us'
    else:
        epoch_unit = 'ns'
    kl_df[time_col] = pd.to_datetime(raw_time, unit=epoch_unit, utc=True)
else:
    kl_df[time_col] = pd.to_datetime(raw_time, utc=True)

# One row per timestamp, in chronological order, indexed by time.
kl_df = kl_df.sort_values(time_col).drop_duplicates(time_col)
kl_df = kl_df.set_index(time_col)
# utc=True above already yields a tz-aware index; this keeps the index in UTC either way.
kl_df = kl_df.tz_convert('UTC') if kl_df.index.tz is not None else kl_df.tz_localize('UTC')
# Keep only the OHLCV columns that exist and store them compactly.
keep_cols = [c for c in ['open','high','low','close','volume'] if c in kl_df.columns]
kl_df = kl_df[keep_cols].astype('float32')
kl_df.head()

## 4) Triple-barrier labeling

In [None]:
def compute_atr(df: pd.DataFrame, atr_len=14):
    """Return an exponentially smoothed average true range.

    The true range of a bar is the largest of ``|high - low|``,
    ``|high - previous close|`` and ``|low - previous close|``.  It is then
    smoothed with a recursive EMA (``span=atr_len``, ``adjust=False``).

    Args:
        df: Frame with ``high``, ``low`` and ``close`` columns.
        atr_len: EMA span used for the smoothing.

    Returns:
        Series aligned to ``df.index``.
    """
    prev_close = df['close'].shift()
    candidates = pd.concat(
        [
            (df['high'] - df['low']).abs(),
            (df['high'] - prev_close).abs(),
            (df['low'] - prev_close).abs(),
        ],
        axis=1,
    )
    # Row-wise max skips NaN, so the first bar (no previous close) uses high - low.
    true_range = candidates.max(axis=1)
    return true_range.ewm(span=atr_len, adjust=False).mean()

def triple_barrier_labels(df, n=16, k_up=1.5, k_dn=1.5, atr_len=14):
    """Label each bar by which volatility-scaled barrier is touched first.

    For bar ``i`` the barriers are ``close * (1 + k_up * ATR / close)`` above
    and ``close * (1 - k_dn * ATR / close)`` below.  The highs and lows of the
    following ``n`` bars are scanned for the first touch of each barrier.

    Args:
        df: Frame with ``high``, ``low`` and ``close`` columns.
        n: Look-ahead horizon in bars.
        k_up: ATR multiple for the upper barrier.
        k_dn: ATR multiple for the lower barrier.
        atr_len: Span passed to ``compute_atr``.

    Returns:
        int8 Series named ``label`` aligned to ``df.index``:
        1 when the upper barrier is touched strictly earlier, 0 when the lower
        barrier is touched strictly earlier, 2 otherwise (no touch within the
        horizon, both touched in the same bar, or one of the last ``n`` bars,
        whose horizon is incomplete).
    """
    atr = compute_atr(df, atr_len=atr_len)
    close = df['close']
    # Series.bfill() replaces fillna(method='bfill'), which pandas deprecated in 2.1.
    up_mult = (k_up * atr / close).bfill().values
    dn_mult = (k_dn * atr / close).bfill().values
    price = close.values
    highs = df['high'].values
    lows = df['low'].values

    total = len(df)
    # Everything starts neutral; the loop never visits the last n bars, so they
    # keep this default without a separate tail assignment.
    y = np.full(total, 2, dtype=np.int8)
    for i in range(total - n):
        p0 = price[i]
        up = p0 * (1 + up_mult[i])
        dn = p0 * (1 - dn_mult[i])
        # Offsets (within the look-ahead window) of every bar touching a barrier.
        hit_up = np.flatnonzero(highs[i+1:i+n+1] >= up)
        hit_dn = np.flatnonzero(lows[i+1:i+n+1] <= dn)
        if hit_up.size and (not hit_dn.size or hit_up[0] < hit_dn[0]):
            y[i] = 1
        elif hit_dn.size and (not hit_up.size or hit_dn[0] < hit_up[0]):
            y[i] = 0
        # A same-bar touch of both barriers is ambiguous and stays neutral.
    return pd.Series(y, index=df.index, name='label')

# Label every bar: 16-bar horizon with symmetric barriers at 1.5x the 14-span ATR.
labels = triple_barrier_labels(kl_df, n=16, k_up=1.5, k_dn=1.5, atr_len=14)
# Class balance check (1 = up first, 0 = down first, 2 = neutral).
labels.value_counts()

## 5) Feature engineering (examples)

In [None]:
def make_features(df):
    """Build the baseline feature matrix from an OHLCV frame.

    Args:
        df: Frame with ``high``, ``low``, ``close`` and ``volume`` columns.

    Returns:
        DataFrame aligned to ``df.index`` with columns ``ret_1``, ``ret_2`` …
        ``ret_64`` (percentage returns), ``atr14p`` (ATR as a fraction of the
        close), ``rv_24`` (96-bar rolling std of log returns), ``ema_diff``
        (EMA8 minus EMA21), ``rsi14``, ``vol`` and ``vol_surge`` (volume over
        its 96-bar rolling median).  Gaps are back-filled and what remains is
        set to 0.
    """
    X = pd.DataFrame(index=df.index)
    X['ret_1'] = df['close'].pct_change()
    for lag in [2,4,8,16,32,64]:
        X[f'ret_{lag}'] = df['close'].pct_change(lag)
    atr14 = compute_atr(df, atr_len=14)
    # Series.bfill() replaces fillna(method='bfill'), which pandas deprecated in 2.1.
    X['atr14p'] = (atr14 / df['close']).bfill()
    X['rv_24'] = np.log(df['close']).diff().rolling(96).std()
    ema8 = df['close'].ewm(span=8).mean()
    ema21 = df['close'].ewm(span=21).mean()
    X['ema_diff'] = ema8 - ema21
    # RSI from EMA-smoothed gains and losses; the epsilon avoids division by zero.
    delta = df['close'].diff()
    up = delta.clip(lower=0); down = -delta.clip(upper=0)
    rs = up.ewm(span=14).mean() / (down.ewm(span=14).mean() + 1e-9)
    X['rsi14'] = 100 - 100/(1+rs)
    X['vol'] = df['volume']
    # A zero median is treated as missing so vol_surge does not explode.
    med24 = X['vol'].rolling(96).median().replace(0, np.nan)
    X['vol_surge'] = X['vol'] / (med24 + 1e-9)
    # NOTE(review): back-filling copies later values into the warm-up rows, so
    # those rows see the future; keep them out of training or drop them.
    return X.bfill().fillna(0)

# Feature matrix for every bar in kl_df; it shares kl_df's index.
X = make_features(kl_df)
X.head()

## 6) Train/test split & LightGBM training + calibration

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss, roc_auc_score
import lightgbm as lgb

# Chronological split: the test window starts after the training window ends.
train_start = pd.Timestamp('2025-02-06', tz='UTC')
train_end   = pd.Timestamp('2025-06-06', tz='UTC')
test_start  = pd.Timestamp('2025-06-07', tz='UTC')
test_end    = pd.Timestamp('2025-08-25', tz='UTC')

y = labels.loc[X.index]
mask_tr = (X.index >= train_start) & (X.index <= train_end)
mask_te = (X.index >= test_start) & (X.index <= test_end)
X_train, y_train = X.loc[mask_tr], y.loc[mask_tr]
X_test,  y_test  = X.loc[mask_te], y.loc[mask_te]
print('Train rows:', len(X_train), 'Test rows:', len(X_test))
# The date windows are hard-coded; fail with a clear message if the data misses them.
assert len(X_train) > 0 and len(X_test) > 0, 'Train/test window is empty. Adjust the split dates to your data range.'

# Map the raw labels to 0..k-1; le.classes_[j] is the raw label behind column j.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train.values)
y_test_enc  = le.transform(y_test.values)

params = dict(objective='multiclass', num_class=3, learning_rate=0.03, n_estimators=800,
              num_leaves=64, subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1)
clf = lgb.LGBMClassifier(**params)
# LightGBM 4.x no longer accepts `verbose` in fit(); a log_evaluation callback
# with period=0 keeps the per-iteration eval output silent instead.
clf.fit(
    X_train, y_train_enc,
    eval_set=[(X_test, y_test_enc)],
    eval_metric='multi_logloss',
    callbacks=[lgb.log_evaluation(period=0)],
)
# NOTE(review): with cv=3 the calibrator appears to refit clones of `clf` on
# non-chronological folds rather than reuse the fit above — confirm this is
# intended for time-series data.
cal = CalibratedClassifierCV(clf, method='isotonic', cv=3)
cal.fit(X_train, y_train_enc)

probs = cal.predict_proba(X_test)
# Column of the UP class (raw label 1) in the encoded/probability order.
up_idx = list(le.classes_).index(1) if 1 in le.classes_ else None
if up_idx is None:
    print('Test Brier (UP): n/a (no UP labels in the training window)')
else:
    # Compare against the *encoded* UP index so target and probability column
    # always refer to the same class, even when a class is missing.
    print('Test Brier (UP):', brier_score_loss((y_test_enc==up_idx).astype(int), probs[:, up_idx]))

## 7) Predict function & save artifacts

In [None]:
def predict_live(latest_row: pd.Series, cal_model, feature_columns, classes=None):
    """Return calibrated class probabilities for a single feature row.

    Args:
        latest_row: Series holding at least the entries named in
            ``feature_columns``.
        cal_model: Fitted model exposing ``predict_proba``.
        feature_columns: Feature names, in the order the model was trained on.
        classes: Raw labels (0 = down, 1 = up, 2 = neutral) in the order of the
            model's probability columns.  Defaults to the notebook-level
            ``le.classes_``; pass it explicitly when the label encoder is not
            in scope, e.g. after loading the pickled model elsewhere.

    Returns:
        Dict with ``P_down``, ``P_up`` and ``P_neutral``; a class the model
        never saw is reported as NaN.
    """
    if classes is None:
        classes = le.classes_
    classes = list(classes)
    cols = list(feature_columns)
    # A one-row DataFrame keeps the feature names, so a model fitted on a
    # DataFrame does not warn about (or misalign) unnamed columns.
    Xr = pd.DataFrame([latest_row[cols].values], columns=cols)
    proba = cal_model.predict_proba(Xr)[0]

    def _prob(label):
        # Look up the probability column of a raw label, NaN if absent.
        return float(proba[classes.index(label)]) if label in classes else float(np.nan)

    return {
        'P_down': _prob(0),
        'P_up': _prob(1),
        'P_neutral': _prob(2),
    }

# Smoke-test the predictor on the most recent row of the test window.
if len(X_test) > 0:
    res = predict_live(X_test.iloc[-1], cal, X.columns)
    print('Latest probs:', res)

from pathlib import Path
# Persist the calibrated model and the feature order to Drive for later reuse.
# NOTE(review): only these two artifacts are written; the label encoder `le`
# is not persisted.
ARTIFACT_DIR = Path('/content/drive/MyDrive/binance_models')
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)
with open(ARTIFACT_DIR/'calibrated_lgbm.pkl', 'wb') as f:
    pickle.dump(cal, f)
with open(ARTIFACT_DIR/'feature_columns.json', 'w') as f:
    f.write(json.dumps(list(X.columns)))
print('Saved to', ARTIFACT_DIR)