# Binance BTC Futures Forecasting

This notebook demonstrates how to load Binance BTC futures data from Google Drive, create features, and train a forecasting model using a PatchTST-style transformer with quantile regression. The notebook is designed to be run in Google Colab. Update the `data_dir` path to point to your dataset folder in Google Drive; the loader expects one subfolder per dataset (for example `klines/` or `trades/`), each containing Parquet files.


In [None]:
# Mount Google Drive (Colab)
# Mounts Drive at /content/drive so that `data_dir` (set in a later cell) can
# point into it. The `google.colab` package is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Standard libraries
import os
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime


In [None]:
# Root folder on Google Drive that holds one sub-folder per dataset
# (the sub-folder names are listed in `dataset_folders`).
# Update this path to where you've stored the Binance data files,
# for example: '/content/drive/MyDrive/binance_data'
data_dir = Path('/content/drive/MyDrive/binance_data')


In [None]:
# Sub-folder (relative to `data_dir`) that holds the files of each dataset
dataset_folders = {
    key: Path(folder_name)
    for key, folder_name in (
        ('aggtrades', 'aggTrades'),
        ('bookdepth', 'bookDepth'),
        ('indexpriceklines', 'indexPriceKlines'),
        ('klines', 'klines'),
        ('markpriceklines', 'markPriceKlines'),
        ('metrics', 'metrics'),
        ('premiumindexklines', 'premiumIndexKlines'),
        ('trades', 'trades'),
    )
}

# All four kline-style datasets share this 12-column layout
_kline_columns = (
    'open_time', 'open', 'high', 'low', 'close', 'volume',
    'close_time', 'quote_volume', 'count',
    'taker_buy_volume', 'taker_buy_quote_volume', 'ignore',
)

# Column names for each dataset (every entry is its own list object)
cols = {
    'aggtrades': [
        'agg_trade_id', 'price', 'quantity', 'first_trade_id',
        'last_trade_id', 'transact_time', 'is_buyer_maker',
    ],
    'bookdepth': ['timestamp', 'percentage', 'depth', 'notional'],
    'indexpriceklines': list(_kline_columns),
    'klines': list(_kline_columns),
    'markpriceklines': list(_kline_columns),
    'metrics': [
        'create_time', 'symbol', 'sum_open_interest',
        'sum_open_interest_value', 'count_toptrader_long_short_ratio',
        'sum_toptrader_long_short_ratio', 'count_long_short_ratio',
        'sum_taker_long_short_vol_ratio',
    ],
    'premiumindexklines': list(_kline_columns),
    'trades': ['id', 'price', 'qty', 'quote_qty', 'time', 'is_buyer_maker'],
}


In [None]:
def load_dataset(key):
    """Load and concatenate all Parquet files for the given dataset key.

    Args:
        key: Dataset name; must be a key of both ``dataset_folders`` and
            ``cols``.

    Returns:
        One DataFrame with the rows of every ``*.parquet`` file found in
        ``data_dir / dataset_folders[key]`` (files are read in sorted
        filename order), with columns named and ordered as in ``cols[key]``.

    Raises:
        KeyError: If ``key`` is not a known dataset.
        FileNotFoundError: If the dataset folder contains no Parquet files.
        ValueError: If the files do not have as many columns as ``cols[key]``.
    """
    expected = cols[key]
    folder_path = data_dir / dataset_folders[key]
    parquet_files = sorted(folder_path.glob('*.parquet'))
    if not parquet_files:
        raise FileNotFoundError(f'No Parquet files found in {folder_path}')
    df = pd.concat((pd.read_parquet(p) for p in parquet_files), ignore_index=True)

    # Enforce correct column names
    current = list(df.columns)
    if current == expected:
        return df
    if len(current) != len(expected):
        raise ValueError(
            f'Dataset {key!r}: expected {len(expected)} columns {expected}, '
            f'but the files in {folder_path} have {len(current)}: {current}'
        )
    if set(current) == set(expected):
        # Same names in a different order: select by name. Renaming by
        # position here would attach the wrong label to each column.
        return df[expected]
    # Names differ (e.g. header-less or differently-labelled exports) but the
    # width matches: assume the documented column order and rename by position.
    df.columns = expected
    return df


In [None]:
# Read every dataset into `datasets` and preview its first rows as it arrives
datasets = {}
for key in dataset_folders:
    print(f'Loading {key}...')
    frame = load_dataset(key)
    datasets[key] = frame
    display(frame.head())


In [None]:
# Convert relevant timestamp columns to datetime
def _to_datetime_ms(series):
    """Return *series* as datetimes, reading numeric values as epoch milliseconds.

    Columns that already hold datetimes are returned unchanged. Text columns
    are first tried as numbers (epoch milliseconds stored as strings); if that
    fails they are parsed as date strings, because ``unit='ms'`` would raise
    on non-numeric text.
    """
    if pd.api.types.is_datetime64_any_dtype(series):
        return series
    if not pd.api.types.is_numeric_dtype(series):
        try:
            series = pd.to_numeric(series)
        except (ValueError, TypeError):
            return pd.to_datetime(series)
    return pd.to_datetime(series, unit='ms')


# Timestamp columns of each dataset
timestamp_columns = {
    # Kline-like data: open and close time of every bar
    'indexpriceklines': ('open_time', 'close_time'),
    'klines': ('open_time', 'close_time'),
    'markpriceklines': ('open_time', 'close_time'),
    'premiumindexklines': ('open_time', 'close_time'),
    'metrics': ('create_time',),
    'bookdepth': ('timestamp',),
    'aggtrades': ('transact_time',),
    'trades': ('time',),
}

for key, columns in timestamp_columns.items():
    for column in columns:
        datasets[key][column] = _to_datetime_ms(datasets[key][column])

print('Timestamps converted.')


In [None]:
# Example: Resample klines to 15-minute bars
# If your klines are already 1-minute bars, this aggregates them to 15-minute intervals
price_df = datasets['klines'][['open_time','open','high','low','close','volume','quote_volume']].copy()
price_df = price_df.set_index('open_time').sort_index()

# '15min' is the supported frequency alias; the older '15T' spelling is
# deprecated in recent pandas releases and emits a FutureWarning.
# The final dropna() removes any bar that ends up with a missing value,
# e.g. an interval that contained no source rows.
price_15m = price_df.resample('15min').agg({
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume': 'sum',
    'quote_volume': 'sum'
}).dropna()

price_15m.head()


In [None]:
# Feature engineering: per-bar log return and its rolling volatility
close_px = price_15m['close']
price_15m['return'] = np.log(close_px / close_px.shift(1))

# 12 bars of 15 minutes = 3 hours; sqrt(12) scales the per-bar standard
# deviation up to the length of the window
return_std_12 = price_15m['return'].rolling(window=12).std()
price_15m['volatility'] = return_std_12 * np.sqrt(12)

# The first rows have no previous close / incomplete window, so they are NaN
price_15m = price_15m.dropna()

price_15m.head()


In [None]:
# Prepare data for model training
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

CONTEXT_LENGTH = 100  # model input: the previous 100 bars (100 x 15 min = 25 hours)
HORIZON = 100         # model output: the following 100 bars

# Model inputs: log return (column 0) and rolling volatility (column 1)
features = price_15m[['return', 'volatility']].to_numpy()

# Scale each column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the whole series, so its statistics
# also cover any rows later held out for validation or calibration.
scaler = StandardScaler()
scaler.fit(features)
features_norm = scaler.transform(features)

# float32 tensor with one row per bar and one column per feature
features_tensor = torch.tensor(features_norm, dtype=torch.float32)

class TimeSeriesDataset(Dataset):
    """Sliding-window samples over a 2-D feature array for sequence forecasting.

    Sample ``i`` pairs the ``context_length`` rows starting at row ``i`` with
    column 0 of the ``horizon`` rows that immediately follow them.

    Args:
        data: Array indexed as ``data[rows, columns]``, one row per time step.
        context_length: Number of past steps in each input window.
        horizon: Number of future steps in each target.
    """

    def __init__(self, data, context_length, horizon):
        self.data = data
        self.context_length = context_length
        self.horizon = horizon

    def __len__(self):
        # Clamp at zero: a series shorter than one full window has no samples,
        # and len() must never be negative.
        return max(0, len(self.data) - (self.context_length + self.horizon) + 1)

    def __getitem__(self, idx):
        """Return ``(x, y)`` for window ``idx``; negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``. This
                is also what lets ``for x, y in dataset`` terminate.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f'sample index out of range for {n_samples} samples')
        target_start = idx + self.context_length
        x = self.data[idx:target_start]
        y = self.data[target_start:target_start + self.horizon, 0]  # use return as target
        return x, y

# Wrap the normalised features in sliding windows and batch them in random order
dataset = TimeSeriesDataset(
    data=features_tensor.numpy(),
    context_length=CONTEXT_LENGTH,
    horizon=HORIZON,
)
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

print(f'Number of training samples: {len(dataset)}')
# Look at the first window to confirm the context / target shapes
x0, y0 = dataset[0]
print('Context shape:', x0.shape)
print('Target shape:', y0.shape)


In [None]:
# Define a PatchTST-like transformer for multi-horizon quantile forecasting
import torch.nn as nn

class PatchTST(nn.Module):
    """PatchTST-style Transformer encoder for multi-horizon quantile forecasts.

    The input window is cut into overlapping patches of ``patch_len`` steps
    taken every ``stride`` steps. Each patch (all features flattened) is
    linearly embedded, given a learned positional embedding, passed through a
    Transformer encoder, mean-pooled over patches and projected to one value
    per (horizon step, quantile).

    Args:
        input_dim: Number of features per time step.
        context_length: Number of time steps in the input window.
        horizon: Number of future steps to forecast.
        patch_len: Time steps per patch.
        stride: Step between the starts of consecutive patches.
        d_model: Embedding width of the Transformer.
        n_heads: Attention heads per encoder layer.
        n_layers: Number of encoder layers.
        quantiles: Quantile levels to predict; defaults to
            ``[0.05, 0.1, 0.5, 0.9, 0.95]`` when omitted or empty.

    Raises:
        ValueError: If ``patch_len`` or ``stride`` is not positive, or if
            ``context_length`` is shorter than ``patch_len``.
    """

    def __init__(self, input_dim, context_length, horizon, patch_len=8, stride=4, d_model=256, n_heads=8, n_layers=4, quantiles=None):
        super().__init__()
        if patch_len <= 0 or stride <= 0:
            raise ValueError('patch_len and stride must be positive')
        if context_length < patch_len:
            raise ValueError(
                f'context_length ({context_length}) must be at least patch_len ({patch_len})'
            )
        self.context_length = context_length
        self.horizon = horizon
        self.patch_len = patch_len
        self.stride = stride
        self.input_dim = input_dim
        self.n_patches = (context_length - patch_len) // stride + 1
        self.quantiles = quantiles or [0.05, 0.1, 0.5, 0.9, 0.95]

        # Patch embedding and transformer encoder
        self.patch_embed = nn.Linear(patch_len * input_dim, d_model)
        # Learned position of each patch. Self-attention followed by mean
        # pooling is otherwise blind to the order of the patches, so without
        # this the forecast could not depend on when something happened.
        self.pos_embed = nn.Parameter(torch.randn(1, self.n_patches, d_model) * 0.02)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        # Output head to produce (horizon * num_quantiles) outputs
        self.head = nn.Linear(d_model, horizon * len(self.quantiles))

    def forward(self, x):
        """Forecast quantiles for a batch of input windows.

        Args:
            x: Tensor of shape ``(batch, time, input_dim)`` with at least
                ``context_length`` time steps; only the first
                ``context_length`` steps are used.

        Returns:
            Tensor of shape ``(batch, horizon, num_quantiles)``.

        Raises:
            ValueError: If ``x`` is not 3-D or has too few time steps.
        """
        if x.dim() != 3 or x.size(1) < self.context_length:
            raise ValueError(
                f'expected input of shape (batch, >={self.context_length}, {self.input_dim}), '
                f'got {tuple(x.shape)}'
            )
        batch_size = x.size(0)
        window = x[:, :self.context_length, :]
        # unfold -> (batch, n_patches, input_dim, patch_len); swap the last two
        # axes so each patch is flattened time-major, one time step after another.
        patches = window.unfold(1, self.patch_len, self.stride).permute(0, 1, 3, 2)
        patches = patches.reshape(batch_size, self.n_patches, -1)
        z = self.patch_embed(patches) + self.pos_embed
        z = self.transformer(z)
        z = z.mean(dim=1)  # global average pooling over patches
        out = self.head(z)
        return out.view(batch_size, self.horizon, len(self.quantiles))



In [None]:
# Training loop skeleton
import torch.optim as optim

# One input channel per feature column; window sizes come from the data-prep cell
model = PatchTST(
    input_dim=features_tensor.shape[1],
    context_length=CONTEXT_LENGTH,
    horizon=HORIZON,
)
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)
# Quantile levels as a tensor so they broadcast against the model output
quantiles = torch.tensor(model.quantiles)

# Quantile loss function
def quantile_loss(pred, target, quantiles):
    """Return the mean pinball (quantile) loss.

    Args:
        pred: Predicted quantiles, shape ``(batch, horizon, num_quantiles)``.
        target: Observed values, shape ``(batch, horizon)``.
        quantiles: Quantile levels, one per entry of the last axis of
            ``pred``; a tensor or any sequence of floats.

    Returns:
        Scalar tensor: the loss averaged over batch, horizon and quantiles.
    """
    # Bring the levels to pred's dtype and device so the loss also works when
    # the model runs on a GPU or the levels arrive as a plain list.
    quantiles = torch.as_tensor(quantiles, dtype=pred.dtype, device=pred.device)
    diff = target.unsqueeze(-1) - pred
    # Under-prediction (diff > 0) costs q * diff and over-prediction costs
    # (1 - q) * |diff|; the max picks whichever branch applies.
    loss = torch.max(quantiles * diff, (quantiles - 1) * diff)
    return loss.mean()

# Train for a few epochs (adjust as needed)
epochs = 2
for epoch in range(epochs):
    # Set training mode explicitly: in a notebook an earlier cell may have
    # left the model in eval(), which would disable dropout in the encoder.
    model.train()
    total_loss = 0.0
    samples_seen = 0
    for x_batch, y_batch in dataloader:
        x_batch = x_batch.float()
        y_batch = y_batch.float()
        optimizer.zero_grad()
        preds = model(x_batch)
        loss = quantile_loss(preds, y_batch, quantiles)
        loss.backward()
        optimizer.step()
        # Weight each batch by its size so that a short final batch does not
        # skew the epoch average.
        n_in_batch = x_batch.size(0)
        total_loss += loss.item() * n_in_batch
        samples_seen += n_in_batch
    # max(..., 1) keeps the division safe if the dataloader yielded nothing
    avg_loss = total_loss / max(samples_seen, 1)
    print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')



## Conformal Calibration

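After training your quantile forecasting model, set aside a recent portion of the data, not used for training, as a calibration set. Use the model's predictions on the calibration set to adjust your prediction intervals via conformal quantile regression (CQR) or another conformal method, so that the empirical coverage matches your desired confidence level (e.g., 80% for the 0.1–0.9 quantile pair or 90% for the 0.05–0.95 pair). For CQR, compute the conformity score $E_i = \max\{\hat{q}_{\mathrm{lo}}(x_i) - y_i,\ y_i - \hat{q}_{\mathrm{hi}}(x_i)\}$ for each of the $n$ calibration samples, take $Q$ as the $\lceil (n+1)(1-\alpha) \rceil / n$ empirical quantile of these scores, and report the interval $[\hat{q}_{\mathrm{lo}}(x) - Q,\ \hat{q}_{\mathrm{hi}}(x) + Q]$. This step converts the raw quantile outputs into well-calibrated prediction intervals.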
