In [25]:
import ccxt
import pandas as pd
import numpy as np
import os
import zipfile
import time
import logging
import requests
import io
from datetime import date, datetime, timezone, timedelta
from tqdm import tqdm
from config import DATA_LOCATION, PROXIES,START_DATE,END_DATE
from utils import format_symbol, get_ms_from_midnight

# Module-level logger used by the fetch helpers in this notebook.
logger = logging.getLogger(__name__)

# Pin the process time zone to UTC so that naive datetimes converted with
# .timestamp() are interpreted as UTC rather than the machine's local zone.
os.environ['TZ'] = 'UTC'
# time.tzset() is only available on Unix; skip it elsewhere instead of
# failing with AttributeError when this cell runs.
if hasattr(time, 'tzset'):
    time.tzset()

In [36]:

def _download_depth_frame(url):
    """Download one daily bookDepth archive and parse its first member as CSV.

    :param url: (str) Full URL of the daily ``.zip`` file.
    :return: (tuple) ``(status_code, DataFrame or None)``; the frame is only
        present when the HTTP status is 200.
    """
    response = requests.get(url, proxies=PROXIES, timeout=30)
    if response.status_code != 200:
        return response.status_code, None
    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        csv_name = zf.namelist()[0]
        with zf.open(csv_name) as f:
            # low_memory=False avoids DtypeWarning on mixed-type columns.
            return response.status_code, pd.read_csv(f, low_memory=False)


def _align_depth_snapshots(df, since_ms, until_ms, step_ms):
    """Align one day's raw snapshots to ``step_ms`` buckets and keep the latest per bucket.

    Adds the columns ``dt`` (tz-aware timestamp), ``ms_original`` (epoch
    milliseconds of the raw timestamp) and ``ms`` (bucket-aligned epoch
    milliseconds), then keeps only rows whose ``ms`` lies in
    ``[since_ms, until_ms)``.

    :return: (DataFrame) Possibly empty frame with one snapshot per ``ms`` bucket.
    """
    stamps = pd.to_datetime(df['timestamp'])
    if stamps.dt.tz is None:
        stamps = stamps.dt.tz_localize('UTC')

    epoch = pd.Timestamp("1970-01-01", tz='UTC')
    df = df.assign(dt=stamps, ms_original=(stamps - epoch) // pd.Timedelta(milliseconds=1))

    # For minute data, drop the snapshots that fall in the second half of each
    # minute so only the one nearest the minute start remains.
    if step_ms == 60000:
        df = df[((df['ms_original'] // 1000) % 60) < 30].copy()

    # Round to the nearest bucket boundary. int64 is requested explicitly
    # because plain `int` can be 32-bit on some platforms, which would
    # overflow for epoch milliseconds.
    if step_ms > 0:
        df['ms'] = (np.round(df['ms_original'] / step_ms) * step_ms).astype('int64')
    else:
        df['ms'] = df['ms_original']

    # Filter on the ALIGNED timestamp so the result is valid for the requested range.
    in_range = df.loc[(df['ms'] >= since_ms) & (df['ms'] < until_ms)].copy()
    if in_range.empty:
        return in_range

    # If several snapshots round to the same bucket, keep only the latest one
    # (all of its percentage rows).
    in_range = in_range.sort_values(['ms', 'ms_original'])
    latest_original = in_range.groupby('ms')['ms_original'].max()
    in_range = in_range[in_range['ms_original'] == in_range['ms'].map(latest_original)]

    # Final safety net in case ms_original itself is repeated.
    return in_range.drop_duplicates(['ms', 'percentage'], keep='last')


def fetch_depth_range_cryptofuture(symbol, since_ms, until_ms, step_ms=60000, margin_type="um", data_source="binancevision"):
    """Fetch order book depth snapshots for crypto futures from
    [BinanceVision](https://data.binance.vision/?prefix=data/futures/margin_type/daily/bookDepth/).

    One daily archive is downloaded per UTC day touched by ``[since_ms, until_ms)``.
    Days that return 404, another non-200 status, or raise while downloading or
    parsing are logged and skipped (best effort).

    :note: the source files are described as ~30s interval snapshots.
    :param symbol: Symbol passed to ``format_symbol`` to build the file name.
    :param since_ms: (int | float) Range start, epoch milliseconds (inclusive); truncated to int.
    :param until_ms: (int | float) Range end, epoch milliseconds (exclusive); truncated to int.
    :param step_ms: (int) Interval to align snapshots to. Default 60000 (1 min);
        with exactly 60000, snapshots in the second half of each minute are
        dropped before alignment. Values <= 0 disable alignment.
    :param margin_type: (str) Path segment of the futures market, e.g. "um".
    :param data_source: (str) Only "binancevision" is supported.

    :return: (list) Record dicts with the CSV columns plus ``dt``,
        ``ms_original`` and ``ms``; at most one record per ``(ms, percentage)``.
    :raises NotImplementedError: if ``data_source`` is not "binancevision".
    """
    if data_source != "binancevision":
        raise NotImplementedError("Only binancevision data source is supported for depth data.")

    # Callers often pass floats (datetime.timestamp() * 1000); normalise so the
    # progress bar and the range arithmetic work on integers.
    since_ms = int(since_ms)
    until_ms = int(until_ms)
    if until_ms <= since_ms:
        return []

    formatted_symbol = format_symbol(symbol).upper()
    # URL pattern includes a per-symbol subdirectory.
    base_url = f"https://data.binance.vision/data/futures/{margin_type}/daily/bookDepth/{formatted_symbol}"

    all_data = []
    last_snapshots = []  # records of the most recent bucket, used for midnight gap filling
    current_since = since_ms

    with tqdm(total=until_ms - since_ms, desc=f"Fetching Depth {symbol}") as pbar:
        while current_since < until_ms:
            dt = datetime.fromtimestamp(current_since / 1000, tz=timezone.utc)
            day_start = dt.replace(hour=0, minute=0, second=0, microsecond=0)
            day_start_ms = int(day_start.timestamp() * 1000)
            date_str = dt.strftime("%Y-%m-%d")
            url = f"{base_url}/{formatted_symbol}-bookDepth-{date_str}.zip"

            try:
                status, raw = _download_depth_frame(url)
                if status == 200:
                    # Sample data columns: timestamp,percentage,depth,notional
                    if 'timestamp' not in raw.columns:
                        logger.warning(f"Depth file for {symbol} on {date_str} has no 'timestamp' column; skipped")
                    else:
                        day_df = _align_depth_snapshots(raw, since_ms, until_ms, step_ms)
                        if not day_df.empty:
                            records = day_df.to_dict('records')

                            # If the day does not start with a midnight bucket, carry the
                            # previous day's last snapshot forward to keep continuity.
                            if last_snapshots and day_df['ms'].min() > day_start_ms:
                                all_data.extend({**snap, 'ms': day_start_ms} for snap in last_snapshots)

                            all_data.extend(records)

                            # Remember the latest bucket for the next day's gap filling.
                            max_ms = day_df['ms'].max()
                            last_snapshots = [r for r in records if r['ms'] == max_ms]
                elif status == 404:
                    logger.debug(f"No depth data for {symbol} on {date_str} (404)")
                else:
                    logger.warning(f"Failed to download depth data for {symbol} on {date_str}: {status}")
            except Exception as e:
                # Best effort: a broken day must not abort the whole range.
                logger.error(f"Error fetching depth data for {symbol} on {date_str}: {e}")

            # Jump to the start of the next UTC day; each daily file is fetched once.
            next_day_ms = int((day_start + timedelta(days=1)).timestamp() * 1000)
            pbar.update(min(next_day_ms, until_ms) - current_since)
            current_since = next_day_ms

    # Rounding can push a late snapshot of day N into the midnight bucket of
    # day N+1, where it may collide with that day's own first bucket or with a
    # gap-fill copy. Keep one record per (ms, percentage): the position of the
    # first occurrence, the content of the latest.
    unique = {}
    for rec in all_data:
        unique[(rec['ms'], rec.get('percentage'))] = rec
    return list(unique.values())


In [None]:
# Fetch two days of 30-second-aligned depth snapshots starting at START_DATE.
df = fetch_depth_range_cryptofuture(
    "BTCUSDT",
    since_ms=datetime.fromisoformat(START_DATE).timestamp() * 1000,
    until_ms=(datetime.fromisoformat(START_DATE) + timedelta(days=2)).timestamp() * 1000,
    step_ms=30000,
    margin_type="um",
    data_source="binancevision",
)

Fetching Depth BTCUSDT: 100%|██████████| 172800000.0/172800000.0 [00:06<00:00, 25593077.32it/s]


In [38]:
# Show the fetched snapshot records as a table (one row per record).
pd.DataFrame.from_records(df)

Unnamed: 0,timestamp,percentage,depth,notional,dt,ms_original,ms
0,2026-01-01 00:00:09,-5,6419.548,5.524643e+08,2026-01-01 00:00:09+00:00,1767225609000,1767225600000
1,2026-01-01 00:00:09,-4,5971.786,5.149844e+08,2026-01-01 00:00:09+00:00,1767225609000,1767225600000
2,2026-01-01 00:00:09,-3,5193.131,4.491309e+08,2026-01-01 00:00:09+00:00,1767225609000,1767225600000
3,2026-01-01 00:00:09,-2,4120.377,3.576510e+08,2026-01-01 00:00:09+00:00,1767225609000,1767225600000
4,2026-01-01 00:00:09,-1,2320.040,2.022281e+08,2026-01-01 00:00:09+00:00,1767225609000,1767225600000
...,...,...,...,...,...,...,...
52935,2026-01-02 23:59:31,1,1634.510,1.476713e+08,2026-01-02 23:59:31+00:00,1767398371000,1767398370000
52936,2026-01-02 23:59:31,2,2996.034,2.719585e+08,2026-01-02 23:59:31+00:00,1767398371000,1767398370000
52937,2026-01-02 23:59:31,3,3902.723,3.554995e+08,2026-01-02 23:59:31+00:00,1767398371000,1767398370000
52938,2026-01-02 23:59:31,4,4315.175,3.938851e+08,2026-01-02 23:59:31+00:00,1767398371000,1767398370000


In [40]:
# Same two-day window, aligned to one-minute buckets, then shown as a table.
df = fetch_depth_range_cryptofuture(
    "BTCUSDT",
    since_ms=datetime.fromisoformat(START_DATE).timestamp() * 1000,
    until_ms=(datetime.fromisoformat(START_DATE) + timedelta(days=2)).timestamp() * 1000,
    step_ms=60000,
    margin_type="um",
    data_source="binancevision",
)
pd.DataFrame.from_records(df)

Fetching Depth BTCUSDT: 100%|██████████| 172800000.0/172800000.0 [00:04<00:00, 37463155.54it/s]


Unnamed: 0,timestamp,percentage,depth,notional,dt,ms_original,ms
0,2026-01-01 00:00:09,-5,6419.548,5.524643e+08,2026-01-01 00:00:09+00:00,1767225609000,1767225600000
1,2026-01-01 00:00:09,-4,5971.786,5.149844e+08,2026-01-01 00:00:09+00:00,1767225609000,1767225600000
2,2026-01-01 00:00:09,-3,5193.131,4.491309e+08,2026-01-01 00:00:09+00:00,1767225609000,1767225600000
3,2026-01-01 00:00:09,-2,4120.377,3.576510e+08,2026-01-01 00:00:09+00:00,1767225609000,1767225600000
4,2026-01-01 00:00:09,-1,2320.040,2.022281e+08,2026-01-01 00:00:09+00:00,1767225609000,1767225600000
...,...,...,...,...,...,...,...
26865,2026-01-02 23:59:01,1,1710.788,1.545749e+08,2026-01-02 23:59:01+00:00,1767398341000,1767398340000
26866,2026-01-02 23:59:01,2,3081.166,2.796609e+08,2026-01-02 23:59:01+00:00,1767398341000,1767398340000
26867,2026-01-02 23:59:01,3,3991.695,3.635519e+08,2026-01-02 23:59:01+00:00,1767398341000,1767398340000
26868,2026-01-02 23:59:01,4,4400.348,4.015800e+08,2026-01-02 23:59:01+00:00,1767398341000,1767398340000
