In [1]:
import pandas as pd
import numpy as np
import datetime as dt

from utils.load_data import get_files

# Load Data

In [2]:
# Run configuration: venue name and the date range, given as YYYYMMDD strings.
exchange = "binance"

start_date = "20230101"
end_date = "20241231"

# Parse both boundary strings with the same format into datetime objects.
start_dt, end_dt = (
    dt.datetime.strptime(day, "%Y%m%d") for day in (start_date, end_date)
)

In [3]:
# Fetch the three datasets with identical arguments except for the data kind.
# NOTE(review): get_files is project-local; the meaning of its positional
# arguments (range, kind, exchange, market, symbol) is presumed — confirm.
level1_data, book_data, trade_data = (
    get_files(start_dt, end_dt, data_kind, exchange, "futures", "BTCUSDT")
    for data_kind in ("level1", "book", "trade")
)

# Show the shape of each loaded frame, in the same order as above.
tuple(loaded.shape for loaded in (level1_data, book_data, trade_data))

((1052640, 28), (1052640, 63), (1052640, 23))

In [4]:
# Convert timestamps to datetime
level1_data['ts_end'] = pd.to_datetime(level1_data['ts_end'], unit='ms')
book_data[['ts_end', 'ts_book']] = book_data[['ts_end', 'ts_book']].apply(pd.to_datetime, unit='ms')
trade_data['ts_end'] = pd.to_datetime(trade_data['ts_end'], unit='ms')

# Set index to ts_end
level1_data.set_index('ts_end', inplace=True)
book_data.set_index('ts_end', inplace=True)
trade_data.set_index('ts_end', inplace=True)

# Features

#### Level1 Features

In [5]:
# Columns of level1_data used directly as features; .copy() gives an
# independent frame so later edits do not touch level1_data.
level1_features = level1_data.loc[
    :,
    [
        'tick_count', 'l3_updates',
        'ask_up_ret', 'ask_down_ret', 'bid_up_ret', 'bid_down_ret',
        'median_spread', 'mean_spread', 'min_spread', 'max_spread',
        'mean_bid_size', 'mean_ask_size', 'tick_volatility',
    ],
].copy()

In [None]:
def feature_log_ret(level1_data, taus):
    """Build a frame of lagged log returns of the ``close_mid`` column.

    For every ``tau`` in ``taus`` the result gets a column named
    ``log_ret_{tau}min`` holding ``log(close_mid / close_mid.shift(tau))``.
    The lag is applied with ``Series.shift`` and therefore counts rows; the
    ``min`` suffix assumes one row per minute (TODO confirm against the loader).

    Parameters
    ----------
    level1_data : pandas.DataFrame
        Frame providing a ``close_mid`` column.
    taus : iterable of int
        Row lags to compute, one output column per lag.

    Returns
    -------
    pandas.DataFrame
        New frame sharing ``level1_data``'s index. For a positive lag the
        first ``tau`` rows of its column are NaN because no earlier row exists.
    """
    out = pd.DataFrame(index=level1_data.index)

    for lag in taus:
        mid = level1_data['close_mid']
        # Ratio of the current value to the value `lag` rows earlier.
        ratio = mid.div(mid.shift(lag))
        out[f'log_ret_{lag}min'] = np.log(ratio)

    return out

In [11]:
def feature_abs_ret(level1_data, taus):
    """Absolute values of the lagged log returns from ``feature_log_ret``.

    Calls ``feature_log_ret(level1_data, taus)`` and, for each ``tau``, stores
    the absolute value of its ``log_ret_{tau}min`` column under
    ``abs_ret_{tau}min``.

    Parameters
    ----------
    level1_data : pandas.DataFrame
        Passed through to ``feature_log_ret``; its index is reused.
    taus : iterable of int
        Lags to compute. Iterated twice (inside ``feature_log_ret`` and
        here), so pass a re-iterable such as a list.

    Returns
    -------
    pandas.DataFrame
        New frame sharing ``level1_data``'s index, one column per lag.
    """
    signed = feature_log_ret(level1_data, taus)
    out = pd.DataFrame(index=level1_data.index)

    for lag in taus:
        source, target = f'log_ret_{lag}min', f'abs_ret_{lag}min'
        out[target] = signed[source].abs()

    return out

In [None]:
def feature_sq_ret(level1_data, taus):
    """Squares of the lagged log returns from ``feature_log_ret``.

    Calls ``feature_log_ret(level1_data, taus)`` and, for each ``tau``, stores
    the square of its ``log_ret_{tau}min`` column under ``sq_ret_{tau}min``.

    Parameters
    ----------
    level1_data : pandas.DataFrame
        Passed through to ``feature_log_ret``; its index is reused.
    taus : iterable of int
        Lags to compute. Iterated twice (inside ``feature_log_ret`` and
        here), so pass a re-iterable such as a list.

    Returns
    -------
    pandas.DataFrame
        New frame sharing ``level1_data``'s index, one column per lag.
    """
    signed = feature_log_ret(level1_data, taus)
    out = pd.DataFrame(index=level1_data.index)

    for lag in taus:
        source, target = f'log_ret_{lag}min', f'sq_ret_{lag}min'
        out[target] = signed[source].pow(2)

    return out

In [13]:
# Preview the log-return features for lags of 1 to 60 rows.
feature_log_ret(
    level1_data,
    taus=[1, 5, 10, 15, 30, 60],
)

Unnamed: 0_level_0,log_ret_1min,log_ret_5min,log_ret_10min,log_ret_15min,log_ret_30min,log_ret_60min
ts_end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-01 00:01:00,,,,,,
2023-01-01 00:02:00,-0.000181,,,,,
2023-01-01 00:03:00,-0.000218,,,,,
2023-01-01 00:04:00,-0.000266,,,,,
2023-01-01 00:05:00,0.000139,,,,,
...,...,...,...,...,...,...
2024-12-31 23:56:00,0.000270,0.000669,0.001726,0.002396,0.000500,0.001450
2024-12-31 23:57:00,-0.000425,-0.000425,0.001301,0.001477,-0.000932,0.001668
2024-12-31 23:58:00,-0.000265,-0.000940,0.001297,0.001383,-0.001241,0.001598
2024-12-31 23:59:00,-0.000155,-0.000770,0.000977,0.001436,-0.001345,0.001532


In [14]:
# Preview the absolute-return features for lags of 1 to 60 rows.
feature_abs_ret(
    level1_data,
    taus=[1, 5, 10, 15, 30, 60],
)

Unnamed: 0_level_0,abs_ret_1min,abs_ret_5min,abs_ret_10min,abs_ret_15min,abs_ret_30min,abs_ret_60min
ts_end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-01 00:01:00,,,,,,
2023-01-01 00:02:00,0.000181,,,,,
2023-01-01 00:03:00,0.000218,,,,,
2023-01-01 00:04:00,0.000266,,,,,
2023-01-01 00:05:00,0.000139,,,,,
...,...,...,...,...,...,...
2024-12-31 23:56:00,0.000270,0.000669,0.001726,0.002396,0.000500,0.001450
2024-12-31 23:57:00,0.000425,0.000425,0.001301,0.001477,0.000932,0.001668
2024-12-31 23:58:00,0.000265,0.000940,0.001297,0.001383,0.001241,0.001598
2024-12-31 23:59:00,0.000155,0.000770,0.000977,0.001436,0.001345,0.001532


In [15]:
# Preview the squared-return features for lags of 1 to 60 rows.
feature_sq_ret(
    level1_data,
    taus=[1, 5, 10, 15, 30, 60],
)

Unnamed: 0_level_0,sq_ret_1min,sq_ret_5min,sq_ret_10min,sq_ret_15min,sq_ret_30min,sq_ret_60min
ts_end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-01 00:01:00,,,,,,
2023-01-01 00:02:00,3.291228e-08,,,,,
2023-01-01 00:03:00,4.741260e-08,,,,,
2023-01-01 00:04:00,7.086052e-08,,,,,
2023-01-01 00:05:00,1.936466e-08,,,,,
...,...,...,...,...,...,...
2024-12-31 23:56:00,7.300359e-08,4.471208e-07,2.979068e-06,5.739973e-06,2.498588e-07,2.102690e-06
2024-12-31 23:57:00,1.806910e-07,1.806910e-07,1.692393e-06,2.182794e-06,8.689178e-07,2.781817e-06
2024-12-31 23:58:00,7.020588e-08,8.833693e-07,1.682170e-06,1.911537e-06,1.539779e-06,2.552585e-06
2024-12-31 23:59:00,2.400980e-08,5.932746e-07,9.550730e-07,2.062982e-06,1.807953e-06,2.345691e-06
