### 1. Setup

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

from utils import data_utils as du, feature_utils as fu, plot_utils as pu

### 2. Load & Clean Data

In [2]:
# Venue whose data is loaded by the cells below.
exchange = "binance"

# Study window bounds, written as compact YYYYMMDD strings.
start_date, end_date = "20230101", "20241231"

# Parse both bounds with the same format; strptime yields naive datetimes at
# midnight of the given day.
start_dt, end_dt = (
    dt.datetime.strptime(day, "%Y%m%d") for day in (start_date, end_date)
)

In [3]:
# Fetch the three BTCUSDT futures datasets for the configured window. The calls
# differ only in the dataset name, so they are issued from one generator in the
# same order as before: level1, book, trade.
level1_data, book_data, trade_data = (
    du.get_files(start_dt, end_dt, dataset, exchange, "futures", "BTCUSDT")
    for dataset in ("level1", "book", "trade")
)

# Display the (rows, columns) shape of each loaded frame as a sanity check.
tuple(frame.shape for frame in (level1_data, book_data, trade_data))

((1052640, 28), (1052640, 63), (1052640, 23))

In [4]:
# The book frame carries a second timestamp column; parse it as epoch
# milliseconds like ts_end below.
book_data['ts_book'] = pd.to_datetime(book_data['ts_book'], unit='ms')

# For every frame: parse ts_end (epoch milliseconds) and promote it to the index.
# NOTE: set_index(inplace=True) consumes the ts_end column, so this cell cannot
# be re-run without reloading the raw frames first.
for _frame in (level1_data, book_data, trade_data):
    _frame['ts_end'] = pd.to_datetime(_frame['ts_end'], unit='ms')
    _frame.set_index('ts_end', inplace=True)

# du.align_ts hands back the three frames together with a start and end value,
# which are used as the bounds of the shared index below.
(level1_data, book_data, trade_data), start, end = du.align_ts(level1_data, book_data, trade_data)

# Regular one-minute grid spanning the aligned range.
time_idx = pd.date_range(start, end, freq='1min')

# Show the first and last grid points.
time_idx[[0, -1]]

DatetimeIndex(['2023-01-01 00:01:00', '2024-12-31 23:59:00'], dtype='datetime64[ns]', freq=None)

In [5]:
# Row-over-row log return of the `close_mid` column: ln(mid_t / mid_{t-1}).
# The shift is positional (previous row), so the first row is NaN.
level1_data['log_return'] = level1_data['close_mid'].pipe(
    lambda mid: np.log(mid / mid.shift())
)

### 3. Build Features

In [8]:
# Window lengths handed to the feature builder; presumably minutes, since the
# resulting columns are suffixed 1m ... 120m — TODO confirm in fu.build_features.
taus = [1, 5, 15, 30, 60, 120]

# Build the feature matrix on the shared time grid from all three data sources.
features_df = fu.build_features(
    time_idx,
    level1_data,
    book_data,
    trade_data,
    taus,
)

# Preview the first five rows.
features_df.head(5)

Unnamed: 0,log_ret_1m,log_ret_5m,log_ret_15m,log_ret_30m,log_ret_60m,log_ret_120m,abs_ret_1m,abs_ret_5m,abs_ret_15m,abs_ret_30m,...,minute,sin_minute,cos_minute,day_of_week,sin_dow,cos_dow,is_weekend,is_dst,is_hour_start,is_hour_end
2023-01-01 00:01:00,,,,,,,,,,,...,1,0.104528,0.994522,6,-0.781831,0.62349,1,0,1,0
2023-01-01 00:02:00,-0.000181,,,,,,0.000181,,,,...,2,0.207912,0.978148,6,-0.781831,0.62349,1,0,1,0
2023-01-01 00:03:00,-0.000218,,,,,,0.000218,,,,...,3,0.309017,0.951057,6,-0.781831,0.62349,1,0,1,0
2023-01-01 00:04:00,-0.000266,,,,,,0.000266,,,,...,4,0.406737,0.913545,6,-0.781831,0.62349,1,0,1,0
2023-01-01 00:05:00,0.000139,,,,,,0.000139,,,,...,5,0.5,0.866025,6,-0.781831,0.62349,1,0,0,0


### 4. Build Target

In [9]:
# Build the target on the shared time grid for a single horizon of 60; the
# result has one column, `target_rv_fwd60m`.
target_df = fu.target_rv(
    time_idx,
    level1_data,
    horizons=[60],
)

# Preview the first five rows.
target_df.head(5)

Unnamed: 0,target_rv_fwd60m
2023-01-01 00:01:00,0.158508
2023-01-01 00:02:00,0.158838
2023-01-01 00:03:00,0.159852
2023-01-01 00:04:00,0.157915
2023-01-01 00:05:00,0.158012


### 5. Combine Features & Target

In [None]:
# Place features and target side by side (aligned on their index), then keep
# only the rows in which every column is populated.
df = pd.concat([features_df, target_df], axis="columns").dropna(axis="index")

# Preview the first five complete rows.
df.head(5)

Unnamed: 0,log_ret_1m,log_ret_5m,log_ret_15m,log_ret_30m,log_ret_60m,log_ret_120m,abs_ret_1m,abs_ret_5m,abs_ret_15m,abs_ret_30m,...,sin_minute,cos_minute,day_of_week,sin_dow,cos_dow,is_weekend,is_dst,is_hour_start,is_hour_end,target_rv_fwd60m
2023-01-01 02:01:00,-0.000502,-0.000212,0.000333,0.000738,0.001089,0.000254,0.000502,0.000212,0.000333,0.000738,...,0.104528,0.994522,6,-0.781831,0.62349,1,0,1,0,0.112397
2023-01-01 02:02:00,0.000169,-0.000296,0.000363,0.000968,0.001046,0.000605,0.000169,0.000296,0.000363,0.000968,...,0.207912,0.978148,6,-0.781831,0.62349,1,0,1,0,0.111795
2023-01-01 02:03:00,-5.4e-05,-0.000514,0.000248,0.000913,0.000701,0.000768,5.4e-05,0.000514,0.000248,0.000913,...,0.309017,0.951057,6,-0.781831,0.62349,1,0,1,0,0.114303
2023-01-01 02:04:00,-0.000175,-0.000622,9.1e-05,0.000738,0.00055,0.000859,0.000175,0.000622,9.1e-05,0.000738,...,0.406737,0.913545,6,-0.781831,0.62349,1,0,1,0,0.113171
2023-01-01 02:05:00,0.0,-0.000562,-0.000109,0.000774,0.000702,0.00072,0.0,0.000562,0.000109,0.000774,...,0.5,0.866025,6,-0.781831,0.62349,1,0,0,0,0.113172


In [14]:
# Preview the last five complete rows to see where the usable sample ends.
df.tail(5)

Unnamed: 0,log_ret_1m,log_ret_5m,log_ret_15m,log_ret_30m,log_ret_60m,log_ret_120m,abs_ret_1m,abs_ret_5m,abs_ret_15m,abs_ret_30m,...,sin_minute,cos_minute,day_of_week,sin_dow,cos_dow,is_weekend,is_dst,is_hour_start,is_hour_end,target_rv_fwd60m
2024-12-31 22:55:00,-6e-05,-0.002757,-0.002534,-0.000625,-0.003698,-0.001337,6e-05,0.002757,0.002534,0.000625,...,-0.5,0.866025,1,0.781831,0.62349,0,0,0,1,0.323412
2024-12-31 22:56:00,0.000113,-0.002643,-0.002571,-0.000584,-0.0036,-0.000448,0.000113,0.002643,0.002571,0.000584,...,-0.406737,0.913545,1,0.781831,0.62349,0,0,0,1,0.324226
2024-12-31 22:57:00,-0.000643,-0.002403,-0.003393,-0.001408,-0.004462,0.000132,0.000643,0.002403,0.003393,0.001408,...,-0.309017,0.951057,1,0.781831,0.62349,0,0,0,1,0.321068
2024-12-31 22:58:00,-0.000195,-0.001043,-0.003762,-0.00206,-0.004911,-0.001194,0.000195,0.001043,0.003762,0.00206,...,-0.207912,0.978148,1,0.781831,0.62349,0,0,0,1,0.321508
2024-12-31 22:59:00,-8.9e-05,-0.000873,-0.004214,-0.002752,-0.005292,-0.000876,8.9e-05,0.000873,0.004214,0.002752,...,-0.104528,0.994522,1,0.781831,0.62349,0,0,0,1,0.321727


### 6. Train / Valid / Test Split

In [15]:
# Chronological split boundaries.
VALID_START = pd.Timestamp('2024-01-01')
TEST_START = pd.Timestamp('2024-07-01')

# The target is built with horizons=[60] (column `target_rv_fwd60m`), so a row's
# label presumably depends on the 60 minutes after its timestamp — TODO confirm
# against fu.target_rv. Without a gap, rows in the last hour before a boundary
# would carry labels computed from the next split's data (look-ahead leakage),
# so one horizon is purged from the end of train and of valid.
TARGET_HORIZON = pd.Timedelta(minutes=60)

# train: everything whose label window closes before validation starts.
train = df[df.index < VALID_START - TARGET_HORIZON]

# valid: first half of 2024, again purged of rows whose labels reach into test.
valid = df[(df.index >= VALID_START) & (df.index < TEST_START - TARGET_HORIZON)]

# test: everything from the test boundary onwards.
test = df[df.index >= TEST_START]

### 7. Feature Selection (Train-only)

In [33]:
# Relevance filter, computed on the training window only: Spearman rank
# correlation of each feature column with the (single) target column.
corr = train.loc[:, features_df.columns].apply(
    lambda feature: feature.corr(train[target_df.columns[0]], method="spearman")
)

# Keep the features whose absolute rank correlation exceeds 0.1.
selected = corr.loc[corr.abs() > 0.1].index

In [28]:
# Redundancy filter, computed on the training window only.
# The pairwise matrix gets its own name so that `corr` is not overwritten:
# rebinding `corr` here made the following display cell depend on the order in
# which cells were executed.
feature_corr = train[selected].corr().abs()

# Keep only the strict upper triangle so each feature pair is looked at once
# and the diagonal (self-correlation) is ignored.
upper = feature_corr.where(np.triu(np.ones(feature_corr.shape, dtype=bool), k=1))

# A feature is dropped when its absolute correlation with any feature listed
# before it exceeds 0.7.
to_drop = [col for col in upper.columns if (upper[col] > 0.7).any()]

# Surviving features, in their original order.
final_features = [f for f in selected if f not in to_drop]

In [None]:
# Display the `corr` entries for the final feature list.
# NOTE(review): the recorded output is a Series of per-feature correlations with
# the target, so this cell expects `corr` to still be the Spearman Series from
# the relevance step — verify `corr` has not been rebound before this runs.
corr[final_features]

abs_ret_1m            0.446153
abs_ret_5m            0.431251
abs_ret_15m           0.451432
sq_ret_1m             0.446153
rv_5m                 0.645498
kurtosis_15m         -0.159001
kurtosis_30m         -0.150584
kurtosis_60m         -0.104963
tick_vol              0.654520
price_range_z_120m    0.105226
trade_vol             0.660708
hour                  0.210318
cos_hour             -0.112866
day_of_week          -0.249476
is_dst               -0.126330
dtype: float64

### 8. Walk-Forward CV (Train-Only)

### 9. Train Final Model

### 10. Validation Evaluation

### 11. OOS Evaluation

### 12. Test & Backtest

### 13. Diagnostics