In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import glob

In [2]:
lobs_df = pd.read_csv("adhoc/orderbook_logs/btcusdt_orderbook_20241218_165805.csv")
lobs_pkl = "adhoc/orderbook_logs/btcusdt_orderbook_20241218_165805.pickle"

### PREPROCESS LOB FILE

In [3]:
lobs_df = lobs_df.set_index(lobs_df.columns[0])
lobs_df.index.names = ['']

In [4]:
lobs_df.set_index('receive_ts', inplace=True)
lobs_df = lobs_df[~lobs_df.index.duplicated(keep='first')]
lobs_df.sort_values(by=['receive_ts'], inplace=True)
lobs_df.reset_index(inplace=True)

In [5]:
lobs_df[['bid_price_0', 'ask_price_0']].head()

Unnamed: 0,bid_price_0,ask_price_0
0,103780.0,103780.0
1,103780.0,103780.0
2,103780.0,103780.0
3,103777.7,103777.7
4,103777.7,103777.7


In [6]:
lobs_df['mid_price'] = (lobs_df['ask_price_0'] + lobs_df['bid_price_0']) / 2

In [7]:
lobs_df['returns'] = (lobs_df['mid_price'] / lobs_df['mid_price'].shift(1) - 1).fillna(0)

In [8]:
windows = {60: '1m', 300: '5m', 900: '15m'}
for w in windows.keys():
    lobs_df[f'gain_{windows[w]}'] = lobs_df.set_index('receive_ts')['returns'].rolling(w).apply(
        lambda x: x[x > 0].sum()
    ).reset_index()['returns']

    print(lobs_df.head(2))

    lobs_df[f'loss_{windows[w]}'] = lobs_df.set_index('receive_ts')['returns'].rolling(w).apply(
        lambda x: x[x < 0].sum()
    ).reset_index()['returns']

            receive_ts  bid_price_0  bid_vol_0  bid_price_1  bid_vol_1  \
0  2024-12-18 08:58:06     103780.0    2.27683     103778.6     0.0001   
1  2024-12-18 08:58:07     103780.0    1.96353     103775.1     0.0000   

   bid_price_2  bid_vol_2  bid_price_3  bid_vol_3  bid_price_4  ...  \
0     103778.5    0.03860     103775.1    0.00100     103774.1  ...   
1     103761.1    0.33546     103758.0    0.78287     103755.6  ...   

   ask_price_7  ask_vol_7  ask_price_8  ask_vol_8  ask_price_9  ask_vol_9  \
0     103783.1    0.00000     103785.7    0.00000     103786.5        0.0   
1     103782.9    0.04831     103786.0    0.19275     103788.6        0.0   

                  exchange_ts  mid_price  returns  gain_1m  
0  2024-12-18T08:58:06.095042   103780.0      0.0      NaN  
1  2024-12-18T08:58:07.093771   103780.0      0.0      NaN  

[2 rows x 45 columns]
            receive_ts  bid_price_0  bid_vol_0  bid_price_1  bid_vol_1  \
0  2024-12-18 08:58:06     103780.0    2.27683     

In [9]:
lobs_df.describe()

Unnamed: 0,bid_price_0,bid_vol_0,bid_price_1,bid_vol_1,bid_price_2,bid_vol_2,bid_price_3,bid_vol_3,bid_price_4,bid_vol_4,...,ask_price_9,ask_vol_9,mid_price,returns,gain_1m,loss_1m,gain_5m,loss_5m,gain_15m,loss_15m
count,26641.0,26641.0,26641.0,26641.0,26641.0,26641.0,26641.0,26641.0,26641.0,26641.0,...,26641.0,26641.0,26641.0,26641.0,26582.0,26582.0,26342.0,26342.0,25742.0,25742.0
mean,104512.150377,3.002141,104510.676773,0.10534,104508.915975,0.093529,104506.378297,0.088939,104498.631256,0.093973,...,104977.426613,0.181687,104512.161985,-7.445986e-08,0.00083,-0.000836,0.004126,-0.004152,0.012346,-0.012375
std,392.935047,2.478657,392.952995,0.548406,398.08153,0.421495,402.723182,0.387226,612.494891,0.836345,...,3565.02191,6.226202,392.932286,6.830916e-05,0.000839,0.000867,0.00363,0.003834,0.010339,0.010858
min,103412.5,1e-05,103412.4,0.0,94219.0,0.0,94218.0,0.0,33000.0,0.0,...,103414.5,0.0,103412.55,-0.0008948374,0.0,-0.005635,0.000132,-0.020396,0.001917,-0.043965
25%,104194.1,1.05083,104192.2,0.0001,104191.4,7e-05,104188.9,0.0,104184.7,0.0,...,104225.4,0.0,104194.1,0.0,0.000227,-0.001155,0.001641,-0.005957,0.005034,-0.021892
50%,104525.6,2.53651,104523.7,0.0002,104521.0,0.0002,104519.4,0.0002,104517.0,0.0002,...,104566.0,0.00015,104525.6,0.0,0.000526,-0.000524,0.002595,-0.002595,0.008091,-0.007755
75%,104823.6,4.40513,104821.5,0.04268,104820.5,0.04111,104819.0,0.0385,104815.8,0.0384,...,104884.5,0.03362,104823.6,0.0,0.001159,-0.000238,0.005514,-0.00146,0.017414,-0.004327
max,105350.0,23.19445,105349.9,30.56878,105349.8,23.34201,105349.6,15.35774,105349.5,115.79628,...,158543.3,541.099419,105350.0,0.001222881,0.005225,0.0,0.016514,0.0,0.044644,-0.002015


### GEN FEATURE FILE

In [10]:
features_df = pd.DataFrame({'receive_ts': lobs_df['receive_ts']})

In [11]:
for column in lobs_df.filter(regex="_price_").columns.values:
    features_df[f'dist_{column}'] = (lobs_df[column] / lobs_df['mid_price'] - 1) * 1e06

In [12]:
for side in ['bid', 'ask']:
    features_df[f'{side}_cumul_0'] = lobs_df[f'{side}_price_0'] * lobs_df[f'{side}_vol_0']

for i in range(1, 10):
    for side in ['bid', 'ask']:
        features_df[f'{side}_cumul_{i}'] = (
                features_df[f'{side}_cumul_{i - 1}'] + lobs_df[f'{side}_price_{i}'] * lobs_df[f'{side}_vol_{i}']
        )

In [13]:
for i in range(10):
    features_df[f'national_imbalance_{i}'] = (
            (features_df[f'ask_cumul_{i}'] - features_df[f'bid_cumul_{i}']) /
            (features_df[f'ask_cumul_{i}'] + features_df[f'bid_cumul_{i}'])
    )

In [14]:
features_df.head(100)

Unnamed: 0,receive_ts,dist_bid_price_0,dist_bid_price_1,dist_bid_price_2,dist_bid_price_3,dist_bid_price_4,dist_bid_price_5,dist_bid_price_6,dist_bid_price_7,dist_bid_price_8,...,national_imbalance_0,national_imbalance_1,national_imbalance_2,national_imbalance_3,national_imbalance_4,national_imbalance_5,national_imbalance_6,national_imbalance_7,national_imbalance_8,national_imbalance_9
0,2024-12-18 08:58:06,0.000000,-13.490075,-14.453652,-47.215263,-56.851031,-63.596069,-75.158990,-88.649065,-103.102717,...,0.352411,0.352401,0.345027,0.344846,0.344857,0.360368,0.372773,0.346659,0.346659,0.271395
1,2024-12-18 08:58:07,0.000000,-47.215263,-182.116015,-211.986895,-235.112738,-265.947196,-393.139333,-491.424167,-501.059934,...,0.406870,0.449466,0.387539,0.256702,0.086654,0.086542,0.085247,0.089785,0.056223,0.056223
2,2024-12-18 08:58:08,0.000000,-5.781461,-14.453652,-22.162266,-27.943727,-36.615918,-38.543072,-48.178840,-49.142417,...,0.790178,0.808916,0.817273,0.822471,0.790264,0.790008,0.789316,0.779567,0.779567,0.781069
3,2024-12-18 08:58:09,0.000000,-5.781589,-6.745187,-9.635982,-14.453972,-16.381169,-23.126356,-26.017150,-28.907945,...,0.999740,0.999399,0.999372,0.999297,0.999279,0.996896,0.996799,0.998115,0.998173,0.998151
4,2024-12-18 08:58:10,0.000000,-5.781589,-6.745187,-9.635982,-14.453972,-15.417570,-16.381169,-23.126356,-28.907945,...,0.855698,0.855675,0.881935,0.881937,0.881891,0.882241,0.885785,0.891105,0.891186,0.891172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2024-12-18 08:59:41,-0.481648,-9.151303,-12.041189,-13.004484,-22.637435,-30.343795,-37.086861,-47.683107,-60.205943,...,0.995262,0.995903,0.995903,0.995999,0.995875,0.995814,0.972129,0.966149,0.966149,0.958797
96,2024-12-18 08:59:42,-0.481648,-13.004484,-15.894369,-21.674139,-22.637435,-37.086861,-47.683107,-61.169238,-94.884566,...,0.908694,0.912926,0.884507,0.884487,0.884380,0.884565,0.877655,0.877808,0.876909,0.868520
97,2024-12-18 08:59:43,-0.481648,-2.408238,-3.371533,-4.334828,-13.004484,-13.967779,-15.894369,-22.637435,-26.490615,...,0.547106,0.560055,0.560038,0.544907,0.547397,0.519215,0.520228,0.512894,0.512691,0.516958
98,2024-12-18 08:59:44,-0.481648,-3.371533,-4.334828,-6.261418,-13.004484,-21.674139,-22.637435,-37.086861,-99.701041,...,0.546147,0.546777,0.546754,0.532678,0.532708,0.532957,0.533025,0.494142,0.480040,0.480030


In [15]:
bid_price = lobs_df['bid_price_0']
ask_price = lobs_df['ask_price_0']
bid_vol = lobs_df['bid_vol_0']
ask_vol = lobs_df['ask_vol_0']

prev_bid_price = lobs_df['bid_price_0'].shift(1)
prev_ask_price = lobs_df['ask_price_0'].shift(1)
prev_bid_vol = lobs_df['bid_vol_0'].shift(1)
prev_ask_vol = lobs_df['ask_vol_0'].shift(1)

features_df['order_flow_imbalance'] = (
        (bid_price >= prev_bid_price) * bid_vol -
        (bid_price <= prev_bid_price) * prev_bid_vol -
        (ask_price <= prev_ask_price) * ask_vol +
        (ask_price >= prev_ask_price) * prev_ask_vol
)
features_df['order_flow_imbalance'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  features_df['order_flow_imbalance'].fillna(0, inplace=True)


In [16]:
features_df['spread'] = (lobs_df['ask_price_0'] - lobs_df['bid_price_0']) / lobs_df['mid_price'] * 1e06

In [17]:
features_df.head()

Unnamed: 0,receive_ts,dist_bid_price_0,dist_bid_price_1,dist_bid_price_2,dist_bid_price_3,dist_bid_price_4,dist_bid_price_5,dist_bid_price_6,dist_bid_price_7,dist_bid_price_8,...,national_imbalance_2,national_imbalance_3,national_imbalance_4,national_imbalance_5,national_imbalance_6,national_imbalance_7,national_imbalance_8,national_imbalance_9,order_flow_imbalance,spread
0,2024-12-18 08:58:06,0.0,-13.490075,-14.453652,-47.215263,-56.851031,-63.596069,-75.15899,-88.649065,-103.102717,...,0.345027,0.344846,0.344857,0.360368,0.372773,0.346659,0.346659,0.271395,0.0,0.0
1,2024-12-18 08:58:07,0.0,-47.215263,-182.116015,-211.986895,-235.112738,-265.947196,-393.139333,-491.424167,-501.059934,...,0.387539,0.256702,0.086654,0.086542,0.085247,0.089785,0.056223,0.056223,-0.2158,0.0
2,2024-12-18 08:58:08,0.0,-5.781461,-14.453652,-22.162266,-27.943727,-36.615918,-38.543072,-48.17884,-49.142417,...,0.817273,0.822471,0.790264,0.790008,0.789316,0.779567,0.779567,0.781069,-1.81601,0.0
3,2024-12-18 08:58:09,0.0,-5.781589,-6.745187,-9.635982,-14.453972,-16.381169,-23.126356,-26.01715,-28.907945,...,0.999372,0.999297,0.999279,0.996896,0.996799,0.998115,0.998173,0.998151,-7.83294,0.0
4,2024-12-18 08:58:10,0.0,-5.781589,-6.745187,-9.635982,-14.453972,-15.41757,-16.381169,-23.126356,-28.907945,...,0.881935,0.881937,0.881891,0.882241,0.885785,0.891105,0.891186,0.891172,0.77462,0.0


In [18]:
for w in windows.values():
    features_df[f'CRSI_{w}'] = (
            (lobs_df[f'gain_{w}'] - lobs_df[f'loss_{w}'].abs()) /
            (lobs_df[f'gain_{w}'] + lobs_df[f'loss_{w}'].abs())
    ).fillna(0)

In [19]:
features_df.tail()

Unnamed: 0,receive_ts,dist_bid_price_0,dist_bid_price_1,dist_bid_price_2,dist_bid_price_3,dist_bid_price_4,dist_bid_price_5,dist_bid_price_6,dist_bid_price_7,dist_bid_price_8,...,national_imbalance_5,national_imbalance_6,national_imbalance_7,national_imbalance_8,national_imbalance_9,order_flow_imbalance,spread,CRSI_1m,CRSI_5m,CRSI_15m
26636,2024-12-18 16:22:11,0.0,-2.898148,-3.864197,-4.830246,-6.762344,-15.456787,-18.354934,-19.320983,-23.18518,...,-0.521786,-0.535293,-0.551307,-0.553797,-0.553844,-6.38671,0.0,-0.07762,-0.085931,-0.164109
26637,2024-12-18 16:22:12,0.0,-1.931906,-2.897859,-3.863812,-4.829765,-6.761671,-10.625484,-11.591437,-12.55739,...,-0.675715,-0.675656,-0.667856,-0.663819,-0.663608,4.06696,0.0,-0.041241,-0.072284,-0.162609
26638,2024-12-18 16:22:13,0.0,-0.965863,-1.931725,-3.86345,-4.829313,-5.795175,-7.7269,-8.692763,-12.556213,...,-0.927996,-0.927908,-0.92302,-0.922791,-0.899463,3.63341,0.0,-0.024924,-0.067865,-0.161806
26639,2024-12-18 16:22:14,0.0,-0.96555,-2.89665,-6.758851,-8.689951,-9.655501,-10.621052,-11.586602,-13.517702,...,0.833563,0.833563,0.833563,0.711516,0.712369,0.26215,0.0,0.027693,-0.05288,-0.155822
26640,2024-12-18 16:22:15,0.0,-2.89665,-6.758851,-8.689951,-9.655501,-12.552152,-13.517702,-19.311003,-21.242103,...,0.407531,0.407479,0.414672,0.414635,0.429425,-0.0454,0.0,0.027693,-0.05288,-0.155254


In [20]:
features_df['ESS'] = features_df.iloc[:, 1:].values.tolist()

In [21]:
import pickle
features_dict = features_df.set_index('receive_ts')['ESS'].to_dict()
with open(lobs_pkl, 'wb') as f:
    pickle.dump(features_dict, f)