In [2]:
import pandas as pd
import os
import numpy as np
from keras.preprocessing import timeseries_dataset_from_array

2024-12-05 00:44:00.307104: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
class DataPreprocess:
    def __init__(self):
        pass

    def load_data(self, path):
        df = pd.read_csv(path)
        df.drop("Unnamed: 0", inplace=True, axis = 1)
        #df.drop("system_time", inplace=True, axis = 1)
        return df

    def get_bid_levels(self, df, bid_levels):
        df_tmp = pd.DataFrame()
        df_tmp['time'] = pd.to_datetime(df['system_time'])
        df_tmp['midpoint'] = df['midpoint']
        
        # Filter ask and bid volume columns
        ask_cols = [f'asks_limit_notional_{i}' for i in range(bid_levels)]
        bid_cols = [f'bids_limit_notional_{i}' for i in range(bid_levels)]
        ask_cols = [col for col in ask_cols if col in df.columns]
        bid_cols = [col for col in bid_cols if col in df.columns]
        
        # Assign ask and bid volumes
        for lvls in range(len(ask_cols)):  # Use the length of filtered columns
            df_tmp[f'ask_volume{lvls}'] = df[ask_cols[lvls]]
        for lvls in range(len(bid_cols)):  # Use the length of filtered columns
            df_tmp[f'bid_volume{lvls}'] = df[bid_cols[lvls]]
        
        # Filter ask and bid distance columns
        ask_distance_cols = [f'asks_distance_{i}' for i in range(bid_levels)]
        bid_distance_cols = [f'bids_distance_{i}' for i in range(bid_levels)]
        ask_distance_cols = [col for col in ask_distance_cols if col in df.columns]
        bid_distance_cols = [col for col in bid_distance_cols if col in df.columns]
        
        # Assign ask and bid prices
        for lvls in range(len(ask_distance_cols)):  # Use the length of filtered columns
            df_tmp[f'ask_price{lvls}'] = df['midpoint'] + (df['midpoint'] * df[ask_distance_cols[lvls]])
        for lvls in range(len(bid_distance_cols)):  # Use the length of filtered columns
            df_tmp[f'bid_price{lvls}'] = df['midpoint'] + (df['midpoint'] * df[bid_distance_cols[lvls]])
        return df_tmp

    """
        For a actively traded stock,
        1. If there is an increase in the bid price in future compared to its current or previous value.
            We take the Bid volume.
        2. If the previous and current price is same we take the difference in the volume.
        3. If the current price is less than the current one then we take -ve of the current volume.

        Order Flow Imbalance is actually a factor that denotes the buying side or selling pressue for a particular stock at that level.
        When OFI > 0 it means that market is dominated by sellers. Lot more bids have happened.
        When OFI < 0 it means that market is dominated by buyers. Lot more asks have happened.
    """
    def get_bid_with_ofi_levels(self, df_tmp, bid_levels):
        for lvls in range(bid_levels):
            bid_price = df_tmp[f'bid_price{lvls}']
            ask_price = df_tmp[f'ask_price{lvls}']
            bid_volume = df_tmp[f'bid_volume{lvls}']
            ask_volume = df_tmp[f'ask_volume{lvls}']
            
            # Shift prices and volumes to compute differences
            bid_price_prev = bid_price.shift(1)
            ask_price_prev = ask_price.shift(1)
            bid_volume_prev = bid_volume.shift(1)
            ask_volume_prev = ask_volume.shift(1)
            
            # Calculate bid and ask order flows
            bid_of = np.where(
                bid_price > bid_price_prev, bid_volume,
                np.where(bid_price == bid_price_prev, bid_volume - bid_volume_prev, -bid_volume)
            )
            ask_of = np.where(
                ask_price > ask_price_prev, -ask_volume,
                np.where(ask_price == ask_price_prev, ask_volume - ask_volume_prev, ask_volume)
            )
            
            # Calculate OFI
            df_tmp[f'ofi_level{lvls}'] = bid_of - ask_of
        return df_tmp

    
    def make_window(self, window_size, dataframe, partition):
        normalized_df = df_new.to_numpy()
        dataframe.drop("time", inplace=True, axis = 1)
        train_length = int(len(normalized_df) * partition)
        data_length = len(normalized_df)
        
        midpoint_idx = df_new.columns.get_loc("midpoint")
        
        windows_train = timeseries_dataset_from_array(
            normalized_df[:train_length],  # Features for training
            normalized_df[window_size:train_length + window_size, midpoint_idx],  # Labels for training
            window_size,
            sequence_stride=1,
            sampling_rate=1,
            batch_size=64,
            shuffle=False,
            seed=None
        )
        
        # Windows for test dataset
        windows_test = timeseries_dataset_from_array(
            normalized_df[train_length:data_length - window_size],  # Features for testing
            normalized_df[train_length + window_size:, midpoint_idx],  # Labels for testing
            window_size,
            sequence_stride=1,
            sampling_rate=1,
            batch_size=64,
            shuffle=False
        )
        return windows_train, windows_test

    def make_generic_window(self, window_side, dataframe):
        """
            TODO: Not sure why its required but good to have.
        """
        pass

In [4]:
dp = DataPreprocess()

df = dp.load_data('archive/ETH_1min.csv')

In [5]:
df.head()

Unnamed: 0,system_time,midpoint,spread,buys,sells,bids_distance_0,bids_distance_1,bids_distance_2,bids_distance_3,bids_distance_4,...,asks_market_notional_5,asks_market_notional_6,asks_market_notional_7,asks_market_notional_8,asks_market_notional_9,asks_market_notional_10,asks_market_notional_11,asks_market_notional_12,asks_market_notional_13,asks_market_notional_14
0,2021-04-07 11:33:49.861733+00:00,1965.845,0.01,875154.482918,1684774.0,-3e-06,-0.000155,-0.00016,-0.000257,-0.000262,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021-04-07 11:34:49.861733+00:00,1969.645,0.65,514168.079888,858219.0,-0.000165,-0.00019,-0.000201,-0.000206,-0.000216,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2021-04-07 11:35:49.861733+00:00,1975.595,0.29,729915.129243,1446984.0,-7.3e-05,-7.8e-05,-9.4e-05,-9.9e-05,-0.000195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2021-04-07 11:36:49.861733+00:00,1969.335,0.19,611826.976792,598110.2,-4.8e-05,-0.000165,-0.00017,-0.00019,-0.000277,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2021-04-07 11:37:49.861733+00:00,1970.965,0.49,429786.641273,414178.3,-0.000124,-0.000129,-0.0002,-0.000246,-0.000261,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
df.head()

Unnamed: 0,system_time,midpoint,spread,buys,sells,bids_distance_0,bids_distance_1,bids_distance_2,bids_distance_3,bids_distance_4,...,asks_market_notional_5,asks_market_notional_6,asks_market_notional_7,asks_market_notional_8,asks_market_notional_9,asks_market_notional_10,asks_market_notional_11,asks_market_notional_12,asks_market_notional_13,asks_market_notional_14
0,2021-04-07 11:33:49.861733+00:00,1965.845,0.01,875154.482918,1684774.0,-3e-06,-0.000155,-0.00016,-0.000257,-0.000262,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021-04-07 11:34:49.861733+00:00,1969.645,0.65,514168.079888,858219.0,-0.000165,-0.00019,-0.000201,-0.000206,-0.000216,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2021-04-07 11:35:49.861733+00:00,1975.595,0.29,729915.129243,1446984.0,-7.3e-05,-7.8e-05,-9.4e-05,-9.9e-05,-0.000195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2021-04-07 11:36:49.861733+00:00,1969.335,0.19,611826.976792,598110.2,-4.8e-05,-0.000165,-0.00017,-0.00019,-0.000277,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2021-04-07 11:37:49.861733+00:00,1970.965,0.49,429786.641273,414178.3,-0.000124,-0.000129,-0.0002,-0.000246,-0.000261,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df_new = dp.get_bid_levels(df, 5)
df_new.head()

Unnamed: 0,time,midpoint,ask_volume0,ask_volume1,ask_volume2,ask_volume3,ask_volume4,bid_volume0,bid_volume1,bid_volume2,...,ask_price0,ask_price1,ask_price2,ask_price3,ask_price4,bid_price0,bid_price1,bid_price2,bid_price3,bid_price4
0,2021-04-07 11:33:49.861733+00:00,1965.845,20005.570312,3234.25,20005.31,4350.029785,5839.970215,86711.492188,9650.30957,29679.5,...,1965.85,1966.11,1966.12,1966.23,1966.32,1965.84,1965.54,1965.53,1965.34,1965.33
1,2021-04-07 11:34:49.861733+00:00,1969.645,36921.578125,29840.800781,1003755.0,3873.060059,20216.859375,29539.800781,1031.329956,2736.439941,...,1969.97,1969.99,1970.0,1970.02,1970.07,1969.32,1969.27,1969.25,1969.24,1969.22
2,2021-04-07 11:35:49.861733+00:00,1975.595,13438.839844,3873.080078,6975.46,21856.339844,28646.380859,1278.369995,2453.889893,31211.480469,...,1975.74,1976.06,1976.33,1976.34,1976.35,1975.45,1975.44,1975.41,1975.4,1975.21
3,2021-04-07 11:36:49.861733+00:00,1969.335,2028.51001,31570.119141,22044.19,4873.629883,19998.630859,5555.390137,1969.01001,224564.453125,...,1969.43,1969.44,1969.99,1970.24,1970.25,1969.24,1969.01,1969.0,1968.96,1968.79
4,2021-04-07 11:37:49.861733+00:00,1970.965,1163.01001,1163.02002,3769.63,21908.570312,4873.0,1032.089966,4995.910156,29558.550781,...,1971.21,1971.22,1971.27,1971.62,1971.63,1970.72,1970.71,1970.57,1970.48,1970.45


In [8]:
df_new['time'].dtype

datetime64[ns, UTC]

In [9]:
df_new = dp.get_bid_with_ofi_levels(df_new, 5)

In [20]:
df_new

Unnamed: 0,time,midpoint,ask_volume0,ask_volume1,ask_volume2,ask_volume3,ask_volume4,bid_volume0,bid_volume1,bid_volume2,...,bid_price0,bid_price1,bid_price2,bid_price3,bid_price4,ofi_level0,ofi_level1,ofi_level2,ofi_level3,ofi_level4
0,2021-04-07 11:33:49.861733+00:00,1965.845,20005.570312,3234.250000,2.000531e+04,4350.029785,5839.970215,86711.492188,9650.309570,29679.500000,...,1965.84,1965.54,1965.53,1965.34,1965.33,-106717.062500,-12884.559570,-4.968481e+04,-14322.169434,-89773.001465
1,2021-04-07 11:34:49.861733+00:00,1969.645,36921.578125,29840.800781,1.003755e+06,3873.060059,20216.859375,29539.800781,1031.329956,2736.439941,...,1969.32,1969.27,1969.25,1969.24,1969.22,66461.378906,30872.130737,1.006492e+06,14716.290527,52525.349609
2,2021-04-07 11:35:49.861733+00:00,1975.595,13438.839844,3873.080078,6.975460e+03,21856.339844,28646.380859,1278.369995,2453.889893,31211.480469,...,1975.45,1975.44,1975.41,1975.40,1975.21,14717.209839,6326.969971,3.818694e+04,22612.319824,36547.220703
3,2021-04-07 11:36:49.861733+00:00,1969.335,2028.510010,31570.119141,2.204419e+04,4873.629883,19998.630859,5555.390137,1969.010010,224564.453125,...,1969.24,1969.01,1969.00,1968.96,1968.79,-7583.900146,-33539.129150,-2.466086e+05,-4932.699883,-20758.770874
4,2021-04-07 11:37:49.861733+00:00,1970.965,1163.010010,1163.020020,3.769630e+03,21908.570312,4873.000000,1032.089966,4995.910156,29558.550781,...,1970.72,1970.71,1970.57,1970.48,1970.45,2195.099976,6158.930176,3.332818e+04,23779.410278,36103.509766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17105,2021-04-19 09:49:00.345392+00:00,2238.505,135510.562500,34591.921875,3.906943e+04,41868.960938,63918.011719,10297.099609,2417.570068,2417.040039,...,2238.50,2238.49,2238.00,2237.82,2237.59,-145807.662109,-37009.491943,-4.148647e+04,-50947.090820,-79133.622070
17106,2021-04-19 09:50:00.345392+00:00,2238.005,90845.898438,1277.660034,5.309400e+02,3516.110107,4477.000000,67052.460938,5594.649902,17410.080078,...,2238.00,2237.86,2237.80,2237.72,2237.45,-157898.359375,-6872.309937,-1.794102e+04,-19868.909912,-8951.899902
17107,2021-04-19 09:51:00.345392+00:00,2240.405,30305.349609,20006.939453,3.874170e+03,34628.078125,38479.269531,54528.968750,5600.649902,4480.500000,...,2240.40,2240.26,2240.25,2240.24,2240.19,84834.318359,25607.589355,8.354670e+03,40855.948242,41229.499512
17108,2021-04-19 09:52:00.345392+00:00,2236.795,2972.479980,2972.969971,2.965580e+03,3872.439941,20004.179688,107661.648438,6620.870117,5591.580078,...,2236.79,2236.78,2236.63,2236.60,2236.59,-110634.128418,-9593.840088,-8.557160e+03,-8345.640137,-22378.359619
