# Calculate Other Bar Sizes Given a Fixed TimeBar Size

In the `time-bar-coverage-ratio.ipynb` notebook we found that 4s time bars of BTC cover more than 95% of trades and 8s time bars of ETH cover more than 95% of trades. **So we're going to use 4s as the BTC TimeBar size and 8s as the ETH TimeBar size.** This notebook calculates the corresponding bar sizes for TickBar, VolumeBar and DollarBar.

* The TickBar size is calculated as `total number of trades / number of time bars`
* The VolumeBar size is calculated as `Sum of Volume / number of time bars`
* The DollarBar size is calculated as `Sum of USD Volume / number of time bars`, where the USD volume of a trade is computed as `price * quantity`

In [1]:
import gc
import glob
import math
import os
from typing import Callable, Dict, List, Tuple

import dask.bag as db
import pandas as pd
import seaborn as sns
from dask.diagnostics import ProgressBar
from matplotlib import pyplot as plt
from scipy import stats

## Read Trade Data

In [2]:
# Root directory of the per-trade CSV files (one file per exchange / market type / pair).
# Trades in each file are already sorted by trade_id, except BitMEX, which is sorted by timestamp.
PER_TRADE_DATA_DIR = '/data/csv'

In [3]:
# (exchange, market type, quote currency) of every venue analysed in this notebook.
# The BTC and ETH pair lists cover exactly the same venues, in the same order.
_VENUES = [
    ('Binance', 'Spot', 'USDT'),
    ('Binance', 'Swap', 'USDT'),
    ('BitMEX', 'Swap', 'USD'),
    ('Huobi', 'Spot', 'USDT'),
    ('Huobi', 'Swap', 'USD'),
    ('OKEx', 'Spot', 'USDT'),
    ('OKEx', 'Swap', 'USDT'),
    ('OKEx', 'Swap', 'USD'),
]

# (exchange, market_type, pair) triples, e.g. ('Binance', 'Spot', 'BTC_USDT').
BTC_PAIRS = [(exchange, market_type, f'BTC_{quote}') for exchange, market_type, quote in _VENUES]

# (exchange, market_type, pair) triples, e.g. ('OKEx', 'Swap', 'ETH_USD').
ETH_PAIRS = [(exchange, market_type, f'ETH_{quote}') for exchange, market_type, quote in _VENUES]

In [4]:
# TimeBar sizes in milliseconds: 4 seconds for BTC and 8 seconds for ETH.
BTC_TIME_BAR_SIZE = 4 * 1000
ETH_TIME_BAR_SIZE = 8 * 1000

In [5]:
def get_csv_file(exchange: str, market_type: str, pair: str)->str:
    """Build the path of the per-trade CSV file for one exchange / market type / pair.

    Args:
        exchange: Exchange name, e.g. 'Binance'.
        market_type: Either 'Spot' or 'Swap'; anything else fails the assertion.
        pair: Trading pair, e.g. 'BTC_USDT'.

    Returns:
        `<PER_TRADE_DATA_DIR>/<exchange>.<market_type>.<pair>.csv`.
    """
    assert market_type in ('Spot', 'Swap')
    csv_name = f'{exchange}.{market_type}.{pair}.csv'
    return os.path.join(PER_TRADE_DATA_DIR, csv_name)

In [6]:
# Sanity check: path of the trade file for the first BTC pair.
get_csv_file(*BTC_PAIRS[0])

'/data/csv/Binance.Spot.BTC_USDT.csv'

In [7]:
# Sanity check: path of the trade file for the last ETH pair.
get_csv_file(*ETH_PAIRS[-1])

'/data/csv/OKEx.Swap.ETH_USD.csv'

In [8]:
def read_csv(trade_csv_file: str)->pd.DataFrame:
    """Load the `timestamp`, `price` and `quantity` columns of a per-trade CSV file.

    Args:
        trade_csv_file: Path of the CSV file to read.

    Returns:
        A DataFrame holding only the three columns listed above.
    """
    # Declared types of the columns of a trade file; only the ones selected
    # through `usecols` below are actually loaded.
    column_types = {
        'exchange': 'category',
        'marketType': 'category',
        'pair': 'category',
        'timestamp': 'int64',
        'price': 'float64',
        'quantity': 'float64',
        'side': 'bool',
        'trade_id': 'string',
    }
    wanted_columns = ['timestamp', 'price', 'quantity']
    return pd.read_csv(trade_csv_file, engine='c', dtype=column_types, usecols=wanted_columns)

In [9]:
# Load the trades of the last ETH pair (OKEx Swap ETH_USD) to inspect the data layout.
okex_swap_eth_usd = read_csv(get_csv_file(*ETH_PAIRS[-1]))

In [10]:
# Preview the first rows of the loaded trades.
okex_swap_eth_usd.head()

Unnamed: 0,timestamp,price,quantity
0,1588291218207,206.18,3.880105
1,1588291218207,206.18,2.134058
2,1588291218207,206.18,2.425065
3,1588291218207,206.15,2.279893
4,1588291218207,206.15,1.261218


## Calculate Number of Bars

In [11]:
def calc_num_bars(input_csv_file: str, bar_type:str, bar_size: int)->int:
    """Count the distinct bars that the trades of one CSV file fall into.

    Args:
        input_csv_file: Path of the per-trade CSV file.
        bar_type: One of 'TimeBar', 'TickBar', 'VolumeBar' or 'DollarBar';
            anything else fails the assertion.
        bar_size: Width of one bar: a timestamp span for 'TimeBar', a number of
            trades for 'TickBar', cumulative `quantity` for 'VolumeBar' and
            cumulative `quantity * price` for 'DollarBar'.

    Returns:
        The number of distinct bar indices among the trades.
    """
    # Each builder maps the trade frame to one bar index per trade.
    index_builders = {
        'TimeBar': lambda trades: trades['timestamp'] // bar_size,
        'TickBar': lambda trades: pd.Series(trades.index // bar_size),
        'VolumeBar': lambda trades: (
            trades['quantity'].astype('float64').cumsum().floordiv(bar_size).astype('uint32')
        ),
        'DollarBar': lambda trades: (
            (trades['quantity'] * trades['price'])
            .astype('float64').cumsum().floordiv(bar_size).astype('uint32')
        ),
    }
    assert bar_type in index_builders

    trades = read_csv(input_csv_file)
    bar_count = index_builders[bar_type](trades).nunique()

    # Release the (potentially large) frame right away.
    del trades
    gc.collect()
    return bar_count

In [12]:
# Number of 4s time bars spanned by the trades of the first BTC pair.
calc_num_bars(get_csv_file(*BTC_PAIRS[0]), 'TimeBar', BTC_TIME_BAR_SIZE)

750215

## Calculate Other Bar Sizes Given a Fixed TimeBar Size

In [13]:
def calc_bar_sizes(exchange: str, market_type: str, pair: str, time_bar_size: int)->Dict:
    """Derive TickBar, VolumeBar and DollarBar sizes matching a fixed TimeBar size.

    Each size is the corresponding total (number of trades, sum of `quantity`,
    sum of `quantity * price`) divided by the number of distinct time bars, so
    that every bar type yields roughly as many bars as the time bars do.

    The trade file is parsed only once: the time bars are counted on the frame
    that is already in memory instead of re-reading the CSV for the count.

    Args:
        exchange: Exchange name, e.g. 'Binance'.
        market_type: 'Spot' or 'Swap'.
        pair: Trading pair, e.g. 'BTC_USDT'.
        time_bar_size: TimeBar width, in the unit of the `timestamp` column;
            must be positive.

    Returns:
        A dict with the keys 'exchange', 'market_type', 'pair',
        'time_bar_size', 'tick_bar_size', 'volume_bar_size' and
        'dollar_bar_size'.

    Raises:
        ValueError: If `time_bar_size` is not positive, or the trade file
            contains no trades.
    """
    # Validate before any I/O: a non-positive width cannot define time bars.
    if time_bar_size <= 0:
        raise ValueError(f'time_bar_size must be positive, got {time_bar_size}')

    csv_file = get_csv_file(exchange, market_type, pair)
    df = read_csv(csv_file)

    # Same indexing as calc_num_bars(csv_file, 'TimeBar', time_bar_size),
    # applied to the frame we already hold so the file is not read twice.
    num_time_bars = (df['timestamp'] // time_bar_size).nunique()
    if num_time_bars == 0:
        raise ValueError(f'no trades found in {csv_file}')

    tick_bar_size = df.shape[0] / num_time_bars
    volume_bar_size = df['quantity'].sum() / num_time_bars
    dollar_bar_size = (df['quantity'] * df['price']).sum() / num_time_bars

    # Release the (potentially large) frame right away.
    del df
    gc.collect()
    return {
        'exchange': exchange,
        'market_type': market_type,
        'pair': pair,
        'time_bar_size': time_bar_size,
        'tick_bar_size': tick_bar_size,
        'volume_bar_size': volume_bar_size,
        'dollar_bar_size': dollar_bar_size,
    }

In [14]:
# Example: bar sizes of the first BTC pair for the BTC TimeBar size.
calc_bar_sizes(*BTC_PAIRS[0], BTC_TIME_BAR_SIZE)

{'exchange': 'Binance',
 'market_type': 'Spot',
 'pair': 'BTC_USDT',
 'time_bar_size': 4000,
 'tick_bar_size': 34.84456189225755,
 'volume_bar_size': 3.7722689756163326,
 'dollar_bar_size': 34991.182889387055}

In [15]:
def calc_bar_sizes_batch(exchange_market_pairs: List[Tuple[str, str, str]], time_bar_size: int)->pd.DataFrame:
    """Run `calc_bar_sizes` for several (exchange, market_type, pair) triples.

    The triples are processed through a dask bag while a progress bar is shown.

    Args:
        exchange_market_pairs: (exchange, market_type, pair) triples to process.
        time_bar_size: TimeBar size passed to `calc_bar_sizes` for every triple.

    Returns:
        A DataFrame with one row per triple, built from the dicts returned by
        `calc_bar_sizes`.
    """
    def _sizes_for(triple: Tuple[str, str, str]) -> Dict:
        return calc_bar_sizes(*triple, time_bar_size)

    # Building the bag is lazy; the work only happens in compute().
    pending = db.from_sequence(exchange_market_pairs).map(_sizes_for)
    with ProgressBar():
        records = pending.compute()
    return pd.DataFrame(records)

In [16]:
# Bar sizes of every BTC pair, derived from the 4s BTC TimeBar size.
btc_bar_sizes_df = calc_bar_sizes_batch(BTC_PAIRS, BTC_TIME_BAR_SIZE)

[########################################] | 100% Completed | 24.5s


In [17]:
# Exact (unrounded) BTC bar sizes.
btc_bar_sizes_df

Unnamed: 0,exchange,market_type,pair,time_bar_size,tick_bar_size,volume_bar_size,dollar_bar_size
0,Binance,Spot,BTC_USDT,4000,34.844562,3.772269,34991.182889
1,Binance,Swap,BTC_USDT,4000,33.611422,17.373747,161001.40988
2,BitMEX,Swap,BTC_USD,4000,34.119584,12.09363,112079.639823
3,Huobi,Spot,BTC_USDT,4000,26.396362,2.388049,22187.847475
4,Huobi,Swap,BTC_USD,4000,28.493864,22.327544,206708.265216
5,OKEx,Spot,BTC_USDT,4000,31.980953,4.587918,42674.201193
6,OKEx,Swap,BTC_USDT,4000,12.571986,3.424178,31804.049356
7,OKEx,Swap,BTC_USD,4000,17.936789,4.061572,37683.871856


In [18]:
# Bar sizes of every ETH pair, derived from the 8s ETH TimeBar size.
eth_bar_sizes_df = calc_bar_sizes_batch(ETH_PAIRS, ETH_TIME_BAR_SIZE)

[########################################] | 100% Completed | 10.2s


In [19]:
# Exact (unrounded) ETH bar sizes.
eth_bar_sizes_df

Unnamed: 0,exchange,market_type,pair,time_bar_size,tick_bar_size,volume_bar_size,dollar_bar_size
0,Binance,Spot,ETH_USDT,8000,18.002067,69.972778,14692.076916
1,Binance,Swap,ETH_USDT,8000,20.244162,143.411014,30306.774187
2,BitMEX,Swap,ETH_USD,8000,12.494811,141.442765,30016.88871
3,Huobi,Spot,ETH_USDT,8000,26.013492,69.137962,14665.322329
4,Huobi,Swap,ETH_USD,8000,32.312178,371.240306,77990.499777
5,OKEx,Spot,ETH_USDT,8000,22.110132,55.443811,11751.34898
6,OKEx,Swap,ETH_USDT,8000,7.382528,46.831136,9990.643317
7,OKEx,Swap,ETH_USD,8000,11.721034,64.297399,13545.915915


## Conclusion

We're going to use the following bar sizes, each rounded down to the nearest power of two.

In [20]:
def prev_power_of_2(n: float):
    '''Find the largest power of two less than or equal to the number.

    The exponent is read from the binary representation of `n`
    (`int.bit_length` for integers, `math.frexp` for floats) rather than from
    a floating-point logarithm, so the result is exact. The exponent is
    floored, not truncated toward zero, so inputs below 1 yield a fractional
    power of two instead of 0.

    Args:
        n: A positive, finite number.

    Returns:
        An `int` power of two when `n >= 1` (e.g. 32 for 34.8); a `float`
        power of two when `0 < n < 1` (e.g. 0.25 for 0.3).

    Raises:
        ValueError: If `n` is zero, negative or NaN.
        OverflowError: If `n` is positive infinity.
    '''
    if isinstance(n, int):
        # Exact for arbitrarily large integers; no float conversion involved.
        if n <= 0:
            raise ValueError(f'n must be positive, got {n}')
        return 1 << (n.bit_length() - 1)

    if n > 0 and math.isinf(n):
        raise OverflowError('n must be finite, got infinity')
    if not n > 0:  # also rejects NaN, for which every comparison is False
        raise ValueError(f'n must be positive, got {n}')

    # frexp returns n == mantissa * 2**exponent with 0.5 <= mantissa < 1,
    # hence floor(log2(n)) == exponent - 1.
    _, exponent = math.frexp(n)
    power = exponent - 1
    return 1 << power if power >= 0 else 2.0 ** power

In [21]:
# Round every derived BTC bar size down to a power of two.
for size_column in ('tick_bar_size', 'volume_bar_size', 'dollar_bar_size'):
    btc_bar_sizes_df[size_column] = btc_bar_sizes_df[size_column].apply(prev_power_of_2)

In [22]:
# BTC bar sizes after rounding down to powers of two.
btc_bar_sizes_df

Unnamed: 0,exchange,market_type,pair,time_bar_size,tick_bar_size,volume_bar_size,dollar_bar_size
0,Binance,Spot,BTC_USDT,4000,32,2,32768
1,Binance,Swap,BTC_USDT,4000,32,16,131072
2,BitMEX,Swap,BTC_USD,4000,32,8,65536
3,Huobi,Spot,BTC_USDT,4000,16,2,16384
4,Huobi,Swap,BTC_USD,4000,16,16,131072
5,OKEx,Spot,BTC_USDT,4000,16,4,32768
6,OKEx,Swap,BTC_USDT,4000,8,2,16384
7,OKEx,Swap,BTC_USD,4000,16,4,32768


In [23]:
# Round every derived ETH bar size down to a power of two.
for size_column in ('tick_bar_size', 'volume_bar_size', 'dollar_bar_size'):
    eth_bar_sizes_df[size_column] = eth_bar_sizes_df[size_column].apply(prev_power_of_2)

In [24]:
# ETH bar sizes after rounding down to powers of two.
eth_bar_sizes_df

Unnamed: 0,exchange,market_type,pair,time_bar_size,tick_bar_size,volume_bar_size,dollar_bar_size
0,Binance,Spot,ETH_USDT,8000,16,64,8192
1,Binance,Swap,ETH_USDT,8000,16,128,16384
2,BitMEX,Swap,ETH_USD,8000,8,128,16384
3,Huobi,Spot,ETH_USDT,8000,16,64,8192
4,Huobi,Swap,ETH_USD,8000,32,256,65536
5,OKEx,Spot,ETH_USDT,8000,16,32,8192
6,OKEx,Swap,ETH_USDT,8000,4,32,8192
7,OKEx,Swap,ETH_USD,8000,8,64,8192
