# Build Standard Bars(Time Bar, Tick Bar, Volume Bar and Dollar Bar)

In [1]:
import gc
import glob
import os
from operator import itemgetter
from typing import Dict, List, Tuple

import dask.bag as db
import dask.dataframe as pd
import numpy as np
from dask.diagnostics import ProgressBar

In [2]:
total_volume = pd.read_csv('/data/csv/OKEx.Swap.XMR_USDT.csv', usecols=['quantity']).sum().compute()

## Build on a single CSV file

In [3]:
def build_standard_bars(input_csv_file: str, bar_type:str, bar_size: np.int64, output_csv_file: str)->None:
    assert bar_type == 'TimeBar' or bar_type == 'TickBar' or bar_type == 'VolumeBar' or bar_type == 'DollarBar'
    df = pd.read_csv(input_csv_file, engine='c',
                     dtype={'exchange': 'category', 'marketType': 'category', 'pair': 'category',
                            'timestamp': 'int64', 'price': 'float32',
                            'quantity': 'float32', 'side': 'bool', 
                            'trade_id': 'string' if 'BitMEX' in input_csv_file else 'int64'})

    if 'BitMEX' in input_csv_file:
        # Tell Dask that the timestamp column is already sorted
        df = df.set_index('timestamp', sorted=True, drop=False).reset_index(drop=True)
    else:
        # Tell Dask that the trade_id column is already sorted
        df = df.set_index('trade_id', sorted=True, drop=False).reset_index(drop=True)

    df['quantity_sell'] = df['quantity'] * df['side']
    df['quantity_buy'] = df['quantity']-df['quantity_sell']
    df['quantity_quote'] = df['quantity'] * df['price']
    df['quantity_quote_sell'] = df['quantity_quote'] * df['side']
    df['quantity_quote_buy'] = df['quantity_quote']-df['quantity_sell']
    df['count'] = 1
    df['count_sell'] = df['side'].astype('int32')
    df['count_buy'] = df['count']-df['count_sell']
    
    if bar_type == 'TimeBar':
        df['bar_index'] = df['timestamp'] // bar_size
    elif bar_type == 'TickBar':
        df['bar_index'] = (df.index // bar_size).to_series().reset_index(drop=True)
    elif bar_type == 'VolumeBar':
        df['bar_index'] = df['quantity'].astype('float64').cumsum().floordiv(bar_size).astype('uint32')
    elif bar_type == 'DollarBar':
        df['bar_index'] = (df['quantity'] * df['price']).astype('float64').cumsum().floordiv(bar_size).astype('uint32')

    bars_df = df.groupby('bar_index').agg({
            'timestamp': ['first', 'last'],
            'price': ['first', 'max', 'min', 'last', 'mean'],
            'quantity': 'sum',
            'quantity_sell': 'sum',
            'quantity_buy': 'sum',
            'quantity_quote': 'sum',
            'quantity_quote_sell': 'sum',
            'quantity_quote_buy': 'sum',
            'count': 'sum',
            'count_sell': 'sum',
            'count_buy': 'sum'
        }
    )
    
    column_names_map = {
        'timestamp_first': 'timestamp',
        'timestamp_last': 'timestamp_end',
        'price_first': 'open',
        'price_max': 'high',
        'price_min': 'low',
        'price_last': 'close',
        'price_mean': 'mean',
        'price_median': 'median',
        'quantity_sum': 'volume',
        'quantity_sell_sum': 'volume_sell',
        'quantity_buy_sum': 'volume_buy',
        'quantity_quote_sum': 'volume_quote',
        'quantity_quote_sell_sum': 'volume_quote_sell',
        'quantity_quote_buy_sum': 'volume_quote_buy',
        'count_sum': 'count',
        'count_sell_sum': 'count_sell',
        'count_buy_sum': 'count_buy'
    }
    # see https://stackoverflow.com/a/14508355/381712
    new_columns_names = [column_names_map['_'.join(col).strip()] for col in bars_df.columns.values] 
    bars_df.columns = new_columns_names
    
    exchange, market_type, pair, _ = os.path.basename(input_csv_file).split('.')
    bars_df['exchange'] = exchange
    bars_df['market_type'] = market_type
    bars_df['pair'] = pair
    bars_df['bar_type'] = bar_type
    bars_df['bar_size'] = bar_size
    # sort columns
    bars_df = bars_df[['exchange', 'market_type', 'pair', 'bar_type', 'bar_size'] + new_columns_names] 

    os.makedirs(os.path.dirname(output_csv_file), exist_ok=True)
    bars_df.to_csv(output_csv_file, index=False, single_file = True)

    # optional
    del bars_df
    del df
    gc.collect()

In [4]:
build_standard_bars('/data/csv/OKEx.Swap.XMR_USDT.csv', 'TimeBar', 10000, '/data/bars/TimeBar/10000/TimeBar.10000.OKEx.Swap.XMR_USDT.csv')

In [5]:
time_bars = pd.read_csv('/data/bars/TimeBar/10000/TimeBar.10000.OKEx.Swap.XMR_USDT.csv')

In [6]:
time_bars.head()

Unnamed: 0,exchange,market_type,pair,bar_type,bar_size,timestamp,timestamp_end,open,high,low,...,mean,volume,volume_sell,volume_buy,volume_quote,volume_quote_sell,volume_quote_buy,count,count_sell,count_buy
0,OKEx,Swap,XMR_USDT,TimeBar,10000,1590216461309,1590216461309,63.42,63.42,63.42,...,63.419998,2.8,2.8,0.0,177.57599,177.57599,174.77599,1,1,0
1,OKEx,Swap,XMR_USDT,TimeBar,10000,1590220818421,1590220818421,63.56,63.56,63.56,...,63.560001,0.7,0.0,0.7,44.492,0.0,44.492,1,0,1
2,OKEx,Swap,XMR_USDT,TimeBar,10000,1590221708996,1590221708996,63.07,63.07,63.07,...,63.07,0.6,0.6,0.0,37.842003,37.842003,37.242004,1,1,0
3,OKEx,Swap,XMR_USDT,TimeBar,10000,1590229378881,1590229378881,62.41,62.41,62.41,...,62.41,1.5,1.5,0.0,93.615,93.615,92.115,1,1,0
4,OKEx,Swap,XMR_USDT,TimeBar,10000,1590249619582,1590249619582,63.49,63.49,63.49,...,63.490002,0.5,0.0,0.5,31.745,0.0,31.745,1,0,1


In [7]:
time_bars['volume'].sum().compute()/total_volume

quantity    1.0
dtype: float64

In [8]:
build_standard_bars('/data/csv/OKEx.Swap.XMR_USDT.csv', 'TickBar', 4, '/data/bars/TickBar/4/TickBar.4.OKEx.Swap.XMR_USDT.csv')

In [9]:
tick_bars = pd.read_csv('/data/bars/TickBar/4/TickBar.4.OKEx.Swap.XMR_USDT.csv')

In [10]:
tick_bars.head()

Unnamed: 0,exchange,market_type,pair,bar_type,bar_size,timestamp,timestamp_end,open,high,low,...,mean,volume,volume_sell,volume_buy,volume_quote,volume_quote_sell,volume_quote_buy,count,count_sell,count_buy
0,OKEx,Swap,XMR_USDT,TickBar,4,1590216461309,1590229378881,63.42,63.56,62.41,...,63.114998,5.6,4.9,0.7,353.525,309.033,348.625,4,3,1
1,OKEx,Swap,XMR_USDT,TickBar,4,1590249619582,1590263779094,63.49,64.39,63.49,...,64.014999,1.4,0.0,1.4,89.546,0.0,89.546,4,0,4
2,OKEx,Swap,XMR_USDT,TickBar,4,1590265720050,1590292415864,64.69,64.99,63.22,...,64.432503,5.7,0.7,5.0,368.412,44.254,367.71198,4,1,3
3,OKEx,Swap,XMR_USDT,TickBar,4,1590293593504,1590293777431,63.22,63.28,63.22,...,63.25,8.0,0.0,8.0,506.00003,0.0,506.00003,4,0,4
4,OKEx,Swap,XMR_USDT,TickBar,4,1590295679230,1590295691153,63.03,63.03,63.02,...,63.022499,4.5,0.0,4.5,283.606,0.0,283.606,4,0,4


In [11]:
tick_bars['volume'].sum().compute()/total_volume

quantity    1.0
dtype: float64

In [12]:
build_standard_bars('/data/csv/OKEx.Swap.XMR_USDT.csv', 'VolumeBar', 10, '/data/bars/VolumeBar/10/VolumeBar.10.OKEx.Swap.XMR_USDT.csv')

In [13]:
volume_bars = pd.read_csv('/data/bars/VolumeBar/10/VolumeBar.10.OKEx.Swap.XMR_USDT.csv')

In [14]:
volume_bars.head()

Unnamed: 0,exchange,market_type,pair,bar_type,bar_size,timestamp,timestamp_end,open,high,low,...,mean,volume,volume_sell,volume_buy,volume_quote,volume_quote_sell,volume_quote_buy,count,count_sell,count_buy
0,OKEx,Swap,XMR_USDT,VolumeBar,10,1590216461309,1590265720050,63.42,64.69,62.41,...,63.690002,7.4,4.9,2.5,468.947,309.033,464.047,9,3,6
1,OKEx,Swap,XMR_USDT,VolumeBar,10,1590265720050,1590293777429,64.83,64.99,63.22,...,63.793335,10.9,0.7,10.2,696.664,44.254,695.964,6,1,5
2,OKEx,Swap,XMR_USDT,VolumeBar,10,1590293777431,1590298086864,63.28,63.28,62.94,...,63.03857,10.1,0.0,10.1,636.918,0.0,636.918,7,0,7
3,OKEx,Swap,XMR_USDT,VolumeBar,10,1590298265847,1590298450707,63.0,63.07,63.0,...,63.045559,10.9,0.0,10.9,687.20703,0.0,687.20703,9,0,9
4,OKEx,Swap,XMR_USDT,VolumeBar,10,1590298535853,1590298729120,63.05,63.08,63.05,...,63.063334,8.0,0.0,8.0,504.528,0.0,504.528,3,0,3


In [15]:
volume_bars['volume'].sum().compute()/total_volume

quantity    1.0
dtype: float64

In [16]:
build_standard_bars('/data/csv/OKEx.Swap.XMR_USDT.csv', 'DollarBar', 1000, '/data/bars/DollarBar/1000/DollarBar.1000.OKEx.Swap.XMR_USDT.csv')

In [17]:
dollar_bars = pd.read_csv('/data/bars/DollarBar/1000/DollarBar.1000.OKEx.Swap.XMR_USDT.csv')

In [18]:
dollar_bars.head()

Unnamed: 0,exchange,market_type,pair,bar_type,bar_size,timestamp,timestamp_end,open,high,low,...,mean,volume,volume_sell,volume_buy,volume_quote,volume_quote_sell,volume_quote_buy,count,count_sell,count_buy
0,OKEx,Swap,XMR_USDT,DollarBar,1000,1590216461309,1590293593504,63.42,64.99,62.41,...,63.805382,15.099999,5.6,9.5,963.211,353.287,957.61096,13,4,9
1,OKEx,Swap,XMR_USDT,DollarBar,1000,1590293593507,1590298265847,63.22,63.28,62.94,...,63.077002,14.900001,0.0,14.900001,940.11804,0.0,940.11804,10,0,10
2,OKEx,Swap,XMR_USDT,DollarBar,1000,1590298282361,1590298729120,63.04,63.08,63.04,...,63.054543,17.3,0.0,17.3,1090.935,0.0,1090.935,11,0,11
3,OKEx,Swap,XMR_USDT,DollarBar,1000,1590298814347,1590298979246,63.07,63.07,63.07,...,63.07,14.1,0.0,14.1,889.287,0.0,889.287,4,0,4
4,OKEx,Swap,XMR_USDT,DollarBar,1000,1590298995801,1590315969750,63.07,63.69,63.07,...,63.230001,17.2,2.0,15.2,1086.469,126.774,1084.469,15,3,12


In [19]:
dollar_bars['volume'].sum().compute()/total_volume

quantity    1.0
dtype: float64

In [20]:
os.remove('/data/bars/TimeBar/10000/TimeBar.10000.OKEx.Swap.XMR_USDT.csv')
os.remove('/data/bars/TickBar/4/TickBar.4.OKEx.Swap.XMR_USDT.csv')
os.remove('/data/bars/VolumeBar/10/VolumeBar.10.OKEx.Swap.XMR_USDT.csv')
os.remove('/data/bars/DollarBar/1000/DollarBar.1000.OKEx.Swap.XMR_USDT.csv')

## Build on multiple CSV files

In [21]:
def generate_tasks(csv_files: List[str], bar_type: str, bar_sizes: List[int], output_dir: str)->List[Tuple[str, str, int, str]]:
    tasks = [(file, bar_type, bar_size, os.path.join(output_dir, str(bar_size),f'{bar_type}.{bar_size}.{os.path.basename(file)}'))
             for file in csv_files for bar_size in bar_sizes]
    return tasks

In [22]:
def run_tasks_parallell(tasks: List[Tuple[str, str, int, str]])->None:
    with ProgressBar():
        db.from_sequence(tasks).map(lambda t: build_standard_bars(t[0], t[1], t[2], t[3])).compute()

In [23]:
btc_files = glob.glob('/data/csv/*BTC_USD*.csv')
eth_files = glob.glob('/data/csv/*ETH_USD*.csv')

In [24]:
btc_files

['/data/csv/Newdex.Spot.BTC_USDT.csv',
 '/data/csv/OKEx.Spot.BTC_USDT.csv',
 '/data/csv/Binance.Swap.BTC_USDT.csv',
 '/data/csv/Kraken.Spot.BTC_USDT.csv',
 '/data/csv/Bitfinex.Swap.BTC_USDT.csv',
 '/data/csv/Bitstamp.Spot.BTC_USD.csv',
 '/data/csv/Huobi.Spot.BTC_USDT.csv',
 '/data/csv/OKEx.Swap.BTC_USDT.csv',
 '/data/csv/OKEx.Swap.BTC_USD.csv',
 '/data/csv/Kraken.Spot.BTC_USD.csv',
 '/data/csv/WhaleEx.Spot.BTC_USDT.csv',
 '/data/csv/Bitfinex.Spot.BTC_USD.csv',
 '/data/csv/MXC.Spot.BTC_USDT.csv',
 '/data/csv/BitMEX.Swap.BTC_USD.csv',
 '/data/csv/Binance.Spot.BTC_USDT.csv',
 '/data/csv/CoinbasePro.Spot.BTC_USD.csv',
 '/data/csv/Huobi.Swap.BTC_USD.csv',
 '/data/csv/Bitfinex.Spot.BTC_USDT.csv']

In [25]:
eth_files

['/data/csv/Huobi.Swap.ETH_USD.csv',
 '/data/csv/Kraken.Spot.ETH_USD.csv',
 '/data/csv/Bitfinex.Spot.ETH_USD.csv',
 '/data/csv/Bitstamp.Spot.ETH_USD.csv',
 '/data/csv/MXC.Spot.ETH_USDT.csv',
 '/data/csv/CoinbasePro.Spot.ETH_USD.csv',
 '/data/csv/Bitfinex.Swap.ETH_USDT.csv',
 '/data/csv/Newdex.Spot.ETH_USDT.csv',
 '/data/csv/WhaleEx.Spot.ETH_USDT.csv',
 '/data/csv/Kraken.Spot.ETH_USDT.csv',
 '/data/csv/OKEx.Swap.ETH_USDT.csv',
 '/data/csv/BitMEX.Swap.ETH_USD.csv',
 '/data/csv/OKEx.Swap.ETH_USD.csv',
 '/data/csv/OKEx.Spot.ETH_USDT.csv',
 '/data/csv/Binance.Swap.ETH_USDT.csv',
 '/data/csv/Bitfinex.Spot.ETH_USDT.csv',
 '/data/csv/Binance.Spot.ETH_USDT.csv',
 '/data/csv/Huobi.Spot.ETH_USDT.csv']

In [26]:
!ls -lhS /data/csv/*BTC_USD*.csv | awk '{print $5,$9}'

2.1G /data/csv/BitMEX.Swap.BTC_USD.csv
1.7G /data/csv/Binance.Spot.BTC_USDT.csv
1.6G /data/csv/Binance.Swap.BTC_USDT.csv
1.3G /data/csv/Huobi.Swap.BTC_USD.csv
1.3G /data/csv/Huobi.Spot.BTC_USDT.csv
1.2G /data/csv/OKEx.Spot.BTC_USDT.csv
664M /data/csv/OKEx.Swap.BTC_USD.csv
378M /data/csv/OKEx.Swap.BTC_USDT.csv
260M /data/csv/CoinbasePro.Spot.BTC_USD.csv
256M /data/csv/Bitfinex.Spot.BTC_USD.csv
114M /data/csv/WhaleEx.Spot.BTC_USDT.csv
84M /data/csv/Bitfinex.Spot.BTC_USDT.csv
64M /data/csv/Bitstamp.Spot.BTC_USD.csv
62M /data/csv/Kraken.Spot.BTC_USD.csv
59M /data/csv/MXC.Spot.BTC_USDT.csv
4.9M /data/csv/Bitfinex.Swap.BTC_USDT.csv
2.4M /data/csv/Kraken.Spot.BTC_USDT.csv
5.4K /data/csv/Newdex.Spot.BTC_USDT.csv


In [27]:
!ls -lhS /data/csv/*ETH_USD*.csv | awk '{print $5,$9}'

740M /data/csv/Huobi.Swap.ETH_USD.csv
585M /data/csv/Huobi.Spot.ETH_USDT.csv
463M /data/csv/Binance.Swap.ETH_USDT.csv
421M /data/csv/Binance.Spot.ETH_USDT.csv
410M /data/csv/OKEx.Spot.ETH_USDT.csv
271M /data/csv/BitMEX.Swap.ETH_USD.csv
213M /data/csv/OKEx.Swap.ETH_USD.csv
93M /data/csv/OKEx.Swap.ETH_USDT.csv
85M /data/csv/CoinbasePro.Spot.ETH_USD.csv
65M /data/csv/MXC.Spot.ETH_USDT.csv
44M /data/csv/Bitfinex.Spot.ETH_USD.csv
28M /data/csv/Kraken.Spot.ETH_USD.csv
17M /data/csv/WhaleEx.Spot.ETH_USDT.csv
13M /data/csv/Bitstamp.Spot.ETH_USD.csv
6.0M /data/csv/Bitfinex.Spot.ETH_USDT.csv
2.4M /data/csv/Bitfinex.Swap.ETH_USDT.csv
943K /data/csv/Kraken.Spot.ETH_USDT.csv
5.9K /data/csv/Newdex.Spot.ETH_USDT.csv


In [28]:
tasks = generate_tasks(
    btc_files + eth_files,
    'TimeBar',
    [10000, 60000, 180000, 300000, 900000, 1800000, 3600000],
    '/data/bars/TimeBar',
) + generate_tasks(
    btc_files + eth_files,
    'TickBar',
    [4, 8, 16, 32, 64, 128],
    '/data/bars/TickBar',
) + generate_tasks(
    btc_files,
    'VolumeBar',
    [1, 2, 4, 8, 16, 32],
    '/data/bars/VolumeBar',
) + generate_tasks(
    eth_files,
    'VolumeBar',
    [10, 20, 40, 80, 160, 320],
    '/data/bars/VolumeBar',
) + generate_tasks(
    btc_files,
    'DollarBar',
    [10000, 20000, 40000, 80000, 160000, 320000],
    '/data/bars/DollarBar',
) + generate_tasks(
    eth_files,
    'DollarBar',
    [2000, 4000, 8000, 16000, 32000],
    '/data/bars/DollarBar',
)

In [29]:
len(tasks)

882

In [None]:
run_tasks_parallell(tasks)

[##########                              ] | 25% Completed |  2min 22.1s

**Dask version consumes much more memory and runs even slower.**

## References

* [Tick, Volume, Dollar Volume Bars.ipynb](https://github.com/BlackArbsCEO/Adv_Fin_ML_Exercises/blob/master/notebooks/Tick%2C%20Volume%2C%20Dollar%20Volume%20Bars.ipynb)