# Build Standard Bars (Time Bar, Tick Bar, Volume Bar and Dollar Bar)

In [1]:
import gc
import glob
import os
from operator import itemgetter
from typing import Dict, List, Tuple

import dask.bag as db
import numpy as np
import pandas as pd
from dask.diagnostics import ProgressBar

In [2]:
from utils import aggregate, aggregate_trade, convert_to_bar

In [3]:
aggregate(pd.Series([3,2,1,4,5,6]))

{'open': 3, 'high': 6, 'low': 1, 'close': 6, 'mean': 3.5, 'median': 3.5}

In [4]:
# Sum of the 'quantity' column of the raw trade file (a one-element Series).
# Used as the denominator of the volume ratios computed after each bar build below.
total_volume = pd.read_csv('/data/csv/OKEx.Swap.XMR_USDT.csv', usecols=['quantity']).sum()

## Build on a single CSV file

In [5]:
def build_bars_internal(grouped: pd.core.groupby.DataFrameGroupBy, bar_type: str, bar_size: int, output_csv_file: str)->None:
    """Turn every group of trades into one bar and write all bars to a CSV file.

    Args:
        grouped: Trades already grouped so that each group holds the rows of
            exactly one bar.
        bar_type: Bar type label, forwarded unchanged to ``convert_to_bar``.
        bar_size: Bar size, forwarded unchanged to ``convert_to_bar``.
        output_csv_file: Destination CSV path. Missing parent directories are
            created; a bare file name is written to the current directory.
    """
    # One call to convert_to_bar per group (presumably returning one record per
    # bar -- confirm in utils). pandarallel's parallel_apply is a possible
    # drop-in replacement for apply here.
    series = grouped.apply(lambda df: convert_to_bar(bar_type, bar_size, df))

    bars_df = pd.DataFrame(list(series))

    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(output_csv_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    bars_df.to_csv(output_csv_file, index=False)

    # Optional: release the intermediate objects eagerly to keep peak memory low.
    del bars_df
    del series
    gc.collect()

In [6]:
def build_standard_bars(input_csv_file: str, bar_type:str, bar_size: np.int64, output_csv_file: str)->None:
    """Build time, tick, volume or dollar bars from one trade CSV file.

    The trades are read from ``input_csv_file``, grouped into bars according
    to ``bar_type`` and ``bar_size``, and handed to ``build_bars_internal``,
    which writes the result to ``output_csv_file``.

    Args:
        input_csv_file: CSV file with the columns exchange, marketType, pair,
            timestamp, price, quantity, side and trade_id.
        bar_type: One of 'TimeBar', 'TickBar', 'VolumeBar' or 'DollarBar'.
        bar_size: Positive bucket width: a timestamp span (same unit as the
            ``timestamp`` column) for TimeBar, a number of rows for TickBar,
            cumulative ``quantity`` for VolumeBar, and cumulative
            ``quantity * price`` for DollarBar.
        output_csv_file: Destination CSV path for the bars.

    Raises:
        ValueError: If ``bar_type`` is not supported or ``bar_size`` is not
            positive.
    """
    # Explicit checks instead of `assert`: assertions are stripped under
    # `python -O`, which would let an unknown bar type fall through to an
    # unbound `grouped` further down.
    if bar_type not in ('TimeBar', 'TickBar', 'VolumeBar', 'DollarBar'):
        raise ValueError(
            f"bar_type must be 'TimeBar', 'TickBar', 'VolumeBar' or 'DollarBar', got {bar_type!r}")
    if bar_size <= 0:
        raise ValueError(f'bar_size must be positive, got {bar_size!r}')

    df = pd.read_csv(input_csv_file, engine='c',
                     dtype={'exchange': 'category', 'marketType': 'category', 'pair': 'category',
                            'timestamp': 'int64', 'price': 'float32',
                            'quantity': 'float32', 'side': 'bool', 'trade_id': 'string'})
    if bar_type == 'TimeBar':
        # Trades whose timestamps fall in the same bar_size-wide window share a bar.
        grouped = df.groupby(df['timestamp'] // bar_size)
    elif bar_type == 'TickBar':
        # Every bar_size consecutive rows form one bar.
        grouped = df.groupby(df.index // bar_size)
    elif bar_type == 'VolumeBar':
        # Accumulate in float64 to limit rounding drift; int64 bucket ids cannot
        # overflow the way uint32 ids could on very large cumulative totals.
        series = df['quantity'].astype('float64').cumsum().floordiv(bar_size).astype('int64')
        grouped = df.groupby(series)
    else:  # 'DollarBar'
        series = (df['quantity'] * df['price']).astype('float64').cumsum().floordiv(bar_size).astype('int64')
        grouped = df.groupby(series)

    build_bars_internal(grouped, bar_type, bar_size, output_csv_file)
    # Release the (potentially large) trade table before the next task runs.
    del df
    del grouped
    gc.collect()

In [7]:
build_standard_bars('/data/csv/OKEx.Swap.XMR_USDT.csv', 'TimeBar', 10000, '/data/bars/TimeBar/10000/TimeBar.10000.OKEx.Swap.XMR_USDT.csv')

In [8]:
time_bars = pd.read_csv('/data/bars/TimeBar/10000/TimeBar.10000.OKEx.Swap.XMR_USDT.csv')

In [9]:
time_bars.head()

Unnamed: 0,exchange,market_type,pair,bar_type,bar_size,timestamp,timestamp_end,open,high,low,...,volume,volume_sell,volume_buy,volume_quote,volume_quote_sell,volume_quote_buy,vwap,count,count_sell,count_buy
0,OKEx,Swap,XMR_USDT,TimeBar,10000,1590216460000,1590216470000,63.419998,63.419998,63.419998,...,2.8,2.8,0.0,177.575989,177.575989,0.0,63.419998,1,1,0
1,OKEx,Swap,XMR_USDT,TimeBar,10000,1590220810000,1590220820000,63.560001,63.560001,63.560001,...,0.7,0.0,0.7,44.492001,0.0,44.492001,63.560001,1,0,1
2,OKEx,Swap,XMR_USDT,TimeBar,10000,1590221700000,1590221710000,63.07,63.07,63.07,...,0.6,0.6,0.0,37.842003,37.842003,0.0,63.070004,1,1,0
3,OKEx,Swap,XMR_USDT,TimeBar,10000,1590229370000,1590229380000,62.41,62.41,62.41,...,1.5,1.5,0.0,93.614998,93.614998,0.0,62.41,1,1,0
4,OKEx,Swap,XMR_USDT,TimeBar,10000,1590249610000,1590249620000,63.490002,63.490002,63.490002,...,0.5,0.0,0.5,31.745001,0.0,31.745001,63.490002,1,0,1


In [10]:
time_bars['volume'].sum()/total_volume

quantity    1.0
dtype: float64

In [11]:
build_standard_bars('/data/csv/OKEx.Swap.XMR_USDT.csv', 'TickBar', 4, '/data/bars/TickBar/4/TickBar.4.OKEx.Swap.XMR_USDT.csv')

In [12]:
tick_bars = pd.read_csv('/data/bars/TickBar/4/TickBar.4.OKEx.Swap.XMR_USDT.csv')

In [13]:
tick_bars.head()

Unnamed: 0,exchange,market_type,pair,bar_type,bar_size,timestamp,timestamp_end,open,high,low,...,volume,volume_sell,volume_buy,volume_quote,volume_quote_sell,volume_quote_buy,vwap,count,count_sell,count_buy
0,OKEx,Swap,XMR_USDT,TickBar,4,1590216461309,1590229378882,63.419998,63.560001,62.41,...,5.6,4.9,0.7,353.524994,309.03299,44.492001,63.129463,4,3,1
1,OKEx,Swap,XMR_USDT,TickBar,4,1590249619582,1590263779095,63.490002,64.389999,63.490002,...,1.4,0.0,1.4,89.546005,0.0,89.546005,63.961433,4,0,4
2,OKEx,Swap,XMR_USDT,TickBar,4,1590265720050,1590292415865,64.690002,64.989998,63.220001,...,5.7,0.7,5.0,368.411987,44.254002,324.15799,64.633682,4,1,3
3,OKEx,Swap,XMR_USDT,TickBar,4,1590293593504,1590293777432,63.220001,63.279999,63.220001,...,8.0,0.0,8.0,506.0,0.0,506.0,63.25,4,0,4
4,OKEx,Swap,XMR_USDT,TickBar,4,1590295679230,1590295691154,63.029999,63.029999,63.02,...,4.5,0.0,4.5,283.605988,0.0,283.605988,63.023552,4,0,4


In [14]:
tick_bars['volume'].sum()/total_volume

quantity    1.0
dtype: float64

In [15]:
build_standard_bars('/data/csv/OKEx.Swap.XMR_USDT.csv', 'VolumeBar', 10, '/data/bars/VolumeBar/10/VolumeBar.10.OKEx.Swap.XMR_USDT.csv')

In [16]:
volume_bars = pd.read_csv('/data/bars/VolumeBar/10/VolumeBar.10.OKEx.Swap.XMR_USDT.csv')

In [17]:
volume_bars.head()

Unnamed: 0,exchange,market_type,pair,bar_type,bar_size,timestamp,timestamp_end,open,high,low,...,volume,volume_sell,volume_buy,volume_quote,volume_quote_sell,volume_quote_buy,vwap,count,count_sell,count_buy
0,OKEx,Swap,XMR_USDT,VolumeBar,10,1590216461309,1590265720051,63.419998,64.690002,62.41,...,7.4,4.9,2.5,468.946991,309.03299,159.914001,63.371216,9,3,6
1,OKEx,Swap,XMR_USDT,VolumeBar,10,1590265720050,1590293777430,64.830002,64.989998,63.220001,...,10.900001,0.7,10.200001,696.66394,44.254002,652.409973,63.91412,6,1,5
2,OKEx,Swap,XMR_USDT,VolumeBar,10,1590293777431,1590298086865,63.279999,63.279999,62.939999,...,10.1,0.0,10.1,636.91803,0.0,636.91803,63.061188,7,0,7
3,OKEx,Swap,XMR_USDT,VolumeBar,10,1590298265847,1590298450708,63.0,63.07,63.0,...,10.900001,0.0,10.900001,687.20697,0.0,687.20697,63.046509,9,0,9
4,OKEx,Swap,XMR_USDT,VolumeBar,10,1590298535853,1590298729121,63.049999,63.080002,63.049999,...,8.0,0.0,8.0,504.528015,0.0,504.528015,63.066002,3,0,3


In [18]:
volume_bars['volume'].sum()/total_volume

quantity    1.0
dtype: float64

In [19]:
build_standard_bars('/data/csv/OKEx.Swap.XMR_USDT.csv', 'DollarBar', 1000, '/data/bars/DollarBar/1000/DollarBar.1000.OKEx.Swap.XMR_USDT.csv')

In [20]:
dollar_bars = pd.read_csv('/data/bars/DollarBar/1000/DollarBar.1000.OKEx.Swap.XMR_USDT.csv')

In [21]:
dollar_bars.head()

Unnamed: 0,exchange,market_type,pair,bar_type,bar_size,timestamp,timestamp_end,open,high,low,...,volume,volume_sell,volume_buy,volume_quote,volume_quote_sell,volume_quote_buy,vwap,count,count_sell,count_buy
0,OKEx,Swap,XMR_USDT,DollarBar,1000,1590216461309,1590293593505,63.419998,64.989998,62.41,...,15.1,5.6,9.5,963.210999,353.286987,609.924011,63.788807,13,4,9
1,OKEx,Swap,XMR_USDT,DollarBar,1000,1590293593507,1590298265848,63.220001,63.279999,62.939999,...,14.900001,0.0,14.900001,940.118042,0.0,940.118042,63.095169,10,0,10
2,OKEx,Swap,XMR_USDT,DollarBar,1000,1590298282361,1590298729121,63.040001,63.080002,63.040001,...,17.299999,0.0,17.299999,1090.934937,0.0,1090.934937,63.059826,11,0,11
3,OKEx,Swap,XMR_USDT,DollarBar,1000,1590298814347,1590298979247,63.07,63.07,63.07,...,14.099999,0.0,14.099999,889.286987,0.0,889.286987,63.070004,4,0,4
4,OKEx,Swap,XMR_USDT,DollarBar,1000,1590298995801,1590315969751,63.07,63.689999,63.07,...,17.200001,2.0,15.200001,1086.468994,126.774002,959.694946,63.166798,15,3,12


In [22]:
dollar_bars['volume'].sum()/total_volume

quantity    1.0
dtype: float64

In [23]:
# Delete the four sample bar files written by the single-file builds above.
os.remove('/data/bars/TimeBar/10000/TimeBar.10000.OKEx.Swap.XMR_USDT.csv')
os.remove('/data/bars/TickBar/4/TickBar.4.OKEx.Swap.XMR_USDT.csv')
os.remove('/data/bars/VolumeBar/10/VolumeBar.10.OKEx.Swap.XMR_USDT.csv')
os.remove('/data/bars/DollarBar/1000/DollarBar.1000.OKEx.Swap.XMR_USDT.csv')

## Build on multiple CSV files

In [24]:
def generate_tasks(csv_files: List[str], bar_type: str, bar_sizes: List[int], output_dir: str)->List[Tuple[str, str, int, str]]:
    """Create one build task for every (input file, bar size) combination.

    Each task is a tuple ``(input_csv, bar_type, bar_size, output_csv)`` where
    ``output_csv`` is
    ``<output_dir>/<bar_size>/<bar_type>.<bar_size>.<input file name>``.
    Tasks are ordered by input file first, then by bar size.
    """
    tasks = []
    for input_csv in csv_files:
        for size in bar_sizes:
            output_name = f'{bar_type}.{size}.{os.path.basename(input_csv)}'
            output_csv = os.path.join(output_dir, str(size), output_name)
            tasks.append((input_csv, bar_type, size, output_csv))
    return tasks

In [25]:
def run_tasks_parallell(tasks: List[Tuple[str, str, int, str]])->None:
    """Run ``build_standard_bars`` for every task through a dask bag.

    Each task is a tuple ``(input_csv, bar_type, bar_size, output_csv)``.
    A dask progress bar is displayed while the bag is computed.
    """
    def _run_one(task):
        input_csv, bar_type, bar_size, output_csv = task[0], task[1], task[2], task[3]
        return build_standard_bars(input_csv, bar_type, bar_size, output_csv)

    # Building the bag is lazy; the work only happens inside compute().
    bag = db.from_sequence(tasks).map(_run_one)
    with ProgressBar():
        bag.compute()

In [26]:
csv_files = glob.glob('/data/csv/*XMR_USD*.csv')

In [27]:
csv_files

['/data/csv/MXC.Spot.XMR_USDT.csv',
 '/data/csv/Kraken.Spot.XMR_USD.csv',
 '/data/csv/Binance.Swap.XMR_USDT.csv',
 '/data/csv/OKEx.Swap.XMR_USD.csv',
 '/data/csv/OKEx.Swap.XMR_USDT.csv',
 '/data/csv/Binance.Spot.XMR_USDT.csv',
 '/data/csv/Huobi.Spot.XMR_USDT.csv',
 '/data/csv/Bitfinex.Spot.XMR_USD.csv',
 '/data/csv/OKEx.Spot.XMR_USDT.csv']

In [28]:
# One entry per bar type: (bar_type, bar_sizes, output_dir).
# Tasks are concatenated in this order.
tasks = [
    task
    for bar_type, bar_sizes, output_dir in (
        ('TimeBar', [10000, 60000], '/data/bars/TimeBar'),
        ('TickBar', [4, 8], '/data/bars/TickBar'),
        ('VolumeBar', [10, 100], '/data/bars/VolumeBar'),
        ('DollarBar', [1000, 10000], '/data/bars/DollarBar'),
    )
    for task in generate_tasks(csv_files, bar_type, bar_sizes, output_dir)
]

In [29]:
len(tasks)

72

In [30]:
run_tasks_parallell(tasks)

[########################################] | 100% Completed | 47min 47.3s


## References

* [Tick, Volume, Dollar Volume Bars.ipynb](https://github.com/BlackArbsCEO/Adv_Fin_ML_Exercises/blob/master/notebooks/Tick%2C%20Volume%2C%20Dollar%20Volume%20Bars.ipynb)