# Time Bar

In [1]:
import os
import glob
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels.graphics.tsaplots import plot_acf
import matplotlib.pyplot as plt
from typing import List, Dict, Union
from operator import itemgetter
import gc
import dask.bag as db
from dask.diagnostics import ProgressBar
%matplotlib inline

In [2]:
from tqdm.notebook import tqdm
tqdm.pandas()

  from pandas import Panel


In [3]:
# from pandarallel import pandarallel
# pandarallel.initialize()

In [4]:
def aggregate(nums: pd.Series)-> Dict:
    """Summarise a non-empty series as OHLC values plus mean and median.

    Args:
        nums: Values in their original order; the first element is reported
            as ``open`` and the last one as ``close``.

    Returns:
        Dict with the keys ``open``, ``high``, ``low``, ``close``, ``mean``
        and ``median``.

    Raises:
        AssertionError: If ``nums`` is empty.
    """
    assert nums.size > 0
    first_value, last_value = nums.iloc[0], nums.iloc[-1]
    return dict(
        open=first_value,
        high=nums.max(),
        low=nums.min(),
        close=last_value,
        mean=nums.mean(),
        median=nums.median(),
    )

In [5]:
# Quick sanity check of aggregate() on a small hand-made series.
aggregate(pd.Series([3,2,1,4,5,6]))

{'open': 3, 'high': 6, 'low': 1, 'close': 6, 'mean': 3.5, 'median': 3.5}

In [6]:
def aggregate_trade(trades: pd.DataFrame)-> Dict:
    """Aggregate a batch of trades into price and volume statistics.

    Args:
        trades: Trades with ``price``, ``quantity`` and ``side`` columns.
            Rows whose ``side`` equals True are counted as sells, rows whose
            ``side`` equals False as buys.

    Returns:
        The dict produced by ``aggregate`` for the prices, extended with
        ``volume*`` (sum of quantity), ``volume_quote*`` (sum of
        price * quantity), ``vwap`` (volume_quote / volume) and ``count*``
        (number of rows), each in total / sell / buy flavours where
        applicable.
    """
    stats = aggregate(trades['price'])

    # Element-wise comparisons: these build boolean masks, not identity checks.
    side = trades['side']
    sells = trades[side == True]
    buys = trades[side == False]

    def base_volume(frame):
        # Traded amount: sum of the quantity column.
        return frame['quantity'].sum()

    def quote_volume(frame):
        # Traded value: sum of price * quantity per row.
        return (frame['price'] * frame['quantity']).sum()

    total_base = base_volume(trades)
    total_quote = quote_volume(trades)

    stats['volume'] = total_base
    stats['volume_sell'] = base_volume(sells)
    stats['volume_buy'] = base_volume(buys)
    stats['volume_quote'] = total_quote
    stats['volume_quote_sell'] = quote_volume(sells)
    stats['volume_quote_buy'] = quote_volume(buys)

    stats['vwap'] = total_quote / total_base

    stats['count'] = trades.shape[0]
    stats['count_sell'] = sells.shape[0]
    stats['count_buy'] = buys.shape[0]
    return stats

In [7]:
def convert_to_bar(bar_type:str, bar_size:np.int64, trades: pd.DataFrame)->Dict:
    """Collapse a non-empty batch of trades into a single bar record.

    Args:
        bar_type: Kind of bar. For ``'TimeBar'`` the bar start is the first
            trade's timestamp floored to a multiple of ``bar_size`` and the
            bar end is ``start + bar_size``; for any other value the bar
            spans from the first trade's timestamp to the last trade's
            timestamp plus one.
        bar_size: Bar size, stored in the result and used for the
            ``'TimeBar'`` alignment (same unit as ``trades['timestamp']``).
        trades: Trades with ``trade_id``, ``timestamp``, ``exchange``,
            ``marketType`` and ``pair`` columns, plus whatever
            ``aggregate_trade`` reads. The frame is not modified.

    Returns:
        Dict with ``exchange``, ``market_type``, ``pair``, ``bar_type``,
        ``bar_size``, ``timestamp`` and ``timestamp_end``, followed by the
        statistics returned by ``aggregate_trade``.

    Raises:
        AssertionError: If ``trades`` has no rows.
    """
    assert trades.shape[0] > 0

    # Sort into a new frame instead of in place: the caller's DataFrame
    # (e.g. a groupby sub-frame) must not be mutated as a side effect.
    trades = trades.sort_values('trade_id')

    first_trade = trades.iloc[0]
    exchange, marketType, pair = itemgetter('exchange', 'marketType', 'pair')(first_trade)
    trade_agg = aggregate_trade(trades)

    if bar_type == 'TimeBar':
        # Align the bar to the bar_size grid.
        timestamp_begin = first_trade['timestamp'] // bar_size * bar_size
        timestamp_end = timestamp_begin + bar_size
    else:
        # Other bar types span exactly the trades they contain
        # (end is exclusive, hence the +1).
        timestamp_begin = first_trade['timestamp']
        timestamp_end = trades.iloc[-1]['timestamp'] + 1

    bar = {
        'exchange': exchange,
        'market_type': marketType,
        'pair': pair,
        'bar_type': bar_type,
        'bar_size': bar_size,
        'timestamp': timestamp_begin,
        'timestamp_end': timestamp_end,
    }
    bar.update(trade_agg)

    return bar

In [8]:
def generate_time_bars(hdf_file: str, bar_size: np.int64, output_file: str)->None:
    """Build time bars from a per-trade HDF5 file and write them to HDF5.

    Args:
        hdf_file: Path of the HDF5 file holding per-trade data.
        bar_size: Time bar size, in milliseconds.
        output_file: Path of the HDF5 file to write. Its parent directory is
            created when missing. The HDF key is the file name without its
            extension, with every ``.`` replaced by ``__``.

    Returns:
        None. The bars are written to ``output_file``.
    """
    df = pd.read_hdf(hdf_file)
    # Integer view of the timestamps divided by 1e6, i.e. nanoseconds ->
    # milliseconds (assumes the stored column is datetime64[ns] -- TODO confirm).
    df['timestamp'] = df['timestamp'].astype(np.int64) // int(1e6)
    df['timestamp_begin'] = df['timestamp'] // bar_size * bar_size
    grouped = df.groupby('timestamp_begin')

    series = grouped.apply(lambda group: convert_to_bar('TimeBar', bar_size, group))  # parallel_apply, from pandarallel

    # The groupby object holds a reference to the frame, so both names have
    # to be dropped for the trade data to become collectable.
    del df, grouped
    gc.collect()

    bars = pd.DataFrame(list(series))

    # os.makedirs('') raises, so only create a directory when the output
    # path actually has a directory component.
    output_dirname = os.path.dirname(output_file)
    if output_dirname:
        os.makedirs(output_dirname, exist_ok=True)

    # Strip the real extension rather than a fixed number of characters.
    key = os.path.splitext(os.path.basename(output_file))[0].replace('.', '__')
    bars.to_hdf(
        output_file,
        key=key,
        mode='w',
        complevel=9)

In [9]:
# generate_time_bars('/data/hdf5/OKEx.Swap.BTC_USDT.hdf5', 10000,
#                    '/data/bars/TimeBar/10000/TimeBar.10000.OKEx.Swap.BTC_USDT.hdf5')

In [10]:
def generate_multi(hdf5_files: List[str], bar_sizes: List[int], output_dir: str)->None:
    """Generate time bars for every combination of input file and bar size.

    Args:
        hdf5_files: Paths of the per-trade HDF5 input files.
        bar_sizes: Bar sizes, each passed through to ``generate_time_bars``.
        output_dir: Root output directory; every result is written to
            ``<output_dir>/<bar_size>/TimeBar.<bar_size>.<input file name>``.
    """
    def build_one(job):
        # One unit of work: a single (input file, bar size) pair.
        source_path, size = job
        target_path = os.path.join(
            output_dir, str(size), f'TimeBar.{size}.{os.path.basename(source_path)}')
        return generate_time_bars(source_path, size, target_path)

    jobs = [(source_path, size) for source_path in hdf5_files for size in bar_sizes]
    with ProgressBar():
        db.from_sequence(jobs).map(build_one).compute()

In [11]:
# Collect the trade files to process; the wildcard after BTC_USD means the
# pattern matches both BTC_USD and BTC_USDT file names.
hdf_files = glob.glob('/data/hdf5/*BTC_USD*.hdf5')

In [12]:
# Display the matched file paths.
hdf_files

['/data/hdf5/OKEx.Swap.BTC_USD.hdf5',
 '/data/hdf5/Bitfinex.Spot.BTC_USDT.hdf5',
 '/data/hdf5/MXC.Spot.BTC_USDT.hdf5',
 '/data/hdf5/Huobi.Swap.BTC_USD.hdf5',
 '/data/hdf5/Newdex.Spot.BTC_USDT.hdf5',
 '/data/hdf5/Bitfinex.Swap.BTC_USDT.hdf5',
 '/data/hdf5/WhaleEx.Spot.BTC_USDT.hdf5',
 '/data/hdf5/Binance.Spot.BTC_USDT.hdf5',
 '/data/hdf5/Binance.Swap.BTC_USDT.hdf5',
 '/data/hdf5/OKEx.Spot.BTC_USDT.hdf5',
 '/data/hdf5/Bitfinex.Spot.BTC_USD.hdf5',
 '/data/hdf5/Huobi.Spot.BTC_USDT.hdf5',
 '/data/hdf5/CoinbasePro.Spot.BTC_USD.hdf5',
 '/data/hdf5/Bitstamp.Spot.BTC_USD.hdf5',
 '/data/hdf5/BitMEX.Swap.BTC_USD.hdf5',
 '/data/hdf5/OKEx.Swap.BTC_USDT.hdf5',
 '/data/hdf5/Kraken.Spot.BTC_USDT.hdf5',
 '/data/hdf5/Kraken.Spot.BTC_USD.hdf5']

In [13]:
# Bar sizes are listed in seconds and scaled by 1000 before being passed on
# (generate_time_bars documents its bar_size as milliseconds).
generate_multi(
    hdf_files,
    [seconds * 1000 for seconds in (10, 60, 180, 300, 900, 1800, 3600, 14400)],
    '/data/bars/TimeBar',
)

[########################################] | 100% Completed | 28min  1.1s


## References

* [Tick, Volume, Dollar Volume Bars.ipynb](https://github.com/BlackArbsCEO/Adv_Fin_ML_Exercises/blob/master/notebooks/Tick%2C%20Volume%2C%20Dollar%20Volume%20Bars.ipynb)