In [1]:
import os
# import dask.dataframe as pd  # Consumed over 200 GB of memory and ran for over 1 hour, worse than pandas
import pandas as pd
import glob
from tqdm.notebook import tqdm
import dask.bag as db
from dask.diagnostics import ProgressBar
import gc
from typing import List

**Hardware:** n1-highmem-32, 32 vCPUs, 208 GB memory, 1TB SSD, 4 x NVIDIA Tesla V100

## Convert JSON to HDF5

In [2]:
# Input and output locations. The defaults are the original hard-coded paths;
# set TRADE_DATA_DIR / TRADE_HDF5_DIR in the environment to point the notebook
# at other directories without editing this cell.
DATA_DIR = os.environ.get('TRADE_DATA_DIR', '/data/trade')  # searched recursively for *.json files
OUTPUT_DIR = os.environ.get('TRADE_HDF5_DIR', '/data/hdf5')  # merged *.hdf5 files are written here

In [3]:
# Work around `ValueError: Value is too big`, first hit on
# Huobi.Spot.EOS_BTC.2020-05-06.json.
# Fix taken from https://stackoverflow.com/a/61733123/381712
# The `loads` attribute of pandas' private JSON module is replaced with a thin
# wrapper around the standard library's json.loads; any extra positional or
# keyword arguments passed by the caller are ignored.
# NOTE(review): this relies on a private pandas attribute (`pd.io.json._json.loads`)
# and may silently stop having any effect on other pandas versions -- confirm
# after upgrading pandas.
import json
pd.io.json._json.loads = lambda s, *a, **kw: json.loads(s)

In [4]:
def merge_dates(exchange:str, market_type:str, pair:str, data_dir:str, output_dir: str)->None:
    """Merge all per-date JSON-lines files of one trading pair into one HDF5 file.

    Files named ``{exchange}.{market_type}.{pair}.*.json`` are searched
    recursively under ``data_dir`` and loaded in sorted path order. The
    ``rawPair``, ``channel``, ``channelType`` and ``raw`` columns are dropped,
    the frames are concatenated, and the result is written (overwriting any
    existing file) to ``{output_dir}/{exchange}.{market_type}.{pair}.hdf5``
    under the key ``{exchange}__{market_type}__{pair}`` with compression
    level 9.

    Args:
        exchange: Exchange name, the first dot-separated part of the filename.
        market_type: Market type, the second dot-separated part of the filename.
        pair: Trading pair, the third dot-separated part of the filename.
        data_dir: Directory searched recursively for the input JSON files.
        output_dir: Directory receiving the HDF5 file; created if missing.

    Returns:
        None. When no input file matches, nothing is written.

    Raises:
        KeyError: If a file lacks one of the four dropped columns.
    """
    json_files = sorted(glob.glob(
        os.path.join(data_dir, f'**/{exchange}.{market_type}.{pair}.*.json'), recursive=True))
    if not json_files:
        # pd.concat([]) raises "No objects to concatenate"; there is nothing to merge.
        return
    # Same patch as the module-level one. NOTE(review): presumably repeated here so
    # it also takes effect when this function runs in a separate worker process
    # -- TODO confirm.
    pd.io.json._json.loads = lambda s, *a, **kw: json.loads(s)  # ValueError: Value is too big
    dfs = [pd.read_json(file, lines=True, dtype=False).drop(
        columns=['rawPair', 'channel', 'channelType', 'raw']) for file in json_files]
    df = pd.concat(dfs)
    # The per-file frames are no longer needed once concatenated; release them
    # before the write instead of keeping both copies alive during it.
    del dfs
    # exist_ok=True keeps this safe when several workers create the directory at once.
    os.makedirs(output_dir, exist_ok=True)
    df.to_hdf(
        os.path.join(output_dir, f'{exchange}.{market_type}.{pair}.hdf5'),
        key=f'{exchange}__{market_type}__{pair}',
        mode='w',
        complevel=9)
    del df
    gc.collect()

In [9]:
# merge_dates('OKEx', 'Swap', 'XMR_USDT', DATA_DIR, OUTPUT_DIR)

In [6]:
def merge_multi(market_type:str, input_dir:str, output_dir:str)->None:
    """Run merge_dates for every (exchange, pair) found for one market type.

    JSON files matching ``*.{market_type}.*.json`` are searched recursively
    under ``input_dir``. The first and third dot-separated parts of each
    filename are taken as exchange and pair; each distinct combination is
    merged by ``merge_dates`` through a dask bag, with a progress bar.

    Args:
        market_type: Second dot-separated part of the filenames to process.
            ``'Futures'`` is rejected by an assertion.
        input_dir: Directory searched recursively for the JSON files.
        output_dir: Directory passed on to ``merge_dates`` for the HDF5 files.

    Raises:
        AssertionError: If ``market_type`` is ``'Futures'`` or a matched
            filename does not carry ``market_type`` as its second part.
    """
    assert market_type != 'Futures'
    pattern = os.path.join(input_dir, f'**/*.{market_type}.*.json')
    # exchange -> set of pairs; the dict keeps exchanges in first-seen order
    pairs_by_exchange = {}
    for path in glob.glob(pattern, recursive=True):
        parts = os.path.basename(path).split('.')
        assert market_type == parts[1]
        pairs_by_exchange.setdefault(parts[0], set()).add(parts[2])
    # flatten into (exchange, pair) tasks, pairs sorted within each exchange
    tasks = [
        (exchange, pair)
        for exchange, pairs in pairs_by_exchange.items()
        for pair in sorted(pairs)
    ]

    def _merge_one(task):
        # One bag element -> one merge_dates call.
        return merge_dates(task[0], market_type, task[1], input_dir, output_dir)

    with ProgressBar():
        db.from_sequence(tasks).map(_merge_one).compute()

In [7]:
# Merge every 'Swap' JSON file found under DATA_DIR into HDF5 files in OUTPUT_DIR.
merge_multi('Swap', DATA_DIR, OUTPUT_DIR)

[########################################] | 100% Completed | 12min 51.0s


In [8]:
# Merge every 'Spot' JSON file found under DATA_DIR into HDF5 files in OUTPUT_DIR.
merge_multi('Spot', DATA_DIR, OUTPUT_DIR)

[########################################] | 100% Completed | 13min 26.3s
