# Split by date and Deduplicate

In [1]:
import datetime
import glob
import gzip
import json
import os
import shutil
import subprocess
import zipfile

import dask.bag as db
import pandas as pd

In [None]:
# Root of the raw trade dump. The cells below list this directory and expect
# every entry to be a sub-directory named "<exchange>-<market>".
DATA_DIR = '/home/frankdai/data/trade-20200607-20200610/trade'

In [None]:
# Directory used as the destination for the flattened, dot-joined file names
# and as the working directory of the zip/unzip cells below.
FLATTEN_DIR = '/home/frankdai/data/trade-20200607-20200610'

In [None]:
# Names of the entries directly under DATA_DIR (kept for inspection only; the
# loops below call os.listdir(DATA_DIR) again rather than reusing this list).
exchanges = os.listdir(DATA_DIR)

In [None]:
# Bare expression: shows the list as the notebook cell's output.
exchanges

In [None]:
# Dry run for the DATA_DIR/<exchange>-<market>/<pair>/<file> layout: print each
# .zip/.json source path and compute the flattened destination name
# "<exchange>.<market>.<pair>.<file>" under FLATTEN_DIR. The actual move is
# left commented out.
for exchange in os.listdir(DATA_DIR):
    ex, market = exchange.split('-')
    exchange_dir = os.path.join(DATA_DIR, exchange)
    for pair in os.listdir(exchange_dir):
        pair_dir = os.path.join(exchange_dir, pair)
        for file in os.listdir(pair_dir):
            if not file.endswith(('.zip', '.json')):
                continue
            source = os.path.join(pair_dir, file)
            dest = os.path.join(FLATTEN_DIR, f'{ex}.{market}.{pair}.{file}')
            print(source)
            #shutil.move(source, dest)

In [None]:
# Dry run for the deeper DATA_DIR/<exchange>-<market>/<pair>/<period>/<file>
# layout: compute the flattened destination name
# "<exchange>.<market>.<pair>.<period>.<file>" under FLATTEN_DIR and print it.
# The actual move is left commented out.
for exchange in os.listdir(DATA_DIR):
    ex, market = exchange.split('-')
    exchange_dir = os.path.join(DATA_DIR, exchange)
    for pair in os.listdir(exchange_dir):
        pair_dir = os.path.join(exchange_dir, pair)
        for period in os.listdir(pair_dir):
            period_dir = os.path.join(pair_dir, period)
            for file in os.listdir(period_dir):
                if not file.endswith(('.zip', '.json')):
                    continue
                source = os.path.join(period_dir, file)
                dest = os.path.join(FLATTEN_DIR, f'{ex}.{market}.{pair}.{period}.{file}')
                print(dest)
                #shutil.move(source, dest)

In [None]:
# Compress every .json file in FLATTEN_DIR into a sibling .zip, e.g.
#   zip -j -9 -rm WhaleEx.Spot.TRX_USDT.2020-06-10.zip WhaleEx.Spot.TRX_USDT.2020-06-10.json
# (-j: store without directory components, -9: best compression,
#  -m: delete the .json once it has been archived).
for file in [x for x in os.listdir(FLATTEN_DIR) if x.endswith('.json')]:
    filename = os.path.join(FLATTEN_DIR, file[0:-5])  # path without '.json'
    # Pass an argument list instead of a shell string so that paths with
    # spaces or shell metacharacters are handed to zip unchanged.
    argv = ['zip', '-j', '-9', '-rm', f'{filename}.zip', f'{filename}.json']
    command = ' '.join(argv)
    print(command)
    # zip's output is not used; discarding it (rather than piping it to a
    # reader that never reads) avoids blocking the child on a full pipe.
    subprocess.run(argv, stdout=subprocess.DEVNULL)

In [None]:
# Re-pack every .zip in FLATTEN_DIR so that the member inside the archive has
# the same base name as the archive itself:
#   1. extract the archive into FLATTEN_DIR,
#   2. if that produced "<date>.json" (date = second-to-last dot-separated
#      field of the archive name), rename it to "<archive base name>.json",
#   3. delete the old archive and zip the renamed .json again.
for file in [x for x in os.listdir(FLATTEN_DIR) if x.endswith('.zip')]:
    filename = os.path.join(FLATTEN_DIR, file[0:-4])  # path without '.zip'
    date_str = file.split('.')[-2]
    zip_path = f'{filename}.zip'
    json_path = f'{filename}.json'

    # Argument lists (no shell) keep paths with spaces or metacharacters
    # intact; output is discarded so the child can never block on a full pipe.
    subprocess.run(['unzip', '-j', zip_path, '-d', FLATTEN_DIR], stdout=subprocess.DEVNULL)

    extracted = os.path.join(FLATTEN_DIR, f'{date_str}.json')
    if os.path.exists(extracted):
        shutil.move(extracted, json_path)

    # Only drop the old archive once its payload is known to be on disk;
    # otherwise a failed extraction would destroy the only copy of the data.
    if not os.path.exists(json_path):
        print(f'skipped {zip_path}: {json_path} was not extracted')
        continue
    os.remove(zip_path)

    argv = ['zip', '-j', '-9', '-rm', zip_path, json_path]
    command = ' '.join(argv)
    print(command)
    subprocess.run(argv, stdout=subprocess.DEVNULL)

## Split by date

In [None]:
# Directories for the split step. NOTE(review): INPUT_DIR is not referenced by
# any cell visible here, and OUTPUT_DIR only appears as the output directory
# of the commented-out split()/split_multi() calls below.
INPUT_DIR = '/data/trade'
OUTPUT_DIR = '/data/hashed'

In [None]:
def split(input_file:str, output_dir:str)->None:
    """Split one trade file into per-market, per-day JSON-lines files.

    Every non-blank line of ``input_file`` is parsed as a JSON object and
    appended to
    ``<output_dir>/<exchange>.<marketType>.<pair>[.<rawPair>].<YYYY-MM-DD>.json``
    (``rawPair`` is part of the name only when ``marketType`` is ``Futures``).
    Two records are patched before being written:

    * Bitfinex records with ``marketType == 'Futures'`` become ``'Swap'``.
    * WhaleEx records whose ``trade_id`` differs from ``raw.tradeId`` get
      ``trade_id = str(raw.tradeId)``.

    Args:
        input_file: Path ending in ``.json.gz``, ``.zip`` (exactly one
            member, UTF-8 text), ``.json`` or ``file.log``.
        output_dir: Existing directory receiving the output files. Files are
            opened in append mode, so repeated calls accumulate lines.

    Raises:
        ValueError: If the suffix is not recognised or a ``.zip`` archive
            does not contain exactly one member.
    """
    if input_file.endswith('.json.gz'):
        with gzip.open(input_file, 'rt') as f:
            lines = f.readlines()
    elif input_file.endswith('.zip'):
        with zipfile.ZipFile(input_file, 'r') as zf:
            names = zf.namelist()
            if len(names) != 1:
                raise ValueError(f'Expected exactly one member in {input_file}, found {len(names)}')
            lines = zf.read(names[0]).decode('UTF-8').split('\n')
    elif input_file.endswith('.json') or input_file.endswith('file.log'):
        with open(input_file, 'rt') as f:
            lines = f.readlines()
    else:
        raise ValueError('Unknown file suffix ' + input_file)

    # Group the lines by destination so each output file is opened once
    # instead of once per line.
    lines_by_output = {}
    # Consume the input list from the end of its reversed copy: pop() then
    # yields lines in their original order while releasing each raw line as
    # soon as it has been processed, which keeps peak memory flat.
    lines.reverse()
    while lines:
        line = lines.pop().strip()
        if not line:
            continue
        obj = json.loads(line)
        if obj['exchange'] == 'Bitfinex' and obj['marketType'] == 'Futures':
            obj['marketType'] = 'Swap'  # bugfix for Bitfinex
            line = json.dumps(obj)
        elif obj['exchange'] == 'WhaleEx' and obj['trade_id'] != obj['raw']['tradeId']:
            obj['trade_id'] = str(obj['raw']['tradeId'])  # bugfix for WhaleEx
            line = json.dumps(obj)

        # 'timestamp' is treated as milliseconds. fromtimestamp() converts in
        # the machine's local timezone, so the day boundary depends on where
        # this runs. NOTE(review): confirm local time (not UTC) is intended.
        date_str = datetime.datetime.fromtimestamp(obj['timestamp']/1000.0).isoformat()[0:10]
        exchange = obj['exchange']
        market_type = obj['marketType']
        pair = obj['pair']
        rawPair = obj['rawPair']
        filename = f'{exchange}.{market_type}.{pair}.{rawPair}' if market_type == 'Futures' else f'{exchange}.{market_type}.{pair}'

        output_file = os.path.join(output_dir, f'{filename}.{date_str}.json')
        lines_by_output.setdefault(output_file, []).append(line + '\n')

    for output_file, chunk in lines_by_output.items():
        with open(output_file, 'at') as file_object:
            file_object.writelines(chunk)

In [None]:
# split('/data/trade/WhaleEx-Spot/BTC_USDT/20200511-0155-1.json.gz', OUTPUT_DIR)

In [None]:
#split('/home/frankdai/data/trade/WhaleEx-Spot/BTC_USDT/2020-06-06.zip', OUTPUT_DIR)

In [None]:
#split('/home/frankdai/data/trade/WhaleEx-Spot/BTC_USDT/2020-06-07.json', OUTPUT_DIR)

In [None]:
def split_multi(input_dir:str, output_dir:str)->None:
    """Run split() on every trade file under ``input_dir`` and delete it afterwards.

    The directory tree is searched recursively for ``*.json``, ``*.zip``,
    ``*.json.gz`` and ``file.log`` (collected in that order, before any file
    is processed). Each file is removed only after split() has returned for
    it. Nothing happens when no file matches.
    """
    patterns = ("**/*.json", "**/*.zip", "**/*.json.gz", "**/file.log")
    files = []
    for pattern in patterns:
        files.extend(glob.glob(os.path.join(input_dir, pattern), recursive=True))
    if not files:
        return
    for file in files:
        split(file, output_dir)
        os.remove(file)

In [None]:
#split_multi('/data/trade-20200607-20200610/', OUTPUT_DIR)

## Deduplicate and sort

In [5]:
def dedup_and_sort(input_file:str, ouput_file:str)->None:
    """Remove duplicate trades from a JSON-lines file and write them sorted.

    Each non-blank line is parsed as JSON and keyed by
    ``"<exchange>-<marketType>-<pair>-<rawPair>-<trade_id>"``. When several
    lines share a key the last one wins. Lines are written ordered by that
    key string (plain string comparison, so ids sort lexicographically).

    Kraken and MXC records with an empty ``trade_id`` get
    ``trade_id = str(timestamp)`` and are re-serialised; as a consequence two
    such records with the same timestamp share a key and only the last one is
    kept.

    Args:
        input_file: Path of the JSON-lines file to read.
        ouput_file: Path of the file to write. It is opened in append mode,
            so existing content is kept and the new lines are added after it.
            NOTE(review): re-running on the same output therefore duplicates
            its content; confirm whether append is intended.

    Raises:
        ValueError: If a record still has an empty ``trade_id`` after the
            Kraken/MXC fix-up; the offending line is the exception message.
    """
    trade_map = {}
    # The context manager closes the input on every exit path, including a
    # malformed line or a record that is missing one of the expected fields.
    with open(input_file, 'rt') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            exchange = obj['exchange']
            market_type = obj['marketType']
            pair = obj['pair']
            raw_pair = obj['rawPair']
            trade_id = obj['trade_id']
            if not trade_id and exchange in ('Kraken', 'MXC'):
                # Fix trade_id for Kraken, MXC
                obj['trade_id'] = str(obj['timestamp'])
                trade_id = obj['trade_id']
                line = json.dumps(obj)
            if not trade_id:
                raise ValueError(line)
            key = f'{exchange}-{market_type}-{pair}-{raw_pair}-{trade_id}'
            trade_map[key] = line  # later duplicates overwrite earlier ones

    with open(ouput_file, 'at') as f:
        f.writelines(trade_map[key] + '\n' for key in sorted(trade_map))

In [8]:
def dedup_and_sort_wrapper(input_file:str, output_dir:str)->None:
    """Deduplicate ``input_file`` into ``<output_dir>/<date>/<basename>``.

    The date is the second-to-last dot-separated field of ``input_file``
    (e.g. ``2020-06-10`` for ``X.Spot.A_B.2020-06-10.json``). The date
    directory is created when missing.

    Args:
        input_file: Path of a JSON-lines trade file whose name ends in
            ``.<date>.json``.
        output_dir: Directory under which the per-date directories live.
    """
    date_str = input_file.split('.')[-2]
    date_dir = os.path.join(output_dir, date_str)
    # Several workers may reach this point for the same date at once; an
    # exists()-then-mkdir() check can lose that race and raise
    # FileExistsError, whereas exist_ok=True tolerates it.
    os.makedirs(date_dir, exist_ok=True)
    dedup_and_sort(input_file, os.path.join(date_dir, os.path.basename(input_file)))

In [9]:
def dedup_sort_multi(input_dir:str, output_dir:str)->None:
    """Deduplicate every ``*.json`` file found recursively under ``input_dir``.

    The files are placed in a dask bag and dedup_and_sort_wrapper() is mapped
    over them with ``output_dir`` as the destination root; ``compute()``
    triggers the actual work.
    """
    pattern = os.path.join(input_dir, "**/*.json")
    files = list(glob.glob(pattern, recursive=True))

    def process(file):
        return dedup_and_sort_wrapper(file, output_dir)

    bag = db.from_sequence(files)
    bag.map(process).compute()

In [10]:
# Deduplicate every *.json file under /data/hashed into per-date
# sub-directories of /data/dated.
dedup_sort_multi('/data/hashed', '/data/dated')