In [1]:
import sys
sys.path.append('..') # for import src

import os
import cloudpickle
import lzma
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import ccxt

import src
cloudpickle.register_pickle_by_value(src) # for model portability

In [2]:
import time


class Fetcher:
    def __init__(self, symbol=None):
        self.symbol = symbol
        self.keys = {
            'symbol': symbol
        }
        self.data_id = 'bybit_ohlcv'
        
    def fetch(self, last_timestamp=None):
        client = ccxt.bybit({
            # 'options': {
                # 'defaultType': 'future',
            # },
        })
        
        if last_timestamp is None:
            granularity = 24 * 60 * 60
            start_time = int(time.time()) * 1000
            while True:
                # time.sleep(1)
                result = client.publicGetDerivativesV3PublicKline({
                    'category': 'linear',
                    'symbol': self.symbol,
                    'interval': 'D',
                    'start': start_time - granularity * 200 * 1000,
                    'end': start_time,
                })
                df = pd.DataFrame(result['result']['list'], columns=[
                    'timestamp',
                    'op',
                    'hi',
                    'lo',
                    'cl',
                    'volume',
                    'amount',
                ])
                df['timestamp'] = df['timestamp'].astype(int)
                if df['timestamp'].min() == start_time:
                    break
                # display(df)
                start_time = df['timestamp'].min()
                print(start_time)
                
            granularity = 60
            start_time += 24 * 60 * 60 * 1000
            while True:
                # time.sleep(1)
                result = client.publicGetDerivativesV3PublicKline({
                    'category': 'linear',
                    'symbol': self.symbol,
                    'interval': '1',
                    'start': start_time - granularity * 200 * 1000,
                    'end': start_time,
                })
                df = pd.DataFrame(result['result']['list'], columns=[
                    'timestamp',
                    'op',
                    'hi',
                    'lo',
                    'cl',
                    'volume',
                    'amount',
                ])
                df['timestamp'] = df['timestamp'].astype(int)
                if df['timestamp'].min() == start_time:
                    break
                # display(df)
                start_time = df['timestamp'].min()
                print(start_time)
        else:
            start_time = (last_timestamp + 60 * 60) * 1000
        
        result = client.publicGetDerivativesV3PublicKline({
            'category': 'linear',
            'symbol': self.symbol,
            'interval': '1',
            'start': start_time,
            'end': start_time + 60 * 60 * 200 * 1000
        })
        
        df = pd.DataFrame(result['result']['list'], columns=[
            'timestamp',
            'op',
            'hi',
            'lo',
            'cl',
            'volume',
            'amount',
        ])
        df['timestamp'] = df['timestamp'].astype(int) // 1000
        
        for col in ['op', 'hi', 'lo', 'cl', 'volume', 'amount']:
            df[col] = df[col].astype('float')
        
        df['timestamp_5m'] = (df['timestamp'] // 300) * 300
        df['timestamp_1h'] = (df['timestamp'] // 3600) * 3600
        
        df_5m = pd.concat([
            df.groupby('timestamp_5m')['cl'].nth(-1),
        ], axis=1)
        df_5m = df_5m.reset_index()
        df_5m['timestamp_1h'] = (df_5m['timestamp_5m'] // 3600) * 3600
        
        # display(df.loc[(df['volume'] == 0) & (df['trades'] > 0)])
        
        # microstructure feature
        # entropy feature
        # slippage feature
        
        df['hi_op'] = df['hi'] - df['op']
        df['lo_op'] = df['lo'] - df['op']
        
        df['ln_hi_lo'] = np.log(df['hi'] / df['lo'])
        df['ln_hi_lo_sqr'] = df['ln_hi_lo'] ** 2
        
#         def corwin_alpha(x):
#             hi2 = x['hi'].rolling(2).max()
#             lo2 = x['lo'].rolling(2).min()
#             gamma = np.log(hi2 / lo2) ** 2
#             beta = np.log(x['hi'] / x['lo']) ** 2
#             beta = beta.rolling(2).sum()
            
#             sqrt2 = 2.0 ** 0.5
#             denom = 3.0 - 2.0 * sqrt2
#             alpha = (sqrt2 - 1) / denom * beta ** 0.5 - (gamma / denom) ** 0.5
#             return alpha.mean()
        
        df = pd.concat([
            df.groupby('timestamp_1h')['op'].nth(0),
            df.groupby('timestamp_1h')['hi'].max(),
            df.groupby('timestamp_1h')['lo'].min(),
            df.groupby('timestamp_1h')['cl'].nth(-1),
            df.groupby('timestamp_1h')['volume'].sum(),
            df.groupby('timestamp_1h')['amount'].sum(),
            df.groupby('timestamp_1h')['cl'].mean().rename('twap'),
            df_5m.groupby('timestamp_1h')['cl'].mean().rename('twap_5m'),
            # vola
            df.groupby('timestamp_1h')['cl'].std().fillna(0).rename('cl_std'),
            df.groupby('timestamp_1h').apply(lambda x: (x['cl'] - x['cl'].shift(1).fillna(x['op'])).std()).fillna(0).rename('cl_diff_std'),
            # slippage
            df.groupby('timestamp_1h')['hi'].mean().rename('hi_twap'),
            df.groupby('timestamp_1h')['lo'].mean().rename('lo_twap'),
            df.groupby('timestamp_1h')['hi_op'].mean().rename('hi_op_max'),
            df.groupby('timestamp_1h')['lo_op'].mean().rename('lo_op_min'),
            # microstructure
            df.groupby('timestamp_1h')['ln_hi_lo'].mean().rename('ln_hi_lo_mean'),
            df.groupby('timestamp_1h')['ln_hi_lo_sqr'].mean().rename('ln_hi_lo_sqr_mean'),
            # df.groupby('timestamp_1h').apply(corwin_alpha).fillna(0).rename('corwin_alpha'),
            # entropy
        ], axis=1)
        
        df.index.rename('timestamp', inplace=True)
        
        df = df.iloc[:-1] # remove partial
        
        return df


In [3]:
fetcher = Fetcher(symbol='BTCUSDT')
df = fetcher.fetch(last_timestamp=None)
display(df)
df = fetcher.fetch(last_timestamp=(1600000000 // 3600) * 3600)
display(df)

1653955200000
1636675200000
1619395200000
1602115200000
1585094400000
1585168800000
1585156800000
1585144800000
1585132800000
1585132560000


Unnamed: 0_level_0,op,hi,lo,cl,volume,amount,twap,twap_5m,cl_std,cl_diff_std,hi_twap,lo_twap,hi_op_max,lo_op_min,ln_hi_lo_mean,ln_hi_lo_sqr_mean
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1585130400,6591.5,6591.5,6500.0,6500.0,0.004,26.1795,6526.104167,6517.6,41.566338,17.946062,6526.104167,6522.291667,3.8125,0.0,0.000582,8e-06
1585134000,6502.5,6628.5,6457.5,6591.5,438.873,2877880.0,6565.85,6568.208333,48.51361,12.241208,6572.891667,6559.458333,5.708333,-7.725,0.002051,6e-06
1585137600,6577.0,6588.5,6502.0,6506.5,529.318,3463795.0,6542.825,6540.958333,16.914998,8.203877,6547.308333,6537.625,5.683333,-4.0,0.00148,3e-06


Unnamed: 0_level_0,op,hi,lo,cl,volume,amount,twap,twap_5m,cl_std,cl_diff_std,hi_twap,lo_twap,hi_op_max,lo_op_min,ln_hi_lo_mean,ln_hi_lo_sqr_mean
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1600002000,10321.0,10333.0,10264.5,10292.0,362.037,3734534.0,10316.383333,10315.5,10.880073,5.184696,10317.808333,10313.733333,1.966667,-2.108333,0.000395,4.611333e-07
1600005600,10328.5,10330.0,10307.0,10312.5,76.052,784931.4,10317.583333,10317.041667,6.869342,2.244516,10318.3,10316.65,0.858333,-0.791667,0.00016,6.528492e-08
1600009200,10287.0,10331.0,10282.5,10328.5,105.571,1088103.0,10311.908333,10313.291667,12.761001,3.954285,10313.508333,10310.666667,0.908333,-1.933333,0.000276,2.348301e-07


In [4]:
symbols = 'BTC,ETH,XRP,LINK,ATOM,DOT,SOL,BNB,MATIC,ADA'.split(',')
symbols += ['ALGO', 'AVAX', 'BCH', 'APT']
symbols += 'DOGE,SFP,DYDX,AXS,CHZ,TRX,MASK,ETC,LTC,SHIB1000,C98,SAND,SUSHI,NEAR,FIL'.split(',')
fetchers = []

for symbol in symbols:
    fetchers.append(Fetcher(symbol=f'{symbol}USDT'))
    
data = cloudpickle.dumps(fetchers)
data = lzma.compress(data)
with open('/home/jovyan/data/20221217_bybit_ohlcv.xz', 'wb') as f:
    f.write(data)