In [5]:
import sys
sys.path.append('..') # for import src

import os
import cloudpickle
import lzma
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import ccxt

import src
cloudpickle.register_pickle_by_value(src) # for model portability

In [6]:

class Fetcher:
    """Fetch Binance futures 1-minute klines and aggregate them into 5-minute bars.

    The instance only stores the symbol plus two identification attributes
    (`keys`, `data_id`); the exchange client is created inside `fetch`, so the
    object itself stays small and picklable.
    """

    # Width of one output bar, in seconds (five 1-minute klines per bar).
    BAR_SECONDS = 300

    # Column names assigned to each raw kline row, in response order.
    _KLINE_COLUMNS = [
        'timestamp',
        'op',
        'hi',
        'lo',
        'cl',
        'volume',
        'close_time',
        'amount',
        'trades',
        'buy_volume',
        'buy_amount',
        'ignored',
    ]

    # Raw columns converted to float before aggregation.
    _NUMERIC_COLUMNS = [
        'op', 'hi', 'lo', 'cl', 'volume', 'amount', 'trades', 'buy_volume', 'buy_amount',
    ]

    # Output column -> (1-minute source column, aggregation).
    # Insertion order defines the column order of the returned frame.
    _AGGREGATIONS = {
        'op': ('op', 'first'),
        'hi': ('hi', 'max'),
        'lo': ('lo', 'min'),
        'cl': ('cl', 'last'),
        'volume': ('volume', 'sum'),
        'amount': ('amount', 'sum'),
        'trades': ('trades', 'sum'),
        'buy_volume': ('buy_volume', 'sum'),
        'buy_amount': ('buy_amount', 'sum'),
        'twap': ('cl', 'mean'),
        # vola
        'cl_std': ('cl', 'std'),
        'cl_diff_std': ('cl_diff', 'std'),
        # slippage
        'hi_twap': ('hi', 'mean'),
        'lo_twap': ('lo', 'mean'),
        # NOTE(review): despite the "_max"/"_min" names these two are per-bar
        # MEANS of (hi - op) and (lo - op). Names and values are kept as-is so
        # existing consumers of these columns are not broken.
        'hi_op_max': ('hi_op', 'mean'),
        'lo_op_min': ('lo_op', 'mean'),
        # microstructure
        'ln_hi_lo_mean': ('ln_hi_lo', 'mean'),
        'ln_hi_lo_sqr_mean': ('ln_hi_lo_sqr', 'mean'),
    }

    def __init__(self, symbol=None):
        """Store the market symbol (e.g. 'BTCUSDT') and the dataset identifiers."""
        self.symbol = symbol
        self.keys = {
            'symbol': symbol
        }
        self.data_id = 'binance_ohlcv_5m'

    def fetch(self, last_timestamp=None):
        """Download up to 1500 one-minute klines and return completed 5-minute bars.

        Parameters
        ----------
        last_timestamp : int or None
            Timestamp in seconds. When given, the request starts one bar width
            (`BAR_SECONDS`) after it; when None, the request uses a start time
            of 1 ms.

        Returns
        -------
        pandas.DataFrame
            One row per 5-minute bucket, indexed by the bucket start in seconds
            (index name 'timestamp'), with the columns listed in
            `_AGGREGATIONS`. The last bucket of the response is always dropped
            because it may be incomplete. An empty response yields an empty
            frame with the same columns.
        """
        client = ccxt.binance({
            'options': {
                'defaultType': 'future',
            },
        })

        if last_timestamp is None:
            start_time_ms = 1
        else:
            start_time_ms = (last_timestamp + self.BAR_SECONDS) * 1000

        result = client.fapiPublicGetKlines({
            'symbol': self.symbol,
            'interval': '1m',
            'startTime': start_time_ms,
            'limit': 1500,
        })

        if not result:
            # Nothing returned: hand back an empty frame with the normal
            # schema instead of running the aggregation on no data.
            return pd.DataFrame(
                columns=list(self._AGGREGATIONS),
                index=pd.Index([], dtype='int64', name='timestamp'),
                dtype='float64',
            )

        df = pd.DataFrame(result, columns=self._KLINE_COLUMNS)
        df = df.drop(columns=['close_time', 'ignored'])
        # Explicit int64: millisecond timestamps do not fit a 32-bit integer,
        # and plain `int` maps to a platform-dependent width.
        df['timestamp'] = df['timestamp'].astype('int64') // 1000
        df = df.astype({col: 'float' for col in self._NUMERIC_COLUMNS})

        # Start of the 5-minute bucket each 1-minute kline belongs to.
        df['timestamp_5m'] = (df['timestamp'] // self.BAR_SECONDS) * self.BAR_SECONDS

        # Per-minute helper features, aggregated per bucket below.
        df['hi_op'] = df['hi'] - df['op']
        df['lo_op'] = df['lo'] - df['op']

        df['ln_hi_lo'] = np.log(df['hi'] / df['lo'])
        df['ln_hi_lo_sqr'] = df['ln_hi_lo'] ** 2

        # Minute-to-minute close change inside each bucket; the first minute of
        # a bucket has no previous close there, so its own open is used instead.
        prev_cl = df.groupby('timestamp_5m')['cl'].shift(1)
        df['cl_diff'] = df['cl'] - prev_cl.fillna(df['op'])

        # TODO: further microstructure (e.g. Corwin-Schultz alpha), entropy and
        # slippage features were sketched here but are not implemented yet.

        # 'first'/'last' are used instead of nth(0)/nth(-1): since pandas 2.0
        # `nth` acts as a row filter and keeps the original row index, which
        # would no longer align with the other per-bucket aggregates.
        bars = df.groupby('timestamp_5m').agg(**self._AGGREGATIONS)

        # A bucket holding a single minute has an undefined sample std.
        std_cols = ['cl_std', 'cl_diff_std']
        bars[std_cols] = bars[std_cols].fillna(0)

        bars = bars.rename_axis('timestamp')

        return bars.iloc[:-1]  # remove partial


In [7]:
# Smoke-test the fetcher on one symbol.
fetcher = Fetcher(symbol='BTCUSDT')

# First call: no previous timestamp.
df = fetcher.fetch(last_timestamp=None)
display(df)

# Second call: continue from 1600000000 rounded down to a whole hour.
df = fetcher.fetch(last_timestamp=1600000000 - 1600000000 % 3600)
display(df)

Unnamed: 0_level_0,op,hi,lo,cl,volume,amount,trades,buy_volume,buy_amount,twap,cl_std,cl_diff_std,hi_twap,lo_twap,hi_op_max,lo_op_min,ln_hi_lo_mean,ln_hi_lo_sqr_mean
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1567965300,10000.00,10000.00,10000.00,10000.00,0.002,20.00000,2.0,0.001,10.00000,10000.000,0.000000,0.000000,10000.000,10000.000,0.000,0.000,0.000000,0.000000e+00
1567965600,10000.00,10000.00,10000.00,10000.00,0.000,0.00000,0.0,0.000,0.00000,10000.000,0.000000,0.000000,10000.000,10000.000,0.000,0.000,0.000000,0.000000e+00
1567965900,10000.00,10000.00,10000.00,10000.00,0.000,0.00000,0.0,0.000,0.00000,10000.000,0.000000,0.000000,10000.000,10000.000,0.000,0.000,0.000000,0.000000e+00
1567966200,10000.00,10000.00,10000.00,10000.00,0.000,0.00000,0.0,0.000,0.00000,10000.000,0.000000,0.000000,10000.000,10000.000,0.000,0.000,0.000000,0.000000e+00
1567966500,10000.00,10000.00,10000.00,10000.00,0.000,0.00000,0.0,0.000,0.00000,10000.000,0.000000,0.000000,10000.000,10000.000,0.000,0.000,0.000000,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1568053800,10261.52,10262.34,10253.14,10253.14,88.023,902909.48981,47.0,41.809,428897.86887,10256.418,3.492137,2.817309,10259.710,10255.820,1.612,-2.278,0.000379,1.561980e-07
1568054100,10253.14,10257.45,10250.56,10252.98,84.903,870540.82261,42.0,46.264,474408.95163,10252.614,1.381260,1.699285,10255.096,10251.520,1.292,-2.284,0.000349,1.374091e-07
1568054400,10254.39,10273.11,10254.39,10269.45,92.722,951670.01199,42.0,63.903,655918.82825,10265.250,6.109746,4.390367,10266.366,10261.242,3.810,-1.314,0.000499,2.858222e-07
1568054700,10271.36,10273.11,10267.74,10270.28,56.697,582303.28366,29.0,25.752,264505.00016,10270.484,1.493077,2.329835,10271.510,10269.092,1.046,-1.372,0.000235,6.816171e-08


Unnamed: 0_level_0,op,hi,lo,cl,volume,amount,trades,buy_volume,buy_amount,twap,cl_std,cl_diff_std,hi_twap,lo_twap,hi_op_max,lo_op_min,ln_hi_lo_mean,ln_hi_lo_sqr_mean
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1599998700,10341.93,10341.93,10312.14,10336.67,1029.149,1.062818e+07,3284.0,496.003,5.122458e+06,10329.026,5.734573,9.556881,10335.050,10320.284,5.070,-9.696,0.001430,2.402590e-06
1599999000,10336.67,10352.68,10331.13,10346.16,647.076,6.691791e+06,2422.0,373.955,3.867401e+06,10340.352,4.087538,2.323159,10342.834,10337.236,4.402,-1.196,0.000541,4.323437e-07
1599999300,10346.17,10346.17,10331.11,10332.67,465.101,4.807925e+06,2050.0,145.508,1.504071e+06,10336.204,3.553580,7.081225,10340.504,10333.034,1.602,-5.868,0.000723,6.805473e-07
1599999600,10332.67,10349.93,10330.00,10348.20,524.316,5.423663e+06,1919.0,289.078,2.990288e+06,10346.406,2.349538,7.160878,10348.410,10340.734,5.110,-2.566,0.000742,8.059715e-07
1599999900,10348.20,10349.00,10330.00,10347.92,605.199,6.260212e+06,1920.0,401.152,4.149836e+06,10343.904,3.023529,5.579384,10347.616,10339.438,3.656,-4.522,0.000791,9.154744e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1600086900,10457.05,10459.54,10447.82,10454.03,409.891,4.285011e+06,1732.0,165.152,1.726549e+06,10453.292,2.271744,3.868389,10456.778,10451.370,2.884,-2.524,0.000517,2.843107e-07
1600087200,10454.03,10463.74,10454.02,10461.87,325.133,3.400406e+06,1669.0,218.642,2.286667e+06,10458.460,2.021769,1.462231,10460.504,10456.328,3.468,-0.708,0.000399,1.896212e-07
1600087500,10462.16,10478.00,10459.01,10478.00,553.048,5.791054e+06,2326.0,355.266,3.720191e+06,10468.276,8.612890,4.713631,10468.908,10463.248,3.802,-1.858,0.000541,4.001467e-07
1600087800,10477.99,10500.00,10477.99,10494.28,1498.788,1.572528e+07,4945.0,958.286,1.005496e+07,10493.482,2.766057,9.485835,10497.586,10484.470,7.620,-5.496,0.001250,1.712829e-06


In [8]:
# Base assets to collect; each one is paired with USDT below.
symbols = [
    *'BTC,ETH,XRP,LINK,ATOM,DOT,SOL,BNB,MATIC,ADA'.split(','),
    'ALGO', 'AVAX', 'BCH', 'APT',
    *'DOGE,SFP,DYDX,AXS,CHZ,TRX,MASK,ETC,LTC,1000SHIB,C98,SAND,SUSHI,NEAR,FIL'.split(','),
]

# One Fetcher per <asset>USDT symbol.
fetchers = [Fetcher(symbol=f'{symbol}USDT') for symbol in symbols]

# Serialize the list with cloudpickle, xz-compress it and write it to disk.
data = lzma.compress(cloudpickle.dumps(fetchers))
with open('/home/jovyan/data/20221123_binance_ohlcv_5m.xz', 'wb') as f:
    f.write(data)