In [1]:
import sys
sys.path.append('..') # for import src

import os
import cloudpickle
import lzma
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import ccxt

import src
cloudpickle.register_pickle_by_value(src) # for model portability

In [2]:

class Fetcher:
    """Fetch Binance futures 1-minute klines and aggregate them into hourly bars.

    Attributes:
        symbol: Market symbol sent to the klines endpoint (e.g. ``'BTCUSDT'``).
        keys: ``{'symbol': symbol}``. Presumably used by the consumer of this
            object to identify the series -- confirm against the caller.
        data_id: Constant dataset identifier, ``'binance_ohlcv'``.
    """

    def __init__(self, symbol=None):
        self.symbol = symbol
        self.keys = {
            'symbol': symbol
        }
        self.data_id = 'binance_ohlcv'

    def fetch(self, last_timestamp=None):
        """Download up to 1500 one-minute klines and return hourly bars.

        Args:
            last_timestamp: Unix time in seconds of the last hourly bar that is
                already known. The request starts one hour (3600 s) after it.
                When ``None`` the request uses ``startTime=1`` (milliseconds),
                i.e. it asks for data from the very beginning.

        Returns:
            pandas.DataFrame indexed by ``timestamp`` (start of the hour, Unix
            seconds) with one row per hour. The last hour of the response is
            always dropped because it may be incomplete. See
            ``_aggregate_hourly`` for the column list.
        """
        # A new client is built on every call; only the public klines endpoint
        # is used, so no credentials are configured.
        client = ccxt.binance({
            'options': {
                'defaultType': 'future',
            },
        })

        result = client.fapiPublicGetKlines({
            'symbol': self.symbol,
            'interval': '1m',
            # The endpoint expects milliseconds; last_timestamp is in seconds.
            'startTime': 1 if last_timestamp is None else (last_timestamp + 60 * 60) * 1000,
            'limit': 1500,
        })

        return self._aggregate_hourly(result)

    @staticmethod
    def _aggregate_hourly(klines):
        """Aggregate raw 1-minute kline rows into hourly bars plus features.

        Args:
            klines: Sequence of 12-element rows in the order
                (open time in ms, open, high, low, close, volume, close time,
                quote amount, trade count, buy volume, buy amount, ignored).
                Numeric fields may be strings; they are cast here.

        Returns:
            pandas.DataFrame indexed by ``timestamp`` (hour start, seconds)
            with columns, in this order:
            ``op, hi, lo, cl`` (first open / max high / min low / last close),
            ``volume, amount, trades, buy_volume, buy_amount`` (hourly sums),
            ``twap`` (mean of 1m closes), ``twap_5m`` (mean of 5m closes),
            ``cl_std`` (std of 1m closes), ``cl_diff_std`` (std of 1m close
            changes, the first change of each hour being close - open),
            ``hi_twap, lo_twap`` (mean of 1m highs / lows),
            ``hi_op_max, lo_op_min`` (mean of high-open / low-open),
            ``ln_hi_lo_mean, ln_hi_lo_sqr_mean`` (mean of ln(hi/lo) and of its
            square). The final hour is removed as potentially partial.
        """
        minute = pd.DataFrame(klines, columns=[
            'timestamp',
            'op',
            'hi',
            'lo',
            'cl',
            'volume',
            'close_time',
            'amount',
            'trades',
            'buy_volume',
            'buy_amount',
            'ignored',
        ])
        minute = minute.drop(columns=['close_time', 'ignored'])

        # Explicit int64: millisecond timestamps overflow a 32-bit integer,
        # which is what a plain `int` cast resolves to on some platforms.
        minute['timestamp'] = minute['timestamp'].astype('int64') // 1000

        value_cols = ['op', 'hi', 'lo', 'cl', 'volume', 'amount', 'trades', 'buy_volume', 'buy_amount']
        minute = minute.astype({col: 'float' for col in value_cols})

        # Bucket keys: start of the enclosing 5-minute / 1-hour window (seconds).
        minute['timestamp_5m'] = (minute['timestamp'] // 300) * 300
        minute['timestamp_1h'] = (minute['timestamp'] // 3600) * 3600

        # 5-minute closes, used only for the coarser `twap_5m`.
        five_min = minute.groupby('timestamp_5m')['cl'].last().reset_index()
        five_min['timestamp_1h'] = (five_min['timestamp_5m'] // 3600) * 3600

        # Planned but not implemented yet:
        # microstructure feature
        # entropy feature
        # slippage feature

        minute['hi_op'] = minute['hi'] - minute['op']
        minute['lo_op'] = minute['lo'] - minute['op']

        minute['ln_hi_lo'] = np.log(minute['hi'] / minute['lo'])
        minute['ln_hi_lo_sqr'] = minute['ln_hi_lo'] ** 2

        # Minute-to-minute close change inside each hour. The first minute of
        # an hour has no predecessor in its group, so its own open is used.
        prev_cl = minute.groupby('timestamp_1h')['cl'].shift(1).fillna(minute['op'])
        minute['cl_diff'] = minute['cl'] - prev_cl

        # Draft of an additional feature, kept disabled as in the original:
        # def corwin_alpha(x):
        #     hi2 = x['hi'].rolling(2).max()
        #     lo2 = x['lo'].rolling(2).min()
        #     gamma = np.log(hi2 / lo2) ** 2
        #     beta = np.log(x['hi'] / x['lo']) ** 2
        #     beta = beta.rolling(2).sum()
        #
        #     sqrt2 = 2.0 ** 0.5
        #     denom = 3.0 - 2.0 * sqrt2
        #     alpha = (sqrt2 - 1) / denom * beta ** 0.5 - (gamma / denom) ** 0.5
        #     return alpha.mean()

        by_hour = minute.groupby('timestamp_1h')

        # first()/last() instead of nth(0)/nth(-1): since pandas 2.0 `nth`
        # keeps the original row index, which would break the alignment on the
        # hour key in the concat below. (first/last skip NaN, nth does not;
        # the inputs here are fully populated numeric candles.)
        hourly = pd.concat([
            by_hour['op'].first(),
            by_hour['hi'].max(),
            by_hour['lo'].min(),
            by_hour['cl'].last(),
            by_hour['volume'].sum(),
            by_hour['amount'].sum(),
            by_hour['trades'].sum(),
            by_hour['buy_volume'].sum(),
            by_hour['buy_amount'].sum(),
            by_hour['cl'].mean().rename('twap'),
            five_min.groupby('timestamp_1h')['cl'].mean().rename('twap_5m'),
            # vola (std is NaN for a single-row hour -> 0)
            by_hour['cl'].std().fillna(0).rename('cl_std'),
            by_hour['cl_diff'].std().fillna(0).rename('cl_diff_std'),
            # slippage
            by_hour['hi'].mean().rename('hi_twap'),
            by_hour['lo'].mean().rename('lo_twap'),
            # NOTE: despite the names these two are hourly MEANS of the
            # per-minute distances; names kept for downstream compatibility.
            by_hour['hi_op'].mean().rename('hi_op_max'),
            by_hour['lo_op'].mean().rename('lo_op_min'),
            # microstructure
            by_hour['ln_hi_lo'].mean().rename('ln_hi_lo_mean'),
            by_hour['ln_hi_lo_sqr'].mean().rename('ln_hi_lo_sqr_mean'),
            # by_hour.apply(corwin_alpha).fillna(0).rename('corwin_alpha'),
            # entropy
        ], axis=1)

        hourly = hourly.rename_axis('timestamp')

        # The newest hour may still be in progress, so it is always discarded.
        return hourly.iloc[:-1]


In [3]:
# Smoke test against the live API: first from the very beginning of the
# available history (None), then resuming after an hour-aligned timestamp.
fetcher = Fetcher(symbol='BTCUSDT')
for last_timestamp in (None, (1600000000 // 3600) * 3600):
    df = fetcher.fetch(last_timestamp=last_timestamp)
    display(df)

Unnamed: 0_level_0,op,hi,lo,cl,volume,amount,trades,buy_volume,buy_amount,twap,twap_5m,cl_std,cl_diff_std,hi_twap,lo_twap,hi_op_max,lo_op_min,ln_hi_lo_mean,ln_hi_lo_sqr_mean
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1567962000,10000.0,10000.0,10000.0,10000.0,0.002,20.0,2.0,0.001,10.0,10000.0,10000.0,0.0,0.0,10000.0,10000.0,0.0,0.0,0.0,0.0
1567965600,10000.0,10000.0,10000.0,10000.0,0.0,0.0,0.0,0.0,0.0,10000.0,10000.0,0.0,0.0,10000.0,10000.0,0.0,0.0,0.0,0.0
1567969200,10000.0,10357.53,10000.0,10340.12,471.659,4879792.0,562.0,0.0,0.0,10275.48,10287.825833,140.46963,45.084017,10276.739322,10274.418983,1.039153,-1.281186,0.000224,1.020135e-07
1567972800,10340.12,10368.64,10334.54,10351.42,583.271,6037733.0,731.0,0.034,351.9712,10350.826833,10351.35,8.237726,3.46891,10352.417833,10349.721167,1.388,-1.308667,0.000261,1.08448e-07
1567976400,10351.42,10391.9,10324.77,10391.9,689.759,7136799.0,827.0,0.004,41.385,10347.2705,10348.556667,14.624109,3.471448,10348.2775,10345.559167,1.454167,-1.264167,0.000263,1.352056e-07
1567980000,10392.59,10412.65,10375.58,10375.58,675.394,7020273.0,821.0,0.0,0.0,10394.0685,10392.563333,8.151937,3.534259,10395.382833,10392.7305,1.019833,-1.6325,0.000255,1.045473e-07
1567983600,10375.58,10392.25,10366.57,10391.63,676.206,7021684.0,811.0,0.0,0.0,10383.645167,10385.221667,9.414928,3.26158,10384.587667,10382.231,1.112667,-1.244,0.000227,1.105052e-07
1567987200,10391.63,10391.63,10391.63,10391.63,0.0,0.0,0.0,0.0,0.0,10391.63,10391.63,0.0,0.0,10391.63,10391.63,0.0,0.0,0.0,0.0
1567990800,10391.63,10391.63,10391.63,10391.63,0.0,0.0,0.0,0.0,0.0,10391.63,10391.63,0.0,0.0,10391.63,10391.63,0.0,0.0,0.0,0.0
1567994400,10391.63,10391.63,10294.29,10296.04,96.462,994913.6,112.0,0.0,0.0,10379.9605,10377.206667,28.122285,9.987712,10380.308667,10379.834,0.018,-0.456667,4.6e-05,6.370534e-08


Unnamed: 0_level_0,op,hi,lo,cl,volume,amount,trades,buy_volume,buy_amount,twap,twap_5m,cl_std,cl_diff_std,hi_twap,lo_twap,hi_op_max,lo_op_min,ln_hi_lo_mean,ln_hi_lo_sqr_mean
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1600002000,10290.28,10342.51,10260.0,10320.96,9614.284,99073320.0,30057.0,5159.779,53189410.0,10318.250167,10321.729167,12.291609,6.269115,10321.636833,10313.750167,3.874667,-4.012,0.000765,9.908901e-07
1600005600,10320.96,10340.0,10303.93,10323.88,4805.957,49588420.0,19387.0,2694.436,27804340.0,10317.8625,10317.0,8.703156,4.310701,10320.412833,10314.733833,2.566167,-3.112833,0.00055,3.938773e-07
1600009200,10323.89,10340.66,10273.13,10281.78,6421.05,66165700.0,25248.0,2885.856,29745900.0,10312.589833,10310.793333,14.144969,4.915285,10315.702,10309.151667,2.422833,-4.1275,0.000636,5.844943e-07
1600012800,10281.78,10298.73,10222.0,10256.42,13596.029,139448800.0,37231.0,5660.461,58060780.0,10259.829,10261.120833,13.310195,9.766598,10265.166,10253.473667,4.905667,-6.786667,0.00114,1.927086e-06
1600016400,10256.43,10281.97,10209.01,10275.0,11038.868,113069100.0,28274.0,5195.644,53232970.0,10244.735667,10246.691667,16.684025,5.343475,10248.778333,10239.760833,4.344667,-4.672833,0.00088,9.523863e-07
1600020000,10274.4,10282.39,10245.01,10256.7,4110.26,42195630.0,17133.0,1629.694,16732010.0,10265.859,10265.504167,9.600475,3.800401,10268.134667,10263.365333,1.947833,-2.8215,0.000465,2.720961e-07
1600023600,10256.69,10322.2,10247.01,10307.61,5917.699,60899760.0,20491.0,3552.691,36564170.0,10286.757333,10287.004167,20.485317,5.106946,10289.295333,10283.299333,3.3655,-2.6305,0.000583,4.849943e-07
1600027200,10307.62,10314.63,10285.6,10310.0,3742.984,38548130.0,16397.0,1914.328,19716440.0,10300.078333,10300.774167,6.568839,3.9792,10302.449667,10297.431167,2.4005,-2.618,0.000487,3.268162e-07
1600030800,10310.0,10319.63,10287.49,10296.08,3580.75,36903460.0,16895.0,1822.952,18789370.0,10306.185333,10304.7,7.72378,4.005225,10307.787833,10304.4275,1.38,-1.980333,0.000326,2.186834e-07
1600034400,10296.08,10314.85,10290.0,10303.99,3043.649,31355910.0,14465.0,1980.847,20407780.0,10301.3925,10300.565,5.42014,3.329357,10303.715667,10299.278667,2.446833,-1.990167,0.000431,2.540619e-07


In [4]:
# Base assets to cover; each one is paired with USDT below.
symbols = 'BTC,ETH,XRP,LINK,ATOM,DOT,SOL,BNB,MATIC,ADA'.split(',') + ['ALGO', 'AVAX', 'BCH', 'APT']

# One Fetcher per <asset>USDT market.
fetchers = [Fetcher(symbol=f'{symbol}USDT') for symbol in symbols]

# Serialize the whole list with cloudpickle, xz-compress it and write it out.
data = lzma.compress(cloudpickle.dumps(fetchers))
with open('/home/jovyan/data/20221103_binance_ohlcv.xz', 'wb') as f:
    f.write(data)