In [1]:
import sys
sys.path.append('..') # for import src

import os
import cloudpickle
import lzma
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import ccxt

import src
cloudpickle.register_pickle_by_value(src) # for model portability

In [2]:
import time


class Fetcher:
    """Download Binance 1-minute klines for one symbol and aggregate them to hourly bars.

    Attributes read by callers:
        symbol:  exchange symbol passed to the klines endpoint (e.g. ``'BTCUSDT'``).
        keys:    ``{'symbol': symbol}``.
        data_id: fixed identifier ``'binance_ohlcv_spot'``.
    """

    def __init__(self, symbol=None):
        self.symbol = symbol
        self.keys = {
            'symbol': symbol
        }
        self.data_id = 'binance_ohlcv_spot'

    def fetch(self, last_timestamp=None):
        """Fetch one page of 1-minute klines and return hourly aggregates.

        Args:
            last_timestamp: Unix time in seconds, or ``None``. When given, the
                request starts one hour after it; when ``None`` the request
                starts at ``startTime=1`` (i.e. the earliest data available).
                Presumably this is the index of the last hourly bar already
                stored by the caller -- confirm against the caller.

        Returns:
            ``pandas.DataFrame`` indexed by ``timestamp`` (start of the hour,
            Unix seconds) with one row per hour. The last hour bucket of the
            page is always dropped because it may be incomplete.
        """
        raw = self._request_klines(last_timestamp)
        return self._aggregate_hourly(raw)

    def _request_klines(self, last_timestamp):
        """Sleep, then request one page of raw 1-minute klines for ``self.symbol``."""
        client = ccxt.binance({
            # 'options': {
                # 'defaultType': 'future',
            # },
        })

        # Fixed pause before every request (crude throttling).
        time.sleep(10)
        return client.publicGetKlines({
            'symbol': self.symbol,
            'interval': '1m',
            # Resume one hour (60 * 60 s) after the given timestamp; the API takes milliseconds.
            'startTime': 1 if last_timestamp is None else (last_timestamp + 60 * 60) * 1000,
            # NOTE(review): Binance documents a cap of 1000 rows for spot klines
            # (1500 is the futures cap); presumably the server clamps this -- confirm.
            'limit': 1500,
        })

    @staticmethod
    def _aggregate_hourly(raw):
        """Turn raw kline rows (12 fields each) into the hourly feature frame."""
        df = pd.DataFrame(raw, columns=[
            'timestamp',
            'op',
            'hi',
            'lo',
            'cl',
            'volume',
            'close_time',
            'amount',
            'trades',
            'buy_volume',
            'buy_amount',
            'ignored',
        ])
        df = df.drop(columns=['close_time', 'ignored'])
        # Explicit int64: millisecond timestamps overflow a 32-bit int, which is
        # what plain ``int`` maps to on some platforms / numpy versions.
        df['timestamp'] = df['timestamp'].astype('int64') // 1000

        for col in ['op', 'hi', 'lo', 'cl', 'volume', 'amount', 'trades', 'buy_volume', 'buy_amount']:
            df[col] = df[col].astype('float')

        # Bucket every 1-minute row into its 5-minute and 1-hour window start.
        df['timestamp_5m'] = (df['timestamp'] // 300) * 300
        df['timestamp_1h'] = (df['timestamp'] // 3600) * 3600

        # Last close of every 5-minute bucket. ``last()`` is used instead of
        # ``nth(-1)``: since pandas 2.0 ``nth`` behaves as a filter and keeps the
        # original row index, which would break the ``reset_index`` below.
        df_5m = df.groupby('timestamp_5m')['cl'].last().reset_index()
        df_5m['timestamp_1h'] = (df_5m['timestamp_5m'] // 3600) * 3600

        # display(df.loc[(df['volume'] == 0) & (df['trades'] > 0)])

        # microstructure feature
        # entropy feature
        # slippage feature

        df['hi_op'] = df['hi'] - df['op']
        df['lo_op'] = df['lo'] - df['op']

        df['ln_hi_lo'] = np.log(df['hi'] / df['lo'])
        df['ln_hi_lo_sqr'] = df['ln_hi_lo'] ** 2

        # Minute-to-minute close change computed within each hour; the first
        # minute of an hour is measured against its own open.
        prev_cl = df.groupby('timestamp_1h')['cl'].shift(1).fillna(df['op'])
        df['cl_diff'] = df['cl'] - prev_cl

#         def corwin_alpha(x):
#             hi2 = x['hi'].rolling(2).max()
#             lo2 = x['lo'].rolling(2).min()
#             gamma = np.log(hi2 / lo2) ** 2
#             beta = np.log(x['hi'] / x['lo']) ** 2
#             beta = beta.rolling(2).sum()

#             sqrt2 = 2.0 ** 0.5
#             denom = 3.0 - 2.0 * sqrt2
#             alpha = (sqrt2 - 1) / denom * beta ** 0.5 - (gamma / denom) ** 0.5
#             return alpha.mean()

        hourly = df.groupby('timestamp_1h')
        df = pd.concat([
            # first()/last() instead of nth(0)/nth(-1) so the result is indexed
            # by the hour on every pandas version (see note on df_5m above).
            hourly['op'].first(),
            hourly['hi'].max(),
            hourly['lo'].min(),
            hourly['cl'].last(),
            hourly['volume'].sum(),
            hourly['amount'].sum(),
            hourly['trades'].sum(),
            hourly['buy_volume'].sum(),
            hourly['buy_amount'].sum(),
            hourly['cl'].mean().rename('twap'),
            df_5m.groupby('timestamp_1h')['cl'].mean().rename('twap_5m'),
            # vola (std is NaN for a single-row hour, hence fillna(0))
            hourly['cl'].std().fillna(0).rename('cl_std'),
            hourly['cl_diff'].std().fillna(0).rename('cl_diff_std'),
            # slippage
            hourly['hi'].mean().rename('hi_twap'),
            hourly['lo'].mean().rename('lo_twap'),
            # NOTE(review): named *_max / *_min but computed as the hourly mean;
            # left unchanged so existing feature values keep their meaning.
            hourly['hi_op'].mean().rename('hi_op_max'),
            hourly['lo_op'].mean().rename('lo_op_min'),
            # microstructure
            hourly['ln_hi_lo'].mean().rename('ln_hi_lo_mean'),
            hourly['ln_hi_lo_sqr'].mean().rename('ln_hi_lo_sqr_mean'),
            # df.groupby('timestamp_1h').apply(corwin_alpha).fillna(0).rename('corwin_alpha'),
            # entropy
        ], axis=1)

        df.index.rename('timestamp', inplace=True)

        df = df.iloc[:-1] # remove partial

        return df


In [3]:
# Smoke-test the fetcher on BTCUSDT with two live requests.
fetcher = Fetcher(symbol='BTCUSDT')

# 1) No checkpoint: the request starts from the earliest available data.
df = fetcher.fetch(last_timestamp=None)
display(df)

# 2) Resume from a checkpoint: 1600000000 floored to the start of its hour.
df = fetcher.fetch(last_timestamp=1600000000 - 1600000000 % 3600)
display(df)

Unnamed: 0_level_0,op,hi,lo,cl,volume,amount,trades,buy_volume,buy_amount,twap,twap_5m,cl_std,cl_diff_std,hi_twap,lo_twap,hi_op_max,lo_op_min,ln_hi_lo_mean,ln_hi_lo_sqr_mean
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1502942400,4261.48,4313.62,4261.32,4308.83,47.181009,202366.138393,171.0,35.160503,150952.477943,4282.625167,4282.931667,21.190936,7.995963,4283.535667,4281.556333,1.092,-0.887333,0.000461,1.485671e-06
1502946000,4308.83,4328.69,4291.37,4315.32,23.234916,100304.823567,102.0,21.448071,92608.279728,4313.305667,4310.644167,11.543133,6.336847,4313.6425,4312.843833,0.461833,-0.336833,0.000185,6.465886e-07
1502949600,4315.32,4345.45,4309.37,4324.35,7.229691,31282.31267,36.0,4.802861,20795.317224,4327.276333,4327.216667,11.110047,6.552408,4327.375833,4326.350667,0.925667,-0.0995,0.000237,9.758008e-07
1502953200,4324.35,4349.99,4287.41,4349.99,4.443249,19241.0583,25.0,2.602292,11291.347015,4311.038,4310.476667,15.591121,5.636087,4311.151,4310.593833,0.444167,-0.113,0.000129,3.680335e-07
1502956800,4333.32,4377.85,4333.32,4360.69,0.972807,4239.503586,28.0,0.814655,3552.746817,4360.375333,4358.360833,5.944685,6.914685,4361.1175,4360.0975,0.277833,-0.742167,0.000234,1.987785e-06
1502960400,4360.69,4445.78,4360.0,4444.0,10.763623,47219.355944,43.0,10.2584,44995.378157,4398.649833,4401.73,40.420864,6.234895,4398.649833,4397.844,0.715167,-0.090667,0.000184,7.626259e-07
1502964000,4441.1,4470.0,4399.81,4460.0,24.865399,110290.728988,82.0,21.698693,96225.973054,4428.906333,4430.875833,21.467214,10.45763,4429.903333,4428.154833,1.1405,-0.608,0.000394,2.520515e-06
1502967600,4460.0,4485.39,4427.3,4427.3,27.018053,120751.569232,95.0,26.015885,116279.953125,4468.385833,4466.304167,15.207452,6.670391,4468.4025,4466.9625,1.0845,-0.3555,0.000322,8.711835e-07
1502971200,4427.3,4449.56,4411.0,4411.0,23.032398,102126.518585,108.0,15.312994,67947.834082,4428.084333,4427.154167,11.869371,5.260773,4429.121333,4427.718667,0.748,-0.654667,0.000317,9.975715e-07
1502974800,4411.0,4459.0,4411.0,4459.0,31.312436,138976.639435,118.0,26.564042,117910.4756,4439.312333,4442.599167,10.341933,6.081583,4439.331167,4437.819833,1.244,-0.267333,0.000341,8.513764e-07


Unnamed: 0_level_0,op,hi,lo,cl,volume,amount,trades,buy_volume,buy_amount,twap,twap_5m,cl_std,cl_diff_std,hi_twap,lo_twap,hi_op_max,lo_op_min,ln_hi_lo_mean,ln_hi_lo_sqr_mean
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1600002000,10290.06,10342.84,10258.0,10322.73,2489.821261,25670740.0,38403.0,1178.55498,12151010.0,10319.4305,10323.045,12.422451,6.289812,10322.984667,10314.989833,4.103333,-3.8915,0.000775,1.030032e-06
1600005600,10322.74,10341.82,10303.33,10324.78,1618.648839,16701860.0,27609.0,814.209218,8402162.0,10318.668167,10317.7875,8.670661,4.270569,10321.399,10315.1565,2.779833,-3.462667,0.000605,4.599999e-07
1600009200,10324.77,10342.0,10272.44,10280.14,1712.571848,17650460.0,31031.0,710.151474,7318405.0,10312.985667,10311.499167,14.937877,5.032222,10316.374,10309.5115,2.68,-4.1825,0.000666,6.144928e-07
1600012800,10280.15,10296.79,10200.0,10257.77,3730.155618,38250370.0,56172.0,1647.523114,16898350.0,10259.964,10261.419167,13.237759,9.683066,10265.426667,10252.7255,5.137,-7.564167,0.001238,2.333819e-06
1600016400,10257.78,10284.02,10212.51,10275.01,2144.266995,21965200.0,39612.0,1045.576815,10710130.0,10244.839167,10246.685,17.160958,5.319916,10249.020333,10239.362,4.494167,-5.164167,0.000943,1.055423e-06
1600020000,10276.69,10282.27,10243.47,10257.34,1115.529287,11452160.0,21508.0,539.442601,5537715.0,10265.822,10265.305,10.085573,3.768376,10268.323833,10263.142333,2.159833,-3.021667,0.000505,3.034635e-07
1600023600,10257.13,10323.67,10247.34,10309.05,1499.147021,15429080.0,24808.0,737.10656,7586193.0,10287.949667,10288.311667,20.526966,4.902658,10290.569833,10284.317,3.487,-2.765833,0.000608,4.971736e-07
1600027200,10309.05,10315.0,10284.74,10312.18,1120.897005,11545340.0,19727.0,545.866951,5622860.0,10300.9925,10301.9275,7.086016,4.087992,10303.655167,10298.000333,2.733,-2.921833,0.000549,4.21631e-07
1600030800,10312.19,10321.0,10286.77,10296.87,821.151679,8463355.0,18172.0,356.175306,3671203.0,10307.948167,10306.349167,8.585792,4.453691,10310.062167,10306.048667,1.859,-2.1545,0.000389,2.607213e-07
1600034400,10296.87,10316.18,10292.44,10306.66,901.616714,9290889.0,18189.0,405.15807,4174920.0,10303.52,10302.67,5.821881,3.395171,10306.037667,10301.236833,2.686333,-2.1145,0.000466,2.796603e-07


In [4]:
# Base-asset tickers for the first batch; each is quoted against USDT below.
# NOTE(review): '1000SHIB' looks like a futures-style ticker -- confirm that
# '1000SHIBUSDT' is a valid symbol on the spot endpoint this Fetcher queries.
symbols = (
    'BTC,ETH,XRP,LINK,ATOM,DOT,SOL,BNB,MATIC,ADA'.split(',')
    + ['ALGO', 'AVAX', 'BCH', 'APT']
    + 'DOGE,SFP,DYDX,AXS,CHZ,TRX,MASK,ETC,LTC,1000SHIB,C98,SAND,SUSHI,NEAR,FIL'.split(',')
)

# One Fetcher per symbol.
fetchers = []
for symbol in symbols:
    fetchers.append(Fetcher(symbol=f'{symbol}USDT'))

# Serialize the fetcher list with cloudpickle, xz-compress it, and write it out.
data = lzma.compress(cloudpickle.dumps(fetchers))
with open('/home/jovyan/data/20221216_binance_ohlcv_spot.xz', 'wb') as f:
    f.write(data)

In [5]:
# Base-asset tickers for the second batch; each is quoted against USDT below.
symbols = 'EOS,WAVES,COMP,CTSI,MTL,APE,TOMO,ARB,STORJ,MKR,SUI,FTM,OP,CFX,INJ'.split(',')

# One Fetcher per symbol.
fetchers = []
for symbol in symbols:
    fetchers.append(Fetcher(symbol=f'{symbol}USDT'))

# Serialize the fetcher list with cloudpickle, xz-compress it, and write it out.
data = lzma.compress(cloudpickle.dumps(fetchers))
with open('/home/jovyan/data/20221216_binance_ohlcv_spot2.xz', 'wb') as f:
    f.write(data)

In [6]:
# Base-asset tickers for the third batch; each is quoted against USDT below.
symbols = 'MINA,KAVA,LINA,STG,OCEAN,CRV,LDO,GALA,ONT,BEL,ANKR,STMX,GMT,STX,RNDR'.split(',')

# One Fetcher per symbol.
fetchers = []
for symbol in symbols:
    fetchers.append(Fetcher(symbol=f'{symbol}USDT'))

# Serialize the fetcher list with cloudpickle, xz-compress it, and write it out.
data = lzma.compress(cloudpickle.dumps(fetchers))
with open('/home/jovyan/data/20221216_binance_ohlcv_spot3.xz', 'wb') as f:
    f.write(data)

In [4]:
# Base-asset tickers for the fourth batch; each is quoted against USDT below.
symbols = (
    'UNI,GRT,FLOW,CELO,MANA,ARPA,SNX,SXP,XEM,YFI,ALPHA,FET,XLM,ZEC,NKN,AAVE'.split(',')
    + ['USDC']
)

# One Fetcher per symbol.
fetchers = []
for symbol in symbols:
    fetchers.append(Fetcher(symbol=f'{symbol}USDT'))

# Serialize the fetcher list with cloudpickle, xz-compress it, and write it out.
data = lzma.compress(cloudpickle.dumps(fetchers))
with open('/home/jovyan/data/20221216_binance_ohlcv_spot4.xz', 'wb') as f:
    f.write(data)