In [37]:
import sys
sys.path.append('..') # for import src

import os
import cloudpickle
import lzma
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import ccxt

import src
cloudpickle.register_pickle_by_value(src) # for model portability

In [41]:
import time

class Fetcher:
    """Download 1-minute Coinbase spot candles for one product and build hourly bars.

    Attributes:
        symbol: Product id sent to the candles endpoint (e.g. ``'BTC-USD'``).
        keys: ``{'symbol': symbol}``; presumably identifies this fetcher to
            whatever consumes the pickled fetchers -- confirm against the caller.
        data_id: Constant label ``'coinbase_ohlcv_spot'``.
    """

    # Order in which the fields of each raw candle row are unpacked.
    _CANDLE_COLUMNS = ['timestamp', 'lo', 'hi', 'op', 'cl', 'volume']
    # Number of candle intervals spanned by a single request window.
    _CANDLES_PER_REQUEST = 300
    # Pause (seconds) before every request, as a crude rate limit.
    _REQUEST_PAUSE_SEC = 3

    def __init__(self, symbol=None):
        self.symbol = symbol
        self.keys = {
            'symbol': symbol
        }
        self.data_id = 'coinbase_ohlcv_spot'

    def fetch(self, last_timestamp=None):
        """Fetch one window of 1-minute candles and aggregate it to hourly rows.

        Args:
            last_timestamp: Timestamp (seconds) of the last hourly row already
                held by the caller; fetching resumes one hour after it. When
                ``None``, the earliest available candle is located first (a
                coarse search on daily candles followed by a fine search on
                1-minute candles) and fetching starts there.

        Returns:
            pandas.DataFrame indexed by ``timestamp`` (start of the hour, in
            seconds) with columns op, hi, lo, cl, volume, twap, twap_5m,
            cl_std, cl_diff_std, hi_twap, lo_twap, hi_op_max, lo_op_min,
            ln_hi_lo_mean, ln_hi_lo_sqr_mean. The last hourly row of the
            window is always dropped because it may be incomplete.
        """
        client = ccxt.coinbasepro({})

        if last_timestamp is None:
            day = 24 * 60 * 60
            # Coarse pass: walk back on daily candles to the first listed day.
            start_time = self._find_earliest(client, day, int(time.time()))
            # Fine pass: starting from the end of that day, walk back on
            # 1-minute candles to the very first candle.
            start_time = self._find_earliest(client, 60, start_time + day)
        else:
            start_time = last_timestamp + 60 * 60

        candles = self._request_candles(
            client, 60, start_time, start_time + 60 * self._CANDLES_PER_REQUEST)
        return self._aggregate_hourly(candles)

    def _request_candles(self, client, granularity, start, end):
        """Request candles in [start, end] and return them as a raw DataFrame."""
        time.sleep(self._REQUEST_PAUSE_SEC)
        result = client.publicGetProductsIdCandles({
            'id': self.symbol,
            'granularity': granularity,
            'start': start,
            'end': end,
        })
        df = pd.DataFrame(result, columns=self._CANDLE_COLUMNS)
        df['timestamp'] = df['timestamp'].astype(int)
        return df

    def _find_earliest(self, client, granularity, start_time):
        """Walk backwards one window at a time; return the earliest candle timestamp.

        Stops when a window ending at ``start_time`` contains nothing older
        than ``start_time``. An empty window also stops the search, so a
        missing-data response cannot turn the cursor into NaN.
        """
        span = granularity * self._CANDLES_PER_REQUEST
        while True:
            df = self._request_candles(client, granularity, start_time - span, start_time)
            if df.empty:
                break
            earliest = int(df['timestamp'].min())
            if earliest >= start_time:
                # No backward progress: start_time is already the oldest candle.
                break
            start_time = earliest
            print(start_time)
        return start_time

    @staticmethod
    def _aggregate_hourly(df):
        """Aggregate raw 1-minute candles into hourly OHLCV and feature columns."""
        df = df.sort_values('timestamp')

        for col in ['op', 'hi', 'lo', 'cl', 'volume']:
            df[col] = df[col].astype('float')

        df['timestamp_5m'] = (df['timestamp'] // 300) * 300
        df['timestamp_1h'] = (df['timestamp'] // 3600) * 3600

        # Closing price of each 5-minute bucket, used for the 5-minute TWAP.
        # .last()/.first() are used instead of .nth(): since pandas 2.0, nth()
        # is a filter that keeps the original row index, which would break the
        # index alignment in the concat below. (first()/last() skip NaN values,
        # nth() does not; prices are expected to be non-null here.)
        df_5m = df.groupby('timestamp_5m')['cl'].last().reset_index()
        df_5m['timestamp_1h'] = (df_5m['timestamp_5m'] // 3600) * 3600

        # TODO: microstructure feature
        # TODO: entropy feature
        # TODO: slippage feature

        df['hi_op'] = df['hi'] - df['op']
        df['lo_op'] = df['lo'] - df['op']

        df['ln_hi_lo'] = np.log(df['hi'] / df['lo'])
        df['ln_hi_lo_sqr'] = df['ln_hi_lo'] ** 2

        # Minute-to-minute close change within each hour; the first minute of
        # an hour has no previous close in its group, so its own open is used.
        prev_cl = df.groupby('timestamp_1h')['cl'].shift(1).fillna(df['op'])
        df['cl_diff'] = df['cl'] - prev_cl

        # Disabled feature sketch, kept for reference:
        # def corwin_alpha(x):
        #     hi2 = x['hi'].rolling(2).max()
        #     lo2 = x['lo'].rolling(2).min()
        #     gamma = np.log(hi2 / lo2) ** 2
        #     beta = np.log(x['hi'] / x['lo']) ** 2
        #     beta = beta.rolling(2).sum()
        #
        #     sqrt2 = 2.0 ** 0.5
        #     denom = 3.0 - 2.0 * sqrt2
        #     alpha = (sqrt2 - 1) / denom * beta ** 0.5 - (gamma / denom) ** 0.5
        #     return alpha.mean()

        by_hour = df.groupby('timestamp_1h')
        hourly = pd.concat([
            by_hour['op'].first(),
            by_hour['hi'].max(),
            by_hour['lo'].min(),
            by_hour['cl'].last(),
            by_hour['volume'].sum(),
            by_hour['cl'].mean().rename('twap'),
            df_5m.groupby('timestamp_1h')['cl'].mean().rename('twap_5m'),
            # vola (std is NaN for a single-sample hour -> 0)
            by_hour['cl'].std().fillna(0).rename('cl_std'),
            by_hour['cl_diff'].std().fillna(0).rename('cl_diff_std'),
            # slippage
            by_hour['hi'].mean().rename('hi_twap'),
            by_hour['lo'].mean().rename('lo_twap'),
            # NOTE(review): despite the *_max / *_min names these two are
            # hourly MEANS. Left as-is so existing feature values keep their
            # meaning -- confirm whether max()/min() was intended.
            by_hour['hi_op'].mean().rename('hi_op_max'),
            by_hour['lo_op'].mean().rename('lo_op_min'),
            # microstructure
            by_hour['ln_hi_lo'].mean().rename('ln_hi_lo_mean'),
            by_hour['ln_hi_lo_sqr'].mean().rename('ln_hi_lo_sqr_mean'),
            # by_hour.apply(corwin_alpha).fillna(0).rename('corwin_alpha'),
            # entropy
        ], axis=1)

        hourly = hourly.rename_axis('timestamp')

        # The final hour of the window may be only partially covered: drop it.
        return hourly.iloc[:-1]


In [42]:
# Smoke test: build a fetcher for a single product and exercise both fetch modes.
fetcher = Fetcher(symbol='BTC-USD')
# Mode 1: no previous position (last_timestamp=None).
df = fetcher.fetch(last_timestamp=None)
display(df)
# Mode 2: resume from a given position; 1600000000 is floored to a multiple of 3600.
df = fetcher.fetch(last_timestamp=(1600000000 // 3600) * 3600)
display(df)

1645315200
1619481600
1593648000
1567814400
1541980800
1516147200
1490313600
1464480000
1438646400
1437350400
1437428220


Unnamed: 0_level_0,op,hi,lo,cl,volume,twap,twap_5m,cl_std,cl_diff_std,hi_twap,lo_twap,hi_op_max,lo_op_min,ln_hi_lo_mean,ln_hi_lo_sqr_mean
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1437426000,277.98,278.0,277.92,278.0,59.260456,277.984783,277.978,0.018058,0.023532,277.994783,277.982609,0.002174,-0.01,4.4e-05,7.315126e-09
1437429600,278.0,278.5,277.37,278.14,346.995411,278.039821,278.039167,0.219963,0.227494,278.092321,277.96625,0.070536,-0.055536,0.000454,5.303553e-07
1437433200,278.31,280.0,278.15,280.0,376.627552,278.864746,278.921667,0.43155,0.149291,278.894407,278.798644,0.062373,-0.03339,0.000343,2.864564e-07
1437436800,279.96,281.0,279.38,280.89,288.177976,279.979831,279.958333,0.475059,0.130096,280.005593,279.95339,0.032034,-0.020169,0.000186,1.26291e-07
1437440400,280.89,281.15,280.35,280.36,380.933023,280.880339,280.8725,0.181222,0.064639,280.901864,280.857627,0.02,-0.024237,0.000158,7.54969e-08


Unnamed: 0_level_0,op,hi,lo,cl,volume,twap,twap_5m,cl_std,cl_diff_std,hi_twap,lo_twap,hi_op_max,lo_op_min,ln_hi_lo_mean,ln_hi_lo_sqr_mean
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1600002000,10289.58,10345.12,10268.0,10327.75,776.714509,10324.20678,10326.771667,12.53418,6.38862,10326.616441,10320.924746,3.027458,-2.664237,0.000552,6.437148e-07
1600005600,10326.74,10344.04,10310.0,10330.8,802.016242,10324.669667,10323.853333,8.436266,3.779008,10326.382167,10322.022333,1.726167,-2.633667,0.000422,2.60459e-07
1600009200,10330.8,10341.71,10275.01,10284.29,445.716861,10314.2175,10312.829167,14.686248,4.478851,10316.835833,10311.125,1.935667,-3.775167,0.000554,4.671719e-07
1600012800,10283.63,10296.2,10223.37,10264.9,838.326013,10262.191167,10264.280833,11.665632,7.592806,10266.702333,10257.706667,4.086333,-4.909333,0.000877,1.187885e-06
1600016400,10264.89,10288.16,10211.0,10284.96,981.963186,10247.172833,10248.549167,19.91579,5.211796,10250.591333,10242.759833,3.680167,-4.151333,0.000764,8.07951e-07


In [43]:
# Base assets to cover; every one is paired with USD when its fetcher is built.
symbols = [
    *'BTC,ETH,LINK,ATOM,DOT,SOL,MATIC,ADA'.split(','),
    'ALGO', 'AVAX', 'BCH', 'APT',
    *'DOGE,AXS,CHZ,MASK,ETC,LTC,SHIB,C98,SAND,SUSHI,NEAR,FIL'.split(','),
]

# One Fetcher per '<ASSET>-USD' product, in the same order as `symbols`.
fetchers = [Fetcher(symbol=f'{symbol}-USD') for symbol in symbols]

# Serialize the fetcher list with cloudpickle, xz-compress it, and write it to disk.
data = lzma.compress(cloudpickle.dumps(fetchers))
with open('/home/jovyan/data/20221216_coinbase_ohlcv_spot.xz', 'wb') as f:
    f.write(data)