In [1]:
import sys
sys.path.append('..') # for import src

import os
import cloudpickle
import lzma
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import ccxt

import src
cloudpickle.register_pickle_by_value(src) # for model portability

In [2]:
import numpy as np
import pandas as pd
import time
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

def execute_query(client, q):
    """Run a GraphQL selection set against ``client`` and return the result.

    Args:
        client: object exposing ``execute(document)`` (a ``gql`` ``Client``
            in this notebook).
        q: GraphQL selection set text, including its outer braces. It is
            prefixed with ``query Query `` to form the full document.

    Returns:
        Whatever ``client.execute`` returns for the parsed document.
    """
    document = gql(f'query Query {q}')
    return client.execute(document)

In [3]:
# Token id used to filter pairs in fetch_pair; the rows returned by the
# subgraph label this id with the symbol WETH.
weth_id = '0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2'

def fetch_pair(client, created_at=0, weth_is_token1=False):
    """Fetch one page (at most 1000 rows) of pairs that contain ``weth_id``.

    Rows are ordered by ``createdAtTimestamp`` and restricted to
    ``createdAtTimestamp >= created_at``.

    Args:
        client: GraphQL client forwarded to ``execute_query``.
        created_at: inclusive lower bound for ``createdAtTimestamp``.
        weth_is_token1: when true, filter on ``token1``; otherwise ``token0``.

    Returns:
        The result of ``execute_query`` for the generated query.
    """
    token_slot = 1 if weth_is_token1 else 0
    query_template = """
    {{
      pairs(orderBy: createdAtTimestamp, where: {{ createdAtTimestamp_gte: {}, token{}: "{}" }}, first: 1000) {{
        id
        createdAtTimestamp
        token0 {{
          id
          symbol
        }}
        token1 {{
          id
          symbol
        }}
      }}
    }}
    """
    query_text = query_template.format(created_at, token_slot, weth_id)
    return execute_query(client, query_text)

def do_fetch_all_pairs(client, created_at=1, weth_is_token1=False):
    """Page through every matching pair created at or after ``created_at``.

    Calls ``fetch_pair`` repeatedly, moving the ``created_at`` cursor to the
    largest ``createdAtTimestamp`` seen so far. Pages overlap at the cursor
    timestamp (the filter is ``>=``), so duplicates are dropped at the end.

    Args:
        client: GraphQL client forwarded to ``fetch_pair``.
        created_at: inclusive lower bound for ``createdAtTimestamp``.
        weth_is_token1: forwarded to ``fetch_pair`` to pick the token slot.

    Returns:
        DataFrame indexed by pair ``id`` (sorted) with columns ``timestamp``
        (int), ``token0_id``, ``token0_symbol``, ``token1_id`` and
        ``token1_symbol``. The frame is empty, with the same columns, when
        no pair matches.

    Raises:
        RuntimeError: if a full page does not advance the cursor, which
            would otherwise repeat the same request forever.
    """
    page_size = 1000  # must match `first: 1000` in fetch_pair's query
    columns = [
        'id', 'createdAtTimestamp',
        'token0_id', 'token0_symbol',
        'token1_id', 'token1_symbol',
    ]
    dfs = []

    while True:
        print('created_at', created_at)
        try:
            res = fetch_pair(client, created_at, weth_is_token1)
        except Exception as e:
            # Best-effort: keep retrying the same page after a pause.
            print(e)
            print('retry')
            time.sleep(10)
            continue

        pairs = res['pairs']
        print('len', len(pairs))
        if len(pairs) == 0:
            # Nothing (more) to read; an empty page used to crash in drop().
            break

        records = [
            {
                'id': row['id'],
                'createdAtTimestamp': row['createdAtTimestamp'],
                'token0_id': row['token0']['id'],
                'token0_symbol': row['token0']['symbol'],
                'token1_id': row['token1']['id'],
                'token1_symbol': row['token1']['symbol'],
            }
            for row in pairs
        ]
        dfs.append(pd.DataFrame(records, columns=columns))

        if len(pairs) < page_size:
            break

        # Compare numerically: a max() over the raw values would be
        # lexicographic if the API returns timestamps as strings.
        next_created_at = max(int(r['createdAtTimestamp']) for r in records)
        if next_created_at <= int(created_at):
            raise RuntimeError(
                'pagination stalled: {} or more pairs share createdAtTimestamp {}'.format(
                    page_size, created_at))
        created_at = next_created_at

    if dfs:
        df = pd.concat(dfs).drop_duplicates()
    else:
        df = pd.DataFrame(columns=columns)
    df = df.rename(columns={
        'createdAtTimestamp': 'timestamp'
    })
    df['timestamp'] = df['timestamp'].astype('int')
    df = df.set_index(['id']).sort_index()
    return df

def fetch_all_pairs(client, created_at=1):
    """Fetch pairs for both token slots and merge them into one frame.

    Runs ``do_fetch_all_pairs`` first with ``weth_is_token1=False`` and then
    with ``weth_is_token1=True``, concatenates the two results and sorts the
    combined frame by its index.

    Args:
        client: GraphQL client forwarded to ``do_fetch_all_pairs``.
        created_at: inclusive lower bound forwarded to ``do_fetch_all_pairs``.

    Returns:
        The concatenated, index-sorted DataFrame.
    """
    per_slot = [
        do_fetch_all_pairs(client, created_at, as_token1)
        for as_token1 in (False, True)
    ]
    combined = pd.concat(per_slot)
    return combined.sort_index()

# Build a GraphQL client for the Uniswap V2 subgraph endpoint.
transport = RequestsHTTPTransport(
    url="https://api.thegraph.com/subgraphs/name/uniswap/uniswap-v2",
)
client = Client(transport=transport, fetch_schema_from_transport=True)

# Fetch pairs created at or after the given timestamp, report how many index
# entries are duplicated, show the frame and cache it for later cells.
df_pair = fetch_all_pairs(client, 1668000000)
print('index duplicated count {}'.format(df_pair.index.duplicated(keep='last').sum()))
display(df_pair)
df_pair.to_pickle('/tmp/df_pair.pkl')

created_at 1668000000
len 273
created_at 1668000000
len 770
index duplicated count 0


Unnamed: 0_level_0,timestamp,token0_id,token0_symbol,token1_id,token1_symbol
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0x002c755bc6b99f61d59a76047a13d6be41bb8fd1,1668312539,0x22e7250df8dc9eb3cf2f3e21de58b1a4e2c36946,IM,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH
0x00513eedf43b040301315473f193152143f28cbd,1668456983,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH,0xc2089ed438021ac6a07b4f814787c02b4e34f109,test2
0x0084224f7c5635bbd8d8e436ca246f3b4f0e7197,1668116723,0x85d913b0afab2630544660089d6cbb85d4560393,CR7,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH
0x00b7bebc45ae2aac8af760db9315826c657cd8f4,1668377891,0x83fca6b2b527e51c4d2dc4b12fed40752794238d,ToM,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH
0x00b7e4a0f0b2aeffcefb92d6a78197b888196463,1668425507,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH,0xecead914cbf1751369fdb278704621926d1caac0,Pepe-TV
...,...,...,...,...,...
0xfee78a4782064296fbb5dd1ced086676bedb0d59,1668402131,0x19cce203a8563df72fcc17fea1f5d4d7de5b8e40,GAINS,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH
0xff3daea632eccaa05759f0c4d8dba46ebd555402,1668465203,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH,0xe1f8ad437bfadc5d2f42a4126403ee6c4200ee4e,MTLS
0xff6354df9854ee901593b93384dcff979599f3fa,1668119975,0x9dd9a8379d73ea7e5bc7b7d1af85c40ac93262c6,COKEBEAR,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH
0xff769f62d94283001b39f77c41c28f279e73d4ae,1668435539,0xa630422573fc622c6f389bd60608722d73117d6b,TIMP,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH


In [17]:

class PairFetcher:
    """Fetcher that returns pair rows from the Uniswap V2 subgraph.

    Attributes set in ``__init__``:
        keys: an empty dict (not read anywhere in this class; presumably
            consumed by whatever loads the pickled fetcher -- TODO confirm).
        data_id: the string ``'univ2_pair'``.
    """

    def __init__(self):
        self.keys = {}
        self.data_id = 'univ2_pair'

    def fetch(self, last_timestamp=None):
        """Fetch pairs created after ``last_timestamp``.

        Args:
            last_timestamp: last timestamp already fetched; the query starts
                at ``last_timestamp + 1``. ``None`` starts from 1.

        Returns:
            The frame from ``fetch_all_pairs`` without the rows whose
            ``timestamp`` is within 300 seconds of the newest timestamp.
        """
        client = Client(
            transport=RequestsHTTPTransport(
                url="https://api.thegraph.com/subgraphs/name/uniswap/uniswap-v2",
            ),
            fetch_schema_from_transport=True,
        )

        if last_timestamp is None:
            created_at = 1
        else:
            created_at = last_timestamp + 1
        pairs = fetch_all_pairs(client, created_at)

        # Buffer: keep only rows strictly older than (newest - 300 seconds).
        cutoff = pairs['timestamp'].max() - 300
        return pairs.loc[pairs['timestamp'] < cutoff]


In [16]:
# Try PairFetcher with an incremental fetch that starts just after the given
# timestamp and display the result.
fetcher = PairFetcher()
# The variant below (last_timestamp=None, i.e. start from 1) is left disabled:
# df = fetcher.fetch(last_timestamp=None)
# display(df)
df = fetcher.fetch(last_timestamp=1668000000)
display(df)

created_at 1668000001
len 278
created_at 1668000001
len 776


Unnamed: 0_level_0,timestamp,token0_id,token0_symbol,token1_id,token1_symbol
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0x002c755bc6b99f61d59a76047a13d6be41bb8fd1,1668312539,0x22e7250df8dc9eb3cf2f3e21de58b1a4e2c36946,IM,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH
0x00513eedf43b040301315473f193152143f28cbd,1668456983,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH,0xc2089ed438021ac6a07b4f814787c02b4e34f109,test2
0x0084224f7c5635bbd8d8e436ca246f3b4f0e7197,1668116723,0x85d913b0afab2630544660089d6cbb85d4560393,CR7,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH
0x00b7bebc45ae2aac8af760db9315826c657cd8f4,1668377891,0x83fca6b2b527e51c4d2dc4b12fed40752794238d,ToM,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH
0x00b7e4a0f0b2aeffcefb92d6a78197b888196463,1668425507,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH,0xecead914cbf1751369fdb278704621926d1caac0,Pepe-TV
...,...,...,...,...,...
0xfee78a4782064296fbb5dd1ced086676bedb0d59,1668402131,0x19cce203a8563df72fcc17fea1f5d4d7de5b8e40,GAINS,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH
0xff3daea632eccaa05759f0c4d8dba46ebd555402,1668465203,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH,0xe1f8ad437bfadc5d2f42a4126403ee6c4200ee4e,MTLS
0xff6354df9854ee901593b93384dcff979599f3fa,1668119975,0x9dd9a8379d73ea7e5bc7b7d1af85c40ac93262c6,COKEBEAR,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH
0xff769f62d94283001b39f77c41c28f279e73d4ae,1668435539,0xa630422573fc622c6f389bd60608722d73117d6b,TIMP,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,WETH


In [18]:
def fetch_pair_hour_data(client, hour_start_unix=0, pair_ids=None):
    """Fetch one page (at most 1000 rows) of ``pairHourDatas``.

    Rows are ordered by ``hourStartUnix`` and restricted to
    ``hourStartUnix >= hour_start_unix`` and to pairs listed in ``pair_ids``.

    Args:
        client: GraphQL client forwarded to ``execute_query``.
        hour_start_unix: inclusive lower bound for ``hourStartUnix``.
        pair_ids: iterable of pair ids for the ``pair_in`` filter. The
            default ``None`` is not iterable, so a value must be supplied.

    Returns:
        The result of ``execute_query`` for the generated query.
    """
    quoted_ids = ['"{}"'.format(pair_id) for pair_id in pair_ids]
    query_template = """
    {{
      pairHourDatas(orderBy: hourStartUnix, first: 1000, where: {{ hourStartUnix_gte: {}, pair_in: [{}], }}) {{
        id
        hourStartUnix
        pair {{
          id
        }}
        reserve0
        reserve1
        reserveUSD
        hourlyVolumeToken0
        hourlyVolumeToken1
        hourlyVolumeUSD
        hourlyTxns
      }}
    }}
    """
    query_text = query_template.format(hour_start_unix, ','.join(quoted_ids))
    return execute_query(client, query_text)

def do_fetch_all_pair_hour_datas(client, pair_ids=None, raise_error=False, hour_start_unix=None, end_unix=None):
    """Page through hourly data for ``pair_ids`` in ``[hour_start_unix, end_unix)``.

    Calls ``fetch_pair_hour_data`` repeatedly, moving the cursor to the
    largest ``hourStartUnix`` seen so far. Pages overlap at the cursor hour
    (the filter is ``>=``), so duplicates are dropped at the end.

    Args:
        client: GraphQL client forwarded to ``fetch_pair_hour_data``.
        pair_ids: non-empty sequence of pair ids (first and last are logged).
        raise_error: when true, re-raise fetch errors instead of sleeping
            10 seconds and retrying the same page.
        hour_start_unix: inclusive lower bound; ``None`` means 0.
        end_unix: exclusive upper bound; ``None`` means no upper bound.

    Returns:
        DataFrame indexed by ``(timestamp, pair_id)`` (sorted), or ``None``
        when the API returned no rows at all. The frame can be empty when
        every returned row is at or after ``end_unix``.

    Raises:
        RuntimeError: if a full page does not advance the cursor, which
            would otherwise repeat the same request forever.
    """
    page_size = 1000  # must match `first: 1000` in fetch_pair_hour_data's query
    if hour_start_unix is None:
        # The None defaults used to fail on `None < None`.
        hour_start_unix = 0

    dfs = []
    while end_unix is None or hour_start_unix < end_unix:
        print('pair_ids hour_start_unix end_unix', pair_ids[0], pair_ids[-1], hour_start_unix, end_unix)
        try:
            res = fetch_pair_hour_data(client, hour_start_unix=hour_start_unix, pair_ids=pair_ids)
        except Exception as e:
            if raise_error:
                raise
            # Best-effort: keep retrying the same page after a pause.
            print(e)
            print('retry')
            time.sleep(10)
            continue

        rows = res['pairHourDatas']
        print('len', len(rows))
        if len(rows) == 0:
            break
        for row in rows:
            # Flatten the nested pair object into a plain column.
            row['pair_id'] = row['pair']['id']
        dfs.append(pd.DataFrame(rows).drop(columns=['pair']))

        if len(rows) < page_size:
            break

        next_start = max(int(row['hourStartUnix']) for row in rows)
        if next_start <= hour_start_unix:
            raise RuntimeError(
                'pagination stalled: {} or more rows share hourStartUnix {}'.format(
                    page_size, hour_start_unix))
        hour_start_unix = next_start

    if len(dfs) == 0:
        return None

    df = pd.concat(dfs).drop_duplicates()
    df = df.rename(columns={
        'hourStartUnix': 'timestamp'
    }).drop(columns=['id'])
    df['timestamp'] = df['timestamp'].astype('int')
    if end_unix is not None:
        # The query has no upper bound, so the last page may overshoot.
        df = df.loc[df['timestamp'] < end_unix]
    df = df.set_index(['timestamp', 'pair_id']).sort_index()
    return df

def fetch_all_pair_hour_datas(client, pair_ids=None, hour_start_unix=None, end_unix=None):
    """Fetch hourly data for all ``pair_ids`` in batches of 100 ids.

    Each batch goes through ``do_fetch_all_pair_hour_datas`` with
    ``raise_error=False``; batches that return ``None`` are skipped.

    Args:
        client: GraphQL client forwarded to ``do_fetch_all_pair_hour_datas``.
        pair_ids: sequence of pair ids; ``None`` is treated as empty.
        hour_start_unix: forwarded to ``do_fetch_all_pair_hour_datas``.
        end_unix: forwarded to ``do_fetch_all_pair_hour_datas``.

    Returns:
        DataFrame indexed by ``(timestamp, pair_id)``, sorted by index. When
        no batch yields data, an empty frame with that same index is
        returned (``pd.concat`` on an empty list would raise ``ValueError``).
    """
    batch_size = 100
    if pair_ids is None:
        pair_ids = []

    dfs = []
    for i in range(0, len(pair_ids), batch_size):
        df = do_fetch_all_pair_hour_datas(
            client,
            pair_ids=pair_ids[i:i + batch_size],  # slicing clamps at the end
            raise_error=False,
            hour_start_unix=hour_start_unix,
            end_unix=end_unix
        )
        if df is None:
            continue
        dfs.append(df)

    if not dfs:
        empty_index = pd.MultiIndex.from_arrays([[], []], names=['timestamp', 'pair_id'])
        return pd.DataFrame(index=empty_index)

    return pd.concat(dfs).sort_index()

# Build a GraphQL client for the Uniswap V2 subgraph endpoint.
transport = RequestsHTTPTransport(
    url="https://api.thegraph.com/subgraphs/name/uniswap/uniswap-v2",
)
client = Client(transport=transport, fetch_schema_from_transport=True)

# Reload the pair table cached by the earlier cell. Unpickling executes
# arbitrary code, so only read files this notebook wrote itself.
df_pair = pd.read_pickle('/tmp/df_pair.pkl')

# Fetch 24 hours of hourly data for every known pair id.
pair_ids = df_pair.index.unique().tolist()
pair_ids.sort()
df_hour = fetch_all_pair_hour_datas(
    client,
    pair_ids=pair_ids,
    hour_start_unix=1668000000,
    end_unix=1668000000 + 3600 * 24,
)

print('index duplicated count {}'.format(df_hour.index.duplicated(keep='last').sum()))
df_hour.to_pickle('/tmp/df_hour.pkl')
display(df_hour)

pair_ids hour_start_unix end_unix 0x002c755bc6b99f61d59a76047a13d6be41bb8fd1 0x18e229df96ca1e86bd0b90681effc3f76cbfedd3 1668000000 1668086400
len 490
pair_ids hour_start_unix end_unix 0x192ecdb363fe120c65f8a58f775a018dc3478b6c 0x3443a6df1a0c78f11f09773c7ff9a31e7a05a20d 1668000000 1668086400


KeyboardInterrupt: 

In [19]:
from google.cloud import bigquery

class HourFetcher:
    """Fetcher that returns hourly pair data from the Uniswap V2 subgraph.

    Attributes set in ``__init__``:
        keys: an empty dict (not read anywhere in this class; presumably
            consumed by whatever loads the pickled fetcher -- TODO confirm).
        data_id: the string ``'univ2_hour'``.
    """

    def __init__(self):
        self.keys = {}
        self.data_id = 'univ2_hour'

    def fetch(self, last_timestamp=None):
        """Fetch up to 24 hours of hourly data after ``last_timestamp``.

        Args:
            last_timestamp: last timestamp already fetched; the window
                starts at ``last_timestamp + 1``. ``None`` starts from 1.

        Returns:
            DataFrame indexed by ``(timestamp, pair_id)`` without the rows
            of the newest hour in the window. Returned unchanged when empty.
        """
        transport = RequestsHTTPTransport(url="https://api.thegraph.com/subgraphs/name/uniswap/uniswap-v2")
        client = Client(transport=transport, fetch_schema_from_transport=True)

        pair_ids = self._get_pair_ids()
        start_unix = 1 if last_timestamp is None else last_timestamp + 1
        # NOTE(review): the window is always 24h from start_unix. If it holds
        # no data (e.g. last_timestamp=None gives [1, 86401)), the result is
        # empty and the caller's last_timestamp presumably never advances.
        end_unix = start_unix + 24 * 3600
        df = fetch_all_pair_hour_datas(client, pair_ids, start_unix, end_unix)

        if df.empty:
            return df

        # 'timestamp' is an index level of the returned frame, not a column,
        # so df['timestamp'] would raise KeyError.
        timestamps = df.index.get_level_values('timestamp')
        df = df.loc[timestamps < timestamps.max()]  # remove partial

        return df

    def _get_pair_ids(self):
        """Return the sorted ``id`` values of the ``univ2_pair`` BigQuery table.

        The project comes from the ``GC_PROJECT_ID`` environment variable and
        the dataset from ``ALPHAPOOL_DATASET``.
        """
        project_id = os.getenv('GC_PROJECT_ID')
        dataset_name = os.getenv('ALPHAPOOL_DATASET')
        table_id = f'{dataset_name}.univ2_pair'

        client = bigquery.Client(project=project_id)
        query = f'SELECT id FROM `{table_id}`'
        query_job = client.query(query)

        # Iterating the job yields the result rows.
        return sorted(row['id'] for row in query_job)

In [13]:
# Try HourFetcher with an incremental fetch and display the result.
fetcher = HourFetcher()
# The variant below (last_timestamp=None, i.e. start from 1) is left disabled:
# df = fetcher.fetch(last_timestamp=None)
# display(df)
# (x // 3600) * 3600 rounds the timestamp down to a multiple of 3600 seconds.
df = fetcher.fetch(last_timestamp=(1668000000 // 3600) * 3600)
display(df)

DefaultCredentialsError: Could not automatically determine credentials. Please set GOOGLE_APPLICATION_CREDENTIALS or explicitly create credentials and re-run the application. For more information, please see https://cloud.google.com/docs/authentication/getting-started

In [20]:
# Serialize both fetchers with cloudpickle, compress the bytes with lzma and
# write the result to disk.
fetchers = [PairFetcher(), HourFetcher()]

data = lzma.compress(cloudpickle.dumps(fetchers))
with open('/home/jovyan/data/20221114_univ2.xz', 'wb') as f:
    f.write(data)