# Gas Fees Issue

Gas fees are market-based external prices that behave more or less like a random walk. Sample-and-hold (forward fill) on the slower-moving pool is not a good strategy for them: it leads the model to make poor choices about the actual cost of doing business. A better approach is to sample-and-hold `gasUsed` and to use the latest available value for `gasPrice` (if possible).

First I need to import two pools and see if we can come up with a better way...

In [3]:
import os
import re

import pandas as pd

from itertools import combinations

In [4]:
# Change the active path to the parent directory so that the `src.*` imports
# in the next cell resolve.  The move is skipped when a `src` directory is
# already visible from the current directory, so re-running this cell does not
# climb one more level each time.
if not os.path.isdir('src'):
    print("Moving active path to parent directory")
    os.chdir('..')
print(os.getcwd())

Moving active path to parent directory
/Users/das/DATASCI210/arbitrage_playground


In [5]:
import src.arbutils as arbutils
import src.fetch as fetch

In [6]:

# Directory (relative to the active path) that is scanned for pool csv files.
DATA_PATH = "data/"
# API key is read from the environment; it is None when the variable is unset.
GRAPH_API_KEY = os.environ.get("GRAPH_API_KEY")

In [9]:
def find_pool_pairs(thegraph_api_key, location):
    """
    Find pairs of pool csv files in a directory that trade the same two tokens.

    Scans `location` for csv files that follow the naming convention
    pool_id_<address>_swap_final.csv, extracts the address from each name and
    requests the pool metadata through fetch.thegraph_request_pool_metadata.
    Pools are then grouped by their two token symbols (order-independent) and
    every two-pool combination inside a group is reported as a valid pair.

    Files that do not end in .csv are skipped silently; csv files that do not
    follow the naming convention are reported with an "Ignoring ..." message.
    Files are processed in sorted name order so that the pool0/pool1
    assignment is reproducible between runs and machines.

    Args:
        thegraph_api_key: API key forwarded to the metadata request.
        location: directory holding the csv files (with or without a trailing
            path separator).

    Returns:
        A list of dicts of the form {'pool0': pool, 'pool1': pool}, where each
        pool is a dict with the keys 'filename', 'address', 'feeTier',
        'token0', 'token1', 'token0_decimal' and 'token1_decimal'.
    """
    pattern = re.compile(r"pool_id_(.*?)_swap_final\.csv")
    pools = []

    for filename in sorted(os.listdir(location)):
        if not filename.endswith('.csv'):
            continue

        match = pattern.search(filename)
        if not match:
            # ignore this mysterious csv.
            print(f"Ignoring {filename}")
            continue

        address = match.group(1)
        metadata = fetch.thegraph_request_pool_metadata(thegraph_api_key=thegraph_api_key, pool_address=address)
        pools.append({
            # os.path.join keeps the path valid even when `location` has no trailing separator
            'filename': os.path.join(location, filename),
            'address': address,
            'feeTier': int(metadata['feeTier'])*1e-6,
            'token0': metadata['token0']['symbol'],
            'token1': metadata['token1']['symbol'],
            'token0_decimal': metadata['token0']['decimals'],
            'token1_decimal': metadata['token1']['decimals'],
        })

    # Group the pools by token pair (order doesn't matter, so the symbols are sorted).
    pools_by_pair = {}
    for pool in pools:
        pair = tuple(sorted((pool['token0'], pool['token1'])))
        pools_by_pair.setdefault(pair, []).append(pool)

    # Combine the pool records themselves rather than their addresses, so no
    # second lookup (and no placeholder value) is needed to build the result.
    matching_pools = [
        {'pool0': first, 'pool1': second}
        for group in pools_by_pair.values()
        for first, second in combinations(group, 2)
    ]

    print(f"Found {len(matching_pools)} valid pool pairs.")

    return matching_pools

In [10]:
# Discover every pair of csv files in DATA_PATH that share a token pair and list their addresses.
pool_pairs_list = find_pool_pairs(GRAPH_API_KEY, DATA_PATH)
for pool_pair in pool_pairs_list:
    first_address = pool_pair['pool0']['address']
    second_address = pool_pair['pool1']['address']
    print(f"Pair: {first_address}, {second_address}")

Found 1 valid pool pairs.
Pair: 0x88e6a0c2ddd26feeb64f039a2c41296fcb3f5640, 0x8ad599c3a0ff1de082011efddc58f1908eb6e6d8


In [39]:
# Load the swap data and metadata for the first matched pair of pools.
pool0_info = pool_pairs_list[0]['pool0']
pool1_info = pool_pairs_list[0]['pool1']

p0 = pd.read_csv(pool0_info['filename'])
p1 = pd.read_csv(pool1_info['filename'])

p0_fee_tier = pool0_info['feeTier']
p1_fee_tier = pool1_info['feeTier']

# Decimals are taken from pool0 only.
# NOTE(review): assumes both pools list the tokens in the same token0/token1 order - confirm.
token0_decimal = int(pool0_info['token0_decimal'])
token1_decimal = int(pool0_info['token1_decimal'])

In [40]:
# List the columns available in the first pool's csv.
p0.columns

Index(['transactionHash', 'datetime', 'timeStamp', 'sqrtPriceX96',
       'blockNumber', 'gasPrice', 'gasUsed', 'tick', 'amount0', 'amount1',
       'liquidity'],
      dtype='object')

In [41]:
# Take a small random sample of each pool to experiment with.  `timeStamp2`
# duplicates `timeStamp` so each pool keeps its own copy of the event time
# after `timeStamp` is used as a merge key.
sample_columns = ['datetime','timeStamp','blockNumber','gasPrice','gasUsed','sqrtPriceX96','tick']

small_p0 = p0[sample_columns].sample(10).sort_values(by='datetime')
small_p0 = small_p0.assign(timeStamp2=small_p0['timeStamp'])
small_p1 = p1[sample_columns].sample(10).sort_values(by='datetime')
small_p1 = small_p1.assign(timeStamp2=small_p1['timeStamp'])

This is a modified version of the code currently used for merging pools. The solution I'm proposing is to take the gas prices, which are now forward-filled, and use an apply function to correct them...

In [42]:
# NOTE: `pools` is created by the merge cell further down (In [52]); this cell
# (In [42]) relied on a `pools` left over from an earlier execution of that
# cell, so it fails when the notebook is run top to bottom.
pools.dtypes

datetime             object
timeStamp             int64
blockNumber           int64
p0.gasPrice         float64
p0.gasUsed          float64
p0.sqrtPriceX96     float64
p0.tick             float64
p0.timeStamp        float64
p1.gasPrice         float64
p1.gasUsed          float64
p1.sqrtPriceX96     float64
p1.tick             float64
p1.timeStamp        float64
p0.price_usdc        object
p0.gas_fees_usdc     object
p1.price_usdc        object
p1.gas_fees_usdc     object
dtype: object

In [52]:

# Outer-join the two samples on the shared time keys so that every swap from
# either pool gets a row.  Columns present in both frames receive pandas'
# default '_x' (pool 0) / '_y' (pool 1) suffixes.
pools = pd.merge(small_p0, small_p1, on=['datetime','timeStamp','blockNumber'],how='outer')
# Sample and hold: carry each pool's last observed values forward to later rows.
pools = pools.ffill().reset_index(drop=True)

# Rename columns: turn the merge suffixes into 'p0.' / 'p1.' prefixes.  Only a
# trailing suffix is rewritten, so a '_x' / '_y' elsewhere in a name is left alone.
pools = pools.rename(
    columns=lambda col: f"p0.{col[:-2]}" if col.endswith('_x') else
                        f"p1.{col[:-2]}" if col.endswith('_y') else col
)
# timeStamp2 carried each pool's own event time through the merge; restore its name.
pools = pools.rename(columns={'p0.timeStamp2':'p0.timeStamp','p1.timeStamp2':'p1.timeStamp'})

# Price from sqrtPriceX96, adjusted by the token decimals and inverted.
# NOTE(review): the gas fee assumes gasPrice is in wei, so the two 1e9
# divisors together convert gasPrice*gasUsed to whole ETH - confirm.
decimal_scale = 10**(token1_decimal-token0_decimal)
for prefix in ('p0', 'p1'):
    pools[f'{prefix}.price_usdc'] = ((pools[f'{prefix}.sqrtPriceX96'] / 2**96)**2 / decimal_scale) **-1
    pools[f'{prefix}.gas_fees_usdc'] = (pools[f'{prefix}.gasPrice'] / 1e9 )*(pools[f'{prefix}.gasUsed'] / 1e9) * pools[f'{prefix}.price_usdc']

# Drop the leading rows where one of the pools has no observation yet.  After
# the forward fill these are the only rows that can still hold NaN in the price
# columns.  dropna works on values, so it cannot confuse index labels with
# positions the way first_valid_index() followed by iloc does once the frame
# has been sliced.
pools = pools.dropna(subset=['p0.sqrtPriceX96', 'p1.sqrtPriceX96'])

has_nans = pools.isna().any().any()
print("Are there any NaNs in the DataFrame?", has_nans)


pools.head(10).sort_values(by='timeStamp')


Are there any NaNs in the DataFrame? False


Unnamed: 0,datetime,timeStamp,blockNumber,p0.gasPrice,p0.gasUsed,p0.sqrtPriceX96,p0.tick,p0.timeStamp,p1.gasPrice,p1.gasUsed,p1.sqrtPriceX96,p1.tick,p1.timeStamp,p0.price_usdc,p0.gas_fees_usdc,p1.price_usdc,p1.gas_fees_usdc
3,2024-08-09 00:08:23+00:00,1723162103,20487358,3746959000.0,5547294.0,1.530857e+33,197390.0,1723162000.0,188150900000.0,228474.0,1.669306e+33,199121.0,1722824000.0,2678.487858,55.673659,2252.617823,96.834592
4,2024-08-09 13:06:23+00:00,1723208783,20491233,3746959000.0,5547294.0,1.530857e+33,197390.0,1723162000.0,3500671000.0,123127.0,1.546024e+33,197587.0,1723209000.0,2678.487858,55.673659,2626.194928,1.131961
5,2024-08-24 00:12:23+00:00,1724458343,20594803,2125599000.0,120914.0,1.507432e+33,197081.0,1724458000.0,3500671000.0,123127.0,1.546024e+33,197587.0,1723209000.0,2762.383331,0.709973,2626.194928,1.131961
6,2024-08-27 03:15:23+00:00,1724728523,20617171,2125599000.0,120914.0,1.507432e+33,197081.0,1724458000.0,1200000000.0,227544.0,1.531529e+33,197398.0,1724729000.0,2762.383331,0.709973,2676.140078,0.730728
7,2024-08-29 16:24:47+00:00,1724948687,20635414,4105416000.0,114090.0,1.5604880000000001e+33,197773.0,1724949000.0,1200000000.0,227544.0,1.531529e+33,197398.0,1724729000.0,2577.734715,1.207377,2676.140078,0.730728
8,2024-10-01 04:44:59+00:00,1727757899,20868338,4105416000.0,114090.0,1.5604880000000001e+33,197773.0,1724949000.0,39063880000.0,495078.0,1.543282e+33,197551.0,1727758000.0,2577.734715,1.207377,2635.534503,50.970362
9,2024-10-14 09:35:47+00:00,1728898547,20962952,37348230000.0,123695.0,1.57594e+33,197970.0,1728899000.0,39063880000.0,495078.0,1.543282e+33,197551.0,1727758000.0,2527.432345,11.676205,2635.534503,50.970362
10,2024-10-18 06:21:47+00:00,1729232507,20990624,37348230000.0,123695.0,1.57594e+33,197970.0,1728899000.0,47360000000.0,396005.0,1.546711e+33,197596.0,1729233000.0,2527.432345,11.676205,2623.861993,49.209999
11,2024-10-26 00:04:47+00:00,1729901087,21046100,16388520000.0,282009.0,1.607451e+33,198366.0,1729901000.0,47360000000.0,396005.0,1.546711e+33,197596.0,1729233000.0,2429.31422,11.227587,2623.861993,49.209999
12,2024-11-06 18:59:59+00:00,1730919599,21130558,13320680000.0,352738.0,1.530873e+33,197390.0,1730920000.0,47360000000.0,396005.0,1.546711e+33,197596.0,1729233000.0,2678.432436,12.585175,2623.861993,49.209999


In [53]:
import numpy as np

def gas_price_correction(row):
    """
    Replace the stale pool's forward-filled gas price with the fresher one.

    The issue with gas fees is that they are always changing, unlike the pool
    price, which holds unless there is a swap or mint/burn.

    Additionally, gas fees are independent of a specific pool: the gasPrice is
    driven by market forces and the gasUsed is a property of the smart
    contract (i.e. how much it takes to execute the contract).

    This is a correction for the forward fill to support better estimates of
    the gas price for inference.  Whichever pool has the larger timestamp was
    observed more recently, so its gasPrice is copied to the other pool and
    that pool's gas_fees_usdc is recomputed from its own gasUsed and
    price_usdc.  On equal timestamps pool 1's gasPrice is used for pool 0.

    Args:
        row: one row of the merged pools frame, providing the 'p0.*' and
            'p1.*' entries timeStamp, gasPrice, gasUsed, price_usdc and
            gas_fees_usdc.

    Returns:
        A corrected copy of `row`; the row passed in is left untouched, since
        functions given to DataFrame.apply should not mutate their argument.
    """
    corrected = row.copy()

    # if pool 0 has the larger timestamp, it happened more recently.
    # pick its gas price for both pools...
    if int(row['p0.timeStamp']) > int(row['p1.timeStamp']):
        fresh, stale = 'p0', 'p1'
    else:
        fresh, stale = 'p1', 'p0'

    corrected[f'{stale}.gasPrice'] = row[f'{fresh}.gasPrice']
    corrected[f'{stale}.gas_fees_usdc'] = (corrected[f'{stale}.gasPrice'] / 1e9 )*(row[f'{stale}.gasUsed'] / 1e9) * row[f'{stale}.price_usdc']

    return corrected
    
# Run the correction row by row.  The result is only displayed here; it is not
# assigned back to `pools`, so later cells would need to capture it themselves.
pools.apply(gas_price_correction,axis=1)

Unnamed: 0,datetime,timeStamp,blockNumber,p0.gasPrice,p0.gasUsed,p0.sqrtPriceX96,p0.tick,p0.timeStamp,p1.gasPrice,p1.gasUsed,p1.sqrtPriceX96,p1.tick,p1.timeStamp,p0.price_usdc,p0.gas_fees_usdc,p1.price_usdc,p1.gas_fees_usdc
3,2024-08-09 00:08:23+00:00,1723162103,20487358,3746959000.0,5547294.0,1.530857e+33,197390.0,1723162000.0,3746959000.0,228474.0,1.669306e+33,199121.0,1722824000.0,2678.487858,55.673659,2252.617823,1.928427
4,2024-08-09 13:06:23+00:00,1723208783,20491233,3500671000.0,5547294.0,1.530857e+33,197390.0,1723162000.0,3500671000.0,123127.0,1.546024e+33,197587.0,1723209000.0,2678.487858,52.014225,2626.194928,1.131961
5,2024-08-24 00:12:23+00:00,1724458343,20594803,2125599000.0,120914.0,1.507432e+33,197081.0,1724458000.0,2125599000.0,123127.0,1.546024e+33,197587.0,1723209000.0,2762.383331,0.709973,2626.194928,0.687324
6,2024-08-27 03:15:23+00:00,1724728523,20617171,1200000000.0,120914.0,1.507432e+33,197081.0,1724458000.0,1200000000.0,227544.0,1.531529e+33,197398.0,1724729000.0,2762.383331,0.400813,2676.140078,0.730728
7,2024-08-29 16:24:47+00:00,1724948687,20635414,4105416000.0,114090.0,1.5604880000000001e+33,197773.0,1724949000.0,4105416000.0,227544.0,1.531529e+33,197398.0,1724729000.0,2577.734715,1.207377,2676.140078,2.49995
8,2024-10-01 04:44:59+00:00,1727757899,20868338,39063880000.0,114090.0,1.5604880000000001e+33,197773.0,1724949000.0,39063880000.0,495078.0,1.543282e+33,197551.0,1727758000.0,2577.734715,11.488443,2635.534503,50.970362
9,2024-10-14 09:35:47+00:00,1728898547,20962952,37348230000.0,123695.0,1.57594e+33,197970.0,1728899000.0,37348230000.0,495078.0,1.543282e+33,197551.0,1727758000.0,2527.432345,11.676205,2635.534503,48.73179
10,2024-10-18 06:21:47+00:00,1729232507,20990624,47360000000.0,123695.0,1.57594e+33,197970.0,1728899000.0,47360000000.0,396005.0,1.546711e+33,197596.0,1729233000.0,2527.432345,14.806192,2623.861993,49.209999
11,2024-10-26 00:04:47+00:00,1729901087,21046100,16388520000.0,282009.0,1.607451e+33,198366.0,1729901000.0,16388520000.0,396005.0,1.546711e+33,197596.0,1729233000.0,2429.31422,11.227587,2623.861993,17.028697
12,2024-11-06 18:59:59+00:00,1730919599,21130558,13320680000.0,352738.0,1.530873e+33,197390.0,1730920000.0,13320680000.0,396005.0,1.546711e+33,197596.0,1729233000.0,2678.432436,12.585175,2623.861993,13.841016


501