# Data collection - gathering available gas for opcode usage

#### Maria Silva, April 2025

## 1. Imports and settings


In [1]:
import os
import sys
import duckdb
import pandas as pd
from tqdm import tqdm

In [2]:
# Main directories and files
current_path = os.getcwd()
repo_dir = os.path.abspath(os.path.join(current_path, ".."))
data_dir = os.path.join(repo_dir, "data")
src_dir = os.path.join(repo_dir, "src")
raw_traces_dir = os.path.join(data_dir, "raw_trace_data")

In [3]:
# import internal packages
sys.path.append(src_dir)
from data.path_mng import get_parquet_path_patterns, chunks

## 2. Define and run query on raw trace data

In [4]:
# Get list of raw trace traces and partition
raw_data_dir = os.path.join(data_dir, "raw_trace_data")
block_dirs = get_parquet_path_patterns(raw_data_dir)
block_dirs_chunks = list(chunks(block_dirs, n=5))
con = duckdb.connect()
# Run query by partition
df = pd.DataFrame()
for dirs_chunk in tqdm(block_dirs_chunks):
    query = f"""
    SELECT block_height, tx_hash, FIRST(gas) AS first_avail_gas, LAST(gas) AS last_avail_gas
    FROM read_parquet(
            { dirs_chunk },
            hive_partitioning = TRUE,
            union_by_name=True
            )
    GROUP BY block_height, tx_hash;
    """
    query_df = con.execute(query).fetchdf()
    df = pd.concat([df, query_df])
# Print info
df.info()

100%|██████████| 400/400 [03:08<00:00,  2.12it/s]

<class 'pandas.core.frame.DataFrame'>
Index: 290953 entries, 0 to 392
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   block_height     290953 non-null  int64 
 1   tx_hash          290953 non-null  object
 2   first_avail_gas  290953 non-null  int64 
 3   last_avail_gas   290953 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 11.1+ MB





In [5]:
# Define output file
start_block = df["block_height"].min()
end_block = df["block_height"].max()
file_dir = os.path.join(data_dir, f"avail_gas_for_opcodes_{start_block}_{end_block}.parquet")
# Save as parquet
df.to_parquet(file_dir, index=False)