# Data collection - gathering available gas for opcode usage

#### Maria Silva, April 2025

## 1. Imports and settings


In [9]:
import os
import duckdb

In [10]:
# Project locations, all derived from the notebook's working directory
current_path = os.getcwd()
# Parent of the working directory, used as the repository root
repo_dir = os.path.abspath(os.path.join(current_path, os.pardir))
# Data folder and the raw trace data folder nested inside it
data_dir = os.path.join(repo_dir, "data")
raw_traces_dir = os.path.join(data_dir, "raw_trace_data")

## 2. Define and run query on raw trace data

In [11]:
# Define query
# The glob matches every "file.parquet" sitting two directory levels below
# the raw traces folder.
filename = os.path.join(raw_traces_dir, "*", "*", "file.parquet")
# For each (block_height, tx_hash) pair, keep the largest `gas` value seen
# in the trace rows and expose it as `tx_avail_gas`.
query = f"""
SELECT block_height, tx_hash, MAX(gas) AS tx_avail_gas
FROM read_parquet(
    '{filename}', 
    hive_partitioning=True, 
    filename = True,
    union_by_name=True
    )
GROUP BY block_height, tx_hash;
"""
# Run query and fetch to DataFrame
# The connection is used as a context manager so it is closed as soon as the
# result has been materialised, instead of lingering in the kernel.
with duckdb.connect() as con:
    df = con.execute(query).fetchdf()
# Print info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 645 entries, 0 to 644
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   block_height  645 non-null    int64 
 1   tx_hash       645 non-null    object
 2   tx_avail_gas  645 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 15.2+ KB


In [12]:
# Define output file
# Guard against an empty query result: min()/max() on an empty column return
# NaN, which would silently produce a file named "..._nan_nan.parquet".
if df.empty:
    raise ValueError("Query returned no rows; refusing to write an empty output file.")
# The block range covered by the data is embedded in the output file name.
start_block = df["block_height"].min()
end_block = df["block_height"].max()
file_dir = os.path.join(data_dir, f"avail_gas_for_opcodes_{start_block}_{end_block}.parquet")
# Save as parquet (the DataFrame index is not written)
df.to_parquet(file_dir, index=False)