# Data download - gas usage by transaction

#### Maria Silva, March 2025

## 1. Imports and settings

Let's start by importing the necessary libraries and setting up some directories and files.


In [1]:
import os
import json
from urllib.parse import quote

import pandas as pd
from sqlalchemy import text, create_engine

In [2]:
# Locations used throughout the notebook:
#   current_path - the working directory the kernel was started in
#   repo_dir     - its parent directory, as an absolute path
#   data_dir     - the "data" folder directly under repo_dir
current_path = os.getcwd()
repo_dir = os.path.abspath(
    os.path.join(current_path, "..")
)
data_dir = os.path.join(
    repo_dir,
    "data",
)

## 2. Query clickhouse

In [3]:
# Secrets for accessing xatu clickhouse and erigon.
# The file is read explicitly as UTF-8 (the standard JSON encoding) so that
# credentials containing non-ASCII characters are decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open(os.path.join(repo_dir, "secrets.json"), "r", encoding="utf-8") as file:
    secrets_dict = json.load(file)

# Block ranges to query (both ends are included: the query uses BETWEEN)
start_block = 22_000_000
end_block = 22_006_000

# Credentials for xatu clickhouse; a missing key raises KeyError here,
# before any connection attempt is made
xatu_user = secrets_dict["xatu_username"]
xatu_pass = secrets_dict["xatu_password"]

In [4]:
# One row per transaction in the requested block range, ordered by block and
# by position inside the block. tx_input_data_cost is computed in SQL as
# 4 per zero input byte plus 16 per non-zero input byte, and
# is_contract_creation flags transactions whose to_address is NULL.
# All values are passed as bound parameters, never interpolated into the SQL.
query = text(
    """
    SELECT 
        block_number AS block_height, 
        transaction_hash, 
        gas_used, 
        4 * n_input_zero_bytes + 16 * n_input_nonzero_bytes AS tx_input_data_cost,
        to_address IS NULL AS is_contract_creation
    FROM default.canonical_execution_transaction
    WHERE block_number BETWEEN toUInt64(:start_block) AND toUInt64(:end_block)
            AND meta_network_name = :network
    ORDER BY block_number ASC, transaction_index ASC
"""
)

# Percent-encode the credentials before placing them in the URL: characters
# such as "@", ":", "/", "+" or "%" in a username/password would otherwise be
# parsed as URL structure and yield a broken or wrong connection string.
# safe="" makes quote() escape "/" as well.
encoded_user = quote(str(xatu_user), safe="")
encoded_pass = quote(str(xatu_pass), safe="")
db_url = (
    f"clickhouse+http://{encoded_user}:{encoded_pass}"
    "@clickhouse.xatu.ethpandaops.io:443/default?protocol=https"
)
engine = create_engine(db_url)

# The connection is deliberately left open: query_result is consumed by the
# cell that builds the DataFrame, which needs the connection to still be alive.
connection = engine.connect()
query_result = connection.execute(
    query,
    {"start_block": start_block, "end_block": end_block, "network": "mainnet"},
)

## 3. Save data as parquet

In [5]:
# Materialise the query result and release the database connection.
# Column names are taken from the result metadata instead of being inferred
# from the first row, so an empty result still yields a frame with the
# expected columns. The connection is closed even if fetching fails, since
# nothing after this point needs the database.
try:
    column_names = list(query_result.keys())
    rows = query_result.fetchall()
finally:
    connection.close()

df = pd.DataFrame(rows, columns=column_names)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 937679 entries, 0 to 937678
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   block_height          937679 non-null  int64 
 1   transaction_hash      937679 non-null  object
 2   gas_used              937679 non-null  int64 
 3   tx_input_data_cost    937679 non-null  int64 
 4   is_contract_creation  937679 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 35.8+ MB


In [6]:
# Sanity check: smallest and largest block height present in the download,
# to compare against the requested start_block / end_block
df["block_height"].aggregate(["min", "max"])

min    22000000
max    22006000
Name: block_height, dtype: int64

In [7]:
# Make sure the output folder exists (it may be absent on a fresh checkout);
# exist_ok=True keeps this a no-op when the folder is already there.
os.makedirs(data_dir, exist_ok=True)

# Despite its name, file_dir is the full path of the parquet file; the block
# range is embedded in the file name so different downloads do not collide.
file_dir = os.path.join(data_dir, f"tx_gas_usage_{start_block}_{end_block}.parquet")
df.to_parquet(file_dir, index=False)