# Resource usage - exploratory analysis

#### Maria Silva, April 2025

## 1. Imports and settings

Let's start by importing the necessary libraries and setting up some directories and files.


In [None]:
import os
import sys
import duckdb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation/runtime warnings raised by the
# libraries imported above — confirm that a blanket "ignore" is intended.
warnings.filterwarnings("ignore")

In [None]:
# Global seaborn theme: white grid, "Set2" palette, plus two matplotlib
# overrides (figure DPI and axes title size) applied to every later figure.
theme_rc_overrides = {"figure.dpi": 500, "axes.titlesize": 15}
sns.set_theme(style="whitegrid", palette="Set2", rc=theme_rc_overrides)

In [None]:
# Key locations, all derived from the directory the notebook is run from.
current_path = os.getcwd()
# The repository root is the parent of the working directory.
repo_dir = os.path.abspath(os.path.join(current_path, os.pardir))
src_dir, data_dir = (os.path.join(repo_dir, name) for name in ("src", "data"))
# Glob pattern: "file.parquet" inside any direct sub-directory of
# data/aggregated_opcodes_v3.
op_files_dir = os.path.join(data_dir, "aggregated_opcodes_v3", "*", "file.parquet")

In [None]:
# import internal packages
# src_dir is appended to sys.path first so that the two project modules
# imported below can be found by the import system.
sys.path.append(src_dir)
from data.gas_cost import compute_component_gas_costs_per_tx
from resource_gas_split import compute_resource_gas_cost_per_tx

## 2. Load data

Now, let's load the data. We will start by loading the opcodes used for blocks 22000000 to 22001999.

In [None]:
# Load the opcode parquet files with duckdb, keeping only the block range
# named in the WHERE clause.
query = f"""
SELECT *
FROM read_parquet(
    '{op_files_dir}', 
    hive_partitioning=True, 
    union_by_name=True
    )
WHERE block_height BETWEEN 22000000 AND 22001999;
"""
# Use the connection as a context manager so it is closed as soon as the
# result has been materialised as a pandas DataFrame.
with duckdb.connect() as con:
    agg_trace_df = con.execute(query).fetchdf()
# Drop columns
agg_trace_df = agg_trace_df.drop(columns=["block_range"])
# Clean up repeated opcodes: every opcode starting with DUP / SWAP is
# collapsed onto the bare prefix.
for prefix in ("DUP", "SWAP"):
    agg_trace_df["op"] = np.where(
        agg_trace_df["op"].str.startswith(prefix), prefix, agg_trace_df["op"]
    )
# Same for PUSH, except that PUSH0 is kept as its own opcode.
is_numbered_push = (agg_trace_df["op"].str.startswith("PUSH")) & (
    agg_trace_df["op"] != "PUSH0"
)
agg_trace_df["op"] = np.where(is_numbered_push, "PUSH", agg_trace_df["op"])
# Compute total gas cost for opcode: unit gas cost times the pair count
agg_trace_df["op_total_gas_cost"] = (
    agg_trace_df["op_gas_cost"] * agg_trace_df["op_gas_pair_count"]
)
# Print info
agg_trace_df.info()

In [None]:
# Sanity check: smallest and largest block height actually present in the
# loaded traces.
agg_trace_df["block_height"].agg(["min", "max"])

Now, we can load the gas used per transaction.

In [None]:
# Per-transaction gas usage
tx_gas_path = os.path.join(data_dir, "tx_gas_usage_22000000_22006000.parquet")
tx_gas_info_df = pd.read_parquet(tx_gas_path)
# Make sure we don't keep more blocks than agg_trace_df contains
block_heights = agg_trace_df["block_height"].unique().tolist()
in_trace_blocks = tx_gas_info_df["block_height"].isin(block_heights)
tx_gas_info_df = tx_gas_info_df[in_trace_blocks]
# Print info
tx_gas_info_df.info()

In [None]:
# Available gas for opcodes
# NOTE(review): the file name suggests it only covers blocks
# 22000000-22000004 — confirm it spans the whole analysed range.
avail_gas_path = os.path.join(
    data_dir, "avail_gas_for_opcodes_22000000_22000004.parquet"
)
avail_gas_df = pd.read_parquet(avail_gas_path)
# Make sure we don't keep more blocks than agg_trace_df contains
block_heights = agg_trace_df["block_height"].unique().tolist()
in_trace_blocks = avail_gas_df["block_height"].isin(block_heights)
avail_gas_df = avail_gas_df[in_trace_blocks]
# Print info
avail_gas_df.info()

## 3. Process data

In [None]:
# Compute gas costs by component
# The helper receives the aggregated opcode traces, the per-transaction gas
# usage and the available-gas table, in that order.
comp_df = compute_component_gas_costs_per_tx(agg_trace_df, tx_gas_info_df, avail_gas_df)
# Summarise the resulting frame (columns, dtypes, non-null counts)
comp_df.info()

In [None]:
# How many with positive intrinsic costs?
# Fraction of comp_df rows whose intrinsic access cost is strictly positive.
# The mean of the boolean mask is that fraction; it is computed vectorised
# instead of iterating the Series with the builtin sum(). float() keeps the
# displayed value a plain Python float. An empty frame gives NaN rather than
# raising ZeroDivisionError.
float((comp_df["intrinsic_access_cost"] > 0).mean())

In [None]:
# Compute gas cost by resource
# Failed transactions are unassigned: their hashes are collected here and
# handed to the helper as a separate argument.
failed_mask = ~tx_gas_info_df["is_success"]
fail_txs = tx_gas_info_df.loc[failed_mask, "tx_hash"].values.tolist()
# Share of failed transactions among all loaded transactions
print("Failed txs:", len(fail_txs)/len(tx_gas_info_df))

gas_by_resource_df = compute_resource_gas_cost_per_tx(
    agg_trace_df, tx_gas_info_df, comp_df, fail_txs
)
gas_by_resource_df.head()

In [None]:
# How much gas is unassigned?
# Total gas = sum over all columns except the identifiers and the
# "State (exc. Refunds)" column, first per transaction and then overall.
excluded_cols = ["tx_hash", "block_height", "State (exc. Refunds)"]
gas_per_tx = gas_by_resource_df.drop(columns=excluded_cols).sum(axis=1)
total_gas = gas_per_tx.sum()

# Absolute unassigned gas, then its share of the total
unassigned_gas = gas_by_resource_df["Unassigned"].sum()
print(unassigned_gas)
print(unassigned_gas / total_gas)

In [None]:
# Long format for plotting: one row per (block_height, tx_hash, Resource)
# with the gas value in "gas_cost", ordered by resource name.
long_df = pd.melt(
    gas_by_resource_df,
    id_vars=["block_height", "tx_hash"],
    var_name="Resource",
    value_name="gas_cost",
)
melt_gas_by_resource_df = long_df.sort_values("Resource")

melt_gas_by_resource_df.head()

## 4. Analyze gas by resource

In [None]:
# Transactions with unassigned gas
# Total gas = sum over all columns except the identifiers and the
# "State (exc. Refunds)" column.
excluded_cols = ["tx_hash", "block_height", "State (exc. Refunds)"]
total_gas = gas_by_resource_df.drop(columns=excluded_cols).sum(axis=1).sum()
unassigned_gas = gas_by_resource_df["Unassigned"].sum()

# Share of transactions with any unassigned gas: mean of the boolean mask,
# computed vectorised rather than with the builtin sum() over the Series.
print((gas_by_resource_df["Unassigned"] > 0).mean())
# Absolute unassigned gas, then its share of the total
print(unassigned_gas)
print(unassigned_gas / total_gas)

In [None]:
# How many with positive intrinsic costs?
# Fraction of comp_df rows whose intrinsic access cost is strictly positive.
# The mean of the boolean mask is that fraction; it is computed vectorised
# instead of iterating the Series with the builtin sum(). float() keeps the
# displayed value a plain Python float. An empty frame gives NaN rather than
# raising ZeroDivisionError.
float((comp_df["intrinsic_access_cost"] > 0).mean())

In [None]:
# Box plot of per-transaction gas for each resource; the "Unassigned" rows
# are left out and outlier markers are hidden.
assigned_rows = melt_gas_by_resource_df["Resource"] != "Unassigned"
assigned_gas_df = melt_gas_by_resource_df[assigned_rows]

plt.figure(figsize=(10, 4))
sns.boxplot(
    data=assigned_gas_df,
    x="gas_cost",
    y="Resource",
    hue="Resource",
    legend=False,
    showfliers=False,
)
# NOTE(review): the title says blocks start at 22000020 while the load query
# starts at 22000000 — confirm which range the data actually covers.
plot_title = "Distribution of transaction gas spent by resource from blocks 22000020 to 22001999 \n (excluding outliers)"
plt.title(plot_title, pad=25)
plt.xlabel("Transaction gas cost in gas units")
plt.ylabel("")
plt.tight_layout()
plt.show()

In [None]:
# Gas per (block, resource) in millions of gas units, leaving out the
# "State (exc. Refunds)" series; block heights are shifted to start at 0.
kept_rows = melt_gas_by_resource_df["Resource"] != "State (exc. Refunds)"
gas_per_block_resource = (
    melt_gas_by_resource_df[kept_rows]
    .groupby(["block_height", "Resource"])["gas_cost"]
    .sum()
)
temp_df = (
    (gas_per_block_resource / 1_000_000)
    .reset_index()
    .sort_values("Resource")
    .assign(block_height=lambda df: df["block_height"] - 22000000)
)

# Stacked, gas-weighted histogram with one bar per 10 blocks
hist_options = {
    "x": "block_height",
    "weights": "gas_cost",
    "hue": "Resource",
    "multiple": "stack",
    "binwidth": 10,
    "alpha": 1.0,
}
plt.figure(figsize=(10, 4))
ax = sns.histplot(temp_df, **hist_options)
plt.title(
    "Resource contribution to total gas used (grouped by every 10 blocks)",
    pad=25,
)
plt.xlabel("Block height (starting at 22000000)")
plt.ylabel("Total gas units (Millions)")
# Reference line at 180 (millions); presumably 10 blocks per bin x 18M gas
# target per block — TODO confirm.
plt.axhline(y=180, color='grey', linestyle='--', label='target block size')
# Put the legend outside the axes, anchored to the top-right corner
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.show()

In [None]:
# Total gas per resource in millions of gas units, leaving out the
# "State (exc. Refunds)" series.
kept_rows = melt_gas_by_resource_df["Resource"] != "State (exc. Refunds)"
gas_per_resource = (
    melt_gas_by_resource_df[kept_rows].groupby(["Resource"])["gas_cost"].sum()
)
temp_df = gas_per_resource / 1_000_000

# Largest first: absolute totals, then each resource's share of the total
ranked_totals = temp_df.sort_values(ascending=False)
print(ranked_totals)
print(ranked_totals/temp_df.sum())

In [None]:
# Correlation between the per-transaction gas columns; block height,
# "Unassigned" and "State (exc. Refunds)" are left out.
dropped_cols = ["block_height", "Unassigned", "State (exc. Refunds)"]
temp_df = gas_by_resource_df.set_index("tx_hash").drop(columns=dropped_cols)
corr_mat = temp_df.corr()

# Heatmap with the colour scale fixed to [0, 1]
heatmap_options = {"cmap": "YlGnBu", "linewidths": 0.5, "vmin": 0, "vmax": 1}
plt.figure(figsize=(4.5, 4.5))
ax = sns.heatmap(corr_mat, **heatmap_options)
plt.title("Resource gas usage correlation", pad=25)
plt.tight_layout()
plt.show()

## 5. Analyze block utilization

In [None]:
# Gas per block and resource: drop the transaction hash and the
# "State (exc. Refunds)" column, then sum every remaining column by block.
per_tx_resource_df = gas_by_resource_df.drop(
    columns=["tx_hash", "State (exc. Refunds)"]
)
totals_df = per_tx_resource_df.groupby("block_height").sum()
totals_df.head()

In [None]:
# Gas limit and gas target per block used by every metering model below.
BLOCK_GAS_LIMIT = 36_000_000
BLOCK_GAS_TARGET = 18_000_000


def _finalize_meter_df(meter_df, meter_name):
    """Add the limit/target metrics and the model label to a metering frame.

    Parameters
    ----------
    meter_df : pd.DataFrame
        Frame indexed by block height that already has a ``usage`` column.
    meter_name : str
        Label stored in the ``meter`` column.

    Returns
    -------
    pd.DataFrame
        Copy of ``meter_df`` with ``perc_over_limit`` (usage divided by the
        gas limit), ``utilization`` (usage relative to the gas target) and
        ``meter`` added, and the index moved into a regular column.
    """
    out = meter_df.copy()
    out["perc_over_limit"] = out["usage"] / BLOCK_GAS_LIMIT
    out["utilization"] = (out["usage"] - BLOCK_GAS_TARGET) / BLOCK_GAS_TARGET
    out["meter"] = meter_name
    return out.reset_index()


# Per-block totals reused by all three models
total_usage = totals_df.sum(axis=1)
state_history_usage = totals_df["State"] + totals_df["History"]

# One dimensional metering: usage is the sum over every resource
one_dim_df = _finalize_meter_df(total_usage.to_frame("usage"), "Unidimensional")

# Two dimensional - State & History vs. others: usage is the larger of the two
two_dim_df = pd.DataFrame(
    {
        "state_history": state_history_usage,
        "others": total_usage - totals_df["State"] - totals_df["History"],
    }
)
two_dim_df["usage"] = two_dim_df.max(axis=1)
two_dim_df = _finalize_meter_df(two_dim_df, "State/history vs. Others")

# Three dimensional - State & History vs. Storage Access vs. others:
# usage is the largest of the three
three_dim_df = pd.DataFrame(
    {
        "state_history": state_history_usage,
        "access": totals_df["Access"],
        "others": total_usage
        - totals_df["State"]
        - totals_df["History"]
        - totals_df["Access"],
    }
)
three_dim_df["usage"] = three_dim_df.max(axis=1)
three_dim_df = _finalize_meter_df(three_dim_df, "State/history vs. Access vs. Others")

# Aggregate all dataframes into one long frame with a shared set of columns
meter_cols = ["block_height", "meter", "usage", "perc_over_limit", "utilization"]
util_melt_df = pd.concat(
    [frame[meter_cols] for frame in (one_dim_df, two_dim_df, three_dim_df)],
    ignore_index=True,
)
util_melt_df.head()

In [None]:
# Relative change in per-block usage when going from the one-dimensional to
# the two-dimensional meter (mean and median over blocks). Both frames had
# their index reset, so rows are matched by position.
((two_dim_df["usage"]-one_dim_df["usage"])/one_dim_df["usage"]).agg(["mean", "median"])

In [None]:
# Relative change in per-block usage when going from the two-dimensional to
# the three-dimensional meter (mean and median over blocks). Both frames had
# their index reset, so rows are matched by position.
((three_dim_df["usage"]-two_dim_df["usage"])/two_dim_df["usage"]).agg(["mean", "median"])

In [None]:
# Distribution of usage / block limit, one box per metering model
box_options = {
    "data": util_melt_df,
    "x": "perc_over_limit",
    "y": "meter",
    "hue": "meter",
    "legend": False,
}
plt.figure(figsize=(10, 3))
sns.boxplot(**box_options)
# Reference line at 0.5; presumably the block target as a share of the
# block limit — TODO confirm.
plt.axvline(x=0.5, color='grey', linestyle='--')
plt.title("Distribution of block utilization rate per metering model (blocks 22000020 to 22001999)", pad=25)
plt.xlabel("Block resource utilization over block limit")
plt.ylabel("")
plt.tight_layout()
plt.show()

In [None]:
# Distribution of the "utilization" column, one box per metering model
box_options = {
    "data": util_melt_df,
    "x": "utilization",
    "y": "meter",
    "hue": "meter",
    "legend": False,
}
plt.figure(figsize=(10, 3))
sns.boxplot(**box_options)
plt.title("Block utilization per metering model (blocks 22000020 to 22001999)", pad=25)
plt.xlabel("Block utilization rate (block gas as a rate of block target)")
plt.ylabel("")
plt.tight_layout()
plt.show()

In [None]:
# Plot-ready copy: friendlier hue label, usage in millions of gas units and
# block heights shifted to start at 0.
temp_df = util_melt_df.rename(columns={"meter": "Metering model"}).assign(
    usage=lambda df: df["usage"] / 1_000_000,
    block_height=lambda df: df["block_height"] - 22000000,
)

# Usage-weighted histogram, one bar per model side by side for every 20 blocks
hist_options = {
    "x": "block_height",
    "weights": "usage",
    "hue": "Metering model",
    "multiple": "dodge",
    "binwidth": 20,
    "alpha": 1.0,
}
plt.figure(figsize=(10, 4))
ax = sns.histplot(temp_df, **hist_options)
plt.title(
    "Block gas utilization per metering model (grouped by every 20 blocks)",
    pad=25,
)
plt.xlabel("Block height (starting at 22000000)")
plt.ylabel("Total gas units (Millions)")
# Reference line at 360 (millions); presumably 20 blocks per bin x 18M gas
# target per block — TODO confirm.
plt.axhline(y=180*2, color='grey', linestyle='--', label='target block size')

plt.tight_layout()
plt.show()