In [2]:
# Get the active SparkSession (or create one) and bind it to `spark`, the name
# the other cells in this notebook call. The last expression makes the cell
# display the address of the Spark web UI.
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.uiWebUrl

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/05 08:09:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/05 08:09:31 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


'http://jupyter-pyspark-7984fd7cf7-hxv9f:4041'

In [None]:
# Destructive reset, intentionally left commented out: uncomment to drop
# bronze.kafka_ingestion_state before re-creating it with the CREATE TABLE
# cell that follows.
# spark.sql("drop table bronze.kafka_ingestion_state")

DataFrame[]

In [6]:
# Bronze layer: ingestion state / checkpoint table.
#
# Column groups (the SQL COMMENT strings are in Chinese):
#   - checkpoint_block: latest block height successfully processed and committed
#   - producer_pod_*:   name and UID of the pod that ran the ingestion
#   - run_*:            id, mode, start block and start time (UTC) of the run
#   - kafka_*:          key, topic, partition, offset, timestamp and date
#   - state_updated_at: when the state record was written to Iceberg
#
# Iceberg format-version 2, partitioned by kafka_key. Because of
# IF NOT EXISTS, re-running the cell does nothing once the table exists.
kafka_ingestion_state_ddl = """
CREATE TABLE IF NOT EXISTS bronze.kafka_ingestion_state (
    -- ========= 核心 checkpoint =========
    checkpoint_block        BIGINT      COMMENT '已成功处理并提交的最新区块高度',

    -- ========= 运行实例(Pod) =========
    producer_pod_name       STRING      COMMENT '执行 ingestion 的 Pod 名称',
    producer_pod_uid        STRING      COMMENT '执行 ingestion 的 Pod UID(唯一实例标识)',

    -- ========= 本次运行上下文 =========
    run_id                  STRING      COMMENT '本次 ingestion 运行的唯一 ID',
    run_mode                STRING      COMMENT '运行模式(realtime / checkpoint_resume / backfill / chain_head_resume)',
    run_start_block         BIGINT      COMMENT '本次运行开始处理的区块高度',
    run_started_at          TIMESTAMP   COMMENT '本次运行启动时间(UTC)',
    
    -- ========= Kafka 元数据 =========
    kafka_key           STRING      COMMENT 'Kafka key',
    kafka_topic         STRING      COMMENT 'Kafka topic',
    kafka_partition     INT         COMMENT 'Kafka partition',
    kafka_offset        BIGINT      COMMENT 'Kafka offset(全局顺序基准)',
    kafka_timestamp     TIMESTAMP   COMMENT 'Kafka 消息时间',
    kafka_date          DATE        COMMENT 'Kafka 消息日期',
    
    -- ========= 状态记录元信息 =========
    state_updated_at        TIMESTAMP   COMMENT '该状态记录写入 Iceberg 的时间'
    
)
USING iceberg
PARTITIONED BY (
    kafka_key
)
TBLPROPERTIES (
    'format-version' = '2',
    'write.metadata.delete-after-commit.enabled' = 'true',
    'write.metadata.previous-versions-max' = '10'
)
"""
spark.sql(kafka_ingestion_state_ddl)

DataFrame[]

In [5]:
# Bronze layer: bsc_mainnet_transactions.
#
# Stores the raw Kafka value (`raw`, a JSON string) together with the block
# height, the writing job name, the Spark Streaming run id and the Kafka
# coordinates of each message (topic, partition, offset, timestamp, date).
# Iceberg format-version 2, partitioned by kafka_date.
bronze_bsc_transactions_ddl = """
  CREATE TABLE IF NOT EXISTS bronze.bsc_mainnet_transactions (
      -- ========= 业务字段 =========
      block_height        BIGINT      COMMENT '区块高度',
      job_name            STRING      COMMENT '写入作业名(backfill / realtime)',
      run_id              STRING      COMMENT 'Spark Streaming runId',   
      raw                 STRING      COMMENT '原始 JSON 日志(Kafka value)',

      -- ========= Kafka 元数据 =========
      kafka_topic         STRING      COMMENT 'Kafka topic',
      kafka_partition     INT         COMMENT 'Kafka partition',
      kafka_offset        BIGINT      COMMENT 'Kafka offset(全局顺序基准)',
      kafka_timestamp     TIMESTAMP   COMMENT 'Kafka 消息时间',
      kafka_date          DATE        COMMENT 'Kafka 消息日期'
  )
  USING iceberg
  PARTITIONED BY (
      kafka_date
  )
  TBLPROPERTIES (
      'format-version' = '2',
      'write.metadata.delete-after-commit.enabled' = 'true',
      'write.metadata.previous-versions-max' = '20'
  )
"""
spark.sql(bronze_bsc_transactions_ddl)

DataFrame[]

In [2]:
# Silver layer: bsc_mainnet_transactions.
#
# Column groups (the SQL COMMENT strings are in Chinese):
#   - block / job metadata: block_height, job_name, run_id
#   - Kafka metadata:       topic, partition, offset, timestamp, date
#   - transaction basics:   hashes, block number, index, nonce, from/to address
#   - value and gas:        value_wei, gas limit/price, EIP-1559 fee fields
#   - type / chain:         tx_type, chain_id
#   - signature:            v, r, s, yParity
#   - input analysis:       full calldata, method id, length, word count,
#                           sha256 hash, proxy-like flag
#   - other analysis:       tx_kind, is_contract_call, fee_model
#
# Iceberg format-version 2, partitioned by kafka_date.
silver_bsc_transactions_ddl = """
CREATE TABLE IF NOT EXISTS silver.bsc_mainnet_transactions (

    -- ========= 区块 / 作业元数据 =========
    block_height        BIGINT      COMMENT '区块高度',
    job_name            STRING      COMMENT '写入作业名(backfill / realtime)',
    run_id              STRING      COMMENT 'Spark Streaming runId',

    -- ========= Kafka 元数据 =========
    kafka_topic         STRING      COMMENT 'Kafka topic',
    kafka_partition     INT         COMMENT 'Kafka partition',
    kafka_offset        BIGINT      COMMENT 'Kafka offset(全局顺序基准)',
    kafka_timestamp     TIMESTAMP   COMMENT 'Kafka 消息时间',
    kafka_date          DATE        COMMENT 'Kafka 消息日期',

    -- ========= 交易基础信息 =========
    tx_hash             STRING      COMMENT '交易哈希',
    block_hash          STRING      COMMENT '区块哈希',
    block_number        BIGINT      COMMENT '区块号',
    tx_index            INT         COMMENT '交易在区块内的索引',
    nonce               BIGINT      COMMENT '发送方交易序号',

    from_address        STRING      COMMENT '发送方地址',
    to_address          STRING      COMMENT '接收方地址(EOA 或合约)',

    -- ========= 交易数值 / Gas =========
    value_wei           DECIMAL(38,0) COMMENT '转账金额(Wei)',
    gas_limit           BIGINT      COMMENT 'Gas limit',
    gas_price           BIGINT      COMMENT 'Gas price(legacy)',
    max_fee_per_gas     BIGINT      COMMENT 'EIP-1559 max fee per gas',
    max_priority_fee_per_gas BIGINT COMMENT 'EIP-1559 priority fee',

    -- ========= 交易类型 / 链信息 =========
    tx_type             STRING      COMMENT '交易类型(legacy / 1559 / 2930)',
    chain_id            INT         COMMENT '链 ID',

    -- ========= 交易签名 =========
    sig_v               STRING      COMMENT '签名参数 v',
    sig_r               STRING      COMMENT '签名参数 r',
    sig_s               STRING      COMMENT '签名参数 s',
    y_parity            STRING      COMMENT 'EIP-1559 yParity',

    -- ========= Input 简化分析字段 =========
    input_data          STRING      COMMENT '原始 input calldata(完整保留)',
    has_input           BOOLEAN     COMMENT '是否包含非空 input',
    method_id           STRING      COMMENT '方法选择器(method_id, 前4字节)',
    input_length        INT         COMMENT 'Input 字符长度',
    input_words         DOUBLE      COMMENT 'Input 参数 word 数量(32 bytes)',
    input_hash          STRING      COMMENT 'Input payload 哈希(sha256)',
    is_proxy_like       BOOLEAN     COMMENT '是否疑似 proxy / delegatecall 模式交易',
    
    -- ========= 其他分析字段 =========
    tx_kind             STRING      COMMENT '行为类型',
    is_contract_call    BOOLEAN     COMMENT '是否调用合约',
    fee_model           STRING      COMMENT 'gas 模型'

)
USING iceberg
PARTITIONED BY (
    kafka_date
)
TBLPROPERTIES (
    'format-version' = '2',
    'write.metadata.delete-after-commit.enabled' = 'true',
    'write.metadata.previous-versions-max' = '20'
)
"""
spark.sql(silver_bsc_transactions_ddl)


26/02/04 05:25:20 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


DataFrame[]

In [2]:
# Bronze layer: bsc_mainnet_logs.
#
# Stores the raw Kafka value (`raw`, a JSON string) with the block height,
# the writing job name, the Spark Streaming run id, the Iceberg insert
# time/date and the Kafka topic, partition, offset and timestamp.
# Iceberg format-version 2, partitioned by inserted_date.
bronze_bsc_logs_ddl = """
  CREATE TABLE IF NOT EXISTS bronze.bsc_mainnet_logs (
      -- ========= 业务字段 =========
      block_height        BIGINT      COMMENT '区块高度',
      job_name            STRING      COMMENT '写入作业名(backfill / realtime)',
      run_id              STRING      COMMENT 'Spark Streaming runId',
      inserted_at         TIMESTAMP   COMMENT '写入 Iceberg 的时间',
      inserted_date       DATE        COMMENT '写入 Iceberg 的日期',
      raw                 STRING      COMMENT '原始 JSON 日志(Kafka value)',

      -- ========= Kafka 元数据(关键) =========
      kafka_topic         STRING      COMMENT 'Kafka topic',
      kafka_partition     INT         COMMENT 'Kafka partition',
      kafka_offset        BIGINT      COMMENT 'Kafka offset(全局顺序基准)',
      kafka_timestamp     TIMESTAMP   COMMENT 'Kafka 消息时间'
  )
  USING iceberg
  PARTITIONED BY (
      inserted_date
  )
  TBLPROPERTIES (
      'format-version' = '2',
      'write.metadata.delete-after-commit.enabled' = 'true',
      'write.metadata.previous-versions-max' = '20'
  )
"""
spark.sql(bronze_bsc_logs_ddl)

DataFrame[]

In [90]:
# Bronze layer: base_mainnet_logs.
#
# Same column layout as the other bronze logs table in this notebook: the raw
# Kafka value (`raw`, a JSON string) plus block height, job name, Spark
# Streaming run id, Iceberg insert time/date and Kafka topic, partition,
# offset and timestamp.
# Iceberg format-version 2, partitioned by inserted_date.
bronze_base_logs_ddl = """
  CREATE TABLE IF NOT EXISTS bronze.base_mainnet_logs (
      -- ========= 业务字段 =========
      block_height        BIGINT      COMMENT '区块高度',
      job_name            STRING      COMMENT '写入作业名(backfill / realtime)',
      run_id              STRING      COMMENT 'Spark Streaming runId',
      inserted_at         TIMESTAMP   COMMENT '写入 Iceberg 的时间',
      inserted_date       DATE        COMMENT '写入 Iceberg 的日期',
      raw                 STRING      COMMENT '原始 JSON 日志(Kafka value)',

      -- ========= Kafka 元数据(关键) =========
      kafka_topic         STRING      COMMENT 'Kafka topic',
      kafka_partition     INT         COMMENT 'Kafka partition',
      kafka_offset        BIGINT      COMMENT 'Kafka offset(全局顺序基准)',
      kafka_timestamp     TIMESTAMP   COMMENT 'Kafka 消息时间'
  )
  USING iceberg
  PARTITIONED BY (
      inserted_date
  )
  TBLPROPERTIES (
      'format-version' = '2',
      'write.metadata.delete-after-commit.enabled' = 'true',
      'write.metadata.previous-versions-max' = '20'
  )
"""
spark.sql(bronze_base_logs_ddl)

DataFrame[]

## Common queries

In [86]:

# Ad-hoc inspection snippets for bronze.base_mainnet_logs.
# Only the last statement is active; uncomment the others as needed.
#
# Drop the table and delete its files.
# NOTE(review): spark.conf.get only reads a setting (with "true" as the
# fallback); it does not change how DROP TABLE behaves. Presumably
# `DROP TABLE ... PURGE` is what removes the data files — confirm against the
# Iceberg version in use.
# spark.conf.get("spark.sql.catalog.spark_catalog.purge", "true")
# spark.sql("""drop table bronze.base_mainnet_logs""").show(truncate=False)
#
# Schema, row count, highest ingested block, 10 most recent snapshot summaries:
# spark.sql("describe table bronze.base_mainnet_logs").show()
# spark.sql("select count(1) from bronze.base_mainnet_logs").show()
# spark.sql("""select max(block_height) from bronze.base_mainnet_logs""").show(truncate=False)
# spark.sql("""SELECT summary FROM bronze.base_mainnet_logs.snapshots ORDER BY committed_at DESC LIMIT 10""").show(truncate=False)

# Show 10 rows with untruncated column values.
preview_sql = """select * from bronze.base_mainnet_logs limit 10"""
spark.sql(preview_sql).show(truncate=False)

+------------+-------------+------------------------------------+-----------------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------+------------

In [46]:
# Largest block height and largest Kafka offset currently stored in
# bronze.base_mainnet_logs, printed without truncation.
# NOTE(review): the offset maximum is taken over the whole table rather than
# per kafka_partition — confirm that is the intended reading.
max_block_and_offset_sql = """
    select 
        max(block_height), 
        max(kafka_offset)
    from bronze.base_mainnet_logs
"""
spark.sql(max_block_and_offset_sql).show(truncate=False)

+-----------------+-----------------+
|max(block_height)|max(kafka_offset)|
+-----------------+-----------------+
|40877456         |67212993         |
+-----------------+-----------------+



In [None]:

# Silver layer: base_mainnet_logs.
#
# Columns (the SQL COMMENT strings are in Chinese): rollback flag (`removed`),
# log index within the block, transaction index within the block, transaction
# hash, block hash, block height, address of the contract that emitted the
# event, ABI-encoded data, and topics (event signature + indexed parameters).
#
# Changes relative to the previous version of this cell, to match the other
# CREATE TABLE cells in this notebook:
#   - no trailing ';' inside the statement passed to spark.sql
#   - the same TBLPROPERTIES as the other tables (format-version 2, bounded
#     metadata history)
# The table is left unpartitioned, as before.
spark.sql("""
CREATE TABLE IF NOT EXISTS silver.base_mainnet_logs (
    removed BOOLEAN COMMENT '是否被回滚',
    log_index BIGINT COMMENT '在区块内的索引',
    transaction_index BIGINT COMMENT '交易在区块内的索引',
    transaction_hash STRING COMMENT '交易哈希',
    block_hash STRING COMMENT '区块哈希',
    block_number BIGINT COMMENT '区块高度',
    address STRING COMMENT '合约地址(触发事件的合约)',
    data STRING COMMENT 'ABI 编码后的数据(通常是 uint256 或 bytes)',
    topics ARRAY<STRING> COMMENT '事件签名 + indexed 参数'
)
USING iceberg
TBLPROPERTIES (
    'format-version' = '2',
    'write.metadata.delete-after-commit.enabled' = 'true',
    'write.metadata.previous-versions-max' = '20'
)
""")

In [106]:
# Streaming query cheat sheet; uncomment what you need.
# (`query` is presumably a StreamingQuery handle created elsewhere — verify.)
# query.stop()          # stop that streaming query
# query.status
# query.lastProgress
# query.isActive
#
# spark.streams.active  # list all active streaming queries

# Emergency switch: set to True to stop every active streaming query in this
# Spark session. It is off by default, so running the cell only lists the
# active queries and the printed label matches what actually happens.
STOP_ALL_STREAMS = False

for q in spark.streams.active:
    if STOP_ALL_STREAMS:
        print("Stopping:", q.name)
        q.stop()
    else:
        print("Active:", q.name)