In [7]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.functions import col, expr

spark = SparkSession.builder.getOrCreate()


# Pipeline configuration. These names are read by the streaming cells below
# (KAFKA_BROKER, BLOCKS_TOPIC and avro_schema), so they must be defined for the
# notebook to survive Restart Kernel -> Run All.
SCHEMA_REGISTRY_URL = "http://redpanda.kafka.svc:8081"
BLOCKS_TOPIC = "blockchain.logs.base"
KAFKA_BROKER = "redpanda.kafka.svc:9092"
TABLE_NAME = "kafka_base_mainnet_logs"
CHECKPOINT_PATH = f"s3a://datalake/_checkpoints/{TABLE_NAME}"
SUBJECT = f"{BLOCKS_TOPIC}-value"

# Fetch the latest registered Avro schema (a JSON string) for the topic's value subject.
# The timeout keeps the cell from hanging if the registry is unreachable, and
# raise_for_status() surfaces HTTP errors instead of a confusing KeyError on "schema".
schema_response = requests.get(
    f"{SCHEMA_REGISTRY_URL}/subjects/{SUBJECT}/versions/latest",
    timeout=10,
)
schema_response.raise_for_status()
avro_schema = schema_response.json()["schema"]

In [85]:
# Print the name and current status of every streaming query still active in this session.
for active_query in spark.streams.active:
    print(active_query.name, active_query.status)

In [13]:
# Streaming Kafka source for the topic named by BLOCKS_TOPIC.
df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BROKER)
    .option("subscribe", BLOCKS_TOPIC)
    # Start from the newest offsets only.
    .option("startingOffsets", "latest")
    # Cap the size of each micro-batch to avoid OOM.
    .option("maxOffsetsPerTrigger", 2000)
    .load()
)

# Keep every source column and add the payload with its first 5 bytes removed
# (substring is 1-based, so byte 6 is the first byte kept).
df_stripped = df.select(
    "*",
    expr("substring(value, 6, length(value)-5)").alias("value_no_header"),
)

# Decode the remaining bytes with the Avro schema, then flatten the decoded
# struct so each top-level field becomes its own column.
df_parsed = df_stripped.select(
    from_avro(col("value_no_header"), avro_schema, options={"mode": "PERMISSIVE"}).alias("r")
).select("r.*")

df_parsed.printSchema()

root
 |-- block_height: long (nullable = true)
 |-- job_name: string (nullable = true)
 |-- run_id: string (nullable = true)
 |-- inserted_at: string (nullable = true)
 |-- raw: string (nullable = true)



In [None]:
from pyspark.sql.types import (
    StructType,
    StructField,
    BooleanType,
    LongType,
    StringType,
    ArrayType,
)

# Layout of one JSON-encoded event log as carried in the `raw` column.
# Every field is declared nullable.
# NOTE(review): the normalization cell treats blockTimestamp as a 0x-prefixed hex
# string; if logIndex / transactionIndex / blockNumber arrive hex-encoded as well,
# they would not parse as LongType -- confirm against a sample payload.
raw_logs_schema = (
    StructType()
    .add("removed", BooleanType(), True)              # whether the log was rolled back
    .add("logIndex", LongType(), True)                # index of the log within the block
    .add("transactionIndex", LongType(), True)        # index of the transaction within the block
    .add("transactionHash", StringType(), True)       # transaction hash
    .add("blockHash", StringType(), True)             # block hash
    .add("blockNumber", LongType(), True)             # block height
    .add("blockTimestamp", StringType(), True)        # block timestamp; kept as a string for now
    .add("address", StringType(), True)               # contract address
    .add("data", StringType(), True)                  # ABI-encoded data
    .add("topics", ArrayType(StringType()), True)     # event topics
)

from pyspark.sql.functions import from_json, col, to_timestamp

# Parse the JSON text in `raw` into a struct column named `logs_raw`.
parsed_log = from_json(col("raw"), raw_logs_schema)
df_struct = df_parsed.withColumn("logs_raw", parsed_log)

In [32]:
# normalization
from pyspark.sql.functions import col, conv, from_unixtime, regexp_replace, timestamp_seconds

# Flatten the parsed log struct into snake_case columns and carry the ingestion
# metadata (job_name, run_id, inserted_at) through unchanged.
df_normalized = df_struct.select(
    col("logs_raw.blockNumber").alias("block_number"),
    col("logs_raw.transactionHash").alias("transaction_hash"),
    col("logs_raw.logIndex").alias("log_index"),
    col("logs_raw.removed").alias("removed"),
    col("logs_raw.transactionIndex").alias("transaction_index"),
    col("logs_raw.blockHash").alias("block_hash"),
    col("logs_raw.address").alias("address"),
    col("logs_raw.data").alias("data"),
    col("logs_raw.topics").alias("topics"),

    # hex timestamp -> long -> timestamp
    # Strip the "0x" prefix, convert base 16 -> base 10 (conv returns a string),
    # cast to long and build a real TimestampType value from the epoch seconds.
    # from_unixtime() was not used here because it returns a formatted *string*
    # rendered in the session time zone, not a timestamp.
    timestamp_seconds(
        conv(regexp_replace(col("logs_raw.blockTimestamp"), "^0x", ""), 16, 10).cast("long")
    ).alias("block_timestamp"),
    col("job_name"),
    col("run_id"),
    col("inserted_at")
)

In [36]:
# Stop the console debug stream left over from a previous run of the cell below.
# Guarded because this cell sits above the one that defines debug_query: on a
# fresh kernel the name does not exist yet and an unguarded call raises NameError.
if "debug_query" in globals():
    debug_query.stop()

In [None]:
# Console sink for eyeballing the normalized stream during development.
DEBUG_RUN_SECONDS = 60  # how long the debug stream may print before it is stopped

debug_query = (
    df_normalized
    .writeStream
    .format("console")
    .outputMode("append")
    .option("truncate", False)
    .option("numRows", 1)   # print at most 1 row per micro-batch
    .start()
)

try:
    # Bounded wait: an unbounded awaitTermination() blocks the notebook until the
    # kernel is interrupted by hand, and no later cell can run in the meantime.
    debug_query.awaitTermination(DEBUG_RUN_SECONDS)
finally:
    # Always release the stream, including when the wait is interrupted or fails.
    debug_query.stop()

26/01/16 06:05:23 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-17cdf2a3-5b58-49bd-a1cd-fb57f53b94b4. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
26/01/16 06:05:23 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


-------------------------------------------
Batch: 0
-------------------------------------------
+------------+----------------+---------+-------+-----------------+----------+-------+----+------+---------------+--------+------+-----------+
|block_number|transaction_hash|log_index|removed|transaction_index|block_hash|address|data|topics|block_timestamp|job_name|run_id|inserted_at|
+------------+----------------+---------+-------+-----------------+----------+-------+----+------+---------------+--------+------+-----------+
+------------+----------------+---------+-------+-----------------+----------+-------+----+------+---------------+--------+------+-----------+

-------------------------------------------
Batch: 1
-------------------------------------------
+------------+----------------------------------------------------------------+---------+-------+-----------------+----------------------------------------------------------------+------------------------------------------+----------

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

-------------------------------------------
Batch: 3
-------------------------------------------
+------------+----------------------------------------------------------------+---------+-------+-----------------+----------------------------------------------------------------+------------------------------------------+----------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-------------+------------------------------------+------------------------+
|block_number|transaction_hash                                                |log_index|removed|transaction_index|block_hash                                                      |address                                   |data                                                            |topics                             

In [None]:
# ===============================
# 2. Define the Avro schemas
# ===============================
# Both schemas are JSON strings handed to from_avro() by the Kafka reader cell below.

# Record "BlockEvent": block_height (long) plus four string fields
# (job_name, run_id, inserted_at, raw).
blocks_avro_schema = """
{
  "type":"record",
  "name":"BlockEvent",
  "namespace":"platform.ingestion.blocks",
  "fields":[
    {"name":"block_height","type":"long"},
    {"name":"job_name","type":"string"},
    {"name":"run_id","type":"string"},
    {"name":"inserted_at","type":"string"},
    {"name":"raw","type":"string"}
  ]
}
"""

# Record "IngestionState": job/run identifiers, a nested "BlockRange" record
# (start/end as longs), a long checkpoint, an "IngestionStatus" enum
# (running / stopped / completed / failed) and an inserted_at string.
state_avro_schema = """
{
  "type":"record",
  "name":"IngestionState",
  "namespace":"platform.ingestion.state",
  "fields":[
    {"name":"job_name","type":"string"},
    {"name":"run_id","type":"string"},
    {"name":"range","type":{
        "type":"record",
        "name":"BlockRange",
        "fields":[
            {"name":"start","type":"long"},
            {"name":"end","type":"long"}
        ]
    }},
    {"name":"checkpoint","type":"long"},
    {"name":"status","type":{"type":"enum","name":"IngestionStatus","symbols":["running","stopped","completed","failed"]}},
    {"name":"inserted_at","type":"string"}
  ]
}
"""

ImportError: cannot import name 'from_avro' from 'pyspark.sql.functions' (/opt/conda/lib/python3.11/site-packages/pyspark/sql/functions.py)

In [None]:
# ===============================
# 3. Read Avro data from Kafka
# ===============================
def _read_avro_topic(topic, avro_schema_json):
    """Stream a Kafka topic from its earliest offset and decode each value as Avro.

    Args:
        topic: Name of the Kafka topic to subscribe to.
        avro_schema_json: Avro schema, as a JSON string, describing the payload.

    Returns:
        A streaming DataFrame with one column per top-level field of the
        decoded Avro record.
    """
    return (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", kafka_bootstrap)
        .option("subscribe", topic)
        .option("startingOffsets", "earliest")
        .load()
        # Drop the first 5 bytes of each value before decoding, exactly as the
        # logs pipeline earlier in this notebook does ahead of from_avro().
        # NOTE(review): presumably a schema-registry framing header (magic byte +
        # 4-byte schema id) -- confirm these topics' producers use the same framing.
        .selectExpr("substring(value, 6, length(value) - 5) AS avro_bytes")
        .select(
            from_avro(
                col("avro_bytes"),
                avro_schema_json,
                # NOTE(review): "schema.registry.url" is kept from the original
                # code; confirm this Spark build's from_avro() actually uses it.
                {"schema.registry.url": schema_registry},
            ).alias("data")
        )
        .select("data.*")
    )


blocks_df = _read_avro_topic("blockchain.blocks.eth.mainnet", blocks_avro_schema)

state_df = _read_avro_topic("blockchain.ingestion-state.eth.mainnet", state_avro_schema)

In [None]:
# ===============================
# 4. Write to Iceberg (exactly-once)
# ===============================
# toTable() starts each streaming query; progress is tracked in its checkpointLocation.
blocks_query = (
    blocks_df.writeStream
    .format("iceberg")
    .outputMode("append")
    .option("checkpointLocation", "/data/checkpoints/blocks_eth_mainnet")
    .toTable("iceberg.eth_blocks")
)

# "complete" output mode is only accepted for queries that contain a streaming
# aggregation; state_df has none, so starting it in that mode fails with an
# AnalysisException. Append every state event instead.
# NOTE(review): the source was described as a compacted topic. If the table should
# hold only the latest state per key, replace this with a foreachBatch MERGE INTO
# -- confirm the intended key first.
state_query = (
    state_df.writeStream
    .format("iceberg")
    .outputMode("append")
    .option("checkpointLocation", "/data/checkpoints/state_eth_mainnet")
    .toTable("iceberg.eth_ingestion_state")
)

In [None]:
# ===============================
# 5. Wait for the streams
# ===============================
# Waiting on the queries one after the other would block on blocks_query alone,
# so a failure of state_query would go unnoticed for as long as blocks_query runs.
# Poll both with a short timeout instead: awaitTermination(timeout) returns True
# once the query has terminated and re-raises the query's exception if it failed.
POLL_INTERVAL_SECONDS = 5

pending_queries = [blocks_query, state_query]
while pending_queries:
    pending_queries = [
        query
        for query in pending_queries
        if not query.awaitTermination(POLL_INTERVAL_SECONDS)
    ]