In [None]:
import json
import os
import time
import uuid
from collections.abc import Mapping
from datetime import datetime, timezone

from confluent_kafka import Producer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.avro import AvroSerializer
from confluent_kafka.serialization import SerializationContext, MessageField
from hexbytes import HexBytes
from web3 import Web3
from web3.datastructures import AttributeDict
from web3.middleware import ExtraDataToPOAMiddleware

from src.kafka_state import load_last_state

# -----------------------------
# Environment Variables
# -----------------------------
# NOTE(review): the fallback is a placeholder, not a usable key; confirm that
# INFURA_API_KEY is set in every environment where this notebook runs.
INFURA_API_KEY = os.getenv("INFURA_API_KEY", "<YOUR-API-KEY>")
INFURA_BASE_URL = "https://bsc-mainnet.infura.io/v3"
BSC_RPC_URL = f"{INFURA_BASE_URL}/{INFURA_API_KEY}"
# One identifier per run; a fresh UUID is generated when RUN_ID is not set.
RUN_ID = os.getenv("RUN_ID", str(uuid.uuid4()))

# Number of blocks handled per polling iteration (env var: BATCH_SIZE).
POLL_BATCH_SIZE = int(os.getenv("BATCH_SIZE", "5"))
# Alias: fetch_and_push in a later cell reads the bare name BATCH_SIZE, which
# was previously undefined and raised NameError.
BATCH_SIZE = POLL_BATCH_SIZE
# Seconds to sleep between polling iterations.
POLL_INTERVAL = int(os.getenv("POLL_INTERVAL", "5"))
BATCH_TX_SIZE = 10  # Max 10 transactions per batch

# -----------------------------
# Config
# -----------------------------
JOB_NAME = "bsc_realtime"
TRANSACTIONAL_ID = f"blockchain.ingestion.bsc.{JOB_NAME}"
KAFKA_BROKER = "redpanda.kafka.svc:9092"
SCHEMA_REGISTRY_URL = "http://redpanda.kafka.svc:8081"
BLOCKS_TOPIC = "blockchain.logs.bsc"
STATE_TOPIC = "blockchain.state.bsc"
# NOTE(review): reads the same BATCH_SIZE env var as POLL_BATCH_SIZE but with a
# different default (50 vs 5); confirm whether a separate variable was intended.
PULL_BATCH_SIZE = int(os.getenv("BATCH_SIZE", "50"))

In [None]:
def current_utctime():
    """Return the current UTC time as an ISO-8601 string with millisecond precision.

    The result has the shape ``YYYY-MM-DDTHH:MM:SS.mmmZ``; microseconds are
    truncated (not rounded) to milliseconds.
    """
    now = datetime.now(timezone.utc)
    millis = now.microsecond // 1000
    return f"{now:%Y-%m-%dT%H:%M:%S}.{millis:03d}Z"

In [None]:
# -----------------------------
# JSON safe serialization
# -----------------------------
def to_json_safe(obj):
    """Recursively convert ``obj`` into a structure ``json.dumps`` can encode.

    Conversions applied:
      * ``bytes`` / ``bytearray`` (which includes ``HexBytes``, a ``bytes``
        subclass) become the string returned by their own ``.hex()`` method.
      * Any mapping (``dict``, web3 ``AttributeDict``, ...) becomes a plain
        ``dict`` whose values are converted recursively; keys are kept as-is.
      * Lists and tuples become lists whose items are converted recursively.
        Tuples were previously returned untouched, so byte values nested in
        them reached ``json.dumps`` unconverted.
      * Everything else is returned unchanged.

    Args:
        obj: Arbitrary value, typically a web3 response object.

    Returns:
        A structure made of dicts, lists and the untouched leaf values.
    """
    if isinstance(obj, (bytes, bytearray)):
        # Dispatches to the object's own hex(), so HexBytes keeps its formatting.
        return obj.hex()
    if isinstance(obj, Mapping):
        return {k: to_json_safe(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [to_json_safe(v) for v in obj]
    return obj

In [9]:
# -----------------------------
# Schema Registry
# -----------------------------
# Client for the registry reachable at SCHEMA_REGISTRY_URL.
schema_registry = SchemaRegistryClient(dict(url=SCHEMA_REGISTRY_URL))

# --- Avro schemas (pulled from the registry) ---
# The latest registered version of each "<topic>-value" subject is used.
blocks_value_schema = schema_registry.get_latest_version(BLOCKS_TOPIC + "-value").schema.schema_str
state_value_schema = schema_registry.get_latest_version(STATE_TOPIC + "-value").schema.schema_str

# -----------------------------
# Serializers
# -----------------------------
# One Avro value serializer per topic, each bound to the schema string fetched above.
blocks_value_serializer = AvroSerializer(schema_registry, blocks_value_schema)
state_value_serializer = AvroSerializer(schema_registry, state_value_schema)

In [10]:
# -----------------------------
# Web3 initialization
# -----------------------------
# HTTP JSON-RPC client pointed at BSC_RPC_URL; shared by the cells below.
w3 = Web3(Web3.HTTPProvider(BSC_RPC_URL))
# Inject the POA extra-data middleware at layer 0 of the middleware onion.
# NOTE(review): presumably required because BSC block headers do not fit the
# default extraData validation - confirm against the web3.py middleware docs.
w3.middleware_onion.inject(ExtraDataToPOAMiddleware, layer=0)

def fetch_block_logs(block_number):
    """Return the logs of a single block, fetched through the module-level ``w3`` client.

    Args:
        block_number: Block whose logs are requested; it is used as both the
            lower and the upper bound of the log filter.

    Returns:
        Whatever ``w3.eth.get_logs`` returns for that one-block filter.
    """
    single_block_filter = {"fromBlock": block_number, "toBlock": block_number}
    return w3.eth.get_logs(single_block_filter)

# logs = fetch_block_logs(74934686)
# len(logs)

In [11]:
# -----------------------------
# delivery report for producer callback
# -----------------------------
def delivery_report(err, msg):
    """Producer delivery callback: print the outcome of one message.

    Args:
        err: Delivery error, or ``None`` when the message was delivered.
        msg: The message object; only consulted on success, where its
            ``topic()``, ``partition()`` and ``offset()`` are printed.
    """
    if err is None:
        destination = f"{msg.topic()} [{msg.partition()}] @ {msg.offset()}"
        print(f"‚úÖ Delivered to {destination}")
        return
    print(f"‚ùå Delivery failed: {err}")

# -----------------------------
# Producer initialization
# -----------------------------
# Idempotent, transactional producer shared by the cells below; every message
# is produced with delivery_report as its callback.
producer = Producer({
    "bootstrap.servers": KAFKA_BROKER,
    "enable.idempotence": True,
    "acks": "all",
    "retries": 3,
    "linger.ms": 5,
    # Fixed id derived from JOB_NAME (see TRANSACTIONAL_ID).
    "transactional.id": TRANSACTIONAL_ID
})

# Register the transactional id with the broker before any begin_transaction().
# NOTE(review): re-running this cell builds a new producer with the same
# transactional.id - confirm that fencing the previous instance is acceptable.
print("üîß Initializing Kafka transactions...")
producer.init_transactions()

üîß Initializing Kafka transactions...


In [None]:
# -----------------------------
# Main function (Kafka State + Exactly-once, batched splitting of logs)
# -----------------------------

def fetch_and_push():
    """Poll BSC for new blocks and publish their logs to Kafka transactionally.

    Resumes from the checkpoint returned by ``load_last_state(JOB_NAME)``; when
    that checkpoint is ``None`` processing starts at the current chain head.
    Each loop iteration handles up to ``POLL_BATCH_SIZE`` blocks inside one
    Kafka transaction: every log becomes one record on ``BLOCKS_TOPIC`` and a
    single "running" state record carrying the new checkpoint is written to
    ``STATE_TOPIC``, so logs and checkpoint are committed or aborted together.

    On any error the open transaction is aborted, a "failed" state record is
    written in a transaction of its own (the producer is transactional, so it
    cannot produce outside one), and the original exception is re-raised.

    NOTE(review): a later cell defines ``fetch_and_push`` again; whichever cell
    was executed last is the definition in effect.

    Returns:
        Never returns normally; the loop runs until an exception propagates.

    Raises:
        Exception: whatever the RPC, serialization or Kafka call raised.
    """
    last_state = load_last_state(JOB_NAME)
    last_block = last_state["checkpoint"]
    if last_block is None:
        # No stored checkpoint: begin with the current head block.
        last_block = w3.eth.block_number - 1

    print(f"‚ñ∂Ô∏è Starting from block {last_block + 1}", flush=True)

    while True:
        latest_block = w3.eth.block_number
        if last_block >= latest_block:
            # Nothing new on chain yet.
            time.sleep(POLL_INTERVAL)
            continue

        batch_end = min(last_block + POLL_BATCH_SIZE, latest_block)

        try:
            # Kafka transaction covering the logs and the checkpoint.
            producer.begin_transaction()

            # 1) produce logs (each block's logs are sent in slices of BATCH_TX_SIZE)
            for bn in range(last_block + 1, batch_end + 1):
                block_logs = fetch_block_logs(bn)
                if block_logs is None:
                    raise RuntimeError(f"block logs {bn} fetch failed")

                block_logs_safe = to_json_safe(block_logs)  # Convert to serializable
                if isinstance(block_logs_safe, dict):
                    transactions = block_logs_safe.get("transactions", [block_logs_safe])
                elif isinstance(block_logs_safe, list):
                    transactions = block_logs_safe
                else:
                    raise RuntimeError(f"Unexpected type for block_logs: {type(block_logs_safe)}")

                total_tx = len(transactions)
                for start_idx in range(0, total_tx, BATCH_TX_SIZE):
                    end_idx = min(start_idx + BATCH_TX_SIZE, total_tx)
                    batch_tx = transactions[start_idx:end_idx]

                    # idx is the position within the whole block, not the slice.
                    for idx, tx in enumerate(batch_tx, start=start_idx):
                        tx_record = {
                            "block_height": bn,
                            "job_name": JOB_NAME,
                            "run_id": RUN_ID,
                            "inserted_at": current_utctime(),
                            "raw": json.dumps(tx),
                            "tx_index": idx
                        }

                        producer.produce(
                            topic=BLOCKS_TOPIC,
                            key=f"{bn}-{idx}",
                            value=blocks_value_serializer(
                                tx_record,
                                SerializationContext(BLOCKS_TOPIC, MessageField.VALUE)
                            ),
                            on_delivery=delivery_report,
                        )
                    print(f"sending txs {start_idx}-{end_idx-1} for block {bn} to {BLOCKS_TOPIC}")
                    producer.poll(0)  # Poll once per batch to free memory

            # 2) produce state (single write); the range covers exactly the
            # blocks handled in this transaction: last_block + 1 .. batch_end.
            state_record = {
                "job_name": JOB_NAME,
                "run_id": RUN_ID,
                "range": {
                    "start": last_block + 1,
                    "end": batch_end
                },
                "checkpoint": batch_end,
                "status": "running",
                "inserted_at": current_utctime()
            }

            producer.produce(
                STATE_TOPIC,
                key=JOB_NAME,
                value=state_value_serializer(
                    state_record,
                    SerializationContext(STATE_TOPIC, MessageField.VALUE)
                ),
                on_delivery=delivery_report,
            )

            producer.poll(0)
            # commit transaction (logs + state)
            producer.commit_transaction()

            last_block = batch_end
            print(f"‚úÖ committed blocks up to {last_block}", flush=True)

        except Exception as e:
            print(f"üî• transaction failed, aborting: {e}", flush=True)
            try:
                producer.abort_transaction()
            except Exception as abort_err:
                print(f"Abort transaction failed: {abort_err}")

            # last_block is still the last committed block, so it is the
            # checkpoint to resume from; the failed attempt covered
            # last_block + 1 .. batch_end.
            failed_state = {
                "job_name": JOB_NAME,
                "run_id": RUN_ID,
                "range": {
                    "start": last_block + 1,
                    "end": batch_end
                },
                "checkpoint": last_block,
                "status": "failed",
                "inserted_at": current_utctime()
            }

            # Best effort: write the failed status in its own transaction and
            # never let a problem here hide the original error.
            try:
                producer.begin_transaction()
                producer.produce(
                    STATE_TOPIC,
                    key=JOB_NAME,
                    value=state_value_serializer(
                        failed_state,
                        SerializationContext(STATE_TOPIC, MessageField.VALUE)
                    ),
                    on_delivery=delivery_report,
                )
                producer.commit_transaction()
            except Exception as state_err:
                print(f"Failed-state write failed: {state_err}", flush=True)
                try:
                    producer.abort_transaction()
                except Exception as abort_err:
                    print(f"Abort transaction failed: {abort_err}")

            raise

        time.sleep(POLL_INTERVAL)


# Entrypoint: start the polling loop when this code runs as the main module.
# fetch_and_push() only stops when an exception propagates, so this call blocks
# the cell until it fails or is interrupted.
if __name__ == "__main__":
    fetch_and_push()

‚ñ∂Ô∏è Starting from block 74944739
sending txs 0-9 for block 74944739 to blockchain.logs.bsc
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10939
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10940
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10941
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10942
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10943
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10944
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10945
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10946
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10947
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10948
sending txs 10-19 for block 74944739 to blockchain.logs.bsc
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10949
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10950
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10951
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10952
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10953
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10954
‚úÖ Delivered to blockchain.logs.bsc [0] @ 10955
‚úÖ Delivered

KeyboardInterrupt: 

In [None]:
# -----------------------------
# Main function 
# - Kafka State + Exactly-once, batched splitting of logs
# -----------------------------

def fetch_and_push():
    """Poll BSC for new blocks and publish their logs to Kafka transactionally.

    Resumes from the checkpoint returned by ``load_last_state(JOB_NAME)``; when
    that checkpoint is ``None`` processing starts at the current chain head.
    Each loop iteration handles up to ``POLL_BATCH_SIZE`` blocks inside one
    Kafka transaction: every log becomes one record on ``BLOCKS_TOPIC`` and a
    single "running" state record carrying the new checkpoint is written to
    ``STATE_TOPIC``, so logs and checkpoint are committed or aborted together.

    On any error the open transaction is aborted, a "failed" state record is
    written in a transaction of its own (the producer is transactional, so it
    cannot produce outside one), and the original exception is re-raised.

    NOTE(review): an earlier cell defines ``fetch_and_push`` as well; running
    this cell replaces that definition.

    Returns:
        Never returns normally; the loop runs until an exception propagates.

    Raises:
        Exception: whatever the RPC, serialization or Kafka call raised.
    """
    last_state = load_last_state(JOB_NAME)
    last_block = last_state["checkpoint"]
    if last_block is None:
        # No stored checkpoint: begin with the current head block.
        last_block = w3.eth.block_number - 1

    print(f"‚ñ∂Ô∏è Starting from block {last_block + 1}", flush=True)

    while True:
        latest_block = w3.eth.block_number
        if last_block >= latest_block:
            # Nothing new on chain yet.
            time.sleep(POLL_INTERVAL)
            continue

        # POLL_BATCH_SIZE is the configured blocks-per-iteration limit; the
        # bare name BATCH_SIZE used here before raised NameError.
        batch_end = min(last_block + POLL_BATCH_SIZE, latest_block)

        try:
            # Kafka transaction covering the logs and the checkpoint.
            producer.begin_transaction()

            # 1) produce logs (each block's logs are sent in slices of BATCH_TX_SIZE)
            for bn in range(last_block + 1, batch_end + 1):
                block_logs = fetch_block_logs(bn)
                if block_logs is None:
                    raise RuntimeError(f"block logs {bn} fetch failed")

                block_logs_safe = to_json_safe(block_logs)  # Convert to serializable
                if isinstance(block_logs_safe, dict):
                    transactions = block_logs_safe.get("transactions", [block_logs_safe])
                elif isinstance(block_logs_safe, list):
                    transactions = block_logs_safe
                else:
                    raise RuntimeError(f"Unexpected type for block_logs: {type(block_logs_safe)}")

                total_tx = len(transactions)
                for start_idx in range(0, total_tx, BATCH_TX_SIZE):
                    end_idx = min(start_idx + BATCH_TX_SIZE, total_tx)
                    batch_tx = transactions[start_idx:end_idx]

                    # idx is the position within the whole block, not the slice.
                    for idx, tx in enumerate(batch_tx, start=start_idx):
                        tx_record = {
                            "block_height": bn,
                            "job_name": JOB_NAME,
                            "run_id": RUN_ID,
                            "inserted_at": current_utctime(),
                            "raw": json.dumps(tx),
                            "tx_index": idx
                        }

                        producer.produce(
                            topic=BLOCKS_TOPIC,
                            key=f"{bn}-{idx}",
                            value=blocks_value_serializer(
                                tx_record,
                                SerializationContext(BLOCKS_TOPIC, MessageField.VALUE)
                            ),
                            on_delivery=delivery_report,
                        )

                    producer.poll(0)  # Poll once per batch to free memory

                # One progress line per block. It is derived from total_tx so a
                # block without logs no longer touches unbound or stale slice
                # indices.
                if total_tx:
                    print(f"sending txs 0-{total_tx - 1} for block {bn} to {BLOCKS_TOPIC}")
                else:
                    print(f"no logs for block {bn}")

            # 2) produce state (single write); the range covers exactly the
            # blocks handled in this transaction: last_block + 1 .. batch_end.
            state_record = {
                "job_name": JOB_NAME,
                "run_id": RUN_ID,
                "range": {
                    "start": last_block + 1,
                    "end": batch_end
                },
                "checkpoint": batch_end,
                "status": "running",
                "inserted_at": current_utctime()
            }

            producer.produce(
                STATE_TOPIC,
                key=JOB_NAME,
                value=state_value_serializer(
                    state_record,
                    SerializationContext(STATE_TOPIC, MessageField.VALUE)
                ),
                on_delivery=delivery_report,
            )

            producer.poll(0)
            # commit transaction (logs + state)
            producer.commit_transaction()

            last_block = batch_end
            print(f"‚úÖ committed blocks up to {last_block}", flush=True)

        except Exception as e:
            print(f"üî• transaction failed, aborting: {e}", flush=True)
            try:
                producer.abort_transaction()
            except Exception as abort_err:
                print(f"Abort transaction failed: {abort_err}")

            # last_block is still the last committed block, so it is the
            # checkpoint to resume from; the failed attempt covered
            # last_block + 1 .. batch_end.
            failed_state = {
                "job_name": JOB_NAME,
                "run_id": RUN_ID,
                "range": {
                    "start": last_block + 1,
                    "end": batch_end
                },
                "checkpoint": last_block,
                "status": "failed",
                "inserted_at": current_utctime()
            }

            # Best effort: write the failed status in its own transaction and
            # never let a problem here hide the original error.
            try:
                producer.begin_transaction()
                producer.produce(
                    STATE_TOPIC,
                    key=JOB_NAME,
                    value=state_value_serializer(
                        failed_state,
                        SerializationContext(STATE_TOPIC, MessageField.VALUE)
                    ),
                    on_delivery=delivery_report,
                )
                producer.commit_transaction()
            except Exception as state_err:
                print(f"Failed-state write failed: {state_err}", flush=True)
                try:
                    producer.abort_transaction()
                except Exception as abort_err:
                    print(f"Abort transaction failed: {abort_err}")

            raise

        time.sleep(POLL_INTERVAL)


# Entrypoint: start the polling loop when this code runs as the main module.
# fetch_and_push() only stops when an exception propagates, so this call blocks
# the cell until it fails or is interrupted.
if __name__ == "__main__":
    fetch_and_push()