In [1]:
import os

# Load environment variables for Kafka and VastDB connectivity.
# DOCKER_HOST_OR_IP falls back to "localhost"; every VASTDB_* value is None
# when the corresponding variable is not set.
DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP", "localhost")
VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

VASTDB_TWITTER_INGEST_BUCKET = os.getenv("VASTDB_TWITTER_INGEST_BUCKET")
VASTDB_TWITTER_INGEST_SCHEMA = os.getenv("VASTDB_TWITTER_INGEST_SCHEMA")
VASTDB_TWITTER_INGEST_TABLE = os.getenv("VASTDB_TWITTER_INGEST_TABLE")


def credential_tail(value):
    """Return the last four characters of ``value`` for display.

    Args:
        value: A credential string, or None when the environment variable
            is not set.

    Returns:
        The final four characters of ``value`` (the whole string when it is
        shorter than that), or the placeholder ``"<unset>"`` for None.
    """
    # os.getenv() yields None for a missing variable; slicing None would raise
    # TypeError and abort the whole summary below.
    if value is None:
        return "<unset>"
    return value[-4:]


# Echo the effective configuration; only the tail of each credential is shown.
print(f"""
---
DOCKER_HOST_OR_IP={DOCKER_HOST_OR_IP}
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={credential_tail(VASTDB_ACCESS_KEY)}
VASTDB_SECRET_KEY=****{credential_tail(VASTDB_SECRET_KEY)}
VASTDB_TWITTER_INGEST_BUCKET={VASTDB_TWITTER_INGEST_BUCKET}
VASTDB_TWITTER_INGEST_SCHEMA={VASTDB_TWITTER_INGEST_SCHEMA}
VASTDB_TWITTER_INGEST_TABLE={VASTDB_TWITTER_INGEST_TABLE}
---
""")


---
DOCKER_HOST_OR_IP=10.143.11.241
---
VASTDB_ENDPOINT=http://172.200.204.2:80
VASTDB_ACCESS_KEY=QXN5
VASTDB_SECRET_KEY=****oLGr
VASTDB_TWITTER_INGEST_BUCKET=csnow-db
VASTDB_TWITTER_INGEST_SCHEMA=social_media
VASTDB_TWITTER_INGEST_TABLE=tweets
---



In [2]:
# ## Spark Configuration

import socket
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Directory holding the VAST connector jars; the same glob is put on both the
# driver and the executor classpath.
vast_connector_jars = '/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*'

# Maven coordinates handed to spark.jars.packages: the Kafka source for
# Structured Streaming plus Log4j 2 artifacts.
# NOTE(review): the `_2.13` suffix and the 3.4.3 version presumably have to
# match the Scala/Spark build of this installation — confirm.
extra_packages = ",".join([
    "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.3",
    "org.apache.logging.log4j:log4j-slf4j2-impl:2.19.0",
    "org.apache.logging.log4j:log4j-api:2.19.0",
    "org.apache.logging.log4j:log4j-core:2.19.0",
])

# Advertise the address that the local hostname resolves to as the driver host.
driver_address = socket.gethostbyname(socket.gethostname())

spark_settings = [
    ("spark.driver.host", driver_address),
    ("spark.sql.execution.arrow.pyspark.enabled", "false"),
    # VASTDB: register the `ndb` catalog and point it at the endpoint and
    # credentials read from the environment.
    ("spark.sql.catalog.ndb", 'spark.sql.catalog.ndb.VastCatalog'),
    ("spark.ndb.endpoint", VASTDB_ENDPOINT),
    ("spark.ndb.data_endpoints", VASTDB_ENDPOINT),
    ("spark.ndb.access_key_id", VASTDB_ACCESS_KEY),
    ("spark.ndb.secret_access_key", VASTDB_SECRET_KEY),
    ("spark.driver.extraClassPath", vast_connector_jars),
    ("spark.executor.extraClassPath", vast_connector_jars),
    ("spark.sql.extensions", 'ndb.NDBSparkSessionExtension'),
    # Kafka
    ("spark.jars.packages", extra_packages),
    ("spark.jars.excludes", "org.slf4j:slf4j-api,org.slf4j:slf4j-log4j12"),
    ("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem"),
]

conf = SparkConf()
conf.setAll(spark_settings)

# Build (or reuse) a local-mode session carrying the settings above.
session_builder = SparkSession.builder.master("local").appName("KafkaStreamingToVastDB")
spark = session_builder.config(conf=conf).enableHiveSupport().getOrCreate()

sc = spark.sparkContext
# NOTE(review): DEBUG logging is very chatty; confirm it is still wanted.
sc.setLogLevel("DEBUG")

print("Spark successfully loaded\n")

Spark successfully loaded



In [3]:
import os
import time
import threading
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StringType, StructType, StructField, LongType

class StreamMonitor:
    """Background poller that prints two row counts on a single console line.

    Each polling round runs ``SELECT count(*)`` against the table/view named
    ``query_name`` and against ``catalog_table_name`` through the supplied
    Spark session, then prints both numbers terminated by a carriage return so
    the next round overwrites the line.

    Attributes:
        spark: Object exposing ``sql(query)``; the result must support
            ``take(1)``.
        active: True while polling should continue; cleared by ``stop()``.
        thread: The most recently started polling thread, or None.
        poll_interval: Seconds to wait between polling rounds.
    """

    def __init__(self, spark, poll_interval=1.0):
        """Store the session; no thread is started until ``start()`` is called.

        Args:
            spark: Session-like object used to run the count queries.
            poll_interval: Seconds between polling rounds (default 1.0).
        """
        self.spark = spark
        self.active = True
        self.thread = None
        self.poll_interval = poll_interval
        # Set by stop() so a waiting poller wakes up immediately instead of
        # finishing a full sleep.
        self._stop_event = threading.Event()

    def _count_rows(self, table_name):
        """Return the single value produced by ``SELECT count(*) FROM table_name``."""
        return self.spark.sql(f"SELECT count(*) FROM {table_name}").take(1)[0][0]

    def monitor_counts(self, query_name, catalog_table_name):
        """Poll and print both counts until ``stop()`` is called or a query fails.

        Args:
            query_name: Name of the table/view whose count is reported as
                "Kafka messages consumed".
            catalog_table_name: Fully qualified table whose count is reported
                as "Vast Table row count".
        """
        while self.active:
            try:
                kafka_count = self._count_rows(query_name)
                vast_count = self._count_rows(catalog_table_name)
                print(f"Kafka messages consumed: {kafka_count} | Vast Table row count: {vast_count}", end="\r")
            except Exception as exc:
                # Errors raised while shutting down are expected noise; report
                # anything else together with its cause instead of hiding it.
                if self.active:
                    print(f"\nMonitoring interrupted: {exc!r}")
                break
            # Interruptible pause: returns early as soon as stop() sets the event.
            self._stop_event.wait(self.poll_interval)

    def start(self, query_name, catalog_table_name):
        """Start polling in a daemon thread.

        Calling this while a polling thread is still alive is a no-op, so a
        second thread is never spawned on top of a running one. Calling it
        after ``stop()`` re-arms the monitor and starts a fresh thread.
        """
        if self.thread is not None and self.thread.is_alive():
            return
        # Re-arm the flags; without this a monitor that was stopped once would
        # exit immediately on every later start.
        self.active = True
        self._stop_event.clear()
        self.thread = threading.Thread(
            target=self.monitor_counts,
            args=(query_name, catalog_table_name),
            daemon=True,
        )
        self.thread.start()

    def stop(self):
        """Ask the poller to finish and wait up to two seconds for it to exit."""
        self.active = False
        self._stop_event.set()
        if self.thread:
            self.thread.join(timeout=2)

# Kafka source settings: broker address on port 19092 of the Docker host, and
# the topic to subscribe to.
topic = 'streaming-demo-2'
kafka_brokers = f'{DOCKER_HOST_OR_IP}:19092'

# Local directory set aside for streaming checkpoints; created if missing.
checkpoint_dir = os.path.abspath("/tmp/spark_checkpoint")
os.makedirs(checkpoint_dir, exist_ok=True)

# Layout of the JSON payload carried in each Kafka message value; every field
# is nullable.
schema = (
    StructType()
    .add("text", StringType(), True)
    .add("created_at", LongType(), True)
    .add("id", LongType(), True)
    .add("id_str", StringType(), True)
)

# Backtick-quoted, fully qualified destination: catalog.bucket.schema.table
table_path_parts = (
    "ndb",
    VASTDB_TWITTER_INGEST_BUCKET,
    VASTDB_TWITTER_INGEST_SCHEMA,
    VASTDB_TWITTER_INGEST_TABLE,
)
catalog_table_name = ".".join(f"`{part}`" for part in table_path_parts)

# Timestamp-suffixed name for the in-memory debug table, so each run gets its own.
query_name = "debug_table_" + str(int(time.time()))

def process_microbatch(parsed_df, epoch_id):
    """Append one micro-batch to the table named by ``catalog_table_name``.

    Passed to ``foreachBatch`` on the streaming writer.

    Args:
        parsed_df: DataFrame holding the rows of the current micro-batch.
        epoch_id: Batch identifier passed by the streaming writer; not used here.
    """
    append_writer = parsed_df.write.mode("append")
    append_writer.saveAsTable(catalog_table_name)

# Kafka source: subscribe to `topic`, start from the earliest available offset
# and fail the query if data loss is detected.
kafka_reader = spark.readStream.format("kafka")
raw_stream = (
    kafka_reader
    .option("kafka.bootstrap.servers", kafka_brokers)
    .option("subscribe", topic)
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", "true")
    .load()
)

# Decode the message value as a JSON string, parse it with `schema`, and
# flatten the resulting struct into top-level columns.
json_stream = raw_stream.selectExpr("CAST(value AS STRING) as json")
parsed_struct = from_json(col("json"), schema).alias("data")
decoded_stream = json_stream.select(parsed_struct).select("data.*")

# Columns written to VastDB, listed explicitly and in order.
vastdb_stream = decoded_stream.select("text", "created_at", "id", "id_str")

# Initialize monitor
monitor = StreamMonitor(spark)

# Bind both handles up front so the cleanup code can tell which queries were
# actually started, without inspecting locals().
vastdb_query = None
memory_table_query = None

try:
    # VastDB sink: each micro-batch is appended by process_microbatch.
    # The checkpoint location lets a re-run resume from the last committed
    # Kafka offsets; without it every run replays the topic from "earliest"
    # and appends the same rows to the table again.
    vastdb_query = (
        vastdb_stream.writeStream
        .foreachBatch(process_microbatch)
        .outputMode("append")
        .option("checkpointLocation", os.path.join(checkpoint_dir, topic))
        .trigger(processingTime='1 second')
        .start()
    )

    # In-memory table used only for the "messages consumed" counter. It gets
    # no checkpoint and is rebuilt from the topic on every run.
    memory_table_query = (
        decoded_stream.writeStream
        .outputMode("append")
        .format("memory")
        .queryName(query_name)
        .start()
    )

    # Start monitoring
    monitor.start(query_name, catalog_table_name)

    # Poll instead of awaitTermination() so Ctrl-C / kernel interrupt is
    # delivered to this cell promptly.
    while vastdb_query.isActive and memory_table_query.isActive:
        time.sleep(1)

    # A query stopped on its own: report why rather than ending silently.
    for label, query in (("VastDB sink", vastdb_query), ("memory table", memory_table_query)):
        failure = query.exception()
        if failure is not None:
            print(f"\n{label} query failed: {failure}")

except KeyboardInterrupt:
    print("\nShutdown initiated...")
finally:
    # Clean up
    print("\nStopping monitoring...")
    monitor.stop()

    print("Stopping queries...")
    # Stop each query independently so a failure on one does not leave the
    # other running.
    for query in (memory_table_query, vastdb_query):
        if query is None:
            continue
        try:
            query.stop()
        except Exception as e:
            print(f"Error during shutdown: {e}")

    print("Shutdown complete")

Kafka messages consumed: 10225 | Vast Table row count: 2569653
Shutdown initiated...

Stopping monitoring...
Stopping queries...
Shutdown complete
