In [1]:
import os

# Load environment variables for Kafka and VastDB connectivity
DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP", "localhost")
VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

VASTDB_TWITTER_INGEST_BUCKET = os.getenv("VASTDB_TWITTER_INGEST_BUCKET")
VASTDB_TWITTER_INGEST_SCHEMA = os.getenv("VASTDB_TWITTER_INGEST_SCHEMA")
VASTDB_TWITTER_INGEST_TABLE = os.getenv("VASTDB_TWITTER_INGEST_TABLE")


def mask_tail(value, prefix="", visible=4):
    """Return a display-safe form of a credential.

    Args:
        value: The credential string, or None when the variable is not set.
        prefix: Text placed in front of the visible tail (e.g. "****").
        visible: Number of trailing characters to reveal.

    Returns:
        "<unset>" when ``value`` is None or empty, otherwise ``prefix`` followed
        by the last ``visible`` characters of ``value``.
    """
    # os.getenv returns None for a missing variable; slicing None would raise
    # TypeError and hide which setting is actually missing.
    if not value:
        return "<unset>"
    return f"{prefix}{value[-visible:]}"


# Only the tails of the credentials are ever printed.
access_key_display = mask_tail(VASTDB_ACCESS_KEY)
secret_key_display = mask_tail(VASTDB_SECRET_KEY, prefix="****")

print(f"""
---
DOCKER_HOST_OR_IP={DOCKER_HOST_OR_IP}
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={access_key_display}
VASTDB_SECRET_KEY={secret_key_display}
VASTDB_TWITTER_INGEST_BUCKET={VASTDB_TWITTER_INGEST_BUCKET}
VASTDB_TWITTER_INGEST_SCHEMA={VASTDB_TWITTER_INGEST_SCHEMA}
VASTDB_TWITTER_INGEST_TABLE={VASTDB_TWITTER_INGEST_TABLE}
---
""")


---
DOCKER_HOST_OR_IP=10.143.11.241
---
VASTDB_ENDPOINT=http://172.200.204.2:80
VASTDB_ACCESS_KEY=QXN5
VASTDB_SECRET_KEY=****oLGr
VASTDB_TWITTER_INGEST_BUCKET=csnow-db
VASTDB_TWITTER_INGEST_SCHEMA=social_media
VASTDB_TWITTER_INGEST_TABLE=tweets
---



In [2]:
# ## Spark Configuration

import socket
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()

# The VAST connector jars are needed on both the driver and executor classpaths.
vast_connector_classpath = '/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*'

# Maven coordinates passed to spark.jars.packages (Kafka source and log4j artifacts).
kafka_packages = ",".join([
    "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.3",
    "org.apache.logging.log4j:log4j-slf4j2-impl:2.19.0",
    "org.apache.logging.log4j:log4j-api:2.19.0",
    "org.apache.logging.log4j:log4j-core:2.19.0",
])

spark_settings = {
    "spark.driver.host": socket.gethostbyname(socket.gethostname()),
    "spark.sql.execution.arrow.pyspark.enabled": "false",
    # VASTDB
    "spark.sql.catalog.ndb": 'spark.sql.catalog.ndb.VastCatalog',
    "spark.ndb.endpoint": VASTDB_ENDPOINT,
    "spark.ndb.data_endpoints": VASTDB_ENDPOINT,
    "spark.ndb.access_key_id": VASTDB_ACCESS_KEY,
    "spark.ndb.secret_access_key": VASTDB_SECRET_KEY,
    "spark.driver.extraClassPath": vast_connector_classpath,
    "spark.executor.extraClassPath": vast_connector_classpath,
    "spark.sql.extensions": 'ndb.NDBSparkSessionExtension',
    # Kafka
    "spark.jars.packages": kafka_packages,
    "spark.jars.excludes": "org.slf4j:slf4j-api,org.slf4j:slf4j-log4j12",
    "spark.hadoop.fs.file.impl": "org.apache.hadoop.fs.RawLocalFileSystem",
}
conf.setAll(list(spark_settings.items()))

# Build (or reuse) the session in local mode with Hive support enabled.
spark = (
    SparkSession.builder
    .master("local")
    .appName("KafkaStreamingToVastDB")
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

sc = spark.sparkContext
sc.setLogLevel("DEBUG")

print("Spark successfully loaded\n")

Spark successfully loaded



In [None]:
# ## Set Kafka Configuration

# Kafka topic to consume, and the broker address on the Docker host (port 19092)
topic = 'streaming-demo-2'
kafka_brokers = '{}:19092'.format(DOCKER_HOST_OR_IP)

# Make sure the checkpoint directory exists; the path is kept in absolute form
checkpoint_dir = os.path.abspath("/tmp/spark_checkpoint")
if not os.path.isdir(checkpoint_dir):
    os.makedirs(checkpoint_dir, exist_ok=True)

# ## Read Streaming Data from Kafka

from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StringType, StructType, StructField, LongType

# Expected JSON layout of each Kafka message value; every field is nullable
schema = (
    StructType()
    .add(StructField("text", StringType(), True))
    .add(StructField("created_at", LongType(), True))
    .add(StructField("id", LongType(), True))
    .add(StructField("id_str", StringType(), True))
)

# Subscribe to the topic, starting from the earliest available offsets
raw_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_brokers)
    .option("subscribe", topic)
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", "true")
    .load()
)

# Message values as plain strings, unparsed
raw_messages = raw_stream.selectExpr("CAST(value AS STRING) as raw_message")

# Decode each value as JSON using `schema`, then flatten the struct into columns
json_stream = raw_stream.selectExpr("CAST(value AS STRING) as json")
parsed_stream = json_stream.select(from_json(col("json"), schema).alias("data"))
decoded_stream = parsed_stream.select("data.*")

# ## Write the Stream to VastDB

# Columns written to VastDB, in the order they are selected
vastdb_columns = ("text", "created_at", "id", "id_str")
vastdb_stream = decoded_stream.select(*[col(column_name) for column_name in vastdb_columns])

# Fully qualified, backtick-quoted name: catalog.bucket.schema.table
catalog_table_name = ".".join(
    f"`{part}`"
    for part in (
        "ndb",
        VASTDB_TWITTER_INGEST_BUCKET,
        VASTDB_TWITTER_INGEST_SCHEMA,
        VASTDB_TWITTER_INGEST_TABLE,
    )
)

# Per-batch sink used by the VastDB streaming query
def process_microbatch(parsed_df, epoch_id):
    """Append one micro-batch to the table named by ``catalog_table_name``.

    Args:
        parsed_df: DataFrame holding the rows of the current micro-batch.
        epoch_id: Batch identifier supplied by the caller; not used here.
    """
    # To inspect a batch while debugging: parsed_df.show(truncate=False)
    batch_writer = parsed_df.write.mode("append")
    batch_writer.saveAsTable(catalog_table_name)

# Append each micro-batch to VastDB via process_microbatch, triggering every second.
# The checkpoint location records the consumed Kafka offsets, so a restarted query
# resumes where it stopped. Without it, every run re-reads the topic from "earliest"
# and appends the same messages to the table again.
# To re-ingest from scratch on purpose, delete the checkpoint directory first.
vastdb_query = (
    vastdb_stream.writeStream
    .foreachBatch(process_microbatch)
    .outputMode("append")
    .option("checkpointLocation", checkpoint_dir)
    .trigger(processingTime='1 second')
    .start()
)

# Second sink: mirror the decoded stream into an in-memory table named
# "debug_table" so it can be queried with spark.sql.
memory_table_query = (
    decoded_stream.writeStream
    .outputMode("append")
    .format("memory")
    .queryName("debug_table")
    .start()
)

# Periodically report how many rows were consumed vs. how many are in VastDB
import threading
import time


def check_row_count(stop_event=None, interval_seconds=5):
    """Print the debug-table and VastDB row counts until asked to stop.

    Args:
        stop_event: Optional ``threading.Event``. The loop exits once it is set.
            When omitted the loop runs forever, as the original version did.
        interval_seconds: Pause between two reports.
    """
    if stop_event is None:
        stop_event = threading.Event()  # never set, so the loop does not end
    while not stop_event.is_set():
        kafka_consumed_row_count = spark.sql("SELECT count(*) FROM debug_table").collect()[0][0]
        vast_table_row_count = spark.sql(f"SELECT count(*) FROM {catalog_table_name}").collect()[0][0]
        # "\r" keeps the report on a single, continuously refreshed line
        print(f"Rows consumed: {kafka_consumed_row_count} | Vast Table Row Count: {vast_table_row_count}", end="\r")
        # wait() returns early when the event is set, so shutdown is prompt
        stop_event.wait(interval_seconds)


# Start thread for checking row count
row_count_stop = threading.Event()
row_count_thread = threading.Thread(target=check_row_count, args=(row_count_stop,))
row_count_thread.daemon = True
row_count_thread.start()

# Wait for termination. On interrupt or failure, stop the reporter thread and both
# streaming queries. Otherwise they keep running in the kernel, and re-running
# this cell would try to start a second query named "debug_table".
try:
    vastdb_query.awaitTermination()
finally:
    row_count_stop.set()
    for streaming_query in (vastdb_query, memory_table_query):
        streaming_query.stop()

Rows consumed: 150 | Vast Table Row Count: 2474478