Fraud_Detector

# Trade Settlement (Spark Streaming app that consumes stock settlement data from Kafka and stores them into the  VAST Database

In [1]:
import os

# Load environment variables for Kafka and VastDB connectivity
DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP", "localhost")
VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

VASTDB_FRAUD_DETECTION_BUCKET = os.getenv("VASTDB_FRAUD_DETECTION_BUCKET")
VASTDB_FRAUD_DETECTION_SCHEMA = os.getenv("VASTDB_FRAUD_DETECTION_SCHEMA")
VASTDB_FRAUD_DETECTION_TABLE = 'fraud'

# Print configurations
print(f"""
---
DOCKER_HOST_OR_IP={DOCKER_HOST_OR_IP}
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY==****{VASTDB_ACCESS_KEY[-4:]}
VASTDB_SECRET_KEY=****{VASTDB_SECRET_KEY[-4:]}
VASTDB_FRAUD_DETECTION_BUCKET={VASTDB_FRAUD_DETECTION_BUCKET}
VASTDB_FRAUD_DETECTION_SCHEMA={VASTDB_FRAUD_DETECTION_SCHEMA}
# VASTDB_FRAUD_DETECTION_TABLE={VASTDB_FRAUD_DETECTION_TABLE}
---
""")

# Kafka Configuration
kafka_brokers = f'{DOCKER_HOST_OR_IP}:19092'
topic = 'stock-settlement'


---
DOCKER_HOST_OR_IP=10.143.11.241
---
VASTDB_ENDPOINT=http://172.200.204.2:80
VASTDB_ACCESS_KEY==****QXN5
VASTDB_SECRET_KEY=****oLGr
VASTDB_FRAUD_DETECTION_BUCKET=csnow-db
VASTDB_FRAUD_DETECTION_SCHEMA=fraud_detection
# VASTDB_FRAUD_DETECTION_TABLE=fraud
---



Create Vast DB schema if it doesn't exist.

In [2]:
!pip install --quiet vastdb

import vastdb

session = vastdb.connect(endpoint=VASTDB_ENDPOINT, access=VASTDB_ACCESS_KEY, secret=VASTDB_SECRET_KEY)
with session.transaction() as tx:
    bucket = tx.bucket(VASTDB_FRAUD_DETECTION_BUCKET)
    bucket.schema(VASTDB_FRAUD_DETECTION_SCHEMA, fail_if_missing=False) or bucket.create_schema(VASTDB_FRAUD_DETECTION_SCHEMA)

In [3]:
import socket
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, count
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, BooleanType
import threading
import time

# Spark Configuration
conf = SparkConf()
conf.setAll([
    ("spark.driver.host", socket.gethostbyname(socket.gethostname())),
    ("spark.sql.execution.arrow.pyspark.enabled", "false"),
    # VASTDB
    ("spark.sql.catalog.ndb", 'spark.sql.catalog.ndb.VastCatalog'),
    ("spark.ndb.endpoint", VASTDB_ENDPOINT),
    ("spark.ndb.data_endpoints", VASTDB_ENDPOINT),
    ("spark.ndb.access_key_id", VASTDB_ACCESS_KEY),
    ("spark.ndb.secret_access_key", VASTDB_SECRET_KEY),
    ("spark.driver.extraClassPath", '/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*'),
    ("spark.executor.extraClassPath", '/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*'),
    ("spark.sql.extensions", 'ndb.NDBSparkSessionExtension'),
    # Kafka
    ("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.3," 
                            "org.apache.logging.log4j:log4j-slf4j2-impl:2.19.0," 
                            "org.apache.logging.log4j:log4j-api:2.19.0," 
                            "org.apache.logging.log4j:log4j-core:2.19.0"),
    ("spark.jars.excludes", "org.slf4j:slf4j-api,org.slf4j:slf4j-log4j12"),
    ("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem"),
])

spark = SparkSession.builder \
    .master("local") \
    .appName("KafkaStreamingToVastDB") \
    .config(conf=conf) \
    .enableHiveSupport() \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("DEBUG")

print("Spark successfully loaded\n")


Spark successfully loaded



In [4]:
destination_table_name = f"`ndb`.`{VASTDB_FRAUD_DETECTION_BUCKET}`.`{VASTDB_FRAUD_DETECTION_SCHEMA}`.`{VASTDB_FRAUD_DETECTION_TABLE}`"
destination_table_name

'`ndb`.`csnow-db`.`fraud_detection`.`fraud`'

In [5]:
# Create checkpoint directory with absolute path
checkpoint_dir = os.path.abspath("/tmp/spark_checkpoint")
os.makedirs(checkpoint_dir, exist_ok=True)

# Define schema for Kafka message
schema = StructType([
    StructField("partitionID", LongType(), True),
    StructField("offset", LongType(), True),
    StructField("timestamp", LongType(), True),
    StructField("compression", StringType(), True),
    StructField("isTransactional", BooleanType(), True),
    StructField("key", StructType([
        StructField("payload", StringType(), True),
        StructField("encoding", StringType(), True)
    ])),
    StructField("value", StructType([
        StructField("payload", StructType([
            StructField("transaction_id", StringType(), True),
            StructField("settlement_date", StringType(), True),
            StructField("stock_symbol", StringType(), True),
            StructField("quantity", LongType(), True),
            StructField("price", DoubleType(), True),
            StructField("buyer", StringType(), True),
            StructField("seller", StringType(), True),
            StructField("trade_date", StringType(), True),
            StructField("status", StringType(), True)
        ])),
        StructField("encoding", StringType(), True)
    ]))
])

# Read data from Kafka stream
raw_stream = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_brokers) \
    .option("subscribe", topic) \
    .option("startingOffsets", "earliest") \
    .option("failOnDataLoss", "true") \
    .load()

# Parse the Kafka message
decoded_stream = raw_stream.selectExpr("CAST(value AS STRING) as json") \
    .select(from_json(col("json"), StructType([
        StructField("transaction_id", StringType(), True),
        StructField("settlement_date", StringType(), True),
        StructField("stock_symbol", StringType(), True),
        StructField("quantity", LongType(), True),
        StructField("price", DoubleType(), True),
        StructField("buyer", StringType(), True),
        StructField("seller", StringType(), True),
        StructField("trade_date", StringType(), True),
        StructField("status", StringType(), True)
    ])).alias("payload"))

# Prepare data to match VastDB table schema

vastdb_stream = decoded_stream.select(
    col("payload.transaction_id").alias("transaction_id"),
    col("payload.settlement_date").alias("settlement_date"),
    col("payload.stock_symbol").alias("stock_symbol"),
    col("payload.quantity").alias("quantity"),
    col("payload.price").alias("price"),
    col("payload.buyer").alias("buyer"),
    col("payload.seller").alias("seller"),
    col("payload.trade_date").alias("trade_date"),
    col("payload.status").alias("status")
)

total_message_count = 0

def process_microbatch(parsed_df, epoch_id):
    global total_message_count
    batch_size = parsed_df.count()
    total_message_count += batch_size
    parsed_df.write.mode("append").saveAsTable(destination_table_name)
    print(f"Batch {epoch_id} processed: {batch_size} records written to the database. Total records written: {total_message_count}")


vastdb_stream.writeStream \
    .format("console") \
    .outputMode("append") \
    .trigger(processingTime="1 second") \
    .start()


vastdb_query = vastdb_stream.writeStream \
    .foreachBatch(process_microbatch) \
    .outputMode("append") \
    .trigger(processingTime='1 second') \
    .option("maxFilesPerTrigger", 5000) \
    .start()

def check_row_count():
    while True:
        try:
            vast_table_row_count = spark.sql(f"SELECT count(*) FROM {destination_table_name}").collect()[0][0]
            print(f"Kafka messages consumed (in-memory count): {total_message_count} | Vast Table row count: {vast_table_row_count}", end="\r")
        except pyspark.errors.exceptions.captured.AnalysisException as e:
            print(f"AnalysisException: {e}. Ensure all tables exist.")
        time.sleep(1)

# Start thread for checking row count
row_count_thread = threading.Thread(target=check_row_count)
row_count_thread.daemon = True
row_count_thread.start()


# Wait for termination
vastdb_query.awaitTermination()

Exception in thread Thread-6 (check_row_count):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/opt/conda/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_529/2422378681.py", line 96, in check_row_count
  File "/usr/local/spark/python/pyspark/sql/dataframe.py", line 1218, in collect
    sock_info = self._jdf.collectToPython()
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1322, in __call__
  File "/usr/local/spark/python/pyspark/errors/exceptions/captured.py", line 169, in deco
    return f(*a, **kw)
           ^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/protocol.py", line 326, in get_return_

Py4JError: org does not exist in the JVM

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
