Fraud_Detector

# Trade Settlement (Spark Streaming app that consumes stock settlement data from Kafka and stores it in the VAST Database)

In [3]:
import os

# Load environment variables for Kafka and VastDB connectivity
DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP", "localhost")
VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

VASTDB_TWITTER_INGEST_BUCKET = os.getenv("VASTDB_TWITTER_INGEST_BUCKET")
VASTDB_TWITTER_INGEST_SCHEMA = os.getenv("VASTDB_TWITTER_INGEST_SCHEMA")
VASTDB_TWITTER_INGEST_TABLE = 'fraud'

# Only the last four characters of each credential are displayed.
# os.getenv returns None for an unset variable, and None[-4:] raises TypeError,
# so an unset credential is reported as "<unset>" instead of crashing the cell.
access_key_hint = VASTDB_ACCESS_KEY[-4:] if VASTDB_ACCESS_KEY is not None else "<unset>"
secret_key_hint = f"****{VASTDB_SECRET_KEY[-4:]}" if VASTDB_SECRET_KEY is not None else "<unset>"

# Print configurations
print(f"""
---
DOCKER_HOST_OR_IP={DOCKER_HOST_OR_IP}
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={access_key_hint}
VASTDB_SECRET_KEY={secret_key_hint}
VASTDB_TWITTER_INGEST_BUCKET={VASTDB_TWITTER_INGEST_BUCKET}
VASTDB_TWITTER_INGEST_SCHEMA={VASTDB_TWITTER_INGEST_SCHEMA}
# VASTDB_TWITTER_INGEST_TABLE={VASTDB_TWITTER_INGEST_TABLE}
---
""")

# Kafka Configuration
kafka_brokers = f'{DOCKER_HOST_OR_IP}:19092'
topic = 'stock-settlement'


---
DOCKER_HOST_OR_IP=10.143.11.241
---
VASTDB_ENDPOINT=http://172.200.204.2:80
VASTDB_ACCESS_KEY=QXN5
VASTDB_SECRET_KEY=****oLGr
VASTDB_TWITTER_INGEST_BUCKET=csnow-db
VASTDB_TWITTER_INGEST_SCHEMA=social_media
# VASTDB_TWITTER_INGEST_TABLE=fraud
---



In [2]:
import socket
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, count
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, BooleanType
import threading
import time

# Spark Configuration
# The driver and the executors are both pointed at the same VAST connector jar glob.
vast_connector_jars = '/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*'

spark_settings = {
    "spark.driver.host": socket.gethostbyname(socket.gethostname()),
    "spark.sql.execution.arrow.pyspark.enabled": "false",
    # VASTDB
    "spark.sql.catalog.ndb": 'spark.sql.catalog.ndb.VastCatalog',
    "spark.ndb.endpoint": VASTDB_ENDPOINT,
    "spark.ndb.data_endpoints": VASTDB_ENDPOINT,
    "spark.ndb.access_key_id": VASTDB_ACCESS_KEY,
    "spark.ndb.secret_access_key": VASTDB_SECRET_KEY,
    "spark.driver.extraClassPath": vast_connector_jars,
    "spark.executor.extraClassPath": vast_connector_jars,
    "spark.sql.extensions": 'ndb.NDBSparkSessionExtension',
    # Kafka
    "spark.jars.packages": "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.3,"
                           "org.apache.logging.log4j:log4j-slf4j2-impl:2.19.0,"
                           "org.apache.logging.log4j:log4j-api:2.19.0,"
                           "org.apache.logging.log4j:log4j-core:2.19.0",
    "spark.jars.excludes": "org.slf4j:slf4j-api,org.slf4j:slf4j-log4j12",
    "spark.hadoop.fs.file.impl": "org.apache.hadoop.fs.RawLocalFileSystem",
}

# Settings are applied in the order they are declared above.
conf = SparkConf()
conf.setAll(list(spark_settings.items()))

# Single-node ("local") session with Hive support, built from the settings above.
spark = (
    SparkSession.builder
    .master("local")
    .appName("KafkaStreamingToVastDB")
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

sc = spark.sparkContext
sc.setLogLevel("DEBUG")

print("Spark successfully loaded\n")

# Make sure the checkpoint directory exists; an already-existing directory is fine.
checkpoint_dir = os.path.abspath("/tmp/spark_checkpoint")
os.makedirs(checkpoint_dir, exist_ok=True)

# Schema of one settlement record. Declared once and shared by the envelope
# schema and the stream parser below so the two definitions cannot drift apart.
payload_schema = StructType([
    StructField("transaction_id", StringType(), True),
    StructField("settlement_date", StringType(), True),
    StructField("stock_symbol", StringType(), True),
    StructField("quantity", LongType(), True),
    StructField("price", DoubleType(), True),
    StructField("buyer", StringType(), True),
    StructField("seller", StringType(), True),
    StructField("trade_date", StringType(), True),
    StructField("status", StringType(), True)
])

# Define schema for Kafka message
# Envelope form: record metadata plus key/value wrappers around the payload.
# The parser below does not apply it; it parses the message value directly
# with payload_schema.
schema = StructType([
    StructField("partitionID", LongType(), True),
    StructField("offset", LongType(), True),
    StructField("timestamp", LongType(), True),
    StructField("compression", StringType(), True),
    StructField("isTransactional", BooleanType(), True),
    StructField("key", StructType([
        StructField("payload", StringType(), True),
        StructField("encoding", StringType(), True)
    ])),
    StructField("value", StructType([
        StructField("payload", payload_schema),
        StructField("encoding", StringType(), True)
    ]))
])

# Read data from Kafka stream
raw_stream = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_brokers) \
    .option("subscribe", topic) \
    .option("startingOffsets", "earliest") \
    .option("failOnDataLoss", "true") \
    .load()

# Parse the Kafka message
# The binary value is cast to a string and parsed as JSON into a single
# struct column named "payload".
decoded_stream = raw_stream.selectExpr("CAST(value AS STRING) as json") \
    .select(from_json(col("json"), payload_schema).alias("payload"))

# Prepare data to match VastDB table schema
# Each payload field is promoted to a top-level column with the same name,
# in the order listed here.
settlement_columns = [
    "transaction_id",
    "settlement_date",
    "stock_symbol",
    "quantity",
    "price",
    "buyer",
    "seller",
    "trade_date",
    "status",
]

vastdb_stream = decoded_stream.select(
    *[col(f"payload.{field_name}").alias(field_name) for field_name in settlement_columns]
)

# Fully qualified, backtick-quoted target table: catalog.bucket.schema.table
catalog_table_name = f"`ndb`.`{VASTDB_TWITTER_INGEST_BUCKET}`.`{VASTDB_TWITTER_INGEST_SCHEMA}`.`{VASTDB_TWITTER_INGEST_TABLE}`"

# Running total of records written by process_microbatch since this cell was run.
total_message_count = 0

def process_microbatch(parsed_df, epoch_id):
    """Append one micro-batch to the target table and update the running total.

    Used as the ``foreachBatch`` callback of the streaming writer.

    Args:
        parsed_df: DataFrame holding the rows of the current micro-batch.
        epoch_id: Identifier of the micro-batch; only used in the log line.

    Raises:
        Whatever the count or the table write raises. In that case the running
        total is left untouched, so it only reflects batches that were written.
    """
    global total_message_count
    # The batch is evaluated twice (count + write). Persisting it avoids
    # recomputing the batch from the source for the second action.
    parsed_df.persist()
    try:
        batch_size = parsed_df.count()
        parsed_df.write.mode("append").saveAsTable(catalog_table_name)
    finally:
        parsed_df.unpersist()
    # Only count the batch once the write has succeeded.
    total_message_count += batch_size
    print(f"Batch {epoch_id} processed: {batch_size} records written to the database. Total records written: {total_message_count}")


# Debug sink: echo every micro-batch to the console. The handle is kept so the
# query can be inspected or stopped later (console_query.stop()).
console_query = vastdb_stream.writeStream \
    .format("console") \
    .outputMode("append") \
    .trigger(processingTime="1 second") \
    .start()


# Main sink: append every micro-batch to the VAST table via process_microbatch.
# The checkpoint location lets a restarted query resume from the last committed
# Kafka offsets; without it every run starts again from "earliest" and appends
# the whole topic to the table a second time.
# The former writer option "maxFilesPerTrigger" was dropped: it is a file-source
# read option and had no effect here. To throttle Kafka input, set
# "maxOffsetsPerTrigger" on the reader instead.
vastdb_query = vastdb_stream.writeStream \
    .foreachBatch(process_microbatch) \
    .outputMode("append") \
    .trigger(processingTime='1 second') \
    .option("checkpointLocation", os.path.join(checkpoint_dir, "vastdb")) \
    .start()

def check_row_count(stop_event=None):
    """Print the in-memory message count next to the table row count once a second.

    Args:
        stop_event: Optional ``threading.Event``. The loop ends once it is set.
            When omitted, a private event that is never set is used, so the
            loop runs until the hosting thread or process goes away.
    """
    if stop_event is None:
        stop_event = threading.Event()
    while not stop_event.is_set():
        try:
            vast_table_row_count = spark.sql(f"SELECT count(*) FROM {catalog_table_name}").collect()[0][0]
            print(f"Kafka messages consumed (in-memory count): {total_message_count} | Vast Table row count: {vast_table_row_count}", end="\r")
        except pyspark.errors.AnalysisException as e:
            # Public exception class; raised e.g. while the target table does not exist yet.
            print(f"AnalysisException: {e}. Ensure all tables exist.")
        # Sleeps for one second but wakes up immediately when a stop is requested.
        stop_event.wait(1)

# Start thread for checking row count
row_count_stop = threading.Event()
row_count_thread = threading.Thread(target=check_row_count, args=(row_count_stop,))
row_count_thread.daemon = True
row_count_thread.start()


# Wait for termination
try:
    vastdb_query.awaitTermination()
finally:
    # Stop the monitor when the wait ends for any reason (query finished, query
    # failed, or the cell was interrupted) so reruns do not accumulate
    # background threads that keep polling the table.
    row_count_stop.set()


Spark successfully loaded

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `ndb`.`csnow-db`.`social_media`.`fraud` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 21;
'Aggregate [unresolvedalias(count(1), None)]
+- 'UnresolvedRelation [ndb, csnow-db, social_media, fraud], [], false
. Ensure all tables exist.
AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `ndb`.`csnow-db`.`social_media`.`fraud` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 po

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: reentrant call inside <_io.BufferedReader name=50>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip

Py4JError: An error occurred while calling o131.awaitTermination

Kafka messages consumed (in-memory count): 40541 | Vast Table row count: 40541