In [1]:
%%capture --no-stderr
# Install (or upgrade to the latest) Faker into the kernel's environment; the
# producer cell below imports it to fabricate settlement records.
# The %%capture cell magic swallows the cell's normal output, while --no-stderr
# leaves stderr uncaptured so pip errors remain visible.
# NOTE(review): the version is unpinned (-U), so reruns may pick up a different
# Faker release - consider pinning once a known-good version is confirmed.
%pip install --quiet -U faker

In [2]:
import os

# Load environment variables for Kafka and VastDB connectivity.
# DOCKER_HOST_OR_IP is read exactly once and falls back to "localhost" when the
# variable is unset or empty, so the broker address below is always usable
# (a second, default-less read would turn it into "None:19092").
DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP") or "localhost"
VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

# Target location of the fraud-detection table; bucket and schema come from the
# environment (None when unset), the table name is fixed.
VASTDB_FRAUD_DETECTION_BUCKET = os.getenv("VASTDB_FRAUD_DETECTION_BUCKET")
VASTDB_FRAUD_DETECTION_SCHEMA = os.getenv("VASTDB_FRAUD_DETECTION_SCHEMA")
VASTDB_FRAUD_DETECTION_TABLE = 'fraud'

# Kafka broker configuration derived from the host resolved above
print(f"{DOCKER_HOST_OR_IP=}")

kafka_brokers = f"{DOCKER_HOST_OR_IP}:19092"
kafka_topic = "stock-settlement"

DOCKER_HOST_OR_IP='10.143.11.241'


# Fraud Producer (Spark Streaming app that will simulate stock settlement data) -> Kafka

In [3]:
import os
import time
import json
import random
import signal
import sys
from faker import Faker
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Shared Faker generator used to fabricate the settlement fields
fake = Faker()

# Spark configuration, set key by key:
#   - pull in the Kafka SQL connector plus Log4j 2 bindings as extra packages
#   - exclude the listed SLF4J artifacts from those packages
#   - prefer user-supplied classes on both driver and executors
#   - keep Arrow-based PySpark conversion switched off
# NOTE(review): the connector coordinates (_2.13, 3.4.3) presumably have to match
# the Scala/Spark build of the installed PySpark - confirm against the runtime.
conf = (
    SparkConf()
    .set(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.3,"
        "org.apache.logging.log4j:log4j-slf4j2-impl:2.19.0,"
        "org.apache.logging.log4j:log4j-api:2.19.0,"
        "org.apache.logging.log4j:log4j-core:2.19.0",
    )
    .set("spark.jars.excludes", "org.slf4j:slf4j-api,org.slf4j:slf4j-log4j12")
    .set("spark.driver.userClassPathFirst", "true")
    .set("spark.executor.userClassPathFirst", "true")
    .set("spark.sql.execution.arrow.pyspark.enabled", "false")
)

# Local-mode session (reused if one already exists in this kernel)
spark = (
    SparkSession.builder
    .master("local")
    .appName("FakeStockSettlementKafkaStreaming")
    .config(conf=conf)
    .getOrCreate()
)

# Flag polled by the producer loop; the handler below flips it to request a stop
should_shutdown = False


def signal_handler(sig, frame):
    """Announce the shutdown and raise the module-level ``should_shutdown`` flag.

    The (sig, frame) signature is what ``signal.signal`` handlers receive;
    neither argument is inspected here.
    """
    print("\nGraceful shutdown initiated...")
    globals()["should_shutdown"] = True


# Route both interrupt (SIGINT) and termination (SIGTERM) to the same handler
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

# Build one synthetic stock-settlement record
def create_stock_settlement(fraud_percentage=0.03, status_weights=None):
    """Return a single fake stock settlement as a plain dict.

    Args:
        fraud_percentage: Probability (a 0-1 fraction, despite the name) that
            the record is generated as fraudulent. Compared against
            ``random.random()``.
        status_weights: Mapping of status label -> weight used to pick the
            status of a non-fraudulent record. The weights are handed to
            ``random.choices``, which treats them as relative, so they do not
            need to sum to 1. Defaults to Settled 0.8 / Pending 0.2 / Failed 0.1.

    Returns:
        dict with the keys ``transaction_id``, ``settlement_date``,
        ``stock_symbol``, ``quantity``, ``price``, ``buyer``, ``seller``,
        ``trade_date`` and ``status`` (in that order). Fraudulent records
        always carry the status ``"Fraudulent"``.
    """
    weights = status_weights
    if weights is None:
        weights = {"Settled": 0.8, "Pending": 0.2, "Failed": 0.1}

    def _suspicious_party():
        # Half of the time the counterparty is the literal marker company
        return "Fraudulent Company" if random.random() < 0.5 else fake.company()

    if random.random() < fraud_percentage:
        # Fraud: larger quantity range, price drawn from the 5000-25000 band
        return {
            "transaction_id": fake.uuid4(),
            "settlement_date": fake.date_this_year().isoformat(),
            "stock_symbol": fake.lexify(text='???', letters='ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            "quantity": random.randint(1, 100000),
            "price": round(random.uniform(5000, 25000), 2),
            "buyer": _suspicious_party(),
            "seller": _suspicious_party(),
            "trade_date": fake.date_this_year().isoformat(),
            "status": "Fraudulent",
        }

    # Regular trade: counterparties are picked first; in 5% of cases the seller
    # is the very same company as the buyer
    buying_party = fake.company()
    selling_party = fake.company() if random.random() >= 0.05 else buying_party

    record = {
        "transaction_id": fake.uuid4(),
        "settlement_date": fake.date_this_year().isoformat(),
        "stock_symbol": fake.lexify(text='???', letters='ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
        "quantity": random.randint(1, 10000),
        "price": round(random.uniform(10, 50000), 2),
        "buyer": buying_party,
        "seller": selling_party,
        "trade_date": fake.date_this_year().isoformat(),
    }
    # Status is drawn last, weighted by the supplied (or default) distribution
    labels = list(weights.keys())
    odds = list(weights.values())
    record["status"] = random.choices(labels, weights=odds, k=1)[0]
    return record

# Continuously generate and push stock settlements to Kafka.
# Each iteration fabricates a randomly sized batch, writes it to the topic as a
# batch DataFrame write, and repeats until the shutdown flag is raised by the
# signal handler. The Spark session is always stopped on the way out.
record_count = 0

try:
    while not should_shutdown:
        # Randomize batch size for dynamic event generation
        batch_size = random.randint(1000, 8000)  # Different amount of events per trade date

        # One Kafka message per settlement: key = transaction id, value = JSON payload.
        # A generator feeds the comprehension so no intermediate list is built.
        records = [
            {"key": settlement["transaction_id"], "value": json.dumps(settlement)}
            for settlement in (create_stock_settlement() for _ in range(batch_size))
        ]

        # Create DataFrame from stock settlements
        df = spark.createDataFrame(records)

        # Write DataFrame as JSON to Kafka topic.
        # Producer settings are forwarded by stripping the "kafka." prefix, so the
        # producer's batch.size must be spelled "kafka.batch.size"; the previous
        # "kafka.producer.batch.size" reached the producer as the unknown key
        # "producer.batch.size" and was never applied.
        df.write \
          .format("kafka") \
          .option("kafka.bootstrap.servers", kafka_brokers) \
          .option("topic", kafka_topic) \
          .option("kafka.batch.size", 1000000) \
          .save()

        record_count += batch_size
        # "\r" keeps the running total on a single, continuously updated line
        print(f"Produced: {record_count} records", end="\r")

        # Reduce sleep time or remove it for high throughput
        time.sleep(0.1)  # Adjust sleep time as needed, or remove it entirely for max throughput
except Exception as e:
    # Report the failure instead of letting the traceback end the cell;
    # cleanup still happens in the finally block below.
    print(f"An error occurred: {e}")
finally:
    print("\nShutting down Spark session...")
    spark.stop()
    print("Spark session stopped. Goodbye!")

Produced: 88839 records
Graceful shutdown initiated...
Produced: 95353 records
Shutting down Spark session...
Spark session stopped. Goodbye!
