In [1]:
%%capture --no-stderr
# Install or upgrade Faker for this kernel; the cell's stdout is captured, stderr is left visible.
%pip install --quiet -U faker

In [2]:
import os

# --- Connectivity settings, all sourced from environment variables ---

# Host name or IP of the Docker machine. It is read exactly once so the
# "localhost" fallback is not clobbered by a later, default-less lookup,
# and it is never overridden by a hard-coded, site-specific address.
DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP", "localhost")
print(f"{DOCKER_HOST_OR_IP=}")

# VastDB endpoint and credentials (each is None when its variable is unset)
VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

# VastDB bucket / schema / table names for the fraud-detection data
VASTDB_FRAUD_DETECTION_BUCKET = os.getenv("VASTDB_FRAUD_DETECTION_BUCKET")
VASTDB_FRAUD_DETECTION_SCHEMA = os.getenv("VASTDB_FRAUD_DETECTION_SCHEMA")
VASTDB_FRAUD_DETECTION_TABLE = 'fraud'

# --- Kafka broker selection ---
# True: take the broker address from VAST_KAFKA_BROKER.
# False: use port 19092 on DOCKER_HOST_OR_IP.
use_vastkafka = True
if use_vastkafka:
    VAST_KAFKA_BROKER = os.getenv("VAST_KAFKA_BROKER")
else:
    VAST_KAFKA_BROKER = f"{DOCKER_HOST_OR_IP}:19092"

kafka_brokers = VAST_KAFKA_BROKER
kafka_topic = "stock-settlement"

# Surface a missing broker here instead of as an opaque failure at write time.
if not kafka_brokers:
    print("WARNING: no Kafka broker configured; set VAST_KAFKA_BROKER before producing.")


# Fraud Producer: a Spark app that simulates stock settlement data and writes it to Kafka in batches

In [3]:
import os
import time
import json
import random
import signal
import sys
from datetime import datetime, timedelta
from faker import Faker
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Faker instance used below for UUIDs, company names and ticker symbols
fake = Faker()

# Spark configuration, built as one fluent chain
conf = (
    SparkConf()
    # Maven coordinates fetched for the session: the Kafka connector and Log4j 2 artifacts
    .set(
        "spark.jars.packages",
        ",".join([
            "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.3",
            "org.apache.logging.log4j:log4j-slf4j2-impl:2.19.0",
            "org.apache.logging.log4j:log4j-api:2.19.0",
            "org.apache.logging.log4j:log4j-core:2.19.0",
        ]),
    )
    # Keep these SLF4J artifacts out of the resolved dependency set
    .set("spark.jars.excludes", "org.slf4j:slf4j-api,org.slf4j:slf4j-log4j12")
    # Prefer the user-supplied jars over Spark's own on driver and executors
    .set("spark.driver.userClassPathFirst", "true")
    .set("spark.executor.userClassPathFirst", "true")
    # Arrow-based PySpark conversion is switched off
    .set("spark.sql.execution.arrow.pyspark.enabled", "false")
)

# Local-mode session (created, or reused if one already exists)
spark = (
    SparkSession.builder
    .master("local")
    .appName("FakeStockSettlementKafkaStreaming")
    .config(conf=conf)
    .getOrCreate()
)

# Shutdown flag: flipped to True by the signal handler and checked by the
# producer loop before each new batch
should_shutdown = False


def signal_handler(sig, frame):
    """Request a graceful stop by raising the module-level shutdown flag.

    Args:
        sig: Number of the delivered signal (not used).
        frame: Interrupted stack frame (not used).
    """
    global should_shutdown
    print("\nGraceful shutdown initiated...")
    should_shutdown = True


# Route both interrupt (SIGINT) and termination (SIGTERM) to the same handler
for _handled_signal in (signal.SIGINT, signal.SIGTERM):
    signal.signal(_handled_signal, signal_handler)

# Reference data for the generator: entity names treated as known-fraudulent
# and entity names treated as known-legitimate.
FRAUDULENT_COMPANIES = [
    "Phoenix Capital LLC",
    "Mirage Investments",
    "Atlantis Holdings",
    "Shadow Finance Group",
    "Apex Trading Syndicate",
    "Eclipse Securities",
    "Dark Star Ventures",
    "Nebula Financial Services",
    "BlackBox Investments",
    "Phantom Partners LLC",
]

LEGITIMATE_COMPANIES = [
    "Goldman Sachs",
    "JPMorgan Chase",
    "Morgan Stanley",
    "BlackRock",
    "Vanguard Group",
    "Fidelity Investments",
    "Charles Schwab",
    "State Street Corporation",
    "Bank of America Securities",
    "Citigroup",
]

# Ticker symbols flagged as suspicious (picked more often for fraudulent transactions)
SUSPICIOUS_SYMBOLS = [
    "XYZ",
    "ZYX",
    "QQQ",
    "ZZZ",
    "XXX",
]

# Function to generate a random trade timestamp within the past year, skewed by time of day
def generate_trade_date(off_hours_probability=0.7):
    """Return a random, on-the-hour trade timestamp from the last 365 days.

    The day is drawn uniformly from 1 to 365 days before today. The hour is
    then drawn outside business hours with probability
    ``off_hours_probability`` (split evenly between 00-07 and 18-23) and
    inside business hours (09-16) otherwise. Minutes, seconds and
    microseconds are always zero.

    Note: this function has no knowledge of whether a trade is fraudulent, so
    the skew applies to every caller that relies on the default. A caller
    that wants off-hours activity to correlate with fraud must pass a
    different probability per class.

    Args:
        off_hours_probability: Chance, in [0, 1], that the hour falls outside
            business hours. Defaults to 0.7, the previously hard-coded value.

    Returns:
        datetime: A naive ``datetime`` in local time.

    Raises:
        ValueError: If ``off_hours_probability`` is outside [0, 1].
    """
    if not 0.0 <= off_hours_probability <= 1.0:
        raise ValueError(
            f"off_hours_probability must be between 0 and 1, got {off_hours_probability!r}"
        )

    midnight_today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    trade_day = midnight_today - timedelta(days=random.randint(1, 365))

    if random.random() < off_hours_probability:
        # Outside regular hours: early morning or late night, equally likely
        if random.random() < 0.5:
            hour = random.randint(0, 7)
        else:
            hour = random.randint(18, 23)
    else:
        # Normal business hours
        hour = random.randint(9, 16)

    return trade_day.replace(hour=hour)

# Function to derive a settlement date from a trade date
def generate_settlement_date(trade_date, is_fraud=False):
    """Return ``trade_date`` shifted forward by a randomly chosen number of days.

    Args:
        trade_date: Timestamp of the trade.
        is_fraud: When true, draw the delay from the fraudulent pattern
            (same-day, 10-30 days, or 1-3 days); otherwise use T+2 with an
            occasional 1-3 day variation.

    Returns:
        The settlement timestamp (same time of day as ``trade_date``).
    """
    if not is_fraud:
        # Legitimate: T+2 by default, with a 10% chance of a 1-3 day variation
        offset_days = random.randint(1, 3) if random.random() < 0.1 else 2
        return trade_date + timedelta(days=offset_days)

    # Fraudulent: unusual settlement timeframes dominate
    if random.random() < 0.6:
        # Immediate settlement (suspicious)
        offset_days = 0
    elif random.random() < 0.8:
        # Very delayed settlement
        offset_days = random.randint(10, 30)
    else:
        # Normal-looking timeframe to blend in
        offset_days = random.randint(1, 3)
    return trade_date + timedelta(days=offset_days)

# Helpers and entry point for generating a fake stock settlement with fraud patterns

def _fraud_transaction_fields():
    """Draw the randomized fields of a fraudulent settlement (labelled ``is_fraud=True``)."""
    # FRAUD PATTERN 1: Price anomalies
    if random.random() < 0.7:
        price = round(random.uniform(20000, 100000), 2)  # Unusually high prices
    else:
        # Sometimes fraudsters use seemingly normal prices to blend in
        price = round(random.uniform(10, 5000), 2)

    # FRAUD PATTERN 2: Quantity anomalies
    if random.random() < 0.6:
        quantity = random.randint(50000, 1000000)  # Unusually large quantities
    elif random.random() < 0.3:
        quantity = random.randint(1, 10)  # Unusually small quantities
    else:
        quantity = random.randint(100, 10000)  # Normal quantities to blend in

    # FRAUD PATTERN 3: Entity relationships
    if random.random() < 0.4:
        # Same entity as both buyer and seller (self-dealing); half of the
        # time a legitimate company name is used to appear normal
        if random.random() < 0.5:
            buyer = seller = random.choice(FRAUDULENT_COMPANIES)
        else:
            buyer = seller = random.choice(LEGITIMATE_COMPANIES)
    elif random.random() < 0.7:
        # Different entities, suspicious buyer
        buyer = random.choice(FRAUDULENT_COMPANIES)
        if random.random() < 0.3:
            seller = random.choice(FRAUDULENT_COMPANIES)
        else:
            seller = fake.company()
    else:
        # Different entities, suspicious seller
        buyer = fake.company()
        seller = random.choice(FRAUDULENT_COMPANIES)

    # FRAUD PATTERN 4: Suspicious stock symbols
    if random.random() < 0.8:
        stock_symbol = random.choice(SUSPICIOUS_SYMBOLS)
    else:
        stock_symbol = fake.lexify(text='???', letters='ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # FRAUD PATTERN 5: Status distribution
    status = random.choices(
        ["Pending", "Failed", "Settled", "Reversed", "Disputed"],
        weights=[0.4, 0.3, 0.1, 0.1, 0.1],
        k=1
    )[0]

    # Additional fraud indicators
    reversal_attempts = random.randint(0, 3) if random.random() < 0.4 else 0
    manual_review_flag = random.choices([True, False], weights=[0.7, 0.3], k=1)[0]
    settlement_method = random.choices(
        ["Direct", "Escrow", "Third-party", "Complex"],
        weights=[0.2, 0.2, 0.3, 0.3],
        k=1
    )[0]

    return {
        "stock_symbol": stock_symbol,
        "quantity": quantity,
        "price": price,
        "buyer": buyer,
        "seller": seller,
        "status": status,
        "reversal_attempts": reversal_attempts,
        "manual_review_flag": manual_review_flag,
        "settlement_method": settlement_method,
        "is_fraud": True,  # This is our target label
    }


def _legitimate_transaction_fields():
    """Draw the randomized fields of a legitimate settlement (labelled ``is_fraud=False``)."""
    price = round(random.uniform(10, 5000), 2)
    quantity = random.randint(10, 10000)

    # Legitimate entities
    if random.random() < 0.7:
        buyer = random.choice(LEGITIMATE_COMPANIES)
    else:
        buyer = fake.company()

    if random.random() < 0.05:  # 5% chance buyer and seller are same (can happen legitimately)
        seller = buyer
    elif random.random() < 0.7:
        seller = random.choice(LEGITIMATE_COMPANIES)
    else:
        seller = fake.company()

    # Normal stock symbols
    stock_symbol = fake.lexify(text='???', letters='ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # Status distribution for legitimate transactions
    status = random.choices(
        ["Settled", "Pending", "Failed"],
        weights=[0.9, 0.07, 0.03],
        k=1
    )[0]

    # Additional legitimate indicators
    manual_review_flag = random.choices([True, False], weights=[0.05, 0.95], k=1)[0]
    settlement_method = random.choices(
        ["Standard", "Expedited", "Direct"],
        weights=[0.8, 0.15, 0.05],
        k=1
    )[0]

    return {
        "stock_symbol": stock_symbol,
        "quantity": quantity,
        "price": price,
        "buyer": buyer,
        "seller": seller,
        "status": status,
        "reversal_attempts": 0,
        "manual_review_flag": manual_review_flag,
        "settlement_method": settlement_method,
        "is_fraud": False,  # This is our target label
    }


def _derived_features(trade_date, settlement_date, price, quantity, buyer, seller):
    """Compute the engineered time, value, settlement and entity features for one record."""
    settlement_delay = (settlement_date - trade_date).days
    return {
        # Time-based features
        "trade_hour": trade_date.hour,
        "trade_day_of_week": trade_date.weekday(),
        "is_weekend": 1 if trade_date.weekday() >= 5 else 0,
        "is_after_hours": 1 if (trade_date.hour < 9 or trade_date.hour >= 17) else 0,
        # Value-based features; rounded to cents so binary floating-point
        # noise (e.g. 1234.0000000000002) does not leak into the payload
        "total_value": round(price * quantity, 2),
        # Settlement delay (anything other than T+2 counts as unusual)
        "settlement_delay_days": settlement_delay,
        "unusual_settlement_time": 1 if settlement_delay != 2 else 0,
        # Entity-related features
        "same_buyer_seller": 1 if buyer == seller else 0,
        "buyer_is_known_fraudulent": 1 if buyer in FRAUDULENT_COMPANIES else 0,
        "seller_is_known_fraudulent": 1 if seller in FRAUDULENT_COMPANIES else 0,
        "buyer_is_known_legitimate": 1 if buyer in LEGITIMATE_COMPANIES else 0,
        "seller_is_known_legitimate": 1 if seller in LEGITIMATE_COMPANIES else 0,
    }


def create_stock_settlement(fraud_percentage=0.05):
    """Generate a fake stock settlement record with sophisticated fraud patterns.

    Args:
        fraud_percentage: Probability that the generated record is fraudulent.

    Returns:
        dict: A flat record holding the transaction id, ISO-formatted trade and
        settlement dates, the trade fields (symbol, quantity, price, buyer,
        seller, status, reversal attempts, review flag, settlement method),
        the ``is_fraud`` label, and the derived features.
    """
    is_fraud = random.random() < fraud_percentage

    # Dates are kept as datetime objects for feature computation and are
    # serialized to ISO strings only when the record is assembled.
    trade_date = generate_trade_date()
    settlement_date = generate_settlement_date(trade_date, is_fraud)

    # Common fields for both fraud and legitimate transactions
    transaction_id = fake.uuid4()

    fields = _fraud_transaction_fields() if is_fraud else _legitimate_transaction_fields()

    # Create the base transaction
    transaction = {
        "transaction_id": transaction_id,
        "trade_date": trade_date.isoformat(),
        "settlement_date": settlement_date.isoformat(),
        **fields,
    }

    # PATTERN 6: Feature Engineering - Derived features that will help ML models
    transaction.update(
        _derived_features(
            trade_date,
            settlement_date,
            fields["price"],
            fields["quantity"],
            fields["buyer"],
            fields["seller"],
        )
    )

    return transaction

# Continuously generate and push stock settlements to Kafka
record_count = 0  # running total of records written so far

try:
    # Runs until a SIGINT/SIGTERM flips should_shutdown (checked once per batch)
    while not should_shutdown:
        # Randomize batch size for dynamic event generation
        batch_size = random.randint(1000, 8000)

        # Generate stock settlements; each becomes one Kafka message keyed by
        # its transaction id, with the full record as a JSON string value
        settlements = [create_stock_settlement() for _ in range(batch_size)]
        records = [
            {"key": settlement["transaction_id"], "value": json.dumps(settlement)}
            for settlement in settlements
        ]

        # Create DataFrame from stock settlements
        df = spark.createDataFrame(records)

        # Write DataFrame as JSON to Kafka topic.
        # Options prefixed with "kafka." are handed to the Kafka producer with
        # that prefix stripped, so the producer's batch size must be spelled
        # "kafka.batch.size"; "kafka.producer.batch.size" would arrive as the
        # unknown setting "producer.batch.size" and have no effect.
        df.write \
          .format("kafka") \
          .option("kafka.bootstrap.servers", kafka_brokers) \
          .option("topic", kafka_topic) \
          .option("kafka.batch.size", 1000000) \
          .option("kafka.enable.idempotence", False) \
          .save()

        record_count += batch_size
        # "\r" keeps the progress counter on a single, continuously updated line
        print(f"Produced: {record_count} records", end="\r")

        # Reduce sleep time or remove it for high throughput
        time.sleep(0.1)  # Adjust sleep time as needed
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Always release the Spark session, whether the loop ended by signal or error
    print("\nShutting down Spark session...")
    spark.stop()
    print("Spark session stopped. Goodbye!")

Produced: 25577 records
Graceful shutdown initiated...
Produced: 30012 records
Shutting down Spark session...
Spark session stopped. Goodbye!
