In [1]:
import os
os.environ['PYSPARK_PYTHON'] = '/Users/priyasuresh/.pyenv/shims/python3.10'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/Users/priyasuresh/.pyenv/shims/python3.10'


In [2]:
!which python

/Users/priyasuresh/.pyenv/versions/3.10.4/bin/python


In [1]:
import numpy as np
import hashlib
from pyspark.sql.functions import col, from_json
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from scipy.stats import laplace  # Import Laplace distribution for differential privacy

# Define SimpleFilter class
class SimpleFilter:
    def __init__(self, size, hash_count):
        self.size = size
        self.hash_count = hash_count
        self.bit_array = np.zeros(size, dtype=np.bool_)

    def _hash(self, item, seed):
        hash_val = int(hashlib.md5(item.encode('utf-8')).hexdigest(), 16)
        return (hash_val + seed) % self.size

    def add(self, item):
        for seed in range(self.hash_count):
            index = self._hash(item, seed)
            self.bit_array[index] = True

    def check(self, item):
        for seed in range(self.hash_count):
            index = self._hash(item, seed)
            if not self.bit_array[index]:
                return False
        return True

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
from scipy.stats import laplace

# Initialize Spark session
spark = SparkSession \
    .builder \
    .appName("StreamingWithBloomFilter") \
    .master("local[*]") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1") \
    .getOrCreate()

# Define the schema of the message
messageSchema = StructType([
    StructField("symbol_id", StringType()),
    StructField("sequence", StringType()),
    StructField("time_exchange", TimestampType()),
    StructField("time_coinapi", TimestampType()),
    StructField("uuid", StringType()),
    StructField("price", DoubleType()),
    StructField("size", DoubleType()),
    StructField("taker_side", StringType())
])

# Subscribe to multiple topics
topic_names = "bitstamp_btc_usd,bitfinex_btc_usd,kraken_btc_usd"

# Create a streaming DataFrame that reads from the topics
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", topic_names) \
    .load()

# Deserialize the JSON data and apply the schema
value_df = df.select(from_json(col("value").cast("string"), messageSchema).alias("data")).select("data.*")

# Initialize the Bloom Filter
bloom_filter = SimpleFilter(10000, 3)

# Function to add Laplacian noise for differential privacy
def add_laplace_noise(data, epsilon=1.0, sensitivity=1):
    try:
        # Convert data to float and then add Laplace noise
        data_float = float(data)
        scale = sensitivity / epsilon
        noise = laplace.rvs(scale=scale, size=1)[0]
        return str(data_float + noise)  # Convert back to string after adding noise
    except ValueError:
        # If data cannot be converted to float (e.g., UUID), return the original value
        return data

# Define process_batch function with differential privacy
def process_batch(batch_df, batch_id):
    # Convert to Pandas DataFrame for easier manipulation
    pandas_df = batch_df.toPandas()
    
    # Apply differential privacy to each UUID
    pandas_df['uuid'] = pandas_df['uuid'].apply(add_laplace_noise)
    
    # Apply the Bloom Filter to each row
    for index, row in pandas_df.iterrows():
        uuid = row['uuid']
        if not bloom_filter.check(uuid):
            bloom_filter.add(uuid)  # Use add method to insert UUID into the Bloom filter
            # Process the row since it's not a duplicate
            print(row)

# Apply the function to each micro-batch
query = value_df.writeStream.foreachBatch(process_batch).start()

# Wait for the stream to finish
query.awaitTermination()



24/03/29 14:17:23 WARN Utils: Your hostname, Priyas-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.68.59 instead (on interface en0)
24/03/29 14:17:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/priyasuresh/.ivy2/cache
The jars for the packages stored in: /Users/priyasuresh/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-eb675797-7adb-4014-826a-69138f403342;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/priyasuresh/.pyenv/versions/3.10.4/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 193ms :: artifacts dl 6ms
	:: modules in use:
	com.google.code.findbugs#jsr305;3.0.0 from central in [default]
	commons-logging#commons-logging;1.1.3 from central in [default]
	org.apache.commons#commons-pool2;2.11.1 from central in [default]
	org.apache.hadoop#hadoop-client-api;3.3.4 from central in [default]
	org.apache.hadoop#h

symbol_id                         KRAKEN_SPOT_BTC_USD
sequence                                        25181
time_exchange              2024-03-29 14:17:28.201206
time_coinapi               2024-03-29 14:17:28.221169
uuid             5f9d98cd-fe5d-45be-92bc-fa9cfbe11f37
price                                         69580.0
size                                             0.05
taker_side                                       SELL
Name: 0, dtype: object
symbol_id                        KRAKEN_SPOT_BTC_USDT
sequence                                         5577
time_exchange              2024-03-29 14:17:28.538023
time_coinapi               2024-03-29 14:17:28.565928
uuid             2ab89a35-e8bd-4eed-b2f2-c16a5e39215a
price                                         69505.0
size                                         0.001659
taker_side                                       SELL
Name: 0, dtype: object
symbol_id                         KRAKEN_SPOT_BTC_USD
sequence                            

                                                                                

symbol_id                       BITSTAMP_SPOT_BTC_USD
sequence                                        18263
time_exchange              2024-03-29 14:17:48.849000
time_coinapi               2024-03-29 14:17:48.878439
uuid             fc9e6bf3-a39d-4d79-a1ca-255a5cafeefe
price                                         69552.0
size                                          0.11402
taker_side                                        BUY
Name: 0, dtype: object
symbol_id                        KRAKEN_SPOT_BTC_USDT
sequence                                         5579
time_exchange              2024-03-29 14:17:49.213182
time_coinapi               2024-03-29 14:17:49.240205
uuid             435544b8-f985-46f9-8c3d-4b4f370a2de7
price                                         69505.0
size                                          0.00192
taker_side                                       SELL
Name: 0, dtype: object
symbol_id                        KRAKEN_SPOT_BTC_USDT
sequence                            

                                                                                

symbol_id                      BITFINEX_SPOT_BTC_USDT
sequence                                        10726
time_exchange              2024-03-29 14:18:07.516000
time_coinapi               2024-03-29 14:18:07.532222
uuid             41bb8dcb-a2b5-4724-9f2a-def4c457ca9d
price                                         69500.0
size                                         0.000216
taker_side                                       SELL
Name: 0, dtype: object


                                                                                

symbol_id                         KRAKEN_SPOT_BTC_USD
sequence                                        25199
time_exchange              2024-03-29 14:18:10.026467
time_coinapi               2024-03-29 14:18:10.046193
uuid             c3afe02e-0cb4-4126-aaea-d551497e5184
price                                         69580.1
size                                          0.00149
taker_side                                        BUY
Name: 0, dtype: object
symbol_id                      BITFINEX_SPOT_BTC_USDT
sequence                                        10728
time_exchange              2024-03-29 14:18:10.391000
time_coinapi               2024-03-29 14:18:10.407120
uuid             99bef732-757d-41ee-80a2-04cc1e3ce16e
price                                         69501.0
size                                         0.002346
taker_side                                        BUY
Name: 0, dtype: object
symbol_id                        KRAKEN_SPOT_BTC_USDT
sequence                            

                                                                                

symbol_id                       BITSTAMP_SPOT_BTC_USD
sequence                                        18265
time_exchange              2024-03-29 14:18:31.920000
time_coinapi               2024-03-29 14:18:31.946020
uuid             b2bd1888-69a2-4083-8c81-29dd7c62f59c
price                                         69540.0
size                                          0.00043
taker_side                                       SELL
Name: 0, dtype: object
symbol_id                       BITSTAMP_SPOT_BTC_USD
sequence                                        18267
time_exchange              2024-03-29 14:18:34.278000
time_coinapi               2024-03-29 14:18:34.342396
uuid             5e9680a6-9ae5-4d2e-a2f4-80e79d0e0ab6
price                                         69541.0
size                                          0.00062
taker_side                                        BUY
Name: 0, dtype: object
symbol_id                         KRAKEN_SPOT_BTC_USD
sequence                            

                                                                                

symbol_id                         KRAKEN_SPOT_BTC_USD
sequence                                        25209
time_exchange              2024-03-29 14:18:52.758919
time_coinapi               2024-03-29 14:18:52.778969
uuid             9a228bd0-665a-4bc3-a0e6-21cd657b9414
price                                         69580.0
size                                         0.000128
taker_side                                       SELL
Name: 0, dtype: object
symbol_id                         KRAKEN_SPOT_BTC_USD
sequence                                        25211
time_exchange              2024-03-29 14:18:52.868824
time_coinapi               2024-03-29 14:18:52.888651
uuid             95de03e9-aad4-484a-ad33-90f56f004192
price                                         69580.1
size                                         0.180942
taker_side                                        BUY
Name: 2, dtype: object
symbol_id                       BITSTAMP_SPOT_BTC_USD
sequence                            

                                                                                

symbol_id                         KRAKEN_SPOT_BTC_USD
sequence                                        25219
time_exchange              2024-03-29 14:19:08.393218
time_coinapi               2024-03-29 14:19:08.413893
uuid             437bbd72-f7c2-4f80-8c26-a0440b1d8f32
price                                         69580.0
size                                         0.359256
taker_side                                       SELL
Name: 0, dtype: object
symbol_id                         KRAKEN_SPOT_BTC_USD
sequence                                        25221
time_exchange              2024-03-29 14:19:09.372354
time_coinapi               2024-03-29 14:19:09.392219
uuid             2e1dd3d2-3cd8-4cde-8c70-768b8c34e956
price                                         69580.1
size                                          0.00081
taker_side                                        BUY
Name: 0, dtype: object
symbol_id                         KRAKEN_SPOT_BTC_USD
sequence                            

                                                                                

symbol_id                       BITSTAMP_SPOT_BTC_USD
sequence                                        18295
time_exchange              2024-03-29 14:19:33.277000
time_coinapi               2024-03-29 14:19:33.305709
uuid             18433093-8180-4970-a5b8-11d4239670ea
price                                         69531.0
size                                         0.014382
taker_side                                        BUY
Name: 0, dtype: object
symbol_id                       BITSTAMP_SPOT_BTC_USD
sequence                                        18297
time_exchange              2024-03-29 14:19:33.277000
time_coinapi               2024-03-29 14:19:33.320582
uuid             8a368433-cb80-4c3b-b44e-662495434956
price                                         69533.0
size                                          0.43145
taker_side                                        BUY
Name: 0, dtype: object
symbol_id                       BITSTAMP_SPOT_BTC_USD
sequence                            

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/priyasuresh/.pyenv/versions/3.10.4/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=80>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/priyasuresh/.pyenv/versions/3.10.4/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/priyasuresh/.pyenv/versions/3.10.4/lib/python3.10/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/priyasuresh/.pyenv/versions/3.10.4/lib/python3.10/site-packages/py4j/clients

Py4JError: An error occurred while calling o51.awaitTermination

symbol_id                      BITFINEX_SPOT_BTC_USDT
sequence                                        10756
time_exchange              2024-03-29 14:19:41.687000
time_coinapi               2024-03-29 14:19:41.704322
uuid             dfbeffc4-766c-41fa-a57f-7f339a3b473c
price                                         69485.0
size                                         0.000866
taker_side                                       SELL
Name: 0, dtype: object
symbol_id                      BITFINEX_SPOT_BTC_USDT
sequence                                        10758
time_exchange              2024-03-29 14:19:41.687000
time_coinapi               2024-03-29 14:19:41.707740
uuid             da8af468-9617-4a3e-b469-ddde6d74fd4b
price                                         69485.0
size                                         0.001602
taker_side                                       SELL
Name: 2, dtype: object
symbol_id                        KRAKEN_SPOT_BTC_USDT
sequence                            

                                                                                

symbol_id                         KRAKEN_SPOT_BTC_USD
sequence                                        25249
time_exchange              2024-03-29 14:20:15.365273
time_coinapi               2024-03-29 14:20:15.385274
uuid             6d0f009b-4d0c-44ef-8ab5-72788c05339d
price                                         69580.0
size                                         0.000508
taker_side                                       SELL
Name: 0, dtype: object
symbol_id                       BITSTAMP_SPOT_BTC_USD
sequence                                        18313
time_exchange              2024-03-29 14:20:13.021000
time_coinapi               2024-03-29 14:20:13.060133
uuid             db154cf8-4fd5-4c56-b903-7aac7aa6b529
price                                         69530.0
size                                          0.00678
taker_side                                        BUY
Name: 0, dtype: object
symbol_id                       BITSTAMP_SPOT_BTC_USD
sequence                            

In [9]:
query.stop()

In [10]:
spark.stop()