In [1]:
import os

# Point this kernel at the local Spark and JDK installs (this cell runs before
# the cell that imports pyspark).
# NOTE(review): these look like machine-specific Homebrew paths — adjust per host.
os.environ.update(
    {
        'SPARK_HOME': '/usr/local/Cellar/apache-spark/3.5.1/libexec',
        'JAVA_HOME': '/usr/local/opt/openjdk/libexec/openjdk.jdk/Contents/Home',
    }
)

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import BooleanType
import time

# Reuse the active Spark session if there is one, otherwise start a new one
# under this experiment's application name.
spark = (
    SparkSession.builder
    .appName("JoinPerformanceComparison")
    .getOrCreate()
)

# Sample Users data (A): ids are assigned sequentially from 1 in list order,
# and each email is the lowercased name at example.com.
dataA = [
    (user_id, name, f"{name.lower()}@example.com")
    for user_id, name in enumerate(
        [
            "Alice", "Bob", "Charlie", "David", "Eva",
            "Frank", "Grace", "Hannah", "Ivan", "Jack",
        ],
        start=1,
    )
]

# Sample Purchases data (B) as (user_id, product, amount) rows.
# User ids 2, 3 and 5 exist in A; 11 and 12 have no matching user.
dataB = [
    (2, "Laptop", 1200),
    (3, "Phone", 800),
    (5, "Headphones", 150),
    (11, "Monitor", 300),
    (12, "Tablet", 400),
]

# Turn the in-memory rows into Spark DataFrames, supplying the column names
# explicitly as the schema.
dfA = spark.createDataFrame(dataA, schema=["id", "name", "email"])
dfB = spark.createDataFrame(dataB, schema=["user_id", "product", "amount"])

# Pull every purchase user_id back to the driver, de-duplicate them into a
# Python set, and publish that set as a Spark broadcast variable.
user_ids_in_B = {row.user_id for row in dfB.select("user_id").collect()}
broadcast_user_ids = spark.sparkContext.broadcast(user_ids_in_B)

# Define a function to check membership
def is_not_in_broadcast(value):
    """Return True when ``value`` matches none of the broadcast purchase user ids.

    Args:
        value: A single user id taken from the users DataFrame; ``None`` when
            the column value is NULL.

    Returns:
        bool: ``True`` if the row should be kept by the anti-join (no matching
        purchase), ``False`` if a purchase with this user id exists.
    """
    # A NULL id never compares equal to anything in a join, so it can never be
    # "matched" -- even if the broadcast set itself contains None because a
    # purchase row had a NULL user_id. Without this guard such rows would be
    # dropped here but kept by the left-join + IS NULL formulation.
    if value is None:
        return True
    return value not in broadcast_user_ids.value

# Wrap the membership check as a Spark UDF that yields a boolean column
is_not_in_broadcast_udf = udf(is_not_in_broadcast, returnType=BooleanType())

# Timing the Bitset-Based Anti-Join (a UDF filter against the broadcast set of
# purchase user ids).
# perf_counter() is used instead of time() because it is a monotonic,
# high-resolution clock meant for measuring intervals; time() follows the
# system wall clock, which can be adjusted while the job runs.
start_time = time.perf_counter()
# filter() only builds the plan; show() is the action that executes it, so both
# stay inside the timed region.
result_df = dfA.filter(is_not_in_broadcast_udf(col("id")))
result_df.show()
end_time = time.perf_counter()
# NOTE(review): this is a single un-warmed run on ten rows — treat the number
# as indicative only when comparing against the join below.
print(f"Bitset-Based Anti-Join Time: {end_time - start_time} seconds")

# Timing the Left Join + Filter Null formulation of the same anti-join.
# perf_counter() is used instead of time() because it is a monotonic,
# high-resolution clock meant for measuring intervals; time() follows the
# system wall clock, which can be adjusted while the job runs.
start_time = time.perf_counter()
# Keep every user, attach purchase columns where id == user_id, then retain
# only the users for which no purchase row was attached. The purchase columns
# remain in the result (all NULL for the surviving rows).
# NOTE(review): Spark also offers how="left_anti", which returns only the left
# side's columns — consider it if the NULL columns are not wanted.
joined_df = dfA.join(dfB, dfA.id == dfB.user_id, how="left").filter(dfB.user_id.isNull())
joined_df.show()
end_time = time.perf_counter()
print(f"Left Join + Filter Null Time: {end_time - start_time} seconds")

# Stop the Spark session now that both timings have been printed.
# NOTE(review): re-running earlier cells after this will need a fresh session.
spark.stop()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/20 12:51:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+---+------+------------------+
| id|  name|             email|
+---+------+------------------+
|  1| Alice| alice@example.com|
|  4| David| david@example.com|
|  6| Frank| frank@example.com|
|  7| Grace| grace@example.com|
|  8|Hannah|hannah@example.com|
|  9|  Ivan|  ivan@example.com|
| 10|  Jack|  jack@example.com|
+---+------+------------------+

Bitset-Based Anti-Join Time: 3.3023719787597656 seconds
+---+------+------------------+-------+-------+------+
| id|  name|             email|user_id|product|amount|
+---+------+------------------+-------+-------+------+
|  1| Alice| alice@example.com|   NULL|   NULL|  NULL|
|  4| David| david@example.com|   NULL|   NULL|  NULL|
|  6| Frank| frank@example.com|   NULL|   NULL|  NULL|
|  7| Grace| grace@example.com|   NULL|   NULL|  NULL|
|  8|Hannah|hannah@example.com|   NULL|   NULL|  NULL|
|  9|  Ivan|  ivan@example.com|   NULL|   NULL|  NULL|
| 10|  Jack|  jack@example.com|   NULL|   NULL|  NULL|
+---+------+------------------+-------+--