In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast, col

# Build (or reuse, if one is already active) a session named "Spark-Join"
# that runs against the local[*] master.
spark = (
    SparkSession.builder
    .appName("Spark-Join")
    .master("local[*]")
    .getOrCreate()
)


In [None]:
# Baseline settings for the first join run.
# Fixed shuffle partition count.
spark.conf.set("spark.sql.shuffle.partitions", "200")
# -1 turns off size-based automatic broadcast joins.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")
# Adaptive query execution stays off for the baseline.
spark.conf.set("spark.sql.adaptive.enabled", "false")


In [None]:
# Location of the Online Retail CSV.
# The default is the original training-machine path; set the ONLINE_RETAIL_CSV
# environment variable to point at the file on any other machine instead of
# editing this cell.
csv_path = os.environ.get(
    "ONLINE_RETAIL_CSV",
    r"E:\pyspark-training\data\small\online_retail.csv",
)

# Read with a header row and let Spark infer column types from the data.
df = spark.read.csv(csv_path, header=True, inferSchema=True)

In [None]:
# Customer dimension: the distinct (CustomerID, Country) pairs found in df,
# ignoring any row where either of the two values is null.
customer_dim = (
    df.select(col("CustomerID"), col("Country"))
      .na.drop()
      .dropDuplicates()
)


In [None]:
# Action: number of rows in the customer dimension (runs a Spark job).
customer_dim.count()

In [None]:
# Inner join of the full retail data with the customer dimension on CustomerID.
# NOTE(review): customer_dim is selected from df and also carries "Country",
# so the result presumably contains two "Country" columns — confirm this is
# acceptable for anything beyond count()/explain().
joined_df = df.join(customer_dim, on="CustomerID", how="inner")


In [None]:
# Action: execute the join and return its row count.
joined_df.count()


In [None]:
# Print the extended plan output for the join so the chosen join strategy
# can be read off.
joined_df.explain(extended=True)

In [None]:
# AQE experiment: turn adaptive query execution on and leave the other two
# settings at their baseline values.
spark.conf.set("spark.sql.adaptive.enabled", "true")

# keep broadcast OFF on purpose
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

# keep shuffle partitions high to see AQE effect
spark.conf.set("spark.sql.shuffle.partitions", "200")

# Rebuild and run the join while AQE is still enabled. The following cell
# switches AQE off again, so without running a query here the settings above
# would never be exercised. The DataFrame is re-created so that its plan is
# produced under the current configuration.
joined_df = df.join(customer_dim, on="CustomerID", how="inner")
print(joined_df.count())
joined_df.explain(True)


In [None]:
# Broadcast-threshold experiment: AQE goes back off for this run.
spark.conf.set("spark.sql.adaptive.enabled", "false")
# Alternative left disabled on purpose: a 10 MB threshold.
#spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 10 * 1024 * 1024)  # 10 MB
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 1)  # 1 byte: negative test for the broadcast join
spark.conf.set("spark.sql.shuffle.partitions", "200")

In [None]:
# Deliberate stress test: ask Spark to broadcast a table that is meant to be
# far too large, in order to provoke an out-of-memory failure.

# Total number of copies of df that large_df should contain.
REPLICATION_FACTOR = 50

# large_df starts as one copy, so REPLICATION_FACTOR - 1 unions yield exactly
# REPLICATION_FACTOR copies.
large_df = df
for _ in range(REPLICATION_FACTOR - 1):
    large_df = large_df.union(df)

# broadcast() marks large_df with an explicit broadcast hint for this join.
joined_df = df.join(broadcast(large_df), on="CustomerID", how="inner")

# The failure is the point of this cell: show it instead of aborting the
# notebook, and show the count if the join unexpectedly succeeds.
try:
    print(joined_df.count())
except Exception as exc:
    message_lines = str(exc).strip().splitlines()
    first_line = message_lines[0] if message_lines else ""
    print(f"Broadcast join failed: {type(exc).__name__}: {first_line}")



In [None]:
# Re-run the customer-dimension join under whatever configuration is
# currently active, then execute it by counting the rows.
joined_df = df.join(customer_dim, "CustomerID", "inner")
joined_df.count()


In [2]:
spark.stop()

In [None]:
# SparkContext
sc = spark.sparkContext

# Total cores available
sc.defaultParallelism  # usually = total cores across all executors

# Configured executor memory
executor_memory = spark.conf.get("spark.executor.memory", "Not set")
print(f"Executor memory: {executor_memory}")

# Executor cores
executor_cores = spark.conf.get("spark.executor.cores", "Not set")
print(f"Executor cores per executor: {executor_cores}")

# Number of executors (if running on cluster)
num_executors = sc._conf.get("spark.executor.instances", "Not set")
print(f"Number of executors: {num_executors}")

spark.sparkContext.getConf().getAll()


