In [4]:
from pyspark.sql import SparkSession
max1 = max
from pyspark.sql.functions import *
max = max1

# spark = SparkSession.builder.appName("RepartitionExample").getOrCreate()

# Create a large DataFrame (e.g., representing web server logs)
num_rows = 10000000  # 10 million rows
df = spark.range(num_rows).withColumn("event_time", current_timestamp()) \
    .withColumn("user_id", (rand() * 10000).cast("int")) \
    .withColumn("event_type", array(lit("login"), lit("logout"), lit("purchase"))[floor(rand() * 3).cast("int")])

# Initial number of partitions
initial_partitions = df.rdd.getNumPartitions()
print(f"Initial number of partitions: {initial_partitions}")

filtered_df = df.filter(col("event_type") == "purchase")

# Number of partitions AFTER filtering
filtered_partitions = filtered_df.rdd.getNumPartitions()
print(f"Number of partitions after filtering: {filtered_partitions}")

# Check the number of rows after filtering
filtered_count = filtered_df.count()
print(f"Number of rows after filtering: {filtered_count}")

# If the filtered data is significantly smaller but still has the same number of partitions, it's inefficient.
# Let's say, after filtering, we only have 1/10th of the rows but the same number of partitions.

# Determine an appropriate number of partitions for the filtered data.
# A general guideline is to aim for partitions of around 100-200MB each.
# For simplicity, let's just reduce the number of partitions proportionally to the reduction in data size.
target_partitions = max(1, int(filtered_count / (num_rows / initial_partitions) / 10))
print(f"Target number of partitions: {target_partitions}")

# Repartition the filtered DataFrame
repartitioned_df = filtered_df.repartition(target_partitions)

# Number of partitions AFTER repartitioning
repartitioned_partitions = repartitioned_df.rdd.getNumPartitions()
print(f"Number of partitions after repartitioning: {repartitioned_partitions}")

# Perform a subsequent operation on the repartitioned DataFrame (e.g., aggregation)
# This operation will now be more efficient due to better resource utilization
aggregated_df = repartitioned_df.groupBy("user_id").count()
aggregated_df.explain() #show the execution plan

spark.stop()

Initial number of partitions: 4
Number of partitions after filtering: 4


                                                                                

Number of rows after filtering: 3333011
Target number of partitions: 1




Number of partitions after repartitioning: 1
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[user_id#7], functions=[count(1)])
   +- HashAggregate(keys=[user_id#7], functions=[partial_count(1)])
      +- Exchange SinglePartition, REPARTITION_BY_NUM, [plan_id=133]
         +- Project [user_id#7]
            +- Filter (isnotnull(event_type#11) AND (event_type#11 = purchase))
               +- Project [user_id#7, [login,logout,purchase][cast(FLOOR((rand(-3131851920966333594) * 3.0)) as int)] AS event_type#11]
                  +- Project [cast((rand(-7079338601226393055) * 10000.0) as int) AS user_id#7]
                     +- Range (0, 10000000, step=1, splits=4)




In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("CacheShufflingExample").getOrCreate()

# Sample Data (simulating sales data)
data = [
    ("A", "Product1", 100),
    ("B", "Product2", 200),
    ("A", "Product3", 150),
    ("C", "Product1", 50),
    ("B", "Product2", 250),
    ("A", "Product1", 120),
    ("C", "Product3", 80),
    ("B", "Product1", 180),
    ("A", "Product2", 90)
]

df = spark.createDataFrame(data, ["customer", "product", "sales"])

# --- Without Caching ---
print("Without Caching:")

# First aggregation (sales per customer)
customer_sales = df.groupBy("customer").agg(sum("sales").alias("total_sales"))
customer_sales.explain()
customer_sales.show()

# Second aggregation (average sales per customer)
average_sales = customer_sales.agg(avg("total_sales").alias("average_sales"))
average_sales.explain()
average_sales.show()

# Third aggregation (max sales per customer)
max_sales = customer_sales.agg(max("total_sales").alias("max_sales"))
max_sales.explain()
max_sales.show()

# --- With Caching ---
print("\nWith Caching:")

# First aggregation (sales per customer) and CACHING
customer_sales_cached = df.groupBy("customer").agg(sum("sales").alias("total_sales")).cache()
customer_sales_cached.explain()
customer_sales_cached.show()

# Second aggregation (average sales per customer) - uses the CACHED data
average_sales_cached = customer_sales_cached.agg(avg("total_sales").alias("average_sales"))
average_sales_cached.explain()
average_sales_cached.show()

# Third aggregation (max sales per customer) - uses the CACHED data
max_sales_cached = customer_sales_cached.agg(max("total_sales").alias("max_sales"))
max_sales_cached.explain()
max_sales_cached.show()

# Unpersist the cached DataFrame to release memory
customer_sales_cached.unpersist()

spark.stop()

25/01/23 04:31:28 INFO SparkEnv: Registering MapOutputTracker
25/01/23 04:31:28 INFO SparkEnv: Registering BlockManagerMaster
25/01/23 04:31:28 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/01/23 04:31:28 INFO SparkEnv: Registering OutputCommitCoordinator
25/01/23 04:31:28 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.spark_spark-avro_2.12-3.5.0.jar added multiple times to distributed cache.
25/01/23 04:31:28 WARN Client: Same path resource file:///root/.ivy2/jars/org.tukaani_xz-1.9.jar added multiple times to distributed cache.


Without Caching:
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[customer#35], functions=[sum(sales#37L)])
   +- Exchange hashpartitioning(customer#35, 1000), ENSURE_REQUIREMENTS, [plan_id=152]
      +- HashAggregate(keys=[customer#35], functions=[partial_sum(sales#37L)])
         +- Project [customer#35, sales#37L]
            +- Scan ExistingRDD[customer#35,product#36,sales#37L]




                                                                                

+--------+-----------+
|customer|total_sales|
+--------+-----------+
|       B|        630|
|       C|        130|
|       A|        460|
+--------+-----------+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[], functions=[avg(total_sales#45L)])
   +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=222]
      +- HashAggregate(keys=[], functions=[partial_avg(total_sales#45L)])
         +- HashAggregate(keys=[customer#35], functions=[sum(sales#37L)])
            +- Exchange hashpartitioning(customer#35, 1000), ENSURE_REQUIREMENTS, [plan_id=218]
               +- HashAggregate(keys=[customer#35], functions=[partial_sum(sales#37L)])
                  +- Project [customer#35, sales#37L]
                     +- Scan ExistingRDD[customer#35,product#36,sales#37L]




                                                                                

+-----------------+
|    average_sales|
+-----------------+
|406.6666666666667|
+-----------------+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[], functions=[max(total_sales#45L)])
   +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=336]
      +- HashAggregate(keys=[], functions=[partial_max(total_sales#45L)])
         +- HashAggregate(keys=[customer#35], functions=[sum(sales#37L)])
            +- Exchange hashpartitioning(customer#35, 1000), ENSURE_REQUIREMENTS, [plan_id=332]
               +- HashAggregate(keys=[customer#35], functions=[partial_sum(sales#37L)])
                  +- Project [customer#35, sales#37L]
                     +- Scan ExistingRDD[customer#35,product#36,sales#37L]


+---------+
|max_sales|
+---------+
|      630|
+---------+


With Caching:
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- InMemoryTableScan [customer#35, total_sales#105L]
      +- InMemoryRelation [customer#35, total_sales#105L], StorageLe

                                                                                

+--------+-----------+
|customer|total_sales|
+--------+-----------+
|       B|        630|
|       C|        130|
|       A|        460|
+--------+-----------+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[], functions=[avg(total_sales#105L)])
   +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=496]
      +- HashAggregate(keys=[], functions=[partial_avg(total_sales#105L)])
         +- InMemoryTableScan [total_sales#105L]
               +- InMemoryRelation [customer#35, total_sales#105L], StorageLevel(disk, memory, deserialized, 1 replicas)
                     +- AdaptiveSparkPlan isFinalPlan=false
                        +- HashAggregate(keys=[customer#35], functions=[sum(sales#37L)])
                           +- Exchange hashpartitioning(customer#35, 1000), ENSURE_REQUIREMENTS, [plan_id=499]
                              +- HashAggregate(keys=[customer#35], functions=[partial_sum(sales#37L)])
                                 +- Project [c

                                                                                

+-----------------+
|    average_sales|
+-----------------+
|406.6666666666667|
+-----------------+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[], functions=[max(total_sales#105L)])
   +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=561]
      +- HashAggregate(keys=[], functions=[partial_max(total_sales#105L)])
         +- InMemoryTableScan [total_sales#105L]
               +- InMemoryRelation [customer#35, total_sales#105L], StorageLevel(disk, memory, deserialized, 1 replicas)
                     +- AdaptiveSparkPlan isFinalPlan=false
                        +- HashAggregate(keys=[customer#35], functions=[sum(sales#37L)])
                           +- Exchange hashpartitioning(customer#35, 1000), ENSURE_REQUIREMENTS, [plan_id=564]
                              +- HashAggregate(keys=[customer#35], functions=[partial_sum(sales#37L)])
                                 +- Project [customer#35, sales#37L]
                                    +-

                                                                                

+---------+
|max_sales|
+---------+
|      630|
+---------+

