In [1]:
from pyspark.sql import SparkSession

# Baseline session for the comparison: a single 4-core executor with a 512 MB
# heap on YARN, dynamic allocation switched off and off-heap memory disabled.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .enableHiveSupport()
    .appName("Spark NO_OFF_HEAP Sort")
    .master("yarn")
    .config("spark.executor.instances", "1")
    .config("spark.executor.memory", "512MB")
    .config("spark.executor.cores", "4")
    .config("spark.dynamicAllocation.enabled", "False")
    .config("spark.memory.offHeap.enabled", "False")
    .getOrCreate()
)
   

In [37]:
# The session is deliberately NOT stopped here. Every cell from this point down
# to the no-op write still needs the live `spark` session, so calling
# spark.stop() at this position makes a top-to-bottom run (Restart & Run All)
# fail on the very next cell. This cell carried execution count 37, i.e. it was
# executed out of order; the baseline session is stopped after the write instead.


In [2]:
# Display the application id of this session (useful for locating the run in
# the cluster UI / history server).
spark.sparkContext.applicationId

'application_1745651200635_14122'

In [5]:
# Use 20 shuffle partitions for this run, then read the setting back so the
# cell output confirms the value that is actually in effect.
spark.conf.set("spark.sql.shuffle.partitions", "20")
spark.conf.get("spark.sql.shuffle.partitions")

'20'

In [6]:
 #Disable AQE and Broadcast join

# Turn off adaptive query execution (and its partition coalescing) so the
# number of shuffle partitions is not changed at runtime.
spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)
# A threshold of -1 disables automatic broadcast joins.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)


In [7]:
# Source data: 100 million consecutive integers (0 .. 99,999,999) exposed as a
# single column named "value".

df = spark.range(100_000_000).withColumnRenamed("id", "value")


In [8]:
# Derive the partitioning column: key = value mod 100, giving 100 distinct keys.

from pyspark.sql.functions import col

key_df = df.select("value", (col("value") % 100).alias("key"))


In [9]:
# Hash-partition the rows by key (this is the shuffle), then sort the rows
# inside each resulting partition by the same key.

sorted_df = (
    key_df
    .repartition("key")
    .sortWithinPartitions("key")
)


In [10]:
# Action that runs the whole pipeline: the "noop" sink processes every row but
# writes nothing, so only the shuffle + sort is measured.
# Took about 17 sec in this (off-heap disabled) run.

sorted_df.write.format("noop").mode("overwrite").save()


In [28]:
# Stop the baseline session so that the getOrCreate() call in the next cell
# builds a new session with the off-heap settings instead of reusing this one.
spark.stop()

In [1]:
from pyspark.sql import SparkSession

# Off-heap variant of the session: same single 4-core / 512 MB executor on
# YARN, but with off-heap memory enabled and sized at 1g.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .enableHiveSupport()
    .appName("Spark OFF_HEAP Sort")
    .master("yarn")
    .config("spark.executor.instances", "1")
    .config("spark.executor.memory", "512MB")
    .config("spark.executor.cores", "4")
    .config("spark.dynamicAllocation.enabled", "False")
    .config("spark.memory.offHeap.enabled", "True")
    .config("spark.memory.offHeap.size", "1g")
    .getOrCreate()
)

In [2]:
# Display the application id of the off-heap session.
spark.sparkContext.applicationId

'application_1745651200635_13448'

In [3]:
# Disable AQE (and its partition coalescing) and automatic broadcast joins,
# matching the settings used for the baseline run.

spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

# Same 20 shuffle partitions as the baseline; the get() echoes the value back.
spark.conf.set("spark.sql.shuffle.partitions", "20")
spark.conf.get("spark.sql.shuffle.partitions")

'20'

In [4]:
# Input frame: the integers 0 .. 99,999,999 in one column called "value".
df = spark.range(100_000_000).withColumnRenamed("id", "value")


In [5]:
# Attach the partitioning column: key = value mod 100 (100 distinct keys).

from pyspark.sql.functions import col

key_df = df.select("value", (col("value") % 100).alias("key"))


In [6]:
# Shuffle the rows into hash partitions on key, then order each partition by key.

sorted_df = (
    key_df
    .repartition("key")
    .sortWithinPartitions("key")
)


In [7]:
# Print the physical plan: the Exchange node is the shuffle produced by
# repartition("key"), and the Sort above it is the per-partition sort.
sorted_df.explain()

== Physical Plan ==
*(2) Sort [key#4L ASC NULLS FIRST], false, 0
+- Exchange hashpartitioning(key#4L, 20), REPARTITION, [id=#16]
   +- *(1) Project [id#0L AS value#2L, (id#0L % 100) AS key#4L]
      +- *(1) Range (0, 100000000, step=1, splits=4)




In [7]:
# Action that runs the whole pipeline: the "noop" sink processes every row but
# writes nothing. Took about 13 sec in this (off-heap enabled) run.

sorted_df.write.format("noop").mode("overwrite").save()


In [39]:
# Count how many rows ended up in each partition of sorted_df.
from pyspark.sql.functions import spark_partition_id, count, lit, desc

part_df = (
    sorted_df
    .withColumn("partition_num", spark_partition_id())
    .groupBy("partition_num")
    .agg(count(lit(1)).alias("count"))
)

# NOTE(review): the saved output below lists partition ids up to 198, which does
# not fit the 20 shuffle partitions configured earlier in this notebook. It was
# presumably captured in a session still using the default of 200 - re-run to confirm.
part_df.orderBy(desc("partition_num")).show()

+-------------+-------+
|partition_num|  count|
+-------------+-------+
|          198|1000000|
|          195|3000000|
|          192|1000000|
|          190|2000000|
|          185|1000000|
|          184|1000000|
|          182|1000000|
|          179|1000000|
|          177|1000000|
|          175|2000000|
|          174|1000000|
|          172|1000000|
|          169|1000000|
|          168|3000000|
|          167|1000000|
|          166|2000000|
|          164|2000000|
|          161|1000000|
|          159|1000000|
|          154|1000000|
+-------------+-------+
only showing top 20 rows



In [8]:
# End of the off-heap run: stop the session.
spark.stop()

In [None]:
Conclusion:
-> Off-heap memory keeps the data in memory, but in serialized format.
-> It is faster than disk, but because the data is stored in serialized format it has to be deserialized before it can be used.

| Problem                     | Recommended Fix                     |
| --------------------------- | ----------------------------------- |
| GC overhead                 | Use `OFF_HEAP` or `MEMORY_ONLY_SER` |
| Not enough memory           | Use `MEMORY_AND_DISK` instead       |
| Need better performance     | Use `OFF_HEAP` (if enabled)         |
| Want to reduce JVM pressure | Use serialized caching levels       |


In [None]:
# Start a local 4-thread spark-shell with a 1g driver and 512m of off-heap memory enabled.
spark-shell \
  --master "local[4]" \
  --driver-memory 1g \
  --conf spark.memory.offHeap.enabled=true \
  --conf spark.memory.offHeap.size=512m

In [None]:
// Scala (spark-shell) version of the experiment above: build (key, value)
// pairs where key = value % 100, for the values 0 until 100000000.
val df = spark.range(0, 100000000).map(x => (x % 100, x)).toDF("key", "value")

// Repartition by key (shuffle) and sort inside each partition; show() is the
// action that triggers execution and prints the first rows.
val sorted = df.repartition($"key").sortWithinPartitions($"key") 
sorted.show()

In [None]:
// NOTE(review): these two statements repeat the end of the preceding cell
// verbatim (they redefine `sorted` and run the same show()); presumably kept to
// re-run the sort on its own - confirm whether this cell is still needed.
val sorted = df.repartition($"key").sortWithinPartitions($"key") 
sorted.show()