In [12]:
from pyspark.sql import SparkSession

# Build (or reuse) a Hive-enabled session on YARN with a single executor
# (1g heap, 4 cores), dynamic allocation turned off, and 1g of off-heap
# memory enabled.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .enableHiveSupport()
    .appName("Spark OffHeap Memory")
    .master("yarn")
    .config("spark.executor.instances", "1")
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", "4")
    .config("spark.dynamicAllocation.enabled", "False")
    .config("spark.memory.offHeap.enabled", "True")
    .config("spark.memory.offHeap.size", "1g")
    .getOrCreate()
)

In [11]:
# Stop the Spark session held in `spark`.
# NOTE(review): the execution counts show this cell (In [11]) ran *before* the
# session cell above was re-run (In [12]). On a top-to-bottom run it would stop
# the session that the cells below still use - confirm whether this cell
# belongs at the end of the notebook.
spark.stop()

In [3]:
# Display the application id of the running Spark application.
spark.sparkContext.applicationId

'application_1745651200635_12972'

In [13]:
# Disable AQE and broadcast joins

# Session-level SQL settings for the benchmark: adaptive query execution and
# its partition coalescing are switched off, and the auto-broadcast threshold
# is set to -1.
for conf_key, conf_value in (
    ("spark.sql.adaptive.enabled", False),
    ("spark.sql.adaptive.coalescePartitions.enabled", False),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
):
    spark.conf.set(conf_key, conf_value)


In [14]:
from pyspark.sql import SparkSession
from pyspark import StorageLevel
import time


In [15]:
# Benchmark input: the integers 0..9,999,999 in a single column named "number".
df = (
    spark
    .range(start=0, end=10_000_000)
    .withColumnRenamed("id", "number")
)

In [16]:
def benchmark_storage(df, storage_level, label):
    """Cache ``df`` at ``storage_level`` and print how long one aggregation takes.

    Any cache already held for ``df`` is dropped first, the DataFrame is
    persisted at the requested level and materialised with ``count()``, and
    then a single ``avg(number)`` aggregation over the cached data is timed.

    Args:
        df: DataFrame to benchmark. It must have a ``number`` column, because
            the timed query is ``avg(number)``.
        storage_level: Storage level handed to ``df.persist``.
        label: Text printed in front of the timing, left-aligned to 20 chars.

    Returns:
        None. The timing is printed as ``"<label>: <seconds> sec"``. ``df`` is
        left persisted at ``storage_level`` when the function returns.
    """
    # Drop any cache left by an earlier call before persisting at the new
    # level; blocking=True waits until the removal has finished.
    df.unpersist(blocking=True)
    df_cached = df.persist(storage_level)

    # Trigger caching so the timed query below reads the persisted data.
    df_cached.count()

    # Time one aggregation. perf_counter() is a monotonic, high-resolution
    # clock meant for measuring intervals, unlike the wall clock time.time().
    # Only a single run is measured, so the figure is indicative, not averaged.
    start = time.perf_counter()
    df_cached.selectExpr("avg(number)").collect()
    elapsed = time.perf_counter() - start

    print(f"{label:<20}: {elapsed:.3f} sec")

In [17]:
# Time the aggregation with the DataFrame cached at MEMORY_ONLY.
benchmark_storage(
    df=df,
    storage_level=StorageLevel.MEMORY_ONLY,
    label="MEMORY_ONLY",
)



MEMORY_ONLY         : 0.478 sec


In [18]:
# Time the aggregation with the DataFrame cached at DISK_ONLY.
benchmark_storage(
    df=df,
    storage_level=StorageLevel.DISK_ONLY,
    label="DISK_ONLY",
)



DISK_ONLY           : 0.120 sec


In [21]:
# Time the aggregation with the DataFrame cached at OFF_HEAP.
benchmark_storage(
    df=df,
    storage_level=StorageLevel.OFF_HEAP,
    label="OFF_HEAP",
)

OFF_HEAP            : 0.080 sec


In [20]:
# List everything Spark currently reports as persisted. getPersistentRDDs()
# reports RDDs only; in the output below the cached DataFrame appears as an
# RDD named after its plan.
# NOTE(review): `_jsc` is an underscore-prefixed (private) attribute of the
# SparkContext - presumably the JVM-side context handle; confirm it is
# available on the Spark version in use.
persistent_rdds = spark.sparkContext._jsc.getPersistentRDDs()
for rdd_id in persistent_rdds:
    # Iterating the returned map yields its keys (printed as the RDD id);
    # index back into the map to get the RDD handle itself.
    rdd = persistent_rdds[rdd_id]
    print(f"RDD ID: {rdd_id}")
    print(f"  Name: {rdd.name()}")
    print(f"  Partitions: {rdd.partitions().size()}")
    # description() gives the human-readable storage level string.
    print(f"  Storage Level: {rdd.getStorageLevel().description()}")


RDD ID: 46
  Name: *(1) Project [id#140L AS number#142L]
+- *(1) Range (0, 10000000, step=1, splits=4)

  Partitions: 4
  Storage Level: Disk Memory (off heap) Serialized 1x Replicated


In [None]:
Conclusion:
-> OFF_HEAP stores the data in memory (outside the JVM heap), but in serialized format.
-> It is better than disk, but because the data is stored in serialized format it has to be deserialized before it can be used.

| Problem                     | Recommended Fix                     |
| --------------------------- | ----------------------------------- |
| GC Overhead                 | Use `OFF_HEAP` or `MEMORY_ONLY_SER` |
| Memory not enough           | Use `MEMORY_AND_DISK` instead       |
| Need better performance     | Use `OFF_HEAP` (if enabled)         |
| Want to reduce JVM pressure | Use serialized caching levels       |


In [None]:
# Launch a local spark-shell (4 threads, 1g driver memory) with 1g of
# off-heap memory enabled.
spark-shell \
  --master "local[4]" \
  --driver-memory 1g \
  --conf spark.memory.offHeap.enabled=true \
  --conf spark.memory.offHeap.size=1g

In [None]:
// Same experiment in spark-shell: a 10M-row DataFrame with one column, "value".
val df = spark.range(0, 10000000).toDF("value")

// Before each run, drop the previous cache and wait for the removal to finish
// (blocking = true), matching the PySpark benchmark's unpersist(blocking=True).
// Then persist at the level under test and materialise it with count().

// MEMORY_ONLY
df.unpersist(blocking = true)
df.persist(org.apache.spark.storage.StorageLevel.MEMORY_ONLY)
df.count()

// DISK_ONLY
df.unpersist(blocking = true)
df.persist(org.apache.spark.storage.StorageLevel.DISK_ONLY)
df.count()


// OFF_HEAP (the shell is launched with spark.memory.offHeap.enabled=true)
df.unpersist(blocking = true)
df.persist(org.apache.spark.storage.StorageLevel.OFF_HEAP)
df.count()