In [1]:
import os
import warnings

warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Local install locations used as fallbacks. setdefault() keeps any value that
# is already exported in the environment, so the notebook also runs on machines
# where Java/Hadoop/Spark live somewhere else.
os.environ.setdefault("JAVA_HOME", "D:/Programs/Java")
os.environ.setdefault("HADOOP_HOME", "D:/Programs/hadoop")
os.environ.setdefault("SPARK_HOME", "D:/Programs/spark/spark-3.5.6-bin-hadoop3")

# findspark.init() makes the pyspark package of the given Spark install
# importable, so it has to run before the pyspark imports below. The path is
# read back from SPARK_HOME so it is defined in exactly one place.
import findspark
findspark.init(os.environ["SPARK_HOME"])

from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col

import time
def timed(label, func):
    """Run ``func`` once, print how long it took, and return the elapsed seconds.

    Args:
        label: Text for the report line (left-aligned, padded to 30 characters).
        func: Zero-argument callable to execute; its return value is discarded.

    Returns:
        float: Duration of the ``func()`` call in seconds. The printed line
        shows the same value rounded to two decimals.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; unlike time.time() it is unaffected by system clock adjustments.
    start = time.perf_counter()
    func()
    elapsed = time.perf_counter() - start
    print(f"{label:<30}: {elapsed:.2f} sec")
    return elapsed

In [2]:
# Spark options applied to the session, kept in one table for easy tuning.
spark_settings = {
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.shuffle.partitions": "8",
    "spark.executor.memory": "4g",
    "spark.driver.memory": "4g",
}

# Name the application and run it with the local[*] master.
session_builder = SparkSession.builder.appName("Testing Caching").master("local[*]")
for option_name, option_value in spark_settings.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

# Keep a handle on the underlying SparkContext and only log errors.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [3]:
# Relative path of the transactions dataset (Parquet).
transactions_file = "../../data/transactions.parquet"
df = spark.read.format("parquet").load(transactions_file)

In [4]:
# Basic transformation: keep the three columns the benchmarks use,
# with amt cast to double under its original name.
amt_as_double = col("amt").cast(DoubleType()).alias("amt")
df_transformed = df.select("cust_id", "expense_type", amt_as_double)

In [None]:
def no_cache():
    """Run the two benchmark aggregations on df_transformed without caching it."""
    counts_by_type = df_transformed.groupBy("expense_type").count()
    counts_by_type.show()
    avg_by_customer = df_transformed.groupBy("cust_id").avg("amt")
    avg_by_customer.show()

# Baseline measurement that the caching strategies are compared against.
baseline_time = timed("No Cache", no_cache)

+-------------------+--------+
|       expense_type|   count|
+-------------------+--------+
|       Motor/Travel| 4738090|
|          Groceries| 6473528|
|           Gambling|  958807|
|              Fines|    7467|
|Bills and Utilities| 1260478|
|      Entertainment|22417986|
|                Tax|  453669|
|             Health| 1136161|
|            Housing|  261668|
|          Education|  559518|
|            Savings|  357141|
|           Clothing| 1165579|
+-------------------+--------+

+----------+------------------+
|   cust_id|          avg(amt)|
+----------+------------------+
|CL3B876N0W| 55.22601100228109|
|CZMEO3I7BR|144.56504063974802|
|CB9X2QIFXC| 45.85590289720845|
|CF050PCFFL|117.22504949222264|
|CKVZS7W3MF| 68.98813863928103|
|CPK9DPIB9Z|127.85602702702685|
|CJUTITO23A|  55.4455711177591|
|C8F59WIEY1|174.73890040927765|
|CGAXKN70IJ| 88.56002359800117|
|CYETYN7E1E| 47.87958594730247|
|CQJ2ZRK7WW| 54.15641633064491|
|CS92OJU3GX|112.21643863959632|
|C51C18F19D|101.3222613

In [5]:
def with_cache():
    """Run the two benchmark aggregations against the cached df_transformed."""
    df_transformed.groupBy("expense_type").count().show()
    df_transformed.groupBy("cust_id").avg("amt").show()

df_transformed.cache()
try:
    df_transformed.count()  # action that materializes the cache before timing starts
    cache_time = timed("With Cache", with_cache)
finally:
    # blocking=True waits until the cached blocks are actually removed, so the
    # eviction cannot overlap with whichever benchmark runs next. The finally
    # clause also releases the cache if one of the queries above fails.
    df_transformed.unpersist(blocking=True)


+-------------------+--------+
|       expense_type|   count|
+-------------------+--------+
|       Motor/Travel| 4738090|
|          Groceries| 6473528|
|           Gambling|  958807|
|              Fines|    7467|
|Bills and Utilities| 1260478|
|      Entertainment|22417986|
|                Tax|  453669|
|             Health| 1136161|
|            Housing|  261668|
|          Education|  559518|
|            Savings|  357141|
|           Clothing| 1165579|
+-------------------+--------+

+----------+------------------+
|   cust_id|          avg(amt)|
+----------+------------------+
|CL3B876N0W| 55.22601100228109|
|CZMEO3I7BR|144.56504063974802|
|CB9X2QIFXC| 45.85590289720845|
|CF050PCFFL|117.22504949222264|
|CKVZS7W3MF| 68.98813863928103|
|CPK9DPIB9Z|127.85602702702685|
|CJUTITO23A|  55.4455711177591|
|C8F59WIEY1|174.73890040927765|
|CGAXKN70IJ| 88.56002359800117|
|CYETYN7E1E| 47.87958594730247|
|CQJ2ZRK7WW| 54.15641633064491|
|CS92OJU3GX|112.21643863959632|
|C51C18F19D|101.3222613

DataFrame[cust_id: string, expense_type: string, amt: double]

In [None]:
def with_persist():
    """Run the two benchmark aggregations against the persisted df_transformed."""
    df_transformed.groupBy("expense_type").count().show()
    df_transformed.groupBy("cust_id").avg("amt").show()

# Same benchmark as cache(), but with the storage level chosen explicitly.
df_transformed.persist(StorageLevel.MEMORY_AND_DISK)
try:
    df_transformed.count()  # action that materializes the persisted data before timing
    persist_time = timed("With Persist", with_persist)
finally:
    # blocking=True waits until the persisted blocks are actually removed, so
    # the eviction cannot overlap with whichever benchmark runs next. The
    # finally clause also releases the data if one of the queries above fails.
    df_transformed.unpersist(blocking=True)

+-------------------+--------+
|       expense_type|   count|
+-------------------+--------+
|       Motor/Travel| 4738090|
|          Groceries| 6473528|
|           Gambling|  958807|
|              Fines|    7467|
|Bills and Utilities| 1260478|
|      Entertainment|22417986|
|                Tax|  453669|
|             Health| 1136161|
|            Housing|  261668|
|          Education|  559518|
|            Savings|  357141|
|           Clothing| 1165579|
+-------------------+--------+

+----------+------------------+
|   cust_id|          avg(amt)|
+----------+------------------+
|CL3B876N0W| 55.22601100228109|
|CZMEO3I7BR|144.56504063974802|
|CB9X2QIFXC| 45.85590289720845|
|CF050PCFFL|117.22504949222264|
|CKVZS7W3MF| 68.98813863928103|
|CPK9DPIB9Z|127.85602702702685|
|CJUTITO23A|  55.4455711177591|
|C8F59WIEY1|174.73890040927765|
|CGAXKN70IJ| 88.56002359800117|
|CYETYN7E1E| 47.87958594730247|
|CQJ2ZRK7WW| 54.15641633064491|
|CS92OJU3GX|112.21643863959632|
|C51C18F19D|101.3222613

DataFrame[cust_id: string, expense_type: string, amt: double]

In [None]:
def cache_repartition():
    """Run the two benchmark aggregations against the repartitioned, cached data."""
    df_repart.groupBy("expense_type").count().show()
    df_repart.groupBy("cust_id").avg("amt").show()

# Redistribute the rows into 8 partitions by expense_type, then cache that layout.
df_repart = df_transformed.repartition(8, "expense_type").cache()
try:
    df_repart.count()  # action that performs the repartition and fills the cache
    repart_cache_time = timed("Cache + Repartition", cache_repartition)
finally:
    # blocking=True waits until the cached blocks are actually removed, so the
    # eviction cannot overlap with whichever benchmark runs next. The finally
    # clause also releases the cache if one of the queries above fails.
    df_repart.unpersist(blocking=True)

+-------------------+--------+
|       expense_type|   count|
+-------------------+--------+
|       Motor/Travel| 4738090|
|          Groceries| 6473528|
|           Gambling|  958807|
|              Fines|    7467|
|Bills and Utilities| 1260478|
|      Entertainment|22417986|
|                Tax|  453669|
|             Health| 1136161|
|            Housing|  261668|
|          Education|  559518|
|            Savings|  357141|
|           Clothing| 1165579|
+-------------------+--------+

+----------+------------------+
|   cust_id|          avg(amt)|
+----------+------------------+
|CL3B876N0W|55.226011002280984|
|CZMEO3I7BR|144.56504063974833|
|CB9X2QIFXC| 45.85590289720868|
|CF050PCFFL|117.22504949222257|
|CKVZS7W3MF| 68.98813863928105|
|CPK9DPIB9Z|127.85602702702701|
|CJUTITO23A|55.445571117758966|
|C8F59WIEY1|174.73890040927688|
|CGAXKN70IJ| 88.56002359800121|
|CYETYN7E1E| 47.87958594730238|
|CQJ2ZRK7WW| 54.15641633064517|
|CS92OJU3GX|112.21643863959659|
|C51C18F19D|101.3222613

DataFrame[cust_id: string, expense_type: string, amt: double]

In [None]:
# NOTE(review): the session builder already sets spark.sql.adaptive.enabled to
# "true", so this call leaves the setting unchanged. As written, this run uses
# the same configuration as the plain cache() benchmark; to isolate the effect
# of AQE, the other strategies would have to run with it switched off.
spark.conf.set("spark.sql.adaptive.enabled", "true")

def aqe_cache():
    """Run the two benchmark aggregations against the cached df_aqe."""
    df_aqe.groupBy("expense_type").count().show()
    df_aqe.groupBy("cust_id").avg("amt").show()

df_aqe = df_transformed.cache()
try:
    df_aqe.count()  # action that materializes the cache before timing starts
    aqe_cache_time = timed("AQE + Cache", aqe_cache)
finally:
    # blocking=True waits until the cached blocks are actually removed; the
    # finally clause also releases the cache if one of the queries above fails.
    df_aqe.unpersist(blocking=True)

+-------------------+--------+
|       expense_type|   count|
+-------------------+--------+
|       Motor/Travel| 4738090|
|          Groceries| 6473528|
|           Gambling|  958807|
|              Fines|    7467|
|Bills and Utilities| 1260478|
|      Entertainment|22417986|
|                Tax|  453669|
|             Health| 1136161|
|            Housing|  261668|
|          Education|  559518|
|            Savings|  357141|
|           Clothing| 1165579|
+-------------------+--------+

+----------+------------------+
|   cust_id|          avg(amt)|
+----------+------------------+
|CL3B876N0W| 55.22601100228109|
|CZMEO3I7BR|144.56504063974802|
|CB9X2QIFXC| 45.85590289720845|
|CF050PCFFL|117.22504949222264|
|CKVZS7W3MF| 68.98813863928103|
|CPK9DPIB9Z|127.85602702702685|
|CJUTITO23A|  55.4455711177591|
|C8F59WIEY1|174.73890040927765|
|CGAXKN70IJ| 88.56002359800117|
|CYETYN7E1E| 47.87958594730247|
|CQJ2ZRK7WW| 54.15641633064491|
|CS92OJU3GX|112.21643863959632|
|C51C18F19D|101.3222613

DataFrame[cust_id: string, expense_type: string, amt: double]

In [None]:
# Collected timings, listed in the order the strategies were run.
strategy_timings = [
    ("No Cache", baseline_time),
    ("With Cache", cache_time),
    ("With Persist", persist_time),
    ("Cache + Repartition", repart_cache_time),
    ("AQE + Cache", aqe_cache_time),
]

print("\n=== Summary ===")
print(f"{'Strategy':<30} | {'Time (sec)'}")
print("-" * 45)
for strategy_name, elapsed_sec in strategy_timings:
    print(f"{strategy_name:<30} | {elapsed_sec:.2f}")


=== Summary ===
Strategy                       | Time (sec)
---------------------------------------------
No Cache                       | 53.73
With Cache                     | 36.56
With Persist                   | 38.75
Cache + Repartition            | 33.92
AQE + Cache                    | 30.83


In [5]:
# Stop the Spark session now that every benchmark has been run and summarized.
spark.stop()