In [1]:
# Step 1 — Setup and Test Data

# Create SparkSession

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum as spark_sum, avg, desc, lit, when, round as spark_round
import time

# Build (or reuse, via getOrCreate) a SparkSession running in local mode.
# The parenthesised chain replaces backslash continuations, which break
# silently if trailing whitespace is ever added after a backslash.
spark = (
    SparkSession.builder
    .appName("M16-Lab03-Transformations-and-Actions")
    .master("local[*]")
    .getOrCreate()
)

# Confirm the session is up and report the Spark version in use.
print(f"✅ Spark {spark.version} running in local mode")

✅ Spark 4.0.2 running in local mode


In [2]:
# Generate a reproducible synthetic play-event dataset (100,000 rows)

import random

# Fixed seed so every run produces the same rows.
random.seed(42)

N_ROWS = 100_000  # number of synthetic play events to generate

# Identifier pools to sample from: 500 users, 50 songs, 10 artists.
users = [f"U{i:04d}" for i in range(1, 501)]
songs = [f"S{i:03d}" for i in range(1, 51)]
artists = [f"A{i:02d}" for i in range(1, 11)]
# "completed" appears 3 times out of 5, so it is drawn about 60% of the time.
statuses = ["completed", "completed", "completed", "skipped", "error"]

# One tuple per play event. The random calls stay in the same order as the
# column list below (user, song, artist, day, status, duration), so the
# seeded sequence of values is the same as with an explicit append loop.
data = [
    (
        f"P{i:06d}",
        random.choice(users),
        random.choice(songs),
        random.choice(artists),
        f"2025-03-{random.randint(1, 31):02d}",
        random.choice(statuses),
        random.randint(30, 300),
    )
    for i in range(N_ROWS)
]

columns = ["play_id", "user_id", "song_id", "artist_id", "play_date", "status", "duration_seconds"]
# No explicit schema is passed, so column types are inferred from the tuples.
df = spark.createDataFrame(data, columns)

# df.count() is an action, so this line runs a Spark job.
print(f"✅ Test dataset created: {df.count()} rows, {len(df.columns)} columns")
df.printSchema()

✅ Test dataset created: 100000 rows, 7 columns
root
 |-- play_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- play_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- duration_seconds: long (nullable = true)



In [3]:
# Step 2 — Experiment 1: Prove Lazy Transformations
# Chain five transformations and time only the *definition* of the chain;
# no action is called in this cell.

print("=" * 60)
print("EXPERIMENT 1: Proving Transformations Are Lazy")
print("=" * 60)

# perf_counter() is the clock intended for timing short intervals; unlike
# time.time() it is not affected by system clock adjustments.
start = time.perf_counter()

# Keep only plays that finished, then only those longer than 120 seconds.
step1 = df.filter(col("status") == "completed")
step2 = step1.filter(col("duration_seconds") > 120)
# Per-artist play count and total listening time.
step3 = step2.groupBy("artist_id").agg(
    count("play_id").alias("play_count"),
    spark_sum("duration_seconds").alias("total_seconds")
)
# Average seconds per play, rounded to one decimal place.
step4 = step3.withColumn("avg_seconds", spark_round(col("total_seconds") / col("play_count"), 1))
# Most-played artists first.
step5 = step4.orderBy(desc("play_count"))

elapsed = time.perf_counter() - start
print(f"\n5 transformations chained in {elapsed:.4f} seconds")
print(f"Type of step5: {type(step5)}")
print("Did Spark read the data? NO — these are just plans!")

EXPERIMENT 1: Proving Transformations Are Lazy

5 transformations chained in 0.3103 seconds
Type of step5: <class 'pyspark.sql.classic.dataframe.DataFrame'>
Did Spark read the data? NO — these are just plans!


In [4]:
# Cell 4: Trigger an action and time it

# perf_counter() is the clock intended for measuring elapsed intervals.
start = time.perf_counter()
# show() is an action: the plan built as step5 is actually executed here.
step5.show(5)
elapsed = time.perf_counter() - start

print(f"\n.show() triggered execution in {elapsed:.4f} seconds")
print("NOW Spark read the data, filtered, grouped, aggregated, and sorted!")

+---------+----------+-------------+-----------+
|artist_id|play_count|total_seconds|avg_seconds|
+---------+----------+-------------+-----------+
|      A07|      4061|       854348|      210.4|
|      A02|      4032|       846346|      209.9|
|      A10|      4027|       853677|      212.0|
|      A05|      4007|       842347|      210.2|
|      A08|      3980|       836879|      210.3|
+---------+----------+-------------+-----------+
only showing top 5 rows

.show() triggered execution in 3.2525 seconds
NOW Spark read the data, filtered, grouped, aggregated, and sorted!


In [5]:
# Observations
# Prints a fill-in template. The "_____" blanks are emitted literally;
# presumably the reader writes in the timings measured in the cells above.

print("\n📋 OBSERVATIONS:")
print("1. Transformation chain time: _____ seconds (near zero)")
print("2. Action (.show) time:       _____ seconds (much longer)")
print("3. This proves transformations are lazy — they build a plan, not results")


📋 OBSERVATIONS:
1. Transformation chain time: _____ seconds (near zero)
2. Action (.show) time:       _____ seconds (much longer)
3. This proves transformations are lazy — they build a plan, not results


In [6]:
# Step 3 — Experiment 2: Inspect Execution Plans
# Simple plan: keep completed plays, then project three columns.

print("\n--- Simple Plan (filter + select) ---")

simple = (
    df.filter(col("status") == "completed")
      .select("play_id", "song_id", "duration_seconds")
)
# explain() prints the physical plan for the query defined above.
simple.explain()


--- Simple Plan (filter + select) ---
== Physical Plan ==
*(1) Project [play_id#0, song_id#2, duration_seconds#6L]
+- *(1) Filter (isnotnull(status#5) AND (status#5 = completed))
   +- *(1) Scan ExistingRDD[play_id#0,user_id#1,song_id#2,artist_id#3,play_date#4,status#5,duration_seconds#6L]




In [8]:
# Complex plan: two row filters, a per-artist aggregation, a filter on the
# aggregate, and a final sort. Built in named stages for readability.

# Completed plays longer than 60 seconds.
completed_over_60s = (
    df.filter(col("status") == "completed")
      .filter(col("duration_seconds") > 60)
)

# Play count and total seconds per artist.
artist_totals = completed_over_60s.groupBy("artist_id").agg(
    count("play_id").alias("play_count"),
    spark_sum("duration_seconds").alias("total_seconds"),
)

# Keep artists with more than 100 plays, largest total listening time first.
complex_query = (
    artist_totals
    .filter(col("play_count") > 100)
    .orderBy(desc("total_seconds"))
)

print("\n--- Complex Plan (filter + group + filter + sort) ---")
complex_query.explain()
print("\n--- Extended Plan (all optimization stages) ---")
complex_query.explain(extended=True)


--- Complex Plan (filter + group + filter + sort) ---
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [total_seconds#65L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(total_seconds#65L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [plan_id=175]
      +- Filter (play_count#64L > 100)
         +- HashAggregate(keys=[artist_id#3], functions=[count(play_id#0), sum(duration_seconds#6L)])
            +- Exchange hashpartitioning(artist_id#3, 200), ENSURE_REQUIREMENTS, [plan_id=171]
               +- HashAggregate(keys=[artist_id#3], functions=[partial_count(play_id#0), partial_sum(duration_seconds#6L)])
                  +- Project [play_id#0, artist_id#3, duration_seconds#6L]
                     +- Filter ((isnotnull(status#5) AND isnotnull(duration_seconds#6L)) AND ((status#5 = completed) AND (duration_seconds#6L > 60)))
                        +- Scan ExistingRDD[play_id#0,user_id#1,song_id#2,artist_id#3,play_date#4,status#5,duration_seconds#6L]



--- Extende

In [9]:
# Timing explain vs show

# Time printing the plan only; explain() is not an action.
# perf_counter() is the clock intended for measuring elapsed intervals.
start = time.perf_counter()
complex_query.explain()
explain_time = time.perf_counter() - start

# Time an action on the same query; show() has to execute it.
start = time.perf_counter()
complex_query.show(5)
show_time = time.perf_counter() - start

print(f"\n.explain() time: {explain_time:.4f} seconds (plan only, no execution)")
print(f".show() time:    {show_time:.4f} seconds (full execution)")
# Plain string: there are no placeholders, so no f-prefix is needed.
print("\n✅ .explain() is safe — it shows the plan without running it")

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [total_seconds#65L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(total_seconds#65L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [plan_id=175]
      +- Filter (play_count#64L > 100)
         +- HashAggregate(keys=[artist_id#3], functions=[count(play_id#0), sum(duration_seconds#6L)])
            +- Exchange hashpartitioning(artist_id#3, 200), ENSURE_REQUIREMENTS, [plan_id=171]
               +- HashAggregate(keys=[artist_id#3], functions=[partial_count(play_id#0), partial_sum(duration_seconds#6L)])
                  +- Project [play_id#0, artist_id#3, duration_seconds#6L]
                     +- Filter ((isnotnull(status#5) AND isnotnull(duration_seconds#6L)) AND ((status#5 = completed) AND (duration_seconds#6L > 60)))
                        +- Scan ExistingRDD[play_id#0,user_id#1,song_id#2,artist_id#3,play_date#4,status#5,duration_seconds#6L]


+---------+----------+-------------+
|artist_id|play_count|total_se

In [10]:
# Step 4 — Experiment 3 & 4: Multiple Actions & Caching

# Multiple actions without cache: the same pipeline is used by three
# separate actions, each timed on its own.

# Plays per song for completed plays, most-played first.
pipeline = (
    df.filter(col("status") == "completed")
      .groupBy("song_id")
      .agg(count("play_id").alias("play_count"))
      .orderBy(desc("play_count"))
)

# perf_counter() is the clock intended for measuring elapsed intervals;
# unlike time.time() it is not affected by system clock adjustments.
print("\nAction 1: .count()")
start = time.perf_counter()
total = pipeline.count()
t1 = time.perf_counter() - start
print(f"  Result: {total} songs | Time: {t1:.4f}s")

print("\nAction 2: .show()")
start = time.perf_counter()
pipeline.show(5)
t2 = time.perf_counter() - start
print(f"  Time: {t2:.4f}s")

print("\nAction 3: .collect()")
start = time.perf_counter()
result = pipeline.collect()
t3 = time.perf_counter() - start
print(f"  Result: {len(result)} rows | Time: {t3:.4f}s")

print(f"\n📋 TOTAL TIME: {t1 + t2 + t3:.4f}s")


Action 1: .count()
  Result: 50 songs | Time: 2.2980s

Action 2: .show()
+-------+----------+
|song_id|play_count|
+-------+----------+
|   S034|      1279|
|   S022|      1260|
|   S003|      1249|
|   S006|      1245|
|   S048|      1237|
+-------+----------+
only showing top 5 rows
  Time: 1.6858s

Action 3: .collect()
  Result: 50 rows | Time: 1.8810s

📋 TOTAL TIME: 5.8648s


In [12]:
# With caching: same pipeline as the uncached experiment, but marked for
# caching before the three timed actions run.

pipeline_cached = (
    df.filter(col("status") == "completed")
      .groupBy("song_id")
      .agg(count("play_id").alias("play_count"))
      .orderBy(desc("play_count"))
)
# cache() only marks the DataFrame; nothing is stored until the first action,
# so Action 1 pays for both computing and caching the result.
pipeline_cached.cache()

try:
    # perf_counter() is the clock intended for measuring elapsed intervals.
    print("\nAction 1: .count() (first action)")
    start = time.perf_counter()
    total = pipeline_cached.count()
    t1_cached = time.perf_counter() - start
    print(f"  Result: {total} songs | Time: {t1_cached:.4f}s")

    print("\nAction 2: .show() (reads from cache)")
    start = time.perf_counter()
    pipeline_cached.show(5)
    t2_cached = time.perf_counter() - start
    print(f"  Time: {t2_cached:.4f}s")

    print("\nAction 3: .collect() (reads from cache)")
    start = time.perf_counter()
    result = pipeline_cached.collect()
    t3_cached = time.perf_counter() - start
    print(f"  Result: {len(result)} rows | Time: {t3_cached:.4f}s")

    print(f"\n📋 TOTAL TIME WITH CACHE: {t1_cached + t2_cached + t3_cached:.4f}s")
finally:
    # Always release the cache, even if one of the actions above raised,
    # so a failed run does not leave cached data behind in the session.
    pipeline_cached.unpersist()
    print("✅ Cache cleared")


Action 1: .count() (first action)
  Result: 50 songs | Time: 5.3517s

Action 2: .show() (reads from cache)
+-------+----------+
|song_id|play_count|
+-------+----------+
|   S034|      1279|
|   S022|      1260|
|   S003|      1249|
|   S006|      1245|
|   S048|      1237|
+-------+----------+
only showing top 5 rows
  Time: 0.3677s

Action 3: .collect() (reads from cache)
  Result: 50 rows | Time: 0.7815s

📋 TOTAL TIME WITH CACHE: 6.5010s
✅ Cache cleared


In [13]:
# Step 5 — Experiment 5: Classify 15 Operations

# Quiz table: each operation is paired with a "?" placeholder, and the
# expected classification is kept in a parallel list in the same order.

operations = [
    "df.filter(col('x') > 10)",
    "df.select('a', 'b')",
    "df.groupBy('x').count()",
    "df.show()",
    "df.count()",
    "df.collect()",
    "df.join(df2, 'key')",
    "df.orderBy('x')",
    "df.write.parquet('path')",
    "df.withColumn('y', col('x') * 2)",
    "df.distinct()",
    "df.take(5)",
    "df.explain()",
    "df.union(df2)",
    "df.first()",
]
classifications = [(operation, "?") for operation in operations]

# "T" and "A" presumably stand for transformation and action; explain() is "Neither".
answers = [
    "T", "T", "T", "A", "A",
    "A", "T", "T", "A", "T",
    "T", "A", "Neither", "T", "A",
]

# Shared layout: a 45-wide operation column and two 15-wide answer columns.
row_layout = "{:<45} {:<15} {:<15}"

print("\n" + row_layout.format("Operation", "Your Answer", "Correct"))
print("-" * 75)
for (operation, _placeholder), correct in zip(classifications, answers):
    # The "Your Answer" column is always printed as a blank to fill in.
    print(row_layout.format(operation, "___", correct))


Operation                                     Your Answer     Correct        
---------------------------------------------------------------------------
df.filter(col('x') > 10)                      ___             T              
df.select('a', 'b')                           ___             T              
df.groupBy('x').count()                       ___             T              
df.show()                                     ___             A              
df.count()                                    ___             A              
df.collect()                                  ___             A              
df.join(df2, 'key')                           ___             T              
df.orderBy('x')                               ___             T              
df.write.parquet('path')                      ___             A              
df.withColumn('y', col('x') * 2)              ___             T              
df.distinct()                                 ___             T  