In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import random, time

# Build (or reuse) the Spark session for this lab. The settings fix the number
# of shuffle partitions at 8, turn adaptive query execution off, and disable
# automatic broadcast joins (threshold -1).
spark = (
    SparkSession.builder
    .appName("StreamPulse-ExecutionArchitecture")
    .config("spark.sql.shuffle.partitions", "8")
    .config("spark.sql.adaptive.enabled", "false")
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .getOrCreate()
)

# Show where the Spark UI for this session is served.
print(f"Spark UI: {spark.sparkContext.uiWebUrl}")


Spark UI: http://f703e4d3b65d:4040


In [2]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.5.0-py3-none-any.whl (24 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.5.0


In [6]:
import os
from getpass import getpass

from pyngrok import ngrok

# Kill any existing tunnels (safe reset)
ngrok.kill()

# Authenticate ngrok WITHOUT hardcoding the authtoken in the notebook.
# The token is read from the NGROK_AUTHTOKEN environment variable; if that is
# not set, it is prompted for without echoing to the output.
# NOTE: the token that used to be pasted in this cell is exposed and should be
# revoked at https://dashboard.ngrok.com/get-started/your-authtoken
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
if not ngrok_authtoken:
    raise ValueError("An ngrok authtoken is required to open the tunnel.")
ngrok.set_auth_token(ngrok_authtoken)

# Open tunnel to Spark UI port 4040
public_url = ngrok.connect(4040)

print("Open this URL in your browser:")
print(public_url)



Open this URL in your browser:
NgrokTunnel: "https://symphonically-unexiled-brice.ngrok-free.dev" -> "http://localhost:4040"


In [7]:
# Count a generated 1,000,000-row range; count() is an action, so this runs a job.
spark.range(1_000_000).count()

1000000

In [9]:
import builtins  # explicit handle on Python's own round(); see note below

random.seed(42)

# Listening events: 800,000 synthetic rows. The random calls are made in the
# same order as before, so the seeded data set is unchanged.
# builtins.round is used because the star import from pyspark.sql.functions
# rebinds the bare name `round` to Spark's column function. The previous
# `__builtins__.round` spelling relies on a CPython implementation detail
# (`__builtins__` is a module only in __main__ and a plain dict elsewhere).
listen_data = [
    (
        f"EVT-{i:07d}",
        f"USR-{random.randint(1,100000):06d}",
        f"TRK-{random.randint(1,50000):05d}",
        random.choice(["mobile","desktop","tablet","speaker","tv"]),
        random.choice(["free","premium","family","student"]),
        random.randint(10, 360),
        random.choice([True, False]),
        builtins.round(random.uniform(0.002, 0.015), 4),
    )
    for i in range(800000)
]

events = spark.createDataFrame(
    listen_data,
    ["event_id","user_id","track_id","device","tier","duration","completed","revenue"],
)
events.write.parquet("lab_arch/events", mode="overwrite")

# Track catalog: one row per track id TRK-00001 .. TRK-50000.
track_data = [
    (
        f"TRK-{i:05d}",
        random.choice(["Pop","Rock","Jazz","Hip-Hop","Electronic","R&B","Country","Classical"]),
        random.choice(["Major Label","Indie","Self-Published"]),
    )
    for i in range(1, 50001)
]
tracks = spark.createDataFrame(track_data, ["track_id","genre","label"])
tracks.write.parquet("lab_arch/tracks", mode="overwrite")

# Re-bind both names to DataFrames read back from the Parquet files, so every
# later plan starts from a Parquet file scan.
events = spark.read.parquet("lab_arch/events")
tracks = spark.read.parquet("lab_arch/tracks")

print(f"Events: {events.count()} | Tracks: {tracks.count()}")
print(f"Event partitions: {events.rdd.getNumPartitions()}")
print(f"Track partitions: {tracks.rdd.getNumPartitions()}")


Events: 800000 | Tracks: 50000
Event partitions: 2
Track partitions: 2


In [10]:
# Pipeline 1 (narrow transformations only): two filters, a column selection,
# and one derived column holding the revenue multiplied by 100 as an int.
pipeline1 = (
    events
    .filter(col("completed") == True)
    .filter(col("duration") > 60)
    .select("event_id", "user_id", "device", "duration", "revenue")
    .withColumn("revenue_cents", (col("revenue") * 100).cast("int"))
)


In [11]:
# Print the physical plan Spark will use for pipeline1.
pipeline1.explain()

== Physical Plan ==
*(1) Project [event_id#17, user_id#18, device#20, duration#22L, revenue#24, cast((revenue#24 * 100.0) as int) AS revenue_cents#48]
+- *(1) Filter (((isnotnull(completed#23) AND isnotnull(duration#22L)) AND completed#23) AND (duration#22L > 60))
   +- *(1) ColumnarToRow
      +- FileScan parquet [event_id#17,user_id#18,device#20,duration#22L,completed#23,revenue#24] Batched: true, DataFilters: [isnotnull(completed#23), isnotnull(duration#22L), completed#23, (duration#22L > 60)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/content/lab_arch/events], PartitionFilters: [], PushedFilters: [IsNotNull(completed), IsNotNull(duration), EqualTo(completed,true), GreaterThan(duration,60)], ReadSchema: struct<event_id:string,user_id:string,device:string,duration:bigint,completed:boolean,revenue:dou...




In [12]:
# Action: execute pipeline1 and return its row count.
pipeline1.count()

342109

In [13]:
# Number of partitions of the RDD underlying pipeline1.
pipeline1.rdd.getNumPartitions()

2

In [14]:
# Passing True also prints the parsed, analyzed and optimized logical plans.
pipeline1.explain(True)

== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(revenue_cents, cast('`*`('revenue, 100) as int), None)]
+- Project [event_id#17, user_id#18, device#20, duration#22L, revenue#24]
   +- Filter (duration#22L > cast(60 as bigint))
      +- Filter (completed#23 = true)
         +- Relation [event_id#17,user_id#18,track_id#19,device#20,tier#21,duration#22L,completed#23,revenue#24] parquet

== Analyzed Logical Plan ==
event_id: string, user_id: string, device: string, duration: bigint, revenue: double, revenue_cents: int
Project [event_id#17, user_id#18, device#20, duration#22L, revenue#24, cast((revenue#24 * cast(100 as double)) as int) AS revenue_cents#48]
+- Project [event_id#17, user_id#18, device#20, duration#22L, revenue#24]
   +- Filter (duration#22L > cast(60 as bigint))
      +- Filter (completed#23 = true)
         +- Relation [event_id#17,user_id#18,track_id#19,device#20,tier#21,duration#22L,completed#23,revenue#24] parquet

== Optimized Logical Plan ==
Project [event

In [15]:
# Set the job description first so the count below can be found by name in the Spark UI.
spark.sparkContext.setJobDescription("PIPELINE_1_COUNT")
pipeline1.count()

342109

In [16]:
# Pipeline 2: keep completed plays, then aggregate per device.
# count and sum here are the pyspark.sql.functions versions (star import),
# not the Python builtins.
pipeline2 = (
    events
    .filter(col("completed") == True)
    .groupBy("device")
    .agg(
        count("*").alias("plays"),
        sum("revenue").alias("total_rev"),
    )
)

In [17]:
# Print the physical plan for pipeline2; the Exchange node marks the shuffle.
pipeline2.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[device#20], functions=[count(1), sum(revenue#24)])
+- Exchange hashpartitioning(device#20, 8), ENSURE_REQUIREMENTS, [plan_id=258]
   +- *(1) HashAggregate(keys=[device#20], functions=[partial_count(1), partial_sum(revenue#24)])
      +- *(1) Project [device#20, revenue#24]
         +- *(1) Filter (isnotnull(completed#23) AND completed#23)
            +- *(1) ColumnarToRow
               +- FileScan parquet [device#20,completed#23,revenue#24] Batched: true, DataFilters: [isnotnull(completed#23), completed#23], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/content/lab_arch/events], PartitionFilters: [], PushedFilters: [IsNotNull(completed), EqualTo(completed,true)], ReadSchema: struct<device:string,completed:boolean,revenue:double>




In [18]:
# Set the job description first so the show() below can be found by name in the Spark UI.
spark.sparkContext.setJobDescription("PIPELINE_2_GROUPBY_DEVICE")
pipeline2.show()

+-------+-----+-----------------+
| device|plays|        total_rev|
+-------+-----+-----------------+
| tablet|80227|679.3329999999964|
|speaker|79783|678.1191999999917|
|     tv|79964|680.2651999999941|
| mobile|80767|686.3548999999934|
|desktop|79729|676.4081999999994|
+-------+-----+-----------------+



| Metric                      | Value                        |
| --------------------------- | ---------------------------- |
| Jobs                        | 1 main job (others internal) |
| Stages                      | 2                            |
| Stage 1 Tasks               | 2                            |
| Stage 2 Tasks               | 1                            |
| Shuffle write               | 609 B                        |
| Shuffle read                | 609 B                        |
| Shuffle boundary created by | groupBy                      |


In [20]:
# Pipeline 3a: SortMerge Join (autoBroadcast disabled)
joined_sm = events.join(tracks, "track_id")
result_sm = joined_sm.groupBy("genre").agg(sum("revenue").alias("total_rev"))
result_sm.explain()

# Set the description BEFORE the action runs: a description set afterwards is
# applied to later jobs, not to this join.
spark.sparkContext.setJobDescription("SORTMERGE_JOIN")

# Time only the action (not plan printing), using a monotonic clock that is
# meant for measuring intervals.
start = time.perf_counter()
result_sm.show()
time_sm = time.perf_counter() - start
print(f"SortMerge join time: {time_sm:.2f}s")


== Physical Plan ==
*(6) HashAggregate(keys=[genre#26], functions=[sum(revenue#24)])
+- Exchange hashpartitioning(genre#26, 8), ENSURE_REQUIREMENTS, [plan_id=404]
   +- *(5) HashAggregate(keys=[genre#26], functions=[partial_sum(revenue#24)])
      +- *(5) Project [revenue#24, genre#26]
         +- *(5) SortMergeJoin [track_id#19], [track_id#25], Inner
            :- *(2) Sort [track_id#19 ASC NULLS FIRST], false, 0
            :  +- Exchange hashpartitioning(track_id#19, 8), ENSURE_REQUIREMENTS, [plan_id=386]
            :     +- *(1) Filter isnotnull(track_id#19)
            :        +- *(1) ColumnarToRow
            :           +- FileScan parquet [track_id#19,revenue#24] Batched: true, DataFilters: [isnotnull(track_id#19)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/content/lab_arch/events], PartitionFilters: [], PushedFilters: [IsNotNull(track_id)], ReadSchema: struct<track_id:string,revenue:double>
            +- *(4) Sort [track_id#25 ASC NULLS FIRST], false, 0
 

In [21]:
# NOTE(review): this cell comes after the SortMerge timing cell has already
# called show(), so this description is presumably attached to whichever
# action runs next rather than to that join — confirm in the Spark UI.
spark.sparkContext.setJobDescription("SORTMERGE_JOIN")

In [22]:
# Pipeline 3b: Broadcast Join
joined_bc = events.join(broadcast(tracks), "track_id")
result_bc = joined_bc.groupBy("genre").agg(sum("revenue").alias("total_rev"))
result_bc.explain()

# Give this run its own description so its jobs are not listed under a label
# left over from an earlier cell.
spark.sparkContext.setJobDescription("BROADCAST_JOIN")

# Time only the action (not plan printing), using a monotonic clock that is
# meant for measuring intervals.
start = time.perf_counter()
result_bc.show()
time_bc = time.perf_counter() - start
print(f"Broadcast join time: {time_bc:.2f}s")


== Physical Plan ==
*(3) HashAggregate(keys=[genre#26], functions=[sum(revenue#24)])
+- Exchange hashpartitioning(genre#26, 8), ENSURE_REQUIREMENTS, [plan_id=633]
   +- *(2) HashAggregate(keys=[genre#26], functions=[partial_sum(revenue#24)])
      +- *(2) Project [revenue#24, genre#26]
         +- *(2) BroadcastHashJoin [track_id#19], [track_id#25], Inner, BuildRight, false
            :- *(2) Filter isnotnull(track_id#19)
            :  +- *(2) ColumnarToRow
            :     +- FileScan parquet [track_id#19,revenue#24] Batched: true, DataFilters: [isnotnull(track_id#19)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/content/lab_arch/events], PartitionFilters: [], PushedFilters: [IsNotNull(track_id)], ReadSchema: struct<track_id:string,revenue:double>
            +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, false]),false), [plan_id=627]
               +- *(1) Filter isnotnull(track_id#25)
                  +- *(1) ColumnarToRow
                

In [25]:
# Re-run the SortMerge aggregation under its own description so its jobs can be found by name in the Spark UI.
spark.sparkContext.setJobDescription("PIPELINE_3A_SORTMERGE")
result_sm.show()

+----------+-----------------+
|     genre|        total_rev|
+----------+-----------------+
|      Rock|838.1191000000026|
| Classical| 845.793000000001|
|       Pop|844.4322000000017|
|      Jazz|849.4741000000023|
|Electronic|852.3180000000009|
|       R&B| 842.639500000001|
|   Country|834.7105000000024|
|   Hip-Hop|882.7481000000005|
+----------+-----------------+



| Metric                | Value         |
| --------------------- | ------------- |
| Total stages          | 4             |
| Max tasks in a stage  | 8             |
| Largest shuffle write | 8.9 MiB       |
| Largest shuffle read  | 9.4 MiB       |
| Slowest stage         | Stage 41 (2s) |
| Total runtime         | ~4s           |


In [26]:
# Re-run the broadcast aggregation under its own description so its jobs can be found by name in the Spark UI.
spark.sparkContext.setJobDescription("PIPELINE_3B_BROADCAST")
result_bc.show()

+----------+-----------------+
|     genre|        total_rev|
+----------+-----------------+
|      Rock|838.1190999999797|
|       Pop|844.4321999999842|
| Classical|845.7929999999835|
|Electronic|852.3179999999829|
|      Jazz|849.4740999999784|
|       R&B|842.6394999999836|
|   Country|834.7104999999915|
|   Hip-Hop|882.7480999999837|
+----------+-----------------+



| Stage | Duration | Tasks | What It’s Doing                                 |
| ----- | -------- | ----- | ----------------------------------------------- |
| 22    | 0.2 s    | 1     | Small table (tracks) broadcast to all executors |
| 23    | 2 s      | 2     | Join executed (no shuffle!)                     |
| 24    | 0.1 s    | 4     | GroupBy(genre) aggregation                      |
| 25    | 0.1 s    | 3     | Final aggregation + `.show()`                   |


| Metric              | SortMerge Join | Broadcast Join |
| ------------------- | -------------- | -------------- |
| Total stages        | 4              | 4              |
| Tasks in join stage | 8              | 2              |
| Shuffle write       | 8.9 MiB        | 0–1 KiB        |
| Shuffle read        | 9.4 MiB        | 0–1 KiB        |
| Slowest stage       | Stage 41 (2s)  | Stage 23 (2s)  |
| Runtime             | ~4 s           | ~2 s           |


In [31]:
spark.sparkContext.setJobDescription("PIPELINE_4_MULTIACTION_WITHOUT_CACHE")

# The joined DataFrame is NOT cached here, so each of the three actions below
# starts again from the join.
enriched = events.join(broadcast(tracks), "track_id")

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals (time.time() can jump if the wall clock changes).

# Action 1
start = time.perf_counter()
print(f"Total: {enriched.count()}")
t1 = time.perf_counter() - start

# Action 2
start = time.perf_counter()
enriched.groupBy("genre").agg(count("*")).show()
t2 = time.perf_counter() - start

# Action 3
start = time.perf_counter()
enriched.groupBy("device","genre").agg(avg("duration")).show()
t3 = time.perf_counter() - start

print(f"\nWithout cache: {t1:.2f}s + {t2:.2f}s + {t3:.2f}s = {t1+t2+t3:.2f}s total")


Total: 800000
+----------+--------+
|     genre|count(1)|
+----------+--------+
|      Rock|   98754|
|       Pop|   99445|
| Classical|   99604|
|Electronic|  100465|
|      Jazz|  100341|
|       R&B|   99204|
|   Country|   98439|
|   Hip-Hop|  103748|
+----------+--------+

+-------+----------+------------------+
| device|     genre|     avg(duration)|
+-------+----------+------------------+
| mobile|Electronic| 185.7190280971903|
| tablet| Classical|185.67922918653085|
| tablet|       R&B|183.03205355784348|
| mobile|      Rock| 183.8533252353477|
|desktop|   Country|184.96477062332852|
|speaker|   Hip-Hop|184.31145963629365|
| mobile|   Country|185.40454930912355|
|speaker|   Country|185.61704718417047|
|speaker|Electronic|185.82959396406235|
|speaker| Classical| 184.6383702049929|
|     tv|      Jazz|185.31201665675192|
|desktop|      Rock|185.32751555623008|
|     tv|       Pop| 184.3024787571019|
|     tv|   Hip-Hop| 184.2873852102465|
| mobile|       R&B| 183.9146027342976|
|

| Job / Stage | Description                      | Duration    | Tasks           | Notes                                   |
| ----------- | -------------------------------- | ----------- | --------------- | --------------------------------------- |
| Job 50      | showString (avg by device+genre) | 0.2 s       | 4/4 (2 skipped) | Last action                             |
| Job 49      | showString (groupBy genre)       | 1.0 s       | 2/2             | Second action                           |
| Job 48-42   | Various preparatory tasks        | 0.05–0.85 s | 1–2 tasks each  | Each action triggers full recomputation |


In [30]:
spark.sparkContext.setJobDescription("PIPELINE_4_MULTIACTION_WITH_CACHE")

enriched = events.join(broadcast(tracks), "track_id")
enriched.cache()  # only marks the DataFrame; the first action below fills the cache

# try/finally guarantees the cached data is released even if one of the
# actions raises. Calling unpersist() inside `finally` also keeps its return
# value (the DataFrame repr) out of the cell output.
try:
    # time.perf_counter() is a monotonic clock intended for measuring intervals.

    # Action 1 (materializes cache)
    start = time.perf_counter()
    print(f"Total: {enriched.count()}")
    t1c = time.perf_counter() - start

    # Action 2
    start = time.perf_counter()
    enriched.groupBy("genre").agg(count("*")).show()
    t2c = time.perf_counter() - start

    # Action 3
    start = time.perf_counter()
    enriched.groupBy("device","genre").agg(avg("duration")).show()
    t3c = time.perf_counter() - start

    print(f"\nWith cache: {t1c:.2f}s + {t2c:.2f}s + {t3c:.2f}s = {t1c+t2c+t3c:.2f}s total")
finally:
    enriched.unpersist()

Total: 800000
+----------+--------+
|     genre|count(1)|
+----------+--------+
|      Rock|   98754|
|       Pop|   99445|
| Classical|   99604|
|Electronic|  100465|
|      Jazz|  100341|
|       R&B|   99204|
|   Country|   98439|
|   Hip-Hop|  103748|
+----------+--------+

+-------+----------+------------------+
| device|     genre|     avg(duration)|
+-------+----------+------------------+
| mobile|Electronic| 185.7190280971903|
| tablet| Classical|185.67922918653085|
| tablet|       R&B|183.03205355784348|
| mobile|      Rock| 183.8533252353477|
|desktop|   Country|184.96477062332852|
|speaker|   Hip-Hop|184.31145963629365|
| mobile|   Country|185.40454930912355|
|speaker|   Country|185.61704718417047|
|speaker|Electronic|185.82959396406235|
|speaker| Classical| 184.6383702049929|
|     tv|      Jazz|185.31201665675192|
|desktop|      Rock|185.32751555623008|
|     tv|       Pop| 184.3024787571019|
|     tv|   Hip-Hop| 184.2873852102465|
| mobile|       R&B| 183.9146027342976|
|

DataFrame[track_id: string, event_id: string, user_id: string, device: string, tier: string, duration: bigint, completed: boolean, revenue: double, genre: string, label: string]

| Job / Stage | Description                       | Duration | Tasks           | Notes                                                    |
| ----------- | --------------------------------- | -------- | --------------- | -------------------------------------------------------- |
| Job 38      | PIPELINE_4_MULTIACTION_WITH_CACHE | 62 ms    | 4/4 (2 skipped) | Likely the last action (`avg(duration)` by device+genre) |
| Job 37      | PIPELINE_4_MULTIACTION_WITH_CACHE | 0.6 s    | 2/2             | Aggregation by genre                                     |
| Job 36      | PIPELINE_4_MULTIACTION_WITH_CACHE | 10 s     | 2/2             | **Count action** (materializes cache)                    |
| Job 35      | PIPELINE_4_MULTIACTION_WITH_CACHE | 0.1 s    | 1/1             | Minor preparatory tasks                                  |


| Metric                 | With Cache                                     | Without Cache                    |
| ---------------------- | ---------------------------------------------- | -------------------------------- |
| First action duration  | 10 s                                           | 0.5 s–10 s?                      |
| Second action duration | 0.6 s                                          | 1 s                              |
| Third action duration  | 0.06 s                                         | 0.2 s                            |
| Tasks executed         | Fewer (some skipped)                           | More (recomputed)                |
| Stages saved           | Subsequent actions reuse cache                 | All stages recomputed            |
| Shuffle usage          | Same (cached DataFrame prevents extra shuffle) | Same, but recomputed each action |


In [33]:
from pyspark.sql.functions import countDistinct

# Label the jobs of this cell for the Spark UI.
spark.sparkContext.setJobDescription("PIPELINE_5_COMPLEX_AGG")

# Complex pipeline: filter events, broadcast-join the track catalog, aggregate
# per (genre, device, tier), keep groups with more than 50 plays, and sort by
# total revenue, highest first.
result = (
    events
    .filter(col("completed") == True)
    .filter(col("duration") > 30)
    .join(broadcast(tracks), "track_id")
    .groupBy("genre", "device", "tier")
    .agg(
        count("*").alias("plays"),
        sum("revenue").alias("total_rev"),
        avg("duration").alias("avg_dur"),
        countDistinct("user_id").alias("unique_users"),
    )
    .filter(col("plays") > 50)
    .orderBy(col("total_rev").desc())
)

# Print all plan stages (logical and physical), then the first 20 result rows.
result.explain(True)
result.show(20)

== Parsed Logical Plan ==
'Sort ['total_rev DESC NULLS LAST], true
+- Filter (plays#1149L > cast(50 as bigint))
   +- Aggregate [genre#26, device#20, tier#21], [genre#26, device#20, tier#21, count(1) AS plays#1149L, sum(revenue#24) AS total_rev#1150, avg(duration#22L) AS avg_dur#1151, count(distinct user_id#18) AS unique_users#1152L]
      +- Project [track_id#19, event_id#17, user_id#18, device#20, tier#21, duration#22L, completed#23, revenue#24, genre#26, label#27]
         +- Join Inner, (track_id#19 = track_id#25)
            :- Filter (duration#22L > cast(30 as bigint))
            :  +- Filter (completed#23 = true)
            :     +- Relation [event_id#17,user_id#18,track_id#19,device#20,tier#21,duration#22L,completed#23,revenue#24] parquet
            +- ResolvedHint (strategy=broadcast)
               +- Relation [track_id#25,genre#26,label#27] parquet

== Analyzed Logical Plan ==
genre: string, device: string, tier: string, plays: bigint, total_rev: double, avg_dur: double, 

| Stage Id | Description                                                           | Tasks (Succeeded/Total) | Input    | Output   | Shuffle Read | Shuffle Write | Notes                                                            |
| -------- | --------------------------------------------------------------------- | ----------------------- | -------- | -------- | ------------ | ------------- | ---------------------------------------------------------------- |
| 102      | File scan + narrow transformations (filter completed & duration > 30) | 2/2                     | 8.9 MiB  | 13.5 MiB | –            | 13.5 MiB      | Reading Parquet files, applying filters. No shuffle yet.         |
| 103      | Broadcast join with `tracks`                                          | 8/8                     | 13.5 MiB | 64.1 KiB | –            | 64.1 KiB      | Broadcast join avoids shuffle, merges `tracks` with `events`.    |
| 104      | Aggregation (`groupBy genre, device, tier`) + final filter & order    | 8/8                     | 64.1 KiB | 64.1 KiB | –            | –             | Aggregation and ordering. Small output, likely cached in memory. |


Observations

Job Count: 1 (PIPELINE_5_COMPLEX_AGG)

Stage Count: 3 (Stages 102–104)

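Shuffle Boundaries: Present — the broadcast join itself adds no shuffle, but the `groupBy` aggregation and the `orderBy` each require an Exchange. This matches the 3 stages listed above and the 13.5 MiB shuffle write recorded for Stage 102.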

Most Expensive Stage: Stage 102 (File scan + filters) and Stage 103 (join) dominate input size processing.

Optimization: Using broadcast(tracks) avoids a full shuffle join, reducing time and shuffle write/read


StreamPulse Execution Architecture Guide
How Spark Executes Your Code
Step-by-Step Flow

You write Python code → SparkSession receives it

Catalyst optimizer creates and optimizes a logical plan

Physical plan is generated

DAG Scheduler breaks the plan into stages at shuffle boundaries

Task Scheduler assigns tasks (one per partition per stage)

Executors run tasks in parallel and return results

Key Rules

1 action = at least 1 job (some actions also launch internal jobs, e.g. for a broadcast or for `show()`)

1 shuffle = 1 stage boundary

Stages: number of shuffles + 1

Tasks per stage: number of partitions

Broadcast join: NO shuffle

SortMerge join: 1-2 shuffles

Task 1 – Pipeline 1 (Narrow Transformations Only)
| Job Id | Stage Id | Description      | Tasks | Input   | Output | Shuffle Read | Shuffle Write | Notes                                             |
| ------ | -------- | ---------------- | ----- | ------- | ------ | ------------ | ------------- | ------------------------------------------------- |
| 6      | 12       | PIPELINE_1_COUNT | 2/2   | 1.2 MiB | 118 B  | –            | 118 B         | The pipeline itself has only narrow transformations; the 118 B shuffle write comes from the `count()` action's final aggregation. |


Task 2 – Pipeline 2 (GroupBy, Wide Transformation)
| Job Id | Stage Id | Description               | Tasks | Input   | Output | Shuffle Read | Shuffle Write | Notes                              |
| ------ | -------- | ------------------------- | ----- | ------- | ------ | ------------ | ------------- | ---------------------------------- |
| 11     | 14       | PIPELINE_2_GROUPBY_DEVICE | 2/2   | 1.3 MiB | 609 B  | –            | 609 B         | Introduces shuffle due to groupBy. |


Task 3a – SortMerge Join
| Job Id | Stage Id | Description           | Tasks | Input | Output   | Shuffle Read | Shuffle Write | Notes                                        |
| ------ | -------- | --------------------- | ----- | ----- | -------- | ------------ | ------------- | -------------------------------------------- |
| n/a (not recorded) | incl. Stage 41 (slowest, 2 s) | PIPELINE_3A_SORTMERGE | up to 8 per stage | n/a | n/a | 9.4 MiB | 8.9 MiB | Both join inputs are shuffled and sorted on `track_id`; 4 stages, ~4 s total. |


Task 3b – Broadcast Join
| Job Id | Stage Id | Description           | Tasks | Input | Output   | Shuffle Read | Shuffle Write | Notes                                        |
| ------ | -------- | --------------------- | ----- | ----- | -------- | ------------ | ------------- | -------------------------------------------- |
| 25     | 22–25    | PIPELINE_3B_BROADCAST | 3–4/4 | 3 MiB | 64.1 KiB | –            | –             | Broadcast join avoids shuffle, fewer stages. |


Task 4 – Multi-Action Pipeline (Caching Impact)

| Job Id | Stage Id | Description                          | Tasks | Input  | Output | Notes                                                                     |
| ------ | -------- | ------------------------------------ | ----- | ------ | ------ | ------------------------------------------------------------------------- |
| 50     | 42–50    | PIPELINE_4_MULTIACTION_WITHOUT_CACHE | 2–4/4 | Varies | Varies | Actions repeated, full recomputation                                      |
| 38     | 35–38    | PIPELINE_4_MULTIACTION_WITH_CACHE    | 2–4/4 | Varies | Varies | Cached DataFrame reduces recomputation, fewer stages and faster execution |


Task 5 – Complex Pipeline

| Job Id | Stage Id | Description            | Tasks | Input   | Output   | Shuffle Read | Shuffle Write | Notes                                                                                                    |
| ------ | -------- | ---------------------- | ----- | ------- | -------- | ------------ | ------------- | -------------------------------------------------------------------------------------------------------- |
| 52     | 102–104  | PIPELINE_5_COMPLEX_AGG | 2–8/8 | 8.9 MiB | 64.1 KiB | –            | –             | Broadcast join avoids shuffle; aggregation and ordering applied; most expensive stage = file scan + join |


Performance Checklist

 Use broadcast joins when one table is small (<10MB)

 Cache DataFrames that are reused across actions

 Minimize shuffles (combine groupBys, filter early)

 Check Spark UI Stages tab for skew and GC pressure

 Use .explain() before running to predict stage count