In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import time, os, random

spark = SparkSession.builder \
    .appName("StreamPulse-PartitionAudit") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.sql.adaptive.enabled", "false") \
    .getOrCreate()


In [3]:
# Generate 800K listening events:

random.seed(42)
data = []
for i in range(800000):
    data.append((
        f"EVT-{i+1:07d}",
        f"USR-{random.randint(1, 100000):06d}",
        random.choice(["Pop", "Rock", "Hip-Hop", "Jazz", "Electronic", "R&B"]),
        random.choice(["mobile", "desktop", "smart_speaker", "tablet"]),
        random.randint(15, 350),
        random.choice([True, False]),
        f"2024-{random.randint(1,12):02d}-{random.randint(1,28):02d}",
    ))

df = spark.createDataFrame(data,
    ["event_id", "user_id", "genre", "device", "duration_sec", "completed", "event_date"]) \
    .withColumn("event_date", col("event_date").cast("date")) \
    .withColumn("month", month(col("event_date")))


In [4]:
#Save in multiple file layouts:
for n in [1, 4, 8, 20, 100]:
    output = f"audit_data/layout_{n}"
    if n <= df.rdd.getNumPartitions():
        df.coalesce(n).write.parquet(output, mode="overwrite")
    else:
        df.repartition(n).write.parquet(output, mode="overwrite")
    file_count = len([f for f in os.listdir(output) if f.endswith(".parquet")])
    print(f"Layout {n:>3}: {file_count} files")


Layout   1: 1 files
Layout   4: 4 files
Layout   8: 8 files
Layout  20: 20 files
Layout 100: 100 files


In [5]:
#Input Partition Exploration
def partition_profile(path, label):
    df = spark.read.parquet(path)
    num_parts = df.rdd.getNumPartitions()

    dist = df.withColumn("pid", spark_partition_id()) \
        .groupBy("pid").agg(count("*").alias("rows")).toPandas()

    min_rows = dist["rows"].min()
    max_rows = dist["rows"].max()
    avg_rows = dist["rows"].mean()
    ratio = max_rows / min_rows if min_rows > 0 else float("inf")

    start = time.time()
    df.groupBy("genre").agg(sum("duration_sec"), count("*")).collect()
    elapsed = time.time() - start

    print(f"{label:<20} parts={num_parts:<4} min={min_rows:<8.0f} max={max_rows:<8.0f} "
          f"ratio={ratio:<5.1f} groupBy={elapsed:.3f}s")
    return num_parts, elapsed

print(f"{'Layout':<20} {'Parts':<6} {'Min Rows':<10} {'Max Rows':<10} {'Ratio':<7} {'GroupBy'}")
print("-" * 75)
for n in [1, 4, 8, 20, 100]:
    partition_profile(f"audit_data/layout_{n}", f"Layout {n} files")


Layout               Parts  Min Rows   Max Rows   Ratio   GroupBy
---------------------------------------------------------------------------
Layout 1 files       parts=2    min=800000   max=800000   ratio=1.0   groupBy=3.396s
Layout 4 files       parts=2    min=400000   max=400000   ratio=1.0   groupBy=2.467s
Layout 8 files       parts=2    min=400000   max=400000   ratio=1.0   groupBy=2.182s
Layout 20 files      parts=2    min=399999   max=400001   ratio=1.0   groupBy=1.957s
Layout 100 files     parts=4    min=32000    max=256002   ratio=8.0   groupBy=3.234s


In [6]:
df = spark.read.parquet("audit_data/layout_8")

print(f"{'Shuffle Parts':<15} {'GroupBy Time':<13} {'Join Time':<13}")
print("-" * 45)

lookup = spark.createDataFrame(
    [("Pop", 1), ("Rock", 2), ("Hip-Hop", 3), ("Jazz", 4), ("Electronic", 5), ("R&B", 6)],
    ["genre", "genre_id"])

for n in [2, 4, 8, 16, 50, 200, 1000]:
    spark.conf.set("spark.sql.shuffle.partitions", str(n))

    start = time.time()
    df.groupBy("genre", "device").agg(sum("duration_sec"), count("*")).collect()
    t_group = time.time() - start

    start = time.time()
    df.join(lookup, "genre").groupBy("genre_id").agg(count("*")).collect()
    t_join = time.time() - start

    print(f"  {n:>5}         {t_group:.3f}s        {t_join:.3f}s")

spark.conf.set("spark.sql.shuffle.partitions", "8")


Shuffle Parts   GroupBy Time  Join Time    
---------------------------------------------
      2         0.907s        3.142s
      4         0.714s        3.236s
      8         0.948s        1.535s
     16         0.739s        1.994s
     50         0.954s        2.134s
    200         2.158s        5.211s
   1000         8.224s        10.793s


In [7]:
df = spark.read.parquet("audit_data/layout_8")

stages = {}
stages["1. Read"] = df.rdd.getNumPartitions()

df_filtered = df.filter(col("completed") == True)
stages["2. Filter"] = df_filtered.rdd.getNumPartitions()

df_selected = df_filtered.select("event_id", "genre", "device", "duration_sec")
stages["3. Select"] = df_selected.rdd.getNumPartitions()

df_grouped = df_filtered.groupBy("genre").agg(count("*"))
stages["4. GroupBy"] = df_grouped.rdd.getNumPartitions()

df_sorted = df_grouped.orderBy(col("count(1)").desc())
stages["5. OrderBy"] = df_sorted.rdd.getNumPartitions()

df_coalesced = df_filtered.coalesce(4)
stages["6. Coalesce(4)"] = df_coalesced.rdd.getNumPartitions()

df_repartitioned = df_filtered.repartition(16)
stages["7. Repartition(16)"] = df_repartitioned.rdd.getNumPartitions()

print(f"{'Stage':<25} {'Partitions':>12} {'Change'}")
print("-" * 55)
prev = None
for stage, parts in stages.items():
    change = ""
    if prev is not None:
        if parts > prev:
            change = f"↑ increased from {prev}"
        elif parts < prev:
            change = f"↓ decreased from {prev}"
        else:
            change = "= unchanged"
    print(f"  {stage:<23} {parts:>10}   {change}")
    prev = parts


Stage                       Partitions Change
-------------------------------------------------------
  1. Read                          2   
  2. Filter                        2   = unchanged
  3. Select                        2   = unchanged
  4. GroupBy                       8   ↑ increased from 2
  5. OrderBy                       6   ↓ decreased from 8
  6. Coalesce(4)                   2   ↓ decreased from 6
  7. Repartition(16)              16   ↑ increased from 2


In [9]:
import builtins # Add this import to explicitly access built-in functions

df = spark.read.parquet("audit_data/layout_8")
df_processed = df.filter(col("completed") == True) \
    .groupBy("genre", "month") \
    .agg(sum("duration_sec").alias("total_duration"), count("*").alias("plays"))

for n in [1, 4, 8, 20]:
    output = f"audit_data/output_{n}"
    if n <= df_processed.rdd.getNumPartitions():
        df_processed.coalesce(n).write.parquet(output, mode="overwrite")
    else:
        df_processed.repartition(n).write.parquet(output, mode="overwrite")

    files = [f for f in os.listdir(output) if f.endswith(".parquet")]
    total_size = builtins.sum(os.path.getsize(os.path.join(output, f)) for f in files)
    avg_size = total_size / len(files) if files else 0

    print(f"  {n:>2} partitions → {len(files)} files, "
          f"total {total_size/1024:.0f} KB, avg {avg_size/1024:.1f} KB/file")

   1 partitions → 1 files, total 2 KB, avg 2.2 KB/file
   4 partitions → 4 files, total 6 KB, avg 1.6 KB/file
   8 partitions → 8 files, total 12 KB, avg 1.5 KB/file
  20 partitions → 20 files, total 27 KB, avg 1.3 KB/file


StreamPulse Partition Standards
Environment: Local Development (4 Cores)

Machine Configuration:

4 CPU cores

2 GB driver memory

Spark running in local[4] mode

Adaptive execution disabled

1️⃣ Input Partition Standards
Observations (From Part 2 Testing)

Tested layouts: 1, 4, 8, 20, 100 files

Findings:

1 partition → Severe underutilization (only 1 core active)

4 partitions → Good CPU alignment but limited parallel slack

8 partitions → Best overall performance

20 partitions → Slight overhead increase

100 partitions → Performance degradation (too many small tasks)

Recommended Standard

Target: 8 input partitions

Rule: 2× core count for small–medium datasets

For 4 cores:

4 cores × 2 = 8 partitions
Why?

Allows full CPU utilization

Provides task scheduling flexibility

Avoids excessive small-task overhead

Prevents single-thread bottlenecks

2️⃣ Shuffle Partition Standards
Observations (From Part 3 Testing)

Tested values:
2, 4, 8, 16, 50, 200, 1000

Findings pattern:

Very low values (2–4) → Under-parallelization

Moderate values (8–16) → Best performance

High values (50+) → Task overhead increases

Very high (200–1000) → Severe slowdown (tiny partitions)

Recommended Setting (Local 4-Core)
spark.sql.shuffle.partitions = 8
Reasoning

Matches optimal input partition count

Balances parallelism and task overhead

Prevents excessive shuffle file creation

Aligns with available CPU resources

3️⃣ Output Partition Standards
Observations (From Part 5 Testing)

When writing aggregated data:

1 file → Large file, low parallelism

4 files → Good balance

8 files → Optimal for local usage

20 files → Too many small files

Target File Size

For local development:

32MB – 128MB per file

For production:

128MB – 256MB per file

Write Strategy (Local)

Use:

df.coalesce(4 or 8)

Recommended:

coalesce(4) for small aggregated outputs
coalesce(8) for medium datasets
When to Use repartition()

Use repartition() only when:

Increasing partitions

Redistributing skewed data

Preparing for heavy shuffle operations

4️⃣ Partition Behavior Through Pipeline

Observed Partition Behavior:

Stage	Partition Change
Read	Based on file count
Filter	Unchanged
Select	Unchanged
GroupBy	Changes to shuffle partition count
OrderBy	Triggers shuffle
Coalesce	Reduces partitions (no shuffle)
Repartition	Full shuffle
Key Insights

Transformations like filter/select do NOT change partition count.

Wide transformations (groupBy, join, orderBy) trigger shuffle.

Shuffle partition count controls post-aggregation partitioning.

coalesce() avoids shuffle.

repartition() forces shuffle.

5️⃣ Production Cluster Recommendations
Shuffle Partitions
2–3× total executor cores

Example:
If cluster has 32 total cores:

64–96 shuffle partitions
Target Partition Size
128MB – 256MB per partition

Why:

Optimal for HDFS / cloud storage

Reduces metadata overhead

Improves scan efficiency

Balances memory & parallelism

Maximum Partition Count
Do not exceed 10,000 partitions

Too many partitions cause:

Scheduler overhead

Metadata explosion

Small file problems

Slower shuffle

6️⃣ StreamPulse Partitioning Rules (Final Standards)
✅ Input Rules

Aim for 2× core count

Avoid single large partitions

Avoid excessive small partitions

✅ Shuffle Rules

Set shuffle partitions ≈ 2×–3× total cores

Reduce default 200 when running locally

Increase only for large production datasets

✅ Output Rules

Target 128–256MB per file in production

Use coalesce() before writing aggregated outputs

Use partitionBy() only for high-cardinality filters that are queried frequently (e.g., month, genre)

✅ Avoid

Default 200 shuffle partitions on small machines

Writing 100+ tiny files

Repartitioning unnecessarily

Ignoring skew detection

Final Recommendation for StreamPulse (Local 4-Core Dev)
Setting	Value
Input partitions	8
Shuffle partitions	8
Output partitions	4–8
Target file size	32–128MB