In [None]:
# Notebook bootstrap: point the process at the local Java/Hadoop/Spark
# installation, let findspark wire up sys.path, and only then import pyspark.
import os
import warnings

# Silence Python-side warnings so the cell outputs stay readable.
warnings.filterwarnings("ignore")

# Machine-specific install locations (local paths) -- adjust for your setup.
# SPARK_HOME is defined once and reused for both the environment variable and
# findspark.init(), so changing it here cannot leave the two out of sync.
SPARK_HOME = "D:/Programs/spark/spark-3.5.6-bin-hadoop3"

os.environ["JAVA_HOME"] = "D:/Programs/Java"
os.environ["HADOOP_HOME"] = "D:/Programs/hadoop"
os.environ["SPARK_HOME"] = SPARK_HOME

# findspark.init() has to run before the first `import pyspark`: it puts the
# Python sources of the Spark distribution above on sys.path. Calling it after
# pyspark is already imported cannot change which pyspark the kernel uses.
import findspark
findspark.init(SPARK_HOME)

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Wildcard import kept for compatibility; the cells visible in this notebook
# use DoubleType from it.
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) a local SparkSession that uses every available core.
session_builder = SparkSession.builder.master("local[*]").appName("Testing Partitioning")

# Adaptive Query Execution is switched off -- presumably so that the partition
# and exchange counts observed for each job are not altered by AQE (confirm).
session_builder = session_builder.config("spark.sql.adaptive.enabled", "false")

spark = session_builder.getOrCreate()

# Keep the driver log quiet: only errors are reported.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Bare expression so the notebook renders the session summary.
spark

In [3]:
# Relative path to the Parquet input that every job cell below reads.
transactions_file = "../../data/transactions.parquet"

In [4]:
# Baseline job: read the Parquet file with whatever partitioning Spark picks,
# aggregate, and write the result -- no repartition/coalesce anywhere.
# Note: unlike the later job cells, this one does not cast "amt" to double.
df_transactions = spark.read.parquet(transactions_file)
print("Initial partitions:", df_transactions.rdd.getNumPartitions())

# Keep rows with amt > 10, then compute the average amount per city.
df_transformed = (
    df_transactions
    .where(F.col("amt") > 10)
    .groupBy("city")
    .agg(F.avg("amt").alias("avg_amt"))
)

# Replace any previous output of this job.
df_transformed.write.csv("../../output/baseline", mode="overwrite")

Initial partitions: 12


#### **Job Baseline**

- **Number of partitions** (`df_transactions.rdd.getNumPartitions()`): **12**  
- **Total execution time**: **42s**  
- **Shuffle Read**: **0.005 MB**  
- **Shuffle Write**: **0.009 MB**  
- **Output file count** (`output/baseline/`): **10**  
- **Size of output files**: **294.0 B**  
- **Exchanges**: **1 (only one exchange)**  


In [6]:
# Job: repartition the input to 4 partitions right after the read, i.e. before
# the filter and aggregation. repartition() redistributes the full dataset, so
# this job pays for an extra shuffle on top of the one groupBy needs.
df_transactions = spark.read.parquet(transactions_file).repartition(4)

# Cast amt to double, keep rows with amt > 10, then average amt per city.
df_transformed = (
    df_transactions
    .withColumn("amt", F.col("amt").cast(DoubleType()))
    .filter(F.col("amt") > 10)
    .groupBy("city")
    .agg(F.avg("amt").alias("avg_amt"))
)

# Plain string literal: the path contains nothing to interpolate.
df_transformed.write.mode("overwrite").csv("../../output/repartition_4")

#### **job_repartition_4**

- **Number of partitions**: **4**  
- **Total execution time**: **57s**  

**First Exchange (round-robin shuffle introduced by `repartition(4)`):**  
- **Shuffle Read**: **1.2 GB**  
- **Shuffle Write**: **186.0 MiB**  

**Second Exchange (hash shuffle for the `groupBy` aggregation):**  
- **Shuffle Read**: **3.2 KiB**  
- **Shuffle Write**: **3.2 KiB**  

- **Output file count** (`output/repartition_4/`): **10**  
- **Size of output files**: **295.0 B**  


In [5]:
# Job: repartition the input to 8 partitions right after the read, i.e. before
# the filter and aggregation. repartition() redistributes the full dataset, so
# this job pays for an extra shuffle on top of the one groupBy needs.
df_transactions = spark.read.parquet(transactions_file).repartition(8)

# Cast amt to double, keep rows with amt > 10, then average amt per city.
df_transformed = (
    df_transactions
    .withColumn("amt", F.col("amt").cast(DoubleType()))
    .filter(F.col("amt") > 10)
    .groupBy("city")
    .agg(F.avg("amt").alias("avg_amt"))
)

# Plain string literal: the path contains nothing to interpolate.
df_transformed.write.mode("overwrite").csv("../../output/repartition_8")

#### **job_repartition_8**

- **Number of partitions**: **8**  
- **Total execution time**: **1.1 min**  

**First Exchange (round-robin shuffle introduced by `repartition(8)`):**  
- **Shuffle Read**: **1.2 GB**  
- **Shuffle Write**: **246.4 MiB**  

**Second Exchange (hash shuffle for the `groupBy` aggregation):**  
- **Shuffle Read**: **6.3 KiB**  
- **Shuffle Write**: **3.4 KiB**  

- **Output file count** (`output/repartition_8/`): **10**  
- **Size of output files**: **295.0 B**  


In [7]:
# Job: same aggregation as the other job cells, but the result is narrowed to
# 2 partitions with coalesce() just before the write.
df_transactions = spark.read.parquet(transactions_file)

# Cast amt to double, keep rows with amt > 10, then average amt per city.
amt_as_double = F.col("amt").cast(DoubleType())
df_transformed = (
    df_transactions
    .withColumn("amt", amt_as_double)
    .where(F.col("amt") > 10)
    .groupBy("city")
    .agg(F.avg("amt").alias("avg_amt"))
)

# coalesce(2) is applied to the aggregated result only; the write then
# replaces any previous output of this job.
df_coalesced = df_transformed.coalesce(2)
df_coalesced.write.csv("../../output/coalesce_2", mode="overwrite")

#### **job_coalesce_2**

- **Number of partitions**: **2** (after exchange then coalesce)  
- **Total execution time**: **23s**  
- **Shuffle Read**: **~5.1 KB**  
- **Shuffle Write**: **~9.5 KB**  

- **Output file count** (`output/coalesce_2/`): **2 files**  
- **Size of output files**: **294.0 B**  


In [8]:
# Job: same aggregation as the other job cells, but the result is reshuffled
# into 2 partitions with repartition() just before the write.
df_transactions = spark.read.parquet(transactions_file)

# Cast amt to double, keep rows with amt > 10, then average amt per city.
amt_as_double = F.col("amt").cast(DoubleType())
df_transformed = (
    df_transactions
    .withColumn("amt", amt_as_double)
    .where(F.col("amt") > 10)
    .groupBy("city")
    .agg(F.avg("amt").alias("avg_amt"))
)

# repartition(2) is applied to the aggregated result only; the write then
# replaces any previous output of this job.
df_repartitioned = df_transformed.repartition(2)
df_repartitioned.write.csv("../../output/repartition_2", mode="overwrite")

#### **job_repartition_2**

- **Number of partitions**: **2**  
- **Total execution time**: **1.1 min**  

**First Exchange (main shuffle for groupBy):**  
- **Shuffle Read**: **0.005 MB**  
- **Shuffle Write**: **0.009 MB**  

**Second Exchange (round-robin shuffle from `repartition(2)` before the write):**  
- **Shuffle Read**: **0.00075 MB**  
- **Shuffle Write**: **0.00075 MB**  

- **Output file count** (`output/repartition_2/`): **2**  
- **Size of output files**: **294.0 B**  


In [9]:
# Stop the SparkSession created earlier in the notebook now that all jobs have run.
spark.stop()

| Job Name | Partitions | Exec Time | Shuffle Read | Shuffle Write | Output Files | Output Size | Notes |
| --- | --- | --- | --- | --- | --- | --- | --- |
| **Baseline** | 12 | 42s | 0.005 MB | 0.009 MB | 10 | 294.0 B | Only 1 shuffle |
| **Repartition to 4** | 4 | 57s | 1.2 GB | 186.0 MiB | 10 | 295.0 B | 2 shuffles |
| **Repartition to 8** | 8 | 1.1 min | 1.2 GB | 246.4 MiB | 10 | 295.0 B | 2 shuffles |
| **Coalesce to 2** | 2 (after shuffle) | 23s | 0.005 MB | 0.009 MB | 2 | 294.0 B | Only 1 shuffle (`groupBy`); `coalesce` adds none |
| **Repartition to 2** | 2 | 1.1 min | 0.005 MB | 0.009 MB | 2 | 294.0 B | 2 shuffles |

| 🔍 Question | ✅ Inference |
| --- | --- |
| **Does repartition increase shuffle?** | ✔️ Yes. Repartitioning to 4 or 8 led to **1.2 GB shuffle read**, whereas baseline had only **5 KB**. Repartition triggers **full shuffle**. |
| **Is 8 partitions better than 4?** | ❌ Not clearly. **8 partitions** gave **higher shuffle write (246.4 MiB vs 186.0 MiB)** and **longer execution (1.1 min vs 57s)** than **4 partitions**. |
| **Is `coalesce` cheaper than `repartition` (for same output)?** | ✔️ Yes. **Coalesce to 2** took only **23s with tiny shuffle**, while **Repartition to 2** took **1.1 min with shuffle**. |
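| **Are more partitions always better?** | ❌ No. **repartition(8)** was slower than **repartition(4)** (1.1 min vs 57s), and both were slower than the un-repartitioned **baseline** (42s), because the extra shuffle outweighed any gain in parallelism. Partition tuning must balance **parallelism** and **shuffle overhead**. |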

### Recommendations

#### Best Practices
- For **writing**, prefer `coalesce(n)` to reduce the output file count without an extra shuffle; keep in mind that a very small `n` also lowers the parallelism of the stage that feeds the write
- Avoid unnecessary `repartition(n)` unless you need even data distribution before a wide transformation (e.g., `groupBy`)
- Use `repartition()` only when correcting data skew or when you need even distribution before expensive operations

#### When to Use This Optimization
- **Coalesce**: When reducing output files for writing operations
- **Repartition**: When you need even data distribution before wide transformations
- **No change**: When current partitioning is already optimal

#### Common Pitfalls to Avoid
- **Over-partitioning**: Too many partitions can increase overhead
- **Under-partitioning**: Too few partitions can cause memory pressure
- **Unnecessary repartitioning**: Only repartition when you have a specific need