In [66]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import DataFrame
import random

In [67]:
# Initialize Spark Session
spark: SparkSession.Builder = SparkSession.builder \
    .appName("PartitioningExample") \
    .config(
        "spark.driver.extraJavaOptions", "-Djava.security.manager=allow"
    ) \
    .getOrCreate()

In [68]:
def create_sample_data(n=1000):
    data = []
    for i in range(n):
        age = random.randint(18, 80)
        salary = random.randint(30000, 150000)
        department = random.choice(['HR', 'Engineering', 'Sales', 'Marketing'])
        data.append((i, age, salary, department))
    return data

In [69]:
# Create DataFrame
data = create_sample_data()
df: DataFrame = spark.createDataFrame(data, ["id", "age", "salary", "department"])

In [70]:
# Show initial data distribution
print("Initial Data Sample:")
df.show(5)
print("\nInitial Partitioning:")
print(f"Number of partitions: {df.rdd.getNumPartitions()}")

Initial Data Sample:
+---+---+------+-----------+
| id|age|salary| department|
+---+---+------+-----------+
|  0| 77|101670|  Marketing|
|  1| 42| 70607|         HR|
|  2| 63| 91851|  Marketing|
|  3| 52| 55338|Engineering|
|  4| 41| 50951|         HR|
+---+---+------+-----------+
only showing top 5 rows


Initial Partitioning:
Number of partitions: 10


In [71]:
# Example 1: Basic repartition by age
print("\nExample 1: Basic repartition")
df_repartitioned = df.repartition(5, "age")
print(f"Number of partitions after repartition: {df_repartitioned.rdd.getNumPartitions()}")
print("\nSample after repartition:")
df_repartitioned.show(5)


Example 1: Basic repartition
Number of partitions after repartition: 5

Sample after repartition:


In [72]:
# Example 2: Range partitioning by salary
print("\nExample 2: Range partitioning by salary")
df_range = df.repartitionByRange(5, F.col("salary"))
print(f"Number of partitions after range partitioning: {df_range.rdd.getNumPartitions()}")
print("\nSample after range partitioning (ordered by salary):")
df_range.show(20)


Example 2: Range partitioning by salary
Number of partitions after range partitioning: 5

Sample after range partitioning (ordered by salary):
+---+---+------+-----------+
| id|age|salary| department|
+---+---+------+-----------+
|  4| 41| 50951|         HR|
|  7| 77| 38537|  Marketing|
|  8| 76| 41430|      Sales|
| 19| 77| 47865|  Marketing|
| 22| 48| 34906|  Marketing|
| 23| 71| 43374|      Sales|
| 24| 38| 47238|      Sales|
| 36| 48| 33289|         HR|
| 37| 43| 46285|  Marketing|
| 46| 80| 36612|Engineering|
| 52| 37| 48157|      Sales|
| 53| 61| 49190|      Sales|
| 57| 28| 41113|         HR|
| 58| 63| 34813|  Marketing|
| 60| 71| 55033|      Sales|
| 62| 78| 45924|Engineering|
| 67| 32| 47229|Engineering|
| 80| 24| 44930|      Sales|
| 82| 21| 50430|      Sales|
| 83| 68| 47027|      Sales|
+---+---+------+-----------+
only showing top 20 rows



In [73]:
# Example 3: Multiple column partitioning
print("\nExample 3: Multiple column partitioning")
df_multi = df.repartition(5, "department", "age")
print(f"Number of partitions: {df_multi.rdd.getNumPartitions()}")
print("\nSample after multiple column partitioning:")
df_multi.show(5)


Example 3: Multiple column partitioning
Number of partitions: 5

Sample after multiple column partitioning:
+---+---+------+----------+
| id|age|salary|department|
+---+---+------+----------+
|  5| 43| 97102|        HR|
| 10| 48|121824|        HR|
| 15| 39| 62575|     Sales|
| 18| 79|146825|        HR|
| 23| 71| 43374|     Sales|
+---+---+------+----------+
only showing top 5 rows



In [74]:
# Analyzing partition distribution
def analyze_partitions(df, col_name):
    return df.groupBy(F.col(col_name)) \
        .agg(F.count("*").alias("count")) \
        .orderBy(F.col(col_name)) \
        .show()

print("\nAnalyzing salary distribution in range partitioned data:")
analyze_partitions(df_range, "salary")


Analyzing salary distribution in range partitioned data:
+------+-----+
|salary|count|
+------+-----+
| 30010|    1|
| 30189|    1|
| 30407|    1|
| 30644|    1|
| 30789|    1|
| 31237|    1|
| 31250|    1|
| 31422|    1|
| 31658|    1|
| 31663|    1|
| 32112|    1|
| 32113|    1|
| 32401|    1|
| 32890|    1|
| 32913|    1|
| 33072|    1|
| 33262|    1|
| 33289|    1|
| 33407|    1|
| 33665|    1|
+------+-----+
only showing top 20 rows



In [75]:
# Additional experiments
print("\nComparing sorting performance:")
from time import time

# 일반 정렬
start = time()
df.orderBy(F.col("salary")).count()
print(f"Regular sort time: {time() - start}")

# range partition 후 정렬
start = time()
df_range.orderBy(F.col("salary")).count()
print(f"Range partitioned sort time: {time() - start}")

# 특정 범위의 데이터 확인
print("\nChecking specific salary range (50000-70000):")
df_range.where(
    (F.col("salary") >= 50000) & 
    (F.col("salary") < 70000)
).show(5)

# 실행 계획 확인
print("\nExecution plan for range partitioned DataFrame:")
df_range.select("salary").explain()


Comparing sorting performance:
Regular sort time: 0.12279009819030762
Range partitioned sort time: 0.20327401161193848

Checking specific salary range (50000-70000):
+---+---+------+----------+
| id|age|salary|department|
+---+---+------+----------+
|  4| 41| 50951|        HR|
| 82| 21| 50430|     Sales|
| 98| 80| 52461|        HR|
|133| 77| 51958|     Sales|
|142| 80| 52591|     Sales|
+---+---+------+----------+
only showing top 5 rows


Execution plan for range partitioned DataFrame:
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange rangepartitioning(salary#792L ASC NULLS FIRST, 5), REPARTITION_BY_NUM, [plan_id=2707]
   +- Project [salary#792L]
      +- Scan ExistingRDD[id#790L,age#791L,salary#792L,department#793]


