In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pyspark
from pyspark.sql import DataFrame

## 1.0. Setup

In [2]:
# Build (or reuse) the notebook-wide Spark session: local master "local[4]",
# Hive support enabled.
spark: SparkSession = (
    SparkSession.builder
    .appName("Partitioning 1")
    .master("local[4]")
    .enableHiveSupport()
    .getOrCreate()
)

# Shorthand for the session's underlying SparkContext.
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/12 16:31:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
def sdf_generator1(num_iter: int = 1) -> DataFrame:
    """Create a small two-column DataFrame from ten fixed rows.

    The base rows pair the letters a, b, c, d, e, e, f, g, h, i (note that
    "e" appears twice) with the integers 1 through 10.

    Args:
        num_iter: Number of times the ten base rows are repeated.

    Returns:
        A DataFrame created with the DDL schema ``a string, b int``.
    """
    letters = ["a", "b", "c", "d", "e", "e", "f", "g", "h", "i"]
    base_rows = [
        {"a": letter, "b": number}
        for number, letter in enumerate(letters, start=1)
    ]
    # Repeating the list yields the same sequence as extending an empty list
    # with the base rows num_iter times.
    rows = base_rows * num_iter
    return spark.createDataFrame(rows, schema="a string, b int")

In [4]:
def sdf_generator2(num_rows: int, num_partitions: int = None) -> DataFrame:
    """Generate a DataFrame of sequential ids plus a few derived columns.

    Args:
        num_rows: Passed to ``spark.range`` as the number of ids to generate.
        num_partitions: Passed to ``spark.range`` as ``numPartitions``;
            ``None`` is forwarded unchanged.

    Returns:
        A DataFrame with columns ``id``, ``date``, ``timestamp``, ``idstring``,
        ``idfirst`` and ``idlast``.
    """
    ids = spark.range(num_rows, numPartitions=num_partitions)

    # Stamp every row with the current date and timestamp.
    stamped = (
        ids
        .withColumn("date", f.current_date())
        .withColumn("timestamp", f.current_timestamp())
    )

    # String form of the id, from which the first/last characters are cut.
    with_text = stamped.withColumn("idstring", f.col("id").cast("string"))
    id_text = f.col("idstring")

    return (
        with_text
        # NOTE(review): start position 0 is kept as-is; Spark substring
        # positions are 1-based and 0 presumably behaves like 1 - confirm.
        .withColumn("idfirst", id_text.substr(0, 1))
        # Negative start position: one character taken from the end.
        .withColumn("idlast", id_text.substr(-1, 1))
    )

## 1.1. Partition Size based on Cores and Data Amount with spark.createDataFrame

In [5]:
# Default parallelism of the SparkContext behind this session.
sc.defaultParallelism

4

In [6]:
# Two repetitions of the ten base rows; then inspect how many partitions
# createDataFrame produced.
sdf_gen1 = sdf_generator1(num_iter=2)
sdf_gen1.rdd.getNumPartitions()

4

In [7]:
# Tag every row with the id of the partition that holds it.
sdf_part1 = sdf_gen1.select(
    "*",
    f.spark_partition_id().alias("partition_id"),
)
sdf_part1.show()

                                                                                

+---+---+------------+
|  a|  b|partition_id|
+---+---+------------+
|  a|  1|           0|
|  b|  2|           0|
|  c|  3|           0|
|  d|  4|           0|
|  e|  5|           0|
|  e|  6|           1|
|  f|  7|           1|
|  g|  8|           1|
|  h|  9|           1|
|  i| 10|           1|
|  a|  1|           2|
|  b|  2|           2|
|  c|  3|           2|
|  d|  4|           2|
|  e|  5|           2|
|  e|  6|           3|
|  f|  7|           3|
|  g|  8|           3|
|  h|  9|           3|
|  i| 10|           3|
+---+---+------------+



In [8]:
# Rows per partition, plus each partition's share of the total in percent.
row_count = sdf_gen1.count()
sdf_part_count1 = (
    sdf_part1
    .groupBy("partition_id")
    .count()
    .withColumn("count_perc", 100 * f.col("count") / row_count)
)
sdf_part_count1.show()

+------------+-----+----------+
|partition_id|count|count_perc|
+------------+-----+----------+
|           0|    5|      25.0|
|           1|    5|      25.0|
|           2|    5|      25.0|
|           3|    5|      25.0|
+------------+-----+----------+



In [9]:
# Label the job, trigger it by writing to the "noop" format, then clear the label.
sc.setJobDescription("Gen1_Exp1")
try:
    sdf_gen1.write.format("noop").mode("overwrite").save()
finally:
    # Pass None (not the string "None") so the description is actually removed;
    # the string would label every later job with the literal text "None".
    sc.setJobDescription(None)

In [10]:
# Same generator with 2000 repetitions of the base rows; check the partition
# count again.
sdf_gen1_2 = sdf_generator1(num_iter=2000)
sdf_gen1_2.rdd.getNumPartitions()

4

In [11]:
# Tag rows with their partition id, then compute rows per partition and each
# partition's share of the total in percent.
sdf_part1_2 = sdf_gen1_2.select(
    "*",
    f.spark_partition_id().alias("partition_id"),
)
row_count = sdf_gen1_2.count()
sdf_part_count1_2 = (
    sdf_part1_2
    .groupBy("partition_id")
    .count()
    .withColumn("count_perc", 100 * f.col("count") / row_count)
)
sdf_part_count1_2.show()

+------------+-----+----------+
|partition_id|count|count_perc|
+------------+-----+----------+
|           0| 5120|      25.6|
|           1| 5120|      25.6|
|           2| 5120|      25.6|
|           3| 4640|      23.2|
+------------+-----+----------+



In [12]:
# Label the job, trigger it by writing to the "noop" format, then clear the label.
sc.setJobDescription("Gen1_Exp2")
try:
    sdf_gen1_2.write.format("noop").mode("overwrite").save()
finally:
    # Pass None (not the string "None") so the description is actually removed;
    # the string would label every later job with the literal text "None".
    sc.setJobDescription(None)

## 1.2. Partition Size based on Cores and Data Amount with spark.range
The same results as for `spark.createDataFrame` also hold here, even though `spark.range` is a built-in Spark function that generates the data itself.

In [13]:
# Generate ids with spark.range without requesting a partition count, then
# inspect how many partitions were created.
sdf_gen2 = sdf_generator2(num_rows=2000000)
sdf_gen2.rdd.getNumPartitions()

4

In [14]:
# Tag every row with the id of the partition that holds it.
sdf_part2 = sdf_gen2.select(
    "*",
    f.spark_partition_id().alias("partition_id"),
)
sdf_part2.show()

+---+----------+--------------------+--------+-------+------+------------+
| id|      date|           timestamp|idstring|idfirst|idlast|partition_id|
+---+----------+--------------------+--------+-------+------+------------+
|  0|2024-08-12|2024-08-12 16:31:...|       0|      0|     0|           0|
|  1|2024-08-12|2024-08-12 16:31:...|       1|      1|     1|           0|
|  2|2024-08-12|2024-08-12 16:31:...|       2|      2|     2|           0|
|  3|2024-08-12|2024-08-12 16:31:...|       3|      3|     3|           0|
|  4|2024-08-12|2024-08-12 16:31:...|       4|      4|     4|           0|
|  5|2024-08-12|2024-08-12 16:31:...|       5|      5|     5|           0|
|  6|2024-08-12|2024-08-12 16:31:...|       6|      6|     6|           0|
|  7|2024-08-12|2024-08-12 16:31:...|       7|      7|     7|           0|
|  8|2024-08-12|2024-08-12 16:31:...|       8|      8|     8|           0|
|  9|2024-08-12|2024-08-12 16:31:...|       9|      9|     9|           0|
| 10|2024-08-12|2024-08-1

In [15]:
# Total number of rows in sdf_gen2; kept in row_count for later cells and
# printed for reference.
row_count: int = sdf_gen2.count()
print(row_count)

2000000


In [16]:
# Rows per partition, plus each partition's share of the total in percent.
sdf_part_count2 = (
    sdf_part2
    .groupBy("partition_id")
    .count()
    .withColumn("count_perc", 100 * f.col("count") / row_count)
)
sdf_part_count2.show()

+------------+------+----------+
|partition_id| count|count_perc|
+------------+------+----------+
|           0|500000|      25.0|
|           1|500000|      25.0|
|           2|500000|      25.0|
|           3|500000|      25.0|
+------------+------+----------+



In [17]:
# Label the job, trigger it by writing to the "noop" format, then clear the label.
sc.setJobDescription("Gen2_Exp1")
try:
    sdf_gen2.write.format("noop").mode("overwrite").save()
finally:
    # Pass None (not the string "None") so the description is actually removed;
    # the string would label every later job with the literal text "None".
    sc.setJobDescription(None)

In [18]:
# A far larger row count, still without requesting a partition count; only the
# number of partitions is inspected here.
sdf_gen2_2 = sdf_generator2(num_rows=2000000000000000000)
sdf_gen2_2.rdd.getNumPartitions()

4

## 1.3. Influence of Spark partitions on performance

In [19]:
# Baseline run: 20,000,000 rows explicitly split into 4 partitions.
sdf1 = sdf_generator2(20000000, 4)
print(sdf1.rdd.getNumPartitions())

# Label the job, trigger it by writing to the "noop" format, then clear the label.
sc.setJobDescription("Part Exp1")
try:
    sdf1.write.format("noop").mode("overwrite").save()
finally:
    # Pass None (not the string "None") so the description is actually removed;
    # the string would label every later job with the literal text "None".
    sc.setJobDescription(None)

4


                                                                                

In [20]:
def _run_partition_experiment(num_partitions: int, num_rows: int = 20000000) -> DataFrame:
    """Generate a DataFrame with a fixed partition count and run one labelled job on it.

    Prints the resulting number of partitions, sets the job description to
    ``"Part Exp<num_partitions>"``, writes the DataFrame to the ``noop`` format,
    and clears the description again.

    Args:
        num_partitions: Partition count requested from ``sdf_generator2``.
        num_rows: Number of rows to generate.

    Returns:
        The generated DataFrame, so callers can keep a reference to it.
    """
    sdf = sdf_generator2(num_rows, num_partitions)
    print(sdf.rdd.getNumPartitions())
    sc.setJobDescription(f"Part Exp{num_partitions}")
    try:
        sdf.write.format("noop").mode("overwrite").save()
    finally:
        # Pass None (not the string "None") so the description is actually
        # removed; the string would label later jobs with the text "None".
        sc.setJobDescription(None)
    return sdf


# Same data volume, different partition counts; run order is unchanged.
sdf4 = _run_partition_experiment(4)
sdf8 = _run_partition_experiment(8)
sdf3 = _run_partition_experiment(3)
sdf6 = _run_partition_experiment(6)
sdf200 = _run_partition_experiment(200)
sdf20000 = _run_partition_experiment(20000)

4


                                                                                

8


                                                                                

3


                                                                                

6


                                                                                

200


                                                                                

20000


                                                                                