# Today's topic: How the number of Spark partitions is determined when loading Parquet data

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import math
import time

In [2]:
# Local session on 4 cores with Hive support. The G-Research spark-extension
# package is added through spark.jars.packages (see the references below).
spark = (
    SparkSession.builder
    .appName("Data with Nikk the Greek Spark Session")
    .master("local[4]")
    .enableHiveSupport()
    .config("spark.jars.packages", "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5")
    .getOrCreate()
)

"""
Reference gresearch:
- Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/
- GitHub Spark extension: https://github.com/G-Research/spark-extension
- Parquet methods: https://github.com/G-Research/spark-extension/tree/master/python/gresearch/spark/parquet
"""
# Short handle on the SparkContext, used by the helpers to label jobs.
sc = spark.sparkContext

:: loading settings :: url = jar:file:/Users/simonedangelo/Documents/simonedangelo-blog/myenv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/simonedangelo/.ivy2/cache
The jars for the packages stored in: /Users/simonedangelo/.ivy2/jars
uk.co.gresearch.spark#spark-extension_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ccf0cad9-0e17-4d84-bbbe-4fadc91b418b;1.0
	confs: [default]
	found uk.co.gresearch.spark#spark-extension_2.12;2.11.0-3.5 in central
	found com.github.scopt#scopt_2.12;4.1.0 in central
:: resolution report :: resolve 118ms :: artifacts dl 4ms
	:: modules in use:
	com.github.scopt#scopt_2.12;4.1.0 from central in [default]
	uk.co.gresearch.spark#spark-extension_2.12;2.11.0-3.5 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||

In [3]:
import gresearch.spark.parquet

In [4]:
def sdf_generator(num_rows: int, num_partitions: int = None) -> "DataFrame":
    """Build a synthetic DataFrame with `num_rows` ids and a few derived columns.

    Args:
        num_rows: upper bound passed to `spark.range` (ids 0 .. num_rows - 1).
        num_partitions: forwarded to `spark.range` as `numPartitions`.

    Returns:
        A DataFrame with the columns id, date, timestamp, idstring, idfirst
        and idlast.
    """
    ids = spark.range(num_rows, numPartitions=num_partitions)

    # Current date / timestamp columns.
    stamped = ids.withColumn("date", f.current_date())
    stamped = stamped.withColumn("timestamp", f.current_timestamp())

    # String form of the id, plus one-character substrings taken at start
    # positions 0 and -1 of that string.
    id_text = f.col("idstring")
    with_text = stamped.withColumn("idstring", f.col("id").cast("string"))
    with_first = with_text.withColumn("idfirst", id_text.substr(0, 1))
    return with_first.withColumn("idlast", id_text.substr(-1, 1))

In [5]:
# Directory under which write_generator() saves the generated parquet datasets.
# NOTE(review): BASE_DIR is assigned a different value again further down the notebook.
BASE_DIR = "base_dir_loading_lesson"

In [6]:
# Shared notebook-level state: the helper functions below record their
# settings and measurements into results_dict under fixed keys.
results_dict = {}
# NOTE(review): results_list is not referenced in the visible part of the
# notebook — presumably it collects results_dict snapshots later; confirm.
results_list = []
def write_generator(num_rows, num_files):
    """Generate a synthetic DataFrame and write it as parquet under BASE_DIR.

    Args:
        num_rows: number of rows to generate.
        num_files: number of partitions requested from sdf_generator; it is
            also used in the output directory name.

    Returns:
        The path the dataset was written to.
    """
    sdf = sdf_generator(num_rows, num_files)
    path = f"{BASE_DIR}/{num_files}_files_{num_rows}_rows.parquet"
    sc.setJobDescription(f"Write {num_files} files, {num_rows} rows")
    try:
        sdf.write.format("parquet").mode("overwrite").save(path)
    finally:
        # Clear the description with None (the string "None" would label every
        # later job "None"), and do so even when the write fails.
        sc.setJobDescription(None)
    print(f"Num partitions written: {sdf.rdd.getNumPartitions()}")
    print(f"Saved Path: {path}")
    return path

In [7]:
def set_configs(maxPartitionsMB=128, openCostInMB=4, minPartitions=4):
    """Apply the file-partitioning settings to the session and print them.

    Args:
        maxPartitionsMB: value for spark.sql.files.maxPartitionBytes, in MB.
        openCostInMB: value for spark.sql.files.openCostInBytes, in MB.
        minPartitions: value for spark.sql.files.minPartitionNum.
    """
    max_bytes = math.ceil(maxPartitionsMB * 1024 * 1024)
    open_cost_bytes = math.ceil(openCostInMB * 1024 * 1024)

    # Byte settings are written with a trailing "b"; the readers in this
    # notebook strip that last character again before parsing.
    settings = {
        "spark.sql.files.maxPartitionBytes": f"{max_bytes}b",
        "spark.sql.files.openCostInBytes": f"{open_cost_bytes}b",
        "spark.sql.files.minPartitionNum": str(minPartitions),
    }
    for key, value in settings.items():
        spark.conf.set(key, value)

    report = (
        " ",
        "******** SPARK CONFIGURATIONS ********",
        f"MaxPartitionSize {maxPartitionsMB} MB or {max_bytes} bytes",
        f"OpenCostInBytes {openCostInMB} MB or {open_cost_bytes} bytes",
        f"Min Partitions: {minPartitions}",
    )
    for line in report:
        print(line)

    # Despite the key name, the stored value is the size in MB.
    results_dict["maxPartitionsBytes"] = maxPartitionsMB

In [8]:
def bytes_rows_per_partition(path):
    """Summarise compressed bytes and rows per Spark partition for `path`.

    Reads `spark.read.parquet_partitions(path)`, aggregates per partition,
    shows up to 20 rows and returns the aggregated DataFrame.

    Returns:
        DataFrame with the columns partition, numFiles, compressedBytes,
        compressedMB and rows, ordered by partition.
    """
    splits = spark.read.parquet_partitions(path)

    # numFiles counts the input rows that fall into each partition.
    per_partition = splits.groupBy("partition").agg(
        f.sum("compressedBytes").alias("compressedBytes"),
        f.sum("rows").alias("rows"),
        f.count("partition").alias("numFiles"),
    )

    compressed_mb = f.round(f.col("compressedBytes") / 1024 / 1024, 1)
    summary = (
        per_partition
        .withColumn("compressedMB", compressed_mb)
        .select("partition", "numFiles", "compressedBytes", "compressedMB", "rows")
        .orderBy("partition")
    )
    summary.show(20)
    return summary

# 4. Basic algorithm

In [9]:
#Basic algorithm
def basic_algorithm(file_size):
    """Print a simple estimate of the partition count for `file_size` bytes.

    The partition size is the smaller of spark.sql.files.maxPartitionBytes and
    the even share per minimum partition (file_size / minPartitionNum); the
    estimate is file_size divided by that partition size, rounded up.

    Args:
        file_size: total input size in bytes.
    """
    # [:-1] drops the last character of the setting (the trailing "b" that
    # set_configs writes).
    max_partition_bytes = int(spark.conf.get("spark.sql.files.maxPartitionBytes")[:-1])
    min_partition_num = int(spark.conf.get("spark.sql.files.minPartitionNum"))

    share_per_core = file_size / min_partition_num
    chunk_bytes = min(max_partition_bytes, share_per_core)
    raw_estimate = file_size / chunk_bytes  # rounded up when reported

    def to_mb(num_bytes):
        return round(num_bytes / 1024 / 1024, 1)

    print(" ")
    print("******** BASIC ALGORITHM TO ESTIMATE NO PARTITIONS ********")
    print(f"File Size: {to_mb(file_size)} MB or {file_size} bytes")
    print(f"Size Per Core: {to_mb(share_per_core)} MB or {share_per_core} bytes")
    print(f"Partion size: {to_mb(chunk_bytes)} MB or {chunk_bytes} bytes")
    print(f"EstimatedPartitions: {math.ceil(raw_estimate)}, unrounded: {raw_estimate}")

#Reference: https://www.linkedin.com/pulse/how-initial-number-partitions-determined-pyspark-sugumar-srinivasan#:~:text=Ideally%20partitions%20will%20be%20created,resource%20will%20get%20utilised%20properly

In [10]:
# Scenario: 64 MB input (file_size is in MB and converted to bytes below),
# 128 MB max partition size, 4 MB open cost, 4 minimum partitions.
file_size = 64
set_configs(maxPartitionsMB=128, openCostInMB=4, minPartitions=4)
basic_algorithm(file_size*1024*1024)

 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 4
 
******** BASIC ALGORITHM TO ESTIMATE NO PARTITIONS ********
File Size: 64.0 MB or 67108864 bytes
Size Per Core: 16.0 MB or 16777216.0 bytes
Partion size: 16.0 MB or 16777216.0 bytes
EstimatedPartitions: 4, unrounded: 4.0


In [11]:
# Scenario: 100 MB input (file_size is in MB and converted to bytes below),
# 128 MB max partition size, 4 MB open cost, 4 minimum partitions.
file_size = 100
set_configs(maxPartitionsMB=128, openCostInMB=4, minPartitions=4)
basic_algorithm(file_size*1024*1024)

 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 4
 
******** BASIC ALGORITHM TO ESTIMATE NO PARTITIONS ********
File Size: 100.0 MB or 104857600 bytes
Size Per Core: 25.0 MB or 26214400.0 bytes
Partion size: 25.0 MB or 26214400.0 bytes
EstimatedPartitions: 4, unrounded: 4.0


In [12]:
# Scenario: 200 MB input (file_size is in MB and converted to bytes below),
# 128 MB max partition size, 4 MB open cost, 4 minimum partitions.
file_size = 200
set_configs(maxPartitionsMB=128, openCostInMB=4, minPartitions=4)
basic_algorithm(file_size*1024*1024)

 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 4
 
******** BASIC ALGORITHM TO ESTIMATE NO PARTITIONS ********
File Size: 200.0 MB or 209715200 bytes
Size Per Core: 50.0 MB or 52428800.0 bytes
Partion size: 50.0 MB or 52428800.0 bytes
EstimatedPartitions: 4, unrounded: 4.0


In [13]:
# Scenario: 200 MB input with the max partition size lowered to 45 MB, which
# is below the 50 MB share per minimum partition (200 MB / 4).
file_size = 200
set_configs(maxPartitionsMB=45, openCostInMB=4, minPartitions=4)
basic_algorithm(file_size*1024*1024)

 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 45 MB or 47185920 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 4
 
******** BASIC ALGORITHM TO ESTIMATE NO PARTITIONS ********
File Size: 200.0 MB or 209715200 bytes
Size Per Core: 50.0 MB or 52428800.0 bytes
Partion size: 45.0 MB or 47185920 bytes
EstimatedPartitions: 5, unrounded: 4.444444444444445


In [14]:
# Scenario: 200 MB input with the max partition size lowered to 1 MB, far
# below the 50 MB share per minimum partition (200 MB / 4).
file_size = 200
set_configs(maxPartitionsMB=1, openCostInMB=4, minPartitions=4)
basic_algorithm(file_size*1024*1024)

 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 1 MB or 1048576 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 4
 
******** BASIC ALGORITHM TO ESTIMATE NO PARTITIONS ********
File Size: 200.0 MB or 209715200 bytes
Size Per Core: 50.0 MB or 52428800.0 bytes
Partion size: 1.0 MB or 1048576 bytes
EstimatedPartitions: 200, unrounded: 200.0


# 5. Simple experiments

* Experiment 1: 4 files of 64.8 MB each, 259.3 MB in total
* Experiment 2: 8 files of 64.9 MB each, 518.9 MB in total
* Experiment 3: 8 files of 47.5 MB each, 380 MB in total

In [30]:
# Show the default parallelism of the Spark context (the session is created
# with master "local[4]").
spark.sparkContext.defaultParallelism

4

In [15]:
def get_parquet_file_size(path):
    """Return the summed `compressedBytes` of the parquet files under `path`.

    Uses `spark.read.parquet_metadata(path)`, keeps one row per filename and
    sums the compressedBytes column.
    """
    per_file = (
        spark.read.parquet_metadata(path)
        .select("filename", "blocks", "compressedBytes", "rows")
        .dropDuplicates(["filename"])
    )
    total = per_file.select(f.sum(per_file["compressedBytes"]))
    first_row = total.collect()[0]
    return first_row[0]

def file_analysis(path, num_files):
    """Print the total and the average per-file size of the dataset at `path`.

    Args:
        path: dataset location passed to get_parquet_file_size.
        num_files: number of files used as the divisor for the average.
    """
    total_bytes = get_parquet_file_size(path)
    mean_bytes = total_bytes / num_files

    def to_mb(num_bytes):
        return round(num_bytes / 1024 / 1024, 1)

    report = (
        " ",
        "******** FILE SIZE ANALYSIS ********",
        f"File Size: {to_mb(total_bytes)} MB or {total_bytes} bytes",
        f"Num files: {num_files}",
        f"Avg file Size: {to_mb(mean_bytes)} MB or {mean_bytes} bytes",
    )
    for line in report:
        print(line)

def row_count_analysis(num_files, num_rows):
    """Print the number of files and rows written and the rows per file.

    The rows-per-file figure is `num_rows / num_files` truncated to an int.
    """
    heading = (
        " ",
        "******** ROW COUNT ANALYSIS ********",
        f"Num files written: {num_files}",
        f"Num rows written: {num_rows}",
    )
    for line in heading:
        print(line)

    # Computed last so the lines above are printed before any division error.
    rows_per_file = int(num_rows / num_files)
    print(f"Num rows per file: {rows_per_file}")

def estimate_num_partitions(file_size, num_files):
    """
    Estimate the partition count for reading `num_files` files totalling
    `file_size` bytes under the current spark.sql.files.* settings.

    Two estimates are printed and stored in results_dict:
    - "Internet": padded total size divided by the maximum split size.
    - "Own": number of files divided by the number of average padded files
      that fit into one split (at least one file per split).

    Reference to code:
    - Stackoverflow: https://stackoverflow.com/questions/70985235/what-is-opencostinbytes
    - GitHub: https://github.com/apache/spark/blob/v3.3.1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala#L86-L97
    """
    def to_mb(num_bytes):
        return round(num_bytes/1024/1024, 1)

    # Current settings; [:-1] drops the last character of the byte settings
    # (the trailing "b" that set_configs writes).
    max_partition_bytes = int(spark.conf.get("spark.sql.files.maxPartitionBytes")[:-1])
    open_cost = int(spark.conf.get("spark.sql.files.openCostInBytes")[:-1])
    min_partitions = int(spark.conf.get("spark.sql.files.minPartitionNum"))

    # Maximum split size: min(maxPartitionBytes, max(openCost, padded bytes per core)).
    # a) Large input: the padded per-core share exceeds maxPartitionBytes, so
    #    maxPartitionBytes caps the split size
    #    (e.g. 1 GB dataset, 4 cores, maxPartitionBytes 128 MB).
    # b) Padded per-core share below maxPartitionBytes: the share itself is
    #    the split size and the data is spread evenly over the cores
    #    (e.g. 1 GB dataset, 4 cores, maxPartitionBytes 300 MB).
    # c) Very small per-core share: the open cost is the lower bound, which
    #    limits how many partitions get opened.
    padded_total = file_size + num_files * open_cost
    padded_per_core = padded_total / min_partitions
    split_bytes = min(max_partition_bytes, max(open_cost, padded_per_core))

    # "Internet" estimate.
    internet_estimate = padded_total/split_bytes

    # Own estimate: how many average padded files fit into one split, then
    # how many such splits are needed for all files.
    padded_avg_file = padded_total/num_files
    per_core = file_size / min_partitions
    files_per_split = max(1, math.floor(split_bytes/padded_avg_file))
    own_estimate = num_files/files_per_split

    print(" ")
    print("******** ESTIMATION OF MAX SPLIT PARTITION BYTES AND NO PARTITIONS ********")
    print(f"Avg file Size Padded: {to_mb(padded_avg_file)} MB or {padded_avg_file} bytes")
    print(f"Padded File Size: {to_mb(padded_total)} MB or {padded_total} bytes")
    print(f"SizePerCore: {to_mb(per_core)} MB or {per_core} bytes")
    print(f"SizePerCorePadded: {to_mb(padded_per_core)} MB or {padded_per_core} bytes")
    print(f"MaxSplitPartitionBytes: {to_mb(split_bytes)} MB or {split_bytes} bytes")
    print(f"MaxFilesPerPartition {files_per_split}")
    print(f"EstimatedPartitions: {math.ceil(own_estimate)}, unrounded: {own_estimate}")
    print(f"EstimatedPartitionsInternet: {math.ceil(internet_estimate)}, unrounded: {internet_estimate}")

    # Record the figures (sizes in MB, rounded to one decimal) for later comparison.
    results_dict["paddedFileSize"] = to_mb(padded_total)
    results_dict["MBPerCore"] = to_mb(per_core)
    results_dict["MBPerCorePadded"] = to_mb(padded_per_core)
    results_dict["maxSplitPartitionBytes"] = to_mb(split_bytes)
    results_dict["avg_file_size_padded"] = to_mb(padded_avg_file)
    results_dict["Maxfiles_per_partition"] = files_per_split
    results_dict["MyEstimationPartitions"] = math.ceil(own_estimate)
    results_dict["InternetEstimationPartitions"] = math.ceil(internet_estimate)

def get_actual_num_partitions(path):
    """Read the parquet dataset at `path` and report its partition count.

    The count is printed and stored in results_dict["ActualNumPartitions"].
    """
    loaded = spark.read.parquet(path)
    print(" ")
    print("******** ACTUAL RESULTS ********")
    partition_count = loaded.rdd.getNumPartitions()
    print(f"ActualNumPartitions: {partition_count}")
    results_dict["ActualNumPartitions"] = partition_count

def noop_write(path):
    """Read the parquet dataset at `path`, write it to the "noop" format and time it.

    The elapsed time in seconds (rounded to two decimals) is stored in
    results_dict["ExecutionTime"] and printed.

    Args:
        path: location of the parquet dataset to read.
    """
    sdf = spark.read.parquet(path)
    sc.setJobDescription("WRITE")
    try:
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        sdf.write.format("noop").mode("overwrite").save()
        end_time = time.perf_counter()
    finally:
        # Clear the description with None (the string "None" would label every
        # later job "None"), and do so even when the write fails.
        sc.setJobDescription(None)
    duration = round(end_time - start_time, 2)
    results_dict["ExecutionTime"] = duration
    print(f"Duration: {duration} sec")

In [26]:
# Re-point BASE_DIR two directory levels up. write_generator() reads BASE_DIR
# when it is called, so writes after this cell go to the new location.
BASE_DIR = "../../base_dir_loading_lesson"

In [27]:
# Generate 32,000,000 rows in 4 partitions and write them as parquet;
# `path` is the location returned by write_generator().
num_files = 4
num_rows = 32000000
path = write_generator(num_rows, num_files)



Num partitions written: 4
Saved Path: ../../base_dir_loading_lesson/4_files_32000000_rows.parquet


                                                                                

In [29]:
# Dataset written by the previous cell (4 partitions, 32,000,000 rows); the
# path is hard-coded here instead of reusing the value returned by write_generator().
path = "../../base_dir_loading_lesson/4_files_32000000_rows.parquet"
num_files = 4
num_rows = 32000000
# Report the size on disk and the rows per file.
file_analysis(path, num_files)
row_count_analysis(num_files, num_rows)
# Settings under test: 50 MB max partition size, 4 MB open cost, 4 minimum partitions.
set_configs(maxPartitionsMB=50, openCostInMB=4, minPartitions=4)
# Both estimators work on the summed compressed size of the dataset.
size = get_parquet_file_size(path)
basic_algorithm(size)
estimate_num_partitions(size, num_files)
# Compare the estimates with the partition count Spark reports, time a noop
# write, and show the bytes / rows per partition.
get_actual_num_partitions(path)
noop_write(path)
bytes_rows_per_partition(path)

 
******** FILE SIZE ANALYSIS ********
File Size: 267.6 MB or 280634098 bytes
Num files: 4
Avg file Size: 66.9 MB or 70158524.5 bytes
 
******** ROW COUNT ANALYSIS ********
Num files written: 4
Num rows written: 32000000
Num rows per file: 8000000
 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 50 MB or 52428800 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 4
 
******** BASIC ALGORITHM TO ESTIMATE NO PARTITIONS ********
File Size: 267.6 MB or 280634098 bytes
Size Per Core: 66.9 MB or 70158524.5 bytes
Partion size: 50.0 MB or 52428800 bytes
EstimatedPartitions: 6, unrounded: 5.352670631408691
 
******** ESTIMATION OF MAX SPLIT PARTITION BYTES AND NO PARTITIONS ********
Avg file Size Padded: 70.9 MB or 74352828.5 bytes
Padded File Size: 283.6 MB or 297411314 bytes
SizePerCore: 66.9 MB or 70158524.5 bytes
SizePerCorePadded: 70.9 MB or 74352828.5 bytes
MaxSplitPartitionBytes: 50.0 MB or 52428800 bytes
MaxFilesPerPartition 1
EstimatedPartitions: 4, unrounded: 4.0
Est

                                                                                

Duration: 1.98 sec
+---------+--------+---------------+------------+-------+
|partition|numFiles|compressedBytes|compressedMB|   rows|
+---------+--------+---------------+------------+-------+
|        0|       1|       70205760|        67.0|8000000|
|        1|       1|       69631609|        66.4|8000000|
|        2|       1|       70398611|        67.1|8000000|
|        3|       1|       70398118|        67.1|8000000|
|        4|       2|              0|         0.0|      0|
|        5|       2|              0|         0.0|      0|
+---------+--------+---------------+------------+-------+



DataFrame[partition: int, numFiles: bigint, compressedBytes: bigint, compressedMB: double, rows: bigint]