In [None]:
import math
import re

from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName("challenge").getOrCreate()
# `sqlContext` is kept as a plain alias of the session in case other cells refer
# to it. The previous `SparkSession(spark)` passed a SparkSession where the
# constructor expects a SparkContext, which only worked by accident.
sqlContext = spark
# Limit Spark's log output to errors so the cell outputs stay readable.
spark.sparkContext.setLogLevel("ERROR")

## Example 1

We have _a single huge CSV file of 2647733632 bytes_ (approx. 2.5 GB). Let's estimate the partition count with the default Spark configuration.

In [5]:
# Multipliers (binary units) for the size suffixes a byte-size setting may carry.
_BYTE_UNIT_FACTORS = {
    "": 1, "b": 1,
    "k": 1024, "kb": 1024,
    "m": 1024 ** 2, "mb": 1024 ** 2,
    "g": 1024 ** 3, "gb": 1024 ** 3,
    "t": 1024 ** 4, "tb": 1024 ** 4,
    "p": 1024 ** 5, "pb": 1024 ** 5,
}


def _parse_byte_size(value):
    """Convert a byte-size setting such as "134217728b", "128m" or "4MB" to an int.

    Args:
        value: The setting value; a whole number optionally followed by a unit
            suffix (case-insensitive). A missing suffix means bytes.

    Returns:
        The size in bytes.

    Raises:
        ValueError: If the value is not a whole number with a known suffix.
    """
    match = re.fullmatch(r"\s*(-?[0-9]+)\s*([a-z]*)\s*", str(value).lower())
    if match is None or match.group(2) not in _BYTE_UNIT_FACTORS:
        raise ValueError(f"Cannot parse byte size: {value!r}")
    number, unit = match.groups()
    return int(number) * _BYTE_UNIT_FACTORS[unit]


def num_partitions(file_size, num_of_files, spark):
    """Estimate how many partitions a file-based read will produce.

    Every intermediate value is printed in bytes and in MB so the calculation
    can be followed step by step. The result is an approximation derived from
    the session's settings, not a value reported by Spark itself.

    Args:
        file_size: Size of one file in bytes (the average size when
            ``num_of_files`` is greater than 1).
        num_of_files: Number of files being read.
        spark: Session object providing ``conf.get(...)`` and
            ``sparkContext.defaultParallelism``.

    Returns:
        The estimated partition count, rounded up to a whole number. The
        unrounded estimate is what gets printed.

    Raises:
        ValueError: If a byte-size setting cannot be parsed.
    """
    # Maximum number of bytes packed into a single partition.
    partition_size = _parse_byte_size(spark.conf.get("spark.sql.files.maxPartitionBytes"))
    print(f"Partition Size: {partition_size} in bytes and {partition_size / 1024 / 1024} in MB")
    # Padding added per file to account for the cost of opening it.
    open_cost_size = _parse_byte_size(spark.conf.get("spark.sql.files.openCostInBytes"))
    print(f"Open Cost Size: {open_cost_size} in bytes and {open_cost_size / 1024 / 1024} in MB")
    # Default parallelism of the session's Spark context.
    parallelism = int(spark.sparkContext.defaultParallelism)
    print(f"Default Parallelism: {parallelism}")
    # Total size of the data on disk.
    total_file_size = file_size * num_of_files
    print(f"Total File size on disk: {total_file_size} in bytes and {total_file_size / 1024 /1024} in MB")
    # Size on disk plus one open-cost padding per file.
    padded_file_size = total_file_size + (num_of_files * open_cost_size)
    print(f"Total padded file size: {padded_file_size} in bytes and {padded_file_size / 1024 /1024} in MB")
    # Share of the padded size that falls on each core.
    bytes_per_core = padded_file_size / parallelism
    print(f"Bytes per Core: {bytes_per_core} in bytes and {bytes_per_core / 1024 /1024} in MB")
    # Split size: at least the open cost, at most the configured partition size.
    max_bytes_per_split = min(partition_size, max(open_cost_size, bytes_per_core))
    print(f"Max bytes per Partition: {max_bytes_per_split} in bytes and {max_bytes_per_split / 1024 /1024} in MB")
    # A zero split size (both settings 0 and nothing to read) would divide by
    # zero; there is nothing to partition in that case.
    num_of_partitions = padded_file_size / max_bytes_per_split if max_bytes_per_split else 0.0
    print(f"Approx number of partitions: {num_of_partitions}")
    # A partially filled last partition still counts as a partition.
    return math.ceil(num_of_partitions)

In [6]:
# Example 1: a single file of 2647733632 bytes, read with the session's current settings
num_partitions(2647733632, 1, spark)

Partition Size: 134217728 in bytes and 128.0 in MB
Open Cost Size: 4194304 in bytes and 4.0 in MB
Default Parallelism: 4
Total File size on disk: 2647733632 in bytes and 2525.0755615234375 in MB
Total padded file size: 2651927936 in bytes and 2529.0755615234375 in MB
Bytes per Core: 662981984.0 in bytes and 632.2688903808594 in MB
Max bytes per Partition: 134217728 in bytes and 128.0 in MB
Approx number of partitions: 19.758402824401855


## Example 2

Consider a folder with _41300 tiny Parquet files_ with an average size of 7777 bytes per file (approx. 7.7 KB). We can now use the function defined above to predict the number of partitions.

In [8]:
# Example 2: estimate the partition count for 41300 files averaging 7777 bytes each
num_partitions(file_size=7777, num_of_files=41300, spark=spark)

Partition Size: 134217728 in bytes and 128.0 in MB
Open Cost Size: 4194304 in bytes and 4.0 in MB
Default Parallelism: 4
Total File size on disk: 321190100 in bytes and 306.3107490539551 in MB
Total padded file size: 173545945300 in bytes and 165506.31074905396 in MB
Bytes per Core: 43386486325.0 in bytes and 41376.57768726349 in MB
Max bytes per Partition: 134217728 in bytes and 128.0 in MB
Approx number of partitions: 1293.018052726984
