### Optimize huge file read

In [2]:
# Build the Spark session used by every cell below

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Optimize huge file reads")
    .master("local[*]")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary
spark

In [5]:
# Inspect the partition size Spark currently uses when splitting input files.
# The setting comes back as a string; any "b" characters are stripped so it can be parsed as an int.
raw_partition_setting = spark.conf.get("spark.sql.files.maxPartitionBytes")
partition_size = raw_partition_setting.replace("b","")
partition_size_mb = int(partition_size) / 1024 / 1024
print(f"Partition Size: {partition_size} in bytes and {partition_size_mb} in MB")

Partition Size: 134217728 in bytes and 128.0 in MB


In [6]:
# Report the default parallelism of the underlying SparkContext
sc = spark.sparkContext
print(f"Parallelism : {sc.defaultParallelism}")

Parallelism : 8


In [18]:
# Size on disk of the file we are going to import, shown in bytes, MB and GB
import os

file_size = os.path.getsize('dataset/sales_combined_2.csv')
file_size_mb = file_size / 1024 / 1024
file_size_gb = file_size_mb / 1024
# Line breaks and indentation are spelled out explicitly so the printed layout is easy to see
print(
    "Data File Size: \n"
    f"            {file_size} in bytes \n"
    f"            {file_size_mb} in MB\n"
    f"            {file_size_gb} in GB"
)

Data File Size: 
            2647733632 in bytes 
            2525.0755615234375 in MB
            2.465894103050232 in GB


In [19]:
# Let's create a simple Python decorator - {get_time} to get the execution timings
# If you don't know about Python decorators - check out : https://www.geeksforgeeks.org/decorators-in-python/
import time

def get_time(func):
    """Run ``func`` once, immediately, and print how long it took.

    Unlike a conventional decorator, this executes ``func`` at decoration
    time: a cell that places ``@get_time`` above a ``def`` both defines and
    benchmarks the function as soon as the cell runs.

    Args:
        func: A zero-argument callable to time.

    Returns:
        ``func`` itself, unchanged, so the decorated name stays bound to the
        function instead of ``None`` (calling it later runs it without timing).
    """
    def inner_get_time() -> str:
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() can jump if the system clock changes.
        start_time = time.perf_counter()
        func()
        end_time = time.perf_counter()
        print("-"*80)
        return (f"Execution time: {(end_time - start_time)*1000} ms")
    print(inner_get_time())
    print("-"*80)
    return func

In [20]:
# Baseline benchmark: read the file and write it in noop format to time the job.
# get_time runs the function as soon as it is decorated, so executing this cell performs the benchmark.
@get_time
def x():
    """Load the sales CSV, print its partition count, then write it in noop format."""
    csv_reader = spark.read.format("csv").option("header", True)
    sales_df = csv_reader.load("dataset/sales_combined_2.csv")
    partition_count = sales_df.rdd.getNumPartitions()
    print(f"Number of Partition -> {partition_count}")
    sales_df.write.format("noop").mode("overwrite").save()

Number of Partition -> 20
--------------------------------------------------------------------------------
Execution time: 11532.451629638672 ms
--------------------------------------------------------------------------------


In [45]:
# Triple the 128 MB partition size so the file is split into fewer partitions
tripled_partition_bytes = 128 * 3 * 1024 * 1024
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{tripled_partition_bytes}b")

# Read the setting back to confirm the new value took effect
current_setting = spark.conf.get("spark.sql.files.maxPartitionBytes")
partition_size = current_setting.replace("b","")
partition_size_mb = int(partition_size) / 1024 / 1024
print(f"Partition Size: {partition_size} in bytes and {partition_size_mb} in MB")

Partition Size: 402653184 in bytes and 384.0 in MB


In [46]:
# Repeat the benchmark with the new partition size: read the file and write it in noop format.
# get_time runs the function as soon as it is decorated, so executing this cell performs the benchmark.
@get_time
def x():
    """Load the sales CSV, print its partition count, then write it in noop format."""
    sales_df = (
        spark.read
        .format("csv")
        .option("header", True)
        .load("dataset/sales_combined_2.csv")
    )
    print(f"Number of Partition -> {sales_df.rdd.getNumPartitions()}")
    noop_writer = sales_df.write.format("noop").mode("overwrite")
    noop_writer.save()

Number of Partition -> 8
--------------------------------------------------------------------------------
Execution time: 9590.608835220337 ms
--------------------------------------------------------------------------------


In [47]:
# Set the partition size to 160 MB, which yields fewer partitions than the 128 MB default run
partition_bytes_160mb = 160 * 1024 * 1024
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{partition_bytes_160mb}b")

# Read the setting back to confirm the new value took effect
current_setting = spark.conf.get("spark.sql.files.maxPartitionBytes")
partition_size = current_setting.replace("b","")
partition_size_mb = int(partition_size) / 1024 / 1024
print(f"Partition Size: {partition_size} in bytes and {partition_size_mb} in MB")

Partition Size: 167772160 in bytes and 160.0 in MB


In [48]:
# Repeat the benchmark with the new partition size: read the file and write it in noop format.
# get_time runs the function as soon as it is decorated, so executing this cell performs the benchmark.
@get_time
def x():
    """Load the sales CSV, print its partition count, then write it in noop format."""
    source_path = "dataset/sales_combined_2.csv"
    sales_df = spark.read.format("csv").option("header", True).load(source_path)
    partition_total = sales_df.rdd.getNumPartitions()
    print(f"Number of Partition -> {partition_total}")
    sales_df.write.format("noop").mode("overwrite").save()

Number of Partition -> 16
--------------------------------------------------------------------------------
Execution time: 7111.926555633545 ms
--------------------------------------------------------------------------------


In [49]:
# All benchmarks are done - stop the Spark session created at the top of the notebook
spark.stop()