## Core Factors

In [1]:
# Create Spark Session

from pyspark.sql import SparkSession

# Build (or reuse) the session; the parenthesized chain avoids fragile
# backslash line continuations.
spark = (
    SparkSession.builder
    .appName("Factor of cores")
    .master("local[2]")  # local mode, 2 worker threads
    .getOrCreate()
)

# Last expression of the cell: renders the session summary.
spark

In [2]:
# Determine the degree of parallelism: show the default parallelism reported by
# the SparkContext (displayed as this cell's output).
spark.sparkContext.defaultParallelism

2

In [3]:
# Let's create a simple Python decorator - {get_time} - to get the execution timings
# If you don't know about Python decorators, check out: https://www.geeksforgeeks.org/decorators-in-python/
import time

def get_time(func):
    """Time ``func`` once, print the elapsed time, and return a re-runnable timer.

    ``func`` is executed immediately when the decorator is applied, so running
    a cell that defines a decorated function prints its timing right away.

    Args:
        func: Zero-argument callable to benchmark.

    Returns:
        A zero-argument callable that runs ``func`` again and returns the
        formatted ``"Execution time: ... ms"`` message. Returning it (instead
        of ``None``) keeps the decorated name callable for repeat measurements.
    """
    def inner_get_time() -> str:
        # perf_counter() is a monotonic, high-resolution clock; time.time() is
        # wall-clock time and can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        func()
        end_time = time.perf_counter()
        return (f"Execution time: {(end_time - start_time)*1000} ms")

    # Run once at decoration time and report the timing.
    print(inner_get_time())
    return inner_get_time

In [4]:
# Switch off Adaptive Query Execution and its related options for the
# benchmarking tests below.
for aqe_option in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.localShuffleReader.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
):
    spark.conf.set(aqe_option, False)

In [12]:
# Set the number of shuffle partitions to a value (9) that is not a factor of
# the core count.
spark.conf.set(key="spark.sql.shuffle.partitions", value=9)

In [8]:
# Time a groupBy aggregation (a shuffle) using the currently configured spark.sql.shuffle.partitions value
from pyspark.sql.functions import count, lit

@get_time
def x():
    """Read the sales parquet data, count rows per ``trx_id``, and write to the noop sink."""
    sales = spark.read.format("parquet").load("dataset/sales.parquet/")
    # groupBy forces a shuffle.
    trx_counts = sales.groupBy("trx_id").agg(count(lit(1)))
    # The "noop" format runs the full job without persisting any output.
    trx_counts.write.format("noop").mode("overwrite").save()

Execution time: 4236.340761184692 ms


In [9]:
# Repartitioning to a partition count (9) that is not a factor of the cores
(
    spark.read.format("parquet")
    .load("dataset/sales.parquet/")
    .repartition(9)
    .write.format("noop")
    .mode("overwrite")
    .save()
)

In [10]:
# Repartitioning to a partition count (8) based on a factor of the cores
(
    spark.read.format("parquet")
    .load("dataset/sales.parquet/")
    .repartition(8)
    .write.format("noop")
    .mode("overwrite")
    .save()
)