In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [2]:
# Build (or reuse) a local Spark session running on 4 threads, with Hive
# support enabled and 4g configured for both driver and executor memory.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Spark Partitions in Action")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .enableHiveSupport()
    .getOrCreate()
)

# SparkContext handle; the cells below use it to label jobs.
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/15 11:23:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Turn off Adaptive Query Execution (AQE): it generates additional jobs,
# which would make the job list confusing for this scenario.
spark.conf.set("spark.sql.adaptive.enabled", "false")

# Disable caching so timings are repeatable between runs (cached data could
# otherwise make later runs look faster than they are).
# NOTE(review): this key looks Databricks-specific and presumably has no
# effect on the local[4] session created above -- confirm.
spark.conf.set("spark.databricks.io.cache.enabled", "false")

In [4]:
def sdf_generator(num_rows: int, num_partitions: "int | None" = None) -> "DataFrame":
    """Build a synthetic DataFrame of sequential ids plus derived columns.

    Args:
        num_rows: Number of rows to generate (passed to ``spark.range``).
        num_partitions: Partition count passed to ``spark.range`` as
            ``numPartitions``; ``None`` leaves the choice to Spark.

    Returns:
        A DataFrame with the columns ``id``, ``date``, ``timestamp``,
        ``idstring`` (id cast to string), ``idfirst`` (first character of
        ``idstring``) and ``idlast`` (last character of ``idstring``).
    """
    return (
        spark.range(num_rows, numPartitions=num_partitions)
        .withColumn("date", f.current_date())
        .withColumn("timestamp", f.current_timestamp())
        .withColumn("idstring", f.col("id").cast("string"))
        # substr positions are 1-based; use 1 explicitly instead of relying
        # on Spark treating position 0 as the first character.
        .withColumn("idfirst", f.col("idstring").substr(1, 1))
        # A negative start counts from the end of the string.
        .withColumn("idlast", f.col("idstring").substr(-1, 1))
    )

def rows_per_partition(sdf: "DataFrame") -> None:
    """Show the row count and percentage share of every Spark partition.

    Counts the rows of ``sdf`` first, then prints one line per partition id
    with its row count and ``count_perc`` (share of the total, in percent),
    ordered by partition id.
    """
    total_rows = sdf.count()
    (
        sdf.withColumn("partition_id", f.spark_partition_id())
        .groupBy("partition_id")
        .count()
        .withColumn("count_perc", 100 * f.col("count") / total_rows)
        .orderBy("partition_id")
        .show()
    )

def rows_per_partition_col(sdf: "DataFrame", num_rows: int, col: str) -> None:
    """Show row counts per (partition, ``col`` value) pair.

    Args:
        sdf: DataFrame to inspect.
        num_rows: Total used as the denominator for ``count_perc``.
        col: Name of the column to break the per-partition counts down by.
    """
    grouping_keys = ("partition_id", col)
    (
        sdf.withColumn("partition_id", f.spark_partition_id())
        .groupBy(*grouping_keys)
        .count()
        .withColumn("count_perc", 100 * f.col("count") / num_rows)
        .orderBy(*grouping_keys)
        .show()
    )

In [5]:
# Size of the synthetic data set used by every experiment below (200 million rows).
num_rows = 200_000_000


In [6]:
# Baseline: generate the data directly with 12 partitions.
sdf = sdf_generator(num_rows, 12)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Base line 12")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                

In [7]:
# Baseline: generate the data directly with 13 partitions.
sdf = sdf_generator(num_rows, 13)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Base line 13")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

13


                                                                                

In [8]:
# Generate with 13 partitions, then coalesce down to 12.
sdf = sdf_generator(num_rows, 13).coalesce(12)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Coalesce 13 to 12")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                

In [9]:
# Generate with 13 partitions, then repartition to 12.
sdf = sdf_generator(num_rows, 13).repartition(12)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Repartition 13 to 12")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                

In [10]:
# Baseline: generate the data directly with 20001 partitions.
sdf = sdf_generator(num_rows, 20001)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Base line 20001")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

20001


                                                                                

In [11]:
# Generate with 20001 partitions, then coalesce down to 12.
sdf = sdf_generator(num_rows, 20001).coalesce(12)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Coalesce 20001 to 12")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                

In [12]:
# Generate with 20001 partitions, then repartition to 12.
sdf = sdf_generator(num_rows, 20001).repartition(12)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Repartition 20001 to 12")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                

In [13]:
# Baseline: generate the data directly with 40 partitions.
sdf = sdf_generator(num_rows, 40)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Base line 40")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

40


                                                                                

In [14]:
# Baseline: generate the data directly with 90 partitions.
sdf = sdf_generator(num_rows, 90)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Base line 90")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

90


                                                                                

In [15]:
# Generate with 90 partitions, then coalesce down to 40.
sdf = sdf_generator(num_rows, 90).coalesce(40)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Coalesce 90 to 40")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

40


                                                                                

In [16]:
# Generate with 90 partitions, then repartition to 40.
sdf = sdf_generator(num_rows, 90).repartition(40)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Repartition 90 to 40")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

40


                                                                                

In [17]:
# Baseline: generate the data in a single partition.
sdf = sdf_generator(num_rows, 1)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Base line 1")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

1


                                                                                

In [18]:
# Baseline: generate the data directly with 10 partitions.
sdf = sdf_generator(num_rows, 10)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Base line 10")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

10


                                                                                

In [19]:
# Baseline (repeated for comparison): generate the data directly with 12 partitions.
sdf = sdf_generator(num_rows, 12)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Base line 12")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                

In [20]:
# Generate in a single partition, then repartition up to 12.
sdf = sdf_generator(num_rows, 1).repartition(12)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Repartition 1 to 12")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                

In [21]:
# Generate with 10 partitions, then repartition up to 12.
sdf = sdf_generator(num_rows, 10).repartition(12)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Repartition 10 to 12")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                

24/12/15 03:53:08 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 813658 ms exceeds timeout 120000 ms
24/12/15 03:53:08 WARN SparkContext: Killing executors is not supported by current scheduler.
24/12/15 03:53:16 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at o

# Data Skew

In [8]:
# Skewed baseline: 15 generated partitions coalesced into 12, so some output
# partitions presumably hold more input partitions than others.
sdf = sdf_generator(num_rows, 15)
sdf = sdf.coalesce(12)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Base line skewed 12")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                

In [9]:
# Start from the skewed layout (15 coalesced to 12), then coalesce further to 8.
sdf = sdf_generator(num_rows, 15)
sdf = sdf.coalesce(12)
sdf = sdf.coalesce(8)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Coalesce for Skew 8")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

8


                                                                                

In [10]:
# Start from the skewed layout (15 coalesced to 12), then repartition to 12.
sdf = sdf_generator(num_rows, 15)
sdf = sdf.coalesce(12)
sdf = sdf.repartition(12)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Repartition for Skew 12")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                

In [11]:
# Start from the skewed layout (15 coalesced to 12), then repartition to 8.
sdf = sdf_generator(num_rows, 15)
sdf = sdf.coalesce(12)
sdf = sdf.repartition(8)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Repartition for Skew 8")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

8


                                                                                

# Filter operations become more efficient

In [12]:
# Baseline for the filter experiments: 12 partitions, no filter.
sdf = sdf_generator(num_rows, 12)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Base line 12")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                

In [13]:
# Filter on id without repartitioning first.
sdf = sdf_generator(num_rows, 12)
sdf = sdf.filter(f.col("id") < 1000)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Base line 12 with filter id")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


In [14]:
# Repartition by id into 12 partitions, then apply the same id filter.
sdf = sdf_generator(num_rows, 12)
sdf = sdf.repartition(12, "id")
sdf = sdf.filter(f.col("id") < 1000)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Repartition filter 12 id")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


In [15]:
# Filter on the first character of the id string without repartitioning first.
sdf = sdf_generator(num_rows, 12)
sdf = sdf.filter(f.col("idfirst") == "1")
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Base line 12 with filter idfirst")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                

In [16]:
# Repartition by idfirst into 12 partitions, then apply the same idfirst filter.
sdf = sdf_generator(num_rows, 12)
sdf = sdf.repartition(12, "idfirst")
sdf = sdf.filter(f.col("idfirst") == "1")
print(sdf.rdd.getNumPartitions())
# Label spelled "Repartition" so it matches the other runs in the Spark UI.
sc.setJobDescription("Repartition filter 12 idfirst")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                

In [17]:
# Filter on the last character of the id string without repartitioning first.
sdf = sdf_generator(num_rows, 12)
sdf = sdf.filter(f.col("idlast") == "1")
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Base line 12 with filter idlast")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                

In [18]:
# Repartition by idlast into 12 partitions, then apply the same idlast filter.
sdf = sdf_generator(num_rows, 12)
sdf = sdf.repartition(12, "idlast")
sdf = sdf.filter(f.col("idlast") == "1")
print(sdf.rdd.getNumPartitions())
# Label spelled "Repartition" so it matches the other runs in the Spark UI.
sc.setJobDescription("Repartition filter 12 idlast")
# The noop sink evaluates the whole DataFrame without writing any output.
sdf.write.format("noop").mode("overwrite").save()
# Clear the description with None; the string "None" would label later jobs "None".
sc.setJobDescription(None)

12


                                                                                