## DataFrame RDD Partitions

In [185]:
import os
import multiprocessing
import random
from pyspark.sql import SparkSession
from pyspark.sql.functions import spark_partition_id

In [186]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("RDD_exploration")
    .getOrCreate()
)

In [190]:
# Spark settings whose current values we want to inspect
configs_to_exam = ["spark.app.name", "spark.sql.shuffle.partitions", "spark.master"]

# Length of the longest key, used to line up the values in one column
w = max(map(len, configs_to_exam))

# Print one "<left-padded key> : <value>" line per setting
for key in configs_to_exam:
    val = spark.conf.get(key)
    print(f"{key:<{w}} : {val}")

spark.app.name               : RDD_exploration
spark.sql.shuffle.partitions : 200
spark.master                 : local[*]


In [191]:
# Number of CPUs reported by the `multiprocessing` module
cpu_total = multiprocessing.cpu_count()
print("# of CPUs of Threads available: ", cpu_total)

# of CPUs of Threads available:  12


## Load a DataFrame

The source directory has only 1 file since we have repartitioned it before writing.

In [192]:
# Relative path of the staged "devices" data
p = os.path.join("..", "S3", "staging", "dms", "abc", "devices", "devices")

df = (
    spark.read
    .format("parquet")
    .load(p)
)

# Count only the Parquet files; ignore any other entries in the directory
n_parquet_files = sum(1 for f in os.listdir(p) if f.endswith(".parquet"))
print("# of files in source path:", n_parquet_files)
print("# of partitions in DataFrame:", df.rdd.getNumPartitions())

# Add a "pid" column holding the id of the partition each row belongs to
df.withColumn("pid", spark_partition_id()).show()

# of files in source path: 1
# of partitions in DataFrame: 1
+-------------------+---+-----------+--------+-------------+-------------------+-------------------+---+
|      dms_timestamp| id|customer_fk|model_fk|serial_number|            created|           modified|pid|
+-------------------+---+-----------+--------+-------------+-------------------+-------------------+---+
|2021-08-05 15:19:54|  1|          1|       1|  862-86-8047|1970-01-15 12:18:54|1970-01-15 12:52:11|  0|
|2021-08-05 15:19:54|  2|          1|       1|  329-08-2350|1970-01-15 12:18:54|1970-01-15 12:50:41|  0|
|2021-08-05 15:19:54|  3|          1|       3|  360-73-1379|2020-01-15 16:04:02|2020-01-15 16:47:19|  0|
|2021-08-05 15:19:54|  4|          1|       3|  034-94-0243|2020-01-15 16:04:02|2020-01-15 16:57:24|  0|
|2021-08-05 15:19:54|  5|          2|       1|  688-21-1124|2020-01-15 16:10:11|2020-01-15 16:45:38|  0|
|2021-08-05 15:19:54|  6|          2|       2|  531-52-1018|2020-01-15 16:10:11|2020-01-15 16:59:34

In [193]:
# Order the rows by their "modified" timestamp
df_sorted = df.orderBy("modified")

n_partitions = df_sorted.rdd.getNumPartitions()
print("# of partitions in DataFrame:", n_partitions)

# Show the sorted rows together with the partition id each one ended up in
df_sorted.withColumn("pid", spark_partition_id()).show()

# of partitions in DataFrame: 16
+-------------------+---+-----------+--------+-------------+-------------------+-------------------+---+
|      dms_timestamp| id|customer_fk|model_fk|serial_number|            created|           modified|pid|
+-------------------+---+-----------+--------+-------------+-------------------+-------------------+---+
|2021-08-05 15:19:54|  2|          1|       1|  329-08-2350|1970-01-15 12:18:54|1970-01-15 12:50:41|  0|
|2021-08-05 15:19:54|  1|          1|       1|  862-86-8047|1970-01-15 12:18:54|1970-01-15 12:52:11|  1|
|2021-08-05 15:19:54|  5|          2|       1|  688-21-1124|2020-01-15 16:10:11|2020-01-15 16:45:38|  2|
|2021-08-05 15:19:54|  3|          1|       3|  360-73-1379|2020-01-15 16:04:02|2020-01-15 16:47:19|  3|
|2021-08-05 15:19:54|  4|          1|       3|  034-94-0243|2020-01-15 16:04:02|2020-01-15 16:57:24|  4|
|2021-08-05 15:19:54|  6|          2|       2|  531-52-1018|2020-01-15 16:10:11|2020-01-15 16:59:34|  5|
|2021-08-05 15:19:54| 

In [194]:
# Coalesce the sorted frame to 4 partitions and show each row's partition id
(
    df_sorted
    .coalesce(4)
    .withColumn("pid", spark_partition_id())
    .show()
)

+-------------------+---+-----------+--------+-------------+-------------------+-------------------+---+
|      dms_timestamp| id|customer_fk|model_fk|serial_number|            created|           modified|pid|
+-------------------+---+-----------+--------+-------------+-------------------+-------------------+---+
|2021-08-05 15:19:54|  2|          1|       1|  329-08-2350|1970-01-15 12:18:54|1970-01-15 12:50:41|  0|
|2021-08-05 15:19:54|  1|          1|       1|  862-86-8047|1970-01-15 12:18:54|1970-01-15 12:52:11|  0|
|2021-08-05 15:19:54|  5|          2|       1|  688-21-1124|2020-01-15 16:10:11|2020-01-15 16:45:38|  0|
|2021-08-05 15:19:54|  3|          1|       3|  360-73-1379|2020-01-15 16:04:02|2020-01-15 16:47:19|  0|
|2021-08-05 15:19:54|  4|          1|       3|  034-94-0243|2020-01-15 16:04:02|2020-01-15 16:57:24|  1|
|2021-08-05 15:19:54|  6|          2|       2|  531-52-1018|2020-01-15 16:10:11|2020-01-15 16:59:34|  1|
|2021-08-05 15:19:54|  7|          3|       4|  225-91-

## Generate a DataFrame

Instead of loading a DataFrame from a single Parquet file, let's see what happens when we generate one from made-up data.

In [195]:
# Draw from a private, seeded generator so that the sample values (and any
# ordering derived from them) are identical after every kernel restart,
# without touching the state of the global `random` module.
rng = random.Random(42)

# Define some random data: 1000 rows, each with an id and a sample in [0, 1)
data = [{"id": i, "sample": rng.random()} for i in range(1000)]

# Define schema with DDL syntax
schema = "id INT, sample FLOAT"

df = spark.createDataFrame(data, schema=schema)

# Report how many partitions Spark created for the in-memory data and show
# which partition each of the first rows belongs to
print("# of partitions in DataFrame:", df.rdd.getNumPartitions())
df.withColumn("pid", spark_partition_id()).show()

# of partitions in DataFrame: 12
+---+-----------+---+
| id|     sample|pid|
+---+-----------+---+
|  0|  0.3667918|  0|
|  1|  0.8227412|  0|
|  2|  0.8557527|  0|
|  3| 0.09284008|  0|
|  4| 0.43896046|  0|
|  5|0.029582817|  0|
|  6| 0.75651675|  0|
|  7|0.037324116|  0|
|  8| 0.07729287|  0|
|  9|  0.6893575|  0|
| 10|   0.695219|  0|
| 11|  0.4964869|  0|
| 12| 0.29565847|  0|
| 13|  0.7083751|  0|
| 14| 0.34966978|  0|
| 15|  0.8800423|  0|
| 16| 0.45173523|  0|
| 17|  0.8067894|  0|
| 18|  0.6379171|  0|
| 19| 0.17989096|  0|
+---+-----------+---+
only showing top 20 rows



### Notice that...

We get as many RDD partitions as there are CPU threads (12 here), because Spark is running with `spark.master` set to `local[*]`.

Let's shuffle the data and see what happens.

In [196]:
# Order the generated rows by their random "sample" value
df_sorted = df.orderBy("sample")

n_partitions = df_sorted.rdd.getNumPartitions()
print("# of partitions in DataFrame:", n_partitions)

# Show the sorted rows together with the partition id each one ended up in
df_sorted.withColumn("pid", spark_partition_id()).show()

# of partitions in DataFrame: 200
+---+------------+---+
| id|      sample|pid|
+---+------------+---+
|163| 5.330911E-4|  0|
|119| 8.778424E-4|  0|
|750|0.0057204547|  0|
|275| 0.008876068|  0|
|903| 0.009868193|  0|
|479| 0.009877583|  1|
|318| 0.013418788|  1|
|627| 0.014248454|  1|
|187| 0.014699633|  1|
|489| 0.015786262|  1|
| 20| 0.017281093|  2|
|342| 0.019262396|  2|
|564| 0.019438043|  2|
|717| 0.020404326|  2|
|567| 0.021412997|  2|
|963| 0.023174021|  3|
|234|  0.02571832|  3|
|853|  0.02658498|  3|
|406|  0.02842313|  3|
|209| 0.029365541|  3|
+---+------------+---+
only showing top 20 rows



### Notice that...

After shuffling, we get as many RDD partitions as `spark.sql.shuffle.partitions` specifies (200 here). In the previous example, where we loaded a file, the number of partitions after sorting (16) was instead limited by the small number of rows.

Let's change this setting and perform the sort again.

In [197]:
# Lower the number of partitions Spark SQL produces when it shuffles data
spark.conf.set(key="spark.sql.shuffle.partitions", value=24)

In [198]:
# Sort by "sample" again, this time keeping the partition id as a "pid" column
sorted_by_sample = df.orderBy("sample")
df_sorted = sorted_by_sample.withColumn("pid", spark_partition_id())

n_partitions = df_sorted.rdd.getNumPartitions()
print("# of partitions in DataFrame:", n_partitions)
df_sorted.show()

# of partitions in DataFrame: 24
+---+------------+---+
| id|      sample|pid|
+---+------------+---+
|163| 5.330911E-4|  0|
|119| 8.778424E-4|  0|
|750|0.0057204547|  0|
|275| 0.008876068|  0|
|903| 0.009868193|  0|
|479| 0.009877583|  0|
|318| 0.013418788|  0|
|627| 0.014248454|  0|
|187| 0.014699633|  0|
|489| 0.015786262|  0|
| 20| 0.017281093|  0|
|342| 0.019262396|  0|
|564| 0.019438043|  0|
|717| 0.020404326|  0|
|567| 0.021412997|  0|
|963| 0.023174021|  0|
|234|  0.02571832|  0|
|853|  0.02658498|  0|
|406|  0.02842313|  0|
|209| 0.029365541|  0|
+---+------------+---+
only showing top 20 rows



### How are they distributed?

In [199]:
# Register the sorted frame as a temporary view so it can be queried with SQL
df_sorted.createOrReplaceTempView(name="sorted")

In [200]:
# Number of rows per partition id, computed over the "sorted" view
partition_count_query = """
SELECT 
  pid, 
  count(*) as n_elems 
FROM sorted 
GROUP BY pid
"""

# Bring the (small) aggregated result back to the driver as a list of rows
counted = spark.sql(partition_count_query).collect()

In [201]:
# (partition id, row count) pairs, ordered by partition id
sorted((row.pid, row.n_elems) for row in counted)

[(0, 42),
 (1, 42),
 (2, 41),
 (3, 42),
 (4, 42),
 (5, 41),
 (6, 42),
 (7, 42),
 (8, 41),
 (9, 42),
 (10, 42),
 (11, 42),
 (12, 41),
 (13, 42),
 (14, 41),
 (15, 42),
 (16, 42),
 (17, 41),
 (18, 42),
 (19, 42),
 (20, 41),
 (21, 42),
 (22, 42),
 (23, 41)]