In [1]:
# Spark Session
from pyspark.sql import SparkSession

# Base builder: application name and the master URL this notebook connects to.
_session_builder = (
    SparkSession.builder
    .appName("Optimizing Skewness and Spillage")
    .master("spark://197e20b418a6:7077")
)

# Resource settings, applied to the builder one by one in this order.
for _setting, _setting_value in [
    ("spark.cores.max", 8),
    ("spark.executor.cores", 4),
    ("spark.executor.memory", "512M"),
]:
    _session_builder = _session_builder.config(_setting, _setting_value)

spark = _session_builder.getOrCreate()

# Bare expression so the notebook displays the session object.
spark

In [2]:
# Disable AQE and Broadcast join

# Each (key, value) pair is written to the session's runtime configuration as-is.
for _conf_key, _conf_value in [
    ("spark.sql.adaptive.enabled", False),
    ("spark.sql.adaptive.coalescePartitions.enabled", False),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
]:
    spark.conf.set(_conf_key, _conf_value)

In [3]:
# Read Employee data
# The schema is a DDL string; it is assembled from one "name type" entry per column.
_schema = ", ".join([
    "first_name string",
    "last_name string",
    "job_title string",
    "dob string",
    "email string",
    "phone string",
    "salary double",
    "department_id int",
])

emp = (
    spark.read
    .format("csv")
    .schema(_schema)
    .option("header", True)
    .load("/data/input/employee_records_skewed.csv")
)

In [4]:
# Read DEPT CSV data
# The schema is a DDL string; it is assembled from one "name type" entry per column.
_dept_schema = ", ".join([
    "department_id int",
    "department_name string",
    "description string",
    "city string",
    "state string",
    "country string",
])

dept = (
    spark.read
    .format("csv")
    .schema(_dept_schema)
    .option("header", True)
    .load("/data/input/department_data.csv")
)

In [5]:
# Join Datasets

# Left outer join of employees to departments on equal department_id.
_dept_match = emp["department_id"] == dept["department_id"]
df_joined = emp.join(dept, on=_dept_match, how="left_outer")

In [6]:
# Write df_joined with the "noop" format in overwrite mode.
(
    df_joined.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [None]:
# Explain Plan: print the plan Spark produces for df_joined (the emp/dept left outer join)

df_joined.explain()

In [7]:
# Check the partition details to understand distribution
from pyspark.sql.functions import spark_partition_id, count, lit

# Step 1: label every joined row with the id of the partition that holds it.
_joined_with_partition = df_joined.withColumn("partition_num", spark_partition_id())

# Step 2: count the rows carrying each partition id.
_rows_in_partition = count(lit(1)).alias("count")
part_df = _joined_with_partition.groupBy("partition_num").agg(_rows_in_partition)

part_df.show()

+-------------+------+
|partition_num| count|
+-------------+------+
|          103| 19860|
|          122|420474|
|           43| 19899|
|          107| 19928|
|           49| 20006|
|           51| 19829|
|          102| 20099|
|           66| 20172|
|          174| 20229|
|           89|419504|
+-------------+------+



In [8]:
# Verify Employee data based on department_id
from pyspark.sql.functions import count, lit, desc, col

# Number of employee rows per department_id.
_emp_by_department = emp.groupBy("department_id")
_emp_by_department.agg(count(lit(1))).show()

+-------------+--------+
|department_id|count(1)|
+-------------+--------+
|            1|   19899|
|            6|   20006|
|            3|   19829|
|            5|   20172|
|            9|  419504|
|            4|   20099|
|            8|   19860|
|            7|   19928|
|           10|  420474|
|            2|   20229|
+-------------+--------+



In [27]:
# Set shuffle partitions to a lesser number - 32

spark.conf.set("spark.sql.shuffle.partitions", 32)

In [28]:
# Let's prepare the salt
import random
from pyspark.sql.functions import udf

# Number of distinct salt values. The employee-side UDF and the department-side
# salt_df must cover exactly the same set of values (0 .. SALT_BUCKETS - 1);
# any salt produced on one side but missing on the other can never match.
SALT_BUCKETS = 32

# UDF to return a random number every time and add to Employee as salt
@udf
def salt_udf():
    """Return a random salt in the range 0 .. SALT_BUCKETS - 1.

    random.randrange excludes its upper bound, so the values line up with
    spark.range(0, SALT_BUCKETS) below. random.randint(0, 32) includes 32,
    which has no row in salt_df; employee rows salted with 32 would find no
    department in the salted left outer join.
    """
    return random.randrange(SALT_BUCKETS)

# The UDF returns a different value on every call, so tell Spark it is
# non-deterministic; otherwise the optimizer may drop or repeat invocations.
salt_udf = salt_udf.asNondeterministic()

# Salt Data Frame to add to department
salt_df = spark.range(0, SALT_BUCKETS)
salt_df.show()


+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+
only showing top 20 rows



In [29]:
# Salted Employee
from pyspark.sql.functions import lit, concat

# Salted key layout: "<department_id>_<salt>", where the salt comes from salt_udf().
_emp_salted_key = concat("department_id", lit("_"), salt_udf())
salted_emp = emp.withColumn("salted_dept_id", _emp_salted_key)

salted_emp.show()

+----------+---------+--------------------+----------+--------------------+--------------------+--------+-------------+--------------+
|first_name|last_name|           job_title|       dob|               email|               phone|  salary|department_id|salted_dept_id|
+----------+---------+--------------------+----------+--------------------+--------------------+--------+-------------+--------------+
|  Samantha|    Brown|Diagnostic radiog...|1966-06-11| jwatson@example.com|       (428)806-5154|439679.0|            3|           3_5|
|    Justin|Castaneda|Human resources o...|1996-11-11|  sdavis@example.org|    001-581-642-9621| 97388.0|            4|          4_23|
|      Carl| Peterson|         Proofreader|1984-11-23|andrew20@example.net|   241-871-9102x3835|287728.0|            1|           1_5|
| Catherine|     Lane|    Location manager|1966-06-21|elizabethalexande...|   470.866.4415x0739|174151.0|            3|          3_25|
|     Aaron|  Delgado|Teacher, secondar...|1972-10-11|u

In [30]:
# Salted Department

# Pair every department row with every row of salt_df, then build the
# "<department_id>_<id>" key from the department id and the salt id.
_dept_times_salt = dept.join(salt_df, how="cross")
_dept_salted_key = concat("department_id", lit("_"), "id")
salted_dept = _dept_times_salt.withColumn("salted_dept_id", _dept_salted_key)

# Spot-check the rows generated for department 9.
salted_dept.where("department_id = 9").show()

+-------------+--------------------+--------------------+-----------+-----+-------+---+--------------+
|department_id|     department_name|         description|       city|state|country| id|salted_dept_id|
+-------------+--------------------+--------------------+-----------+-----+-------+---+--------------+
|            9|Mcmahon, Terrell ...|De-engineered hig...|Marychester|   MN|  Italy|  0|           9_0|
|            9|Mcmahon, Terrell ...|De-engineered hig...|Marychester|   MN|  Italy|  1|           9_1|
|            9|Mcmahon, Terrell ...|De-engineered hig...|Marychester|   MN|  Italy|  2|           9_2|
|            9|Mcmahon, Terrell ...|De-engineered hig...|Marychester|   MN|  Italy|  3|           9_3|
|            9|Mcmahon, Terrell ...|De-engineered hig...|Marychester|   MN|  Italy|  4|           9_4|
|            9|Mcmahon, Terrell ...|De-engineered hig...|Marychester|   MN|  Italy|  5|           9_5|
|            9|Mcmahon, Terrell ...|De-engineered hig...|Marychester|   M

In [31]:
# Let's make the salted join now: match on salted_dept_id instead of department_id
_salted_match = salted_emp["salted_dept_id"] == salted_dept["salted_dept_id"]
salted_joined_df = salted_emp.join(salted_dept, on=_salted_match, how="left_outer")


In [33]:
# Write salted_joined_df with the "noop" format in overwrite mode.
(
    salted_joined_df.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [32]:
# Check the partition details to understand distribution
from pyspark.sql.functions import spark_partition_id, count

# Step 1: label every row of the salted join with the id of the partition that holds it.
_salted_with_partition = salted_joined_df.withColumn("partition_num", spark_partition_id())

# Step 2: count the rows carrying each partition id.
_salted_rows_in_partition = count(lit(1)).alias("count")
part_df = _salted_with_partition.groupBy("partition_num").agg(_salted_rows_in_partition)

part_df.show()

+-------------+-----+
|partition_num|count|
+-------------+-----+
|           18|30975|
|           12|28636|
|           10|17942|
|           27|58641|
|            1|28039|
|            3|31642|
|           20|29552|
|           29| 4860|
|           13|20105|
|           14|18831|
|           23|81155|
|            6|18818|
|            9|55094|
|           11|44418|
|           26| 3006|
|            7|30275|
|           30| 4220|
|           28|16504|
|            0|30887|
|            8|30997|
+-------------+-----+
only showing top 20 rows

