In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) a Hive-enabled session on YARN with a fixed, small executor
# footprint: 2 executors, 4 cores and 512MB each, dynamic allocation disabled.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .enableHiveSupport()
    .appName("Spark dataSkew WindowFunc")
    .master("yarn")
    .config('spark.executor.instances', '2')
    .config('spark.executor.memory', '512MB')
    .config('spark.executor.cores', '4')
    .config('spark.dynamicAllocation.enabled', 'False')
    .getOrCreate()
)

In [84]:
# Intentionally not stopping the session here: this cell sits directly after
# session creation, so running the notebook top to bottom would shut Spark down
# before any of the following cells execute. The session is stopped once, in
# the final cell of the notebook.

In [4]:
# Display the application id of this session so the run can be looked up in the cluster UI.
spark.sparkContext.applicationId

'application_1745651200635_12486'

In [5]:
# Disable Adaptive Query Execution (AQE) and AQE shuffle-partition coalescing

# Both adaptive-execution switches are set to False for the rest of the notebook.
for aqe_setting in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
):
    spark.conf.set(aqe_setting, False)


In [6]:
# Load the employee records CSV using an explicit DDL schema; the file's first
# line is treated as a header.
schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"

emp = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", True)
    .load("Datasets/employee_records_skew.csv")
)

In [89]:
# Preview the raw employee data (show() prints the first 20 rows by default).
emp.show()

+-----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+
| first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|
+-----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+
|      Jacob|     Stark|         Fine artist|1976-04-25|jasonortiz@exampl...|  224-695-9516x02171|358889.0|            1|
|    Marissa|     Crane|Intelligence analyst|2000-06-24|johnsontroy@examp...|        277-928-0029|786608.0|            3|
|     Andrea|     Davis|     Physiotherapist|1999-06-17| ihowell@example.org|          9503082950|428991.0|            3|
|       John|     Tapia|Lecturer, further...|2001-09-23|russobarbara@exam...|    001-679-487-9525|241574.0|            9|
|      Colin|    Holmes|     Psychotherapist|1965-06-29|fsimmons@example.org|          3232202899|320260.0|            4|
|       Eric|      Beck|

In [7]:
# Inject skew: append extra copies of one department's rows to the original
# data so that this department becomes much larger than the others.
SKEW_DEPARTMENT_ID = 2   # department whose rows are duplicated
SKEW_EXTRA_COPIES = 9    # how many additional copies are appended to `emp`

skewed_rows = emp.where(f"department_id == {SKEW_DEPARTMENT_ID}")

# Always start again from `emp` so re-running this cell gives the same result.
emp_df = emp
for _copy in range(SKEW_EXTRA_COPIES):
    # union() keeps duplicates (UNION ALL semantics), which is what builds the skew.
    emp_df = emp_df.union(skewed_rows)

In [8]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number, avg, sum, min, max

# Per-department window ordered by date of birth. Because an ordering is given
# and no frame is specified, Spark uses its default growing frame (unbounded
# preceding up to the current row), so the aggregate below is a running maximum
# in dob order rather than a single value per department.
window_spec = (
    Window
    .partitionBy("department_id")
    .orderBy("dob")
)

# `max` is pyspark.sql.functions.max (imported in this cell), not the builtin.
running_max_salary = max("salary").over(window_spec)
df_with_rank = emp_df.withColumn("dept_max_sal", running_max_salary)


In [9]:
# Run the whole plan through the "noop" sink, which executes the query without
# storing the result. Author's note: this takes about 4 minutes.
(
    df_with_rank.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [9]:
# Inspect the windowed result for department 2 only, oldest dob first.
(
    df_with_rank
    .where("department_id == 2")
    .orderBy("dob")
    .show()
)

+----------+---------+--------------------+----------+--------------------+--------------------+--------+-------------+------------+
|first_name|last_name|           job_title|       dob|               email|               phone|  salary|department_id|dept_max_sal|
+----------+---------+--------------------+----------+--------------------+--------------------+--------+-------------+------------+
|      Jane|    Reyes|      Air cabin crew|1962-12-17| hdunlap@example.net|        797.558.4831|658357.0|            2|    928613.0|
|    Tyrone|    Wells|Producer, televis...|1962-12-17|brandonkelly@exam...|   530.698.2971x5231| 76400.0|            2|    928613.0|
|       Roy|   Vargas|             Curator|1962-12-17|swansonmaria@exam...| +1-669-614-3946x041|223256.0|            2|    928613.0|
|   Cameron|    Olson|Educational psych...|1962-12-17|lopezjames@exampl...|   (209)978-1855x159| 87142.0|            2|    928613.0|
|      Adam|Hernandez|   Social researcher|1962-12-17|stacey60@exampl

In [None]:
# Check the Spark UI -> SQL -> Job -> Stage -> Task details, then:
#   - observe the data skew issues
#   - observe the data spill issues

In [63]:
# Check the partition count
from pyspark.sql.functions import spark_partition_id, count, lit, desc

# Tag every row of emp_df with the id of the partition it lives in, then count
# the rows per partition.
part_df = (
    emp_df
    .withColumn("partition_num", spark_partition_id())
    .groupBy("partition_num")
    .agg(count(lit(1)).alias("count"))
)

# Print the counts with the highest partition number first.
part_df.orderBy(desc("partition_num")).show()

+-------------+------+
|partition_num| count|
+-------------+------+
|           10|  8713|
|            9| 60703|
|            8| 65250|
|            7| 65699|
|            6|217684|
|            5|130421|
|            4|130312|
|            3|130393|
|            2|130400|
|            1|130384|
|            0|130406|
+-------------+------+



In [64]:
# Verify Employee data based on department_id
from pyspark.sql.functions import count, lit, desc, col

# Row count per department in the skewed dataset.
(
    emp_df
    .groupBy("department_id")
    .agg(count(lit(1)))
    .show()
)


+-------------+--------+
|department_id|count(1)|
+-------------+--------+
|            4|  100214|
|            8|  100417|
|            7|   99805|
|           10|   99780|
|            1|   99451|
|            6|   99706|
|            3|  100248|
|            5|  200420|
|            2|  200310|
|            9|  100014|
+-------------+--------+



In [10]:
from pyspark.sql.functions import concat, lit, rand, floor

#salting
import random
from pyspark.sql.functions import udf

# Number of distinct salt values. Kept at 48 so it matches the
# spark.sql.shuffle.partitions value set for the salted run.
SALT_BUCKETS = 48


def _random_salt():
    """Return a random salt in the range [0, SALT_BUCKETS) as a string."""
    # random.randint is inclusive on both ends, hence the "- 1"; without it the
    # function would produce SALT_BUCKETS + 1 distinct salts.
    # The value is converted explicitly because the UDF's default return type
    # is string.
    return str(random.randint(0, SALT_BUCKETS - 1))


# UDF that yields a fresh random salt for every row. It is marked
# non-deterministic so Spark does not treat repeated evaluations as
# interchangeable when optimising the plan.
salt_udf = udf(_random_salt).asNondeterministic()


In [11]:
# Use 48 shuffle partitions for the salted run.
# NOTE(review): intended to equal the number of distinct salt values produced
# by salt_udf - confirm the two stay in sync if either one is changed.

spark.conf.set("spark.sql.shuffle.partitions", 48)

In [12]:
# Add a salted key of the form "<department_id>_<salt>" so that the rows of a
# single department are spread over many window partitions.
# The salt is applied to emp_df (the skewed dataset used by the unsalted
# baseline) rather than the raw `emp`, so both runs process the same rows.

salted_emp = emp_df.withColumn("salted_dept_id", concat("department_id", lit("_"), salt_udf()))


In [102]:
# Verify Employee data based on department_id
from pyspark.sql.functions import count, lit, desc, col

# Row count per salted key, largest groups first, to check how evenly the salt
# spreads the rows.
(
    salted_emp
    .groupBy("salted_dept_id")
    .agg(count(lit(1)).alias("cnt"))
    .orderBy(desc("cnt"))
    .show()
)

+--------------+----+
|salted_dept_id| cnt|
+--------------+----+
|          5_18|4254|
|          2_18|4238|
|          5_28|4237|
|          2_20|4201|
|           5_5|4189|
|          5_10|4187|
|          5_47|4185|
|           5_1|4178|
|          2_31|4174|
|          5_17|4173|
|           5_3|4170|
|          5_27|4169|
|          5_14|4169|
|           5_8|4164|
|          5_44|4160|
|          2_45|4155|
|          5_29|4153|
|          2_30|4148|
|          2_12|4147|
|          5_30|4144|
+--------------+----+
only showing top 20 rows



In [13]:
# Stage 1: evaluate the window per salted key instead of per department, so
# each window partition holds only a slice of a department's rows.
window_salted = (
    Window
    .partitionBy("salted_dept_id")
    .orderBy("dob")
)

# Running maximum of salary in dob order within each salted key.
salted_running_max = max("salary").over(window_salted)
df_salted_max_sal = salted_emp.withColumn("dept_max_sal_salt", salted_running_max)

In [14]:
# Stage 2: combine the per-salt partial maxima into the per-department value by
# running a second window (not a group-by) over the unsalted department_id.
# NOTE(review): this window partitions on department_id again, so the skewed
# department presumably ends up in a single task once more - confirm in the
# Spark UI that this stage really avoids the skew.
wind = (
    Window
    .partitionBy("department_id")
    .orderBy("dob")
)

combined_max = max("dept_max_sal_salt").over(wind)
df_final_wdw = df_salted_max_sal.withColumn("dept_max_sal", combined_max)


In [15]:
# Run the salted plan through the "noop" sink, which executes the query without
# storing the result.
(
    df_final_wdw.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [30]:
# Inspect the salt column and both max columns for department 2, oldest dob
# first; the wide text columns are dropped to keep the table readable.
(
    df_final_wdw
    .drop("job_title", "email", "phone")
    .where("department_id == 2")
    .orderBy("dob")
    .show()
)

+----------+---------+----------+--------+-------------+--------------+-----------------+------------+
|first_name|last_name|       dob|  salary|department_id|salted_dept_id|dept_max_sal_salt|dept_max_sal|
+----------+---------+----------+--------+-------------+--------------+-----------------+------------+
|    Tyrone|    Wells|1962-12-17| 76400.0|            2|          2_90|          87142.0|    928613.0|
|    Javier|     Hall|1962-12-17| 22499.0|            2|          2_71|          22499.0|    928613.0|
|   Cameron|    Olson|1962-12-17| 87142.0|            2|          2_90|          87142.0|    928613.0|
|      Adam|     Webb|1962-12-17|631124.0|            2|          2_15|         631124.0|    928613.0|
|    Javier|     Hall|1962-12-17| 22499.0|            2|          2_47|          22499.0|    928613.0|
|      Adam|Hernandez|1962-12-17|144252.0|            2|          2_15|         631124.0|    928613.0|
|      Dawn|  Mcbride|1962-12-17|141670.0|            2|          2_28|  

In [16]:
# Stop the SparkSession (and its underlying SparkContext) now that the demo is finished.

spark.stop()

Conclusion:
1. Understand how data skew arises in aggregations and window functions.
2. Understand how skewed tasks spill data to memory and disk.
3. Learn how to prepare a salted key.
4. Learn how to reduce spills using the salting approach.
5. AQE's built-in skew handling targets skewed joins; it does not rebalance skewed aggregations or window functions.
