In [1]:
from pyspark.sql import SparkSession

spark = SparkSession. \
    builder. \
    config("spark.ui.port", "0"). \
    enableHiveSupport(). \
    appName("Spark dataSkew Window Rank"). \
    master("yarn"). \
    config('spark.executor.instances','5'). \
    config('spark.executor.memory','512MB'). \
    config('spark.executor.cores','4'). \
    config('spark.dynamicAllocation.enabled','False'). \
    getOrCreate()

In [49]:
spark.stop()

In [12]:
spark.sparkContext.applicationId

'application_1745651200635_18473'

In [2]:
 #Disable AQE and AdvisoryPartitionSize

spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)


In [3]:
# Read Employee Skew data
schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"

emp = spark.read.format("csv").schema(schema).option("header", True).load("Datasets/employee_records_skew.csv")

In [5]:
emp.show()

+-----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+
| first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|
+-----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+
|      Jacob|     Stark|         Fine artist|1976-04-25|jasonortiz@exampl...|  224-695-9516x02171|358889.0|            1|
|    Marissa|     Crane|Intelligence analyst|2000-06-24|johnsontroy@examp...|        277-928-0029|786608.0|            3|
|     Andrea|     Davis|     Physiotherapist|1999-06-17| ihowell@example.org|          9503082950|428991.0|            3|
|       John|     Tapia|Lecturer, further...|2001-09-23|russobarbara@exam...|    001-679-487-9525|241574.0|            9|
|      Colin|    Holmes|     Psychotherapist|1965-06-29|fsimmons@example.org|          3232202899|320260.0|            4|
|       Eric|      Beck|

In [4]:
#adding the skew for department_id=2
emp1 = emp.drop("job_title","email","phone")

emp_df = emp1 #.union(emp1.where("department_id == 2")).union(emp1.where("department_id == 2")).union(emp1.where("department_id == 2")) \
             #.union(emp1.where("department_id == 2")).union(emp1.where("department_id == 2")).union(emp1.where("department_id == 2"))
            # .union(emp.where("department_id == 2")).union(emp.where("department_id == 2")).union(emp.where("department_id == 2"))

In [9]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number, avg, sum, min, max, rank, desc

# Window function: rank events by date per user
window_spec = Window.partitionBy("department_id").orderBy(desc("salary"))

df_with_rank = emp_df.withColumn("rank", rank().over(window_spec))


In [11]:
#write to noop - takes 8 Sec

df_with_rank.write.format("noop").mode("overwrite").save()

In [15]:
#Please check the Spark Ui -> SQL -> Job -> Stage -> Task details
#Observe the dataSkew Issues
#Observe the DataSpill Issues

In [10]:
# Check the partition count
from pyspark.sql.functions import spark_partition_id, count, lit, desc

part_df = emp_df.withColumn("partition_num", spark_partition_id()).groupBy("partition_num").agg(count(lit(1)).alias("count"))

part_df.orderBy(desc("partition_num")).show()

+-------------+------+
|partition_num| count|
+-------------+------+
|           10|  8713|
|            9| 60703|
|            8| 65250|
|            7| 65699|
|            6|217684|
|            5|130421|
|            4|130312|
|            3|130393|
|            2|130400|
|            1|130384|
|            0|130406|
+-------------+------+



In [14]:
# Verify Employee data based on department_id
from pyspark.sql.functions import count, lit, desc, col

emp_df.groupBy("department_id").agg(count(lit(1))).show()


+-------------+--------+
|department_id|count(1)|
+-------------+--------+
|            1|   99451|
|            6|   99706|
|            3|  100248|
|            5|  200420|
|            9|  100014|
|            4|  100214|
|            8|  100417|
|            7|   99805|
|           10|   99780|
|            2|  200310|
+-------------+--------+



In [None]:
rank()

In [22]:
# Step 1: Count how many employees are at each salary level in each department

from pyspark.sql.functions import count, lit, desc, col,desc, sum, count, coalesce

salary_counts = emp_df.groupBy("department_id", "salary") \
                  .agg(count("*").alias("employee_count"))



In [23]:
# Step 2: Compute rank by summing employee counts of higher salaries (descending order)
window_desc = Window.partitionBy("department_id") \
                    .orderBy(col("salary").desc()) \
                    .rowsBetween(Window.unboundedPreceding, -1)  # exclude current row

salary_ranks = salary_counts.withColumn(
    "rank", coalesce(sum(coalesce("employee_count")).over(window_desc),lit(0)) + 1
)

salary_ranks.show()

+-------------+--------+--------------+----+
|department_id|  salary|employee_count|rank|
+-------------+--------+--------------+----+
|            1|999996.0|             1|   1|
|            1|999979.0|             1|   2|
|            1|999959.0|             1|   3|
|            1|999922.0|             1|   4|
|            1|999917.0|             1|   5|
|            1|999914.0|             1|   6|
|            1|999911.0|             1|   7|
|            1|999901.0|             2|   8|
|            1|999896.0|             1|  10|
|            1|999894.0|             1|  11|
|            1|999885.0|             1|  12|
|            1|999879.0|             1|  13|
|            1|999863.0|             1|  14|
|            1|999849.0|             1|  15|
|            1|999846.0|             1|  16|
|            1|999844.0|             1|  17|
|            1|999843.0|             2|  18|
|            1|999829.0|             1|  20|
|            1|999815.0|             1|  21|
|         

In [24]:
# Step 3: Join back to original data to assign the rank

final_df = emp_df.join(salary_ranks, on=["department_id", "salary"], how="left") \
             .select("*") \
             .orderBy("department_id", "rank")

In [29]:
#write to noop - takes 8 Sec

final_df.write.format("noop").mode("overwrite").save()

In [32]:
#some sample data with row_number

final_df.where("department_id == 2").where("rank < 10").show()

+-------------+--------+----------+---------+----------+--------------+----+
|department_id|  salary|first_name|last_name|       dob|employee_count|rank|
+-------------+--------+----------+---------+----------+--------------+----+
|            2|999997.0|   Richard|   Foster|1990-03-26|             2|   1|
|            2|999997.0|   Richard|   Foster|1990-03-26|             2|   1|
|            2|999987.0|   Cynthia|    Lewis|1994-05-15|             2|   3|
|            2|999987.0|   Cynthia|    Lewis|1994-05-15|             2|   3|
|            2|999964.0|    Melody| Reynolds|1987-09-11|             2|   5|
|            2|999964.0|    Melody| Reynolds|1987-09-11|             2|   5|
|            2|999903.0|    Thomas|   Medina|1997-08-10|             2|   7|
|            2|999903.0|    Thomas|   Medina|1997-08-10|             2|   7|
|            2|999885.0|    Andrea|   Gordon|1998-06-07|             2|   9|
|            2|999885.0|    Andrea|   Gordon|1998-06-07|             2|   9|

In [21]:
#some sample data with row_number

df_with_rank.where("department_id == 2").where("rank < 10").show()

+----------+---------+----------+--------+-------------+----+
|first_name|last_name|       dob|  salary|department_id|rank|
+----------+---------+----------+--------+-------------+----+
|   Richard|   Foster|1990-03-26|999997.0|            2|   1|
|   Richard|   Foster|1990-03-26|999997.0|            2|   1|
|   Cynthia|    Lewis|1994-05-15|999987.0|            2|   3|
|   Cynthia|    Lewis|1994-05-15|999987.0|            2|   3|
|    Melody| Reynolds|1987-09-11|999964.0|            2|   5|
|    Melody| Reynolds|1987-09-11|999964.0|            2|   5|
|    Thomas|   Medina|1997-08-10|999903.0|            2|   7|
|    Thomas|   Medina|1997-08-10|999903.0|            2|   7|
|    Andrea|   Gordon|1998-06-07|999885.0|            2|   9|
|    Andrea|   Gordon|1998-06-07|999885.0|            2|   9|
+----------+---------+----------+--------+-------------+----+



✅ Summary:
With this approch we were able to solve this problem with 60% less resources and improved Job execution time by 80%
If you try with larger datasets, you will definatly see performance improvements
