In [49]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's SparkSession: YARN master, Hive support,
# two executors with 4 cores / 512MB each, dynamic allocation switched off.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .enableHiveSupport()
    .appName("Spark dataSkew AggregateFunc")
    .master("yarn")
    .config("spark.executor.instances", "2")
    .config("spark.executor.memory", "512MB")
    .config("spark.executor.cores", "4")
    .config("spark.dynamicAllocation.enabled", "False")
    .getOrCreate()
)

In [48]:
# NOTE: the `spark.stop()` call that used to be in this cell has been removed.
# The cell sits directly after the one that creates the session, so a
# top-to-bottom run (Restart Kernel -> Run All) would shut the session down
# before any of the following cells could use it. The session is stopped once,
# in the last cell of the notebook.

In [50]:
# Display the application id of the SparkContext behind this session
# (the value is rendered as the cell's output).
spark.sparkContext.applicationId

'application_1745651200635_11868'

In [51]:
 #Disable AQE and Broadcast join

# Each setting is applied individually; the values keep their original
# Python types (bool / int) when handed to spark.conf.set.
for _conf_key, _conf_value in (
    ("spark.sql.adaptive.enabled", False),
    ("spark.sql.adaptive.coalescePartitions.enabled", False),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
):
    spark.conf.set(_conf_key, _conf_value)


In [52]:
# Load the employee CSV using an explicit schema and treating the first line
# of the file as a header row.
schema = ", ".join([
    "first_name string",
    "last_name string",
    "job_title string",
    "dob string",
    "email string",
    "phone string",
    "salary double",
    "department_id int",
])

emp = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", True)
    .load("Datasets/employee_records_skew.csv")
)


In [53]:
# Print a preview of the loaded employee rows to sanity-check the schema.
emp.show()

+-----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+
| first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|
+-----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+
|      Jacob|     Stark|         Fine artist|1976-04-25|jasonortiz@exampl...|  224-695-9516x02171|358889.0|            1|
|    Marissa|     Crane|Intelligence analyst|2000-06-24|johnsontroy@examp...|        277-928-0029|786608.0|            3|
|     Andrea|     Davis|     Physiotherapist|1999-06-17| ihowell@example.org|          9503082950|428991.0|            3|
|       John|     Tapia|Lecturer, further...|2001-09-23|russobarbara@exam...|    001-679-487-9525|241574.0|            9|
|      Colin|    Holmes|     Psychotherapist|1965-06-29|fsimmons@example.org|          3232202899|320260.0|            4|
|       Eric|      Beck|

In [54]:
# No extra skew is injected in this cell: emp_df is just another name for the
# DataFrame loaded above. The per-department counts shown further down have
# departments 2 and 5 at roughly twice the size of the others, so the input
# file itself already carries the skew.

emp_df = emp

In [55]:

from pyspark.sql.functions import col, row_number, avg, sum, min, max
# NOTE: this import rebinds the names sum/min/max (Python built-ins) to the
# Spark column functions for every cell that follows.

# Baseline (unsalted) aggregation: highest salary within each department.
df_max_sal_by_dept = (
    emp_df
    .groupBy("department_id")
    .agg(max("salary").alias("max_salary"))
)

In [59]:
# Write the baseline aggregation to the "noop" format so the job actually runs
# (the original author noted this takes about 4 minutes).
(
    df_max_sal_by_dept.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [60]:
# Count the rows of emp_df that land in each Spark partition and list the
# partitions from the highest id down.
from pyspark.sql.functions import spark_partition_id, count, lit, desc

part_df = (
    emp_df
    .withColumn("partition_num", spark_partition_id())
    .groupBy("partition_num")
    .agg(count(lit(1)).alias("count"))
)

(
    part_df
    .orderBy(desc("partition_num"))
    .show()
)

+-------------+------+
|partition_num| count|
+-------------+------+
|           10|  8713|
|            9| 60703|
|            8| 65250|
|            7| 65699|
|            6|217684|
|            5|130421|
|            4|130312|
|            3|130393|
|            2|130400|
|            1|130384|
|            0|130406|
+-------------+------+



In [61]:
# Row count per department_id, used to see which departments are over-represented.
from pyspark.sql.functions import count, lit, desc, col

(
    emp_df
    .groupBy("department_id")
    .agg(count(lit(1)))
    .show()
)


+-------------+--------+
|department_id|count(1)|
+-------------+--------+
|            1|   99451|
|            6|   99706|
|            3|  100248|
|            5|  200420|
|            9|  100014|
|            4|  100214|
|            8|  100417|
|            7|   99805|
|           10|   99780|
|            2|  200310|
+-------------+--------+



In [69]:
from pyspark.sql.functions import concat, lit, rand, floor

#salting
import random
from pyspark.sql.functions import udf

# Salt generator used to spread the rows of a heavy key over many sub-keys.
def salt_udf(num_salts=91):
    """Return a Column holding a random salt in ``0 .. num_salts - 1`` as a string.

    The previous implementation was a Python ``@udf`` around
    ``random.randint(0, 90)``. That version was evaluated row by row in a
    Python worker and was never declared non-deterministic, so Spark was free
    to drop or repeat invocations while optimizing the plan. This version
    builds the same kind of value from Spark's built-in ``rand`` and ``floor``
    (imported at the top of this cell), which run inside the JVM and which
    Spark already treats as non-deterministic.

    The name and the zero-argument call ``salt_udf()`` are kept so existing
    cells continue to work; it is now a plain function rather than a UDF object.

    Parameters
    ----------
    num_salts : int, default 91
        Number of distinct salt values. The default reproduces the inclusive
        0..90 range of the original ``random.randint(0, 90)``.

    Returns
    -------
    pyspark.sql.Column
        String column, matching the string result of the original ``@udf``.
    """
    # rand() is uniform in [0, 1), so floor(rand() * n) is an integer in 0 .. n-1.
    return floor(rand() * num_salts).cast("string")


In [70]:
# Use 90 shuffle partitions for the aggregations in the following cells.
spark.conf.set("spark.sql.shuffle.partitions",90)

In [71]:
# Add a salted key "<department_id>_<salt>" so that the rows of one department
# are spread over many distinct grouping keys.
# Built from emp_df (not emp) so the salted run aggregates exactly the same
# DataFrame as the unsalted baseline earlier in the notebook.
salted_emp = emp_df.withColumn(
    "salted_dept_id",
    concat("department_id", lit("_"), salt_udf()),
)

In [72]:
# Stage 1: partial maximum per (department_id, salted_dept_id) group.
# NOTE: this rebinds df_max_sal_by_dept, replacing the unsalted result that an
# earlier cell stored under the same name.
df_max_sal_by_dept = (
    salted_emp
    .groupBy("department_id", "salted_dept_id")
    .agg(max("salary").alias("max_salary"))
)

In [73]:
# Stage 2: collapse the per-salt partial maxima into one maximum per department.
df_max_sal_by_dept1 = (
    df_max_sal_by_dept
    .groupBy("department_id")
    .agg(max("max_salary").alias("final_max_salary"))
)

In [75]:
# Action: write the two-stage (salted) result to the "noop" format to run the job.
(
    df_max_sal_by_dept1.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [106]:
# Stop the SparkSession now that the notebook is finished.

spark.stop()