In [13]:
from pyspark.sql import SparkSession

# Create (or reuse) a Hive-enabled SparkSession that runs on YARN.
# Executor sizing is pinned (2 executors, 512MB, 4 cores each) and dynamic
# allocation is turned off, so the configured executor count is what is requested.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")  # presumably lets Spark pick a free UI port
    .enableHiveSupport()
    .appName("Spark AQE- DataSkew Handling")
    .master("yarn")
    .config("spark.executor.instances", "2")
    .config("spark.executor.memory", "512MB")
    .config("spark.executor.cores", "4")
    .config("spark.dynamicAllocation.enabled", "False")
    .getOrCreate()
)

In [12]:
# Display the active SparkSession object as this cell's output.
# NOTE(review): this cell's execution count (12) is lower than that of the
# session-creation cell above it (13), so it was last run out of order;
# re-run the notebook top to bottom on a fresh kernel before sharing.
spark

In [14]:
# Show the application ID of the running Spark application
# (presumably used to look the job up in the YARN / Spark UI — confirm).
spark.sparkContext.applicationId

'application_1745651200635_11250'

In [15]:
 #Disable AQE and Broadcast join

# Baseline configuration, applied in order:
#   - turn Adaptive Query Execution off,
#   - turn AQE partition coalescing off,
#   - set the auto-broadcast threshold to -1 so joins are not broadcast automatically.
for conf_key, conf_value in (
    ("spark.sql.adaptive.enabled", False),
    ("spark.sql.adaptive.coalescePartitions.enabled", False),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
):
    spark.conf.set(conf_key, conf_value)


In [16]:
# Load the employee records CSV using an explicit DDL schema (no schema
# inference); the first row of the file is treated as a header.
schema = (
    "first_name string, last_name string, job_title string, dob string, "
    "email string, phone string, salary double, department_id int"
)

emp = spark.read.csv(
    "Datasets/employee_records.csv",
    schema=schema,
    header=True,
)

In [17]:
# Load the department CSV using an explicit DDL schema (no schema inference);
# the first row of the file is treated as a header.
dept_schema = (
    "department_id int, department_name string, description string, "
    "city string, state string, country string"
)

dept = spark.read.csv(
    "Datasets/department_data.csv",
    schema=dept_schema,
    header=True,
)


In [18]:
# Enlarge the employee data by stacking three copies of the same DataFrame.
# union() keeps duplicates, so the result has three times the rows of `emp`.

emp_df = (
    emp
    .union(emp)
    .union(emp)
)

In [19]:
# Baseline join (AQE is disabled at this point in the notebook):
# left-outer join of the enlarged employee data to departments, matching
# rows where the two department_id columns are equal.

df_joined = emp_df.join(
    dept,
    on=emp_df["department_id"] == dept["department_id"],
    how="left_outer",
)


In [20]:
# Run the join as a dummy write using the "noop" format, so the job executes
# without this notebook having to supply an output path.

(
    df_joined.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [21]:
# Enable AQE for the second run of the join.
#
# - adaptive.enabled / coalescePartitions.enabled: turn Adaptive Query
#   Execution and its partition coalescing back on.
# - skewJoin.enabled: set explicitly so the skew-handling part of this demo
#   does not silently depend on the cluster's default for that flag.
# - advisoryPartitionSizeInBytes (5MB) and
#   skewJoin.skewedPartitionThresholdInBytes (8MB): sizes chosen for this demo.

spark.conf.set("spark.sql.adaptive.enabled", True)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", True)
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", True)
spark.conf.set("spark.sql.adaptive.advisoryPartitionSizeInBytes", "5MB")
spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes", "8MB")


In [22]:
# Rebuild the same left-outer join on department_id for the run that follows
# the AQE-enabling settings above.

df_joined = emp_df.join(
    dept,
    on=emp_df["department_id"] == dept["department_id"],
    how="left_outer",
)

In [23]:
# Run the rebuilt join as a dummy "noop"-format write, mirroring the earlier
# baseline write so the two runs can be compared.

(
    df_joined.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [24]:
# Stop the SparkSession now that both join runs have been executed.
# Any cell that uses `spark` must be preceded by re-running the
# session-creation cell after this point.

spark.stop()

Conclusion:
1. Understand how AQE automatically mitigates data skew in joins.
2. Understand how AQE reduces the number of empty partitions by coalescing them.
3. Understand how AQE helps to reduce data spill.
4. Understand the difference between memory spill and disk spill.
