In [0]:
# Build the two small in-memory DataFrames: employees and departments.
# `spark` is expected to be an existing SparkSession; it is not created here.

# Column names, listed in the same order as the fields of each row tuple.
employee_columns = ["employee_id", "name", "salary", "department_id"]
department_columns = ["department_id", "department_name"]

# Employee rows: (employee_id, name, salary, department_id).
employee_data = [
    (1, "Alice", 50000, 1),
    (2, "Bob", 60000, 1),
    (3, "Charlie", 55000, 2),
    (4, "David", 70000, 2),
    (5, "Eve", 45000, 3),
    (6, "Frank", 48000, 3),
    (7, "Grace", 70000, 3),
]

# Department rows: (department_id, department_name).
department_data = [
    (1, "HR"),
    (2, "IT"),
    (3, "Marketing"),
]

# Only column names are supplied, so Spark infers the column types from the
# Python values in the rows.
employee_df = spark.createDataFrame(employee_data, schema=employee_columns)
department_df = spark.createDataFrame(department_data, schema=department_columns)


In [0]:
# Render both input tables: employees first, then departments.
for frame in (employee_df, department_df):
    frame.display()

employee_id,name,salary,department_id
1,Alice,50000,1
2,Bob,60000,1
3,Charlie,55000,2
4,David,70000,2
5,Eve,45000,3
6,Frank,48000,3
7,Grace,70000,3


department_id,department_name
1,HR
2,IT
3,Marketing


In [0]:
# Import the functions module under an alias instead of importing `max` by
# name: `from pyspark.sql.functions import max` would replace Python's
# built-in max() for every cell executed afterwards in this kernel.
from pyspark.sql import functions as F

# Requires employee_df and department_df to already exist in the session.

# Attach department_name to each employee row. This is an inner join on
# department_id, so an employee whose department_id has no match in
# department_df is dropped from the result.
employee_department_df = employee_df.join(department_df, on="department_id", how="inner")

# Highest salary per department (one output row per department_name).
highest_salary_df = (
    employee_department_df
    .groupBy("department_name")
    .agg(F.max("salary").alias("highest_salary"))
)

highest_salary_df.display()


department_name,highest_salary
HR,60000
Marketing,70000
IT,70000


In [0]:
from pyspark.sql.functions import col, rank
from pyspark.sql.window import Window

# Rank employees inside each department, highest salary first.
salary_high_to_low = col("salary").desc()
window_spec = Window.partitionBy("department_name").orderBy(salary_high_to_low)

# rank() assigns the same rank to equal salaries, so a department whose top
# salary is shared by several employees yields several rank-1 rows.
employee_with_rank_df = employee_department_df.withColumn(
    "rank",
    rank().over(window_spec),
)

# Keep only the rank-1 rows: the highest-paid employee(s) of each department.
highest_paid_employee_df = employee_with_rank_df.where(col("rank") == 1)

highest_paid_employee_df.show()


+-------------+-----------+-----+------+---------------+----+
|department_id|employee_id| name|salary|department_name|rank|
+-------------+-----------+-----+------+---------------+----+
|            1|          2|  Bob| 60000|             HR|   1|
|            2|          4|David| 70000|             IT|   1|
|            3|          7|Grace| 70000|      Marketing|   1|
+-------------+-----------+-----+------+---------------+----+



In [0]:
# Import the functions module under an alias instead of importing `min` by
# name: `from pyspark.sql.functions import min` would replace Python's
# built-in min() for every cell executed afterwards in this kernel.
from pyspark.sql import functions as F

# Lowest salary per department (one output row per department_name).
lowest_salary_df = (
    employee_department_df
    .groupBy("department_name")
    .agg(F.min("salary").alias("lowest_salary"))
)

lowest_salary_df.show()


+---------------+-------------+
|department_name|lowest_salary|
+---------------+-------------+
|             HR|        50000|
|      Marketing|        45000|
|             IT|        55000|
+---------------+-------------+



In [0]:
# Rank employees inside each department, lowest salary first.
# NOTE: this rebinds `window_spec` and `employee_with_rank_df`, which the
# highest-paid-employee cell also assigns; after this cell runs they hold the
# ascending-order versions. Window, col and rank come from that earlier cell's
# imports.
salary_low_to_high = col("salary").asc()
window_spec = Window.partitionBy("department_name").orderBy(salary_low_to_high)

# rank() assigns the same rank to equal salaries, so a department whose lowest
# salary is shared by several employees yields several rank-1 rows.
employee_with_rank_df = employee_department_df.withColumn(
    "rank",
    rank().over(window_spec),
)

# Keep only the rank-1 rows: the lowest-paid employee(s) of each department.
lowest_paid_employee_df = employee_with_rank_df.where(col("rank") == 1)

lowest_paid_employee_df.show()


+-------------+-----------+-------+------+---------------+----+
|department_id|employee_id|   name|salary|department_name|rank|
+-------------+-----------+-------+------+---------------+----+
|            1|          1|  Alice| 50000|             HR|   1|
|            2|          3|Charlie| 55000|             IT|   1|
|            3|          5|    Eve| 45000|      Marketing|   1|
+-------------+-----------+-------+------+---------------+----+

