In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Start (or reuse) the Spark session that backs every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("HighestSalaryInDepartment")
    .getOrCreate()
)

# Employee rows, laid out as (id, name, salary, departmentId).
# Jim and Max deliberately tie for the top salary in department 1.
employee_data = [
    (1, "Joe", 70000, 1),
    (2, "Jim", 90000, 1),
    (3, "Henry", 80000, 2),
    (4, "Sam", 60000, 2),
    (5, "Max", 90000, 1),
]

# Department rows, laid out as (id, name).
department_data = [(1, "IT"), (2, "Sales")]

# Build the DataFrames. Only column names are supplied, so Spark infers the
# column types from the Python values above.
employee_df = spark.createDataFrame(
    employee_data,
    schema=["id", "name", "salary", "departmentId"],
)
department_df = spark.createDataFrame(
    department_data,
    schema=["id", "name"],
)

# Expose both DataFrames to Spark SQL under the table names "Employee" and
# "Department" so they can be queried with spark.sql(...).
employee_df.createOrReplaceTempView("Employee")
department_df.createOrReplaceTempView("Department")




In [0]:
# DataFrame-API approach: rank employees by salary inside each department and
# keep the rank-1 rows.

# Pair every employee with its department row (default join type: inner).
join_condition = employee_df.departmentId == department_df.id
emp_dept_df = employee_df.join(department_df, join_condition)

# One window per department, highest salary first.
window_spec = Window.partitionBy("departmentId").orderBy(F.col("salary").desc())

# rank() assigns the same rank to equal salaries, so employees who tie for the
# top salary in a department all receive rank 1.
emp_dept_df = emp_dept_df.withColumn("rank", F.rank().over(window_spec))

# Keep the top-ranked rows and project the report columns. After the join both
# inputs contribute a "name" column, so each one is referenced through its
# originating DataFrame to keep the reference unambiguous.
result_df = (
    emp_dept_df
    .where(F.col("rank") == 1)
    .select(
        department_df["name"].alias("Department"),
        employee_df["name"].alias("Employee"),
        employee_df["salary"].alias("Salary"),
    )
)

result_df.display()

Department,Employee,Salary
IT,Jim,90000
IT,Max,90000
Sales,Henry,80000


In [0]:
# Spark SQL approach: list the employees who earn the highest salary in their
# department, using the "Employee" and "Department" temp views.
#
# How the query works:
#   * Employee (alias e) is joined to Department (alias d) on the department key
#     so the department name can be reported.
#   * The correlated subquery computes MAX(salary) over the Employee rows whose
#     departmentId matches the outer row's e.departmentId.
#   * The outer WHERE keeps every employee whose salary equals that maximum, so
#     employees who tie for the top salary are all returned.
#   * No ORDER BY is applied, so the order of the returned rows is not fixed.
query = """
SELECT 
    d.name AS Department, 
    e.name AS Employee, 
    e.salary AS Salary
FROM 
    Employee e
JOIN 
    Department d ON e.departmentId = d.id
WHERE 
    e.salary = (SELECT MAX(salary) 
                FROM Employee 
                WHERE departmentId = e.departmentId)
"""

# Run the query through the Spark session; the result is a DataFrame with the
# columns Department, Employee and Salary. This rebinds the name result_df.
result_df = spark.sql(query)

# Render the result in the notebook.
# NOTE(review): display() is presumably the Databricks notebook helper rather
# than a core PySpark DataFrame method — confirm before running elsewhere.
result_df.display()

Department,Employee,Salary
IT,Max,90000
IT,Jim,90000
Sales,Henry,80000
