#Problem Statement
A company's executives are interested in seeing who earns the most money in each of the company's departments. A high earner in a department is an employee who has a salary in the top three unique salaries for that department.

Write a solution to find the employees who are high earners in each of the departments.

Return the result table in any order.

The result format is in the following example.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import row_number, max
from pyspark.sql.window import Window

# Initialize Spark session
spark = SparkSession.builder \
    .appName("HighestSalaryByDepartment") \
    .getOrCreate()

# Define the Employee and Department tables
employee_data = [
    (1, 'Joe', 85000, 1),
    (2, 'Henry', 80000, 2),
    (3, 'Sam', 60000, 2),
    (4, 'Max', 90000, 1),
    (5, 'Janet', 69000, 1),
    (6, 'Randy', 85000, 1),
    (7, 'Will', 70000, 1)
]

department_data = [
    (1, 'IT'),
    (2, 'Sales')
]

employee_df = spark.createDataFrame(employee_data, ['id', 'name', 'salary', 'departmentId'])
department_df = spark.createDataFrame(department_data, ['id', 'dept_name'])
employee_df.display()
department_df.display()

id,name,salary,departmentId
1,Joe,85000,1
2,Henry,80000,2
3,Sam,60000,2
4,Max,90000,1
5,Janet,69000,1
6,Randy,85000,1
7,Will,70000,1


id,dept_name
1,IT
2,Sales


In [0]:
# Join Employee and Department tables on departmentId
joined_df = employee_df.join(department_df, employee_df.departmentId == department_df.id, 'inner')

# Define a window specification
window_spec = Window.partitionBy('departmentId').orderBy(employee_df.salary.desc())

# Assign row number to each employee within each department based on salary
joined_df = joined_df.withColumn('rank', row_number().over(window_spec))

# Filter rows to include only those with ranks less than or equal to 3
top_three_per_department = joined_df.filter(joined_df.rank <= 3)

# Rename columns for expected output
top_three_per_department = top_three_per_department.selectExpr("name as Employee", "salary as Salary", "dept_name as Department")

# Show the result
print("Department Top Three Salaries")
top_three_per_department.display()

Department Top Three Salaries


Employee,Salary,Department
Max,90000,IT
Joe,85000,IT
Randy,85000,IT
Henry,80000,Sales
Sam,60000,Sales


In [0]:
employee_df.createOrReplaceTempView("employee")
department_df.createOrReplaceTempView("Department")

###Spark SQL


In [0]:
%sql
WITH cte AS (
    SELECT e.name AS Employee, e.salary AS Salary, d.dept_name AS Department, DENSE_RANK() OVER(PARTITION BY d.dept_name ORDER BY e.salary DESC) AS rank
    FROM Employee AS e
    JOIN Department AS d
    ON e.departmentId = d.id
)

SELECT Department, Employee, Salary
FROM cte
WHERE rank < 3
ORDER BY Department ASC , Salary DESC

Department,Employee,Salary
IT,Max,90000
IT,Joe,85000
IT,Randy,85000
Sales,Henry,80000
Sales,Sam,60000
