In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructField,StructType,StringType,IntegerType
from pyspark.sql.functions import col,max
from pyspark.sql.window import Window

# Obtain the notebook's Spark session: master "local[2]", application name "app".
# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[2]")
    .getOrCreate()
)

In [0]:
# Employee table: every column is declared non-nullable (nullable=False).
schema = (
    StructType()
    .add("id", IntegerType(), nullable=False)
    .add("name", StringType(), nullable=False)
    .add("salary", IntegerType(), nullable=False)
    .add("departmentId", IntegerType(), nullable=False)
)

# One tuple per employee, in schema order: (id, name, salary, departmentId).
data = [
    (1, "Joe", 70000, 1),
    (2, "Jim", 90000, 1),
    (3, "Henry", 80000, 2),
    (4, "Sam", 60000, 2),
    (5, "Max", 90000, 1),
]

emp = spark.createDataFrame(data, schema)
emp.show()

+---+-----+------+------------+
| id| name|salary|departmentId|
+---+-----+------+------------+
|  1|  Joe| 70000|           1|
|  2|  Jim| 90000|           1|
|  3|Henry| 80000|           2|
|  4|  Sam| 60000|           2|
|  5|  Max| 90000|           1|
+---+-----+------+------------+



In [0]:
# Department table: both columns are declared non-nullable (nullable=False).
schema = (
    StructType()
    .add("id", IntegerType(), nullable=False)
    .add("name", StringType(), nullable=False)
)

# One tuple per department, in schema order: (id, name).
data = [
    (1, "IT"),
    (2, "Sales"),
]

dept = spark.createDataFrame(data, schema)
dept.show()

+---+-----+
| id| name|
+---+-----+
|  1|   IT|
|  2|Sales|
+---+-----+



In [0]:
# Task: list the employee(s) earning the highest salary within each department
# (ties are all returned; row order is not significant).
#
# Approach: attach each department's maximum salary to every employee row via a
# window partitioned by departmentId, keep only the rows whose salary equals that
# maximum, then join to `dept` to pick up the department name.
window_spec = Window.partitionBy("departmentId")

(
    emp
    .select(
        "name",
        "salary",
        "departmentId",
        max("salary").over(window_spec).alias("max_salary"),
    )
    .where(col("salary") == col("max_salary"))
    .join(dept, on=emp.departmentId == dept.id, how="inner")
    .select(
        dept.name.alias("Department"),
        emp.name.alias("Employee"),
        "salary",
    )
    .show()
)

+----------+--------+------+
|Department|Employee|salary|
+----------+--------+------+
|        IT|     Max| 90000|
|        IT|     Jim| 90000|
|     Sales|   Henry| 80000|
+----------+--------+------+



In [0]:
# Same problem solved in Spark SQL: expose both DataFrames as temp views, then keep
# the employees whose (department, salary) pair matches a per-department maximum.
emp.createOrReplaceTempView("emp")
dept.createOrReplaceTempView("dept")

# The subquery groups by the column name rather than by ordinal position
# (`GROUP BY 1`), so the query does not depend on ordinal resolution being
# enabled and stays correct if the subquery's select list is reordered.
# `salary` is qualified with its table alias because the query joins two tables.
top_salary_sql = """
    SELECT d.name   AS Department,
           e.name   AS Employee,
           e.salary AS salary
    FROM emp e
    JOIN dept d
      ON e.departmentId = d.id
    WHERE (d.id, e.salary) IN (
        SELECT departmentId, MAX(salary)
        FROM emp
        GROUP BY departmentId
    )
"""
spark.sql(top_salary_sql).show()

+----------+--------+------+
|Department|Employee|salary|
+----------+--------+------+
|        IT|     Jim| 90000|
|        IT|     Max| 90000|
|     Sales|   Henry| 80000|
+----------+--------+------+



In [0]:
# Stop the Spark session now that both solutions have been displayed.
spark.stop()