In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from pyspark.sql.functions import col

# Reuse an active session if one exists; otherwise start a local one with two worker threads.
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[2]")
    .getOrCreate()
)

In [0]:
# Schema of the employee/department assignment table; every column is declared non-nullable.
schema = StructType(
    [
        StructField(field_name, field_type, False)
        for field_name, field_type in (
            ("employee_id", IntegerType()),
            ("department_id", IntegerType()),
            ("primary_flag", StringType()),
        )
    ]
)

# Sample rows as (employee_id, department_id, primary_flag).
data = [
    (1, 1, 'N'),
    (2, 1, 'Y'),
    (2, 2, 'N'),
    (3, 3, 'N'),
    (4, 2, 'N'),
    (4, 3, 'Y'),
    (4, 4, 'N'),
]

emp = spark.createDataFrame(data, schema=schema)
emp.show()

+-----------+-------------+------------+
|employee_id|department_id|primary_flag|
+-----------+-------------+------------+
|          1|            1|           N|
|          2|            1|           Y|
|          2|            2|           N|
|          3|            3|           N|
|          4|            2|           N|
|          4|            3|           Y|
|          4|            4|           N|
+-----------+-------------+------------+



In [0]:
# Employees can belong to multiple departments. When the employee joins other departments, they need to decide which department is their primary department. 
# Note that when an employee belongs to only one department, their primary_flag is 'N'.
# Write a solution to report all the employees with their primary department. For employees who belong to one department, report their only department.
# Return the result table in any order.

# Rows explicitly flagged as an employee's primary department.
emp_with_pf_y = emp.filter(col("primary_flag") == 'Y')

# Rows of employees that have no 'Y' row at all. A left-anti join keeps the
# existence check distributed inside Spark, instead of collecting every flagged
# employee_id to the driver and embedding that list in an isin() predicate.
emp_without_pf_y = emp.join(
    emp_with_pf_y.select("employee_id"),
    on="employee_id",
    how="left_anti",
)

# DataFrame.union matches columns by position, so project both sides to the
# same ordered column list before combining them.
result_columns = ["employee_id", "department_id"]
(
    emp_with_pf_y.select(*result_columns)
    .union(emp_without_pf_y.select(*result_columns))
    .show()
)


+-----------+-------------+
|employee_id|department_id|
+-----------+-------------+
|          2|            1|
|          4|            3|
|          1|            1|
|          3|            3|
+-----------+-------------+



In [0]:
# Register (or refresh) the temp view. createTempView raises when a view named
# "emp" already exists, which made this cell fail when re-run in the same session.
emp.createOrReplaceTempView("emp")

# SQL version of the same report: rows flagged 'Y', plus the rows of employees
# that have no 'Y' row. Note that SQL `union` also removes duplicate rows,
# unlike DataFrame.union.
spark.sql("""select employee_id,department_id from emp where primary_flag='Y' 
            union 
            select employee_id,department_id from emp where employee_id not in (select employee_id from emp where primary_flag='Y')""").show()

+-----------+-------------+
|employee_id|department_id|
+-----------+-------------+
|          2|            1|
|          4|            3|
|          1|            1|
|          3|            3|
+-----------+-------------+



In [0]:
# Shut down the SparkSession created in the first cell now that the examples are done.
spark.stop()