In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("app").master("local[2]").getOrCreate()

In [0]:
schema = StructType([
    StructField("employee_id",IntegerType(),False),
    StructField("name",StringType(),False)
])

data = [
( 2          , "Crew"     ),
( 4          , "Haven"    ),
( 5          , "Kristian" )
]
emp = spark.createDataFrame(data,schema)
emp.show()

+-----------+--------+
|employee_id|    name|
+-----------+--------+
|          2|    Crew|
|          4|   Haven|
|          5|Kristian|
+-----------+--------+



In [0]:
schema = StructType([
    StructField("employee_id",IntegerType(),False),
    StructField("salary",IntegerType(),False)
])
data = [
( 5           , 76071 ) ,
( 1           , 22517 ) ,
( 4           , 63539 ) 
]
salaries = spark.createDataFrame(data,schema)
salaries.show()

+-----------+------+
|employee_id|salary|
+-----------+------+
|          5| 76071|
|          1| 22517|
|          4| 63539|
+-----------+------+



In [0]:
# Write a solution to report the IDs of all the employees with missing information. Return the result table ordered by employee_id in descending order.
# The information of an employee is missing if:
#     * The employee's name is missing, or
#     * The employee's salary is missing.


emp_left = emp.join(salaries,emp.employee_id==salaries.employee_id,'left_anti').select("employee_id")
salaries_left = salaries.join(emp,emp.employee_id==salaries.employee_id,'anti').select("employee_id")

emp_left.union(salaries_left).orderBy("employee_id").sort(col("employee_id").desc()).show()


+-----------+
|employee_id|
+-----------+
|          2|
|          1|
+-----------+



In [0]:
emp.createOrReplaceTempView("emp")
salaries.createOrReplaceTempView("s")

# order by 1 desc works on full table created via union
spark.sql("select employee_id from emp left anti join s using(employee_id) union select employee_id from s left anti join emp using (employee_id) order by 1 desc").show()

# Using left and right joins along with where condition to create left anti join
spark.sql("""
            select e.employee_id from emp e left join s on e.employee_id=s.employee_id where s.employee_id is null
                union
            select s.employee_id from emp e right join s on e.employee_id=s.employee_id where e.employee_id is null
            order by 1
          """).show()

+-----------+
|employee_id|
+-----------+
|          2|
|          1|
+-----------+

+-----------+
|employee_id|
+-----------+
|          1|
|          2|
+-----------+



In [0]:
spark.stop()