In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType,StructField,IntegerType,StringType
from pyspark.sql.functions import col

# Local Spark session for this notebook; getOrCreate() reuses an already-active session.
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[2]")
    .getOrCreate()
)

In [0]:
# Employee table layout: manager_id is the only nullable column.
schema = StructType(
    [
        StructField("employee_id", IntegerType(), nullable=False),
        StructField("name", StringType(), nullable=False),
        StructField("manager_id", IntegerType(), nullable=True),
        StructField("salary", IntegerType(), nullable=False),
    ]
)

# Sample rows as (employee_id, name, manager_id, salary).
data = [
    (3, "Mila", 9, 60301),
    (12, "Antonella", None, 31000),
    (13, "Emery", None, 67084),
    (1, "Kalel", 11, 21241),
    (9, "Mikaela", None, 50937),
    (11, "Joziah", 6, 28485),
    (8, "John", 5, 29485),
]

emp = spark.createDataFrame(data, schema=schema)
emp.show()

+-----------+---------+----------+------+
|employee_id|     name|manager_id|salary|
+-----------+---------+----------+------+
|          3|     Mila|         9| 60301|
|         12|Antonella|      null| 31000|
|         13|    Emery|      null| 67084|
|          1|    Kalel|        11| 21241|
|          9|  Mikaela|      null| 50937|
|         11|   Joziah|         6| 28485|
|          8|     John|         5| 29485|
+-----------+---------+----------+------+



In [0]:
# Problem: find the IDs of the employees whose salary is strictly less than $30000 and whose
# manager left the company. When a manager leaves, their row is deleted from the Employees
# table, but their reports still keep manager_id set to the departed manager.
# Return the result ordered by employee_id.
#
# Note: the DataFrame API accepts "anti", "leftanti" and "left_anti" for a left anti join.
# There is no right anti join (Spark SQL likewise only offers [LEFT] ANTI JOIN), so keep the
# DataFrame whose unmatched rows are wanted on the left side.
#
# Note: left anti and left semi joins return only the columns of the left DataFrame.
#
# A NULL manager_id never satisfies the join condition, so the anti join alone would also
# return employees who have no manager at all; they are excluded explicitly before the join.
(
    emp.alias("e")
    .filter((col("e.salary") < 30000) & col("e.manager_id").isNotNull())
    .join(emp.alias("m"), col("e.manager_id") == col("m.employee_id"), "leftanti")
    .select("e.employee_id")
    .orderBy("employee_id")
    .show()
)


+-----------+
|employee_id|
+-----------+
|          8|
|         11|
+-----------+



In [0]:
# Expose the employee DataFrame to Spark SQL under the name "emp".
emp.createOrReplaceTempView("emp")

# Without a join: keep employees whose (non-null) manager_id is not an existing employee_id.
spark.sql("select employee_id from emp where salary<30000 and manager_id is not null and manager_id not in (select employee_id from emp) order by 1").show()

# With an anti join: rows of e that have no matching manager row in m.
# The explicit "manager_id is not null" check is required because a NULL manager_id never
# matches the join condition and would otherwise be reported as a departed manager.
print("Using left anti join")
spark.sql(
    "select e.employee_id from emp e left anti join emp m on e.manager_id=m.employee_id "
    "where e.salary<30000 and e.manager_id is not null order by 1"
).show()


+-----------+
|employee_id|
+-----------+
|          8|
|         11|
+-----------+

Using left anti join
+-----------+
|employee_id|
+-----------+
|          8|
|         11|
+-----------+



In [0]:
# Shut down the Spark session now that all queries have run.
spark.stop()