In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
from pyspark.sql.functions import col

# Start (or reuse) a Spark session named "app" on the local[3] master.
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[3]")
    .getOrCreate()
)

In [0]:
# Employee table layout: managerId is the only nullable column.
schema = StructType(
    [
        StructField("id", IntegerType(), nullable=False),
        StructField("name", StringType(), nullable=False),
        StructField("department", StringType(), nullable=False),
        StructField("managerId", IntegerType(), nullable=True),
    ]
)

# Each row is (id, name, department, managerId).
data = [
    (101, 'John', 'A', None),
    (102, 'Dan', 'A', 101),
    (103, 'Jame', 'A', 101),
    (104, 'Amy', 'A', 101),
    (105, 'Anne', 'A', 101),
    (106, 'Ron', 'B', 101),
    (107, 'Tony', 'A', 102),
    (108, 'John', 'A', 102),
    (109, 'Arun', 'A', 102),
]

emp = spark.createDataFrame(data, schema)
emp.show()

+---+----+----------+---------+
| id|name|department|managerId|
+---+----+----------+---------+
|101|John|         A|     null|
|102| Dan|         A|      101|
|103|Jame|         A|      101|
|104| Amy|         A|      101|
|105|Anne|         A|      101|
|106| Ron|         B|      101|
|107|Tony|         A|      102|
|108|John|         A|      102|
|109|Arun|         A|      102|
+---+----+----------+---------+



In [0]:
# Find managers with at least five direct reports; result order does not matter.
#
# Alias naming is easy to misread: "mng" is the side whose managerId is
# followed (i.e. the report), while "em" is the side matched on id (i.e. the
# manager whose name is returned).
(
    emp.alias("mng")
    .join(emp.alias("em"), col("mng.managerId") == col("em.id"), "inner")
    .select(col("em.name"), col("mng.managerId"))
    # One group per manager; the count is that manager's number of reports.
    .groupBy(col("mng.managerId"), col("em.name"))
    .count()
    .filter(col("count") >= 5)
    .select("em.name")
    .show()
)

+----+
|name|
+----+
|John|
+----+



In [0]:
# Same question answered with Spark SQL, querying emp through a temp view.
emp.createOrReplaceTempView("emp")
spark.sql(
    # Adjacent literals concatenate into a single query string.
    "select em.name from emp mng join emp em on mng.managerId=em.id "
    "group by mng.managerId,em.name having count(*)>=5"
).show()

+----+
|name|
+----+
|John|
+----+



In [0]:
# Shut down the Spark session now that both queries have been shown.
spark.stop()