In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
from pyspark.sql.functions import col,avg,round,count

# Create (or reuse, via getOrCreate) the notebook's Spark session, named "app"
# and pointed at the "local[3]" master.
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[3]")
    .getOrCreate()
)

In [0]:
# Employee table layout: employee_id and name are required, while reports_to
# (the employee_id of the person's manager) and age are nullable.
schema = StructType(
    [
        StructField("employee_id", IntegerType(), nullable=False),
        StructField("name", StringType(), nullable=False),
        StructField("reports_to", IntegerType(), nullable=True),
        StructField("age", IntegerType(), nullable=True),
    ]
)

# Sample rows as (employee_id, name, reports_to, age).
# Alice and Bob report to Hercy; Hercy and Winston report to nobody.
data = [
    (9, "Hercy", None, 43),
    (6, "Alice", 9, 41),
    (4, "Bob", 9, 36),
    (2, "Winston", None, 37),
]

emp = spark.createDataFrame(data, schema)
emp.show()

+-----------+-------+----------+---+
|employee_id|   name|reports_to|age|
+-----------+-------+----------+---+
|          9|  Hercy|      null| 43|
|          6|  Alice|         9| 41|
|          4|    Bob|         9| 36|
|          2|Winston|      null| 37|
+-----------+-------+----------+---+



In [0]:
# Problem: a manager is an employee who has at least one other employee
# reporting directly to them. Report each manager's id and name, the number of
# employees who report directly to them, and the average age of those reports
# rounded to the nearest integer. Order the result by employee_id.
#
# Approach: self-join the employee table. The "report" side supplies each
# employee that has a reports_to value; the "manager" side supplies the row
# that value points at. The inner join drops employees without a manager and
# employees nobody reports to, so only real managers survive the grouping.
(
    emp.alias("report")
    .join(
        emp.alias("manager"),
        col("report.reports_to") == col("manager.employee_id"),
        "inner",
    )
    .groupBy("manager.employee_id", "manager.name")
    .agg(
        count("report.employee_id").alias("reports_count"),
        # `round` is pyspark.sql.functions.round (imported above, shadowing the
        # builtin). It yields a double, so the value displays as e.g. 39.0;
        # append .cast("int") if an integer column is required.
        round(avg("report.age")).alias("average_age"),
    )
    .orderBy("manager.employee_id")
    .show()
)

+-----------+-----+-------------+-----------+
|employee_id| name|reports_count|average_age|
+-----------+-----+-------------+-----------+
|          9|Hercy|            2|       39.0|
+-----------+-----+-------------+-----------+



In [0]:
# Same manager report expressed in Spark SQL. The DataFrame is registered as
# the temporary view "emp" and self-joined: "report" is the reporting employee,
# "manager" is the row its reports_to value points at.
emp.createOrReplaceTempView("emp")
spark.sql(
    """
    SELECT manager.employee_id,
           manager.name,
           COUNT(*)               AS reports_count,
           ROUND(AVG(report.age)) AS average_age
    FROM emp AS report
    JOIN emp AS manager
      ON report.reports_to = manager.employee_id
    GROUP BY manager.employee_id, manager.name
    ORDER BY manager.employee_id
    """
).show()

+-----------+-----+-------------+-----------+
|employee_id| name|reports_count|average_age|
+-----------+-----+-------------+-----------+
|          9|Hercy|            2|       39.0|
+-----------+-----+-------------+-----------+



In [0]:
# Stop the Spark session now that both versions of the report have been shown.
spark.stop()