In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField,StringType,IntegerType
from pyspark.sql.functions import col, avg,round

# Local SparkSession for this notebook, running on two local threads.
# getOrCreate() returns the already-active session when one exists.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("app")
    .getOrCreate()
)

In [0]:
# Project table: one row per (project, employee) assignment.
# Both columns are non-nullable integers.
schema = StructType(
    [
        StructField(column_name, IntegerType(), nullable=False)
        for column_name in ("project_id", "employee_id")
    ]
)

# (project_id, employee_id)
data = [(1, 1), (1, 2), (1, 3), (2, 1), (2, 4)]

project = spark.createDataFrame(data, schema=schema)
project.show()

+----------+-----------+
|project_id|employee_id|
+----------+-----------+
|         1|          1|
|         1|          2|
|         1|          3|
|         2|          1|
|         2|          4|
+----------+-----------+



In [0]:
# Employee table: one row per employee; every column is non-nullable.
schema = (
    StructType()
    .add("employee_id", IntegerType(), nullable=False)
    .add("name", StringType(), nullable=False)
    .add("experience_years", IntegerType(), nullable=False)
)

# (employee_id, name, experience_years)
data = [
    (1, "Khaled", 3),
    (2, "Ali", 2),
    (3, "John", 1),
    (4, "Doe", 2),
]

emp = spark.createDataFrame(data, schema=schema)
emp.show()

+-----------+------+----------------+
|employee_id|  name|experience_years|
+-----------+------+----------------+
|          1|Khaled|               3|
|          2|   Ali|               2|
|          3|  John|               1|
|          4|   Doe|               2|
+-----------+------+----------------+



In [0]:
# Task: report the average experience years of all the employees for each
# project, rounded to 2 digits. The result may be returned in any order.
#
# DataFrame-API solution. Joining on the column *name* (instead of the
# equality expression project.employee_id == emp.employee_id) keeps a single
# `employee_id` column in the joined frame, so later references to it cannot
# be ambiguous; this mirrors the `using(employee_id)` join in the SQL solution.
# `round` here is pyspark.sql.functions.round (imported at the top of the
# notebook), not Python's builtin round.
(
    project
    .join(emp, on="employee_id", how="inner")
    .groupBy("project_id")
    .agg(round(avg("experience_years"), 2).alias("average_years"))
    .show()
)

+----------+-------------+
|project_id|average_years|
+----------+-------------+
|         1|          2.0|
|         2|          2.5|
+----------+-------------+



In [0]:
# Spark SQL solution: register both DataFrames as temporary views so they can
# be referenced by name ("p" for project, "e" for emp) inside the query.
emp.createOrReplaceTempView("e")
project.createOrReplaceTempView("p")

# Average experience per project, rounded to 2 digits.
average_years_sql = "select p.project_id, round(avg(e.experience_years),2) as average_years from p join e using(employee_id) group by p.project_id;"

spark.sql(average_years_sql).show()

+----------+-------------+
|project_id|average_years|
+----------+-------------+
|         1|          2.0|
|         2|          2.5|
+----------+-------------+



In [0]:
# Stop the SparkSession created at the top of the notebook now that all
# queries have been run and displayed.
spark.stop()