In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructField,StructType,IntegerType
from pyspark.sql.functions import col,count

# Reuse the active SparkSession if one exists; otherwise start a local
# session named "app" running on two threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("app")
    .getOrCreate()
)

In [0]:
# All three columns are non-nullable integers.
schema = StructType([
    StructField(field_name, IntegerType(), False)
    for field_name in ("actor_id", "director_id", "timestamp")
])

# Sample rows as (actor_id, director_id, timestamp); each timestamp is unique.
data = [
    (1, 1, 0),
    (1, 1, 1),
    (1, 1, 2),
    (1, 2, 3),
    (1, 2, 4),
    (2, 1, 5),
    (2, 1, 6),
]

actor_dir = spark.createDataFrame(data, schema)
actor_dir.show()

+--------+-----------+---------+
|actor_id|director_id|timestamp|
+--------+-----------+---------+
|       1|          1|        0|
|       1|          1|        1|
|       1|          1|        2|
|       1|          2|        3|
|       1|          2|        4|
|       2|          1|        5|
|       2|          1|        6|
+--------+-----------+---------+



In [0]:
# Problem: find every (actor_id, director_id) pair where the actor has
# cooperated with the director at least three times.
# The result may be returned in any order.
(
    actor_dir
    .groupBy("actor_id", "director_id")            # one group per actor/director pair
    .agg(count("timestamp").alias("colab"))        # collaborations per pair
    .filter(col("colab") >= 3)                     # keep pairs with 3 or more
    .select("actor_id", "director_id")             # drop the helper count column
    .show()
)

+--------+-----------+
|actor_id|director_id|
+--------+-----------+
|       1|          1|
+--------+-----------+



In [0]:
# Same question answered with Spark SQL: register the DataFrame as the temp
# view "ad", then keep the (actor_id, director_id) groups with at least 3 rows.
actor_dir.createOrReplaceTempView("ad")
spark.sql(
    # Group by explicit column names rather than ordinals (`group by 1,2`):
    # ordinals silently change meaning if the select list is reordered and
    # are only treated as positions while spark.sql.groupByOrdinal is enabled.
    """
    SELECT actor_id, director_id
    FROM ad
    GROUP BY actor_id, director_id
    HAVING COUNT(*) >= 3
    """
).show()

+--------+-----------+
|actor_id|director_id|
+--------+-----------+
|       1|          1|
+--------+-----------+



In [0]:
# Shut down the Spark session created in the first cell now that the notebook is done.
spark.stop()