In [0]:
from pyspark.sql.types import *

# Every column is a nullable integer, so derive the fields from the column
# names rather than writing out each StructField by hand.
schema = StructType(
    [
        StructField(column_name, IntegerType(), True)
        for column_name in ("ActorId", "DirectorId", "timestamp")
    ]
)

# One (ActorId, DirectorId, timestamp) tuple per row.
data = [
    (1, 1, 0),
    (1, 1, 1),
    (1, 1, 2),
    (1, 2, 3),
    (1, 2, 4),
    (1, 2, 5),
    (2, 1, 6),
]

df = spark.createDataFrame(data=data, schema=schema)
display(df)

ActorId,DirectorId,timestamp
1,1,0
1,1,1
1,1,2
1,2,3
1,2,4
1,2,5
2,1,6


In [0]:
# Count the rows for each (ActorId, DirectorId) pair using groupBy().count().
# Note: countDistinct is not used in this cell; count() adds a "count" column.
result_df = df.groupBy("ActorId", "DirectorId").count()

# Keep only the pairs whose row count is greater than 2. The column is accessed
# with brackets because result_df.count refers to the DataFrame method.
df1 = result_df.filter(result_df["count"] > 2)

# Show only the pair columns; the count itself is not displayed.
display(df1.select("ActorId", "DirectorId"))

ActorId,DirectorId
1,1
1,2


In [0]:
# Register df as a temporary view named "movies" so it can be queried via spark.sql.
df.createOrReplaceTempView("movies")

In [0]:
# Spark SQL version of the same filter: group the "movies" view by
# (ActorId, DirectorId) and keep the groups that contain more than 2 rows.
query = """
    SELECT ActorId, DirectorId
    FROM movies
    GROUP BY ActorId, DirectorId
    HAVING COUNT(*) > 2
"""

# Run the query; spark.sql returns the result as a DataFrame.
# Note: this rebinds result_df, which an earlier cell also assigned.
result_df = spark.sql(query)

# Render the resulting (ActorId, DirectorId) pairs.
display(result_df)

ActorId,DirectorId
1,1
1,2


Explanation:

createOrReplaceTempView: 

Registers the DataFrame df as a temporary view named movies (replacing any existing view with that name) so that it can be queried with Spark SQL.

SQL Query:

GROUP BY ActorId, DirectorId: 

Groups the records based on ActorId and DirectorId.
HAVING COUNT(*) > 2: Keeps only those combinations whose row count is greater than 2, i.e. pairs that appear at least three times.
spark.sql(query): Executes the SQL query and returns the result as a DataFrame.