In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType,StructField,IntegerType
from pyspark.sql.functions import col,countDistinct

# Create (or reuse) the notebook's Spark session: app name "app",
# running locally with 3 worker threads.
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[3]")
    .getOrCreate()
)

In [0]:
# Customer purchases: each row pairs a customer_id with a product_key.
# Both columns are declared as non-nullable integers.
schema = StructType(
    [
        StructField(column_name, IntegerType(), False)
        for column_name in ("customer_id", "product_key")
    ]
)

# (customer_id, product_key) rows.
data = [(1, 5), (2, 6), (3, 5), (3, 6), (1, 6)]

cust = spark.createDataFrame(data, schema)
cust.show()

+-----------+-----------+
|customer_id|product_key|
+-----------+-----------+
|          1|          5|
|          2|          6|
|          3|          5|
|          3|          6|
|          1|          6|
+-----------+-----------+



In [0]:
# Product table: a single non-nullable integer column of product keys.
schema = StructType([StructField("product_key", IntegerType(), False)])

# One single-element tuple per product key.
data = [(5,), (6,)]

products = spark.createDataFrame(data, schema)
products.show()

+-----------+
|product_key|
+-----------+
|          5|
|          6|
+-----------+



In [0]:
# Task: report the customer ids from the Customer table that bought all the
# products in the Product table. The result may be returned in any order.

# Number of distinct products a customer must have bought to qualify.
# distinct() keeps the target correct even if `products` repeats a key.
required_product_count = products.select("product_key").distinct().count()

(
    cust
    # Keep only purchases whose product_key exists in `products`, so keys
    # missing from the product table cannot inflate a customer's tally.
    # A left-semi join never duplicates or adds columns to the `cust` rows.
    .join(products, on="product_key", how="left_semi")
    .groupBy("customer_id")
    # countDistinct makes repeated purchases of the same product count once.
    .agg(countDistinct("product_key").alias("count"))
    .filter(col("count") == required_product_count)
    .select("customer_id")
    .show()
)

+-----------+
|customer_id|
+-----------+
|          1|
|          3|
+-----------+



In [0]:
# SQL formulation of the same report: customers who bought every product in `p`.
# Register both DataFrames as temp views so they can be queried by name.
cust.createOrReplaceTempView("c")
products.createOrReplaceTempView("p")

# - The IN filter drops purchases of keys that are not listed in `p`.
# - COUNT(DISTINCT ...) on both sides makes duplicate rows in either view harmless.
# - Grouping by column name (not ordinal position) keeps the query valid if the
#   select list is ever reordered.
spark.sql(
    """
    SELECT customer_id
    FROM c
    WHERE product_key IN (SELECT product_key FROM p)
    GROUP BY customer_id
    HAVING COUNT(DISTINCT product_key) = (SELECT COUNT(DISTINCT product_key) FROM p)
    """
).show()

+-----------+
|customer_id|
+-----------+
|          1|
|          3|
+-----------+



In [0]:
# Shut down the Spark session created at the top of the notebook; no further
# Spark work can be run through `spark` after this call.
spark.stop()