In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructField,StructType,StringType,DateType
from pyspark.sql.functions import col,collect_set,array_join,sort_array,count_distinct
from datetime import datetime

# Create (or reuse, if one already exists) a Spark session named "app"
# with the master set to "local[2]".
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[2]")
    .getOrCreate()
)

In [0]:
# Two-column schema; both fields are declared non-nullable.
schema = (
    StructType()
    .add("sell_date", DateType(), False)
    .add("product", StringType(), False)
)

# Sample sales rows as (sell_date, product).
# Note that (2020-06-02, "Mask") appears twice.
data = [
    (datetime(2020, 5, 30), "Headphone"),
    (datetime(2020, 6, 1), "Pencil"),
    (datetime(2020, 6, 2), "Mask"),
    (datetime(2020, 5, 30), "Basketball"),
    (datetime(2020, 6, 1), "Bible"),
    (datetime(2020, 6, 2), "Mask"),
    (datetime(2020, 5, 30), "T-Shirt"),
]

# Build the DataFrame used by the queries below and print its rows.
activities = spark.createDataFrame(data, schema)
activities.show()

+----------+----------+
| sell_date|   product|
+----------+----------+
|2020-05-30| Headphone|
|2020-06-01|    Pencil|
|2020-06-02|      Mask|
|2020-05-30|Basketball|
|2020-06-01|     Bible|
|2020-06-02|      Mask|
|2020-05-30|   T-Shirt|
+----------+----------+



In [0]:
# For each date, find the number of distinct products sold and their names. The names of the products sold on each date should be sorted lexicographically.
# Return the result table ordered by sell_date.

# Per sell_date: count the distinct products and list their names as a single
# comma-separated string in lexicographic order, then order rows by sell_date.
#
# The input is deliberately NOT pre-sorted by product: collect_set makes no
# promise about element order, so sort_array is what actually establishes the
# lexicographic order. A sort ahead of groupBy would only add extra work.
(
    activities
    .groupBy("sell_date")
    .agg(
        count_distinct("product").alias("num_sold"),
        array_join(
            sort_array(collect_set(col("product"))),
            delimiter=",",
        ).alias("products"),
    )
    .orderBy("sell_date")
    .show(truncate=False)
)

+----------+--------+----------------------------+
|sell_date |num_sold|products                    |
+----------+--------+----------------------------+
|2020-05-30|3       |Basketball,Headphone,T-Shirt|
|2020-06-01|2       |Bible,Pencil                |
|2020-06-02|1       |Mask                        |
+----------+--------+----------------------------+



In [0]:
# Same report written in Spark SQL: expose the DataFrame as temp view "a",
# then aggregate per sell_date. "group by 1" / "order by 1" refer to the first
# select column (sell_date).
activities.createOrReplaceTempView("a")
spark.sql(
    # Adjacent string literals are joined into one single-line query.
    "select sell_date,count(distinct product) as num_sold, "
    "array_join(sort_array(collect_set(product)),',') as products "
    "from a group by 1 order by 1"
).show(truncate=False)

+----------+--------+----------------------------+
|sell_date |num_sold|products                    |
+----------+--------+----------------------------+
|2020-05-30|3       |Basketball,Headphone,T-Shirt|
|2020-06-01|2       |Bible,Pencil                |
|2020-06-02|1       |Mask                        |
+----------+--------+----------------------------+



In [0]:
# Stop the Spark session now that all results have been displayed.
spark.stop()