In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructField,StructType,StringType,IntegerType,DateType
from pyspark.sql.functions import col,countDistinct
from datetime import datetime

# Start (or reuse) a local Spark session that runs on three worker threads.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("app")
    .getOrCreate()
)

In [0]:
# Every column of the sample table is declared non-nullable.
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in [
        ("date_id", DateType()),
        ("make_name", StringType()),
        ("lead_id", IntegerType()),
        ("partner_id", IntegerType()),
    ]
])

# Sample rows laid out as (date_id, make_name, lead_id, partner_id).
data = [
    (datetime(2020, 12, 8), "toyota", 0, 1),
    (datetime(2020, 12, 8), "toyota", 1, 0),
    (datetime(2020, 12, 8), "toyota", 1, 2),
    (datetime(2020, 12, 7), "toyota", 0, 2),
    (datetime(2020, 12, 7), "toyota", 0, 1),
    (datetime(2020, 12, 8), "honda", 1, 2),
    (datetime(2020, 12, 8), "honda", 2, 1),
    (datetime(2020, 12, 7), "honda", 0, 1),
    (datetime(2020, 12, 7), "honda", 1, 2),
    (datetime(2020, 12, 7), "honda", 2, 1),
]

# Build the DataFrame with the explicit schema and print it.
sales = spark.createDataFrame(data, schema)
sales.show()


+----------+---------+-------+----------+
|   date_id|make_name|lead_id|partner_id|
+----------+---------+-------+----------+
|2020-12-08|   toyota|      0|         1|
|2020-12-08|   toyota|      1|         0|
|2020-12-08|   toyota|      1|         2|
|2020-12-07|   toyota|      0|         2|
|2020-12-07|   toyota|      0|         1|
|2020-12-08|    honda|      1|         2|
|2020-12-08|    honda|      2|         1|
|2020-12-07|    honda|      0|         1|
|2020-12-07|    honda|      1|         2|
|2020-12-07|    honda|      2|         1|
+----------+---------+-------+----------+



In [0]:
# Count the distinct lead_id values and the distinct partner_id values seen
# for every (date_id, make_name) pair. The row order of the result is not
# significant.
(
    sales
    .groupBy("date_id", "make_name")
    .agg(
        countDistinct("lead_id").alias("unique_leads"),
        countDistinct("partner_id").alias("unique_partners"),
    )
    .show()
)

+----------+---------+------------+---------------+
|   date_id|make_name|unique_leads|unique_partners|
+----------+---------+------------+---------------+
|2020-12-07|    honda|           3|              2|
|2020-12-08|   toyota|           2|              3|
|2020-12-08|    honda|           2|              2|
|2020-12-07|   toyota|           1|              2|
+----------+---------+------------+---------------+



In [0]:
# Spark SQL version: count the distinct leads and distinct partners per
# (date_id, make_name) pair.
# The grouping columns are named explicitly rather than referenced by ordinal
# position ("group by 1,2"), so the query keeps its meaning if the select list
# is reordered and does not rely on ordinal grouping being enabled in the
# session configuration.
sales.createOrReplaceTempView("s")
spark.sql(
    """
    SELECT date_id,
           make_name,
           COUNT(DISTINCT lead_id)    AS unique_leads,
           COUNT(DISTINCT partner_id) AS unique_partners
    FROM s
    GROUP BY date_id, make_name
    """
).show()

+----------+---------+------------+---------------+
|   date_id|make_name|unique_leads|unique_partners|
+----------+---------+------------+---------------+
|2020-12-07|    honda|           3|              2|
|2020-12-08|   toyota|           2|              3|
|2020-12-08|    honda|           2|              2|
|2020-12-07|   toyota|           1|              2|
+----------+---------+------------+---------------+



In [0]:
# Stop the Spark session that was created at the top of the notebook.
spark.stop()