In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType,DateType,StringType
from pyspark.sql.functions import col, date_sub, lit,countDistinct,count
from datetime import datetime

# Local Spark session ("local[2]" master, application name "app");
# getOrCreate() hands back an already-active session instead of building a second one.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("app")
    .getOrCreate()
)

In [0]:
# Schema of the activity log; every column is declared non-nullable.
schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("user_id", IntegerType()),
        ("session_id", IntegerType()),
        ("activity_date", DateType()),
        ("activity_type", StringType()),
    )
])

# Sample rows: (user_id, session_id, activity_date, activity_type).
data = [
    # user 1, session 1
    (1, 1, datetime(2019, 7, 20), "open_session"),
    (1, 1, datetime(2019, 7, 20), "scroll_down"),
    (1, 1, datetime(2019, 7, 20), "end_session"),
    # user 2, session 4 (spans two days)
    (2, 4, datetime(2019, 7, 20), "open_session"),
    (2, 4, datetime(2019, 7, 21), "send_message"),
    (2, 4, datetime(2019, 7, 21), "end_session"),
    # user 3, session 2
    (3, 2, datetime(2019, 7, 21), "open_session"),
    (3, 2, datetime(2019, 7, 21), "send_message"),
    (3, 2, datetime(2019, 7, 21), "end_session"),
    # user 4, session 3 (June activity)
    (4, 3, datetime(2019, 6, 25), "open_session"),
    (4, 3, datetime(2019, 6, 25), "end_session"),
]

activity = spark.createDataFrame(data, schema)
activity.show()

+-------+----------+-------------+-------------+
|user_id|session_id|activity_date|activity_type|
+-------+----------+-------------+-------------+
|      1|         1|   2019-07-20| open_session|
|      1|         1|   2019-07-20|  scroll_down|
|      1|         1|   2019-07-20|  end_session|
|      2|         4|   2019-07-20| open_session|
|      2|         4|   2019-07-21| send_message|
|      2|         4|   2019-07-21|  end_session|
|      3|         2|   2019-07-21| open_session|
|      3|         2|   2019-07-21| send_message|
|      3|         2|   2019-07-21|  end_session|
|      4|         3|   2019-06-25| open_session|
|      4|         3|   2019-06-25|  end_session|
+-------+----------+-------------+-------------+



In [0]:
# Find the daily active user count for the 30-day period ending 2019-07-27 (inclusive).
# A user counts as active on a given day if they performed at least one activity that day.
# The result rows may be returned in any order.
PERIOD_END = "2019-07-27"
PERIOD_DAYS = 30

# `between` is inclusive on both bounds, so a 30-day window ending on PERIOD_END
# starts PERIOD_DAYS - 1 = 29 days earlier (2019-06-28). Subtracting the full 30
# would admit a 31st day (2019-06-27) into the result.
# The explicit cast keeps the comparison date-to-date instead of date-to-string.
period_end = lit(PERIOD_END).cast("date")
period_start = date_sub(period_end, PERIOD_DAYS - 1)

(
    activity
    .where(col("activity_date").between(period_start, period_end))
    .groupBy("activity_date")
    # Distinct users, not rows: one user can log several activities on the same day.
    .agg(countDistinct(col("user_id")).alias("active_users"))
    .show()
)

+-------------+------------+
|activity_date|active_users|
+-------------+------------+
|   2019-07-21|           2|
|   2019-07-20|           2|
+-------------+------------+



In [0]:
# Same daily-active-users report expressed in Spark SQL.
# The DataFrame is exposed to SQL under the temp-view name "a".
activity.createOrReplaceTempView("a")

# Adjacent literals concatenate into a single SQL statement; the window is
# date_sub('2019-07-27', 29) .. '2019-07-27', with both BETWEEN bounds inclusive.
spark.sql(
    "select activity_date,count(distinct user_id) as active_users "
    "from a "
    "where activity_date between date_sub('2019-07-27',29) and '2019-07-27' "
    "group by activity_date"
).show()

+-------------+------------+
|activity_date|active_users|
+-------------+------------+
|   2019-07-21|           2|
|   2019-07-20|           2|
+-------------+------------+



In [0]:
# Stop the SparkSession created at the top of the notebook now that all queries have run.
spark.stop()