In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField,DateType,IntegerType
from pyspark.sql.functions import col,lead,min,when,datediff,sum,round,count_distinct
from pyspark.sql.window import Window
from datetime import datetime

# Create (or reuse) the notebook-wide Spark session, running locally via the
# "local[2]" master under the application name "app".
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[2]")
    .getOrCreate()
)

In [0]:
# Schema of the activity table: four columns, all declared non-nullable.
schema = (
    StructType()
    .add("player_id", IntegerType(), False)
    .add("device_id", IntegerType(), False)
    .add("event_date", DateType(), False)
    .add("games_played", IntegerType(), False)
)

# Sample rows in schema order: (player_id, device_id, event_date, games_played).
data = [
    (1, 2, datetime(2016, 3, 1), 5),
    (1, 2, datetime(2016, 3, 2), 6),
    (2, 3, datetime(2017, 6, 25), 1),
    (3, 1, datetime(2016, 3, 2), 0),
    (3, 4, datetime(2018, 7, 3), 5),
]

# Build the DataFrame used by the solutions below and preview it.
games = spark.createDataFrame(data, schema)
games.show()

+---------+---------+----------+------------+
|player_id|device_id|event_date|games_played|
+---------+---------+----------+------------+
|        1|        2|2016-03-01|           5|
|        1|        2|2016-03-02|           6|
|        2|        3|2017-06-25|           1|
|        3|        1|2016-03-02|           0|
|        3|        4|2018-07-03|           5|
+---------+---------+----------+------------+



In [0]:
# Problem: report the fraction of players that logged in again on the day
# after the day they first logged in, rounded to 2 decimal places.
# In other words, count the players that logged in on at least two consecutive
# days starting from their first login date, then divide that number by the
# total number of players.
#
# NOTE: `min` and `round` below are the pyspark.sql.functions versions imported
# at the top of the notebook, not the Python builtins they shadow.

# Ordered ascending by event_date, the running min() over this window is the
# player's earliest event_date on every one of that player's rows.
window_spec = Window.partitionBy("player_id").orderBy("event_date")

(
    games
    # The window function must be materialised as a column first; it cannot be
    # nested directly inside the aggregates in the select below.
    .withColumn("first_login", min("event_date").over(window_spec))
    .select(
        round(
            # Count distinct players (not rows) that have a login exactly one
            # day after their first login: when() without otherwise() yields
            # NULL for non-matching rows, which count_distinct ignores. This
            # keeps a player with several rows on that day (e.g. multiple
            # devices) from being counted more than once.
            count_distinct(
                when(
                    datediff(col("event_date"), col("first_login")) == 1,
                    col("player_id"),
                )
            )
            / count_distinct("player_id"),
            2,
        ).alias("fraction")
    )
    .show()
)


+--------+
|fraction|
+--------+
|    0.33|
+--------+



In [0]:

# Same problem solved with Spark SQL: fraction of players that logged in again
# on the day after their first login, rounded to 2 decimal places.
games.createOrReplaceTempView("games")

# - The CTE attaches each player's first login date to every one of their rows.
# - The numerator counts distinct players (not rows) with a login exactly one
#   day after first_login; CASE without ELSE yields NULL, which
#   count(distinct ...) ignores, so duplicate rows for that day cannot inflate
#   the result.
# - date_add(start_date, num_days) is used instead of the newer two-argument
#   dateadd alias so the query also runs on older Spark releases.
spark.sql("""
    with cte as (
        select player_id,
               event_date,
               min(event_date) over (partition by player_id order by event_date) as first_login
        from games
    )
    select round(
               count(distinct case when date_add(first_login, 1) = event_date then player_id end)
               / count(distinct player_id),
               2
           ) as fraction
    from cte
""").show()

+--------+
|fraction|
+--------+
|    0.33|
+--------+



In [0]:
# Shut down the Spark session created earlier in this notebook now that all
# queries have been run.
spark.stop()