#Problem Statement
questions:
--Game Play Analysis 

--q1: Write an SQL query that reports the first login date for each player

--q2: Write an SQL query that reports the device each player used for their first login

--q3: Write an SQL query that reports, for each player and date, how many games the player has played so far.
--That is, the total number of games played by the player up to and including that date.

--q4: Write an SQL query that reports the fraction of players that logged in again
--on the day after the day they first logged in, rounded to 2 decimal places

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from datetime import date

# Start (or reuse) the Spark session that backs every cell in this notebook
spark = SparkSession.builder.appName("Create Activity Table").getOrCreate()

# Sample activity log, one tuple per login:
# (player_id, device_id, event_date, games_played)
activity_records = [
    (1, 2, date(2016, 3, 1), 5),
    (1, 2, date(2016, 3, 2), 6),
    (2, 3, date(2017, 6, 25), 1),
    (3, 1, date(2016, 3, 2), 0),
    (3, 4, date(2018, 7, 3), 5),
]

# Wrap each tuple in a Row so Spark picks up the column names and infers
# the column types from the values
data = [
    Row(player_id=player, device_id=device, event_date=day, games_played=games)
    for player, device, day, games in activity_records
]

# Build the DataFrame that all later cells query
df = spark.createDataFrame(data)

# Print the inferred schema, then render the rows
df.printSchema()
df.display()

root
 |-- player_id: long (nullable = true)
 |-- device_id: long (nullable = true)
 |-- event_date: date (nullable = true)
 |-- games_played: long (nullable = true)



player_id,device_id,event_date,games_played
1,2,2016-03-01,5
1,2,2016-03-02,6
2,3,2017-06-25,1
3,1,2016-03-02,0
3,4,2018-07-03,5


In [0]:
# Register the DataFrame as a temporary view so the %sql cells below can
# query it by name
activity_view_name = "activity"
df.createOrReplaceTempView(activity_view_name)

--q1: Write an SQL query that reports the first login date for each player

In [0]:
# Import Spark's aggregate under an alias: a bare `min` import would shadow
# Python's built-in min() for every cell that runs afterwards in this session
# (same convention as the `_sum` alias used for the running-total query).
from pyspark.sql.functions import min as _min

# One row per player; the earliest event_date is that player's first login
first_login_df = df.groupBy("player_id").agg(
    _min("event_date").alias("first_login_date")
)

# Show the result
first_login_df.display()

player_id,first_login_date
1,2016-03-01
2,2017-06-25
3,2016-03-02


In [0]:
%sql
-- The earliest event_date per player is that player's first login
select a.player_id,
       min(a.event_date) as first_login_date
from activity as a
group by a.player_id;

player_id,first_login_date
1,2016-03-01
2,2017-06-25
3,2016-03-02


--q2: Write an SQL query that reports the device each player used for their first login

In [0]:
%sql
-- Step 1: find each player's earliest login date
WITH first_login AS (
  SELECT
    player_id,
    MIN(event_date) AS first_login_date
  FROM activity
  GROUP BY player_id
)
-- Step 2: keep the activity row recorded on that date; its device_id is the
-- device used for the first login
SELECT
  act.*
FROM activity AS act
INNER JOIN first_login AS fl
  ON fl.player_id = act.player_id
  AND fl.first_login_date = act.event_date

player_id,device_id,event_date,games_played
1,2,2016-03-01,5
2,3,2017-06-25,1
3,1,2016-03-02,0


In [0]:
from pyspark.sql import Row
# `min` is aliased so it does not shadow Python's built-in min() for the
# cells that run after this one
from pyspark.sql.functions import col, min as _min
from datetime import date

# Earliest login date per player
first_login_df = df.groupBy("player_id").agg(
    _min("event_date").alias("First_Login_Date")
)

# Join back to the activity rows so only the row recorded on the first login
# date survives; its device_id is the device used for the first login.
# select(df["*"]) drops the join helper columns again.
result_df = df.join(
    first_login_df,
    (df.player_id == first_login_df.player_id)
    & (df.event_date == first_login_df.First_Login_Date),
).select(df["*"])


result_df.display()

player_id,device_id,event_date,games_played
1,2,2016-03-01,5
2,3,2017-06-25,1
3,1,2016-03-02,0


In [0]:
%sql
-- First-login row per player: join activity to a derived table holding each
-- player's earliest event_date and keep only the rows that match it
select
  a.*
from
  activity a
  inner join (
    select
      player_id,
      min(event_date) as first_login_date
    from
      activity
    group by
      player_id
  ) earliest on earliest.player_id = a.player_id
  and earliest.first_login_date = a.event_date

player_id,device_id,event_date,games_played
1,2,2016-03-01,5
2,3,2017-06-25,1
3,1,2016-03-02,0


--q3: Write an SQL query that reports, for each player and date, how many games the player has played so far. That is, the total number of games played by the player up to and including that date.

In [0]:
from pyspark.sql import Row
from pyspark.sql.functions import sum as _sum
from pyspark.sql.window import Window
from datetime import date

# Per-player window ordered by date; with an ordering present, the aggregate
# accumulates from the player's first row up to the current row
window_spec = Window.partitionBy("player_id").orderBy("event_date")

# Running sum of games_played, exposed as an extra column
running_total = (
    _sum("games_played").over(window_spec).alias("total_number_of_games")
)

# Keep every original column and append the running total
df_with_total = df.select("*", running_total)

# Render the result
df_with_total.display()

player_id,device_id,event_date,games_played,total_number_of_games
1,2,2016-03-01,5,5
1,2,2016-03-02,6,11
2,3,2017-06-25,1,1
3,1,2016-03-02,0,0
3,4,2018-07-03,5,5


In [0]:
%sql
-- Running total of games per player, accumulated in event_date order
SELECT
  act.*,
  SUM(act.games_played) OVER (
    PARTITION BY act.player_id
    ORDER BY act.event_date
  ) AS total_number_of_games
FROM activity AS act

player_id,device_id,event_date,games_played,total_number_of_games
1,2,2016-03-01,5,5
1,2,2016-03-02,6,11
2,3,2017-06-25,1,1
3,1,2016-03-02,0,0
3,4,2018-07-03,5,5


--q4: Write an SQL query that reports the fraction of players that logged in again on the day after the day they first logged in, rounded to 2 decimal places

In [0]:
from pyspark.sql import Row
# Aggregates that collide with Python built-ins (min, max, round) are aliased
# so they do not shadow the built-ins for the rest of the session
from pyspark.sql.functions import (
    avg,
    col,
    date_add,
    max as _max,
    min as _min,
    round as _round,
    when,
)
from pyspark.sql.window import Window
from datetime import date

# A player only counts if they logged in exactly one day after their FIRST
# login. Comparing each login with the previous one (lag) would also count
# consecutive-day logins that happen later in the player's history.

# Attach each player's first login date to every one of their rows
window_spec = Window.partitionBy("player_id")
df_with_first_login = df.withColumn(
    "first_login_date", _min("event_date").over(window_spec)
)

# Flag rows that fall on first_login_date + 1 day
returned_next_day = when(
    col("event_date") == date_add(col("first_login_date"), 1), 1
).otherwise(0)

# Collapse to one row per player: 1 if any of their logins is flagged, else 0
player_retention = df_with_first_login.groupBy("player_id").agg(
    _max(returned_next_day).alias("returned_next_day")
)

# The mean of the 0/1 flag is the fraction of returning players; it is
# reported here as a percentage (fraction * 100) rounded to 2 decimals.
# Drop the "* 100" to get the plain 0-1 fraction instead.
# Doing the division inside Spark yields NULL for an empty table rather than
# raising ZeroDivisionError in Python.
percentage_rounded = player_retention.agg(
    _round(avg("returned_next_day") * 100, 2).alias("percentage")
)

# Show the result
percentage_rounded.show()


+----------+
|percentage|
+----------+
|     33.33|
+----------+



In [0]:
%sql
-- Percentage of players who logged in again on the day after their first login.
--   * The comparison is anchored to each player's FIRST login date, not to any
--     pair of consecutive logins.
--   * date_add compares full dates, so month and year boundaries are handled
--     (comparing only the day-of-month would not).
--   * Distinct player_ids are counted, so the numerator can exceed 1.
--   * nullif turns an empty table into NULL instead of a division by zero.
-- The result is the fraction expressed as a percentage (fraction * 100),
-- rounded to 2 decimal places.
with first_login as (
  select
    player_id,
    min(event_date) as first_login_date
  from
    activity
  group by
    player_id
)
select
  round(
    count(distinct a.player_id) / nullif(count(distinct f.player_id), 0) * 100.0,
    2
  ) as percentage
from
  first_login f
  -- left join keeps players with no next-day login in the denominator
  left join activity a on a.player_id = f.player_id
  and a.event_date = date_add(f.first_login_date, 1)

percentage
33.33
