In [0]:
### The Challenge

# Your task is to process the input raw data and generate an output that can be used to analyse some engagement metrics. In particular, with the output you will generate, we want to be able to measure:

# - the number of activities played
# - the time spent playing
# - the most played activities, both in terms of number of times and total duration
# - the completion rate of activities

# On top of that:

# - we want to be able to analyse data at daily, weekly, monthly and yearly granularity.
# - we want to be able to slice the data by several dimensions:
#   - country
#   - OS of the device (type and version) and App version
#   - subscription status: free, trial, subscribed, lapsed
#   - activity type and collection
# - we are not interested in being able to analyse every single event or the behaviour of individual users.



In [0]:
from pyspark.sql import functions as F
from datetime import datetime

# Build the gold-layer engagement aggregate from the silver tables.
# Output grain: one row per (date, quarter, year, activity, OS name, country),
# carrying the number of plays, the number of incomplete plays and the summed
# duration, plus an `updated_at` load timestamp.

# Load silver tables
events_data_df = spark.table("lingokids.silver.events_data")
activities_df = spark.table("lingokids.silver.activities")
events_df = spark.table("lingokids.silver.events")
dim_date_df = spark.table("lingokids.silver.dim_date")
events_context_df = spark.table("lingokids.silver.events_context")
users_df = spark.table("lingokids.silver.users")
dim_country_df = spark.table("lingokids.silver.dim_country")

# Joins (same join order and join types as the equivalent SQL query).
# - activities is LEFT-joined so plays whose activity_id has no match in
#   `activities` are kept; their activity name is null in the output.
# - events and dim_date are INNER-joined, so event-data rows without a matching
#   event or date row are dropped.
# - context, users and country are LEFT-joined, so missing dimension rows do not
#   drop plays; the corresponding dimension values are null instead.
joined_df = (
    events_data_df
    .join(activities_df, "activity_id", "left")
    .join(events_df, "event_id", "inner")
    .join(dim_date_df, events_df.date_key == dim_date_df.date_key, "inner")
    .join(events_context_df, "event_id", "left")
    .join(users_df, events_df.user_id == users_df.user_id, "left")
    .join(dim_country_df, users_df.country_code == dim_country_df.country_code, "left")
)

# Aggregation
# NOTE(review): the brief also asks to slice by OS version, app version,
# subscription status and activity type/collection. Those dimensions are not
# grouped here; the silver column names are not visible in this notebook, so
# confirm them and extend the groupBy if the dashboard needs those slices.
aggregated_df = (
    joined_df.groupBy(
        dim_date_df.date,
        dim_date_df.quarter,
        dim_date_df.year,
        events_data_df.activity_id,
        activities_df.name.alias("activity_name"),
        events_context_df.os_name,
        dim_country_df.country_name
    )
    .agg(
        # Rows where `completed` is null fall into the otherwise(0) branch,
        # i.e. they are not counted as incomplete.
        F.sum(F.when(F.col("completed") == False, 1).otherwise(0)).alias("number_incomplete_plays"),
        F.count("event_id").alias("number_plays_activity"),
        F.sum("duration").alias("sum_duration")
    )
    # Add updated_at timestamp.
    # Use Spark's current_timestamp() instead of a driver-side naive datetime:
    # a naive value carries no timezone information, and datetime.utcnow() is
    # deprecated since Python 3.12. Spark evaluates this once per query, so every
    # row written by a run gets the same value.
    .withColumn("updated_at", F.current_timestamp())
)

# Write to Gold in overwrite mode: the whole dashboard table is regenerated on
# every run, so nothing is appended. If the history of changes ever needs to be
# tracked, switch to append mode instead.
(
    aggregated_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("lingokids.gold.dashboard")
)



In [0]:
# %sql
# select 
#   dim_date.date,
#   dim_date.quarter,
#   dim_date.year,
#   events_data.activity_id,
#   activities.name as activity_name,
#   events_context.os_name,
#   dim_country.country_name,
# --  count(distinct events_data.activity_id) as number_of_activities_played,
#   sum(CASE WHEN events_data.completed = false THEN 1 else 0 END) AS number_incomplete_plays,
#   count(events_data.event_id) as number_plays_activity,
#   sum(duration)
# from lingokids.silver.events_data 
# left join lingokids.silver.activities 
#    on events_data.activity_id = activities.activity_id
# inner join lingokids.silver.events
#    on events_data.event_id = events.event_id
# inner join lingokids.silver.dim_date
#    on events.date_key = dim_date.date_key
# left join lingokids.silver.events_context
#    on events_data.event_id = events_context.event_id
# left join lingokids.silver.users
#    on users.user_id = events.user_id
# left join lingokids.silver.dim_country
#    on users.country_code = dim_country.country_code
# group by  dim_date.date,
#           dim_date.quarter,
#           dim_date.year,
#           events_data.activity_id,
#           activities.name,
#           events_context.os_name,
#           dim_country.country_name;



In [0]:
# Data-quality finding: the activity_ids referenced by the event data and the rows of the activities table do not fully match. Some activities that appear in the events do not exist in the activities table.