In [63]:
from pathlib import Path

import polars as pl
import plotly.express as px


In [66]:
# Directory containing the parquet inputs, relative to this notebook.
data_path = Path("../data/music_recsys")

# Event log and the user-id list (going by the file names, the users a
# submission has to cover — TODO confirm against the task description).
train_events = pl.read_parquet(data_path.joinpath("train_events.parquet"))
users_for_submission = pl.read_parquet(
    data_path.joinpath("users_for_submission.parquet")
)


### Events EDA

In [67]:
# Preview the raw events frame; the notebook renders the cell's last expression.
train_events


user_id,track_id,session_id,datetime,play_ratio
i32,i32,i32,datetime[ms],f32
1525056,32483,11506051,2023-04-24 20:41:06.384,0.063927
5168988,391065,3570201,2023-01-24 19:22:04.889,0.242604
2266755,1466076,5553461,2023-02-08 15:03:09.984,1.0
2266755,616270,5553461,2023-02-08 15:08:40.898,1.0
2266755,618303,5553461,2023-02-08 15:09:11.060,0.164223
2266755,1334259,5553461,2023-02-08 15:14:42.319,1.0
2266755,780374,5553461,2023-02-08 15:14:42.343,0.086956
2266755,1353091,5553461,2023-02-08 15:19:42.482,1.0
2266755,299471,5553461,2023-02-08 15:20:03.463,0.432099
2266755,476055,5553461,2023-02-08 15:26:14.366,1.0


In [68]:
# Cardinality of the two id columns in the training events.
unique_user_count = train_events["user_id"].n_unique()
unique_track_count = train_events["track_id"].n_unique()

print("Unique users:", unique_user_count)
print("Unique tracks:", unique_track_count)


Unique users: 12150
Unique tracks: 115648


In [69]:
# Per-user listening volume: total number of events (`n_tracks`) and number of
# distinct tracks (`n_unique_tracks`).
tracks_per_user = (
    train_events
    .group_by("user_id")
    .agg(
        pl.col("track_id").len().alias("n_tracks"),
        pl.col("track_id").n_unique().alias("n_unique_tracks"),
    )
)

# Distribution summary of events per user. The deciles are printed from a loop
# instead of nine copy-pasted lines; the printed text is unchanged.
listens_per_user = tracks_per_user["n_tracks"]

print("Min tracks per user:", listens_per_user.min())
print("Max tracks per user:", listens_per_user.max())
for pct in range(10, 100, 10):
    print(f"{pct}% percentile tracks per user:", listens_per_user.quantile(pct / 100))
print("Mean tracks per user:", listens_per_user.mean())


Min tracks per user: 11
Max tracks per user: 1994
10% percentile tracks per user: 157.0
20% percentile tracks per user: 209.0
30% percentile tracks per user: 264.0
40% percentile tracks per user: 328.0
50% percentile tracks per user: 413.0
60% percentile tracks per user: 521.0
70% percentile tracks per user: 666.0
80% percentile tracks per user: 886.0
90% percentile tracks per user: 1226.0
Mean tracks per user: 560.2467489711934


In [70]:
# Distribution summary of *distinct* tracks per user. The labels say "unique"
# explicitly so this output cannot be confused with the total-events summary
# computed from the `n_tracks` column.
unique_tracks = tracks_per_user["n_unique_tracks"]

print("Min unique tracks per user:", unique_tracks.min())
print("Max unique tracks per user:", unique_tracks.max())
for pct in range(10, 100, 10):
    print(f"{pct}% percentile unique tracks per user:", unique_tracks.quantile(pct / 100))
print("Mean unique tracks per user:", unique_tracks.mean())


Min tracks per user: 10
Max tracks per user: 995
10% percentile tracks per user: 106.0
20% percentile tracks per user: 124.0
30% percentile tracks per user: 147.0
40% percentile tracks per user: 176.0
50% percentile tracks per user: 207.0
60% percentile tracks per user: 248.0
70% percentile tracks per user: 303.0
80% percentile tracks per user: 380.0
90% percentile tracks per user: 511.0
Mean tracks per user: 263.9962962962963


In [71]:
# Histogram of how many distinct tracks each user listened to; the title and
# axis label let the figure stand on its own when the notebook is skimmed.
px.histogram(
    tracks_per_user,
    x="n_unique_tracks",
    title="Distinct tracks per user",
    labels={"n_unique_tracks": "Distinct tracks per user"},
)


In [72]:
# Basic summary statistics of the `play_ratio` column.
play_ratio = train_events["play_ratio"]

print("Min play ratio:", play_ratio.min())
print("Max play ratio:", play_ratio.max())
print("Mean play ratio:", play_ratio.mean())
print("Median play ratio:", play_ratio.median())


Min play ratio: 0.0
Max play ratio: 1.0
Mean play ratio: 0.5213189727350958
Median play ratio: 0.4944751262664795


In [73]:
# Time span covered by the training events.
event_times = train_events["datetime"]

print("Min datetime:", event_times.min())
print("Max datetime:", event_times.max())


Min datetime: 2023-01-15 00:00:12.240000
Max datetime: 2023-05-14 20:50:11.164000


In [74]:
# Daily activity: number of events per calendar date, sorted chronologically.
# The `n_tracks` column name is kept as-is in case later cells rely on it.
events_per_day = (
    train_events
    .with_columns(pl.col("datetime").dt.date().alias("date"))
    .group_by("date")
    .agg(pl.col("track_id").count().alias("n_tracks"))
    .sort("date")
)

# `n_tracks` counts events, not distinct tracks, so the axis is relabelled
# accordingly and the figure is given a title.
px.line(
    events_per_day,
    x="date",
    y="n_tracks",
    title="Listening events per day",
    labels={"date": "Date", "n_tracks": "Events"},
)


In [75]:
# Number of distinct session ids observed for each user.
sessions_per_user = (
    train_events
    .group_by("user_id")
    .agg(pl.col("session_id").n_unique().alias("n_sessions"))
)

session_counts = sessions_per_user["n_sessions"]

print("Min sessions per user:", session_counts.min())
print("Max sessions per user:", session_counts.max())
print("Median sessions per user:", session_counts.median())
print("Mean sessions per user:", session_counts.mean())


Min sessions per user: 1
Max sessions per user: 385
Median sessions per user: 23.0
Mean sessions per user: 34.33481481481481


In [53]:
# Preview the submission user list; the notebook renders the cell's last expression.
users_for_submission


user_id
i32
5453460
4884983
9103259
7586513
7375885
3635788
4972399
9729159
9090010
7573288


In [76]:
# Compare the user ids present in the training events with those in the
# submission list, using plain Python sets.
train_users = set(train_events["user_id"].unique())
test_users = set(users_for_submission["user_id"].unique())

shared_users = train_users.intersection(test_users)
users_without_history = test_users.difference(train_users)

print("Unique users in train:", len(train_users))
print("Unique users in test:", len(test_users))
print("Unique users in train & test:", len(shared_users))
print("Unique users missing in train:", len(users_without_history))


Unique users in train: 12150
Unique users in test: 13500
Unique users in train & test: 12150
Unique users missing in train: 1350


In [55]:
# Load the (user_id, track_id) ground-truth pairs and preview them.
ground_truth_path = data_path.joinpath("ground_truth.parquet")
ground_truth = pl.read_parquet(ground_truth_path)
ground_truth


user_id,track_id
i32,i32
8400876,705760
7594304,449496
9822635,761855
3618658,1471128
2602202,240273
9428911,871422
8882768,1035432
2699666,571663
3424650,264672
2349340,345922


In [77]:
# Distinct (user, track) pairs seen in training, tagged with a constant flag.
seen_in_train = (
    train_events[["user_id", "track_id"]]
    .unique()
    .with_columns(pl.lit(1).alias("was_in_train"))
)

# Left-join the flag onto the ground truth. Pairs with no match in training
# come back null and are filled with 0, so `was_in_train` is a 0/1 indicator.
gt_with_tracks_from_train = ground_truth.join(
    seen_in_train,
    on=["user_id", "track_id"],
    how="left",
).with_columns(pl.col("was_in_train").fill_null(0))

gt_with_tracks_from_train


user_id,track_id,was_in_train
i32,i32,i32
8400876,705760,0
7594304,449496,0
9822635,761855,0
3618658,1471128,0
2602202,240273,0
9428911,871422,0
8882768,1035432,0
2699666,571663,0
3424650,264672,0
2349340,345922,0


In [78]:
# Share of ground-truth (user, track) pairs that already occur in that user's
# training events. The value is about 0.5%, so nearly all target tracks are
# ones the user has not listened to in the training data.
gt_with_tracks_from_train["was_in_train"].mean()


0.005273686124749955