In [1]:
from pathlib import Path

import polars as pl
import numpy as np

from my_recsys_metrics import compute_metrics
from my_utils import make_submission


In [2]:
data_path = Path("../data/music_recsys")
train_events = pl.read_parquet(data_path / "train_events.parquet")
users_for_submission = pl.read_parquet(data_path / "users_for_submission.parquet")


In [3]:

def top_personal(events: pl.DataFrame) -> pl.DataFrame:
    user_track_scores = (
        events
        .group_by("user_id", "track_id")
        .agg(pl.col("datetime").count().alias("n_track_listen"))
        .with_columns(pl.col("n_track_listen").sum().over("user_id").alias("n_listen"))
        .with_columns((pl.col("n_track_listen") / pl.col("n_listen")).alias("score"))
        .drop("n_track_listen", "n_listen")
    )

    top_personal_tracks = (
        user_track_scores
        .group_by("user_id")
        .agg(pl.col("track_id", "score").sort_by("score", descending=True).head(10))
        .explode(columns=["track_id", "score"])
    )

    return top_personal_tracks


top_personal_recommendations = top_personal(train_events)
top_personal_recommendations


user_id,track_id,score
i32,i32,f64
5797298,1118041,0.008785
5797298,211773,0.008785
5797298,884390,0.007321
5797298,996261,0.007321
5797298,715712,0.007321
5797298,1394130,0.007321
5797298,1174693,0.007321
5797298,683075,0.007321
5797298,523823,0.005857
5797298,304548,0.005857


In [4]:
top10_tracks = (
    train_events
    .group_by("track_id")
    .agg(pl.col("user_id").count().alias("score"))
    .top_k(10, by="score")
    .with_columns(pl.col("score") / len(train_events))
)

def populate_tracks_to_users(tracks: pl.DataFrame, users: pl.DataFrame) -> pl.DataFrame:
    user_ids = users["user_id"].unique().to_numpy()
    track_ids = tracks["track_id"].to_numpy()
    scores = tracks["score"].to_numpy()

    result = pl.DataFrame({
        "user_id": np.repeat(user_ids, len(tracks)),
        "track_id": np.tile(track_ids, len(user_ids)),
        "score": np.tile(scores, len(user_ids)),
    })

    return result

toppop_recommendations = populate_tracks_to_users(top10_tracks, users_for_submission)
toppop_recommendations


user_id,track_id,score
i32,i32,f64
1000736,634651,0.002666
1000736,811300,0.002355
1000736,44204,0.002099
1000736,265134,0.0019
1000736,1355970,0.001844
1000736,1133665,0.001747
1000736,412548,0.001706
1000736,647096,0.001648
1000736,278845,0.001629
1000736,322362,0.00154


In [5]:
mixed_recommendations = pl.concat([
    top_personal_recommendations,
    toppop_recommendations,
])

final_recommendations = (
    mixed_recommendations
    .group_by("user_id")
    .agg(pl.col("track_id", "score").sort_by("score", descending=True).head(10))
    .explode(columns=["track_id", "score"])
)

final_recommendations


user_id,track_id,score
i32,i32,f64
8485853,691646,0.028736
8485853,634651,0.028736
8485853,593014,0.028736
8485853,1336314,0.017241
8485853,1027915,0.017241
8485853,562894,0.017241
8485853,168816,0.017241
8485853,186769,0.017241
8485853,540364,0.017241
8485853,1187897,0.017241


In [6]:
toppersonal_submission = make_submission(final_recommendations)
compute_metrics(toppersonal_submission, pl.read_parquet(data_path / "ground_truth.parquet"))


{'ndcg@10': 0.0038704079923490963, 'recall@10': 0.0072383756594282915}