In [24]:
from pathlib import Path
from typing import Any, Generic, List, Optional, Protocol, TypeVar

import numpy as np
import polars as pl
from implicit.nearest_neighbours import ItemItemRecommender
from numba import njit, prange
from scipy.sparse import coo_matrix, csr_matrix
from scipy.sparse.linalg import norm as sparse_norm

from my_recsys_metrics import compute_metrics
from my_utils import make_submission


In [21]:
# Directory with the parquet inputs, given relative to the notebook's working directory.
data_path = Path("../data/music_recsys")
# Interaction events; this frame is the input of the preprocessing pipeline further down.
train_events = pl.read_parquet(data_path / "train_events.parquet")
# NOTE(review): loaded here but not referenced by any other visible cell —
# presumably the list of users a submission must cover; confirm where it is used.
users_for_submission = pl.read_parquet(data_path / "users_for_submission.parquet")


In [22]:
# Type variables for the value entering (_T) and leaving (_U) a Pipeline.
_T = TypeVar("_T")
_U = TypeVar("_U")


class TransformerLike(Protocol):
    """Structural type for a pipeline step: anything exposing ``fit_transform``."""

    def fit_transform(self, input: Any) -> Any:
        """Fit on ``input`` and return its transformed version."""
        ...


class Pipeline(Generic[_T, _U]):
    """Apply a list of steps one after another, feeding each output to the next step.

    The steps are kept in ``self.transformers`` so that callers can reach a
    fitted step again after the pipeline has run.
    """

    def __init__(self, transformers: List[TransformerLike]) -> None:
        """Store the ordered list of steps; nothing is fitted at this point."""
        self.transformers = transformers

    def fit_transform(self, x: _T) -> _U:
        """Run every step's ``fit_transform`` in list order and return the last result.

        A progress line naming the step's class is printed before each step.
        With an empty step list, ``x`` is returned unchanged.
        """
        current: Any = x
        for step in self.transformers:
            step_name = step.__class__.__name__
            print(f"Fit-transform with {step_name}")
            current = step.fit_transform(current)
        return current

class OrdinalEncoder:
    """Replace the values of one column with integer codes, and map codes back.

    The code of a value is its row number among the sorted distinct values
    seen in ``fit``, stored as ``Int32``.
    """

    def __init__(self, column: str) -> None:
        """Remember which column this encoder works on."""
        self.column = column

    def fit(self, df: pl.DataFrame) -> "OrdinalEncoder":
        """Build the value -> code lookup table from ``df`` and return ``self``."""
        distinct_sorted = df[[self.column]].unique().sort(self.column)
        numbered = distinct_sorted.with_row_count("__index__")
        # Two-column lookup frame: the original value and its Int32 code.
        self._mapper = numbered.with_columns(pl.col("__index__").cast(pl.Int32))
        return self

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return ``df`` with the column's values replaced by their codes.

        The lookup is a left join, so every input row is kept; the original
        column is dropped and the code column takes over its name.
        """
        with_codes = df.join(self._mapper, on=self.column, how="left")
        return with_codes.drop(self.column).rename({"__index__": self.column})

    def inverse_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return ``df`` with codes in the column replaced by the original values."""
        keyed_by_code = df.rename({self.column: "__index__"})
        with_values = keyed_by_code.join(self._mapper, on="__index__", how="left")
        return with_values.drop("__index__")

    def fit_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fit the lookup table on ``df`` and immediately encode the same frame."""
        self.fit(df)
        return self.transform(df)


class FilterByPlayRatio:
    """Keep only the events whose play ratio is strictly above a threshold.

    Args:
        column: Name of the column holding the play ratio.
        min_ratio: Exclusive lower bound; rows with ``column <= min_ratio`` are removed.

    The defaults reproduce the previously hard-coded behaviour
    (``play_ratio > 0.3``), so ``FilterByPlayRatio()`` is unchanged.
    """

    def __init__(self, column: str = "play_ratio", min_ratio: float = 0.3) -> None:
        self.column = column
        self.min_ratio = min_ratio

    def fit_transform(self, events: pl.DataFrame) -> pl.DataFrame:
        """Return the rows of ``events`` with ``column`` strictly greater than ``min_ratio``."""
        return events.filter(pl.col(self.column) > self.min_ratio)


class FrequencyEncoder:
    """Turn raw events into one score per (user, item) pair.

    The score is the pair's event count divided by the sum of the counts of
    all pairs belonging to the same user.
    """

    def __init__(self, user_column: str, item_column: str, value_column: str) -> None:
        """Store the user/item column names and the name of the score column to create."""
        self.user_column = user_column
        self.item_column = item_column
        self.value_column = value_column

    def fit_transform(self, events: pl.DataFrame) -> pl.DataFrame:
        """Return a frame with the user column, the item column and the score column."""
        # Number of events for every (user, item) pair.
        pair_counts = events.group_by(self.user_column, self.item_column).agg(
            pl.col(self.item_column).count().alias("n_interactions_per_user")
        )

        # Per-user total of those counts, repeated on each row of the user.
        user_total = pl.col("n_interactions_per_user").sum().over(self.user_column)
        with_totals = pair_counts.with_columns(user_total.alias("n_interactions_total"))

        # Share of the user's events that went to this item.
        share = pl.col("n_interactions_per_user") / pl.col("n_interactions_total")
        scored = with_totals.with_columns(share.alias(self.value_column))

        # The helper count columns are not part of the result.
        return scored.drop("n_interactions_per_user", "n_interactions_total")


class CSRConverter:
    """Build a user x item CSR matrix from a long (user, item, value) frame.

    Args:
        user_column: Column with integer row (user) indices.
        item_column: Column with integer column (item) indices.
        value_column: Column with the matrix values; stored as float32.
        n_users: Optional number of rows. When ``None`` (default) it is
            inferred as ``max(user index) + 1``.
        n_items: Optional number of columns. When ``None`` (default) it is
            inferred as ``max(item index) + 1``.

    Passing ``n_users`` / ``n_items`` explicitly keeps the matrix aligned with
    an index space that was defined earlier (e.g. encoders fitted before rows
    were filtered out); otherwise trailing users/items without any remaining
    row silently shrink the matrix.
    """

    def __init__(
        self,
        user_column: str,
        item_column: str,
        value_column: str,
        n_users: Optional[int] = None,
        n_items: Optional[int] = None,
    ) -> None:
        self.user_column = user_column
        self.item_column = item_column
        self.value_column = value_column
        self.n_users = n_users
        self.n_items = n_items

    @staticmethod
    def _axis_size(indices: np.ndarray, explicit_size: Optional[int]) -> int:
        """Return the explicit size if given, else ``max(indices) + 1`` (0 when empty)."""
        if explicit_size is not None:
            return int(explicit_size)
        # `.max()` raises on a zero-length array, so an empty input yields size 0.
        if indices.size == 0:
            return 0
        return int(indices.max()) + 1

    def fit_transform(self, coo: pl.DataFrame) -> csr_matrix:
        """Convert ``coo`` to a float32 CSR matrix of shape ``(n_users, n_items)``.

        Values of repeated (user, item) pairs are summed. An empty frame gives
        an empty matrix instead of an error.
        """
        user_idx = coo[self.user_column].to_numpy()
        item_idx = coo[self.item_column].to_numpy()
        values = coo[self.value_column].to_numpy()

        n_users = self._axis_size(user_idx, self.n_users)
        n_items = self._axis_size(item_idx, self.n_items)

        user_item_coo = coo_matrix(
            (
                values.astype(np.float32),
                (user_idx, item_idx),
            ),
            shape=(n_users, n_items),
            dtype=np.float32,
        )

        # Merge repeated (user, item) entries by adding their values.
        user_item_coo.sum_duplicates()

        return user_item_coo.tocsr()



In [23]:
# Preprocessing from raw events to a user x item matrix. Steps run in list order:
#   1-2. encode user_id / track_id as integer indices; both encoders are fitted on
#        the unfiltered events, i.e. before FilterByPlayRatio removes rows;
#   3.   drop events with a low play ratio;
#   4.   compute "freq": each track's share of the user's remaining events;
#   5.   pack (user_id, track_id, freq) into a CSR matrix.
# The positions of the two encoders matter: they are fetched by index later on
# to decode the recommendations.
events_preprocessing_pipeline: Pipeline[pl.DataFrame, csr_matrix] = Pipeline([
    OrdinalEncoder(column="user_id"),
    OrdinalEncoder(column="track_id"),
    FilterByPlayRatio(),
    FrequencyEncoder(user_column="user_id", item_column="track_id", value_column="freq"),
    CSRConverter(user_column="user_id", item_column="track_id", value_column="freq"),
])

user_item_csr = events_preprocessing_pipeline.fit_transform(train_events)
# Last expression of the cell: displays the matrix summary.
user_item_csr


Fit-transform with OrdinalEncoder
Fit-transform with OrdinalEncoder
Fit-transform with FilterByPlayRatio
Fit-transform with FrequencyEncoder
Fit-transform with CSRConverter


<12150x115648 sparse matrix of type '<class 'numpy.float32'>'
	with 2137443 stored elements in Compressed Sparse Row format>

In [25]:
class ItemBasedKNN:
    """Item-based nearest-neighbour recommender returning a long polars frame.

    Wraps ``implicit``'s ``ItemItemRecommender``: fits it on a user x item
    matrix and returns the top recommendations for every row of that matrix.

    Args:
        user_column: Name of the user index column in the output.
        item_column: Name of the recommended item index column in the output.
        score_column: Name of the recommendation score column in the output.
        n_neighbor_items: Passed as ``K`` to ``ItemItemRecommender``.
        top_k: Number of recommendations requested per user (``N``).
    """

    def __init__(
        self,
        user_column: str,
        item_column: str,
        score_column: str,
        n_neighbor_items: int,
        top_k: int,
    ) -> None:
        self.user_column = user_column
        self.item_column = item_column
        self.score_column = score_column
        self.n_neighbor_items = n_neighbor_items
        self.top_k = top_k

    def fit_predict(self, user_item: csr_matrix) -> pl.DataFrame:
        """Fit on ``user_item`` and recommend ``top_k`` items for each of its rows.

        Args:
            user_item: User x item interaction matrix; row ``i`` is user ``i``.

        Returns:
            One row per (user, recommended item) with columns ``user_column``
            (Int32), ``item_column`` (Int32) and ``score_column`` (Float32).
            Items the user already interacted with are excluded.
        """
        # Use the matrix that was passed in — not a notebook-level global — so the
        # result depends only on the argument.
        item_base_knn = ItemItemRecommender(K=self.n_neighbor_items)
        item_base_knn.fit(user_item.astype(dtype=np.float64))

        # Recommend for every row of the matrix.
        user_ids = np.arange(user_item.shape[0])
        recommended_item_indices, recommended_scores = item_base_knn.recommend(
            user_ids,
            user_item,
            N=self.top_k,
            filter_already_liked_items=True,
        )

        # One row per user, with list columns holding that user's items and scores.
        scores_df = pl.DataFrame({
            self.user_column: pl.Series(user_ids, dtype=pl.Int32),
            self.item_column: pl.Series(recommended_item_indices, dtype=pl.List(pl.Int32)),
            self.score_column: pl.Series(recommended_scores, dtype=pl.List(pl.Float32)),
        })

        # Unnest both list columns together into one row per (user, item).
        scores_df = scores_df.explode(self.item_column, self.score_column)

        return scores_df


# Column names match those produced by the preprocessing pipeline so that the
# encoders can decode the result afterwards.
item_based_knn = ItemBasedKNN(
    user_column="user_id",
    item_column="track_id",
    score_column="score",
    n_neighbor_items=100,  # forwarded as K to ItemItemRecommender
    top_k=10,  # recommendations requested per user
)
itemknn_recommendations = item_based_knn.fit_predict(user_item_csr)
# Last expression of the cell: displays the recommendations frame.
itemknn_recommendations


100%|██████████| 115648/115648 [00:13<00:00, 8822.68it/s]


user_id,track_id,score
i32,i32,f32
0,62190,0.002185
0,48507,0.001989
0,3334,0.001898
0,20212,0.001681
0,73919,0.001641
0,103784,0.001367
0,79053,0.001367
0,32233,0.001341
0,92283,0.001233
0,10471,0.001075


In [26]:

# The fitted encoders are the first two steps of the preprocessing pipeline
# (user_id first, then track_id).
user_encoder: OrdinalEncoder = events_preprocessing_pipeline.transformers[0]
item_encoder: OrdinalEncoder = events_preprocessing_pipeline.transformers[1]

# Map integer indices back to the original ids: users first, then tracks.
itemknn_recommendations_decoded = item_encoder.inverse_transform(
    user_encoder.inverse_transform(itemknn_recommendations)
)


In [27]:
# make_submission and compute_metrics are project helpers whose code is not
# visible here — presumably the first reshapes the decoded recommendations into
# the submission layout and the second scores it against the ground truth; confirm.
itemknn_submission = make_submission(itemknn_recommendations_decoded)
# Last expression of the cell: displays the returned metrics.
compute_metrics(itemknn_submission, pl.read_parquet(data_path / "ground_truth.parquet"))

{'ndcg@10': 0.013148679533728032, 'recall@10': 0.01951851851851852}