In [4]:
from pathlib import Path
from typing import Any, Protocol, List, TypeVar, Generic, Optional

import polars as pl
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix
from implicit.als import AlternatingLeastSquares

from my_recsys_metrics import compute_metrics
from my_utils import make_submission


In [5]:
# Directory holding the parquet inputs, relative to the notebook's working directory.
data_path = Path("../data/music_recsys")
# Raw interaction events; this frame is what the preprocessing pipeline is fitted on.
train_events = pl.read_parquet(data_path / "train_events.parquet")
# NOTE(review): loaded here but not referenced by any cell visible in this notebook —
# confirm whether it is still needed (e.g. to restrict the submission to these users).
users_for_submission = pl.read_parquet(data_path / "users_for_submission.parquet")


In [6]:
# Type variables used by Pipeline: _T is the type fed into the first stage,
# _U the type returned by the last one.
_T = TypeVar("_T")
_U = TypeVar("_U")

class TransformerLike(Protocol):
    """Structural type of a pipeline stage: any object exposing ``fit_transform``.

    Input and output are typed as ``Any`` because consecutive stages may
    exchange different kinds of objects.
    """

    def fit_transform(self, input: Any) -> Any: ...


class Pipeline(Generic[_T, _U]):
    """Ordered chain of transformers.

    Each stage's ``fit_transform`` output is handed to the next stage as its
    input; the value produced by the final stage is the pipeline's result.
    """

    def __init__(self, transformers: List[TransformerLike]) -> None:
        # Kept as a public attribute so individual stages can be retrieved later.
        self.transformers = transformers

    def fit_transform(self, x: _T) -> _U:
        """Run ``x`` through every stage in order and return the last output.

        Before a stage runs, a progress line with its class name is printed.
        """
        result: Any = x
        for stage in self.transformers:
            stage_name = stage.__class__.__name__
            print(f"Fit-transform with {stage_name}")
            result = stage.fit_transform(result)
        return result

class OrdinalEncoder:
    """Map the values of a single column to dense ``Int32`` codes and back.

    ``fit`` builds a lookup table with one row per distinct value, numbered
    in sorted order. ``transform`` and ``inverse_transform`` apply that table
    with a left join, so every input row is kept.
    """

    def __init__(self, column: str) -> None:
        self.column = column

    def fit(self, df: pl.DataFrame) -> "OrdinalEncoder":
        """Build the value <-> code lookup table from ``df[self.column]``.

        Returns ``self`` so calls can be chained.
        """
        distinct_sorted = df[[self.column]].unique().sort(self.column)
        numbered = distinct_sorted.with_row_count("__index__")
        # The row counter is cast to Int32, which becomes the dtype of the codes.
        self._mapper = numbered.with_columns(pl.col("__index__").cast(pl.Int32))
        return self

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Replace the original values in ``self.column`` by their codes."""
        with_codes = df.join(self._mapper, on=self.column, how="left")
        # Drop the raw values, then give the code column the original name.
        return with_codes.drop(self.column).rename({"__index__": self.column})

    def inverse_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Replace the codes in ``self.column`` by the original values."""
        as_codes = df.rename({self.column: "__index__"})
        with_values = as_codes.join(self._mapper, on="__index__", how="left")
        return with_values.drop("__index__")

    def fit_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fit the lookup table on ``df`` and immediately encode ``df`` with it."""
        self.fit(df)
        return self.transform(df)


class FilterByPlayRatio:
    """Keep only the events whose play ratio is strictly greater than a threshold.

    Parameters
    ----------
    min_ratio:
        Exclusive lower bound; rows with a ratio equal to it are dropped.
    column:
        Name of the column holding the ratio. Defaults to ``"play_ratio"``,
        the name that used to be hard-coded, so existing callers are unaffected.
    """

    def __init__(self, min_ratio: float, column: str = "play_ratio") -> None:
        self.min_ratio = min_ratio
        self.column = column

    def fit_transform(self, events: pl.DataFrame) -> pl.DataFrame:
        """Return the rows of ``events`` where ``self.column > self.min_ratio``."""
        return events.filter(pl.col(self.column) > self.min_ratio)


class FrequencyEncoder:
    """Score each (user, item) pair by its share of the user's interactions.

    The score written to ``value_column`` is the pair's event count divided by
    the total event count of that user.
    """

    def __init__(self, user_column: str, item_column: str, value_column: str) -> None:
        self.user_column = user_column
        self.item_column = item_column
        self.value_column = value_column

    def fit_transform(self, events: pl.DataFrame) -> pl.DataFrame:
        """Aggregate ``events`` to one row per (user, item) with its frequency score.

        The result holds the user column, the item column and ``value_column``.
        """
        # Names of the two helper columns; both are dropped before returning.
        pair_count = "n_interactions_per_user"
        user_total = "n_interactions_total"

        count_per_pair = pl.col(self.item_column).count().alias(pair_count)
        total_per_user = pl.col(pair_count).sum().over(self.user_column).alias(user_total)
        share_of_user = (pl.col(pair_count) / pl.col(user_total)).alias(self.value_column)

        per_pair = events.group_by(self.user_column, self.item_column).agg(count_per_pair)
        # The total has to exist as a column before the ratio can reference it,
        # hence two separate with_columns steps.
        with_totals = per_pair.with_columns(total_per_user)
        with_scores = with_totals.with_columns(share_of_user)
        return with_scores.drop(pair_count, user_total)


class TFIDFEncoder:
    """Score each (user, item) pair with a TF-IDF style weight.

    * ``tf``  - the pair's event count divided by the user's total event count.
    * ``idf`` - ``1 + log(1 + 0.001 * n_users / n_events_of_item)``, where
      ``n_users`` is the number of distinct users in the input.

    The product ``tf * idf`` is written to ``value_column``.
    """

    def __init__(self, user_column: str, item_column: str, value_column: str) -> None:
        self.user_column = user_column
        self.item_column = item_column
        self.value_column = value_column

    def fit_transform(self, events: pl.DataFrame) -> pl.DataFrame:
        """Aggregate ``events`` to one row per (user, item) with its TF-IDF score.

        The result holds the user column, the item column and ``value_column``.
        """
        # Use the configured user column; this used to be hard-coded to
        # "user_id", which broke the encoder for any other column name.
        n_users = events[self.user_column].n_unique()

        def idf_fn(track_occurrences):
            return 1 + np.log(1 + 0.001 * (n_users / track_occurrences))

        # NOTE(review): this counts events per item, not distinct users per item,
        # even though the column is called "n_user_per_item". Confirm which
        # document frequency is intended before switching to n_unique().
        idf_scores = (
            events
            .group_by(self.item_column)
            .agg(pl.col(self.user_column).count().alias("n_user_per_item"))
            .with_columns(idf_fn(pl.col("n_user_per_item")).alias("idf"))
            .drop("n_user_per_item")
        )

        tf_scores = (
            events
            .group_by(self.user_column, self.item_column)
            .agg(pl.count().alias("n_user_item"))
            .with_columns(
                pl.col("n_user_item").sum().over(self.user_column).alias("n_total"),
            )
            .with_columns(
                (pl.col("n_user_item") / pl.col("n_total")).alias("tf"),
            )
            .drop("n_user_item", "n_total")
        )

        # Every item in tf_scores also appears in idf_scores (both come from the
        # same events), so the left join only attaches the idf factor.
        scores = (
            tf_scores
            .join(
                idf_scores,
                on=self.item_column,
                how="left"
            )
            .with_columns((pl.col("tf") * pl.col("idf")).alias(self.value_column))
            .drop("tf", "idf")
        )

        return scores


class CSRConverter:
    """Convert a long-format (user, item, value) table into a user x item CSR matrix.

    The user and item columns are used directly as row and column indices, so
    they must already hold non-negative integer codes. The matrix shape is
    ``(max user index + 1, max item index + 1)`` and values are stored as float32.
    """

    def __init__(self, user_column: str, item_column: str, value_column: str) -> None:
        self.user_column = user_column
        self.item_column = item_column
        self.value_column = value_column

    def fit_transform(self, coo: pl.DataFrame) -> csr_matrix:
        """Build the sparse matrix from ``coo``.

        Entries sharing the same (user, item) position are summed. An empty
        input yields an empty ``0 x 0`` matrix instead of failing.
        """
        user_idx = coo[self.user_column].to_numpy()
        item_idx = coo[self.item_column].to_numpy()
        values = coo[self.value_column].to_numpy()

        # .max() raises on a zero-length array, so handle the empty table up front.
        if user_idx.size == 0:
            return csr_matrix((0, 0), dtype=np.float32)

        n_users = user_idx.max() + 1
        n_items = item_idx.max() + 1

        user_item_coo = coo_matrix(
            (
                values.astype(np.float32),
                (user_idx, item_idx),
            ),
            shape=(n_users, n_items),
            dtype=np.float32,
        )

        # Collapse repeated (user, item) coordinates into a single summed entry.
        user_item_coo.sum_duplicates()

        return user_item_coo.tocsr()



In [7]:
# Preprocessing: raw events -> ordinal user/track codes -> per-user frequency
# scores -> sparse user x item matrix.
# The two OrdinalEncoders must stay at positions 0 and 1: a later cell fetches
# them by index to decode the recommendations.
events_preprocessing_pipeline: Pipeline[pl.DataFrame, csr_matrix] = Pipeline([
    OrdinalEncoder(column="user_id"),
    OrdinalEncoder(column="track_id"),
    FrequencyEncoder(user_column="user_id", item_column="track_id", value_column="freq"),
    CSRConverter(user_column="user_id", item_column="track_id", value_column="freq"),
])

user_item_csr = events_preprocessing_pipeline.fit_transform(train_events)
# Bare expression so the cell displays the matrix summary.
user_item_csr


Fit-transform with OrdinalEncoder
Fit-transform with OrdinalEncoder
Fit-transform with FrequencyEncoder
Fit-transform with CSRConverter


<12227x115888 sparse matrix of type '<class 'numpy.float32'>'
	with 3277444 stored elements in Compressed Sparse Row format>

In [8]:
class ALS:
    """Fit an ``implicit`` ALS model and return top-k recommendations per user.

    Parameters
    ----------
    user_column, item_column, score_column:
        Column names used in the returned DataFrame.
    n_factors:
        Number of latent factors of the model.
    n_iterations:
        Number of ALS iterations.
    top_k:
        Number of items recommended per user.
    alpha, regularization:
        Passed through to ``AlternatingLeastSquares``. The defaults (40.0 and
        0.001) are the values that used to be hard-coded.
    random_state:
        Optional seed passed to ``AlternatingLeastSquares`` for reproducible
        factor initialisation. ``None`` keeps the library default.
    """

    def __init__(
        self,
        user_column: str,
        item_column: str,
        score_column: str,
        n_factors: int,
        n_iterations: int,
        top_k: int,
        alpha: float = 40.0,
        regularization: float = 0.001,
        random_state: Optional[int] = None,
    ) -> None:
        self.user_column = user_column
        self.item_column = item_column
        self.score_column = score_column
        self.n_factors = n_factors
        self.n_iterations = n_iterations
        self.top_k = top_k
        self.alpha = alpha
        self.regularization = regularization
        self.random_state = random_state

    def fit_predict(
        self,
        user_item: csr_matrix,
        user_item_filter: Optional[csr_matrix] = None,
    ) -> pl.DataFrame:
        """Train on ``user_item`` and recommend ``top_k`` items for every row.

        Parameters
        ----------
        user_item:
            Sparse user x item matrix the model is fitted on.
        user_item_filter:
            Optional matrix passed to ``recommend`` in place of ``user_item``
            to decide which items count as already liked and are excluded.

        Returns
        -------
        pl.DataFrame
            Long-format frame with one row per (user, recommended item):
            ``user_column`` (Int32), ``item_column`` (Int32) and
            ``score_column`` (Float32). Ids are row/column indices of
            ``user_item``, not the original identifiers.
        """
        model = AlternatingLeastSquares(
            factors=self.n_factors,
            iterations=self.n_iterations,
            alpha=self.alpha,
            regularization=self.regularization,
            calculate_training_loss=True,
            random_state=self.random_state,
        )
        model.fit(user_item)

        # Size the user range from the matrix that was passed in. This used to
        # read the notebook-global ``user_item_csr``, which silently produced
        # wrong results (or a NameError) for any other input.
        user_ids = np.arange(user_item.shape[0])
        liked_items = user_item if user_item_filter is None else user_item_filter
        recommended_item_indices, recommended_scores = model.recommend(
            user_ids,
            liked_items,
            N=self.top_k,
            filter_already_liked_items=True,
        )

        # One row per user, with list columns holding that user's items and scores.
        scores_df = pl.DataFrame({
            self.user_column: pl.Series(user_ids, dtype=pl.Int32),
            self.item_column: pl.Series(recommended_item_indices, dtype=pl.List(pl.Int32)),
            self.score_column: pl.Series(recommended_scores, dtype=pl.List(pl.Float32)),
        })

        # Exploding both list columns together gives one row per (user, item).
        scores_df = scores_df.explode(self.item_column, self.score_column)

        return scores_df


# Configuration of this run: 128 latent factors, 10 ALS iterations,
# 10 recommendations per user.
als = ALS(
    user_column="user_id",
    item_column="track_id",
    score_column="score",
    n_factors=128,
    n_iterations=10,
    top_k=10,
)
# Fit on the encoded user x item matrix. The returned ids are still the
# ordinal codes and are decoded in a later cell.
als_recommendations = als.fit_predict(user_item_csr)


100%|██████████| 10/10 [00:40<00:00,  4.00s/it, loss=0.00029]


In [9]:

# The encoders fitted during preprocessing are the first two pipeline stages
# (user ids first, then track ids).
user_encoder: OrdinalEncoder = events_preprocessing_pipeline.transformers[0]
item_encoder: OrdinalEncoder = events_preprocessing_pipeline.transformers[1]

# Map the ordinal codes back to the original identifiers: users first, then tracks.
als_recommendations_decoded = (
    als_recommendations
    .pipe(user_encoder.inverse_transform)
    .pipe(item_encoder.inverse_transform)
)


In [10]:
# Presumably reshapes the decoded recommendations into the submission format
# expected by compute_metrics — see my_utils.make_submission to confirm.
als_submission = make_submission(als_recommendations_decoded)
# Score the submission against the ground-truth file; as the last expression,
# the returned metrics are what the cell displays.
compute_metrics(als_submission, pl.read_parquet(data_path / "ground_truth.parquet"))


{'ndcg@10': 0.025432283035629815, 'recall@10': 0.035414059624585936}