In [37]:
from pathlib import Path
from typing import Any, Protocol, List, TypeVar, Generic

import polars as pl
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix
from scipy.sparse.linalg import norm as sparse_norm
from numba import njit, prange

from my_recsys_metrics import compute_metrics
from my_utils import make_submission


In [38]:
data_path = Path("../data/music_recsys")
train_events = pl.read_parquet(data_path / "train_events.parquet")
users_for_submission = pl.read_parquet(data_path / "users_for_submission.parquet")


In [39]:
_T = TypeVar("_T")
_U = TypeVar("_U")

class TransformerLike(Protocol):
    def fit_transform(self, input: Any) -> Any: ...


class Pipeline(Generic[_T, _U]):
    def __init__(self, transformers: List[TransformerLike]) -> None:
        self.transformers = transformers

    def fit_transform(self, x: _T) -> _U:
        y: Any = x
        for t in self.transformers:
            print(f"Fit-transform with {t.__class__.__name__}")
            y = t.fit_transform(y)
        return y


class OrdinalEncoder:
    def __init__(self, column: str) -> None:
        self.column = column

    def fit(self, df: pl.DataFrame) -> "OrdinalEncoder":
        self._mapper = (
            df[[self.column]].unique()
            .sort(self.column)
            .with_row_count("__index__")
            .with_columns(pl.col("__index__").cast(pl.Int32))
        )
        return self

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        df = (
            df
            .join(self._mapper, on=self.column, how="left")
            .drop(self.column)
            .rename({"__index__": self.column})
        )
        return df

    def inverse_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        df = (
            df
            .rename({self.column: "__index__"})
            .join(
                self._mapper,
                on="__index__",
                how="left",
            )
            .drop(f"__index__")
        )
        return df

    def fit_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        return self.fit(df).transform(df)


class FilterByPlayRatio:
    def fit_transform(self, events: pl.DataFrame) -> pl.DataFrame:
        return events.filter(pl.col("play_ratio") > 0.3)


class FrequencyEncoder:
    def __init__(self, user_column: str, item_column: str, value_column: str) -> None:
        self.user_column = user_column
        self.item_column = item_column
        self.value_column = value_column

    def fit_transform(self, events: pl.DataFrame) -> pl.DataFrame:
        frequency_scores = (
            events
            .group_by(self.user_column, self.item_column)
            .agg(pl.col(self.item_column).count().alias("n_interactions_per_user"))
            .with_columns(
                pl.col("n_interactions_per_user").sum().over(self.user_column).alias("n_interactions_total"),
            )
            .with_columns(
                (pl.col("n_interactions_per_user") / pl.col("n_interactions_total")).alias(self.value_column),
            )
            .drop("n_interactions_per_user", "n_interactions_total")
        )
        return frequency_scores


class TFIDFEncoder:
    def __init__(self, user_column: str, item_column: str, value_column: str) -> None:
        self.user_column = user_column
        self.item_column = item_column
        self.value_column = value_column

    def fit_transform(self, events: pl.DataFrame) -> pl.DataFrame:
        n_users = events["user_id"].n_unique()

        idf_scores = (
            events
            .group_by(self.item_column)
            .agg(pl.col(self.user_column).count().alias("n_user_per_item"))
            .with_columns(np.log(n_users / pl.col("n_user_per_item")).alias("idf"))
            .drop("n_user_per_item")
        )

        tf_scores = (
            events
            .group_by(self.user_column, self.item_column)
            .agg(pl.count().alias("n_user_item"))
            .with_columns(
                pl.col("n_user_item").sum().over(self.user_column).alias("n_total"),
            )
            .with_columns(
                (pl.col("n_user_item") / pl.col("n_total")).alias("tf"),
            )
            .drop("n_user_item", "n_total")
        )

        scores = (
            tf_scores
            .join(
                idf_scores,
                on=self.item_column,
                how="left"
            )
            .with_columns((pl.col("tf") * pl.col("idf")).alias(self.value_column))
            .drop("tf", "idf")
        )

        return scores


class CSRConverter:
    def __init__(self, user_column: str, item_column: str, value_column: str) -> None:
        self.user_column = user_column
        self.item_column = item_column
        self.value_column = value_column

    def fit_transform(self, coo: pl.DataFrame) -> csr_matrix:
        user_idx = coo[self.user_column].to_numpy()
        item_idx = coo[self.item_column].to_numpy()
        values = coo[self.value_column].to_numpy()

        n_users = user_idx.max() + 1
        n_items = item_idx.max() + 1

        user_item_coo = coo_matrix(
            (
                values.astype(np.float32),
                (user_idx, item_idx),
            ),
            shape=(n_users, n_items),
            dtype=np.float32,
        )

        user_item_coo.sum_duplicates()

        user_item_csr = user_item_coo.tocsr()
        return user_item_csr



In [40]:
events_preprocessing_pipeline: Pipeline[pl.DataFrame, csr_matrix] = Pipeline([
    OrdinalEncoder(column="user_id"),
    OrdinalEncoder(column="track_id"),
    FilterByPlayRatio(),
    # FrequencyEncoder(user_column="user_id", item_column="track_id", value_column="freq"),
    TFIDFEncoder(user_column="user_id", item_column="track_id", value_column="freq"),
    CSRConverter(user_column="user_id", item_column="track_id", value_column="freq"),
])

user_item_csr = events_preprocessing_pipeline.fit_transform(train_events)
user_item_csr


Fit-transform with OrdinalEncoder
Fit-transform with OrdinalEncoder
Fit-transform with FilterByPlayRatio
Fit-transform with TFIDFEncoder
Fit-transform with CSRConverter


<12150x115648 sparse matrix of type '<class 'numpy.float32'>'
	with 2137443 stored elements in Compressed Sparse Row format>

In [41]:
# 1. Посчитать матрицу похожих пользователей по косинусному расстоянию
#    R -- входная матрица [n, m]
#    D -- матрица дистанций, d_ij [n, n]
#    d_ij = (r_i, r_j) / (|r_i| |r_j|)
#    R_norm = norm(R)
#    D = (R_norm) x (R_norm)^T
#
# 2. s_ij = (d_i, r_j) / sum(d_i)
#    D_norm = norm(D)
#    S = D_norm x R [n, m]

class UserBasedKNN:
    def __init__(
        self,
        user_column: str,
        item_column: str,
        score_column: str,
        n_neighbor_users: int,
        top_k: int,
    ) -> None:
        self.user_column = user_column
        self.item_column = item_column
        self.score_column = score_column
        self.n_neighbor_users = n_neighbor_users
        self.top_k = top_k

    def fit_predict(self, user_item: csr_matrix) -> pl.DataFrame:
        distances = self._compute_distances(user_item)
        scores = self._compute_scores(distances, user_item)
        scores_df = self._scores_to_df(scores)
        return scores_df

    def _scores_to_df(self, scores: csr_matrix) -> pl.DataFrame:
        scores_coo = scores.tocoo()
        scores_df = pl.DataFrame({
            self.user_column: scores_coo.row,
            self.item_column: scores_coo.col,
            self.score_column: scores_coo.data,
        })
        return scores_df

    def _compute_distances(self, user_item: csr_matrix) -> csr_matrix:
        user_item_normalized = self._normalize_user_item(user_item)
        distances = user_item_normalized @ user_item_normalized.T
        self._keep_largest_per_row_inplace(distances, n_largest=self.n_neighbor_users)
        return distances

    def _compute_scores(self, distances: csr_matrix, user_item: csr_matrix) -> csr_matrix:
        distances_normalized = self._normalize_distances(distances)
        scores = distances_normalized @ user_item
        self._remove_already_liked_items_inplace(scores, user_item)
        self._keep_largest_per_row_inplace(scores, n_largest=self.top_k)
        return scores

    def _normalize_user_item(self, user_item: csr_matrix) -> csr_matrix:
        user_item = user_item.copy()
        user_item_norms = sparse_norm(user_item, axis=1)
        nnz_per_row = user_item.indptr[1:] - user_item.indptr[:-1]
        user_item.data /= np.repeat(user_item_norms, nnz_per_row)
        return user_item

    def _normalize_distances(self, distances: csr_matrix) -> csr_matrix:
        distances = distances.copy()
        distances_norms = distances.sum(axis=1).A1
        nnz_per_row = distances.indptr[1:] - distances.indptr[:-1]
        distances.data /= np.repeat(distances_norms, nnz_per_row)
        return distances

    def _keep_largest_per_row_inplace(self, distances: csr_matrix, n_largest: int) -> None:
       _keep_largest_per_row_nb(
           row_count=distances.shape[0],
           n_largest=n_largest,
           indptr=distances.indptr,
           data=distances.data,
        )
       distances.eliminate_zeros()

    def _remove_already_liked_items_inplace(self, scores: csr_matrix, user_item: csr_matrix) -> None:
        _remove_already_liked_items_nb(
            row_count=scores.shape[0],
            scores_data=scores.data,
            scores_indptr=scores.indptr,
            scores_indices=scores.indices,
            interactions_indptr=user_item.indptr,
            interactions_indices=user_item.indices,
        )
        scores.eliminate_zeros()


@njit(parallel=True)
def _keep_largest_per_row_nb(
    row_count: int,
    n_largest: int,
    indptr: np.ndarray,
    data: np.ndarray,
) -> None:
    for i in prange(row_count):
        row_begin = indptr[i]
        row_end = indptr[i + 1]

        nnz_per_row = row_end - row_begin
        n_to_zero = nnz_per_row - n_largest

        if n_to_zero > 0:
            indices_to_zero = np.argsort(data[row_begin:row_end])[:n_to_zero]
            data[row_begin + indices_to_zero] = 0


@njit(parallel=True)
def _remove_already_liked_items_nb(
    row_count: int,
    scores_data: np.ndarray,
    scores_indptr: np.ndarray,
    scores_indices: np.ndarray,
    interactions_indptr: np.ndarray,
    interactions_indices: np.ndarray,
) -> None:
    for i in prange(row_count):
        interactions = interactions_indices[interactions_indptr[i]:interactions_indptr[i + 1]]
        for idx in range(scores_indptr[i], scores_indptr[i + 1]):
            j = scores_indices[idx]
            if j in interactions:
                scores_data[idx] = 0.0


user_based_knn = UserBasedKNN(
    user_column="user_id",
    item_column="track_id",
    score_column="score",
    n_neighbor_users=50,
    top_k=10,
)
userknn_recommendations = user_based_knn.fit_predict(user_item_csr)
userknn_recommendations


user_id,track_id,score
i32,i32,f32
0,107780,0.009349
0,79213,0.009094
0,58821,0.009133
0,25446,0.009184
0,19713,0.009091
0,3050,0.010239
0,105051,0.009541
0,61465,0.009619
0,26279,0.009081
0,15586,0.009192


In [42]:
user_encoder: OrdinalEncoder = events_preprocessing_pipeline.transformers[0]
item_encoder: OrdinalEncoder = events_preprocessing_pipeline.transformers[1]

userknn_recommendations_decoded = userknn_recommendations
userknn_recommendations_decoded = user_encoder.inverse_transform(userknn_recommendations_decoded)
userknn_recommendations_decoded = item_encoder.inverse_transform(userknn_recommendations_decoded)


In [43]:
userknn_submission = make_submission(userknn_recommendations_decoded)
compute_metrics(userknn_submission, pl.read_parquet(data_path / "ground_truth.parquet"))


{'ndcg@10': 0.017945530826247173, 'recall@10': 0.025911111111111107}