In [None]:
from typing import Any, Callable, Iterable, Iterator, NamedTuple, Optional, Set, Tuple, Union
from abc import ABC, abstractmethod
from collections import defaultdict
from copy import deepcopy
from dataclasses import dataclass, field
from functools import partial

from animus import ICallback, IExperiment
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sps
import seaborn as sns
from tqdm import tqdm
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset


sns.set(rc={"figure.figsize":(15, 5)})

# Что по данным?

В этом семинаре мы будем экспериментировать с датасетом MovieLens 10M.
Это один из самых известных датасетов для исследования рекомендательных моделей, поэтому вы его ещё не раз встретите в статьях.

Датасет на самом деле очень простой. У нас есть user, item и рейтинг, который user поставил item. Рейтинг — explicit величина, которая принимает значения от 0.5 до 5 с шагом 0.5. Кроме того, есть ещё метаданные для фильмов, если мы захотим обучить что-то на основе контента. Однако последнее нам не понадобится))

## Step 0: Получим данные

In [None]:
!wget https://files.grouplens.org/datasets/movielens/ml-10m.zip
!unzip -uo ml-10m.zip -d data
# Приведем к формату csv ratings
!awk 'BEGIN { print "user,item,values,timestamp" } { gsub(/::/, ",", $0); print $0 }' data/ml-10M100K/ratings.dat > data/ml-10M100K/ratings.csv
# Приведем к формату csv movies
!awk 'BEGIN { print "item,title,genres" } { gsub(/::/, ",", $0); print $0 }' data/ml-10M100K/movies.dat > data/ml-10M100K/movies.csv

Eeeeeeee

Looks good!!!

In [None]:
!head data/ml-10M100K/ratings.csv data/ml-10M100K/movies.csv

## Step 1: Посмотрим на данные... КОРОЧЕ EDA

### 1: Распарсим данные из csv в pandas DataFrame

In [None]:
ratings = pd.read_csv("data/ml-10M100K/ratings.csv")
ratings["timestamp"] = pd.to_datetime(ratings["timestamp"], unit="s")
ratings.head().T

In [None]:
ratings.shape, ratings.user.unique().shape, ratings.item.unique().shape

### 2: Посмотрим на распределение рейтингов в данных

In [None]:
sns.histplot(data=ratings["values"], binwidth=1)
plt.show()

### 3: Построим Rating Distribution

Вы спросите, а что такое Rating Distribution?

Это просто fancy название графика, который, грубо говоря, показывает, какая доля всех рейтингов приходится на какую долю самых популярных айтемов (или самых активных юзеров). Он отлично подходит, чтобы увидеть popularity-skew в данных.

По оси y располагаем накопленное количество рейтингов, выраженное в процентах от общего количества.
По оси x — айтемы, отсортированные в порядке убывания по количеству рейтингов на них. В качестве координаты айтема используем его порядковый номер, выраженный в процентах от общего числа айтемов.

In [None]:
def rating_distribution(column: str) -> None:
    """Plot the cumulative share of ratings against the share of entities.

    Reads the module-level ``ratings`` frame. Distinct values of ``column``
    are ordered from most to least frequent; the x axis is the position of a
    value in that order (in percent of all distinct values) and the y axis is
    the cumulative percentage of ratings covered up to that position.

    Args:
        column: name of the column of ``ratings`` to analyse
            (the notebook uses ``"item"`` and ``"user"``).
    """
    # Compute
    ratings_dist = ratings[column].value_counts(normalize=True).reset_index()
    ratings_dist.columns = [column, "num_ratings_percent"]
    # np.linspace always yields exactly one point per row. The previous
    # np.arange(0, 1 + step, step) could produce one point too many or too few
    # because of float rounding, and divided by zero for a single distinct value.
    ratings_dist[f"{column}_percent"] = np.linspace(0, 100, num=ratings_dist.shape[0])
    ratings_dist["num_ratings_percent"] = np.cumsum(ratings_dist["num_ratings_percent"]) * 100
    # Plot
    fig, ax = plt.subplots()
    sns.lineplot(data=ratings_dist, x=f"{column}_percent", y="num_ratings_percent", ax=ax)
    ax.set_xlim(0, 100)
    ax.set_xticks(range(0, 101, 2))
    ax.set_title("Rating Distribution")
    plt.show()

Для item

In [None]:
rating_distribution(column="item")

Для user

In [None]:
rating_distribution(column="user")

## Step 2: Мы почистим, мы помоем

### 1: Сделаем простенькую фильтрацию

Ниже есть несколько удобных функций, с которыми можно поиграться:
- **valid_items**
    - получим элементы, которые встречаются реже чем threshold. Полезно, чтобы исключить супер популярные айтемы и активных юзеров.
- **filter_ratings**
    - оставим только рейтинги выше `min_rating` (полезно, если хотим сделать implicit датасет)
    - оставим users и items, которые встречаются не реже `min_user_count`, `min_item_count` соответственно. 

In [None]:
def valid_elements(data: pd.DataFrame, column: str, threshold: float = 0.2) -> Set[int]:
    """Return the values of ``column`` that remain after dropping the most frequent ones.

    Values are ordered by descending frequency and the first
    ``round(threshold * n_unique) + 1`` of them are discarded.

    Args:
        data: frame that contains ``column``.
        column: column whose values are ranked by frequency.
        threshold: fraction of the distinct values to drop from the top.

    Returns:
        Set of the remaining values (actual ids taken from ``column``).
    """
    # Keep the value_counts index: it holds the ids themselves. Calling
    # reset_index() here would replace it with 0..n-1 row positions, and the
    # function would return positions instead of ids.
    counts = data[column].value_counts()
    cut_off = round(threshold * counts.shape[0]) + 1
    return set(counts.index[cut_off:])


def get_count(data: pd.DataFrame, column: str) -> pd.DataFrame:
    """Count the rows of ``data`` for every distinct value of ``column``.

    Returns:
        Frame with two columns: ``column`` and ``"size"`` (number of rows).
    """
    return data.groupby(column, as_index=False).size()


def _filter_ratings(
    data: pd.DataFrame,
    min_rating: float = 3.0,
    min_user_count: int = 3,
    min_item_count: int = 3,
) -> pd.DataFrame:
    """Apply one pass of rating / item-frequency / user-frequency filtering.

    Args:
        data: frame with ``"user"``, ``"item"`` and ``"values"`` columns.
        min_rating: keep only rows whose ``"values"`` is strictly greater than
            this number; ``None`` disables the rating filter.
        min_user_count: keep users with at least this many rows (``0`` disables).
        min_item_count: keep items with at least this many rows (``0`` disables).

    Returns:
        The filtered frame. One pass may leave users below ``min_user_count``
        items' worth of rows or vice versa, so callers repeat it until stable.
    """
    if min_rating is not None:
        above_threshold = data["values"] > min_rating
        data = data.loc[above_threshold]
    # Items are filtered first, then users are counted on the already reduced frame.
    for column, min_count in (("item", min_item_count), ("user", min_user_count)):
        if min_count <= 0:
            continue
        counts = get_count(data, column=column)
        frequent = counts[column][counts["size"] >= min_count]
        data = data.loc[data[column].isin(frequent)]
    return data


def filter_ratings(
    data: pd.DataFrame,
    min_rating: Optional[float] = 3.0,
    min_user_count: int = 3,
    min_item_count: int = 3,
    remove_top_items: float = 0.02,
) -> pd.DataFrame:
    """Drop the most popular items, then filter ratings until the frame is stable.

    Args:
        data: frame with ``"user"``, ``"item"`` and ``"values"`` columns.
        min_rating: keep only ratings strictly above this value; ``None``
            keeps every rating.
        min_user_count: minimal number of ratings a user must have.
        min_item_count: minimal number of ratings an item must have.
        remove_top_items: fraction of the most frequently rated items to
            remove before filtering; ``0`` disables this step.

    Returns:
        The filtered frame.
    """
    if remove_top_items > 0:
        # Forward the requested fraction; without it valid_elements falls back
        # to its own default threshold and remove_top_items has no effect.
        valid_items = valid_elements(data, column="item", threshold=remove_top_items)
        data = data.loc[data["item"].isin(valid_items)]
    # Removing rare items can push users below their minimum (and the other
    # way round), so repeat the pass until the number of rows stops changing.
    num_rows = -1
    while num_rows != data.shape[0]:
        num_rows = data.shape[0]
        data = _filter_ratings(data, min_rating, min_user_count, min_item_count)
    return data

In [None]:
# Implicit feedback: filter_ratings defaults are used, including its min_rating filter.
implicit_filtered_ratings = filter_ratings(ratings)
print(f"Before: {ratings.shape}; After: {implicit_filtered_ratings.shape}")
# Explicit feedback: min_rating=None skips the rating filter, so all rating values are kept.
explicit_filtered_ratings = filter_ratings(ratings, min_rating=None)
print(f"Before: {ratings.shape}; After: {explicit_filtered_ratings.shape}")

### 2: Построим encoder для user и item

Вместо кастомного encoder можно использовать [sklearn.LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html), однако `pickle зло` поэтому напишем свой.

In [None]:
@dataclass
class NamespaceEncoder:
    """Two-way vocabulary between raw values and dense integer indices.

    Index ``0`` is reserved for the ``"@@OOV@@"`` placeholder, which is
    registered in ``__post_init__``.
    """

    # raw value -> dense index
    item_to_idx: dict[Any, int] = field(default_factory=dict)
    # str(index) -> raw value (keys are stored as strings)
    idx_to_item: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        self.add("@@OOV@@")

    def __len__(self) -> int:
        return len(self.item_to_idx)

    def add(self, item: Any) -> None:
        """Register ``item`` under the next free index; known items are left untouched."""
        # Without this guard a repeated add() would give the item a new index
        # equal to len(self) and leave a stale entry in idx_to_item, so indices
        # would no longer be dense (and would not fit matrices sized by len()).
        if item in self.item_to_idx:
            return
        idx = len(self.item_to_idx)
        self.item_to_idx[item] = idx
        self.idx_to_item[str(idx)] = item


class Encoder:
    """Encodes the ``user`` and ``item`` columns of a frame into dense indices and back."""

    def __init__(self) -> None:
        self.namespaces = {"user": NamespaceEncoder(), "item": NamespaceEncoder()}

    def encode(self, data: Iterable[Any], namespace: str = "user") -> list[int]:
        """Map raw values to indices.

        Values missing from the vocabulary are skipped, so the result may be
        shorter than ``data``.
        """
        vocabulary = self.namespaces[namespace].item_to_idx
        return [vocabulary[value] for value in data if value in vocabulary]

    def decode(self, data: Iterable[Union[str, int]], namespace: str = "user") -> list[Any]:
        """Map indices back to raw values.

        Indices missing from the vocabulary are skipped, so the result may be
        shorter than ``data``.
        """
        vocabulary = self.namespaces[namespace].idx_to_item
        return [vocabulary[key] for key in map(str, data) if key in vocabulary]

    def fit(self, data: pd.DataFrame) -> "Encoder":
        """Add every distinct ``user`` and ``item`` value of ``data`` to the vocabularies."""
        for attr, encoder in self.namespaces.items():
            for item in data[attr].unique():
                encoder.add(item)
        return self

    def transform(self, data: pd.DataFrame, decode: bool = False) -> pd.DataFrame:
        """Return a copy of ``data`` with ``user`` / ``item`` encoded (or decoded).

        Rows whose user or item is unknown to the vocabulary are dropped.

        Args:
            data: frame with ``user`` and ``item`` columns.
            decode: when ``True`` the columns are expected to hold indices and
                are mapped back to raw values.
        """
        keep = np.ones(len(data), dtype=bool)
        for attr, namespace in self.namespaces.items():
            if decode:
                # Indices must be checked against the index vocabulary (string
                # keys), not against the raw ids: the latter would discard
                # almost every row of an encoded frame.
                known = data[attr].astype(str).isin(set(namespace.idx_to_item))
            else:
                known = data[attr].isin(set(namespace.item_to_idx))
            keep &= known.to_numpy()
        # Filter first and copy afterwards, so the assignments below write into
        # an independent frame rather than into a slice of ``data``.
        result = data.loc[keep].copy()
        convert = self.decode if decode else self.encode
        for attr in self.namespaces:
            result[attr] = convert(result[attr].to_numpy(), namespace=attr)
        return result

In [None]:
# The vocabulary is built from the explicit frame only; transform() drops rows whose
# user or item is missing from it.
# presumably the implicit frame contains no users/items beyond the explicit one — verify.
encoder = Encoder().fit(explicit_filtered_ratings)
# NOTE(review): both names are overwritten with their encoded versions, so running this
# cell a second time would try to encode already-encoded ids.
implicit_filtered_ratings = encoder.transform(implicit_filtered_ratings)
explicit_filtered_ratings = encoder.transform(explicit_filtered_ratings)

### 3: Выберем себе тестовые данные

Вот просто так, решили, что для тестирования наших рекомендов возьмём последний **год** из базы. На остальном будем обучаться.

Необязательно подходить именно таким образом для создания тестового и валидационного датасетов.
Далее в курсе мы подробно обсудим различные подходы для разделения данных в RecSys

In [None]:
# Split point per dataset: 365 days before the latest timestamp in that dataset.
ts_split = {
    "implicit": implicit_filtered_ratings["timestamp"].max() - pd.Timedelta(365, "days"),
    "explicit": explicit_filtered_ratings["timestamp"].max() - pd.Timedelta(365, "days"),
}
datasets = {}
for key, dataset in (("implicit", implicit_filtered_ratings), ("explicit", explicit_filtered_ratings)):
    # Everything up to and including the split point is train, the rest is valid.
    datasets[key] = {
        "train": dataset.loc[dataset["timestamp"] <= ts_split[key]],
        "valid": dataset.loc[dataset["timestamp"] > ts_split[key]],
    }
    # Keep only validation rows of users that also appear in train.
    datasets[key]["valid"] = datasets[key]["valid"].loc[datasets[key]["valid"].user.isin(datasets[key]["train"].user.unique())]

In [None]:
def make_sparse(dataset: pd.DataFrame, encoder: Encoder, binary: bool = False) -> sps.csr_matrix:
    """Build a (users x items) CSR matrix from an encoded ratings frame.

    Args:
        dataset: frame whose ``"user"`` / ``"item"`` columns hold row / column
            indices and whose ``"values"`` column holds the entries.
        encoder: its namespace sizes define the matrix shape.
        binary: store ``1`` for every entry instead of the rating value.
    """
    num_users = len(encoder.namespaces["user"])
    num_items = len(encoder.namespaces["item"])
    rows = dataset["user"].to_numpy()
    cols = dataset["item"].to_numpy()
    entries = dataset["values"].to_numpy()
    if binary:
        entries = np.ones_like(entries)
    return sps.csr_matrix((entries, (rows, cols)), shape=(num_users, num_items))

In [None]:
def build_sparse_datasets(
    train: pd.DataFrame, valid: pd.DataFrame, encoder: Encoder, binary: bool = True
) -> dict[str, dict[str, sps.csr_matrix]]:
    """Turn the train / valid frames into source / target sparse matrices.

    Args:
        train: encoded training interactions.
        valid: encoded validation interactions.
        encoder: defines the shape of the matrices.
        binary: store ``1`` instead of the rating value.

    Returns:
        ``{"train": {"source", "target"}, "valid": {"source", "target"}}``.
        For train, source and target are the same rows (users with at least
        one training interaction). For valid, source holds the training rows
        and target the validation rows of users that have at least one
        validation interaction.
    """
    train_sparse = make_sparse(train, encoder, binary=binary)
    valid_sparse = make_sparse(valid, encoder, binary=binary)
    # Row masks of users that have at least one stored interaction.
    train_mask = train_sparse.getnnz(axis=-1) > 0
    valid_mask = valid_sparse.getnnz(axis=-1) > 0
    train_rows = train_sparse[train_mask]
    return {
        "train": {"source": train_rows, "target": train_rows},
        "valid": {"source": train_sparse[valid_mask], "target": valid_sparse[valid_mask]},
    }

In [None]:
import pprint


explicit_datasets = build_sparse_datasets(**datasets["explicit"], encoder=encoder, binary=False)
implicit_datasets = build_sparse_datasets(**datasets["implicit"], encoder=encoder, binary=True)
print("Explicit")
pprint.pprint(explicit_datasets)
print("Implicit")
pprint.pprint(implicit_datasets)

## 4: Обучим модельки

Сразу будем писать всё на torch, because we can!!!

In [None]:
class MovieLensDataset(Dataset):
    """Row-wise view over sparse interaction matrices (one row per sample)."""

    def __init__(self, source: sps.csr_matrix, target: Optional[sps.csr_matrix] = None) -> None:
        """Store the matrices.

        Args:
            source: matrix whose rows are the model inputs.
            target: optional matrix with the same number of rows holding the
                expected outputs.
        """
        self._source = source
        self._target = target

    def __len__(self) -> int:
        return self._source.shape[0]

    def __getitem__(self, idx: int) -> dict[str, Any]:
        """Return ``{"source": row, "idx": idx}`` plus ``"target"`` when a target matrix was given."""
        item = {"source": self._source[idx], "idx": idx}
        if self._target is not None:
            item["target"] = self._target[idx]
        return item

In [None]:
class MovieLensCollator:
    """Collate function that turns a list of dataset samples into dense batch tensors."""

    def __call__(self, instances: Iterable[dict[str, Any]]) -> dict[str, torch.Tensor]:
        """Stack every field of ``instances`` into one tensor.

        Sparse rows are stacked and densified into a float32 tensor; any other
        field is passed to ``torch.Tensor`` (so integer ids become floats).
        """
        batch = self._make_batch(instances)
        return {
            key: (
                self._to_sparse(values).to_dense().float()
                if sps.issparse(values[0])
                else torch.Tensor(values)
            )
            for key, values in batch.items()
        }

    @staticmethod
    def _make_batch(instances: Iterable[dict[str, Any]]) -> dict[str, list[Any]]:
        """Group the per-sample values by field name, preserving sample order."""
        tensor_dict = defaultdict(list)
        for instance in instances:
            for name, value in instance.items():
                tensor_dict[name].append(value)
        return tensor_dict

    @staticmethod
    def _to_sparse(tensor: list[sps.csr_matrix]) -> torch.Tensor:
        """Stack sparse rows into a single torch sparse COO tensor."""
        # Take indices and values from the same COO representation so they
        # always line up: ``nonzero()`` skips explicitly stored zeros while
        # ``.data`` keeps them, which would give arrays of different length.
        stacked = sps.vstack(tensor).tocoo()
        indices = torch.from_numpy(np.vstack((stacked.row, stacked.col))).long()
        values = torch.from_numpy(stacked.data)
        # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor
        # constructor and accepts values of any dtype.
        return torch.sparse_coo_tensor(indices, values, tuple(stacked.shape))

### Sparse Model и Runner

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
class SparseModel(ABC):
    """Common interface of the recommenders that are fitted on a sparse matrix."""

    def __init__(self, remove_seen: bool = True) -> None:
        # Tensors are created on the CPU until ``to`` selects another device.
        self.device = torch.device("cpu")
        self.remove_seen = remove_seen

    @abstractmethod
    def fit(self, data: sps.csr_matrix) -> None:
        """Estimate the model from the interaction matrix ``data``."""
        ...

    @abstractmethod
    def predict_proba(self, batch: dict[str, Any]) -> dict[str, torch.Tensor]:
        """Score the items for every row of the batch."""
        ...

    def to(self, device: torch.device) -> "SparseModel":
        """Remember ``device`` for later use and return ``self`` (nothing is moved here)."""
        self.device = device
        return self

In [None]:
class SparseRunner(IExperiment):
    """Experiment that fits a SparseModel on the train dataset and scores every dataset in batches.

    NOTE(review): relies on attributes managed by animus' IExperiment
    (``dataset``, ``dataset_key``, ``is_train_dataset``, ``batch``) — confirm
    their exact semantics against the animus version in use.
    """

    def __init__(
        self,
        model: SparseModel,
        seed: int = 13,
        batch_size: int = 32,
        device: torch.device = torch.device("cpu"),
        callbacks: dict[str, ICallback] = None,
        target_threshold: float = 0.0,
    ) -> None:
        """Store the model and the run settings.

        Args:
            model: model to fit and evaluate; ``model.to(device)`` is called here.
            seed: stored on the experiment as ``self.seed``.
            batch_size: batch size of the DataLoader built in ``run_dataset``.
            device: device the model and the batches are sent to.
            callbacks: callbacks by name; an empty mapping is used when omitted.
            target_threshold: target entries strictly greater than this value
                are treated as positives in ``run_batch``.
        """
        super().__init__()
        self.seed = seed
        self.model = model.to(device)
        self.device = device
        # defaultdict(None) has no default factory, i.e. it behaves like an empty dict.
        self.callbacks = callbacks or defaultdict(None)
        self._batch_size = batch_size
        self._target_threshold = target_threshold
        # Filled by run_batch: model outputs and binarized targets of the current batch.
        self.batch_output: dict[str, np.ndarray] = None
        self.batch_target: np.ndarray = None

    def run_dataset(self) -> None:
        """Fit the model (train dataset only), then iterate over the current dataset in batches."""
        if self.is_train_dataset:
            self.model.fit(self.dataset["source"])
        # Replace the dict of sparse matrices with a progress-bar-wrapped DataLoader.
        # NOTE(review): shuffle=True also applies to validation; presumably harmless
        # for averaged metrics — confirm.
        self.dataset = tqdm(
            DataLoader(
                MovieLensDataset(**self.dataset),
                collate_fn=MovieLensCollator(),
                batch_size=self._batch_size,
                shuffle=True,
                num_workers=1,
                pin_memory=True,
            ),
            desc=f"Iterating {self.dataset_key}",
        )
        # Remove seen items only for validation.
        # This overrides whatever remove_seen value the model was constructed with.
        self.model.remove_seen = not self.is_train_dataset
        super().run_dataset()

    def run_batch(self) -> None:
        """Move the batch to the device, binarize the target and score the batch."""
        self.batch = {k: v.to(self.device) for k, v in self.batch.items()}
        # 1.0 where the target exceeds the threshold, 0.0 elsewhere.
        self.batch_target = self.batch["target"].gt(self._target_threshold).float()
        self.batch_output = self.model.predict_proba(self.batch)

    def run(self, datasets: dict[str, dict[str, sps.csr_matrix]]) -> None:
        """Run the experiment over ``datasets`` (dataset name -> source/target matrices)."""
        self.datasets = datasets
        super().run()

### Baselines

In [None]:
class TopPopularModel(SparseModel):
    """Non-personalized baseline: every user gets the items ordered by popularity."""

    def __init__(self, remove_seen: bool = True) -> None:
        super().__init__(remove_seen=remove_seen)
        self._stats = {}

    def fit(self, data: sps.csr_matrix) -> None:
        """Turn the column sums of ``data`` into rank-based item scores."""
        column_sums = np.array(data.sum(axis=0), dtype=np.float32)
        item_freq = torch.from_numpy(column_sums).view(-1).to(self.device)
        self._stats["item_freq"] = item_freq
        # The item with the largest sum gets score n, the next one n - 1, ..., the last one 1.
        order = torch.argsort(-item_freq)
        rank_scores = torch.arange(
            item_freq.size(-1), 0, -1, dtype=torch.float, device=self.device
        )
        logits = torch.zeros_like(item_freq)
        logits[order] = rank_scores
        self._logits = logits

    def predict_proba(self, batch: dict[str, Any]) -> dict[str, torch.Tensor]:
        """Repeat the popularity scores for every row of ``batch["source"]``."""
        num_rows = batch["source"].size(0)
        scores = self._logits.repeat(num_rows, 1)
        if self.remove_seen:
            # Push the items the user already interacted with to the very bottom.
            already_seen = batch["source"].gt(0)
            scores[already_seen] = -1e13
        return {"logits": scores}

In [None]:
top_popular = SparseRunner(model=TopPopularModel(), device=device)
print("Implicit")
top_popular.run(implicit_datasets)
print("Explicit")
top_popular.run(explicit_datasets)

### ItemKNN

In [None]:
class KNNModel(SparseModel):
    """Neighbourhood model built on a truncated similarity matrix.

    ``fit`` computes a similarity matrix (items x items for ``method="item"``,
    users x users for ``method="user"``) and keeps the ``topk`` largest
    entries of every row. ``predict_proba`` multiplies the batch of
    interaction rows by that matrix, which is only defined for
    ``method="item"``.
    """

    def __init__(
        self,
        similarity: str,
        topk: int,
        count_threshold: int = None,
        method: str = "item",
        remove_seen: bool = True,
    ) -> None:
        """Configure the model.

        Args:
            similarity: one of ``"cooccurrence"``, ``"cosine"``, ``"jaccard"``.
            topk: number of neighbours kept per row; a falsy value keeps all.
            count_threshold: raw products not strictly greater than this value
                are zeroed before the similarity is computed; ``None`` disables.
            method: ``"item"`` or ``"user"`` — which axis is compared.
            remove_seen: mask already seen items in ``predict_proba``.
        """
        super().__init__(remove_seen=remove_seen)
        assert similarity in ("cooccurrence", "cosine", "jaccard")
        assert method in ("item", "user")
        self._similarity = getattr(self, similarity)
        self._topk = topk
        self._count_threshold = count_threshold
        self._method = method
        self._similarity_matrix = None

    def _gram(self, data: sps.csr_matrix) -> torch.Tensor:
        """Dense matrix of pairwise dot products between users or between items."""
        product = data.dot(data.T) if self._method == "user" else data.T.dot(data)
        gram = torch.from_numpy(product.toarray()).to(self.device)
        # Later steps divide and take square roots, so work in floating point.
        return gram if gram.is_floating_point() else gram.float()

    def _apply_count_threshold(self, matrix: torch.Tensor) -> torch.Tensor:
        """Zero (in place) the entries that do not exceed ``count_threshold``."""
        if self._count_threshold is not None:
            matrix *= matrix > self._count_threshold
        return matrix

    def cooccurrence(self, data: sps.csr_matrix) -> torch.Tensor:
        """Raw dot products, optionally thresholded. The diagonal is kept."""
        return self._apply_count_threshold(self._gram(data))

    def cosine(self, data: sps.csr_matrix) -> torch.Tensor:
        """Cosine similarity with a zeroed diagonal."""
        sim_matrix = self._apply_count_threshold(self._gram(data))
        # Users are rows (reduce over axis 1), items are columns (reduce over axis 0).
        axis = 1 if self._method == "user" else 0
        # np.asarray(...).ravel() turns the np.matrix returned by sum() into a flat array.
        squared = np.asarray(data.power(2).sum(axis=axis), dtype=np.float64).ravel()
        norms = torch.from_numpy(squared).to(self.device).sqrt()
        denominator = torch.einsum("i,j->ij", norms, norms) + 1e-13
        sim_matrix = sim_matrix / denominator
        sim_matrix.fill_diagonal_(0.0)
        return sim_matrix

    def jaccard(self, data: sps.csr_matrix) -> torch.Tensor:
        """Intersection over union of the dot products. The diagonal is kept."""
        intersection = self._gram(data)
        # Read the set sizes before thresholding; otherwise the sizes of rarely
        # rated entries would be zeroed and the union would be underestimated.
        sizes = intersection.diagonal().clone()
        intersection = self._apply_count_threshold(intersection)
        union = sizes.unsqueeze(0) + sizes.unsqueeze(-1) - intersection
        # Empty rows/columns give 0 / 0; clamping turns that into 0 instead of
        # NaN, which would otherwise win every top-k and poison all scores.
        return intersection / union.clamp(min=1e-13)

    def fit(self, data: sps.csr_matrix) -> None:
        """Compute the similarity matrix and keep the top-k entries of every row."""
        similarity_matrix = self._similarity(data).float()
        # Bound k by the similarity matrix itself: it is users x users for
        # method="user", so data.shape[-1] (number of items) is the wrong limit.
        max_neighbours = similarity_matrix.size(-1) - 1
        topk = min(self._topk or max_neighbours, max_neighbours)
        relevant = torch.topk(similarity_matrix, k=topk, dim=-1)
        self._similarity_matrix = torch.zeros_like(similarity_matrix).scatter_(
            dim=-1, index=relevant.indices, src=relevant.values
        )

    def predict_proba(self, batch: dict[str, Any]) -> dict[str, torch.Tensor]:
        """Score the items as ``source @ similarity``.

        Raises:
            RuntimeError: when called before ``fit``.
            NotImplementedError: for ``method="user"``, because a users x users
                matrix cannot be multiplied with (batch x items) rows.
        """
        if self._similarity_matrix is None:
            raise RuntimeError("KNNModel.predict_proba called before fit.")
        if self._method == "user":
            raise NotImplementedError(
                "Scoring is implemented for method='item' only: the user-user "
                "similarity matrix does not match the (batch, items) input."
            )
        scores = torch.einsum("bi,ij->bj", batch["source"], self._similarity_matrix)
        if self.remove_seen:
            scores[batch["source"].gt(0)] = -1e13
        return {"logits": scores}

In [None]:
knn_model = SparseRunner(model=KNNModel("cosine", topk=100), device=device)
print("Implicit")
knn_model.run(implicit_datasets)
print("Explicit")
knn_model.run(explicit_datasets)

### EASE

In [None]:
class EASE(SparseModel):
    """Item-item linear model obtained in closed form from the regularized Gram matrix."""

    def __init__(self, remove_seen: bool = True, lambda_weight: float = 100.0) -> None:
        """Configure the model.

        Args:
            remove_seen: mask already seen items in ``predict_proba``.
            lambda_weight: value added to the diagonal of the Gram matrix
                before it is inverted.
        """
        super().__init__(remove_seen=remove_seen)
        self._lambda_weight = lambda_weight
        self._item_matrix = None

    def fit(self, data: sps.csr_matrix) -> None:
        """Compute the item-item weight matrix.

        The Gram matrix is built with scipy, as in ``KNNModel``. That avoids
        the deprecated ``torch.sparse.FloatTensor`` constructor and pairing
        ``data.nonzero()`` with ``data.data``, which disagree when zeros are
        stored explicitly.
        """
        gram_matrix = torch.from_numpy(data.T.dot(data).toarray()).to(self.device)
        if not gram_matrix.is_floating_point():
            gram_matrix = gram_matrix.float()
        # diagonal() is a view, so this adds the regularizer in place.
        gram_matrix.diagonal().add_(self._lambda_weight)
        precision_matrix = torch.linalg.inv(gram_matrix)
        # Column j is divided by -P[j, j]; the diagonal is then forced to zero
        # so that an item never predicts itself.
        item_matrix = precision_matrix / (-torch.diag(precision_matrix))
        item_matrix.fill_diagonal_(0.0)
        self._item_matrix = item_matrix.float()

    def predict_proba(self, batch: dict[str, Any]) -> dict[str, torch.Tensor]:
        """Score the items as ``source @ item_matrix``."""
        scores = torch.einsum("bi,ij->bj", batch["source"], self._item_matrix)
        if self.remove_seen:
            scores[batch["source"].gt(0)] = -1e13
        return {"logits": scores}

In [None]:
runner = SparseRunner(model=EASE(), batch_size=512, device=device)
runner.run(implicit_datasets)

### SVD

In [None]:
class SvdModel(SparseModel):
    """Truncated-SVD model: users are scored through the item factor space."""

    def __init__(self, factors: int, remove_seen: bool = True) -> None:
        """Configure the model.

        Args:
            factors: requested number of singular vectors.
            remove_seen: mask already seen items in ``predict_proba``.
        """
        super().__init__(remove_seen=remove_seen)
        self._factors = factors
        self.factors = {}
        self.sigma = None

    def fit(self, data: sps.csr_matrix) -> None:
        """Compute the truncated SVD of ``data`` and store the factors as tensors."""
        # Import the submodule explicitly instead of relying on ``sps.linalg``
        # having been loaded as a side effect of another import.
        from scipy.sparse.linalg import svds

        # svds needs k to be smaller than both matrix dimensions.
        num_factors = min(self._factors, min(data.shape) - 1)
        u, sigma, vt = svds(data, k=num_factors)
        # torch.from_numpy rejects negatively strided arrays, so make all three
        # outputs contiguous first (a copy is made only when needed).
        user_factors = torch.from_numpy(np.ascontiguousarray(u)).float().to(self.device)
        item_factors = torch.from_numpy(np.ascontiguousarray(vt)).float().to(self.device)
        self.factors = {
            "user": user_factors,
            # (factors, items) -> (items, factors)
            "item": item_factors.t(),
        }
        self.sigma = torch.from_numpy(np.ascontiguousarray(sigma)).diag().float().to(self.device)

    def predict_proba(self, batch: dict[str, Any]) -> dict[str, torch.Tensor]:
        """Score the items as ``source @ V @ V^T`` where ``V`` holds the item factors."""
        item_factors = self.factors["item"]
        # Project onto the factor space first: the intermediate is only
        # (batch, factors), never (items, items).
        latent = batch["source"] @ item_factors
        scores = latent @ item_factors.t()
        if self.remove_seen:
            scores[batch["source"].gt(0)] = -1e13
        return {"logits": scores}

In [None]:
runner = SparseRunner(model=SvdModel(factors=128), batch_size=512, device=device)
print("Implicit")
runner.run(implicit_datasets)
print("Explicit")
runner.run(explicit_datasets)

## 5: Считаем метрички

Так как мы всё делаем поверх animus, сделаем 2 callback, чтобы красиво визуализировать метрики моделей.

In [None]:
class MetricCallback(ICallback):
    """Accumulates a ranking metric over a dataset for several cut-offs."""

    def __init__(self, func: Callable, topk: Iterable[int], input_key: str) -> None:
        """Configure the callback.

        Args:
            func: metric ``func(output, target, topk=k)``; it is assumed to
                return the value averaged over the rows of the batch.
            topk: cut-offs to evaluate (any iterable of ints).
            input_key: key of ``exp.batch_output`` holding the scores.
        """
        self._name = func.__name__
        self._func = func
        self._topk = tuple(topk)
        self._input_key = input_key
        self._total_sum = {k: 0.0 for k in self._topk}
        self._num_batches = 0
        self._num_samples = 0

    def _get_metric(self) -> dict[str, float]:
        """Running dataset-level averages, keyed as ``"<name>@<k>"``."""
        # max(..., 1) keeps an empty dataset from dividing by zero (the sums are 0 then).
        denominator = max(self._num_samples, 1)
        return {
            f"{self._name}@{k}": metric_sum / denominator
            for k, metric_sum in self._total_sum.items()
        }

    def on_dataset_start(self, exp: IExperiment) -> None:
        """Reset the accumulators."""
        self._total_sum = {k: 0.0 for k in self._topk}
        self._num_batches = 0
        self._num_samples = 0

    def on_batch_end(self, exp: IExperiment) -> None:
        """Add the metrics of the current batch and publish the running averages."""
        batch_size = exp.batch_target.size(0)
        for k in self._topk:
            value = self._func(exp.batch_output[self._input_key], exp.batch_target, topk=k)
            # Weight by the batch size so that a smaller last batch does not
            # count as much as a full one; float() accepts both one-element
            # tensors and plain Python numbers.
            self._total_sum[k] += float(value) * batch_size
        self._num_batches += 1
        self._num_samples += batch_size
        exp.batch_metrics.update(self._get_metric())

    def on_dataset_end(self, exp: IExperiment) -> None:
        """Publish the final averages as dataset metrics."""
        exp.dataset_metrics.update(self._get_metric())

In [None]:
class LoggerCallback(ICallback):
    """Prints the numeric dataset metrics as an aligned table."""

    def on_dataset_end(self, exp: IExperiment) -> None:
        """Print every int/float entry of ``exp.dataset_metrics`` with four decimals."""
        print(f"{exp.dataset_key.capitalize()} metrics:")
        # default=0 keeps max() from failing when no metric was registered.
        max_length = max((len(name) for name in exp.dataset_metrics), default=0)
        # Sort by length to make it prettier
        for metric in sorted(exp.dataset_metrics, key=self._sort_func):
            metric_value = exp.dataset_metrics.get(metric)
            if isinstance(metric_value, (float, int)):
                print(f"{metric.ljust(max_length)} | {metric_value:.4f}")

    @staticmethod
    def _sort_func(x: str) -> Tuple[int, str, int]:
        """Sort key: name length, name, then the numeric ``@k`` suffix (0 when absent)."""
        # rpartition splits on the last "@" only, so names that contain "@"
        # themselves or carry a non-numeric suffix use the plain-name key
        # instead of raising.
        metric_key, separator, topk = x.rpartition("@")
        if separator and topk.isdigit():
            return (len(metric_key), metric_key, int(topk))
        return (len(x), x, 0)

In [None]:
class PrepareTargetResult(NamedTuple):
    """Pair returned by ``prepare_target`` when ``return_indices=True``."""

    # Target gathered in the order of descending output scores.
    values: torch.Tensor
    # The argsort indices of the output that define that order.
    indices: torch.Tensor


def validate_metric_inputs(output: torch.Tensor, target: torch.Tensor) -> None:
    """Check that ``output`` and ``target`` can be used by the ranking metrics.

    Raises:
        IndexError: when the two tensors differ in size.
        ValueError: when ``target`` holds anything other than 0 and 1.
    """
    output_size, target_size = output.size(), target.size()
    if output_size != target_size:
        raise IndexError(
            "Unequal sizes for output and target: "
            f"output - {output.size()}, target - {target.size()}."
        )
    # An entry is invalid when it is neither 0 nor 1.
    not_binary = target.ne(0) & target.ne(1)
    if not_binary.any():
        raise ValueError(
            "Target contains values outside of 0 and 1." f"\nTarget:\n{target}"
        )


def prepare_target(
    output: torch.Tensor, target: torch.Tensor, return_indices: bool = False
) -> Union[torch.Tensor, PrepareTargetResult]:
    """Reorder ``target`` along the last dimension by descending ``output`` score.

    Args:
        output: scores, same size as ``target``.
        target: binary relevance labels.
        return_indices: also return the sorting indices.

    Returns:
        The reordered target, or ``PrepareTargetResult(values, indices)`` when
        ``return_indices`` is set.
    """
    validate_metric_inputs(output, target)
    # Positions of the scores from the highest to the lowest one.
    order = torch.argsort(output, dim=-1, descending=True)
    ranked_target = target.gather(dim=-1, index=order)
    if return_indices:
        return PrepareTargetResult(ranked_target, order)
    return ranked_target


def nan_to_num(tensor: torch.Tensor, nan: float = 0.0) -> torch.Tensor:
    """Return a copy of ``tensor`` where NaN and +/-inf entries are replaced by ``nan``."""
    replacement = torch.full_like(tensor, fill_value=nan)
    # isfinite is False exactly for NaN and the two infinities.
    return torch.where(torch.isfinite(tensor), tensor, replacement)

In [None]:
output, target = torch.randn(size=(10, 13)), torch.randint(low=0, high=2, size=(10, 13)).float()

### Precision

In [None]:
def precision(output: torch.Tensor, target: torch.Tensor, topk: int) -> torch.Tensor:
    """Precision@k averaged over users.

    Args:
        output: scores, one row per user.
        target: binary relevance labels, same size as ``output``.
        topk: cut-off; capped by the number of items.

    Returns:
        Scalar tensor: the mean share of relevant items among each user's top-k.
    """
    topk = min(output.size(-1), topk)
    # target_sorted_by_output ~ (users, items)
    target_sorted_by_output = prepare_target(output, target)
    # hits ~ (users,): number of relevant items among the first topk positions
    hits = target_sorted_by_output[..., :topk].sum(dim=-1)
    return (hits / topk).mean()

In [None]:
precision(output, target, topk=3)

### Recall

In [None]:
def recall(output: torch.Tensor, target: torch.Tensor, topk: int) -> torch.Tensor:
    """Recall@k averaged over users.

    Args:
        output: scores, one row per user.
        target: binary relevance labels, same size as ``output``.
        topk: cut-off; capped by the number of items.

    Returns:
        Scalar tensor: the mean share of each user's relevant items that made
        it into the top-k. Users without relevant items contribute 0.
    """
    topk = min(output.shape[-1], topk)
    # target_sorted_by_output ~ (users, items)
    target_sorted_by_output = prepare_target(output, target)
    # hits ~ (users,): number of relevant items among the first topk positions
    hits = target_sorted_by_output[..., :topk].sum(dim=-1)
    # clamp avoids 0 / 0 for users without relevant items (their hits are 0 as well)
    num_relevant = target.sum(dim=-1).clamp(min=1)
    return (hits / num_relevant).mean()

In [None]:
recall(output, target, topk=3)

### Mean (Normalized) Average Precision

In [None]:
def mnap(output: torch.Tensor, target: torch.Tensor, topk: int, normalized: bool = True) -> torch.Tensor:
    """Mean (normalized) average precision at k.

    For every user the precision@i is summed over the relevant positions
    ``i <= topk`` and divided by

    * ``min(topk, number of relevant items)`` when ``normalized`` is set, so a
      perfect ranking scores 1;
    * ``topk`` otherwise.

    Args:
        output: scores, one row per user.
        target: binary relevance labels, same size as ``output``.
        topk: cut-off; capped by the number of items.
        normalized: choose the normalization described above.

    Returns:
        Scalar tensor: the mean over users. Users without relevant items
        contribute 0.
    """
    topk = min(output.size(-1), topk)
    # target_sorted_by_output ~ (users, items)
    target_sorted_by_output = prepare_target(output, target)
    # top_hits ~ (users, topk)
    top_hits = target_sorted_by_output[..., :topk]
    positions = torch.arange(1, topk + 1, dtype=top_hits.dtype, device=top_hits.device)
    # precision_at_i ~ (users, topk): precision of the first i positions
    precision_at_i = top_hits.cumsum(dim=-1) / positions
    # Only the positions that hold a relevant item contribute to the sum.
    precision_sum = (precision_at_i * top_hits).sum(dim=-1)
    if normalized:
        # clamp(min=1) avoids 0 / 0 for users without relevant items
        denominator = target.sum(dim=-1).clamp(min=1, max=topk)
    else:
        denominator = topk
    return (precision_sum / denominator).mean()

In [None]:
mnap(output, target, topk=3)

### Normalized Discounted Cumulative Gain

In [None]:
def dcg(tensor: torch.Tensor) -> torch.Tensor:
    """Per-position discounted gains of ``tensor`` (not summed).

    Position ``p`` (0-based, along the last dimension) gets the gain
    ``2 ** value - 1`` multiplied by the discount ``1 / log2(p + 2)``.
    """
    num_positions = tensor.size(-1)
    positions = torch.arange(0, num_positions, dtype=torch.float, device=tensor.device)
    discounts = 1 / torch.log2(positions + 2.0)
    gains = (2**tensor) - 1
    return gains * discounts


def ndcg(output: torch.Tensor, target: torch.Tensor, topk: int) -> torch.Tensor:
    """Normalized discounted cumulative gain at k, averaged over users.

    Args:
        output: scores, one row per user.
        target: binary relevance labels, same size as ``output``.
        topk: cut-off; capped by the number of items.

    Returns:
        Scalar tensor: the mean ratio between the DCG@k of the predicted order
        and the DCG@k of the ideal order. Users without relevant items
        contribute 0.
    """
    topk = min(output.size(-1), topk)
    # target_sorted_by_output ~ (users, items)
    target_sorted_by_output = prepare_target(output, target)
    # Sorting the target by itself puts every relevant item first: the ideal order.
    ideal_target = prepare_target(target, target)
    # dcg_at_k, ideal_dcg_at_k ~ (users,)
    dcg_at_k = dcg(target_sorted_by_output)[..., :topk].sum(dim=-1)
    ideal_dcg_at_k = dcg(ideal_target)[..., :topk].sum(dim=-1)
    # clamp avoids 0 / 0 for users without relevant items (their DCG is 0 as well)
    return (dcg_at_k / ideal_dcg_at_k.clamp(min=1e-13)).mean()

In [None]:
ndcg(output, target, topk=3)

## LESSS GOOOOOO

### Top Popular

In [None]:
runner = SparseRunner(
    model=TopPopularModel(),
    batch_size=512,
    callbacks={
        "precision": MetricCallback(precision, input_key="logits", topk=(1, 3, 10)),
        "recall": MetricCallback(recall, input_key="logits", topk=(1, 3, 10)),
        "ndcg": MetricCallback(ndcg, input_key="logits", topk=(1, 3, 10)),
        "map": MetricCallback(mnap, input_key="logits", topk=(1, 3, 10)),
        "logger": LoggerCallback(),
    },
    target_threshold=3.0,
    device=device,
)
runner.run(explicit_datasets)

In [None]:
runner = SparseRunner(
    model=TopPopularModel(),
    batch_size=512,
    callbacks={
        "precision": MetricCallback(precision, input_key="logits", topk=(1, 3, 10)),
        "recall": MetricCallback(recall, input_key="logits", topk=(1, 3, 10)),
        "ndcg": MetricCallback(ndcg, input_key="logits", topk=(1, 3, 10)),
        "map": MetricCallback(mnap, input_key="logits", topk=(1, 3, 10)),
        "logger": LoggerCallback(),
    },
    device=device,
)
runner.run(implicit_datasets)

### Item KNN

In [None]:
runner = SparseRunner(
    model=KNNModel("cosine", topk=100),
    batch_size=512,
    callbacks={
        "precision": MetricCallback(precision, input_key="logits", topk=(1, 3, 10)),
        "recall": MetricCallback(recall, input_key="logits", topk=(1, 3, 10)),
        "ndcg": MetricCallback(ndcg, input_key="logits", topk=(1, 3, 10)),
        "map": MetricCallback(mnap, input_key="logits", topk=(1, 3, 10)),
        "logger": LoggerCallback(),
    },
    device=device,
)
runner.run(implicit_datasets)

In [None]:
# Explicit ratings: only ratings above 3.0 count as relevant, the same threshold
# as in the Top Popular run on explicit data. With the default threshold of 0.0
# every rated item, including disliked ones, would be treated as a positive.
runner = SparseRunner(
    model=KNNModel("cosine", topk=100),
    batch_size=512,
    callbacks={
        "precision": MetricCallback(precision, input_key="logits", topk=(1, 3, 10)),
        "recall": MetricCallback(recall, input_key="logits", topk=(1, 3, 10)),
        "ndcg": MetricCallback(ndcg, input_key="logits", topk=(1, 3, 10)),
        "map": MetricCallback(mnap, input_key="logits", topk=(1, 3, 10)),
        "logger": LoggerCallback(),
    },
    target_threshold=3.0,
    device=device,
)
runner.run(explicit_datasets)

### EASE

In [None]:
runner = SparseRunner(
    model=EASE(),
    batch_size=512,
    callbacks={
        "precision": MetricCallback(precision, input_key="logits", topk=(1, 3, 10)),
        "recall": MetricCallback(recall, input_key="logits", topk=(1, 3, 10)),
        "ndcg": MetricCallback(ndcg, input_key="logits", topk=(1, 3, 10)),
        "map": MetricCallback(mnap, input_key="logits", topk=(1, 3, 10)),
        "logger": LoggerCallback(),
    },
    device=device,
)
runner.run(implicit_datasets)

### SVD

In [None]:
# Explicit ratings: only ratings above 3.0 count as relevant, the same threshold
# as in the Top Popular run on explicit data. With the default threshold of 0.0
# every rated item, including disliked ones, would be treated as a positive.
runner = SparseRunner(
    model=SvdModel(factors=32),
    batch_size=512,
    callbacks={
        "precision": MetricCallback(precision, input_key="logits", topk=(1, 3, 10)),
        "recall": MetricCallback(recall, input_key="logits", topk=(1, 3, 10)),
        "ndcg": MetricCallback(ndcg, input_key="logits", topk=(1, 3, 10)),
        "map": MetricCallback(mnap, input_key="logits", topk=(1, 3, 10)),
        "logger": LoggerCallback(),
    },
    target_threshold=3.0,
    device=device,
)
runner.run(explicit_datasets)

In [None]:
runner = SparseRunner(
    model=SvdModel(factors=32),
    batch_size=512,
    callbacks={
        "precision": MetricCallback(precision, input_key="logits", topk=(1, 3, 10)),
        "recall": MetricCallback(recall, input_key="logits", topk=(1, 3, 10)),
        "ndcg": MetricCallback(ndcg, input_key="logits", topk=(1, 3, 10)),
        "map": MetricCallback(mnap, input_key="logits", topk=(1, 3, 10)),
        "logger": LoggerCallback(),
    },
    device=device,
)
runner.run(implicit_datasets)