Merge branch '32-ndcg' into 'master'

Resolve "Proper performance metrics". Closes #32. See merge request recommend.games/board-game-recommender!25
recommend-games · Apr 30, 2023 · 0aa1de1 · 0aa1de1
2 parents c712f61 + 399c36a
commit 0aa1de1
Show file tree

Hide file tree

Showing 12 changed files with 1,960 additions and 1,184 deletions.
diff --git a/.gitignore b/.gitignore
@@ -106,5 +106,6 @@ recommender/
 .tc*/
 .bga*
 .bgg*
+*.csv
 *.ipynb
 *.npz
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -5,16 +5,20 @@ repos:
   - id: check-yaml
   - id: end-of-file-fixer
   - id: trailing-whitespace
+    exclude: ^notebooks/
 - repo: https://github.com/asottile/pyupgrade
   rev: v3.3.1
   hooks:
   - id: pyupgrade
+    exclude: ^notebooks/
 - repo: https://github.com/psf/black
   rev: '23.3.0'
   hooks:
   - id: black
+    exclude: ^notebooks/
 - repo: https://github.com/pycqa/isort
   rev: '5.12.0'
   hooks:
   - id: isort
+    exclude: ^notebooks/
     args: ["--profile", "black", "--filter-files"]
diff --git a/Pipfile b/Pipfile
@@ -7,10 +7,14 @@ verify_ssl = true
 python_version = "3.8"
 
 [packages]
+matplotlib = "*"
 numpy = "*"
 pandas = "*"
+polars = "*"
 pytility = "*"
+scikit-learn = "*"
 scipy = "*"
+seaborn = "*"
 turicreate = "*"
 
 [dev-packages]
@@ -21,6 +25,7 @@ jupyterlab = "*"
 jupytext = "*"
 mypy = "*"
 nb-black = "*"
+pandas-stubs = "*"
 pre-commit = "*"
 pylint = "*"
 twine = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/board_game_recommender/base.py b/board_game_recommender/base.py
@@ -3,6 +3,8 @@
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, AbstractSet, Generic, Iterable, TypeVar, Union
 
+import numpy as np
+
 GameKeyType = TypeVar("GameKeyType")
 UserKeyType = TypeVar("UserKeyType")
 
@@ -49,6 +51,14 @@ def recommend(
     ) -> DataFrame:
         """Recommend games for given users."""
 
+    @abstractmethod
+    def recommend_as_numpy(
+        self: "BaseGamesRecommender",
+        users: Iterable[UserKeyType],
+        games: Iterable[GameKeyType],
+    ) -> np.ndarray:
+        """Recommend games for given users and games as a numpy array.
+
+        Unlike recommend(), the result is a plain array of scores restricted
+        to the requested users and games.
+
+        NOTE(review): the concrete implementations and the evaluation code
+        appear to treat the result as a 2-D array with one row per user and
+        one column per game, in the order given -- confirm this contract
+        when adding a new subclass.
+        """
+
     @abstractmethod
     def recommend_similar(
         self: "BaseGamesRecommender",

diff --git a/board_game_recommender/evaluation.py b/board_game_recommender/evaluation.py
@@ -0,0 +1,111 @@
+"""Evaluate recommender models."""
+
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, Iterable, Tuple, Union
+
+import numpy as np
+import polars as pl
+from sklearn.metrics import ndcg_score
+
+from board_game_recommender.base import BaseGamesRecommender
+
+LOGGER = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class RecommenderTestData:
+    """Test data for recommender model evaluation.
+
+    Row i of game_ids and ratings holds the data of user_ids[i].
+    """
+
+    # One entry per test user.
+    user_ids: Tuple[str, ...]
+    # IDs of the games rated by each user; load_test_data() builds this with
+    # shape (num_users, ratings_per_user).
+    game_ids: np.ndarray
+    # Ratings aligned element-wise with game_ids (same shape).
+    ratings: np.ndarray
+
+
+def load_test_data(
+    path: Union[str, Path],
+    ratings_per_user: int,
+    user_id_key: str = "bgg_user_name",
+    game_id_key: str = "bgg_id",
+    ratings_key: str = "bgg_user_rating",
+) -> RecommenderTestData:
+    """Load RecommenderTestData from CSV.
+
+    The file must contain exactly ratings_per_user consecutive rows for
+    every user, i.e. the rows are grouped by user.
+
+    Args:
+        path: Location of the CSV file.
+        ratings_per_user: Number of rows (ratings) each user contributes.
+        user_id_key: Name of the column holding the user IDs.
+        game_id_key: Name of the column holding the game IDs.
+        ratings_key: Name of the column holding the ratings.
+
+    Returns:
+        The test data with game_ids and ratings reshaped to
+        (num_users, ratings_per_user).
+
+    Raises:
+        ValueError: If ratings_per_user is not positive, if the number of
+            rows is not a multiple of it, or if a chunk of ratings_per_user
+            rows mixes several users.
+    """
+
+    # Fail early with a clear message instead of a ZeroDivisionError below.
+    if ratings_per_user <= 0:
+        raise ValueError(
+            f"The number of ratings per user ({ratings_per_user}) must be positive"
+        )
+
+    path = Path(path).resolve()
+    LOGGER.info("Loading test data from <%s>…", path)
+
+    data = pl.read_csv(path)
+    LOGGER.info("Read %d rows", len(data))
+
+    if len(data) % ratings_per_user != 0:
+        raise ValueError(
+            f"The number of rows ({len(data)}) is not divisible by "
+            + f"the number of ratings per user ({ratings_per_user})"
+        )
+
+    shape = (-1, ratings_per_user)
+
+    # Every chunk of ratings_per_user rows must belong to a single user,
+    # otherwise ratings would silently be attributed to the wrong user.
+    users = np.array(data[user_id_key].to_list(), dtype=object).reshape(shape)
+    if not (users == users[:, :1]).all():
+        raise ValueError(
+            f"Rows are not grouped into blocks of {ratings_per_user} "
+            + f"consecutive ratings per user (column <{user_id_key}>)"
+        )
+
+    user_ids = tuple(users[:, 0].tolist())
+    # to_numpy() rather than view(): the latter ignores missing values.
+    game_ids = data[game_id_key].to_numpy().reshape(shape)
+    ratings = data[ratings_key].to_numpy().reshape(shape)
+
+    return RecommenderTestData(user_ids=user_ids, game_ids=game_ids, ratings=ratings)
+
+
+@dataclass(frozen=True)
+class RecommenderMetrics:
+    """Recommender model evaluation metrics."""
+
+    # NDCG@k keyed by k, using the raw ratings as relevance.
+    ndcg: Dict[int, float]
+    # NDCG@k keyed by k, using the exponential gain 2**rating - 1 as relevance.
+    ndcg_exp: Dict[int, float]
+
+
+def calculate_metrics(
+    recommender: BaseGamesRecommender,
+    test_data: RecommenderTestData,
+    *,
+    k_values: Union[None, int, Iterable[int]] = None,
+) -> RecommenderMetrics:
+    """Calculate RecommenderMetrics for given recommender model and RecommenderTestData.
+
+    Args:
+        recommender: Model whose recommend_as_numpy() scores are evaluated.
+        test_data: Held-out users, games and their true ratings.
+        k_values: Cut-off(s) k for NDCG@k. The total number of ratings per
+            user is always evaluated in addition; None evaluates only that.
+
+    Returns:
+        NDCG@k for every cut-off, once with the ratings as relevance and
+        once with the exponential gain 2**rating - 1.
+
+    Raises:
+        ValueError: If the predictions do not have the shape of the ratings.
+    """
+
+    y_true = test_data.ratings
+    # Each user has their own set of games, hence one model call per user.
+    y_pred = np.array(
+        [
+            recommender.recommend_as_numpy(users=(user,), games=games)[0, :]
+            for user, games in zip(test_data.user_ids, test_data.game_ids)
+        ]
+    )
+
+    if y_true.shape != y_pred.shape:
+        raise ValueError(
+            f"Shape of ratings ({y_true.shape}) does not match "
+            + f"shape of predictions ({y_pred.shape})"
+        )
+
+    if k_values is None:
+        requested = frozenset()
+    elif isinstance(k_values, int):
+        requested = frozenset({k_values})
+    else:
+        requested = frozenset(k_values)
+
+    # Always include the full list length so the overall NDCG is reported.
+    cutoffs = sorted(requested | {y_true.shape[-1]})
+
+    def ndcg_at_cutoffs(relevance: np.ndarray) -> Dict[int, float]:
+        """NDCG@k of the predictions for every cut-off k."""
+        return {
+            k: ndcg_score(y_true=relevance, y_score=y_pred, k=k) for k in cutoffs
+        }
+
+    return RecommenderMetrics(
+        ndcg=ndcg_at_cutoffs(y_true),
+        ndcg_exp=ndcg_at_cutoffs(np.exp2(y_true) - 1),
+    )
diff --git a/board_game_recommender/light.py b/board_game_recommender/light.py
@@ -26,12 +26,12 @@ class CollaborativeFilteringData:
     """Labels, vectors and matrices for linear collaborative filtering models."""
 
     intercept: float
-    users_labels: np.ndarray
-    users_linear_terms: np.ndarray
-    users_factors: np.ndarray
-    items_labels: np.ndarray
-    items_linear_terms: np.ndarray
-    items_factors: np.ndarray
+    users_labels: np.ndarray  # (num_users,)
+    users_linear_terms: np.ndarray  # (num_users,)
+    users_factors: np.ndarray  # (num_users, num_factors)
+    items_labels: np.ndarray  # (num_items,)
+    items_linear_terms: np.ndarray  # (num_items,)
+    items_factors: np.ndarray  # (num_factors, num_items)
 
     def to_npz(self: "CollaborativeFilteringData", file_path: Union[Path, str]) -> None:
         """Save data into an .npz file."""
@@ -141,6 +141,38 @@ def known_users(self: "LightGamesRecommender") -> FrozenSet[str]:
     def num_users(self: "LightGamesRecommender") -> int:
         return len(self.users_labels)
 
+    def _recommendation_scores(
+        self: "LightGamesRecommender",
+        users: Optional[List[str]] = None,
+        games: Optional[List[int]] = None,
+    ) -> np.ndarray:
+        """Calculate recommendations scores for certain users and games.
+
+        Args:
+            users: Users to score, in output row order. None selects all
+                users; an empty list yields zero rows.
+            games: Games to score, in output column order. None selects all
+                games; an empty list yields zero columns.
+
+        Returns:
+            Array of shape (num_users, num_games) holding the sum of the
+            factor product, both linear terms and the intercept.
+
+        Raises:
+            KeyError: If a requested user or game is unknown to the model.
+        """
+
+        # Compare against None instead of relying on truthiness: an empty
+        # selection must produce an empty result, not scores for everybody.
+        if users is not None:
+            # Explicit integer dtype keeps an empty selection a valid index.
+            user_ids = np.array(
+                [self.users_indexes[user] for user in users],
+                dtype=np.intp,
+            )
+            user_factors = self.users_factors[user_ids]
+            users_linear_terms = self.users_linear_terms[user_ids].reshape(-1, 1)
+        else:
+            user_factors = self.users_factors
+            users_linear_terms = self.users_linear_terms.reshape(-1, 1)
+
+        if games is not None:
+            # TODO Unknown games will cause a key error. Instead, use the user's
+            # average predicted rating (user + global bias) for unknown games. (#57)
+            game_ids = np.array(
+                [self.items_indexes[game] for game in games],
+                dtype=np.intp,
+            )
+            items_factors = self.items_factors[:, game_ids]
+            items_linear_terms = self.items_linear_terms[game_ids].reshape(1, -1)
+        else:
+            items_factors = self.items_factors
+            items_linear_terms = self.items_linear_terms.reshape(1, -1)
+
+        return (
+            user_factors @ items_factors  # (num_users, num_items)
+            + users_linear_terms  # (num_users, 1)
+            + items_linear_terms  # (1, num_items)
+            + self.intercept  # (1,)
+        )
+
     def recommend(
         self: "LightGamesRecommender",
         users: Iterable[str],
@@ -149,14 +181,7 @@ def recommend(
         """Calculate recommendations for certain users."""
 
         users = list(users)
-        user_ids = np.array([self.users_indexes[user] for user in users])
-
-        scores = (
-            self.users_factors[user_ids] @ self.items_factors
-            + self.users_linear_terms[user_ids].reshape(len(user_ids), 1)
-            + self.items_linear_terms
-            + self.intercept
-        )
+        scores = self._recommendation_scores(users=users)
 
         result = pd.DataFrame(
             index=self.items_labels,
@@ -173,6 +198,17 @@ def recommend(
 
         return result[pd.MultiIndex.from_product([users, ["score", "rank"]])]
 
+    def recommend_as_numpy(
+        self: "LightGamesRecommender",
+        users: Iterable[str],
+        games: Iterable[int],
+    ) -> np.ndarray:
+        """Score the given users against the given games as a numpy array.
+
+        Both iterables are materialised as lists and handed on to
+        _recommendation_scores(), whose result is returned unchanged.
+        """
+
+        return self._recommendation_scores(users=list(users), games=list(games))
+
     def recommend_similar(
         self: "LightGamesRecommender",
         games: Iterable[int],

diff --git a/board_game_recommender/recommend.py b/board_game_recommender/recommend.py
@@ -9,6 +9,7 @@
 # from datetime import date
 from typing import Any, Dict, FrozenSet, Iterable, Optional, Tuple, Type
 
+import numpy as np
 import turicreate as tc
 from pytility import arg_to_iter, clear_list
 
@@ -428,6 +429,45 @@ def recommend(
             ascending=ascending,
         )
 
+    def recommend_as_numpy(
+        self: "GamesRecommender",
+        users: Iterable[str],
+        games: Iterable[int],
+    ) -> np.ndarray:
+        """Calculate recommendations for certain users and games as a numpy array.
+
+        Returns the model's "score" column reshaped to
+        (number of users, number of games), with rows and columns sorted by
+        the position of each user and game in the input.
+        """
+
+        # Helper frames that attach each user's / game's input position, so
+        # the result can be sorted back into the caller's order below.
+        users = list(users)
+        users_sf = tc.SFrame(
+            {
+                self.user_id_field: users,
+                "sort_users": range(len(users)),
+            }
+        )
+
+        games = list(games)
+        games_sf = tc.SFrame(
+            {
+                self.id_field: games,
+                "sort_games": range(len(games)),
+            }
+        )
+
+        # Ask for as many recommendations per user as there are games and
+        # keep already known items (exclude_known=False).
+        recommendations = self.model.recommend(
+            users=users,
+            items=games,
+            exclude_known=False,
+            k=len(games),
+        )
+
+        # The reshape below needs exactly one score per (user, game) pair.
+        # NOTE(review): assert statements are skipped under `python -O`;
+        # consider raising an explicit exception instead.
+        assert len(recommendations) == len(users) * len(games)
+
+        # NOTE(review): the joins name no key columns, so they presumably
+        # match on the shared user ID / game ID columns -- confirm against
+        # the turicreate SFrame.join documentation.
+        result = (
+            recommendations.join(users_sf)
+            .join(games_sf)
+            .sort(["sort_users", "sort_games"])
+        )
+        return result["score"].to_numpy().reshape(len(users), len(games))
+
     def recommend_similar(
         self: "GamesRecommender",
         games: Iterable[GameKeyType],

diff --git a/notebooks/ndcg_train_test.py b/notebooks/ndcg_train_test.py
@@ -0,0 +1,57 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.14.5
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---
+
+# %%
+import polars as pl
+
+# %load_ext nb_black
+# %load_ext lab_black
+
+# %%
+# Users with at least this many ratings contribute rows to the test set.
+THRESHOLD_POWER_USERS = 200
+# Number of ratings held out as test rows for each of those users.
+NUM_LABELS = 100
+
+# %%
+# Read all non-null ratings lazily and flag the test rows: for every user
+# with at least THRESHOLD_POWER_USERS ratings, the row positions are shuffled
+# within the user and the rows drawing a position below NUM_LABELS are
+# flagged, i.e. NUM_LABELS random ratings per such user.
+# NOTE(review): the shuffle is not seeded, so the split differs between runs.
+ratings = (
+    pl.scan_ndjson("../../board-game-data/scraped/bgg_RatingItem.jl")
+    .filter(pl.col("bgg_user_rating").is_not_null())
+    .select(
+        "bgg_id",
+        "bgg_user_name",
+        "bgg_user_rating",
+        (
+            (pl.col("bgg_id").count().over("bgg_user_name") >= THRESHOLD_POWER_USERS)
+            & (pl.arange(0, pl.count()).shuffle().over("bgg_user_name") < NUM_LABELS)
+        ).alias("is_test_row"),
+    )
+    .collect()
+)
+
+# %%
+# Split on the flag, drop the helper column and sort both parts by user and
+# game, which keeps every user's rows contiguous in the written files.
+train_test = ratings.partition_by(
+    "is_test_row",
+    as_dict=True,
+)
+data_train = train_test[False]
+data_train.drop_in_place("is_test_row")
+data_train = data_train.sort("bgg_user_name", "bgg_id")
+data_test = train_test[True]
+data_test.drop_in_place("is_test_row")
+data_test = data_test.sort("bgg_user_name", "bgg_id")
+data_train.shape, data_test.shape
+
+# %%
+# Write both splits as CSV into the current working directory.
+data_train.write_csv("ratings_train.csv")
+data_test.write_csv("ratings_test.csv")
diff --git a/notebooks/rankings.py b/notebooks/rankings.py
@@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.2
+#       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: Python 3 (ipykernel)
 #     language: python