## Import Libraries

In [6]:
pip install numpy==1.26.4

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m715.0 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m76.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.

In [1]:
pip install --upgrade --no-deps scikit-surprise



In [2]:
import pandas as pd
from typing import Iterable, Optional, List, Tuple
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

## Load Dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
ratings = pd.read_csv('/content/drive/My Drive/data/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


## Colabrative Model

In [5]:
class CollaborativeRecommender:
    """
    Ratings-only wrapper around Surprise.
    Expects a DataFrame with columns: ['userId','movieId','rating'].
    Returns movieIds (and estimated scores if requested).
    """

    def __init__(self, algo=None, rating_scale: Optional[Tuple[float, float]] = None, random_state: int = 42):
        self.algo = algo if algo is not None else SVD(random_state=random_state)
        self.rating_scale = rating_scale  # infer from data if None
        self.reader = None
        self.trainset = None
        self._ratings_df = None
        self._user_seen = {}  # userId -> set(movieId)

    def fit(self, ratings: pd.DataFrame):
        """Fit on full ratings set."""
        req = {"userId","movieId","rating"}
        if not req.issubset(ratings.columns):
            raise ValueError(f"ratings must have columns {sorted(req)}")

        self._ratings_df = ratings.copy()

        if self.rating_scale is None:
            self.rating_scale = (float(ratings["rating"].min()), float(ratings["rating"].max()))

        self.reader = Reader(rating_scale=self.rating_scale)
        data = Dataset.load_from_df(self._ratings_df[["userId","movieId","rating"]], self.reader)

        self.trainset = data.build_full_trainset()
        self.algo.fit(self.trainset)

        # cache items each user has rated
        self._user_seen = (
            self._ratings_df.groupby("userId")["movieId"]
            .apply(lambda s: set(s.tolist()))
            .to_dict()
        )
        return self

    def cross_validate(self, cv: int = 5, measures: List[str] = ["RMSE","MAE"], verbose: bool = False):
        """5-fold CV using the current algorithm configuration."""
        if self._ratings_df is None:
            raise RuntimeError("Call fit(...) first to provide ratings.")
        data = Dataset.load_from_df(self._ratings_df[["userId","movieId","rating"]],
                                    Reader(rating_scale=self.rating_scale))
        return cross_validate(self.algo, data, measures=measures, cv=cv, verbose=verbose, n_jobs=1)

    def predict(self, user_id, movie_id):
        """One-off predicted rating (Surprise Prediction object)."""
        if self.trainset is None:
            raise RuntimeError("Call fit(...) before predict().")
        return self.algo.predict(uid=user_id, iid=movie_id, r_ui=None, verbose=False)

    def _anti_items_for_user(self, user_id, restrict_to: Optional[Iterable] = None):
        """Items the user hasn't rated yet (as raw movieIds)."""
        all_items = set(self._ratings_df["movieId"].unique()) if restrict_to is None else set(restrict_to)
        seen = self._user_seen.get(user_id, set())
        return list(all_items - seen)

    def recommend(self, user_id, k: int = 10, filter_seen: bool = True,
                  candidates: Optional[Iterable] = None, include_scores: bool = False):
        """
        Top-k movieIds for a user.
        - filter_seen: exclude already-rated items
        - candidates: optional iterable of movieIds to score (e.g., only popular)
        - include_scores: include estimated ratings
        """
        if self.trainset is None:
            raise RuntimeError("Call fit(...) before recommend().")

        to_score = (self._anti_items_for_user(user_id, candidates) if filter_seen
                    else list(set(self._ratings_df["movieId"].unique()) if candidates is None else set(candidates)))

        if not to_score:
            return (pd.DataFrame(columns=["movieId","est"]) if include_scores
                    else pd.Series(dtype=int, name="movieId"))

        preds = [self.algo.predict(user_id, iid) for iid in to_score]
        preds.sort(key=lambda p: p.est, reverse=True)
        top = preds[:k]

        mids = [p.iid for p in top]
        if include_scores:
            ests = [round(p.est, 4) for p in top]
            return pd.DataFrame({"movieId": mids, "est": ests})

        return pd.Series(mids, name="movieId")

    def recommend_for_users(self, user_ids: Iterable, k: int = 10, include_scores: bool = False):
        """Batch convenience: dict[user_id -> Series/DataFrame]."""
        return {uid: self.recommend(uid, k=k, include_scores=include_scores) for uid in user_ids}


In [6]:
collab = CollaborativeRecommender(algo=SVD(random_state=42))

collab.fit(ratings)
cv_results = collab.cross_validate(cv=5, measures=["RMSE","MAE"], verbose=True)

ratings[ratings["userId"] == 1]

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8996  0.9005  0.8905  0.8933  0.8940  0.8956  0.0039  
MAE (testset)     0.6902  0.6927  0.6882  0.6875  0.6888  0.6895  0.0019  
Fit time          2.21    1.56    1.54    1.56    1.53    1.68    0.27    
Test time         0.32    0.12    0.25    0.12    0.24    0.21    0.08    


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [7]:
# Single prediction like your snippet
collab.predict(1, 302)

Prediction(uid=1, iid=302, r_ui=None, est=2.820855982000488, details={'was_impossible': False})

In [8]:
# Top-10 recs (movieIds only) for user 1
collab.recommend(1, k=10, include_scores=True)

Unnamed: 0,movieId,est
0,5618,3.8923
1,318,3.8638
2,858,3.7668
3,1136,3.7569
4,1259,3.7201
5,745,3.7043
6,898,3.6979
7,260,3.6716
8,593,3.6602
9,913,3.6426
