In [2]:
pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp312-cp312-linux_x86_64.whl size=2610404 sha256=df49673af5a9a74faf37b1960e72b0b3a63d3fbe96b1db39804ebfd24dd604c8
  Stored in directory: /root/.cache/pip/wheels/75/fa/bc/739bc2cb1fbaab6061854e6cfbb81a0ae52c92a502a7fa454b
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succes

In [9]:
pip install numpy==1.26.4

Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m116.9 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 

In [7]:
pip install --upgrade --no-deps scikit-surprise



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
import pandas as pd
import numpy as np
from typing import Optional, Iterable, List, Tuple

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix

from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

import requests


# ---------- Content model ----------
class MetadataIndexer:
    def __init__(
        self,
        meta_cols=("keywords", "cast", "director", "genres", "original_language"),
        ngram_range=(1, 2),
        min_df=1,
        max_features=None,
        stop_words="english",
        weights=None,
    ):
        self.meta_cols = meta_cols
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.max_features = max_features
        self.stop_words = stop_words
        self.weights = weights or {}

        self.vectorizer = None
        self.matrix = None
        self.df = None
        self.titles = None
        self.indices = None


    def _ensure_tokens(self, value):
        if value is None or (isinstance(value, float) and pd.isna(value)):
            return []
        if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
            out = []
            for v in value or []:
                s = str(v).strip()
                if s:
                    out.append(s)
            return out
        s = str(value).strip()
        return [] if not s else s.split()

    def _build_soup(self, df: pd.DataFrame) -> pd.Series:
        cols = [c for c in self.meta_cols if c in df.columns]
        if not cols:
            raise ValueError(f"No valid metadata columns found among {self.meta_cols}")

        def row_to_soup(row):
            tokens = []
            for c in cols:
                toks = self._ensure_tokens(row[c])
                w = float(self.weights.get(c, 1.0))
                if w <= 0 or not toks:
                    continue
                reps = max(1, int(round(w)))
                for _ in range(reps):
                    tokens.extend(toks)
            return " ".join(tokens) if tokens else ""

        return df.apply(row_to_soup, axis=1)

    def fit(self, df: pd.DataFrame):
        if "title" not in df.columns:
            raise ValueError("DataFrame must contain a 'title' column.")
        self.df = df.reset_index(drop=True).copy()
        self.titles = self.df["title"]
        self.indices = pd.Series(self.df.index, index=self.titles).drop_duplicates()

        soup = self._build_soup(self.df)
        self.vectorizer = CountVectorizer(
            analyzer="word",
            ngram_range=self.ngram_range,
            min_df=self.min_df,
            max_features=self.max_features,
            stop_words=self.stop_words,
        )
        self.matrix = self.vectorizer.fit_transform(soup)
        # Optional: cosine behavior
        self.matrix = normalize(self.matrix)
        return self

    def _scores_for_index(self, idx: int) -> np.ndarray:
        sims = linear_kernel(self.matrix[idx], self.matrix).ravel()
        return sims


# ---------- Collaborative model ----------
class CollaborativeRecommender:
    def __init__(self, algo=None, rating_scale: Optional[Tuple[float, float]] = None, random_state: int = 42):
        self.algo = algo if algo is not None else SVD(random_state=random_state)
        self.rating_scale = rating_scale
        self.reader = None
        self.trainset = None
        self._ratings_df = None
        self._user_seen = {}

    def fit(self, ratings: pd.DataFrame):
        req = {"userId","movieId","rating"}
        if not req.issubset(ratings.columns):
            raise ValueError(f"ratings must have columns {sorted(req)}")
        self._ratings_df = ratings.copy()
        if self.rating_scale is None:
            self.rating_scale = (float(ratings["rating"].min()), float(ratings["rating"].max()))
        self.reader = Reader(rating_scale=self.rating_scale)
        data = Dataset.load_from_df(self._ratings_df[["userId","movieId","rating"]], self.reader)
        self.trainset = data.build_full_trainset()
        self.algo.fit(self.trainset)
        self._user_seen = (
            self._ratings_df.groupby("userId")["movieId"]
            .apply(lambda s: set(s.tolist()))
            .to_dict()
        )
        return self

    def predict(self, user_id, movie_id):
        if self.trainset is None:
            raise RuntimeError("Call fit(...) before predict().")
        return self.algo.predict(uid=user_id, iid=movie_id, r_ui=None, verbose=False)


# ---------- Unified Hybrid + helpers ----------
class HybridUnified:
    """
    Owns both the content and collaborative models, plus utility helpers.
    - fit(smd, ratings, id_map)
    - recommend(user_id, title, ...)
    - get_movie_info(title)
    - api_poster(tmdb_id)
    """

    def __init__(
        self,
        meta_kwargs: dict = None,
        collab_kwargs: dict = None,
        alpha: float = 0.6,
        tmdb_api_key: Optional[str] = None,
        poster_placeholder: str = "data/No-Image-Placeholder.png",
    ):
        self.meta = MetadataIndexer(**(meta_kwargs or {}))
        self.collab = CollaborativeRecommender(**(collab_kwargs or {}))
        self.alpha = float(alpha)

        self.smd = None
        self.id_map = None                         # ['movieId','id']
        self.indices_map_by_id = None              # id -> movieId

        self.tmdb_api_key = tmdb_api_key
        self.poster_placeholder = poster_placeholder
        self.load_datasets()

    def load_datasets(self, base="/content/drive/My Drive/data"):
        """Load raw CSVs and build self.metadata (for content), self.ratings (for collab), and self.id_map."""
        # raw files
        self.movies      = pd.read_csv(f"{base}/CleanMoviesMetadata.csv")
        self.credits     = pd.read_csv(f"{base}/CleanCredits.csv")
        self.keywords    = pd.read_csv(f"{base}/CleanKeywords.csv")
        self.links_small = pd.read_csv(f"{base}/links_small.csv")
        self.ratings     = pd.read_csv(f"{base}/ratings_small.csv")

        # ---- build id_map: keep BOTH columns, drop rows without tmdbId, cast to int
        id_map = (
            self.links_small.loc[self.links_small["tmdbId"].notna(), ["movieId", "tmdbId"]]
            .astype({"movieId": "int64", "tmdbId": "int64"})
            .rename(columns={"tmdbId": "id"})
        )
        self.id_map = id_map

        # ---- ensure movies.id is int and filter to ids that exist in links_small
        self.movies["id"] = pd.to_numeric(self.movies["id"], errors="coerce").astype("Int64")
        self.movies = self.movies[self.movies["id"].notna()].copy()
        self.movies["id"] = self.movies["id"].astype(int)
        self.movies = self.movies[self.movies["id"].isin(self.id_map["id"])].copy()

        # ---- build metadata for content model
        # (left joins in case keywords/credits are missing for some ids)
        self.metadata = (
            self.movies
            .merge(self.keywords, on="id", how="left")
            .merge(self.credits, on="id", how="left")
        )

        # OPTIONAL: if tokens are JSON strings, turn them into lists so MetadataIndexer works well
        for col in ["genres", "keywords", "cast", "director"]:
            if col in self.metadata.columns:
                self.metadata[col] = self.metadata[col].apply(
                    lambda x: x if isinstance(x, (list, tuple)) else ([] if pd.isna(x) else str(x).split())
                )
        return self

    def fit_from_loaded(self):
        """Use datasets loaded by load_datasets() to fit both sub-models."""
        if any(getattr(self, a, None) is None for a in ["metadata", "ratings", "id_map"]):
            raise RuntimeError("Call load_datasets() first.")
        # self.metadata -> content, self.ratings -> collab, self.id_map -> mapping
        return self.fit(smd=self.metadata,
                        ratings=self.ratings[["userId", "movieId", "rating"]],
                        id_map=self.id_map)

    # ---------- fitting ----------
    def fit(self, smd: pd.DataFrame, ratings: pd.DataFrame, id_map: pd.DataFrame):
        if "title" not in smd.columns or "id" not in smd.columns:
            raise ValueError("smd must include 'title' and 'id' (TMDb).")
        self.smd = smd
        self.meta.fit(smd)

        self.collab.fit(ratings)

        need = {"movieId","id"}
        if not need.issubset(id_map.columns):
            raise ValueError(f"id_map must include columns {sorted(need)}")
        self.id_map = id_map[["movieId","id"]].dropna().copy()
        self.indices_map_by_id = self.id_map.drop_duplicates("id").set_index("id")
        return self

    # ---------- hybrid recommend ----------
    def recommend(
        self,
        user_id: int,
        titles,                         # str OR List[str]
        k: int = 10,
        n_content: int = 200,
        include_meta: bool = True,
        genre_filter: Optional[List[str]] = None,
      ):
        # --- accept 1 or many titles ---
        if isinstance(titles, str):
            titles = [titles]

        missing = [t for t in titles if t not in self.meta.indices]
        if missing:
            raise KeyError(f"Titles not found in content index: {missing}")

        # --- build averaged content profile over the seed titles ---
        idxs = [int(self.meta.indices[t]) for t in titles]
        seed_vecs = self.meta.matrix[idxs]
        profile = seed_vecs.sum(axis=0) / len(idxs)
        profile = csr_matrix(profile)
        profile = normalize(profile)

        sims_vec = linear_kernel(profile, self.meta.matrix).ravel()

        # rank by content, exclude the seeds themselves
        order = np.argsort(-sims_vec)
        exclude_idx = set(idxs)
        order = [i for i in order if i not in exclude_idx][:n_content]

        # candidates: titles, tmdb ids, content sims
        cand_titles = self.meta.titles.iloc[order].tolist()
        cand_ids    = self.meta.df.iloc[order]["id"].astype(int).values
        cand_sims   = sims_vec[order].astype(float)

        cand_df = pd.DataFrame({"title": cand_titles, "id": cand_ids, "content_sim": cand_sims})

        # map TMDb id -> movieId
        mapped = cand_df.merge(
            self.indices_map_by_id[["movieId"]].reset_index(),
            on="id", how="left"
        ).dropna(subset=["movieId"]).drop_duplicates("movieId")

        if mapped.empty:
            return pd.DataFrame(columns=["title","movieId","id","content_sim","est","score"])

        mapped["movieId"] = mapped["movieId"].astype(int)

        # optional genre gate
        if genre_filter and "genres" in self.smd.columns:
            gmap = self.smd[["id","genres"]].drop_duplicates("id")
            mapped = mapped.merge(gmap, on="id", how="left")
            def has_any(gs):
                try:
                    s = set(gs or [])
                except Exception:
                    s = set(str(gs).split())
                return any(g in s for g in genre_filter)
            mapped = mapped[mapped["genres"].apply(has_any)].drop(columns=["genres"], errors="ignore")

        # collaborative re-rank
        mapped["est"] = mapped["movieId"].apply(lambda mid: self.collab.predict(user_id, int(mid)).est)

        # blend (normalize each to [0,1])
        s = mapped["content_sim"].to_numpy()
        e = mapped["est"].to_numpy()
        s = (s - s.min()) / (s.max() - s.min() + 1e-9)
        e = (e - e.min()) / (e.max() - e.min() + 1e-9)
        mapped["score"] = self.alpha * s + (1 - self.alpha) * e

        out = mapped.copy()
        if include_meta:
            keep = [c for c in ["id","title","vote_count","vote_average","year","overview","original_language"] if c in self.smd.columns]
            if keep:
                out = out.merge(self.smd[keep].drop_duplicates("id"), on=["id","title"], how="left")

        out = out.sort_values("score", ascending=False).head(k)
        cols = [c for c in ["title","movieId","id","content_sim","est","score","vote_count","vote_average","year"] if c in out.columns]
        return out[cols].reset_index(drop=True)

    @staticmethod
    def _lang_name(code: Optional[str]) -> str:
        m = {
            "en":"English","fr":"French","de":"German","es":"Spanish","it":"Italian",
            "pt":"Portuguese","ru":"Russian","zh":"Chinese","ja":"Japanese","ko":"Korean",
            "hi":"Hindi","ar":"Arabic","tr":"Turkish","fa":"Persian","nl":"Dutch",
        }
        if not code or not isinstance(code, str):
            return ""
        return m.get(code.lower(), code)

    def get_movie_info(self, movie_name: str) -> dict:
        """
        Look up by exact title in the metadata (self.smd) and return a small info dict.
        If duplicates exist, prefer the one with highest vote_count.
        """
        if self.smd is None:
            raise RuntimeError("Call fit(...) first so metadata is loaded.")
        recs = self.smd[self.smd["title"] == movie_name]
        if recs.empty:
            raise KeyError(f"Title not found: {movie_name}")

        if "vote_count" in recs.columns:
            record = recs.sort_values("vote_count", ascending=False).iloc[0]
        else:
            record = recs.iloc[0]

        info = {
            "title": record.get("title", movie_name),
            "overview": record.get("overview", ""),
            "language": self._lang_name(record.get("original_language", "")),
            "genres": record.get("genres", []),
            "id": int(record.get("id")) if pd.notna(record.get("id")) else None,
            "year": int(record.get("year")) if pd.notna(record.get("year")) else None,
        }
        return info

    def api_poster(self, tmdb_id: int) -> str:
        """
        Fetch poster URL from TMDb for a given TMDb movie id.
        Returns a full https://image.tmdb.org/t/p/w500/... URL or a local placeholder.
        """
        if not tmdb_id or self.tmdb_api_key is None:
            return self.poster_placeholder

        url = f"https://api.themoviedb.org/3/movie/{int(tmdb_id)}"
        params = {"api_key": self.tmdb_api_key, "language": "en-US"}
        headers = {"accept": "application/json"}
        try:
            r = requests.get(url, params=params, headers=headers, timeout=10)
            data = r.json() if r.ok else {}
        except Exception:
            return self.poster_placeholder

        poster_path = data.get("poster_path")
        if poster_path and str(poster_path).strip().lower() != "nan":
            return f"https://image.tmdb.org/t/p/w500/{poster_path.lstrip('/')}"
        return self.poster_placeholder


In [34]:
hyb = HybridUnified(
    meta_kwargs={"weights": {"keywords":3.0, "genres":2.0, "director":0.8, "cast":0.6}},
    collab_kwargs={"algo": SVD(random_state=42)},
    alpha=0.65,
)

hyb.fit_from_loaded()    # trains both models and sets up the id mapping

hyb.recommend(
    user_id=1,
    titles=["The Godfather", "Pulp Fiction", "Fight Club"],
    k=10,
    n_content=400,
    genre_filter=None
)

# movie info & poster:
info = hyb.get_movie_info("The Godfather")
poster = hyb.api_poster(info["id"])


In [35]:
hyb.recommend(
    user_id=1,
    titles=["The Godfather", "Pulp Fiction", "Fight Club"],
    k=10,
    n_content=400,
    genre_filter=None
)


Unnamed: 0,title,movieId,id,content_sim,est,score,vote_count,vote_average
0,Tucker: The Man and His Dream,4557,28176,0.192353,2.592226,0.778723,71.0,6.5
1,The Godfather: Part II,1221,240,0.15074,3.390681,0.706663,3418.0,8.3
2,Mitchell,8290,32303,0.174504,2.465665,0.64002,15.0,4.4
3,Cleanskin,95508,95516,0.159124,2.797688,0.621867,118.0,5.6
4,The Rainmaker,1672,11975,0.153076,2.901199,0.608516,239.0,6.7
5,Animal Kingdom,79720,44629,0.159124,2.633522,0.584138,240.0,6.7
6,The Paradine Case,2201,31667,0.155767,2.673866,0.572794,42.0,6.3
7,Feast of July,167,259209,0.157504,2.599111,0.566282,0.0,0.0
8,True Believer,4279,31863,0.148124,2.777145,0.549591,25.0,6.3
9,Se7en,47,807,0.135983,3.066676,0.541572,5915.0,8.1


In [31]:
info = hyb.get_movie_info("The Godfather")
info

{'title': 'The Godfather',
 'overview': 'Spanning the years 1945 to 1955, a chronicle of the fictional Italian-American Corleone crime family. When organized crime family patriarch, Vito Corleone barely survives an attempt on his life, his youngest son, Michael steps in to take care of the would-be killers, launching a campaign of bloody revenge.',
 'language': "['en']",
 'genres': ["['Drama',", "'Crime']"],
 'id': 238,
 'year': None}

In [32]:
poster

'data/No-Image-Placeholder.png'