## Import Libraries

In [1]:
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
import numpy as np
import pandas as pd

## Load Dataset

In [5]:
# Colab-only step: attach Google Drive to this runtime so the CSV files
# read in the following cells are reachable under /content/drive.
from google.colab import drive

drive.mount(
    '/content/drive'
)

Mounted at /content/drive


In [6]:
# Read the cleaned movie metadata from the mounted Drive folder and
# preview the first rows as the cell output.
movies_csv = '/content/drive/My Drive/data/CleanMoviesMetadata.csv'
movies = pd.read_csv(movies_csv)
movies.head()

Unnamed: 0,adult,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,status,tagline,title,video,vote_average,vote_count,name_belongs_to_collection,id_belongs_to_collection,poster_path_belongs_to_collection,backdrop_path_belongs_to_collection
0,False,30000000,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,tt0114709,['en'],Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,Released,,Toy Story,False,7.7,5415.0,Toy Story Collection,10194.0,/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg,/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg
1,False,65000000,"['Adventure', 'Fantasy', 'Family']",,8844,tt0113497,['en'],Jumanji,When siblings Judy and Peter discover an encha...,17.015539,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,,,,
2,False,0,"['Romance', 'Comedy']",,15602,tt0113228,['en'],Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,Grumpy Old Men Collection,119050.0,/nLvUdqgPgm3F85NMCii9gVFUcet.jpg,/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg
3,False,16000000,"['Comedy', 'Drama', 'Romance']",,31357,tt0114885,['en'],Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,,,,
4,False,0,['Comedy'],,11862,tt0113041,['en'],Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,Father of the Bride Collection,96871.0,/nts4iOmNnq7GNicycMJ9pSAn204.jpg,/7qwE57OVZmMJChBpLEbJEmzUydk.jpg


In [7]:
# Keep only the tmdbId column of the links table, drop rows without a
# TMDB id, and cast the remaining ids to int. `links_small` ends up as a
# Series of ids (it is used as the membership list in the next cell).
links_csv = '/content/drive/My Drive/data/links.csv'
links_small = (
    pd.read_csv(links_csv)['tmdbId']
    .dropna()
    .astype('int')
)

In [8]:
# Cast the movie ids to int so they compare cleanly with the int ids in
# `links_small`, then keep only the movies whose id appears there.
movies = (
    movies
    .assign(id=movies['id'].astype('int'))
    .loc[lambda frame: frame['id'].isin(links_small)]
)
movies.shape

(9099, 27)

## Movie Description Based Recommender For 1 Movie

In [9]:
class ContentIndexer:
    """
    Build a TF-IDF content index over movie descriptions and
    return top-K similar titles given one input title.
    """
    def __init__(
        self,
        ngram_range=(1, 2),
        min_df=1,
        max_features=None,
        stop_words="english",
        text_cols=("overview", "tagline")
    ):
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.max_features = max_features
        self.stop_words = stop_words
        self.text_cols = text_cols

        self.vectorizer = None
        self.tfidf = None
        self.df = None
        self.titles = None
        self.indices = None

    def _build_text(self, df: pd.DataFrame) -> pd.Series:
        # make a clean description without mutating the original df
        cols = [c for c in self.text_cols if c in df.columns]
        parts = [df[c].fillna('') for c in cols]
        if not parts:
            raise ValueError(f"No valid text columns found among {self.text_cols}")
        # overview + tagline (or whatever was found)
        desc = parts[0]
        for p in parts[1:]:
            desc = desc + " " + p
        return desc.fillna('')

    def fit(self, df: pd.DataFrame):
        """
        Expects a DataFrame with at least: 'title' plus text_cols (overview/tagline).
        """
        if "title" not in df.columns:
            raise ValueError("DataFrame must contain a 'title' column.")

        self.df = df.reset_index(drop=True).copy()
        self.titles = self.df["title"]

        # if duplicate titles exist, keep the first index for mapping
        self.indices = pd.Series(self.df.index, index=self.titles).drop_duplicates()

        desc = self._build_text(self.df)

        self.vectorizer = TfidfVectorizer(
            analyzer="word",
            ngram_range=self.ngram_range,
            min_df=self.min_df,
            max_features=self.max_features,
            stop_words=self.stop_words
        )
        self.tfidf = self.vectorizer.fit_transform(desc)
        return self

    def _scores_for_index(self, idx: int) -> np.ndarray:
        # compute similarities on the fly: (1 x d) • (n x d)^T -> (1 x n)
        sims = linear_kernel(self.tfidf[idx], self.tfidf).ravel()
        return sims

    def recommend(self, title: str, k: int = 10, include_scores: bool = False) -> pd.DataFrame | pd.Series:
        """
        Return top-k similar titles to the given title (excluding the title itself).
        - If include_scores=True, returns a DataFrame with 'title' and 'score'.
        - Otherwise returns a Series of titles.
        """
        if self.indices is None or self.tfidf is None:
            raise RuntimeError("Call fit(df) before recommend().")

        if title not in self.indices:
            # helpful error with nearby suggestions
            # quick, lightweight candidates: case-insensitive contains
            ct = title.lower()
            candidates = self.titles[self.titles.str.lower().str.contains(ct, na=False)].unique()[:10]
            raise KeyError(
                f"Title '{title}' not found. "
                f"Did you mean one of: {list(candidates)}"
            )

        idx = int(self.indices[title])
        sims = self._scores_for_index(idx)

        # sort by similarity, drop itself
        order = np.argsort(-sims)
        order = order[order != idx]

        top_idx = order[:k]
        top_titles = self.titles.iloc[top_idx].reset_index(drop=True)
        if include_scores:
            top_scores = pd.Series(sims[top_idx]).reset_index(drop=True).round(6)
            return pd.DataFrame({"title": top_titles, "score": top_scores})
        return top_titles

    def recommend_by_index(self, idx: int, k: int = 10, include_scores: bool = False):
        """
        Same as recommend(), but starting from a row index instead of a title.
        Useful if you already looked up the index elsewhere.
        """
        if idx < 0 or idx >= self.tfidf.shape[0]:
            raise IndexError(f"idx must be in [0, {self.tfidf.shape[0]-1}]")
        sims = self._scores_for_index(idx)
        order = np.argsort(-sims)
        order = order[order != idx]
        top_idx = order[:k]
        top_titles = self.titles.iloc[top_idx].reset_index(drop=True)
        if include_scores:
            top_scores = pd.Series(sims[top_idx]).reset_index(drop=True).round(6)
            return pd.DataFrame({"title": top_titles, "score": top_scores})
        return top_titles

In [10]:
# Fit the single-title recommender on the filtered movie table.
ci = ContentIndexer(ngram_range=(1, 2), min_df=1, stop_words='english')
ci.fit(movies)

seed_title = "The Godfather"

# Titles only. This call is not the last expression of the cell, so its
# result is computed but not displayed.
ci.recommend(seed_title, k=10)

# Titles with similarity scores; this is the cell's displayed output.
ci.recommend(seed_title, k=10, include_scores=True)

Unnamed: 0,title,score
0,The Godfather: Part II,0.220061
1,The Family,0.100294
2,Made,0.067619
3,Johnny Dangerously,0.065622
4,Shanghai Triad,0.056143
5,Fury,0.056029
6,American Movie,0.05502
7,The Godfather: Part III,0.050235
8,8 Women,0.047508
9,Summer of Sam,0.045953


## Movie Description Based Recommender For a List of Movies

In [11]:
class ContentIndexer2:
    """
    Build a TF-IDF content index over movie descriptions and return the
    top-K titles most similar to a *list* of seed titles.

    The seed titles' TF-IDF rows are averaged into one profile vector, the
    profile is normalised, and every movie is scored by its dot product
    with that profile. A single title string is also accepted.

    Attributes set by ``fit``:
        df         -- copy of the input frame with a fresh 0..n-1 index
        titles     -- the ``title`` column of ``df``
        indices    -- Series mapping title -> row number (first row wins
                      when a title occurs more than once)
        vectorizer -- the fitted TfidfVectorizer
        tfidf      -- sparse (n_movies x n_terms) TF-IDF matrix
    """
    def __init__(
        self,
        ngram_range=(1, 2),
        min_df=1,
        max_features=None,
        stop_words="english",
        text_cols=("overview", "tagline")
    ):
        # Vectorizer settings; forwarded unchanged to TfidfVectorizer in fit().
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.max_features = max_features
        self.stop_words = stop_words
        # A bare string would otherwise be iterated character by character
        # in _build_text, so treat it as a single column name.
        self.text_cols = (text_cols,) if isinstance(text_cols, str) else text_cols

        # Populated by fit().
        self.vectorizer = None
        self.tfidf = None
        self.df = None
        self.titles = None
        self.indices = None

    def _build_text(self, df: pd.DataFrame) -> pd.Series:
        """
        Join the configured text columns that exist in ``df`` into one
        description string per row (space separated, missing values -> '').

        Raises:
            ValueError: if none of ``text_cols`` is a column of ``df``.
        """
        cols = [c for c in self.text_cols if c in df.columns]
        if not cols:
            raise ValueError(f"No valid text columns found among {self.text_cols}")
        # astype(str) keeps the concatenation (and the vectorizer) working
        # even if a column carries non-string values.
        parts = [df[c].fillna('').astype(str) for c in cols]
        desc = parts[0]
        for p in parts[1:]:
            desc = desc + " " + p
        return desc

    @staticmethod
    def _title_lookup(titles: pd.Series) -> pd.Series:
        """
        Map each title to its row number, keeping the first row for a
        repeated title.

        De-duplication must look at the *index* (the titles): the values are
        row numbers and are always unique, so ``Series.drop_duplicates()``
        would remove nothing and a repeated title would resolve to several
        rows instead of one.
        """
        lookup = pd.Series(np.arange(len(titles)), index=titles)
        return lookup[~lookup.index.duplicated(keep="first")]

    def _require_fitted(self) -> None:
        """Raise RuntimeError unless fit() has populated the index."""
        if self.indices is None or self.tfidf is None:
            raise RuntimeError("Call fit(df) before recommend().")

    def fit(self, df: pd.DataFrame):
        """
        Expects a DataFrame with at least: 'title' plus text_cols (overview/tagline).

        Builds the title lookup and the TF-IDF matrix, and returns ``self``
        so calls can be chained.

        Raises:
            ValueError: if ``df`` has no 'title' column or none of text_cols.
        """
        if "title" not in df.columns:
            raise ValueError("DataFrame must contain a 'title' column.")

        # Fresh positional index so row numbers line up with tfidf rows.
        self.df = df.reset_index(drop=True).copy()
        self.titles = self.df["title"]
        self.indices = self._title_lookup(self.titles)

        desc = self._build_text(self.df)

        self.vectorizer = TfidfVectorizer(
            analyzer="word",
            ngram_range=self.ngram_range,
            min_df=self.min_df,
            max_features=self.max_features,
            stop_words=self.stop_words
        )
        self.tfidf = self.vectorizer.fit_transform(desc)
        return self

    def _scores_for_index(self, idx: int) -> np.ndarray:
        """
        Dot product of row ``idx`` with every row of the TF-IDF matrix,
        flattened to shape (n,).
        """
        return linear_kernel(self.tfidf[idx], self.tfidf).ravel()

    def _top_k(self, sims: np.ndarray, exclude, k: int, include_scores: bool):
        """
        Rank rows by descending score, drop the ``exclude`` row(s), and
        return the first ``k`` as titles (optionally with scores).
        """
        if k < 0:
            raise ValueError("k must be non-negative.")
        # Stable sort: rows with equal scores keep their original row order,
        # so the output is deterministic.
        order = np.argsort(-sims, kind="stable")
        # Vectorised exclusion instead of a Python loop over every row.
        order = order[~np.isin(order, exclude)]

        top_idx = order[:k]
        top_titles = self.titles.iloc[top_idx].reset_index(drop=True)
        if include_scores:
            top_scores = pd.Series(sims[top_idx]).round(6)
            return pd.DataFrame({"title": top_titles, "score": top_scores})
        return top_titles

    def recommend(self, titles, k=10, include_scores=False):
        """
        Return the top-k titles most similar to the combined seed titles
        (the seeds themselves are excluded).

        Args:
            titles: one title string or an iterable of title strings.
            k: number of recommendations to return.
            include_scores: if True, return a DataFrame with 'title' and
                'score'; otherwise a Series of titles.

        Raises:
            RuntimeError: if fit() has not been called.
            ValueError: if ``titles`` is empty or ``k`` is negative.
            KeyError: if any seed title is unknown.
        """
        self._require_fitted()

        if isinstance(titles, str):
            titles = [titles]
        titles = list(titles)
        if not titles:
            # Without this the profile below would be a 0/0 average.
            raise ValueError("titles must contain at least one title.")

        missing = [t for t in titles if t not in self.indices]
        if missing:
            raise KeyError(f"Titles not found: {missing}")

        idxs = [int(self.indices[t]) for t in titles]
        seed_vecs = self.tfidf[idxs]

        # Average the seed rows into one profile row, then normalise it so
        # the scores do not depend on how many seeds were given.
        profile = seed_vecs.sum(axis=0) / len(idxs)
        profile = csr_matrix(profile)
        profile = normalize(profile)

        sims = linear_kernel(profile, self.tfidf).ravel()
        return self._top_k(sims, idxs, k, include_scores)

    def recommend_by_index(self, idx: int, k: int = 10, include_scores: bool = False):
        """
        Single-seed variant starting from a row index instead of titles.
        Useful if you already looked up the index elsewhere.

        Raises:
            RuntimeError: if fit() has not been called.
            IndexError: if ``idx`` is outside the fitted rows.
            ValueError: if ``k`` is negative.
        """
        self._require_fitted()
        if idx < 0 or idx >= self.tfidf.shape[0]:
            raise IndexError(f"idx must be in [0, {self.tfidf.shape[0]-1}]")
        sims = self._scores_for_index(idx)
        return self._top_k(sims, idx, k, include_scores)


In [12]:
# Fit the multi-title recommender on the filtered movie table.
ci = ContentIndexer2(ngram_range=(1, 2), min_df=1, stop_words='english')
ci.fit(movies)

seed_titles = ["The Godfather", "Pulp Fiction", "Fight Club"]

# Titles only. This call is not the last expression of the cell, so its
# result is computed but not displayed.
ci.recommend(seed_titles, k=10)

# Titles with similarity scores; this is the cell's displayed output.
ci.recommend(seed_titles, k=10, include_scores=True)


Unnamed: 0,title,score
0,The Godfather: Part II,0.132847
1,Delirious,0.066389
2,The Family,0.061664
3,Raging Bull,0.052633
4,Johnny Dangerously,0.048118
5,Made,0.048057
6,The Wackness,0.043206
7,Holding Trevor,0.042576
8,Run All Night,0.037486
9,Road to Perdition,0.037395
