## Import Libraris

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
from collections.abc import Iterable

## Load Dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
movies = pd.read_csv('/content/drive/My Drive/data/CleanMoviesMetadata.csv')
movies.head()

Unnamed: 0,adult,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,status,tagline,title,video,vote_average,vote_count,name_belongs_to_collection,id_belongs_to_collection,poster_path_belongs_to_collection,backdrop_path_belongs_to_collection
0,False,30000000,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,tt0114709,['en'],Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,Released,,Toy Story,False,7.7,5415.0,Toy Story Collection,10194.0,/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg,/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg
1,False,65000000,"['Adventure', 'Fantasy', 'Family']",,8844,tt0113497,['en'],Jumanji,When siblings Judy and Peter discover an encha...,17.015539,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,,,,
2,False,0,"['Romance', 'Comedy']",,15602,tt0113228,['en'],Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,Grumpy Old Men Collection,119050.0,/nLvUdqgPgm3F85NMCii9gVFUcet.jpg,/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg
3,False,16000000,"['Comedy', 'Drama', 'Romance']",,31357,tt0114885,['en'],Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,,,,
4,False,0,['Comedy'],,11862,tt0113041,['en'],Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,Father of the Bride Collection,96871.0,/nts4iOmNnq7GNicycMJ9pSAn204.jpg,/7qwE57OVZmMJChBpLEbJEmzUydk.jpg


In [4]:
links_small = pd.read_csv('/content/drive/My Drive/data/links.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [5]:
movies['id'] = movies['id'].astype('int')
movies = movies[movies['id'].isin(links_small)]
movies.shape

(9099, 27)

In [6]:
keywords = pd.read_csv('/content/drive/My Drive/data/CleanKeywords.csv')
keywords.head()

Unnamed: 0,id,keywords
0,862,"['jealousi', 'toy', 'boy', 'friendship', 'frie..."
1,8844,"['boardgam', 'disappear', ""basedonchildren'sbo..."
2,15602,"['fish', 'bestfriend', 'duringcreditssting', '..."
3,31357,"['basedonnovel', 'interracialrelationship', 's..."
4,11862,"['babi', 'midlifecrisi', 'confid', 'age', 'dau..."


In [7]:
credits = pd.read_csv('/content/drive/My Drive/data/CleanCredits.csv')
credits.head()

Unnamed: 0,cast,id,director
0,"['tomhanks', 'timallen', 'donrickles', 'jimvar...",862,"['johnlasseter', 'johnlasseter', 'johnlasseter']"
1,"['robinwilliams', 'jonathanhyde', 'kirstenduns...",8844,"['joejohnston', 'joejohnston', 'joejohnston']"
2,"['waltermatthau', 'jacklemmon', 'ann-margret',...",15602,"['howarddeutch', 'howarddeutch', 'howarddeutch']"
3,"['whitneyhouston', 'angelabassett', 'lorettade...",31357,"['forestwhitaker', 'forestwhitaker', 'forestwh..."
4,"['stevemartin', 'dianekeaton', 'martinshort', ...",11862,"['charlesshyer', 'charlesshyer', 'charlesshyer']"


In [8]:
metadata = movies.merge(keywords, on='id')
metadata = metadata.merge(credits, on='id')
metadata.head()

Unnamed: 0,adult,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,video,vote_average,vote_count,name_belongs_to_collection,id_belongs_to_collection,poster_path_belongs_to_collection,backdrop_path_belongs_to_collection,keywords,cast,director
0,False,30000000,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,tt0114709,['en'],Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,False,7.7,5415.0,Toy Story Collection,10194.0,/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg,/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg,"['jealousi', 'toy', 'boy', 'friendship', 'frie...","['tomhanks', 'timallen', 'donrickles', 'jimvar...","['johnlasseter', 'johnlasseter', 'johnlasseter']"
1,False,65000000,"['Adventure', 'Fantasy', 'Family']",,8844,tt0113497,['en'],Jumanji,When siblings Judy and Peter discover an encha...,17.015539,...,False,6.9,2413.0,,,,,"['boardgam', 'disappear', ""basedonchildren'sbo...","['robinwilliams', 'jonathanhyde', 'kirstenduns...","['joejohnston', 'joejohnston', 'joejohnston']"
2,False,0,"['Romance', 'Comedy']",,15602,tt0113228,['en'],Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,False,6.5,92.0,Grumpy Old Men Collection,119050.0,/nLvUdqgPgm3F85NMCii9gVFUcet.jpg,/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg,"['fish', 'bestfriend', 'duringcreditssting', '...","['waltermatthau', 'jacklemmon', 'ann-margret',...","['howarddeutch', 'howarddeutch', 'howarddeutch']"
3,False,16000000,"['Comedy', 'Drama', 'Romance']",,31357,tt0114885,['en'],Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,...,False,6.1,34.0,,,,,"['basedonnovel', 'interracialrelationship', 's...","['whitneyhouston', 'angelabassett', 'lorettade...","['forestwhitaker', 'forestwhitaker', 'forestwh..."
4,False,0,['Comedy'],,11862,tt0113041,['en'],Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,...,False,5.7,173.0,Father of the Bride Collection,96871.0,/nts4iOmNnq7GNicycMJ9pSAn204.jpg,/7qwE57OVZmMJChBpLEbJEmzUydk.jpg,"['babi', 'midlifecrisi', 'confid', 'age', 'dau...","['stevemartin', 'dianekeaton', 'martinshort', ...","['charlesshyer', 'charlesshyer', 'charlesshyer']"


In [9]:
class MetadataIndexer:
    """
    Build a CountVectorizer content index over movie metadata "soup"
    (e.g., keywords, cast, director, genres, language, etc.) and
    return top-K similar titles given 1+ seed titles.
    """

    def __init__(
        self,
        meta_cols=("keywords", "cast", "director", "genres", "original_language"),
        ngram_range=(1, 2),
        min_df=1,
        max_features=None,
        stop_words="english",
        weights=None,
    ):
        self.meta_cols = meta_cols
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.max_features = max_features
        self.stop_words = stop_words
        self.weights = weights or {}

        self.vectorizer = None
        self.matrix = None
        self.df = None
        self.titles = None
        self.indices = None

    def _ensure_tokens(self, value):
        """Turn a cell into a list[str] tokens."""
        if value is None or (isinstance(value, float) and pd.isna(value)):
            return []
        if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
            out = []
            for v in value:
                if v is None:
                    continue
                s = str(v).strip()
                if s:
                    out.append(s)
            return out
        s = str(value).strip()
        if not s:
            return []
        return s.split()

    def _build_soup(self, df: pd.DataFrame) -> pd.Series:
        cols = [c for c in self.meta_cols if c in df.columns]
        if not cols:
            raise ValueError(f"No valid metadata columns found among {self.meta_cols}")

        def row_to_soup(row):
            tokens = []
            for c in cols:
                toks = self._ensure_tokens(row[c])
                w = float(self.weights.get(c, 1.0))
                if w <= 0 or not toks:
                    continue
                # simple weighting by repetition (integer part)
                reps = max(1, int(round(w)))
                for _ in range(reps):
                    tokens.extend(toks)
            return " ".join(tokens) if tokens else ""

        return df.apply(row_to_soup, axis=1)


    def fit(self, df: pd.DataFrame):
        """
        Expects a DataFrame with at least: 'title' + the chosen meta_cols.
        """
        if "title" not in df.columns:
            raise ValueError("DataFrame must contain a 'title' column.")

        self.df = df.reset_index(drop=True).copy()
        self.titles = self.df["title"]
        # If duplicate titles exist, keep the first index for mapping
        self.indices = pd.Series(self.df.index, index=self.titles).drop_duplicates()

        soup = self._build_soup(self.df)

        self.vectorizer = CountVectorizer(
            analyzer="word",
            ngram_range=self.ngram_range,
            min_df=self.min_df,
            max_features=self.max_features,
            stop_words=self.stop_words,
        )
        self.matrix = self.vectorizer.fit_transform(soup)
        return self

    def _scores_for_index(self, idx: int) -> np.ndarray:
        sims = linear_kernel(self.matrix[idx], self.matrix).ravel()
        return sims

    def recommend(self, titles, k=10, include_scores=False):
        """
        Recommend top-K titles similar to one or more seed titles.
        """
        if self.indices is None or self.matrix is None:
            raise RuntimeError("Call fit(df) before recommend().")

        if isinstance(titles, str):
            titles = [titles]

        missing = [t for t in titles if t not in self.indices]
        if missing:
            raise KeyError(f"Titles not found: {missing}")

        idxs = [int(self.indices[t]) for t in titles]
        seed_vecs = self.matrix[idxs]

        profile = seed_vecs.sum(axis=0) / len(idxs)
        profile = csr_matrix(profile)
        profile = normalize(profile)

        sims = linear_kernel(profile, self.matrix).ravel()

        order = np.argsort(-sims)
        exclude_idx = set(idxs)
        order = [i for i in order if i not in exclude_idx]

        top_idx = order[:k]
        top_titles = self.titles.iloc[top_idx].reset_index(drop=True)

        if include_scores:
            top_scores = pd.Series(sims[top_idx]).reset_index(drop=True).round(6)
            return pd.DataFrame({"title": top_titles, "score": top_scores})

        return top_titles

    def recommend_by_index(self, idx: int, k: int = 10, include_scores: bool = False):
        """
        Same as recommend(), but starting from a row index instead of a title.
        """
        if self.matrix is None:
            raise RuntimeError("Call fit(df) before recommend_by_index().")
        if idx < 0 or idx >= self.matrix.shape[0]:
            raise IndexError(f"idx must be in [0, {self.matrix.shape[0]-1}]")

        sims = self._scores_for_index(idx)
        order = np.argsort(-sims)
        order = [i for i in order if i != idx]
        top_idx = order[:k]
        top_titles = self.titles.iloc[top_idx].reset_index(drop=True)

        if include_scores:
            top_scores = pd.Series(sims[top_idx]).reset_index(drop=True).round(6)
            return pd.DataFrame({"title": top_titles, "score": top_scores})

        return top_titles


In [10]:
meta = MetadataIndexer(
    meta_cols=("keywords","cast","director","genres","original_language"),
    ngram_range=(1, 2),
    min_df=1,
    stop_words="english",
    weights={"director": 2.0, "cast": 1.5}
)

meta.fit(metadata)

# Recommend from three favorites
meta.recommend(["The Godfather", "Pulp Fiction", "Fight Club"], k=10)

Unnamed: 0,title
0,The Hateful Eight
1,The Godfather: Part II
2,Jackie Brown
3,The Godfather: Part III
4,Se7en
5,The Rainmaker
6,Kill Bill: Vol. 2
7,Kill Bill: Vol. 1
8,Inglourious Basterds
9,Gardens of Stone


In [11]:
# With similarity scores:
meta.recommend(["The Godfather", "Pulp Fiction", "Fight Club"], k=10, include_scores=True)

Unnamed: 0,title,score
0,The Hateful Eight,4.066629
1,The Godfather: Part II,4.015796
2,Jackie Brown,3.964963
3,The Godfather: Part III,3.812464
4,Se7en,3.761631
5,The Rainmaker,3.659966
6,Kill Bill: Vol. 2,3.659966
7,Kill Bill: Vol. 1,3.659966
8,Inglourious Basterds,3.659966
9,Gardens of Stone,3.609133


In [12]:
# Or by row index:
meta.recommend_by_index(123, k=10, include_scores=True)

Unnamed: 0,title,score
0,Sidewalks of New York,82.0
1,She's the One,79.0
2,Life or Something Like It,19.0
3,Confidence,14.0
4,Echelon Conspiracy,13.0
5,15 Minutes,13.0
6,Happy Endings,10.0
7,Seeking a Friend for the End of the World,10.0
8,The Last Days of Disco,9.0
9,Chasing Amy,9.0
