# In [4]:

import argparse
import ast
import json
import re
from pathlib import Path

import numpy as np
import pandas as pd


# Very common English words that are dropped during tokenisation.
STOP_WORDS = {
    # articles
    "the", "a", "an",
    # prepositions
    "of", "on", "in", "to", "for", "with", "without", "at", "by", "into",
    "over", "from", "as",
    # conjunctions
    "and", "or",
    # demonstratives
    "this", "that", "these", "those",
    # pronouns
    "it", "its",
    # forms of "to be"
    "is", "be", "was", "were", "are",
}

def clean_token(t: str) -> str:
    """Normalise one raw token, returning "" when nothing usable is left.

    The text is lower-cased and stripped of surrounding whitespace, then
    every character that is neither a word character nor a hyphen is
    removed (so inner spaces vanish and multi-word names fuse into one
    token). Empty results and entries of ``STOP_WORDS`` become "".
    """
    normalised = re.sub(r"[^\w\-]", "", t.lower().strip())
    if not normalised or normalised in STOP_WORDS:
        return ""
    return normalised


def parse_json_col(col: pd.Series, key: str = "name", top_n: int | None = None):
    """
    TMDB stores genres / keywords / cast / crew as JSON‑encoded strings.
    Decode them into lists of token strings.
    """
    out = []
    for item in col.fillna("[]"):
        try:
            parsed = ast.literal_eval(item)
        except (ValueError, SyntaxError):
            parsed = []
        if top_n is not None:
            parsed = parsed[:top_n]
        tokens = [clean_token(d.get(key, "")) for d in parsed]
        out.append([t for t in tokens if t])
    return out


class TMDBRecommenderNP:
    def __init__(
        self,
        movies_csv: str | Path = "tmdb_5000_movies.csv",
        credits_csv: str | Path = "tmdb_5000_credits.csv",
    ) -> None:
  
        movies = pd.read_csv(r"C:\Users\popsi\Downloads\tmdb_5000_movies.csv.zip")
        credits = pd.read_csv(r"C:\Users\popsi\Downloads\tmdb_5000_credits.csv.zip")
        df = movies.merge(credits, on="title")

        
        df["genres_list"] = parse_json_col(df["genres"])
        df["keywords_list"] = parse_json_col(df["keywords"])
        df["cast_list"] = parse_json_col(df["cast"], key="name", top_n=3)

       
        directors = []
        for row in df["crew"].fillna("[]"):
            try:
                crew = ast.literal_eval(row)
            except (ValueError, SyntaxError):
                crew = []
            name = ""
            for person in crew:
                if person.get("job") == "Director":
                    name = clean_token(person.get("name", ""))
                    break
            directors.append([name] if name else [])
        df["director_list"] = directors

    
        def split_text(s: str) -> list[str]:
            tokens = [clean_token(t) for t in re.split(r"\W+", str(s).lower())]
            return [t for t in tokens if t]

        df["overview_list"] = df["overview"].apply(split_text)
        df["tagline_list"] = df["tagline"].apply(split_text)

        
        df["soup"] = (
            df["genres_list"]
            + df["keywords_list"]
            + df["cast_list"]
            + df["director_list"]
            + df["overview_list"]
            + df["tagline_list"]
        )

        
        self.movies = df[["title", "soup"]].copy()

        
        vocab: dict[str, int] = {}
        for tokens in self.movies["soup"]:
            for t in tokens:
                if t not in vocab:
                    vocab[t] = len(vocab)
        self.vocab_size = len(vocab)
        self.vocab = vocab

        n_movies = len(self.movies)
        tf_counts = np.zeros((n_movies, self.vocab_size), dtype=np.uint16)

        for i, tokens in enumerate(self.movies["soup"]):
            for t in tokens:
                tf_counts[i, vocab[t]] += 1

        df_counts = (tf_counts > 0).sum(axis=0)  
        idf = np.log((n_movies) / (1 + df_counts)) + 1.0  
        tf_idf = tf_counts * idf  

        
        norms = np.linalg.norm(tf_idf, axis=1, keepdims=True)
        norms[norms == 0] = 1e-10  
        self.matrix = tf_idf / norms  

       
        self.title2idx = pd.Series(self.movies.index, index=self.movies["title"]).drop_duplicates()

   
    def recommend(self, title: str, k: int = 5) -> pd.DataFrame:
        if title not in self.title2idx:
            raise ValueError(f"Title '{title}' not found in dataset.")
        idx = self.title2idx[title]
        v = self.matrix[idx]

        sims = self.matrix @ v  
        sims[idx] = -1          
        top_idx = np.argpartition(-sims, range(k))[:k]
        top_idx = top_idx[np.argsort(-sims[top_idx])]

        return self.movies.loc[top_idx, ["title"]].reset_index(drop=True)

   
    @staticmethod
    def _cli():
        p = argparse.ArgumentParser(description="NumPy + Pandas TMDB recommender")
        p.add_argument("title", help="Movie title, e.g. 'Avatar'")
        p.add_argument("-k", "--top", type=int, default=5, help="Number of recs")
        p.add_argument("--movies_csv", default="tmdb_5000_movies.csv")
        p.add_argument("--credits_csv", default="tmdb_5000_credits.csv")
        args = p.parse_args()

        rec = TMDBRecommenderNP(args.movies_csv, args.credits_csv)
        print(f"\nTop {args.top} movies similar to '{args.title}':")
        print(rec.recommend(args.title, k=args.top).to_string(index=False))

if __name__ == "__main__":
    # Demo run: build the recommender and show the five closest matches
    # for one title.
    rec = TMDBRecommenderNP(
        movies_csv="tmdb_5000_movies.csv",
        credits_csv="tmdb_5000_credits.csv",
    )
    recommendations = rec.recommend(title="Avatar", k=5)
    print("Top 5 similar movies to 'Avatar':")
    print(recommendations.to_string(index=False))



# Output recorded with the cell above (notebook export):
#
# Top 5 similar movies to 'Avatar':
#                   title
#                  Aliens
#           Falcon Rising
#               Apollo 18
# Star Trek Into Darkness
#              Titan A.E.
