In [1]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import os
from pathlib import Path

# Root of the project checkout. Set the RECSYS_PROJECT_ROOT environment variable to
# run the notebook from a different location; when it is unset the original local
# path is used, so existing behaviour is unchanged.
PROJECT_ROOT = Path(
    os.environ.get("RECSYS_PROJECT_ROOT", "/home/sjscarface/piprojects/recommendersystem")
)

# All data files live in the project's data/ directory.
DATA_DIR = PROJECT_ROOT / "data"
CANDIDATES_FILE = DATA_DIR / "candidates.json"  # read by load_data()
WATCH_HISTORY_FILE = DATA_DIR / "watch_history.json"  # not referenced in the visible cells
RECOMMENDATIONS_FILE = DATA_DIR / "recommendations.json"  # not referenced in the visible cells

In [9]:
def load_data(candidates_file=None):
    """Load candidates from Phase 1 into a DataFrame.

    Args:
        candidates_file: Path of the JSON file to read. When omitted (``None``),
            the module-level ``CANDIDATES_FILE`` is used, which matches the
            original zero-argument behaviour.

    Returns:
        A ``pandas.DataFrame`` built from the list stored under the file's
        top-level ``"candidates"`` key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the JSON document has no ``"candidates"`` key.
    """
    source = CANDIDATES_FILE if candidates_file is None else candidates_file
    print("ðŸ“‚ Loading candidates...")

    # Read explicitly as UTF-8 so non-ASCII text (titles, cast names) is decoded
    # the same way on every platform instead of using the locale default.
    with open(source, "r", encoding="utf-8") as f:
        payload = json.load(f)

    candidates = payload["candidates"]
    print(f"   Loaded {len(candidates)} candidates")

    # Convert to DataFrame for easier manipulation
    return pd.DataFrame(candidates)

In [11]:
# Read the candidates file once; load_data() returns them as a pandas DataFrame.
data = load_data()

ðŸ“‚ Loading candidates...
   Loaded 710 candidates


In [12]:
def _build_feature_text(self, item: dict) -> str:
        """
        Combine all text features into a single string for TF-IDF.
        
        LEARNING NOTE: Feature Engineering
        -----------------------------------
        We concatenate multiple features because:
        1. TF-IDF works on text, so we need a text representation
        2. Repeating important features increases their weight
        3. Different features capture different aspects of similarity
        
        We weight features by repetition:
        - Genres: repeated 3x (very important for similarity)
        - Keywords: repeated 2x (distinctive features)
        - Cast: repeated 1x (actor preferences)
        - Overview: repeated 1x (thematic similarity)
        """
        parts = []
        
        # Genres (weight: 3x) - most important for broad similarity
        genres = item.get("genres", [])
        parts.extend(genres * 3)
        
        # Keywords (weight: 2x) - very distinctive features
        keywords = item.get("keywords", [])
        parts.extend(keywords * 2)
        
        # Cast names (weight: 1x)
        cast = item.get("cast", [])
        cast_names = [c["name"].replace(" ", "_") for c in cast if isinstance(c, dict)]
        parts.extend(cast_names)
        
        # Directors/Creators (weight: 1x)
        directors = item.get("directors", [])
        creators = item.get("creators", [])
        parts.extend([d.replace(" ", "_") for d in directors])
        parts.extend([c.replace(" ", "_") for c in creators])
        
        # Overview words (weight: 1x) - thematic content
        overview = item.get("overview", "")
        if overview:
            # Just add the overview as-is, TF-IDF will tokenize
            parts.append(overview)
        
        # Combine all parts
        return " ".join(parts).lower()

In [13]:
# Display the candidates DataFrame (an empty subscript `data[]` is a syntax error;
# the bare expression renders the frame, which pandas truncates for long tables).
data

Unnamed: 0,tmdb_id,title,type,year,genres,keywords,overview,tagline,vote_average,vote_count,...,cast,creators,original_language,poster_path,status,number_of_seasons,recommended_because,recommendation_strength,runtime,directors
0,1396,Breaking Bad,tv,2008,"[Drama, Crime]","[new mexico, drug dealer, narcissism, psychopa...","Walter White, a New Mexico chemistry teacher, ...",Change the equation.,8.935,16843,...,"[{'name': 'Bryan Cranston', 'popularity': 6.13...",[Vince Gilligan],en,/ztkUQFLlC19CCMYHW9o1zWhJRNq.jpg,Ended,5.0,"[Rick and Morty, Family Guy, Severance, Asia, ...",6,,
1,60625,Rick and Morty,tv,2013,"[Animation, Comedy, Sci-Fi & Fantasy, Action &...","[time travel, grandfather, alcoholism, alien, ...",Follows a sociopathic genius scientist who dra...,"Science makes sense, family doesn't.",8.683,10587,...,"[{'name': 'Chris Parnell', 'popularity': 2.753...","[Dan Harmon, Justin Roiland]",en,/WGRQ8FpjkDTzivQJ43t94bOuY0.jpg,Returning Series,8.0,"[Family Guy, Arcane, Stranger Things, Dexter's...",5,,
2,615,Futurama,tv,1999,"[Animation, Comedy, Sci-Fi & Fantasy]","[spacecraft, future, space travel, melancholy,...",The adventures of a late-20th-century New York...,Defying gravity and common sense.,8.368,3582,...,"[{'name': 'Billy West', 'popularity': 1.7181, ...",[Matt Groening],en,/6ZS8SOno6kTmWz4eQ8lX8EBXOMv.jpg,Returning Series,10.0,"[Rick and Morty, Dexter's Laboratory, Family G...",4,,
3,1433,American Dad!,tv,2005,"[Animation, Comedy]","[central intelligence agency (cia), social sat...",The series focuses on an eccentric motley crew...,"From the creator of ""Family Guy"" comes a man w...",7.000,2282,...,"[{'name': 'Seth MacFarlane', 'popularity': 3.4...","[Matt Weitzman, Seth MacFarlane, Mike Barker]",en,/aC1q422YhQR7k82GB8gW4KoD91p.jpg,Returning Series,22.0,"[Rick and Morty, Family Guy, South Park]",4,,
4,62741,Kamisama Kiss,tv,2012,"[Animation, Comedy, Sci-Fi & Fantasy]","[magic, friendship, supernatural, high school,...",Nanami was just a normal high school girl down...,,8.600,992,...,"[{'name': 'Yui Horie', 'popularity': 2.2087, '...",[],ja,/5E7GL8KxpFemEFl3Lv8Fu4RuSwa.jpg,Ended,2.0,"[Arcane, SPY x FAMILY, Stranger Things]",3,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,129,Spirited Away,movie,2001,"[Animation, Family, Fantasy]","[witch, parent child relationship, darkness, b...","A young girl, Chihiro, becomes trapped in a st...",Beyond the tunnel was a mysterious town.,8.500,17795,...,"[{'name': 'Rumi Hiiragi', 'popularity': 2.1533...",,ja,/39wmItIWsg5sZMyRUHLkWBcuVCM.jpg,,,[Parasite],1,125.0,[Hayao Miyazaki]
706,503919,The Lighthouse,movie,2019,"[Drama, Fantasy, Thriller]","[island, nightmare, isolation, mermaid, halluc...",Two lighthouse keepers try to maintain their s...,"Keeping secrets, are ye?",7.484,5360,...,"[{'name': 'Robert Pattinson', 'popularity': 6....",,en,/f1tIYarTbkBdIT1aW0gzelDwknv.jpg,,,[Parasite],1,109.0,[Robert Eggers]
707,359724,Ford v Ferrari,movie,2019,"[Drama, Action, History]","[based on novel or book, car race, sports, car...",American car designer Carroll Shelby and the B...,They took the American dream for a ride,8.000,8673,...,"[{'name': 'Matt Damon', 'popularity': 10.9619,...",,en,/dR1Ju50iudrOh3YgfwkAU1g2HZe.jpg,,,[Parasite],1,153.0,[James Mangold]
708,387426,Okja,movie,2017,"[Adventure, Drama, Science Fiction]","[new york city, monster, slaughterhouse, east ...",A young girl named Mija risks everything to pr...,We needed a miracle. And then we got one.,7.346,4453,...,"[{'name': 'Ahn Seo-hyun', 'popularity': 1.7392...",,en,/lHBYG2NcBMW7UpFL4rSCpsgvz4m.jpg,,,[Parasite],1,120.0,[Bong Joon Ho]


In [None]:
# Build the feature text for the first candidate row.
# - The function signature is (self, item); `self` is unused, so None fills that slot.
# - dropna() removes fields that are missing for this row so the function's
#   `.get(..., default)` fallbacks apply instead of receiving NaN values.
_build_feature_text(None, data.iloc[0].dropna().to_dict())