In [None]:
import ast
import json

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
# NOTE: the pairwise similarity matrix is NOT computed in this cell. It needs
# `tfidf_matrix`, which only exists after the TF-IDF cell further down has run;
# computing it here raised NameError on a fresh kernel.

# Load your dataset
# DATA_PATH is relative to the kernel's working directory, so the notebook must
# be started from a directory that contains `app/data/movies/`.
DATA_PATH = "app/data/movies/movies_metadata.csv"

# low_memory=False makes pandas read the whole file before inferring dtypes.
movies = pd.read_csv(DATA_PATH, low_memory=False, encoding='utf-8')
movies.head(3)


FileNotFoundError: [Errno 2] No such file or directory: 'app/data/movies/movies_metadata.csv'

In [None]:
# Keep just the columns the recommender and the exported CSV rely on.
columns = ["id", "title", "overview", "genres", "poster_path", "vote_average", "release_date"]
movies = movies[columns].copy()

# Replace missing values column by column: text columns become empty strings,
# `genres` becomes an empty list literal, and the rating falls back to 0.
fill_values = {
    "title": "",
    "overview": "",
    "genres": "[]",
    "poster_path": "",
    "vote_average": 0,
    "release_date": "",
}
for column_name, fill_value in fill_values.items():
    movies[column_name] = movies[column_name].fillna(fill_value)


In [None]:
def parse_genres(genres_str):
    """Turn a stringified list of genre dicts into a comma-separated name string.

    The input is expected to look like ``"[{'id': 16, 'name': 'Animation'}]"``
    (a Python literal) or the equivalent JSON text.

    Args:
        genres_str: Raw cell value from the ``genres`` column.

    Returns:
        Genre names joined with ``", "``. An empty string is returned when the
        value is not a string, is blank, cannot be parsed, or does not parse to
        a list.
    """
    if not isinstance(genres_str, str) or not genres_str.strip():
        return ""

    text = genres_str.strip()
    try:
        # Parse the Python-literal form directly. Swapping quote characters and
        # feeding the result to json.loads breaks on any value that contains an
        # apostrophe (e.g. "Children's").
        genres_list = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        try:
            # Fall back to real JSON, which covers true/false/null tokens that
            # are not valid Python literals.
            genres_list = json.loads(text)
        except (ValueError, RecursionError):
            return ""

    if not isinstance(genres_list, list):
        return ""

    # Keep only usable names so a missing or non-string "name" cannot raise or
    # leave dangling separators in the output.
    names = [g.get("name") for g in genres_list if isinstance(g, dict)]
    return ", ".join(name for name in names if isinstance(name, str) and name)

# Derive a readable genre string for every row, then preview the result.
movies["genres_clean"] = movies["genres"].map(parse_genres)
movies.loc[:, ["title", "genres_clean"]].head(5)


In [None]:
def build_poster_url(poster_path, size="w500"):
    """Build an absolute poster image URL from a relative poster path.

    Args:
        poster_path: Relative poster path such as ``"/abc.jpg"``. A missing
            leading slash is added.
        size: Size segment inserted into the image URL. Defaults to ``"w500"``,
            the value that was previously hard-coded.

    Returns:
        The full image URL, or a placeholder image URL when ``poster_path`` is
        not a string or contains only whitespace.
    """
    if isinstance(poster_path, str):
        # Strip before building the URL: the emptiness check already ignores
        # surrounding whitespace, so the URL must not keep it either.
        cleaned = poster_path.strip()
        if cleaned:
            if not cleaned.startswith("/"):
                cleaned = "/" + cleaned
            return f"https://image.tmdb.org/t/p/{size}{cleaned}"
    return "https://picsum.photos/400/600?random"

# Expand every relative poster path into a full URL, then preview the result.
movies["poster_full"] = movies["poster_path"].map(build_poster_url)
movies.loc[:, ["title", "poster_full"]].head(5)


In [None]:
# Combine overview and genres into a single text field
movies["tags"] = (movies["overview"].astype(str) + " " + movies["genres_clean"].astype(str)).str.strip()

# Drop rows without any text BEFORE de-duplicating on title. In the opposite
# order, a title whose first occurrence has empty tags would be kept by
# drop_duplicates and then removed by the filter, discarding a later duplicate
# that does have usable text.
movies = movies[movies["tags"].str.len() > 0]
movies = movies.drop_duplicates(subset=["title"])

# Rebuild a 0..n-1 index so row labels match row positions again.
movies = movies.reset_index(drop=True)

movies[["title", "tags"]].head(5)


In [None]:
# Limit dataset for memory efficiency: keep only the first 8000 rows.
movies = movies.iloc[:8000]


In [None]:
# Vectorise the combined text; the vocabulary is capped at 5000 terms and
# English stop words are removed.
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_matrix = tfidf.fit_transform(movies["tags"])

# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity. linear_kernel is
# imported in the first cell, whereas cosine_similarity was only imported in a
# later cell and raised NameError here on a fresh kernel.
similarity = linear_kernel(tfidf_matrix, tfidf_matrix)
print("✅ TF-IDF model built successfully!", similarity.shape)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend(title, top_n=10):
    """Return the movies whose tags are most similar to the given title.

    Reads the module-level ``movies`` frame and ``tfidf_matrix``; row ``i`` of
    the matrix is the vector for ``movies.iloc[i]`` because the matrix is
    fitted on ``movies["tags"]``.

    Args:
        title: Movie title to look up. Matching ignores case and surrounding
            whitespace; the first matching row is used.
        top_n: Maximum number of recommendations to return. Defaults to 10,
            the value that was previously hard-coded.

    Returns:
        A list of dicts with the keys ``title``, ``overview``, ``genres``,
        ``poster_path``, ``rating`` and ``release_date``, ordered from most to
        least similar. The list is empty when ``title`` is not a string,
        ``top_n`` is not positive, or no movie matches.
    """
    if not isinstance(title, str) or top_n <= 0:
        return []

    query = title.lower().strip()
    mask = (movies["title"].astype(str).str.lower().str.strip() == query).to_numpy()
    if not mask.any():
        return []

    # Use the row POSITION of the first match, not its index label: the TF-IDF
    # matrix is addressed positionally, and labels only coincide with positions
    # while the frame keeps a fresh 0..n-1 index.
    idx = int(mask.argmax())

    # Compute similarity only with this movie
    movie_vector = tfidf_matrix[idx]
    sim_scores = cosine_similarity(movie_vector, tfidf_matrix).flatten()

    # Drop the query movie explicitly instead of assuming it is the single
    # highest score: with tied scores (identical tags, or an all-zero vector)
    # it is not guaranteed to sort last, so slicing it off by position could
    # discard a real match and return the query movie itself.
    ranked = sim_scores.argsort()[::-1]
    sim_indices = ranked[ranked != idx][:top_n]

    recs = []
    for i in sim_indices:
        row = movies.iloc[i]
        recs.append({
            "title": row["title"],
            "overview": row["overview"],
            "genres": row["genres_clean"],
            "poster_path": row["poster_full"],
            "rating": float(row.get("vote_average", 0)),
            "release_date": row.get("release_date", "")
        })
    return recs


In [None]:
# Write the cleaned frame to disk under the column names used in the export.
cleaned_path = "app/data/movies/movies_clean.csv"

export_columns = ["id", "title", "overview", "genres_clean", "poster_full", "vote_average", "release_date"]
export_names = {
    "genres_clean": "genres",
    "poster_full": "poster_path",
    "vote_average": "rating",
}

# Selecting with a list yields a new frame, and rename returns another one, so
# `movies` itself is left untouched.
movies_clean = movies[export_columns].rename(columns=export_names)
movies_clean.to_csv(cleaned_path, index=False)
print(f"✅ Cleaned dataset saved to: {cleaned_path}")
