# Movie Recommender System - Data Preprocessing & Training

This notebook handles data loading, cleaning, feature extraction, and model training for the Movie Recommender System.

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pickle
import ast
import os

## 1. Load Data

In [None]:
# Read the two raw CSV exports from the shared data directory; the paths are
# relative, so the notebook must be run from its own folder.
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/tmdb_5000_movies.csv',
        '../data/tmdb_5000_credits.csv',
    )
)

# Quick sanity check on what was loaded: (rows, columns) of each table.
print("Movies shape:", movies.shape)
print("Credits shape:", credits.shape)

## 2. Merge Data

In [None]:
# Join the credits onto the movies by TMDB id instead of by title. Titles are
# not unique, so a title join pairs every movie with the credits of every
# other movie sharing its name and silently adds wrong, duplicated rows.
# The credits copy of 'title' is dropped first so the merged frame keeps a
# single 'title' column, exactly as the title-keyed merge produced.
# NOTE(review): assumes the movies CSV exposes the TMDB id as 'id' and the
# credits CSV as 'movie_id' -- confirm against the data files.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)
print("Merged shape:", movies.shape)

## 3. Data Cleaning & Feature Extraction

In [None]:
# Keep relevant columns
movies = movies[[
    'movie_id', 'title', 'overview', 'genres', 'keywords',
    'cast', 'crew', 'vote_average', 'vote_count', 'popularity',
]]

# Drop missing values.
# dropna() is used in its non-inplace form because mutating a frame that was
# just sliced out of another one can trigger pandas' SettingWithCopyWarning.
# The index is then renumbered 0..n-1: the TF-IDF matrix built later is
# addressed by row position, so labels must not keep gaps left by dropped rows.
movies = movies.dropna().reset_index(drop=True)

# Helper: pull the 'name' field out of a stringified list of dicts
def convert(obj):
    """Return the ``'name'`` value of every entry encoded in *obj*.

    *obj* is a string holding a Python-literal list of dicts; it is parsed
    with ``ast.literal_eval`` (so no code is executed) and the names are
    returned in their original order. An empty list yields ``[]``.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

# Replace the raw genre/keyword strings with plain lists of names.
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)

def convert3(obj):
    """Return the ``'name'`` of at most the first three entries in *obj*.

    *obj* is a string holding a Python-literal list of dicts, parsed with
    ``ast.literal_eval``. Entries beyond the third are never inspected.
    """
    # Pairing with range(3) stops the iteration after three entries.
    leading_entries = zip(range(3), ast.literal_eval(obj))
    return [entry['name'] for _, entry in leading_entries]

# Reduce each raw cast string to a list of its first three names (see convert3).
movies['cast'] = movies['cast'].apply(convert3)

def fetch_director(obj):
    """Return a one-element list with the first director's name, or ``[]``.

    *obj* is a string holding a Python-literal list of crew dicts, parsed
    with ``ast.literal_eval``. The first entry whose ``'job'`` equals
    ``'Director'`` wins; later directors are ignored.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            return [member['name']]
    return []

# Reduce each raw crew string to a list holding only the first director's name
# (an empty list when no crew entry has job == 'Director').
movies['crew'] = movies['crew'].apply(fetch_director)

# Strip spaces inside every name so a multi-word name becomes a single token
# (e.g. "Science Fiction" -> "ScienceFiction"); the values stay lists here.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

# Create 'tags' column: overview words followed by genre, keyword, cast and
# director tokens. Adding Series of lists concatenates the lists row by row.
movies['tags'] = (
    movies['overview'].apply(lambda x: x.split() if isinstance(x, str) else [])
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

# .copy() makes new_df an independent frame; assigning into a bare column
# slice of `movies` raises pandas' SettingWithCopyWarning and may not stick.
new_df = movies[['movie_id', 'title', 'tags', 'vote_average', 'vote_count', 'popularity']].copy()

# Flatten each token list into one lowercase, space-separated string.
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x).lower())

print(new_df.head())

## 4. Vectorization (TF-IDF)

In [None]:
# Build the TF-IDF representation of the tag strings: English stop words are
# removed and the vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# One sparse row per movie, in the same order as new_df.
tfidf_matrix = tfidf.fit_transform(new_df['tags'])
print("TF-IDF Matrix shape:", tfidf_matrix.shape)

## 5. Compute Similarity

In [None]:
# linear_kernel is a plain dot product, which equals cosine similarity when
# the rows are already length-normalised (TfidfVectorizer's default output).
# Note: the result is a full dense movies-by-movies array, which can be
# memory intensive. It is computed here only to demonstrate that the
# pipeline works; it is not persisted. For production, similarities can be
# computed on the fly from the sparse TF-IDF matrix instead.

# With the second argument omitted, linear_kernel compares the matrix
# against itself.
cosine_sim = linear_kernel(tfidf_matrix)
print("Cosine Similarity shape:", cosine_sim.shape)

## 6. Save Models

In [None]:
import scipy.sparse

# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('../models', exist_ok=True)

# Save the dataframe (metadata).
# The files are opened in `with` blocks so each handle is flushed and closed
# as soon as the dump finishes, rather than whenever it is garbage-collected.
with open('../models/movies_metadata.pkl', 'wb') as metadata_file:
    pickle.dump(new_df, metadata_file)

# Save the vectorizer
with open('../models/tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

# Save the TF-IDF matrix (sparse)
scipy.sparse.save_npz('../models/tfidf_matrix.npz', tfidf_matrix)

print("Models saved successfully!")