# In [3]:  (notebook cell prompt kept as a comment so the file parses as Python)
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# ============================
# 1. Load datasets
# ============================
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

# ============================
# 2. Ensure merge column exists
# ============================
# The merge below joins on 'movie_id'; if the movies table only has an
# 'id' column, rename it so the join key exists on both sides.
if 'movie_id' not in movies.columns and 'id' in movies.columns:
    movies = movies.rename(columns={'id': 'movie_id'})

# Inner join: keep only movies that have a matching credits row.
movies = movies.merge(credits, on='movie_id', how='inner')

# ============================
# 3. Handle overlapping column names
# ============================
# When both tables carry a 'title' column, the merge suffixes them with
# _x (left/movies) and _y (right/credits). Keep the movies title under
# its plain name and discard the credits duplicate.
if 'title_x' in movies.columns:
    movies = movies.rename(columns={'title_x': 'title'})
if 'title_y' in movies.columns:
    movies = movies.drop(columns=['title_y'])

# ============================
# 4. Keep only useful columns
# ============================
expected_cols = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
# Select whichever expected columns are present. The explicit .copy()
# makes the result an independent frame, so the column assignments later
# in the script do not trigger pandas' SettingWithCopyWarning.
movies = movies[[col for col in expected_cols if col in movies.columns]].copy()

# ============================
# 5. Helper function to convert stringified lists
# ============================
def convert(obj):
    """Return the ``name`` value of every dict in a (stringified) list.

    Parameters
    ----------
    obj : str, list, None or float NaN
        Usually a string holding a Python-literal list of dicts, each with
        a ``'name'`` key. An already-parsed list is accepted as-is, which
        makes the function safe to apply twice to the same column.
        Missing values (``None``, NaN) and blank strings yield ``[]``.

    Returns
    -------
    list
        The ``'name'`` entries in their original order.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank string is not a valid Python literal.
    KeyError
        If an item lacks a ``'name'`` key.
    """
    if obj is None:
        return []
    # NaN is the only float that is not equal to itself; this is how pandas
    # represents a missing cell in an object column.
    if isinstance(obj, float) and obj != obj:
        return []
    if isinstance(obj, str):
        if not obj.strip():
            return []
        obj = ast.literal_eval(obj)
    return [item['name'] for item in obj]

# ============================
# 6. Apply conversions
# ============================
# Genres and keywords share the same shape, so parse both with convert().
for list_col in ('genres', 'keywords'):
    movies[list_col] = movies[list_col].apply(convert)

# Cast: names of the first three listed entries only
movies['cast'] = movies['cast'].apply(
    lambda raw: [member['name'] for member in ast.literal_eval(raw)[:3]]
)

# Crew: names of the entries whose job is 'Director'
movies['crew'] = movies['crew'].apply(
    lambda raw: [
        person['name']
        for person in ast.literal_eval(raw)
        if person['job'] == 'Director'
    ]
)

# ============================
# 7. + 8. Build the "tags" column as clean text
# ============================
# Concatenate the four per-movie lists, then flatten each combined list
# into a single lower-cased, space-separated string.
movies['tags'] = (
    (movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew'])
    .apply(lambda words: " ".join(words).lower())
    .fillna("")
)

# ============================
# 9. Clean movie titles
# ============================
# Force titles to str and trim surrounding whitespace so lookups by
# title can compare against a normalized value.
movies['title'] = movies['title'].astype(str).str.strip()

# ============================
# 10. TF-IDF vectorization
# ============================
# One row per movie; English stop words are dropped from the vocabulary.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['tags'])

# ============================
# 11. Cosine similarity
# ============================
# Pairwise similarity of every movie's tag vector against every other.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# ============================
# 12. Recommendation function
# ============================
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up; surrounding whitespace is ignored. When
        several rows share the title, the first one is used.
    cosine_sim : array-like, optional
        Square similarity matrix whose row/column order matches the row
        order of ``movies``. Defaults to the module-level matrix.

    Returns
    -------
    pandas.Series or list
        Up to ten titles ordered by descending similarity, never including
        the queried movie itself. If the title is not present, the
        one-element list ``["Movie not found in dataset"]`` is returned.
    """
    title = title.strip()

    # Work with row *positions*, not index labels: the similarity matrix
    # is positional, so this stays correct even if ``movies`` does not
    # carry a default 0..n-1 index.
    matches = np.flatnonzero((movies['title'] == title).to_numpy())
    if matches.size == 0:
        return ["Movie not found in dataset"]
    pos = int(matches[0])

    scores = np.asarray(cosine_sim[pos])
    # Stable sort on the negated scores gives descending order with ties
    # kept in their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly instead of assuming it ranks first:
    # a movie with empty tags has zero self-similarity, and an earlier row
    # with identical tags would tie with it.
    top = ranked[ranked != pos][:10]
    return movies['title'].iloc[top]

# ============================
# 13. Test
# ============================
# Smoke-test the recommender with a known title.
print(get_recommendations("The Dark Knight Rises"))

# ============================
# 14. Save data for Streamlit app
# ============================
# Persist the movie table and the similarity matrix together as one tuple.
with open('movie_data.pkl', 'wb') as out_file:
    pickle.dump((movies, cosine_sim), out_file)


# Recorded output of the test call in section 13:
# 65               The Dark Knight
# 119                Batman Begins
# 1359                      Batman
# 210               Batman & Robin
# 428               Batman Returns
# 1196                The Prestige
# 303                     Catwoman
# 4638    Amidst the Devil's Wings
# 72                 Suicide Squad
# 299               Batman Forever
# Name: title, dtype: object
