In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
from collections import defaultdict

In [None]:
# Filtering thresholds (inclusive). `date` is compared as a numeric release year.
MIN_YEAR = 1990.0
MAX_YEAR = 2023.0
MIN_RATING = 3.0

movies = pd.read_csv('movies.csv')  # Top 10,000 movies
genres = pd.read_csv('genres.csv')
themes = pd.read_csv('themes.csv')
crew = pd.read_csv('crew.csv')
languages = pd.read_csv('languages.csv')

# Keep only movies that list English as a primary or spoken language.
# `isin` is used instead of an inner merge so that a movie matching under both
# language types is not duplicated.
english_movies = languages[(languages['language'] == 'English') &
                           (languages['type'].isin(['Primary language', 'Language']))]
movies = movies[movies['id'].isin(english_movies['id'])]

# Keep only movies released between MIN_YEAR and MAX_YEAR with a rating of at
# least MIN_RATING. Boolean indexing actually removes the rows; `DataFrame.where`
# would keep them and blank every column to NaN instead.
in_year_range = movies['date'].between(MIN_YEAR, MAX_YEAR)
well_rated = movies['rating'] >= MIN_RATING
movies = movies[in_year_range & well_rated]

# Collapse the one-row-per-value side tables into one space-joined string per movie
# and attach them to the movies dataframe.
movies_genres = genres.groupby('id')['genre'].agg(' '.join).reset_index()
movies = pd.merge(movies, movies_genres, on='id', how='left')

movies_themes = themes.groupby('id')['theme'].agg(' '.join).reset_index()
movies = pd.merge(movies, movies_themes, on='id', how='left')

# Director names are stringified first, then joined; the column is renamed up front
# so it cannot collide with the movie title column `name`.
movies_directors = (
    crew[crew['role'] == 'Director']
    .groupby('id')['name']
    .apply(lambda x: ' '.join(map(str, x)))
    .reset_index()
    .rename(columns={'name': 'name_director'})
)
movies = pd.merge(movies, movies_directors, on='id', how='left')

movies = (
    movies
    # Keep only movies that have a runtime, theme, genre, description and director.
    .dropna(subset=['minute', 'theme', 'genre', 'description', 'name_director'], how='any')
    # A missing tagline is acceptable: treat it as empty text.
    .assign(tagline=lambda df: df['tagline'].fillna(''))
    # Combine textual features into a single column.
    .assign(text=lambda df: df['description'] + " " + df['tagline'] + " " + df['name'])
    .dropna(subset=['text'])
    # Re-number rows 0..n-1 so index labels equal row positions; the similarity
    # matrices built from this frame are addressed by position.
    .reset_index(drop=True)
    .head(50000)
)

movies.head()

In [None]:
# Number of movies left after filtering.
len(movies)

In [None]:
# Vectorize the textual data using TF-IDF.
# Each column gets its own vectorizer: re-fitting a shared instance would throw away
# the vocabulary that belongs to the matrix produced by the previous fit.
text_vectorizer = TfidfVectorizer(stop_words='english')
text_matrix = text_vectorizer.fit_transform(movies['text'])

director_vectorizer = TfidfVectorizer(stop_words='english')
director_matrix = director_vectorizer.fit_transform(movies['name_director'].fillna(''))

# Bag-of-words counts for genres and themes: each whitespace-separated token of the
# joined string becomes one feature. `token_pattern=None` tells scikit-learn the
# default regex is intentionally unused (avoids a UserWarning when a custom
# tokenizer is supplied); `str.split` replaces the lambda with the same behavior.
genre_vectorizer = CountVectorizer(tokenizer=str.split, token_pattern=None)
genre_matrix = genre_vectorizer.fit_transform(movies['genre'].fillna('').values)

theme_vectorizer = CountVectorizer(tokenizer=str.split, token_pattern=None)
theme_matrix = theme_vectorizer.fit_transform(movies['theme'].fillna('').values)

# Legacy names: previously a single instance of each vectorizer was re-fitted, so
# these names ended up fitted on the director and theme columns respectively.
tfidf_vectorizer = director_vectorizer
count_vectorizer = theme_vectorizer

In [None]:
# Pairwise cosine similarity of every movie against every other movie, one square
# matrix per feature family. Passing a single matrix compares it with itself.

# Text-derived features (combined description/tagline/title, then director names)
cosine_sim_text = cosine_similarity(text_matrix)
print("Completed text similarity")
cosine_sim_directors = cosine_similarity(director_matrix)
print("Completed director similarity")

# Token-count features (genres, then themes)
cosine_sim_genres = cosine_similarity(genre_matrix)
print("Completed genre similarity")
cosine_sim_themes = cosine_similarity(theme_matrix)
print("Completed theme similarity")

In [None]:
# Function to recommend movies based on input titles
def recommend_movies(movie_titles, num_recommendations=3):
    """Recommend movies similar to the given titles and return a formatted report.

    Reads the notebook-level `movies` dataframe and the four precomputed similarity
    matrices (`cosine_sim_text`, `cosine_sim_genres`, `cosine_sim_themes`,
    `cosine_sim_directors`). Rows of `movies` are matched to matrix rows by
    *position*, so the function works even when the dataframe index has gaps.

    Parameters
    ----------
    movie_titles : list of str (a single str is also accepted)
        Titles to use as seeds; matched exactly against `movies['name']`.
        Titles that are not found are reported and skipped.
    num_recommendations : int, default 3
        Maximum number of recommendations to return.

    Returns
    -------
    str
        One formatted entry per recommendation, or an error message when none of
        the titles exist in the dataset. The per-feature "Similarity" figures are
        the mean similarity between the recommended movie and the seed movies.

    Side effects
    ------------
    Prints the first matching row for every seed title that is found.
    """
    if isinstance(movie_titles, str):
        movie_titles = [movie_titles]

    # (label, weight, matrix). The weights are not normalized (they sum to 1.08);
    # only their relative size affects the ranking.
    weighted_features = (
        ('text', 0.75, cosine_sim_text),
        ('genres', 0.13, cosine_sim_genres),
        ('themes', 0.1, cosine_sim_themes),
        ('directors', 0.1, cosine_sim_directors),
    )

    # Positional row numbers of the seed movies (NOT index labels): the similarity
    # matrices and `movies.iloc` below are both addressed by position.
    titles = movies['name']
    seed_positions = np.flatnonzero(titles.isin(movie_titles).to_numpy())
    if seed_positions.size == 0:
        return "One or more of the input movies do not exist in the dataset."

    for title in movie_titles:
        input_movie = movies.loc[titles == title]
        if input_movie.empty:
            # Previously this raised IndexError when only some titles were missing.
            print(f"Skipping '{title}': not found in the dataset.\n")
            continue
        print(f"Inputted:\n{input_movie.values[0]}\n\n")

    # For each feature, the rows of the similarity matrix belonging to the seeds:
    # shape (number of seeds, number of movies).
    seed_rows = {
        label: np.asarray(matrix)[seed_positions]
        for label, _, matrix in weighted_features
    }

    # Weighted sum over features and seeds gives one score per candidate movie.
    combined = sum(
        weight * seed_rows[label].sum(axis=0)
        for label, weight, _ in weighted_features
    )

    # Rank best-first (stable, so ties keep dataset order) and never recommend a
    # seed movie back to the user.
    ranked = np.argsort(-combined, kind='stable')
    ranked = ranked[~np.isin(ranked, seed_positions)][:num_recommendations]

    formatted_recommendations = []
    for position in ranked:
        movie = movies.iloc[position]
        # Explain the pick by its average similarity to the seed movies per feature.
        reason = {label: seed_rows[label][:, position].mean() for label in seed_rows}
        formatted_recommendations.append(f"Title: {movie['name']}\n"
                                         f"Genres: {movie['genre']}\n"
                                         f"Directors: {movie['name_director']}\n"
                                         f"Tagline: {movie['tagline']}\n"
                                         f"Reasoning:\n"
                                         f"  - Genres Similarity: {reason['genres']:.2f}\n"
                                         f"  - Themes Similarity: {reason['themes']:.2f}\n"
                                         f"  - Directors Similarity: {reason['directors']:.2f}\n"
                                         f"  - Text Similarity: {reason['text']:.2f}\n")

    return "\n".join(formatted_recommendations)

# Demo: ask for recommendations seeded by a single title, then show the
# formatted report returned by recommend_movies.
input_titles = ["Inception"]  # Input movie titles
recommendations = recommend_movies(movie_titles=input_titles)

print(recommendations)

In [None]:
# Preview the first 30 rows of the prepared dataset.
movies.iloc[:30]