In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans 
from collections import defaultdict
from scipy.sparse import hstack
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Load the movie table and the per-movie attribute tables.
movies = pd.read_csv('movies.csv')  # Top 10,000 movies
genres = pd.read_csv('genres.csv')
themes = pd.read_csv('themes.csv')
crew = pd.read_csv('crew.csv')
languages = pd.read_csv('languages.csv')
actors = pd.read_csv('actors.csv')
countries = pd.read_csv('countries.csv')
studios = pd.read_csv('studios.csv')

# Ensure consistent `id` type across all datasets so every merge below compares equal keys
for frame in (movies, genres, themes, crew, languages, actors, countries, studios):
    frame['id'] = frame['id'].astype(str)


def _join_per_movie(frame, column):
    """Collapse `column` of `frame` into one space-separated string per movie `id`.

    Missing values are skipped and every value is cast to `str` before joining.
    Returns a two-column frame: `id` and `column`.
    """
    return (
        frame.groupby('id')[column]
        .apply(lambda values: ' '.join(values.dropna().astype(str)))
        .reset_index()
    )


# Filter for movies only in the English language
english_movies = languages[
    (languages['language'] == 'English') &
    (languages['type'].isin(['Primary language', 'Language']))
]
# De-duplicate the ids first: a film matching both language types would otherwise
# appear twice after the inner merge.
movies = pd.merge(movies, english_movies[['id']].drop_duplicates(), on='id', how='inner')

# Filter for movies released from 1990 through 2023 (both bounds inclusive)
movies = movies[(movies['date'] >= 1990.0) & (movies['date'] <= 2023.0)]

# Filter for movies only with a rating of at least 3.0
movies = movies[movies['rating'] >= 3.0]

# Merge genres into the movies dataframe
movies_genres = _join_per_movie(genres, 'genre')
movies = pd.merge(movies, movies_genres, on='id', how='left')

# Merge themes into the movies dataframe
movies_themes = _join_per_movie(themes, 'theme')
movies = pd.merge(movies, movies_themes, on='id', how='left')

# Merge directors into the movies dataframe (the clashing `name` column becomes `name_director`)
movies_directors = _join_per_movie(crew[crew['role'] == 'Director'], 'name')
movies = pd.merge(movies, movies_directors, on='id', how='left', suffixes=('', '_director'))

# Merge countries into the movies dataframe
movies_countries = _join_per_movie(countries, 'country')
movies = pd.merge(movies, movies_countries, on='id', how='left')

# Merge actors into the movies dataframe (the clashing `name` column becomes `name_actors`)
movies_actors = _join_per_movie(actors, 'name')
movies = pd.merge(movies, movies_actors, on='id', how='left', suffixes=('', '_actors'))

# Merge studios into the movies dataframe
movies_studios = _join_per_movie(studios, 'studio')
movies = pd.merge(movies, movies_studios, on='id', how='left', suffixes=('', '_studios'))

# Drop movies without critical information
movies = movies.dropna(
    subset=['minute', 'theme', 'genre', 'description', 'name_director', 'country']
)

# Fill missing values for text-related fields
movies = movies.assign(
    tagline=movies['tagline'].fillna(''),
    description=movies['description'].fillna(''),
    name=movies['name'].fillna(''),
)

# Combine textual features into a single column.
# `name_actors` and `studio` are not filled above, so a movie missing either one
# gets a NaN `text` (NaN propagates through string concatenation).
movies = movies.assign(
    text=(
        movies['description'] + " " + movies['tagline'] + " " + movies['name'] +
        " " + movies['country'] + " " + movies['name_actors'] + " " + movies['studio']
    )
)

# Drop rows without textual data (this is where movies lacking actors or studios are removed)
movies = movies.dropna(subset=['text'])

# Limit to the first 50,000 movies
movies = movies.head(50000)

# The dropna calls above leave gaps in the index. Later cells index the similarity
# and feature matrices by row position, so make labels and positions identical.
movies = movies.reset_index(drop=True)

# Preview the final dataframe
print(movies.head())


        id                               name    date  \
0  1000001                             Barbie  2023.0   
1  1000003  Everything Everywhere All at Once  2022.0   
2  1000004                         Fight Club  1999.0   
3  1000005                         La La Land  2016.0   
4  1000006                        Oppenheimer  2023.0   

                                            tagline  \
0                  She's everything. He's just Ken.   
1  The universe is so much bigger than you realize.   
2                           Mischief. Mayhem. Soap.   
3                    Here's to the fools who dream.   
4                        The world forever changes.   

                                         description  minute  rating  \
0  Barbie and Ken are having the time of their li...   114.0    3.86   
1  An aging Chinese immigrant is swept up in an i...   140.0    4.30   
2  A ticking-time-bomb insomniac and a slippery s...   139.0    4.27   
3  Mia, an aspiring actress, serves la

In [5]:
# Number of movies left after filtering and cleaning
movies.shape[0]


6196

In [6]:
import umap.umap_ as umap

# Vectorize the textual data using TF-IDF.
# The same vectorizer object is refit by every fit_transform call below, so after
# this cell `tfidf_vectorizer` holds the vocabulary of the last column it was fit on.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
text_matrix = tfidf_vectorizer.fit_transform(movies['text'])
director_matrix = tfidf_vectorizer.fit_transform(movies['name_director'].fillna(''))

# Use the correct column name for actors
actor_column = 'name_actors'  # Ensure this matches the column created during merging
if actor_column in movies.columns:
    actor_matrix = tfidf_vectorizer.fit_transform(movies[actor_column].fillna(''))
    print(f"Actor matrix created with shape: {actor_matrix.shape}")
else:
    print(f"Column '{actor_column}' not found. Skipping actor_matrix.")
    actor_matrix = None  # Fallback in case actors are not present

# Dimensionality Reduction using UMAP (replacing TruncatedSVD)
try:
    reducer = umap.UMAP(n_neighbors=15, n_components=50, random_state=42)
    # The TF-IDF matrix is densified before fitting, which is the memory-heavy step.
    # NOTE(review): umap-learn documents support for scipy sparse input; confirm
    # whether `.toarray()` is actually needed before keeping it.
    reduced_text_matrix = reducer.fit_transform(text_matrix.toarray())
    print("UMAP dimensionality reduction completed.")
except MemoryError:
    print("MemoryError: Try reducing the number of features or sampling data for UMAP.")
    # Re-raise: continuing would leave `reduced_text_matrix` undefined and surface
    # later as an unrelated NameError.
    raise

# Token-count encode the genres, themes, and countries. The tokenizer splits on
# whitespace, so a multi-word label contributes one column per word.
count_vectorizer = CountVectorizer(tokenizer=lambda x: x.split())
genre_matrix = count_vectorizer.fit_transform(movies['genre'].fillna('').values)
theme_matrix = count_vectorizer.fit_transform(movies['theme'].fillna('').values)
country_matrix = count_vectorizer.fit_transform(movies['country'].fillna('').values)

# Scale numerical features to the [0, 1] range
scaler = MinMaxScaler()
numerical_features = scaler.fit_transform(movies[['date', 'rating']].fillna(0))

# Full feature matrix: reduced text first, then (optionally) actors, then the rest
feature_matrices = [reduced_text_matrix, director_matrix, genre_matrix, theme_matrix, country_matrix, numerical_features]
if actor_matrix is not None:
    feature_matrices.insert(1, actor_matrix)

# Combine all feature matrices column-wise (`hstack` is imported in the imports cell)
try:
    feature_matrix = hstack(feature_matrices)  # Combine sparse and dense matrices
    print("Feature matrix shape:", feature_matrix.shape)
except MemoryError:
    print("MemoryError: Combining matrices failed. Consider reducing feature dimensions or sampling the dataset.")
    # Re-raise so later cells do not run without `feature_matrix`.
    raise


Actor matrix created with shape: (6196, 61037)


  warn(


UMAP dimensionality reduction completed.
Feature matrix shape: (6196, 65689)




In [7]:
# Pairwise cosine similarity between all movies, one square matrix per feature family.
# Row/column i corresponds to row i of the matrix it was computed from.
cosine_sim_text = cosine_similarity(text_matrix, text_matrix)
cosine_sim_directors = cosine_similarity(director_matrix, director_matrix)
cosine_sim_genres = cosine_similarity(genre_matrix, genre_matrix)
cosine_sim_themes = cosine_similarity(theme_matrix, theme_matrix)
cosine_sim_countries = cosine_similarity(country_matrix, country_matrix)
# `actor_matrix` is None when the feature cell could not find the actor column;
# mirror that fallback here instead of crashing inside cosine_similarity.
cosine_sim_actors = (
    cosine_similarity(actor_matrix, actor_matrix) if actor_matrix is not None else None
)

# Cluster movies by their combined genre and theme token counts
genre_theme_combined = hstack([genre_matrix, theme_matrix])
kmeans = KMeans(n_clusters=10, random_state=42)
movies['cluster'] = kmeans.fit_predict(genre_theme_combined)


In [8]:
# Function to recommend movies based on input titles
def recommend_movies(movie_titles, num_recommendations=15, return_raw = False):
    """Recommend movies similar to the given input titles.

    Reads the notebook-level `movies` frame and the `cosine_sim_text`,
    `cosine_sim_genres`, `cosine_sim_themes` and `cosine_sim_directors` matrices.
    Assumes row i of each matrix describes the i-th row (by position) of `movies`.

    Each candidate's score is summed over the input movies:
    0.90 * text + 0.15 * genre + 0.03 * theme + 0.15 * director similarity,
    minus 0.05 * director similarity when the director string is identical.
    The input movies themselves are never recommended.

    Args:
        movie_titles (list[str] | str): Titles to base the recommendations on.
            Titles absent from the dataset are reported and ignored.
        num_recommendations (int): Maximum number of recommendations returned.
        return_raw (bool): If True, return the list of recommendation dicts
            instead of the formatted text.

    Returns:
        str | list[dict]: An error message string when none of the titles exist.
        Otherwise the formatted report, or (with `return_raw=True`) a list of
        dicts with keys 'title', 'genres', 'directors', 'tagline', 'reasoning'
        and 'country'. Each 'reasoning' value is the candidate's mean similarity
        to the input movies for that feature.
    """
    # Accept a bare title as well as a list of titles.
    if isinstance(movie_titles, str):
        movie_titles = [movie_titles]

    names = movies['name']
    # Row positions (not index labels): the similarity matrices are positional.
    input_positions = np.flatnonzero(names.isin(movie_titles).to_numpy())
    if input_positions.size == 0:
        return "One or more of the input movies do not exist in the dataset."

    for title in movie_titles:
        input_movie = movies.loc[names == title]
        if input_movie.empty:
            # Previously this raised IndexError when only some titles were missing.
            print(f"Input title '{title}' was not found in the dataset and is ignored.")
            continue
        formatted_input = (f"Inputted:\n{input_movie.values[0]}\n\n")
        print(formatted_input)

    # Accumulate the weighted similarity of every movie to each input movie.
    directors = movies['name_director'].to_numpy()
    totals = np.zeros(len(movies), dtype=float)
    for pos in input_positions:
        director_scores = cosine_sim_directors[pos]
        same_director = directors == directors[pos]
        totals += (
            0.90 * cosine_sim_text[pos]          # High weight for text
            + 0.15 * cosine_sim_genres[pos]      # Moderate weight for genres
            + 0.03 * cosine_sim_themes[pos]      # Low weight for themes
            + 0.15 * director_scores             # Moderate weight for directors
            - 0.05 * director_scores * same_director  # Penalize same director
        )

    # Rank by score, highest first; the stable sort keeps dataset order on ties.
    is_candidate = np.ones(len(movies), dtype=bool)
    is_candidate[input_positions] = False
    ranked = np.argsort(-totals, kind='stable')
    # Cap at num_recommendations. The earlier genre/director "fill" pass stopped
    # only at exactly 10 items, so with the default of 15 it never stopped.
    recommended_positions = ranked[is_candidate[ranked]][:num_recommendations]

    recommendations = []
    for pos in recommended_positions:
        movie = movies.iloc[pos]
        # Similarity of this candidate to the input movies, averaged over the inputs.
        reason = {
            'genres': cosine_sim_genres[pos, input_positions].mean(),
            'themes': cosine_sim_themes[pos, input_positions].mean(),
            'directors': cosine_sim_directors[pos, input_positions].mean(),
            'text': cosine_sim_text[pos, input_positions].mean()
        }

        recommendations.append({
            'title': movie['name'],
            'genres': movie['genre'],
            'directors': movie['name_director'],
            'tagline': movie['tagline'],
            'reasoning': reason,
            'country': movie['country']
        })

    if return_raw:
        return recommendations  # raw data for evaluation

    # Formatted output
    formatted_recommendations = []
    for recommendation in recommendations:
        formatted_recommendations.append(f"Title: {recommendation['title']}\n"
                                        f"Genres: {recommendation['genres']}\n"
                                        f"Directors: {recommendation['directors']}\n"
                                        f"Tagline: {recommendation['tagline']}\n"
                                        f"Reasoning:\n"
                                        f"  - Genres Similarity: {recommendation['reasoning']['genres']:.2f}\n"
                                        f"  - Themes Similarity: {recommendation['reasoning']['themes']:.2f}\n"
                                        f"  - Directors Similarity: {recommendation['reasoning']['directors']:.2f}\n"
                                        f"  - Text Similarity: {recommendation['reasoning']['text']:.2f}\n")

    return "\n".join(formatted_recommendations)

# Example usage: ask for films similar to a single seed title
input_titles = ["Get Out"]
recommendations = recommend_movies(movie_titles=input_titles)

# Show what recommend_movies returned (formatted text by default)
print(recommendations)

Inputted:
['1000014' 'Get Out' 2017.0
 "Just because you're invited, doesn't mean you're welcome."
 "Chris and his girlfriend Rose go upstate to visit her parents for the weekend. At first, Chris reads the family's overly accommodating behavior as nervous attempts to deal with their daughter's interracial relationship, but as the weekend progresses, a series of increasingly disturbing discoveries lead him to a truth that he never could have imagined."
 104.0 4.16 'Horror Mystery Thriller'
 'Horror, the undead and monster classics Intense violence and sexual transgression Twisted dark psychological thriller Terrifying, haunted, and supernatural horror Creepy, chilling, and terrifying horror Gory, gruesome, and slasher horror Racism and the powerful fight for justice'
 'Jordan Peele' 'USA'
 'Daniel Kaluuya Allison Williams Bradley Whitford Catherine Keener Caleb Landry Jones Betty Gabriel Marcus Henderson Lil Rel Howery LaKeith Stanfield Stephen Root Ashley LeConte Campbell John Wilmot C

In [9]:

def evaluate_recommendations(input_titles, recommendations, ground_truth, k=10):
    """
    Evaluate the recommendation system.

    Reads the notebook-level `movies` frame and `feature_matrix`. Assumes row i
    of `feature_matrix` describes the i-th row (by position) of `movies`.

    Args:
        input_titles (list): Input movie titles (not used in the computation).
        recommendations (list): List of recommendation dictionaries from recommend_movies.
        ground_truth (list): Manually curated relevant movies; None is treated as empty.
        k (int): Number of top recommendations to evaluate. Must be positive.
    Returns:
        dict: Metrics including Precision@k, Recall@k, F1-Score@k, NDCG@k, and Intra-list Similarity.
        Intra-list Similarity is None when fewer than two recommended titles can
        be located in the dataset, since there is no pair to compare.
    Raises:
        ValueError: If k is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer.")

    truth = list(ground_truth) if ground_truth else []
    truth_lookup = set(truth)

    # Extract recommended titles
    recommended_titles = [rec['title'] for rec in recommendations[:k]]

    # Precision@k: share of the k slots filled with a relevant title
    relevant_recommendations = [title for title in recommended_titles if title in truth_lookup]
    precision_at_k = len(relevant_recommendations) / k

    # Recall@k: share of the ground truth that was recovered
    recall_at_k = len(relevant_recommendations) / len(truth) if truth else 0

    # F1-Score@k: harmonic mean of the two, or 0 when both are 0
    if precision_at_k + recall_at_k > 0:
        f1_score_at_k = 2 * (precision_at_k * recall_at_k) / (precision_at_k + recall_at_k)
    else:
        f1_score_at_k = 0

    # Normalized Discounted Cumulative Gain (NDCG@k). It depends only on the
    # titles, so it is computed even when no title can be mapped to a feature row.
    dcg = 0
    for rank, title in enumerate(recommended_titles):
        if title in truth_lookup:
            dcg += 1 / np.log2(rank + 2)  # Discounted gain
    idcg = sum(1 / np.log2(rank + 2) for rank in range(min(len(truth), k)))  # Ideal DCG
    ndcg_at_k = dcg / idcg if idcg > 0 else 0

    # Map recommended titles to row positions in the feature matrix.
    # Positions are used, not index labels, because the matrix is positional.
    n_rows = feature_matrix.shape[0]
    recommended_positions = []
    for title in recommended_titles:
        matches = np.flatnonzero((movies['name'] == title).to_numpy())
        if matches.size == 0:
            print(f"Title '{title}' not found in dataset.")
            continue
        position = int(matches[0])  # first movie carrying this title
        if position < n_rows:
            recommended_positions.append(position)

    # Intra-list Similarity (ILS): mean cosine similarity over all distinct pairs.
    # With fewer than two items there is no pair; report None instead of NaN.
    if len(recommended_positions) >= 2:
        feature_vectors = feature_matrix.tocsr()[recommended_positions]
        similarity_matrix = cosine_similarity(feature_vectors)
        pair_rows, pair_cols = np.triu_indices(len(recommended_positions), k=1)
        intra_list_similarity = np.mean(similarity_matrix[pair_rows, pair_cols])
    else:
        intra_list_similarity = None

    return {
        "Precision@k": precision_at_k,
        "Recall@k": recall_at_k,
        "F1-Score@k": f1_score_at_k,
        "NDCG@k": ndcg_at_k,
        "Intra-list Similarity": intra_list_similarity,
    }

# Example usage
input_titles = ["Inception"]
ground_truth = ["Interstellar", "Tenet", "The Prestige", "Guardians of the Galaxy"]  # Ground truth for evaluation
recommendations = recommend_movies(input_titles, num_recommendations=15, return_raw=True)
if isinstance(recommendations, str):
    # recommend_movies returns a message string (not a list) when the input
    # titles are unknown; show it instead of failing inside the evaluator.
    evaluation_results = None
    print(recommendations)
else:
    evaluation_results = evaluate_recommendations(input_titles, recommendations, ground_truth, k=10)
    print(evaluation_results)

Inputted:
['1000018' 'Inception' 2010.0 'Your mind is the scene of the crime.'
 'Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: "inception", the implantation of another person\'s idea into a target\'s subconscious.'
 148.0 4.2 'Action Adventure Science Fiction'
 'High speed and special ops Humanity and the world around us Dreamlike, quirky, and surreal storytelling Surreal and thought-provoking visions of life and death Emotional and captivating fantasy storytelling Explosive and action-packed heroes vs. villains Thought-provoking sci-fi action and future technology'
 'Christopher Nolan' 'UK USA'
 'Leonardo DiCaprio Joseph Gordon-Levitt Ken Watanabe Tom Hardy Elliot Page Dileep Rao Cillian Murphy Tom Berenger Marion Cotillard Pete Postlethwaite Michael Caine Lukas Haas Talulah Riley Tohoru Masamune Taylor Geare Claire Geare Johnathan Geare

In [10]:
# Row count check: confirm the number of movies after the cells above have run
movies.shape[0]

6196