In [125]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from collections import defaultdict
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD

In [126]:
# --- Load the raw tables (CSV files expected in the working directory) -------
movies = pd.read_csv('movies.csv')  # Top 10,000 movies
genres = pd.read_csv('genres.csv')
themes = pd.read_csv('themes.csv')
crew = pd.read_csv('crew.csv')
languages = pd.read_csv('languages.csv')
actors = pd.read_csv('actors.csv')
countries = pd.read_csv('countries.csv')

# Keep only movies whose language row says English ("Primary language" / "Language").
english_movies = languages[(languages['language'] == 'English') &
                           (languages['type'].isin(['Primary language', 'Language']))]
# De-duplicate ids first: if a movie matched more than one language row, the
# inner merge would otherwise emit that movie several times.
english_ids = english_movies[['id']].drop_duplicates()
movies = pd.merge(movies, english_ids, on='id', how='inner')

# Keep only movies released from 1990 through 2023 (inclusive) with a rating of
# at least 3.0.  Boolean indexing actually removes the rows; the previous
# `DataFrame.where` calls only blanked them to NaN (turning `id` into a float
# column) and relied on the later `dropna` to get rid of them.
released_in_range = movies['date'].between(1990.0, 2023.0)
rated_high_enough = movies['rating'] >= 3.0
movies = movies[released_in_range & rated_high_enough]

# Merge genres, themes, directors and countries into the movies dataframe.
# Each lookup table is collapsed to one space-separated string per movie id.
movies_genres = genres.groupby('id')['genre'].apply(lambda x: ' '.join(x)).reset_index()
movies = pd.merge(movies, movies_genres, on='id', how='left')

movies_themes = themes.groupby('id')['theme'].apply(lambda x: ' '.join(x)).reset_index()
movies = pd.merge(movies, movies_themes, on='id', how='left')

# `name` already exists in `movies` (the title), so the director column is
# suffixed and ends up as `name_director`.
movies_directors = crew[crew['role'] == 'Director'].groupby('id')['name'].apply(lambda x: ' '.join(map(str, x))).reset_index()
movies = pd.merge(movies, movies_directors, on='id', how='left', suffixes=('', '_director'))

movies_countries = countries.groupby('id')['country'].apply(lambda x: ' '.join(x)).reset_index()
movies = pd.merge(movies, movies_countries, on='id', how='left')

# Keep only movies that have a runtime, theme, genre, description, director and
# country; then build the combined text column used for TF-IDF.
movies = (
    movies
    .dropna(subset=['minute', 'theme', 'genre', 'description', 'name_director', 'country'])
    # A missing tagline is fine: treat it as empty text.
    .assign(tagline=lambda df: df['tagline'].fillna(''))
    # Combine textual features into a single column.
    .assign(text=lambda df: df['description'] + " " + df['tagline'] + " " + df['name'])
    # `text` is NaN when the title is missing; drop those rows.
    .dropna(subset=['text'])
    .head(50000)
    # Make index labels equal row positions.  The feature / similarity matrices
    # built from this frame are addressed by position, so a gapped index would
    # make label-based lookups point at the wrong movie.
    .reset_index(drop=True)
)

movies.head()

Unnamed: 0,id,name,date,tagline,description,minute,rating,genre,theme,name_director,country,text
0,1000001.0,Barbie,2023.0,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114.0,3.86,Comedy Adventure,Humanity and the world around us Crude humor a...,Greta Gerwig,UK USA,Barbie and Ken are having the time of their li...
1,1000003.0,Everything Everywhere All at Once,2022.0,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140.0,4.3,Science Fiction Adventure Comedy Action,Humanity and the world around us Moving relati...,Daniel Scheinert Daniel Kwan,USA,An aging Chinese immigrant is swept up in an i...
2,1000004.0,Fight Club,1999.0,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139.0,4.27,Drama,Intense violence and sexual transgression Huma...,David Fincher,Germany USA,A ticking-time-bomb insomniac and a slippery s...
3,1000005.0,La La Land,2016.0,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129.0,4.09,Drama Comedy Music Romance,Song and dance Humanity and the world around u...,Damien Chazelle,Hong Kong USA,"Mia, an aspiring actress, serves lattes to mov..."
4,1000006.0,Oppenheimer,2023.0,The world forever changes.,The story of J. Robert Oppenheimer's role in t...,181.0,4.23,Drama History,Humanity and the world around us Politics and ...,Christopher Nolan,UK USA,The story of J. Robert Oppenheimer's role in t...


In [127]:
# Number of movies that survived the filtering above.
len(movies)

6408

In [128]:
# Vectorize the textual data using TF-IDF.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
text_matrix = tfidf_vectorizer.fit_transform(movies['text'])
# NOTE: `fit_transform` re-fits the same vectorizer, so after this line
# `tfidf_vectorizer` holds the director vocabulary, not the text vocabulary.
# The returned matrices are independent of each other.
director_matrix = tfidf_vectorizer.fit_transform(movies['name_director'].fillna(''))

# Dimensionality reduction of the text TF-IDF matrix to 100 components.
# A fixed random_state makes the (randomized) SVD, and therefore
# `feature_matrix`, reproducible across kernel restarts.
svd = TruncatedSVD(n_components=100, random_state=42)
reduced_text_matrix = svd.fit_transform(text_matrix)

# Bag-of-words encoding of genres, themes and countries.  Every
# whitespace-separated token is a feature, so multi-word labels are split into
# their individual words.  `token_pattern=None` avoids scikit-learn's warning
# that the default pattern is ignored when a custom tokenizer is supplied;
# `str.split` behaves like the former `lambda x: x.split()` but is picklable.
# As above, the vectorizer object is re-fitted for each column.
count_vectorizer = CountVectorizer(tokenizer=str.split, token_pattern=None)
genre_matrix = count_vectorizer.fit_transform(movies['genre'].fillna('').values)
theme_matrix = count_vectorizer.fit_transform(movies['theme'].fillna('').values)
country_matrix = count_vectorizer.fit_transform(movies['country'].fillna('').values)

# Scale release year and rating to [0, 1] so they are comparable to the other
# feature blocks.
scaler = MinMaxScaler()
numerical_features = scaler.fit_transform(movies[['date', 'rating']].fillna(0))

# All features side by side in a single feature matrix (one row per movie).
feature_matrix = hstack([reduced_text_matrix, director_matrix, genre_matrix, theme_matrix, country_matrix, numerical_features])
print("Feature matrix shape:", feature_matrix.shape)

Feature matrix shape: (6408, 4845)




In [129]:
def _self_similarity(matrix, done_message):
    """Return the movie-by-movie cosine similarity of `matrix` and report completion."""
    similarity = cosine_similarity(matrix, matrix)
    print(done_message)
    return similarity


# Pairwise cosine similarity for every feature family, one matrix per family.
# Text and director similarities come from the TF-IDF matrices ...
cosine_sim_text = _self_similarity(text_matrix, "Completed text similarity")
cosine_sim_directors = _self_similarity(director_matrix, "Completed director similarity")

# ... genre, theme and country similarities from the token-count matrices.
cosine_sim_genres = _self_similarity(genre_matrix, "Completed genre similarity")
cosine_sim_themes = _self_similarity(theme_matrix, "Completed theme similarity")
cosine_sim_countries = _self_similarity(country_matrix, "Completed country similarity")


Completed text similarity
Completed director similarity
Completed genre similarity
Completed theme similarity
Completed country similarity


In [130]:
# Function to recommend movies based on input titles
def recommend_movies(movie_titles, num_recommendations=15, return_raw=False, diversify=False):
    """Recommend movies that are similar to the given input titles.

    Every movie gets a weighted sum of its cosine similarities to each input
    movie (text 0.90, genres 0.15, themes 0.03, directors 0.15, minus 0.05 of
    the director similarity when the director string is identical).  The
    highest scoring movies are returned; the input movies themselves are never
    recommended.

    Relies on the module-level `movies` frame and the `cosine_sim_text`,
    `cosine_sim_genres`, `cosine_sim_themes` and `cosine_sim_directors`
    matrices.  Rows of those matrices are addressed by *row position* in
    `movies`, never by index label, so a non-contiguous index is harmless.

    Parameters
    ----------
    movie_titles : list of str (a single str is also accepted)
        Titles to base the recommendation on; matched exactly against
        `movies['name']`.  Every movie carrying one of the titles is used.
    num_recommendations : int, default 15
        Maximum number of recommendations to return.
    return_raw : bool, default False
        If True, return the list of recommendation dicts instead of text.
    diversify : bool, default False
        If True, only keep candidates that share at least one
        whitespace-separated genre token with the input movies, and keep at
        most one movie per director string.

    Returns
    -------
    list of dict or str
        With `return_raw=True`, a list of dicts with the keys 'title',
        'genres', 'directors', 'tagline', 'reasoning' and 'country', where
        'reasoning' holds the mean similarity to the input movies for
        'genres', 'themes', 'directors' and 'text'.  Otherwise a formatted
        string.  If the title list is empty or any title is missing from the
        dataset, an error message string is returned (in both modes).
    """
    if isinstance(movie_titles, str):
        movie_titles = [movie_titles]
    movie_titles = list(movie_titles)

    names = movies['name']

    # Check that *every* input movie exists in the dataset.
    if not movie_titles or any(not (names == title).any() for title in movie_titles):
        return "One or more of the input movies do not exist in the dataset."

    for title in movie_titles:
        input_movie = movies.loc[names == title]
        formatted_input = (f"Inputted:\n{input_movie.values[0]}\n\n")
        print(formatted_input)

    # Row positions (not index labels) of the input movies.
    input_positions = np.flatnonzero(names.isin(movie_titles).to_numpy())

    directors = movies['name_director'].to_numpy()
    genre_strings = movies['genre'].to_numpy()
    total_movies = len(movies)

    # Accumulate the weighted similarity of every movie to each input movie.
    scores = np.zeros(total_movies, dtype=float)
    for pos in input_positions:
        director_scores = np.asarray(cosine_sim_directors[pos], dtype=float)

        scores += 0.90 * np.asarray(cosine_sim_text[pos], dtype=float)    # High weight for text
        scores += 0.15 * np.asarray(cosine_sim_genres[pos], dtype=float)  # Moderate weight for genres
        scores += 0.03 * np.asarray(cosine_sim_themes[pos], dtype=float)  # Low weight for themes
        scores += 0.15 * director_scores                                  # Moderate weight for directors

        # Penalize same director
        same_director = directors == directors[pos]
        scores[same_director] -= 0.05 * director_scores[same_director]

    # Rank all movies by score (stable, so ties keep dataset order) and drop
    # the input movies from the candidate list.
    is_input = np.zeros(total_movies, dtype=bool)
    is_input[input_positions] = True
    ranked_positions = [int(pos) for pos in np.argsort(-scores, kind='stable') if not is_input[pos]]

    limit = max(int(num_recommendations), 0)

    if diversify:
        # Genre tokens of all input movies.
        input_genres = set()
        for pos in input_positions:
            input_genres.update(genre_strings[pos].split())

        recommended_positions = []
        seen_directors = set()
        for pos in ranked_positions:
            if len(recommended_positions) >= limit:
                break
            if input_genres.isdisjoint(genre_strings[pos].split()):
                continue
            if directors[pos] in seen_directors:  # Limit to one movie per director
                continue
            seen_directors.add(directors[pos])
            recommended_positions.append(pos)
    else:
        recommended_positions = ranked_positions[:limit]

    recommendations = []
    for pos in recommended_positions:
        movie = movies.iloc[pos]
        # Mean similarity between this recommendation and the input movies.
        reason = {
            'genres': float(np.asarray(cosine_sim_genres[pos])[input_positions].mean()),
            'themes': float(np.asarray(cosine_sim_themes[pos])[input_positions].mean()),
            'directors': float(np.asarray(cosine_sim_directors[pos])[input_positions].mean()),
            'text': float(np.asarray(cosine_sim_text[pos])[input_positions].mean())
        }

        recommendations.append({
            'title': movie['name'],
            'genres': movie['genre'],
            'directors': movie['name_director'],
            'tagline': movie['tagline'],
            'reasoning': reason,
            'country': movie['country']
        })

    if return_raw:
        return recommendations  # returns raw data for evaluation

    # Format output to make it more readable
    formatted_recommendations = []
    for recommendation in recommendations:
        formatted_recommendations.append(f"Title: {recommendation['title']}\n"
                                        f"Genres: {recommendation['genres']}\n"
                                        f"Directors: {recommendation['directors']}\n"
                                        f"Tagline: {recommendation['tagline']}\n"
                                        f"Reasoning:\n"
                                        f"  - Genres Similarity: {recommendation['reasoning']['genres']:.2f}\n"
                                        f"  - Themes Similarity: {recommendation['reasoning']['themes']:.2f}\n"
                                        f"  - Directors Similarity: {recommendation['reasoning']['directors']:.2f}\n"
                                        f"  - Text Similarity: {recommendation['reasoning']['text']:.2f}\n")

    return "\n".join(formatted_recommendations)

# Demo: ask for recommendations based on a single seed movie.
input_titles = ["Get Out"]
recommendations = recommend_movies(movie_titles=input_titles)

# Show the formatted recommendation text.
print(recommendations)

Inputted:
[1000014.0 'Get Out' 2017.0
 "Just because you're invited, doesn't mean you're welcome."
 "Chris and his girlfriend Rose go upstate to visit her parents for the weekend. At first, Chris reads the family's overly accommodating behavior as nervous attempts to deal with their daughter's interracial relationship, but as the weekend progresses, a series of increasingly disturbing discoveries lead him to a truth that he never could have imagined."
 104.0 4.16 'Horror Mystery Thriller'
 'Horror, the undead and monster classics Intense violence and sexual transgression Twisted dark psychological thriller Terrifying, haunted, and supernatural horror Creepy, chilling, and terrifying horror Gory, gruesome, and slasher horror Racism and the powerful fight for justice'
 'Jordan Peele' 'USA'
 "Chris and his girlfriend Rose go upstate to visit her parents for the weekend. At first, Chris reads the family's overly accommodating behavior as nervous attempts to deal with their daughter's inter

In [131]:
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_recommendations(input_titles, recommendations, ground_truth, k=10):
    """Score a recommendation list with Precision@k and intra-list similarity.

    Relies on the module-level `movies` frame and `feature_matrix`; rows of
    `feature_matrix` are looked up by the *row position* of a title in
    `movies`, not by index label.

    Parameters
    ----------
    input_titles : list of str
        Titles the recommendations were generated from.  Not used by the
        computation; kept so existing callers keep working.
    recommendations : list of dict
        Raw recommendations (each with a 'title' key), e.g. the output of
        `recommend_movies(..., return_raw=True)`.
    ground_truth : iterable of str
        Titles considered relevant.
    k : int, default 10
        Only the first `k` recommendations are evaluated.

    Returns
    -------
    dict
        "Precision@k": number of the first `k` recommended titles found in
        `ground_truth`, divided by `k`.
        "Intra-list Similarity": mean cosine similarity over all pairs of
        *distinct* recommended movies (self-similarity is excluded), or None
        when fewer than two recommended titles could be located.

    Raises
    ------
    ValueError
        If `k` is not positive, or if `recommendations` is a string (the error
        message `recommend_movies` returns for unknown titles).
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    if isinstance(recommendations, str):
        raise ValueError(f"Expected a list of recommendations, got a message instead: {recommendations}")

    recommended_titles = [rec['title'] for rec in recommendations[:k]]
    relevant_titles = set(ground_truth)
    hit_count = sum(1 for title in recommended_titles if title in relevant_titles)
    precision_at_k = hit_count / k

    # Map each recommended title to its row position in `movies`.  If several
    # movies share a title, the first one is used.
    names = movies['name'].to_numpy()
    row_count = feature_matrix.shape[0]
    recommended_positions = []
    for title in recommended_titles:
        matches = np.flatnonzero(names == title)
        if matches.size == 0:
            print(f"Title '{title}' not found in dataset.")
            continue
        position = int(matches[0])
        # Guard against `movies` and `feature_matrix` being out of sync.
        if position < row_count:
            recommended_positions.append(position)

    # Pairwise similarity needs at least two items.
    if len(recommended_positions) < 2:
        return {"Precision@k": precision_at_k, "Intra-list Similarity": None}

    feature_vectors = feature_matrix.tocsr()[recommended_positions]
    similarity = np.asarray(cosine_similarity(feature_vectors))

    # Average over off-diagonal entries only: the diagonal is each movie's
    # similarity to itself and would inflate the score.
    item_count = similarity.shape[0]
    off_diagonal_total = similarity.sum() - np.trace(similarity)
    intra_list_similarity = float(off_diagonal_total / (item_count * (item_count - 1)))

    return {"Precision@k": precision_at_k, "Intra-list Similarity": intra_list_similarity}

# Demo: evaluate the recommender on one seed movie against a hand-picked
# list of titles considered relevant to it.
input_titles = ["Inception"]
ground_truth = ["Interstellar", "Tenet", "The Prestige", "Guardians of the Galaxy"]  # input movies related to it to test
recommendations = recommend_movies(
    input_titles,
    num_recommendations=10,
    return_raw=True,
)
evaluation_results = evaluate_recommendations(
    input_titles,
    recommendations,
    ground_truth,
)
print(evaluation_results)


Inputted:
[1000018.0 'Inception' 2010.0 'Your mind is the scene of the crime.'
 'Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: "inception", the implantation of another person\'s idea into a target\'s subconscious.'
 148.0 4.2 'Action Adventure Science Fiction'
 'High speed and special ops Humanity and the world around us Dreamlike, quirky, and surreal storytelling Surreal and thought-provoking visions of life and death Emotional and captivating fantasy storytelling Explosive and action-packed heroes vs. villains Thought-provoking sci-fi action and future technology'
 'Christopher Nolan' 'UK USA'
 'Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: "inception", the implantation of another person\'