In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tqdm.notebook import tqdm

# Import data

In [2]:
# This function takes a user and returns a list of recommeded movies
ml_ratings = pd.read_csv('../data/ml-32m/ratings.csv')
ml_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858
...,...,...,...,...
32000199,200948,79702,4.5,1294412589
32000200,200948,79796,1.0,1287216292
32000201,200948,80350,0.5,1294412671
32000202,200948,80463,3.5,1350423800


# Initialize the similiarity function (Min-hash)

In [5]:
# used to read the correct file
threshold = 0.4

# read files
df_minhash = pd.read_pickle('../data/df_min_hash.pkl')
with open(f'../data/lsh_groups_{threshold}.pkl', 'rb') as f:
    dict_lsh = pickle.load(f)


# make dict to convert from index to movieId
index_to_id = dict(zip(df_minhash.index, df_minhash.movieId))
id_to_index = dict(zip(df_minhash.movieId, df_minhash.index)) # reverse dict

def movie_recommendation_min_hash(movie_id, id_to_index=id_to_index, index_to_id=index_to_id, dict_lsh=dict_lsh, df_minhash=df_minhash):
    """
    This function takes a movieId and returns a list of recommended movies
    """
    index = id_to_index[movie_id]
    similar_movies = []
    for idx in dict_lsh[index]:
        jaccard_score = df_minhash['minhash'][index].jaccard(df_minhash['minhash'][idx])
        similar_movies.append((index_to_id[idx], jaccard_score))
    
    similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    similar_movies = [movie for movie in similar_movies if movie[0] != movie_id]
    
    return similar_movies

# similiar film to movieId 1
movie_recommendation_min_hash(1)

[(3270, 0.140625), (140016, 0.078125)]

# Initialize the similiarity function (Genre-hash)

In [6]:
import numpy as np
from collections import defaultdict
import pandas as pd


class OptimizedMovieLSH:
    def __init__(self, num_hash_functions=10, num_bands=5):
        self.num_hash_functions = num_hash_functions
        self.num_bands = num_bands
        self.hash_functions = None
        self.precomputed_hashes = None
        self.movie_ids = None
        self.hash_tables = None
        
    def generate_hash_functions(self, num_genres):
        """Generate bit sampling positions for each band."""
        # Create a (num_bands, num_hash_functions) array of bit positions
        self.hash_functions = np.array([
            np.random.choice(num_genres, size=self.num_hash_functions, replace=True)
            for _ in range(self.num_bands)
        ])
    
    def _compute_all_hashes(self, genre_matrix):
        """
        Compute all hashes for all movies at once using vectorized operations.
        
        Args:
            genre_matrix: numpy array of shape (num_movies, num_genres)
        Returns:
            numpy array of shape (num_bands, num_movies) containing hash values
        """
        num_movies = genre_matrix.shape[0]
        # Preallocate array for hash values
        hashes = np.zeros((self.num_bands, num_movies), dtype=np.int32)
        
        # For each band, compute hashes for all movies at once
        for band in range(self.num_bands):
            # Select bits for this band (vectorized operation)
            selected_bits = genre_matrix[:, self.hash_functions[band]]
            
            # Convert bits to integers using binary weights
            # This creates a unique hash value from the selected bits
            powers_of_two = 2 ** np.arange(self.num_hash_functions)
            hashes[band] = selected_bits.dot(powers_of_two)
            
        return hashes
    
    def index_movies(self, df, genres_list):
        """
        Index all movies using vectorized operations.
        """
        if self.hash_functions is None:
            self.generate_hash_functions(len(genres_list))
        
        # Convert DataFrame to numpy array for faster operations
        genre_matrix = df[genres_list].values
        self.movie_ids = np.array(df.index)
        
        # Compute all hashes at once
        self.precomputed_hashes = self._compute_all_hashes(genre_matrix)
        
        # Create hash tables using numpy operations
        self.hash_tables = [defaultdict(list) for _ in range(self.num_bands)]
        
        # Vectorized hash table construction
        for band in range(self.num_bands):
            unique_hashes, inverse_indices = np.unique(self.precomputed_hashes[band], 
                                                     return_inverse=True)
            # Create hash tables using numpy operations
            for i, hash_val in enumerate(unique_hashes):
                matching_movies = self.movie_ids[inverse_indices == i]
                self.hash_tables[band][hash_val] = matching_movies.tolist()
    
    def query(self, query_vector, threshold=10):
        """
        Find similar movies using vectorized operations.
        
        Args:
            query_vector: Binary vector of genre features
            threshold: Minimum number of matching bands
        """
        # Compute query hashes using the same method
        query_hashes = np.zeros(self.num_bands, dtype=np.int32)
        
        for band in range(self.num_bands):
            selected_bits = query_vector[self.hash_functions[band]]
            powers_of_two = 2 ** np.arange(self.num_hash_functions)
            query_hashes[band] = selected_bits.dot(powers_of_two)
        
        # Count matches for each movie using vectorized operations
        candidate_counts = defaultdict(int)
        
        # Use numpy operations to find matching movies
        for band, query_hash in enumerate(query_hashes):
            matching_movies = self.hash_tables[band].get(query_hash, [])
            for movie_id in matching_movies:
                candidate_counts[movie_id] += 1
        
        return {movie_id for movie_id, count in candidate_counts.items() 
                if count >= threshold}

    def save_state(self, filename):
        """Save the LSH state efficiently using numpy."""
        np.savez_compressed(
            filename,
            hash_functions=self.hash_functions,
            precomputed_hashes=self.precomputed_hashes,
            movie_ids=self.movie_ids
        )
    
    def load_state(self, filename):
        """Load the LSH state and rebuild hash tables efficiently."""
        data = np.load(filename)
        self.hash_functions = data['hash_functions']
        self.precomputed_hashes = data['precomputed_hashes']
        self.movie_ids = data['movie_ids']
        
        # Rebuild hash tables efficiently
        self.hash_tables = [defaultdict(list) for _ in range(self.num_bands)]
        for band in range(self.num_bands):
            unique_hashes, inverse_indices = np.unique(self.precomputed_hashes[band], 
                                                     return_inverse=True)
            for i, hash_val in enumerate(unique_hashes):
                matching_movies = self.movie_ids[inverse_indices == i]
                self.hash_tables[band][hash_val] = matching_movies.tolist()


In [7]:

df_genres = pd.read_csv('../data/df_genres.csv')

genres_list = ['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

# Initialize and index
lsh_optim = OptimizedMovieLSH(num_hash_functions=128, num_bands=128)
lsh_optim.index_movies(df_genres, genres_list)

# Save state efficiently
lsh_optim.save_state('../data/'+'movie_lsh_optimized.npz')

def batch_find_similar_movies(lsh, df, genres_list, query_movie_ids, 
                            threshold=2, similarity_threshold=0.3, batch_size=1000):
    """
    Find similar movies for multiple queries efficiently.
    
    Args:
        lsh: OptimizedMovieLSH instance
        df: DataFrame with genre data
        query_movie_ids: List of movie IDs to find similar movies for
        batch_size: Number of queries to process at once
    """
    genre_matrix = df[genres_list].values
    results = {}
    
    # Process queries in batches
    for i in range(0, len(query_movie_ids), batch_size):
        batch_ids = query_movie_ids[i:i + batch_size]
        batch_vectors = genre_matrix[np.searchsorted(df.index, batch_ids)]
        
        for idx, query_id in enumerate(batch_ids):
            candidates = lsh.query(batch_vectors[idx], threshold)
            
            # Compute similarities using vectorized operations
            if candidates:
                candidate_vectors = genre_matrix[np.searchsorted(df.index, list(candidates))]
                query_vec = batch_vectors[idx]
                
                # Vectorized Jaccard similarity computation
                intersection = (candidate_vectors & query_vec).sum(axis=1)
                union = (candidate_vectors | query_vec).sum(axis=1)
                similarities = intersection / np.maximum(union, 1)
                
                # Filter and sort results
                mask = similarities >= similarity_threshold
                similar_movies = list(zip(np.array(list(candidates))[mask], 
                                       similarities[mask]))
                results[query_id] = sorted(similar_movies, key=lambda x: x[1], 
                                         reverse=True)
    
    return results


def movie_recommendation_genres(movie_id):
    result_dict = batch_find_similar_movies(query_movie_ids=[movie_id], lsh=lsh_optim, df=df_genres, genres_list=genres_list)
    return result_dict[movie_id]


# Recommend using bert embeddings

In [23]:

import pandas as pd
import numpy as np
from scipy.spatial.distance import jaccard as jacc_score

# Load the data
df_bert = pd.read_csv('./../data/df_clusters.csv')

def movie_recommendation_bert(movie_id):
    """
    This function takes a movieId and returns a list of recommended movies
    based on Jaccard similarity of movies in the same cluster.
    """
    # Get the cluster label for the given movie ID
    movie_cluster = df_bert[df_bert['movieId'] == movie_id]

    if movie_cluster.empty:
        raise ValueError(f"Movie ID {movie_id} not found in the dataset holding decriptions.")
        
    movie_cluster = movie_cluster['cluster_label'].item()
    # Load embeddings for the cluster
    embeddings = np.load(f'/work3/s204161/data/clustered_embeddings/{movie_cluster}.npy')
    
    # Reshape embeddings to flatten the last two dimensions
    embeddings = embeddings.reshape(embeddings.shape[0], -1)  # Shape becomes (N, 106*768)
    
    # Filter movies in the same cluster and get their DataFrame indices
    cluster_movies = df_bert[df_bert['cluster_label'] == movie_cluster]['movieId'].tolist()
    
    # Map DataFrame indices to embedding indices
    movie_id_to_embedding_index = {idx: i for i, idx in enumerate(cluster_movies)}
    
    # Check if movie_id exists in the mapping
    if movie_id not in movie_id_to_embedding_index:
        raise ValueError(f"Movie ID {movie_id} is not in the cluster {movie_cluster}.")
    
    movie_embedding_index = movie_id_to_embedding_index[movie_id]

    # Compute Jaccard similarity for the movies in the cluster
    similar_movies = []
    for idx in cluster_movies:
        embedding_idx = movie_id_to_embedding_index[idx]
        if embedding_idx != movie_embedding_index:  # Skip the movie itself
            similarity = jacc_score(embeddings[movie_embedding_index], embeddings[embedding_idx])
            similar_movies.append((idx, similarity))
    
    # Sort movies by similarity score in descending order 
    similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)

    return similar_movies

# Similar movies to movieId 1
recommendations = movie_recommendation_bert(1)[:5]
print(recommendations) # jaccard is for movies in cluster very close to 1.0 or just 1.0 for the bert embeddings...
df_bert[df_bert['movieId'] == 1].description.item()

[(133065, 0.999951774691358), (7930, 0.9999397183641975), (52375, 0.9999397183641975), (141381, 0.9999397183641975), (202263, 0.9999397183641975)]


"A cowboy doll is profoundly threatened and jealous when a new spaceman figure supplants him as top toy in a boy's room."

In [26]:
#top 10 most similair movies to toy story using BERT embeddings
for movieid in recommendations[:10]:
    print(df_genres[df_genres['movieId'] == movieid[0]].title.values[0])
    print(df_genres[df_genres['movieId'] == movieid[0]].description.values[0])


From Hell to Victory (1979)
In 1939, at a Paris café, six friends of various nationalities vow to meet again at the same spot after the end of WW2.
People Under the Stairs, The (1991)
Two adults and a juvenile break into a house occupied by a brother and sister and their stolen children. There, they must fight for their lives.
Hoax, The (2007)
In what would cause a fantastic media frenzy, Clifford Irving sells his bogus biography of Howard Hughes to a premiere publishing house in the early 1970s.
One Man Force (1989)
In this action packed film, an L. A. cop speeds off to get revenge upon the dirty drug-dealing dogs who killed his partner.
In the Year 2889 (1967)
In a post nuclear Earth, survivors are stuck in a valley and have to protect themselves from mutant human beings, and each other in some cases.


# The global function

In [9]:
# get movie_recommendations for all movies that the user has rated 5
def get_movie_recommendations(userId, recommendation_function):
    """
    Get movie recommendations for a user based on the movies they have rated 5
    userId: int
    recommendation_function: function that takes a movieId and returns a list of recommended movies. Output is [(movieId, jaccard_score), ...]

    Returns: list of recommended movies [(movieId, jaccard_score), ...]
    """

    movie_recommendations = []
    user_ratings = ml_ratings[ml_ratings['userId'] == userId]
    user_ratings = user_ratings[user_ratings['rating'] == 5]

    for movieId in tqdm(user_ratings['movieId']):
        try:
            movie_recommendations.append(recommendation_function(movie_id=movieId))
        except:
            print('No recommendations for movieId:', movieId)

    # flatten list and sort by jaccard score
    movie_recommendations = [movie for sublist in movie_recommendations for movie in sublist]

    movie_recommendations = sorted(movie_recommendations, key=lambda x: x[1], reverse=True)
    # remove movies that the user has already rated
    movie_recommendations = [movie for movie in movie_recommendations if movie[0] not in user_ratings['movieId']]
    
    return movie_recommendations


# Run

In [12]:
user_ids_list = [304, 6741, 147001]
user_recommendations_min_hash = {}
user_recommendations_genres = {}
user_recommendations_bert = {}
for userId in user_ids_list:
    user_recommendations_min_hash[userId] = get_movie_recommendations(userId, movie_recommendation_min_hash)[:]
    user_recommendations_genres[userId] = get_movie_recommendations(userId, movie_recommendation_genres)[:]
    user_recommendations_bert[userId] = get_movie_recommendations(userId, movie_recommendation_bert)[:]


# save recommendations
with open('../data/recommendations/user_recommendations_min_hash.pkl', 'wb') as f:
    pickle.dump(user_recommendations_min_hash, f)

with open('../data/recommendations/user_recommendations_genres.pkl', 'wb') as f:
    pickle.dump(user_recommendations_genres, f)

with open('../data/recommendations/user_recommendations_bert.pkl', 'wb') as f:
    pickle.dump(user_recommendations_bert, f)



  0%|          | 0/10 [00:00<?, ?it/s]

No recommendations for movieId: 318
No recommendations for movieId: 3949


  0%|          | 0/10 [00:00<?, ?it/s]

No recommendations for movieId: 58559


  0%|          | 0/10 [00:00<?, ?it/s]

No recommendations for movieId: 318
No recommendations for movieId: 3949


  0%|          | 0/16 [00:00<?, ?it/s]

No recommendations for movieId: 356
No recommendations for movieId: 3421


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

No recommendations for movieId: 356
No recommendations for movieId: 3421


  0%|          | 0/51 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

No recommendations for movieId: 53125
No recommendations for movieId: 54001
No recommendations for movieId: 56775
No recommendations for movieId: 58559
No recommendations for movieId: 59315
No recommendations for movieId: 59501
No recommendations for movieId: 63992
No recommendations for movieId: 68319
No recommendations for movieId: 69844
No recommendations for movieId: 72998
No recommendations for movieId: 78772
No recommendations for movieId: 79139
No recommendations for movieId: 82169


  0%|          | 0/51 [00:00<?, ?it/s]