In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tqdm.notebook import tqdm

In [5]:
# This function takes a user and returns a list of recommeded movies
ml_ratings = pd.read_csv('data/ml-32m/ratings.csv')
ml_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858
...,...,...,...,...
32000199,200948,79702,4.5,1294412589
32000200,200948,79796,1.0,1287216292
32000201,200948,80350,0.5,1294412671
32000202,200948,80463,3.5,1350423800


### Initialize the similiarity function (Min-hash)

In [6]:
# used to read the correct file
threshold = 0.4

# read files
df_minhash = pd.read_pickle('data/df_min_hash.pkl')
with open(f'data/lsh_groups_{threshold}.pkl', 'rb') as f:
    dict_lsh = pickle.load(f)


# make dict to convert from index to movieId
index_to_id = dict(zip(df_minhash.index, df_minhash.movieId))
id_to_index = dict(zip(df_minhash.movieId, df_minhash.index)) # reverse dict

def movie_recommendation_min_hash(movie_id, id_to_index=id_to_index, index_to_id=index_to_id, dict_lsh=dict_lsh, df_minhash=df_minhash):
    """
    This function takes a movieId and returns a list of recommended movies
    """
    index = id_to_index[movie_id]
    similar_movies = []
    for idx in dict_lsh[index]:
        jaccard_score = df_minhash['minhash'][index].jaccard(df_minhash['minhash'][idx])
        similar_movies.append((index_to_id[idx], jaccard_score))
    
    similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    similar_movies = [movie for movie in similar_movies if movie[0] != movie_id]
    
    return similar_movies

# similiar film to movieId 1
movie_recommendation_min_hash(1)

[(3270, 0.140625), (140016, 0.078125)]

### Initialize the similiarity function (Genre-hash)

In [32]:
import numpy as np
from collections import defaultdict
import pandas as pd


class OptimizedMovieLSH:
    def __init__(self, num_hash_functions=10, num_bands=5):
        self.num_hash_functions = num_hash_functions
        self.num_bands = num_bands
        self.hash_functions = None
        self.precomputed_hashes = None
        self.movie_ids = None
        self.hash_tables = None
        
    def generate_hash_functions(self, num_genres):
        """Generate bit sampling positions for each band."""
        # Create a (num_bands, num_hash_functions) array of bit positions
        self.hash_functions = np.array([
            np.random.choice(num_genres, size=self.num_hash_functions, replace=True)
            for _ in range(self.num_bands)
        ])
    
    def _compute_all_hashes(self, genre_matrix):
        """
        Compute all hashes for all movies at once using vectorized operations.
        
        Args:
            genre_matrix: numpy array of shape (num_movies, num_genres)
        Returns:
            numpy array of shape (num_bands, num_movies) containing hash values
        """
        num_movies = genre_matrix.shape[0]
        # Preallocate array for hash values
        hashes = np.zeros((self.num_bands, num_movies), dtype=np.int32)
        
        # For each band, compute hashes for all movies at once
        for band in range(self.num_bands):
            # Select bits for this band (vectorized operation)
            selected_bits = genre_matrix[:, self.hash_functions[band]]
            
            # Convert bits to integers using binary weights
            # This creates a unique hash value from the selected bits
            powers_of_two = 2 ** np.arange(self.num_hash_functions)
            hashes[band] = selected_bits.dot(powers_of_two)
            
        return hashes
    
    def index_movies(self, df, genres_list):
        """
        Index all movies using vectorized operations.
        """
        if self.hash_functions is None:
            self.generate_hash_functions(len(genres_list))
        
        # Convert DataFrame to numpy array for faster operations
        genre_matrix = df[genres_list].values
        self.movie_ids = np.array(df.index)
        
        # Compute all hashes at once
        self.precomputed_hashes = self._compute_all_hashes(genre_matrix)
        
        # Create hash tables using numpy operations
        self.hash_tables = [defaultdict(list) for _ in range(self.num_bands)]
        
        # Vectorized hash table construction
        for band in range(self.num_bands):
            unique_hashes, inverse_indices = np.unique(self.precomputed_hashes[band], 
                                                     return_inverse=True)
            # Create hash tables using numpy operations
            for i, hash_val in enumerate(unique_hashes):
                matching_movies = self.movie_ids[inverse_indices == i]
                self.hash_tables[band][hash_val] = matching_movies.tolist()
    
    def query(self, query_vector, threshold=10):
        """
        Find similar movies using vectorized operations.
        
        Args:
            query_vector: Binary vector of genre features
            threshold: Minimum number of matching bands
        """
        # Compute query hashes using the same method
        query_hashes = np.zeros(self.num_bands, dtype=np.int32)
        
        for band in range(self.num_bands):
            selected_bits = query_vector[self.hash_functions[band]]
            powers_of_two = 2 ** np.arange(self.num_hash_functions)
            query_hashes[band] = selected_bits.dot(powers_of_two)
        
        # Count matches for each movie using vectorized operations
        candidate_counts = defaultdict(int)
        
        # Use numpy operations to find matching movies
        for band, query_hash in enumerate(query_hashes):
            matching_movies = self.hash_tables[band].get(query_hash, [])
            for movie_id in matching_movies:
                candidate_counts[movie_id] += 1
        
        return {movie_id for movie_id, count in candidate_counts.items() 
                if count >= threshold}

    def save_state(self, filename):
        """Save the LSH state efficiently using numpy."""
        np.savez_compressed(
            filename,
            hash_functions=self.hash_functions,
            precomputed_hashes=self.precomputed_hashes,
            movie_ids=self.movie_ids
        )
    
    def load_state(self, filename):
        """Load the LSH state and rebuild hash tables efficiently."""
        data = np.load(filename)
        self.hash_functions = data['hash_functions']
        self.precomputed_hashes = data['precomputed_hashes']
        self.movie_ids = data['movie_ids']
        
        # Rebuild hash tables efficiently
        self.hash_tables = [defaultdict(list) for _ in range(self.num_bands)]
        for band in range(self.num_bands):
            unique_hashes, inverse_indices = np.unique(self.precomputed_hashes[band], 
                                                     return_inverse=True)
            for i, hash_val in enumerate(unique_hashes):
                matching_movies = self.movie_ids[inverse_indices == i]
                self.hash_tables[band][hash_val] = matching_movies.tolist()


In [34]:

df_genres = pd.read_csv('data/df_genres.csv')

genres_list = ['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

# Initialize and index
lsh_optim = OptimizedMovieLSH(num_hash_functions=128, num_bands=128)
lsh_optim.index_movies(df_genres, genres_list)

# Save state efficiently
lsh_optim.save_state('data/'+'movie_lsh_optimized.npz')

def batch_find_similar_movies(lsh, df, genres_list, query_movie_ids, 
                            threshold=2, similarity_threshold=0.3, batch_size=1000):
    """
    Find similar movies for multiple queries efficiently.
    
    Args:
        lsh: OptimizedMovieLSH instance
        df: DataFrame with genre data
        query_movie_ids: List of movie IDs to find similar movies for
        batch_size: Number of queries to process at once
    """
    genre_matrix = df[genres_list].values
    results = {}
    
    # Process queries in batches
    for i in range(0, len(query_movie_ids), batch_size):
        batch_ids = query_movie_ids[i:i + batch_size]
        batch_vectors = genre_matrix[np.searchsorted(df.index, batch_ids)]
        
        for idx, query_id in enumerate(batch_ids):
            candidates = lsh.query(batch_vectors[idx], threshold)
            
            # Compute similarities using vectorized operations
            if candidates:
                candidate_vectors = genre_matrix[np.searchsorted(df.index, list(candidates))]
                query_vec = batch_vectors[idx]
                
                # Vectorized Jaccard similarity computation
                intersection = (candidate_vectors & query_vec).sum(axis=1)
                union = (candidate_vectors | query_vec).sum(axis=1)
                similarities = intersection / np.maximum(union, 1)
                
                # Filter and sort results
                mask = similarities >= similarity_threshold
                similar_movies = list(zip(np.array(list(candidates))[mask], 
                                       similarities[mask]))
                results[query_id] = sorted(similar_movies, key=lambda x: x[1], 
                                         reverse=True)
    
    return results


def movie_recommendation_genres(movie_id):
    result_dict = batch_find_similar_movies(query_movie_ids=[movie_id], lsh=lsh_optim, df=df_genres, genres_list=genres_list)
    return result_dict[movie_id]


### The global function

In [35]:
# get movie_recommendations for all movies that the user has rated 5
def get_movie_recommendations(userId, recommendation_function):
    """
    Get movie recommendations for a user based on the movies they have rated 5
    userId: int
    recommendation_function: function that takes a movieId and returns a list of recommended movies. Output is [(movieId, jaccard_score), ...]

    Returns: list of recommended movies [(movieId, jaccard_score), ...]
    """

    movie_recommendations = []
    user_ratings = ml_ratings[ml_ratings['userId'] == userId]
    user_ratings = user_ratings[user_ratings['rating'] == 5]

    for movieId in tqdm(user_ratings['movieId']):
        try:
            movie_recommendations.append(recommendation_function(movie_id=movieId))
        except:
            print('No recommendations for movieId:', movieId)

    # flatten list and sort by jaccard score
    movie_recommendations = [movie for sublist in movie_recommendations for movie in sublist]

    movie_recommendations = sorted(movie_recommendations, key=lambda x: x[1], reverse=True)
    # remove movies that the user has already rated
    movie_recommendations = [movie for movie in movie_recommendations if movie[0] not in user_ratings['movieId']]
    
    return movie_recommendations


# RUN

In [36]:
recommended_movies = get_movie_recommendations(userId=1,recommendation_function=movie_recommendation_min_hash)
recommended_movies

  0%|          | 0/57 [00:00<?, ?it/s]

No recommendations for movieId: 80
No recommendations for movieId: 232
No recommendations for movieId: 562
No recommendations for movieId: 838
No recommendations for movieId: 909
No recommendations for movieId: 915
No recommendations for movieId: 926
No recommendations for movieId: 927
No recommendations for movieId: 1041
No recommendations for movieId: 1056
No recommendations for movieId: 1247
No recommendations for movieId: 1719
No recommendations for movieId: 1721
No recommendations for movieId: 1968
No recommendations for movieId: 2020
No recommendations for movieId: 2025
No recommendations for movieId: 2599
No recommendations for movieId: 2973


[(148727, 0.3828125),
 (1126, 0.3671875),
 (218265, 0.3671875),
 (215317, 0.359375),
 (172313, 0.359375),
 (272541, 0.3515625),
 (219623, 0.34375),
 (6136, 0.3359375),
 (234255, 0.3359375),
 (168060, 0.3359375),
 (145570, 0.3359375),
 (121051, 0.3203125),
 (54318, 0.3203125),
 (240088, 0.3203125),
 (207860, 0.3125),
 (280408, 0.3125),
 (168794, 0.3125),
 (116169, 0.3046875),
 (170913, 0.3046875),
 (228993, 0.3046875),
 (200326, 0.3046875),
 (8890, 0.3046875),
 (160307, 0.3046875),
 (221026, 0.3046875),
 (191431, 0.296875),
 (185903, 0.296875),
 (144722, 0.296875),
 (6735, 0.2890625),
 (104608, 0.2890625),
 (137319, 0.2890625),
 (224104, 0.2890625),
 (122543, 0.2890625),
 (273561, 0.28125),
 (176927, 0.28125),
 (6570, 0.28125),
 (248672, 0.28125),
 (110591, 0.28125),
 (257355, 0.28125),
 (205575, 0.28125),
 (183585, 0.2734375),
 (123665, 0.2734375),
 (242254, 0.2734375),
 (260213, 0.2734375),
 (726, 0.2734375),
 (244456, 0.2734375),
 (3872, 0.2734375),
 (4410, 0.2734375),
 (149374, 0.27

In [37]:
recommended_movies = get_movie_recommendations(userId=1,recommendation_function=movie_recommendation_genres)
recommended_movies

  0%|          | 0/57 [00:00<?, ?it/s]

[(70, 1.0),
 (194, 1.0),
 (33048, 1.0),
 (33190, 1.0),
 (1553, 1.0),
 (2096, 1.0),
 (35443, 1.0),
 (3656, 1.0),
 (4200, 1.0),
 (37180, 1.0),
 (37578, 1.0),
 (38023, 1.0),
 (5303, 1.0),
 (38164, 1.0),
 (39244, 1.0),
 (6483, 1.0),
 (39607, 1.0),
 (7164, 1.0),
 (40084, 1.0),
 (8091, 1.0),
 (41347, 1.0),
 (8783, 1.0),
 (41569, 1.0),
 (9584, 1.0),
 (42704, 1.0),
 (42967, 1.0),
 (10506, 1.0),
 (43903, 1.0),
 (11739, 1.0),
 (44833, 1.0),
 (12110, 1.0),
 (13147, 1.0),
 (13334, 1.0),
 (13341, 1.0),
 (15021, 1.0),
 (48637, 1.0),
 (16408, 1.0),
 (49365, 1.0),
 (49534, 1.0),
 (49614, 1.0),
 (16874, 1.0),
 (17536, 1.0),
 (51651, 1.0),
 (19196, 1.0),
 (21985, 1.0),
 (22013, 1.0),
 (22338, 1.0),
 (23089, 1.0),
 (23386, 1.0),
 (23706, 1.0),
 (24029, 1.0),
 (25052, 1.0),
 (26056, 1.0),
 (28165, 1.0),
 (32647, 1.0),
 (483, 1.0),
 (52, 1.0),
 (84, 1.0),
 (119, 1.0),
 (151, 1.0),
 (221, 1.0),
 (354, 1.0),
 (365, 1.0),
 (380, 1.0),
 (423, 1.0),
 (447, 1.0),
 (451, 1.0),
 (469, 1.0),
 (471, 1.0),
 (604, 1.0

In [31]:
len(recommended_movies)

228578