In [1]:
import pandas as pd
import random


In [2]:
def is_prime(n):
    """Return True if ``n`` is a prime number, False otherwise.

    Uses trial division: first by 2, then by odd candidates ``i`` for as
    long as ``i * i <= n``. The bound is checked with integer arithmetic
    instead of ``n ** 0.5`` so the result stays exact for arbitrarily large
    integers (a float square root loses precision for large ints and raises
    OverflowError once ``n`` no longer fits in a float).

    Args:
        n: Number to test. Values less than or equal to 1 are never prime.

    Returns:
        bool: True when ``n`` has no divisor other than 1 and itself.
    """
    if n <= 1:
        return False
    if n < 4:
        # 2 and 3 are prime and have no candidate divisors to try.
        return True
    if n % 2 == 0:
        return False

    i = 3
    while i * i <= n:
        if n % i == 0:
            return False
        i += 2  # even divisors were ruled out above
    return True

def next_prime(n):
    """Return the smallest prime that is greater than or equal to ``n``.

    Walks upward from ``n`` one step at a time and returns the first value
    that ``is_prime`` accepts (``n`` itself when it is already prime).
    """
    candidate = n
    while True:
        if is_prime(candidate):
            return candidate
        candidate += 1


In [None]:
# Read the two input tables from the notebook's working directory.
ratings = pd.read_csv('rating.csv')
movies = pd.read_csv('movie.csv')

# Inner join on movieId: only ratings whose movie also appears in the movie
# table are kept.
df = ratings.merge(movies, on='movieId', how='inner')

print(df.head())

In [None]:
# Preview the first rows of the merged ratings/movies frame.
df.head()

In [None]:
# Count missing values per column of the merged frame.
df.isnull().sum()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts, memory usage).
df.info()


In [None]:
# Number of distinct values in each column of the merged frame.
df.nunique()

In [8]:
def minhash_scratch(user_movie_dict, perm, seed=None):
    """Compute a MinHash signature for every user's collection of movie ids.

    Each of the ``perm`` signature slots has its own hash function
    ``h(x) = (a * x + b) % p`` with randomly drawn ``a`` and ``b``. A slot
    stores the smallest hash value seen over the user's movies.

    Args:
        user_movie_dict: Mapping of user -> iterable of movie ids. The ids
            are used arithmetically, so they must be integers (non-negative
            ids are assumed when choosing the modulus).
        perm: Number of hash functions, i.e. the length of each signature.
        seed: Optional seed for a private random generator, making the
            signatures reproducible. When None (the default) the
            module-level ``random`` state is used, as before.

    Returns:
        dict mapping every user to a list of ``perm`` minimum hash values.
        A user without any movie keeps ``float('inf')`` in every slot.
    """
    all_movies = set(movie for movies in user_movie_dict.values() for movie in movies)

    user_sign = {user: [float('inf')] * perm for user in user_movie_dict}

    # Nothing to hash (no users, or only users without movies): drawing
    # coefficients from an empty range would raise, so return early.
    if not all_movies:
        return user_sign

    # The modulus must exceed the largest movie id, not merely the number of
    # distinct movies. Otherwise two ids that are congruent mod p get the
    # same value under every hash function and become indistinguishable.
    p = next_prime(max(all_movies) + 1)

    rng = random if seed is None else random.Random(seed)
    # a is drawn from [1, p-1] so it is never 0 mod p; b from [0, p-1].
    a_vals = [rng.randint(1, p - 1) for _ in range(perm)]
    b_vals = [rng.randint(0, p - 1) for _ in range(perm)]

    for user, movies in user_movie_dict.items():
        signature = user_sign[user]  # updated in place
        for movie in movies:
            for i in range(perm):
                hash_value = (a_vals[i] * movie + b_vals[i]) % p
                if hash_value < signature[i]:
                    signature[i] = hash_value

    return user_sign

In [9]:
# Build {userId: [movieId, ...]}: one list of rated movie ids per user.
user_movie_dict = df.groupby('userId')['movieId'].apply(list).to_dict()

In [None]:
# Show a small preview (first 5 users, first 10 movie ids each) instead of
# dumping the whole mapping, which would flood the notebook output.
{user: movies[:10] for user, movies in list(user_movie_dict.items())[:5]}

In [11]:
from collections import defaultdict

def lsh_scratch(minhash_signatures, num_bands, rows_per_band, verbose=False):
    """Bucket MinHash signatures band by band and collect candidate pairs.

    Every signature is cut into ``num_bands`` consecutive slices of
    ``rows_per_band`` values. Users whose slice is identical within the same
    band land in the same bucket, and every pair of users sharing a bucket
    becomes a candidate pair.

    Args:
        minhash_signatures: Mapping of user -> signature (sequence of hash
            values). The length check uses the first signature only.
        num_bands: Number of bands to split each signature into.
        rows_per_band: Number of signature values per band.
        verbose: When True, print the bucket contents after every insertion
            (debug aid; off by default because it emits one line per user
            per band).

    Returns:
        ``(buckets, candidate_pairs)`` where ``buckets`` is a
        ``defaultdict(list)`` mapping an integer bucket hash to the users in
        it, and ``candidate_pairs`` is a set of ``(user_a, user_b)`` tuples
        ordered as the users appear in ``minhash_signatures``.

    Raises:
        AssertionError: If ``num_bands * rows_per_band`` differs from the
            signature length.
    """
    buckets = defaultdict(list)
    candidate_pairs = set()

    # No signatures: nothing to bucket and no length to validate against.
    if not minhash_signatures:
        return buckets, candidate_pairs

    signature_length = len(next(iter(minhash_signatures.values())))
    assert num_bands * rows_per_band == signature_length, \
        "ERROR: num_bands * rows_per_band does not match the signature length."

    for band_idx in range(num_bands):
        # Slice bounds depend only on the band, so compute them once per band.
        start_idx = band_idx * rows_per_band
        end_idx = start_idx + rows_per_band
        band_buckets = defaultdict(list)

        for user, signature in minhash_signatures.items():
            band = tuple(signature[start_idx:end_idx])

            # Include the band index in the key so that equal slices coming
            # from different bands do not merge in the returned buckets.
            band_hash = hash((band_idx, band))
            band_buckets[band_hash].append(user)

            if verbose:
                print(f"Band {band_idx}, Hash {band_hash}: {band_buckets[band_hash]}")

        for band_hash, users in band_buckets.items():
            # Add users to general buckets
            buckets[band_hash].extend(users)

            # Every pair of users sharing this bucket is a candidate pair.
            for i, first_user in enumerate(users):
                for second_user in users[i + 1:]:
                    candidate_pairs.add((first_user, second_user))

    return buckets, candidate_pairs


In [12]:
# Seed the global RNG first: minhash_scratch draws its hash coefficients from
# `random`, so without a seed the signatures (and every result derived from
# them) change on each run.
random.seed(42)

# 32 hash functions per user -> signatures of length 32.
minhash_signatures = minhash_scratch(user_movie_dict, 32)

In [None]:
# Show the signatures of the first 5 users only; displaying the full mapping
# would flood the notebook output.
dict(list(minhash_signatures.items())[:5])

In [None]:
# 8 bands x 4 rows = 32, which must equal the signature length used when
# calling minhash_scratch (lsh_scratch asserts this).
num_bands = 8
rows_per_band = 4

# buckets: bucket hash -> users; candidate_pairs: set of (user, user) tuples.
buckets, candidate_pairs = lsh_scratch(minhash_signatures, num_bands, rows_per_band)



In [None]:
# Show the first 5 buckets only; displaying every bucket would flood the
# notebook output.
dict(list(buckets.items())[:5])

In [None]:
# Show 10 candidate pairs only; the full set can be very large.
list(candidate_pairs)[:10]

In [None]:
# Number of distinct bucket keys returned by lsh_scratch.
len(buckets)

In [None]:
# Number of candidate user pairs produced by the LSH step.
len(candidate_pairs)

In [18]:
# for user1, user2 in candidate_pairs:
#     movies_user1 = set(user_movie_dict[user1])
#     movies_user2 = set(user_movie_dict[user2])
#     common_movies = movies_user1.intersection(movies_user2)
#     print(f"Users {user1} and {user2} have {len(common_movies)} movies in common.")


In [20]:
def find_similar_users(candidate_pairs, user_movie_dict):
    """Rank, for every user, the candidate partners by number of shared movies.

    Args:
        candidate_pairs: Iterable of ``(user_a, user_b)`` tuples.
        user_movie_dict: Mapping of user -> iterable of movie ids.

    Returns:
        ``defaultdict(list)`` mapping each user to a list of
        ``(other_user, shared_movie_count)`` tuples, sorted by the count in
        descending order. Pairs without any shared movie are left out.
    """
    neighbours = defaultdict(list)

    for first, second in candidate_pairs:
        shared = set(user_movie_dict[first]) & set(user_movie_dict[second])
        overlap = len(shared)
        if overlap == 0:
            continue
        # Record the relation in both directions.
        neighbours[first].append((second, overlap))
        neighbours[second].append((first, overlap))

    # Strongest overlap first; ties keep their insertion order.
    for matches in neighbours.values():
        matches.sort(key=lambda match: match[1], reverse=True)

    return neighbours


In [21]:
def recommend_movies(user, similar_users, user_movie_dict, rating_dict, max_recommendations=5):
    """Recommend movies the user has not seen, based on similar users' ratings.

    Every movie that a similar user has and ``user`` does not is scored by
    summing the ratings the similar users gave it. The movies with the
    highest accumulated score are returned.

    Args:
        user: Id of the user to recommend for.
        similar_users: Mapping of user -> list of ``(other_user, score)``
            tuples; only the ``other_user`` part is used here.
        user_movie_dict: Mapping of user -> iterable of movie ids.
        rating_dict: Mapping of user -> ``{movie_id: rating}``. A missing
            user or movie contributes a rating of 0.
        max_recommendations: Maximum number of movie ids to return.

    Returns:
        List of movie ids ordered by descending accumulated rating. Empty
        when ``user`` has no entry in ``similar_users``.
    """
    # Check this before touching user_movie_dict so that a user unknown to
    # both mappings yields an empty result instead of a KeyError.
    if user not in similar_users:
        return []

    user_movies = set(user_movie_dict[user])
    recommendations = {}

    for similar_user, _ in similar_users[user]:
        # Treat a user without any recorded rating like a missing movie: 0.
        similar_user_ratings = rating_dict.get(similar_user, {})
        unseen_movies = set(user_movie_dict[similar_user]) - user_movies

        for movie in unseen_movies:
            recommendations[movie] = (
                recommendations.get(movie, 0) + similar_user_ratings.get(movie, 0)
            )

    sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)

    return [movie for movie, _ in sorted_recommendations[:max_recommendations]]


In [25]:
# Build {userId: {movieId: rating}}.
# Select the two needed columns before apply(): applying over the whole
# frame also operates on the grouping column 'userId', which pandas 2.2
# deprecates (it emits a warning and will exclude that column in future).
rating_dict = (
    df.groupby('userId')[['movieId', 'rating']]
    .apply(lambda x: dict(zip(x['movieId'], x['rating'])))
    .to_dict()
)


In [None]:
# Rank each user's LSH candidates by the number of movies they share.
similar_users = find_similar_users(candidate_pairs, user_movie_dict)

# Example: recommend for user 1 (default of at most 5 movies).
user_id = 1 
recommended_movies = recommend_movies(user_id, similar_users, user_movie_dict, rating_dict)

print(f"Recommended Movies for User {user_id}: {recommended_movies}")
