In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [39]:
# Load the ratings table from data/ml-32m; later cells read `ml_ratings` as a global.
# The displayed frame has an 'Unnamed: 0' column next to userId, movieId, rating and
# timestamp - presumably a row index that was written into the CSV (TODO confirm).
ml_ratings = pd.read_csv('data/ml-32m/ratings.csv')
ml_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858
...,...,...,...,...
32000199,200948,79702,4.5,1294412589
32000200,200948,79796,1.0,1287216292
32000201,200948,80350,0.5,1294412671
32000202,200948,80463,3.5,1350423800


### Initialize the similarity function (Min-hash)

In [None]:
# Threshold value; used here only to select which lsh_groups_<threshold>.pkl file is read
threshold = 0.4

# Read the precomputed files.
# NOTE(review): unpickling (pd.read_pickle and pickle.load alike) can execute arbitrary
# code, so only load these files if they come from a trusted source.
df_minhash = pd.read_pickle('data/df_min_hash.pkl')
with open(f'data/lsh_groups_{threshold}.pkl', 'rb') as f:
    dict_lsh = pickle.load(f)


# Lookup tables between the row labels (index) of df_minhash and its movieId column
index_to_id = dict(zip(df_minhash.index, df_minhash.movieId))
id_to_index = dict(zip(df_minhash.movieId, df_minhash.index)) # reverse lookup: movieId -> row label

def movie_recommendation_min_hash(movie_id, id_to_index=id_to_index, index_to_id=index_to_id, dict_lsh=dict_lsh, df_minhash=df_minhash):
    """
    Rank the entries that dict_lsh lists for `movie_id` by their jaccard score.

    movie_id: movieId to find similar movies for
    id_to_index / index_to_id: lookup tables between movieId and df_minhash row label
    dict_lsh: maps a row label to an iterable of candidate row labels
    df_minhash: frame whose 'minhash' column holds objects exposing `.jaccard(other)`

    Returns: [(movieId, jaccard_score), ...] sorted by jaccard_score, highest first;
    entries whose movieId equals `movie_id` are left out.
    Raises KeyError when `movie_id` (or its row label) is missing from the lookups.
    """
    query_row = id_to_index[movie_id]

    def score_candidate(candidate_row):
        # score the query movie's minhash against the candidate's minhash
        similarity = df_minhash['minhash'][query_row].jaccard(df_minhash['minhash'][candidate_row])
        return index_to_id[candidate_row], similarity

    ranked = [score_candidate(candidate_row) for candidate_row in dict_lsh[query_row]]
    # stable in-place sort: ties keep the order in which dict_lsh listed them
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    # drop the query movie itself from its own recommendations
    return [pair for pair in ranked if pair[0] != movie_id]

# Similar movies for movieId 1, as (movieId, jaccard_score) pairs
movie_recommendation_min_hash(1)

[(3270, 0.140625), (140016, 0.078125)]

### The global function

In [42]:
# get movie_recommendations for all movies that the user has rated 5
def get_movie_recommendations(userId, recommendation_function, ratings=None):
    """
    Get movie recommendations for a user based on the movies they have rated 5.

    userId: int
    recommendation_function: function that takes a movieId (passed as the keyword
        argument `movie_id`) and returns a list of recommended movies.
        Output is [(movieId, jaccard_score), ...]
    ratings: optional DataFrame with 'userId', 'movieId' and 'rating' columns.
        Defaults to the notebook-level `ml_ratings`.

    Returns: list of recommended movies [(movieId, jaccard_score), ...], sorted by
        score (highest first), with every movie the user has already rated removed.
        A movie suggested by several seed movies appears once per suggestion.
    """
    if ratings is None:
        ratings = ml_ratings

    user_ratings = ratings[ratings['userId'] == userId]
    # Collect ALL movies the user has rated (not only the 5-star ones) as a set of
    # plain ints. Note `x in series` tests the Series index, not its values, so the
    # membership test must run against this set rather than the column itself.
    rated_movie_ids = set(user_ratings['movieId'].tolist())
    seed_movie_ids = user_ratings.loc[user_ratings['rating'] == 5, 'movieId']

    movie_recommendations = []
    for movieId in seed_movie_ids:
        try:
            movie_recommendations.extend(recommendation_function(movie_id=movieId))
        except LookupError:
            # the movie is unknown to the recommender (missing key/index); skip it
            # without hiding unrelated errors or swallowing KeyboardInterrupt
            print('No recommendations for movieId:', movieId)

    # remove movies that the user has already rated
    movie_recommendations = [movie for movie in movie_recommendations if movie[0] not in rated_movie_ids]
    # sort by jaccard score, best first (stable, so ties keep their original order)
    movie_recommendations.sort(key=lambda x: x[1], reverse=True)

    return movie_recommendations


In [36]:
# Recommendations for userId 1, using the min-hash similarity function as the recommender
recommended_movies = get_movie_recommendations(1, movie_recommendation_min_hash)
recommended_movies

No recommendations for movieId: 80
No recommendations for movieId: 232
No recommendations for movieId: 562
No recommendations for movieId: 838
No recommendations for movieId: 909
No recommendations for movieId: 915
No recommendations for movieId: 926
No recommendations for movieId: 927
No recommendations for movieId: 1041
No recommendations for movieId: 1056
No recommendations for movieId: 1247
No recommendations for movieId: 1719
No recommendations for movieId: 1721
No recommendations for movieId: 1968
No recommendations for movieId: 2020
No recommendations for movieId: 2025
No recommendations for movieId: 2599
No recommendations for movieId: 2973


[(148727, 0.3828125),
 (1126, 0.3671875),
 (218265, 0.3671875),
 (215317, 0.359375),
 (172313, 0.359375),
 (272541, 0.3515625),
 (219623, 0.34375),
 (6136, 0.3359375),
 (234255, 0.3359375),
 (168060, 0.3359375),
 (145570, 0.3359375),
 (121051, 0.3203125),
 (54318, 0.3203125),
 (240088, 0.3203125),
 (207860, 0.3125),
 (280408, 0.3125),
 (168794, 0.3125),
 (116169, 0.3046875),
 (170913, 0.3046875),
 (228993, 0.3046875),
 (200326, 0.3046875),
 (8890, 0.3046875),
 (160307, 0.3046875),
 (221026, 0.3046875),
 (191431, 0.296875),
 (185903, 0.296875),
 (144722, 0.296875),
 (6735, 0.2890625),
 (104608, 0.2890625),
 (137319, 0.2890625),
 (224104, 0.2890625),
 (122543, 0.2890625),
 (273561, 0.28125),
 (176927, 0.28125),
 (6570, 0.28125),
 (248672, 0.28125),
 (110591, 0.28125),
 (257355, 0.28125),
 (205575, 0.28125),
 (183585, 0.2734375),
 (123665, 0.2734375),
 (242254, 0.2734375),
 (260213, 0.2734375),
 (726, 0.2734375),
 (244456, 0.2734375),
 (3872, 0.2734375),
 (4410, 0.2734375),
 (149374, 0.27