In [15]:
# Imports
import pandas as pd
import numpy as np
import operator

In [4]:
# Read the movie catalogue and the user ratings from the local CSV files
df_movies, df_ratings = pd.read_csv('data/movies.csv'), pd.read_csv('data/ratings.csv')

In [5]:
# Preview the first two rows of the movies table
df_movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [6]:
# Preview the first two rows of the ratings table
df_ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


This is initially a baseline recommendation model, so the genres of each movie (and the rating timestamps) will be removed.

In [7]:
# Drop the columns the baseline model does not use.
# `drop(..., errors='ignore')` keeps this cell re-runnable: `del df[...]`
# raises KeyError the second time, once the column is already gone.
df_movies = df_movies.drop(columns=['genres'], errors='ignore')
df_ratings = df_ratings.drop(columns=['timestamp'], errors='ignore')

Remove the release year from each movie title and store it in its own `movie_year` column.

In [8]:
# Titles look like "Toy Story (1995)": move the trailing parenthesised year
# into its own column and strip it (plus surrounding whitespace) from the title.
# The pattern is a raw string (no invalid "\(" escapes) and is anchored to the
# end of the title, so any other parentheses inside a title are left untouched.
# A year range such as "(2006-2007)" (hyphen or en dash) is captured as-is.
movie_year_pattern = r"\s*\((\d{4}(?:[-\u2013]\d{4})?)\)\s*$"
extracted_year = df_movies['title'].str.extract(movie_year_pattern, expand=False)
if 'movie_year' in df_movies.columns:
    # Cell is being re-run: the titles no longer contain the year, so keep
    # the values extracted earlier instead of overwriting them with NaN.
    extracted_year = extracted_year.fillna(df_movies['movie_year'])
df_movies['movie_year'] = extracted_year
df_movies['title'] = (
    df_movies['title']
    .str.replace(movie_year_pattern, "", regex=True)
    .str.strip()
)
df_movies.head(2)

Unnamed: 0,movieId,title,movie_year
0,1,Toy Story,1995
1,2,Jumanji,1995


Below I create a fake user that will be used to test each of the methods. After testing, this fake user will be removed.

In [9]:
# Hand-made input user for testing, written as (movieId, title, rating) rows
fake_user_rows = [
    (4369, 'Fast and the Furious, The', 5.0),
    (1, 'Toy Story', 4.0),
    (59315, 'Iron Man', 5.0),
    (59615, 'Indiana Jones and the Kingdom of the Crystal Skull', 2.0),
    (59131, 'Are You Scared?', 1.0),
    (59784, 'Kung Fu Panda', 4.0),
]
# Same list-of-dicts shape as before, one record per rated movie
fake_user = [
    {'movieId': movie_id, 'title': title, 'rating': rating}
    for movie_id, title, rating in fake_user_rows
]

df_fake_user = pd.DataFrame(fake_user)

The method below returns a dataframe of the ratings that other users gave to the movies the target user has rated, ordered so that the users who share the most movies with the target come first.

In [10]:
def get_viewers_of_target(target_user, ratings=None):
    """Return the ratings other users gave to the movies rated by the target.

    Parameters
    ----------
    target_user : pandas.DataFrame
        Ratings of the target user; must contain a ``movieId`` column.
    ratings : pandas.DataFrame, optional
        Ratings table with at least ``userId`` and ``movieId`` columns.
        Defaults to the notebook-level ``df_ratings``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``ratings`` whose ``movieId`` was rated by the target,
        grouped per user. Users with the most movies in common come first;
        ties keep ascending ``userId`` order, and rows inside a user keep
        their original order. If nobody rated any of the target's movies,
        an empty frame with the same columns is returned.
    """
    if ratings is None:
        ratings = df_ratings
    # Movies rated by the target
    target_movies = target_user['movieId'].tolist()
    # Ratings by other people for those same movies
    viewers = ratings[ratings['movieId'].isin(target_movies)]
    if viewers.empty:
        # pd.concat raises ValueError on an empty list of frames
        return viewers
    # One sub-dataframe per user (groupby yields them in ascending userId order)
    per_user = [user_rows for _, user_rows in viewers.groupby('userId')]
    # Stable sort: most shared movies first, ties stay in userId order
    per_user.sort(key=len, reverse=True)
    # Stitch the sub-dataframes back into a single dataframe
    return pd.concat(per_user)

In [11]:
# First ten rating rows from users who rated the fake user's movies
get_viewers_of_target(df_fake_user).head(10)

Unnamed: 0,userId,movieId,rating
10360,68,1,2.5
10993,68,4369,3.5
11410,68,59315,4.5
11416,68,59615,1.0
11419,68,59784,3.5
36374,249,1,4.0
36653,249,4369,3.5
36955,249,59315,4.5
36959,249,59615,3.0
36960,249,59784,4.0


Calculate the Pearson correlation between the target user's ratings and each other user's ratings on the movies they have both rated.

In [16]:
def user_correlation(target, other):
    """Pearson correlation between two users' ratings on their shared movies.

    Parameters
    ----------
    target : pandas.DataFrame
        Ratings of the target user, with ``movieId`` and ``rating`` columns.
    other : pandas.DataFrame
        Ratings of one other user, with ``movieId`` and ``rating`` columns.

    Returns
    -------
    float
        Correlation in ``[-1, 1]``. ``0.0`` is returned when the correlation
        is undefined: fewer than two shared movies, or one of the users gave
        the same rating to every shared movie.

    Notes
    -----
    With exactly two shared movies Pearson's r is always +1 or -1 (or
    undefined), so high scores from tiny overlaps should be read with care.
    """
    # Pair the ratings by movieId rather than by row position, so the result
    # does not depend on row order or on `other` containing extra movies.
    paired = target[['movieId', 'rating']].merge(
        other[['movieId', 'rating']],
        on='movieId',
        suffixes=('_target', '_other'),
    )
    if len(paired) < 2:
        return 0.0
    # A constant rating vector gives a 0/0 inside corrcoef; silence the
    # floating-point warning and map the resulting NaN to 0.0 below.
    with np.errstate(all='ignore'):
        score = np.corrcoef(
            paired['rating_target'].to_numpy(),
            paired['rating_other'].to_numpy(),
        )[0, 1]
    if np.isnan(score):
        return 0.0
    return float(score)

def get_similar_users(target, potential_users, process_num=100):
    """Score candidate users by how closely their ratings track the target's.

    Parameters
    ----------
    target : pandas.DataFrame
        Ratings of the target user (``movieId`` and ``rating`` columns).
    potential_users : pandas.DataFrame
        Ratings of the candidate users, with a ``userId`` column. Users are
        processed in the order in which they first appear in this frame.
    process_num : int or None, default 100
        Maximum number of candidate users to score. ``None`` scores all.

    Returns
    -------
    list of (userId, score) tuples
        Sorted by score, highest first; ties keep processing order.
    """
    scores = []
    # sort=False keeps users in order of first appearance, so the cap below
    # applies to the leading users of `potential_users`.
    for user_id, user_ratings in potential_users.groupby('userId', sort=False):
        if process_num is not None and len(scores) >= process_num:
            break
        scores.append((user_id, user_correlation(target, user_ratings)))
    # Sort on the score and return
    return sorted(scores, key=operator.itemgetter(1), reverse=True)


In [19]:
# Gather the ratings other users gave to the fake user's movies,
# then score those users against the fake user.
viewers = get_viewers_of_target(df_fake_user)
sim_users = get_similar_users(df_fake_user, viewers)

In [18]:
# First 20 (userId, correlation) pairs; the list is sorted with the highest correlation first
sim_users[0:20]

[(98, 1.0),
 (393, 1.0),
 (608, 1.0),
 (139, 0.9999999999999999),
 (45, 0.9999999999999999),
 (63, 0.9999999999999999),
 (82, 0.9999999999999999),
 (239, 0.9999999999999999),
 (256, 0.9999999999999999),
 (514, 0.9999999999999999),
 (520, 0.9999999999999999),
 (522, 0.9999999999999999),
 (528, 0.9999999999999999),
 (550, 0.9999999999999999),
 (601, 0.9999999999999999),
 (489, 0.9989061072386719),
 (200, 0.9472044455566301),
 (119, 0.9449111825230678),
 (18, 0.9365858115816939),
 (307, 0.9285714285714284)]