In [1]:
# Imports
import pandas as pd
import numpy as np
import operator
import math
from sklearn.metrics import mean_squared_error

In [5]:
# Importing the dataset
# Both CSV files are read from the relative `data/` directory, so the paths
# resolve against the kernel's current working directory.
# df_movies: one row per movie (movieId, title, genres).
# df_ratings: one row per (userId, movieId) rating, with a timestamp.
df_movies = pd.read_csv('data/movies.csv')
df_ratings = pd.read_csv('data/ratings.csv')

In [6]:
# Preview the first two movie rows to check which columns were loaded.
df_movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [7]:
# Preview the first two rating rows to check which columns were loaded.
df_ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


This is initially a baseline recommendation model, so the genres of each movie (and the rating timestamps) are not needed and will be removed.

In [8]:
# Drop the columns the baseline model does not use.
# `drop(..., errors='ignore')` keeps this cell idempotent: `del df[...]` raises a
# KeyError when the cell is re-run after the column has already been removed.
df_movies = df_movies.drop(columns=['genres'], errors='ignore')
df_ratings = df_ratings.drop(columns=['timestamp'], errors='ignore')

Remove the release year from each movie title and store it in its own column.

In [9]:
# Titles look like "Toy Story (1995)": the release year is the LAST parenthesised
# group. The pattern is anchored to the end of the title so that titles with an
# extra parenthesised part (e.g. an alternative name) keep that part, and only a
# 4-digit year (or a "YYYY-YYYY" range) is captured. The leading `\s*` also
# removes the space that would otherwise be left dangling at the end of the title.
# A raw string is used so that `\(` is not treated as an invalid string escape.
year_suffix = r"\s*\((\d{4}(?:[-\u2013]\d{4})?)\)\s*$"

# Guard against re-running the cell: once the year has been stripped from the
# titles, extracting again would overwrite `movie_year` with NaN.
if 'movie_year' not in df_movies.columns:
    df_movies['movie_year'] = df_movies['title'].str.extract(year_suffix, expand=False)
    df_movies['title'] = df_movies['title'].str.replace(year_suffix, "", regex=True)
df_movies.head(2)

Unnamed: 0,movieId,title,movie_year
0,1,Toy Story,1995
1,2,Jumanji,1995


Below I make a fake user which will be used to test each of the methods. After testing this fake user will be removed.

In [24]:
# fake input user for testing
# Each tuple is (movieId, title, rating); the records are expanded into the
# list-of-dicts form and then loaded into a DataFrame with those three columns.
fake_user = [
    {'movieId': movie_id, 'title': movie_title, 'rating': movie_rating}
    for movie_id, movie_title, movie_rating in (
        (4369, 'Fast and the Furious, The', 5.0),
        (1, 'Toy Story', 4.0),
        (59315, 'Iron Man', 5.0),
        (59615, 'Indiana Jones and the Kingdom of the Crystal Skull', 1.0),
        (59131, 'Are You Scared?', 0.5),
        (59784, 'Kung Fu Panda', 4.0),
    )
]

df_fake_user = pd.DataFrame(fake_user)

The method below returns a dataframe of other users that have also watched the same movies as the target user.

In [11]:
def get_viewers_of_target(target_user):
    """Return the ratings other users gave to the movies the target user rated.

    Reads the module-level `df_ratings` frame.

    Parameters
    ----------
    target_user : pd.DataFrame
        Must contain a 'movieId' column listing the movies rated by the target.

    Returns
    -------
    pd.DataFrame
        The rows of `df_ratings` whose movieId is in the target's list, grouped
        per user. Users who rated the most of the target's movies come first;
        ties keep ascending userId order. If nobody rated any of those movies,
        an empty frame with the columns of `df_ratings` is returned.
    """
    # Movies rated by the target
    target_movies = target_user['movieId'].tolist()
    # Ratings from anyone who rated at least one of those movies
    viewers = df_ratings[df_ratings['movieId'].isin(target_movies)]
    if viewers.empty:
        # pd.concat raises ValueError on an empty sequence, so return early.
        return viewers
    # One sub-frame per user, ordered by how many of the target's movies they rated.
    # sorted() is stable, so users with equal counts stay in groupby (userId) order.
    per_user = sorted(viewers.groupby('userId'), key=lambda group: len(group[1]), reverse=True)
    # Stitch the per-user frames back into a single frame
    return pd.concat([frame for _, frame in per_user])

In [12]:
# Show the first ten rating rows from the users who share movies with the fake user.
get_viewers_of_target(df_fake_user).head(10)

Unnamed: 0,userId,movieId,rating
10360,68,1,2.5
10993,68,4369,3.5
11410,68,59315,4.5
11416,68,59615,1.0
11419,68,59784,3.5
36374,249,1,4.0
36653,249,4369,3.5
36955,249,59315,4.5
36959,249,59615,3.0
36960,249,59784,4.0


Calculate the correlation between the target user and the other users.

In [13]:
def user_correlation(target, other):
    """Pearson correlation between the target's and another user's ratings.

    The two rating vectors are aligned by joining on 'movieId', so the result
    does not depend on the row order of either frame, and movies rated by only
    one of the two users are ignored.

    Parameters
    ----------
    target : pd.DataFrame
        Target user's ratings; needs 'movieId' and 'rating' columns.
    other : pd.DataFrame
        One other user's ratings; needs 'movieId' and 'rating' columns.

    Returns
    -------
    float or int
        The correlation coefficient, or 0 when it is undefined (fewer than two
        movies in common, or one of the users gave identical ratings to all of them).
    """
    # Pair up the ratings movie by movie instead of relying on both lists
    # happening to be sorted the same way and having the same length.
    paired = target[['movieId', 'rating']].merge(
        other[['movieId', 'rating']],
        on='movieId',
        suffixes=('_target', '_other'),
    )
    # A correlation needs at least two points
    if len(paired) < 2:
        return 0
    # Calculate Pearson's correlation; a constant vector divides by zero and
    # yields NaN, which is mapped to 0 below, so the numeric warnings are silenced.
    with np.errstate(all='ignore'):
        score = np.corrcoef(paired['rating_target'], paired['rating_other'])[0, 1]
    if np.isnan(score):
        return 0
    return score

def get_similar_users(target, potential_users, process_num=100):
    """Score candidate users by their rating correlation with the target.

    Only the first `process_num` distinct userIds (in order of first appearance
    in `potential_users`) are scored, each via `user_correlation`.

    Returns a DataFrame with columns ['userId', 'similarity'], sorted by
    similarity from highest to lowest.
    """
    # Distinct candidate IDs, capped at process_num
    candidate_ids = potential_users['userId'].unique()[:process_num]
    # Similarity of each candidate to the target
    scores = {}
    for candidate_id in candidate_ids:
        candidate_rows = potential_users.loc[potential_users['userId'] == candidate_id]
        scores[candidate_id] = user_correlation(target, candidate_rows)
    # Highest similarity first (stable, so ties keep their original order)
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(ranked, columns=['userId', 'similarity'])


Gets the users that are above a certain similarity threshold.

In [14]:
def select_best_users(user_sim_df, threshold=0.6):
    """Keep only the rows whose 'similarity' is at least `threshold`."""
    meets_threshold = user_sim_df['similarity'] >= threshold
    return user_sim_df.loc[meets_threshold]

Create User-Based Recommendations.

$$R_U = \frac{(\sum \limits _{u=1} ^{n} S_u * R_u)}{(\sum \limits _{u=1} ^{n} S_u)}$$
where $S_u$ is the user similarity and $R_u$ is the rating

Typically, a movie's predicted rating for the target user is the plain average of the ratings given by the similar users. Here a weighted average is used instead: each rating is multiplied by that user's similarity score, so more similar users have more influence on the prediction.
https://realpython.com/build-recommendation-engine-collaborative-filtering/

In [15]:
def create_recommendations(df_ratings, sim_users):
    """Predict a rating for every movie rated by the similar users.

    For each movie the prediction is the similarity-weighted average of the
    similar users' ratings: sum(similarity * rating) / sum(similarity).

    Parameters
    ----------
    df_ratings : pd.DataFrame
        Ratings with 'userId', 'movieId' and 'rating' columns.
    sim_users : pd.DataFrame
        Similar users with 'userId' and 'similarity' columns.

    Returns
    -------
    pd.DataFrame
        Columns ['movieId', 'final_rating'], highest predicted rating first.
    """
    # Every rating made by a similar user, with that user's similarity attached
    rated_by_similar = sim_users.merge(df_ratings, on='userId')
    # Weight each rating by the rater's similarity to the target
    rated_by_similar['weighted_rating'] = rated_by_similar['rating'] * rated_by_similar['similarity']
    # Per-movie totals of the similarities and the weighted ratings
    per_movie = rated_by_similar.groupby('movieId', as_index=False).sum()
    per_movie = per_movie[['movieId', 'similarity', 'weighted_rating']]
    # Weighted average = weighted total / similarity total
    per_movie['final_rating'] = per_movie['weighted_rating'] / per_movie['similarity']
    # Best predictions first
    ranked = per_movie.sort_values(by=['final_rating'], ascending=False)
    return ranked[['movieId', 'final_rating']]

def select_K_recommendations(recommend_items, N):
    """Return the first N recommendations with their movie titles attached.

    Reads the module-level `df_movies` frame to look up titles.

    Parameters
    ----------
    recommend_items : pd.DataFrame
        Recommendations with a 'movieId' column, already in the desired order.
    N : int
        Number of rows to keep from the top.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not modified) holding the first N rows plus a
        'title' column. A movieId that is missing from `df_movies` gets a NaN
        title instead of raising IndexError.
    """
    # Work on a copy: adding a column to the result of head() writes to a slice
    # of the caller's frame and triggers pandas' SettingWithCopyWarning.
    top_k = recommend_items.head(N).copy()
    # Build the movieId -> title lookup once instead of scanning df_movies for
    # every recommended id. Keeping the first row per movieId mirrors the
    # original `.values[0]` choice if an id were ever duplicated.
    title_by_id = df_movies.drop_duplicates(subset='movieId').set_index('movieId')['title']
    top_k['title'] = top_k['movieId'].map(title_by_id)
    return top_k

def select_random_K_recommendations(recommend_items, N, random_state=None):
    """Return N recommendations drawn at random from the top 2 * N.

    Parameters
    ----------
    recommend_items : pd.DataFrame
        Recommendations in ranked order (passed to `select_K_recommendations`).
    N : int
        Number of recommendations to return.
    random_state : optional
        Forwarded to `DataFrame.sample`; pass a fixed value for a reproducible
        draw. The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    pd.DataFrame
        Up to N rows sampled without replacement. When fewer than N
        recommendations exist, all of them are returned (in random order)
        instead of raising ValueError.
    """
    top_2k = select_K_recommendations(recommend_items, 2 * N)
    # sample() cannot draw more rows than exist when sampling without replacement
    sample_size = min(N, len(top_2k))
    return top_2k.sample(sample_size, random_state=random_state)
    

Calculates the RMSE/MSE Score of the target user.
$$ RMSE = \sqrt{\frac{\sum \limits _{i=1} ^{N} (\hat{y_i} - y_i)^2}{N}} $$


$$ MSE = \frac{\sum \limits _{i=1} ^{N} (\hat{y_i} - y_i)^2}{N} $$

In [34]:
def base_score_calculate(target, sim_users):
    """Pair the target's true ratings with the model's predicted ratings.

    Predictions are the similarity-weighted average of the similar users'
    ratings, computed only for the movies listed in `target`. Reads the
    module-level `df_ratings` frame.

    Parameters
    ----------
    target : pd.DataFrame
        The ratings to evaluate against; needs 'movieId' and 'rating' columns.
    sim_users : pd.DataFrame
        Similar users with 'userId' and 'similarity' columns.

    Returns
    -------
    pd.DataFrame
        Columns ['movieId', 'final_rating', 'rating'] where 'final_rating' is
        the prediction and 'rating' is the target's own rating. Movies that no
        similar user rated do not appear.
    """
    # List of movies watched by target
    movie_list = target['movieId'].tolist()
    # All ratings made by the similar users
    sim_other_ratings = sim_users.merge(df_ratings, on='userId')
    # Keep only the movies the target rated; copy so the column assignment
    # below does not write into a slice (SettingWithCopyWarning).
    sim_other_ratings = sim_other_ratings[sim_other_ratings['movieId'].isin(movie_list)].copy()
    # Calculate the weighted rating
    sim_other_ratings['weighted_rating'] = sim_other_ratings['similarity'] * sim_other_ratings['rating']
    # Per-movie totals of the similarities and the weighted ratings
    sum_ratings = sim_other_ratings.groupby('movieId', as_index=False)[['similarity', 'weighted_rating']].sum()
    # Weighted average = weighted total / similarity total
    sum_ratings['final_rating'] = sum_ratings['weighted_rating'] / sum_ratings['similarity']
    # Attach the true ratings from `target` itself. The previous version read
    # them from the global df_fake_user, so the function silently ignored the
    # ratings of whichever user was actually passed in.
    sum_ratings = sum_ratings.merge(target[['movieId', 'rating']], on='movieId')
    return sum_ratings[['movieId', 'final_rating', 'rating']]
    
def RMSE(target, sim_users):
    """Root mean squared error between the target's true and predicted ratings.

    Uses `base_score_calculate` to obtain the (rating, final_rating) pairs.
    """
    sum_ratings = base_score_calculate(target, sim_users)
    # Take the square root of the MSE explicitly: the `squared=False` keyword of
    # mean_squared_error was deprecated and later removed from scikit-learn, so
    # passing it fails on current releases.
    mse = mean_squared_error(sum_ratings['rating'].tolist(), sum_ratings['final_rating'].tolist())
    return math.sqrt(mse)

def MSE(target, sim_users):
    """Mean squared error between the target's true and predicted ratings.

    Uses `base_score_calculate` to obtain the (rating, final_rating) pairs.
    """
    sum_ratings = base_score_calculate(target, sim_users)
    # mean_squared_error already returns the (non-rooted) MSE by default. The
    # explicit `squared=True` keyword is dropped because it was deprecated and
    # later removed from scikit-learn.
    return mean_squared_error(sum_ratings['rating'].tolist(), sum_ratings['final_rating'].tolist())
    

Calculate the Precision and Recall of target user

To compute these metrics, the ratings are translated into binary form by treating any item rated $\geq$ 3.5 as relevant and any item rated < 3.5 as irrelevant. Precision and Recall are then computed with the following equations:


$$ Precision = \frac{\text{|Relevant items @k|}}{\text{|All items @k|}} $$

$$ Recall = \frac{\text{|Relevant items @k|}}{\text{|Total Relevant items|}} $$

In [66]:
def Precision_Recall(target, sim_users, K, relevance_threshold=3.5):
    """Precision@K and Recall@K of the predicted ratings for the target user.

    An item is "relevant" when its rating is >= `relevance_threshold`. The K
    items with the highest predicted rating are the ones being judged.

    Parameters
    ----------
    target : pd.DataFrame
        The ratings to evaluate against (passed to `base_score_calculate`).
    sim_users : pd.DataFrame
        Similar users (passed to `base_score_calculate`).
    K : int
        Number of top-predicted items to evaluate.
    relevance_threshold : float, default 3.5
        Minimum rating for an item to count as relevant.

    Returns
    -------
    tuple of (float, float)
        (precision, recall). Each is 0.0 when its denominator would be zero
        (no scored items, or no truly relevant items) instead of raising
        ZeroDivisionError.
    """
    sum_ratings = base_score_calculate(target, sim_users)
    # Items the target actually liked
    original_relevant_items = set(
        sum_ratings.loc[sum_ratings['rating'] >= relevance_threshold, 'movieId'])
    # "@K" means the K best predictions, so rank by predicted rating first
    # (previously the first K rows in movieId order were used). A stable sort
    # keeps tied predictions in a deterministic order.
    top_k = sum_ratings.sort_values('final_rating', ascending=False, kind='mergesort').head(K)
    # Items among the top K that the model predicts as relevant
    predicted_relevant_items = set(
        top_k.loc[top_k['final_rating'] >= relevance_threshold, 'movieId'])
    # Predicted-relevant items that really are relevant
    relevant_items = len(original_relevant_items & predicted_relevant_items)
    # Precision = (relevant @k) / (all items @k)
    precision = relevant_items / len(top_k) if len(top_k) else 0.0
    # Recall = (relevant @k) / (total relevant items)
    recall = relevant_items / len(original_relevant_items) if original_relevant_items else 0.0
    return precision, recall

Create the training and test split for the target user.

In [20]:
def user_train_test_split(target, testing_split=0.2, random_state=None):
    """Randomly split a user's ratings into a training and a test frame.

    Parameters
    ----------
    target : pd.DataFrame
        The rows to split.
    testing_split : float, default 0.2
        Fraction of rows assigned to the test set; must be within [0, 1].
    random_state : optional
        Forwarded to `DataFrame.sample` for the shuffle; pass a fixed value to
        get the same split on every run. The default (None) keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        (train, test). The training set holds floor(len(target) * (1 - testing_split))
        rows and the test set holds the rest.

    Raises
    ------
    ValueError
        If `testing_split` is outside [0, 1].
    """
    if not 0 <= testing_split <= 1:
        raise ValueError("testing_split must be between 0 and 1")
    train_set_size = math.floor(len(target) * (1 - testing_split))
    # frac=1 returns every row in shuffled order
    df_shuffle = target.sample(frac=1, random_state=random_state)
    return df_shuffle.iloc[:train_set_size], df_shuffle.iloc[train_set_size:]

Test run for the "fake" user

In [67]:
# Find the users most similar to the fake user (only the first 50 candidates
# are scored) and build recommendations from their ratings.
# NOTE(review): similarity is computed from ALL of the fake user's ratings,
# before the train/test split below, so the test ratings also influenced which
# users were selected - confirm whether that is intended.
viewers = get_viewers_of_target(df_fake_user)
sim_users = select_best_users(get_similar_users(df_fake_user, viewers, process_num=50))
recommend_items = create_recommendations(df_ratings, sim_users)

# Hold out 20% of the fake user's ratings
train, test = user_train_test_split(df_fake_user, 0.2)

prescision_train, recall_train = Precision_Recall(train, sim_users, 3)
prescision_test, recall_test = Precision_Recall(test, sim_users, 3)

# One (label, train score, test score) entry per metric, printed in order with
# a separator line between consecutive metrics.
metric_report = [
    ('RMSE', RMSE(train, sim_users), RMSE(test, sim_users)),
    ('MSE', MSE(train, sim_users), MSE(test, sim_users)),
    ('Prescision', prescision_train, prescision_test),
    ('Recall', recall_train, recall_test),
]
for position, (metric_name, train_score, test_score) in enumerate(metric_report):
    if position > 0:
        print('--------------------------')
    print(metric_name)
    print('Train:', train_score)
    print('Test:', test_score)

RMSE
Train: 0.7418447846966625
Test: 1.1258357102037766
--------------------------
MSE
Train: 0.5503336845816376
Test: 1.2675060463700423
--------------------------
Prescision
Train: 0.6666666666666666
Test: 0.5
--------------------------
Recall
Train: 0.6666666666666666
Test: 1.0
