In [285]:
# Imports
import pandas as pd
import numpy as np
import operator
import math
from sklearn.metrics import mean_squared_error
from statistics import mean
from tqdm import tqdm

In [286]:
# Importing the dataset
df_movies = pd.read_csv('data/movies.csv')
df_ratings = pd.read_csv('data/ratings.csv')

In [287]:
df_movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [288]:
df_ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


Initially this is a baseline recommendation model so genres of each model will be removed

In [289]:
del df_movies['genres']
del df_ratings['timestamp']

Remove the movie year from dataframe and create its own column

In [290]:
df_movies['movie_year'] = df_movies['title'].str.extract("\((.*)\)")
df_movies['title'] = df_movies['title'].str.replace("\((.*)\)", "", regex=True)
df_movies.head(2)

Unnamed: 0,movieId,title,movie_year
0,1,Toy Story,1995
1,2,Jumanji,1995


Below I make a fake user which will be used to test each of the methods. After testing this fake user will be removed.

In [291]:
# fake input user for testing
fake_user = [
{'movieId':4369, 'title':'Fast and the Furious, The', 'rating':5.0},
{'movieId':1, 'title':'Toy Story', 'rating':4.0},
{'movieId':59315, 'title':'Iron Man', 'rating':5.0},
{'movieId':59615, 'title':'Indiana Jones and the Kingdom of the Crystal Skull', 'rating':1.0},
{'movieId':59131, 'title':'Are You Scared?', 'rating':0.5},
{'movieId':59784, 'title':'Kung Fu Panda', 'rating':4.0}]

df_fake_user = pd.DataFrame(fake_user)

The method below returns a dataframe of other users that have also watched the same movies as the target user.

In [387]:
def get_viewers_of_target(target_user, positive_only=False):
    # Gets list of movies rated by target (False)
    target_movies = target_user['movieId'].tolist()
    # If True then gets list of movies that are rated positively only by target
    if (positive_only):
        target_movies = target_user[target_user['rating'] >= 3.5]['movieId'].tolist()
    # Gets a subset of other people that have watched those movies
    viewers = df_ratings[(df_ratings['userId'] != target_user['userId'].values[0]) & (df_ratings['movieId'].isin(target_movies))]
    # Create sub-dataframes for each user [makes it easier to sort based on number of similar movies seen to target]
    viewers_sub_dataframes = viewers.groupby(['userId'])
    viewers_sub_dataframes = sorted(viewers_sub_dataframes, key=lambda x: len(x[1]), reverse=True)
    # Convert the sub dataframes back into a single dataframe
    return pd.concat(map(lambda x: x[1], viewers_sub_dataframes))

Calculate the correlation between the target user and the other users.

In [361]:
def user_correlation(target, other):
    # Get list of movies IDs of the other user
    sub_df_movies = other['movieId'].tolist()
    # Get movies by target that have also been watched by other user and then sort
    sorted_temp_target = target[(target['movieId'].isin(sub_df_movies))].sort_values(by=['movieId'])['rating'].tolist()
    # Get the ratings of the movies (this list is already sorted)
    sorted_other_user = other['rating'].tolist()
    # Calculate Pearsons Correlation
    with np.errstate(all='ignore'):
        score = (np.corrcoef(sorted_temp_target, sorted_other_user)[0, 1])
        if (np.isnan(score)):
            return 0
        return score

def get_similar_users(target, potential_users, process_num=100):
    # Get unique list of users IDs 
    users_list = potential_users['userId'].unique()
    # Get similarity of each user to the target user
    sim_users = dict([(x, user_correlation(target, potential_users[potential_users['userId'] == x])) for x in users_list[:process_num]])  
    # Sort the dict based on value and return dataframe
    sim_users = dict(sorted(sim_users.items(), key=operator.itemgetter(1), reverse=True))
    return pd.DataFrame(sim_users.items(), columns=['userId', 'similarity'])


Gets the users that are above a certain similarity threshold.

In [99]:
def select_best_users(user_sim_df, threshold=0.6):
    return user_sim_df[user_sim_df['similarity'] >= threshold]

Create User-Based Recommendations.

$$R_U = \frac{(\sum \limits _{u=1} ^{n} S_u * R_u)}{(\sum \limits _{u=1} ^{n} S_u)}$$
where $S_u$ is the user similarity and $R_u$ is the rating

Typically, the average rating is computed for each score the target user may put. However, the weighted average is used in this case, where each rating is multiplied by the similarity score.
https://realpython.com/build-recommendation-engine-collaborative-filtering/

In [345]:
def create_recommendations(df_ratings, sim_users):
    # Ratings for all movies by similar users to the target user
    sim_other_ratings = sim_users.merge(df_ratings, left_on='userId', right_on='userId')
    # Calculate the weighted rating
    sim_other_ratings['weighted_rating'] = sim_other_ratings['similarity'] * sim_other_ratings['rating']
    # Group by movieId and calculate sum
    sum_ratings = sim_other_ratings.groupby('movieId', as_index = False).sum()[['movieId', 'similarity','weighted_rating']]
    # From the sum, calculate the final weighted rating score for each movie for target
    sum_ratings['final_rating'] = sum_ratings['weighted_rating'] / sum_ratings['similarity']
    # Return the table along with the movies
    return sum_ratings.sort_values(by=['final_rating'], ascending=False)[['movieId','final_rating']]

def select_K_recommendations(recommend_items, N):
    movie_ids = recommend_items['movieId'].tolist()
    titles = [df_movies.loc[df_movies['movieId'] == x, 'title'].values[0] for x in movie_ids]
    recommend_items['title'] = titles
    return recommend_items.head(N)

def select_random_K_recommendations(recommend_items, N):
    top_2k = select_K_recommendations(recommend_items, 2 * N)
    return top_2k.sample(N)


def get_recommendations(df_ratings, sim_users, N=10, random=False):
    # To get the recommened movies of each user
    recommend_items = create_recommendations(df_ratings, sim_users)
    if (random):
        return select_random_K_recommendations(recommend_items, N)
    else:
        return select_K_recommendations(recommend_items, N)
        
        

Calculates the RMSE/MSE Score of the target user.
$$ RMSE = \sqrt{\frac{\sum \limits _{i=1} ^{N} (\hat{y_i} - y_i)^2}{N}} $$


$$ MSE = \frac{\sum \limits _{i=1} ^{N} (\hat{y_i} - y_i)^2}{N} $$

In [120]:
def base_score_calculate(target, sim_users):
    # List of movies watched by target
    movie_list = target['movieId'].tolist()
    # Ratings of other people watched target movies
    sim_other_ratings = sim_users.merge(df_ratings, left_on='userId', right_on='userId')
    # Only get movies that are watched by target
    sim_other_ratings = sim_other_ratings[sim_other_ratings['movieId'].isin(movie_list)]
    # Calculate the weighted rating
    sim_other_ratings['weighted_rating'] = sim_other_ratings['similarity'] * sim_other_ratings['rating']
    # Group by movieId and calculate sum
    sum_ratings = sim_other_ratings.groupby('movieId', as_index = False).sum()[['movieId', 'similarity','weighted_rating']]
    # From the sum, calculate the final weighted rating score for each movie for target
    sum_ratings['final_rating'] = sum_ratings['weighted_rating'] / sum_ratings['similarity']
    # Get the original ratings for each movie
    sum_ratings = sum_ratings.merge(target[['movieId','rating']], left_on='movieId', right_on='movieId')[['movieId','final_rating','rating']]
    return sum_ratings
    
def RMSE(target, sim_users):
    sum_ratings = base_score_calculate(target, sim_users)
    # RMSE score of actual vs predicted
    return mean_squared_error(sum_ratings['rating'].tolist(), sum_ratings['final_rating'].tolist(), squared=False)

def MSE(target, sim_users):
    sum_ratings = base_score_calculate(target, sim_users)
    # MSE score of actual vs predicted
    return mean_squared_error(sum_ratings['rating'].tolist(), sum_ratings['final_rating'].tolist(), squared=True)
    

Calculate the Precision and Recall of target user

To compute these methods the true ratings are translated into binary form by assuming any item rated $\geq$ 3.5 is relevant and any item < 3.5 is irrelevant. Then I compute the following equations for Precision and Recall:


$$ Precision = \frac{\text{|Relevant items @k|}}{\text{|All items @k|}} $$

$$ Recall = \frac{\text{|Relevant items @k|}}{\text{|Total Relevant items|}} $$

In [128]:
def Precision_Recall(target, sim_users, K):
    sum_ratings = base_score_calculate(target, sim_users)
    sum_ratings[sum_ratings['final_rating'] >= 3.5]['movieId']
    # Original relevant items [>= 3.5] = relevant
    original_relevant_items = sum_ratings[sum_ratings['rating'] >= 3.5]['movieId'].tolist()
    # Predicted relevant items [>= 3.5] = relevant for up to value 'K'
    temp_sum_ratings = sum_ratings.head(K)
    predicted_relevant_items = temp_sum_ratings[temp_sum_ratings['final_rating'] >= 3.5]['movieId'].tolist()
    # Calculate the intersection of lists
    relevant_items = len(set(original_relevant_items) & set(predicted_relevant_items))
    # Precision = (relevant @k) / (all items @k)
    precision = (relevant_items / temp_sum_ratings.shape[0])
    # recall = (relevant @k) / (total relevant items)
    recall = (relevant_items / len(original_relevant_items))
    return precision, recall

Create the training and test split for the target user.

In [129]:
def user_train_test_split(target, testing_split=0.2):
    train_set_size = math.floor(len(target) * (1 - testing_split))
    df_shuffle = target.sample(frac=1)
    return df_shuffle[:train_set_size], df_shuffle[train_set_size:]

To evaluate the model each user will have 20% of ratings taken as test set while the rest is used for training. The metrics are calculated for each user and then an average is computed.

In [375]:
def main_run_evaluate(K, positive_only=False, threshold=0.6):
    # Mean MSE, Precision and Recall Score
    avg_mse_test = []
    avg_precision_test = []
    avg_recall_test = []
    # List of all unique users
    all_users = df_ratings['userId'].unique()
    # Performing the calculation for all users in dataset
    for user in tqdm(all_users):
        try:
            target = df_ratings[df_ratings['userId'] == user]
            other = df_ratings[df_ratings['userId'] != user]
            # Create user training and test set
            train, test = user_train_test_split(target, testing_split=0.2)
            # Get other users that have watched the same movies seen in the training set
            viewers = get_viewers_of_target(train, positive_only)
            # Check how similar these users are to the target user
            sim_users = get_similar_users(train, viewers, process_num=50)
            # Selecting a subset of these similar users
            sim_users = select_best_users(sim_users, threshold=threshold)    
            # Evaluate the performance for target user
            mse_test = MSE(test, sim_users)
            precision_test, recall_test = Precision_Recall(test, sim_users, K=K)
            # Append to avg lists
            avg_mse_test.append(mse_test)
            avg_precision_test.append(precision_test)
            avg_recall_test.append(recall_test)
        except:
            # Due to small dataset size if it cant find other users with any related movies
            # it will throw an exception and it is caught hereor
            print('Unable to Process User:', user)
        
        
    metrics = {'avg_mse_test' : mean(avg_mse_test), 'avg_precision_test' : mean(avg_precision_test),
               'avg_recall_test' : mean(avg_recall_test)}
    
    return metrics

In [378]:
# Running Algorithm with K=50
metrics_output = main_run_evaluate(K=50, positive_only=False, threshold=0.0)

In [377]:
metrics_output

{'avg_mse_test': 1.0085150242172634,
 'avg_precision_test': 0.4574205979921704,
 'avg_recall_test': 0.6306438453600622}

Get list of recommended movies for a single user.

In [372]:
def user_recommend(userId, positive_only=False, threshold=0.0, N=10, random=False, unseenOnly=False):
    # Dataset Split
    target = df_ratings[df_ratings['userId'] == userId]
    # Get other users that have watched the same movies seen in the training set
    viewers = get_viewers_of_target(target, positive_only)
    # Check how similar these users are to the target user
    sim_users = get_similar_users(target, viewers, process_num=50)
    # Selecting a subset of these similar users
    sim_users = select_best_users(sim_users, threshold=threshold)   
    if (unseenOnly):
        return get_recommendations_unseen(target, df_ratings, sim_users, N=N, random=random)
    return get_recommendations(df_ratings, sim_users, N=N, random=random)

In [395]:
user_recommend(120, positive_only=False, threshold=0.0, N=50, random=False, unseenOnly=False)

Unnamed: 0,movieId,final_rating,title
5777,89118,5.0,"Skin I Live In, The"
3886,7121,5.0,Adam's Rib
3449,6192,5.0,Open Hearts
6339,122912,5.0,Avengers: Infinity War - Part I
3454,6201,5.0,Lady Jane
2408,3951,5.0,Two Family House
6354,127108,5.0,Brooklyn
3571,6442,5.0,Belle époque
6392,132333,5.0,Seve
823,1295,5.0,"Unbearable Lightness of Being, The"


In [393]:
seeRatedMovies = df_ratings.merge(df_movies[['movieId','title']], left_on='movieId', right_on='movieId')
seeRatedMovies[seeRatedMovies['userId'] == 120]

Unnamed: 0,userId,movieId,rating,title
1619,120,260,5.0,Star Wars: Episode IV - A New Hope
4684,120,608,5.0,Fargo
4861,120,648,3.0,Mission: Impossible
5118,120,733,4.0,"Rock, The"
5236,120,736,3.0,Twister
5378,120,780,2.0,Independence Day
6030,120,1073,2.0,Willy Wonka & the Chocolate Factory
7598,120,1210,5.0,Star Wars: Episode VI - Return of the Jedi
18980,120,32,3.0,Twelve Monkeys
19160,120,52,3.0,Mighty Aphrodite
