In [2]:
# Imports
import pandas as pd
import numpy as np
import operator
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from statistics import mean
from tqdm import tqdm

In [3]:
# Load the movie metadata and the user ratings from the data/ directory
df_movies = pd.read_csv('data/movies.csv')
# The 'timestamp' column is not used anywhere below, so it is dropped straight after loading
df_ratings = pd.read_csv('data/ratings.csv').drop(columns=['timestamp'])

In [4]:
# Split the raw title into a release year and a clean display title.
# NOTE: this cell rewrites `title` in place, so running it a second time finds no
# year in the already-cleaned titles and overwrites `movie_year` with NaN;
# re-run the loading cell first.

# Capture only the four-digit "(YYYY)" group that ends the title. A greedy "(.*)"
# would span from the first "(" to the last ")" and swallow any earlier
# parenthesised text. Titles without a trailing "(YYYY)" get NaN.
df_movies['movie_year'] = df_movies['title'].str.extract(r"\((\d{4})\)\s*$", expand=False)
# Remove the parenthesised text from the title (raw string avoids the invalid "\("
# escape) and strip the whitespace that is left behind.
df_movies['title'] = df_movies['title'].str.replace(r"\((.*)\)", "", regex=True).str.strip()
df_movies.head(2)

Unnamed: 0,movieId,title,genres,movie_year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995


In [5]:
# User x movie ratings matrix: one row per userId, one column per movieId.
# Movies a user has not rated are stored as 0.
ratings_matrix = (
    df_ratings
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)
# User x user cosine similarity of the rating vectors, labelled by userId on both axes
user_sim_matrix = pd.DataFrame(
    cosine_similarity(ratings_matrix),
    index=ratings_matrix.index,
    columns=ratings_matrix.index,
)


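The cell above builds a user × movie matrix holding every rating in the dataset, with unrated movies filled with 0. A second matrix is then computed that holds the cosine similarity between every pair of users, i.e. how similar each user is to every other user.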

In [6]:
def create_all_recommendations(target_user, N=100):
    """Predict ratings for one user from the ratings of their most similar users.

    Reads the module-level ``user_sim_matrix`` and ``ratings_matrix``. For every
    movie rated by at least one neighbour, the prediction is the
    similarity-weighted average of the neighbours' ratings,
    ``sum(S_u * R_u) / sum(S_u)``, taken over the neighbours ``u`` that rated it.
    A stored rating of 0 is treated as "not rated".

    Parameters
    ----------
    target_user : label
        userId to predict for; must be a label of ``user_sim_matrix``.
    N : int, default 100
        Maximum number of most-similar users to use as neighbours.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId`` and ``prediction_userCF``, one row per predicted
        movie, ordered by ``movieId`` (NOT by predicted rating).

    Raises
    ------
    KeyError
        If ``target_user`` is not a label of ``user_sim_matrix``.
    """
    # Remove the target by label instead of slicing off position 0: with ties or
    # float rounding another user can sort ahead of the target, which would keep
    # the target as its own neighbour and discard a real one.
    neighbour_sims = (
        user_sim_matrix.loc[target_user]
        .drop(labels=target_user)
        .nlargest(N)
    )
    # Neighbours with non-positive similarity contribute nothing useful and can
    # make the weight sum zero (0/0 -> NaN prediction), so leave them out.
    neighbour_sims = neighbour_sims[neighbour_sims > 0]

    neighbour_ratings = ratings_matrix.loc[neighbour_sims.index]
    # 1.0 where the neighbour rated the movie, 0.0 where the matrix holds the
    # "unrated" filler value 0.
    has_rating = (neighbour_ratings != 0).astype(float)

    # Numerator and denominator of the weighted average, per movie
    weighted_sum = neighbour_ratings.mul(neighbour_sims, axis=0).sum(axis=0)
    weight_total = has_rating.mul(neighbour_sims, axis=0).sum(axis=0)

    # Only movies rated by at least one neighbour can be predicted
    covered = weight_total > 0
    prediction = (weighted_sum[covered] / weight_total[covered]).sort_index()
    return pd.DataFrame({
        'movieId': prediction.index.to_numpy(),
        'prediction_userCF': prediction.to_numpy(),
    })

Above method creates the recommendations for a single user.

Instead of the mean rating of similar users, a weighted rating is used.

$$R_U = \frac{(\sum \limits _{u=1} ^{n} S_u * R_u)}{(\sum \limits _{u=1} ^{n} S_u)}$$
where $S_u$ is the user similarity and $R_u$ is the rating

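A common baseline predicts the target user's rating for a movie as the plain average of the ratings given by similar users. Here a similarity-weighted average is used instead: each neighbour's rating is multiplied by that neighbour's similarity to the target user, so more similar users have more influence on the prediction.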
https://realpython.com/build-recommendation-engine-collaborative-filtering/

In [7]:
def get_recommendations(target_user, K=30, random=False, exclude_rated=True):
    """Return up to K recommended movies (with titles) for a user.

    Predictions come from ``create_all_recommendations`` and are ranked by
    ``prediction_userCF`` in descending order. Titles are joined in from the
    module-level ``df_movies``; movies without a matching title row are dropped
    by the inner merge.

    Parameters
    ----------
    target_user : label
        userId to recommend for.
    K : int, default 30
        Maximum number of movies to return.
    random : bool, default False
        If True, sample K movies from the top ``3 * K`` ranked movies instead
        of returning the top K. The sample is not seeded.
    exclude_rated : bool, default True
        If True, movies the target user has already rated (according to the
        module-level ``df_ratings``) are removed before ranking is cut, so
        already-rated movies are not recommended back to the user.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId``, ``prediction_userCF`` and ``title``; fewer than K
        rows when fewer candidates are available.
    """
    ranked = create_all_recommendations(target_user).sort_values(
        by=['prediction_userCF'], ascending=False
    )
    if exclude_rated:
        rated_ids = df_ratings.loc[df_ratings['userId'] == target_user, 'movieId']
        ranked = ranked[~ranked['movieId'].isin(rated_ids)]
    ranked = ranked.merge(df_movies[['movieId', 'title']], on='movieId')

    if random:
        # Sample K movies from the top 3*K; cap at the pool size so a short
        # candidate list does not raise ValueError.
        pool = ranked.head(3 * K)
        return pool.sample(n=min(K, len(pool)))
    # Otherwise return the top K movies
    return ranked.head(K)

In [12]:
# Example: 10 recommendations for user 1, sampled from the top 30 ranked predictions.
# The sample is not seeded, so the rows shown differ from run to run.
get_recommendations(1, K=10, random=True)

Unnamed: 0,movieId,prediction_userCF,title
18,4454,5.0,More
29,5485,5.0,Tadpole
26,5537,5.0,Satin Rouge
15,8580,5.0,Into the Woods
24,99,5.0,Heidi Fleiss: Hollywood Madam
21,1699,5.0,"Butcher Boy, The"
28,1809,5.0,Fireworks
20,1310,5.0,Hype!
27,2972,5.0,Red Sorghum
12,65642,5.0,Timecrimes


The method above returns the recommended movies, together with their titles, as a table. With `random=True`, `K` movies are sampled from the top `3*K` ranked movies, so other highly rated movies can also appear in the list.

In [13]:
def base_score_CF(target_user, N=100):
    """Pair a user's actual ratings with the predicted ratings for the same movies.

    Actual ratings are read from the module-level ``df_ratings``; predictions
    come from ``create_all_recommendations(target_user, N)``. Only movies present
    in both are kept (inner merge on ``movieId``).

    Returns a DataFrame with columns ``movieId``, ``rating`` and
    ``prediction_userCF``, sorted by ``prediction_userCF`` in descending order.
    """
    # Ratings the target user has actually given
    is_target = df_ratings['userId'] == target_user
    actual = df_ratings.loc[is_target, ['movieId', 'rating']]
    # Ratings predicted from the N most similar users
    predicted = create_all_recommendations(target_user, N)
    # Keep only the movies that have both an actual and a predicted rating
    paired = actual.merge(predicted, on='movieId')
    return paired.sort_values(by='prediction_userCF', ascending=False)

def MSE(target_user):
    """Mean squared error between a user's actual and predicted ratings.

    Uses ``base_score_CF(target_user)`` to pair each actual ``rating`` with its
    ``prediction_userCF`` and passes both columns to scikit-learn's
    ``mean_squared_error``.

    NOTE(review): when the user has no movie with both an actual and a predicted
    rating the paired frame is empty; ``mean_squared_error`` is expected to
    raise ValueError for empty input - confirm before relying on it.
    """
    complete_df = base_score_CF(target_user)
    # MSE is the function's default output. The old `squared=True` keyword was
    # redundant and has been removed from recent scikit-learn releases, where
    # passing it raises TypeError.
    return mean_squared_error(
        complete_df['rating'].tolist(),
        complete_df['prediction_userCF'].tolist(),
    )


def average_mse():
    """Return the mean of the per-user MSE over every distinct userId in ``df_ratings``.

    Progress over the users is reported through ``tqdm``.
    """
    user_ids = df_ratings['userId'].unique()
    per_user_scores = [MSE(user_id) for user_id in tqdm(user_ids)]
    return mean(per_user_scores)
          

Calculates the MSE between each user's actual ratings and the ratings predicted for the same movies, then averages that score over all users.

$$ MSE = \frac{\sum \limits _{i=1} ^{N} (\hat{y_i} - y_i)^2}{N} $$

In [14]:
# Average the per-user MSE over every user in df_ratings (one prediction pass per user)
average_mse()

100%|██████████| 610/610 [01:03<00:00,  9.64it/s]


0.9939166312137755