In [44]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error

## Load data

In [3]:
# Read the two CSV tables from the ml-latest-small dataset directory.
# ratings: one row per (userId, movieId) rating event
ratings_df = pd.read_csv('./datasets/ml-latest-small/ratings.csv')

# movies: one row per movie with its title and genre list
movies_df = pd.read_csv('./datasets/ml-latest-small/movies.csv')

In [59]:
# Preview the first rows of the movies table (columns: movieId, title, genres).
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [60]:
# Preview the first rows of the ratings table (columns: userId, movieId, rating, timestamp).
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [61]:
# Remove sparse data: a rating is kept only if its movie AND its user each
# have strictly more ratings than the thresholds below.
min_movie_ratings = 10
min_user_ratings = 10

# how many ratings every movie received / every user gave
ratings_per_movie = ratings_df['movieId'].value_counts()
ratings_per_user = ratings_df['userId'].value_counts()

# ids that clear the (strict) thresholds
kept_movies = ratings_per_movie[ratings_per_movie > min_movie_ratings].index
kept_users = ratings_per_user[ratings_per_user > min_user_ratings].index

# then get the filtered data
mask = ratings_df['movieId'].isin(kept_movies) & ratings_df['userId'].isin(kept_users)
ratings_df_filtered = ratings_df.loc[mask]

# the helper objects are no longer needed once the mask has been applied
del ratings_per_movie, ratings_per_user, kept_movies, kept_users
ratings_df_filtered.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [62]:
# Summary statistics of the filtered ratings (row count, rating range, id ranges).
ratings_df_filtered.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,79636.0,79636.0,79636.0,79636.0
mean,318.605028,14592.299174,3.579443,1196723000.0
std,181.771511,29229.502261,1.0168,216812900.0
min,1.0,1.0,0.5,828124600.0
25%,167.0,953.0,3.0,999708000.0
50%,314.0,2424.0,4.0,1180444000.0
75%,474.0,5971.0,4.0,1430598000.0
max,610.0,187593.0,5.0,1537799000.0


In [64]:
# Hold out a small, randomly chosen portion of the filtered ratings for testing.
n = 10000  # number of ratings in the held-out test set

# A seeded generator makes the shuffle (and therefore the split) reproducible.
rng = np.random.default_rng(42)
permuted_indices = rng.permutation(len(ratings_df_filtered))

# The last n shuffled positions become the test set; everything before is training data.
df_train = ratings_df_filtered.iloc[permuted_indices[:-n]]
df_test = ratings_df_filtered.iloc[permuted_indices[-n:]]
print(df_train.shape, df_test.shape, sep='\n')

(69636, 4)
(10000, 4)


In [65]:
# Reshape the training ratings into a User-Movie matrix:
# one row per userId, one column per movieId, cell = rating (NaN when unrated).
um_matrix = pd.pivot_table(
    df_train,
    values='rating',
    index='userId',
    columns='movieId',
)
print(um_matrix.shape)
um_matrix.head(10)

(610, 2121)


movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,,4.0,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
6,,4.0,5.0,5.0,4.0,4.0,,3.0,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,4.0,,,,,,2.0,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [66]:
# Enrich every rating with its movie's title and genres
# (left join of ratings_df['movieId'] against the movieId-indexed movies table).
movie_ratings = ratings_df.merge(
    movies_df.set_index('movieId'),
    how='left',
    left_on='movieId',
    right_index=True,
)
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [67]:
# Re-display the raw ratings: the join above returned a new frame, so ratings_df itself is unchanged.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Mean Rating based global recommendation

Global recommendation is useful when there is no information about a user. Here we compute the mean rating of each movie to create a ranking list. The top-ranked movies can then be recommended to any new user (the recommendation is the same for all users).

In [68]:
# Per-movie mean of the training ratings, best-rated movie first.
mean_ratings = (
    um_matrix
    .mean(axis=0)
    .sort_values(ascending=False)
    .to_frame('Mean-Rating')
)

# Per-movie number of training ratings (NaN cells are not counted).
ratings_count = um_matrix.count(axis=0).to_frame('Rating-counts')

# One table: mean, count, and the movie's title/genres looked up by movieId.
movie_ratings_mean = (
    mean_ratings
    .join(ratings_count)
    .join(movies_df.set_index('movieId'), on='movieId')
)

movie_ratings_mean.head(20)


Unnamed: 0_level_0,Mean-Rating,Rating-counts,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3451,4.625,8,Guess Who's Coming to Dinner (1967),Drama
1041,4.6,10,Secrets & Lies (1996),Drama
1178,4.541667,12,Paths of Glory (1957),Drama|War
1217,4.464286,14,Ran (1985),Drama|War
2360,4.458333,12,"Celebration, The (Festen) (1998)",Drama
1104,4.444444,18,"Streetcar Named Desire, A (1951)",Drama
318,4.435315,286,"Shawshank Redemption, The (1994)",Crime|Drama
306,4.433333,15,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
3030,4.4,10,Yojimbo (1961),Action|Adventure
951,4.392857,14,His Girl Friday (1940),Comedy|Romance


In [69]:
# Use each movie's training-set mean rating as the prediction for the held-out ratings.
# df_test is re-indexed by movieId and joined (join() defaults to a left join) with
# mean_ratings on that index, so every test row is kept and gains a 'Mean-Rating' column.
# NOTE(review): a test movie with no training ratings would get NaN here; the
# info() output below shows no nulls for this particular split.
predictions = df_test.set_index('movieId').join(mean_ratings)
predictions.head()

Unnamed: 0_level_0,userId,rating,timestamp,Mean-Rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,182,4.0,1063289621,3.951613
1,269,5.0,850865423,3.951613
1,135,4.0,1009691859,3.951613
1,21,3.5,1407618878,3.951613
1,350,4.0,864940931,3.951613


In [70]:
# Check dtypes and non-null counts; a null in 'Mean-Rating' would mean a test movie had no training ratings.
predictions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 187593
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   userId       10000 non-null  int64  
 1   rating       10000 non-null  float64
 2   timestamp    10000 non-null  int64  
 3   Mean-Rating  10000 non-null  float64
dtypes: float64(2), int64(2)
memory usage: 390.6 KB


In [71]:
# Score the mean-rating baseline on the held-out ratings:
# RMSE = square root of the mean squared difference between actual and predicted rating.
y_true, y_pred = predictions['rating'], predictions['Mean-Rating']

rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
print(f'The RMSE of the mean rating recommender is: {rmse}')

The RMSE of the mean rating recommender is: 0.9323896824242067


In [72]:
# Ten best movies by plain mean rating
# (movie_ratings_mean was built from mean_ratings, which is sorted by 'Mean-Rating' descending).
movie_ratings_mean.loc[:, ['title', 'Mean-Rating']].head(10)

Unnamed: 0_level_0,title,Mean-Rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
3451,Guess Who's Coming to Dinner (1967),4.625
1041,Secrets & Lies (1996),4.6
1178,Paths of Glory (1957),4.541667
1217,Ran (1985),4.464286
2360,"Celebration, The (Festen) (1998)",4.458333
1104,"Streetcar Named Desire, A (1951)",4.444444
318,"Shawshank Redemption, The (1994)",4.435315
306,Three Colors: Red (Trois couleurs: Rouge) (1994),4.433333
3030,Yojimbo (1961),4.4
951,His Girl Friday (1940),4.392857


## Weighted-mean rating based global recommender

Some movies have only a few ratings, which makes their mean rating unstable. The weighted mean used by IMDB is a good way to tackle this problem.

Weighted Rating is defined as 

$$WR = \frac{v}{v+m}\,R + \frac{m}{v+m}\,C$$

where:

R = average for the movie (mean) = (Rating)

v = number of votes for the movie = (votes)

m = minimum votes required

C = mean vote across the entire dataset (the mean of all individual ratings)

In [73]:
# Ingredients of the weighted rating, all taken from the training matrix.

# m: minimum-votes constant used to weight the global mean
m = 10

# v: number of ratings each movie received (one entry per um_matrix column)
v = um_matrix.count(axis=0).to_numpy()

# R: mean rating of each movie (same column order as v)
R = um_matrix.mean(axis=0).to_numpy()

# C: mean over all individual ratings in the matrix
C = um_matrix.stack().mean()

In [77]:
# Weighted rating per movie: WR = v/(v+m) * R + m/(v+m) * C
# Movies with few votes (small v) are pulled towards the global mean C,
# movies with many votes stay close to their own mean R.
weighted_rating = (v / (v + m)) * R + (m / (v + m)) * C

# v and R follow the column order of um_matrix, so its columns (movieId)
# are the matching index for the result.
weighted_mean_rating = pd.DataFrame(
    {'Weighted-Mean': weighted_rating},
    index=um_matrix.columns,
)

# Combine with the per-movie summary table.
# Any 'Weighted-Mean' column left over from a previous run of this cell is
# dropped first; otherwise re-running the cell fails because join() refuses
# overlapping column names when no suffix is given.
movie_ratings_mean = (
    movie_ratings_mean
    .drop(columns='Weighted-Mean', errors='ignore')
    .join(weighted_mean_rating)
)

movie_ratings_mean.head(10)

Unnamed: 0_level_0,Mean-Rating,Rating-counts,title,genres,Weighted-Mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3451,4.625,8,Guess Who's Coming to Dinner (1967),Drama,4.042511
1041,4.6,10,Secrets & Lies (1996),Drama,4.08826
1178,4.541667,12,Paths of Glory (1957),Drama|War,4.102963
1217,4.464286,14,Ran (1985),Drama|War,4.094383
2360,4.458333,12,"Celebration, The (Festen) (1998)",Drama,4.057509
1104,4.444444,18,"Streetcar Named Desire, A (1951)",Drama,4.134471
318,4.435315,286,"Shawshank Redemption, The (1994)",Crime|Drama,4.406301
306,4.433333,15,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama,4.090608
3030,4.4,10,Yojimbo (1961),Action|Adventure,3.98826
951,4.392857,14,His Girl Friday (1940),Comedy|Romance,4.052716


In [78]:
# Use each movie's weighted-mean rating as the prediction for the held-out ratings.
# df_test is re-indexed by movieId and joined (join() defaults to a left join) with
# weighted_mean_rating on that index, adding a 'Weighted-Mean' column to every test row.
# NOTE(review): this rebinds `predictions`, replacing the mean-rating predictions built earlier.
predictions = df_test.set_index('movieId').join(weighted_mean_rating)
predictions.head(10)

Unnamed: 0_level_0,userId,rating,timestamp,Weighted-Mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,182,4.0,1063289621,3.932475
1,269,5.0,850865423,3.932475
1,135,4.0,1009691859,3.932475
1,21,3.5,1407618878,3.932475
1,350,4.0,864940931,3.932475
1,339,4.0,1460183470,3.932475
1,90,3.0,856353996,3.932475
1,570,4.0,1181476989,3.932475
1,436,4.0,833529571,3.932475
1,63,5.0,1443199669,3.932475


In [79]:
# Score the weighted-mean recommender on the held-out ratings:
# RMSE = square root of the mean squared difference between actual and predicted rating.
y_true, y_pred = predictions['rating'], predictions['Weighted-Mean']

rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
print(f'The RMSE for the weighted-mean rating recommender is: {rmse}')

The RMSE for the weighted-mean rating recommender is: 0.9331253250718711


In [81]:
# Ten best movies when ranked by the weighted mean instead of the plain mean.
(
    movie_ratings_mean[['title', 'Weighted-Mean']]
    .sort_values(by=['Weighted-Mean'], ascending=False)
    .head(10)
)

Unnamed: 0_level_0,title,Weighted-Mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
318,"Shawshank Redemption, The (1994)",4.406301
858,"Godfather, The (1972)",4.243063
58559,"Dark Knight, The (2008)",4.224187
912,Casablanca (1942),4.223282
1193,One Flew Over the Cuckoo's Nest (1975),4.218877
1213,Goodfellas (1990),4.203121
2959,Fight Club (1999),4.195888
48516,"Departed, The (2006)",4.191181
260,Star Wars: Episode IV - A New Hope (1977),4.189979
1221,"Godfather: Part II, The (1974)",4.185083
