In [1]:
import pandas as pd
import numpy as np

from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix

from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise import accuracy

from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

In [2]:
# Read the movies, links, tags and ratings CSV files into DataFrames, in that order.
df_movies, df_links, df_tags, df_ratings = map(
    pd.read_csv,
    (
        "./Datasets/Movies.csv",
        "./Datasets/Links.csv",
        "./Datasets/Tags.csv",
        "./Datasets/Ratings.csv",
    ),
)

In [3]:
# Users x movies rating grid; (user, movie) pairs without a rating become 0.
user_item_matrix = (
    df_ratings
    .pivot_table(values='rating', index='userId', columns='movieId')
    .fillna(0)
)
user_item_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Factorise the rating grid with a truncated SVD (k=50 singular triplets).
user_item_matrix_sparse = csr_matrix(user_item_matrix)

U, singular_values, Vt = svds(user_item_matrix_sparse, k=50)

# Expand the singular values into a diagonal matrix so U, sigma and Vt can be
# multiplied together directly.
sigma = np.diag(singular_values)

In [5]:
# Low-rank reconstruction of the rating grid: (U . sigma) . Vt
predicted_ratings = (U @ sigma) @ Vt

In [6]:
# Re-attach the user / movie labels of the original grid to the reconstruction.
predicted_ratings_df = pd.DataFrame(
    data=predicted_ratings,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)
predicted_ratings_df

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.181872,0.393674,0.838186,-0.082365,-0.546279,2.521662,-0.887231,-0.025221,0.196969,1.606758,...,-0.024984,-0.021415,-0.028553,-0.028553,-0.024984,-0.028553,-0.024984,-0.024984,-0.024984,-0.058988
2,0.209809,0.004821,0.030742,0.017252,0.183764,-0.060660,0.083306,0.023797,0.048100,-0.151968,...,0.018895,0.016196,0.021594,0.021594,0.018895,0.021594,0.018895,0.018895,0.018895,0.031966
3,0.013394,0.034726,0.050525,0.000200,-0.005577,0.114673,-0.007461,0.000738,0.004747,-0.061284,...,-0.001612,-0.001382,-0.001843,-0.001843,-0.001612,-0.001843,-0.001612,-0.001612,-0.001612,-0.000530
4,2.012072,-0.394882,-0.290386,0.093864,0.123312,0.259765,0.472667,0.035965,0.011293,-0.021983,...,0.001966,0.001685,0.002247,0.002247,0.001966,0.002247,0.001966,0.001966,0.001966,-0.021462
5,1.336714,0.772954,0.064577,0.113880,0.274994,0.584480,0.251048,0.131534,-0.086310,1.035361,...,-0.004407,-0.003778,-0.005037,-0.005037,-0.004407,-0.005037,-0.004407,-0.004407,-0.004407,-0.006099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.479239,-0.068356,0.013008,0.079384,-0.177777,-0.653276,2.354006,-0.005611,-0.083256,-0.181771,...,-0.035946,-0.030811,-0.041081,-0.041081,-0.035946,-0.041081,-0.035946,-0.035946,-0.035946,-0.057618
607,2.818179,1.371977,0.327078,0.007047,-0.280580,1.507306,-0.082546,-0.017639,0.069184,1.387139,...,0.004102,0.003516,0.004688,0.004688,0.004102,0.004688,0.004102,0.004102,0.004102,-0.032740
608,2.309635,2.702439,2.264197,0.020182,0.152228,3.716082,-0.028438,0.222490,0.129641,2.447513,...,-0.012517,-0.010728,-0.014305,-0.014305,-0.012517,-0.014305,-0.012517,-0.012517,-0.012517,0.092752
609,0.783183,0.530143,0.097975,0.025595,0.090406,0.216570,0.080565,0.060980,-0.066119,1.249459,...,0.000985,0.000844,0.001125,0.001125,0.000985,0.001125,0.000985,0.000985,0.000985,-0.005494


In [7]:
# Highest raw (unscaled) predicted score for each user.
predicted_ratings_df.max(axis="columns")

userId
1      6.421784
2      2.108572
3      0.311241
4      3.815454
5      3.090049
         ...   
606    6.311061
607    5.451264
608    6.488272
609    2.455088
610    5.761253
Length: 610, dtype: float64

In [8]:
# Min-max rescale every prediction onto [0, 5] using the global extremes of the
# reconstruction. The transform is affine and increasing, so the relative order
# of predictions is unchanged.
min_rating = predicted_ratings.min()
max_rating = predicted_ratings.max()

rating_span = max_rating - min_rating
scaled_predicted_ratings = 5.0 * (predicted_ratings - min_rating) / rating_span

scaled_predicted_ratings_df = pd.DataFrame(
    data=scaled_predicted_ratings,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)

In [9]:
# Display the rescaled prediction grid (the notebook renders a truncated view).
scaled_predicted_ratings_df

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.273023,1.591083,1.760600,1.409543,1.232626,2.402604,1.102602,1.431335,1.516068,2.053700,...,1.431425,1.432786,1.430064,1.430064,1.431425,1.430064,1.431425,1.431425,1.431425,1.418457
2,1.520965,1.442791,1.452677,1.447532,1.511032,1.417820,1.472722,1.450028,1.459296,1.382999,...,1.448159,1.447129,1.449188,1.449188,1.448159,1.449188,1.448159,1.448159,1.448159,1.453143
3,1.446061,1.454196,1.460221,1.441029,1.438826,1.484684,1.438108,1.441235,1.442763,1.417582,...,1.440338,1.440426,1.440250,1.440250,1.440338,1.440250,1.440338,1.440338,1.440338,1.440751
4,2.208268,1.290362,1.330212,1.476748,1.487979,1.540016,1.621207,1.454668,1.445260,1.432570,...,1.441703,1.441596,1.441810,1.441810,1.441703,1.441810,1.441703,1.441703,1.441703,1.432768
5,1.950717,1.735723,1.465580,1.484382,1.545823,1.663848,1.536691,1.491114,1.408038,1.835794,...,1.439272,1.439512,1.439032,1.439032,1.439272,1.439032,1.439272,1.439272,1.439272,1.438627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.386426,1.414885,1.445914,1.471226,1.373157,1.191822,2.338667,1.438813,1.409203,1.371633,...,1.427245,1.429203,1.425287,1.425287,1.427245,1.425287,1.427245,1.427245,1.427245,1.418980
607,2.515682,1.964164,1.565686,1.443640,1.333952,2.015773,1.409474,1.434226,1.467337,1.969947,...,1.442517,1.442294,1.442741,1.442741,1.442517,1.442741,1.442517,1.442517,1.442517,1.428467
608,2.321746,2.471544,2.304418,1.448649,1.499006,2.858103,1.430108,1.525801,1.490392,2.374327,...,1.436180,1.436862,1.435498,1.435498,1.436180,1.435498,1.436180,1.436180,1.436180,1.476324
609,1.739624,1.643126,1.478316,1.450714,1.475430,1.523543,1.471677,1.464208,1.415738,1.917441,...,1.441328,1.441275,1.441382,1.441382,1.441328,1.441382,1.441328,1.441328,1.441328,1.438858


In [10]:
# Disabled export: uncomment to write each user's highest scaled prediction to CSV.
#scaled_predicted_ratings_df.max(axis=1).to_csv('./scaled_predicted_ratings.csv')

In [11]:
# Highest scaled prediction for each user.
scaled_predicted_ratings_df.max(axis="columns")

userId
1      3.889938
2      2.245069
3      1.559646
4      2.895999
5      2.619361
         ...   
606    3.847713
607    3.519825
608    3.915294
609    2.377215
610    3.638041
Length: 610, dtype: float64

In [12]:
# Global minimum over every cell of the scaled grid (0 by construction of the
# min-max scaling). Reducing the underlying NumPy array always yields a scalar;
# np.min(DataFrame) defers to DataFrame.min(axis=None), whose result (scalar vs.
# per-column Series) depends on the installed pandas version.
scaled_predicted_ratings_df.to_numpy().min()

0.0

In [13]:
# Average scaled prediction for each user across all movies.
scaled_predicted_ratings_df.mean(axis="columns")

userId
1      1.479975
2      1.445303
3      1.443113
4      1.470896
5      1.447595
         ...   
606    1.601077
607    1.468978
608    1.546351
609    1.444816
610    1.628610
Length: 610, dtype: float64

In [14]:
# Lowest scaled prediction for each user.
scaled_predicted_ratings_df.min(axis="columns")

userId
1      0.721222
2      1.256662
3      1.376909
4      1.021650
5      1.322632
         ...   
606    1.148174
607    0.887871
608    0.559644
609    1.327052
610    1.168711
Length: 610, dtype: float64

In [15]:
def recommend_movies_svd(user_id, user_item_matrix, predicted_ratings_df, movies_df, top_n=10):
    """Return the ``top_n`` not-yet-rated movies with the highest predicted score.

    Parameters
    ----------
    user_id
        Row label of the user in ``user_item_matrix`` and ``predicted_ratings_df``.
    user_item_matrix : pandas.DataFrame
        Observed ratings (users x movies). Entries ``> 0`` are treated as
        already rated and are excluded from the recommendations.
    predicted_ratings_df : pandas.DataFrame
        Predicted scores, labelled like ``user_item_matrix``.
    movies_df : pandas.DataFrame
        Movie metadata; must contain a ``movieId`` column.
    top_n : int, default 10
        Number of movies to recommend.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies_df`` for the recommended movies, ordered from the
        highest to the lowest predicted score. The original row labels of
        ``movies_df`` are kept. Movie ids that are absent from ``movies_df``
        are silently omitted.

    Raises
    ------
    KeyError
        Propagated from ``.loc`` when ``user_id`` is not a row label of the
        input frames.
    """
    user_ratings = user_item_matrix.loc[user_id]
    already_rated = user_ratings[user_ratings > 0].index

    user_predicted_ratings = predicted_ratings_df.loc[user_id].drop(already_rated)
    top_n_movie_ids = user_predicted_ratings.sort_values(ascending=False).head(top_n).index

    recommended_movies = movies_df[movies_df['movieId'].isin(top_n_movie_ids)]

    # The isin() filter yields rows in movies_df order, which throws away the
    # ranking computed above. Restore it by sorting on each movie's rank.
    rank_by_movie_id = {movie_id: rank for rank, movie_id in enumerate(top_n_movie_ids)}
    ranks = recommended_movies['movieId'].map(rank_by_movie_id).to_numpy()
    recommended_movies = recommended_movies.iloc[np.argsort(ranks, kind='stable')]

    return recommended_movies

In [16]:
# userId used for the sample recommendation call below.
user_id_sample = 1

In [17]:
# Top-5 recommendations for the sample user, scored with the rescaled predictions.
recommended_movies_svd_sample = recommend_movies_svd(
    user_id=user_id_sample,
    user_item_matrix=user_item_matrix,
    predicted_ratings_df=scaled_predicted_ratings_df,
    movies_df=df_movies,
    top_n=5,
)
recommended_movies_svd_sample

Unnamed: 0,movieId,title,genres
659,858,"Godfather, The (1972)",Crime|Drama
793,1036,Die Hard (1988),Action|Crime|Thriller
922,1221,"Godfather: Part II, The (1974)",Crime|Drama
1067,1387,Jaws (1975),Action|Horror
1445,1968,"Breakfast Club, The (1985)",Comedy|Drama


In [26]:
# Flatten both grids row by row so observed and predicted values line up
# element-wise.
# NOTE(review): this rebinds `predicted_ratings` from the 2-D reconstruction to
# a 1-D array of *scaled* values; earlier cells that read the 2-D matrix cannot
# be re-run after this one without recomputing it first.
actual_ratings = user_item_matrix.to_numpy().flatten()
predicted_ratings = scaled_predicted_ratings_df.to_numpy().flatten()

In [27]:
# Boolean mask over the flattened grid: True where a rating was observed (> 0).
non_zero_indices = np.greater(user_item_matrix.to_numpy().flatten(), 0)

In [28]:
# Keep only the positions that carry an observed rating, for both arrays.
actual_ratings_filtered, predicted_ratings_filtered = (
    flat_values[non_zero_indices]
    for flat_values in (actual_ratings, predicted_ratings)
)

In [29]:
# Number of (user, movie) pairs that enter the error computation.
np.size(predicted_ratings_filtered)

100836

In [30]:
# Root-mean-squared error between observed ratings and scaled predictions.
# NOTE(review): the ratings compared here are the same ones the SVD was fitted
# on, so this is an in-sample error; presumably a held-out split is intended
# (train_test_split is imported but unused in the visible cells) - confirm.
mse = mean_squared_error(actual_ratings_filtered, predicted_ratings_filtered)
rmse = sqrt(mse)

print(f'RMSE: {rmse}')

RMSE: 1.6018934304006824
