Read the CSV files containing the movie metadata and the user ratings.

In [1]:
import pandas as pd

# Input locations: metadata comes from the app's export directory,
# the ratings from the local datasets directory.
metadata_path = "app/exports/metadata.csv"
ratings_path = "./datasets/ratings.csv"

# Load both tables into DataFrames.
metadata = pd.read_csv(metadata_path)
ratings = pd.read_csv(ratings_path)

If there is no corresponding metadata for a movie in ratings, it is removed

In [5]:
# Keep only the ratings whose movieId also appears in the metadata table.
known_movie_ids = metadata['movieId']
ratings = ratings[ratings['movieId'].isin(known_movie_ids)]

# The metadata is not needed past this point; release it to save memory.
del metadata, known_movie_ids

Analyse the number of unique movies and users

In [6]:
# Report how many distinct movies and users are present in the ratings.
for label, column in (("Movie count: ", 'movieId'), ("User count: ", 'userId')):
    print(label, len(ratings[column].unique()))

Movie count:  21719
User count:  270821


Drop movies with 10 or fewer ratings and users who have rated 10 or fewer movies, since they carry little information. This also reduces the dimensionality of the resulting similarity matrix.

In [7]:
# Number of ratings each movie received (columns: movieId, rating = count).
rating_count_per_movie = ratings.groupby('movieId')['rating'].count().reset_index()

# Keep only movies with more than 10 ratings
# (i.e. movies with 10 or fewer ratings are dropped).
enough_ratings = rating_count_per_movie['rating'] > 10
rating_count_per_movie = rating_count_per_movie[enough_ratings]

# Restrict the ratings to the surviving movies.
kept_movie_ids = rating_count_per_movie['movieId']
ratings = ratings[ratings['movieId'].isin(kept_movie_ids)]


In [8]:
# Distinct movies and users remaining after the movie filter.
for label, column in (("Movie count: ", 'movieId'), ("User count: ", 'userId')):
    print(label, len(ratings[column].unique()))

Movie count:  16880
User count:  270816


In [9]:
# Number of ratings each user gave (columns: userId, rating = count),
# counted on the ratings as they stand when this cell runs.
ratings_count_per_user = ratings.groupby('userId')['rating'].count().reset_index()

# Keep only users with more than 10 ratings
# (i.e. users with 10 or fewer ratings are dropped).
active_enough = ratings_count_per_user['rating'] > 10
ratings_count_per_user = ratings_count_per_user[active_enough]

# Restrict the ratings to the surviving users.
kept_user_ids = ratings_count_per_user['userId']
ratings = ratings[ratings['userId'].isin(kept_user_ids)]

In [10]:
# Distinct movies and users remaining after the user filter.
for label, column in (("Movie count: ", 'movieId'), ("User count: ", 'userId')):
    print(label, len(ratings[column].unique()))

Movie count:  16880
User count:  226440


Sample ratings with users

In [11]:
import random

# Fix the seed so the same users are drawn on every run.
random.seed(42)

# Draw a sample of up to 100000 distinct users.
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the sample size at the number of available users.
users = ratings['userId'].unique().tolist()
users = random.sample(users, min(100000, len(users)))

# Keep every rating made by a sampled user.
train_ratings = ratings[ratings['userId'].isin(users)]

# The full ratings table is no longer needed; release it to save memory.
del ratings
train_ratings

Unnamed: 0,userId,movieId,rating,timestamp
59,4,223,4.0,1042668576
60,4,415,4.0,1042667925
61,4,648,4.0,1042674800
62,4,1097,5.0,1042667925
63,4,1197,4.0,1042667956
...,...,...,...,...
26024284,270896,58559,5.0,1257031564
26024285,270896,60069,5.0,1257032032
26024286,270896,63082,4.5,1257031764
26024287,270896,64957,4.5,1257033990


Pivot table to make user-rating matrix

In [14]:
# Reshape the long ratings table into a wide matrix:
# one row per userId, one column per movieId, cells hold the rating
# (NaN where a user has not rated a movie).
user_rating_matrix = train_ratings.pivot_table(
    values='rating',
    index='userId',
    columns='movieId',
)

# The long-format table is not needed any more; release it to save memory.
del train_ratings
user_rating_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,174585,174681,174711,174815,174893,175281,175475,175655,175795,176211
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,,,,,,,,,,,...,,,,,,,,,,
12,4.0,,,,,,,,,,...,,,,,,,,,,
17,,,,,,,,,,,...,,,,,,,,,,
19,,,,,,,,,,,...,,,,,,,,,,
20,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270886,,,,,,,,,,,...,,,,,,,,,,
270887,5.0,5.0,4.0,,,5.0,,,5.0,,...,,,,,,,,,,
270889,,,,,,,,,,,...,,,,,,,,,,
270893,4.0,,,,,,,,,,...,,,,,,,,,,


Fill NaN values with each movie's mean rating

In [15]:
# Mean of each movieId column (axis=0 averages down the rows, i.e. over users).
movie_means = user_rating_matrix.mean(axis=0)

# Replace every missing rating with the mean of its movie column, in place.
user_rating_matrix.fillna(movie_means, inplace=True)

Standardize the data (zero mean and unit variance for each movie column)

In [17]:
# Standardize every movie column with scikit-learn's StandardScaler.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# fit_transform returns a plain array, so remember the labels and
# re-attach them when wrapping the result back into a DataFrame.
user_ids, movie_ids = user_rating_matrix.index, user_rating_matrix.columns
user_rating_matrix = pd.DataFrame(
    scaler.fit_transform(user_rating_matrix),
    index=user_ids,
    columns=movie_ids,
)
user_rating_matrix


movieId,1,2,3,4,5,6,7,8,9,10,...,174585,174681,174711,174815,174893,175281,175475,175655,175795,176211
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,-8.965533e-16,1.353289e-15,3.393221e-15,-3.500687e-15,3.485071e-15,1.436108e-15,-5.360405e-15,-1.140510e-14,0.000000,-4.020847e-15,...,-2.660386e-14,0.0,0.0,0.0,0.0,5.152495e-13,1.570092e-13,0.0,1.968606e-14,0.0
12,2.266538e-01,1.353289e-15,3.393221e-15,-3.500687e-15,3.485071e-15,1.436108e-15,-5.360405e-15,-1.140510e-14,0.000000,-4.020847e-15,...,-2.660386e-14,0.0,0.0,0.0,0.0,5.152495e-13,1.570092e-13,0.0,1.968606e-14,0.0
17,-8.965533e-16,1.353289e-15,3.393221e-15,-3.500687e-15,3.485071e-15,1.436108e-15,-5.360405e-15,-1.140510e-14,0.000000,-4.020847e-15,...,-2.660386e-14,0.0,0.0,0.0,0.0,5.152495e-13,1.570092e-13,0.0,1.968606e-14,0.0
19,-8.965533e-16,1.353289e-15,3.393221e-15,-3.500687e-15,3.485071e-15,1.436108e-15,-5.360405e-15,-1.140510e-14,0.000000,-4.020847e-15,...,-2.660386e-14,0.0,0.0,0.0,0.0,5.152495e-13,1.570092e-13,0.0,1.968606e-14,0.0
20,2.266538e-01,1.353289e-15,3.393221e-15,-3.500687e-15,3.485071e-15,1.436108e-15,-5.360405e-15,-1.140510e-14,0.000000,-4.020847e-15,...,-2.660386e-14,0.0,0.0,0.0,0.0,5.152495e-13,1.570092e-13,0.0,1.968606e-14,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270886,-8.965533e-16,1.353289e-15,3.393221e-15,-3.500687e-15,3.485071e-15,1.436108e-15,-5.360405e-15,-1.140510e-14,0.000000,-4.020847e-15,...,-2.660386e-14,0.0,0.0,0.0,0.0,5.152495e-13,1.570092e-13,0.0,1.968606e-14,0.0
270887,2.245512e+00,5.356701e+00,3.136327e+00,-3.500687e-15,3.485071e-15,3.760608e+00,-5.360405e-15,-1.140510e-14,14.948677,-4.020847e-15,...,-2.660386e-14,0.0,0.0,0.0,0.0,5.152495e-13,1.570092e-13,0.0,1.968606e-14,0.0
270889,-8.965533e-16,1.353289e-15,3.393221e-15,-3.500687e-15,3.485071e-15,1.436108e-15,-5.360405e-15,-1.140510e-14,0.000000,-4.020847e-15,...,-2.660386e-14,0.0,0.0,0.0,0.0,5.152495e-13,1.570092e-13,0.0,1.968606e-14,0.0
270893,2.266538e-01,1.353289e-15,3.393221e-15,-3.500687e-15,3.485071e-15,1.436108e-15,-5.360405e-15,-1.140510e-14,0.000000,-4.020847e-15,...,-2.660386e-14,0.0,0.0,0.0,0.0,5.152495e-13,1.570092e-13,0.0,1.968606e-14,0.0


Calculate cosine similarity

In [19]:
# Item-based similarity: compare movies with each other.
from sklearn.metrics.pairwise import cosine_similarity

# Transposing puts one movie per row, so cosine_similarity compares movies;
# both axes of the result are therefore labelled with the movieId values.
movie_ids = user_rating_matrix.columns
collaborative_similarity = pd.DataFrame(
    cosine_similarity(user_rating_matrix.T),
    index=movie_ids,
    columns=movie_ids,
)

# The user-rating matrix is not needed any more; release it to save memory.
del user_rating_matrix
collaborative_similarity



movieId,1,2,3,4,5,6,7,8,9,10,...,174585,174681,174711,174815,174893,175281,175475,175655,175795,176211
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.131013,4.946678e-02,2.166942e-02,6.226784e-02,5.372020e-02,5.704072e-02,1.722044e-02,1.503198e-02,9.072584e-02,...,3.286735e-03,0.002764,4.859798e-03,-0.002547,-3.388079e-03,1.109600e-03,-2.214024e-03,-5.494321e-03,3.906163e-04,7.980239e-04
2,0.131013,1.000000,8.068421e-02,5.152423e-02,9.853588e-02,3.220918e-02,7.455671e-02,7.085357e-02,6.065864e-02,1.347949e-01,...,3.522253e-03,-0.001757,-8.522078e-04,0.009567,-7.059934e-04,-8.365644e-03,6.531870e-03,9.659949e-04,-1.432955e-03,-1.204565e-03
3,0.049467,0.080684,1.000000e+00,5.691386e-02,1.976939e-01,3.947937e-02,1.152761e-01,5.447363e-02,7.995153e-02,5.599260e-02,...,-1.104002e-03,0.000000,7.703720e-34,0.001995,7.703720e-34,-6.238095e-03,8.870874e-03,4.622232e-33,5.728145e-04,5.408279e-04
4,0.021669,0.051524,5.691386e-02,1.000000e+00,6.497487e-02,2.416495e-02,5.230800e-02,7.915610e-02,4.206456e-02,2.479556e-02,...,-1.234092e-02,0.000000,7.703720e-34,0.000000,0.000000e+00,-1.758007e-27,-5.694836e-28,-4.622232e-33,-7.122282e-29,-3.081488e-33
5,0.062268,0.098536,1.976939e-01,6.497487e-02,1.000000e+00,3.747637e-02,1.495192e-01,6.645053e-02,7.980631e-02,6.782164e-02,...,3.741808e-03,0.000000,0.000000e+00,0.002343,-3.802881e-03,1.748194e-27,5.688823e-28,3.081488e-33,-3.023509e-03,1.540744e-33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175281,0.001110,-0.008366,-6.238095e-03,-1.758007e-27,1.748194e-27,7.416713e-03,-2.647208e-27,-5.857792e-27,-1.145081e-29,-7.789647e-04,...,-1.372951e-26,0.000000,0.000000e+00,0.000000,0.000000e+00,1.000000e+00,-5.625687e-01,5.916457e-31,1.015731e-26,3.944305e-31
175475,-0.002214,0.006532,8.870874e-03,-5.694836e-28,5.688823e-28,2.168351e-28,-8.292777e-28,-1.792144e-27,-3.149281e-30,-3.673616e-03,...,-4.171291e-27,0.000000,0.000000e+00,0.000000,0.000000e+00,-5.625687e-01,1.000000e+00,1.972152e-31,3.085832e-27,9.860761e-32
175655,-0.005494,0.000966,4.622232e-33,-4.622232e-33,3.081488e-33,2.311116e-33,-6.162976e-33,-1.848893e-32,0.000000e+00,-6.162976e-33,...,9.485724e-02,0.000000,0.000000e+00,0.000000,0.000000e+00,5.916457e-31,1.972152e-31,1.000000e+00,2.113978e-01,0.000000e+00
175795,0.000391,-0.001433,5.728145e-04,-7.122282e-29,-3.023509e-03,2.266684e-03,-1.521344e-03,-2.253693e-28,-4.591417e-31,-7.795702e-29,...,4.893791e-02,0.000000,1.015056e-01,0.000000,0.000000e+00,1.015731e-26,3.085832e-27,2.113978e-01,1.000000e+00,6.162976e-33


Save the calculated similarity matrix

In [21]:
# Persist the movie-to-movie similarity DataFrame as a pickle file.
import pickle

# NOTE: only unpickle this file from a trusted source; loading a pickle
# can execute arbitrary code.
with open('app/exports/collaborative_similarity.pkl', 'wb') as pickle_file:
    pickle.dump(collaborative_similarity, pickle_file)