In [1]:
import pandas as pd

# Read the two inputs: the movie metadata export and the user ratings table.
metadata = pd.read_csv(filepath_or_buffer="app/exports/metadata.csv")
ratings = pd.read_csv(filepath_or_buffer="./datasets/ratings.csv")


In [2]:
# Display the column labels of the loaded metadata frame.
metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'movieId', 'imdbId', 'tmdbId'],
      dtype='object')

In [3]:
# Narrow metadata to the movie id, its title and its vote count.
metadata = metadata.loc[:, ['movieId', 'title', 'vote_count']]

In [4]:
# Drop rows with any missing value. Reassign rather than using inplace=True:
# `metadata` is itself derived from another frame, and mutating a derived frame
# in place is the pattern that triggers pandas' SettingWithCopyWarning.
# The resulting rows are the same.
metadata = metadata.dropna()

In [5]:
# Keep only the ratings whose movieId also appears in metadata.
ratings = ratings.loc[ratings['movieId'].isin(metadata['movieId'])]

In [6]:
# Count the ratings each movie received and keep only the movies with
# more than 1000 of them (a movie with exactly 1000 is dropped).
rating_count = (
    ratings.groupby('movieId')['rating']
    .count()
    .reset_index()
    .loc[lambda counts: counts['rating'] > 1000]
)

# Restrict the ratings table to those frequently rated movies.
ratings = ratings.loc[ratings['movieId'].isin(rating_count['movieId'])]


In [7]:
# Draw 100 ratings per movie (fixed seed, so the draw is reproducible).
movies_in_ratings = rating_count['movieId'].tolist()

# Group once so each movie's rows come from a precomputed index instead of
# re-scanning the whole ratings table with a boolean mask for every movie.
ratings_by_movie = ratings.groupby('movieId')
sampled_frames = [
    ratings_by_movie.get_group(movie).sample(100, random_state=42)
    for movie in movies_in_ratings
]

# Concatenate a single time: calling pd.concat inside the loop re-copies the
# accumulated frame on every iteration. pd.concat rejects an empty list, so
# fall back to an empty frame when there is nothing to sample.
train_ratings = (
    pd.concat(sampled_frames, ignore_index=True)
    if sampled_frames
    else pd.DataFrame()
)

train_ratings.shape

(371700, 4)

In [8]:
# Reshape the sampled ratings into a user x movie matrix: one row per userId,
# one column per movieId, cells holding the rating (NaN where there is none).
user_rating_matrix = train_ratings.pivot_table(
    values='rating',
    index='userId',
    columns='movieId',
)
user_rating_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,162606,163645,164179,164909,166461,166528,166635,166643,168250,168252
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,
17,,,,,,,,,,,...,,,,,,,,,,
20,,,,,,,,,,,...,,,,,,,,,,
23,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270879,,,,,,,,,,,...,,,,,,,,,,
270885,,,,,,,,,,,...,,,,,,,,,,
270887,,,,,,,,,,,...,,,,,,,,,,
270893,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# Impute every missing cell with the mean of its own column, i.e. the
# movie's average rating across the users who did rate it.
user_rating_matrix = user_rating_matrix.fillna(
    value=user_rating_matrix.mean(axis='index')
)
user_rating_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,162606,163645,164179,164909,166461,166528,166635,166643,168250,168252
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,3.93,3.34,3.17,2.93,3.15,3.735,3.27,3.06,2.915,3.335,...,3.68,3.785,4.11,3.86,3.81,3.84,3.545,3.86,4.07,3.97
11,3.93,3.34,3.17,2.93,3.15,3.735,3.27,3.06,2.915,3.335,...,3.68,3.785,4.11,3.86,3.81,3.84,3.545,3.86,4.07,3.97
17,3.93,3.34,3.17,2.93,3.15,3.735,3.27,3.06,2.915,3.335,...,3.68,3.785,4.11,3.86,3.81,3.84,3.545,3.86,4.07,3.97
20,3.93,3.34,3.17,2.93,3.15,3.735,3.27,3.06,2.915,3.335,...,3.68,3.785,4.11,3.86,3.81,3.84,3.545,3.86,4.07,3.97
23,3.93,3.34,3.17,2.93,3.15,3.735,3.27,3.06,2.915,3.335,...,3.68,3.785,4.11,3.86,3.81,3.84,3.545,3.86,4.07,3.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270879,3.93,3.34,3.17,2.93,3.15,3.735,3.27,3.06,2.915,3.335,...,3.68,3.785,4.11,3.86,3.81,3.84,3.545,3.86,4.07,3.97
270885,3.93,3.34,3.17,2.93,3.15,3.735,3.27,3.06,2.915,3.335,...,3.68,3.785,4.11,3.86,3.81,3.84,3.545,3.86,4.07,3.97
270887,3.93,3.34,3.17,2.93,3.15,3.735,3.27,3.06,2.915,3.335,...,3.68,3.785,4.11,3.86,3.81,3.84,3.545,3.86,4.07,3.97
270893,3.93,3.34,3.17,2.93,3.15,3.735,3.27,3.06,2.915,3.335,...,3.68,3.785,4.11,3.86,3.81,3.84,3.545,3.86,4.07,3.97


In [10]:
# standardize the data
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the matrix, then rebuild a DataFrame from the transformed
# array so the userId index and movieId column labels are preserved.
scaler = StandardScaler().fit(user_rating_matrix)
user_rating_matrix = pd.DataFrame(
    scaler.transform(user_rating_matrix),
    index=user_rating_matrix.index,
    columns=user_rating_matrix.columns,
)
user_rating_matrix


movieId,1,2,3,4,5,6,7,8,9,10,...,162606,163645,164179,164909,166461,166528,166635,166643,168250,168252
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,-1.632435e-14,-3.138102e-14,0.0,-1.445366e-14,2.586979e-14,0.0,-6.419592e-14,-1.518631e-14,1.448883e-14,-3.575960e-14,...,-3.669336e-14,7.979265e-14,-3.403370e-14,-1.536636e-14,-5.049100e-14,-3.156850e-14,-4.200318e-14,1.642680e-14,0.0,0.0
11,-1.632435e-14,-3.138102e-14,0.0,-1.445366e-14,2.586979e-14,0.0,-6.419592e-14,-1.518631e-14,1.448883e-14,-3.575960e-14,...,-3.669336e-14,7.979265e-14,-3.403370e-14,-1.536636e-14,-5.049100e-14,-3.156850e-14,-4.200318e-14,1.642680e-14,0.0,0.0
17,-1.632435e-14,-3.138102e-14,0.0,-1.445366e-14,2.586979e-14,0.0,-6.419592e-14,-1.518631e-14,1.448883e-14,-3.575960e-14,...,-3.669336e-14,7.979265e-14,-3.403370e-14,-1.536636e-14,-5.049100e-14,-3.156850e-14,-4.200318e-14,1.642680e-14,0.0,0.0
20,-1.632435e-14,-3.138102e-14,0.0,-1.445366e-14,2.586979e-14,0.0,-6.419592e-14,-1.518631e-14,1.448883e-14,-3.575960e-14,...,-3.669336e-14,7.979265e-14,-3.403370e-14,-1.536636e-14,-5.049100e-14,-3.156850e-14,-4.200318e-14,1.642680e-14,0.0,0.0
23,-1.632435e-14,-3.138102e-14,0.0,-1.445366e-14,2.586979e-14,0.0,-6.419592e-14,-1.518631e-14,1.448883e-14,-3.575960e-14,...,-3.669336e-14,7.979265e-14,-3.403370e-14,-1.536636e-14,-5.049100e-14,-3.156850e-14,-4.200318e-14,1.642680e-14,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270879,-1.632435e-14,-3.138102e-14,0.0,-1.445366e-14,2.586979e-14,0.0,-6.419592e-14,-1.518631e-14,1.448883e-14,-3.575960e-14,...,-3.669336e-14,7.979265e-14,-3.403370e-14,-1.536636e-14,-5.049100e-14,-3.156850e-14,-4.200318e-14,1.642680e-14,0.0,0.0
270885,-1.632435e-14,-3.138102e-14,0.0,-1.445366e-14,2.586979e-14,0.0,-6.419592e-14,-1.518631e-14,1.448883e-14,-3.575960e-14,...,-3.669336e-14,7.979265e-14,-3.403370e-14,-1.536636e-14,-5.049100e-14,-3.156850e-14,-4.200318e-14,1.642680e-14,0.0,0.0
270887,-1.632435e-14,-3.138102e-14,0.0,-1.445366e-14,2.586979e-14,0.0,-6.419592e-14,-1.518631e-14,1.448883e-14,-3.575960e-14,...,-3.669336e-14,7.979265e-14,-3.403370e-14,-1.536636e-14,-5.049100e-14,-3.156850e-14,-4.200318e-14,1.642680e-14,0.0,0.0
270893,-1.632435e-14,-3.138102e-14,0.0,-1.445366e-14,2.586979e-14,0.0,-6.419592e-14,-1.518631e-14,1.448883e-14,-3.575960e-14,...,-3.669336e-14,7.979265e-14,-3.403370e-14,-1.536636e-14,-5.049100e-14,-3.156850e-14,-4.200318e-14,1.642680e-14,0.0,0.0


In [11]:
# item based cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Transpose so each movie is a row vector of user ratings, compute pairwise
# cosine similarity between movies, and label both axes with the movieIds.
collaborative_similarity = pd.DataFrame(
    cosine_similarity(user_rating_matrix.T),
    index=user_rating_matrix.columns,
    columns=user_rating_matrix.columns,
)
collaborative_similarity

movieId,1,2,3,4,5,6,7,8,9,10,...,162606,163645,164179,164909,166461,166528,166635,166643,168250,168252
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000e+00,5.111447e-28,-7.703720e-34,2.355790e-28,-4.253606e-28,-4.102231e-32,1.044925e-27,2.485394e-28,-2.363318e-28,5.869954e-28,...,6.013311e-28,-1.301465e-27,5.555836e-28,2.505680e-28,8.291171e-28,5.151998e-28,6.848543e-28,-2.676777e-28,2.388153e-31,1.606226e-31
2,5.111447e-28,1.000000e+00,-5.700753e-32,4.539058e-28,-8.087920e-28,-7.934831e-32,2.007548e-27,-1.027011e-03,-4.548476e-28,1.120734e-27,...,1.153889e-27,-2.502903e-27,1.069220e-27,4.813689e-28,1.584737e-27,9.896871e-28,1.316804e-27,-5.151768e-28,3.297192e-31,2.757932e-31
3,-7.703720e-34,-5.700753e-32,1.000000e+00,-8.474092e-33,5.084455e-32,0.000000e+00,-3.512896e-31,-5.161492e-32,8.747904e-03,-1.346819e-03,...,-1.802670e-31,2.341931e-31,2.157042e-32,-3.851860e-32,-2.157042e-31,-1.001484e-31,1.078521e-32,9.783724e-32,0.000000e+00,0.000000e+00
4,2.355790e-28,4.539058e-28,-8.474092e-33,1.000000e+00,-3.727005e-28,-6.586680e-32,9.260279e-28,-4.434628e-05,-6.452201e-04,5.176467e-28,...,5.306210e-28,-1.159635e-27,4.930207e-28,1.303213e-02,7.283711e-28,4.568637e-28,6.089920e-28,-2.376536e-28,1.648596e-31,1.117039e-31
5,-4.253606e-28,-8.087920e-28,5.084455e-32,-3.727005e-28,1.000000e+00,1.020743e-31,-1.663453e-27,-3.911304e-28,-1.149434e-04,-9.268667e-28,...,-9.509732e-28,2.062501e-27,-8.783142e-28,-3.949411e-28,-1.304976e-27,-8.170731e-28,-1.086961e-27,4.268496e-28,-2.665487e-31,-2.465190e-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166528,5.151998e-28,9.896871e-28,-1.001484e-31,4.568637e-28,-8.170731e-28,-1.163262e-31,2.024362e-27,4.771751e-28,-4.565312e-28,1.126127e-27,...,3.752255e-02,4.710599e-02,1.078254e-27,4.847053e-28,2.078620e-02,1.000000e+00,-1.777456e-03,-1.316102e-02,4.329584e-03,1.091577e-03
166635,6.848543e-28,1.316804e-27,1.078521e-32,6.089920e-28,-1.086961e-27,-2.018375e-31,2.695761e-27,6.391201e-28,-6.093723e-28,1.500584e-27,...,1.000878e-04,8.094928e-03,-4.022767e-04,1.935107e-02,2.056782e-03,-1.777456e-03,1.000000e+00,-6.878005e-28,4.837936e-31,2.579178e-04
166643,-2.676777e-28,-5.151768e-28,9.783724e-32,-2.376536e-28,4.268496e-28,6.278532e-32,-1.053140e-27,-2.508852e-28,2.377976e-28,-5.891658e-28,...,-6.036706e-28,1.262286e-03,5.552337e-02,-1.251631e-02,1.659732e-03,-1.316102e-02,-6.878005e-28,1.000000e+00,-1.226460e-03,-2.609683e-03
168250,2.388153e-31,3.297192e-31,0.000000e+00,1.648596e-31,-2.665487e-31,0.000000e+00,7.950239e-31,1.155558e-31,-1.995263e-31,2.804154e-31,...,-3.780801e-03,3.099018e-02,1.190721e-02,6.277598e-04,2.117232e-02,4.329584e-03,4.837936e-31,-1.226460e-03,1.000000e+00,1.967670e-04


In [12]:
def get_collaborative_similarity(movies, similarity=None, neutral_rating=2.5):
    """Score every movie against a set of rated movies.

    Each rated movie contributes its similarity column, weighted by how far
    its rating lies from ``neutral_rating``: ratings above the neutral point
    add the similarity, ratings below it subtract it. The weighted columns
    are then summed row-wise.

    Parameters
    ----------
    movies : list of tuple
        ``[(movieId, rating), (movieId, rating), ...]``.
    similarity : pandas.DataFrame, optional
        Item-item similarity matrix whose columns are labelled by movieId.
        Defaults to the notebook-level ``collaborative_similarity``.
    neutral_rating : float, optional
        Rating treated as "no preference" (weight 0). Defaults to 2.5.

    Returns
    -------
    pandas.Series
        One score per row of ``similarity``, indexed like its rows.

    Raises
    ------
    KeyError
        If a movieId in ``movies`` is not a column of ``similarity``.
    """
    if similarity is None:
        similarity = collaborative_similarity

    movie_ids = [movie[0] for movie in movies]
    # Named `weights` (not `ratings`) so the notebook-level ratings table
    # is not shadowed inside the function.
    weights = [movie[1] - neutral_rating for movie in movies]

    # Multiplying by a list scales each selected column by the weight at the
    # same position; the row-wise sum collapses them into one score per movie.
    weighted = similarity[movie_ids] * weights
    return weighted.sum(axis=1)

# Example: score all movies for a user who rated movies 149406 and 87876
# with 5 each.
get_collaborative_similarity([
    (149406, 5),
    (87876, 5),
])



movieId
1        -2.073546e-27
2        -3.968333e-27
3         2.349635e-31
4        -1.824718e-27
5         3.270548e-27
              ...     
166528    1.065860e-03
166635    2.837839e-02
166643    1.269778e-02
168250    2.679442e-02
168252    1.001924e-02
Length: 3717, dtype: float64

In [14]:
def get_recommendations(movies):
    """Rank all movies by collaborative score, best first.

    movies : [(movieId, rating), (movieId, rating), ...]

    Returns a list of ``(movieId, score)`` tuples sorted by score in
    descending order. No filtering is applied, so the movies passed in
    are part of the returned ranking as well.
    """
    scores = get_collaborative_similarity(movies)

    # Pair each movieId with its score, then order the pairs by score.
    ranked = list(zip(scores.index, scores))
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    return ranked

# Build the movieId -> title lookup once instead of filtering the whole
# metadata frame for every recommended movie. drop_duplicates keeps the first
# row per movieId, which is the row `.values[0]` would have picked.
title_by_id = (
    metadata.drop_duplicates(subset='movieId')
    .set_index('movieId')['title']
    .to_dict()
)

for movie, val in get_recommendations([(149406,5), (87876, 5)]):
    print(movie, '\t', val, '\t', title_by_id[movie])

87876 	 2.616708470660488 	 Cars 2
149406 	 2.6167084706604857 	 Kung Fu Panda 3
66297 	 0.24104853861825265 	 Futurama: Into the Wild Green Yonder
112175 	 0.23542404178281354 	 How to Train Your Dragon 2
79592 	 0.2317592478181799 	 The Other Guys
8958 	 0.21309710059869175 	 Ray
4262 	 0.21207737014614172 	 Scarface
134130 	 0.1896755299914145 	 The Martian
8965 	 0.18915203261873034 	 The Polar Express
48394 	 0.18328939070207242 	 Pan's Labyrinth
74530 	 0.172405708438515 	 Percy Jackson & the Olympians: The Lightning Thief
616 	 0.17191366036724134 	 The Aristocats
40278 	 0.1674548909581449 	 Jarhead
8866 	 0.16595216385576084 	 Wimbledon
4823 	 0.163064560244548 	 Serendipity
152077 	 0.16259936751620993 	 10 Cloverfield Lane
91535 	 0.16070272656971185 	 The Bourne Legacy
79428 	 0.15986297506404906 	 Dinner for Schmucks
112171 	 0.15937285982155436 	 The Equalizer
106002 	 0.15911039479755024 	 Ender's Game
70293 	 0.15174316208269994 	 Julie & Julia
45726 	 0.148645104212063

In [16]:
# save the model
import pickle

# Serialize the similarity matrix to the app's exports folder.
with open('app/exports/collaborative_similarity.pkl', 'wb') as model_file:
    pickle.Pickler(model_file).dump(collaborative_similarity)