In [1]:
import numpy as np
import pandas as pd
import os
import sys
import pickle
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from importlib import reload
%matplotlib inline
from IPython.core.display import display, HTML, clear_output
display(HTML("<style>.container { width:80% !important; }</style>")) 

# Load Content Embeddings

In [2]:
# Load the content embeddings from `autoencoder_embeddings.pkl` and wrap them
# in a DataFrame.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
content_embeddings = pd.DataFrame(
    pd.read_pickle(os.path.join("..", "..", "data", "ml-20m", "autoencoder_embeddings.pkl"))
)
print(content_embeddings.shape)
content_embeddings.head()

(26744, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.050325,0.0,0.0,0.0,0.534813,0.087398,0.0,0.0,0.0,0.140658,...,0.026881,0.0,0.238451,0.0,0.0,0.0,0.333064,0.0,0.503451,0.0
1,0.0,0.0,0.0,0.0,0.464565,0.0,0.0,0.0,0.0,0.037817,...,0.0,0.0,0.129538,0.0,0.0,0.0,0.342558,0.0,0.406881,0.0
2,0.0,0.0,0.0,0.0,0.569222,0.0,0.0,0.0,0.0,0.07428,...,0.0,0.0,0.033404,0.0,0.0,0.0,0.34774,0.0,0.66096,0.0
3,0.001858,0.0,0.0,0.0,0.434674,0.131226,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.302579,0.0,0.500243,0.0
4,0.0,0.0,0.0,0.0,0.427863,0.136397,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.322434,0.0,0.233838,0.0


# Load Collaborative Embeddings

In [3]:
# Load the collaborative embeddings from `movie_embeddings_1.pkl`.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
collaborative_embeddings = pd.read_pickle(
    os.path.join("..", "..", "data", "ml-20m", "movie_embeddings_1.pkl")
)
print(collaborative_embeddings.shape)
collaborative_embeddings.head()

(26744, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.325341,-2.316274,-0.100806,-0.511121,-0.083659,-0.389862,-0.08631,0.714603,0.980253,-0.07737,...,0.385782,-0.188543,0.00993,0.590205,0.996824,-0.180804,-0.935786,-0.554848,-0.526218,1.968207
1,2.434166,-0.31038,0.509316,-0.750373,-3.01437,2.86943,1.759523,1.548208,-2.116433,-1.671408,...,-2.701217,3.308206,-3.202542,2.152793,3.233978,-2.374892,-1.705962,-3.337779,2.012516,0.346246
2,2.28081,-1.875792,1.443488,-2.113039,-2.818471,1.905172,1.479568,1.613045,-1.52865,-3.363268,...,-4.084937,3.510745,-2.57453,0.715931,3.27247,-2.894063,-3.461911,-1.858291,4.064014,0.573834
3,1.527529,-3.667315,2.302887,-0.199075,-4.247454,2.386378,2.566826,3.688847,-0.982091,-3.786057,...,-3.958512,3.970079,-3.332116,2.858457,3.311129,-5.052051,-3.828944,-2.011621,2.640216,1.158406
4,2.576652,-3.956768,3.677797,-1.258373,-4.804983,4.059122,4.225045,5.900877,-0.941438,-4.83197,...,-6.551518,5.977818,-5.873029,4.846278,4.900824,-2.635733,-5.753502,-5.207174,3.763796,2.092899


# Format Movie Lookup Data

In [4]:
# Load the pickled mapping `movie2idx`; below it is indexed by the `movieId`
# values from movies.csv / ratings.csv and its values become the new movieId.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('../../data/ml-20m/movie_to_idx.pkl', 'rb') as handle:
    movie2idx = pickle.load(handle)

In [5]:
movies = pd.read_csv(os.path.join("..", "..", "data", "ml-20m", "movies.csv"))
print("{} unique movies in movies.csv".format(len(movies.movieId.unique())))

ratings = pd.read_csv(os.path.join("..", "..", "data", "ml-20m", "ratings.csv"))
print("{} unique movies in ratings.csv".format(len(ratings.movieId.unique())))

# Aggregate the ratings down to one row per movie *before* joining them to the
# movie metadata. This avoids materialising a (ratings x metadata) frame and
# then grouping it twice just to recover per-movie statistics.
#   ratings_count -> number of ratings a movie received (popularity)
#   avg_rating    -> mean rating of the movie
rating_stats = ratings.groupby("movieId").agg({"userId": "count", "rating": "mean"})
rating_stats = rating_stats.rename(columns={"userId": "ratings_count", "rating": "avg_rating"})

# Inner join: keep only the movies that have at least one rating.
movies = movies.join(rating_stats, on="movieId", how="inner")

# Translate the raw ids through `movie2idx`; an id that is missing from the
# mapping raises KeyError instead of silently turning into NaN.
movies["movieId"] = movies["movieId"].apply(lambda x: movie2idx[x])

movies = movies[['movieId', 'title', 'genres', 'ratings_count', 'avg_rating']]
print("{} unique movies in embeddings".format(len(movies.movieId.unique())))
movies = movies.set_index('movieId', drop=True).sort_index(ascending=True)
print(movies.shape)
movies.head(5)

27278 unique movies in movies.csv
26744 unique movies in ratings.csv
26744 unique movies in embeddings
(26744, 4)


Unnamed: 0_level_0,title,genres,ratings_count,avg_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Jumanji (1995),Adventure|Children|Fantasy,22243,3.211977
1,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi,8520,3.95223
2,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,44980,3.898055
3,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,43249,4.053493
4,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,47006,4.334372


In [6]:
# Persist the per-movie lookup table (index = movieId) as a CSV file.
movies.to_csv('../../data/movie_demographics.csv')

In [7]:
# Exact-title lookup; the index value of the matching row is the movie's id.
movies.query('title == "Zodiac (2007)"')

Unnamed: 0_level_0,title,genres,ratings_count,avg_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3995,Zodiac (2007),Crime|Drama|Thriller,3907,3.675454


# Recommendations

In [8]:
# Import class

import os
import sys

# Put the `movie_recommender` directory on the module search path (once) so
# that `similarity` can be imported from it.
recommender_dir = os.path.join('..', '..', 'movie_recommender')
if recommender_dir not in sys.path:
    sys.path.append(recommender_dir)
del recommender_dir  # keep the notebook namespace clean

from similarity import SimilarityPredictions

In [42]:
def lookup_movie_id_by_title(movie_title, movies_df=None, regex=False):
    """Return the rows whose ``title`` contains ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Text to search for (case-sensitive).
    movies_df : pandas.DataFrame, optional
        Frame with a ``title`` column. Defaults to the notebook-level
        ``movies`` frame.
    regex : bool, default False
        Treat ``movie_title`` as a regular expression. The default is a
        literal match, because titles contain regex metacharacters such as
        the parentheses around the year: as a pattern, ``"Zodiac (2007)"``
        would match ``"Zodiac 2007"`` instead of the real title.

    Returns
    -------
    pandas.DataFrame
        The matching rows, index preserved (empty if nothing matches).
    """
    if movies_df is None:
        movies_df = movies
    # na=False: a missing title counts as "no match" rather than producing a
    # NaN in the mask, which cannot be used for boolean indexing.
    matches = movies_df["title"].str.contains(movie_title, regex=regex, na=False)
    return movies_df[matches]

In [57]:
# Find candidate rows (and their ids) for titles containing "Notebook".
lookup_movie_id_by_title("Notebook")

Unnamed: 0_level_0,title,genres,ratings_count,avg_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1764,"Notebook, The (2004)",Drama|Romance,3968,3.804057
15131,"Notebook, The (A nagy füzet) (2013)",Drama|War,10,3.7


In [88]:
# Seed movie ids: index values of the `movies` frame, passed as `movie_id`
# to the recommendation helpers below.
primer = 3006 
lotr = 131 #fellowship of the ring
inception = 2087
zodiac = 3995 
pulp_fiction = 11
notebook = 1764


In [54]:
def get_detailed_recs(movie_id, embeddings, file_path, n=20):
    """Return the movies most similar to ``movie_id`` and save them as CSV.

    Parameters
    ----------
    movie_id : int
        Seed item, passed to ``predict_similar_items`` as ``seed_item``.
    embeddings : array-like or pandas.DataFrame
        Item embeddings handed to ``SimilarityPredictions``.
    file_path : str
        Destination of the CSV copy of the result.
    n : int, default 20
        Number of similar items requested from the model and the maximum
        number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Rows of the notebook-level ``movies`` frame joined with a
        ``similarity_score`` column, sorted by descending score.
    """
    # get similar movies
    sim_model = SimilarityPredictions(embeddings, similarity_metric="cosine")
    neighbours = pd.DataFrame(sim_model.predict_similar_items(seed_item=movie_id, n=n))
    neighbours = neighbours.set_index('item_id')

    # Attach title/genres/rating statistics via the shared movie-id index.
    recs = (
        pd.merge(movies, neighbours, left_index=True, right_index=True)
        .sort_values('similarity_score', ascending=False)
        .head(n)
    )

    # save recs locally
    # NOTE(review): index=False leaves the movie-id index out of the CSV -
    # confirm that this is intended.
    recs.to_csv(file_path, index=False, header=True)
    return recs

In [72]:
def _similarity_frame(movie_id, embeddings, score_column):
    """Score the items in ``embeddings`` against ``movie_id`` and join the movie metadata."""
    sim_model = SimilarityPredictions(embeddings, similarity_metric="cosine")
    # Request as many neighbours as there are embedding rows instead of a
    # hard-coded catalogue size, so both rankings can be averaged per movie.
    output = sim_model.predict_similar_items(seed_item=movie_id, n=len(embeddings))
    scores = pd.DataFrame(output).set_index('item_id')
    frame = pd.merge(movies, scores, left_index=True, right_index=True)
    frame = frame.sort_values('similarity_score', ascending=False)
    # index=str converts the index labels to strings; every frame produced
    # here gets the same treatment, so the labels still line up when merged.
    return frame.rename(index=str, columns={"similarity_score": score_column})


def get_ensemble_recs(movie_id, content_embeddings, collaborative_embeddings, file_path, n=20):
    """Recommend movies by averaging content and collaborative similarity.

    Parameters
    ----------
    movie_id : int
        Seed item, passed to ``predict_similar_items`` as ``seed_item``.
    content_embeddings, collaborative_embeddings : array-like or pandas.DataFrame
        The two embedding sets handed to ``SimilarityPredictions``.
    file_path : str
        Destination of the CSV copy of the result.
    n : int, default 20
        Maximum number of rows returned and saved.

    Returns
    -------
    pandas.DataFrame
        Rows of the notebook-level ``movies`` frame with
        ``collaborative_similarity_score``, ``content_similarity_score`` and
        their mean ``average_similarity_score``, sorted by descending average.
        The index labels are strings.
    """
    # get similar movies from content
    sim_df_cont = _similarity_frame(movie_id, content_embeddings, "content_similarity_score")

    # get similar movies from collaborative
    sim_df_coll = _similarity_frame(
        movie_id, collaborative_embeddings, "collaborative_similarity_score"
    )

    # ensemble results: unweighted mean of the two scores per movie
    sim_df_avg = pd.merge(
        sim_df_coll,
        sim_df_cont[['content_similarity_score']],
        left_index=True,
        right_index=True,
    )
    sim_df_avg['average_similarity_score'] = (
        sim_df_avg['content_similarity_score'] + sim_df_avg['collaborative_similarity_score']
    ) / 2
    top_recs = sim_df_avg.sort_values('average_similarity_score', ascending=False).head(n)

    # save recs locally
    # NOTE(review): index=False leaves the movie-id index out of the CSV -
    # confirm that this is intended.
    top_recs.to_csv(file_path, index=False, header=True)
    return top_recs

## Lord of the Rings, Fellowship of the Ring

### Collaborative Recommendations

In [73]:
# Neighbours of the `lotr` seed in the collaborative embedding space; the result is also written to the given CSV.
get_detailed_recs(lotr, collaborative_embeddings, '../../data/collaborative_recs_lotr.csv')

Unnamed: 0,title,genres,ratings_count,avg_rating,similarity_score
131,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,37553,4.137925,1.0
9,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,54502,4.190672,0.978414
31,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,43295,4.219009,0.976552
158,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,31577,4.142382,0.974713
30,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,45313,4.188202,0.974589
652,Schindler's List (1993),Drama|War,50054,4.310175,0.972036
142,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,33947,4.107521,0.969416
12,"Shawshank Redemption, The (1994)",Crime|Drama,63366,4.44699,0.968162
302,Saving Private Ryan (1998),Action|Drama|War,37110,4.064417,0.965546
1352,Good Will Hunting (1997),Drama|Romance,28324,4.032517,0.963929


### Content Recommendations

In [74]:
# Neighbours of the `lotr` seed in the content embedding space; the result is also written to the given CSV.
get_detailed_recs(lotr, content_embeddings, '../../data/content_recs_lotr.csv')

Unnamed: 0,title,genres,ratings_count,avg_rating,similarity_score
131,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,37553,4.137925,1.0
142,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,33947,4.107521,0.986169
929,"Chronicles of Narnia: The Lion, the Witch and ...",Adventure|Children|Fantasy,7112,3.478839,0.98222
72,"Dark Crystal, The (1982)",Adventure|Fantasy,7980,3.590977,0.975679
158,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,31577,4.142382,0.96832
126,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy,17239,3.614334,0.96784
5367,Tales from Earthsea (Gedo Senki) (2006),Adventure|Animation|Fantasy,131,3.167939,0.961316
14373,Almighty Thor (2011),Adventure|Fantasy,19,3.236842,0.959171
1804,"Chronicles of Narnia: Prince Caspian, The (2008)",Adventure|Children|Fantasy,1551,3.323985,0.957583
19513,"Magic Voyage of Sindbad, The (Sadko) (1953)",Adventure|Fantasy,3,3.333333,0.956919


### Averaged Ensemble Recommendations

In [75]:
# Rank by the mean of content and collaborative similarity to the `lotr` seed; the result is also written to the given CSV.
get_ensemble_recs(lotr, content_embeddings, collaborative_embeddings, '../../data/ensemble_recs_lotr.csv')

Unnamed: 0,title,genres,ratings_count,avg_rating,collaborative_similarity_score,content_similarity_score,average_similarity_score
131,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,37553,4.137925,1.0,1.0,1.0
142,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,33947,4.107521,0.969416,0.986169,0.977792
158,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,31577,4.142382,0.974713,0.96832,0.971517
30,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,45313,4.188202,0.974589,0.920829,0.947709
9,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,54502,4.190672,0.978414,0.911576,0.944995
261,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,32586,4.176732,0.961415,0.909933,0.935674
31,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,43295,4.219009,0.976552,0.891464,0.934008
186,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,46839,4.004622,0.956912,0.910364,0.933638
1007,WALL·E (2008),Adventure|Animation|Children|Romance|Sci-Fi,12176,4.038929,0.948095,0.913246,0.930671
913,Batman Begins (2005),Action|Crime|IMAX,18686,3.970754,0.959914,0.8905,0.925207


## Primer

### Collaborative Recommendations

In [96]:
# Neighbours of the `primer` seed in the collaborative embedding space; the result is also written to the given CSV.
get_detailed_recs(primer, collaborative_embeddings, '../../data/collaborative_recs_primer.csv')

Unnamed: 0,title,genres,ratings_count,avg_rating,similarity_score
3006,Primer (2004),Drama|Sci-Fi,2643,3.85263,1.0
2419,"Man Who Would Be King, The (1975)",Adventure|Drama,3607,4.0402,0.912031
5265,Cosmos (1980),Documentary,936,4.22062,0.911804
5140,"Times of Harvey Milk, The (1984)",Documentary,250,3.904,0.909714
5274,Louis C.K.: Oh My God (2013),Comedy,284,4.040493,0.908629
8180,Alone in the Wilderness (2004),Documentary,189,3.955026,0.908325
3179,Once Were Warriors (1994),Crime|Drama,4042,3.995299,0.907113
5141,"Decalogue, The (Dekalog) (1989)",Crime|Drama|Romance,402,4.174129,0.906468
3308,"Children of Heaven, The (Bacheha-Ye Aseman) (1...",Comedy|Drama,712,4.017556,0.906056
5096,Shoah (1985),Documentary|War,220,4.027273,0.906014


### Content Recommendations

In [95]:
# Neighbours of the `primer` seed in the content embedding space; the result is also written to the given CSV.
get_detailed_recs(primer, content_embeddings, '../../data/content_recs_primer.csv')

Unnamed: 0,title,genres,ratings_count,avg_rating,similarity_score
3006,Primer (2004),Drama|Sci-Fi,2643,3.85263,1.0
3071,Looper (2012),Action|Crime|Sci-Fi,2855,3.770403,0.997139
964,"Last Mimzy, The (2007)",Adventure|Children|Sci-Fi,431,3.264501,0.99696
6558,Predestination (2014),Sci-Fi|Thriller,318,3.657233,0.995283
11793,Trancers (1985),Action|Sci-Fi,40,3.0875,0.994775
2911,"Final Countdown, The (1980)",Action|Sci-Fi,533,3.386492,0.994298
2,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,44980,3.898055,0.994169
7040,When Worlds Collide (1951),Sci-Fi,334,3.510479,0.99416
7670,"Time Traveler's Wife, The (2009)",Drama|Romance|Sci-Fi,820,3.565244,0.994059
97,Time Bandits (1981),Adventure|Comedy|Fantasy|Sci-Fi,8595,3.594823,0.993732


### Averaged Ensemble Recommendations

In [94]:
# Rank by the mean of content and collaborative similarity to the `primer` seed; the result is also written to the given CSV.
get_ensemble_recs(primer, content_embeddings, collaborative_embeddings, '../../data/ensemble_recs_primer.csv')

Unnamed: 0,title,genres,ratings_count,avg_rating,collaborative_similarity_score,content_similarity_score,average_similarity_score
3006,Primer (2004),Drama|Sci-Fi,2643,3.85263,1.0,1.0,1.0
2419,"Man Who Would Be King, The (1975)",Adventure|Drama,3607,4.0402,0.912031,0.972647,0.942339
2,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,44980,3.898055,0.887107,0.994169,0.940638
3338,"Man Who Knew Too Much, The (1956)",Adventure|Drama|Mystery|Thriller,2278,3.875329,0.903752,0.972968,0.93836
849,Nausicaä of the Valley of the Wind (Kaze no ta...,Adventure|Animation|Drama|Fantasy|Sci-Fi,3334,4.092082,0.892734,0.983479,0.938106
3179,Once Were Warriors (1994),Crime|Drama,4042,3.995299,0.907113,0.969071,0.938092
5141,"Decalogue, The (Dekalog) (1989)",Crime|Drama|Romance,402,4.174129,0.906468,0.96698,0.936724
3308,"Children of Heaven, The (Bacheha-Ye Aseman) (1...",Comedy|Drama,712,4.017556,0.906056,0.966392,0.936224
6363,"Man Who Planted Trees, The (Homme qui plantait...",Animation|Drama,196,3.971939,0.891085,0.97924,0.935163
6480,"Timecrimes (Cronocrímenes, Los) (2007)",Sci-Fi|Thriller,695,3.794245,0.882338,0.987821,0.93508
