In [29]:
import pandas as pd
import numpy as np

# machine-learning libraries
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise
from sklearn.metrics.pairwise import cosine_similarity

# miscellaneous
import pickle

In [30]:
movie_df = pd.read_csv('./data/ml-latest-small/movies.csv')
rating_df = pd.read_csv('./data/ml-latest-small/ratings.csv')
link_df = pd.read_csv('./data/ml-latest-small/links.csv')
tag_df = pd.read_csv('./data/ml-latest-small/tags.csv')

In [31]:
rating_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [32]:
rating_df = rating_df.rename(columns={'movieId': 'movie_id'})
movie_df = movie_df.rename(columns={'movieId': 'movie_id'})
rating_df = rating_df.rename(columns={'userId': 'user_id'})

In [33]:
# calculate the number of ratings per movie
rating_count = rating_df.groupby('movie_id')[['rating']].count()
rating_count

Unnamed: 0_level_0,rating
movie_id,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
...,...
193581,1
193583,1
193585,1
193587,1


In [34]:
# filter for movies with more than 20 ratings and extract the index
popular_movies = rating_count[rating_count['rating']>20].index
popular_movies

Int64Index([     1,      2,      3,      5,      6,      7,     10,     11,
                16,     17,
            ...
            122920, 122922, 134130, 134853, 139385, 148626, 152081, 164179,
            166528, 168252],
           dtype='int64', name='movie_id', length=1235)

In [35]:
# filter the ratings matrix and only keep the popular movies
df = rating_df[rating_df['movie_id'].isin(popular_movies)].copy()
df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100803,610,148626,4.0,1493847175
100808,610,152081,4.0,1493846503
100829,610,164179,5.0,1493845631
100830,610,166528,4.0,1493879365


In [36]:
rating_df.shape, df.shape

((100836, 4), (66658, 4))

In [37]:
# need to remake user ids and movie ids since they are not sequential
user_ids = df['user_id'].unique()
user_ids

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

In [38]:
user_id_map = {v:k for k,v in enumerate(user_ids)}
df['user_id'] = df['user_id'].map(user_id_map)
df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,1,4.0,964982703
1,0,3,4.0,964981247
2,0,6,4.0,964982224
3,0,47,5.0,964983815
4,0,50,5.0,964982931
...,...,...,...,...
100803,609,148626,4.0,1493847175
100808,609,152081,4.0,1493846503
100829,609,164179,5.0,1493845631
100830,609,166528,4.0,1493879365


In [39]:
movie_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [40]:
#similarly for the movie_id:
movie_ids = df['movie_id'].unique()
movie_ids


array([   1,    3,    6, ..., 4247, 2986, 2013])

In [41]:
movie_id_map = {v:k for k,v in enumerate(movie_ids)}
df['movie_id'] = df['movie_id'].map(movie_id_map)
df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,0,4.0,964982703
1,0,1,4.0,964981247
2,0,2,4.0,964982224
3,0,3,5.0,964983815
4,0,4,5.0,964982931
...,...,...,...,...
100803,609,808,4.0,1493847175
100808,609,643,4.0,1493846503
100829,609,809,5.0,1493845631
100830,609,644,4.0,1493879365


In [None]:
#movie_id_map = {v:k for k,v in enumerate(movie_ids)}
#df['movie_id'] = df['movie_id'].map(movie_id_map)

#movie_id_map = {v:k for k,v in enumerate(movie_df['movie_id'].unique())}
#movie_df['movie_id'] =movie_df['movie_id'].map(movie_id_map)
#movie_df

In [42]:
#filter out unpopular movies
movies = movie_df[movie_df['movie_id'].isin(movie_ids)]
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
...,...,...,...
9162,148626,"Big Short, The (2015)",Drama
9223,152081,Zootopia (2016),Action|Adventure|Animation|Children|Comedy
9392,164179,Arrival (2016),Sci-Fi
9433,166528,Rogue One: A Star Wars Story (2016),Action|Adventure|Fantasy|Sci-Fi


In [43]:
# redefine movie ids
movies['movie_id'] = movies['movie_id'].map(movie_id_map)
movies

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['movie_id'] = movies['movie_id'].map(movie_id_map)


Unnamed: 0,movie_id,title,genres
0,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,325,Jumanji (1995),Adventure|Children|Fantasy
2,1,Grumpier Old Men (1995),Comedy|Romance
4,326,Father of the Bride Part II (1995),Comedy
5,2,Heat (1995),Action|Crime|Thriller
...,...,...,...
9162,808,"Big Short, The (2015)",Drama
9223,643,Zootopia (2016),Action|Adventure|Animation|Children|Comedy
9392,809,Arrival (2016),Sci-Fi
9433,644,Rogue One: A Star Wars Story (2016),Action|Adventure|Fantasy|Sci-Fi


In [44]:
movie_title = movies.sort_values('movie_id')['title']
movie_title

0                     Toy Story (1995)
2              Grumpier Old Men (1995)
5                          Heat (1995)
43         Seven (a.k.a. Se7en) (1995)
46          Usual Suspects, The (1995)
                     ...              
1435        Terms of Endearment (1983)
2968               Little Nicky (2000)
3158                   Joe Dirt (2001)
2249                  RoboCop 2 (1990)
1488    Poseidon Adventure, The (1972)
Name: title, Length: 1235, dtype: object

In [45]:
from scipy.sparse import csr_matrix
R = csr_matrix((df['rating'], (df['user_id'], df['movie_id'])))

In [46]:
R.shape

(610, 1235)

In [47]:
R.todense()

matrix([[4. , 4. , 4. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 0. ],
        ...,
        [2.5, 2. , 0. , ..., 1. , 1.5, 0. ],
        [3. , 0. , 0. , ..., 0. , 0. , 0. ],
        [5. , 0. , 5. , ..., 3. , 0. , 0. ]])

In [49]:
Rating = pd.DataFrame(R.todense(), columns=movie_title)
Rating

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
0,4.0,4.0,4.0,5.0,5.0,3.0,5.0,4.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2.5,0.0,0.0,3.0,4.5,4.0,0.0,3.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
606,4.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,2.5,2.0,0.0,4.5,4.5,3.0,0.0,4.0,0.0,3.0,...,0.0,0.5,0.0,0.0,4.5,0.0,2.5,1.0,1.5,0.0
608,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Model**

In [50]:
# initialize the unsupervised model NearestNeighbors
model = NearestNeighbors(metric='cosine')

In [51]:
# fit it to the Ratings matrix
model.fit(R)

**save model**

In [52]:
with open('./neighbour_recommender.pkl', 'wb') as file:
    pickle.dump(model, file)

<hr style="border:2px solid black">

## 3. Model Deployment

In [53]:
#load model

with open('./neighbour_recommender.pkl', 'rb') as file:
    model = pickle.load(file)

**receive a user query**

In [56]:
movie_title

0                     Toy Story (1995)
2              Grumpier Old Men (1995)
5                          Heat (1995)
43         Seven (a.k.a. Se7en) (1995)
46          Usual Suspects, The (1995)
                     ...              
1435        Terms of Endearment (1983)
2968               Little Nicky (2000)
3158                   Joe Dirt (2001)
2249                  RoboCop 2 (1990)
1488    Poseidon Adventure, The (1972)
Name: title, Length: 1235, dtype: object

In [57]:
new_user_query = {'Toy Story (1995)': 1, 
                 'Joe Dirt (2001)':2, 
                 "Heat (1995)": 3.5 ,
                 "Little Nicky (2000)":5}

In [58]:
new_user_query

{'Toy Story (1995)': 1,
 'Joe Dirt (2001)': 2,
 'Heat (1995)': 3.5,
 'Little Nicky (2000)': 5}

**construct a user vector**

we need the same input as was used during training!

In [59]:
new_user_dataframe =  pd.DataFrame(new_user_query, columns=movie_title, index=['new_user'])
new_user_dataframe

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
new_user,1,,3.5,,,,,,,,...,,,,,,,5,2,,


In [60]:
# filling the missing values
new_user_dataframe_imputed = new_user_dataframe.fillna(0)
new_user_dataframe_imputed

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
new_user,1,0,3.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,2,0,0


**calculate the score**

1. find the neighborhood of $n$ similar users
2. use their ratings to calculate a score

In [61]:
# calculates the distances to all other users in the data!
similarity_scores, neighbor_ids = model.kneighbors(
    new_user_dataframe_imputed,
    n_neighbors=5,
    return_distance=True
)



In [62]:
# sklearn returns a list of predictions
# extract the first and only value of the list

neighbors_df = pd.DataFrame(
    data = {'neighbor_id': neighbor_ids[0], 'similarity_score': similarity_scores[0]}
)
neighbors_df

Unnamed: 0,neighbor_id,similarity_score
0,268,0.805273
1,388,0.854501
2,450,0.868485
3,120,0.872117
4,296,0.879355


In [63]:
# only look at ratings for users that are similar!
neighborhood = Rating.iloc[neighbor_ids[0]]
neighborhood

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
268,5.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
388,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
450,5.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120,4.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
296,0.0,0.0,5.0,4.0,5.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Filter seen movie**

In [64]:
new_user_query.keys()

dict_keys(['Toy Story (1995)', 'Joe Dirt (2001)', 'Heat (1995)', 'Little Nicky (2000)'])

In [65]:
neighborhood_filtered = neighborhood.drop(new_user_query.keys(), axis=1)
neighborhood_filtered

title,Grumpier Old Men (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),Billy Madison (1995),Clerks (1994),...,Metropolis (1927),Babe: Pig in the City (1998),Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
268,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120,0.0,3.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
296,0.0,4.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Calculate score**

In [66]:
# calculate the summed up rating for each movie
# summing up introduces a bias for popular movies
# averaging introduces bias for movies only seen by few users in the neighboorhood

df_score = neighborhood_filtered.sum()
df_score

title
Grumpier Old Men (1995)                   4.0
Seven (a.k.a. Se7en) (1995)               7.0
Usual Suspects, The (1995)                5.0
From Dusk Till Dawn (1996)                2.0
Bottle Rocket (1996)                      0.0
                                         ... 
American Werewolf in London, An (1981)    0.0
eXistenZ (1999)                           0.0
Terms of Endearment (1983)                0.0
RoboCop 2 (1990)                          0.0
Poseidon Adventure, The (1972)            0.0
Length: 1231, dtype: float64

**Rank the movie**

In [67]:
df_score_ranked = df_score.sort_values(ascending=False).index.to_list()
df_score_ranked

['Leaving Las Vegas (1995)',
 'Independence Day (a.k.a. ID4) (1996)',
 'Rock, The (1996)',
 'Broken Arrow (1996)',
 'Father of the Bride Part II (1995)',
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
 'Birdcage, The (1996)',
 'Executive Decision (1996)',
 'Ransom (1996)',
 'Fugitive, The (1993)',
 "Mr. Holland's Opus (1995)",
 'Eraser (1996)',
 'Mission: Impossible (1996)',
 'Time to Kill, A (1996)',
 'Silence of the Lambs, The (1991)',
 'Twister (1996)',
 'Sense and Sensibility (1995)',
 'Fargo (1996)',
 'Primal Fear (1996)',
 'Rumble in the Bronx (Hont faan kui) (1995)',
 'Willy Wonka & the Chocolate Factory (1971)',
 'Natural Born Killers (1994)',
 'Star Trek: First Contact (1996)',
 'Phenomenon (1996)',
 'Long Kiss Goodnight, The (1996)',
 'Clear and Present Danger (1994)',
 'River Wild, The (1994)',
 'Seven (a.k.a. Se7en) (1995)',
 'Striptease (1996)',
 'In the Line of Fire (1993)',
 'Firm, The (1993)',
 'Apollo 13 (1995)',
 'Cable Guy, The (1996)',
 'Disclosure (1994)',
 'Reservo

**Recommendations**

In [68]:
recommendations = df_score_ranked[:3]
recommendations

['Leaving Las Vegas (1995)',
 'Independence Day (a.k.a. ID4) (1996)',
 'Rock, The (1996)']