# Simple Content-based Filtering

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv("movies.csv")
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# dummy encode the genre
movies = movies.join(movies.genres.str.get_dummies("|"))
movies.head(5)

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
movies.iloc[:,3:][:5]

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# compute the cosine similarity
cos_sim = cosine_similarity(movies.iloc[:,3:])
cos_sim

array([[1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
        0.4472136 ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.        , 0.        ,
        0.70710678],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.4472136 , 0.        , 0.70710678, ..., 0.        , 0.        ,
        1.        ]])

In [6]:
cos_sim[0]

array([1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
       0.4472136 ])

In [7]:
# Let's get the top 5 most similar films:
toystory_top5 = np.argsort(cos_sim[0])[-5:][::-1]
toystory_top5

array([   0, 8219, 3568, 9430, 3000])

In [8]:
movies[movies['movieId'] == 1]

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
movies[movies['movieId'] == 8218]

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western


# Simple Collaborative Filtering

In [10]:
ratings = pd.read_csv("ratings.csv")
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [11]:
# compute mean rating
mean_rating = ratings['rating'].mean()
mean_rating

3.501556983616962

In [12]:
pref_matrix = ratings[['userId', 'movieId', 'rating']].pivot(index='userId', columns='movieId', values='rating')

# adjust by overall mean
pref_matrix = pref_matrix - mean_rating 

item_mean_rating = pref_matrix.mean(axis=0)

# adjust by item mean
pref_matrix = pref_matrix - item_mean_rating

user_mean_rating = pref_matrix.mean(axis=1)
pref_matrix = pref_matrix - user_mean_rating

In [13]:
pref_matrix.fillna(0) + user_mean_rating + item_mean_rating + mean_rating

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.00000,3.436979,4.000000,2.160805,2.995648,4.000000,2.912676,2.891557,3.075148,3.312842,...,,,,,,,,,,
2,4.73008,3.436979,2.110475,2.160805,2.995648,4.245001,2.912676,2.891557,3.075148,3.312842,...,,,,,,,,,,
3,4.73008,3.436979,2.110475,2.160805,2.995648,4.245001,2.912676,2.891557,3.075148,3.312842,...,,,,,,,,,,
4,4.73008,3.436979,2.110475,2.160805,2.995648,4.245001,2.912676,2.891557,3.075148,3.312842,...,,,,,,,,,,
5,4.00000,3.436979,2.110475,2.160805,2.995648,4.245001,2.912676,2.891557,3.075148,3.312842,...,,,,,,,,,,
6,4.73008,4.000000,5.000000,3.000000,5.000000,4.000000,4.000000,3.000000,3.075148,3.000000,...,,,,,,,,,,
7,4.50000,3.436979,2.110475,2.160805,2.995648,4.245001,2.912676,2.891557,3.075148,3.312842,...,,,,,,,,,,
8,4.73008,4.000000,2.110475,2.160805,2.995648,4.245001,2.912676,2.891557,3.075148,2.000000,...,,,,,,,,,,
9,4.73008,3.436979,2.110475,2.160805,2.995648,4.245001,2.912676,2.891557,3.075148,3.312842,...,,,,,,,,,,
10,4.73008,3.436979,2.110475,2.160805,2.995648,4.245001,2.912676,2.891557,3.075148,3.312842,...,,,,,,,,,,


In [14]:
mat = pref_matrix.values
k = 0 # use the first user
np.nansum((mat - mat[k,:])**2,axis=1).reshape(-1,1)

array([[ 0.  ],
       [ 1.  ],
       [20.25],
       [36.  ],
       [14.  ],
       [61.  ],
       [ 4.5 ],
       [14.  ],
       [ 1.  ],
       [ 4.25],
       [ 5.  ],
       [ 0.25],
       [ 0.  ],
       [25.  ],
       [15.5 ],
       [ 9.75],
       [ 8.  ],
       [17.5 ],
       [56.  ],
       [ 1.25],
       [ 2.5 ],
       [ 2.  ],
       [ 3.5 ],
       [ 6.5 ],
       [ 1.  ],
       [ 5.  ],
       [ 4.  ],
       [34.75],
       [ 3.75],
       [ 1.  ],
       [ 1.  ],
       [14.  ],
       [11.  ],
       [16.5 ],
       [ 5.  ],
       [ 4.  ],
       [11.  ],
       [29.  ],
       [14.  ],
       [47.  ],
       [15.25],
       [20.  ],
       [19.  ],
       [ 3.  ],
       [17.  ],
       [11.  ],
       [12.5 ],
       [ 0.  ],
       [ 0.25],
       [13.25],
       [20.25],
       [ 2.  ],
       [ 0.  ],
       [17.  ],
       [ 0.  ],
       [11.  ],
       [22.  ],
       [16.  ],
       [ 8.  ],
       [ 5.  ],
       [ 0.25],
       [ 5.5 ],
       [

In [15]:
np.nansum((mat - mat[0,:])**2,axis=1)[1:].argmin() # returns 11
# check it:
np.nansum(mat[12] - mat[0]) # returns 0.0

0.0

In [16]:
np.where(~np.isnan(mat[12]) & np.isnan(mat[0]) == True)

(array([304, 596]),)

In [17]:
mat[12][[304, 596]]

array([-2.13265214, -0.89476547])