# Collaborative Filtering Recommender Systems

In [1]:
import pandas as pd
import numpy as np
from settings import *
from utils import save_np_array

## Fix dataset files

### movies.csv and ratings.csv

Load ratings.csv

In [3]:
ratings_df = pd.read_csv(RATINGS_CSV_FILE, header=0,  delimiter=',', quotechar='"')

print('ratings.csv')
ratings_df.head()

ratings.csv


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Load movies.csv

In [4]:
movies_df = pd.read_csv(MOVIES_CSV_FILE, header=0,  delimiter=',', quotechar='"')

print('movies.csv')
movies_df.head()

movies.csv


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
print('All movies are unique?', movies_df['movieId'].unique().shape == movies_df['movieId'].shape)
print('Number of movies:', len(movies_df))

rated_movie_ids = ratings_df['movieId'].unique()
rated_movie_ids.sort()

print('Number of rated movies:', len(rated_movie_ids))

All movies are unique? True
Number of movies: 9742
Number of rated movies: 9724


We only need the movies that have been rated

In [6]:
movies_df = movies_df[movies_df['movieId'].isin(rated_movie_ids)]

print('New number of movies:', len(movies_df))

New number of movies: 9724


Now lets check the `movieId` column

In [7]:
print(
    'Range of movieId:',
    f'({movies_df["movieId"].min()}, {movies_df["movieId"].max()})'
)

Range of movieId: (1, 193609)


**Conclusion:** Movies are all unique but their ids are not continues.  
We'll change the ids to the range of `(0, num_movies-1)`, so we can use the row index as the movie id

In [8]:
ids_map = dict(
    zip(rated_movie_ids, range(len(rated_movie_ids)))
)

Update `movieId` in `movies_df` and `ratings_df`

In [9]:
movies_df['movieId'] = movies_df['movieId'].replace(ids_map)
ratings_df['movieId'] = ratings_df['movieId'].replace(ids_map)

print(
    'New range of movieId:',
    f'({movies_df["movieId"].min()}, {movies_df["movieId"].max()})'
)

New range of movieId: (0, 9723)


**NOTICE:** Rewrite the csv files

In [10]:
movies_df.to_csv(MOVIES_CSV_FILE)
ratings_df.to_csv(RATINGS_CSV_FILE)

### tags.csv

In [11]:
tags_df = pd.read_csv(TAGS_CSV_FILE, header=0,  delimiter=',', quotechar='"')

tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


Most of the movies are not tagged

In [12]:
print('Number of movies:', len(movies_df))
print('Number of tagged movies:', len(tags_df['movieId'].unique()))

Number of movies: 9724
Number of tagged movies: 1572


We'll just replace the movieId with the new ids

In [13]:
tags_df['movieId'] = tags_df['movieId'].replace(ids_map)

**NOTICE:** Rewrite the csv file

In [14]:
tags_df.to_csv(TAGS_CSV_FILE)

## $\mu$

$\mu$ is the mean of movies ratings and is later used in the mean normalization

Drop unnecessary columns

In [15]:
# userId and timestamp won't be needed in mu
mu = ratings_df.drop(columns=['userId', 'timestamp'], axis=1)

Group rating by movieId

In [16]:
mu = mu.groupby('movieId')['rating'].apply(list)

# Re-assign column name to rating
mu = mu.reset_index(name='rating')

In [17]:
mu.head()

Unnamed: 0,movieId,rating
0,0,"[4.0, 4.0, 4.5, 2.5, 4.5, 3.5, 4.0, 3.5, 3.0, ..."
1,1,"[4.0, 4.0, 3.0, 3.0, 3.0, 3.5, 4.0, 4.5, 4.0, ..."
2,2,"[4.0, 5.0, 3.0, 3.0, 4.0, 5.0, 3.0, 4.0, 3.0, ..."
3,3,"[3.0, 3.0, 3.0, 3.0, 1.0, 2.0, 1.5]"
4,4,"[5.0, 3.0, 5.0, 3.0, 4.0, 4.0, 2.0, 3.0, 4.0, ..."


Calculate the mean rating for each movie

In [18]:
mu['rating'] = mu['rating'].apply(np.mean)

In [19]:
mu.head()

Unnamed: 0,movieId,rating
0,0,3.92093
1,1,3.431818
2,2,3.259615
3,3,2.357143
4,4,3.071429


Row index can be used as movieId

In [20]:
mu = mu.drop(columns=['movieId'], axis=1)

Convert to numpy array

In [21]:
mu = mu.to_numpy()

mu

array([[3.92093023],
       [3.43181818],
       [3.25961538],
       ...,
       [3.5       ],
       [3.5       ],
       [4.        ]])

In [22]:
save_np_array(mu, MU_FILE_NAME)

'Saved array in ./cache/mu.npz'

## $\mathbf{Y}_{m,u}$

$\mathbf{Y}$ is a user-movie matrix, where each row represents a movie and each column represents a user, and the entries of the matrix represent the ratings given by the users to the movies.

$m$ and $u$ determine the number of movies and users respectively. 

Drop timestamp column

In [23]:
Y = ratings_df.drop(columns=['timestamp'], axis=1)

In [24]:
Y.head()

Unnamed: 0,userId,movieId,rating
0,1,0,4.0
1,1,2,4.0
2,1,5,4.0
3,1,43,5.0
4,1,46,5.0


In [25]:
np.unique(Y['rating'])

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

Given that the ratings are in $[0.5, 5.0]$, we can determine the empty items with $0.0$

In [26]:
# Create the (movieId * userId) matrix
Y = Y.pivot(index='movieId', columns='userId', values='rating')

# Fill NA columns with 0.0
Y = Y.fillna(0.0)

In [27]:
Y.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
1,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


Convert to numpy array

In [28]:
Y = Y.to_numpy()

Y

array([[4. , 0. , 0. , ..., 2.5, 3. , 5. ],
       [0. , 0. , 0. , ..., 2. , 0. , 0. ],
       [4. , 0. , 0. , ..., 2. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

### Mean normalization

Calculate $\mathbf{Y}_{\text{norm}} = \mathbf{Y} - \mu$

In [29]:
Y_norm = Y - mu

Y_norm

array([[ 0.07906977, -3.92093023, -3.92093023, ..., -1.42093023,
        -0.92093023,  1.07906977],
       [-3.43181818, -3.43181818, -3.43181818, ..., -1.43181818,
        -3.43181818, -3.43181818],
       [ 0.74038462, -3.25961538, -3.25961538, ..., -1.25961538,
        -3.25961538, -3.25961538],
       ...,
       [-3.5       , -3.5       , -3.5       , ..., -3.5       ,
        -3.5       , -3.5       ],
       [-3.5       , -3.5       , -3.5       , ..., -3.5       ,
        -3.5       , -3.5       ],
       [-4.        , -4.        , -4.        , ..., -4.        ,
        -4.        , -4.        ]])

In [30]:
save_np_array(Y_norm, Y_FILE_NAME)

'Saved array in ./cache/Y.npz'

## $\mathbf{R}_{m,u}$

$$
\mathbf{R}_{m,u} =
\begin{cases}
    1 & \mathbf{Y}_{(i, j)} \neq 0 \\
    0 & \mathbf{Y}_{(i, j)} = 0
\end{cases}
$$

where $m$ and $u$ determine the number of movies and users respectively.

In [31]:
R = Y != 0

R

array([[ True, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True, False, False],
       [ True, False, False, ...,  True, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [32]:
save_np_array(R, R_FILE_NAME)

'Saved array in ./cache/R.npz'

## $\mathbf{X}_m$

$\mathbf{X}_m$ is a movie-genre matrix, where each row represents a movie and each column represents a genre. $\mathbf{X}_m$ is a binary matrix, with each entry indicating whether the corresponding movie belongs to the corresponding genre or not.

Drop unnecessary columns

In [33]:
# We don't need the title column
movies_df = movies_df.drop(columns=['title'], axis=1)

# Row index can be used as movieId
movies_df = movies_df.drop(columns=['movieId'], axis=1)

Separate genres by | to list

In [34]:
movies_df['genres'] = movies_df['genres'].str.split('|')

In [35]:
movies_df.head()

Unnamed: 0,genres
0,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,"[Adventure, Children, Fantasy]"
2,"[Comedy, Romance]"
3,"[Comedy, Drama, Romance]"
4,[Comedy]


Lets see what and how many unique genres are there

In [36]:
unique_genres = movies_df['genres'].explode().unique()

print('num unique genres:', unique_genres.shape)
print(unique_genres)

num unique genres: (20,)
['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'War' 'Musical'
 'Documentary' 'IMAX' 'Western' 'Film-Noir' '(no genres listed)']


One-hot encode the genres

In [37]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(sparse_output=True)

movies_df = movies_df.join(
    pd.DataFrame.sparse.from_spmatrix(
        mlb.fit_transform(movies_df.pop('genres')),
        index=movies_df.index,
        columns=mlb.classes_
    )
)

In [38]:
movies_df.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [39]:
X_m = movies_df.to_numpy()
X_m

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [40]:
save_np_array(X_m, X_M_FILE_NAME)

'Saved array in ./cache/X_m.npz'