# Collaborative Filtering Recommender Systems

In [1]:
import pandas as pd
import numpy as np

Load `ratings.csv`

In [2]:
ratings_df = pd.read_csv('../ml-latest-small/ratings.csv', header=0,  delimiter=',', quotechar='"')

In [3]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## $\mu$: the mean of each movies ratings

$\mu$ is later used in the mean normalization

Drop unnecessary columns

In [4]:
# userId and timestamp won't be needed in mu
mu = ratings_df.drop(columns=['userId', 'timestamp'], axis=1)

Group rating by movieId

In [5]:
mu = mu.groupby('movieId')['rating'].apply(list)

# Re-assign column name to rating
mu = mu.reset_index(name='rating')

In [6]:
mu.head()

Unnamed: 0,movieId,rating
0,1,"[4.0, 4.0, 4.5, 2.5, 4.5, 3.5, 4.0, 3.5, 3.0, ..."
1,2,"[4.0, 4.0, 3.0, 3.0, 3.0, 3.5, 4.0, 4.5, 4.0, ..."
2,3,"[4.0, 5.0, 3.0, 3.0, 4.0, 5.0, 3.0, 4.0, 3.0, ..."
3,4,"[3.0, 3.0, 3.0, 3.0, 1.0, 2.0, 1.5]"
4,5,"[5.0, 3.0, 5.0, 3.0, 4.0, 4.0, 2.0, 3.0, 4.0, ..."


Calculate the mean rating for each movie

In [7]:
mu['rating'] = mu['rating'].apply(np.mean)

In [8]:
mu.head()

Unnamed: 0,movieId,rating
0,1,3.92093
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429


Row index can be used as movieId

In [9]:
mu = mu.drop(columns=['movieId'], axis=1)

Convert to numpy array

In [10]:
mu = mu.to_numpy()

mu

array([[3.92093023],
       [3.43181818],
       [3.25961538],
       ...,
       [3.5       ],
       [3.5       ],
       [4.        ]])

## $\mathbf{Y}_{\text{movies}\times\text{users}}$

Drop timestamp column

In [11]:
Y = ratings_df.drop(columns=['timestamp'], axis=1)

In [12]:
Y.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [13]:
np.unique(Y['rating'])

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

Given that the ratings are in $[0.5, 5.0]$, we can determine the empty items with $0.0$

In [14]:
# Create the (movieId * userId) matrix
Y = Y.pivot(index='movieId', columns='userId', values='rating')

# Fill NA columns with 0.0
Y = Y.fillna(0.0)

In [15]:
Y.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


Convert to numpy array

In [16]:
Y = Y.to_numpy()

Y

array([[4. , 0. , 0. , ..., 2.5, 3. , 5. ],
       [0. , 0. , 0. , ..., 2. , 0. , 0. ],
       [4. , 0. , 0. , ..., 2. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

### Mean normalization

Calculate $\mathbf{Y}_{\text{norm}} = \mathbf{Y} - \mu$

In [17]:
Y_norm = Y - mu

Y_norm

array([[ 0.07906977, -3.92093023, -3.92093023, ..., -1.42093023,
        -0.92093023,  1.07906977],
       [-3.43181818, -3.43181818, -3.43181818, ..., -1.43181818,
        -3.43181818, -3.43181818],
       [ 0.74038462, -3.25961538, -3.25961538, ..., -1.25961538,
        -3.25961538, -3.25961538],
       ...,
       [-3.5       , -3.5       , -3.5       , ..., -3.5       ,
        -3.5       , -3.5       ],
       [-3.5       , -3.5       , -3.5       , ..., -3.5       ,
        -3.5       , -3.5       ],
       [-4.        , -4.        , -4.        , ..., -4.        ,
        -4.        , -4.        ]])

## $\mathbf{R}_{\text{movies}\times\text{users}}$

$$
\mathbf{R} =
\begin{cases}
    1, & \mathbf{Y}_{(i, j)} \neq 0 \\
    0, & \mathbf{Y}_{(i, j)} = 0
\end{cases}
$$

In [18]:
R = Y != 0

R

array([[ True, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True, False, False],
       [ True, False, False, ...,  True, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

## Genres feature ($\mathbf{X_m}$)

Load the `movies.csv`

In [19]:
df = pd.read_csv('../ml-latest-small/movies.csv', header=0,  delimiter=',', quotechar='"')

In [20]:
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Drop unnecessary columns

In [21]:
# We don't need the title column
df = df.drop(columns=['title'], axis=1)

# Row index can be used as movieId
df = df.drop(columns=['movieId'], axis=1)

Separate genres by | to list

In [22]:
df['genres'] = df['genres'].str.split('|')

In [23]:
df.head()

Unnamed: 0,genres
0,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,"[Adventure, Children, Fantasy]"
2,"[Comedy, Romance]"
3,"[Comedy, Drama, Romance]"
4,[Comedy]


Lets see what and how many unique genres are there

In [24]:
unique_genres = df['genres'].explode().unique()

print('num unique genres:', unique_genres.shape)
print(unique_genres)

num unique genres: (20,)
['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'War' 'Musical'
 'Documentary' 'IMAX' 'Western' 'Film-Noir' '(no genres listed)']


One-hot encode the genres

In [25]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(sparse_output=True)

df = df.join(
    pd.DataFrame.sparse.from_spmatrix(
        mlb.fit_transform(df.pop('genres')),
        index=df.index,
        columns=mlb.classes_
    )
)

In [26]:
df.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
X_m = df.to_numpy()
X_m

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])