# Collaborative Filtering Recommender Systems

## Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

Load `ratings.csv`

In [10]:
# Read the ratings CSV; row 0 of the file is taken as the header.
ratings_df = pd.read_csv(
    './ml-latest-small/ratings.csv',
    header=0,
    delimiter=',',
    quotechar='"',
)

Calculate the mean rating of each movie

In [15]:
# Keep every column except userId and timestamp; they are not needed
# for the per-movie mean.
mu = ratings_df.drop(labels=['userId', 'timestamp'], axis='columns')

# Display the remaining column labels.
mu.columns

Index(['movieId', 'rating'], dtype='object')

In [16]:
# Collect every rating of a movie into one Python list per movieId, then
# turn the grouped Series back into a two-column frame whose value column
# is named 'rating'.
mu = (
    mu.groupby('movieId')['rating']
      .apply(list)
      .reset_index(name='rating')
)

# Preview the first rows.
mu.head()

Unnamed: 0,movieId,rating
0,1,"[4.0, 4.0, 4.5, 2.5, 4.5, 3.5, 4.0, 3.5, 3.0, ..."
1,2,"[4.0, 4.0, 3.0, 3.0, 3.0, 3.5, 4.0, 4.5, 4.0, ..."
2,3,"[4.0, 5.0, 3.0, 3.0, 4.0, 5.0, 3.0, 4.0, 3.0, ..."
3,4,"[3.0, 3.0, 3.0, 3.0, 1.0, 2.0, 1.5]"
4,5,"[5.0, 3.0, 5.0, 3.0, 4.0, 4.0, 2.0, 3.0, 4.0, ..."


In [17]:
# Collapse each movie's list of ratings into its arithmetic mean.
mu = mu.assign(rating=mu['rating'].map(np.mean))

# Preview the first rows.
mu.head()

Unnamed: 0,movieId,rating
0,1,3.92093
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429


In [None]:
# Drop the movieId column so only the mean ratings remain (row order is
# whatever the preceding group-by produced).
mu = mu.drop(columns=['movieId'])

# Downcast the ratings from float64 to NumPy float32. The lowercase name
# matters: capitalised 'Float32' selects pandas' nullable extension dtype,
# which can come back as an object array from a plain `.to_numpy()`.
mu = mu.astype({'rating': 'float32'})

In [34]:
# Convert to a NumPy array, requesting float32 explicitly so the result is
# a numeric array (not dtype=object) whichever pandas dtype the frame holds.
mu = mu.to_numpy(dtype=np.float32)

Create the movie × user ratings matrix (rows: movies, columns: users)

In [18]:
# The timestamp is not used in the ratings matrix, so leave it out.
R = ratings_df.drop(labels=['timestamp'], axis='columns')

# Display the remaining column labels.
R.columns

Index(['userId', 'movieId', 'rating'], dtype='object')

In [19]:
# Report the observed rating range.
_rating_col = R['rating']
print('min rating: ', _rating_col.min())
print('max rating: ', _rating_col.max())


min rating:  0.5
max rating:  5.0


Given that the ratings are in $[0.5, 5.0]$, the value $0.0$ can never be a real rating, so we fill the empty entries of `R` (movies a user has not rated) with $0.0$

In [25]:
# Downcast to compact NumPy dtypes. The lowercase names matter: capitalised
# 'Int32'/'Float32' select pandas' nullable extension dtypes, and a frame
# of those can come back as an object array from a plain `.to_numpy()`.
R = R.astype({'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

# Build the movieId x userId matrix: one row per movie, one column per
# user, cell value = that user's rating of that movie.
R = R.pivot(index='movieId', columns='userId', values='rating')

# (movie, user) pairs with no rating are NaN after the pivot; mark them 0.0.
R = R.fillna(0.0)

# show result
R.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Convert to a NumPy array, requesting float32 explicitly so the result is
# a numeric matrix (not dtype=object) whichever pandas dtype the frame holds.
R = R.to_numpy(dtype=np.float32)

In [35]:
# NOTE(review): `f` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. The recorded output below looks
# like a shape tuple (possibly `R.shape`) — confirm the intended expression.
X = f

(9724, 610)