**Purpose of this notebook**

Implement collaborative filtering on the MovieLens dataset in two ways:
1. Matrix factorization
2. A neural network that learns user and item embeddings

In [53]:
from pathlib import Path
import pandas as pd
import numpy as np
import zipfile
from tqdm import tnrange, tqdm

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [3]:
# Directory that holds the dataset archive, relative to the notebook's working directory
PATH = Path('data/')

In [13]:
# Read ratings.csv straight out of the zip archive, without extracting it to disk
with zipfile.ZipFile(PATH/'ml-latest-small.zip') as archive, \
        archive.open('ml-latest-small/ratings.csv') as ratings_file:
    data = pd.read_csv(ratings_file)

In [14]:
# Peek at the first rows to check which columns were loaded
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


## Encode Data

In [15]:
# Split into train / validation BEFORE encoding the ids: the validation set
# is later encoded with the id mapping built from the training rows.
np.random.seed(3)  # fixed seed -> the same random split on every run
msk = np.random.rand(len(data)) < 0.8  # True for roughly 80% of the rows
train, val = data[msk].copy(), data[~msk].copy()

In [16]:
def proc_col(col, train_col=None):
    """
    Map the raw values of a pandas column onto contiguous integer ids.

    The id vocabulary is taken from `train_col` when it is given, otherwise
    from `col` itself; ids are assigned in order of first appearance.
    Values of `col` that are not in the vocabulary are encoded as -1.

    Returns a tuple `(name2idx, encoded, n_unique)`:
      name2idx -- dict mapping raw value -> integer id
      encoded  -- numpy array with the id of every element of `col`
      n_unique -- number of distinct values in the vocabulary
    """
    source = col if train_col is None else train_col
    vocab = source.unique()
    name2idx = dict(zip(vocab, range(len(vocab))))
    encoded = np.array([name2idx.get(value, -1) for value in col])
    return name2idx, encoded, len(vocab)

In [17]:
def encode_data(df, train=None):
    """
    Encodes rating data with contiguous user and movie ids.

    Args:
        df: DataFrame with "userId" and "movieId" columns. It is not
            modified; a re-encoded copy is returned.
        train: optional DataFrame with the same two columns. If provided,
            df is encoded with the id mapping built from train, and rows
            whose user or movie does not occur in train are dropped.

    Returns:
        A new DataFrame in which "userId" and "movieId" hold the encoded ids.
    """
    df = df.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None if train is None else train[col_name]
        _, col, _ = proc_col(df[col_name], train_col)
        df[col_name] = col
        # proc_col marks values unseen in the vocabulary with -1: drop those rows.
        # The explicit .copy() makes the result an independent frame, so the
        # column assignment in the next iteration does not write into a
        # filtered slice (which raises pandas' SettingWithCopyWarning).
        df = df[df[col_name] >= 0].copy()
    return df

In [18]:
# Train builds its own id mapping; validation reuses the mapping from train,
# so validation rows with a user or movie unseen in train are dropped.
df_train = encode_data(train)
df_val = encode_data(val, train)

## Embeddings Layer

In [20]:
# An Embedding is a lookup table: here 10 rows (one per user or item),
# each row being a vector of size 3.
# The rows are initialized at random.
embed = nn.Embedding(10, 3)
embed.weight

Parameter containing:
tensor([[-0.0596,  0.6845,  0.8395],
        [ 0.1271, -0.2549,  0.3582],
        [-0.7586,  0.0949,  1.0023],
        [-1.4211, -0.6909, -0.0179],
        [ 1.0334,  1.7368,  0.4366],
        [-0.8050,  0.0520,  0.0131],
        [ 0.4518,  0.2676, -0.1353],
        [-0.5609,  2.3864, -1.0803],
        [ 1.6733,  0.4910,  0.5521],
        [ 0.1019, -0.3812, -0.5394]])

In [21]:
# given a tensor of ids we can "look up" the embedding corresponding to each id
# id 1 appears three times below, so its vector shows up three times in the output
a = torch.LongTensor([[1,0,1,4,5,1]])
embed(a)

tensor([[[ 0.1271, -0.2549,  0.3582],
         [-0.0596,  0.6845,  0.8395],
         [ 0.1271, -0.2549,  0.3582],
         [ 1.0334,  1.7368,  0.4366],
         [-0.8050,  0.0520,  0.0131],
         [ 0.1271, -0.2549,  0.3582]]])

## Matrix Factorization Model

In [22]:
class MF(nn.Module):
    """
    Plain matrix factorization: the predicted rating of a (user, item) pair
    is the dot product of the user's and the item's embedding vectors.
    """

    def __init__(self, num_users, num_items, emb_size=100):
        """Create one embedding table for users and one for items, both of width emb_size."""
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # Overwrite the default random init with small non-negative values
        for table in (self.user_emb, self.item_emb):
            table.weight.data.uniform_(0, 0.05)

    def forward(self, u, v):
        """
        u, v: 1-D tensors of user ids and item ids of equal length.
        Returns a 1-D tensor with one predicted rating per (u[i], v[i]) pair.
        """
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)
        # Row-wise dot product
        return (user_vecs * item_vecs).sum(1)

### Training Model

In [30]:
# Embedding-table sizes: the number of distinct encoded ids in the training set
num_users = df_train["userId"].unique().size
num_items = df_train["movieId"].unique().size
print(num_users, num_items)

671 8442


In [51]:
# Put the model on the GPU when one is available, otherwise keep it on the CPU
# (a bare .cuda() raises on machines without a CUDA device).
model = MF(num_users, num_items, emb_size=100).to(
    "cuda" if torch.cuda.is_available() else "cpu")

In [49]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """
    Train `model` on the module-level `df_train`, then report the validation loss.

    Every epoch is one full-batch step: all training rows are passed through
    the model at once, the MSE against the true ratings is minimized with
    Adam, and the training loss is printed. A new optimizer is created on
    every call. After the last epoch `test_loss` is called on the model.

    Args:
        model: module called as model(user_ids, item_ids).
        epochs: number of full-batch optimization steps.
        lr: Adam learning rate.
        wd: Adam weight decay.
        unsqueeze: if True, the targets are reshaped from (N,) to (N, 1);
            use it for models whose output has a trailing dimension of 1.
    """
    # Filter for parameters that require training
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    model.train()
    # Keep the data on the same device as the model instead of assuming a GPU
    device = next(model.parameters()).device
    users = torch.LongTensor(df_train.userId.values).to(device)
    items = torch.LongTensor(df_train.movieId.values).to(device)
    ratings = torch.FloatTensor(df_train.rating.values).to(device)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)  # (N,) -> (N, 1)
    for _ in tnrange(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # '.4' = 4 significant digits; prints the same text as the former
        # nested '{.4}' spec, which only worked by accident
        tqdm.write(f'Train loss: {loss.item():.4}')

    test_loss(model, unsqueeze)

In [50]:
def test_loss(model, unsqueeze=False):
    """
    Print the MSE of `model` on the module-level validation set `df_val`.

    The model is switched to eval mode (and left in it), and the forward
    pass runs without gradient tracking.

    Args:
        model: module called as model(user_ids, item_ids).
        unsqueeze: if True, the targets are reshaped from (N,) to (N, 1);
            use it for models whose output has a trailing dimension of 1.
    """
    model.eval()
    # Keep the data on the same device as the model instead of assuming a GPU
    device = next(model.parameters()).device
    users = torch.LongTensor(df_val.userId.values).to(device)
    items = torch.LongTensor(df_val.movieId.values).to(device)
    ratings = torch.FloatTensor(df_val.rating.values).to(device)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    # Evaluation only: no autograd graph is needed
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
    # '.4' = 4 significant digits; prints the same text as the former '{.4}' spec
    print(f'Test loss {loss.item():.4}')

In [54]:
# Stage 1: 10 full-batch epochs at a high learning rate
train_epocs(model, epochs=10, lr=0.1)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Train loss: 13.23
Train loss: 5.122
Train loss: 2.376
Train loss: 3.449
Train loss: 0.9087
Train loss: 1.808
Train loss: 2.749
Train loss: 2.279
Train loss: 1.157
Train loss: 0.9222

Test loss 1.947


In [55]:
# Stage 2: keep training the same model with a 10x smaller learning rate
train_epocs(model, epochs=15, lr=0.01)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

Train loss: 1.704
Train loss: 1.051
Train loss: 0.7492
Train loss: 0.6943
Train loss: 0.7594
Train loss: 0.8401
Train loss: 0.8825
Train loss: 0.8762
Train loss: 0.8341
Train loss: 0.777
Train loss: 0.7246
Train loss: 0.6898
Train loss: 0.6766
Train loss: 0.6805
Train loss: 0.692

Test loss 0.8934


In [56]:
# Stage 3: fine-tune the same model with the learning rate lowered to 0.001
train_epocs(model, epochs=15, lr=0.001)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

Train loss: 0.7008
Train loss: 0.6845
Train loss: 0.6711
Train loss: 0.6605
Train loss: 0.6526
Train loss: 0.6468
Train loss: 0.6429
Train loss: 0.6402
Train loss: 0.6385
Train loss: 0.6373
Train loss: 0.6364
Train loss: 0.6357
Train loss: 0.6349
Train loss: 0.6341
Train loss: 0.6333

Test loss 0.8301


## Matrix Factorization with Bias

In [57]:
class MF_bias(nn.Module):
    """
    Matrix factorization with bias terms: the predicted rating is the dot
    product of the user and item embeddings plus a learned scalar bias for
    the user and another for the item.
    """

    def __init__(self, num_users, num_items, emb_size=100):
        """Create embedding tables of width emb_size and bias tables of width 1."""
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # Small non-negative factors, biases centred on zero
        for factors in (self.user_emb, self.item_emb):
            factors.weight.data.uniform_(0, 0.05)
        for bias in (self.user_bias, self.item_bias):
            bias.weight.data.uniform_(-0.01, 0.01)

    def forward(self, u, v):
        """
        u, v: 1-D tensors of user ids and item ids of equal length.
        Returns a 1-D tensor with one predicted rating per (u[i], v[i]) pair.
        """
        interaction = (self.user_emb(u) * self.item_emb(v)).sum(1)
        # Bias lookups come back as (N, 1); drop the trailing dimension
        user_offset = self.user_bias(u).squeeze()
        item_offset = self.item_bias(v).squeeze()
        return interaction + user_offset + item_offset

In [58]:
# Put the model on the GPU when one is available, otherwise keep it on the CPU
# (a bare .cuda() raises on machines without a CUDA device).
model = MF_bias(num_users, num_items, emb_size=100).to(
    "cuda" if torch.cuda.is_available() else "cpu")

In [59]:
# Stage 1 for the bias model: high learning rate, now with a small weight decay
train_epocs(model, epochs=10, lr=0.1, wd=1e-5)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Train loss: 13.23
Train loss: 4.372
Train loss: 3.481
Train loss: 2.467
Train loss: 0.7877
Train loss: 1.814
Train loss: 2.519
Train loss: 2.137
Train loss: 1.27
Train loss: 0.9019

Test loss 1.538


In [60]:
# Stage 2: keep training the same model with a 10x smaller learning rate
train_epocs(model, epochs=10, lr=0.01, wd=1e-5)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Train loss: 1.283
Train loss: 0.8582
Train loss: 0.6944
Train loss: 0.6954
Train loss: 0.7549
Train loss: 0.8005
Train loss: 0.8074
Train loss: 0.781
Train loss: 0.7383
Train loss: 0.6966

Test loss 0.8234


In [61]:
# Stage 3: fine-tune the same model with the learning rate lowered to 0.001
train_epocs(model, epochs=10, lr=0.001, wd=1e-5)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Train loss: 0.6681
Train loss: 0.6601
Train loss: 0.6533
Train loss: 0.6479
Train loss: 0.6436
Train loss: 0.6402
Train loss: 0.6375
Train loss: 0.6354
Train loss: 0.6337
Train loss: 0.6323

Test loss 0.8098


## Neural Network Model

The main difference between MF-based and NN-based collaborative filtering is that the latter does not take a dot product between the user and item embeddings (they are concatenated and passed through linear layers instead), so the embedding dimension can be different for users and items. Also, regularization can help get better results on the test set.

In [62]:
class CollabFNet(nn.Module):
    """
    Neural collaborative filtering: the user and item embeddings are
    concatenated and passed through a two-layer network that outputs one
    rating per pair. Because the embeddings are concatenated rather than
    multiplied, users and items may use different embedding sizes.
    """

    def __init__(self, num_users, num_items, user_emb_size=100,
                 item_emb_size=100, n_hidden=10):
        """Create the two embedding tables, two linear layers and two dropout layers."""
        super().__init__()
        self.user_emb = nn.Embedding(num_users, user_emb_size)
        self.item_emb = nn.Embedding(num_items, item_emb_size)
        concat_size = user_emb_size + item_emb_size
        self.lin1 = nn.Linear(concat_size, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.0)

    def forward(self, u, v):
        """
        u, v: 1-D tensors of user ids and item ids of equal length N.
        Returns a tensor of shape (N, 1) with one predicted rating per pair.
        """
        pair = torch.cat([self.user_emb(u), self.item_emb(v)], dim=1)
        hidden = self.drop1(F.relu(pair))
        hidden = self.drop2(F.relu(self.lin1(hidden)))
        return self.lin2(hidden)

In [63]:
# Put the model on the GPU when one is available, otherwise keep it on the CPU
# (a bare .cuda() raises on machines without a CUDA device).
model = CollabFNet(num_users, num_items, user_emb_size=80, item_emb_size=100).to(
    "cuda" if torch.cuda.is_available() else "cpu")

In [64]:
# The network's last layer has one output feature, so unsqueeze=True reshapes the targets to (N, 1) to match
train_epocs(model, epochs=20, lr=0.01, wd=1e-5, unsqueeze=True)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Train loss: 12.88
Train loss: 7.583
Train loss: 3.116
Train loss: 1.323
Train loss: 2.391
Train loss: 3.821
Train loss: 3.826
Train loss: 2.898
Train loss: 1.887
Train loss: 1.305
Train loss: 1.242
Train loss: 1.508
Train loss: 1.844
Train loss: 2.064
Train loss: 2.094
Train loss: 1.93
Train loss: 1.66
Train loss: 1.377
Train loss: 1.161
Train loss: 1.069

Test loss 1.074


In [65]:
# Same settings again: 20 more epochs on the same model
train_epocs(model, epochs=20, lr=0.01, wd=1e-5, unsqueeze=True)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Train loss: 1.098
Train loss: 1.854
Train loss: 1.113
Train loss: 1.144
Train loss: 1.398
Train loss: 1.178
Train loss: 0.9652
Train loss: 1.007
Train loss: 1.126
Train loss: 1.116
Train loss: 0.9961
Train loss: 0.8988
Train loss: 0.9075
Train loss: 0.9698
Train loss: 0.9752
Train loss: 0.9061
Train loss: 0.8483
Train loss: 0.8636
Train loss: 0.8946
Train loss: 0.8885

Test loss 0.8694


In [66]:
# Lower the learning rate to 0.001 to fine-tune the same model
train_epocs(model, epochs=20, lr=0.001, wd=1e-5, unsqueeze=True)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Train loss: 0.8488
Train loss: 0.8254
Train loss: 0.8249
Train loss: 0.8314
Train loss: 0.8306
Train loss: 0.8251
Train loss: 0.8192
Train loss: 0.8172
Train loss: 0.8185
Train loss: 0.821
Train loss: 0.8224
Train loss: 0.8192
Train loss: 0.8152
Train loss: 0.8165
Train loss: 0.8126
Train loss: 0.8137
Train loss: 0.8124
Train loss: 0.8102
Train loss: 0.8106
Train loss: 0.8117

Test loss 0.8308


In [67]:
# Another 20 epochs at lr=0.001 on the same model
train_epocs(model, epochs=20, lr=0.001, wd=1e-5, unsqueeze=True)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Train loss: 0.8092
Train loss: 0.8157
Train loss: 0.8089
Train loss: 0.8098
Train loss: 0.8107
Train loss: 0.8096
Train loss: 0.8037
Train loss: 0.8072
Train loss: 0.8063
Train loss: 0.8064
Train loss: 0.8059
Train loss: 0.8006
Train loss: 0.8029
Train loss: 0.8029
Train loss: 0.8013
Train loss: 0.8033
Train loss: 0.8005
Train loss: 0.8005
Train loss: 0.8
Train loss: 0.7986

Test loss 0.8236
