### Dataset : https://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
PATH = Path('./ml-latest-small/')
list(PATH.iterdir())

[WindowsPath('ml-latest-small/links.csv'),
 WindowsPath('ml-latest-small/movies.csv'),
 WindowsPath('ml-latest-small/ratings.csv'),
 WindowsPath('ml-latest-small/README.txt'),
 WindowsPath('ml-latest-small/tags.csv')]

### References
Lesson 5 of Jeremy Howard's Deep Learning Course

Based on the work of [Yannet](https://github.com/yanneta)

In [3]:
data = pd.read_csv(PATH/"ratings.csv")

In [4]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Encoding data
We encode the data to have contiguous ids for users and movies. You can think about this as a categorical encoding of our two categorical variables, userId and movieId.

In [5]:
# split train and validation before encoding
# Seed NumPy's global RNG so the random split below is the same on every run.
np.random.seed(3)
# One uniform draw in [0, 1) per row of `data`; rows whose draw is below 0.8
# go to the training set, the rest to the validation set.
msk = np.random.rand(len(data)) < 0.8
# .copy() makes both splits independent DataFrames instead of selections of `data`.
train = data[msk].copy()
val = data[~msk].copy()

In [6]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Map the values of a pandas column to contiguous integer ids.

    The id vocabulary is taken from `train_col` when it is given, otherwise
    from `col` itself. Values of `col` that are not in the vocabulary are
    encoded as -1.

    Returns a tuple `(name2idx, encoded, n_unique)`:
      * name2idx -- dict mapping each original value to its integer id
      * encoded  -- numpy array holding the id of every element of `col`
      * n_unique -- number of distinct values in the vocabulary
    """
    vocab_source = col if train_col is None else train_col
    uniq = vocab_source.unique()
    name2idx = dict(zip(uniq, range(len(uniq))))
    encoded = np.array([name2idx.get(value, -1) for value in col])
    return name2idx, encoded, len(uniq)

In [7]:
def encode_data(df, train=None):
    """Return a copy of `df` with `userId` and `movieId` replaced by contiguous ids.

    When `train` is given, the ids are those assigned to `train`'s values, and
    rows of `df` whose user or movie does not occur in `train` are dropped.
    Without `train`, the ids are derived from `df` itself. The input frame is
    not modified.
    """
    encoded = df.copy()
    for col_name in ("userId", "movieId"):
        reference = train[col_name] if train is not None else None
        ids = proc_col(encoded[col_name], reference)[1]
        encoded[col_name] = ids
        # proc_col marks values missing from the reference column with -1.
        encoded = encoded[encoded[col_name] >= 0]
    return encoded

In [8]:
# to check my new implementation
# Sanity check of encode_data on two small CSV files stored under images/.
LOCAL_PATH = Path("images/")
df_t = pd.read_csv(LOCAL_PATH/"tiny_training2.csv")
df_v = pd.read_csv(LOCAL_PATH/"tiny_val2.csv")
print(df_t)
# Training frame is encoded on its own; validation frame reuses the training ids.
df_t_e = encode_data(df_t)
df_v_e = encode_data(df_v, df_t)
# NOTE(review): this bare expression is not the last line of the cell, so it
# displays nothing; use print(df_v_e) if the validation encoding should be shown.
df_v_e
print(df_t_e)

    userId  movieId  rating
0       11        1       4
1       11       23       5
2        2       23       5
3        2        4       3
4       31        1       4
5       31       23       4
6        4        1       5
7        4        3       2
8       52        1       1
9       52        3       4
10      61        3       5
11       7       23       1
12       7        3       3
    userId  movieId  rating
0        0        0       4
1        0        1       5
2        1        1       5
3        1        2       3
4        2        0       4
5        2        1       4
6        3        0       5
7        3        3       2
8        4        0       1
9        4        3       4
10       5        3       5
11       6        1       1
12       6        3       3


In [9]:
# encoding the train and validation data
df_train = encode_data(train)
df_val = encode_data(val, train)

## Embedding layer

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [11]:
# an Embedding module holding 10 user (or item) embeddings, each of size 3
# the embeddings are initialized at random
embed = nn.Embedding(10, 3)

In [12]:
# given a list of ids we can "look up" the embedding corresponding to each id
a = torch.LongTensor([[1,2,0,4,5,1]])
embed(a)

tensor([[[ 0.3326,  0.7025,  0.6530],
         [-0.6357, -1.2612,  0.3864],
         [-0.4911, -0.0877, -2.0626],
         [-0.1812,  1.2104, -0.0174],
         [ 0.4360,  0.0175,  1.3721],
         [ 0.3326,  0.7025,  0.6530]]], grad_fn=<EmbeddingBackward0>)

## Matrix factorization model

In [13]:
class MF(nn.Module):
    """Matrix-factorization model.

    Holds one embedding table for users and one for items, both of width
    `emb_size`. The prediction for a (user, item) pair is the dot product of
    the two corresponding embedding rows.
    """

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # Overwrite the default initialisation with small values drawn
        # uniformly between 0 and 0.05.
        for table in (self.user_emb, self.item_emb):
            table.weight.data.uniform_(0, 0.05)

    def forward(self, u, v):
        """Return one predicted score per (u[i], v[i]) pair of ids."""
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)
        # Element-wise product summed over dim 1 = row-wise dot product.
        return (user_vecs * item_vecs).sum(dim=1)

## Debugging MF model

In [14]:
df_t_e

Unnamed: 0,userId,movieId,rating
0,0,0,4
1,0,1,5
2,1,1,5
3,1,2,3
4,2,0,4
5,2,1,4
6,3,0,5
7,3,3,2
8,4,0,1
9,4,3,4


In [15]:
num_users = 7
num_items = 4
emb_size = 3

user_emb = nn.Embedding(num_users, emb_size)
item_emb = nn.Embedding(num_items, emb_size)
users = torch.LongTensor(df_t_e.userId.values)
items = torch.LongTensor(df_t_e.movieId.values)

In [16]:
U = user_emb(users)
V = item_emb(items)

In [17]:
U

tensor([[ 1.1170, -1.0140,  1.2938],
        [ 1.1170, -1.0140,  1.2938],
        [-0.0605,  0.0552, -1.6647],
        [-0.0605,  0.0552, -1.6647],
        [-0.2963, -0.8262,  0.9770],
        [-0.2963, -0.8262,  0.9770],
        [-0.9644,  0.6538, -0.0902],
        [-0.9644,  0.6538, -0.0902],
        [-0.3646,  0.2454,  0.8715],
        [-0.3646,  0.2454,  0.8715],
        [-1.9348, -1.7277,  1.1224],
        [-0.0920,  0.5678, -0.2213],
        [-0.0920,  0.5678, -0.2213]], grad_fn=<EmbeddingBackward0>)

In [18]:
# element wise multiplication
U*V 

tensor([[-1.1114, -0.2468, -0.1292],
        [-0.8543, -0.7340,  1.3431],
        [ 0.0463,  0.0400, -1.7282],
        [-0.0533, -0.0575,  3.4864],
        [ 0.2948, -0.2011, -0.0976],
        [ 0.2266, -0.5981,  1.0143],
        [ 0.9596,  0.1591,  0.0090],
        [ 0.8909,  0.7360, -0.0092],
        [ 0.3628,  0.0597, -0.0871],
        [ 0.3368,  0.2762,  0.0893],
        [ 1.7874, -1.9449,  0.1150],
        [ 0.0704,  0.4110, -0.2297],
        [ 0.0850,  0.6392, -0.0227]], grad_fn=<MulBackward0>)

In [19]:
# what we want is a dot product per row
(U*V).sum(1) 

tensor([-1.4875, -0.2452, -1.6419,  3.3756, -0.0038,  0.6429,  1.1277,  1.6177,
         0.3354,  0.7024, -0.0425,  0.2516,  0.7015], grad_fn=<SumBackward1>)

## Training MF model

In [20]:
num_users = len(df_train.userId.unique())
num_items = len(df_train.movieId.unique())
print(num_users, num_items) 

610 8998


In [21]:
model = MF(num_users, num_items, emb_size=100) # .cuda() if you have a GPU

In [22]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """Train `model` on `df_train` with Adam, then print the validation loss.

    Every epoch is one full-batch gradient step: the whole training set is
    passed through the model at once and the MSE against the true ratings is
    minimised. The training loss of each epoch is printed, and `test_loss`
    is called once after the last epoch.

    Args:
        model: module called as `model(users, items)` that returns predictions.
        epochs: number of full-batch optimisation steps.
        lr: Adam learning rate.
        wd: Adam weight decay.
        unsqueeze: if True, the rating targets get a trailing dimension of
            size 1 (via `unsqueeze(1)`) so that they match models whose
            output has that extra dimension.

    A new optimizer is created on every call, so Adam's running statistics
    are not carried over between successive calls.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    # The training tensors are identical for every epoch, so build them once
    # instead of converting the DataFrame columns again on each iteration.
    users = torch.LongTensor(df_train.userId.values) # .cuda()
    items = torch.LongTensor(df_train.movieId.values) #.cuda()
    ratings = torch.FloatTensor(df_train.rating.values) #.cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item())
    test_loss(model, unsqueeze)

In [23]:
# Here is what unsqueeze does
ratings = torch.FloatTensor(df_train.rating.values)
print(ratings.shape)
ratings = ratings.unsqueeze(1) # .cuda()
print(ratings.shape)

torch.Size([80450])
torch.Size([80450, 1])


In [24]:
def test_loss(model, unsqueeze=False):
    """Print the MSE of `model` on the validation set `df_val`.

    Args:
        model: module called as `model(users, items)` that returns predictions.
        unsqueeze: if True, the rating targets get a trailing dimension of
            size 1 (via `unsqueeze(1)`) so that they match models whose
            output has that extra dimension.

    The model is switched to evaluation mode and is left in that mode when
    the function returns.
    """
    model.eval()
    users = torch.LongTensor(df_val.userId.values) #.cuda()
    items = torch.LongTensor(df_val.movieId.values) #.cuda()
    ratings = torch.FloatTensor(df_val.rating.values) #.cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    # No backward pass happens here, so skip building the autograd graph.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [25]:
train_epocs(model, epochs=10, lr=0.1)

12.913342475891113
4.85593318939209
2.586555004119873
3.106139659881592
0.8501330614089966
1.8202000856399536
2.657374143600464
2.137739658355713
1.0914249420166016
0.9741576910018921
test loss 1.848 


In [26]:
train_epocs(model, epochs=15, lr=0.01)

1.6423803567886353
1.0045005083084106
0.7117718458175659
0.6605576276779175
0.7252231240272522
0.8034783601760864
0.8432655930519104
0.8353168964385986
0.7931990027427673
0.7377158403396606
0.6878089308738708
0.6556392908096313
0.644519567489624
0.6495621204376221
0.6609945297241211
test loss 0.821 


In [27]:
train_epocs(model, epochs=15, lr=0.01)

0.669021725654602
0.6312077641487122
0.6386494636535645
0.6140164136886597
0.6051818132400513
0.6135246157646179
0.6111860871315002
0.5963952541351318
0.5844637155532837
0.5826217532157898
0.5836178064346313
0.5786370635032654
0.5679815411567688
0.5577439069747925
0.5514679551124573
test loss 0.759 


## MF with bias

In [28]:
class MF_bias(nn.Module):
    """Matrix-factorization model with per-user and per-item bias terms.

    The prediction for a (user, item) pair is the dot product of the two
    embedding rows plus the user's bias plus the item's bias. The biases are
    stored as embedding tables of width 1.
    """

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # Factor tables start with small values drawn uniformly between 0 and 0.05 ...
        for factors in (self.user_emb, self.item_emb):
            factors.weight.data.uniform_(0, 0.05)
        # ... and the bias tables with values drawn between -0.01 and 0.01.
        for bias in (self.user_bias, self.item_bias):
            bias.weight.data.uniform_(-0.01, 0.01)

    def forward(self, u, v):
        """Return one predicted score per (u[i], v[i]) pair of ids."""
        interaction = (self.user_emb(u) * self.item_emb(v)).sum(1)
        # squeeze() drops the width-1 dimension of the bias lookups.
        user_offset = self.user_bias(u).squeeze()
        item_offset = self.item_bias(v).squeeze()
        return interaction + user_offset + item_offset

In [29]:
model = MF_bias(num_users, num_items, emb_size=100) #.cuda()

In [30]:
train_epocs(model, epochs=10, lr=0.05, wd=1e-5)

12.914627075195312
9.154275894165039
4.386416912078857
1.1581398248672485
2.4697792530059814
3.7438340187072754
2.4486465454101562
1.0782543420791626
0.8159571886062622
1.3182907104492188
test loss 2.070 


In [31]:
train_epocs(model, epochs=10, lr=0.01, wd=1e-5)

1.8937915563583374
1.325338363647461
0.9353333711624146
0.7449634671211243
0.7222675681114197
0.7771481871604919
0.8226391673088074
0.8216954469680786
0.7813025116920471
0.7275145649909973
test loss 0.798 


In [32]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-5)

0.6853029131889343
0.6710862517356873
0.6591898798942566
0.6494365930557251
0.6416147351264954
0.6354848146438599
0.6307964324951172
0.6273032426834106
0.624774158000946
0.6230003833770752
test loss 0.751 


Note that these models are sensitive to the weight initialization, the optimization algorithm, and the regularization.

## Neural Network Model

In [33]:
# Note: here the embeddings are concatenated rather than multiplied together, so the user and item embeddings could potentially have different sizes.
# We could probably get better results by continuing to tune the regularization.
    
class CollabFNet(nn.Module):
    """Neural collaborative-filtering model.

    The user and item embeddings are concatenated, passed through ReLU and
    dropout, then through a hidden linear layer with ReLU and a final linear
    layer that outputs a single value per pair.
    """

    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # The first linear layer sees both embeddings side by side.
        self.lin1 = nn.Linear(2 * emb_size, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)

    def forward(self, u, v):
        """Return predictions with a trailing dimension of size 1."""
        pair = torch.cat((self.user_emb(u), self.item_emb(v)), dim=1)
        hidden = self.drop1(F.relu(pair))
        hidden = F.relu(self.lin1(hidden))
        return self.lin2(hidden)

In [34]:
model = CollabFNet(num_users, num_items, emb_size=100) #.cuda()

In [35]:
train_epocs(model, epochs=15, lr=0.05, wd=1e-6, unsqueeze=True) 

15.304405212402344
1.4252458810806274
3.3766684532165527
1.1237949132919312
1.846782922744751
2.031071424484253
1.3096376657485962
0.8670371770858765
1.2798718214035034
1.4013832807540894
0.9251265525817871
0.7453456521034241
0.9489248991012573
1.0741623640060425
0.9302994608879089
test loss 0.792 


In [36]:
train_epocs(model, epochs=10, lr=0.01, wd=1e-6, unsqueeze=True)

0.7211838960647583
0.854150116443634
0.7030884027481079
0.7071071863174438
0.7589870691299438
0.7321251034736633
0.6817824244499207
0.675032913684845
0.7032181024551392
0.7077311277389526
test loss 0.764 


In [37]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

0.6819716095924377
0.6692086458206177
0.6658874750137329
0.665412962436676
0.6664057374000549
0.6691492199897766
0.6686155796051025
0.6676936149597168
0.6640618443489075
0.6635606288909912
test loss 0.743 


In [38]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

0.6622158288955688
0.6650509238243103
0.6615883111953735
0.6618887186050415
0.6632617115974426
0.6617304086685181
0.6634445190429688
0.6614481806755066
0.6623141169548035
0.662236213684082
test loss 0.742 
