# Training neural factorization models on the MovieLens dataset
> Training MF, MF+bias, and MLP models on the MovieLens-100k dataset in PyTorch

- toc: false
- badges: true
- comments: true
- categories: [Pytorch, Movie, MF, MLP, RecoChef]
- author: "<a href='https://github.com/yanneta/pytorch-tutorials'>Yannet</a>"
- image:

In [1]:
!pip install -q git+https://github.com/sparsh-ai/recochef.git

  Installing build dependencies ... done
  Getting requirements to build wheel ... done
    Preparing wheel metadata ... done
  Building wheel for recochef (PEP 517) ... done


In [13]:
import torch
import torch.nn.functional as F

from recochef.datasets.synthetic import Synthetic
from recochef.datasets.movielens import MovieLens
from recochef.preprocessing.split import chrono_split
from recochef.preprocessing.encode import label_encode as le
from recochef.models.factorization import MF, MF_bias
from recochef.models.dnn import CollabFNet

In [15]:
# Alternative data source (disabled): synthetic implicit-feedback interactions
# synt = Synthetic()
# df = synt.implicit()

# Load the MovieLens interactions and, in the same step, rename the RATING
# column to EVENT so the frame follows the implicit-feedback naming convention.
movielens = MovieLens()
df = movielens.load_interactions().rename(columns={'RATING': 'EVENT'})

In [16]:
# drop duplicates
# (removes fully identical rows so a repeated interaction is not counted twice)
df = df.drop_duplicates()

# chronological split
# ratio=0.8 sends 80% of the interactions to the train frame (80000 rows in the
# recorded output); the remainder becomes the validation frame.
# NOTE(review): the recorded output suggests the split is done per user (every user
# appears in both frames, with later timestamps in validation), and min_rating=10
# presumably sets a minimum number of interactions per user — confirm both against
# recochef's chrono_split.
df_train, df_valid = chrono_split(df, ratio=0.8, min_rating=10)
# Show both frames, each followed by a 100-character '=' rule.
print(f"Train set:\n\n{df_train}\n{'='*100}\n")
print(f"Validation set:\n\n{df_valid}\n{'='*100}\n")

Train set:

       USERID  ITEMID  EVENT  TIMESTAMP
59972       1     168    5.0  874965478
92487       1     172    5.0  874965478
74577       1     165    5.0  874965518
48214       1     156    4.0  874965556
22971       1     166    5.0  874965677
...       ...     ...    ...        ...
98752     943     139    1.0  888640027
89336     943     426    4.0  888640027
80660     943     720    1.0  888640048
93177     943      80    2.0  888640048
87415     943      53    3.0  888640067

[80000 rows x 4 columns]

Validation set:

       USERID  ITEMID  EVENT  TIMESTAMP
10508       1     208    5.0  878542960
83307       1       3    4.0  878542960
8976        1      12    5.0  878542960
78171       1      58    4.0  878542960
9811        1     201    3.0  878542960
...       ...     ...    ...        ...
81005     943     450    1.0  888693158
92536     943     227    1.0  888693158
95003     943     230    1.0  888693158
94914     943     229    2.0  888693158
92880     943     234   

In [17]:
# label encoding
# Called without `maps`, `le` returns (encoded frame, mapping): the train frame
# defines the id -> index mapping for users and for items.
df_train, uid_maps = le(df_train, col='USERID')
df_train, iid_maps = le(df_train, col='ITEMID')
# Called with `maps`, `le` returns only the frame: the validation ids are encoded
# with the mappings learned on the train frame.
# NOTE(review): the recorded validation output shows ITEMID as floats (e.g. 341.0),
# possibly a side effect of items that never occur in the train frame — confirm how
# `le` treats unseen ids before these values are used as lookup indices.
df_valid = le(df_valid, col='USERID', maps=uid_maps)
df_valid = le(df_valid, col='ITEMID', maps=iid_maps)

# Disabled: would map implicit event names (click/add/purchase) to numeric weights.
# # event implicit to rating conversion
# event_weights = {'click':1, 'add':2, 'purchase':4}
# event_maps = dict({'EVENT_TO_IDX':event_weights})
# df_train = le(df_train, col='EVENT', maps=event_maps)
# df_valid = le(df_valid, col='EVENT', maps=event_maps)

# Show the encoded frames, each followed by a 100-character '=' rule.
print(f"Processed Train set:\n\n{df_train}\n{'='*100}\n")
print(f"Processed Validation set:\n\n{df_valid}\n{'='*100}\n")

Processed Train set:

       USERID  ITEMID  EVENT  TIMESTAMP
59972       0       0    5.0  874965478
92487       0       1    5.0  874965478
74577       0       2    5.0  874965518
48214       0       3    4.0  874965556
22971       0       4    5.0  874965677
...       ...     ...    ...        ...
98752     942     933    1.0  888640027
89336     942     990    4.0  888640027
80660     942     643    1.0  888640048
93177     942     155    2.0  888640048
87415     942     166    3.0  888640067

[80000 rows x 4 columns]

Processed Validation set:

       USERID  ITEMID  EVENT  TIMESTAMP
10508       0   341.0    5.0  878542960
83307       0   983.0    4.0  878542960
8976        0   425.0    5.0  878542960
78171       0   639.0    4.0  878542960
9811        0   490.0    3.0  878542960
...       ...     ...    ...        ...
81005     942   314.0    1.0  888693158
92536     942   154.0    1.0  888693158
95003     942   183.0    1.0  888693158
94914     942   176.0    2.0  888693158
9288

In [18]:
# Count the distinct user and item ids in each split.
# The train-side counts are later passed to the model constructors.
num_users, num_items = (
    len(df_train[column].unique()) for column in ('USERID', 'ITEMID')
)
num_users_t, num_items_t = (
    len(df_valid[column].unique()) for column in ('USERID', 'ITEMID')
)

print(f"There are {num_users} users and {num_items} items in the train set.\n{'='*100}\n")
print(f"There are {num_users_t} users and {num_items_t} items in the validation set.\n{'='*100}\n")

There are 943 users and 1613 items in the train set.

There are 943 users and 1429 items in the validation set.



In [19]:
# training and testing related helper functions
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """Train `model` with full-batch Adam on the global `df_train` frame.

    Every epoch is a single gradient step over the whole train frame. The MSE
    training loss is printed after each step, and once training is finished the
    validation loss is reported through `test_loss`.

    Args:
        model: module called as `model(users, items)` that returns predictions
            comparable to the EVENT column.
        epochs: number of full-batch optimisation steps.
        lr: Adam learning rate.
        wd: Adam weight decay.
        unsqueeze: when True, the target vector gets a trailing dimension of
            size 1 before the loss is computed (this notebook enables it for
            CollabFNet; presumably that model outputs shape (N, 1)).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()

    # The training data is identical for every epoch, so the tensors are built
    # once up front instead of being re-created inside the loop.
    users = torch.LongTensor(df_train.USERID.values)  # .cuda()
    items = torch.LongTensor(df_train.ITEMID.values)  # .cuda()
    ratings = torch.FloatTensor(df_train.EVENT.values)  # .cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item())
    test_loss(model, unsqueeze)

def test_loss(model, unsqueeze=False):
    """Print the MSE of `model` on the global validation frame `df_valid`.

    The model is switched to evaluation mode and is left in that mode when the
    function returns.

    Args:
        model: module called as `model(users, items)` that returns predictions
            comparable to the EVENT column.
        unsqueeze: when True, the target vector gets a trailing dimension of
            size 1 before the loss is computed (must match the setting used
            for training).
    """
    model.eval()
    users = torch.LongTensor(df_valid.USERID.values)  # .cuda()
    items = torch.LongTensor(df_valid.ITEMID.values)  # .cuda()
    ratings = torch.FloatTensor(df_valid.EVENT.values)  # .cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    # Evaluation needs no gradients; skipping autograd bookkeeping saves memory
    # and time without changing the computed loss.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [20]:
# training MF model
# Seed torch's global RNG first so that re-running this cell reproduces the same
# parameter initialisation and therefore the same loss trace (assumes the model
# draws its initial weights from torch's RNG).
torch.manual_seed(42)
model = MF(num_users, num_items, emb_size=100) # .cuda() if you have a GPU
print("Training MF model:\n")
train_epocs(model, epochs=10, lr=0.1)
print(f"\n{'='*100}\n")

Training MF model:

13.594555854797363
5.292399883270264
2.558849573135376
3.584117889404297
1.0360910892486572
1.9875222444534302
2.920832633972168
2.4130148887634277
1.2886441946029663
1.112807273864746
test loss 2.085 




In [21]:
# training MF with bias model
# Seed torch's global RNG first so that re-running this cell reproduces the same
# parameter initialisation and therefore the same loss trace (assumes the model
# draws its initial weights from torch's RNG).
torch.manual_seed(42)
model = MF_bias(num_users, num_items, emb_size=100) #.cuda()
print("Training MF+bias model:\n")
train_epocs(model, epochs=10, lr=0.05, wd=1e-5)
print(f"\n{'='*100}\n")

Training MF+bias model:

13.59664535522461
9.730958938598633
4.798837184906006
1.3603413105010986
2.697232723236084
4.214857578277588
2.871798276901245
1.3329992294311523
0.9624974727630615
1.459389328956604
test loss 2.269 




In [22]:
# training MLP model
# Seed torch's global RNG first so that re-running this cell reproduces the same
# parameter initialisation (and any other torch-side randomness in the model) and
# therefore the same loss trace (assumes the model relies on torch's RNG).
torch.manual_seed(42)
model = CollabFNet(num_users, num_items, emb_size=100) #.cuda()
print("Training MLP model:\n")
# unsqueeze=True adds a trailing dimension to the targets for this model.
train_epocs(model, epochs=15, lr=0.05, wd=1e-6, unsqueeze=True)
print(f"\n{'='*100}\n")

Training MLP model:

12.962654113769531
1.4028953313827515
15.373563766479492
2.177295207977295
2.6291019916534424
5.752542495727539
6.88251256942749
6.2746357917785645
4.8090314865112305
3.095308303833008
1.6791961193084717
1.1257785558700562
1.678966760635376
2.615834951400757
2.80102276802063
test loss 2.559 


