In [None]:
import os
# GitHub coordinates of the repository to fetch
username = 'recspert'
repo = 'ITP-RecSys-2024'

# remove local directory if it already exists, so that the clone below
# starts from a clean state (`git clone` refuses a non-empty target directory)
if os.path.isdir(repo):
    !rm -rf {repo}

# IPython substitutes {username} and {repo} with the Python variables above
!git clone https://github.com/{username}/{repo}.git

In [None]:
!pip install --no-cache-dir --upgrade git+https://github.com/evfro/polara.git@develop#egg=polara

In [None]:
from collections import defaultdict

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, BatchSampler, SequentialSampler
import torch.nn.functional as F

from polara import get_movielens_data
from polara.lib.sampler import sample_element_wise
from polara.tools.random import random_seeds, seed_generator
from polara.preprocessing.dataframes import leave_one_out, reindex, sample_unseen_interactions
from ipypb import track

from dataprep import transform_indices
from rndutils import fix_torch_seed
from ann_metrics import metrics

# Preparing data

In [None]:
# Local copy of the MovieLens-1M archive. When the file is not present on this
# machine, pass None instead: polara's `get_movielens_data` then downloads the
# dataset itself.
ml1m_archive = "C:/Users/evfro/Downloads/ml-1m.zip"
if not os.path.isfile(ml1m_archive):
    ml1m_archive = None

data = get_movielens_data(ml1m_archive, include_time=True)
# Fixed seed: the 5% training sample (and the holdout that is later built from
# the remaining rows) must be identical on every Restart & Run All.
data_small = data.sample(frac=0.05, random_state=0)

In [None]:
# re-index users and movies (presumably to contiguous internal ids -- see
# `dataprep.transform_indices`); `data_index` is reused below for the holdout
training_, data_index = transform_indices(data_small, 'userid', 'movieid')
# order interactions by user first and by time within each user
training = training_.sort_values(by=['userid', 'timestamp'])

In [None]:
# field names of the user/item columns and the sizes of the two indices
data_description = {
    'users': data_index['users'].name,
    'items': data_index['items'].name,
    'n_users': len(data_index['users']),
    'n_items': len(data_index['items']),
}
data_description

# Neural Matrix Factorization Model

In [None]:
class NeuralMF(nn.Module):
    """Matrix-factorization scorer built on user and item embedding tables.

    The score of a (user, item) pair is the cosine similarity of the two
    embedding vectors, hence a value in [-1, 1].

    Args:
        user_num: number of rows in the user embedding table.
        item_num: number of rows in the item embedding table.
        embedding_dim: length of every embedding vector.
    """
    def __init__(self, user_num, item_num, embedding_dim):
        super().__init__()
        self.user_embeddings = nn.Embedding(user_num, embedding_dim)
        self.item_embeddings = nn.Embedding(item_num, embedding_dim)
        self.initialize()

    def initialize(self):
        """(Re-)draw both embedding tables from a zero-mean normal with std 0.01."""
        nn.init.normal_(self.user_embeddings.weight, std=0.01)
        nn.init.normal_(self.item_embeddings.weight, std=0.01)

    def forward(self, user, item):
        """Score (user, item) pairs.

        Args:
            user: integer tensor of user indices (any shape).
            item: integer tensor of item indices, same shape as `user`.

        Returns:
            1-D tensor with one cosine-similarity score per pair, in the
            flattened order of the inputs.
        """
        # Normalize along the embedding axis explicitly. The default `dim=1`
        # only coincides with the embedding axis for 1-D index tensors and
        # would disagree with the `-1` reduction below for any other shape.
        user_embedding = F.normalize(self.user_embeddings(user), dim=-1)
        item_embedding = F.normalize(self.item_embeddings(item), dim=-1)
        scores = torch.sum(user_embedding * item_embedding, dim=-1)
        return scores.reshape(-1)

## Data Loader

The task is to define an iterator for our dataset that will sweep through the observed data and also inject negative samples into it.

In [None]:
class NeuMFDataset(Dataset):
    """Map-style dataset of (user, item, label) triples with optional negative sampling.

    Args:
        observations: 2-D integer array; column 0 holds user indices and
            column 1 holds item indices of the observed interactions.
        n_users: number of rows of the user-item interaction matrix.
        n_items: number of columns of the user-item interaction matrix.
        n_samples: forwarded to polara's `sample_element_wise`; presumably the
            number of negatives drawn per observed interaction (TODO confirm).
            A falsy value disables negative sampling.
        seed: seed handed to both seed generators and to `fix_torch_seed`.
    """
    def __init__(self, observations, n_users, n_items, n_samples=None, seed=None):
        super().__init__()
        self.observations = observations
        self.n_users = n_users
        self.n_items = n_items
        # filled by `reset_dataset` with rows of (user, item, label)
        self.data = None
        self.seed = seed
        # data sampling initialization
        self.observations_mat = self.matrix_from_observations()
        self.n_samples = n_samples
        # two separate seed generators (both started from the same seed):
        # one feeds negative sampling, the other feeds shuffling
        self._sampler_state = seed_generator(self.seed)
        self._shuffle_state = seed_generator(self.seed)
        self.shuffle = True
        # build the first version of `self.data` right away
        self.reset_dataset()

    def reset_random_state(self):
        """Re-seed torch and send the original seed into both seed generators.

        NOTE(review): presumably this restarts both seed sequences; that depends
        on the `send` protocol of polara's `seed_generator` -- confirm.
        """
        fix_torch_seed(self.seed)
        self._sampler_state.send(self.seed)
        self._shuffle_state.send(self.seed)

    def reset_dataset(self):
        '''Rebuild `self.data`, drawing a fresh set of negative samples.

        This method will be used by pytorch Sampler object.
        Observed interactions get label 1 and sampled negatives get label 0;
        the rows are then shuffled in place when `self.shuffle` is set.
        '''
        if self.n_samples:
            # every call consumes the next seed, so negatives differ between calls
            negative_examples = self.sample_negatives(next(self._sampler_state))
            data = np.concatenate((self.observations, negative_examples), axis=0)
            labels = [1] * len(self.observations) + [0] * len(negative_examples)
        else:
            data = self.observations
            # NOTE(review): without negative sampling all observed interactions
            # are labelled 0 -- looks like a placeholder label; confirm it is
            # never used as a training target.
            labels = [0] * len(self.observations)

        # append the labels as a third column
        self.data = np.concatenate([data, np.array(labels)[:, np.newaxis]], axis=1)

        if self.shuffle:
            # row-wise in-place shuffle driven by the next shuffle seed
            random_state = np.random.RandomState(next(self._shuffle_state))
            random_state.shuffle(self.data)

    def matrix_from_observations(self):
        """Return a CSR matrix of shape (n_users, n_items) with a 1 for every observation."""
        vals = np.broadcast_to(1, len(self.observations))
        rows = self.observations[:, 0]
        cols = self.observations[:, 1]
        shape = (self.n_users, self.n_items)
        return csr_matrix((vals, (rows, cols)), shape=shape)

    def sample_negatives(self, entropy):
        """Draw negative (user, item) pairs with polara's element-wise sampler.

        Args:
            entropy: value passed to `random_seeds` to derive the seed sequence.

        Returns:
            2-D array whose column 0 is the user index and column 1 the sampled item.
        """
        # important note: negative samples must include holdout items as well,
        # otherwise the model will be provided with hints about ground truth
        samples = sample_element_wise(
            # performs "uniform" sampling
            indptr = self.observations_mat.indptr,
            indices = self.observations_mat.indices,
            n_cols = self.n_items,
            n_samples = self.n_samples,
            seed_seq = random_seeds(self.n_users, entropy=entropy)
        )
        # repeat every user index once per stored interaction of that user
        # (row lengths of the CSR matrix), then broadcast it to the shape of `samples`
        user_index = np.broadcast_to(
            np.repeat(
                np.arange(self.n_users),
                np.diff(self.observations_mat.indptr)
            )[:, np.newaxis],
            samples.shape
        )
        # all users followed by all items; the Fortran-order reshape turns the
        # first half into column 0 and the second half into column 1
        return np.concatenate([user_index.flat, samples.flat]).reshape(-1, 2, order='F')

    def __len__(self):
        """Number of rows currently held in `self.data`."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return row `idx` as a dict with int64 "users"/"items" and a float32 "labels" entry."""
        user, item, label = self.data[idx]
        output = {
            "users": np.int64(user),
            "items": np.int64(item),
            "labels": np.float32(label),
        }
        return output


class SamplerWithReset(RandomSampler):
    """Random sampler that asks its dataset to rebuild itself before every pass."""

    def __iter__(self):
        # refresh the dataset contents first, then defer to the regular
        # random-order iteration of the parent class
        dataset = self.data_source
        dataset.reset_dataset()
        return super().__iter__()

In [None]:
# keep only the user and item columns of the training interactions
observed_data = training[[data_description[field] for field in ('users', 'items')]]

# negative sampling is requested with n_samples=1 and seeded for reproducibility
train_dataset = NeuMFDataset(
    observations=observed_data.values,
    n_users=data_description['n_users'],
    n_items=data_description['n_items'],
    n_samples=1,
    seed=42,
)

In [None]:
# batches are assembled by the default collate_fn; the custom sampler
# makes the dataset rebuild itself at the start of every pass
train_dl = DataLoader(
    dataset=train_dataset,
    sampler=SamplerWithReset(train_dataset),
    batch_size=64,
    drop_last=False,
)

Remark: The default `collate_fn` function gathers batches entry by entry, which is an IO-bound procedure. Hence, it is more appropriate for ANNs with heavy computations, e.g., a CNN trained on an image dataset. In that case, the overhead of gathering a batch is small compared to the compute time.

However, in the recsys case with a simple Neural MF model, computations are lightweight and the time spent inside `collate_fn` dominates; see [this issue](https://github.com/pytorch/pytorch/issues/21645) for more details. A viable workaround for not-too-large datasets is to create a custom dataloader that doesn't rely on `collate_fn` and performs sampling more efficiently in a vectorized form. For example, see the [cofida](https://github.com/evfro/cofida) library.

In [None]:
# Fetch a single batch to sanity-check the loader. Unlike `for ...: break`,
# this fails loudly on an empty loader instead of leaving `batch` undefined
# (or stale from an earlier run).
batch = next(iter(train_dl))

## Training

In [None]:
def train(loader, model, optimizer, criterion, scheduler=None, show_progress=True):
    """Run one training epoch and return the per-batch loss values.

    Args:
        loader: iterable of batches; every batch is a mapping with the tensor
            entries "users", "items" and "labels".
        model: module called as `model(users, items)`.
        optimizer: optimizer over the model parameters.
        criterion: loss called as `criterion(predictions, labels)`.
        scheduler: optional learning-rate scheduler, stepped once per epoch.
        show_progress: wrap the loader into a progress bar when True.

    Returns:
        List of python floats, one loss value per processed batch.
    """
    model.train()

    # Batches are collated on the CPU, so they have to follow the model to
    # whatever device its parameters live on (e.g. after `model.cuda()`).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    if show_progress:
        loader = track(loader)

    losses = []
    for batch in loader:
        users = batch["users"]
        items = batch["items"]
        labels = batch["labels"]
        if device is not None:
            users, items, labels = (t.to(device) for t in (users, items, labels))

        optimizer.zero_grad()
        loss = criterion(model(users, items), labels)
        loss.backward()
        optimizer.step()
        # `.item()` already detaches and copies the scalar to the host
        losses.append(loss.item())

    if scheduler is not None:
        scheduler.step()
    return losses

In [None]:
# hyper-parameters of the experiment: embedding size, Adam step size, number of epochs
config = {
    'dim': 50,
    'learning_rate': 1e-3,
    'epochs': 2,
}

In [None]:
model = NeuralMF(
    user_num=data_description['n_users'],
    item_num=data_description['n_items'],
    embedding_dim=config['dim'],
)

# move the model to the GPU when one is available and the weights are not there yet
if torch.cuda.is_available() and not next(model.parameters()).is_cuda:
    model = model.cuda()

In [None]:
# With default arguments BCEWithLogitsLoss holds no tensors, so it needs no
# device placement: it runs on whatever device its inputs live on. This keeps
# the cell consistent with the guarded (CUDA-optional) model placement.
criterion = torch.nn.BCEWithLogitsLoss()

optimizer = torch.optim.Adam(
    model.parameters(),
    lr = config['learning_rate']
)

In [None]:
# plain training run; `losses` ends up holding the per-batch values of the last epoch
for epoch in track(range(config['epochs'])):
    losses = train(
        loader=train_dl,
        model=model,
        optimizer=optimizer,
        criterion=criterion,
        show_progress=True,
    )

In [None]:
# `losses` holds the per-batch values of the most recent epoch only;
# the trailing `;` suppresses the Axes repr in the cell output
pd.Series(losses).plot(
    title="Training loss (last epoch)",
    xlabel="batch",
    ylabel="BCE-with-logits loss",
);

## Validation

In [None]:
def validate(loader, model, top_k=(10,), show_progress=True):
    """Evaluate the model on scoring batches and aggregate ranking metrics.

    Args:
        loader: iterable of 2-D integer tensors; column 0 holds user indices
            and column 1 holds item indices.
        model: module called as `model(users, items)`.
        top_k: iterable of cut-off values to evaluate.
        show_progress: wrap the loader into a progress bar when True.

    Returns:
        Dict with "hr@k", "mrr@k" and "ndcg@k" (means over batches) and
        "cov@k" (size of the coverage set collected for that k) for every k.
    """
    model.eval()

    # Batches come from the CPU; send them to the device of the model parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    data = defaultdict(list)
    # One coverage set per cut-off. A single shared set would make every
    # "cov@k" report the union over all requested k values.
    coverage_sets = {k: set() for k in top_k}

    if show_progress:
        loader = track(loader)

    for batch in loader:
        if device is not None:
            batch = batch.to(device)
        users = batch[:, 0]
        items = batch[:, 1]
        with torch.no_grad():
            # NOTE(review): predictions stay on the model's device -- confirm
            # that `metrics` accepts tensors from that device.
            predictions = model(users, items)
            for k in top_k:
                hits, mrrs, dcgs = metrics(predictions, top_k=k, coverage_set=coverage_sets[k])
                data[f"hr@{k}"].append(hits)
                data[f"mrr@{k}"].append(mrrs)
                data[f"ndcg@{k}"].append(dcgs)

    output = {}
    for metric in ["hr", "mrr", "ndcg"]:
        for k in top_k:
            name = f"{metric}@{k}"
            output[name] = np.mean(data[name])

    for k in top_k:
        output[f"cov@{k}"] = len(coverage_sets[k])

    return output

In [None]:
# Build the evaluation holdout from the interactions that were NOT sampled
# into `data_small` (i.e. not used for training).
holdout = (
    data
    # discard every row that went into the training sample
    .drop(data_small.index)
    # presumably maps item ids onto the training item index and drops
    # unknown items -- confirm against polara's `reindex`
    .pipe(reindex, data_index['items'])
    # take the element at index 1 of the `leave_one_out` result
    # (presumably the held-out interactions -- confirm)
    .pipe(leave_one_out, random_state=0)[1]
    # same re-indexing, now for users
    .pipe(reindex, data_index['users'])
    .sort_values(data_description['users'])
)

In [None]:
# number of distinct values in every holdout column
holdout.nunique()

In [None]:
# number of unseen items to draw for ranking against the holdout item
# (passed as `n_random` to `sample_unseen_interactions`; presumably per user)
test_negative_samples = 99

In [None]:
# training and holdout interactions are combined and passed as the known
# interactions, together with the full range of item indices
unseen_data = sample_unseen_interactions(
    pd.concat([training, holdout], ignore_index=True),
    np.arange(data_description['n_items']),
    userid=data_description['users'],
    itemid=data_description['items'],
    n_random=test_negative_samples,
    random_state=0,
)

In [None]:
# For every test user, put the holdout item in front of that user's unseen
# items (assumes `unseen_data` is indexed by user id -- TODO confirm).
scoring_data_ = (
    holdout
    .set_index('userid')
    ['movieid']
    # element-wise combination: x is the holdout item, y the user's unseen items
    .combine(unseen_data, lambda x, y: np.r_[x, y])
)

In [None]:
# inspect the combined per-user candidate lists
scoring_data_

In [None]:
# check that holdout items are at the first position for each test user
# (`.str[0]` picks the first element of every per-user array)
assert (scoring_data_.str[0] == holdout.set_index('userid')['movieid']).all()

In [None]:
# Flatten to one (user, item) row per candidate: `explode` expands every
# per-user array into separate rows and `reset_index` turns the user index
# into a regular column before the conversion to an integer array.
scoring_data = scoring_data_.explode().reset_index().values.astype('intp')
scoring_data

In [None]:
# `scoring_dl` was never defined, so the notebook failed here on a fresh kernel.
# Every test user owns `test_negative_samples + 1` consecutive rows of
# `scoring_data` (holdout item first), so this batch size with no shuffling
# yields exactly one user per batch.
# NOTE(review): assumes `metrics` ranks the candidates of a single user per
# call, with the holdout item at position 0 -- confirm against `ann_metrics`.
scoring_dl = DataLoader(
    scoring_data,
    batch_size=test_negative_samples + 1,
    shuffle=False,
)

# fetch one batch to sanity-check the loader output
batch = next(iter(scoring_dl))

In [None]:
# shape of a single scoring batch
batch.shape

In [None]:
# alternate one training epoch with one evaluation pass and report the numbers
for epoch in track(range(config['epochs'])):
    losses = train(train_dl, model, optimizer, criterion, show_progress=True)
    scores = validate(scoring_dl, model, top_k=[10], show_progress=True)
    cur_metrics = dict(loss=np.mean(losses), **scores)
    log = f"Epoch: {epoch} | " + " | ".join(
        f'{name}: {value:.3f}' for name, value in cur_metrics.items()
    )
    print(log)