In [100]:
import pandas as pd
from rectools import Columns, ExternalIds
import tqdm
import torch
import os
import numpy as np
from lightning_fabric import seed_everything
from pytorch_lightning import Trainer, LightningModule
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Callback
from collections import OrderedDict
from pathlib import Path
import typing as tp

In [151]:
from rectools.model_selection import TimeRangeSplitter, LastNSplitter, cross_validate
from rectools.dataset import Interactions
from rectools.models import model_from_params
from rectools.models.nn.transformers.sasrec import SASRecModel
from rectools.models.nn.transformers.bert4rec import BERT4RecModel
from rectools.dataset import Dataset
from rectools.metrics import (
    calc_metrics,
    NDCG,
    AvgRecPopularity,
    CatalogCoverage,
    Recall,
    Serendipity,
)

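## Dataset

Each dataset cell below assigns `ratings` and `TEST_SPLIT_SIZE`, so run only the cell for the dataset you want to use.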

### MovieLens 1M

In [152]:
# MovieLens 1M ships as a "::"-separated file without a header row, so the
# column names are supplied explicitly and the python parsing engine is used.
ratings = pd.read_csv(
    "../Datasets/ratings.dat",
    sep="::",
    names=["userId", "movieId", "rating", "timestamp"],
    engine="python",
)

# Keep rows with rating >= 1, drop the rating value itself and switch to the
# rectools column names.
ratings = (
    ratings.loc[ratings["rating"] >= 1]
    .drop(columns=["rating"])
    .rename(
        columns={
            "userId": Columns.User,
            "movieId": Columns.Item,
            "timestamp": Columns.Datetime,
        }
    )
)
# Raw timestamps are parsed as seconds; every interaction gets the same weight.
ratings[Columns.Datetime] = pd.to_datetime(ratings[Columns.Datetime], unit="s")
ratings[Columns.Weight] = 1

# Consumed by `TimeRangeSplitter(test_size=...)` in the time-based split cell.
TEST_SPLIT_SIZE = "7D"

### MovieLens 20M

In [None]:
# MovieLens 20M is a regular CSV with a header row.
ratings = pd.read_csv("../Datasets/ratings.csv")

# Keep rows with rating >= 0, drop the rating value itself and switch to the
# rectools column names.
ratings = (
    ratings.loc[ratings["rating"] >= 0]
    .drop(columns=["rating"])
    .rename(
        columns={
            "userId": Columns.User,
            "movieId": Columns.Item,
            "timestamp": Columns.Datetime,
        }
    )
)
# Raw timestamps are parsed as seconds; every interaction gets the same weight.
ratings[Columns.Datetime] = pd.to_datetime(ratings[Columns.Datetime], unit="s")
ratings[Columns.Weight] = 1

# Consumed by `TimeRangeSplitter(test_size=...)` in the time-based split cell.
TEST_SPLIT_SIZE = "60D"

### KION

In [None]:
# KION interactions: only the date column is renamed here; a constant weight
# is added and the frame is then reduced to the `Columns.Interactions` columns.
ratings = (
    pd.read_csv("../Datasets/interactions.csv")
    .rename(columns={"last_watch_dt": Columns.Datetime})
    .assign(**{Columns.Weight: 1})
    [Columns.Interactions]
)

# Consumed by `TimeRangeSplitter(test_size=...)` in the time-based split cell.
TEST_SPLIT_SIZE = "14D"

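# Split dataset

Both splitter cells below assign `splitter`, so run only the one for the split strategy you want to use.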

### Time-based split

In [153]:
# Time-based hold-out: the test fold is sized by TEST_SPLIT_SIZE, which is set
# in the dataset cell.
splitter = TimeRangeSplitter(
    n_splits=1,  # for cross-validation choose more splits
    test_size=TEST_SPLIT_SIZE,
    # All three test-fold filters are switched on.
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

### Leave-one-out split

In [None]:
# Leave-one-out hold-out: `n=1` interaction per user goes to the test fold.
splitter = LastNSplitter(
    n_splits=1,  # for cross-validation choose more splits
    n=1,
    # All three test-fold filters are switched on.
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

### Split

In [154]:
# Split the interactions with the selected splitter; only the first fold is used.
split_iterator = splitter.split(Interactions(ratings))
train_ids, test_ids, _ = next(iter(split_iterator))

# The splitter yields positional row indices, hence `.iloc`.
train = ratings.iloc[train_ids]
test = ratings.iloc[test_ids]

# Build the training dataset from the TRAIN part only. Constructing it from the
# full `ratings` frame leaks the held-out test interactions: the model would be
# fitted on them, and `filter_viewed=True` at recommendation time would remove
# exactly the items the test metrics are looking for.
train_dataset = Dataset.construct(train)

# Users to recommend for, and the item catalog known at training time.
test_users = test[Columns.User].unique()
catalog = train[Columns.Item].unique()


# Modification comparison

## Set seed

In [64]:
# Required for deterministic cuBLAS behaviour with CUDA >= 10.2.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

# Fix the random seeds (including dataloader workers), then make torch use
# deterministic algorithms.
seed_everything(42, workers=True)
torch.use_deterministic_algorithms(True)

Seed set to 42


## Determine model parameters

Here are options that should be chosen for each modification to work:

### Training objective
* Shifted sequence: choose model `SASRecModel`
    * data_preparator_type: rectools.models.nn.transformers.sasrec.SASRecDataPreparator
* MLM: choose model `BERT4RecModel`
    * data_preparator_type: rectools.models.nn.transformers.bert4rec.BERT4RecDataPreparator
* All action: choose model `BERT4RecModel`
    * data_preparator_type: modifications.objectives.all_action.AllActionDataPreparator
    * lightning_module_type: modifications.objectives.all_action.AllActionLightningModule
    * backbone_type: modifications.objectives.all_action.AllActionTransformerTorchBackbone
* Dense all action: choose model `SASRecModel`
    * data_preparator_type: modifications.objectives.dense_all_action.DenseAllActionDataPreparator

### Transformer layers
* SASRec: 
    * transformer_layers_type: rectools.models.nn.transformers.sasrec.SASRecTransformerLayers
* BERT4Rec: 
    * transformer_layers_type: rectools.models.nn.transformers.net_blocks.PreLNTransformerLayers
* ALBERT: 
    * transformer_layers_type: src.models.transformers.transformer_layers.albert.AlbertLayers

### Loss functions
* softmax:
    * loss: softmax
* bce:
    * loss: BCE
* gBCE:
    * loss: gBCE
* sampled softmax:
    * loss: sampled_softmax

### Negative sampling
* Sample uniformly from catalogue
    * negative_sampler_type: rectools.models.nn.transformers.negative_sampler.CatalogUniformSampler
* Sample uniformly from batch
    * negative_sampler_type: modifications.negative_sampling.in_batch.InBatchSampler
    * lightning_module_type: modifications.negative_sampling.in_batch.LogQLightningModule
* Sample mixed negatives from catalogue and batch in certain proportion
    * negative_sampler_type: modifications.negative_sampling.mixed.MixedSampler
    * negative_sampler_kwargs.ratio: 0.4 (change to any float in range [0, 1])

### Scoring
* Dot product
    * similarity_module_type: rectools.models.nn.transformers.similarity.DistanceSimilarityModule
* Cosine
    * similarity_module_type: rectools.models.nn.transformers.similarity.DistanceSimilarityModule
    * similarity_module_kwargs.distance: cosine



## Create model

### Define trainer

Specify the callbacks for early stopping (on validation loss or a recsys metric) and the function that builds the validation mask.

In [None]:
class BestModelLoad(Callback):
    """Callback that reloads a saved checkpoint into the module when fitting ends.

    The checkpoint is looked up at ``<trainer.log_dir>/checkpoints/<ckpt_path>.ckpt``,
    so ``ckpt_path`` must match the ``filename`` given to the checkpoint callback
    that writes it.

    Attributes:
        ckpt_path: Checkpoint file name including the ``.ckpt`` suffix.
        ckpt_full_path: Full path of the checkpoint that was loaded, or ``None``
            while no fit has finished yet.
    """

    def __init__(self, ckpt_path: str) -> None:
        """Store the checkpoint file name (given without the ``.ckpt`` suffix)."""
        super().__init__()
        self.ckpt_path = ckpt_path + ".ckpt"
        # Defined up front so the attribute can be read before `on_fit_end` runs.
        self.ckpt_full_path: tp.Optional[str] = None

    def on_fit_end(self, trainer, pl_module) -> None:
        """Load the checkpoint's ``state_dict`` into ``pl_module`` and record its path."""
        log_dir = trainer.log_dir
        ckpt_path = Path(log_dir) / "checkpoints" / self.ckpt_path
        # NOTE(review): `weights_only=False` unpickles arbitrary objects; this is
        # only appropriate because the checkpoint is expected to come from this
        # same training run.
        checkpoint = torch.load(ckpt_path, weights_only=False)
        pl_module.load_state_dict(checkpoint["state_dict"])
        self.ckpt_full_path = str(ckpt_path)

def get_trainer_func() -> Trainer:
    """Build the Lightning ``Trainer`` used to fit the model.

    The trainer checkpoints the minimum ``val_loss`` under the name
    ``best_val_loss``, stops early on ``val_loss`` with a patience of 20,
    reloads the ``best_val_loss`` checkpoint when fitting ends, and logs to
    a ``CSVLogger`` writing into ``test_logs``.

    Returns:
        A configured, deterministic ``Trainer``.
    """
    monitored_metric = "val_loss"
    # Shared by the checkpoint writer and the reload callback so they stay in sync.
    best_ckpt_name = "best_val_loss"

    checkpoint_cb = ModelCheckpoint(
        monitor=monitored_metric,
        mode="min",
        filename=best_ckpt_name,
    )
    early_stop_cb = EarlyStopping(
        monitor=monitored_metric,
        mode="min",
        patience=20,
        divergence_threshold=None,
    )
    reload_best_cb = BestModelLoad(best_ckpt_name)

    return Trainer(
        max_epochs=5,  # set to required value
        deterministic=True,
        enable_progress_bar=True,
        enable_model_summary=True,
        logger=CSVLogger("test_logs"),
        callbacks=[checkpoint_cb, reload_best_cb, early_stop_cb],
    )

In [142]:
def leave_one_out_mask_for_users(
    train, val_users
) -> np.ndarray:
    """Mark the most recent interaction of each validation user.

    Args:
        train: Interactions frame with at least the ``Columns.User`` and
            ``Columns.Datetime`` columns.
        val_users: Collection of user ids whose latest interaction is selected.

    Returns:
        Boolean array in the row order of ``train``; ``True`` for the row with
        the latest ``Columns.Datetime`` of every user listed in ``val_users``.
    """
    # Newest interactions first; the stable sort keeps the original order of ties.
    newest_first = train.sort_values(Columns.Datetime, ascending=False, kind="stable")
    # Position of each row within its user's history, 0 being the latest.
    recency_rank = newest_first.groupby(Columns.User, sort=False).cumcount()

    is_val_user = train[Columns.User].isin(val_users)
    is_latest = recency_rank == 0
    # `&` aligns both Series on the frame index, restoring the row order of `train`.
    return (is_val_user & is_latest).values

def get_val_mask_func(train: pd.DataFrame, n_val_users: int = 2048) -> np.ndarray:
    """Build the validation mask over the training interactions.

    The first ``n_val_users`` distinct users, in order of first appearance in
    ``train``, are taken as validation users and passed to
    ``leave_one_out_mask_for_users``.

    Args:
        train: Interactions frame containing the ``Columns.User`` column.
        n_val_users: Maximum number of validation users. Defaults to 2048,
            the value that was previously hard-coded.

    Returns:
        The boolean mask returned by ``leave_one_out_mask_for_users``.
    """
    users = train[Columns.User].unique()
    val_users = users[:n_val_users]
    return leave_one_out_mask_for_users(train, val_users=val_users)

### Define model

In [123]:
# Flat parameter dict handed to `model_from_params`. Component types are given
# as import paths; the dotted key sets `ratio` for the negative sampler.
example_model_parameters = {
    "cls": SASRecModel,
    "loss": "sampled_softmax",
    "transformer_layers_type": "rectools.models.nn.transformers.sasrec.SASRecTransformerLayers",
    "negative_sampler_type": "modifications.negative_sampling.mixed.MixedSampler",
    "negative_sampler_kwargs.ratio": 0.4,
    # Training hooks defined in the "Define trainer" section.
    "get_trainer_func": get_trainer_func,
    "get_val_mask_func": get_val_mask_func,
}

model = model_from_params(example_model_parameters)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


# Compute metrics

### Metrics

In [161]:
# Metrics at cutoff 10, keyed by the names used in the result tables.
metrics = OrderedDict()
metrics["recall@10"] = Recall(k=10)
metrics["ndcg@10"] = NDCG(k=10, divide_by_achievable=True)
metrics["arp@10"] = AvgRecPopularity(k=10, normalize=True)
metrics["serendipity@10"] = Serendipity(k=10)
metrics["coverage@10"] = CatalogCoverage(k=10, normalize=True)

### Compute cross-validation metrics

In [158]:
# Cross-validate the model on the training dataset with the same metrics and
# the same top-10 cutoff as the test evaluation.
cv_results = cross_validate(
    dataset=train_dataset,
    splitter=splitter,  # to split train data into actual train and validation
    models={"model": model},
    metrics=metrics,
    k=10,
    filter_viewed=True,
)

  unq_values = pd.unique(values)
  PydanticSerializationUnexpectedValue(Expected `str` - serialized value may not be as expected [input_value=('rectools.models.nn.item...net.CatFeaturesItemNet'), input_type=tuple])
  return self.__pydantic_serializer__.to_python(

  | Name        | Type                     | Params | Mode 
-----------------------------------------------------------------
0 | torch_model | TransformerTorchBackbone | 1.7 M  | train
-----------------------------------------------------------------
1.7 M     Trainable params
0         Non-trainable params
1.7 M     Total params
6.959     Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/Users/mayyaspirina/Desktop/vkrgrid/repo/prepare_vkr/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


                                                                           

/Users/mayyaspirina/Desktop/vkrgrid/repo/prepare_vkr/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/Users/mayyaspirina/Desktop/vkrgrid/repo/prepare_vkr/.venv/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (48) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 4: 100%|██████████| 48/48 [00:04<00:00, 10.79it/s, v_num=4]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 48/48 [00:04<00:00, 10.70it/s, v_num=4]


In [160]:
# Average every metric over the cross-validation rows; the split index and
# model name columns are bookkeeping and are dropped first.
# NOTE: the test-set cell below assigns `metric_results` again.
cv_metrics_frame = pd.DataFrame(cv_results["metrics"])
metric_results = (
    cv_metrics_frame.drop(columns=["i_split", "model"]).mean().to_dict()
)

### Compute metrics on test dataset

In [164]:
# Fit on the training dataset, then produce top-10 recommendations for the
# test users.
model.fit(train_dataset)

reco = model.recommend(
    users=test_users,
    dataset=train_dataset,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn",
)

  unq_values = pd.unique(values)
  PydanticSerializationUnexpectedValue(Expected `str` - serialized value may not be as expected [input_value=('rectools.models.nn.item...net.CatFeaturesItemNet'), input_type=tuple])
  return self.__pydantic_serializer__.to_python(

  | Name        | Type                     | Params | Mode 
-----------------------------------------------------------------
0 | torch_model | TransformerTorchBackbone | 1.7 M  | train
-----------------------------------------------------------------
1.7 M     Trainable params
0         Non-trainable params
1.7 M     Total params
6.959     Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode


                                                                           

/Users/mayyaspirina/Desktop/vkrgrid/repo/prepare_vkr/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/Users/mayyaspirina/Desktop/vkrgrid/repo/prepare_vkr/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/Users/mayyaspirina/Desktop/vkrgrid/repo/prepare_vkr/.venv/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (48) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for t

Epoch 4: 100%|██████████| 48/48 [00:04<00:00, 10.89it/s, v_num=6]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 48/48 [00:04<00:00, 10.79it/s, v_num=6]


In [126]:
# Score the recommendations against the held-out test interactions; the train
# part is passed as previous interactions and supplies the item catalog.
metric_results = calc_metrics(
    metrics=metrics,
    reco=reco,
    interactions=test,
    prev_interactions=train,
    catalog=train[Columns.Item].unique(),
)

### Save metrics

In [141]:
# Persist the metrics, one row per metric name, to the file "metrics".
(
    pd.DataFrame
    .from_dict(metric_results, orient="index")
    .to_csv("metrics")
)