In [1]:
# changing core directory
import os, sys
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path:
    sys.path.append(dir1)
os.chdir('..')

In [2]:
import os
import logging
import warnings
import yaml

from replay_benchmarks.utils.conf import load_config, seed_everything
from replay_benchmarks import TrainRunner, InferRunner

import logging
import os
from abc import ABC, abstractmethod
from typing import Tuple

import pandas as pd
from rs_datasets import MovieLens, Netflix

from replay.data import (
    FeatureHint,
    FeatureInfo,
    FeatureSchema,
    FeatureSource,
    FeatureType,
    Dataset,
)
from replay.preprocessing.filters import MinCountFilter, NumInteractionsFilter
from replay.splitters import TimeSplitter
from replay.utils import DataFrameLike
from replay.data.nn import (
    SequenceTokenizer,
    SequentialDataset,
    TensorFeatureSource,
    TensorSchema,
    TensorFeatureInfo,
)

from torch.utils.data import DataLoader
from replay.models.nn.sequential.sasrec import (
    SasRecTrainingDataset,
    SasRecValidationDataset,
    SasRecPredictionDataset,
)
from replay.models.nn.sequential.bert4rec import (
    Bert4RecTrainingDataset,
    Bert4RecValidationDataset,
    Bert4RecPredictionDataset,
)

In [3]:
config_dir = "./replay_benchmarks/configs"
base_config_path = os.path.join(config_dir, "config.yaml")
config = load_config(base_config_path, config_dir)

seed_everything(config["env"]["SEED"])

In [4]:
config = config
model_name = config["model"]["name"]
dataset_name = config["dataset"]["name"]
dataset_cfg = config["dataset"]
model_cfg = config["model"]["params"]
mode = config["mode"]["name"]
item_column = dataset_cfg["feature_schema"]["item_column"]
user_column = dataset_cfg["feature_schema"]["query_column"]
timestamp_column = dataset_cfg["feature_schema"]["timestamp_column"]
tokenizer = None
interactions = None
user_features = None
item_features = None


os.environ["CUDA_DEVICE_ORDER"] = config["env"]["CUDA_DEVICE_ORDER"]
os.environ["OMP_NUM_THREADS"] = config["env"]["OMP_NUM_THREADS"]
os.environ["CUDA_VISIBLE_DEVICES"] = config["env"]["CUDA_VISIBLE_DEVICES"]
os.environ["KAGGLE_USERNAME"] = "recsysaccelerate"
os.environ["KAGGLE_KEY"] = "6363e91b656fea576c39e4f55dcc1d00"

In [5]:
embedding_dim = model_cfg["training_params"]["embedding_dim"]
item_feature_name = "item_id_seq"

tensor_schema =  TensorSchema(
    TensorFeatureInfo(
        name=item_feature_name,
        is_seq=True,
        feature_type=FeatureType.CATEGORICAL,
        feature_sources=[
            TensorFeatureSource(
                FeatureSource.INTERACTIONS,
                item_column,
            )
        ],
        feature_hint=FeatureHint.ITEM_ID,
        embedding_dim=embedding_dim,
    )
)

tensor_schema

<replay.data.nn.schema.TensorSchema at 0x7f3ec32f0a50>

In [6]:
DATASET_MAPPINGS = {
    "zvuk": {"kaggle": "alexxl/zvuk-dataset", "file": "zvuk-interactions.parquet"},
    "megamarket": {"kaggle": "alexxl/megamarket", "file": "megamarket.parquet"},
}
SUPPORTED_RS_DATASETS = ["movielens", "netflix"]

def _download_dataset(
    data_path: str, dataset_name: str, interactions_file: str
):
    """Download dataset from Kaggle or rs_datasets."""
    if dataset_name in DATASET_MAPPINGS:
        _download_kaggle_dataset(data_path, dataset_name, interactions_file)
    elif any(ds in dataset_name for ds in SUPPORTED_RS_DATASETS):
        _download_rs_dataset(data_path, dataset_name, interactions_file)
    else:
        raise ValueError(f"Unsupported dataset: {dataset_name}")

def _download_kaggle_dataset(
    data_path: str, dataset_name: str, interactions_file: str
) -> None:
    from kaggle.api.kaggle_api_extended import KaggleApi

    """Download dataset from Kaggle."""
    kaggle_info = DATASET_MAPPINGS[dataset_name]
    kaggle_dataset = kaggle_info["kaggle"]
    raw_data_file = os.path.join(data_path, kaggle_info["file"])

    os.environ.setdefault("KAGGLE_USERNAME", "recsysaccelerate")
    os.environ.setdefault("KAGGLE_KEY", "6363e91b656fea576c39e4f55dcc1d00")

    api = KaggleApi()
    api.authenticate()

    api.dataset_download_files(kaggle_dataset, path=data_path, unzip=True)
    logging.info(f"Dataset downloaded and extracted to {data_path}")

    interactions = pd.read_parquet(raw_data_file)
    interactions[timestamp_column] = interactions[
        timestamp_column
    ].astype("int64")
    if dataset_name == "megamarket":
        interactions = interactions[interactions.event == 2] # take only purchase
    interactions.to_parquet(interactions_file)

def _download_rs_dataset(
    data_path: str, dataset_name: str, interactions_file: str
) -> None:
    """Download dataset from rs_datasets."""
    if "movielens" in dataset_name:
        version = dataset_name.split("_")[1]
        movielens = MovieLens(version=version, path=data_path)
        interactions = movielens.ratings
        interactions = interactions[interactions[dataset_cfg["feature_schema"]["rating_column"]] > dataset_cfg["preprocess"]["min_rating"]]
    elif dataset_name == "netflix":
        netflix = Netflix(path=data_path)
        interactions = pd.concat([netflix.train, netflix.test]).fillna(5).reset_index(drop=True)
        interactions = interactions[interactions[dataset_cfg["feature_schema"]["rating_column"]] > dataset_cfg["preprocess"]["min_rating"]]
        interactions = interactions.sort_values(by=[user_column, timestamp_column])
        interactions[timestamp_column] += interactions.groupby([user_column, timestamp_column]).cumcount()
    else:
        raise ValueError(f"Unsupported dataset: {dataset_name}")

    interactions[timestamp_column] = interactions[
        timestamp_column
    ].astype("int64")
    interactions.to_parquet(interactions_file)


In [7]:
dataset_name = dataset_cfg["name"]
data_path = dataset_cfg["path"]
interactions_file = os.path.join(data_path, "interactions.parquet")

if not os.path.exists(interactions_file):
    _download_dataset(data_path, dataset_name, interactions_file)

interactions = pd.read_parquet(interactions_file)
interactions.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
3,1,3408,4,978300275
4,1,2355,5,978824291
6,1,1287,5,978302039
7,1,2804,5,978300719


In [8]:
print(interactions.groupby(user_column).size().min(), interactions.groupby(item_column).size().min())

interactions = MinCountFilter(
    num_entries=dataset_cfg["preprocess"]["min_users_per_item"],
    groupby_column=item_column,
).transform(interactions)

interactions = MinCountFilter(
    num_entries=dataset_cfg["preprocess"]["min_items_per_user"],
    groupby_column=user_column,
).transform(interactions)


interactions.groupby(user_column).size().min(), interactions.groupby(item_column).size().min()

1 1


(4, 5)

In [9]:
dataset_cfg["preprocess"]["global_split_ratio"]

0.1

In [10]:
user_column

'user_id'

In [11]:
splitter = TimeSplitter(
    time_threshold=dataset_cfg["preprocess"]["global_split_ratio"],
    drop_cold_users=True,
    drop_cold_items=True,
    item_column=item_column,
    query_column=user_column,
    timestamp_column=timestamp_column,
)

# train_events, validation_events, validation_gt, test_events, test_gt = (
#     _split_data(splitter, interactions)
# )

test_events, test_gt = splitter.split(interactions)
validation_events, validation_gt = splitter.split(test_events)
train_events = validation_events

test_gt = test_gt[test_gt[item_column].isin(train_events[item_column])]
test_gt = test_gt[test_gt[user_column].isin(train_events[user_column])]


# Limit number of gt events in val and test only if max_num_test_interactions is not null
max_test_interactions = dataset_cfg["preprocess"]["max_num_test_interactions"]
print(f"Distribution of seq_len in validation:\n{validation_gt.groupby(user_column)[item_column].agg('count').describe()}.")
print(f"Distribution of seq_len in test:\n{test_gt.groupby(user_column)[item_column].agg('count').describe()}.")
if max_test_interactions is not None:
    
    validation_gt = NumInteractionsFilter(
        num_interactions=max_test_interactions,
        first=True,
        query_column=user_column,
        item_column=item_column,
        timestamp_column=timestamp_column,
    ).transform(validation_gt)
    print(f"Distribution of seq_len in validation  after filtering:\n{validation_gt.groupby(user_column)[item_column].agg('count').describe()}.")

    test_gt = NumInteractionsFilter(
        num_interactions=max_test_interactions,
        first=True,
        query_column=user_column,
        item_column=item_column,
        timestamp_column=timestamp_column,
    ).transform(test_gt)


Distribution of seq_len in validation:
count    448.000000
mean      24.837054
std       49.389768
min        1.000000
25%        2.000000
50%        6.500000
75%       26.000000
max      656.000000
Name: item_id, dtype: float64.
Distribution of seq_len in test:
count    999.000000
mean      43.796797
std       59.198925
min        1.000000
25%        7.000000
50%       23.000000
75%       57.000000
max      585.000000
Name: item_id, dtype: float64.


In [12]:
test_gt.head()

Unnamed: 0,user_id,item_id,rating,timestamp
140725,904,2348,4,977939424
140721,904,858,4,977939543
140660,904,260,5,977939543
140668,904,2866,5,977939573
140696,904,1193,4,977939573


In [13]:
train_events['timestamp'].max() <= validation_events['timestamp'].min()

False

In [14]:
timestamp_column, item_column, user_column

('timestamp', 'item_id', 'user_id')

In [15]:
def test_splitting(events, gt, name=''):
    if events[timestamp_column].max() > gt[timestamp_column].min():
        print("Problem with time points in", name)
    if len(set(gt[user_column].unique().tolist()) - set(events[user_column].unique().tolist())) > 0:
        print("Problem with cold users in", name)
    if len(set(gt[item_column].unique().tolist()) - set(events[item_column].unique().tolist())) > 0:
        print("Problem with cold items in", name)


test_splitting(train_events, test_gt, "train events, test gt")
test_splitting(train_events, validation_gt, "train events, valid gt")

In [16]:
set([1, 2, 3])  - set({1, 2, 3, 4})

set()

In [17]:
def prepare_feature_schema(is_ground_truth: bool) -> FeatureSchema:
    """Prepare the feature schema based on whether ground truth is needed."""
    base_features = FeatureSchema(
        [
            FeatureInfo(
                column=user_column,
                feature_hint=FeatureHint.QUERY_ID,
                feature_type=FeatureType.CATEGORICAL,
            ),
            FeatureInfo(
                column=item_column,
                feature_hint=FeatureHint.ITEM_ID,
                feature_type=FeatureType.CATEGORICAL,
            ),
        ]
    )
    if is_ground_truth:
        return base_features

    return base_features + FeatureSchema(
        [
            FeatureInfo(
                column=timestamp_column,
                feature_type=FeatureType.NUMERICAL,
                feature_hint=FeatureHint.TIMESTAMP,
            ),
        ]
    )

feature_schema = prepare_feature_schema(is_ground_truth=False)
ground_truth_schema = prepare_feature_schema(is_ground_truth=True)

train_dataset = Dataset(
    feature_schema=feature_schema,
    interactions=train_events,
    query_features=user_features,
    item_features=item_features,
    check_consistency=True,
    categorical_encoded=False,
)
validation_dataset = Dataset(
    feature_schema=feature_schema,
    interactions=validation_events,
    query_features=user_features,
    item_features=item_features,
    check_consistency=True,
    categorical_encoded=False,
)
validation_gt_dataset = Dataset(
    feature_schema=ground_truth_schema,
    interactions=validation_gt,
    check_consistency=True,
    categorical_encoded=False,
)
test_dataset = Dataset(
    feature_schema=feature_schema,
    interactions=test_events,
    query_features=user_features,
    item_features=item_features,
    check_consistency=True,
    categorical_encoded=False,
)
test_gt_dataset = Dataset(
    feature_schema=ground_truth_schema,
    interactions=test_gt,
    check_consistency=True,
    categorical_encoded=False,
)


In [20]:
tokenizer = SequenceTokenizer(
    tensor_schema, allow_collect_to_master=True, handle_unknown_rule="drop"
)
tokenizer.fit(train_dataset)

seq_train_dataset = tokenizer.transform(train_dataset)
# seq_validation_dataset, seq_validation_gt = _prepare_sequential_validation(
#     validation_dataset, validation_gt
# )

seq_validation_dataset = tokenizer.transform(validation_dataset)
seq_validation_gt = tokenizer.transform(
    validation_gt_dataset, [tensor_schema.item_id_feature_name]
)

seq_validation_dataset, seq_validation_gt = SequentialDataset.keep_common_query_ids(
            seq_validation_dataset, seq_validation_gt
)


test_query_ids = test_gt_dataset.query_ids
test_query_ids_np = tokenizer.query_id_encoder.transform(test_query_ids)[
    user_column
].values
seq_test_dataset = tokenizer.transform(test_dataset).filter_by_query_id(
    test_query_ids_np
)
# seq_test_dataset = self._prepare_sequential_test(test_dataset, test_gt)




In [21]:
dataset_mapping = {
    "sasrec": (
        SasRecTrainingDataset,
        SasRecValidationDataset,
        SasRecPredictionDataset,
    ),
    "bert4rec": (
        Bert4RecTrainingDataset,
        Bert4RecValidationDataset,
        Bert4RecPredictionDataset,
    ),
}

if model_name.lower() in dataset_mapping:
    TrainingDataset, ValidationDataset, PredictionDataset = dataset_mapping[
        model_name.lower()
    ]
else:
    raise ValueError(
        f"Unsupported model type for dataloaders: {model_name}"
    )

common_params = {
    "batch_size": model_cfg["training_params"]["batch_size"],
    "num_workers": model_cfg["training_params"]["num_workers"],
    "pin_memory": True,
}

train_dataloader = DataLoader(
    dataset=TrainingDataset(
        seq_train_dataset,
        max_sequence_length=model_cfg["model_params"]["max_seq_len"],
    ),
    shuffle=True,
    **common_params,
)
val_dataloader = DataLoader(
    dataset=ValidationDataset(
        seq_validation_dataset,
        seq_validation_gt,
        seq_train_dataset,
        max_sequence_length=model_cfg["model_params"]["max_seq_len"],
    ),
    **common_params,
)
prediction_dataloader = DataLoader(
    dataset=PredictionDataset(
        seq_test_dataset,
        max_sequence_length=model_cfg["model_params"]["max_seq_len"],
    ),
    **common_params,
)
