In [1]:
import os
import copy

import torch
import pandas as pd
import lightning as L
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window

from replay.metrics.torch_metrics_builder import metrics_to_df
from replay.data import (
    FeatureHint,
    FeatureSource,
    FeatureType,
)
from replay.data.nn import (
    TensorFeatureInfo,
    TensorFeatureSource,
    TensorSchema,
)
from replay.preprocessing.label_encoder import LabelEncoder, LabelEncodingRule, SequenceEncodingRule
from replay.metrics import MAP, OfflineMetrics, Precision, Recall
from replay.splitters import LastNSplitter, RatioSplitter

# Fix seed to ensure reproducibility
L.seed_everything(42)

import warnings
warnings.filterwarnings("ignore")

Seed set to 42


In [2]:
user_features = pd.read_csv("./data/ml1m_users.dat", sep="\t", names=["user_id", "gender", "age", "occupation", "zip_code"])
item_features = pd.read_csv("./data/ml1m_items.dat", sep="\t", names=["item_id", "title", "genres"])
interactions = pd.read_csv("./data/ml1m_ratings.dat", sep="\t", names=["user_id", "item_id","rating","timestamp"])

In [3]:
interactions["timestamp"] = interactions["timestamp"].astype("int64")
interactions = interactions.sort_values(by="timestamp")
interactions["timestamp"] = interactions.groupby("user_id").cumcount()
interactions

Unnamed: 0,user_id,item_id,rating,timestamp
1000138,6040,858,4,0
1000153,6040,2384,4,1
999873,6040,593,5,2
1000192,6040,2019,5,3
1000007,6040,1961,4,4
...,...,...,...,...
825793,4958,2399,1,446
825438,4958,1407,5,447
825731,4958,2634,3,448
825724,4958,3264,4,449


In [4]:
splitter = RatioSplitter(
    test_size=0.1,
    divide_column="user_id",
    query_column="user_id",
    drop_cold_users=True, 
    drop_cold_items=True,
)

raw_test_events, raw_test_gt = splitter.split(interactions)
raw_val_events, raw_val_gt = splitter.split(raw_test_events)
raw_train_events = raw_val_events

In [5]:
def align_data_by_train_items(data, train_events):
    data = data[data["item_id"].isin(train_events["item_id"])]
    return data

raw_test_events = align_data_by_train_items(raw_test_events, raw_train_events)
raw_test_gt = align_data_by_train_items(raw_test_gt, raw_train_events)

In [6]:
tokenizer = LabelEncoder([
    LabelEncodingRule("user_id", default_value="last"),
    LabelEncodingRule("item_id", default_value="last")
])

raw_train_events = raw_train_events.sort_values(by="item_id", ascending=True)
train_events = tokenizer.fit_transform(raw_train_events)
val_events = tokenizer.transform(raw_val_events)
val_gt = tokenizer.transform(raw_val_gt)
test_events = tokenizer.transform(raw_test_events)
test_gt = tokenizer.transform(raw_test_gt)

In [7]:
from replay.data.nn.utils import groupby_sequences


def bake_data(df: pd.DataFrame):
    grouped_interactions = groupby_sequences(
        events=df,
        groupby_col="user_id",
        sort_col="timestamp"
    )

    return grouped_interactions

train_events = bake_data(train_events)
val_events = bake_data(val_events)
val_gt = bake_data(val_gt)
test_events = bake_data(test_events)
test_gt = bake_data(test_gt)

In [None]:
val_gt_to_join = val_gt.loc[:, ["user_id", "item_id"]].rename(columns={"item_id": "ground_truth"})
train_events_to_join = train_events.loc[:, ["user_id", "item_id"]].rename(columns={"item_id": "train"})

val_events = (val_events
              .merge(val_gt_to_join, how="inner", on="user_id")
              .merge(train_events_to_join, how="inner", on="user_id")
)
val_events.head(5)

Unnamed: 0,user_id,rating,timestamp,item_id,ground_truth,train
0,0,"[4, 5, 3, 5, 5, 5, 1, 1, 5, 4, 2, 4, 2, 3, 3, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[1763, 1126, 1938, 1095, 1092, 2119, 163, 3218...","[1597, 2932, 848, 1155, 2150, 2638, 3098, 3101...","[1763, 1126, 1938, 1095, 1092, 2119, 163, 3218..."
1,1,"[3, 3, 3, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[464, 930, 3012, 2692, 1994, 1238, 722, 2970, ...","[591, 31, 2350, 2866, 432, 3235, 2386, 181, 32...","[464, 930, 3012, 2692, 1994, 1238, 722, 2970, ..."
2,2,"[4, 4, 4, 3, 5, 4, 5, 4, 5, 4, 4, 5, 4, 4, 4, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[1102, 528, 3438, 1229, 1763, 1909, 589, 1127,...","[521, 2215, 251, 2673, 202, 793, 3463, 1964, 2...","[1102, 528, 3438, 1229, 1763, 1909, 589, 1127,..."
3,3,"[5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 5, 3, 5, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[3156, 1192, 769, 1763, 1471, 1095, 2917, 3473...","[210, 1189, 2146, 920, 143, 2074, 1012, 3060, ...","[3156, 1192, 769, 1763, 1471, 1095, 2917, 3473..."
4,4,"[5, 4, 3, 3, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[837, 1131, 621, 1592, 1080, 3100, 2180, 3096,...","[3250, 1184, 1165, 1688]","[837, 1131, 621, 1592, 1080, 3100, 2180, 3096,..."


In [9]:
data_dir = "./preprocessed/"
os.makedirs(data_dir, exist_ok=True)

TRAIN_PATH = os.path.join(data_dir, "train.parquet")
VAL_PATH = os.path.join(data_dir, "val.parquet")
TEST_PATH =  os.path.join(data_dir, "test.parquet")

train_events.to_parquet(TRAIN_PATH)
val_events.to_parquet(VAL_PATH)
test_events.to_parquet(TEST_PATH)

In [10]:
import pickle

FEATURE_MAPPING_PATH = os.path.join(data_dir, "feature_mapping.pickle")

# feature_mapping = {
#     **tokenizer.item_id_encoder.mapping,
#     **(tokenizer.item_features_encoder.mapping if tokenizer.item_features_encoder is not None else {}),
#     **(tokenizer.interactions_encoder.mapping if tokenizer.interactions_encoder is not None else {}),
# }
feature_mapping = {
    "item_id" : tokenizer.rules[1]._mapping
}
with open(FEATURE_MAPPING_PATH, "wb") as f:  #
    pickle.dump(feature_mapping, f)

In [None]:
item_features = item_features[item_features["item_id"].isin(raw_train_events["item_id"])]

item_features_encoded = tokenizer.rules[1].transform(item_features)
item_features_encoded

Unnamed: 0,title,genres,item_id
0,Toy Story (1995),Animation|Children's|Comedy,0
1,Jumanji (1995),Adventure|Children's|Fantasy,1
2,Grumpier Old Men (1995),Comedy|Romance,2
3,Waiting to Exhale (1995),Comedy|Drama,3
4,Father of the Bride Part II (1995),Comedy,4
...,...,...,...
3878,Meet the Parents (2000),Comedy,3664
3879,Requiem for a Dream (2000),Drama,3665
3880,Tigerland (2000),Drama,3666
3881,Two Family House (2000),Drama,3667


In [17]:
PATH_ENCODED_FEATURES = os.path.join(data_dir, "item_features_encoded.parquet")
item_features_encoded.loc[:, ["item_id"]].to_parquet(PATH_ENCODED_FEATURES)

In [34]:
EMBEDDING_DIM = 192
ITEM_FEATURE_NAME = "item_id"

NUM_UNIQUE_ITEMS = len(tokenizer.mapping["item_id"])

tensor_schema = TensorSchema(
    [
        TensorFeatureInfo(
            "item_id",
            is_seq=True,
            feature_type=FeatureType.CATEGORICAL,
            embedding_dim=EMBEDDING_DIM,
            padding_value=NUM_UNIQUE_ITEMS,
            cardinality=NUM_UNIQUE_ITEMS+1,  # taking into account padding
            feature_hint=FeatureHint.ITEM_ID,
            feature_sources=[TensorFeatureSource(FeatureSource.ITEM_FEATURES, "item_id")]
        ),
        # TensorFeatureInfo(
        #     "genres",
        #     is_seq=True,
        #     feature_type=FeatureType.CATEGORICAL_LIST,
        #     embedding_dim=EMBEDDING_DIM,
        #     padding_value=NUM_UNIQUE_GENRE_VALUES,
        #     cardinality=NUM_UNIQUE_GENRE_VALUES+1,
        #     feature_sources=[TensorFeatureSource(FeatureSource.ITEM_FEATURES, "genres")]
        # ),
        # TensorFeatureInfo(
        #     "title",
        #     is_seq=True,
        #     feature_type=FeatureType.NUMERICAL_LIST,
        #     tensor_dim=TITLE_EMB_DIM,
        #     embedding_dim=EMBEDDING_DIM,
        #     padding_value=0,
        #     feature_sources=[TensorFeatureSource(FeatureSource.ITEM_FEATURES, "title")]
        # )
    ]
)

In [38]:
from replay.nn.transforms import (
    GroupTransform,
    RenameTransform,
    NextTokenTransform,
    UnsqueezeTransform,
    UniformNegativeSamplingTransform
)

MAX_SEQ_LEN = 50
BATCH_SIZE = 64
SHIFT=1

TRANSFORMS = {
    "train": [
        NextTokenTransform(label_field="item_id", query_features="user_id", shift=SHIFT, out_feature_name="positive_labels"),
        RenameTransform({"user_id": "query_id", "item_id_mask": "padding_mask",  "positive_labels_mask": "target_padding_mask"}),
        UniformNegativeSamplingTransform(vocab_size=NUM_UNIQUE_ITEMS, num_negative_samples=500),
        UnsqueezeTransform("target_padding_mask", -1),
        UnsqueezeTransform("positive_labels", -1),
        GroupTransform({"feature_tensors": tensor_schema.names})
    ],
    "val": [
        RenameTransform({"user_id": "query_id", "item_id_mask": "padding_mask"}),
        GroupTransform({"feature_tensors": tensor_schema.names}),
    ],
    "test": [
        RenameTransform({"user_id": "query_id", "item_id_mask": "padding_mask"}),
        GroupTransform({"feature_tensors": tensor_schema.names})
    ]
}

shared_meta = {
    "user_id": {},
    "item_id": {"shape": MAX_SEQ_LEN+1, "padding": tensor_schema["item_id"].padding_value},
    "genres": {"shape": [MAX_SEQ_LEN+1, MAX_LEN_GENRE], "padding": tensor_schema["genres"].padding_value},
    "title": {"shape": [MAX_SEQ_LEN+1, TITLE_EMB_DIM], "padding": tensor_schema["title"].padding_value},
}

METADATA = {
    "train": copy.deepcopy(shared_meta),
    "val": {
        **copy.deepcopy(shared_meta),
        "train": {
            "shape": MAX_SEQ_LEN,
            "padding": tensor_schema["item_id"].padding_value
        },
        "ground_truth": {
            "shape": MAX_SEQ_LEN,
            "padding": tensor_schema["item_id"].padding_value
        },
    },
    "test": copy.deepcopy(shared_meta)
}

In [39]:
from replay.data.nn import ParquetModule

parquet_datamodule = ParquetModule(
    train_path=TRAIN_PATH,
    val_path=VAL_PATH,
    test_path=TEST_PATH,
    batch_size=BATCH_SIZE,
    metadata=METADATA,
    transforms=TRANSFORMS
)

In [40]:
from replay.nn.sequential import TwoTower
from replay.nn import DefaultAttentionMask, SequenceEmbedding, SumAggregator, SwiGLUEncoder, ConcatAggregator
from replay.nn.loss import CESampled, CE, BCE, BCESampled, LogInCE, LogInCESampled, LogOutCE
from replay.nn.sequential import SasRecAggregator, SasRecTransformerLayer, DiffTransformerLayer
import torch

In [23]:
excluded_features = None
excluded_features = list(set(excluded_features or []))
excluded_features

[]

In [42]:
from replay.nn.sequential import ItemReference

i = ItemReference(tensor_schema, PATH_ENCODED_FEATURES)
i

<replay.nn.sequential.twotower.model.ItemReference at 0x7fc598e12a10>

In [None]:
twotower = TwoTower.build_original(schema=tensor_schema,
                                   embedding_dim=EMBEDDING_DIM,
                                    item_reference_path=PATH_ENCODED_FEATURES
                                   )

In [21]:
NUM_HEADS = 2
NUM_BLOCKS = 2
DROPOUT = 0.2
FEATURE_NAMES = ["item_id"]

# common_aggregator = SumAggregator(embedding_dim=EMBEDDING_DIM)
common_aggregator = ConcatAggregator(
    input_embedding_dims=[EMBEDDING_DIM],
    output_embedding_dim=EMBEDDING_DIM)

excluded_features = None
twotower = TwoTower(
    schema=tensor_schema,
    embedder=SequenceEmbedding(
        schema=tensor_schema,
        categorical_list_feature_aggregation_method="sum",
        excluded_features=excluded_features,
    ),
    attn_mask_builder=DefaultAttentionMask(
        reference_feature_name=tensor_schema.item_id_feature_name,
        num_heads=NUM_HEADS,
    ),
    query_tower_feature_names=FEATURE_NAMES,
    item_tower_feature_names=FEATURE_NAMES,
    query_embedding_aggregator=SasRecAggregator(
        embedding_aggregator=common_aggregator,
        max_sequence_length=MAX_SEQ_LEN,
        dropout=DROPOUT,
    ),
    item_embedding_aggregator=common_aggregator,
    query_encoder=DiffTransformerLayer(
        embedding_dim=EMBEDDING_DIM,
        num_heads=NUM_HEADS,
        num_blocks=NUM_BLOCKS,
    ),
    query_tower_output_normalization=torch.nn.LayerNorm(EMBEDDING_DIM),
    item_encoder=SwiGLUEncoder(embedding_dim=EMBEDDING_DIM, hidden_dim=2*EMBEDDING_DIM),
    
    item_reference_path=PATH_ENCODED_FEATURES,
    loss=CESampled(padding_idx=tensor_schema.item_id_features.item().padding_value),
    context_merger=None,
)


In [25]:
from replay.nn.lightning import LightningModule
from replay.nn.optimizer_utils import FatOptimizerFactory, LambdaLRSchedulerFactory

model = LightningModule(twotower, 
                        lr_scheduler_factory=LambdaLRSchedulerFactory(warmup_steps=6),
                        optimizer_factory=FatOptimizerFactory(),
)

In [25]:
from replay.nn.lightning.callbacks import ComputeMetricsCallback
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger


checkpoint_callback = ModelCheckpoint(
    dirpath=".checkpoints",
    save_top_k=1,
    verbose=True,
    monitor="recall@10",
    mode="max",
)


validation_metrics_callback = ComputeMetricsCallback(
    metrics=["map", "ndcg", "recall"],
    ks=[1, 5, 10, 20],
    item_count=len(tokenizer.mapping["item_id"]),
)

csv_logger = CSVLogger(save_dir=".logs/train")

trainer = L.Trainer(
    max_epochs=3,
    callbacks=[checkpoint_callback, validation_metrics_callback],
    logger=csv_logger,
)

trainer.fit(model, datamodule=parquet_datamodule)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/evtsinovnik/.conda/envs/replay/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/evtsinovnik/RePlay/examples/.checkpoints exists and is not empty.

  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | SasRec | 1.8 M  | train
-----------------------------------------
1.8 M     Trainable params
0         Non-trainable params
1.8 M     Total params
7.006     Total estimated model params size (MB)
40        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/evtsinovnik/.conda/envs/replay/lib/python3.11/site-packages/lightning/pytorch/utilities/data.py:122: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 95: 'recall@10' reached 0.01660 (best 0.01660), saving model to '/home/evtsinovnik/RePlay/examples/.checkpoints/epoch=0-step=95-v34.ckpt' as top 1


k              1        10        20         5
map     0.083984  0.033989  0.024495  0.048034
ndcg    0.083984  0.083884  0.081045  0.085554
recall  0.001680  0.016602  0.031758  0.008555



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 190: 'recall@10' reached 0.01961 (best 0.01961), saving model to '/home/evtsinovnik/RePlay/examples/.checkpoints/epoch=1-step=190.ckpt' as top 1


k              1        10        20         5
map     0.107422  0.039323  0.029221  0.052272
ndcg    0.107422  0.098296  0.096111  0.097699
recall  0.002148  0.019609  0.037969  0.009727



Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 285: 'recall@10' reached 0.02148 (best 0.02148), saving model to '/home/evtsinovnik/RePlay/examples/.checkpoints/epoch=2-step=285.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=3` reached.


k              1        10        20         5
map     0.091797  0.043252  0.031479  0.055690
ndcg    0.091797  0.104995  0.098759  0.103680
recall  0.001836  0.021484  0.038945  0.010703



In [30]:
best_model = LightningModule.load_from_checkpoint(checkpoint_callback.best_model_path, model=twotower)

In [43]:
from replay.nn.lightning.postprocessors import SeenItemsFilter
from replay.nn.lightning.callbacks import ComputeMetricsCallback, PandasTopItemsCallback, PolarsTopItemsCallback, SparkTopItemsCallback, TorchTopItemsCallback, HiddenStatesCallback


csv_logger = CSVLogger(save_dir=".logs/test", name="GPT_example")

TOPK = [1, 2, 3]

postprocessor = SeenItemsFilter(
        seen_path=TEST_PATH,
        item_count=tensor_schema[ITEM_FEATURE_NAME].cardinality-1,
        query_column="user_id",
        item_column=tensor_schema.item_id_feature_name
    )
postprocessors = [postprocessor]

pandas_prediction_callback = PandasTopItemsCallback(
    top_k=max(TOPK),
    query_column="user_id",
    item_column="item_id",
    rating_column="score",
    postprocessors=postprocessors,
)
metrics_callback = ComputeMetricsCallback(
    metrics=["map", "ndcg", "recall", "coverage"],
    ks=TOPK,
    postprocessors=postprocessors,
    item_count=tensor_schema[ITEM_FEATURE_NAME].cardinality,
)
trainer = L.Trainer(
    callbacks=[
        pandas_prediction_callback,
             metrics_callback
             ],
    logger=csv_logger,
    inference_mode=True
)

trainer.predict(best_model, datamodule=parquet_datamodule, return_predictions=False)

pandas_res = pandas_prediction_callback.get_result()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/evtsinovnik/.conda/envs/replay/lib/python3.11/site-packages/lightning/pytorch/utilities/data.py:122: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.


Predicting: |          | 0/? [00:00<?, ?it/s]

item_embeddings torch.Size([3669, 192])
logits torch.Size([64, 3669])
item_embeddings torch.Size([3669, 192])
logits torch.Size([64, 3669])
item_embeddings torch.Size([3669, 192])
logits torch.Size([64, 3669])
item_embeddings torch.Size([3669, 192])
logits torch.Size([64, 3669])
item_embeddings torch.Size([3669, 192])
logits torch.Size([64, 3669])
item_embeddings torch.Size([3669, 192])
logits torch.Size([64, 3669])
item_embeddings torch.Size([3669, 192])
logits torch.Size([64, 3669])
item_embeddings torch.Size([3669, 192])
logits torch.Size([64, 3669])
item_embeddings torch.Size([3669, 192])
logits torch.Size([64, 3669])
item_embeddings torch.Size([3669, 192])
logits torch.Size([64, 3669])
item_embeddings torch.Size([3669, 192])
logits torch.Size([64, 3669])
item_embeddings torch.Size([3669, 192])
logits torch.Size([64, 3669])
item_embeddings torch.Size([3669, 192])
logits torch.Size([64, 3669])
item_embeddings torch.Size([3669, 192])
logits torch.Size([64, 3669])
item_embeddings torc

In [48]:
c.get_result()

tensor([[[-1.7201,  0.4378, -2.8051,  ..., -0.6522,  1.7160, -0.3824],
         [-1.7601,  0.4055, -2.7832,  ..., -0.6392,  1.6754, -0.4011],
         [-1.7452,  0.4201, -2.7750,  ..., -0.6306,  1.6847, -0.3832],
         ...,
         [-1.7222,  0.4265, -2.7614,  ..., -0.6046,  1.6727, -0.4025],
         [-1.7226,  0.4232, -2.7611,  ..., -0.6055,  1.6823, -0.3897],
         [-1.7128,  0.4193, -2.7668,  ..., -0.6021,  1.6806, -0.3935]],

        [[-1.7079,  0.4461, -2.7777,  ..., -0.5705,  1.6361, -0.3696],
         [-1.6855,  0.4683, -2.8039,  ..., -0.5824,  1.6170, -0.3704],
         [-1.6991,  0.4424, -2.7856,  ..., -0.5903,  1.6562, -0.3784],
         ...,
         [-1.7243,  0.4363, -2.7662,  ..., -0.6099,  1.6528, -0.3922],
         [-1.7236,  0.4325, -2.7662,  ..., -0.6046,  1.6672, -0.3896],
         [-1.7205,  0.4386, -2.7662,  ..., -0.6152,  1.6567, -0.3775]],

        [[-1.7079,  0.4461, -2.7777,  ..., -0.5705,  1.6361, -0.3696],
         [-1.6896,  0.4542, -2.7940,  ..., -0

In [42]:
pandas_res = tokenizer.inverse_transform(pandas_res)
pandas_res

DataFrame[score: double, user_id: bigint, item_id: bigint]

In [41]:
test_gt.loc[:, ["user_id", "item_id"]].explode("item_id")

Unnamed: 0,user_id,item_id
0,0,209
0,0,101
0,0,837
0,0,322
0,0,1431
...,...,...
6038,6038,2872
6038,6038,2933
6039,6039,1159
6039,6039,1123


In [42]:
result_metrics = OfflineMetrics(
    [Recall(TOPK), Precision(TOPK), MAP(TOPK)],
    query_column="user_id",
    rating_column="score"
)(pandas_res, test_gt.loc[:, ["user_id", "item_id"]].explode("item_id"))

metrics_to_df(result_metrics)



k,1,2,3
MAP,0.00596,0.003849,0.003155
Precision,0.00596,0.004719,0.004912
Recall,0.000409,0.000592,0.000918
