# Performance comparison of pipeline stages depending on the input data format (PySpark, Pandas, Polars)

Note that this example is designed for comparison purposes only. For more detailed information about the pipeline steps, see `09_sasrec_example.ipynb` and `10_bert4rec_example.ipynb`.

This example was executed on an Intel(R) Xeon(R) Gold 6248R (12 cores, HT: on, Turbo: on) running Ubuntu 20.04 LTS with 96 GB of total memory.  

In [1]:
import lightning as L
from torch.utils.data import DataLoader

from replay.metrics import OfflineMetrics, Recall, Precision, MAP, NDCG, HitRate, MRR
from replay.splitters import LastNSplitter
from replay.utils import get_spark_session, DataFrameLike
from replay.data import (
    FeatureHint,
    FeatureInfo,
    FeatureSchema,
    FeatureSource,
    FeatureType,
    Dataset,
)
from replay.models.nn.optimizer_utils import FatOptimizerFactory
from replay.models.nn.sequential.callbacks import (
    ValidationMetricsCallback,
    SparkPredictionCallback,
    PandasPredictionCallback, 
    PolarsPredictionCallback,
)
from replay.models.nn.sequential.postprocessors import RemoveSeenItems
from replay.data.nn import (
    SequenceTokenizer,
    SequentialDataset,
    TensorFeatureSource,
    TensorSchema,
    TensorFeatureInfo
)
from replay.models.nn.sequential import SasRec
from replay.models.nn.sequential.sasrec import (
    SasRecPredictionDataset,
    SasRecTrainingDataset,
    SasRecValidationDataset,
)

import pandas as pd
import polars as pl

In [None]:
spark_session = get_spark_session()

## Observe and load data

In [None]:
!pip install rs-datasets

In [4]:
from rs_datasets import MovieLens

# Load the "20m" variant of MovieLens through rs_datasets.
movielens = MovieLens("20m")
# Interaction log; the preview below shows the columns user_id, item_id, rating, timestamp.
interactions_pandas = movielens.ratings
# Item metadata; the preview below shows the columns item_id, title, genres.
item_features_pandas = movielens.items

In [5]:
interactions_pandas.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [6]:
item_features_pandas.head()

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Cast the raw timestamps to 64-bit integers before sorting on them.
interactions_pandas["timestamp"] = interactions_pandas["timestamp"].astype("int64")
# Put the whole interaction log into chronological order.
interactions_pandas = interactions_pandas.sort_values(by="timestamp")
# Replace the raw timestamp with a 0-based per-user counter: because the rows
# are already sorted by time, cumcount() numbers each user's interactions in
# chronological order.
interactions_pandas["timestamp"] = interactions_pandas.groupby("user_id").cumcount()
# Bare expression: displays the resulting frame as the cell output.
interactions_pandas

Unnamed: 0,user_id,item_id,rating,timestamp
4182421,28507,1176,4.0,0
18950979,131160,1079,3.0,0
18950936,131160,47,5.0,1
18950930,131160,21,3.0,2
12341178,85252,45,3.0,0
...,...,...,...,...
7819902,53930,118706,3.5,1429
2508834,16978,2093,3.5,707
12898546,89081,55232,3.5,1607
12898527,89081,52458,4.0,1608


In [None]:
# Convert the same two pandas frames (interactions, item features) to Spark ...
interactions_spark, item_features_spark = (
    spark_session.createDataFrame(pandas_frame)
    for pandas_frame in (interactions_pandas, item_features_pandas)
)

# ... and to Polars, so every backend under comparison starts from identical data.
interactions_polars, item_features_polars = (
    pl.from_pandas(pandas_frame)
    for pandas_frame in (interactions_pandas, item_features_pandas)
)

## Splitting comparison

In [6]:
# A single splitter instance is reused for the pandas, Spark and Polars frames below.
# NOTE(review): with N=1 and strategy="interactions" this presumably holds out the
# last interaction of every user_id group — confirm against the LastNSplitter docs.
splitter = LastNSplitter(
    N=1,
    divide_column="user_id",
    query_column="user_id",
    strategy="interactions",
)

In [None]:
%%timeit -n 2 -r 3

splitter.split(interactions_pandas)

18.3 s ± 29.7 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)

For a fair comparison, the PySpark computation graph has to be triggered explicitly (here with `count()`), because Spark evaluates transformations lazily.

In [None]:
%%timeit -n 2 -r 3

train, test = splitter.split(interactions_spark)
train.count(), test.count()

7.12 s ± 1.82 s per loop (mean ± std. dev. of 3 runs, 2 loops each)

In [None]:
%%timeit -n 2 -r 3

splitter.split(interactions_polars)

1.66 s ± 6.05 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)

In [7]:
# --- pandas ---
# First split: full log -> test events + test ground truth.
# Second split: test events -> train events + validation ground truth.
# The validation events are the very same frame as the train events.
raw_test_events_pandas, raw_test_gt_pandas = splitter.split(interactions_pandas)
raw_train_events_pandas, raw_validation_gt_pandas = splitter.split(raw_test_events_pandas)
raw_validation_events_pandas = raw_train_events_pandas


# --- pyspark ---
raw_test_events_spark, raw_test_gt_spark = splitter.split(interactions_spark)
raw_train_events_spark, raw_validation_gt_spark = splitter.split(raw_test_events_spark)
raw_validation_events_spark = raw_train_events_spark

# Cache every Spark frame and force its evaluation with count(), so that later
# timings do not include the lazily deferred split computation.
for _spark_frame in (
    raw_train_events_spark,
    raw_validation_events_spark,
    raw_validation_gt_spark,
    raw_test_events_spark,
    raw_test_gt_spark,
):
    _spark_frame.cache()
    _spark_frame.count()

# --- polars ---
raw_test_events_polars, raw_test_gt_polars = splitter.split(interactions_polars)
raw_train_events_polars, raw_validation_gt_polars = splitter.split(raw_test_events_polars)
raw_validation_events_polars = raw_train_events_polars

## Creating Dataset objects

In [8]:
def prepare_feature_schema(is_ground_truth: bool) -> FeatureSchema:
    """Build the feature schema describing an interactions frame.

    Args:
        is_ground_truth: If True, the schema contains only the ``user_id``
            (query id) and ``item_id`` (item id) categorical columns.
            Otherwise a numerical ``timestamp`` column is appended.

    Returns:
        The resulting ``FeatureSchema``.
    """
    query_id_feature = FeatureInfo(
        column="user_id",
        feature_hint=FeatureHint.QUERY_ID,
        feature_type=FeatureType.CATEGORICAL,
    )
    item_id_feature = FeatureInfo(
        column="item_id",
        feature_hint=FeatureHint.ITEM_ID,
        feature_type=FeatureType.CATEGORICAL,
    )
    id_schema = FeatureSchema([query_id_feature, item_id_feature])

    if not is_ground_truth:
        # Event frames additionally carry the ordering column.
        timestamp_feature = FeatureInfo(
            column="timestamp",
            feature_type=FeatureType.NUMERICAL,
            feature_hint=FeatureHint.TIMESTAMP,
        )
        return id_schema + FeatureSchema([timestamp_feature])

    return id_schema


def create_dataset_object(
    schema: FeatureSchema,
    interactions: DataFrameLike,
    item_features: DataFrameLike = None,
):
    """Wrap raw frames into a ``Dataset``.

    Args:
        schema: Feature schema describing the columns of the frames.
        interactions: Interactions frame (pandas, Spark or Polars).
        item_features: Optional item features frame; ``None`` when omitted.

    Returns:
        A ``Dataset`` built with ``check_consistency=True`` and
        ``categorical_encoded=False``.
    """
    dataset = Dataset(
        feature_schema=schema,
        interactions=interactions,
        item_features=item_features,
        check_consistency=True,
        categorical_encoded=False,
    )
    return dataset

In [9]:
# Event schema (with timestamp) and ground-truth schema (ids only).
schema, gt_schema = (
    prepare_feature_schema(is_ground_truth=ground_truth_flag)
    for ground_truth_flag in (False, True)
)

In [10]:
# Pandas-backed Dataset objects. Event splits use the full schema together with
# the item features; ground-truth splits use the id-only schema.
train_dataset_pandas = create_dataset_object(
    schema=schema,
    interactions=raw_train_events_pandas,
    item_features=item_features_pandas,
)
validation_dataset_pandas = create_dataset_object(
    schema=schema,
    interactions=raw_validation_events_pandas,
    item_features=item_features_pandas,
)
validation_gt_pandas = create_dataset_object(schema=gt_schema, interactions=raw_validation_gt_pandas)
test_dataset_pandas = create_dataset_object(
    schema=schema,
    interactions=raw_test_events_pandas,
    item_features=item_features_pandas,
)
test_gt_pandas = create_dataset_object(schema=gt_schema, interactions=raw_test_gt_pandas)

In [17]:
# Spark-backed Dataset objects. Event splits use the full schema together with
# the item features; ground-truth splits use the id-only schema.
train_dataset_spark = create_dataset_object(
    schema=schema,
    interactions=raw_train_events_spark,
    item_features=item_features_spark,
)
validation_dataset_spark = create_dataset_object(
    schema=schema,
    interactions=raw_validation_events_spark,
    item_features=item_features_spark,
)
validation_gt_spark = create_dataset_object(schema=gt_schema, interactions=raw_validation_gt_spark)
test_dataset_spark = create_dataset_object(
    schema=schema,
    interactions=raw_test_events_spark,
    item_features=item_features_spark,
)
test_gt_spark = create_dataset_object(schema=gt_schema, interactions=raw_test_gt_spark)

                                                                                

In [13]:
# Polars-backed Dataset objects. Event splits use the full schema together with
# the item features; ground-truth splits use the id-only schema.
train_dataset_polars = create_dataset_object(
    schema=schema,
    interactions=raw_train_events_polars,
    item_features=item_features_polars,
)
validation_dataset_polars = create_dataset_object(
    schema=schema,
    interactions=raw_validation_events_polars,
    item_features=item_features_polars,
)
validation_gt_polars = create_dataset_object(schema=gt_schema, interactions=raw_validation_gt_polars)
test_dataset_polars = create_dataset_object(
    schema=schema,
    interactions=raw_test_events_polars,
    item_features=item_features_polars,
)
test_gt_polars = create_dataset_object(schema=gt_schema, interactions=raw_test_gt_polars)

In [11]:
# Name of the tensor feature that holds the item-id sequence.
ITEM_FEATURE_NAME = "item_id_seq"

# Tensor schema with a single feature: a sequential, categorical feature that is
# sourced from the item-id column of the interactions and hinted as the item id.
# The column name is taken from the pandas training dataset's feature schema.
tensor_schema = TensorSchema(
    TensorFeatureInfo(
        name=ITEM_FEATURE_NAME,
        is_seq=True,
        feature_type=FeatureType.CATEGORICAL,
        feature_sources=[TensorFeatureSource(FeatureSource.INTERACTIONS, train_dataset_pandas.feature_schema.item_id_column)],
        feature_hint=FeatureHint.ITEM_ID,
    )
)

## Tokenizing comparison

Comparison of tokenizer fitting

In [None]:
%%timeit -n 2 -r 3

tokenizer = SequenceTokenizer(tensor_schema, allow_collect_to_master=True)
tokenizer.fit(train_dataset_pandas)

269 ms ± 18.8 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)

In [None]:
%%timeit -n 2 -r 3

tokenizer = SequenceTokenizer(tensor_schema, allow_collect_to_master=True)
tokenizer.fit(train_dataset_spark)

5.39 s ± 425 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)

In [None]:
%%timeit -n 2 -r 3

tokenizer = SequenceTokenizer(tensor_schema, allow_collect_to_master=True)
tokenizer.fit(train_dataset_polars)

180 ms ± 40.7 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)

In [None]:
# Three identically configured tokenizers, one per backend.
tokenizer_pandas, tokenizer_spark, tokenizer_polars = (
    SequenceTokenizer(tensor_schema, allow_collect_to_master=True) for _ in range(3)
)

# Fit each tokenizer on the training dataset of its own backend.
tokenizer_pandas.fit(train_dataset_pandas)
tokenizer_spark.fit(train_dataset_spark)
tokenizer_polars.fit(train_dataset_polars)

Comparison of dataset transformation with a fitted tokenizer

In [None]:
%%timeit -n 2 -r 3

tokenizer_pandas.transform(train_dataset_pandas)

24 s ± 87.1 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)

In [None]:
%%timeit -n 2 -r 3

tokenizer_spark.transform(train_dataset_spark)

41.2 s ± 2.56 s per loop (mean ± std. dev. of 3 runs, 2 loops each)

In [None]:
%%timeit -n 2 -r 3

tokenizer_polars.transform(train_dataset_polars)

2.29 s ± 20.2 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)

Since only `PandasSequentialDataset` and `PolarsSequentialDataset` exist, we can stop using Spark frames from this point on

In [13]:
# Tokenize the pandas datasets with the tokenizer fitted on the pandas training data.
sequential_train_dataset = tokenizer_pandas.transform(train_dataset_pandas)
sequential_validation_dataset = tokenizer_pandas.transform(validation_dataset_pandas)
# NOTE(review): the second argument presumably restricts the transform to the
# item-id tensor feature — confirm against SequenceTokenizer.transform.
sequential_validation_gt = tokenizer_pandas.transform(validation_gt_pandas, [tensor_schema.item_id_feature_name])

# NOTE(review): presumably keeps only the query ids present in both datasets — confirm.
sequential_validation_dataset, sequential_validation_gt = SequentialDataset.keep_common_query_ids(
    sequential_validation_dataset, sequential_validation_gt
)

# Encode the raw user ids of the test ground truth with the tokenizer's query
# encoder, then keep only those users in the tokenized test dataset.
test_query_ids = test_gt_pandas.query_ids
test_query_ids_np = tokenizer_pandas.query_id_encoder.transform(test_query_ids)["user_id"].values
sequential_test_dataset = tokenizer_pandas.transform(test_dataset_pandas).filter_by_query_id(test_query_ids_np)

In [14]:
# Sequence length passed to the model (max_seq_len) and to the torch datasets (max_sequence_length).
MAX_SEQ_LEN = 200
# Batch size for the dataloaders.
BATCH_SIZE = 512
# num_workers value for the dataloaders.
NUM_WORKERS = 4

We do not want to spend time on full model training here, so the model is trained for only 1 epoch

In [15]:
# SasRec model; the same MAX_SEQ_LEN is passed to the torch datasets below.
model = SasRec(
    tensor_schema,
    block_count=2,
    head_count=2,
    max_seq_len=MAX_SEQ_LEN,
    hidden_size=300,
    dropout_rate=0.5,
    optimizer_factory=FatOptimizerFactory(learning_rate=0.001),
)

# Reports map / ndcg / recall at the listed cut-offs for the validation loader.
# NOTE(review): RemoveSeenItems presumably excludes items already present in the
# validation sequences from the predictions — confirm against its docs.
validation_metrics_callback = ValidationMetricsCallback(
    metrics=["map", "ndcg", "recall"],
    ks=[1, 5, 10, 20],
    item_count=train_dataset_pandas.item_count,
    postprocessors=[RemoveSeenItems(sequential_validation_dataset)]
)

# A single epoch is enough here: the notebook compares data-format performance,
# not model quality.
trainer = L.Trainer(
    max_epochs=1,
    callbacks=[validation_metrics_callback],
    logger=False,
    log_every_n_steps=1000,
)

train_dataloader = DataLoader(
    dataset=SasRecTrainingDataset(
        sequential_train_dataset,
        max_sequence_length=MAX_SEQ_LEN,
    ),
    batch_size=BATCH_SIZE,
    shuffle=True,
    # Use the shared constant instead of a literal so that all dataloaders stay in sync.
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

validation_dataloader = DataLoader(
    dataset=SasRecValidationDataset(
        sequential_validation_dataset,
        sequential_validation_gt,
        sequential_train_dataset,
        max_sequence_length=MAX_SEQ_LEN,
    ),
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [16]:
# Train for the single configured epoch; the validation loader feeds the
# metrics callback registered on the trainer.
trainer.fit(
    model,
    train_dataloaders=train_dataloader,
    val_dataloaders=validation_dataloader,
)

You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type             | Params
--------------------------------------------
0 | _model | SasRecModel      | 9.3 M 
1 | _loss  | CrossEntropyLoss | 0     
--------------------------------------------
9.3 M     Trainable params
0         Non-trainable params
9.3 M     Total params
37.321    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

k              1        10        20         5
map     0.032406  0.063699  0.068970  0.056406
ndcg    0.032406  0.085727  0.105130  0.067868
recall  0.032406  0.158535  0.235694  0.102901



`Trainer.fit` stopped: `max_epochs=1` reached.


## Inference callbacks comparison

Let's launch the inference of the model. 

Please note that no time is measured at this stage, because here the results are only collected inside the callbacks in `torch.Tensor` format.

Conversion to dataframes starts only when we call the `get_result` method of each callback

In [None]:
# Cut-offs used by the metrics; the callbacks keep max(TOPK) items per user.
TOPK = [1, 10, 20, 100]
postprocessors = [RemoveSeenItems(sequential_test_dataset)]

prediction_dataloader = DataLoader(
    dataset=SasRecPredictionDataset(sequential_test_dataset, max_sequence_length=MAX_SEQ_LEN),
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

# Settings shared by all three prediction callbacks; only the Spark one needs
# an extra argument (the session).
_callback_options = dict(
    top_k=max(TOPK),
    query_column="user_id",
    item_column="item_id",
    rating_column="score",
    postprocessors=postprocessors,
)

spark_prediction_callback = SparkPredictionCallback(spark_session=spark_session, **_callback_options)
pandas_prediction_callback = PandasPredictionCallback(**_callback_options)
polars_prediction_callback = PolarsPredictionCallback(**_callback_options)

# One predict pass feeds all three callbacks, so each of them holds the same
# raw predictions before its backend-specific conversion.
trainer = L.Trainer(
    callbacks=[spark_prediction_callback, pandas_prediction_callback, polars_prediction_callback],
    inference_mode=True,
)

trainer.predict(model, dataloaders=prediction_dataloader, return_predictions=False)

In [None]:
%%timeit -n 2 -r 3

pandas_prediction_callback.get_result()

3.82 s ± 15.5 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)

In [None]:
%%timeit -n 2 -r 3

spark_prediction_callback.get_result()

13.4 s ± 312 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)

In [None]:
%%timeit -n 2 -r 3

polars_prediction_callback.get_result()

777 ms ± 27.2 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)

## Metrics comparison

In [39]:
# Materialise the predictions collected by each callback as a dataframe of its own backend.
result_spark = spark_prediction_callback.get_result()
result_pandas = pandas_prediction_callback.get_result()
result_polars = polars_prediction_callback.get_result()

# Map the encoded user/item ids back to raw ids, decoding every result with the
# tokenizer of the matching backend (the Polars result was previously decoded
# with the Spark tokenizer).
# NOTE(review): the predictions come from datasets encoded with tokenizer_pandas,
# so decoding with another tokenizer is only correct if all three tokenizers
# learned identical id mappings — confirm, or decode everything with tokenizer_pandas.
recommendations_spark = tokenizer_spark.query_and_item_id_encoder.inverse_transform(result_spark)
recommendations_pandas = tokenizer_pandas.query_and_item_id_encoder.inverse_transform(result_pandas)
recommendations_polars = tokenizer_polars.query_and_item_id_encoder.inverse_transform(result_polars)
# Polars is sensitive to different dtypes, so we need to match them in user column
raw_test_gt_polars = raw_test_gt_polars.with_columns(
    pl.col("user_id").cast(recommendations_polars.get_column("user_id").dtype)
)

In [35]:
# Column names shared by the recommendation and ground-truth frames.
init_args = dict(query_column="user_id", item_column="item_id", rating_column="score")

# Every metric is evaluated at all TOPK cut-offs.
result_metrics = OfflineMetrics(
    [metric_cls(TOPK) for metric_cls in (Recall, Precision, MAP, NDCG, MRR, HitRate)],
    **init_args,
)

In [None]:
%%timeit -n 2 -r 3

result_metrics(recommendations_pandas, raw_test_gt_pandas)

3min 28s ± 15.7 s per loop (mean ± std. dev. of 3 runs, 2 loops each)

In [None]:
%%timeit -n 2 -r 3

result_metrics(recommendations_spark, raw_test_gt_spark)

4min 32s ± 3.86 s per loop (mean ± std. dev. of 3 runs, 2 loops each)

In [None]:
%%timeit -n 2 -r 3

result_metrics(recommendations_polars, raw_test_gt_polars)

14.5 s ± 35.3 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)