# Performance comparison of pipeline stages depending on the input data format (PySpark, Pandas, Polars)

Note that this example is designed only for comparison purposes; for more detailed information about the pipeline steps, see `09_sasrec_example.ipynb` and `10_bert4rec_example.ipynb`.

Example was executed on CPU

In [1]:
import lightning as L
from torch.utils.data import DataLoader

from replay.metrics import OfflineMetrics, Recall, Precision, MAP, NDCG, HitRate, MRR
from replay.splitters import LastNSplitter
from replay.utils import get_spark_session, DataFrameLike
from replay.data import (
    FeatureHint,
    FeatureInfo,
    FeatureSchema,
    FeatureSource,
    FeatureType,
    Dataset,
)
from replay.models.nn.optimizer_utils import FatOptimizerFactory
from replay.models.nn.sequential.callbacks import (
    ValidationMetricsCallback,
    SparkPredictionCallback,
    PandasPredictionCallback, 
    PolarsPredictionCallback,
)
from replay.models.nn.sequential.postprocessors import RemoveSeenItems
from replay.data.nn import (
    SequenceTokenizer,
    SequentialDataset,
    TensorFeatureSource,
    TensorSchema,
    TensorFeatureInfo
)
from replay.models.nn.sequential import SasRec
from replay.models.nn.sequential.sasrec import (
    SasRecPredictionDataset,
    SasRecTrainingDataset,
    SasRecValidationDataset,
)

import pandas as pd
import polars as pl

## Observe and load data

In [None]:
spark_session = get_spark_session()

In [None]:
!pip install rs-datasets

In [4]:
from rs_datasets import MovieLens

# Load the MovieLens "1m" dataset and keep its ratings, users and items
# tables under pandas-suffixed names; they are the source for the Spark and
# Polars copies created later.
movielens = MovieLens("1m")
interactions_pandas = movielens.ratings
user_features_pandas = movielens.users
item_features_pandas = movielens.items

In [5]:
interactions_pandas.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
user_features_pandas.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [7]:
item_features_pandas.head()

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Cast timestamps to int64 and order all interactions chronologically.
interactions_pandas["timestamp"] = interactions_pandas["timestamp"].astype("int64")
interactions_pandas = interactions_pandas.sort_values(by="timestamp")
# Replace the raw timestamp with the 0-based position of each interaction
# within its user's (already sorted) history.
interactions_pandas["timestamp"] = interactions_pandas.groupby("user_id").cumcount()
interactions_pandas

Unnamed: 0,user_id,item_id,rating,timestamp
1000138,6040,858,4,0
1000153,6040,2384,4,1
999873,6040,593,5,2
1000007,6040,1961,4,3
1000192,6040,2019,5,4
...,...,...,...,...
825793,4958,2399,1,446
825438,4958,1407,5,447
825724,4958,3264,4,448
825731,4958,2634,3,449


In [None]:
# Spark copies of the three pandas frames.
interactions_spark = spark_session.createDataFrame(interactions_pandas)
user_features_spark = spark_session.createDataFrame(user_features_pandas)
item_features_spark = spark_session.createDataFrame(item_features_pandas)

# Polars copies of the same three pandas frames.
interactions_polars = pl.from_pandas(interactions_pandas)
user_features_polars = pl.from_pandas(user_features_pandas)
item_features_polars = pl.from_pandas(item_features_pandas)

In [10]:
interactions_spark.show(5)

+-------+-------+------+---------+
|user_id|item_id|rating|timestamp|
+-------+-------+------+---------+
|   6040|    858|     4|        0|
|   6040|   2384|     4|        1|
|   6040|    593|     5|        2|
|   6040|   1961|     4|        3|
|   6040|   2019|     5|        4|
+-------+-------+------+---------+
only showing top 5 rows



In [11]:
interactions_polars.head(5)

user_id,item_id,rating,timestamp
i32,i32,i32,i64
6040,858,4,0
6040,2384,4,1
6040,593,5,2
6040,1961,4,3
6040,2019,5,4


## Splitting comparison

In [12]:
# Single splitter instance reused for all three backends; N=1 with user_id
# as both the divide and the query column.
# NOTE(review): presumably this holds out each user's last interaction —
# confirm against the LastNSplitter documentation.
splitter = LastNSplitter(
    N=1,
    divide_column="user_id",
    query_column="user_id",
    strategy="interactions",
)

In [None]:
%%timeit -n 10 -r 7

splitter.split(interactions_pandas)

547 ms ± 3.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

We reduce the number of loops and runs for Spark because it caches data between executions

In [None]:
%%timeit -n 1 -r 1

splitter.split(interactions_spark)

203 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [None]:
%%timeit -n 10 -r 7

splitter.split(interactions_polars)

61 ms ± 1.03 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [14]:
# For each backend: the first split of the full interactions yields the test
# events and the test ground truth; splitting the test events again yields
# the validation events and the validation ground truth. The training events
# are the very same object as the validation events.

# Pandas
raw_test_events_pandas, raw_test_gt_pandas = splitter.split(interactions_pandas)
raw_validation_events_pandas, raw_validation_gt_pandas = splitter.split(raw_test_events_pandas)
raw_train_events_pandas = raw_validation_events_pandas

# Spark
raw_test_events_spark, raw_test_gt_spark = splitter.split(interactions_spark)
raw_validation_events_spark, raw_validation_gt_spark = splitter.split(raw_test_events_spark)
raw_train_events_spark = raw_validation_events_spark

# Polars
raw_test_events_polars, raw_test_gt_polars = splitter.split(interactions_polars)
raw_validation_events_polars, raw_validation_gt_polars = splitter.split(raw_test_events_polars)
raw_train_events_polars = raw_validation_events_polars

## Creating Dataset objects

In [15]:
def prepare_feature_schema(is_ground_truth: bool) -> FeatureSchema:
    """Build the feature schema used to wrap interaction frames.

    The schema always contains the categorical ``user_id`` (query id) and
    ``item_id`` (item id) columns. Unless ``is_ground_truth`` is true, a
    numerical ``timestamp`` column is appended as well.

    :param is_ground_truth: when true, return only the two id features.
    :return: the assembled ``FeatureSchema``.
    """
    id_features = [
        FeatureInfo(
            column="user_id",
            feature_hint=FeatureHint.QUERY_ID,
            feature_type=FeatureType.CATEGORICAL,
        ),
        FeatureInfo(
            column="item_id",
            feature_hint=FeatureHint.ITEM_ID,
            feature_type=FeatureType.CATEGORICAL,
        ),
    ]
    feature_schema = FeatureSchema(id_features)

    if not is_ground_truth:
        # Non-ground-truth data additionally carries the timestamp column.
        timestamp_feature = FeatureInfo(
            column="timestamp",
            feature_type=FeatureType.NUMERICAL,
            feature_hint=FeatureHint.TIMESTAMP,
        )
        feature_schema = feature_schema + FeatureSchema([timestamp_feature])

    return feature_schema


def create_dataset_object(
    schema: FeatureSchema,
    interactions: DataFrameLike,
    query_features: DataFrameLike = None,
    item_features: DataFrameLike = None,
):
    """Wrap the given frames into a ``Dataset``.

    The dataset is always created with ``check_consistency=True`` and
    ``categorical_encoded=False``.

    :param schema: feature schema describing the interaction columns.
    :param interactions: interactions frame.
    :param query_features: optional query (user) features frame.
    :param item_features: optional item features frame.
    :return: the constructed ``Dataset``.
    """
    dataset_kwargs = {
        "feature_schema": schema,
        "interactions": interactions,
        "query_features": query_features,
        "item_features": item_features,
        "check_consistency": True,
        "categorical_encoded": False,
    }
    return Dataset(**dataset_kwargs)

In [16]:
schema = prepare_feature_schema(is_ground_truth=False)
gt_schema = prepare_feature_schema(is_ground_truth=True)

In [17]:
# Wrap the pandas splits into Dataset objects. Event datasets use the full
# schema plus the user/item feature frames; ground-truth datasets use
# gt_schema and carry no feature frames.
train_dataset_pandas = create_dataset_object(
    schema, raw_train_events_pandas, user_features_pandas, item_features_pandas
)
validation_dataset_pandas = create_dataset_object(
    schema, raw_validation_events_pandas, user_features_pandas, item_features_pandas
)
validation_gt_pandas = create_dataset_object(
    gt_schema, raw_validation_gt_pandas
)
test_dataset_pandas = create_dataset_object(
    schema, raw_test_events_pandas, user_features_pandas, item_features_pandas
)
test_gt_pandas = create_dataset_object(
    gt_schema, raw_test_gt_pandas
)

In [18]:
# Wrap the Spark splits into Dataset objects. Event datasets use the full
# schema plus the user/item feature frames; ground-truth datasets use
# gt_schema and carry no feature frames.
train_dataset_spark = create_dataset_object(
    schema, raw_train_events_spark, user_features_spark, item_features_spark
)
validation_dataset_spark = create_dataset_object(
    schema, raw_validation_events_spark, user_features_spark, item_features_spark
)
validation_gt_spark = create_dataset_object(
    gt_schema, raw_validation_gt_spark
)
test_dataset_spark = create_dataset_object(
    schema, raw_test_events_spark, user_features_spark, item_features_spark
)
test_gt_spark = create_dataset_object(
    gt_schema, raw_test_gt_spark
)

                                                                                

In [19]:
# Wrap the Polars splits into Dataset objects. Event datasets use the full
# schema plus the user/item feature frames; ground-truth datasets use
# gt_schema and carry no feature frames.
train_dataset_polars = create_dataset_object(
    schema, raw_train_events_polars, user_features_polars, item_features_polars
)
validation_dataset_polars = create_dataset_object(
    schema, raw_validation_events_polars, user_features_polars, item_features_polars
)
validation_gt_polars = create_dataset_object(
    gt_schema, raw_validation_gt_polars
)
test_dataset_polars = create_dataset_object(
    schema, raw_test_events_polars, user_features_polars, item_features_polars
)
test_gt_polars = create_dataset_object(
    gt_schema, raw_test_gt_polars
)

In [20]:
# Name of the single sequential tensor feature fed to the model.
ITEM_FEATURE_NAME = "item_id_seq"

# Tensor schema with one sequential, categorical item-id feature sourced from
# the item id column of the interactions. Cardinality and the column name are
# read from the Polars training dataset; the same schema is then used for the
# tokenizers of every backend.
tensor_schema = TensorSchema(
    TensorFeatureInfo(
        name=ITEM_FEATURE_NAME,
        is_seq=True,
        cardinality=train_dataset_polars.item_count,
        feature_type=FeatureType.CATEGORICAL,
        feature_sources=[TensorFeatureSource(FeatureSource.INTERACTIONS, train_dataset_polars.feature_schema.item_id_column)],
        feature_hint=FeatureHint.ITEM_ID,
    )
)

## Tokenizing comparison

In [21]:
tokenizer = SequenceTokenizer(tensor_schema, allow_collect_to_master=True)

Fit tokenizer comparison

In [None]:
%%timeit -n 10 -r 7

tokenizer.fit(train_dataset_pandas)

4.72 ms ± 56.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [None]:
%%timeit -n 10 -r 7

tokenizer.fit(train_dataset_spark)

792 ms ± 90.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [None]:
%%timeit -n 10 -r 7

tokenizer.fit(train_dataset_polars)

4.03 ms ± 261 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [None]:
# One tokenizer per backend, all built from the same tensor schema ...
tokenizer_pandas = SequenceTokenizer(tensor_schema, allow_collect_to_master=True)
tokenizer_spark = SequenceTokenizer(tensor_schema, allow_collect_to_master=True)
tokenizer_polars = SequenceTokenizer(tensor_schema, allow_collect_to_master=True)

# ... each fitted on the training dataset of its own backend, so the
# transform benchmarks below run on already-fitted tokenizers.
tokenizer_pandas.fit(train_dataset_pandas)
tokenizer_spark.fit(train_dataset_spark)
tokenizer_polars.fit(train_dataset_polars)

Transform dataset with fitted tokenizer comparison

In [None]:
%%timeit -n 10 -r 7

tokenizer_pandas.transform(train_dataset_pandas)

675 ms ± 1.83 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [None]:
%%timeit -n 10 -r 7

tokenizer_spark.transform(train_dataset_spark)

2.28 s ± 116 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [None]:
%%timeit -n 10 -r 7

tokenizer_polars.transform(train_dataset_polars)

84.6 ms ± 512 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Since there are only `PandasSequentialDataset` and `PolarsSequentialDataset` implementations, we can drop the Spark frames from this point on

In [23]:
# Tokenize the pandas train/validation datasets; the validation ground truth
# is transformed with only the item id feature name passed as second argument.
sequential_train_dataset_pandas = tokenizer_pandas.transform(train_dataset_pandas)
sequential_validation_dataset_pandas = tokenizer_pandas.transform(validation_dataset_pandas)
sequential_validation_gt_pandas = tokenizer_pandas.transform(validation_gt_pandas, [tensor_schema.item_id_feature_name])

# Align the validation dataset and its ground truth on their common query ids.
sequential_validation_dataset_pandas, sequential_validation_gt_pandas = SequentialDataset.keep_common_query_ids(
    sequential_validation_dataset_pandas, sequential_validation_gt_pandas
)

# Encode the query ids of the test ground truth with the fitted query id
# encoder and restrict the tokenized test dataset to those ids.
test_query_ids_pandas = test_gt_pandas.query_ids
test_query_ids_np_pandas = tokenizer_pandas.query_id_encoder.transform(test_query_ids_pandas)["user_id"].values
sequential_test_dataset_pandas = tokenizer_pandas.transform(test_dataset_pandas).filter_by_query_id(test_query_ids_np_pandas)

In [24]:
# Tokenize the Polars train/validation datasets; the validation ground truth
# is transformed with only the item id feature name passed as second argument.
sequential_train_dataset_polars = tokenizer_polars.transform(train_dataset_polars)
sequential_validation_dataset_polars = tokenizer_polars.transform(validation_dataset_polars)
sequential_validation_gt_polars = tokenizer_polars.transform(validation_gt_polars, [tensor_schema.item_id_feature_name])

# Align the validation dataset and its ground truth on their common query ids.
sequential_validation_dataset_polars, sequential_validation_gt_polars = SequentialDataset.keep_common_query_ids(
    sequential_validation_dataset_polars, sequential_validation_gt_polars
)

# Encode the query ids of the test ground truth with the fitted query id
# encoder and restrict the tokenized test dataset to those ids.
test_query_ids_polars = test_gt_polars.query_ids
test_query_ids_np_polars = tokenizer_polars.query_id_encoder.transform(test_query_ids_polars)["user_id"].to_numpy()
sequential_test_dataset_polars = tokenizer_polars.transform(test_dataset_polars).filter_by_query_id(test_query_ids_np_polars)

## Training model comparison

In [25]:
MAX_SEQ_LEN = 200
BATCH_SIZE = 512
NUM_WORKERS = 4

Pandas

In [None]:
# Pandas-backed training setup: model, validation callback, trainer and the
# train/validation dataloaders consumed by the timed `trainer.fit` cell.
model_pandas = SasRec(
    tensor_schema,
    block_count=2,
    head_count=2,
    max_seq_len=MAX_SEQ_LEN,
    hidden_size=300,
    dropout_rate=0.5,
    optimizer_factory=FatOptimizerFactory(learning_rate=0.001),
)

# Validation metrics are computed after filtering with RemoveSeenItems built
# from the validation sequences.
validation_metrics_callback = ValidationMetricsCallback(
    metrics=["map", "ndcg", "recall"],
    ks=[1, 5, 10, 20],
    item_count=train_dataset_pandas.item_count,
    postprocessors=[RemoveSeenItems(sequential_validation_dataset_pandas)]
)

trainer = L.Trainer(
    max_epochs=1,
    callbacks=[validation_metrics_callback],
)

train_dataloader = DataLoader(
    dataset=SasRecTrainingDataset(
        sequential_train_dataset_pandas,
        max_sequence_length=MAX_SEQ_LEN,
    ),
    batch_size=BATCH_SIZE,
    shuffle=True,
    # Use the shared constant (same value as the former literal 4) so every
    # loader in the comparison is configured from one place.
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

validation_dataloader = DataLoader(
    dataset=SasRecValidationDataset(
        sequential_validation_dataset_pandas,
        sequential_validation_gt_pandas,
        sequential_train_dataset_pandas,
        max_sequence_length=MAX_SEQ_LEN,
    ),
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

In [None]:
%%timeit -n 2 -r 3

trainer.fit(
    model_pandas,
    train_dataloaders=train_dataloader,
    val_dataloaders=validation_dataloader,
)

45.3 s ± 18.9 s per loop (mean ± std. dev. of 3 runs, 2 loops each)

Polars

In [None]:
# Polars-backed training setup: model, validation callback, trainer and the
# train/validation dataloaders consumed by the timed `trainer.fit` cell.
# Note that `validation_metrics_callback`, `trainer` and both dataloader
# names are rebound here, replacing the objects created for the Pandas run.
model_polars = SasRec(
    tensor_schema,
    block_count=2,
    head_count=2,
    max_seq_len=MAX_SEQ_LEN,
    hidden_size=300,
    dropout_rate=0.5,
    optimizer_factory=FatOptimizerFactory(learning_rate=0.001),
)

# Validation metrics are computed after filtering with RemoveSeenItems built
# from the validation sequences.
validation_metrics_callback = ValidationMetricsCallback(
    metrics=["map", "ndcg", "recall"],
    ks=[1, 5, 10, 20],
    item_count=train_dataset_polars.item_count,
    postprocessors=[RemoveSeenItems(sequential_validation_dataset_polars)]
)

trainer = L.Trainer(
    max_epochs=1,
    callbacks=[validation_metrics_callback],
)

train_dataloader = DataLoader(
    dataset=SasRecTrainingDataset(
        sequential_train_dataset_polars,
        max_sequence_length=MAX_SEQ_LEN,
    ),
    batch_size=BATCH_SIZE,
    shuffle=True,
    # Use the shared constant (same value as the former literal 4) so every
    # loader in the comparison is configured from one place.
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

validation_dataloader = DataLoader(
    dataset=SasRecValidationDataset(
        sequential_validation_dataset_polars,
        sequential_validation_gt_polars,
        sequential_train_dataset_polars,
        max_sequence_length=MAX_SEQ_LEN,
    ),
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

In [None]:
%%timeit -n 2 -r 3

trainer.fit(
    model_polars,
    train_dataloaders=train_dataloader,
    val_dataloaders=validation_dataloader,
)

45.3 s ± 19 s per loop (mean ± std. dev. of 3 runs, 2 loops each)

## Inference comparison

In [30]:
prediction_dataloader_pandas = DataLoader(
    dataset=SasRecPredictionDataset(
        sequential_test_dataset_pandas,
        max_sequence_length=MAX_SEQ_LEN,
    ),
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

prediction_dataloader_pandas = DataLoader(
    dataset=SasRecPredictionDataset(
        sequential_test_dataset_polars,
        max_sequence_length=MAX_SEQ_LEN,
    ),
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

In [31]:
TOPK = [1, 10, 20, 100]

Pandas

In [None]:
postprocessors = [RemoveSeenItems(sequential_test_dataset_pandas)]
trainer = L.Trainer(inference_mode=True)

In [None]:
%%timeit -n 2 -r 3

trainer.predict(model_pandas, dataloaders=prediction_dataloader_pandas, return_predictions=False)

39.8 s ± 133 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)

Polars

In [None]:
postprocessors = [RemoveSeenItems(sequential_test_dataset_polars)]
trainer = L.Trainer(inference_mode=True)

In [None]:
%%timeit -n 2 -r 3

trainer.predict(model_polars, dataloaders=prediction_dataloader_pandas, return_predictions=False)

39.7 s ± 94 ms per loop (mean ± std. dev. of 3 runs, 2 loops each)

## Callbacks comparison

In [None]:
# Seen-items filter built from the pandas test sequences; the same list is
# handed to all three callbacks below.
postprocessors = [RemoveSeenItems(sequential_test_dataset_pandas)]

# One prediction callback per output format, all configured with the same
# top-k (the largest cutoff in TOPK) and the same column names.
spark_prediction_callback = SparkPredictionCallback(
    spark_session=spark_session,
    top_k=max(TOPK),
    query_column="user_id",
    item_column="item_id",
    rating_column="score",
    postprocessors=postprocessors,
)

pandas_prediction_callback = PandasPredictionCallback(
    top_k=max(TOPK),
    query_column="user_id",
    item_column="item_id",
    rating_column="score",
    postprocessors=postprocessors,
)

polars_prediction_callback = PolarsPredictionCallback(
    top_k=max(TOPK),
    query_column="user_id",
    item_column="item_id",
    rating_column="score",
    postprocessors=postprocessors,
)

# A single predict pass with all three callbacks attached; the following
# cells then time only each callback's get_result().
trainer = L.Trainer(
    callbacks=[
        spark_prediction_callback,
        pandas_prediction_callback,
        polars_prediction_callback
    ], 
    inference_mode=True
)

trainer.predict(model_pandas, dataloaders=prediction_dataloader_pandas, return_predictions=False)

In [None]:
%%timeit -n 10 -r 7

pandas_prediction_callback.get_result()

125 ms ± 493 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [None]:
%%timeit -n 10 -r 7

spark_prediction_callback.get_result()

461 ms ± 4.79 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [None]:
%%timeit -n 10 -r 7

polars_prediction_callback.get_result()

15.7 ms ± 2.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

## Metrics comparison

In [40]:
# Take the pandas callback output and pass it through the tokenizer's
# query-and-item id encoder in the inverse direction.
# NOTE(review): presumably this restores the original user/item ids — confirm.
result = pandas_prediction_callback.get_result()
recommendations = tokenizer_pandas.query_and_item_id_encoder.inverse_transform(result)

In [42]:
recommendations_spark = spark_session.createDataFrame(recommendations)

Polars is sensitive to dtype mismatches, so we need to cast the user column of the ground truth to the dtype used in the recommendations

In [46]:
# Polars copy of the pandas recommendations frame.
recommendations_polars = pl.from_pandas(recommendations)
# Cast the ground-truth user_id column to the dtype of the recommendations'
# user_id column.
raw_test_gt_polars = raw_test_gt_polars.with_columns(
    pl.col("user_id").cast(recommendations_polars.get_column("user_id").dtype)
)

In [47]:
# Column-name keyword arguments forwarded to OfflineMetrics.
init_args = {"query_column": "user_id", "rating_column": "score"}
# Metric bundle with every metric parameterized by the TOPK cutoffs; the same
# object is called on the pandas, Spark and Polars frames in the timed cells.
result_metrics = OfflineMetrics(
    [Recall(TOPK), Precision(TOPK), MAP(TOPK), NDCG(TOPK), MRR(TOPK), HitRate(TOPK)], **init_args
)

In [None]:
%%timeit -n 10 -r 7

result_metrics(recommendations, raw_test_gt_pandas)

4.57 s ± 15.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [None]:
%%timeit -n 10 -r 7

result_metrics(recommendations_spark, raw_test_gt_spark)

8.9 s ± 363 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [None]:
%%timeit -n 10 -r 7

result_metrics(recommendations_polars, raw_test_gt_polars)

525 ms ± 9.99 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)