GRUs are best suited to this task because they can learn the patterns in a sequence of user events (e.g., articles viewed, clicks, read times, scroll percentages) and use them to predict the next item(s) in a session.

The sequence of user events (e.g., articles viewed, clicked, read times, scroll percentages) can be treated as a temporal sequence where each event is represented by its corresponding features.

The GRU can learn the patterns in this sequence to predict the next item(s) in a session, effectively recommending articles that are likely to engage the user based on their past behavior and the content of the articles themselves.

In [11]:
import numpy as np
import polars as pl

import os
from pathlib import Path

# Read in data.
#
# Both dataset directories are resolved relative to the parent of the current
# working directory: <cwd>/../data-merged/merged/<dataset name>.
_merged_root = Path.cwd().parent / "data-merged" / "merged"
train_val_base = _merged_root / "3-ebnerd_large_(3.0GB)"
test_base = _merged_root / "5-ebnerd_testset_(1.5GB)"

# Validate with an explicit exception rather than `assert`: assertions are
# removed when Python runs with -O, and the error should say which directory
# is missing.
_missing_dirs = [str(base) for base in (train_val_base, test_base) if not base.exists()]
if _missing_dirs:
    raise FileNotFoundError(
        "Dataset directory not found: " + ", ".join(_missing_dirs)
    )

# Every frame below is opened with `scan_parquet`, i.e. as a LazyFrame.

# Training split.
train_behaviors: pl.LazyFrame = pl.scan_parquet(train_val_base / "train" / "behaviors.parquet")
train_history: pl.LazyFrame = pl.scan_parquet(train_val_base / "train" / "history.parquet")
train_articles: pl.LazyFrame = pl.scan_parquet(train_val_base / "articles.parquet")

# Validation split. It reads the same articles.parquet as the training split.
val_behaviors: pl.LazyFrame = pl.scan_parquet(train_val_base / "validation" / "behaviors.parquet")
val_history: pl.LazyFrame = pl.scan_parquet(train_val_base / "validation" / "history.parquet")
val_articles: pl.LazyFrame = pl.scan_parquet(train_val_base / "articles.parquet")

# Test split, which lives in its own dataset directory with its own articles file.
test_behaviors: pl.LazyFrame = pl.scan_parquet(test_base / "test" / "behaviors.parquet")
test_history: pl.LazyFrame = pl.scan_parquet(test_base / "test" / "history.parquet")
test_articles: pl.LazyFrame = pl.scan_parquet(test_base / "articles.parquet")

In [15]:
def merge_behavior_history(path: Path, history_size: int = 30) -> pl.DataFrame:
    """Attach each user's article history to the behavior rows of one split.

    Reads ``history.parquet`` and ``behaviors.parquet`` from ``path``. Only the
    user column and the history article-ID column are kept from the history
    file; that frame is passed through ``truncate_history`` and then
    left-joined onto the behaviors on ``DEFAULT_USER_COL``.

    NOTE(review): ``truncate_history``, ``slice_join_dataframes`` and the
    ``DEFAULT_*`` constants are not defined in the visible part of this file;
    they must be imported elsewhere -- confirm.

    Args:
        path: Directory containing ``history.parquet`` and
            ``behaviors.parquet``.
        history_size: Forwarded to ``truncate_history`` as ``history_size``
            (presumably the number of history entries kept per user -- verify
            against the helper).

    Returns:
        The result of ``slice_join_dataframes``: the behaviors joined with the
        processed history.
    """
    # Build the history query lazily, restricted to the two columns we need.
    history_query = pl.scan_parquet(path.joinpath("history.parquet")).select(
        DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL
    )
    # padding_value=0 is handed to the helper as-is (presumably the fill value
    # for histories shorter than `history_size` -- verify against the helper).
    history_query = truncate_history(
        history_query,
        column=DEFAULT_HISTORY_ARTICLE_ID_COL,
        history_size=history_size,
        padding_value=0,
    )

    # The behaviors are materialized first; the history query is collected
    # only when it is passed to the join.
    behaviors = pl.scan_parquet(path.joinpath("behaviors.parquet")).collect()
    return slice_join_dataframes(
        behaviors,
        df2=history_query.collect(),
        on=DEFAULT_USER_COL,
        how="left",
    )