see paper: https://wuch15.github.io/paper/EMNLP2019-NRMS.pdf

# Getting started

## Load functionality

In [1]:
from transformers import AutoTokenizer, AutoModel # huggingface transformers
from pathlib import Path
import tensorflow as tf
import polars as pl

"""
preprocessing
"""

from ebrec.utils._constants import ( # column names as constants
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_SUBTITLE_COL,
    DEFAULT_LABELS_COL,
    DEFAULT_TITLE_COL,
    DEFAULT_USER_COL,
)
from ebrec.utils._behaviors import (
    create_binary_labels_column, # add a binary labels column to the dataframe
    sampling_strategy_wu2019, # sampling strategy from NPA paper
    add_known_user_column, # add a column to the dataframe saying whether user has been seen before
    add_prediction_scores, # add a column to the dataframe with the prediction scores
    truncate_history, # sort by timestamp and truncate the history to the last n articles
)
from ebrec.utils._articles import convert_text2encoding_with_transformers # tokenize text with transformers
from ebrec.utils._polars import concat_str_columns, slice_join_dataframes # merge two columns, merge two dataframes
from ebrec.utils._articles import create_article_id_to_value_mapping # add row id (because it isn't the default in polars)
from ebrec.utils._nlp import get_transformers_word_embeddings # turn tokens into word embeddings


"""
nrms model
"""

from ebrec.models.newsrec.dataloader import NRMSDataLoader # load news rec data
from ebrec.models.newsrec.model_config import hparams_nrms # class to globally store hyperparameters as constants
from ebrec.models.newsrec import NRMSModel # the model itself

## Load dataset

In [2]:
import os


# Dataset root: resolved relative to the *parent* of the current working directory,
# so the notebook has to be started from a directory that is a sibling of `data-merged/`.
data_base = Path(os.getcwd()).parent / "data-merged" / "merged"
# Choose exactly one train/validation bundle by (un)commenting the lines below.
# train_val_base = data_base / "1-ebnerd_demo_(20MB)"
train_val_base = data_base / "2-ebnerd_small_(80MB)"
# train_val_base = data_base / "3-ebnerd_large_(3.0GB)"
test_base = data_base / "5-ebnerd_testset_(1.5GB)"
# Fail fast if either dataset directory is missing.
assert train_val_base.exists() and test_base.exists()


"""
load user history
"""

# All frames below are created with `pl.scan_parquet`, i.e. they are lazy and are only
# materialised when `.collect()` is called on them later.
# Note the naming: `train_behaviors` is plural while `val_behavior` / `test_behavior` are singular.
train_behaviors = pl.scan_parquet(train_val_base / "train" / "behaviors.parquet")
train_history = pl.scan_parquet(train_val_base / "train" / "history.parquet")

val_behavior = pl.scan_parquet(train_val_base / "validation" / "behaviors.parquet")
val_history = pl.scan_parquet(train_val_base / "validation" / "history.parquet")

test_behavior = pl.scan_parquet(test_base / "test" / "behaviors.parquet")
test_history = pl.scan_parquet(test_base / "test" / "history.parquet")


"""
load article content
"""

# Train and validation share one articles file, so `val_articles` is just another name
# for the same LazyFrame; the test set ships its own articles file.
train_articles: pl.LazyFrame = pl.scan_parquet(train_val_base / "articles.parquet")
val_articles: pl.LazyFrame = train_articles
test_articles: pl.LazyFrame = pl.scan_parquet(test_base / "articles.parquet")

# Article embedding tables from the dataset bundle. None of these frames is referenced
# again in the visible part of the notebook.
articles_word2vec: pl.LazyFrame = pl.scan_parquet(data_base / "7-Ekstra-Bladet-word2vec_(133MB)" / "document_vector.parquet")
articles_image_embeddings: pl.LazyFrame = pl.scan_parquet(data_base / "8-Ekstra_Bladet_image_embeddings_(372MB)" / "image_embeddings.parquet")
articles_contrastive_vector: pl.LazyFrame = pl.scan_parquet(data_base / "9-Ekstra-Bladet-contrastive_vector_(341MB)" / "contrastive_vector.parquet")
articles_bert_base_multilingual_cased: pl.LazyFrame = pl.scan_parquet(data_base / "10-google-bert-base-multilingual-cased_(344MB)" / "bert_base_multilingual_cased.parquet")
articles_xlm_roberta_base: pl.LazyFrame = pl.scan_parquet(data_base / "11-FacebookAI-xlm-roberta-base_(341MB)" / "xlm_roberta_base.parquet")

In [3]:
def ebnerd_from_path(history: pl.LazyFrame, behaviors: pl.LazyFrame, history_size: int = 30) -> pl.DataFrame:
    """Attach each user's truncated click history to the behaviors table.

    Despite the name, this function takes already-scanned LazyFrames, not paths.

    Args:
        history: Lazy history table; only the user and history-article-id
            columns are used.
        behaviors: Lazy behaviors (impressions) table; collected in full.
        history_size: Length passed to ``truncate_history``; the call pads
            with ``0`` (``padding_value=0``).

    Returns:
        The collected behaviors frame left-joined with the truncated history
        on the user column, so behaviors rows without a history are kept.
    """
    # Still lazy at this point: keep the two needed columns and truncate/pad the history.
    user_histories = truncate_history(
        history.select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL),
        column=DEFAULT_HISTORY_ARTICLE_ID_COL,
        history_size=history_size,
        padding_value=0,
    )
    # Materialise the behaviors first, then the histories, and join them.
    impressions = behaviors.collect()
    return slice_join_dataframes(
        impressions,
        df2=user_histories.collect(),
        on=DEFAULT_USER_COL,
        how="left",
    )


### Generate labels
We sample a few rows just to get started. For the test set we just make up a dummy column with 0 and 1 - these are not the true labels.

In [4]:
"""
preprocessing: truncate user history, select subset of columns, join on behavior, sample based on Wu2019, add binary labels column
"""

COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
]
HISTORY_SIZE = 30
N_SAMPLES = 100
df_train = (
    ebnerd_from_path(history=train_history, behaviors=train_behaviors, history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,
        shuffle=True,
        with_replacement=True,
        seed=123,
    )
    .pipe(create_binary_labels_column)
    .sample(n=N_SAMPLES)
)
print("done: train")

df_validation = (
    ebnerd_from_path(history=val_history, behaviors=val_behavior, history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(n=N_SAMPLES)
)
print("done: validation")

df_test = (
    ebnerd_from_path(history=test_history, behaviors=val_behavior, history_size=HISTORY_SIZE)
    .with_columns(pl.Series(DEFAULT_CLICKED_ARTICLES_COL, [[]]))
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(n=N_SAMPLES)
)
print("done: test")



done: train
done: validation
done: test


### Look at the difference between Training/Validation and Testset
Note that the test set doesn't include labels, and we have removed some of the other columns.

In [5]:
df_train.head(2)

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,labels
u32,list[i32],list[i64],list[i64],list[i8]
689819,"[9767765, 9768722, … 9770146]","[9773137, 9773137, … 9772355]",[9772635],"[0, 0, … 0]"
2256162,"[9767738, 9768708, … 9769366]","[9778902, 9778902, … 9777156]",[9777156],"[0, 0, … 1]"


In [6]:
df_test.head(2)

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,labels
u32,list[i32],list[i32],list[null],list[i8]
2124286,"[9785019, 9787098, … 9787586]","[9781998, 9784406, … 9782836]",[],"[0, 0, … 0]"
881917,"[9790574, 9790121, … 9776147]","[9766007, 9769994, … 9769459]",[],"[0, 0, … 0]"


## Load articles

In [7]:
# Materialise the train/validation articles table.
# NOTE(review): only `train_articles` is collected; `test_articles` (scanned earlier) is
# never collected in the visible notebook, so anything built from `df_articles` presumably
# does not cover articles that exist only in the test set — confirm this is intended.
df_articles = train_articles.collect()
df_articles.head(2)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3001353,"""Natascha var i…","""Politiet frygt…",2023-06-29 06:20:33,False,"""Sagen om den ø…",2006-08-31 08:06:45,[3150850],"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative"""
3003065,"""Kun Star Wars …","""Biografgængern…",2023-06-29 06:20:35,False,"""Vatikanet har …",2006-05-21 16:57:00,[3006712],"""article_defaul…","""https://ekstra…",[],[],"[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive"""


## Init model using HuggingFace's tokenizer and wordembedding
In the original implementation, the authors use GloVe embeddings and the matching tokenizer. To get going fast, we'll use a multilingual language model from Hugging Face instead:
we use its tokenizer to tokenize the articles and its word embeddings to initialize NRMS.


In [8]:
TRANSFORMER_MODEL_NAME = "bert-base-multilingual-cased"
# Text columns that are concatenated (in this order) before tokenization.
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
# Passed as `max_length` to the tokenization helper; note that it applies to the
# concatenated subtitle + title text, not to the title alone.
MAX_TITLE_LENGTH = 30

# LOAD HUGGINGFACE:
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# We'll init NRMS's word embeddings from the pretrained transformer model.
# (The variable keeps the name `word2vec_embedding` because that is the keyword NRMSModel
# expects; the values come from the transformer, not from word2vec.)
word2vec_embedding = get_transformers_word_embeddings(transformer_model)
# concat columns containing strings; `cat_cal` is the name of the resulting column
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
# convert text to tokens; `token_col_title` is the name of the column holding them
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)
# build the lookup from article id to the token column's value; it is handed to the
# dataloaders as `article_dict`
article_mapping = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)



## Initiate the dataloaders
In the implementations we have disconnected the models and the data. Hence, you should build a dataloader that fits your needs.

In [9]:
# All three loaders share the same article lookup, history column and
# unknown-article setting; they differ only in the behaviors frame, `eval_mode`
# and `batch_size`.
# NOTE(review): `unknown_representation="zeros"` presumably controls how article ids
# missing from `article_mapping` are encoded — confirm in NRMSDataLoader.

# Training loader: eval_mode=False, batches of 64.
train_dataloader = NRMSDataLoader(
    behaviors=df_train,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=False,
    batch_size=64,
)
# Validation loader: eval_mode=True, batches of 32.
val_dataloader = NRMSDataLoader(
    behaviors=df_validation,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=True,
    batch_size=32,
)
# Test loader: same settings as validation, but fed with the placeholder-labelled df_test.
test_dataloader = NRMSDataLoader(
    behaviors=df_test,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=True,
    batch_size=32,
)

## Train the model


In [10]:
MODEL_NAME = "NRMS"
LOG_DIR = f"../runs/{MODEL_NAME}"
MODEL_WEIGHTS = f"../runs/data/state_dict/{MODEL_NAME}/weights"

# CALLBACKS
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=MODEL_WEIGHTS, save_best_only=True, save_weights_only=True, verbose=1
)

hparams_nrms.history_size = HISTORY_SIZE
model = NRMSModel(
    hparams=hparams_nrms,
    word2vec_embedding=word2vec_embedding,
    seed=42,
)
hist = model.model.fit(
    train_dataloader,
    validation_data=val_dataloader,
    epochs=1,
    callbacks=[tensorboard_callback, early_stopping, modelcheckpoint],
)
model.model.load_weights(filepath=MODEL_WEIGHTS)

2024-05-10 22:35:52.729306: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2024-05-10 22:35:52.729376: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-05-10 22:35:52.729385: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-05-10 22:35:52.729527: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-10 22:35:52.730806: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-05-10 22:35:54.774893: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
2024-05-10 22:35:54.880790:



# Example of how to compute some metrics:

In [None]:
pred_validation = model.scorer.predict(val_dataloader)



In [None]:
# Attach the predicted scores to the validation frame and flag users that also
# occur in the training sample.
# NOTE: this rebinds `df_validation`, so the cell is not idempotent — re-running it
# operates on the already-augmented frame rather than on the original one.
df_validation = add_prediction_scores(df_validation, pred_validation.tolist()).pipe(
    add_known_user_column, known_users=df_train[DEFAULT_USER_COL]
)
df_validation.head(2)

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,labels,scores,is_known_user
u32,list[i32],list[i32],list[i32],list[i8],list[f64],bool
909106,"[9779498, 9779737, … 9779747]","[9784591, 9784044, … 9784696]",[9784679],"[0, 0, … 0]","[0.500626, 0.499298, … 0.499624]",False
483450,"[9777190, 9777704, … 9776967]","[9789810, 9446756, … 9428643]",[9790475],"[0, 0, … 0]","[0.49968, 0.500049, … 0.499711]",False


In [None]:
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore

# Evaluate the per-impression score lists against the binary label lists.
# The numbers are computed on the small validation subsample (N_SAMPLES rows), so
# they are only a smoke test of the pipeline, not a meaningful benchmark.
metrics = MetricEvaluator(
    labels=df_validation["labels"].to_list(),
    predictions=df_validation["scores"].to_list(),
    metric_functions=[AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)],
)
metrics.evaluate()

<MetricEvaluator class>: 
 {
    "auc": 0.4684587424610308,
    "mrr": 0.3048188440128843,
    "ndcg@5": 0.3337379229627095,
    "ndcg@10": 0.41919724747438886
}

## Make submission file

In [None]:
# Score the test impressions with the trained model.
# NOTE(review): the recorded output of this cell is a ValueError ("inhomogeneous shape"),
# so `pred_test` is never assigned and no submission file is written; the batches
# produced from `df_test` need to be investigated before this section can work.
pred_test = model.scorer.predict(test_dataloader)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (32,) + inhomogeneous part.

In [None]:
pred_test

NameError: name 'pred_test' is not defined