# Getting started

In this notebook, we illustrate how to use the Neural News Recommendation with Multi-Head Self-Attention ([NRMS](https://aclanthology.org/D19-1671/)). The implementation is taken from the [recommenders](https://github.com/recommenders-team/recommenders) repository. We have simply stripped the model to keep it cleaner.

We use a small dataset, which is downloaded from [recsys.eb.dk](https://recsys.eb.dk/). All the datasets are stored in the folder path ```~/ebnerd_data/*```.

## Load functionality

In [1]:
import os
from pathlib import Path

import numpy as np
import polars as pl
import tensorflow as tf
from tensorflow.python.client import device_lib
from transformers import AutoTokenizer, AutoModel

from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_SUBTITLE_COL,
    DEFAULT_LABELS_COL,
    DEFAULT_TITLE_COL,
    DEFAULT_USER_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_ARTICLE_ID_COL,
    DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL, #-------
    DEFAULT_HISTORY_READ_TIME_COL #-------
)

from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history,
)
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
from ebrec.utils._articles import convert_text2encoding_with_transformers
from ebrec.utils._polars import concat_str_columns, slice_join_dataframes
from ebrec.utils._articles import create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._python import write_submission_file, rank_predictions_by_score

from ebrec.models.newsrec.dataloader import NRMSDataLoader
from ebrec.models.newsrec.model_config import hparams_nrms
from ebrec.models.newsrec import NRMSModel, NRMSWrapper

2024-12-02 14:58:22.588786: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-02 14:58:22.592871: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-02 14:58:22.640459: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-02 14:58:22.640491: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-02 14:58:22.640525: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [2]:
# Ask TensorFlow which physical GPUs it can see.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only use the first GPU
  try:
    tf.config.set_visible_devices(gpus[0], 'GPU')
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
  except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized;
    # the error is printed rather than raised so the notebook keeps running.
    print(e)

# Query the physical GPUs again and report them (an empty list means no GPU was found).
physical_devices = tf.config.list_physical_devices('GPU')
print("Available devices:", physical_devices)

Available devices: []


2024-12-02 14:58:25.710630: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2211] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


**TODO:** Run a grid search over the hyperparameters and expose them as arguments.

## Load dataset

In [3]:
def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
    """
    Load one EB-NeRD split and attach each user's history to their impressions.

    Args:
        path: Folder that contains ``history.parquet`` and ``behaviors.parquet``.
        history_size: Forwarded to ``truncate_history`` for the article-id history column.

    Returns:
        The behaviors frame, left-joined on the user column with the selected history columns.
    """
    history_columns = [
        DEFAULT_USER_COL,
        DEFAULT_HISTORY_ARTICLE_ID_COL,
        DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,
        DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL,
        DEFAULT_HISTORY_READ_TIME_COL,
    ]
    # NOTE(review): only the article-id column is handed to truncate_history; the
    # scroll-percentage and read-time lists are presumably left at their original
    # length and may not line up with the truncated ids — confirm before using them.
    lazy_history = truncate_history(
        pl.scan_parquet(path.joinpath("history.parquet")).select(history_columns),
        column=DEFAULT_HISTORY_ARTICLE_ID_COL,
        history_size=history_size,
        padding_value=0,
        enable_warning=False,
    )
    behaviors = pl.scan_parquet(path.joinpath("behaviors.parquet")).collect()
    return slice_join_dataframes(
        behaviors,
        df2=lazy_history.collect(),
        on=DEFAULT_USER_COL,
        how="left",
    )

### Generate labels
We sample a small fraction of the data just to get started. For the test set we simply make up a dummy column of 0s and 1s – these are not the true labels.

In [4]:
# Root folder of the EB-NeRD data. Set the EBNERD_DATA_DIR environment variable to point
# the notebook at another location (e.g. "~/ebnerd_data") without editing this cell;
# when it is unset, the original cluster path is used.
PATH = Path(
    os.environ.get(
        "EBNERD_DATA_DIR",
        "/dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data",
    )
).expanduser()
DATASPLIT = "ebnerd_small"  # REMEMBER: if changed, also change make_embedding_artifacts.ipynb (embeddings)
# DATASPLIT = "ebnerd__testset"
# Folder for dumped artifacts; created up front (including parents) if it does not exist.
DUMP_DIR = PATH.joinpath("dump_artifacts")
DUMP_DIR.mkdir(exist_ok=True, parents=True)

In this example we sample the dataset, just to keep it smaller. The test set can be added in the same way as the validation set.

In [5]:
# Columns kept from the joined behaviors/history frame.
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL,  # new feature
    DEFAULT_HISTORY_READ_TIME_COL,  # new feature
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
]
HISTORY_SIZE = 30
FRACTION = 0.03  # fraction of each split that is kept
# One seed for both the negative sampling and the row sub-sampling, so that
# Restart & Run All reproduces the same train/validation subsets.
SAMPLING_SEED = 123

df_train = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=6,
        shuffle=True,
        with_replacement=True,
        seed=SAMPLING_SEED,
    )
    # NOTE(review): create_binary_labels_column may shuffle internally as well;
    # check whether it accepts a seed if fully deterministic labels are required.
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION, seed=SAMPLING_SEED)
)
# The validation split gets no negative sampling: all in-view articles are kept.
df_validation = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION, seed=SAMPLING_SEED)
)
print(df_train.head(2))
print(df_validation.head(2))

shape: (2, 9)
┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ user_id ┆ article_id ┆ scroll_pe ┆ read_time ┆ … ┆ article_i ┆ impressio ┆ impressio ┆ labels    │
│ ---     ┆ _fixed     ┆ rcentage_ ┆ _fixed    ┆   ┆ ds_clicke ┆ n_id      ┆ n_time    ┆ ---       │
│ u32     ┆ ---        ┆ fixed     ┆ ---       ┆   ┆ d         ┆ ---       ┆ ---       ┆ list[i8]  │
│         ┆ list[i32]  ┆ ---       ┆ list[f32] ┆   ┆ ---       ┆ u32       ┆ datetime[ ┆           │
│         ┆            ┆ list[f32] ┆           ┆   ┆ list[i64] ┆           ┆ μs]       ┆           │
╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1103214 ┆ [9762520,  ┆ [100.0,   ┆ [54.0,    ┆ … ┆ [9774541] ┆ 42346545  ┆ 2023-05-2 ┆ [0, 1, …  │
│         ┆ 9761561, … ┆ 34.0, …   ┆ 7.0, …    ┆   ┆           ┆           ┆ 2         ┆ 0]        │
│         ┆ 9769197]   ┆ 53.0]     ┆ 3.0]      ┆   ┆           ┆           ┆ 

## Load articles

In [6]:
# Article metadata for the selected data split
articles_path = PATH / DATASPLIT / "articles.parquet"
df_articles = pl.read_parquet(articles_path)
df_articles.head(2)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3001353,"""Natascha var i…","""Politiet frygt…",2023-06-29 06:20:33,False,"""Sagen om den ø…",2006-08-31 08:06:45,[3150850],"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative"""
3003065,"""Kun Star Wars …","""Biografgængern…",2023-06-29 06:20:35,False,"""Vatikanet har …",2006-05-21 16:57:00,[3006712],"""article_defaul…","""https://ekstra…",[],[],"[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive"""


## Added features and hourly difference between published and viewed article

In [7]:
## NEW

from sklearn.preprocessing import StandardScaler

# Switch to pandas for the row-wise Python lookups below
df_train = df_train.to_pandas()

# Lookup table: article_id -> published_time
article_time_dict = dict(
    zip(
        df_articles["article_id"].to_list(),
        df_articles["published_time"].to_list(),
    )
)


def get_article_times(article_ids):
    """Return the published time of every id in `article_ids` (None for ids missing from the lookup)."""
    return [article_time_dict.get(article_id) for article_id in article_ids]


# Published times of the in-view articles
df_train["inview_article_times"] = df_train["article_ids_inview"].apply(get_article_times)

# Published time(s) of the clicked article(s)
df_train["clicked_article_time"] = df_train["article_ids_clicked"].apply(get_article_times)

# Create a function to calculate hour differences
def calculate_hour_differences(impression_time, article_times):
    """
    Hours elapsed between an article timestamp and the impression.

    Args:
        impression_time: Timestamp of the impression.
        article_times: Either a single timestamp (or None), or a ``list`` of
            timestamps in which individual entries may be None.

    Returns:
        A float (or None) for a single timestamp; for a list, a list of the same
        length holding a float or None per entry.
    """
    def _hours_since(article_time):
        # A missing article timestamp stays missing in the result
        if article_time is None:
            return None
        return (impression_time - article_time).total_seconds() / 3600

    # Only a real `list` is treated element-wise; anything else is a single value
    if isinstance(article_times, list):
        return [_hours_since(article_time) for article_time in article_times]
    return _hours_since(article_times)

# Hours between the impression and the publication of each in-view article
def _inview_hour_differences(row):
    return calculate_hour_differences(row['impression_time'], row['inview_article_times'])

df_train['inview_hour_differences'] = df_train.apply(_inview_hour_differences, axis=1)

# NOTE(review): an hour difference for the clicked article was sketched here earlier but
# left disabled; it is derived from the clicked article and may leak the label.

# Lookup table: article_id -> category
article_cat_dict = dict(
    zip(
        df_articles["article_id"].to_list(),
        df_articles["category"].to_list(),
    )
)


def get_article_category(article_ids):
    """Return the category of every id in `article_ids` (None for ids missing from the lookup)."""
    return [article_cat_dict.get(article_id) for article_id in article_ids]


# Categories of the in-view articles and of the articles in the user's history
df_train["inview_article_categories"] = df_train["article_ids_inview"].apply(get_article_category)
df_train["history_article_categories"] = df_train["article_id_fixed"].apply(get_article_category)

# Lookup table: article_id -> article_type
article_type_dict = dict(
    zip(
        df_articles["article_id"].to_list(),
        df_articles["article_type"].to_list(),
    )
)


def get_article_type(article_ids):
    """Return the article type of every id in `article_ids` (None for ids missing from the lookup)."""
    return [article_type_dict.get(article_id) for article_id in article_ids]


# Article types of the in-view articles and of the articles in the user's history
df_train["inview_article_types"] = df_train["article_ids_inview"].apply(get_article_type)
df_train["history_article_types"] = df_train["article_id_fixed"].apply(get_article_type)

# The raw timestamp columns were only needed to derive the hour differences
df_train = df_train.drop(
    columns=['inview_article_times', 'clicked_article_time', 'impression_time']
)

# Back to polars for the rest of the pipeline
df_train = pl.from_pandas(df_train)

df_train.head(2)

user_id,article_id_fixed,scroll_percentage_fixed,read_time_fixed,article_ids_inview,article_ids_clicked,impression_id,labels,inview_hour_differences,inview_article_categories,history_article_categories,inview_article_types,history_article_types
u32,list[i32],list[f32],list[f32],list[i64],list[i64],u32,list[i8],list[f64],list[i64],list[i64],list[str],list[str]
1103214,"[9762520, 9761561, … 9769197]","[100.0, 34.0, … 53.0]","[54.0, 7.0, … 3.0]","[9774429, 9774541, … 9771355]",[9774541],42346545,"[0, 1, … 0]","[0.624722, 1.684167, … 0.644444]","[142, 118, … 118]","[457, 512, … 140]","[""article_default"", ""article_default"", … ""article_default""]","[""article_default"", ""article_default"", … ""article_default""]"
2497899,"[9769828, 9769518, … 9770594]","[20.0, 88.0, … 85.0]","[5.0, 10.0, … 382.0]","[9779285, 9779285, … 9779285]",[9779269],24814465,"[0, 0, … 0]","[0.741944, 0.741944, … 0.741944]","[414, 414, … 414]","[118, 118, … 118]","[""article_default"", ""article_default"", … ""article_default""]","[""article_default"", ""article_default"", … ""article_default""]"


In [8]:
# Same feature engineering as for the training split, applied to the validation split.
# The lookup helpers (get_article_times, calculate_hour_differences, get_article_category,
# get_article_type) are defined in the training feature cell above and are reused here
# instead of being redefined; they are built from the same df_articles frame.

# Switch to pandas for the row-wise Python lookups below
df_validation = df_validation.to_pandas()

# Published times of the in-view articles (temporary; dropped again below)
df_validation["inview_article_times"] = df_validation["article_ids_inview"].apply(get_article_times)

# Hours between the impression and the publication of each in-view article
df_validation['inview_hour_differences'] = df_validation.apply(
    lambda row: calculate_hour_differences(row['impression_time'], row['inview_article_times']),
    axis=1
)
# The hour difference for the clicked article is intentionally not computed -- it might be leaky.

# Categories of the in-view articles and of the articles in the user's history
df_validation["inview_article_categories"] = df_validation["article_ids_inview"].apply(get_article_category)
df_validation["history_article_categories"] = df_validation["article_id_fixed"].apply(get_article_category)

# Article types of the in-view articles and of the articles in the user's history
df_validation["inview_article_types"] = df_validation["article_ids_inview"].apply(get_article_type)
df_validation["history_article_types"] = df_validation["article_id_fixed"].apply(get_article_type)

# The raw timestamp columns were only needed to derive the hour differences
df_validation = df_validation.drop(['inview_article_times', 'impression_time'], axis=1)

# Back to polars for the rest of the pipeline
df_validation = pl.from_pandas(df_validation)

df_validation.head(2)

user_id,article_id_fixed,scroll_percentage_fixed,read_time_fixed,article_ids_inview,article_ids_clicked,impression_id,labels,inview_hour_differences,inview_article_categories,history_article_categories,inview_article_types,history_article_types
u32,list[i32],list[f32],list[f32],list[i32],list[i32],u32,list[i8],list[f64],list[i64],list[i64],list[str],list[str]
1581880,"[9775985, 9777374, … 9779738]","[100.0, 100.0, … null]","[42.0, 62.0, … 0.0]","[9776315, 9772363, … 9777846]",[9777846],184836170,"[0, 0, … 1]","[64.012222, 141.3025, … 0.128889]","[2975, 2975, … 140]","[414, 142, … 414]","[""article_default"", ""article_default"", … ""article_default""]","[""article_default"", ""article_default"", … ""article_default""]"
401400,"[9759955, 9776897, … 9780096]","[28.0, 14.0, … 53.0]","[11.0, 4.0, … 7.0]","[9780460, 9553264, … 9506503]",[9787230],214315254,"[0, 0, … 0]","[11.757778, 3883.658056, … 4795.724444]","[140, 457, … 118]","[414, 118, … 118]","[""article_default"", ""article_default"", … ""article_default""]","[""article_default"", ""article_scribblelive"", … ""article_default""]"


## Init model using HuggingFace's tokenizer and wordembedding
In the original implementation, they use the GloVe embeddings and tokenizer. To get going fast, we'll use a pretrained language model from Hugging Face instead.
We use its tokenizer to tokenize the articles and its word embeddings to initialize NRMS.


In [9]:
# Hugging Face checkpoint that provides both the tokenizer and the initial word embeddings.
# Alternatives:
# TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
# TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-large"
TRANSFORMER_MODEL_NAME = "Maltehb/danish-bert-botxo"
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
# NOTE(review): changing this value reportedly raises an error -- presumably the title
# length is also fixed somewhere else; confirm before changing.
MAX_TITLE_LENGTH = 30

# Load the tokenizer and the model from Hugging Face
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)

# The NRMS word embeddings are initialised from the transformer's word embeddings
word2vec_embedding = get_transformers_word_embeddings(transformer_model)

# Concatenate the text columns into one column, then tokenise it to a fixed length
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles,
    transformer_tokenizer,
    cat_cal,
    max_length=MAX_TITLE_LENGTH,
)

# article_id -> tokenised text, consumed by the dataloaders
article_mapping = create_article_id_to_value_mapping(
    df=df_articles,
    value_col=token_col_title,
)




In [10]:
word2vec_embedding

array([[ 0.01057525,  0.0519704 ,  0.08909235, ..., -0.00795781,
        -0.06168545, -0.07079539],
       [-0.01835794,  0.04070301,  0.02630469, ..., -0.00612215,
        -0.03679245, -0.00261144],
       [-0.01904276,  0.02300256, -0.00536503, ...,  0.00180998,
         0.01913669, -0.00572065],
       ...,
       [ 0.02969794, -0.02969835,  0.0127237 , ..., -0.0130282 ,
        -0.00069379,  0.004221  ],
       [ 0.03114044, -0.03700501,  0.01400322, ..., -0.00791059,
         0.00770514, -0.00168254],
       [ 0.0367507 , -0.0307173 ,  0.00670483, ..., -0.01460291,
         0.00015374, -0.00201466]], dtype=float32)

# Initiate the dataloaders
In this implementation the models and the data are decoupled. Hence, you should build a dataloader that fits your needs.

In [11]:
# Settings shared by the training and the validation loader
shared_loader_kwargs = dict(
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
)

# Training loader: eval_mode off, larger batches
train_dataloader = NRMSDataLoader(
    behaviors=df_train,
    eval_mode=False,
    batch_size=128,
    **shared_loader_kwargs,
)
# Validation loader: eval_mode on, smaller batches
val_dataloader = NRMSDataLoader(
    behaviors=df_validation,
    eval_mode=True,
    batch_size=64,
    **shared_loader_kwargs,
)

## Train the model


In [12]:
# List all physical devices
# physical_devices = tf.config.list_physical_devices('GPU')
# print("Available devices:", physical_devices)
# import torch.nn as nn
# print(torch.cuda.is_available())

In [13]:
import os
from tqdm.notebook import tqdm


# Output locations for this model, all relative to the current working directory
MODEL_NAME = "NRMS"
LOG_DIR = f"downloads/runs/{MODEL_NAME}"
WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"

# Create both output folders up front; existing folders are left untouched
for _output_dir in (LOG_DIR, WEIGHTS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# Create a custom ModelCheckpoint for PyTorch
class PyTorchModelCheckpoint:
    """
    Minimal checkpoint callback that saves model weights at the end of an epoch.

    Args:
        filepath: Destination handed to ``model_wrapper.save_weights``.
        model_wrapper: Object exposing ``save_weights(filepath)``; it must be set
            before the first save is triggered.
        save_best_only: If True, save only when ``val_loss`` improves on the best
            value seen so far; if False, save on every epoch that reports ``val_loss``.
        save_weights_only: Stored on the instance but not consulted by this class.
        verbose: If truthy, print a message whenever the validation loss improves.
    """

    def __init__(self, filepath, model_wrapper=None, save_best_only=True, save_weights_only=True, verbose=1):
        self.filepath = filepath
        self.model_wrapper = model_wrapper  # Store the model wrapper reference
        self.save_best_only = save_best_only
        self.save_weights_only = save_weights_only
        self.verbose = verbose
        self.best_val_loss = float('inf')

    def on_epoch_end(self, epoch, logs=None):
        """
        Save the weights according to the configured policy.

        Args:
            epoch: Index of the finished epoch (not used by the save decision).
            logs: Mapping of metric names to values. Nothing is saved when it is
                None/empty or carries no ``'val_loss'`` entry.
        """
        # `logs` defaults to None, so fall back to an empty mapping instead of
        # failing on `None.get(...)`.
        val_loss = (logs or {}).get('val_loss', None)
        if val_loss is None:
            return

        if not self.save_best_only:
            self.model_wrapper.save_weights(self.filepath)
            return

        # A NaN loss compares False here and therefore never counts as an improvement.
        if val_loss < self.best_val_loss:
            if self.verbose:
                print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
            self.best_val_loss = val_loss
            self.model_wrapper.save_weights(self.filepath)

# Keep the model's input dimensions in sync with the notebook configuration.
hparams_nrms.history_size = HISTORY_SIZE
# The article tokens fed to the model are MAX_TITLE_LENGTH long, so the model has to be
# configured with the same length; without this, changing MAX_TITLE_LENGTH leaves the data
# and the model out of step.
# NOTE(review): assumes `title_size` is the field NRMSModel reads for the title length -- confirm.
hparams_nrms.title_size = MAX_TITLE_LENGTH

# Initialize the model first ...
pytorch_model = NRMSModel(
    hparams=hparams_nrms,
    word2vec_embedding=word2vec_embedding,
    seed=42,
)
model = NRMSWrapper(pytorch_model)

# ... then create the callback, which needs a reference to the wrapper to save its weights
modelcheckpoint = PyTorchModelCheckpoint(
    filepath=MODEL_WEIGHTS,
    model_wrapper=model,  # Pass the model wrapper
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

# Training
hist = model.fit(
    train_dataloader,
    validation_data=val_dataloader,
    epochs=1,
    callbacks=[modelcheckpoint]
)

# Restore the weights written by the checkpoint callback
model.load_weights(filepath=MODEL_WEIGHTS)

# Get predictions
pred_validation = model.predict(val_dataloader)

NRMSWrapper init
True


Epoch 1/1 [Train]:   0%|          | 0/55 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 338.00 MiB. GPU 0 has a total capacity of 39.50 GiB of which 253.75 MiB is free. Process 23564 has 1.36 GiB memory in use. Process 275171 has 6.24 GiB memory in use. Process 377164 has 448.00 MiB memory in use. Process 389821 has 24.73 GiB memory in use. Process 423385 has 944.00 MiB memory in use. Process 509202 has 982.00 MiB memory in use. Process 517287 has 1.10 GiB memory in use. Process 602174 has 982.00 MiB memory in use. Process 850463 has 788.00 MiB memory in use. Including non-PyTorch memory, this process has 1.01 GiB memory in use. Process 861832 has 708.00 MiB memory in use. Of the allocated memory 523.54 MiB is allocated by PyTorch, and 22.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
#TODO Attempt at new

# import os
# from tqdm.notebook import tqdm
# from transformers import AutoModel, AutoTokenizer

# # Define constants
# MODEL_NAME = "NRMS"
# LOG_DIR = f"downloads/runs/{MODEL_NAME}"
# WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
# MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"
# TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-large"

# # Create necessary directories
# os.makedirs(LOG_DIR, exist_ok=True)
# os.makedirs(WEIGHTS_DIR, exist_ok=True)

# # Load transformer model and tokenizer
# transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
# transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# # Create a custom ModelCheckpoint for PyTorch
# class PyTorchModelCheckpoint:
#     def __init__(self, filepath, model_wrapper=None, save_best_only=True, save_weights_only=True, verbose=1):
#         self.filepath = filepath
#         self.model_wrapper = model_wrapper  # Store the model wrapper reference
#         self.save_best_only = save_best_only
#         self.save_weights_only = save_weights_only
#         self.verbose = verbose
#         self.best_val_loss = float('inf')
    
#     def on_epoch_end(self, epoch, logs=None):
#         val_loss = logs.get('val_loss', None)
#         if val_loss is None:
#             return
        
#         if self.save_best_only:
#             if val_loss < self.best_val_loss:
#                 if self.verbose:
#                     print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
#                 self.best_val_loss = val_loss
#                 # Use the model_wrapper reference
#                 self.model_wrapper.save_weights(self.filepath)
#         else:
#             self.model_wrapper.save_weights(self.filepath)

# # Define model hyperparameters
# hparams_nrms = {
#     "head_num": 8,
#     "head_dim": 64,
#     "title_size": MAX_TITLE_LENGTH,
#     "history_size": HISTORY_SIZE,
#     "dropout": 0.2 #NOT USED
# } # from gpt

# # init NRMS model
# pytorch_model = NRMSModel(
#     hparams=hparams_nrms,
#     transformer_model=transformer_model,
#     transformer_tokenizer=transformer_tokenizer,
#     seed=42
# )
# model = NRMSWrapper(pytorch_model)

# # Create the checkpoint callback
# modelcheckpoint = PyTorchModelCheckpoint(
#     filepath=MODEL_WEIGHTS,
#     model_wrapper=model,  # Pass the model wrapper
#     save_best_only=True,
#     save_weights_only=True,
#     verbose=1
# )

# # fit
# hist = model.fit(
#     train_dataloader,
#     validation_data=val_dataloader,
#     epochs=1,
#     callbacks=[modelcheckpoint]
# )

# # Load weights using the wrapper
# model.load_weights(filepath=MODEL_WEIGHTS)

# # Get predictions
# pred_validation = model.predict(val_dataloader)

# Saving / loading model because hpc annoying

In [None]:
# MODEL_FILE = f"downloads/models/{MODEL_NAME}.h5" 

# # Save the model after training
# print("Saving the model...")
# os.makedirs(os.path.dirname(MODEL_FILE), exist_ok=True)
# model.model.save(MODEL_FILE)  # Save the full model (architecture + weights)
# print(f"Model saved at {MODEL_FILE}")

##LOAD SAVED MODEL
# from tensorflow.keras.models import load_model

# # Load the saved model
# print(f"Loading the model from {MODEL_FILE}...")
# model.model = load_model(MODEL_FILE)
# print("Model loaded successfully.")


# Example how to compute some metrics:

In [None]:
# pred_validation = model.scorer.predict(val_dataloader)

## Add the predictions to the dataframe

In [None]:
# Attach the predicted scores to the validation frame ...
df_validation_scored = add_prediction_scores(df_validation, pred_validation.tolist())
# ... and add the known-user column, using the training users as the set of known users
df_validation = add_known_user_column(
    df_validation_scored,
    known_users=df_train[DEFAULT_USER_COL],
)
df_validation.head(2)

user_id,article_id_fixed,scroll_percentage_fixed,read_time_fixed,article_ids_inview,article_ids_clicked,impression_id,labels,inview_hour_differences,inview_article_categories,inview_article_types,scores,is_known_user
u32,list[i32],list[f32],list[f32],list[i32],list[i32],u32,list[i8],list[f64],list[i64],list[str],list[f64],bool
22548,"[9772629, 9773335, … 9776929]","[29.0, null, … 95.0]","[6.0, 0.0, … 56.0]","[9784710, 9784591, … 9783865]",[9784696],96791,"[0, 0, … 0]","[6.814444, 8.833333, … 6.831111]","[142, 142, … 498]","[""article_default"", ""article_default"", … ""article_default""]","[0.092089, 0.260742, … 0.157016]",True
22548,"[9772629, 9773335, … 9776929]","[29.0, null, … 95.0]","[6.0, 0.0, … 56.0]","[9784406, 9784642, … 9784281]",[9784281],96798,"[0, 0, … 1]","[11.292222, 9.510556, … 12.075]","[414, 118, … 142]","[""article_fullscreen_gallery"", ""article_default"", … ""article_default""]","[0.124995, 0.109533, … 0.046443]",True


### Compute metrics

In [None]:
# Metrics computed from the label and score lists
ranking_metrics = [AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)]

metrics = MetricEvaluator(
    labels=df_validation["labels"].to_list(),
    predictions=df_validation["scores"].to_list(),
    metric_functions=ranking_metrics,
)
metrics.evaluate()

<MetricEvaluator class>: 
 {
    "auc": 0.5657954501202044,
    "mrr": 0.3557576891716898,
    "ndcg@5": 0.39685534906028735,
    "ndcg@10": 0.4723051350875782
}

## Make submission file

In [None]:
# Apply rank_predictions_by_score to each impression's score list
ranked_scores_expr = (
    pl.col("scores")
    .map_elements(lambda x: list(rank_predictions_by_score(x)))
    .alias("ranked_scores")
)
df_validation = df_validation.with_columns(ranked_scores_expr)
df_validation.head(2)

user_id,article_id_fixed,scroll_percentage_fixed,read_time_fixed,article_ids_inview,article_ids_clicked,impression_id,labels,inview_hour_differences,inview_article_categories,inview_article_types,scores,is_known_user,ranked_scores
u32,list[i32],list[f32],list[f32],list[i32],list[i32],u32,list[i8],list[f64],list[i64],list[str],list[f64],bool,list[i64]
22548,"[9772629, 9773335, … 9776929]","[29.0, null, … 95.0]","[6.0, 0.0, … 56.0]","[9784710, 9784591, … 9783865]",[9784696],96791,"[0, 0, … 0]","[6.814444, 8.833333, … 6.831111]","[142, 142, … 498]","[""article_default"", ""article_default"", … ""article_default""]","[0.092089, 0.260742, … 0.157016]",True,"[5, 2, … 4]"
22548,"[9772629, 9773335, … 9776929]","[29.0, null, … 95.0]","[6.0, 0.0, … 56.0]","[9784406, 9784642, … 9784281]",[9784281],96798,"[0, 0, … 1]","[11.292222, 9.510556, … 12.075]","[414, 118, … 142]","[""article_fullscreen_gallery"", ""article_default"", … ""article_default""]","[0.124995, 0.109533, … 0.046443]",True,"[11, 17, … 22]"


This uses the validation set; simply add the test set to your flow in the same way.

In [None]:
# Write one entry per impression (impression id + ranked scores) to the submission file.
# The path is relative to the current working directory.
write_submission_file(
    impression_ids=df_validation[DEFAULT_IMPRESSION_ID_COL],
    prediction_scores=df_validation["ranked_scores"],
    path="downloads/predictions.txt",
)

0it [00:00, ?it/s]

244647it [00:13, 18481.69it/s]


Zipping downloads/predictions.txt to downloads/predictions.zip


# DONE 🚀

In [10]:
# import argparse


# def get_args():
#     parser = argparse.ArgumentParser(
#         description="Argument parser for NRMSModel training"
#     )

#     parser.add_argument(
#         "--data_path",
#         type=str,
#         default=str("/dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data"), #str("~/ebnerd_data"),
#         help="Path to the data directory",
#     )

#     # General settings
#     parser.add_argument("--seed", type=int, default=123, help="Random seed")
#     parser.add_argument(
#         "--datasplit", type=str, default="ebnerd_small", help="Dataset split to use"
#     )
#     parser.add_argument("--debug", action="store_true", help="Enable debug mode")

#     # Batch sizes
#     parser.add_argument(
#         "--bs_train", type=int, default=32, help="Batch size for training"
#     )
#     parser.add_argument(
#         "--bs_test", type=int, default=32, help="Batch size for testing"
#     )
#     parser.add_argument(
#         "--batch_size_test_wo_b",
#         type=int,
#         default=32,
#         help="Batch size for testing without balancing",
#     )
#     parser.add_argument(
#         "--batch_size_test_w_b",
#         type=int,
#         default=4,
#         help="Batch size for testing with balancing",
#     )

#     # History and ratios
#     parser.add_argument(
#         "--history_size", type=int, default=20, help="History size for the model"
#     )
#     parser.add_argument(
#         "--npratio", type=int, default=4, help="Negative-positive ratio"
#     )

#     # Training settings
#     parser.add_argument("--epochs", type=int, default=5, help="Number of epochs")
#     parser.add_argument(
#         "--train_fraction",
#         type=float,
#         default=1.0,
#         help="Fraction of training data to use",
#     )
#     parser.add_argument(
#         "--fraction_test",
#         type=float,
#         default=1.0,
#         help="Fraction of testing data to use",
#     )

#     # Model and loader settings
#     parser.add_argument(
#         "--nrms_loader",
#         type=str,
#         default="NRMSDataLoaderPretransform",
#         choices=["NRMSDataLoaderPretransform", "NRMSDataLoader"],
#         help="Data loader type (speed or memory efficient)",
#     )

#     # Chunk processing
#     parser.add_argument(
#         "--n_chunks_test", type=int, default=10, help="Number of test chunks to process"
#     )
#     parser.add_argument(
#         "--chunks_done", type=int, default=0, help="Number of chunks already processed"
#     )

#     # =====================================================================================
#     #  ############################# UNIQUE FOR NRMSDocVec ###############################
#     # =====================================================================================

#     parser.add_argument(
#         "--document_embeddings",
#         type=str,
#         default="Ekstra_Bladet_contrastive_vector/contrastive_vector.parquet",
#         help="Path to the document embeddings file",
#     )
#     # Model function and architecture
#     parser.add_argument(
#         "--title_size", type=int, default=768, help="Size of title encoding"
#     )
#     parser.add_argument(
#         "--head_num", type=int, default=16, help="Number of attention heads"
#     )
#     parser.add_argument(
#         "--head_dim", type=int, default=16, help="Dimension of each attention head"
#     )
#     parser.add_argument(
#         "--attention_hidden_dim",
#         type=int,
#         default=200,
#         help="Dimension of attention hidden layers",
#     )
#     parser.add_argument(
#         "--newsencoder_units_per_layer",
#         nargs="+",
#         type=int,
#         default=[512, 512, 512],
#         help="List of units per layer in the news encoder",
#     )

#     # Optimizer settings
#     parser.add_argument(
#         "--optimizer", type=str, default="adam", help="Optimizer to use"
#     )
#     parser.add_argument(
#         "--loss", type=str, default="cross_entropy_loss", help="Loss function"
#     )
#     parser.add_argument("--dropout", type=float, default=0.2, help="Dropout rate")
#     parser.add_argument(
#         "--learning_rate", type=float, default=1e-4, help="Learning rate"
#     )
#     parser.add_argument(
#         "--newsencoder_l2_regularization",
#         type=float,
#         default=1e-4,
#         help="L2 regularization for the news encoder",
#     )

#     return parser.parse_args()


In [11]:
# # Running DOCVEC

# from transformers import AutoTokenizer, AutoModel
# from ebrec.utils2._nlp import get_transformers_word_embeddings
# from ebrec.utils2._articles import convert_text2encoding_with_transformers

# from pathlib import Path
# import tensorflow as tf
# import datetime as dt
# import polars as pl
# import shutil
# import gc
# import os

# from ebrec.utils2._constants import *

# from ebrec.utils2._behaviors import (
#     create_binary_labels_column,
#     sampling_strategy_wu2019,
#     add_prediction_scores,
#     truncate_history,
#     ebnerd_from_path,
# )



# from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore

# from ebrec.utils2._python import (
#     write_submission_file,
#     rank_predictions_by_score,
#     write_json_file,
# )
# from ebrec.utils2._articles import create_article_id_to_value_mapping
# from ebrec.utils2._polars import split_df_chunks, concat_str_columns

# from ebrec.models.newsrec.dataloader import NRMSDataLoader, NRMSDataLoaderPretransform
# from ebrec.models.newsrec.model_config2 import (
#     hparams_nrms,
#     hparams_nrms_docvec,
#     hparams_to_dict,
#     print_hparams,
# )
# from ebrec.models.newsrec.nrms_docvec2 import NRMSDocVec
# from ebrec.models.newsrec import NRMSModel

# os.environ["TOKENIZERS_PARALLELISM"] = "false"


# # from args_nrms_docvec import get_args


# args = get_args()

# for arg, val in vars(args).items():
#     print(f"{arg} : {val}")

# PATH = Path(args.data_path).expanduser()
# # Access arguments as variables
# SEED = args.seed
# DATASPLIT = args.datasplit
# DEBUG = args.debug
# BS_TRAIN = args.bs_train
# BS_TEST = args.bs_test
# BATCH_SIZE_TEST_WO_B = args.batch_size_test_wo_b
# BATCH_SIZE_TEST_W_B = args.batch_size_test_w_b
# HISTORY_SIZE = args.history_size
# NPRATIO = args.npratio
# EPOCHS = args.epochs
# TRAIN_FRACTION = args.train_fraction if not DEBUG else 0.0001
# FRACTION_TEST = args.fraction_test if not DEBUG else 0.0001

# NRMSLoader_training = (
#     NRMSDataLoaderPretransform
#     if args.nrms_loader == "NRMSDataLoaderPretransform"
#     else NRMSDataLoader
# )

# # =====================================================================================
# #  ############################# UNIQUE FOR NRMSModel ################################
# # =====================================================================================

# # Model in use:
# model_func = NRMSDocVec
# hparams = hparams_nrms_docvec
# #
# hparams.title_size = args.title_size
# hparams.history_size = args.history_size
# hparams.head_num = args.head_num
# hparams.head_dim = args.head_dim
# hparams.attention_hidden_dim = args.attention_hidden_dim
# hparams.newsencoder_units_per_layer = args.newsencoder_units_per_layer
# hparams.optimizer = args.optimizer
# hparams.loss = args.loss
# hparams.dropout = args.dropout
# hparams.learning_rate = args.learning_rate
# hparams.newsencoder_l2_regularization = args.newsencoder_l2_regularization


# # =============
# # Data-path
# DOC_VEC_PATH = PATH.joinpath(f"artifacts/{args.document_embeddings}")
# print("Initiating articles...")
# df_articles = pl.read_parquet(DOC_VEC_PATH)
# article_mapping = create_article_id_to_value_mapping(
#     df=df_articles, value_col=df_articles.columns[-1]
# )

# # =====================================================================================
# #  ############################# UNIQUE FOR NRMSDocVec ###############################
# # =====================================================================================

# print_hparams(hparams)

# # Dump paths:
# DUMP_DIR = Path("ebnerd_predictions")
# DUMP_DIR.mkdir(exist_ok=True, parents=True)
# #
# DT_NOW = dt.datetime.now()
# #
# MODEL_NAME = model_func.__name__
# MODEL_OUTPUT_NAME = f"{MODEL_NAME}-{DT_NOW}"
# #
# ARTIFACT_DIR = DUMP_DIR.joinpath("test_predictions", MODEL_OUTPUT_NAME)
# # Model monitoring:
# MODEL_WEIGHTS = DUMP_DIR.joinpath(f"state_dict/{MODEL_OUTPUT_NAME}/weights")
# LOG_DIR = DUMP_DIR.joinpath(f"runs/{MODEL_OUTPUT_NAME}")
# # Evaluating the test set can be memory intensive, so we'll chunk it up:
# TEST_CHUNKS_DIR = ARTIFACT_DIR.joinpath("test_chunks")
# TEST_CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
# N_CHUNKS_TEST = args.n_chunks_test
# CHUNKS_DONE = args.chunks_done  # if it crashes, you can start from here.
# # Keep the dataframe slim by selecting only the columns we need:
# COLUMNS = [
#     DEFAULT_IMPRESSION_TIMESTAMP_COL,
#     DEFAULT_HISTORY_ARTICLE_ID_COL,
#     DEFAULT_INVIEW_ARTICLES_COL,
#     DEFAULT_CLICKED_ARTICLES_COL,
#     DEFAULT_IMPRESSION_ID_COL,
#     DEFAULT_USER_COL,
# ]
# # Store hparams
# write_json_file(
#     hparams_to_dict(hparams),
#     ARTIFACT_DIR.joinpath(f"{MODEL_NAME}_hparams.json"),
# )
# write_json_file(vars(args), ARTIFACT_DIR.joinpath(f"{MODEL_NAME}_argparser.json"))

# # =====================================================================================
# # We'll use the training + validation sets for training.
# df = (
#     pl.concat(
#         [
#             ebnerd_from_path(
#                 PATH.joinpath(DATASPLIT, "train"),
#                 history_size=HISTORY_SIZE,
#                 padding=0,
#             ),
#             ebnerd_from_path(
#                 PATH.joinpath(DATASPLIT, "validation"),
#                 history_size=HISTORY_SIZE,
#                 padding=0,
#             ),
#         ]
#     )
#     .sample(fraction=TRAIN_FRACTION, shuffle=True, seed=SEED)
#     .select(COLUMNS)
#     .pipe(
#         sampling_strategy_wu2019,
#         npratio=NPRATIO,
#         shuffle=True,
#         with_replacement=True,
#         seed=SEED,
#     )
#     .pipe(create_binary_labels_column)
# )

# # We hold out the most recent impressions (dates on or after the day before the latest date) as the validation set.
# last_dt = df[DEFAULT_IMPRESSION_TIMESTAMP_COL].dt.date().max() - dt.timedelta(days=1)
# df_train = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).dt.date() < last_dt)
# df_validation = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).dt.date() >= last_dt)

# # =====================================================================================
# print(f"Initiating training-dataloader")
# train_dataloader = NRMSLoader_training(
#     behaviors=df_train,
#     article_dict=article_mapping,
#     unknown_representation="zeros",
#     history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
#     eval_mode=False,
#     batch_size=BS_TRAIN,
# )

# val_dataloader = NRMSLoader_training(
#     behaviors=df_validation,
#     article_dict=article_mapping,
#     unknown_representation="zeros",
#     history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
#     eval_mode=False,
#     batch_size=BS_TRAIN,
# )

# # =====================================================================================
# # CALLBACKS
# tensorboard_callback = tf.keras.callbacks.TensorBoard(
#     log_dir=LOG_DIR,
#     histogram_freq=1,
# )
# early_stopping = tf.keras.callbacks.EarlyStopping(
#     monitor="val_auc",
#     mode="max",
#     patience=4,
#     restore_best_weights=True,
# )
# modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
#     filepath=MODEL_WEIGHTS,
#     monitor="val_auc",
#     mode="max",
#     save_best_only=True,
#     save_weights_only=True,
#     verbose=1,
# )
# lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
#     monitor="val_auc",
#     mode="max",
#     factor=0.2,
#     patience=2,
#     min_lr=1e-6,
# )
# callbacks = [tensorboard_callback, early_stopping, modelcheckpoint, lr_scheduler]

# # =====================================================================================
# model = model_func(
#     hparams=hparams,
#     seed=42,
# )
# model.model.compile(
#     optimizer=model.model.optimizer,
#     loss=model.model.loss,
#     metrics=["AUC"],
# )
# f"Initiating {MODEL_NAME}, start training..."
# # =>
# hist = model.model.fit(
#     train_dataloader,
#     validation_data=val_dataloader,
#     epochs=EPOCHS,
#     callbacks=callbacks,
# )

# print(f"loading model: {MODEL_WEIGHTS}")
# model.model.load_weights(MODEL_WEIGHTS)

# # =====================================================================================
# print("Initiating testset...")
# df_test = (
#     ebnerd_from_path(
#         PATH.joinpath("ebnerd_testset", "test"),
#         history_size=HISTORY_SIZE,
#         padding=0,
#     )
#     .sample(fraction=FRACTION_TEST)
#     .with_columns(
#         pl.col(DEFAULT_INVIEW_ARTICLES_COL)
#         .list.first()
#         .alias(DEFAULT_CLICKED_ARTICLES_COL)
#     )
#     .select(COLUMNS + [DEFAULT_IS_BEYOND_ACCURACY_COL])
#     .with_columns(
#         pl.col(DEFAULT_INVIEW_ARTICLES_COL)
#         .list.eval(pl.element() * 0)
#         .alias(DEFAULT_LABELS_COL)
#     )
# )
# # Split test in beyond-accuracy TRUE / FALSE. In the BA 'article_ids_inview' is 250.
# df_test_wo_beyond = df_test.filter(~pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL))
# df_test_w_beyond = df_test.filter(pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL))

# df_test_chunks = split_df_chunks(df_test_wo_beyond, n_chunks=N_CHUNKS_TEST)
# df_pred_test_wo_beyond = []
# print("Initiating testset without beyond-accuracy...")
# for i, df_test_chunk in enumerate(df_test_chunks[CHUNKS_DONE:], start=1 + CHUNKS_DONE):
#     print(f"Test chunk: {i}/{len(df_test_chunks)}")
#     # Initialize DataLoader
#     test_dataloader_wo_b = NRMSDataLoader(
#         behaviors=df_test_chunk,
#         article_dict=article_mapping,
#         unknown_representation="zeros",
#         history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
#         eval_mode=True,
#         batch_size=BATCH_SIZE_TEST_WO_B,
#     )
#     # Predict and clear session
#     scores = model.scorer.predict(test_dataloader_wo_b)
#     tf.keras.backend.clear_session()

#     # Process the predictions
#     df_test_chunk = add_prediction_scores(df_test_chunk, scores.tolist()).with_columns(
#         pl.col("scores")
#         .map_elements(lambda x: list(rank_predictions_by_score(x)))
#         .alias("ranked_scores")
#     )

#     # Save the processed chunk
#     df_test_chunk.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet(
#         TEST_CHUNKS_DIR.joinpath(f"pred_wo_ba_{i}.parquet")
#     )

#     # Append and clean up
#     df_pred_test_wo_beyond.append(df_test_chunk)

#     # Cleanup
#     del df_test_chunk, test_dataloader_wo_b, scores
#     gc.collect()

# df_pred_test_wo_beyond = pl.concat(df_pred_test_wo_beyond)
# df_pred_test_wo_beyond.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet(
#     TEST_CHUNKS_DIR.joinpath("pred_wo_ba.parquet")
# )
# # =====================================================================================
# print("Initiating testset with beyond-accuracy...")
# test_dataloader_w_b = NRMSDataLoader(
#     behaviors=df_test_w_beyond,
#     article_dict=article_mapping,
#     unknown_representation="zeros",
#     history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
#     eval_mode=True,
#     batch_size=BATCH_SIZE_TEST_W_B,
# )
# scores = model.scorer.predict(test_dataloader_w_b)
# df_pred_test_w_beyond = add_prediction_scores(
#     df_test_w_beyond, scores.tolist()
# ).with_columns(
#     pl.col("scores")
#     .map_elements(lambda x: list(rank_predictions_by_score(x)))
#     .alias("ranked_scores")
# )
# df_pred_test_w_beyond.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet(
#     TEST_CHUNKS_DIR.joinpath("pred_w_ba.parquet")
# )

# # =====================================================================================
# print("Saving prediction results...")
# df_test = pl.concat([df_pred_test_wo_beyond, df_pred_test_w_beyond])
# df_test.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet(
#     ARTIFACT_DIR.joinpath("test_predictions.parquet")
# )

# if TEST_CHUNKS_DIR.exists() and TEST_CHUNKS_DIR.is_dir():
#     shutil.rmtree(TEST_CHUNKS_DIR)

# write_submission_file(
#     impression_ids=df_test[DEFAULT_IMPRESSION_ID_COL],
#     prediction_scores=df_test["ranked_scores"],
#     path=ARTIFACT_DIR.joinpath("predictions.txt"),
#     filename_zip=f"{MODEL_NAME}-{SEED}-{DATASPLIT}.zip",
# )
