# Getting started

In this notebook, we illustrate how to use the Neural News Recommendation with Multi-Head Self-Attention ([NRMS](https://aclanthology.org/D19-1671/)). The implementation is taken from the [recommenders](https://github.com/recommenders-team/recommenders) repository. We have simply stripped the model to keep it cleaner.

We use a small dataset, which is downloaded from [recsys.eb.dk](https://recsys.eb.dk/). All the datasets are stored in the folder path ```~/ebnerd_data/*```.

In [1]:
# Notebook-wide verbosity switch: later cells guard their debug output with `if VariableTrueIfPrint:`.
VariableTrueIfPrint=False

## Load functionality

In [2]:
from pathlib import Path

from transformers import AutoTokenizer, AutoModel
import tensorflow as tf
import polars as pl
from tensorflow.python.client import device_lib
import numpy as np
import torch
import torch.nn as nn

from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_SUBTITLE_COL,
    DEFAULT_LABELS_COL,
    DEFAULT_TITLE_COL,
    DEFAULT_USER_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_ARTICLE_ID_COL
)

from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history,
)
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
from ebrec.utils._articles import convert_text2encoding_with_transformers
from ebrec.utils._polars import concat_str_columns, slice_join_dataframes
from ebrec.utils._articles import create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._python import write_submission_file, rank_predictions_by_score

from ebrec.models.newsrec.dataloader import NRMSDataLoader
from ebrec.models.newsrec.model_config import hparams_nrms
# from ebrec.models.newsrec import NRMSModel, NRMSWrapper
from ebrec.models.newsrec import NRMSModel

2024-12-02 10:14:47.233036: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-02 10:14:47.237184: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-02 10:14:47.285457: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-02 10:14:47.285491: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-02 10:14:47.285523: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [3]:
# Make only the first physical GPU visible to TensorFlow (when there is one).
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    try:
        tf.config.set_visible_devices(gpus[0], "GPU")
        logical_gpus = tf.config.list_logical_devices("GPU")
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as err:
        # Visible devices must be set before the GPUs have been initialized.
        print(err)

physical_devices = tf.config.list_physical_devices("GPU")
if VariableTrueIfPrint:
    print("Available devices:", physical_devices)

2024-12-02 10:14:50.275834: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2211] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


## Load dataset

In [4]:
def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
    """
    Load one EB-NeRD split and attach each user's click history to its behaviors.

    Args:
        path: Folder containing ``history.parquet`` and ``behaviors.parquet``.
        history_size: Forwarded to ``truncate_history`` as ``history_size``
            (padding value 0, warnings disabled).

    Returns:
        The behaviors table left-joined with the truncated history on
        ``DEFAULT_USER_COL`` (as produced by ``slice_join_dataframes``).
    """
    # Only the user id, the history article ids and the history timestamps are read.
    history_lazy = pl.scan_parquet(path.joinpath("history.parquet")).select(
        DEFAULT_USER_COL,
        DEFAULT_HISTORY_ARTICLE_ID_COL,
        DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,
    )
    history_lazy = truncate_history(
        history_lazy,
        column=DEFAULT_HISTORY_ARTICLE_ID_COL,
        history_size=history_size,
        padding_value=0,
        enable_warning=False,
    )

    behaviors = pl.scan_parquet(path.joinpath("behaviors.parquet")).collect()
    return slice_join_dataframes(
        behaviors,
        df2=history_lazy.collect(),
        on=DEFAULT_USER_COL,
        how="left",
    )

### Generate labels
We sample a small fraction of the data just to get started. For the test set we simply make up a dummy label column of 0s and 1s - these are not the true labels.

In [5]:
import os

# Root folder of the EB-NeRD data. The cluster path below stays the default so existing
# runs are unaffected; set the EBNERD_DATA_DIR environment variable to point elsewhere
# (e.g. "~/ebnerd_data") without editing the notebook.
PATH = Path(
    os.environ.get(
        "EBNERD_DATA_DIR",
        "/dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data",
    )
).expanduser()
# REMEMBER: if you change DATASPLIT, change it in make_embedding_artifacts.ipynb (embeddings) as well.
DATASPLIT = "ebnerd_small"
# DATASPLIT = "ebnerd__testset"
# Folder for dumped artifacts; created on first run.
DUMP_DIR = PATH.joinpath("dump_artifacts")
DUMP_DIR.mkdir(exist_ok=True, parents=True)

In this example we sample the dataset, just to keep it smaller. Also, one can simply add the test set in the same way as the validation set.

In [6]:
# Columns kept from the joined behaviors/history table.
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
]
HISTORY_SIZE = 20
FRACTION = 0.01  # fraction of each split that is kept
SEED = 123  # one seed for negative sampling AND row subsampling, so reruns give the same data

df_train = (
    # REMEMBER: the "train" split is used here
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,
        shuffle=True,
        with_replacement=True,
        seed=SEED,
    )
    .pipe(create_binary_labels_column)
    # Seeded: an unseeded sample picks different rows on every run.
    .sample(fraction=FRACTION, seed=SEED)
)
# =>
df_validation = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION, seed=SEED)
)
if VariableTrueIfPrint: print(df_train.head(2))
if VariableTrueIfPrint: print(df_validation.head(2))

## Load articles

In [7]:
# Article metadata for the selected data split.
df_articles = pl.read_parquet(PATH.joinpath(DATASPLIT, "articles.parquet"))
# A bare expression inside `if` is never displayed by the notebook, so print explicitly.
if VariableTrueIfPrint:
    print(df_articles.head(2))

## hourly difference between published and viewed article

In [8]:
## NEW

from sklearn.preprocessing import StandardScaler

# The row-wise feature computation below is done in pandas
df_train = df_train.to_pandas()

# Lookup table: article_id -> published_time
article_time_dict = dict(
    zip(
        df_articles["article_id"].to_list(),
        df_articles["published_time"].to_list(),
    )
)

# Map a sequence of article ids to their published times
def get_article_times(article_ids):
    """Return the `article_time_dict` entry for every id in `article_ids` (None for unknown ids)."""
    lookup = article_time_dict.get
    return list(map(lookup, article_ids))

# Attach the published times of the in-view articles and of the clicked article(s)
df_train = df_train.assign(
    inview_article_times=df_train["article_ids_inview"].apply(get_article_times),
    clicked_article_time=df_train["article_ids_clicked"].apply(get_article_times),
)

# Hours elapsed between an impression and the publication of one or many articles
def calculate_hour_differences(impression_time, article_times):
    """Return `impression_time - article_times` expressed in hours.

    A list input yields a list of the same length (None entries stay None);
    any other input is treated as a single timestamp, with None mapped to None.
    """
    def _hours_since(published):
        return (impression_time - published).total_seconds() / 3600

    # List input: one difference per article (in-view articles)
    if isinstance(article_times, list):
        return [None if published is None else _hours_since(published)
                for published in article_times]

    # Single value (clicked article)
    if article_times is None:
        return None
    return _hours_since(article_times)

# Hours between the impression and the publication of every in-view article
df_train['inview_hour_differences'] = df_train.apply(
    lambda row: calculate_hour_differences(row['impression_time'], row['inview_article_times']),
    axis=1
)

# # Use for clicked article
# df_train['clicked_hour_difference'] = df_train.apply(
#    lambda row: calculate_hour_differences(row['impression_time'], row['clicked_article_time']),
#    axis=1
# )


# Drop the raw time columns; only the derived hour differences are kept
df_train = df_train.drop(['inview_article_times', 'clicked_article_time','impression_time'], axis=1)

# Back to polars for the rest of the pipeline
df_train = pl.from_pandas(df_train)

# A bare expression inside `if` is never displayed by the notebook, so print explicitly.
if VariableTrueIfPrint:
    print(df_train.head(2))

In [9]:
# The row-wise feature computation below is done in pandas
df_validation = df_validation.to_pandas()

# Lookup table: article_id -> published_time (rebuilt here, same as for the training frame)
article_time_dict = dict(
    zip(
        df_articles["article_id"].to_list(),
        df_articles["published_time"].to_list(),
    )
)

# Map a sequence of article ids to their published times (redefinition of the training-cell helper)
def get_article_times(article_ids):
    """Return the `article_time_dict` entry for every id in `article_ids` (None for unknown ids)."""
    lookup = article_time_dict.get
    return list(map(lookup, article_ids))

# Attach the published times of the in-view articles and of the clicked article(s)
df_validation = df_validation.assign(
    inview_article_times=df_validation["article_ids_inview"].apply(get_article_times),
    clicked_article_time=df_validation["article_ids_clicked"].apply(get_article_times),
)

# Hours elapsed between an impression and the publication of one or many articles
# (redefinition of the training-cell helper)
def calculate_hour_differences(impression_time, article_times):
    """Return `impression_time - article_times` expressed in hours.

    A list input yields a list of the same length (None entries stay None);
    any other input is treated as a single timestamp, with None mapped to None.
    """
    def _hours_since(published):
        return (impression_time - published).total_seconds() / 3600

    # List input: one difference per article (in-view articles)
    if isinstance(article_times, list):
        return [None if published is None else _hours_since(published)
                for published in article_times]

    # Single value (clicked article)
    if article_times is None:
        return None
    return _hours_since(article_times)

# Hours between the impression and the publication of every in-view article
df_validation['inview_hour_differences'] = df_validation.apply(
    lambda row: calculate_hour_differences(row['impression_time'], row['inview_article_times']),
    axis=1
)

# # Use for clicked article -- might be leaky??
# df_validation['clicked_hour_difference'] = df_validation.apply(
#    lambda row: calculate_hour_differences(row['impression_time'], row['clicked_article_time']),
#    axis=1
# )

# Drop the raw time columns; only the derived hour differences are kept
df_validation = df_validation.drop(['inview_article_times', 'clicked_article_time','impression_time'], axis=1)


# Back to polars for the rest of the pipeline
df_validation = pl.from_pandas(df_validation)

# A bare expression inside `if` is never displayed by the notebook, so print explicitly.
if VariableTrueIfPrint:
    print(df_validation.head(2))

## Init model using HuggingFace's tokenizer and wordembedding
In the original implementation, they use the GloVe embeddings and tokenizer. To get going fast, we'll use a multilingual LLM from Hugging Face. 
We use the tokenizer to tokenize the articles and the word embeddings to initialize NRMS.


In [10]:
# Hugging Face checkpoint used for both the tokenizer and the initial word embeddings.
# TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-large"
# Article text columns that are concatenated and tokenized.
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
# Passed as max_length to the text-encoding helper below.
MAX_TITLE_LENGTH = 30



# LOAD HUGGINGFACE:
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# We'll init the word embeddings from the pretrained transformer model
# (whatever get_transformers_word_embeddings extracts from it).
word2vec_embedding = get_transformers_word_embeddings(transformer_model)
#


# Concatenate the text columns; the helper returns the frame and the name of the new column.
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
# Tokenize the concatenated text; the helper returns the frame and the name of the token column.
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)

# => lookup used by the dataloaders, built from df_articles with the token column as value
article_mapping = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)




# Initiate the dataloaders
In the implementations we have disconnected the models and data. Hence, you should build a dataloader that fits your needs.

In [11]:
# Settings shared by the training and validation loaders
_loader_common = {
    "article_dict": article_mapping,
    "unknown_representation": "zeros",
    "history_column": DEFAULT_HISTORY_ARTICLE_ID_COL,
}

# Training loader: eval_mode off, batches of 128
train_dataloader = NRMSDataLoader(
    behaviors=df_train,
    eval_mode=False,
    batch_size=128,
    **_loader_common,
)
# Validation loader: eval_mode on, batches of 64
val_dataloader = NRMSDataLoader(
    behaviors=df_validation,
    eval_mode=True,
    batch_size=64,
    **_loader_common,
)

## Train the model


In [12]:
# List the physical GPU devices TensorFlow can see (an empty list means CPU only)
physical_devices = tf.config.list_physical_devices('GPU')
print("Available devices:", physical_devices)
# Disabled PyTorch equivalent of the check above:
# import torch.nn as nn
# print(torch.cuda.is_available())

Available devices: []


In [13]:
# import os
# MODEL_NAME = "NRMS"
# LOG_DIR = f"downloads/runs/{MODEL_NAME}"
# WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
# MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.weights.h5"

# os.makedirs(LOG_DIR, exist_ok=True)
# os.makedirs(WEIGHTS_DIR, exist_ok=True)

# # CALLBACKS
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1)
# early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
# modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
#     filepath=MODEL_WEIGHTS, save_best_only=True, save_weights_only=True, verbose=1
# )

# # hparams_nrms.history_size = HISTORY_SIZE
# # model = NRMSModel(
# #     hparams=hparams_nrms,
# #     word2vec_embedding=word2vec_embedding,
# #     # precomputed_embeddings=article_embeddings,
# #     seed=42,
# # )
# # print("starting fitting")

# # hist = model.model.fit(
# #     train_dataloader,
# #     validation_data=val_dataloader,
# #     epochs=1,
# #     callbacks=[early_stopping, modelcheckpoint]#tensorboard_callback
# # )
# # print("load weigths")
# # _ = model.model.load_weights(filepath=MODEL_WEIGHTS)


In [14]:
# # from ebrec.models.newsrec.nrmspy import NRMSModel, NRMSWrapper 
# # import torch
# os.makedirs(LOG_DIR, exist_ok=True)
# os.makedirs(WEIGHTS_DIR, exist_ok=True)

# # Create a custom ModelCheckpoint for PyTorch
# class PyTorchModelCheckpoint:
#     def __init__(self, filepath, save_best_only=True, save_weights_only=True, verbose=1):
#         self.filepath = filepath
#         self.save_best_only = save_best_only
#         self.save_weights_only = save_weights_only
#         self.verbose = verbose
#         self.best_val_loss = float('inf')
    
#     def on_epoch_end(self, epoch, logs=None):
#         val_loss = logs.get('val_loss', None)
#         if val_loss is None:
#             return
        
#         if self.save_best_only:
#             if val_loss < self.best_val_loss:
#                 if self.verbose:
#                     print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
#                 self.best_val_loss = val_loss
#                 torch.save(model.model.state_dict(), self.filepath)
#         else:
#             torch.save(model.model.state_dict(), self.filepath)

# # CALLBACKS
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1)
# early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
# modelcheckpoint = PyTorchModelCheckpoint(
#     filepath=MODEL_WEIGHTS,
#     save_best_only=True,
#     save_weights_only=True,
#     verbose=1
# )

# # # Update hyperparameters if needed
# # hparams_nrms.head_num = 20  # number of attention heads
# # hparams_nrms.head_dim = 20  # dimension of each head (total dimension will be head_num * head_dim)
# # hparams_nrms.attention_hidden_dim = hparams_nrms.head_num * hparams_nrms.head_dim
# # hparams_nrms.dropout = 0.2
# # hparams_nrms.title_size = 30  # should match your MAX_TITLE_LENGTH

# # # Initialize model
# # hparams_nrms.history_size = HISTORY_SIZE
# # pytorch_model = NRMSModel(
# #     hparams=hparams_nrms,
# #     word2vec_embedding=word2vec_embedding,
# #     seed=42,
# # )
# # model = NRMSWrapper(pytorch_model)

# # # Training
# # hist = model.fit(
# #     train_dataloader,
# #     validation_data=val_dataloader,
# #     epochs=5,
# #     callbacks=[modelcheckpoint]
# # )

# # # Load weights using the wrapper
# # model.load_weights(filepath=MODEL_WEIGHTS)

# # # Get predictions
# # pred_validation = model.predict(val_dataloader)

In [15]:
# import os
# import torch
# from ebrec.models.newsrec.nrmspy import NRMSModel, NRMSWrapper  # Add this import

# MODEL_NAME = "NRMS"
# LOG_DIR = f"downloads/runs/{MODEL_NAME}"
# WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
# MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"

# os.makedirs(LOG_DIR, exist_ok=True)
# os.makedirs(WEIGHTS_DIR, exist_ok=True)

# # Create base model
# base_model = NRMSModel(
#     hparams=hparams_nrms,
#     word2vec_embedding=word2vec_embedding,
#     seed=42,
# )

# # Wrap model with NRMSWrapper
# model = NRMSWrapper(
#     model=base_model,
#     device='cuda' if torch.cuda.is_available() else 'cpu'
# )

# print("starting fitting")

# # Use wrapper's fit method
# hist = model.fit(
#     train_dataloader,
#     validation_data=val_dataloader,
#     epochs=1,
#     callbacks=[early_stopping, modelcheckpoint]
# )

# print("load weights")
# model.load_weights(filepath=MODEL_WEIGHTS)

In [16]:
# import os
# from torch.utils.tensorboard import SummaryWriter
# from ebrec.models.newsrec.nrmspy import NRMSModel, NRMSWrapper
# import torch

# # Setup directories
# MODEL_NAME = "NRMS"
# LOG_DIR = f"downloads/runs/{MODEL_NAME}"
# WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
# MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"

# os.makedirs(LOG_DIR, exist_ok=True)
# os.makedirs(WEIGHTS_DIR, exist_ok=True)

# # Create tensorboard writer
# writer = SummaryWriter(LOG_DIR)

# # Setup ModelCheckpoint
# class PyTorchModelCheckpoint:
#     def __init__(self, filepath, save_best_only=True, save_weights_only=True, verbose=1):
#         self.filepath = filepath
#         self.save_best_only = save_best_only
#         self.save_weights_only = save_weights_only
#         self.verbose = verbose
#         self.best_val_loss = float('inf')
    
#     def on_epoch_end(self, epoch, logs=None):
#         val_loss = logs.get('val_loss', None)
#         if val_loss is None:
#             return
        
#         if self.save_best_only:
#             if val_loss < self.best_val_loss:
#                 if self.verbose:
#                     print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
#                 self.best_val_loss = val_loss
#                 torch.save(model.state_dict(), self.filepath)
#         else:
#             torch.save(model.state_dict(), self.filepath)

# # Initialize callback
# modelcheckpoint = PyTorchModelCheckpoint(
#     filepath=MODEL_WEIGHTS,
#     save_best_only=True,
#     save_weights_only=True,
#     verbose=1
# )

# # Create model and wrapper
# model = NRMSModel(
#     hparams=hparams_nrms,
#     word2vec_embedding=word2vec_embedding,
#     seed=42
# )
# wrapper = NRMSWrapper(model)

# # Train
# wrapper.fit(
#     train_dataloader,
#     validation_data=val_dataloader,
#     epochs=5,
#     callbacks=[modelcheckpoint],
#     writer=writer  # Pass writer to fit method
# )

# # Close writer
# writer.close()

# # Load best weights and predict
# wrapper.load_weights(MODEL_WEIGHTS)
# predictions = wrapper.predict(val_dataloader)

In [None]:
# # Create model with Keras-like interface
# from ebrec.models.newsrec.nrmspy import NRMSModel, NRMSWrapper, KerasLikeModel

# import os
# from torch.utils.tensorboard import SummaryWriter
# from ebrec.models.newsrec.nrmspy import NRMSModel, NRMSWrapper
# import torch

# # Setup directories
# MODEL_NAME = "NRMS"
# LOG_DIR = f"downloads/runs/{MODEL_NAME}"
# WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
# MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"

# os.makedirs(LOG_DIR, exist_ok=True)
# os.makedirs(WEIGHTS_DIR, exist_ok=True)

# # Create tensorboard writer
# writer = SummaryWriter(LOG_DIR)

# # Setup ModelCheckpoint
# class PyTorchModelCheckpoint:
#     def __init__(self, filepath, save_best_only=True, save_weights_only=True, verbose=1):
#         self.filepath = filepath
#         self.save_best_only = save_best_only
#         self.save_weights_only = save_weights_only
#         self.verbose = verbose
#         self.best_val_loss = float('inf')
    
#     def on_epoch_end(self, epoch, logs=None):
#         val_loss = logs.get('val_loss', None)
#         if val_loss is None:
#             return
        
#         if self.save_best_only:
#             if val_loss < self.best_val_loss:
#                 if self.verbose:
#                     print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
#                 self.best_val_loss = val_loss
#                 torch.save(model.state_dict(), self.filepath)
#         else:
#             torch.save(model.state_dict(), self.filepath)

# # Initialize callback
# modelcheckpoint = PyTorchModelCheckpoint(
#     filepath=MODEL_WEIGHTS,
#     save_best_only=True,
#     save_weights_only=True,
#     verbose=1
# )

# pytorch_model = NRMSModel(hparams=hparams_nrms, word2vec_embedding=word2vec_embedding, seed=42)
# keras_like_model = KerasLikeModel(pytorch_model)

# # Compile model
# keras_like_model.compile(optimizer='adam', loss='categorical_crossentropy')

# # Train model
# keras_like_model.fit(
#     train_dataloader,
#     validation_data=val_dataloader,
#     epochs=5,
#     callbacks=[modelcheckpoint]
# )

In [12]:
import os
from tqdm.notebook import tqdm


# Output locations for logs and model weights (relative to the working directory)
MODEL_NAME = "NRMS"
LOG_DIR = f"downloads/runs/{MODEL_NAME}"
WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"

# Create both folders up front; existing folders are left as they are
for _directory in (LOG_DIR, WEIGHTS_DIR):
    os.makedirs(_directory, exist_ok=True)

# Create a custom ModelCheckpoint for PyTorch
class PyTorchModelCheckpoint:
    """Keras-style checkpoint callback that saves weights through a model wrapper.

    Args:
        filepath: Destination handed to ``model_wrapper.save_weights``.
        model_wrapper: Object exposing ``save_weights(filepath)``; it must be
            set before ``on_epoch_end`` triggers a save.
        save_best_only: If True, save only when ``val_loss`` improves on the
            best value seen so far; otherwise save on every epoch that
            reports a ``val_loss``.
        save_weights_only: Stored for Keras signature compatibility; this
            class does not consult it.
        verbose: If truthy, print a message whenever the best loss improves.
    """

    def __init__(self, filepath, model_wrapper=None, save_best_only=True, save_weights_only=True, verbose=1):
        self.filepath = filepath
        self.model_wrapper = model_wrapper  # Store the model wrapper reference
        self.save_best_only = save_best_only
        self.save_weights_only = save_weights_only
        self.verbose = verbose
        self.best_val_loss = float('inf')

    def on_epoch_end(self, epoch, logs=None):
        """Save the weights according to the ``val_loss`` entry of ``logs``.

        ``logs`` may be None (the declared default) or lack ``val_loss``; in
        both cases nothing is saved.
        """
        # `logs` defaults to None, so guard before calling .get on it.
        val_loss = (logs or {}).get('val_loss', None)
        if val_loss is None:
            return

        if self.save_best_only:
            if not val_loss < self.best_val_loss:
                return
            if self.verbose:
                print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
            self.best_val_loss = val_loss

        # Use the model_wrapper reference
        self.model_wrapper.save_weights(self.filepath)

# Initialize model first
# NOTE(review): `NRMSWrapper` is not imported in the visible import cell (the line that
# imported it is commented out) - on a fresh kernel this cell presumably raises NameError.
# Confirm the intended import before relying on Restart & Run All.
hparams_nrms.history_size = HISTORY_SIZE
pytorch_model = NRMSModel(
    hparams=hparams_nrms,
    word2vec_embedding=word2vec_embedding,
    seed=42,
)
model = NRMSWrapper(pytorch_model)

# Then create the callback with the model reference
modelcheckpoint = PyTorchModelCheckpoint(
    filepath=MODEL_WEIGHTS,
    model_wrapper=model,  # Pass the model wrapper
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

# Training (a single epoch; the checkpoint callback writes MODEL_WEIGHTS when val_loss improves)
hist = model.fit(
    train_dataloader,
    validation_data=val_dataloader,
    epochs=1,
    callbacks=[modelcheckpoint]
)

# Load weights using the wrapper
# NOTE(review): this reads MODEL_WEIGHTS unconditionally; if the callback never saved in this
# run, the file may be missing or left over from an earlier run - verify.
model.load_weights(filepath=MODEL_WEIGHTS)

# Get predictions for the validation loader
pred_validation = model.predict(val_dataloader)

NRMSWrapper init
True


Epoch 1/1 [Train]: 100%|██████████| 19/19 [00:03<00:00,  4.86it/s, loss=0.5436]
Epoch 1/1 [Valid]: 100%|██████████| 39/39 [00:07<00:00,  5.07it/s, loss=0.7819]


Epoch 1 - Train Loss: 0.5436, Val Loss: 0.3809

Validation loss improved from inf to 0.38093


  self.model.load_state_dict(torch.load(filepath))
Predicting: 100%|██████████| 39/39 [00:07<00:00,  4.98it/s]


# Trying to code stuff

In [35]:
from transformers import BertModel, BertTokenizer

# Hugging Face checkpoint id of the BERT model to use
BERT_MODEL_NAME = 'Maltehb/danish-bert-botxo'

# Load the pretrained BERT model and its matching tokenizer from that checkpoint
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME)
bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/378 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/253k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/498k [00:00<?, ?B/s]

In [None]:
def tokenize_texts(texts, tokenizer, max_length):
    """Tokenize `texts` with `tokenizer`, padded/truncated to `max_length`.

    Returns whatever the tokenizer call returns; PyTorch tensors are requested
    via ``return_tensors='pt'``.
    """
    options = {
        "max_length": max_length,
        "padding": 'max_length',
        "truncation": True,
        "return_tensors": 'pt',
    }
    return tokenizer(texts, **options)

# Tokenize texts and hand back only the ids and the attention mask
def tokenize_and_encode(texts, tokenizer, max_length):
    """Tokenize `texts` and return ``(input_ids, attention_mask)``.

    The tokenizer is called with max-length padding, truncation and
    ``return_tensors='pt'``; the two entries are read from its result by key.
    """
    encoded = tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    return tuple(encoded[key] for key in ('input_ids', 'attention_mask'))

# Example usage in your data loading or preprocessing script
# NOTE(review): the two `[...]` literals below are placeholders (a list holding only Ellipsis);
# replace them with real texts before running this cell.

# For history articles: tokenize each user's history separately, then stack across users
history_texts = [...]  # List of lists of history article texts
history_input_ids = []
history_attention_masks = []
for user_history in history_texts:
    ids, masks = tokenize_and_encode(user_history, bert_tokenizer, MAX_TITLE_LENGTH)
    history_input_ids.append(ids)
    history_attention_masks.append(masks)

# torch.stack needs equally shaped tensors - presumably every user has the same number of
# history articles; verify.
history_input_ids = torch.stack(history_input_ids)
history_attention_masks = torch.stack(history_attention_masks)

# For candidate articles: one tokenizer call over the whole list
candidate_texts = [...]  # List of candidate article texts
candidate_input_ids, candidate_attention_masks = tokenize_and_encode(candidate_texts, bert_tokenizer, MAX_TITLE_LENGTH)

In [None]:
# Number of training epochs. This name was used below without being defined anywhere in the
# notebook; 1 matches the `epochs=1` of the earlier training cell.
NUM_EPOCHS = 1

# Initialize model
# NOTE(review): assumes this NRMSModel accepts a `bert_model` keyword - confirm against the
# model class actually in scope.
hparams_nrms.history_size = HISTORY_SIZE
pytorch_model = NRMSModel(
    hparams=hparams_nrms,
    bert_model=bert_model,
    seed=42,
)
model = NRMSWrapper(pytorch_model)

# Then create the callback as before
modelcheckpoint = PyTorchModelCheckpoint(
    filepath=MODEL_WEIGHTS,
    model_wrapper=model,
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

# Training
hist = model.fit(
    train_dataloader,
    validation_data=val_dataloader,
    epochs=NUM_EPOCHS,
    callbacks=[modelcheckpoint]
)

# Load weights
model.load_weights(filepath=MODEL_WEIGHTS)

# Get predictions
pred_validation = model.predict(val_dataloader)

In [38]:
class NRMSWrapper:
    """Training / inference wrapper around a PyTorch model with BERT-style inputs.

    The wrapped model is called as
    ``model(history_input_ids, history_attention_mask, candidate_input_ids,
    candidate_attention_mask, training=...)``. Every batch produced by a
    dataloader must be a pair ``(inputs, labels)`` where ``inputs`` unpacks
    into exactly those four tensors.

    Args:
        model: The model to wrap; it is moved to ``device``.
        device: Device string; defaults to CUDA when available, else CPU
            (evaluated once, when the class is defined).
    """

    def __init__(self, model, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.model = model
        self.device = device
        self.model.to(device)
        self.optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)  # Smaller LR for BERT
        self.criterion = nn.BCELoss()

    def _forward(self, inputs, training):
        """Move the four input tensors to ``self.device`` and run the model on them."""
        history_input_ids, history_attention_mask, candidate_input_ids, candidate_attention_mask = inputs
        return self.model(
            history_input_ids.to(self.device),
            history_attention_mask.to(self.device),
            candidate_input_ids.to(self.device),
            candidate_attention_mask.to(self.device),
            training=training,
        )

    def _evaluate(self, dataloader):
        """Return the mean per-batch loss over ``dataloader`` without updating weights."""
        self.model.eval()
        total, batches = 0.0, 0
        with torch.no_grad():
            for inputs, labels in dataloader:
                labels = labels.float().to(self.device)
                outputs = self._forward(inputs, training=False)
                total += self.criterion(outputs, labels).item()
                batches += 1
        return total / batches if batches else float('nan')

    def fit(self, train_dataloader, validation_data=None, epochs=1, callbacks=None):
        """Train for ``epochs`` epochs.

        After each epoch the mean training loss is recorded; when
        ``validation_data`` is given its mean loss is recorded as ``val_loss``.
        Every callback that has an ``on_epoch_end(epoch, logs)`` method is then
        called with those values.

        Returns:
            dict mapping ``"loss"`` (and ``"val_loss"`` when validating) to the
            list of per-epoch values.
        """
        history = {}
        for epoch in range(epochs):
            self.model.train()
            total, batches = 0.0, 0
            for inputs, labels in train_dataloader:
                labels = labels.float().to(self.device)

                self.optimizer.zero_grad()
                outputs = self._forward(inputs, training=True)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()

                total += loss.item()
                batches += 1

            logs = {'loss': total / batches if batches else float('nan')}
            if validation_data is not None:
                logs['val_loss'] = self._evaluate(validation_data)

            for key, value in logs.items():
                history.setdefault(key, []).append(value)

            # Checkpointing and similar hooks run once per epoch, after validation.
            for callback in callbacks or []:
                on_epoch_end = getattr(callback, 'on_epoch_end', None)
                if callable(on_epoch_end):
                    on_epoch_end(epoch, logs)
        return history

    def predict(self, dataloader):
        """Return the model outputs for every batch, concatenated into one numpy array.

        An empty dataloader yields an empty array instead of raising.
        """
        self.model.eval()
        predictions = []
        with torch.no_grad():
            for inputs, _ in dataloader:
                outputs = self._forward(inputs, training=False)
                predictions.append(outputs.cpu().numpy())
        if not predictions:
            return np.array([])
        return np.concatenate(predictions)

    def save_weights(self, filepath):
        """Write the model's ``state_dict`` to ``filepath``."""
        torch.save(self.model.state_dict(), filepath)

    def load_weights(self, filepath):
        """Load a ``state_dict`` written by ``save_weights`` into the model."""
        # weights_only=True restricts unpickling to tensor data; map_location keeps the
        # load working when the file was written on another device.
        state_dict = torch.load(filepath, map_location=self.device, weights_only=True)
        self.model.load_state_dict(state_dict)

In [None]:
# NOTE(review): `prepare_inputs`, `history_titles` and `candidate_titles` are not defined
# anywhere in the visible notebook; presumably `prepare_inputs` is meant to return
# (input_ids, attention_mask) like `tokenize_and_encode` - confirm before running.
# For user history
his_input_ids, his_attention_mask = prepare_inputs(history_titles, bert_tokenizer, MAX_TITLE_LENGTH)

# For candidate news
pred_input_ids, pred_attention_mask = prepare_inputs(candidate_titles, bert_tokenizer, MAX_TITLE_LENGTH)

In [None]:
# Training loop
# NOTE(review): `num_epochs`, `device`, `criterion` and `optimizer` are not defined in the
# visible notebook, and `model` must be a torch module here (it is called directly and
# `.train()` is invoked on it) - confirm these are set up before running.
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        # Unpack the batch: four input tensors followed by the labels
        his_input_ids, his_attention_mask, pred_input_ids, pred_attention_mask, labels = batch

        # Move tensors to device
        his_input_ids = his_input_ids.to(device)
        his_attention_mask = his_attention_mask.to(device)
        pred_input_ids = pred_input_ids.to(device)
        pred_attention_mask = pred_attention_mask.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(
            his_input_ids,
            his_attention_mask,
            pred_input_ids,
            pred_attention_mask,
            training=True
        )

        # Compute loss and backpropagate (gradients are cleared just before the backward pass)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Saving / loading the model because the HPC is annoying

In [19]:
# MODEL_FILE = f"downloads/models/{MODEL_NAME}.h5" 

# # Save the model after training
# print("Saving the model...")
# os.makedirs(os.path.dirname(MODEL_FILE), exist_ok=True)
# model.model.save(MODEL_FILE)  # Save the full model (architecture + weights)
# print(f"Model saved at {MODEL_FILE}")

##LOAD SAVED MODEL
# from tensorflow.keras.models import load_model

# # Load the saved model
# print(f"Loading the model from {MODEL_FILE}...")
# model.model = load_model(MODEL_FILE)
# print("Model loaded successfully.")


# Example how to compute some metrics:

In [20]:
# pred_validation = model.scorer.predict(val_dataloader)

## Add the predictions to the dataframe

In [21]:
# Attach the predicted scores, then flag users that also occur in the training frame
_validation_scored = add_prediction_scores(df_validation, pred_validation.tolist())
df_validation = add_known_user_column(
    _validation_scored, known_users=df_train[DEFAULT_USER_COL]
)
df_validation.head(2)

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,labels,inview_hour_differences,scores,is_known_user
u32,list[i32],list[i32],list[i32],u32,list[i8],list[f64],list[f64],bool
22548,"[9773295, 9769504, … 9776929]","[9783865, 9784710, … 9784696]",[9784696],96791,"[0, 0, … 1]","[6.831111, 6.814444, … 8.163333]","[0.056111, 0.245147, … 0.254644]",True
22548,"[9773295, 9769504, … 9776929]","[9784281, 9784583, … 9782726]",[9784281],96798,"[1, 0, … 0]","[12.075, 9.953889, … 0.53]","[0.272449, 0.102848, … 0.137341]",True


### Compute metrics

In [22]:
# Metrics to report: AUC, MRR and nDCG at 5 and 10
_metric_functions = [AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)]

# Evaluate the per-impression score lists against the per-impression label lists
metrics = MetricEvaluator(
    labels=df_validation["labels"].to_list(),
    predictions=df_validation["scores"].to_list(),
    metric_functions=_metric_functions,
)
metrics.evaluate()

<MetricEvaluator class>: 
 {
    "auc": 0.5429764670849283,
    "mrr": 0.3383194556757905,
    "ndcg@5": 0.37634755125872577,
    "ndcg@10": 0.4549911716551946
}

## Make submission file

In [23]:
def _to_rank_list(scores):
    """Apply rank_predictions_by_score to one impression's scores and return a plain list."""
    return list(rank_predictions_by_score(scores))


# New column "ranked_scores": the score list of each impression converted to ranks
df_validation = df_validation.with_columns(
    pl.col("scores").map_elements(_to_rank_list).alias("ranked_scores")
)
df_validation.head(2)

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,labels,inview_hour_differences,scores,is_known_user,ranked_scores
u32,list[i32],list[i32],list[i32],u32,list[i8],list[f64],list[f64],bool,list[i64]
22548,"[9773295, 9769504, … 9776929]","[9783865, 9784710, … 9784696]",[9784696],96791,"[0, 0, … 1]","[6.831111, 6.814444, … 8.163333]","[0.056111, 0.245147, … 0.254644]",True,"[5, 2, … 1]"
22548,"[9773295, 9769504, … 9776929]","[9784281, 9784583, … 9782726]",[9784281],96798,"[1, 0, … 0]","[12.075, 9.953889, … 0.53]","[0.272449, 0.102848, … 0.137341]",True,"[4, 18, … 15]"


This uses the validation set; simply add the test set to your flow in the same way.

In [24]:
# Write one line of ranked predictions per impression to the submission file
_submission_path = "downloads/predictions.txt"
write_submission_file(
    impression_ids=df_validation[DEFAULT_IMPRESSION_ID_COL],
    prediction_scores=df_validation["ranked_scores"],
    path=_submission_path,
)

0it [00:00, ?it/s]

244647it [00:10, 22954.92it/s]


Zipping downloads/predictions.txt to downloads/predictions.zip


# DONE 🚀