# Getting started

In this notebook, we illustrate how to use the Neural News Recommendation with Multi-Head Self-Attention ([NRMS](https://aclanthology.org/D19-1671/)). The implementation is taken from the [recommenders](https://github.com/recommenders-team/recommenders) repository. We have simply stripped the model to keep it cleaner.

We use a small dataset, which is downloaded from [recsys.eb.dk](https://recsys.eb.dk/). The location of the datasets is set by the `PATH` variable in the *Generate labels* section below (the original example stores them under ```~/ebnerd_data/*```).

## Load functionality

In [1]:
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import tensorflow as tf
import polars as pl
from tensorflow.python.client import device_lib
import numpy as np

from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_SUBTITLE_COL,
    DEFAULT_LABELS_COL,
    DEFAULT_TITLE_COL,
    DEFAULT_USER_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_ARTICLE_ID_COL
)

from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history,
)
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
from ebrec.utils._articles import convert_text2encoding_with_transformers
from ebrec.utils._polars import concat_str_columns, slice_join_dataframes
from ebrec.utils._articles import create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._python import write_submission_file, rank_predictions_by_score

from ebrec.models.newsrec.dataloader import NRMSDataLoader
from ebrec.models.newsrec.model_config import hparams_nrms
from ebrec.models.newsrec import NRMSModel, NRMSWrapper

2024-11-23 16:32:54.372272: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-23 16:32:54.419352: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-23 16:32:54.419387: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-23 16:32:54.419438: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-23 16:32:54.429620: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: A

In [2]:
# Ask TensorFlow which GPUs it can see on this machine.
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        # Expose only the first GPU to TensorFlow, then report what is visible.
        tf.config.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as err:
        # Visible devices must be set before GPUs have been initialized.
        print(err)

# Report the physical GPUs once more, whether or not any were found above.
physical_devices = tf.config.list_physical_devices('GPU')
print("Available devices:", physical_devices)

Available devices: []


2024-11-23 16:32:57.693753: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2211] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


## Load dataset

In [3]:
def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
    """
    Read one ebnerd data split from ``path`` and attach user histories to the behaviors.

    Args:
        path: Directory containing ``history.parquet`` and ``behaviors.parquet``.
        history_size: Forwarded to ``truncate_history`` as ``history_size``
            (called with ``padding_value=0`` on the history article-id column).

    Returns:
        The behaviors frame combined with the selected history columns via
        ``slice_join_dataframes`` (``how="left"``, keyed on ``DEFAULT_USER_COL``).
    """
    history_columns = (
        DEFAULT_USER_COL,
        DEFAULT_HISTORY_ARTICLE_ID_COL,
        DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,
    )
    # The history side stays lazy until it is needed for the join.
    lazy_history = pl.scan_parquet(path / "history.parquet").select(*history_columns)
    lazy_history = truncate_history(
        lazy_history,
        column=DEFAULT_HISTORY_ARTICLE_ID_COL,
        history_size=history_size,
        padding_value=0,
        enable_warning=False,
    )

    behaviors = pl.scan_parquet(path / "behaviors.parquet").collect()
    return slice_join_dataframes(
        behaviors,
        df2=lazy_history.collect(),
        on=DEFAULT_USER_COL,
        how="left",
    )

### Generate labels
We sample a few impressions just to get started. For the test set, we just make up a dummy column with 0 and 1 – these are not the true labels.

In [4]:
import os  # local import: only needed for the optional data-root override below

# Root folder of the dataset. It can be overridden without editing the notebook
# by setting the EBNERD_DATA_PATH environment variable; when the variable is not
# set, the original cluster location is used, so existing runs are unaffected.
_DEFAULT_DATA_ROOT = "/dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data"
PATH = Path(os.environ.get("EBNERD_DATA_PATH", _DEFAULT_DATA_ROOT)).expanduser()

# Sub-folder of PATH that holds the train/validation splits and articles.parquet.
DATASPLIT = "ebnerd_small"

# Created eagerly, including any missing parent folders.
DUMP_DIR = PATH.joinpath("dump_artifacts")
DUMP_DIR.mkdir(exist_ok=True, parents=True)

In this example we sample the dataset, just to keep it smaller. Also, one can simply add the test set similarly to the validation set.

In [5]:
# Columns kept from the joined behaviors/history frame.
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
]
HISTORY_SIZE = 20  # passed to ebnerd_from_path as history_size
FRACTION = 1  # fraction of rows kept by .sample(); lower it (e.g. 0.01) for quick experiments
SAMPLE_SEED = 123  # one seed for every random step in this cell, so reruns give the same frames

df_train = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,
        shuffle=True,
        with_replacement=True,
        seed=SAMPLE_SEED,
    )
    .pipe(create_binary_labels_column)
    # Without a seed polars draws a fresh random seed on every call, which makes
    # the selected rows (and their order) differ between runs.
    .sample(fraction=FRACTION, seed=SAMPLE_SEED)
)
# Validation keeps every in-view article (no negative sampling step).
df_validation = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION, seed=SAMPLE_SEED)
)
print(df_train.head(2))
print(df_validation.head(2))

shape: (2, 7)
┌─────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬─────────────┐
│ user_id ┆ article_id_f ┆ article_ids_ ┆ article_ids_ ┆ impression_i ┆ impression_t ┆ labels      │
│ ---     ┆ ixed         ┆ inview       ┆ clicked      ┆ d            ┆ ime          ┆ ---         │
│ u32     ┆ ---          ┆ ---          ┆ ---          ┆ ---          ┆ ---          ┆ list[i8]    │
│         ┆ list[i32]    ┆ list[i64]    ┆ list[i64]    ┆ u32          ┆ datetime[μs] ┆             │
╞═════════╪══════════════╪══════════════╪══════════════╪══════════════╪══════════════╪═════════════╡
│ 139836  ┆ [0, 9745590, ┆ [9778728,    ┆ [9778657]    ┆ 149474       ┆ 2023-05-24   ┆ [0, 1, … 0] │
│         ┆ … 9765156]   ┆ 9778657, …   ┆              ┆              ┆ 07:47:53     ┆             │
│         ┆              ┆ 9778669]     ┆              ┆              ┆              ┆             │
│ 143471  ┆ [9767637,    ┆ [9778682,    ┆ [9778623]    ┆ 150528       ┆ 2023-

## Load articles

In [6]:
# Article metadata for the selected data split.
articles_path = PATH / DATASPLIT / "articles.parquet"
df_articles = pl.read_parquet(articles_path)
df_articles.head(2)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3001353,"""Natascha var i…","""Politiet frygt…",2023-06-29 06:20:33,False,"""Sagen om den ø…",2006-08-31 08:06:45,[3150850],"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative"""
3003065,"""Kun Star Wars …","""Biografgængern…",2023-06-29 06:20:35,False,"""Vatikanet har …",2006-05-21 16:57:00,[3006712],"""article_defaul…","""https://ekstra…",[],[],"[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive"""


## hourly difference between published and viewed article

In [None]:
## NEW

from sklearn.preprocessing import StandardScaler

# The row-wise feature engineering below is done in pandas, so convert from polars.
df_train = df_train.to_pandas()

# Lookup table: article_id -> published_time.
_article_columns = df_articles.select("article_id", "published_time").to_dict(as_series=False)
article_time_dict = dict(
    zip(_article_columns["article_id"], _article_columns["published_time"])
)

# Create a function to map article IDs to their timestamps
def get_article_times(article_ids):
    """Look up each id in ``article_time_dict``; ids missing from the dict map to ``None``."""
    return list(map(article_time_dict.get, article_ids))

# Attach the published_time lookups for the in-view and the clicked article ids
# (one list of timestamps per row in each new column).
df_train = df_train.assign(
    inview_article_times=df_train["article_ids_inview"].apply(get_article_times),
    clicked_article_time=df_train["article_ids_clicked"].apply(get_article_times),
)

# Create a function to calculate hour differences
def calculate_hour_differences(impression_time, article_times):
    """Hours elapsed between article timestamp(s) and ``impression_time``.

    Args:
        impression_time: Timestamp of the impression; must support
            ``impression_time - article_time`` yielding an object with
            ``total_seconds()`` (e.g. ``datetime`` / pandas ``Timestamp``).
        article_times: Either a single timestamp (or ``None``), or a sequence
            of them. Lists, tuples and numpy arrays are all treated as
            sequences, so list columns that arrive as arrays are handled too
            instead of crashing in the scalar branch.

    Returns:
        A float (or ``None``) for a single timestamp; otherwise a list with one
        float per entry, with ``None`` kept wherever the article time is ``None``.
    """
    def _hours_since(article_time):
        # Unknown article time -> unknown difference.
        if article_time is None:
            return None
        return (impression_time - article_time).total_seconds() / 3600

    if isinstance(article_times, (list, tuple, np.ndarray)):
        return [_hours_since(article_time) for article_time in article_times]
    return _hours_since(article_times)

# Hours between the impression and the publication of each in-view article,
# computed row by row.
df_train = df_train.assign(
    inview_hour_differences=df_train.apply(
        lambda row: calculate_hour_differences(row['impression_time'], row['inview_article_times']),
        axis=1,
    )
)

# # Use for clicked article
# df_train['clicked_hour_difference'] = df_train.apply(
#    lambda row: calculate_hour_differences(row['impression_time'], row['clicked_article_time']), 
#    axis=1
# )


# Drop the raw timestamp columns and go back to polars.
# NOTE: 'impression_time' is removed here, so this cell cannot be re-run
# without first rebuilding df_train (the lambda above would raise a KeyError).
df_train = pl.from_pandas(
    df_train.drop(columns=['inview_article_times', 'clicked_article_time', 'impression_time'])
)

df_train.head(2)

In [None]:
# The row-wise feature engineering below is done in pandas, so convert from polars.
df_validation = df_validation.to_pandas()

# Lookup table: article_id -> published_time.
_article_columns = df_articles.select("article_id", "published_time").to_dict(as_series=False)
article_time_dict = dict(
    zip(_article_columns["article_id"], _article_columns["published_time"])
)

# Create a function to map article IDs to their timestamps
def get_article_times(article_ids):
    """Look up each id in ``article_time_dict``; ids missing from the dict map to ``None``."""
    return list(map(article_time_dict.get, article_ids))

# Attach the published_time lookups for the in-view and the clicked article ids
# (one list of timestamps per row in each new column).
df_validation = df_validation.assign(
    inview_article_times=df_validation["article_ids_inview"].apply(get_article_times),
    clicked_article_time=df_validation["article_ids_clicked"].apply(get_article_times),
)

# Create a function to calculate hour differences
def calculate_hour_differences(impression_time, article_times):
    """Hours elapsed between article timestamp(s) and ``impression_time``.

    Args:
        impression_time: Timestamp of the impression; must support
            ``impression_time - article_time`` yielding an object with
            ``total_seconds()`` (e.g. ``datetime`` / pandas ``Timestamp``).
        article_times: Either a single timestamp (or ``None``), or a sequence
            of them. Lists, tuples and numpy arrays are all treated as
            sequences, so list columns that arrive as arrays are handled too
            instead of crashing in the scalar branch.

    Returns:
        A float (or ``None``) for a single timestamp; otherwise a list with one
        float per entry, with ``None`` kept wherever the article time is ``None``.
    """
    def _hours_since(article_time):
        # Unknown article time -> unknown difference.
        if article_time is None:
            return None
        return (impression_time - article_time).total_seconds() / 3600

    if isinstance(article_times, (list, tuple, np.ndarray)):
        return [_hours_since(article_time) for article_time in article_times]
    return _hours_since(article_times)

# Hours between the impression and the publication of each in-view article,
# computed row by row.
df_validation = df_validation.assign(
    inview_hour_differences=df_validation.apply(
        lambda row: calculate_hour_differences(row['impression_time'], row['inview_article_times']),
        axis=1,
    )
)

# # Use for clicked article -- might be leaky??
# df_validation['clicked_hour_difference'] = df_validation.apply(
#    lambda row: calculate_hour_differences(row['impression_time'], row['clicked_article_time']), 
#    axis=1
# )

# Drop the raw timestamp columns and go back to polars.
# NOTE: 'impression_time' is removed here, so this cell cannot be re-run
# without first rebuilding df_validation (the lambda above would raise a KeyError).
df_validation = pl.from_pandas(
    df_validation.drop(columns=['inview_article_times', 'clicked_article_time', 'impression_time'])
)

df_validation.head(2)

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,labels,inview_hour_differences
u32,list[i32],list[i32],list[i32],u32,list[i8],list[f64]
2559103,"[9778168, 9778102, … 9779071]","[9779653, 9781354, … 9781535]",[9780372],269179170,"[0, 0, … 0]","[2.236389, 1.738056, … 1.913889]"
1866066,"[9777000, 9777704, … 9780271]","[9780874, 9770451, … 9784591]",[9784607],329969502,"[0, 0, … 0]","[56.234444, 224.853889, … 1.015]"


## Init model using HuggingFace's tokenizer and wordembedding
In the original implementation, they use the GloVe embeddings and tokenizer. To get going fast, we'll use a multilingual LLM from Hugging Face. 
We use its tokenizer to tokenize the articles and its word embeddings to initialize NRMS.


In [None]:
# TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-large"
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
MAX_TITLE_LENGTH = 30

# Pretrained Hugging Face model and its matching tokenizer.
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# The word embeddings taken from the transformer are later passed to NRMSModel.
word2vec_embedding = get_transformers_word_embeddings(transformer_model)

# Concatenate the selected text columns, then encode the combined column with
# the tokenizer (max_length=MAX_TITLE_LENGTH). Both helpers return the updated
# frame together with the name of the column they produced.
df_articles, cat_cal = df_articles.pipe(concat_str_columns, columns=TEXT_COLUMNS_TO_USE)
df_articles, token_col_title = df_articles.pipe(
    convert_text2encoding_with_transformers,
    transformer_tokenizer,
    cat_cal,
    max_length=MAX_TITLE_LENGTH,
)

# article_id -> encoded text, as consumed by the dataloaders.
article_mapping = create_article_id_to_value_mapping(
    df=df_articles,
    value_col=token_col_title,
)




In [None]:
# Inspect the embedding matrix that is later passed to NRMSModel.
word2vec_embedding

array([[-0.0562439 , -0.02876282, -0.07940674, ...,  0.02944946,
         0.0267334 , -0.02970886],
       [-0.0042305 ,  0.0026474 ,  0.0078125 , ..., -0.00432587,
         0.00658798, -0.00568771],
       [-0.0147171 ,  0.01287842, -0.10516357, ...,  0.11804199,
         0.13781738,  0.203125  ],
       ...,
       [ 0.17944336,  0.1887207 ,  0.24584961, ..., -0.01290894,
         0.18457031, -0.15307617],
       [ 0.05737305, -0.03430176, -0.07635498, ...,  0.0725708 ,
         0.06689453,  0.06781006],
       [ 0.02632141,  0.01670837, -0.04385376, ...,  0.01147461,
         0.08642578,  0.06854248]], dtype=float32)

# Initiate the dataloaders
In the implementations we have disconnected the models and data. Hence, you should build a dataloader that fits your needs.

In [None]:
# Settings shared by the training and validation loaders.
_shared_loader_kwargs = dict(
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
)

# The two loaders differ only in their data, eval_mode flag and batch size.
train_dataloader = NRMSDataLoader(
    behaviors=df_train,
    eval_mode=False,
    batch_size=128,
    **_shared_loader_kwargs,
)
val_dataloader = NRMSDataLoader(
    behaviors=df_validation,
    eval_mode=True,
    batch_size=64,
    **_shared_loader_kwargs,
)

## Train the model


In [None]:
# List all physical devices TensorFlow can see, right before training.
# NOTE(review): the captured training output mentions torch.load /
# load_state_dict, so the wrapper presumably runs on PyTorch; TensorFlow's
# device list may not reflect the device used for training - TODO confirm.
physical_devices = tf.config.list_physical_devices('GPU')
print("Available devices:", physical_devices)
# import torch.nn as nn
# print(torch.cuda.is_available())

Available devices: []


In [None]:
# import os
# MODEL_NAME = "NRMS"
# LOG_DIR = f"downloads/runs/{MODEL_NAME}"
# WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
# MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.weights.h5"

# os.makedirs(LOG_DIR, exist_ok=True)
# os.makedirs(WEIGHTS_DIR, exist_ok=True)

# # CALLBACKS
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1)
# early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
# modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
#     filepath=MODEL_WEIGHTS, save_best_only=True, save_weights_only=True, verbose=1
# )

# # hparams_nrms.history_size = HISTORY_SIZE
# # model = NRMSModel(
# #     hparams=hparams_nrms,
# #     word2vec_embedding=word2vec_embedding,
# #     # precomputed_embeddings=article_embeddings,
# #     seed=42,
# # )
# # print("starting fitting")

# # hist = model.model.fit(
# #     train_dataloader,
# #     validation_data=val_dataloader,
# #     epochs=1,
# #     callbacks=[early_stopping, modelcheckpoint]#tensorboard_callback
# # )
# # print("load weigths")
# # _ = model.model.load_weights(filepath=MODEL_WEIGHTS)


In [None]:
# # from ebrec.models.newsrec.nrmspy import NRMSModel, NRMSWrapper 
# # import torch
# os.makedirs(LOG_DIR, exist_ok=True)
# os.makedirs(WEIGHTS_DIR, exist_ok=True)

# # Create a custom ModelCheckpoint for PyTorch
# class PyTorchModelCheckpoint:
#     def __init__(self, filepath, save_best_only=True, save_weights_only=True, verbose=1):
#         self.filepath = filepath
#         self.save_best_only = save_best_only
#         self.save_weights_only = save_weights_only
#         self.verbose = verbose
#         self.best_val_loss = float('inf')
    
#     def on_epoch_end(self, epoch, logs=None):
#         val_loss = logs.get('val_loss', None)
#         if val_loss is None:
#             return
        
#         if self.save_best_only:
#             if val_loss < self.best_val_loss:
#                 if self.verbose:
#                     print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
#                 self.best_val_loss = val_loss
#                 torch.save(model.model.state_dict(), self.filepath)
#         else:
#             torch.save(model.model.state_dict(), self.filepath)

# # CALLBACKS
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1)
# early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
# modelcheckpoint = PyTorchModelCheckpoint(
#     filepath=MODEL_WEIGHTS,
#     save_best_only=True,
#     save_weights_only=True,
#     verbose=1
# )

# # # Update hyperparameters if needed
# # hparams_nrms.head_num = 20  # number of attention heads
# # hparams_nrms.head_dim = 20  # dimension of each head (total dimension will be head_num * head_dim)
# # hparams_nrms.attention_hidden_dim = hparams_nrms.head_num * hparams_nrms.head_dim
# # hparams_nrms.dropout = 0.2
# # hparams_nrms.title_size = 30  # should match your MAX_TITLE_LENGTH

# # # Initialize model
# # hparams_nrms.history_size = HISTORY_SIZE
# # pytorch_model = NRMSModel(
# #     hparams=hparams_nrms,
# #     word2vec_embedding=word2vec_embedding,
# #     seed=42,
# # )
# # model = NRMSWrapper(pytorch_model)

# # # Training
# # hist = model.fit(
# #     train_dataloader,
# #     validation_data=val_dataloader,
# #     epochs=5,
# #     callbacks=[modelcheckpoint]
# # )

# # # Load weights using the wrapper
# # model.load_weights(filepath=MODEL_WEIGHTS)

# # # Get predictions
# # pred_validation = model.predict(val_dataloader)

In [None]:
# import os
# import torch
# from ebrec.models.newsrec.nrmspy import NRMSModel, NRMSWrapper  # Add this import

# MODEL_NAME = "NRMS"
# LOG_DIR = f"downloads/runs/{MODEL_NAME}"
# WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
# MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"

# os.makedirs(LOG_DIR, exist_ok=True)
# os.makedirs(WEIGHTS_DIR, exist_ok=True)

# # Create base model
# base_model = NRMSModel(
#     hparams=hparams_nrms,
#     word2vec_embedding=word2vec_embedding,
#     seed=42,
# )

# # Wrap model with NRMSWrapper
# model = NRMSWrapper(
#     model=base_model,
#     device='cuda' if torch.cuda.is_available() else 'cpu'
# )

# print("starting fitting")

# # Use wrapper's fit method
# hist = model.fit(
#     train_dataloader,
#     validation_data=val_dataloader,
#     epochs=1,
#     callbacks=[early_stopping, modelcheckpoint]
# )

# print("load weights")
# model.load_weights(filepath=MODEL_WEIGHTS)

In [None]:
# import os
# from torch.utils.tensorboard import SummaryWriter
# from ebrec.models.newsrec.nrmspy import NRMSModel, NRMSWrapper
# import torch

# # Setup directories
# MODEL_NAME = "NRMS"
# LOG_DIR = f"downloads/runs/{MODEL_NAME}"
# WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
# MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"

# os.makedirs(LOG_DIR, exist_ok=True)
# os.makedirs(WEIGHTS_DIR, exist_ok=True)

# # Create tensorboard writer
# writer = SummaryWriter(LOG_DIR)

# # Setup ModelCheckpoint
# class PyTorchModelCheckpoint:
#     def __init__(self, filepath, save_best_only=True, save_weights_only=True, verbose=1):
#         self.filepath = filepath
#         self.save_best_only = save_best_only
#         self.save_weights_only = save_weights_only
#         self.verbose = verbose
#         self.best_val_loss = float('inf')
    
#     def on_epoch_end(self, epoch, logs=None):
#         val_loss = logs.get('val_loss', None)
#         if val_loss is None:
#             return
        
#         if self.save_best_only:
#             if val_loss < self.best_val_loss:
#                 if self.verbose:
#                     print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
#                 self.best_val_loss = val_loss
#                 torch.save(model.state_dict(), self.filepath)
#         else:
#             torch.save(model.state_dict(), self.filepath)

# # Initialize callback
# modelcheckpoint = PyTorchModelCheckpoint(
#     filepath=MODEL_WEIGHTS,
#     save_best_only=True,
#     save_weights_only=True,
#     verbose=1
# )

# # Create model and wrapper
# model = NRMSModel(
#     hparams=hparams_nrms,
#     word2vec_embedding=word2vec_embedding,
#     seed=42
# )
# wrapper = NRMSWrapper(model)

# # Train
# wrapper.fit(
#     train_dataloader,
#     validation_data=val_dataloader,
#     epochs=5,
#     callbacks=[modelcheckpoint],
#     writer=writer  # Pass writer to fit method
# )

# # Close writer
# writer.close()

# # Load best weights and predict
# wrapper.load_weights(MODEL_WEIGHTS)
# predictions = wrapper.predict(val_dataloader)

In [None]:
# # Create model with Keras-like interface
# from ebrec.models.newsrec.nrmspy import NRMSModel, NRMSWrapper, KerasLikeModel

# import os
# from torch.utils.tensorboard import SummaryWriter
# from ebrec.models.newsrec.nrmspy import NRMSModel, NRMSWrapper
# import torch

# # Setup directories
# MODEL_NAME = "NRMS"
# LOG_DIR = f"downloads/runs/{MODEL_NAME}"
# WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
# MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"

# os.makedirs(LOG_DIR, exist_ok=True)
# os.makedirs(WEIGHTS_DIR, exist_ok=True)

# # Create tensorboard writer
# writer = SummaryWriter(LOG_DIR)

# # Setup ModelCheckpoint
# class PyTorchModelCheckpoint:
#     def __init__(self, filepath, save_best_only=True, save_weights_only=True, verbose=1):
#         self.filepath = filepath
#         self.save_best_only = save_best_only
#         self.save_weights_only = save_weights_only
#         self.verbose = verbose
#         self.best_val_loss = float('inf')
    
#     def on_epoch_end(self, epoch, logs=None):
#         val_loss = logs.get('val_loss', None)
#         if val_loss is None:
#             return
        
#         if self.save_best_only:
#             if val_loss < self.best_val_loss:
#                 if self.verbose:
#                     print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
#                 self.best_val_loss = val_loss
#                 torch.save(model.state_dict(), self.filepath)
#         else:
#             torch.save(model.state_dict(), self.filepath)

# # Initialize callback
# modelcheckpoint = PyTorchModelCheckpoint(
#     filepath=MODEL_WEIGHTS,
#     save_best_only=True,
#     save_weights_only=True,
#     verbose=1
# )

# pytorch_model = NRMSModel(hparams=hparams_nrms, word2vec_embedding=word2vec_embedding, seed=42)
# keras_like_model = KerasLikeModel(pytorch_model)

# # Compile model
# keras_like_model.compile(optimizer='adam', loss='categorical_crossentropy')

# # Train model
# keras_like_model.fit(
#     train_dataloader,
#     validation_data=val_dataloader,
#     epochs=5,
#     callbacks=[modelcheckpoint]
# )

In [None]:
import os
from tqdm.notebook import tqdm


# Output locations for this run (relative to the current working directory).
MODEL_NAME = "NRMS"
LOG_DIR = f"downloads/runs/{MODEL_NAME}"
WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"

# Make sure both output folders exist before training starts.
for _output_dir in (LOG_DIR, WEIGHTS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# Create a custom ModelCheckpoint for PyTorch
class PyTorchModelCheckpoint:
    """Checkpoint callback that saves weights through a model wrapper.

    Args:
        filepath: Destination handed to ``model_wrapper.save_weights``.
        model_wrapper: Object exposing ``save_weights(filepath)``. It may be
            left as ``None`` at construction time, but must be set before the
            first save is attempted.
        save_best_only: If true, save only when ``val_loss`` improves on the
            best value seen so far; otherwise save on every epoch that reports
            a ``val_loss``.
        save_weights_only: Stored on the instance; not consulted by this class.
        verbose: If truthy, print a message whenever the validation loss improves.
    """

    def __init__(self, filepath, model_wrapper=None, save_best_only=True, save_weights_only=True, verbose=1):
        self.filepath = filepath
        self.model_wrapper = model_wrapper  # Store the model wrapper reference
        self.save_best_only = save_best_only
        self.save_weights_only = save_weights_only
        self.verbose = verbose
        self.best_val_loss = float('inf')

    def _save(self):
        """Write the weights, failing with a clear message if no wrapper was given."""
        if self.model_wrapper is None:
            raise ValueError(
                "PyTorchModelCheckpoint needs a model_wrapper with a "
                "save_weights(filepath) method before it can save."
            )
        self.model_wrapper.save_weights(self.filepath)

    def on_epoch_end(self, epoch, logs=None):
        """Save a checkpoint for this epoch if the configured policy asks for it.

        Does nothing when ``logs`` is ``None`` or carries no ``'val_loss'``.
        """
        # `logs` defaults to None, so guard before calling .get on it.
        val_loss = (logs or {}).get('val_loss')
        if val_loss is None:
            return

        if not self.save_best_only:
            self._save()
            return

        if val_loss < self.best_val_loss:
            if self.verbose:
                print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
            # Save first: if saving fails, the best value must not advance
            # past a checkpoint that was never written.
            self._save()
            self.best_val_loss = val_loss

# Initialize model first.
# The model's history length is set to the HISTORY_SIZE used when the user
# histories were truncated. Note that this mutates the shared hparams_nrms object.
# NOTE(review): the title length is not synced the same way; presumably
# hparams_nrms.title_size should equal MAX_TITLE_LENGTH - confirm.
hparams_nrms.history_size = HISTORY_SIZE
pytorch_model = NRMSModel(
    hparams=hparams_nrms,
    word2vec_embedding=word2vec_embedding,
    seed=42,
)
model = NRMSWrapper(pytorch_model)

# Then create the callback with the model reference, so the checkpoint can call
# model.save_weights(MODEL_WEIGHTS) when the validation loss improves.
modelcheckpoint = PyTorchModelCheckpoint(
    filepath=MODEL_WEIGHTS,
    model_wrapper=model,  # Pass the model wrapper
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

# Training (a single epoch, validated on val_dataloader).
hist = model.fit(
    train_dataloader,
    validation_data=val_dataloader,
    epochs=1,
    callbacks=[modelcheckpoint]
)

# Load weights using the wrapper: restores the checkpoint written by the
# callback above, so this relies on at least one checkpoint having been saved.
model.load_weights(filepath=MODEL_WEIGHTS)

# Get predictions for the validation impressions.
pred_validation = model.predict(val_dataloader)

NRMSWrapper init
True


Epoch 1/1 [Train]: 100%|██████████| 184/184 [00:24<00:00,  7.39it/s, loss=0.4870]
Epoch 1/1 [Valid]: 100%|██████████| 383/383 [00:58<00:00,  6.51it/s, loss=0.6787]


Epoch 1 - Train Loss: 0.4870, Val Loss: 0.3261

Validation loss improved from inf to 0.32606


  self.model.load_state_dict(torch.load(filepath))
Predicting: 100%|██████████| 383/383 [00:58<00:00,  6.52it/s]


# Saving / loading model because hpc annoying

In [None]:
# MODEL_FILE = f"downloads/models/{MODEL_NAME}.h5" 

# # Save the model after training
# print("Saving the model...")
# os.makedirs(os.path.dirname(MODEL_FILE), exist_ok=True)
# model.model.save(MODEL_FILE)  # Save the full model (architecture + weights)
# print(f"Model saved at {MODEL_FILE}")

##LOAD SAVED MODEL
# from tensorflow.keras.models import load_model

# # Load the saved model
# print(f"Loading the model from {MODEL_FILE}...")
# model.model = load_model(MODEL_FILE)
# print("Model loaded successfully.")


# Example how to compute some metrics:

In [None]:
# pred_validation = model.scorer.predict(val_dataloader)

## Add the predictions to the dataframe

In [None]:
# Attach the model scores to the validation frame, then flag the users that
# also occur in the training data.
_validation_scored = add_prediction_scores(df_validation, pred_validation.tolist())
df_validation = add_known_user_column(
    _validation_scored,
    known_users=df_train[DEFAULT_USER_COL],
)
df_validation.head(2)

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,labels,inview_hour_differences,scores,is_known_user
u32,list[i32],list[i32],list[i32],u32,list[i8],list[f64],list[f64],bool
2559103,"[9778168, 9778102, … 9779071]","[9779653, 9781354, … 9781535]",[9780372],269179170,"[0, 0, … 0]","[2.236389, 1.738056, … 1.913889]","[0.227807, 0.172969, … 0.218408]",True
1866066,"[9777000, 9777704, … 9780271]","[9780874, 9770451, … 9784591]",[9784607],329969502,"[0, 0, … 0]","[56.234444, 224.853889, … 1.015]","[0.230873, 0.334224, … 0.227539]",True


### Compute metrics

In [None]:
# Ranking metrics reported for the validation predictions.
ranking_metrics = [AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)]

metrics = MetricEvaluator(
    labels=df_validation["labels"].to_list(),
    predictions=df_validation["scores"].to_list(),
    metric_functions=ranking_metrics,
)
metrics.evaluate()

<MetricEvaluator class>: 
 {
    "auc": 0.5419201977973713,
    "mrr": 0.33620641224139075,
    "ndcg@5": 0.37563557466943853,
    "ndcg@10": 0.45369826648145395
}

## Make submission file

In [None]:
def _scores_to_ranks(scores):
    """Apply ``rank_predictions_by_score`` to one impression's scores and return a plain list."""
    return list(rank_predictions_by_score(scores))


# One list of ranks per impression, stored next to the raw scores.
df_validation = df_validation.with_columns(
    pl.col("scores").map_elements(_scores_to_ranks).alias("ranked_scores")
)
df_validation.head(2)

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,labels,inview_hour_differences,scores,is_known_user,ranked_scores
u32,list[i32],list[i32],list[i32],u32,list[i8],list[f64],list[f64],bool,list[i64]
2559103,"[9778168, 9778102, … 9779071]","[9779653, 9781354, … 9781535]",[9780372],269179170,"[0, 0, … 0]","[2.236389, 1.738056, … 1.913889]","[0.227807, 0.172969, … 0.218408]",True,"[3, 8, … 4]"
1866066,"[9777000, 9777704, … 9780271]","[9780874, 9770451, … 9784591]",[9784607],329969502,"[0, 0, … 0]","[56.234444, 224.853889, … 1.015]","[0.230873, 0.334224, … 0.227539]",True,"[4, 1, … 5]"


This uses the validation set; simply add the test set to your flow in the same way.

In [None]:
# Write the per-impression rankings to the submission file. The validation
# impressions are used here in place of the test set.
write_submission_file(
    impression_ids=df_validation[DEFAULT_IMPRESSION_ID_COL],
    prediction_scores=df_validation["ranked_scores"],
    path="downloads/predictions.txt",
)

0it [00:00, ?it/s]

24464it [00:00, 28394.70it/s]


Zipping downloads/predictions.txt to downloads/predictions.zip


# DONE 🚀