# Getting started

In this notebook, we illustrate how to use the Neural News Recommendation with Multi-Head Self-Attention ([NRMS](https://aclanthology.org/D19-1671/)). The implementation is taken from the [recommenders](https://github.com/recommenders-team/recommenders) repository. We have simply stripped the model to keep it cleaner.

We use a small dataset, which is downloaded from [recsys.eb.dk](https://recsys.eb.dk/). All the datasets are stored in the folder path ```~/ebnerd_data/*```.

## Load functionality

In [1]:
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import tensorflow as tf
import polars as pl
from tensorflow.python.client import device_lib
import numpy as np

from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_SUBTITLE_COL,
    DEFAULT_LABELS_COL,
    DEFAULT_TITLE_COL,
    DEFAULT_USER_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_ARTICLE_ID_COL,
    DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL, #-------
    DEFAULT_HISTORY_READ_TIME_COL #-------
)

from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history,
)
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
from ebrec.utils._articles import convert_text2encoding_with_transformers
from ebrec.utils._polars import concat_str_columns, slice_join_dataframes
from ebrec.utils._articles import create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._python import write_submission_file, rank_predictions_by_score

from ebrec.models.newsrec.dataloader import NRMSDataLoader
from ebrec.models.newsrec.model_config import hparams_nrms
from ebrec.models.newsrec import NRMSModel, NRMSWrapper

2024-12-15 09:15:43.342618: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-15 09:15:43.346762: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-15 09:15:43.396377: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-15 09:15:43.396441: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-15 09:15:43.396477: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [None]:
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
# Setup and load everything
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

print("Loading data")

# List all physical devices
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only use the first GPU
  try:
    tf.config.set_visible_devices(gpus[0], 'GPU')
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
  except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized
    print(e)

physical_devices = tf.config.list_physical_devices('GPU')
print("Available devices:", physical_devices)


#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
## Load dataset
# #-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
    """
    Load ebnerd - function
    """
    df_history = (
        pl.scan_parquet(path.joinpath("history.parquet"))
        # .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL,DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL)
        .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL,DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL,DEFAULT_HISTORY_READ_TIME_COL) #------------
        .pipe(
            truncate_history,
            column=DEFAULT_HISTORY_ARTICLE_ID_COL,
            history_size=history_size,
            padding_value=0,
            enable_warning=False,
        )
    )
    df_behaviors = (
        pl.scan_parquet(path.joinpath("behaviors.parquet"))
        .collect()
        .pipe(
            slice_join_dataframes,
            df2=df_history.collect(),
            on=DEFAULT_USER_COL,
            how="left",
        )
    )
    
    return df_behaviors
  
  
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
  ### Generate labels
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
# We sample a few just to get started. For testset we just make up a dummy column with 0 and 1 - this is not the true labels.

PATH = Path("/dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data").expanduser()
DATASPLIT = "ebnerd_small" # TODO if change to change make_embedding_artifacts.ipynb file (embeddings)

# DATASPLIT = "ebnerd__testset"
DUMP_DIR = PATH.joinpath("dump_artifacts")
DUMP_DIR.mkdir(exist_ok=True, parents=True)

#In this example we sample the dataset, just to keep it smaller. Also, one can simply add the testset similary to the validation.

### Define the Data Cols -- New ones here
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL, #--------neu 
    DEFAULT_HISTORY_READ_TIME_COL, #------- neu
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
]

#____————____————____————____————
# Define the history size and fraction
# ____————____————____————____————
HISTORY_SIZE = 30 #30
FRACTION = 0.2 #Fraction af datasæt

#____————____————____————____————

print("____————____————____————____———")
print("HISTORY_SIZE:", HISTORY_SIZE)
print("FRACTION:", FRACTION)
print("____————____————____————____———")
print("")

#____————____————____————____————

df_train = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=6,
        shuffle=True,
        with_replacement=True,
        seed=123,
    )
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)
# =>
df_validation = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)
print(df_train.head(2))
print(df_validation.head(2))


#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
## Load articles
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
df_articles = pl.read_parquet(PATH.joinpath(DATASPLIT, "articles.parquet"))
df_articles.head(2)

Loading data
Available devices: []
____————____————____————____———
HISTORY_SIZE: 30
FRACTION: 0.2
____————____————____————____———



2024-12-15 09:15:46.741277: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2211] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


shape: (2, 9)
┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ user_id ┆ article_id ┆ scroll_pe ┆ read_time ┆ … ┆ article_i ┆ impressio ┆ impressio ┆ labels    │
│ ---     ┆ _fixed     ┆ rcentage_ ┆ _fixed    ┆   ┆ ds_clicke ┆ n_id      ┆ n_time    ┆ ---       │
│ u32     ┆ ---        ┆ fixed     ┆ ---       ┆   ┆ d         ┆ ---       ┆ ---       ┆ list[i8]  │
│         ┆ list[i32]  ┆ ---       ┆ list[f32] ┆   ┆ ---       ┆ u32       ┆ datetime[ ┆           │
│         ┆            ┆ list[f32] ┆           ┆   ┆ list[i64] ┆           ┆ μs]       ┆           │
╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1610498 ┆ [9764086,  ┆ [100.0,   ┆ [27.0,    ┆ … ┆ [9772813] ┆ 225635116 ┆ 2023-05-1 ┆ [0, 1, …  │
│         ┆ 9764008, … ┆ 52.0, …   ┆ 5.0, …    ┆   ┆           ┆           ┆ 9         ┆ 0]        │
│         ┆ 9766873]   ┆ 32.0]     ┆ 33.0]     ┆   ┆           ┆           ┆ 

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3001353,"""Natascha var i…","""Politiet frygt…",2023-06-29 06:20:33,False,"""Sagen om den ø…",2006-08-31 08:06:45,[3150850],"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative"""
3003065,"""Kun Star Wars …","""Biografgængern…",2023-06-29 06:20:35,False,"""Vatikanet har …",2006-05-21 16:57:00,[3006712],"""article_defaul…","""https://ekstra…",[],[],"[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive"""


In [3]:
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

# Loading the article embeddings and other feature stuff

#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

### Added features and hourly difference between published and viewed article

## NEW

from sklearn.preprocessing import StandardScaler

# Convert polars DataFrame to pandas
df_train = df_train.to_pandas()

# Create a mapping dictionary from article_id to last_modified_time
article_time_dict = df_articles.select(
    "article_id", 
    "published_time"
).to_dict(as_series=False)
article_time_dict = dict(zip(
    article_time_dict["article_id"], 
    article_time_dict["published_time"]
))

# Create a function to map article IDs to their timestamps
def get_article_times(article_ids):
    return [article_time_dict.get(aid, None) for aid in article_ids]

# Add new column with the published-time
df_train["inview_article_times"] = df_train["article_ids_inview"].apply(get_article_times)

#add new column with the last publish_time for the clicked article
df_train["clicked_article_time"] = df_train["article_ids_clicked"].apply(get_article_times)

# Create a function to calculate hour differences
def calculate_hour_differences(impression_time, article_times):
        # If article_times is a single value (for clicked articles)
    if not isinstance(article_times, list):
        if article_times is None:
            return None
        return (impression_time - article_times).total_seconds() / 3600
    
    # If article_times is a list (for inview articles)
    differences = [(impression_time - article_time).total_seconds() / 3600 
                  if article_time is not None else None 
                  for article_time in article_times]
    return differences

# Use for inview articles
df_train['inview_hour_differences'] = df_train.apply(
    lambda row: calculate_hour_differences(row['impression_time'], row['inview_article_times']), 
    axis=1
)

# # Use for clicked article
# df_train['clicked_hour_difference'] = df_train.apply(
#    lambda row: calculate_hour_differences(row['impression_time'], row['clicked_article_time']), 
#    axis=1
# )

# Create a mapping dictionary from article_id to last_modified_category
article_cat_dict = df_articles.select(
    "article_id", 
    "category"
).to_dict(as_series=False)
article_cat_dict = dict(zip(
    article_cat_dict["article_id"], 
    article_cat_dict["category"]
))

# Create a function to map article IDs to their category
def get_article_category(article_ids):
    return [article_cat_dict.get(aid, None) for aid in article_ids]

#  Add new column with the article category
df_train["inview_article_categories"] = df_train["article_ids_inview"].apply(get_article_category)

df_train["history_article_categories"] = df_train["article_id_fixed"].apply(get_article_category)

# Create a mapping dictionary from article_id to article_type
article_type_dict = df_articles.select(
    "article_id", 
    "article_type"
).to_dict(as_series=False)
article_type_dict = dict(zip(
    article_type_dict["article_id"], 
    article_type_dict["article_type"]
))

# Create a function to map article IDs to their type
def get_article_type(article_ids):
    return [article_type_dict.get(aid, None) for aid in article_ids]

# Add new column with the article type
df_train["inview_article_types"] = df_train["article_ids_inview"].apply(get_article_type)

df_train["history_article_types"] = df_train["article_id_fixed"].apply(get_article_type)

#drop columns with the time
df_train = df_train.drop(['inview_article_times', 'clicked_article_time','impression_time'], axis=1)

df_train = pl.from_pandas(df_train)

df_train.head(2)


##########################################################################################


# Convert polars DataFrame to pandas
df_validation = df_validation.to_pandas()

# Create a mapping dictionary from article_id to last_modified_time
article_time_dict = df_articles.select(
    "article_id", 
    "published_time"
).to_dict(as_series=False)
article_time_dict = dict(zip(
    article_time_dict["article_id"], 
    article_time_dict["published_time"]
))

# Create a function to map article IDs to their timestamps
def get_article_times(article_ids):
    return [article_time_dict.get(aid, None) for aid in article_ids]

# Add new column with the published-time
df_validation["inview_article_times"] = df_validation["article_ids_inview"].apply(get_article_times)

#add new column with the last publish_time for the clicked article
df_validation["clicked_article_time"] = df_validation["article_ids_clicked"].apply(get_article_times)

# Create a function to calculate hour differences
def calculate_hour_differences(impression_time, article_times):
        # If article_times is a single value (for clicked articles)
    if not isinstance(article_times, list):
        if article_times is None:
            return None
        return (impression_time - article_times).total_seconds() / 3600
    
    # If article_times is a list (for inview articles)
    differences = [(impression_time - article_time).total_seconds() / 3600 
                  if article_time is not None else None 
                  for article_time in article_times]
    return differences

# Use for inview articles
df_validation['inview_hour_differences'] = df_validation.apply(
    lambda row: calculate_hour_differences(row['impression_time'], row['inview_article_times']), 
    axis=1
)

# # Use for clicked article -- might be leaky??
# df_validation['clicked_hour_difference'] = df_validation.apply(
#    lambda row: calculate_hour_differences(row['impression_time'], row['clicked_article_time']), 
#    axis=1
# )
# Create a mapping dictionary from article_id to last_modified_category
article_cat_dict = df_articles.select(
    "article_id", 
    "category"
).to_dict(as_series=False)
article_cat_dict = dict(zip(
    article_cat_dict["article_id"], 
    article_cat_dict["category"]
))

# Create a function to map article IDs to their category
def get_article_category(article_ids):
    return [article_cat_dict.get(aid, None) for aid in article_ids]

#  Add new column with the article category
df_validation["inview_article_categories"] = df_validation["article_ids_inview"].apply(get_article_category)

df_validation["history_article_categories"] = df_validation["article_id_fixed"].apply(get_article_category)

# Create a mapping dictionary from article_id to article_type
article_type_dict = df_articles.select(
    "article_id", 
    "article_type"
).to_dict(as_series=False)
article_type_dict = dict(zip(
    article_type_dict["article_id"], 
    article_type_dict["article_type"]
))

# Create a function to map article IDs to their type
def get_article_type(article_ids):
    return [article_type_dict.get(aid, None) for aid in article_ids]

# Add new column with the article type
df_validation["inview_article_types"] = df_validation["article_ids_inview"].apply(get_article_type)

df_validation["history_article_types"] = df_validation["article_id_fixed"].apply(get_article_type)


#drop columns with the time
df_validation = df_validation.drop(['inview_article_times', 'clicked_article_time','impression_time'], axis=1)


df_validation = pl.from_pandas(df_validation)

df_validation.head(2)

user_id,article_id_fixed,scroll_percentage_fixed,read_time_fixed,article_ids_inview,article_ids_clicked,impression_id,labels,inview_hour_differences,inview_article_categories,history_article_categories,inview_article_types,history_article_types
u32,list[i32],list[f32],list[f32],list[i32],list[i32],u32,list[i8],list[f64],list[i64],list[i64],list[str],list[str]
2302800,"[9750718, 9750789, … 9778728]","[100.0, 97.0, … 100.0]","[302.0, 19.0, … 51.0]","[9788720, 9788183, … 9788752]",[9788720],280015603,"[1, 0, … 0]","[0.249167, 0.992222, … 0.128611]","[142, 414, … 140]","[118, 414, … 142]","[""article_default"", ""article_default"", … ""article_default""]","[""article_default"", ""article_default"", … ""article_default""]"
2555765,"[9778369, 9778381, … 9772548]","[100.0, 51.0, … 100.0]","[28.0, 3.0, … 1079.0]","[9788766, 9790548, … 9788794]",[9788766],213466077,"[1, 0, … 0]","[2.476111, 3.537222, … 27.857778]","[2975, 140, … 142]","[142, 142, … 142]","[""article_default"", ""article_default"", … ""article_default""]","[""article_default"", ""article_default"", … ""article_default""]"


In [4]:
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
## Init model using HuggingFace's tokenizer and wordembedding

# In the original implementation, they use the GloVe embeddings and tokenizer. To get going fast, we'll use a multilingual LLM from Hugging Face. 
# Utilizing the tokenizer to tokenize the articles and the word-embedding to init NRMS.
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

# TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
# TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-robe rta-large"
# TRANSFORMER_MODEL_NAME = "google-bert/bert-base-multilingual-uncased" 
#Argue for cased vs uncased.  TODO
# #Cased might be better but to compare with malteHb we use uncased

TRANSFORMER_MODEL_NAME = "Maltehb/danish-bert-botxo"
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
MAX_TITLE_LENGTH = 30 #hardcoded somewhere ?? error if change

print("")
print("____————____————____————____———")
print("Using transformer model:", TRANSFORMER_MODEL_NAME)
print("____————____————____————____———")
print("")

# LOAD HUGGINGFACE:
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# We'll init the word embeddings using the
word2vec_embedding = get_transformers_word_embeddings(transformer_model)
#


df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)

# =>
article_mapping = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)

#_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

# print("df_train columns:", df_train.columns)
# print("df_validation columns:", df_validation.columns)
#_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_


____————____————____————____———
Using transformer model: Maltehb/danish-bert-botxo
____————____————____————____———





# Initiate the dataloaders
In the implementations we have disconnected the models and data. Hence, you should built a dataloader that fits your needs.

In [5]:
train_dataloader = NRMSDataLoader(
    behaviors=df_train,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=False,
    batch_size=128,
)
val_dataloader = NRMSDataLoader(
    behaviors=df_validation,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=True,
    batch_size=64,
)


## Train the model


In [6]:
import torch
# List all physical devices
physical_devices = tf.config.list_physical_devices('GPU')
print("Available devices:", physical_devices)
import torch.nn as nn
print(torch.cuda.is_available())

Available devices: []
True


In [13]:
#_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
# Original Model
#_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

# Works fine -- Can change epochs on line 67

# import os
# from tqdm.notebook import tqdm


# MODEL_NAME = "NRMS"
# LOG_DIR = f"downloads/runs/{MODEL_NAME}"
# WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
# MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"

# os.makedirs(LOG_DIR, exist_ok=True)
# os.makedirs(WEIGHTS_DIR, exist_ok=True)

# # Create a custom ModelCheckpoint for PyTorch
# class PyTorchModelCheckpoint:
#     def __init__(self, filepath, model_wrapper=None, save_best_only=True, save_weights_only=True, verbose=1):
#         self.filepath = filepath
#         self.model_wrapper = model_wrapper  # Store the model wrapper reference
#         self.save_best_only = save_best_only
#         self.save_weights_only = save_weights_only
#         self.verbose = verbose
#         self.best_val_loss = float('inf')
    
#     def on_epoch_end(self, epoch, logs=None):
#         val_loss = logs.get('val_loss', None)
#         if val_loss is None:
#             return
        
#         if self.save_best_only:
#             if val_loss < self.best_val_loss:
#                 if self.verbose:
#                     print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
#                 self.best_val_loss = val_loss
#                 # Use the model_wrapper reference
#                 self.model_wrapper.save_weights(self.filepath)
#         else:
#             self.model_wrapper.save_weights(self.filepath)

# # Initialize model first
# hparams_nrms.history_size = HISTORY_SIZE

# pytorch_model = NRMSModel(
#     hparams=hparams_nrms,
#     word2vec_embedding=word2vec_embedding,
#     seed=42,
# )
# model = NRMSWrapper(pytorch_model)

# # Then create the callback with the model reference
# modelcheckpoint = PyTorchModelCheckpoint(
#     filepath=MODEL_WEIGHTS,
#     model_wrapper=model,  # Pass the model wrapper
#     save_best_only=True,
#     save_weights_only=True,
#     verbose=1
# )

# # Training
# hist = model.fit(
#     train_dataloader,
#     validation_data=val_dataloader,
#     epochs=4, ### EPOCHS INPUT
#     callbacks=[modelcheckpoint]
# )

# # Load weights using the wrapper
# model.load_weights(filepath=MODEL_WEIGHTS)

# # Get predictions
# pred_validation = model.predict(val_dataloader)

NRMSWrapper init
True


Epoch 1/4 [Train]:   4%|▍         | 30/733 [00:04<01:51,  6.28it/s, loss=0.4383]

Epoch 1/4 [Train]: 100%|██████████| 733/733 [02:37<00:00,  4.64it/s, loss=0.3889]
Epoch 1/4 [Valid]: 100%|██████████| 1530/1530 [06:09<00:00,  4.14it/s, loss=0.6621]


Epoch 1 - Train Loss: 0.3889, Val Loss: 0.3172

Validation loss improved from inf to 0.31719


Epoch 2/4 [Train]: 100%|██████████| 733/733 [03:11<00:00,  3.83it/s, loss=0.3791]
Epoch 2/4 [Valid]: 100%|██████████| 1530/1530 [08:58<00:00,  2.84it/s, loss=0.6765]


Epoch 2 - Train Loss: 0.3791, Val Loss: 0.3241


Epoch 3/4 [Train]: 100%|██████████| 733/733 [03:21<00:00,  3.64it/s, loss=0.3720]
Epoch 3/4 [Valid]: 100%|██████████| 1530/1530 [10:02<00:00,  2.54it/s, loss=0.6823]


Epoch 3 - Train Loss: 0.3720, Val Loss: 0.3269


Epoch 4/4 [Train]: 100%|██████████| 733/733 [04:02<00:00,  3.03it/s, loss=0.3676]
Epoch 4/4 [Valid]: 100%|██████████| 1530/1530 [10:07<00:00,  2.52it/s, loss=0.7182]
  self.model.load_state_dict(torch.load(filepath))


Epoch 4 - Train Loss: 0.3676, Val Loss: 0.3441


Predicting: 100%|██████████| 1530/1530 [11:18<00:00,  2.26it/s]


In [8]:
def calculate_metrics(outputs, labels):
    # AUC
    auc = roc_auc_score(labels, outputs)

    # MRR
    sorted_indices = np.argsort(outputs)[::-1]
    ranks = np.where(labels[sorted_indices] == 1)[0] + 1  # 1-based ranks
    mrr = 1 / ranks[0] if len(ranks) > 0 else 0

    # NDCG@5
    top_k = 5
    top_k_indices = sorted_indices[:top_k]
    ideal_dcg = sum(1 / np.log2(i + 2) for i in range(min(top_k, np.sum(labels))))
    dcg = sum(
        labels[i] / np.log2(rank + 2)
        for rank, i in enumerate(top_k_indices)
    )
    ndcg = dcg / ideal_dcg if ideal_dcg > 0 else 0

    return auc, mrr, ndcg

In [7]:
import optuna
from sklearn.metrics import roc_auc_score
import numpy as np
import torch
hparams_nrms.history_size = HISTORY_SIZE # JUST leave this here -- it is annoying

# ___———___———___———___———___
# INPUTS
EPOCHSS = 2
NUM_TRIALS = 10 #Change according to how many iterations of hyperparameter tuning you want
            # techmically it is how many branches of tree you want to explore so u dont run all
# ___———___———___———___———___

# Theres a bunch of stuff to do for the objective funciton but i tried keeping it basic -- maybe do multi objective -- but then again how interested are we in MRR or somthin else -- maybe minimize loss but idk if you can maximize one thing and minimize another -- i guess you could just maximize -loss
def objective(trial):
    # Change theese? 
    hparams_nrms.head_num = trial.suggest_int('head_num', 10, 30)
    hparams_nrms.head_dim = trial.suggest_int('head_dim', 10, 30)
    hparams_nrms.attention_hidden_dim = trial.suggest_int('attention_hidden_dim', 100, 300)
    hparams_nrms.dropout = trial.suggest_float('dropout', 0.0, 0.5)
    hparams_nrms.learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    
    # DEFAULT HPARAM CLASS (src/ebrec/models/newsrec/model_config.py):
                    # class hparams_nrms:
                        # # INPUT DIMENTIONS:
                        # title_size: int = DEFAULT_TITLE_SIZE  -- Hardcoded somwhere i think
                        # history_size: int = 50                -- NO need to reload all data for this
                        # # MODEL ARCHITECTURE
                        # head_num: int = 20                   -- included
                        # head_dim: int = 20                   -- included
                        # attention_hidden_dim: int = 200      - included
                        # # MODEL OPTIMIZER:
                        # optimizer: str = "adam"              -- possible?        
                        # loss: str = "cross_entropy_loss"     -- possible?
                        # dropout: float = 0.2                 -- included
                        # learning_rate: float = 0.0001        -- included


    # Initialize the model -- same as b4
    pytorch_model = NRMSModel(
        hparams=hparams_nrms,
        word2vec_embedding=word2vec_embedding,
        seed=42,
    )
    model = NRMSWrapper(pytorch_model)

    # get model params from model.model.parameters() basically
    optimizer = torch.optim.Adam(model.model.parameters(), lr=hparams_nrms.learning_rate)
    # loss_fn = torch.nn.BCELoss()  # Use the appropriate loss function for your problem
    loss_fn = torch.nn.CrossEntropyLoss() 
        #alternatives are:
            # BCEWithLogitsLoss() - combines a sigmoid layer and the BCELoss in one single class
            # CrossEntropyLoss() - combines LogSoftmax and NLLLoss in one single class
            # KLDivLoss() - the Kullback-Leibler divergence loss
            # MSELoss() - the mean squared error loss
            # NLLLoss() - the negative log likelihood loss
            
            #But does it make sense to change theese since our model is based on CEL?
            #cuz like i think it should be the same as the model right....?

    # Training loop
    for epoch in range(EPOCHSS): 
        model.model.train()
        for inputs, labels in train_dataloader:
            optimizer.zero_grad()
            history, candidates = inputs

            history = torch.from_numpy(history).to(model.device)
            candidates = torch.from_numpy(candidates).to(model.device)
            labels = torch.from_numpy(labels).float().to(model.device)

            # Forward pass using the underlying model
            try:
                outputs = model.model(history, candidates, training=True)
            except RuntimeError as e:
                print(f"History shape: {history.shape}")
                print(f"Candidates shape: {candidates.shape}")
                raise e

            loss = loss_fn(outputs, labels)

            # Backward pass and optimization
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.model.parameters(), max_norm=5.0)
            optimizer.step()

    # Validation loop
    model.model.eval()  # Now use evalation mode
    all_outputs, all_labels = [], []
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            history, candidates = inputs
            history = torch.from_numpy(history).to(model.device)
            candidates = torch.from_numpy(candidates).to(model.device)
            labels = torch.from_numpy(labels).float().to(model.device)

            outputs = model.model(history, candidates, training=False)
            all_outputs.append(outputs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())


    # Calculate AUC
    all_outputs = np.concatenate(all_outputs)
    all_labels = np.concatenate(all_labels)
    auc = roc_auc_score(all_labels, all_outputs)

    print(f"AUC: {auc:.4f}")

    # Here we optimize for AUC, but can also do other stuf
    return auc

# Create a single-objective Optuna 'study'
study = optuna.create_study(direction='maximize')  # Maximize AUC -- dont minimize it ;) 
study.optimize(objective, n_trials=NUM_TRIALS, show_progress_bar=True)


print("Best hyperparameters:", study.best_params)
print(f"Best AUC (train): {study.best_value:.4f}")

# Print best trial stuff -- basically just claude
def evaluate_best_trial(study, MAKE_SUBMISSIONS=False):
    # Get best trial hyperparameters
    best_trial = study.best_trial
    hparams_nrms.head_num = best_trial.params['head_num']
    hparams_nrms.head_dim = best_trial.params['head_dim']
    hparams_nrms.attention_hidden_dim = best_trial.params['attention_hidden_dim']
    hparams_nrms.dropout = best_trial.params['dropout']
    hparams_nrms.learning_rate = best_trial.params['learning_rate']

    # TODO couldt think of sometihng better than to just retrain it ;)
    pytorch_model = NRMSModel(
        hparams=hparams_nrms,
        word2vec_embedding=word2vec_embedding,
        seed=42,
    )
    model = NRMSWrapper(pytorch_model)
    model.model.eval()

    # Generate predictions for the validation set
    all_outputs = []
    with torch.no_grad():
        for inputs, _ in val_dataloader:
            history, candidates = inputs
            history = torch.from_numpy(history).to(model.device)
            candidates = torch.from_numpy(candidates).to(model.device)
            outputs = model.model(history, candidates, training=False)
            all_outputs.append(outputs.cpu().numpy())

    # Add predictions to the validation DataFrame
    pred_validation = np.concatenate(all_outputs)
    df_validation_with_preds = add_prediction_scores(
        df_validation, pred_validation.tolist()
    ).pipe(
        add_known_user_column, known_users=df_train[DEFAULT_USER_COL]
    )

    # Metrics
    metrics = MetricEvaluator(
        labels=df_validation_with_preds["labels"].to_list(),
        predictions=df_validation_with_preds["scores"].to_list(),
        metric_functions=[
            AucScore(),
            MrrScore(),
            NdcgScore(k=5),
            NdcgScore(k=10),
        ],
    )
    metric_results = metrics.evaluate()
    
    # MAKE SUBMISSIONS FILE!!!!

    if MAKE_SUBMISSIONS:
        # Rank the predictions
        df_validation_with_preds = df_validation_with_preds.with_columns(pl.col("scores").map_elements(lambda x: list(rank_predictions_by_score(x))).alias("ranked_scores"))

        write_submission_file(
        impression_ids=df_validation_with_preds[DEFAULT_IMPRESSION_ID_COL],
        prediction_scores=df_validation_with_preds["ranked_scores"],
        path="downloads/predictions.txt",
        )  
        print("Submission file created!")
    
    return metric_results

# Evaluate metrics for the best trial
best_trial_metrics = evaluate_best_trial(study)
print("Metrics for Best AUC Trial:", best_trial_metrics)


[I 2024-12-15 09:30:23,440] A new study created in memory with name: no-name-34772e3d-115c-473a-8dc4-cf69a0307f01


  0%|          | 0/10 [00:00<?, ?it/s]

In [8]:
best_trial_metrics = evaluate_best_trial(study, MAKE_SUBMISSIONS=True)


48929it [00:01, 31023.55it/s]


Zipping downloads/predictions.txt to downloads/predictions.zip
Submission file created!


In [9]:
import optuna.visualization as vis

def generate_optuna_plots(study): #thanks claude
    """
    Generate and display multiple Optuna visualization plots.
    
    Args:
        study (optuna.study.Study): The Optuna study object to visualize
    """
    # 1. Param Importances Plot
    param_importances = vis.plot_param_importances(study)
    param_importances.show()
    
    # 2. Optimization History Plot
    optimization_history = vis.plot_optimization_history(study) ## Remember it is test AUC
    optimization_history.show()
    
    # 3. Parallel Coordinate Plot
    parallel_coordinate = vis.plot_parallel_coordinate(study)
    parallel_coordinate.show()
    
    # 4. Slice Plot
    slice_plot = vis.plot_slice(study)
    slice_plot.show()
    
    # 5. Contour Plot
    contour_plot = vis.plot_contour(study)
    contour_plot.show()
    
        # EDF (Empirical Distribution Function) plot
    edf_plot = vis.plot_edf(study)
    edf_plot.show()
    
generate_optuna_plots(study)

ValueError: Cannot evaluate parameter importances with only a single trial.

In [20]:
# Get trial statistics
print("\nStudy Statistics:")
print(f"Number of completed trials: {len(study.trials)}")
print(f"Number of pruned trials: {len(study.get_trials(states=[optuna.trial.TrialState.PRUNED]))}")
print(f"Number of complete trials: {len(study.get_trials(states=[optuna.trial.TrialState.COMPLETE]))}")

            
print("\nTop 2 Trials:")
sorted_trials = sorted(study.trials, key=lambda t: t.value, reverse=True)
for trial in sorted_trials[:2]:
    print(f"\nTrial {trial.number}")
    print(f"AUC: {trial.value:.4f}")
    print("Parameters:")
    for param_name, param_value in trial.params.items():
        print(f"  {param_name}: {param_value}")


Study Statistics:
Number of completed trials: 4
Number of pruned trials: 0
Number of complete trials: 4

Top 2 Trials:

Trial 0
AUC: 0.5578
Parameters:
  head_num: 29
  head_dim: 12
  attention_hidden_dim: 117
  dropout: 0.06297744623833967
  learning_rate: 5.48331531711811e-05

Trial 1
AUC: 0.5548
Parameters:
  head_num: 10
  head_dim: 24
  attention_hidden_dim: 210
  dropout: 0.15981037963782108
  learning_rate: 6.660790310583853e-05


# DONE ⛄️