# Getting started

In this notebook, we illustrate how to use the Neural News Recommendation with Multi-Head Self-Attention ([NRMS](https://aclanthology.org/D19-1671/)). The implementation is taken from the [recommenders](https://github.com/recommenders-team/recommenders) repository. We have simply stripped the model to keep it cleaner.

We use a small dataset, which is downloaded from [recsys.eb.dk](https://recsys.eb.dk/). All the datasets are stored in the folder path ```~/ebnerd_data/*```.

## Load functionality

In [1]:
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import tensorflow as tf
import polars as pl
from tensorflow.python.client import device_lib
import numpy as np

from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_SUBTITLE_COL,
    DEFAULT_LABELS_COL,
    DEFAULT_TITLE_COL,
    DEFAULT_USER_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_ARTICLE_ID_COL,
    DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL, #-------
    DEFAULT_HISTORY_READ_TIME_COL #-------
)

from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history,
)
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
from ebrec.utils._articles import convert_text2encoding_with_transformers
from ebrec.utils._polars import concat_str_columns, slice_join_dataframes
from ebrec.utils._articles import create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._python import write_submission_file, rank_predictions_by_score

from ebrec.models.newsrec.dataloader import NRMSDataLoader
from ebrec.models.newsrec.model_config import hparams_nrms
from ebrec.models.newsrec import NRMSModel, NRMSWrapper

2024-12-14 22:56:16.442053: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-14 22:56:16.570869: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-14 22:56:16.570906: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-14 22:56:16.571483: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-14 22:56:16.630820: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: A

In [2]:
# Discover the GPUs TensorFlow can see.
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        # Expose only the first GPU to TensorFlow.
        tf.config.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as err:
        # Raised when the visible devices are changed after the GPUs were initialised.
        print(err)

# Report what is available (an empty list means the notebook runs on CPU).
physical_devices = tf.config.list_physical_devices('GPU')
print("Available devices:", physical_devices)

Available devices: []


2024-12-14 22:56:20.048491: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2211] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


**TODO:** add a grid search over the model hyperparameters.

## Load dataset

In [3]:
def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
    """
    Load one EB-NeRD split and attach each user's history to the behaviors.

    Reads ``history.parquet`` and ``behaviors.parquet`` from ``path``. The
    history is reduced to the user id, article ids, impression timestamps,
    scroll percentages and read times, passed through ``truncate_history``
    (on the article-id column, with ``history_size`` and ``padding_value=0``),
    and then left-joined onto the behaviors on the user column.

    Args:
        path: Directory holding ``history.parquet`` and ``behaviors.parquet``.
        history_size: Forwarded to ``truncate_history``.

    Returns:
        The behaviors frame with the selected history columns joined in.
    """
    history_columns = [
        DEFAULT_USER_COL,
        DEFAULT_HISTORY_ARTICLE_ID_COL,
        DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,
        DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL,
        DEFAULT_HISTORY_READ_TIME_COL,
    ]
    # The history stays lazy until it is needed for the join.
    history_lazy = truncate_history(
        pl.scan_parquet(path / "history.parquet").select(history_columns),
        column=DEFAULT_HISTORY_ARTICLE_ID_COL,
        history_size=history_size,
        padding_value=0,
        enable_warning=False,
    )
    behaviors = pl.scan_parquet(path / "behaviors.parquet").collect()
    return slice_join_dataframes(
        behaviors,
        df2=history_lazy.collect(),
        on=DEFAULT_USER_COL,
        how="left",
    )

### Generate labels
We sample a small fraction of the data just to get started. For the test set we simply make up a dummy label column of 0s and 1s; these are not the true labels.

In [4]:
# Root folder holding the EB-NeRD splits.
# NOTE(review): this is an absolute, machine-specific path; adjust it for your environment.
PATH = Path("/dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data").expanduser()
# If you change the split, remember to change it in make_embedding_artifacts.ipynb (embeddings) too.
DATASPLIT = "ebnerd_small"
# DATASPLIT = "ebnerd__testset"

# Folder for dumped artifacts; created up front so later cells can write to it.
DUMP_DIR = PATH / "dump_artifacts"
DUMP_DIR.mkdir(parents=True, exist_ok=True)

In this example we sample the dataset, just to keep it smaller. The test set can be added in the same way as the validation set.

In [5]:
# Columns kept from the joined behaviors/history frame.
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL,  # added: scroll percentage per history article
    DEFAULT_HISTORY_READ_TIME_COL,  # added: read time per history article
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
]
HISTORY_SIZE = 30
FRACTION = 0.01  # fraction of each split that is kept
SAMPLE_SEED = 123  # one seed for negative sampling and row sub-sampling

# Training split: negative sampling (npratio=6), binary labels, then row sub-sampling.
# The sub-sampling is seeded so Restart & Run All selects the same rows every time.
df_train = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=6,
        shuffle=True,
        with_replacement=True,
        seed=SAMPLE_SEED,
    )
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION, seed=SAMPLE_SEED)
)
# Validation split: no negative sampling, only labels and seeded sub-sampling.
df_validation = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION, seed=SAMPLE_SEED)
)
print(df_train.head(2))
print(df_validation.head(2))

shape: (2, 9)
┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ user_id ┆ article_id ┆ scroll_pe ┆ read_time ┆ … ┆ article_i ┆ impressio ┆ impressio ┆ labels    │
│ ---     ┆ _fixed     ┆ rcentage_ ┆ _fixed    ┆   ┆ ds_clicke ┆ n_id      ┆ n_time    ┆ ---       │
│ u32     ┆ ---        ┆ fixed     ┆ ---       ┆   ┆ d         ┆ ---       ┆ ---       ┆ list[i8]  │
│         ┆ list[i32]  ┆ ---       ┆ list[f32] ┆   ┆ ---       ┆ u32       ┆ datetime[ ┆           │
│         ┆            ┆ list[f32] ┆           ┆   ┆ list[i64] ┆           ┆ μs]       ┆           │
╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 275409  ┆ [9769414,  ┆ [26.0,    ┆ [1115.0,  ┆ … ┆ [9770594] ┆ 271337003 ┆ 2023-05-1 ┆ [0, 0, …  │
│         ┆ 9769387, … ┆ 26.0, …   ┆ 4.0, …    ┆   ┆           ┆           ┆ 8         ┆ 0]        │
│         ┆ 9769328]   ┆ 39.0]     ┆ 23.0]     ┆   ┆           ┆           ┆ 

## Load articles

In [6]:
# Read the article table of the selected split and preview the first rows.
articles_path = PATH / DATASPLIT / "articles.parquet"
df_articles = pl.read_parquet(articles_path)
df_articles.head(2)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3001353,"""Natascha var i…","""Politiet frygt…",2023-06-29 06:20:33,False,"""Sagen om den ø…",2006-08-31 08:06:45,[3150850],"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative"""
3003065,"""Kun Star Wars …","""Biografgængern…",2023-06-29 06:20:35,False,"""Vatikanet har …",2006-05-21 16:57:00,[3006712],"""article_defaul…","""https://ekstra…",[],[],"[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive"""


## Added features and hourly difference between published and viewed article

In [7]:
## NEW

from sklearn.preprocessing import StandardScaler


def _article_lookup(value_col):
    """Build a dict mapping ``article_id`` to ``value_col`` from ``df_articles``."""
    columns = df_articles.select("article_id", value_col).to_dict(as_series=False)
    return dict(zip(columns["article_id"], columns[value_col]))


# Lookup tables keyed by article_id.
article_time_dict = _article_lookup("published_time")  # article_id -> published_time
article_cat_dict = _article_lookup("category")  # article_id -> category
article_type_dict = _article_lookup("article_type")  # article_id -> article_type


def get_article_times(article_ids):
    """Return the published time of each id (``None`` for ids missing from ``df_articles``)."""
    return [article_time_dict.get(aid, None) for aid in article_ids]


def get_article_category(article_ids):
    """Return the category of each id (``None`` for ids missing from ``df_articles``)."""
    return [article_cat_dict.get(aid, None) for aid in article_ids]


def get_article_type(article_ids):
    """Return the article type of each id (``None`` for ids missing from ``df_articles``)."""
    return [article_type_dict.get(aid, None) for aid in article_ids]


def calculate_hour_differences(impression_time, article_times):
    """
    Hours elapsed between an article's published time and the impression.

    Args:
        impression_time: Timestamp of the impression.
        article_times: A list of published times (one result per entry, ``None``
            entries stay ``None``), or a single published time / ``None``.

    Returns:
        A list of hour differences for list input, otherwise a single value or ``None``.
    """
    # Single value (intended for a clicked article).
    if not isinstance(article_times, list):
        if article_times is None:
            return None
        return (impression_time - article_times).total_seconds() / 3600

    # List of values (in-view articles).
    return [
        (impression_time - article_time).total_seconds() / 3600
        if article_time is not None else None
        for article_time in article_times
    ]


# Work on a separate pandas frame and rebind ``df_train`` only once everything has
# succeeded. If a step fails (e.g. the cell is re-run after 'impression_time' was
# already dropped), ``df_train`` stays a valid polars frame instead of being left
# behind as a half-processed pandas frame.
train_pd = df_train.to_pandas()

# Published time of every in-view article.
train_pd["inview_article_times"] = train_pd["article_ids_inview"].apply(get_article_times)

# Published time of the clicked article(s); only needed by the disabled feature below.
train_pd["clicked_article_time"] = train_pd["article_ids_clicked"].apply(get_article_times)

# Hours between publication and impression for the in-view articles.
train_pd['inview_hour_differences'] = train_pd.apply(
    lambda row: calculate_hour_differences(row['impression_time'], row['inview_article_times']),
    axis=1
)

# # Use for clicked article
# train_pd['clicked_hour_difference'] = train_pd.apply(
#    lambda row: calculate_hour_differences(row['impression_time'], row['clicked_article_time']),
#    axis=1
# )

# Category of the in-view and history articles.
train_pd["inview_article_categories"] = train_pd["article_ids_inview"].apply(get_article_category)
train_pd["history_article_categories"] = train_pd["article_id_fixed"].apply(get_article_category)

# Article type of the in-view and history articles.
train_pd["inview_article_types"] = train_pd["article_ids_inview"].apply(get_article_type)
train_pd["history_article_types"] = train_pd["article_id_fixed"].apply(get_article_type)

# Drop the raw time columns; only the derived hour differences are kept.
train_pd = train_pd.drop(['inview_article_times', 'clicked_article_time','impression_time'], axis=1)

df_train = pl.from_pandas(train_pd)

df_train.head(2)

user_id,article_id_fixed,scroll_percentage_fixed,read_time_fixed,article_ids_inview,article_ids_clicked,impression_id,labels,inview_hour_differences,inview_article_categories,history_article_categories,inview_article_types,history_article_types
u32,list[i32],list[f32],list[f32],list[i64],list[i64],u32,list[i8],list[f64],list[i64],list[i64],list[str],list[str]
275409,"[9769414, 9769387, … 9769328]","[26.0, 26.0, … 39.0]","[1115.0, 4.0, … 23.0]","[9770799, 9770799, … 9770799]",[9770594],271337003,"[0, 0, … 0]","[10.4725, 10.4725, … 10.4725]","[142, 142, … 142]","[140, 2975, … 414]","[""article_default"", ""article_default"", … ""article_default""]","[""article_default"", ""article_default"", … ""article_default""]"
1399788,"[9767868, 9767725, … 9735909]","[23.0, 69.0, … 13.0]","[3.0, 3.0, … 4.0]","[9772363, 9772366, … 9772517]",[9772284],35775585,"[0, 0, … 0]","[2.317222, 2.875833, … 1.465]","[2975, 118, … 118]","[498, 512, … 118]","[""article_default"", ""article_default"", … ""article_default""]","[""article_default"", ""article_default"", … ""article_default""]"


In [8]:
# Add the article-derived features to the validation split.
# The helpers used here (get_article_times, calculate_hour_differences,
# get_article_category, get_article_type) and their lookup dicts are defined once in
# the training feature cell above and are reused instead of being redefined, so that
# cell must be run first.

# Work on a separate pandas frame and rebind ``df_validation`` only once everything
# has succeeded, so a failing step cannot leave it as a half-processed pandas frame.
validation_pd = df_validation.to_pandas()

# Published time of every in-view article.
validation_pd["inview_article_times"] = validation_pd["article_ids_inview"].apply(get_article_times)

# Published time of the clicked article(s); only needed by the disabled feature below.
validation_pd["clicked_article_time"] = validation_pd["article_ids_clicked"].apply(get_article_times)

# Hours between publication and impression for the in-view articles.
validation_pd['inview_hour_differences'] = validation_pd.apply(
    lambda row: calculate_hour_differences(row['impression_time'], row['inview_article_times']),
    axis=1
)

# # Use for clicked article -- might be leaky??
# validation_pd['clicked_hour_difference'] = validation_pd.apply(
#    lambda row: calculate_hour_differences(row['impression_time'], row['clicked_article_time']),
#    axis=1
# )

# Category of the in-view and history articles.
validation_pd["inview_article_categories"] = validation_pd["article_ids_inview"].apply(get_article_category)
validation_pd["history_article_categories"] = validation_pd["article_id_fixed"].apply(get_article_category)

# Article type of the in-view and history articles.
validation_pd["inview_article_types"] = validation_pd["article_ids_inview"].apply(get_article_type)
validation_pd["history_article_types"] = validation_pd["article_id_fixed"].apply(get_article_type)

# Drop the raw time columns; only the derived hour differences are kept.
validation_pd = validation_pd.drop(['inview_article_times', 'clicked_article_time','impression_time'], axis=1)

df_validation = pl.from_pandas(validation_pd)

df_validation.head(2)

user_id,article_id_fixed,scroll_percentage_fixed,read_time_fixed,article_ids_inview,article_ids_clicked,impression_id,labels,inview_hour_differences,inview_article_categories,history_article_categories,inview_article_types,history_article_types
u32,list[i32],list[f32],list[f32],list[i32],list[i32],u32,list[i8],list[f64],list[i64],list[i64],list[str],list[str]
1879356,"[9778627, 9778351, … 9779648]","[100.0, 63.0, … 100.0]","[36.0, 134.0, … 153.0]","[9780447, 9770419, … 9686731]",[9770419],114099540,"[0, 1, … 0]","[19.380278, 199.366944, … 1555.213611]","[2975, 142, … 414]","[118, 118, … 140]","[""article_default"", ""article_default"", … ""article_default""]","[""article_default"", ""article_default"", … ""article_default""]"
2064874,"[9771995, 9776148, … 9779867]","[84.0, 100.0, … 100.0]","[14.0, 8.0, … 75.0]","[9783729, 9783657, … 9782256]",[9783655],143113690,"[0, 0, … 0]","[1.100833, 2.063056, … 0.625556]","[142, 142, … 498]","[118, 2975, … 498]","[""article_default"", ""article_default"", … ""article_default""]","[""article_default"", ""article_default"", … ""article_default""]"


## Init model using HuggingFace's tokenizer and wordembedding
In the original implementation, they use the GloVe embeddings and tokenizer. To get going fast, we'll use a multilingual LLM from Hugging Face. 
Utilizing the tokenizer to tokenize the articles and the word-embedding to init NRMS.


In [9]:
# Alternative checkpoints that were considered:
# TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
# TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-large"
# TRANSFORMER_MODEL_NAME = "google-bert/bert-base-multilingual-uncased"
# TODO: argue for cased vs. uncased.
# Cased might be better, but uncased is used to compare with the Maltehb model.

TRANSFORMER_MODEL_NAME = "Maltehb/danish-bert-botxo"
# Article text columns that are concatenated before tokenization.
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
MAX_TITLE_LENGTH = 30  # NOTE(review): the author reports an error when this is changed; possibly hard-coded elsewhere — TODO confirm



# LOAD HUGGINGFACE:
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# Initialise the word embeddings from the loaded transformer model; the result is
# later passed to NRMSModel as `word2vec_embedding`.
word2vec_embedding = get_transformers_word_embeddings(transformer_model)



# Concatenate the text columns, then tokenize the combined column with the tokenizer
# (max_length=MAX_TITLE_LENGTH). Each helper returns the updated frame and the name
# of the column it produced.
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)

# Mapping built from df_articles with the tokenized column as the value; it is handed
# to the dataloaders as `article_dict`.
article_mapping = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)




In [10]:
# Display the embedding matrix returned by get_transformers_word_embeddings; it is
# later passed to NRMSModel as `word2vec_embedding`.
word2vec_embedding

array([[ 0.01057525,  0.0519704 ,  0.08909235, ..., -0.00795781,
        -0.06168545, -0.07079539],
       [-0.01835794,  0.04070301,  0.02630469, ..., -0.00612215,
        -0.03679245, -0.00261144],
       [-0.01904276,  0.02300256, -0.00536503, ...,  0.00180998,
         0.01913669, -0.00572065],
       ...,
       [ 0.02969794, -0.02969835,  0.0127237 , ..., -0.0130282 ,
        -0.00069379,  0.004221  ],
       [ 0.03114044, -0.03700501,  0.01400322, ..., -0.00791059,
         0.00770514, -0.00168254],
       [ 0.0367507 , -0.0307173 ,  0.00670483, ..., -0.01460291,
         0.00015374, -0.00201466]], dtype=float32)

In [11]:
# Sanity check: both splits should expose the same feature columns.
for _label, _frame in (
    ("df_train columns:", df_train),
    ("df_validation columns:", df_validation),
):
    print(_label, _frame.columns)

df_train columns: ['user_id', 'article_id_fixed', 'scroll_percentage_fixed', 'read_time_fixed', 'article_ids_inview', 'article_ids_clicked', 'impression_id', 'labels', 'inview_hour_differences', 'inview_article_categories', 'history_article_categories', 'inview_article_types', 'history_article_types']
df_validation columns: ['user_id', 'article_id_fixed', 'scroll_percentage_fixed', 'read_time_fixed', 'article_ids_inview', 'article_ids_clicked', 'impression_id', 'labels', 'inview_hour_differences', 'inview_article_categories', 'history_article_categories', 'inview_article_types', 'history_article_types']


# Initiate the dataloaders
In this implementation the models and the data are decoupled. Hence, you should build a dataloader that fits your needs.

In [12]:
# Define which additional features to include
# NOTE(review): this list is not passed to the dataloaders in this cell.
feature_columns = [
    'scroll_percentage_fixed',
    'read_time_fixed',
    'inview_hour_differences',
    'inview_article_categories',
    'history_article_categories',
    'inview_article_types',
    'history_article_types'
]

# Settings shared by the training and validation loaders.
_shared_loader_kwargs = dict(
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
)

# Training loader: eval_mode off, batches of 128.
train_dataloader = NRMSDataLoader(
    behaviors=df_train,
    eval_mode=False,
    batch_size=128,
    **_shared_loader_kwargs,
)

# Validation loader: eval_mode on, batches of 64.
val_dataloader = NRMSDataLoader(
    behaviors=df_validation,
    eval_mode=True,
    batch_size=64,
    **_shared_loader_kwargs,
)

## Train the model


In [12]:
# List all physical devices
# physical_devices = tf.config.list_physical_devices('GPU')
# print("Available devices:", physical_devices)
# import torch.nn as nn
# print(torch.cuda.is_available())

In [13]:
# import os
# from tqdm.notebook import tqdm


# MODEL_NAME = "NRMS"
# LOG_DIR = f"downloads/runs/{MODEL_NAME}"
# WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
# MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"

# os.makedirs(LOG_DIR, exist_ok=True)
# os.makedirs(WEIGHTS_DIR, exist_ok=True)

# # Create a custom ModelCheckpoint for PyTorch
# class PyTorchModelCheckpoint:
#     def __init__(self, filepath, model_wrapper=None, save_best_only=True, save_weights_only=True, verbose=1):
#         self.filepath = filepath
#         self.model_wrapper = model_wrapper  # Store the model wrapper reference
#         self.save_best_only = save_best_only
#         self.save_weights_only = save_weights_only
#         self.verbose = verbose
#         self.best_val_loss = float('inf')
    
#     def on_epoch_end(self, epoch, logs=None):
#         val_loss = logs.get('val_loss', None)
#         if val_loss is None:
#             return
        
#         if self.save_best_only:
#             if val_loss < self.best_val_loss:
#                 if self.verbose:
#                     print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
#                 self.best_val_loss = val_loss
#                 # Use the model_wrapper reference
#                 self.model_wrapper.save_weights(self.filepath)
#         else:
#             self.model_wrapper.save_weights(self.filepath)

# # Initialize model first
# hparams_nrms.history_size = HISTORY_SIZE

# pytorch_model = NRMSModel(
#     hparams=hparams_nrms,
#     word2vec_embedding=word2vec_embedding,
#     seed=42,
# )
# model = NRMSWrapper(pytorch_model)

# # Then create the callback with the model reference
# modelcheckpoint = PyTorchModelCheckpoint(
#     filepath=MODEL_WEIGHTS,
#     model_wrapper=model,  # Pass the model wrapper
#     save_best_only=True,
#     save_weights_only=True,
#     verbose=1
# )

# # Training
# hist = model.fit(
#     train_dataloader,
#     validation_data=val_dataloader,
#     epochs=4,
#     callbacks=[modelcheckpoint]
# )

# # Load weights using the wrapper
# model.load_weights(filepath=MODEL_WEIGHTS)

# # Get predictions
# pred_validation = model.predict(val_dataloader)

NRMSWrapper init
True


Epoch 1/4 [Train]:   4%|▍         | 30/733 [00:04<01:51,  6.28it/s, loss=0.4383]

Epoch 1/4 [Train]: 100%|██████████| 733/733 [02:37<00:00,  4.64it/s, loss=0.3889]
Epoch 1/4 [Valid]: 100%|██████████| 1530/1530 [06:09<00:00,  4.14it/s, loss=0.6621]


Epoch 1 - Train Loss: 0.3889, Val Loss: 0.3172

Validation loss improved from inf to 0.31719


Epoch 2/4 [Train]: 100%|██████████| 733/733 [03:11<00:00,  3.83it/s, loss=0.3791]
Epoch 2/4 [Valid]: 100%|██████████| 1530/1530 [08:58<00:00,  2.84it/s, loss=0.6765]


Epoch 2 - Train Loss: 0.3791, Val Loss: 0.3241


Epoch 3/4 [Train]: 100%|██████████| 733/733 [03:21<00:00,  3.64it/s, loss=0.3720]
Epoch 3/4 [Valid]: 100%|██████████| 1530/1530 [10:02<00:00,  2.54it/s, loss=0.6823]


Epoch 3 - Train Loss: 0.3720, Val Loss: 0.3269


Epoch 4/4 [Train]: 100%|██████████| 733/733 [04:02<00:00,  3.03it/s, loss=0.3676]
Epoch 4/4 [Valid]: 100%|██████████| 1530/1530 [10:07<00:00,  2.52it/s, loss=0.7182]
  self.model.load_state_dict(torch.load(filepath))


Epoch 4 - Train Loss: 0.3676, Val Loss: 0.3441


Predicting: 100%|██████████| 1530/1530 [11:18<00:00,  2.26it/s]


In [13]:
import os
from tqdm.notebook import tqdm


# Where logs and model weights of this run are stored (relative to the working directory).
MODEL_NAME = "NRMS"
LOG_DIR = f"downloads/runs/{MODEL_NAME}"
WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"

# Create both folders up front; existing folders are left untouched.
for _run_dir in (LOG_DIR, WEIGHTS_DIR):
    os.makedirs(_run_dir, exist_ok=True)

# Create a custom ModelCheckpoint for PyTorch
class PyTorchModelCheckpoint:
    """
    Checkpoint callback that saves weights through a model wrapper.

    After each epoch ``on_epoch_end`` reads ``val_loss`` from ``logs``. With
    ``save_best_only=True`` the weights are written only when the validation
    loss is strictly lower than the best value seen so far; otherwise they are
    written on every epoch that reports a ``val_loss``.

    Args:
        filepath: Path handed to ``model_wrapper.save_weights``.
        model_wrapper: Object exposing ``save_weights(filepath)``.
        save_best_only: Save only when the validation loss improves.
        save_weights_only: Stored for API parity; not used by this class.
        verbose: When truthy, print a message whenever the loss improves.
    """

    def __init__(self, filepath, model_wrapper=None, save_best_only=True, save_weights_only=True, verbose=1):
        self.filepath = filepath
        self.model_wrapper = model_wrapper  # Store the model wrapper reference
        self.save_best_only = save_best_only
        self.save_weights_only = save_weights_only
        self.verbose = verbose
        self.best_val_loss = float('inf')

    def on_epoch_end(self, epoch, logs=None):
        """Save the weights if the epoch's ``val_loss`` warrants it; a missing ``logs``/``val_loss`` is a no-op."""
        # ``logs`` defaults to None, so guard before calling ``.get`` on it.
        val_loss = (logs or {}).get('val_loss', None)
        if val_loss is None:
            return

        if not self.save_best_only:
            self.model_wrapper.save_weights(self.filepath)
            return

        if val_loss < self.best_val_loss:
            if self.verbose:
                print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
            self.best_val_loss = val_loss
            # Use the model_wrapper reference
            self.model_wrapper.save_weights(self.filepath)

# Initialize model first
# The shared hparams object is updated in place so the model's history length
# matches the HISTORY_SIZE used when loading the data.
hparams_nrms.history_size = HISTORY_SIZE

pytorch_model = NRMSModel(
    hparams=hparams_nrms,
    word2vec_embedding=word2vec_embedding,
    seed=42,
)
# The wrapper provides the fit / predict / save_weights / load_weights calls used below.
model = NRMSWrapper(pytorch_model)

# Then create the callback with the model reference
modelcheckpoint = PyTorchModelCheckpoint(
    filepath=MODEL_WEIGHTS,
    model_wrapper=model,  # Pass the model wrapper
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

# Training
hist = model.fit(
    train_dataloader,
    validation_data=val_dataloader,
    epochs=4,
    callbacks=[modelcheckpoint]
)

# Load weights using the wrapper
# MODEL_WEIGHTS is the file the checkpoint callback writes when the validation loss
# improves, so this restores the best checkpoint rather than the last epoch.
model.load_weights(filepath=MODEL_WEIGHTS)

# Get predictions
pred_validation = model.predict(val_dataloader)

Epoch 1/4 [Train]: 100%|██████████| 19/19 [00:02<00:00,  6.53it/s, loss=0.4546]
Epoch 1/4 [Valid]: 100%|██████████| 39/39 [00:09<00:00,  4.27it/s, loss=0.6839]


Epoch 1 - Train Loss: 0.4546, Val Loss: 0.3332

Validation loss improved from inf to 0.33318


Epoch 2/4 [Train]: 100%|██████████| 19/19 [00:02<00:00,  7.68it/s, loss=0.3973]
Epoch 2/4 [Valid]: 100%|██████████| 39/39 [00:07<00:00,  4.89it/s, loss=0.6138]


Epoch 2 - Train Loss: 0.3973, Val Loss: 0.2990

Validation loss improved from 0.33318 to 0.29902


Epoch 3/4 [Train]: 100%|██████████| 19/19 [00:02<00:00,  7.72it/s, loss=0.3711]
Epoch 3/4 [Valid]: 100%|██████████| 39/39 [00:07<00:00,  4.91it/s, loss=0.6244]


Epoch 3 - Train Loss: 0.3711, Val Loss: 0.3042


Epoch 4/4 [Train]: 100%|██████████| 19/19 [00:02<00:00,  7.73it/s, loss=0.3567]
Epoch 4/4 [Valid]: 100%|██████████| 39/39 [00:07<00:00,  4.90it/s, loss=0.6341]
  self.model.load_state_dict(torch.load(filepath))


Epoch 4 - Train Loss: 0.3567, Val Loss: 0.3089


Predicting: 100%|██████████| 39/39 [00:07<00:00,  4.89it/s]


In [None]:
# import os
# from tqdm.notebook import tqdm


# MODEL_NAME = "NRMS"
# LOG_DIR = f"downloads/runs/{MODEL_NAME}"
# WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
# MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"

# os.makedirs(LOG_DIR, exist_ok=True)
# os.makedirs(WEIGHTS_DIR, exist_ok=True)

# # Create a custom ModelCheckpoint for PyTorch
# class PyTorchModelCheckpoint:
#     def __init__(self, filepath, model_wrapper=None, save_best_only=True, save_weights_only=True, verbose=1):
#         self.filepath = filepath
#         self.model_wrapper = model_wrapper  # Store the model wrapper reference
#         self.save_best_only = save_best_only
#         self.save_weights_only = save_weights_only
#         self.verbose = verbose
#         self.best_val_loss = float('inf')
    
#     def on_epoch_end(self, epoch, logs=None):
#         val_loss = logs.get('val_loss', None)
#         if val_loss is None:
#             return
        
#         if self.save_best_only:
#             if val_loss < self.best_val_loss:
#                 if self.verbose:
#                     print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
#                 self.best_val_loss = val_loss
#                 # Use the model_wrapper reference
#                 self.model_wrapper.save_weights(self.filepath)
#         else:
#             self.model_wrapper.save_weights(self.filepath)

# # Initialize model first
# hparams_nrms.history_size = HISTORY_SIZE

# pytorch_model = NRMSModel(
#     hparams=hparams_nrms,
#     word2vec_embedding=word2vec_embedding,
#     seed=42,
# )
# model = NRMSWrapper(pytorch_model)

# # Then create the callback with the model reference
# modelcheckpoint = PyTorchModelCheckpoint(
#     filepath=MODEL_WEIGHTS,
#     model_wrapper=model,  # Pass the model wrapper
#     save_best_only=True,
#     save_weights_only=True,
#     verbose=1
# )

# # Training
# hist = model.fit(
#     train_dataloader,
#     validation_data=val_dataloader,
#     epochs=1,
#     callbacks=[modelcheckpoint]
# )

# # Load weights using the wrapper
# model.load_weights(filepath=MODEL_WEIGHTS)

# # Get predictions
# pred_validation = model.predict(val_dataloader)

In [22]:
!pip install optuna

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
Downloading alembic-1.14.0-py3-none-any.whl (233 kB)
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.8 alembic-1.14.0 colorlog-6.9.0 optuna-4.1.0


In [15]:
def calculate_metrics(outputs, labels, top_k: int = 5):
    """Compute AUC, MRR and NDCG@k for one list of scored candidates.

    Args:
        outputs: 1-D sequence of predicted scores, one per candidate.
        labels: 1-D sequence of binary relevance labels (1 marks a relevant item),
            aligned with ``outputs``. Integer and float labels are both accepted.
        top_k: cut-off used for NDCG (defaults to 5, the previously hard-coded value).

    Returns:
        Tuple ``(auc, mrr, ndcg)``. ``mrr`` is 0 when no label equals 1 and ``ndcg``
        is 0 when the ideal DCG is 0.

    Raises:
        Whatever ``roc_auc_score`` raises for the given input; it is not caught here.
    """
    # Imported locally so this cell does not depend on an import made in a later cell.
    from sklearn.metrics import roc_auc_score

    # Accept plain lists as well as arrays: the fancy indexing below needs ndarrays.
    outputs = np.asarray(outputs)
    labels = np.asarray(labels)

    # AUC
    auc = roc_auc_score(labels, outputs)

    # MRR: reciprocal of the 1-based position of the best-ranked relevant item.
    sorted_indices = np.argsort(outputs)[::-1]
    ranks = np.where(labels[sorted_indices] == 1)[0] + 1  # 1-based ranks
    mrr = 1 / ranks[0] if len(ranks) > 0 else 0

    # NDCG@k
    top_k_indices = sorted_indices[:top_k]
    # Cast to int: the sum of a float label array is a float, which range() rejects.
    n_relevant = int(np.sum(labels))
    ideal_dcg = sum(1 / np.log2(i + 2) for i in range(min(top_k, n_relevant)))
    dcg = sum(
        labels[i] / np.log2(rank + 2)
        for rank, i in enumerate(top_k_indices)
    )
    ndcg = dcg / ideal_dcg if ideal_dcg > 0 else 0

    return auc, mrr, ndcg

In [27]:
# # Import necessary libraries
# import optuna
# from optuna import Trial

# # Define the objective function for Optuna
# def objective(trial: Trial):
#     # Suggest values for the hyperparameters
#     hparams_nrms.head_num = trial.suggest_int('head_num', 10, 30)
#     hparams_nrms.head_dim = trial.suggest_int('head_dim', 10, 30)
#     hparams_nrms.attention_hidden_dim = trial.suggest_int('attention_hidden_dim', 100, 300)
#     hparams_nrms.dropout = trial.suggest_float('dropout', 0.0, 0.5)
#     hparams_nrms.learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)

#     # Initialize the model with suggested hyperparameters
#     pytorch_model = NRMSModel(
#         hparams=hparams_nrms,
#         word2vec_embedding=word2vec_embedding,
#         seed=42,
#     )
#     model = NRMSWrapper(pytorch_model) #specify which model

#     # Use model.model.parameters() to access the underlying model's parameters
#     optimizer = torch.optim.Adam(model.model.parameters(), lr=hparams_nrms.learning_rate)
#     loss_fn = torch.nn.BCELoss()  # Use the appropriate loss function for your problem

#     # Training loop
#     for epoch in range(2):
#         # Set the model to training mode
#         model.model.train()

#         epoch_train_loss = 0.0
#         for inputs, labels in train_dataloader:
#             optimizer.zero_grad()
#             history, candidates = inputs

#             # Move data to the appropriate device
#             history = torch.from_numpy(history).to(model.device)
#             candidates = torch.from_numpy(candidates).to(model.device)
#             labels = torch.from_numpy(labels).float().to(model.device)

#             # Forward pass using the underlying model
#             outputs = model.model(history, candidates, training=True)
#             loss = loss_fn(outputs, labels)

#             # Backward pass and optimization
#             loss.backward()
#             torch.nn.utils.clip_grad_norm_(model.model.parameters(), max_norm=5.0)
#             optimizer.step()

#             epoch_train_loss += loss.item()
#         epoch_train_loss /= len(train_dataloader)

#         # Validation loop
#         model.model.eval()  # Set the model to evaluation mode
#         epoch_val_loss = 0.0
#         with torch.no_grad():
#             for inputs, labels in val_dataloader:
#                 history, candidates = inputs
#                 history = torch.from_numpy(history).to(model.device)
#                 candidates = torch.from_numpy(candidates).to(model.device)
#                 labels = torch.from_numpy(labels).float().to(model.device)

#                 outputs = model.model(history, candidates, training=False)
#                 loss = loss_fn(outputs, labels)
#                 epoch_val_loss += loss.item()
#         epoch_val_loss /= len(val_dataloader)

#         # Report intermediate objective value
#         trial.report(epoch_val_loss, epoch)

#         # Handle pruning (optional)
#         if trial.should_prune():
#             raise optuna.exceptions.TrialPruned()

#         print(f'Epoch {epoch+1}, Training Loss: {epoch_train_loss:.4f}, Validation Loss: {epoch_val_loss:.4f}')

#     return epoch_val_loss

# # Create an Optuna study and optimize
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=50)

# # Print the best hyperparameters
# print('Best hyperparameters:', study.best_params)

[I 2024-12-14 18:45:09,681] A new study created in memory with name: no-name-5d512886-3926-4201-8dde-8e6ee0d523c3
  hparams_nrms.learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)


NRMSWrapper init
True
Epoch 1, Training Loss: 0.5782, Validation Loss: 0.3739


[I 2024-12-14 18:46:26,165] Trial 0 finished with value: 0.31280321966518054 and parameters: {'head_num': 30, 'head_dim': 25, 'attention_hidden_dim': 167, 'dropout': 0.22542837077246064, 'learning_rate': 2.836064310721951e-05}. Best is trial 0 with value: 0.31280321966518054.


Epoch 2, Training Loss: 0.4150, Validation Loss: 0.3128
NRMSWrapper init
True
Epoch 1, Training Loss: 0.5053, Validation Loss: 0.3116


[I 2024-12-14 18:46:45,894] Trial 1 finished with value: 0.3098915300586007 and parameters: {'head_num': 11, 'head_dim': 19, 'attention_hidden_dim': 136, 'dropout': 0.2551789183453944, 'learning_rate': 0.0001855247208175534}. Best is trial 1 with value: 0.3098915300586007.


Epoch 2, Training Loss: 0.4072, Validation Loss: 0.3099
NRMSWrapper init
True
Epoch 1, Training Loss: 0.4679, Validation Loss: 0.3111


[I 2024-12-14 18:47:54,944] Trial 2 finished with value: 0.3081528439924314 and parameters: {'head_num': 25, 'head_dim': 27, 'attention_hidden_dim': 141, 'dropout': 0.2170231032877039, 'learning_rate': 0.00012027589052281536}. Best is trial 2 with value: 0.3081528439924314.


Epoch 2, Training Loss: 0.4076, Validation Loss: 0.3082
NRMSWrapper init
True
Epoch 1, Training Loss: 0.6376, Validation Loss: 0.5643


[I 2024-12-14 18:48:42,281] Trial 3 finished with value: 0.3879450706692485 and parameters: {'head_num': 22, 'head_dim': 24, 'attention_hidden_dim': 228, 'dropout': 0.4007823667923102, 'learning_rate': 1.708951752526795e-05}. Best is trial 2 with value: 0.3081528439924314.


Epoch 2, Training Loss: 0.5206, Validation Loss: 0.3879
NRMSWrapper init
True
Epoch 1, Training Loss: 0.4342, Validation Loss: 0.3127


[I 2024-12-14 18:49:15,530] Trial 4 finished with value: 0.3065825499497451 and parameters: {'head_num': 17, 'head_dim': 23, 'attention_hidden_dim': 242, 'dropout': 0.0177378111684095, 'learning_rate': 0.0006618828440160914}. Best is trial 4 with value: 0.3065825499497451.


Epoch 2, Training Loss: 0.3892, Validation Loss: 0.3066
NRMSWrapper init
True


[I 2024-12-14 18:49:30,575] Trial 5 pruned. 


NRMSWrapper init
True


[I 2024-12-14 18:49:44,389] Trial 6 pruned. 


NRMSWrapper init
True
Epoch 1, Training Loss: 0.5400, Validation Loss: 0.2976


[I 2024-12-14 18:50:06,261] Trial 7 finished with value: 0.30354183370416815 and parameters: {'head_num': 12, 'head_dim': 19, 'attention_hidden_dim': 134, 'dropout': 0.06874060206577653, 'learning_rate': 0.00012309881261735734}. Best is trial 7 with value: 0.30354183370416815.


Epoch 2, Training Loss: 0.4114, Validation Loss: 0.3035
NRMSWrapper init
True


[I 2024-12-14 18:50:19,719] Trial 8 pruned. 


NRMSWrapper init
True


[I 2024-12-14 18:50:45,879] Trial 9 pruned. 


NRMSWrapper init
True


[I 2024-12-14 18:50:56,636] Trial 10 pruned. 


NRMSWrapper init
True
Epoch 1, Training Loss: 0.4299, Validation Loss: 0.3025


[I 2024-12-14 18:51:26,137] Trial 11 finished with value: 0.30457936440195355 and parameters: {'head_num': 17, 'head_dim': 20, 'attention_hidden_dim': 243, 'dropout': 0.10048472321262351, 'learning_rate': 0.0009425628155764133}. Best is trial 7 with value: 0.30354183370416815.


Epoch 2, Training Loss: 0.3858, Validation Loss: 0.3046
NRMSWrapper init
True
Epoch 1, Training Loss: 0.4372, Validation Loss: 0.2981


[I 2024-12-14 18:51:50,781] Trial 12 finished with value: 0.30474833505494253 and parameters: {'head_num': 17, 'head_dim': 18, 'attention_hidden_dim': 221, 'dropout': 0.12024776814128625, 'learning_rate': 0.0006192611774202494}. Best is trial 7 with value: 0.30354183370416815.


Epoch 2, Training Loss: 0.3918, Validation Loss: 0.3047
NRMSWrapper init
True


[I 2024-12-14 18:52:00,343] Trial 13 pruned. 


NRMSWrapper init
True
Epoch 1, Training Loss: 0.4743, Validation Loss: 0.3110


[I 2024-12-14 18:52:20,405] Trial 14 pruned. 


NRMSWrapper init
True
Epoch 1, Training Loss: 0.4396, Validation Loss: 0.2976


[I 2024-12-14 18:52:41,874] Trial 15 finished with value: 0.30256819183176215 and parameters: {'head_num': 19, 'head_dim': 10, 'attention_hidden_dim': 204, 'dropout': 0.16990489261973232, 'learning_rate': 0.0009544695229692615}. Best is trial 15 with value: 0.30256819183176215.


Epoch 2, Training Loss: 0.3878, Validation Loss: 0.3026
NRMSWrapper init
True
Epoch 1, Training Loss: 0.5351, Validation Loss: 0.2970


[I 2024-12-14 18:53:07,352] Trial 16 finished with value: 0.30492772065199814 and parameters: {'head_num': 26, 'head_dim': 11, 'attention_hidden_dim': 201, 'dropout': 0.1858661300106021, 'learning_rate': 0.00011491155721188424}. Best is trial 15 with value: 0.30256819183176215.


Epoch 2, Training Loss: 0.4109, Validation Loss: 0.3049
NRMSWrapper init
True
Epoch 1, Training Loss: 0.4566, Validation Loss: 0.3029


[I 2024-12-14 18:53:33,759] Trial 17 finished with value: 0.3041133481960792 and parameters: {'head_num': 20, 'head_dim': 15, 'attention_hidden_dim': 201, 'dropout': 0.3074169983489604, 'learning_rate': 0.00036011013048270577}. Best is trial 15 with value: 0.30256819183176215.


Epoch 2, Training Loss: 0.4015, Validation Loss: 0.3041
NRMSWrapper init
True


[I 2024-12-14 18:53:43,823] Trial 18 pruned. 


NRMSWrapper init
True
Epoch 1, Training Loss: 0.4571, Validation Loss: 0.2976


[I 2024-12-14 18:54:34,367] Trial 19 finished with value: 0.30526120283386926 and parameters: {'head_num': 26, 'head_dim': 21, 'attention_hidden_dim': 298, 'dropout': 0.061845980993593064, 'learning_rate': 0.00017942259907619408}. Best is trial 15 with value: 0.30256819183176215.


Epoch 2, Training Loss: 0.4061, Validation Loss: 0.3053
NRMSWrapper init
True


[I 2024-12-14 18:54:46,536] Trial 20 pruned. 


NRMSWrapper init
True


[I 2024-12-14 18:55:00,993] Trial 21 pruned. 


NRMSWrapper init
True


[I 2024-12-14 18:55:11,236] Trial 22 pruned. 


NRMSWrapper init
True


[I 2024-12-14 18:55:30,866] Trial 23 pruned. 


NRMSWrapper init
True


[I 2024-12-14 18:55:44,591] Trial 24 pruned. 


NRMSWrapper init
True


[I 2024-12-14 18:55:56,921] Trial 25 pruned. 


NRMSWrapper init
True


[I 2024-12-14 18:56:09,095] Trial 26 pruned. 


NRMSWrapper init
True


[I 2024-12-14 18:56:23,576] Trial 27 pruned. 


NRMSWrapper init
True
Epoch 1, Training Loss: 0.4656, Validation Loss: 0.3014


[I 2024-12-14 18:56:46,120] Trial 28 finished with value: 0.30801563758354683 and parameters: {'head_num': 15, 'head_dim': 18, 'attention_hidden_dim': 127, 'dropout': 0.23912169051307391, 'learning_rate': 0.00028103323999633246}. Best is trial 15 with value: 0.30256819183176215.


Epoch 2, Training Loss: 0.4039, Validation Loss: 0.3080
NRMSWrapper init
True


[I 2024-12-14 18:57:19,453] Trial 29 pruned. 


NRMSWrapper init
True


[I 2024-12-14 18:57:31,651] Trial 30 pruned. 


NRMSWrapper init
True


[I 2024-12-14 18:57:45,481] Trial 31 pruned. 


NRMSWrapper init
True


[I 2024-12-14 18:57:59,579] Trial 32 pruned. 


NRMSWrapper init
True
Epoch 1, Training Loss: 0.5257, Validation Loss: 0.2976


[I 2024-12-14 18:58:18,624] Trial 33 finished with value: 0.31009990944490806 and parameters: {'head_num': 12, 'head_dim': 20, 'attention_hidden_dim': 271, 'dropout': 0.1712664133063157, 'learning_rate': 0.00015439903551947684}. Best is trial 15 with value: 0.30256819183176215.


Epoch 2, Training Loss: 0.4083, Validation Loss: 0.3101
NRMSWrapper init
True


[I 2024-12-14 18:58:28,517] Trial 34 pruned. 


NRMSWrapper init
True


[I 2024-12-14 18:58:52,748] Trial 35 pruned. 


NRMSWrapper init
True
Epoch 1, Training Loss: 0.4292, Validation Loss: 0.2984


[I 2024-12-14 18:59:23,804] Trial 36 finished with value: 0.3022862297373933 and parameters: {'head_num': 18, 'head_dim': 21, 'attention_hidden_dim': 252, 'dropout': 0.04387264176902639, 'learning_rate': 0.0009849376770800665}. Best is trial 36 with value: 0.3022862297373933.


Epoch 2, Training Loss: 0.3855, Validation Loss: 0.3023
NRMSWrapper init
True
Epoch 1, Training Loss: 0.4629, Validation Loss: 0.2972


[I 2024-12-14 18:59:59,738] Trial 37 finished with value: 0.3045895742131518 and parameters: {'head_num': 19, 'head_dim': 23, 'attention_hidden_dim': 136, 'dropout': 0.0030558575071432392, 'learning_rate': 0.000219732017463491}. Best is trial 36 with value: 0.3022862297373933.


Epoch 2, Training Loss: 0.4048, Validation Loss: 0.3046
NRMSWrapper init
True
Epoch 1, Training Loss: 0.4419, Validation Loss: 0.2977


[I 2024-12-14 19:00:22,453] Trial 38 finished with value: 0.3046636229211634 and parameters: {'head_num': 13, 'head_dim': 22, 'attention_hidden_dim': 180, 'dropout': 0.038647797087991514, 'learning_rate': 0.0006662145235265185}. Best is trial 36 with value: 0.3022862297373933.


Epoch 2, Training Loss: 0.3905, Validation Loss: 0.3047
NRMSWrapper init
True
Epoch 1, Training Loss: 0.5259, Validation Loss: 0.2963


[I 2024-12-14 19:00:44,180] Trial 39 finished with value: 0.3072310612573252 and parameters: {'head_num': 19, 'head_dim': 12, 'attention_hidden_dim': 260, 'dropout': 0.3361061950185143, 'learning_rate': 0.00014273913365876067}. Best is trial 36 with value: 0.3022862297373933.


Epoch 2, Training Loss: 0.4091, Validation Loss: 0.3072
NRMSWrapper init
True


[I 2024-12-14 19:01:13,805] Trial 40 pruned. 


NRMSWrapper init
True


[I 2024-12-14 19:01:27,478] Trial 41 pruned. 


NRMSWrapper init
True


[I 2024-12-14 19:01:52,448] Trial 42 pruned. 


NRMSWrapper init
True


[I 2024-12-14 19:02:08,820] Trial 43 pruned. 


NRMSWrapper init
True


[I 2024-12-14 19:02:21,946] Trial 44 pruned. 


NRMSWrapper init
True
Epoch 1, Training Loss: 0.4623, Validation Loss: 0.2971


[I 2024-12-14 19:02:43,985] Trial 45 finished with value: 0.3033513937677656 and parameters: {'head_num': 13, 'head_dim': 19, 'attention_hidden_dim': 280, 'dropout': 0.020525713916253283, 'learning_rate': 0.0003519804267263168}. Best is trial 36 with value: 0.3022862297373933.


Epoch 2, Training Loss: 0.4012, Validation Loss: 0.3034
NRMSWrapper init
True
Epoch 1, Training Loss: 0.4653, Validation Loss: 0.2970


[I 2024-12-14 19:03:02,904] Trial 46 finished with value: 0.3042949801915652 and parameters: {'head_num': 12, 'head_dim': 19, 'attention_hidden_dim': 282, 'dropout': 0.022647344338325808, 'learning_rate': 0.0003588119734843157}. Best is trial 36 with value: 0.3022862297373933.


Epoch 2, Training Loss: 0.4008, Validation Loss: 0.3043
NRMSWrapper init
True


[I 2024-12-14 19:03:11,286] Trial 47 pruned. 


NRMSWrapper init
True


[I 2024-12-14 19:03:21,293] Trial 48 pruned. 


NRMSWrapper init
True


[I 2024-12-14 19:03:34,969] Trial 49 pruned. 


Best hyperparameters: {'head_num': 18, 'head_dim': 21, 'attention_hidden_dim': 252, 'dropout': 0.04387264176902639, 'learning_rate': 0.0009849376770800665}


In [16]:
import optuna
from sklearn.metrics import roc_auc_score
import numpy as np
import torch

# Define the objective function for Optuna
def objective(trial, n_epochs: int = 4):
    """Optuna objective: train an NRMS model with sampled hyperparameters, return validation AUC.

    The sampled values are written onto the shared ``hparams_nrms`` object, so that
    object is mutated on every call. The function also reads the notebook globals
    ``word2vec_embedding``, ``train_dataloader`` and ``val_dataloader``.

    Args:
        trial: Optuna trial used to sample ``head_num``, ``head_dim``,
            ``attention_hidden_dim``, ``dropout`` and ``learning_rate``.
        n_epochs: number of passes over ``train_dataloader`` (defaults to 4, the
            previously hard-coded value).

    Returns:
        ``roc_auc_score`` computed over the concatenated validation labels and outputs.
    """
    import gc  # only needed for the per-trial cleanup at the end

    hparams_nrms.head_num = trial.suggest_int('head_num', 20, 30)
    hparams_nrms.head_dim = trial.suggest_int('head_dim', 20, 30)
    hparams_nrms.attention_hidden_dim = trial.suggest_int('attention_hidden_dim', 100, 300)
    hparams_nrms.dropout = trial.suggest_float('dropout', 0.1, 0.5)
    hparams_nrms.learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)

    # Initialize the model with suggested hyperparameters
    pytorch_model = NRMSModel(
        hparams=hparams_nrms,
        word2vec_embedding=word2vec_embedding,
        seed=42,
    )
    model = NRMSWrapper(pytorch_model)

    # Use model.model.parameters() to access the underlying model's parameters
    optimizer = torch.optim.Adam(model.model.parameters(), lr=hparams_nrms.learning_rate)
    loss_fn = torch.nn.BCELoss()  # Use the appropriate loss function for your problem

    try:
        # Training loop
        for _ in range(n_epochs):
            # Set the model to training mode
            model.model.train()

            for inputs, labels in train_dataloader:
                optimizer.zero_grad()
                history, candidates = inputs

                # Move data to the appropriate device
                history = torch.from_numpy(history).to(model.device)
                candidates = torch.from_numpy(candidates).to(model.device)
                labels = torch.from_numpy(labels).float().to(model.device)

                # Forward pass using the underlying model
                try:
                    outputs = model.model(history, candidates, training=True)
                except RuntimeError:
                    # Print the batch shapes to help diagnose the failure, then
                    # re-raise with the original traceback intact.
                    print(f"History shape: {history.shape}")
                    print(f"Candidates shape: {candidates.shape}")
                    raise

                loss = loss_fn(outputs, labels)

                # Backward pass and optimization
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.model.parameters(), max_norm=5.0)
                optimizer.step()

        # Validation loop
        model.model.eval()  # Set the model to evaluation mode
        all_outputs, all_labels = [], []
        with torch.no_grad():
            for inputs, labels in val_dataloader:
                history, candidates = inputs
                history = torch.from_numpy(history).to(model.device)
                candidates = torch.from_numpy(candidates).to(model.device)

                outputs = model.model(history, candidates, training=False)
                all_outputs.append(outputs.cpu().numpy())
                # Labels are only needed on the host for the AUC, so they are not
                # sent to the device and back; float32 matches the former `.float()`.
                all_labels.append(np.asarray(labels, dtype=np.float32))

        # Calculate AUC
        all_outputs = np.concatenate(all_outputs)
        all_labels = np.concatenate(all_labels)
        auc = roc_auc_score(all_labels, all_outputs)

        print(f"AUC: {auc:.4f}")

        # Return AUC as the single objective to maximize
        return auc
    finally:
        # Best-effort release of this trial's model and optimizer, also when the
        # trial fails, so their memory is not carried over into the next trial.
        del optimizer, model, pytorch_model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Create a single-objective study that maximizes the validation AUC.
# TPE is Optuna's default sampler; seeding it makes the sequence of suggested
# hyperparameters repeatable across notebook runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# gc_after_trial=True asks Optuna to run a garbage collection after every trial,
# which helps to drop the previous trial's model before the next one is built.
study.optimize(objective, n_trials=10, gc_after_trial=True)

# Print the best hyperparameters and AUC
print("Best hyperparameters:", study.best_params)
print(f"Best AUC: {study.best_value:.4f}")

# Add predictions and evaluate metrics for the best trial
def evaluate_best_trial(study, n_epochs: int = 4):
    """Re-train NRMS with the best trial's hyperparameters and compute validation metrics.

    The best parameters are written onto the shared ``hparams_nrms`` object. The
    function also reads the notebook globals ``word2vec_embedding``,
    ``train_dataloader``, ``val_dataloader``, ``df_validation`` and ``df_train``.

    Args:
        study: Optuna study whose ``best_trial`` provides the hyperparameters.
        n_epochs: number of passes over ``train_dataloader`` used for re-training
            (defaults to 4, the epoch count hard-coded in ``objective``).

    Returns:
        The result of ``MetricEvaluator.evaluate()`` for AUC, MRR, NDCG@5 and NDCG@10.
    """
    # Get best trial hyperparameters
    best_trial = study.best_trial
    hparams_nrms.head_num = best_trial.params['head_num']
    hparams_nrms.head_dim = best_trial.params['head_dim']
    hparams_nrms.attention_hidden_dim = best_trial.params['attention_hidden_dim']
    hparams_nrms.dropout = best_trial.params['dropout']
    hparams_nrms.learning_rate = best_trial.params['learning_rate']

    # Build a fresh model using the best parameters
    pytorch_model = NRMSModel(
        hparams=hparams_nrms,
        word2vec_embedding=word2vec_embedding,
        seed=42,
    )
    model = NRMSWrapper(pytorch_model)

    # Re-train it. Without this step the metrics below would describe a freshly
    # constructed, untrained network rather than the best trial.
    optimizer = torch.optim.Adam(model.model.parameters(), lr=hparams_nrms.learning_rate)
    loss_fn = torch.nn.BCELoss()
    for _ in range(n_epochs):
        model.model.train()
        for inputs, labels in train_dataloader:
            optimizer.zero_grad()
            history, candidates = inputs
            history = torch.from_numpy(history).to(model.device)
            candidates = torch.from_numpy(candidates).to(model.device)
            labels = torch.from_numpy(labels).float().to(model.device)

            outputs = model.model(history, candidates, training=True)
            loss = loss_fn(outputs, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.model.parameters(), max_norm=5.0)
            optimizer.step()

    model.model.eval()

    # Generate predictions for the validation set
    all_outputs = []
    with torch.no_grad():
        for inputs, _ in val_dataloader:
            history, candidates = inputs
            history = torch.from_numpy(history).to(model.device)
            candidates = torch.from_numpy(candidates).to(model.device)
            outputs = model.model(history, candidates, training=False)
            all_outputs.append(outputs.cpu().numpy())

    # Add predictions to the validation DataFrame
    pred_validation = np.concatenate(all_outputs)
    df_validation_with_preds = add_prediction_scores(
        df_validation, pred_validation.tolist()
    ).pipe(
        add_known_user_column, known_users=df_train[DEFAULT_USER_COL]
    )

    # Compute metrics
    metrics = MetricEvaluator(
        labels=df_validation_with_preds["labels"].to_list(),
        predictions=df_validation_with_preds["scores"].to_list(),
        metric_functions=[
            AucScore(),
            MrrScore(),
            NdcgScore(k=5),
            NdcgScore(k=10),
        ],
    )
    metric_results = metrics.evaluate()
    return metric_results

# Evaluate metrics for the best trial
best_trial_metrics = evaluate_best_trial(study)
print("Metrics for Best AUC Trial:", best_trial_metrics)


[I 2024-12-14 23:35:26,853] A new study created in memory with name: no-name-4db291cc-f5a3-442c-9784-9ca6e9b536fb
[W 2024-12-14 23:35:26,948] Trial 0 failed with parameters: {'head_num': 24, 'head_dim': 20, 'attention_hidden_dim': 175, 'dropout': 0.1767020756495025, 'learning_rate': 0.0003604276072918151} because of the following error: OutOfMemoryError('CUDA out of memory. Tried to allocate 212.00 MiB. GPU 0 has a total capacity of 15.77 GiB of which 78.19 MiB is free. Including non-PyTorch memory, this process has 15.69 GiB memory in use. Of the allocated memory 14.65 GiB is allocated by PyTorch, and 675.32 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)').
Traceback (most recent call last):
  File "/dtu/blackhole/14/155764/DeepL-Project-Corn2/.venv

History shape: torch.Size([128, 30, 30])
Candidates shape: torch.Size([128, 7, 30])


OutOfMemoryError: CUDA out of memory. Tried to allocate 212.00 MiB. GPU 0 has a total capacity of 15.77 GiB of which 78.19 MiB is free. Including non-PyTorch memory, this process has 15.69 GiB memory in use. Of the allocated memory 14.65 GiB is allocated by PyTorch, and 675.32 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [53]:
def generate_optuna_plots(study):  # thanks, Claude
    """
    Generate and display multiple Optuna visualization plots.

    The plots are shown in this order: parameter importances, optimization
    history, parallel coordinates, slice plot and contour plot.

    Args:
        study (optuna.study.Study): The Optuna study object to visualize
    """
    # Imported locally so the function does not rely on a `vis` alias that some
    # other cell may or may not have created in the current kernel.
    import optuna.visualization as vis

    plotters = (
        vis.plot_param_importances,     # 1. Param Importances Plot
        vis.plot_optimization_history,  # 2. Optimization History Plot
        vis.plot_parallel_coordinate,   # 3. Parallel Coordinate Plot
        vis.plot_slice,                 # 4. Slice Plot
        vis.plot_contour,               # 5. Contour Plot
    )
    # Build and show one figure at a time, in the order listed above.
    for make_plot in plotters:
        make_plot(study).show()

generate_optuna_plots(study)

# Saving / loading the model (because working on the HPC is annoying)

In [15]:
# MODEL_FILE = f"downloads/models/{MODEL_NAME}.h5" 

# # Save the model after training
# print("Saving the model...")
# os.makedirs(os.path.dirname(MODEL_FILE), exist_ok=True)
# model.model.save(MODEL_FILE)  # Save the full model (architecture + weights)
# print(f"Model saved at {MODEL_FILE}")

##LOAD SAVED MODEL
# from tensorflow.keras.models import load_model

# # Load the saved model
# print(f"Loading the model from {MODEL_FILE}...")
# model.model = load_model(MODEL_FILE)
# print("Model loaded successfully.")


# Example of how to compute some metrics:

In [16]:
# pred_validation = model.scorer.predict(val_dataloader) -- Added to model/training block

## Add the predictions to the dataframe

In [17]:
# Attach the prediction scores to the validation frame, then add the known-user
# column using the user ids found in the training frame.
df_validation = add_known_user_column(
    add_prediction_scores(df_validation, pred_validation.tolist()),
    known_users=df_train[DEFAULT_USER_COL],
)
df_validation.head(2)

user_id,article_id_fixed,scroll_percentage_fixed,read_time_fixed,article_ids_inview,article_ids_clicked,impression_id,labels,inview_hour_differences,inview_article_categories,history_article_categories,inview_article_types,history_article_types,scores,is_known_user
u32,list[i32],list[f32],list[f32],list[i32],list[i32],u32,list[i8],list[f64],list[i64],list[i64],list[str],list[str],list[f64],bool
140115,"[9773282, 9773307, … 9778422]","[null, 29.0, … 100.0]","[15.0, 8.0, … 136.0]","[9780302, 9345280, … 9780325]",[9780195],58448516,"[0, 0, … 0]","[2.175556, 7340.245556, … 2.016389]","[118, 140, … 118]","[118, 140, … 118]","[""article_default"", ""article_default"", … ""article_default""]","[""article_default"", ""article_default"", … ""article_default""]","[0.099918, 0.007427, … 0.08889]",True
377904,"[9779538, 9779629, … 9780020]","[100.0, 97.0, … 100.0]","[40.0, 27.0, … 691.0]","[9781987, 9782315, … 9782180]",[9782180],503085507,"[0, 0, … 1]","[2.044444, 1.551667, … 3.8975]","[2975, 142, … 512]","[118, 142, … 118]","[""article_default"", ""article_default"", … ""article_default""]","[""article_default"", ""article_default"", … ""article_default""]","[0.151382, 0.185197, … 0.206735]",True


### Compute metrics

In [18]:
# Evaluate the stored scores against the labels with AUC, MRR, NDCG@5 and NDCG@10.
metrics = MetricEvaluator(
    labels=df_validation["labels"].to_list(),
    predictions=df_validation["scores"].to_list(),
    metric_functions=[
        AucScore(),
        MrrScore(),
        NdcgScore(k=5),
        NdcgScore(k=10),
    ],
)
metrics.evaluate()

<MetricEvaluator class>: 
 {
    "auc": 0.5410424424355816,
    "mrr": 0.3365010986542997,
    "ndcg@5": 0.3752912206207446,
    "ndcg@10": 0.4535116930280336
}

## Make submission file

In [19]:
# Turn each row's list of scores into a list of ranks, stored as `ranked_scores`.
df_validation = df_validation.with_columns(
    ranked_scores=pl.col("scores").map_elements(
        lambda scores: list(rank_predictions_by_score(scores))
    )
)
df_validation.head(2)

user_id,article_id_fixed,scroll_percentage_fixed,read_time_fixed,article_ids_inview,article_ids_clicked,impression_id,labels,inview_hour_differences,inview_article_categories,history_article_categories,inview_article_types,history_article_types,scores,is_known_user,ranked_scores
u32,list[i32],list[f32],list[f32],list[i32],list[i32],u32,list[i8],list[f64],list[i64],list[i64],list[str],list[str],list[f64],bool,list[i64]
140115,"[9773282, 9773307, … 9778422]","[null, 29.0, … 100.0]","[15.0, 8.0, … 136.0]","[9780302, 9345280, … 9780325]",[9780195],58448516,"[0, 0, … 0]","[2.175556, 7340.245556, … 2.016389]","[118, 140, … 118]","[118, 140, … 118]","[""article_default"", ""article_default"", … ""article_default""]","[""article_default"", ""article_default"", … ""article_default""]","[0.099918, 0.007427, … 0.08889]",True,"[12, 17, … 13]"
377904,"[9779538, 9779629, … 9780020]","[100.0, 97.0, … 100.0]","[40.0, 27.0, … 691.0]","[9781987, 9782315, … 9782180]",[9782180],503085507,"[0, 0, … 1]","[2.044444, 1.551667, … 3.8975]","[2975, 142, … 512]","[118, 142, … 118]","[""article_default"", ""article_default"", … ""article_default""]","[""article_default"", ""article_default"", … ""article_default""]","[0.151382, 0.185197, … 0.206735]",True,"[3, 2, … 1]"


This example uses the validation set; for an actual submission, simply run the test set through the same flow.

In [20]:
# Write the ranked predictions, keyed by impression id, to the submission file.
write_submission_file(
    path="downloads/predictions.txt",
    prediction_scores=df_validation["ranked_scores"],
    impression_ids=df_validation[DEFAULT_IMPRESSION_ID_COL],
)

0it [00:00, ?it/s]

97858it [00:41, 2379.13it/s]


Zipping downloads/predictions.txt to downloads/predictions.zip


# DONE 🚀