# Introduction
This notebook runs the model (with optimal hyperparameters) and finally creates a predictions.txt file that can be submitted to Codabench.


In [None]:
#____————____————____————____————
# Run configuration: history size, sampling fractions and number of epochs.
# ____————____————____————____————
HISTORY_SIZE = 20  # history length used when loading behaviors and set on hparams_nrms (alternative value: 30)
FRACTION = 0.001  # fraction of the train/validation rows kept by .sample()
EPOCHS = 2   # number of epochs passed to model.fit
FRACTION_testset = 0.0001  # fraction of the test-set rows kept by .sample()
#____————____————____————____————

In [3]:
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import tensorflow as tf
import polars as pl
from tensorflow.python.client import device_lib
import numpy as np

from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_SUBTITLE_COL,
    DEFAULT_LABELS_COL,
    DEFAULT_TITLE_COL,
    DEFAULT_USER_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_ARTICLE_ID_COL,
    DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL, #-------
    DEFAULT_HISTORY_READ_TIME_COL #-------
)

from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history,
)
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
from ebrec.utils._articles import convert_text2encoding_with_transformers
from ebrec.utils._polars import concat_str_columns, slice_join_dataframes
from ebrec.utils._articles import create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._python import write_submission_file, rank_predictions_by_score

from ebrec.models.newsrec.dataloader import NRMSDataLoader
from ebrec.models.newsrec.model_config import hparams_nrms
from ebrec.models.newsrec import NRMSModel, NRMSWrapper

In [None]:
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
# Setup and load everything
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

print("Loading data")

# Ask TensorFlow which GPUs it can see and, when there is at least one,
# expose only the first of them to the runtime.
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    try:
        tf.config.set_visible_devices(gpus[0], "GPU")
        logical_gpus = tf.config.list_logical_devices("GPU")
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as err:
        # Raised when visibility is changed after the GPUs were initialized.
        print(err)

# Report the GPUs TensorFlow detected (an empty list when none were found).
physical_devices = tf.config.list_physical_devices("GPU")
print("Available devices:", physical_devices)


#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
## Load dataset
# #-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
# def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
#     """
#     Load ebnerd - function
#     """
#     df_history = (
#         pl.scan_parquet(path.joinpath("history.parquet"))
#         # .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL,DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL)
#         .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL,DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL,DEFAULT_HISTORY_READ_TIME_COL) #------------
#         .pipe(
#             truncate_history,
#             column=DEFAULT_HISTORY_ARTICLE_ID_COL,
#             history_size=history_size,
#             padding_value=0,
#             enable_warning=False,
#         )
#     )
#     df_behaviors = (
#         pl.scan_parquet(path.joinpath("behaviors.parquet"))
#         .collect()
#         .pipe(
#             slice_join_dataframes,
#             df2=df_history.collect(),
#             on=DEFAULT_USER_COL,
#             how="left",
#         )
#     )
    
#     return df_behaviors


def ebnerd_from_path(
    path: Path,
    history_size: int = 30,
    padding: int = 0,
    user_col: str = DEFAULT_USER_COL,
    history_aids_col: str = DEFAULT_HISTORY_ARTICLE_ID_COL,
) -> pl.DataFrame:
    """
    Load one ebnerd split directory into a single behaviors DataFrame.

    Reads ``history.parquet`` and ``behaviors.parquet`` from ``path``, keeps
    only ``user_col`` and ``history_aids_col`` from the history, passes the
    history through ``truncate_history`` and left-joins the result onto the
    behaviors on ``user_col``.

    Args:
        path: Directory containing ``history.parquet`` and ``behaviors.parquet``.
        history_size: Forwarded to ``truncate_history`` as ``history_size``.
        padding: Forwarded to ``truncate_history`` as ``padding_value``.
        user_col: Join key column; selected from the history and used as ``on``.
        history_aids_col: History column that is selected and truncated.

    Returns:
        The result of ``slice_join_dataframes`` applied to the collected
        behaviors and the collected, truncated history.
    """
    # The history stays lazy until it is needed for the join.
    history_lazy = pl.scan_parquet(path.joinpath("history.parquet")).select(
        user_col, history_aids_col
    )
    truncated_history = truncate_history(
        history_lazy,
        column=history_aids_col,
        history_size=history_size,
        padding_value=padding,
        enable_warning=False,
    )

    # Behaviors are materialised first, then joined with the materialised history.
    behaviors = pl.scan_parquet(path.joinpath("behaviors.parquet")).collect()
    return slice_join_dataframes(
        behaviors,
        df2=truncated_history.collect(),
        on=user_col,
        how="left",
    )
  
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
  ### Generate labels
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
# We sample a few just to get started. For testset we just make up a dummy column with 0 and 1 - this is not the true labels.

# Root directory that holds the data splits (machine-specific absolute path).
PATH = Path("/dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data").expanduser()
DATASPLIT = "ebnerd_small"  # TODO: if this is changed, presumably make_embedding_artifacts.ipynb (embeddings) must be changed as well — confirm

# DATASPLIT = "ebnerd__testset"
# Output directory for generated artifacts; created (with parents) if missing.
DUMP_DIR = PATH.joinpath("dump_artifacts")
DUMP_DIR.mkdir(exist_ok=True, parents=True)

# In this example we sample the dataset just to keep it small. The test set can be added in the same way as the validation set.

### Define the Data Cols -- New ones here
# COLUMNS = [
#     DEFAULT_USER_COL,
#     DEFAULT_HISTORY_ARTICLE_ID_COL,
#     DEFAULT_HISTORY_SCROLL_PERCENTAGE_COL, #--------neu 
#     DEFAULT_HISTORY_READ_TIME_COL, #------- neu
#     DEFAULT_INVIEW_ARTICLES_COL,
#     DEFAULT_CLICKED_ARTICLES_COL,
#     DEFAULT_IMPRESSION_ID_COL,
#     DEFAULT_IMPRESSION_TIMESTAMP_COL,
# ]
# Columns kept from the joined behaviors/history frame for training and validation.
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
]


# Echo the run configuration so it is recorded next to the results.
print("____————____————____————____———")
print("HISTORY_SIZE:", HISTORY_SIZE)
print("FRACTION:", FRACTION)
print("EPOCHS:", EPOCHS)
print("FRACTION_testset:", FRACTION_testset)
print("____————____————____————____———")
print("")
#____————____————____————____————

# Training frame: load the train split, apply the seeded wu2019 sampling
# (npratio=6), add binary labels, then keep FRACTION of the rows.
# The row sub-sampling is seeded as well; previously it was unseeded, so the
# subset changed on every run even though the sampling strategy was seeded.
df_train = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=6,
        shuffle=True,
        with_replacement=True,
        seed=123,
    )
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION, seed=123)
)
# Validation frame: same loading and labelling, no sampling strategy,
# and the same seeded row sub-sampling.
df_validation = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION, seed=123)
)
print(df_train.head(2))
print(df_validation.head(2))


#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
## Load articles
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
df_articles = pl.read_parquet(PATH.joinpath(DATASPLIT, "articles.parquet"))
# Last expression of the cell: the notebook displays the first two article rows.
df_articles.head(2)




Loading data
Available devices: []
____————____————____————____———
HISTORY_SIZE: 20
FRACTION: 0.1
____————____————____————____———

shape: (2, 7)
┌─────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬─────────────┐
│ user_id ┆ impression_i ┆ impression_t ┆ article_id_f ┆ article_ids_ ┆ article_ids_ ┆ labels      │
│ ---     ┆ d            ┆ ime          ┆ ixed         ┆ clicked      ┆ inview       ┆ ---         │
│ u32     ┆ ---          ┆ ---          ┆ ---          ┆ ---          ┆ ---          ┆ list[i8]    │
│         ┆ u32          ┆ datetime[μs] ┆ list[i32]    ┆ list[i64]    ┆ list[i64]    ┆             │
╞═════════╪══════════════╪══════════════╪══════════════╪══════════════╪══════════════╪═════════════╡
│ 475419  ┆ 308292451    ┆ 2023-05-20   ┆ [9764422,    ┆ [9772355]    ┆ [9494434,    ┆ [0, 0, … 0] │
│         ┆              ┆ 05:56:28     ┆ 9764822, …   ┆              ┆ 9773084, …   ┆             │
│         ┆              ┆              ┆ 97590

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3001353,"""Natascha var i…","""Politiet frygt…",2023-06-29 06:20:33,False,"""Sagen om den ø…",2006-08-31 08:06:45,[3150850],"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative"""
3003065,"""Kun Star Wars …","""Biografgængern…",2023-06-29 06:20:35,False,"""Vatikanet har …",2006-05-21 16:57:00,[3006712],"""article_defaul…","""https://ekstra…",[],[],"[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive"""


In [12]:
# #-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

# # Loading the article embeddings and other feature stuff

# #-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

# ### Added features and hourly difference between published and viewed article

# ## NEW

# from sklearn.preprocessing import StandardScaler

# # Convert polars DataFrame to pandas
# df_train = df_train.to_pandas()

# # Create a mapping dictionary from article_id to last_modified_time
# article_time_dict = df_articles.select(
#     "article_id", 
#     "published_time"
# ).to_dict(as_series=False)
# article_time_dict = dict(zip(
#     article_time_dict["article_id"], 
#     article_time_dict["published_time"]
# ))

# # Create a function to map article IDs to their timestamps
# def get_article_times(article_ids):
#     return [article_time_dict.get(aid, None) for aid in article_ids]

# # Add new column with the published-time
# df_train["inview_article_times"] = df_train["article_ids_inview"].apply(get_article_times)

# #add new column with the last publish_time for the clicked article
# df_train["clicked_article_time"] = df_train["article_ids_clicked"].apply(get_article_times)

# # Create a function to calculate hour differences
# def calculate_hour_differences(impression_time, article_times):
#         # If article_times is a single value (for clicked articles)
#     if not isinstance(article_times, list):
#         if article_times is None:
#             return None
#         return (impression_time - article_times).total_seconds() / 3600
    
#     # If article_times is a list (for inview articles)
#     differences = [(impression_time - article_time).total_seconds() / 3600 
#                   if article_time is not None else None 
#                   for article_time in article_times]
#     return differences

# # Use for inview articles
# df_train['inview_hour_differences'] = df_train.apply(
#     lambda row: calculate_hour_differences(row['impression_time'], row['inview_article_times']), 
#     axis=1
# )

# # # Use for clicked article
# # df_train['clicked_hour_difference'] = df_train.apply(
# #    lambda row: calculate_hour_differences(row['impression_time'], row['clicked_article_time']), 
# #    axis=1
# # )

# # Create a mapping dictionary from article_id to last_modified_category
# article_cat_dict = df_articles.select(
#     "article_id", 
#     "category"
# ).to_dict(as_series=False)
# article_cat_dict = dict(zip(
#     article_cat_dict["article_id"], 
#     article_cat_dict["category"]
# ))

# # Create a function to map article IDs to their category
# def get_article_category(article_ids):
#     return [article_cat_dict.get(aid, None) for aid in article_ids]

# #  Add new column with the article category
# df_train["inview_article_categories"] = df_train["article_ids_inview"].apply(get_article_category)

# df_train["history_article_categories"] = df_train["article_id_fixed"].apply(get_article_category)

# # Create a mapping dictionary from article_id to article_type
# article_type_dict = df_articles.select(
#     "article_id", 
#     "article_type"
# ).to_dict(as_series=False)
# article_type_dict = dict(zip(
#     article_type_dict["article_id"], 
#     article_type_dict["article_type"]
# ))

# # Create a function to map article IDs to their type
# def get_article_type(article_ids):
#     return [article_type_dict.get(aid, None) for aid in article_ids]

# # Add new column with the article type
# df_train["inview_article_types"] = df_train["article_ids_inview"].apply(get_article_type)

# df_train["history_article_types"] = df_train["article_id_fixed"].apply(get_article_type)

# #drop columns with the time
# df_train = df_train.drop(['inview_article_times', 'clicked_article_time','impression_time'], axis=1)

# df_train = pl.from_pandas(df_train)

# df_train.head(2)


# ##########################################################################################


# # Convert polars DataFrame to pandas
# df_validation = df_validation.to_pandas()

# # Create a mapping dictionary from article_id to last_modified_time
# article_time_dict = df_articles.select(
#     "article_id", 
#     "published_time"
# ).to_dict(as_series=False)
# article_time_dict = dict(zip(
#     article_time_dict["article_id"], 
#     article_time_dict["published_time"]
# ))

# # Create a function to map article IDs to their timestamps
# def get_article_times(article_ids):
#     return [article_time_dict.get(aid, None) for aid in article_ids]

# # Add new column with the published-time
# df_validation["inview_article_times"] = df_validation["article_ids_inview"].apply(get_article_times)

# #add new column with the last publish_time for the clicked article
# df_validation["clicked_article_time"] = df_validation["article_ids_clicked"].apply(get_article_times)

# # Create a function to calculate hour differences
# def calculate_hour_differences(impression_time, article_times):
#         # If article_times is a single value (for clicked articles)
#     if not isinstance(article_times, list):
#         if article_times is None:
#             return None
#         return (impression_time - article_times).total_seconds() / 3600
    
#     # If article_times is a list (for inview articles)
#     differences = [(impression_time - article_time).total_seconds() / 3600 
#                   if article_time is not None else None 
#                   for article_time in article_times]
#     return differences

# # Use for inview articles
# df_validation['inview_hour_differences'] = df_validation.apply(
#     lambda row: calculate_hour_differences(row['impression_time'], row['inview_article_times']), 
#     axis=1
# )

# # # Use for clicked article -- might be leaky??
# # df_validation['clicked_hour_difference'] = df_validation.apply(
# #    lambda row: calculate_hour_differences(row['impression_time'], row['clicked_article_time']), 
# #    axis=1
# # )
# # Create a mapping dictionary from article_id to last_modified_category
# article_cat_dict = df_articles.select(
#     "article_id", 
#     "category"
# ).to_dict(as_series=False)
# article_cat_dict = dict(zip(
#     article_cat_dict["article_id"], 
#     article_cat_dict["category"]
# ))

# # Create a function to map article IDs to their category
# def get_article_category(article_ids):
#     return [article_cat_dict.get(aid, None) for aid in article_ids]

# #  Add new column with the article category
# df_validation["inview_article_categories"] = df_validation["article_ids_inview"].apply(get_article_category)

# df_validation["history_article_categories"] = df_validation["article_id_fixed"].apply(get_article_category)

# # Create a mapping dictionary from article_id to article_type
# article_type_dict = df_articles.select(
#     "article_id", 
#     "article_type"
# ).to_dict(as_series=False)
# article_type_dict = dict(zip(
#     article_type_dict["article_id"], 
#     article_type_dict["article_type"]
# ))

# # Create a function to map article IDs to their type
# def get_article_type(article_ids):
#     return [article_type_dict.get(aid, None) for aid in article_ids]

# # Add new column with the article type
# df_validation["inview_article_types"] = df_validation["article_ids_inview"].apply(get_article_type)

# df_validation["history_article_types"] = df_validation["article_id_fixed"].apply(get_article_type)


# #drop columns with the time
# df_validation = df_validation.drop(['inview_article_times', 'clicked_article_time','impression_time'], axis=1)


# df_validation = pl.from_pandas(df_validation)

# df_validation.head(2)

In [13]:
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
## Init model using HuggingFace's tokenizer and wordembedding

# In the original implementation, they use the GloVe embeddings and tokenizer. To get going fast, we'll use a multilingual LLM from Hugging Face. 
# Utilizing the tokenizer to tokenize the articles and the word-embedding to init NRMS.
#-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

# TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
# TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-large"
# TRANSFORMER_MODEL_NAME = "google-bert/bert-base-multilingual-uncased" 
#Argue for cased vs uncased.  TODO
# #Cased might be better but to compare with malteHb we use uncased

# Hugging Face model id used for both the tokenizer and the initial word embeddings.
TRANSFORMER_MODEL_NAME = "Maltehb/danish-bert-botxo"
# Article text columns that are concatenated before tokenization.
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
# max_length passed to the text-to-token encoding step below.
# NOTE(review): changing this reportedly raises an error — presumably a title size is
# fixed elsewhere (e.g. in the model hyperparameters); confirm before changing.
MAX_TITLE_LENGTH = 30 #hardcoded somewhere ?? error if change

print("")
print("____————____————____————____———")
print("Using transformer model:", TRANSFORMER_MODEL_NAME)
print("____————____————____————____———")
print("")

# Load the pretrained model and its tokenizer from Hugging Face.
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# The word embeddings used to initialise NRMS are extracted from the transformer model.
word2vec_embedding = get_transformers_word_embeddings(transformer_model)
#


# Concatenate the text columns; cat_cal is the name of the resulting column.
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
# Tokenize the concatenated text; token_col_title is the name of the encoded column.
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)

# Lookup from article id to its encoded text, consumed by the dataloaders.
article_mapping = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)

#_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

# print("df_train columns:", df_train.columns)
# print("df_validation columns:", df_validation.columns)
#_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_


____————____————____————____———
Using transformer model: Maltehb/danish-bert-botxo
____————____————____————____———



# Initiate the dataloaders
In this implementation the models and the data are decoupled. Hence, you should build a dataloader that fits your needs.

In [14]:
# Training dataloader: built from df_train with eval_mode=False and batches of 128.
train_dataloader = NRMSDataLoader(
    behaviors=df_train,
    article_dict=article_mapping,
    unknown_representation="zeros",  # presumably: ids missing from article_mapping are encoded as zeros — confirm
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=False,
    batch_size=128,
)
# Validation dataloader: same article mapping, eval_mode=True and batches of 64.
val_dataloader = NRMSDataLoader(
    behaviors=df_validation,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=True,
    batch_size=64,
)


## Train the model


In [15]:
import torch
# List the GPUs visible to TensorFlow.
physical_devices = tf.config.list_physical_devices('GPU')
print("Available devices:", physical_devices)
import torch.nn as nn
# Report whether PyTorch can use CUDA (checked separately from the TensorFlow device list above).
print(torch.cuda.is_available())

Available devices: []
True


In [None]:
# _-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
# Original Model
# _-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

# Works fine -- the number of epochs is set via EPOCHS in the configuration cell at the top

import os
from tqdm.notebook import tqdm

# Names and relative output locations for this model run.
MODEL_NAME = "NRMS"
LOG_DIR = f"downloads/runs/{MODEL_NAME}"
WEIGHTS_DIR = f"downloads/data/state_dict/{MODEL_NAME}"
# File the checkpoint callback writes to and that is reloaded after training.
MODEL_WEIGHTS = f"{WEIGHTS_DIR}/weights.pt"

# Create both directories up front; existing directories are left untouched.
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(WEIGHTS_DIR, exist_ok=True)

# Create a custom ModelCheckpoint for PyTorch
class PyTorchModelCheckpoint:
    """Keras-style checkpoint callback that saves weights through a model wrapper.

    After each epoch the callback looks up ``'val_loss'`` in ``logs`` and calls
    ``model_wrapper.save_weights(filepath)`` either every time
    (``save_best_only=False``) or only when the validation loss is strictly
    lower than the best value seen so far (``save_best_only=True``).

    Args:
        filepath: Destination handed to ``model_wrapper.save_weights``.
        model_wrapper: Object exposing ``save_weights(filepath)``. It must be
            set before the first save is attempted.
        save_best_only: Save only on validation-loss improvement when True.
        save_weights_only: Stored for API compatibility; not consulted here.
        verbose: When truthy, print a message on each improvement.
    """

    def __init__(self, filepath, model_wrapper=None, save_best_only=True, save_weights_only=True, verbose=1):
        self.filepath = filepath
        self.model_wrapper = model_wrapper  # wrapper whose save_weights() is called
        self.save_best_only = save_best_only
        self.save_weights_only = save_weights_only
        self.verbose = verbose
        # Start at +inf so the first reported validation loss always counts as an improvement.
        self.best_val_loss = float('inf')

    def on_epoch_end(self, epoch, logs=None):
        """Save weights according to the configured policy.

        Args:
            epoch: Index of the finished epoch (unused).
            logs: Mapping of metric names to values, or None. Nothing is
                saved when it is None/empty or has no ``'val_loss'`` entry.
        """
        # `logs` defaults to None; treat that as "no metrics" instead of
        # failing with AttributeError on None.get(...).
        val_loss = (logs or {}).get('val_loss', None)
        if val_loss is None:
            return

        if not self.save_best_only:
            # Unconditional checkpointing: best_val_loss is not tracked in this mode.
            self.model_wrapper.save_weights(self.filepath)
            return

        if val_loss < self.best_val_loss:
            if self.verbose:
                print(f'\nValidation loss improved from {self.best_val_loss:.5f} to {val_loss:.5f}')
            self.best_val_loss = val_loss
            self.model_wrapper.save_weights(self.filepath)

# Initialize model first
# Configure the shared hparams_nrms object in place before building the model.
hparams_nrms.history_size = HISTORY_SIZE

# Best hyperparameters: {'head_num': 19, 'head_dim': 29, 'attention_hidden_dim': 145, 'dropout': 0.22088583494496855, 'learning_rate': 0.00030309205322750723}

# Apply the best hyperparameters listed above.
hparams_nrms.head_num = 19
hparams_nrms.head_dim = 29
hparams_nrms.attention_hidden_dim = 145
hparams_nrms.dropout = 0.22088583494496855
hparams_nrms.learning_rate = 0.00030309205322750723

# Build the NRMS model from the hyperparameters and the transformer word embeddings,
# then wrap it; the wrapper provides fit / predict / save_weights / load_weights.
pytorch_model = NRMSModel(
    hparams=hparams_nrms,
    word2vec_embedding=word2vec_embedding,
    seed=42,
)
model = NRMSWrapper(pytorch_model)

# The checkpoint callback needs the wrapper, so it is created after the model.
modelcheckpoint = PyTorchModelCheckpoint(
    filepath=MODEL_WEIGHTS,
    model_wrapper=model,  # Pass the model wrapper
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

# Train for EPOCHS epochs, validating on val_dataloader and checkpointing via the callback.
hist = model.fit(
    train_dataloader,
    validation_data=val_dataloader,
    epochs=EPOCHS, ### EPOCHS INPUT
    callbacks=[modelcheckpoint]
)

# Reload the weights stored at MODEL_WEIGHTS (written by the checkpoint on val_loss improvement).
# NOTE(review): if the callback never saved during this run, this presumably fails or
# loads a file left by an earlier run — confirm.
model.load_weights(filepath=MODEL_WEIGHTS)

# Score the validation set with the reloaded weights.
pred_validation = model.predict(val_dataloader)

Epoch 1/3 [Train]:   0%|          | 0/184 [00:00<?, ?it/s]

Epoch 1/3 [Train]: 100%|██████████| 184/184 [00:26<00:00,  6.90it/s, loss=0.3976]
Epoch 1/3 [Valid]: 100%|██████████| 383/383 [01:22<00:00,  4.62it/s, loss=0.6304]


Epoch 1 - Train Loss: 0.3976, Val Loss: 0.3028

Validation loss improved from inf to 0.30284


Epoch 2/3 [Train]: 100%|██████████| 184/184 [00:26<00:00,  6.88it/s, loss=0.3811]
Epoch 2/3 [Valid]: 100%|██████████| 383/383 [01:23<00:00,  4.59it/s, loss=0.6641]


Epoch 2 - Train Loss: 0.3811, Val Loss: 0.3190


Epoch 3/3 [Train]: 100%|██████████| 184/184 [00:26<00:00,  6.85it/s, loss=0.3737]
Epoch 3/3 [Valid]: 100%|██████████| 383/383 [01:23<00:00,  4.61it/s, loss=0.7010]
  self.model.load_state_dict(torch.load(filepath))


Epoch 3 - Train Loss: 0.3737, Val Loss: 0.3368


Predicting: 100%|██████████| 383/383 [01:23<00:00,  4.61it/s]


# DONE ⛄️

## Test set

In [None]:
# NEW (OLD) CODE

import datetime
# Same column selection as used for the earlier train/validation loading.
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
]
# Reload the train split (npratio=4 here) and keep FRACTION of the rows.
# NOTE(review): .sample() is unseeded, so the selected rows differ between runs.
df = (
    ebnerd_from_path(
        PATH.joinpath(DATASPLIT, "train"),
        history_size=HISTORY_SIZE,
        padding=0,
    )
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,
        shuffle=True,
        with_replacement=True,
        seed=123,
    )
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)

# Time-based split: impressions from the last day before the latest timestamp go to validation.
# NOTE(review): df / df_train / df_validation are overwritten here after the model has
# already been trained, and no later active code in this notebook reads them.
dt_split = pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).max() - datetime.timedelta(days=1)
df_train = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL) < dt_split)
df_validation = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL) >= dt_split)

print(f"Train samples: {df_train.height}\nValidation samples: {df_validation.height}")
df_train.head(2)

# Same data root as before; the test data lives in its own split directory.
PATH= Path("/dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data").expanduser()
datasplit="ebnerd_testset"
# The clicked-articles column is not selected for the test set.
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    # DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
]

# Load the test split and keep FRACTION_testset of the rows (unseeded sample).
df_test = (
    ebnerd_from_path(
        PATH.joinpath(datasplit, "test"),
        # PATH.joinpath(DATASPLIT, "validation"),
        history_size=HISTORY_SIZE,
        padding=0,
    )
    .select(COLUMNS)
    # .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION_testset)
)
print("Fraction of test data:", FRACTION_testset)


Train samples: 20156
Validation samples: 3271
Fraction of test data: 0.0001


In [18]:
# Create dummy labels (the literal 0 on every row, not real labels); per the original note the dataloader removes them anyway.

df_test = df_test.with_columns(pl.lit(0).alias("labels")) 

In [19]:
# Batch size used when predicting on the test set.
BATCH_SIZE_TEST = 32

# Test dataloader: same settings as the validation loader (eval_mode=True), built from df_test.
test_dataloader = NRMSDataLoader(
    behaviors=df_test,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=True,
    batch_size=BATCH_SIZE_TEST,
)


In [20]:
print("Started doing predictions on testset")
# NOTE(review): validation predictions use model.predict(...), whereas this calls
# model.scorer.predict(...) — confirm both entry points are equivalent on the wrapper.
pred_test = model.scorer.predict(test_dataloader)
# Attach the predictions to df_test; the later ranking step reads them from the "scores" column.
df_test = add_prediction_scores(df_test, pred_test.tolist())


Started doing predictions on testset


Predicting: 100%|██████████| 43/43 [00:06<00:00,  6.35it/s]


In [21]:
# metrics = MetricEvaluator(
#     labels=df_test["labels"].to_list(),
#     predictions=df_test["scores"].to_list(),
#     metric_functions=[AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)],
# )
# metrics.evaluate() ### It is supposed to be commented out -- metrics cannot be computed on an unlabeled dataset

### make submissions file

In [22]:
# Turn each impression's score list into a list of ranks, stored as "ranked_scores".
# In the sample output the highest score appears to receive rank 1 — confirm against rank_predictions_by_score.
df_test = df_test.with_columns(
    pl.col("scores")
    .map_elements(lambda x: list(rank_predictions_by_score(x)))
    .alias("ranked_scores")
)
# Last expression of the cell: the notebook displays the first two rows.
df_test.head(2)

user_id,impression_id,impression_time,article_id_fixed,article_ids_inview,labels,scores,ranked_scores
u32,u32,datetime[μs],list[i32],list[i32],i32,list[f64],list[i64]
1865414,29513300,2023-06-02 14:57:18,"[9779511, 9778939, … 9203696]","[8054212, 9778257, … 9794017]",0,"[0.028204, 0.1812, … 0.203133]","[5, 2, … 1]"
73278,281985418,2023-06-04 08:18:40,"[9789446, 9788116, … 9789832]","[9795545, 9790335, … 9796792]",0,"[0.089995, 0.126175, … 0.110625]","[4, 2, … 3]"


In [23]:
# Write the ranked predictions to predictions.txt in DUMP_DIR and zip them for upload.
write_submission_file(
    impression_ids=df_test[DEFAULT_IMPRESSION_ID_COL],
    prediction_scores=df_test["ranked_scores"],
    path=DUMP_DIR.joinpath("predictions.txt"),
    filename_zip="predictions__.zip",  # plain string: there is nothing to interpolate
)
print("Submission file created!")
print("DONE")
print("")


1353it [00:00, 23575.84it/s]

Zipping /dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data/dump_artifacts/predictions.txt to /dtu/blackhole/14/155764/DeepL-Project-Corn2/ebnerd-benchmark-copy/ebnerd_data/dump_artifacts/predictions__.zip
Submission file created!
DONE






In [16]:
# Seasonal sign-off banner, emitted one line at a time.
for banner_line in (
    "Merry Christmas!",
    "    *  *  ",
    "  *  -  - *",
    "   / O  O \\ ",
    "  (    >    )",
    "   \\ '===' /",
    "   /|     |\\",
    "  /_|     |_\\",
):
    print(banner_line)

Merry Christmas!
    *  *  
  *  -  - *
   / O  O \ 
  (    >    )
   \ '===' /
   /|     |\
  /_|     |_\


## Experimental: chunked test-set prediction (currently commented out)

In [13]:
# TEST_FRACITON=0.002
# DEFAULT_IS_BEYOND_ACCURACY_COL = "is_beyond_accuracy"
# from ebrec.utils._polars import split_df_chunks, concat_str_columns

# NUM_CHUNKS_TEST = 10
# CHUNKS_DONE = 0 

In [10]:
# # =====================================================================================
# print("Initiating testset...")
# df_test = (
#     ebnerd_from_path(
#         PATH.joinpath("ebnerd_testset", "test"),
#         history_size=HISTORY_SIZE,
#         padding=0,
#     )
#     .sample(fraction=TEST_FRACITON)
#     .with_columns(
#         pl.col(DEFAULT_INVIEW_ARTICLES_COL)
#         .list.first()
#         .alias(DEFAULT_CLICKED_ARTICLES_COL)
#     )
#     .select(COLUMNS + [DEFAULT_IS_BEYOND_ACCURACY_COL])
#     .with_columns(
#         pl.col(DEFAULT_INVIEW_ARTICLES_COL)
#         .list.eval(pl.element() * 0)
#         .alias(DEFAULT_LABELS_COL)
#     )
# )
# # Split test in beyond-accuracy TRUE / FALSE. In the BA 'article_ids_inview' is 250.
# df_test_wo_beyond = df_test.filter(~pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL))
# df_test_w_beyond = df_test.filter(pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL))

# df_test_chunks = split_df_chunks(df_test_wo_beyond, n_chunks=NUM_CHUNKS_TEST)
# df_pred_test_wo_beyond = []
# print("Initiating testset without beyond-accuracy...")

Initiating testset...
Initiating testset without beyond-accuracy...


In [11]:
# df_test_w_beyond.shape

(393, 8)

In [12]:
# df_test_wo_beyond.shape

(26680, 8)

In [14]:
# BATCH_SIZE_TEST_WO_B = 32
# BATCH_SIZE_TEST_W_B = 32

In [15]:
# import datetime as dt
# # Dump paths:
# DUMP_DIR = Path("ebnerd_predictions")
# DUMP_DIR.mkdir(exist_ok=True, parents=True)
# #
# DT_NOW = dt.datetime.now()
# #
# MODEL_NAME ="NRMS_MODEL"
# MODEL_OUTPUT_NAME = f"{MODEL_NAME}-{DT_NOW}"
# #
# ARTIFACT_DIR = DUMP_DIR.joinpath("test_predictions", MODEL_OUTPUT_NAME)

In [16]:
# TEST_CHUNKS_DIR = ARTIFACT_DIR.joinpath("test_chunks")
# TEST_CHUNKS_DIR.mkdir(parents=True, exist_ok=True)

In [17]:
# import gc
# import shutil


In [None]:

# for i, df_test_chunk in enumerate(df_test_chunks[CHUNKS_DONE:], start=1 + CHUNKS_DONE):
#     print(f"Test chunk: {i}/{len(df_test_chunks)}")
#     # Initialize DataLoader
#     test_dataloader_wo_b = NRMSDataLoader(
#         behaviors=df_test_chunk,
#         article_dict=article_mapping,
#         unknown_representation="zeros",
#         history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
#         eval_mode=True,
#         batch_size=BATCH_SIZE_TEST_WO_B,
#     )
#     # Predict and clear session
#     scores = model.scorer.predict(test_dataloader_wo_b)
#     tf.keras.backend.clear_session()

#     # Process the predictions
#     df_test_chunk = add_prediction_scores(df_test_chunk, scores.tolist()).with_columns(
#         pl.col("scores")
#         .map_elements(lambda x: list(rank_predictions_by_score(x)))
#         .alias("ranked_scores")
#     )

#     # Save the processed chunk
#     df_test_chunk.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet(
#         TEST_CHUNKS_DIR.joinpath(f"pred_wo_ba_{i}.parquet")
#     )

#     # Append and clean up
#     df_pred_test_wo_beyond.append(df_test_chunk)

#     # Cleanup
#     del df_test_chunk, test_dataloader_wo_b, scores
#     gc.collect()

# df_pred_test_wo_beyond = pl.concat(df_pred_test_wo_beyond)
# df_pred_test_wo_beyond.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet(
#     TEST_CHUNKS_DIR.joinpath("pred_wo_ba.parquet")
# )

Test chunk: 1/10


Predicting: 100%|██████████| 84/84 [00:09<00:00,  8.78it/s]


Test chunk: 2/10


Predicting: 100%|██████████| 84/84 [00:09<00:00,  8.70it/s]


Test chunk: 3/10


Predicting: 100%|██████████| 84/84 [00:09<00:00,  8.73it/s]


Test chunk: 4/10


Predicting: 100%|██████████| 84/84 [00:09<00:00,  8.83it/s]


Test chunk: 5/10


Predicting: 100%|██████████| 84/84 [00:09<00:00,  8.88it/s]


Test chunk: 6/10


Predicting: 100%|██████████| 84/84 [00:09<00:00,  8.70it/s]


Test chunk: 7/10


Predicting: 100%|██████████| 84/84 [00:09<00:00,  8.80it/s]


Test chunk: 8/10


Predicting: 100%|██████████| 84/84 [00:09<00:00,  8.79it/s]


Test chunk: 9/10


Predicting: 100%|██████████| 84/84 [00:09<00:00,  8.73it/s]


Test chunk: 10/10


Predicting: 100%|██████████| 84/84 [00:09<00:00,  8.68it/s]


In [22]:
# df_test_w_beyond_chunks = split_df_chunks(df_test_w_beyond, n_chunks=NUM_CHUNKS_TEST)
# df_pred_test_w_beyond = []
# print("Initiating testset without beyond-accuracy...")

Initiating testset without beyond-accuracy...


In [None]:
# for i, df_test_w_beyond_chunk in enumerate(df_test_w_beyond_chunks[CHUNKS_DONE:], start=1 + CHUNKS_DONE):
#     print(f"Test chunk: {i}/{len(df_test_w_beyond_chunks)}")
#     # Initialize DataLoader
#     test_dataloader_w_b = NRMSDataLoader(
#         behaviors=df_test_w_beyond_chunk,
#         article_dict=article_mapping,
#         unknown_representation="zeros",
#         history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
#         eval_mode=True,
#         batch_size=BATCH_SIZE_TEST_W_B,
#     )
#     # Predict and clear session
#     scores = model.scorer.predict(test_dataloader_w_b)
#     tf.keras.backend.clear_session()

#     # Process the predictions
#     df_test_w_beyond_chunk = add_prediction_scores(
#         df_test_w_beyond_chunk, scores.tolist()
#     ).with_columns(
#         pl.col("scores")
#         .map_elements(lambda x: list(rank_predictions_by_score(x)))
#         .alias("ranked_scores")
#     )

#     # Save the processed chunk
#     df_test_w_beyond_chunk.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet(
#         TEST_CHUNKS_DIR.joinpath(f"pred_w_ba_{i}.parquet")
#     )

#     # Append and clean up
#     df_pred_test_w_beyond.append(df_test_w_beyond_chunk)

#     # Cleanup
#     del df_test_w_beyond_chunk, test_dataloader_w_b, scores
#     gc.collect()

# df_pred_test_w_beyond = pl.concat(df_pred_test_w_beyond)
# df_pred_test_w_beyond.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet(
#     TEST_CHUNKS_DIR.joinpath("pred_w_ba.parquet")
# )

In [None]:

# # =====================================================================================
# print("Initiating testset with beyond-accuracy...")
# test_dataloader_w_b = NRMSDataLoader(
#     behaviors=df_test_w_beyond,
#     article_dict=article_mapping,
#     unknown_representation="zeros",
#     history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
#     eval_mode=True,
#     batch_size=BATCH_SIZE_TEST_W_B,
# )
# scores = model.scorer.predict(test_dataloader_w_b)
# df_pred_test_w_beyond = add_prediction_scores(
#     df_test_w_beyond, scores.tolist()
# ).with_columns(
#     pl.col("scores")
#     .map_elements(lambda x: list(rank_predictions_by_score(x)))
#     .alias("ranked_scores")
# )
# df_pred_test_w_beyond.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet(
#     TEST_CHUNKS_DIR.joinpath("pred_w_ba.parquet")
# )


In [None]:

# # =====================================================================================
# print("Saving prediction results...")
# df_test = pl.concat([df_pred_test_wo_beyond, df_pred_test_w_beyond])
# df_test.select(DEFAULT_IMPRESSION_ID_COL, "ranked_scores").write_parquet(
#     ARTIFACT_DIR.joinpath("test_predictions.parquet")
# )

# if TEST_CHUNKS_DIR.exists() and TEST_CHUNKS_DIR.is_dir():
#     shutil.rmtree(TEST_CHUNKS_DIR)

# write_submission_file(
#     impression_ids=df_test[DEFAULT_IMPRESSION_ID_COL],
#     prediction_scores=df_test["ranked_scores"],
#     path=ARTIFACT_DIR.joinpath("predictions.txt"),
#     filename_zip=f"{MODEL_NAME}-{SEED}-{DATASPLIT}.zip",
# )
