## Initialization & Model Parameters

In [37]:
import os
from argparse import Namespace
from pathlib import Path

import polars as pl
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm

from models.dataloader_pytorch import NRMSDataLoader
from models.train import grab_data,grab_embeded_articles,initialize_model,train_model,validate_model,grab_data_test,predict_model
from utils._behaviors import (
    add_prediction_scores
)
from utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_IMPRESSION_ID_COL
)
from utils._python import write_submission_file, rank_predictions_by_score


# Stand-in for command-line arguments: no parser is built here, the Namespace is
# filled in by hand and later handed to train_model / validate_model.
args = Namespace(
    debug=True,
    dataset="my_dataset",
)



In [23]:
# Hyperparameters and sizes used by the cells below.

# Data sizes.
HISTORY_SIZE = 20  # passed to grab_data, initialize_model and grab_data_test
BATCH_SIZE = 64
title_size = 30
# NOTE(review): lowercase history_size (30) is not referenced by any cell visible
# here and differs from HISTORY_SIZE (20), which is what the model is built with.
# Confirm whether it is still needed.
history_size = 30

# Model architecture.
head_num = 20
head_dim = 20
attention_hidden_dim = 200
dropout = 0.2

# Optimizer settings.
lr = 0.01
weight_decay = 1e-5

## Load Data

In [15]:
# Fraction and dataset name forwarded to grab_data.
frac = 0.15
dataset_name = "ebnerd_demo"

# Fetch the train/validation frames and keep only the first 4*BATCH_SIZE rows of
# each, so the notebook runs on a small subset.
df_train, df_validation = (
    split.head(4*BATCH_SIZE)
    for split in grab_data(dataset_name, HISTORY_SIZE, frac)
)


## Load Tokenizer

In [16]:
# Root directory that contains the `data/` folder with the local tokenizer and model.
# The location is machine-specific, so it can be overridden through the
# NRMS_BASE_PATH environment variable; when that is unset, the original
# hard-coded checkout path is used so existing setups keep working.
BASE_PATH = Path(
    os.environ.get(
        "NRMS_BASE_PATH",
        "/Users/astridh/Documents/GitHub/Deep-Learning-Projects---News-Recommendation-Systems?fbclid=IwZXh0bgNhZW0CMTEAAR2W1fXivVjSypOEuVjFfg6jZ5IdOeH2OkDWZcbidgezWxAkAp1PnOoHKBA_aem_LPGF8NWJj3P5GF0SIL4g2w",
    )
)

In [17]:
# Locations of the locally stored tokenizer and model, relative to BASE_PATH.
LOCAL_TOKENIZER_PATH = BASE_PATH / "data/local-tokenizer"
LOCAL_MODEL_PATH = BASE_PATH / "data/local-tokenizer-model"

# Build the article lookup and the embedding passed to initialize_model later on.
article_mapping, word2vec_embedding = grab_embeded_articles(
    LOCAL_TOKENIZER_PATH,
    LOCAL_MODEL_PATH,
    dataset_name,
    title_size,
)


Loading articles and generating embeddings...


## Data Loaders

In [None]:
# Keyword arguments that are identical for the training and validation datasets.
_shared_loader_kwargs = dict(
    article_dict=article_mapping,
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    unknown_representation="zeros",
    batch_size=BATCH_SIZE,
)

# Training split: eval_mode is switched off.
train_dataloader = NRMSDataLoader(
    behaviors=df_train,
    eval_mode=False,
    **_shared_loader_kwargs,
)

# Validation split: eval_mode is switched on.
val_dataloader = NRMSDataLoader(
    behaviors=df_validation,
    eval_mode=True,
    **_shared_loader_kwargs,
)

In [22]:
# Wrap in PyTorch DataLoader.
# NRMSDataLoader is already constructed with batch_size=BATCH_SIZE, and the recorded
# training output shows 4 iterations over 4*BATCH_SIZE rows with tensors of shape
# [64, ...], so each item it yields appears to be a complete batch already.
# Passing batch_size=BATCH_SIZE here would stack those batches a second time;
# batch_size=None disables DataLoader's automatic batching so every pre-built batch
# passes through unchanged. shuffle=True then randomises the order of the batches.
# NOTE(review): the training loop below iterates train_dataloader / val_dataloader
# directly, so these wrappers are currently unused — confirm which one is intended.
train_loader = DataLoader(train_dataloader, batch_size=None, shuffle=True)
val_loader = DataLoader(val_dataloader, batch_size=None, shuffle=False)

## Train Model

In [28]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the model from the embedding and the size/attention settings defined above,
# then move it to the selected device.
model = initialize_model(
    word2vec_embedding,
    title_size,
    HISTORY_SIZE,
    head_num,
    head_dim,
    attention_hidden_dim,
    dropout,
)
model.to(device)

# Adam optimizer (with weight decay) and cross-entropy loss.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay,
)
criterion = nn.CrossEntropyLoss()

Initializing NewsEncoder and UserEncoder with all submodules...
Model initialized with the following architecture:
NRMSModel(
  (user_encoder): UserEncoder(
    (titleencoder): NewsEncoder(
      (embedding): Embedding(250002, 768)
      (dropout1): Dropout(p=0.2, inplace=False)
      (self_attention): SelfAttention()
      (dropout2): Dropout(p=0.2, inplace=False)
      (att_layer): AttLayer2()
    )
    (self_attention): SelfAttention()
    (attention): AttLayer2()
  )
  (news_encoder): NewsEncoder(
    (embedding): Embedding(250002, 768)
    (dropout1): Dropout(p=0.2, inplace=False)
    (self_attention): SelfAttention()
    (dropout2): Dropout(p=0.2, inplace=False)
    (att_layer): AttLayer2()
  )
)


In [36]:
# Run `epochs` passes; each pass trains on train_dataloader and then evaluates on
# val_dataloader, printing loss / accuracy / AUC for both.
epochs = 5
for epoch in range(epochs):
    # Training pass: the tqdm progress bar wraps the dataset and is what train_model iterates.
    with tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}") as pbar:
        train_metrics = train_model(pbar, model, criterion, optimizer, device, args, "run")
    train_loss, train_acc, train_auc = train_metrics
    print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Train Acc = {train_acc:.4f}, Train Auc = {train_auc:.4f}")

    # Validation pass: same pattern, without the optimizer.
    with tqdm(val_dataloader, desc=f"Validation Epoch {epoch + 1}") as pbar:
        val_metrics = validate_model(pbar, model, criterion, device,args, "run")
    val_loss, val_acc, val_auc = val_metrics

    print(f"Epoch {epoch + 1}: Val Loss = {val_loss:.4f}, Val Acc = {val_acc:.4f}, Val Auc = {val_auc:.4f}")


Training Epoch 1: 100%|██████████| 4/4 [00:18<00:00,  4.61s/it]


History size in dataloader: torch.Size([64, 20, 30]), NPRatio: torch.Size([64, 5, 30])
Epoch 1: Train Loss = 0.4194, Train Acc = 0.8945, Train Auc = 0.6349


Validation Epoch 1: 100%|██████████| 4/4 [00:44<00:00, 11.23s/it]


Epoch 1: Val Loss = 442.2948, Val Acc = 0.8449, Val Auc = 0.5346


Training Epoch 2: 100%|██████████| 4/4 [00:19<00:00,  4.87s/it]


History size in dataloader: torch.Size([64, 20, 30]), NPRatio: torch.Size([64, 5, 30])
Epoch 2: Train Loss = 0.3118, Train Acc = 0.9141, Train Auc = 0.6299


Validation Epoch 2: 100%|██████████| 4/4 [00:47<00:00, 11.79s/it]


Epoch 2: Val Loss = 454.1446, Val Acc = 0.8318, Val Auc = 0.5316


Training Epoch 3: 100%|██████████| 4/4 [00:21<00:00,  5.27s/it]


History size in dataloader: torch.Size([64, 20, 30]), NPRatio: torch.Size([64, 5, 30])
Epoch 3: Train Loss = 0.2350, Train Acc = 0.9375, Train Auc = 0.6254


Validation Epoch 3: 100%|██████████| 4/4 [00:45<00:00, 11.36s/it]


Epoch 3: Val Loss = 466.3376, Val Acc = 0.8272, Val Auc = 0.5306


Training Epoch 4: 100%|██████████| 4/4 [00:17<00:00,  4.36s/it]


History size in dataloader: torch.Size([64, 20, 30]), NPRatio: torch.Size([64, 5, 30])
Epoch 4: Train Loss = 0.1930, Train Acc = 0.9492, Train Auc = 0.6251


Validation Epoch 4: 100%|██████████| 4/4 [00:38<00:00,  9.73s/it]


Epoch 4: Val Loss = 477.2268, Val Acc = 0.8211, Val Auc = 0.5298


Training Epoch 5: 100%|██████████| 4/4 [00:17<00:00,  4.39s/it]


History size in dataloader: torch.Size([64, 20, 30]), NPRatio: torch.Size([64, 5, 30])
Epoch 5: Train Loss = 0.1633, Train Acc = 0.9570, Train Auc = 0.6238


Validation Epoch 5: 100%|██████████| 4/4 [00:41<00:00, 10.30s/it]

Epoch 5: Val Loss = 486.2350, Val Acc = 0.8148, Val Auc = 0.5298





## Predict with Model

In [39]:
# Score the test set with the trained model, rank the scores per impression and
# write them out as a submission file.
model.eval()  # Set model to evaluation mode

# Load the test dataset and keep only the first 4*BATCH_SIZE rows,
# the same subset size used for the train/validation frames.
print("Loading test dataset...")
df_test = grab_data_test(dataset_name="ebnerd_testset", history_size=HISTORY_SIZE)
df_test = df_test.head(4*BATCH_SIZE)

# Create Test Dataloader
test_dataloader = NRMSDataLoader(
    behaviors=df_test,
    article_dict=article_mapping,
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    unknown_representation="zeros",
    eval_mode=True,  # Test mode
    batch_size=BATCH_SIZE,
)

pred_test = predict_model(test_dataloader, model, device)
# Add prediction scores
df_test = add_prediction_scores(df_test, pred_test.tolist())

# Rank predictions
# NOTE(review): the recorded output shows a warning pointing at this with_columns
# call; presumably polars wants an explicit return_dtype for map_elements — confirm
# and add it once the element type returned by rank_predictions_by_score is verified.
print("Ranking predictions for test set...")
df_test = df_test.with_columns(
    pl.col("scores")
    .map_elements(lambda x: list(rank_predictions_by_score(x)))
    .alias("ranked_scores")
)

# Write submission file (the parent directory is created if it does not exist yet)
test_output_file = Path("data/predictions/test_predictions.txt")
test_output_file.parent.mkdir(parents=True, exist_ok=True)
write_submission_file(
    impression_ids=df_test[DEFAULT_IMPRESSION_ID_COL],
    prediction_scores=df_test["ranked_scores"],
    path=str(test_output_file),
)

print(f"Test predictions saved at: {test_output_file}")

Loading test dataset...
Generating predictions for test set...


Predicting batches: 100%|██████████| 4/4 [00:42<00:00, 10.74s/it]
  df_test = df_test.with_columns(


Ranking predictions for test set...


256it [00:00, 41674.44it/s]

Zipping data/predictions/test_predictions.txt to data/predictions/test_predictions.zip
Test predictions saved at: data/predictions/test_predictions.txt



