In [None]:
%load_ext autoreload
%autoreload 2

from src.model.news_encoder import LNEConfig
from src.model.user_encoder import CPJAConfig, PEConfig,NSAConfig,PAUEConfig
from src.model.popularity_predictor import TANPPConfig, CRGConfig, CBPDConfig, RBPDConfig, REConfig
from src.model.pprec import PPRConfig, PAGConfig, PPRec

from src.data.split import EBNeRDSplit
from src.data.train import EBNeRDTrainDataset

import torch
from torch.utils.data import DataLoader
import numpy as np

Some data for testing


In [None]:
# Build the training dataset with its default constructor arguments.
# Later cells read `dataset.split` (to draw random article ids) and
# `dataset.collate_fn` (passed to the DataLoader) from this object.
dataset = EBNeRDTrainDataset()

Configuration of the model

In [None]:
# Number of click slots per user in a clicks batch. The final value depends on
# how we are going to build the dataloader.
max_clicked = 50

# This is the configuration we can run experiments with. Any combination of
# values should work: for instance, size_n in the user-side news encoder does
# not have to match the one in the popularity-side news encoder. The values
# below are filled in the same way they implemented it -- apart from the news
# encoder, of course, which still uses the embeddings from the artifacts.

# News encoders (one per branch; deliberately two separate config objects).
user_news_encoder_cfg = LNEConfig(size_n=400, model="bert")
popularity_news_encoder_cfg = LNEConfig(size_n=400, model="bert")

# Popularity-aware user encoder.
user_encoder_cfg = PAUEConfig(
    popularity_embedding_config=PEConfig(size_p=100, max_ctr=200),
    news_self_attention_config=NSAConfig(n_attention_heads=20, head_output_size=20),
    content_popularity_joint_attention_config=CPJAConfig(weight_size=100),
)

# Popularity predictor: recency/content dense layers, recency embedding, gate.
popularity_predictor_cfg = TANPPConfig(
    recency_based_popularity_dense_config=RBPDConfig(hidden_layers=[256, 128]),
    content_based_popularity_dense_config=CBPDConfig(hidden_layers=[256, 128]),
    recency_embedding_config=REConfig(r_size=100, max_recency=1500, recency_factor=0.5),
    content_recency_gate_config=CRGConfig(hidden_layers=[256, 128]),
)

# Gate that aggregates the outputs of the two branches.
aggregator_gate_cfg = PAGConfig(hidden_layers=[256, 128])

# Top-level model configuration assembled from the pieces above.
pprec_config = PPRConfig(
    user_news_encoder_config=user_news_encoder_cfg,
    popularity_news_encoder_config=popularity_news_encoder_cfg,
    user_encoder_config=user_encoder_cfg,
    popularity_predictor_config=popularity_predictor_cfg,
    aggregator_gate_config=aggregator_gate_cfg,
)

In [None]:
# Instantiate PPRec on the CPU, passing the `max_clicked` value and the
# `pprec_config` object defined in the configuration cell.
model = PPRec(
    max_clicked=max_clicked,
    device=torch.device("cpu"),
    config=pprec_config,
)

In [None]:
batch_size = 32

# Random stand-in inputs for a smoke test of the forward pass: article ids are
# drawn from the dataset split, while CTR and recency values are uniform
# samples from torch.rand.

# One random candidate article per batch row.
candidate_ids = np.array(
    [dataset.split.get_random_article_id() for _ in range(batch_size)]
)
candidates = PPRec.CandidateBatch(
    ids=candidate_ids,
    ctr=torch.rand(batch_size),
    recencies=torch.rand(batch_size),
)

# `max_clicked` random clicked articles per batch row.
clicked_id_rows = []
for _row in range(batch_size):
    clicked_id_rows.append(
        [dataset.split.get_random_article_id() for _ in range(max_clicked)]
    )
clicked_ids = np.array(clicked_id_rows)

user_clicks = PPRec.ClicksBatch(
    ids=clicked_ids,
    ctr=torch.rand(batch_size, max_clicked),
)

In [None]:
# Works with random inputs!
# Smoke test: run the model once on the randomly generated candidate and click
# batches. The call returns three values, unpacked below.
scores, user_scores, popularity_scores = model(candidates=candidates, clicks=user_clicks)

In [None]:
# We just need to finish the dataset in src.data.train (the module that
# EBNeRDTrainDataset is imported from). Batches are shuffled and assembled
# with the dataset's own collate_fn.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=dataset.collate_fn)

In [None]:
# Sanity check: pull a single batch from the dataloader, print it, and stop.
for first_batch in dataloader:
    print(first_batch)
    break