In [2]:
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import torch 
import polars as pl

from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_SUBTITLE_COL,
    DEFAULT_LABELS_COL,
    DEFAULT_TITLE_COL,
    DEFAULT_USER_COL,
    DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL
)

from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history,
)
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
from ebrec.utils._articles import convert_text2encoding_with_transformers,concat_list_to_text
from ebrec.utils._polars import concat_str_columns, slice_join_dataframes
from ebrec.utils._articles import create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._python import write_submission_file, rank_predictions_by_score

%load_ext autoreload
%autoreload 2

from src.model.dataloader import PPRecDataLoader
from src.model.model_config import hparams_pprec
from src.model.pprec import PPRec
PATH = Path("/Users/sohamchatterjee/Documents/UvA/RecSYS/Project/ebnerd_data")
DATASPLIT = "ebnerd_demo"

  from .autonotebook import tqdm as notebook_tqdm


In [52]:
# df_history_dummy = (
#         pl.scan_parquet(PATH.joinpath(DATASPLIT,"train","history.parquet"))
#         .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL,DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL)
#         .pipe(
#             truncate_history,
#             column=DEFAULT_HISTORY_ARTICLE_ID_COL,
#             history_size=10,
#             padding_value=0,
#             enable_warning=False,
#         ).collect()
#     )

Reached here
Reached here
Reached here


In [7]:
def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
    """
    Load ebnerd - function
    """
    df_history = (
        pl.scan_parquet(path.joinpath("history.parquet"))
        .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL,DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL)
        .pipe(
            truncate_history,
            column=DEFAULT_HISTORY_ARTICLE_ID_COL,
            history_size=history_size,
            padding_value=0,
            enable_warning=False,
        ).pipe(
            truncate_history,
            column=DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,
            history_size=history_size,
            padding_value=0,
            enable_warning=False,
        )
    )
    df_behaviors = (
        pl.scan_parquet(path.joinpath("behaviors.parquet"))
        .collect()
        .pipe(
            slice_join_dataframes,
            df2=df_history.collect(),
            on=DEFAULT_USER_COL,
            how="left",
        )
    )
    return df_behaviors

In [8]:
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL
]
HISTORY_SIZE = 10
FRACTION = 0.001

df_train = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,
        shuffle=True,
        with_replacement=True,
        seed=123,
    )
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)
# =>
df_validation = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)
df_train.head(2)

Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here


user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,impression_time,impression_time_fixed,labels
u32,list[i32],list[i64],list[i64],u32,datetime[μs],list[datetime[μs]],list[i8]
1541887,"[9767741, 9770178, … 9769684]","[9773070, 9756785, … 9773045]",[9773070],445836814,2023-05-20 08:11:42,"[2023-05-16 16:08:58, 2023-05-17 15:44:31, … 2023-05-17 15:52:28]","[1, 0, … 0]"
2273352,"[9763942, 9764086, … 9769328]","[9776234, 9776394, … 9486486]",[9776234],291422246,2023-05-22 21:48:27,"[2023-05-16 03:38:06, 2023-05-16 03:38:33, … 2023-05-17 09:34:51]","[1, 0, … 0]"


In [3]:
df_articles = pl.read_parquet(PATH.joinpath("articles.parquet"))
df_articles.head(2)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3000022,"""Hanks beskyldt…","""Tom Hanks har …",2023-06-29 06:20:32,False,"""Tom Hanks skul…",2006-09-20 09:24:18,[3518381],"""article_defaul…","""https://ekstra…","[""David Gardner""]","[""PER""]","[""Kriminalitet"", ""Kendt"", … ""Litteratur""]",414,[432],"""underholdning""",,,,0.9911,"""Negative"""
3000063,"""Bostrups aske …","""Studieværten b…",2023-06-29 06:20:32,False,"""Strålende sens…",2006-09-24 07:45:30,"[3170935, 3170939]","""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Underholdning"", … ""Personlig begivenhed""]",118,[133],"""nyheder""",,,,0.5155,"""Neutral"""


In [14]:
# df_articles_1 = df_articles.with_columns(
#         pl.col('title').str.lengths().alias('title_length')
# )
# df_articles_1.head()

In [4]:
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
MAX_TITLE_LENGTH = 30

# LOAD HUGGINGFACE:
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# We'll init the word embeddings using the
word2vec_embedding = get_transformers_word_embeddings(transformer_model)
#
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)
# =>
article_mapping_title = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)



In [23]:
df_articles = concat_list_to_text(df_articles,'ner_clusters','ner_clusters_text')
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, 'ner_clusters_text', max_length=MAX_TITLE_LENGTH
)
article_mapping_entity = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)

Reached here


In [13]:
df_articles = df_articles.with_columns(
            pl.col('subtitle-title_encode_FacebookAI/xlm-roberta-base').list.len().alias("title_embed_length"),
            pl.col('ner_clusters_text_encode_FacebookAI/xlm-roberta-base').list.len().alias("entity_embed_length")
        )
df_articles.head()

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label,ner_clusters_text,subtitle-title,subtitle-title_encode_FacebookAI/xlm-roberta-base,ner_clusters_text_encode_FacebookAI/xlm-roberta-base,title_embed_length,entity_embed_length
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str,str,str,list[i64],list[i64],u32,u32
3000022,"""Hanks beskyldt…","""Tom Hanks har …",2023-06-29 06:20:32,False,"""Tom Hanks skul…",2006-09-20 09:24:18,[3518381],"""article_defaul…","""https://ekstra…","[""David Gardner""]","[""PER""]","[""Kriminalitet"", ""Kendt"", … ""Litteratur""]",414,[432],"""underholdning""",,,,0.9911,"""Negative""","""David Gardner""","""Tom Hanks har …","[8352, 2548, … 56]","[6765, 90968, … 1]",30,30
3000063,"""Bostrups aske …","""Studieværten b…",2023-06-29 06:20:32,False,"""Strålende sens…",2006-09-24 07:45:30,"[3170935, 3170939]","""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Underholdning"", … ""Personlig begivenhed""]",118,[133],"""nyheder""",,,,0.5155,"""Neutral""","""""","""Studieværten b…","[60716, 17052, … 1]","[1, 1, … 1]",30,30
3000613,"""Jesper Olsen r…","""Den tidligere …",2023-06-29 06:20:33,False,"""Jesper Olsen, …",2006-05-09 11:29:00,[3164998],"""article_defaul…","""https://ekstra…","[""Frankrig"", ""Jesper Olsen"", … ""Jesper Olsen""]","[""LOC"", ""PER"", … ""PER""]","[""Kendt"", ""Sport"", … ""Sygdom og behandling""]",142,"[196, 271]","""sport""",,,,0.9876,"""Negative""","""Frankrig-Jespe…","""Den tidligere …","[1575, 12532, … 111326]","[192380, 9, … 1]",30,30
3000700,"""Madonna topløs…","""47-årige Madon…",2023-06-29 06:20:33,False,"""Skal du have s…",2006-05-04 11:03:12,[3172046],"""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Livsstil"", ""Underholdning""]",414,[432],"""underholdning""",,,,0.8786,"""Neutral""","""""","""47-årige Madon…","[7657, 9, … 22907]","[1, 1, … 1]",30,30
3000840,"""Otto Brandenbu…","""Sangeren og sk…",2023-06-29 06:20:33,False,"""'Og lidt for S…",2007-03-01 18:34:00,[3914446],"""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Underholdning"", … ""Musik og lyd""]",118,[133],"""nyheder""",,,,0.9468,"""Negative""","""""","""Sangeren og sk…","[22986, 3683, … 1]","[1, 1, … 1]",30,30


In [312]:
# df_train.head()
df_train1 = df_train.with_columns(
            pl.col('article_id_fixed').list.len().alias("click_history_length"),
            pl.col('article_ids_inview').list.len().alias("click_inview_length"),
             pl.col('labels').list.len().alias("labels_length"),
            #  pl.col('impression_time_fixed').list.len().alias("hist_imp_time_length")
        )
df_train1.head(100)
print(df_train1.select ( pl.struct("click_inview_length").n_unique()))

shape: (1, 1)
┌─────────────────────┐
│ click_inview_length │
│ ---                 │
│ u32                 │
╞═════════════════════╡
│ 1                   │
└─────────────────────┘


# Adding CTR info to Train

In [24]:
in_view_dict = {}
click_dict = {}
train_clicked_articles =  df_train['article_ids_clicked'].to_list()
train_inview_articles = df_train['article_ids_inview'].to_list()

In [25]:
for clicked_list in train_clicked_articles:
    for item in clicked_list:
        if item not in click_dict:
            click_dict[item] = 1
        else:
            click_dict[item] +=1

for inview_list in train_inview_articles:
    for item in inview_list:
        if item not in in_view_dict:
            in_view_dict[item] = 1
        else:
            in_view_dict[item] += 1

In [26]:
clicked_articles_val =  df_validation['article_ids_clicked'].to_list()
inview_articles_val = df_validation['article_ids_inview'].to_list()

In [27]:
for clicked_list in clicked_articles_val:
    for item in clicked_list:
        if item not in click_dict:
            click_dict[item] = 1
        else:
            click_dict[item] +=1

for inview_list in inview_articles_val:
    for item in inview_list:
        if item not in in_view_dict:
            in_view_dict[item] = 1
        else:
            in_view_dict[item] += 1

In [28]:
articles = df_articles['article_id'].to_list()

In [29]:
# ctr
articles_ctr = {}
for article in articles:
    if article not in articles_ctr:
        articles_ctr[article]=0

for key,value in click_dict.items():
    articles_ctr[key] = value / in_view_dict[key]


        

    

In [30]:
articles_ctr.update((x, int(y*100)) for x, y in articles_ctr.items())

# Recency info

In [22]:
inview_articles_train = df_train['article_ids_inview'].to_list()
inview_articles_val = df_validation['article_ids_inview'].to_list()


In [23]:
len(df_train)

24

In [24]:
hist_articles_train = df_train['article_id_fixed'].to_list()
hist_articles_val = df_validation['article_id_fixed'].to_list()

In [25]:
impression_timestamps_train = df_train['impression_time'].to_list()
impression_timestamps_val = df_validation['impression_time'].to_list()


In [26]:
hist_imp_timestamps_train = df_train['impression_time_fixed'].to_list()
hist_impression_timestamps_val = df_validation['impression_time_fixed'].to_list()

In [27]:
len(df_train)

24

In [28]:
df = df_articles.select(['article_id','published_time'])
article_last_published_times = dict(df.iter_rows())
article_last_published_times

{3000022: datetime.datetime(2006, 9, 20, 9, 24, 18),
 3000063: datetime.datetime(2006, 9, 24, 7, 45, 30),
 3000613: datetime.datetime(2006, 5, 9, 11, 29),
 3000700: datetime.datetime(2006, 5, 4, 11, 3, 12),
 3000840: datetime.datetime(2007, 3, 1, 18, 34),
 3001278: datetime.datetime(2006, 5, 2, 8, 35, 37),
 3001299: datetime.datetime(2007, 3, 20, 7, 29, 32),
 3001353: datetime.datetime(2006, 8, 31, 8, 6, 45),
 3001457: datetime.datetime(2007, 3, 19, 12, 45, 28),
 3001459: datetime.datetime(2006, 5, 19, 11, 26, 38),
 3001724: datetime.datetime(2007, 3, 21, 11, 19, 42),
 3001899: datetime.datetime(2007, 3, 9, 10, 58, 31),
 3002265: datetime.datetime(2006, 4, 29, 17, 36, 37),
 3002293: datetime.datetime(2007, 3, 7, 14, 35),
 3002673: datetime.datetime(2006, 7, 12, 12, 22, 43),
 3002698: datetime.datetime(2007, 1, 8, 11, 35),
 3002728: datetime.datetime(2006, 12, 29, 13, 14, 8),
 3003065: datetime.datetime(2006, 5, 21, 16, 57),
 3005322: datetime.datetime(2001, 5, 19, 10, 10),
 3005351: da

In [29]:
inview_articles_recency_train = []
for outer in range(len(inview_articles_train)):
    tmp=[]
    for inner in range(len(inview_articles_train[outer])):
        article = inview_articles_train[outer][inner]
        time_view_article = int((impression_timestamps_train[outer] - article_last_published_times[article]).total_seconds()/3600)
        # if time_view_article < 0:
        #     time_view_article = 0
        tmp.append(time_view_article)
    inview_articles_recency_train.append(tmp)
    
print(len(inview_articles_recency_train))

df_train = df_train.with_columns(
   pl.DataFrame(
    [
        pl.Series("recency_inview", inview_articles_recency_train),
       
    ]
).select('recency_inview')
)


24


In [30]:
inview_articles_recency_val = []
for outer in range(len(inview_articles_val)):
    tmp=[]
    for inner in range(len(inview_articles_val[outer])):
        article = inview_articles_val[outer][inner]
        time_view_article = int((impression_timestamps_val[outer] - article_last_published_times[article]).total_seconds()/3600)
        # if time_view_article < 0:
        #     time_view_article = 0
        tmp.append(time_view_article)
    inview_articles_recency_val.append(tmp)
    
print(len(inview_articles_recency_val))

df_validation = df_validation.with_columns(
   pl.DataFrame(
    [
        pl.Series("recency_inview", inview_articles_recency_val),
       
    ]
).select('recency_inview')
)

25


In [31]:
len(hist_articles_train)

24

In [32]:
hist_articles_recency_train = []
for outer in range(len(hist_articles_train)):
    tmp=[]
    for inner in range(len(hist_articles_train[outer])):
        article = hist_articles_train[outer][inner]
        
        if article == 0:
            time_view_article = 0
        else:
            time_view_article = int((hist_imp_timestamps_train[outer][inner] - article_last_published_times[article]).total_seconds()/3600)
        
            

       
        tmp.append(time_view_article)
    hist_articles_recency_train.append(tmp)
    
print(len(hist_articles_recency_train))

df_train = df_train.with_columns(
   pl.DataFrame(
    [
        pl.Series("recency_hist", hist_articles_recency_train),
       
    ]
).select('recency_hist')
)


24


In [33]:
hist_articles_recency_val = []
for outer in range(len(hist_articles_val)):
    tmp=[]
    for inner in range(len(hist_articles_val[outer])):
        article = hist_articles_val[outer][inner]
        if article == 0:
            time_view_article =0
        else:
            time_view_article = int((hist_impression_timestamps_val[outer][inner] - article_last_published_times[article]).total_seconds()/3600)
        # if time_view_article < 0:
        #     time_view_article = 0
        tmp.append(time_view_article)
    hist_articles_recency_val.append(tmp)
    
print(len(hist_articles_recency_val))

df_validation = df_validation.with_columns(
   pl.DataFrame(
    [
        pl.Series("recency_hist", hist_articles_recency_val),
       
    ]
).select('recency_hist')
)

25


In [34]:

# df_train = df_train.drop(['impression_time','c'])
# df_validation = df_validation.drop(['impression_time','c'])

In [36]:
df_train.write_parquet("small_demo_train_all_features.parquet")
df_validation.write_parquet("small_demo_val_all_features.parquet")

In [35]:
df_train.head()

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,impression_time,impression_time_fixed,labels,recency_inview,recency_hist
u32,list[i32],list[i64],list[i64],u32,datetime[μs],list[datetime[μs]],list[i8],list[i64],list[i64]
50306,"[9770178, 9769433, … 9769917]","[9220931, 9779019, … 9779019]",[9779366],262119122,2023-05-24 17:10:29,"[2023-05-17 15:56:55, 2023-05-17 16:35:48, … 2023-05-18 04:49:59]","[0, 0, … 0]","[9562, 5, … 5]","[0, 0, … 12]"
1432021,"[0, 0, … 9768260]","[9776917, 9777075, … 9776897]",[9776897],259571782,2023-05-23 08:43:17,"[1970-01-01 00:00:00, 1970-01-01 00:00:00, … 2023-05-17 19:06:01]","[0, 0, … 1]","[1, 0, … 1]","[0, 0, … 0]"
2498205,"[9767363, 9767417, … 9769404]","[9779860, 9779747, … 9779748]",[9779748],372622576,2023-05-25 04:18:38,"[2023-05-16 09:51:46, 2023-05-16 09:53:54, … 2023-05-18 05:24:29]","[0, 0, … 1]","[7, 8, … 9]","[1, 2, … 9]"
564256,"[9767646, 9768860, … 9770400]","[9775785, 9775855, … 9527795]",[9775855],15614407,2023-05-22 14:56:59,"[2023-05-16 09:19:04, 2023-05-16 21:55:59, … 2023-05-17 20:42:35]","[0, 1, … 0]","[4, 3, … 4201]","[0, 0, … 0]"
1788365,"[9770037, 9769622, … 9767546]","[9776489, 9776049, … 9776148]",[9776489],204181432,2023-05-23 03:26:33,"[2023-05-17 16:15:30, 2023-05-17 16:16:00, … 2023-05-18 06:53:52]","[1, 0, … 0]","[7, 6, … 8]","[3, 0, … 13]"


In [415]:
val_dataloader.__getitem__(1)

Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
['title_user_id', 'title_article_ids_clicked', 'title_impression_id', 'title_impression_time', 'title_impression_time_fixed', 'title_recency_inview', 'title_recency_hist', 'title_n_samples', 'title_article_id_fixed', 'title_article_ids_inview', 'ner_clusters_text_user_id', 'ner_clusters_text_article_ids_clicked', 'ner_clusters_text_impression_id', 'ner_clusters_text_impression_time', 'ner_clusters_text_impression_time_fixed', 'ner_clusters_text_recency_inview', 'ner_clusters_text_recency_hist', 'ner_clusters_text_n_samples', 'ner_clusters_text_article_id_fixed', 'ner_clusters_text_article_ids_inview', 'ctr_user_id', 'ctr_article_ids_clicked', 'ctr_impression_id', 'ctr_impression_time', 'ctr_impression_time_fixed', 'ctr_recency_inview', 'ctr_recency_hist', 'ctr_n_samples', 'ctr_article_id_fixed', 'ctr_article_ids_inview']


((320,), (373,))

# Popularity embedding is built from CTR and recency embedding

# Features adding - DONE

# Test Pytorch modules

# 1. KnowledgeAwareNewsEncoder

In [322]:
from src.model.modules import KnowledgeAwareNewsEncoder
model = KnowledgeAwareNewsEncoder(hparams_pprec,word2vec_embedding,seed=123)
for i, data in enumerate(train_dataloader):
        # Every data instance is an input + label pair
        inputs, labels = data
        title_data = inputs[5]
        entity_data = inputs[6]
        #print(title_data.shape)
        outputs = model(title_data,entity_data)
        print("KnowledgeAwareNewsEncoder output shape:",outputs.shape)
        



Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Entity cross attention: torch.Size([3, 150, 768])
KnowledgeAwareNewsEncoder output shape: torch.Size([3, 150, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Entity cross attention: torch.Size([3, 150, 768])
KnowledgeAwareNewsEncoder output shape: torch.Size([3, 150, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Entity cross attention: torch.Size([3, 150, 768])
KnowledgeAwareNewsEncoder output shape: torch.Size([3, 150, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Entity cross attention: torc

# 2. Time-aware news popularity predictor

In [3]:
COLUMNS = [
   'user_id',
   'article_id_fixed',
   'article_ids_inview',
   'article_ids_clicked',
   'impression_id',
   'labels',
   'recency_inview',
   'recency_hist'  
]
df_train  = pl.scan_parquet("small_demo_train_all_features.parquet").select(COLUMNS).collect()

df_validation =  pl.scan_parquet("small_demo_val_all_features.parquet").select(COLUMNS).collect()

In [33]:
import pickle 

with open('article_mapping_title.pkl', 'wb') as f:
    pickle.dump(article_mapping_title, f)



with open('article_mapping_entity.pkl', 'wb') as f:
    pickle.dump(article_mapping_entity, f)


with open('articles_ctr.pkl', 'wb') as f:
    pickle.dump(articles_ctr, f)   


In [262]:
train_dataloader = PPRecDataLoader(
    behaviors=df_train,
    article_dict=article_mapping_title,
    entity_mapping=article_mapping_entity,
    popularity_mapping=pop_articles,
    ctr_mapping=articles_ctr,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    history_recency = 'recency_hist',
    inview_recency = 'recency_inview',
    eval_mode=False,
    batch_size=3,
)

In [57]:
val_dataloader = PPRecDataLoader(
    behaviors=df_validation,
    article_dict=article_mapping_title,
    entity_mapping=article_mapping_entity,
    ctr_mapping=articles_ctr,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    history_recency = 'recency_hist',
    inview_recency = 'recency_inview',
    eval_mode=True,
    batch_size=32,
)

In [335]:
from src.model.modules import TimeAwarePopularityEncoder
model = TimeAwarePopularityEncoder(word2vec_embedding=word2vec_embedding, seed=123)


In [338]:
for i, data in enumerate(train_dataloader):
        # print("HAA")
        # Every data instance is an input + label pair
        inputs, labels = data
        title_data = inputs[5]
        ctr = inputs[7]
        recency = inputs[8]
        
       
        print("Input title shape:",title_data.shape)
        print("Input recency shape:",recency.shape)
        print("Input ctr shape:",ctr.shape)

        
        
        outputs = model(title_data,recency,ctr)
        print("TimeAwarePopularityEncoder output shape:",outputs.shape)
        


Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Input title shape: (3, 5, 30)
Input recency shape: (3, 5)
Input ctr shape: (3, 5)
TimeAwarePopularityEncoder output shape: torch.Size([3, 5, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Input title shape: (3, 5, 30)
Input recency shape: (3, 5)
Input ctr shape: (3, 5)
TimeAwarePopularityEncoder output shape: torch.Size([3, 5, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Input title shape: (3, 5, 30)
Input recency shape: (3, 5)
Input ctr shape: (3, 5)
TimeAwarePopularityEncoder output shape: torch.Size([3, 5, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here


# Add popularity info

In [72]:
pop_articles = {}
recency_articles = {}
inview_articles_train = df_train['article_ids_inview'].to_list()
inview_articles_val = df_validation['article_ids_inview'].to_list()

train_recency = df_train['recency_inview'].to_list()
val_recency = df_train['recency_inview'].to_list()

In [78]:
for outer in range(len(inview_articles_train)):
    for inner in range(len(inview_articles_train[outer])):
        if article not in recency_articles:
            recency_articles[article] = inview_articles_train[outer][inner]
        elif inview_articles_train[outer][inner] < recency_articles[article]:
            recency_articles[article] = inview_articles_train[outer][inner]

for outer in range(len(inview_articles_val)):
    for inner in range(len(inview_articles_val[outer])):
        if article not in recency_articles:
            recency_articles[article] = inview_articles_val[outer][inner]
        elif inview_articles_val[outer][inner] < recency_articles[article]:
            recency_articles[article] = inview_articles_val[outer][inner]




In [80]:
for key,val in articles_ctr.items():
    if key not in recency_articles:
        pop_articles[key] = val
    else:
        pop_articles[key] = val/recency_articles[key]+1
         


In [7]:
train_dataloader = PPRecDataLoader(
    behaviors=df_train,
    article_dict=article_mapping_title,
    entity_mapping=article_mapping_entity,
    ctr_mapping=articles_ctr,
    popularity_mapping = popularity_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    history_recency = 'recency_hist',
    inview_recency = 'recency_inview',
    eval_mode=False,
    batch_size=3,
)

In [91]:
val_dataloader = PPRecDataLoader(
    behaviors=df_validation,
    article_dict=article_mapping_title,
    entity_mapping=article_mapping_entity,
    ctr_mapping=articles_ctr,
    popularity_mapping = popularity_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    history_recency = 'recency_hist',
    inview_recency = 'recency_inview',
    eval_mode=True,
    batch_size=8,
)

In [92]:
with open('popularity_mapping.pkl', 'wb') as f:
    pickle.dump(pop_articles, f)   


In [5]:
import pickle
article_mapping_title, article_mapping_entity, articles_ctr, popularity_mapping = {},{},{},{}
with open('article_mapping_title.pkl', 'rb') as handle:
    article_mapping_title = pickle.load(handle)
with open('article_mapping_entity.pkl', 'rb') as handle:
    article_mapping_entity = pickle.load(handle)
with open('articles_ctr.pkl', 'rb') as handle:
    articles_ctr = pickle.load(handle)
with open('popularity_mapping.pkl', 'rb') as handle:
    popularity_mapping = pickle.load(handle)

COLUMNS = [
   'user_id',
   'article_id_fixed',
   'article_ids_inview',
   'article_ids_clicked',
   'impression_id',
   'labels',
   'recency_inview',
   'recency_hist'  
]
df_train  = pl.scan_parquet("small_demo_train_all_features.parquet").select(COLUMNS).collect()

df_validation =  pl.scan_parquet("small_demo_val_all_features.parquet").select(COLUMNS).collect()

# Test Popularity Aware User Encoder

In [23]:
from src.model.modules import PopularityAwareUserEncoder

model = PopularityAwareUserEncoder(hparams_pprec, word2vec_embedding=word2vec_embedding, seed=123)
for i, data in enumerate(train_dataloader):
        # Every data instance is an i
        # nput + label pair
        inputs, labels = data
        hist_title = inputs[0]
        popularity = inputs[2]
        outputs = model(hist_title, popularity)
        print("PopularityAwareUserEncoder output shape:",outputs.shape)
        

Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
After: torch.Size([3, 10, 768])
PopularityAwareUserEncoder output shape: torch.Size([3, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
After: torch.Size([3, 10, 768])
PopularityAwareUserEncoder output shape: torch.Size([3, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
After: torch.Size([3, 10, 768])
PopularityAwareUserEncoder output shape: torch.Size([3, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
After: torch.Size([3, 10, 768])
PopularityAwareUserEncoder output shape: torch.Size([3, 768])


# Test whole PPRec

In [37]:
from model.modules import PPRec

model = PPRec(hparams_pprec,word2vec_embedding)

In [38]:
# Popularity is actually CTR of articles present in user clicked  history
for i, data in enumerate(train_dataloader):
        # Every data instance is an i
        # nput + label pair
        inputs, labels = data
        title = inputs[5]
        entities = inputs[6]
        ctr = inputs[7]
        recency = inputs[8]
        hist_title = inputs[0]
        hist_popularity = inputs[2]

        #print(labels.shape)
        outputs = model(title, entities, ctr, recency ,hist_title, hist_popularity )
        print("PopularityAwareUserEncoder output shape:",outputs.shape)

Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
PopularityAwareUserEncoder output shape: torch.Size([3, 5])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
PopularityAwareUserEncoder output shape: torch.Size([3, 5])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
PopularityAwareUserEncoder output shape: torch.Size([3, 5])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
PopularityAwareUserEncoder output shape: torch.Size([3, 5])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reache

# Loss function and update gradients

In [71]:
from model.modules import BPELoss
criterion = BPELoss()
for i, data in enumerate(train_dataloader):
        
        
        inputs, labels = data
        title = inputs[5]
        entities = inputs[6]
        ctr = inputs[7]
        recency = inputs[8]
        hist_title = inputs[0]
        hist_popularity = inputs[2]

        outputs = model(title, entities, ctr, recency ,hist_title, hist_popularity )
        loss = criterion(outputs, labels)
        print("Loss:",loss)
        

Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Loss: tensor(0.6846, grad_fn=<NegBackward0>)
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Loss: tensor(0.6756, grad_fn=<NegBackward0>)
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Loss: tensor(0.6741, grad_fn=<NegBackward0>)
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Loss: tensor(0.6798, grad_fn=<NegBackward0>)
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Loss: tensor(0.6743, grad_fn=<NegBackwar