In [60]:
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import torch 
import polars as pl

from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_SUBTITLE_COL,
    DEFAULT_LABELS_COL,
    DEFAULT_TITLE_COL,
    DEFAULT_USER_COL,
    DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL
)

from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history,
)
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
from ebrec.utils._articles import convert_text2encoding_with_transformers,concat_list_to_text
from ebrec.utils._polars import concat_str_columns, slice_join_dataframes
from ebrec.utils._articles import create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._python import write_submission_file, rank_predictions_by_score

%load_ext autoreload
%autoreload 2

from src.model.dataloader import PPRecDataLoader
from src.model.model_config import hparams_pprec
from src.model.pprec import PPRec
PATH = Path("/Users/sohamchatterjee/Documents/UvA/RecSYS/Project/ebnerd_data")
DATASPLIT = "ebnerd_demo"

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
cpu


In [61]:
# df_history_dummy = (
#         pl.scan_parquet(PATH.joinpath(DATASPLIT,"train","history.parquet"))
#         .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL,DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL)
#         .pipe(
#             truncate_history,
#             column=DEFAULT_HISTORY_ARTICLE_ID_COL,
#             history_size=10,
#             padding_value=0,
#             enable_warning=False,
#         ).collect()
#     )

In [62]:
def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
    """
    Load ebnerd - function
    """
    df_history = (
        pl.scan_parquet(path.joinpath("history.parquet"))
        .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL,DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL)
        .pipe(
            truncate_history,
            column=DEFAULT_HISTORY_ARTICLE_ID_COL,
            history_size=history_size,
            padding_value=0,
            enable_warning=False,
        ).pipe(
            truncate_history,
            column=DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL,
            history_size=history_size,
            padding_value=0,
            enable_warning=False,
        )
    )
    df_behaviors = (
        pl.scan_parquet(path.joinpath("behaviors.parquet"))
        .collect()
        .pipe(
            slice_join_dataframes,
            df2=df_history.collect(),
            on=DEFAULT_USER_COL,
            how="left",
        )
    )
    return df_behaviors

In [63]:
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL
]
HISTORY_SIZE = 10
#FRACTION = 0.001

df_train = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,
        shuffle=True,
        with_replacement=True,
        seed=123,
    )
    .pipe(create_binary_labels_column)
    #.sample(fraction=FRACTION)
)
# =>
df_validation = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,
        shuffle=True,
        with_replacement=True,
        seed=123,
    )
    .pipe(create_binary_labels_column)
    #.sample(fraction=FRACTION)
)
df_train.head(2)

Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here


user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,impression_time,impression_time_fixed,labels
u32,list[i32],list[i64],list[i64],u32,datetime[μs],list[datetime[μs]],list[i8]
22779,"[9770333, 9769641, … 9770541]","[9774461, 9759544, … 9774461]",[9759966],48401,2023-05-21 21:06:50,"[2023-05-17 15:50:15, 2023-05-17 15:51:08, … 2023-05-18 06:26:39]","[0, 0, … 0]"
150224,"[9770604, 9769622, … 9735909]","[9778657, 9778682, … 9777397]",[9778661],152513,2023-05-24 07:31:26,"[2023-05-17 18:32:59, 2023-05-17 18:33:08, … 2023-05-18 06:13:47]","[0, 0, … 0]"


In [64]:
df_articles = pl.read_parquet(PATH.joinpath("articles.parquet"))
df_articles.head(2)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3000022,"""Hanks beskyldt…","""Tom Hanks har …",2023-06-29 06:20:32,False,"""Tom Hanks skul…",2006-09-20 09:24:18,[3518381],"""article_defaul…","""https://ekstra…","[""David Gardner""]","[""PER""]","[""Kriminalitet"", ""Kendt"", … ""Litteratur""]",414,[432],"""underholdning""",,,,0.9911,"""Negative"""
3000063,"""Bostrups aske …","""Studieværten b…",2023-06-29 06:20:32,False,"""Strålende sens…",2006-09-24 07:45:30,"[3170935, 3170939]","""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Underholdning"", … ""Personlig begivenhed""]",118,[133],"""nyheder""",,,,0.5155,"""Neutral"""


In [65]:
# df_articles_1 = df_articles.with_columns(
#         pl.col('title').str.lengths().alias('title_length')
# )
# df_articles_1.head()

In [66]:
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
MAX_TITLE_LENGTH = 30

# LOAD HUGGINGFACE:
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# We'll init the word embeddings using the
word2vec_embedding = get_transformers_word_embeddings(transformer_model)
#
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)
# =>
article_mapping_title = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)



In [67]:
df_articles = concat_list_to_text(df_articles,'ner_clusters','ner_clusters_text')
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, 'ner_clusters_text', max_length=MAX_TITLE_LENGTH
)
article_mapping_entity = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)

Reached here


In [68]:
df_articles = df_articles.with_columns(
            pl.col('subtitle-title_encode_FacebookAI/xlm-roberta-base').list.len().alias("title_embed_length"),
            pl.col('ner_clusters_text_encode_FacebookAI/xlm-roberta-base').list.len().alias("entity_embed_length")
        )
df_articles.head()

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label,subtitle-title,subtitle-title_encode_FacebookAI/xlm-roberta-base,ner_clusters_text,ner_clusters_text_encode_FacebookAI/xlm-roberta-base,title_embed_length,entity_embed_length
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str,str,list[i64],str,list[i64],u32,u32
3000022,"""Hanks beskyldt…","""Tom Hanks har …",2023-06-29 06:20:32,False,"""Tom Hanks skul…",2006-09-20 09:24:18,[3518381],"""article_defaul…","""https://ekstra…","[""David Gardner""]","[""PER""]","[""Kriminalitet"", ""Kendt"", … ""Litteratur""]",414,[432],"""underholdning""",,,,0.9911,"""Negative""","""Tom Hanks har …","[8352, 2548, … 56]","""David Gardner""","[6765, 90968, … 1]",30,30
3000063,"""Bostrups aske …","""Studieværten b…",2023-06-29 06:20:32,False,"""Strålende sens…",2006-09-24 07:45:30,"[3170935, 3170939]","""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Underholdning"", … ""Personlig begivenhed""]",118,[133],"""nyheder""",,,,0.5155,"""Neutral""","""Studieværten b…","[60716, 17052, … 1]","""""","[1, 1, … 1]",30,30
3000613,"""Jesper Olsen r…","""Den tidligere …",2023-06-29 06:20:33,False,"""Jesper Olsen, …",2006-05-09 11:29:00,[3164998],"""article_defaul…","""https://ekstra…","[""Frankrig"", ""Jesper Olsen"", … ""Jesper Olsen""]","[""LOC"", ""PER"", … ""PER""]","[""Kendt"", ""Sport"", … ""Sygdom og behandling""]",142,"[196, 271]","""sport""",,,,0.9876,"""Negative""","""Den tidligere …","[1575, 12532, … 111326]","""Frankrig-Jespe…","[192380, 9, … 1]",30,30
3000700,"""Madonna topløs…","""47-årige Madon…",2023-06-29 06:20:33,False,"""Skal du have s…",2006-05-04 11:03:12,[3172046],"""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Livsstil"", ""Underholdning""]",414,[432],"""underholdning""",,,,0.8786,"""Neutral""","""47-årige Madon…","[7657, 9, … 22907]","""""","[1, 1, … 1]",30,30
3000840,"""Otto Brandenbu…","""Sangeren og sk…",2023-06-29 06:20:33,False,"""'Og lidt for S…",2007-03-01 18:34:00,[3914446],"""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Underholdning"", … ""Musik og lyd""]",118,[133],"""nyheder""",,,,0.9468,"""Negative""","""Sangeren og sk…","[22986, 3683, … 1]","""""","[1, 1, … 1]",30,30


In [69]:
# df_train.head()
# df_train1 = df_train.with_columns(
#             pl.col('article_id_fixed').list.len().alias("click_history_length"),
#             pl.col('article_ids_inview').list.len().alias("click_inview_length"),
#              pl.col('labels').list.len().alias("labels_length"),
#             #  pl.col('impression_time_fixed').list.len().alias("hist_imp_time_length")
#         )
# df_train1.head(100)
# print(df_train1.select ( pl.struct("click_inview_length").n_unique()))

# Adding CTR info to Train

In [70]:
in_view_dict = {}
click_dict = {}
train_clicked_articles =  df_train['article_ids_clicked'].to_list()
train_inview_articles = df_train['article_ids_inview'].to_list()

In [71]:
for clicked_list in train_clicked_articles:
    for item in clicked_list:
        if item not in click_dict:
            click_dict[item] = 1
        else:
            click_dict[item] +=1

for inview_list in train_inview_articles:
    for item in inview_list:
        if item not in in_view_dict:
            in_view_dict[item] = 1
        else:
            in_view_dict[item] += 1

In [72]:
clicked_articles_val =  df_validation['article_ids_clicked'].to_list()
inview_articles_val = df_validation['article_ids_inview'].to_list()

In [73]:
for clicked_list in clicked_articles_val:
    for item in clicked_list:
        if item not in click_dict:
            click_dict[item] = 1
        else:
            click_dict[item] +=1

for inview_list in inview_articles_val:
    for item in inview_list:
        if item not in in_view_dict:
            in_view_dict[item] = 1
        else:
            in_view_dict[item] += 1

In [74]:
articles = df_articles['article_id'].to_list()

In [75]:
# ctr
articles_ctr = {}
for article in articles:
    if article not in articles_ctr:
        articles_ctr[article]=0

for key,value in click_dict.items():
    articles_ctr[key] = value / in_view_dict[key]


        

    

In [76]:
articles_ctr.update((x, int(y*100)) for x, y in articles_ctr.items())

# Recency info

In [77]:
inview_articles_train = df_train['article_ids_inview'].to_list()
inview_articles_val = df_validation['article_ids_inview'].to_list()


In [78]:
len(df_train)

24888

In [79]:
hist_articles_train = df_train['article_id_fixed'].to_list()
hist_articles_val = df_validation['article_id_fixed'].to_list()

In [80]:
impression_timestamps_train = df_train['impression_time'].to_list()
impression_timestamps_val = df_validation['impression_time'].to_list()


In [81]:
hist_imp_timestamps_train = df_train['impression_time_fixed'].to_list()
hist_impression_timestamps_val = df_validation['impression_time_fixed'].to_list()

In [82]:
len(df_train)

24888

In [83]:
df = df_articles.select(['article_id','published_time'])
article_last_published_times = dict(df.iter_rows())
article_last_published_times

{3000022: datetime.datetime(2006, 9, 20, 9, 24, 18),
 3000063: datetime.datetime(2006, 9, 24, 7, 45, 30),
 3000613: datetime.datetime(2006, 5, 9, 11, 29),
 3000700: datetime.datetime(2006, 5, 4, 11, 3, 12),
 3000840: datetime.datetime(2007, 3, 1, 18, 34),
 3001278: datetime.datetime(2006, 5, 2, 8, 35, 37),
 3001299: datetime.datetime(2007, 3, 20, 7, 29, 32),
 3001353: datetime.datetime(2006, 8, 31, 8, 6, 45),
 3001457: datetime.datetime(2007, 3, 19, 12, 45, 28),
 3001459: datetime.datetime(2006, 5, 19, 11, 26, 38),
 3001724: datetime.datetime(2007, 3, 21, 11, 19, 42),
 3001899: datetime.datetime(2007, 3, 9, 10, 58, 31),
 3002265: datetime.datetime(2006, 4, 29, 17, 36, 37),
 3002293: datetime.datetime(2007, 3, 7, 14, 35),
 3002673: datetime.datetime(2006, 7, 12, 12, 22, 43),
 3002698: datetime.datetime(2007, 1, 8, 11, 35),
 3002728: datetime.datetime(2006, 12, 29, 13, 14, 8),
 3003065: datetime.datetime(2006, 5, 21, 16, 57),
 3005322: datetime.datetime(2001, 5, 19, 10, 10),
 3005351: da

In [84]:
inview_articles_recency_train = []
for outer in range(len(inview_articles_train)):
    tmp=[]
    for inner in range(len(inview_articles_train[outer])):
        article = inview_articles_train[outer][inner]
        time_view_article = int((impression_timestamps_train[outer] - article_last_published_times[article]).total_seconds()/7200)
        # if time_view_article < 0:
        #     time_view_article = 0
        tmp.append(time_view_article)
    inview_articles_recency_train.append(tmp)
    
print(len(inview_articles_recency_train))

df_train = df_train.with_columns(
   pl.DataFrame(
    [
        pl.Series("recency_inview", inview_articles_recency_train),
       
    ]
).select('recency_inview')
)


24888


In [85]:
inview_articles_recency_val = []
for outer in range(len(inview_articles_val)):
    tmp=[]
    for inner in range(len(inview_articles_val[outer])):
        article = inview_articles_val[outer][inner]
        time_view_article = int((impression_timestamps_val[outer] - article_last_published_times[article]).total_seconds()/7200)
        # if time_view_article < 0:
        #     time_view_article = 0
        tmp.append(time_view_article)
    inview_articles_recency_val.append(tmp)
    
print(len(inview_articles_recency_val))

df_validation = df_validation.with_columns(
   pl.DataFrame(
    [
        pl.Series("recency_inview", inview_articles_recency_val),
       
    ]
).select('recency_inview')
)

25505


In [86]:
len(hist_articles_train)

24888

In [87]:
hist_articles_recency_train = []
for outer in range(len(hist_articles_train)):
    tmp=[]
    for inner in range(len(hist_articles_train[outer])):
        article = hist_articles_train[outer][inner]
        
        if article == 0:
            time_view_article = 0
        else:
            time_view_article = int((hist_imp_timestamps_train[outer][inner] - article_last_published_times[article]).total_seconds()/7200)
        
            

       
        tmp.append(time_view_article)
    hist_articles_recency_train.append(tmp)
    
print(len(hist_articles_recency_train))

df_train = df_train.with_columns(
   pl.DataFrame(
    [
        pl.Series("recency_hist", hist_articles_recency_train),
       
    ]
).select('recency_hist')
)


24888


In [88]:
hist_articles_recency_val = []
for outer in range(len(hist_articles_val)):
    tmp=[]
    for inner in range(len(hist_articles_val[outer])):
        article = hist_articles_val[outer][inner]
        if article == 0:
            time_view_article =0
        else:
            time_view_article = int((hist_impression_timestamps_val[outer][inner] - article_last_published_times[article]).total_seconds()/7200)
        # if time_view_article < 0:
        #     time_view_article = 0
        tmp.append(time_view_article)
    hist_articles_recency_val.append(tmp)
    
print(len(hist_articles_recency_val))

df_validation = df_validation.with_columns(
   pl.DataFrame(
    [
        pl.Series("recency_hist", hist_articles_recency_val),
       
    ]
).select('recency_hist')
)

25505


In [89]:
df_train = df_train.with_columns(
    pl.format("[{}]",
      pl.col("recency_inview").cast(pl.List(pl.Utf8)).list.join(", ")).alias("recency_inview_new"),
    pl.format("[{}]",
      pl.col("recency_hist").cast(pl.List(pl.Utf8)).list.join(", ")).alias("recency_hist_new")
)
df_train.head()


Reached here
Reached here
Reached here
Reached here
Reached here
Reached here


user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,impression_time,impression_time_fixed,labels,recency_inview,recency_hist,recency_inview_new,recency_hist_new
u32,list[i32],list[i64],list[i64],u32,datetime[μs],list[datetime[μs]],list[i8],list[i64],list[i64],str,str
22779,"[9770333, 9769641, … 9770541]","[9774461, 9759544, … 9774461]",[9759966],48401,2023-05-21 21:06:50,"[2023-05-17 15:50:15, 2023-05-17 15:51:08, … 2023-05-18 06:26:39]","[0, 0, … 0]","[0, 0, … 0]","[0, 1, … 1]","""[0, 0, 0, 0, 0…","""[0, 1, 0, 1, 0…"
150224,"[9770604, 9769622, … 9735909]","[9778657, 9778682, … 9777397]",[9778661],152513,2023-05-24 07:31:26,"[2023-05-17 18:32:59, 2023-05-17 18:33:08, … 2023-05-18 06:13:47]","[0, 0, … 0]","[0, 0, … 1]","[0, 1, … 0]","""[0, 0, 2518, 0…","""[0, 1, 1, 0, 7…"
160892,"[9767604, 9766264, … 9770178]","[9778448, 9778155, … 9778351]",[9777856],155390,2023-05-24 07:30:33,"[2023-05-17 07:09:32, 2023-05-17 07:11:01, … 2023-05-17 15:51:19]","[0, 0, … 0]","[2, 5, … 5]","[2, 14, … 0]","""[2, 5, 2, 5, 5…","""[2, 14, 47, 1,…"
1001055,"[9767722, 9770882, … 9769981]","[9776855, 9776246, … 9776808]",[9776566],214679,2023-05-23 05:25:40,"[2023-05-17 05:29:08, 2023-05-18 05:12:44, … 2023-05-18 05:31:44]","[0, 0, … 0]","[0, 0, … 0]","[6, 0, … 4]","""[0, 0, 0, 0, 0…","""[6, 0, 4, 4, 5…"
1001055,"[9767722, 9770882, … 9769981]","[9774840, 9776449, … 9776691]",[9776553],214681,2023-05-23 05:31:54,"[2023-05-17 05:29:08, 2023-05-18 05:12:44, … 2023-05-18 05:31:44]","[0, 0, … 0]","[19, 5, … 1]","[6, 0, … 4]","""[19, 5, 3, 4, …","""[6, 0, 4, 4, 5…"


In [90]:
with pl.Config(fmt_str_lengths=50):
        print(df_train)

shape: (24_888, 12)
┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ user_id ┆ article_id ┆ article_i ┆ article_i ┆ … ┆ recency_i ┆ recency_h ┆ recency_i ┆ recency_h │
│ ---     ┆ _fixed     ┆ ds_inview ┆ ds_clicke ┆   ┆ nview     ┆ ist       ┆ nview_new ┆ ist_new   │
│ u32     ┆ ---        ┆ ---       ┆ d         ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│         ┆ list[i32]  ┆ list[i64] ┆ ---       ┆   ┆ list[i64] ┆ list[i64] ┆ str       ┆ str       │
│         ┆            ┆           ┆ list[i64] ┆   ┆           ┆           ┆           ┆           │
╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 22779   ┆ [9770333,  ┆ [9774461, ┆ [9759966] ┆ … ┆ [0, 0, …  ┆ [0, 1, …  ┆ [0, 0, 0, ┆ [0, 1, 0, │
│         ┆ 9769641, … ┆ 9759544,  ┆           ┆   ┆ 0]        ┆ 1]        ┆ 0, 0]     ┆ 1, 0, 1,  │
│         ┆ 9770541]   ┆ …         ┆           ┆   ┆           ┆       

In [91]:
df_train = df_train.drop(['recency_inview','recency_hist'])
df_train = df_train.rename({"recency_inview_new": "recency_inview"})
df_train = df_train.rename({"recency_hist_new": "recency_hist"})

In [92]:
df_train.columns

['user_id',
 'article_id_fixed',
 'article_ids_inview',
 'article_ids_clicked',
 'impression_id',
 'impression_time',
 'impression_time_fixed',
 'labels',
 'recency_inview',
 'recency_hist']

In [93]:
df_train, token_col_title = convert_text2encoding_with_transformers(
    df_train, transformer_tokenizer, 'recency_inview', max_length=5
)
df_train, token_col_title = convert_text2encoding_with_transformers(
    df_train, transformer_tokenizer, 'recency_hist', max_length=5
)

df_train.head()

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,impression_time,impression_time_fixed,labels,recency_inview,recency_hist,recency_inview_encode_FacebookAI/xlm-roberta-base,recency_hist_encode_FacebookAI/xlm-roberta-base
u32,list[i32],list[i64],list[i64],u32,datetime[μs],list[datetime[μs]],list[i8],str,str,list[i64],list[i64]
22779,"[9770333, 9769641, … 9770541]","[9774461, 9759544, … 9774461]",[9759966],48401,2023-05-21 21:06:50,"[2023-05-17 15:50:15, 2023-05-17 15:51:08, … 2023-05-18 06:26:39]","[0, 0, … 0]","""[0, 0, 0, 0, 0…","""[0, 1, 0, 1, 0…","[378, 2389, … 4]","[378, 2389, … 4]"
150224,"[9770604, 9769622, … 9735909]","[9778657, 9778682, … 9777397]",[9778661],152513,2023-05-24 07:31:26,"[2023-05-17 18:32:59, 2023-05-17 18:33:08, … 2023-05-18 06:13:47]","[0, 0, … 0]","""[0, 0, 2518, 0…","""[0, 1, 1, 0, 7…","[378, 2389, … 4]","[378, 2389, … 4]"
160892,"[9767604, 9766264, … 9770178]","[9778448, 9778155, … 9778351]",[9777856],155390,2023-05-24 07:30:33,"[2023-05-17 07:09:32, 2023-05-17 07:11:01, … 2023-05-17 15:51:19]","[0, 0, … 0]","""[2, 5, 2, 5, 5…","""[2, 14, 47, 1,…","[378, 304, … 4]","[378, 304, … 4]"
1001055,"[9767722, 9770882, … 9769981]","[9776855, 9776246, … 9776808]",[9776566],214679,2023-05-23 05:25:40,"[2023-05-17 05:29:08, 2023-05-18 05:12:44, … 2023-05-18 05:31:44]","[0, 0, … 0]","""[0, 0, 0, 0, 0…","""[6, 0, 4, 4, 5…","[378, 2389, … 4]","[378, 910, … 4]"
1001055,"[9767722, 9770882, … 9769981]","[9774840, 9776449, … 9776691]",[9776553],214681,2023-05-23 05:31:54,"[2023-05-17 05:29:08, 2023-05-18 05:12:44, … 2023-05-18 05:31:44]","[0, 0, … 0]","""[19, 5, 3, 4, …","""[6, 0, 4, 4, 5…","[378, 2947, … 4]","[378, 910, … 4]"


In [94]:
df_train = df_train.drop(['recency_inview','recency_hist'])
df_train = df_train.rename({"recency_inview_encode_FacebookAI/xlm-roberta-base": "recency_inview"})
df_train = df_train.rename({"recency_hist_encode_FacebookAI/xlm-roberta-base": "recency_hist"})

In [95]:
df_train.head()

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,impression_time,impression_time_fixed,labels,recency_inview,recency_hist
u32,list[i32],list[i64],list[i64],u32,datetime[μs],list[datetime[μs]],list[i8],list[i64],list[i64]
22779,"[9770333, 9769641, … 9770541]","[9774461, 9759544, … 9774461]",[9759966],48401,2023-05-21 21:06:50,"[2023-05-17 15:50:15, 2023-05-17 15:51:08, … 2023-05-18 06:26:39]","[0, 0, … 0]","[378, 2389, … 4]","[378, 2389, … 4]"
150224,"[9770604, 9769622, … 9735909]","[9778657, 9778682, … 9777397]",[9778661],152513,2023-05-24 07:31:26,"[2023-05-17 18:32:59, 2023-05-17 18:33:08, … 2023-05-18 06:13:47]","[0, 0, … 0]","[378, 2389, … 4]","[378, 2389, … 4]"
160892,"[9767604, 9766264, … 9770178]","[9778448, 9778155, … 9778351]",[9777856],155390,2023-05-24 07:30:33,"[2023-05-17 07:09:32, 2023-05-17 07:11:01, … 2023-05-17 15:51:19]","[0, 0, … 0]","[378, 304, … 4]","[378, 304, … 4]"
1001055,"[9767722, 9770882, … 9769981]","[9776855, 9776246, … 9776808]",[9776566],214679,2023-05-23 05:25:40,"[2023-05-17 05:29:08, 2023-05-18 05:12:44, … 2023-05-18 05:31:44]","[0, 0, … 0]","[378, 2389, … 4]","[378, 910, … 4]"
1001055,"[9767722, 9770882, … 9769981]","[9774840, 9776449, … 9776691]",[9776553],214681,2023-05-23 05:31:54,"[2023-05-17 05:29:08, 2023-05-18 05:12:44, … 2023-05-18 05:31:44]","[0, 0, … 0]","[378, 2947, … 4]","[378, 910, … 4]"


# For validation

In [96]:
df_validation = df_validation.with_columns(
    pl.format("[{}]",
      pl.col("recency_inview").cast(pl.List(pl.Utf8)).list.join(", ")).alias("recency_inview_new"),
    pl.format("[{}]",
      pl.col("recency_hist").cast(pl.List(pl.Utf8)).list.join(", ")).alias("recency_hist_new")
)
df_validation.head()


Reached here
Reached here
Reached here
Reached here
Reached here
Reached here


user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,impression_time,impression_time_fixed,labels,recency_inview,recency_hist,recency_inview_new,recency_hist_new
u32,list[i32],list[i64],list[i64],u32,datetime[μs],list[datetime[μs]],list[i8],list[i64],list[i64],str,str
76658,"[9776322, 9776315, … 9779045]","[9553264, 9783042, … 9553264]",[9783042],144772,2023-05-30 14:21:34,"[2023-05-22 19:54:50, 2023-05-22 19:55:51, … 2023-05-24 19:31:08]","[0, 1, … 0]","[1946, 0, … 1946]","[1, 1, … 0]","""[1946, 0, 1946…","""[1, 1, 1, 1, 0…"
76658,"[9776322, 9776315, … 9779045]","[9788125, 9788361, … 6741781]",[9788125],144777,2023-05-30 14:22:11,"[2023-05-22 19:54:50, 2023-05-22 19:55:51, … 2023-05-24 19:31:08]","[1, 0, … 0]","[2, 1, … 25669]","[1, 1, … 0]","""[2, 1, 1946, 1…","""[1, 1, 1, 1, 0…"
760446,"[9776688, 9775804, … 9778375]","[9784444, 9784444, … 9784506]",[9782806],196487,2023-05-27 19:54:18,"[2023-05-23 03:39:36, 2023-05-23 03:40:37, … 2023-05-24 03:56:24]","[0, 0, … 0]","[1, 1, … 0]","[1, 3, … 2]","""[1, 1, 1, 1, 0…","""[1, 3, 0, 0, 0…"
760446,"[9776688, 9775804, … 9778375]","[9784662, 9784642, … 9782656]",[9782656],196495,2023-05-27 19:53:48,"[2023-05-23 03:39:36, 2023-05-23 03:40:37, … 2023-05-24 03:56:24]","[0, 0, … 1]","[0, 0, … 0]","[1, 3, … 2]","""[0, 0, 0, 0, 0…","""[1, 3, 0, 0, 0…"
760446,"[9776688, 9775804, … 9778375]","[9779370, 9784430, … 9782517]",[9777324],196496,2023-05-27 19:56:28,"[2023-05-23 03:39:36, 2023-05-23 03:40:37, … 2023-05-24 03:56:24]","[0, 0, … 0]","[1, 1, … 12]","[1, 3, … 2]","""[1, 1, 1, 1, 1…","""[1, 3, 0, 0, 0…"


In [97]:
with pl.Config(fmt_str_lengths=50):
        print(df_validation)

shape: (25_505, 12)
┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ user_id ┆ article_id ┆ article_i ┆ article_i ┆ … ┆ recency_i ┆ recency_h ┆ recency_i ┆ recency_h │
│ ---     ┆ _fixed     ┆ ds_inview ┆ ds_clicke ┆   ┆ nview     ┆ ist       ┆ nview_new ┆ ist_new   │
│ u32     ┆ ---        ┆ ---       ┆ d         ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│         ┆ list[i32]  ┆ list[i64] ┆ ---       ┆   ┆ list[i64] ┆ list[i64] ┆ str       ┆ str       │
│         ┆            ┆           ┆ list[i64] ┆   ┆           ┆           ┆           ┆           │
╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 76658   ┆ [9776322,  ┆ [9553264, ┆ [9783042] ┆ … ┆ [1946, 0, ┆ [1, 1, …  ┆ [1946, 0, ┆ [1, 1, 1, │
│         ┆ 9776315, … ┆ 9783042,  ┆           ┆   ┆ … 1946]   ┆ 0]        ┆ 1946,     ┆ 1, 0, 1,  │
│         ┆ 9779045]   ┆ …         ┆           ┆   ┆           ┆       

In [98]:
df_validation = df_validation.drop(['recency_inview','recency_hist'])
df_validation = df_validation.rename({"recency_inview_new": "recency_inview"})
df_validation = df_validation.rename({"recency_hist_new": "recency_hist"})

In [99]:
df_validation, token_col_title = convert_text2encoding_with_transformers(
    df_validation, transformer_tokenizer, 'recency_inview', max_length=5
)
df_validation, token_col_title = convert_text2encoding_with_transformers(
    df_validation, transformer_tokenizer, 'recency_hist', max_length=5
)

df_validation.head()

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,impression_time,impression_time_fixed,labels,recency_inview,recency_hist,recency_inview_encode_FacebookAI/xlm-roberta-base,recency_hist_encode_FacebookAI/xlm-roberta-base
u32,list[i32],list[i64],list[i64],u32,datetime[μs],list[datetime[μs]],list[i8],str,str,list[i64],list[i64]
76658,"[9776322, 9776315, … 9779045]","[9553264, 9783042, … 9553264]",[9783042],144772,2023-05-30 14:21:34,"[2023-05-22 19:54:50, 2023-05-22 19:55:51, … 2023-05-24 19:31:08]","[0, 1, … 0]","""[1946, 0, 1946…","""[1, 1, 1, 1, 0…","[378, 199352, … 4]","[68252, 4, … 106]"
76658,"[9776322, 9776315, … 9779045]","[9788125, 9788361, … 6741781]",[9788125],144777,2023-05-30 14:22:11,"[2023-05-22 19:54:50, 2023-05-22 19:55:51, … 2023-05-24 19:31:08]","[1, 0, … 0]","""[2, 1, 1946, 1…","""[1, 1, 1, 1, 0…","[378, 304, … 4]","[68252, 4, … 106]"
760446,"[9776688, 9775804, … 9778375]","[9784444, 9784444, … 9784506]",[9782806],196487,2023-05-27 19:54:18,"[2023-05-23 03:39:36, 2023-05-23 03:40:37, … 2023-05-24 03:56:24]","[0, 0, … 0]","""[1, 1, 1, 1, 0…","""[1, 3, 0, 0, 0…","[68252, 4, … 106]","[68252, 4, … 757]"
760446,"[9776688, 9775804, … 9778375]","[9784662, 9784642, … 9782656]",[9782656],196495,2023-05-27 19:53:48,"[2023-05-23 03:39:36, 2023-05-23 03:40:37, … 2023-05-24 03:56:24]","[0, 0, … 1]","""[0, 0, 0, 0, 0…","""[1, 3, 0, 0, 0…","[378, 2389, … 4]","[68252, 4, … 757]"
760446,"[9776688, 9775804, … 9778375]","[9779370, 9784430, … 9782517]",[9777324],196496,2023-05-27 19:56:28,"[2023-05-23 03:39:36, 2023-05-23 03:40:37, … 2023-05-24 03:56:24]","[0, 0, … 0]","""[1, 1, 1, 1, 1…","""[1, 3, 0, 0, 0…","[68252, 4, … 106]","[68252, 4, … 757]"


In [100]:
df_validation.columns

['user_id',
 'article_id_fixed',
 'article_ids_inview',
 'article_ids_clicked',
 'impression_id',
 'impression_time',
 'impression_time_fixed',
 'labels',
 'recency_inview',
 'recency_hist',
 'recency_inview_encode_FacebookAI/xlm-roberta-base',
 'recency_hist_encode_FacebookAI/xlm-roberta-base']

In [101]:
df_validation = df_validation.drop(['recency_inview','recency_hist'])
df_validation = df_validation.rename({"recency_inview_encode_FacebookAI/xlm-roberta-base": "recency_inview"})
df_validation = df_validation.rename({"recency_hist_encode_FacebookAI/xlm-roberta-base": "recency_hist"})

In [102]:
df_validation.head()

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,impression_time,impression_time_fixed,labels,recency_inview,recency_hist
u32,list[i32],list[i64],list[i64],u32,datetime[μs],list[datetime[μs]],list[i8],list[i64],list[i64]
76658,"[9776322, 9776315, … 9779045]","[9553264, 9783042, … 9553264]",[9783042],144772,2023-05-30 14:21:34,"[2023-05-22 19:54:50, 2023-05-22 19:55:51, … 2023-05-24 19:31:08]","[0, 1, … 0]","[378, 199352, … 4]","[68252, 4, … 106]"
76658,"[9776322, 9776315, … 9779045]","[9788125, 9788361, … 6741781]",[9788125],144777,2023-05-30 14:22:11,"[2023-05-22 19:54:50, 2023-05-22 19:55:51, … 2023-05-24 19:31:08]","[1, 0, … 0]","[378, 304, … 4]","[68252, 4, … 106]"
760446,"[9776688, 9775804, … 9778375]","[9784444, 9784444, … 9784506]",[9782806],196487,2023-05-27 19:54:18,"[2023-05-23 03:39:36, 2023-05-23 03:40:37, … 2023-05-24 03:56:24]","[0, 0, … 0]","[68252, 4, … 106]","[68252, 4, … 757]"
760446,"[9776688, 9775804, … 9778375]","[9784662, 9784642, … 9782656]",[9782656],196495,2023-05-27 19:53:48,"[2023-05-23 03:39:36, 2023-05-23 03:40:37, … 2023-05-24 03:56:24]","[0, 0, … 1]","[378, 2389, … 4]","[68252, 4, … 757]"
760446,"[9776688, 9775804, … 9778375]","[9779370, 9784430, … 9782517]",[9777324],196496,2023-05-27 19:56:28,"[2023-05-23 03:39:36, 2023-05-23 03:40:37, … 2023-05-24 03:56:24]","[0, 0, … 0]","[68252, 4, … 106]","[68252, 4, … 757]"


In [103]:
df_articles.head()

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label,subtitle-title,subtitle-title_encode_FacebookAI/xlm-roberta-base,ner_clusters_text,ner_clusters_text_encode_FacebookAI/xlm-roberta-base,title_embed_length,entity_embed_length
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str,str,list[i64],str,list[i64],u32,u32
3000022,"""Hanks beskyldt…","""Tom Hanks har …",2023-06-29 06:20:32,False,"""Tom Hanks skul…",2006-09-20 09:24:18,[3518381],"""article_defaul…","""https://ekstra…","[""David Gardner""]","[""PER""]","[""Kriminalitet"", ""Kendt"", … ""Litteratur""]",414,[432],"""underholdning""",,,,0.9911,"""Negative""","""Tom Hanks har …","[8352, 2548, … 56]","""David Gardner""","[6765, 90968, … 1]",30,30
3000063,"""Bostrups aske …","""Studieværten b…",2023-06-29 06:20:32,False,"""Strålende sens…",2006-09-24 07:45:30,"[3170935, 3170939]","""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Underholdning"", … ""Personlig begivenhed""]",118,[133],"""nyheder""",,,,0.5155,"""Neutral""","""Studieværten b…","[60716, 17052, … 1]","""""","[1, 1, … 1]",30,30
3000613,"""Jesper Olsen r…","""Den tidligere …",2023-06-29 06:20:33,False,"""Jesper Olsen, …",2006-05-09 11:29:00,[3164998],"""article_defaul…","""https://ekstra…","[""Frankrig"", ""Jesper Olsen"", … ""Jesper Olsen""]","[""LOC"", ""PER"", … ""PER""]","[""Kendt"", ""Sport"", … ""Sygdom og behandling""]",142,"[196, 271]","""sport""",,,,0.9876,"""Negative""","""Den tidligere …","[1575, 12532, … 111326]","""Frankrig-Jespe…","[192380, 9, … 1]",30,30
3000700,"""Madonna topløs…","""47-årige Madon…",2023-06-29 06:20:33,False,"""Skal du have s…",2006-05-04 11:03:12,[3172046],"""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Livsstil"", ""Underholdning""]",414,[432],"""underholdning""",,,,0.8786,"""Neutral""","""47-årige Madon…","[7657, 9, … 22907]","""""","[1, 1, … 1]",30,30
3000840,"""Otto Brandenbu…","""Sangeren og sk…",2023-06-29 06:20:33,False,"""'Og lidt for S…",2007-03-01 18:34:00,[3914446],"""article_defaul…","""https://ekstra…",[],[],"[""Kendt"", ""Underholdning"", … ""Musik og lyd""]",118,[133],"""nyheder""",,,,0.9468,"""Negative""","""Sangeren og sk…","[22986, 3683, … 1]","""""","[1, 1, … 1]",30,30


In [104]:

# df_train = df_train.drop(['impression_time','c'])
# df_validation = df_validation.drop(['impression_time','c'])

In [46]:
# df_train.write_parquet("small_demo_train_all_features.parquet")
# df_validation.write_parquet("small_demo_val_all_features.parquet")

In [298]:
df_train.head()

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,impression_time,impression_time_fixed,labels,recency_inview,recency_hist
u32,list[i32],list[i64],list[i64],u32,datetime[μs],list[datetime[μs]],list[i8],list[i64],list[i64]
22779,"[9770333, 9769641, … 9770541]","[9774461, 9759966, … 9775371]",[9759966],48401,2023-05-21 21:06:50,"[2023-05-17 15:50:15, 2023-05-17 15:51:08, … 2023-05-18 06:26:39]","[0, 1, … 0]","[0, 0, … 0]","[0, 1, … 1]"
150224,"[9770604, 9769622, … 9735909]","[9778682, 9778657, … 9778661]",[9778661],152513,2023-05-24 07:31:26,"[2023-05-17 18:32:59, 2023-05-17 18:33:08, … 2023-05-18 06:13:47]","[0, 0, … 1]","[0, 0, … 0]","[0, 1, … 0]"
160892,"[9767604, 9766264, … 9770178]","[9778351, 9778448, … 9778351]",[9777856],155390,2023-05-24 07:30:33,"[2023-05-17 07:09:32, 2023-05-17 07:11:01, … 2023-05-17 15:51:19]","[0, 0, … 0]","[5, 2, … 5]","[2, 14, … 0]"
1001055,"[9767722, 9770882, … 9769981]","[9776566, 9776246, … 9776855]",[9776566],214679,2023-05-23 05:25:40,"[2023-05-17 05:29:08, 2023-05-18 05:12:44, … 2023-05-18 05:31:44]","[1, 0, … 0]","[0, 0, … 0]","[6, 0, … 4]"
1001055,"[9767722, 9770882, … 9769981]","[9776553, 9776449, … 9776691]",[9776553],214681,2023-05-23 05:31:54,"[2023-05-17 05:29:08, 2023-05-18 05:12:44, … 2023-05-18 05:31:44]","[1, 0, … 0]","[4, 5, … 1]","[6, 0, … 4]"


In [299]:
val_dataloader.__getitem__(1)

Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here


((array([[[   159, 100038,    191, ...,  31143,      9,  17086],
          [121525,     33,     17, ...,     17, 154759,   6530],
          [ 32972,  16471,    107, ...,    100,  15875,   7250],
          ...,
          [   893,  22666,    261, ...,    271,  20807,      6],
          [ 50715,     18,  14936, ...,      1,      1,      1],
          [  8301,    332, 136617, ...,   1004,    149,  44075]],
  
         [[   159, 100038,    191, ...,  31143,      9,  17086],
          [121525,     33,     17, ...,     17, 154759,   6530],
          [ 32972,  16471,    107, ...,    100,  15875,   7250],
          ...,
          [   893,  22666,    261, ...,    271,  20807,      6],
          [ 50715,     18,  14936, ...,      1,      1,      1],
          [  8301,    332, 136617, ...,   1004,    149,  44075]],
  
         [[   159, 100038,    191, ...,  31143,      9,  17086],
          [121525,     33,     17, ...,     17, 154759,   6530],
          [ 32972,  16471,    107, ...,    100,  158

# Popularity embedding is built from CTR and recency embedding

# Features adding - DONE

# Test Pytorch modules

# 1. KnowledgeAwareNewsEncoder

In [322]:
from src.model.modules import KnowledgeAwareNewsEncoder
model = KnowledgeAwareNewsEncoder(hparams_pprec,word2vec_embedding,seed=123)
for i, data in enumerate(train_dataloader):
        # Every data instance is an input + label pair
        inputs, labels = data
        title_data = inputs[5]
        entity_data = inputs[6]
        #print(title_data.shape)
        outputs = model(title_data,entity_data)
        print("KnowledgeAwareNewsEncoder output shape:",outputs.shape)
        



Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Entity cross attention: torch.Size([3, 150, 768])
KnowledgeAwareNewsEncoder output shape: torch.Size([3, 150, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Entity cross attention: torch.Size([3, 150, 768])
KnowledgeAwareNewsEncoder output shape: torch.Size([3, 150, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Entity cross attention: torch.Size([3, 150, 768])
KnowledgeAwareNewsEncoder output shape: torch.Size([3, 150, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Entity cross attention: torc

# 2. Time-aware news popularity predictor

In [3]:
COLUMNS = [
   'user_id',
   'article_id_fixed',
   'article_ids_inview',
   'article_ids_clicked',
   'impression_id',
   'labels',
   'recency_inview',
   'recency_hist'  
]
df_train  = pl.scan_parquet("small_demo_train_all_features.parquet").select(COLUMNS).collect()

df_validation =  pl.scan_parquet("small_demo_val_all_features.parquet").select(COLUMNS).collect()

In [33]:
import pickle 

with open('article_mapping_title.pkl', 'wb') as f:
    pickle.dump(article_mapping_title, f)



with open('article_mapping_entity.pkl', 'wb') as f:
    pickle.dump(article_mapping_entity, f)


with open('articles_ctr.pkl', 'wb') as f:
    pickle.dump(articles_ctr, f)   


In [262]:
train_dataloader = PPRecDataLoader(
    behaviors=df_train,
    article_dict=article_mapping_title,
    entity_mapping=article_mapping_entity,
    popularity_mapping=pop_articles,
    ctr_mapping=articles_ctr,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    history_recency = 'recency_hist',
    inview_recency = 'recency_inview',
    eval_mode=False,
    batch_size=3,
)

In [122]:
val_dataloader = PPRecDataLoader(
    behaviors=df_validation,
    article_dict=article_mapping_title,
    entity_mapping=article_mapping_entity,
    ctr_mapping=articles_ctr,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    history_recency = 'recency_hist',
    inview_recency = 'recency_inview',
    eval_mode=True,
    batch_size=32,
)

TypeError: 'NoneType' object is not iterable

In [335]:
from src.model.modules import TimeAwarePopularityEncoder
model = TimeAwarePopularityEncoder(word2vec_embedding=word2vec_embedding, seed=123)


In [338]:
for i, data in enumerate(train_dataloader):
        # print("HAA")
        # Every data instance is an input + label pair
        inputs, labels = data
        title_data = inputs[5]
        ctr = inputs[7]
        recency = inputs[8]
        
       
        print("Input title shape:",title_data.shape)
        print("Input recency shape:",recency.shape)
        print("Input ctr shape:",ctr.shape)

        
        
        outputs = model(title_data,recency,ctr)
        print("TimeAwarePopularityEncoder output shape:",outputs.shape)
        


Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Input title shape: (3, 5, 30)
Input recency shape: (3, 5)
Input ctr shape: (3, 5)
TimeAwarePopularityEncoder output shape: torch.Size([3, 5, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Input title shape: (3, 5, 30)
Input recency shape: (3, 5)
Input ctr shape: (3, 5)
TimeAwarePopularityEncoder output shape: torch.Size([3, 5, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Input title shape: (3, 5, 30)
Input recency shape: (3, 5)
Input ctr shape: (3, 5)
TimeAwarePopularityEncoder output shape: torch.Size([3, 5, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here


# Add popularity info

In [105]:
pop_articles = {}
recency_articles = {}
inview_articles_train = df_train['article_ids_inview'].to_list()
inview_articles_val = df_validation['article_ids_inview'].to_list()

train_recency = df_train['recency_inview'].to_list()
val_recency = df_train['recency_inview'].to_list()

In [106]:
for outer in range(len(inview_articles_train)):
    for inner in range(len(inview_articles_train[outer])):
        if article not in recency_articles:
            recency_articles[article] = inview_articles_train[outer][inner]
        elif inview_articles_train[outer][inner] < recency_articles[article]:
            recency_articles[article] = inview_articles_train[outer][inner]

for outer in range(len(inview_articles_val)):
    for inner in range(len(inview_articles_val[outer])):
        if article not in recency_articles:
            recency_articles[article] = inview_articles_val[outer][inner]
        elif inview_articles_val[outer][inner] < recency_articles[article]:
            recency_articles[article] = inview_articles_val[outer][inner]




In [107]:
for key,val in articles_ctr.items():
    if key not in recency_articles:
        pop_articles[key] = val
    else:
        pop_articles[key] = val/recency_articles[key]+1
         


In [78]:
train_dataloader = PPRecDataLoader(
    behaviors=df_train,
    article_dict=article_mapping_title,
    entity_mapping=article_mapping_entity,
    ctr_mapping=articles_ctr,
    popularity_mapping = popularity_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    history_recency = 'recency_hist',
    inview_recency = 'recency_inview',
    eval_mode=False,
    batch_size=4,
)

In [124]:
val_dataloader = PPRecDataLoader(
    behaviors=df_validation,
    article_dict=article_mapping_title,
    entity_mapping=article_mapping_entity,
    ctr_mapping=articles_ctr,
    popularity_mapping = popularity_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    history_recency = 'recency_hist',
    inview_recency = 'recency_inview',
    eval_mode=True,
    batch_size=4,
)

In [92]:
with open('popularity_mapping.pkl', 'wb') as f:
    pickle.dump(pop_articles, f)   


In [8]:
import pickle
article_mapping_title, article_mapping_entity, articles_ctr, popularity_mapping = {},{},{},{}
with open('article_mapping_title.pkl', 'rb') as handle:
    article_mapping_title = pickle.load(handle)
with open('article_mapping_entity.pkl', 'rb') as handle:
    article_mapping_entity = pickle.load(handle)
with open('articles_ctr.pkl', 'rb') as handle:
    articles_ctr = pickle.load(handle)
with open('popularity_mapping.pkl', 'rb') as handle:
    popularity_mapping = pickle.load(handle)

COLUMNS = [
   'user_id',
   'article_id_fixed',
   'article_ids_inview',
   'article_ids_clicked',
   'impression_id',
   'labels',
   'recency_inview',
   'recency_hist'  
]
df_train  = pl.scan_parquet("small_demo_train_all_features.parquet").select(COLUMNS).collect()

df_validation =  pl.scan_parquet("small_demo_val_all_features_with_sampling.parquet").select(COLUMNS).collect()

# Test Popularity Aware User Encoder

In [23]:
from src.model.modules import PopularityAwareUserEncoder

model = PopularityAwareUserEncoder(hparams_pprec, word2vec_embedding=word2vec_embedding, seed=123)
for i, data in enumerate(train_dataloader):
        # Every data instance is an i
        # nput + label pair
        inputs, labels = data
        hist_title = inputs[0]
        popularity = inputs[2]
        outputs = model(hist_title, popularity)
        print("PopularityAwareUserEncoder output shape:",outputs.shape)
        

Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
After: torch.Size([3, 10, 768])
PopularityAwareUserEncoder output shape: torch.Size([3, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
After: torch.Size([3, 10, 768])
PopularityAwareUserEncoder output shape: torch.Size([3, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
After: torch.Size([3, 10, 768])
PopularityAwareUserEncoder output shape: torch.Size([3, 768])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
After: torch.Size([3, 10, 768])
PopularityAwareUserEncoder output shape: torch.Size([3, 768])


# Test whole PPRec

In [37]:
from model.modules import PPRec

model = PPRec(hparams_pprec,word2vec_embedding)

In [38]:
# Popularity is actually CTR of articles present in user clicked  history
for i, data in enumerate(train_dataloader):
        # Every data instance is an i
        # nput + label pair
        inputs, labels = data
        title = inputs[5]
        entities = inputs[6]
        ctr = inputs[7]
        recency = inputs[8]
        hist_title = inputs[0]
        hist_popularity = inputs[2]

        #print(labels.shape)
        outputs = model(title, entities, ctr, recency ,hist_title, hist_popularity )
        print("PopularityAwareUserEncoder output shape:",outputs.shape)

Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
PopularityAwareUserEncoder output shape: torch.Size([3, 5])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
PopularityAwareUserEncoder output shape: torch.Size([3, 5])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
PopularityAwareUserEncoder output shape: torch.Size([3, 5])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
PopularityAwareUserEncoder output shape: torch.Size([3, 5])
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reache

# Loss function and update gradients

In [76]:
from model.modules import BPELoss
criterion = BPELoss()
for i, data in enumerate(train_dataloader):
        
        
        inputs, labels = data
        title = inputs[5]
        entities = inputs[6]
        ctr = inputs[7]
        recency = inputs[8]
        hist_title = inputs[0]
        hist_popularity = inputs[2]

        outputs = model(title, entities, ctr, recency ,hist_title, hist_popularity )
        loss = criterion(outputs, labels)
        print("Loss:",loss)
        

Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Loss: tensor(0.6923, grad_fn=<NegBackward0>)
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Loss: tensor(0.6933, grad_fn=<NegBackward0>)
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Loss: tensor(0.6921, grad_fn=<NegBackward0>)
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Loss: tensor(0.6927, grad_fn=<NegBackward0>)
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Loss: tensor(0.6921, grad_fn=<NegBackwar

# Whole training and validation loop

In [198]:
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import torch 
import polars as pl

from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_SUBTITLE_COL,
    DEFAULT_LABELS_COL,
    DEFAULT_TITLE_COL,
    DEFAULT_USER_COL,
    DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_IMPRESSION_TIMESTAMP_COL
)

from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history,
)
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
from ebrec.utils._articles import convert_text2encoding_with_transformers,concat_list_to_text
from ebrec.utils._polars import concat_str_columns, slice_join_dataframes
from ebrec.utils._articles import create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._python import write_submission_file, rank_predictions_by_score

%load_ext autoreload
%autoreload 2

from src.model.dataloader import PPRecDataLoader
from src.model.model_config import hparams_pprec
from src.model.pprec import PPRec
PATH = Path("/Users/sohamchatterjee/Documents/UvA/RecSYS/Project/ebnerd_data")
DATASPLIT = "ebnerd_demo"

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
cpu


In [199]:
df_articles = pl.read_parquet(PATH.joinpath("articles.parquet"))
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
MAX_TITLE_LENGTH = 30

# LOAD HUGGINGFACE:
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# We'll init the word embeddings using the
word2vec_embedding = get_transformers_word_embeddings(transformer_model)
#
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)
# =>
article_mapping_title = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)



In [54]:
import pickle
article_mapping_title, article_mapping_entity, articles_ctr, popularity_mapping = {},{},{},{}
with open('/Users/sohamchatterjee/Documents/UvA/RecSYS/Project/PP-Rec/OurCode/small_processed/article_mapping_title_SMALL.pkl', 'rb') as handle:
    article_mapping_title = pickle.load(handle)
with open('/Users/sohamchatterjee/Documents/UvA/RecSYS/Project/PP-Rec/OurCode/small_processed/article_mapping_entity_SMALL.pkl', 'rb') as handle:
    article_mapping_entity = pickle.load(handle)
with open('/Users/sohamchatterjee/Documents/UvA/RecSYS/Project/PP-Rec/OurCode/small_processed/articles_ctr_SMALL.pkl', 'rb') as handle:
    articles_ctr = pickle.load(handle)
with open('/Users/sohamchatterjee/Documents/UvA/RecSYS/Project/PP-Rec/OurCode/small_processed/popularity_mapping_SMALL.pkl', 'rb') as handle:
    popularity_mapping = pickle.load(handle)

COLUMNS = [
   'user_id',
   'article_id_fixed',
   'article_ids_inview',
   'article_ids_clicked',
   'impression_id',
   'labels',
   'recency_inview',
   'recency_hist'  
]
df_train  = pl.scan_parquet("/Users/sohamchatterjee/Documents/UvA/RecSYS/Project/PP-Rec/OurCode/small_processed/train_SMALL.parquet").select(COLUMNS).collect()

df_validation =  pl.scan_parquet("/Users/sohamchatterjee/Documents/UvA/RecSYS/Project/PP-Rec/OurCode/small_processed/val_SMALL.parquet").select(COLUMNS).collect()

In [55]:
df_train = df_train.head(10)

In [56]:
train_dataloader = PPRecDataLoader(
    behaviors=df_train,
    article_dict=article_mapping_title,
    entity_mapping=article_mapping_entity,
    ctr_mapping=articles_ctr,
    popularity_mapping = popularity_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    history_recency = 'recency_hist',
    inview_recency = 'recency_inview',
    eval_mode=False,
    batch_size=4,
)

In [57]:
val_dataloader = PPRecDataLoader(
    behaviors=df_validation,
    article_dict=article_mapping_title,
    entity_mapping=article_mapping_entity,
    ctr_mapping=articles_ctr,
    popularity_mapping = popularity_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    history_recency = 'recency_hist',
    inview_recency = 'recency_inview',
    eval_mode=True,
    batch_size=512,
)

In [59]:
from model.modules import PPRec
from model.modules import train_one_epoch
from datetime import datetime
from model.modules import BPELoss
from torch.utils.tensorboard import SummaryWriter
# Initializing in a separate cell so we can easily add more epochs to the same run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/pprec_check{}'.format(timestamp))
epoch_number = 0

EPOCHS = 5
loss_fn = BPELoss()
model = PPRec(hparams_pprec,word2vec_embedding)
model.to(device)
optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=0.01,
            weight_decay=1e-4
        )

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number, writer,train_dataloader, optimizer,model,loss_fn,device)


    running_vloss = 0.0
    # Set the model to evaluation mode, disabling dropout and using population
    # statistics for batch normalization.
    model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for i, vdata in enumerate(val_dataloader):
            vinputs, vlabels = vdata
            vtitle = vinputs[5]
            ventities = vinputs[6]
            vctr = vinputs[7]
            vrecency = vinputs[8]
            vhist_title = vinputs[0]
            vhist_popularity = vinputs[2]

            vtitle = torch.from_numpy(vtitle)
            ventities = torch.from_numpy(ventities)
            vctr = torch.from_numpy(vctr)
            vrecency = torch.from_numpy(vrecency)
            vhist_title = torch.from_numpy(vhist_title)
            vhist_popularity = torch.from_numpy(vhist_popularity)
            vlabels = torch.from_numpy(vlabels)
        
            # vtitle = vtitle.to(device)
            # ventities = ventities.to(device)
            # vctr = vctr.to(device)
            # vrecency = vrecency.to(device)
            # vhist_title = vhist_title.to(device)
            # vhist_popularity = vhist_popularity.to(device)
            # vlabels = vlabels.to(device)
            


            voutputs = model(vtitle, ventities, vctr, vrecency , vhist_title, vhist_popularity )
            # print(voutputs.shape)
            # print(vlabels.shape)
            vloss = loss_fn(voutputs, vlabels)
            running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
torch.bool
torch.float32
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
torch.bool
torch.float32
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
torch.bool
torch.float32
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
torch.bool
torch.float32
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached here
Reached

KeyboardInterrupt: 

In [138]:
# import torch

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# print(device)

cpu


# Store info  for DEMO split

In [109]:
cd /Users/sohamchatterjee/Documents/UvA/RecSYS/Project/PP-Rec/OurCode/demo_processed

/Users/sohamchatterjee/Documents/UvA/RecSYS/Project/PP-Rec/OurCode/demo_processed


In [110]:
df_train.write_parquet("train_DEMO.parquet")
df_validation.write_parquet("val_DEMO.parquet")

In [111]:
import pickle 

with open('article_mapping_title_DEMO.pkl', 'wb') as f:
    pickle.dump(article_mapping_title, f)



with open('article_mapping_entity_DEMO.pkl', 'wb') as f:
    pickle.dump(article_mapping_entity, f)


with open('articles_ctr_DEMO.pkl', 'wb') as f:
    pickle.dump(articles_ctr, f)   

with open('popularity_mapping_DEMO.pkl', 'wb') as f:
    pickle.dump(pop_articles, f)   



In [261]:

df_val1 = df_validation.with_columns(
            # pl.col('article_id_fixed').list.len().alias("click_history_length"),
            # pl.col('article_ids_inview').list.len().alias("click_inview_length"),
             pl.col('labels').list.len().alias("labels_length"),
           
        )
#df_val1.head()
print(df_val1.select ( pl.struct("recency_inview").n_unique()))

shape: (1, 1)
┌────────────────┐
│ recency_inview │
│ ---            │
│ u32            │
╞════════════════╡
│ 135285         │
└────────────────┘


In [377]:
df_val1.head()

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,impression_time,impression_time_fixed,labels,recency_inview,recency_hist,labels_length
u32,list[i32],list[i64],list[i64],u32,datetime[μs],list[datetime[μs]],list[i8],list[i64],list[i64],u32
22548,"[9774840, 9757574, … 9776929]","[9784710, 9784696, … 9784679]",[9784696],96791,2023-05-28 04:21:24,"[2023-05-21 15:11:36, 2023-05-21 15:11:51, … 2023-05-25 04:45:13]","[0, 1, … 0]","[3, 4, … 4]","[0, 1, … 21]",5
22548,"[9774840, 9757574, … 9776929]","[9784648, 9783405, … 9782656]",[9784281],96798,2023-05-28 04:31:48,"[2023-05-21 15:11:36, 2023-05-21 15:11:51, … 2023-05-25 04:45:13]","[0, 0, … 0]","[4, 4, … 4]","[0, 1, … 21]",5
22548,"[9774840, 9757574, … 9776929]","[9782108, 9783405, … 9784444]",[9784444],96801,2023-05-28 04:30:17,"[2023-05-21 15:11:36, 2023-05-21 15:11:51, … 2023-05-25 04:45:13]","[0, 0, … 1]","[22, 4, … 5]","[0, 1, … 21]",5
22548,"[9774840, 9757574, … 9776929]","[9784406, 9695098, … 9781983]",[9781983],96808,2023-05-28 04:27:19,"[2023-05-21 15:11:36, 2023-05-21 15:11:51, … 2023-05-25 04:45:13]","[0, 0, … 1]","[5, 487, … 22]","[0, 1, … 21]",5
22548,"[9774840, 9757574, … 9776929]","[9781520, 9782108, … 9782108]",[9784642],96810,2023-05-28 04:29:47,"[2023-05-21 15:11:36, 2023-05-21 15:11:51, … 2023-05-25 04:45:13]","[0, 0, … 0]","[29, 21, … 21]","[0, 1, … 21]",5


# Data preprocess for large data
# Split train into train and val ans use val as test