In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from src.utils import DATA_FOLDER
from src.data.split import EBNeRDSplit
import os

In [2]:
# Resolve the dataset root: use DATA_FOLDER (imported from src.utils) when it
# is truthy, otherwise fall back to a local "data" directory.
# set the env var or change this
data_folder = DATA_FOLDER or "data"
print(data_folder)

# Fail fast before any file is read. os.path.isdir is used instead of
# os.path.exists so that a regular file sitting at this path is rejected too;
# every later cell joins file names onto data_folder and needs a directory.
if not os.path.isdir(data_folder):
    raise ValueError(f"Data folder {data_folder} does not exist or is not a directory")

/Users/pepijnvanderklei/Data/RecSys


In [3]:
# Locations of the two top-level files inside the data folder:
# the articles table and the example submission file.
articles_parquet_path = os.path.join(data_folder, 'articles.parquet')
predictions_txt_path = os.path.join(data_folder, 'predictions.txt')

# Predictions

This is an example of the file we have to submit to the challenge. In the test set, there is a behaviors file which does include the inview articles column, but does not contain the clicked articles column. The goal is to submit a ranking of all the viewed articles by their chance of being clicked, i.e. just the ranking of the viewed articles based on the output of our model.

In [4]:
# The submission file is space-separated and has no header row, so the two
# columns are labelled right after parsing. set_axis raises on a column-count
# mismatch, just like assigning to .columns would.
predictions = (
    pd.read_csv(predictions_txt_path, sep=' ', header=None)
    .set_axis(['impression_id', 'rankings'], axis=1)
)


# Articles

These top-level articles are **all** the articles, I presume? The separate data folders `demo`, `small` and `large` all have an articles file as well.

In [5]:
# Load the top-level articles table, then print its column index followed by
# the frame itself (one per line, in that order).
articles = pd.read_parquet(articles_parquet_path)
print(articles.columns, articles, sep="\n")

Index(['article_id', 'title', 'subtitle', 'last_modified_time', 'premium',
       'body', 'published_time', 'image_ids', 'article_type', 'url',
       'ner_clusters', 'entity_groups', 'topics', 'category', 'subcategory',
       'category_str', 'total_inviews', 'total_pageviews', 'total_read_time',
       'sentiment_score', 'sentiment_label'],
      dtype='object')
        article_id                                              title  \
0          3000022                     Hanks beskyldt for mishandling   
1          3000063                    Bostrups aske spredt i Furesøen   
2          3000613                Jesper Olsen ramt af hjerneblødning   
3          3000700                           Madonna topløs med heste   
4          3000840                            Otto Brandenburg er død   
...            ...                                                ...   
125536     9803505                Flyvende Antonsen knuser topspiller   
125537     9803510  Nedschroef Langeskov udbetalt

# Data Splits

Info can be found here: https://recsys.eb.dk/dataset/

---

## Articles

This speaks for itself: all articles with an id, title, body, category, etc.

---

## Behaviours / Impressions

Every behaviour (i.e. impression) is a moment at which a user has seen a list of articles. I'll note the difference between the article id, the inview article ids and the clicked article ids, since it was a bit unclear to me what the article id was.

### Article Id 

This is the article from which the impression was made. Let's say this is YouTube: when viewing a video, there is a list of recommended videos on the right. The 'article id' would be the video that is being watched, and the 'impression' would be the list of recommended videos. This can be None if the user is not already watching a video (so reading an article, for us), but is on the homepage, for example.

### Inview Article Ids

These are the article ids of the recommended news articles (so the recommended videos in the case of YouTube).

### Clicked Article Ids

These are the articles (the videos, in the YouTube analogy) that were clicked from the list. There can be multiple, since a user might click one, then go back and click another one.

---

## History

This is just a simplified version of the behaviours file, where the user id is the main key and which contains all the articles ever clicked by a user. So we can use this to find the input data for the Popularity-aware User Encoder in an easier and cleaner way than with the behaviours file.

---





## Demo

In [6]:
# Build the train split of the "demo" dataset variant via the project's
# EBNeRDSplit class and print its summary (shapes plus, presumably because of
# show_columns=True, the column names of each table).
demo_train_split = EBNeRDSplit('train', "demo")
demo_train_split.summarize(show_columns=True)
# Preview the first rows of the split's history table.
# NOTE(review): _history is a private attribute of EBNeRDSplit; prefer a
# public accessor if the class offers one.
demo_train_split._history.head()

Articles: (11777, 21)
Columns Articles:  Index(['article_id', 'title', 'subtitle', 'last_modified_time', 'premium',
       'body', 'published_time', 'image_ids', 'article_type', 'url',
       'ner_clusters', 'entity_groups', 'topics', 'category', 'subcategory',
       'category_str', 'total_inviews', 'total_pageviews', 'total_read_time',
       'sentiment_score', 'sentiment_label'],
      dtype='object')


Behaviors: (24724, 17)
Columns Behaviors:  Index(['impression_id', 'article_id', 'impression_time', 'read_time',
       'scroll_percentage', 'device_type', 'article_ids_inview',
       'article_ids_clicked', 'user_id', 'is_sso_user', 'gender', 'postcode',
       'age', 'is_subscriber', 'session_id', 'next_read_time',
       'next_scroll_percentage'],
      dtype='object')


History: (1590, 5)
Columns History:  Index(['user_id', 'impression_time_fixed', 'scroll_percentage_fixed',
       'article_id_fixed', 'read_time_fixed'],
      dtype='object')


Unnamed: 0_level_0,user_id,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13538,13538,"[2023-04-27T10:17:43.000000, 2023-04-27T10:18:...","[100.0, 35.0, 100.0, 24.0, 100.0, 23.0, 100.0,...","[9738663, 9738569, 9738663, 9738490, 9738663, ...","[17.0, 12.0, 4.0, 5.0, 4.0, 9.0, 5.0, 46.0, 11..."
58608,58608,"[2023-04-27T18:48:09.000000, 2023-04-27T18:48:...","[37.0, 61.0, 100.0, 100.0, 55.0, 100.0, 100.0,...","[9739362, 9739179, 9738567, 9739344, 9739202, ...","[2.0, 24.0, 72.0, 65.0, 11.0, 4.0, 101.0, 0.0,..."
95507,95507,"[2023-04-27T15:20:28.000000, 2023-04-27T15:20:...","[60.0, 100.0, 100.0, 21.0, 29.0, 67.0, 49.0, 5...","[9739035, 9738646, 9634967, 9738902, 9735495, ...","[18.0, 29.0, 51.0, 12.0, 10.0, 10.0, 13.0, 24...."
106588,106588,"[2023-04-27T08:29:09.000000, 2023-04-27T08:29:...","[24.0, 57.0, 100.0, nan, nan, 100.0, 100.0, 73...","[9738292, 9738216, 9737266, 9737556, 9737657, ...","[9.0, 15.0, 42.0, 9.0, 3.0, 58.0, 26.0, 214.0,..."
617963,617963,"[2023-04-27T14:42:25.000000, 2023-04-27T14:43:...","[100.0, 100.0, nan, 46.0, 23.0, 19.0, 61.0, 70...","[9739035, 9739088, 9738902, 9738968, 9738760, ...","[45.0, 29.0, 116.0, 26.0, 34.0, 42.0, 58.0, 59..."


In [7]:
# Show 10 randomly chosen articles from the demo split. A fixed random_state
# makes the draw identical on every run, so the displayed rows stay
# reproducible after "Restart Kernel -> Run All".
# NOTE(review): _articles is a private attribute of EBNeRDSplit; prefer a
# public accessor if the class offers one.
demo_train_split._articles.sample(10, random_state=42)

Unnamed: 0_level_0,article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,...,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9744693,9744693,Restaurant på Nørrebro skifter sur smiley ud m...,Restauranten Falafel Factory på Nørrebro kan n...,2023-06-29 06:48:29,False,En sur smiley kan nu rives ned og en glad hæng...,2023-05-01 15:01:41,[9744692],article_default,https://ekstrabladet.dk/auto/magna/smileys/res...,...,"[ORG, ORG, LOC, LOC, LOC]","[Erhverv, Privat virksomhed, Livsstil, Kultur,...",2889,"[2890, 2901]",auto,10417.0,31.0,650.0,0.8632,Positive
9330229,9330229,Uhyggelige videoer: Riffelmand truer med selvmord,Fire videoklip viste før og under de dødbringe...,2023-06-29 06:42:30,True,Fire korte videoklip lagt ud på Youtube af den...,2022-07-08 05:05:32,"[9330323, 9330736, 9328690]",article_default,https://ekstrabladet.dk/krimi/uhyggelige-video...,...,"[ORG, ORG, ORG, LOC, PROD, PER, PROD]","[Kriminalitet, Personfarlig kriminalitet]",140,[],krimi,,,,0.9944,Negative
9747633,9747633,Danske Mike i Ukraine: - En helvedes masse vold,Bykamp er den mest ekstreme form for krig. Dan...,2023-08-25 12:44:47,True,Lyskeglen fra lommelygten på Mikes våben strej...,2023-05-07 08:28:24,"[9749125, 9638628, 9749121, 9647422, 9749122]",article_default,https://ekstrabladet.dk/krimi/danske-mike-i-uk...,...,"[LOC, ORG, ORG, ORG, PER, PER, ORG, MISC, MISC...","[Konflikt og krig, Væbnet konflikt]",140,[],krimi,1116167.0,40164.0,2976915.0,0.9101,Negative
9737393,9737393,Kønsskiftet voldtægtsdømt vil afsone med kvinder,"62-årig tidligere mand, der afsoner forvaring ...",2023-06-29 06:48:21,True,"En 62-årig, der biologisk er mand, men juridis...",2023-04-27 12:26:16,"[9737397, 9737395, 7673550]",article_default,https://ekstrabladet.dk/krimi/koensskiftet-vol...,...,"[ORG, LOC, ORG, PER, ORG, ORG, ORG]","[Kriminalitet, Personfarlig kriminalitet]",140,[],krimi,1307006.0,141959.0,7435040.0,0.8861,Negative
9737071,9737071,Efter bizart pressemøde: Nu trækker hun sig,Den engelske tennisspiller Emma Raducanu trækk...,2023-06-29 06:48:21,False,WTA-turneringen Madrid Open bliver en stjerne ...,2023-04-26 11:37:23,"[9736208, 9737088]",article_default,https://ekstrabladet.dk/sport/anden_sport/tenn...,...,"[PER, EVENT, ORG, EVENT, PER, ORG]","[Kendt, Begivenhed, Sport, Sundhed, Sygdom og ...",142,"[327, 349]",sport,539855.0,104596.0,4689783.0,0.9341,Negative
9486486,9486486,Prostatalægen: Sådan lever jeg selv,Prostatakræft er den mest hyppige kræftsygdom ...,2023-06-29 06:44:45,True,Klinisk professor og overlæge i urinvejskirurg...,2023-01-05 06:05:07,[9486502],article_default,https://ekstrabladet.dk/forbrug/sundhed/prosta...,...,"[LOC, PER, LOC, MISC]","[Livsstil, Samfund, Sundhed, Sygdom og behandl...",457,[475],forbrug,,,,0.8432,Neutral
8189648,8189648,Melina: Endnu en debutant på toppen!,"Melina, 24 år og fra Helsingør: Wow! Det er je...",2023-06-29 06:30:54,True,Se minigalleriet med Melina i bunden af artikl...,2020-07-04 21:30:00,"[8178743, 8178744, 8178742, 8178745, 8178743]",article_default,https://ekstrabladet.dk/side9/maanedens/melina...,...,[],"[Livsstil, Erotik]",572,[573],side9,,,,0.977,Positive
9751866,9751866,Alvorlige anklager mod Rusland,"Ukraine har frigivet optagelser, der angivelig...",2023-06-29 06:48:36,False,Den brutale kamp om Bakhmut fortsætter.\nSiden...,2023-05-06 14:49:44,[9744934],article_default,https://ekstrabladet.dk/nyheder/krigogkatastro...,...,"[LOC, LOC, PER, ORG, PER, LOC, ORG, ORG, PER, ...","[Konflikt og krig, Væbnet konflikt]",118,[127],nyheder,585474.0,117934.0,9505974.0,0.9933,Negative
8423084,8423084,Kærlighedseksperten guider: Sådan dater du nu,"Hvordan bærer man sig ad, hvis man som single ...",2023-06-29 06:33:58,True,Det er ikke let at være på jagt efter kærlighe...,2021-01-24 20:04:44,"[8423320, 8423321, 8423306, 8427168, 8423319]",article_default,https://ekstrabladet.dk/sex_og_samliv/kaerligh...,...,[],"[Livsstil, Partnerskab]",565,[],sex_og_samliv,,,,0.866,Neutral
8054212,8054212,Henriette elsker sex i bilen,Kæresteparret Lars og Henriette ynder at dyrke...,2023-07-31 12:54:04,True,Kæresteparret Lars og Henriette er begge del a...,2020-04-25 18:55:11,"[8066573, 8066582, 8066571, 8066580, 8066574, ...",article_default,https://ekstrabladet.dk/sex_og_samliv/henriett...,...,"[LOC, ORG, ORG, ORG, LOC, LOC]","[Livsstil, Transportmiddel, Bil, Erotik]",565,[],sex_og_samliv,,,,0.8792,Neutral


## Small

In [9]:
# Build the train split of the "small" dataset variant.
small_train_split = EBNeRDSplit('train', "small")
# Print how many rows the behaviors table holds, then preview its last rows.
# NOTE(review): _behaviors is a private attribute of EBNeRDSplit; prefer a
# public accessor if the class offers one.
print(len(small_train_split._behaviors))
small_train_split._behaviors.tail()

232887


Unnamed: 0_level_0,impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
580099643,580099643,9769306,2023-05-18 10:01:05,121.0,100.0,3,"[9233208, 9771242, 9767697, 9514481, 9771065, ...",[9770886],2106715,False,,,,False,1416293,121,
580099644,580099644,9770882,2023-05-18 10:05:07,176.0,100.0,3,"[9771065, 9767697, 9770886, 9758882, 9709817, ...",[9769306],2106715,False,,,,False,1416293,148,100.0
580099645,580099645,9769306,2023-05-18 10:11:03,24.0,100.0,3,"[9771042, 9440508, 9486080, 9770997, 9120051, ...",[9771042],2106715,False,,,,False,1416293,4,
580100695,580100695,9771242,2023-05-18 10:00:08,5.0,100.0,1,"[9440508, 9142581, 9769917, 9767697, 9514481, ...",[9767697],2110744,False,,,,False,747086,75,100.0
580100697,580100697,9771242,2023-05-18 10:01:29,14.0,100.0,1,"[9233208, 8422665, 9769306, 9771042, 9514481, ...",[9771042],2110744,False,,,,False,747086,123,25.0


# Artifacts

There are some files with outputs of standard embedding models for all the articles: models like BERT, RoBERTa, Word2Vec, and some contrastive model. The word2vec file is not a word-to-embedding mapping, but an article-to-embedding mapping. I think it is a sum of all the embeddings of the words in the article.

## We can't really use this

But our model needs a word2vec model which actually maps words to embeddings. 

> Given a news title, we obtain the word embeddings based on word embedding dictionary pre-trained on large-scale corpus to incorporate initial word-level semantic information. We also convert entities into embeddings based on pre-trained entity embeddings to incorporate knowledge information in knowledge graphs to our model.

They do not really state which dictionary they use, I think. So let's find it in their code, or we can just choose and download a word2vec model. Let's find something on Hugging Face for this. Then we have two options: we can use the word2vec model as a dictionary, like the authors do, or, arguably better, train it together with our model!

## But we can use it as a starting point

Songga said a good first step is to not implement the Knowledge-aware News Encoder right away. That encoder takes in an article and outputs some embeddings, so she said we can start by using these precomputed article embeddings instead.

In [10]:
# Each precomputed embedding artifact lives in its own sub-folder of the data
# folder; build the full path to every parquet file.
bert_parquet_path = os.path.join(
    data_folder,
    "google_bert_base_multilingual_cased",
    'bert_base_multilingual_cased.parquet',
)
roberta_parquet_path = os.path.join(
    data_folder,
    "FacebookAI_xlm_roberta_base",
    'xlm_roberta_base.parquet',
)
word2vec_parquet_path = os.path.join(
    data_folder,
    "Ekstra_Bladet_word2vec",
    'document_vector.parquet',
)
contrastive_parquet_path = os.path.join(
    data_folder,
    "Ekstra_Bladet_contrastive_vector",
    'contrastive_vector.parquet',
)

In [11]:
# Read the four embedding artifacts in a fixed order (BERT, RoBERTa, word2vec,
# contrastive) and bind each resulting frame to its own name.
bert_parquet, roberta_parquet, word2vec_parquet, contrastive_parquet = [
    pd.read_parquet(parquet_path)
    for parquet_path in (
        bert_parquet_path,
        roberta_parquet_path,
        word2vec_parquet_path,
        contrastive_parquet_path,
    )
]

In [32]:
# Take the embedding stored in the first row once, report its shape and Python
# type, then preview the table itself.
bert_first_vector = bert_parquet['google-bert/bert-base-multilingual-cased'].iloc[0]
print(bert_first_vector.shape)
print(type(bert_first_vector))
bert_parquet.head()

(768,)
<class 'numpy.ndarray'>


Unnamed: 0,article_id,google-bert/bert-base-multilingual-cased
0,3000022,"[-0.35060593, 0.0034366532, -0.1756858, -0.088..."
1,3000063,"[-0.0034481985, 0.22765873, -0.19700234, 0.060..."
2,3000613,"[-0.03811903, -0.030120859, -0.6928362, 0.0801..."
3,3000700,"[-0.29374197, -0.07282147, -0.0926456, -0.0540..."
4,3000840,"[0.0019190352, -0.010633812, -0.49158585, 0.17..."


In [31]:
# Take the embedding stored in the first row once, report its shape and Python
# type, then preview the table itself.
roberta_first_vector = roberta_parquet['FacebookAI/xlm-roberta-base'].iloc[0]
print(roberta_first_vector.shape)
print(type(roberta_first_vector))
roberta_parquet.head()

(768,)
<class 'numpy.ndarray'>


Unnamed: 0,article_id,FacebookAI/xlm-roberta-base
0,3000022,"[0.102449246, 0.10114823, 0.056887403, 0.02293..."
1,3000063,"[0.10729711, 0.103072755, 0.054031033, -0.0292..."
2,3000613,"[0.12513922, 0.12462065, 0.061414186, -0.03694..."
3,3000700,"[0.10569707, 0.07633519, 0.0717533, 0.00109790..."
4,3000840,"[0.098174535, 0.114628576, 0.03703187, -0.0088..."


In [29]:
# Take the document vector stored in the first row once, report its shape and
# Python type, then preview the table itself.
word2vec_first_vector = word2vec_parquet['document_vector'].iloc[0]
print(word2vec_first_vector.shape)
print(type(word2vec_first_vector))
word2vec_parquet.head()

(300,)
<class 'numpy.ndarray'>


Unnamed: 0,article_id,document_vector
0,3000022,"[0.06542388, -0.047424573, 0.06384871, -0.0014..."
1,3000063,"[0.028815078, -0.00016637295, 0.055056807, 0.0..."
2,3000613,"[0.037971217, 0.03392251, 0.027297212, 0.01708..."
3,3000700,"[0.04652399, 0.0029133065, 0.06280604, -0.0051..."
4,3000840,"[0.014736942, 0.024067875, 0.0051865038, 0.041..."


In [28]:
# Take the contrastive vector stored in the first row once, report its shape
# and Python type, then preview the table itself.
contrastive_first_vector = contrastive_parquet['contrastive_vector'].iloc[0]
print(contrastive_first_vector.shape)
print(type(contrastive_first_vector))
contrastive_parquet.head()

(768,)
<class 'numpy.ndarray'>


Unnamed: 0,article_id,contrastive_vector
0,3000022,"[-0.012159083, 0.057096627, 0.018299146, -0.03..."
1,3000063,"[0.03448151, 0.033532683, 0.054597735, -0.0231..."
2,3000613,"[-0.01463833, 0.030934403, 0.036162928, 0.0394..."
3,3000700,"[-0.06416679, 0.004852634, 0.0132708335, -0.00..."
4,3000840,"[-0.01304012, 0.024513246, 0.031050924, 0.0123..."
