In [1]:
import polars as pl

In [2]:
from pathlib import Path


dpath = Path('../dataset')

dtype = 'large'
articles = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/articles.parquet')

behaviors_train = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/train/behaviors.parquet')
history_train = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/train/history.parquet')

behaviors_val = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/validation/behaviors.parquet')
history_val = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/validation/history.parquet')

In [3]:
df = history_train.select('impression_time_fixed', 'article_id_fixed').explode(['impression_time_fixed', 'article_id_fixed'])
df = df.with_columns(pl.col('impression_time_fixed').cast(pl.Date).alias('date'), 
                     pl.col('impression_time_fixed').dt.hour().alias('hour'),
                     pl.col('impression_time_fixed').dt.minute().alias('minute'))\
        .rename({'article_id_fixed': 'article_id'})\
        .drop('impression_time_fixed')
df.head(3)

article_id,date,hour,minute
i32,date,i8,i8
9735579,2023-04-28,6,16
9739888,2023-04-28,6,17
9739471,2023-04-28,6,18


In [4]:
ap = articles.select('article_id', 'ner_clusters')\
    .with_columns(pl.col('ner_clusters').list.eval(pl.element().str.strip_chars_start('\"')))\
    .with_columns(pl.col('ner_clusters').list.eval(pl.element().str.strip_chars(' ')))\
    .with_columns(pl.col('ner_clusters').list.eval(pl.element().str.to_lowercase()))\
    .with_columns(pl.col('ner_clusters').list.eval(pl.element().filter(pl.element().str.len_chars() > 0)))\
    .with_columns(pl.col('ner_clusters').list.unique())\
    .with_columns(pl.col('ner_clusters').list.sort())\
    .sort('article_id')\
    .set_sorted('article_id')
ap.head(2)


article_id,ner_clusters
i32,list[str]
3000022,"[""david gardner""]"
3000063,[]


In [5]:
ner_mappings = ap.select('article_id', 'ner_clusters')\
        .explode('ner_clusters') \
        .rename({'ner_clusters': 'ner'})\
        .unique('ner')\
        .drop_nulls()\
        .sort('ner')\
        .with_row_index()\
        .drop('article_id')\
        .rename({'index': 'ner_index'})
ner_mappings.head(2)

ner_index,ner
u32,str
0,"""#"""
1,"""# antwerpen"""


In [23]:
ner_id_map = {k:v for k,v in zip(ner_mappings['ner'], ner_mappings['ner_index'])}
ap_ind= ap.with_columns(
    pl.col('ner_clusters').list.eval(pl.element().replace(ner_id_map).cast(pl.Int32))
)
ap_ind.head(2)

article_id,ner_clusters
i32,list[i64]
3000022,[20966]
3000063,[]


In [6]:
df

article_id,date,hour,minute
i32,date,i8,i8
9735579,2023-04-28,6,16
9739888,2023-04-28,6,17
9739471,2023-04-28,6,18
9739864,2023-04-28,6,18
9738441,2023-04-28,6,20
9739883,2023-04-28,6,21
9739153,2023-04-28,6,21
9739844,2023-04-28,6,22
9739634,2023-04-28,6,22
9739802,2023-04-28,6,22


In [20]:
df = df.join(ap_ind, on='article_id', how='left')
df.head(3)

article_id,date,hour,minute,ner_clusters
i32,date,i8,i8,list[i32]
9738663,2023-04-27,10,17,"[915, 7645, … 37662]"
9738569,2023-04-27,10,18,"[8308, 8829, … 35645]"
9738663,2023-04-27,10,18,"[915, 7645, … 37662]"


In [22]:
most_common_ner_per_hour = df.explode('ner_clusters')\
    .rename({'ner_clusters': 'ner_index'})\
    .group_by('date', 'hour')\
    .agg(pl.col('ner_index').max())\
    .sort('date', 'hour')
most_common_ner_per_hour.head(3)

date,hour,ner_index
date,i8,i32
2023-04-27,7,43746
2023-04-27,8,43602
2023-04-27,9,43746


In [25]:
most_common_ner_per_day = df.explode('ner_clusters')\
    .rename({'ner_clusters': 'ner_index'})\
    .group_by('date')\
    .agg(pl.col('ner_index').max())\
    .sort('date')
most_common_ner_per_day.head(3)

date,ner_index
date,i32
2023-04-27,43772
2023-04-28,43772
2023-04-29,43767


In [26]:
most_common_ner_per_day.plot('date', 'ner_index', kind='line', title='Most common NER per day')

In [27]:
most_common_ner_per_hour.plot(x='date', y='ner_index', kind='line', title='Most common NER per hour')