In [6]:
import polars as pl
import gc

In [2]:
# Load the article table from the ebnerd_large dataset directory.
articles = pl.read_parquet(
    "/home/ubuntu/dataset/ebnerd_large/articles.parquet"
)

In [9]:
# Keep only the id and URL of the articles whose URL contains "/auto/magna".
# literal=True makes this a plain substring test; by default str.contains
# interprets its pattern as a regular expression, which is not needed here.
generated_articles = (
    articles
    .select(["article_id", "url"])
    .filter(pl.col("url").str.contains("/auto/magna", literal=True))
)
generated_articles

article_id,url
i32,str
9601214,"""https://ekstrabladet.dk/auto/m…"
9601617,"""https://ekstrabladet.dk/auto/m…"
9601648,"""https://ekstrabladet.dk/auto/m…"
9601779,"""https://ekstrabladet.dk/auto/m…"
9614274,"""https://ekstrabladet.dk/auto/m…"
…,…
9802625,"""https://ekstrabladet.dk/auto/m…"
9802650,"""https://ekstrabladet.dk/auto/m…"
9802878,"""https://ekstrabladet.dk/auto/m…"
9802918,"""https://ekstrabladet.dk/auto/m…"


In [5]:
# Load the training behaviors table from the ebnerd_large dataset directory.
behaviors_train = pl.read_parquet(
    "/home/ubuntu/dataset/ebnerd_large/train/behaviors.parquet"
)

In [13]:
# Flatten the in-view lists to one row per in-view article and flag, in
# `target`, whether that article appears in the row's clicked list.
# NOTE: this rebinds `behaviors_train`, so the cell only works once after the
# load cell; a second run fails because `article_ids_inview` is gone.
behaviors_train = (
    behaviors_train
    .select(["article_ids_inview", "article_ids_clicked"])
    .explode(["article_ids_inview"])
    .rename({"article_ids_inview": "article_id"})
    .with_columns(
        target=pl.col("article_ids_clicked").list.contains(pl.col("article_id"))
    )
    .drop("article_ids_clicked")
)

In [14]:
# Display the exploded training frame (columns: article_id, target).
behaviors_train

article_id,target
i32,bool
9482380,false
9775183,true
9744403,false
9775297,false
9774020,false
…,…
9233208,false
9769917,false
9767697,false
9770369,false


In [15]:
# Inner join: keep only the training rows whose article_id is present in
# generated_articles, bringing the matching url along.
(
    behaviors_train
    .join(generated_articles, on="article_id", how="inner")
)

article_id,target,url
i32,bool,str
9775754,false,"""https://ekstrabladet.dk/auto/m…"
9775754,false,"""https://ekstrabladet.dk/auto/m…"
9775754,false,"""https://ekstrabladet.dk/auto/m…"
9775754,false,"""https://ekstrabladet.dk/auto/m…"
9773257,false,"""https://ekstrabladet.dk/auto/m…"
…,…,…
9772010,false,"""https://ekstrabladet.dk/auto/m…"
9767376,false,"""https://ekstrabladet.dk/auto/m…"
9729475,true,"""https://ekstrabladet.dk/auto/m…"
9772470,false,"""https://ekstrabladet.dk/auto/m…"


In [27]:
# Load the validation behaviors table from the ebnerd_large dataset directory.
behaviors_val = pl.read_parquet(
    "/home/ubuntu/dataset/ebnerd_large/validation/behaviors.parquet"
)

In [28]:
# Flatten the in-view lists to one row per in-view article and flag, in
# `target`, whether that article appears in the row's clicked list.
# NOTE: this rebinds `behaviors_val`, so the cell only works once after the
# load cell; a second run fails because `article_ids_inview` is gone.
behaviors_val = (
    behaviors_val
    .select(["article_ids_inview", "article_ids_clicked"])
    .explode(["article_ids_inview"])
    .rename({"article_ids_inview": "article_id"})
    .with_columns(
        target=pl.col("article_ids_clicked").list.contains(pl.col("article_id"))
    )
    .drop("article_ids_clicked")
)

In [29]:
# Display the exploded validation frame (columns: article_id, target).
behaviors_val

article_id,target
i32,bool
9230405,false
9784793,false
9784803,false
9784275,false
9782726,false
…,…
9784138,false
9783850,false
9782836,false
9279095,false


In [30]:
# Inner join: keep only the validation rows whose article_id is present in
# generated_articles, bringing the matching url along.
(
    behaviors_val
    .join(generated_articles, on="article_id", how="inner")
)

article_id,target,url
i32,bool,str
9789890,false,"""https://ekstrabladet.dk/auto/m…"
9782485,false,"""https://ekstrabladet.dk/auto/m…"
9782485,false,"""https://ekstrabladet.dk/auto/m…"
9738052,false,"""https://ekstrabladet.dk/auto/m…"
9767273,false,"""https://ekstrabladet.dk/auto/m…"
…,…,…
9780917,false,"""https://ekstrabladet.dk/auto/m…"
9780917,false,"""https://ekstrabladet.dk/auto/m…"
9769604,false,"""https://ekstrabladet.dk/auto/m…"
9772421,false,"""https://ekstrabladet.dk/auto/m…"


In [23]:
# Load the test behaviors table from the ebnerd_testset directory and show it.
behaviors_test = pl.read_parquet(
    "/home/ubuntu/dataset/ebnerd_testset/test/behaviors.parquet"
)
behaviors_test

impression_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,is_beyond_accuracy
u32,datetime[μs],f32,f32,i8,list[i32],u32,bool,i8,i8,i8,bool,u32,bool
6451339,2023-06-05 15:02:49,8.0,,2,"[9796527, 7851321, … 9492777]",35982,false,,,,false,388,false
6451363,2023-06-05 15:03:56,20.0,,2,"[9798532, 9791602, … 9798958]",36012,false,,,,false,804,false
6451382,2023-06-05 15:25:53,9.0,,2,"[9798498, 9793856, … 9798724]",36162,false,,,,false,1528,false
6451383,2023-06-05 15:26:35,14.0,,2,"[9797419, 9798829, … 9798805]",36162,false,,,,false,1528,false
6451385,2023-06-05 15:26:14,8.0,,2,"[9785014, 9798958, … 9486080]",36162,false,,,,false,1528,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…
0,2023-06-01 07:00:01,0.0,,0,"[9793163, 9793069, … 9789545]",1589163,false,,,,false,0,true
0,2023-06-01 07:00:01,0.0,,0,"[9793163, 9793069, … 9789545]",1699456,false,,,,false,0,true
0,2023-06-01 07:00:01,0.0,,0,"[9793163, 9793069, … 9789545]",635479,false,,,,false,0,true
0,2023-06-01 07:00:01,0.0,,0,"[9793163, 9793069, … 9789545]",251030,false,,,,false,0,true


In [24]:
# Flatten the in-view lists to one row per (impression_id, in-view article).
# Unlike the train/validation cells, no `target` column is derived here.
# NOTE: this rebinds `behaviors_test`, so the cell only works once after the
# load cell; a second run fails because `article_ids_inview` is gone.
behaviors_test = (
    behaviors_test
    .select(["impression_id", "article_ids_inview"])
    .explode(["article_ids_inview"])
    .rename({"article_ids_inview": "article_id"})
)
behaviors_test

impression_id,article_id
u32,i32
6451339,9796527
6451339,7851321
6451339,9798805
6451339,9795150
6451339,9531110
…,…
0,9792362
0,9788041
0,9790135
0,9792408


In [26]:
# Inner join: keep only the test rows whose article_id is present in
# generated_articles, then drop the rows with impression_id == 0.
# NOTE(review): in the test preview the impression_id == 0 rows have
# is_beyond_accuracy = true -- presumably that is why they are excluded; confirm.
(
    behaviors_test
    .join(generated_articles, on="article_id", how="inner")
    .filter(pl.col("impression_id") != 0)
)

impression_id,article_id,url
u32,i32,str
9815875,9800965,"""https://ekstrabladet.dk/auto/m…"
9815877,9800965,"""https://ekstrabladet.dk/auto/m…"
9815880,9800965,"""https://ekstrabladet.dk/auto/m…"
9815883,9800965,"""https://ekstrabladet.dk/auto/m…"
11369842,9794680,"""https://ekstrabladet.dk/auto/m…"
…,…,…
532822873,9786010,"""https://ekstrabladet.dk/auto/m…"
532822873,9791788,"""https://ekstrabladet.dk/auto/m…"
532822873,9788406,"""https://ekstrabladet.dk/auto/m…"
532888599,9797195,"""https://ekstrabladet.dk/auto/m…"
