In [1]:
import polars as pl
from ebrec.utils._constants import * # a bunch of constant strings for column names

import os
from pathlib import Path


# Root of the merged EBNeRD data: <parent of the working directory>/data-merged/merged.
data_base = Path.cwd().parent / "data-merged" / "merged"

# Train/validation come from the same dataset folder. The demo set is active;
# uncomment the line below it to switch to the large set instead.
train_val_base = data_base / "1-ebnerd_demo_(20MB)"
# train_val_base = data_base / "3-ebnerd_large_(3.0GB)"
test_base = data_base / "5-ebnerd_testset_(1.5GB)"

# Fail before any scan is built, and name the directory that is missing
# (a single combined assert cannot tell the two cases apart).
assert train_val_base.exists(), f"train/val dataset directory not found: {train_val_base}"
assert test_base.exists(), f"test dataset directory not found: {test_base}"


"""
load user history
"""

# Lazily scan the behaviors and history parquet files of every split.
# Nothing is read from disk until a query on these frames is collected.
train_behaviors = pl.scan_parquet(train_val_base.joinpath("train", "behaviors.parquet"))
train_history = pl.scan_parquet(train_val_base.joinpath("train", "history.parquet"))

# Validation lives in the same dataset folder as train.
val_behavior = pl.scan_parquet(train_val_base.joinpath("validation", "behaviors.parquet"))
val_history = pl.scan_parquet(train_val_base.joinpath("validation", "history.parquet"))

# The test split comes from its own dataset folder.
test_behavior = pl.scan_parquet(test_base.joinpath("test", "behaviors.parquet"))
test_history = pl.scan_parquet(test_base.joinpath("test", "history.parquet"))


"""
merge history and behavior
"""

# For every split, behaviors and history must share exactly one column: the
# user id. Comparing the whole set of shared names (instead of indexing an
# arbitrary element of it) makes the check deterministic, and it reports the
# offending names rather than raising IndexError when nothing is shared.
# The inner join keeps only rows whose user id appears in both frames.

shared_cols = set(train_behaviors.columns) & set(train_history.columns)
assert shared_cols == {DEFAULT_USER_COL}, f"train: unexpected shared columns {sorted(shared_cols)}"
train_user: pl.LazyFrame = train_behaviors.join(train_history, on=DEFAULT_USER_COL, how="inner")

shared_cols = set(val_behavior.columns) & set(val_history.columns)
assert shared_cols == {DEFAULT_USER_COL}, f"val: unexpected shared columns {sorted(shared_cols)}"
val_user: pl.LazyFrame = val_behavior.join(val_history, on=DEFAULT_USER_COL, how="inner")

shared_cols = set(test_behavior.columns) & set(test_history.columns)
assert shared_cols == {DEFAULT_USER_COL}, f"test: unexpected shared columns {sorted(shared_cols)}"
test_user: pl.LazyFrame = test_behavior.join(test_history, on=DEFAULT_USER_COL, how="inner")


"""
load article content
"""

# Article metadata. Validation reuses the train frame (same object, not a copy);
# the test split has its own articles file.
train_articles: pl.LazyFrame = pl.scan_parquet(train_val_base.joinpath("articles.parquet"))
val_articles: pl.LazyFrame = train_articles
test_articles: pl.LazyFrame = pl.scan_parquet(test_base.joinpath("articles.parquet"))

# Article embedding tables, one lazy scan per embedding dataset folder.
articles_word2vec: pl.LazyFrame = pl.scan_parquet(
    data_base.joinpath("7-Ekstra-Bladet-word2vec_(133MB)", "document_vector.parquet")
)
articles_image_embeddings: pl.LazyFrame = pl.scan_parquet(
    data_base.joinpath("8-Ekstra_Bladet_image_embeddings_(372MB)", "image_embeddings.parquet")
)
articles_contrastive_vector: pl.LazyFrame = pl.scan_parquet(
    data_base.joinpath("9-Ekstra-Bladet-contrastive_vector_(341MB)", "contrastive_vector.parquet")
)
articles_bert_base_multilingual_cased: pl.LazyFrame = pl.scan_parquet(
    data_base.joinpath("10-google-bert-base-multilingual-cased_(344MB)", "bert_base_multilingual_cased.parquet")
)
articles_xlm_roberta_base: pl.LazyFrame = pl.scan_parquet(
    data_base.joinpath("11-FacebookAI-xlm-roberta-base_(341MB)", "xlm_roberta_base.parquet")
)

# check sparsity of cols

the dataset is massive.

every single column that we can drop will save us a lot of time and memory.

user data:

> drop `gender`, `postcode`, `age`, `scroll_percentage`

- a null `article_id` means that the impression happened on the front page rather than on an article page, so it is not a missing value.
- all user sets (train/val/test) have too few `gender`, `postcode`, `age`, `scroll_percentage` values for these columns to be useful.

articles data:

> don't drop anything

- the `total_pageviews`, `total_inviews`, `total_read_time` columns reflect the popularity of the article in the first 7 days after publication. they aren't missing values.
- only 88.59% of articles have `image_ids`, so fewer articles have image embeddings; this reduces completeness, not predictive power.

In [14]:
print("user train")
# Denominator: number of non-null user ids in the merged train frame.
total_train: int = train_user.select(DEFAULT_USER_COL).count().collect().item()
for column in train_user.columns:
    # One lazy query per column: count the values left after dropping nulls.
    filled: int = train_user.select(column).drop_nulls().count().collect().item()
    fill_ratio = filled / total_train
    if fill_ratio >= 1.0:
        continue  # fully populated columns are not reported
    # Columns that are less than half populated get flagged.
    severity = "CRITICAL" if fill_ratio < 0.5 else ""
    print(f"\t- {column}: {fill_ratio * 100:.2f}% {severity}")

user train
	- article_id: 29.89% CRITICAL
	- scroll_percentage: 29.34% CRITICAL
	- gender: 7.35% CRITICAL
	- postcode: 2.23% CRITICAL
	- age: 3.06% CRITICAL
	- next_read_time: 97.35% 
	- next_scroll_percentage: 88.72% 


In [15]:
print("user val")
# Denominator: number of non-null user ids in the merged validation frame.
total_val: int = val_user.select(DEFAULT_USER_COL).count().collect().item()
for column in val_user.columns:
    # One lazy query per column: count the values left after dropping nulls.
    filled: int = val_user.select(column).drop_nulls().count().collect().item()
    fill_ratio = filled / total_val
    if fill_ratio >= 1.0:
        continue  # fully populated columns are not reported
    # Columns that are less than half populated get flagged.
    severity = "CRITICAL" if fill_ratio < 0.5 else ""
    print(f"\t- {column}: {fill_ratio * 100:.2f}% {severity}")

user val
	- article_id: 29.02% CRITICAL
	- scroll_percentage: 28.65% CRITICAL
	- gender: 7.07% CRITICAL
	- postcode: 2.15% CRITICAL
	- age: 2.94% CRITICAL
	- next_read_time: 97.29% 
	- next_scroll_percentage: 88.05% 


In [16]:
print("user test")
# Denominator: number of non-null user ids in the merged test frame.
total_test: int = test_user.select(DEFAULT_USER_COL).count().collect().item()
for column in test_user.columns:
    # One lazy query per column: count the values left after dropping nulls.
    filled: int = test_user.select(column).drop_nulls().count().collect().item()
    fill_ratio = filled / total_test
    if fill_ratio >= 1.0:
        continue  # fully populated columns are not reported
    # Columns that are less than half populated get flagged.
    severity = "CRITICAL" if fill_ratio < 0.5 else ""
    print(f"\t- {column}: {fill_ratio * 100:.2f}% {severity}")

user test
	- scroll_percentage: 28.44% CRITICAL
	- gender: 7.07% CRITICAL
	- postcode: 2.15% CRITICAL
	- age: 2.94% CRITICAL


In [23]:
print("train/val articles")
# Denominator: number of non-null article ids in the train/val articles frame.
total_articles: int = train_articles.select(DEFAULT_ARTICLE_ID_COL).count().collect().item()
for column in train_articles.columns:
    # One lazy query per column: count the values left after dropping nulls.
    filled: int = train_articles.select(column).drop_nulls().count().collect().item()
    fill_ratio = filled / total_articles
    if fill_ratio >= 1.0:
        continue  # fully populated columns are not reported
    # Columns that are less than half populated get flagged.
    severity = "CRITICAL" if fill_ratio < 0.5 else ""
    print(f"\t- {column}: {fill_ratio * 100:.2f}% {severity}")

train/val articles
	- image_ids: 88.59% 
	- total_inviews: 14.61% CRITICAL
	- total_pageviews: 13.49% CRITICAL
	- total_read_time: 13.49% CRITICAL


In [24]:
print("test articles")
# Denominator: number of non-null article ids in the test articles frame.
# Note: this rebinds `total_articles`, which the train/val articles cell also sets.
total_articles: int = test_articles.select(DEFAULT_ARTICLE_ID_COL).count().collect().item()
for column in test_articles.columns:
    # One lazy query per column: count the values left after dropping nulls.
    filled: int = test_articles.select(column).drop_nulls().count().collect().item()
    fill_ratio = filled / total_articles
    if fill_ratio >= 1.0:
        continue  # fully populated columns are not reported
    # Columns that are less than half populated get flagged.
    severity = "CRITICAL" if fill_ratio < 0.5 else ""
    print(f"\t- {column}: {fill_ratio * 100:.2f}% {severity}")

test articles
	- image_ids: 88.59% 
	- total_inviews: 14.61% CRITICAL
	- total_pageviews: 13.49% CRITICAL
	- total_read_time: 13.49% CRITICAL


In [31]:
# Print the per-column non-null counts of every embedding table, so their
# coverage can be compared side by side.
embedding_tables = (
    ("word2vec", articles_word2vec),
    ("image embeddings", articles_image_embeddings),
    ("contrastive vector", articles_contrastive_vector),
    ("bert base multilingual cased", articles_bert_base_multilingual_cased),
    ("xlm roberta base", articles_xlm_roberta_base),
)
for label, table in embedding_tables:
    print(label, table.count().collect().to_dicts())

word2vec [{'article_id': 125541, 'document_vector': 125541}]
image embeddings [{'article_id': 102603, 'image_embedding': 102603}]
contrastive vector [{'article_id': 125541, 'contrastive_vector': 125541}]
bert base multilingual cased [{'article_id': 125541, 'google-bert/bert-base-multilingual-cased': 125541}]
xlm roberta base [{'article_id': 125541, 'FacebookAI/xlm-roberta-base': 125541}]
