# Hello

This notebook is an example of how to build a beyond-accuracy dataset and how one could create baselines for it.

In [None]:
# Switch the working directory to `../../src/`; the relative paths used further down
# (e.g. "../data/...") are resolved against the current working directory.
%cd ../../src/
# List the directory contents as a quick sanity check.
%ls  

In [3]:
import tensorflow as tf

# Look up the default GPU device name once; a falsy name is treated as "no GPU found".
gpu_device = tf.test.gpu_device_name()
if gpu_device:
    print('Default GPU Device: {}'.format(gpu_device))
else:
    print("Please install GPU version of TF")

Default GPU Device: /device:GPU:0


In [None]:
import sys

# Make modules under '/src' importable. The guard keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to `sys.path` each time.
if '/src' not in sys.path:
    sys.path.append('/src')

# Get started

## Dependencies

In [7]:
from pathlib import Path
import polars as pl
import numpy as np

from ebrec.utils._constants import *
from ebrec.evaluation.beyond_accuracy import (
    IntralistDiversity,
    Distribution,
    Serendipity,
    Sentiment,
    Coverage,
    Novelty,
)
from ebrec.utils._articles import create_sort_based_prediction_score
from ebrec.utils._behaviors import truncate_history
from ebrec.utils._polars import slice_join_dataframes
from ebrec.utils._python import (
    rank_predictions_by_score,
    write_submission_file,
    write_json_file,
    read_json_file,
)

## Set paths

In [8]:
# Dataset split to work on and the name of the folder that receives the artifacts.
DATASET_SIZE = "ebnerd_small"
ROOT_FOLDER = "evaluation_artifacts"

# Root folder of the chosen dataset split.
PATH = Path("../data") / DATASET_SIZE

# Articles are read from the LARGE dataset so that all article IDs are available
# for beyond-accuracy; only 154 article IDs are found in the demo.
ARTICLES_PATH = PATH.parent / "ebnerd_large"

# Folder for the dumped artifacts.
PATH_BEYOND_ACCURACY = PATH / ROOT_FOLDER
PATH_BEYOND_ACCURACY.mkdir(parents=True, exist_ok=True)

# Sub-folder for the baseline results.
PATH_BEYOND_ACCURACY_BASELINES = PATH_BEYOND_ACCURACY / "baselines"
PATH_BEYOND_ACCURACY_BASELINES.mkdir(parents=True, exist_ok=True)

### Output files

In [9]:
# File names of the metadata artifacts written into PATH_BEYOND_ACCURACY.
BEYOND_ACCURACY_HISTORY_DICT = "beyond_accuracy_history_dict.json"
BEYOND_ACCURACY_USERS_DICT = "beyond_accuracy_users_dict.json"
# NOTE(review): CANDIDATE_LIST is not referenced by any code cell visible in this notebook — confirm it is still needed.
CANDIDATE_LIST = "candidate_list.json"
ARTICLES_DICT = "articles_dict.json"
BEHAVIORS_TIMESTAMP_DICT = "behaviors_timestamp_dict.json"
# File names of the per-metric baseline results written into PATH_BEYOND_ACCURACY_BASELINES.
BASELINE_DIVERSITY = "intralist_diversity_npa.json"
BASELINE_SENTIMENT_SCORE = "sentiment_score_npa.json"
BASELINE_NOVELTY = "novelty_npa.json"
BASELINE_SERENDIPITY = "serendipity_npa.json"
BASELINE_COVERAGE = "coverage_npa.json"
BASELINE_DISTRIBUTION_CATEGORY = "distribution_category_npa.json"
BASELINE_DISTRIBUTION_SENTIMENT_LABEL = "distribution_sentiment_label_npa.json"
BASELINE_DISTRIBUTION_TOPICS = "distribution_topics_npa.json"

## Load dataset

In [None]:

# Read the validation behaviours eagerly (scan + collect).
df_validation = pl.scan_parquet(PATH.joinpath("validation", "behaviors.parquet")).collect()

# NOTE(review): `split_point` equals the full length, so this slice keeps every row.
split_point = len(df_validation)
df_behaviors = df_validation[:split_point]

# Round-trip the behaviours through pandas and back to polars.
# NOTE(review): presumably a leftover; the conversion may alter column dtypes — confirm before removing.
df_behaviors_pd = df_behaviors.to_pandas()
df_behaviors = pl.from_pandas(df_behaviors_pd)

# Articles and histories are only scanned lazily here; nothing is read until `.collect()`.
df_articles = pl.scan_parquet(ARTICLES_PATH.joinpath("articles.parquet"))
# NOTE(review): `df_history` is reassigned by an eager read in the "User Histories" cell.
df_history = pl.scan_parquet(PATH.joinpath("validation", "history.parquet")).select(
    DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL
)

# Make / Dump Metadata

## User meta data: Segments

In [11]:
# User meta-data columns to export, one value per behaviour row.
user_meta_columns = [
    DEFAULT_IS_SUBSCRIBER_COL,
    DEFAULT_IS_SSO_USER_COL,
    DEFAULT_POSTCODE_COL,
    DEFAULT_GENDER_COL,
    DEFAULT_AGE_COL,
]
df_users = df_behaviors.select(user_meta_columns)

# Column-oriented dictionary: column name -> list of values.
users_dict = {}
for column_name in df_users.columns:
    users_dict[column_name] = df_users[column_name].to_list()

# Store the user meta-data as JSON next to the other artifacts.
users_dict_path = PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_USERS_DICT)
write_json_file(users_dict, users_dict_path, verbose=True)

print(f"#rows: {df_users.shape[0]}")
print(df_users.head(3))

Writing JSON: '../downloads/ebnerd_small/evaluation_artifacts/beyond_accuracy_users_dict.json'
#rows: 244647
shape: (3, 5)
┌───────────────┬─────────────┬──────────┬────────┬──────┐
│ is_subscriber ┆ is_sso_user ┆ postcode ┆ gender ┆ age  │
│ ---           ┆ ---         ┆ ---      ┆ ---    ┆ ---  │
│ bool          ┆ bool        ┆ f64      ┆ f64    ┆ f64  │
╞═══════════════╪═════════════╪══════════╪════════╪══════╡
│ false         ┆ false       ┆ null     ┆ null   ┆ null │
│ false         ┆ false       ┆ null     ┆ null   ┆ null │
│ false         ┆ false       ┆ null     ┆ null   ┆ null │
└───────────────┴─────────────┴──────────┴────────┴──────┘


## User Histories

In [12]:
# History length handed to `truncate_history`.
HISTORY_SIZE = 20

# Read only the user ID and the article-ID history of the validation split.
history_path = PATH.joinpath("validation", "history.parquet")
df_history = pl.read_parquet(history_path).select(
    [DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL]
)

# Truncate the histories (no padding value, warnings disabled).
df_history_truncate = truncate_history(
    df_history,
    column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    history_size=HISTORY_SIZE,
    padding_value=None,
    enable_warning=False,
)

# Attach a history to every behaviour row via a left join on the user ID.
behavior_users = df_behaviors.select([DEFAULT_USER_COL])
df_user_histories = behavior_users.join(
    df_history_truncate, on=DEFAULT_USER_COL, how="left"
)

# Single-key dictionary holding the list of histories.
history_lists = df_user_histories[DEFAULT_HISTORY_ARTICLE_ID_COL].to_list()
user_history_dict = {DEFAULT_HISTORY_ARTICLE_ID_COL: history_lists}

# Store the histories as JSON.
write_json_file(
    user_history_dict,
    PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_HISTORY_DICT),
    verbose=True,
)

print(f"#rows: {df_user_histories.shape[0]}")
print(df_user_histories.head(5))

Writing JSON: '../downloads/ebnerd_small/evaluation_artifacts/beyond_accuracy_history_dict.json'
#rows: 244647
shape: (5, 2)
┌─────────┬───────────────────────────────┐
│ user_id ┆ article_id_fixed              │
│ ---     ┆ ---                           │
│ u32     ┆ list[i32]                     │
╞═════════╪═══════════════════════════════╡
│ 22548   ┆ [9773295, 9769504, … 9776929] │
│ 22548   ┆ [9773295, 9769504, … 9776929] │
│ 22548   ┆ [9773295, 9769504, … 9776929] │
│ 22548   ┆ [9773295, 9769504, … 9776929] │
│ 22548   ┆ [9773295, 9769504, … 9776929] │
└─────────┴───────────────────────────────┘


## Timestamp for Behaviors

Used for computing the AUC as a function of time.

In [13]:
# Impression timestamps cast to strings so they can be stored as JSON.
timestamp_as_text = pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).cast(pl.Utf8)
df_behaviors_timestamp = df_behaviors.select(timestamp_as_text)

# Single-key dictionary holding the list of timestamps.
timestamps = df_behaviors_timestamp[DEFAULT_IMPRESSION_TIMESTAMP_COL].to_list()
behaviors_timestamp_dict = {DEFAULT_IMPRESSION_TIMESTAMP_COL: timestamps}

# Store the timestamps as JSON.
write_json_file(
    behaviors_timestamp_dict,
    PATH_BEYOND_ACCURACY.joinpath(BEHAVIORS_TIMESTAMP_DICT),
    verbose=True,
)

print(f"#rows: {df_behaviors_timestamp.shape[0]}")
print(df_behaviors_timestamp.head(3))

Writing JSON: '../downloads/ebnerd_small/evaluation_artifacts/behaviors_timestamp_dict.json'
#rows: 244647
shape: (3, 1)
┌────────────────────────────┐
│ impression_time            │
│ ---                        │
│ str                        │
╞════════════════════════════╡
│ 2023-05-28 04:21:24.000000 │
│ 2023-05-28 04:31:48.000000 │
│ 2023-05-28 04:30:17.000000 │
└────────────────────────────┘


# Make Candidate lookup dict / Dump lookup dict

## Articles to include: *candidate-list* and *history-articles*

In [14]:
# Unique article IDs that occur in any of the (truncated) user histories.
history_article_id = (
    df_user_histories.lazy()
    .select(pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).explode().unique())
    .collect()[DEFAULT_HISTORY_ARTICLE_ID_COL]
    .to_list()
)
# The message used to end with a stray ")" after the count.
print(f"#history_article_id: {len(history_article_id)}")

#history_article_id: 4235)


Note that the different dataset sizes (*demo*, *small*, and *large*) each contain only a subset of the total article catalog. Hence, if you're using *demo*, not all of the articles in the candidate list may be in the dataset.

In [15]:
import zipfile
import ast

# Article IDs available in the articles file.
aids_in_split = (
    df_articles.select(DEFAULT_ARTICLE_ID_COL)
    .collect()[DEFAULT_ARTICLE_ID_COL]
    .to_list()
)
# Set copy for the membership tests below: testing every candidate ID against a
# list is quadratic, while a set lookup is constant time.
# Assumes the IDs are hashable scalars (ints) — TODO confirm against the input file.
aids_lookup = set(aids_in_split)

# Unpack the archive into the current working directory.
with zipfile.ZipFile('downloads/ba_npa_danish.zip', 'r') as zip_ref:
    zip_ref.extractall()

# For every line, the text up to the first space is skipped and the remainder is
# parsed as a Python literal whose items are appended to one flat list.
candidate_list = []
with open('ba_npa_danish.txt', 'r') as file:
    for line in file:
        line_data = ast.literal_eval(line.strip().split(' ', 1)[1])
        candidate_list.extend(line_data)

# Keep only IDs that exist in the articles file; order and repeats are left untouched.
history_article_id = [aid for aid in history_article_id if aid in aids_lookup]
candidate_list = [aid for aid in candidate_list if aid in aids_lookup]

article_ids = history_article_id + candidate_list
print(
    f"#articles: {len(article_ids)} (#history_article_id: {len(history_article_id)})"
)

#articles: 738176 (#history_article_id: 4235)


## Select articles that should be included in the lookup dictionary

In [16]:
# Timestamps are cast to strings (the rows are serialised to JSON later on).
timestamps_as_text = pl.col(
    DEFAULT_ARTICLE_MODIFIED_TIMESTAMP_COL,
    DEFAULT_ARTICLE_PUBLISHED_TIMESTAMP_COL,
).cast(pl.Utf8)

# Missing in-view / page-view counts are replaced by 1.
# (Original note: zeros might cause issues.)
view_counts_filled = pl.col(
    DEFAULT_TOTAL_INVIEWS_COL, DEFAULT_TOTAL_PAGEVIEWS_COL
).fill_null(1)

# Restrict the articles to the IDs gathered from histories and candidate lists.
df_lookup_articles = (
    df_articles.filter(pl.col(DEFAULT_ARTICLE_ID_COL).is_in(article_ids))
    .with_columns(timestamps_as_text)
    .with_columns(view_counts_filled)
    .collect()
)
print(f"df_lookup_articles shape: {df_lookup_articles.shape}")

df_lookup_articles shape: (5637, 21)


### Make normalized popularity scores

In [17]:
# Names of the two derived popularity columns.
DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MAX = (
    f"{DEFAULT_TOTAL_PAGEVIEWS_COL}_normalized_max"
)
DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX = (
    f"{DEFAULT_TOTAL_PAGEVIEWS_COL}_normalized_min_max"
)

# Observed page-view range and the target range of the min-max scaling.
MIN_X = df_lookup_articles[DEFAULT_TOTAL_PAGEVIEWS_COL].min()
MAX_X = df_lookup_articles[DEFAULT_TOTAL_PAGEVIEWS_COL].max()
MIN_RANGE = 1e-4
MAX_RANGE = 1.0

pageviews = pl.col(DEFAULT_TOTAL_PAGEVIEWS_COL)

# Simple max normalization: x / max(x)
normalized_max = (pageviews / pageviews.max()).alias(
    DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MAX
)

# Min-max normalization: (x_i - X_min) / (X_max - X_min) * (max_range - min_range) + min_range
normalized_min_max = (
    ((pageviews - MIN_X) / (MAX_X - MIN_X)) * (MAX_RANGE - MIN_RANGE) + MIN_RANGE
).alias(DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX)

# The two expressions are independent of each other, so they are added in one pass.
df_lookup_articles = df_lookup_articles.with_columns(
    normalized_max, normalized_min_max
)

df_lookup_articles.select(
    DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MAX,
    DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX,
).describe()

statistic,total_pageviews_normalized_max,total_pageviews_normalized_min_max
str,f64,f64
"""count""",5637.0,5637.0
"""null_count""",0.0,0.0
"""mean""",0.034458,0.034554
"""std""",0.039572,0.039568
"""min""",6.1059e-07,0.0001
"""25%""",6.1059e-07,0.0001
"""50%""",0.026185,0.026281
"""75%""",0.052479,0.052573
"""max""",1.0,1.0


## Add embeddings representations

In [18]:
# => Embeddings:
# Embedding names; these two are declared but not used elsewhere in this cell.
BERT_VECTOR = "bert_base_multilingual_cased"
ROBERTA_VECTOR = "xlm_roberta_base"

# Embedding names used in this cell; each one is also the stem of the parquet file that is loaded.
CONTRASTIVE_VECTOR = "contrastive_vector"
DOCUMENT_VECTOR = "document_vector"


def load_join_embeddings(df: pl.DataFrame, emb_path: Path) -> pl.DataFrame:
    """Left-join the embedding file at ``emb_path`` onto ``df`` by article ID.

    Args:
        df: Frame containing a ``DEFAULT_ARTICLE_ID_COL`` column.
        emb_path: Location of the embedding parquet file, relative to
            ``PATH.parent`` (the callers in this notebook pass a ``str``).

    Returns:
        ``df`` with the columns of the embedding file added. Because the join is a
        left join, articles without a matching embedding row get null values.
    """
    # Pass the IDs as a plain list, the same way the articles filter in this
    # notebook calls `is_in`, instead of handing it a one-column DataFrame and
    # relying on implicit coercion.
    wanted_article_ids = df[DEFAULT_ARTICLE_ID_COL].to_list()
    embeddings = (
        pl.scan_parquet(PATH.parent.joinpath(emb_path))
        .filter(pl.col(DEFAULT_ARTICLE_ID_COL).is_in(wanted_article_ids))
        .collect()
    )
    return df.join(embeddings, on=DEFAULT_ARTICLE_ID_COL, how="left")


# Attach the contrastive vectors first, then the word2vec document vectors.
articles_with_contrastive = load_join_embeddings(
    df_lookup_articles,
    emb_path=f"embeddings/Ekstra_Bladet_contrastive_vector/{CONTRASTIVE_VECTOR}.parquet",
)
df_lookup_articles = load_join_embeddings(
    articles_with_contrastive,
    emb_path=f"embeddings/Ekstra_Bladet_word2vec/{DOCUMENT_VECTOR}.parquet",
)
print(f"#rows: {df_lookup_articles.shape[0]}")
df_lookup_articles.head(2)

#rows: 5637


article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label,total_pageviews_normalized_max,total_pageviews_normalized_min_max,contrastive_vector,document_vector
i32,str,str,str,bool,str,str,list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str,f64,f64,list[f32],list[f32]
3001353,"""Natascha var i…","""Politiet frygt…","""2023-06-29 06:…",False,"""Sagen om den ø…","""2006-08-31 08:…",[3150850],"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",1,1,,0.9955,"""Negative""",6.1059e-07,0.0001,"[0.014536, 0.013818, … 0.017486]","[0.055219, 0.011371, … 0.007982]"
3033563,"""Kniven for str…","""I aftenens udg…","""2023-06-29 06:…",False,"""Når man ser fj…","""2007-03-27 10:…","[3005524, 3005525]","""article_defaul…","""https://ekstra…",[],[],"[""Livsstil"", ""Underholdning"", … ""Mad og drikke""]",414,"[433, 436]","""underholdning""",1,1,,0.9371,"""Neutral""",6.1059e-07,0.0001,"[-0.00684, 0.011205, … 0.019961]","[0.050432, -0.022492, … 0.037712]"


## Convert to lookup dictionary:

In [19]:
# Lookup keyed by article ID. The IDs are stringified up front because all
# dictionary keys become strings when an object is serialised to JSON.
articles_dict = {
    str(row[DEFAULT_ARTICLE_ID_COL]): row
    for row in df_lookup_articles.iter_rows(named=True)
}

# Store the lookup as JSON.
articles_dict_path = PATH_BEYOND_ACCURACY.joinpath(ARTICLES_DICT)
write_json_file(articles_dict, articles_dict_path, verbose=True)
print(f"#articles: {len(articles_dict)}")

Writing JSON: '../downloads/ebnerd_small/evaluation_artifacts/articles_dict.json'
#articles: 5637


# Create Baselines

Make a couple *Baselines* based on the candidate-list:
1. @EditorialPicks: We approximate this based on the number of **inviews** an article has received. Ekstra Bladet is front-page driven, meaning that if an article has a lot of inview impressions (i.e., it has been seen a lot), we believe the editors have given it top priority. This is static (it does not change for our *candidate_list*), i.e., the computation is done once.
2. @Popular: We approximate this based on the number of **clicks** an article has received. This is static (it does not change for our *candidate_list*), i.e., the computation is done once.
3. @Random: A simple but important baseline. We simply pick a set of *top-n* articles from the *candidate-list* and repeat this multiple times.
4. @Dissimilarity / Similarity (will come later): Select the top-n articles that are the most similar / dissimilar.
5. @Newest: Simply pick the most recently released articles. We do see news sites where the top banner is *Newest released*. We include it, but note that this is very sensitive and might not be meaningful.

### Load the artifacts

In [20]:
def n_items(d) -> int:
    """Return the length of the value stored under the first key of ``d``."""
    keys = list(d)
    first_key = keys[0]
    return len(d[first_key])


# Reload the dumped artifacts from disk and report their sizes.

# => Impression timestamps
behaviors_timestamp_dict = read_json_file(
    PATH_BEYOND_ACCURACY.joinpath(BEHAVIORS_TIMESTAMP_DICT)
)
print(f"#behaviors_timestamp_dict: {n_items(behaviors_timestamp_dict)}")

# => User histories
history_dict = read_json_file(
    PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_HISTORY_DICT)
)
print(
    f"#history_dict: {n_items(history_dict)}\n history_dict.keys(): {history_dict.keys()}"
)

# => User meta-data
users_dict = read_json_file(PATH_BEYOND_ACCURACY.joinpath(BEYOND_ACCURACY_USERS_DICT))
print(f"#users_dict {n_items(users_dict)}\n users_dict.keys(): {users_dict.keys()}")

# => `user_history_dict` comes from the same file as `history_dict`, so the parsed
# object is reused instead of reading and parsing the JSON a second time.
user_history_dict = history_dict
# The label previously read "users_dict.keys()" although the keys shown belong to
# `user_history_dict`.
print(
    f"#user_history_dict {n_items(user_history_dict)}\n user_history_dict.keys(): {user_history_dict.keys()}"
)

# => Article lookup; the keys are converted back from strings to integer article IDs.
articles_dict = {
    int(key): val
    for key, val in read_json_file(PATH_BEYOND_ACCURACY.joinpath(ARTICLES_DICT)).items()
}
aid_keys = articles_dict[list(articles_dict)[0]].keys()
print(f"#articles_dict: {len(articles_dict)}\n articles_dict[ID].keys(): {aid_keys}")


#behaviors_timestamp_dict: 244647
#history_dict: 244647
 history_dict.keys(): dict_keys(['article_id_fixed'])
#users_dict 244647
 users_dict.keys(): dict_keys(['is_subscriber', 'is_sso_user', 'postcode', 'gender', 'age'])
#user_history_dict 244647
 users_dict.keys(): dict_keys(['article_id_fixed'])
#articles_dict: 5637
 articles_dict[ID].keys(): dict_keys(['article_id', 'title', 'subtitle', 'last_modified_time', 'premium', 'body', 'published_time', 'image_ids', 'article_type', 'url', 'ner_clusters', 'entity_groups', 'topics', 'category', 'subcategory', 'category_str', 'total_inviews', 'total_pageviews', 'total_read_time', 'sentiment_score', 'sentiment_label', 'total_pageviews_normalized_max', 'total_pageviews_normalized_min_max', 'contrastive_vector', 'document_vector'])


'\ncandidate_list = [\n    id\n    for id in read_json_file(PATH_BEYOND_ACCURACY.joinpath(CANDIDATE_LIST))\n    if id in list(articles_dict)\n]\nprint(f"#candidate_list: {len(candidate_list)}")\n\ndf_candidate_articles = df_lookup_articles.filter(\n    pl.col(DEFAULT_ARTICLE_ID_COL).is_in(candidate_list)\n)\nprint(f"#candidate-articles (df): {df_candidate_articles.shape[0]}")\n'

## Make Ranked Candidate lists

## Init Metrics

In [24]:
# Instantiate the beyond-accuracy metrics imported from `ebrec.evaluation.beyond_accuracy`.
# Each object is invoked like a function in the metric cells below.
intralist_diversity = IntralistDiversity()
distribution = Distribution()
serendipity = Serendipity()
sentiment = Sentiment()
coverage = Coverage()
novelty = Novelty()

## Setting Baselines (and your model)

In [21]:
# Number of behaviour rows; used as the number of candidate lists to draw.
RANDOM_ITER = df_behaviors.select(DEFAULT_INVIEW_ARTICLES_COL).shape[0]
# Row width used when candidates are tiled and reshaped in the serendipity cell.
TOP_N = 3
# Seed NumPy's global random state.
np.random.seed(123)

# List of histories loaded from the history artifact.
user_history = user_history_dict[DEFAULT_HISTORY_ARTICLE_ID_COL]

print(f"#random-iterations: {RANDOM_ITER}")
print(f"Top@{TOP_N} ranked articles")

#random-iterations: 244647
Top@5 ranked articles


### Your Model
Try to add your model's prediction of the candidate list. In this notebook we just take a random sample.

In [None]:
import zipfile
import ast

# Unpack the archive into the current working directory.
with zipfile.ZipFile('downloads/ba_npa_danish.zip', 'r') as zip_ref:
    zip_ref.extractall()

candidates_name_pairs = []

# For every line, the text up to the first space is skipped and the remainder is
# parsed as a Python literal.
with open('ba_npa_danish.txt', 'r') as file:
    candidate_list_nrms = [
        ast.literal_eval(line.strip().split(' ', 1)[1]) for line in file
    ]

print(candidate_list_nrms[0])

# Draw RANDOM_ITER parsed lists independently at random (repeats are possible).
n_lists = len(candidate_list_nrms)
candidates_nrms = [
    np.array(candidate_list_nrms[np.random.choice(n_lists)])
    for _ in range(RANDOM_ITER)
]

candidates_name_pairs.append([candidates_nrms, "npa ba"])


## User-level

### Intralist-Diversity

In [25]:
# The "name" column labels the two summary rows of every result column.
intralist_diversity_dict = {"name": ["mean", "std"]}

for candidates, list_name in candidates_name_pairs:
    column_name = f"{list_name}_{intralist_diversity.name}"
    scores = intralist_diversity(
        candidates, lookup_dict=articles_dict, lookup_key=CONTRASTIVE_VECTOR
    )
    intralist_diversity_dict[column_name] = [scores.mean(), scores.std()]

# Store the summary and display it as a table.
baseline_path = PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_DIVERSITY)
write_json_file(intralist_diversity_dict, baseline_path, verbose=True)

pl.DataFrame(intralist_diversity_dict)

Writing JSON: '../downloads/ebnerd_small/evaluation_artifacts/baselines/intralist_diversity_npa.json'


name,random_2_intralist_diversity
str,f64
"""mean""",0.771758
"""std""",0.128911


### Sentiment

In [26]:
# The "name" column labels the two summary rows of every result column.
sentiment_dict = {"name": ["mean", "std"]}

for candidates, list_name in candidates_name_pairs:
    column_name = f"{list_name}_{sentiment.name}"
    scores = sentiment(
        candidates, lookup_dict=articles_dict, lookup_key=DEFAULT_SENTIMENT_SCORE_COL
    )
    sentiment_dict[column_name] = [scores.mean(), scores.std()]

# Store the summary and display it as a table.
baseline_path = PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_SENTIMENT_SCORE)
write_json_file(sentiment_dict, baseline_path, verbose=True)

pl.DataFrame(sentiment_dict)

Writing JSON: '../downloads/ebnerd_small/evaluation_artifacts/baselines/sentiment_score_npa.json'


name,random_2_sentiment
str,f64
"""mean""",0.837702
"""std""",0.091702


### Novelty

In [27]:
# The "name" column labels the two summary rows of every result column.
novelty_dict = {"name": ["mean", "std"]}

for candidates, list_name in candidates_name_pairs:
    column_name = f"{list_name}_{novelty.name}"
    scores = novelty(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=DEFAULT_TOTAL_PAGEVIEWS_COL_NORMALIZED_MIN_MAX,
    )
    novelty_dict[column_name] = [scores.mean(), scores.std()]

# Store the summary and display it as a table.
baseline_path = PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_NOVELTY)
write_json_file(novelty_dict, baseline_path, verbose=True)

pl.DataFrame(novelty_dict)

Writing JSON: '../downloads/ebnerd_small/evaluation_artifacts/baselines/novelty_npa.json'


name,random_2_novelty
str,f64
"""mean""",4.668599
"""std""",1.118288


### Serendipity
Serendipity is computed using the user's history: it is based on the similarity between the recommendations and the items the user has browsed.

In [28]:
# The "name" column labels the two summary rows of every result column.
serendipity_dict = {"name": ["mean", "std"]}

for candidates, list_name in candidates_name_pairs:
    # A single candidate list is repeated once per user history and reshaped
    # into rows of TOP_N items.
    if len(candidates) == 1:
        candidates = np.tile(candidates, len(user_history)).reshape(-1, TOP_N)
    column_name = f"{list_name}_{serendipity.name}"
    scores = serendipity(
        candidates,
        H=user_history,
        lookup_dict=articles_dict,
        lookup_key=CONTRASTIVE_VECTOR,
    )
    serendipity_dict[column_name] = [scores.mean(), scores.std()]

# Store the summary and display it as a table.
baseline_path = PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_SERENDIPITY)
write_json_file(serendipity_dict, baseline_path, verbose=True)

pl.DataFrame(serendipity_dict)

Writing JSON: '../downloads/ebnerd_small/evaluation_artifacts/baselines/serendipity_npa.json'


name,random_2_serendipity
str,f64
"""mean""",0.794444
"""std""",0.045322


## Model-level

### Coverage

In [29]:
# The "name" column labels the two values stored for every candidate set.
coverage_dict = {"name": ["count", "fraction"]}

for candidates, list_name in candidates_name_pairs:
    column_name = f"{list_name}_{coverage.name}"
    # The recommended items are compared against the full candidate list.
    coverage_dict[column_name] = coverage(candidates, candidate_list)

# Store the result and display it as a table.
baseline_path = PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_COVERAGE)
write_json_file(coverage_dict, baseline_path, verbose=True)

pl.DataFrame(coverage_dict)

Writing JSON: '../downloads/ebnerd_small/evaluation_artifacts/baselines/coverage_npa.json'


name,random_2_coverage
str,f64
"""count""",2511.0
"""fraction""",0.928968


### Distribution - Category

#### Distribution helper function

In [30]:
def concat_distribution_dict(dict_: dict) -> dict:
    """Stack the distributions in ``dict_`` into one column-oriented dictionary.

    Every value of ``dict_`` is turned into a ``pl.DataFrame`` and the frames are
    concatenated with ``how="diagonal"``. The keys of ``dict_`` are stored in a
    leading ``"name"`` column.

    Returns:
        Mapping from column name to the list of values in that column.
    """
    frames = [pl.DataFrame(val) for val in dict_.values()]
    stacked = pl.concat(frames, how="diagonal")
    # The row index reserves the first column, which is then overwritten with the labels.
    labelled = stacked.with_row_index(name="name").with_columns(
        pl.Series(dict_.keys()).alias("name")
    )
    columns = labelled.to_dict()
    return {column: series.to_list() for column, series in columns.items()}

In [31]:
# Article attribute whose distribution is computed.
COLUMN = DEFAULT_CATEGORY_STR_COL
# Label every entry with the distribution metric's own name. The key was previously
# built from `novelty.name`, which mislabelled these results as novelty.
# NOTE(review): assumes `Distribution` exposes `.name` like the other metric objects — confirm.
distribution_dict = {
    f"{list_name}_{distribution.name}": distribution(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=COLUMN,
    )
    for candidates, list_name in candidates_name_pairs
}
# => One row per candidate set, one column per category value.
distribution_category_dict = concat_distribution_dict(distribution_dict)

write_json_file(
    distribution_category_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_DISTRIBUTION_CATEGORY),
    verbose=True,
)

pl.DataFrame(distribution_category_dict)

Writing JSON: '../downloads/ebnerd_small/evaluation_artifacts/baselines/distribution_category_npa.json'


name,underholdning,nyheder,krimi,sport,nationen,penge,opinionen,forbrug,musik,sex_og_samliv,biler,ferie,side9,haandvaerkeren,plus,podcast,play,auto
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""random_2_novel…",0.144942,0.269121,0.142941,0.290616,0.039507,0.036579,0.004277,0.02797,0.037244,0.005134,0.000253,0.000542,0.00077,7.4e-05,3e-06,5e-06,1.4e-05,8e-06


### Distribution - Sentiment

In [32]:
# Article attribute whose distribution is computed.
COLUMN = DEFAULT_SENTIMENT_LABEL_COL
# Label every entry with the distribution metric's own name. The key was previously
# built from `novelty.name`, which mislabelled these results as novelty.
# NOTE(review): assumes `Distribution` exposes `.name` like the other metric objects — confirm.
distribution_dict = {
    f"{list_name}_{distribution.name}": distribution(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=COLUMN,
    )
    for candidates, list_name in candidates_name_pairs
}
# => One row per candidate set, one column per sentiment label.
distribution_sentiment_dict = concat_distribution_dict(distribution_dict)

write_json_file(
    distribution_sentiment_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_DISTRIBUTION_SENTIMENT_LABEL),
    verbose=True,
)

pl.DataFrame(distribution_sentiment_dict)

Writing JSON: '../downloads/ebnerd_small/evaluation_artifacts/baselines/distribution_sentiment_label_npa.json'


name,Negative,Positive,Neutral
str,f64,f64,f64
"""random_2_novel…",0.484574,0.237106,0.278319


### Distribution - Topics

In [33]:
# Article attribute whose distribution is computed.
COLUMN = DEFAULT_TOPICS_COL
# Label every entry with the distribution metric's own name. The key was previously
# built from `novelty.name`, which mislabelled these results as novelty.
# NOTE(review): assumes `Distribution` exposes `.name` like the other metric objects — confirm.
distribution_dict = {
    f"{list_name}_{distribution.name}": distribution(
        candidates,
        lookup_dict=articles_dict,
        lookup_key=COLUMN,
    )
    for candidates, list_name in candidates_name_pairs
}
# => One row per candidate set, one column per topic.
distribution_topics_dict = concat_distribution_dict(distribution_dict)

write_json_file(
    distribution_topics_dict,
    PATH_BEYOND_ACCURACY_BASELINES.joinpath(BASELINE_DISTRIBUTION_TOPICS),
    verbose=True,
)

pl.DataFrame(distribution_topics_dict)

Writing JSON: '../downloads/ebnerd_small/evaluation_artifacts/baselines/distribution_topics_npa.json'


name,Kendt,Livsstil,Underholdning,Økonomi,Partnerskab,Samfund,Bæredygtighed og klima,Politik,International politik,National politik,Kriminalitet,Personfarlig kriminalitet,Katastrofe,Mindre ulykke,Transportmiddel,Bil,Sport,Ketcher- og batsport,Erhverv,Privat virksomhed,Værdier,Konflikt og krig,Større transportmiddel,Begivenhed,Fodbold,Sportsbegivenhed,Film og tv,Uddannelse,Ungdomsuddannelse,Makro,Bolig,Køb og salg,Tendenser,Krop og velvære,Kosmetisk behandling,Væbnet konflikt,…,Dyr,Videnskab,Naturvidenskab,Underholdningsbegivenhed,Kultur,Musik og lyd,Håndbold,Ansættelsesforhold,Offentlig instans,Terror,Reality,Offentlig transport,Personlig begivenhed,Mikro,Erotik,Mindre transportmiddel,Mad og drikke,Byliv,Cykling,Forbrugerelektronik,Rejse,Bedrageri,Større katastrofe,Vejr,Fritid,Familieliv,Religion,Kunstig intelligens og software,Bandekriminalitet,Museum og seværdighed,Samfundsvidenskab og humaniora,Grundskole,Videregående uddannelse,Kunst,Udlejning,Litteratur,Mærkedag
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""random_2_novel…",0.08828,0.032686,0.046535,0.034061,0.009769,0.017457,0.00195,0.042658,0.028608,0.017274,0.038093,0.017434,0.022589,0.019939,0.023828,0.006965,0.077196,0.011535,0.037605,0.017933,0.007457,0.021155,0.011182,0.071942,0.043531,0.054003,0.018258,0.001601,0.000808,0.013839,0.010184,0.00579,0.000796,0.003202,0.000636,0.009647,…,0.00411,0.003559,0.002524,0.01484,0.006402,0.012487,0.001844,0.010183,0.001454,0.000596,0.004,0.00233,0.003279,0.005134,0.005929,0.002447,0.003511,0.001518,0.003952,0.001264,0.000515,0.001634,0.00279,0.001753,0.000542,0.002327,0.000618,0.000387,0.000978,0.000391,0.000299,0.000507,0.000265,0.00035,8.9e-05,1.3e-05,2e-06
