In [3]:
import numpy as np
import polars as pl
from tqdm import tqdm
from polimi.utils._polars import reduce_polars_df_memory_size

In [4]:
from pathlib import Path


dpath = Path('../../dataset')
emb_dir = dpath / 'embeddings'
dtype = 'small'
articles = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/articles.parquet')

behaviors_train = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/train/behaviors.parquet')
history_train = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/train/history.parquet')

# Test

In [8]:
embeddings = pl.read_parquet(emb_dir / 'google_bert_base_multilingual_cased' / 'bert_base_multilingual_cased.parquet').sort('article_id')
embeddings.columns = ['article_id', 'embedding']
embeddings = embeddings.with_row_index()
embeddings.head(2)

index,article_id,embedding
u32,i32,list[f32]
0,3000022,"[-0.350606, 0.003437, … 0.001947]"
1,3000063,"[-0.003448, 0.227659, … -0.057015]"


In [9]:
all_zero_embeddings = embeddings.with_columns(pl.col('embedding').list.eval(pl.element() == 0.0).list.all().alias('check'))
are_all_zero_embeddings_present = len(all_zero_embeddings.filter(pl.col('check') == True)) > 0
are_all_zero_embeddings_present

False

In [10]:
m_non_norm = np.array([np.array(x) for x in embeddings['embedding'].to_numpy()])
row_norms = np.linalg.norm(m_non_norm, axis=1, keepdims=True)
m = m_non_norm / row_norms
m.shape

(125541, 768)

# Build feature

In [11]:
article_emb_mapping = embeddings.select('index', 'article_id')
article_emb_mapping.head(1)

index,article_id
u32,i32
0,3000022


In [12]:
history_m = history_train.select('user_id', pl.col('article_id_fixed').list.eval(pl.element().replace(article_emb_mapping['article_id'], article_emb_mapping['index'], default=None).drop_nulls())).with_row_index('user_index')
user_history_map = history_m.select('user_id', 'user_index')
history_m = history_m['article_id_fixed'].to_numpy()
history_m.shape

(15143,)

In [13]:
df = behaviors_train.select('impression_id', 'user_id', pl.col('article_ids_inview').alias('article'))\
    .join(user_history_map, on='user_id')\
    .with_columns(
        pl.col('article').list.eval(pl.element().replace(article_emb_mapping['article_id'], article_emb_mapping['index'], default=None)).name.suffix('_index'),
    ).drop('impression_time_fixed', 'scroll_percentage_fixed', 'read_time_fixed')

df = reduce_polars_df_memory_size(df)
df.head(2)

Memory usage of dataframe is 25.95 MB
Memory usage after optimization is: 25.95 MB
Decreased by 0.0%


impression_id,user_id,article,user_index,article_index
u32,u32,list[i32],u32,list[u32]
149474,139836,"[9778623, 9778682, … 9778728]",11894,"[123178, 123188, … 123195]"
150528,143471,"[9778718, 9778728, … 9778682]",7016,"[123192, 123195, … 123188]"


In [14]:
scores_df = pl.concat([
    slice.explode(['article_index', 'article']).with_columns(scores = np.dot(
        m[slice['article_index'].explode().to_numpy()], 
        m[history_m[key[0]]].T))\
    .group_by(['impression_id', 'user_id', 'user_index'])\
    .agg(pl.all())
    for key, slice in tqdm(df[:1000].partition_by(by=['user_index'], as_dict=True).items(), total=df['user_index'].n_unique())
]).drop('article_index')
scores_df

  3%|▎         | 466/15143 [00:01<00:53, 274.50it/s]


impression_id,user_id,user_index,article,scores
u32,u32,u32,list[i32],list[list[f32]]
149474,139836,11894,"[9778623, 9778682, … 9778728]","[[0.988827, 0.984725, … 0.954573], [0.982375, 0.988658, … 0.955571], … [0.976225, 0.973736, … 0.937987]]"
150528,143471,7016,"[9778718, 9778728, … 9778682]","[[0.9813, 0.945006, … 0.960722], [0.972239, 0.936991, … 0.948615], … [0.987887, 0.95329, … 0.961936]]"
153068,151570,7074,"[9778657, 9778669, … 9778682]","[[0.984971, 0.941774, … 0.982133], [0.941257, 0.945315, … 0.940903], … [0.987914, 0.937185, … 0.975545]]"
153071,151570,7074,"[9777492, 9774568, … 9775990]","[[0.990505, 0.944563, … 0.982758], [0.813731, 0.847427, … 0.828457], … [0.951217, 0.913168, … 0.952556]]"
153078,151570,7074,"[9778021, 9778627, … 7213923]","[[0.98114, 0.928685, … 0.981403], [0.963864, 0.952563, … 0.953528], … [0.988723, 0.938198, … 0.976789]]"
…,…,…,…,…
2433256,1606050,14460,"[9483850, 9779648, … 9779777]","[[0.983992, 0.983894, … 0.984721], [0.98539, 0.976882, … 0.981156], … [0.981787, 0.982506, … 0.981081]]"
2433248,1606050,14460,"[9552181, 9779263, … 9547869]","[[0.987949, 0.98747, … 0.987622], [0.988494, 0.983613, … 0.982609], … [0.985486, 0.982646, … 0.985924]]"
2435848,1692081,10750,"[9779263, 9779205, … 9779577]","[[0.946211, 0.977764, … 0.984172], [0.942077, 0.969015, … 0.983493], … [0.944518, 0.981686, … 0.988128]]"
2435885,1695195,10254,"[9658252, 9569934, … 9775885]","[[0.942331, 0.990499, … 0.992581], [0.941311, 0.988905, … 0.984904], … [0.943828, 0.991274, … 0.993161]]"


In [23]:
scores_df = scores_df.with_columns(
    pl.col('scores').list.eval(pl.element().list.mean()).name.suffix('_mean'),
    pl.col('scores').list.eval(pl.element().list.max()).name.suffix('_max'),
    pl.col('scores').list.eval(pl.element().list.max()).name.suffix('_min'),
    pl.col('scores').list.eval(pl.element().list.std()).name.suffix('_std'),
)
scores_df.head(2)

impression_id,user_id,user_index,article,scores,scores_mean,scores_max,scores_min,scores_std
u32,u32,u32,list[i32],list[list[f32]],list[f32],list[f32],list[f32],list[f32]
149474,139836,11894,"[9778623, 9778682, … 9778728]","[[0.988827, 0.984725, … 0.954573], [0.982375, 0.988658, … 0.955571], … [0.976225, 0.973736, … 0.937987]]","[0.979383, 0.977003, … 0.968955]","[0.988899, 0.988986, … 0.980122]","[0.988899, 0.988986, … 0.980122]","[0.015714, 0.016969, … 0.016606]"
150528,143471,7016,"[9778718, 9778728, … 9778682]","[[0.9813, 0.945006, … 0.960722], [0.972239, 0.936991, … 0.948615], … [0.987887, 0.95329, … 0.961936]]","[0.971627, 0.963866, … 0.972263]","[0.992518, 0.989399, … 0.99187]","[0.992518, 0.989399, … 0.99187]","[0.055476, 0.053514, … 0.054451]"


In [24]:
explode_cols = ['article'] + [col for col in scores_df.columns if col.startswith('scores_')]
res = scores_df.drop('user_index', 'scores')\
    .explode(explode_cols)
res.head(2)

impression_id,user_id,article,scores_mean,scores_max,scores_min,scores_std
u32,u32,i32,f32,f32,f32,f32
149474,139836,9778623,0.979383,0.988899,0.988899,0.015714
149474,139836,9778682,0.977003,0.988986,0.988986,0.016969


# Multiple embeddings

In [29]:
emb_name_list = {'Ekstra_Bladet_contrastive_vector': 'contrastive_vector',
                #  'Ekstra_Bladet_image_embeddings': 'image_embeddings',
                 'FacebookAI_xlm_roberta_base': 'xlm_roberta_base',
                 'google_bert_base_multilingual_cased': 'bert_base_multilingual_cased'}

In [80]:
def build_emb_scores(df: pl.DataFrame, history_m: np.ndarray, m_dict:dict[str, np.ndarray]):
    df = reduce_polars_df_memory_size(df)
    print(f'Starting to build embeddings scores for {m_dict.keys()}...')
    df = pl.concat([
        slice.explode(['article_index', 'article']).with_columns(
            *[pl.lit(np.dot(m[slice['article_index'].explode().to_numpy()], m[history_m[key[0]]].T)).alias(f'{emb_name}_scores') for emb_name, m in m_dict.items()]
        )\
        .group_by(['impression_id', 'user_id', 'user_index'])\
        .agg(pl.all())
        for key, slice in tqdm(df[:1000].partition_by(by=['user_index'], as_dict=True).items(), total=df['user_index'].n_unique()) # keep only 1000 for testing
    ]).drop('article_index', 'user_index')
    return df

def build_agg_scores(df: pl.DataFrame, history: pl.DataFrame, emb_names: list[str]):
    df = reduce_polars_df_memory_size(df)
    print(f'Starting to build aggregated scores for {emb_names}...')
    df = df.with_columns(
        *[pl.col(f'{emb_name}_scores').list.eval(pl.element().list.mean()).name.suffix('_mean') for emb_name in emb_names],
        *[pl.col(f'{emb_name}_scores').list.eval(pl.element().list.max()).name.suffix('_max') for emb_name in emb_names],
        *[pl.col(f'{emb_name}_scores').list.eval(pl.element().list.max()).name.suffix('_min') for emb_name in emb_names],
        *[pl.col(f'{emb_name}_scores').list.eval(pl.element().list.std()).name.suffix('_std') for emb_name in emb_names],
    )
    return df


In [81]:
norm_m_dict = {}
for dir, file_name in emb_name_list.items():
    print(f'Processing {file_name} embedding matrix...')
    emb_df = pl.read_parquet(emb_dir / dir / f'{file_name}.parquet')
    emb_df.columns = ['article_id', 'embedding']
    emb_df = emb_df.with_row_index()
    
    m = np.array([np.array(row) for row in emb_df['embedding'].to_numpy()])
    row_norms = np.linalg.norm(m, axis=1, keepdims=True)
    m = m / row_norms
    norm_m_dict[file_name] = m

Processing contrastive_vector embedding matrix...
Processing xlm_roberta_base embedding matrix...
Processing bert_base_multilingual_cased embedding matrix...


In [83]:
article_emb_mapping = emb_df.select('index', 'article_id')

history_m = history_train\
    .select('user_id', pl.col('article_id_fixed').list.eval(
                pl.element().replace(article_emb_mapping['article_id'], article_emb_mapping['index'], default=None)))\
    .with_row_index('user_index')

user_history_map = history_m.select('user_id', 'user_index')
history_m = history_m['article_id_fixed'].to_numpy()
train_ds = behaviors_train.select('impression_id', 'user_id', pl.col('article_ids_inview').alias('article'))\
    .join(user_history_map, on='user_id')\
    .with_columns(
        pl.col('article').list.eval(pl.element().replace(article_emb_mapping['article_id'], article_emb_mapping['index'], default=None)).name.suffix('_index'),
    ).drop('impression_time_fixed', 'scroll_percentage_fixed', 'read_time_fixed')

train_ds = build_emb_scores(train_ds, history_m, m_dict=norm_m_dict)
train_ds = build_agg_scores(train_ds, history_train, emb_names=list(norm_m_dict.keys()))
agg_scores_col = [col for col in train_ds.columns if '_scores_' in col]
train_ds = train_ds.drop([f'{emb_name}_scores' for emb_name in list(norm_m_dict.keys())]).explode(['article'] + agg_scores_col)
train_ds.head()

Memory usage of dataframe is 25.95 MB
Memory usage after optimization is: 25.95 MB
Decreased by 0.0%
Starting to build embeddings scores for dict_keys(['contrastive_vector', 'xlm_roberta_base', 'bert_base_multilingual_cased'])...


  3%|▎         | 466/15143 [00:03<01:48, 135.55it/s]


Memory usage of dataframe is 37.97 MB
Memory usage after optimization is: 37.97 MB
Decreased by 0.0%
Starting to build aggregated scores for ['contrastive_vector', 'xlm_roberta_base', 'bert_base_multilingual_cased']...


impression_id,user_id,article,contrastive_vector_scores_mean,xlm_roberta_base_scores_mean,bert_base_multilingual_cased_scores_mean,contrastive_vector_scores_max,xlm_roberta_base_scores_max,bert_base_multilingual_cased_scores_max,contrastive_vector_scores_min,xlm_roberta_base_scores_min,bert_base_multilingual_cased_scores_min,contrastive_vector_scores_std,xlm_roberta_base_scores_std,bert_base_multilingual_cased_scores_std
u32,u32,i32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
149474,139836,9778623,0.122687,0.999321,0.979383,0.355306,0.999502,0.988899,0.355306,0.999502,0.988899,0.085057,0.000127,0.015714
149474,139836,9778682,0.192321,0.999325,0.977003,0.634906,0.999623,0.988986,0.634906,0.999623,0.988986,0.180453,0.00024,0.016969
149474,139836,9778669,0.101564,0.999104,0.93978,0.504418,0.999329,0.951898,0.504418,0.999329,0.951898,0.225029,0.000158,0.009223
149474,139836,9778657,0.144526,0.999398,0.978806,0.667968,0.999623,0.991782,0.667968,0.999623,0.991782,0.251974,0.000176,0.016069
149474,139836,9778736,0.037258,0.998896,0.981011,0.434094,0.999381,0.99125,0.434094,0.999381,0.99125,0.158988,0.000378,0.01584


In [84]:
train_ds.null_count()

impression_id,user_id,article,contrastive_vector_scores_mean,xlm_roberta_base_scores_mean,bert_base_multilingual_cased_scores_mean,contrastive_vector_scores_max,xlm_roberta_base_scores_max,bert_base_multilingual_cased_scores_max,contrastive_vector_scores_min,xlm_roberta_base_scores_min,bert_base_multilingual_cased_scores_min,contrastive_vector_scores_std,xlm_roberta_base_scores_std,bert_base_multilingual_cased_scores_std
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
