In [2]:
import numpy as np
import polars as pl
from tqdm import tqdm
from polimi.utils._polars import reduce_polars_df_memory_size

In [3]:
from pathlib import Path


# Dataset root, given relative to the notebook's working directory.
dpath = Path('../../dataset')
emb_dir = dpath / 'embeddings'
# Selects which `ebnerd_<dtype>` folder is read below.
dtype = 'small'

# All three parquet files live under the same `ebnerd_<dtype>` folder.
ebnerd_dir = f'{dpath}/ebnerd_{dtype}'
articles = pl.read_parquet(f'{ebnerd_dir}/articles.parquet')

behaviors_train = pl.read_parquet(f'{ebnerd_dir}/train/behaviors.parquet')
history_train = pl.read_parquet(f'{ebnerd_dir}/train/history.parquet')

# Test: similarity scores from a single embedding (image embeddings)

In [125]:
# Load the image embeddings sorted by article id and normalise the column names.
embeddings = pl.read_parquet(emb_dir / 'Ekstra_Bladet_image_embeddings' / 'image_embeddings.parquet').sort('article_id')
embeddings.columns = ['article_id', 'embedding']
emb_size = len(embeddings['embedding'][0])

# Articles listed in `articles` but absent from the embedding file are padded
# with an all-zero vector so that every article id has a row.
missing_articles_in_embedding = list(set(articles['article_id'].to_numpy()) - set(embeddings['article_id'].to_numpy()))
null_vector = np.zeros(emb_size, dtype=np.float32)
# Only stack when there is something to add: a frame built from two empty lists
# has no usable column types and would not match the schema of `embeddings`.
if len(missing_articles_in_embedding) > 0:
    embeddings = embeddings.vstack(pl.DataFrame({'article_id': missing_articles_in_embedding, 'embedding': [null_vector] * len(missing_articles_in_embedding)}))

# `index` is the row position later used to address the embedding matrix.
embeddings = embeddings.with_row_index()
embeddings.head(2)

index,article_id,embedding
u32,i32,list[f32]
0,3000022,"[-0.033208, -0.013787, … -0.036042]"
1,3000063,"[-0.047797, -0.025657, … 0.018883]"


In [126]:
# Flag every row whose embedding consists solely of zeros (the padding vectors).
all_zero_embeddings = embeddings.with_columns(
    pl.col('embedding').list.eval(pl.element() == 0.0).list.all().alias('check')
)
# A boolean column can be used directly as the filter predicate.
are_all_zero_embeddings_present = all_zero_embeddings.filter(pl.col('check')).height > 0
are_all_zero_embeddings_present

True

In [128]:
# Stack the per-article embedding vectors into one 2-D matrix (one row per article).
embedding_rows = embeddings['embedding'].to_numpy()
m_non_norm = np.array(list(map(np.array, embedding_rows)))

# L2-normalise each row; the small epsilon keeps all-zero rows from dividing by zero.
row_norms = np.linalg.norm(m_non_norm, axis=1, keepdims=True)
m = np.divide(m_non_norm, row_norms + 1e-6)
m.shape

(106346, 1024)

In [4]:
# Lookup table pairing each article_id with its row index in `embeddings`.
article_emb_mapping = embeddings.select(['index', 'article_id'])
article_emb_mapping.head(1)

NameError: name 'embeddings' is not defined

In [5]:
# Replace every article id in each user's history with its embedding row index;
# ids without a mapping become null and are dropped.
history_m = (
    history_train
    .select(
        'user_id',
        pl.col('article_id_fixed').list.eval(
            pl.element()
            .replace(article_emb_mapping['article_id'], article_emb_mapping['index'], default=None)
            .drop_nulls()
        ),
    )
    .with_row_index('user_index')
)
# user_id -> position of that user's history inside `history_m`.
user_history_map = history_m.select('user_id', 'user_index')
# From here on `history_m` is the numpy view of the per-user index lists.
history_m = history_m.get_column('article_id_fixed').to_numpy()
history_m.shape

NameError: name 'article_emb_mapping' is not defined

In [131]:
# One row per impression: the in-view article ids, the user's position in
# `history_m` (`user_index`) and the in-view articles translated to embedding
# row indices (`article_index`). The join uses polars' default (inner) strategy.
#
# The former trailing `.drop('impression_time_fixed', 'scroll_percentage_fixed',
# 'read_time_fixed')` was removed: none of those columns is ever present in this
# frame, so it did nothing, and recent polars releases raise on unknown columns.
df = (
    behaviors_train
    .select('impression_id', 'user_id', pl.col('article_ids_inview').alias('article'))
    .join(user_history_map, on='user_id')
    .with_columns(
        pl.col('article').list.eval(
            pl.element().replace(article_emb_mapping['article_id'], article_emb_mapping['index'], default=None)
        ).name.suffix('_index'),
    )
)

df = reduce_polars_df_memory_size(df)
df.head(2)

Memory usage of dataframe is 25.95 MB
Memory usage after optimization is: 25.95 MB
Decreased by 0.0%


impression_id,user_id,article,user_index,article_index
u32,u32,list[i32],u32,list[u32]
149474,139836,"[9778623, 9778682, … 9778728]",11894,"[100868, 100874, … 100879]"
150528,143471,"[9778718, 9778728, … 9778682]",7016,"[105241, 100879, … 100874]"


In [132]:
# Quick check on the first 1000 impressions only.
df_sample = df[:1000]
# One sub-frame per user, so the user's history rows of `m` are looked up once per user.
user_partitions = df_sample.partition_by(by=['user_index'], as_dict=True)

# For each user: dot product between every in-view article vector and every
# history article vector, giving one list of scores per in-view article.
# The progress-bar total is the number of partitions actually iterated
# (previously it counted the users of the full, unsliced frame).
scores_df = pl.concat([
    user_rows.explode(['article_index', 'article']).with_columns(scores=np.dot(
        m[user_rows['article_index'].explode().to_numpy()],
        m[history_m[user_key[0]]].T))
    .group_by(['impression_id', 'user_id', 'user_index'])
    .agg(pl.all())
    for user_key, user_rows in tqdm(user_partitions.items(), total=len(user_partitions))
]).drop('article_index')
scores_df



  3%|▎         | 466/15143 [00:02<01:16, 191.65it/s]


impression_id,user_id,user_index,article,scores
u32,u32,u32,list[i32],list[list[f32]]
149474,139836,11894,"[9778623, 9778682, … 9778728]","[[0.546279, 0.456443, … 0.068054], [0.108021, -0.134935, … 0.308029], … [0.789861, 0.399746, … -0.11546]]"
150528,143471,7016,"[9778718, 9778728, … 9778682]","[[0.0, 0.0, … 0.0], [0.766936, 0.0, … 0.0], … [0.028365, 0.0, … 0.0]]"
153075,151570,7074,"[9778500, 9776420, … 9020783]","[[0.290248, 0.0, … -0.126946], [-0.170831, 0.0, … 0.305366], … [0.552781, 0.0, … 0.519307]]"
153071,151570,7074,"[9777492, 9774568, … 9775990]","[[-0.130659, 0.0, … -0.122535], [0.438304, 0.0, … 0.340977], … [0.6444, 0.0, … 0.253928]]"
153068,151570,7074,"[9778657, 9778669, … 9778682]","[[0.0, 0.0, … 0.0], [0.2538, 0.0, … 0.184754], … [0.087496, 0.0, … 0.545875]]"
…,…,…,…,…
2433248,1606050,14460,"[9552181, 9779263, … 9547869]","[[0.0, 0.0, … 0.0], [0.37884, 0.271457, … -0.0275], … [0.62122, 0.070636, … 0.213934]]"
2433256,1606050,14460,"[9483850, 9779648, … 9779777]","[[0.337382, -0.065383, … 0.307274], [0.0, 0.0, … 0.0], … [0.0, 0.0, … 0.0]]"
2435848,1692081,10750,"[9779263, 9779205, … 9779577]","[[-0.135781, 0.201273, … 0.452139], [0.197592, 0.566438, … 0.604189], … [0.629099, 0.220339, … 0.12199]]"
2435885,1695195,10254,"[9658252, 9569934, … 9775885]","[[0.59537, 0.322892, … 0.576517], [-0.082606, -0.137399, … -0.313506], … [0.004153, 0.27604, … 0.32076]]"


In [None]:
# Collapse each in-view article's list of history scores into summary statistics.
# Each new column is a list with one value per in-view article.
scores_df = scores_df.with_columns(
    pl.col('scores').list.eval(pl.element().list.mean()).name.suffix('_mean'),
    pl.col('scores').list.eval(pl.element().list.max()).name.suffix('_max'),
    # Fixed: this column was previously computed with `.list.max()`.
    pl.col('scores').list.eval(pl.element().list.min()).name.suffix('_min'),
    pl.col('scores').list.eval(pl.element().list.std()).name.suffix('_std'),
)
scores_df.head(2)

In [None]:
# Explode the article list together with every `scores_*` summary column so the
# result has one row per (impression, in-view article).
explode_cols = ['article', *(name for name in scores_df.columns if name.startswith('scores_'))]
res = scores_df.drop('user_index', 'scores').explode(explode_cols)
res.head(2)

# Multiple embeddings

In [6]:
# Maps each embedding sub-directory of `emb_dir` to the stem of the parquet file
# inside it; the stem is also used as the short name of the embedding.
emb_name_list = dict(
    Ekstra_Bladet_contrastive_vector='contrastive_vector',
    FacebookAI_xlm_roberta_base='xlm_roberta_base',
    Ekstra_Bladet_image_embeddings='image_embeddings',
    google_bert_base_multilingual_cased='bert_base_multilingual_cased',
)

In [14]:
def build_emb_scores(df: pl.DataFrame, history_m: np.ndarray, m_dict:dict[str, np.ndarray]):
    """Score every in-view article against the user's history, once per embedding.

    The frame is processed one user at a time. For each user and each matrix in
    `m_dict`, the rows of the in-view articles are multiplied (dot product) with
    the rows of the user's history articles. If the matrix rows are L2-normalised
    these values are cosine similarities.

    Args:
        df: impressions with columns `impression_id`, `user_id`, `user_index`
            and the list columns `article` / `article_index`.
        history_m: indexed by `user_index`; each entry holds the embedding row
            indices of that user's history articles.
        m_dict: embedding name -> embedding matrix (one row per article index).

    Returns:
        One row per (`impression_id`, `user_id`) with the `article` list and, for
        each embedding, a `<name>_scores` column holding one list of history
        scores per in-view article. `article_index` and `user_index` are dropped.
    """
    df = reduce_polars_df_memory_size(df)
    print(f'Starting to build embeddings scores for {m_dict.keys()}...')

    user_partitions = df.partition_by(by=['user_index'], as_dict=True)
    per_user_frames = []
    for user_key, user_rows in tqdm(user_partitions.items(), total=df['user_index'].n_unique()):
        # Flat list of embedding row indices for all in-view articles of this user.
        inview_idx = user_rows['article_index'].explode().to_numpy()
        # The partition key is a tuple; its only element is the user_index.
        history_idx = history_m[user_key[0]]

        score_columns = [
            pl.lit(np.dot(emb_matrix[inview_idx], emb_matrix[history_idx].T)).alias(f'{emb_name}_scores')
            for emb_name, emb_matrix in m_dict.items()
        ]
        per_user_frames.append(
            user_rows.explode(['article_index', 'article'])
            .with_columns(*score_columns)
            # Fold the exploded rows back into one row per impression.
            .group_by(['impression_id', 'user_id', 'user_index'])
            .agg(pl.all())
        )

    df = pl.concat(per_user_frames).drop('article_index', 'user_index')
    return df

def build_agg_scores(df: pl.DataFrame, history: pl.DataFrame, emb_names: list[str]):
    """Add per-article summary statistics of the `<emb_name>_scores` columns.

    For every embedding name, the inner score list of each in-view article is
    reduced to its mean, max, min, std and median. The results are stored in
    `<emb_name>_scores_mean`, `..._max`, `..._min`, `..._std` and `..._median`.

    Args:
        df: frame containing a `<emb_name>_scores` list-of-lists column for each
            entry of `emb_names`.
        history: not used by this function.
        emb_names: embedding names whose score columns are aggregated.

    Returns:
        `df` with the aggregate columns appended.
    """
    df = reduce_polars_df_memory_size(df)
    print(f'Starting to build aggregated scores for {emb_names}...')

    # suffix -> reduction applied to the inner list of scores; dict order fixes
    # the order in which the new columns are appended.
    reducers = {
        '_mean': lambda scores: scores.list.mean(),
        '_max': lambda scores: scores.list.max(),
        '_min': lambda scores: scores.list.min(),
        '_std': lambda scores: scores.list.std(),
        '_median': lambda scores: scores.list.median(),
    }
    agg_columns = [
        pl.col(f'{emb_name}_scores').list.eval(reducer(pl.element())).name.suffix(suffix)
        for suffix, reducer in reducers.items()
        for emb_name in emb_names
    ]
    df = df.with_columns(*agg_columns)
    return df


In [15]:
# embedding short name -> row-normalised embedding matrix; row i belongs to the
# article whose `index` is i in `article_emb_mapping`.
norm_m_dict = {}
# maintain_order=True keeps the articles in file order, so the article -> index
# assignment is the same on every run (plain `unique()` does not guarantee order).
article_emb_mapping = articles.select('article_id').unique(maintain_order=True).with_row_index()
# Loop-invariant: the full set of known article ids.
all_article_ids = set(articles['article_id'].to_numpy())

# `emb_subdir` rather than `dir`: a top-level loop variable stays in the kernel
# namespace after the cell runs and would shadow the builtin `dir`.
for emb_subdir, file_name in emb_name_list.items():
    print(f'Processing {file_name} embedding matrix...')
    emb_df = pl.read_parquet(emb_dir / emb_subdir / f'{file_name}.parquet')
    emb_df.columns = ['article_id', 'embedding']

    emb_size = len(emb_df['embedding'][0])
    missing_articles_in_embedding = list(all_article_ids - set(emb_df['article_id'].to_numpy()))
    if len(missing_articles_in_embedding) > 0:
        print(f'[Warning... {len(missing_articles_in_embedding)} missing articles in embedding matrix]')
        # Articles without an embedding get an all-zero vector.
        null_vector = np.zeros(emb_size, dtype=np.float32)
        emb_df = emb_df.vstack(pl.DataFrame({'article_id': missing_articles_in_embedding, 'embedding': [null_vector] * len(missing_articles_in_embedding)}))

    # Left join on the mapping puts the embeddings in `index` order.
    emb_df = article_emb_mapping.join(emb_df, on='article_id', how='left')
    m = np.array([np.array(row) for row in emb_df['embedding'].to_numpy()])
    # L2-normalise each row; the epsilon keeps all-zero rows from dividing by zero.
    row_norms = np.linalg.norm(m, axis=1, keepdims=True)
    m = m / (row_norms + 1e-6)
    norm_m_dict[file_name] = m

Processing contrastive_vector embedding matrix...
Processing xlm_roberta_base embedding matrix...
Processing image_embeddings embedding matrix...
Processing bert_base_multilingual_cased embedding matrix...


In [16]:
# Replace every article id in each user's history with its embedding row index.
# Ids without a mapping become null and are dropped, as in the single-embedding
# test section above; a null would otherwise end up in the index array.
history_indexed = (
    history_train
    .select(
        'user_id',
        pl.col('article_id_fixed').list.eval(
            pl.element()
            .replace(article_emb_mapping['article_id'], article_emb_mapping['index'], default=None)
            .drop_nulls()
        ),
    )
    .with_row_index('user_index')
)

# user_id -> position of that user's history inside `history_m`.
user_history_map = history_indexed.select('user_id', 'user_index')
history_m = history_indexed['article_id_fixed'].to_numpy()

# One row per impression with the in-view article ids and their embedding row
# indices. The former trailing `.drop('impression_time_fixed',
# 'scroll_percentage_fixed', 'read_time_fixed')` was removed: those columns are
# never present in this frame, and recent polars releases raise on unknown columns.
train_ds = (
    behaviors_train
    .select('impression_id', 'user_id', pl.col('article_ids_inview').alias('article'))
    .join(user_history_map, on='user_id')
    .with_columns(
        pl.col('article').list.eval(
            pl.element().replace(article_emb_mapping['article_id'], article_emb_mapping['index'], default=None)
        ).name.suffix('_index'),
    )
)

train_ds = build_emb_scores(train_ds, history_m, m_dict=norm_m_dict)
# train_ds = build_agg_scores(train_ds, history_train, emb_names=list(norm_m_dict.keys()))
# agg_scores_col = [col for col in train_ds.columns if '_scores_' in col]
# train_ds = train_ds.drop([f'{emb_name}_scores' for emb_name in list(norm_m_dict.keys())]).explode(['article'] + agg_scores_col)
train_ds.head()

Memory usage of dataframe is 25.95 MB
Memory usage after optimization is: 21.01 MB
Decreased by 19.0%
Starting to build embeddings scores for dict_keys(['contrastive_vector', 'xlm_roberta_base', 'image_embeddings', 'bert_base_multilingual_cased'])...


100%|██████████| 15143/15143 [03:24<00:00, 74.05it/s] 


impression_id,user_id,article,contrastive_vector_scores,xlm_roberta_base_scores,image_embeddings_scores,bert_base_multilingual_cased_scores
u32,u32,list[i32],list[list[f32]],list[list[f32]],list[list[f32]],list[list[f32]]
149474,139836,"[9778623, 9778682, … 9778728]","[[0.280006, 0.019717, … 0.310368], [0.19372, 0.089885, … 0.331878], … [0.195114, 0.004229, … 0.188044]]","[[0.999206, 0.999163, … 0.999462], [0.998954, 0.999033, … 0.999613], … [0.999181, 0.999166, … 0.999317]]","[[0.546279, 0.456443, … 0.068054], [0.108021, -0.134935, … 0.308029], … [0.789861, 0.399746, … -0.11546]]","[[0.988827, 0.984724, … 0.954573], [0.982375, 0.988657, … 0.95557], … [0.976225, 0.973736, … 0.937987]]"
344665043,139836,"[9773700, 9773644, … 9773727]","[[0.00788, -0.050313, … 0.187935], [-0.038524, -0.008582, … 0.059008], … [0.123018, 0.079869, … 0.159032]]","[[0.997335, 0.996725, … 0.995732], [0.997351, 0.997109, … 0.996358], … [0.998496, 0.998695, … 0.99917]]","[[0.0, 0.0, … 0.0], [0.343694, 0.351041, … 0.314163], … [0.219095, 0.255553, … 0.599089]]","[[0.936781, 0.935228, … 0.942454], [0.946271, 0.951898, … 0.948559], … [0.977936, 0.977717, … 0.946093]]"
285235071,139836,"[9770997, 9769306, … 9771042]","[[0.165057, 0.204798, … 0.208821], [0.544786, 0.21536, … 0.183794], … [0.152006, 0.137472, … 0.108568]]","[[0.999059, 0.999281, … 0.999239], [0.99926, 0.999255, … 0.999421], … [0.998882, 0.999164, … 0.999225]]","[[0.0, 0.0, … 0.0], [0.445831, 0.354551, … 0.458699], … [-0.299292, 0.2319, … 0.082202]]","[[0.899222, 0.901025, … 0.913627], [0.988371, 0.988426, … 0.954942], … [0.942678, 0.949056, … 0.957068]]"
285235075,139836,"[9695098, 9769306, … 9771042]","[[0.170219, 0.235351, … 0.025821], [0.544786, 0.21536, … 0.183794], … [0.152006, 0.137472, … 0.108568]]","[[0.999037, 0.999121, … 0.998786], [0.99926, 0.999255, … 0.999421], … [0.998882, 0.999164, … 0.999225]]","[[0.0, 0.0, … 0.0], [0.445831, 0.354551, … 0.458699], … [-0.299292, 0.2319, … 0.082202]]","[[0.987271, 0.987736, … 0.953529], [0.988371, 0.988426, … 0.954942], … [0.942678, 0.949056, … 0.957068]]"
344665040,139836,"[9531745, 9773493, … 9773341]","[[0.289196, 0.212383, … 0.174665], [0.149765, -0.019319, … 0.155517], … [0.216174, 0.593863, … 0.064604]]","[[0.998885, 0.99902, … 0.999357], [0.99846, 0.998762, … 0.999127], … [0.999023, 0.999339, … 0.999147]]","[[0.20742, 0.381484, … 0.314527], [0.196254, 0.25199, … 0.365795], … [-0.042151, 0.007024, … 0.312564]]","[[0.983417, 0.9841, … 0.951619], [0.9758, 0.975772, … 0.946478], … [0.981525, 0.980215, … 0.941779]]"


In [1]:
import polars as pl

In [8]:
# Inspect a previously exported embedding-scores file.
# NOTE(review): this is a machine-specific absolute path; it will not resolve on
# another machine. Consider deriving it from a configurable project directory.
embeddings_scores_path = '/Users/lorecampa/Desktop/Projects/RecSysChallenge2024/experiments/preprocessing_embedding_scores_2024-05-07_16-45-14/train/embeddings_scores.parquet'
pl.read_parquet(embeddings_scores_path)

impression_id,user_id,article,contrastive_vector_scores,xlm_roberta_base_scores,image_embeddings_scores,bert_base_multilingual_cased_scores
u32,u32,list[i32],list[list[f32]],list[list[f32]],list[list[f32]],list[list[f32]]
149474,139836,"[9778623, 9778682, … 9778728]","[[0.280006, 0.019717, … 0.310368], [0.19372, 0.089885, … 0.331878], … [0.195114, 0.004229, … 0.188044]]","[[0.999206, 0.999163, … 0.999462], [0.998954, 0.999033, … 0.999613], … [0.999181, 0.999166, … 0.999317]]","[[0.546279, 0.456443, … 0.068054], [0.108021, -0.134935, … 0.308029], … [0.789861, 0.399746, … -0.11546]]","[[0.988827, 0.984724, … 0.954573], [0.982375, 0.988657, … 0.95557], … [0.976225, 0.973736, … 0.937987]]"
150528,143471,"[9778718, 9778728, … 9778682]","[[0.487253, 0.222981, … 0.421921], [0.27305, 0.079195, … 0.05142], … [0.192729, 0.108325, … 0.272533]]","[[0.999433, 0.999432, … 0.999513], [0.999252, 0.999181, … 0.999224], … [0.999523, 0.999268, … 0.999264]]","[[0.0, 0.0, … 0.0], [0.766936, 0.0, … 0.0], … [0.028365, 0.0, … 0.0]]","[[0.9813, 0.945006, … 0.960722], [0.972239, 0.936991, … 0.948615], … [0.987886, 0.95329, … 0.961935]]"
153070,151570,"[9020783, 9778444, … 9778628]","[[0.40875, 0.133315, … 0.162373], [0.265542, 0.090084, … 0.255454], … [0.2833, 0.096032, … 0.221158]]","[[0.999482, 0.999167, … 0.999274], [0.999354, 0.999083, … 0.999389], … [0.999391, 0.999181, … 0.999296]]","[[0.552781, 0.0, … 0.519307], [0.0, 0.0, … 0.0], … [-0.077273, 0.0, … 0.123942]]","[[0.990108, 0.939705, … 0.979351], [0.986347, 0.941202, … 0.975838], … [0.981966, 0.937075, … 0.983944]]"
153071,151570,"[9777492, 9774568, … 9775990]","[[0.436637, 0.022653, … 0.152943], [0.168714, 0.138129, … 0.138965], … [0.188897, 0.106059, … 0.135605]]","[[0.999441, 0.999144, … 0.999291], [0.999124, 0.998861, … 0.999209], … [0.999332, 0.998993, … 0.999224]]","[[-0.130659, 0.0, … -0.122535], [0.438304, 0.0, … 0.340977], … [0.6444, 0.0, … 0.253928]]","[[0.990504, 0.944563, … 0.982758], [0.813731, 0.847427, … 0.828457], … [0.951217, 0.913168, … 0.952556]]"
153078,151570,"[9778021, 9778627, … 7213923]","[[0.363323, 0.097399, … 0.28931], [0.201101, 0.192492, … 0.18965], … [0.009028, 0.058391, … 0.01964]]","[[0.999129, 0.998635, … 0.999222], [0.999251, 0.999142, … 0.999195], … [0.999299, 0.999075, … 0.999146]]","[[0.427206, 0.0, … 0.154384], [0.239121, 0.0, … 0.156372], … [-0.068958, 0.0, … -0.38356]]","[[0.98114, 0.928685, … 0.981403], [0.963864, 0.952562, … 0.953528], … [0.988723, 0.938198, … 0.976789]]"
…,…,…,…,…,…,…
2433256,1606050,"[9483850, 9779648, … 9779777]","[[0.099568, 0.269038, … 0.240463], [0.423035, 0.358301, … 0.354762], … [0.147808, 0.339846, … 0.035343]]","[[0.999248, 0.999437, … 0.999439], [0.999629, 0.999317, … 0.999263], … [0.999344, 0.999509, … 0.999301]]","[[0.337382, -0.065383, … 0.307274], [0.0, 0.0, … 0.0], … [0.0, 0.0, … 0.0]]","[[0.983992, 0.983894, … 0.984721], [0.98539, 0.976882, … 0.981156], … [0.981787, 0.982505, … 0.981081]]"
2433248,1606050,"[9552181, 9779263, … 9547869]","[[0.279315, 0.337229, … 0.133837], [0.378713, 0.02498, … 0.094999], … [0.21466, 0.081328, … 0.215833]]","[[0.999341, 0.999519, … 0.999569], [0.999577, 0.999119, … 0.998869], … [0.999402, 0.999432, … 0.999415]]","[[0.0, 0.0, … 0.0], [0.37884, 0.271457, … -0.0275], … [0.62122, 0.070636, … 0.213934]]","[[0.987948, 0.98747, … 0.987622], [0.988494, 0.983613, … 0.982609], … [0.985486, 0.982646, … 0.985924]]"
2435848,1692081,"[9779263, 9779205, … 9779577]","[[0.058322, 0.110961, … 0.247141], [0.041045, 0.206982, … 0.433246], … [0.079398, 0.193836, … 0.544893]]","[[0.997882, 0.998676, … 0.999465], [0.998287, 0.998917, … 0.999622], … [0.999135, 0.998783, … 0.998799]]","[[-0.135781, 0.201273, … 0.452139], [0.197592, 0.566438, … 0.604189], … [0.629099, 0.220339, … 0.12199]]","[[0.94621, 0.977764, … 0.984171], [0.942077, 0.969015, … 0.983493], … [0.944518, 0.981686, … 0.988128]]"
2435885,1695195,"[9658252, 9569934, … 9775885]","[[0.417648, 0.384465, … 0.255178], [0.45826, 0.31132, … 0.2843], … [0.571571, 0.283195, … 0.335594]]","[[0.999386, 0.999555, … 0.999455], [0.999396, 0.999309, … 0.999222], … [0.999655, 0.999617, … 0.999547]]","[[0.59537, 0.322892, … 0.576517], [-0.082606, -0.137399, … -0.313506], … [0.004153, 0.27604, … 0.32076]]","[[0.942331, 0.990499, … 0.992581], [0.941311, 0.988905, … 0.984904], … [0.943828, 0.991274, … 0.993161]]"
