In [1]:
import polars as pl
from pathlib import Path
from polimi.utils._urm import train_recommender
from RecSys_Course_AT_PoliMi.Recommenders.GraphBased.P3alphaRecommender import P3alphaRecommender
from RecSys_Course_AT_PoliMi.Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
from RecSys_Course_AT_PoliMi.Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from RecSys_Course_AT_PoliMi.Recommenders.KNN.UserKNNCFRecommender import UserKNNCFRecommender
from RecSys_Course_AT_PoliMi.Recommenders.MatrixFactorization.Cython.MatrixFactorization_Cython import MatrixFactorization_AsySVD_Cython, MatrixFactorization_BPR_Cython
from RecSys_Course_AT_PoliMi.Recommenders.MatrixFactorization.PureSVDRecommender import PureSVDRecommender, PureSVDItemRecommender
from RecSys_Course_AT_PoliMi.Recommenders.Neural.MultVAERecommender import MultVAERecommender
from RecSys_Course_AT_PoliMi.Recommenders.SLIM.SLIMElasticNetRecommender import SLIMElasticNetRecommender, MultiThreadSLIM_SLIMElasticNetRecommender
from RecSys_Course_AT_PoliMi.Recommenders.SLIM.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython
from RecSys_Course_AT_PoliMi.Recommenders.MatrixFactorization.NMFRecommender import NMFRecommender

In [1]:
dpath = Path('../../dataset')

dtype = 'small'
articles = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/articles.parquet')

behaviors_train = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/train/behaviors.parquet')
history_train = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/train/history.parquet')

behaviors_val = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/validation/behaviors.parquet')
history_val = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/validation/history.parquet')

NameError: name 'Path' is not defined

In [5]:
from polimi.utils._urm import *
ap = build_articles_with_processed_ner(articles)
ner_mapping_new = build_ner_mapping(ap)
ner_mapping_new.head(2)

ner_index,ner
u32,str
0,"""# metoo"""
1,"""# minhistorieminmening"""


## Create URM

In [6]:
user_id_mapping = history_train.sort('user_id').with_row_index() \
    .select(['index', 'user_id']).rename({'index': 'user_index'})

ner_mapping = articles.select(['ner_clusters']) \
    .explode('ner_clusters') \
    .rename({'ner_clusters': 'ner'}) \
    .with_columns(pl.col('ner').str.strip_chars_start('\" ').str.to_lowercase()) \
    .unique('ner').drop_nulls().filter(pl.col('ner').str.len_chars() > 0) \
    .sort('ner').with_row_index().rename({'index': 'ner_index'})

In [6]:
import scipy.sparse as sps
import numpy as np

ner_interactions = history_train.select(['user_id', 'article_id_fixed']).explode('article_id_fixed') \
    .join(articles.select(['article_id', 'ner_clusters']), left_on='article_id_fixed', right_on='article_id', how='left') \
    .explode('ner_clusters').unique(['user_id', 'ner_clusters']) \
    .with_columns(pl.col('ner_clusters').str.strip_chars_start('\" ').str.to_lowercase()) \
    .join(user_id_mapping, on='user_id', how='left') \
    .join(ner_mapping, left_on='ner_clusters', right_on='ner') \
    .select(['user_index', 'ner_index', 'user_id', 'article_id_fixed']) \
    .unique(['user_index', 'ner_index'])

URM_ner = sps.csr_matrix((np.ones(ner_interactions.shape[0]),
                          (ner_interactions['user_index'].to_numpy(), ner_interactions['ner_index'].to_numpy())),
                         shape=(user_id_mapping.shape[0], ner_mapping.shape[0]))

ner_interactions_test = behaviors_train.select(['user_id', 'article_ids_clicked']).explode('article_ids_clicked') \
    .join(articles.select(['article_id', 'ner_clusters']), left_on='article_ids_clicked', right_on='article_id', how='left') \
    .explode('ner_clusters').unique(['user_id', 'ner_clusters']) \
    .with_columns(pl.col('ner_clusters').str.strip_chars_start('\" ').str.to_lowercase()) \
    .join(user_id_mapping, on='user_id', how='left') \
    .join(ner_mapping, left_on='ner_clusters', right_on='ner') \
    .select(['user_index', 'ner_index', 'user_id', 'article_ids_clicked']) \
    .unique(['user_index', 'ner_index'])

URM_ner_test = sps.csr_matrix((np.ones(ner_interactions_test.shape[0]),
                               (ner_interactions_test['user_index'].to_numpy(), ner_interactions_test['ner_index'].to_numpy())),
                              shape=(user_id_mapping.shape[0], ner_mapping.shape[0]))

In [3]:
from polimi.utils._custom import load_sparse_csr

ner_path = dpath.joinpath('urm').joinpath('ner').joinpath('small')
algo_path = ner_path.joinpath('algo').joinpath('train')

URM_train = load_sparse_csr(ner_path.joinpath('URM_train.npz'))
URM_train_val = load_sparse_csr(ner_path.joinpath('URM_train_val.npz'))
URM_val =  load_sparse_csr(ner_path.joinpath('URM_validation.npz'))
URM_test = load_sparse_csr(ner_path.joinpath('URM_test.npz'))

File loaded at: ../../dataset/urm/ner/small/URM_train.npz
File loaded at: ../../dataset/urm/ner/small/URM_train_val.npz
File loaded at: ../../dataset/urm/ner/small/URM_validation.npz
File loaded at: ../../dataset/urm/ner/small/URM_test.npz


# Build rec sys features

In [9]:
# rp3beta = RP3betaRecommender(URM_train)

# item_knn = ItemKNNCFRecommender(URM_train)
puresvd = PureSVDRecommender(URM_train)
puresvd.fit(num_factors=20)
# rp3beta.load_model(str(algo_path), file_name='RP3betaRecommender-ner-small-ndcg100_new')
# user_knn.load_model(str(algo_path), file_name='UserKNNCFRecommender-ner-small-ndcg100')
# item_knn.load_model(str(algo_path), file_name='ItemKNNCFRecommender-ner-small-ndcg100')
# pure_svd_item.load_model(str(algo_path), file_name='PureSVDItemRecommender-ner-small-ndcg100')

NameError: name 'URM_train' is not defined

In [6]:
from polimi.utils._urm import build_ner_mapping, build_user_id_mapping, build_articles_with_processed_ner, _build_batch_ner_interactions
user_id_mapping = build_user_id_mapping(history_train.vstack(history_val))
ap = build_articles_with_processed_ner(articles)
ner_mapping = build_ner_mapping(ap)
ap = ap.with_columns(
    pl.col('ner_clusters').list.eval(pl.element().replace(ner_mapping['ner'], ner_mapping['ner_index'], default=None).drop_nulls()).alias('ner_clusters_index'),
)

In [7]:
train_ds = behaviors_train
train_ds = train_ds\
    .rename({'article_ids_inview': 'candidate_ids'})\
    .with_columns(
        pl.col('candidate_ids').list.eval(pl.element().replace(ap['article_id'], ap['ner_clusters_index'], default=None).drop_nulls()).alias('candidate_ner_index'),
        pl.col('user_id').replace(user_id_mapping['user_id'], user_id_mapping['user_index'], default=None).alias('user_index')
    ).select('impression_id', 'user_id', 'user_index', 'candidate_ids', 'candidate_ner_index')
train_ds.head(2)

impression_id,user_id,user_index,candidate_ids,candidate_ner_index
u32,u32,u32,list[i32],list[list[u32]]
149474,139836,938,"[9778623, 9778682, … 9778728]","[[1016, 3366, … 41304], [3827, 4679, … 39272], … [14122, 14127, … 42433]]"
150528,143471,972,"[9778718, 9778728, … 9778682]","[[5927, 23803, … 42713], [14122, 14127, … 42433], … [3827, 4679, … 39272]]"


In [8]:
all_users = user_id_mapping['user_index'].unique().sort().to_list()
all_items = ner_mapping['ner_index'].unique().sort().to_list()
# rp3betascores = rp3beta._compute_item_score(all_users)
# userknnparamscores = user_knn._compute_item_score(all_users)
# userknnparamscores.shape, rp3betascores.shape

In [27]:
# train_ds.with_columns(
#     pl.col('candidate_ner_index').list.eval(
#        (pl.element().list.len() > 0)
#     ).alias('x')
# ).with_columns(
#     pl.col('candidate_ids').gather('x').alias('m')
# )

In [9]:
from tqdm import tqdm
train_ds = pl.concat([
    slice.explode(['candidate_ids', 'candidate_ner_index'])\
        .filter(pl.col('candidate_ner_index').list.len() > 0)\
        .group_by(['impression_id', 'user_id', 'user_index']).agg(pl.all())
    for slice in tqdm(train_ds.iter_slices(10000), total=train_ds.shape[0]//10000)
])
train_ds.head(2)

24it [00:00, 62.09it/s]                        


impression_id,user_id,user_index,candidate_ids,candidate_ner_index
u32,u32,u32,list[i32],list[list[u32]]
9732680,968619,6932,"[9779019, 9052240, … 9779181]","[[6965, 10024, … 36241], [1520, 1748, … 35999], … [2485, 8768, … 41306]]"
17725622,779401,5583,"[9770028, 9776099, … 9775855]","[[197, 3478, … 42319], [5110, 8308, … 42371], … [293, 7736, … 41340]]"


In [12]:
from tqdm import tqdm
recs = [puresvd]

impression_scores = pl.concat([
    slice.with_columns(
            *[pl.col('candidate_ner_index').list.eval(
                pl.element().list.eval(pl.element().replace(all_items, rec._compute_item_score(user_index, all_items)[0, all_items], default=None))
            ).alias(f"{rec.RECOMMENDER_NAME}_scores") for rec in recs]
        ).drop('user_index', 'candidate_ner_index')
for user_index, slice in tqdm(train_ds[:100].partition_by(['user_index'], as_dict=True).items(), total=train_ds['user_index'].n_unique())
])
impression_scores.head(3)

  1%|          | 99/15143 [00:08<21:15, 11.79it/s] 


impression_id,user_id,candidate_ids,PureSVDRecommender_scores
u32,u32,list[i32],list[list[f32]]
9732680,968619,"[9779019, 9052240, … 9779181]","[[0.0, 0.534057, … 0.0], [0.534334, 0.012626, … 0.049816], … [0.0, 0.0, … 0.434684]]"
17725622,779401,"[9770028, 9776099, … 9775855]","[[0.606877, 0.104283, … 0.000432], [0.090129, 0.853301, … 0.284373], … [0.115324, 0.328663, … 0.502769]]"
8288145,151570,"[9774163, 9766889, … 9766757]","[[0.0, -0.000368, … 0.272246], [1.116187, 0.284376, … 0.514707], … [0.473899, 1.116187, … 1.186296]]"


In [17]:
from polimi.utils._polars import reduce_polars_df_memory_size
scores_cols = [col for col in impression_scores.columns if col.endswith('_scores')]
df = impression_scores.with_columns(
        *[pl.col(col).list.eval(pl.element().list.sum()).alias(f'sum_{col}') for col in scores_cols],
        *[pl.col(col).list.eval(pl.element().list.max()).alias(f'max_{col}') for col in scores_cols],
        *[pl.col(col).list.eval(pl.element().list.mean()).alias(f'mean_{col}') for col in scores_cols],
).drop(scores_cols)

df = reduce_polars_df_memory_size(df)
df = df.sort(['impression_id', 'user_id'])\
    .explode(pl.all().exclude(['impression_id', 'user_id']))\
    .rename({'candidate_ids': 'article'})

df.head(3)

Memory usage of dataframe is 0.02 MB
Memory usage after optimization is: 0.02 MB
Decreased by 0.0%


impression_id,user_id,article,sum_PureSVDRecommender_scores,max_PureSVDRecommender_scores,mean_PureSVDRecommender_scores
u32,u32,i32,f32,f32,f32
783925,773496,9775076,3.803324,1.477678,0.422592
783925,773496,9775142,0.393973,0.393973,0.078795
783925,773496,9775202,1.721335,0.476289,0.156485


# Test API

In [None]:
from polimi.utils._urm import build_ner_scores_features, load_recommender

load_dict = {
    RP3betaRecommender: {'path': algo_path, 'file_name': 'rp3beta'},
    UserKNNCFRecommender: {'path': algo_path, 'file_name': 'userknn'},
}

recs = []
for rec, load_info in load_dict.items():
    recs.append(load_recommender(URM_train, rec, file_path=str(load_info['path']), file_name=load_info['file_name']))


df = build_ner_scores_features(history=history_train, behaviors=behaviors_train, articles=articles, recs=recs)
df.head(3)

# Evaluate Models

In [None]:
rp3beta = RP3betaRecommender(URM_train)
user_knn = UserKNNCFRecommender(URM_train)
# item_knn = ItemKNNCFRecommender(URM_train)
# pure_svd_item = PureSVDItemRecommender(URM_train)
rp3beta.load_model(str(algo_path), file_name='RP3betaRecommender-ner-small-ndcg100')
user_knn.load_model(str(algo_path), file_name='UserKNNCFRecommender-ner-small-ndcg100')
# item_knn.load_model(str(algo_path), file_name='ItemKNNCFRecommender-ner-small-ndcg100')
# pure_svd_item.load_model(str(algo_path), file_name='PureSVDItemRecommender-ner-small-ndcg100')

In [None]:
from RecSys_Course_AT_PoliMi.Evaluation.Evaluator import EvaluatorHoldout

cutoff = 100
metric = 'NDCG'
evaluator = EvaluatorHoldout(URM_val, cutoff_list=[cutoff, 10], exclude_seen=False)
test_evaluator = EvaluatorHoldout(URM_test, cutoff_list=[cutoff, 10], exclude_seen=False)

In [None]:
# user_knn = UserKNNCFRecommender(URM_train)
# user_knn.fit()
result_df, _ = evaluator.evaluateRecommender(user_knn)
result_df.loc[cutoff][metric.upper()]

In [None]:
result_df

In [None]:
user_knn = UserKNNCFRecommender(URM_train_val)
user_knn.fit(**userknnparams)
result_df, _ = test_evaluator.evaluateRecommender(user_knn)
result_df.loc[cutoff][metric.upper()]