In [39]:
import polars as pl
from pathlib import Path

In [40]:
# Root folder of the dataset, relative to the notebook's working directory.
dpath = Path('../../dataset')

# Dataset variant; it selects the `ebnerd_<dtype>` sub-folder read below.
dtype = 'small'
# Article table (a single file, not split into train/validation).
articles = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/articles.parquet')

# Training split: behaviors and history tables.
behaviors_train = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/train/behaviors.parquet')
history_train = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/train/history.parquet')

# Validation split: the same two tables.
behaviors_val = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/validation/behaviors.parquet')
history_val = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/validation/history.parquet')

In [41]:
from polimi.utils._custom import load_best_optuna_params
# Look up stored hyperparameter dicts by study name (presumably the best Optuna
# trial of each study -- confirm in polimi.utils._custom).
rp3params = load_best_optuna_params('RP3betaRecommender-ner-small-ndcg10')
userknnparams = load_best_optuna_params('UserKNNCFRecommender-ner-small-ndcg10')
# NOTE(review): `x` is a throwaway name for the MatrixFactorization_BPR_Cython
# parameters, and its study name ends in `ndcg100` while the two above end in
# `ndcg10` -- confirm that this is intended.
x = load_best_optuna_params('MatrixFactorization_BPR_Cython-ner-small-ndcg100')
# Display the three dicts as the cell output.
rp3params, userknnparams, x

({'topK': 161,
  'normalize_similarity': True,
  'alpha': 0.6828230602859293,
  'beta': 0.40218098711568806},
 {'similarity': 'euclidean',
  'topK': 1000,
  'shrink': 944,
  'normalize_avg_row': True,
  'similarity_from_distance_mode': 'log',
  'normalize': False},
 {'sgd_mode': 'adam',
  'num_factors': 14,
  'batch_size': 128,
  'positive_reg': 4.714125732154838e-05,
  'negative_reg': 0.00024304832828732497,
  'learning_rate': 0.0002288000442554581})

In [4]:
from polimi.utils._custom import load_sparse_csr

# Folder holding the NER-based URM files for the small dataset variant.
ner_path = dpath / 'urm' / 'ner' / 'small'

# Load the train and validation matrices through load_sparse_csr.
URM_train = load_sparse_csr(ner_path / 'URM_train.npz')
URM_val = load_sparse_csr(ner_path / 'URM_validation.npz')

File loaded at: ../../dataset/urm/ner/small/URM_train.npz
File loaded at: ../../dataset/urm/ner/small/URM_validation.npz


In [5]:
from polimi.utils._urm import train_recommender
from RecSys_Course_AT_PoliMi.Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
from RecSys_Course_AT_PoliMi.Recommenders.KNN.UserKNNCFRecommender import UserKNNCFRecommender
# Folder from which the fitted recommenders are loaded later in the notebook.
algo_path = ner_path.joinpath('algo')

# Training is disabled here; the models are restored from disk instead.
# NOTE(review): the disabled calls below pass output_dir=Path('.'), not
# `algo_path` -- confirm where the saved models should live before re-enabling.
# train_recommender(URM_train, RP3betaRecommender, rp3params, file_name='rp3beta', output_dir=Path('.'))
# train_recommender(URM_train, UserKNNCFRecommender, userknnparams, file_name='userknn', output_dir=Path('.'))

# Build recommender-system features

In [6]:
# Instantiate both recommenders on the training URM; they are not fitted here.
rp3beta = RP3betaRecommender(URM_train)
user_knn = UserKNNCFRecommender(URM_train)
# Restore each model from `algo_path` (passed as a string) using the file name
# it was saved under.
rp3beta.load_model(str(algo_path), file_name='rp3beta')
user_knn.load_model(str(algo_path), file_name='userknn')

RP3betaRecommender: URM Detected 3685 (19.6%) users with no interactions.
RP3betaRecommender: URM Detected 18909 (43.2%) items with no interactions.
UserKNNCFRecommender: URM Detected 3685 (19.6%) users with no interactions.
UserKNNCFRecommender: URM Detected 18909 (43.2%) items with no interactions.
RP3betaRecommender: Loading model from file '../../dataset/urm/ner/small/algo/rp3beta'
RP3betaRecommender: Loading complete
UserKNNCFRecommender: Loading model from file '../../dataset/urm/ner/small/algo/userknn'
UserKNNCFRecommender: Loading complete


In [7]:
from polimi.utils._urm import build_ner_mapping, build_user_id_mapping, build_articles_with_processed_ner, _build_batch_ner_interactions
# user_id -> user_index lookup, built over the train and validation histories
# stacked together.
user_id_mapping = build_user_id_mapping(history_train.vstack(history_val))

# Articles with processed NER, and the `ner` -> `ner_index` lookup derived from them.
ap = build_articles_with_processed_ner(articles)
ner_mapping = build_ner_mapping(ap)

# Translate every entry of `ner_clusters` to its `ner_index`; entries without a
# match become null (default=None) and are dropped from the list.
ap = ap.with_columns(
    pl.col('ner_clusters')
    .list.eval(
        pl.element()
        .replace(ner_mapping['ner'], ner_mapping['ner_index'], default=None)
        .drop_nulls()
    )
    .alias('ner_clusters_index'),
)

In [8]:
# One row per impression: candidate article ids, the NER indices of each
# candidate, and the user's integer index.
# NOTE(review): drop_nulls() would make `candidate_ner_index` shorter than
# `candidate_ids` if a candidate id had no match in `ap`; the joint explode in a
# later cell relies on both lists having the same length.
train_ds = (
    behaviors_train
    .rename({'article_ids_inview': 'candidate_ids'})
    .with_columns(
        pl.col('candidate_ids')
        .list.eval(
            pl.element()
            .replace(ap['article_id'], ap['ner_clusters_index'], default=None)
            .drop_nulls()
        )
        .alias('candidate_ner_index'),
        pl.col('user_id')
        .replace(user_id_mapping['user_id'], user_id_mapping['user_index'], default=None)
        .alias('user_index'),
    )
    .select('impression_id', 'user_id', 'user_index', 'candidate_ids', 'candidate_ner_index')
)
train_ds.head(2)

impression_id,user_id,user_index,candidate_ids,candidate_ner_index
u32,u32,u32,list[i32],list[list[u32]]
149474,139836,938,"[9778623, 9778682, … 9778728]","[[1016, 3366, … 41304], [3827, 4679, … 39272], … [14122, 14127, … 42433]]"
150528,143471,972,"[9778718, 9778728, … 9778682]","[[5927, 23803, … 42713], [14122, 14127, … 42433], … [3827, 4679, … 39272]]"


In [9]:
# Sorted, de-duplicated index values taken from the two mapping tables.
all_users = user_id_mapping['user_index'].unique().sort().to_list()
# `all_items` is later used as the lookup keys when recommender scores are
# mapped onto NER indices.
# NOTE(review): that lookup pairs all_items[i] with the i-th score, so it
# presumably assumes the indices are contiguous from 0 -- confirm.
all_items = ner_mapping['ner_index'].unique().sort().to_list()
# Disabled: scoring all users in a single call.
# rp3betascores = rp3beta._compute_item_score(all_users)
# userknnparamscores = user_knn._compute_item_score(all_users)
# userknnparamscores.shape, rp3betascores.shape

In [10]:
from tqdm import tqdm
# Drop candidates that have no NER indices, working on slices of the frame.
# Each slice is exploded to one row per candidate, filtered, and then regrouped
# back to one row per impression.
impressions_per_slice = 10000
# Ceiling division: the progress-bar total must also count the final, partial
# slice (floor division reported 23 while 24 slices were actually processed).
n_slices = (train_ds.shape[0] + impressions_per_slice - 1) // impressions_per_slice

train_ds = pl.concat([
    # `chunk` rather than `slice`, so the Python builtin is not shadowed.
    chunk.explode(['candidate_ids', 'candidate_ner_index'])
    .filter(pl.col('candidate_ner_index').list.len() > 0)
    .group_by(['impression_id', 'user_id', 'user_index']).agg(pl.all())
    for chunk in tqdm(train_ds.iter_slices(impressions_per_slice), total=n_slices)
])
train_ds.head(2)

  0%|          | 0/23 [00:00<?, ?it/s]

24it [00:00, 66.31it/s]                        


impression_id,user_id,user_index,candidate_ids,candidate_ner_index
u32,u32,u32,list[i32],list[list[u32]]
18359307,1062789,7634,"[9779577, 9780195, … 9780193]","[[1579, 4886, … 37892], [8598, 10671, … 42272], … [251, 9781, … 22185]]"
13627414,815075,5831,"[9771248, 9772710, … 9440508]","[[2377, 8308, … 34317], [334, 633, … 42381], … [22185, 35345]]"


In [11]:
from tqdm import tqdm
# Recommenders to score with; each one adds a `<RECOMMENDER_NAME>_scores` column.
recs = [user_knn, rp3beta]

# Process one user at a time: the recommender's score row for that user is used
# to replace every NER index of every candidate with its score.
impression_scores = pl.concat([
    user_impressions.with_columns(
        *(
            pl.col('candidate_ner_index')
            .list.eval(
                # Outer element = one candidate (a list of NER indices);
                # inner element = a single NER index.
                pl.element().list.eval(
                    pl.element().replace(
                        all_items,
                        rec._compute_item_score(user_index)[0],
                        default=None,
                    )
                )
            )
            .alias(f"{rec.RECOMMENDER_NAME}_scores")
            for rec in recs
        )
    ).drop('user_index', 'candidate_ner_index')
    for user_index, user_impressions in tqdm(
        train_ds.partition_by(['user_index'], as_dict=True).items(),
        total=train_ds['user_index'].n_unique(),
    )
])
impression_scores.head(3)

100%|██████████| 15143/15143 [04:37<00:00, 54.61it/s]


impression_id,user_id,candidate_ids,UserKNNCFRecommender_scores,RP3betaRecommender_scores
u32,u32,list[i32],list[list[f32]],list[list[f32]]
18359307,1062789,"[9779577, 9780195, … 9780193]","[[0.0, 0.0, … 0.068838], [0.119672, 0.119672, … 0.0], … [0.119672, 0.118613, … 0.119672]]","[[0.0, 0.0, … 0.113963], [0.615105, 1.102915, … 0.0], … [0.413016, 0.274689, … 1.114899]]"
5518209,1062789,"[9774972, 9774789, … 9774542]","[[0.0, 0.118613, … 0.0], [0.119672, 0.114377, … 0.0], … [0.0]]","[[0.0, 0.295102, … 0.0], [1.102915, 0.18634, … 0.0], … [0.0]]"
42335635,1062789,"[9774541, 9775697, … 9470078]","[[0.119672, 0.119672, … 0.0], [0.0, 0.0, … 0.01165], … [0.117554, 0.01059, … 0.076251]]","[[0.614114, 1.102915, … 0.0], [0.0, 0.0, … 0.0], … [0.198479, 0.0, … 0.053455]]"


In [12]:
from polimi.utils._polars import reduce_polars_df_memory_size
# Raw score columns: every column whose name contains '_scores'.
scores_cols = [col for col in impression_scores.columns if '_scores' in col]

df = (
    impression_scores
    # Reduce each candidate's list of NER scores to its sum, max and mean.
    .with_columns(
        *(pl.col(col).list.eval(pl.element().list.sum()).alias(f'sum_{col}') for col in scores_cols),
        *(pl.col(col).list.eval(pl.element().list.max()).alias(f'max_{col}') for col in scores_cols),
        *(pl.col(col).list.eval(pl.element().list.mean()).alias(f'mean_{col}') for col in scores_cols),
    )
    # "inf norm": divide every aggregated value by the maximum over the
    # impression's candidates; a NaN result (e.g. 0/0) is replaced by 0.0.
    .with_columns(
        pl.all()
        .exclude(['impression_id', 'user_id', 'candidate_ids'] + scores_cols)
        .list.eval(pl.element().truediv(pl.element().max()).fill_nan(0.0)),
    )
    .drop(scores_cols)
)

df = reduce_polars_df_memory_size(df)

# Go from one row per impression to one row per (impression, candidate article).
df = (
    df.sort(['impression_id', 'user_id'])
    .explode(pl.all().exclude(['impression_id', 'user_id']))
    .rename({'candidate_ids': 'article'})
)

df.head(3)

Memory usage of dataframe is 80.99 MB
Memory usage after optimization is: 80.99 MB
Decreased by 0.0%


impression_id,user_id,article,sum_UserKNNCFRecommender_scores,sum_RP3betaRecommender_scores,max_UserKNNCFRecommender_scores,max_RP3betaRecommender_scores,mean_UserKNNCFRecommender_scores,mean_RP3betaRecommender_scores
u32,u32,i32,f32,f32,f32,f32,f32,f32
149474,139836,9778623,0.174803,0.284011,1.0,0.858929,0.14791,0.240317
149474,139836,9778682,0.057638,0.008967,0.380449,0.02712,0.070446,0.01096
149474,139836,9778669,0.036063,0.0,0.302508,0.0,0.066115,0.0


In [28]:
# Sanity check: show the rows with a NaN in any column (empty result = none).
df.filter(pl.any_horizontal(pl.all().is_nan()))

impression_id,user_id,article,sum_UserKNNCFRecommender_scores,sum_RP3betaRecommender_scores,max_UserKNNCFRecommender_scores,max_RP3betaRecommender_scores,mean_UserKNNCFRecommender_scores,mean_RP3betaRecommender_scores
u32,u32,i32,f32,f32,f32,f32,f32,f32


In [29]:
# Sanity check: show the rows with a null in any column (empty result = none).
df.filter(pl.any_horizontal(pl.all().is_null()))

impression_id,user_id,article,sum_UserKNNCFRecommender_scores,sum_RP3betaRecommender_scores,max_UserKNNCFRecommender_scores,max_RP3betaRecommender_scores,mean_UserKNNCFRecommender_scores,mean_RP3betaRecommender_scores
u32,u32,i32,f32,f32,f32,f32,f32,f32


# Test API

In [6]:
from polimi.utils._urm import build_ner_scores_features, load_recommender

# Recommender class -> folder and file name of its saved model.
load_dict = {
    RP3betaRecommender: {'path': algo_path, 'file_name': 'rp3beta'},
    UserKNNCFRecommender: {'path': algo_path, 'file_name': 'userknn'},
}

# Restore the models through the library helper, in the dict's insertion order.
recs = []
for rec, load_info in load_dict.items():
    recs.append(
        load_recommender(
            URM_train,
            rec,
            file_path=str(load_info['path']),
            file_name=load_info['file_name'],
        )
    )

# Same feature construction as the manual cells above, via the packaged helper.
df = build_ner_scores_features(
    history=history_train,
    behaviors=behaviors_train,
    articles=articles,
    recs=recs,
)
df.head(3)

RP3betaRecommender: URM Detected 3685 (19.6%) users with no interactions.
RP3betaRecommender: URM Detected 18909 (43.2%) items with no interactions.
RP3betaRecommender: Loading model from file '../../dataset/urm/ner/small/algo/rp3beta'
RP3betaRecommender: Loading complete
UserKNNCFRecommender: URM Detected 3685 (19.6%) users with no interactions.
UserKNNCFRecommender: URM Detected 18909 (43.2%) items with no interactions.
UserKNNCFRecommender: Loading model from file '../../dataset/urm/ner/small/algo/userknn'
UserKNNCFRecommender: Loading complete


100%|██████████| 15143/15143 [04:23<00:00, 57.52it/s]


Memory usage of dataframe is 251.64 MB
Memory usage after optimization is: 251.64 MB
Decreased by 0.0%
Memory usage of dataframe is 84.49 MB
Memory usage after optimization is: 84.49 MB
Decreased by 0.0%


impression_id,user_id,article,sum_RP3betaRecommender_ner_scores,sum_UserKNNCFRecommender_ner_scores,max_RP3betaRecommender_ner_scores,max_UserKNNCFRecommender_ner_scores,mean_RP3betaRecommender_ner_scores,mean_UserKNNCFRecommender_ner_scores
u32,u32,i32,f32,f32,f32,f32,f32,f32
149474,139836,9778623,0.308894,0.167121,1.0,1.0,0.261372,0.14141
149474,139836,9778682,0.0,0.039055,0.0,0.268196,0.0,0.047734
149474,139836,9778669,0.0,0.062216,0.0,0.524904,0.0,0.114063


# Evaluate Models

In [None]:
from RecSys_Course_AT_PoliMi.Evaluation.Evaluator import EvaluatorHoldout

# Evaluation settings shared by the model cells below: the metric is read at
# this cutoff from the evaluator's result table.
cutoff = 10
metric = 'NDCG'
# Hold-out evaluation against the validation URM; exclude_seen is turned off.
evaluator = EvaluatorHoldout(URM_val, cutoff_list=[cutoff], exclude_seen=False)

In [None]:
from RecSys_Course_AT_PoliMi.Recommenders.MatrixFactorization.PureSVDRecommender import PureSVDRecommender

# Fit PureSVD on the training URM with hard-coded hyperparameters.
rec_instance = PureSVDRecommender(URM_train)
params = {'num_factors': 577}
rec_instance.fit(**params)
# Evaluate with the shared evaluator and display the chosen metric at the cutoff.
result_df, _ = evaluator.evaluateRecommender(rec_instance)
result_df.loc[cutoff][metric.upper()]

In [None]:
from RecSys_Course_AT_PoliMi.Recommenders.KNN.UserKNNCFRecommender import UserKNNCFRecommender

# Fit UserKNN on the training URM with hard-coded hyperparameters.
# NOTE(review): these values differ from the `userknnparams` dict loaded near
# the top of the notebook -- confirm which configuration is intended.
rec_instance = UserKNNCFRecommender(URM_train)
params = {'topK': 467, 'shrink': 95, 'similarity': 'dice', 'normalize': True}
rec_instance.fit(**params)
# Evaluate with the shared evaluator and display the chosen metric at the cutoff.
result_df, _ = evaluator.evaluateRecommender(rec_instance)
result_df.loc[cutoff][metric.upper()]

In [None]:
from polimi.utils._urm import _build_batch_ner_interactions, build_articles_with_processed_ner, build_user_id_mapping, build_ner_mapping
# Rebuild the article/NER table and both lookups, then build the NER
# interactions for the training history.
# NOTE(review): this reassigns `ap` without the `ner_clusters_index` column that
# an earlier cell adds; cells relying on that column must be re-run afterwards.
ap = build_articles_with_processed_ner(articles)
user_id_mapping = build_user_id_mapping(history_train.vstack(history_val))
ner_mapping = build_ner_mapping(ap)
# 'article_id_fixed' is presumably the history column holding the article ids -- confirm.
ner_interactions = _build_batch_ner_interactions(history_train, ap, user_id_mapping, ner_mapping, 'article_id_fixed')

In [38]:
import json

# Scratch check: round-trip a small dict through its JSON text form.
my_dict = {'name': 'John', 'age': 30, 'city': 'New York'}

# dict -> JSON string -> dict
dict_string = json.dumps(my_dict)
new_dict = json.loads(dict_string)

# Show both representations.
print("String representation of dictionary:", dict_string)
print("Dictionary from string:", new_dict)


String representation of dictionary: {"name": "John", "age": 30, "city": "New York"}
Dictionary from string: {'name': 'John', 'age': 30, 'city': 'New York'}
