In [292]:
import polars as pl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import polars as pl
import scipy.stats as stats
import scipy.sparse as sps
import gc
import tqdm
from polimi.utils._custom import save_sparse_csr

In [293]:
from os import system, getpid, walk
from psutil import Process
from colorama import Fore, Style, init
from IPython.display import display, HTML

def PrintColor(text:str, color = Fore.BLUE, style = Style.BRIGHT):
    print(style + color + text + Style.RESET_ALL)
    
def GetMemUsage():   
    pid = getpid()
    py = Process(pid)
    memory_use = py.memory_info()[0] / 2. ** 30
    return f"RAM memory GB usage = {memory_use :.4}"

PrintColor(f"\n" + GetMemUsage(), color = Fore.RED)

[1m[31m
RAM memory GB usage = 11.96[0m


In [294]:
def reduce_polars_df_memory_size(df, set_categorical=True):

    start_mem = df.estimated_size('mb')
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type in [pl.Int16, pl.Int32, pl.Int64]:
            c_min = df[col].fill_null(0).min()
            c_max = df[col].fill_null(0).max()
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df = df.with_columns(pl.col(col).cast(pl.Int8))
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df = df.with_columns(pl.col(col).cast(pl.Int16))
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df = df.with_columns(pl.col(col).cast(pl.Int32))
        elif col_type in [pl.UInt16, pl.UInt32, pl.UInt64]:
            c_min = df[col].fill_null(0).min()
            c_max = df[col].fill_null(0).max()
            if c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                df = df.with_columns(pl.col(col).cast(pl.UInt8))
            elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                df = df.with_columns(pl.col(col).cast(pl.UInt16))
            elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                df = df.with_columns(pl.col(col).cast(pl.UInt32))
        elif col_type == pl.Float64:
            c_min = df[col].fill_null(0).min()
            c_max = df[col].fill_null(0).max()
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df = df.with_columns(pl.col(col).cast(pl.Float32))

    gc.collect()
    end_mem = df.estimated_size('mb')
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [295]:
articles = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/articles.parquet')

behaviors_train = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/train/behaviors.parquet')
history_train = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/train/history.parquet')

behaviors_val = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/validation/behaviors.parquet')
history_val = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/validation/history.parquet')

contrastive_vector_2 = pl.read_parquet('/home/ubuntu/dataset/Ekstra_Bladet_contrastive_vector/contrastive_vector.parquet')
w_2_vec = pl.read_parquet('/home/ubuntu/dataset/Ekstra_Bladet_word2vec/document_vector.parquet')
roberta = pl.read_parquet('/home/ubuntu/dataset/FacebookAI_xlm_roberta_base/xlm_roberta_base.parquet')
google_bert = pl.read_parquet('/home/ubuntu/dataset/google_bert_base_multilingual_cased/bert_base_multilingual_cased.parquet')
distilbert = pl.read_parquet('/home/ubuntu/dataset/distilbert_title_embedding.parquet')
kenneth = pl.read_parquet('/home/ubuntu/dataset/kenneth_embedding.parquet')
emotions = pl.read_parquet('/home/ubuntu/dataset/emotions_embedding.parquet')

gc.collect()
PrintColor(f"\n" + GetMemUsage(), color = Fore.RED)

[1m[31m
RAM memory GB usage = 14.25[0m


In [296]:
articles_mapping = articles.select('article_id').with_row_index().rename({'index': 'article_index'})
articles_mapping

article_index,article_id
u32,i32
0,3001353
1,3003065
2,3012771
3,3023463
4,3032577
…,…
20733,9803492
20734,9803505
20735,9803525
20736,9803560


In [297]:
associations = {
    'contrastive_vector' : contrastive_vector_2,
    'document_vector': w_2_vec,
    'google-bert/bert-base-multilingual-cased': google_bert,
    'FacebookAI/xlm-roberta-base': roberta,
    'title_embedding': distilbert,
    'kenneth_title+subtitle': kenneth,
    'emotion_scores': emotions
    
    
}

In [298]:
name = 'FacebookAI/xlm-roberta-base'

In [299]:
ICM_dataframe = associations[name].join(articles, on='article_id').select(['article_id',name]).with_columns(
        pl.col(name).apply(lambda lst : list(range(len(lst)))).alias("indici")      
    )\
    .explode([name,'indici'])\
    .rename({'indici': 'feature_id'})\
    .join(articles_mapping, on='article_id')\
    .drop('article_id')
ICM_dataframe

  pl.col(name).apply(lambda lst : list(range(len(lst)))).alias("indici")
  ICM_dataframe = associations[name].join(articles, on='article_id').select(['article_id',name]).with_columns(


FacebookAI/xlm-roberta-base,feature_id,article_index
f32,i64,u32
0.082843,0,0
0.117304,1,0
0.077264,2,0
0.027662,3,0
0.081476,4,0
…,…,…
-0.017621,763,20737
0.103328,764,20737
-0.048548,765,20737
0.132479,766,20737


In [300]:
"""
contrastive_vector = contrastive_vector.select(
    pl.exclude('contrastive_vector'), 
    *[pl.col('contrastive_vector').list.get(x).alias(f"feature_{x}") 
          for x in range(n_features)]
)


contrastive_vector
"""

'\ncontrastive_vector = contrastive_vector.select(\n    pl.exclude(\'contrastive_vector\'), \n    *[pl.col(\'contrastive_vector\').list.get(x).alias(f"feature_{x}") \n          for x in range(n_features)]\n)\n\n\ncontrastive_vector\n'

In [301]:
n_articles = ICM_dataframe.select('article_index').n_unique()
print(f'n_articles:{n_articles}')
n_features = ICM_dataframe.select('feature_id').n_unique()
print(f'num_features: {n_features}')

n_articles:20738
num_features: 768


In [302]:
ICM = sps.csr_matrix((ICM_dataframe[name].to_numpy(), 
                          (ICM_dataframe["article_index"].to_numpy(), ICM_dataframe["feature_id"].to_numpy())),
                        shape = (n_articles, n_features))

ICM

<20738x768 sparse matrix of type '<class 'numpy.float32'>'
	with 15926784 stored elements in Compressed Sparse Row format>

In [303]:
save_sparse_csr(path=Path(f'/home/ubuntu/recsys2024/icm/recsys/small/{name}.npz'),array=ICM)

File saved at: /home/ubuntu/recsys2024/icm/recsys/small/FacebookAI/xlm-roberta-base.npz


In [304]:
from RecSys_Course_AT_PoliMi.Recommenders.KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from RecSys_Course_AT_PoliMi.Recommenders.KNN.UserKNNCBFRecommender import UserKNNCBFRecommender
from RecSys_Course_AT_PoliMi.Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from RecSys_Course_AT_PoliMi.Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
from RecSys_Course_AT_PoliMi.Recommenders.MatrixFactorization.PureSVDRecommender import PureSVDRecommender, PureSVDItemRecommender
from RecSys_Course_AT_PoliMi.Recommenders.KNN.ItemKNN_CFCBF_Hybrid_Recommender import ItemKNN_CFCBF_Hybrid_Recommender

In [305]:
from polimi.utils._custom import load_sparse_csr

URM_train = load_sparse_csr(Path('/home/ubuntu/recsys2024/urm/recsys/small/URM_train.npz'))
URM_validation_train = load_sparse_csr(Path('/home/ubuntu/recsys2024/urm/recsys/small/URM_validation_train.npz'))
URM_validation = load_sparse_csr(Path('/home/ubuntu/recsys2024/urm/recsys/small/URM_validation.npz'))
URM_validation_validation = load_sparse_csr(Path('/home/ubuntu/recsys2024/urm/recsys/small/URM_validation_validation.npz'))


File loaded at: /home/ubuntu/recsys2024/urm/recsys/small/URM_train.npz
File loaded at: /home/ubuntu/recsys2024/urm/recsys/small/URM_validation_train.npz
File loaded at: /home/ubuntu/recsys2024/urm/recsys/small/URM_validation.npz
File loaded at: /home/ubuntu/recsys2024/urm/recsys/small/URM_validation_validation.npz


In [306]:
ICM.shape

(20738, 768)

In [307]:
URM_train.shape

(18827, 20738)

In [308]:
knn_icm = ItemKNNCBFRecommender(URM_train=URM_train, ICM_train=ICM)
knn_icm.fit()

ItemKNNCBFRecommender: URM Detected 3684 (19.6%) users with no interactions.
ItemKNNCBFRecommender: URM Detected 11952 (57.6%) items with no interactions.


Compute_Similarity: detected dense matrix
Similarity column 20738 (100.0%), 310.48 column/sec. Elapsed time 1.11 min


In [309]:
from RecSys_Course_AT_PoliMi.Evaluation.Evaluator import EvaluatorHoldout
evaluator_validation = EvaluatorHoldout(URM_validation_train, cutoff_list=[100])

EvaluatorHoldout: Ignoring 3684 (19.6%) Users that have less than 1 test interactions


In [310]:
result_df, _ = evaluator_validation.evaluateRecommender(knn_icm)
result_df.loc[100]["NDCG"]

EvaluatorHoldout: Processed 15143 (100.0%) in 12.11 sec. Users per second: 1251


0.009612214178197213