In [8]:
import polars as pl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import polars as pl
import scipy.stats as stats
import scipy.sparse as sps
import gc
import tqdm

In [9]:
from os import system, getpid, walk
from psutil import Process
from colorama import Fore, Style, init
from IPython.display import display, HTML

def PrintColor(text:str, color = Fore.BLUE, style = Style.BRIGHT):
    print(style + color + text + Style.RESET_ALL)
    
def GetMemUsage():   
    pid = getpid()
    py = Process(pid)
    memory_use = py.memory_info()[0] / 2. ** 30
    return f"RAM memory GB usage = {memory_use :.4}"

PrintColor(f"\n" + GetMemUsage(), color = Fore.RED)

[1m[31m
RAM memory GB usage = 1.692[0m


In [10]:
def reduce_polars_df_memory_size(df, set_categorical=True):

    start_mem = df.estimated_size('mb')
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type in [pl.Int16, pl.Int32, pl.Int64]:
            c_min = df[col].fill_null(0).min()
            c_max = df[col].fill_null(0).max()
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df = df.with_columns(pl.col(col).cast(pl.Int8))
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df = df.with_columns(pl.col(col).cast(pl.Int16))
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df = df.with_columns(pl.col(col).cast(pl.Int32))
        elif col_type in [pl.UInt16, pl.UInt32, pl.UInt64]:
            c_min = df[col].fill_null(0).min()
            c_max = df[col].fill_null(0).max()
            if c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                df = df.with_columns(pl.col(col).cast(pl.UInt8))
            elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                df = df.with_columns(pl.col(col).cast(pl.UInt16))
            elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                df = df.with_columns(pl.col(col).cast(pl.UInt32))
        elif col_type == pl.Float64:
            c_min = df[col].fill_null(0).min()
            c_max = df[col].fill_null(0).max()
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df = df.with_columns(pl.col(col).cast(pl.Float32))

    gc.collect()
    end_mem = df.estimated_size('mb')
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [11]:
articles = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/articles.parquet')

behaviors_train = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/train/behaviors.parquet')
history_train = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/train/history.parquet')

behaviors_val = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/validation/behaviors.parquet')
history_val = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/validation/history.parquet')

contrastive_vector = pl.read_parquet('/home/ubuntu/dataset/Ekstra_Bladet_contrastive_vector/contrastive_vector.parquet')

gc.collect()
PrintColor(f"\n" + GetMemUsage(), color = Fore.RED)

[1m[31m
RAM memory GB usage = 2.033[0m


In [12]:
unique_entities = articles.select('entity_groups').explode('entity_groups')['entity_groups'].unique().to_list()
unique_entities = [e for e in unique_entities if e is not None]
unique_entities

['PROD', 'PER', 'EVENT', 'LOC', 'ORG', 'MISC']

In [13]:
train_ds = pl.read_parquet('/mnt/ebs_volume/recsys2024/preprocessing/small_ds/train/train_ds.parquet')
train_ds

impression_id,user_id,article,target,device_type,read_time,scroll_percentage,is_sso_user,gender,age,is_subscriber,postcode,trendiness_score_1d,trendiness_score_3d,trendiness_score_5d,weekday,hour,trendiness_score_1d/3d,trendiness_score_1d/5d,normalized_trendiness_score_overall,premium,category,sentiment_score,sentiment_label,num_images,title_len,subtitle_len,body_len,num_topics,total_pageviews,total_inviews,total_read_time,total_pageviews/inviews,article_type,article_delay_days,article_delay_hours,Entity_EVENT_Present,…,entropy_impression_endorsement_10h,entropy_impression_total_pageviews/inviews,entropy_impression_mean_JS,entropy_impression_mean_topic_model_cosine,entropy_impression_topics_cosine,entropy_impression_article_delay_hours,entropy_impression_total_pageviews,entropy_impression_total_inviews,entropy_impression_trendiness_score_category,entropy_impression_std_JS,entropy_impression_trendiness_score_5d,entropy_impression_total_read_time,trendiness_score_3d_minus_median_impression,endorsement_10h_minus_median_impression,total_pageviews/inviews_minus_median_impression,mean_JS_minus_median_impression,mean_topic_model_cosine_minus_median_impression,topics_cosine_minus_median_impression,article_delay_hours_minus_median_impression,total_pageviews_minus_median_impression,total_inviews_minus_median_impression,trendiness_score_category_minus_median_impression,std_JS_minus_median_impression,trendiness_score_5d_minus_median_impression,total_read_time_minus_median_impression,mean_JS_l_inf_user,std_JS_l_inf_user,mean_topic_model_cosine_l_inf_user,topics_cosine_l_inf_user,article_delay_hours_l_inf_article,mean_JS_l_inf_article,std_JS_l_inf_article,mean_topic_model_cosine_l_inf_article,topics_cosine_l_inf_article,category_diversity_impression,sentiment_label_diversity_impression,article_type_diversity_impression
u32,u32,i32,i8,i8,f32,f32,bool,i8,i8,bool,i8,i16,i16,i16,i8,i8,f32,f32,f32,bool,i16,f32,str,u32,u8,u8,u16,u32,i32,i32,f32,f32,str,i16,i32,bool,…,f64,f32,f32,f32,f32,f64,f64,f64,f64,f32,f64,f32,f64,f64,f32,f32,f32,f32,f64,f64,f64,f64,f32,f64,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,u32,u32,u32
149474,139836,9778728,0,2,13.0,,false,2,,false,5,150,521,836,3,7,0.287908,0.179426,0.880068,false,142,0.9654,"""Negative""",1,5,18,251,7,22415,220247,1.004828e6,0.101772,"""article_default""",0,0,true,…,1.622597,1.726481,1.747889,1.788942,1.689578,,1.613433,1.737743,1.612282,1.739272,1.676222,1.558951,281.5,-873.5,-0.111306,-0.005177,0.091665,-0.02667,-1.0,-62463.5,-165503.0,4.0,-0.033664,447.5,-4.981345e6,0.447581,0.403328,0.96685,0.36119,0.0,0.233531,0.335164,0.908604,0.301919,4,3,1
149474,139836,9778669,0,2,13.0,,false,2,,false,5,85,199,313,3,7,0.427136,0.271565,0.336149,false,118,0.9481,"""Negative""",1,5,11,150,4,74491,373488,4.365609e6,0.199447,"""article_default""",0,1,false,…,1.622597,1.726481,1.747889,1.788942,1.689578,,1.613433,1.737743,1.612282,1.739272,1.676222,1.558951,-40.5,179.5,-0.013631,0.003967,-0.050146,0.015762,0.0,-10387.5,-12262.0,0.0,0.028015,-75.5,-1.620564e6,0.501163,0.632488,0.797317,0.437299,0.043478,0.266627,0.511387,0.789201,0.319083,4,3,1
149474,139836,9778657,1,2,13.0,,false,2,,false,5,45,117,183,3,7,0.384615,0.245902,0.197635,false,118,0.8347,"""Neutral""",2,6,31,336,3,108389,478098,7.606737e6,0.226709,"""article_default""",0,1,false,…,1.622597,1.726481,1.747889,1.788942,1.689578,,1.613433,1.737743,1.612282,1.739272,1.676222,1.558951,-122.5,368.5,0.013631,0.05309,0.005468,0.218611,0.0,23510.5,92348.0,0.0,0.051035,-205.5,1.620564e6,0.78901,0.718014,0.863803,0.801144,0.066667,0.350947,0.730242,0.840012,0.604487,4,3,1
150528,143471,9778682,0,2,25.0,,false,2,,false,5,69,206,334,3,7,0.334951,0.206587,0.347973,false,498,0.9546,"""Negative""",1,5,20,267,3,143520,455723,9.298546e6,0.314928,"""article_default""",0,1,true,…,1.712983,2.137333,2.093172,2.19456,2.100994,,2.063931,2.158165,1.924178,2.126514,1.946778,2.008479,7.0,204.0,0.088219,0.013219,0.095761,-0.001332,1.0,48254.0,82235.0,-47.0,0.013249,21.0,4.223417e6,0.793928,0.732491,0.94067,0.581838,0.041667,0.299855,0.536061,0.891664,0.447169,6,3,1
150528,143471,9778669,0,2,25.0,,false,2,,false,5,85,199,313,3,7,0.427136,0.271565,0.336149,false,118,0.9481,"""Negative""",1,5,11,150,4,74491,373488,4.365609e6,0.199447,"""article_default""",0,1,false,…,1.712983,2.137333,2.093172,2.19456,2.100994,,2.063931,2.158165,1.924178,2.126514,1.946778,2.008479,0.0,598.0,-0.027262,0.025741,-0.061045,0.159455,1.0,-20775.0,0.0,62.0,0.007517,0.0,-709520.0,0.881599,0.709201,0.756751,0.871154,0.043478,0.392552,0.524339,0.763467,0.633637,6,3,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
580100695,2110744,9769917,0,1,5.0,100.0,false,2,,false,5,46,105,152,4,10,0.438095,0.302632,0.203883,true,140,0.989,"""Negative""",4,5,32,826,2,203222,2163455,1.2661448e7,0.093934,"""article_default""",0,17,false,…,1.570477,1.037743,1.826027,1.940373,1.748273,1.200948,1.060549,0.924248,1.757789,1.898659,1.762101,0.913896,-50.0,1404.0,-0.114787,0.002116,-0.103502,0.0,-4373.0,4017.0,1.209047e6,18.0,0.088883,-79.0,0.0,0.582369,0.927592,0.682434,0.472501,0.093407,0.120789,0.389978,0.696381,0.268713,4,3,2
580100695,2110744,9767697,1,1,5.0,100.0,false,2,,false,5,50,187,238,4,10,0.26738,0.210084,0.363107,false,118,0.9613,"""Negative""",5,7,2,982,3,199205,954408,2.595362e7,0.208721,"""article_default""",0,3,false,…,1.570477,1.037743,1.826027,1.940373,1.748273,1.200948,1.060549,0.924248,1.757789,1.898659,1.762101,0.913896,32.0,1339.0,0.0,0.035035,0.028725,0.237915,-4387.0,0.0,0.0,47.0,0.051003,7.0,1.3292172e7,0.80426,0.763092,0.836592,0.908091,0.017964,0.287404,0.373625,0.779936,0.597165,4,3,2
580100697,2110744,9770997,0,1,14.0,100.0,false,2,,false,5,32,78,136,4,10,0.410256,0.235294,0.151456,false,414,0.845,"""Positive""",1,5,18,164,4,110632,485698,5.034287e6,0.227779,"""article_default""",0,3,false,…,1.708204,1.083186,1.954657,2.070895,1.949265,1.455156,0.965362,0.915239,1.841282,2.033027,1.795838,0.937667,-47.5,1339.5,0.066013,-0.020815,0.014447,-0.035568,-4786.0,0.0,0.0,0.0,-0.032977,-64.5,0.0,0.162473,0.29199,0.836789,0.177689,0.018072,0.172841,0.384146,0.818583,0.204345,5,3,2
580100697,2110744,9514481,0,1,14.0,100.0,false,2,,false,5,8,37,49,4,10,0.216216,0.163265,0.071845,true,414,0.9501,"""Neutral""",7,9,30,371,3,,,,,"""article_standard_feature""",182,4390,false,…,1.708204,1.083186,1.954657,2.070895,1.949265,1.455156,0.965362,0.915239,1.841282,2.033027,1.795838,0.937667,-88.5,-363.5,,-0.021758,-0.126811,-0.071621,-399.0,,,0.0,-0.012502,-151.5,,0.156114,0.380906,0.672102,0.111681,0.963776,0.054301,0.239584,0.69207,0.069082,5,3,2


In [14]:
articles_mapping = articles.select('article_id').with_row_index().rename({'index': 'article_index'})

ICM_dataframe = contrastive_vector.join(articles, on='article_id').select(['article_id','contrastive_vector']).with_columns(
        pl.col("contrastive_vector").apply(lambda lst : list(range(len(lst)))).alias("indici")      
    )\
    .explode(['contrastive_vector','indici'])\
    .rename({'indici': 'feature_id'})\
    .join(articles_mapping, on='article_id')\
    .drop('article_id')
ICM_dataframe

  pl.col("contrastive_vector").apply(lambda lst : list(range(len(lst)))).alias("indici")
  ICM_dataframe = contrastive_vector.join(articles, on='article_id').select(['article_id','contrastive_vector']).with_columns(


contrastive_vector,feature_id,article_index
f32,i64,u32
0.014536,0,0
0.013818,1,0
0.01941,2,0
-0.071472,3,0
-0.012827,4,0
…,…,…
0.005754,763,20737
0.028954,764,20737
0.004138,765,20737
0.004545,766,20737


In [15]:
n_articles = ICM_dataframe.select('article_index').n_unique()
print(f'n_articles:{n_articles}')
n_features = ICM_dataframe.select('feature_id').n_unique()
print(f'num_features: {n_features}')

n_articles:20738
num_features: 768


In [16]:
ICM = sps.csr_matrix((ICM_dataframe["contrastive_vector"].to_numpy(), 
                          (ICM_dataframe["article_index"].to_numpy(), ICM_dataframe["feature_id"].to_numpy())),
                        shape = (n_articles, n_features))

ICM

<20738x768 sparse matrix of type '<class 'numpy.float32'>'
	with 15926784 stored elements in Compressed Sparse Row format>

In [17]:
from RecSys_Course_AT_PoliMi.Recommenders.KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender

In [18]:
from polimi.utils._urm import build_recsys_features
from polimi.utils._custom import load_recommenders,load_sparse_csr

URM_train = load_sparse_csr(Path('/home/ubuntu/recsys2024/urm/recsys/small/URM_train.npz'))

knn_icm = ItemKNNCBFRecommender(URM_train=URM_train, ICM_train=ICM)
knn_icm.fit()

recs = load_recommenders(URM_train, Path('/home/ubuntu/recsys2024/algo/recsys/small/train'))

recs.append(knn_icm)

recsys_features = build_recsys_features(history=history_train.vstack(history_val),behaviors=behaviors_train,articles=articles,recs=recs)

2024-05-20 09:53:42.623573: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


File loaded at: /home/ubuntu/recsys2024/urm/recsys/small/URM_train.npz
ItemKNNCBFRecommender: URM Detected 3684 (19.6%) users with no interactions.
ItemKNNCBFRecommender: URM Detected 11952 (57.6%) items with no interactions.
Compute_Similarity: detected dense matrix
Similarity column 20738 (100.0%), 215.12 column/sec. Elapsed time 1.61 min
SLIM_BPR_Cython: URM Detected 3684 (19.6%) users with no interactions.
SLIM_BPR_Cython: URM Detected 11952 (57.6%) items with no interactions.
SLIM_BPR_Cython: Loading model from file '/home/ubuntu/recsys2024/algo/recsys/small/train/SLIM_BPR_Cython'
SLIM_BPR_Cython: Loading complete
ItemKNNCFRecommender: URM Detected 3684 (19.6%) users with no interactions.
ItemKNNCFRecommender: URM Detected 11952 (57.6%) items with no interactions.
ItemKNNCFRecommender: Loading model from file '/home/ubuntu/recsys2024/algo/recsys/small/train/ItemKNNCFRecommender'
ItemKNNCFRecommender: Loading complete
PureSVDItemRecommender: URM Detected 3684 (19.6%) users with no 

In [19]:
recsys_features

impression_id,article,user_id,SLIM_BPR_Cython,ItemKNNCFRecommender,PureSVDItemRecommender,PureSVDRecommender,RP3betaRecommender,ItemKNNCBFRecommender
u32,i32,u32,f32,f32,f32,f32,f32,f32
556008624,9193263,10068,0.0,0.0,0.00011,0.000007,0.0,0.0
556008624,9695098,10068,0.0,0.002663,0.009685,0.008015,0.003995,0.0
556008624,9758734,10068,0.0,0.0,0.003549,0.000875,0.006811,0.0
385198819,9762352,10068,0.0,0.005326,0.02905,0.029936,0.006742,0.0
556008624,9772548,10068,0.0,0.0,0.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…
567760515,9779705,2590637,0.0,0.0,0.0,0.0,0.0,0.0
567760513,9779705,2590637,0.0,0.0,0.0,0.0,0.0,0.0
567760515,9779723,2590637,0.0,0.0,0.0,0.0,0.0,0.0
567760513,9779723,2590637,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
recsys_features.select('ItemKNNCFRecommender').filter(pl.col('ItemKNNCFRecommender')!=0).shape[0]/recsys_features.select('ItemKNNCFRecommender').shape[0]

0.11858932834496182

In [83]:
#recsys_features = pl.read_parquet('/mnt/ebs_volume_2/recsys2024/features/recsys/small/train/recsys_scores_features.parquet')
#recsys_features

In [31]:
train_ds = train_ds.join(recsys_features, on=['impression_id', 'article','user_id'], how= 'left')
train_ds

impression_id,user_id,article,target,device_type,read_time,scroll_percentage,is_sso_user,gender,age,is_subscriber,postcode,trendiness_score_1d,trendiness_score_3d,trendiness_score_5d,weekday,hour,trendiness_score_1d/3d,trendiness_score_1d/5d,normalized_trendiness_score_overall,premium,category,sentiment_score,sentiment_label,num_images,title_len,subtitle_len,body_len,num_topics,total_pageviews,total_inviews,total_read_time,total_pageviews/inviews,article_type,article_delay_days,article_delay_hours,Entity_EVENT_Present,…,entropy_impression_total_pageviews,entropy_impression_total_inviews,entropy_impression_trendiness_score_category,entropy_impression_std_JS,entropy_impression_trendiness_score_5d,entropy_impression_total_read_time,trendiness_score_3d_minus_median_impression,endorsement_10h_minus_median_impression,total_pageviews/inviews_minus_median_impression,mean_JS_minus_median_impression,mean_topic_model_cosine_minus_median_impression,topics_cosine_minus_median_impression,article_delay_hours_minus_median_impression,total_pageviews_minus_median_impression,total_inviews_minus_median_impression,trendiness_score_category_minus_median_impression,std_JS_minus_median_impression,trendiness_score_5d_minus_median_impression,total_read_time_minus_median_impression,mean_JS_l_inf_user,std_JS_l_inf_user,mean_topic_model_cosine_l_inf_user,topics_cosine_l_inf_user,article_delay_hours_l_inf_article,mean_JS_l_inf_article,std_JS_l_inf_article,mean_topic_model_cosine_l_inf_article,topics_cosine_l_inf_article,category_diversity_impression,sentiment_label_diversity_impression,article_type_diversity_impression,SLIM_BPR_Cython,ItemKNNCFRecommender,PureSVDItemRecommender,PureSVDRecommender,RP3betaRecommender,ItemKNNCBFRecommender
u32,u32,i32,i8,i8,f32,f32,bool,i8,i8,bool,i8,i16,i16,i16,i8,i8,f32,f32,f32,bool,i16,f32,str,u32,u8,u8,u16,u32,i32,i32,f32,f32,str,i16,i32,bool,…,f64,f64,f64,f32,f64,f32,f64,f64,f32,f32,f32,f32,f64,f64,f64,f64,f32,f64,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,u32,u32,u32,f32,f32,f32,f32,f32,f32
149474,139836,9778728,0,2,13.0,,false,2,,false,5,150,521,836,3,7,0.287908,0.179426,0.880068,false,142,0.9654,"""Negative""",1,5,18,251,7,22415,220247,1.004828e6,0.101772,"""article_default""",0,0,true,…,1.613433,1.737743,1.612282,1.739272,1.676222,1.558951,281.5,-873.5,-0.111306,-0.005177,0.091665,-0.02667,-1.0,-62463.5,-165503.0,4.0,-0.033664,447.5,-4.981345e6,0.447581,0.403328,0.96685,0.36119,0.0,0.233531,0.335164,0.908604,0.301919,4,3,1,0.0,0.0,0.0,0.0,0.0,0.0
149474,139836,9778669,0,2,13.0,,false,2,,false,5,85,199,313,3,7,0.427136,0.271565,0.336149,false,118,0.9481,"""Negative""",1,5,11,150,4,74491,373488,4.365609e6,0.199447,"""article_default""",0,1,false,…,1.613433,1.737743,1.612282,1.739272,1.676222,1.558951,-40.5,179.5,-0.013631,0.003967,-0.050146,0.015762,0.0,-10387.5,-12262.0,0.0,0.028015,-75.5,-1.620564e6,0.501163,0.632488,0.797317,0.437299,0.043478,0.266627,0.511387,0.789201,0.319083,4,3,1,0.0,0.0,0.0,0.0,0.0,0.0
149474,139836,9778657,1,2,13.0,,false,2,,false,5,45,117,183,3,7,0.384615,0.245902,0.197635,false,118,0.8347,"""Neutral""",2,6,31,336,3,108389,478098,7.606737e6,0.226709,"""article_default""",0,1,false,…,1.613433,1.737743,1.612282,1.739272,1.676222,1.558951,-122.5,368.5,0.013631,0.05309,0.005468,0.218611,0.0,23510.5,92348.0,0.0,0.051035,-205.5,1.620564e6,0.78901,0.718014,0.863803,0.801144,0.066667,0.350947,0.730242,0.840012,0.604487,4,3,1,0.0,0.0,0.0,0.0,0.0,0.0
150528,143471,9778682,0,2,25.0,,false,2,,false,5,69,206,334,3,7,0.334951,0.206587,0.347973,false,498,0.9546,"""Negative""",1,5,20,267,3,143520,455723,9.298546e6,0.314928,"""article_default""",0,1,true,…,2.063931,2.158165,1.924178,2.126514,1.946778,2.008479,7.0,204.0,0.088219,0.013219,0.095761,-0.001332,1.0,48254.0,82235.0,-47.0,0.013249,21.0,4.223417e6,0.793928,0.732491,0.94067,0.581838,0.041667,0.299855,0.536061,0.891664,0.447169,6,3,1,0.0,0.0,0.0,0.0,0.0,0.0
150528,143471,9778669,0,2,25.0,,false,2,,false,5,85,199,313,3,7,0.427136,0.271565,0.336149,false,118,0.9481,"""Negative""",1,5,11,150,4,74491,373488,4.365609e6,0.199447,"""article_default""",0,1,false,…,2.063931,2.158165,1.924178,2.126514,1.946778,2.008479,0.0,598.0,-0.027262,0.025741,-0.061045,0.159455,1.0,-20775.0,0.0,62.0,0.007517,0.0,-709520.0,0.881599,0.709201,0.756751,0.871154,0.043478,0.392552,0.524339,0.763467,0.633637,6,3,1,0.0,0.0,0.0,0.0,0.0,0.024638
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
580100695,2110744,9769917,0,1,5.0,100.0,false,2,,false,5,46,105,152,4,10,0.438095,0.302632,0.203883,true,140,0.989,"""Negative""",4,5,32,826,2,203222,2163455,1.2661448e7,0.093934,"""article_default""",0,17,false,…,1.060549,0.924248,1.757789,1.898659,1.762101,0.913896,-50.0,1404.0,-0.114787,0.002116,-0.103502,0.0,-4373.0,4017.0,1.209047e6,18.0,0.088883,-79.0,0.0,0.582369,0.927592,0.682434,0.472501,0.093407,0.120789,0.389978,0.696381,0.268713,4,3,2,0.085755,0.038615,0.559645,0.315015,0.052498,0.018076
580100695,2110744,9767697,1,1,5.0,100.0,false,2,,false,5,50,187,238,4,10,0.26738,0.210084,0.363107,false,118,0.9613,"""Negative""",5,7,2,982,3,199205,954408,2.595362e7,0.208721,"""article_default""",0,3,false,…,1.060549,0.924248,1.757789,1.898659,1.762101,0.913896,32.0,1339.0,0.0,0.035035,0.028725,0.237915,-4387.0,0.0,0.0,47.0,0.051003,7.0,1.3292172e7,0.80426,0.763092,0.836592,0.908091,0.017964,0.287404,0.373625,0.779936,0.597165,4,3,2,0.0,0.019973,0.195145,0.096154,0.039106,0.008761
580100697,2110744,9770997,0,1,14.0,100.0,false,2,,false,5,32,78,136,4,10,0.410256,0.235294,0.151456,false,414,0.845,"""Positive""",1,5,18,164,4,110632,485698,5.034287e6,0.227779,"""article_default""",0,3,false,…,0.965362,0.915239,1.841282,2.033027,1.795838,0.937667,-47.5,1339.5,0.066013,-0.020815,0.014447,-0.035568,-4786.0,0.0,0.0,0.0,-0.032977,-64.5,0.0,0.162473,0.29199,0.836789,0.177689,0.018072,0.172841,0.384146,0.818583,0.204345,5,3,2,0.0,0.0,0.0,0.0,0.0,0.00639
580100697,2110744,9514481,0,1,14.0,100.0,false,2,,false,5,8,37,49,4,10,0.216216,0.163265,0.071845,true,414,0.9501,"""Neutral""",7,9,30,371,3,,,,,"""article_standard_feature""",182,4390,false,…,0.965362,0.915239,1.841282,2.033027,1.795838,0.937667,-88.5,-363.5,,-0.021758,-0.126811,-0.071621,-399.0,,,0.0,-0.012502,-151.5,,0.156114,0.380906,0.672102,0.111681,0.963776,0.054301,0.239584,0.69207,0.069082,5,3,2,0.0,0.007989,0.05109,0.045008,0.001339,0.0


In [32]:
from catboost import CatBoostClassifier


## Classifier


In [33]:
train_ds = train_ds.drop(['impression_id', 'article', 'user_id']).to_pandas()

categorical_columns = ['device_type', 'is_sso_user', 'gender', 'is_subscriber', 'weekday',
                       'premium', 'category', 'sentiment_label', 'is_new_article', 'is_already_seen_article',
                       'MostFrequentCategory', 'MostFrequentWeekday', 'IsFavouriteCategory',
                       'article_type', 'postcode']
categorical_columns += [f'Entity_{entity}_Present' for entity in unique_entities]
train_ds[categorical_columns] = train_ds[categorical_columns].astype('category')

X = train_ds.drop(columns=['target'])
y = train_ds['target']



In [34]:

# just a simple model to be able to run an evaluation
model = CatBoostClassifier(cat_features=categorical_columns, iterations=1000, rsm=0.7, subsample=0.5)

model.fit(X, y, verbose=25)

gc.collect()
PrintColor(f"\n" + GetMemUsage(), color = Fore.RED)


Learning rate set to 0.169254
0:	learn: 0.6469005	total: 301ms	remaining: 5m
25:	learn: 0.5172412	total: 6.88s	remaining: 4m 17s
50:	learn: 0.5003876	total: 13.4s	remaining: 4m 9s
75:	learn: 0.4901748	total: 19.9s	remaining: 4m 2s
100:	learn: 0.4820436	total: 26.3s	remaining: 3m 53s
125:	learn: 0.4765027	total: 32.5s	remaining: 3m 45s
150:	learn: 0.4715106	total: 38.8s	remaining: 3m 37s
175:	learn: 0.4674247	total: 45.1s	remaining: 3m 31s
200:	learn: 0.4635423	total: 51.8s	remaining: 3m 25s
225:	learn: 0.4603533	total: 58.2s	remaining: 3m 19s
250:	learn: 0.4570389	total: 1m 4s	remaining: 3m 12s
275:	learn: 0.4542094	total: 1m 11s	remaining: 3m 6s
300:	learn: 0.4517399	total: 1m 17s	remaining: 3m
325:	learn: 0.4493938	total: 1m 24s	remaining: 2m 54s
350:	learn: 0.4471192	total: 1m 30s	remaining: 2m 47s
375:	learn: 0.4450307	total: 1m 37s	remaining: 2m 41s
400:	learn: 0.4430091	total: 1m 43s	remaining: 2m 34s
425:	learn: 0.4411070	total: 1m 49s	remaining: 2m 28s
450:	learn: 0.4395243	tot

## Ranker


In [89]:
"""train_ds = train_ds.drop(['article', 'user_id']).to_pandas().sort_values(by='impression_id')
groups = train_ds['impression_id'].copy()

categorical_columns = ['device_type', 'is_sso_user', 'gender', 'is_subscriber', 'weekday',
                       'premium', 'category', 'sentiment_label', 'is_new_article', 'is_already_seen_article',
                       'MostFrequentCategory', 'MostFrequentWeekday', 'IsFavouriteCategory',
                       'article_type', 'postcode']
categorical_columns += [f'Entity_{entity}_Present' for entity in unique_entities]
train_ds[categorical_columns] = train_ds[categorical_columns].astype('category')

X = train_ds.drop(columns=['target', 'impression_id'])
y = train_ds['target']"""

"train_ds = train_ds.drop(['article', 'user_id']).to_pandas().sort_values(by='impression_id')\ngroups = train_ds['impression_id'].copy()\n\ncategorical_columns = ['device_type', 'is_sso_user', 'gender', 'is_subscriber', 'weekday',\n                       'premium', 'category', 'sentiment_label', 'is_new_article', 'is_already_seen_article',\n                       'MostFrequentCategory', 'MostFrequentWeekday', 'IsFavouriteCategory',\n                       'article_type', 'postcode']\ncategorical_columns += [f'Entity_{entity}_Present' for entity in unique_entities]\ntrain_ds[categorical_columns] = train_ds[categorical_columns].astype('category')\n\nX = train_ds.drop(columns=['target', 'impression_id'])\ny = train_ds['target']"

In [90]:

"""model = CatBoostRanker(cat_features=categorical_columns, iterations=2000, depth=8, colsample_bylevel=0.5)

model.fit(X, y, group_id=groups, verbose=25)

gc.collect()
PrintColor(f"\n" + GetMemUsage(), color = Fore.RED)"""


'model = CatBoostRanker(cat_features=categorical_columns, iterations=2000, depth=8, colsample_bylevel=0.5)\n\nmodel.fit(X, y, group_id=groups, verbose=25)\n\ngc.collect()\nPrintColor(f"\n" + GetMemUsage(), color = Fore.RED)'

In [37]:
val_ds = pl.read_parquet('/mnt/ebs_volume/recsys2024/preprocessing/small_ds/validation/val_ds.parquet')
val_ds

impression_id,article,user_id,target,device_type,read_time,scroll_percentage,is_sso_user,gender,age,is_subscriber,postcode,trendiness_score_1d,trendiness_score_3d,trendiness_score_5d,weekday,hour,trendiness_score_1d/3d,trendiness_score_1d/5d,normalized_trendiness_score_overall,premium,category,sentiment_score,sentiment_label,num_images,title_len,subtitle_len,body_len,num_topics,total_pageviews,total_inviews,total_read_time,total_pageviews/inviews,article_type,article_delay_days,article_delay_hours,Entity_EVENT_Present,…,entropy_impression_endorsement_10h,entropy_impression_total_pageviews/inviews,entropy_impression_mean_JS,entropy_impression_mean_topic_model_cosine,entropy_impression_topics_cosine,entropy_impression_article_delay_hours,entropy_impression_total_pageviews,entropy_impression_total_inviews,entropy_impression_trendiness_score_category,entropy_impression_std_JS,entropy_impression_trendiness_score_5d,entropy_impression_total_read_time,trendiness_score_3d_minus_median_impression,endorsement_10h_minus_median_impression,total_pageviews/inviews_minus_median_impression,mean_JS_minus_median_impression,mean_topic_model_cosine_minus_median_impression,topics_cosine_minus_median_impression,article_delay_hours_minus_median_impression,total_pageviews_minus_median_impression,total_inviews_minus_median_impression,trendiness_score_category_minus_median_impression,std_JS_minus_median_impression,trendiness_score_5d_minus_median_impression,total_read_time_minus_median_impression,mean_JS_l_inf_user,std_JS_l_inf_user,mean_topic_model_cosine_l_inf_user,topics_cosine_l_inf_user,article_delay_hours_l_inf_article,mean_JS_l_inf_article,std_JS_l_inf_article,mean_topic_model_cosine_l_inf_article,topics_cosine_l_inf_article,category_diversity_impression,sentiment_label_diversity_impression,article_type_diversity_impression
u32,i32,u32,i8,i8,f32,f32,bool,i8,i8,bool,i8,i16,i16,i16,i8,i8,f32,f32,f32,bool,i16,f32,str,u32,u8,u8,u16,u32,i32,i32,f32,f32,str,i16,i32,bool,…,f64,f32,f32,f32,f32,f64,f64,f64,f64,f32,f64,f32,f64,f64,f32,f32,f32,f32,f64,f64,f64,f64,f32,f64,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,u32,u32,u32
205373625,9785030,1801226,0,2,251.0,,true,0,,true,5,30,80,162,7,17,0.375,0.185185,0.146789,false,414,0.9167,"""Positive""",3,6,31,426,3,37391,349453,2.850225e6,0.106999,"""article_default""",0,2,false,…,3.613138,3.589502,3.595951,3.758065,3.631038,,3.447933,3.399587,3.603802,3.672916,3.567628,3.488753,-139.0,69.0,-0.021321,-0.059766,0.077358,-0.207508,-3.0,-15276.5,-34312.5,-67.0,-0.035288,-253.0,-770304.0,0.291109,0.442209,0.972551,0.25217,0.025974,0.253,0.421015,0.879186,0.272875,8,3,3
456698625,9553264,1757180,0,2,34.0,,false,2,,false,5,55,105,190,3,5,0.52381,0.289474,0.173267,true,457,0.8349,"""Neutral""",1,6,20,718,3,,,,,"""article_default""",162,3908,false,…,2.414501,2.598626,2.587271,2.703429,2.621012,,2.487291,2.576228,2.518426,2.683312,2.546191,2.466265,-80.0,-377.0,,-0.007509,-0.09586,-0.077113,3906.0,,,-96.0,0.000809,-89.0,,0.324542,0.586527,0.720961,0.291917,0.993644,0.10526,0.269075,0.808239,0.179034,5,3,1
46558736,7213923,1885995,0,2,25.0,,false,2,,false,5,26,64,94,1,6,0.40625,0.276596,0.105263,true,565,0.8109,"""Neutral""",5,10,42,1048,4,,,,,"""article_default""",1766,42385,false,…,2.5383,2.609011,2.512937,2.704911,2.545427,,2.441637,2.554746,2.483112,2.606141,2.522325,2.387892,-98.0,-1373.0,,-0.074603,0.002056,-0.228412,42383.0,,,-95.0,-0.09143,-217.0,,0.093726,0.196071,0.89184,0.124086,0.998281,0.045365,0.194879,0.848475,0.144924,6,3,1
470321755,9504105,594528,0,2,0.0,100.0,false,2,,false,5,84,249,393,2,19,0.337349,0.21374,0.394612,true,142,0.8874,"""Positive""",5,7,44,836,2,,,,,"""article_default""",203,4895,true,…,3.224331,3.149038,,3.427022,,,3.018402,2.828171,3.191193,,3.153275,3.059159,121.0,-737.0,,0.20034,0.079726,0.402638,4892.0,,,27.0,0.075762,185.0,,0.495476,0.434085,0.963454,0.572476,0.992699,0.629829,0.499302,0.864027,0.71809,8,3,2
214849994,9279095,811018,0,1,12.0,,false,2,,false,5,10,48,73,7,15,0.208333,0.136986,0.088073,true,414,0.8922,"""Neutral""",15,5,34,1358,2,,,,,"""article_standard_feature""",361,8667,false,…,1.97702,1.734625,2.122262,2.298575,2.136155,1.303762,1.647051,1.696444,1.864146,2.231121,2.028662,1.666511,-101.0,-976.0,,-0.008241,0.009916,0.112593,8644.5,,,9.0,-0.003612,-183.0,,0.330884,0.328687,0.903213,0.480696,0.993125,0.176142,0.323765,0.878249,0.406838,5,3,2
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
184283316,9526715,2530683,0,2,25.0,,false,2,,false,5,16,46,109,7,19,0.347826,0.146789,0.084404,true,457,0.9216,"""Positive""",8,6,28,846,4,,,,,"""article_default""",179,4306,true,…,2.579236,2.549223,2.392785,2.769034,2.455192,,2.438128,2.585122,2.445014,2.563066,2.516468,2.484598,-159.0,851.5,,-0.068041,-0.010979,-0.202956,4305.0,,,-66.0,-0.079315,-240.5,,0.062263,0.154296,0.855596,0.082469,0.985806,0.117577,0.20303,0.811139,0.162312,7,3,1
90104855,9538004,2188571,0,1,88.0,,false,2,,false,5,61,190,344,6,8,0.321053,0.177326,0.368217,true,140,0.6134,"""Neutral""",11,6,24,1112,5,,,,,"""article_standard_feature""",145,3503,false,…,3.178899,3.238027,3.56544,3.683996,3.585115,1.93449,3.180917,3.299404,3.425413,3.607439,3.433167,3.230481,72.5,145.0,,0.009155,0.013953,0.037341,3471.0,,,0.0,0.006474,154.5,,0.597854,0.470919,0.913364,0.488181,0.973867,0.403892,0.66911,0.871372,0.498279,8,3,3
257661167,9268227,2577740,0,2,83.0,100.0,true,0,50,false,5,45,155,290,6,17,0.290323,0.155172,0.300388,true,140,0.9946,"""Negative""",5,6,25,1102,5,,,,,"""article_default""",361,8676,false,…,2.615567,2.698224,2.763518,2.885952,2.800743,,2.487511,2.639263,2.779887,2.778692,2.726676,2.495079,-4.0,-113.5,,-0.002833,-0.015208,-0.008086,8675.0,,,-54.0,-0.040868,17.0,,0.617845,0.433314,0.831347,0.59966,0.987592,0.329419,0.643728,0.798818,0.439408,5,3,2
379754148,9786230,1567717,0,1,110.0,100.0,false,2,,false,5,69,250,413,1,11,0.276,0.16707,0.411184,false,142,0.9891,"""Negative""",2,6,14,263,4,62719,288112,2.380866e6,0.21769,"""article_default""",0,3,false,…,0.693936,1.919736,1.902583,1.944614,1.896908,,1.818739,1.884916,1.858519,1.90391,1.926037,1.836109,-48.0,1589.0,0.068483,-0.043218,0.0,-0.088774,0.0,4351.0,-26080.0,0.0,-0.014528,-71.0,-600245.0,0.522058,0.630632,0.892089,0.611012,0.058824,0.387504,0.446642,0.860705,0.584431,3,2,1


In [38]:
URM_validation = load_sparse_csr(Path('/home/ubuntu/recsys2024/urm/recsys/small/URM_validation.npz'))

knn_icm = ItemKNNCBFRecommender(URM_train=URM_validation, ICM_train=ICM)
knn_icm.fit()

recs = load_recommenders(URM_validation, Path('/home/ubuntu/recsys2024/algo/recsys/small/validation'))

recs.append(knn_icm)

recsys_features = build_recsys_features(history=history_train.vstack(history_val),behaviors=behaviors_val,articles=articles,recs=recs)

File loaded at: /home/ubuntu/recsys2024/urm/recsys/small/URM_validation.npz
ItemKNNCBFRecommender: URM Detected 3485 (18.5%) users with no interactions.
ItemKNNCBFRecommender: URM Detected 12029 (58.0%) items with no interactions.
Compute_Similarity: detected dense matrix
Similarity column 20738 (100.0%), 225.71 column/sec. Elapsed time 1.53 min
SLIM_BPR_Cython: URM Detected 3485 (18.5%) users with no interactions.
SLIM_BPR_Cython: URM Detected 12029 (58.0%) items with no interactions.
SLIM_BPR_Cython: Loading model from file '/home/ubuntu/recsys2024/algo/recsys/small/validation/SLIM_BPR_Cython'
SLIM_BPR_Cython: Loading complete
ItemKNNCFRecommender: URM Detected 3485 (18.5%) users with no interactions.
ItemKNNCFRecommender: URM Detected 12029 (58.0%) items with no interactions.
ItemKNNCFRecommender: Loading model from file '/home/ubuntu/recsys2024/algo/recsys/small/validation/ItemKNNCFRecommender'
ItemKNNCFRecommender: Loading complete
PureSVDItemRecommender: URM Detected 3485 (18.5%)

In [93]:
#recsys_features = pl.read_parquet('/mnt/ebs_volume_2/recsys2024/features/recsys/small/validation/recsys_scores_features.parquet')
#recsys_features

In [39]:
val_ds = val_ds.join(recsys_features, on=['article','impression_id','user_id'], how='left')
val_ds

impression_id,article,user_id,target,device_type,read_time,scroll_percentage,is_sso_user,gender,age,is_subscriber,postcode,trendiness_score_1d,trendiness_score_3d,trendiness_score_5d,weekday,hour,trendiness_score_1d/3d,trendiness_score_1d/5d,normalized_trendiness_score_overall,premium,category,sentiment_score,sentiment_label,num_images,title_len,subtitle_len,body_len,num_topics,total_pageviews,total_inviews,total_read_time,total_pageviews/inviews,article_type,article_delay_days,article_delay_hours,Entity_EVENT_Present,…,entropy_impression_total_pageviews,entropy_impression_total_inviews,entropy_impression_trendiness_score_category,entropy_impression_std_JS,entropy_impression_trendiness_score_5d,entropy_impression_total_read_time,trendiness_score_3d_minus_median_impression,endorsement_10h_minus_median_impression,total_pageviews/inviews_minus_median_impression,mean_JS_minus_median_impression,mean_topic_model_cosine_minus_median_impression,topics_cosine_minus_median_impression,article_delay_hours_minus_median_impression,total_pageviews_minus_median_impression,total_inviews_minus_median_impression,trendiness_score_category_minus_median_impression,std_JS_minus_median_impression,trendiness_score_5d_minus_median_impression,total_read_time_minus_median_impression,mean_JS_l_inf_user,std_JS_l_inf_user,mean_topic_model_cosine_l_inf_user,topics_cosine_l_inf_user,article_delay_hours_l_inf_article,mean_JS_l_inf_article,std_JS_l_inf_article,mean_topic_model_cosine_l_inf_article,topics_cosine_l_inf_article,category_diversity_impression,sentiment_label_diversity_impression,article_type_diversity_impression,SLIM_BPR_Cython,ItemKNNCFRecommender,PureSVDItemRecommender,PureSVDRecommender,RP3betaRecommender,ItemKNNCBFRecommender
u32,i32,u32,i8,i8,f32,f32,bool,i8,i8,bool,i8,i16,i16,i16,i8,i8,f32,f32,f32,bool,i16,f32,str,u32,u8,u8,u16,u32,i32,i32,f32,f32,str,i16,i32,bool,…,f64,f64,f64,f32,f64,f32,f64,f64,f32,f32,f32,f32,f64,f64,f64,f64,f32,f64,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,u32,u32,u32,f32,f32,f32,f32,f32,f32
205373625,9785030,1801226,0,2,251.0,,true,0,,true,5,30,80,162,7,17,0.375,0.185185,0.146789,false,414,0.9167,"""Positive""",3,6,31,426,3,37391,349453,2.850225e6,0.106999,"""article_default""",0,2,false,…,3.447933,3.399587,3.603802,3.672916,3.567628,3.488753,-139.0,69.0,-0.021321,-0.059766,0.077358,-0.207508,-3.0,-15276.5,-34312.5,-67.0,-0.035288,-253.0,-770304.0,0.291109,0.442209,0.972551,0.25217,0.025974,0.253,0.421015,0.879186,0.272875,8,3,3,0.0,0.0,0.0,0.0,0.0,0.007636
456698625,9553264,1757180,0,2,34.0,,false,2,,false,5,55,105,190,3,5,0.52381,0.289474,0.173267,true,457,0.8349,"""Neutral""",1,6,20,718,3,,,,,"""article_default""",162,3908,false,…,2.487291,2.576228,2.518426,2.683312,2.546191,2.466265,-80.0,-377.0,,-0.007509,-0.09586,-0.077113,3906.0,,,-96.0,0.000809,-89.0,,0.324542,0.586527,0.720961,0.291917,0.993644,0.10526,0.269075,0.808239,0.179034,5,3,1,0.0,0.0,0.000602,-0.000296,0.0,0.007901
46558736,7213923,1885995,0,2,25.0,,false,2,,false,5,26,64,94,1,6,0.40625,0.276596,0.105263,true,565,0.8109,"""Neutral""",5,10,42,1048,4,,,,,"""article_default""",1766,42385,false,…,2.441637,2.554746,2.483112,2.606141,2.522325,2.387892,-98.0,-1373.0,,-0.074603,0.002056,-0.228412,42383.0,,,-95.0,-0.09143,-217.0,,0.093726,0.196071,0.89184,0.124086,0.998281,0.045365,0.194879,0.848475,0.144924,6,3,1,0.015117,0.023968,0.157036,0.05609,0.014369,0.0
470321755,9504105,594528,0,2,0.0,100.0,false,2,,false,5,84,249,393,2,19,0.337349,0.21374,0.394612,true,142,0.8874,"""Positive""",5,7,44,836,2,,,,,"""article_default""",203,4895,true,…,3.018402,2.828171,3.191193,,3.153275,3.059159,121.0,-737.0,,0.20034,0.079726,0.402638,4892.0,,,27.0,0.075762,185.0,,0.495476,0.434085,0.963454,0.572476,0.992699,0.629829,0.499302,0.864027,0.71809,8,3,2,0.0,0.0,0.000044,-0.000059,0.0,0.0
214849994,9279095,811018,0,1,12.0,,false,2,,false,5,10,48,73,7,15,0.208333,0.136986,0.088073,true,414,0.8922,"""Neutral""",15,5,34,1358,2,,,,,"""article_standard_feature""",361,8667,false,…,1.647051,1.696444,1.864146,2.231121,2.028662,1.666511,-101.0,-976.0,,-0.008241,0.009916,0.112593,8644.5,,,9.0,-0.003612,-183.0,,0.330884,0.328687,0.903213,0.480696,0.993125,0.176142,0.323765,0.878249,0.406838,5,3,2,0.0,0.0,0.002322,0.001932,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
184283316,9526715,2530683,0,2,25.0,,false,2,,false,5,16,46,109,7,19,0.347826,0.146789,0.084404,true,457,0.9216,"""Positive""",8,6,28,846,4,,,,,"""article_default""",179,4306,true,…,2.438128,2.585122,2.445014,2.563066,2.516468,2.484598,-159.0,851.5,,-0.068041,-0.010979,-0.202956,4305.0,,,-66.0,-0.079315,-240.5,,0.062263,0.154296,0.855596,0.082469,0.985806,0.117577,0.20303,0.811139,0.162312,7,3,1,0.0,0.0,0.0,0.0,0.0,0.0
90104855,9538004,2188571,0,1,88.0,,false,2,,false,5,61,190,344,6,8,0.321053,0.177326,0.368217,true,140,0.6134,"""Neutral""",11,6,24,1112,5,,,,,"""article_standard_feature""",145,3503,false,…,3.180917,3.299404,3.425413,3.607439,3.433167,3.230481,72.5,145.0,,0.009155,0.013953,0.037341,3471.0,,,0.0,0.006474,154.5,,0.597854,0.470919,0.913364,0.488181,0.973867,0.403892,0.66911,0.871372,0.498279,8,3,3,0.0,0.0,0.004847,-0.000262,0.0,0.016177
257661167,9268227,2577740,0,2,83.0,100.0,true,0,50,false,5,45,155,290,6,17,0.290323,0.155172,0.300388,true,140,0.9946,"""Negative""",5,6,25,1102,5,,,,,"""article_default""",361,8676,false,…,2.487511,2.639263,2.779887,2.778692,2.726676,2.495079,-4.0,-113.5,,-0.002833,-0.015208,-0.008086,8675.0,,,-54.0,-0.040868,17.0,,0.617845,0.433314,0.831347,0.59966,0.987592,0.329419,0.643728,0.798818,0.439408,5,3,2,0.0,0.0,0.007424,0.005384,0.0,0.007175
379754148,9786230,1567717,0,1,110.0,100.0,false,2,,false,5,69,250,413,1,11,0.276,0.16707,0.411184,false,142,0.9891,"""Negative""",2,6,14,263,4,62719,288112,2.380866e6,0.21769,"""article_default""",0,3,false,…,1.818739,1.884916,1.858519,1.90391,1.926037,1.836109,-48.0,1589.0,0.068483,-0.043218,0.0,-0.088774,0.0,4351.0,-26080.0,0.0,-0.014528,-71.0,-600245.0,0.522058,0.630632,0.892089,0.611012,0.058824,0.387504,0.446642,0.860705,0.584431,3,2,1,0.0,0.0,0.0,0.0,0.0,0.008196


In [40]:
val_ds.shape

(2928942, 260)

In [41]:
# impression_id will be later useful for evaluation
val_ds_pandas = val_ds.drop(['impression_id', 'article', 'user_id']).to_pandas()

val_ds_pandas[categorical_columns] = val_ds_pandas[categorical_columns].astype('category')

X_val = val_ds_pandas.drop(columns=['target'])
y_val = val_ds_pandas['target']

val_ds = val_ds.with_columns(pl.Series(model.predict_proba(X_val)[:, 1]).alias('prediction'))
val_ds.select(['impression_id', 'target', 'prediction'])


impression_id,target,prediction
u32,i8,f64
205373625,0,0.003834
456698625,0,0.043841
46558736,0,0.003513
470321755,0,0.01971
214849994,0,0.008382
…,…,…
184283316,0,0.062339
90104855,0,0.013577
257661167,0,0.108202
379754148,0,0.631277


In [98]:
"""# impression_id will be later useful for evaluation
val_ds_pandas = val_ds.drop(['impression_id', 'article', 'user_id']).to_pandas()

val_ds_pandas[categorical_columns] = val_ds_pandas[categorical_columns].astype('category')

X_val = val_ds_pandas.drop(columns=['target'])
y_val = val_ds_pandas['target']

val_ds = val_ds.with_columns(pl.Series(model.predict(X_val)).alias('prediction'))
val_ds.select(['impression_id', 'target', 'prediction'])"""

"# impression_id will be later useful for evaluation\nval_ds_pandas = val_ds.drop(['impression_id', 'article', 'user_id']).to_pandas()\n\nval_ds_pandas[categorical_columns] = val_ds_pandas[categorical_columns].astype('category')\n\nX_val = val_ds_pandas.drop(columns=['target'])\ny_val = val_ds_pandas['target']\n\nval_ds = val_ds.with_columns(pl.Series(model.predict(X_val)).alias('prediction'))\nval_ds.select(['impression_id', 'target', 'prediction'])"

In [42]:
evaluation_ds = val_ds.group_by('impression_id').agg(pl.col('target'), pl.col('prediction'))
evaluation_ds

impression_id,target,prediction
u32,list[i8],list[f64]
31280009,"[0, 0, … 0]","[0.399578, 0.240918, … 0.084325]"
456762936,"[0, 0, … 0]","[0.062732, 0.44511, … 0.412559]"
168750204,"[0, 0, … 0]","[0.246336, 0.022812, … 0.283401]"
373961595,"[0, 0, … 1]","[0.554871, 0.176925, … 0.357857]"
265823266,"[0, 0, … 0]","[0.596666, 0.347244, … 0.066255]"
…,…,…
495136827,"[0, 0, … 1]","[0.016536, 0.011279, … 0.977852]"
149833767,"[0, 0, … 0]","[0.399938, 0.143863, … 0.151544]"
487716256,"[1, 0, … 0]","[0.98752, 0.004064, … 0.005033]"
42791017,"[0, 0, … 0]","[0.194567, 0.421583, … 0.142405]"


In [46]:
import sys
sys.path.append('/home/ubuntu/RecSysChallenge2024/src')

In [50]:
%cd /home/ubuntu/RecSysChallenge2024/src

/home/ubuntu/RecSysChallenge2024/src


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [52]:
from fastauc.fastauc.fast_auc import CppAuc
cpp_auc = CppAuc()
np.mean([cpp_auc.roc_auc_score(np.array(y_t).astype(bool), np.array(y_s).astype(np.float32)) 
                 for y_t, y_s in zip(evaluation_ds['target'].to_list(), 
                                     evaluation_ds['prediction'].to_list())])

0.7872166138635267