In [1]:
from kaggle_secrets import UserSecretsClient
secret_label = "Token"
token = UserSecretsClient().get_secret(secret_label)

! git clone --branch baseline_boosting https://{token}@github.com/FrancescoZanella/RecSysChallenge2024.git
! git clone https://github.com/ebanalyse/ebnerd-benchmark.git

Cloning into 'RecSysChallenge2024'...
remote: Enumerating objects: 166, done.[K
remote: Counting objects: 100% (166/166), done.[K
remote: Compressing objects: 100% (129/129), done.[K
remote: Total 166 (delta 33), reused 141 (delta 22), pack-reused 0[K
Receiving objects: 100% (166/166), 16.57 MiB | 28.42 MiB/s, done.
Resolving deltas: 100% (33/33), done.
Cloning into 'ebnerd-benchmark'...
remote: Enumerating objects: 340, done.[K
remote: Counting objects: 100% (340/340), done.[K
remote: Compressing objects: 100% (224/224), done.[K
remote: Total 340 (delta 151), reused 277 (delta 93), pack-reused 0[K
Receiving objects: 100% (340/340), 15.18 MiB | 34.16 MiB/s, done.
Resolving deltas: 100% (151/151), done.


In [2]:
%cd ebnerd-benchmark/src

/kaggle/working/ebnerd-benchmark/src


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import polars as pl
import scipy.stats as stats
import gc

from ebrec.utils._descriptive_analysis import (
    min_max_impression_time_behaviors, 
    min_max_impression_time_history
)
from ebrec.utils._polars import slice_join_dataframes
from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    truncate_history,
)
from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_USER_COL
)
from ebrec.evaluation.metrics_protocols import *

from catboost import CatBoostClassifier

In [4]:
articles = pl.read_parquet('/kaggle/input/recommender-systems-challenge-2024/ebnerd_small/articles.parquet')

behaviors_train = pl.read_parquet('/kaggle/input/recommender-systems-challenge-2024/ebnerd_small/train/behaviors.parquet')
history_train = pl.read_parquet('/kaggle/input/recommender-systems-challenge-2024/ebnerd_small/train/history.parquet')

behaviors_val = pl.read_parquet('/kaggle/input/recommender-systems-challenge-2024/ebnerd_small/validation/behaviors.parquet')
history_val = pl.read_parquet('/kaggle/input/recommender-systems-challenge-2024/ebnerd_small/validation/history.parquet')

In [5]:
unique_entities = articles.select('entity_groups').explode('entity_groups')['entity_groups'].unique().to_list()
unique_entities = [e for e in unique_entities if e is not None]
unique_entities

['PER', 'LOC', 'PROD', 'MISC', 'ORG', 'EVENT']

In [6]:
NPRATIO = 2
train_ds = behaviors_train.pipe(
    sampling_strategy_wu2019, npratio=NPRATIO, shuffle=False, with_replacement=True, seed=123
).pipe(create_binary_labels_column, shuffle=True, seed=123)

In [9]:
# dropping duplicates that can occur because of sampling strategy
train_ds = train_ds.select(['impression_id', 'article_ids_inview', 'article_id', 'impression_time', 'labels', 
                            'device_type', 'read_time', 'scroll_percentage', 'user_id', 'is_sso_user', 'gender',
                            'age', 'is_subscriber']) \
    .explode(['article_ids_inview', 'labels']) \
    .rename({'article_ids_inview': 'article', 'labels': 'target'}) \
    .unique(['impression_id', 'article']) \
    .with_columns(
        pl.col('impression_time').dt.weekday().alias('weekday'),
        pl.col('impression_time').dt.hour().alias('hour'),
        pl.col('article').cast(pl.Int32),
    ).join(articles.select(['article_id', 'premium', 'published_time', 'category', 
                            'sentiment_score', 'sentiment_label', 'entity_groups']),
           left_on='article', right_on='article_id', how='left') \
    .with_columns(
        (pl.col('impression_time') - pl.col('published_time')).dt.total_days().alias('article_delay_days'),
        (pl.col('impression_time') - pl.col('published_time')).dt.total_hours().alias('article_delay_hours')
    ).drop(['impression_time', 'published_time', 'article_id']) \
    .with_columns(
        pl.col('entity_groups').list.contains(entity).alias(f'Entity_{entity}_Present')
        for entity in unique_entities
    )

train_ds.head()

impression_id,article,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,weekday,hour,is_in_home,premium,category,sentiment_score,sentiment_label,entity_groups,article_delay_days,article_delay_hours,Entity_PER_Present,Entity_LOC_Present,Entity_PROD_Present,Entity_MISC_Present,Entity_ORG_Present,Entity_EVENT_Present
u32,i32,i8,i8,f32,f32,u32,bool,i8,i8,bool,i8,i8,bool,bool,i16,f32,str,list[str],i64,i64,bool,bool,bool,bool,bool,bool
150528,9778669,0,2,25.0,,143471,False,,,False,3,7,True,False,118,0.9481,"""Negative""","[""ORG"", ""ORG"", … ""PROD""]",0,1,True,False,True,False,True,False
153068,9778669,1,1,78.0,100.0,151570,False,,,False,3,7,False,False,118,0.9481,"""Negative""","[""ORG"", ""ORG"", … ""PROD""]",0,0,True,False,True,False,True,False
153071,9775990,0,1,125.0,100.0,151570,False,,,False,3,7,False,False,142,0.5539,"""Neutral""","[""PER"", ""MISC"", … ""EVENT""]",1,43,True,True,False,True,True,True
153071,9771223,0,1,125.0,100.0,151570,False,,,False,3,7,False,False,142,0.7164,"""Neutral""","[""EVENT"", ""MISC"", … ""PROD""]",5,139,True,True,True,True,True,True
153078,9778226,1,1,7.0,100.0,151570,False,,,False,3,7,False,False,414,0.9722,"""Positive""","[""PER"", ""PER"", … ""PROD""]",0,1,True,False,True,False,True,False


In [10]:
train_ds.shape

(699537, 27)

# Adding history

## Retrieving Article Features

In [11]:
%%time

from rich.progress import Progress


def get_single_feature_function(f_name, progress_task):
    def get_feature(article_ids):
        progress.update(progress_task, advance=1)
        feature_values = articles.filter(pl.col('article_id').is_in(article_ids)) \
            .select(pl.col(f_name))[f_name].to_list()
        return feature_values
    return get_feature


def get_unique_list_feature_function(f_name, progress_task):
    def get_feature(article_ids):
        progress.update(progress_task, advance=1)
        feature_values = articles.filter(pl.col('article_id').is_in(article_ids)) \
            .select(pl.col(f_name).list.unique())[f_name].to_list()
        return feature_values
    return get_feature


def get_unique_list_exploded_feature_function(f_name, progress_task):
    def get_feature(article_ids):
        progress.update(progress_task, advance=1)
        feature_values = articles.filter(pl.col('article_id').is_in(article_ids)) \
            .select(pl.col(f_name).list.unique()).explode(f_name)[f_name].to_list()
        return feature_values
    return get_feature


# ner_clusters for the moment is not used since there are too much unique values (more than 44k)
columns = ['category', 'article_type', 'sentiment_label', 'sentiment_score']
return_dtypes = [pl.Int64, pl.String, pl.String, pl.Float64]
with Progress() as progress: 
    
    tasks = {}
    for col in columns:
        tasks[col] = progress.add_task(f"Getting {col}", total=history_train.shape[0])
    tasks['entity_groups'] = progress.add_task("Getting entity_groups", total=history_train.shape[0])
    tasks['entity_groups_detailed'] = progress.add_task("Getting detailed entity_groups", total=history_train.shape[0])

    history_train = history_train.with_columns(
        [pl.col('article_id_fixed').map_elements(get_single_feature_function(col, tasks[col]), 
                                                 return_dtype=pl.List(dtype)).alias(col)
         for col, dtype in zip(columns, return_dtypes)] + \
        [pl.col('article_id_fixed').map_elements(get_unique_list_exploded_feature_function('entity_groups', tasks['entity_groups']), 
                                                 return_dtype=pl.List(pl.String)).alias('entity_groups'),
         pl.col('article_id_fixed').map_elements(get_unique_list_feature_function('entity_groups', tasks['entity_groups_detailed']), 
                                                 return_dtype=pl.List(pl.List(pl.String))).alias('entity_groups_detailed')]
    ).drop('article_id_fixed')
    
history_train.head(2)

Output()

CPU times: user 4min 1s, sys: 20.8 s, total: 4min 22s
Wall time: 2min 51s


user_id,impression_time_fixed,scroll_percentage_fixed,read_time_fixed,category,article_type,sentiment_label,sentiment_score,entity_groups,entity_groups_detailed
u32,list[datetime[μs]],list[f32],list[f32],list[i64],list[str],list[str],list[f64],list[str],list[list[str]]
13538,"[2023-04-27 10:17:43, 2023-04-27 10:18:01, … 2023-05-17 20:36:34]","[100.0, 35.0, … 100.0]","[17.0, 12.0, … 16.0]","[565, 572, … 140]","[""article_default"", ""article_fullscreen_gallery"", … ""article_default""]","[""Neutral"", ""Neutral"", … ""Negative""]","[0.8108, 0.7985, … 0.9959]","[null, null, … ""ORG""]","[[], [], … [""PER"", ""ORG"", ""LOC""]]"
14241,"[2023-04-27 09:40:18, 2023-04-27 09:40:33, … 2023-05-17 17:08:41]","[100.0, 46.0, … 100.0]","[8.0, 9.0, … 12.0]","[457, 457, … 118]","[""article_default"", ""article_default"", … ""article_default""]","[""Negative"", ""Neutral"", … ""Neutral""]","[0.9712, 0.9115, … 0.8372]","[null, ""ORG"", … ""PER""]","[[], [""PER"", ""ORG""], … [""ORG"", ""PER""]]"


## Basic Features

In [12]:
history_train = history_train.with_columns(
    pl.col('read_time_fixed').list.len().alias('NumArticlesHistory'),
    pl.col('read_time_fixed').list.median().alias('MedianReadTime'),
    pl.col('read_time_fixed').list.max().alias('MaxReadTime'),
    pl.col('read_time_fixed').list.sum().alias('TotalReadTime'),
    pl.col('scroll_percentage_fixed').list.median().alias('MedianScrollPercentage'),
    pl.col('scroll_percentage_fixed').list.max().alias('MaxScrollPercentage'),
    pl.col('impression_time_fixed').list.eval(pl.element().dt.weekday()).alias('weekdays'),
    pl.col('impression_time_fixed').list.eval(pl.element().dt.hour()).alias('hours'),
).with_columns(
    pl.col('weekdays').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int8).alias('MostFrequentWeekday'),
    pl.col('hours').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int8).alias('MostFrequentHour'),
    pl.col('category').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int16).alias('MostFrequentCategory'),
    (1 - (pl.col('article_type').list.count_matches('article_default') / pl.col('NumArticlesHistory'))).alias('PctNotDefaultArticles'),
    (pl.col('sentiment_label').list.count_matches('Negative') / pl.col('NumArticlesHistory')).alias('NegativePct'),
    (pl.col('sentiment_label').list.count_matches('Positive') / pl.col('NumArticlesHistory')).alias('PositivePct'),
    (pl.col('sentiment_label').list.count_matches('Neutral') / pl.col('NumArticlesHistory')).alias('NeutralPct'),
    # TODO: not sure about how sentiment scores are since there are 3 classes
    # probably the score is related to the class so this should be done differently for each class
#     pl.col('sentiment_score').list.mean().alias('MeanSentimentScore'),
#     pl.col('sentiment_score').list.max().alias('MaxSentimentScore'),
#     pl.col('sentiment_score').list.min().alias('MinSentimentScore'),
).drop(
    ['read_time_fixed', 'scroll_percentage_fixed', 'impression_time_fixed', 
     'weekdays', 'hours', 'sentiment_label', 'sentiment_score', 'article_type']
).with_columns(
    (pl.col('entity_groups').list.count_matches(entity) / pl.col('NumArticlesHistory')).alias(f'{entity}Pct')
    for entity in unique_entities
).drop('entity_groups')

history_train.head(2)

user_id,category,entity_groups_detailed,NumArticlesHistory,MedianReadTime,MaxReadTime,TotalReadTime,MedianScrollPercentage,MaxScrollPercentage,MostFrequentWeekday,MostFrequentHour,MostFrequentCategory,PctNotDefaultArticles,NegativePct,PositivePct,NeutralPct,PERPct,LOCPct,PRODPct,MISCPct,ORGPct,EVENTPct
u32,list[i64],list[list[str]],u32,f32,f32,f32,f32,f32,i8,i8,i16,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
13538,"[565, 572, … 140]","[[], [], … [""PER"", ""ORG"", ""LOC""]]",582,6.0,672.0,7192.0,32.0,100.0,3,3,118,0.073883,0.601375,0.12543,0.223368,0.819588,0.719931,0.280069,0.16323,0.87457,0.159794
14241,"[457, 457, … 118]","[[], [""PER"", ""ORG""], … [""ORG"", ""PER""]]",179,19.0,1574.0,7754.0,100.0,100.0,3,13,118,0.139665,0.553073,0.094972,0.22905,0.776536,0.681564,0.206704,0.162011,0.810056,0.106145


In [13]:
history_train.shape

(15143, 22)

## Categories and entities

In [14]:
def list_pct_matches_with_col(a, b) -> pl.Expr:
    '''
    Returns an expression to count the number of matching element in a list with another column.
    The polars function count_matches cannot be used since it wants only a single element, 
    variable element from row to row.
    '''
    return pl.when(pl.col(a).list.len() == 0).then(0.0) \
        .otherwise((pl.col(a).list.len() - (pl.col(a).list.set_difference(pl.col(b))).list.len()) / pl.col(a).list.len())


train_ds = train_ds.join(history_train.drop(['entity_groups_detailed']), on='user_id', how='left').with_columns(
    (pl.col('category') == pl.col('MostFrequentCategory')).alias('IsFavouriteCategory'),
    pl.col('category_right').list.n_unique().alias('NumberDifferentCategories'),
    list_pct_matches_with_col('category_right', 'category').alias('PctCategoryMatches'),
).drop('category_right')

gc.collect()
    
train_ds.head(2)

impression_id,article,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,weekday,hour,is_in_home,premium,category,sentiment_score,sentiment_label,entity_groups,article_delay_days,article_delay_hours,Entity_PER_Present,Entity_LOC_Present,Entity_PROD_Present,Entity_MISC_Present,Entity_ORG_Present,Entity_EVENT_Present,NumArticlesHistory,MedianReadTime,MaxReadTime,TotalReadTime,MedianScrollPercentage,MaxScrollPercentage,MostFrequentWeekday,MostFrequentHour,MostFrequentCategory,PctNotDefaultArticles,NegativePct,PositivePct,NeutralPct,PERPct,LOCPct,PRODPct,MISCPct,ORGPct,EVENTPct,IsFavouriteCategory,NumberDifferentCategories,PctCategoryMatches
u32,i32,i8,i8,f32,f32,u32,bool,i8,i8,bool,i8,i8,bool,bool,i16,f32,str,list[str],i64,i64,bool,bool,bool,bool,bool,bool,u32,f32,f32,f32,f32,f32,i8,i8,i16,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,bool,u32,f64
150528,9778669,0,2,25.0,,143471,False,,,False,3,7,True,False,118,0.9481,"""Negative""","[""ORG"", ""ORG"", … ""PROD""]",0,1,True,False,True,False,True,False,482,13.0,1390.0,23016.0,56.0,100.0,4,4,118,0.112033,0.549793,0.118257,0.251037,0.802905,0.649378,0.284232,0.16805,0.848548,0.182573,True,13,0.972912
153068,9778669,1,1,78.0,100.0,151570,False,,,False,3,7,False,False,118,0.9481,"""Negative""","[""ORG"", ""ORG"", … ""PROD""]",0,0,True,False,True,False,True,False,1058,16.0,1548.0,42896.0,100.0,100.0,5,5,118,0.316635,0.397921,0.115312,0.187146,0.620038,0.535917,0.217391,0.142722,0.648393,0.163516,True,10,0.987854


In [15]:
%%time

import tqdm

entities_df = pl.concat(
    (
        rows.select(['impression_id', 'user_id', 'article', 'entity_groups']) \
            .join(history_train.select(['user_id', 'entity_groups_detailed']), on='user_id', how='left') \
            .explode('entity_groups_detailed')
            .with_columns(
                pl.col('entity_groups').list.set_intersection(pl.col('entity_groups_detailed')).list.len().alias('common_entities')
            ).drop(['entity_groups_detailed', 'entity_groups']) \
            .group_by(['impression_id', 'article']).agg(
                pl.col('common_entities').mean().alias('MeanCommonEntities'),
                pl.col('common_entities').max().alias('MaxCommonEntities'),
            )
        for rows in tqdm.tqdm(train_ds.iter_slices(100), total=train_ds.shape[0] // 100)
    )
)
train_ds = train_ds.join(entities_df, on=['impression_id', 'article'], how='left').drop(['entity_groups'])
train_ds.head(2)

6996it [03:39, 31.84it/s]                          


CPU times: user 3min 57s, sys: 23 s, total: 4min 20s
Wall time: 3min 40s


impression_id,article,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,weekday,hour,is_in_home,premium,category,sentiment_score,sentiment_label,article_delay_days,article_delay_hours,Entity_PER_Present,Entity_LOC_Present,Entity_PROD_Present,Entity_MISC_Present,Entity_ORG_Present,Entity_EVENT_Present,NumArticlesHistory,MedianReadTime,MaxReadTime,TotalReadTime,MedianScrollPercentage,MaxScrollPercentage,MostFrequentWeekday,MostFrequentHour,MostFrequentCategory,PctNotDefaultArticles,NegativePct,PositivePct,NeutralPct,PERPct,LOCPct,PRODPct,MISCPct,ORGPct,EVENTPct,IsFavouriteCategory,NumberDifferentCategories,PctCategoryMatches,MeanCommonEntities,MaxCommonEntities
u32,i32,i8,i8,f32,f32,u32,bool,i8,i8,bool,i8,i8,bool,bool,i16,f32,str,i64,i64,bool,bool,bool,bool,bool,bool,u32,f32,f32,f32,f32,f32,i8,i8,i16,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,bool,u32,f64,f64,u32
150528,9778669,0,2,25.0,,143471,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1,True,False,True,False,True,False,482,13.0,1390.0,23016.0,56.0,100.0,4,4,118,0.112033,0.549793,0.118257,0.251037,0.802905,0.649378,0.284232,0.16805,0.848548,0.182573,True,13,0.972912,2.106095,3
153068,9778669,1,1,78.0,100.0,151570,False,,,False,3,7,False,False,118,0.9481,"""Negative""",0,0,True,False,True,False,True,False,1058,16.0,1548.0,42896.0,100.0,100.0,5,5,118,0.316635,0.397921,0.115312,0.187146,0.620038,0.535917,0.217391,0.142722,0.648393,0.163516,True,10,0.987854,2.121457,3


In [16]:
train_ds.shape

(699537, 50)

# Training

In [17]:
train_ds = train_ds.with_columns(
    pl.col('gender').fill_null(2),
)
train_ds = train_ds.drop(['impression_id', 'article', 'user_id']).to_pandas()

categorical_columns = ['device_type', 'is_sso_user', 'gender', 'is_subscriber', 'weekday',
                       'is_in_home', 'premium', 'category', 'sentiment_label', 
                       'MostFrequentCategory', 'MostFrequentWeekday', 'IsFavouriteCategory']
categorical_columns += [f'Entity_{entity}_Present' for entity in unique_entities]
train_ds[categorical_columns] = train_ds[categorical_columns].astype('category')

X = train_ds.drop(columns=['target'])
y = train_ds['target']

In [18]:
# just a simple model to be able to run an evaluation
model = CatBoostClassifier(cat_features=categorical_columns, iterations=100)

model.fit(X, y, verbose=10)

Learning rate set to 0.5
0:	learn: 0.6097539	total: 616ms	remaining: 1m
10:	learn: 0.5636613	total: 5.09s	remaining: 41.2s
20:	learn: 0.5571653	total: 9.46s	remaining: 35.6s
30:	learn: 0.5524992	total: 13.7s	remaining: 30.6s
40:	learn: 0.5481324	total: 18.1s	remaining: 26s
50:	learn: 0.5450041	total: 22.4s	remaining: 21.5s
60:	learn: 0.5430612	total: 27.2s	remaining: 17.4s
70:	learn: 0.5407903	total: 31.5s	remaining: 12.9s
80:	learn: 0.5391722	total: 35.8s	remaining: 8.39s
90:	learn: 0.5378080	total: 40.1s	remaining: 3.96s
99:	learn: 0.5365453	total: 43.8s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7b75f239c5b0>

In [19]:
del history_train
del train_ds
del behaviors_train
gc.collect()

0

# Model Evaluation

In [20]:
%%time

columns = ['category', 'article_type', 'sentiment_label', 'sentiment_score']
return_dtypes = [pl.Int64, pl.String, pl.String, pl.Float64]
with Progress() as progress: 
    
    tasks = {}
    for col in columns:
        tasks[col] = progress.add_task(f"Getting {col}", total=history_val.shape[0])
    tasks['entity_groups'] = progress.add_task("Getting entity_groups", total=history_val.shape[0])
    tasks['entity_groups_detailed'] = progress.add_task("Getting detailed entity_groups", total=history_val.shape[0])

    history_val = history_val.with_columns(
        [pl.col('article_id_fixed').map_elements(get_single_feature_function(col, tasks[col]), 
                                                 return_dtype=pl.List(dtype)).alias(col)
         for col, dtype in zip(columns, return_dtypes)] + \
        [pl.col('article_id_fixed').map_elements(get_unique_list_exploded_feature_function('entity_groups', tasks['entity_groups']), 
                                                 return_dtype=pl.List(pl.String)).alias('entity_groups'),
         pl.col('article_id_fixed').map_elements(get_unique_list_feature_function('entity_groups', tasks['entity_groups_detailed']), 
                                                 return_dtype=pl.List(pl.List(pl.String))).alias('entity_groups_detailed')]
    ).drop('article_id_fixed')
    
history_val = history_val.with_columns(
    pl.col('read_time_fixed').list.len().alias('NumArticlesHistory'),
    pl.col('read_time_fixed').list.median().alias('MedianReadTime'),
    pl.col('read_time_fixed').list.max().alias('MaxReadTime'),
    pl.col('read_time_fixed').list.sum().alias('TotalReadTime'),
    pl.col('scroll_percentage_fixed').list.median().alias('MedianScrollPercentage'),
    pl.col('scroll_percentage_fixed').list.max().alias('MaxScrollPercentage'),
    pl.col('impression_time_fixed').list.eval(pl.element().dt.weekday()).alias('weekdays'),
    pl.col('impression_time_fixed').list.eval(pl.element().dt.hour()).alias('hours'),
).with_columns(
    pl.col('weekdays').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int8).alias('MostFrequentWeekday'),
    pl.col('hours').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int8).alias('MostFrequentHour'),
    pl.col('category').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int16).alias('MostFrequentCategory'),
    (1 - (pl.col('article_type').list.count_matches('article_default') / pl.col('NumArticlesHistory'))).alias('PctNotDefaultArticles'),
    (pl.col('sentiment_label').list.count_matches('Negative') / pl.col('NumArticlesHistory')).alias('NegativePct'),
    (pl.col('sentiment_label').list.count_matches('Positive') / pl.col('NumArticlesHistory')).alias('PositivePct'),
    (pl.col('sentiment_label').list.count_matches('Neutral') / pl.col('NumArticlesHistory')).alias('NeutralPct'),
    # TODO: not sure about how sentiment scores are since there are 3 classes
    # probably the score is related to the class so this should be done differently for each class
#     pl.col('sentiment_score').list.mean().alias('MeanSentimentScore'),
#     pl.col('sentiment_score').list.max().alias('MaxSentimentScore'),
#     pl.col('sentiment_score').list.min().alias('MinSentimentScore'),
).drop(
    ['read_time_fixed', 'scroll_percentage_fixed', 'impression_time_fixed', 
     'weekdays', 'hours', 'sentiment_label', 'sentiment_score', 'article_type']
).with_columns(
    (pl.col('entity_groups').list.count_matches(entity) / pl.col('NumArticlesHistory')).alias(f'{entity}Pct')
    for entity in unique_entities
).drop('entity_groups')

history_val.head(2)

Output()

CPU times: user 4min 38s, sys: 34.8 s, total: 5min 13s
Wall time: 3min 40s


user_id,category,entity_groups_detailed,NumArticlesHistory,MedianReadTime,MaxReadTime,TotalReadTime,MedianScrollPercentage,MaxScrollPercentage,MostFrequentWeekday,MostFrequentHour,MostFrequentCategory,PctNotDefaultArticles,NegativePct,PositivePct,NeutralPct,PERPct,LOCPct,PRODPct,MISCPct,ORGPct,EVENTPct
u32,list[i64],list[list[str]],u32,f32,f32,f32,f32,f32,i8,i8,i16,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
14241,"[457, 457, … 118]","[[], [""ORG"", ""PER""], … [""PER"", ""ORG"", ""LOC""]]",161,18.0,1574.0,7608.0,98.0,100.0,3,13,118,0.130435,0.52795,0.10559,0.254658,0.763975,0.701863,0.180124,0.136646,0.801242,0.099379
20396,"[140, 414, … 140]","[[""ORG"", ""LOC"", ""PER""], [""PER"", ""MISC"", … ""ORG""], … [""PER"", ""ORG"", ""LOC""]]",74,40.5,1278.0,5406.0,100.0,100.0,7,15,414,0.22973,0.459459,0.121622,0.243243,0.756757,0.648649,0.256757,0.216216,0.783784,0.121622


In [21]:
# no sampling since for evaluation all the predictions should be needed
val_ds = behaviors_val.pipe(create_binary_labels_column, shuffle=True, seed=123) \
    .with_columns(pl.col("labels").list.len().name.suffix("_len")) \
    .select(['impression_id', 'article_ids_inview', 'article_id', 'impression_time', 'labels', 
             'device_type', 'read_time', 'scroll_percentage', 'user_id', 'is_sso_user', 'gender',
             'age', 'is_subscriber']) \
    .explode(['article_ids_inview', 'labels']) \
    .rename({'article_ids_inview': 'article', 'labels': 'target'}) \
    .with_columns(
        pl.col('impression_time').dt.weekday().alias('weekday'),
        pl.col('impression_time').dt.hour().alias('hour'),
        pl.col('article').cast(pl.Int32),
    ).join(articles.select(['article_id', 'premium', 'published_time', 'category', 
                            'sentiment_score', 'sentiment_label', 'entity_groups']),
           left_on='article', right_on='article_id', how='left') \
    .with_columns(
        (pl.col('impression_time') - pl.col('published_time')).dt.total_days().alias('article_delay_days'),
        (pl.col('impression_time') - pl.col('published_time')).dt.total_hours().alias('article_delay_hours')
    ).drop(['impression_time', 'published_time', 'article_id']) \
    .drop(['impression_time', 'published_time', 'article_id']) \
    .with_columns(
        pl.col('entity_groups').list.contains(entity).alias(f'Entity_{entity}_Present')
        for entity in unique_entities
    ).join(history_val.drop(['entity_groups_detailed']), on='user_id', how='left').with_columns(
        (pl.col('category') == pl.col('MostFrequentCategory')).alias('IsFavouriteCategory'),
        pl.col('category_right').list.n_unique().alias('NumberDifferentCategories'),
        list_pct_matches_with_col('category_right', 'category').alias('PctCategoryMatches'),
    ).drop('category_right')
    
val_ds.head(2)

impression_id,article,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,weekday,hour,is_in_home,premium,category,sentiment_score,sentiment_label,entity_groups,article_delay_days,article_delay_hours,Entity_PER_Present,Entity_LOC_Present,Entity_PROD_Present,Entity_MISC_Present,Entity_ORG_Present,Entity_EVENT_Present,NumArticlesHistory,MedianReadTime,MaxReadTime,TotalReadTime,MedianScrollPercentage,MaxScrollPercentage,MostFrequentWeekday,MostFrequentHour,MostFrequentCategory,PctNotDefaultArticles,NegativePct,PositivePct,NeutralPct,PERPct,LOCPct,PRODPct,MISCPct,ORGPct,EVENTPct,IsFavouriteCategory,NumberDifferentCategories,PctCategoryMatches
u32,i32,i8,i8,f32,f32,u32,bool,i8,i8,bool,i8,i8,bool,bool,i16,f32,str,list[str],i64,i64,bool,bool,bool,bool,bool,bool,u32,f32,f32,f32,f32,f32,i8,i8,i16,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,bool,u32,f64
96791,9783865,0,2,9.0,,22548,False,,,False,7,4,True,False,498,0.9793,"""Negative""","[""PROD"", ""PER"", … ""ORG""]",0,6,True,True,True,False,True,True,130,16.0,650.0,4581.0,37.0,100.0,1,9,118,0.061538,0.576923,0.153846,0.246154,0.823077,0.715385,0.292308,0.176923,0.907692,0.223077,False,7,0.952756
96791,9784591,0,2,9.0,,22548,False,,,False,7,4,True,False,142,0.9823,"""Negative""","[""ORG"", ""LOC"", … ""ORG""]",0,8,True,True,False,False,True,False,130,16.0,650.0,4581.0,37.0,100.0,1,9,118,0.061538,0.576923,0.153846,0.246154,0.823077,0.715385,0.292308,0.176923,0.907692,0.223077,False,7,0.952756


In [22]:
val_ds.shape

(2928942, 49)

In [23]:
%%time

entities_df = pl.concat(
    (
        rows.select(['impression_id', 'user_id', 'article', 'entity_groups']) \
            .join(history_val.select(['user_id', 'entity_groups_detailed']), on='user_id', how='left') \
            .explode('entity_groups_detailed')
            .with_columns(
                pl.col('entity_groups').list.set_intersection(pl.col('entity_groups_detailed')).list.len().alias('common_entities')
            ).drop(['entity_groups_detailed', 'entity_groups']) \
            .group_by(['impression_id', 'article']).agg(
                pl.col('common_entities').mean().alias('MeanCommonEntities'),
                pl.col('common_entities').max().alias('MaxCommonEntities'),
            )
        for rows in tqdm.tqdm(val_ds.iter_slices(100), total=val_ds.shape[0] // 100)
    )
)
val_ds = val_ds.join(entities_df, on=['impression_id', 'article'], how='left').drop(['entity_groups'])
val_ds.head(2)

29290it [16:09, 30.22it/s]                           


CPU times: user 16min 44s, sys: 2min 13s, total: 18min 58s
Wall time: 16min 10s


impression_id,article,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,weekday,hour,is_in_home,premium,category,sentiment_score,sentiment_label,article_delay_days,article_delay_hours,Entity_PER_Present,Entity_LOC_Present,Entity_PROD_Present,Entity_MISC_Present,Entity_ORG_Present,Entity_EVENT_Present,NumArticlesHistory,MedianReadTime,MaxReadTime,TotalReadTime,MedianScrollPercentage,MaxScrollPercentage,MostFrequentWeekday,MostFrequentHour,MostFrequentCategory,PctNotDefaultArticles,NegativePct,PositivePct,NeutralPct,PERPct,LOCPct,PRODPct,MISCPct,ORGPct,EVENTPct,IsFavouriteCategory,NumberDifferentCategories,PctCategoryMatches,MeanCommonEntities,MaxCommonEntities
u32,i32,i8,i8,f32,f32,u32,bool,i8,i8,bool,i8,i8,bool,bool,i16,f32,str,i64,i64,bool,bool,bool,bool,bool,bool,u32,f32,f32,f32,f32,f32,i8,i8,i16,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,bool,u32,f64,f64,u32
96791,9783865,0,2,9.0,,22548,False,,,False,7,4,True,False,498,0.9793,"""Negative""",0,6,True,True,True,False,True,True,130,16.0,650.0,4581.0,37.0,100.0,1,9,118,0.061538,0.576923,0.153846,0.246154,0.823077,0.715385,0.292308,0.176923,0.907692,0.223077,False,7,0.952756,3.031496,5
96791,9784591,0,2,9.0,,22548,False,,,False,7,4,True,False,142,0.9823,"""Negative""",0,8,True,True,False,False,True,False,130,16.0,650.0,4581.0,37.0,100.0,1,9,118,0.061538,0.576923,0.153846,0.246154,0.823077,0.715385,0.292308,0.176923,0.907692,0.223077,False,7,0.952756,2.503937,3


In [24]:
val_ds.shape

(2928942, 50)

In [25]:
val_ds = val_ds.with_columns(
    pl.col('gender').fill_null(2),
)
# impression_id will be later useful for evaluation
val_ds_pandas = val_ds.drop(['impression_id', 'article', 'user_id']).to_pandas()

val_ds_pandas[categorical_columns] = val_ds_pandas[categorical_columns].astype('category')

X_val = val_ds_pandas.drop(columns=['target'])
y_val = val_ds_pandas['target']

# doint model.predict_proba(...)[:, 1] to take only the probability of class 1
# while by doing [:, 0] we take the probability of class 0
val_ds = val_ds.with_columns(pl.Series(model.predict_proba(X_val)[:, 1]).alias('prediction'))
val_ds.select(['impression_id', 'target', 'prediction'])

impression_id,target,prediction
u32,i8,f64
96791,0,0.736913
96791,0,0.404642
96791,0,0.441478
96791,1,0.30069
96791,0,0.424646
…,…,…
579552453,0,0.11661
579552453,0,0.146538
579552453,1,0.166544
579552453,0,0.256638


In [26]:
evaluation_ds = val_ds.group_by('impression_id').agg(pl.col('target'), pl.col('prediction'))
evaluation_ds

impression_id,target,prediction
u32,list[i8],list[f64]
434672953,"[0, 0, … 1]","[0.041814, 0.112422, … 0.377388]"
259637215,"[0, 0, … 1]","[0.330253, 0.069643, … 0.412103]"
188621790,"[0, 0, … 0]","[0.471977, 0.340992, … 0.428277]"
296203289,"[0, 1, … 0]","[0.291348, 0.418711, … 0.426588]"
203366569,"[0, 1, … 0]","[0.008855, 0.478099, … 0.679278]"
…,…,…
483956446,"[0, 1, … 0]","[0.569667, 0.629966, … 0.64563]"
507638780,"[0, 0, … 0]","[0.198631, 0.299611, … 0.535056]"
565738641,"[0, 0, … 0]","[0.580525, 0.587162, … 0.550583]"
216019700,"[0, 0, … 1]","[0.009485, 0.38379, … 0.537327]"


In [27]:
%%time

met_eval = MetricEvaluator(
    labels=evaluation_ds['target'].to_list(),
    predictions=evaluation_ds['prediction'].to_list(),
    metric_functions=[
        AucScore(),
        MrrScore(),
        NdcgScore(k=5),
        NdcgScore(k=10),
    ],
)
met_eval.evaluate()

CPU times: user 6min 38s, sys: 29.2 ms, total: 6min 38s
Wall time: 6min 39s


<MetricEvaluator class>: 
 {
    "auc": 0.7072870894541164,
    "mrr": 0.47792799541941483,
    "ndcg@5": 0.5364924366668778,
    "ndcg@10": 0.5827174193590527
}