In [1]:
# Read the GitHub access token stored as a Kaggle secret under the label "Token".
from kaggle_secrets import UserSecretsClient
secret_label = "Token"
token = UserSecretsClient().get_secret(secret_label)

# Clone the `baseline_boosting` branch of the challenge repository; the token is
# interpolated into the HTTPS URL to authenticate.
# NOTE(review): the token becomes part of the clone URL — avoid echoing that URL or
# sharing the cloned working directory.
! git clone --branch baseline_boosting https://{token}@github.com/FrancescoZanella/RecSysChallenge2024.git
# Clone the EB-NeRD benchmark repository (no credentials are passed for this one).
! git clone https://github.com/ebanalyse/ebnerd-benchmark.git

Cloning into 'RecSysChallenge2024'...
remote: Enumerating objects: 77, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (62/62), done.[K
remote: Total 77 (delta 10), reused 69 (delta 8), pack-reused 0[K
Unpacking objects: 100% (77/77), 1.61 MiB | 6.29 MiB/s, done.
Cloning into 'ebnerd-benchmark'...
remote: Enumerating objects: 340, done.[K
remote: Counting objects: 100% (340/340), done.[K
remote: Compressing objects: 100% (224/224), done.[K
remote: Total 340 (delta 151), reused 277 (delta 93), pack-reused 0[K
Receiving objects: 100% (340/340), 15.18 MiB | 27.65 MiB/s, done.
Resolving deltas: 100% (151/151), done.


In [2]:
# Switch the working directory to the benchmark's `src` folder — presumably so that the
# `ebrec` package imported in the next cell resolves without installation.
%cd ebnerd-benchmark/src

/kaggle/working/ebnerd-benchmark/src


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import polars as pl
import scipy.stats as stats

from ebrec.utils._descriptive_analysis import (
    min_max_impression_time_behaviors, 
    min_max_impression_time_history
)
from ebrec.utils._polars import slice_join_dataframes
from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    truncate_history,
)
from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_USER_COL
)
from ebrec.evaluation.metrics_protocols import *

from catboost import CatBoostClassifier

In [4]:
# Root of the `ebnerd_small` dataset inside the Kaggle input mount. Every table below is
# resolved relative to this single constant, so switching dataset location (or split size)
# only requires editing one line.
DATA_DIR = Path('/kaggle/input/recommender-systems-challenge-2024/ebnerd_small')

# Article metadata (joined onto both the training and the validation candidates later on).
articles = pl.read_parquet(DATA_DIR / 'articles.parquet')

# Training split: impression logs and per-user histories.
behaviors_train = pl.read_parquet(DATA_DIR / 'train' / 'behaviors.parquet')
history_train = pl.read_parquet(DATA_DIR / 'train' / 'history.parquet')

# Validation split: same two tables.
behaviors_val = pl.read_parquet(DATA_DIR / 'validation' / 'behaviors.parquet')
history_val = pl.read_parquet(DATA_DIR / 'validation' / 'history.parquet')

In [5]:
# Ratio passed to the sampling helper as `npratio` (presumably negatives kept per click).
NPRATIO = 2

# Step 1: sample in-view articles per impression with the benchmark's wu2019 strategy.
train_ds = sampling_strategy_wu2019(
    behaviors_train,
    npratio=NPRATIO,
    shuffle=False,
    with_replacement=True,
    seed=123,
)

# Step 2: attach the binary `labels` list column (the helper is asked to shuffle, seeded).
train_ds = create_binary_labels_column(train_ds, shuffle=True, seed=123)

# Step 3: record the length of each `labels` list in a `labels_len` column.
train_ds = train_ds.with_columns(
    pl.col("labels").list.len().name.suffix("_len")
)

In [6]:
# Time elapsed between an article's publication and the impression showing it.
time_since_publication = pl.col('impression_time') - pl.col('published_time')

# Turn the sampled impressions into one row per (impression, candidate article) and
# attach impression-time and article-level features.
train_ds = (
    train_ds
    .select([
        'impression_id', 'article_ids_inview', 'article_id', 'impression_time', 'labels',
        'device_type', 'read_time', 'scroll_percentage', 'user_id', 'is_sso_user', 'gender',
        'age', 'is_subscriber',
    ])
    # the in-view list and its labels are exploded together: one candidate per row
    .explode(['article_ids_inview', 'labels'])
    .rename({'article_ids_inview': 'article', 'labels': 'target'})
    .with_columns(
        pl.col('article').cast(pl.Int32),
        weekday=pl.col('impression_time').dt.weekday(),
        hour=pl.col('impression_time').dt.hour(),
        # flag impressions whose own `article_id` is null
        is_in_home=pl.col('article_id').is_null(),
    )
    .join(
        articles.select([
            'article_id', 'premium', 'published_time', 'category',
            'sentiment_score', 'sentiment_label',
        ]),
        left_on='article',
        right_on='article_id',
        how='left',
    )
    .with_columns(
        article_delay_days=time_since_publication.dt.total_days(),
        article_delay_hours=time_since_publication.dt.total_hours(),
    )
    # raw timestamps and the impression-level article id are no longer needed
    .drop(['impression_time', 'published_time', 'article_id'])
)

train_ds.head()

impression_id,article,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,weekday,hour,is_in_home,premium,category,sentiment_score,sentiment_label,article_delay_days,article_delay_hours
u32,i32,i8,i8,f32,f32,u32,bool,i8,i8,bool,i8,i8,bool,bool,i16,f32,str,i64,i64
149474,9778669,0,2,13.0,,139836,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1
149474,9778728,0,2,13.0,,139836,False,,,False,3,7,True,False,142,0.9654,"""Negative""",0,0
149474,9778657,1,2,13.0,,139836,False,,,False,3,7,True,False,118,0.8347,"""Neutral""",0,1
150528,9778669,0,2,25.0,,143471,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1
150528,9778682,0,2,25.0,,143471,False,,,False,3,7,True,False,498,0.9546,"""Negative""",0,1


In [7]:
# Sanity check: (rows, columns) of the exploded training table.
train_ds.shape

(702831, 20)

# Adding history

In [8]:
# Collapse each user's history lists into scalar summary features; the per-click weekday
# and hour lists are kept for now and reduced in a later cell.
history_train = (
    history_train
    .with_columns(
        NumArticlesHistory=pl.col('article_id_fixed').list.len(),
        MedianReadTime=pl.col('read_time_fixed').list.median(),
        MaxReadTime=pl.col('read_time_fixed').list.max(),
        TotalReadTime=pl.col('read_time_fixed').list.sum(),
        MedianScrollPercentage=pl.col('scroll_percentage_fixed').list.median(),
        MaxScrollPercentage=pl.col('scroll_percentage_fixed').list.max(),
        weekdays=pl.col('impression_time_fixed').list.eval(pl.element().dt.weekday()),
        hours=pl.col('impression_time_fixed').list.eval(pl.element().dt.hour()),
    )
    # the raw list columns have been summarised above
    .drop(['read_time_fixed', 'scroll_percentage_fixed', 'impression_time_fixed'])
)

In [9]:
# Sanity check: (rows, columns) of the history feature table.
history_train.shape

(15143, 10)

In [10]:
%%time

from rich.progress import Progress

def get_categories(article_ids):
    progress.update(task_id, advance=1)
    categories = articles.filter(pl.col('article_id').is_in(article_ids)) \
        .select(pl.col('category').explode())['category'].to_list()
    return list(set(categories))

with Progress() as progress: 
    task_id = progress.add_task("Getting Categories", total=history_train.shape[0])
    history_train = history_train.with_columns(
        pl.col('article_id_fixed').map_elements(get_categories, return_dtype=pl.List(pl.Int64)).alias('categories')
    ).drop('article_id_fixed')
    
history_train.head()

Output()

CPU times: user 20.1 s, sys: 5.81 s, total: 25.9 s
Wall time: 16.4 s


user_id,NumArticlesHistory,MedianReadTime,MaxReadTime,TotalReadTime,MedianScrollPercentage,MaxScrollPercentage,weekdays,hours,categories
u32,u32,f32,f32,f32,f32,f32,list[i8],list[i8],list[i64]
13538,582,6.0,672.0,7192.0,32.0,100.0,"[4, 4, … 3]","[10, 10, … 20]","[512, 457, … 2975]"
14241,179,19.0,1574.0,7754.0,100.0,100.0,"[4, 4, … 3]","[9, 9, … 17]","[512, 457, … 2975]"
20396,96,38.0,1278.0,6546.0,100.0,100.0,"[4, 4, … 3]","[12, 12, … 10]","[512, 140, … 414]"
34912,239,9.0,1393.0,10012.0,97.0,100.0,"[6, 6, … 4]","[7, 13, … 5]","[512, 457, … 2975]"
37953,104,15.5,1512.0,11103.0,58.0,100.0,"[4, 4, … 3]","[19, 19, … 21]","[512, 457, … 2975]"


In [11]:
%%time

history_train = history_train.with_columns(
    pl.col('categories').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int16).alias('MostFrequentCategory'),
    pl.col('weekdays').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int8).alias('MostFrequentWeekday'),
    pl.col('hours').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int8).alias('MostFrequentHour'),
).drop(['categories', 'weekdays', 'hours'])
    
train_ds = train_ds.join(history_train, on='user_id', how='left')
train_ds.head()

CPU times: user 40 s, sys: 8.79 s, total: 48.8 s
Wall time: 43.6 s


impression_id,article,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,weekday,hour,is_in_home,premium,category,sentiment_score,sentiment_label,article_delay_days,article_delay_hours,NumArticlesHistory,MedianReadTime,MaxReadTime,TotalReadTime,MedianScrollPercentage,MaxScrollPercentage,MostFrequentCategory,MostFrequentWeekday,MostFrequentHour
u32,i32,i8,i8,f32,f32,u32,bool,i8,i8,bool,i8,i8,bool,bool,i16,f32,str,i64,i64,u32,f32,f32,f32,f32,f32,i16,i8,i8
149474,9778669,0,2,13.0,,139836,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1,19,11.0,88.0,399.0,47.0,100.0,118,5,19
149474,9778728,0,2,13.0,,139836,False,,,False,3,7,True,False,142,0.9654,"""Negative""",0,0,19,11.0,88.0,399.0,47.0,100.0,118,5,19
149474,9778657,1,2,13.0,,139836,False,,,False,3,7,True,False,118,0.8347,"""Neutral""",0,1,19,11.0,88.0,399.0,47.0,100.0,118,5,19
150528,9778669,0,2,25.0,,143471,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1,482,13.0,1390.0,23016.0,56.0,100.0,118,4,4
150528,9778682,0,2,25.0,,143471,False,,,False,3,7,True,False,498,0.9546,"""Negative""",0,1,482,13.0,1390.0,23016.0,56.0,100.0,118,4,4


# Training

In [12]:
# Fill missing gender with the value 2, drop the identifier columns, and hand the table
# over to pandas for CatBoost.
train_ds = (
    train_ds
    .with_columns(pl.col('gender').fill_null(2))
    .drop(['impression_id', 'article', 'user_id'])
    .to_pandas()
)

# Features the model should treat as categorical rather than numeric.
categorical_columns = [
    'device_type', 'is_sso_user', 'gender', 'is_subscriber', 'weekday',
    'is_in_home', 'premium', 'category', 'sentiment_label',
    'MostFrequentCategory', 'MostFrequentWeekday',
]
train_ds = train_ds.astype({column: 'category' for column in categorical_columns})

# Split into target vector and feature matrix.
y = train_ds['target']
X = train_ds.drop(columns=['target'])

In [13]:
# Intentionally small baseline (100 iterations): its only job is to produce predictions
# so that the evaluation pipeline below can be exercised.
model = CatBoostClassifier(
    iterations=100,
    cat_features=categorical_columns,
)

# verbose=10 -> log the training loss every 10 iterations.
model.fit(X, y, verbose=10)

Learning rate set to 0.5
0:	learn: 0.6072589	total: 544ms	remaining: 53.9s
10:	learn: 0.5724450	total: 4.61s	remaining: 37.3s
20:	learn: 0.5663345	total: 8.66s	remaining: 32.6s
30:	learn: 0.5623938	total: 12.7s	remaining: 28.3s
40:	learn: 0.5596339	total: 16.7s	remaining: 24.1s
50:	learn: 0.5576020	total: 20.8s	remaining: 20s
60:	learn: 0.5558510	total: 25.1s	remaining: 16.1s
70:	learn: 0.5541212	total: 29.4s	remaining: 12s
80:	learn: 0.5524669	total: 33.5s	remaining: 7.85s
90:	learn: 0.5513522	total: 37.5s	remaining: 3.71s
99:	learn: 0.5500947	total: 41.2s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x782b5e0e5b70>

# Model Evaluation

In [14]:
%%time

history_val = history_val.with_columns(
    pl.col('article_id_fixed').list.len().alias('NumArticlesHistory'),
    pl.col('read_time_fixed').list.median().alias('MedianReadTime'),
    pl.col('read_time_fixed').list.max().alias('MaxReadTime'),
    pl.col('read_time_fixed').list.sum().alias('TotalReadTime'),
    pl.col('scroll_percentage_fixed').list.median().alias('MedianScrollPercentage'),
    pl.col('scroll_percentage_fixed').list.max().alias('MaxScrollPercentage'),
    pl.col('impression_time_fixed').list.eval(pl.element().dt.weekday()).alias('weekdays'),
    pl.col('impression_time_fixed').list.eval(pl.element().dt.hour()).alias('hours'),
).drop(['read_time_fixed', 'scroll_percentage_fixed', 'impression_time_fixed'])

with Progress() as progress: 
    task_id = progress.add_task("Getting Categories", total=history_val.shape[0])
    history_val = history_val.with_columns(
        pl.col('article_id_fixed').map_elements(get_categories, return_dtype=pl.List(pl.Int64)).alias('categories')
    ).drop('article_id_fixed')
    
history_val = history_val.with_columns(
    pl.col('categories').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int16).alias('MostFrequentCategory'),
    pl.col('weekdays').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int8).alias('MostFrequentWeekday'),
    pl.col('hours').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int8).alias('MostFrequentHour'),
).drop(['categories', 'weekdays', 'hours'])

Output()

CPU times: user 1min 1s, sys: 14.4 s, total: 1min 15s
Wall time: 1min


In [15]:
# Validation candidates are NOT down-sampled: every in-view article needs a prediction
# for the evaluation, so only the binary labels are attached.
val_labelled = create_binary_labels_column(behaviors_val, shuffle=True, seed=123)

# Time elapsed between an article's publication and the impression showing it.
val_time_since_publication = pl.col('impression_time') - pl.col('published_time')

val_ds = (
    val_labelled
    .with_columns(pl.col("labels").list.len().name.suffix("_len"))
    .select([
        'impression_id', 'article_ids_inview', 'article_id', 'impression_time', 'labels',
        'device_type', 'read_time', 'scroll_percentage', 'user_id', 'is_sso_user', 'gender',
        'age', 'is_subscriber',
    ])
    # one row per (impression, candidate article)
    .explode(['article_ids_inview', 'labels'])
    .rename({'article_ids_inview': 'article', 'labels': 'target'})
    .with_columns(
        pl.col('article').cast(pl.Int32),
        weekday=pl.col('impression_time').dt.weekday(),
        hour=pl.col('impression_time').dt.hour(),
        # flag impressions whose own `article_id` is null
        is_in_home=pl.col('article_id').is_null(),
    )
    .join(
        articles.select([
            'article_id', 'premium', 'published_time', 'category',
            'sentiment_score', 'sentiment_label',
        ]),
        left_on='article',
        right_on='article_id',
        how='left',
    )
    .with_columns(
        article_delay_days=val_time_since_publication.dt.total_days(),
        article_delay_hours=val_time_since_publication.dt.total_hours(),
    )
    .drop(['impression_time', 'published_time', 'article_id'])
    # per-user history features computed in the previous cell
    .join(history_val, on='user_id', how='left')
)

val_ds.head()

impression_id,article,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,weekday,hour,is_in_home,premium,category,sentiment_score,sentiment_label,article_delay_days,article_delay_hours,NumArticlesHistory,MedianReadTime,MaxReadTime,TotalReadTime,MedianScrollPercentage,MaxScrollPercentage,MostFrequentCategory,MostFrequentWeekday,MostFrequentHour
u32,i32,i8,i8,f32,f32,u32,bool,i8,i8,bool,i8,i8,bool,bool,i16,f32,str,i64,i64,u32,f32,f32,f32,f32,f32,i16,i8,i8
96791,9783865,0,2,9.0,,22548,False,,,False,7,4,True,False,498,0.9793,"""Negative""",0,6,130,16.0,650.0,4581.0,37.0,100.0,118,1,9
96791,9784591,0,2,9.0,,22548,False,,,False,7,4,True,False,142,0.9823,"""Negative""",0,8,130,16.0,650.0,4581.0,37.0,100.0,118,1,9
96791,9784679,0,2,9.0,,22548,False,,,False,7,4,True,False,142,0.9781,"""Positive""",0,8,130,16.0,650.0,4581.0,37.0,100.0,118,1,9
96791,9784696,1,2,9.0,,22548,False,,,False,7,4,True,False,142,0.9529,"""Positive""",0,8,130,16.0,650.0,4581.0,37.0,100.0,118,1,9
96791,9784710,0,2,9.0,,22548,False,,,False,7,4,True,False,142,0.8887,"""Positive""",0,6,130,16.0,650.0,4581.0,37.0,100.0,118,1,9


In [16]:
# Same gender fill as applied to the training table.
val_ds = val_ds.with_columns(pl.col('gender').fill_null(2))

# The identifiers are removed only from the pandas copy fed to the model; `val_ds` keeps
# `impression_id`, which is needed afterwards to group predictions per impression.
val_ds_pandas = (
    val_ds
    .drop(['impression_id', 'article', 'user_id'])
    .to_pandas()
)

categorical_columns = [
    'device_type', 'is_sso_user', 'gender', 'is_subscriber', 'weekday',
    'is_in_home', 'premium', 'category', 'sentiment_label',
    'MostFrequentCategory', 'MostFrequentWeekday',
]
val_ds_pandas = val_ds_pandas.astype({column: 'category' for column in categorical_columns})

y_val = val_ds_pandas['target']
X_val = val_ds_pandas.drop(columns=['target'])

# predict_proba returns one column per class: column 1 is the probability of class 1
# (column 0 would be the probability of class 0).
click_probability = model.predict_proba(X_val)[:, 1]
val_ds = val_ds.with_columns(pl.Series('prediction', click_probability))
val_ds.select(['impression_id', 'target', 'prediction'])

impression_id,target,prediction
u32,i8,f64
96791,0,0.454831
96791,0,0.439679
96791,0,0.409151
96791,1,0.394631
96791,0,0.42875
…,…,…
579552453,0,0.148944
579552453,0,0.229088
579552453,1,0.310936
579552453,0,0.234527


In [17]:
# Gather, for each impression, the list of targets and the list of predicted scores.
evaluation_ds = (
    val_ds
    .group_by('impression_id')
    .agg(pl.col('target', 'prediction'))
)
evaluation_ds

impression_id,target,prediction
u32,list[i8],list[f64]
204741078,"[0, 0, … 1]","[0.080226, 0.108455, … 0.448023]"
271656354,"[0, 0, … 0]","[0.05659, 0.216518, … 0.392319]"
394292490,"[0, 0, … 0]","[0.103546, 0.595561, … 0.407022]"
96911068,"[0, 0, … 0]","[0.248888, 0.300649, … 0.302625]"
334811472,"[0, 1, … 0]","[0.272132, 0.39529, … 0.359634]"
…,…,…
269157743,"[0, 0, … 0]","[0.368109, 0.159304, … 0.590767]"
397706761,"[0, 1, … 0]","[0.511579, 0.552344, … 0.249851]"
569492886,"[0, 0, … 0]","[0.015808, 0.47644, … 0.014012]"
279456621,"[0, 0, … 0]","[0.446433, 0.535906, … 0.496293]"


In [18]:
%%time

met_eval = MetricEvaluator(
    labels=evaluation_ds['target'].to_list(),
    predictions=evaluation_ds['prediction'].to_list(),
    metric_functions=[
        AucScore(),
        MrrScore(),
        NdcgScore(k=5),
        NdcgScore(k=10),
        LogLossScore(),
        RootMeanSquaredError(),
        AccuracyScore(threshold=0.5),
        F1Score(threshold=0.5),
    ],
)
met_eval.evaluate()

CPU times: user 20min 7s, sys: 504 ms, total: 20min 7s
Wall time: 20min 9s


<MetricEvaluator class>: 
 {
    "auc": 0.6824956764745298,
    "mrr": 0.4476792661382741,
    "ndcg@5": 0.5068768904681878,
    "ndcg@10": 0.5583374590486883,
    "logloss": 0.4514180305283289,
    "rmse": 0.3763607589786764,
    "accuracy": 0.8010700445281806,
    "f1": 0.16002721467337994
}