In [1]:
from kaggle_secrets import UserSecretsClient
secret_label = "Token"
token = UserSecretsClient().get_secret(secret_label)

! git clone --branch baseline_boosting https://{token}@github.com/FrancescoZanella/RecSysChallenge2024.git
! git clone https://github.com/ebanalyse/ebnerd-benchmark.git

Cloning into 'RecSysChallenge2024'...
remote: Enumerating objects: 73, done.[K
remote: Counting objects: 100% (73/73), done.[K
remote: Compressing objects: 100% (60/60), done.[K
remote: Total 73 (delta 7), reused 67 (delta 7), pack-reused 0[K
Unpacking objects: 100% (73/73), 1.61 MiB | 5.38 MiB/s, done.
Cloning into 'ebnerd-benchmark'...
remote: Enumerating objects: 340, done.[K
remote: Counting objects: 100% (340/340), done.[K
remote: Compressing objects: 100% (224/224), done.[K
remote: Total 340 (delta 151), reused 277 (delta 93), pack-reused 0[K
Receiving objects: 100% (340/340), 15.18 MiB | 13.51 MiB/s, done.
Resolving deltas: 100% (151/151), done.


In [2]:
%cd ebnerd-benchmark/src

/kaggle/working/ebnerd-benchmark/src


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import polars as pl
import scipy.stats as stats

from ebrec.utils._descriptive_analysis import (
    min_max_impression_time_behaviors, 
    min_max_impression_time_history
)
from ebrec.utils._polars import slice_join_dataframes
from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    truncate_history,
)
from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_USER_COL
)

from catboost import CatBoostClassifier

In [4]:
articles = pl.read_parquet('/kaggle/input/recommender-systems-challenge-2024/ebnerd_small/articles.parquet')

behaviors_train = pl.read_parquet('/kaggle/input/recommender-systems-challenge-2024/ebnerd_small/train/behaviors.parquet')
history_train = pl.read_parquet('/kaggle/input/recommender-systems-challenge-2024/ebnerd_small/train/history.parquet')

behaviors_val = pl.read_parquet('/kaggle/input/recommender-systems-challenge-2024/ebnerd_small/validation/behaviors.parquet')
history_val = pl.read_parquet('/kaggle/input/recommender-systems-challenge-2024/ebnerd_small/validation/history.parquet')

In [5]:
NPRATIO = 2
train_ds = behaviors_train.pipe(
    sampling_strategy_wu2019, npratio=NPRATIO, shuffle=False, with_replacement=True, seed=123
).pipe(create_binary_labels_column, shuffle=True, seed=123).with_columns(pl.col("labels").list.len().name.suffix("_len")).head(5)

In [6]:
train_ds = train_ds.select(['impression_id', 'article_ids_inview', 'article_id', 'impression_time', 'labels', 
                            'device_type', 'read_time', 'scroll_percentage', 'user_id', 'is_sso_user', 'gender',
                            'age', 'is_subscriber']) \
    .explode(['article_ids_inview', 'labels']) \
    .rename({'article_ids_inview': 'article', 'labels': 'target'}) \
    .with_columns(
        pl.col('impression_time').dt.weekday().alias('weekday'),
        pl.col('impression_time').dt.hour().alias('hour'),
        pl.col('article').cast(pl.Int32),
        pl.col('article_id').is_null().alias('is_in_home'),
    ).join(articles.select(['article_id', 'premium', 'published_time', 'category', 
                            'sentiment_score', 'sentiment_label']),
           left_on='article', right_on='article_id', how='left') \
    .with_columns(
        (pl.col('impression_time') - pl.col('published_time')).dt.total_days().alias('article_delay_days'),
        (pl.col('impression_time') - pl.col('published_time')).dt.total_hours().alias('article_delay_hours')
    ).drop(['impression_time', 'published_time', 'article_id'])

train_ds.head()

impression_id,article,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,weekday,hour,is_in_home,premium,category,sentiment_score,sentiment_label,article_delay_days,article_delay_hours
u32,i32,i8,i8,f32,f32,u32,bool,i8,i8,bool,i8,i8,bool,bool,i16,f32,str,i64,i64
149474,9778669,0,2,13.0,,139836,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1
149474,9778728,0,2,13.0,,139836,False,,,False,3,7,True,False,142,0.9654,"""Negative""",0,0
149474,9778657,1,2,13.0,,139836,False,,,False,3,7,True,False,118,0.8347,"""Neutral""",0,1
150528,9778669,0,2,25.0,,143471,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1
150528,9778682,0,2,25.0,,143471,False,,,False,3,7,True,False,498,0.9546,"""Negative""",0,1


In [7]:
train_ds = train_ds.join(history_train, on='user_id', how='left')
train_ds.head()

impression_id,article,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,weekday,hour,is_in_home,premium,category,sentiment_score,sentiment_label,article_delay_days,article_delay_hours,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
u32,i32,i8,i8,f32,f32,u32,bool,i8,i8,bool,i8,i8,bool,bool,i16,f32,str,i64,i64,list[datetime[μs]],list[f32],list[i32],list[f32]
149474,9778669,0,2,13.0,,139836,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1,"[2023-05-03 19:04:15, 2023-05-03 19:05:22, … 2023-05-14 19:52:58]","[100.0, 89.0, … 47.0]","[9745590, 9748574, … 9765156]","[60.0, 11.0, … 3.0]"
149474,9778728,0,2,13.0,,139836,False,,,False,3,7,True,False,142,0.9654,"""Negative""",0,0,"[2023-05-03 19:04:15, 2023-05-03 19:05:22, … 2023-05-14 19:52:58]","[100.0, 89.0, … 47.0]","[9745590, 9748574, … 9765156]","[60.0, 11.0, … 3.0]"
149474,9778657,1,2,13.0,,139836,False,,,False,3,7,True,False,118,0.8347,"""Neutral""",0,1,"[2023-05-03 19:04:15, 2023-05-03 19:05:22, … 2023-05-14 19:52:58]","[100.0, 89.0, … 47.0]","[9745590, 9748574, … 9765156]","[60.0, 11.0, … 3.0]"
150528,9778669,0,2,25.0,,143471,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1,"[2023-04-27 08:05:09, 2023-04-27 10:05:55, … 2023-05-18 06:56:14]","[21.0, 100.0, … 69.0]","[9737881, 9738659, … 9770989]","[7.0, 24.0, … 9.0]"
150528,9778682,0,2,25.0,,143471,False,,,False,3,7,True,False,498,0.9546,"""Negative""",0,1,"[2023-04-27 08:05:09, 2023-04-27 10:05:55, … 2023-05-18 06:56:14]","[21.0, 100.0, … 69.0]","[9737881, 9738659, … 9770989]","[7.0, 24.0, … 9.0]"


In [8]:
train_ds = train_ds.with_columns(
    pl.col('article_id_fixed').list.len().alias('NumArticlesHistory'),
    pl.col('read_time_fixed').list.median().alias('MedianReadTime'),
    pl.col('read_time_fixed').list.max().alias('MaxReadTime'),
    pl.col('read_time_fixed').list.sum().alias('TotalReadTime'),
    pl.col('scroll_percentage_fixed').list.median().alias('MedianScrollPercentage'),
    pl.col('scroll_percentage_fixed').list.max().alias('MaxScrollPercentage'),
    pl.col('impression_time_fixed').list.eval(pl.element().dt.weekday()).alias('weekdays'),
    pl.col('impression_time_fixed').list.eval(pl.element().dt.hour()).alias('hours'),
).drop(['read_time_fixed', 'scroll_percentage_fixed', 'impression_time_fixed'])
train_ds.head()

impression_id,article,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,weekday,hour,is_in_home,premium,category,sentiment_score,sentiment_label,article_delay_days,article_delay_hours,article_id_fixed,NumArticlesHistory,MedianReadTime,MaxReadTime,TotalReadTime,MedianScrollPercentage,MaxScrollPercentage,weekdays,hours
u32,i32,i8,i8,f32,f32,u32,bool,i8,i8,bool,i8,i8,bool,bool,i16,f32,str,i64,i64,list[i32],u32,f32,f32,f32,f32,f32,list[i8],list[i8]
149474,9778669,0,2,13.0,,139836,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1,"[9745590, 9748574, … 9765156]",19,11.0,88.0,399.0,47.0,100.0,"[3, 3, … 7]","[19, 19, … 19]"
149474,9778728,0,2,13.0,,139836,False,,,False,3,7,True,False,142,0.9654,"""Negative""",0,0,"[9745590, 9748574, … 9765156]",19,11.0,88.0,399.0,47.0,100.0,"[3, 3, … 7]","[19, 19, … 19]"
149474,9778657,1,2,13.0,,139836,False,,,False,3,7,True,False,118,0.8347,"""Neutral""",0,1,"[9745590, 9748574, … 9765156]",19,11.0,88.0,399.0,47.0,100.0,"[3, 3, … 7]","[19, 19, … 19]"
150528,9778669,0,2,25.0,,143471,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1,"[9737881, 9738659, … 9770989]",482,13.0,1390.0,23016.0,56.0,100.0,"[4, 4, … 4]","[8, 10, … 6]"
150528,9778682,0,2,25.0,,143471,False,,,False,3,7,True,False,498,0.9546,"""Negative""",0,1,"[9737881, 9738659, … 9770989]",482,13.0,1390.0,23016.0,56.0,100.0,"[4, 4, … 4]","[8, 10, … 6]"


In [9]:
%%time

from rich.progress import Progress


def get_categories(article_ids):
    categories = articles.filter(pl.col('article_id').is_in(article_ids)) \
        .select(pl.col('category').explode())['category'].to_list()
    return list(set(categories))

train_ds = train_ds.with_columns(
    pl.col('article_id_fixed').map_elements(get_categories, return_dtype=pl.List(pl.Int64)).alias('categories')
).drop('article_id_fixed')
train_ds.head()

CPU times: user 127 ms, sys: 14.6 ms, total: 141 ms
Wall time: 136 ms


impression_id,article,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,weekday,hour,is_in_home,premium,category,sentiment_score,sentiment_label,article_delay_days,article_delay_hours,NumArticlesHistory,MedianReadTime,MaxReadTime,TotalReadTime,MedianScrollPercentage,MaxScrollPercentage,weekdays,hours,categories
u32,i32,i8,i8,f32,f32,u32,bool,i8,i8,bool,i8,i8,bool,bool,i16,f32,str,i64,i64,u32,f32,f32,f32,f32,f32,list[i8],list[i8],list[i64]
149474,9778669,0,2,13.0,,139836,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1,19,11.0,88.0,399.0,47.0,100.0,"[3, 3, … 7]","[19, 19, … 19]","[457, 140, … 414]"
149474,9778728,0,2,13.0,,139836,False,,,False,3,7,True,False,142,0.9654,"""Negative""",0,0,19,11.0,88.0,399.0,47.0,100.0,"[3, 3, … 7]","[19, 19, … 19]","[457, 140, … 414]"
149474,9778657,1,2,13.0,,139836,False,,,False,3,7,True,False,118,0.8347,"""Neutral""",0,1,19,11.0,88.0,399.0,47.0,100.0,"[3, 3, … 7]","[19, 19, … 19]","[457, 140, … 414]"
150528,9778669,0,2,25.0,,143471,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1,482,13.0,1390.0,23016.0,56.0,100.0,"[4, 4, … 4]","[8, 10, … 6]","[512, 457, … 2975]"
150528,9778682,0,2,25.0,,143471,False,,,False,3,7,True,False,498,0.9546,"""Negative""",0,1,482,13.0,1390.0,23016.0,56.0,100.0,"[4, 4, … 4]","[8, 10, … 6]","[512, 457, … 2975]"


In [10]:
train_ds = train_ds.with_columns(
    pl.col('categories').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int16).alias('MostFrequentCategory'),
    pl.col('weekdays').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int8).alias('MostFrequentWeekday'),
    pl.col('hours').map_elements(lambda x: stats.mode(x)[0], return_dtype=pl.Int64).cast(pl.Int8).alias('MostFrequentHour'),
).drop(['categories', 'weekdays', 'hours'])
train_ds.head()

impression_id,article,target,device_type,read_time,scroll_percentage,user_id,is_sso_user,gender,age,is_subscriber,weekday,hour,is_in_home,premium,category,sentiment_score,sentiment_label,article_delay_days,article_delay_hours,NumArticlesHistory,MedianReadTime,MaxReadTime,TotalReadTime,MedianScrollPercentage,MaxScrollPercentage,MostFrequentCategory,MostFrequentWeekday,MostFrequentHour
u32,i32,i8,i8,f32,f32,u32,bool,i8,i8,bool,i8,i8,bool,bool,i16,f32,str,i64,i64,u32,f32,f32,f32,f32,f32,i16,i8,i8
149474,9778669,0,2,13.0,,139836,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1,19,11.0,88.0,399.0,47.0,100.0,118,5,19
149474,9778728,0,2,13.0,,139836,False,,,False,3,7,True,False,142,0.9654,"""Negative""",0,0,19,11.0,88.0,399.0,47.0,100.0,118,5,19
149474,9778657,1,2,13.0,,139836,False,,,False,3,7,True,False,118,0.8347,"""Neutral""",0,1,19,11.0,88.0,399.0,47.0,100.0,118,5,19
150528,9778669,0,2,25.0,,143471,False,,,False,3,7,True,False,118,0.9481,"""Negative""",0,1,482,13.0,1390.0,23016.0,56.0,100.0,118,4,4
150528,9778682,0,2,25.0,,143471,False,,,False,3,7,True,False,498,0.9546,"""Negative""",0,1,482,13.0,1390.0,23016.0,56.0,100.0,118,4,4


In [11]:
train_ds = train_ds.with_columns(
    pl.col('gender').fill_null(2),
)
train_ds = train_ds.drop(['impression_id', 'article', 'user_id']).to_pandas()

categorical_columns = ['device_type', 'is_sso_user', 'gender', 'is_subscriber', 'weekday',
                       'is_in_home', 'premium', 'category', 'sentiment_label', 
                       'MostFrequentCategory', 'MostFrequentWeekday']
train_ds[categorical_columns] = train_ds[categorical_columns].astype('category')

X = train_ds.drop(columns=['target'])
y = train_ds['target']

In [12]:
model = CatBoostClassifier(cat_features=categorical_columns)

model.fit(X, y, verbose=10)

Learning rate set to 0.001714
0:	learn: 0.6923584	total: 60.3ms	remaining: 1m
10:	learn: 0.6842806	total: 80.4ms	remaining: 7.23s
20:	learn: 0.6764758	total: 92.7ms	remaining: 4.32s
30:	learn: 0.6699690	total: 107ms	remaining: 3.33s
40:	learn: 0.6631010	total: 124ms	remaining: 2.9s
50:	learn: 0.6557455	total: 141ms	remaining: 2.62s
60:	learn: 0.6490549	total: 153ms	remaining: 2.36s
70:	learn: 0.6421546	total: 168ms	remaining: 2.2s
80:	learn: 0.6345877	total: 184ms	remaining: 2.08s
90:	learn: 0.6276666	total: 199ms	remaining: 1.99s
100:	learn: 0.6197924	total: 221ms	remaining: 1.97s
110:	learn: 0.6131948	total: 239ms	remaining: 1.91s
120:	learn: 0.6077934	total: 256ms	remaining: 1.86s
130:	learn: 0.6016979	total: 274ms	remaining: 1.81s
140:	learn: 0.5951383	total: 296ms	remaining: 1.8s
150:	learn: 0.5910649	total: 313ms	remaining: 1.76s
160:	learn: 0.5858249	total: 339ms	remaining: 1.76s
170:	learn: 0.5797507	total: 358ms	remaining: 1.74s
180:	learn: 0.5746559	total: 375ms	remaining: 1.

<catboost.core.CatBoostClassifier at 0x7994111866e0>