In [1]:
import polars as pl
from pathlib import Path
import numpy as np
import datetime

In [2]:
dpath = Path('../dataset')

dtype = 'small'
articles = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/articles.parquet')

behaviors_train = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/train/behaviors.parquet')
history_train = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/train/history.parquet')

behaviors_val = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/validation/behaviors.parquet')
history_val = pl.read_parquet(f'{dpath}/ebnerd_{dtype}/validation/history.parquet')

In [3]:
print('History train: ', history_train['impression_time_fixed'].explode().min(), history_train['impression_time_fixed'].explode().max(), (history_train['impression_time_fixed'].explode().max() - history_train['impression_time_fixed'].explode().min()))
print('Behaviors train: ', behaviors_train['impression_time'].explode().min(), behaviors_train['impression_time'].explode().max(), behaviors_train['impression_time'].explode().max() - behaviors_train['impression_time'].explode().min())

History train:  2023-04-27 07:00:00 2023-05-18 06:59:59 20 days, 23:59:59
Behaviors train:  2023-05-18 07:00:01 2023-05-25 06:59:58 6 days, 23:59:57


In [4]:
print('History val: ', history_val['impression_time_fixed'].explode().min(), history_val['impression_time_fixed'].explode().max(), (history_val['impression_time_fixed'].explode().max() - history_val['impression_time_fixed'].explode().min()))
print('Behaviors val: ', behaviors_val['impression_time'].explode().min(), behaviors_val['impression_time'].explode().max(), behaviors_val['impression_time'].explode().max() - behaviors_val['impression_time'].explode().min())

History val:  2023-05-04 07:00:00 2023-05-25 06:59:59 20 days, 23:59:59
Behaviors val:  2023-05-25 07:00:02 2023-06-01 06:59:59 6 days, 23:59:57


In [5]:
behaviors_val['impression_time'].explode().max() - behaviors_train['impression_time'].explode().min()

datetime.timedelta(days=13, seconds=86398)

In [94]:
def behaviors_to_history(behaviors: pl.DataFrame) -> pl.DataFrame:
        return behaviors.sort('impression_time').select('user_id', 'impression_time', 'next_scroll_percentage', 'article_ids_clicked', 'next_read_time')\
                .rename({'impression_time': 'impression_time_fixed', 
                        'article_ids_clicked': 'article_id_fixed', 
                        'next_read_time': 'read_time_fixed', 
                        'next_scroll_percentage': 'scroll_percentage_fixed'})\
                .explode('article_id_fixed').group_by('user_id').agg(pl.all())
        
behaviors_to_history(behaviors_train).head(2)

user_id,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
u32,list[datetime[μs]],list[f32],list[i32],list[f32]
2494588,"[2023-05-18 07:01:14, 2023-05-19 05:41:12]","[100.0, 100.0]","[9767697, 9771113]","[86.0, 17.0]"
1541532,"[2023-05-22 13:15:53, 2023-05-22 13:16:43, … 2023-05-24 10:12:17]","[37.0, 30.0, … 13.0]","[9775894, 9770145, … 9778444]","[6.0, 7.0, … 6.0]"


In [97]:
history_all = pl.concat([
    history_train.explode(pl.all().exclude('user_id')).join(
        history_val.explode(pl.all().exclude('user_id')), 
        on=['user_id', 'impression_time_fixed'], how='anti'),
    history_val.explode(pl.all().exclude('user_id')),
    behaviors_to_history(behaviors_val).explode(pl.all().exclude('user_id')),
]).sort(['user_id', 'impression_time_fixed'])\
.group_by('user_id').agg(pl.all())
history_all.head(2)

user_id,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
u32,list[datetime[μs]],list[f32],list[i32],list[f32]
10068,"[2023-04-27 14:22:26, 2023-04-27 18:51:53, … 2023-05-16 19:48:21]","[100.0, 100.0, … 100.0]","[9735753, 9739065, … 9759717]","[39.0, 21.0, … 7.0]"
10200,"[2023-05-14 05:46:27, 2023-05-14 05:46:44, … 2023-05-29 05:09:35]","[36.0, 24.0, … null]","[9764325, 9763923, … 9784952]","[16.0, 15.0, … 1.0]"


In [101]:
history_all['impression_time_fixed'].explode().dt.date().min(), history_all['impression_time_fixed'].explode().dt.date().max()

(datetime.date(2023, 4, 27), datetime.date(2023, 6, 1))

## Moving window

In [102]:
def moving_window_split_iterator(history: pl.DataFrame, behaviors: pl.DataFrame, window:int=4, window_val:int=2, stride:int=2, verbose=True):
    assert behaviors['impression_time'].is_sorted()
    
    
    all_dates = history['impression_time_fixed'].explode().dt.date().unique().append(
        behaviors['impression_time'].dt.date().unique()
    ).unique().sort().to_list()
    all_dates_map = {date: i for i, date in enumerate(all_dates)}
    if verbose:
        print(f'Date range: [{all_dates[0]}:{all_dates_map[all_dates[0]]} - {all_dates[-1]}:{all_dates_map[all_dates[-1]]}]')
    
    history_window_train_start_date = history['impression_time_fixed'].explode().min().date()    
    start_window_train_behavior_date = behaviors['impression_time'].min().date()
    last_date = behaviors['impression_time'].max().date()
    i = 0
    while  start_window_train_behavior_date + datetime.timedelta(days=window + window_val) <= last_date:
        end_window_train_behavior_date = start_window_train_behavior_date + datetime.timedelta(days=window)
        start_window_val_behavior_date  = end_window_train_behavior_date
        end_window_val_behavior_date = start_window_val_behavior_date + datetime.timedelta(days=window_val)
        
        history_window_val_start_date = history_window_train_start_date + datetime.timedelta(days=7)


        if verbose:
            print(f'Fold {i}: ')
            print(f'Train: [{all_dates_map[history_window_train_start_date]} - {all_dates_map[start_window_train_behavior_date]} - {all_dates_map[end_window_train_behavior_date]}]')
            print(f'Validation: [{all_dates_map[history_window_val_start_date]} - {all_dates_map[start_window_val_behavior_date]} - {all_dates_map[end_window_val_behavior_date]}]')
        
            
        behaviors_k_train = behaviors.filter(
            pl.col('impression_time') >= datetime.datetime.combine(start_window_train_behavior_date, datetime.time(7, 0, 0)),
            pl.col('impression_time') < datetime.datetime.combine(end_window_train_behavior_date, datetime.time(7, 0, 0)),
        )
        
        history_k_train = history.explode(pl.all().exclude('user_id')).filter(
            pl.col('impression_time_fixed') >= datetime.datetime.combine(history_window_train_start_date, datetime.time(7, 0, 0)),
            pl.col('impression_time_fixed') < datetime.datetime.combine(
                history_window_train_start_date + datetime.timedelta(days=21), datetime.time(7, 0, 0)),
        ).group_by('user_id').agg(pl.all())

        behaviors_k_val = behaviors.filter(
            pl.col('impression_time') >= datetime.datetime.combine(start_window_val_behavior_date, datetime.time(7, 0, 0)),
            pl.col('impression_time') < datetime.datetime.combine(end_window_val_behavior_date, datetime.time(7, 0, 0)),
        )
        
        history_k_val = history.explode(pl.all().exclude('user_id')).filter(
            pl.col('impression_time_fixed') >= datetime.datetime.combine(history_window_val_start_date, datetime.time(7, 0, 0)),
            pl.col('impression_time_fixed') < datetime.datetime.combine(
                history_window_val_start_date + datetime.timedelta(days=21), datetime.time(7, 0, 0)),
        ).group_by('user_id').agg(pl.all())
        
        start_window_train_behavior_date += datetime.timedelta(days=stride)
        history_window_train_start_date += datetime.timedelta(days=stride)
        i+=1
        
        yield history_k_train, behaviors_k_train, history_k_val, behaviors_k_val

In [103]:
behaviors_all = behaviors_train.vstack(behaviors_val).sort('impression_time').set_sorted('impression_time')
for i, (history_k_train, behaviors_k_train, history_k_val, behaviors_k_val) in enumerate(moving_window_split_iterator(history_all, behaviors_all, window=4, window_val=2, stride=2)):
    continue

Date range: [2023-04-27:0 - 2023-06-01:35]
Fold 0: 
Train: [0 - 21 - 25]
Validation: [7 - 25 - 27]
Fold 1: 
Train: [2 - 23 - 27]
Validation: [9 - 27 - 29]
Fold 2: 
Train: [4 - 25 - 29]
Validation: [11 - 29 - 31]
Fold 3: 
Train: [6 - 27 - 31]
Validation: [13 - 31 - 33]
Fold 4: 
Train: [8 - 29 - 33]
Validation: [15 - 33 - 35]
