In [1]:
import polars as pl
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
import numpy as np
import logging
import random

seed = 42
np.random.seed(seed)
random.seed(seed)
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.random.set_seed(seed)
print(tf.__version__)

2024-06-15 00:33:08.143482: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-15 00:33:08.207859: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2.16.1


In [2]:
from polimi.utils.tf_models import TemporalHistorySequenceModel, TemporalHistoryClassificationModel
from polimi.utils._polars import reduce_polars_df_memory_size

history = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/train/history.parquet')
behaviors = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/train/behaviors.parquet')
articles = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/articles.parquet')

topics = articles['topics'].explode().unique().drop_nans().drop_nulls().sort().to_frame().with_row_index()
category = articles['category'].unique().drop_nans().drop_nulls().sort().to_frame().with_row_index(offset=1)
subcategory = articles['subcategory'].explode().unique().drop_nans().drop_nulls().sort().to_frame().with_row_index()
mask = 0

articles = articles.select(['article_id', 'category', 'subcategory', 'premium', 'topics'])\
    .with_columns(
        pl.col('topics').fill_null(pl.lit([])),
        pl.col('subcategory').fill_null(pl.lit([]))
    )\
    .with_columns(
        pl.col('topics').list.eval(pl.element().replace(topics['topics'], topics['index'], default=None)).list.drop_nulls(),
        pl.col('category').replace(category['category'], category['index'], default=None).fill_null(mask),
        pl.col('subcategory').list.eval(pl.element().replace(subcategory['subcategory'], subcategory['index'], default=None)).list.drop_nulls(),
        pl.col('premium').cast(pl.Int8)
)

dummies_topics = articles.select('article_id', 'topics').explode('topics').drop_nulls().to_dummies(columns=['topics'])\
    .group_by('article_id').agg(pl.all().sum())
dummies_subcategories = articles.select('article_id', 'subcategory').explode('subcategory').drop_nulls().to_dummies(columns=['subcategory'])

articles = articles.join(dummies_topics, on='article_id', how='left')\
    .join(dummies_subcategories, on='article_id', how='left')\
    .drop('topics', 'subcategory')
    
one_hot_cols = [col for col in articles.columns if col.startswith('topics_') or col.startswith('subcategory_')]
articles = articles.with_columns(
    pl.col(one_hot_cols).fill_null(0)
)

df = pl.concat([
    slice.explode(pl.all().exclude('user_id'))\
        .with_columns(
            pl.col('scroll_percentage_fixed').fill_null(0.),
            pl.col('read_time_fixed').fill_null(0.),
        )\
        .with_columns(
            (pl.col('impression_time_fixed').dt.hour() // 4).alias('hour_group'),
            pl.col('impression_time_fixed').dt.weekday().alias('weekday'),
        ).drop('impression_time_fixed')\
        .rename({'scroll_percentage_fixed': 'scroll_percentage', 'read_time_fixed': 'read_time'})
        .join(articles, left_on='article_id_fixed', right_on='article_id', how='left').drop('article_id_fixed')\
        .group_by('user_id').agg(pl.all())
    for slice in history.iter_slices(10000)
])

cols = df.columns
topics_cols = sorted([col for col in cols if col.startswith('topics_')], key=lambda x: int(x.split('_')[-1]))
subcategory_cols = sorted([col for col in cols if col.startswith('subcategory_')], key=lambda x: int(x.split('_')[-1]))
all_others = set(cols) - set(topics_cols) - set(subcategory_cols) - {'user_id'}
cols = ['user_id'] + list(all_others) + topics_cols + subcategory_cols
df = df.select(cols)
df = reduce_polars_df_memory_size(df)

df.head(1)

Memory usage of dataframe is 3066.15 MB
Memory usage after optimization is: 842.00 MB
Decreased by 72.5%


user_id,premium,weekday,category,hour_group,read_time,scroll_percentage,topics_0,topics_1,topics_2,topics_3,topics_4,topics_5,topics_6,topics_7,topics_8,topics_9,topics_10,topics_11,topics_12,topics_13,topics_14,topics_15,topics_16,topics_17,topics_18,topics_19,topics_20,topics_21,topics_22,topics_23,topics_24,topics_25,topics_26,topics_27,topics_28,topics_29,…,subcategory_137,subcategory_138,subcategory_139,subcategory_140,subcategory_141,subcategory_142,subcategory_143,subcategory_144,subcategory_145,subcategory_146,subcategory_147,subcategory_148,subcategory_149,subcategory_150,subcategory_151,subcategory_152,subcategory_153,subcategory_154,subcategory_155,subcategory_156,subcategory_157,subcategory_158,subcategory_159,subcategory_160,subcategory_161,subcategory_162,subcategory_163,subcategory_164,subcategory_165,subcategory_166,subcategory_167,subcategory_168,subcategory_169,subcategory_170,subcategory_171,subcategory_172,subcategory_173
u32,list[i8],list[i8],list[u8],list[i8],list[f32],list[f32],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],…,list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8],list[i8]
377904,"[0, 0, … 0]","[4, 4, … 3]","[4, 4, … 7]","[4, 4, … 3]","[6.0, 53.0, … 34.0]","[29.0, 100.0, … 44.0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 1]","[1, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[1, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 1]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 1]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]",…,"[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 0]"


In [3]:
from tqdm import tqdm

def build_sequences_seq_to_one(df: pl.DataFrame, w: int, stride: int):
    all_features = df.drop('user_id').columns
    singular_cols = ['topics', 'subcategory', 'category', 'weekday', 'hour_group']
    name_idx_dict = {key: [i for i, col in enumerate(all_features) if col.startswith(key)] for key in singular_cols}
    numerical_cols = ['scroll_percentage', 'read_time', 'premium']
    name_idx_dict['numerical'] = [i for i, col in enumerate(all_features) if col in numerical_cols]
        
    res = {key: ([], []) for key in name_idx_dict.keys()}

    for user_df in tqdm(df.partition_by('user_id')):
        x = user_df.drop('user_id').to_numpy()[0]
        x = np.array([np.array(x_i) for x_i in x])
                
        i = 0
        if i + w >= x.shape[1]:
            # in case history is shorter than the window then we pad it and select the last element as target
            pad_width = w - x[:, :-1].shape[1]
            pad_m = np.zeros((x.shape[0], pad_width))
            padded_x = np.concatenate((pad_m, x[:, :-1]), axis=1)
            y_i = x[:, -1]
            
            for key, idx in name_idx_dict.items():
                res[key][0].append(padded_x[idx, :].T)
                res[key][1].append(y_i[idx].T)
            
        else:
            while i + w < x.shape[1]:
                # in case history is larger than the window then we select the window and the target randomly between the next elements
                x_i = x[:, i:i+w]
                target_random_id = np.random.randint(i+w, x.shape[1])
                y_i = x[:, target_random_id]
                
                for key, idx in name_idx_dict.items():
                    res[key][0].append(x_i[idx, :].T)
                    res[key][1].append(y_i[idx].T)
                
                i+=stride
                         
            #TODO: add padding for the last sequence, if we want to keep it
                

    for key in res.keys():
        res[key] = (np.array(res[key][0]), np.array(res[key][1]))
    
    return res

In [4]:
train_data = build_sequences_seq_to_one(df[:100], w=20, stride=5)
train_data['topics'][0].shape

100%|██████████| 100/100 [00:00<00:00, 147.62it/s]


(5863, 20, 78)

In [5]:
from polimi.utils.tf_models import TemporalHistorySequenceModel, TemporalHistoryClassificationModel
from polimi.utils._polars import reduce_polars_df_memory_size

model = TemporalHistorySequenceModel(
    seq_embedding_dims={
        # adding, for the moment, one dim more to cover missings in non one-hot vectors
        'topics': (78, 10, True),
        'subcategory': (174, 10, True),
        'category': (26, 10, False),
        'weekday': (8, 3, False),
        'hour_group': (7, 3, False),
    },
    seq_numerical_features=['scroll_percentage', 'read_time', 'premium'],
    n_recurrent_layers=1,
    recurrent_embedding_dim=64,
    l1_lambda=1e-4,
    l2_lambda=1e-4,
)

In [7]:
model.fit(
    train_ds=train_data,
    batch_size=64,
    epochs=10,
    # target for (topics, subcategory, category)
    loss=[tfk.losses.BinaryCrossentropy(), tfk.losses.BinaryCrossentropy(), tfk.losses.CategoricalCrossentropy()],
    loss_weights=[0.5, 0.1, 0.4],
    optimizer=tfk.optimizers.Adam(learning_rate=1e-4)
)

Epoch 1/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - loss: 5.4941
Epoch 2/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - loss: 4.7585
Epoch 3/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 4.6179
Epoch 4/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 4.5613
Epoch 5/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 4.5797
Epoch 6/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 4.5396
Epoch 7/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 4.3560
Epoch 8/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 4.0833
Epoch 9/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 4.2371
Epoch 10/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 4.2991