In [1]:
import polars as pl
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
import numpy as np
import logging
import random

# Seed every random number generator the notebook relies on (Python, NumPy,
# TensorFlow) with the same value.
seed = 42
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Keep the cell outputs readable: silence AutoGraph and only let TensorFlow
# log at ERROR level.
tf.get_logger().setLevel(logging.ERROR)
tf.autograph.set_verbosity(0)

print(tf.__version__)

2024-06-19 09:30:10.718174: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-19 09:30:10.780875: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2.16.1


# RNN

In [3]:
import json

# Metadata stored alongside the RNN experiment; later cells read the
# categorical / numerical column lists from it.
info_path = '/home/ubuntu/experiments/rnn_conv_all_2024-06-17_14-48-01/info.json'
with open(info_path) as info_file:
    info = json.load(info_file)

In [6]:
from polimi.utils.tf_models.utils.build_sequences import build_history_seq, build_sequences_seq_iterator, N_CATEGORY, N_SENTIMENT_LABEL, N_SUBCATEGORY, N_TOPICS, N_HOUR_GROUP, N_WEEKDAY
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder
import joblib

# Raw validation click history and the article metadata used to describe it.
history = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/validation/history.parquet')
articles = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/articles.parquet')

# Per-user history sequences, built by the project helper.
history_seq = build_history_seq(history, articles)

# Pre-computed validation feature table (it must contain at least 'target',
# 'user_id', 'impression_id' and 'article', which are selected below).
behaviors = pl.read_parquet('/home/ubuntu/dset_complete/validation_ds.parquet')

# Feature lists recorded in the experiment's info.json.
categorical_columns = info['categorical_columns']
numerical_columns = info['numerical_columns']

# The fitted sklearn transformers are applied on a pandas copy of the frame.
behaviors_pandas = behaviors.to_pandas()

# Numerical features: map +/-inf to NaN, fill NaN with 0, then apply the
# power transformer saved with the experiment and store the result as float32.
xformer = joblib.load('/home/ubuntu/experiments/rnn_conv_all_2024-06-17_14-48-01/power_transformer.joblib')
behaviors_pandas[numerical_columns] = behaviors_pandas[numerical_columns].replace([-np.inf, np.inf], np.nan).fillna(0)
behaviors_pandas[numerical_columns] = xformer.transform(behaviors_pandas[numerical_columns]).astype(np.float32)

# Categorical features: cast to string, then replace every value that is not
# among the encoder's known categories for that column with 'NA' before encoding.
# NOTE(review): astype(str) runs before fillna('NA'), so missing values have
# presumably already become strings by the time fillna runs - confirm this
# matches the training-time preprocessing.
# NOTE(review): this assumes 'NA' is a category the encoder can handle - verify.
encoder = joblib.load('/home/ubuntu/experiments/rnn_conv_all_2024-06-17_14-48-01/ordinal_encoder.joblib')
for i, f in enumerate(categorical_columns):
    behaviors_pandas[f] = behaviors_pandas[f].astype(str).fillna('NA')
    categories_val = list(behaviors_pandas[f].unique())
    unknown_categories = [x for x in categories_val if x not in encoder.categories_[i]]
    behaviors_pandas[f] = behaviors_pandas[f].replace(list(unknown_categories), 'NA')
behaviors_pandas[categorical_columns] = encoder.transform(behaviors_pandas[categorical_columns]).astype(np.int16)
# Keep the label / identifier columns from the original polars frame and attach
# the transformed features column-wise (this relies on the pandas copy having
# kept the same row order).
behaviors = behaviors.select(['target', 'user_id', 'impression_id', 'article']).hstack(pl.from_pandas(behaviors_pandas[numerical_columns + categorical_columns]))

# Number of categories the encoder knows for each categorical feature.
vocabulary_sizes = {
    feature: len(encoder.categories_[i]) for i, feature in enumerate(categorical_columns)
}

In [None]:
def build_sequences_cls_iterator_test(history_seq: pl.DataFrame, behaviors: pl.DataFrame, window: int,
                                      categorical_columns: list[str], numerical_columns: list[str], record_order: list):
    """Yield one ``(features, target)`` sample per row of ``behaviors``, user by user.

    Users are visited in the row order of ``history_seq``; for each user the
    rows of ``behaviors`` with the same ``user_id`` are emitted in their
    original order. Users that have no row in ``behaviors`` produce nothing.

    Args:
        history_seq: one row per user with a ``user_id`` column and list-valued
            feature columns. Columns are grouped by name prefix ('topics',
            'subcategory', 'category', 'weekday', 'hour_group',
            'sentiment_label'); 'scroll_percentage', 'read_time' and 'premium'
            are matched by exact name.
        behaviors: frame with 'user_id', 'impression_id', 'article', 'target'
            plus every column in ``numerical_columns`` and ``categorical_columns``.
        window: number of most recent history steps kept; shorter histories are
            left-padded with 0.
        categorical_columns: behaviors columns emitted as individual scalar inputs.
        numerical_columns: behaviors columns emitted together under the
            'numerical_columns' key.
        record_order: list that receives ``[impression_id, user_id, article, target]``
            for every yielded sample, in yield order. Entries are appended on
            every iteration of the generator, so the caller must reset the list
            before iterating again or it will no longer line up with the samples.

    Yields:
        ``(inputs, target)`` where ``inputs`` maps 'numerical_columns', each
        categorical column name and each ``input_<group>`` history block to its
        value. History blocks are the selected feature rows transposed, i.e.
        presumably of shape ``(window, n_columns_in_group)`` - TODO confirm.
    """
    all_features = history_seq.drop('user_id').columns

    multi_one_hot_cols = ['topics', 'subcategory']
    categorical_cols = ['category', 'weekday', 'hour_group', 'sentiment_label']
    # For every input group, the positions (within all_features) of its columns.
    name_idx_dict = {
        key: [i for i, col in enumerate(all_features) if col.startswith(key)]
        for key in multi_one_hot_cols + categorical_cols
    }
    numerical_cols = ['scroll_percentage', 'read_time', 'premium']
    name_idx_dict['numerical'] = [i for i, col in enumerate(all_features) if col in numerical_cols]

    # Left-pad each list with `window` zeros (reverse, extend, reverse back) and
    # keep only the last `window` elements, so every list has length `window`.
    mask = 0
    history_seq_trucated = history_seq.with_columns(
        pl.all().exclude('user_id').list.reverse().list.eval(pl.element().extend_constant(mask, window)).list.reverse().list.tail(window).name.keep()
    )
    # The row unpacking below reads user_id from position 0 and the features in
    # all_features order; make that layout explicit instead of relying on the
    # incoming column order.
    history_seq_trucated = history_seq_trucated.select(['user_id'] + all_features)

    len_numerical = len(numerical_columns)
    feature_columns = numerical_columns + categorical_columns

    # Split behaviors by user once instead of scanning the whole frame for every
    # history row. maintain_order keeps each user's rows in their original order.
    behaviors_by_user = {}
    for part in behaviors.partition_by('user_id', maintain_order=True):
        if part.height == 0:
            continue
        part_user_id = part['user_id'][0]
        if part_user_id is not None:  # a null user_id never matched the equality filter
            behaviors_by_user[part_user_id] = part

    for user_history in history_seq_trucated.to_numpy():
        user_id = user_history[0]
        behaviors_user = behaviors_by_user.get(user_id)
        if behaviors_user is None:
            # No impression for this user: nothing to yield.
            continue

        # Stack the per-feature lists into a (n_features, window) array and
        # slice out each input group, transposed so time is the leading axis.
        x = np.array([np.array(x_i) for x_i in user_history[1:]])
        res_x = {f'input_{key}': x[idx, :].T for key, idx in name_idx_dict.items()}

        X = behaviors_user.select(feature_columns).to_numpy()
        y = behaviors_user.select('target').to_numpy().flatten()
        impression_ids = behaviors_user['impression_id'].to_list()
        articles = behaviors_user['article'].to_list()
        for i in range(behaviors_user.height):
            # Remember which row each sample came from so predictions can be
            # joined back to impressions afterwards.
            record_order.append([impression_ids[i], user_id, articles[i], y[i]])
            yield {
                'numerical_columns': X[i, :len_numerical],
                **{c: X[i, j + len_numerical] for j, c in enumerate(categorical_columns)},
                **res_x
            }, y[i]

In [7]:
from polimi.utils.tf_models.utils.build_sequences import build_sequences_cls_iterator

record_order = []
window = 30

# Specs of the impression-level inputs: one float vector holding all numerical
# columns, plus one int16 scalar per categorical column.
behaviors_spec = {
    'numerical_columns': tf.TensorSpec(shape=(len(numerical_columns),), dtype=tf.float32),
    **{c: tf.TensorSpec(shape=(), dtype=tf.int16) for c in categorical_columns},
}

# Specs of the user-history inputs: `window` time steps each.
history_spec = {
    'input_topics': tf.TensorSpec(shape=(window, N_TOPICS + 1), dtype=tf.int32),
    'input_category': tf.TensorSpec(shape=(window, 1), dtype=tf.int32),
    'input_subcategory': tf.TensorSpec(shape=(window, N_SUBCATEGORY + 1), dtype=tf.int32),
    'input_weekday': tf.TensorSpec(shape=(window, 1), dtype=tf.int32),
    'input_hour_group': tf.TensorSpec(shape=(window, 1), dtype=tf.int32),
    'input_sentiment_label': tf.TensorSpec(shape=(window, 1), dtype=tf.int32),
    'input_numerical': tf.TensorSpec(shape=(window, 3), dtype=tf.float32),
}

# Scalar target of each sample.
target_spec = tf.TensorSpec(shape=(), dtype=tf.float32)


def _validation_samples():
    """Create a fresh sample generator over the current notebook globals."""
    return build_sequences_cls_iterator_test(
        history_seq, behaviors, window=window, numerical_columns=numerical_columns,
        categorical_columns=categorical_columns, record_order=record_order,
    )


validation_dataset = tf.data.Dataset.from_generator(
    _validation_samples,
    output_signature=({**behaviors_spec, **history_spec}, target_spec),
).batch(512)

In [8]:
from polimi.utils.tf_models import TemporalHistorySequenceModel, TemporalHistoryClassificationModel

# One entry per history-sequence input. Each tuple is passed through to the
# project model unchanged; presumably (vocabulary size, embedding dim,
# multi-hot flag) - TODO confirm against the model class.
history_embedding_dims = {
    'input_topics': (N_TOPICS + 1, 20, True),
    'input_subcategory': (N_SUBCATEGORY + 1, 20, True),
    'input_category': (N_CATEGORY + 1, 20, False),
    'input_weekday': (N_WEEKDAY, 3, False),
    'input_hour_group': (N_HOUR_GROUP, 3, False),
    'input_sentiment_label': (N_SENTIMENT_LABEL + 1, 2, False)
}

model = TemporalHistoryClassificationModel(
    # impression-level features
    categorical_features=categorical_columns,
    numerical_features=numerical_columns,
    vocabulary_sizes=vocabulary_sizes,
    # history-sequence features
    seq_embedding_dims=history_embedding_dims,
    seq_numerical_features=['scroll_percentage', 'read_time', 'premium'],
    # recurrent part
    n_recurrent_layers=1,
    recurrent_embedding_dim=128,
    # regularisation
    l1_lambda=1e-4,
    l2_lambda=1e-4,
    # dense part
    dense_n_layers=2,
    dense_start_units=256,
    dense_units_decay=2,
    dense_activation='swish',
    dense_dropout_rate=0.2,
)

# Build the underlying Keras model, then restore the trained weights into it.
model._build()
model.model.load_weights('/home/ubuntu/experiments/rnn_conv_all_2024-06-17_14-48-01/checkpoints/checkpoint.weights.h5')

In [9]:
# Iterating `validation_dataset` runs the sample generator, which appends one
# row to `record_order` per sample. Empty the list first so that re-running this
# cell (or any earlier pass over the dataset) cannot leave stale rows in front
# of the ones that belong to `predictions`.
record_order.clear()
predictions = model.predict(validation_dataset, batch_size=512).flatten()
# Later cells pair record_order[i] with predictions[i]; fail fast here rather
# than score misaligned rows.
assert len(record_order) == len(predictions), (
    f'record_order has {len(record_order)} rows but predictions has {len(predictions)}'
)

[1m5721/5721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3506s[0m 612ms/step


  self.gen.throw(typ, value, traceback)
2024-06-17 16:03:28.559503: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [10]:
from fastauc.fastauc.fast_auc import fast_numba_auc

# Attach each prediction to the row recorded for it while the generator ran,
# then gather the targets and scores of every impression into lists.
evaluation_ds = (
    pl.DataFrame(record_order, schema=['impression_id', 'user_id', 'article', 'target'])
    .with_columns(pl.Series(predictions).alias('prediction'))
    .group_by('impression_id')
    .agg(pl.col('target'), pl.col('prediction'))
)

# AUC is computed inside each impression and then averaged over impressions.
targets_per_impression = evaluation_ds['target'].to_list()
scores_per_impression = evaluation_ds['prediction'].to_list()
auc = np.mean([
    fast_numba_auc(np.array(labels).astype(bool), np.array(scores).astype(np.float32))
    for labels, scores in zip(targets_per_impression, scores_per_impression)
])
auc

0.8030083470322491

# Conv

In [2]:
import json

# Metadata stored alongside the Conv experiment; later cells read the
# categorical / numerical column lists from it.
info_path = '/home/ubuntu/experiments/rnn_conv_all_2024-06-18_18-44-22/info.json'
with open(info_path) as info_file:
    info = json.load(info_file)

In [3]:
from polimi.utils.tf_models.utils.build_sequences import build_history_seq, build_sequences_seq_iterator, N_CATEGORY, N_SENTIMENT_LABEL, N_SUBCATEGORY, N_TOPICS, N_HOUR_GROUP, N_WEEKDAY
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder
import joblib

# Raw validation click history and the article metadata used to describe it.
history = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/validation/history.parquet')
articles = pl.read_parquet('/home/ubuntu/dataset/ebnerd_small/articles.parquet')

# Per-user history sequences, built by the project helper.
history_seq = build_history_seq(history, articles)

# Pre-computed validation feature table (it must contain at least 'target',
# 'user_id', 'impression_id' and 'article', which are selected below).
behaviors = pl.read_parquet('/home/ubuntu/dset_complete/validation_ds.parquet')

# Feature lists recorded in the experiment's info.json.
categorical_columns = info['categorical_columns']
numerical_columns = info['numerical_columns']

# The fitted sklearn transformers are applied on a pandas copy of the frame.
behaviors_pandas = behaviors.to_pandas()

# Numerical features: map +/-inf to NaN, fill NaN with 0, then apply the
# power transformer saved with the experiment and store the result as float32.
xformer = joblib.load('/home/ubuntu/experiments/rnn_conv_all_2024-06-18_18-44-22/power_transformer.joblib')
behaviors_pandas[numerical_columns] = behaviors_pandas[numerical_columns].replace([-np.inf, np.inf], np.nan).fillna(0)
behaviors_pandas[numerical_columns] = xformer.transform(behaviors_pandas[numerical_columns]).astype(np.float32)

# Categorical features: cast to string, then replace every value that is not
# among the encoder's known categories for that column with 'NA' before encoding.
# NOTE(review): astype(str) runs before fillna('NA'), so missing values have
# presumably already become strings by the time fillna runs - confirm this
# matches the training-time preprocessing.
# NOTE(review): this assumes 'NA' is a category the encoder can handle - verify.
encoder = joblib.load('/home/ubuntu/experiments/rnn_conv_all_2024-06-18_18-44-22/ordinal_encoder.joblib')
for i, f in enumerate(categorical_columns):
    behaviors_pandas[f] = behaviors_pandas[f].astype(str).fillna('NA')
    categories_val = list(behaviors_pandas[f].unique())
    unknown_categories = [x for x in categories_val if x not in encoder.categories_[i]]
    behaviors_pandas[f] = behaviors_pandas[f].replace(list(unknown_categories), 'NA')
behaviors_pandas[categorical_columns] = encoder.transform(behaviors_pandas[categorical_columns]).astype(np.int16)
# Keep the label / identifier columns from the original polars frame and attach
# the transformed features column-wise (this relies on the pandas copy having
# kept the same row order).
behaviors = behaviors.select(['target', 'user_id', 'impression_id', 'article']).hstack(pl.from_pandas(behaviors_pandas[numerical_columns + categorical_columns]))

# Number of categories the encoder knows for each categorical feature.
vocabulary_sizes = {
    feature: len(encoder.categories_[i]) for i, feature in enumerate(categorical_columns)
}

In [4]:
def build_sequences_cls_iterator_test(history_seq: pl.DataFrame, behaviors: pl.DataFrame, window: int,
                                      categorical_columns: list[str], numerical_columns: list[str], record_order: list):
    """Yield one ``(features, target)`` sample per row of ``behaviors``, user by user.

    Users are visited in the row order of ``history_seq``; for each user the
    rows of ``behaviors`` with the same ``user_id`` are emitted in their
    original order. Users that have no row in ``behaviors`` produce nothing.

    Args:
        history_seq: one row per user with a ``user_id`` column and list-valued
            feature columns. Columns are grouped by name prefix ('topics',
            'subcategory', 'category', 'weekday', 'hour_group',
            'sentiment_label'); 'scroll_percentage', 'read_time' and 'premium'
            are matched by exact name.
        behaviors: frame with 'user_id', 'impression_id', 'article', 'target'
            plus every column in ``numerical_columns`` and ``categorical_columns``.
        window: number of most recent history steps kept; shorter histories are
            left-padded with 0.
        categorical_columns: behaviors columns emitted as individual scalar inputs.
        numerical_columns: behaviors columns emitted together under the
            'numerical_columns' key.
        record_order: list that receives ``[impression_id, user_id, article, target]``
            for every yielded sample, in yield order. Entries are appended on
            every iteration of the generator, so the caller must reset the list
            before iterating again or it will no longer line up with the samples.

    Yields:
        ``(inputs, target)`` where ``inputs`` maps 'numerical_columns', each
        categorical column name and each ``input_<group>`` history block to its
        value. History blocks are the selected feature rows transposed, i.e.
        presumably of shape ``(window, n_columns_in_group)`` - TODO confirm.
    """
    all_features = history_seq.drop('user_id').columns

    multi_one_hot_cols = ['topics', 'subcategory']
    categorical_cols = ['category', 'weekday', 'hour_group', 'sentiment_label']
    # For every input group, the positions (within all_features) of its columns.
    name_idx_dict = {
        key: [i for i, col in enumerate(all_features) if col.startswith(key)]
        for key in multi_one_hot_cols + categorical_cols
    }
    numerical_cols = ['scroll_percentage', 'read_time', 'premium']
    name_idx_dict['numerical'] = [i for i, col in enumerate(all_features) if col in numerical_cols]

    # Left-pad each list with `window` zeros (reverse, extend, reverse back) and
    # keep only the last `window` elements, so every list has length `window`.
    mask = 0
    history_seq_trucated = history_seq.with_columns(
        pl.all().exclude('user_id').list.reverse().list.eval(pl.element().extend_constant(mask, window)).list.reverse().list.tail(window).name.keep()
    )
    # The row unpacking below reads user_id from position 0 and the features in
    # all_features order; make that layout explicit instead of relying on the
    # incoming column order.
    history_seq_trucated = history_seq_trucated.select(['user_id'] + all_features)

    len_numerical = len(numerical_columns)
    feature_columns = numerical_columns + categorical_columns

    # Split behaviors by user once instead of scanning the whole frame for every
    # history row. maintain_order keeps each user's rows in their original order.
    behaviors_by_user = {}
    for part in behaviors.partition_by('user_id', maintain_order=True):
        if part.height == 0:
            continue
        part_user_id = part['user_id'][0]
        if part_user_id is not None:  # a null user_id never matched the equality filter
            behaviors_by_user[part_user_id] = part

    for user_history in history_seq_trucated.to_numpy():
        user_id = user_history[0]
        behaviors_user = behaviors_by_user.get(user_id)
        if behaviors_user is None:
            # No impression for this user: nothing to yield.
            continue

        # Stack the per-feature lists into a (n_features, window) array and
        # slice out each input group, transposed so time is the leading axis.
        x = np.array([np.array(x_i) for x_i in user_history[1:]])
        res_x = {f'input_{key}': x[idx, :].T for key, idx in name_idx_dict.items()}

        X = behaviors_user.select(feature_columns).to_numpy()
        y = behaviors_user.select('target').to_numpy().flatten()
        impression_ids = behaviors_user['impression_id'].to_list()
        articles = behaviors_user['article'].to_list()
        for i in range(behaviors_user.height):
            # Remember which row each sample came from so predictions can be
            # joined back to impressions afterwards.
            record_order.append([impression_ids[i], user_id, articles[i], y[i]])
            yield {
                'numerical_columns': X[i, :len_numerical],
                **{c: X[i, j + len_numerical] for j, c in enumerate(categorical_columns)},
                **res_x
            }, y[i]

In [5]:
from polimi.utils.tf_models.utils.build_sequences import build_sequences_cls_iterator

record_order = []
window = 30

# Specs of the impression-level inputs: one float vector holding all numerical
# columns, plus one int16 scalar per categorical column.
behaviors_spec = {
    'numerical_columns': tf.TensorSpec(shape=(len(numerical_columns),), dtype=tf.float32),
    **{c: tf.TensorSpec(shape=(), dtype=tf.int16) for c in categorical_columns},
}

# Specs of the user-history inputs: `window` time steps each.
history_spec = {
    'input_topics': tf.TensorSpec(shape=(window, N_TOPICS + 1), dtype=tf.int32),
    'input_category': tf.TensorSpec(shape=(window, 1), dtype=tf.int32),
    'input_subcategory': tf.TensorSpec(shape=(window, N_SUBCATEGORY + 1), dtype=tf.int32),
    'input_weekday': tf.TensorSpec(shape=(window, 1), dtype=tf.int32),
    'input_hour_group': tf.TensorSpec(shape=(window, 1), dtype=tf.int32),
    'input_sentiment_label': tf.TensorSpec(shape=(window, 1), dtype=tf.int32),
    'input_numerical': tf.TensorSpec(shape=(window, 3), dtype=tf.float32),
}

# Scalar target of each sample.
target_spec = tf.TensorSpec(shape=(), dtype=tf.float32)


def _validation_samples():
    """Create a fresh sample generator over the current notebook globals."""
    return build_sequences_cls_iterator_test(
        history_seq, behaviors, window=window, numerical_columns=numerical_columns,
        categorical_columns=categorical_columns, record_order=record_order,
    )


validation_dataset = tf.data.Dataset.from_generator(
    _validation_samples,
    output_signature=({**behaviors_spec, **history_spec}, target_spec),
).batch(512)

In [6]:
from polimi.utils.tf_models import TemporalConvolutionalHistoryClassificationModel

# One entry per history-sequence input. Each tuple is passed through to the
# project model unchanged; presumably (vocabulary size, embedding dim,
# multi-hot flag) - TODO confirm against the model class.
history_embedding_dims = {
    'input_topics': (N_TOPICS + 1, 20, True),
    'input_subcategory': (N_SUBCATEGORY + 1, 20, True),
    'input_category': (N_CATEGORY + 1, 20, False),
    'input_weekday': (N_WEEKDAY, 3, False),
    'input_hour_group': (N_HOUR_GROUP, 3, False),
    'input_sentiment_label': (N_SENTIMENT_LABEL + 1, 2, False)
}

model = TemporalConvolutionalHistoryClassificationModel(
    # impression-level features
    categorical_features=categorical_columns,
    numerical_features=numerical_columns,
    vocabulary_sizes=vocabulary_sizes,
    # history-sequence features
    seq_embedding_dims=history_embedding_dims,
    seq_numerical_features=['scroll_percentage', 'read_time', 'premium'],
    window_size=window,
    # convolutional part
    n_conv_layers=5,
    conv_filters=128,
    kernel_size=2,
    conv_activation='swish',
    # regularisation
    l1_lambda=1e-4,
    l2_lambda=1e-4,
    dropout_rate=0.2,
    # dense part
    dense_n_layers=4,
    dense_start_units=384,
    dense_units_decay=2,
    dense_activation='swish',
)

# Build the underlying Keras model, then restore the trained weights into it.
model._build()
model.model.load_weights('/home/ubuntu/experiments/rnn_conv_all_2024-06-18_18-44-22/checkpoints/checkpoint.weights.h5')

In [7]:
# Iterating `validation_dataset` runs the sample generator, which appends one
# row to `record_order` per sample. Empty the list first so that re-running this
# cell (or any earlier pass over the dataset) cannot leave stale rows in front
# of the ones that belong to `predictions`.
record_order.clear()
predictions = model.predict(validation_dataset, batch_size=512).flatten()
# Later cells pair record_order[i] with predictions[i]; fail fast here rather
# than score or save misaligned rows.
assert len(record_order) == len(predictions), (
    f'record_order has {len(record_order)} rows but predictions has {len(predictions)}'
)

[1m5721/5721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3873s[0m 676ms/step


  self.gen.throw(typ, value, traceback)
2024-06-19 10:37:37.538809: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [8]:
from fastauc.fastauc.fast_auc import fast_numba_auc

# Attach each prediction to the row recorded for it while the generator ran,
# then gather the targets and scores of every impression into lists.
evaluation_ds = (
    pl.DataFrame(record_order, schema=['impression_id', 'user_id', 'article', 'target'])
    .with_columns(pl.Series(predictions).alias('prediction'))
    .group_by('impression_id')
    .agg(pl.col('target'), pl.col('prediction'))
)

# AUC is computed inside each impression and then averaged over impressions.
targets_per_impression = evaluation_ds['target'].to_list()
scores_per_impression = evaluation_ds['prediction'].to_list()
auc = np.mean([
    fast_numba_auc(np.array(labels).astype(bool), np.array(scores).astype(np.float32))
    for labels, scores in zip(targets_per_impression, scores_per_impression)
])
auc

0.8128420126354572

In [9]:
# Save the row-level predictions (one row per yielded sample, in generator
# order) next to the Conv experiment's other artefacts.
(
    pl.DataFrame(record_order, schema=['impression_id', 'user_id', 'article', 'target'])
    .with_columns(pl.Series(predictions).alias('prediction'))
    .write_parquet('/home/ubuntu/experiments/rnn_conv_all_2024-06-18_18-44-22/predictions.parquet')
)