In [276]:
import logging
import time

import numpy as np
import matplotlib.pyplot as plt


import tensorflow as tf

import polars as pl

from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    truncate_history,
)

from tqdm import tqdm

from sklearn.model_selection import train_test_split

In [277]:
def positional_encoding(length, depth):
  """Build a fixed sinusoidal positional-encoding table.

  Args:
    length: number of positions (rows) to generate.
    depth: encoding width. Half of it is used for sine features and half for
      cosine features. For an odd `depth` the half-width `depth / 2` is
      fractional, `np.arange` rounds the count up, and the result ends up with
      `depth + 1` columns.

  Returns:
    A `tf.float32` tensor with one row per position. The sine block occupies
    the first half of the columns and the cosine block the second half (the
    two are concatenated, not interleaved).
  """
  half_depth = depth / 2

  # Column vector of positions: (length, 1).
  position_column = np.expand_dims(np.arange(length), axis=1)
  # Row vector of normalised feature indices in [0, 1): (1, half_depth).
  scaled_depth_row = np.expand_dims(np.arange(half_depth), axis=0) / half_depth

  # Geometric progression of frequencies, broadcast against the positions.
  inverse_frequencies = 1 / (10000**scaled_depth_row)   # (1, half_depth)
  angles = position_column * inverse_frequencies        # (length, half_depth)

  sin_cos_table = np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
  return tf.cast(sin_cos_table, dtype=tf.float32)

class PositionalEmbedding(tf.keras.layers.Layer):
  """Embedding lookup scaled by sqrt(d_model) plus a sinusoidal position signal.

  Args:
    vocab_size: number of rows in the embedding table.
    d_model: width of each embedding vector and of the positional encoding.
    max_length: number of positions pre-computed for the positional-encoding
      table. Inputs whose second axis is longer than this cannot be encoded,
      because the table is sliced to the input length in `call`. Defaults to
      2048, the value that used to be hard-coded.
  """

  def __init__(self, vocab_size, d_model, max_length=2048):
    super().__init__()
    self.d_model = d_model
    self.max_length = max_length
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
    # Pre-computed once; sliced to the actual sequence length on every call.
    self.pos_encoding = positional_encoding(length=max_length, depth=d_model)

  def call(self, x):
    """Embed `x` and add the positional encoding for its first `length` steps."""
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positional_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    # Broadcast the (length, d_model) slice over the batch axis.
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x
  
class BaseAttention(tf.keras.layers.Layer):
  """Shared building blocks for the attention layers in this notebook.

  Holds a multi-head attention layer plus the residual `Add` and
  `LayerNormalization` layers that the subclasses combine in their `call`.
  All keyword arguments are forwarded unchanged to
  `tf.keras.layers.MultiHeadAttention`.
  """

  def __init__(self, **kwargs):
    super(BaseAttention, self).__init__()
    mha_options = kwargs
    self.mha = tf.keras.layers.MultiHeadAttention(**mha_options)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

class CrossAttention(BaseAttention):
  """Attention where `x` supplies the queries and `context` the keys/values."""

  def call(self, x, context):
    """Attend from `x` to `context`, then apply residual add and layer norm.

    The attention scores of the most recent call are stored on
    `self.last_attn_scores`.
    """
    attended, scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)

    # Keep the scores around so they can be plotted later.
    self.last_attn_scores = scores

    residual_sum = self.add([x, attended])
    return self.layernorm(residual_sum)

class GlobalSelfAttention(BaseAttention):
  """Self-attention: `x` is used as query, key and value."""

  def call(self, x):
    """Run self-attention on `x`, then apply residual add and layer norm."""
    attended = self.mha(query=x, value=x, key=x)
    residual_sum = self.add([x, attended])
    return self.layernorm(residual_sum)

class FeedForward(tf.keras.layers.Layer):
  """Two-layer feed-forward block with dropout, residual add and layer norm.

  Args:
    d_model: output width of the block (units of the second Dense layer).
    dff: hidden width (units of the first, ReLU-activated Dense layer).
    dropout_rate: rate of the Dropout layer applied after the second Dense.
  """

  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    expand = tf.keras.layers.Dense(dff, activation='relu')
    project = tf.keras.layers.Dense(d_model)
    drop = tf.keras.layers.Dropout(dropout_rate)
    self.seq = tf.keras.Sequential([expand, project, drop])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    """Return layer_norm(x + seq(x))."""
    residual_sum = self.add([x, self.seq(x)])
    return self.layer_norm(residual_sum)

class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: global self-attention followed by a feed-forward block.

  Args:
    d_model: model width; used as the attention `key_dim` and as the
      feed-forward output width.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward block.
    dropout_rate: dropout rate used by both the attention layer and the
      feed-forward block.
  """

  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    # Forward dropout_rate so the feed-forward block does not silently fall
    # back to its own default when a different rate is requested.
    self.ffn = FeedForward(d_model, dff, dropout_rate)

  def call(self, x):
    """Apply self-attention and then the feed-forward block to `x`."""
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

class Encoder(tf.keras.layers.Layer):
  """Positional embedding, dropout, then a stack of `EncoderLayer` blocks.

  Args:
    num_layers: number of stacked `EncoderLayer` blocks.
    d_model: model width shared by the embedding and every block.
    num_heads: number of attention heads per block.
    dff: hidden width of each block's feed-forward part.
    vocab_size: size of the embedding table.
    dropout_rate: dropout rate for the embedding output and for each block.
  """

  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    layer_options = dict(d_model=d_model,
                         num_heads=num_heads,
                         dff=dff,
                         dropout_rate=dropout_rate)
    self.enc_layers = [EncoderLayer(**layer_options)
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    """Encode `x`; the original comments give its shape as (batch, seq_len).

    Returns a tensor of shape (batch_size, seq_len, d_model).
    """
    embedded = self.pos_embedding(x)  # (batch_size, seq_len, d_model)
    hidden = self.dropout(embedded)

    # The list holds exactly `num_layers` blocks; run them in order.
    for block in self.enc_layers:
      hidden = block(hidden)

    return hidden

class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: cross-attention over `context`, then feed-forward.

  Args:
    d_model: model width; used as the attention `key_dim` and as the
      feed-forward output width.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward block.
    dropout_rate: dropout rate used by both the attention layer and the
      feed-forward block.
  """

  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    # Forward dropout_rate so the feed-forward block does not silently fall
    # back to its own default when a different rate is requested.
    self.ffn = FeedForward(d_model, dff, dropout_rate)

  def call(self, x, context):
    """Attend from `x` to `context`, then apply the feed-forward block."""
    x = self.cross_attention(x=x, context=context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

class Decoder(tf.keras.layers.Layer):
  """Positional embedding, dropout, then a stack of `DecoderLayer` blocks.

  Args:
    num_layers: number of stacked `DecoderLayer` blocks.
    d_model: model width shared by the embedding and every block.
    num_heads: number of attention heads per block.
    dff: hidden width of each block's feed-forward part.
    vocab_size: size of the embedding table.
    dropout_rate: dropout rate for the embedding output and for each block.

  After each call, `last_attn_scores` holds the cross-attention scores of the
  final block.
  """

  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    layer_options = dict(d_model=d_model, num_heads=num_heads,
                         dff=dff, dropout_rate=dropout_rate)
    self.dec_layers = [DecoderLayer(**layer_options)
                       for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    """Decode `x` against `context`.

    The original comments give `x` as (batch, target_seq_len); the result has
    shape (batch_size, target_seq_len, d_model).
    """
    embedded = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)
    hidden = self.dropout(embedded)

    # The list holds exactly `num_layers` blocks; run them in order.
    for block in self.dec_layers:
      hidden = block(hidden, context)

    # Expose the scores of the final block for later inspection.
    final_block = self.dec_layers[-1]
    self.last_attn_scores = final_block.last_attn_scores

    return hidden

class Split_tensor(tf.keras.layers.Layer):
  """Keras layer wrapper around `tf.split` along axis 1.

  Args:
    num_of_splits: value passed to `tf.split` as `num_or_size_splits`.
  """

  def __init__(self, num_of_splits):
    super().__init__()
    self.num_of_splits = num_of_splits

  def call(self, x):
    """Return the pieces of `x` obtained by splitting it along axis 1."""
    pieces = tf.split(x, num_or_size_splits=self.num_of_splits, axis=1)
    return pieces

In [278]:
class Transformer(tf.keras.Model):
  """Encoder/decoder click-probability model over a concatenated input row.

  Each input row is split in two along axis 1: the first half ("history") is
  run through the encoder, the second half ("item") through the decoder with
  the encoded history as context. The decoder output is average-pooled over
  the sequence axis and mapped to a single sigmoid probability.

  Args:
    num_layers: number of encoder blocks and of decoder blocks.
    d_model: model width.
    num_heads: number of attention heads per block.
    dff: hidden width of the feed-forward blocks.
    input_vocab_size: embedding-table size for the decoder; the encoder uses
      half of it.
    target_vocab_size: accepted for interface compatibility; not used by
      this class.
    dropout_rate: dropout rate passed to the encoder and decoder.

  NOTE(review): in this notebook the input rows are concatenated float
  embedding vectors, while the encoder and decoder both start with an
  `Embedding` lookup, which is designed for integer ids. Confirm this is the
  intended way to feed the vectors.
  """

  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.input_vocab_size=input_vocab_size
    self.split_tensor = Split_tensor(num_of_splits=2)
    self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=int(input_vocab_size/2),
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)
    self.flatten = tf.keras.layers.GlobalAveragePooling1D(name='avg_pool')
    self.final_layer = tf.keras.layers.Dense(units=1, name='dense')
    self.sigmoid = tf.keras.layers.Activation('sigmoid', name='Sigmoid')

  def _predict_click_probability(self, features):
    """Shared forward pass used by both `_build` and `call`.

    Splits `features` into two halves along axis 1 and returns the sigmoid
    output of shape (batch_size, 1).
    """
    history, item = self.split_tensor(features)
    history = self.encoder(history)     # (batch_size, context_len, d_model)
    item = self.decoder(item, history)  # (batch_size, target_len, d_model)
    pooled = self.flatten(item)         # (batch_size, d_model)
    logits = self.final_layer(pooled)   # (batch_size, 1)
    return self.sigmoid(logits)         # (batch_size, 1)

  def _build(self,input_shape):
    """Wrap the layers in a compiled functional Keras model.

    Args:
      input_shape: shape of one sample without the batch axis, e.g. `(1536,)`.

    Returns:
      A `tf.keras.Model` compiled with binary cross-entropy on probabilities,
      Adam (learning rate 0.01) and a ROC-AUC metric named `auc`.
    """
    input_layer = tf.keras.layers.Input(shape=input_shape, name='Input')
    output = self._predict_click_probability(input_layer)
    model = tf.keras.Model(inputs=input_layer, outputs=output, name='model')

    metrics=[tf.keras.metrics.AUC(curve='ROC', name='auc')]
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
    model.compile(
                # The model already ends in a sigmoid, hence from_logits=False.
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                optimizer=optimizer,
                metrics=metrics)
    return model

  def call(self, inputs):
    """Run the forward pass on the first element of `inputs`.

    Args:
      inputs: a pair `(context, x)`. To use a Keras model with `.fit` all
        inputs must be passed in the first argument, so both arrive packed
        together. `context` is the concatenated feature row; `x` is not used
        in the computation.

    Returns:
      A pair `(probabilities, x)`: the sigmoid output of shape
      (batch_size, 1) and the second input element, returned unchanged.
    """
    context, x  = inputs
    probabilities = self._predict_click_probability(context)
    return probabilities, x

In [279]:
# Number of negative samples drawn per clicked article in the wu2019 sampling.
NPRATIO = 2

# Root folder of the datasets; change this one value when running elsewhere.
DATA_DIR = '/home/ubuntu/dataset'

# Impression logs, pre-computed article vectors and per-user click history.
behaviors = pl.read_parquet(f'{DATA_DIR}/ebnerd_demo/train/behaviors.parquet')
embeddings = pl.read_parquet(f'{DATA_DIR}/Ekstra_Bladet_contrastive_vector/contrastive_vector.parquet')
history = pl.read_parquet(f'{DATA_DIR}/ebnerd_demo/train/history.parquet')


In [280]:
# Number of history rows processed per chunk when averaging user embeddings.
HISTORY_SLICE_SIZE = 1000

# Name of the vector column as it is stored in the embeddings frame right now.
# Captured before any renaming so the final rename below keeps working when
# this cell is run again after `embeddings` has already been renamed.
embedding_col = embeddings.columns[1]

# One row per (impression, in-view article) with its binary click label.
behaviors_labels = (
    behaviors
    .select(['impression_id', 'article_ids_inview', 'article_ids_clicked'])
    .pipe(create_binary_labels_column, shuffle=True, seed=123)
    .drop('article_ids_clicked')
    .explode(['article_ids_inview', 'labels'])
    .rename({'article_ids_inview': 'article'})
)

# Negative-sampled impressions, one row per candidate article, re-joined with
# the click labels and the article vectors.
behaviors_subsample = (
    behaviors
    .pipe(create_binary_labels_column, shuffle=True, seed=123)
    .select(['impression_id', 'user_id', 'article_ids_inview', 'article_ids_clicked', 'labels'])
    .pipe(
        sampling_strategy_wu2019, npratio=NPRATIO, shuffle=False, with_replacement=True, seed=123
    )
    .explode('article_ids_inview')
    .drop('article_ids_clicked')
    .rename({'article_ids_inview': 'article'})
    .select(['impression_id', 'user_id', 'article'])
    # Align the key dtypes with the frames joined below.
    .with_columns(pl.col('user_id').cast(pl.UInt32), pl.col('article').cast(pl.Int32))
    .join(behaviors_labels, on=['impression_id', 'article'], how='left')
    .join(embeddings, left_on='article', right_on='article_id', how='left')
)

embeddings = embeddings.rename({embeddings.columns[1]: 'item_embedding'})
embedding_len = len(embeddings['item_embedding'].limit(1).item())

# Ceiling division: a trailing partial slice is still one iteration, so floor
# division would under-report the total and tqdm would drop its progress bar.
num_history_slices = (history.shape[0] + HISTORY_SLICE_SIZE - 1) // HISTORY_SLICE_SIZE

# User embedding = element-wise mean of the vectors of the articles in the
# user's history, computed slice by slice.
user_embed = pl.concat(
    rows.select('user_id', 'article_id_fixed')
    .explode('article_id_fixed')
    .rename({'article_id_fixed': 'article_id'})
    .join(embeddings, on='article_id')
    # Spread the vector into one column per dimension so it can be averaged.
    .with_columns(pl.col("item_embedding").list.to_struct())
    .unnest("item_embedding")
    .group_by('user_id')
    .agg([pl.col(f'field_{i}').mean().cast(pl.Float32) for i in range(embedding_len)])
    # Pack the per-dimension means back into a single list column.
    .with_columns(
        pl.concat_list([f"field_{i}" for i in range(embedding_len)]).alias('user_embedding')
    )
    .select('user_id', 'user_embedding')
    for rows in tqdm(history.iter_slices(HISTORY_SLICE_SIZE), total=num_history_slices)
)

# Final training table: label, item vector and user vector per candidate.
behaviors_labels = (
    behaviors_subsample
    .join(user_embed, on='user_id', how='left')
    .drop(['impression_id', 'user_id', 'article'])
    .rename({embedding_col: 'item_embedding'})
)
print(behaviors_labels)

2it [00:01,  1.29it/s]                       

shape: (74_664, 3)
┌────────┬─────────────────────────────────┬─────────────────────────────────┐
│ labels ┆ item_embedding                  ┆ user_embedding                  │
│ ---    ┆ ---                             ┆ ---                             │
│ i8     ┆ list[f32]                       ┆ list[f32]                       │
╞════════╪═════════════════════════════════╪═════════════════════════════════╡
│ 0      ┆ [-0.00917, -0.012778, … 0.0131… ┆ [-0.006218, 0.017533, … 0.0098… │
│ 0      ┆ [0.048509, -0.006538, … 0.0087… ┆ [-0.006218, 0.017533, … 0.0098… │
│ 1      ┆ [-0.081096, 0.086189, … 0.0136… ┆ [-0.006218, 0.017533, … 0.0098… │
│ 0      ┆ [0.003116, 0.01738, … 0.040633… ┆ [-0.009878, 0.001368, … 0.0186… │
│ 0      ┆ [0.007309, -0.026007, … -0.036… ┆ [-0.009878, 0.001368, … 0.0186… │
│ …      ┆ …                               ┆ …                               │
│ 0      ┆ [-0.073725, 0.061391, … -0.052… ┆ [0.000206, 0.013822, … 0.01624… │
│ 1      ┆ [-0.003845, -0.017809,




In [281]:
# Model input: user vector followed by item vector, flattened to one numeric
# column per dimension and handed over as a pandas DataFrame.
dataset = (
    behaviors_labels
    .select('user_embedding', 'item_embedding')
    .with_columns(
        pl.concat_list([pl.col('user_embedding', 'item_embedding')]).alias('input')
    )
    .select('input')
    .with_columns(pl.col("input").list.to_struct())
    .unnest("input")
    .to_pandas()
)

# Matching click labels, kept as a single-column pandas DataFrame.
labels = behaviors_labels.select('labels').to_pandas()

In [282]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64
# Fraction of rows held out for validation. This is the value scikit-learn
# falls back to when neither test_size nor train_size is given.
VAL_FRACTION = 0.25
# Same seed as the sampling steps, so the split is reproducible across runs.
SPLIT_SEED = 123

def make_batches(ds, shuffle=True):
  """Batch and prefetch a `tf.data.Dataset`, optionally shuffling it first.

  Args:
    ds: dataset of individual examples.
    shuffle: when true, shuffle with a buffer of `BUFFER_SIZE` before batching.

  Returns:
    The dataset batched into `BATCH_SIZE` examples with prefetching enabled.
  """
  if shuffle:
    ds = ds.shuffle(BUFFER_SIZE)
  return ds.batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)

train_data, val_data, train_labels, val_labels = train_test_split(
    dataset, labels,
    test_size=VAL_FRACTION,
    shuffle=True, stratify=labels,
    random_state=SPLIT_SEED)

train_batch = tf.data.Dataset.from_tensor_slices((train_data.to_numpy(), train_labels.to_numpy()))
val_batch = tf.data.Dataset.from_tensor_slices((val_data.to_numpy(), val_labels.to_numpy()))
train_batches = make_batches(train_batch)
# Validation metrics do not depend on example order, so skip the shuffle.
val_batches = make_batches(val_batch, shuffle=False)

In [283]:
# Transformer hyperparameters, consumed when the model is constructed.
d_model = 128        # model width (embedding size and attention key_dim)
dff = 512            # hidden width of the feed-forward blocks
num_layers = 4       # encoder blocks and decoder blocks
num_heads = 8        # attention heads per block
dropout_rate = 0.1   # dropout rate passed to encoder and decoder

In [284]:
# Assemble the model; the vocabulary size covers the concatenated
# user + item vector (two blocks of `embedding_len` each).
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=embedding_len * 2,
    target_vocab_size=1,
    dropout_rate=dropout_rate,
)

# Per-sample shape (everything after the batch axis).
input_shape = train_data.shape[1:]
print(input_shape)

model = transformer._build(input_shape=input_shape)

# Train for a single epoch directly on the pandas frames.
model.fit(
    x=train_data,
    y=train_labels,
    epochs=1,
    batch_size=128,
    validation_data=(val_data, val_labels),
)


(1536,)
[1m  5/438[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:17:15[0m 11s/step - auc: 0.5166 - loss: 1.9829

KeyboardInterrupt: 

: 