In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Recsys2022 Challenge 

The NVIDIA-Merlin team participated in the [RecSys 2022 Challenge](http://www.recsyschallenge.com/2022/index.html) and secured 3rd place. This notebook demonstrates several of the techniques used in the solution.

### Learning Objective
In this notebook, we will learn the importance of the concepts that improved the results of the competition significantly.

- ##### Label smoothing
     When the probabilities predicted by a classification model are higher than its accuracy, we say the model is overconfident. Label smoothing counteracts this by transforming the one-hot encoded labels into smoothed labels.
$$  \begin{array}{l}
y_{l} \ =\ ( 1\ -\ \alpha \ ) \ *\ y_{o} \ +\ ( \alpha \ /\ L)\\
\alpha :\ Label\ smoothing\\
L:\ Total\ number\ of\ label\ classes\\
y_{o} :\ One-hot\ encoded\ label\ vector
\end{array}
$$
When α is 0, we have the original one-hot encoded labels, and as α increases, we move towards smoothed labels. Read [this](https://arxiv.org/abs/1906.02629) paper to learn more about it.


- ##### Temperature Scaling
    Similar to Label Smoothing, Temperature Scaling is used to reduce the overconfidence of a model. Here, we divide the logits (the inputs to the softmax function) by a scalar parameter (T). For more information on Temperature Scaling, read [this](https://arxiv.org/pdf/1706.04599.pdf) paper.
$$ softmax\ =\ \frac{e\ ^{( z_{i} \ /\ \ T)}}{\sum _{j} \ e^{( z_{j} \ /\ T)} \ } $$


- ##### Weight Tying
In this technique, the weights of the embedding layer, which converts the input into embeddings, are reused as the weights of the softmax layer, which converts the hidden-layer output into the output scores. This drastically reduces the number of parameters and allows the model to train better. For more information, read [this](https://arxiv.org/pdf/1608.05859v3.pdf) paper.

In [None]:
import os
# os._exit() terminates the interpreter process immediately, so executing this
# cell stops the notebook kernel and discards all in-memory state.
# NOTE(review): presumably meant to release GPU memory between stages; with
# "Run All" execution ends here -- confirm this cell belongs at this position.
if __name__=='__main__':
    os._exit(0)

In [1]:
# Shell escape: list the GPUs on this machine with their current utilisation.
!nvidia-smi

Thu Aug 11 06:54:36 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   34C    P0    47W / 163W |  17624MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   36C    P0    73W / 163W |  20370MiB / 32510MiB |     22%      Default |
|       

In [2]:
import os

# Expose only the GPU with index 2 to CUDA. This cell runs before the GPU
# libraries are imported in the next cell.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})

In [3]:
import cupy
import cudf
import dask_cudf
import numpy as np
import pandas as pd 

import nvtabular as nvt
from merlin.dag import ColumnSelector
from merlin.io import Dataset
from merlin.schema import Schema, Tags
from nvtabular.ops import (
    AddMetadata,
)
from merlin.schema.tags import Tags
from utils import get_drepessi_recsys2022_dataset


import tensorflow as tf
from merlin.io import Dataset
from merlin.schema import Tags
from tensorflow.keras import regularizers
from merlin.models.tf.dataset import BatchedDataset
from merlin.models.tf.utils.tf_utils import extract_topk

import merlin.models.tf as mm
from merlin.models.tf import InputBlock
from merlin.models.tf.models.base import Model
from merlin.models.tf.core.aggregation import SequenceAggregation, SequenceAggregator
from merlin.models.tf.core.transformations import (
    ItemsPredictionWeightTying,
    L2Norm,
    LogitsTemperatureScaler,
)


# Folder handed to get_drepessi_recsys2022_dataset() to load the raw data.
DATA_FOLDER = 'dressipi'
# Folder the preprocessed train/valid parquet files are written to and re-read from.
DATA_PROCESSED_FOLDER = 'dressipi_processed'
# NOTE(review): not referenced in the visible cells; presumably the timestamp
# unit (milliseconds) -- confirm against utils.py.
DATETIME_CONVERTION = 'ms'

2022-08-11 06:54:51.182338: I tensorflow/core/platform/cpu_feature_guard.cc:152] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-08-11 06:54:53.256150: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16255 MB memory:  -> device: 0, name: Tesla V100-SXM2-32GB-LS, pci bus id: 0000:0a:00.0, compute capability: 7.0


## Dressipi
The [Dressipi](http://www.recsyschallenge.com/2022/dataset.html) dataset contains 1.1 M online retail sessions that resulted in a purchase. It provides details about items that were viewed in a session, the item purchased at the end of the session and numerous features of those items. The task of this competition was, given a sequence of items predict which item will be purchased at the end of a session.


<img src="imgs/dressipi.JPG" alt="dressipi_dataset" style="width: 400px; float: center;">  


In [4]:
# Load the data from DATA_FOLDER. `train` and `valid` are remapped and transformed
# below; `sessions` is the frame the Categorify workflow is fitted on.
# NOTE(review): the helper lives in utils.py, so the split logic is not visible here.
train, valid, sessions = get_drepessi_recsys2022_dataset(DATA_FOLDER)

## Dataset
The training and validation data contains the `session_id` in which a user viewed and purchased an item. In each session we have `item_id` which was viewed at a given `timestamp` and `purchase_id` which is the id of item bought at the end of the session. In addition to `timestamp`, we have `day` and `date` features for representing the chronological order in which items were viewed.

The items in the Dressipi dataset have many features, out of which we took the 14 most important ones, namely 
`f_3, f_4, f_5, f_17, f_24, f_30, f_45, f_46, f_53, f_55, f_58, f_63, f_65, f_73`

In [6]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,session_id,item_id,date,f_3,f_4,f_5,f_7,f_17,f_24,f_30,...,f_61,f_63,f_65,f_68,f_69,f_72,f_73,timestamp,day,purchase_id
0,20376,16541,2020-09-29 13:58:15.382,793.0,385.0,472.0,414.0,378.0,-1.0,-1.0,...,706.0,142.0,-1.0,744.0,592.0,75.0,544.0,1601387895382,272,2855
1,20383,13811,2021-04-03 13:40:38.346,-1.0,618.0,-1.0,394.0,-1.0,588.0,-1.0,...,462.0,861.0,-1.0,739.0,805.0,75.0,544.0,1617457238346,458,419
2,20383,18953,2021-04-03 13:41:05.685,-1.0,618.0,-1.0,394.0,-1.0,588.0,-1.0,...,462.0,861.0,-1.0,739.0,805.0,75.0,544.0,1617457265685,458,419
3,20397,15041,2020-04-10 17:32:20.052,-1.0,618.0,-1.0,619.0,378.0,-1.0,480.0,...,706.0,861.0,521.0,441.0,592.0,655.0,544.0,1586539940052,100,20202
4,20397,25161,2020-04-10 17:33:12.893,-1.0,618.0,-1.0,619.0,378.0,-1.0,602.333333,...,706.0,861.0,521.0,531.0,592.0,655.0,544.0,1586539992893,100,20202


In [7]:
# Collect every id that appears as a viewed item or as a purchase, in train or valid.
itemids = cudf.concat([train['item_id'], 
                       valid['item_id'], 
                       train['purchase_id'], 
                       valid['purchase_id']])
# De-duplicate, then turn the fresh 0..n-1 index into a column via the second reset_index().
itemids = cudf.DataFrame(itemids).drop_duplicates().reset_index(drop=True).reset_index()
# Columns are renamed by position: despite the names, '_item_id' is the new
# contiguous id (the index column) and 'item_id_new' holds the original id.
itemids.columns = ['_item_id', 'item_id_new']
# Shift the new ids so they start at 2.
# NOTE(review): presumably reserves 0 and 1 for padding / unknown ids -- confirm.
itemids['_item_id'] = itemids['_item_id'] + 2

def map_itemids(df, col):
    """Replace the ids in column ``col`` of ``df`` with the ids from ``itemids``.

    ``df`` is left-joined to the module-level ``itemids`` table on
    ``df[col] == itemids['item_id_new']``. The original ``col`` and the join key
    are dropped, and the looked-up ``_item_id`` column is renamed to ``col``.
    Returns the new frame; the frame passed in is not modified.
    """
    joined = df.merge(
        itemids,
        how='left',
        left_on=col,
        right_on='item_id_new',
    )
    joined = joined.drop([col], axis=1)
    # The join key survives the merge unless it was `col` itself.
    if 'item_id_new' in joined.columns:
        joined = joined.drop(['item_id_new'], axis=1)
    return joined.rename(columns={'_item_id': col})

# Remap both id columns of both splits through the shared lookup table.
# NOTE: `train`/`valid` are rebound here, so re-running this cell would feed the
# already-remapped ids back into the lookup.
train = map_itemids(train, 'item_id')
train = map_itemids(train, 'purchase_id')

valid = map_itemids(valid, 'item_id')
valid = map_itemids(valid, 'purchase_id')

## Feature Engineering with NVTabular

### Categorify

In [8]:
%%time
# Item feature columns are recognised by the 'f_' substring in their name.
item_features_names = [col for col in train.columns if 'f_' in col]
# `+` binds tighter than `>>`, so the whole concatenated column list is piped into
# Categorify. 'item_id' and 'purchase_id' are passed as a nested list; the schema
# printed later shows both backed by unique.item_id_purchase_id.parquet.
cat_features = ['session_id', ['item_id', 'purchase_id']] + item_features_names >> nvt.ops.Categorify()

# 'timestamp' and 'date' are selected alongside the categorified columns.
features = ['timestamp','date'] + cat_features
# NOTE(review): the workflow is fitted on `sessions`, which is not passed through
# map_itemids() in any visible cell, while train/valid are -- confirm that
# `sessions` uses the same id space as the remapped frames.
dataset = Dataset(sessions)
workflow0 = nvt.Workflow(features)
workflow0.fit(dataset)

# transform data
train_0 = workflow0.transform(Dataset(train))
valid_0 = workflow0.transform(Dataset(valid))



CPU times: user 2.26 s, sys: 914 ms, total: 3.17 s
Wall time: 31.7 s


### GroupBy

In [9]:
%%time
# Column names produced by the Categorify workflow.
features = train_0.head().columns.tolist()

# Define Groupby Operator: one output row per session, with the listed
# aggregations per column (output names are '<column>_<aggregation>').
to_aggregate = {
    'date': ["first", "last"],
    'item_id': ["list", 'last'],
    'timestamp': ["list"],
    'purchase_id': ['first'],
}
# Every item feature is collected into a per-session list.
for name in item_features_names: 
    to_aggregate[name] = ['list']
    
# Rows are sorted by 'date' within each session before aggregating.
groupby_features = features >> nvt.ops.Groupby(
    groupby_cols=["session_id"], 
    sort_cols=["date"],
    aggs= to_aggregate,
    name_sep="_")

# Attach schema tags to the aggregated item-id columns.
item_last = groupby_features['item_id_last'] >> nvt.ops.AddMetadata(tags=[Tags.ITEM, Tags.ITEM_ID, Tags.CATEGORICAL])
item_list = groupby_features['item_id_list'] >> nvt.ops.AddMetadata(tags=[Tags.ITEM, Tags.ITEM_ID, Tags.LIST, Tags.SEQUENCE, Tags.CATEGORICAL])

# Item-feature lists are tagged as item sequences; the purchase is tagged as the target.
feature_list = groupby_features[[name+'_list' for name in item_features_names]]>> nvt.ops.AddMetadata(tags=[Tags.SEQUENCE, Tags.ITEM, Tags.LIST])
other_features = groupby_features['session_id', 'date_first', 'date_last','timestamp_list']
target_feature = groupby_features['purchase_id_first'] >> nvt.ops.AddMetadata(tags=[Tags.TARGET])

workflow1 = nvt.Workflow(item_last + item_list + feature_list + other_features + target_feature)
workflow1.fit(train_0)

# transform data
train_1 = workflow1.transform(train_0)
valid_1 = workflow1.transform(valid_0)

CPU times: user 1.36 s, sys: 1.91 s, total: 3.27 s
Wall time: 4.13 s


### Truncate and Padding for a Maximum Sequence Length

In [10]:
# Maximum number of interactions kept per session.
SESSIONS_MAX_LENGTH = 20
# All list columns except the date ones.
list_cols = [col for col in train_1.head().columns if 'list' in col and 'date' not in col]
# Slice each list from index -SESSIONS_MAX_LENGTH (i.e. its tail) with pad=True,
# and expose the result under the original name plus '_seq'.
truncated_features = list_cols >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH, pad=True) >> nvt.ops.Rename(postfix = '_seq')

# Columns carried over unchanged next to the '_seq' columns.
final_features = [
    'session_id', 'date_first', 'date_last', 'item_id_list', 'item_id_last', 'purchase_id_first'
]

workflow2 = nvt.Workflow(final_features + truncated_features)
workflow2.fit(train_1)

# transform data
train_2 = workflow2.transform(train_1)
valid_2 = workflow2.transform(valid_1)

### Save processed data to Parquet files

In [11]:
%%time
# Order the sessions by the date of their last interaction, keeping the workflow schema.
train_ds = Dataset(train_2.to_ddf().sort_values('date_last'), schema=train_2.schema)
valid_ds = Dataset(valid_2.to_ddf().sort_values('date_last'), schema=valid_2.schema)

# Write each split as 10 parquet files under DATA_PROCESSED_FOLDER; the training
# section re-reads them from there.
train_ds.to_parquet(os.path.join(DATA_PROCESSED_FOLDER, "train/"), output_files=10)
valid_ds.to_parquet(os.path.join(DATA_PROCESSED_FOLDER, "valid/"), output_files=10)

CPU times: user 22 s, sys: 29.9 s, total: 51.9 s
Wall time: 3min 56s


## Training - MLP

A sequential-MLP model with average of the sequence as final representation

In [13]:
# Hyper-parameters of the MLP run.
SEED = 42
EPOCHS = 5
BATCH_SIZE = 512
# NOTE(review): 0.3 is an unusually large step size for Adam -- confirm it is intended.
LEARNING_RATE = 3e-1
# Gradient-norm clipping threshold handed to the optimizer as `clipnorm`.
# Keras documents this argument as a float, so use 1.0 rather than the bool True.
CLIPNORM = 1.0
DROPOUT= 0.2 
# Smoothing factor passed to the CategoricalCrossentropy loss.
LABEL_SMOOTHING = 0.2
# Temperature passed to LogitsTemperatureScaler in the prediction head.
TEMPERATURE_SCALING = 2
# NOTE(review): OPTIMIZER_NAME and LOSS are not read by any visible cell.
OPTIMIZER_NAME = 'adam'
LOSS='CategoricalCrossentropy'

# Seed the random number generators for reproducible runs.
tf.keras.utils.set_random_seed(SEED)

In [24]:
# Re-open the preprocessed parquet files as merlin Datasets.
# NOTE: this rebinds `train`/`valid`, which held the cuDF frames in the preprocessing section.
train = Dataset(os.path.join(DATA_PROCESSED_FOLDER, 'train/*.parquet'), shuffle=False,)
valid = Dataset(os.path.join(DATA_PROCESSED_FOLDER, 'valid/*.parquet'), shuffle=False,)



In [25]:
# Keep only the two item-id inputs and the purchase target for the MLP model.
schema_model = train.schema.select_by_name(['item_id_last', 'item_id_list_seq', 'purchase_id_first'])
# Display the selected schema.
schema_model

Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.num_buckets,properties.freq_threshold,properties.max_size,properties.start_index,properties.cat_path,properties.embedding_sizes.cardinality,properties.embedding_sizes.dimension,properties.domain.min,properties.domain.max,properties.domain.name
0,item_id_last,"(Tags.ITEM_ID, Tags.CATEGORICAL, Tags.ITEM)",int64,False,False,,0.0,0.0,0.0,.//categories/unique.item_id_purchase_id.parquet,23619.0,450.0,0,23619,item_id_purchase_id
1,item_id_list_seq,"(Tags.ITEM_ID, Tags.LIST, Tags.ITEM, Tags.SEQU...",int64,True,False,,0.0,0.0,0.0,.//categories/unique.item_id_purchase_id.parquet,23619.0,450.0,0,23619,item_id_purchase_id
2,purchase_id_first,"(Tags.TARGET, Tags.CATEGORICAL)",int64,False,False,,0.0,0.0,0.0,.//categories/unique.item_id_purchase_id.parquet,23619.0,450.0,0,23619,item_id_purchase_id


In [27]:
# Override the embedding-size and domain metadata of both item-id inputs with the
# same values: cardinality 23619, embedding dimension 256, ids in [0, 23619].
for _column_name in ('item_id_list_seq', 'item_id_last'):
    # Kept from the original cell; updating with an empty dict changes nothing.
    schema_model[_column_name].properties.update({})

    schema_model[_column_name].properties['embedding_sizes'] = dict(
        cardinality=float(23619),
        dimension=float(256),
    )

    # Replaces the whole 'domain' entry with just min and max.
    schema_model[_column_name].properties['domain'] = dict(min=0, max=23619)

### Model
An InputBlock that takes the sequential features, concatenates them and returns the sequence of interaction embeddings

In [29]:
# Input block over the two item-id features; their embeddings are combined with
# the 'concat' aggregation. max_seq_length equals SESSIONS_MAX_LENGTH (20) from
# the preprocessing section.
# NOTE(review): the schema above sets an embedding dimension of 256 while
# embedding_dim_default is 128 -- confirm which one takes effect.
input_block = InputBlock(
        schema_model.select_by_name(['item_id_last', 'item_id_list_seq']),
        aggregation='concat',
#         seq=True,
        max_seq_length=20,
        embedding_options=mm.EmbeddingOptions(embedding_dim_default=128),
#         split_sparse=True,
)

MLPBlock to get the sequence of hidden representation

In [30]:
# Two dense layers (64 then 128 units) with ReLU; the last layer is left without
# an activation, and DROPOUT is the dropout rate.
mlp_block = mm.MLPBlock(
                [64, 128],
                activation='relu',
                no_activation_last_layer=True,
                dropout=DROPOUT,
            )

Multi-class classification prediction head, which has
- L2 normalization (`L2Norm`)
- Weight Tying
- Labels as One-hot encoded vectors, used for label smoothing 
- Temperature Scaling to reduce the overconfidence of the model

In [31]:
# Pre-processing chain applied inside the prediction task, in this order:
# L2 normalisation -> weight-tied item prediction -> labels to one-hot ->
# logits divided by TEMPERATURE_SCALING.
prediction_call = L2Norm().connect(
    ItemsPredictionWeightTying(schema_model), 
    mm.LabelToOneHot(), 
    LogitsTemperatureScaler(temperature=TEMPERATURE_SCALING)
)

# Multi-class task whose target is the purchased item of the session.
prediction_task = mm.MultiClassClassificationTask(
    target_name="purchase_id_first",
    pre=prediction_call,
)

Now, we connect all the blocks together to build a model

In [32]:
# Alternative kept for reference: the same model with a mean aggregation over the sequence.
# model_mlp = Model(input_block, mlp_block, SequenceAggregator(SequenceAggregation.MEAN), prediction_task)
model_mlp = Model(input_block, mlp_block, prediction_task)

# Adam with the configured learning rate; CLIPNORM is forwarded as the clipnorm argument.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=LEARNING_RATE,
    clipnorm=CLIPNORM
)

# Alternative kept for reference: compile in graph mode with the default loss/metrics.
# model_mlp.compile(optimizer=optimizer, run_eagerly=False)
# The loss works on logits and applies label smoothing; the metrics are the
# default top-k metrics at k=100. run_eagerly=True executes the model eagerly.
model_mlp.compile(
    optimizer=optimizer,
    run_eagerly=True,
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=LABEL_SMOOTHING),
    metrics=mm.TopKMetricsAggregator.default_metrics(top_ks=[100])
)

In [33]:
# Inspect the embedding tables held by the categorical branch of the first layer (the input block).
model_mlp.layers[0].parallel_layers['categorical'].embedding_tables

{'item_id_last': Embedding(), 'item_id_list_seq': Embedding()}

### Model Training

In [34]:
%%time
# Train on the reloaded training Dataset and validate on `valid` after each epoch.
# Only the columns in schema_model are fed to the model.
history = model_mlp.fit(
    train,
    validation_data=valid,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    schema=schema_model,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 18min 16s, sys: 2min 17s, total: 20min 33s
Wall time: 15min 57s


### Model Evaluation

In [35]:
# Evaluate on the same reloaded validation Dataset that fit() and predict() use,
# so this cell does not depend on objects from the preprocessing section.
model_mlp.evaluate(valid, batch_size=1024, return_dict=True)



{'loss': 7.465471267700195,
 'recall_at_100': 0.5387646555900574,
 'mrr_at_100': 0.23805998265743256,
 'ndcg_at_100': 0.29719114303588867,
 'map_at_100': 0.23805998265743256,
 'precision_at_100': 0.005387645214796066,
 'regularization_loss': 0.0}

In [36]:
def generate_recommendations(pred, df_agg, batch_size=1024, n_topk=100):
    """Turn a score matrix into a long table of top-k recommendations per session.

    Args:
        pred: 2-D array of scores, one row per session and one column per item
            index (assumed to be the output of ``model.predict`` -- TODO confirm).
        df_agg: frame with ``session_id`` and ``purchase_id_first`` columns whose
            rows are in the same order as the rows of ``pred``.
        batch_size: number of rows ranked on the GPU at a time.
        n_topk: recommendations kept per session; capped at the number of columns.

    Returns:
        cudf.DataFrame with the columns ``session_id``, ``purchased``, ``rec``
        (column index of the recommended item) and ``score``; each session
        contributes ``n_topk`` consecutive rows ordered by decreasing score.

    Raises:
        ValueError: if ``df_agg`` and ``pred`` do not have the same number of rows.
    """
    # NOTE: the message is kept for output compatibility; no masking is applied here.
    print('Mask Predictions')
    print('Generate Top100 Recommendations')
    n_rows = pred.shape[0]
    n_topk = min(n_topk, pred.shape[1])
    rec_batches = []
    score_batches = []
    # Rank batch by batch so that only `batch_size` rows live on the GPU at once.
    for batch_start in range(0, n_rows, batch_size):
        cp_pred = cupy.asarray(pred[batch_start:batch_start + batch_size])
        # Negate so that the ascending argsort yields the highest scores first.
        top_idx = cupy.argsort(-cp_pred, axis=1)[:, :n_topk]
        top_score = cupy.take_along_axis(cp_pred, top_idx, axis=1)
        rec_batches.append(cupy.asnumpy(top_idx))
        score_batches.append(cupy.asnumpy(top_score))

    print('Transform Top100 Recommendations')
    metadata = df_agg[['session_id', 'purchase_id_first']].to_pandas()
    if len(metadata) != n_rows:
        raise ValueError(
            'df_agg has {} rows but pred has {} rows'.format(len(metadata), n_rows)
        )

    if rec_batches:
        recs = np.concatenate(rec_batches).reshape(-1)
        scores = np.concatenate(score_batches).reshape(-1).astype('float64')
    else:
        recs = np.empty(0, dtype='int64')
        scores = np.empty(0, dtype='float64')

    # Repeat each session's metadata once per recommendation (row-major order).
    df_rec = cudf.DataFrame({
        'session_id': np.repeat(metadata['session_id'].to_numpy(), n_topk),
        'purchased': np.repeat(metadata['purchase_id_first'].to_numpy(), n_topk),
        'rec': recs,
        'score': scores,
    })
    return(df_rec)

def evaluate(df, add_folds=False, top_k=100):
    """Compute the mean reciprocal rank (MRR) of a recommendation table.

    Args:
        df: frame with the columns ``session_id``, ``purchased``, ``rec`` and
            ``score`` (one row per recommended item). It is not modified.
        add_folds: accepted for backward compatibility; not used.
        top_k: only the ``top_k`` highest-scored recommendations of each session
            are considered.

    Returns:
        dict with a single key ``'total'``: the sum of ``1 / rank`` over the rows
        where the recommended item equals the purchased one, divided by the
        number of distinct sessions (0.0 for an empty frame).
    """
    print('Model evaluation')
    if len(df) == 0:
        return {'total': 0.0}

    # A repeated (session, item) pair keeps only its first occurrence.
    ranked = df.drop_duplicates(['session_id', 'rec'])
    ranked = ranked.sort_values(['session_id', 'score'], ascending=False)
    # Running count within each session == rank by decreasing score.
    ranked['dummy'] = 1
    ranked['rank'] = ranked[['session_id', 'dummy']].groupby('session_id').cumsum()['dummy']
    ranked = ranked[ranked['rank'] <= top_k]

    n_sessions = ranked['session_id'].drop_duplicates().shape[0]
    if n_sessions == 0:
        return {'total': 0.0}

    hits = ranked[ranked['purchased'] == ranked['rec']]
    mrr = (1 / hits['rank']).sum() / n_sessions
    return {'total': mrr}

In [37]:
%%time
# Score every validation session with the trained MLP model.
predictions = model_mlp.predict(valid, batch_size=1024, verbose=1)
# Materialise the session id and the purchased item for the same Dataset.
# NOTE(review): assumes the rows come back in the same order as `predictions` -- confirm.
ddf = valid.to_ddf()
ddf = ddf[['session_id', 'purchase_id_first']].compute()
# Build the top-100 recommendation table and compute its MRR.
df_rec = generate_recommendations(predictions, ddf)
val_mrr = evaluate(df_rec)['total']
print('MRR: ',val_mrr)

Mask Predictions
Generate Top100 Recommendations
Transform Top100 Recommendations
Model evaluation
MRR:  0.23805997705335835
CPU times: user 57.5 s, sys: 14 s, total: 1min 11s
Wall time: 1min 6s


## Training Bi-LSTM

In [10]:
# Hyper-parameters of the Bi-LSTM run.
SEED = 42
EPOCHS = 10
BATCH_SIZE = 1024 #512
# NOTE(review): 0.3 is an unusually large step size for Adam -- confirm it is intended.
LEARNING_RATE = 3e-1
# Gradient-norm clipping threshold handed to the optimizer as `clipnorm`.
# Keras documents this argument as a float, so use 1.0 rather than the bool True.
CLIPNORM = 1.0
DROPOUT= 0.2 #0.01
# Smoothing factor passed to the CategoricalCrossentropy loss.
LABEL_SMOOTHING = 0.2
# Temperature passed to LogitsTemperatureScaler in the prediction head.
TEMPERATURE_SCALING = 2
# NOTE(review): OPTIMIZER_NAME and LOSS are not read by any visible cell.
OPTIMIZER_NAME = 'adam'
LOSS='CategoricalCrossentropy'

# Units of the LSTM in each direction.
BI_LSTM_HIDDEN_DIM = 64
# Seed the random number generators for reproducible runs.
tf.keras.utils.set_random_seed(SEED)

### Model

BiLSTM Block: It requires a dictionary input with the sequence of interaction embeddings `input_sequence`

In [11]:
class BiLSTM(mm.Block):
    """Bidirectional LSTM that encodes a sequence of embeddings into one vector.

    The block expects a dictionary input and reads the sequence from its
    ``'input_sequence'`` key. The LSTM is created with ``return_sequences=False``,
    so one vector per example is returned.

    Args:
        hidden_dim: number of LSTM units per direction.
        **kwargs: forwarded to the parent block constructor.
    """

    def __init__(self, hidden_dim= 64, **kwargs):
        # Initialise the parent layer first so the sub-layer assigned below is
        # registered on a fully constructed Keras layer.
        super().__init__(**kwargs)
        self.hidden_dim = hidden_dim
        lstm = tf.keras.layers.LSTM(hidden_dim, return_sequences=False, dropout=0.05,
                                   kernel_regularizer=regularizers.l2(1e-4))
        self.lstm = tf.keras.layers.Bidirectional(lstm)

    def call(self, inputs, training=False, **kwargs) -> tf.Tensor:
        """Return the bidirectional LSTM encoding of ``inputs['input_sequence']``."""
        interactions = inputs['input_sequence']
        sequence_representation = self.lstm(interactions)
        return sequence_representation

    def compute_output_shape(self, input_shape):
        """Return ``(batch, 2 * hidden_dim)``.

        The sequence axis is not part of the output because the wrapped LSTM uses
        ``return_sequences=False``; the factor 2 comes from Bidirectional's
        default merge mode, which concatenates both directions.
        """
        input_shape = input_shape['input_sequence']
        return (input_shape[0], self.hidden_dim*2)


bilstm = BiLSTM(hidden_dim=BI_LSTM_HIDDEN_DIM)

An InputBlock that takes the sequential features, concatenates them and returns the sequence of interaction embeddings

In [12]:
# The Bi-LSTM model uses the item-id sequence as its only input, plus the purchase target.
# NOTE(review): the saved output of this cell lists separate vocabularies
# (unique.item_id.parquet / unique.purchase_id.parquet) although the Categorify
# cell encodes item_id and purchase_id jointly; it presumably stems from an
# earlier preprocessing run -- confirm before comparing the Bi-LSTM results.
schema_model = train.schema.select_by_name(['item_id_list_seq', 'purchase_id_first'])
schema_model

Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.num_buckets,properties.freq_threshold,properties.max_size,properties.start_index,properties.cat_path,properties.embedding_sizes.cardinality,properties.embedding_sizes.dimension,properties.domain.min,properties.domain.max,properties.domain.name
0,item_id_list_seq,"(Tags.SEQUENCE, Tags.ITEM, Tags.LIST, Tags.CAT...",int64,True,False,,0.0,0.0,0.0,.//categories/unique.item_id.parquet,23497.0,449.0,0,23497,item_id
1,purchase_id_first,"(Tags.CATEGORICAL, Tags.TARGET)",int64,False,False,,0.0,0.0,0.0,.//categories/unique.purchase_id.parquet,18908.0,397.0,0,18908,purchase_id


In [13]:
# Sequential input block for the Bi-LSTM (seq=True, sequences of length 20).
# Embedding sizes are inferred (multiplier 2, dimension kept a multiple of 8);
# embedding_dim_default is 128.
inputs = InputBlock(
        schema_model,
        aggregation='concat',
        seq=True,
        max_seq_length=20,
        embedding_options=mm.EmbeddingOptions(
            embedding_dim_default=128,
            infer_embedding_sizes=True,
            infer_embedding_sizes_multiplier=2,
            infer_embeddings_ensure_dim_multiple_of_8=True
        ),
        split_sparse=True,
)

In [14]:
# Expose the input block's output under the 'input_sequence' key, which is the
# key BiLSTM.call() reads, and feed it to the Bi-LSTM block.
dense_block = mm.ParallelBlock({'input_sequence': inputs}).connect(bilstm)

MLPBlock to get the sequence of hidden representation

In [15]:
# Two dense layers (64 then 32 units) with ReLU; the last layer is left without
# an activation, and DROPOUT is the dropout rate.
mlp_block = mm.MLPBlock(
                [64, 32],
                activation='relu',
                no_activation_last_layer=True,
                dropout=DROPOUT,
            )

Multi-class classification prediction head, which has
- L2 normalization (`L2Norm`)
- Weight Tying
- Labels as One-hot encoded vectors, used for label smoothing 
- Temperature Scaling to reduce the overconfidence of the model

In [16]:
# Pre-processing chain applied inside the prediction task, in this order:
# L2 normalisation -> weight-tied item prediction -> labels to one-hot ->
# logits divided by TEMPERATURE_SCALING.
prediction_call = L2Norm().connect(
    ItemsPredictionWeightTying(schema_model), 
    mm.LabelToOneHot(), 
    LogitsTemperatureScaler(temperature=TEMPERATURE_SCALING)
)

# Multi-class task whose target is the purchased item of the session.
prediction_task = mm.MultiClassClassificationTask(
    target_name="purchase_id_first",
    pre=prediction_call,
)

Now, we connect all the blocks together to build a model

In [17]:
# Chain the Bi-LSTM encoder, the MLP and the prediction task into one model.
model_bi_lstm = Model(dense_block, mlp_block, prediction_task)

# Adam with the configured learning rate; CLIPNORM is forwarded as the clipnorm argument.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=LEARNING_RATE,
    clipnorm=CLIPNORM
)

# The loss works on logits and applies label smoothing; the metrics are the
# default top-k metrics at k=100. run_eagerly=True executes the model eagerly.
model_bi_lstm.compile(
    optimizer=optimizer,
    run_eagerly=True,
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=LABEL_SMOOTHING),
    metrics=mm.TopKMetricsAggregator.default_metrics(top_ks=[100])
)

2022-08-08 22:46:09.666455: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


### Model Training

In [19]:
%%time
# Train on the training Dataset and validate on `valid` after each epoch.
# No callbacks are passed: the notebook imports no experiment-tracking callback,
# so the call matches the MLP training cell and runs on a fresh kernel.
history = model_bi_lstm.fit(
    train,
    validation_data=valid,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    schema=schema_model,
)

2022-08-08 22:46:26.286843: I tensorflow/stream_executor/cuda/cuda_dnn.cc:379] Loaded cuDNN version 8400


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Model Evaluation

In [20]:
%%time
# Score every validation session with the trained Bi-LSTM model.
predictions = model_bi_lstm.predict(valid, batch_size=1024, verbose=1)
# Materialise the session id and the purchased item for the same Dataset.
# NOTE(review): assumes the rows come back in the same order as `predictions` -- confirm.
ddf = valid.to_ddf()
ddf = ddf[['session_id', 'purchase_id_first']].compute()
# Build the top-100 recommendation table and compute its MRR.
df_rec = generate_recommendations(predictions, ddf)
val_mrr = evaluate(df_rec)['total']
print('MRR: ',val_mrr)

Mask Predictions
Generate Top100 Recommendations
Transform Top100 Recommendations
Model evaluation
MRR:  0.02055274926770559
CPU times: user 55.2 s, sys: 12.4 s, total: 1min 7s
Wall time: 1min 5s
