In [3]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Training

> Bagged representation of the input sequence + Bi-LSTM Block + MLPBlock + Multi-classification layer with weight-tying

Topics covered in this notebook:
- Utilizing Bi-LSTM layer with Merlin models
- Label smoothing
- Temperature
- Weight Tying

## Imports

In [3]:
import os
import cudf
import pandas as pd 
import tensorflow as tf
from merlin.io import Dataset
from merlin.schema import Tags
import matplotlib.pyplot as plt
from tensorflow.keras import regularizers
from merlin.models.tf.dataset import BatchedDataset
from merlin.models.tf.utils.tf_utils import extract_topk
import numpy as np
import merlin.models.tf as mm
from merlin.models.tf import InputBlock
from merlin.models.tf.models.base import Model
from merlin.models.tf.blocks.core.aggregation import SequenceAggregation, SequenceAggregator
from merlin.models.tf.blocks.core.transformations import (
    ItemsPredictionWeightTying,
    L2Norm,
    LogitsTemperatureScaler,
)
from merlin.models.tf.inputs.embedding import EmbeddingOptions

DATA_FOLDER = 'dressipi'
DATA_PROCESSED_FOLDER = 'dressipi_processed'

2022-07-11 18:00:40.120616: I tensorflow/core/platform/cpu_feature_guard.cc:152] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-11 18:00:49.066734: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16255 MB memory:  -> device: 0, name: Tesla V100-SXM2-32GB-LS, pci bus id: 0000:85:00.0, compute capability: 7.0
2022-07-11 18:00:49.068978: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 30677 MB memory:  -> device: 1, name: Tesla V100-SXM2-32GB-LS, pci bus id: 0000:86:00.0, compute capability: 7.0
2022-07-11 18:00:49.073064: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/rep

In [4]:
train = Dataset(os.path.join(DATA_PROCESSED_FOLDER, 'train/*.parquet'),shuffle=False)
valid = Dataset(os.path.join(DATA_PROCESSED_FOLDER, 'valid/*.parquet'), shuffle=False)
test_leaderboard = Dataset(os.path.join(DATA_PROCESSED_FOLDER, 'test_leaderboard/*.parquet'), shuffle=False)
test_final = Dataset(os.path.join(DATA_PROCESSED_FOLDER, 'test_final/*.parquet'), shuffle=False)

purchases = pd.read_csv(os.path.join(DATA_FOLDER, "train_purchases.csv"))
item_map = pd.read_parquet(
    os.path.join("categories", "unique.item_id.parquet"))['item_id'].to_dict()
session_map = pd.read_parquet(
    os.path.join("categories", "unique.session_id.parquet"))['session_id'].to_dict()



In [5]:
schema_model = train.schema.select_by_name(['item_id_list_seq', 'item_id_last'])
schema_model

Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.num_buckets,properties.freq_threshold,properties.max_size,properties.start_index,properties.cat_path,properties.embedding_sizes.cardinality,properties.embedding_sizes.dimension,properties.domain.min,properties.domain.max,properties.domain.name
0,item_id_list_seq,"(Tags.LIST, Tags.ITEM, Tags.SEQUENCE, Tags.CAT...",int64,True,False,,0.0,0.0,0.0,.//categories/unique.item_id.parquet,23566.0,449.0,0,23566,item_id
1,item_id_last,"(Tags.ITEM, Tags.CATEGORICAL, Tags.BINARY_CLAS...",int64,False,False,,0.0,0.0,0.0,.//categories/unique.item_id.parquet,23566.0,449.0,0,23566,item_id


In [6]:
import datetime
tmp = train.compute()
start_month_train = tmp[tmp['date_last'] <= datetime.datetime(2021, 4, 1)]
last_month_train = tmp[tmp['date_first'] >= datetime.datetime(2021, 4, 1)]
start_month_train =  Dataset(start_month_train, schema=schema_model, shuffle=True)
last_month_train =  Dataset(last_month_train, schema=schema_model, shuffle=True)

In [7]:
tmp.shape,start_month_train.compute().shape,last_month_train.compute().shape

((920830, 30), (848637, 30), (72193, 30))

## Model
- A Bi-LSTM Block with MLP for MultiClassification prediction task

The model contains:
- A InputBlock which takes sequential features, concatenate them and return the sequence of interaction embeddings
- Bi-LSTM block to get the sequence of hidden representation
- MLPBlock to get the sequence of hidden representation
- Multi-Classiffication prediction head
    - Layer normalization
    - Item weight-tying
    - transfom labels to one-hot encoding representation for metrics 
    - softmax temperature to reduce model's over confidence

In [8]:
class BiLSTM(mm.Block):
    """
    Build BiLSTM model
    It requires a dictionary input with the sequence of interaction embeddings `interactions`
    """
    def __init__(self, hidden_dim= 64, **kwargs):
        self.hidden_dim = hidden_dim
        lstm = tf.keras.layers.LSTM(hidden_dim, return_sequences=False, dropout=0.05,
                                   kernel_regularizer=regularizers.l2(1e-4))
        self.lstm = tf.keras.layers.Bidirectional(lstm)
        
        super().__init__(**kwargs)
        
    def call(self, inputs, training=False, **kwargs) -> tf.Tensor:  
        interactions = inputs['input_sequence']
        sequence_representation = self.lstm(interactions)
        return sequence_representation
    
    def compute_output_shape(self, input_shape):
        input_shape = input_shape['input_sequence']
        return (input_shape[0], input_shape[1], self.hidden_dim*2)
    

In [9]:
inputs = InputBlock(
        schema_model,
        aggregation='concat',
        seq=True,
        max_seq_length=20,
        embedding_options=mm.EmbeddingOptions(
            embedding_dim_default=128,
            infer_embedding_sizes=True,
            infer_embedding_sizes_multiplier=2,
            infer_embeddings_ensure_dim_multiple_of_8=True
        ),
        split_sparse=True,
)

bilstm = BiLSTM(hidden_dim=64)
dense_block = mm.ParallelBlock({'input_sequence': inputs}).connect(bilstm)

mlp_block = mm.MLPBlock(
                [64, 32],
                activation='relu',
                no_activation_last_layer=True,
                dropout=0.01,
            )

prediction_call = L2Norm().connect(
    ItemsPredictionWeightTying(schema_model), 
    mm.LabelToOneHot(), 
    LogitsTemperatureScaler(temperature=2)
)

task = mm.MultiClassClassificationTask(
    target_name="item_id_last",
    pre=prediction_call
)

model = Model(dense_block, mlp_block, task)

2022-07-11 18:00:57.695645: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


In [10]:
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-1,
    clipnorm=True
)

# model.compile(optimizer=opt, run_eagerly=False)
model.compile(
    optimizer=optimizer,
    run_eagerly=True,
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.2),
    metrics=mm.TopKMetricsAggregator.default_metrics(top_ks=[100])
)

## Train the model

In [11]:
%%time
history = model.fit(start_month_train, validation_data=last_month_train, batch_size=128, epochs=1)

2022-07-11 18:01:19.839452: I tensorflow/stream_executor/cuda/cuda_dnn.cc:379] Loaded cuDNN version 8400


CPU times: user 11min 32s, sys: 14.5 s, total: 11min 47s
Wall time: 11min 8s


## Inference
MRR score on validation data

In [13]:
def compute_mrr(rec_list,target):
    mrr = 0
    for a,b in zip(rec_list,target):
        rank = np.argmax(np.array(a)==b)
        if rank != 0:
            mrr += 1 / (1 + rank)
    return mrr/(target.shape[0])

In [30]:
valid = Dataset(
    [DATA_PROCESSED_FOLDER+'/valid/*.parquet'], 
    part_mem_fraction=0.01, 
    shuffle=False)
x = BatchedDataset(
    valid, 
    batch_size=256, 
    shuffle=False, 
)
predictions = model.predict(x)

topk_predicted, topk_scores = [], []
k = 1000
for i in range(predictions.shape[0]):
    top_scores, top_indices = tf.math.top_k(predictions[i, :], k)
    topk_predicted.append(top_indices.numpy().reshape(1, k))
    topk_scores.append(top_scores.numpy().reshape(1, k))

topk_predicted = np.concatenate(topk_predicted)
topk_scores= np.concatenate(topk_scores)

valid_data = valid.to_ddf().compute().to_pandas()
valid_data['session_id'] = valid_data.session_id.map(session_map)
valid_data = pd.merge(valid_data, purchases, on='session_id')[['session_id', 'item_id']]

valid_data['topk_predicted'] = list(topk_predicted.astype(np.int32))
valid_data['topk_scores'] = list(topk_scores.astype(np.float32))

valid_data['topk_predicted'] = valid_data['topk_predicted'].apply(lambda x: [item_map[i] for i in x])

valid_data['top100'] = valid_data['topk_predicted'].apply(lambda x: x[:100])

mrr_eval = compute_mrr(valid_data['top100'], valid_data['item_id'])

In [31]:
mrr_eval

0.01572088760854956