In [3]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Training

> Bagged representation of the input sequence + MLPBlock + Multi-classification layer with weight-tying

Topics covered in this notebook:
- Label smoothing
- Temperature
- Weight Tying

## Imports

In [9]:
import os
import cudf
import pandas as pd 
import tensorflow as tf
from merlin.io import Dataset
from merlin.schema import Tags
import matplotlib.pyplot as plt
from tensorflow.keras import regularizers
from merlin.models.tf.dataset import BatchedDataset
from merlin.models.tf.utils.tf_utils import extract_topk
import numpy as np
import merlin.models.tf as mm
from merlin.models.tf import InputBlock
from merlin.models.tf.models.base import Model
from merlin.models.tf.blocks.core.aggregation import SequenceAggregation, SequenceAggregator
from merlin.models.tf.blocks.core.transformations import (
    ItemsPredictionWeightTying,
    L2Norm,
    LogitsTemperatureScaler,
)
from merlin.models.tf.inputs.embedding import EmbeddingOptions

DATA_FOLDER = 'dressipi'
DATA_PROCESSED_FOLDER = 'dressipi_processed'

In [10]:
train = Dataset(os.path.join(DATA_PROCESSED_FOLDER, 'train/*.parquet'),shuffle=False)
valid = Dataset(os.path.join(DATA_PROCESSED_FOLDER, 'valid/*.parquet'), shuffle=False)
test_leaderboard = Dataset(os.path.join(DATA_PROCESSED_FOLDER, 'test_leaderboard/*.parquet'), shuffle=False)
test_final = Dataset(os.path.join(DATA_PROCESSED_FOLDER, 'test_final/*.parquet'), shuffle=False)

purchases = pd.read_csv(os.path.join(DATA_FOLDER, "train_purchases.csv"))
item_map = pd.read_parquet(
    os.path.join("categories", "unique.item_id.parquet"))['item_id'].to_dict()
session_map = pd.read_parquet(
    os.path.join("categories", "unique.session_id.parquet"))['session_id'].to_dict()

In [11]:
schema_model = train.schema.select_by_name(['item_id_list_seq', 'item_id_last'])
schema_model

Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.num_buckets,properties.freq_threshold,properties.max_size,properties.start_index,properties.cat_path,properties.embedding_sizes.cardinality,properties.embedding_sizes.dimension,properties.domain.min,properties.domain.max,properties.domain.name
0,item_id_list_seq,"(Tags.SEQUENCE, Tags.ITEM_ID, Tags.ITEM, Tags....",int64,True,False,,0.0,0.0,0.0,.//categories/unique.item_id.parquet,23566.0,449.0,0,23566,item_id
1,item_id_last,"(Tags.BINARY_CLASSIFICATION, Tags.ITEM_ID, Tag...",int64,False,False,,0.0,0.0,0.0,.//categories/unique.item_id.parquet,23566.0,449.0,0,23566,item_id


In [12]:
import datetime
tmp = train.compute()
start_month_train = tmp[tmp['date_last'] <= datetime.datetime(2021, 4, 1)]
last_month_train = tmp[tmp['date_first'] >= datetime.datetime(2021, 4, 1)]
start_month_train =  Dataset(start_month_train, schema=schema_model, shuffle=True)
last_month_train =  Dataset(last_month_train, schema=schema_model, shuffle=True)

In [13]:
tmp.shape,start_month_train.compute().shape,last_month_train.compute().shape

((920830, 30), (848637, 30), (72193, 30))

## Model
- A sequential-MLP with average of the sequence as final representation

The model contains:
- A InputBlock which takes sequential features, concatenate them and return the sequence of interaction embeddings
- MLPBlock to get the sequence of hidden representation
- Multi-Classiffication prediction head
    - Layer normalization
    - Item weight-tying
    - transfom labels to one-hot encoding representation for metrics 
    - softmax temperature to reduce model's over confidence

In [13]:
inputs = InputBlock(
        schema_model,
        aggregation='concat',
        seq=True,
        max_seq_length=20,
        embedding_options=mm.EmbeddingOptions(embedding_dim_default=128),
        split_sparse=True,
)

dense_block = mm.MLPBlock(
                [64, 128],
                activation='relu',
                no_activation_last_layer=True,
                dropout=0.01,
            )

prediction_call = L2Norm().connect(
    ItemsPredictionWeightTying(schema_model), 
    mm.LabelToOneHot(), 
    LogitsTemperatureScaler(temperature=2)
)

task = mm.MultiClassClassificationTask(
    target_name="item_id_last",
    pre=prediction_call
)

model = Model(inputs, dense_block, SequenceAggregator(SequenceAggregation.MEAN), task)

2022-07-10 02:41:43.505801: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


In [14]:
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-1,
    clipnorm=True
)

# model.compile(optimizer=opt, run_eagerly=False)
model.compile(
    optimizer=optimizer,
    run_eagerly=True,
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.2),
    metrics=mm.TopKMetricsAggregator.default_metrics(top_ks=[100])
)

## Train the model

In [15]:
%%time
history = model.fit(start_month_train, validation_data=last_month_train, batch_size=128, epochs=1)

CPU times: user 7min 25s, sys: 11.5 s, total: 7min 36s
Wall time: 7min 3s


## Inference
MRR score on validation data

In [16]:
def compute_mrr(rec_list,target):
    mrr = 0
    for a,b in zip(rec_list,target):
        rank = np.argmax(np.array(a)==b)
        if rank != 0:
            mrr += (1 / (1 + rank))
    return mrr/(target.shape[0])

In [17]:
%%time
valid = Dataset(
    [DATA_PROCESSED_FOLDER+'/valid/*.parquet'], 
    part_mem_fraction=0.01, 
    shuffle=False)
x = BatchedDataset(
    valid, 
    batch_size=256, 
    shuffle=False, 
)
predictions = model.predict(x)

topk_predicted = []
for i in range(predictions.shape[0]):
    _, topk_indices = tf.math.top_k(predictions[i, :], 100)
    topk_predicted.append(topk_indices.numpy().reshape(1, 100))

top_predicted = np.concatenate(topk_predicted)

valid_data = valid.to_ddf().compute().to_pandas()
valid_data['session_id'] = valid_data.session_id.map(session_map)
valid_data = pd.merge(valid_data, purchases, on='session_id')[['session_id', 'item_id']]

valid_data['top100_predicted'] = top_predicted.tolist()
valid_data['top100_predicted']= valid_data['top100_predicted'].apply(lambda x: [item_map[i] for i in x])

mrr_eval = compute_mrr(valid_data['top100_predicted'], valid_data['item_id'])

CPU times: user 2min 12s, sys: 14.8 s, total: 2min 27s
Wall time: 1min 55s


In [18]:
mrr_eval

0.06950263758217776