# Retrieval using a sequential model


Sequential recommendation is a model that predicts what customer might buy next considering past history of clicks and purchases. To model sequential relationships we need to build recurrent neural networks (RNN). 

### Imports

In [1]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

[K     |████████████████████████████████| 89 kB 4.8 MB/s 
[K     |████████████████████████████████| 578.0 MB 18 kB/s 
[K     |████████████████████████████████| 438 kB 64.3 MB/s 
[K     |████████████████████████████████| 1.7 MB 57.0 MB/s 
[K     |████████████████████████████████| 5.9 MB 77.8 MB/s 
[K     |████████████████████████████████| 4.7 MB 14.3 MB/s 
[?25h

In [2]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [3]:
!git clone https://github.com/rusalka013/Amazon_Rec_Systems.git
!pip install -r Amazon_Rec_Systems/ml/requirements.txt

Cloning into 'Amazon_Rec_Systems'...
remote: Enumerating objects: 227, done.[K
remote: Counting objects: 100% (227/227), done.[K
remote: Compressing objects: 100% (151/151), done.[K
remote: Total 227 (delta 119), reused 178 (delta 73), pack-reused 0[K
Receiving objects: 100% (227/227), 53.59 KiB | 10.72 MiB/s, done.
Resolving deltas: 100% (119/119), done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Preparing the dataset

In [4]:
%cd /content/Amazon_Rec_Systems/ml/
!python -m data.example_generation_amazon_2 \
  --data_dir=data/raw \
  --output_dir=data/examples \
  --min_timeline_length=3 \
  --max_context_length=10 \
  --min_rating=2 \
  --train_data_fraction=0.9 \
  --build_vocabs=True

/content/Amazon_Rec_Systems/ml
2022-10-12 18:24:06.333637: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-12 18:24:06.993835: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2022-10-12 18:24:06.993955: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
I1012 18:24:08.557184 140442028676992 example_generation_amazon_2.py:420] Downloading and extracting data.
Downloading data from https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Outdoors_v1_00.tsv.gz
I1012 18:24:

In [5]:
train_filename = "./data/examples/train_amazon_1m.tfrecord"
train = tf.data.TFRecordDataset(train_filename)

test_filename = "./data/examples/test_amazon_1m.tfrecord"
test = tf.data.TFRecordDataset(test_filename)



In [6]:
feature_description = {
    'context_product_id': tf.io.FixedLenFeature([10], tf.int64, default_value=np.repeat(0, 10)),
    'context_product_rating': tf.io.FixedLenFeature([10], tf.float32, default_value=np.repeat(0, 10)),
    'label_product_id': tf.io.FixedLenFeature([1], tf.int64, default_value=0),
}

def _parse_function(example_proto):
  return tf.io.parse_single_example(example_proto, feature_description)

train_ds = train.map(_parse_function).map(lambda x: {
    "context_product_id": tf.strings.as_string(x["context_product_id"]),
    "label_product_id": tf.strings.as_string(x["label_product_id"])
})

test_ds = test.map(_parse_function).map(lambda x: {
    "context_product_id": tf.strings.as_string(x["context_product_id"]),
    "label_product_id": tf.strings.as_string(x["label_product_id"])
})

for x in train_ds.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'context_product_id': array([b'413855107', b'0', b'0', b'0', b'0', b'0', b'0', b'0', b'0', b'0'],
      dtype=object),
 'label_product_id': array([b'350418244'], dtype=object)}


We also need product dictionary. 

In [7]:
ratings = tfds.load("amazon_us_reviews/Outdoors_v1_00", split="train")


[1mDownloading and preparing dataset 428.16 MiB (download: 428.16 MiB, generated: Unknown size, total: 428.16 MiB) to /root/tensorflow_datasets/amazon_us_reviews/Outdoors_v1_00/0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/2302401 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/amazon_us_reviews/Outdoors_v1_00/0.1.0.incomplete5TGL81/amazon_us_reviews-…

[1mDataset amazon_us_reviews downloaded and prepared to /root/tensorflow_datasets/amazon_us_reviews/Outdoors_v1_00/0.1.0. Subsequent calls will reuse this data.[0m


In [8]:
products = ratings.map(lambda x: x['data']['product_parent'])
product_ids = products.batch(20_000)
unique_product_ids = np.unique(np.concatenate(list(product_ids)))

### Implementing Sequential model

We will use two-tower architecture. Both query and candidate towers will be the sequence of historic products encoded with Gated Recurrent Unit (GRU) layer. 

In [9]:
embedding_dimension = 32

query_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
      vocabulary=unique_product_ids, mask_token=None),
    tf.keras.layers.Embedding(len(unique_product_ids) + 1, embedding_dimension), 
    tf.keras.layers.GRU(embedding_dimension),
])

candidate_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_product_ids, mask_token=None),
  tf.keras.layers.Embedding(len(unique_product_ids) + 1, embedding_dimension)
])

The metrics, task and full model are defined similar to the basic retrieval model.




In [10]:
metrics = tfrs.metrics.FactorizedTopK(
  candidates=products.batch(128).map(candidate_model)
)

task = tfrs.tasks.Retrieval(
  metrics=metrics
)

class Model(tfrs.Model):

    def __init__(self, query_model, candidate_model):
        super().__init__()
        self._query_model = query_model
        self._candidate_model = candidate_model

        self._task = task

    def compute_loss(self, features, training=False):
        watch_history = features["context_product_id"]
        watch_next_label = features["label_product_id"]

        query_embedding = self._query_model(watch_history)       
        candidate_embedding = self._candidate_model(watch_next_label)

        return self._task(query_embedding, candidate_embedding, compute_metrics=not training)

### Fitting and Evaluating

In [11]:
sequential_model = Model(query_model, candidate_model)
sequential_model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [13]:
cached_train = train_ds.shuffle(10_000).batch(12800).cache()
cached_test = test_ds.batch(2560).cache()

In [14]:
sequential_model.fit(cached_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f5200887d90>

In [15]:
sequential_model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0011870628222823143,
 'factorized_top_k/top_5_categorical_accuracy': 0.0012235878966748714,
 'factorized_top_k/top_10_categorical_accuracy': 0.0012418503174558282,
 'factorized_top_k/top_50_categorical_accuracy': 0.001917563029564917,
 'factorized_top_k/top_100_categorical_accuracy': 0.0027028508484363556,
 'loss': 6713.29736328125,
 'regularization_loss': 0,
 'total_loss': 6713.29736328125}

Sequential model hasn't performed well on accuracy test. The accuracy rate of Top-10 is 0.12%. In comparison, item-to-item model accuracy rate is at 58.8%. 