# Retrieval item-to-item model on full dataset

Since the Retrieval item-to-item model has performed the best, we would like to test it on the full dataset.

### Imports

In [None]:
! pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
! pip install -q tensorflow-recommenders
! pip install -q --upgrade tensorflow-datasets
! pip install -q scann

[K     |████████████████████████████████| 89 kB 7.3 MB/s 
[K     |████████████████████████████████| 4.7 MB 20.3 MB/s 
[K     |████████████████████████████████| 10.4 MB 11.8 MB/s 
[K     |████████████████████████████████| 578.0 MB 15 kB/s 
[K     |████████████████████████████████| 438 kB 65.5 MB/s 
[K     |████████████████████████████████| 1.7 MB 48.2 MB/s 
[K     |████████████████████████████████| 5.9 MB 58.2 MB/s 
[?25h

In [None]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_recommenders as tfrs

# import interactive table 
from google.colab import data_table
data_table.enable_dataframe_formatter()

# set seed
tf.random.set_seed(42)

### Preparing the dataset

In [None]:
# Load the complete "Outdoors" category of the Amazon US reviews dataset,
# together with the dataset metadata object.
ratings, info = tfds.load(
    "amazon_us_reviews/Outdoors_v1_00",
    split="train",
    with_info=True,
)

[1mDownloading and preparing dataset 428.16 MiB (download: 428.16 MiB, generated: Unknown size, total: 428.16 MiB) to /root/tensorflow_datasets/amazon_us_reviews/Outdoors_v1_00/0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/2302401 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/amazon_us_reviews/Outdoors_v1_00/0.1.0.incompleteJH128M/amazon_us_reviews-…

[1mDataset amazon_us_reviews downloaded and prepared to /root/tensorflow_datasets/amazon_us_reviews/Outdoors_v1_00/0.1.0. Subsequent calls will reuse this data.[0m


In [None]:
# Keep only the product title of each review: one title per review record,
# so the same title can appear many times in this dataset.
products = ratings.map(lambda review: review["data"]["product_title"])

In [None]:
# Train/test split: 1,841,921 training and 460,480 test reviews.
tf.random.set_seed(42)

# The shuffle buffer spans all 2,302,401 reviews, and the order is fixed
# (seeded, not reshuffled on each iteration) so that take()/skip() below
# always see the same sequence.
shuffled = ratings.shuffle(
    2_302_401,
    seed=42,
    reshuffle_each_iteration=False,
)

train = shuffled.take(1_841_921)
test = shuffled.skip(1_841_921).take(460_480)

In [None]:
# Build the vocabulary used to map raw product titles to embedding vectors:
# read the titles in batches of 50,000, join the batches into one array and
# keep the distinct (sorted) values.
product_titles = products.batch(50_000)
unique_product_titles = np.unique(
    np.concatenate([title_batch.numpy() for title_batch in product_titles])
)

# Preview the first ten vocabulary entries.
unique_product_titles[:10]

array([b'! 1pc Small S Navy Blue Replacement Band + Free Small Grey Band With Clasp for Fitbit FLEX Only /No tracker/ Wireless Activity Bracelet Sport Wristband Fit Bit Flex Bracelet Sport Arm Band Armband',
       b'! 1pc Small S Purple / Pink Replacement Band + Free Small Grey Band With Clasp for Fitbit FLEX Only /No tracker/ Wireless Activity Bracelet Sport Wristband Fit Bit Flex Bracelet Sport Arm Band Armband',
       b'! 1pc Small S Teal (Blue/Green) Replacement Band + Free Small Grey Band With Clasp for Fitbit FLEX Only /No tracker/ Wireless Activity Bracelet Sport Wristband Fit Bit Flex Bracelet Sport Arm Band Armband',
       b'! 2pcs Small S Red (Tangerine) Replacement Bands + 1pc Free Small Grey Band With Clasp for Fitbit FLEX Only /No tracker/ Wireless Activity Bracelet Sport Wristband Fit Bit Flex Bracelet Sport Arm Band Armband',
       b'! Large L 1pc Light Blue 1pc White Replacement Bands + 1pc Free Large Grey Band With Clasp for Fitbit FLEX Only /No tracker/ Wireless A

### Implementing the model

In [None]:
# Dimensionality of the query and candidate representations, i.e. the length
# of the embedding vector learned for every product title.
embedding_dimension = 64

In [None]:
# Product tower, used as both the query and the candidate sub-model:
# a string lookup turns a title into an integer index, and an embedding layer
# turns that index into a dense vector.
product_model = tf.keras.Sequential(
    [
        tf.keras.layers.StringLookup(
            vocabulary=unique_product_titles,
            mask_token=None,
        ),
        # One extra row on top of the vocabulary size.
        tf.keras.layers.Embedding(
            input_dim=len(unique_product_titles) + 1,
            output_dim=embedding_dimension,
        ),
    ]
)

In [None]:
# Define the top-K retrieval metric.
#
# The candidate set must contain each product exactly once. `products` holds
# one title per *review*, so using it directly would put every product into
# the candidate set once per review it received: duplicates would occupy
# several of the top-K slots and every evaluation step would have to score
# millions of redundant candidates. The de-duplicated vocabulary is used
# instead.
metrics = tfrs.metrics.FactorizedTopK(
  candidates=tf.data.Dataset.from_tensor_slices(unique_product_titles)
      .batch(1032)
      .map(product_model)
)

In [None]:
# Retrieval task: bundles the retrieval loss with the top-K metric defined
# in `metrics`.
task = tfrs.tasks.Retrieval(metrics=metrics)

In [None]:
# create a model based on TensorFlow Recommenders Model class
class AmazonModel(tfrs.Model):
  """Item-to-item retrieval model.

  Both the query and the candidate are the product title of a review, so the
  two towers receive the same input feature.

  Args:
    user_model: Model producing the query embeddings from product titles.
    product_model: Model producing the candidate embeddings from product
      titles.
  """

  def __init__(self, user_model, product_model):
    super().__init__()
    self.product_model: tf.keras.Model = product_model
    # Store the model that was actually passed in; previously this argument
    # was ignored and `product_model` was assigned here instead.
    self.user_model: tf.keras.Model = user_model
    # NOTE: the retrieval task is taken from the module-level `task` variable.
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the retrieval loss for one batch.

    Args:
      features: Batch of review records; only
        ``features['data']['product_title']`` is read.
      training: Whether the call happens during training. Metrics are only
        computed when this is False.

    Returns:
      The value returned by the retrieval task for this batch.
    """
    # The product title is the single feature used by both towers.
    product_titles = features['data']["product_title"]

    # Query embeddings come from the query tower ...
    product_embeddings = self.user_model(product_titles)
    # ... and candidate (positive) embeddings from the product tower.
    positive_product_embeddings = self.product_model(product_titles)

    # The task computes the loss and the metrics.
    return self.task(product_embeddings, positive_product_embeddings, compute_metrics=not training)

In [None]:
# Instantiate the item-to-item model: the same product tower serves as both
# the query model and the candidate model.
item_item_model_2 = AmazonModel(
    user_model=product_model,
    product_model=product_model,
)
item_item_model_2.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.5),
)

### Fitting and Evaluating the model

In [None]:
# Training input: shuffle with a buffer of 1,841,921 elements, group into
# batches of 8192 and cache the batched result.
cached_train = (
    train
    .shuffle(1_841_921)
    .batch(8192)
    .cache()
)

# Test input: no shuffling, batches of 4096, cached.
cached_test = (
    test
    .batch(4096)
    .cache()
)

In [None]:
# Train for three passes over the cached training batches.
item_item_model_2.fit(
    cached_train,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f017754d110>

In [None]:
# Evaluate the trained model on the cached test batches; return_dict=True
# requests the results as a dictionary rather than a list.
item_item_model_2.evaluate(cached_test, return_dict=True)

 23/113 [=====>........................] - ETA: 7:32:23 - factorized_top_k/top_1_categorical_accuracy: 0.5760 - factorized_top_k/top_5_categorical_accuracy: 0.6040 - factorized_top_k/top_10_categorical_accuracy: 0.6336 - factorized_top_k/top_50_categorical_accuracy: 0.7287 - factorized_top_k/top_100_categorical_accuracy: 0.7786 - loss: 2752.6847 - regularization_loss: 0.0000e+00 - total_loss: 2752.6847

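The item-to-item model trained on the full dataset has performed even better on the test data: the top-10 accuracy on the test data for the full dataset is 63.36%, versus 59.75% on the data subset. Note that the 63.36% figure is the running value reported after 23 of the 113 evaluation batches, as shown in the output above.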