# Retrieval model: item-to-item




The second model that we are planning to build has similar architecture to the 1st model, but in this case we will use two product models for both queery and candidate models.

### Imports

In [1]:
! pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
! pip install -q tensorflow-recommenders
! pip install -q --upgrade tensorflow-datasets
! pip install -q scann

In [3]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_recommenders as tfrs

# import interactive table 
from google.colab import data_table
data_table.enable_dataframe_formatter()

# set seed
tf.random.set_seed(42)

### Preparing the dataset

In [4]:
# mount G-Drive and load data
from google.colab import drive
drive.mount('/content/drive')

# load data subset 
gdrive_path = '/content/drive/MyDrive/ModelingData'
path = os.path.join(gdrive_path, "ratings")

ratings = tf.data.Dataset.load(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Select the basic features.

products = ratings.map(lambda x: x['data']['product_title'])

In [6]:
# train-test split
tf.random.set_seed(42)
shuffled = ratings.shuffle(92_096, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(92_096)
test = shuffled.skip(92_096).take(23_024)

In [7]:
# vocabulary to map raw feature values to embedding vectors
product_titles = products.batch(50_000)
unique_product_titles = np.unique(np.concatenate(list(product_titles)))

unique_product_titles[:10]

array([b'! Set 7 Colors Small S Replacement Bands + 1pc Free Small Grey Band With Clasp for Fitbit FLEX Only /No tracker/ 1pc Teal (Blue/Grey) 1pc Purple / Pink 1pc Red (Tangerine) 1pc Green 1pc Slate (Blue/Grey) 1pc Black 1pc Navy (Blue) Bands Wireless Activity Bracelet Sport Wristband Fit Bit Flex Bracelet Sport Arm Band Armband',
       b'! Small S 1pc Green 1pc Teal (Blue/Green) 1pc Red (Tangerine) Replacement Bands + 1pc Free Small Grey Band With Clasp for Fitbit FLEX Only /No tracker/ Wireless Activity Bracelet Sport Wristband Fit Bit Flex Bracelet Sport Arm Band Armband',
       b'! Small S 1pc Teal (Blue/Green) 1pc Purple / Pink Replacement Bands + 1pc Free Small Grey Band With Clasp for Fitbit FLEX Only /No tracker/ Wireless Activity Bracelet Sport Wristband Fit Bit Flex Bracelet Sport Arm Band Armband',
       b'"""SEASON SPECIAL"""THE ORIGINAL HEAVY DUTY BIG GRIZZLY COT-HEAVY DUTY QUALITY w/ IPHONE Holder & Drink Holder-High Quality Product-10 YEARS WARRANTY-84\xe2\x80\x9d L

In [8]:
# dimensionality of the query and candidate representations:
embedding_dimension = 64

### Implementing the model

In [9]:
# define model that will be used for both query and candidate submodels. 
product_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_product_titles, mask_token=None),
  tf.keras.layers.Embedding(len(unique_product_titles) + 1, embedding_dimension)
])

In [10]:
# define metric (categorical accuracy)
metrics = tfrs.metrics.FactorizedTopK(
  candidates=products.batch(128).map(product_model)
)

In [11]:
# define Retrieval task
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [12]:
# create a model based on TensorFlow Recommenders Model class
class AmazonModel(tfrs.Model):

  def __init__(self, user_model, product_model):
    super().__init__()
    self.product_model: tf.keras.Model = product_model
    self.user_model: tf.keras.Model = product_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # We pick out the product features and pass them into the user model.
    product_embeddings = self.product_model(features['data']["product_title"])
    # And pick out the product features and pass them into the product model,
    # getting embeddings back.
    positive_product_embeddings = self.product_model(features['data']["product_title"])

    # The task computes the loss and the metrics.
    return self.task(product_embeddings, positive_product_embeddings)

In [13]:
# initiate model
item_item_model = AmazonModel(product_model, product_model)
item_item_model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.5))

### Fitting and Evaluating the model

In [14]:
# shuffle, batch, and cache train and test data
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [15]:
# train the model
item_item_model.fit(cached_train, epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ff0585c0dd0>

In [22]:
# evaluate model
item_item_model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.4488794207572937,
 'factorized_top_k/top_5_categorical_accuracy': 0.5337473750114441,
 'factorized_top_k/top_10_categorical_accuracy': 0.5984190702438354,
 'factorized_top_k/top_50_categorical_accuracy': 0.6898453831672668,
 'factorized_top_k/top_100_categorical_accuracy': 0.7044388651847839,
 'loss': 6141.607421875,
 'regularization_loss': 0,
 'total_loss': 6141.607421875}

Item-to-item model is performing much better in comparison to baseline SVD model. However, we do see that there is still might be some overfitting since model performed on train data with higher accuracy than on test data. The training accuracy for Top-10 is 93.66% vs 59.84% on testing. 

### Serving and saving the model

In [23]:
# recommending Top-10 products for customer 52228204

# Create a item_item_model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(item_item_model.product_model)
# recommends products out of the entire products dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((products.batch(100), products.batch(100).map(item_item_model.product_model)))
)

# Get recommendations.
_, titles = index(tf.constant(["52228204"]))
print(f"Recommendations for user 52228204: {titles[0, :10]}")

Recommendations for user 52228204: [b'North Mountain Gear Leafy Camouflage Complete Hunting Leafy Ghillie Suit Jacket Pants Hood'
 b'Sports Instruments PRO 9 Heart Rate Monitor'
 b'Bam Heartagram Pro New HIM 7.75 Skateboard Deck w/ Element Grip'
 b"Outdoor Research Men's Versaliner, Black, 2.8 oz"
 b"Outdoor Research Men's Versaliner, Black, 2.8 oz"
 b'Oury Mountain Magic Grips' b'Oury Mountain Magic Grips'
 b'Smith Safety Gear Elite Knee Pads' b'Smith Safety Gear Elite Knee Pads'
 b'Smith Safety Gear Elite Knee Pads']


There is still some repetition of recommendations, but not as extreme as in the basemodel. 

In [24]:
# model serving: saving the model to G-Drive

# Export the query model.
gdrive_path = '/content/drive/MyDrive/Models'
path = os.path.join(gdrive_path, "item_item_model")

# Save the index.
tf.saved_model.save(index, path)

# Load it back; can also be done in TensorFlow Serving.
item_item_model_2 = tf.saved_model.load(path)

# Pass a user id in, get top predicted movie titles back.
scores, titles = item_item_model_2(["52228204"])

print(f"Recommendations: {titles[0][:3]}")




Recommendations: [b'North Mountain Gear Leafy Camouflage Complete Hunting Leafy Ghillie Suit Jacket Pants Hood'
 b'Sports Instruments PRO 9 Heart Rate Monitor'
 b'Bam Heartagram Pro New HIM 7.75 Skateboard Deck w/ Element Grip']


Adding ScaNN layer for quick retrieval and saving it to G-Drive. 

In [26]:
# adding ScaNN layer
scann_index = tfrs.layers.factorized_top_k.ScaNN(item_item_model.product_model)
scann_index.index_from_dataset(
  tf.data.Dataset.zip((products.batch(100), products.batch(100).map(item_item_model.product_model)))
)

<tensorflow_recommenders.layers.factorized_top_k.ScaNN at 0x7ff05366d550>

In [29]:
# Get recommendations.
_, titles = scann_index(tf.constant(["52228204"]))
print(f"Recommendations for user 52228204: {titles[0, :10]}")

Recommendations for user 52228204: [b'Assassins Creed Brotherhood Dozen Throwing Knives'
 b'Assassins Creed Brotherhood Dozen Throwing Knives'
 b"Elite Cycling Project Men's Speed Cycling Jersey"
 b"Elite Cycling Project Men's Speed Cycling Jersey"
 b"Elite Cycling Project Men's Speed Cycling Jersey"
 b'Oury Mountain Magic Grips' b'Oury Mountain Magic Grips'
 b'Eddie Bauer Trailhead Pack'
 b"Outdoor Research Men's Versaliner, Black, 2.8 oz"
 b"Outdoor Research Men's Versaliner, Black, 2.8 oz"]


In [28]:
# exporting ScaNN layer

# Export the query model.
gdrive_path = '/content/drive/MyDrive/Models'
path = os.path.join(gdrive_path, "item_item_model")

# Save the index.
tf.saved_model.save(
    index,
    path,
    options=tf.saved_model.SaveOptions(namespace_whitelist=["Scann"])
)

# Load it back; can also be done in TensorFlow Serving.
item_item_model_2 = tf.saved_model.load(path)

# Pass a user id in, get top predicted movie titles back.
scores, titles = item_item_model_2(["52228204"])

print(f"Recommendations: {titles[0][:10]}")



Recommendations: [b'North Mountain Gear Leafy Camouflage Complete Hunting Leafy Ghillie Suit Jacket Pants Hood'
 b'Sports Instruments PRO 9 Heart Rate Monitor'
 b'Bam Heartagram Pro New HIM 7.75 Skateboard Deck w/ Element Grip'
 b"Outdoor Research Men's Versaliner, Black, 2.8 oz"
 b"Outdoor Research Men's Versaliner, Black, 2.8 oz"
 b'Oury Mountain Magic Grips' b'Oury Mountain Magic Grips'
 b'Smith Safety Gear Elite Knee Pads' b'Smith Safety Gear Elite Knee Pads'
 b'Smith Safety Gear Elite Knee Pads']


Overall, we do see a significant performance improvement on item-to-item model in comparison to the base SVD model. Train accuracy for Top-5 is 90.51% and 52.45% on test for item-item model vs 77% on train and 0.03% on test for top-5 on SVD model. 