# Modeling Notebook: Retrieval with multiple features

---



In this Retrieval model we are incorporatings timestamp data ('review_date' feature) and performing Convolutional Neural Networks (CNN) on product title. 

### Imports

In [1]:
! pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
! pip install -q tensorflow-recommenders
! pip install -q --upgrade tensorflow-datasets
! pip install -q scann

[K     |████████████████████████████████| 89 kB 7.9 MB/s 
[K     |████████████████████████████████| 4.7 MB 15.2 MB/s 
[K     |████████████████████████████████| 10.4 MB 17.8 MB/s 
[K     |████████████████████████████████| 578.0 MB 14 kB/s 
[K     |████████████████████████████████| 438 kB 74.5 MB/s 
[K     |████████████████████████████████| 1.7 MB 77.4 MB/s 
[K     |████████████████████████████████| 5.9 MB 74.1 MB/s 
[?25h

In [3]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_recommenders as tfrs

# import interactive table 
from google.colab import data_table
data_table.enable_dataframe_formatter()

# set seed
tf.random.set_seed(42)

### Preparing dataset

In [43]:
# mount G-Drive and load data
from google.colab import drive
drive.mount('/content/drive')

# load data subset 
gdrive_path = '/content/drive/MyDrive/ModelingData'
path = os.path.join(gdrive_path, "ratings")

ratings = tf.data.Dataset.load(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
products = ratings.map(lambda x: x['data']['product_title'])

In [45]:
# Select the basic features.

ratings = ratings.map(lambda x: {
    'product_title': x['data']['product_title'],
    'customer_id': x['data']['customer_id'], 
    'timestamp': tf.strings.to_number(tf.strings.regex_replace(x['data']['review_date'], "-", ""), tf.int32),
    })

In [46]:
# prepare timestamp feature vocabulary

timestamps = np.concatenate(list(ratings.map(lambda x: x["timestamp"]).batch(100)))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000,
)


In [47]:
# preparing feature vocabularies

unique_product_titles = np.unique(np.concatenate(list(ratings.map(lambda x: x["product_title"]).batch(1000))))
unique_user_ids = np.unique(np.concatenate(list(ratings.batch(1_000).map(
    lambda x: x["customer_id"]))))

### Query model

In [8]:
user_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_user_ids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32)
])

In [55]:
# if using timestamp
class UserModel(tf.keras.Model):

  def __init__(self, use_timestamps):
    super().__init__()

    self._use_timestamps = use_timestamps

    self.user_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32),
    ])

    if use_timestamps:
      self.timestamp_embedding = tf.keras.Sequential([
          tf.keras.layers.Discretization(timestamp_buckets.tolist()),
          tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
      ])
      self.normalized_timestamp = tf.keras.layers.Normalization(
          axis=None
      )

      self.normalized_timestamp.adapt(timestamps)

  def call(self, inputs):
    if not self._use_timestamps:
      return self.user_embedding(inputs["user_id"])

    return tf.concat([
        self.user_embedding(inputs["user_id"]),
        self.timestamp_embedding(inputs["timestamp"]),
        tf.reshape(self.normalized_timestamp(inputs["timestamp"]), (-1, 1)),
    ], axis=1)

### Candidate model

In [56]:
class ProductModel(tf.keras.Model):

  def __init__(self):
    super().__init__()

    max_tokens = 10_000

    self.title_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_product_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_product_titles) + 1, 32)
    ])

    self.title_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens)

    self.title_text_embedding = tf.keras.Sequential([
      self.title_vectorizer,
      tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
      tf.keras.layers.GlobalAveragePooling1D(),
    ])

    self.title_vectorizer.adapt(products)

  def call(self, titles):
    return tf.concat([
        self.title_embedding(titles),
        self.title_text_embedding(titles),
    ], axis=1)

### Combined model

In [57]:
class AmazonModel(tfrs.models.Model):

  def __init__(self):
    super().__init__()
    self.query_model = tf.keras.Sequential([
      user_model,
      tf.keras.layers.Dense(32)
    ])
    self.candidate_model = tf.keras.Sequential([
      ProductModel(),
      tf.keras.layers.Dense(32)
    ])
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=products.batch(128).map(self.candidate_model),
        ),
    )

  def compute_loss(self, features, training=False):
    # We only pass the user id and timestamp features into the query model. This
    # is to ensure that the training inputs would have the same keys as the
    # query inputs. Otherwise the discrepancy in input structure would cause an
    # error when loading the query model after saving it.
    query_embeddings = self.query_model(features["customer_id"])
    product_embeddings = self.candidate_model(features["product_title"])

    return self.task(query_embeddings, product_embeddings)

In [59]:
class AmazonModel(tfrs.models.Model):

  def __init__(self, use_timestamps):
    super().__init__()
    self.query_model = tf.keras.Sequential([
      UserModel(use_timestamps),
      tf.keras.layers.Dense(32)
    ])
    self.candidate_model = tf.keras.Sequential([
      ProductModel(),
      tf.keras.layers.Dense(32)
    ])
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=products.batch(128).map(self.candidate_model),
        ),
    )

  def compute_loss(self, features, training=False):
    # We only pass the user id and timestamp features into the query model. This
    # is to ensure that the training inputs would have the same keys as the
    # query inputs. Otherwise the discrepancy in input structure would cause an
    # error when loading the query model after saving it.
    query_embeddings = self.query_model({
        "user_id": features["customer_id"],
        "timestamp": features["timestamp"],
    })
    product_embeddings = self.candidate_model(features["product_title"])

    return self.task(query_embeddings, product_embeddings)

### Implementing and Evaluating model

In [51]:
# train-test split
tf.random.set_seed(42)
shuffled = ratings.shuffle(115120, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(92_096)
test = shuffled.skip(92_096).take(23_024)

cached_train = train.shuffle(115120).batch(2048)
cached_test = test.batch(4096).cache()

In [60]:
# initiate model
combined_model = AmazonModel(use_timestamps = True)
combined_model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.5))

In [61]:
# train the combined_model
combined_model.fit(cached_train, epochs=5)

train_accuracy = combined_model.evaluate(
    cached_train, return_dict=True)["factorized_top_k/top_10_categorical_accuracy"]
test_accuracy = combined_model.evaluate(
    cached_test, return_dict=True)["factorized_top_k/top_10_categorical_accuracy"]

print(f"Top-10 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-10 accuracy (test): {test_accuracy:.2f}.")

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Top-100 accuracy (train): 0.72.
Top-100 accuracy (test): 0.00.


In [67]:
# evaluate model
combined_model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 4.343294131103903e-05,
 'factorized_top_k/top_5_categorical_accuracy': 0.00017373176524415612,
 'factorized_top_k/top_10_categorical_accuracy': 0.0002605976478662342,
 'factorized_top_k/top_50_categorical_accuracy': 0.000998957664705813,
 'factorized_top_k/top_100_categorical_accuracy': 0.0018676164327189326,
 'loss': 23408.94140625,
 'regularization_loss': 0,
 'total_loss': 23408.94140625}

Even though the combined model performed well during the training and resulted in accuracy rate of 72.15% for Top-10 recommendations, it performed poorly on the test data resulting at 0.02% accuracy. 

### BruteForce serving

In [62]:
# recommending Top-10 products for customer 52228204

# Create a combined_model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(combined_model.candidate_model)
# recommends products out of the entire products dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((products.batch(100), products.batch(100).map(combined_model.candidate_model)))
)

# Get recommendations.
_, titles = index(tf.constant(["52228204"]))
print(f"Recommendations for user 52228204: {titles[0, :10]}")

Recommendations for user 52228204: [b'Micro Pedalflow' b'K2 55 mm Wheel (4-pack)'
 b'SRAM FORCE Rear Derailleur WiFli- Medium'
 b'Plastic Black Nomad Sunglasses (1-Pack of 12)'
 b'Plastic Black Nomad Sunglasses (1-Pack of 12)'
 b'Plastic Black Nomad Sunglasses (1-Pack of 12)'
 b"Waterprood R30 8oz Women's Long Sleeve Rashgaurd"
 b"Waterprood R30 8oz Women's Long Sleeve Rashgaurd"
 b'Wald 1392 Standard Large Front Handlebar Bike Basket'
 b"O'Neill Heat 3Q Zip 4/3 FSW (Black)"]


There is still some repetition of recommendations, but not as extreme as in the basemodel. 

In [63]:
# model serving: saving the model to G-Drive

# Export the query model.
gdrive_path = '/content/drive/MyDrive/Models'
path = os.path.join(gdrive_path, "combined_model")

# Save the index.
tf.saved_model.save(index, path)

# Load it back; can also be done in TensorFlow Serving.
combined_model_2 = tf.saved_model.load(path)

# Pass a user id in, get top predicted movie titles back.
scores, titles = combined_model_2(["52228204"])

print(f"Recommendations: {titles[0][:3]}")




Recommendations: [b'Micro Pedalflow' b'K2 55 mm Wheel (4-pack)'
 b'SRAM FORCE Rear Derailleur WiFli- Medium']


### ScaNN

Adding ScaNN layer for quick retrieval and saving it to G-Drive. 

In [64]:
# adding ScaNN layer
scann_index = tfrs.layers.factorized_top_k.ScaNN(combined_model.candidate_model)
scann_index.index_from_dataset(
  tf.data.Dataset.zip((products.batch(100), products.batch(100).map(combined_model.candidate_model)))
)

<tensorflow_recommenders.layers.factorized_top_k.ScaNN at 0x7f2b3b28f490>

In [65]:
# Get recommendations.
_, titles = scann_index(tf.constant(["52228204"]))
print(f"Recommendations for user 52228204: {titles[0, :10]}")

Recommendations for user 52228204: [b'Micro Pedalflow' b'SNOWBOARD BINDINGS XL strap in Snowjam XL black NEW'
 b'K2 55 mm Wheel (4-pack)' b"O'Neill Heat 3Q Zip 4/3 FSW (Black)"
 b'Shimano RD-M773-10 XTShadow Rear Derailleur - 10Sp SGS Black/Silver'
 b'Fox Labs FX-42H Mark 5 Cop Top 4oz Stream Pepper Spray'
 b'Wald 1392 Standard Large Front Handlebar Bike Basket'
 b'Swisstop RacePro Brake Pads (fits Camp 10/11sp)'
 b'OXO - Good Grips Bagel Grip' b'OXO - Good Grips Bagel Grip']


In [66]:
# exporting ScaNN layer

# Export the query model.
gdrive_path = '/content/drive/MyDrive/Models'
path = os.path.join(gdrive_path, "combined_model")

# Save the index.
tf.saved_model.save(
    index,
    path,
    options=tf.saved_model.SaveOptions(namespace_whitelist=["Scann"])
)

# Load it back; can also be done in TensorFlow Serving.
combined_model_2 = tf.saved_model.load(path)

# Pass a user id in, get top predicted movie titles back.
scores, titles = combined_model_2(["52228204"])

print(f"Recommendations: {titles[0][:10]}")



Recommendations: [b'Micro Pedalflow' b'K2 55 mm Wheel (4-pack)'
 b'SRAM FORCE Rear Derailleur WiFli- Medium'
 b'Plastic Black Nomad Sunglasses (1-Pack of 12)'
 b'Plastic Black Nomad Sunglasses (1-Pack of 12)'
 b'Plastic Black Nomad Sunglasses (1-Pack of 12)'
 b"Waterprood R30 8oz Women's Long Sleeve Rashgaurd"
 b"Waterprood R30 8oz Women's Long Sleeve Rashgaurd"
 b'Wald 1392 Standard Large Front Handlebar Bike Basket'
 b"O'Neill Heat 3Q Zip 4/3 FSW (Black)"]


Even though the combined model performed well during the training and resulted in accuracy rate of 72.15% for Top-10 recommendations, it performed poorly on the test data resulting at 0.02% accuracy.

Out of 10 recommendations, I found one to be useful for this customer. 