# Data Science Festival x ASOS
## Build and Deploy a Recommender System in 3 Hours.

# Imports

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os

# Import training data

In [None]:
# Base location of the workshop data files.
DATA_URL = "https://raw.githubusercontent.com/ASOS/dsf2020/main/"

# Training and validation frames.
train = pd.read_parquet(DATA_URL + "dsf_asos_train_with_alphanumeric_dummy_ids.parquet")
valid = pd.read_parquet(DATA_URL + "dsf_asos_valid_with_alphanumeric_dummy_ids.parquet")

# Header-less CSVs, flattened into 1-D arrays of ids.
dummy_users = (
    pd.read_csv(DATA_URL + "dsf_asos_dummy_users_with_alphanumeric_dummy_ids.csv", header=None)
    .values.flatten()
    .astype(str)
)
products = (
    pd.read_csv(DATA_URL + "dsf_asos_productIds.csv", header=None)
    .values.flatten()
    .astype(int)
)

# The briefest intro to tf

Tensors

In [None]:
# A rank-1 integer tensor; the last expression shows its element-wise square.
x = tf.constant([1, 2, 3, 4])
tf.math.square(x)

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([ 1,  4,  9, 16], dtype=int32)>

Gradients

Multiply and add tensors

In [None]:
# x is a 1x3 row vector, Y a 3x4 matrix (three identical rows).
x = tf.constant([[1, 2, 3]], dtype=tf.float32)
Y = tf.constant(
    [
        [1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4],
    ],
    dtype=tf.float32,
)

In [None]:
z = tf.constant([10, 11, 12, 13], dtype=tf.float32)

This operation is very common in deep learning, so it has been abstracted:

You can choose to apply a function to each value in the output

We can put different layers together in a sequence:

In [None]:
# Dense layer with one output unit, no bias, and a fixed 4x1 weight matrix.
dl3 = tf.keras.layers.Dense(
    1,
    use_bias=False,
    weights=[tf.constant([[0], [1], [0], [1]], dtype=tf.float32)],
)

We can get more flexibility if you use tf.keras.model:

So far we have been setting the weights of the dense layers, but if we don't set the weights than weights get randomly chosen.

In [None]:
# No weights supplied here, so the layer creates its own when first called.
dl6 = tf.keras.layers.Dense(units=4, use_bias=True)
dl6(x)

In [None]:
dl6.get_weights()

# Define a Recommender Model

The embedding layer gives a list of random numbers for each user and each product.

In [None]:
# Embedding table with 5 rows of 8 values each; look up row 2.
embedl = tf.keras.layers.Embedding(input_dim=5, output_dim=8)
embedl(2)

<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([-0.03054699, -0.00557311,  0.03459969, -0.0118145 , -0.01649247,
        0.00949558,  0.00665691, -0.03847488], dtype=float32)>

In [None]:
embedl.get_weights()

[array([[-0.04663273, -0.014363  ,  0.02206874, -0.04027362, -0.04071765,
          0.00598662, -0.04959073,  0.03661405],
        [ 0.01003055,  0.00909718, -0.04359809,  0.04971406, -0.00294499,
          0.04058051,  0.03917206,  0.02313646],
        [-0.03054699, -0.00557311,  0.03459969, -0.0118145 , -0.01649247,
          0.00949558,  0.00665691, -0.03847488],
        [-0.03939084,  0.01224681,  0.02019801, -0.04419005, -0.02599722,
          0.00136926, -0.00150108,  0.00914472],
        [ 0.04016359, -0.00945417,  0.02341981, -0.04259541,  0.00771213,
         -0.012178  ,  0.00743093, -0.03526945]], dtype=float32)]

Scores can be found using the dot product.

In [None]:
# One 6-value embedding row per known user and per known product.
dummy_user_embedding = tf.keras.layers.Embedding(input_dim=len(dummy_users), output_dim=6)
product_embedding = tf.keras.layers.Embedding(input_dim=len(products), output_dim=6)


In [None]:
dummy_user_embedding(1)

<tf.Tensor: shape=(6,), dtype=float32, numpy=
array([ 0.01648981, -0.01294916, -0.00169629,  0.03595338,  0.02817022,
        0.03011284], dtype=float32)>

In [None]:
product_embedding(99)

<tf.Tensor: shape=(6,), dtype=float32, numpy=
array([-0.02679669,  0.03163424, -0.00756314,  0.01552178,  0.04847943,
       -0.03336507], dtype=float32)>

In [None]:
# Contract axis 0 of both 6-element embedding vectors: their dot product, a scalar score.
tf.tensordot(dummy_user_embedding(1), product_embedding(99), axes=[[0], [0]])

<tf.Tensor: shape=(), dtype=float32, numpy=8.0339654e-05>

We can score multiple products at the same time, which is what we need to create a ranking.

In [None]:
example_product = tf.constant([1, 77, 104, 2062])
product_embedding(example_product)

<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
array([[ 0.03424526, -0.00072784,  0.0413002 , -0.02568839,  0.03860142,
         0.03721027],
       [ 0.01171565,  0.03346721,  0.02360849, -0.01451224, -0.04788959,
        -0.031898  ],
       [ 0.00955091, -0.03971   , -0.04802005, -0.04439354,  0.0164662 ,
        -0.0086584 ],
       [-0.02595018, -0.03641059, -0.01327674,  0.02028239,  0.0219014 ,
         0.03837201]], dtype=float32)>

In [None]:
# Contract the user vector (axis 0) with the embedding axis (axis 1) of the
# (4, 6) product matrix, giving one score per example product.
tf.tensordot(dummy_user_embedding(1), product_embedding(example_product), axes=[[0], [1]])

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([ 0.0017884 , -0.00311159, -0.00063981,  0.00256777], dtype=float32)>

And we can score multiple users for multiple products which we will need to do if we are to train quickly.

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

But we need to map product ids to embedding ids.

In [None]:
# Map each product id to its position in `products`; unknown ids map to -1.
product_table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(products, dtype=tf.int32),
        values=range(len(products)),
    ),
    default_value=-1,
)

In [None]:
# NOTE(review): 8650074 is not among the ids shown for `products` (the first
# listed id is 8650774), so this lookup falls back to the table default of -1.
# Confirm whether the miss is deliberate or a typo.
product_table.lookup(tf.constant([8650074]))

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([-1], dtype=int32)>

Let's put those two things together

In [None]:
class SimpleRecommender(tf.keras.Model):
    """Dot-product recommender over learned user and product embeddings.

    Raw user ids (strings) and product ids (int32) are translated to embedding
    rows through static hash tables; ids that are missing from a table map to -1.
    """

    def __init__(self, dummy_users, products, length_of_embedding):
        """Build the id lookup tables and the embedding layers.

        Args:
            dummy_users: sequence of user id strings; position gives the embedding row.
            products: sequence of integer product ids; position gives the embedding row.
            length_of_embedding: number of values in each user / product embedding.
        """
        super().__init__()
        self.products = tf.constant(products, dtype=tf.int32)
        self.dummy_users = tf.constant(dummy_users, dtype=tf.string)
        self.dummy_user_table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(self.dummy_users, range(len(dummy_users))), -1)
        self.product_table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(self.products, range(len(products))), -1)

        self.user_embedding = tf.keras.layers.Embedding(len(dummy_users), length_of_embedding)
        self.product_embedding = tf.keras.layers.Embedding(len(products), length_of_embedding)

        # Contract over the last (embedding) axis. `axes=1` would instead pair the
        # user axis (length 1) with the candidate axis (length n) and raise a
        # ValueError whenever n != 1; it also mismatches in `call_item_item`.
        self.dot = tf.keras.layers.Dot(axes=-1)

    def call(self, inputs):
        """Score every candidate product for each user.

        Args:
            inputs: pair `(user, products)`. In this notebook `user` is a
                (batch, 1) string tensor and `products` a (batch, n) int32 tensor.

        Returns:
            Tensor of scores with axis 1 of the dot-product result squeezed out;
            (batch, n) for the input shapes above.
        """
        user = inputs[0]
        products = inputs[1]

        user_embedding_index = self.dummy_user_table.lookup(user)
        product_embedding_index = self.product_table.lookup(products)

        user_embedding_values = self.user_embedding(user_embedding_index)
        product_embedding_values = self.product_embedding(product_embedding_index)

        return tf.squeeze(self.dot([user_embedding_values, product_embedding_values]), 1)

    @tf.function
    def call_item_item(self, product):
        """Return the 100 products whose embeddings score highest against `product`.

        Args:
            product: int32 product id; the notebook traces this function with a
                scalar `TensorSpec(shape=())`.

        Returns:
            Tuple `(top_ids, top_scores)` of product ids and their scores,
            in the order produced by `tf.math.top_k`.
        """
        product_x = self.product_table.lookup(product)
        pe = tf.expand_dims(self.product_embedding(product_x), 0)

        # Note: this only works if the embedding layer has already been built.
        all_pe = tf.expand_dims(self.product_embedding.embeddings, 0)
        scores = tf.reshape(self.dot([pe, all_pe]), [-1])

        top_scores, top_indices = tf.math.top_k(scores, k=100)
        top_ids = tf.gather(self.products, top_indices)
        return top_ids, top_scores

In [None]:
dummy_users

array(['pmfkU4BNZhmtLgJQwJ7x', 'UDRRwOlzlWVbu7H8YCCi',
       'QHGAef0TI6dhn0wTogvW', ..., 'lcORJ5hemOZc1iGo9z7k',
       '5CqDquDAszqJp27P7AL8', 'SSPNYxJMfuKhoe1dg24m'], dtype='<U20')

In [None]:
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

In [None]:
srl = SimpleRecommender(dummy_users, products, 15)

# Two users, each scored against three candidate products.
example_users = tf.constant([['pmfkU4BNZhmtLgJQwJ7x'], ['UDRRwOlzlWVbu7H8YCCi']])
example_candidates = tf.constant([[8650774, 9306139, 9961521],
                                  [12058614, 12058615, 11927550]])
srl([example_users, example_candidates])

ValueError: ignored

# Creating a dataset

First create a tf.data.Dataset from the user purchase pairs.

In [None]:
# Column slices keep a trailing axis of size 1 on each tensor.
dummy_user_tensor = tf.constant(train[["dummyUserId"]].values, dtype=tf.string)
product_tensor = tf.constant(train[["productId"]].values, dtype=tf.int32)

dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))

# Show only the first (user, product) pair.
for user, product in dataset.take(1):
    print(user)
    print(product)

tf.Tensor([b'PIXcm7Ru5KmntCy0yA1K'], shape=(1,), dtype=string)
tf.Tensor([10524048], shape=(1,), dtype=int32)


In [None]:
# Seven random positions into `products` (maxval is exclusive).
random_neg_indexes = tf.random.uniform(
    shape=(7,), minval=0, maxval=len(products), dtype=tf.int32
)
random_neg_indexes

<tf.Tensor: shape=(7,), dtype=int32, numpy=array([19973, 11058, 11436, 15782, 11821,  7759,  2366], dtype=int32)>

In [None]:
tf.gather(products, random_neg_indexes)

<tf.Tensor: shape=(7,), dtype=int64, numpy=
array([10535305,  9499252, 10382620, 11740907, 13409928, 11569180,
        8603141])>

For each purchase let's sample a number of products that the user did not purchase. Then the model can score each of the products and we will know we are doing a good job if the product with the highest score is the product that the user actually purchased.

We can do this using dataset.map

In [None]:
class Mapper():
    """Dataset map function that adds randomly sampled negative products.

    For each (user, product) purchase it builds a candidate list holding the
    purchased product first, followed by `num_negative_products` products drawn
    uniformly at random, plus a one-hot label marking position 0.
    """

    def __init__(self, possible_products, num_negative_products):
        """Store the product catalogue and precompute the label.

        Args:
            possible_products: sequence of integer product ids to sample negatives from.
            num_negative_products: number of random products sampled per purchase.
        """
        self.num_possible_products = len(possible_products)
        self.possible_products_tensor = tf.constant(possible_products, dtype=tf.int32)

        self.num_negative_products = num_negative_products
        # The purchased product always sits at index 0 of the candidates.
        self.y = tf.one_hot(0, num_negative_products+1)

    def __call__(self, user, product):
        """Return `((user, candidates), label)` for one purchase.

        Args:
            user: user id tensor for the purchase.
            product: 1-D int32 tensor holding the purchased product id.
        """
        random_neg_indexes = tf.random.uniform((self.num_negative_products, ), minval=0, maxval=self.num_possible_products, dtype=tf.int32)
        negatives = tf.gather(self.possible_products_tensor, random_neg_indexes)
        candidates = tf.concat([product, negatives], axis=0)
        # Return the full candidate list, not just the purchased product, so the
        # features line up with the (num_negative_products + 1)-wide label.
        return (user, candidates), self.y

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))
dataset = dataset.map(Mapper(products, 10))

# Each element is a (features, label) pair; show only the first one.
for features, label in dataset.take(1):
    print(features)
    print(label)


(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'PIXcm7Ru5KmntCy0yA1K'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([10524048], dtype=int32)>)
tf.Tensor([1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(11,), dtype=float32)


Let's bring the steps together to define a function which creates a dataset 

In [None]:
def get_dataset(df, products, num_negative_products, batch_size=1024):
    """Build a batched dataset of purchases with sampled negatives.

    Args:
        df: DataFrame with `dummyUserId` and `productId` columns.
        products: sequence of integer product ids that negatives are sampled from.
        num_negative_products: number of random products added per purchase.
        batch_size: number of purchases per batch (defaults to 1024).

    Returns:
        A `tf.data.Dataset` of the elements produced by `Mapper`, batched.
    """
    # Column slices (double brackets) keep a trailing axis of size 1.
    dummy_user_tensor = tf.constant(df[['dummyUserId']].values, dtype=tf.string)
    product_tensor = tf.constant(df[['productId']].values, dtype=tf.int32)

    dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))
    dataset = dataset.map(Mapper(products, num_negative_products))
    dataset = dataset.batch(batch_size)
    return dataset

In [None]:
# Inspect the first batch only.
for (users, candidates), labels in get_dataset(train, products, 4).take(1):
    print(users)
    print(candidates)
    print(labels)

tf.Tensor(
[[b'PIXcm7Ru5KmntCy0yA1K']
 [b'd0RILFB1hUzNSINMY4Ow']
 [b'Ebax7lyhnKRm4xeRlWW2']
 ...
 [b'xuX9n8PHfSR0AP3UZ8ar']
 [b'iNnxsPFfOa9884fMjVPJ']
 [b'aD8Mn12im8lFPzXAY41P']], shape=(1024, 1), dtype=string)
tf.Tensor(
[[10524048]
 [ 9137713]
 [ 5808602]
 ...
 [11541336]
 [ 7779232]
 [ 4941259]], shape=(1024, 1), dtype=int32)
tf.Tensor(
[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]], shape=(1024, 5), dtype=float32)


# Train a model

We need to compile a model, set the loss and create an evaluation metric. Then we need to train the model.

In [None]:
model = SimpleRecommender(dummy_users, products, 15)

# The model outputs raw scores, hence from_logits=True.
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.SGD(learning_rate=100.),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

# 100 sampled negatives per purchase for both training and validation.
train_dataset = get_dataset(train, products, 100)
valid_dataset = get_dataset(valid, products, 100)
model.fit(train_dataset, validation_data=valid_dataset, epochs=5)


Epoch 1/5


ValueError: ignored

Let's do a manual check on whether the model is any good.

In [None]:
test_product = 11698965

In [None]:
# Query the item-to-item recommender with a scalar int32 product id.
item_recs = model.call_item_item(tf.constant(test_product, dtype=tf.int32))
print("Recs for item {}: {}".format(test_product, item_recs))

ValueError: ignored

# Save the model

In [None]:
# Directory for the exported model.
# NOTE(review): the trailing "1" is presumably a model version number - confirm.
model_path = "models/recommender/1"

In [None]:
# Input spec for the exported function: a single scalar int32 (one product id).
inpute_signature = tf.TensorSpec(shape=(), dtype=tf.int32)

In [None]:
# Trace the item-to-item recommender for a scalar int32 product id and expose
# it under a named signature. (`r1` was never defined; the trained model is `model`.)
signatures = {'call_item_item': model.call_item_item.get_concrete_function(inpute_signature)}

# Write the model to disk so that it can be reloaded and zipped from `model_path`.
tf.saved_model.save(model, model_path, signatures=signatures)

In [None]:
# Load from `model_path` rather than a retyped literal (the old literal was
# misspelled 'recommeder' and pointed at a directory that is never written).
imported_model = tf.saved_model.load(model_path)
list(imported_model.signatures.keys())

In [None]:
# The signature was traced for a scalar int32, so pass a scalar, not a length-1 vector.
imported_model.signatures['call_item_item'](tf.constant(14844847, dtype=tf.int32))

In [None]:
# exist_ok=True keeps this cell re-runnable once the directory exists.
os.makedirs("dummy/0", exist_ok=True)
# Save without explicit signatures, then reload.
tf.saved_model.save(model, 'dummy/0')
imported = tf.saved_model.load("dummy/0")
# NOTE(review): the model's `call` reads inputs[0] (users) and inputs[1]
# (products); a single id tensor does not match that - confirm whether this
# call is meant to demonstrate a failure.
imported(tf.constant([14844847]))

In [None]:
# exist_ok=True keeps this cell re-runnable once the directory exists.
os.makedirs("dummy/1", exist_ok=True)
# A single function passed as `signatures` is exported under the default
# serving key ('serving_default').
tf.saved_model.save(model, 'dummy/1',
                    signatures=model.call_item_item.get_concrete_function(tf.TensorSpec(shape=(), dtype=tf.int32)))
# Reload the export just written so the listed signatures belong to it,
# not to a model loaded by an earlier cell.
imported_model = tf.saved_model.load('dummy/1')
list(imported_model.signatures.keys())

In [None]:
# The exported function takes a scalar int32 product id, so pass a scalar tensor.
imported_model.signatures['serving_default'](tf.constant(14844847, dtype=tf.int32))

Zipping the saved model will make it easier to download.

In [None]:
from zipfile import ZipFile
import os
# Bundle every file found under "models" into a single archive.
with ZipFile('recommender.zip', 'w') as archive:
    for root, _subdirs, files in os.walk("models"):
        for name in files:
            # Full path of the file as discovered by the walk
            path = os.path.join(root, name)
            archive.write(path)