# Data Science Festival x ASOS
## Build and Deploy a Recommender System in 3 Hours.

# Imports

In [11]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os

# Import training data

In [12]:
train = pd.read_parquet("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_train_with_alphanumeric_dummy_ids.parquet")
valid = pd.read_parquet("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_valid_with_alphanumeric_dummy_ids.parquet")
dummy_users = pd.read_csv("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_dummy_users_with_alphanumeric_dummy_ids.csv", header=None).values.flatten().astype(str)
products = pd.read_csv("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_productIds.csv", header=None).values.flatten().astype(int)





In [13]:
train.head()

Unnamed: 0,dummyUserId,productId
0,b'PIXcm7Ru5KmntCy0yA1K',10524048
1,b'd0RILFB1hUzNSINMY4Ow',9137713
2,b'Ebax7lyhnKRm4xeRlWW2',5808602
3,b'vtigDw2h2vxKt0sJpEeU',10548272
4,b'r4GfiEaUGxziyjX0PyU6',10988173


# Define a Recommender Model

The embedding layer gives a list of random numbers for each user and each product.

In [14]:
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

Scores can be found using the dot product.

In [15]:
dummy_users

array(['pmfkU4BNZhmtLgJQwJ7x', 'UDRRwOlzlWVbu7H8YCCi',
       'QHGAef0TI6dhn0wTogvW', ..., 'lcORJ5hemOZc1iGo9z7k',
       '5CqDquDAszqJp27P7AL8', 'SSPNYxJMfuKhoe1dg24m'], dtype='<U20')

In [16]:
class SimpleRecommender(tf.keras.Model):
    def __init__(self, dummy_users, products,length_of_embedding):
        super(SimpleRecommender, self).__init__()
        self.products = tf.constant(products, dtype=tf.int32)
        self.dummy_users = tf.constant(dummy_users, dtype=tf.string)
        self.dummy_user_table = tf.lookup.StaticHashTable(tf.lookup.KeyValueTensorInitializer(self.dummy_users, range(len(dummy_users))), -1)
        self.product_table = tf.lookup.StaticHashTable(tf.lookup.KeyValueTensorInitializer(self.products, range(len(products))), -1)
        
        self.user_embedding = tf.keras.layers.Embedding(len(dummy_users), length_of_embedding)
        self.product_embedding = tf.keras.layers.Embedding(len(products), length_of_embedding)
        
        self.dot = tf.keras.layers.Dot(axes=-1)

    def call(self,inputs):
        user = inputs[0]
        products = inputs[1]
        
        user_embedding_index = self.dummy_user_table.lookup(user)
        product_embedding_index = self.product_table.lookup(products)

        user_embedding_values = self.user_embedding(user_embedding_index)
        product_embedding_values = self.product_embedding(product_embedding_index)

        return tf.squeeze(self.dot([user_embedding_values,product_embedding_values]),1)


    @tf.function
    def call_item_item(self, product):
        product_x = self.product_table.lookup(product)
        pe = tf.expand_dims(self.product_embedding(product_x), 0)
        
        all_pe = tf.expand_dims(self.product_embedding.embeddings, 0)#note this only works if the layer has been built!
        scores = tf.reshape(self.dot([pe, all_pe]), [-1])
        
        top_scores, top_indices = tf.math.top_k(scores, k=100)
        top_ids = tf.gather(self.products, top_indices)
        return top_ids, top_scores

# Creating a dataset

First create a tf.data.Dataset from the user purchase pairs.

In [17]:
dummy_user_tensor = tf.constant(train[["dummyUserId"]].values, dtype=tf.string)
product_tensor = tf.constant(train[["productId"]].values, dtype=tf.int32)

dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))
for x, y in dataset:
    print(x)
    print(y)
    break

tf.Tensor([b'PIXcm7Ru5KmntCy0yA1K'], shape=(1,), dtype=string)
tf.Tensor([10524048], shape=(1,), dtype=int32)


For each purchase let's sample a number of products that the user did not purchase. Then the model can score each of the products and we will know we are doing a good job if the product with the highest score is the product that the user actually purchased.

We can do this using dataset.map

In [18]:
class Mapper():
    
    def __init__(self, possible_products, num_negative_products):
        self.num_possible_products = len(possible_products)
        self.possible_products_tensor = tf.constant(possible_products, dtype=tf.int32)
        
        self.num_negative_products = num_negative_products
        #create array of 1,0,0,0,0++  for the array of one product user bought and 10 they didnt (hence +1)
        self.y = tf.one_hot(0, num_negative_products+1)
    
    def __call__(self, user, product):
        random_negatives_indexs = tf.random.uniform((self.num_negative_products, ), minval = 0, maxval = self.num_possible_products, dtype=tf.int32)
        negatives = tf.gather(self.possible_products_tensor,random_negatives_indexs)
        candidates = tf.concat([product, negatives],axis=0)
        return (user, candidates), self.y

In [19]:
#get a list per user that has one product they did buy and x (10 in this case) that they DID NOT buy
dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor)).map(Mapper(products,10))
for u, c in dataset:
  print(u)
  print(c)
  print (y)
  break


(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'PIXcm7Ru5KmntCy0yA1K'], dtype=object)>, <tf.Tensor: shape=(11,), dtype=int32, numpy=
array([10524048, 11082655, 10990900, 12783338, 11498206, 11935504,
       10018416, 10814731,  9006314,  8874164, 11458167], dtype=int32)>)
tf.Tensor([1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(11,), dtype=float32)
tf.Tensor([10524048], shape=(1,), dtype=int32)


Let's bring the steps together to define a function which creates a dataset 

In [20]:
def get_dataset(df,products, num_negative_products):
    dummy_user_tensor = tf.constant(df[["dummyUserId"]].values, dtype=tf.string)
    product_tensor = tf.constant(df[["productId"]].values, dtype=tf.int32)

    dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor,product_tensor))
    dataset = dataset.map(Mapper(products,num_negative_products))
    
    dataset = dataset.batch(1024)
    return dataset

In [21]:
#test dataset function
for (u, c),y in get_dataset(train, products,4):
  print(u)
  print(c)
  print(y)
  break

tf.Tensor(
[[b'PIXcm7Ru5KmntCy0yA1K']
 [b'd0RILFB1hUzNSINMY4Ow']
 [b'Ebax7lyhnKRm4xeRlWW2']
 ...
 [b'xuX9n8PHfSR0AP3UZ8ar']
 [b'iNnxsPFfOa9884fMjVPJ']
 [b'aD8Mn12im8lFPzXAY41P']], shape=(1024, 1), dtype=string)
tf.Tensor(
[[10524048 11424849 11662619 10758976  8504860]
 [ 9137713 11973720 11551232 11701198 11137858]
 [ 5808602  8919328 10260360 13546263 13668234]
 ...
 [11541336  9818830 11253897  9145603 10102239]
 [ 7779232 10805315 11473322 12358829  9230958]
 [ 4941259  8159622  7695747 12470719 10879986]], shape=(1024, 5), dtype=int32)
tf.Tensor(
[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]], shape=(1024, 5), dtype=float32)


# Train a model

We need to compile a model, set the loss and create an evaluation metric. Then we need to train the model.

In [22]:
model = SimpleRecommender(dummy_users, products,15)

model.compile(loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              optimizer = tf.keras.optimizers.SGD(learning_rate=100.),
              metrics = [tf.keras.metrics.CategoricalAccuracy()] ) 

model.fit(get_dataset(train, products, 100), validation_data= get_dataset(valid,products,100), epochs=5)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f8ea1342c88>

Let's do a manual check on whether the model is any good.

In [23]:
test_product = 11698965

In [24]:
print("Recs for item {}: {}".format(test_product, model.call_item_item(tf.constant(test_product, dtype=tf.int32))))

Recs for item 11698965: (<tf.Tensor: shape=(100,), dtype=int32, numpy=
array([10402703, 11221174, 10309914, 10635879,  8603141,  9586825,
        8904051,  5801178,  9690295, 11398199, 10563199, 10581393,
        9586832, 11210565, 10637119, 10353428,  5026837, 11402965,
       10716895, 10439343, 10309915, 10636194,  9565144, 12030039,
        8811498,  9137744,  8873143, 10644438, 10598918, 10512479,
       10308557,  9020671,  8426701, 10269381,  9393295, 10326351,
       11251463,  9992295, 12241759, 12387488, 10395716, 12377314,
       10715863, 10835538,  9690303,  9427227,  9453604,  9400604,
       10062595,  8745098, 10636466, 11344538,  9923669, 10735521,
        8875455, 10413104,  3421877, 11953944, 10867467, 11364726,
        9419651, 10636398, 11937479, 11406629,  8469882, 10475476,
        2636617, 10571852,  7415117, 11141217, 10253001, 10796010,
        9686943,  9398510,  7289381, 12555488,  8978987, 11078754,
        9528209, 10261983,  9044706, 11076509, 10636469, 1

# Save the model

In [25]:
model_path = "models/recommender/2"

In [26]:
inpute_signature = tf.TensorSpec(shape=(), dtype=tf.int32)

In [27]:
signatures = { 'call_item_item': model.call_item_item.get_concrete_function(inpute_signature)}

In [28]:
tf.saved_model.save(model, model_path,signatures=signatures)

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: models/recommender/5/assets


In [29]:
imported_model = tf.saved_model.load(model_path )

list(imported_model.signatures.keys())


['call_item_item']

In [30]:
test_product = 11698965

print("Recs for item {}: {}".format(test_product, imported_model.call_item_item(tf.constant(test_product, dtype=tf.int32))))




Recs for item 11698965: (<tf.Tensor: shape=(100,), dtype=int32, numpy=
array([10402703, 11221174, 10309914, 10635879,  8603141,  9586825,
        8904051,  5801178,  9690295, 11398199, 10563199, 10581393,
        9586832, 11210565, 10637119, 10353428,  5026837, 11402965,
       10716895, 10439343, 10309915, 10636194,  9565144, 12030039,
        8811498,  9137744,  8873143, 10644438, 10598918, 10512479,
       10308557,  9020671,  8426701, 10269381,  9393295, 10326351,
       11251463,  9992295, 12241759, 12387488, 10395716, 12377314,
       10715863, 10835538,  9690303,  9427227,  9453604,  9400604,
       10062595,  8745098, 10636466, 11344538,  9923669, 10735521,
        8875455, 10413104,  3421877, 11953944, 10867467, 11364726,
        9419651, 10636398, 11937479, 11406629,  8469882, 10475476,
        2636617, 10571852,  7415117, 11141217, 10253001, 10796010,
        9686943,  9398510,  7289381, 12555488,  8978987, 11078754,
        9528209, 10261983,  9044706, 11076509, 10636469, 1

In [31]:
# Run docker image via compose for tf serving  - this uses the docker nameed host in the compose file for the URL

# This doesnt specify a version

!curl --header "Content-Type: application/json" --request POST --data \
'{"signature_name":"call_item_item","inputs": {"product": [8650774] } }' \
http://serving:8501/v1/models/recommender:predict
            
            
            
            

{
    "outputs": {
        "output_0": [
            10308818,
            10308557,
            10563215,
            10309914,
            10563199,
            9775640,
            7777858,
            9276055,
            11828336,
            10402698,
            9099410,
            10309924,
            10635862,
            11344122,
            9110939,
            8745086,
            11771631,
            10637119,
            6429322,
            12360115,
            10366062,
            9586832,
            12387488,
            10565130,
            6744223,
            11171018,
            11002791,
            11801850,
            10366082,
            10054584,
            8290000,
            8064082,
            10573792,
            11234611,
            9655345,
            11251463,
            11352144,
            10386518,
            10309920,
            9586825,
            11667521,
            8745089,
    

In [38]:
# run a specific served version via version number  e.g v3 /versions/3


!curl --header "Content-Type: application/json" --request POST --data \
'{"signature_name":"call_item_item","inputs": {"product": [8650774] } }' \
http://serving:8501/v1/models/recommender/versions/3:predict
            
            
            
            

{
    "outputs": {
        "output_0": [
            10573792,
            10377022,
            8927118,
            10125462,
            11235575,
            11378722,
            8569920,
            10487343,
            8941836,
            11001554,
            10893005,
            10490457,
            13329218,
            8957032,
            8544789,
            10281224,
            11041042,
            8868237,
            11569183,
            10461357,
            11469608,
            9903983,
            7425915,
            10571252,
            12083964,
            10448772,
            9049133,
            11480528,
            11435199,
            8868217,
            12110753,
            10811863,
            11588051,
            9931362,
            10313006,
            10813740,
            10339348,
            12062383,
            11702309,
            9565435,
            11393435,
            11235579,
  

In [33]:
# run a specific served version by version name  - this one uses prod label as defined in models config
# e.g dev version /labels/dev

!curl --header "Content-Type: application/json" --request POST --data \
'{"signature_name":"call_item_item","inputs": {"product": [8650774] } }' \
http://serving:8501/v1/models/recommender/labels/dev:predict

            
            

{
    "outputs": {
        "output_0": [
            8650774,
            10076866,
            10385896,
            10489436,
            10472064,
            12280078,
            11976884,
            9693739,
            6430186,
            10871563,
            11532312,
            12450271,
            10067263,
            11181654,
            9168531,
            11261344,
            10194173,
            10733129,
            11805803,
            12125622,
            9693744,
            13175230,
            10453708,
            7273376,
            12015387,
            10175964,
            10447253,
            11532307,
            8691401,
            9835456,
            12070254,
            9097969,
            12996329,
            10837303,
            10281094,
            11131515,
            10315599,
            10143117,
            10306064,
            11822512,
            9287334,
            10634180,


Zip up the models file 

Zipping the saved model will make it easier to download.

In [35]:
from zipfile import ZipFile
import os
# create a ZipFile object
with ZipFile('recommender.zip', 'w') as zipObj:
   # Iterate over all the files in directory
    for folderName, subfolders, filenames in os.walk("models"):
        for filename in filenames:
           #create complete filepath of file in directory
           filePath = os.path.join(folderName, filename)
           # Add file to zip
           zipObj.write(filePath)