<a href="https://colab.research.google.com/github/sherry-tang-97/H-M_recommender/blob/main/H%26M_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q tensorflow-recommenders

import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
from sklearn.model_selection import train_test_split
from datetime import datetime
from packaging import version
import re
import numpy as np
from matplotlib import pyplot as plt
import time

[K     |████████████████████████████████| 88 kB 3.1 MB/s 
[K     |████████████████████████████████| 511.7 MB 5.3 kB/s 
[K     |████████████████████████████████| 438 kB 85.9 MB/s 
[K     |████████████████████████████████| 5.8 MB 78.8 MB/s 
[K     |████████████████████████████████| 1.6 MB 75.1 MB/s 
[?25h

In [None]:
from google.colab import drive

# Mount Google Drive into the runtime; the CSV files read below live under this mount point.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import tensorflow_recommenders as tfrs

In [None]:
#Read raw data
# Number of transaction rows to load; the split cell reuses it as shuffle buffer and split base.
samples = 200000
data_dir = "/content/drive/MyDrive/hm_data"

# Only the first `samples` transactions are read; articles and customers are read in full.
transaction_df = pd.read_csv(f"{data_dir}/transactions_train.csv", nrows=samples)
articles_df = pd.read_csv(f"{data_dir}/articles.csv")
customers_df = pd.read_csv(f"{data_dir}/customers.csv")

In [None]:
#Merge information
# Attach the customer attributes to every transaction row.
transaction_df = pd.merge(transaction_df, customers_df, how="left", on="customer_id")

# Cast the article join key to string on both sides before joining on it.
transaction_df["article_id"] = transaction_df["article_id"].astype(str)
articles_df["article_id"] = articles_df["article_id"].astype(str)

# Attach the article attributes to every transaction row.
transaction_df = pd.merge(transaction_df, articles_df, how="left", on="article_id")

# "vol": number of loaded transaction rows per article.
volumn = (
    transaction_df["article_id"]
    .value_counts()
    .rename_axis("article_id")
    .reset_index(name="vol")
)

transaction_df = pd.merge(transaction_df, volumn, how="left", on="article_id")
articles_df = pd.merge(articles_df, volumn, how="left", on="article_id")
# Articles that never occur in the loaded transactions get a volume of 0 instead of NaN.
articles_df["vol"] = articles_df["vol"].fillna(0)

In [None]:
#Choose desired cols
# Keep only the identifiers and the features that are turned into tensors below.
desired_cols = [
    'customer_id',
    'article_id',
    'age',
    'product_type_name',
    'graphical_appearance_name',
    'colour_group_name',
    'vol',
]
transaction_df = transaction_df[desired_cols]

In [None]:
#Turn transaction into dataset

# Columns exported to the dataset; each one becomes a tensor stored under its column name.
transaction_cols = [
    "article_id",
    "product_type_name",
    "graphical_appearance_name",
    "colour_group_name",
    "customer_id",
    "age",
    "vol",
]
dict_transaction = {
    col: tf.convert_to_tensor(transaction_df[col]) for col in transaction_cols
}

# Slicing along the first axis yields one dict element per transaction row.
transaction = tf.data.Dataset.from_tensor_slices(dict_transaction)

# The intermediate dict is not needed once the dataset exists.
del dict_transaction



In [None]:
#Turn article & customer into dataset

def _columns_to_dataset(frame, columns):
  """Build a tf.data.Dataset of dicts from the listed columns of a DataFrame.

  Each column is converted with tf.convert_to_tensor and stored under its
  column name; the dataset then yields one dict per row.
  """
  tensors = {name: tf.convert_to_tensor(frame[name]) for name in columns}
  return tf.data.Dataset.from_tensor_slices(tensors)


# Article catalogue with the categorical features and the popularity count.
articles = _columns_to_dataset(
    articles_df,
    [
        "article_id",
        "product_type_name",
        "graphical_appearance_name",
        "colour_group_name",
        "vol",
    ],
)

# Customer table: identifier plus age.
customers = _columns_to_dataset(customers_df, ["customer_id", "age"])

In [None]:
#Train/valid/test split
# NOTE(review): the imported name below is not referenced anywhere in this cell — confirm it is needed.
from tensorflow.python.ops.gen_dataset_ops import shuffle_and_repeat_dataset
tf.random.set_seed(9586)

# 80/20 split into (train + valid) and test, then 80/20 again into train and valid.
train_num = round(samples*0.8*0.8)
valid_num = round(samples*0.8*0.2)
test_num = round(samples*0.2)

# Shuffle once with a fixed seed; reshuffle_each_iteration=False keeps the same order on
# every pass, so the take/skip slices below always see the same sequence.
shuffled = transaction.shuffle(samples, seed=9586, reshuffle_each_iteration=False)

train = shuffled.take(train_num)
after_train = shuffled.skip(train_num)
valid = after_train.take(valid_num)
test = after_train.skip(valid_num).take(test_num)

del shuffled, after_train

In [None]:
#Unique values
def _unique_feature_values(dataset, feature, batch_size):
  """Collect one feature from every batch of a dataset and return its sorted unique values."""
  feature_batches = dataset.batch(batch_size).map(lambda batch: batch[feature])
  return np.unique(np.concatenate(list(feature_batches)))


# Vocabularies for the article-side features.
uaid = _unique_feature_values(articles, "article_id", 10000)
uptn = _unique_feature_values(articles, "product_type_name", 10000)
ugan = _unique_feature_values(articles, "graphical_appearance_name", 10000)
ucgn = _unique_feature_values(articles, "colour_group_name", 10000)

# Vocabulary of customer ids.
ucid = _unique_feature_values(customers, "customer_id", 50000)


In [None]:
#Base model
embedding_dimension = 32

# Customer tower: customer_id string -> vocabulary index -> embedding vector.
customer_lookup = tf.keras.layers.StringLookup(vocabulary=ucid, mask_token=None)
# One extra embedding row accounts for unknown tokens.
customer_embedding = tf.keras.layers.Embedding(len(ucid) + 1, embedding_dimension)
customer_model = tf.keras.Sequential([customer_lookup, customer_embedding])

# Article tower: same structure, built over the article id vocabulary.
article_lookup = tf.keras.layers.StringLookup(vocabulary=uaid, mask_token=None)
article_embedding = tf.keras.layers.Embedding(len(uaid) + 1, embedding_dimension)
article_model = tf.keras.Sequential([article_lookup, article_embedding])

# Candidates for the top-K metrics: every article id passed through the article tower.
candidate_ids = articles.batch(10000).map(lambda x: x["article_id"])
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_ids.map(article_model))
task = tfrs.tasks.Retrieval(metrics=metrics)

In [None]:
#Base model
class HM_Model(tfrs.Model):
  """Two-tower retrieval model over customer ids and article ids.

  Args:
    customer_model: Keras model mapping a batch of customer ids to embeddings.
    article_model: Keras model mapping a batch of article ids to embeddings.
    retrieval_task: Optional task layer that computes the loss and metrics from
      the two embedding batches. When omitted, the notebook-level `task` object
      is used, so existing two-argument constructions behave as before.
  """

  def __init__(self, customer_model, article_model, retrieval_task=None):
    super().__init__()
    self.article_model: tf.keras.Model = article_model
    self.customer_model: tf.keras.Model = customer_model
    # Prefer an explicitly supplied task; fall back to the global `task` for compatibility.
    self.task: tf.keras.layers.Layer = task if retrieval_task is None else retrieval_task


  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the task's loss for one batch of transactions.

    Args:
      features: Batch dict; only "customer_id" and "article_id" are read.
      training: Accepted for the Keras/TFRS interface; not used here.
    """
    # Embed the customers that made the purchases in this batch.
    customer_embeddings = self.customer_model(features["customer_id"])
    # Embed the purchased articles; these are the positive candidates.
    positive_article_embeddings = self.article_model(features["article_id"])

    # The task computes the loss and the metrics.
    return self.task(customer_embeddings, positive_article_embeddings)

In [None]:
#Prepare data for training
model = HM_Model(customer_model, article_model)
adagrad = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=adagrad)

# Each split uses a batch size of one fifth of its row count.
train_batch, valid_batch, test_batch = (
    round(n / 5) for n in (train_num, valid_num, test_num)
)

# Shuffle, batch and cache every split's pipeline; only the training pipeline is prefetched.
cached_train = (
    train.shuffle(train_num)
    .batch(train_batch)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)
cached_valid = valid.batch(valid_batch).cache()
cached_test = test.batch(test_batch).cache()


In [None]:
#Fit base model
# Time the fit. `end` is recorded in a `finally` block so the run-time cell below
# still has a fresh value when training is interrupted (e.g. KeyboardInterrupt);
# the exception itself is re-raised unchanged.
start = time.time()
try:
    model.fit(cached_train, epochs = 10, validation_data = cached_valid)
finally:
    end = time.time()

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: ignored

In [None]:
#Base model run time
# Wall-clock seconds between the two timestamps taken around the fit call.
elapsed_seconds = end - start
print(elapsed_seconds)

In [None]:
#Evaluate base model
# Keep and show the metrics dict; previously the return value was discarded because
# the call was not the last expression of the cell.
base_test_metrics = model.evaluate(cached_test, return_dict=True)
pprint.pprint(base_test_metrics)


def select_article_id(article_batch):
  """Return the "article_id" entry of a batched articles element."""
  return article_batch["article_id"]


# Batched article ids, built once and reused for both halves of the zip below.
# A named function replaces the two identical same-line lambdas, which AutoGraph
# reported it could not parse ("found multiple definitions with identical signatures").
article_id_batches = articles.batch(10000).map(select_article_id)

# Brute-force top-K layer that uses the trained customer tower as its query model.
index = tfrs.layers.factorized_top_k.BruteForce(model.customer_model)
# Index the whole article catalogue: (article ids, article embeddings) pairs.
index.index_from_dataset(
  tf.data.Dataset.zip((article_id_batches, article_id_batches.map(model.article_model)))
)


Cause: could not parse the source code of <function <lambda> at 0x7f8e40a9d4d0>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
(lambda x: x['article_id'])

Match 1:
(lambda x: x['article_id'])

Cause: could not parse the source code of <function <lambda> at 0x7f8e40a9d4d0>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
(lambda x: x['article_id'])

Match 1:
(lambda x: x['article_id'])

Cause: could not parse the source code of <function <lambda> at 0x7f8e40a8c950>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f8e40ea3d10>

In [None]:
#Complex model 1: included article & customer features

#Preprocess article features

def _lookup_embedding(vocabulary):
  """Return a StringLookup + Embedding stack for one string feature.

  The embedding table has len(vocabulary) + 1 rows and `embedding_dimension`
  columns, matching the other towers in this notebook.
  """
  return tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None),
    tf.keras.layers.Embedding(len(vocabulary) + 1, embedding_dimension),
  ])


aid_emb = _lookup_embedding(uaid)  # article_id
ptn_emb = _lookup_embedding(uptn)  # product_type_name
gan_emb = _lookup_embedding(ugan)  # graphical_appearance_name
cgn_emb = _lookup_embedding(ucgn)  # colour_group_name


In [None]:
#Article model
class articleModel(tf.keras.Model):
  """Article tower that concatenates the id and categorical feature embeddings."""

  def __init__(self):
    super().__init__()

    # Reuse the embedding stacks defined at notebook level.
    self.aid_emb, self.ptn_emb = aid_emb, ptn_emb
    self.gan_emb, self.cgn_emb = gan_emb, cgn_emb

  def call(self, inputs):
    """Embed the four article features of `inputs` and join them along axis 1."""
    feature_embeddings = [
        self.aid_emb(inputs["article_id"]),
        self.ptn_emb(inputs["product_type_name"]),
        self.gan_emb(inputs["graphical_appearance_name"]),
        self.cgn_emb(inputs["colour_group_name"]),
    ]
    return tf.concat(feature_embeddings, axis = 1)

In [None]:
#Preprocess customer features

# customer_id string -> vocabulary index -> embedding vector.
cid_lookup = tf.keras.layers.StringLookup(vocabulary=ucid, mask_token=None)
cid_table = tf.keras.layers.Embedding(len(ucid) + 1, embedding_dimension)
cid_emb = tf.keras.Sequential([cid_lookup, cid_table])

#age_emb = tf.keras.layers.Normalization(
        #axis=None
    #)

#Customer model
class customerModel(tf.keras.Model):
  """Customer tower; currently built from the customer id embedding only."""

  def __init__(self):
    super().__init__()

    # Reuse the notebook-level customer id embedding stack (the age feature is disabled).
    self.cid_emb = cid_emb

  def call(self, inputs):
    """Embed inputs["customer_id"] and return it as the tower output."""
    feature_embeddings = [
        self.cid_emb(inputs["customer_id"]),
    ]
    return tf.concat(feature_embeddings, axis=1)

In [None]:
#Complex model 1
class HM_Model_1(tfrs.models.Model):
  """Retrieval model whose article tower also uses categorical article features."""

  def __init__(self):
    super().__init__()

    # Query tower: customer features followed by a dense projection.
    query_layers = [customerModel(), tf.keras.layers.Dense(32)]
    self.query_model = tf.keras.Sequential(query_layers)

    # Candidate tower: article features followed by a dense projection.
    candidate_layers = [articleModel(), tf.keras.layers.Dense(32)]
    self.candidate_model = tf.keras.Sequential(candidate_layers)

    # Top-K metrics are computed against the whole article catalogue.
    topk_metrics = tfrs.metrics.FactorizedTopK(
        candidates=articles.batch(10000).map(self.candidate_model),
    )
    self.task = tfrs.tasks.Retrieval(metrics=topk_metrics)

  def compute_loss(self, features, training=False):
    """Return the retrieval task's loss for one batch of transactions.

    Only the keys each tower reads are forwarded to it, so the input
    structure of a tower does not depend on extra columns in `features`.
    """
    query_keys = ("customer_id",)
    candidate_keys = (
        "article_id",
        "product_type_name",
        "graphical_appearance_name",
        "colour_group_name",
    )

    query_embeddings = self.query_model({key: features[key] for key in query_keys})
    candidate_embeddings = self.candidate_model(
        {key: features[key] for key in candidate_keys}
    )

    return self.task(query_embeddings, candidate_embeddings)

In [None]:
#Prepare data for training
model = HM_Model_1()
adagrad = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=adagrad)

# Rebuild the batched, cached pipelines for this model (batch sizes come from the base-model cell).
shuffled_train = train.shuffle(train_num)
cached_train = shuffled_train.batch(train_batch).cache()
cached_valid = valid.batch(valid_batch).cache()
cached_test = test.batch(test_batch).cache()



In [None]:
#Fit complex model 1
# Time the fit. `end` is recorded in a `finally` block so the run-time cell below
# still has a fresh value if training is interrupted; the exception is re-raised unchanged.
start = time.time()
try:
    model.fit(cached_train, epochs = 10, validation_data = cached_valid)
finally:
    end = time.time()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#Complex model 1 run time
# Wall-clock seconds between the two timestamps taken around the fit call.
elapsed_seconds = end - start
print(elapsed_seconds)

249.23756647109985


In [None]:
#Evaluate complex model 1
# Evaluate on the held-out test batches; the metrics dict is the cell's displayed value.
complex1_test_metrics = model.evaluate(cached_test, return_dict=True)
complex1_test_metrics



{'factorized_top_k/top_100_categorical_accuracy': 0.1698250025510788,
 'factorized_top_k/top_10_categorical_accuracy': 0.08905000239610672,
 'factorized_top_k/top_1_categorical_accuracy': 0.0253749992698431,
 'factorized_top_k/top_50_categorical_accuracy': 0.1404000073671341,
 'factorized_top_k/top_5_categorical_accuracy': 0.07227499783039093,
 'loss': 95605.5234375,
 'regularization_loss': 0,
 'total_loss': 95605.5234375}

In [None]:
#Complex model 2: included popularity feature

#Preprocess article features
# article_id string -> vocabulary index -> embedding vector.
aid_emb = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=uaid, mask_token=None),
  tf.keras.layers.Embedding(len(uaid) + 1, embedding_dimension)
])


# Scalar (axis=None) normalization of the popularity count "vol".
vol_emb = tf.keras.layers.Normalization(
        axis=None
    )
# The layer was previously used without adapt() and without explicit mean/variance,
# so it never learned the statistics of "vol". Fit it on the transactions' "vol"
# column, which is the distribution the model is trained on.
vol_emb.adapt(transaction_df["vol"].to_numpy(dtype="float32"))


#Article model
class articleModel(tf.keras.Model):
  """Article tower that joins the id embedding with the normalized popularity value."""

  def __init__(self):
    super().__init__()

    # Reuse the notebook-level id embedding stack and volume normalization layer.
    self.aid_emb, self.vol_emb = aid_emb, vol_emb

  def call(self, inputs):
    """Return the id embedding with one extra popularity column appended along axis 1."""
    id_embedding = self.aid_emb(inputs["article_id"])
    # Reshape the per-row scalar to a single column so it can be concatenated.
    volume_column = tf.reshape(self.vol_emb(inputs["vol"]), (-1, 1))
    return tf.concat([id_embedding, volume_column], axis=1)


In [None]:
#Preprocess customer feature
# customer_id string -> vocabulary index -> embedding vector.
cid_lookup = tf.keras.layers.StringLookup(vocabulary=ucid, mask_token=None)
cid_table = tf.keras.layers.Embedding(len(ucid) + 1, embedding_dimension)
cid_emb = tf.keras.Sequential([cid_lookup, cid_table])

#Customer model
class customerModel(tf.keras.Model):
  """Customer tower built from the customer id embedding only."""

  def __init__(self):
    super().__init__()

    # Reuse the notebook-level customer id embedding stack.
    self.cid_emb = cid_emb

  def call(self, inputs):
    """Embed inputs["customer_id"] and return it as the tower output."""
    feature_embeddings = [
        self.cid_emb(inputs["customer_id"]),
    ]
    return tf.concat(feature_embeddings, axis=1)


In [None]:
#Complex model 2
class HM_Model_2(tfrs.models.Model):
  """Retrieval model whose article tower uses the article id and the popularity count."""

  def __init__(self):
    super().__init__()

    # Query tower: customer features followed by a dense projection.
    query_layers = [customerModel(), tf.keras.layers.Dense(32)]
    self.query_model = tf.keras.Sequential(query_layers)

    # Candidate tower: article features followed by a dense projection.
    candidate_layers = [articleModel(), tf.keras.layers.Dense(32)]
    self.candidate_model = tf.keras.Sequential(candidate_layers)

    # Top-K metrics are computed against the whole article catalogue.
    topk_metrics = tfrs.metrics.FactorizedTopK(
        candidates=articles.batch(10000).map(self.candidate_model),
    )
    self.task = tfrs.tasks.Retrieval(metrics=topk_metrics)

  def compute_loss(self, features, training=False):
    """Return the retrieval task's loss for one batch of transactions.

    Only the keys each tower reads are forwarded to it, so the input
    structure of a tower does not depend on extra columns in `features`.
    """
    query_keys = ("customer_id",)
    candidate_keys = ("article_id", "vol")

    query_embeddings = self.query_model({key: features[key] for key in query_keys})
    candidate_embeddings = self.candidate_model(
        {key: features[key] for key in candidate_keys}
    )

    return self.task(query_embeddings, candidate_embeddings)

In [None]:
#Prepare data for training
model = HM_Model_2()
adagrad = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=adagrad)

# Rebuild the batched, cached pipelines for this model (batch sizes come from the base-model cell).
shuffled_train = train.shuffle(train_num)
cached_train = shuffled_train.batch(train_batch).cache()
cached_valid = valid.batch(valid_batch).cache()
cached_test = test.batch(test_batch).cache()



In [None]:
#Fit complex model 2
# Time the fit. `end` is recorded in a `finally` block so the run-time cell below
# still has a fresh value if training is interrupted; the exception is re-raised unchanged.
start = time.time()
try:
    model.fit(cached_train, epochs = 10, validation_data = cached_valid)
finally:
    end = time.time()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#Complex model 2 run time
# Wall-clock seconds between the two timestamps taken around the fit call.
elapsed_seconds = end - start
print(elapsed_seconds)

247.45911073684692
