# Product Embeddings for Recommendation Systems

-----

## Instacart Grocery Dataset

Source: https://www.kaggle.com/c/instacart-market-basket-analysis

- Instacart is an online grocery delivery service
- They have made available 3M grocery orders for over 200K users
- They provide between 4 and 100 orders for each user, and each order contains the sequence of products purchased
- We also have a brief description of the products

Goals:
- We will use this data to generate product embeddings - dense continuous representations of discrete tokens
- We will apply methods from Natural Language Processing to analyze product baskets
----

## **1. Import Libraries**

In [None]:
## data processing
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from zipfile import ZipFile # read zip files directly
import gc # garbage collection
import pickle # save python objects
import random 

# parallel processing
import multiprocessing
from joblib import delayed, Parallel

# cool progress bar
import tqdm
import time

# modeling and evaluation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
### tensorflow related packages and functions
import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Input, Activation, Dense, Dot, Embedding, Flatten, GlobalAveragePooling1D, Reshape
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import make_sampling_table, skipgrams
from tensorflow.keras.utils import Sequence
from tensorflow.keras.callbacks import History
from tensorflow.keras import optimizers

In [None]:
#===============================================
# global parameters
#===============================================

# show entire value of cell in pandas
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', 500)

# number of cpus
cpus = multiprocessing.cpu_count()
f"Number of CPUs: {cpus}"

In [None]:
# test for gpu
# tf.test.is_gpu_available is deprecated; list the visible physical GPU devices instead
gpu_devices = tf.config.list_physical_devices("GPU")
print(len(gpu_devices) > 0)
print(tf.test.gpu_device_name())

-----

## 2. Raw data

In [None]:
# Input data files are available in the read-only "../input/" directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# set data directory
data_dir =  "/kaggle/input/instacart-market-basket-analysis/"

## **2.1 Product info**

In [None]:
# product file
with ZipFile(data_dir + "products.csv.zip") as z:
    with z.open("products.csv") as f:
        products = pd.read_csv(f)
print(products.shape)
products.head()

In [None]:
# department file
with ZipFile(data_dir + "departments.csv.zip") as z:
    with z.open("departments.csv") as f:
        dept = pd.read_csv(f)
print(dept.shape)
dept

In [None]:
# aisle file
with ZipFile(data_dir + "aisles.csv.zip") as z:
    with z.open("aisles.csv") as f:
        aisle = pd.read_csv(f)
print(aisle.shape)
aisle.head()

In [None]:
# merge all files to a single product file
products = pd.merge(products, aisle, on = "aisle_id", how = "left")
products = pd.merge(products, dept, on = "department_id", how = "left")
print(products.shape)

In [None]:
# departments with the most products
products["department"].value_counts()

In [None]:
# aisles with the most products
products["aisle"].value_counts()

----

## 2.2 **Orders data**

In [None]:
# orders file
with ZipFile(data_dir + "orders.csv.zip") as z:
    with z.open("orders.csv") as f:
        orders = pd.read_csv(f)
print(orders.shape)
orders

In [None]:
# remove test data
orders = orders.loc[orders["eval_set"] != "test", :]
print(orders.shape)

In [None]:
# aggregate to user level
users = orders.groupby("user_id").agg({"order_number": "max"})
users = users.reset_index()
print(users.shape)
users.head()

In [None]:
# =========================================================
# split train-val-test
# =========================================================

# Draw one label per user; the probabilities in `p` line up positionally with the labels:
# training set ("train"): 60%
# test set ("test"): 20%
# validation set ("val"): 20%
# NOTE(review): no random seed is set in this cell, so the assignment presumably differs between runs - confirm whether reproducibility matters
users["eval"] = np.random.choice(["train", "test", "val"], size = users.shape[0], p = [0.6, 0.2, 0.2])
users["eval"].value_counts()

In [None]:
# =========================================================
# merge with orders data
# =========================================================
orders = pd.merge(orders, users, on = ["user_id", "order_number"], how = "left")
print(orders.shape)

In [None]:
# set missing eval to prior and delete eval_set column
orders.loc[pd.isnull(orders["eval"]), "eval"] = "prior"
orders.drop(labels = "eval_set", axis = 1, inplace = True)
print(orders["eval"].value_counts())
orders.head()

## 2.3 **Prior orders**

In [None]:
# orders file
with ZipFile(data_dir + "order_products__prior.csv.zip") as z:
    with z.open("order_products__prior.csv") as f:
        prior = pd.read_csv(f)
print(prior.shape)
prior

In [None]:
# product frequency to check top-products
prod_freq = prior["product_id"].value_counts()
print(prod_freq.shape)
prod_freq

In [None]:
# keep products bought at least 200 times
min_freq = 200
prod_freq = prod_freq[prod_freq >= min_freq]
print(prod_freq.shape)

In [None]:
# subset prior data
prior = prior.loc[prior["product_id"].isin(list(prod_freq.index)), :]
print(prior.shape)

### Aggregate to order-level from order-product level

In [None]:
# function to aggregate
f = {"product_id": lambda g: " ".join(g),
    "add_to_cart_order": ["count"]}

# format product-ids to string
prior["product_id"] = prior["product_id"].astype(str)

# roll-up
prior_orders = prior.groupby(["order_id"]).agg(f)
print(prior_orders.shape)
prior_orders.head()

In [None]:
# =================================================
# reset column levels and rename
# =================================================

prior_orders.columns = prior_orders.columns.droplevel(1)
prior_orders.rename(columns = {"add_to_cart_order" : "num_products"}, inplace = True)
prior_orders = prior_orders.loc[prior_orders["num_products"] > 1, :]
prior_orders.reset_index(inplace = True)
print(prior_orders.shape)
prior_orders.head()

## 2.4 **Train orders**

In [None]:
# Note:
## - We have made our own training and test data
## - This dataset will eventually be appended to the larger prior orders

In [None]:
# train orders file
with ZipFile(data_dir + "order_products__train.csv.zip") as z:
    with z.open("order_products__train.csv") as f:
        train = pd.read_csv(f)
print(train.shape)
train.head()

In [None]:
# retain only frequently sold products
train = train.loc[train["product_id"].isin(list(prod_freq.index)), :]
print(train.shape)

In [None]:
# function to aggregate
f = {"product_id": lambda g: " ".join(g),
    "add_to_cart_order": ["count"]}
train["product_id"] = train["product_id"].astype(str)
train_orders = train.groupby(["order_id"]).agg(f)
train_orders.columns = train_orders.columns.droplevel(1)
train_orders.reset_index(inplace = True)
train_orders.rename(columns = {"add_to_cart_order" : "num_products"}, inplace = True)
train_orders = train_orders.loc[train_orders["num_products"] > 1, :] # retain baskets with more than one product
print(train_orders.shape)
train_orders.head()

In [None]:
# =========================================================
# Combine training and prior orders
# =========================================================


# add identifier column to both data sets to append
# orders-level data
prior_orders["eval"] = "prior"
train_orders["eval"] = "train"

# order-product level data
prior["eval"] = "prior"
train["eval"] = "train"


# append wide orders
# (DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index = True
#  stacks the rows and renumbers the index in one step)
all_orders_wide = pd.concat([prior_orders, train_orders], ignore_index = True)
print(all_orders_wide.shape)

# append long orders
all_orders_long = pd.concat([prior, train], ignore_index = True)
print(all_orders_long.shape)

In [None]:
all_orders_wide.head()

-------

## 3. Prepare data for modeling

In [None]:
# =========================================================
# merge orders-wide and orders to get train-test split
# =========================================================

all_orders_wide.drop(labels = "eval", axis = 1, inplace = True)

# merge
orders_wide = pd.merge(all_orders_wide, orders[["order_id", "user_id", "eval"]],
                       on = "order_id", how = "left")
print(orders_wide.shape)

In [None]:
#===============================================
# split train-val-test
#===============================================

train = orders_wide.loc[orders_wide["eval"].isin(["prior", "train"]), :]
val = orders_wide.loc[orders_wide["eval"] == "val", :]
test = orders_wide.loc[orders_wide["eval"] == "test", :]
print("train size:", train.shape)
print("val size:", val.shape)
print("test size:", test.shape)

In [None]:
# clear up memory
del all_orders_wide
del orders_wide
gc.collect()
gc.collect()

In [None]:
#===============================================
# randomly sample training data
#===============================================

sample_size = 1000000
train = train.sample(n = sample_size)
train = train.reset_index(drop = True)
print(train.shape)
train.head()

In [None]:
# use keras tokenizer to split baskets into individual products
vocab_size = 15000
tokenizer = Tokenizer(num_words = vocab_size, lower = False)

# fit the tokenizer orders
tokenizer.fit_on_texts(list(train["product_id"].values))
print(tokenizer.document_count)
print(len(tokenizer.word_counts))

In [None]:
# map orders from product-ids to contiguous integers
train_orders = tokenizer.texts_to_sequences(list(train["product_id"].values))
val_orders = tokenizer.texts_to_sequences(list(val["product_id"].values))
test_orders = tokenizer.texts_to_sequences(list(test["product_id"].values))
print(len(train_orders))
print(len(val_orders))
print(len(test_orders))

In [None]:
# manually inspect some data
print(train_orders[:3])
train["product_id"][0:3]


In [None]:
# save pickle files
# (each with-statement closes its file on exit, so no explicit close() is needed)
with open('/kaggle/working/train_orders.pkl', 'wb') as f:
    pickle.dump(train_orders, f)

with open('/kaggle/working/val_orders.pkl', 'wb') as f:
    pickle.dump(val_orders, f)

with open('/kaggle/working/test_orders.pkl', 'wb') as f:
    pickle.dump(test_orders, f)

----

## **4. Setup Tensorflow**

In [None]:
# define vocab size - the number of unique products known to the tokenizer, plus one
# The +1 covers index 0: Keras Tokenizer word indices start at 1, so an embedding table
# indexed by token id needs len(word_index) + 1 rows.
# NOTE(review): the tokenizer is created without an oov_token, so products unseen in train are
# presumably dropped by texts_to_sequences rather than mapped to "UNK" - confirm.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# build the sampling table for negative sampling
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size, sampling_factor = 0.001)

### 4.1. Data generator - this is typically the tricky part in setting up a data pipeline for deep learning

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that expands tokenized orders into skip-gram batches.

    Every batch takes ``batch_size`` orders and turns each order into
    (target, context) couples with ``skipgrams``. The module-level
    ``vocab_size`` and ``sampling_table`` are read when a batch is generated,
    so both must be defined before the generator is iterated.
    """

    def __init__(self, order_data, batch_size = 64, cs = 15, ns = 20, shuffle = True):
        """Store the configuration and build the index order for the first epoch.

        Args:
            order_data: indexable collection of orders; each order is the
                sequence of integer tokens handed to ``skipgrams``.
            batch_size: number of orders (not couples) per batch.
            cs: value passed to ``skipgrams`` as ``window_size``.
            ns: value passed to ``skipgrams`` as ``negative_samples``.
            shuffle: whether to reshuffle the order indexes at construction
                and at the end of every epoch.
        """
        super().__init__()
        self.batch_size = batch_size
        self.order_data = order_data
        self.shuffle = shuffle
        self.cs = cs
        self.ns = ns
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.order_data) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as ``([targets, contexts], labels)``."""
        # Positions of the orders that belong to this batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Look up the corresponding orders
        list_IDs_temp = [self.order_data[k] for k in indexes]

        # Expand the orders into skip-gram couples
        X, dv = self.__data_generation(list_IDs_temp)

        return X, dv

    def on_epoch_end(self):
        """Rebuild the order indexes, shuffling them when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.order_data))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Build the skip-gram couples and labels for the given orders.

        Returns:
            A pair ``(X, dv)`` where ``X`` is a two-element list holding the
            first and second member of every couple as int32 arrays, and
            ``dv`` is the array of labels returned by ``skipgrams``.
        """
        iv = []
        dv = []

        for d in list_IDs_temp:
            couples, labels = skipgrams(d, vocabulary_size = vocab_size, window_size = self.cs,
                                        negative_samples = self.ns, sampling_table = sampling_table)
            # extend in place; `iv = iv + couples` re-copied the whole list on every order
            iv.extend(couples)
            dv.extend(labels)

        # reshape keeps X two-dimensional even when no couples were generated,
        # so the column split below cannot raise on an empty batch
        X = np.array(iv, dtype = "int32").reshape(-1, 2)
        X = [X[:, 0], X[:, 1]]
        return X, np.array(dv)

In [None]:
# randomly sample orders for quick training
train_orders = random.sample(train_orders, 300000)
print(len(train_orders))

In [None]:
# initialize train and validation data generators with default setting
train_gen = DataGenerator(train_orders)
val_gen = DataGenerator(val_orders)

# test data
test_gen = DataGenerator(test_orders, cs = 10, ns = 0, shuffle = False)

### 4.2. Set-up a simple model

In [None]:
# model input parameters
emb_size = 32
# NOTE(review): batch_size is not referenced again in the visible notebook; the DataGenerator
# instances are created with their own default batch size of 64 - confirm which is intended
batch_size = 32

In [None]:
# define input layers
input_target = Input((1, ))
input_context = Input((1, ))

# target
target = Embedding(input_dim = vocab_size, output_dim= emb_size, name = "rho")(input_target)
target = Reshape(target_shape = (emb_size, 1))(target)

# context
context = Embedding(input_dim = vocab_size, output_dim= emb_size, name = "alpha")(input_context)
context = Reshape(target_shape = (emb_size, 1))(context)

# concatenate model inputs and outputs
input_model = [input_target, input_context]
output_embeddings = Dot(axes = 1)([target, context])
output_model = Flatten()(output_embeddings)

# complete model
output_model = Dense(1, activation = "sigmoid")(output_model)

# define as keras model
emb_model = Model(inputs = input_model, outputs = output_model)

In [None]:
emb_model.summary()

In [None]:
# model optimizer and compile
adam = optimizers.Adam()
emb_model.compile(optimizer = adam, loss = "binary_crossentropy", metrics = ["acc"])

In [None]:
# initial weights of the "alpha" embedding layer, captured before the model is fit
init_alpha = emb_model.get_layer("alpha").get_weights()[0]
print(init_alpha.shape)

In [None]:
# fit model
history = History()
t0 = time.time()
emb_model.fit(x = train_gen, 
               epochs = 1,
               validation_data = val_gen,
               use_multiprocessing = True,
               callbacks = [history])
t1 = time.time()

In [None]:
# load a previously saved keras model; this replaces the emb_model trained above
# to save the freshly trained model instead, use:
# emb_model.save('/kaggle/working/emb_model/')
emb_model = tf.keras.models.load_model('/kaggle/input/product-embeddings/emb_model/')

### 4.3 Inspect model results

In [None]:
# initial weights
init_alpha

In [None]:
# final weights of the "alpha" layer, i.e. the embedding applied to the context input
final_alpa = emb_model.get_layer("alpha").get_weights()[0]
print(final_alpa.shape)
final_alpa

### 4.4. Extract results for downstream tasks

In [None]:
#===============================================
# extract embeddings to data frame
#===============================================

def EmbToDataFrame(ix_word, emb_mat, col_prefix = "rho"):
    """Convert an embedding matrix into a data frame with one row per product.

    Args:
        ix_word: mapping from integer token index to product id, e.g. the
            tokenizer's ``index_word``.
        emb_mat: embedding matrix whose row ``i`` is the vector of token index ``i``.
        col_prefix: prefix of the embedding columns (``<prefix>1``, ``<prefix>2``, ...).

    Returns:
        A data frame with a ``product_id`` column followed by one column per
        embedding dimension. Token indexes that have no row in ``emb_mat`` are skipped.
    """
    n_rows = len(emb_mat)
    # Token index i is looked up as row i by the embedding layer, so it must be
    # read from row i here as well (row i - 1 belongs to the previous token).
    vectors = {word: list(emb_mat[ix]) for ix, word in ix_word.items() if 0 <= ix < n_rows}
    emb_df = pd.DataFrame.from_dict(vectors, orient = "index").reset_index(drop = False)
    emb_df.columns = ["product_id"] + [col_prefix + str(i + 1) for i in range(emb_df.shape[1] - 1)]
    return emb_df

In [None]:
# reversed mapping of words to index
ix_word = tokenizer.index_word

In [None]:
# get data frame from alpha matrix
alpha_df = EmbToDataFrame(ix_word, emb_mat = emb_model.get_layer("alpha").get_weights()[0], col_prefix = "alpha")
display(alpha_df.head())

In [None]:
#===============================================
# similarity in alpha matrix
#===============================================

alpha_sim = cosine_similarity(alpha_df.iloc[:, 1:])
alpha_sim = pd.DataFrame(alpha_sim)
alpha_sim.reset_index(inplace = True, drop = True)
alpha_sim.index = list(alpha_df["product_id"].values)
alpha_sim.columns = list(alpha_df["product_id"].values)
display(alpha_sim.head())

In [None]:
# compute product similarity given the embeddings
def ComputeProductSimilarity(prod_id, alpha_sim, top = 5, include_prod_info = True):
    """Return the ``top`` products most similar to ``prod_id``.

    Args:
        prod_id: label of the query product in ``alpha_sim``.
        alpha_sim: square similarity data frame whose index and columns are product ids.
        top: number of similar products to return.
        include_prod_info: when True, join the name, aisle and department from
            the module-level ``products`` frame.

    Returns:
        A data frame ordered from most to least similar. Without product info
        it has the columns ``product_id`` and ``score``; with product info it
        has ``product_id``, ``product_name``, ``aisle``, ``department`` and
        ``score``, and products missing from ``products`` are left out.
    """
    # Remove the query product by label instead of assuming it sorts first
    scores = alpha_sim.loc[:, prod_id].drop(labels = prod_id, errors = "ignore")
    scores = scores.sort_values(ascending = False).head(top)
    sim = pd.DataFrame({"product_id" : list(scores.index), "score": scores}, index = None)
    if include_prod_info:
        info = products[["product_id", "product_name", "aisle", "department"]].copy()
        # Compare ids as strings on both sides; an int-vs-str comparison never matches
        info["product_id"] = info["product_id"].astype(str)
        sim = sim.reset_index(drop = True)
        sim["product_id"] = sim["product_id"].astype(str)
        # An inner merge keeps the left (similarity) order and carries the score along
        sim = sim.merge(info, on = "product_id", how = "inner")
        sim = sim[["product_id", "product_name", "aisle", "department", "score"]]
    return sim

In [None]:
prod_id = "100"
# prod_id is a string while products["product_id"] is only cast to str in the t-SNE prep cell,
# so compare on its string form here
print(products.loc[products["product_id"].astype(str).isin([prod_id]), ["product_id", "product_name", "aisle", "department"]])
ComputeProductSimilarity(prod_id = prod_id, alpha_sim = alpha_sim, top = 10, include_prod_info = True)

In [None]:
# Find most similar products to the following product
prod_id = "49332"
# prod_id is a string while products["product_id"] is only cast to str in the t-SNE prep cell,
# so compare on its string form here
print(products.loc[products["product_id"].astype(str).isin([prod_id]), ["product_id", "product_name", "aisle", "department"]])
ComputeProductSimilarity(prod_id = prod_id, alpha_sim = alpha_sim, top = 10, include_prod_info = True)

In [None]:
# Find most similar products to the following product
prod_id = "3151"
# prod_id is a string while products["product_id"] is only cast to str in the t-SNE prep cell,
# so compare on its string form here
print(products.loc[products["product_id"].astype(str).isin([prod_id]), ["product_id", "product_name", "aisle", "department"]])
ComputeProductSimilarity(prod_id = prod_id, alpha_sim = alpha_sim, top = 10, include_prod_info = True)

### 4.5. Visualize embeddings using T-SNE

In [None]:
#===============================================
# prep data for t-sne
#===============================================

alpha_df["product_id"] = alpha_df["product_id"].astype(str)
products["product_id"] = products["product_id"].astype(str)

# relevant columns from product info
prod_info_cols = ["product_id", "product_name", "department", "aisle"]

# merge product information and product embeddings into a single data frame 
prod_vec_df = pd.merge(products[prod_info_cols], alpha_df, on = "product_id", how = "inner")
print(prod_vec_df.shape)

In [None]:
#===============================================
# fit t-sne
#===============================================

# define model
tsne = TSNE(n_components = 2, verbose = 1, perplexity = 35, n_iter = 400)

# columns to fit on
prod_vec_names = list(prod_vec_df.columns)[4:]

# fit
t0 = time.time()
tsne_fit = tsne.fit_transform(prod_vec_df[prod_vec_names])
t1 = time.time()

In [None]:
#===============================================
# create t-sne data frame for plotting
#===============================================

# take an explicit copy so the new columns are added to an independent frame
# rather than to a slice of prod_vec_df (avoids SettingWithCopyWarning)
tsne_df = prod_vec_df[["product_name", "department", "aisle"]].copy()

# extract t-sne dimensions
tsne_df["x_tsne"] = tsne_fit[:,0]
tsne_df["y_tsne"] = tsne_fit[:,1]
print(tsne_df.describe())

In [None]:
#===============================================
# subset data for plot
#===============================================

# select only top departments
select_dept = ["produce", "babies", "beverages"]
tsne_plot_df = tsne_df.loc[tsne_df["department"].isin(select_dept), :]
print(tsne_plot_df.shape)

In [None]:
plt.figure(figsize = (12, 12))
g = sns.scatterplot(x = "x_tsne", y = "y_tsne",
              hue="department",
              data = tsne_plot_df)