In [1]:
%load_ext autoreload
%autoreload 2

import json
# import logging
import multiprocessing
import os
import statistics
import time

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import KeyedVectors
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

from source.item2vec_recommender import Item2VecRecommender
from source.item2vec_embeddings import Item2VecEmbeddings
from source.data_loader import DataLoader

from source.baseline_recommender import MostPopularForUserRecommender
from source.utils import convert_size

# logging.basicConfig(
#      format="%(levelname)s - %(asctime)s: %(message)s",
#      datefmt="%H:%M:%S",
#      level=logging.INFO,
# )

## Load Data

In [2]:
# small = ""
small = "small_"
algorithm = "user-meta-item2vec"

In [3]:
data_loader = DataLoader(algorithm=algorithm,
                         small_data=(small!=""),
                         with_meta=True,
                         with_user=True,
                         use_file_iterator=True)

With Metadata: True
With User: True
Loading data...
Creating file to iterate for user-meta-item2vec in sentences/user-meta-item2vec/train.txt


In [None]:
metadata_path = (f"preprocessed_data/{small}product_metadata.csv")
train_orders_path = f"preprocessed_data/{small}train_orders.npy"
validation_orders_path = f"preprocessed_data/{small}validation_orders.npy"
test_orders_path = f"preprocessed_data/{small}test_orders.npy"

# orders_path = f"preprocessed_data/{small}order_data.csv"

train_data = np.load(train_orders_path, allow_pickle=True)
validation_data = np.load(validation_orders_path, allow_pickle=True)
test_data = np.load(test_orders_path, allow_pickle=True)
item_metadata = pd.read_csv(metadata_path)

In [None]:
# data_loader = load_data(small_data=(small!=""))

# train_data = data_loader["train"]
# validation_data = data_loader["validation"]
# test_data = data_loader["test"]
# item_metadata = data_loader["metadata"]

# TEST START

In [None]:
def generate_user_item_interactions(train_data, n_items):
    """Index training transactions by user and count item purchases.

    Every transaction is read as ``[user_id, item, item, ...]``.

    Args:
        train_data: Iterable of transactions; element 0 is the user id and
            the remaining elements are the purchased items.
        n_items: Accepted for interface compatibility; this function does
            not read it.

    Returns:
        A 3-tuple of dictionaries:
            * user id -> list of positions of that user's transactions in
              ``train_data``,
            * user id -> {item -> number of times that user bought it},
            * item -> number of times it was bought overall.
    """
    transactions_by_user = {}
    counts_by_user = {}
    overall_counts = {}

    for position, transaction in enumerate(train_data):
        # The user id leads the transaction; everything after it is an item.
        user_id, purchased = transaction[0], transaction[1:]

        transactions_by_user.setdefault(user_id, []).append(position)

        per_user_counts = counts_by_user.setdefault(user_id, {})
        for product in purchased:
            per_user_counts[product] = per_user_counts.get(product, 0) + 1
            overall_counts[product] = overall_counts.get(product, 0) + 1

    return transactions_by_user, counts_by_user, overall_counts

# TEST END

In [None]:
def get_product_key_conversion(metadata) -> dict:
    """Build a lookup from product key to a tab-separated description.

    Args:
        metadata: DataFrame with the columns ``product_id``, ``product_name``,
            ``department`` and ``aisle``.

    Returns:
        Dict mapping ``"product_<product_id>"`` to
        ``"<product_name>\\t<department>\\t<aisle>"``. When a product id
        occurs more than once, the first row wins.
    """
    key_to_description = {}

    for _, product in metadata.iterrows():
        fields = (product["product_name"], product["department"], product["aisle"])
        description = "\t".join(str(field) for field in fields)
        product_key = f'product_{product["product_id"]}'
        # Keep the first description seen for a key.
        if product_key not in key_to_description:
            key_to_description[product_key] = description

    return key_to_description

def product_key_to_meta(key):
    """Return the tab-separated metadata string stored for ``key``.

    Reads the notebook-level ``product_key_conversion`` dict and asserts
    that it is non-empty. Keys missing from the dict are returned unchanged.
    """
    assert product_key_conversion
    if key in product_key_conversion:
        return product_key_conversion[key]
    return key

def product_key_to_name(key):
    """Return only the product-name part of the metadata stored for ``key``.

    Reads the notebook-level ``product_key_conversion`` dict and asserts
    that it is non-empty. The name is the text before the first tab; keys
    missing from the dict are treated as their own metadata string.
    """
    assert product_key_conversion
    meta = product_key_conversion.get(key, key)
    name, _, _ = meta.partition("\t")
    return name

In [None]:
# Build the product key -> metadata lookup used by the helper functions above.
product_key_conversion = get_product_key_conversion(item_metadata)
# Keys are stored as "product_<id>"; a bare "1" is not a key and would simply
# be echoed back unchanged by both helpers.
print(product_key_to_meta("product_1"))
print(product_key_to_name("product_1"))

### Adding Product Metadata (only Category for now) to **train** sentences

In [None]:
def get_category_key_conversion(metadata) -> dict:
    """Build a lookup from department name to its category key.

    Args:
        metadata: DataFrame with the columns ``department`` and
            ``department_id``.

    Returns:
        Dict mapping ``str(department)`` to ``"category_<department_id>"``.
        When a department occurs more than once, the first row wins.
    """
    department_to_key = {}

    for _, product in metadata.iterrows():
        department = str(product["department"])
        category_key = f'category_{product["department_id"]}'
        # Keep the first key seen for a department.
        if department not in department_to_key:
            department_to_key[department] = category_key

    return department_to_key

def category_to_key(category):
    """Translate a department name into its ``category_<id>`` key.

    Reads the notebook-level ``category_key_conversion`` dict and asserts
    that it is non-empty. Unknown names are returned unchanged.
    """
    assert category_key_conversion
    if category in category_key_conversion:
        return category_key_conversion[category]
    return category

In [None]:
# Create category key conversion if not exists
category_key_conversion = get_category_key_conversion(item_metadata)
print(category_to_key("dairy eggs"))

In [None]:
test_items = train_data[0][1:]

def add_product_categories(items):
    """Interleave every item with the key of its category.

    Args:
        items: Iterable of product keys.

    Returns:
        A list ``[item_0, category_key_0, item_1, category_key_1, ...]``.
        The category is the second tab-separated field of the item's
        metadata string, passed through ``category_to_key``.
    """
    interleaved = []

    for item in items:
        department = product_key_to_meta(item).split("\t")[1]
        interleaved.extend((item, category_to_key(department)))

    return interleaved

print(test_items)
print(add_product_categories(test_items))

In [None]:
def create_sentences(data, stage, is_np_array=True, overwrite=True):
    """Write one whitespace-separated sentence per transaction to a text file.

    Each written line is ``<user_id> <item> <category_key> <item> ...``: the
    user id comes first, followed by the items interleaved with their
    category keys (see ``add_product_categories``).

    Args:
        data: Iterable of transactions; element 0 is the user id and the
            remaining elements are the purchased items.
        stage: Name used for the output file (``<stage>.txt``).
        is_np_array: Only ``True`` is implemented. With ``False`` a notice is
            printed and no sentences are written.
        overwrite: When ``False`` nothing is written and only the path is
            returned.

    Returns:
        The path ``sentences/<algorithm>/<stage>.txt``, where ``algorithm`` is
        the notebook-level setting.
    """
    filepath = f"sentences/{algorithm}/{stage}.txt"
    if not overwrite:
        return filepath

    print(
        f"Creating sentences for the {stage} stage for {algorithm} in {filepath}"
    )
    # Make sure the target directory exists so open() does not fail on a
    # fresh checkout.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    with open(filepath, "w") as file:
        if not is_np_array:
            print("Not defined as of now. Please implement.")
            return filepath

        for transaction in data:
            # Skip empty transactions *before* indexing into them; the
            # previous guard ran after transaction[0] had already been read.
            if len(transaction) == 0:
                continue
            # user id is always first in list, then all the purchased items
            user_id = [transaction[0]]
            # add product categories to the item list
            items_with_metadata = add_product_categories(transaction[1:])
            # as this is user_item2vec, also include the user
            file.write(" ".join(map(str, user_id + items_with_metadata)) + "\n")

    return filepath

In [None]:
class SentenceIterator(object):
    """Re-iterable stream of ``TaggedDocument`` objects read from a text file.

    Each non-blank line of the file is split on whitespace. The first token
    is used as the document tag (the user) and the remaining tokens as the
    document words (the items). The file is re-opened on every iteration, so
    the object can be iterated multiple times.
    """

    def __init__(self, sentences_filepath):
        """Remember the path of the sentences file; nothing is read yet."""
        self.sentences_filepath = sentences_filepath

    def __iter__(self):
        """Yield one ``TaggedDocument`` per non-blank line of the file."""
        assert os.path.exists(self.sentences_filepath)
        # The context manager closes the handle even if the consumer stops
        # iterating early or an exception is raised.
        with open(self.sentences_filepath) as sentences_file:
            for line in sentences_file:
                transaction = line.split()
                # A blank line has no user token; skip it instead of
                # failing on transaction[0].
                if not transaction:
                    continue
                # user is always first: transaction[0] so feed it as tags, items are transaction[1:]
                yield TaggedDocument(words=transaction[1:], tags=[transaction[0]])

In [None]:
# train_sentences_path = create_sentences(train_data, "train", overwrite=False)
train_sentences_path = create_sentences(train_data, f"{small}train", is_np_array=True, overwrite=True)
train_sentences = SentenceIterator(sentences_filepath=train_sentences_path)

## Training Embeddings

In [None]:
def get_filtered_model_vectors(model, verbose=False):
    """Remove all ``category_*`` tokens from a trained model's vocabulary.

    The tokens are deleted in place from ``model.wv.vocab`` and
    ``model.wv.index2word``. ``model.wv.vectors`` and
    ``model.trainables.syn1neg`` are not modified; filtered copies with the
    corresponding rows removed are returned instead.

    NOTE(review): nothing here renumbers the remaining vocabulary entries, so
    after this call ``model.wv.index2word`` lines up with the returned arrays
    rather than with ``model.wv.vectors`` — confirm that callers rely only on
    the returned arrays.

    Args:
        model: Object exposing ``wv.index2word``, ``wv.vocab``, ``wv.vectors``
            and ``trainables.syn1neg``.
        verbose: When true, print how many and which tokens are removed.

    Returns:
        Tuple ``(model, embedding_vectors, context_vectors)``.
    """
    to_trim = [
        (index, item_key)
        for index, item_key in enumerate(model.wv.index2word)
        if item_key.startswith('category_')
    ]
    # Build both lists explicitly: unpacking ``zip(*to_trim)`` raises a
    # ValueError when the model contains no category tokens at all.
    indices_to_trim = [index for index, _ in to_trim]
    words_to_trim = [item_key for _, item_key in to_trim]

    if verbose:
        print(f"Removing {len(words_to_trim)} categories from the model: {words_to_trim}")

    for word in words_to_trim:
        del model.wv.vocab[word]

    # An explicit integer dtype keeps np.delete happy for an empty index list.
    row_indices = np.asarray(indices_to_trim, dtype=np.intp)
    embedding_vectors = np.delete(model.wv.vectors, row_indices, axis=0)
    context_vectors = np.delete(model.trainables.syn1neg, row_indices, axis=0)

    # Delete from the back so earlier positions stay valid while removing.
    for index in sorted(indices_to_trim, reverse=True):
        del model.wv.index2word[index]

    # Sanity check: no removed token may survive in either structure.
    remaining_words = set(model.wv.index2word)
    for word in words_to_trim:
        assert word not in model.wv.vocab
        assert word not in remaining_words

    return model, embedding_vectors, context_vectors

In [None]:
def train_model(
    train_data,
    epochs,
    embedding_size,
    window_size,
    ns_exponent,
    number_of_negative_samples,
    min_count,
    sample,
    save=False,
):
    """Train a Doc2Vec model on tagged transactions and return it.

    Args:
        train_data: Passed to ``Doc2Vec`` as ``documents``.
        epochs: Passed as ``epochs``.
        embedding_size: Passed as ``vector_size``.
        window_size: Passed as ``window``.
        ns_exponent: Passed as ``ns_exponent``.
        number_of_negative_samples: Passed as ``negative``.
        min_count: Passed as ``min_count``.
        sample: Passed as ``sample``.
        save: When true, the model is saved to
            ``models/<algorithm>/embeddings.model`` (``algorithm`` is the
            notebook-level setting) and a confirmation is printed.

    Returns:
        The trained model, after ``init_sims(replace=True)`` has been called
        on it.
    """
    # PV-DBOW: dm=0, dbow_words=1
    # PV-DM without concatenation would be dm=1, dm_concat=0
    doc2vec_settings = dict(
        documents=train_data,
        dm=0,
        # 1 trains word vectors as well, 0 trains only the doc vectors
        dbow_words=1,
        vector_size=embedding_size,
        window=window_size,
        min_count=min_count,
        compute_loss=True,
        workers=multiprocessing.cpu_count(),
        hs=0,
        sample=sample,
        negative=number_of_negative_samples,
        ns_exponent=ns_exponent,
        epochs=epochs,
    )
    model = Doc2Vec(**doc2vec_settings)

    model.init_sims(replace=True)

    if save:
        model.save(f"models/{algorithm}/embeddings.model")
        print("Model Saved")

    return model

In [None]:
embeddings = Item2VecEmbeddings(algorithm="user-meta-item2vec",
                                product_key_conversion=data_loader.product_key_conversion,
                                with_meta=True,
                                with_user=True)

embeddings.train_model(
                    data_loader.train_data_iterator,
                    epochs=15,
                    embedding_size=128,
                    window_size=5, # 100,
                    min_count=10,
                    number_of_negative_samples=7,
                    sample=0.01,
                    ns_exponent=0.5,
                    save=False,
                )

# model = train_model(
#                     train_sentences,
#                     epochs=1,
#                     embedding_size=128,
#                     window_size=200, # using 100 * metadata added. In this case I only added category which doubles the items in each sentence
#                     min_count=10,
#                     number_of_negative_samples=7,
#                     sample=0.1,
#                     ns_exponent=0.25,
#                     save=True,
#                 )

# model = Doc2Vec.load(f"models/{algorithm}/embeddings.model")

In [None]:
# NOTE(review): `model` is not assigned in any cell visible above (the
# train_model(...) and Doc2Vec.load(...) calls are commented out) — confirm
# where it is supposed to come from before running this cell.
# Strips the `category_*` tokens and returns the filtered embedding and
# context matrices.
model, embedding_vectors, context_vectors = get_filtered_model_vectors(model, 
                                                                       verbose=True)

In [None]:
# Create a matrix filled with embeddings of all items considered.
mapping = {item_key: index for index, item_key in enumerate(model.wv.index2word)}
mapping_back = {index: item_key for item_key, index in mapping.items()}
embedding = [model.wv[key] for key in mapping.keys()]

assert len(embedding) == len(mapping) == len(embedding_vectors) == len(context_vectors)

# embedding = [model.wv[key] for key in model.wv.vocab.keys() if key.startswith('product_')]
# context_vectors = [vector for vector in model.trainables.syn1neg]
# mapping = {elem: i for i, elem in enumerate(model.wv.vocab.keys())}
# mapping_back = {v: k for k, v in mapping.items()}

In [None]:
y_category_list = []
y_aisle_list = []
        
for key in model.wv.vocab.keys():
    y_category_list.append(product_key_to_meta(key).split("\t")[1])
    y_aisle_list.append(product_key_to_meta(key).split("\t")[2])
    
assert len(embedding) == len(y_category_list) == len(y_aisle_list)
print(f"Number of Categories: {len(set(y_category_list))}")
print(f"Number of Aisles: {len(set(y_aisle_list))}")

In [None]:
y_category_list[:5]

In [None]:
y_aisle_list[:5]

## Category and Aisle Prediction

In [None]:
def predict_labels(classifier, x, y, test_size=0.5):
    """Fit ``classifier`` on part of the data and score it with F1 on the rest.

    The labels are integer-encoded, the data is split with a fixed
    ``random_state`` of 0, the classifier is fitted on the training part and
    evaluated on the held-out part.

    Args:
        classifier: Object with ``fit(x, y)`` and ``predict(x)`` methods.
        x: Feature vectors, one per sample.
        y: Labels, one per sample.
        test_size: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(f1_micro, f1_macro, f1_weighted)``, each rounded to four
        decimals.
    """
    encoded_labels = LabelEncoder().fit_transform(y)
    x_train, x_test, y_train, y_test = train_test_split(
        x, encoded_labels, test_size=test_size, random_state=0
    )

    classifier.fit(x_train, y_train)
    y_predictions = classifier.predict(x_test)

    # scikit-learn metrics take (y_true, y_pred). The arguments used to be
    # swapped, which made the weighted average use the support of the
    # *predicted* labels instead of the true ones.
    f1_micro, f1_macro, f1_weighted = (
        round(f1_score(y_test, y_predictions, average=average), 4)
        for average in ("micro", "macro", "weighted")
    )

    return f1_micro, f1_macro, f1_weighted

In [None]:
k_neighbors = 10
k_neighbors_classifier = KNeighborsClassifier(n_neighbors=k_neighbors, n_jobs=-1)

In [None]:
category_f1, aisle_f1 = embeddings.evaluate_embeddings(k_neighbors=10)

# category_f1 = predict_labels(classifier=k_neighbors_classifier, x=embedding, y=y_category_list)
# aisle_f1 = predict_labels(classifier=k_neighbors_classifier, x=embedding, y=y_aisle_list)

# print(f"Micro: {category_f1[0]}, Macro: {category_f1[1]}, Weighted: {category_f1[2]}")
# print(f"Micro: {aisle_f1[0]}, Macro: {aisle_f1[1]}, Weighted: {aisle_f1[2]}")

In [None]:
# Small Data
# 5 epochs (64 dims)
# Micro: 0.5179, Macro: 0.3083, Weighted: 0.567
# Micro: 0.3111, Macro: 0.1314, Weighted: 0.367

# # 5 epochs (32 dims)
# Micro: 0.559, Macro: 0.3312, Weighted: 0.6019
# Micro: 0.3231, Macro: 0.1447, Weighted: 0.3697

# 5 epochs (128 dims)
# Micro: 0.4068, Macro: 0.1842, Weighted: 0.4701
# Micro: 0.2479, Macro: 0.1, Weighted: 0.3145

# 5 epochs (128 dims) using model.init_sims()
# Micro: 0.5846, Macro: 0.3872, Weighted: 0.6137
# Micro: 0.3487, Macro: 0.1544, Weighted: 0.3917

# 5 epochs (256 dims)
# Micro: 0.3316, Macro: 0.1322, Weighted: 0.4191
# Micro: 0.2017, Macro: 0.0816, Weighted: 0.265

# 15 epochs
# Micro: 0.4803, Macro: 0.2815, Weighted: 0.5265
# Micro: 0.2974, Macro: 0.1203, Weighted: 0.3548

# 25 epochs
# Micro: 0.4855, Macro: 0.2555, Weighted: 0.5369
# Micro: 0.3043, Macro: 0.1338, Weighted: 0.3557

# 100 epochs
# Micro: 0.4684, Macro: 0.2413, Weighted: 0.5284
# Micro: 0.2923, Macro: 0.1236, Weighted: 0.3459

# TEST START: MOST POPULAR FOR USER

In [None]:
# NOTE(review): `n_items` is not assigned in any cell visible above — confirm
# its source. generate_user_item_interactions accepts it but never reads it.
user_transactions_map, user_item_frequency, item_frequency = generate_user_item_interactions(train_data=train_data, 
                                                                                             n_items=n_items)

In [None]:
most_popular_for_user_baseline = MostPopularForUserRecommender(n_items=n_items, user_item_frequency=user_item_frequency, item_frequency=item_frequency)

In [None]:
# Validation Set and Test Set
k = 10
hit_rate_at_k_val, ndcg_at_k_val = most_popular_for_user_baseline.evaluate(validation_data, k=k)
print(f"Hit Rate @ {k} on Validation Set: {hit_rate_at_k_val}")
print(f"NDCG @ {k} on Validation Set: {ndcg_at_k_val}")

# hit_rate_at_k_test, ndcg_at_k_test = item2vec_recommender.evaluate(test_set, k=k)
# print(f"Hit Rate @ {k} on Test Set: {hit_rate_at_k_test}")
# print(f"NDCG @ {k} on Test Set: {ndcg_at_k_test}")

# TEST END

## UserMetaItem2Vec Recommender

In [None]:
usermetaitem2vec_recommender = Item2VecRecommender(algorithm="user-meta-item2vec", 
                                                   item_key_mapping=embeddings.mapping, 
                                                   user_item_frequency=data_loader.user_item_frequency,
                                                   embedding_vectors=embeddings.embedding_vectors, 
                                                   context_vectors=embeddings.context_vectors, 
                                                   user_vectors=embeddings.user_vectors)
print(usermetaitem2vec_recommender.embedding_vectors.shape)
print(usermetaitem2vec_recommender.context_vectors.shape)

# usermetaitem2vec_recommender = Item2VecRecommender(algorithm="user-meta-item2vec", 
#                                                    item_key_mapping=mapping, 
#                                                    user_item_frequency=user_item_frequency,
#                                                    embedding_vectors=embedding_vectors, 
#                                                    context_vectors=context_vectors, 
#                                                    user_vectors=model.docvecs)
# print(usermetaitem2vec_recommender.embedding_vectors.shape)
# print(usermetaitem2vec_recommender.context_vectors.shape)

# TEST START

In [None]:
from collections import OrderedDict

In [None]:
def get_closest_items_for_user(user_id):
    """Print the 10 items most similar to a user's document vector.

    Uses the notebook-level ``model``: the user's vector is read from
    ``model.docvecs`` and passed to ``model.wv.most_similar``. Each line
    shows the product name and the similarity rounded to three decimals.
    """
    user_vector = model.docvecs[user_id]
    neighbours = model.wv.most_similar([user_vector], topn=10)
    for item_key, score in neighbours:
        print(f"{product_key_to_name(item_key)} - {round(score, 3)}")
    print("\n")
        
def clone_get_closest_items_for_user(user_id):
    """Print the 10 items whose normalized vectors best match a user's vector.

    Hand-rolled counterpart of ``get_closest_items_for_user``: the rows of
    ``model.wv.vectors`` and the user's vector from ``model.docvecs`` are
    L2-normalized with ``_l2_norm`` and compared with a dot product. Each
    line shows the product name and the score rounded to three decimals.
    """
    item_matrix = _l2_norm(model.wv.vectors)
    user_vector = _l2_norm(model.docvecs[user_id])
    similarities = np.dot(item_matrix, user_vector)

    # Positions of the ten highest scores, best first.
    top_positions = argsort(similarities, topn=10, reverse=True)
    ranked = [
        (model.wv.index2word[position], float(similarities[position]))
        for position in top_positions
    ]

    for item_key, score in ranked:
        print(f"{product_key_to_name(item_key)} - {round(score, 3)}")

    print("\n")
    
def get_most_popular_for_user(user_id):
    """Print the (up to) 10 items a user bought most often, with their counts.

    Reads the notebook-level ``user_item_frequency`` mapping. Items are
    ordered by purchase count, highest first; ties keep the mapping's order.
    """
    ranked = sorted(
        user_item_frequency[user_id].items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for item_key, count in ranked[:10]:
        print(f"{product_key_to_name(item_key)} - {count}")

    print("\n")

In [None]:
user_id = 'user_71' # 71, 79

In [None]:
# get_closest_items_for_user(user_id)
clone_get_closest_items_for_user(user_id)
get_most_popular_for_user(user_id)

In [None]:
test_context_vectors = model.trainables.syn1neg
test_embedding = model.wv
test_user_vector = model.docvecs
user_id = validation_data[0][0]
items = validation_data[0][1:]

# First get the top complementary items
test_item_embeddings = [model.wv[key] for key in items if key in model.wv]
test_mean_basket_vector = np.mean(test_item_embeddings, 0)

test_distances = np.dot(test_context_vectors, test_mean_basket_vector)

test_candidate_list = argsort(test_distances, topn=100, reverse=True)
        
test_indices = [model.wv.index2word[item] for item in test_candidate_list]
test_indices_index = {i: item for (i, item) in enumerate(test_indices)}

# then rank the top items by distance to user (from largest to smallest)
test_candidate_embeddings = [model.wv[key] for key in test_indices if key in model.wv]
test_candidate_embeddings = np.array(test_candidate_embeddings)

test_user_distances = np.dot(_l2_norm(test_candidate_embeddings), _l2_norm(test_user_vector[user_id]))
test_user_candidate_list = argsort(test_user_distances, topn=10, reverse=True)
test_user_indices = [test_indices_index[item] for item in test_user_candidate_list]

# test_user_distances = np.dot(test_candidate_embeddings, test_user_vector[user_id])
# test_user_candidate_list = argsort(test_user_distances, topn=20, reverse=True)
# test_user_indices = [model.wv.index2word[item] for item in test_user_candidate_list]
for item in test_user_indices:
    print(f"{product_key_to_name(item)}")

In [None]:
from gensim.matutils import argsort
import math

In [None]:
# NOT NEEDED, init_sims does the job
def _l2_norm(m, replace=False):
        """ Return an L2-normalized version of a matrix. """
        dist = np.sqrt((m ** 2).sum(-1))[..., np.newaxis]
        if replace:
            m /= dist
            return m
        else:
            return (m / dist).astype(np.float32)

In [None]:
def predict_items(recommender, user_id, given_items):
    """Score every item for a user given the items already in the basket.

    Two scores are blended per item:
        * a basket score: dot product of the item's context vector with the
          mean embedding of ``given_items``;
        * a user score: dot product of the item's embedding with the user's
          vector.

    The result is ``basket * (1 - alpha) + user * alpha`` with
    ``alpha = recommender.alpha``.

    Args:
        recommender: Object exposing ``item_key_mapping``,
            ``embedding_vectors``, ``context_vectors``, ``user_vectors`` and
            ``alpha``.
        user_id: Key into ``recommender.user_vectors``.
        given_items: Item keys in the basket; each must be present in
            ``recommender.item_key_mapping``.

    Returns:
        Array with one combined score per row of the embedding matrices.
    """
    # Translate the item keys into row positions and average their embeddings.
    basket_rows = [recommender.item_key_mapping[key] for key in given_items]
    basket_vector = np.mean(recommender.embedding_vectors[basket_rows], axis=0)

    # Complementary items are scored with a plain dot product against the
    # context vectors, not with cosine similarity.
    basket_scores = np.dot(recommender.context_vectors, basket_vector)

    user_scores = np.dot(
        recommender.embedding_vectors, recommender.user_vectors[user_id]
    )

    return (basket_scores * (1 - recommender.alpha)) + (user_scores * (recommender.alpha))

In [None]:
def evaluate_transaction(recommender, user_id, given_items, test_items):
    """Compute per-item AUC and NDCG for the held-out items of one transaction.

    All items are scored with ``predict_items``. Every item that is not in
    ``test_items`` counts as a negative. For each held-out item the number of
    negatives scoring at least as high is counted, and from that:
        * ``auc  = (n_negative - count) / n_negative``
        * ``ndcg = 1 / log2(2 + count)``

    Args:
        recommender: Object exposing ``item_key_mapping`` and ``n_items`` in
            addition to what ``predict_items`` needs.
        user_id: User the transaction belongs to.
        given_items: Item keys used as the basket.
        test_items: Held-out item keys to evaluate.

    Returns:
        Tuple ``(auc, ndcg)`` of arrays with one entry per held-out item.
    """
    scores = predict_items(recommender, user_id=user_id, given_items=given_items)

    # Flag every row as negative, then clear the flag for the held-out items.
    negative_flags = np.ones(recommender.n_items)
    target_rows = [recommender.item_key_mapping[key] for key in test_items]
    negative_flags[target_rows] = 0

    target_scores = scores[target_rows]
    negative_scores = scores[negative_flags > 0]
    n_negative = len(negative_scores)

    # Broadcast to an (n_negative, n_targets) comparison table and count, per
    # held-out item, the negatives that score at least as high.
    targets_row = target_scores.reshape(1, len(target_scores))
    negatives_column = negative_scores.reshape(n_negative, 1)
    outranked_by = (targets_row <= negatives_column).sum(axis=0)

    auc = (n_negative - outranked_by) / n_negative
    ndcg = 1.0/np.log2(2 + outranked_by)

    return auc, ndcg

In [None]:
def evaluate(recommender, test_transactions):
    """Average AUC and NDCG of the recommender over a set of transactions.

    For every transaction the items unknown to the recommender are dropped.
    Transactions left with fewer than two items are skipped. The first half
    of the remaining items (rounded up) is used as the basket and the second
    half is held out and scored with ``evaluate_transaction``.

    Args:
        recommender: Object exposing ``item_key_mapping`` plus whatever
            ``evaluate_transaction`` needs.
        test_transactions: Iterable of transactions; element 0 is the user id
            and the remaining elements are the purchased items.

    Returns:
        Tuple ``(auc, ndcg)``: the means over all evaluated transactions,
        rounded to four decimals.
    """
    started_at = time.time()

    # Iterating over a plain list is cheaper than over a numpy array.
    transactions = list(test_transactions)
    min_transaction_items = 2
    print(f"{len(transactions)} transactions to evaluate.")

    per_transaction_metrics = []
    for transaction in transactions:
        # The user id leads the transaction; everything after it is an item.
        user_id = transaction[0]
        known_items = [
            item for item in transaction[1:] if item in recommender.item_key_mapping
        ]

        if len(known_items) < min_transaction_items:
            continue

        split_at = math.ceil(len(known_items) / 2)
        auc_values, ndcg_values = evaluate_transaction(
            recommender,
            user_id=user_id,
            given_items=known_items[:split_at],
            test_items=known_items[split_at:],
        )
        per_transaction_metrics.append([auc_values.mean(), ndcg_values.mean()])

    evaluated_count = len(per_transaction_metrics)
    mean_metrics = np.array(per_transaction_metrics).mean(axis=0)
    auc = round(mean_metrics[0], 4)
    ndcg = round(mean_metrics[1], 4)

    print(f"Evaluated {evaluated_count} transactions.")
    print(f"Took {round((time.time()-started_at)/60., 5)} minutes.")

    return auc, ndcg

In [None]:
# Validation Set and Test Set
validation_auc, validation_ndcg = evaluate(usermetaitem2vec_recommender, validation_data)
print(f"AUC on Validation Set: {validation_auc}")
print(f"NDCG on Validation Set: {validation_ndcg}")

In [None]:
# Baseline (without user vector)
# Hit Rate @ 10 on Validation Set: 0.1627
# NDCG @ 10 on Validation Set:     0.0914

# Without l2_norm (not cosine similarity)
# Hit Rate @ 10 on Validation Set: 0.0468
# NDCG @ 10 on Validation Set: 0.0225
    
# With l2_norm (cosine similarity)
# Hit Rate @ 10 on Validation Set: 0.0382
# NDCG @ 10 on Validation Set: 0.0173

# Hit Rate @ 10 on Validation Set: 0.0224
# NDCG @ 10 on Validation Set: 0.0108

# PV-DM method: Using the closest items for user
# Hit Rate @ 10 on Validation Set: 0.0302
# NDCG @ 10 on Validation Set: 0.0147

# PV-DM method, Candidate Generation: 20, Ranking by Cosine Similarity to User Vector
# Hit Rate @ 10 on Validation Set: 0.0844
# NDCG @ 10 on Validation Set: 0.0391

# PV-DBOW method: Using the closest items for user
# Hit Rate @ 10 on Validation Set: 0.0962
# NDCG @ 10 on Validation Set: 0.0501

# PV-DBOW method: Candidate Generation: 20, Ranking by Cosine Similarity to User Vector
# Hit Rate @ 10 on Validation Set: 0.1056
# NDCG @ 10 on Validation Set: 0.0502

# PV-DBOW method: Candidate Generation: 100, Ranking by Cosine Similarity to User Vector
# Hit Rate @ 10 on Validation Set: 0.0839
# NDCG @ 10 on Validation Set: 0.0423

# PV-DBOW method: Candidate Generation: 200, Ranking by Cosine Similarity to User Vector
# Hit Rate @ 10 on Validation Set: 0.0844
# NDCG @ 10 on Validation Set: 0.0425

# TEST OVER

In [None]:
# Validation Set and Test Set
# Within Basket Recommendations
val_auc, val_ndcg, val_recall, val_precision = usermetaitem2vec_recommender.evaluate(data_loader.validation_data, 
                                                                                     k=10, 
                                                                                     within_basket=True)

test_auc, test_ndcg, test_recall, test_precision = usermetaitem2vec_recommender.evaluate(data_loader.test_data, 
                                                                                         k=10, 
                                                                                         within_basket=True)

In [None]:
# Next Basket Recommendations
val_auc, val_ndcg, val_recall, val_precision = usermetaitem2vec_recommender.evaluate(data_loader.validation_data, 
                                                                             k=10, 
                                                                             within_basket=False)

test_auc, test_ndcg, test_recall, test_precision = usermetaitem2vec_recommender.evaluate(data_loader.test_data, 
                                                                                 k=10, 
                                                                                 within_basket=False)

## Visualization

In [None]:
def create_embedding_files_for_visualization(model):
    """Write item embeddings and their metadata as two aligned TSV files.

    For every key in ``model.wv.vocab`` one line is written to each file:
        * ``visualization/<algorithm>_target_vectors.tsv`` — the embedding
          values, tab-separated;
        * ``visualization/<algorithm>_target_metadata.tsv`` — the string
          returned by ``product_key_to_meta`` for that key, below a header
          row ``ProductName / Category / Aisle``.

    ``algorithm`` is the notebook-level setting.

    Args:
        model: Object exposing ``wv.vocab`` and item lookup via ``wv[key]``.
    """
    target_vectors_filepath = f"visualization/{algorithm}_target_vectors.tsv"
    target_metadata_filepath = f"visualization/{algorithm}_target_metadata.tsv"

    # Context managers make sure both files are closed even if a write fails.
    with open(target_vectors_filepath, "w", encoding="utf-8") as out_v, \
            open(target_metadata_filepath, "w", encoding="utf-8") as out_m:
        # Meta File Header
        out_m.write("ProductName\tCategory\tAisle" + "\n")

        for key in model.wv.vocab.keys():
            embedding_vector = model.wv[key]
            # META Input
            out_m.write(product_key_to_meta(key) + "\n")
            out_v.write("\t".join([str(x) for x in embedding_vector]) + "\n")

In [None]:
embeddings.create_embedding_files_for_visualization(product_key_conversion=product_key_conversion)
# create_embedding_files_for_visualization(model=model)

## Users Visualization

In [None]:
# `orders_path` only exists as a commented-out line in the data-loading cell,
# so define it here to keep this cell runnable on a fresh kernel.
orders_path = f"preprocessed_data/{small}order_data.csv"
orders = pd.read_csv(orders_path)

In [None]:
def create_user_embedding_files_for_visualization(model, users):
    """Write the user embeddings and the matching user names as paired files.

    Both files are written under ``visualization/`` and are named after the
    module-level ``algorithm`` string:

    * ``{algorithm}_target_user_vectors.tsv``: one line per user, the
      embedding components joined by tabs.
    * ``{algorithm}_target_user_names.tsv``: one ``user_<id>`` name per line,
      in the same order as the vectors file (no header line).

    Users whose ``user_<id>`` tag is not present in ``model.docvecs`` are
    skipped in both files.

    Args:
        model: Trained model exposing ``model.docvecs`` with membership test
            and lookup by ``user_<id>`` tag.
        users: Iterable of user ids.
    """
    target_user_vectors_filepath = f"visualization/{algorithm}_target_user_vectors.tsv"
    target_user_names_filepath = f"visualization/{algorithm}_target_user_names.tsv"

    # Context managers close (and flush) both files even when a lookup raises
    # partway through the user list.
    with open(target_user_vectors_filepath, "w", encoding="utf-8") as out_v, \
            open(target_user_names_filepath, "w", encoding="utf-8") as out_m:
        for user_id in users:
            user_name = f"user_{user_id}"
            if user_name in model.docvecs:
                user_embedding_vector = model.docvecs[user_name]
                # Name line and vector line are written in lockstep so that
                # line i of one file describes line i of the other.
                out_m.write(f"{user_name} \n")
                out_v.write("\t".join([str(x) for x in user_embedding_vector]) + "\n")

In [None]:
# Export an embedding for every distinct user_id found in the orders table;
# ids without a matching tag in the model are skipped by the function.
create_user_embedding_files_for_visualization(model=model, users=list(orders["user_id"].unique()))

# Hyperparameter Search

In [None]:
# We perform
# a hyperparameter search (300k models evaluated) on: the number of
# epochs n (10 to 200 with step of +10), the window-size L (3, 7, 12, 15),
# the sub-sampling parameter t (Eq. (2)) (10^-5
# to 10^-1 with step of ×10), the negative sampling distribution parameter α (Eq. (3)) (-1.4
# to 1.4 with step of +0.2), the embedding size (50 to 200 with a step
# of 50), the number of negative samples (5 to 20 with a step of 5) and
# the learning rate (0.0025 to 0.25 with a step of ×10). The marginal
# benefit of including the 3 latter variables to the optimization is not
# significant, with less than 2% in terms of performance. Thus, for
# readability, we only focus on the influence of the 4 first hyperparameters and keep the other fixed to default values (respectively
# 50, 5 and 0.025).

In [None]:
# Baseline

# Most Popular 
# Hit Rate @ 10 on Validation Set: 0.0703 +/- 0.0006
# Hit Rate @ 10 on Test Set:       0.0709 +/- 0.0004
# NDCG @ 10 on Validation Set:     0.0389 +/- 0.0005
# NDCG @ 10 on Test Set:           0.039 +/- 0.0002

# Most Popular For User
# Hit Rate @ 10 on Validation Set: 0.3043 +/- 0.0008
# Hit Rate @ 10 on Test Set:       0.2804 +/- 0.0006
# NDCG @ 10 on Validation Set:     0.1693 +/- 0.0003
# NDCG @ 10 on Test Set:           0.1558 +/- 0.0005

# Item Co-Count
# Hit Rate @ 10 on Validation Set: 0.1071 +/- 0.0012
# Hit Rate @ 10 on Test Set:       0.1072 +/- 0.0001
# NDCG @ 10 on Validation Set:     0.0584 +/- 0.0007
# NDCG @ 10 on Test Set:           0.0586 +/- 0.0001

In [None]:
# Hyperparameter grid. Every combination of the six lists below trains and
# evaluates one model, so the run time grows multiplicatively with each entry.
epochs = [5, 25]
window_sizes = [100, 200]
samples = [0.01, 0.1] # 0.01
ns_exponents = [0.25, 0.5, 0.75] # 0.5
embedding_sizes = [128]
numbers_of_negative_samples = [7, 14]

# NOTE(review): numbers_of_candidates and k_predictions are not consumed by the
# search below (nothing in this cell passes them on); they are kept only so
# that the names stay defined for other cells.
numbers_of_candidates = [25, 100] 
k_neighbors = 10
k_predictions = 10

results = []

# Flatten the six nested loops into one list of combinations, in the same
# order the nested loops would visit them (last list varies fastest).
parameter_grid = [
    (epoch, window_size, sample, ns_exponent, embedding_size, number_of_negative_samples)
    for epoch in epochs
    for window_size in window_sizes
    for sample in samples
    for ns_exponent in ns_exponents
    for embedding_size in embedding_sizes
    for number_of_negative_samples in numbers_of_negative_samples
]

for (epoch, window_size, sample, ns_exponent,
     embedding_size, number_of_negative_samples) in parameter_grid:
    start = time.time()
    print(f"Epoch: {epoch}, Window Size: {window_size}, Sample: {sample}, NS Exponent: {ns_exponent}, Embedding Size: {embedding_size}, Number of Negative Samples: {number_of_negative_samples}")

    train_sentences = SentenceIterator(sentences_filepath=train_sentences_path)

    # Train the model
    model = train_model(
        train_sentences,
        epochs=epoch,
        embedding_size=embedding_size,
        window_size=window_size,
        min_count= 10,
        number_of_negative_samples=number_of_negative_samples,
        sample=sample,
        ns_exponent=ns_exponent,
        save=False,
    )

    model, embedding_vectors, context_vectors = get_filtered_model_vectors(model)

    mapping = {item_key: index for index, item_key in enumerate(model.wv.index2word)}
    mapping_back = {index: item_key for item_key, index in mapping.items()}
    embedding = [model.wv[key] for key in mapping.keys()]

    assert len(embedding) == len(mapping) == len(embedding_vectors) == len(context_vectors)

    # Collect the labels by walking the SAME key sequence that produced
    # `embedding`. Iterating model.wv.vocab instead (as before) does not
    # guarantee the index2word order, which would pair vectors with the
    # labels of other items.
    y_category_list = []
    y_aisle_list = []
    for key in mapping:
        meta_fields = product_key_to_meta(key).split("\t")
        y_category_list.append(meta_fields[1])
        y_aisle_list.append(meta_fields[2])

    # K Neighbors Classifier
    k_neighbors_classifier = KNeighborsClassifier(n_neighbors=k_neighbors, n_jobs=-1)

    category_f1 = predict_labels(classifier=k_neighbors_classifier, x=embedding, y=y_category_list)
    aisle_f1 = predict_labels(classifier=k_neighbors_classifier, x=embedding, y=y_aisle_list)

    print(f"Category - Micro: {category_f1[0]}, Macro: {category_f1[1]}, Weighted: {category_f1[2]}")
    print(f"Aisle    - Micro: {aisle_f1[0]}, Macro: {aisle_f1[1]}, Weighted: {aisle_f1[2]}")

    # Recommender
    usermetaitem2vec_recommender = UserItem2VecRecommender(algorithm="user-meta-item2vec", 
                                                           item_key_mapping=mapping, 
                                                           embedding=embedding_vectors, 
                                                           context_vectors=context_vectors, 
                                                           user_vectors=model.docvecs)

    # Evaluate the recommender built in THIS iteration (the previous code
    # referenced a differently named variable that this cell never assigns).
    # NOTE(review): other cells in this notebook call evaluate(..., k=10,
    # within_basket=...) and unpack four values; confirm that
    # UserItem2VecRecommender.evaluate(validation_data) returns two.
    validation_auc, validation_ndcg = usermetaitem2vec_recommender.evaluate(validation_data)
    print(f"AUC on Validation Set: {validation_auc}")
    print(f"NDCG on Validation Set: {validation_ndcg}")

    # test_auc, test_ndcg = usermetaitem2vec_recommender.evaluate(test_data)
    # print(f"AUC on Test Set: {test_auc}")
    # print(f"NDCG on Test Set: {test_ndcg}")

    # One row per combination: the six hyperparameters, the six F1 scores and
    # the two validation metrics computed above. Only values produced in this
    # iteration are recorded.
    results.append((epoch, 
                    window_size, 
                    sample, 
                    ns_exponent, 
                    embedding_size,
                    number_of_negative_samples,
                    category_f1[0], 
                    aisle_f1[0], 
                    category_f1[1], 
                    aisle_f1[1],
                    category_f1[2], 
                    aisle_f1[2],
                    validation_auc, 
                    validation_ndcg
                   ))

    # Rewrite the whole file after every combination so that partial results
    # survive an interrupted search.
    # NOTE(review): the results-loading cell reads a different path
    # (results/{algorithm}_parameter_search.txt) with a different row layout.
    with open('parameter_search.txt', 'w') as f:
        for line in results:
            f.write(f"{str(line)}\n")

    end = time.time()
    print(f"Took {end - start} seconds\n")

# Rank the combinations by validation NDCG (last field of each row), best first.
# NOTE(review): the previous hard-coded index 12 addressed a weighted-F1 field
# in the old row layout; confirm NDCG is the intended ranking metric.
results.sort(key=lambda row: row[-1], reverse=True)
results

In [4]:
# Load the stored search results; every line of the file is evaluated as a
# Python expression (presumably one tuple per search run; confirm).
# SECURITY NOTE: eval() executes arbitrary code, so only run this on a results
# file you produced yourself. ast.literal_eval would be the safe parser if the
# rows contain nothing but plain literals.
with open(f'results/{algorithm}_parameter_search.txt', 'r') as f:
    results = [eval(line.strip()) for line in f]

In [5]:
# Column labels, listed in the positional order of the fields of each row.
result_columns = [
    # Hyperparameters
    "Epoch",
    "Window Size",
    "Sample",
    "NS Exponent",
    "Embedding Size",
    "Number of Negative Samples",
    # Label-prediction F1 scores
    "F1 Macro Category",
    "F1 Macro Aisle",
    "F1 Micro Category",
    "F1 Micro Aisle",
    # Within-basket recommendation metrics
    "Within-basket AUC",
    "Within-basket NDCG",
    "Within-basket Recall",
    "Within-basket Precision",
    # Next-basket recommendation metrics
    "Next-basket AUC",
    "Next-basket NDCG",
    "Next-basket Recall",
    "Next-basket Precision",
]
results_df = pd.DataFrame(results, columns=result_columns)

# Persist the table (without the index) so it can be reloaded without re-parsing.
results_csv_path = f'results/{algorithm}/{small}results.csv'
results_df.to_csv(results_csv_path, index=False)

In [14]:
# Compare the ranking metrics across the values of a single hyperparameter:
# first the per-group mean, then the per-group median.
# results_df = pd.read_csv(f'results/{algorithm}/{small}results.csv')
# results_df.sort_values("Sample", ascending=False)
column = "Number of Negative Samples"

compared_metrics = [
    "Within-basket AUC",
    "Next-basket AUC",
    "Within-basket NDCG",
    "Next-basket NDCG",
    "Within-basket Recall",
    "Next-basket Recall",
]
metrics_by_group = results_df.groupby(column)[compared_metrics]

print(metrics_by_group.mean())
print("\n#########################\n")
print(metrics_by_group.median())

                            Within-basket AUC  Next-basket AUC  \
Number of Negative Samples                                       
3                                      0.9602         0.968200   
7                                      0.9603         0.968544   

                            Within-basket NDCG  Next-basket NDCG  \
Number of Negative Samples                                         
3                                     0.152838          0.175788   
7                                     0.153944          0.177775   

                            Within-basket Recall  Next-basket Recall  
Number of Negative Samples                                            
3                                       0.071800            0.101363  
7                                       0.074069            0.104719  

#########################

                            Within-basket AUC  Next-basket AUC  \
Number of Negative Samples                                       
3                 

In [7]:
# Spearman rank correlation between every pair of columns of results_df,
# rendered with a blue-to-red colour gradient.
# NOTE: the "Window Size" row/column is entirely NaN in the output below
# (the column appears to hold a single value in these results); this is
# presumably what produces the nanmin/nanmax warning lines from the styler.
correlation = results_df.corr(method='spearman')
correlation.style.background_gradient(cmap='coolwarm')

  smin = np.nanmin(s.to_numpy()) if vmin is None else vmin
  smax = np.nanmax(s.to_numpy()) if vmax is None else vmax


Unnamed: 0,Epoch,Window Size,Sample,NS Exponent,Embedding Size,Number of Negative Samples,F1 Macro Category,F1 Macro Aisle,F1 Micro Category,F1 Micro Aisle,Within-basket AUC,Within-basket NDCG,Within-basket Recall,Within-basket Precision,Next-basket AUC,Next-basket NDCG,Next-basket Recall,Next-basket Precision
Epoch,1.0,,0.0,0.0,0.0,0.0,-0.250458,0.0,-0.412917,-0.101537,0.782911,0.105008,0.003387,0.084785,0.852988,0.433542,0.213247,0.318383
Window Size,,,,,,,,,,,,,,,,,,
Sample,0.0,,1.0,0.0,0.0,0.0,0.142152,0.148921,0.121844,0.135383,-0.223689,-0.067747,-0.016934,-0.179745,-0.152319,0.159191,0.155704,0.135482
NS Exponent,0.0,,0.0,1.0,0.0,0.0,0.866449,0.839372,0.859679,0.805526,-0.491438,-0.867164,-0.860231,-0.868201,0.260635,0.24048,0.423109,0.243868
Embedding Size,0.0,,0.0,0.0,1.0,0.0,-0.162459,-0.31138,-0.142152,-0.365533,0.26436,0.19308,0.220138,0.19331,0.389261,0.643539,0.653281,0.646926
Number of Negative Samples,0.0,,0.0,0.0,0.0,1.0,0.331687,0.39261,0.209843,0.406148,0.061006,0.413258,0.440276,0.386621,0.13878,0.491122,0.514501,0.56225
F1 Macro Category,-0.250458,,0.142152,0.866449,-0.162459,0.331687,1.0,0.949413,0.971408,0.946114,-0.665688,-0.693571,-0.642091,-0.708725,-0.024562,0.19369,0.404546,0.246148
F1 Macro Aisle,0.0,,0.148921,0.839372,-0.31138,0.39261,0.949413,1.0,0.877566,0.983871,-0.494081,-0.640008,-0.621183,-0.66575,0.128677,0.218819,0.372102,0.244131
F1 Micro Category,-0.412917,,0.121844,0.859679,-0.142152,0.209843,0.971408,0.877566,1.0,0.895528,-0.790494,-0.742732,-0.687024,-0.753904,-0.16937,0.066765,0.313262,0.128577
F1 Micro Aisle,-0.101537,,0.135383,0.805526,-0.365533,0.406148,0.946114,0.983871,0.895528,1.0,-0.56768,-0.61451,-0.597891,-0.64518,0.016314,0.119406,0.287783,0.155906


In [8]:
# Raise pandas' row display limit to 500 so long tables are rendered in full.
pd.set_option('display.max_rows', 500)

In [9]:
# Add the mean of the within-basket and next-basket recall as a new column
# (results_df is modified in place), then show the runs ranked by it, best first.
within_basket_recall = results_df["Within-basket Recall"]
next_basket_recall = results_df["Next-basket Recall"]
results_df["Avg. Recall"] = within_basket_recall.add(next_basket_recall).div(2)
results_df.sort_values(by="Avg. Recall", ascending=False)

Unnamed: 0,Epoch,Window Size,Sample,NS Exponent,Embedding Size,Number of Negative Samples,F1 Macro Category,F1 Macro Aisle,F1 Micro Category,F1 Micro Aisle,Within-basket AUC,Within-basket NDCG,Within-basket Recall,Within-basket Precision,Next-basket AUC,Next-basket NDCG,Next-basket Recall,Next-basket Precision,Avg. Recall
31,15,5,0.1,0.5,128,7,0.9456,0.5676,0.8748,0.4504,0.9605,0.1523,0.0723,0.0304,0.9704,0.1823,0.1113,0.1046,0.0918
30,15,5,0.01,0.5,128,7,0.9274,0.5546,0.8375,0.438,0.9614,0.1532,0.0732,0.0311,0.9709,0.1813,0.1101,0.1041,0.09165
27,15,5,0.1,0.25,128,7,0.7422,0.4777,0.6684,0.3809,0.9617,0.1558,0.0769,0.0324,0.9694,0.1792,0.1059,0.1016,0.0914
26,15,5,0.01,0.25,128,7,0.7385,0.4692,0.6561,0.3709,0.9623,0.1558,0.0768,0.0325,0.9699,0.1789,0.1052,0.1013,0.091
19,5,5,0.1,0.25,128,7,0.8167,0.5014,0.7999,0.4026,0.96,0.1556,0.0768,0.0323,0.9673,0.1776,0.105,0.1008,0.0909
18,5,5,0.01,0.25,128,7,0.8036,0.4895,0.7824,0.3886,0.9605,0.1555,0.0769,0.0325,0.9676,0.1772,0.104,0.1004,0.09045
23,5,5,0.1,0.5,128,7,0.9641,0.5658,0.9493,0.4622,0.959,0.1523,0.0717,0.0301,0.9683,0.1785,0.1071,0.1015,0.0894
11,15,5,0.1,0.25,64,7,0.7953,0.5261,0.7311,0.426,0.9614,0.1557,0.076,0.0323,0.9686,0.1769,0.1021,0.0993,0.08905
22,5,5,0.01,0.5,128,7,0.9421,0.5557,0.9245,0.4552,0.9596,0.1531,0.0725,0.0308,0.9687,0.1774,0.1056,0.101,0.08905
3,5,5,0.1,0.25,64,7,0.8336,0.5232,0.818,0.4237,0.9595,0.1552,0.0759,0.0321,0.9665,0.176,0.1019,0.0989,0.0889
