In [None]:
# default_exp models.prod2vec

# Prod2Vec
> Implementation of the Prod2Vec model.

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
from typing import List

import logging
import gensim
import numpy as np
import os
from abc import ABC
import ast

## Prod2Vec

In [None]:
#export
class Prod2Vec(object):
    """
    Implementation of the Prod2Vec skipgram model from
    Grbovic Mihajlo, Vladan Radosavljevic, Nemanja Djuric, Narayan Bhamidipati, Jaikit Savla, Varun Bhagwan, and Doug Sharp.
    "E-commerce in your inbox: Product recommendations at scale."
    In Proceedings of the 21st ACM SIGKDD International Conference on Knowledge Discovery and Data Mining,
    pp. 1809-1818. ACM, 2015.

    Item sequences are embedded with gensim's Word2Vec (skip-gram); a user profile is
    represented by the mean of its item vectors and recommendations are the nearest
    neighbours of that mean vector.
    """

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger()

    def __init__(self, min_count=2, negative=5, size=100, window=5, decay_alpha=0.9, workers=4):
        """
        :param min_count: (optional) the minimum item frequency. Items less frequent than min_count will be pruned
        :param negative: (optional) the number of negative samples
        :param size: (optional) the size of the embeddings
        :param window: (optional) the size of the context window
        :param decay_alpha: (optional) exponential decay factor in [0-1]. It is stored on the instance but is not
                used by this class's ``recommend``, which averages the profile vectors instead.
        :param workers: (optional) the number of training threads; must be at least 1
        :raises ValueError: if ``workers`` is smaller than 1
        """
        super(Prod2Vec, self).__init__()
        # gensim starts exactly `workers` training threads: zero or a negative count
        # means no thread ever consumes the corpus and the vectors stay untrained.
        if workers < 1:
            raise ValueError('workers must be >= 1, got {}'.format(workers))
        self.min_count = min_count
        self.negative = negative
        self.size = size
        self.window = window
        self.decay_alpha = decay_alpha
        self.workers = workers

    def __str__(self):
        return 'Prod2Vec(min_count={min_count}, ' \
               'size={size}, ' \
               'window={window}, ' \
               'decay_alpha={decay_alpha})'.format(**self.__dict__)

    def fit(self, train_data, epochs=10):
        """
        Train the item embeddings.

        :param train_data: re-iterable collection of item-id sequences (e.g. a list of lists);
                it is traversed once to build the vocabulary and again on every epoch
        :param epochs: (optional) the number of passes over ``train_data``
        """
        # Build the model without a corpus so that training happens exactly once,
        # in the explicit train() call below.
        self.model = gensim.models.Word2Vec(min_count=self.min_count,
                                            negative=self.negative,
                                            window=self.window,
                                            hs=1,
                                            size=self.size,
                                            sg=1,
                                            workers=self.workers)
        self.model.build_vocab(train_data)
        self.model.train(train_data, total_examples=self.model.corpus_count,
                         epochs=epochs, report_delay=1)
        # As we do not plan to train the model any further, we are calling
        # init_sims(), which will make the model much more memory-efficient
        self.model.init_sims(replace=True)

    def aggregate_vectors(self, products):
        """
        Average the embeddings of the given items.

        :param products: iterable of item ids; ids missing from the vocabulary are skipped
        :return: the mean embedding, or None when none of the items is in the vocabulary
        """
        known_vectors = []
        for item in products:
            try:
                known_vectors.append(self.model.wv[item])
            except KeyError:
                continue

        if not known_vectors:
            # np.mean([]) would yield NaN (plus a RuntimeWarning); report "no vector" explicitly.
            return None
        return np.mean(known_vectors, axis=0)

    def recommend(self, user_profile, topk=5):
        """
        Given the user profile return a list of recommendations.

        :param user_profile: list of item ids visited/interacted by the user
        :param topk: (optional) the maximum number of recommendations
        :return: list of ``(item_id, similarity)`` pairs, most similar first; items already in
                the profile are excluded. Empty when no profile item is in the vocabulary.
        """
        profile = list(user_profile)
        profile_vector = self.aggregate_vectors(profile)
        if profile_vector is None:
            return []

        seen = set(profile)
        # Over-fetch by the profile size so that topk candidates remain after the
        # user's own items are filtered out of the neighbour list.
        candidates = self.model.wv.similar_by_vector(profile_vector, topn=topk + len(seen))
        return [(item, score) for item, score in candidates if item not in seen][:topk]

In [None]:
import pandas as pd

!wget -q --show-progress https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx

df = pd.read_excel('Online Retail.xlsx')



In [None]:
import random
from tqdm.notebook import tqdm

# Fixed seed so the customer split (and everything trained on it) is reproducible.
SPLIT_SEED = 42
# Share of customers whose purchase history goes into the training set.
TRAIN_FRACTION = 0.9

# remove missing values and convert the StockCode to string datatype
# (done as one chained expression so re-running the cell is idempotent)
df = df.dropna().assign(StockCode=lambda frame: frame['StockCode'].astype(str))

# unique customers in the dataset, in a seeded random order
customers = df["CustomerID"].unique().tolist()
random.Random(SPLIT_SEED).shuffle(customers)

# extract 90% of customer ID's
customers_train = customers[:round(TRAIN_FRACTION * len(customers))]

# split data into train and validation set
is_train_row = df['CustomerID'].isin(customers_train)
train_df = df[is_train_row]
validation_df = df[~is_train_row]

# Group the product codes per customer once; the loops below are then plain
# lookups instead of a full-frame boolean filter for every customer.
train_codes_by_customer = train_df.groupby('CustomerID')['StockCode'].apply(list)
validation_codes_by_customer = validation_df.groupby('CustomerID')['StockCode'].apply(list)

# purchase history (list of product codes) of each training customer,
# in the same order as customers_train
purchases_train = [train_codes_by_customer.loc[customer] for customer in tqdm(customers_train)]

# purchase history of each validation customer, in order of first appearance
purchases_val = [validation_codes_by_customer.loc[customer]
                 for customer in tqdm(validation_df['CustomerID'].unique())]

  0%|          | 0/3935 [00:00<?, ?it/s]

  0%|          | 0/437 [00:00<?, ?it/s]

In [None]:
# Learn product embeddings from the customers' training purchase histories.
prod2vec_params = dict(min_count=2, size=100, negative=5, window=10)
model = Prod2Vec(**prod2vec_params)
model.fit(purchases_train)

In [None]:
import warnings
warnings.filterwarnings('ignore')

# One row per product code, keeping the last description seen for each code.
products = (
    train_df[["StockCode", "Description"]]
    .drop_duplicates(subset='StockCode', keep="last")
)

# Map every product code to a (single-element) list holding its description.
descriptions_by_code = products.groupby('StockCode')['Description'].apply(list)
products_dict = descriptions_by_code.to_dict()

In [None]:
# Draw one random product and show the items recommended for it.
random_sample = products.sample(1).values
sampled_codes, sampled_descriptions = random_sample[:, 0], random_sample[:, 1]

recommendations = []
for code, score in model.recommend(user_profile=sampled_codes):
    recommendations.append([products_dict[code][0], score])

print(sampled_descriptions)
print(' ')
for rec in recommendations:
    print(rec)

['SET OF 3 BABUSHKA STACKING TINS']
 
['EDWARDIAN HEART PHOTO FRAME', 0.3702189028263092]
['SET OF 6 VINTAGE NOTELETS KIT', 0.34610092639923096]
['FRENCH STYLE STORAGE JAR JAM', 0.3301945626735687]
['BAG 500g SWIRLY MARBLES', 0.3177795708179474]
['SPOTTY BUNTING', 0.30998745560646057]


In [None]:
# Draw five random products as a user profile and show the recommendations for it.
random_sample = products.sample(5).values
sampled_codes, sampled_descriptions = random_sample[:, 0], random_sample[:, 1]

recommendations = []
for code, score in model.recommend(user_profile=sampled_codes):
    recommendations.append([products_dict[code][0], score])

print(sampled_descriptions)
print(' ')
for rec in recommendations:
    print(rec)

['SET OF 5 LUCKY CAT MAGNETS ' 'HEARTS GIFT TAPE'
 'PAINTED YELLOW WOODEN DAISY' 'COLOURFUL FLOWER FRUIT BOWL'
 'TUSCAN VILLA BIRD FEEDER']
 
['PAINTED YELLOW WOODEN DAISY', 0.498282253742218]
['HEARTS GIFT TAPE', 0.4829496443271637]
['TUSCAN VILLA BIRD FEEDER', 0.34984883666038513]
['STRAWBERRY RAFFIA FOOD COVER', 0.3352939486503601]
['IVORY PAPER CUP CAKE CASES ', 0.3215782642364502]


## Prod2Vec_v2

In [None]:
#export
class Prod2Vec_v2(ABC):
    """
    Prod2Vec recommender over (item id, rating) tokens.

    Every interaction is encoded as the string ``str((sid, rating))`` and embedded with
    gensim's Word2Vec. A user is represented by the mean of the vectors of the items they
    interacted with, and predictions are the nearest neighbours of that mean vector.
    """

    def __init__(self):
        # Holds the trained keyed vectors; stays None until train() is called.
        self._model = None

    @staticmethod
    def _token(x):
        """Encode one interaction dict (keys "sid" and "rating") as a vocabulary token."""
        return str((x["sid"], x["rating"]))

    def train(self, items, iterations=15):
        """
        Train the embeddings and store them on the instance.

        :param items: a list of lists containing a dictionary (keys "sid" and "rating")
                      for each item interacted by that user
        :param iterations: number of training epochs
        """
        # Get the item ID and rating for each item for each unique user
        x_train = [[self._token(x) for x in y] for y in items]
        self._model = self.train_embeddings(x_train, iterations=iterations)

    def train_embeddings(
        self,
        sessions: list,
        min_c: int = 3,
        size: int = 48,
        window: int = 5,
        iterations: int = 15,
        ns_exponent: float = 0.75,
        is_debug: bool = True):
        """
        Train CBOW to get product embeddings with sensible defaults
        (https://arxiv.org/abs/2007.14906).

        :param sessions: re-iterable list of token sequences, one per user
        :param min_c: minimum token frequency; rarer tokens are pruned
        :param size: dimensionality of the embeddings
        :param window: size of the context window
        :param iterations: number of training epochs
        :param ns_exponent: exponent shaping the negative-sampling distribution
        :param is_debug: currently unused; kept for interface compatibility
        :return: the trained keyed vectors (``model.wv``)
        """
        model = gensim.models.Word2Vec(min_count=min_c,
                                    size=size,
                                    window=window,
                                    iter=iterations,
                                    ns_exponent=ns_exponent)

        model.build_vocab(sessions)
        # build_vocab only initialises the weights; without this call the returned
        # vectors would be the random starting values.
        model.train(sessions, total_examples=model.corpus_count, epochs=iterations)
        model.init_sims(replace=True)

        return model.wv

    def predict(self, prediction_input, *args, **kwargs):
        """
        Predicts the most similar items recommended for each user according
        to the items that they've interacted and the ratings that they've given
        :param prediction_input: a list of lists containing a dictionary for
                                 each item interacted by that user
        :param kwargs: ``topn`` (optional, default 10) sets the number of items
                       predicted per user; other arguments are ignored
        :return: one list per user of ``{"sid": ..., "rating": ...}`` dictionaries; the
                 list is empty when none of the user's items is in the vocabulary
        :raises RuntimeError: if the model has not been trained
        """
        topn = kwargs.get("topn", 10)
        all_predictions = []
        for items in prediction_input:
            predictions = []
            emb_vecs = [vec for vec in map(self.get_vector, items) if vec]
            if emb_vecs:
                # Calculate the average of all the latent vectors representing
                # the items interacted by the user as is done in https://arxiv.org/abs/2007.14906
                avg_emb_vec = np.mean(emb_vecs, axis=0)
                for token, _similarity in self.model.similar_by_vector(avg_emb_vec, topn=topn):
                    # Tokens are the str() of a (sid, rating) tuple; parse them back.
                    sid, rating = ast.literal_eval(token)
                    predictions.append({"sid": sid, "rating": rating})
            all_predictions.append(predictions)
        return all_predictions

    def get_vector(self, x):
        """
        Returns the latent vector that corresponds to the item ID and the rating
        :param x: dictionary with the keys "sid" and "rating"
        :return: the vector as a list, or an empty list when the token is not in the vocabulary
        :raises RuntimeError: if the model has not been trained
        """
        if self.model is None:
            raise RuntimeError("Prod2Vec_v2 has no embeddings yet; call train() first")
        try:
            return list(self.model.get_vector(self._token(x)))
        except KeyError:
            # Unknown (sid, rating) combination: no embedding available.
            return []

    @property
    def model(self):
        return self._model

## Prod2Vec_v3

In [None]:
#export
class Prod2Vec_v3:
    """
    Implementation of the Prod2Vec skipgram model from
    Grbovic Mihajlo, Vladan Radosavljevic, Nemanja Djuric, Narayan Bhamidipati, Jaikit Savla, Varun Bhagwan, and Doug Sharp.
    "E-commerce in your inbox: Product recommendations at scale."
    In Proceedings of the 21st ACM SIGKDD International Conference on Knowledge Discovery and Data Mining,
    pp. 1809-1818. ACM, 2015.
    """

    def __init__(self, min_count=2, size=100, window=5, decay_alpha=0.9, workers=4):
        """
        :param min_count: (optional) the minimum item frequency. Items less frequent than min_count will be pruned
        :param size: (optional) the size of the embeddings
        :param window: (optional) the size of the context window
        :param decay_alpha: (optional) the exponential decay factor used to discount the similarity scores for items
                back in the user profile. Lower values mean higher discounting of past user interactions. Allows values in [0-1].
        :param workers: (optional) the number of threads used for training
        """
        self.min_count = min_count
        self.size = size
        self.window = window
        self.decay_alpha = decay_alpha
        self.workers = workers

    def __str__(self):
        return 'Prod2VecRecommender(min_count={min_count}, ' \
               'size={size}, ' \
               'window={window}, ' \
               'decay_alpha={decay_alpha}, ' \
               'workers={workers})'.format(**self.__dict__)

    def fit(self, train_data, seq_col='sequence'):
        """
        Train the item embeddings.

        :param train_data: table-like object (e.g. a DataFrame) whose ``seq_col`` column holds one
                item-id sequence per row
        :param seq_col: (optional) the name of the column holding the sequences
        """
        # recommend() looks items up by their string form, so the vocabulary must be
        # built from strings as well; this is a no-op for ids that already are strings.
        sequences = [[str(item) for item in sequence] for sequence in train_data[seq_col].values]
        self.model = gensim.models.Word2Vec(sequences,
                                            min_count=self.min_count,
                                            window=self.window,
                                            hs=1,
                                            size=self.size,
                                            sg=1,
                                            workers=self.workers)

    def recommend(self, user_profile, user_id=None):
        """
        Given the user profile return a list of recommendation
        :param user_profile: the user profile as a list of item identifiers
        :param user_id: (optional) the user id; not used by this model
        :return: list of recommendations sorted by decreasing score, e.g. [(['6'], 1.0), (['2'], 0.875)].
                Profile items that are not in the vocabulary contribute nothing; the list is
                empty when no profile item is known.
        """
        rec = []
        # iterate the user profile backwards, so position 0 is the most recent item
        for position, item in enumerate(reversed(list(map(str, user_profile)))):
            try:
                neighbours = self.model.wv.most_similar(positive=item)
            except KeyError:
                # Skip only the unknown item instead of discarding the recommendations
                # gathered from the rest of the profile.
                continue
            # apply exponential decay to the similarity scores
            decay = self.decay_alpha ** position
            rec.extend((neighbour, decay * score) for neighbour, score in neighbours)
        # sort items by similarity score (stable: ties keep their insertion order)
        rec.sort(key=lambda pair: pair[1], reverse=True)
        return [([neighbour], score) for neighbour, score in rec]

    @staticmethod
    def get_recommendation_list(recommendation):
        """Return the item part of every ``(items, score)`` pair."""
        return [pair[0] for pair in recommendation]

    @staticmethod
    def get_recommendation_confidence_list(recommendation):
        """Return the score part of every ``(items, score)`` pair."""
        return [pair[1] for pair in recommendation]

Example

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from recohut.utils.data import load_dataset
from recohut.utils.filters import filter_by_time, filter_top_k
from recohut.utils.splitting import split_last_session_out
from recohut.models.itempop import ItemPop_v2
from recohut.evaluation.sequences import eval_seqreveal, eval_staticprofile
from recohut.evaluation.sequences import eval_reclength, eval_profilelength

In [None]:
# Load the 'music30_sample' dataset and give its five columns explicit names.
df = load_dataset('music30_sample')
df.columns = ['session_id', 'user_id', 'item_id', 'ts', 'playtime']
# 'ts' is interpreted as seconds since the Unix epoch.
df['ts'] = pd.to_datetime(df['ts'], unit='s')

# let's keep only the top-1k most popular items in the last month
df = filter_by_time(df, last_months=1, ts_col='ts')
df = filter_top_k(df, topk=1000, user_col='user_id', item_col='item_id', sess_col='session_id', ts_col='ts')

# NOTE(review): presumably holds out each user's last session as the test set and
# stores the item sequences in a 'sequence' column — confirm against recohut.
train, test = split_last_session_out(df, user_col='user_id', sess_col='session_id', seq_col='sequence', time_col='ts')

In [None]:
# Fit Prod2Vec_v3 with its default hyper-parameters on the training split
# (fit reads the 'sequence' column by default).
model = Prod2Vec_v3()
model.fit(train)

In [None]:
# Evaluate the fitted model on the test sequences; the recorded output reports
# Precision@10, Recall@10 and MRR@10 with GIVEN_K=1, LOOK_AHEAD=1, STEP=1.
eval_seqreveal(train, test, model)

2891 sequences available for evaluation


100%|██████████| 2891/2891 [00:18<00:00, 154.10it/s]


{'GIVEN_K': 1,
 'LOOK_AHEAD': 1,
 'MRR@10': 0.09646700449978074,
 'Model': 'Prod2VecRecommender',
 'Precision@10': 0.023950388764308123,
 'Recall@10': 0.23162620325011596,
 'STEP': 1}

In [None]:
# Same metrics as above, but the recorded output shows LOOK_AHEAD='all'
# (NOTE(review): exact evaluation protocol is defined in recohut — confirm there).
eval_staticprofile(train, test, model)

2891 sequences available for evaluation


100%|██████████| 2891/2891 [00:01<00:00, 2624.12it/s]


{'GIVEN_K': 1,
 'LOOK_AHEAD': 'all',
 'MRR@10': 0.1987120675550286,
 'Model': 'Prod2VecRecommender',
 'Precision@10': 0.10162573503977833,
 'Recall@10': 0.19535189892426655,
 'STEP': 1}

In [None]:
# Repeat the evaluation for several recommendation-list lengths (1 to 100 in the
# recorded output); the call's result is displayed as a list of matplotlib figures.
eval_reclength(train, test, model)

2891 sequences available for evaluation
Evaluating recommendation lists with length: 1


100%|██████████| 2891/2891 [00:18<00:00, 152.72it/s]


Evaluating recommendation lists with length: 5


100%|██████████| 2891/2891 [00:18<00:00, 153.73it/s]


Evaluating recommendation lists with length: 10


100%|██████████| 2891/2891 [00:20<00:00, 142.69it/s]


Evaluating recommendation lists with length: 20


100%|██████████| 2891/2891 [00:19<00:00, 148.46it/s]


Evaluating recommendation lists with length: 50


100%|██████████| 2891/2891 [00:26<00:00, 109.84it/s]


Evaluating recommendation lists with length: 100


100%|██████████| 2891/2891 [00:23<00:00, 122.21it/s]


[<Figure size 360x360 with 1 Axes>,
 <Figure size 360x360 with 1 Axes>,
 <Figure size 360x360 with 1 Axes>]

In [None]:
# Repeat the evaluation for several user-profile lengths (1 to 4 in the recorded
# output); the call's result is displayed as a list of matplotlib figures.
eval_profilelength(train, test, model)

1164 sequences available for evaluation
Evaluating profiles having length: 1


100%|██████████| 1164/1164 [00:00<00:00, 2759.01it/s]


Evaluating profiles having length: 2


100%|██████████| 1164/1164 [00:00<00:00, 1442.61it/s]


Evaluating profiles having length: 3


100%|██████████| 1164/1164 [00:01<00:00, 1114.06it/s]


Evaluating profiles having length: 4


100%|██████████| 1164/1164 [00:01<00:00, 882.29it/s]


[<Figure size 360x360 with 1 Axes>,
 <Figure size 360x360 with 1 Axes>,
 <Figure size 360x360 with 1 Axes>]

> References
>
> 1. [https://nbviewer.org/github/sparsh-ai/stanza/blob/S543002/2021-07-19-session-based-prod2vec-coveo.ipynb](https://nbviewer.org/github/sparsh-ai/stanza/blob/S543002/2021-07-19-session-based-prod2vec-coveo.ipynb)
> 2. [https://nbviewer.org/github/sparsh-ai/stanza/blob/S543002/2021-06-11-recostep-session-based-recommender-using-word2vec.ipynb](https://nbviewer.org/github/sparsh-ai/stanza/blob/S543002/2021-06-11-recostep-session-based-recommender-using-word2vec.ipynb)
> 3. [https://github.com/mquad/sars_tutorial/blob/master/recommenders/Prod2VecRecommender.py](https://github.com/mquad/sars_tutorial/blob/master/recommenders/Prod2VecRecommender.py)
> 4. [https://nbviewer.org/github/sparsh-ai/stanza/blob/S543002/2021-04-24-rec-medium-word2vec.ipynb](https://nbviewer.org/github/sparsh-ai/stanza/blob/S543002/2021-04-24-rec-medium-word2vec.ipynb)

In [None]:
#hide
# Record author, timestamp, machine details and package versions of this run.
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2022-01-13 11:15:25

recohut: 0.0.11

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

gensim    : 3.6.0
IPython   : 5.5.0
networkx  : 2.6.3
json      : 2.0.9
matplotlib: 3.2.2
torch     : 1.10.0+cu111
numpy     : 1.19.5

