In [None]:
# default_exp models.prod2vec

# Prod2Vec
> Implementation of the Prod2Vec model: skip-gram (word2vec) product embeddings learned from purchase sequences.

> References
1. [https://nbviewer.org/github/sparsh-ai/stanza/blob/S543002/2021-07-19-session-based-prod2vec-coveo.ipynb](https://nbviewer.org/github/sparsh-ai/stanza/blob/S543002/2021-07-19-session-based-prod2vec-coveo.ipynb)
2. [https://nbviewer.org/github/sparsh-ai/stanza/blob/S543002/2021-06-11-recostep-session-based-recommender-using-word2vec.ipynb](https://nbviewer.org/github/sparsh-ai/stanza/blob/S543002/2021-06-11-recostep-session-based-recommender-using-word2vec.ipynb)
3. [https://github.com/mquad/sars_tutorial/blob/master/recommenders/Prod2VecRecommender.py](https://github.com/mquad/sars_tutorial/blob/master/recommenders/Prod2VecRecommender.py)
4. [https://nbviewer.org/github/sparsh-ai/stanza/blob/S543002/2021-04-24-rec-medium-word2vec.ipynb](https://nbviewer.org/github/sparsh-ai/stanza/blob/S543002/2021-04-24-rec-medium-word2vec.ipynb)

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import logging
import gensim
import numpy as np

In [None]:
#export
class Prod2Vec(object):
    """
    Implementation of the Prod2Vec skipgram model from
    Grbovic Mihajlo, Vladan Radosavljevic, Nemanja Djuric, Narayan Bhamidipati, Jaikit Savla, Varun Bhagwan, and Doug Sharp.
    "E-commerce in your inbox: Product recommendations at scale."
    In Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining,
    pp. 1809-1818. ACM, 2015.

    Item embeddings are learned with gensim's ``Word2Vec`` (``sg=1``) from
    sequences of item ids. Recommendations are the nearest neighbours of the
    mean embedding of the items in a user profile.
    """

    # NOTE: configures the root logger as a side effect of defining the class.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger()

    def __init__(self, min_count=2, negative=5, size=100, window=5, decay_alpha=0.9):
        """
        :param min_count: (optional) the minimum item frequency. Items less frequent than min_count will be pruned
        :param negative: (optional) the number of negative samples
        :param size: (optional) the size of the embeddings
        :param window: (optional) the size of the context window
        :param decay_alpha: (optional) exponential decay factor intended to discount items further back in the
                user profile; allows values in [0-1]. NOTE: the value is stored and reported by ``__str__``,
                but ``aggregate_vectors``/``recommend`` currently weight all profile items equally.
        """
        super(Prod2Vec, self).__init__()
        self.min_count = min_count
        self.negative = negative
        self.size = size
        self.window = window
        self.decay_alpha = decay_alpha

    def __str__(self):
        return 'Prod2Vec(min_count={min_count}, ' \
               'size={size}, ' \
               'window={window}, ' \
               'decay_alpha={decay_alpha})'.format(**self.__dict__)

    def fit(self, train_data):
        """
        Learn item embeddings from the training sequences.

        Args:
            train_data: re-iterable collection of sequences (lists) of item ids;
                it is traversed once to build the vocabulary and again for
                every training epoch.
        """
        # Local import so the notebook's import cell does not have to change.
        import os

        # `workers=-1` is not an "all cores" switch in gensim: a non-positive
        # count leaves the model without training threads, so the embeddings
        # would stay at their random initialisation. Use the real core count.
        n_workers = max(1, os.cpu_count() or 1)

        # Build the model without a corpus so that training happens exactly
        # once, in the explicit `train` call below (passing the corpus to the
        # constructor would already run a first, implicit training pass).
        self.model = gensim.models.Word2Vec(min_count=self.min_count,
                                            negative=self.negative,
                                            window=self.window,
                                            hs=1,
                                            size=self.size,
                                            sg=1,
                                            workers=n_workers)
        self.model.build_vocab(train_data)
        self.model.train(train_data, total_examples=self.model.corpus_count,
                         epochs=10, report_delay=1)
        # As we do not plan to train the model any further, we are calling
        # init_sims(), which will make the model much more memory-efficient
        self.model.init_sims(replace=True)

    def aggregate_vectors(self, products):
        """
        Average the embeddings of the given products.

        Args:
            products: iterable of item ids; ids missing from the vocabulary
                are skipped.

        Returns:
            The element-wise mean of the known items' vectors, or ``None``
            when none of the products is in the vocabulary (instead of the
            NaN that ``np.mean`` yields for an empty list).
        """
        product_vec = []
        for i in products:
            try:
                # `.wv` is the supported way to look up vectors; indexing the
                # model object directly is deprecated in gensim.
                product_vec.append(self.model.wv[i])
            except KeyError:
                continue

        if not product_vec:
            return None
        return np.mean(product_vec, axis=0)

    def recommend(self, user_profile, topk=5):
        """
        Given the user profile return a list of recommendation

        Args:
            user_profile: list of item ids visited/interacted by the user
            topk: (optional) top-k recommendations

        Returns:
            A list of at most ``topk`` ``(item_id, similarity)`` pairs ordered
            by decreasing similarity, never containing items already in
            ``user_profile``. Empty when no profile item is known to the model.
        """
        profile = list(user_profile)
        vec = self.aggregate_vectors(profile)
        if vec is None:
            return []

        seen = set(profile)
        try:
            # Over-fetch by the profile size: at most len(seen) of the nearest
            # neighbours can be profile items, so topk candidates survive the
            # filter below. (Blindly dropping the first hit only works for
            # single-item profiles.)
            candidates = self.model.wv.similar_by_vector(vec, topn=topk + len(seen))
        except KeyError:
            # Preserve the original contract: recommend never raises KeyError.
            return []

        return [(item, score) for item, score in candidates if item not in seen][:topk]

In [None]:
import pandas as pd

!wget -q --show-progress https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx

df = pd.read_excel('Online Retail.xlsx')



In [None]:
import random

from tqdm.notebook import tqdm

# Fixed seed so the customer-level train/validation split is the same on every
# Restart & Run All.
SPLIT_SEED = 42

# Remove rows with missing values and convert StockCode to string.
# Built as a new frame so the raw `df` from the loading cell stays untouched
# and this cell can be re-run safely.
df_clean = df.dropna().assign(StockCode=lambda frame: frame['StockCode'].astype(str))

# Unique customers in the cleaned data, in a reproducible shuffled order.
# A private Random instance is used so the global RNG state is not modified.
customers = df_clean["CustomerID"].unique().tolist()
random.Random(SPLIT_SEED).shuffle(customers)

# First 90% of the shuffled customer ids form the training set.
n_train = round(0.9 * len(customers))
customers_train = customers[:n_train]

# Split transactions into train and validation by customer.
is_train_row = df_clean['CustomerID'].isin(customers_train)
train_df = df_clean[is_train_row]
validation_df = df_clean[~is_train_row]

# Purchase history (list of product codes) per customer. Grouping once replaces
# a full boolean scan of the frame for every single customer.
train_baskets = train_df.groupby('CustomerID', sort=False)['StockCode'].apply(list).to_dict()
purchases_train = [train_baskets[customer] for customer in tqdm(customers_train)]

val_baskets = validation_df.groupby('CustomerID', sort=False)['StockCode'].apply(list).to_dict()
purchases_val = [val_baskets[customer] for customer in tqdm(validation_df['CustomerID'].unique())]

  0%|          | 0/3935 [00:00<?, ?it/s]

  0%|          | 0/437 [00:00<?, ?it/s]

In [None]:
# Build word2vec embeddings for products:
# create the Prod2Vec wrapper, then train it on the per-customer purchase lists.
model = Prod2Vec(
    min_count=2,
    negative=5,
    size=100,
    window=10,
)
model.fit(purchases_train)

In [None]:
import warnings
# Kept from the original cell: later cells run with all warnings silenced.
# This cell itself no longer needs it (see below).
warnings.filterwarnings('ignore')

# One row per product code, keeping the last description seen in the training data.
# drop_duplicates returns a new frame here; the original mutated a slice of
# `train_df` in place, which is what raised SettingWithCopyWarning.
products = train_df[["StockCode", "Description"]].drop_duplicates(subset='StockCode', keep="last")

# Product-ID -> [description] lookup. Codes are unique after de-duplication, so
# each value is a one-element list (the shape downstream cells index with [0]).
products_dict = {
    code: [description]
    for code, description in zip(products['StockCode'], products['Description'])
}

In [None]:
# Recommend for a profile made of one randomly drawn product.
random_sample = products.sample(1).values
sampled_codes = random_sample[:, 0]          # StockCode column
sampled_descriptions = random_sample[:, 1]   # Description column

# Translate each recommended product code into its description, keeping the score.
recommendations = []
for code, score in model.recommend(user_profile=sampled_codes):
    recommendations.append([products_dict[code][0], score])

print(sampled_descriptions)
print(' ')
for rec in recommendations:
    print(rec)

['SET OF 3 BABUSHKA STACKING TINS']
 
['EDWARDIAN HEART PHOTO FRAME', 0.3702189028263092]
['SET OF 6 VINTAGE NOTELETS KIT', 0.34610092639923096]
['FRENCH STYLE STORAGE JAR JAM', 0.3301945626735687]
['BAG 500g SWIRLY MARBLES', 0.3177795708179474]
['SPOTTY BUNTING', 0.30998745560646057]


In [None]:
# Recommend for a profile made of five randomly drawn products.
random_sample = products.sample(5).values
profile_codes, profile_descriptions = random_sample[:, 0], random_sample[:, 1]

# Pair every recommended product's description with its similarity score.
scored_items = model.recommend(user_profile=profile_codes)
recommendations = [[products_dict[code][0], score] for code, score in scored_items]

print(profile_descriptions)
print(' ')
for rec in recommendations:
    print(rec)

['SET OF 5 LUCKY CAT MAGNETS ' 'HEARTS GIFT TAPE'
 'PAINTED YELLOW WOODEN DAISY' 'COLOURFUL FLOWER FRUIT BOWL'
 'TUSCAN VILLA BIRD FEEDER']
 
['PAINTED YELLOW WOODEN DAISY', 0.498282253742218]
['HEARTS GIFT TAPE', 0.4829496443271637]
['TUSCAN VILLA BIRD FEEDER', 0.34984883666038513]
['STRAWBERRY RAFFIA FOOD COVER', 0.3352939486503601]
['IVORY PAPER CUP CAKE CASES ', 0.3215782642364502]


In [None]:
#hide
# Reproducibility footer: load the watermark extension and print the author,
# last-updated timestamp, the recohut package version, machine details and the
# versions of the imported packages (see the cell output below).
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2021-12-26 08:07:18

recohut: 0.0.7

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

pandas    : 1.1.5
seaborn   : 0.11.2
torch     : 1.10.0+cu111
IPython   : 5.5.0
logging   : 0.5.1.2
numpy     : 1.19.5
gensim    : 3.6.0
matplotlib: 3.2.2

